From df95ada79073b3969b91dd0e35c853da85fcba1b Mon Sep 17 00:00:00 2001 From: chessai Date: Wed, 26 Jun 2024 19:18:58 -0500 Subject: [PATCH 1/4] fold in chainweb-storage Change-Id: I3739cec07a0fefe6b1f1f2aae4fff245b6b593ca --- Setup.hs | 445 ---------- cabal.project | 8 +- cabal.project.freeze | 1 - chainweb.cabal | 56 +- vendored/chainweb-storage/LICENSE | 30 + vendored/chainweb-storage/README.md | 33 + .../src/Chainweb/Storage/DedupStore.hs | 436 ++++++++++ .../src/Chainweb/Storage/Table.hs | 184 +++++ .../src/Chainweb/Storage/Table/Forgetful.hs | 34 + .../src/Chainweb/Storage/Table/HashMap.hs | 65 ++ .../src/Chainweb/Storage/Table/RocksDB.hs | 780 ++++++++++++++++++ 11 files changed, 1603 insertions(+), 469 deletions(-) delete mode 100644 Setup.hs create mode 100644 vendored/chainweb-storage/LICENSE create mode 100644 vendored/chainweb-storage/README.md create mode 100644 vendored/chainweb-storage/src/Chainweb/Storage/DedupStore.hs create mode 100644 vendored/chainweb-storage/src/Chainweb/Storage/Table.hs create mode 100644 vendored/chainweb-storage/src/Chainweb/Storage/Table/Forgetful.hs create mode 100644 vendored/chainweb-storage/src/Chainweb/Storage/Table/HashMap.hs create mode 100644 vendored/chainweb-storage/src/Chainweb/Storage/Table/RocksDB.hs diff --git a/Setup.hs b/Setup.hs deleted file mode 100644 index 74302cb25e..0000000000 --- a/Setup.hs +++ /dev/null @@ -1,445 +0,0 @@ --- ------------------------------------------------------ -- --- Copyright © 2019-2023 Kadena LLC --- Copyright © 2019 Colin Woodbury --- Copyright © 2015-2018 Lars Kuhtz --- Copyright © 2014 AlephCloud Systems, Inc. --- ------------------------------------------------------ -- - -{-# LANGUAGE CPP #-} -{-# LANGUAGE LambdaCase #-} -{-# LANGUAGE MultiWayIf #-} -{-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE ScopedTypeVariables #-} - -{-# OPTIONS_HADDOCK show-extensions #-} - --- | This module contains a @Setup.hs@ script that hooks into the cabal build --- process at the end of the configuration phase and generates a module with --- package information for each component of the cabal package. --- --- The modules are created in the /autogen/ build directories where also the --- @Path_@ modules are created by cabal's simple build setup. --- --- = Usage as Setup Script --- --- There are three ways how this module can be used: --- --- 1. Copy the code of this module into a file called @Setup.hs@ in the root --- directory of your package. --- --- 2. If the /configuration-tools/ package is already installed in the system --- where the build is done, following code can be used as @Setup.hs@ script: --- --- > module Main (main) where --- > --- > import Configuration.Utils.Setup --- --- 3. For usage within a more complex @Setup.hs@ script you shall import this --- module qualified and use the 'mkPkgInfoModules' function. 
For example: --- --- > module Main (main) where --- > --- > import qualified Configuration.Utils.Setup as ConfTools --- > --- > main :: IO () --- > main = defaultMainWithHooks (ConfTools.mkPkgInfoModules simpleUserHooks) --- > --- --- With all methods the field @Build-Type@ in the package description (cabal) file --- must be set to @Custom@: --- --- > Build-Type: Custom --- --- --- = Integration With "Configuration.Utils" --- --- You can integrate the information provided by the @PkgInfo@ modules with the --- command line interface of an application by importing the respective module --- for the component and using the --- 'Configuration.Utils.runWithPkgInfoConfiguration' function from the module --- "Configuration.Utils" as show in the following example: --- --- > {-# LANGUAGE OverloadedStrings #-} --- > {-# LANGUAGE FlexibleInstances #-} --- > --- > module Main --- > ( main --- > ) where --- > --- > import Configuration.Utils --- > import PkgInfo --- > --- > instance FromJSON (() -> ()) where parseJSON _ = pure id --- > --- > mainInfo :: ProgramInfo () --- > mainInfo = programInfo "Hello World" (pure id) () --- > --- > main :: IO () --- > main = runWithPkgInfoConfiguration mainInfo pkgInfo . const $ putStrLn "hello world" --- --- With that the resulting application supports the following additional command --- line options: --- --- [@--version@, @-v@] --- prints the version of the application and exits. --- --- [@--info@, @-i@] --- prints a short info message for the application and exits. --- --- [@--long-info@] --- print a detailed info message for the application and exits. --- Beside component name, package name, version, revision, and copyright --- the message also contain information about the compiler that --- was used for the build, the build architecture, build flags, --- the author, the license type, and a list of all direct and --- indirect dependencies along with their licenses and copyrights. --- --- [@--license@] --- prints the text of the lincense of the application and exits. --- -module Main -( main -) where - -import qualified Distribution.Compat.Graph as Graph -import qualified Distribution.InstalledPackageInfo as I -import Distribution.PackageDescription -import Distribution.Pretty -import Distribution.Simple -import Distribution.Simple.BuildPaths -import Distribution.Simple.LocalBuildInfo -import Distribution.Simple.PackageIndex -import Distribution.Simple.Setup -import Distribution.Text -import Distribution.Types.LocalBuildInfo -import Distribution.Types.UnqualComponentName -import Distribution.Utils.Path -import Distribution.Utils.ShortText - -import System.Process - -import Control.Applicative -import Control.Monad - -import qualified Data.ByteString as B -import Data.ByteString.Char8 (pack) -import Data.Char (isSpace) -import Data.List (intercalate) -import Data.Maybe (fromMaybe) -import Data.Monoid - -import System.Directory - (canonicalizePath, createDirectoryIfMissing, doesDirectoryExist, - doesFileExist, getCurrentDirectory) -import System.Environment (lookupEnv) -import System.Exit (ExitCode(ExitSuccess)) -import System.FilePath (isDrive, takeDirectory, ()) - --- | Include this function when your setup doesn't contain any --- extra functionality. --- -main :: IO () -main = defaultMainWithHooks (mkPkgInfoModules simpleUserHooks) - --- | Modifies the given record of hooks by adding functionality that --- creates a package info module for each component of the cabal package. --- --- This function is intended for usage in more complex @Setup.hs@ scripts. 
--- If your setup doesn't contain any other function you can just import --- the 'main' function from this module. --- --- The modules are created in the /autogen/ build directories where also the --- @Path_@ modules are created by cabal's simple build setup. --- -mkPkgInfoModules - :: UserHooks - -> UserHooks -mkPkgInfoModules hooks = hooks - { postConf = mkPkgInfoModulesPostConf (postConf hooks) - } - --- -------------------------------------------------------------------------- -- --- Compat Implementations - -prettyLicense :: I.InstalledPackageInfo -> String -prettyLicense = either prettyShow prettyShow . I.license - -ft :: ShortText -> String -ft = fromShortText - --- -------------------------------------------------------------------------- -- --- Cabal 2.0 - -mkPkgInfoModulesPostConf - :: (Args -> ConfigFlags -> PackageDescription -> LocalBuildInfo -> IO ()) - -> Args - -> ConfigFlags - -> PackageDescription - -> LocalBuildInfo - -> IO () -mkPkgInfoModulesPostConf hook args flags pkgDesc bInfo = do - mapM_ (updatePkgInfoModule pkgDesc bInfo) $ Graph.toList $ componentGraph bInfo - hook args flags pkgDesc bInfo - -updatePkgInfoModule :: PackageDescription -> LocalBuildInfo -> ComponentLocalBuildInfo -> IO () -updatePkgInfoModule pkgDesc bInfo clbInfo = do - createDirectoryIfMissing True dirName - moduleBytes <- pkgInfoModule moduleName cName pkgDesc bInfo - updateFile fileName moduleBytes - - -- legacy module - legacyModuleBytes <- pkgInfoModule legacyModuleName cName pkgDesc bInfo - updateFile legacyFileName legacyModuleBytes - - where - dirName = autogenComponentModulesDir bInfo clbInfo - cName = unUnqualComponentName <$> componentNameString (componentLocalName clbInfo) - - moduleName = pkgInfoModuleName - fileName = dirName ++ "/" ++ moduleName ++ ".hs" - - legacyModuleName = legacyPkgInfoModuleName cName - legacyFileName = dirName ++ "/" ++ legacyModuleName ++ ".hs" - --- -------------------------------------------------------------------------- -- --- Generate PkgInfo Module - -pkgInfoModuleName :: String -pkgInfoModuleName = "PkgInfo" - -updateFile :: FilePath -> B.ByteString -> IO () -updateFile fileName content = do - x <- doesFileExist fileName - if | not x -> update - | otherwise -> do - oldRevisionFile <- B.readFile fileName - when (oldRevisionFile /= content) update - where - update = B.writeFile fileName content - -legacyPkgInfoModuleName :: Maybe String -> String -legacyPkgInfoModuleName Nothing = "PkgInfo" -legacyPkgInfoModuleName (Just cn) = "PkgInfo_" ++ map tr cn - where - tr '-' = '_' - tr c = c - -trim :: String -> String -trim = f . f - where f = reverse . dropWhile isSpace - --- -------------------------------------------------------------------------- -- --- VCS - -getVCS :: IO (Maybe KnownRepoType) -getVCS = getCurrentDirectory >>= getVcsOfDir - where - getVcsOfDir d = do - canonicDir <- canonicalizePath d - doesDirectoryExist (canonicDir ".hg") >>= \x0 -> if x0 - then return (Just Mercurial) - else doesDirectoryExist (canonicDir ".git") >>= \x1 -> if x1 - then return $ Just Git - else if isDrive canonicDir - then return Nothing - else getVcsOfDir (takeDirectory canonicDir) - --- | Returns tag, revision, and branch name. --- -hgInfo :: IO (String, String, String) -hgInfo = do - tag <- trim <$> readProcess "hg" ["id", "-r", "max(ancestors(\".\") and tag())", "-t"] "" - rev <- trim <$> readProcess "hg" ["id", "-i"] "" - branch <- trim <$> readProcess "hg" ["id", "-b"] "" - return (tag, rev, branch) - --- | Returns tag, revision, and branch name. 
--- -gitInfo :: IO (String, String, String) -gitInfo = do - tag <- do - (exitCode, out, _err) <- readProcessWithExitCode "git" ["describe", "--exact-match", "--tags", "--abbrev=0"] "" - case exitCode of - ExitSuccess -> return $ trim out - _ -> return "" - rev <- trim <$> readProcess "git" ["rev-parse", "--short", "HEAD"] "" - branch <- trim <$> readProcess "git" ["rev-parse", "--abbrev-ref", "HEAD"] "" - return (tag, rev, branch) - --- | Returns tag, revision, and branch name. --- -gitlabCiVcsInfo :: IO (Maybe (String, String, String)) -gitlabCiVcsInfo = do - lookupEnv "CI_COMMIT_SHORT_SHA" >>= \case - Nothing -> return Nothing - Just _ -> do - rev <- fromMaybe "" <$> lookupEnv "CI_COMMIT_SHORT_SHA" - branch <- fromMaybe "" <$> lookupEnv "CI_COMMIT_REF_NAME" - tag <- fromMaybe "" <$> lookupEnv "CI_COMMIT_TAG" - return $ Just (tag, rev, branch) - --- | The file format is revision, branch, and tag separate by newline --- characters. --- -fileVcsInfo :: IO (Maybe (String, String, String)) -fileVcsInfo = do - doesFileExist ".vcs-info" >>= \x -> if x - then do - (rev : branch : tag : _) <- (<> ["", "", ""]) . lines <$> readFile ".vcs-info" - return $ Just (tag, rev, branch) - else return Nothing - --- | Returns tag, revision, and branch name. --- -noVcsInfo :: IO (String, String, String) -noVcsInfo = return ("", "", "") - --- | Returns tag, revision, and branch name. --- -getVcsInfo :: IO (String, String, String) -getVcsInfo = getVCS >>= \case - Just Mercurial -> hgInfo - Just Git -> gitInfo - _ -> gitlabCiVcsInfo >>= \case - Just a -> return a - Nothing -> fileVcsInfo >>= \case - Just a -> return a - Nothing -> noVcsInfo - --- -------------------------------------------------------------------------- -- --- Generate Module - -pkgInfoModule :: String -> Maybe String -> PackageDescription -> LocalBuildInfo -> IO B.ByteString -pkgInfoModule moduleName cName pkgDesc bInfo = do - (tag, revision, branch) <- getVcsInfo - - let vcsBranch = if branch == "default" || branch == "master" then "" else branch - vcsVersion = intercalate "-" . filter (/= "") $ [tag, revision, vcsBranch] - flags = map (unFlagName . fst) . filter snd . unFlagAssignment . configConfigurationsFlags . configFlags $ bInfo - - licenseString <- licenseFilesText pkgDesc - - return $ B.intercalate "\n" - [ "{-# LANGUAGE OverloadedStrings #-}" - , "{-# LANGUAGE RankNTypes #-}" - , "" - , "module " <> pack moduleName <> " " <> deprecatedMsg <> " where" - , "" - , " import Data.String (IsString)" - , " import Data.Monoid" - , " import Prelude hiding ((<>))" - , "" - , " name :: IsString a => Maybe a" - , " name = " <> maybe "Nothing" (\x -> "Just \"" <> pack x <> "\"") cName - , "" - , " tag :: IsString a => a" - , " tag = \"" <> pack tag <> "\"" - , "" - , " revision :: IsString a => a" - , " revision = \"" <> pack revision <> "\"" - , "" - , " branch :: IsString a => a" - , " branch = \"" <> pack branch <> "\"" - , "" - , " branch' :: IsString a => a" - , " branch' = \"" <> pack vcsBranch <> "\"" - , "" - , " vcsVersion :: IsString a => a" - , " vcsVersion = \"" <> pack vcsVersion <> "\"" - , "" - , " compiler :: IsString a => a" - , " compiler = \"" <> (pack . display . compilerId . compiler) bInfo <> "\"" - , "" - , " flags :: IsString a => [a]" - , " flags = " <> (pack . show) flags - , "" - , " optimisation :: IsString a => a" - , " optimisation = \"" <> (displayOptimisationLevel . withOptimization) bInfo <> "\"" - , "" - , " arch :: IsString a => a" - , " arch = \"" <> (pack . display . 
hostPlatform) bInfo <> "\"" - , "" - , " license :: IsString a => a" - , " license = \"" <> (pack . prettyShow . license) pkgDesc <> "\"" - , "" - , " licenseText :: IsString a => a" - , " licenseText = " <> (pack . show) licenseString - , "" - , " copyright :: IsString a => a" - , " copyright = " <> (pack . show . copyright) pkgDesc - , "" - , " author :: IsString a => a" - , " author = \"" <> (pack . ft . author) pkgDesc <> "\"" - , "" - , " homepage :: IsString a => a" - , " homepage = \"" <> (pack . ft . homepage) pkgDesc <> "\"" - , "" - , " package :: IsString a => a" - , " package = \"" <> (pack . display . package) pkgDesc <> "\"" - , "" - , " packageName :: IsString a => a" - , " packageName = \"" <> (pack . display . packageName) pkgDesc <> "\"" - , "" - , " packageVersion :: IsString a => a" - , " packageVersion = \"" <> (pack . display . packageVersion) pkgDesc <> "\"" - , "" - , " dependencies :: IsString a => [a]" - , " dependencies = " <> (pack . show . map (display . packageId) . allPackages . installedPkgs) bInfo - , "" - , " dependenciesWithLicenses :: IsString a => [a]" - , " dependenciesWithLicenses = " <> (pack . show . map pkgIdWithLicense . allPackages . installedPkgs) bInfo - , "" - , " versionString :: (Monoid a, IsString a) => a" - , " versionString = case name of" - , " Nothing -> package <> \" (revision \" <> vcsVersion <> \")\"" - , " Just n -> n <> \"-\" <> packageVersion <> \" (package \" <> package <> \" revision \" <> vcsVersion <> \")\"" - , "" - , " info :: (Monoid a, IsString a) => a" - , " info = versionString <> \"\\n\" <> copyright" - , "" - , " longInfo :: (Monoid a, IsString a) => a" - , " longInfo = info <> \"\\n\\n\"" - , " <> \"Author: \" <> author <> \"\\n\"" - , " <> \"License: \" <> license <> \"\\n\"" - , " <> \"Homepage: \" <> homepage <> \"\\n\"" - , " <> \"Build with: \" <> compiler <> \" (\" <> arch <> \")\" <> \"\\n\"" - , " <> \"Build flags: \" <> mconcat (map (\\x -> \" \" <> x) flags) <> \"\\n\"" - , " <> \"Optimisation: \" <> optimisation <> \"\\n\\n\"" - , " <> \"Dependencies:\\n\" <> mconcat (map (\\x -> \" \" <> x <> \"\\n\") dependenciesWithLicenses)" - , "" - , " pkgInfo :: (Monoid a, IsString a) => (a, a, a, a)" - , " pkgInfo =" - , " ( info" - , " , longInfo" - , " , versionString" - , " , licenseText" - , " )" - , "" - ] - where - displayOptimisationLevel NoOptimisation = "none" - displayOptimisationLevel NormalOptimisation = "normal" - displayOptimisationLevel MaximumOptimisation = "maximum" - - deprecatedMsg = if moduleName /= pkgInfoModuleName - then "{-# DEPRECATED \"Update to Cabal 2.0 or later and use just PkgInfo as module name.\" #-}" - else "" - -licenseFilesText :: PackageDescription -> IO B.ByteString -licenseFilesText pkgDesc = - B.intercalate "\n------------------------------------------------------------\n" <$> mapM fileTextStr - (licenseFiles pkgDesc) - where - fileTextStr = fileText . getSymbolicPath - fileText file = doesFileExist file >>= \x -> if x - then B.readFile file - else return "" - -pkgIdWithLicense :: I.InstalledPackageInfo -> String -pkgIdWithLicense a = (display . packageId) a - ++ " [" - ++ prettyLicense a - ++ (if cr /= "" then ", " ++ cr else "") - ++ "]" - where - cr = (unwords . words . ft . I.copyright) a diff --git a/cabal.project b/cabal.project index 0a22c1ab57..a6d0b218a0 100644 --- a/cabal.project +++ b/cabal.project @@ -1,4 +1,4 @@ -packages: chainweb.cabal +packages: . 
debug-info: True @@ -84,12 +84,6 @@ source-repository-package tag: 1d260bfaa48312b54851057885de4c43c420e35f --sha256: 0fzq4mzaszj5clvixx9mn1x6r4dcrnwvbl2znd0p5mmy5h2jr0hh -source-repository-package - type: git - location: https://github.com/kadena-io/chainweb-storage.git - tag: 4b45c1ab9c070c6d16a058bcbab0c06ac0fb6d4e - --sha256: 0m6c7kl6x5a3k02q2i7qzfx91kxz19dzav0piqfxra52bq0x3sm6 - source-repository-package type: git location: https://github.com/kadena-io/rocksdb-haskell.git diff --git a/cabal.project.freeze b/cabal.project.freeze index 5c02e6683f..1be9de4a7f 100644 --- a/cabal.project.freeze +++ b/cabal.project.freeze @@ -63,7 +63,6 @@ constraints: any.Boolean ==0.2.4, any.cborg ==0.2.10.0, any.cereal ==0.5.8.3, chainweb -debug -ed25519 -ghc-flags, - any.chainweb-storage ==0.1.0.0, any.character-ps ==0.1, any.charset ==0.3.10, any.chronos ==1.1.5.1, diff --git a/chainweb.cabal b/chainweb.cabal index a8b1bfde49..d9b9105c4d 100644 --- a/chainweb.cabal +++ b/chainweb.cabal @@ -10,9 +10,9 @@ license: BSD-3-Clause license-file: LICENSE author: Chainweb Dev Team maintainer: chainweb-dev@kadena.io -copyright: Copyright (C) 2018 - 2023 Kadena LLC +copyright: Copyright (C) 2018 - 2024 Kadena LLC category: Blockchain, Currency, Bitcoin, Kadena -build-type: Custom +build-type: Simple tested-with: GHC == 9.6 @@ -108,19 +108,43 @@ common warning-flags -- problems with ghci. -Wno-missing-home-modules -custom-setup - setup-depends: - Cabal >= 3.8 - , base >= 4.12 && < 5 - , bytestring >= 0.10.12 - , directory >= 1.3 - , filepath >= 1.4 - , process >= 1.5 - -- -------------------------------------------------------------------------- -- -- Chainweb Library -- -------------------------------------------------------------------------- -- +library chainweb-storage + -- import: warning-flags, debugging-flags + hs-source-dirs: vendored/chainweb-storage/src + default-language: Haskell2010 + exposed-modules: + Chainweb.Storage.Table + Chainweb.Storage.Table.Forgetful + Chainweb.Storage.Table.HashMap + Chainweb.Storage.Table.RocksDB + Chainweb.Storage.DedupStore + build-depends: + , base >=4.10 && <5 + , bytestring >=0.10 + , containers >=0.5 + , cryptonite >= 0.25 + , deepseq >=1.4 + , directory >=1.3 + , exceptions >=0.10 + , filepath + , hashable >=1.2 + , lens >=4.16 + , memory >=0.14 + , mtl >= 2.2 + , nothunks >= 0.1.0.0 + , rocksdb-haskell-kadena >=1.1.0 + , stm >=2.4 + , streaming >=0.2 + , temporary >=1.3 + , text >=1.2 + , transformers >=0.5 + , unordered-containers >=0.2 + , vector >=0.12 + library import: warning-flags, debugging-flags default-language: Haskell2010 @@ -371,7 +395,7 @@ library , bytestring >= 0.10.12 , case-insensitive >= 1.2 , cassava >= 0.5.1 - , chainweb-storage >= 0.1 + , chainweb:chainweb-storage , chronos >= 1.1 , clock >= 0.7 , configuration-tools >= 0.6 @@ -568,7 +592,7 @@ test-suite chainweb-tests -- due to change in error message that is inlined in pact , bytestring >= 0.10.12 , case-insensitive >= 1.2 - , chainweb-storage >= 0.1 + , chainweb:chainweb-storage , crypton-connection >=0.2 , containers >= 0.5 , crypton >= 0.31 @@ -668,7 +692,7 @@ executable chainweb-node -- external , async >= 2.2 , base >= 4.12 && < 5 - , chainweb-storage >= 0.1 + , chainweb:chainweb-storage , configuration-tools >= 0.6 , deepseq >= 1.4 , directory >= 1.3 @@ -750,7 +774,7 @@ executable cwtool , case-insensitive >= 1.2 , cassava >= 0.5.1 , cereal >= 0.5 - , chainweb-storage >= 0.1 + , chainweb:chainweb-storage , chronos >= 1.1 , configuration-tools >= 0.6 , crypton-connection >=0.2 @@ -845,7 
+869,7 @@ benchmark bench , bytestring >= 0.10.12 , case-insensitive >= 1.2 , chainweb - , chainweb-storage >= 0.1 + , chainweb:chainweb-storage , containers >= 0.5 , criterion , deepseq >= 1.4 diff --git a/vendored/chainweb-storage/LICENSE b/vendored/chainweb-storage/LICENSE new file mode 100644 index 0000000000..5a492b5e94 --- /dev/null +++ b/vendored/chainweb-storage/LICENSE @@ -0,0 +1,30 @@ +Copyright (c) 2019-2022, Kadena LLC + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Lars Kuhtz nor the names of other + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendored/chainweb-storage/README.md b/vendored/chainweb-storage/README.md new file mode 100644 index 0000000000..67af61c042 --- /dev/null +++ b/vendored/chainweb-storage/README.md @@ -0,0 +1,33 @@ +This package provides some storage tools that are used in +[chainweb](https://github.com/kadena-io/chainweb). + +## Installation + +This package depends on RocksDb and a C++ compiler which must be installed prior to building. 
+ +On Ubuntu: + +```bash +sudo apt-get install librocksdb-dev g++ +``` + +On MacOSX using [homebrew](https://brew.sh): + +```bash +brew install rocksdb +``` + +On Windows using [msys2](https://www.msys2.org): + +```bash +pacman -S mingw-w64-x86_64-rocksdb g++ +``` + +On windows, for GHC to find installed libraries you'll have to add the following +settings to your `cabal.project.local` file: + +```cabal +package rocksdb-haskell + extra-lib-dirs: C:\\msys64\\mingw64\\lib + extra-include-dirs: C:\\msys64\\mingw64\\include +``` diff --git a/vendored/chainweb-storage/src/Chainweb/Storage/DedupStore.hs b/vendored/chainweb-storage/src/Chainweb/Storage/DedupStore.hs new file mode 100644 index 0000000000..57991d3996 --- /dev/null +++ b/vendored/chainweb-storage/src/Chainweb/Storage/DedupStore.hs @@ -0,0 +1,436 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ConstraintKinds #-} +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE FlexibleContexts #-} + +-- | +-- Module: Chainweb.Storage.DedupStore +-- Copyright: Copyright © 2019 Kadena LLC. +-- License: MIT +-- Maintainer: Lars Kuhtz +-- Stability: experimental +-- +-- A Deduplicated Key-Value Store +-- +module Chainweb.Storage.DedupStore +( +-- * Deduplicated Key-Value Store + DedupStore(..) +, newDedupStore +, dedupInsert +, dedupLookup +, DedupStoreException(..) + +-- * Low level Deduplicated Chunk Store +, dedupStore +, dedupStore' +, dedupRestore +) where + +import Control.DeepSeq +import Control.Exception (Exception) +import Control.Monad +import Control.Monad.Catch (throwM) + +import qualified Crypto.Hash as C + +import Data.Bits +import qualified Data.ByteArray as BA +import qualified Data.ByteString as B +import qualified Data.ByteString.Builder as BB +import qualified Data.ByteString.Lazy as BL +import qualified Data.ByteString.Unsafe as BU +import Data.Hashable +import Data.Int +import qualified Data.List as L +import Data.String +import qualified Data.Text as T +import Data.Word + +import Foreign.Ptr +import Foreign.Storable + +import GHC.Generics +import GHC.Stack + +import System.IO.Unsafe + +-- internal modules + +import Chainweb.Storage.Table +import Chainweb.Storage.Table.RocksDB + +-- -------------------------------------------------------------------------- -- +-- Utils + +fst3 :: (a, b, c) -> a +fst3 (a, _, _) = a +{-# INLINE fst3 #-} + +snd3 :: (a, b, c) -> b +snd3 (_, a, _) = a +{-# INLINE snd3 #-} + +third3 :: (a, b, c) -> c +third3 (_, _, a) = a +{-# INLINE third3 #-} + +-- -------------------------------------------------------------------------- -- +-- Dedup Hash + +dedupHashAlg :: C.Blake2b_256 +dedupHashAlg = C.Blake2b_256 + +type DedupHashAlg = C.Blake2b_256 + +dedupHashSize :: Int +dedupHashSize = C.hashDigestSize dedupHashAlg + +newtype DedupHash = DedupHash { _dedupHashBytes :: B.ByteString } + deriving (Show, Eq, Ord, Generic) + deriving newtype (NFData, Hashable, BA.ByteArrayAccess) + +-- -------------------------------------------------------------------------- -- +-- DedupStore + +-- | Exception thrown by the DedupStore +-- +newtype DedupStoreException = DedupStoreCorruptedData T.Text + deriving (Eq, Show, Ord, Generic) + deriving newtype (NFData, Hashable, IsString) + +instance Exception DedupStoreException + +-- | A deduplicating key value store +-- +data DedupStore k v = 
DedupStore + { _dedupRoots :: !(RocksDbTable k DedupHash) + , _dedupChunks :: !(Casify RocksDbTable Chunk) + , _dedupValueCodec :: !(Codec v) + } + +-- | Create a new 'DedupStore' +-- +newDedupStore + :: HasCallStack + => RocksDb + -> Codec v + -> Codec k + -> [B.ByteString] + -> DedupStore k v +newDedupStore rdb vc kc n = DedupStore + { _dedupRoots = newTable rdb dedupHashCodec kc (n <> ["roots"]) + , _dedupChunks = Casify $ newTable rdb dedupChunkCodec dedupHashCodec (n <> ["chunks"]) + , _dedupValueCodec = vc + } + where + dedupHashCodec = Codec + { _codecEncode = _dedupHashBytes + , _codecDecode = \l -> do + unless (B.length l == dedupHashSize) + $ throwM $ DedupStoreCorruptedData "Internal key has wrong size" + return (DedupHash l) + } + + dedupChunkCodec = Codec + { _codecEncode = \(Chunk t h b) -> B.singleton t <> _dedupHashBytes h <> b + , _codecDecode = \bs -> case B.uncons bs of + Nothing -> throwM $ DedupStoreCorruptedData "Got empty chunk" + Just (a, b) -> case B.splitAt dedupHashSize b of + (c, d) + | B.length c < dedupHashSize + -> throwM $ DedupStoreCorruptedData "Got too short chunk" + | otherwise -> return $ Chunk a (DedupHash c) d + } + +-- | Insert a new items into a 'DedupStore'. +-- +dedupInsert :: DedupStore k v -> k -> v -> IO () +dedupInsert store k v = do + dedupKey <- dedupStore (_dedupChunks store) + $ BL.fromStrict $ _codecEncode (_dedupValueCodec store) v + tableInsert (_dedupRoots store) k dedupKey +{-# INLINE dedupInsert #-} + +-- | @dedupLookup db k@ returns 'Just' the value at key @k@ in the +-- 'DedupStore' @db@ if it exists, or 'Nothing' if the @k@ doesn't exist in +-- the table. +-- +dedupLookup :: DedupStore k v -> k -> IO (Maybe v) +dedupLookup store k = do + tableLookup (_dedupRoots store) k >>= \case + Nothing -> return Nothing + Just dedupKey -> do + dedupRestore (_dedupChunks store) dedupKey >>= \case + Nothing -> return Nothing + Just bytes -> Just <$> + _codecDecode (_dedupValueCodec store) (BL.toStrict bytes) +{-# INLINE dedupLookup #-} + +-- -------------------------------------------------------------------------- -- +-- Low-Level Deduplication CAS + +-- | A chunk of data +-- +data Chunk = Chunk Word8 DedupHash B.ByteString + deriving (Show, Eq, Ord, Generic) + +instance IsCasValue Chunk where + type CasKeyType Chunk = DedupHash + casKey (Chunk _ h _) = h + +type DedupCas cas = Cas cas Chunk + +-- | Store a sequence of bytes in a deduplicated content addressed store. +-- Returns the hash of the bytes, that can be used to query the data from the +-- store. +-- +dedupStore + :: HasCallStack + => DedupCas cas + => cas + -> BL.ByteString + -> IO DedupHash +dedupStore store bytes = third3 <$> dedupStore' store bytes +{-# INLINE dedupStore #-} + +-- | Store a sequence of bytes in a deduplicated content addressed store. +-- Returns the hash of the bytes, that can be used to query the data from the +-- store. +-- +-- It also returns the number of chunk hits and chunk misses that are a measure +-- for the deduplication rate. 
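+--
+-- A usage sketch (the helper 'reportDedupRate' is hypothetical, not part of
+-- this module):
+--
+-- > reportDedupRate :: DedupCas cas => cas -> BL.ByteString -> IO DedupHash
+-- > reportDedupRate store bytes = do
+-- >     (hits, misses, h) <- dedupStore' store bytes
+-- >     putStrLn $ "deduplicated chunks: "
+-- >         <> show hits <> " of " <> show (hits + misses)
+-- >     return h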
+-- +dedupStore' + :: HasCallStack + => DedupCas cas + => cas + -> BL.ByteString + -> IO (Int, Int, DedupHash) +dedupStore' store = go0 + where + go0 :: HasCallStack => BL.ByteString -> IO (Int, Int, DedupHash) + go0 bytes = mapM (hashAndStore 0x0) (toChunks $ roll bytes) >>= \case + [] -> error "Chainweb.Storage.DedupStore.dedupStore.go0: Data.ByteString.Lazy.toChunks must not return an empty list" + [x] -> return x + l -> do + (h, m, r) <- go1 (third3 <$> l) + return (sum (fst3 <$> l) + h, sum (snd3 <$> l) + m, r) + + go1 :: HasCallStack => [DedupHash] -> IO (Int, Int, DedupHash) + go1 hashes = mapM (hashAndStore 0x1) (rollHashes hashes) >>= \case + [] -> error "Chainweb.Storage.DedupStore.dedupStore.go1: Data.ByteString.Lazy.toChunks must not return an empty list" + [x] -> return x + l -> do + (h, m, r) <- go1 (third3 <$> l) + return (sum (fst3 <$> l) + h, sum (snd3 <$> l) + m, r) + + toChunks x = case BL.toChunks x of + [] -> [""] + l -> l + {-# INLINE toChunks #-} + + -- This is called as the dedup hashes are produced, in a bottom up way. We + -- could optimize for data base queries (at the cost of more memory + -- overhead) by first computing the keys, checking for keys top-down, and + -- inserting the data in a second pass only for those keys that aren't in + -- the database already. + -- + hashAndStore :: Word8 -> B.ByteString -> IO (Int, Int, DedupHash) + hashAndStore tag c = do + let h = DedupHash $ BA.convert $ C.hash @_ @DedupHashAlg (B.cons tag c) + (hit, miss) <- tableMember store h >>= \x -> if x + then return (1, 0) + else do + casInsert store (Chunk tag h c) + return (0, 1) + return (hit, miss, h) + {-# INLINE hashAndStore #-} + {-# SCC hashAndStore #-} + + rollHashes :: [DedupHash] -> [B.ByteString] + rollHashes hs = fmap (B.concat . fmap _dedupHashBytes) $ rollLazy pickInt64 8 2 11 hs + {-# INLINE rollHashes #-} + {-# SCC rollHashes #-} + + pickInt64 :: DedupHash -> Int64 + pickInt64 s = unsafePerformIO $ BU.unsafeUseAsCString + (_dedupHashBytes s) + (peek . castPtr) + {-# INLINE pickInt64 #-} + {-# SCC pickInt64 #-} +{-# SCC dedupStore' #-} + +-- | Retrieve a sequence of bytes from a deduplicated content addressable store. +-- +dedupRestore + :: DedupCas cas + => cas + -> DedupHash + -> IO (Maybe BL.ByteString) +dedupRestore store key = fmap BB.toLazyByteString <$> do + tableLookup store key >>= \case + Nothing -> return Nothing + Just (Chunk 0x0 _ b) -> return $ Just $ BB.byteString b + Just (Chunk 0x1 _ b) -> Just <$> splitHashes b + _ -> throwM $ DedupStoreCorruptedData "Unknown chunk tag" + + where + go h = do + tableLookup store h >>= \case + Nothing -> throwM $ DedupStoreCorruptedData "Missing chunk from store" + Just (Chunk 0x0 _ b) -> return (BB.byteString b) + Just (Chunk 0x1 _ b) -> splitHashes b + _ -> throwM $ DedupStoreCorruptedData "Unknown chunk tag" + + splitHashes b + | B.null b = return mempty + | otherwise = case B.splitAt (C.hashDigestSize C.Blake2b_256) b of + (h, r) -> (<>) <$> go (DedupHash h) <*> splitHashes r +{-# SCC dedupRestore #-} + +-- -------------------------------------------------------------------------- -- +-- Rabin-Karp Rolling Hash +-- + +-- | Re-chunks a lazy 'BL.ByteString' using the Rabin-Karp rolling hash +-- function. The expected chunk size of the resuling 'BL.ByteString' is 8k, with +-- a minimal chunk size of 128 bytes and a maximal chunk size of 64k. +-- +-- The implementation is adapted from https://github.com/ekmett/hash, Copyright +-- 2012-2013 Edward Kmett. 
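+--
+-- Note that re-chunking only changes the chunk boundaries, never the content:
+-- concatenating the chunks recovers the input, so
+--
+-- > roll bs == bs
+--
+-- holds for every input (lazy 'BL.ByteString' equality ignores chunk
+-- structure). Because the boundaries are content-defined, inputs that share
+-- long runs of bytes mostly produce identical chunks, which is what makes
+-- deduplication effective.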
+-- +roll :: BL.ByteString -> BL.ByteString +roll z = BL.fromChunks $ go seed 0 z za ze + where + z' = BL.unpack z + za = BL.unpack (BL.replicate window 0) <> z' + {-# SCC za #-} + ze = z' + {-# SCC ze #-} + + go + :: Int64 + -- ^ current hash + -> Int64 + -- ^ current byte count in input string + -> BL.ByteString + -- ^ input bytestring + -> [Word8] + -- ^ bytes from start of window (starts all 0 before input string) + -> [Word8] + -- ^ bytes after end window + -> [B.ByteString] + -- ^ Chunk + go !h !c !bs (x:xs) (y:ys) + | ((h' .&. mask == mask) && c >= minSize) || c >= maxSize = case BL.splitAt (c + 1) bs of + (l,r) -> {-# SCC concat #-} B.concat (BL.toChunks l) : go seed 0 r xs ys + | otherwise = go h' (c + 1) bs xs ys + where + !x' = if c < window then 0 else fromIntegral x + !h' = rem (rem (h - magic_d * x') magic_q * magic_r + fromIntegral y) magic_q + {-# SCC h' #-} + go _ _ bs _ _ = [B.concat $ BL.toChunks bs] + {-# SCC go #-} + + magic_d = powqp magic_r (window-1) magic_q + mask = 8191 + minSize = 128 + maxSize = 65536 + magic_r = 256 + magic_q = 32749 + window = 128 + seed = 0 +{-# INLINE roll #-} +{-# SCC roll #-} + +-- | Breaks an input list into chunks using Rabin-Karp rolling hash with the +-- provided expected, minimal, and maximal chunks size. +-- +-- The implementation is adapted from https://github.com/ekmett/hash, Copyright +-- 2012-2013 Edward Kmett. +-- +rollLazy + :: forall a + . (a -> Int64) + -- ^ hash function (64 bit should be sufficient for must use cases) + -> Int + -- ^ expected size (as power of 2) + -> Int + -- ^ min size (as power of 2). Must be equal or smaller than expected. + -> Int + -- ^ max size (as power of 2). Must be equal or larger than expected. + -> [a] + -> [[a]] +rollLazy _ _ _ _ [] = [] +rollLazy f expectedPow2 minSizePow2 maxSizePow2 z + = go seed 0 z (L.replicate window (head z) <> z) z + where + go + :: Int64 + -- current hash + -> Int + -- current item count in input string + -> [a] + -- input items. + -> [a] + -- items from start of window (starts all 0 before input string). We + -- drop the first item of this on each rehash + -> [a] + -- items after end window. We add the first item from this on each + -- rehash. + -> [[a]] + -- Chunks + go !h !c !bs (x:xs) (y:ys) + + -- Found new chunk + | ((h' .&. mask == mask) && c >= minSize) || c >= maxSize + = case L.splitAt (c + 1) bs of + (l,r) -> l : go seed 0 r xs ys + + -- Continue + | otherwise + = go h' (c + 1) bs xs ys + where + !x' = if c < window then 0 else f x + !h' = rem (rem (h - magic_d * x') magic_q * magic_r + f y) magic_q + go _ _ bs _ _ = [bs] + + mask = (2^expectedPow2) - 1 + minSize = 2^minSizePow2 + maxSize = 2^maxSizePow2 + + magic_d = powqp magic_r (fromIntegral window - 1) magic_q + magic_r = 256 + magic_q = 32749 + + window :: Int + window = minSize + + seed :: Int64 + seed = 0 +{-# INLINE rollLazy #-} +{-# SCC rollLazy #-} + +-- | Calculates @powqp m n p = 'rem' (m^n) p@ under the assumption that @m * p * +-- p@ can fit in an @Int64@. +-- +-- This function is copied from https://github.com/ekmett/hash, Copyright +-- 2012-2013 Edward Kmett. 
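+--
+-- For example:
+--
+-- > powqp 2 10 1000 == 24  -- since 2^10 = 1024 and rem 1024 1000 == 24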
+-- +powqp :: Int64 -> Int64 -> Int64 -> Int64 +powqp _ 0 _ = 1 +powqp m 1 p = rem m p +powqp m n p = case quotRem n 2 of + (q, 1) | k <- powqp m q p -> rem (k * k * m) p + (q, _) | k <- powqp m q p -> rem (k * k) p +{-# INLINE powqp #-} +{-# SCC powqp #-} + diff --git a/vendored/chainweb-storage/src/Chainweb/Storage/Table.hs b/vendored/chainweb-storage/src/Chainweb/Storage/Table.hs new file mode 100644 index 0000000000..484a1719dd --- /dev/null +++ b/vendored/chainweb-storage/src/Chainweb/Storage/Table.hs @@ -0,0 +1,184 @@ +{-# LANGUAGE ConstraintKinds #-} +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE FunctionalDependencies #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE InstanceSigs #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE QuantifiedConstraints #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE UndecidableInstances #-} + +-- | +-- Module: Chainweb.Storage.Table +-- Copyright: Copyright © 2019 Kadena LLC. +-- License: MIT +-- Maintainer: Lars Kuhtz , Edmund Noble +-- Stability: experimental +-- Description: Key Value Store +-- +-- API for Key-Value Stores +module Chainweb.Storage.Table + ( IsCasValue (..) + , ReadableTable (..) + , tableLookupBatch + , ReadableTable1 + , ReadableCas + , Table (..) + , Table1 + , Casify(..) + , Cas + , casInsert + , casInsertBatch + , casDelete + , casDeleteBatch + , IterableTable (..) + , IterableTable1 + , IterableCas + , Entry (..) + , Iterator (..) + , Iterator1 + , CasIterator + , tableLookupM + , casLookupM + , TableException (..) + ) +where + +import Control.Exception (Exception, SomeException) +import Control.Lens +import Control.Monad.Catch (throwM) + +import Data.Coerce +import Data.Foldable +import Data.Maybe +import Data.Text (Text) + +import GHC.Generics +import GHC.Stack + +-- | The class of content-addressable values. +-- +-- The casKey function must be morally injective: +-- +-- prop> casKey a /= casKey b || a == b +-- +-- Usually, 'casKey' is a cryptographic, i.e. collision resistant, hash +-- function. +class Eq (CasKeyType v) => IsCasValue v where + type CasKeyType v + casKey :: v -> CasKeyType v + +-- | Read-Only View of a Key-Value Store +-- +class ReadableTable t k v | t -> k v where + tableLookup :: t -> k -> IO (Maybe v) + tableLookupBatch' :: t -> Traversal s r k (Maybe v) -> s -> IO r + tableLookupBatch' t l = l (tableLookup t) + tableMember :: t -> k -> IO Bool + tableMember t k = isJust <$> tableLookup t k + +tableLookupBatch :: (ReadableTable t k v, Each s t' k (Maybe v)) => t -> s -> IO t' +tableLookupBatch t = tableLookupBatch' t each + +type ReadableCas t v = ReadableTable t (CasKeyType v) v +type ReadableTable1 t = forall k v. ReadableTable (t k v) k v + +class ReadableTable t k v => Table t k v | t -> k v where + tableInsert :: t -> k -> v -> IO () + tableInsertBatch :: t -> [(k, v)] -> IO () + tableInsertBatch t kvs = traverse_ (uncurry (tableInsert t)) kvs + tableDelete :: t -> k -> IO () + tableDeleteBatch :: t -> [k] -> IO () + tableDeleteBatch t ks = traverse_ (tableDelete t) ks +type Table1 t = forall k v. 
Table (t k v) k v
+
+type Cas t v = Table t (CasKeyType v) v
+
+casInsert :: (IsCasValue v, Cas t v) => t -> v -> IO ()
+casInsert t v = tableInsert t (casKey v) v
+
+casInsertBatch :: (IsCasValue v, Cas t v) => t -> [v] -> IO ()
+casInsertBatch t vs = tableInsertBatch t [(casKey v, v) | v <- vs]
+
+casDelete :: (IsCasValue v, Cas t v) => t -> v -> IO ()
+casDelete t = tableDelete t . casKey
+
+casDeleteBatch :: (IsCasValue v, Cas t v) => t -> [v] -> IO ()
+casDeleteBatch t = tableDeleteBatch t . fmap casKey
+
+class (Table t k v, Iterator i k v) => IterableTable t i k v | t -> i k v where
+    -- the created iterator must be positioned at the start of the table.
+    withTableIterator :: t -> (i -> IO a) -> IO a
+type IterableTable1 t i = forall k v. IterableTable (t k v) (i k v) k v
+
+type IterableCas t v = IterableTable t (CasKeyType v) v
+
+data Entry k v = Entry !k !v
+    deriving (Eq, Show, Ord)
+
+class Iterator i k v | i -> k v where
+    iterSeek :: i -> k -> IO ()
+    iterLast :: i -> IO ()
+    iterFirst :: i -> IO ()
+    iterNext :: i -> IO ()
+    iterPrev :: i -> IO ()
+    iterEntry :: i -> IO (Maybe (Entry k v))
+    iterKey :: i -> IO (Maybe k)
+    iterKey i = (fmap . fmap) (\(Entry k _) -> k) $ iterEntry i
+    iterValue :: i -> IO (Maybe v)
+    iterValue i = (fmap . fmap) (\(Entry _ v) -> v) $ iterEntry i
+    iterValid :: i -> IO Bool
+    iterValid i = isJust <$> iterKey i
+type Iterator1 i = forall k v. Iterator (i k v) k v
+
+type CasIterator i v = Iterator i (CasKeyType v) v
+
+-- | A newtype wrapper that takes only a single type constructor. This is useful
+-- in situations where a higher-order type constructor for a CAS is required. A
+-- type synonym doesn't work in this situation because type synonyms must be
+-- fully applied.
+--
+newtype Casify t v = Casify { unCasify :: t (CasKeyType v) v }
+instance forall t k v. (CasKeyType v ~ k, ReadableTable (t k v) k v) => ReadableTable (Casify t v) k v where
+    tableLookup = coerce @(t k v -> k -> IO (Maybe v)) tableLookup
+    -- can't seem to write `coerce` without the resulting instantiation being impredicative
+    tableLookupBatch' (Casify t) b = tableLookupBatch' t b
+    tableMember = coerce @(t k v -> k -> IO Bool) tableMember
+instance forall t k v. (CasKeyType v ~ k, Table (t k v) k v) => Table (Casify t v) k v where
+    tableInsert = coerce @(t k v -> k -> v -> IO ()) tableInsert
+    tableInsertBatch = coerce @(t k v -> [(k, v)] -> IO ()) tableInsertBatch
+    tableDelete = coerce @(t k v -> k -> IO ()) tableDelete
+    tableDeleteBatch = coerce @(t k v -> [k] -> IO ()) tableDeleteBatch
+-- TODO: why is this Iterator superclass needed?
+instance forall t i k v. (CasKeyType v ~ k, IterableTable (t k v) i k v, Iterator i k v) => IterableTable (Casify t v) i k v where
+    withTableIterator :: forall a. Casify t v -> (i -> IO a) -> IO a
+    withTableIterator = coerce @(t k v -> (i -> IO a) -> IO a) withTableIterator
+
+-- | Lookup a value by its key in a key-value store and throw a
+-- 'TableException' if the value doesn't exist in the store.
+tableLookupM :: (HasCallStack, ReadableTable t k v) => t -> k -> IO v
+tableLookupM cas k =
+    tableLookup cas k >>= \case
+        Nothing ->
+            throwM . TableException $
+                "tableLookupM: lookup failed for table key"
+        Just v -> return $! v
+
+casLookupM :: (HasCallStack, ReadableCas t v) => t -> CasKeyType v -> IO v
+casLookupM = tableLookupM
+
+-- | Exceptions that are thrown by 'Table' implementations.
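+--
+-- For example, 'tableLookupM' throws a 'TableException' when a key is absent.
+-- A minimal sketch of recovering with a default value (the helper
+-- 'lookupOrDefault' is hypothetical, not part of this module):
+--
+-- > import Control.Monad.Catch (catch)
+-- >
+-- > lookupOrDefault :: ReadableTable t k v => t -> k -> v -> IO v
+-- > lookupOrDefault t k def =
+-- >     tableLookupM t k `catch` \(_ :: TableException) -> return def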
+data TableException
+    = TableException !Text
+    | TableImplementationException !SomeException
+    deriving (Show, Generic)
+
+instance Exception TableException
+
diff --git a/vendored/chainweb-storage/src/Chainweb/Storage/Table/Forgetful.hs b/vendored/chainweb-storage/src/Chainweb/Storage/Table/Forgetful.hs
new file mode 100644
index 0000000000..a1dfff3079
--- /dev/null
+++ b/vendored/chainweb-storage/src/Chainweb/Storage/Table/Forgetful.hs
@@ -0,0 +1,34 @@
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+
+-- |
+-- Module: Chainweb.Storage.Table.Forgetful
+-- Copyright: Copyright © 2019 Kadena LLC.
+-- License: MIT
+-- Maintainer: Lars Kuhtz
+-- Stability: experimental
+--
+-- A key-value store that forgets everything that is stored in it.
+--
+module Chainweb.Storage.Table.Forgetful
+( ForgetfulTable(..)
+) where
+
+import Chainweb.Storage.Table
+
+-- | A key-value store that forgets everything that is added to it.
+--
+data ForgetfulTable k v = ForgetfulTable
+
+instance ReadableTable (ForgetfulTable k v) k v where
+    tableLookup _ _ = return Nothing
+    tableMember _ _ = return False
+    {-# INLINE tableLookup #-}
+
+instance Table (ForgetfulTable k v) k v where
+    tableInsert _ _ _ = return ()
+    tableDelete _ _ = return ()
+    {-# INLINE tableInsert #-}
+    {-# INLINE tableDelete #-}
+
diff --git a/vendored/chainweb-storage/src/Chainweb/Storage/Table/HashMap.hs b/vendored/chainweb-storage/src/Chainweb/Storage/Table/HashMap.hs
new file mode 100644
index 0000000000..62565979cf
--- /dev/null
+++ b/vendored/chainweb-storage/src/Chainweb/Storage/Table/HashMap.hs
@@ -0,0 +1,65 @@
+{-# LANGUAGE ExistentialQuantification #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE FunctionalDependencies #-}
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE UndecidableInstances #-}
+
+-- |
+-- Module: Chainweb.Storage.Table.HashMap
+-- Copyright: Copyright © 2019 Kadena LLC.
+-- License: MIT
+-- Maintainer: Lars Kuhtz
+-- Stability: experimental
+-- Description:
+--
+-- A thread-safe in-memory 'Table' implementation based on 'HM.HashMap'.
+--
+module Chainweb.Storage.Table.HashMap
+( HashMapTable
+, emptyTable
+, toList
+, size
+) where
+
+import Control.Concurrent.STM.TVar
+import Control.Monad ((<$!>))
+import Control.Monad.STM
+
+import Data.Hashable
+import qualified Data.HashMap.Strict as HM
+
+-- internal modules

+import Chainweb.Storage.Table
+
+-- | A 'Table' implementation that is based on 'HM.HashMap'.
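+--
+-- A minimal usage sketch:
+--
+-- > example :: IO (Maybe Int)
+-- > example = do
+-- >     t <- emptyTable :: IO (HashMapTable String Int)
+-- >     tableInsert t "one" 1
+-- >     tableLookup t "one"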
+--
+newtype HashMapTable k v = HashMapTable (TVar (HM.HashMap k v))
+
+instance (Hashable k, Eq k) => ReadableTable (HashMapTable k v) k v where
+    tableLookup (HashMapTable var) k = HM.lookup k <$!> readTVarIO var
+    tableMember (HashMapTable var) k =
+        HM.member k <$> readTVarIO var
+
+instance (Hashable k, Eq k) => Table (HashMapTable k v) k v where
+    tableInsert (HashMapTable var) k v =
+        atomically $ modifyTVar' var (HM.insert k v)
+    tableDelete (HashMapTable var) k =
+        atomically $ modifyTVar' var (HM.delete k)
+
+-- | Create a new empty table
+--
+emptyTable :: (Hashable k, Eq k) => IO (HashMapTable k v)
+emptyTable = HashMapTable <$> newTVarIO mempty
+
+-- | Return all values of the table as a list
+--
+toList :: HashMapTable k v -> IO [v]
+toList (HashMapTable var) = HM.elems <$!> readTVarIO var
+
+-- | The number of items in the table
+--
+size :: HashMapTable k v -> IO Int
+size (HashMapTable var) = HM.size <$!> readTVarIO var
+
diff --git a/vendored/chainweb-storage/src/Chainweb/Storage/Table/RocksDB.hs b/vendored/chainweb-storage/src/Chainweb/Storage/Table/RocksDB.hs
new file mode 100644
index 0000000000..5bce7aa6d7
--- /dev/null
+++ b/vendored/chainweb-storage/src/Chainweb/Storage/Table/RocksDB.hs
@@ -0,0 +1,780 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE DeriveAnyClass #-}
+{-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE DerivingVia #-}
+{-# LANGUAGE ExistentialQuantification #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE RankNTypes #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TypeApplications #-}
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE UndecidableInstances #-}
+{-# LANGUAGE ViewPatterns #-}
+
+-- |
+-- Module: Chainweb.Storage.Table.RocksDB
+-- Copyright: Copyright © 2019 Kadena LLC.
+-- License: MIT
+-- Maintainer: Lars Kuhtz
+-- Stability: experimental
+--
+-- Persisted key-value and content-addressable key-value stores with a
+-- RocksDB backend.
+--
+-- A 'RocksDbTable' provides a typed key-value store with an additional iterator
+-- interface.
+--
+-- A 'RocksDbTable' whose value type is content-addressable can be wrapped in
+-- 'Chainweb.Storage.Table.Casify' to obtain a content-addressed store.
+--
+-- TODO: Abstract the 'RocksDbTable' API into a typeclass so that one can
+-- provide alternative implementations for it.
+--
+module Chainweb.Storage.Table.RocksDB
+  ( RocksDb(..)
+  -- , rocksDbHandle
+  -- , rocksDbNamespace
+  , openRocksDb
+  , openReadOnlyRocksDb
+  , closeRocksDb
+  , withRocksDb
+  , withReadOnlyRocksDb
+  , withTempRocksDb
+  , destroyRocksDb
+  , resetOpenRocksDb
+  , modernDefaultOptions
+
+  -- * Rocks DB Table
+  , Codec(..)
+  , RocksDbTable
+  , newTable
+  , tableLookup
+  , tableLookupBatch
+  , tableInsert
+  , tableDelete
+
+  -- * Batch Updates
+  , RocksDbUpdate(..)
+  , updateBatch
+
+  -- * Rocks DB Table Iterator
+  , RocksDbTableIter(..)
+ + -- ** Streams + , iterToEntryStream + , iterToValueStream + , iterToReverseValueStream + , iterToKeyStream + + -- ** Extremal Table Entries + , tableMaxKey + , tableMaxValue + , tableMaxEntry + , tableMinKey + , tableMinValue + , tableMinEntry + + -- * RocksDB-specific tools + , checkpointRocksDb + , deleteRangeRocksDb + , compactRangeRocksDb + ) where + +import Control.Exception(evaluate) +import Control.Lens +import Control.Monad +import Control.Monad.Catch +import Control.Monad.IO.Class + +import Data.ByteString(ByteString) +import qualified Data.ByteString as B +import qualified Data.ByteString.Char8 as B8 +import qualified Data.ByteString.Internal as BI +import qualified Data.ByteString.Unsafe as BU +import Data.Coerce +import Data.Foldable +import Data.Maybe + +import Chainweb.Storage.Table +import Data.String +import qualified Data.Text as T + +import Foreign +import Foreign.C + +import qualified Database.RocksDB.Base as R +import qualified Database.RocksDB.ReadOptions as R +import qualified Database.RocksDB.C as C +import qualified Database.RocksDB.Internal as R +import qualified Database.RocksDB.Iterator as I + +import GHC.Generics (Generic) +import qualified GHC.Foreign as GHC +import qualified GHC.IO.Encoding as GHC +import GHC.Stack + +import NoThunks.Class + +import qualified Streaming.Prelude as S + +import System.Directory +import System.IO.Temp + +-- -------------------------------------------------------------------------- -- +-- Utils + +-- | A newtype wrapper for tagger values as "expected" outcomes of some +-- computation. +-- +newtype Expected a = Expected { getExpected :: a } + deriving (Show, Eq, Ord, Generic, Functor) + deriving newtype (NoThunks) + +-- | A newtype wrapper for tagger values as "actual" outcomes of some +-- computation. +-- +newtype Actual a = Actual { getActual :: a } + deriving (Show, Eq, Ord, Generic, Functor) + deriving newtype (NoThunks) + +-- | A textual message that describes the 'Expected' and the 'Actual' outcome of +-- some computation. +-- +unexpectedMsg :: Show a => T.Text -> Expected a -> Actual a -> T.Text +unexpectedMsg msg expected actual = msg + <> ", expected: " <> sshow (getExpected expected) + <> ", actual: " <> sshow (getActual actual) + +-- | Show a value as any type that is an instance of 'IsString'. +-- +sshow :: Show a => IsString b => a -> b +sshow = fromString . show + +-- -------------------------------------------------------------------------- -- +-- RocksDb + +-- | This wrapper allows to create key namespaces. This is, for instance, useful +-- for testing when running several concurrent tests on the same rocks db +-- instances. Key namespaces must not contain the character '-'. +-- +data RocksDb = RocksDb + { _rocksDbHandle :: !R.DB + , _rocksDbNamespace :: !B.ByteString + } + +instance NoThunks RocksDb where + wNoThunks ctx (RocksDb a b) = allNoThunks + [ noThunks ctx (InspectHeapNamed @"Data.RocksDB.Base.DB" a) + , noThunks ctx b + ] + showTypeOf _ = "Chainweb.Storage.Table.RocksDB.RocksDb" + + +-- makeLenses ''RocksDb + +modernDefaultOptions :: R.Options +modernDefaultOptions = R.defaultOptions + { R.maxOpenFiles = -1 + , R.writeBufferSize = 64 `shift` 20 + , R.createIfMissing = True + } + +-- | Open a 'RocksDb' instance with the default namespace. If no rocks db exists +-- at the provided directory path, a new database is created. 
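+--
+-- 'openRocksDb' takes a raw 'C.OptionsPtr'; unless the options pointer is
+-- managed elsewhere, 'withRocksDb' (or 'withTempRocksDb') is usually more
+-- convenient, since it allocates the options and closes the database. A
+-- minimal sketch, assuming a caller-supplied 'Codec' for 'Int' values:
+--
+-- > example :: Codec Int -> IO (Maybe Int)
+-- > example intCodec = withTempRocksDb "example" $ \rdb -> do
+-- >     let t = newTable rdb intCodec intCodec ["example"]
+-- >     tableInsert t 1 42
+-- >     tableLookup t 1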
+--
+openRocksDb :: FilePath -> C.OptionsPtr -> IO RocksDb
+openRocksDb path opts_ptr = do
+    GHC.setFileSystemEncoding GHC.utf8
+    createDirectoryIfMissing True path
+    db <- withFilePath path $ \path_ptr ->
+        fmap R.DB
+            $ R.throwIfErr "open"
+            $ C.c_rocksdb_open opts_ptr path_ptr
+    let rdb = RocksDb db mempty
+    initializeRocksDb rdb
+    return rdb
+
+openReadOnlyRocksDb :: FilePath -> C.OptionsPtr -> IO RocksDb
+openReadOnlyRocksDb path opts_ptr = do
+    GHC.setFileSystemEncoding GHC.utf8
+    createDirectoryIfMissing True path
+    db <- withFilePath path $ \path_ptr ->
+        fmap R.DB
+            $ R.throwIfErr "open"
+            $ C.c_rocksdb_open_for_read_only opts_ptr path_ptr (0 :: CChar) -- ignore extant wal file
+    let rdb = RocksDb db mempty
+    -- can't initialize read-only rocksdb
+    return rdb
+
+withOpts :: R.Options -> (R.Options' -> IO a) -> IO a
+withOpts opts =
+    bracket (R.mkOpts opts) R.freeOpts
+
+-- | Each table key starts with @_rocksDbNamespace db <> "-"@. Here we insert a
+-- dummy key that is guaranteed to appear after any other key in the
+-- database. We rely on its existence in the implementation of
+-- 'tableIteratorLast'.
+--
+initializeRocksDb :: RocksDb -> IO ()
+initializeRocksDb db = R.put
+    (_rocksDbHandle db)
+    R.defaultWriteOptions
+    (_rocksDbNamespace db <> ".")
+    ""
+
+-- | Open a 'RocksDb' and reset it if it already exists.
+--
+resetOpenRocksDb :: FilePath -> IO RocksDb
+resetOpenRocksDb path = do
+    destroyRocksDb path
+    db <- RocksDb <$> R.open path opts <*> mempty
+    initializeRocksDb db
+    return db
+  where
+    opts = modernDefaultOptions { R.errorIfExists = True }
+
+-- | Close a 'RocksDb' instance.
+--
+closeRocksDb :: RocksDb -> IO ()
+closeRocksDb (RocksDb (R.DB db_ptr) _) = C.c_rocksdb_close db_ptr
+
+-- | Provide a computation with a 'RocksDb' instance. If no rocks db exists at
+-- the provided directory path, a new database is created.
+--
+withRocksDb :: FilePath -> R.Options -> (RocksDb -> IO a) -> IO a
+withRocksDb path opts act =
+    withOpts opts $ \(R.Options' opts_ptr _ _) ->
+        bracket (openRocksDb path opts_ptr) closeRocksDb act
+
+-- | Provide a computation with a read-only 'RocksDb' instance. If no rocks db exists at
+-- the provided directory path, a new database is created(?) TODO
+--
+withReadOnlyRocksDb :: FilePath -> R.Options -> (RocksDb -> IO a) -> IO a
+withReadOnlyRocksDb path opts act =
+    withOpts opts $ \(R.Options' opts_ptr _ _) ->
+        bracket (openReadOnlyRocksDb path opts_ptr) closeRocksDb act
+
+-- | Provide a computation with a temporary 'RocksDb'. The database is deleted
+-- when the computation exits.
+--
+withTempRocksDb :: String -> (RocksDb -> IO a) -> IO a
+withTempRocksDb template f = withSystemTempDirectory template $ \dir ->
+    withRocksDb dir modernDefaultOptions f
+
+-- | Delete the RocksDb instance.
+--
+-- This is the preferred method of deleting a rocks db instance. A rocks db
+-- instance may store files in different locations. This function guarantees
+-- that all files are deleted.
+--
+destroyRocksDb :: FilePath -> IO ()
+destroyRocksDb path = R.destroy path opts
+  where
+    opts = modernDefaultOptions { R.createIfMissing = False }
+
+-- -------------------------------------------------------------------------- --
+-- RocksDb Table
+
+-- NOTE: the implementation currently doesn't support iteration over nested
+-- namespaces. This could easily be added via an iterator that iterates over
+-- all respective prefixes ending with '/' or '$' and using a codec that
+-- could handle the types in all sub-namespaces. This could for instance
+-- be useful for iterating over the complete database.
+
+-- | A binary codec for encoding and decoding values that are stored in a
+-- 'RocksDb' Table.
+--
+data Codec a = Codec
+    { _codecEncode :: !(a -> B.ByteString)
+        -- ^ encode a value.
+    , _codecDecode :: !(forall m. MonadThrow m => B.ByteString -> m a)
+        -- ^ decode a value. Throws an exception if decoding fails.
+    }
+
+instance NoThunks (Codec a) where
+    -- NoThunks does not look inside of closures for captured thunks
+    wNoThunks _ _ = return Nothing
+    showTypeOf _ = "Chainweb.Storage.Table.RocksDB.Codec"
+
+-- | A logical table in a 'RocksDb'. Tables in a rocks db have isolated key
+-- namespaces.
+--
+data RocksDbTable k v = RocksDbTable
+    { _rocksDbTableValueCodec :: !(Codec v)
+    , _rocksDbTableKeyCodec :: !(Codec k)
+    , _rocksDbTableNamespace :: !B.ByteString
+    , _rocksDbTableDb :: !R.DB
+    }
+
+instance NoThunks (RocksDbTable k v) where
+    wNoThunks ctx (RocksDbTable a b c d) = allNoThunks
+        [ noThunks ctx a
+        , noThunks ctx b
+        , noThunks ctx c
+        , noThunks ctx (InspectHeapNamed @"Data.RocksDB.Base.DB" d)
+        ]
+    showTypeOf _ = "Chainweb.Storage.Table.RocksDB.RocksDbTable"
+
+-- | Create a new 'RocksDbTable' in the given 'RocksDb'.
+--
+-- Table name components must NOT contain any of '$', '%', and '/'. A user
+-- error is raised if the namespace contains any of these characters.
+--
+newTable
+    :: HasCallStack
+    => RocksDb
+    -> Codec v
+    -> Codec k
+    -> [B.ByteString]
+    -> RocksDbTable k v
+newTable db valCodec keyCodec namespace
+    | any (B8.any (\x -> x `elem` ['$', '%', '/'])) namespace
+        = error $ "Chainweb.Storage.Table.RocksDb.newTable: invalid character in table namespace: " <> sshow namespace
+    | otherwise
+        = RocksDbTable valCodec keyCodec ns (_rocksDbHandle db)
+  where
+    ns = _rocksDbNamespace db <> "-" <> B.intercalate "/" namespace


+-- -------------------------------------------------------------------------- --
+-- Batches
+
+-- | A single update operation for a 'RocksDbTable'
+--
+data RocksDbUpdate
+    = forall k v . RocksDbDelete
+        { _rocksDbUpdateTable :: !(RocksDbTable k v)
+        , _rocksDbUpdateKey :: !k
+        }
+    | forall k v . RocksDbInsert
+        { _rocksDbUpdateTable :: !(RocksDbTable k v)
+        , _rocksDbUpdateKey :: !k
+        , _rocksDbUpdateValue :: !v
+        }
+
+rocksDbUpdateDb :: RocksDbUpdate -> R.DB
+rocksDbUpdateDb (RocksDbDelete t _) = _rocksDbTableDb t
+rocksDbUpdateDb (RocksDbInsert t _ _) = _rocksDbTableDb t
+
+-- | Atomically execute a batch of rocks db updates.
+--
+-- All tables in the batch operations must belong to the same rocks db
+-- instance; otherwise an error is raised.
+--
+updateBatch :: HasCallStack => [RocksDbUpdate] -> IO ()
+updateBatch [] = return ()
+updateBatch batch = R.write rdb R.defaultWriteOptions $ checkMkOp <$> batch
+  where
+    rdb = rocksDbUpdateDb $ head batch
+
+    checkMkOp o
+        | rdb == rocksDbUpdateDb o = mkOp o
+        | otherwise = error "Chainweb.Storage.Table.RocksDB.updateBatch: all operations in a batch must be for the same RocksDB instance."
+
+    mkOp (RocksDbDelete t k) = R.Del (encKey t k)
+    mkOp (RocksDbInsert t k v) = R.Put (encKey t k) (encVal t v)
+
+-- -------------------------------------------------------------------------- --
+-- Table Iterator
+
+-- | An iterator for a 'RocksDbTable'. An iterator is a stateful object that
+-- represents an enumeration of a subset of the entries of an immutable snapshot
+-- of the table.
+--
+-- An iterator should be used single-threaded. Since the iterator retains the
+-- -------------------------------------------------------------------------- --
+-- Table Iterator
+
+-- | An iterator for a 'RocksDbTable'. An iterator is a stateful object that
+-- represents an enumeration of a subset of the entries of an immutable snapshot
+-- of the table.
+--
+-- An iterator should be used single threaded. Since the iterator retains the
+-- snapshot of the database, one should avoid storing iterators over longer
+-- periods of time. After usage it should be released in a timely manner.
+--
+-- The recommended way to create a 'RocksDbTableIter' is to use the function
+-- `withTableIterator`.
+--
+data RocksDbTableIter k v = RocksDbTableIter
+    { _rocksDbTableIterValueCodec :: !(Codec v)
+    , _rocksDbTableIterKeyCodec :: !(Codec k)
+    , _rocksDbTableIterNamespace :: !B.ByteString
+    , _rocksDbTableIter :: !I.Iterator
+    }
+
+instance NoThunks (RocksDbTableIter k v) where
+    wNoThunks ctx (RocksDbTableIter a b c d) = allNoThunks
+        [ noThunks ctx a
+        , noThunks ctx b
+        , noThunks ctx c
+        , noThunks ctx (InspectHeapNamed @"Data.RocksDB.Iterator.Iterator" d)
+        ]
+    showTypeOf _ = "Chainweb.Storage.Table.RocksDB.RocksDbTableIterator"
+
+instance Iterator (RocksDbTableIter k v) k v where
+    iterValid it =
+        I.iterValid (_rocksDbTableIter it)
+
+    iterSeek it = I.iterSeek (_rocksDbTableIter it) . encIterKey it
+
+    iterFirst it =
+        I.iterFirst (_rocksDbTableIter it)
+
+    iterLast it =
+        I.iterLast (_rocksDbTableIter it)
+
+    iterNext = I.iterNext . _rocksDbTableIter
+
+    iterPrev = I.iterPrev . _rocksDbTableIter
+
+    iterEntry it =
+        I.iterEntry (_rocksDbTableIter it) >>= \case
+            Nothing -> return Nothing
+            Just (k, v) -> do
+                k' <- decIterKey it k
+                v' <- decIterVal it v
+                return $! Just $! Entry k' v'
+
+    iterValue it = I.iterValue (_rocksDbTableIter it) >>= \case
+        Nothing -> return Nothing
+        Just v -> Just <$> (evaluate =<< decIterVal it v)
+
+    iterKey it = I.iterKey (_rocksDbTableIter it) >>= \case
+        Nothing -> return Nothing
+        Just k -> Just <$> (evaluate =<< decIterKey it k)
+
+instance IterableTable (RocksDbTable k v) (RocksDbTableIter k v) k v where
+    withTableIterator db k = R.withReadOptions readOptions $ \opts_ptr ->
+        I.withIter (_rocksDbTableDb db) opts_ptr $ \iter -> do
+            let tableIter = makeTableIter iter
+            iterFirst tableIter
+            k tableIter
+      where
+        readOptions = fold
+            [ R.setLowerBound (namespaceFirst $ _rocksDbTableNamespace db)
+            , R.setUpperBound (namespaceLast $ _rocksDbTableNamespace db)
+            -- TODO: this setting tells rocksdb to use prefix seek *when it can*.
+            -- the question remains: is it actually being used?
+            , R.setAutoPrefixMode True
+            ]
+        makeTableIter =
+            RocksDbTableIter
+                (_rocksDbTableValueCodec db)
+                (_rocksDbTableKeyCodec db)
+                (_rocksDbTableNamespace db)
+
+-- | Returns the stream of key-value pairs of a 'RocksDbTableIter'.
+--
+-- The iterator must be released after the stream is consumed. Releasing the
+-- iterator too early while the stream is still in use results in a runtime
+-- error. Not releasing the iterator after the processing of the stream has
+-- finished results in a memory leak.
+--
+iterToEntryStream :: RocksDbTableIter k v -> S.Stream (S.Of (Entry k v)) IO ()
+iterToEntryStream it =
+    liftIO (iterEntry it) >>= \case
+        Nothing -> return ()
+        Just x -> S.yield x >> liftIO (iterNext it) >> iterToEntryStream it
+
+-- | Returns the stream of values of a 'RocksDbTableIter'.
+--
+-- The iterator must be released after the stream is consumed. Releasing the
+-- iterator too early while the stream is still in use results in a runtime
+-- error. Not releasing the iterator after the processing of the stream has
+-- finished results in a memory leak.
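+--
+-- A hedged usage sketch, assuming a table @tbl@ and that @S.toList_@ from
+-- "Streaming.Prelude" is in scope under the same qualified import:
+--
+-- > vs <- withTableIterator tbl $ \it -> S.toList_ (iterToValueStream it)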
+--
+iterToValueStream :: Show k => RocksDbTableIter k v -> S.Stream (S.Of v) IO ()
+iterToValueStream it =
+    liftIO (iterValue it) >>= \case
+        Nothing -> return ()
+        Just x -> S.yield x >> liftIO (iterNext it) >> iterToValueStream it
+
+-- | Returns the stream of values of a 'RocksDbTableIter' in reverse
+-- order.
+--
+-- The iterator must be released after the stream is consumed. Releasing the
+-- iterator too early while the stream is still in use results in a runtime
+-- error. Not releasing the iterator after the processing of the stream has
+-- finished results in a memory leak.
+--
+iterToReverseValueStream :: RocksDbTableIter k v -> S.Stream (S.Of v) IO ()
+iterToReverseValueStream it =
+    liftIO (iterValue it) >>= \case
+        Nothing -> return ()
+        Just x -> S.yield x >> liftIO (iterPrev it) >> iterToReverseValueStream it
+
+-- | Returns the stream of keys of a 'RocksDbTableIter'.
+--
+-- The iterator must be released after the stream is consumed. Releasing the
+-- iterator too early while the stream is still in use results in a runtime
+-- error. Not releasing the iterator after the processing of the stream has
+-- finished results in a memory leak.
+--
+iterToKeyStream :: RocksDbTableIter k v -> S.Stream (S.Of k) IO ()
+iterToKeyStream it =
+    liftIO (iterKey it) >>= \case
+        Nothing -> return ()
+        Just x -> S.yield x >> liftIO (iterNext it) >> iterToKeyStream it
+
+-- -------------------------------------------------------------------------- --
+-- Extremal Table Entries
+
+-- | The maximum key in a 'RocksDbTable'.
+--
+tableMaxKey :: RocksDbTable k v -> IO (Maybe k)
+tableMaxKey = flip withTableIterator $ \i -> iterLast i *> iterKey i
+
+-- | The maximum value in a 'RocksDbTable'.
+--
+tableMaxValue :: RocksDbTable k v -> IO (Maybe v)
+tableMaxValue = flip withTableIterator $ \i -> iterLast i *> iterValue i
+
+-- | The maximum key-value pair in a 'RocksDbTable'.
+--
+tableMaxEntry :: RocksDbTable k v -> IO (Maybe (Entry k v))
+tableMaxEntry = flip withTableIterator $ \i -> iterLast i *> iterEntry i
+
+-- | The minimum key in a 'RocksDbTable'.
+--
+tableMinKey :: RocksDbTable k v -> IO (Maybe k)
+tableMinKey = flip withTableIterator $ \i -> iterFirst i *> iterKey i
+
+-- | The minimum value in a 'RocksDbTable'.
+--
+tableMinValue :: RocksDbTable k v -> IO (Maybe v)
+tableMinValue = flip withTableIterator $ \i -> iterFirst i *> iterValue i
+
+-- | The minimum key-value pair in a 'RocksDbTable'.
+--
+tableMinEntry :: RocksDbTable k v -> IO (Maybe (Entry k v))
+tableMinEntry = flip withTableIterator $ \i -> iterFirst i *> iterEntry i
+
+-- -------------------------------------------------------------------------- --
+-- CAS
+
+-- | A 'RocksDbTable k v' is an instance of 'ReadableTable'.
+--
+instance forall k v. ReadableTable (RocksDbTable k v) k v where
+    tableLookup db k = do
+        maybeBytes <- getRocksDb (_rocksDbTableDb db) mempty (encKey db k)
+        traverse (decVal db) maybeBytes
+
+    tableMember db k =
+        isJust <$> getRocksDb (_rocksDbTableDb db) mempty (encKey db k)
+
+    -- | @tableLookupBatch db ks@ returns for each @k@ in @ks@ 'Just' the value at
+    -- key @k@ in the 'RocksDbTable' @db@ if it exists, or 'Nothing' if the @k@
+    -- doesn't exist in the table.
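+    --
+    -- A hedged sketch of the behavior described above (@tbl@, @k1@, @k2@ are
+    -- illustrative; if only @k1@ is present, the result is @[Just v1, Nothing]@):
+    --
+    -- > results <- tableLookupBatch tbl [k1, k2]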
+    --
+    tableLookupBatch' db t = unsafePartsOf t $ \ks -> do
+        results <- multiGet (_rocksDbTableDb db) mempty (map (encKey db) ks)
+        forM results $ \case
+            Left e -> error $ "Chainweb.Storage.Table.RocksDB.tableLookupBatch: " <> e
+            Right x -> traverse (decVal db) x
+
+-- | A 'RocksDbTable k v' is an instance of 'Table'.
+--
+instance Table (RocksDbTable k v) k v where
+    tableInsert db k v = R.put
+        (_rocksDbTableDb db)
+        R.defaultWriteOptions
+        (encKey db k)
+        (encVal db v)
+
+    tableDelete db k = R.delete
+        (_rocksDbTableDb db)
+        R.defaultWriteOptions
+        (encKey db k)
+
+    tableInsertBatch db kvs =
+        updateBatch (uncurry (RocksDbInsert db) <$> kvs)
+
+    tableDeleteBatch db ks =
+        updateBatch (RocksDbDelete db <$> ks)
+
+-- -------------------------------------------------------------------------- --
+-- Exceptions
+
+-- | Exceptions that can be thrown by functions in this module.
+--
+data RocksDbException
+    = RocksDbTableIterInvalidKeyNamespace !(Expected B.ByteString) !(Actual B.ByteString)
+    deriving (Eq, Ord, Generic, NoThunks)
+
+instance Exception RocksDbException where
+    displayException (RocksDbTableIterInvalidKeyNamespace e a)
+        = T.unpack $ unexpectedMsg "Chainweb.Storage.Table.RocksDB: invalid table key" e a
+
+instance Show RocksDbException where
+    show = displayException
+
+-- -------------------------------------------------------------------------- --
+-- Table Utils
+
+encVal :: RocksDbTable k v -> v -> B.ByteString
+encVal = _codecEncode . _rocksDbTableValueCodec
+
+encKey :: RocksDbTable k v -> k -> B.ByteString
+encKey it k = namespaceFirst ns <> _codecEncode (_rocksDbTableKeyCodec it) k
+  where
+    ns = _rocksDbTableNamespace it
+
+decVal :: MonadThrow m => RocksDbTable k v -> B.ByteString -> m v
+decVal tbl = _codecDecode $ _rocksDbTableValueCodec tbl
+
+-- -------------------------------------------------------------------------- --
+-- Iter Utils
+
+namespaceFirst :: B.ByteString -> B.ByteString
+namespaceFirst ns = ns <> "$"
+
+namespaceLast :: B.ByteString -> B.ByteString
+namespaceLast ns = ns <> "%"
+
+encIterKey :: RocksDbTableIter k v -> k -> B.ByteString
+encIterKey it k = namespaceFirst ns <> _codecEncode (_rocksDbTableIterKeyCodec it) k
+  where
+    ns = _rocksDbTableIterNamespace it
+
+decIterVal :: MonadThrow m => RocksDbTableIter k v -> B.ByteString -> m v
+decIterVal i = _codecDecode (_rocksDbTableIterValueCodec i)
+
+decIterKey :: MonadThrow m => RocksDbTableIter k v -> B.ByteString -> m k
+decIterKey it k =
+    _codecDecode (_rocksDbTableIterKeyCodec it) (B.drop (B.length prefix) k)
+  where
+    prefix = namespaceFirst $ _rocksDbTableIterNamespace it
+
+checked :: HasCallStack => String -> (Ptr CString -> IO a) -> IO a
+checked whatWasIDoing act = alloca $ \errPtr -> do
+    poke errPtr (nullPtr :: CString)
+    r <- act errPtr
+    err <- peek errPtr
+    unless (err == nullPtr) $ do
+        errStr <- B.packCString err
+        let msg = unwords ["Chainweb.Storage.Table.RocksDB.checked: error while", whatWasIDoing <> ":", B8.unpack errStr]
+        C.c_rocksdb_free err
+        error msg
+    return r
+
+-- | To unconditionally flush the WAL before making the checkpoint, set
+-- logSizeFlushThreshold to zero. To *never* flush the WAL, set
+-- logSizeFlushThreshold to @maxBound :: CULong@.
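+--
+-- A hedged sketch (the target directory is illustrative):
+--
+-- > -- flush the WAL, then write a checkpoint under the given directory
+-- > checkpointRocksDb db 0 "/var/backups/chainweb-checkpoint"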
+checkpointRocksDb :: RocksDb -> CULong -> FilePath -> IO ()
+checkpointRocksDb RocksDb { _rocksDbHandle = R.DB dbPtr } logSizeFlushThreshold path =
+    bracket mkCheckpointObject C.rocksdb_checkpoint_object_destroy mkCheckpoint
+  where
+    mkCheckpointObject =
+        checked "creating checkpoint object" $
+            C.rocksdb_checkpoint_object_create dbPtr
+    mkCheckpoint cp =
+        withCString path $ \path' ->
+            checked "creating checkpoint" $
+                C.rocksdb_checkpoint_create cp path' logSizeFlushThreshold
+
+validateRangeOrdered :: HasCallStack => RocksDbTable k v -> (Maybe k, Maybe k) -> (B.ByteString, B.ByteString)
+validateRangeOrdered table (Just (encKey table -> l), Just (encKey table -> u))
+    | l >= u =
+        error "Chainweb.Storage.Table.RocksDB.validateRangeOrdered: range bounds not ordered according to codec"
+    | otherwise = (l, u)
+validateRangeOrdered table (l, u) =
+    ( maybe (namespaceFirst (_rocksDbTableNamespace table)) (encKey table) l
+    , maybe (namespaceLast (_rocksDbTableNamespace table)) (encKey table) u
+    )
+
+-- | Batch delete a range of keys in a table.
+-- Throws if the range of the *encoded keys* is not ordered (lower, upper).
+deleteRangeRocksDb :: HasCallStack => RocksDbTable k v -> (Maybe k, Maybe k) -> IO ()
+deleteRangeRocksDb table range = do
+    let !range' = validateRangeOrdered table range
+    let R.DB dbPtr = _rocksDbTableDb table
+    R.withCWriteOpts R.defaultWriteOptions $ \optsPtr ->
+        BU.unsafeUseAsCStringLen (fst range') $ \(minKeyPtr, minKeyLen) ->
+            BU.unsafeUseAsCStringLen (snd range') $ \(maxKeyPtr, maxKeyLen) ->
+                checked "Chainweb.Storage.Table.RocksDB.deleteRangeRocksDb" $
+                    C.rocksdb_delete_range dbPtr optsPtr
+                        minKeyPtr (fromIntegral minKeyLen :: CSize)
+                        maxKeyPtr (fromIntegral maxKeyLen :: CSize)
+
+compactRangeRocksDb :: HasCallStack => RocksDbTable k v -> (Maybe k, Maybe k) -> IO ()
+compactRangeRocksDb table range =
+    BU.unsafeUseAsCStringLen (fst range') $ \(minKeyPtr, minKeyLen) ->
+        BU.unsafeUseAsCStringLen (snd range') $ \(maxKeyPtr, maxKeyLen) ->
+            C.rocksdb_compact_range dbPtr
+                minKeyPtr (fromIntegral minKeyLen :: CSize)
+                maxKeyPtr (fromIntegral maxKeyLen :: CSize)
+  where
+    !range' = validateRangeOrdered table range
+    R.DB dbPtr = _rocksDbTableDb table
+
+-- | Read a value by key.
+-- One less copy than the version in rocksdb-haskell, by using
+-- unsafePackCStringFinalizer.
+getRocksDb :: MonadIO m => R.DB -> R.ReadOptions -> ByteString -> m (Maybe ByteString)
+getRocksDb (R.DB db_ptr) opts key = liftIO $ R.withReadOptions opts $ \opts_ptr ->
+    BU.unsafeUseAsCStringLen key $ \(key_ptr, klen) ->
+        alloca $ \vlen_ptr -> do
+            val_ptr <- checked "Chainweb.Storage.Table.RocksDB.get" $
+                C.c_rocksdb_get db_ptr opts_ptr key_ptr (R.intToCSize klen) vlen_ptr
+            vlen <- peek vlen_ptr
+            if val_ptr == nullPtr
+                then return Nothing
+                else do
+                    Just <$> BU.unsafePackCStringFinalizer
+                        ((coerce :: Ptr CChar -> Ptr Word8) val_ptr)
+                        (R.cSizeToInt vlen)
+                        (C.c_rocksdb_free val_ptr)
+
+-- | Marshal a 'FilePath' (Haskell string) into a NUL-terminated C string using
+-- temporary storage.
+-- On Linux, UTF-8 is almost always the encoding used.
+-- On Windows, UTF-8 can also be used, although the platform default is
+-- UTF-16. For a more detailed explanation, please refer to
+-- https://msdn.microsoft.com/en-us/library/windows/desktop/dd374081(v=vs.85).aspx.
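+--
+-- A minimal sketch (the path is illustrative; the 'CString' is only valid
+-- within the callback):
+--
+-- > withFilePath "/tmp/chainweb-db" B.packCString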
+withFilePath :: FilePath -> (CString -> IO a) -> IO a +withFilePath = GHC.withCString GHC.utf8 + +-- -------------------------------------------------------------------------- -- +-- Multi Get + +multiGet + :: MonadIO m + => R.DB + -> R.ReadOptions + -> [ByteString] + -> m [Either String (Maybe ByteString)] +multiGet (R.DB db_ptr) opts keys = liftIO $ R.withReadOptions opts $ \opts_ptr -> + allocaArray len $ \keysArray -> + allocaArray len $ \keySizesArray -> + allocaArray len $ \valuesArray -> + allocaArray len $ \valueSizesArray -> + allocaArray len $ \errsArray -> + let go i (key : ks) = + BU.unsafeUseAsCStringLen key $ \(keyPtr, keyLen) -> do + pokeElemOff keysArray i keyPtr + pokeElemOff keySizesArray i (R.intToCSize keyLen) + go (i + 1) ks + + go _ [] = do + C.rocksdb_multi_get db_ptr opts_ptr (R.intToCSize len) + keysArray keySizesArray + valuesArray valueSizesArray + errsArray + forM [0..len-1] $ \i -> do + valuePtr <- peekElemOff valuesArray i + if valuePtr /= nullPtr + then do + valueLen <- R.cSizeToInt <$> peekElemOff valueSizesArray i + r <- BU.unsafePackCStringFinalizer (castPtr valuePtr) valueLen (C.c_rocksdb_free valuePtr) + return $ Right $ Just r + else do + errPtr <- peekElemOff errsArray i + if errPtr /= nullPtr + then do + errLen <- BI.c_strlen errPtr + err <- B8.unpack <$> BU.unsafePackCStringFinalizer (castPtr errPtr) (fromIntegral errLen :: Int) (C.c_rocksdb_free errPtr) + return $ Left err + else + return $ Right Nothing + in go 0 keys + where + len = length keys From f6140bc72a8c2dfb9511e987d7ea240f2febcfa1 Mon Sep 17 00:00:00 2001 From: chessai Date: Wed, 26 Jun 2024 20:15:31 -0500 Subject: [PATCH 2/4] fold in rocksdb-haskell-kadena Change-Id: If6e2cceb4b712b80c3a5466035e309f7215adf8a --- cabal.project | 9 - cabal.project.freeze | 4 +- chainweb.cabal | 463 +- vendored/rocksdb-haskell-kadena/CHANGELOG.md | 35 + vendored/rocksdb-haskell-kadena/LICENSE | 31 + vendored/rocksdb-haskell-kadena/README.md | 47 + .../cpp/chainweb-rocksdb.cpp | 132 + .../cpp/chainweb-rocksdb.h | 15 + .../rocksdb-8.3.2/db/arena_wrapped_db_iter.cc | 163 + .../rocksdb-8.3.2/db/arena_wrapped_db_iter.h | 127 + .../rocksdb-8.3.2/db/blob/blob_constants.h | 16 + .../rocksdb-8.3.2/db/blob/blob_contents.cc | 42 + .../rocksdb-8.3.2/db/blob/blob_contents.h | 59 + .../db/blob/blob_counting_iterator.h | 150 + .../rocksdb-8.3.2/db/blob/blob_fetcher.cc | 34 + .../rocksdb-8.3.2/db/blob/blob_fetcher.h | 37 + .../db/blob/blob_file_addition.cc | 156 + .../db/blob/blob_file_addition.h | 67 + .../db/blob/blob_file_builder.cc | 426 + .../rocksdb-8.3.2/db/blob/blob_file_builder.h | 112 + .../rocksdb-8.3.2/db/blob/blob_file_cache.cc | 101 + .../rocksdb-8.3.2/db/blob/blob_file_cache.h | 56 + .../db/blob/blob_file_completion_callback.h | 84 + .../db/blob/blob_file_garbage.cc | 134 + .../rocksdb-8.3.2/db/blob/blob_file_garbage.h | 57 + .../rocksdb-8.3.2/db/blob/blob_file_meta.cc | 62 + .../rocksdb-8.3.2/db/blob/blob_file_meta.h | 170 + .../rocksdb-8.3.2/db/blob/blob_file_reader.cc | 624 + .../rocksdb-8.3.2/db/blob/blob_file_reader.h | 112 + .../db/blob/blob_garbage_meter.cc | 100 + .../db/blob/blob_garbage_meter.h | 102 + .../rocksdb-8.3.2/db/blob/blob_index.h | 187 + .../rocksdb-8.3.2/db/blob/blob_log_format.cc | 143 + .../rocksdb-8.3.2/db/blob/blob_log_format.h | 164 + .../db/blob/blob_log_sequential_reader.cc | 134 + .../db/blob/blob_log_sequential_reader.h | 83 + .../rocksdb-8.3.2/db/blob/blob_log_writer.cc | 178 + .../rocksdb-8.3.2/db/blob/blob_log_writer.h | 83 + .../rocksdb-8.3.2/db/blob/blob_read_request.h | 
58 + .../rocksdb-8.3.2/db/blob/blob_source.cc | 459 + .../rocksdb-8.3.2/db/blob/blob_source.h | 161 + .../db/blob/prefetch_buffer_collection.cc | 21 + .../db/blob/prefetch_buffer_collection.h | 38 + .../rocksdb-8.3.2/db/builder.cc | 447 + .../rocksdb-8.3.2/db/builder.h | 78 + .../rocksdb-8.3.2/db/c.cc | 6559 +++++ .../rocksdb-8.3.2/db/column_family.cc | 1708 ++ .../rocksdb-8.3.2/db/column_family.h | 851 + .../db/compaction/clipping_iterator.h | 281 + .../rocksdb-8.3.2/db/compaction/compaction.cc | 857 + .../rocksdb-8.3.2/db/compaction/compaction.h | 565 + .../compaction/compaction_iteration_stats.h | 49 + .../db/compaction/compaction_iterator.cc | 1445 + .../db/compaction/compaction_iterator.h | 530 + .../db/compaction/compaction_job.cc | 2054 ++ .../db/compaction/compaction_job.h | 504 + .../db/compaction/compaction_outputs.cc | 784 + .../db/compaction/compaction_outputs.h | 403 + .../db/compaction/compaction_picker.cc | 1225 + .../db/compaction/compaction_picker.h | 318 + .../db/compaction/compaction_picker_fifo.cc | 471 + .../db/compaction/compaction_picker_fifo.h | 60 + .../db/compaction/compaction_picker_level.cc | 888 + .../db/compaction/compaction_picker_level.h | 33 + .../compaction/compaction_picker_universal.cc | 1452 + .../compaction/compaction_picker_universal.h | 30 + .../db/compaction/compaction_service_job.cc | 833 + .../db/compaction/compaction_state.cc | 46 + .../db/compaction/compaction_state.h | 42 + .../rocksdb-8.3.2/db/compaction/file_pri.h | 92 + .../db/compaction/sst_partitioner.cc | 83 + .../db/compaction/subcompaction_state.cc | 106 + .../db/compaction/subcompaction_state.h | 220 + .../rocksdb-8.3.2/db/convenience.cc | 81 + .../rocksdb-8.3.2/db/db_filesnapshot.cc | 436 + .../db/db_impl/compacted_db_impl.cc | 260 + .../db/db_impl/compacted_db_impl.h | 157 + .../rocksdb-8.3.2/db/db_impl/db_impl.cc | 6132 +++++ .../rocksdb-8.3.2/db/db_impl/db_impl.h | 2841 ++ .../db/db_impl/db_impl_compaction_flush.cc | 3980 +++ .../rocksdb-8.3.2/db/db_impl/db_impl_debug.cc | 314 + .../db/db_impl/db_impl_experimental.cc | 159 + .../rocksdb-8.3.2/db/db_impl/db_impl_files.cc | 1014 + .../rocksdb-8.3.2/db/db_impl/db_impl_open.cc | 2202 ++ .../db/db_impl/db_impl_readonly.cc | 334 + .../db/db_impl/db_impl_readonly.h | 173 + .../db/db_impl/db_impl_secondary.cc | 964 + .../db/db_impl/db_impl_secondary.h | 406 + .../rocksdb-8.3.2/db/db_impl/db_impl_write.cc | 2487 ++ .../rocksdb-8.3.2/db/db_info_dumper.cc | 147 + .../rocksdb-8.3.2/db/db_info_dumper.h | 15 + .../rocksdb-8.3.2/db/db_iter.cc | 1708 ++ .../rocksdb-8.3.2/db/db_iter.h | 411 + .../rocksdb-8.3.2/db/db_test_util.h | 1340 + .../db/db_with_timestamp_test_util.h | 126 + .../rocksdb-8.3.2/db/dbformat.cc | 213 + .../rocksdb-8.3.2/db/dbformat.h | 869 + .../rocksdb-8.3.2/db/error_handler.cc | 792 + .../rocksdb-8.3.2/db/error_handler.h | 124 + .../rocksdb-8.3.2/db/event_helpers.cc | 323 + .../rocksdb-8.3.2/db/event_helpers.h | 78 + .../rocksdb-8.3.2/db/experimental.cc | 144 + .../db/external_sst_file_ingestion_job.cc | 1104 + .../db/external_sst_file_ingestion_job.h | 238 + .../rocksdb-8.3.2/db/file_indexer.cc | 218 + .../rocksdb-8.3.2/db/file_indexer.h | 140 + .../rocksdb-8.3.2/db/flush_job.cc | 1097 + .../rocksdb-8.3.2/db/flush_job.h | 200 + .../rocksdb-8.3.2/db/flush_scheduler.cc | 86 + .../rocksdb-8.3.2/db/flush_scheduler.h | 55 + .../rocksdb-8.3.2/db/forward_iterator.cc | 1070 + .../rocksdb-8.3.2/db/forward_iterator.h | 166 + .../db/history_trimming_iterator.h | 95 + .../db/import_column_family_job.cc | 364 + 
.../db/import_column_family_job.h | 82 + .../rocksdb-8.3.2/db/internal_stats.cc | 2121 ++ .../rocksdb-8.3.2/db/internal_stats.h | 875 + .../rocksdb-8.3.2/db/job_context.h | 237 + .../rocksdb-8.3.2/db/kv_checksum.h | 484 + .../rocksdb-8.3.2/db/log_format.h | 55 + .../rocksdb-8.3.2/db/log_reader.cc | 934 + .../rocksdb-8.3.2/db/log_reader.h | 241 + .../rocksdb-8.3.2/db/log_writer.cc | 279 + .../rocksdb-8.3.2/db/log_writer.h | 144 + .../db/logs_with_prep_tracker.cc | 67 + .../rocksdb-8.3.2/db/logs_with_prep_tracker.h | 62 + .../rocksdb-8.3.2/db/lookup_key.h | 68 + .../rocksdb-8.3.2/db/malloc_stats.cc | 53 + .../rocksdb-8.3.2/db/malloc_stats.h | 22 + .../rocksdb-8.3.2/db/memtable.cc | 1675 ++ .../rocksdb-8.3.2/db/memtable.h | 660 + .../rocksdb-8.3.2/db/memtable_list.cc | 987 + .../rocksdb-8.3.2/db/memtable_list.h | 471 + .../rocksdb-8.3.2/db/merge_context.h | 147 + .../rocksdb-8.3.2/db/merge_helper.cc | 606 + .../rocksdb-8.3.2/db/merge_helper.h | 224 + .../rocksdb-8.3.2/db/merge_operator.cc | 85 + .../rocksdb-8.3.2/db/output_validator.cc | 33 + .../rocksdb-8.3.2/db/output_validator.h | 48 + .../db/periodic_task_scheduler.cc | 111 + .../db/periodic_task_scheduler.h | 108 + .../db/pinned_iterators_manager.h | 92 + .../rocksdb-8.3.2/db/post_memtable_callback.h | 25 + .../rocksdb-8.3.2/db/pre_release_callback.h | 37 + .../rocksdb-8.3.2/db/range_del_aggregator.cc | 555 + .../rocksdb-8.3.2/db/range_del_aggregator.h | 478 + .../db/range_tombstone_fragmenter.cc | 502 + .../db/range_tombstone_fragmenter.h | 356 + .../rocksdb-8.3.2/db/read_callback.h | 54 + .../rocksdb-8.3.2/db/repair.cc | 839 + .../rocksdb-8.3.2/db/seqno_to_time_mapping.cc | 341 + .../rocksdb-8.3.2/db/seqno_to_time_mapping.h | 189 + .../rocksdb-8.3.2/db/snapshot_checker.h | 58 + .../rocksdb-8.3.2/db/snapshot_impl.cc | 25 + .../rocksdb-8.3.2/db/snapshot_impl.h | 239 + .../rocksdb-8.3.2/db/table_cache.cc | 710 + .../rocksdb-8.3.2/db/table_cache.h | 283 + .../db/table_cache_sync_and_async.h | 130 + .../db/table_properties_collector.cc | 74 + .../db/table_properties_collector.h | 177 + .../rocksdb-8.3.2/db/transaction_log_impl.cc | 296 + .../rocksdb-8.3.2/db/transaction_log_impl.h | 128 + .../db/trim_history_scheduler.cc | 54 + .../rocksdb-8.3.2/db/trim_history_scheduler.h | 46 + .../rocksdb-8.3.2/db/version_builder.cc | 1426 + .../rocksdb-8.3.2/db/version_builder.h | 93 + .../rocksdb-8.3.2/db/version_edit.cc | 1059 + .../rocksdb-8.3.2/db/version_edit.h | 702 + .../rocksdb-8.3.2/db/version_edit_handler.cc | 1017 + .../rocksdb-8.3.2/db/version_edit_handler.h | 334 + .../rocksdb-8.3.2/db/version_set.cc | 7286 +++++ .../rocksdb-8.3.2/db/version_set.h | 1714 ++ .../db/version_set_sync_and_async.h | 172 + .../rocksdb-8.3.2/db/version_util.h | 72 + .../rocksdb-8.3.2/db/wal_edit.cc | 211 + .../rocksdb-8.3.2/db/wal_edit.h | 177 + .../rocksdb-8.3.2/db/wal_manager.cc | 527 + .../rocksdb-8.3.2/db/wal_manager.h | 136 + .../db/wide/wide_column_serialization.cc | 182 + .../db/wide/wide_column_serialization.h | 77 + .../rocksdb-8.3.2/db/wide/wide_columns.cc | 22 + .../rocksdb-8.3.2/db/write_batch.cc | 3142 +++ .../rocksdb-8.3.2/db/write_batch_base.cc | 94 + .../rocksdb-8.3.2/db/write_batch_internal.h | 401 + .../rocksdb-8.3.2/db/write_callback.h | 27 + .../rocksdb-8.3.2/db/write_controller.cc | 121 + .../rocksdb-8.3.2/db/write_controller.h | 148 + .../rocksdb-8.3.2/db/write_stall_stats.cc | 179 + .../rocksdb-8.3.2/db/write_stall_stats.h | 47 + .../rocksdb-8.3.2/db/write_thread.cc | 844 + .../rocksdb-8.3.2/db/write_thread.h | 464 + 
.../db_stress_tool/db_stress_common.h | 674 + .../db_stress_compaction_filter.h | 96 + .../db_stress_tool/db_stress_driver.h | 18 + .../db_stress_tool/db_stress_env_wrapper.h | 78 + .../db_stress_tool/db_stress_listener.h | 269 + .../db_stress_tool/db_stress_shared_state.h | 437 + .../db_stress_tool/db_stress_stat.h | 219 + .../db_stress_table_properties_collector.h | 65 + .../db_stress_tool/db_stress_test_base.h | 327 + .../db_stress_tool/expected_state.h | 329 + .../db_stress_tool/expected_value.h | 206 + .../db_stress_tool/multi_ops_txns_stress.h | 446 + .../rocksdb-8.3.2/env/composite_env.cc | 534 + .../rocksdb-8.3.2/env/composite_env_wrapper.h | 378 + .../rocksdb-8.3.2/env/emulated_clock.h | 114 + .../rocksdb-8.3.2/env/env.cc | 1233 + .../rocksdb-8.3.2/env/env_chroot.cc | 148 + .../rocksdb-8.3.2/env/env_chroot.h | 55 + .../rocksdb-8.3.2/env/env_encryption.cc | 1346 + .../rocksdb-8.3.2/env/env_encryption_ctr.h | 114 + .../rocksdb-8.3.2/env/env_posix.cc | 524 + .../rocksdb-8.3.2/env/file_system.cc | 277 + .../rocksdb-8.3.2/env/file_system_tracer.cc | 564 + .../rocksdb-8.3.2/env/file_system_tracer.h | 461 + .../rocksdb-8.3.2/env/fs_posix.cc | 1284 + .../rocksdb-8.3.2/env/fs_readonly.h | 105 + .../rocksdb-8.3.2/env/fs_remap.cc | 341 + .../rocksdb-8.3.2/env/fs_remap.h | 137 + .../rocksdb-8.3.2/env/io_posix.cc | 1733 ++ .../rocksdb-8.3.2/env/io_posix.h | 523 + .../rocksdb-8.3.2/env/mock_env.cc | 1058 + .../rocksdb-8.3.2/env/mock_env.h | 144 + .../rocksdb-8.3.2/env/unique_id_gen.cc | 164 + .../rocksdb-8.3.2/env/unique_id_gen.h | 85 + .../rocksdb-8.3.2/file/delete_scheduler.cc | 409 + .../rocksdb-8.3.2/file/delete_scheduler.h | 147 + .../file/file_prefetch_buffer.cc | 955 + .../rocksdb-8.3.2/file/file_prefetch_buffer.h | 471 + .../rocksdb-8.3.2/file/file_util.cc | 277 + .../rocksdb-8.3.2/file/file_util.h | 91 + .../rocksdb-8.3.2/file/filename.cc | 523 + .../rocksdb-8.3.2/file/filename.h | 188 + .../rocksdb-8.3.2/file/line_file_reader.cc | 73 + .../rocksdb-8.3.2/file/line_file_reader.h | 60 + .../file/random_access_file_reader.cc | 599 + .../file/random_access_file_reader.h | 210 + .../rocksdb-8.3.2/file/read_write_util.cc | 33 + .../rocksdb-8.3.2/file/read_write_util.h | 31 + .../rocksdb-8.3.2/file/readahead_file_info.h | 33 + .../rocksdb-8.3.2/file/readahead_raf.cc | 169 + .../rocksdb-8.3.2/file/readahead_raf.h | 29 + .../file/sequence_file_reader.cc | 320 + .../rocksdb-8.3.2/file/sequence_file_reader.h | 119 + .../file/sst_file_manager_impl.cc | 507 + .../file/sst_file_manager_impl.h | 193 + .../file/writable_file_writer.cc | 989 + .../rocksdb-8.3.2/file/writable_file_writer.h | 320 + .../rocksdb-8.3.2/fuzz/util.h | 29 + .../include/rocksdb/advanced_cache.h | 623 + .../include/rocksdb/advanced_options.h | 1177 + .../rocksdb/block_cache_trace_writer.h | 149 + .../rocksdb-8.3.2/include/rocksdb/c.h | 2864 ++ .../rocksdb-8.3.2/include/rocksdb/cache.h | 442 + .../include/rocksdb/cache_bench_tool.h | 14 + .../rocksdb-8.3.2/include/rocksdb/cleanable.h | 128 + .../include/rocksdb/compaction_filter.h | 363 + .../include/rocksdb/compaction_job_stats.h | 109 + .../include/rocksdb/comparator.h | 164 + .../include/rocksdb/compression_type.h | 40 + .../include/rocksdb/concurrent_task_limiter.h | 51 + .../include/rocksdb/configurable.h | 390 + .../include/rocksdb/convenience.h | 466 + .../include/rocksdb/customizable.h | 229 + .../include/rocksdb/data_structure.h | 186 + .../rocksdb-8.3.2/include/rocksdb/db.h | 1980 ++ .../include/rocksdb/db_bench_tool.h | 11 + .../include/rocksdb/db_dump_tool.h | 43 + 
.../include/rocksdb/db_stress_tool.h | 11 + .../rocksdb-8.3.2/include/rocksdb/env.h | 1882 ++ .../include/rocksdb/env_encryption.h | 463 + .../include/rocksdb/experimental.h | 56 + .../include/rocksdb/file_checksum.h | 146 + .../include/rocksdb/file_system.h | 1849 ++ .../include/rocksdb/filter_policy.h | 206 + .../include/rocksdb/flush_block_policy.h | 75 + .../include/rocksdb/functor_wrapper.h | 56 + .../rocksdb-8.3.2/include/rocksdb/io_status.h | 244 + .../include/rocksdb/iostats_context.h | 98 + .../rocksdb-8.3.2/include/rocksdb/iterator.h | 144 + .../rocksdb-8.3.2/include/rocksdb/ldb_tool.h | 42 + .../rocksdb-8.3.2/include/rocksdb/listener.h | 840 + .../include/rocksdb/memory_allocator.h | 87 + .../include/rocksdb/memtablerep.h | 421 + .../include/rocksdb/merge_operator.h | 286 + .../rocksdb-8.3.2/include/rocksdb/metadata.h | 258 + .../rocksdb-8.3.2/include/rocksdb/options.h | 2083 ++ .../include/rocksdb/perf_context.h | 314 + .../include/rocksdb/perf_level.h | 36 + .../include/rocksdb/persistent_cache.h | 74 + .../rocksdb-8.3.2/include/rocksdb/port_defs.h | 22 + .../include/rocksdb/rate_limiter.h | 159 + .../include/rocksdb/rocksdb_namespace.h | 16 + .../include/rocksdb/secondary_cache.h | 139 + .../rocksdb-8.3.2/include/rocksdb/slice.h | 264 + .../include/rocksdb/slice_transform.h | 135 + .../rocksdb-8.3.2/include/rocksdb/snapshot.h | 53 + .../include/rocksdb/sst_dump_tool.h | 17 + .../include/rocksdb/sst_file_manager.h | 136 + .../include/rocksdb/sst_file_reader.h | 45 + .../include/rocksdb/sst_file_writer.h | 189 + .../include/rocksdb/sst_partitioner.h | 142 + .../include/rocksdb/statistics.h | 757 + .../include/rocksdb/stats_history.h | 70 + .../rocksdb-8.3.2/include/rocksdb/status.h | 574 + .../include/rocksdb/system_clock.h | 114 + .../rocksdb-8.3.2/include/rocksdb/table.h | 927 + .../include/rocksdb/table_properties.h | 332 + .../include/rocksdb/table_reader_caller.h | 41 + .../include/rocksdb/thread_status.h | 190 + .../include/rocksdb/threadpool.h | 67 + .../include/rocksdb/trace_reader_writer.h | 52 + .../include/rocksdb/trace_record.h | 248 + .../include/rocksdb/trace_record_result.h | 187 + .../include/rocksdb/transaction_log.h | 122 + .../rocksdb-8.3.2/include/rocksdb/types.h | 94 + .../rocksdb-8.3.2/include/rocksdb/unique_id.h | 55 + .../include/rocksdb/universal_compaction.h | 96 + .../include/rocksdb/utilities/agg_merge.h | 138 + .../include/rocksdb/utilities/backup_engine.h | 689 + .../rocksdb/utilities/cache_dump_load.h | 140 + .../include/rocksdb/utilities/checkpoint.h | 63 + .../include/rocksdb/utilities/convenience.h | 10 + .../rocksdb/utilities/customizable_util.h | 322 + .../include/rocksdb/utilities/db_ttl.h | 70 + .../include/rocksdb/utilities/debug.h | 46 + .../include/rocksdb/utilities/env_mirror.h | 179 + .../rocksdb/utilities/info_log_finder.h | 19 + .../include/rocksdb/utilities/ldb_cmd.h | 316 + .../utilities/ldb_cmd_execute_result.h | 75 + .../rocksdb/utilities/leveldb_options.h | 145 + .../utilities/lua/rocks_lua_custom_library.h | 43 + .../rocksdb/utilities/lua/rocks_lua_util.h | 55 + .../include/rocksdb/utilities/memory_util.h | 48 + .../rocksdb/utilities/object_registry.h | 583 + .../utilities/optimistic_transaction_db.h | 98 + .../utilities/option_change_migration.h | 24 + .../include/rocksdb/utilities/options_type.h | 1221 + .../include/rocksdb/utilities/options_util.h | 105 + .../include/rocksdb/utilities/replayer.h | 85 + .../include/rocksdb/utilities/sim_cache.h | 93 + .../include/rocksdb/utilities/stackable_db.h | 587 + 
.../utilities/table_properties_collectors.h | 88 + .../include/rocksdb/utilities/transaction.h | 683 + .../rocksdb/utilities/transaction_db.h | 506 + .../rocksdb/utilities/transaction_db_mutex.h | 89 + .../utilities/write_batch_with_index.h | 307 + .../rocksdb-8.3.2/include/rocksdb/version.h | 43 + .../include/rocksdb/wal_filter.h | 111 + .../include/rocksdb/wide_columns.h | 210 + .../include/rocksdb/write_batch.h | 494 + .../include/rocksdb/write_batch_base.h | 144 + .../include/rocksdb/write_buffer_manager.h | 183 + .../compaction_filter_factory_jnicallback.h | 37 + .../java/rocksjni/comparatorjnicallback.h | 137 + .../java/rocksjni/cplusplus_to_java_convert.h | 37 + .../rocksjni/event_listener_jnicallback.h | 122 + .../rocksdb-8.3.2/java/rocksjni/jnicallback.h | 32 + .../java/rocksjni/loggerjnicallback.h | 51 + .../rocksdb-8.3.2/java/rocksjni/portal.h | 8706 ++++++ .../java/rocksjni/statisticsjni.h | 34 + .../java/rocksjni/table_filter_jnicallback.h | 36 + .../java/rocksjni/trace_writer_jnicallback.h | 36 + .../transaction_notifier_jnicallback.h | 42 + .../java/rocksjni/wal_filter_jnicallback.h | 42 + .../rocksjni/writebatchhandlerjnicallback.h | 92 + .../rocksdb-8.3.2/logging/auto_roll_logger.cc | 368 + .../rocksdb-8.3.2/logging/auto_roll_logger.h | 166 + .../rocksdb-8.3.2/logging/env_logger.h | 195 + .../rocksdb-8.3.2/logging/event_logger.cc | 68 + .../rocksdb-8.3.2/logging/event_logger.h | 202 + .../rocksdb-8.3.2/logging/log_buffer.cc | 91 + .../rocksdb-8.3.2/logging/log_buffer.h | 57 + .../rocksdb-8.3.2/logging/logging.h | 62 + .../rocksdb-8.3.2/memory/allocator.h | 58 + .../rocksdb-8.3.2/memory/arena.cc | 170 + .../rocksdb-8.3.2/memory/arena.h | 135 + .../rocksdb-8.3.2/memory/concurrent_arena.cc | 45 + .../rocksdb-8.3.2/memory/concurrent_arena.h | 215 + .../memory/jemalloc_nodump_allocator.cc | 303 + .../memory/jemalloc_nodump_allocator.h | 99 + .../memory/memkind_kmem_allocator.cc | 44 + .../memory/memkind_kmem_allocator.h | 43 + .../rocksdb-8.3.2/memory/memory_allocator.cc | 80 + .../memory/memory_allocator_impl.h | 47 + .../rocksdb-8.3.2/memory/memory_usage.h | 38 + .../rocksdb-8.3.2/memtable/alloc_tracker.cc | 63 + .../memtable/hash_linklist_rep.cc | 924 + .../memtable/hash_skiplist_rep.cc | 391 + .../rocksdb-8.3.2/memtable/inlineskiplist.h | 1051 + .../rocksdb-8.3.2/memtable/skiplist.h | 498 + .../rocksdb-8.3.2/memtable/skiplistrep.cc | 368 + .../rocksdb-8.3.2/memtable/stl_wrappers.h | 33 + .../rocksdb-8.3.2/memtable/vectorrep.cc | 307 + .../memtable/write_buffer_manager.cc | 185 + .../monitoring/file_read_sample.h | 23 + .../rocksdb-8.3.2/monitoring/histogram.cc | 270 + .../rocksdb-8.3.2/monitoring/histogram.h | 143 + .../monitoring/histogram_windowing.cc | 194 + .../monitoring/histogram_windowing.h | 84 + .../monitoring/in_memory_stats_history.cc | 50 + .../monitoring/in_memory_stats_history.h | 74 + .../monitoring/instrumented_mutex.cc | 90 + .../monitoring/instrumented_mutex.h | 126 + .../monitoring/iostats_context.cc | 78 + .../monitoring/iostats_context_imp.h | 62 + .../rocksdb-8.3.2/monitoring/perf_context.cc | 313 + .../monitoring/perf_context_imp.h | 96 + .../rocksdb-8.3.2/monitoring/perf_level.cc | 23 + .../rocksdb-8.3.2/monitoring/perf_level_imp.h | 14 + .../monitoring/perf_step_timer.h | 77 + .../monitoring/persistent_stats_history.cc | 170 + .../monitoring/persistent_stats_history.h | 83 + .../rocksdb-8.3.2/monitoring/statistics.cc | 534 + .../monitoring/statistics_impl.h | 144 + .../monitoring/thread_status_impl.cc | 163 + .../monitoring/thread_status_updater.cc 
| 328 + .../monitoring/thread_status_updater.h | 226 + .../monitoring/thread_status_updater_debug.cc | 43 + .../monitoring/thread_status_util.cc | 208 + .../monitoring/thread_status_util.h | 139 + .../monitoring/thread_status_util_debug.cc | 46 + .../rocksdb-8.3.2/options/cf_options.cc | 1196 + .../rocksdb-8.3.2/options/cf_options.h | 347 + .../rocksdb-8.3.2/options/configurable.cc | 712 + .../options/configurable_helper.h | 185 + .../rocksdb-8.3.2/options/configurable_test.h | 116 + .../rocksdb-8.3.2/options/customizable.cc | 133 + .../rocksdb-8.3.2/options/db_options.cc | 1079 + .../rocksdb-8.3.2/options/db_options.h | 152 + .../rocksdb-8.3.2/options/options.cc | 692 + .../rocksdb-8.3.2/options/options_helper.cc | 1424 + .../rocksdb-8.3.2/options/options_helper.h | 116 + .../rocksdb-8.3.2/options/options_parser.cc | 736 + .../rocksdb-8.3.2/options/options_parser.h | 149 + .../rocksdb-8.3.2/port/jemalloc_helper.h | 107 + .../rocksdb-8.3.2/port/lang.h | 97 + .../rocksdb-8.3.2/port/likely.h | 18 + .../rocksdb-8.3.2/port/malloc.h | 17 + .../rocksdb-8.3.2/port/mmap.cc | 98 + .../rocksdb-8.3.2/port/mmap.h | 70 + .../rocksdb-8.3.2/port/port.h | 21 + .../rocksdb-8.3.2/port/port_dirent.h | 44 + .../rocksdb-8.3.2/port/port_example.h | 101 + .../rocksdb-8.3.2/port/port_posix.cc | 300 + .../rocksdb-8.3.2/port/port_posix.h | 243 + .../rocksdb-8.3.2/port/stack_trace.cc | 336 + .../rocksdb-8.3.2/port/stack_trace.h | 31 + .../rocksdb-8.3.2/port/sys_time.h | 63 + .../rocksdb-8.3.2/port/util_logger.h | 18 + .../rocksdb-8.3.2/port/win/env_default.cc | 45 + .../rocksdb-8.3.2/port/win/env_win.cc | 1437 + .../rocksdb-8.3.2/port/win/env_win.h | 305 + .../rocksdb-8.3.2/port/win/io_win.cc | 1101 + .../rocksdb-8.3.2/port/win/io_win.h | 508 + .../rocksdb-8.3.2/port/win/port_win.cc | 303 + .../rocksdb-8.3.2/port/win/port_win.h | 379 + .../rocksdb-8.3.2/port/win/win_logger.cc | 192 + .../rocksdb-8.3.2/port/win/win_logger.h | 64 + .../rocksdb-8.3.2/port/win/win_thread.cc | 170 + .../rocksdb-8.3.2/port/win/win_thread.h | 117 + .../rocksdb-8.3.2/port/win/xpress_win.h | 26 + .../rocksdb-8.3.2/port/xpress.h | 17 + .../table/adaptive/adaptive_table_factory.cc | 125 + .../table/adaptive/adaptive_table_factory.h | 56 + .../block_based/binary_search_index_reader.cc | 73 + .../block_based/binary_search_index_reader.h | 48 + .../rocksdb-8.3.2/table/block_based/block.cc | 1291 + .../rocksdb-8.3.2/table/block_based/block.h | 921 + .../block_based/block_based_table_builder.cc | 2027 ++ .../block_based/block_based_table_builder.h | 209 + .../block_based/block_based_table_factory.cc | 962 + .../block_based/block_based_table_factory.h | 102 + .../block_based/block_based_table_iterator.cc | 500 + .../block_based/block_based_table_iterator.h | 310 + .../block_based/block_based_table_reader.cc | 3043 +++ .../block_based/block_based_table_reader.h | 745 + .../block_based_table_reader_impl.h | 203 + .../block_based_table_reader_sync_and_async.h | 758 + .../table/block_based/block_builder.cc | 234 + .../table/block_based/block_builder.h | 104 + .../table/block_based/block_cache.cc | 106 + .../table/block_based/block_cache.h | 140 + .../table/block_based/block_prefetcher.cc | 120 + .../table/block_based/block_prefetcher.h | 72 + .../table/block_based/block_prefix_index.cc | 226 + .../table/block_based/block_prefix_index.h | 70 + .../table/block_based/block_type.h | 34 + .../table/block_based/cachable_entry.h | 244 + .../table/block_based/data_block_footer.cc | 59 + .../table/block_based/data_block_footer.h | 25 + 
.../block_based/data_block_hash_index.cc | 94 + .../table/block_based/data_block_hash_index.h | 137 + .../table/block_based/filter_block.h | 183 + .../block_based/filter_block_reader_common.cc | 163 + .../block_based/filter_block_reader_common.h | 76 + .../table/block_based/filter_policy.cc | 1966 ++ .../block_based/filter_policy_internal.h | 340 + .../table/block_based/flush_block_policy.cc | 132 + .../block_based/flush_block_policy_impl.h | 40 + .../table/block_based/full_filter_block.cc | 290 + .../table/block_based/full_filter_block.h | 147 + .../table/block_based/hash_index_reader.cc | 147 + .../table/block_based/hash_index_reader.h | 49 + .../table/block_based/index_builder.cc | 282 + .../table/block_based/index_builder.h | 455 + .../table/block_based/index_reader_common.cc | 57 + .../table/block_based/index_reader_common.h | 85 + .../block_based/mock_block_based_table.h | 62 + .../block_based/parsed_full_filter_block.cc | 23 + .../block_based/parsed_full_filter_block.h | 47 + .../block_based/partitioned_filter_block.cc | 554 + .../block_based/partitioned_filter_block.h | 182 + .../block_based/partitioned_index_iterator.cc | 163 + .../block_based/partitioned_index_iterator.h | 160 + .../block_based/partitioned_index_reader.cc | 224 + .../block_based/partitioned_index_reader.h | 56 + .../table/block_based/reader_common.cc | 52 + .../table/block_based/reader_common.h | 34 + .../block_based/uncompression_dict_reader.cc | 126 + .../block_based/uncompression_dict_reader.h | 61 + .../rocksdb-8.3.2/table/block_fetcher.cc | 405 + .../rocksdb-8.3.2/table/block_fetcher.h | 142 + .../table/compaction_merging_iterator.cc | 370 + .../table/compaction_merging_iterator.h | 44 + .../table/cuckoo/cuckoo_table_builder.cc | 551 + .../table/cuckoo/cuckoo_table_builder.h | 136 + .../table/cuckoo/cuckoo_table_factory.cc | 100 + .../table/cuckoo/cuckoo_table_factory.h | 80 + .../table/cuckoo/cuckoo_table_reader.cc | 412 + .../table/cuckoo/cuckoo_table_reader.h | 100 + .../rocksdb-8.3.2/table/format.cc | 573 + .../rocksdb-8.3.2/table/format.h | 378 + .../rocksdb-8.3.2/table/get_context.cc | 616 + .../rocksdb-8.3.2/table/get_context.h | 245 + .../rocksdb-8.3.2/table/internal_iterator.h | 224 + .../rocksdb-8.3.2/table/iter_heap.h | 44 + .../rocksdb-8.3.2/table/iterator.cc | 130 + .../rocksdb-8.3.2/table/iterator_wrapper.h | 190 + .../rocksdb-8.3.2/table/merging_iterator.cc | 1725 ++ .../rocksdb-8.3.2/table/merging_iterator.h | 97 + .../rocksdb-8.3.2/table/meta_blocks.cc | 564 + .../rocksdb-8.3.2/table/meta_blocks.h | 172 + .../rocksdb-8.3.2/table/mock_table.h | 94 + .../rocksdb-8.3.2/table/multiget_context.h | 405 + .../table/persistent_cache_helper.cc | 111 + .../table/persistent_cache_helper.h | 46 + .../table/persistent_cache_options.h | 34 + .../table/plain/plain_table_bloom.cc | 78 + .../table/plain/plain_table_bloom.h | 132 + .../table/plain/plain_table_builder.cc | 335 + .../table/plain/plain_table_builder.h | 152 + .../table/plain/plain_table_factory.cc | 296 + .../table/plain/plain_table_factory.h | 180 + .../table/plain/plain_table_index.cc | 211 + .../table/plain/plain_table_index.h | 246 + .../table/plain/plain_table_key_coding.cc | 507 + .../table/plain/plain_table_key_coding.h | 199 + .../table/plain/plain_table_reader.cc | 771 + .../table/plain/plain_table_reader.h | 243 + .../table/scoped_arena_iterator.h | 57 + .../rocksdb-8.3.2/table/sst_file_dumper.cc | 520 + .../rocksdb-8.3.2/table/sst_file_dumper.h | 99 + .../rocksdb-8.3.2/table/sst_file_reader.cc | 101 + 
.../rocksdb-8.3.2/table/sst_file_writer.cc | 436 + .../table/sst_file_writer_collectors.h | 95 + .../rocksdb-8.3.2/table/table_builder.h | 228 + .../rocksdb-8.3.2/table/table_factory.cc | 52 + .../rocksdb-8.3.2/table/table_properties.cc | 351 + .../table/table_properties_internal.h | 14 + .../rocksdb-8.3.2/table/table_reader.h | 187 + .../rocksdb-8.3.2/table/two_level_iterator.cc | 220 + .../rocksdb-8.3.2/table/two_level_iterator.h | 43 + .../rocksdb-8.3.2/table/unique_id.cc | 223 + .../rocksdb-8.3.2/table/unique_id_impl.h | 93 + .../rocksdb-8.3.2/test_util/mock_time_env.h | 78 + .../test_util/secondary_cache_test_util.h | 119 + .../rocksdb-8.3.2/test_util/sync_point.cc | 82 + .../rocksdb-8.3.2/test_util/sync_point.h | 180 + .../test_util/sync_point_impl.cc | 152 + .../rocksdb-8.3.2/test_util/sync_point_impl.h | 96 + .../rocksdb-8.3.2/test_util/testharness.h | 124 + .../rocksdb-8.3.2/test_util/testutil.h | 860 + .../test_util/transaction_test_util.cc | 400 + .../test_util/transaction_test_util.h | 147 + .../rocksdb-8.3.2/third-party/gcc/ppc-asm.h | 390 + .../gtest-1.8.1/fused-src/gtest/gtest.h | 22115 ++++++++++++++++ .../block_cache_trace_analyzer.h | 397 + .../rocksdb-8.3.2/tools/dump/db_dump_tool.cc | 258 + .../tools/io_tracer_parser_tool.h | 38 + .../rocksdb-8.3.2/tools/ldb_cmd_impl.h | 744 + .../tools/simulated_hybrid_file_system.h | 124 + .../rocksdb-8.3.2/tools/trace_analyzer_tool.h | 329 + .../trace_replay/block_cache_tracer.cc | 509 + .../trace_replay/block_cache_tracer.h | 239 + .../rocksdb-8.3.2/trace_replay/io_tracer.cc | 303 + .../rocksdb-8.3.2/trace_replay/io_tracer.h | 185 + .../trace_replay/trace_record.cc | 206 + .../trace_replay/trace_record_handler.cc | 190 + .../trace_replay/trace_record_handler.h | 46 + .../trace_replay/trace_record_result.cc | 146 + .../trace_replay/trace_replay.cc | 622 + .../rocksdb-8.3.2/trace_replay/trace_replay.h | 183 + .../rocksdb-8.3.2/util/aligned_buffer.h | 235 + .../rocksdb-8.3.2/util/async_file_reader.cc | 81 + .../rocksdb-8.3.2/util/async_file_reader.h | 144 + .../rocksdb-8.3.2/util/autovector.h | 393 + .../rocksdb-8.3.2/util/bloom_impl.h | 489 + .../rocksdb-8.3.2/util/build_version.cc | 79 + .../rocksdb-8.3.2/util/build_version.cc.in | 79 + .../rocksdb-8.3.2/util/cast_util.h | 42 + .../rocksdb-8.3.2/util/channel.h | 69 + .../rocksdb-8.3.2/util/cleanable.cc | 181 + .../rocksdb-8.3.2/util/coding.cc | 90 + .../rocksdb-8.3.2/util/coding.h | 389 + .../rocksdb-8.3.2/util/coding_lean.h | 101 + .../util/compaction_job_stats_impl.cc | 92 + .../rocksdb-8.3.2/util/comparator.cc | 383 + .../rocksdb-8.3.2/util/compression.cc | 122 + .../rocksdb-8.3.2/util/compression.h | 1795 ++ .../util/compression_context_cache.cc | 106 + .../util/compression_context_cache.h | 47 + .../util/concurrent_task_limiter_impl.cc | 64 + .../util/concurrent_task_limiter_impl.h | 67 + .../rocksdb-8.3.2/util/core_local.h | 84 + .../rocksdb-8.3.2/util/coro_utils.h | 112 + .../rocksdb-8.3.2/util/crc32c.cc | 1292 + .../rocksdb-8.3.2/util/crc32c.h | 56 + .../rocksdb-8.3.2/util/crc32c_arm64.cc | 213 + .../rocksdb-8.3.2/util/crc32c_arm64.h | 52 + .../rocksdb-8.3.2/util/crc32c_ppc.h | 22 + .../rocksdb-8.3.2/util/crc32c_ppc_constants.h | 900 + .../rocksdb-8.3.2/util/data_structure.cc | 18 + .../rocksdb-8.3.2/util/defer.h | 82 + .../rocksdb-8.3.2/util/distributed_mutex.h | 50 + .../rocksdb-8.3.2/util/duplicate_detector.h | 69 + .../rocksdb-8.3.2/util/dynamic_bloom.cc | 70 + .../rocksdb-8.3.2/util/dynamic_bloom.h | 214 + .../rocksdb-8.3.2/util/fastrange.h | 114 + 
.../util/file_checksum_helper.cc | 170 + .../rocksdb-8.3.2/util/file_checksum_helper.h | 101 + .../rocksdb-8.3.2/util/gflags_compat.h | 29 + .../rocksdb-8.3.2/util/hash.cc | 201 + .../rocksdb-8.3.2/util/hash.h | 137 + .../rocksdb-8.3.2/util/hash128.h | 26 + .../rocksdb-8.3.2/util/hash_containers.h | 51 + .../rocksdb-8.3.2/util/hash_map.h | 67 + .../rocksdb-8.3.2/util/heap.h | 174 + .../rocksdb-8.3.2/util/kv_map.h | 33 + .../rocksdb-8.3.2/util/math.h | 299 + .../rocksdb-8.3.2/util/math128.h | 316 + .../rocksdb-8.3.2/util/murmurhash.cc | 196 + .../rocksdb-8.3.2/util/murmurhash.h | 43 + .../rocksdb-8.3.2/util/mutexlock.h | 181 + .../rocksdb-8.3.2/util/ppc-opcode.h | 27 + .../rocksdb-8.3.2/util/random.cc | 63 + .../rocksdb-8.3.2/util/random.h | 190 + .../rocksdb-8.3.2/util/rate_limiter.cc | 376 + .../rocksdb-8.3.2/util/rate_limiter_impl.h | 146 + .../rocksdb-8.3.2/util/repeatable_thread.h | 149 + .../rocksdb-8.3.2/util/ribbon_alg.h | 1225 + .../rocksdb-8.3.2/util/ribbon_config.cc | 506 + .../rocksdb-8.3.2/util/ribbon_config.h | 182 + .../rocksdb-8.3.2/util/ribbon_impl.h | 1137 + .../rocksdb-8.3.2/util/set_comparator.h | 24 + .../util/single_thread_executor.h | 56 + .../rocksdb-8.3.2/util/slice.cc | 367 + .../rocksdb-8.3.2/util/status.cc | 162 + .../rocksdb-8.3.2/util/stderr_logger.cc | 30 + .../rocksdb-8.3.2/util/stderr_logger.h | 31 + .../rocksdb-8.3.2/util/stop_watch.h | 136 + .../rocksdb-8.3.2/util/string_util.cc | 502 + .../rocksdb-8.3.2/util/string_util.h | 175 + .../rocksdb-8.3.2/util/thread_guard.h | 41 + .../rocksdb-8.3.2/util/thread_local.cc | 521 + .../rocksdb-8.3.2/util/thread_local.h | 100 + .../rocksdb-8.3.2/util/thread_operation.h | 113 + .../rocksdb-8.3.2/util/threadpool_imp.cc | 551 + .../rocksdb-8.3.2/util/threadpool_imp.h | 120 + .../rocksdb-8.3.2/util/timer.h | 340 + .../rocksdb-8.3.2/util/timer_queue.h | 231 + .../rocksdb-8.3.2/util/udt_util.h | 77 + .../util/user_comparator_wrapper.h | 64 + .../rocksdb-8.3.2/util/vector_iterator.h | 118 + .../rocksdb-8.3.2/util/work_queue.h | 150 + .../rocksdb-8.3.2/util/xxhash.cc | 48 + .../rocksdb-8.3.2/util/xxhash.h | 6360 +++++ .../rocksdb-8.3.2/util/xxph3.h | 1764 ++ .../utilities/agg_merge/agg_merge.cc | 238 + .../utilities/agg_merge/agg_merge_impl.h | 49 + .../utilities/agg_merge/test_agg_merge.h | 47 + .../utilities/backup/backup_engine.cc | 3355 +++ .../utilities/backup/backup_engine_impl.h | 34 + .../blob_db/blob_compaction_filter.cc | 488 + .../blob_db/blob_compaction_filter.h | 202 + .../utilities/blob_db/blob_db.cc | 112 + .../rocksdb-8.3.2/utilities/blob_db/blob_db.h | 264 + .../utilities/blob_db/blob_db_gc_stats.h | 54 + .../utilities/blob_db/blob_db_impl.cc | 2185 ++ .../utilities/blob_db/blob_db_impl.h | 501 + .../blob_db/blob_db_impl_filesnapshot.cc | 111 + .../utilities/blob_db/blob_db_iterator.h | 148 + .../utilities/blob_db/blob_db_listener.h | 69 + .../utilities/blob_db/blob_dump_tool.h | 56 + .../utilities/blob_db/blob_file.cc | 316 + .../utilities/blob_db/blob_file.h | 244 + .../utilities/cache_dump_load.cc | 67 + .../utilities/cache_dump_load_impl.cc | 369 + .../utilities/cache_dump_load_impl.h | 356 + .../cassandra/cassandra_compaction_filter.cc | 106 + .../cassandra/cassandra_compaction_filter.h | 57 + .../utilities/cassandra/cassandra_options.h | 41 + .../utilities/cassandra/format.cc | 367 + .../utilities/cassandra/format.h | 183 + .../utilities/cassandra/merge_operator.cc | 80 + .../utilities/cassandra/merge_operator.h | 44 + .../utilities/cassandra/serialize.h | 81 + .../utilities/cassandra/test_utils.h | 42 
+ .../utilities/checkpoint/checkpoint_impl.cc | 470 + .../utilities/checkpoint/checkpoint_impl.h | 64 + .../utilities/compaction_filters.cc | 52 + .../layered_compaction_filter_base.h | 41 + .../remove_emptyvalue_compactionfilter.cc | 24 + .../remove_emptyvalue_compactionfilter.h | 26 + .../utilities/convenience/info_log_finder.cc | 26 + .../rocksdb-8.3.2/utilities/counted_fs.cc | 379 + .../rocksdb-8.3.2/utilities/counted_fs.h | 158 + .../rocksdb-8.3.2/utilities/debug.cc | 118 + .../rocksdb-8.3.2/utilities/env_mirror.cc | 273 + .../rocksdb-8.3.2/utilities/env_timed.cc | 181 + .../rocksdb-8.3.2/utilities/env_timed.h | 95 + .../utilities/fault_injection_env.cc | 555 + .../utilities/fault_injection_env.h | 258 + .../utilities/fault_injection_fs.cc | 1071 + .../utilities/fault_injection_fs.h | 593 + .../fault_injection_secondary_cache.cc | 136 + .../fault_injection_secondary_cache.h | 109 + .../leveldb_options/leveldb_options.cc | 57 + .../utilities/memory/memory_util.cc | 50 + .../utilities/memory_allocators.h | 103 + .../utilities/merge_operators.cc | 115 + .../rocksdb-8.3.2/utilities/merge_operators.h | 36 + .../utilities/merge_operators/bytesxor.cc | 57 + .../utilities/merge_operators/bytesxor.h | 40 + .../utilities/merge_operators/max.cc | 64 + .../utilities/merge_operators/max_operator.h | 35 + .../utilities/merge_operators/put.cc | 74 + .../utilities/merge_operators/put_operator.h | 56 + .../utilities/merge_operators/sortlist.cc | 95 + .../utilities/merge_operators/sortlist.h | 42 + .../string_append/stringappend.cc | 76 + .../string_append/stringappend.h | 32 + .../string_append/stringappend2.cc | 130 + .../string_append/stringappend2.h | 52 + .../utilities/merge_operators/uint64add.cc | 56 + .../utilities/merge_operators/uint64add.h | 35 + .../utilities/object_registry.cc | 381 + .../option_change_migration.cc | 176 + .../utilities/options/options_util.cc | 117 + .../persistent_cache/block_cache_tier.cc | 420 + .../persistent_cache/block_cache_tier.h | 154 + .../persistent_cache/block_cache_tier_file.cc | 608 + .../persistent_cache/block_cache_tier_file.h | 291 + .../block_cache_tier_file_buffer.h | 127 + .../block_cache_tier_metadata.cc | 84 + .../block_cache_tier_metadata.h | 122 + .../utilities/persistent_cache/hash_table.h | 237 + .../persistent_cache/hash_table_evictable.h | 166 + .../utilities/persistent_cache/lrulist.h | 172 + .../persistent_cache/persistent_cache_test.h | 284 + .../persistent_cache/persistent_cache_tier.cc | 165 + .../persistent_cache/persistent_cache_tier.h | 340 + .../persistent_cache/persistent_cache_util.h | 67 + .../persistent_cache/volatile_tier_impl.cc | 138 + .../persistent_cache/volatile_tier_impl.h | 139 + .../simulator_cache/cache_simulator.cc | 287 + .../simulator_cache/cache_simulator.h | 231 + .../utilities/simulator_cache/sim_cache.cc | 372 + .../compact_on_deletion_collector.cc | 221 + .../compact_on_deletion_collector.h | 68 + .../trace/file_trace_reader_writer.cc | 133 + .../trace/file_trace_reader_writer.h | 48 + .../utilities/trace/replayer_impl.cc | 314 + .../utilities/trace/replayer_impl.h | 84 + .../transactions/lock/lock_manager.cc | 27 + .../transactions/lock/lock_manager.h | 80 + .../transactions/lock/lock_tracker.h | 207 + .../lock/point/point_lock_manager.cc | 719 + .../lock/point/point_lock_manager.h | 222 + .../lock/point/point_lock_manager_test.h | 324 + .../lock/point/point_lock_tracker.cc | 255 + .../lock/point/point_lock_tracker.h | 97 + .../lock/range/range_lock_manager.h | 34 + .../lock/range/range_tree/lib/db.h | 76 + 
 .../lock/range/range_tree/lib/ft/comparator.h | 138 +
 .../lock/range/range_tree/lib/ft/ft-status.h | 102 +
 .../range_tree/lib/locktree/concurrent_tree.h | 174 +
 .../range/range_tree/lib/locktree/keyrange.h | 141 +
 .../range_tree/lib/locktree/lock_request.h | 255 +
 .../range/range_tree/lib/locktree/locktree.h | 580 +
 .../range_tree/lib/locktree/range_buffer.h | 178 +
 .../range/range_tree/lib/locktree/treenode.h | 302 +
 .../range/range_tree/lib/locktree/txnid_set.h | 92 +
 .../lock/range/range_tree/lib/locktree/wfg.h | 124 +
 .../range/range_tree/lib/portability/memory.h | 215 +
 .../lib/portability/toku_assert_subst.h | 39 +
 .../range_tree/lib/portability/toku_atomic.h | 130 +
 .../lib/portability/toku_external_pthread.h | 83 +
 .../lib/portability/toku_instrumentation.h | 286 +
 .../lib/portability/toku_portability.h | 87 +
 .../range_tree/lib/portability/toku_pthread.h | 520 +
 .../lib/portability/toku_race_tools.h | 179 +
 .../range_tree/lib/portability/toku_time.h | 197 +
 .../range_tree/lib/portability/txn_subst.h | 27 +
 .../lock/range/range_tree/lib/util/dbt.h | 98 +
 .../range_tree/lib/util/growable_array.h | 144 +
 .../lock/range/range_tree/lib/util/memarena.h | 141 +
 .../lock/range/range_tree/lib/util/omt.h | 794 +
 .../lock/range/range_tree/lib/util/omt_impl.h | 1295 +
 .../range_tree/lib/util/partitioned_counter.h | 165 +
 .../lock/range/range_tree/lib/util/status.h | 76 +
 .../range_tree/range_tree_lock_manager.h | 135 +
 .../range_tree/range_tree_lock_tracker.h | 146 +
 .../transactions/optimistic_transaction.cc | 194 +
 .../transactions/optimistic_transaction.h | 99 +
 .../optimistic_transaction_db_impl.cc | 109 +
 .../optimistic_transaction_db_impl.h | 86 +
 .../transactions/pessimistic_transaction.cc | 1177 +
 .../transactions/pessimistic_transaction.h | 311 +
 .../pessimistic_transaction_db.cc | 780 +
 .../transactions/pessimistic_transaction_db.h | 316 +
 .../transactions/snapshot_checker.cc | 37 +
 .../transactions/transaction_base.cc | 757 +
 .../utilities/transactions/transaction_base.h | 382 +
 .../transactions/transaction_db_mutex_impl.cc | 133 +
 .../transactions/transaction_db_mutex_impl.h | 24 +
 .../utilities/transactions/transaction_test.h | 578 +
 .../transactions/transaction_util.cc | 204 +
 .../utilities/transactions/transaction_util.h | 83 +
 .../transactions/write_prepared_txn.cc | 515 +
 .../transactions/write_prepared_txn.h | 117 +
 .../transactions/write_prepared_txn_db.cc | 1038 +
 .../transactions/write_prepared_txn_db.h | 1123 +
 .../transactions/write_unprepared_txn.cc | 1056 +
 .../transactions/write_unprepared_txn.h | 339 +
 .../transactions/write_unprepared_txn_db.cc | 476 +
 .../transactions/write_unprepared_txn_db.h | 106 +
 .../utilities/ttl/db_ttl_impl.cc | 617 +
 .../rocksdb-8.3.2/utilities/ttl/db_ttl_impl.h | 243 +
 .../rocksdb-8.3.2/utilities/wal_filter.cc | 22 +
 .../write_batch_with_index.cc | 694 +
 .../write_batch_with_index_internal.cc | 742 +
 .../write_batch_with_index_internal.h | 342 +
 .../src/Database/RocksDB.hs | 17 +
 .../src/Database/RocksDB/Base.hs | 336 +
 .../src/Database/RocksDB/C.hsc | 415 +
 .../src/Database/RocksDB/Internal.hs | 217 +
 .../src/Database/RocksDB/Iterator.hs | 198 +
 .../src/Database/RocksDB/ReadOptions.hs | 93 +
 .../src/Database/RocksDB/Types.hs | 147 +
 .../rocksdb-haskell-kadena/unpack-rocksdb.sh | 354 +
 861 files changed, 319552 insertions(+), 16 deletions(-)
 create mode 100644 vendored/rocksdb-haskell-kadena/CHANGELOG.md
 create mode 100644 vendored/rocksdb-haskell-kadena/LICENSE
 create mode 100644 vendored/rocksdb-haskell-kadena/README.md
 create mode 100644 vendored/rocksdb-haskell-kadena/cpp/chainweb-rocksdb.cpp
 create mode 100644 vendored/rocksdb-haskell-kadena/cpp/chainweb-rocksdb.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/arena_wrapped_db_iter.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/arena_wrapped_db_iter.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_constants.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_contents.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_contents.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_counting_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_fetcher.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_fetcher.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_addition.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_addition.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_builder.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_builder.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_cache.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_cache.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_completion_callback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_garbage.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_garbage.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_meta.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_meta.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_garbage_meter.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_garbage_meter.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_index.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_format.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_format.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_sequential_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_sequential_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_writer.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_writer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_read_request.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_source.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_source.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/prefetch_buffer_collection.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/prefetch_buffer_collection.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/builder.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/builder.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/c.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/column_family.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/column_family.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/clipping_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iteration_stats.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iterator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_job.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_job.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_outputs.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_outputs.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_fifo.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_fifo.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_level.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_level.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_universal.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_universal.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_service_job.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_state.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_state.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/file_pri.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/sst_partitioner.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/subcompaction_state.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/subcompaction_state.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/convenience.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_filesnapshot.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/compacted_db_impl.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/compacted_db_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_compaction_flush.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_debug.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_experimental.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_files.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_open.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_readonly.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_readonly.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_secondary.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_secondary.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_write.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_info_dumper.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_info_dumper.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_iter.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_iter.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_test_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_with_timestamp_test_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/dbformat.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/dbformat.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/error_handler.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/error_handler.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/event_helpers.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/event_helpers.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/experimental.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/external_sst_file_ingestion_job.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/external_sst_file_ingestion_job.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/file_indexer.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/file_indexer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_job.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_job.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_scheduler.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_scheduler.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/forward_iterator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/forward_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/history_trimming_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/import_column_family_job.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/import_column_family_job.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/internal_stats.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/internal_stats.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/job_context.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/kv_checksum.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_format.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_writer.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_writer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/logs_with_prep_tracker.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/logs_with_prep_tracker.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/lookup_key.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/malloc_stats.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/malloc_stats.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable_list.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable_list.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_context.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_helper.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_helper.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_operator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/output_validator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/output_validator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/periodic_task_scheduler.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/periodic_task_scheduler.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/pinned_iterators_manager.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/post_memtable_callback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/pre_release_callback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_del_aggregator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_del_aggregator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_tombstone_fragmenter.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_tombstone_fragmenter.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/read_callback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/repair.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/seqno_to_time_mapping.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/seqno_to_time_mapping.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/snapshot_checker.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/snapshot_impl.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/snapshot_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_cache.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_cache.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_cache_sync_and_async.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_properties_collector.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_properties_collector.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/transaction_log_impl.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/transaction_log_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/trim_history_scheduler.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/trim_history_scheduler.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_builder.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_builder.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit_handler.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit_handler.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_set.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_set.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_set_sync_and_async.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_edit.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_edit.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_manager.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_manager.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_column_serialization.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_column_serialization.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_columns.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch_base.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch_internal.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_callback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_controller.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_controller.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_stall_stats.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_stall_stats.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_thread.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_thread.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_common.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_compaction_filter.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_driver.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_env_wrapper.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_listener.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_shared_state.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_stat.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_table_properties_collector.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_test_base.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/expected_state.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/expected_value.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/multi_ops_txns_stress.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/composite_env.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/composite_env_wrapper.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/emulated_clock.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_chroot.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_chroot.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_encryption.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_encryption_ctr.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_posix.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system_tracer.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system_tracer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_posix.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_readonly.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_remap.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_remap.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/io_posix.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/io_posix.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/mock_env.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/mock_env.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/unique_id_gen.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/unique_id_gen.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/delete_scheduler.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/delete_scheduler.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_prefetch_buffer.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_prefetch_buffer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_util.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/filename.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/filename.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/line_file_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/line_file_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/random_access_file_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/random_access_file_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/read_write_util.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/read_write_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/readahead_file_info.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/readahead_raf.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/readahead_raf.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sequence_file_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sequence_file_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sst_file_manager_impl.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sst_file_manager_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/writable_file_writer.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/writable_file_writer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/fuzz/util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/advanced_cache.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/advanced_options.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/block_cache_trace_writer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/c.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/cache.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/cache_bench_tool.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/cleanable.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/compaction_filter.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/compaction_job_stats.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/comparator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/compression_type.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/concurrent_task_limiter.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/configurable.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/convenience.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/customizable.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/data_structure.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db_bench_tool.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db_dump_tool.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db_stress_tool.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/env.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/env_encryption.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/experimental.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/file_checksum.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/file_system.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/filter_policy.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/flush_block_policy.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/functor_wrapper.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/io_status.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/iostats_context.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/ldb_tool.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/listener.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/memory_allocator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/memtablerep.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/merge_operator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/metadata.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/options.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/perf_context.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/perf_level.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/persistent_cache.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/port_defs.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/rate_limiter.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/rocksdb_namespace.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/secondary_cache.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/slice.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/slice_transform.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/snapshot.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_dump_tool.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_file_manager.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_file_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_file_writer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_partitioner.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/statistics.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/stats_history.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/status.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/system_clock.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/table.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/table_properties.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/table_reader_caller.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/thread_status.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/threadpool.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/trace_reader_writer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/trace_record.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/trace_record_result.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/transaction_log.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/types.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/unique_id.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/universal_compaction.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/agg_merge.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/backup_engine.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/cache_dump_load.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/checkpoint.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/convenience.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/customizable_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/db_ttl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/debug.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/env_mirror.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/info_log_finder.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/ldb_cmd.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/ldb_cmd_execute_result.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/leveldb_options.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/lua/rocks_lua_custom_library.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/lua/rocks_lua_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/memory_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/object_registry.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/optimistic_transaction_db.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/option_change_migration.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/options_type.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/options_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/replayer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/sim_cache.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/stackable_db.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/table_properties_collectors.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/transaction.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/transaction_db.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/transaction_db_mutex.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/write_batch_with_index.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/version.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/wal_filter.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/wide_columns.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/write_batch.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/write_batch_base.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/write_buffer_manager.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/compaction_filter_factory_jnicallback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/comparatorjnicallback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/cplusplus_to_java_convert.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/event_listener_jnicallback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/jnicallback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/loggerjnicallback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/portal.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/statisticsjni.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/table_filter_jnicallback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/trace_writer_jnicallback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/transaction_notifier_jnicallback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/wal_filter_jnicallback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/writebatchhandlerjnicallback.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/auto_roll_logger.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/auto_roll_logger.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/env_logger.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/event_logger.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/event_logger.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/log_buffer.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/log_buffer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/logging.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/allocator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/arena.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/arena.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/concurrent_arena.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/concurrent_arena.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/jemalloc_nodump_allocator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/jemalloc_nodump_allocator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memkind_kmem_allocator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memkind_kmem_allocator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memory_allocator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memory_allocator_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memory_usage.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/alloc_tracker.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/hash_linklist_rep.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/hash_skiplist_rep.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/inlineskiplist.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/skiplist.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/skiplistrep.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/stl_wrappers.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/vectorrep.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/write_buffer_manager.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/file_read_sample.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram_windowing.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram_windowing.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/in_memory_stats_history.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/in_memory_stats_history.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/instrumented_mutex.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/instrumented_mutex.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/iostats_context.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/iostats_context_imp.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_context.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_context_imp.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_level.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_level_imp.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_step_timer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/persistent_stats_history.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/persistent_stats_history.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/statistics.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/statistics_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_impl.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater_debug.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util_debug.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/cf_options.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/cf_options.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/configurable.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/configurable_helper.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/configurable_test.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/customizable.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/db_options.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/db_options.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_helper.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_helper.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_parser.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_parser.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/jemalloc_helper.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/lang.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/likely.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/malloc.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/mmap.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/mmap.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_dirent.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_example.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_posix.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_posix.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/stack_trace.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/stack_trace.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/sys_time.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/util_logger.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_default.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_win.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_win.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/io_win.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/io_win.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/port_win.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/port_win.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_logger.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_logger.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_thread.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_thread.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/xpress_win.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/xpress.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/adaptive/adaptive_table_factory.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/adaptive/adaptive_table_factory.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/binary_search_index_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/binary_search_index_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_builder.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_builder.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_factory.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_factory.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_iterator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader_sync_and_async.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_builder.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_builder.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_cache.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_cache.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefetcher.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefetcher.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefix_index.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefix_index.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_type.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/cachable_entry.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_footer.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_footer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_hash_index.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_hash_index.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_block.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_block_reader_common.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_block_reader_common.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_policy.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_policy_internal.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/flush_block_policy.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/flush_block_policy_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/full_filter_block.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/full_filter_block.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/hash_index_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/hash_index_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_builder.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_builder.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_reader_common.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_reader_common.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/mock_block_based_table.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/parsed_full_filter_block.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/parsed_full_filter_block.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_filter_block.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_filter_block.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_iterator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/reader_common.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/reader_common.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/uncompression_dict_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/uncompression_dict_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_fetcher.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_fetcher.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/compaction_merging_iterator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/compaction_merging_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_builder.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_builder.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_factory.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_factory.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/format.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/format.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/get_context.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/get_context.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/internal_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/iter_heap.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/iterator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/iterator_wrapper.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/merging_iterator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/merging_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/meta_blocks.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/meta_blocks.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/mock_table.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/multiget_context.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/persistent_cache_helper.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/persistent_cache_helper.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/persistent_cache_options.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_bloom.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_bloom.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_builder.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_builder.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_factory.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_factory.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_index.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_index.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_key_coding.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_key_coding.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/scoped_arena_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_dumper.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_dumper.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_writer.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_writer_collectors.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_builder.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_factory.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_properties.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_properties_internal.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/two_level_iterator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/two_level_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/unique_id.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/unique_id_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/mock_time_env.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/secondary_cache_test_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point_impl.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/testharness.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/testutil.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/transaction_test_util.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/transaction_test_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/third-party/gcc/ppc-asm.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/third-party/gtest-1.8.1/fused-src/gtest/gtest.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/block_cache_analyzer/block_cache_trace_analyzer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/dump/db_dump_tool.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/io_tracer_parser_tool.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/ldb_cmd_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/simulated_hybrid_file_system.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/trace_analyzer_tool.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/block_cache_tracer.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/block_cache_tracer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/io_tracer.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/io_tracer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_handler.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_handler.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_result.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_replay.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_replay.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/aligned_buffer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/async_file_reader.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/async_file_reader.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/autovector.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/bloom_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/build_version.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/build_version.cc.in
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/cast_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/channel.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/cleanable.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coding.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coding.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coding_lean.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compaction_job_stats_impl.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/comparator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression_context_cache.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression_context_cache.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/concurrent_task_limiter_impl.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/concurrent_task_limiter_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/core_local.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coro_utils.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_arm64.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_arm64.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_ppc.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_ppc_constants.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/data_structure.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/defer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/distributed_mutex.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/duplicate_detector.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/dynamic_bloom.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/dynamic_bloom.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/fastrange.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/file_checksum_helper.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/file_checksum_helper.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/gflags_compat.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash128.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash_containers.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash_map.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/heap.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/kv_map.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/math.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/math128.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/murmurhash.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/murmurhash.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/mutexlock.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ppc-opcode.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/random.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/random.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/rate_limiter.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/rate_limiter_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/repeatable_thread.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_alg.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_config.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_config.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/set_comparator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/single_thread_executor.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/slice.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/status.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/stderr_logger.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/stderr_logger.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/stop_watch.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/string_util.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/string_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_guard.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_local.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_local.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_operation.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/threadpool_imp.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/threadpool_imp.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/timer.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/timer_queue.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/udt_util.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/user_comparator_wrapper.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/vector_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/work_queue.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/xxhash.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/xxhash.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/xxph3.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/agg_merge/agg_merge.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/agg_merge/agg_merge_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/agg_merge/test_agg_merge.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/backup/backup_engine.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/backup/backup_engine_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_compaction_filter.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_compaction_filter.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_gc_stats.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl_filesnapshot.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_iterator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_listener.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_dump_tool.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_file.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_file.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load_impl.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/cassandra_compaction_filter.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/cassandra_compaction_filter.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/cassandra_options.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/format.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/format.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/merge_operator.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/merge_operator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/serialize.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/test_utils.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/checkpoint/checkpoint_impl.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/checkpoint/checkpoint_impl.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters/layered_compaction_filter_base.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/convenience/info_log_finder.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/counted_fs.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/counted_fs.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/debug.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_mirror.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_timed.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_timed.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_env.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_env.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_fs.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_fs.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_secondary_cache.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_secondary_cache.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/leveldb_options/leveldb_options.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/memory/memory_util.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/memory_allocators.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/bytesxor.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/bytesxor.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/max.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/max_operator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/put.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/put_operator.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/sortlist.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/sortlist.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend2.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend2.h
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/uint64add.cc
 create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/uint64add.h
 create mode 100644
vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/object_registry.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/option_change_migration/option_change_migration.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/options/options_util.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_file.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_file.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_file_buffer.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_metadata.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_metadata.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/hash_table.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/hash_table_evictable.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/lrulist.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_test.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_tier.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_tier.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_util.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/volatile_tier_impl.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/volatile_tier_impl.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/cache_simulator.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/cache_simulator.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/sim_cache.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/table_properties_collectors/compact_on_deletion_collector.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/table_properties_collectors/compact_on_deletion_collector.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/file_trace_reader_writer.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/file_trace_reader_writer.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/replayer_impl.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/replayer_impl.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/lock_manager.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/lock_manager.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/lock_tracker.h create mode 100644 
vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_manager.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_manager.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_manager_test.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_tracker.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_tracker.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_lock_manager.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/db.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/memory.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h create mode 100644 
vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/dbt.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/memarena.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/omt.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/status.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction_db_impl.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction_db_impl.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction_db.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction_db.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/snapshot_checker.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_base.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_base.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_db_mutex_impl.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_db_mutex_impl.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_test.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_util.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_util.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn_db.cc create mode 100644 
vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn_db.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn_db.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn_db.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/ttl/db_ttl_impl.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/ttl/db_ttl_impl.h create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/wal_filter.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index_internal.cc create mode 100644 vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index_internal.h create mode 100644 vendored/rocksdb-haskell-kadena/src/Database/RocksDB.hs create mode 100644 vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Base.hs create mode 100644 vendored/rocksdb-haskell-kadena/src/Database/RocksDB/C.hsc create mode 100644 vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Internal.hs create mode 100644 vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Iterator.hs create mode 100644 vendored/rocksdb-haskell-kadena/src/Database/RocksDB/ReadOptions.hs create mode 100644 vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Types.hs create mode 100755 vendored/rocksdb-haskell-kadena/unpack-rocksdb.sh diff --git a/cabal.project b/cabal.project index a6d0b218a0..7666a1b523 100644 --- a/cabal.project +++ b/cabal.project @@ -52,9 +52,6 @@ package pact -- avoid conflict with cryptonite during linking flags: +cryptonite-ed25519 -build-tool -package rocksdb-haskell-kadena - ghc-options: -Wwarn -optc-w -optcxx-w - package crypton flags: +support_pclmuldq @@ -84,12 +81,6 @@ source-repository-package tag: 1d260bfaa48312b54851057885de4c43c420e35f --sha256: 0fzq4mzaszj5clvixx9mn1x6r4dcrnwvbl2znd0p5mmy5h2jr0hh -source-repository-package - type: git - location: https://github.com/kadena-io/rocksdb-haskell.git - tag: cede9de2932a4ead1bd82fd7709b19ab7b19b33d - --sha256: 1dngd44va6h66vwpdpwmnj0zcky87m4vzykjwv49p2km12lwq9mf - source-repository-package type: git location: https://github.com/kadena-io/rosetta.git diff --git a/cabal.project.freeze b/cabal.project.freeze index 1be9de4a7f..e23231bf7b 100644 --- a/cabal.project.freeze +++ b/cabal.project.freeze @@ -62,7 +62,7 @@ constraints: any.Boolean ==0.2.4, any.cassava ==0.5.3.1, any.cborg ==0.2.10.0, any.cereal ==0.5.8.3, - chainweb -debug -ed25519 -ghc-flags, + chainweb -debug -ed25519 -ghc-flags -rocksdb-with-tbb, any.character-ps ==0.1, any.charset ==0.3.10, any.chronos ==1.1.5.1, @@ -228,8 +228,6 @@ constraints: any.Boolean ==0.2.4, any.resource-pool ==0.4.0.0, any.resourcet ==1.3.0, any.retry ==0.9.3.1, - any.rocksdb-haskell-kadena ==1.1.0, - rocksdb-haskell-kadena -with-tbb, any.rosetta ==1.0.1, any.rts ==1.0.2, any.run-st ==0.1.3.3, diff --git a/chainweb.cabal b/chainweb.cabal index d9b9105c4d..7913456e3c 100644 --- a/chainweb.cabal +++ b/chainweb.cabal @@ -85,6 +85,11 @@ flag ghc-flags default: False manual: True +flag 
rocksdb-with-tbb
+  description: Build RocksDB with TBB support
+  default: False
+  manual: True
+
 common debugging-flags
   if flag(debug)
     ghc-options:
@@ -112,8 +117,458 @@ common warning-flags
 -- Chainweb Library
 -- -------------------------------------------------------------------------- --
+library rocksdb-haskell-kadena
+  import: warning-flags, debugging-flags
+  hs-source-dirs: vendored/rocksdb-haskell-kadena/src
+  default-language: Haskell2010
+
+  cxx-sources:
+    vendored/rocksdb-haskell-kadena/cpp/chainweb-rocksdb.cpp
+  include-dirs:
+    vendored/rocksdb-haskell-kadena/cpp
+    vendored/rocksdb-haskell-kadena/rocksdb-8.3.2
+    vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include
+
+  install-includes:
+    vendored/rocksdb-haskell-kadena/cpp/chainweb-rocksdb.h
+  exposed-modules:
+    Database.RocksDB
+    Database.RocksDB.Base
+    Database.RocksDB.C
+    Database.RocksDB.Internal
+    Database.RocksDB.Iterator
+    Database.RocksDB.ReadOptions
+    Database.RocksDB.Types
+
+  other-extensions:
+    CPP,
+    ForeignFunctionInterface,
+    EmptyDataDecls,
+    RecordWildCards
+
+  build-depends:
+    , base >= 4 && < 5
+    , binary
+    , bytestring
+    , data-default
+    , directory
+    , filepath
+    , transformers
+
+  ghc-options:
+    -optc-w
+    -optcxx-w
+    -- -optl-static-libstdc++
+    -- ld-options: -static-libstdc++
+
+  -- All of the following libraries are optional. If one of the libraries
+  -- is not available on the build host, it can be commented out in the
+  -- list below.
+  --
+  extra-libraries:
+    snappy
+    gflags
+    z
+    bz2
+    lz4
+    zstd
+    stdc++
+
+  if flag(rocksdb-with-tbb)
+    extra-libraries:
+      tbb
+
+  -- These options are extracted from CXXFLAGS in the RocksDB Makefile
+  cxx-options:
+    -fno-rtti
+    -g
+    -W
+    -Wextra
+    -Wall
+    -Wno-invalid-offsetof
+    -Wsign-compare
+    -Wshadow
+    -Wunused-parameter
+    -std=c++2a
+    -faligned-new
+    -DNO_THREEWAY_CRC32C
+    -DHAVE_ALIGNED_NEW
+    -DROCKSDB_PLATFORM_POSIX
+    -DROCKSDB_LIB_IO_POSIX
+    -DSNAPPY
+    -DGFLAGS=1
+    -DZLIB
+    -DBZIP2
+    -DLZ4
+    -DZSTD
+    -DROCKSDB_BACKTRACE
+    -mtune=generic
+    -DROCKSDB_SUPPORT_THREAD_LOCAL
+    -fno-omit-frame-pointer
+    -momit-leaf-frame-pointer
+    -DNDEBUG
+    -Woverloaded-virtual
+    -Wnon-virtual-dtor
+    -Wno-missing-field-initializers
+    -O2
+
+  if arch(x86_64)
+    cxx-options:
+      -march=x86-64
+      -msse3
+
+  if flag(rocksdb-with-tbb)
+    cxx-options:
+      -DTBB
+
+  if os(linux)
+    cxx-options:
+      -DOS_LINUX
+      -fno-builtin-memcmp
+      -DROCKSDB_FALLOCATE_PRESENT
+      -DROCKSDB_MALLOC_USABLE_SIZE
+      -DROCKSDB_PTHREAD_ADAPTIVE_MUTEX
+      -DROCKSDB_RANGESYNC_PRESENT
+      -DROCKSDB_SCHED_GETCPU_PRESENT
+      -DROCKSDB_AUXV_GETAUXVAL_PRESENT
+
+  if os(darwin)
+    cxx-options:
+      -DOS_MACOSX
+      -Wshorten-64-to-32
+      -DHAVE_FULLFSYNC
+
+  -- These sources are extracted from LIB_OBJECTS in the RocksDB Makefile
+  cxx-sources:
+    vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/cache/cache.cc
+    vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/cache/cache_entry_roles.cc
+    vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/cache/cache_key.cc
+    vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/cache/cache_helpers.cc
+    vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/cache/cache_reservation_manager.cc
+    vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/cache/charged_cache.cc
+    vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/cache/clock_cache.cc
+    vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/cache/lru_cache.cc
+    vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/cache/compressed_secondary_cache.cc
+    vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/cache/secondary_cache.cc
+    vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/cache/secondary_cache_adapter.cc
+
vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/cache/sharded_cache.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/arena_wrapped_db_iter.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_contents.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_fetcher.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_addition.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_builder.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_cache.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_garbage.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_meta.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_garbage_meter.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_format.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_sequential_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_writer.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_source.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/prefetch_buffer_collection.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/builder.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/c.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/column_family.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iterator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_job.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_fifo.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_level.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_universal.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_service_job.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_state.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_outputs.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/sst_partitioner.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/subcompaction_state.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/convenience.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_filesnapshot.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/compacted_db_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_compaction_flush.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_debug.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_experimental.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_files.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_open.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_readonly.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_secondary.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_write.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_info_dumper.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_iter.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/dbformat.cc + 
vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/error_handler.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/event_helpers.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/experimental.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/external_sst_file_ingestion_job.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/file_indexer.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_job.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_scheduler.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/forward_iterator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/import_column_family_job.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/internal_stats.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/logs_with_prep_tracker.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_writer.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/malloc_stats.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable_list.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_helper.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_operator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/output_validator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/periodic_task_scheduler.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_del_aggregator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_tombstone_fragmenter.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/repair.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/seqno_to_time_mapping.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/snapshot_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_cache.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_properties_collector.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/transaction_log_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/trim_history_scheduler.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_builder.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit_handler.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_set.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_edit.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_manager.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_column_serialization.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_columns.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch_base.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_controller.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_stall_stats.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_thread.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/composite_env.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_chroot.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_encryption.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_posix.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_posix.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_remap.cc + 
vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system_tracer.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/io_posix.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/mock_env.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/unique_id_gen.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/delete_scheduler.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_prefetch_buffer.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_util.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/filename.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/line_file_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/random_access_file_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/read_write_util.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/readahead_raf.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sequence_file_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sst_file_manager_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/writable_file_writer.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/auto_roll_logger.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/event_logger.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/log_buffer.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/arena.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/concurrent_arena.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/jemalloc_nodump_allocator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memkind_kmem_allocator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memory_allocator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/alloc_tracker.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/hash_linklist_rep.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/hash_skiplist_rep.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/skiplistrep.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/vectorrep.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/write_buffer_manager.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram_windowing.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/in_memory_stats_history.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/instrumented_mutex.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/iostats_context.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_context.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_level.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/persistent_stats_history.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/statistics.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater_debug.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util_debug.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/cf_options.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/configurable.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/customizable.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/db_options.cc + 
vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_helper.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_parser.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/mmap.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_posix.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_default.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_win.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/io_win.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/port_win.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_logger.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_thread.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/stack_trace.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/adaptive/adaptive_table_factory.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/binary_search_index_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_builder.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_factory.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_iterator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_builder.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_cache.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefetcher.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefix_index.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_hash_index.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_footer.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_block_reader_common.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_policy.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/flush_block_policy.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/full_filter_block.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/hash_index_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_builder.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_reader_common.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/parsed_full_filter_block.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_filter_block.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_iterator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/reader_common.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/uncompression_dict_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_fetcher.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_builder.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_factory.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/format.cc + 
vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/get_context.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/iterator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/merging_iterator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/compaction_merging_iterator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/meta_blocks.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/persistent_cache_helper.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_bloom.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_builder.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_factory.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_index.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_key_coding.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_dumper.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_writer.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_factory.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_properties.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/two_level_iterator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/unique_id.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/transaction_test_util.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/dump/db_dump_tool.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_handler.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_result.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_replay.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/block_cache_tracer.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/io_tracer.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/async_file_reader.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/build_version.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/cleanable.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coding.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compaction_job_stats_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/comparator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression_context_cache.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/concurrent_task_limiter_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_arm64.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/data_structure.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/dynamic_bloom.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/murmurhash.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/random.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/rate_limiter.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_config.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/slice.cc 
+ vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/file_checksum_helper.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/status.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/stderr_logger.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/string_util.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_local.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/threadpool_imp.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/xxhash.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/agg_merge/agg_merge.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/backup/backup_engine.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_compaction_filter.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl_filesnapshot.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_file.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/cassandra_compaction_filter.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/format.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/merge_operator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/checkpoint/checkpoint_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/convenience/info_log_finder.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/counted_fs.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/debug.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_mirror.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_timed.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_env.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_fs.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_secondary_cache.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/leveldb_options/leveldb_options.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/memory/memory_util.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/max.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/put.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/sortlist.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend2.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/uint64add.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/bytesxor.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/object_registry.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/option_change_migration/option_change_migration.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/options/options_util.cc + 
vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_file.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_metadata.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_tier.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/volatile_tier_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/cache_simulator.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/sim_cache.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/table_properties_collectors/compact_on_deletion_collector.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/file_trace_reader_writer.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/replayer_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/lock_manager.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_tracker.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_manager.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction_db_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction_db.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/snapshot_checker.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_base.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_db_mutex_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_util.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn_db.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn_db.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/ttl/db_ttl_impl.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/wal_filter.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index_internal.cc + + if os(windows) + cxx-sources: + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_default.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_win.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/io_win.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/port_win.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_logger.cc + vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_thread.cc + library chainweb-storage - -- import: warning-flags, debugging-flags + import: warning-flags, debugging-flags hs-source-dirs: vendored/chainweb-storage/src default-language: Haskell2010 exposed-modules: @@ -136,7 +591,7 @@ library chainweb-storage , memory >=0.14 , mtl >= 2.2 , nothunks >= 0.1.0.0 - , 
rocksdb-haskell-kadena >=1.1.0 + , chainweb:rocksdb-haskell-kadena , stm >=2.4 , streaming >=0.2 , temporary >=1.3 @@ -624,7 +1079,7 @@ test-suite chainweb-tests , resource-pool >= 0.4 , resourcet >= 1.3 , retry >= 0.7 - , rocksdb-haskell-kadena >= 1.1.0 + , chainweb:rocksdb-haskell-kadena , rosetta >= 1.0 , scheduler >= 1.4 , servant >= 0.20.1 @@ -810,7 +1265,7 @@ executable cwtool , retry >= 0.9 , wreq >= 0.5 , regex-tdfa >= 1.3 - , rocksdb-haskell-kadena >= 1.1.0 + , chainweb:rocksdb-haskell-kadena , safe-exceptions >= 0.1 , servant-client >= 0.18.2 , servant-client-core >= 0.18.2 diff --git a/vendored/rocksdb-haskell-kadena/CHANGELOG.md b/vendored/rocksdb-haskell-kadena/CHANGELOG.md new file mode 100644 index 0000000000..2fee918d08 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/CHANGELOG.md @@ -0,0 +1,35 @@ +## 1.0.1 + * Add support for UTF-8 characters in a database's path + +## 1.0.0 + + * First version by Serokell + +## 0.3x.0 + + * ResourceT is no longer compulsory + + +## 0.2.0 + + * requires LevelDB v1.7 + * support for filter policy (LevelDB v1.5), either custom or using the built-in + bloom filter implementation + * write batch values no longer require a `memcpy` to be early-finalizer-safe + (introduced in 0.1.1) + + +## 0.1.0 + + * memory (foreign pointers) is managed through + [ResourceT](http://hackage.haskell.org/package/resourcet). Note that this + requires to lift monadic actions inside the `MonadResource` monad, see the + examples. + * links against shared library (LevelDB v1.3 or higher) + * LevelDB 1.3 API fully supported (including custom comparators, excluding + custom environments) + + +## 0.0.x + + * experimental releases diff --git a/vendored/rocksdb-haskell-kadena/LICENSE b/vendored/rocksdb-haskell-kadena/LICENSE new file mode 100644 index 0000000000..1a671f848b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/LICENSE @@ -0,0 +1,31 @@ +Copyright (c) 2011, Kim Altintop +Copyright (c) 2014, Alexander Thiemann + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Kim Altintop nor the names of other + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
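Note on the dependency changes above: `chainweb:rocksdb-haskell-kadena` uses Cabal's syntax for depending on a named sub-library, `<package name>:<library name>` (Cabal 3.0+). Because the sub-library is referenced from within its own package, it does not need `visibility: public`; that field is only required when other packages depend on it. A minimal sketch of the pattern, with hypothetical package and component names that are not part of this patch:

```cabal
cabal-version: 3.0
name:          my-package
version:       0.1.0

-- a sub-library, usable by the other components of this package
library my-sublib
  hs-source-dirs:   vendored/my-sublib/src
  exposed-modules:  My.Sublib
  build-depends:    base
  default-language: Haskell2010

library
  hs-source-dirs:   src
  exposed-modules:  My.Lib
  -- <package>:<sub-library> selects the sub-library component
  build-depends:    base, my-package:my-sublib
  default-language: Haskell2010
```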
diff --git a/vendored/rocksdb-haskell-kadena/README.md b/vendored/rocksdb-haskell-kadena/README.md
new file mode 100644
index 0000000000..eeaa639a44
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/README.md
@@ -0,0 +1,47 @@
+This library provides Haskell bindings to
+[RocksDB](http://rocksdb.org).
+
+[![Build Status](https://travis-ci.org/serokell/rocksdb-haskell.svg?branch=master)](https://travis-ci.org/serokell/rocksdb-haskell)
+[![Windows build status](https://ci.appveyor.com/api/projects/status/x4dmt91wuk8dglw0/branch/master?svg=true)](https://ci.appveyor.com/project/jagajaga/rocksdb-haskell)
+
+## History
+
+Version 0.1.0:
+
+* initial release of this fork.
+
+## Installation
+
+Prerequisites:
+
+* [GHC 8.*](http://www.haskell.org/ghc)
+* [Cabal](http://www.haskell.org/cabal), version 1.3 or higher
+* [RocksDB](http://rocksdb.org)
+* Optional: [Snappy](http://code.google.com/p/snappy),
+  if compression support is desired
+
+To install the latest version from Hackage:
+
+```shell
+$ cabal install rocksdb-haskell
+```
+
+To install from checked-out source:
+
+```shell
+$ cabal install
+```
+
+## Notes
+
+This library is at a very early stage and has seen very limited testing. Comments
+and contributions are welcome.
+
+## Bugs and Contributing
+
+Please report issues via http://github.com/serokell/rocksdb-haskell/issues.
+Patches are best submitted as pull requests.
+
+## License
+
+BSD 3, see LICENSE file.
diff --git a/vendored/rocksdb-haskell-kadena/cpp/chainweb-rocksdb.cpp b/vendored/rocksdb-haskell-kadena/cpp/chainweb-rocksdb.cpp
new file mode 100644
index 0000000000..2a22d5dd8c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/cpp/chainweb-rocksdb.cpp
@@ -0,0 +1,132 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// The code not taken from the LevelDB and RocksDB authors is
+// Copyright (c) 2022 Kadena.
+
+// copied from rocksdb/c.cc
+#include <cstdlib>
+#include "rocksdb/db.h"
+#include "rocksdb/slice_transform.h"
+
+#include <memory>
+#include <unordered_set>
+#include <vector>
+
+using rocksdb::DB;
+using rocksdb::DBOptions;
+using rocksdb::DbPath;
+using rocksdb::Env;
+using rocksdb::EnvOptions;
+using rocksdb::Options;
+using rocksdb::ReadOptions;
+using rocksdb::Slice;
+using rocksdb::SliceParts;
+using rocksdb::SliceTransform;
+using rocksdb::Snapshot;
+using rocksdb::SstFileWriter;
+using rocksdb::Status;
+using rocksdb::WritableFile;
+using rocksdb::WriteOptions;
+
+using std::shared_ptr;
+using std::vector;
+using std::unordered_set;
+using std::map;
+
+class DelimitedPrefixTransform : public SliceTransform {
+ protected:
+  std::vector<char> *delims;
+ public:
+
+  explicit DelimitedPrefixTransform(std::vector<char> *_delims) : delims(_delims) { }
+  ~DelimitedPrefixTransform() {
+  }
+
+  static const char* kClassName() { return "rocksdb-haskell.DelimitedPrefix"; }
+  static const char* kNickName() { return "table"; }
+  const char* Name() const override { return kClassName(); }
+  Slice Transform(const Slice& src) const override {
+    size_t prefix_end;
+    for (char c : *delims) {
+      // NB: std::string(src.data()) reads up to the first NUL byte, so this
+      // assumes NUL-terminated key data
+      if ((prefix_end = std::string(src.data()).find(c)) != std::string::npos) {
+        return Slice(src.data(), prefix_end);
+      }
+    }
+    // if there is no delimiter we must return the entire string, because of the law:
+    //   prefix(prefix(str)) == prefix(str)
+    // this prevents us from implementing a real InDomain, which would go badly
+    // (redundant prefix seeking)
+    // *if* we mixed prefixed and non-prefixed keys in the same database
+    return src;
+  }
+
+  bool InDomain(const Slice&) const override {
+    return true;
+  }
+
+  bool InRange(const Slice&) const override {
+    return true;
+  }
+
+  bool FullLengthEnabled(size_t*) const override {
+    return false;
+  }
+
+  bool SameResultWhenAppended(const Slice&) const override {
+    return false;
+  }
+};
+
+std::vector<char> std_delimiters { '$', '%' };
+
+DelimitedPrefixTransform dollar_percent_delimited_transform(&std_delimiters);
+
+extern "C" {
+
+struct rocksdb_t { DB* rep; };
+struct rocksdb_options_t { Options rep; };
+struct rocksdb_writeoptions_t { WriteOptions rep; };
+struct rocksdb_readoptions_t { ReadOptions rep; };
+
+static bool SaveError(char** errptr, const Status& s) {
+  assert(errptr != nullptr);
+  if (s.ok()) {
+    return false;
+  } else if (*errptr == nullptr) {
+    *errptr = strdup(s.ToString().c_str());
+  } else {
+    // TODO(sanjay): Merge with existing error?
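+    // For now the old error message is simply replaced with the new one.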
+
+extern "C" {
+
+struct rocksdb_t { DB* rep; };
+struct rocksdb_options_t { Options rep; };
+struct rocksdb_writeoptions_t { WriteOptions rep; };
+struct rocksdb_readoptions_t { ReadOptions rep; };
+
+static bool SaveError(char** errptr, const Status& s) {
+  assert(errptr != nullptr);
+  if (s.ok()) {
+    return false;
+  } else if (*errptr == nullptr) {
+    *errptr = strdup(s.ToString().c_str());
+  } else {
+    // TODO(sanjay): Merge with existing error?
+    // This is a bug if *errptr is not created by malloc()
+    free(*errptr);
+    *errptr = strdup(s.ToString().c_str());
+  }
+  return true;
+}
+// end of code copied from rocksdb/c.cc
+
+void rocksdb_delete_range(rocksdb_t* db,
+                          const rocksdb_writeoptions_t* options,
+                          const char* start_key, size_t start_key_len,
+                          const char* end_key, size_t end_key_len,
+                          char** errptr) {
+  SaveError(errptr, db->rep->DeleteRange(options->rep, nullptr,
+                                         Slice(start_key, start_key_len),
+                                         Slice(end_key, end_key_len)));
+}
+
+void rocksdb_readoptions_set_auto_prefix_mode(rocksdb_readoptions_t* options, bool auto_prefix_mode) {
+  options->rep.auto_prefix_mode = auto_prefix_mode;
+}
+
+const void* rocksdb_options_table_prefix_extractor() {
+  return &dollar_percent_delimited_transform;
+}
+
+}
diff --git a/vendored/rocksdb-haskell-kadena/cpp/chainweb-rocksdb.h b/vendored/rocksdb-haskell-kadena/cpp/chainweb-rocksdb.h
new file mode 100644
index 0000000000..8ebb8829e4
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/cpp/chainweb-rocksdb.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <stdbool.h>
+
+#include "rocksdb/c.h"
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_range(rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* start_key, size_t start_key_len,
+    const char* end_key, size_t end_key_len,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_auto_prefix_mode(
+    rocksdb_readoptions_t* options, bool auto_prefix_mode);
+
+/* Returns the fixed prefix extractor that splits keys at the first '$' or
+   '%' delimiter (see DelimitedPrefixTransform in chainweb-rocksdb.cpp). */
+extern ROCKSDB_LIBRARY_API const void* rocksdb_options_table_prefix_extractor(void);
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/arena_wrapped_db_iter.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/arena_wrapped_db_iter.cc
new file mode 100644
index 0000000000..2dbfff79a9
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/arena_wrapped_db_iter.cc
@@ -0,0 +1,163 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/arena_wrapped_db_iter.h"
+
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
+                                       std::string* prop) {
+  if (prop_name == "rocksdb.iterator.super-version-number") {
+    // First try to pass the value returned from inner iterator.
+ if (!db_iter_->GetProperty(prop_name, prop).ok()) { + *prop = std::to_string(sv_number_); + } + return Status::OK(); + } + return db_iter_->GetProperty(prop_name, prop); +} + +void ArenaWrappedDBIter::Init( + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration, + uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) { + auto mem = arena_.AllocateAligned(sizeof(DBIter)); + db_iter_ = + new (mem) DBIter(env, read_options, ioptions, mutable_cf_options, + ioptions.user_comparator, /* iter */ nullptr, version, + sequence, true, max_sequential_skip_in_iteration, + read_callback, db_impl, cfd, expose_blob_index); + sv_number_ = version_number; + read_options_ = read_options; + allow_refresh_ = allow_refresh; + memtable_range_tombstone_iter_ = nullptr; + if (!env->GetFileSystem()->use_async_io()) { + read_options_.async_io = false; + } +} + +Status ArenaWrappedDBIter::Refresh() { + if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) { + return Status::NotSupported("Creating renew iterator is not allowed."); + } + assert(db_iter_ != nullptr); + // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the + // correct behavior. Will be corrected automatically when we take a snapshot + // here for the case of WritePreparedTxnDB. + uint64_t cur_sv_number = cfd_->GetSuperVersionNumber(); + TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1"); + TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2"); + auto reinit_internal_iter = [&]() { + Env* env = db_iter_->env(); + db_iter_->~DBIter(); + arena_.~Arena(); + new (&arena_) Arena(); + + SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_); + SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); + if (read_callback_) { + read_callback_->Refresh(latest_seq); + } + Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, + sv->current, latest_seq, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_, + allow_refresh_); + + InternalIterator* internal_iter = db_impl_->NewInternalIterator( + read_options_, cfd_, sv, &arena_, latest_seq, + /* allow_unprepared_value */ true, /* db_iter */ this); + SetIterUnderDBIter(internal_iter); + }; + while (true) { + if (sv_number_ != cur_sv_number) { + reinit_internal_iter(); + break; + } else { + SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); + // Refresh range-tombstones in MemTable + if (!read_options_.ignore_range_deletions) { + SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_); + TEST_SYNC_POINT_CALLBACK("ArenaWrappedDBIter::Refresh:SV", nullptr); + auto t = sv->mem->NewRangeTombstoneIterator( + read_options_, latest_seq, false /* immutable_memtable */); + if (!t || t->empty()) { + // If memtable_range_tombstone_iter_ points to a non-empty tombstone + // iterator, then it means sv->mem is not the memtable that + // memtable_range_tombstone_iter_ points to, so SV must have changed + // after the sv_number_ != cur_sv_number check above. We will fall + // back to re-init the InternalIterator, and the tombstone iterator + // will be freed during db_iter destruction there. 
+ if (memtable_range_tombstone_iter_) { + assert(!*memtable_range_tombstone_iter_ || + sv_number_ != cfd_->GetSuperVersionNumber()); + } + delete t; + } else { // current mutable memtable has range tombstones + if (!memtable_range_tombstone_iter_) { + delete t; + db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv); + // The memtable under DBIter did not have range tombstone before + // refresh. + reinit_internal_iter(); + break; + } else { + delete *memtable_range_tombstone_iter_; + *memtable_range_tombstone_iter_ = new TruncatedRangeDelIterator( + std::unique_ptr(t), + &cfd_->internal_comparator(), nullptr, nullptr); + } + } + db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv); + } + // Refresh latest sequence number + db_iter_->set_sequence(latest_seq); + db_iter_->set_valid(false); + // Check again if the latest super version number is changed + uint64_t latest_sv_number = cfd_->GetSuperVersionNumber(); + if (latest_sv_number != cur_sv_number) { + // If the super version number is changed after refreshing, + // fallback to Re-Init the InternalIterator + cur_sv_number = latest_sv_number; + continue; + } + break; + } + } + return Status::OK(); +} + +ArenaWrappedDBIter* NewArenaWrappedDbIterator( + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) { + ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); + iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence, + max_sequential_skip_in_iterations, version_number, read_callback, + db_impl, cfd, expose_blob_index, allow_refresh); + if (db_impl != nullptr && cfd != nullptr && allow_refresh) { + iter->StoreRefreshInfo(db_impl, cfd, read_callback, expose_blob_index); + } + + return iter; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/arena_wrapped_db_iter.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/arena_wrapped_db_iter.h new file mode 100644 index 0000000000..f15be306d2 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/arena_wrapped_db_iter.h @@ -0,0 +1,127 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include + +#include "db/db_impl/db_impl.h" +#include "db/db_iter.h" +#include "db/range_del_aggregator.h" +#include "memory/arena.h" +#include "options/cf_options.h" +#include "rocksdb/db.h" +#include "rocksdb/iterator.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class Arena; +class Version; + +// A wrapper iterator which wraps DB Iterator and the arena, with which the DB +// iterator is supposed to be allocated. This class is used as an entry point of +// a iterator hierarchy whose memory can be allocated inline. In that way, +// accessing the iterator tree can be more cache friendly. It is also faster +// to allocate. 
+// When using the class's Iterator interface, the behavior is exactly +// the same as the inner DBIter. +class ArenaWrappedDBIter : public Iterator { + public: + ~ArenaWrappedDBIter() override { + if (db_iter_ != nullptr) { + db_iter_->~DBIter(); + } else { + assert(false); + } + } + + // Get the arena to be used to allocate memory for DBIter to be wrapped, + // as well as child iterators in it. + virtual Arena* GetArena() { return &arena_; } + + const ReadOptions& GetReadOptions() { return read_options_; } + + // Set the internal iterator wrapped inside the DB Iterator. Usually it is + // a merging iterator. + virtual void SetIterUnderDBIter(InternalIterator* iter) { + db_iter_->SetIter(iter); + } + + void SetMemtableRangetombstoneIter(TruncatedRangeDelIterator** iter) { + memtable_range_tombstone_iter_ = iter; + } + + bool Valid() const override { return db_iter_->Valid(); } + void SeekToFirst() override { db_iter_->SeekToFirst(); } + void SeekToLast() override { db_iter_->SeekToLast(); } + // 'target' does not contain timestamp, even if user timestamp feature is + // enabled. + void Seek(const Slice& target) override { db_iter_->Seek(target); } + void SeekForPrev(const Slice& target) override { + db_iter_->SeekForPrev(target); + } + void Next() override { db_iter_->Next(); } + void Prev() override { db_iter_->Prev(); } + Slice key() const override { return db_iter_->key(); } + Slice value() const override { return db_iter_->value(); } + const WideColumns& columns() const override { return db_iter_->columns(); } + Status status() const override { return db_iter_->status(); } + Slice timestamp() const override { return db_iter_->timestamp(); } + bool IsBlob() const { return db_iter_->IsBlob(); } + + Status GetProperty(std::string prop_name, std::string* prop) override; + + Status Refresh() override; + + void Init(Env* env, const ReadOptions& read_options, + const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool expose_blob_index, bool allow_refresh); + + // Store some parameters so we can refresh the iterator at a later point + // with these same params + void StoreRefreshInfo(DBImpl* db_impl, ColumnFamilyData* cfd, + ReadCallback* read_callback, bool expose_blob_index) { + db_impl_ = db_impl; + cfd_ = cfd; + read_callback_ = read_callback; + expose_blob_index_ = expose_blob_index; + } + + private: + DBIter* db_iter_ = nullptr; + Arena arena_; + uint64_t sv_number_; + ColumnFamilyData* cfd_ = nullptr; + DBImpl* db_impl_ = nullptr; + ReadOptions read_options_; + ReadCallback* read_callback_; + bool expose_blob_index_ = false; + bool allow_refresh_ = true; + // If this is nullptr, it means the mutable memtable does not contain range + // tombstone when added under this DBIter. + TruncatedRangeDelIterator** memtable_range_tombstone_iter_ = nullptr; +}; + +// Generate the arena wrapped iterator class. +// `db_impl` and `cfd` are used for reneweal. If left null, renewal will not +// be supported. 
+extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, const Version* version,
+    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+    uint64_t version_number, ReadCallback* read_callback,
+    DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr,
+    bool expose_blob_index = false, bool allow_refresh = true);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_constants.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_constants.h
new file mode 100644
index 0000000000..a5d09ac767
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_constants.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr uint64_t kInvalidBlobFileNumber = 0;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_contents.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_contents.cc
new file mode 100644
index 0000000000..8b793c5d26
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_contents.cc
@@ -0,0 +1,42 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_contents.h"
+
+#include <cassert>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_helpers.h"
+#include "port/malloc.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+size_t BlobContents::ApproximateMemoryUsage() const {
+  size_t usage = 0;
+
+  if (allocation_) {
+    MemoryAllocator* const allocator = allocation_.get_deleter().allocator;
+
+    if (allocator) {
+      usage += allocator->UsableSize(allocation_.get(), data_.size());
+    } else {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+      usage += malloc_usable_size(allocation_.get());
+#else
+      usage += data_.size();
+#endif
+    }
+  }
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size(const_cast<BlobContents*>(this));
+#else
+  usage += sizeof(*this);
+#endif
+
+  return usage;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_contents.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_contents.h
new file mode 100644
index 0000000000..15a672a0ae
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_contents.h
@@ -0,0 +1,59 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+
+#include "memory/memory_allocator_impl.h"
+#include "rocksdb/advanced_cache.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A class representing a single uncompressed value read from a blob file.
+class BlobContents {
+ public:
+  BlobContents(CacheAllocationPtr&& allocation, size_t size)
+      : allocation_(std::move(allocation)), data_(allocation_.get(), size) {}
+
+  BlobContents(const BlobContents&) = delete;
+  BlobContents& operator=(const BlobContents&) = delete;
+
+  BlobContents(BlobContents&&) = default;
+  BlobContents& operator=(BlobContents&&) = default;
+
+  ~BlobContents() = default;
+
+  const Slice& data() const { return data_; }
+  size_t size() const { return data_.size(); }
+
+  size_t ApproximateMemoryUsage() const;
+
+  // For TypedCacheInterface
+  const Slice& ContentSlice() const { return data_; }
+  static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kBlobValue;
+
+ private:
+  CacheAllocationPtr allocation_;
+  Slice data_;
+};
+
+class BlobContentsCreator : public Cache::CreateContext {
+ public:
+  static void Create(std::unique_ptr<BlobContents>* out, size_t* out_charge,
+                     const Slice& contents, MemoryAllocator* alloc) {
+    auto raw = new BlobContents(AllocateAndCopyBlock(contents, alloc),
+                                contents.size());
+    out->reset(raw);
+    if (out_charge) {
+      *out_charge = raw->ApproximateMemoryUsage();
+    }
+  }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_counting_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_counting_iterator.h
new file mode 100644
index 0000000000..b21651f66f
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_counting_iterator.h
@@ -0,0 +1,150 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "db/blob/blob_garbage_meter.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal iterator that passes each key-value encountered to
+// BlobGarbageMeter as inflow in order to measure the total number and size of
+// blobs in the compaction input on a per-blob file basis.
+class BlobCountingIterator : public InternalIterator { + public: + BlobCountingIterator(InternalIterator* iter, + BlobGarbageMeter* blob_garbage_meter) + : iter_(iter), blob_garbage_meter_(blob_garbage_meter) { + assert(iter_); + assert(blob_garbage_meter_); + + UpdateAndCountBlobIfNeeded(); + } + + bool Valid() const override { return iter_->Valid() && status_.ok(); } + + void SeekToFirst() override { + iter_->SeekToFirst(); + UpdateAndCountBlobIfNeeded(); + } + + void SeekToLast() override { + iter_->SeekToLast(); + UpdateAndCountBlobIfNeeded(); + } + + void Seek(const Slice& target) override { + iter_->Seek(target); + UpdateAndCountBlobIfNeeded(); + } + + void SeekForPrev(const Slice& target) override { + iter_->SeekForPrev(target); + UpdateAndCountBlobIfNeeded(); + } + + void Next() override { + assert(Valid()); + + iter_->Next(); + UpdateAndCountBlobIfNeeded(); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(Valid()); + + const bool res = iter_->NextAndGetResult(result); + UpdateAndCountBlobIfNeeded(); + return res; + } + + void Prev() override { + assert(Valid()); + + iter_->Prev(); + UpdateAndCountBlobIfNeeded(); + } + + Slice key() const override { + assert(Valid()); + return iter_->key(); + } + + Slice user_key() const override { + assert(Valid()); + return iter_->user_key(); + } + + Slice value() const override { + assert(Valid()); + return iter_->value(); + } + + Status status() const override { return status_; } + + bool PrepareValue() override { + assert(Valid()); + return iter_->PrepareValue(); + } + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return iter_->MayBeOutOfLowerBound(); + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(Valid()); + return iter_->UpperBoundCheckResult(); + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + + bool IsKeyPinned() const override { + assert(Valid()); + return iter_->IsKeyPinned(); + } + + bool IsValuePinned() const override { + assert(Valid()); + return iter_->IsValuePinned(); + } + + Status GetProperty(std::string prop_name, std::string* prop) override { + return iter_->GetProperty(prop_name, prop); + } + + bool IsDeleteRangeSentinelKey() const override { + return iter_->IsDeleteRangeSentinelKey(); + } + + private: + void UpdateAndCountBlobIfNeeded() { + assert(!iter_->Valid() || iter_->status().ok()); + + if (!iter_->Valid()) { + status_ = iter_->status(); + return; + } + + TEST_SYNC_POINT( + "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow"); + + status_ = blob_garbage_meter_->ProcessInFlow(key(), value()); + } + + InternalIterator* iter_; + BlobGarbageMeter* blob_garbage_meter_; + Status status_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_fetcher.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_fetcher.cc new file mode 100644 index 0000000000..124429f93a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_fetcher.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
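+
+// Illustration (hypothetical call sequence, for exposition only; not part of
+// the upstream file): BlobFetcher, declared in blob_fetcher.h, is a thin
+// wrapper over Version::GetBlob; it resolves a blob index that was stored
+// inline in the LSM tree into the actual blob value:
+//
+//   BlobFetcher fetcher(version, read_options);
+//   PinnableSlice value;
+//   uint64_t bytes_read = 0;
+//   Status s = fetcher.FetchBlob(user_key, blob_index_slice,
+//                                /* prefetch_buffer */ nullptr, &value,
+//                                &bytes_read);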
+ +#include "db/blob/blob_fetcher.h" + +#include "db/version_set.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobFetcher::FetchBlob(const Slice& user_key, + const Slice& blob_index_slice, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, + uint64_t* bytes_read) const { + assert(version_); + + return version_->GetBlob(read_options_, user_key, blob_index_slice, + prefetch_buffer, blob_value, bytes_read); +} + +Status BlobFetcher::FetchBlob(const Slice& user_key, + const BlobIndex& blob_index, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, + uint64_t* bytes_read) const { + assert(version_); + + return version_->GetBlob(read_options_, user_key, blob_index, prefetch_buffer, + blob_value, bytes_read); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_fetcher.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_fetcher.h new file mode 100644 index 0000000000..8aeaf965d2 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_fetcher.h @@ -0,0 +1,37 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Version; +class Slice; +class FilePrefetchBuffer; +class PinnableSlice; +class BlobIndex; + +// A thin wrapper around the blob retrieval functionality of Version. +class BlobFetcher { + public: + BlobFetcher(const Version* version, const ReadOptions& read_options) + : version_(version), read_options_(read_options) {} + + Status FetchBlob(const Slice& user_key, const Slice& blob_index_slice, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, uint64_t* bytes_read) const; + + Status FetchBlob(const Slice& user_key, const BlobIndex& blob_index, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, uint64_t* bytes_read) const; + + private: + const Version* version_; + ReadOptions read_options_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_addition.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_addition.cc new file mode 100644 index 0000000000..71b1bb7fca --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_addition.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_addition.h" + +#include +#include + +#include "logging/event_logger.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +// Tags for custom fields. Note that these get persisted in the manifest, +// so existing tags should not be modified. 
+enum BlobFileAddition::CustomFieldTags : uint32_t { + kEndMarker, + + // Add forward compatible fields here + + ///////////////////////////////////////////////////////////////////// + + kForwardIncompatibleMask = 1 << 6, + + // Add forward incompatible fields here +}; + +void BlobFileAddition::EncodeTo(std::string* output) const { + PutVarint64(output, blob_file_number_); + PutVarint64(output, total_blob_count_); + PutVarint64(output, total_blob_bytes_); + PutLengthPrefixedSlice(output, checksum_method_); + PutLengthPrefixedSlice(output, checksum_value_); + + // Encode any custom fields here. The format to use is a Varint32 tag (see + // CustomFieldTags above) followed by a length prefixed slice. Unknown custom + // fields will be ignored during decoding unless they're in the forward + // incompatible range. + + TEST_SYNC_POINT_CALLBACK("BlobFileAddition::EncodeTo::CustomFields", output); + + PutVarint32(output, kEndMarker); +} + +Status BlobFileAddition::DecodeFrom(Slice* input) { + constexpr char class_name[] = "BlobFileAddition"; + + if (!GetVarint64(input, &blob_file_number_)) { + return Status::Corruption(class_name, "Error decoding blob file number"); + } + + if (!GetVarint64(input, &total_blob_count_)) { + return Status::Corruption(class_name, "Error decoding total blob count"); + } + + if (!GetVarint64(input, &total_blob_bytes_)) { + return Status::Corruption(class_name, "Error decoding total blob bytes"); + } + + Slice checksum_method; + if (!GetLengthPrefixedSlice(input, &checksum_method)) { + return Status::Corruption(class_name, "Error decoding checksum method"); + } + checksum_method_ = checksum_method.ToString(); + + Slice checksum_value; + if (!GetLengthPrefixedSlice(input, &checksum_value)) { + return Status::Corruption(class_name, "Error decoding checksum value"); + } + checksum_value_ = checksum_value.ToString(); + + while (true) { + uint32_t custom_field_tag = 0; + if (!GetVarint32(input, &custom_field_tag)) { + return Status::Corruption(class_name, "Error decoding custom field tag"); + } + + if (custom_field_tag == kEndMarker) { + break; + } + + if (custom_field_tag & kForwardIncompatibleMask) { + return Status::Corruption( + class_name, "Forward incompatible custom field encountered"); + } + + Slice custom_field_value; + if (!GetLengthPrefixedSlice(input, &custom_field_value)) { + return Status::Corruption(class_name, + "Error decoding custom field value"); + } + } + + return Status::OK(); +} + +std::string BlobFileAddition::DebugString() const { + std::ostringstream oss; + + oss << *this; + + return oss.str(); +} + +std::string BlobFileAddition::DebugJSON() const { + JSONWriter jw; + + jw << *this; + + jw.EndObject(); + + return jw.Get(); +} + +bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs) { + return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() && + lhs.GetTotalBlobCount() == rhs.GetTotalBlobCount() && + lhs.GetTotalBlobBytes() == rhs.GetTotalBlobBytes() && + lhs.GetChecksumMethod() == rhs.GetChecksumMethod() && + lhs.GetChecksumValue() == rhs.GetChecksumValue(); +} + +bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs) { + return !(lhs == rhs); +} + +std::ostream& operator<<(std::ostream& os, + const BlobFileAddition& blob_file_addition) { + os << "blob_file_number: " << blob_file_addition.GetBlobFileNumber() + << " total_blob_count: " << blob_file_addition.GetTotalBlobCount() + << " total_blob_bytes: " << blob_file_addition.GetTotalBlobBytes() + << " checksum_method: " << blob_file_addition.GetChecksumMethod() 
+ << " checksum_value: " + << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true); + + return os; +} + +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileAddition& blob_file_addition) { + jw << "BlobFileNumber" << blob_file_addition.GetBlobFileNumber() + << "TotalBlobCount" << blob_file_addition.GetTotalBlobCount() + << "TotalBlobBytes" << blob_file_addition.GetTotalBlobBytes() + << "ChecksumMethod" << blob_file_addition.GetChecksumMethod() + << "ChecksumValue" + << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true); + + return jw; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_addition.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_addition.h new file mode 100644 index 0000000000..43b1a0bcbe --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_addition.h @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "db/blob/blob_constants.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class JSONWriter; +class Slice; +class Status; + +class BlobFileAddition { + public: + BlobFileAddition() = default; + + BlobFileAddition(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) + : blob_file_number_(blob_file_number), + total_blob_count_(total_blob_count), + total_blob_bytes_(total_blob_bytes), + checksum_method_(std::move(checksum_method)), + checksum_value_(std::move(checksum_value)) { + assert(checksum_method_.empty() == checksum_value_.empty()); + } + + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + uint64_t GetTotalBlobCount() const { return total_blob_count_; } + uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; } + const std::string& GetChecksumMethod() const { return checksum_method_; } + const std::string& GetChecksumValue() const { return checksum_value_; } + + void EncodeTo(std::string* output) const; + Status DecodeFrom(Slice* input); + + std::string DebugString() const; + std::string DebugJSON() const; + + private: + enum CustomFieldTags : uint32_t; + + uint64_t blob_file_number_ = kInvalidBlobFileNumber; + uint64_t total_blob_count_ = 0; + uint64_t total_blob_bytes_ = 0; + std::string checksum_method_; + std::string checksum_value_; +}; + +bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs); +bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs); + +std::ostream& operator<<(std::ostream& os, + const BlobFileAddition& blob_file_addition); +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileAddition& blob_file_addition); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_builder.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_builder.cc new file mode 100644 index 0000000000..21c1e5d417 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_builder.cc @@ -0,0 +1,426 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_builder.h" + +#include + +#include "db/blob/blob_contents.h" +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_completion_callback.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "db/blob/blob_source.h" +#include "db/event_helpers.h" +#include "db/version_set.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "logging/logging.h" +#include "options/cf_options.h" +#include "options/options_helper.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "trace_replay/io_tracer.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +BlobFileBuilder::BlobFileBuilder( + VersionSet* versions, FileSystem* fs, + const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, + std::string db_id, std::string db_session_id, int job_id, + uint32_t column_family_id, const std::string& column_family_name, + Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions) + : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs, + immutable_options, mutable_cf_options, file_options, + db_id, db_session_id, job_id, column_family_id, + column_family_name, io_priority, write_hint, io_tracer, + blob_callback, creation_reason, blob_file_paths, + blob_file_additions) {} + +BlobFileBuilder::BlobFileBuilder( + std::function file_number_generator, FileSystem* fs, + const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, + std::string db_id, std::string db_session_id, int job_id, + uint32_t column_family_id, const std::string& column_family_name, + Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions) + : file_number_generator_(std::move(file_number_generator)), + fs_(fs), + immutable_options_(immutable_options), + min_blob_size_(mutable_cf_options->min_blob_size), + blob_file_size_(mutable_cf_options->blob_file_size), + blob_compression_type_(mutable_cf_options->blob_compression_type), + prepopulate_blob_cache_(mutable_cf_options->prepopulate_blob_cache), + file_options_(file_options), + db_id_(std::move(db_id)), + db_session_id_(std::move(db_session_id)), + job_id_(job_id), + column_family_id_(column_family_id), + column_family_name_(column_family_name), + io_priority_(io_priority), + write_hint_(write_hint), + io_tracer_(io_tracer), + blob_callback_(blob_callback), + creation_reason_(creation_reason), + blob_file_paths_(blob_file_paths), + blob_file_additions_(blob_file_additions), + blob_count_(0), + blob_bytes_(0) { + assert(file_number_generator_); + assert(fs_); + assert(immutable_options_); + assert(file_options_); + assert(blob_file_paths_); + assert(blob_file_paths_->empty()); + assert(blob_file_additions_); + 
assert(blob_file_additions_->empty()); +} + +BlobFileBuilder::~BlobFileBuilder() = default; + +Status BlobFileBuilder::Add(const Slice& key, const Slice& value, + std::string* blob_index) { + assert(blob_index); + assert(blob_index->empty()); + + if (value.size() < min_blob_size_) { + return Status::OK(); + } + + { + const Status s = OpenBlobFileIfNeeded(); + if (!s.ok()) { + return s; + } + } + + Slice blob = value; + std::string compressed_blob; + + { + const Status s = CompressBlobIfNeeded(&blob, &compressed_blob); + if (!s.ok()) { + return s; + } + } + + uint64_t blob_file_number = 0; + uint64_t blob_offset = 0; + + { + const Status s = + WriteBlobToFile(key, blob, &blob_file_number, &blob_offset); + if (!s.ok()) { + return s; + } + } + + { + const Status s = CloseBlobFileIfNeeded(); + if (!s.ok()) { + return s; + } + } + + { + const Status s = + PutBlobIntoCacheIfNeeded(value, blob_file_number, blob_offset); + if (!s.ok()) { + ROCKS_LOG_WARN(immutable_options_->info_log, + "Failed to pre-populate the blob into blob cache: %s", + s.ToString().c_str()); + } + } + + BlobIndex::EncodeBlob(blob_index, blob_file_number, blob_offset, blob.size(), + blob_compression_type_); + + return Status::OK(); +} + +Status BlobFileBuilder::Finish() { + if (!IsBlobFileOpen()) { + return Status::OK(); + } + + return CloseBlobFile(); +} + +bool BlobFileBuilder::IsBlobFileOpen() const { return !!writer_; } + +Status BlobFileBuilder::OpenBlobFileIfNeeded() { + if (IsBlobFileOpen()) { + return Status::OK(); + } + + assert(!blob_count_); + assert(!blob_bytes_); + + assert(file_number_generator_); + const uint64_t blob_file_number = file_number_generator_(); + + assert(immutable_options_); + assert(!immutable_options_->cf_paths.empty()); + std::string blob_file_path = + BlobFileName(immutable_options_->cf_paths.front().path, blob_file_number); + + if (blob_callback_) { + blob_callback_->OnBlobFileCreationStarted( + blob_file_path, column_family_name_, job_id_, creation_reason_); + } + + std::unique_ptr file; + + { + assert(file_options_); + Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_); + + TEST_SYNC_POINT_CALLBACK( + "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s); + + if (!s.ok()) { + return s; + } + } + + // Note: files get added to blob_file_paths_ right after the open, so they + // can be cleaned up upon failure. Contrast this with blob_file_additions_, + // which only contains successfully written files. 
+ assert(blob_file_paths_); + blob_file_paths_->emplace_back(std::move(blob_file_path)); + + assert(file); + file->SetIOPriority(io_priority_); + file->SetWriteLifeTimeHint(write_hint_); + FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types; + Statistics* const statistics = immutable_options_->stats; + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_paths_->back(), *file_options_, + immutable_options_->clock, io_tracer_, statistics, + immutable_options_->listeners, + immutable_options_->file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kBlobFile), false)); + + constexpr bool do_flush = false; + + std::unique_ptr blob_log_writer(new BlobLogWriter( + std::move(file_writer), immutable_options_->clock, statistics, + blob_file_number, immutable_options_->use_fsync, do_flush)); + + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + + BlobLogHeader header(column_family_id_, blob_compression_type_, has_ttl, + expiration_range); + + { + Status s = blob_log_writer->WriteHeader(header); + + TEST_SYNC_POINT_CALLBACK( + "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s); + + if (!s.ok()) { + return s; + } + } + + writer_ = std::move(blob_log_writer); + + assert(IsBlobFileOpen()); + + return Status::OK(); +} + +Status BlobFileBuilder::CompressBlobIfNeeded( + Slice* blob, std::string* compressed_blob) const { + assert(blob); + assert(compressed_blob); + assert(compressed_blob->empty()); + assert(immutable_options_); + + if (blob_compression_type_ == kNoCompression) { + return Status::OK(); + } + + // TODO: allow user CompressionOptions, including max_compressed_bytes_per_kb + CompressionOptions opts; + CompressionContext context(blob_compression_type_); + constexpr uint64_t sample_for_compression = 0; + + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), + blob_compression_type_, sample_for_compression); + + constexpr uint32_t compression_format_version = 2; + + bool success = false; + + { + StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats, + BLOB_DB_COMPRESSION_MICROS); + success = + CompressData(*blob, info, compression_format_version, compressed_blob); + } + + if (!success) { + return Status::Corruption("Error compressing blob"); + } + + *blob = Slice(*compressed_blob); + + return Status::OK(); +} + +Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob, + uint64_t* blob_file_number, + uint64_t* blob_offset) { + assert(IsBlobFileOpen()); + assert(blob_file_number); + assert(blob_offset); + + uint64_t key_offset = 0; + + Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset); + + TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord", &s); + + if (!s.ok()) { + return s; + } + + *blob_file_number = writer_->get_log_number(); + + ++blob_count_; + blob_bytes_ += BlobLogRecord::kHeaderSize + key.size() + blob.size(); + + return Status::OK(); +} + +Status BlobFileBuilder::CloseBlobFile() { + assert(IsBlobFileOpen()); + + BlobLogFooter footer; + footer.blob_count = blob_count_; + + std::string checksum_method; + std::string checksum_value; + + Status s = writer_->AppendFooter(footer, &checksum_method, &checksum_value); + + TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter", &s); + + if (!s.ok()) { + return s; + } + + const uint64_t blob_file_number = writer_->get_log_number(); + + if (blob_callback_) { + s = blob_callback_->OnBlobFileCompleted( + blob_file_paths_->back(), column_family_name_, 
job_id_, + blob_file_number, creation_reason_, s, checksum_value, checksum_method, + blob_count_, blob_bytes_); + } + + assert(blob_file_additions_); + blob_file_additions_->emplace_back(blob_file_number, blob_count_, blob_bytes_, + std::move(checksum_method), + std::move(checksum_value)); + + assert(immutable_options_); + ROCKS_LOG_INFO(immutable_options_->logger, + "[%s] [JOB %d] Generated blob file #%" PRIu64 ": %" PRIu64 + " total blobs, %" PRIu64 " total bytes", + column_family_name_.c_str(), job_id_, blob_file_number, + blob_count_, blob_bytes_); + + writer_.reset(); + blob_count_ = 0; + blob_bytes_ = 0; + + return s; +} + +Status BlobFileBuilder::CloseBlobFileIfNeeded() { + assert(IsBlobFileOpen()); + + const WritableFileWriter* const file_writer = writer_->file(); + assert(file_writer); + + if (file_writer->GetFileSize() < blob_file_size_) { + return Status::OK(); + } + + return CloseBlobFile(); +} + +void BlobFileBuilder::Abandon(const Status& s) { + if (!IsBlobFileOpen()) { + return; + } + if (blob_callback_) { + // BlobFileBuilder::Abandon() is called because of error while writing to + // Blob files. So we can ignore the below error. + blob_callback_ + ->OnBlobFileCompleted(blob_file_paths_->back(), column_family_name_, + job_id_, writer_->get_log_number(), + creation_reason_, s, "", "", blob_count_, + blob_bytes_) + .PermitUncheckedError(); + } + + writer_.reset(); + blob_count_ = 0; + blob_bytes_ = 0; +} + +Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob, + uint64_t blob_file_number, + uint64_t blob_offset) const { + Status s = Status::OK(); + + BlobSource::SharedCacheInterface blob_cache{immutable_options_->blob_cache}; + auto statistics = immutable_options_->statistics.get(); + bool warm_cache = + prepopulate_blob_cache_ == PrepopulateBlobCache::kFlushOnly && + creation_reason_ == BlobFileCreationReason::kFlush; + + if (blob_cache && warm_cache) { + const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, + blob_file_number); + const CacheKey cache_key = base_cache_key.WithOffset(blob_offset); + const Slice key = cache_key.AsSlice(); + + const Cache::Priority priority = Cache::Priority::BOTTOM; + + s = blob_cache.InsertSaved(key, blob, nullptr /*context*/, priority, + immutable_options_->lowest_used_cache_tier); + + if (s.ok()) { + RecordTick(statistics, BLOB_DB_CACHE_ADD); + RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, blob.size()); + } else { + RecordTick(statistics, BLOB_DB_CACHE_ADD_FAILURES); + } + } + + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_builder.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_builder.h new file mode 100644 index 0000000000..8e7aab502d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_builder.h @@ -0,0 +1,112 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
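+
+// Illustration (hypothetical call sequence, for exposition only; not part of
+// the upstream file): typical use of the builder declared below. Values
+// shorter than min_blob_size stay inline in the LSM tree; larger ones are
+// written to a blob file and replaced by an encoded blob index:
+//
+//   std::string blob_index;
+//   Status s = builder.Add(key, value, &blob_index);
+//   // blob_index stays empty if the value was kept inline; otherwise it
+//   // encodes (blob file number, offset, size, compression type).
+//   ...
+//   s = builder.Finish();  // seals the currently open blob file, if any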
+#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/advanced_options.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/env.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class VersionSet; +class FileSystem; +class SystemClock; +struct ImmutableOptions; +struct MutableCFOptions; +struct FileOptions; +class BlobFileAddition; +class Status; +class Slice; +class BlobLogWriter; +class IOTracer; +class BlobFileCompletionCallback; + +class BlobFileBuilder { + public: + BlobFileBuilder(VersionSet* versions, FileSystem* fs, + const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, + const FileOptions* file_options, std::string db_id, + std::string db_session_id, int job_id, + uint32_t column_family_id, + const std::string& column_family_name, + Env::IOPriority io_priority, + Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions); + + BlobFileBuilder(std::function file_number_generator, + FileSystem* fs, const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, + const FileOptions* file_options, std::string db_id, + std::string db_session_id, int job_id, + uint32_t column_family_id, + const std::string& column_family_name, + Env::IOPriority io_priority, + Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions); + + BlobFileBuilder(const BlobFileBuilder&) = delete; + BlobFileBuilder& operator=(const BlobFileBuilder&) = delete; + + ~BlobFileBuilder(); + + Status Add(const Slice& key, const Slice& value, std::string* blob_index); + Status Finish(); + void Abandon(const Status& s); + + private: + bool IsBlobFileOpen() const; + Status OpenBlobFileIfNeeded(); + Status CompressBlobIfNeeded(Slice* blob, std::string* compressed_blob) const; + Status WriteBlobToFile(const Slice& key, const Slice& blob, + uint64_t* blob_file_number, uint64_t* blob_offset); + Status CloseBlobFile(); + Status CloseBlobFileIfNeeded(); + + Status PutBlobIntoCacheIfNeeded(const Slice& blob, uint64_t blob_file_number, + uint64_t blob_offset) const; + + std::function file_number_generator_; + FileSystem* fs_; + const ImmutableOptions* immutable_options_; + uint64_t min_blob_size_; + uint64_t blob_file_size_; + CompressionType blob_compression_type_; + PrepopulateBlobCache prepopulate_blob_cache_; + const FileOptions* file_options_; + const std::string db_id_; + const std::string db_session_id_; + int job_id_; + uint32_t column_family_id_; + std::string column_family_name_; + Env::IOPriority io_priority_; + Env::WriteLifeTimeHint write_hint_; + std::shared_ptr io_tracer_; + BlobFileCompletionCallback* blob_callback_; + BlobFileCreationReason creation_reason_; + std::vector* blob_file_paths_; + std::vector* blob_file_additions_; + std::unique_ptr writer_; + uint64_t blob_count_; + uint64_t blob_bytes_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_cache.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_cache.cc new file mode 100644 index 0000000000..deebf8d344 --- /dev/null +++ 
b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_cache.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_cache.h" + +#include +#include + +#include "db/blob/blob_file_reader.h" +#include "options/cf_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "test_util/sync_point.h" +#include "trace_replay/io_tracer.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +BlobFileCache::BlobFileCache(Cache* cache, + const ImmutableOptions* immutable_options, + const FileOptions* file_options, + uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + const std::shared_ptr& io_tracer) + : cache_(cache), + mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr), + immutable_options_(immutable_options), + file_options_(file_options), + column_family_id_(column_family_id), + blob_file_read_hist_(blob_file_read_hist), + io_tracer_(io_tracer) { + assert(cache_); + assert(immutable_options_); + assert(file_options_); +} + +Status BlobFileCache::GetBlobFileReader( + const ReadOptions& read_options, uint64_t blob_file_number, + CacheHandleGuard* blob_file_reader) { + assert(blob_file_reader); + assert(blob_file_reader->IsEmpty()); + + const Slice key = GetSliceForKey(&blob_file_number); + + assert(cache_); + + TypedHandle* handle = cache_.Lookup(key); + if (handle) { + *blob_file_reader = cache_.Guard(handle); + return Status::OK(); + } + + TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck"); + + // Check again while holding mutex + MutexLock lock(mutex_.get(key)); + + handle = cache_.Lookup(key); + if (handle) { + *blob_file_reader = cache_.Guard(handle); + return Status::OK(); + } + + assert(immutable_options_); + Statistics* const statistics = immutable_options_->stats; + + RecordTick(statistics, NO_FILE_OPENS); + + std::unique_ptr reader; + + { + assert(file_options_); + const Status s = BlobFileReader::Create( + *immutable_options_, read_options, *file_options_, column_family_id_, + blob_file_read_hist_, blob_file_number, io_tracer_, &reader); + if (!s.ok()) { + RecordTick(statistics, NO_FILE_ERRORS); + return s; + } + } + + { + constexpr size_t charge = 1; + + const Status s = cache_.Insert(key, reader.get(), charge, &handle); + if (!s.ok()) { + RecordTick(statistics, NO_FILE_ERRORS); + return s; + } + } + + reader.release(); + + *blob_file_reader = cache_.Guard(handle); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_cache.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_cache.h new file mode 100644 index 0000000000..a80be7c55e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_cache.h @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
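+
+// Illustration (for exposition only; not part of the upstream file):
+// GetBlobFileReader in the .cc above uses a double-checked pattern so that at
+// most one thread pays the cost of opening any given blob file:
+//
+//   if ((handle = cache_.Lookup(key))) { ... }  // lock-free fast path
+//   MutexLock lock(mutex_.get(key));            // mutex stripe keyed by file
+//   if ((handle = cache_.Lookup(key))) { ... }  // raced: reuse the winner's
+//   ... open the reader and cache_.Insert(key, reader.get(), charge, ...) ...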
+ +#pragma once + +#include + +#include "cache/typed_cache.h" +#include "db/blob/blob_file_reader.h" +#include "rocksdb/rocksdb_namespace.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; +struct ImmutableOptions; +struct FileOptions; +class HistogramImpl; +class Status; +class Slice; +class IOTracer; + +class BlobFileCache { + public: + BlobFileCache(Cache* cache, const ImmutableOptions* immutable_options, + const FileOptions* file_options, uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + const std::shared_ptr& io_tracer); + + BlobFileCache(const BlobFileCache&) = delete; + BlobFileCache& operator=(const BlobFileCache&) = delete; + + Status GetBlobFileReader(const ReadOptions& read_options, + uint64_t blob_file_number, + CacheHandleGuard* blob_file_reader); + + private: + using CacheInterface = + BasicTypedCacheInterface; + using TypedHandle = CacheInterface::TypedHandle; + CacheInterface cache_; + // Note: mutex_ below is used to guard against multiple threads racing to open + // the same file. + Striped mutex_; + const ImmutableOptions* immutable_options_; + const FileOptions* file_options_; + uint32_t column_family_id_; + HistogramImpl* blob_file_read_hist_; + std::shared_ptr io_tracer_; + + static constexpr size_t kNumberOfMutexStripes = 1 << 7; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_completion_callback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_completion_callback.h new file mode 100644 index 0000000000..9159677315 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_completion_callback.h @@ -0,0 +1,84 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "file/sst_file_manager_impl.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileCompletionCallback { + public: + BlobFileCompletionCallback( + SstFileManager* sst_file_manager, InstrumentedMutex* mutex, + ErrorHandler* error_handler, EventLogger* event_logger, + const std::vector>& listeners, + const std::string& dbname) + : event_logger_(event_logger), listeners_(listeners), dbname_(dbname) { + sst_file_manager_ = sst_file_manager; + mutex_ = mutex; + error_handler_ = error_handler; + } + + void OnBlobFileCreationStarted(const std::string& file_name, + const std::string& column_family_name, + int job_id, + BlobFileCreationReason creation_reason) { + // Notify the listeners. 
+ EventHelpers::NotifyBlobFileCreationStarted(listeners_, dbname_, + column_family_name, file_name, + job_id, creation_reason); + } + + Status OnBlobFileCompleted(const std::string& file_name, + const std::string& column_family_name, int job_id, + uint64_t file_number, + BlobFileCreationReason creation_reason, + const Status& report_status, + const std::string& checksum_value, + const std::string& checksum_method, + uint64_t blob_count, uint64_t blob_bytes) { + Status s; + + auto sfm = static_cast(sst_file_manager_); + if (sfm) { + // Report new blob files to SstFileManagerImpl + s = sfm->OnAddFile(file_name); + if (sfm->IsMaxAllowedSpaceReached()) { + s = Status::SpaceLimit("Max allowed space was reached"); + TEST_SYNC_POINT( + "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached"); + InstrumentedMutexLock l(mutex_); + error_handler_->SetBGError(s, BackgroundErrorReason::kFlush); + } + } + + // Notify the listeners. + EventHelpers::LogAndNotifyBlobFileCreationFinished( + event_logger_, listeners_, dbname_, column_family_name, file_name, + job_id, file_number, creation_reason, + (!report_status.ok() ? report_status : s), + (checksum_value.empty() ? kUnknownFileChecksum : checksum_value), + (checksum_method.empty() ? kUnknownFileChecksumFuncName + : checksum_method), + blob_count, blob_bytes); + return s; + } + + private: + SstFileManager* sst_file_manager_; + InstrumentedMutex* mutex_; + ErrorHandler* error_handler_; + EventLogger* event_logger_; + std::vector> listeners_; + std::string dbname_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_garbage.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_garbage.cc new file mode 100644 index 0000000000..52c336f499 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_garbage.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_garbage.h" + +#include +#include + +#include "logging/event_logger.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +// Tags for custom fields. Note that these get persisted in the manifest, +// so existing tags should not be modified. +enum BlobFileGarbage::CustomFieldTags : uint32_t { + kEndMarker, + + // Add forward compatible fields here + + ///////////////////////////////////////////////////////////////////// + + kForwardIncompatibleMask = 1 << 6, + + // Add forward incompatible fields here +}; + +void BlobFileGarbage::EncodeTo(std::string* output) const { + PutVarint64(output, blob_file_number_); + PutVarint64(output, garbage_blob_count_); + PutVarint64(output, garbage_blob_bytes_); + + // Encode any custom fields here. The format to use is a Varint32 tag (see + // CustomFieldTags above) followed by a length prefixed slice. Unknown custom + // fields will be ignored during decoding unless they're in the forward + // incompatible range. 
+ + TEST_SYNC_POINT_CALLBACK("BlobFileGarbage::EncodeTo::CustomFields", output); + + PutVarint32(output, kEndMarker); +} + +Status BlobFileGarbage::DecodeFrom(Slice* input) { + constexpr char class_name[] = "BlobFileGarbage"; + + if (!GetVarint64(input, &blob_file_number_)) { + return Status::Corruption(class_name, "Error decoding blob file number"); + } + + if (!GetVarint64(input, &garbage_blob_count_)) { + return Status::Corruption(class_name, "Error decoding garbage blob count"); + } + + if (!GetVarint64(input, &garbage_blob_bytes_)) { + return Status::Corruption(class_name, "Error decoding garbage blob bytes"); + } + + while (true) { + uint32_t custom_field_tag = 0; + if (!GetVarint32(input, &custom_field_tag)) { + return Status::Corruption(class_name, "Error decoding custom field tag"); + } + + if (custom_field_tag == kEndMarker) { + break; + } + + if (custom_field_tag & kForwardIncompatibleMask) { + return Status::Corruption( + class_name, "Forward incompatible custom field encountered"); + } + + Slice custom_field_value; + if (!GetLengthPrefixedSlice(input, &custom_field_value)) { + return Status::Corruption(class_name, + "Error decoding custom field value"); + } + } + + return Status::OK(); +} + +std::string BlobFileGarbage::DebugString() const { + std::ostringstream oss; + + oss << *this; + + return oss.str(); +} + +std::string BlobFileGarbage::DebugJSON() const { + JSONWriter jw; + + jw << *this; + + jw.EndObject(); + + return jw.Get(); +} + +bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) { + return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() && + lhs.GetGarbageBlobCount() == rhs.GetGarbageBlobCount() && + lhs.GetGarbageBlobBytes() == rhs.GetGarbageBlobBytes(); +} + +bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) { + return !(lhs == rhs); +} + +std::ostream& operator<<(std::ostream& os, + const BlobFileGarbage& blob_file_garbage) { + os << "blob_file_number: " << blob_file_garbage.GetBlobFileNumber() + << " garbage_blob_count: " << blob_file_garbage.GetGarbageBlobCount() + << " garbage_blob_bytes: " << blob_file_garbage.GetGarbageBlobBytes(); + + return os; +} + +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileGarbage& blob_file_garbage) { + jw << "BlobFileNumber" << blob_file_garbage.GetBlobFileNumber() + << "GarbageBlobCount" << blob_file_garbage.GetGarbageBlobCount() + << "GarbageBlobBytes" << blob_file_garbage.GetGarbageBlobBytes(); + + return jw; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_garbage.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_garbage.h new file mode 100644 index 0000000000..6dc14ddcae --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_garbage.h @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include +#include + +#include "db/blob/blob_constants.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class JSONWriter; +class Slice; +class Status; + +class BlobFileGarbage { + public: + BlobFileGarbage() = default; + + BlobFileGarbage(uint64_t blob_file_number, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) + : blob_file_number_(blob_file_number), + garbage_blob_count_(garbage_blob_count), + garbage_blob_bytes_(garbage_blob_bytes) {} + + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } + uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } + + void EncodeTo(std::string* output) const; + Status DecodeFrom(Slice* input); + + std::string DebugString() const; + std::string DebugJSON() const; + + private: + enum CustomFieldTags : uint32_t; + + uint64_t blob_file_number_ = kInvalidBlobFileNumber; + uint64_t garbage_blob_count_ = 0; + uint64_t garbage_blob_bytes_ = 0; +}; + +bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs); +bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs); + +std::ostream& operator<<(std::ostream& os, + const BlobFileGarbage& blob_file_garbage); +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileGarbage& blob_file_garbage); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_meta.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_meta.cc new file mode 100644 index 0000000000..4913137e59 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_meta.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_meta.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { +uint64_t SharedBlobFileMetaData::GetBlobFileSize() const { + return BlobLogHeader::kSize + total_blob_bytes_ + BlobLogFooter::kSize; +} + +std::string SharedBlobFileMetaData::DebugString() const { + std::ostringstream oss; + oss << (*this); + + return oss.str(); +} + +std::ostream& operator<<(std::ostream& os, + const SharedBlobFileMetaData& shared_meta) { + os << "blob_file_number: " << shared_meta.GetBlobFileNumber() + << " total_blob_count: " << shared_meta.GetTotalBlobCount() + << " total_blob_bytes: " << shared_meta.GetTotalBlobBytes() + << " checksum_method: " << shared_meta.GetChecksumMethod() + << " checksum_value: " + << Slice(shared_meta.GetChecksumValue()).ToString(/* hex */ true); + + return os; +} + +std::string BlobFileMetaData::DebugString() const { + std::ostringstream oss; + oss << (*this); + + return oss.str(); +} + +std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta) { + const auto& shared_meta = meta.GetSharedMeta(); + assert(shared_meta); + os << (*shared_meta); + + os << " linked_ssts: {"; + for (uint64_t file_number : meta.GetLinkedSsts()) { + os << ' ' << file_number; + } + os << " }"; + + os << " garbage_blob_count: " << meta.GetGarbageBlobCount() + << " garbage_blob_bytes: " << meta.GetGarbageBlobBytes(); + + return os; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_meta.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_meta.h new file mode 100644 index 0000000000..d7c8a12433 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_meta.h @@ -0,0 +1,170 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// SharedBlobFileMetaData represents the immutable part of blob files' metadata, +// like the blob file number, total number and size of blobs, or checksum +// method and value. There is supposed to be one object of this class per blob +// file (shared across all versions that include the blob file in question); +// hence, the type is neither copyable nor movable. A blob file can be marked +// obsolete when the corresponding SharedBlobFileMetaData object is destroyed. 
+ +class SharedBlobFileMetaData { + public: + static std::shared_ptr Create( + uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) { + return std::shared_ptr(new SharedBlobFileMetaData( + blob_file_number, total_blob_count, total_blob_bytes, + std::move(checksum_method), std::move(checksum_value))); + } + + template + static std::shared_ptr Create( + uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value, Deleter deleter) { + return std::shared_ptr( + new SharedBlobFileMetaData(blob_file_number, total_blob_count, + total_blob_bytes, std::move(checksum_method), + std::move(checksum_value)), + deleter); + } + + SharedBlobFileMetaData(const SharedBlobFileMetaData&) = delete; + SharedBlobFileMetaData& operator=(const SharedBlobFileMetaData&) = delete; + + SharedBlobFileMetaData(SharedBlobFileMetaData&&) = delete; + SharedBlobFileMetaData& operator=(SharedBlobFileMetaData&&) = delete; + + uint64_t GetBlobFileSize() const; + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + uint64_t GetTotalBlobCount() const { return total_blob_count_; } + uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; } + const std::string& GetChecksumMethod() const { return checksum_method_; } + const std::string& GetChecksumValue() const { return checksum_value_; } + + std::string DebugString() const; + + private: + SharedBlobFileMetaData(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) + : blob_file_number_(blob_file_number), + total_blob_count_(total_blob_count), + total_blob_bytes_(total_blob_bytes), + checksum_method_(std::move(checksum_method)), + checksum_value_(std::move(checksum_value)) { + assert(checksum_method_.empty() == checksum_value_.empty()); + } + + uint64_t blob_file_number_; + uint64_t total_blob_count_; + uint64_t total_blob_bytes_; + std::string checksum_method_; + std::string checksum_value_; +}; + +std::ostream& operator<<(std::ostream& os, + const SharedBlobFileMetaData& shared_meta); + +// BlobFileMetaData contains the part of the metadata for blob files that can +// vary across versions, like the amount of garbage in the blob file. In +// addition, BlobFileMetaData objects point to and share the ownership of the +// SharedBlobFileMetaData object for the corresponding blob file. Similarly to +// SharedBlobFileMetaData, BlobFileMetaData are not copyable or movable. They +// are meant to be jointly owned by the versions in which the blob file has the +// same (immutable *and* mutable) state. 
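+//
+// A version that tracks a blob file might build its per-version metadata
+// like this (illustrative sketch; the inputs are placeholders):
+//
+//   auto meta = BlobFileMetaData::Create(
+//       shared_meta, /* linked_ssts */ {10, 11, 12},
+//       /* garbage_blob_count */ 0, /* garbage_blob_bytes */ 0);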
+ +class BlobFileMetaData { + public: + using LinkedSsts = std::unordered_set; + + static std::shared_ptr Create( + std::shared_ptr shared_meta, + LinkedSsts linked_ssts, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) { + return std::shared_ptr( + new BlobFileMetaData(std::move(shared_meta), std::move(linked_ssts), + garbage_blob_count, garbage_blob_bytes)); + } + + BlobFileMetaData(const BlobFileMetaData&) = delete; + BlobFileMetaData& operator=(const BlobFileMetaData&) = delete; + + BlobFileMetaData(BlobFileMetaData&&) = delete; + BlobFileMetaData& operator=(BlobFileMetaData&&) = delete; + + const std::shared_ptr& GetSharedMeta() const { + return shared_meta_; + } + + uint64_t GetBlobFileSize() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileSize(); + } + + uint64_t GetBlobFileNumber() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileNumber(); + } + uint64_t GetTotalBlobCount() const { + assert(shared_meta_); + return shared_meta_->GetTotalBlobCount(); + } + uint64_t GetTotalBlobBytes() const { + assert(shared_meta_); + return shared_meta_->GetTotalBlobBytes(); + } + const std::string& GetChecksumMethod() const { + assert(shared_meta_); + return shared_meta_->GetChecksumMethod(); + } + const std::string& GetChecksumValue() const { + assert(shared_meta_); + return shared_meta_->GetChecksumValue(); + } + + const LinkedSsts& GetLinkedSsts() const { return linked_ssts_; } + + uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } + uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } + + std::string DebugString() const; + + private: + BlobFileMetaData(std::shared_ptr shared_meta, + LinkedSsts linked_ssts, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) + : shared_meta_(std::move(shared_meta)), + linked_ssts_(std::move(linked_ssts)), + garbage_blob_count_(garbage_blob_count), + garbage_blob_bytes_(garbage_blob_bytes) { + assert(shared_meta_); + assert(garbage_blob_count_ <= shared_meta_->GetTotalBlobCount()); + assert(garbage_blob_bytes_ <= shared_meta_->GetTotalBlobBytes()); + } + + std::shared_ptr shared_meta_; + LinkedSsts linked_ssts_; + uint64_t garbage_blob_count_; + uint64_t garbage_blob_bytes_; +}; + +std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_reader.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_reader.cc new file mode 100644 index 0000000000..7abd1ce6b5 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_reader.cc @@ -0,0 +1,624 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_reader.h" + +#include +#include + +#include "db/blob/blob_contents.h" +#include "db/blob/blob_log_format.h" +#include "file/file_prefetch_buffer.h" +#include "file/filename.h" +#include "monitoring/statistics_impl.h" +#include "options/cf_options.h" +#include "rocksdb/file_system.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "table/multiget_context.h" +#include "test_util/sync_point.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobFileReader::Create( + const ImmutableOptions& immutable_options, const ReadOptions& read_options, + const FileOptions& file_options, uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const std::shared_ptr& io_tracer, + std::unique_ptr* blob_file_reader) { + assert(blob_file_reader); + assert(!*blob_file_reader); + + uint64_t file_size = 0; + std::unique_ptr file_reader; + + { + const Status s = + OpenFile(immutable_options, file_options, blob_file_read_hist, + blob_file_number, io_tracer, &file_size, &file_reader); + if (!s.ok()) { + return s; + } + } + + assert(file_reader); + + Statistics* const statistics = immutable_options.stats; + + CompressionType compression_type = kNoCompression; + + { + const Status s = + ReadHeader(file_reader.get(), read_options, column_family_id, + statistics, &compression_type); + if (!s.ok()) { + return s; + } + } + + { + const Status s = + ReadFooter(file_reader.get(), read_options, file_size, statistics); + if (!s.ok()) { + return s; + } + } + + blob_file_reader->reset( + new BlobFileReader(std::move(file_reader), file_size, compression_type, + immutable_options.clock, statistics)); + + return Status::OK(); +} + +Status BlobFileReader::OpenFile( + const ImmutableOptions& immutable_options, const FileOptions& file_opts, + HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const std::shared_ptr& io_tracer, uint64_t* file_size, + std::unique_ptr* file_reader) { + assert(file_size); + assert(file_reader); + + const auto& cf_paths = immutable_options.cf_paths; + assert(!cf_paths.empty()); + + const std::string blob_file_path = + BlobFileName(cf_paths.front().path, blob_file_number); + + FileSystem* const fs = immutable_options.fs.get(); + assert(fs); + + constexpr IODebugContext* dbg = nullptr; + + { + TEST_SYNC_POINT("BlobFileReader::OpenFile:GetFileSize"); + + const Status s = + fs->GetFileSize(blob_file_path, IOOptions(), file_size, dbg); + if (!s.ok()) { + return s; + } + } + + if (*file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) { + return Status::Corruption("Malformed blob file"); + } + + std::unique_ptr file; + + { + TEST_SYNC_POINT("BlobFileReader::OpenFile:NewRandomAccessFile"); + + const Status s = + fs->NewRandomAccessFile(blob_file_path, file_opts, &file, dbg); + if (!s.ok()) { + return s; + } + } + + assert(file); + + if (immutable_options.advise_random_on_open) { + file->Hint(FSRandomAccessFile::kRandom); + } + + file_reader->reset(new RandomAccessFileReader( + std::move(file), blob_file_path, immutable_options.clock, io_tracer, + immutable_options.stats, BLOB_DB_BLOB_FILE_READ_MICROS, + blob_file_read_hist, immutable_options.rate_limiter.get(), + immutable_options.listeners)); + + return Status::OK(); +} + +Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, + uint32_t column_family_id, + Statistics* statistics, + CompressionType* compression_type) { + 
assert(file_reader); + assert(compression_type); + + Slice header_slice; + Buffer buf; + AlignedBuf aligned_buf; + + { + TEST_SYNC_POINT("BlobFileReader::ReadHeader:ReadFromFile"); + + constexpr uint64_t read_offset = 0; + constexpr size_t read_size = BlobLogHeader::kSize; + + // TODO: rate limit reading headers from blob files. + const Status s = + ReadFromFile(file_reader, read_options, read_offset, read_size, + statistics, &header_slice, &buf, &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + return s; + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadHeader:TamperWithResult", + &header_slice); + } + + BlobLogHeader header; + + { + const Status s = header.DecodeFrom(header_slice); + if (!s.ok()) { + return s; + } + } + + constexpr ExpirationRange no_expiration_range; + + if (header.has_ttl || header.expiration_range != no_expiration_range) { + return Status::Corruption("Unexpected TTL blob file"); + } + + if (header.column_family_id != column_family_id) { + return Status::Corruption("Column family ID mismatch"); + } + + *compression_type = header.compression; + + return Status::OK(); +} + +Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, + uint64_t file_size, Statistics* statistics) { + assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize); + assert(file_reader); + + Slice footer_slice; + Buffer buf; + AlignedBuf aligned_buf; + + { + TEST_SYNC_POINT("BlobFileReader::ReadFooter:ReadFromFile"); + + const uint64_t read_offset = file_size - BlobLogFooter::kSize; + constexpr size_t read_size = BlobLogFooter::kSize; + + // TODO: rate limit reading footers from blob files. + const Status s = + ReadFromFile(file_reader, read_options, read_offset, read_size, + statistics, &footer_slice, &buf, &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + return s; + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadFooter:TamperWithResult", + &footer_slice); + } + + BlobLogFooter footer; + + { + const Status s = footer.DecodeFrom(footer_slice); + if (!s.ok()) { + return s; + } + } + + constexpr ExpirationRange no_expiration_range; + + if (footer.expiration_range != no_expiration_range) { + return Status::Corruption("Unexpected TTL blob file"); + } + + return Status::OK(); +} + +Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, + uint64_t read_offset, size_t read_size, + Statistics* statistics, Slice* slice, + Buffer* buf, AlignedBuf* aligned_buf, + Env::IOPriority rate_limiter_priority) { + assert(slice); + assert(buf); + assert(aligned_buf); + + assert(file_reader); + + RecordTick(statistics, BLOB_DB_BLOB_FILE_BYTES_READ, read_size); + + Status s; + + IOOptions io_options; + s = file_reader->PrepareIOOptions(read_options, io_options); + if (!s.ok()) { + return s; + } + + if (file_reader->use_direct_io()) { + constexpr char* scratch = nullptr; + + s = file_reader->Read(io_options, read_offset, read_size, slice, scratch, + aligned_buf, rate_limiter_priority); + } else { + buf->reset(new char[read_size]); + constexpr AlignedBuf* aligned_scratch = nullptr; + + s = file_reader->Read(io_options, read_offset, read_size, slice, buf->get(), + aligned_scratch, rate_limiter_priority); + } + + if (!s.ok()) { + return s; + } + + if (slice->size() != read_size) { + return Status::Corruption("Failed to read data from blob file"); + } + + return Status::OK(); +} + +BlobFileReader::BlobFileReader( + std::unique_ptr&& 
file_reader, uint64_t file_size, + CompressionType compression_type, SystemClock* clock, + Statistics* statistics) + : file_reader_(std::move(file_reader)), + file_size_(file_size), + compression_type_(compression_type), + clock_(clock), + statistics_(statistics) { + assert(file_reader_); +} + +BlobFileReader::~BlobFileReader() = default; + +Status BlobFileReader::GetBlob( + const ReadOptions& read_options, const Slice& user_key, uint64_t offset, + uint64_t value_size, CompressionType compression_type, + FilePrefetchBuffer* prefetch_buffer, MemoryAllocator* allocator, + std::unique_ptr* result, uint64_t* bytes_read) const { + assert(result); + + const uint64_t key_size = user_key.size(); + + if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) { + return Status::Corruption("Invalid blob offset"); + } + + if (compression_type != compression_type_) { + return Status::Corruption("Compression type mismatch when reading blob"); + } + + // Note: if verify_checksum is set, we read the entire blob record to be able + // to perform the verification; otherwise, we just read the blob itself. Since + // the offset in BlobIndex actually points to the blob value, we need to make + // an adjustment in the former case. + const uint64_t adjustment = + read_options.verify_checksums + ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + : 0; + assert(offset >= adjustment); + + const uint64_t record_offset = offset - adjustment; + const uint64_t record_size = value_size + adjustment; + + Slice record_slice; + Buffer buf; + AlignedBuf aligned_buf; + + bool prefetched = false; + + if (prefetch_buffer) { + Status s; + constexpr bool for_compaction = true; + + IOOptions io_options; + s = file_reader_->PrepareIOOptions(read_options, io_options); + if (!s.ok()) { + return s; + } + prefetched = prefetch_buffer->TryReadFromCache( + io_options, file_reader_.get(), record_offset, + static_cast(record_size), &record_slice, &s, + read_options.rate_limiter_priority, for_compaction); + if (!s.ok()) { + return s; + } + } + + if (!prefetched) { + TEST_SYNC_POINT("BlobFileReader::GetBlob:ReadFromFile"); + PERF_COUNTER_ADD(blob_read_count, 1); + PERF_COUNTER_ADD(blob_read_byte, record_size); + PERF_TIMER_GUARD(blob_read_time); + const Status s = ReadFromFile( + file_reader_.get(), read_options, record_offset, + static_cast(record_size), statistics_, &record_slice, &buf, + &aligned_buf, read_options.rate_limiter_priority); + if (!s.ok()) { + return s; + } + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::GetBlob:TamperWithResult", + &record_slice); + + if (read_options.verify_checksums) { + const Status s = VerifyBlob(record_slice, user_key, value_size); + if (!s.ok()) { + return s; + } + } + + const Slice value_slice(record_slice.data() + adjustment, value_size); + + { + const Status s = UncompressBlobIfNeeded( + value_slice, compression_type, allocator, clock_, statistics_, result); + if (!s.ok()) { + return s; + } + } + + if (bytes_read) { + *bytes_read = record_size; + } + + return Status::OK(); +} + +void BlobFileReader::MultiGetBlob( + const ReadOptions& read_options, MemoryAllocator* allocator, + autovector>>& + blob_reqs, + uint64_t* bytes_read) const { + const size_t num_blobs = blob_reqs.size(); + assert(num_blobs > 0); + assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE); + +#ifndef NDEBUG + for (size_t i = 0; i < num_blobs - 1; ++i) { + assert(blob_reqs[i].first->offset <= blob_reqs[i + 1].first->offset); + } +#endif // !NDEBUG + + std::vector read_reqs; + autovector adjustments; + uint64_t 
total_len = 0; + read_reqs.reserve(num_blobs); + for (size_t i = 0; i < num_blobs; ++i) { + BlobReadRequest* const req = blob_reqs[i].first; + assert(req); + assert(req->user_key); + assert(req->status); + + const size_t key_size = req->user_key->size(); + const uint64_t offset = req->offset; + const uint64_t value_size = req->len; + + if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) { + *req->status = Status::Corruption("Invalid blob offset"); + continue; + } + if (req->compression != compression_type_) { + *req->status = + Status::Corruption("Compression type mismatch when reading a blob"); + continue; + } + + const uint64_t adjustment = + read_options.verify_checksums + ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + : 0; + assert(req->offset >= adjustment); + adjustments.push_back(adjustment); + + FSReadRequest read_req = {}; + read_req.offset = req->offset - adjustment; + read_req.len = req->len + adjustment; + read_reqs.emplace_back(read_req); + total_len += read_req.len; + } + + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, total_len); + + Buffer buf; + AlignedBuf aligned_buf; + + Status s; + bool direct_io = file_reader_->use_direct_io(); + if (direct_io) { + for (size_t i = 0; i < read_reqs.size(); ++i) { + read_reqs[i].scratch = nullptr; + } + } else { + buf.reset(new char[total_len]); + std::ptrdiff_t pos = 0; + for (size_t i = 0; i < read_reqs.size(); ++i) { + read_reqs[i].scratch = buf.get() + pos; + pos += read_reqs[i].len; + } + } + TEST_SYNC_POINT("BlobFileReader::MultiGetBlob:ReadFromFile"); + PERF_COUNTER_ADD(blob_read_count, num_blobs); + PERF_COUNTER_ADD(blob_read_byte, total_len); + s = file_reader_->MultiRead(IOOptions(), read_reqs.data(), read_reqs.size(), + direct_io ? &aligned_buf : nullptr, + read_options.rate_limiter_priority); + if (!s.ok()) { + for (auto& req : read_reqs) { + req.status.PermitUncheckedError(); + } + for (auto& blob_req : blob_reqs) { + BlobReadRequest* const req = blob_req.first; + assert(req); + assert(req->status); + + if (!req->status->IsCorruption()) { + // Avoid overwriting corruption status. 
+ *req->status = s; + } + } + return; + } + + assert(s.ok()); + + uint64_t total_bytes = 0; + for (size_t i = 0, j = 0; i < num_blobs; ++i) { + BlobReadRequest* const req = blob_reqs[i].first; + assert(req); + assert(req->user_key); + assert(req->status); + + if (!req->status->ok()) { + continue; + } + + assert(j < read_reqs.size()); + auto& read_req = read_reqs[j++]; + const auto& record_slice = read_req.result; + if (read_req.status.ok() && record_slice.size() != read_req.len) { + read_req.status = + IOStatus::Corruption("Failed to read data from blob file"); + } + + *req->status = read_req.status; + if (!req->status->ok()) { + continue; + } + + // Verify checksums if enabled + if (read_options.verify_checksums) { + *req->status = VerifyBlob(record_slice, *req->user_key, req->len); + if (!req->status->ok()) { + continue; + } + } + + // Uncompress blob if needed + Slice value_slice(record_slice.data() + adjustments[i], req->len); + *req->status = + UncompressBlobIfNeeded(value_slice, compression_type_, allocator, + clock_, statistics_, &blob_reqs[i].second); + if (req->status->ok()) { + total_bytes += record_slice.size(); + } + } + + if (bytes_read) { + *bytes_read = total_bytes; + } +} + +Status BlobFileReader::VerifyBlob(const Slice& record_slice, + const Slice& user_key, uint64_t value_size) { + PERF_TIMER_GUARD(blob_checksum_time); + + BlobLogRecord record; + + const Slice header_slice(record_slice.data(), BlobLogRecord::kHeaderSize); + + { + const Status s = record.DecodeHeaderFrom(header_slice); + if (!s.ok()) { + return s; + } + } + + if (record.key_size != user_key.size()) { + return Status::Corruption("Key size mismatch when reading blob"); + } + + if (record.value_size != value_size) { + return Status::Corruption("Value size mismatch when reading blob"); + } + + record.key = + Slice(record_slice.data() + BlobLogRecord::kHeaderSize, record.key_size); + if (record.key != user_key) { + return Status::Corruption("Key mismatch when reading blob"); + } + + record.value = Slice(record.key.data() + record.key_size, value_size); + + { + TEST_SYNC_POINT_CALLBACK("BlobFileReader::VerifyBlob:CheckBlobCRC", + &record); + + const Status s = record.CheckBlobCRC(); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); +} + +Status BlobFileReader::UncompressBlobIfNeeded( + const Slice& value_slice, CompressionType compression_type, + MemoryAllocator* allocator, SystemClock* clock, Statistics* statistics, + std::unique_ptr* result) { + assert(result); + + if (compression_type == kNoCompression) { + BlobContentsCreator::Create(result, nullptr, value_slice, allocator); + return Status::OK(); + } + + UncompressionContext context(compression_type); + UncompressionInfo info(context, UncompressionDict::GetEmptyDict(), + compression_type); + + size_t uncompressed_size = 0; + constexpr uint32_t compression_format_version = 2; + + CacheAllocationPtr output; + + { + PERF_TIMER_GUARD(blob_decompress_time); + StopWatch stop_watch(clock, statistics, BLOB_DB_DECOMPRESSION_MICROS); + output = UncompressData(info, value_slice.data(), value_slice.size(), + &uncompressed_size, compression_format_version, + allocator); + } + + TEST_SYNC_POINT_CALLBACK( + "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &output); + + if (!output) { + return Status::Corruption("Unable to uncompress blob"); + } + + result->reset(new BlobContents(std::move(output), uncompressed_size)); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git 
a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_reader.h
new file mode 100644
index 0000000000..990e325408
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_file_reader.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+#include <memory>
+
+#include "db/blob/blob_read_request.h"
+#include "file/random_access_file_reader.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Status;
+struct ImmutableOptions;
+struct FileOptions;
+class HistogramImpl;
+struct ReadOptions;
+class Slice;
+class FilePrefetchBuffer;
+class BlobContents;
+class Statistics;
+
+class BlobFileReader {
+ public:
+  static Status Create(const ImmutableOptions& immutable_options,
+                       const ReadOptions& read_options,
+                       const FileOptions& file_options,
+                       uint32_t column_family_id,
+                       HistogramImpl* blob_file_read_hist,
+                       uint64_t blob_file_number,
+                       const std::shared_ptr<IOTracer>& io_tracer,
+                       std::unique_ptr<BlobFileReader>* reader);
+
+  BlobFileReader(const BlobFileReader&) = delete;
+  BlobFileReader& operator=(const BlobFileReader&) = delete;
+
+  ~BlobFileReader();
+
+  Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+                 uint64_t offset, uint64_t value_size,
+                 CompressionType compression_type,
+                 FilePrefetchBuffer* prefetch_buffer,
+                 MemoryAllocator* allocator,
+                 std::unique_ptr<BlobContents>* result,
+                 uint64_t* bytes_read) const;
+
+  // offsets must be sorted in ascending order by the caller
+ void MultiGetBlob( + const ReadOptions& read_options, MemoryAllocator* allocator, + autovector>>& + blob_reqs, + uint64_t* bytes_read) const; + + CompressionType GetCompressionType() const { return compression_type_; } + + uint64_t GetFileSize() const { return file_size_; } + + private: + BlobFileReader(std::unique_ptr&& file_reader, + uint64_t file_size, CompressionType compression_type, + SystemClock* clock, Statistics* statistics); + + static Status OpenFile(const ImmutableOptions& immutable_options, + const FileOptions& file_opts, + HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, + const std::shared_ptr& io_tracer, + uint64_t* file_size, + std::unique_ptr* file_reader); + + static Status ReadHeader(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, + uint32_t column_family_id, Statistics* statistics, + CompressionType* compression_type); + + static Status ReadFooter(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint64_t file_size, + Statistics* statistics); + + using Buffer = std::unique_ptr; + + static Status ReadFromFile(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, + uint64_t read_offset, size_t read_size, + Statistics* statistics, Slice* slice, Buffer* buf, + AlignedBuf* aligned_buf, + Env::IOPriority rate_limiter_priority); + + static Status VerifyBlob(const Slice& record_slice, const Slice& user_key, + uint64_t value_size); + + static Status UncompressBlobIfNeeded(const Slice& value_slice, + CompressionType compression_type, + MemoryAllocator* allocator, + SystemClock* clock, + Statistics* statistics, + std::unique_ptr* result); + + std::unique_ptr file_reader_; + uint64_t file_size_; + CompressionType compression_type_; + SystemClock* clock_; + Statistics* statistics_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_garbage_meter.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_garbage_meter.cc new file mode 100644 index 0000000000..d328d7ff4c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_garbage_meter.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_garbage_meter.h" + +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/dbformat.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobGarbageMeter::ProcessInFlow(const Slice& key, const Slice& value) { + uint64_t blob_file_number = kInvalidBlobFileNumber; + uint64_t bytes = 0; + + const Status s = Parse(key, value, &blob_file_number, &bytes); + if (!s.ok()) { + return s; + } + + if (blob_file_number == kInvalidBlobFileNumber) { + return Status::OK(); + } + + flows_[blob_file_number].AddInFlow(bytes); + + return Status::OK(); +} + +Status BlobGarbageMeter::ProcessOutFlow(const Slice& key, const Slice& value) { + uint64_t blob_file_number = kInvalidBlobFileNumber; + uint64_t bytes = 0; + + const Status s = Parse(key, value, &blob_file_number, &bytes); + if (!s.ok()) { + return s; + } + + if (blob_file_number == kInvalidBlobFileNumber) { + return Status::OK(); + } + + // Note: in order to measure the amount of additional garbage, we only need to + // track the outflow for preexisting files, i.e. those that also had inflow. 
+ // (Newly written files would only have outflow.) + auto it = flows_.find(blob_file_number); + if (it == flows_.end()) { + return Status::OK(); + } + + it->second.AddOutFlow(bytes); + + return Status::OK(); +} + +Status BlobGarbageMeter::Parse(const Slice& key, const Slice& value, + uint64_t* blob_file_number, uint64_t* bytes) { + assert(blob_file_number); + assert(*blob_file_number == kInvalidBlobFileNumber); + assert(bytes); + assert(*bytes == 0); + + ParsedInternalKey ikey; + + { + constexpr bool log_err_key = false; + const Status s = ParseInternalKey(key, &ikey, log_err_key); + if (!s.ok()) { + return s; + } + } + + if (ikey.type != kTypeBlobIndex) { + return Status::OK(); + } + + BlobIndex blob_index; + + { + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + return s; + } + } + + if (blob_index.IsInlined() || blob_index.HasTTL()) { + return Status::Corruption("Unexpected TTL/inlined blob index"); + } + + *blob_file_number = blob_index.file_number(); + *bytes = + blob_index.size() + + BlobLogRecord::CalculateAdjustmentForRecordHeader(ikey.user_key.size()); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_garbage_meter.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_garbage_meter.h new file mode 100644 index 0000000000..a6c04b0b24 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_garbage_meter.h @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "db/blob/blob_constants.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +// A class that can be used to compute the amount of additional garbage +// generated by a compaction. It parses the keys and blob references in the +// input and output of a compaction, and aggregates the "inflow" and "outflow" +// on a per-blob file basis. The amount of additional garbage for any given blob +// file can then be computed by subtracting the outflow from the inflow. +class BlobGarbageMeter { + public: + // A class to store the number and total size of blobs on a per-blob file + // basis. + class BlobStats { + public: + void Add(uint64_t bytes) { + ++count_; + bytes_ += bytes; + } + void Add(uint64_t count, uint64_t bytes) { + count_ += count; + bytes_ += bytes; + } + + uint64_t GetCount() const { return count_; } + uint64_t GetBytes() const { return bytes_; } + + private: + uint64_t count_ = 0; + uint64_t bytes_ = 0; + }; + + // A class to keep track of the "inflow" and the "outflow" and to compute the + // amount of additional garbage for a given blob file. 
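+  //
+  // For example (numbers illustrative): if a compaction's input contains 10
+  // references into a blob file totaling 1000 bytes, and its output keeps 7
+  // of them totaling 700 bytes, the file accrues 3 additional garbage blobs
+  // and 300 additional garbage bytes.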
+ class BlobInOutFlow { + public: + void AddInFlow(uint64_t bytes) { + in_flow_.Add(bytes); + assert(IsValid()); + } + void AddOutFlow(uint64_t bytes) { + out_flow_.Add(bytes); + assert(IsValid()); + } + + const BlobStats& GetInFlow() const { return in_flow_; } + const BlobStats& GetOutFlow() const { return out_flow_; } + + bool IsValid() const { + return in_flow_.GetCount() >= out_flow_.GetCount() && + in_flow_.GetBytes() >= out_flow_.GetBytes(); + } + bool HasGarbage() const { + assert(IsValid()); + return in_flow_.GetCount() > out_flow_.GetCount(); + } + uint64_t GetGarbageCount() const { + assert(IsValid()); + assert(HasGarbage()); + return in_flow_.GetCount() - out_flow_.GetCount(); + } + uint64_t GetGarbageBytes() const { + assert(IsValid()); + assert(HasGarbage()); + return in_flow_.GetBytes() - out_flow_.GetBytes(); + } + + private: + BlobStats in_flow_; + BlobStats out_flow_; + }; + + Status ProcessInFlow(const Slice& key, const Slice& value); + Status ProcessOutFlow(const Slice& key, const Slice& value); + + const std::unordered_map& flows() const { + return flows_; + } + + private: + static Status Parse(const Slice& key, const Slice& value, + uint64_t* blob_file_number, uint64_t* bytes); + + std::unordered_map flows_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_index.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_index.h new file mode 100644 index 0000000000..e9944d7844 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_index.h @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include + +#include "rocksdb/compression_type.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// BlobIndex is a pointer to the blob and metadata of the blob. The index is +// stored in base DB as ValueType::kTypeBlobIndex. +// There are three types of blob index: +// +// kInlinedTTL: +// +------+------------+---------------+ +// | type | expiration | value | +// +------+------------+---------------+ +// | char | varint64 | variable size | +// +------+------------+---------------+ +// +// kBlob: +// +------+-------------+----------+----------+-------------+ +// | type | file number | offset | size | compression | +// +------+-------------+----------+----------+-------------+ +// | char | varint64 | varint64 | varint64 | char | +// +------+-------------+----------+----------+-------------+ +// +// kBlobTTL: +// +------+------------+-------------+----------+----------+-------------+ +// | type | expiration | file number | offset | size | compression | +// +------+------------+-------------+----------+----------+-------------+ +// | char | varint64 | varint64 | varint64 | varint64 | char | +// +------+------------+-------------+----------+----------+-------------+ +// +// There isn't a kInlined (without TTL) type since we can store it as a plain +// value (i.e. ValueType::kTypeValue). 
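+//
+// For example (values illustrative), a kBlob reference to file 5, offset
+// 1024, size 512, stored uncompressed, encodes as:
+//
+//   0x01 varint64(5) varint64(1024) varint64(512) 0x00
+//
+// where the leading byte is Type::kBlob and the trailing byte is
+// kNoCompression.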
+class BlobIndex { + public: + enum class Type : unsigned char { + kInlinedTTL = 0, + kBlob = 1, + kBlobTTL = 2, + kUnknown = 3, + }; + + BlobIndex() : type_(Type::kUnknown) {} + + BlobIndex(const BlobIndex&) = default; + BlobIndex& operator=(const BlobIndex&) = default; + + bool IsInlined() const { return type_ == Type::kInlinedTTL; } + + bool HasTTL() const { + return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL; + } + + uint64_t expiration() const { + assert(HasTTL()); + return expiration_; + } + + const Slice& value() const { + assert(IsInlined()); + return value_; + } + + uint64_t file_number() const { + assert(!IsInlined()); + return file_number_; + } + + uint64_t offset() const { + assert(!IsInlined()); + return offset_; + } + + uint64_t size() const { + assert(!IsInlined()); + return size_; + } + + CompressionType compression() const { + assert(!IsInlined()); + return compression_; + } + + Status DecodeFrom(Slice slice) { + const char* kErrorMessage = "Error while decoding blob index"; + assert(slice.size() > 0); + type_ = static_cast(*slice.data()); + if (type_ >= Type::kUnknown) { + return Status::Corruption(kErrorMessage, + "Unknown blob index type: " + + std::to_string(static_cast(type_))); + } + slice = Slice(slice.data() + 1, slice.size() - 1); + if (HasTTL()) { + if (!GetVarint64(&slice, &expiration_)) { + return Status::Corruption(kErrorMessage, "Corrupted expiration"); + } + } + if (IsInlined()) { + value_ = slice; + } else { + if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) && + GetVarint64(&slice, &size_) && slice.size() == 1) { + compression_ = static_cast(*slice.data()); + } else { + return Status::Corruption(kErrorMessage, "Corrupted blob offset"); + } + } + return Status::OK(); + } + + std::string DebugString(bool output_hex) const { + std::ostringstream oss; + + if (IsInlined()) { + oss << "[inlined blob] value:" << value_.ToString(output_hex); + } else { + oss << "[blob ref] file:" << file_number_ << " offset:" << offset_ + << " size:" << size_ + << " compression: " << CompressionTypeToString(compression_); + } + + if (HasTTL()) { + oss << " exp:" << expiration_; + } + + return oss.str(); + } + + static void EncodeInlinedTTL(std::string* dst, uint64_t expiration, + const Slice& value) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(1 + kMaxVarint64Length + value.size()); + dst->push_back(static_cast(Type::kInlinedTTL)); + PutVarint64(dst, expiration); + dst->append(value.data(), value.size()); + } + + static void EncodeBlob(std::string* dst, uint64_t file_number, + uint64_t offset, uint64_t size, + CompressionType compression) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(kMaxVarint64Length * 3 + 2); + dst->push_back(static_cast(Type::kBlob)); + PutVarint64(dst, file_number); + PutVarint64(dst, offset); + PutVarint64(dst, size); + dst->push_back(static_cast(compression)); + } + + static void EncodeBlobTTL(std::string* dst, uint64_t expiration, + uint64_t file_number, uint64_t offset, + uint64_t size, CompressionType compression) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(kMaxVarint64Length * 4 + 2); + dst->push_back(static_cast(Type::kBlobTTL)); + PutVarint64(dst, expiration); + PutVarint64(dst, file_number); + PutVarint64(dst, offset); + PutVarint64(dst, size); + dst->push_back(static_cast(compression)); + } + + private: + Type type_ = Type::kUnknown; + uint64_t expiration_ = 0; + Slice value_; + uint64_t file_number_ = 0; + uint64_t offset_ = 0; + uint64_t size_ = 0; + CompressionType 
compression_ = kNoCompression; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_format.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_format.cc new file mode 100644 index 0000000000..8e26281e37 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_format.cc @@ -0,0 +1,143 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "db/blob/blob_log_format.h" + +#include "util/coding.h" +#include "util/crc32c.h" + +namespace ROCKSDB_NAMESPACE { + +void BlobLogHeader::EncodeTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogHeader::kSize); + PutFixed32(dst, kMagicNumber); + PutFixed32(dst, version); + PutFixed32(dst, column_family_id); + unsigned char flags = (has_ttl ? 1 : 0); + dst->push_back(flags); + dst->push_back(compression); + PutFixed64(dst, expiration_range.first); + PutFixed64(dst, expiration_range.second); +} + +Status BlobLogHeader::DecodeFrom(Slice src) { + const char* kErrorMessage = "Error while decoding blob log header"; + if (src.size() != BlobLogHeader::kSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob file header size"); + } + uint32_t magic_number; + unsigned char flags; + if (!GetFixed32(&src, &magic_number) || !GetFixed32(&src, &version) || + !GetFixed32(&src, &column_family_id)) { + return Status::Corruption( + kErrorMessage, + "Error decoding magic number, version and column family id"); + } + if (magic_number != kMagicNumber) { + return Status::Corruption(kErrorMessage, "Magic number mismatch"); + } + if (version != kVersion1) { + return Status::Corruption(kErrorMessage, "Unknown header version"); + } + flags = src.data()[0]; + compression = static_cast(src.data()[1]); + has_ttl = (flags & 1) == 1; + src.remove_prefix(2); + if (!GetFixed64(&src, &expiration_range.first) || + !GetFixed64(&src, &expiration_range.second)) { + return Status::Corruption(kErrorMessage, "Error decoding expiration range"); + } + return Status::OK(); +} + +void BlobLogFooter::EncodeTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogFooter::kSize); + PutFixed32(dst, kMagicNumber); + PutFixed64(dst, blob_count); + PutFixed64(dst, expiration_range.first); + PutFixed64(dst, expiration_range.second); + crc = crc32c::Value(dst->c_str(), dst->size()); + crc = crc32c::Mask(crc); + PutFixed32(dst, crc); +} + +Status BlobLogFooter::DecodeFrom(Slice src) { + const char* kErrorMessage = "Error while decoding blob log footer"; + if (src.size() != BlobLogFooter::kSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob file footer size"); + } + uint32_t src_crc = 0; + src_crc = crc32c::Value(src.data(), BlobLogFooter::kSize - sizeof(uint32_t)); + src_crc = crc32c::Mask(src_crc); + uint32_t magic_number = 0; + if (!GetFixed32(&src, &magic_number) || !GetFixed64(&src, &blob_count) || + !GetFixed64(&src, &expiration_range.first) || + !GetFixed64(&src, &expiration_range.second) || !GetFixed32(&src, &crc)) { + return Status::Corruption(kErrorMessage, "Error decoding content"); + } + if (magic_number != kMagicNumber) { + return Status::Corruption(kErrorMessage, "Magic number mismatch"); + } + if (src_crc != crc) { + return Status::Corruption(kErrorMessage, "CRC mismatch"); + } 
+ return Status::OK(); +} + +void BlobLogRecord::EncodeHeaderTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogRecord::kHeaderSize + key.size() + value.size()); + PutFixed64(dst, key.size()); + PutFixed64(dst, value.size()); + PutFixed64(dst, expiration); + header_crc = crc32c::Value(dst->c_str(), dst->size()); + header_crc = crc32c::Mask(header_crc); + PutFixed32(dst, header_crc); + blob_crc = crc32c::Value(key.data(), key.size()); + blob_crc = crc32c::Extend(blob_crc, value.data(), value.size()); + blob_crc = crc32c::Mask(blob_crc); + PutFixed32(dst, blob_crc); +} + +Status BlobLogRecord::DecodeHeaderFrom(Slice src) { + const char* kErrorMessage = "Error while decoding blob record"; + if (src.size() != BlobLogRecord::kHeaderSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob record header size"); + } + uint32_t src_crc = 0; + src_crc = crc32c::Value(src.data(), BlobLogRecord::kHeaderSize - 8); + src_crc = crc32c::Mask(src_crc); + if (!GetFixed64(&src, &key_size) || !GetFixed64(&src, &value_size) || + !GetFixed64(&src, &expiration) || !GetFixed32(&src, &header_crc) || + !GetFixed32(&src, &blob_crc)) { + return Status::Corruption(kErrorMessage, "Error decoding content"); + } + if (src_crc != header_crc) { + return Status::Corruption(kErrorMessage, "Header CRC mismatch"); + } + return Status::OK(); +} + +Status BlobLogRecord::CheckBlobCRC() const { + uint32_t expected_crc = 0; + expected_crc = crc32c::Value(key.data(), key.size()); + expected_crc = crc32c::Extend(expected_crc, value.data(), value.size()); + expected_crc = crc32c::Mask(expected_crc); + if (expected_crc != blob_crc) { + return Status::Corruption("Blob CRC mismatch"); + } + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_format.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_format.h new file mode 100644 index 0000000000..607db23678 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_format.h @@ -0,0 +1,164 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Log format information shared by reader and writer. + +#pragma once + +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +constexpr uint32_t kMagicNumber = 2395959; // 0x00248f37 +constexpr uint32_t kVersion1 = 1; + +using ExpirationRange = std::pair; + +// clang-format off + +// Format of blob log file header (30 bytes): +// +// +--------------+---------+---------+-------+-------------+-------------------+ +// | magic number | version | cf id | flags | compression | expiration range | +// +--------------+---------+---------+-------+-------------+-------------------+ +// | Fixed32 | Fixed32 | Fixed32 | char | char | Fixed64 Fixed64 | +// +--------------+---------+---------+-------+-------------+-------------------+ +// +// List of flags: +// has_ttl: Whether the file contain TTL data. +// +// Expiration range in the header is a rough range based on +// blob_db_options.ttl_range_secs. 
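+//
+// For example (values illustrative), a header for column family 7 with no
+// TTL and Snappy compression would encode as:
+//
+//   Fixed32(kMagicNumber) Fixed32(kVersion1) Fixed32(7) 0x00 0x01
+//   Fixed64(0) Fixed64(0)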
+
+// clang-format on
+
+struct BlobLogHeader {
+  static constexpr size_t kSize = 30;
+
+  BlobLogHeader() = default;
+  BlobLogHeader(uint32_t _column_family_id, CompressionType _compression,
+                bool _has_ttl, const ExpirationRange& _expiration_range)
+      : column_family_id(_column_family_id),
+        compression(_compression),
+        has_ttl(_has_ttl),
+        expiration_range(_expiration_range) {}
+
+  uint32_t version = kVersion1;
+  uint32_t column_family_id = 0;
+  CompressionType compression = kNoCompression;
+  bool has_ttl = false;
+  ExpirationRange expiration_range;
+
+  void EncodeTo(std::string* dst);
+
+  Status DecodeFrom(Slice slice);
+};
+
+// clang-format off
+
+// Format of blob log file footer (32 bytes):
+//
+//    +--------------+------------+-------------------+------------+
+//    | magic number | blob count | expiration range  | footer CRC |
+//    +--------------+------------+-------------------+------------+
+//    |   Fixed32    |  Fixed64   | Fixed64 + Fixed64 |  Fixed32   |
+//    +--------------+------------+-------------------+------------+
+//
+// The footer is present only when the blob file was properly closed.
+//
+// Unlike the same field in the file header, the expiration range in the
+// footer is the range of the smallest and largest expiration of the data in
+// this file.
+
+// clang-format on
+
+struct BlobLogFooter {
+  static constexpr size_t kSize = 32;
+
+  uint64_t blob_count = 0;
+  ExpirationRange expiration_range = std::make_pair(0, 0);
+  uint32_t crc = 0;
+
+  void EncodeTo(std::string* dst);
+
+  Status DecodeFrom(Slice slice);
+};
+
+// clang-format off
+
+// Blob record format (32 bytes header + key + value):
+//
+//    +------------+--------------+------------+------------+----------+---------+-----------+
+//    | key length | value length | expiration | header CRC | blob CRC |   key   |   value   |
+//    +------------+--------------+------------+------------+----------+---------+-----------+
+//    |  Fixed64   |   Fixed64    |  Fixed64   |  Fixed32   | Fixed32  | key len | value len |
+//    +------------+--------------+------------+------------+----------+---------+-----------+
+//
+// If the file has has_ttl = false, the expiration field is always 0, and the
+// blob doesn't have an expiration.
+//
+// Also note that if compression is used, value is the compressed value, and
+// value length is the length of the compressed value.
+//
+// Header CRC is the checksum of (key_len + val_len + expiration), while
+// blob CRC is the checksum of (key + value).
+//
+// We could use variable length encoding (Varint64) to save more space, but it
+// would make the reader more complicated.
+
+// clang-format on
+
+struct BlobLogRecord {
+  // The header includes the fields up to and including the blob CRC.
+  static constexpr size_t kHeaderSize = 32;
+
+  // Note that the offset field of BlobIndex actually points to the blob value
+  // as opposed to the start of the blob record. The following method can
+  // be used to calculate the adjustment needed to read the blob record header.
+  static constexpr uint64_t CalculateAdjustmentForRecordHeader(
+      uint64_t key_size) {
+    return key_size + kHeaderSize;
+  }
+
+  uint64_t key_size = 0;
+  uint64_t value_size = 0;
+  uint64_t expiration = 0;
+  uint32_t header_crc = 0;
+  uint32_t blob_crc = 0;
+  Slice key;
+  Slice value;
+  std::unique_ptr<char[]> key_buf;
+  std::unique_ptr<char[]> value_buf;
+
+  uint64_t record_size() const { return kHeaderSize + key_size + value_size; }
+
+  void EncodeHeaderTo(std::string* dst);
+
+  Status DecodeHeaderFrom(Slice src);
+
+  Status CheckBlobCRC() const;
+};
+
+// Checks whether a blob offset is potentially valid or not.
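+//
+// The lower bound below accounts for the file header plus the record header
+// and key that precede the value; the upper bound ensures the value and the
+// file footer still fit within the file. For example (numbers illustrative),
+// in a 1000-byte file with a 10-byte key, a value at offset 72 with size 100
+// passes both checks: 72 >= 30 + 32 + 10 and 72 + 100 + 32 <= 1000.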
+inline bool IsValidBlobOffset(uint64_t value_offset, uint64_t key_size, + uint64_t value_size, uint64_t file_size) { + if (value_offset < + BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key_size) { + return false; + } + + if (value_offset + value_size + BlobLogFooter::kSize > file_size) { + return false; + } + + return true; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_sequential_reader.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_sequential_reader.cc new file mode 100644 index 0000000000..2ed4306819 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_sequential_reader.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "db/blob/blob_log_sequential_reader.h" + +#include "file/random_access_file_reader.h" +#include "monitoring/statistics_impl.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +BlobLogSequentialReader::BlobLogSequentialReader( + std::unique_ptr&& file_reader, SystemClock* clock, + Statistics* statistics) + : file_(std::move(file_reader)), + clock_(clock), + statistics_(statistics), + next_byte_(0) {} + +BlobLogSequentialReader::~BlobLogSequentialReader() = default; + +Status BlobLogSequentialReader::ReadSlice(uint64_t size, Slice* slice, + char* buf) { + assert(slice); + assert(file_); + + StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); + // TODO: rate limit `BlobLogSequentialReader` reads (it appears unused?) + Status s = + file_->Read(IOOptions(), next_byte_, static_cast(size), slice, + buf, nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + next_byte_ += size; + if (!s.ok()) { + return s; + } + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, slice->size()); + if (slice->size() != size) { + return Status::Corruption("EOF reached while reading record"); + } + return s; +} + +Status BlobLogSequentialReader::ReadHeader(BlobLogHeader* header) { + assert(header); + assert(next_byte_ == 0); + + static_assert(BlobLogHeader::kSize <= sizeof(header_buf_), + "Buffer is smaller than BlobLogHeader::kSize"); + + Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, header_buf_); + if (!s.ok()) { + return s; + } + + if (buffer_.size() != BlobLogHeader::kSize) { + return Status::Corruption("EOF reached before file header"); + } + + return header->DecodeFrom(buffer_); +} + +Status BlobLogSequentialReader::ReadRecord(BlobLogRecord* record, + ReadLevel level, + uint64_t* blob_offset) { + assert(record); + static_assert(BlobLogRecord::kHeaderSize <= sizeof(header_buf_), + "Buffer is smaller than BlobLogRecord::kHeaderSize"); + + Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, header_buf_); + if (!s.ok()) { + return s; + } + if (buffer_.size() != BlobLogRecord::kHeaderSize) { + return Status::Corruption("EOF reached before record header"); + } + + s = record->DecodeHeaderFrom(buffer_); + if (!s.ok()) { + return s; + } + + uint64_t kb_size = record->key_size + record->value_size; + if (blob_offset != nullptr) { + *blob_offset = next_byte_ + record->key_size; + } + + switch (level) { + case kReadHeader: + next_byte_ += kb_size; + break; + + case kReadHeaderKey: + record->key_buf.reset(new char[record->key_size]); + s = ReadSlice(record->key_size, &record->key, 
+                    record->key_buf.get());
+      next_byte_ += record->value_size;
+      break;
+
+    case kReadHeaderKeyBlob:
+      record->key_buf.reset(new char[record->key_size]);
+      s = ReadSlice(record->key_size, &record->key, record->key_buf.get());
+      if (s.ok()) {
+        record->value_buf.reset(new char[record->value_size]);
+        s = ReadSlice(record->value_size, &record->value,
+                      record->value_buf.get());
+      }
+      if (s.ok()) {
+        s = record->CheckBlobCRC();
+      }
+      break;
+  }
+  return s;
+}
+
+Status BlobLogSequentialReader::ReadFooter(BlobLogFooter* footer) {
+  assert(footer);
+  static_assert(BlobLogFooter::kSize <= sizeof(header_buf_),
+                "Buffer is smaller than BlobLogFooter::kSize");
+
+  Status s = ReadSlice(BlobLogFooter::kSize, &buffer_, header_buf_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (buffer_.size() != BlobLogFooter::kSize) {
+    return Status::Corruption("EOF reached before file footer");
+  }
+
+  return footer->DecodeFrom(buffer_);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_sequential_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_sequential_reader.h
new file mode 100644
index 0000000000..98afa8518b
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_sequential_reader.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <memory>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+
+#define MAX_HEADER_SIZE(a, b, c) (a > b ? (a > c ? a : c) : (b > c ? b : c))
+
+namespace ROCKSDB_NAMESPACE {
+
+class RandomAccessFileReader;
+class Env;
+class Statistics;
+class Status;
+class SystemClock;
+
+/**
+ * BlobLogSequentialReader is a general purpose log stream reader
+ * implementation. The actual job of reading from the device is implemented by
+ * the RandomAccessFileReader interface.
+ *
+ * Please see BlobLogWriter for details on the file and record layout.
+ */
+
+class BlobLogSequentialReader {
+ public:
+  enum ReadLevel {
+    kReadHeader,
+    kReadHeaderKey,
+    kReadHeaderKeyBlob,
+  };
+
+  // Create a reader that will return log records from "*file_reader".
+  BlobLogSequentialReader(
+      std::unique_ptr<RandomAccessFileReader>&& file_reader,
+      SystemClock* clock, Statistics* statistics);
+
+  // No copying allowed
+  BlobLogSequentialReader(const BlobLogSequentialReader&) = delete;
+  BlobLogSequentialReader& operator=(const BlobLogSequentialReader&) = delete;
+
+  ~BlobLogSequentialReader();
+
+  Status ReadHeader(BlobLogHeader* header);
+
+  // Read the next record into *record. Returns OK on success, and a non-OK
+  // status if we hit the end of the input or a read error. The contents
+  // filled in *record will only be valid until the next mutating operation
+  // on this reader.
+  // If blob_offset is non-null, return the offset of the blob through it.
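+  // "level" controls how much of the record is materialized: kReadHeader
+  // decodes only the record header (and skips over the key and value),
+  // kReadHeaderKey additionally reads the key, and kReadHeaderKeyBlob reads
+  // both the key and the value and verifies the blob CRC.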
+ Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHeader, + uint64_t* blob_offset = nullptr); + + Status ReadFooter(BlobLogFooter* footer); + + void ResetNextByte() { next_byte_ = 0; } + + uint64_t GetNextByte() const { return next_byte_; } + + private: + Status ReadSlice(uint64_t size, Slice* slice, char* buf); + + const std::unique_ptr file_; + SystemClock* clock_; + + Statistics* statistics_; + + Slice buffer_; + char header_buf_[MAX_HEADER_SIZE(BlobLogHeader::kSize, BlobLogFooter::kSize, + BlobLogRecord::kHeaderSize)]; + + // which byte to read next + uint64_t next_byte_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#undef MAX_HEADER_SIZE \ No newline at end of file diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_writer.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_writer.cc new file mode 100644 index 0000000000..bf5ef27c1d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_writer.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_log_writer.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "file/writable_file_writer.h" +#include "monitoring/statistics_impl.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +BlobLogWriter::BlobLogWriter(std::unique_ptr&& dest, + SystemClock* clock, Statistics* statistics, + uint64_t log_number, bool use_fs, bool do_flush, + uint64_t boffset) + : dest_(std::move(dest)), + clock_(clock), + statistics_(statistics), + log_number_(log_number), + block_offset_(boffset), + use_fsync_(use_fs), + do_flush_(do_flush), + last_elem_type_(kEtNone) {} + +BlobLogWriter::~BlobLogWriter() = default; + +Status BlobLogWriter::Sync() { + TEST_SYNC_POINT("BlobLogWriter::Sync"); + + StopWatch sync_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS); + Status s = dest_->Sync(use_fsync_); + RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED); + return s; +} + +Status BlobLogWriter::WriteHeader(BlobLogHeader& header) { + assert(block_offset_ == 0); + assert(last_elem_type_ == kEtNone); + std::string str; + header.EncodeTo(&str); + + Status s = dest_->Append(Slice(str)); + if (s.ok()) { + block_offset_ += str.size(); + if (do_flush_) { + s = dest_->Flush(); + } + } + last_elem_type_ = kEtFileHdr; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogHeader::kSize); + return s; +} + +Status BlobLogWriter::AppendFooter(BlobLogFooter& footer, + std::string* checksum_method, + std::string* checksum_value) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string str; + footer.EncodeTo(&str); + + Status s; + if (dest_->seen_error()) { + s.PermitUncheckedError(); + return Status::IOError("Seen Error. 
Skip closing."); + } else { + s = dest_->Append(Slice(str)); + if (s.ok()) { + block_offset_ += str.size(); + + s = Sync(); + + if (s.ok()) { + s = dest_->Close(); + + if (s.ok()) { + assert(!!checksum_method == !!checksum_value); + + if (checksum_method) { + assert(checksum_method->empty()); + + std::string method = dest_->GetFileChecksumFuncName(); + if (method != kUnknownFileChecksumFuncName) { + *checksum_method = std::move(method); + } + } + if (checksum_value) { + assert(checksum_value->empty()); + + std::string value = dest_->GetFileChecksum(); + if (value != kUnknownFileChecksum) { + *checksum_value = std::move(value); + } + } + } + } + } + + dest_.reset(); + } + + last_elem_type_ = kEtFileFooter; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogFooter::kSize); + return s; +} + +Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, + uint64_t expiration, uint64_t* key_offset, + uint64_t* blob_offset) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string buf; + ConstructBlobHeader(&buf, key, val, expiration); + + Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + return s; +} + +Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, + uint64_t* key_offset, uint64_t* blob_offset) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string buf; + ConstructBlobHeader(&buf, key, val, 0); + + Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + return s; +} + +void BlobLogWriter::ConstructBlobHeader(std::string* buf, const Slice& key, + const Slice& val, uint64_t expiration) { + BlobLogRecord record; + record.key = key; + record.value = val; + record.expiration = expiration; + record.EncodeHeaderTo(buf); +} + +Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf, + const Slice& key, const Slice& val, + uint64_t* key_offset, + uint64_t* blob_offset) { + StopWatch write_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS); + Status s = dest_->Append(Slice(headerbuf)); + if (s.ok()) { + s = dest_->Append(key); + } + if (s.ok()) { + s = dest_->Append(val); + } + if (do_flush_ && s.ok()) { + s = dest_->Flush(); + } + + *key_offset = block_offset_ + BlobLogRecord::kHeaderSize; + *blob_offset = *key_offset + key.size(); + block_offset_ = *blob_offset + val.size(); + last_elem_type_ = kEtRecord; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogRecord::kHeaderSize + key.size() + val.size()); + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_writer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_writer.h new file mode 100644 index 0000000000..c1f9f31ad0 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_log_writer.h @@ -0,0 +1,83 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include +#include + +#include "db/blob/blob_log_format.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class WritableFileWriter; +class SystemClock; +/** + * BlobLogWriter is the blob log stream writer. 
+ * It provides an append-only abstraction for writing blob data.
+ *
+ * Look at blob_log_format.h to see the details of the record formats.
+ */
+
+class BlobLogWriter {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this BlobLogWriter is in use.
+  BlobLogWriter(std::unique_ptr<WritableFileWriter>&& dest, SystemClock* clock,
+                Statistics* statistics, uint64_t log_number, bool use_fsync,
+                bool do_flush, uint64_t boffset = 0);
+  // No copying allowed
+  BlobLogWriter(const BlobLogWriter&) = delete;
+  BlobLogWriter& operator=(const BlobLogWriter&) = delete;
+
+  ~BlobLogWriter();
+
+  static void ConstructBlobHeader(std::string* buf, const Slice& key,
+                                  const Slice& val, uint64_t expiration);
+
+  Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
+                   uint64_t* blob_offset);
+
+  Status AddRecord(const Slice& key, const Slice& val, uint64_t expiration,
+                   uint64_t* key_offset, uint64_t* blob_offset);
+
+  Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key,
+                            const Slice& val, uint64_t* key_offset,
+                            uint64_t* blob_offset);
+
+  Status AppendFooter(BlobLogFooter& footer, std::string* checksum_method,
+                      std::string* checksum_value);
+
+  Status WriteHeader(BlobLogHeader& header);
+
+  WritableFileWriter* file() { return dest_.get(); }
+
+  const WritableFileWriter* file() const { return dest_.get(); }
+
+  uint64_t get_log_number() const { return log_number_; }
+
+  Status Sync();
+
+ private:
+  std::unique_ptr<WritableFileWriter> dest_;
+  SystemClock* clock_;
+  Statistics* statistics_;
+  uint64_t log_number_;
+  uint64_t block_offset_;  // Current offset in block
+  bool use_fsync_;
+  bool do_flush_;
+
+ public:
+  enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFileFooter };
+  ElemType last_elem_type_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_read_request.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_read_request.h
new file mode 100644
index 0000000000..f9668ca2ef
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_read_request.h
@@ -0,0 +1,58 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A read Blob request structure for use in BlobSource::MultiGetBlob and
+// BlobFileReader::MultiGetBlob.
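+//
+// A hypothetical sketch of how a caller might fill one in (the surrounding
+// variable names here are illustrative only):
+//
+//   PinnableSlice result;
+//   Status status;
+//   BlobReadRequest req(user_key, /*offset=*/1234, /*len=*/5678,
+//                       kNoCompression, &result, &status);
+//
+// "result" and "status" are output parameters owned by the caller and must
+// outlive the MultiGetBlob call that consumes the request.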
+struct BlobReadRequest {
+  // User key to lookup the paired blob
+  const Slice* user_key = nullptr;
+
+  // File offset in bytes
+  uint64_t offset = 0;
+
+  // Length to read in bytes
+  size_t len = 0;
+
+  // Blob compression type
+  CompressionType compression = kNoCompression;
+
+  // Output parameter set by MultiGetBlob() to point to the data buffer, and
+  // the number of valid bytes
+  PinnableSlice* result = nullptr;
+
+  // Status of read
+  Status* status = nullptr;
+
+  BlobReadRequest(const Slice& _user_key, uint64_t _offset, size_t _len,
+                  CompressionType _compression, PinnableSlice* _result,
+                  Status* _status)
+      : user_key(&_user_key),
+        offset(_offset),
+        len(_len),
+        compression(_compression),
+        result(_result),
+        status(_status) {}
+
+  BlobReadRequest() = default;
+  BlobReadRequest(const BlobReadRequest& other) = default;
+  BlobReadRequest& operator=(const BlobReadRequest& other) = default;
+};
+
+using BlobFileReadRequests =
+    std::tuple<uint64_t /* file_number */, uint64_t /* file_size */,
+               autovector<BlobReadRequest>>;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_source.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_source.cc
new file mode 100644
index 0000000000..b524982e53
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_source.cc
@@ -0,0 +1,459 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_source.h"
+
+#include <cassert>
+#include <string>
+
+#include "cache/cache_reservation_manager.h"
+#include "cache/charged_cache.h"
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_log_format.h"
+#include "monitoring/statistics_impl.h"
+#include "options/cf_options.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobSource::BlobSource(const ImmutableOptions* immutable_options,
+                       const std::string& db_id,
+                       const std::string& db_session_id,
+                       BlobFileCache* blob_file_cache)
+    : db_id_(db_id),
+      db_session_id_(db_session_id),
+      statistics_(immutable_options->statistics.get()),
+      blob_file_cache_(blob_file_cache),
+      blob_cache_(immutable_options->blob_cache),
+      lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) {
+  auto bbto =
+      immutable_options->table_factory->GetOptions<BlockBasedTableOptions>();
+  if (bbto &&
+      bbto->cache_usage_options.options_overrides
+              .at(CacheEntryRole::kBlobCache)
+              .charged == CacheEntryRoleOptions::Decision::kEnabled) {
+    blob_cache_ = SharedCacheInterface{std::make_shared<ChargedCache>(
+        immutable_options->blob_cache, bbto->block_cache)};
+  }
+}
+
+BlobSource::~BlobSource() = default;
+
+Status BlobSource::GetBlobFromCache(
+    const Slice& cache_key,
+    CacheHandleGuard<BlobContents>* cached_blob) const {
+  assert(blob_cache_);
+  assert(!cache_key.empty());
+  assert(cached_blob);
+  assert(cached_blob->IsEmpty());
+
+  Cache::Handle* cache_handle = nullptr;
+  cache_handle = GetEntryFromCache(cache_key);
+  if (cache_handle != nullptr) {
+    *cached_blob =
+        CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
+
+    assert(cached_blob->GetValue());
+
+    PERF_COUNTER_ADD(blob_cache_hit_count, 1);
+    RecordTick(statistics_, BLOB_DB_CACHE_HIT);
+    RecordTick(statistics_, BLOB_DB_CACHE_BYTES_READ,
+               cached_blob->GetValue()->size());
+
+    return Status::OK();
+  }
+
+  RecordTick(statistics_, BLOB_DB_CACHE_MISS);
+
+  return Status::NotFound("Blob not found in cache");
+}
+
+Status BlobSource::PutBlobIntoCache(
+    const Slice& cache_key, std::unique_ptr<BlobContents>* blob,
+    CacheHandleGuard<BlobContents>* cached_blob) const {
+  assert(blob_cache_);
+  assert(!cache_key.empty());
+  assert(blob);
+  assert(*blob);
+  assert(cached_blob);
+  assert(cached_blob->IsEmpty());
+
+  TypedHandle* cache_handle = nullptr;
+  const Status s = InsertEntryIntoCache(cache_key, blob->get(),
+                                        &cache_handle, Cache::Priority::BOTTOM);
+  if (s.ok()) {
+    blob->release();
+
+    assert(cache_handle != nullptr);
+    *cached_blob =
+        CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
+
+    assert(cached_blob->GetValue());
+
+    RecordTick(statistics_, BLOB_DB_CACHE_ADD);
+    RecordTick(statistics_, BLOB_DB_CACHE_BYTES_WRITE,
+               cached_blob->GetValue()->size());
+
+  } else {
+    RecordTick(statistics_, BLOB_DB_CACHE_ADD_FAILURES);
+  }
+
+  return s;
+}
+
+BlobSource::TypedHandle* BlobSource::GetEntryFromCache(const Slice& key) const {
+  return blob_cache_.LookupFull(key, nullptr /* context */,
+                                Cache::Priority::BOTTOM, statistics_,
+                                lowest_used_cache_tier_);
+}
+
+void BlobSource::PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
+                               PinnableSlice* value) {
+  assert(cached_blob);
+  assert(cached_blob->GetValue());
+  assert(value);
+
+  // To avoid copying the cached blob into the buffer provided by the
+  // application, we can simply transfer ownership of the cache handle to
+  // the target PinnableSlice. This has the potential to save a lot of
+  // CPU, especially with large blob values.
+
+  value->Reset();
+
+  constexpr Cleanable* cleanable = nullptr;
+  value->PinSlice(cached_blob->GetValue()->data(), cleanable);
+
+  cached_blob->TransferTo(value);
+}
+
+void BlobSource::PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
+                              PinnableSlice* value) {
+  assert(owned_blob);
+  assert(*owned_blob);
+  assert(value);
+
+  BlobContents* const blob = owned_blob->release();
+  assert(blob);
+
+  value->Reset();
+  value->PinSlice(
+      blob->data(),
+      [](void* arg1, void* /* arg2 */) {
+        delete static_cast<BlobContents*>(arg1);
+      },
+      blob, nullptr);
+}
+
+Status BlobSource::InsertEntryIntoCache(const Slice& key, BlobContents* value,
+                                        TypedHandle** cache_handle,
+                                        Cache::Priority priority) const {
+  return blob_cache_.InsertFull(key, value, value->ApproximateMemoryUsage(),
+                                cache_handle, priority,
+                                lowest_used_cache_tier_);
+}
+
+Status BlobSource::GetBlob(const ReadOptions& read_options,
+                           const Slice& user_key, uint64_t file_number,
+                           uint64_t offset, uint64_t file_size,
+                           uint64_t value_size,
+                           CompressionType compression_type,
+                           FilePrefetchBuffer* prefetch_buffer,
+                           PinnableSlice* value, uint64_t* bytes_read) {
+  assert(value);
+
+  Status s;
+
+  const CacheKey cache_key = GetCacheKey(file_number, file_size, offset);
+
+  CacheHandleGuard<BlobContents> blob_handle;
+
+  // First, try to get the blob from the cache
+  //
+  // If blob cache is enabled, we'll try to read from it.
+  if (blob_cache_) {
+    Slice key = cache_key.AsSlice();
+    s = GetBlobFromCache(key, &blob_handle);
+    if (s.ok()) {
+      PinCachedBlob(&blob_handle, value);
+
+      // For consistency, the size of on-disk (possibly compressed) blob record
+      // is assigned to bytes_read.
+      uint64_t adjustment =
+          read_options.verify_checksums
+              ?
BlobLogRecord::CalculateAdjustmentForRecordHeader( + user_key.size()) + : 0; + assert(offset >= adjustment); + + uint64_t record_size = value_size + adjustment; + if (bytes_read) { + *bytes_read = record_size; + } + return s; + } + } + + assert(blob_handle.IsEmpty()); + + const bool no_io = read_options.read_tier == kBlockCacheTier; + if (no_io) { + s = Status::Incomplete("Cannot read blob(s): no disk I/O allowed"); + return s; + } + + // Can't find the blob from the cache. Since I/O is allowed, read from the + // file. + std::unique_ptr blob_contents; + + { + CacheHandleGuard blob_file_reader; + s = blob_file_cache_->GetBlobFileReader(read_options, file_number, + &blob_file_reader); + if (!s.ok()) { + return s; + } + + assert(blob_file_reader.GetValue()); + + if (compression_type != blob_file_reader.GetValue()->GetCompressionType()) { + return Status::Corruption("Compression type mismatch when reading blob"); + } + + MemoryAllocator* const allocator = + (blob_cache_ && read_options.fill_cache) + ? blob_cache_.get()->memory_allocator() + : nullptr; + + uint64_t read_size = 0; + s = blob_file_reader.GetValue()->GetBlob( + read_options, user_key, offset, value_size, compression_type, + prefetch_buffer, allocator, &blob_contents, &read_size); + if (!s.ok()) { + return s; + } + if (bytes_read) { + *bytes_read = read_size; + } + } + + if (blob_cache_ && read_options.fill_cache) { + // If filling cache is allowed and a cache is configured, try to put the + // blob to the cache. + Slice key = cache_key.AsSlice(); + s = PutBlobIntoCache(key, &blob_contents, &blob_handle); + if (!s.ok()) { + return s; + } + + PinCachedBlob(&blob_handle, value); + } else { + PinOwnedBlob(&blob_contents, value); + } + + assert(s.ok()); + return s; +} + +void BlobSource::MultiGetBlob(const ReadOptions& read_options, + autovector& blob_reqs, + uint64_t* bytes_read) { + assert(blob_reqs.size() > 0); + + uint64_t total_bytes_read = 0; + uint64_t bytes_read_in_file = 0; + + for (auto& [file_number, file_size, blob_reqs_in_file] : blob_reqs) { + // sort blob_reqs_in_file by file offset. 
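+    // (Requests sorted by ascending offset can then be served in a single
+    // forward pass over the file, which favors sequential rather than random
+    // I/O.)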
+    std::sort(
+        blob_reqs_in_file.begin(), blob_reqs_in_file.end(),
+        [](const BlobReadRequest& lhs, const BlobReadRequest& rhs) -> bool {
+          return lhs.offset < rhs.offset;
+        });
+
+    MultiGetBlobFromOneFile(read_options, file_number, file_size,
+                            blob_reqs_in_file, &bytes_read_in_file);
+
+    total_bytes_read += bytes_read_in_file;
+  }
+
+  if (bytes_read) {
+    *bytes_read = total_bytes_read;
+  }
+}
+
+void BlobSource::MultiGetBlobFromOneFile(
+    const ReadOptions& read_options, uint64_t file_number,
+    uint64_t /*file_size*/, autovector<BlobReadRequest>& blob_reqs,
+    uint64_t* bytes_read) {
+  const size_t num_blobs = blob_reqs.size();
+  assert(num_blobs > 0);
+  assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE);
+
+#ifndef NDEBUG
+  for (size_t i = 0; i < num_blobs - 1; ++i) {
+    assert(blob_reqs[i].offset <= blob_reqs[i + 1].offset);
+  }
+#endif  // !NDEBUG
+
+  using Mask = uint64_t;
+  Mask cache_hit_mask = 0;
+
+  uint64_t total_bytes = 0;
+  const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+
+  if (blob_cache_) {
+    size_t cached_blob_count = 0;
+    for (size_t i = 0; i < num_blobs; ++i) {
+      auto& req = blob_reqs[i];
+
+      CacheHandleGuard<BlobContents> blob_handle;
+      const CacheKey cache_key = base_cache_key.WithOffset(req.offset);
+      const Slice key = cache_key.AsSlice();
+
+      const Status s = GetBlobFromCache(key, &blob_handle);
+
+      if (s.ok()) {
+        assert(req.status);
+        *req.status = s;
+
+        PinCachedBlob(&blob_handle, req.result);
+
+        // Update the counter for the number of valid blobs read from the
+        // cache.
+        ++cached_blob_count;
+
+        // For consistency, the size of each on-disk (possibly compressed) blob
+        // record is accumulated to total_bytes.
+        uint64_t adjustment =
+            read_options.verify_checksums
+                ? BlobLogRecord::CalculateAdjustmentForRecordHeader(
+                      req.user_key->size())
+                : 0;
+        assert(req.offset >= adjustment);
+        total_bytes += req.len + adjustment;
+        cache_hit_mask |= (Mask{1} << i);  // cache hit
+      }
+    }
+
+    // All blobs were read from the cache.
+    if (cached_blob_count == num_blobs) {
+      if (bytes_read) {
+        *bytes_read = total_bytes;
+      }
+      return;
+    }
+  }
+
+  const bool no_io = read_options.read_tier == kBlockCacheTier;
+  if (no_io) {
+    for (size_t i = 0; i < num_blobs; ++i) {
+      if (!(cache_hit_mask & (Mask{1} << i))) {
+        BlobReadRequest& req = blob_reqs[i];
+        assert(req.status);
+
+        *req.status =
+            Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
+      }
+    }
+    return;
+  }
+
+  {
+    // Find the rest of blobs from the file since I/O is allowed.
+    autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+        _blob_reqs;
+    uint64_t _bytes_read = 0;
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      if (!(cache_hit_mask & (Mask{1} << i))) {
+        _blob_reqs.emplace_back(&blob_reqs[i],
+                                std::unique_ptr<BlobContents>());
+      }
+    }
+
+    CacheHandleGuard<BlobFileReader> blob_file_reader;
+    Status s = blob_file_cache_->GetBlobFileReader(read_options, file_number,
+                                                   &blob_file_reader);
+    if (!s.ok()) {
+      for (size_t i = 0; i < _blob_reqs.size(); ++i) {
+        BlobReadRequest* const req = _blob_reqs[i].first;
+        assert(req);
+        assert(req->status);
+
+        *req->status = s;
+      }
+      return;
+    }
+
+    assert(blob_file_reader.GetValue());
+
+    MemoryAllocator* const allocator =
+        (blob_cache_ && read_options.fill_cache)
+            ? blob_cache_.get()->memory_allocator()
+            : nullptr;
+
+    blob_file_reader.GetValue()->MultiGetBlob(read_options, allocator,
+                                              _blob_reqs, &_bytes_read);
+
+    if (blob_cache_ && read_options.fill_cache) {
+      // If filling cache is allowed and a cache is configured, try to put
+      // the blob(s) to the cache.
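+      // (On a successful insertion the cache takes ownership of the blob
+      // contents and the result is pinned via the cache handle; if the
+      // insertion fails, the failure status is propagated to the request.)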
+      for (auto& [req, blob_contents] : _blob_reqs) {
+        assert(req);
+
+        if (req->status->ok()) {
+          CacheHandleGuard<BlobContents> blob_handle;
+          const CacheKey cache_key = base_cache_key.WithOffset(req->offset);
+          const Slice key = cache_key.AsSlice();
+          s = PutBlobIntoCache(key, &blob_contents, &blob_handle);
+          if (!s.ok()) {
+            *req->status = s;
+          } else {
+            PinCachedBlob(&blob_handle, req->result);
+          }
+        }
+      }
+    } else {
+      for (auto& [req, blob_contents] : _blob_reqs) {
+        assert(req);
+
+        if (req->status->ok()) {
+          PinOwnedBlob(&blob_contents, req->result);
+        }
+      }
+    }
+
+    total_bytes += _bytes_read;
+    if (bytes_read) {
+      *bytes_read = total_bytes;
+    }
+  }
+}
+
+bool BlobSource::TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
+                                  uint64_t offset, size_t* charge) const {
+  const CacheKey cache_key = GetCacheKey(file_number, file_size, offset);
+  const Slice key = cache_key.AsSlice();
+
+  CacheHandleGuard<BlobContents> blob_handle;
+  const Status s = GetBlobFromCache(key, &blob_handle);
+
+  if (s.ok() && blob_handle.GetValue() != nullptr) {
+    if (charge) {
+      const Cache* const cache = blob_handle.GetCache();
+      assert(cache);
+
+      Cache::Handle* const handle = blob_handle.GetCacheHandle();
+      assert(handle);
+
+      *charge = cache->GetUsage(handle);
+    }
+
+    return true;
+  }
+
+  return false;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_source.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_source.h
new file mode 100644
index 0000000000..d5e009b54d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/blob_source.h
@@ -0,0 +1,161 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+#include <memory>
+
+#include "cache/cache_key.h"
+#include "cache/typed_cache.h"
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_read_request.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "table/block_based/cachable_entry.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ImmutableOptions;
+class Status;
+class FilePrefetchBuffer;
+class Slice;
+
+// BlobSource is a class that provides universal access to blobs, regardless of
+// whether they are in the blob cache, secondary cache, or (remote) storage.
+// Depending on user settings, it always fetches blobs from the multi-tier
+// cache and storage with minimal cost.
+class BlobSource {
+ public:
+  BlobSource(const ImmutableOptions* immutable_options,
+             const std::string& db_id, const std::string& db_session_id,
+             BlobFileCache* blob_file_cache);
+
+  BlobSource(const BlobSource&) = delete;
+  BlobSource& operator=(const BlobSource&) = delete;
+
+  ~BlobSource();
+
+  // Read a blob from the underlying cache or one blob file.
+  //
+  // If successful, returns ok and sets "*value" to the newly retrieved
+  // uncompressed blob. If there was an error while fetching the blob, sets
+  // "*value" to empty and returns a non-ok status.
+  //
+  // Note: For consistency, whether the blob is found in the cache or on disk,
+  // sets "*bytes_read" to the size of the on-disk (possibly compressed) blob
+  // record.
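+  // (As a worked example of the note above: for a 16-byte key and a 100-byte
+  // on-disk value, a cache hit with verify_checksums enabled reports
+  // 100 + BlobLogRecord::kHeaderSize + 16 bytes read, matching what reading
+  // the whole record from disk would have reported.)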
+  Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+                 uint64_t file_number, uint64_t offset, uint64_t file_size,
+                 uint64_t value_size, CompressionType compression_type,
+                 FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
+                 uint64_t* bytes_read);
+
+  // Read multiple blobs from the underlying cache or blob file(s).
+  //
+  // If successful, returns ok and sets "result" in the elements of "blob_reqs"
+  // to the newly retrieved uncompressed blobs. If there was an error while
+  // fetching one of the blobs, sets its "result" to empty and sets its
+  // corresponding "status" to a non-ok status.
+  //
+  // Note:
+  // - The main difference between this function and MultiGetBlobFromOneFile is
+  // that this function can read multiple blobs from multiple blob files.
+  //
+  // - For consistency, whether the blob is found in the cache or on disk, sets
+  // "*bytes_read" to the total size of on-disk (possibly compressed) blob
+  // records.
+  void MultiGetBlob(const ReadOptions& read_options,
+                    autovector<BlobFileReadRequests>& blob_reqs,
+                    uint64_t* bytes_read);
+
+  // Read multiple blobs from the underlying cache or one blob file.
+  //
+  // If successful, returns ok and sets "result" in the elements of "blob_reqs"
+  // to the newly retrieved uncompressed blobs. If there was an error while
+  // fetching one of the blobs, sets its "result" to empty and sets its
+  // corresponding "status" to a non-ok status.
+  //
+  // Note:
+  // - The main difference between this function and MultiGetBlob is that this
+  // function is only used for the case where the demanded blobs are stored in
+  // one blob file. MultiGetBlob will call this function multiple times if the
+  // demanded blobs are stored in multiple blob files.
+  //
+  // - For consistency, whether the blob is found in the cache or on disk, sets
+  // "*bytes_read" to the total size of on-disk (possibly compressed) blob
+  // records.
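+  // - "file_size" is unused by the current implementation; the offsets and
+  //   lengths to read come from the requests themselves.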
+  void MultiGetBlobFromOneFile(const ReadOptions& read_options,
+                               uint64_t file_number, uint64_t file_size,
+                               autovector<BlobReadRequest>& blob_reqs,
+                               uint64_t* bytes_read);
+
+  inline Status GetBlobFileReader(
+      const ReadOptions& read_options, uint64_t blob_file_number,
+      CacheHandleGuard<BlobFileReader>* blob_file_reader) {
+    return blob_file_cache_->GetBlobFileReader(read_options, blob_file_number,
+                                               blob_file_reader);
+  }
+
+  inline Cache* GetBlobCache() const { return blob_cache_.get(); }
+
+  bool TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
+                        uint64_t offset, size_t* charge = nullptr) const;
+
+  // For TypedSharedCacheInterface
+  void Create(BlobContents** out, const char* buf, size_t size,
+              MemoryAllocator* alloc);
+
+  using SharedCacheInterface =
+      FullTypedSharedCacheInterface<BlobContents, CacheEntryRole::kBlobCache>;
+  using TypedHandle = SharedCacheInterface::TypedHandle;
+
+ private:
+  Status GetBlobFromCache(const Slice& cache_key,
+                          CacheHandleGuard<BlobContents>* cached_blob) const;
+
+  Status PutBlobIntoCache(const Slice& cache_key,
+                          std::unique_ptr<BlobContents>* blob,
+                          CacheHandleGuard<BlobContents>* cached_blob) const;
+
+  static void PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
+                            PinnableSlice* value);
+
+  static void PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
+                           PinnableSlice* value);
+
+  TypedHandle* GetEntryFromCache(const Slice& key) const;
+
+  Status InsertEntryIntoCache(const Slice& key, BlobContents* value,
+                              TypedHandle** cache_handle,
+                              Cache::Priority priority) const;
+
+  inline CacheKey GetCacheKey(uint64_t file_number, uint64_t /*file_size*/,
+                              uint64_t offset) const {
+    OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+    return base_cache_key.WithOffset(offset);
+  }
+
+  const std::string& db_id_;
+  const std::string& db_session_id_;
+
+  Statistics* statistics_;
+
+  // A cache to store blob file readers.
+  BlobFileCache* blob_file_cache_;
+
+  // A cache to store uncompressed blobs.
+  mutable SharedCacheInterface blob_cache_;
+
+  // The control option of how the cache tiers will be used. Currently RocksDB
+  // supports a block/blob cache (volatile tier) and a secondary cache (this
+  // tier isn't strictly speaking a non-volatile tier since the compressed
+  // cache in this tier is in volatile memory).
+  const CacheTier lowest_used_cache_tier_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/prefetch_buffer_collection.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/prefetch_buffer_collection.cc
new file mode 100644
index 0000000000..079576f518
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/prefetch_buffer_collection.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
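+//
+// This file implements PrefetchBufferCollection (declared in
+// prefetch_buffer_collection.h): GetOrCreatePrefetchBuffer lazily allocates
+// one FilePrefetchBuffer per blob file number and reuses it on later calls
+// for the same file.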
+ +#include "db/blob/prefetch_buffer_collection.h" + +namespace ROCKSDB_NAMESPACE { + +FilePrefetchBuffer* PrefetchBufferCollection::GetOrCreatePrefetchBuffer( + uint64_t file_number) { + auto& prefetch_buffer = prefetch_buffers_[file_number]; + if (!prefetch_buffer) { + prefetch_buffer.reset( + new FilePrefetchBuffer(readahead_size_, readahead_size_)); + } + + return prefetch_buffer.get(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/prefetch_buffer_collection.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/prefetch_buffer_collection.h new file mode 100644 index 0000000000..b973eddc04 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/blob/prefetch_buffer_collection.h @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "file/file_prefetch_buffer.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// A class that owns a collection of FilePrefetchBuffers using the file number +// as key. Used for implementing compaction readahead for blob files. Designed +// to be accessed by a single thread only: every (sub)compaction needs its own +// buffers since they are guaranteed to read different blobs from different +// positions even when reading the same file. +class PrefetchBufferCollection { + public: + explicit PrefetchBufferCollection(uint64_t readahead_size) + : readahead_size_(readahead_size) { + assert(readahead_size_ > 0); + } + + FilePrefetchBuffer* GetOrCreatePrefetchBuffer(uint64_t file_number); + + private: + uint64_t readahead_size_; + std::unordered_map> + prefetch_buffers_; // maps file number to prefetch buffer +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/builder.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/builder.cc new file mode 100644 index 0000000000..a99bb57e8c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/builder.cc @@ -0,0 +1,447 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include "db/builder.h"
+
+#include <algorithm>
+#include <deque>
+#include <vector>
+
+#include "db/blob/blob_file_builder.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/event_helpers.h"
+#include "db/internal_stats.h"
+#include "db/merge_helper.h"
+#include "db/output_validator.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TableFactory;
+
+TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
+                              WritableFileWriter* file) {
+  assert((tboptions.column_family_id ==
+          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+         tboptions.column_family_name.empty());
+  return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file);
+}
+
+Status BuildTable(
+    const std::string& dbname, VersionSet* versions,
+    const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
+    const FileOptions& file_options, const ReadOptions& read_options,
+    TableCache* table_cache, InternalIterator* iter,
+    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+        range_del_iters,
+    FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
+    std::vector<SequenceNumber> snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SequenceNumber job_snapshot, SnapshotChecker* snapshot_checker,
+    bool paranoid_file_checks, InternalStats* internal_stats,
+    IOStatus* io_status, const std::shared_ptr<IOTracer>& io_tracer,
+    BlobFileCreationReason blob_creation_reason,
+    const SeqnoToTimeMapping& seqno_to_time_mapping, EventLogger* event_logger,
+    int job_id, const Env::IOPriority io_priority,
+    TableProperties* table_properties, Env::WriteLifeTimeHint write_hint,
+    const std::string* full_history_ts_low,
+    BlobFileCompletionCallback* blob_callback, Version* version,
+    uint64_t* num_input_entries, uint64_t* memtable_payload_bytes,
+    uint64_t* memtable_garbage_bytes) {
+  assert((tboptions.column_family_id ==
+          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+         tboptions.column_family_name.empty());
+  auto& mutable_cf_options = tboptions.moptions;
+  auto& ioptions = tboptions.ioptions;
+  // Report the IOStats for flush once for every following number of bytes.
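+  // (1048576 bytes = 1 MiB; IOSTATS(bytes_written) below is compared against
+  // this threshold before updating the thread's FLUSH_BYTES_WRITTEN
+  // property.)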
+ const size_t kReportFlushIOStatsEvery = 1048576; + OutputValidator output_validator( + tboptions.internal_comparator, + /*enable_order_check=*/ + mutable_cf_options.check_flush_compaction_key_order, + /*enable_hash=*/paranoid_file_checks); + Status s; + meta->fd.file_size = 0; + iter->SeekToFirst(); + std::unique_ptr range_del_agg( + new CompactionRangeDelAggregator(&tboptions.internal_comparator, + snapshots, full_history_ts_low)); + uint64_t num_unfragmented_tombstones = 0; + uint64_t total_tombstone_payload_bytes = 0; + for (auto& range_del_iter : range_del_iters) { + num_unfragmented_tombstones += + range_del_iter->num_unfragmented_tombstones(); + total_tombstone_payload_bytes += + range_del_iter->total_tombstone_payload_bytes(); + range_del_agg->AddTombstones(std::move(range_del_iter)); + } + + std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(), + meta->fd.GetPathId()); + std::vector blob_file_paths; + std::string file_checksum = kUnknownFileChecksum; + std::string file_checksum_func_name = kUnknownFileChecksumFuncName; + EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname, + tboptions.column_family_name, + fname, job_id, tboptions.reason); + Env* env = db_options.env; + assert(env); + FileSystem* fs = db_options.fs.get(); + assert(fs); + + TableProperties tp; + bool table_file_created = false; + if (iter->Valid() || !range_del_agg->IsEmpty()) { + std::unique_ptr compaction_filter; + if (ioptions.compaction_filter_factory != nullptr && + ioptions.compaction_filter_factory->ShouldFilterTableFileCreation( + tboptions.reason)) { + CompactionFilter::Context context; + context.is_full_compaction = false; + context.is_manual_compaction = false; + context.column_family_id = tboptions.column_family_id; + context.reason = tboptions.reason; + compaction_filter = + ioptions.compaction_filter_factory->CreateCompactionFilter(context); + if (compaction_filter != nullptr && + !compaction_filter->IgnoreSnapshots()) { + s.PermitUncheckedError(); + return Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + } + } + + TableBuilder* builder; + std::unique_ptr file_writer; + { + std::unique_ptr file; +#ifndef NDEBUG + bool use_direct_writes = file_options.use_direct_writes; + TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes); +#endif // !NDEBUG + IOStatus io_s = NewWritableFile(fs, fname, &file, file_options); + assert(s.ok()); + s = io_s; + if (io_status->ok()) { + *io_status = io_s; + } + if (!s.ok()) { + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger, ioptions.listeners, dbname, + tboptions.column_family_name, fname, job_id, meta->fd, + kInvalidBlobFileNumber, tp, tboptions.reason, s, file_checksum, + file_checksum_func_name); + return s; + } + + table_file_created = true; + FileTypeSet tmp_set = ioptions.checksum_handoff_file_types; + file->SetIOPriority(io_priority); + file->SetWriteLifeTimeHint(write_hint); + file_writer.reset(new WritableFileWriter( + std::move(file), fname, file_options, ioptions.clock, io_tracer, + ioptions.stats, ioptions.listeners, + ioptions.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); + + builder = NewTableBuilder(tboptions, file_writer.get()); + } + + auto ucmp = tboptions.internal_comparator.user_comparator(); + MergeHelper merge( + env, ucmp, ioptions.merge_operator.get(), compaction_filter.get(), + ioptions.logger, true /* internal key corruption is not ok */, + snapshots.empty() ? 
0 : snapshots.back(), snapshot_checker); + + std::unique_ptr blob_file_builder( + (mutable_cf_options.enable_blob_files && + tboptions.level_at_creation >= + mutable_cf_options.blob_file_starting_level && + blob_file_additions) + ? new BlobFileBuilder( + versions, fs, &ioptions, &mutable_cf_options, &file_options, + tboptions.db_id, tboptions.db_session_id, job_id, + tboptions.column_family_id, tboptions.column_family_name, + io_priority, write_hint, io_tracer, blob_callback, + blob_creation_reason, &blob_file_paths, blob_file_additions) + : nullptr); + + const std::atomic kManualCompactionCanceledFalse{false}; + CompactionIterator c_iter( + iter, ucmp, &merge, kMaxSequenceNumber, &snapshots, + earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env, + ShouldReportDetailedTime(env, ioptions.stats), + true /* internal key corruption is not ok */, range_del_agg.get(), + blob_file_builder.get(), ioptions.allow_data_in_errors, + ioptions.enforce_single_del_contracts, + /*manual_compaction_canceled=*/kManualCompactionCanceledFalse, + /*compaction=*/nullptr, compaction_filter.get(), + /*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low); + + c_iter.SeekToFirst(); + for (; c_iter.Valid(); c_iter.Next()) { + const Slice& key = c_iter.key(); + const Slice& value = c_iter.value(); + const ParsedInternalKey& ikey = c_iter.ikey(); + // Generate a rolling 64-bit hash of the key and values + // Note : + // Here "key" integrates 'sequence_number'+'kType'+'user key'. + s = output_validator.Add(key, value); + if (!s.ok()) { + break; + } + builder->Add(key, value); + + s = meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type); + if (!s.ok()) { + break; + } + + // TODO(noetzli): Update stats after flush, too. + if (io_priority == Env::IO_HIGH && + IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) { + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); + } + } + if (!s.ok()) { + c_iter.status().PermitUncheckedError(); + } else if (!c_iter.status().ok()) { + s = c_iter.status(); + } + + if (s.ok()) { + auto range_del_it = range_del_agg->NewIterator(); + Slice last_tombstone_start_user_key{}; + for (range_del_it->SeekToFirst(); range_del_it->Valid(); + range_del_it->Next()) { + auto tombstone = range_del_it->Tombstone(); + auto kv = tombstone.Serialize(); + builder->Add(kv.first.Encode(), kv.second); + InternalKey tombstone_end = tombstone.SerializeEndKey(); + meta->UpdateBoundariesForRange(kv.first, tombstone_end, tombstone.seq_, + tboptions.internal_comparator); + if (version) { + if (last_tombstone_start_user_key.empty() || + ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key, + range_del_it->start_key()) < 0) { + SizeApproximationOptions approx_opts; + approx_opts.files_size_error_margin = 0.1; + meta->compensated_range_deletion_size += versions->ApproximateSize( + approx_opts, read_options, version, kv.first.Encode(), + tombstone_end.Encode(), 0 /* start_level */, -1 /* end_level */, + TableReaderCaller::kFlush); + } + last_tombstone_start_user_key = range_del_it->start_key(); + } + } + } + + TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable"); + const bool empty = builder->IsEmpty(); + if (num_input_entries != nullptr) { + *num_input_entries = + c_iter.num_input_entry_scanned() + num_unfragmented_tombstones; + } + if (!s.ok() || empty) { + builder->Abandon(); + } else { + std::string seqno_time_mapping_str; + seqno_to_time_mapping.Encode( + seqno_time_mapping_str, meta->fd.smallest_seqno, + 
meta->fd.largest_seqno, meta->file_creation_time); + builder->SetSeqnoTimeTableProperties( + seqno_time_mapping_str, + ioptions.compaction_style == CompactionStyle::kCompactionStyleFIFO + ? meta->file_creation_time + : meta->oldest_ancester_time); + s = builder->Finish(); + } + if (io_status->ok()) { + *io_status = builder->io_status(); + } + + if (s.ok() && !empty) { + uint64_t file_size = builder->FileSize(); + meta->fd.file_size = file_size; + meta->tail_size = builder->GetTailSize(); + meta->marked_for_compaction = builder->NeedCompact(); + assert(meta->fd.GetFileSize() > 0); + tp = builder + ->GetTableProperties(); // refresh now that builder is finished + if (memtable_payload_bytes != nullptr && + memtable_garbage_bytes != nullptr) { + const CompactionIterationStats& ci_stats = c_iter.iter_stats(); + uint64_t total_payload_bytes = ci_stats.total_input_raw_key_bytes + + ci_stats.total_input_raw_value_bytes + + total_tombstone_payload_bytes; + uint64_t total_payload_bytes_written = + (tp.raw_key_size + tp.raw_value_size); + // Prevent underflow, which may still happen at this point + // since we only support inserts, deletes, and deleteRanges. + if (total_payload_bytes_written <= total_payload_bytes) { + *memtable_payload_bytes = total_payload_bytes; + *memtable_garbage_bytes = + total_payload_bytes - total_payload_bytes_written; + } else { + *memtable_payload_bytes = 0; + *memtable_garbage_bytes = 0; + } + } + if (table_properties) { + *table_properties = tp; + } + } + delete builder; + + // Finish and check for file errors + TEST_SYNC_POINT("BuildTable:BeforeSyncTable"); + if (s.ok() && !empty) { + StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS); + *io_status = file_writer->Sync(ioptions.use_fsync); + } + TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile"); + if (s.ok() && io_status->ok() && !empty) { + *io_status = file_writer->Close(); + } + if (s.ok() && io_status->ok() && !empty) { + // Add the checksum information to file metadata. + meta->file_checksum = file_writer->GetFileChecksum(); + meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName(); + file_checksum = meta->file_checksum; + file_checksum_func_name = meta->file_checksum_func_name; + // Set unique_id only if db_id and db_session_id exist + if (!tboptions.db_id.empty() && !tboptions.db_session_id.empty()) { + if (!GetSstInternalUniqueId(tboptions.db_id, tboptions.db_session_id, + meta->fd.GetNumber(), &(meta->unique_id)) + .ok()) { + // if failed to get unique id, just set it Null + meta->unique_id = kNullUniqueId64x2; + } + } + } + + if (s.ok()) { + s = *io_status; + } + + if (blob_file_builder) { + if (s.ok()) { + s = blob_file_builder->Finish(); + } else { + blob_file_builder->Abandon(s); + } + blob_file_builder.reset(); + } + + // TODO Also check the IO status when create the Iterator. + + TEST_SYNC_POINT("BuildTable:BeforeOutputValidation"); + if (s.ok() && !empty) { + // Verify that the table is usable + // We set for_compaction to false and don't OptimizeForCompactionTableRead + // here because this is a special case after we finish the table building. + // No matter whether use_direct_io_for_flush_and_compaction is true, + // the goal is to cache it here for further user reads. + std::unique_ptr it(table_cache->NewIterator( + read_options, file_options, tboptions.internal_comparator, *meta, + nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor, + nullptr, + (internal_stats == nullptr) ? 
nullptr + : internal_stats->GetFileReadHist(0), + TableReaderCaller::kFlush, /*arena=*/nullptr, + /*skip_filter=*/false, tboptions.level_at_creation, + MaxFileSizeForL0MetaPin(mutable_cf_options), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key*/ nullptr, + /*allow_unprepared_value*/ false, + mutable_cf_options.block_protection_bytes_per_key)); + s = it->status(); + if (s.ok() && paranoid_file_checks) { + OutputValidator file_validator(tboptions.internal_comparator, + /*enable_order_check=*/true, + /*enable_hash=*/true); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + // Generate a rolling 64-bit hash of the key and values + file_validator.Add(it->key(), it->value()).PermitUncheckedError(); + } + s = it->status(); + if (s.ok() && !output_validator.CompareValidator(file_validator)) { + s = Status::Corruption("Paranoid checksums do not match"); + } + } + } + } + + // Check for input iterator errors + if (!iter->status().ok()) { + s = iter->status(); + } + + if (!s.ok() || meta->fd.GetFileSize() == 0) { + TEST_SYNC_POINT("BuildTable:BeforeDeleteFile"); + + constexpr IODebugContext* dbg = nullptr; + + if (table_file_created) { + Status ignored = fs->DeleteFile(fname, IOOptions(), dbg); + ignored.PermitUncheckedError(); + } + + assert(blob_file_additions || blob_file_paths.empty()); + + if (blob_file_additions) { + for (const std::string& blob_file_path : blob_file_paths) { + Status ignored = DeleteDBFile(&db_options, blob_file_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); + ignored.PermitUncheckedError(); + TEST_SYNC_POINT("BuildTable::AfterDeleteFile"); + } + } + } + + Status status_for_listener = s; + if (meta->fd.GetFileSize() == 0) { + fname = "(nil)"; + if (s.ok()) { + status_for_listener = Status::Aborted("Empty SST file not kept"); + } + } + // Output to event logger and fire events. + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger, ioptions.listeners, dbname, tboptions.column_family_name, + fname, job_id, meta->fd, meta->oldest_blob_file_number, tp, + tboptions.reason, status_for_listener, file_checksum, + file_checksum_func_name); + + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/builder.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/builder.h new file mode 100644 index 0000000000..6a6a1866a1 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/builder.h @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once +#include +#include +#include + +#include "db/range_tombstone_fragmenter.h" +#include "db/seqno_to_time_mapping.h" +#include "db/table_properties_collector.h" +#include "db/version_set.h" +#include "logging/event_logger.h" +#include "options/cf_options.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/types.h" +#include "table/scoped_arena_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +struct FileMetaData; + +class VersionSet; +class BlobFileAddition; +class SnapshotChecker; +class TableCache; +class TableBuilder; +class WritableFileWriter; +class InternalStats; +class BlobFileCompletionCallback; + +// Convenience function for NewTableBuilder on the embedded table_factory. +TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, + WritableFileWriter* file); + +// Build a Table file from the contents of *iter. The generated file +// will be named according to number specified in meta. On success, the rest of +// *meta will be filled with metadata about the generated table. +// If no data is present in *iter, meta->file_size will be set to +// zero, and no Table file will be produced. +// +// @param column_family_name Name of the column family that is also identified +// by column_family_id, or empty string if unknown. +extern Status BuildTable( + const std::string& dbname, VersionSet* versions, + const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, + const FileOptions& file_options, const ReadOptions& read_options, + TableCache* table_cache, InternalIterator* iter, + std::vector> + range_del_iters, + FileMetaData* meta, std::vector* blob_file_additions, + std::vector snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SequenceNumber job_snapshot, SnapshotChecker* snapshot_checker, + bool paranoid_file_checks, InternalStats* internal_stats, + IOStatus* io_status, const std::shared_ptr& io_tracer, + BlobFileCreationReason blob_creation_reason, + const SeqnoToTimeMapping& seqno_to_time_mapping, + EventLogger* event_logger = nullptr, int job_id = 0, + const Env::IOPriority io_priority = Env::IO_HIGH, + TableProperties* table_properties = nullptr, + Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET, + const std::string* full_history_ts_low = nullptr, + BlobFileCompletionCallback* blob_callback = nullptr, + Version* version = nullptr, uint64_t* num_input_entries = nullptr, + uint64_t* memtable_payload_bytes = nullptr, + uint64_t* memtable_garbage_bytes = nullptr); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/c.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/c.cc new file mode 100644 index 0000000000..dde8f69fbe --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/c.cc @@ -0,0 +1,6559 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/c.h" + +#include +#include +#include +#include + +#include "port/port.h" +#include "rocksdb/advanced_cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/comparator.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/experimental.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/universal_compaction.h" +#include "rocksdb/utilities/backup_engine.h" +#include "rocksdb/utilities/checkpoint.h" +#include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/memory_util.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/options_util.h" +#include "rocksdb/utilities/table_properties_collectors.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "rocksdb/write_batch.h" +#include "utilities/merge_operators.h" + +using ROCKSDB_NAMESPACE::BackupEngine; +using ROCKSDB_NAMESPACE::BackupEngineOptions; +using ROCKSDB_NAMESPACE::BackupID; +using ROCKSDB_NAMESPACE::BackupInfo; +using ROCKSDB_NAMESPACE::BatchResult; +using ROCKSDB_NAMESPACE::BlockBasedTableOptions; +using ROCKSDB_NAMESPACE::BottommostLevelCompaction; +using ROCKSDB_NAMESPACE::BytewiseComparator; +using ROCKSDB_NAMESPACE::Cache; +using ROCKSDB_NAMESPACE::Checkpoint; +using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor; +using ROCKSDB_NAMESPACE::ColumnFamilyHandle; +using ROCKSDB_NAMESPACE::ColumnFamilyMetaData; +using ROCKSDB_NAMESPACE::ColumnFamilyOptions; +using ROCKSDB_NAMESPACE::CompactionFilter; +using ROCKSDB_NAMESPACE::CompactionFilterFactory; +using ROCKSDB_NAMESPACE::CompactionOptionsFIFO; +using ROCKSDB_NAMESPACE::CompactRangeOptions; +using ROCKSDB_NAMESPACE::Comparator; +using ROCKSDB_NAMESPACE::CompressionType; +using ROCKSDB_NAMESPACE::ConfigOptions; +using ROCKSDB_NAMESPACE::CuckooTableOptions; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::DBOptions; +using ROCKSDB_NAMESPACE::DbPath; +using ROCKSDB_NAMESPACE::Env; +using ROCKSDB_NAMESPACE::EnvOptions; +using ROCKSDB_NAMESPACE::FileLock; +using ROCKSDB_NAMESPACE::FilterPolicy; +using ROCKSDB_NAMESPACE::FlushOptions; +using ROCKSDB_NAMESPACE::HyperClockCacheOptions; +using ROCKSDB_NAMESPACE::InfoLogLevel; +using ROCKSDB_NAMESPACE::IngestExternalFileOptions; +using ROCKSDB_NAMESPACE::Iterator; +using ROCKSDB_NAMESPACE::LevelMetaData; +using ROCKSDB_NAMESPACE::LiveFileMetaData; +using ROCKSDB_NAMESPACE::Logger; +using ROCKSDB_NAMESPACE::LRUCacheOptions; +using ROCKSDB_NAMESPACE::MemoryAllocator; +using ROCKSDB_NAMESPACE::MemoryUtil; +using ROCKSDB_NAMESPACE::MergeOperator; +using ROCKSDB_NAMESPACE::NewBloomFilterPolicy; +using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory; +using ROCKSDB_NAMESPACE::NewGenericRateLimiter; +using ROCKSDB_NAMESPACE::NewLRUCache; +using ROCKSDB_NAMESPACE::NewRibbonFilterPolicy; +using ROCKSDB_NAMESPACE::OptimisticTransactionDB; +using ROCKSDB_NAMESPACE::OptimisticTransactionOptions; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::PerfContext; +using ROCKSDB_NAMESPACE::PerfLevel; +using ROCKSDB_NAMESPACE::PinnableSlice; +using 
+using ROCKSDB_NAMESPACE::PrepopulateBlobCache;
+using ROCKSDB_NAMESPACE::RandomAccessFile;
+using ROCKSDB_NAMESPACE::Range;
+using ROCKSDB_NAMESPACE::RateLimiter;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::RestoreOptions;
+using ROCKSDB_NAMESPACE::SequentialFile;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::SliceParts;
+using ROCKSDB_NAMESPACE::SliceTransform;
+using ROCKSDB_NAMESPACE::Snapshot;
+using ROCKSDB_NAMESPACE::SstFileMetaData;
+using ROCKSDB_NAMESPACE::SstFileWriter;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory;
+using ROCKSDB_NAMESPACE::Transaction;
+using ROCKSDB_NAMESPACE::TransactionDB;
+using ROCKSDB_NAMESPACE::TransactionDBOptions;
+using ROCKSDB_NAMESPACE::TransactionLogIterator;
+using ROCKSDB_NAMESPACE::TransactionOptions;
+using ROCKSDB_NAMESPACE::WALRecoveryMode;
+using ROCKSDB_NAMESPACE::WritableFile;
+using ROCKSDB_NAMESPACE::WriteBatch;
+using ROCKSDB_NAMESPACE::WriteBatchWithIndex;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+using std::unordered_set;
+using std::vector;
+
+extern "C" {
+
+struct rocksdb_t {
+  DB* rep;
+};
+struct rocksdb_backup_engine_t {
+  BackupEngine* rep;
+};
+struct rocksdb_backup_engine_info_t {
+  std::vector<BackupInfo> rep;
+};
+struct rocksdb_restore_options_t {
+  RestoreOptions rep;
+};
+struct rocksdb_iterator_t {
+  Iterator* rep;
+};
+struct rocksdb_writebatch_t {
+  WriteBatch rep;
+};
+struct rocksdb_writebatch_wi_t {
+  WriteBatchWithIndex* rep;
+};
+struct rocksdb_snapshot_t {
+  const Snapshot* rep;
+};
+struct rocksdb_flushoptions_t {
+  FlushOptions rep;
+};
+struct rocksdb_fifo_compaction_options_t {
+  CompactionOptionsFIFO rep;
+};
+struct rocksdb_readoptions_t {
+  ReadOptions rep;
+  // stack variables to set pointers to in ReadOptions
+  Slice upper_bound;
+  Slice lower_bound;
+  Slice timestamp;
+  Slice iter_start_ts;
+};
+struct rocksdb_writeoptions_t {
+  WriteOptions rep;
+};
+struct rocksdb_options_t {
+  Options rep;
+};
+struct rocksdb_compactoptions_t {
+  CompactRangeOptions rep;
+  Slice full_history_ts_low;
+};
+struct rocksdb_block_based_table_options_t {
+  BlockBasedTableOptions rep;
+};
+struct rocksdb_cuckoo_table_options_t {
+  CuckooTableOptions rep;
+};
+struct rocksdb_seqfile_t {
+  SequentialFile* rep;
+};
+struct rocksdb_randomfile_t {
+  RandomAccessFile* rep;
+};
+struct rocksdb_writablefile_t {
+  WritableFile* rep;
+};
+struct rocksdb_wal_iterator_t {
+  TransactionLogIterator* rep;
+};
+struct rocksdb_wal_readoptions_t {
+  TransactionLogIterator::ReadOptions rep;
+};
+struct rocksdb_filelock_t {
+  FileLock* rep;
+};
+struct rocksdb_logger_t {
+  std::shared_ptr<Logger> rep;
+};
+struct rocksdb_lru_cache_options_t {
+  LRUCacheOptions rep;
+};
+struct rocksdb_hyper_clock_cache_options_t {
+  HyperClockCacheOptions rep;
+};
+struct rocksdb_memory_allocator_t {
+  std::shared_ptr<MemoryAllocator> rep;
+};
+struct rocksdb_cache_t {
+  std::shared_ptr<Cache> rep;
+};
+struct rocksdb_livefiles_t {
+  std::vector<LiveFileMetaData> rep;
+};
+struct rocksdb_column_family_handle_t {
+  ColumnFamilyHandle* rep;
+};
+struct rocksdb_column_family_metadata_t {
+  ColumnFamilyMetaData rep;
+};
+struct rocksdb_level_metadata_t {
+  const LevelMetaData* rep;
+};
+struct rocksdb_sst_file_metadata_t {
+  const SstFileMetaData* rep;
+};
+struct rocksdb_envoptions_t {
+  EnvOptions rep;
+};
+struct rocksdb_ingestexternalfileoptions_t {
+  IngestExternalFileOptions rep;
+};
+struct rocksdb_sstfilewriter_t {
+  SstFileWriter* rep;
+};
+struct rocksdb_ratelimiter_t {
+  std::shared_ptr<RateLimiter> rep;
+};
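A note on the rocksdb_readoptions_t wrapper defined above: ReadOptions itself holds only pointers for iterate bounds, so the wrapper embeds upper_bound/lower_bound Slice storage and points rep at it; the bound's bytes still belong to the caller and must outlive any reads that use the options. A bounded-iteration sketch from C, reusing the hypothetical db handle from the earlier sketch:

    /* Scan keys in ["a", "m") only. The bound is not copied deeply:
       the wrapper's Slice keeps the caller's pointer, so a string
       literal (static storage) is safe here. */
    rocksdb_readoptions_t *ropts = rocksdb_readoptions_create();
    rocksdb_readoptions_set_iterate_upper_bound(ropts, "m", 1);

    rocksdb_iterator_t *it = rocksdb_create_iterator(db, ropts);
    for (rocksdb_iter_seek(it, "a", 1); rocksdb_iter_valid(it);
         rocksdb_iter_next(it)) {
      size_t klen;
      const char *k = rocksdb_iter_key(it, &klen); /* borrowed, not owned */
      (void)k; /* copy the bytes if they must outlive the iterator */
    }
    rocksdb_iter_destroy(it);
    rocksdb_readoptions_destroy(ropts);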
+struct rocksdb_perfcontext_t {
+  PerfContext* rep;
+};
+struct rocksdb_pinnableslice_t {
+  PinnableSlice rep;
+};
+struct rocksdb_transactiondb_options_t {
+  TransactionDBOptions rep;
+};
+struct rocksdb_transactiondb_t {
+  TransactionDB* rep;
+};
+struct rocksdb_transaction_options_t {
+  TransactionOptions rep;
+};
+struct rocksdb_transaction_t {
+  Transaction* rep;
+};
+struct rocksdb_backup_engine_options_t {
+  BackupEngineOptions rep;
+};
+struct rocksdb_checkpoint_t {
+  Checkpoint* rep;
+};
+struct rocksdb_optimistictransactiondb_t {
+  OptimisticTransactionDB* rep;
+};
+struct rocksdb_optimistictransaction_options_t {
+  OptimisticTransactionOptions rep;
+};
+
+struct rocksdb_compactionfiltercontext_t {
+  CompactionFilter::Context rep;
+};
+
+struct rocksdb_compactionfilter_t : public CompactionFilter {
+  void* state_;
+  void (*destructor_)(void*);
+  unsigned char (*filter_)(void*, int level, const char* key, size_t key_length,
+                           const char* existing_value, size_t value_length,
+                           char** new_value, size_t* new_value_length,
+                           unsigned char* value_changed);
+  const char* (*name_)(void*);
+  unsigned char ignore_snapshots_;
+
+  ~rocksdb_compactionfilter_t() override { (*destructor_)(state_); }
+
+  bool Filter(int level, const Slice& key, const Slice& existing_value,
+              std::string* new_value, bool* value_changed) const override {
+    char* c_new_value = nullptr;
+    size_t new_value_length = 0;
+    unsigned char c_value_changed = 0;
+    unsigned char result =
+        (*filter_)(state_, level, key.data(), key.size(), existing_value.data(),
+                   existing_value.size(), &c_new_value, &new_value_length,
+                   &c_value_changed);
+    if (c_value_changed) {
+      new_value->assign(c_new_value, new_value_length);
+      *value_changed = true;
+    }
+    return result;
+  }
+
+  const char* Name() const override { return (*name_)(state_); }
+
+  bool IgnoreSnapshots() const override { return ignore_snapshots_; }
+};
+
+struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory {
+  void* state_;
+  void (*destructor_)(void*);
+  rocksdb_compactionfilter_t* (*create_compaction_filter_)(
+      void*, rocksdb_compactionfiltercontext_t* context);
+  const char* (*name_)(void*);
+
+  ~rocksdb_compactionfilterfactory_t() override { (*destructor_)(state_); }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    rocksdb_compactionfiltercontext_t ccontext;
+    ccontext.rep = context;
+    CompactionFilter* cf = (*create_compaction_filter_)(state_, &ccontext);
+    return std::unique_ptr<CompactionFilter>(cf);
+  }
+
+  const char* Name() const override { return (*name_)(state_); }
+};
+
+struct rocksdb_comparator_t : public Comparator {
+  void* state_;
+  void (*destructor_)(void*);
+  int (*compare_)(void*, const char* a, size_t alen, const char* b,
+                  size_t blen);
+  const char* (*name_)(void*);
+  int (*compare_ts_)(void*, const char* a_ts, size_t a_tslen, const char* b_ts,
+                     size_t b_tslen);
+  int (*compare_without_ts_)(void*, const char* a, size_t alen,
+                             unsigned char a_has_ts, const char* b, size_t blen,
+                             unsigned char b_has_ts);
+
+  rocksdb_comparator_t() : Comparator() {}
+
+  rocksdb_comparator_t(size_t ts_size) : Comparator(ts_size) {}
+
+  ~rocksdb_comparator_t() override { (*destructor_)(state_); }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+  }
+
+  int CompareTimestamp(const Slice& a_ts, const Slice& b_ts) const override {
+    if (compare_ts_ == nullptr) {
+      return 0;
+    }
+    return (*compare_ts_)(state_, a_ts.data(), a_ts.size(), b_ts.data(),
+                          b_ts.size());
+  }
+
+  int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
+                              bool b_has_ts) const override {
+    if (compare_without_ts_ == nullptr) {
+      return Compare(a, b);
+    }
+    return (*compare_without_ts_)(state_, a.data(), a.size(), a_has_ts,
+                                  b.data(), b.size(), b_has_ts);
+  }
+
+  const char* Name() const override { return (*name_)(state_); }
+
+  // No-ops since the C binding does not support key shortening methods.
+  void FindShortestSeparator(std::string*, const Slice&) const override {}
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+struct rocksdb_filterpolicy_t : public FilterPolicy {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+
+  ~rocksdb_filterpolicy_t() override { (*destructor_)(state_); }
+
+  const char* Name() const override { return (*name_)(state_); }
+};
+
+struct rocksdb_mergeoperator_t : public MergeOperator {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*full_merge_)(void*, const char* key, size_t key_length,
+                       const char* existing_value, size_t existing_value_length,
+                       const char* const* operands_list,
+                       const size_t* operands_list_length, int num_operands,
+                       unsigned char* success, size_t* new_value_length);
+  char* (*partial_merge_)(void*, const char* key, size_t key_length,
+                          const char* const* operands_list,
+                          const size_t* operands_list_length, int num_operands,
+                          unsigned char* success, size_t* new_value_length);
+  void (*delete_value_)(void*, const char* value, size_t value_length);
+
+  ~rocksdb_mergeoperator_t() override { (*destructor_)(state_); }
+
+  const char* Name() const override { return (*name_)(state_); }
+
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    size_t n = merge_in.operand_list.size();
+    std::vector<const char*> operand_pointers(n);
+    std::vector<size_t> operand_sizes(n);
+    for (size_t i = 0; i < n; i++) {
+      Slice operand(merge_in.operand_list[i]);
+      operand_pointers[i] = operand.data();
+      operand_sizes[i] = operand.size();
+    }
+
+    const char* existing_value_data = nullptr;
+    size_t existing_value_len = 0;
+    if (merge_in.existing_value != nullptr) {
+      existing_value_data = merge_in.existing_value->data();
+      existing_value_len = merge_in.existing_value->size();
+    }
+
+    unsigned char success;
+    size_t new_value_len;
+    char* tmp_new_value = (*full_merge_)(
+        state_, merge_in.key.data(), merge_in.key.size(), existing_value_data,
+        existing_value_len, &operand_pointers[0], &operand_sizes[0],
+        static_cast<int>(n), &success, &new_value_len);
+    merge_out->new_value.assign(tmp_new_value, new_value_len);
+
+    if (delete_value_ != nullptr) {
+      (*delete_value_)(state_, tmp_new_value, new_value_len);
+    } else {
+      free(tmp_new_value);
+    }
+
+    return success;
+  }
+
+  bool PartialMergeMulti(const Slice& key,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value,
+                         Logger* /*logger*/) const override {
+    size_t operand_count = operand_list.size();
+    std::vector<const char*> operand_pointers(operand_count);
+    std::vector<size_t> operand_sizes(operand_count);
+    for (size_t i = 0; i < operand_count; ++i) {
+      Slice operand(operand_list[i]);
+      operand_pointers[i] = operand.data();
+      operand_sizes[i] = operand.size();
+    }
+
+    unsigned char success;
+    size_t new_value_len;
+    char* tmp_new_value = (*partial_merge_)(
+        state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0],
+        static_cast<int>(operand_count), &success, &new_value_len);
+    new_value->assign(tmp_new_value, new_value_len);
+
+    if (delete_value_ != nullptr) {
(*delete_value_)(state_, tmp_new_value, new_value_len); + } else { + free(tmp_new_value); + } + + return success; + } +}; + +struct rocksdb_dbpath_t { + DbPath rep; +}; + +struct rocksdb_env_t { + Env* rep; + bool is_default; +}; + +struct rocksdb_slicetransform_t : public SliceTransform { + void* state_; + void (*destructor_)(void*); + const char* (*name_)(void*); + char* (*transform_)(void*, const char* key, size_t length, + size_t* dst_length); + unsigned char (*in_domain_)(void*, const char* key, size_t length); + unsigned char (*in_range_)(void*, const char* key, size_t length); + + ~rocksdb_slicetransform_t() override { (*destructor_)(state_); } + + const char* Name() const override { return (*name_)(state_); } + + Slice Transform(const Slice& src) const override { + size_t len; + char* dst = (*transform_)(state_, src.data(), src.size(), &len); + return Slice(dst, len); + } + + bool InDomain(const Slice& src) const override { + return (*in_domain_)(state_, src.data(), src.size()); + } + + bool InRange(const Slice& src) const override { + return (*in_range_)(state_, src.data(), src.size()); + } +}; + +struct rocksdb_universal_compaction_options_t { + ROCKSDB_NAMESPACE::CompactionOptionsUniversal* rep; +}; + +static bool SaveError(char** errptr, const Status& s) { + assert(errptr != nullptr); + if (s.ok()) { + return false; + } else if (*errptr == nullptr) { + *errptr = strdup(s.ToString().c_str()); + } else { + // TODO(sanjay): Merge with existing error? + // This is a bug if *errptr is not created by malloc() + free(*errptr); + *errptr = strdup(s.ToString().c_str()); + } + return true; +} + +static char* CopyString(const std::string& str) { + char* result = reinterpret_cast(malloc(sizeof(char) * str.size())); + memcpy(result, str.data(), sizeof(char) * str.size()); + return result; +} + +rocksdb_t* rocksdb_open(const rocksdb_options_t* options, const char* name, + char** errptr) { + DB* db; + if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + +rocksdb_t* rocksdb_open_with_ttl(const rocksdb_options_t* options, + const char* name, int ttl, char** errptr) { + ROCKSDB_NAMESPACE::DBWithTTL* db; + if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open( + options->rep, std::string(name), &db, ttl))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + +rocksdb_t* rocksdb_open_for_read_only(const rocksdb_options_t* options, + const char* name, + unsigned char error_if_wal_file_exists, + char** errptr) { + DB* db; + if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name), + &db, error_if_wal_file_exists))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + +rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options, + const char* name, + const char* secondary_path, + char** errptr) { + DB* db; + if (SaveError(errptr, + DB::OpenAsSecondary(options->rep, std::string(name), + std::string(secondary_path), &db))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + +rocksdb_backup_engine_t* rocksdb_backup_engine_open( + const rocksdb_options_t* options, const char* path, char** errptr) { + BackupEngine* be; + if (SaveError(errptr, BackupEngine::Open( + options->rep.env, + BackupEngineOptions(path, nullptr, true, + options->rep.info_log.get()), + &be))) { + return nullptr; + } + 
rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t; + result->rep = be; + return result; +} + +rocksdb_backup_engine_t* rocksdb_backup_engine_open_opts( + const rocksdb_backup_engine_options_t* options, rocksdb_env_t* env, + char** errptr) { + BackupEngine* be; + if (SaveError(errptr, BackupEngine::Open(options->rep, env->rep, &be))) { + return nullptr; + } + rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t; + result->rep = be; + return result; +} + +void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be, + rocksdb_t* db, char** errptr) { + SaveError(errptr, be->rep->CreateNewBackup(db->rep)); +} + +void rocksdb_backup_engine_create_new_backup_flush( + rocksdb_backup_engine_t* be, rocksdb_t* db, + unsigned char flush_before_backup, char** errptr) { + SaveError(errptr, be->rep->CreateNewBackup(db->rep, flush_before_backup)); +} + +void rocksdb_backup_engine_purge_old_backups(rocksdb_backup_engine_t* be, + uint32_t num_backups_to_keep, + char** errptr) { + SaveError(errptr, be->rep->PurgeOldBackups(num_backups_to_keep)); +} + +rocksdb_restore_options_t* rocksdb_restore_options_create() { + return new rocksdb_restore_options_t; +} + +void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) { + delete opt; +} + +void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt, + int v) { + opt->rep.keep_log_files = v; +} + +void rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be, + uint32_t backup_id, char** errptr) { + SaveError(errptr, be->rep->VerifyBackup(static_cast(backup_id))); +} + +void rocksdb_backup_engine_restore_db_from_latest_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, char** errptr) { + SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir), + std::string(wal_dir), + restore_options->rep)); +} + +void rocksdb_backup_engine_restore_db_from_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, const uint32_t backup_id, + char** errptr) { + SaveError(errptr, be->rep->RestoreDBFromBackup(backup_id, std::string(db_dir), + std::string(wal_dir), + restore_options->rep)); +} + +const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info( + rocksdb_backup_engine_t* be) { + rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t; + be->rep->GetBackupInfo(&result->rep); + return result; +} + +int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) { + return static_cast(info->rep.size()); +} + +int64_t rocksdb_backup_engine_info_timestamp( + const rocksdb_backup_engine_info_t* info, int index) { + return info->rep[index].timestamp; +} + +uint32_t rocksdb_backup_engine_info_backup_id( + const rocksdb_backup_engine_info_t* info, int index) { + return info->rep[index].backup_id; +} + +uint64_t rocksdb_backup_engine_info_size( + const rocksdb_backup_engine_info_t* info, int index) { + return info->rep[index].size; +} + +uint32_t rocksdb_backup_engine_info_number_files( + const rocksdb_backup_engine_info_t* info, int index) { + return info->rep[index].number_files; +} + +void rocksdb_backup_engine_info_destroy( + const rocksdb_backup_engine_info_t* info) { + delete info; +} + +void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) { + delete be->rep; + delete be; +} + +rocksdb_backup_engine_options_t* rocksdb_backup_engine_options_create( + const char* backup_dir) { + return 
new rocksdb_backup_engine_options_t{ + BackupEngineOptions(std::string(backup_dir))}; +} + +void rocksdb_backup_engine_options_set_backup_dir( + rocksdb_backup_engine_options_t* options, const char* backup_dir) { + options->rep.backup_dir = std::string(backup_dir); +} + +void rocksdb_backup_engine_options_set_env( + rocksdb_backup_engine_options_t* options, rocksdb_env_t* env) { + options->rep.backup_env = (env ? env->rep : nullptr); +} + +void rocksdb_backup_engine_options_set_share_table_files( + rocksdb_backup_engine_options_t* options, unsigned char val) { + options->rep.share_table_files = val; +} + +unsigned char rocksdb_backup_engine_options_get_share_table_files( + rocksdb_backup_engine_options_t* options) { + return options->rep.share_table_files; +} + +void rocksdb_backup_engine_options_set_sync( + rocksdb_backup_engine_options_t* options, unsigned char val) { + options->rep.sync = val; +} + +unsigned char rocksdb_backup_engine_options_get_sync( + rocksdb_backup_engine_options_t* options) { + return options->rep.sync; +} + +void rocksdb_backup_engine_options_set_destroy_old_data( + rocksdb_backup_engine_options_t* options, unsigned char val) { + options->rep.destroy_old_data = val; +} + +unsigned char rocksdb_backup_engine_options_get_destroy_old_data( + rocksdb_backup_engine_options_t* options) { + return options->rep.destroy_old_data; +} + +void rocksdb_backup_engine_options_set_backup_log_files( + rocksdb_backup_engine_options_t* options, unsigned char val) { + options->rep.backup_log_files = val; +} + +unsigned char rocksdb_backup_engine_options_get_backup_log_files( + rocksdb_backup_engine_options_t* options) { + return options->rep.backup_log_files; +} + +void rocksdb_backup_engine_options_set_backup_rate_limit( + rocksdb_backup_engine_options_t* options, uint64_t limit) { + options->rep.backup_rate_limit = limit; +} + +uint64_t rocksdb_backup_engine_options_get_backup_rate_limit( + rocksdb_backup_engine_options_t* options) { + return options->rep.backup_rate_limit; +} + +void rocksdb_backup_engine_options_set_restore_rate_limit( + rocksdb_backup_engine_options_t* options, uint64_t limit) { + options->rep.restore_rate_limit = limit; +} + +uint64_t rocksdb_backup_engine_options_get_restore_rate_limit( + rocksdb_backup_engine_options_t* options) { + return options->rep.restore_rate_limit; +} + +void rocksdb_backup_engine_options_set_max_background_operations( + rocksdb_backup_engine_options_t* options, int val) { + options->rep.max_background_operations = val; +} + +int rocksdb_backup_engine_options_get_max_background_operations( + rocksdb_backup_engine_options_t* options) { + return options->rep.max_background_operations; +} + +void rocksdb_backup_engine_options_set_callback_trigger_interval_size( + rocksdb_backup_engine_options_t* options, uint64_t size) { + options->rep.callback_trigger_interval_size = size; +} + +uint64_t rocksdb_backup_engine_options_get_callback_trigger_interval_size( + rocksdb_backup_engine_options_t* options) { + return options->rep.callback_trigger_interval_size; +} + +void rocksdb_backup_engine_options_set_max_valid_backups_to_open( + rocksdb_backup_engine_options_t* options, int val) { + options->rep.max_valid_backups_to_open = val; +} + +int rocksdb_backup_engine_options_get_max_valid_backups_to_open( + rocksdb_backup_engine_options_t* options) { + return options->rep.max_valid_backups_to_open; +} + +void rocksdb_backup_engine_options_set_share_files_with_checksum_naming( + rocksdb_backup_engine_options_t* options, int val) { + 
options->rep.share_files_with_checksum_naming =
+      static_cast<BackupEngineOptions::ShareFilesNaming>(val);
+}
+
+int rocksdb_backup_engine_options_get_share_files_with_checksum_naming(
+    rocksdb_backup_engine_options_t* options) {
+  return static_cast<int>(options->rep.share_files_with_checksum_naming);
+}
+
+void rocksdb_backup_engine_options_destroy(
+    rocksdb_backup_engine_options_t* options) {
+  delete options;
+}
+
+rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db,
+                                                       char** errptr) {
+  Checkpoint* checkpoint;
+  if (SaveError(errptr, Checkpoint::Create(db->rep, &checkpoint))) {
+    return nullptr;
+  }
+  rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+  result->rep = checkpoint;
+  return result;
+}
+
+void rocksdb_checkpoint_create(rocksdb_checkpoint_t* checkpoint,
+                               const char* checkpoint_dir,
+                               uint64_t log_size_for_flush, char** errptr) {
+  SaveError(errptr, checkpoint->rep->CreateCheckpoint(
+                        std::string(checkpoint_dir), log_size_for_flush));
+}
+
+void rocksdb_checkpoint_object_destroy(rocksdb_checkpoint_t* checkpoint) {
+  delete checkpoint->rep;
+  delete checkpoint;
+}
+
+void rocksdb_close(rocksdb_t* db) {
+  delete db->rep;
+  delete db;
+}
+
+void rocksdb_options_set_uint64add_merge_operator(rocksdb_options_t* opt) {
+  opt->rep.merge_operator =
+      ROCKSDB_NAMESPACE::MergeOperators::CreateUInt64AddOperator();
+}
+
+rocksdb_t* rocksdb_open_and_trim_history(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char* trim_ts,
+    size_t trim_tslen, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  std::string trim_ts_(trim_ts, trim_tslen);
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::OpenAndTrimHistory(
+                            DBOptions(db_options->rep), std::string(name),
+                            column_families, &handles, &db, trim_ts_))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::Open(DBOptions(db_options->rep), std::string(name),
+                                 column_families, &handles, &db))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
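The column-family open call above takes parallel arrays of names, options, and out-handles. A hedged sketch of driving it from C, reusing the hypothetical opts handle and path from the first sketch and assuming the families already exist on disk:

    char *err = NULL;
    size_t ncf = 0;
    char **names =
        rocksdb_list_column_families(opts, "/tmp/c-api-demo", &ncf, &err);

    /* one options pointer per family; sharing `opts` is fine here */
    const rocksdb_options_t **cf_opts = malloc(ncf * sizeof(*cf_opts));
    rocksdb_column_family_handle_t **handles = malloc(ncf * sizeof(*handles));
    for (size_t i = 0; i < ncf; i++) cf_opts[i] = opts;

    rocksdb_t *db = rocksdb_open_column_families(
        opts, "/tmp/c-api-demo", (int)ncf, (const char *const *)names,
        (const rocksdb_options_t *const *)cf_opts, handles, &err);

    /* ... use handles[i] with the *_cf entry points ... */
    rocksdb_list_column_families_destroy(names, ncf);
    free(cf_opts);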
+rocksdb_t* rocksdb_open_column_families_with_ttl(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, const int* ttls,
+    char** errptr) {
+  std::vector<int32_t> ttls_vec;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    ttls_vec.push_back(ttls[i]);
+
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  ROCKSDB_NAMESPACE::DBWithTTL* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open(
+                            DBOptions(db_options->rep), std::string(name),
+                            column_families, &handles, &db, ttls_vec))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles,
+    unsigned char error_if_wal_file_exists, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr,
+                DB::OpenForReadOnly(DBOptions(db_options->rep),
+                                    std::string(name), column_families,
+                                    &handles, &db, error_if_wal_file_exists))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    const char* secondary_path, int num_column_families,
+    const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i != num_column_families; ++i) {
+    column_families.emplace_back(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep));
+  }
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep),
+                                            std::string(name),
+                                            std::string(secondary_path),
+                                            column_families, &handles, &db))) {
+    return nullptr;
+  }
+  for (size_t i = 0; i != handles.size(); ++i) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+char** rocksdb_list_column_families(const rocksdb_options_t* options,
+                                    const char* name, size_t* lencfs,
+                                    char** errptr) {
+  std::vector<std::string> fams;
+  SaveError(errptr, DB::ListColumnFamilies(DBOptions(options->rep),
+                                           std::string(name), &fams));
+
+  *lencfs = fams.size();
+  char** column_families =
+      static_cast<char**>(malloc(sizeof(char*) * fams.size()));
+  for (size_t i = 0; i < fams.size(); i++) {
column_families[i] = strdup(fams[i].c_str()); + } + return column_families; +} + +void rocksdb_list_column_families_destroy(char** list, size_t len) { + for (size_t i = 0; i < len; ++i) { + free(list[i]); + } + free(list); +} + +rocksdb_column_family_handle_t* rocksdb_create_column_family( + rocksdb_t* db, const rocksdb_options_t* column_family_options, + const char* column_family_name, char** errptr) { + rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t; + SaveError(errptr, db->rep->CreateColumnFamily( + ColumnFamilyOptions(column_family_options->rep), + std::string(column_family_name), &(handle->rep))); + return handle; +} + +rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl( + rocksdb_t* db, const rocksdb_options_t* column_family_options, + const char* column_family_name, int ttl, char** errptr) { + ROCKSDB_NAMESPACE::DBWithTTL* db_with_ttl = + static_cast(db->rep); + rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t; + SaveError(errptr, db_with_ttl->CreateColumnFamilyWithTtl( + ColumnFamilyOptions(column_family_options->rep), + std::string(column_family_name), &(handle->rep), ttl)); + return handle; +} + +void rocksdb_drop_column_family(rocksdb_t* db, + rocksdb_column_family_handle_t* handle, + char** errptr) { + SaveError(errptr, db->rep->DropColumnFamily(handle->rep)); +} + +uint32_t rocksdb_column_family_handle_get_id( + rocksdb_column_family_handle_t* handle) { + return handle->rep->GetID(); +} + +char* rocksdb_column_family_handle_get_name( + rocksdb_column_family_handle_t* handle, size_t* name_len) { + auto name = handle->rep->GetName(); + *name_len = name.size(); + return CopyString(name); +} + +void rocksdb_column_family_handle_destroy( + rocksdb_column_family_handle_t* handle) { + delete handle->rep; + delete handle; +} + +void rocksdb_put(rocksdb_t* db, const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, + db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_put_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, const char* val, + size_t vallen, char** errptr) { + SaveError(errptr, db->rep->Put(options->rep, column_family->rep, + Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_put_with_ts(rocksdb_t* db, const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, const char* ts, + size_t tslen, const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, db->rep->Put(options->rep, Slice(key, keylen), + Slice(ts, tslen), Slice(val, vallen))); +} + +void rocksdb_put_cf_with_ts(rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, const char* ts, + size_t tslen, const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, + db->rep->Put(options->rep, column_family->rep, Slice(key, keylen), + Slice(ts, tslen), Slice(val, vallen))); +} + +void rocksdb_delete(rocksdb_t* db, const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, char** errptr) { + SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen))); +} + +void rocksdb_delete_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, char** errptr) { + SaveError(errptr, db->rep->Delete(options->rep, 
column_family->rep, + Slice(key, keylen))); +} + +void rocksdb_delete_with_ts(rocksdb_t* db, + const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, const char* ts, + size_t tslen, char** errptr) { + SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen), + Slice(ts, tslen))); +} + +void rocksdb_delete_cf_with_ts(rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, const char* ts, + size_t tslen, char** errptr) { + SaveError(errptr, db->rep->Delete(options->rep, column_family->rep, + Slice(key, keylen), Slice(ts, tslen))); +} + +void rocksdb_singledelete(rocksdb_t* db, const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, char** errptr) { + SaveError(errptr, db->rep->SingleDelete(options->rep, Slice(key, keylen))); +} + +void rocksdb_singledelete_cf(rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, char** errptr) { + SaveError(errptr, db->rep->SingleDelete(options->rep, column_family->rep, + Slice(key, keylen))); +} + +void rocksdb_singledelete_with_ts(rocksdb_t* db, + const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, + const char* ts, size_t tslen, char** errptr) { + SaveError(errptr, db->rep->SingleDelete(options->rep, Slice(key, keylen), + Slice(ts, tslen))); +} + +void rocksdb_singledelete_cf_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* ts, size_t tslen, char** errptr) { + SaveError(errptr, + db->rep->SingleDelete(options->rep, column_family->rep, + Slice(key, keylen), Slice(ts, tslen))); +} + +void rocksdb_increase_full_history_ts_low( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* ts_low, size_t ts_lowlen, char** errptr) { + std::string ts(ts_low, ts_lowlen); + SaveError(errptr, db->rep->IncreaseFullHistoryTsLow(column_family->rep, ts)); +} + +char* rocksdb_get_full_history_ts_low( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + size_t* ts_len, char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = db->rep->GetFullHistoryTsLow(column_family->rep, &tmp); + if (s.ok()) { + *ts_len = tmp.size(); + result = CopyString(tmp); + } else { + *ts_len = 0; + SaveError(errptr, s); + } + return result; +} + +void rocksdb_delete_range_cf(rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, + const char* end_key, size_t end_key_len, + char** errptr) { + SaveError(errptr, db->rep->DeleteRange(options->rep, column_family->rep, + Slice(start_key, start_key_len), + Slice(end_key, end_key_len))); +} + +void rocksdb_merge(rocksdb_t* db, const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, const char* val, + size_t vallen, char** errptr) { + SaveError(errptr, db->rep->Merge(options->rep, Slice(key, keylen), + Slice(val, vallen))); +} + +void rocksdb_merge_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, const char* val, + size_t vallen, char** errptr) { + SaveError(errptr, db->rep->Merge(options->rep, column_family->rep, + Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_write(rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, 
char** errptr) {
+  SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+char* rocksdb_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                  const char* key, size_t keylen, size_t* vallen,
+                  char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_get_cf(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                     rocksdb_column_family_handle_t* column_family,
+                     const char* key, size_t keylen, size_t* vallen,
+                     char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s =
+      db->rep->Get(options->rep, column_family->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_get_with_ts(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                          const char* key, size_t keylen, size_t* vallen,
+                          char** ts, size_t* tslen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp_val;
+  std::string tmp_ts;
+  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp_val, &tmp_ts);
+  if (s.ok()) {
+    *vallen = tmp_val.size();
+    result = CopyString(tmp_val);
+    *tslen = tmp_ts.size();
+    *ts = CopyString(tmp_ts);
+  } else {
+    *vallen = 0;
+    *tslen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_get_cf_with_ts(rocksdb_t* db,
+                             const rocksdb_readoptions_t* options,
+                             rocksdb_column_family_handle_t* column_family,
+                             const char* key, size_t keylen, size_t* vallen,
+                             char** ts, size_t* tslen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  std::string tmp_ts;
+  Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+                          &tmp, &tmp_ts);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+    *tslen = tmp_ts.size();
+    *ts = CopyString(tmp_ts);
+  } else {
+    *vallen = 0;
+    *tslen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+void rocksdb_multi_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                       size_t num_keys, const char* const* keys_list,
+                       const size_t* keys_list_sizes, char** values_list,
+                       size_t* values_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses = db->rep->MultiGet(options->rep, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
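rocksdb_multi_get above fans one error slot out per key: a NULL value with a NULL entry in errs means NotFound, while a non-NULL errs[i] is a real failure the caller must free. A short sketch, with the hypothetical db and ropts handles from the earlier examples:

    const char *const keys[2] = {"k1", "k2"};
    const size_t klens[2] = {2, 2};
    char *vals[2];
    size_t vlens[2];
    char *errs[2];

    rocksdb_multi_get(db, ropts, 2, keys, klens, vals, vlens, errs);
    for (int i = 0; i < 2; i++) {
      if (errs[i] != NULL) {
        fprintf(stderr, "key %d: %s\n", i, errs[i]);
        free(errs[i]);
      } else if (vals[i] != NULL) {
        /* found; vals[i] is a malloc'd copy of vlens[i] bytes */
        free(vals[i]);
      } /* else: not found */
    }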
+void rocksdb_multi_get_with_ts(rocksdb_t* db,
+                               const rocksdb_readoptions_t* options,
+                               size_t num_keys, const char* const* keys_list,
+                               const size_t* keys_list_sizes,
+                               char** values_list, size_t* values_list_sizes,
+                               char** timestamp_list,
+                               size_t* timestamp_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<std::string> timestamps(num_keys);
+  std::vector<Status> statuses =
+      db->rep->MultiGet(options->rep, keys, &values, &timestamps);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      timestamp_list[i] = CopyString(timestamps[i]);
+      timestamp_list_sizes[i] = timestamps[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      timestamp_list[i] = nullptr;
+      timestamp_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+void rocksdb_multi_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    cfs[i] = column_families[i]->rep;
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses =
+      db->rep->MultiGet(options->rep, cfs, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+void rocksdb_multi_get_cf_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** timestamps_list,
+    size_t* timestamps_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    cfs[i] = column_families[i]->rep;
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<std::string> timestamps(num_keys);
+  std::vector<Status> statuses =
+      db->rep->MultiGet(options->rep, cfs, keys, &values, &timestamps);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      timestamps_list[i] = CopyString(timestamps[i]);
+      timestamps_list_sizes[i] = timestamps[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      timestamps_list[i] = nullptr;
+      timestamps_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+void rocksdb_batched_multi_get_cf(rocksdb_t* db,
+                                  const rocksdb_readoptions_t* options,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  size_t num_keys, const char* const* keys_list,
+                                  const size_t* keys_list_sizes,
+                                  rocksdb_pinnableslice_t** values, char** errs,
+                                  const bool sorted_input) {
+  Slice* key_slices = new Slice[num_keys];
+  PinnableSlice* value_slices = new PinnableSlice[num_keys];
+  Status* statuses = new Status[num_keys];
+  for (size_t i = 0; i < num_keys; ++i) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+
+  db->rep->MultiGet(options->rep, column_family->rep, num_keys, key_slices,
+                    value_slices, statuses, sorted_input);
+
+  for (size_t i = 0; i < num_keys;
++i) { + if (statuses[i].ok()) { + values[i] = new (rocksdb_pinnableslice_t); + values[i]->rep = std::move(value_slices[i]); + errs[i] = nullptr; + } else { + values[i] = nullptr; + if (!statuses[i].IsNotFound()) { + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } + + delete[] key_slices; + delete[] value_slices; + delete[] statuses; +} + +unsigned char rocksdb_key_may_exist(rocksdb_t* db, + const rocksdb_readoptions_t* options, + const char* key, size_t key_len, + char** value, size_t* val_len, + const char* timestamp, size_t timestamp_len, + unsigned char* value_found) { + std::string tmp; + std::string time; + if (timestamp) { + time.assign(timestamp, timestamp_len); + } + bool found = false; + const bool result = db->rep->KeyMayExist(options->rep, Slice(key, key_len), + &tmp, timestamp ? &time : nullptr, + value_found ? &found : nullptr); + if (value_found) { + *value_found = found; + if (found) { + *val_len = tmp.size(); + *value = CopyString(tmp); + } + } + return result; +} + +unsigned char rocksdb_key_may_exist_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t key_len, char** value, size_t* val_len, const char* timestamp, + size_t timestamp_len, unsigned char* value_found) { + std::string tmp; + std::string time; + if (timestamp) { + time.assign(timestamp, timestamp_len); + } + bool found = false; + const bool result = db->rep->KeyMayExist( + options->rep, column_family->rep, Slice(key, key_len), &tmp, + timestamp ? &time : nullptr, value_found ? &found : nullptr); + if (value_found) { + *value_found = found; + if (found) { + *val_len = tmp.size(); + *value = CopyString(tmp); + } + } + return result; +} + +rocksdb_iterator_t* rocksdb_create_iterator( + rocksdb_t* db, const rocksdb_readoptions_t* options) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = db->rep->NewIterator(options->rep); + return result; +} + +rocksdb_wal_iterator_t* rocksdb_get_updates_since( + rocksdb_t* db, uint64_t seq_number, + const rocksdb_wal_readoptions_t* options, char** errptr) { + std::unique_ptr iter; + TransactionLogIterator::ReadOptions ro; + if (options != nullptr) { + ro = options->rep; + } + if (SaveError(errptr, db->rep->GetUpdatesSince(seq_number, &iter, ro))) { + return nullptr; + } + rocksdb_wal_iterator_t* result = new rocksdb_wal_iterator_t; + result->rep = iter.release(); + return result; +} + +void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter) { iter->rep->Next(); } + +unsigned char rocksdb_wal_iter_valid(const rocksdb_wal_iterator_t* iter) { + return iter->rep->Valid(); +} + +void rocksdb_wal_iter_status(const rocksdb_wal_iterator_t* iter, + char** errptr) { + SaveError(errptr, iter->rep->status()); +} + +void rocksdb_wal_iter_destroy(const rocksdb_wal_iterator_t* iter) { + delete iter->rep; + delete iter; +} + +rocksdb_writebatch_t* rocksdb_wal_iter_get_batch( + const rocksdb_wal_iterator_t* iter, uint64_t* seq) { + rocksdb_writebatch_t* result = rocksdb_writebatch_create(); + BatchResult wal_batch = iter->rep->GetBatch(); + result->rep = std::move(*wal_batch.writeBatchPtr); + if (seq != nullptr) { + *seq = wal_batch.sequence; + } + return result; +} + +uint64_t rocksdb_get_latest_sequence_number(rocksdb_t* db) { + return db->rep->GetLatestSequenceNumber(); +} + +rocksdb_iterator_t* rocksdb_create_iterator_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family) { + 
rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = db->rep->NewIterator(options->rep, column_family->rep); + return result; +} + +void rocksdb_create_iterators(rocksdb_t* db, rocksdb_readoptions_t* opts, + rocksdb_column_family_handle_t** column_families, + rocksdb_iterator_t** iterators, size_t size, + char** errptr) { + std::vector column_families_vec; + for (size_t i = 0; i < size; i++) { + column_families_vec.push_back(column_families[i]->rep); + } + + std::vector res; + Status status = db->rep->NewIterators(opts->rep, column_families_vec, &res); + assert(res.size() == size); + if (SaveError(errptr, status)) { + return; + } + + for (size_t i = 0; i < size; i++) { + iterators[i] = new rocksdb_iterator_t; + iterators[i]->rep = res[i]; + } +} + +const rocksdb_snapshot_t* rocksdb_create_snapshot(rocksdb_t* db) { + rocksdb_snapshot_t* result = new rocksdb_snapshot_t; + result->rep = db->rep->GetSnapshot(); + return result; +} + +void rocksdb_release_snapshot(rocksdb_t* db, + const rocksdb_snapshot_t* snapshot) { + db->rep->ReleaseSnapshot(snapshot->rep); + delete snapshot; +} + +char* rocksdb_property_value(rocksdb_t* db, const char* propname) { + std::string tmp; + if (db->rep->GetProperty(Slice(propname), &tmp)) { + // We use strdup() since we expect human readable output. + return strdup(tmp.c_str()); + } else { + return nullptr; + } +} + +int rocksdb_property_int(rocksdb_t* db, const char* propname, + uint64_t* out_val) { + if (db->rep->GetIntProperty(Slice(propname), out_val)) { + return 0; + } else { + return -1; + } +} + +int rocksdb_property_int_cf(rocksdb_t* db, + rocksdb_column_family_handle_t* column_family, + const char* propname, uint64_t* out_val) { + if (db->rep->GetIntProperty(column_family->rep, Slice(propname), out_val)) { + return 0; + } else { + return -1; + } +} + +char* rocksdb_property_value_cf(rocksdb_t* db, + rocksdb_column_family_handle_t* column_family, + const char* propname) { + std::string tmp; + if (db->rep->GetProperty(column_family->rep, Slice(propname), &tmp)) { + // We use strdup() since we expect human readable output. 
+ return strdup(tmp.c_str()); + } else { + return nullptr; + } +} + +void rocksdb_approximate_sizes(rocksdb_t* db, int num_ranges, + const char* const* range_start_key, + const size_t* range_start_key_len, + const char* const* range_limit_key, + const size_t* range_limit_key_len, + uint64_t* sizes, char** errptr) { + Range* ranges = new Range[num_ranges]; + for (int i = 0; i < num_ranges; i++) { + ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); + ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); + } + Status s = db->rep->GetApproximateSizes(ranges, num_ranges, sizes); + if (!s.ok()) { + SaveError(errptr, s); + } + delete[] ranges; +} + +void rocksdb_approximate_sizes_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + int num_ranges, const char* const* range_start_key, + const size_t* range_start_key_len, const char* const* range_limit_key, + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr) { + Range* ranges = new Range[num_ranges]; + for (int i = 0; i < num_ranges; i++) { + ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); + ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); + } + Status s = db->rep->GetApproximateSizes(column_family->rep, ranges, + num_ranges, sizes); + if (!s.ok()) { + SaveError(errptr, s); + } + delete[] ranges; +} + +void rocksdb_delete_file(rocksdb_t* db, const char* name) { + db->rep->DeleteFile(name); +} + +const rocksdb_livefiles_t* rocksdb_livefiles(rocksdb_t* db) { + rocksdb_livefiles_t* result = new rocksdb_livefiles_t; + db->rep->GetLiveFilesMetaData(&result->rep); + return result; +} + +void rocksdb_compact_range(rocksdb_t* db, const char* start_key, + size_t start_key_len, const char* limit_key, + size_t limit_key_len) { + Slice a, b; + db->rep->CompactRange( + CompactRangeOptions(), + // Pass nullptr Slice if corresponding "const char*" is nullptr + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)); +} + +void rocksdb_compact_range_cf(rocksdb_t* db, + rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len) { + Slice a, b; + db->rep->CompactRange( + CompactRangeOptions(), column_family->rep, + // Pass nullptr Slice if corresponding "const char*" is nullptr + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)); +} + +void rocksdb_suggest_compact_range(rocksdb_t* db, const char* start_key, + size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr) { + Slice a, b; + Status s = ROCKSDB_NAMESPACE::experimental::SuggestCompactRange( + db->rep, + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)); + SaveError(errptr, s); +} + +void rocksdb_suggest_compact_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr) { + Slice a, b; + Status s = db->rep->SuggestCompactRange( + column_family->rep, + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? 
(b = Slice(limit_key, limit_key_len), &b) : nullptr)); + SaveError(errptr, s); +} + +void rocksdb_compact_range_opt(rocksdb_t* db, rocksdb_compactoptions_t* opt, + const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len) { + Slice a, b; + db->rep->CompactRange( + opt->rep, + // Pass nullptr Slice if corresponding "const char*" is nullptr + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)); +} + +void rocksdb_compact_range_cf_opt(rocksdb_t* db, + rocksdb_column_family_handle_t* column_family, + rocksdb_compactoptions_t* opt, + const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len) { + Slice a, b; + db->rep->CompactRange( + opt->rep, column_family->rep, + // Pass nullptr Slice if corresponding "const char*" is nullptr + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)); +} + +void rocksdb_flush(rocksdb_t* db, const rocksdb_flushoptions_t* options, + char** errptr) { + SaveError(errptr, db->rep->Flush(options->rep)); +} + +void rocksdb_flush_cf(rocksdb_t* db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t* column_family, + char** errptr) { + SaveError(errptr, db->rep->Flush(options->rep, column_family->rep)); +} + +void rocksdb_flush_cfs(rocksdb_t* db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t** column_families, + int num_column_families, char** errptr) { + std::vector column_family_handles; + for (int i = 0; i < num_column_families; i++) { + column_family_handles.push_back(column_families[i]->rep); + } + + SaveError(errptr, db->rep->Flush(options->rep, column_family_handles)); +} + +void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) { + SaveError(errptr, db->rep->FlushWAL(sync)); +} + +void rocksdb_disable_file_deletions(rocksdb_t* db, char** errptr) { + SaveError(errptr, db->rep->DisableFileDeletions()); +} + +void rocksdb_enable_file_deletions(rocksdb_t* db, unsigned char force, + char** errptr) { + SaveError(errptr, db->rep->EnableFileDeletions(force)); +} + +void rocksdb_destroy_db(const rocksdb_options_t* options, const char* name, + char** errptr) { + SaveError(errptr, DestroyDB(name, options->rep)); +} + +void rocksdb_repair_db(const rocksdb_options_t* options, const char* name, + char** errptr) { + SaveError(errptr, RepairDB(name, options->rep)); +} + +void rocksdb_iter_destroy(rocksdb_iterator_t* iter) { + delete iter->rep; + delete iter; +} + +unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) { + return iter->rep->Valid(); +} + +void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) { + iter->rep->SeekToFirst(); +} + +void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) { + iter->rep->SeekToLast(); +} + +void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) { + iter->rep->Seek(Slice(k, klen)); +} + +void rocksdb_iter_seek_for_prev(rocksdb_iterator_t* iter, const char* k, + size_t klen) { + iter->rep->SeekForPrev(Slice(k, klen)); +} + +void rocksdb_iter_next(rocksdb_iterator_t* iter) { iter->rep->Next(); } + +void rocksdb_iter_prev(rocksdb_iterator_t* iter) { iter->rep->Prev(); } + +const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) { + Slice s = iter->rep->key(); + *klen = s.size(); + return s.data(); +} + +const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) { + Slice s = 
iter->rep->value();
+  *vlen = s.size();
+  return s.data();
+}
+
+const char* rocksdb_iter_timestamp(const rocksdb_iterator_t* iter,
+                                   size_t* tslen) {
+  Slice s = iter->rep->timestamp();
+  *tslen = s.size();
+  return s.data();
+}
+
+void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
+  SaveError(errptr, iter->rep->status());
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create() {
+  return new rocksdb_writebatch_t;
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
+                                                     size_t size) {
+  rocksdb_writebatch_t* b = new rocksdb_writebatch_t;
+  b->rep = WriteBatch(std::string(rep, size));
+  return b;
+}
+
+void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) { delete b; }
+
+void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) { b->rep.Clear(); }
+
+int rocksdb_writebatch_count(rocksdb_writebatch_t* b) { return b->rep.Count(); }
+
+void rocksdb_writebatch_put(rocksdb_writebatch_t* b, const char* key,
+                            size_t klen, const char* val, size_t vlen) {
+  b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_put_cf(rocksdb_writebatch_t* b,
+                               rocksdb_column_family_handle_t* column_family,
+                               const char* key, size_t klen, const char* val,
+                               size_t vlen) {
+  b->rep.Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_put_cf_with_ts(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* ts, size_t tslen, const char* val,
+    size_t vlen) {
+  b->rep.Put(column_family->rep, Slice(key, klen), Slice(ts, tslen),
+             Slice(val, vlen));
+}
+
+void rocksdb_writebatch_putv(rocksdb_writebatch_t* b, int num_keys,
+                             const char* const* keys_list,
+                             const size_t* keys_list_sizes, int num_values,
+                             const char* const* values_list,
+                             const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Put(SliceParts(key_slices.data(), num_keys),
+             SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_putv_cf(rocksdb_writebatch_t* b,
+                                rocksdb_column_family_handle_t* column_family,
+                                int num_keys, const char* const* keys_list,
+                                const size_t* keys_list_sizes, int num_values,
+                                const char* const* values_list,
+                                const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+             SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_merge(rocksdb_writebatch_t* b, const char* key,
+                              size_t klen, const char* val, size_t vlen) {
+  b->rep.Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_merge_cf(rocksdb_writebatch_t* b,
+                                 rocksdb_column_family_handle_t* column_family,
+                                 const char* key, size_t klen, const char* val,
+                                 size_t vlen) {
+  b->rep.Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
values_list_sizes) { + std::vector key_slices(num_keys); + for (int i = 0; i < num_keys; i++) { + key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); + } + std::vector value_slices(num_values); + for (int i = 0; i < num_values; i++) { + value_slices[i] = Slice(values_list[i], values_list_sizes[i]); + } + b->rep.Merge(SliceParts(key_slices.data(), num_keys), + SliceParts(value_slices.data(), num_values)); +} + +void rocksdb_writebatch_mergev_cf(rocksdb_writebatch_t* b, + rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, + const size_t* values_list_sizes) { + std::vector key_slices(num_keys); + for (int i = 0; i < num_keys; i++) { + key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); + } + std::vector value_slices(num_values); + for (int i = 0; i < num_values; i++) { + value_slices[i] = Slice(values_list[i], values_list_sizes[i]); + } + b->rep.Merge(column_family->rep, SliceParts(key_slices.data(), num_keys), + SliceParts(value_slices.data(), num_values)); +} + +void rocksdb_writebatch_delete(rocksdb_writebatch_t* b, const char* key, + size_t klen) { + b->rep.Delete(Slice(key, klen)); +} + +void rocksdb_writebatch_singledelete(rocksdb_writebatch_t* b, const char* key, + size_t klen) { + b->rep.SingleDelete(Slice(key, klen)); +} + +void rocksdb_writebatch_delete_cf(rocksdb_writebatch_t* b, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen) { + b->rep.Delete(column_family->rep, Slice(key, klen)); +} + +void rocksdb_writebatch_delete_cf_with_ts( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* ts, size_t tslen) { + b->rep.Delete(column_family->rep, Slice(key, klen), Slice(ts, tslen)); +} + +void rocksdb_writebatch_singledelete_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen) { + b->rep.SingleDelete(column_family->rep, Slice(key, klen)); +} + +void rocksdb_writebatch_singledelete_cf_with_ts( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* ts, size_t tslen) { + b->rep.SingleDelete(column_family->rep, Slice(key, klen), Slice(ts, tslen)); +} + +void rocksdb_writebatch_deletev(rocksdb_writebatch_t* b, int num_keys, + const char* const* keys_list, + const size_t* keys_list_sizes) { + std::vector key_slices(num_keys); + for (int i = 0; i < num_keys; i++) { + key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); + } + b->rep.Delete(SliceParts(key_slices.data(), num_keys)); +} + +void rocksdb_writebatch_deletev_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) { + std::vector key_slices(num_keys); + for (int i = 0; i < num_keys; i++) { + key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); + } + b->rep.Delete(column_family->rep, SliceParts(key_slices.data(), num_keys)); +} + +void rocksdb_writebatch_delete_range(rocksdb_writebatch_t* b, + const char* start_key, + size_t start_key_len, const char* end_key, + size_t end_key_len) { + b->rep.DeleteRange(Slice(start_key, start_key_len), + Slice(end_key, end_key_len)); +} + +void rocksdb_writebatch_delete_range_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* end_key, + size_t end_key_len) { + 
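+
+// Usage sketch (editorial illustration, not part of the vendored file):
+// grouping a put and a delete so they are applied atomically, assuming
+// `db` and `wopts` come from rocksdb_open() and
+// rocksdb_writeoptions_create().
+//
+//   rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+//   rocksdb_writebatch_put(wb, "key", 3, "value", 5);
+//   rocksdb_writebatch_delete(wb, "old", 3);
+//   char* err = NULL;
+//   rocksdb_write(db, wopts, wb, &err);  /* both operations or neither */
+//   rocksdb_writebatch_destroy(wb);
+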
+void rocksdb_writebatch_delete_range(rocksdb_writebatch_t* b,
+                                     const char* start_key,
+                                     size_t start_key_len, const char* end_key,
+                                     size_t end_key_len) {
+  b->rep.DeleteRange(Slice(start_key, start_key_len),
+                     Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_range_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* end_key,
+    size_t end_key_len) {
+  b->rep.DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+                     Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_rangev(rocksdb_writebatch_t* b, int num_keys,
+                                      const char* const* start_keys_list,
+                                      const size_t* start_keys_list_sizes,
+                                      const char* const* end_keys_list,
+                                      const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep.DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+                     SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_rangev_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* start_keys_list,
+    const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+    const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep.DeleteRange(column_family->rep,
+                     SliceParts(start_key_slices.data(), num_keys),
+                     SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_put_log_data(rocksdb_writebatch_t* b, const char* blob,
+                                     size_t len) {
+  b->rep.PutLogData(Slice(blob, len));
+}
+
+class H : public WriteBatch::Handler {
+ public:
+  void* state_;
+  void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+  void (*deleted_)(void*, const char* k, size_t klen);
+  void Put(const Slice& key, const Slice& value) override {
+    (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+  }
+  void Delete(const Slice& key) override {
+    (*deleted_)(state_, key.data(), key.size());
+  }
+};
+
+void rocksdb_writebatch_iterate(rocksdb_writebatch_t* b, void* state,
+                                void (*put)(void*, const char* k, size_t klen,
+                                            const char* v, size_t vlen),
+                                void (*deleted)(void*, const char* k,
+                                                size_t klen)) {
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep.Iterate(&handler);
+}
+
+const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
+  *size = b->rep.GetDataSize();
+  return b->rep.Data().c_str();
+}
+
+void rocksdb_writebatch_set_save_point(rocksdb_writebatch_t* b) {
+  b->rep.SetSavePoint();
+}
+
+void rocksdb_writebatch_rollback_to_save_point(rocksdb_writebatch_t* b,
+                                               char** errptr) {
+  SaveError(errptr, b->rep.RollbackToSavePoint());
+}
+
+void rocksdb_writebatch_pop_save_point(rocksdb_writebatch_t* b, char** errptr) {
+  SaveError(errptr, b->rep.PopSavePoint());
+}
+
+rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create(
+    size_t reserved_bytes, unsigned char overwrite_key) {
+  rocksdb_writebatch_wi_t* b = new rocksdb_writebatch_wi_t;
+  b->rep = new WriteBatchWithIndex(BytewiseComparator(), reserved_bytes,
+                                   overwrite_key);
+  return b;
+}
+
+void rocksdb_writebatch_wi_destroy(rocksdb_writebatch_wi_t* b) {
+  if (b->rep) {
+    delete b->rep;
+  }
+  delete b;
+}
+
+void rocksdb_writebatch_wi_clear(rocksdb_writebatch_wi_t* b) {
+  b->rep->Clear();
+}
+
+int rocksdb_writebatch_wi_count(rocksdb_writebatch_wi_t* b) {
+  return b->rep->GetWriteBatch()->Count();
+}
+
+void rocksdb_writebatch_wi_put(rocksdb_writebatch_wi_t* b, const char* key,
+                               size_t klen, const char* val, size_t vlen) {
+  b->rep->Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_put_cf(rocksdb_writebatch_wi_t* b,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  const char* key, size_t klen, const char* val,
+                                  size_t vlen) {
+  b->rep->Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_putv(rocksdb_writebatch_wi_t* b, int num_keys,
+                                const char* const* keys_list,
+                                const size_t* keys_list_sizes, int num_values,
+                                const char* const* values_list,
+                                const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Put(SliceParts(key_slices.data(), num_keys),
+              SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_putv_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+              SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_merge(rocksdb_writebatch_wi_t* b, const char* key,
+                                 size_t klen, const char* val, size_t vlen) {
+  b->rep->Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_merge_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* val, size_t vlen) {
+  b->rep->Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_mergev(rocksdb_writebatch_wi_t* b, int num_keys,
+                                  const char* const* keys_list,
+                                  const size_t* keys_list_sizes, int num_values,
+                                  const char* const* values_list,
+                                  const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Merge(SliceParts(key_slices.data(), num_keys),
+                SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_mergev_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+                SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t* b, const char* key,
+                                  size_t klen) {
+  b->rep->Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_singledelete(rocksdb_writebatch_wi_t* b,
+                                        const char* key, size_t klen) {
+  b->rep->SingleDelete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_delete_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen) {
+  b->rep->Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_singledelete_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen) {
+  b->rep->SingleDelete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_deletev(rocksdb_writebatch_wi_t* b, int num_keys,
+                                   const char* const* keys_list,
+                                   const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep->Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_deletev_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep->Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_range(rocksdb_writebatch_wi_t* b,
+                                        const char* start_key,
+                                        size_t start_key_len,
+                                        const char* end_key,
+                                        size_t end_key_len) {
+  b->rep->DeleteRange(Slice(start_key, start_key_len),
+                      Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_range_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* end_key,
+    size_t end_key_len) {
+  b->rep->DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+                      Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b,
+                                         int num_keys,
+                                         const char* const* start_keys_list,
+                                         const size_t* start_keys_list_sizes,
+                                         const char* const* end_keys_list,
+                                         const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep->DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+                      SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_rangev_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* start_keys_list,
+    const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+    const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep->DeleteRange(column_family->rep,
+                      SliceParts(start_key_slices.data(), num_keys),
+                      SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_put_log_data(rocksdb_writebatch_wi_t* b,
+                                        const char* blob, size_t len) {
+  b->rep->PutLogData(Slice(blob, len));
+}
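+
+// Usage sketch (editorial illustration, not part of the vendored file):
+// read-your-own-writes through a WriteBatchWithIndex before it is
+// committed, assuming `db`, `opts` (a rocksdb_options_t*) and `wopts`
+// already exist; rocksdb_writebatch_wi_get_from_batch() and
+// rocksdb_write_writebatch_wi() are defined just below.
+//
+//   rocksdb_writebatch_wi_t* wbwi = rocksdb_writebatch_wi_create(0, 1);
+//   rocksdb_writebatch_wi_put(wbwi, "k", 1, "v1", 2);
+//   size_t vlen = 0;
+//   char* err = NULL;
+//   char* v = rocksdb_writebatch_wi_get_from_batch(wbwi, opts, "k", 1,
+//                                                  &vlen, &err);
+//   /* v is "v1" although nothing has reached the DB yet */
+//   rocksdb_write_writebatch_wi(db, wopts, wbwi, &err);
+//   free(v);
+//   rocksdb_writebatch_wi_destroy(wbwi);
+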
+void rocksdb_writebatch_wi_iterate(
+    rocksdb_writebatch_wi_t* b, void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen)) {
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep->GetWriteBatch()->Iterate(&handler);
+}
+
+const char* rocksdb_writebatch_wi_data(rocksdb_writebatch_wi_t* b,
+                                       size_t* size) {
+  WriteBatch* wb = b->rep->GetWriteBatch();
+  *size = wb->GetDataSize();
+  return wb->Data().c_str();
+}
+
+void rocksdb_writebatch_wi_set_save_point(rocksdb_writebatch_wi_t* b) {
+  b->rep->SetSavePoint();
+}
+
+void rocksdb_writebatch_wi_rollback_to_save_point(rocksdb_writebatch_wi_t* b,
+                                                  char** errptr) {
+  SaveError(errptr, b->rep->RollbackToSavePoint());
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = wbwi->rep->NewIteratorWithBase(base_iterator->rep);
+  delete base_iterator;
+  return result;
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+    rocksdb_column_family_handle_t* column_family) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep =
+      wbwi->rep->NewIteratorWithBase(column_family->rep, base_iterator->rep);
+  delete base_iterator;
+  return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch(rocksdb_writebatch_wi_t* wbwi,
+                                           const rocksdb_options_t* options,
+                                           const char* key, size_t keylen,
+                                           size_t* vallen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = wbwi->rep->GetFromBatch(options->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_cf(
+    rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = wbwi->rep->GetFromBatch(column_family->rep, options->rep,
+                                     Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options, const char* key, size_t keylen,
+    size_t* vallen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep,
+                                          Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = wbwi->rep->GetFromBatchAndDB(
+      db->rep, options->rep, column_family->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+void rocksdb_write_writebatch_wi(rocksdb_t* db,
+                                 const rocksdb_writeoptions_t* options,
+                                 rocksdb_writebatch_wi_t* wbwi, char** errptr) {
+  WriteBatch* wb = wbwi->rep->GetWriteBatch();
+  SaveError(errptr, db->rep->Write(options->rep, wb));
+}
+
+void rocksdb_load_latest_options(
+    const char* db_path, rocksdb_env_t* env, bool ignore_unknown_options,
+    rocksdb_cache_t* cache, rocksdb_options_t** db_options,
+    size_t* num_column_families, char*** list_column_family_names,
+    rocksdb_options_t*** list_column_family_options, char** errptr) {
+  DBOptions db_opt;
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  ConfigOptions config_opts;
+  config_opts.ignore_unknown_options = ignore_unknown_options;
+  config_opts.input_strings_escaped = true;
+  config_opts.env = env->rep;
+  Status s = LoadLatestOptions(config_opts, std::string(db_path), &db_opt,
+                               &cf_descs, &cache->rep);
+  if (s.ok()) {
+    char** cf_names = (char**)malloc(cf_descs.size() * sizeof(char*));
+    rocksdb_options_t** cf_options = (rocksdb_options_t**)malloc(
+        cf_descs.size() * sizeof(rocksdb_options_t*));
+    for (size_t i = 0; i < cf_descs.size(); ++i) {
+      cf_names[i] = strdup(cf_descs[i].name.c_str());
+      cf_options[i] = new rocksdb_options_t{
+          Options(DBOptions(), std::move(cf_descs[i].options))};
+    }
+    *num_column_families = cf_descs.size();
+    *db_options = new rocksdb_options_t{
+        Options(std::move(db_opt), ColumnFamilyOptions())};
+    *list_column_family_names = cf_names;
+    *list_column_family_options = cf_options;
+  } else {
+    *num_column_families = 0;
+    *db_options = nullptr;
+    *list_column_family_names = nullptr;
+    *list_column_family_options = nullptr;
+    SaveError(errptr, s);
+  }
+}
+
+void rocksdb_load_latest_options_destroy(
+    rocksdb_options_t* db_options, char** list_column_family_names,
+    rocksdb_options_t** list_column_family_options, size_t len) {
+  rocksdb_options_destroy(db_options);
+  if (list_column_family_names) {
+    for (size_t i = 0; i < len; ++i) {
+      free(list_column_family_names[i]);
+    }
+    free(list_column_family_names);
+  }
+  if (list_column_family_options) {
+    for (size_t i = 0; i < len; ++i) {
+      rocksdb_options_destroy(list_column_family_options[i]);
+    }
+    free(list_column_family_options);
+  }
+}
+
+rocksdb_block_based_table_options_t* rocksdb_block_based_options_create() {
+  return new rocksdb_block_based_table_options_t;
+}
+
+void rocksdb_block_based_options_destroy(
+    rocksdb_block_based_table_options_t* options) {
+  delete options;
+}
+
+void rocksdb_block_based_options_set_checksum(
+    rocksdb_block_based_table_options_t* opt, char v) {
+  opt->rep.checksum = static_cast<ROCKSDB_NAMESPACE::ChecksumType>(v);
+}
+
+void rocksdb_block_based_options_set_block_size(
+    rocksdb_block_based_table_options_t* options, size_t block_size) {
+  options->rep.block_size = block_size;
+}
+
+void rocksdb_block_based_options_set_block_size_deviation(
+    rocksdb_block_based_table_options_t* options, int block_size_deviation) {
+  options->rep.block_size_deviation = block_size_deviation;
+}
+
+void rocksdb_block_based_options_set_block_restart_interval(
+    rocksdb_block_based_table_options_t* options, int block_restart_interval) {
+  options->rep.block_restart_interval = block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_index_block_restart_interval(
+    rocksdb_block_based_table_options_t* options,
+    int index_block_restart_interval) {
+  options->rep.index_block_restart_interval = index_block_restart_interval;
+}
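+
+// Usage sketch (editorial illustration, not part of the vendored file):
+// recovering the options a database was last opened with, assuming `env`
+// and `cache` come from rocksdb_create_default_env() and
+// rocksdb_cache_create_lru().
+//
+//   rocksdb_options_t* db_opts = NULL;
+//   size_t ncf = 0;
+//   char** cf_names = NULL;
+//   rocksdb_options_t** cf_opts = NULL;
+//   char* err = NULL;
+//   rocksdb_load_latest_options("/path/to/db", env, false, cache, &db_opts,
+//                               &ncf, &cf_names, &cf_opts, &err);
+//   /* pass cf_names/cf_opts to rocksdb_open_column_families(), then */
+//   rocksdb_load_latest_options_destroy(db_opts, cf_names, cf_opts, ncf);
+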
+void rocksdb_block_based_options_set_metadata_block_size(
+    rocksdb_block_based_table_options_t* options,
+    uint64_t metadata_block_size) {
+  options->rep.metadata_block_size = metadata_block_size;
+}
+
+void rocksdb_block_based_options_set_partition_filters(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char partition_filters) {
+  options->rep.partition_filters = partition_filters;
+}
+
+void rocksdb_block_based_options_set_optimize_filters_for_memory(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char optimize_filters_for_memory) {
+  options->rep.optimize_filters_for_memory = optimize_filters_for_memory;
+}
+
+void rocksdb_block_based_options_set_use_delta_encoding(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char use_delta_encoding) {
+  options->rep.use_delta_encoding = use_delta_encoding;
+}
+
+void rocksdb_block_based_options_set_filter_policy(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_filterpolicy_t* filter_policy) {
+  options->rep.filter_policy.reset(filter_policy);
+}
+
+void rocksdb_block_based_options_set_no_block_cache(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char no_block_cache) {
+  options->rep.no_block_cache = no_block_cache;
+}
+
+void rocksdb_block_based_options_set_block_cache(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_cache_t* block_cache) {
+  if (block_cache) {
+    options->rep.block_cache = block_cache->rep;
+  }
+}
+
+void rocksdb_block_based_options_set_whole_key_filtering(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.whole_key_filtering = v;
+}
+
+void rocksdb_block_based_options_set_format_version(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.format_version = v;
+}
+
+void rocksdb_block_based_options_set_index_type(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.index_type = static_cast<BlockBasedTableOptions::IndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_index_type(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.data_block_index_type =
+      static_cast<BlockBasedTableOptions::DataBlockIndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_hash_ratio(
+    rocksdb_block_based_table_options_t* options, double v) {
+  options->rep.data_block_hash_table_util_ratio = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.cache_index_and_filter_blocks = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.cache_index_and_filter_blocks_with_high_priority = v;
+}
+
+void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.pin_l0_filter_and_index_blocks_in_cache = v;
+}
+
+void rocksdb_block_based_options_set_pin_top_level_index_and_filter(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.pin_top_level_index_and_filter = v;
+}
+
+void rocksdb_options_set_block_based_table_factory(
+    rocksdb_options_t* opt,
+    rocksdb_block_based_table_options_t* table_options) {
+  if (table_options) {
+    opt->rep.table_factory.reset(
+        ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options->rep));
+  }
+}
+
+rocksdb_cuckoo_table_options_t* rocksdb_cuckoo_options_create() {
+  return new rocksdb_cuckoo_table_options_t;
+}
+
+void rocksdb_cuckoo_options_destroy(rocksdb_cuckoo_table_options_t* options) {
+  delete options;
+}
+
+void rocksdb_cuckoo_options_set_hash_ratio(
+    rocksdb_cuckoo_table_options_t* options, double v) {
+  options->rep.hash_table_ratio = v;
+}
+
+void rocksdb_cuckoo_options_set_max_search_depth(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+  options->rep.max_search_depth = v;
+}
+
+void rocksdb_cuckoo_options_set_cuckoo_block_size(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+  options->rep.cuckoo_block_size = v;
+}
+
+void rocksdb_cuckoo_options_set_identity_as_first_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+  options->rep.identity_as_first_hash = v;
+}
+
+void rocksdb_cuckoo_options_set_use_module_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+  options->rep.use_module_hash = v;
+}
+
+void rocksdb_options_set_cuckoo_table_factory(
+    rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options) {
+  if (table_options) {
+    opt->rep.table_factory.reset(
+        ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options->rep));
+  }
+}
+
+void rocksdb_set_options(rocksdb_t* db, int count, const char* const keys[],
+                         const char* const values[], char** errptr) {
+  std::unordered_map<std::string, std::string> options_map;
+  for (int i = 0; i < count; i++) options_map[keys[i]] = values[i];
+  SaveError(errptr, db->rep->SetOptions(options_map));
+}
+
+void rocksdb_set_options_cf(rocksdb_t* db,
+                            rocksdb_column_family_handle_t* handle, int count,
+                            const char* const keys[],
+                            const char* const values[], char** errptr) {
+  std::unordered_map<std::string, std::string> options_map;
+  for (int i = 0; i < count; i++) options_map[keys[i]] = values[i];
+  SaveError(errptr, db->rep->SetOptions(handle->rep, options_map));
+}
+
+rocksdb_options_t* rocksdb_options_create() { return new rocksdb_options_t; }
+
+void rocksdb_options_destroy(rocksdb_options_t* options) { delete options; }
+
+rocksdb_options_t* rocksdb_options_create_copy(rocksdb_options_t* options) {
+  return new rocksdb_options_t(*options);
+}
+
+void rocksdb_options_increase_parallelism(rocksdb_options_t* opt,
+                                          int total_threads) {
+  opt->rep.IncreaseParallelism(total_threads);
+}
+
+void rocksdb_options_optimize_for_point_lookup(rocksdb_options_t* opt,
+                                               uint64_t block_cache_size_mb) {
+  opt->rep.OptimizeForPointLookup(block_cache_size_mb);
+}
+
+void rocksdb_options_optimize_level_style_compaction(
+    rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+  opt->rep.OptimizeLevelStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_optimize_universal_style_compaction(
+    rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+  opt->rep.OptimizeUniversalStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_set_allow_ingest_behind(rocksdb_options_t* opt,
+                                             unsigned char v) {
+  opt->rep.allow_ingest_behind = v;
+}
+
+unsigned char rocksdb_options_get_allow_ingest_behind(rocksdb_options_t* opt) {
+  return opt->rep.allow_ingest_behind;
+}
+
+void rocksdb_options_set_compaction_filter(rocksdb_options_t* opt,
+                                           rocksdb_compactionfilter_t* filter) {
+  opt->rep.compaction_filter = filter;
+}
+
+void rocksdb_options_set_compaction_filter_factory(
+    rocksdb_options_t* opt, rocksdb_compactionfilterfactory_t* factory) {
+  opt->rep.compaction_filter_factory =
+      std::shared_ptr<CompactionFilterFactory>(factory);
+}
+
+void rocksdb_options_compaction_readahead_size(rocksdb_options_t* opt,
+                                               size_t s) {
+  opt->rep.compaction_readahead_size = s;
+}
+
+size_t rocksdb_options_get_compaction_readahead_size(rocksdb_options_t* opt) {
+  return opt->rep.compaction_readahead_size;
+}
+
+void rocksdb_options_set_comparator(rocksdb_options_t* opt,
+                                    rocksdb_comparator_t* cmp) {
+  opt->rep.comparator = cmp;
+}
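+
+// Usage sketch (editorial illustration, not part of the vendored file):
+// changing a mutable column-family option on a live DB through
+// DB::SetOptions(), which the two rocksdb_set_options* functions above
+// expose; `db` is assumed to come from rocksdb_open().
+//
+//   const char* keys[] = {"disable_auto_compactions"};
+//   const char* vals[] = {"true"};
+//   char* err = NULL;
+//   rocksdb_set_options(db, 1, keys, vals, &err);
+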
+void rocksdb_options_set_merge_operator(
+    rocksdb_options_t* opt, rocksdb_mergeoperator_t* merge_operator) {
+  opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator);
+}
+
+void rocksdb_options_set_create_if_missing(rocksdb_options_t* opt,
+                                           unsigned char v) {
+  opt->rep.create_if_missing = v;
+}
+
+unsigned char rocksdb_options_get_create_if_missing(rocksdb_options_t* opt) {
+  return opt->rep.create_if_missing;
+}
+
+void rocksdb_options_set_create_missing_column_families(rocksdb_options_t* opt,
+                                                        unsigned char v) {
+  opt->rep.create_missing_column_families = v;
+}
+
+unsigned char rocksdb_options_get_create_missing_column_families(
+    rocksdb_options_t* opt) {
+  return opt->rep.create_missing_column_families;
+}
+
+void rocksdb_options_set_error_if_exists(rocksdb_options_t* opt,
+                                         unsigned char v) {
+  opt->rep.error_if_exists = v;
+}
+
+unsigned char rocksdb_options_get_error_if_exists(rocksdb_options_t* opt) {
+  return opt->rep.error_if_exists;
+}
+
+void rocksdb_options_set_paranoid_checks(rocksdb_options_t* opt,
+                                         unsigned char v) {
+  opt->rep.paranoid_checks = v;
+}
+
+unsigned char rocksdb_options_get_paranoid_checks(rocksdb_options_t* opt) {
+  return opt->rep.paranoid_checks;
+}
+
+void rocksdb_options_set_db_paths(rocksdb_options_t* opt,
+                                  const rocksdb_dbpath_t** dbpath_values,
+                                  size_t num_paths) {
+  std::vector<DbPath> db_paths(num_paths);
+  for (size_t i = 0; i < num_paths; ++i) {
+    db_paths[i] = dbpath_values[i]->rep;
+  }
+  opt->rep.db_paths = db_paths;
+}
+
+void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
+  opt->rep.env = (env ? env->rep : nullptr);
+}
+
+void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
+  if (l) {
+    opt->rep.info_log = l->rep;
+  }
+}
+
+void rocksdb_options_set_info_log_level(rocksdb_options_t* opt, int v) {
+  opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
+}
+
+int rocksdb_options_get_info_log_level(rocksdb_options_t* opt) {
+  return static_cast<int>(opt->rep.info_log_level);
+}
+
+void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt,
+                                              size_t s) {
+  opt->rep.db_write_buffer_size = s;
+}
+
+size_t rocksdb_options_get_db_write_buffer_size(rocksdb_options_t* opt) {
+  return opt->rep.db_write_buffer_size;
+}
+
+void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
+  opt->rep.write_buffer_size = s;
+}
+
+size_t rocksdb_options_get_write_buffer_size(rocksdb_options_t* opt) {
+  return opt->rep.write_buffer_size;
+}
+
+void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
+  opt->rep.max_open_files = n;
+}
+
+int rocksdb_options_get_max_open_files(rocksdb_options_t* opt) {
+  return opt->rep.max_open_files;
+}
+
+void rocksdb_options_set_max_file_opening_threads(rocksdb_options_t* opt,
+                                                  int n) {
+  opt->rep.max_file_opening_threads = n;
+}
+
+int rocksdb_options_get_max_file_opening_threads(rocksdb_options_t* opt) {
+  return opt->rep.max_file_opening_threads;
+}
+
+void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt,
+                                            uint64_t n) {
+  opt->rep.max_total_wal_size = n;
+}
+
+uint64_t rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt) {
+  return opt->rep.max_total_wal_size;
+}
+
+void rocksdb_options_set_target_file_size_base(rocksdb_options_t* opt,
+                                               uint64_t n) {
+  opt->rep.target_file_size_base = n;
+}
+
+uint64_t rocksdb_options_get_target_file_size_base(rocksdb_options_t* opt) {
+  return opt->rep.target_file_size_base;
+}
+
+void rocksdb_options_set_target_file_size_multiplier(rocksdb_options_t* opt,
+                                                     int n) {
+  opt->rep.target_file_size_multiplier = n;
+}
+
+int rocksdb_options_get_target_file_size_multiplier(rocksdb_options_t* opt) {
+  return opt->rep.target_file_size_multiplier;
+}
+
+void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt,
+                                                  uint64_t n) {
+  opt->rep.max_bytes_for_level_base = n;
+}
+
+uint64_t rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t* opt) {
+  return opt->rep.max_bytes_for_level_base;
+}
+
+void rocksdb_options_set_level_compaction_dynamic_level_bytes(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.level_compaction_dynamic_level_bytes = v;
+}
+
+unsigned char rocksdb_options_get_level_compaction_dynamic_level_bytes(
+    rocksdb_options_t* opt) {
+  return opt->rep.level_compaction_dynamic_level_bytes;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t* opt,
+                                                        double n) {
+  opt->rep.max_bytes_for_level_multiplier = n;
+}
+
+double rocksdb_options_get_max_bytes_for_level_multiplier(
+    rocksdb_options_t* opt) {
+  return opt->rep.max_bytes_for_level_multiplier;
+}
+
+void rocksdb_options_set_max_compaction_bytes(rocksdb_options_t* opt,
+                                              uint64_t n) {
+  opt->rep.max_compaction_bytes = n;
+}
+
+uint64_t rocksdb_options_get_max_compaction_bytes(rocksdb_options_t* opt) {
+  return opt->rep.max_compaction_bytes;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+    rocksdb_options_t* opt, int* level_values, size_t num_levels) {
+  opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels);
+  for (size_t i = 0; i < num_levels; ++i) {
+    opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i];
+  }
+}
+
+void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
+  opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+}
+
+void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
+                                                      unsigned char val) {
+  opt->rep.skip_stats_update_on_db_open = val;
+}
+
+unsigned char rocksdb_options_get_skip_stats_update_on_db_open(
+    rocksdb_options_t* opt) {
+  return opt->rep.skip_stats_update_on_db_open;
+}
+
+void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
+    rocksdb_options_t* opt, unsigned char val) {
+  opt->rep.skip_checking_sst_file_sizes_on_db_open = val;
+}
+
+unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(
+    rocksdb_options_t* opt) {
+  return opt->rep.skip_checking_sst_file_sizes_on_db_open;
+}
+
+/* Blob Options Settings */
+void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt,
+                                           unsigned char val) {
+  opt->rep.enable_blob_files = val;
+}
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files(
+    rocksdb_options_t* opt) {
+  return opt->rep.enable_blob_files;
+}
+
+void rocksdb_options_set_min_blob_size(rocksdb_options_t* opt, uint64_t val) {
+  opt->rep.min_blob_size = val;
+}
+
+uint64_t rocksdb_options_get_min_blob_size(rocksdb_options_t* opt) {
+  return opt->rep.min_blob_size;
+}
+
+void rocksdb_options_set_blob_file_size(rocksdb_options_t* opt, uint64_t val) {
+  opt->rep.blob_file_size = val;
+}
+
+uint64_t rocksdb_options_get_blob_file_size(rocksdb_options_t* opt) {
+  return opt->rep.blob_file_size;
+}
+
+void rocksdb_options_set_blob_compression_type(rocksdb_options_t* opt,
+                                               int val) {
+  opt->rep.blob_compression_type = static_cast<CompressionType>(val);
+}
+
+int rocksdb_options_get_blob_compression_type(rocksdb_options_t* opt) {
+  return opt->rep.blob_compression_type;
+}
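+
+// Usage sketch (editorial illustration, not part of the vendored file):
+// routing large values into blob files via the integrated BlobDB knobs
+// around here; exact knob availability depends on the RocksDB version
+// being vendored.
+//
+//   rocksdb_options_t* opts = rocksdb_options_create();
+//   rocksdb_options_set_enable_blob_files(opts, 1);
+//   rocksdb_options_set_min_blob_size(opts, 1024);  /* >= 1 KiB to blobs */
+//   rocksdb_options_set_blob_compression_type(opts,
+//                                             rocksdb_zstd_compression);
+//   rocksdb_options_set_enable_blob_gc(opts, 1);
+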
+void rocksdb_options_set_enable_blob_gc(rocksdb_options_t* opt,
+                                        unsigned char val) {
+  opt->rep.enable_blob_garbage_collection = val;
+}
+
+unsigned char rocksdb_options_get_enable_blob_gc(rocksdb_options_t* opt) {
+  return opt->rep.enable_blob_garbage_collection;
+}
+
+void rocksdb_options_set_blob_gc_age_cutoff(rocksdb_options_t* opt,
+                                            double val) {
+  opt->rep.blob_garbage_collection_age_cutoff = val;
+}
+
+double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) {
+  return opt->rep.blob_garbage_collection_age_cutoff;
+}
+
+void rocksdb_options_set_blob_gc_force_threshold(rocksdb_options_t* opt,
+                                                 double val) {
+  opt->rep.blob_garbage_collection_force_threshold = val;
+}
+
+double rocksdb_options_get_blob_gc_force_threshold(rocksdb_options_t* opt) {
+  return opt->rep.blob_garbage_collection_force_threshold;
+}
+
+void rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt,
+                                                        uint64_t val) {
+  opt->rep.blob_compaction_readahead_size = val;
+}
+
+uint64_t rocksdb_options_get_blob_compaction_readahead_size(
+    rocksdb_options_t* opt) {
+  return opt->rep.blob_compaction_readahead_size;
+}
+
+void rocksdb_options_set_blob_file_starting_level(rocksdb_options_t* opt,
+                                                  int val) {
+  opt->rep.blob_file_starting_level = val;
+}
+
+int rocksdb_options_get_blob_file_starting_level(rocksdb_options_t* opt) {
+  return opt->rep.blob_file_starting_level;
+}
+
+void rocksdb_options_set_blob_cache(rocksdb_options_t* opt,
+                                    rocksdb_cache_t* blob_cache) {
+  opt->rep.blob_cache = blob_cache->rep;
+}
+
+void rocksdb_options_set_prepopulate_blob_cache(rocksdb_options_t* opt, int t) {
+  opt->rep.prepopulate_blob_cache = static_cast<PrepopulateBlobCache>(t);
+}
+
+int rocksdb_options_get_prepopulate_blob_cache(rocksdb_options_t* opt) {
+  return static_cast<int>(opt->rep.prepopulate_blob_cache);
+}
+
+void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
+  opt->rep.num_levels = n;
+}
+
+int rocksdb_options_get_num_levels(rocksdb_options_t* opt) {
+  return opt->rep.num_levels;
+}
+
+void rocksdb_options_set_level0_file_num_compaction_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_file_num_compaction_trigger = n;
+}
+
+int rocksdb_options_get_level0_file_num_compaction_trigger(
+    rocksdb_options_t* opt) {
+  return opt->rep.level0_file_num_compaction_trigger;
+}
+
+void rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t* opt,
+                                                        int n) {
+  opt->rep.level0_slowdown_writes_trigger = n;
+}
+
+int rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t* opt) {
+  return opt->rep.level0_slowdown_writes_trigger;
+}
+
+void rocksdb_options_set_level0_stop_writes_trigger(rocksdb_options_t* opt,
+                                                    int n) {
+  opt->rep.level0_stop_writes_trigger = n;
+}
+
+int rocksdb_options_get_level0_stop_writes_trigger(rocksdb_options_t* opt) {
+  return opt->rep.level0_stop_writes_trigger;
+}
+
+void rocksdb_options_set_wal_recovery_mode(rocksdb_options_t* opt, int mode) {
+  opt->rep.wal_recovery_mode = static_cast<WALRecoveryMode>(mode);
+}
+
+int rocksdb_options_get_wal_recovery_mode(rocksdb_options_t* opt) {
+  return static_cast<int>(opt->rep.wal_recovery_mode);
+}
+
+void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) {
+  opt->rep.compression = static_cast<CompressionType>(t);
+}
+
+int rocksdb_options_get_compression(rocksdb_options_t* opt) {
+  return opt->rep.compression;
+}
+
+void rocksdb_options_set_bottommost_compression(rocksdb_options_t* opt, int t) {
+  opt->rep.bottommost_compression = static_cast<CompressionType>(t);
+}
+
+int rocksdb_options_get_bottommost_compression(rocksdb_options_t* opt) {
+  return opt->rep.bottommost_compression;
+}
+
+void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt,
+                                               const int* level_values,
+                                               size_t num_levels) {
+  opt->rep.compression_per_level.resize(num_levels);
+  for (size_t i = 0; i < num_levels; ++i) {
+    opt->rep.compression_per_level[i] =
+        static_cast<CompressionType>(level_values[i]);
+  }
+}
+
+void rocksdb_options_set_bottommost_compression_options(rocksdb_options_t* opt,
+                                                        int w_bits, int level,
+                                                        int strategy,
+                                                        int max_dict_bytes,
+                                                        unsigned char enabled) {
+  opt->rep.bottommost_compression_opts.window_bits = w_bits;
+  opt->rep.bottommost_compression_opts.level = level;
+  opt->rep.bottommost_compression_opts.strategy = strategy;
+  opt->rep.bottommost_compression_opts.max_dict_bytes = max_dict_bytes;
+  opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes(
+    rocksdb_options_t* opt, int zstd_max_train_bytes, unsigned char enabled) {
+  opt->rep.bottommost_compression_opts.zstd_max_train_bytes =
+      zstd_max_train_bytes;
+  opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_bottommost_compression_options_use_zstd_dict_trainer(
+    rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer,
+    unsigned char enabled) {
+  opt->rep.bottommost_compression_opts.use_zstd_dict_trainer =
+      use_zstd_dict_trainer;
+  opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+unsigned char
+rocksdb_options_get_bottommost_compression_options_use_zstd_dict_trainer(
+    rocksdb_options_t* opt) {
+  return opt->rep.bottommost_compression_opts.use_zstd_dict_trainer;
+}
+
+void rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes(
+    rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes,
+    unsigned char enabled) {
+  opt->rep.bottommost_compression_opts.max_dict_buffer_bytes =
+      max_dict_buffer_bytes;
+  opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits,
+                                             int level, int strategy,
+                                             int max_dict_bytes) {
+  opt->rep.compression_opts.window_bits = w_bits;
+  opt->rep.compression_opts.level = level;
+  opt->rep.compression_opts.strategy = strategy;
+  opt->rep.compression_opts.max_dict_bytes = max_dict_bytes;
+}
+
+void rocksdb_options_set_compression_options_zstd_max_train_bytes(
+    rocksdb_options_t* opt, int zstd_max_train_bytes) {
+  opt->rep.compression_opts.zstd_max_train_bytes = zstd_max_train_bytes;
+}
+
+int rocksdb_options_get_compression_options_zstd_max_train_bytes(
+    rocksdb_options_t* opt) {
+  return opt->rep.compression_opts.zstd_max_train_bytes;
+}
+
+void rocksdb_options_set_compression_options_use_zstd_dict_trainer(
+    rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer) {
+  opt->rep.compression_opts.use_zstd_dict_trainer = use_zstd_dict_trainer;
+}
+
+unsigned char rocksdb_options_get_compression_options_use_zstd_dict_trainer(
+    rocksdb_options_t* opt) {
+  return opt->rep.compression_opts.use_zstd_dict_trainer;
+}
+
+void rocksdb_options_set_compression_options_parallel_threads(
+    rocksdb_options_t* opt, int value) {
+  opt->rep.compression_opts.parallel_threads = value;
+}
+
+int rocksdb_options_get_compression_options_parallel_threads(
+    rocksdb_options_t* opt) {
+  return opt->rep.compression_opts.parallel_threads;
+}
+
+void rocksdb_options_set_compression_options_max_dict_buffer_bytes(
+    rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes) {
+  opt->rep.compression_opts.max_dict_buffer_bytes = max_dict_buffer_bytes;
+}
+
+uint64_t rocksdb_options_get_compression_options_max_dict_buffer_bytes(
+    rocksdb_options_t* opt) {
+  return opt->rep.compression_opts.max_dict_buffer_bytes;
+}
+
+void rocksdb_options_set_prefix_extractor(
+    rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) {
+  opt->rep.prefix_extractor.reset(prefix_extractor);
+}
+
+void rocksdb_options_set_use_fsync(rocksdb_options_t* opt, int use_fsync) {
+  opt->rep.use_fsync = use_fsync;
+}
+
+int rocksdb_options_get_use_fsync(rocksdb_options_t* opt) {
+  return opt->rep.use_fsync;
+}
+
+void rocksdb_options_set_db_log_dir(rocksdb_options_t* opt,
+                                    const char* db_log_dir) {
+  opt->rep.db_log_dir = db_log_dir;
+}
+
+void rocksdb_options_set_wal_dir(rocksdb_options_t* opt, const char* v) {
+  opt->rep.wal_dir = v;
+}
+
+void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) {
+  opt->rep.WAL_ttl_seconds = ttl;
+}
+
+uint64_t rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t* opt) {
+  return opt->rep.WAL_ttl_seconds;
+}
+
+void rocksdb_options_set_WAL_size_limit_MB(rocksdb_options_t* opt,
+                                           uint64_t limit) {
+  opt->rep.WAL_size_limit_MB = limit;
+}
+
+uint64_t rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t* opt) {
+  return opt->rep.WAL_size_limit_MB;
+}
+
+void rocksdb_options_set_manifest_preallocation_size(rocksdb_options_t* opt,
+                                                     size_t v) {
+  opt->rep.manifest_preallocation_size = v;
+}
+
+size_t rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t* opt) {
+  return opt->rep.manifest_preallocation_size;
+}
+
+void rocksdb_options_set_use_direct_reads(rocksdb_options_t* opt,
+                                          unsigned char v) {
+  opt->rep.use_direct_reads = v;
+}
+
+unsigned char rocksdb_options_get_use_direct_reads(rocksdb_options_t* opt) {
+  return opt->rep.use_direct_reads;
+}
+
+void rocksdb_options_set_use_direct_io_for_flush_and_compaction(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.use_direct_io_for_flush_and_compaction = v;
+}
+
+unsigned char rocksdb_options_get_use_direct_io_for_flush_and_compaction(
+    rocksdb_options_t* opt) {
+  return opt->rep.use_direct_io_for_flush_and_compaction;
+}
+
+void rocksdb_options_set_allow_mmap_reads(rocksdb_options_t* opt,
+                                          unsigned char v) {
+  opt->rep.allow_mmap_reads = v;
+}
+
+unsigned char rocksdb_options_get_allow_mmap_reads(rocksdb_options_t* opt) {
+  return opt->rep.allow_mmap_reads;
+}
+
+void rocksdb_options_set_allow_mmap_writes(rocksdb_options_t* opt,
+                                           unsigned char v) {
+  opt->rep.allow_mmap_writes = v;
+}
+
+unsigned char rocksdb_options_get_allow_mmap_writes(rocksdb_options_t* opt) {
+  return opt->rep.allow_mmap_writes;
+}
+
+void rocksdb_options_set_is_fd_close_on_exec(rocksdb_options_t* opt,
+                                             unsigned char v) {
+  opt->rep.is_fd_close_on_exec = v;
+}
+
+unsigned char rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t* opt) {
+  return opt->rep.is_fd_close_on_exec;
+}
+
+void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt,
+                                               unsigned int v) {
+  opt->rep.stats_dump_period_sec = v;
+}
+
+unsigned int rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t* opt) {
+  return opt->rep.stats_dump_period_sec;
+}
+
+void rocksdb_options_set_stats_persist_period_sec(rocksdb_options_t* opt,
+                                                  unsigned int v) {
+  opt->rep.stats_persist_period_sec = v;
+}
+
+unsigned int rocksdb_options_get_stats_persist_period_sec(
+    rocksdb_options_t* opt) {
+  return opt->rep.stats_persist_period_sec;
+}
+
+void rocksdb_options_set_advise_random_on_open(rocksdb_options_t* opt,
+                                               unsigned char v) {
+  opt->rep.advise_random_on_open = v;
+}
+unsigned char rocksdb_options_get_advise_random_on_open(
+    rocksdb_options_t* opt) {
+  return opt->rep.advise_random_on_open;
+}
+
+void rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t* opt,
+                                                         int v) {
+  switch (v) {
+    case 0:
+      opt->rep.access_hint_on_compaction_start =
+          ROCKSDB_NAMESPACE::Options::NONE;
+      break;
+    case 1:
+      opt->rep.access_hint_on_compaction_start =
+          ROCKSDB_NAMESPACE::Options::NORMAL;
+      break;
+    case 2:
+      opt->rep.access_hint_on_compaction_start =
+          ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
+      break;
+    case 3:
+      opt->rep.access_hint_on_compaction_start =
+          ROCKSDB_NAMESPACE::Options::WILLNEED;
+      break;
+    default:
+      assert(0);
+  }
+}
+
+int rocksdb_options_get_access_hint_on_compaction_start(
+    rocksdb_options_t* opt) {
+  return opt->rep.access_hint_on_compaction_start;
+}
+
+void rocksdb_options_set_use_adaptive_mutex(rocksdb_options_t* opt,
+                                            unsigned char v) {
+  opt->rep.use_adaptive_mutex = v;
+}
+
+unsigned char rocksdb_options_get_use_adaptive_mutex(rocksdb_options_t* opt) {
+  return opt->rep.use_adaptive_mutex;
+}
+
+void rocksdb_options_set_wal_bytes_per_sync(rocksdb_options_t* opt,
+                                            uint64_t v) {
+  opt->rep.wal_bytes_per_sync = v;
+}
+
+uint64_t rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t* opt) {
+  return opt->rep.wal_bytes_per_sync;
+}
+
+void rocksdb_options_set_bytes_per_sync(rocksdb_options_t* opt, uint64_t v) {
+  opt->rep.bytes_per_sync = v;
+}
+
+uint64_t rocksdb_options_get_bytes_per_sync(rocksdb_options_t* opt) {
+  return opt->rep.bytes_per_sync;
+}
+
+void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt,
+                                                       uint64_t v) {
+  opt->rep.writable_file_max_buffer_size = static_cast<size_t>(v);
+}
+
+uint64_t rocksdb_options_get_writable_file_max_buffer_size(
+    rocksdb_options_t* opt) {
+  return opt->rep.writable_file_max_buffer_size;
+}
+
+void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt,
+                                                         unsigned char v) {
+  opt->rep.allow_concurrent_memtable_write = v;
+}
+
+unsigned char rocksdb_options_get_allow_concurrent_memtable_write(
+    rocksdb_options_t* opt) {
+  return opt->rep.allow_concurrent_memtable_write;
+}
+
+void rocksdb_options_set_enable_write_thread_adaptive_yield(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.enable_write_thread_adaptive_yield = v;
+}
+
+unsigned char rocksdb_options_get_enable_write_thread_adaptive_yield(
+    rocksdb_options_t* opt) {
+  return opt->rep.enable_write_thread_adaptive_yield;
+}
+
+void rocksdb_options_set_max_sequential_skip_in_iterations(
+    rocksdb_options_t* opt, uint64_t v) {
+  opt->rep.max_sequential_skip_in_iterations = v;
+}
+
+uint64_t rocksdb_options_get_max_sequential_skip_in_iterations(
+    rocksdb_options_t* opt) {
+  return opt->rep.max_sequential_skip_in_iterations;
+}
+
+void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt,
+                                                 int n) {
+  opt->rep.max_write_buffer_number = n;
+}
+
+int rocksdb_options_get_max_write_buffer_number(rocksdb_options_t* opt) {
+  return opt->rep.max_write_buffer_number;
+}
+
+void rocksdb_options_set_min_write_buffer_number_to_merge(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.min_write_buffer_number_to_merge = n;
+}
+
+int rocksdb_options_get_min_write_buffer_number_to_merge(
+    rocksdb_options_t* opt) {
+  return opt->rep.min_write_buffer_number_to_merge;
+}
+
+void rocksdb_options_set_max_write_buffer_number_to_maintain(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_write_buffer_number_to_maintain = n;
+}
+
+int rocksdb_options_get_max_write_buffer_number_to_maintain(
+    rocksdb_options_t* opt) {
+  return opt->rep.max_write_buffer_number_to_maintain;
+}
+
+void rocksdb_options_set_max_write_buffer_size_to_maintain(
+    rocksdb_options_t* opt, int64_t n) {
+  opt->rep.max_write_buffer_size_to_maintain = n;
+}
+
+int64_t rocksdb_options_get_max_write_buffer_size_to_maintain(
+    rocksdb_options_t* opt) {
+  return opt->rep.max_write_buffer_size_to_maintain;
+}
+
+void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt,
+                                                unsigned char v) {
+  opt->rep.enable_pipelined_write = v;
+}
+
+unsigned char rocksdb_options_get_enable_pipelined_write(
+    rocksdb_options_t* opt) {
+  return opt->rep.enable_pipelined_write;
+}
+
+void rocksdb_options_set_unordered_write(rocksdb_options_t* opt,
+                                         unsigned char v) {
+  opt->rep.unordered_write = v;
+}
+
+unsigned char rocksdb_options_get_unordered_write(rocksdb_options_t* opt) {
+  return opt->rep.unordered_write;
+}
+
+void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt,
+                                            uint32_t n) {
+  opt->rep.max_subcompactions = n;
+}
+
+uint32_t rocksdb_options_get_max_subcompactions(rocksdb_options_t* opt) {
+  return opt->rep.max_subcompactions;
+}
+
+void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) {
+  opt->rep.max_background_jobs = n;
+}
+
+int rocksdb_options_get_max_background_jobs(rocksdb_options_t* opt) {
+  return opt->rep.max_background_jobs;
+}
+
+void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt,
+                                                    int n) {
+  opt->rep.max_background_compactions = n;
+}
+
+int rocksdb_options_get_max_background_compactions(rocksdb_options_t* opt) {
+  return opt->rep.max_background_compactions;
+}
+
+void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) {
+  opt->rep.max_background_flushes = n;
+}
+
+int rocksdb_options_get_max_background_flushes(rocksdb_options_t* opt) {
+  return opt->rep.max_background_flushes;
+}
+
+void rocksdb_options_set_experimental_mempurge_threshold(rocksdb_options_t* opt,
+                                                         double v) {
+  opt->rep.experimental_mempurge_threshold = v;
+}
+
+double rocksdb_options_get_experimental_mempurge_threshold(
+    rocksdb_options_t* opt) {
+  return opt->rep.experimental_mempurge_threshold;
+}
+
+void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) {
+  opt->rep.max_log_file_size = v;
+}
+
+size_t rocksdb_options_get_max_log_file_size(rocksdb_options_t* opt) {
+  return opt->rep.max_log_file_size;
+}
+
+void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt,
+                                               size_t v) {
+  opt->rep.log_file_time_to_roll = v;
+}
+
+size_t rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t* opt) {
+  return opt->rep.log_file_time_to_roll;
+}
+
+void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) {
+  opt->rep.keep_log_file_num = v;
+}
+
+size_t rocksdb_options_get_keep_log_file_num(rocksdb_options_t* opt) {
+  return opt->rep.keep_log_file_num;
+}
+
+void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt,
+                                              size_t v) {
+  opt->rep.recycle_log_file_num = v;
+}
+
+size_t rocksdb_options_get_recycle_log_file_num(rocksdb_options_t* opt) {
+  return opt->rep.recycle_log_file_num;
+}
+
+void rocksdb_options_set_soft_pending_compaction_bytes_limit(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.soft_pending_compaction_bytes_limit = v;
+}
+
+size_t rocksdb_options_get_soft_pending_compaction_bytes_limit(
+    rocksdb_options_t* opt) {
+  return opt->rep.soft_pending_compaction_bytes_limit;
+}
+
+void rocksdb_options_set_hard_pending_compaction_bytes_limit(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.hard_pending_compaction_bytes_limit = v;
+}
+
+size_t rocksdb_options_get_hard_pending_compaction_bytes_limit(
+    rocksdb_options_t* opt) {
+  return opt->rep.hard_pending_compaction_bytes_limit;
+}
+
+void rocksdb_options_set_max_manifest_file_size(rocksdb_options_t* opt,
+                                                size_t v) {
+  opt->rep.max_manifest_file_size = v;
+}
+
+size_t rocksdb_options_get_max_manifest_file_size(rocksdb_options_t* opt) {
+  return opt->rep.max_manifest_file_size;
+}
+
+void rocksdb_options_set_table_cache_numshardbits(rocksdb_options_t* opt,
+                                                  int v) {
+  opt->rep.table_cache_numshardbits = v;
+}
+
+int rocksdb_options_get_table_cache_numshardbits(rocksdb_options_t* opt) {
+  return opt->rep.table_cache_numshardbits;
+}
+
+void rocksdb_options_set_arena_block_size(rocksdb_options_t* opt, size_t v) {
+  opt->rep.arena_block_size = v;
+}
+
+size_t rocksdb_options_get_arena_block_size(rocksdb_options_t* opt) {
+  return opt->rep.arena_block_size;
+}
+
+void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt,
+                                                  int disable) {
+  opt->rep.disable_auto_compactions = disable;
+}
+
+unsigned char rocksdb_options_get_disable_auto_compactions(
+    rocksdb_options_t* opt) {
+  return opt->rep.disable_auto_compactions;
+}
+
+void rocksdb_options_set_optimize_filters_for_hits(rocksdb_options_t* opt,
+                                                   int v) {
+  opt->rep.optimize_filters_for_hits = v;
+}
+
+unsigned char rocksdb_options_get_optimize_filters_for_hits(
+    rocksdb_options_t* opt) {
+  return opt->rep.optimize_filters_for_hits;
+}
+
+void rocksdb_options_set_delete_obsolete_files_period_micros(
+    rocksdb_options_t* opt, uint64_t v) {
+  opt->rep.delete_obsolete_files_period_micros = v;
+}
+
+uint64_t rocksdb_options_get_delete_obsolete_files_period_micros(
+    rocksdb_options_t* opt) {
+  return opt->rep.delete_obsolete_files_period_micros;
+}
+
+void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) {
+  opt->rep.PrepareForBulkLoad();
+}
+
+void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t* opt) {
+  opt->rep.memtable_factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory);
+}
+
+void rocksdb_options_set_memtable_prefix_bloom_size_ratio(
+    rocksdb_options_t* opt, double v) {
+  opt->rep.memtable_prefix_bloom_size_ratio = v;
+}
+
+double rocksdb_options_get_memtable_prefix_bloom_size_ratio(
+    rocksdb_options_t* opt) {
+  return opt->rep.memtable_prefix_bloom_size_ratio;
+}
+
+void rocksdb_options_set_memtable_huge_page_size(rocksdb_options_t* opt,
+                                                 size_t v) {
+  opt->rep.memtable_huge_page_size = v;
+}
+
+size_t rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t* opt) {
+  return opt->rep.memtable_huge_page_size;
+}
+
+void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t* opt,
+                                            size_t bucket_count,
+                                            int32_t skiplist_height,
+                                            int32_t skiplist_branching_factor) {
+  ROCKSDB_NAMESPACE::MemTableRepFactory* factory =
+      ROCKSDB_NAMESPACE::NewHashSkipListRepFactory(
+          bucket_count, skiplist_height, skiplist_branching_factor);
+  opt->rep.memtable_factory.reset(factory);
+}
+
+void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t* opt,
+                                            size_t bucket_count) {
+  opt->rep.memtable_factory.reset(
+      ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count));
+}
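+
+// Usage sketch (editorial illustration, not part of the vendored file):
+// pairing a fixed-prefix extractor with the plain-table factory defined
+// below, a combination aimed at point lookups over mmap-able data; the
+// concrete parameter values here are arbitrary.
+//
+//   rocksdb_options_t* opts = rocksdb_options_create();
+//   rocksdb_options_set_prefix_extractor(
+//       opts, rocksdb_slicetransform_create_fixed_prefix(8));
+//   rocksdb_options_set_plain_table_factory(opts, 0 /* variable key len */,
+//                                           10 /* bloom bits per key */,
+//                                           0.75 /* hash table ratio */,
+//                                           16 /* index sparseness */);
+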
+void rocksdb_options_set_plain_table_factory(rocksdb_options_t* opt,
+                                             uint32_t user_key_len,
+                                             int bloom_bits_per_key,
+                                             double hash_table_ratio,
+                                             size_t index_sparseness) {
+  ROCKSDB_NAMESPACE::PlainTableOptions options;
+  options.user_key_len = user_key_len;
+  options.bloom_bits_per_key = bloom_bits_per_key;
+  options.hash_table_ratio = hash_table_ratio;
+  options.index_sparseness = index_sparseness;
+
+  ROCKSDB_NAMESPACE::TableFactory* factory =
+      ROCKSDB_NAMESPACE::NewPlainTableFactory(options);
+  opt->rep.table_factory.reset(factory);
+}
+
+void rocksdb_options_set_max_successive_merges(rocksdb_options_t* opt,
+                                               size_t v) {
+  opt->rep.max_successive_merges = v;
+}
+
+size_t rocksdb_options_get_max_successive_merges(rocksdb_options_t* opt) {
+  return opt->rep.max_successive_merges;
+}
+
+void rocksdb_options_set_bloom_locality(rocksdb_options_t* opt, uint32_t v) {
+  opt->rep.bloom_locality = v;
+}
+
+uint32_t rocksdb_options_get_bloom_locality(rocksdb_options_t* opt) {
+  return opt->rep.bloom_locality;
+}
+
+void rocksdb_options_set_inplace_update_support(rocksdb_options_t* opt,
+                                                unsigned char v) {
+  opt->rep.inplace_update_support = v;
+}
+
+unsigned char rocksdb_options_get_inplace_update_support(
+    rocksdb_options_t* opt) {
+  return opt->rep.inplace_update_support;
+}
+
+void rocksdb_options_set_inplace_update_num_locks(rocksdb_options_t* opt,
+                                                  size_t v) {
+  opt->rep.inplace_update_num_locks = v;
+}
+
+size_t rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t* opt) {
+  return opt->rep.inplace_update_num_locks;
+}
+
+void rocksdb_options_set_report_bg_io_stats(rocksdb_options_t* opt, int v) {
+  opt->rep.report_bg_io_stats = v;
+}
+
+unsigned char rocksdb_options_get_report_bg_io_stats(rocksdb_options_t* opt) {
+  return opt->rep.report_bg_io_stats;
+}
+
+void rocksdb_options_set_compaction_style(rocksdb_options_t* opt, int style) {
+  opt->rep.compaction_style =
+      static_cast<ROCKSDB_NAMESPACE::CompactionStyle>(style);
+}
+
+int rocksdb_options_get_compaction_style(rocksdb_options_t* opt) {
+  return opt->rep.compaction_style;
+}
+
+void rocksdb_options_set_universal_compaction_options(
+    rocksdb_options_t* opt, rocksdb_universal_compaction_options_t* uco) {
+  opt->rep.compaction_options_universal = *(uco->rep);
+}
+
+void rocksdb_options_set_fifo_compaction_options(
+    rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo) {
+  opt->rep.compaction_options_fifo = fifo->rep;
+}
+
+char* rocksdb_options_statistics_get_string(rocksdb_options_t* opt) {
+  ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get();
+  if (statistics) {
+    return strdup(statistics->ToString().c_str());
+  }
+  return nullptr;
+}
+
+void rocksdb_options_set_ratelimiter(rocksdb_options_t* opt,
+                                     rocksdb_ratelimiter_t* limiter) {
+  if (limiter) {
+    opt->rep.rate_limiter = limiter->rep;
+  }
+}
+
+void rocksdb_options_set_atomic_flush(rocksdb_options_t* opt,
+                                      unsigned char atomic_flush) {
+  opt->rep.atomic_flush = atomic_flush;
+}
+
+unsigned char rocksdb_options_get_atomic_flush(rocksdb_options_t* opt) {
+  return opt->rep.atomic_flush;
+}
+
+void rocksdb_options_set_manual_wal_flush(rocksdb_options_t* opt,
+                                          unsigned char manual_wal_flush) {
+  opt->rep.manual_wal_flush = manual_wal_flush;
+}
+
+unsigned char rocksdb_options_get_manual_wal_flush(rocksdb_options_t* opt) {
+  return opt->rep.manual_wal_flush;
+}
+
+void rocksdb_options_set_wal_compression(rocksdb_options_t* opt, int val) {
+  opt->rep.wal_compression = static_cast<CompressionType>(val);
+}
+
+int rocksdb_options_get_wal_compression(rocksdb_options_t* opt) {
+  return opt->rep.wal_compression;
+}
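+
+// Usage sketch (editorial illustration, not part of the vendored file):
+// capping compaction and flush I/O with the generic rate limiter created
+// by rocksdb_ratelimiter_create() below; the options object keeps its own
+// shared_ptr reference, so the wrapper can be destroyed after attaching.
+//
+//   rocksdb_ratelimiter_t* rl = rocksdb_ratelimiter_create(
+//       16 << 20 /* 16 MiB/s */, 100 * 1000 /* 100 ms refill period */,
+//       10 /* fairness */);
+//   rocksdb_options_set_ratelimiter(opts, rl);
+//   rocksdb_ratelimiter_destroy(rl);
+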
+          fairness));
+  return rate_limiter;
+}
+
+void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t* limiter) {
+  delete limiter;
+}
+
+void rocksdb_options_set_row_cache(rocksdb_options_t* opt,
+                                   rocksdb_cache_t* cache) {
+  if (cache) {
+    opt->rep.row_cache = cache->rep;
+  }
+}
+
+void rocksdb_options_add_compact_on_deletion_collector_factory(
+    rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger) {
+  std::shared_ptr<ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory>
+      compact_on_del =
+          NewCompactOnDeletionCollectorFactory(window_size, num_dels_trigger);
+  opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
+}
+
+void rocksdb_set_perf_level(int v) {
+  PerfLevel level = static_cast<PerfLevel>(v);
+  SetPerfLevel(level);
+}
+
+rocksdb_perfcontext_t* rocksdb_perfcontext_create() {
+  rocksdb_perfcontext_t* context = new rocksdb_perfcontext_t;
+  context->rep = ROCKSDB_NAMESPACE::get_perf_context();
+  return context;
+}
+
+void rocksdb_perfcontext_reset(rocksdb_perfcontext_t* context) {
+  context->rep->Reset();
+}
+
+char* rocksdb_perfcontext_report(rocksdb_perfcontext_t* context,
+                                 unsigned char exclude_zero_counters) {
+  return strdup(context->rep->ToString(exclude_zero_counters).c_str());
+}
+
+uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context,
+                                    int metric) {
+  PerfContext* rep = context->rep;
+  switch (metric) {
+    case rocksdb_user_key_comparison_count:
+      return rep->user_key_comparison_count;
+    case rocksdb_block_cache_hit_count:
+      return rep->block_cache_hit_count;
+    case rocksdb_block_read_count:
+      return rep->block_read_count;
+    case rocksdb_block_read_byte:
+      return rep->block_read_byte;
+    case rocksdb_block_read_time:
+      return rep->block_read_time;
+    case rocksdb_block_checksum_time:
+      return rep->block_checksum_time;
+    case rocksdb_block_decompress_time:
+      return rep->block_decompress_time;
+    case rocksdb_get_read_bytes:
+      return rep->get_read_bytes;
+    case rocksdb_multiget_read_bytes:
+      return rep->multiget_read_bytes;
+    case rocksdb_iter_read_bytes:
+      return rep->iter_read_bytes;
+    case rocksdb_internal_key_skipped_count:
+      return rep->internal_key_skipped_count;
+    case rocksdb_internal_delete_skipped_count:
+      return rep->internal_delete_skipped_count;
+    case rocksdb_internal_recent_skipped_count:
+      return rep->internal_recent_skipped_count;
+    case rocksdb_internal_merge_count:
+      return rep->internal_merge_count;
+    case rocksdb_get_snapshot_time:
+      return rep->get_snapshot_time;
+    case rocksdb_get_from_memtable_time:
+      return rep->get_from_memtable_time;
+    case rocksdb_get_from_memtable_count:
+      return rep->get_from_memtable_count;
+    case rocksdb_get_post_process_time:
+      return rep->get_post_process_time;
+    case rocksdb_get_from_output_files_time:
+      return rep->get_from_output_files_time;
+    case rocksdb_seek_on_memtable_time:
+      return rep->seek_on_memtable_time;
+    case rocksdb_seek_on_memtable_count:
+      return rep->seek_on_memtable_count;
+    case rocksdb_next_on_memtable_count:
+      return rep->next_on_memtable_count;
+    case rocksdb_prev_on_memtable_count:
+      return rep->prev_on_memtable_count;
+    case rocksdb_seek_child_seek_time:
+      return rep->seek_child_seek_time;
+    case rocksdb_seek_child_seek_count:
+      return rep->seek_child_seek_count;
+    case rocksdb_seek_min_heap_time:
+      return rep->seek_min_heap_time;
+    case rocksdb_seek_max_heap_time:
+      return rep->seek_max_heap_time;
+    case rocksdb_seek_internal_seek_time:
+      return rep->seek_internal_seek_time;
+    case rocksdb_find_next_user_entry_time:
+      return rep->find_next_user_entry_time;
+    case rocksdb_write_wal_time:
+      return rep->write_wal_time;
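+    // Usage sketch for the perf-context API above (not upstream code;
+    // hypothetical caller). Perf counters are collected per thread once
+    // the perf level is raised:
+    //
+    //   rocksdb_set_perf_level(rocksdb_enable_time);
+    //   rocksdb_perfcontext_t* ctx = rocksdb_perfcontext_create();
+    //   rocksdb_perfcontext_reset(ctx);
+    //   /* ... issue reads/writes on this thread ... */
+    //   uint64_t n = rocksdb_perfcontext_metric(ctx, rocksdb_block_read_count);
+    //   char* report = rocksdb_perfcontext_report(ctx, 1);
+    //   free(report);
+    //   rocksdb_perfcontext_destroy(ctx);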
case rocksdb_write_memtable_time: + return rep->write_memtable_time; + case rocksdb_write_delay_time: + return rep->write_delay_time; + case rocksdb_write_pre_and_post_process_time: + return rep->write_pre_and_post_process_time; + case rocksdb_db_mutex_lock_nanos: + return rep->db_mutex_lock_nanos; + case rocksdb_db_condition_wait_nanos: + return rep->db_condition_wait_nanos; + case rocksdb_merge_operator_time_nanos: + return rep->merge_operator_time_nanos; + case rocksdb_read_index_block_nanos: + return rep->read_index_block_nanos; + case rocksdb_read_filter_block_nanos: + return rep->read_filter_block_nanos; + case rocksdb_new_table_block_iter_nanos: + return rep->new_table_block_iter_nanos; + case rocksdb_new_table_iterator_nanos: + return rep->new_table_iterator_nanos; + case rocksdb_block_seek_nanos: + return rep->block_seek_nanos; + case rocksdb_find_table_nanos: + return rep->find_table_nanos; + case rocksdb_bloom_memtable_hit_count: + return rep->bloom_memtable_hit_count; + case rocksdb_bloom_memtable_miss_count: + return rep->bloom_memtable_miss_count; + case rocksdb_bloom_sst_hit_count: + return rep->bloom_sst_hit_count; + case rocksdb_bloom_sst_miss_count: + return rep->bloom_sst_miss_count; + case rocksdb_key_lock_wait_time: + return rep->key_lock_wait_time; + case rocksdb_key_lock_wait_count: + return rep->key_lock_wait_count; + case rocksdb_env_new_sequential_file_nanos: + return rep->env_new_sequential_file_nanos; + case rocksdb_env_new_random_access_file_nanos: + return rep->env_new_random_access_file_nanos; + case rocksdb_env_new_writable_file_nanos: + return rep->env_new_writable_file_nanos; + case rocksdb_env_reuse_writable_file_nanos: + return rep->env_reuse_writable_file_nanos; + case rocksdb_env_new_random_rw_file_nanos: + return rep->env_new_random_rw_file_nanos; + case rocksdb_env_new_directory_nanos: + return rep->env_new_directory_nanos; + case rocksdb_env_file_exists_nanos: + return rep->env_file_exists_nanos; + case rocksdb_env_get_children_nanos: + return rep->env_get_children_nanos; + case rocksdb_env_get_children_file_attributes_nanos: + return rep->env_get_children_file_attributes_nanos; + case rocksdb_env_delete_file_nanos: + return rep->env_delete_file_nanos; + case rocksdb_env_create_dir_nanos: + return rep->env_create_dir_nanos; + case rocksdb_env_create_dir_if_missing_nanos: + return rep->env_create_dir_if_missing_nanos; + case rocksdb_env_delete_dir_nanos: + return rep->env_delete_dir_nanos; + case rocksdb_env_get_file_size_nanos: + return rep->env_get_file_size_nanos; + case rocksdb_env_get_file_modification_time_nanos: + return rep->env_get_file_modification_time_nanos; + case rocksdb_env_rename_file_nanos: + return rep->env_rename_file_nanos; + case rocksdb_env_link_file_nanos: + return rep->env_link_file_nanos; + case rocksdb_env_lock_file_nanos: + return rep->env_lock_file_nanos; + case rocksdb_env_unlock_file_nanos: + return rep->env_unlock_file_nanos; + case rocksdb_env_new_logger_nanos: + return rep->env_new_logger_nanos; + case rocksdb_number_async_seek: + return rep->number_async_seek; + case rocksdb_blob_cache_hit_count: + return rep->blob_cache_hit_count; + case rocksdb_blob_read_count: + return rep->blob_read_count; + case rocksdb_blob_read_byte: + return rep->blob_read_byte; + case rocksdb_blob_read_time: + return rep->blob_read_time; + case rocksdb_blob_checksum_time: + return rep->blob_checksum_time; + case rocksdb_blob_decompress_time: + return rep->blob_decompress_time; + case rocksdb_internal_range_del_reseek_count: + return 
rep->internal_range_del_reseek_count; + case rocksdb_block_read_cpu_time: + return rep->block_read_cpu_time; + default: + break; + } + return 0; +} + +void rocksdb_perfcontext_destroy(rocksdb_perfcontext_t* context) { + delete context; +} + +/* +TODO: +DB::OpenForReadOnly +DB::KeyMayExist +DB::GetOptions +DB::GetSortedWalFiles +DB::GetLatestSequenceNumber +DB::GetUpdatesSince +DB::GetDbIdentity +DB::RunManualCompaction +custom cache +table_properties_collectors +*/ + +rocksdb_compactionfilter_t* rocksdb_compactionfilter_create( + void* state, void (*destructor)(void*), + unsigned char (*filter)(void*, int level, const char* key, + size_t key_length, const char* existing_value, + size_t value_length, char** new_value, + size_t* new_value_length, + unsigned char* value_changed), + const char* (*name)(void*)) { + rocksdb_compactionfilter_t* result = new rocksdb_compactionfilter_t; + result->state_ = state; + result->destructor_ = destructor; + result->filter_ = filter; + result->ignore_snapshots_ = true; + result->name_ = name; + return result; +} + +void rocksdb_compactionfilter_set_ignore_snapshots( + rocksdb_compactionfilter_t* filter, unsigned char whether_ignore) { + filter->ignore_snapshots_ = whether_ignore; +} + +void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t* filter) { + delete filter; +} + +unsigned char rocksdb_compactionfiltercontext_is_full_compaction( + rocksdb_compactionfiltercontext_t* context) { + return context->rep.is_full_compaction; +} + +unsigned char rocksdb_compactionfiltercontext_is_manual_compaction( + rocksdb_compactionfiltercontext_t* context) { + return context->rep.is_manual_compaction; +} + +rocksdb_compactionfilterfactory_t* rocksdb_compactionfilterfactory_create( + void* state, void (*destructor)(void*), + rocksdb_compactionfilter_t* (*create_compaction_filter)( + void*, rocksdb_compactionfiltercontext_t* context), + const char* (*name)(void*)) { + rocksdb_compactionfilterfactory_t* result = + new rocksdb_compactionfilterfactory_t; + result->state_ = state; + result->destructor_ = destructor; + result->create_compaction_filter_ = create_compaction_filter; + result->name_ = name; + return result; +} + +void rocksdb_compactionfilterfactory_destroy( + rocksdb_compactionfilterfactory_t* factory) { + delete factory; +} + +rocksdb_comparator_t* rocksdb_comparator_create( + void* state, void (*destructor)(void*), + int (*compare)(void*, const char* a, size_t alen, const char* b, + size_t blen), + const char* (*name)(void*)) { + rocksdb_comparator_t* result = new rocksdb_comparator_t; + result->state_ = state; + result->destructor_ = destructor; + result->compare_ = compare; + result->name_ = name; + result->compare_ts_ = nullptr; + result->compare_without_ts_ = nullptr; + return result; +} + +void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) { delete cmp; } + +rocksdb_comparator_t* rocksdb_comparator_with_ts_create( + void* state, void (*destructor)(void*), + int (*compare)(void*, const char* a, size_t alen, const char* b, + size_t blen), + int (*compare_ts)(void*, const char* a_ts, size_t a_tslen, const char* b_ts, + size_t b_tslen), + int (*compare_without_ts)(void*, const char* a, size_t alen, + unsigned char a_has_ts, const char* b, + size_t blen, unsigned char b_has_ts), + const char* (*name)(void*), size_t timestamp_size) { + rocksdb_comparator_t* result = new rocksdb_comparator_t(timestamp_size); + result->state_ = state; + result->destructor_ = destructor; + result->compare_ = compare; + result->compare_ts_ = compare_ts; + 
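+  // Usage sketch (not upstream code): the C callbacks mirror
+  // rocksdb::Comparator. A minimal bytewise comparator without
+  // timestamps, using hypothetical helper names:
+  //
+  //   static int cmp(void* s, const char* a, size_t alen,
+  //                  const char* b, size_t blen) {
+  //     size_t n = alen < blen ? alen : blen;
+  //     int r = memcmp(a, b, n);
+  //     return r != 0 ? r : (alen < blen ? -1 : (alen > blen ? 1 : 0));
+  //   }
+  //   static const char* cmp_name(void* s) { return "example.bytewise"; }
+  //   static void cmp_noop_destroy(void* s) {}
+  //
+  //   rocksdb_comparator_t* c = rocksdb_comparator_create(
+  //       NULL, cmp_noop_destroy, cmp, cmp_name);
+  //   rocksdb_options_set_comparator(opts, c);
+  //
+  // A no-op destructor is passed rather than NULL because destruction
+  // invokes the callback unconditionally.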
result->compare_without_ts_ = compare_without_ts; + result->name_ = name; + return result; +} + +void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) { + delete filter; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format( + double bits_per_key, bool original_format) { + // Make a rocksdb_filterpolicy_t, but override all of its methods so + // they delegate to a NewBloomFilterPolicy() instead of user + // supplied C functions. + struct Wrapper : public rocksdb_filterpolicy_t { + const FilterPolicy* rep_; + ~Wrapper() override { delete rep_; } + const char* Name() const override { return rep_->Name(); } + const char* CompatibilityName() const override { + return rep_->CompatibilityName(); + } + // No need to override GetFilterBitsBuilder if this one is overridden + ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext( + const ROCKSDB_NAMESPACE::FilterBuildingContext& context) + const override { + return rep_->GetBuilderWithContext(context); + } + ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader( + const Slice& contents) const override { + return rep_->GetFilterBitsReader(contents); + } + static void DoNothing(void*) {} + }; + Wrapper* wrapper = new Wrapper; + wrapper->rep_ = NewBloomFilterPolicy(bits_per_key, original_format); + wrapper->state_ = nullptr; + wrapper->destructor_ = &Wrapper::DoNothing; + return wrapper; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full( + double bits_per_key) { + return rocksdb_filterpolicy_create_bloom_format(bits_per_key, false); +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(double bits_per_key) { + return rocksdb_filterpolicy_create_bloom_format(bits_per_key, true); +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_format( + double bloom_equivalent_bits_per_key, int bloom_before_level) { + // Make a rocksdb_filterpolicy_t, but override all of its methods so + // they delegate to a NewRibbonFilterPolicy() instead of user + // supplied C functions. 
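+  // Usage sketch (not upstream code): a filter policy is consumed through
+  // the block-based table options, e.g.
+  //
+  //   rocksdb_block_based_table_options_t* bbto =
+  //       rocksdb_block_based_options_create();
+  //   rocksdb_block_based_options_set_filter_policy(
+  //       bbto, rocksdb_filterpolicy_create_bloom(10.0));
+  //   rocksdb_options_set_block_based_table_factory(opts, bbto);
+  //
+  // Ownership of the policy passes to the table options, so the caller
+  // must not destroy it separately after this call.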
+ struct Wrapper : public rocksdb_filterpolicy_t { + const FilterPolicy* rep_; + ~Wrapper() override { delete rep_; } + const char* Name() const override { return rep_->Name(); } + const char* CompatibilityName() const override { + return rep_->CompatibilityName(); + } + ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext( + const ROCKSDB_NAMESPACE::FilterBuildingContext& context) + const override { + return rep_->GetBuilderWithContext(context); + } + ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader( + const Slice& contents) const override { + return rep_->GetFilterBitsReader(contents); + } + static void DoNothing(void*) {} + }; + Wrapper* wrapper = new Wrapper; + wrapper->rep_ = + NewRibbonFilterPolicy(bloom_equivalent_bits_per_key, bloom_before_level); + wrapper->state_ = nullptr; + wrapper->destructor_ = &Wrapper::DoNothing; + return wrapper; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon( + double bloom_equivalent_bits_per_key) { + return rocksdb_filterpolicy_create_ribbon_format( + bloom_equivalent_bits_per_key, /*bloom_before_level = disabled*/ -1); +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_hybrid( + double bloom_equivalent_bits_per_key, int bloom_before_level) { + return rocksdb_filterpolicy_create_ribbon_format( + bloom_equivalent_bits_per_key, bloom_before_level); +} + +rocksdb_mergeoperator_t* rocksdb_mergeoperator_create( + void* state, void (*destructor)(void*), + char* (*full_merge)(void*, const char* key, size_t key_length, + const char* existing_value, + size_t existing_value_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + char* (*partial_merge)(void*, const char* key, size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + void (*delete_value)(void*, const char* value, size_t value_length), + const char* (*name)(void*)) { + rocksdb_mergeoperator_t* result = new rocksdb_mergeoperator_t; + result->state_ = state; + result->destructor_ = destructor; + result->full_merge_ = full_merge; + result->partial_merge_ = partial_merge; + result->delete_value_ = delete_value; + result->name_ = name; + return result; +} + +void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t* merge_operator) { + delete merge_operator; +} + +rocksdb_readoptions_t* rocksdb_readoptions_create() { + return new rocksdb_readoptions_t; +} + +void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) { delete opt; } + +void rocksdb_readoptions_set_verify_checksums(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.verify_checksums = v; +} + +unsigned char rocksdb_readoptions_get_verify_checksums( + rocksdb_readoptions_t* opt) { + return opt->rep.verify_checksums; +} + +void rocksdb_readoptions_set_fill_cache(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.fill_cache = v; +} + +unsigned char rocksdb_readoptions_get_fill_cache(rocksdb_readoptions_t* opt) { + return opt->rep.fill_cache; +} + +void rocksdb_readoptions_set_snapshot(rocksdb_readoptions_t* opt, + const rocksdb_snapshot_t* snap) { + opt->rep.snapshot = (snap ? 
snap->rep : nullptr);
+}
+
+void rocksdb_readoptions_set_iterate_upper_bound(rocksdb_readoptions_t* opt,
+                                                 const char* key,
+                                                 size_t keylen) {
+  if (key == nullptr) {
+    opt->upper_bound = Slice();
+    opt->rep.iterate_upper_bound = nullptr;
+
+  } else {
+    opt->upper_bound = Slice(key, keylen);
+    opt->rep.iterate_upper_bound = &opt->upper_bound;
+  }
+}
+
+void rocksdb_readoptions_set_iterate_lower_bound(rocksdb_readoptions_t* opt,
+                                                 const char* key,
+                                                 size_t keylen) {
+  if (key == nullptr) {
+    opt->lower_bound = Slice();
+    opt->rep.iterate_lower_bound = nullptr;
+  } else {
+    opt->lower_bound = Slice(key, keylen);
+    opt->rep.iterate_lower_bound = &opt->lower_bound;
+  }
+}
+
+void rocksdb_readoptions_set_read_tier(rocksdb_readoptions_t* opt, int v) {
+  opt->rep.read_tier = static_cast<ROCKSDB_NAMESPACE::ReadTier>(v);
+}
+
+int rocksdb_readoptions_get_read_tier(rocksdb_readoptions_t* opt) {
+  return static_cast<int>(opt->rep.read_tier);
+}
+
+void rocksdb_readoptions_set_tailing(rocksdb_readoptions_t* opt,
+                                     unsigned char v) {
+  opt->rep.tailing = v;
+}
+
+unsigned char rocksdb_readoptions_get_tailing(rocksdb_readoptions_t* opt) {
+  return opt->rep.tailing;
+}
+
+void rocksdb_readoptions_set_managed(rocksdb_readoptions_t* opt,
+                                     unsigned char v) {
+  opt->rep.managed = v;
+}
+
+void rocksdb_readoptions_set_readahead_size(rocksdb_readoptions_t* opt,
+                                            size_t v) {
+  opt->rep.readahead_size = v;
+}
+
+size_t rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t* opt) {
+  return opt->rep.readahead_size;
+}
+
+void rocksdb_readoptions_set_prefix_same_as_start(rocksdb_readoptions_t* opt,
+                                                  unsigned char v) {
+  opt->rep.prefix_same_as_start = v;
+}
+
+unsigned char rocksdb_readoptions_get_prefix_same_as_start(
+    rocksdb_readoptions_t* opt) {
+  return opt->rep.prefix_same_as_start;
+}
+
+void rocksdb_readoptions_set_pin_data(rocksdb_readoptions_t* opt,
+                                      unsigned char v) {
+  opt->rep.pin_data = v;
+}
+
+unsigned char rocksdb_readoptions_get_pin_data(rocksdb_readoptions_t* opt) {
+  return opt->rep.pin_data;
+}
+
+void rocksdb_readoptions_set_total_order_seek(rocksdb_readoptions_t* opt,
+                                              unsigned char v) {
+  opt->rep.total_order_seek = v;
+}
+
+unsigned char rocksdb_readoptions_get_total_order_seek(
+    rocksdb_readoptions_t* opt) {
+  return opt->rep.total_order_seek;
+}
+
+void rocksdb_readoptions_set_max_skippable_internal_keys(
+    rocksdb_readoptions_t* opt, uint64_t v) {
+  opt->rep.max_skippable_internal_keys = v;
+}
+
+uint64_t rocksdb_readoptions_get_max_skippable_internal_keys(
+    rocksdb_readoptions_t* opt) {
+  return opt->rep.max_skippable_internal_keys;
+}
+
+void rocksdb_readoptions_set_background_purge_on_iterator_cleanup(
+    rocksdb_readoptions_t* opt, unsigned char v) {
+  opt->rep.background_purge_on_iterator_cleanup = v;
+}
+
+unsigned char rocksdb_readoptions_get_background_purge_on_iterator_cleanup(
+    rocksdb_readoptions_t* opt) {
+  return opt->rep.background_purge_on_iterator_cleanup;
+}
+
+void rocksdb_readoptions_set_ignore_range_deletions(rocksdb_readoptions_t* opt,
+                                                    unsigned char v) {
+  opt->rep.ignore_range_deletions = v;
+}
+
+unsigned char rocksdb_readoptions_get_ignore_range_deletions(
+    rocksdb_readoptions_t* opt) {
+  return opt->rep.ignore_range_deletions;
+}
+
+void rocksdb_readoptions_set_deadline(rocksdb_readoptions_t* opt,
+                                      uint64_t microseconds) {
+  opt->rep.deadline = std::chrono::microseconds(microseconds);
+}
+
+uint64_t rocksdb_readoptions_get_deadline(rocksdb_readoptions_t* opt) {
+  return opt->rep.deadline.count();
+}
+
+void rocksdb_readoptions_set_io_timeout(rocksdb_readoptions_t* opt,
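+    // Usage sketch for the iterate-bound setters above (not upstream code;
+    // assumes an already-opened rocksdb_t* db). Note the options object
+    // stores a Slice pointing at the caller's buffer, so that buffer must
+    // outlive any reads that use these options. A range scan over
+    // ["a", "b"):
+    //
+    //   rocksdb_readoptions_t* ro = rocksdb_readoptions_create();
+    //   rocksdb_readoptions_set_iterate_upper_bound(ro, "b", 1);
+    //   rocksdb_iterator_t* it = rocksdb_create_iterator(db, ro);
+    //   for (rocksdb_iter_seek(it, "a", 1); rocksdb_iter_valid(it);
+    //        rocksdb_iter_next(it)) {
+    //     size_t klen;
+    //     const char* k = rocksdb_iter_key(it, &klen);
+    //   }
+    //   rocksdb_iter_destroy(it);
+    //   rocksdb_readoptions_destroy(ro);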
+                                        uint64_t microseconds) {
+  opt->rep.io_timeout = std::chrono::microseconds(microseconds);
+}
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) {
+  return opt->rep.io_timeout.count();
+}
+
+void rocksdb_readoptions_set_async_io(rocksdb_readoptions_t* opt,
+                                      unsigned char v) {
+  opt->rep.async_io = v;
+}
+
+unsigned char rocksdb_readoptions_get_async_io(rocksdb_readoptions_t* opt) {
+  return opt->rep.async_io;
+}
+
+void rocksdb_readoptions_set_timestamp(rocksdb_readoptions_t* opt,
+                                       const char* ts, size_t tslen) {
+  if (ts == nullptr) {
+    opt->timestamp = Slice();
+    opt->rep.timestamp = nullptr;
+  } else {
+    opt->timestamp = Slice(ts, tslen);
+    opt->rep.timestamp = &opt->timestamp;
+  }
+}
+
+void rocksdb_readoptions_set_iter_start_ts(rocksdb_readoptions_t* opt,
+                                           const char* ts, size_t tslen) {
+  if (ts == nullptr) {
+    opt->iter_start_ts = Slice();
+    opt->rep.iter_start_ts = nullptr;
+  } else {
+    opt->iter_start_ts = Slice(ts, tslen);
+    opt->rep.iter_start_ts = &opt->iter_start_ts;
+  }
+}
+
+rocksdb_writeoptions_t* rocksdb_writeoptions_create() {
+  return new rocksdb_writeoptions_t;
+}
+
+void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) { delete opt; }
+
+void rocksdb_writeoptions_set_sync(rocksdb_writeoptions_t* opt,
+                                   unsigned char v) {
+  opt->rep.sync = v;
+}
+
+unsigned char rocksdb_writeoptions_get_sync(rocksdb_writeoptions_t* opt) {
+  return opt->rep.sync;
+}
+
+void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt,
+                                      int disable) {
+  opt->rep.disableWAL = disable;
+}
+
+unsigned char rocksdb_writeoptions_get_disable_WAL(
+    rocksdb_writeoptions_t* opt) {
+  return opt->rep.disableWAL;
+}
+
+void rocksdb_writeoptions_set_ignore_missing_column_families(
+    rocksdb_writeoptions_t* opt, unsigned char v) {
+  opt->rep.ignore_missing_column_families = v;
+}
+
+unsigned char rocksdb_writeoptions_get_ignore_missing_column_families(
+    rocksdb_writeoptions_t* opt) {
+  return opt->rep.ignore_missing_column_families;
+}
+
+void rocksdb_writeoptions_set_no_slowdown(rocksdb_writeoptions_t* opt,
+                                          unsigned char v) {
+  opt->rep.no_slowdown = v;
+}
+
+unsigned char rocksdb_writeoptions_get_no_slowdown(
+    rocksdb_writeoptions_t* opt) {
+  return opt->rep.no_slowdown;
+}
+
+void rocksdb_writeoptions_set_low_pri(rocksdb_writeoptions_t* opt,
+                                      unsigned char v) {
+  opt->rep.low_pri = v;
+}
+
+unsigned char rocksdb_writeoptions_get_low_pri(rocksdb_writeoptions_t* opt) {
+  return opt->rep.low_pri;
+}
+
+void rocksdb_writeoptions_set_memtable_insert_hint_per_batch(
+    rocksdb_writeoptions_t* opt, unsigned char v) {
+  opt->rep.memtable_insert_hint_per_batch = v;
+}
+
+unsigned char rocksdb_writeoptions_get_memtable_insert_hint_per_batch(
+    rocksdb_writeoptions_t* opt) {
+  return opt->rep.memtable_insert_hint_per_batch;
+}
+
+rocksdb_compactoptions_t* rocksdb_compactoptions_create() {
+  return new rocksdb_compactoptions_t;
+}
+
+void rocksdb_compactoptions_destroy(rocksdb_compactoptions_t* opt) {
+  delete opt;
+}
+
+void rocksdb_compactoptions_set_bottommost_level_compaction(
+    rocksdb_compactoptions_t* opt, unsigned char v) {
+  opt->rep.bottommost_level_compaction =
+      static_cast<BottommostLevelCompaction>(v);
+}
+
+unsigned char rocksdb_compactoptions_get_bottommost_level_compaction(
+    rocksdb_compactoptions_t* opt) {
+  return static_cast<unsigned char>(opt->rep.bottommost_level_compaction);
+}
+
+void rocksdb_compactoptions_set_exclusive_manual_compaction(
+    rocksdb_compactoptions_t* opt, unsigned char v) {
+  opt->rep.exclusive_manual_compaction = v;
+}
+
+unsigned char
rocksdb_compactoptions_get_exclusive_manual_compaction( + rocksdb_compactoptions_t* opt) { + return opt->rep.exclusive_manual_compaction; +} + +void rocksdb_compactoptions_set_change_level(rocksdb_compactoptions_t* opt, + unsigned char v) { + opt->rep.change_level = v; +} + +unsigned char rocksdb_compactoptions_get_change_level( + rocksdb_compactoptions_t* opt) { + return opt->rep.change_level; +} + +void rocksdb_compactoptions_set_target_level(rocksdb_compactoptions_t* opt, + int n) { + opt->rep.target_level = n; +} + +int rocksdb_compactoptions_get_target_level(rocksdb_compactoptions_t* opt) { + return opt->rep.target_level; +} + +void rocksdb_compactoptions_set_full_history_ts_low( + rocksdb_compactoptions_t* opt, char* ts, size_t tslen) { + if (ts == nullptr) { + opt->full_history_ts_low = Slice(); + opt->rep.full_history_ts_low = nullptr; + } else { + opt->full_history_ts_low = Slice(ts, tslen); + opt->rep.full_history_ts_low = &opt->full_history_ts_low; + } +} + +rocksdb_flushoptions_t* rocksdb_flushoptions_create() { + return new rocksdb_flushoptions_t; +} + +void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) { delete opt; } + +void rocksdb_flushoptions_set_wait(rocksdb_flushoptions_t* opt, + unsigned char v) { + opt->rep.wait = v; +} + +unsigned char rocksdb_flushoptions_get_wait(rocksdb_flushoptions_t* opt) { + return opt->rep.wait; +} + +rocksdb_memory_allocator_t* rocksdb_jemalloc_nodump_allocator_create( + char** errptr) { + rocksdb_memory_allocator_t* allocator = new rocksdb_memory_allocator_t; + ROCKSDB_NAMESPACE::JemallocAllocatorOptions options; + SaveError(errptr, ROCKSDB_NAMESPACE::NewJemallocNodumpAllocator( + options, &allocator->rep)); + return allocator; +} + +void rocksdb_memory_allocator_destroy(rocksdb_memory_allocator_t* allocator) { + delete allocator; +} + +rocksdb_lru_cache_options_t* rocksdb_lru_cache_options_create() { + return new rocksdb_lru_cache_options_t; +} + +void rocksdb_lru_cache_options_destroy(rocksdb_lru_cache_options_t* opt) { + delete opt; +} + +void rocksdb_lru_cache_options_set_capacity(rocksdb_lru_cache_options_t* opt, + size_t capacity) { + opt->rep.capacity = capacity; +} + +void rocksdb_lru_cache_options_set_num_shard_bits( + rocksdb_lru_cache_options_t* opt, int num_shard_bits) { + opt->rep.num_shard_bits = num_shard_bits; +} + +void rocksdb_lru_cache_options_set_memory_allocator( + rocksdb_lru_cache_options_t* opt, rocksdb_memory_allocator_t* allocator) { + opt->rep.memory_allocator = allocator->rep; +} + +rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = NewLRUCache(capacity); + return c; +} + +rocksdb_cache_t* rocksdb_cache_create_lru_with_strict_capacity_limit( + size_t capacity) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = NewLRUCache(capacity); + c->rep->SetStrictCapacityLimit(true); + return c; +} + +rocksdb_cache_t* rocksdb_cache_create_lru_opts( + const rocksdb_lru_cache_options_t* opt) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = NewLRUCache(opt->rep); + return c; +} + +rocksdb_hyper_clock_cache_options_t* rocksdb_hyper_clock_cache_options_create( + size_t capacity, size_t estimated_entry_charge) { + return new rocksdb_hyper_clock_cache_options_t{ + HyperClockCacheOptions(capacity, estimated_entry_charge)}; +} + +void rocksdb_hyper_clock_cache_options_destroy( + rocksdb_hyper_clock_cache_options_t* opt) { + delete opt; +} + +void rocksdb_hyper_clock_cache_options_set_capacity( + rocksdb_hyper_clock_cache_options_t* opts, size_t 
capacity) { + opts->rep.capacity = capacity; +} + +void rocksdb_hyper_clock_cache_options_set_estimated_entry_charge( + rocksdb_hyper_clock_cache_options_t* opts, size_t estimated_entry_charge) { + opts->rep.estimated_entry_charge = estimated_entry_charge; +} + +void rocksdb_hyper_clock_cache_options_set_num_shard_bits( + rocksdb_hyper_clock_cache_options_t* opts, int num_shard_bits) { + opts->rep.num_shard_bits = num_shard_bits; +} + +void rocksdb_hyper_clock_cache_options_set_memory_allocator( + rocksdb_hyper_clock_cache_options_t* opts, + rocksdb_memory_allocator_t* memory_allocator) { + opts->rep.memory_allocator = memory_allocator->rep; +} + +rocksdb_cache_t* rocksdb_cache_create_hyper_clock( + size_t capacity, size_t estimated_entry_charge) { + HyperClockCacheOptions opts(capacity, estimated_entry_charge); + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = opts.MakeSharedCache(); + return c; +} + +rocksdb_cache_t* rocksdb_cache_create_hyper_clock_opts( + const rocksdb_hyper_clock_cache_options_t* opts) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = opts->rep.MakeSharedCache(); + return c; +} + +void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; } + +void rocksdb_cache_disown_data(rocksdb_cache_t* cache) { + cache->rep->DisownData(); +} + +void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) { + cache->rep->SetCapacity(capacity); +} + +size_t rocksdb_cache_get_capacity(const rocksdb_cache_t* cache) { + return cache->rep->GetCapacity(); +} + +size_t rocksdb_cache_get_usage(const rocksdb_cache_t* cache) { + return cache->rep->GetUsage(); +} + +size_t rocksdb_cache_get_pinned_usage(const rocksdb_cache_t* cache) { + return cache->rep->GetPinnedUsage(); +} + +size_t rocksdb_cache_get_table_address_count(const rocksdb_cache_t* cache) { + return cache->rep->GetTableAddressCount(); +} + +size_t rocksdb_cache_get_occupancy_count(const rocksdb_cache_t* cache) { + return cache->rep->GetOccupancyCount(); +} + +rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, + uint64_t target_size) { + rocksdb_dbpath_t* result = new rocksdb_dbpath_t; + result->rep.path = std::string(path); + result->rep.target_size = target_size; + return result; +} + +void rocksdb_dbpath_destroy(rocksdb_dbpath_t* dbpath) { delete dbpath; } + +rocksdb_env_t* rocksdb_create_default_env() { + rocksdb_env_t* result = new rocksdb_env_t; + result->rep = Env::Default(); + result->is_default = true; + return result; +} + +rocksdb_env_t* rocksdb_create_mem_env() { + rocksdb_env_t* result = new rocksdb_env_t; + result->rep = ROCKSDB_NAMESPACE::NewMemEnv(Env::Default()); + result->is_default = false; + return result; +} + +void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) { + env->rep->SetBackgroundThreads(n); +} + +int rocksdb_env_get_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(); +} + +void rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, + int n) { + env->rep->SetBackgroundThreads(n, Env::BOTTOM); +} + +int rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::BOTTOM); +} + +void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, + int n) { + env->rep->SetBackgroundThreads(n, Env::HIGH); +} + +int rocksdb_env_get_high_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::HIGH); +} + +void rocksdb_env_set_low_priority_background_threads(rocksdb_env_t* env, + int n) { + 
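+  // Usage sketch for the env thread-pool setters above (not upstream code;
+  // hypothetical caller). The pools are sized on an env that is then
+  // attached to the options before opening the DB:
+  //
+  //   rocksdb_env_t* env = rocksdb_create_default_env();
+  //   rocksdb_env_set_background_threads(env, 4);                /* LOW  */
+  //   rocksdb_env_set_high_priority_background_threads(env, 2);  /* HIGH */
+  //   rocksdb_options_set_env(opts, env);
+  //
+  // Env::Default() is process-wide, so destroying a default env frees only
+  // the wrapper, as the is_default check in rocksdb_env_destroy() below
+  // shows.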
env->rep->SetBackgroundThreads(n, Env::LOW); +} + +int rocksdb_env_get_low_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::LOW); +} + +void rocksdb_env_join_all_threads(rocksdb_env_t* env) { + env->rep->WaitForJoin(); +} + +void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_io_priority( + rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(Env::HIGH); +} + +void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolCPUPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_cpu_priority( + rocksdb_env_t* env) { + env->rep->LowerThreadPoolCPUPriority(Env::HIGH); +} + +void rocksdb_env_destroy(rocksdb_env_t* env) { + if (!env->is_default) delete env->rep; + delete env; +} + +rocksdb_envoptions_t* rocksdb_envoptions_create() { + rocksdb_envoptions_t* opt = new rocksdb_envoptions_t; + return opt; +} + +void rocksdb_envoptions_destroy(rocksdb_envoptions_t* opt) { delete opt; } + +rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create( + const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options) { + rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t; + writer->rep = new SstFileWriter(env->rep, io_options->rep); + return writer; +} + +void rocksdb_create_dir_if_missing(rocksdb_env_t* env, const char* path, + char** errptr) { + SaveError(errptr, env->rep->CreateDirIfMissing(std::string(path))); +} + +rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create_with_comparator( + const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options, + const rocksdb_comparator_t* /*comparator*/) { + rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t; + writer->rep = new SstFileWriter(env->rep, io_options->rep); + return writer; +} + +void rocksdb_sstfilewriter_open(rocksdb_sstfilewriter_t* writer, + const char* name, char** errptr) { + SaveError(errptr, writer->rep->Open(std::string(name))); +} + +void rocksdb_sstfilewriter_add(rocksdb_sstfilewriter_t* writer, const char* key, + size_t keylen, const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_sstfilewriter_put(rocksdb_sstfilewriter_t* writer, const char* key, + size_t keylen, const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_sstfilewriter_put_with_ts(rocksdb_sstfilewriter_t* writer, + const char* key, size_t keylen, + const char* ts, size_t tslen, + const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(ts, tslen), + Slice(val, vallen))); +} + +void rocksdb_sstfilewriter_merge(rocksdb_sstfilewriter_t* writer, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, writer->rep->Merge(Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_sstfilewriter_delete(rocksdb_sstfilewriter_t* writer, + const char* key, size_t keylen, + char** errptr) { + SaveError(errptr, writer->rep->Delete(Slice(key, keylen))); +} + +void rocksdb_sstfilewriter_delete_with_ts(rocksdb_sstfilewriter_t* writer, + const char* key, size_t keylen, + const char* ts, size_t tslen, + char** errptr) { + SaveError(errptr, writer->rep->Delete(Slice(key, keylen), Slice(ts, tslen))); +} + +void 
rocksdb_sstfilewriter_delete_range(rocksdb_sstfilewriter_t* writer,
+                                   const char* begin_key, size_t begin_keylen,
+                                   const char* end_key, size_t end_keylen,
+                                   char** errptr) {
+  SaveError(errptr, writer->rep->DeleteRange(Slice(begin_key, begin_keylen),
+                                             Slice(end_key, end_keylen)));
+}
+
+void rocksdb_sstfilewriter_finish(rocksdb_sstfilewriter_t* writer,
+                                  char** errptr) {
+  SaveError(errptr, writer->rep->Finish(nullptr));
+}
+
+void rocksdb_sstfilewriter_file_size(rocksdb_sstfilewriter_t* writer,
+                                     uint64_t* file_size) {
+  *file_size = writer->rep->FileSize();
+}
+
+void rocksdb_sstfilewriter_destroy(rocksdb_sstfilewriter_t* writer) {
+  delete writer->rep;
+  delete writer;
+}
+
+rocksdb_ingestexternalfileoptions_t*
+rocksdb_ingestexternalfileoptions_create() {
+  rocksdb_ingestexternalfileoptions_t* opt =
+      new rocksdb_ingestexternalfileoptions_t;
+  return opt;
+}
+
+void rocksdb_ingestexternalfileoptions_set_move_files(
+    rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files) {
+  opt->rep.move_files = move_files;
+}
+
+void rocksdb_ingestexternalfileoptions_set_snapshot_consistency(
+    rocksdb_ingestexternalfileoptions_t* opt,
+    unsigned char snapshot_consistency) {
+  opt->rep.snapshot_consistency = snapshot_consistency;
+}
+
+void rocksdb_ingestexternalfileoptions_set_allow_global_seqno(
+    rocksdb_ingestexternalfileoptions_t* opt,
+    unsigned char allow_global_seqno) {
+  opt->rep.allow_global_seqno = allow_global_seqno;
+}
+
+void rocksdb_ingestexternalfileoptions_set_allow_blocking_flush(
+    rocksdb_ingestexternalfileoptions_t* opt,
+    unsigned char allow_blocking_flush) {
+  opt->rep.allow_blocking_flush = allow_blocking_flush;
+}
+
+void rocksdb_ingestexternalfileoptions_set_ingest_behind(
+    rocksdb_ingestexternalfileoptions_t* opt, unsigned char ingest_behind) {
+  opt->rep.ingest_behind = ingest_behind;
+}
+
+void rocksdb_ingestexternalfileoptions_set_fail_if_not_bottommost_level(
+    rocksdb_ingestexternalfileoptions_t* opt,
+    unsigned char fail_if_not_bottommost_level) {
+  opt->rep.fail_if_not_bottommost_level = fail_if_not_bottommost_level;
+}
+
+void rocksdb_ingestexternalfileoptions_destroy(
+    rocksdb_ingestexternalfileoptions_t* opt) {
+  delete opt;
+}
+
+void rocksdb_ingest_external_file(
+    rocksdb_t* db, const char* const* file_list, const size_t list_len,
+    const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
+  std::vector<std::string> files(list_len);
+  for (size_t i = 0; i < list_len; ++i) {
+    files[i] = std::string(file_list[i]);
+  }
+  SaveError(errptr, db->rep->IngestExternalFile(files, opt->rep));
+}
+
+void rocksdb_ingest_external_file_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* handle,
+    const char* const* file_list, const size_t list_len,
+    const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
+  std::vector<std::string> files(list_len);
+  for (size_t i = 0; i < list_len; ++i) {
+    files[i] = std::string(file_list[i]);
+  }
+  SaveError(errptr, db->rep->IngestExternalFile(handle->rep, files, opt->rep));
+}
+
+void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) {
+  SaveError(errptr, db->rep->TryCatchUpWithPrimary());
+}
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create(
+    void* state, void (*destructor)(void*),
+    char* (*transform)(void*, const char* key, size_t length,
+                       size_t* dst_length),
+    unsigned char (*in_domain)(void*, const char* key, size_t length),
+    unsigned char (*in_range)(void*, const char* key, size_t length),
+    const char* (*name)(void*)) {
+  rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t;
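+  // Usage sketch for the SST-writer/ingestion API above (not upstream
+  // code; hypothetical caller and file path, error handling elided). Keys
+  // must be added in ascending key order before Finish():
+  //
+  //   rocksdb_envoptions_t* eo = rocksdb_envoptions_create();
+  //   rocksdb_sstfilewriter_t* w = rocksdb_sstfilewriter_create(eo, opts);
+  //   char* err = NULL;
+  //   rocksdb_sstfilewriter_open(w, "/tmp/bulk.sst", &err);
+  //   rocksdb_sstfilewriter_put(w, "k1", 2, "v1", 2, &err);
+  //   rocksdb_sstfilewriter_finish(w, &err);
+  //   const char* files[] = {"/tmp/bulk.sst"};
+  //   rocksdb_ingestexternalfileoptions_t* io =
+  //       rocksdb_ingestexternalfileoptions_create();
+  //   rocksdb_ingest_external_file(db, files, 1, io, &err);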
result->state_ = state; + result->destructor_ = destructor; + result->transform_ = transform; + result->in_domain_ = in_domain; + result->in_range_ = in_range; + result->name_ = name; + return result; +} + +void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) { delete st; } + +struct SliceTransformWrapper : public rocksdb_slicetransform_t { + const SliceTransform* rep_; + ~SliceTransformWrapper() override { delete rep_; } + const char* Name() const override { return rep_->Name(); } + std::string GetId() const override { return rep_->GetId(); } + Slice Transform(const Slice& src) const override { + return rep_->Transform(src); + } + bool InDomain(const Slice& src) const override { return rep_->InDomain(src); } + bool InRange(const Slice& src) const override { return rep_->InRange(src); } + static void DoNothing(void*) {} +}; + +rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix( + size_t prefixLen) { + SliceTransformWrapper* wrapper = new SliceTransformWrapper; + wrapper->rep_ = ROCKSDB_NAMESPACE::NewFixedPrefixTransform(prefixLen); + wrapper->state_ = nullptr; + wrapper->destructor_ = &SliceTransformWrapper::DoNothing; + return wrapper; +} + +rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop() { + SliceTransformWrapper* wrapper = new SliceTransformWrapper; + wrapper->rep_ = ROCKSDB_NAMESPACE::NewNoopTransform(); + wrapper->state_ = nullptr; + wrapper->destructor_ = &SliceTransformWrapper::DoNothing; + return wrapper; +} + +rocksdb_universal_compaction_options_t* +rocksdb_universal_compaction_options_create() { + rocksdb_universal_compaction_options_t* result = + new rocksdb_universal_compaction_options_t; + result->rep = new ROCKSDB_NAMESPACE::CompactionOptionsUniversal; + return result; +} + +void rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t* uco, int ratio) { + uco->rep->size_ratio = ratio; +} + +int rocksdb_universal_compaction_options_get_size_ratio( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->size_ratio; +} + +void rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t* uco, int w) { + uco->rep->min_merge_width = w; +} + +int rocksdb_universal_compaction_options_get_min_merge_width( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->min_merge_width; +} + +void rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t* uco, int w) { + uco->rep->max_merge_width = w; +} + +int rocksdb_universal_compaction_options_get_max_merge_width( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->max_merge_width; +} + +void rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t* uco, int p) { + uco->rep->max_size_amplification_percent = p; +} + +int rocksdb_universal_compaction_options_get_max_size_amplification_percent( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->max_size_amplification_percent; +} + +void rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t* uco, int p) { + uco->rep->compression_size_percent = p; +} + +int rocksdb_universal_compaction_options_get_compression_size_percent( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->compression_size_percent; +} + +void rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t* uco, int style) { + uco->rep->stop_style = + 
      static_cast<ROCKSDB_NAMESPACE::CompactionStopStyle>(style);
+}
+
+int rocksdb_universal_compaction_options_get_stop_style(
+    rocksdb_universal_compaction_options_t* uco) {
+  return static_cast<int>(uco->rep->stop_style);
+}
+
+void rocksdb_universal_compaction_options_destroy(
+    rocksdb_universal_compaction_options_t* uco) {
+  delete uco->rep;
+  delete uco;
+}
+
+rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() {
+  rocksdb_fifo_compaction_options_t* result =
+      new rocksdb_fifo_compaction_options_t;
+  result->rep = CompactionOptionsFIFO();
+  return result;
+}
+
+void rocksdb_fifo_compaction_options_set_allow_compaction(
+    rocksdb_fifo_compaction_options_t* fifo_opts,
+    unsigned char allow_compaction) {
+  fifo_opts->rep.allow_compaction = allow_compaction;
+}
+
+unsigned char rocksdb_fifo_compaction_options_get_allow_compaction(
+    rocksdb_fifo_compaction_options_t* fifo_opts) {
+  return fifo_opts->rep.allow_compaction;
+}
+
+void rocksdb_fifo_compaction_options_set_max_table_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) {
+  fifo_opts->rep.max_table_files_size = size;
+}
+
+uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts) {
+  return fifo_opts->rep.max_table_files_size;
+}
+
+void rocksdb_fifo_compaction_options_destroy(
+    rocksdb_fifo_compaction_options_t* fifo_opts) {
+  delete fifo_opts;
+}
+
+void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt,
+                                               int level) {
+  if (level >= 0) {
+    assert(level <= opt->rep.num_levels);
+    opt->rep.compression_per_level.resize(opt->rep.num_levels);
+    for (int i = 0; i < level; i++) {
+      opt->rep.compression_per_level[i] = ROCKSDB_NAMESPACE::kNoCompression;
+    }
+    for (int i = level; i < opt->rep.num_levels; i++) {
+      opt->rep.compression_per_level[i] = opt->rep.compression;
+    }
+  }
+}
+
+int rocksdb_livefiles_count(const rocksdb_livefiles_t* lf) {
+  return static_cast<int>(lf->rep.size());
+}
+
+const char* rocksdb_livefiles_column_family_name(const rocksdb_livefiles_t* lf,
+                                                 int index) {
+  return lf->rep[index].column_family_name.c_str();
+}
+
+const char* rocksdb_livefiles_name(const rocksdb_livefiles_t* lf, int index) {
+  return lf->rep[index].name.c_str();
+}
+
+int rocksdb_livefiles_level(const rocksdb_livefiles_t* lf, int index) {
+  return lf->rep[index].level;
+}
+
+size_t rocksdb_livefiles_size(const rocksdb_livefiles_t* lf, int index) {
+  return lf->rep[index].size;
+}
+
+const char* rocksdb_livefiles_smallestkey(const rocksdb_livefiles_t* lf,
+                                          int index, size_t* size) {
+  *size = lf->rep[index].smallestkey.size();
+  return lf->rep[index].smallestkey.data();
+}
+
+const char* rocksdb_livefiles_largestkey(const rocksdb_livefiles_t* lf,
+                                         int index, size_t* size) {
+  *size = lf->rep[index].largestkey.size();
+  return lf->rep[index].largestkey.data();
+}
+
+uint64_t rocksdb_livefiles_entries(const rocksdb_livefiles_t* lf, int index) {
+  return lf->rep[index].num_entries;
+}
+
+uint64_t rocksdb_livefiles_deletions(const rocksdb_livefiles_t* lf,
+                                     int index) {
+  return lf->rep[index].num_deletions;
+}
+
+extern void rocksdb_livefiles_destroy(const rocksdb_livefiles_t* lf) {
+  delete lf;
+}
+
+void rocksdb_get_options_from_string(const rocksdb_options_t* base_options,
+                                     const char* opts_str,
+                                     rocksdb_options_t* new_options,
+                                     char** errptr) {
+  SaveError(errptr,
+            GetOptionsFromString(base_options->rep, std::string(opts_str),
+                                 &new_options->rep));
+}
+
+void rocksdb_delete_file_in_range(rocksdb_t* db, const char* start_key,
+                                  size_t start_key_len, const char* limit_key,
size_t limit_key_len, char** errptr) { + Slice a, b; + SaveError( + errptr, + DeleteFilesInRange( + db->rep, db->rep->DefaultColumnFamily(), + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr))); +} + +void rocksdb_delete_file_in_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr) { + Slice a, b; + SaveError( + errptr, + DeleteFilesInRange( + db->rep, column_family->rep, + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr))); +} + +/* MetaData */ + +rocksdb_column_family_metadata_t* rocksdb_get_column_family_metadata( + rocksdb_t* db) { + rocksdb_column_family_metadata_t* meta = new rocksdb_column_family_metadata_t; + db->rep->GetColumnFamilyMetaData(&meta->rep); + return meta; +} + +rocksdb_column_family_metadata_t* rocksdb_get_column_family_metadata_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family) { + rocksdb_column_family_metadata_t* meta = new rocksdb_column_family_metadata_t; + db->rep->GetColumnFamilyMetaData(column_family->rep, &meta->rep); + return meta; +} + +void rocksdb_column_family_metadata_destroy( + rocksdb_column_family_metadata_t* cf_meta) { + delete cf_meta; +} + +uint64_t rocksdb_column_family_metadata_get_size( + rocksdb_column_family_metadata_t* cf_meta) { + return cf_meta->rep.size; +} + +size_t rocksdb_column_family_metadata_get_file_count( + rocksdb_column_family_metadata_t* cf_meta) { + return cf_meta->rep.file_count; +} + +char* rocksdb_column_family_metadata_get_name( + rocksdb_column_family_metadata_t* cf_meta) { + return strdup(cf_meta->rep.name.c_str()); +} + +size_t rocksdb_column_family_metadata_get_level_count( + rocksdb_column_family_metadata_t* cf_meta) { + return cf_meta->rep.levels.size(); +} + +rocksdb_level_metadata_t* rocksdb_column_family_metadata_get_level_metadata( + rocksdb_column_family_metadata_t* cf_meta, size_t i) { + if (i >= cf_meta->rep.levels.size()) { + return NULL; + } + rocksdb_level_metadata_t* level_meta = + (rocksdb_level_metadata_t*)malloc(sizeof(rocksdb_level_metadata_t)); + level_meta->rep = &cf_meta->rep.levels[i]; + + return level_meta; +} + +void rocksdb_level_metadata_destroy(rocksdb_level_metadata_t* level_meta) { + // Only free the base pointer as its parent rocksdb_column_family_metadata_t + // has the ownership of its rep. + free(level_meta); +} + +int rocksdb_level_metadata_get_level(rocksdb_level_metadata_t* level_meta) { + return level_meta->rep->level; +} + +uint64_t rocksdb_level_metadata_get_size(rocksdb_level_metadata_t* level_meta) { + return level_meta->rep->size; +} + +size_t rocksdb_level_metadata_get_file_count( + rocksdb_level_metadata_t* level_meta) { + return level_meta->rep->files.size(); +} + +rocksdb_sst_file_metadata_t* rocksdb_level_metadata_get_sst_file_metadata( + rocksdb_level_metadata_t* level_meta, size_t i) { + if (i >= level_meta->rep->files.size()) { + return nullptr; + } + rocksdb_sst_file_metadata_t* file_meta = + (rocksdb_sst_file_metadata_t*)malloc(sizeof(rocksdb_sst_file_metadata_t)); + file_meta->rep = &level_meta->rep->files[i]; + return file_meta; +} + +void rocksdb_sst_file_metadata_destroy(rocksdb_sst_file_metadata_t* file_meta) { + // Only free the base pointer as its parent rocksdb_level_metadata_t + // has the ownership of its rep. 
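+// Usage sketch for the metadata accessors above (not upstream code;
+// hypothetical caller). The column-family metadata owns the whole tree:
+// the level/file wrappers are released with their *_destroy functions,
+// while the underlying data stays valid until the
+// rocksdb_column_family_metadata_t itself is destroyed:
+//
+//   rocksdb_column_family_metadata_t* m =
+//       rocksdb_get_column_family_metadata(db);
+//   size_t levels = rocksdb_column_family_metadata_get_level_count(m);
+//   for (size_t i = 0; i < levels; i++) {
+//     rocksdb_level_metadata_t* lvl =
+//         rocksdb_column_family_metadata_get_level_metadata(m, i);
+//     /* inspect rocksdb_level_metadata_get_file_count(lvl), ... */
+//     rocksdb_level_metadata_destroy(lvl);
+//   }
+//   rocksdb_column_family_metadata_destroy(m);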
+ free(file_meta); +} + +char* rocksdb_sst_file_metadata_get_relative_filename( + rocksdb_sst_file_metadata_t* file_meta) { + return strdup(file_meta->rep->relative_filename.c_str()); +} + +uint64_t rocksdb_sst_file_metadata_get_size( + rocksdb_sst_file_metadata_t* file_meta) { + return file_meta->rep->size; +} + +char* rocksdb_sst_file_metadata_get_smallestkey( + rocksdb_sst_file_metadata_t* file_meta, size_t* key_len) { + *key_len = file_meta->rep->smallestkey.size(); + return CopyString(file_meta->rep->smallestkey); +} + +char* rocksdb_sst_file_metadata_get_largestkey( + rocksdb_sst_file_metadata_t* file_meta, size_t* key_len) { + *key_len = file_meta->rep->largestkey.size(); + return CopyString(file_meta->rep->largestkey); +} + +/* Transactions */ + +rocksdb_transactiondb_options_t* rocksdb_transactiondb_options_create() { + return new rocksdb_transactiondb_options_t; +} + +void rocksdb_transactiondb_options_destroy( + rocksdb_transactiondb_options_t* opt) { + delete opt; +} + +void rocksdb_transactiondb_options_set_max_num_locks( + rocksdb_transactiondb_options_t* opt, int64_t max_num_locks) { + opt->rep.max_num_locks = max_num_locks; +} + +void rocksdb_transactiondb_options_set_num_stripes( + rocksdb_transactiondb_options_t* opt, size_t num_stripes) { + opt->rep.num_stripes = num_stripes; +} + +void rocksdb_transactiondb_options_set_transaction_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout) { + opt->rep.transaction_lock_timeout = txn_lock_timeout; +} + +void rocksdb_transactiondb_options_set_default_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout) { + opt->rep.default_lock_timeout = default_lock_timeout; +} + +rocksdb_transaction_options_t* rocksdb_transaction_options_create() { + return new rocksdb_transaction_options_t; +} + +void rocksdb_transaction_options_destroy(rocksdb_transaction_options_t* opt) { + delete opt; +} + +void rocksdb_transaction_options_set_set_snapshot( + rocksdb_transaction_options_t* opt, unsigned char v) { + opt->rep.set_snapshot = v; +} + +void rocksdb_transaction_options_set_deadlock_detect( + rocksdb_transaction_options_t* opt, unsigned char v) { + opt->rep.deadlock_detect = v; +} + +void rocksdb_transaction_options_set_lock_timeout( + rocksdb_transaction_options_t* opt, int64_t lock_timeout) { + opt->rep.lock_timeout = lock_timeout; +} + +void rocksdb_transaction_options_set_expiration( + rocksdb_transaction_options_t* opt, int64_t expiration) { + opt->rep.expiration = expiration; +} + +void rocksdb_transaction_options_set_deadlock_detect_depth( + rocksdb_transaction_options_t* opt, int64_t depth) { + opt->rep.deadlock_detect_depth = depth; +} + +void rocksdb_transaction_options_set_max_write_batch_size( + rocksdb_transaction_options_t* opt, size_t size) { + opt->rep.max_write_batch_size = size; +} + +void rocksdb_transaction_options_set_skip_prepare( + rocksdb_transaction_options_t* opt, unsigned char v) { + opt->rep.skip_prepare = v; +} + +rocksdb_optimistictransaction_options_t* +rocksdb_optimistictransaction_options_create() { + return new rocksdb_optimistictransaction_options_t; +} + +void rocksdb_optimistictransaction_options_destroy( + rocksdb_optimistictransaction_options_t* opt) { + delete opt; +} + +void rocksdb_optimistictransaction_options_set_set_snapshot( + rocksdb_optimistictransaction_options_t* opt, unsigned char v) { + opt->rep.set_snapshot = v; +} + +char* rocksdb_optimistictransactiondb_property_value( + rocksdb_optimistictransactiondb_t* db, const char* propname) { + 
  std::string tmp;
+  if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return nullptr;
+  }
+}
+
+int rocksdb_optimistictransactiondb_property_int(
+    rocksdb_optimistictransactiondb_t* db, const char* propname,
+    uint64_t* out_val) {
+  if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+rocksdb_column_family_handle_t* rocksdb_transactiondb_create_column_family(
+    rocksdb_transactiondb_t* txn_db,
+    const rocksdb_options_t* column_family_options,
+    const char* column_family_name, char** errptr) {
+  rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+  SaveError(errptr, txn_db->rep->CreateColumnFamily(
+                        ColumnFamilyOptions(column_family_options->rep),
+                        std::string(column_family_name), &(handle->rep)));
+  return handle;
+}
+
+rocksdb_transactiondb_t* rocksdb_transactiondb_open(
+    const rocksdb_options_t* options,
+    const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+    char** errptr) {
+  TransactionDB* txn_db;
+  if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
+                                            std::string(name), &txn_db))) {
+    return nullptr;
+  }
+  rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
+  result->rep = txn_db;
+  return result;
+}
+
+rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families(
+    const rocksdb_options_t* options,
+    const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  TransactionDB* txn_db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
+                                            std::string(name), column_families,
+                                            &handles, &txn_db))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
+  result->rep = txn_db;
+  return result;
+}
+
+const rocksdb_snapshot_t* rocksdb_transactiondb_create_snapshot(
+    rocksdb_transactiondb_t* txn_db) {
+  rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+  result->rep = txn_db->rep->GetSnapshot();
+  return result;
+}
+
+void rocksdb_transactiondb_release_snapshot(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot) {
+  txn_db->rep->ReleaseSnapshot(snapshot->rep);
+  delete snapshot;
+}
+
+char* rocksdb_transactiondb_property_value(rocksdb_transactiondb_t* db,
+                                           const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
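+    // Usage sketch for the TransactionDB API above (not upstream code;
+    // hypothetical caller and path, error handling elided):
+    //
+    //   char* err = NULL;
+    //   rocksdb_transactiondb_t* tdb = rocksdb_transactiondb_open(
+    //       opts, rocksdb_transactiondb_options_create(), "/tmp/txndb",
+    //       &err);
+    //   rocksdb_transaction_t* txn = rocksdb_transaction_begin(
+    //       tdb, rocksdb_writeoptions_create(),
+    //       rocksdb_transaction_options_create(), NULL);
+    //   rocksdb_transaction_put(txn, "k", 1, "v", 1, &err);
+    //   rocksdb_transaction_commit(txn, &err);
+    //   rocksdb_transaction_destroy(txn);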
+    return strdup(tmp.c_str());
+  } else {
+    return nullptr;
+  }
+}
+
+int rocksdb_transactiondb_property_int(rocksdb_transactiondb_t* db,
+                                       const char* propname,
+                                       uint64_t* out_val) {
+  if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+rocksdb_transaction_t* rocksdb_transaction_begin(
+    rocksdb_transactiondb_t* txn_db,
+    const rocksdb_writeoptions_t* write_options,
+    const rocksdb_transaction_options_t* txn_options,
+    rocksdb_transaction_t* old_txn) {
+  if (old_txn == nullptr) {
+    rocksdb_transaction_t* result = new rocksdb_transaction_t;
+    result->rep = txn_db->rep->BeginTransaction(write_options->rep,
+                                                txn_options->rep, nullptr);
+    return result;
+  }
+  old_txn->rep = txn_db->rep->BeginTransaction(write_options->rep,
+                                               txn_options->rep, old_txn->rep);
+  return old_txn;
+}
+
+rocksdb_transaction_t** rocksdb_transactiondb_get_prepared_transactions(
+    rocksdb_transactiondb_t* txn_db, size_t* cnt) {
+  std::vector<Transaction*> txns;
+  txn_db->rep->GetAllPreparedTransactions(&txns);
+  *cnt = txns.size();
+  if (txns.empty()) {
+    return nullptr;
+  } else {
+    rocksdb_transaction_t** buf = (rocksdb_transaction_t**)malloc(
+        txns.size() * sizeof(rocksdb_transaction_t*));
+    for (size_t i = 0; i < txns.size(); i++) {
+      buf[i] = new rocksdb_transaction_t;
+      buf[i]->rep = txns[i];
+    }
+    return buf;
+  }
+}
+
+void rocksdb_transaction_set_name(rocksdb_transaction_t* txn, const char* name,
+                                  size_t name_len, char** errptr) {
+  std::string str = std::string(name, name_len);
+  SaveError(errptr, txn->rep->SetName(str));
+}
+
+char* rocksdb_transaction_get_name(rocksdb_transaction_t* txn,
+                                   size_t* name_len) {
+  auto name = txn->rep->GetName();
+  *name_len = name.size();
+  return CopyString(name);
+}
+
+void rocksdb_transaction_prepare(rocksdb_transaction_t* txn, char** errptr) {
+  SaveError(errptr, txn->rep->Prepare());
+}
+
+rocksdb_writebatch_wi_t* rocksdb_transaction_get_writebatch_wi(
+    rocksdb_transaction_t* txn) {
+  rocksdb_writebatch_wi_t* wi =
+      (rocksdb_writebatch_wi_t*)malloc(sizeof(rocksdb_writebatch_wi_t));
+  wi->rep = txn->rep->GetWriteBatch();
+
+  return wi;
+}
+
+void rocksdb_transaction_rebuild_from_writebatch(
+    rocksdb_transaction_t* txn, rocksdb_writebatch_t* writebatch,
+    char** errptr) {
+  SaveError(errptr, txn->rep->RebuildFromWriteBatch(&writebatch->rep));
+}
+
+void rocksdb_transaction_rebuild_from_writebatch_wi(rocksdb_transaction_t* txn,
+                                                    rocksdb_writebatch_wi_t* wi,
+                                                    char** errptr) {
+  SaveError(errptr, txn->rep->RebuildFromWriteBatch(wi->rep->GetWriteBatch()));
+}
+
+void rocksdb_transaction_commit(rocksdb_transaction_t* txn, char** errptr) {
+  SaveError(errptr, txn->rep->Commit());
+}
+
+void rocksdb_transaction_rollback(rocksdb_transaction_t* txn, char** errptr) {
+  SaveError(errptr, txn->rep->Rollback());
+}
+
+void rocksdb_transaction_set_savepoint(rocksdb_transaction_t* txn) {
+  txn->rep->SetSavePoint();
+}
+
+void rocksdb_transaction_rollback_to_savepoint(rocksdb_transaction_t* txn,
+                                               char** errptr) {
+  SaveError(errptr, txn->rep->RollbackToSavePoint());
+}
+
+void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) {
+  delete txn->rep;
+  delete txn;
+}
+
+const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot(
+    rocksdb_transaction_t* txn) {
+  // This will be freed later on using free, so use malloc here to avoid a
+  // mismatch
+  rocksdb_snapshot_t* result =
+      (rocksdb_snapshot_t*)malloc(sizeof(rocksdb_snapshot_t));
+  result->rep = txn->rep->GetSnapshot();
+  return result;
+}
+
+// Read a key inside a transaction
+char*
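+// Usage note (not upstream code): rocksdb_transaction_get() below reads
+// through the transaction (its own uncommitted writes included) without
+// taking locks, while rocksdb_transaction_get_for_update() additionally
+// locks the key so the transaction conflicts with concurrent writers. A
+// minimal sketch, assuming an open txn and read options ropts:
+//
+//   size_t vlen;
+//   char* err = NULL;
+//   char* v = rocksdb_transaction_get_for_update(
+//       txn, ropts, "k", 1, &vlen, /*exclusive=*/1, &err);
+//   /* a NULL result with a NULL err means not-found */
+//   free(v);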
rocksdb_transaction_get(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = txn->rep->Get(options->rep, Slice(key, klen), &tmp); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const char* key, size_t klen, char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = txn->rep->Get(options->rep, Slice(key, klen), &v->rep); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +char* rocksdb_transaction_get_cf(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, size_t* vlen, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = + txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), &tmp); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), + &v->rep); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +// Read a key inside a transaction +char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, + size_t* vlen, unsigned char exclusive, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = + txn->rep->GetForUpdate(options->rep, Slice(key, klen), &tmp, exclusive); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_for_update( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const char* key, size_t klen, unsigned char exclusive, char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = txn->rep->GetForUpdate(options->rep, Slice(key, klen), + v->rep.GetSelf(), exclusive); + v->rep.PinSelf(); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +char* rocksdb_transaction_get_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, unsigned char exclusive, char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = txn->rep->GetForUpdate(options->rep, column_family->rep, + Slice(key, klen), &tmp, exclusive); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_pinnableslice_t* 
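+// Usage sketch (illustrative, not part of the upstream sources): pinned reads
+// skip one copy relative to the char*-returning getters, and the data pointer
+// stays valid only until the slice is destroyed; ropts is an assumed
+// read-options handle:
+//   rocksdb_pinnableslice_t* p =
+//       rocksdb_transaction_get_pinned(txn, ropts, "k", 1, &err);
+//   if (p != nullptr) {
+//     size_t vlen;
+//     const char* v = rocksdb_pinnableslice_value(p, &vlen);
+//     /* use v[0..vlen) */
+//     rocksdb_pinnableslice_destroy(p);
+//   }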
rocksdb_transaction_get_pinned_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + unsigned char exclusive, char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = txn->rep->GetForUpdate(options->rep, column_family->rep, + Slice(key, klen), &v->rep, exclusive); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +void rocksdb_transaction_multi_get(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + size_t num_keys, + const char* const* keys_list, + const size_t* keys_list_sizes, + char** values_list, + size_t* values_list_sizes, char** errs) { + std::vector keys(num_keys); + for (size_t i = 0; i < num_keys; i++) { + keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + } + std::vector values(num_keys); + std::vector statuses = + txn->rep->MultiGet(options->rep, keys, &values); + for (size_t i = 0; i < num_keys; i++) { + if (statuses[i].ok()) { + values_list[i] = CopyString(values[i]); + values_list_sizes[i] = values[i].size(); + errs[i] = nullptr; + } else { + values_list[i] = nullptr; + values_list_sizes[i] = 0; + if (!statuses[i].IsNotFound()) { + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } +} + +void rocksdb_transaction_multi_get_for_update( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs) { + std::vector keys(num_keys); + for (size_t i = 0; i < num_keys; i++) { + keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + } + std::vector values(num_keys); + std::vector statuses = + txn->rep->MultiGetForUpdate(options->rep, keys, &values); + for (size_t i = 0; i < num_keys; i++) { + if (statuses[i].ok()) { + values_list[i] = CopyString(values[i]); + values_list_sizes[i] = values[i].size(); + errs[i] = nullptr; + } else { + values_list[i] = nullptr; + values_list_sizes[i] = 0; + if (!statuses[i].IsNotFound()) { + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } +} + +void rocksdb_transaction_multi_get_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs) { + std::vector keys(num_keys); + std::vector cfs(num_keys); + for (size_t i = 0; i < num_keys; i++) { + keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + cfs[i] = column_families[i]->rep; + } + std::vector values(num_keys); + std::vector statuses = + txn->rep->MultiGet(options->rep, cfs, keys, &values); + for (size_t i = 0; i < num_keys; i++) { + if (statuses[i].ok()) { + values_list[i] = CopyString(values[i]); + values_list_sizes[i] = values[i].size(); + errs[i] = nullptr; + } else { + values_list[i] = nullptr; + values_list_sizes[i] = 0; + if (!statuses[i].IsNotFound()) { + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } +} + +void rocksdb_transaction_multi_get_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, 
char** values_list, + size_t* values_list_sizes, char** errs) { + std::vector keys(num_keys); + std::vector cfs(num_keys); + for (size_t i = 0; i < num_keys; i++) { + keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + cfs[i] = column_families[i]->rep; + } + std::vector values(num_keys); + std::vector statuses = + txn->rep->MultiGetForUpdate(options->rep, cfs, keys, &values); + for (size_t i = 0; i < num_keys; i++) { + if (statuses[i].ok()) { + values_list[i] = CopyString(values[i]); + values_list_sizes[i] = values[i].size(); + errs[i] = nullptr; + } else { + values_list[i] = nullptr; + values_list_sizes[i] = 0; + if (!statuses[i].IsNotFound()) { + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } +} + +// Read a key outside a transaction +char* rocksdb_transactiondb_get(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = txn_db->rep->Get(options->rep, Slice(key, klen), &tmp); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_pinnableslice_t* rocksdb_transactiondb_get_pinned( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + const char* key, size_t klen, char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = txn_db->rep->Get(options->rep, txn_db->rep->DefaultColumnFamily(), + Slice(key, klen), &v->rep); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +char* rocksdb_transactiondb_get_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = txn_db->rep->Get(options->rep, column_family->rep, + Slice(key, keylen), &tmp); + if (s.ok()) { + *vallen = tmp.size(); + result = CopyString(tmp); + } else { + *vallen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_pinnableslice_t* rocksdb_transactiondb_get_pinned_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = txn_db->rep->Get(options->rep, column_family->rep, + Slice(key, keylen), &v->rep); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + +void rocksdb_transactiondb_multi_get(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + size_t num_keys, + const char* const* keys_list, + const size_t* keys_list_sizes, + char** values_list, + size_t* values_list_sizes, char** errs) { + std::vector keys(num_keys); + for (size_t i = 0; i < num_keys; i++) { + keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + } + std::vector values(num_keys); + std::vector statuses = + txn_db->rep->MultiGet(options->rep, keys, &values); + for (size_t i = 0; i < num_keys; i++) { + if (statuses[i].ok()) { + values_list[i] = CopyString(values[i]); + values_list_sizes[i] = values[i].size(); + errs[i] = nullptr; + } else { + values_list[i] = nullptr; + values_list_sizes[i] = 0; + if (!statuses[i].IsNotFound()) 
{ + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } +} + +void rocksdb_transactiondb_multi_get_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs) { + std::vector keys(num_keys); + std::vector cfs(num_keys); + for (size_t i = 0; i < num_keys; i++) { + keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + cfs[i] = column_families[i]->rep; + } + std::vector values(num_keys); + std::vector statuses = + txn_db->rep->MultiGet(options->rep, cfs, keys, &values); + for (size_t i = 0; i < num_keys; i++) { + if (statuses[i].ok()) { + values_list[i] = CopyString(values[i]); + values_list_sizes[i] = values[i].size(); + errs[i] = nullptr; + } else { + values_list[i] = nullptr; + values_list_sizes[i] = 0; + if (!statuses[i].IsNotFound()) { + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } +} + +// Put a key inside a transaction +void rocksdb_transaction_put(rocksdb_transaction_t* txn, const char* key, + size_t klen, const char* val, size_t vlen, + char** errptr) { + SaveError(errptr, txn->rep->Put(Slice(key, klen), Slice(val, vlen))); +} + +void rocksdb_transaction_put_cf(rocksdb_transaction_t* txn, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, txn->rep->Put(column_family->rep, Slice(key, klen), + Slice(val, vlen))); +} + +void rocksdb_transaction_set_commit_timestamp(rocksdb_transaction_t* txn, + uint64_t commit_timestamp) { + txn->rep->SetCommitTimestamp(commit_timestamp); +} + +void rocksdb_transaction_set_read_timestamp_for_validation( + rocksdb_transaction_t* txn, uint64_t read_timestamp) { + txn->rep->SetReadTimestampForValidation(read_timestamp); +} + +// Put a key outside a transaction +void rocksdb_transactiondb_put(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, + txn_db->rep->Put(options->rep, Slice(key, klen), Slice(val, vlen))); +} + +void rocksdb_transactiondb_put_cf(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, txn_db->rep->Put(options->rep, column_family->rep, + Slice(key, keylen), Slice(val, vallen))); +} + +// Write batch into transaction db +void rocksdb_transactiondb_write(rocksdb_transactiondb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr) { + SaveError(errptr, db->rep->Write(options->rep, &batch->rep)); +} + +// Merge a key inside a transaction +void rocksdb_transaction_merge(rocksdb_transaction_t* txn, const char* key, + size_t klen, const char* val, size_t vlen, + char** errptr) { + SaveError(errptr, txn->rep->Merge(Slice(key, klen), Slice(val, vlen))); +} + +void rocksdb_transaction_merge_cf(rocksdb_transaction_t* txn, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, txn->rep->Merge(column_family->rep, Slice(key, klen), + Slice(val, vlen))); +} + +// Merge a key outside a transaction +void 
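+// Note (illustrative, not part of the upstream sources): as with the base DB
+// API, these Merge wrappers only succeed when a merge operator was configured
+// in the options the database was opened with; otherwise RocksDB reports the
+// failure through errptr.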
rocksdb_transactiondb_merge(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, txn_db->rep->Merge(options->rep, Slice(key, klen), + Slice(val, vlen))); +} + +void rocksdb_transactiondb_merge_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + const char* val, size_t vlen, char** errptr) { + SaveError(errptr, txn_db->rep->Merge(options->rep, column_family->rep, + Slice(key, klen), Slice(val, vlen))); +} + +// Delete a key inside a transaction +void rocksdb_transaction_delete(rocksdb_transaction_t* txn, const char* key, + size_t klen, char** errptr) { + SaveError(errptr, txn->rep->Delete(Slice(key, klen))); +} + +void rocksdb_transaction_delete_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, char** errptr) { + SaveError(errptr, txn->rep->Delete(column_family->rep, Slice(key, klen))); +} + +// Delete a key outside a transaction +void rocksdb_transactiondb_delete(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + const char* key, size_t klen, char** errptr) { + SaveError(errptr, txn_db->rep->Delete(options->rep, Slice(key, klen))); +} + +void rocksdb_transactiondb_delete_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr) { + SaveError(errptr, txn_db->rep->Delete(options->rep, column_family->rep, + Slice(key, keylen))); +} + +// Create an iterator inside a transaction +rocksdb_iterator_t* rocksdb_transaction_create_iterator( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = txn->rep->GetIterator(options->rep); + return result; +} + +// Create an iterator inside a transaction with column family +rocksdb_iterator_t* rocksdb_transaction_create_iterator_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = txn->rep->GetIterator(options->rep, column_family->rep); + return result; +} + +// Create an iterator outside a transaction +rocksdb_iterator_t* rocksdb_transactiondb_create_iterator( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = txn_db->rep->NewIterator(options->rep); + return result; +} + +rocksdb_iterator_t* rocksdb_transactiondb_create_iterator_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = txn_db->rep->NewIterator(options->rep, column_family->rep); + return result; +} + +void rocksdb_transactiondb_close(rocksdb_transactiondb_t* txn_db) { + delete txn_db->rep; + delete txn_db; +} + +void rocksdb_transactiondb_flush_wal(rocksdb_transactiondb_t* txn_db, + unsigned char sync, char** errptr) { + SaveError(errptr, txn_db->rep->FlushWAL(sync)); +} + +void rocksdb_transactiondb_flush(rocksdb_transactiondb_t* txn_db, + const rocksdb_flushoptions_t* options, + char** errptr) { + SaveError(errptr, txn_db->rep->Flush(options->rep)); +} + +void rocksdb_transactiondb_flush_cf( + 
rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t* column_family, char** errptr) { + SaveError(errptr, txn_db->rep->Flush(options->rep, column_family->rep)); +} + +void rocksdb_transactiondb_flush_cfs( + rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t** column_families, int num_column_families, + char** errptr) { + std::vector column_family_handles; + for (int i = 0; i < num_column_families; i++) { + column_family_handles.push_back(column_families[i]->rep); + } + + SaveError(errptr, txn_db->rep->Flush(options->rep, column_family_handles)); +} + +rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create( + rocksdb_transactiondb_t* txn_db, char** errptr) { + Checkpoint* checkpoint; + if (SaveError(errptr, Checkpoint::Create(txn_db->rep, &checkpoint))) { + return nullptr; + } + rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t; + result->rep = checkpoint; + return result; +} + +rocksdb_optimistictransactiondb_t* rocksdb_optimistictransactiondb_open( + const rocksdb_options_t* options, const char* name, char** errptr) { + OptimisticTransactionDB* otxn_db; + if (SaveError(errptr, OptimisticTransactionDB::Open( + options->rep, std::string(name), &otxn_db))) { + return nullptr; + } + rocksdb_optimistictransactiondb_t* result = + new rocksdb_optimistictransactiondb_t; + result->rep = otxn_db; + return result; +} + +rocksdb_optimistictransactiondb_t* +rocksdb_optimistictransactiondb_open_column_families( + const rocksdb_options_t* db_options, const char* name, + int num_column_families, const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr) { + std::vector column_families; + for (int i = 0; i < num_column_families; i++) { + column_families.push_back(ColumnFamilyDescriptor( + std::string(column_family_names[i]), + ColumnFamilyOptions(column_family_options[i]->rep))); + } + + OptimisticTransactionDB* otxn_db; + std::vector handles; + if (SaveError(errptr, OptimisticTransactionDB::Open( + DBOptions(db_options->rep), std::string(name), + column_families, &handles, &otxn_db))) { + return nullptr; + } + + for (size_t i = 0; i < handles.size(); i++) { + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; + c_handle->rep = handles[i]; + column_family_handles[i] = c_handle; + } + rocksdb_optimistictransactiondb_t* result = + new rocksdb_optimistictransactiondb_t; + result->rep = otxn_db; + return result; +} + +rocksdb_t* rocksdb_optimistictransactiondb_get_base_db( + rocksdb_optimistictransactiondb_t* otxn_db) { + DB* base_db = otxn_db->rep->GetBaseDB(); + + if (base_db != nullptr) { + rocksdb_t* result = new rocksdb_t; + result->rep = base_db; + return result; + } + + return nullptr; +} + +void rocksdb_optimistictransactiondb_close_base_db(rocksdb_t* base_db) { + delete base_db; +} + +rocksdb_transaction_t* rocksdb_optimistictransaction_begin( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* write_options, + const rocksdb_optimistictransaction_options_t* otxn_options, + rocksdb_transaction_t* old_txn) { + if (old_txn == nullptr) { + rocksdb_transaction_t* result = new rocksdb_transaction_t; + result->rep = otxn_db->rep->BeginTransaction(write_options->rep, + otxn_options->rep, nullptr); + return result; + } + old_txn->rep = otxn_db->rep->BeginTransaction( + write_options->rep, otxn_options->rep, old_txn->rep); 
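+  // (Illustrative note, not part of the upstream sources: passing a non-null
+  // old_txn reinitializes it in place, mirroring rocksdb_transaction_begin
+  // above, so callers can run many optimistic transactions without
+  // reallocating the wrapper.)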
+  return old_txn;
+}
+
+// Write batch into OptimisticTransactionDB
+void rocksdb_optimistictransactiondb_write(
+    rocksdb_optimistictransactiondb_t* otxn_db,
+    const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch,
+    char** errptr) {
+  SaveError(errptr, otxn_db->rep->Write(options->rep, &batch->rep));
+}
+
+void rocksdb_optimistictransactiondb_close(
+    rocksdb_optimistictransactiondb_t* otxn_db) {
+  delete otxn_db->rep;
+  delete otxn_db;
+}
+
+rocksdb_checkpoint_t* rocksdb_optimistictransactiondb_checkpoint_object_create(
+    rocksdb_optimistictransactiondb_t* otxn_db, char** errptr) {
+  Checkpoint* checkpoint;
+  if (SaveError(errptr, Checkpoint::Create(otxn_db->rep, &checkpoint))) {
+    return nullptr;
+  }
+  rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+  result->rep = checkpoint;
+  return result;
+}
+
+void rocksdb_free(void* ptr) { free(ptr); }
+
+rocksdb_pinnableslice_t* rocksdb_get_pinned(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
+                          Slice(key, keylen), &v->rep);
+  if (!s.ok()) {
+    delete (v);
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
+rocksdb_pinnableslice_t* rocksdb_get_pinned_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+                          &v->rep);
+  if (!s.ok()) {
+    delete v;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
+void rocksdb_pinnableslice_destroy(rocksdb_pinnableslice_t* v) { delete v; }
+
+const char* rocksdb_pinnableslice_value(const rocksdb_pinnableslice_t* v,
+                                        size_t* vlen) {
+  if (!v) {
+    *vlen = 0;
+    return nullptr;
+  }
+
+  *vlen = v->rep.size();
+  return v->rep.data();
+}
+
+// container to keep databases and caches in order to use
+// ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_consumers_t {
+  std::vector<rocksdb_t*> dbs;
+  std::unordered_set<rocksdb_cache_t*> caches;
+};
+
+// initializes new container of memory consumers
+rocksdb_memory_consumers_t* rocksdb_memory_consumers_create() {
+  return new rocksdb_memory_consumers_t;
+}
+
+// adds database to the container of memory consumers
+void rocksdb_memory_consumers_add_db(rocksdb_memory_consumers_t* consumers,
+                                     rocksdb_t* db) {
+  consumers->dbs.push_back(db);
+}
+
+// adds cache to the container of memory consumers
+void rocksdb_memory_consumers_add_cache(rocksdb_memory_consumers_t* consumers,
+                                        rocksdb_cache_t* cache) {
+  consumers->caches.insert(cache);
+}
+
+// deletes container with memory consumers
+void rocksdb_memory_consumers_destroy(rocksdb_memory_consumers_t* consumers) {
+  delete consumers;
+}
+
+// contains memory usage statistics provided by ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_usage_t {
+  uint64_t mem_table_total;
+  uint64_t mem_table_unflushed;
+  uint64_t mem_table_readers_total;
+  uint64_t cache_total;
+};
+
+// estimates amount of memory occupied by consumers (dbs and caches)
+rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create(
+    rocksdb_memory_consumers_t* consumers, char** errptr) {
+  std::vector<DB*> dbs;
+  for (auto db : consumers->dbs) {
+    dbs.push_back(db->rep);
+  }
+
+  std::unordered_set<const Cache*> cache_set;
+  for (auto cache : consumers->caches) {
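+    // MemoryUtil expects raw const Cache* pointers, while the C wrapper
+    // holds handles owning a std::shared_ptr<Cache>; unwrap them here.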
cache_set.insert(const_cast<const Cache*>(cache->rep.get()));
+  }
+
+  std::map<MemoryUtil::UsageType, uint64_t> usage_by_type;
+
+  auto status = MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
+                                                            &usage_by_type);
+  if (SaveError(errptr, status)) {
+    return nullptr;
+  }
+
+  auto result = new rocksdb_memory_usage_t;
+  result->mem_table_total = usage_by_type[MemoryUtil::kMemTableTotal];
+  result->mem_table_unflushed = usage_by_type[MemoryUtil::kMemTableUnFlushed];
+  result->mem_table_readers_total =
+      usage_by_type[MemoryUtil::kTableReadersTotal];
+  result->cache_total = usage_by_type[MemoryUtil::kCacheTotal];
+  return result;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_total(
+    rocksdb_memory_usage_t* memory_usage) {
+  return memory_usage->mem_table_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_unflushed(
+    rocksdb_memory_usage_t* memory_usage) {
+  return memory_usage->mem_table_unflushed;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_readers_total(
+    rocksdb_memory_usage_t* memory_usage) {
+  return memory_usage->mem_table_readers_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_cache_total(
+    rocksdb_memory_usage_t* memory_usage) {
+  return memory_usage->cache_total;
+}
+
+void rocksdb_options_set_dump_malloc_stats(rocksdb_options_t* opt,
+                                           unsigned char val) {
+  opt->rep.dump_malloc_stats = val;
+}
+
+void rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t* opt,
+                                                      unsigned char val) {
+  opt->rep.memtable_whole_key_filtering = val;
+}
+
+void rocksdb_options_set_avoid_unnecessary_blocking_io(rocksdb_options_t* opt,
+                                                       unsigned char val) {
+  opt->rep.avoid_unnecessary_blocking_io = val;
+}
+
+unsigned char rocksdb_options_get_avoid_unnecessary_blocking_io(
+    rocksdb_options_t* opt) {
+  return opt->rep.avoid_unnecessary_blocking_io;
+}
+
+// deletes container with memory usage estimates
+void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) {
+  delete usage;
+}
+
+void rocksdb_cancel_all_background_work(rocksdb_t* db, unsigned char wait) {
+  CancelAllBackgroundWork(db->rep, wait);
+}
+
+void rocksdb_disable_manual_compaction(rocksdb_t* db) {
+  db->rep->DisableManualCompaction();
+}
+
+void rocksdb_enable_manual_compaction(rocksdb_t* db) {
+  db->rep->EnableManualCompaction();
+}
+
+} // end extern "C"
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/column_family.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/column_family.cc
new file mode 100644
index 0000000000..de94426d71
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/column_family.cc
@@ -0,0 +1,1708 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/column_family.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_source.h"
+#include "db/compaction/compaction_picker.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+#include "db/db_impl/db_impl.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_properties_collector.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/table.h"
+#include "table/merging_iterator.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
+    ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
+    : cfd_(column_family_data), db_(db), mutex_(mutex) {
+  if (cfd_ != nullptr) {
+    cfd_->Ref();
+  }
+}
+
+ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
+  if (cfd_ != nullptr) {
+    for (auto& listener : cfd_->ioptions()->listeners) {
+      listener->OnColumnFamilyHandleDeletionStarted(this);
+    }
+    // Job id == 0 means that this is not our background process, but rather
+    // user thread
+    // Need to hold some shared pointers owned by the initial_cf_options
+    // before final cleaning up finishes.
+    ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options();
+    JobContext job_context(0);
+    mutex_->Lock();
+    bool dropped = cfd_->IsDropped();
+    if (cfd_->UnrefAndTryDelete()) {
+      if (dropped) {
+        db_->FindObsoleteFiles(&job_context, false, true);
+      }
+    }
+    mutex_->Unlock();
+    if (job_context.HaveSomethingToDelete()) {
+      bool defer_purge =
+          db_->immutable_db_options().avoid_unnecessary_blocking_io;
+      db_->PurgeObsoleteFiles(job_context, defer_purge);
+    }
+    job_context.Clean();
+  }
+}
+
+uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
+
+const std::string& ColumnFamilyHandleImpl::GetName() const {
+  return cfd()->GetName();
+}
+
+Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) {
+  // accessing mutable cf-options requires db mutex.
+ InstrumentedMutexLock l(mutex_); + *desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions()); + return Status::OK(); +} + +const Comparator* ColumnFamilyHandleImpl::GetComparator() const { + return cfd()->user_comparator(); +} + +void GetIntTblPropCollectorFactory( + const ImmutableCFOptions& ioptions, + IntTblPropCollectorFactories* int_tbl_prop_collector_factories) { + assert(int_tbl_prop_collector_factories); + + auto& collector_factories = ioptions.table_properties_collector_factories; + for (size_t i = 0; i < ioptions.table_properties_collector_factories.size(); + ++i) { + assert(collector_factories[i]); + int_tbl_prop_collector_factories->emplace_back( + new UserKeyTablePropertiesCollectorFactory(collector_factories[i])); + } +} + +Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) { + if (!cf_options.compression_per_level.empty()) { + for (size_t level = 0; level < cf_options.compression_per_level.size(); + ++level) { + if (!CompressionTypeSupported(cf_options.compression_per_level[level])) { + return Status::InvalidArgument( + "Compression type " + + CompressionTypeToString(cf_options.compression_per_level[level]) + + " is not linked with the binary."); + } + } + } else { + if (!CompressionTypeSupported(cf_options.compression)) { + return Status::InvalidArgument( + "Compression type " + + CompressionTypeToString(cf_options.compression) + + " is not linked with the binary."); + } + } + if (cf_options.compression_opts.zstd_max_train_bytes > 0) { + if (cf_options.compression_opts.use_zstd_dict_trainer) { + if (!ZSTD_TrainDictionarySupported()) { + return Status::InvalidArgument( + "zstd dictionary trainer cannot be used because ZSTD 1.1.3+ " + "is not linked with the binary."); + } + } else if (!ZSTD_FinalizeDictionarySupported()) { + return Status::InvalidArgument( + "zstd finalizeDictionary cannot be used because ZSTD 1.4.5+ " + "is not linked with the binary."); + } + if (cf_options.compression_opts.max_dict_bytes == 0) { + return Status::InvalidArgument( + "The dictionary size limit (`CompressionOptions::max_dict_bytes`) " + "should be nonzero if we're using zstd's dictionary generator."); + } + } + + if (!CompressionTypeSupported(cf_options.blob_compression_type)) { + std::ostringstream oss; + oss << "The specified blob compression type " + << CompressionTypeToString(cf_options.blob_compression_type) + << " is not available."; + + return Status::InvalidArgument(oss.str()); + } + + return Status::OK(); +} + +Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) { + if (cf_options.inplace_update_support) { + return Status::InvalidArgument( + "In-place memtable updates (inplace_update_support) is not compatible " + "with concurrent writes (allow_concurrent_memtable_write)"); + } + if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) { + return Status::InvalidArgument( + "Memtable doesn't concurrent writes (allow_concurrent_memtable_write)"); + } + return Status::OK(); +} + +Status CheckCFPathsSupported(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options) { + // More than one cf_paths are supported only in universal + // and level compaction styles. This function also checks the case + // in which cf_paths is not specified, which results in db_paths + // being used. 
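+  // Example (illustrative, not part of the upstream sources): a column
+  // family using FIFO compaction with two cf_paths entries is rejected with
+  // Status::NotSupported below, while level and universal compaction accept
+  // multiple paths.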
+  if ((cf_options.compaction_style != kCompactionStyleUniversal) &&
+      (cf_options.compaction_style != kCompactionStyleLevel)) {
+    if (cf_options.cf_paths.size() > 1) {
+      return Status::NotSupported(
+          "More than one CF paths are only supported in "
+          "universal and level compaction styles. ");
+    } else if (cf_options.cf_paths.empty() && db_options.db_paths.size() > 1) {
+      return Status::NotSupported(
+          "More than one DB paths are only supported in "
+          "universal and level compaction styles. ");
+    }
+  }
+  return Status::OK();
+}
+
+namespace {
+const uint64_t kDefaultTtl = 0xfffffffffffffffe;
+const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe;
+} // anonymous namespace
+
+ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+                                    const ColumnFamilyOptions& src) {
+  ColumnFamilyOptions result = src;
+  size_t clamp_max = std::conditional<
+      sizeof(size_t) == 4, std::integral_constant<size_t, 0xffffffff>,
+      std::integral_constant<uint64_t, 64ull << 30>>::type::value;
+  ClipToRange(&result.write_buffer_size, (static_cast<size_t>(64)) << 10,
+              clamp_max);
+  // if user sets arena_block_size, we trust user to use this value. Otherwise,
+  // calculate a proper value from writer_buffer_size;
+  if (result.arena_block_size <= 0) {
+    result.arena_block_size =
+        std::min(size_t{1024 * 1024}, result.write_buffer_size / 8);
+
+    // Align up to 4k
+    const size_t align = 4 * 1024;
+    result.arena_block_size =
+        ((result.arena_block_size + align - 1) / align) * align;
+  }
+  result.min_write_buffer_number_to_merge =
+      std::min(result.min_write_buffer_number_to_merge,
+               result.max_write_buffer_number - 1);
+  if (result.min_write_buffer_number_to_merge < 1) {
+    result.min_write_buffer_number_to_merge = 1;
+  }
+
+  if (db_options.atomic_flush && result.min_write_buffer_number_to_merge > 1) {
+    ROCKS_LOG_WARN(
+        db_options.logger,
+        "Currently, if atomic_flush is true, then triggering flush for any "
+        "column family internally (non-manual flush) will trigger flushing "
+        "all column families even if the number of memtables is smaller "
+        "min_write_buffer_number_to_merge. Therefore, configuring "
+        "min_write_buffer_number_to_merge > 1 is not compatible and should "
+        "be satinized to 1. Not doing so will lead to data loss and "
+        "inconsistent state across multiple column families when WAL is "
+        "disabled, which is a common setting for atomic flush");
+
+    result.min_write_buffer_number_to_merge = 1;
+  }
+
+  if (result.num_levels < 1) {
+    result.num_levels = 1;
+  }
+  if (result.compaction_style == kCompactionStyleLevel &&
+      result.num_levels < 2) {
+    result.num_levels = 2;
+  }
+
+  if (result.compaction_style == kCompactionStyleUniversal &&
+      db_options.allow_ingest_behind && result.num_levels < 3) {
+    result.num_levels = 3;
+  }
+
+  if (result.max_write_buffer_number < 2) {
+    result.max_write_buffer_number = 2;
+  }
+  // fall back max_write_buffer_number_to_maintain if
+  // max_write_buffer_size_to_maintain is not set
+  if (result.max_write_buffer_size_to_maintain < 0) {
+    result.max_write_buffer_size_to_maintain =
+        result.max_write_buffer_number *
+        static_cast<int64_t>(result.write_buffer_size);
+  } else if (result.max_write_buffer_size_to_maintain == 0 &&
+             result.max_write_buffer_number_to_maintain < 0) {
+    result.max_write_buffer_number_to_maintain =
+        result.max_write_buffer_number;
+  }
+  // bloom filter size shouldn't exceed 1/4 of memtable size.
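+  // Worked example (illustrative, not part of the upstream sources): with
+  // write_buffer_size = 64 MB, a configured ratio above 0.25 is clamped so
+  // the memtable prefix bloom filter gets at most about 16 MB.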
+  if (result.memtable_prefix_bloom_size_ratio > 0.25) {
+    result.memtable_prefix_bloom_size_ratio = 0.25;
+  } else if (result.memtable_prefix_bloom_size_ratio < 0) {
+    result.memtable_prefix_bloom_size_ratio = 0;
+  }
+
+  if (!result.prefix_extractor) {
+    assert(result.memtable_factory);
+    Slice name = result.memtable_factory->Name();
+    if (name.compare("HashSkipListRepFactory") == 0 ||
+        name.compare("HashLinkListRepFactory") == 0) {
+      result.memtable_factory = std::make_shared<SkipListFactory>();
+    }
+  }
+
+  if (result.compaction_style == kCompactionStyleFIFO) {
+    // since we delete level0 files in FIFO compaction when there are too many
+    // of them, these options don't really mean anything
+    result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+    result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+  }
+
+  if (result.max_bytes_for_level_multiplier <= 0) {
+    result.max_bytes_for_level_multiplier = 1;
+  }
+
+  if (result.level0_file_num_compaction_trigger == 0) {
+    ROCKS_LOG_WARN(db_options.logger,
+                   "level0_file_num_compaction_trigger cannot be 0");
+    result.level0_file_num_compaction_trigger = 1;
+  }
+
+  if (result.level0_stop_writes_trigger <
+          result.level0_slowdown_writes_trigger ||
+      result.level0_slowdown_writes_trigger <
+          result.level0_file_num_compaction_trigger) {
+    ROCKS_LOG_WARN(db_options.logger,
+                   "This condition must be satisfied: "
+                   "level0_stop_writes_trigger(%d) >= "
+                   "level0_slowdown_writes_trigger(%d) >= "
+                   "level0_file_num_compaction_trigger(%d)",
+                   result.level0_stop_writes_trigger,
+                   result.level0_slowdown_writes_trigger,
+                   result.level0_file_num_compaction_trigger);
+    if (result.level0_slowdown_writes_trigger <
+        result.level0_file_num_compaction_trigger) {
+      result.level0_slowdown_writes_trigger =
+          result.level0_file_num_compaction_trigger;
+    }
+    if (result.level0_stop_writes_trigger <
+        result.level0_slowdown_writes_trigger) {
+      result.level0_stop_writes_trigger =
+          result.level0_slowdown_writes_trigger;
+    }
+    ROCKS_LOG_WARN(db_options.logger,
+                   "Adjust the value to "
+                   "level0_stop_writes_trigger(%d)"
+                   "level0_slowdown_writes_trigger(%d)"
+                   "level0_file_num_compaction_trigger(%d)",
+                   result.level0_stop_writes_trigger,
+                   result.level0_slowdown_writes_trigger,
+                   result.level0_file_num_compaction_trigger);
+  }
+
+  if (result.soft_pending_compaction_bytes_limit == 0) {
+    result.soft_pending_compaction_bytes_limit =
+        result.hard_pending_compaction_bytes_limit;
+  } else if (result.hard_pending_compaction_bytes_limit > 0 &&
+             result.soft_pending_compaction_bytes_limit >
+                 result.hard_pending_compaction_bytes_limit) {
+    result.soft_pending_compaction_bytes_limit =
+        result.hard_pending_compaction_bytes_limit;
+  }
+
+  // When the DB is stopped, it's possible that there are some .trash files
+  // that were not deleted yet, when we open the DB we will find these .trash
+  // files and schedule them to be deleted (or delete immediately if
+  // SstFileManager was not used)
+  auto sfm =
+      static_cast<SstFileManagerImpl*>(db_options.sst_file_manager.get());
+  for (size_t i = 0; i < result.cf_paths.size(); i++) {
+    DeleteScheduler::CleanupDirectory(db_options.env, sfm,
+                                      result.cf_paths[i].path)
+        .PermitUncheckedError();
+  }
+
+  if (result.cf_paths.empty()) {
+    result.cf_paths = db_options.db_paths;
+  }
+
+  if (result.level_compaction_dynamic_level_bytes) {
+    if (result.compaction_style != kCompactionStyleLevel) {
+      ROCKS_LOG_WARN(db_options.info_log.get(),
+                     "level_compaction_dynamic_level_bytes only makes sense"
+                     "for level-based compaction");
+
result.level_compaction_dynamic_level_bytes = false; + } else if (result.cf_paths.size() > 1U) { + // we don't yet know how to make both of this feature and multiple + // DB path work. + ROCKS_LOG_WARN(db_options.info_log.get(), + "multiple cf_paths/db_paths and" + "level_compaction_dynamic_level_bytes" + "can't be used together"); + result.level_compaction_dynamic_level_bytes = false; + } + } + + if (result.max_compaction_bytes == 0) { + result.max_compaction_bytes = result.target_file_size_base * 25; + } + + bool is_block_based_table = (result.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())); + + const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60; + if (result.ttl == kDefaultTtl) { + if (is_block_based_table && + result.compaction_style != kCompactionStyleFIFO) { + result.ttl = kAdjustedTtl; + } else { + result.ttl = 0; + } + } + + const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60; + + // Turn on periodic compactions and set them to occur once every 30 days if + // compaction filters are used and periodic_compaction_seconds is set to the + // default value. + if (result.compaction_style != kCompactionStyleFIFO) { + if ((result.compaction_filter != nullptr || + result.compaction_filter_factory != nullptr) && + result.periodic_compaction_seconds == kDefaultPeriodicCompSecs && + is_block_based_table) { + result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; + } + } else { + // result.compaction_style == kCompactionStyleFIFO + if (result.ttl == 0) { + if (is_block_based_table) { + if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) { + result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; + } + result.ttl = result.periodic_compaction_seconds; + } + } else if (result.periodic_compaction_seconds != 0) { + result.ttl = std::min(result.ttl, result.periodic_compaction_seconds); + } + } + + // TTL compactions would work similar to Periodic Compactions in Universal in + // most of the cases. So, if ttl is set, execute the periodic compaction + // codepath. + if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) { + if (result.periodic_compaction_seconds != 0) { + result.periodic_compaction_seconds = + std::min(result.ttl, result.periodic_compaction_seconds); + } else { + result.periodic_compaction_seconds = result.ttl; + } + } + + if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) { + result.periodic_compaction_seconds = 0; + } + + return result; +} + +int SuperVersion::dummy = 0; +void* const SuperVersion::kSVInUse = &SuperVersion::dummy; +void* const SuperVersion::kSVObsolete = nullptr; + +SuperVersion::~SuperVersion() { + for (auto td : to_delete) { + delete td; + } +} + +SuperVersion* SuperVersion::Ref() { + refs.fetch_add(1, std::memory_order_relaxed); + return this; +} + +bool SuperVersion::Unref() { + // fetch_sub returns the previous value of ref + uint32_t previous_refs = refs.fetch_sub(1); + assert(previous_refs > 0); + return previous_refs == 1; +} + +void SuperVersion::Cleanup() { + assert(refs.load(std::memory_order_relaxed) == 0); + // Since this SuperVersion object is being deleted, + // decrement reference to the immutable MemtableList + // this SV object was pointing to. 
+ imm->Unref(&to_delete); + MemTable* m = mem->Unref(); + if (m != nullptr) { + auto* memory_usage = current->cfd()->imm()->current_memory_usage(); + assert(*memory_usage >= m->ApproximateMemoryUsage()); + *memory_usage -= m->ApproximateMemoryUsage(); + to_delete.push_back(m); + } + current->Unref(); + cfd->UnrefAndTryDelete(); +} + +void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem, + MemTableListVersion* new_imm, Version* new_current) { + cfd = new_cfd; + mem = new_mem; + imm = new_imm; + current = new_current; + cfd->Ref(); + mem->Ref(); + imm->Ref(); + current->Ref(); + refs.store(1, std::memory_order_relaxed); +} + +namespace { +void SuperVersionUnrefHandle(void* ptr) { + // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets + // destroyed. When the former happens, the thread shouldn't see kSVInUse. + // When the latter happens, only super_version_ holds a reference + // to ColumnFamilyData, so no further queries are possible. + SuperVersion* sv = static_cast(ptr); + bool was_last_ref __attribute__((__unused__)); + was_last_ref = sv->Unref(); + // Thread-local SuperVersions can't outlive ColumnFamilyData::super_version_. + // This is important because we can't do SuperVersion cleanup here. + // That would require locking DB mutex, which would deadlock because + // SuperVersionUnrefHandle is called with locked ThreadLocalPtr mutex. + assert(!was_last_ref); +} +} // anonymous namespace + +std::vector ColumnFamilyData::GetDbPaths() const { + std::vector paths; + paths.reserve(ioptions_.cf_paths.size()); + for (const DbPath& db_path : ioptions_.cf_paths) { + paths.emplace_back(db_path.path); + } + return paths; +} + +const uint32_t ColumnFamilyData::kDummyColumnFamilyDataId = + std::numeric_limits::max(); + +ColumnFamilyData::ColumnFamilyData( + uint32_t id, const std::string& name, Version* _dummy_versions, + Cache* _table_cache, WriteBufferManager* write_buffer_manager, + const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options, + const FileOptions* file_options, ColumnFamilySet* column_family_set, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, const std::string& db_id, + const std::string& db_session_id) + : id_(id), + name_(name), + dummy_versions_(_dummy_versions), + current_(nullptr), + refs_(0), + initialized_(false), + dropped_(false), + internal_comparator_(cf_options.comparator), + initial_cf_options_(SanitizeOptions(db_options, cf_options)), + ioptions_(db_options, initial_cf_options_), + mutable_cf_options_(initial_cf_options_), + is_delete_range_supported_( + cf_options.table_factory->IsDeleteRangeSupported()), + write_buffer_manager_(write_buffer_manager), + mem_(nullptr), + imm_(ioptions_.min_write_buffer_number_to_merge, + ioptions_.max_write_buffer_number_to_maintain, + ioptions_.max_write_buffer_size_to_maintain), + super_version_(nullptr), + super_version_number_(0), + local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)), + next_(nullptr), + prev_(nullptr), + log_number_(0), + column_family_set_(column_family_set), + queued_for_flush_(false), + queued_for_compaction_(false), + prev_compaction_needed_bytes_(0), + allow_2pc_(db_options.allow_2pc), + last_memtable_id_(0), + db_paths_registered_(false), + mempurge_used_(false), + next_epoch_number_(1) { + if (id_ != kDummyColumnFamilyDataId) { + // TODO(cc): RegisterDbPaths can be expensive, considering moving it + // outside of this constructor which might be called with db mutex held. 
+ // TODO(cc): considering using ioptions_.fs, currently some tests rely on + // EnvWrapper, that's the main reason why we use env here. + Status s = ioptions_.env->RegisterDbPaths(GetDbPaths()); + if (s.ok()) { + db_paths_registered_ = true; + } else { + ROCKS_LOG_ERROR( + ioptions_.logger, + "Failed to register data paths of column family (id: %d, name: %s)", + id_, name_.c_str()); + } + } + Ref(); + + // Convert user defined table properties collector factories to internal ones. + GetIntTblPropCollectorFactory(ioptions_, &int_tbl_prop_collector_factories_); + + // if _dummy_versions is nullptr, then this is a dummy column family. + if (_dummy_versions != nullptr) { + internal_stats_.reset( + new InternalStats(ioptions_.num_levels, ioptions_.clock, this)); + table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache, + block_cache_tracer, io_tracer, + db_session_id)); + blob_file_cache_.reset( + new BlobFileCache(_table_cache, ioptions(), soptions(), id_, + internal_stats_->GetBlobFileReadHist(), io_tracer)); + blob_source_.reset(new BlobSource(ioptions(), db_id, db_session_id, + blob_file_cache_.get())); + + if (ioptions_.compaction_style == kCompactionStyleLevel) { + compaction_picker_.reset( + new LevelCompactionPicker(ioptions_, &internal_comparator_)); + } else if (ioptions_.compaction_style == kCompactionStyleUniversal) { + compaction_picker_.reset( + new UniversalCompactionPicker(ioptions_, &internal_comparator_)); + } else if (ioptions_.compaction_style == kCompactionStyleFIFO) { + compaction_picker_.reset( + new FIFOCompactionPicker(ioptions_, &internal_comparator_)); + } else if (ioptions_.compaction_style == kCompactionStyleNone) { + compaction_picker_.reset( + new NullCompactionPicker(ioptions_, &internal_comparator_)); + ROCKS_LOG_WARN(ioptions_.logger, + "Column family %s does not use any background compaction. " + "Compactions can only be done via CompactFiles\n", + GetName().c_str()); + } else { + ROCKS_LOG_ERROR(ioptions_.logger, + "Unable to recognize the specified compaction style %d. 
" + "Column family %s will use kCompactionStyleLevel.\n", + ioptions_.compaction_style, GetName().c_str()); + compaction_picker_.reset( + new LevelCompactionPicker(ioptions_, &internal_comparator_)); + } + + if (column_family_set_->NumberOfColumnFamilies() < 10) { + ROCKS_LOG_INFO(ioptions_.logger, + "--------------- Options for column family [%s]:\n", + name.c_str()); + initial_cf_options_.Dump(ioptions_.logger); + } else { + ROCKS_LOG_INFO(ioptions_.logger, "\t(skipping printing options)\n"); + } + } + + RecalculateWriteStallConditions(mutable_cf_options_); + + if (cf_options.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName()) && + cf_options.table_factory->GetOptions()) { + const BlockBasedTableOptions* bbto = + cf_options.table_factory->GetOptions(); + const auto& options_overrides = bbto->cache_usage_options.options_overrides; + const auto file_metadata_charged = + options_overrides.at(CacheEntryRole::kFileMetadata).charged; + if (bbto->block_cache && + file_metadata_charged == CacheEntryRoleOptions::Decision::kEnabled) { + // TODO(hx235): Add a `ConcurrentCacheReservationManager` at DB scope + // responsible for reservation of `ObsoleteFileInfo` so that we can keep + // this `file_metadata_cache_res_mgr_` nonconcurrent + file_metadata_cache_res_mgr_.reset(new ConcurrentCacheReservationManager( + std::make_shared< + CacheReservationManagerImpl>( + bbto->block_cache))); + } + } +} + +// DB mutex held +ColumnFamilyData::~ColumnFamilyData() { + assert(refs_.load(std::memory_order_relaxed) == 0); + // remove from linked list + auto prev = prev_; + auto next = next_; + prev->next_ = next; + next->prev_ = prev; + + if (!dropped_ && column_family_set_ != nullptr) { + // If it's dropped, it's already removed from column family set + // If column_family_set_ == nullptr, this is dummy CFD and not in + // ColumnFamilySet + column_family_set_->RemoveColumnFamily(this); + } + + if (current_ != nullptr) { + current_->Unref(); + } + + // It would be wrong if this ColumnFamilyData is in flush_queue_ or + // compaction_queue_ and we destroyed it + assert(!queued_for_flush_); + assert(!queued_for_compaction_); + assert(super_version_ == nullptr); + + if (dummy_versions_ != nullptr) { + // List must be empty + assert(dummy_versions_->Next() == dummy_versions_); + bool deleted __attribute__((__unused__)); + deleted = dummy_versions_->Unref(); + assert(deleted); + } + + if (mem_ != nullptr) { + delete mem_->Unref(); + } + autovector to_delete; + imm_.current()->Unref(&to_delete); + for (MemTable* m : to_delete) { + delete m; + } + + if (db_paths_registered_) { + // TODO(cc): considering using ioptions_.fs, currently some tests rely on + // EnvWrapper, that's the main reason why we use env here. + Status s = ioptions_.env->UnregisterDbPaths(GetDbPaths()); + if (!s.ok()) { + ROCKS_LOG_ERROR( + ioptions_.logger, + "Failed to unregister data paths of column family (id: %d, name: %s)", + id_, name_.c_str()); + } + } +} + +bool ColumnFamilyData::UnrefAndTryDelete() { + int old_refs = refs_.fetch_sub(1); + assert(old_refs > 0); + + if (old_refs == 1) { + assert(super_version_ == nullptr); + delete this; + return true; + } + + if (old_refs == 2 && super_version_ != nullptr) { + // Only the super_version_ holds me + SuperVersion* sv = super_version_; + super_version_ = nullptr; + + // Release SuperVersion references kept in ThreadLocalPtr. 
+ local_sv_.reset(); + + if (sv->Unref()) { + // Note: sv will delete this ColumnFamilyData during Cleanup() + assert(sv->cfd == this); + sv->Cleanup(); + delete sv; + return true; + } + } + return false; +} + +void ColumnFamilyData::SetDropped() { + // can't drop default CF + assert(id_ != 0); + dropped_ = true; + write_controller_token_.reset(); + + // remove from column_family_set + column_family_set_->RemoveColumnFamily(this); +} + +ColumnFamilyOptions ColumnFamilyData::GetLatestCFOptions() const { + return BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_); +} + +uint64_t ColumnFamilyData::OldestLogToKeep() { + auto current_log = GetLogNumber(); + + if (allow_2pc_) { + auto imm_prep_log = imm()->PrecomputeMinLogContainingPrepSection(); + auto mem_prep_log = mem()->GetMinLogContainingPrepSection(); + + if (imm_prep_log > 0 && imm_prep_log < current_log) { + current_log = imm_prep_log; + } + + if (mem_prep_log > 0 && mem_prep_log < current_log) { + current_log = mem_prep_log; + } + } + + return current_log; +} + +const double kIncSlowdownRatio = 0.8; +const double kDecSlowdownRatio = 1 / kIncSlowdownRatio; +const double kNearStopSlowdownRatio = 0.6; +const double kDelayRecoverSlowdownRatio = 1.4; + +namespace { +// If penalize_stop is true, we further reduce slowdown rate. +std::unique_ptr SetupDelay( + WriteController* write_controller, uint64_t compaction_needed_bytes, + uint64_t prev_compaction_need_bytes, bool penalize_stop, + bool auto_compactions_disabled) { + const uint64_t kMinWriteRate = 16 * 1024u; // Minimum write rate 16KB/s. + + uint64_t max_write_rate = write_controller->max_delayed_write_rate(); + uint64_t write_rate = write_controller->delayed_write_rate(); + + if (auto_compactions_disabled) { + // When auto compaction is disabled, always use the value user gave. + write_rate = max_write_rate; + } else if (write_controller->NeedsDelay() && max_write_rate > kMinWriteRate) { + // If user gives rate less than kMinWriteRate, don't adjust it. + // + // If already delayed, need to adjust based on previous compaction debt. + // When there are two or more column families require delay, we always + // increase or reduce write rate based on information for one single + // column family. It is likely to be OK but we can improve if there is a + // problem. + // Ignore compaction_needed_bytes = 0 case because compaction_needed_bytes + // is only available in level-based compaction + // + // If the compaction debt stays the same as previously, we also further slow + // down. It usually means a mem table is full. It's mainly for the case + // where both of flush and compaction are much slower than the speed we + // insert to mem tables, so we need to actively slow down before we get + // feedback signal from compaction and flushes to avoid the full stop + // because of hitting the max write buffer number. + // + // If DB just falled into the stop condition, we need to further reduce + // the write rate to avoid the stop condition. + if (penalize_stop) { + // Penalize the near stop or stop condition by more aggressive slowdown. + // This is to provide the long term slowdown increase signal. + // The penalty is more than the reward of recovering to the normal + // condition. 
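+      // Worked example (illustrative, not part of the upstream sources):
+      // from a delayed rate of 16 MB/s, the near-stop penalty gives
+      // 16 * 0.6 = 9.6 MB/s; growing compaction debt gives
+      // 16 * 0.8 = 12.8 MB/s; recovery multiplies by 1/0.8 = 1.25x, capped at
+      // max_delayed_write_rate.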
+      write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+                                         kNearStopSlowdownRatio);
+      if (write_rate < kMinWriteRate) {
+        write_rate = kMinWriteRate;
+      }
+    } else if (prev_compaction_need_bytes > 0 &&
+               prev_compaction_need_bytes <= compaction_needed_bytes) {
+      write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+                                         kIncSlowdownRatio);
+      if (write_rate < kMinWriteRate) {
+        write_rate = kMinWriteRate;
+      }
+    } else if (prev_compaction_need_bytes > compaction_needed_bytes) {
+      // We are speeding up by ratio of kSlowdownRatio when we have paid
+      // compaction debt. But we'll never speed up to faster than the write
+      // rate given by users.
+      write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+                                         kDecSlowdownRatio);
+      if (write_rate > max_write_rate) {
+        write_rate = max_write_rate;
+      }
+    }
+  }
+  return write_controller->GetDelayToken(write_rate);
+}
+
+int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
+                                    int level0_slowdown_writes_trigger) {
+  // SanitizeOptions() ensures it.
+  assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger);
+
+  if (level0_file_num_compaction_trigger < 0) {
+    return std::numeric_limits<int>::max();
+  }
+
+  const int64_t twice_level0_trigger =
+      static_cast<int64_t>(level0_file_num_compaction_trigger) * 2;
+
+  const int64_t one_fourth_trigger_slowdown =
+      static_cast<int64_t>(level0_file_num_compaction_trigger) +
+      ((level0_slowdown_writes_trigger - level0_file_num_compaction_trigger) /
+       4);
+
+  assert(twice_level0_trigger >= 0);
+  assert(one_fourth_trigger_slowdown >= 0);
+
+  // 1/4 of the way between L0 compaction trigger threshold and slowdown
+  // condition.
+  // Or twice as compaction trigger, if it is smaller.
+  int64_t res = std::min(twice_level0_trigger, one_fourth_trigger_slowdown);
+  if (res >= std::numeric_limits<int>::max()) {
+    return std::numeric_limits<int>::max();
+  } else {
+    // res fits in int
+    return static_cast<int>(res);
+  }
+}
+} // anonymous namespace
+
+std::pair<WriteStallCondition, WriteStallCause>
+ColumnFamilyData::GetWriteStallConditionAndCause(
+    int num_unflushed_memtables, int num_l0_files,
+    uint64_t num_compaction_needed_bytes,
+    const MutableCFOptions& mutable_cf_options,
+    const ImmutableCFOptions& immutable_cf_options) {
+  if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) {
+    return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit};
+  } else if (!mutable_cf_options.disable_auto_compactions &&
+             num_l0_files >= mutable_cf_options.level0_stop_writes_trigger) {
+    return {WriteStallCondition::kStopped, WriteStallCause::kL0FileCountLimit};
+  } else if (!mutable_cf_options.disable_auto_compactions &&
+             mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+             num_compaction_needed_bytes >=
+                 mutable_cf_options.hard_pending_compaction_bytes_limit) {
+    return {WriteStallCondition::kStopped,
+            WriteStallCause::kPendingCompactionBytes};
+  } else if (mutable_cf_options.max_write_buffer_number > 3 &&
+             num_unflushed_memtables >=
+                 mutable_cf_options.max_write_buffer_number - 1 &&
+             num_unflushed_memtables - 1 >=
+                 immutable_cf_options.min_write_buffer_number_to_merge) {
+    return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit};
+  } else if (!mutable_cf_options.disable_auto_compactions &&
+             mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
+             num_l0_files >=
+                 mutable_cf_options.level0_slowdown_writes_trigger) {
+    return {WriteStallCondition::kDelayed, WriteStallCause::kL0FileCountLimit};
+  } else if (!mutable_cf_options.disable_auto_compactions &&
+             mutable_cf_options.soft_pending_compaction_bytes_limit > 0 &&
+
+             num_compaction_needed_bytes >=
+                 mutable_cf_options.soft_pending_compaction_bytes_limit) {
+    return {WriteStallCondition::kDelayed,
+            WriteStallCause::kPendingCompactionBytes};
+  }
+  return {WriteStallCondition::kNormal, WriteStallCause::kNone};
+}
+
+WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
+    const MutableCFOptions& mutable_cf_options) {
+  auto write_stall_condition = WriteStallCondition::kNormal;
+  if (current_ != nullptr) {
+    auto* vstorage = current_->storage_info();
+    auto write_controller = column_family_set_->write_controller_;
+    uint64_t compaction_needed_bytes =
+        vstorage->estimated_compaction_needed_bytes();
+
+    auto write_stall_condition_and_cause = GetWriteStallConditionAndCause(
+        imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(),
+        vstorage->estimated_compaction_needed_bytes(), mutable_cf_options,
+        *ioptions());
+    write_stall_condition = write_stall_condition_and_cause.first;
+    auto write_stall_cause = write_stall_condition_and_cause.second;
+
+    bool was_stopped = write_controller->IsStopped();
+    bool needed_delay = write_controller->NeedsDelay();
+
+    if (write_stall_condition == WriteStallCondition::kStopped &&
+        write_stall_cause == WriteStallCause::kMemtableLimit) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1);
+      ROCKS_LOG_WARN(
+          ioptions_.logger,
+          "[%s] Stopping writes because we have %d immutable memtables "
+          "(waiting for flush), max_write_buffer_number is set to %d",
+          name_.c_str(), imm()->NumNotFlushed(),
+          mutable_cf_options.max_write_buffer_number);
+    } else if (write_stall_condition == WriteStallCondition::kStopped &&
+               write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1);
+      if (compaction_picker_->IsLevel0CompactionInProgress()) {
+        internal_stats_->AddCFStats(
+            InternalStats::L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION,
+            1);
+      }
+      ROCKS_LOG_WARN(ioptions_.logger,
+                     "[%s] Stopping writes because we have %d level-0 files",
+                     name_.c_str(), vstorage->l0_delay_trigger_count());
+    } else if (write_stall_condition == WriteStallCondition::kStopped &&
+               write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(
+          InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1);
+      ROCKS_LOG_WARN(
+          ioptions_.logger,
+          "[%s] Stopping writes because of estimated pending compaction "
+          "bytes %" PRIu64,
+          name_.c_str(), compaction_needed_bytes);
+    } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+               write_stall_cause == WriteStallCause::kMemtableLimit) {
+      write_controller_token_ =
+          SetupDelay(write_controller, compaction_needed_bytes,
+                     prev_compaction_needed_bytes_, was_stopped,
+                     mutable_cf_options.disable_auto_compactions);
+      internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_DELAYS, 1);
+      ROCKS_LOG_WARN(
+          ioptions_.logger,
+          "[%s] Stalling writes because we have %d immutable memtables "
+          "(waiting for flush), max_write_buffer_number is set to %d "
+          "rate %" PRIu64,
+          name_.c_str(), imm()->NumNotFlushed(),
+          mutable_cf_options.max_write_buffer_number,
+          write_controller->delayed_write_rate());
+    } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+               write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+      // L0 is the last two files from stopping.
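+      // In other words: once the L0 file count comes within two files of
+      // level0_stop_writes_trigger, the near-stop flag below is set and
+      // SetupDelay() applies the stronger kNearStopSlowdownRatio cut.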
+      bool near_stop = vstorage->l0_delay_trigger_count() >=
+                       mutable_cf_options.level0_stop_writes_trigger - 2;
+      write_controller_token_ =
+          SetupDelay(write_controller, compaction_needed_bytes,
+                     prev_compaction_needed_bytes_, was_stopped || near_stop,
+                     mutable_cf_options.disable_auto_compactions);
+      internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_DELAYS,
+                                  1);
+      if (compaction_picker_->IsLevel0CompactionInProgress()) {
+        internal_stats_->AddCFStats(
+            InternalStats::L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION,
+            1);
+      }
+      ROCKS_LOG_WARN(ioptions_.logger,
+                     "[%s] Stalling writes because we have %d level-0 files "
+                     "rate %" PRIu64,
+                     name_.c_str(), vstorage->l0_delay_trigger_count(),
+                     write_controller->delayed_write_rate());
+    } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+               write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+      // If the distance to the hard limit is less than 1/4 of the gap between
+      // the soft and hard bytes limits, we think it is near stop and speed up
+      // the slowdown.
+      bool near_stop =
+          mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+          (compaction_needed_bytes -
+           mutable_cf_options.soft_pending_compaction_bytes_limit) >
+              3 *
+                  (mutable_cf_options.hard_pending_compaction_bytes_limit -
+                   mutable_cf_options.soft_pending_compaction_bytes_limit) /
+                  4;
+
+      write_controller_token_ =
+          SetupDelay(write_controller, compaction_needed_bytes,
+                     prev_compaction_needed_bytes_, was_stopped || near_stop,
+                     mutable_cf_options.disable_auto_compactions);
+      internal_stats_->AddCFStats(
+          InternalStats::PENDING_COMPACTION_BYTES_LIMIT_DELAYS, 1);
+      ROCKS_LOG_WARN(
+          ioptions_.logger,
+          "[%s] Stalling writes because of estimated pending compaction "
+          "bytes %" PRIu64 " rate %" PRIu64,
+          name_.c_str(), vstorage->estimated_compaction_needed_bytes(),
+          write_controller->delayed_write_rate());
+    } else {
+      assert(write_stall_condition == WriteStallCondition::kNormal);
+      if (vstorage->l0_delay_trigger_count() >=
+          GetL0ThresholdSpeedupCompaction(
+              mutable_cf_options.level0_file_num_compaction_trigger,
+              mutable_cf_options.level0_slowdown_writes_trigger)) {
+        write_controller_token_ =
+            write_controller->GetCompactionPressureToken();
+        ROCKS_LOG_INFO(
+            ioptions_.logger,
+            "[%s] Increasing compaction threads because we have %d level-0 "
+            "files ",
+            name_.c_str(), vstorage->l0_delay_trigger_count());
+      } else if (vstorage->estimated_compaction_needed_bytes() >=
+                 mutable_cf_options.soft_pending_compaction_bytes_limit / 4) {
+        // Increase compaction threads if bytes needed for compaction exceeds
+        // 1/4 of threshold for slowing down.
+        // If soft pending compaction byte limit is not set, always speed up
+        // compaction.
+        write_controller_token_ =
+            write_controller->GetCompactionPressureToken();
+        if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) {
+          ROCKS_LOG_INFO(
+              ioptions_.logger,
+              "[%s] Increasing compaction threads because of estimated "
+              "pending compaction "
+              "bytes %" PRIu64,
+              name_.c_str(), vstorage->estimated_compaction_needed_bytes());
+        }
+      } else {
+        write_controller_token_.reset();
+      }
+      // If the DB recovers from delay conditions, we reward with reducing
+      // double the slowdown ratio. This is to balance the long term slowdown
+      // increase signal.
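+      // Worked example, assuming kDelayRecoverSlowdownRatio is 1.4 (its value
+      // in upstream RocksDB): a delayed write rate of 16 MB/s is raised to
+      // ~22.4 MB/s below, while the low-pri rate limiter is pinned at
+      // 16 / 4 = 4 MB/s based on the pre-adjustment rate.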
+      if (needed_delay) {
+        uint64_t write_rate = write_controller->delayed_write_rate();
+        write_controller->set_delayed_write_rate(static_cast<uint64_t>(
+            static_cast<double>(write_rate) * kDelayRecoverSlowdownRatio));
+        // Set the low pri limit to be 1/4 the delayed write rate.
+        // Note we don't reset this value even after the delay condition is
+        // released. Low-pri rate will continue to apply if there is a
+        // compaction pressure.
+        write_controller->low_pri_rate_limiter()->SetBytesPerSecond(
+            write_rate / 4);
+      }
+    }
+    prev_compaction_needed_bytes_ = compaction_needed_bytes;
+  }
+  return write_stall_condition;
+}
+
+const FileOptions* ColumnFamilyData::soptions() const {
+  return &(column_family_set_->file_options_);
+}
+
+void ColumnFamilyData::SetCurrent(Version* current_version) {
+  current_ = current_version;
+}
+
+uint64_t ColumnFamilyData::GetNumLiveVersions() const {
+  return VersionSet::GetNumLiveVersions(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetTotalSstFilesSize() const {
+  return VersionSet::GetTotalSstFilesSize(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetTotalBlobFileSize() const {
+  return VersionSet::GetTotalBlobFileSize(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetLiveSstFilesSize() const {
+  return current_->GetSstFilesSize();
+}
+
+MemTable* ColumnFamilyData::ConstructNewMemtable(
+    const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+  return new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
+                      write_buffer_manager_, earliest_seq, id_);
+}
+
+void ColumnFamilyData::CreateNewMemtable(
+    const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+  if (mem_ != nullptr) {
+    delete mem_->Unref();
+  }
+  SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq));
+  mem_->Ref();
+}
+
+bool ColumnFamilyData::NeedsCompaction() const {
+  return !mutable_cf_options_.disable_auto_compactions &&
+         compaction_picker_->NeedsCompaction(current_->storage_info());
+}
+
+Compaction* ColumnFamilyData::PickCompaction(
+    const MutableCFOptions& mutable_options,
+    const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) {
+  auto* result = compaction_picker_->PickCompaction(
+      GetName(), mutable_options, mutable_db_options, current_->storage_info(),
+      log_buffer);
+  if (result != nullptr) {
+    result->SetInputVersion(current_);
+  }
+  return result;
+}
+
+bool ColumnFamilyData::RangeOverlapWithCompaction(
+    const Slice& smallest_user_key, const Slice& largest_user_key,
+    int level) const {
+  return compaction_picker_->RangeOverlapWithCompaction(
+      smallest_user_key, largest_user_key, level);
+}
+
+Status ColumnFamilyData::RangesOverlapWithMemtables(
+    const autovector<Range>& ranges, SuperVersion* super_version,
+    bool allow_data_in_errors, bool* overlap) {
+  assert(overlap != nullptr);
+  *overlap = false;
+  // Create an InternalIterator over all unflushed memtables
+  Arena arena;
+  // TODO: plumb Env::IOActivity
+  ReadOptions read_opts;
+  read_opts.total_order_seek = true;
+  MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
+  merge_iter_builder.AddIterator(
+      super_version->mem->NewIterator(read_opts, &arena));
+  super_version->imm->AddIterators(read_opts, &merge_iter_builder,
+                                   false /* add_range_tombstone_iter */);
+  ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());
+
+  auto read_seq = super_version->current->version_set()->LastSequence();
+  ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq);
+  auto* active_range_del_iter =
+      super_version->mem->NewRangeTombstoneIterator(
+          read_opts, read_seq, false /* immutable_memtable */);
+  range_del_agg.AddTombstones(
+      std::unique_ptr<FragmentedRangeTombstoneIterator>(
+          active_range_del_iter));
+  Status status;
+  status = super_version->imm->AddRangeTombstoneIterators(
+      read_opts, nullptr /* arena */, &range_del_agg);
+  // AddRangeTombstoneIterators always returns Status::OK.
+  assert(status.ok());
+
+  for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) {
+    auto* vstorage = super_version->current->storage_info();
+    auto* ucmp = vstorage->InternalComparator()->user_comparator();
+    InternalKey range_start(ranges[i].start, kMaxSequenceNumber,
+                            kValueTypeForSeek);
+    memtable_iter->Seek(range_start.Encode());
+    status = memtable_iter->status();
+    ParsedInternalKey seek_result;
+
+    if (status.ok() && memtable_iter->Valid()) {
+      status = ParseInternalKey(memtable_iter->key(), &seek_result,
+                                allow_data_in_errors);
+    }
+
+    if (status.ok()) {
+      if (memtable_iter->Valid() &&
+          ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) {
+        *overlap = true;
+      } else if (range_del_agg.IsRangeOverlapped(ranges[i].start,
+                                                 ranges[i].limit)) {
+        *overlap = true;
+      }
+    }
+  }
+  return status;
+}
+
+const int ColumnFamilyData::kCompactAllLevels = -1;
+const int ColumnFamilyData::kCompactToBaseLevel = -2;
+
+Compaction* ColumnFamilyData::CompactRange(
+    const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, int input_level,
+    int output_level, const CompactRangeOptions& compact_range_options,
+    const InternalKey* begin, const InternalKey* end,
+    InternalKey** compaction_end, bool* conflict,
+    uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+  auto* result = compaction_picker_->CompactRange(
+      GetName(), mutable_cf_options, mutable_db_options,
+      current_->storage_info(), input_level, output_level,
+      compact_range_options, begin, end, compaction_end, conflict,
+      max_file_num_to_ignore, trim_ts);
+  if (result != nullptr) {
+    result->SetInputVersion(current_);
+  }
+  TEST_SYNC_POINT("ColumnFamilyData::CompactRange:Return");
+  return result;
+}
+
+SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(DBImpl* db) {
+  SuperVersion* sv = GetThreadLocalSuperVersion(db);
+  sv->Ref();
+  if (!ReturnThreadLocalSuperVersion(sv)) {
+    // This Unref() corresponds to the Ref() in GetThreadLocalSuperVersion()
+    // when the thread-local pointer was populated. So, the Ref() earlier in
+    // this function still prevents the returned SuperVersion* from being
+    // deleted out from under the caller.
+    sv->Unref();
+  }
+  return sv;
+}
+
+SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
+  // The SuperVersion is cached in thread local storage to avoid acquiring
+  // mutex when SuperVersion does not change since the last use. When a new
+  // SuperVersion is installed, the compaction or flush thread cleans up
+  // cached SuperVersion in all existing thread local storage. To avoid
+  // acquiring mutex for this operation, we use atomic Swap() on the thread
+  // local pointer to guarantee exclusive access. If the thread local pointer
+  // is being used while a new SuperVersion is installed, the cached
+  // SuperVersion can become stale. In that case, the background thread would
+  // have swapped in kSVObsolete. We re-check the value when returning
+  // SuperVersion back to thread local, with an atomic compare and swap.
+  // The superversion will need to be released if detected to be stale.
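+  // The acquire/release protocol, as exercised by GetReferencedSuperVersion()
+  // above: Ref() the SuperVersion obtained here, try to hand the cached
+  // pointer back with ReturnThreadLocalSuperVersion(), and Unref() yourself
+  // if the hand-back fails because the cache was scraped in the meantime.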
+  void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
+  // Invariant:
+  // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
+  // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
+  // should only keep kSVInUse before ReturnThreadLocalSuperVersion call
+  // (if no Scrape happens).
+  assert(ptr != SuperVersion::kSVInUse);
+  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+  if (sv == SuperVersion::kSVObsolete ||
+      sv->version_number != super_version_number_.load()) {
+    RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES);
+    SuperVersion* sv_to_delete = nullptr;
+
+    if (sv && sv->Unref()) {
+      RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS);
+      db->mutex()->Lock();
+      // NOTE: underlying resources held by superversion (sst files) might
+      // not be released until the next background job.
+      sv->Cleanup();
+      if (db->immutable_db_options().avoid_unnecessary_blocking_io) {
+        db->AddSuperVersionsToFreeQueue(sv);
+        db->SchedulePurge();
+      } else {
+        sv_to_delete = sv;
+      }
+    } else {
+      db->mutex()->Lock();
+    }
+    sv = super_version_->Ref();
+    db->mutex()->Unlock();
+
+    delete sv_to_delete;
+  }
+  assert(sv != nullptr);
+  return sv;
+}
+
+bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
+  assert(sv != nullptr);
+  // Put the SuperVersion back
+  void* expected = SuperVersion::kSVInUse;
+  if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
+    // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
+    // storage has not been altered and no Scrape has happened. The
+    // SuperVersion is still current.
+    return true;
+  } else {
+    // ThreadLocal scrape happened in the process of this GetImpl call (after
+    // thread local Swap() at the beginning and before CompareAndSwap()).
+    // This means the SuperVersion it holds is obsolete.
+    assert(expected == SuperVersion::kSVObsolete);
+  }
+  return false;
+}
+
+void ColumnFamilyData::InstallSuperVersion(SuperVersionContext* sv_context,
+                                           InstrumentedMutex* db_mutex) {
+  db_mutex->AssertHeld();
+  return InstallSuperVersion(sv_context, mutable_cf_options_);
+}
+
+void ColumnFamilyData::InstallSuperVersion(
+    SuperVersionContext* sv_context,
+    const MutableCFOptions& mutable_cf_options) {
+  SuperVersion* new_superversion = sv_context->new_superversion.release();
+  new_superversion->mutable_cf_options = mutable_cf_options;
+  new_superversion->Init(this, mem_, imm_.current(), current_);
+  SuperVersion* old_superversion = super_version_;
+  super_version_ = new_superversion;
+  ++super_version_number_;
+  super_version_->version_number = super_version_number_;
+  if (old_superversion == nullptr || old_superversion->current != current() ||
+      old_superversion->mem != mem_ ||
+      old_superversion->imm != imm_.current()) {
+    // Should not recalculate slow down condition if nothing has changed, since
+    // currently RecalculateWriteStallConditions() treats it as further slowing
+    // down is needed.
+    super_version_->write_stall_condition =
+        RecalculateWriteStallConditions(mutable_cf_options);
+  } else {
+    super_version_->write_stall_condition =
+        old_superversion->write_stall_condition;
+  }
+  if (old_superversion != nullptr) {
+    // Reset SuperVersions cached in thread local storage.
+    // This should be done before old_superversion->Unref(). That's to ensure
+    // that local_sv_ never holds the last reference to SuperVersion, since
+    // it has no means to safely do SuperVersion cleanup.
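+    // The scrape below swaps kSVObsolete into every thread-local slot and
+    // drops the reference each cached entry held, so a concurrent reader can
+    // at worst observe the obsolete marker, never a freed SuperVersion.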
+    ResetThreadLocalSuperVersions();
+
+    if (old_superversion->mutable_cf_options.write_buffer_size !=
+        mutable_cf_options.write_buffer_size) {
+      mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
+    }
+    if (old_superversion->write_stall_condition !=
+        new_superversion->write_stall_condition) {
+      sv_context->PushWriteStallNotification(
+          old_superversion->write_stall_condition,
+          new_superversion->write_stall_condition, GetName(), ioptions());
+    }
+    if (old_superversion->Unref()) {
+      old_superversion->Cleanup();
+      sv_context->superversions_to_free.push_back(old_superversion);
+    }
+  }
+}
+
+void ColumnFamilyData::ResetThreadLocalSuperVersions() {
+  autovector<void*> sv_ptrs;
+  local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
+  for (auto ptr : sv_ptrs) {
+    assert(ptr);
+    if (ptr == SuperVersion::kSVInUse) {
+      continue;
+    }
+    auto sv = static_cast<SuperVersion*>(ptr);
+    bool was_last_ref __attribute__((__unused__));
+    was_last_ref = sv->Unref();
+    // sv couldn't have been the last reference because
+    // ResetThreadLocalSuperVersions() is called before
+    // unref'ing super_version_.
+    assert(!was_last_ref);
+  }
+}
+
+Status ColumnFamilyData::ValidateOptions(
+    const DBOptions& db_options, const ColumnFamilyOptions& cf_options) {
+  Status s;
+  s = CheckCompressionSupported(cf_options);
+  if (s.ok() && db_options.allow_concurrent_memtable_write) {
+    s = CheckConcurrentWritesSupported(cf_options);
+  }
+  if (s.ok() && db_options.unordered_write &&
+      cf_options.max_successive_merges != 0) {
+    s = Status::InvalidArgument(
+        "max_successive_merges > 0 is incompatible with unordered_write");
+  }
+  if (s.ok()) {
+    s = CheckCFPathsSupported(db_options, cf_options);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) {
+    if (!cf_options.table_factory->IsInstanceOf(
+            TableFactory::kBlockBasedTableName())) {
+      return Status::NotSupported(
+          "TTL is only supported in Block-Based Table format. ");
+    }
+  }
+
+  if (cf_options.periodic_compaction_seconds > 0 &&
+      cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) {
+    if (!cf_options.table_factory->IsInstanceOf(
+            TableFactory::kBlockBasedTableName())) {
+      return Status::NotSupported(
+          "Periodic Compaction is only supported in "
"); + } + } + + if (cf_options.enable_blob_garbage_collection) { + if (cf_options.blob_garbage_collection_age_cutoff < 0.0 || + cf_options.blob_garbage_collection_age_cutoff > 1.0) { + return Status::InvalidArgument( + "The age cutoff for blob garbage collection should be in the range " + "[0.0, 1.0]."); + } + if (cf_options.blob_garbage_collection_force_threshold < 0.0 || + cf_options.blob_garbage_collection_force_threshold > 1.0) { + return Status::InvalidArgument( + "The garbage ratio threshold for forcing blob garbage collection " + "should be in the range [0.0, 1.0]."); + } + } + + if (cf_options.compaction_style == kCompactionStyleFIFO && + db_options.max_open_files != -1 && cf_options.ttl > 0) { + return Status::NotSupported( + "FIFO compaction only supported with max_open_files = -1."); + } + + std::vector supported{0, 1, 2, 4, 8}; + if (std::find(supported.begin(), supported.end(), + cf_options.memtable_protection_bytes_per_key) == + supported.end()) { + return Status::NotSupported( + "Memtable per key-value checksum protection only supports 0, 1, 2, 4 " + "or 8 bytes per key."); + } + if (std::find(supported.begin(), supported.end(), + cf_options.block_protection_bytes_per_key) == supported.end()) { + return Status::NotSupported( + "Block per key-value checksum protection only supports 0, 1, 2, 4 " + "or 8 bytes per key."); + } + + if (!cf_options.compaction_options_fifo.file_temperature_age_thresholds + .empty()) { + if (cf_options.compaction_style != kCompactionStyleFIFO) { + return Status::NotSupported( + "Option file_temperature_age_thresholds only supports FIFO " + "compaction."); + } else if (cf_options.num_levels > 1) { + return Status::NotSupported( + "Option file_temperature_age_thresholds is only supported when " + "num_levels = 1."); + } else { + const auto& ages = + cf_options.compaction_options_fifo.file_temperature_age_thresholds; + assert(ages.size() >= 1); + // check that age is sorted + for (size_t i = 0; i < ages.size() - 1; ++i) { + if (ages[i].age >= ages[i + 1].age) { + return Status::NotSupported( + "Option file_temperature_age_thresholds requires elements to be " + "sorted in increasing order with respect to `age` field."); + } + } + } + } + return s; +} + +Status ColumnFamilyData::SetOptions( + const DBOptions& db_opts, + const std::unordered_map& options_map) { + ColumnFamilyOptions cf_opts = + BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_); + ConfigOptions config_opts; + config_opts.mutable_options_only = true; + Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map, + &cf_opts); + if (s.ok()) { + s = ValidateOptions(db_opts, cf_opts); + } + if (s.ok()) { + mutable_cf_options_ = MutableCFOptions(cf_opts); + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + } + return s; +} + +// REQUIRES: DB mutex held +Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) { + if (initial_cf_options_.compaction_style != kCompactionStyleLevel) { + return Env::WLTH_NOT_SET; + } + if (level == 0) { + return Env::WLTH_MEDIUM; + } + int base_level = current_->storage_info()->base_level(); + + // L1: medium, L2: long, ... + if (level - base_level >= 2) { + return Env::WLTH_EXTREME; + } else if (level < base_level) { + // There is no restriction which prevents level passed in to be smaller + // than base_level. 
+    return Env::WLTH_MEDIUM;
+  }
+  return static_cast<Env::WriteLifeTimeHint>(
+      level - base_level + static_cast<int>(Env::WLTH_MEDIUM));
+}
+
+Status ColumnFamilyData::AddDirectories(
+    std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs) {
+  Status s;
+  assert(created_dirs != nullptr);
+  assert(data_dirs_.empty());
+  for (auto& p : ioptions_.cf_paths) {
+    auto existing_dir = created_dirs->find(p.path);
+
+    if (existing_dir == created_dirs->end()) {
+      std::unique_ptr<FSDirectory> path_directory;
+      s = DBImpl::CreateAndNewDirectory(ioptions_.fs.get(), p.path,
+                                        &path_directory);
+      if (!s.ok()) {
+        return s;
+      }
+      assert(path_directory != nullptr);
+      data_dirs_.emplace_back(path_directory.release());
+      (*created_dirs)[p.path] = data_dirs_.back();
+    } else {
+      data_dirs_.emplace_back(existing_dir->second);
+    }
+  }
+  assert(data_dirs_.size() == ioptions_.cf_paths.size());
+  return s;
+}
+
+FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const {
+  if (data_dirs_.empty()) {
+    return nullptr;
+  }
+
+  assert(path_id < data_dirs_.size());
+  return data_dirs_[path_id].get();
+}
+
+void ColumnFamilyData::RecoverEpochNumbers() {
+  assert(current_);
+  auto* vstorage = current_->storage_info();
+  assert(vstorage);
+  vstorage->RecoverEpochNumbers(this);
+}
+
+ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
+                                 const ImmutableDBOptions* db_options,
+                                 const FileOptions& file_options,
+                                 Cache* table_cache,
+                                 WriteBufferManager* _write_buffer_manager,
+                                 WriteController* _write_controller,
+                                 BlockCacheTracer* const block_cache_tracer,
+                                 const std::shared_ptr<IOTracer>& io_tracer,
+                                 const std::string& db_id,
+                                 const std::string& db_session_id)
+    : max_column_family_(0),
+      file_options_(file_options),
+      dummy_cfd_(new ColumnFamilyData(
+          ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr,
+          nullptr, ColumnFamilyOptions(), *db_options, &file_options_, nullptr,
+          block_cache_tracer, io_tracer, db_id, db_session_id)),
+      default_cfd_cache_(nullptr),
+      db_name_(dbname),
+      db_options_(db_options),
+      table_cache_(table_cache),
+      write_buffer_manager_(_write_buffer_manager),
+      write_controller_(_write_controller),
+      block_cache_tracer_(block_cache_tracer),
+      io_tracer_(io_tracer),
+      db_id_(db_id),
+      db_session_id_(db_session_id) {
+  // initialize linked list
+  dummy_cfd_->prev_ = dummy_cfd_;
+  dummy_cfd_->next_ = dummy_cfd_;
+}
+
+ColumnFamilySet::~ColumnFamilySet() {
+  while (column_family_data_.size() > 0) {
+    // cfd destructor will delete itself from column_family_data_
+    auto cfd = column_family_data_.begin()->second;
+    bool last_ref __attribute__((__unused__));
+    last_ref = cfd->UnrefAndTryDelete();
+    assert(last_ref);
+  }
+  bool dummy_last_ref __attribute__((__unused__));
+  dummy_last_ref = dummy_cfd_->UnrefAndTryDelete();
+  assert(dummy_last_ref);
+}
+
+ColumnFamilyData* ColumnFamilySet::GetDefault() const {
+  assert(default_cfd_cache_ != nullptr);
+  return default_cfd_cache_;
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
+  auto cfd_iter = column_family_data_.find(id);
+  if (cfd_iter != column_family_data_.end()) {
+    return cfd_iter->second;
+  } else {
+    return nullptr;
+  }
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(
+    const std::string& name) const {
+  auto cfd_iter = column_families_.find(name);
+  if (cfd_iter != column_families_.end()) {
+    auto cfd = GetColumnFamily(cfd_iter->second);
+    assert(cfd != nullptr);
+    return cfd;
+  } else {
+    return nullptr;
+  }
+}
+
+uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
+  return ++max_column_family_;
+}
+uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
+
+void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
+  max_column_family_ = std::max(new_max_column_family, max_column_family_);
+}
+
+size_t ColumnFamilySet::NumberOfColumnFamilies() const {
+  return column_families_.size();
+}
+
+// under a DB mutex AND write thread
+ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
+    const std::string& name, uint32_t id, Version* dummy_versions,
+    const ColumnFamilyOptions& options) {
+  assert(column_families_.find(name) == column_families_.end());
+  ColumnFamilyData* new_cfd = new ColumnFamilyData(
+      id, name, dummy_versions, table_cache_, write_buffer_manager_, options,
+      *db_options_, &file_options_, this, block_cache_tracer_, io_tracer_,
+      db_id_, db_session_id_);
+  column_families_.insert({name, id});
+  column_family_data_.insert({id, new_cfd});
+  max_column_family_ = std::max(max_column_family_, id);
+  // add to linked list
+  new_cfd->next_ = dummy_cfd_;
+  auto prev = dummy_cfd_->prev_;
+  new_cfd->prev_ = prev;
+  prev->next_ = new_cfd;
+  dummy_cfd_->prev_ = new_cfd;
+  if (id == 0) {
+    default_cfd_cache_ = new_cfd;
+  }
+  return new_cfd;
+}
+
+// under a DB mutex AND from a write thread
+void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
+  auto cfd_iter = column_family_data_.find(cfd->GetID());
+  assert(cfd_iter != column_family_data_.end());
+  column_family_data_.erase(cfd_iter);
+  column_families_.erase(cfd->GetName());
+}
+
+// under a DB mutex OR from a write thread
+bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
+  if (column_family_id == 0) {
+    // optimization for common case
+    current_ = column_family_set_->GetDefault();
+  } else {
+    current_ = column_family_set_->GetColumnFamily(column_family_id);
+  }
+  handle_.SetCFD(current_);
+  return current_ != nullptr;
+}
+
+uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
+  assert(current_ != nullptr);
+  return current_->GetLogNumber();
+}
+
+MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
+  assert(current_ != nullptr);
+  return current_->mem();
+}
+
+ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
+  assert(current_ != nullptr);
+  return &handle_;
+}
+
+uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
+  uint32_t column_family_id = 0;
+  if (column_family != nullptr) {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    column_family_id = cfh->GetID();
+  }
+  return column_family_id;
+}
+
+const Comparator* GetColumnFamilyUserComparator(
+    ColumnFamilyHandle* column_family) {
+  if (column_family != nullptr) {
+    return column_family->GetComparator();
+  }
+  return nullptr;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/column_family.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/column_family.h
new file mode 100644
index 0000000000..9ec093010d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/column_family.h
@@ -0,0 +1,851 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "cache/cache_reservation_manager.h"
+#include "db/memtable_list.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/write_batch_internal.h"
+#include "db/write_controller.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/hash_containers.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Version;
+class VersionSet;
+class VersionStorageInfo;
+class MemTable;
+class MemTableListVersion;
+class CompactionPicker;
+class Compaction;
+class InternalKey;
+class InternalStats;
+class ColumnFamilyData;
+class DBImpl;
+class LogBuffer;
+class InstrumentedMutex;
+class InstrumentedMutexLock;
+struct SuperVersionContext;
+class BlobFileCache;
+class BlobSource;
+
+extern const double kIncSlowdownRatio;
+// This file contains a list of data structures for managing column family
+// level metadata.
+//
+// The basic relationships among classes declared here are illustrated as
+// following:
+//
+// +----------------------+ +----------------------+ +--------+
+// +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl |
+// | +----------------------+ | +----------------------+ +----+---+
+// | +--------------------------+ |
+// | | +-----------------------------+
+// | | |
+// | | +-----------------------------v-------------------------------+
+// | | | |
+// | | | ColumnFamilySet |
+// | | | |
+// | | +-------------+--------------------------+----------------+---+
+// | | | | |
+// | +-------------------------------------+ | |
+// | | | | v
+// | +-------------v-------------+ +-----v----v---------+
+// | | | | |
+// | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ......
+// | | | | |
+// +---> | | |
+// | +---------+ | |
+// | | MemTable| | |
+// | | List | | |
+// +--------+---+--+-+----+----+ +--------------------++
+// | | | |
+// | | | |
+// | | | +-----------------------+
+// | | +-----------+ |
+// v +--------+ | |
+// +--------+--------+ | | |
+// | | | | +----------v----------+
+// +---> |SuperVersion 1.a +-----------------> |
+// | +------+ | | MemTableListVersion |
+// +---+-------------+ | | | | |
+// | | | | +----+------------+---+
+// | current | | | | |
+// | +-------------+ | |mem | |
+// | | | | | |
+// +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+
+// | | | | | | | |
+// | Version 1.a | | memtable | | memtable | | memtable |
+// | | | 1.a | | 1.b | | 1.c |
+// +-------------+ | | | | | |
+// +----------+ +----------+ +----------+
+//
+// DBImpl keeps a ColumnFamilySet, which references all column families by
+// pointing to the respective ColumnFamilyData object of each column family.
+// This is how DBImpl can list and operate on all the column families.
+// ColumnFamilyHandle also points to ColumnFamilyData directly, so that
+// when a user executes a query, it can directly find memtables and Version
+// as well as the SuperVersion for the column family, without going through
+// ColumnFamilySet.
+//
+// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
+// and SST files) indirectly, while ongoing operations may hold references
+// to a current or an out-of-date SuperVersion, which in turn points to a
+// point-in-time view of the LSM-tree. This guarantees the memtables and SST
+// files being operated on will not go away, until the SuperVersion is
+// unreferenced to 0 and destroyed.
+//
+// The following graph illustrates a possible referencing relationship:
+//
+// Column +--------------+ current +-----------+
+// Family +---->+ +------------------->+ |
+// Data | SuperVersion +----------+ | Version A |
+// | 3 | imm | | |
+// Iter2 +----->+ | +-------v------+ +-----------+
+// +-----+--------+ | MemtableList +----------------> Empty
+// | | Version r | +-----------+
+// | +--------------+ | |
+// +------------------+ current| Version B |
+// +--------------+ | +----->+ |
+// | | | | +-----+-----+
+// Compaction +>+ SuperVersion +-------------+ ^
+// Job | 2 +------+ | |current
+// | +----+ | | mem | +------------+
+// +--------------+ | | +---------------------> |
+// | +------------------------> MemTable a |
+// | mem | | |
+// +--------------+ | | +------------+
+// | +--------------------------+
+// Iter1 +-----> SuperVersion | | +------------+
+// | 1 +------------------------------>+ |
+// | +-+ | mem | MemTable b |
+// +--------------+ | | | |
+// | | +--------------+ +-----^------+
+// | |imm | MemtableList | |
+// | +--->+ Version s +------------+
+// | +--------------+
+// | +--------------+
+// | | MemtableList |
+// +------>+ Version t +--------> Empty
+// imm +--------------+
+//
+// In this example, even if the current LSM-tree consists of Version A and
+// memtable a, which is also referenced by SuperVersion, two older
+// SuperVersions, SuperVersion2 and SuperVersion1, still exist, and are
+// referenced by a compaction job and an old iterator Iter1, respectively.
+// SuperVersion2 contains Version B, memtable a and memtable b; SuperVersion1
+// contains Version B and memtable b (mutable). As a result, Version B and
+// memtable b are prevented from being destroyed or deleted.
+
+// ColumnFamilyHandleImpl is the class that clients use to access different
+// column families. It has a non-trivial destructor, which gets called when
+// the client is done using the column family
+class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
+ public:
+  // create while holding the mutex
+  ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db,
+                         InstrumentedMutex* mutex);
+  // destroy without mutex
+  virtual ~ColumnFamilyHandleImpl();
+  virtual ColumnFamilyData* cfd() const { return cfd_; }
+
+  virtual uint32_t GetID() const override;
+  virtual const std::string& GetName() const override;
+  virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
+  virtual const Comparator* GetComparator() const override;
+
+ private:
+  ColumnFamilyData* cfd_;
+  DBImpl* db_;
+  InstrumentedMutex* mutex_;
+};
+
+// Does not ref-count ColumnFamilyData
+// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
+// calls DBImpl methods. When this happens, MemTableInserter needs access to
+// ColumnFamilyHandle (same as the client would need). In that case, we feed
+// MemTableInserter a dummy ColumnFamilyHandle and enable it to call DBImpl
+// methods
+class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
+ public:
+  ColumnFamilyHandleInternal()
+      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
+        internal_cfd_(nullptr) {}
+
+  void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
+  virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
+
+ private:
+  ColumnFamilyData* internal_cfd_;
+};
+
+// holds references to memtable, all immutable memtables and version
+struct SuperVersion {
+  // Accessing members of this class is not thread-safe and requires external
+  // synchronization (i.e. db mutex held or on write thread).
+  ColumnFamilyData* cfd;
+  MemTable* mem;
+  MemTableListVersion* imm;
+  Version* current;
+  MutableCFOptions mutable_cf_options;
+  // Version number of the current SuperVersion
+  uint64_t version_number;
+  WriteStallCondition write_stall_condition;
+
+  // should be called outside the mutex
+  SuperVersion() = default;
+  ~SuperVersion();
+  SuperVersion* Ref();
+  // If Unref() returns true, Cleanup() should be called with mutex held
+  // before deleting this SuperVersion.
+  bool Unref();
+
+  // call these two methods with db mutex held
+  // Cleanup unrefs mem, imm and current. Also, it stores all memtables
+  // that need to be deleted in the to_delete vector. Unrefing those
+  // objects needs to be done in the mutex
+  void Cleanup();
+  void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+            MemTableListVersion* new_imm, Version* new_current);
+
+  // The value of dummy is not actually used. kSVInUse takes its address as a
+  // mark in the thread local storage to indicate the SuperVersion is in use
+  // by thread. This way, the value of kSVInUse is guaranteed to have no
+  // conflict with SuperVersion object addresses and to be portable across
+  // platforms.
+  static int dummy;
+  static void* const kSVInUse;
+  static void* const kSVObsolete;
+
+ private:
+  std::atomic<uint32_t> refs;
+  // We need to_delete because during Cleanup(), imm->Unref() returns
+  // all memtables that we need to free through this vector. We then
+  // delete all those memtables outside of mutex, during destruction
+  autovector<MemTable*> to_delete;
+};
+
+extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
+
+extern Status CheckConcurrentWritesSupported(
+    const ColumnFamilyOptions& cf_options);
+
+extern Status CheckCFPathsSupported(const DBOptions& db_options,
+                                    const ColumnFamilyOptions& cf_options);
+
+extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+                                           const ColumnFamilyOptions& src);
+// Wrap user defined table properties collector factories from `cf_options`
+// into internal ones in int_tbl_prop_collector_factories. Add a system
+// internal one too.
+extern void GetIntTblPropCollectorFactory(
+    const ImmutableCFOptions& ioptions,
+    IntTblPropCollectorFactories* int_tbl_prop_collector_factories);
+
+class ColumnFamilySet;
+
+// This class keeps all the data that a column family needs.
+// Most methods require DB mutex held, unless otherwise noted
+class ColumnFamilyData {
+ public:
+  ~ColumnFamilyData();
+
+  // thread-safe
+  uint32_t GetID() const { return id_; }
+  // thread-safe
+  const std::string& GetName() const { return name_; }
+
+  // Ref() can only be called from a context where the caller can guarantee
+  // that ColumnFamilyData is alive (while holding a non-zero ref already,
+  // holding a DB mutex, or as the leader in a write batch group).
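+  // For example, a background job that must drop the DB mutex while using
+  // this ColumnFamilyData typically calls Ref() under the mutex first and
+  // pairs it with UnrefAndTryDelete() once it is done.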
+  void Ref() { refs_.fetch_add(1); }
+
+  // UnrefAndTryDelete() decreases the reference count and frees the object
+  // if needed. It returns true if the object was freed, else false.
+  // UnrefAndTryDelete() can only be called while holding a DB mutex, or
+  // during single-threaded recovery.
+  bool UnrefAndTryDelete();
+
+  // SetDropped() can only be called under the following conditions:
+  // 1) Holding a DB mutex,
+  // 2) from single-threaded write thread, AND
+  // 3) from single-threaded VersionSet::LogAndApply()
+  // After dropping column family no other operation on that column family
+  // will be executed. All the files and memory will be, however, kept around
+  // until client drops the column family handle. That way, client can still
+  // access data from dropped column family.
+  // A column family can be dropped and still alive. In that state:
+  // *) Compaction and flush is not executed on the dropped column family.
+  // *) Client can continue reading from column family. Writes will fail unless
+  //    WriteOptions::ignore_missing_column_families is true
+  // When the dropped column family is unreferenced, then we:
+  // *) Remove column family from the linked list maintained by ColumnFamilySet
+  // *) delete all memory associated with that column family
+  // *) delete all the files associated with that column family
+  void SetDropped();
+  bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); }
+
+  // thread-safe
+  int NumberLevels() const { return ioptions_.num_levels; }
+
+  void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
+  uint64_t GetLogNumber() const { return log_number_; }
+
+  // thread-safe
+  const FileOptions* soptions() const;
+  const ImmutableOptions* ioptions() const { return &ioptions_; }
+  // REQUIRES: DB mutex held
+  // This returns the MutableCFOptions used by current SuperVersion
+  // You should use this API to reference MutableCFOptions most of the time.
+  const MutableCFOptions* GetCurrentMutableCFOptions() const {
+    return &(super_version_->mutable_cf_options);
+  }
+  // REQUIRES: DB mutex held
+  // This returns the latest MutableCFOptions, which may not be in effect yet.
+  const MutableCFOptions* GetLatestMutableCFOptions() const {
+    return &mutable_cf_options_;
+  }
+
+  // REQUIRES: DB mutex held
+  // Build ColumnFamilyOptions from the immutable options and the latest
+  // mutable options.
+  ColumnFamilyOptions GetLatestCFOptions() const;
+
+  bool is_delete_range_supported() { return is_delete_range_supported_; }
+
+  // Validate CF options against DB options
+  static Status ValidateOptions(const DBOptions& db_options,
+                                const ColumnFamilyOptions& cf_options);
+  // REQUIRES: DB mutex held
+  Status SetOptions(
+      const DBOptions& db_options,
+      const std::unordered_map<std::string, std::string>& options_map);
+
+  InternalStats* internal_stats() { return internal_stats_.get(); }
+
+  MemTableList* imm() { return &imm_; }
+  MemTable* mem() { return mem_; }
+
+  bool IsEmpty() {
+    return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0;
+  }
+
+  Version* current() { return current_; }
+  Version* dummy_versions() { return dummy_versions_; }
+  void SetCurrent(Version* _current);
+  uint64_t GetNumLiveVersions() const;    // REQUIRES: DB mutex held
+  uint64_t GetTotalSstFilesSize() const;  // REQUIRES: DB mutex held
+  uint64_t GetLiveSstFilesSize() const;   // REQUIRES: DB mutex held
+  uint64_t GetTotalBlobFileSize() const;  // REQUIRES: DB mutex held
+  void SetMemtable(MemTable* new_mem) {
+    uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
+    new_mem->SetID(memtable_id);
+    mem_ = new_mem;
+  }
+
+  // calculate the oldest log needed for the durability of this column family
+  uint64_t OldestLogToKeep();
+
+  // See Memtable constructor for explanation of earliest_seq param.
+  MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
+                                 SequenceNumber earliest_seq);
+  void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
+                         SequenceNumber earliest_seq);
+
+  TableCache* table_cache() const { return table_cache_.get(); }
+  BlobSource* blob_source() const { return blob_source_.get(); }
+
+  // See documentation in compaction_picker.h
+  // REQUIRES: DB mutex held
+  bool NeedsCompaction() const;
+  // REQUIRES: DB mutex held
+  Compaction* PickCompaction(const MutableCFOptions& mutable_options,
+                             const MutableDBOptions& mutable_db_options,
+                             LogBuffer* log_buffer);
+
+  // Check if the passed range overlaps with any running compactions.
+  // REQUIRES: DB mutex held
+  bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+                                  const Slice& largest_user_key,
+                                  int level) const;
+
+  // Check if the passed ranges overlap with any unflushed memtables
+  // (immutable or mutable).
+  //
+  // @param super_version A referenced SuperVersion that will be held for the
+  //    duration of this function.
+  //
+  // Thread-safe
+  Status RangesOverlapWithMemtables(const autovector<Range>& ranges,
+                                    SuperVersion* super_version,
+                                    bool allow_data_in_errors, bool* overlap);
+
+  // A flag telling a manual compaction to compact all levels together
+  // instead of a specific level.
+  static const int kCompactAllLevels;
+  // A flag telling a manual compaction that its output is the base level.
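+  // (Both sentinels are defined in column_family.cc as -1 and -2; they are
+  // passed to CompactRange() below in place of real level numbers.)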
+  static const int kCompactToBaseLevel;
+  // REQUIRES: DB mutex held
+  Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
+                           const MutableDBOptions& mutable_db_options,
+                           int input_level, int output_level,
+                           const CompactRangeOptions& compact_range_options,
+                           const InternalKey* begin, const InternalKey* end,
+                           InternalKey** compaction_end, bool* manual_conflict,
+                           uint64_t max_file_num_to_ignore,
+                           const std::string& trim_ts);
+
+  CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
+  // thread-safe
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+  // thread-safe
+  const InternalKeyComparator& internal_comparator() const {
+    return internal_comparator_;
+  }
+
+  const IntTblPropCollectorFactories* int_tbl_prop_collector_factories()
+      const {
+    return &int_tbl_prop_collector_factories_;
+  }
+
+  SuperVersion* GetSuperVersion() { return super_version_; }
+  // thread-safe
+  // Return an already referenced SuperVersion to be used safely.
+  SuperVersion* GetReferencedSuperVersion(DBImpl* db);
+  // thread-safe
+  // Get SuperVersion stored in thread local storage. If it does not exist,
+  // get a reference from a current SuperVersion.
+  SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
+  // Try to return SuperVersion back to thread local storage. Return true on
+  // success and false on failure. It fails when the thread local storage
+  // contains anything other than SuperVersion::kSVInUse flag.
+  bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
+  // thread-safe
+  uint64_t GetSuperVersionNumber() const {
+    return super_version_number_.load();
+  }
+  // Will return a pointer to the previous SuperVersion if its reference count
+  // is zero and it needs deletion, or nullptr if not. As an argument it takes
+  // a pointer to an allocated SuperVersion, so that clients can allocate the
+  // SuperVersion outside of the mutex.
+  // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
+  void InstallSuperVersion(SuperVersionContext* sv_context,
+                           const MutableCFOptions& mutable_cf_options);
+  void InstallSuperVersion(SuperVersionContext* sv_context,
+                           InstrumentedMutex* db_mutex);
+
+  void ResetThreadLocalSuperVersions();
+
+  // Protected by DB mutex
+  void set_queued_for_flush(bool value) { queued_for_flush_ = value; }
+  void set_queued_for_compaction(bool value) {
+    queued_for_compaction_ = value;
+  }
+  bool queued_for_flush() { return queued_for_flush_; }
+  bool queued_for_compaction() { return queued_for_compaction_; }
+
+  static std::pair<WriteStallCondition, WriteStallCause>
+  GetWriteStallConditionAndCause(
+      int num_unflushed_memtables, int num_l0_files,
+      uint64_t num_compaction_needed_bytes,
+      const MutableCFOptions& mutable_cf_options,
+      const ImmutableCFOptions& immutable_cf_options);
+
+  // Recalculate some stall conditions, which change only during compaction,
+  // when adding a new memtable, and/or when the compaction score is
+  // recalculated.
+  WriteStallCondition RecalculateWriteStallConditions(
+      const MutableCFOptions& mutable_cf_options);
+
+  void set_initialized() { initialized_.store(true); }
+
+  bool initialized() const { return initialized_.load(); }
+
+  const ColumnFamilyOptions& initial_cf_options() {
+    return initial_cf_options_;
+  }
+
+  Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
+
+  // created_dirs remembers the directories already created, so that we don't
+  // need to repeat the same directory creation operation again.
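+  // Sharing created_dirs across column families means a path that appears in
+  // several CFs' cf_paths is created once and its FSDirectory handle reused.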
+  Status AddDirectories(
+      std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs);
+
+  FSDirectory* GetDataDir(size_t path_id) const;
+
+  // full_history_ts_low_ can only increase.
+  void SetFullHistoryTsLow(std::string ts_low) {
+    assert(!ts_low.empty());
+    const Comparator* ucmp = user_comparator();
+    assert(ucmp);
+    if (full_history_ts_low_.empty() ||
+        ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) {
+      full_history_ts_low_ = std::move(ts_low);
+    }
+  }
+
+  const std::string& GetFullHistoryTsLow() const {
+    return full_history_ts_low_;
+  }
+
+  ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
+  WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
+  std::shared_ptr<CacheReservationManager>
+  GetFileMetadataCacheReservationManager() {
+    return file_metadata_cache_res_mgr_;
+  }
+
+  SequenceNumber GetFirstMemtableSequenceNumber() const;
+
+  static const uint32_t kDummyColumnFamilyDataId;
+
+  // Keep track of whether the mempurge feature was ever used.
+  void SetMempurgeUsed() { mempurge_used_ = true; }
+  bool GetMempurgeUsed() { return mempurge_used_; }
+
+  // Allocate and return a new epoch number
+  uint64_t NewEpochNumber() { return next_epoch_number_.fetch_add(1); }
+
+  // Get the next epoch number to be assigned
+  uint64_t GetNextEpochNumber() const { return next_epoch_number_.load(); }
+
+  // Set the next epoch number to be assigned
+  void SetNextEpochNumber(uint64_t next_epoch_number) {
+    next_epoch_number_.store(next_epoch_number);
+  }
+
+  // Reset the next epoch number to be assigned
+  void ResetNextEpochNumber() { next_epoch_number_.store(1); }
+
+  // Recover the next epoch number of this CF and epoch number
+  // of its files (if missing)
+  void RecoverEpochNumbers();
+
+ private:
+  friend class ColumnFamilySet;
+  ColumnFamilyData(uint32_t id, const std::string& name,
+                   Version* dummy_versions, Cache* table_cache,
+                   WriteBufferManager* write_buffer_manager,
+                   const ColumnFamilyOptions& options,
+                   const ImmutableDBOptions& db_options,
+                   const FileOptions* file_options,
+                   ColumnFamilySet* column_family_set,
+                   BlockCacheTracer* const block_cache_tracer,
+                   const std::shared_ptr<IOTracer>& io_tracer,
+                   const std::string& db_id, const std::string& db_session_id);
+
+  std::vector<std::string> GetDbPaths() const;
+
+  uint32_t id_;
+  const std::string name_;
+  Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
+  Version* current_;         // == dummy_versions->prev_
+
+  std::atomic<int> refs_;  // outstanding references to ColumnFamilyData
+  std::atomic<bool> initialized_;
+  std::atomic<bool> dropped_;  // true if client dropped it
+
+  const InternalKeyComparator internal_comparator_;
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories_;
+
+  const ColumnFamilyOptions initial_cf_options_;
+  const ImmutableOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+
+  const bool is_delete_range_supported_;
+
+  std::unique_ptr<TableCache> table_cache_;
+  std::unique_ptr<BlobFileCache> blob_file_cache_;
+  std::unique_ptr<BlobSource> blob_source_;
+
+  std::unique_ptr<InternalStats> internal_stats_;
+
+  WriteBufferManager* write_buffer_manager_;
+
+  MemTable* mem_;
+  MemTableList imm_;
+  SuperVersion* super_version_;
+
+  // An ordinal representing the current SuperVersion. Updated by
+  // InstallSuperVersion(), i.e. incremented every time super_version_
+  // changes.
+  std::atomic<uint64_t> super_version_number_;
+
+  // Thread's local copy of SuperVersion pointer
+  // This needs to be destructed before mutex_
+  std::unique_ptr<ThreadLocalPtr> local_sv_;
+
+  // pointers for a circular linked list. we use it to support iterations over
+  // all column families that are alive (note: dropped column families can
+  // also be alive as long as client holds a reference)
+  ColumnFamilyData* next_;
+  ColumnFamilyData* prev_;
+
+  // This is the earliest log file number that contains data from this
+  // Column Family. All earlier log files must be ignored and not
+  // recovered from
+  uint64_t log_number_;
+
+  // An object that keeps all the compaction stats
+  // and picks the next compaction
+  std::unique_ptr<CompactionPicker> compaction_picker_;
+
+  ColumnFamilySet* column_family_set_;
+
+  std::unique_ptr<WriteControllerToken> write_controller_token_;
+
+  // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
+  bool queued_for_flush_;
+
+  // If true --> this ColumnFamily is currently present in
+  // DBImpl::compaction_queue_
+  bool queued_for_compaction_;
+
+  uint64_t prev_compaction_needed_bytes_;
+
+  // if the database was opened with 2pc enabled
+  bool allow_2pc_;
+
+  // Memtable id to track flush.
+  std::atomic<uint64_t> last_memtable_id_;
+
+  // Directories corresponding to cf_paths.
+  std::vector<std::shared_ptr<FSDirectory>> data_dirs_;
+
+  bool db_paths_registered_;
+
+  std::string full_history_ts_low_;
+
+  // For charging memory usage of file metadata created for newly added files
+  // to a Version associated with this CFD
+  std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
+  bool mempurge_used_;
+
+  std::atomic<uint64_t> next_epoch_number_;
+};
+
+// ColumnFamilySet has interesting thread-safety requirements
+// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by
+//   DB mutex AND executed in the write thread.
+//   CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply()
+//   AND single-threaded write thread. It is also called during Recovery and in
+//   DumpManifest().
+//   RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to
+//   be held and it needs to be executed from the write thread. SetDropped()
+//   also guarantees that it will be called only from single-threaded
+//   LogAndApply(), but this condition is not that important.
+// * Iteration -- hold DB mutex. If you want to release the DB mutex in the
+//   body of the iteration, wrap in a RefedColumnFamilySet.
+// * GetDefault() -- thread safe
+// * GetColumnFamily() -- either inside of DB mutex or from a write thread
+// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
+//   NumberOfColumnFamilies -- inside of DB mutex
+class ColumnFamilySet {
+ public:
+  // ColumnFamilySet supports iteration
+  class iterator {
+   public:
+    explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {}
+    // NOTE: minimum operators for for-loop iteration
+    iterator& operator++() {
+      current_ = current_->next_;
+      return *this;
+    }
+    bool operator!=(const iterator& other) const {
+      return this->current_ != other.current_;
+    }
+    ColumnFamilyData* operator*() { return current_; }
+
+   private:
+    ColumnFamilyData* current_;
+  };
+
+  ColumnFamilySet(const std::string& dbname,
+                  const ImmutableDBOptions* db_options,
+                  const FileOptions& file_options, Cache* table_cache,
+                  WriteBufferManager* _write_buffer_manager,
+                  WriteController* _write_controller,
+                  BlockCacheTracer* const block_cache_tracer,
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  const std::string& db_id, const std::string& db_session_id);
+  ~ColumnFamilySet();
+
+  ColumnFamilyData* GetDefault() const;
+  // GetColumnFamily() calls return nullptr if column family is not found
+  ColumnFamilyData* GetColumnFamily(uint32_t id) const;
+  ColumnFamilyData* GetColumnFamily(const std::string& name) const;
+  // This call will return the next available column family ID. It guarantees
+  // that there is no column family with an id greater than or equal to the
+  // returned value in the current running instance or anytime in the RocksDB
+  // instance history.
+  uint32_t GetNextColumnFamilyID();
+  uint32_t GetMaxColumnFamily();
+  void UpdateMaxColumnFamily(uint32_t new_max_column_family);
+  size_t NumberOfColumnFamilies() const;
+
+  ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
+                                       Version* dummy_version,
+                                       const ColumnFamilyOptions& options);
+
+  iterator begin() { return iterator(dummy_cfd_->next_); }
+  iterator end() { return iterator(dummy_cfd_); }
+
+  Cache* get_table_cache() { return table_cache_; }
+
+  WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
+
+  WriteController* write_controller() { return write_controller_; }
+
+ private:
+  friend class ColumnFamilyData;
+  // helper function that gets called from cfd destructor
+  // REQUIRES: DB mutex held
+  void RemoveColumnFamily(ColumnFamilyData* cfd);
+
+  // column_families_ and column_family_data_ need to be protected:
+  // * when mutating both conditions have to be satisfied:
+  //   1. DB mutex locked
+  //   2. thread currently in single-threaded write thread
+  // * when reading, at least one condition needs to be satisfied:
+  //   1. DB mutex locked
+  //   2. accessed from a single-threaded write thread
+  UnorderedMap<std::string, uint32_t> column_families_;
+  UnorderedMap<uint32_t, ColumnFamilyData*> column_family_data_;
+
+  uint32_t max_column_family_;
+  const FileOptions file_options_;
+
+  ColumnFamilyData* dummy_cfd_;
+  // We don't hold the refcount here, since default column family always exists
+  // We are also not responsible for cleaning up default_cfd_cache_. This is
+  // just a cache that makes common case (accessing default column family)
+  // faster
+  ColumnFamilyData* default_cfd_cache_;
+
+  const std::string db_name_;
+  const ImmutableDBOptions* const db_options_;
+  Cache* table_cache_;
+  WriteBufferManager* write_buffer_manager_;
+  WriteController* write_controller_;
+  BlockCacheTracer* const block_cache_tracer_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  const std::string& db_id_;
+  std::string db_session_id_;
+};
+
+// A wrapper for ColumnFamilySet that supports releasing DB mutex during each
+// iteration over the iterator, because the cfd is Refed and Unrefed during
+// each iteration to prevent concurrent CF drop from destroying it (until
+// Unref).
+class RefedColumnFamilySet {
+ public:
+  explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {}
+
+  class iterator {
+   public:
+    explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) {
+      MaybeRef(*wrapped_);
+    }
+    ~iterator() { MaybeUnref(*wrapped_); }
+    inline void MaybeRef(ColumnFamilyData* cfd) {
+      if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
+        cfd->Ref();
+      }
+    }
+    inline void MaybeUnref(ColumnFamilyData* cfd) {
+      if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
+        cfd->UnrefAndTryDelete();
+      }
+    }
+    // NOTE: minimum operators for for-loop iteration
+    inline iterator& operator++() {
+      ColumnFamilyData* old = *wrapped_;
+      ++wrapped_;
+      // Can only unref & potentially free cfd after accessing its next_
+      MaybeUnref(old);
+      MaybeRef(*wrapped_);
+      return *this;
+    }
+    inline bool operator!=(const iterator& other) const {
+      return this->wrapped_ != other.wrapped_;
+    }
+    inline ColumnFamilyData* operator*() { return *wrapped_; }
+
+   private:
+    ColumnFamilySet::iterator wrapped_;
+  };
+
+  iterator begin() { return iterator(wrapped_->begin()); }
+  iterator end() { return iterator(wrapped_->end()); }
+
+ private:
+  ColumnFamilySet* wrapped_;
+};
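A minimal usage sketch of the iteration contract above, assuming DBImpl-style plumbing (the `InstrumentedMutex*` parameter and the helper name are illustrative, not part of the vendored sources): `RefedColumnFamilySet` pins the current `ColumnFamilyData`, so the DB mutex can be released inside the loop body without a concurrent column family drop destroying the cfd mid-iteration.

```cpp
#include "db/column_family.h"
#include "monitoring/instrumented_mutex.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical helper: visit every live column family while doing slow
// per-CF work without holding the DB mutex across the whole scan.
void ForEachLiveColumnFamily(ColumnFamilySet* cfset,
                             InstrumentedMutex* db_mutex) {
  db_mutex->AssertHeld();
  for (ColumnFamilyData* cfd : RefedColumnFamilySet(cfset)) {
    if (cfd->IsDropped()) {
      continue;  // dropped CFs can still be alive while references exist
    }
    db_mutex->Unlock();
    // ... slow per-CF work (stats collection, flush scheduling, ...) ...
    db_mutex->Lock();
  }
}

}  // namespace ROCKSDB_NAMESPACE
```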
+
+// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
+// memtables of different column families (specified by ID in the write batch)
+class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
+ public:
+  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
+      : column_family_set_(column_family_set), current_(nullptr) {}
+
+  // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
+  // with the arguments used to construct *orig.
+  explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig)
+      : column_family_set_(orig->column_family_set_), current_(nullptr) {}
+
+  // sets current_ to ColumnFamilyData with column_family_id
+  // returns false if column family doesn't exist
+  // REQUIRES: calls through DBImpl::column_family_memtables_ must be made
+  // under a DB mutex OR from a write thread
+  bool Seek(uint32_t column_family_id) override;
+
+  // Returns log number of the selected column family
+  // REQUIRES: under a DB mutex OR from a write thread
+  uint64_t GetLogNumber() const override;
+
+  // REQUIRES: Seek() called first
+  // REQUIRES: calls through DBImpl::column_family_memtables_ must be made
+  // under a DB mutex OR from a write thread
+  virtual MemTable* GetMemTable() const override;
+
+  // Returns column family handle for the selected column family
+  // REQUIRES: calls through DBImpl::column_family_memtables_ must be made
+  // under a DB mutex OR from a write thread
+  virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
+
+  // Cannot be called while another thread is calling Seek().
+  // REQUIRES: calls through DBImpl::column_family_memtables_ must be made
+  // under a DB mutex OR from a write thread
+  virtual ColumnFamilyData* current() override { return current_; }
+
+ private:
+  ColumnFamilySet* column_family_set_;
+  ColumnFamilyData* current_;
+  ColumnFamilyHandleInternal handle_;
+};
+
+extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
+
+extern const Comparator* GetColumnFamilyUserComparator(
+    ColumnFamilyHandle* column_family);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/clipping_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/clipping_iterator.h
new file mode 100644
index 0000000000..3f50cdd9dd
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/clipping_iterator.h
@@ -0,0 +1,281 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "rocksdb/comparator.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal iterator that wraps another one and ensures that any keys
+// returned are strictly within a range [start, end). If the underlying
+// iterator has already performed the bounds checking, it relies on that
+// result; otherwise, it performs the necessary key comparisons itself. Both
+// bounds are optional.
+class ClippingIterator : public InternalIterator {
+ public:
+  ClippingIterator(InternalIterator* iter, const Slice* start, const Slice* end,
+                   const CompareInterface* cmp)
+      : iter_(iter), start_(start), end_(end), cmp_(cmp), valid_(false) {
+    assert(iter_);
+    assert(cmp_);
+    assert(!start_ || !end_ || cmp_->Compare(*start_, *end_) <= 0);
+
+    UpdateAndEnforceBounds();
+  }
+
+  bool Valid() const override { return valid_; }
+
+  void SeekToFirst() override {
+    if (start_) {
+      iter_->Seek(*start_);
+    } else {
+      iter_->SeekToFirst();
+    }
+
+    UpdateAndEnforceUpperBound();
+  }
+
+  void SeekToLast() override {
+    if (end_) {
+      iter_->SeekForPrev(*end_);
+
+      // Upper bound is exclusive, so we need a key which is strictly smaller
+      if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) {
+        iter_->Prev();
+      }
+    } else {
+      iter_->SeekToLast();
+    }
+
+    UpdateAndEnforceLowerBound();
+  }
+
+  void Seek(const Slice& target) override {
+    if (start_ && cmp_->Compare(target, *start_) < 0) {
+      iter_->Seek(*start_);
+      UpdateAndEnforceUpperBound();
+      return;
+    }
+
+    if (end_ && cmp_->Compare(target, *end_) >= 0) {
+      valid_ = false;
+      return;
+    }
+
+    iter_->Seek(target);
+    UpdateAndEnforceUpperBound();
+  }
+
+  void SeekForPrev(const Slice& target) override {
+    if (start_ && cmp_->Compare(target, *start_) < 0) {
+      valid_ = false;
+      return;
+    }
+
+    if (end_ && cmp_->Compare(target, *end_) >= 0) {
+      iter_->SeekForPrev(*end_);
+
+      // Upper bound is exclusive, so we need a key which is strictly smaller
+      if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) {
+        iter_->Prev();
+      }
+
+      UpdateAndEnforceLowerBound();
+      return;
+    }
+
+    iter_->SeekForPrev(target);
+    UpdateAndEnforceLowerBound();
+  }
+
+  void Next() override {
+    assert(valid_);
+    iter_->Next();
+    UpdateAndEnforceUpperBound();
+  }
+
+  bool NextAndGetResult(IterateResult* result) override {
+    assert(valid_);
+    assert(result);
+
+    IterateResult res;
+    valid_ = iter_->NextAndGetResult(&res);
+
+    if (!valid_) {
+      return false;
+    }
+
+    if (end_) {
+      EnforceUpperBoundImpl(res.bound_check_result);
+
+      if (!valid_) {
+        return false;
+      }
+    }
+
+    res.bound_check_result = IterBoundCheck::kInbound;
+    *result = res;
+
+    return true;
+  }
+
+  void Prev() override {
+    assert(valid_);
+    iter_->Prev();
+    UpdateAndEnforceLowerBound();
+  }
+
+  Slice key() const override {
+    assert(valid_);
+    return iter_->key();
+  }
+
+  Slice user_key() const override {
+    assert(valid_);
+    return iter_->user_key();
+  }
+
+  Slice value() const override {
+    assert(valid_);
+    return iter_->value();
+  }
+
+  Status status() const override { return iter_->status(); }
+
+  bool PrepareValue() override {
+    assert(valid_);
+
+    if (iter_->PrepareValue()) {
+      return true;
+    }
+
+    assert(!iter_->Valid());
+    valid_ = false;
+    return false;
+  }
+
+  bool MayBeOutOfLowerBound() override {
+    assert(valid_);
+    return false;
+  }
+
+  IterBoundCheck UpperBoundCheckResult() override {
+    assert(valid_);
+    return IterBoundCheck::kInbound;
+  }
+
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    iter_->SetPinnedItersMgr(pinned_iters_mgr);
+  }
+
+  bool IsKeyPinned() const override {
+    assert(valid_);
+    return iter_->IsKeyPinned();
+  }
+
+  bool IsValuePinned() const override {
+    assert(valid_);
+    return iter_->IsValuePinned();
+  }
+
+  Status GetProperty(std::string prop_name, std::string* prop) override {
+    return iter_->GetProperty(prop_name, prop);
+  }
+
+  bool IsDeleteRangeSentinelKey() const override {
+    assert(valid_);
+    return iter_->IsDeleteRangeSentinelKey();
+  }
+
+ private:
+  void UpdateValid() {
+    assert(!iter_->Valid() || iter_->status().ok());
+
+    valid_ = iter_->Valid();
+  }
+
+  void EnforceUpperBoundImpl(IterBoundCheck bound_check_result) {
+    if (bound_check_result == IterBoundCheck::kInbound) {
+      return;
+    }
+
+    if (bound_check_result == IterBoundCheck::kOutOfBound) {
+      valid_ = false;
+      return;
+    }
+
+    assert(bound_check_result == IterBoundCheck::kUnknown);
+
+    if (cmp_->Compare(key(), *end_) >= 0) {
+      valid_ = false;
+    }
+  }
+
+  void EnforceUpperBound() {
+    if (!valid_) {
+      return;
+    }
+
+    if (!end_) {
+      return;
+    }
+
+    EnforceUpperBoundImpl(iter_->UpperBoundCheckResult());
+  }
+
+  void EnforceLowerBound() {
+    if (!valid_) {
+      return;
+    }
+
+    if (!start_) {
+      return;
+    }
+
+    if (!iter_->MayBeOutOfLowerBound()) {
+      return;
+    }
+
+    if (cmp_->Compare(key(), *start_) < 0) {
+      valid_ = false;
+    }
+  }
+
+  void AssertBounds() {
+    assert(!valid_ || !start_ || cmp_->Compare(key(), *start_) >= 0);
+    assert(!valid_ || !end_ || cmp_->Compare(key(), *end_) < 0);
+  }
+
+  void UpdateAndEnforceBounds() {
+    UpdateValid();
+    EnforceUpperBound();
+    EnforceLowerBound();
+    AssertBounds();
+  }
+
+  void UpdateAndEnforceUpperBound() {
+    UpdateValid();
+    EnforceUpperBound();
+    AssertBounds();
+  }
+
+  void UpdateAndEnforceLowerBound() {
+    UpdateValid();
+    EnforceLowerBound();
+    AssertBounds();
+  }
+
+  InternalIterator* iter_;
+  const Slice* start_;
+  const Slice* end_;
+  const CompareInterface* cmp_;
+  bool valid_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
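Because both bounds are optional and the upper bound is exclusive, the wrapper is convenient for scanning a sub-range of an existing internal iterator. A minimal sketch (illustrative, not part of the patch; the underlying iterator and comparator are assumed to come from the surrounding compaction code):

```cpp
#include "db/compaction/clipping_iterator.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical helper: iterate only the keys in ["b", "d") of `underlying`.
void ScanClippedRange(InternalIterator* underlying, const Comparator* cmp) {
  const Slice start("b");
  const Slice end("d");  // exclusive upper bound
  ClippingIterator clipped(underlying, &start, &end, cmp);
  for (clipped.SeekToFirst(); clipped.Valid(); clipped.Next()) {
    // Every key observed here satisfies start <= key < end; keys the
    // underlying iterator holds outside the bounds are never surfaced.
  }
}

}  // namespace ROCKSDB_NAMESPACE
```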
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction.cc
new file mode 100644
index 0000000000..1832f13031
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction.cc
@@ -0,0 +1,857 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/sst_partitioner.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kRangeTombstoneSentinel =
+    PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey& b) {
+  auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key());
+  if (c != 0) {
+    return c;
+  }
+  auto a_footer = ExtractInternalKeyFooter(a.Encode());
+  auto b_footer = ExtractInternalKeyFooter(b.Encode());
+  if (a_footer == kRangeTombstoneSentinel) {
+    if (b_footer != kRangeTombstoneSentinel) {
+      return -1;
+    }
+  } else if (b_footer == kRangeTombstoneSentinel) {
+    return 1;
+  }
+  return 0;
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+                      const InternalKey& b) {
+  if (a == nullptr) {
+    return -1;
+  }
+  return sstableKeyCompare(user_cmp, *a, b);
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey* b) {
+  if (b == nullptr) {
+    return -1;
+  }
+  return sstableKeyCompare(user_cmp, a, *b);
+}
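The sentinel logic above is easiest to see with concrete keys: with equal user keys, the range tombstone sentinel (`kMaxSequenceNumber`, `kTypeRangeDeletion`) compares as distinct and smallest, while ordinary entries compare equal regardless of sequence number. A test-style sketch (illustrative, not part of the patch):

```cpp
#include <cassert>

#include "db/compaction/compaction.h"
#include "db/dbformat.h"
#include "rocksdb/comparator.h"

namespace ROCKSDB_NAMESPACE {

void SentinelOrderingExample() {
  const Comparator* ucmp = BytewiseComparator();
  InternalKey regular("k", /*s=*/42, kTypeValue);
  InternalKey other("k", /*s=*/7, kTypeValue);
  InternalKey sentinel("k", kMaxSequenceNumber, kTypeRangeDeletion);
  // Ordinary entries with the same user key compare equal here, no matter
  // how their sequence numbers differ...
  assert(sstableKeyCompare(ucmp, regular, other) == 0);
  // ...but the sentinel sorts before (and is never equal to) them, so two
  // files separated only by a sentinel are not considered overlapping.
  assert(sstableKeyCompare(ucmp, sentinel, regular) < 0);
  assert(sstableKeyCompare(ucmp, regular, sentinel) > 0);
}

}  // namespace ROCKSDB_NAMESPACE
```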
+
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->fd.GetFileSize();
+  }
+  return sum;
+}
+
+void Compaction::SetInputVersion(Version* _input_version) {
+  input_version_ = _input_version;
+  cfd_ = input_version_->cfd();
+
+  cfd_->Ref();
+  input_version_->Ref();
+  edit_.SetColumnFamily(cfd_->GetID());
+}
+
+void Compaction::GetBoundaryKeys(
+    VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
+    Slice* largest_user_key, int exclude_level) {
+  bool initialized = false;
+  const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    if (inputs[i].files.empty() || inputs[i].level == exclude_level) {
+      continue;
+    }
+    if (inputs[i].level == 0) {
+      // we need to consider all files on level 0
+      for (const auto* f : inputs[i].files) {
+        const Slice& start_user_key = f->smallest.user_key();
+        if (!initialized ||
+            ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+          *smallest_user_key = start_user_key;
+        }
+        const Slice& end_user_key = f->largest.user_key();
+        if (!initialized ||
+            ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+          *largest_user_key = end_user_key;
+        }
+        initialized = true;
+      }
+    } else {
+      // we only need to consider the first and last file
+      const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
+      if (!initialized ||
+          ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+        *smallest_user_key = start_user_key;
+      }
+      const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
+      if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+        *largest_user_key = end_user_key;
+      }
+      initialized = true;
+    }
+  }
+}
+
+std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
+    VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
+  const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+  for (size_t i = 0; i < inputs.size(); i++) {
+    if (inputs[i].level == 0 || inputs[i].files.empty()) {
+      continue;
+    }
+    inputs[i].atomic_compaction_unit_boundaries.reserve(
+        inputs[i].files.size());
+    AtomicCompactionUnitBoundary cur_boundary;
+    size_t first_atomic_idx = 0;
+    auto add_unit_boundary = [&](size_t to) {
+      if (first_atomic_idx == to) return;
+      for (size_t k = first_atomic_idx; k < to; k++) {
+        inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary);
+      }
+      first_atomic_idx = to;
+    };
+    for (size_t j = 0; j < inputs[i].files.size(); j++) {
+      const auto* f = inputs[i].files[j];
+      if (j == 0) {
+        // First file in a level.
+        cur_boundary.smallest = &f->smallest;
+        cur_boundary.largest = &f->largest;
+      } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) ==
+                 0) {
+        // SSTs overlap but the end key of the previous file was not
+        // artificially extended by a range tombstone. Extend the current
+        // boundary.
+        cur_boundary.largest = &f->largest;
+      } else {
+        // Atomic compaction unit has ended.
+        add_unit_boundary(j);
+        cur_boundary.smallest = &f->smallest;
+        cur_boundary.largest = &f->largest;
+      }
+    }
+    add_unit_boundary(inputs[i].files.size());
+    assert(inputs[i].files.size() ==
+           inputs[i].atomic_compaction_unit_boundaries.size());
+  }
+  return inputs;
+}
+
+// helper function to determine if compaction is creating files at the
+// bottommost level
+bool Compaction::IsBottommostLevel(
+    int output_level, VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  int output_l0_idx;
+  if (output_level == 0) {
+    output_l0_idx = 0;
+    for (const auto* file : vstorage->LevelFiles(0)) {
+      if (inputs[0].files.back() == file) {
+        break;
+      }
+      ++output_l0_idx;
+    }
+    assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size());
+  } else {
+    output_l0_idx = -1;
+  }
+  Slice smallest_key, largest_key;
+  GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key);
+  return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key,
+                                                  output_level, output_l0_idx);
+}
+
+// test function to validate the functionality of IsBottommostLevel()
+// function -- determines if compaction with inputs and storage is bottommost
+bool Compaction::TEST_IsBottommostLevel(
+    int output_level, VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  return IsBottommostLevel(output_level, vstorage, inputs);
+}
+
+bool Compaction::IsFullCompaction(
+    VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  size_t num_files_in_compaction = 0;
+  size_t total_num_files = 0;
+  for (int l = 0; l < vstorage->num_levels(); l++) {
+    total_num_files += vstorage->NumLevelFiles(l);
+  }
+  for (size_t i = 0; i < inputs.size(); i++) {
+    num_files_in_compaction += inputs[i].size();
+  }
+  return num_files_in_compaction == total_num_files;
+}
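The grouping rule in PopulateWithAtomicBoundaries() can be restated over a toy model: consecutive files stay in one atomic unit while the previous file's largest key compares equal (under sstableKeyCompare) to the next file's smallest key. A self-contained sketch of just that chaining rule, with plain integers standing in for internal keys (illustrative, not part of the patch):

```cpp
#include <cstddef>
#include <vector>

struct ToyFile {
  int smallest;
  int largest;
};

// For each file, return the index of the first file in its atomic unit.
// Mirrors the add_unit_boundary() chaining above in simplified form.
std::vector<size_t> GroupAtomicUnits(const std::vector<ToyFile>& files) {
  std::vector<size_t> unit_start(files.size());
  size_t first = 0;
  for (size_t j = 0; j < files.size(); ++j) {
    if (j > 0 && files[j - 1].largest != files[j].smallest) {
      first = j;  // boundary keys differ: a new atomic unit starts here
    }
    unit_start[j] = first;
  }
  return unit_start;
}
```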
+
+Compaction::Compaction(
+    VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options,
+    const MutableCFOptions& _mutable_cf_options,
+    const MutableDBOptions& _mutable_db_options,
+    std::vector<CompactionInputFiles> _inputs, int _output_level,
+    uint64_t _target_file_size, uint64_t _max_compaction_bytes,
+    uint32_t _output_path_id, CompressionType _compression,
+    CompressionOptions _compression_opts, Temperature _output_temperature,
+    uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents,
+    bool _manual_compaction, const std::string& _trim_ts, double _score,
+    bool _deletion_compaction, bool l0_files_might_overlap,
+    CompactionReason _compaction_reason,
+    BlobGarbageCollectionPolicy _blob_garbage_collection_policy,
+    double _blob_garbage_collection_age_cutoff)
+    : input_vstorage_(vstorage),
+      start_level_(_inputs[0].level),
+      output_level_(_output_level),
+      target_output_file_size_(_target_file_size),
+      max_compaction_bytes_(_max_compaction_bytes),
+      max_subcompactions_(_max_subcompactions),
+      immutable_options_(_immutable_options),
+      mutable_cf_options_(_mutable_cf_options),
+      input_version_(nullptr),
+      number_levels_(vstorage->num_levels()),
+      cfd_(nullptr),
+      output_path_id_(_output_path_id),
+      output_compression_(_compression),
+      output_compression_opts_(_compression_opts),
+      output_temperature_(_output_temperature),
+      deletion_compaction_(_deletion_compaction),
+      l0_files_might_overlap_(l0_files_might_overlap),
+      inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
+      grandparents_(std::move(_grandparents)),
+      score_(_score),
+      bottommost_level_(
+          // For simplicity, we don't support the concept of "bottommost level"
+          // with `CompactionReason::kExternalSstIngestion` and
+          // `CompactionReason::kRefitLevel`
+          (_compaction_reason == CompactionReason::kExternalSstIngestion ||
+           _compaction_reason == CompactionReason::kRefitLevel)
+              ? false
+              : IsBottommostLevel(output_level_, vstorage, inputs_)),
+      is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
+      is_manual_compaction_(_manual_compaction),
+      trim_ts_(_trim_ts),
+      is_trivial_move_(false),
+      compaction_reason_(_compaction_reason),
+      notify_on_compaction_completion_(false),
+      enable_blob_garbage_collection_(
+          _blob_garbage_collection_policy == BlobGarbageCollectionPolicy::kForce
+              ? true
+              : (_blob_garbage_collection_policy ==
+                         BlobGarbageCollectionPolicy::kDisable
+                     ? false
+                     : mutable_cf_options()->enable_blob_garbage_collection)),
+      blob_garbage_collection_age_cutoff_(
+          _blob_garbage_collection_age_cutoff < 0 ||
+                  _blob_garbage_collection_age_cutoff > 1
+              ? mutable_cf_options()->blob_garbage_collection_age_cutoff
+              : _blob_garbage_collection_age_cutoff),
+      penultimate_level_(
+          // For simplicity, we don't support the concept of "penultimate
+          // level" with `CompactionReason::kExternalSstIngestion` and
+          // `CompactionReason::kRefitLevel`
+          _compaction_reason == CompactionReason::kExternalSstIngestion ||
+                  _compaction_reason == CompactionReason::kRefitLevel
+              ? Compaction::kInvalidLevel
+              : EvaluatePenultimateLevel(vstorage, immutable_options_,
+                                         start_level_, output_level_)) {
+  MarkFilesBeingCompacted(true);
+  if (is_manual_compaction_) {
+    compaction_reason_ = CompactionReason::kManualCompaction;
+  }
+  if (max_subcompactions_ == 0) {
+    max_subcompactions_ = _mutable_db_options.max_subcompactions;
+  }
+
+  // For non-bottommost levels, compaction tries to build output files close
+  // to the target file size, but this is not guaranteed: a file can reach up
+  // to 2x the target size.
+  max_output_file_size_ =
+      bottommost_level_ || grandparents_.empty() ||
+              !_immutable_options.level_compaction_dynamic_file_size
+          ? target_output_file_size_
+          : 2 * target_output_file_size_;
+
+#ifndef NDEBUG
+  for (size_t i = 1; i < inputs_.size(); ++i) {
+    assert(inputs_[i].level > inputs_[i - 1].level);
+  }
+#endif
+
+  // setup input_levels_
+  {
+    input_levels_.resize(num_input_levels());
+    for (size_t which = 0; which < num_input_levels(); which++) {
+      DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
+                                &arena_);
+    }
+  }
+
+  GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_);
+
+  // Every compaction regardless of any compaction reason may respect the
+  // existing compact cursor in the output level to split output files
+  output_split_key_ = nullptr;
+  if (immutable_options_.compaction_style == kCompactionStyleLevel &&
+      immutable_options_.compaction_pri == kRoundRobin) {
+    const InternalKey* cursor =
+        &input_vstorage_->GetCompactCursors()[output_level_];
+    if (cursor->size() != 0) {
+      const Slice& cursor_user_key = ExtractUserKey(cursor->Encode());
+      auto ucmp = vstorage->InternalComparator()->user_comparator();
+      // May split output files according to the cursor if it is in the
+      // user-key range
+      if (ucmp->CompareWithoutTimestamp(cursor_user_key, smallest_user_key_) >
+              0 &&
+          ucmp->CompareWithoutTimestamp(cursor_user_key, largest_user_key_) <=
+              0) {
+        output_split_key_ = cursor;
+      }
+    }
+  }
+
+  PopulatePenultimateLevelOutputRange();
+}
+
+void Compaction::PopulatePenultimateLevelOutputRange() {
+  if (!SupportsPerKeyPlacement()) {
+    return;
+  }
+
+  // exclude the last level, the range of all input levels is the safe range
+  // of keys that can be moved up.
+  int exclude_level = number_levels_ - 1;
+  penultimate_output_range_type_ = PenultimateOutputRangeType::kNonLastRange;
+
+  // For universal compaction, the penultimate_output_range could be extended
+  // if all penultimate level files are included in the compaction (which
+  // includes the case that the penultimate level is empty).
+  if (immutable_options_.compaction_style == kCompactionStyleUniversal) {
+    exclude_level = kInvalidLevel;
+    penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange;
+    std::set<uint64_t> penultimate_inputs;
+    for (const auto& input_lvl : inputs_) {
+      if (input_lvl.level == penultimate_level_) {
+        for (const auto& file : input_lvl.files) {
+          penultimate_inputs.emplace(file->fd.GetNumber());
+        }
+      }
+    }
+    auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
+    for (const auto& file : penultimate_files) {
+      if (penultimate_inputs.find(file->fd.GetNumber()) ==
+          penultimate_inputs.end()) {
+        exclude_level = number_levels_ - 1;
+        penultimate_output_range_type_ =
+            PenultimateOutputRangeType::kNonLastRange;
+        break;
+      }
+    }
+  }
+
+  GetBoundaryKeys(input_vstorage_, inputs_,
+                  &penultimate_level_smallest_user_key_,
+                  &penultimate_level_largest_user_key_, exclude_level);
+}
+
+Compaction::~Compaction() {
+  if (input_version_ != nullptr) {
+    input_version_->Unref();
+  }
+  if (cfd_ != nullptr) {
+    cfd_->UnrefAndTryDelete();
+  }
+}
+
+bool Compaction::SupportsPerKeyPlacement() const {
+  return penultimate_level_ != kInvalidLevel;
+}
+
+int Compaction::GetPenultimateLevel() const { return penultimate_level_; }
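The `max_output_file_size_` expression near the end of the constructor packs three conditions into one ternary; restated as a standalone function for exposition (a sketch, not part of the patch):

```cpp
#include <cstdint>

// Mirrors the constructor's max_output_file_size_ computation: with
// level_compaction_dynamic_file_size enabled, a non-bottommost output file
// that has grandparent overlap to consider may grow up to 2x the target.
uint64_t MaxOutputFileSize(uint64_t target_file_size, bool bottommost_level,
                           bool has_grandparents, bool dynamic_file_size) {
  if (bottommost_level || !has_grandparents || !dynamic_file_size) {
    return target_file_size;
  }
  return 2 * target_file_size;
}
```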
+
+// smallest_key and largest_key include timestamps if user-defined timestamp
+// is enabled.
+bool Compaction::OverlapPenultimateLevelOutputRange(
+    const Slice& smallest_key, const Slice& largest_key) const {
+  if (!SupportsPerKeyPlacement()) {
+    return false;
+  }
+  const Comparator* ucmp =
+      input_vstorage_->InternalComparator()->user_comparator();
+
+  return ucmp->CompareWithoutTimestamp(
+             smallest_key, penultimate_level_largest_user_key_) <= 0 &&
+         ucmp->CompareWithoutTimestamp(
+             largest_key, penultimate_level_smallest_user_key_) >= 0;
+}
+
+// key includes timestamp if user-defined timestamp is enabled.
+bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const {
+  if (!SupportsPerKeyPlacement()) {
+    return false;
+  }
+
+  if (penultimate_level_smallest_user_key_.empty() ||
+      penultimate_level_largest_user_key_.empty()) {
+    return false;
+  }
+
+  const Comparator* ucmp =
+      input_vstorage_->InternalComparator()->user_comparator();
+
+  return ucmp->CompareWithoutTimestamp(
+             key, penultimate_level_smallest_user_key_) >= 0 &&
+         ucmp->CompareWithoutTimestamp(
+             key, penultimate_level_largest_user_key_) <= 0;
+}
+
+bool Compaction::InputCompressionMatchesOutput() const {
+  int base_level = input_vstorage_->base_level();
+  bool matches =
+      (GetCompressionType(input_vstorage_, mutable_cf_options_, start_level_,
+                          base_level) == output_compression_);
+  if (matches) {
+    TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches");
+    return true;
+  }
+  TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch");
+  return matches;
+}
+
+bool Compaction::IsTrivialMove() const {
+  // Avoid a move if there is lots of overlapping grandparent data.
+  // Otherwise, the move could create a parent file that will require
+  // a very expensive merge later on.
+  // If start_level_ == output_level_, the purpose is to force compaction
+  // filter to be applied to that level, and thus cannot be a trivial move.
+
+  // Check if start level has files with overlapping ranges
+  if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false &&
+      l0_files_might_overlap_) {
+    // We cannot move files from L0 to L1 if the L0 files in the LSM-tree are
+    // overlapping, unless we are sure that files picked in L0 don't overlap.
+    return false;
+  }
+
+  if (is_manual_compaction_ &&
+      (immutable_options_.compaction_filter != nullptr ||
+       immutable_options_.compaction_filter_factory != nullptr)) {
+    // This is a manual compaction and we have a compaction filter that should
+    // be executed, we cannot do a trivial move
+    return false;
+  }
+
+  if (start_level_ == output_level_) {
+    // It doesn't make sense if compaction picker picks files just to trivial
+    // move to the same level.
+    return false;
+  }
+
+  if (compaction_reason_ == CompactionReason::kChangeTemperature) {
+    // Changing temperature usually requires rewriting the file.
+    return false;
+  }
+
+  // Used in universal compaction, where trivial move can be done if the
+  // input files are non overlapping
+  if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
+      (output_level_ != 0) &&
+      (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal)) {
+    return is_trivial_move_;
+  }
+
+  if (!(start_level_ != output_level_ && num_input_levels() == 1 &&
+        input(0, 0)->fd.GetPathId() == output_path_id() &&
+        InputCompressionMatchesOutput())) {
+    return false;
+  }
+
+  // assert inputs_.size() == 1
+
+  std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner();
+
+  for (const auto& file : inputs_.front().files) {
+    std::vector<FileMetaData*> file_grand_parents;
+    if (output_level_ + 1 >= number_levels_) {
+      continue;
+    }
+    input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
+                                          &file->largest,
+                                          &file_grand_parents);
+    const auto compaction_size =
+        file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
+    if (compaction_size > max_compaction_bytes_) {
+      return false;
+    }
+
+    if (partitioner.get() != nullptr) {
+      if (!partitioner->CanDoTrivialMove(file->smallest.user_key(),
+                                         file->largest.user_key())) {
+        return false;
+      }
+    }
+  }
+
+  // PerKeyPlacement compaction should never be trivial move.
+  if (SupportsPerKeyPlacement()) {
+    return false;
+  }
+
+  return true;
+}
+
+void Compaction::AddInputDeletions(VersionEdit* out_edit) {
+  for (size_t which = 0; which < num_input_levels(); which++) {
+    for (size_t i = 0; i < inputs_[which].size(); i++) {
+      out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
+    }
+  }
+}
+
+bool Compaction::KeyNotExistsBeyondOutputLevel(
+    const Slice& user_key, std::vector<size_t>* level_ptrs) const {
+  assert(input_version_ != nullptr);
+  assert(level_ptrs != nullptr);
+  assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
+  if (bottommost_level_) {
+    return true;
+  } else if (output_level_ != 0 &&
+             cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+    // Maybe use binary search to find right entry instead of linear search?
+    const Comparator* user_cmp = cfd_->user_comparator();
+    for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
+      const std::vector<FileMetaData*>& files =
+          input_vstorage_->LevelFiles(lvl);
+      for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
+        auto* f = files[level_ptrs->at(lvl)];
+        if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+          // We've advanced far enough
+          // In the presence of user-defined timestamp, we may need to handle
+          // the case in which f->smallest.user_key() (including ts) has the
+          // same user key, but the ts part is smaller. If so,
+          // Compare(user_key, f->smallest.user_key()) returns -1.
+          // That's why we need CompareWithoutTimestamp().
+          if (user_cmp->CompareWithoutTimestamp(user_key,
+                                                f->smallest.user_key()) >= 0) {
+            // Key falls in this file's range, so it may
+            // exist beyond output level
+            return false;
+          }
+          break;
+        }
+      }
+    }
+    return true;
+  }
+  return false;
+}
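KeyNotExistsBeyondOutputLevel() relies on `level_ptrs` as a set of per-level cursors that only move forward, which is sound because the compaction emits user keys in increasing order. The scan's shape over a toy model with string keys (illustrative, not part of the patch):

```cpp
#include <cstddef>
#include <string>
#include <vector>

struct ToyLevelFile {
  std::string smallest;
  std::string largest;
};

// Returns true when `user_key` cannot fall inside any file range on the
// levels below the output level. `level_ptrs` persists across calls and
// only ever advances, like the real implementation's cursors.
bool KeyNotBeyondOutput(
    const std::string& user_key,
    const std::vector<std::vector<ToyLevelFile>>& levels_below_output,
    std::vector<size_t>* level_ptrs) {
  for (size_t lvl = 0; lvl < levels_below_output.size(); ++lvl) {
    const auto& files = levels_below_output[lvl];
    for (size_t& i = (*level_ptrs)[lvl]; i < files.size(); ++i) {
      if (user_key <= files[i].largest) {
        if (user_key >= files[i].smallest) {
          return false;  // key falls inside this file's range
        }
        break;  // key sorts before this file; later files are even larger
      }
    }
  }
  return true;
}
```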
+
+// Mark (or clear) each file that is being compacted
+void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
+  for (size_t i = 0; i < num_input_levels(); i++) {
+    for (size_t j = 0; j < inputs_[i].size(); j++) {
+      assert(mark_as_compacted ? !inputs_[i][j]->being_compacted
+                               : inputs_[i][j]->being_compacted);
+      inputs_[i][j]->being_compacted = mark_as_compacted;
+    }
+  }
+}
+
+// Sample output:
+// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5,
+// print: "3@0 + 2@3 + 1@4 files to L5"
+const char* Compaction::InputLevelSummary(
+    InputLevelSummaryBuffer* scratch) const {
+  int len = 0;
+  bool is_first = true;
+  for (auto& input_level : inputs_) {
+    if (input_level.empty()) {
+      continue;
+    }
+    if (!is_first) {
+      len +=
+          snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + ");
+      len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+    } else {
+      is_first = false;
+    }
+    len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+                    "%" ROCKSDB_PRIszt "@%d", input_level.size(),
+                    input_level.level);
+    len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+  }
+  snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+           " files to L%d", output_level());
+
+  return scratch->buffer;
+}
+
+uint64_t Compaction::CalculateTotalInputSize() const {
+  uint64_t size = 0;
+  for (auto& input_level : inputs_) {
+    for (auto f : input_level.files) {
+      size += f->fd.GetFileSize();
+    }
+  }
+  return size;
+}
+
+void Compaction::ReleaseCompactionFiles(Status status) {
+  MarkFilesBeingCompacted(false);
+  cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
+}
+
+void Compaction::ResetNextCompactionIndex() {
+  assert(input_version_ != nullptr);
+  input_vstorage_->ResetNextCompactionIndex(start_level_);
+}
+
+namespace {
+int InputSummary(const std::vector<FileMetaData*>& files, char* output,
+                 int len) {
+  *output = '\0';
+  int write = 0;
+  for (size_t i = 0; i < files.size(); i++) {
+    int sz = len - write;
+    int ret;
+    char sztxt[16];
+    AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
+    ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
+                   files.at(i)->fd.GetNumber(), sztxt);
+    if (ret < 0 || ret >= sz) break;
+    write += ret;
+  }
+  // if files.size() is non-zero, overwrite the last space
+  return write - !!files.size();
+}
+}  // namespace
+
+void Compaction::Summary(char* output, int len) {
+  int write =
+      snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [",
+               input_version_->GetVersionNumber(), start_level_);
+  if (write < 0 || write >= len) {
+    return;
+  }
+
+  for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
+    if (level_iter > 0) {
+      write += snprintf(output + write, len - write, "], [");
+      if (write < 0 || write >= len) {
+        return;
+      }
+    }
+    write +=
+        InputSummary(inputs_[level_iter].files, output + write, len - write);
+    if (write < 0 || write >= len) {
+      return;
+    }
+  }
+
+  snprintf(output + write, len - write, "]");
+}
+
+uint64_t Compaction::OutputFilePreallocationSize() const {
+  uint64_t preallocation_size = 0;
+
+  for (const auto& level_files : inputs_) {
+    for (const auto& file : level_files.files) {
+      preallocation_size += file->fd.GetFileSize();
+    }
+  }
+
+  if (max_output_file_size_ != std::numeric_limits<uint64_t>::max() &&
+      (immutable_options_.compaction_style == kCompactionStyleLevel ||
+       output_level() > 0)) {
+    preallocation_size = std::min(max_output_file_size_, preallocation_size);
+  }
+
+  // Over-estimate slightly so we don't end up just barely crossing
+  // the threshold
+  // No point to preallocate more than 1GB.
+  return std::min(uint64_t{1073741824},
+                  preallocation_size + (preallocation_size / 10));
+}
+
+std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
+  if (!cfd_->ioptions()->compaction_filter_factory) {
+    return nullptr;
+  }
+
+  if (!cfd_->ioptions()
+           ->compaction_filter_factory->ShouldFilterTableFileCreation(
+               TableFileCreationReason::kCompaction)) {
+    return nullptr;
+  }
+
+  CompactionFilter::Context context;
+  context.is_full_compaction = is_full_compaction_;
+  context.is_manual_compaction = is_manual_compaction_;
+  context.column_family_id = cfd_->GetID();
+  context.reason = TableFileCreationReason::kCompaction;
+  return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
+      context);
+}
+
+std::unique_ptr<SstPartitioner> Compaction::CreateSstPartitioner() const {
+  if (!immutable_options_.sst_partitioner_factory) {
+    return nullptr;
+  }
+
+  SstPartitioner::Context context;
+  context.is_full_compaction = is_full_compaction_;
+  context.is_manual_compaction = is_manual_compaction_;
+  context.output_level = output_level_;
+  context.smallest_user_key = smallest_user_key_;
+  context.largest_user_key = largest_user_key_;
+  return immutable_options_.sst_partitioner_factory->CreatePartitioner(
+      context);
+}
+
+bool Compaction::IsOutputLevelEmpty() const {
+  return inputs_.back().level != output_level_ || inputs_.back().empty();
+}
+
+bool Compaction::ShouldFormSubcompactions() const {
+  if (cfd_ == nullptr) {
+    return false;
+  }
+
+  // Round-Robin pri under leveled compaction allows subcompactions by default
+  // and the number of subcompactions can be larger than max_subcompactions_
+  if (cfd_->ioptions()->compaction_pri == kRoundRobin &&
+      cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+    return output_level_ > 0;
+  }
+
+  if (max_subcompactions_ <= 1) {
+    return false;
+  }
+
+  if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+    return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0;
+  } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
+    return number_levels_ > 1 && output_level_ > 0;
+  } else {
+    return false;
+  }
+}
+
+bool Compaction::DoesInputReferenceBlobFiles() const {
+  assert(input_version_);
+
+  const VersionStorageInfo* storage_info = input_version_->storage_info();
+  assert(storage_info);
+
+  if (storage_info->GetBlobFiles().empty()) {
+    return false;
+  }
+
+  for (size_t i = 0; i < inputs_.size(); ++i) {
+    for (const FileMetaData* meta : inputs_[i].files) {
+      assert(meta);
+
+      if (meta->oldest_blob_file_number != kInvalidBlobFileNumber) {
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+uint64_t Compaction::MinInputFileOldestAncesterTime(
+    const InternalKey* start, const InternalKey* end) const {
+  uint64_t min_oldest_ancester_time = std::numeric_limits<uint64_t>::max();
+  const InternalKeyComparator& icmp =
+      column_family_data()->internal_comparator();
+  for (const auto& level_files : inputs_) {
+    for (const auto& file : level_files.files) {
+      if (start != nullptr && icmp.Compare(file->largest, *start) < 0) {
+        continue;
+      }
+      if (end != nullptr && icmp.Compare(file->smallest, *end) > 0) {
+        continue;
+      }
+      uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+      if (oldest_ancester_time != 0) {
+        min_oldest_ancester_time =
+            std::min(min_oldest_ancester_time, oldest_ancester_time);
+      }
+    }
+  }
+  return min_oldest_ancester_time;
+}
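CreateCompactionFilter() above defers to the column family's `compaction_filter_factory`. For context, a minimal user-side factory built only on the public `rocksdb/compaction_filter.h` API might look as follows (a sketch; the "expired:" key convention is invented for illustration):

```cpp
#include <memory>
#include <string>

#include "rocksdb/compaction_filter.h"

// Drops every entry whose key carries a (hypothetical) "expired:" prefix.
class DropExpiredFilter : public ROCKSDB_NAMESPACE::CompactionFilter {
 public:
  bool Filter(int /*level*/, const ROCKSDB_NAMESPACE::Slice& key,
              const ROCKSDB_NAMESPACE::Slice& /*existing_value*/,
              std::string* /*new_value*/,
              bool* /*value_changed*/) const override {
    return key.starts_with("expired:");  // true => remove the entry
  }
  const char* Name() const override { return "DropExpiredFilter"; }
};

class DropExpiredFilterFactory
    : public ROCKSDB_NAMESPACE::CompactionFilterFactory {
 public:
  std::unique_ptr<ROCKSDB_NAMESPACE::CompactionFilter> CreateCompactionFilter(
      const ROCKSDB_NAMESPACE::CompactionFilter::Context& /*ctx*/) override {
    return std::make_unique<DropExpiredFilter>();
  }
  const char* Name() const override { return "DropExpiredFilterFactory"; }
};
```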
+
+uint64_t Compaction::MinInputFileEpochNumber() const {
+  uint64_t min_epoch_number = std::numeric_limits<uint64_t>::max();
+  for (const auto& inputs_per_level : inputs_) {
+    for (const auto& file : inputs_per_level.files) {
+      min_epoch_number = std::min(min_epoch_number, file->epoch_number);
+    }
+  }
+  return min_epoch_number;
+}
+
+int Compaction::EvaluatePenultimateLevel(
+    const VersionStorageInfo* vstorage,
+    const ImmutableOptions& immutable_options, const int start_level,
+    const int output_level) {
+  // TODO: currently the per_key_placement feature only supports level and
+  // universal compaction
+  if (immutable_options.compaction_style != kCompactionStyleLevel &&
+      immutable_options.compaction_style != kCompactionStyleUniversal) {
+    return kInvalidLevel;
+  }
+  if (output_level != immutable_options.num_levels - 1) {
+    return kInvalidLevel;
+  }
+
+  int penultimate_level = output_level - 1;
+  assert(penultimate_level < immutable_options.num_levels);
+  if (penultimate_level <= 0) {
+    return kInvalidLevel;
+  }
+
+  // If the penultimate level is not within input level -> output level range
+  // check if the penultimate output level is empty, if it's empty, it could
+  // also be locked for the penultimate output.
+  // TODO: ideally, it only needs to check if there's a file within the
+  // compaction output key range. For simplicity, it just checks if there's
+  // any file on the penultimate level.
+  if (start_level == immutable_options.num_levels - 1 &&
+      (immutable_options.compaction_style != kCompactionStyleUniversal ||
+       !vstorage->LevelFiles(penultimate_level).empty())) {
+    return kInvalidLevel;
+  }
+
+  bool supports_per_key_placement =
+      immutable_options.preclude_last_level_data_seconds > 0;
+
+  // it could be overridden by unittest
+  TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
+                           &supports_per_key_placement);
+  if (!supports_per_key_placement) {
+    return kInvalidLevel;
+  }
+
+  return penultimate_level;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction.h
new file mode 100644
index 0000000000..4fa1dce71a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction.h
@@ -0,0 +1,565 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/sst_partitioner.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The file contains class Compaction, as well as some helper functions
+// and data structures used by the class.
+
+// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
+// null which provides the property that a==null indicates a key that is less
+// than any key and b==null indicates a key that is greater than any key. Note
+// that the comparison is performed primarily on the user-key portion of the
+// key. If the user-keys compare equal, an additional test is made to sort
+// range tombstone sentinel keys before other keys with the same user-key. The
+// result is that 2 user-keys will compare equal if they differ purely on
+// their sequence number and value, but the range tombstone sentinel for that
+// user-key will compare not equal. This is necessary because the range
+// tombstone sentinel key is set as the largest key for an sstable even though
+// that key never appears in the database. We don't want adjacent sstables to
+// be considered overlapping if they are separated by the range tombstone
+// sentinel.
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+                      const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey* b);
+
+// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
+// largest] that exactly spans one or more neighbouring SSTs on the same
+// level. Every pair of SSTs in this range "overlap" (i.e., the largest
+// user key of one file is the smallest user key of the next file). These
+// boundaries are propagated down to RangeDelAggregator during compaction
+// to provide safe truncation boundaries for range tombstones.
+struct AtomicCompactionUnitBoundary {
+  const InternalKey* smallest = nullptr;
+  const InternalKey* largest = nullptr;
+};
+
+// The structure that manages compaction input files associated
+// with the same physical level.
+struct CompactionInputFiles {
+  int level;
+  std::vector<FileMetaData*> files;
+  std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
+  inline bool empty() const { return files.empty(); }
+  inline size_t size() const { return files.size(); }
+  inline void clear() { files.clear(); }
+  inline FileMetaData* operator[](size_t i) const { return files[i]; }
+};
+
+class Version;
+class ColumnFamilyData;
+class VersionStorageInfo;
+class CompactionFilter;
+
+// A Compaction encapsulates metadata about a compaction.
+class Compaction {
+ public:
+  Compaction(VersionStorageInfo* input_version,
+             const ImmutableOptions& immutable_options,
+             const MutableCFOptions& mutable_cf_options,
+             const MutableDBOptions& mutable_db_options,
+             std::vector<CompactionInputFiles> inputs, int output_level,
+             uint64_t target_file_size, uint64_t max_compaction_bytes,
+             uint32_t output_path_id, CompressionType compression,
+             CompressionOptions compression_opts,
+             Temperature output_temperature, uint32_t max_subcompactions,
+             std::vector<FileMetaData*> grandparents,
+             bool manual_compaction = false, const std::string& trim_ts = "",
+             double score = -1, bool deletion_compaction = false,
+             bool l0_files_might_overlap = true,
+             CompactionReason compaction_reason = CompactionReason::kUnknown,
+             BlobGarbageCollectionPolicy blob_garbage_collection_policy =
+                 BlobGarbageCollectionPolicy::kUseDefault,
+             double blob_garbage_collection_age_cutoff = -1);
+
+  // The type of the penultimate level output range
+  enum class PenultimateOutputRangeType : int {
+    kNotSupported,  // it cannot output to the penultimate level
+    kFullRange,     // any data could be output to the penultimate level
+    kNonLastRange,  // only the keys within non_last_level compaction inputs
+                    // can be outputted to the penultimate level
+    kDisabled,      // no data can be outputted to the penultimate level
+  };
+
+  // No copying allowed
+  Compaction(const Compaction&) = delete;
+  void operator=(const Compaction&) = delete;
+
+  ~Compaction();
+
+  // Returns the level associated to the specified compaction input level.
+  // If compaction_input_level is not specified, then input_level is set to 0.
+  int level(size_t compaction_input_level = 0) const {
+    return inputs_[compaction_input_level].level;
+  }
+
+  int start_level() const { return start_level_; }
+
+  // Outputs will go to this level
+  int output_level() const { return output_level_; }
+
+  // Returns the number of input levels in this compaction.
+  size_t num_input_levels() const { return inputs_.size(); }
+
+  // Return the object that holds the edits to the descriptor done
+  // by this compaction.
+  VersionEdit* edit() { return &edit_; }
+
+  // Returns the number of input files associated to the specified
+  // compaction input level.
+  // The function will return 0 when "compaction_input_level" < 0
+  // or "compaction_input_level" >= "num_input_levels()".
+  size_t num_input_files(size_t compaction_input_level) const {
+    if (compaction_input_level < inputs_.size()) {
+      return inputs_[compaction_input_level].size();
+    }
+    return 0;
+  }
+
+  // Returns input version of the compaction
+  Version* input_version() const { return input_version_; }
+
+  // Returns the ColumnFamilyData associated with the compaction.
+  ColumnFamilyData* column_family_data() const { return cfd_; }
+
+  // Returns the file meta data of the 'i'th input file at the
+  // specified compaction input level.
+  // REQUIREMENT: "compaction_input_level" must be >= 0 and
+  //              < "input_levels()"
+  FileMetaData* input(size_t compaction_input_level, size_t i) const {
+    assert(compaction_input_level < inputs_.size());
+    return inputs_[compaction_input_level][i];
+  }
+
+  const std::vector<AtomicCompactionUnitBoundary>* boundaries(
+      size_t compaction_input_level) const {
+    assert(compaction_input_level < inputs_.size());
+    return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
+  }
+
+  // Returns the list of file meta data of the specified compaction
+  // input level.
+  // REQUIREMENT: "compaction_input_level" must be >= 0 and
+  //              < "input_levels()"
+  const std::vector<FileMetaData*>* inputs(
+      size_t compaction_input_level) const {
+    assert(compaction_input_level < inputs_.size());
+    return &inputs_[compaction_input_level].files;
+  }
+
+  const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
+
+  // Returns the LevelFilesBrief of the specified compaction input level.
+  const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
+    return &input_levels_[compaction_input_level];
+  }
+
+  // Maximum size of files to build during this compaction.
+  uint64_t max_output_file_size() const { return max_output_file_size_; }
+
+  // Target output file size for this compaction
+  uint64_t target_output_file_size() const { return target_output_file_size_; }
+
+  // What compression for output
+  CompressionType output_compression() const { return output_compression_; }
+
+  // What compression options for output
+  const CompressionOptions& output_compression_opts() const {
+    return output_compression_opts_;
+  }
+
+  // Whether the output file needs to be written to a secondary DB path.
+  uint32_t output_path_id() const { return output_path_id_; }
+
+  // Is this a trivial compaction that can be implemented by just
+  // moving a single input file to the next level (no merging or splitting)
+  bool IsTrivialMove() const;
+
+  // The split user key in the output level if this compaction is required to
+  // split the output files according to the existing cursor in the output
+  // level under round-robin compaction policy. Empty indicates no required
+  // splitting key
+  const InternalKey* GetOutputSplitKey() const { return output_split_key_; }
+
+  // If true, then the compaction can be done by simply deleting input files.
+  bool deletion_compaction() const { return deletion_compaction_; }
+
+  // Add all inputs to this compaction as delete operations to *edit.
+  void AddInputDeletions(VersionEdit* edit);
+
+  // Returns true if the available information we have guarantees that
+  // the input "user_key" does not exist in any level beyond "output_level()".
+  bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
+                                     std::vector<size_t>* level_ptrs) const;
+
+  // Clear all files to indicate that they are not being compacted
+  // Delete this compaction from the list of running compactions.
+  //
+  // Requirement: DB mutex held
+  void ReleaseCompactionFiles(Status status);
+
+  // Returns the summary of the compaction in "output" with maximum "len"
+  // in bytes. The caller is responsible for the memory management of
+  // "output".
+  void Summary(char* output, int len);
+
+  // Return the score that was used to pick this compaction run.
+  double score() const { return score_; }
+
+  // Is this compaction creating a file in the bottom most level?
+  bool bottommost_level() const { return bottommost_level_; }
+
+  // Does the compaction output to the last level?
+  bool is_last_level() const {
+    return output_level_ == immutable_options_.num_levels - 1;
+  }
+
+  // Does this compaction include all sst files?
+  bool is_full_compaction() const { return is_full_compaction_; }
+
+  // Was this compaction triggered manually by the client?
+  bool is_manual_compaction() const { return is_manual_compaction_; }
+
+  std::string trim_ts() const { return trim_ts_; }
+
+  // Used when allow_trivial_move option is set in
+  // Universal compaction. If all the input files are
+  // non overlapping, then is_trivial_move_ variable
+  // will be set true, else false
+  void set_is_trivial_move(bool trivial_move) {
+    is_trivial_move_ = trivial_move;
+  }
+
+  // Used when allow_trivial_move option is set in
+  // Universal compaction. Returns true, if the input files
+  // are non-overlapping and can be trivially moved.
+  bool is_trivial_move() const { return is_trivial_move_; }
+
+  // How many total levels are there?
+  int number_levels() const { return number_levels_; }
+
+  // Return the ImmutableOptions that should be used throughout the compaction
+  // procedure
+  const ImmutableOptions* immutable_options() const {
+    return &immutable_options_;
+  }
+
+  // Return the MutableCFOptions that should be used throughout the compaction
+  // procedure
+  const MutableCFOptions* mutable_cf_options() const {
+    return &mutable_cf_options_;
+  }
+
+  // Returns the size in bytes that the output file should be preallocated to.
+  // In level compaction, that is max_file_size_. In universal compaction,
+  // that is the sum of all input file sizes.
+  uint64_t OutputFilePreallocationSize() const;
+
+  void SetInputVersion(Version* input_version);
+
+  struct InputLevelSummaryBuffer {
+    char buffer[128];
+  };
+
+  const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const;
+
+  uint64_t CalculateTotalInputSize() const;
+
+  // In case of compaction error, reset the nextIndex that is used
+  // to pick up the next file to be compacted from files_by_size_
+  void ResetNextCompactionIndex();
+
+  // Create a CompactionFilter from compaction_filter_factory
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter() const;
+
+  // Create a SstPartitioner from sst_partitioner_factory
+  std::unique_ptr<SstPartitioner> CreateSstPartitioner() const;
+
+  // Is the input level corresponding to output_level_ empty?
+  bool IsOutputLevelEmpty() const;
+
+  // Should this compaction be broken up into smaller ones run in parallel?
+  bool ShouldFormSubcompactions() const;
+
+  // Returns true iff at least one input file references a blob file.
+  //
+  // PRE: input version has been set.
+  bool DoesInputReferenceBlobFiles() const;
+
+  // test function to validate the functionality of IsBottommostLevel()
+  // function -- determines if compaction with inputs and storage is bottommost
+  static bool TEST_IsBottommostLevel(
+      int output_level, VersionStorageInfo* vstorage,
+      const std::vector<CompactionInputFiles>& inputs);
+
+  TablePropertiesCollection GetOutputTableProperties() const {
+    return output_table_properties_;
+  }
+
+  void SetOutputTableProperties(TablePropertiesCollection tp) {
+    output_table_properties_ = std::move(tp);
+  }
+
+  Slice GetSmallestUserKey() const { return smallest_user_key_; }
+
+  Slice GetLargestUserKey() const { return largest_user_key_; }
+
+  Slice GetPenultimateLevelSmallestUserKey() const {
+    return penultimate_level_smallest_user_key_;
+  }
+
+  Slice GetPenultimateLevelLargestUserKey() const {
+    return penultimate_level_largest_user_key_;
+  }
+
+  PenultimateOutputRangeType GetPenultimateOutputRangeType() const {
+    return penultimate_output_range_type_;
+  }
+
+  // Return true if the compaction supports per_key_placement
+  bool SupportsPerKeyPlacement() const;
+
+  // Get per_key_placement penultimate output level, which is `last_level - 1`
+  // if per_key_placement feature is supported. Otherwise, return -1.
+  int GetPenultimateLevel() const;
+
+  // Return true if the given range overlaps with the penultimate level
+  // output range.
+  // Both smallest_key and largest_key include timestamps if user-defined
+  // timestamp is enabled.
+  bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
+                                          const Slice& largest_key) const;
+
+  // Return true if the key is within the penultimate level output range for
+  // the per_key_placement feature, in which case it is safe to place the key
+  // on the penultimate level. Different compaction strategies have different
+  // rules. If per_key_placement is not supported, always return false.
+  // TODO: currently it doesn't support moving data from the last level to the
+  // penultimate level
+  // key includes timestamp if user-defined timestamp is enabled.
+  bool WithinPenultimateLevelOutputRange(const Slice& key) const;
+
+  CompactionReason compaction_reason() const { return compaction_reason_; }
+
+  const std::vector<FileMetaData*>& grandparents() const {
+    return grandparents_;
+  }
+
+  uint64_t max_compaction_bytes() const { return max_compaction_bytes_; }
+
+  Temperature output_temperature() const { return output_temperature_; }
+
+  uint32_t max_subcompactions() const { return max_subcompactions_; }
+
+  bool enable_blob_garbage_collection() const {
+    return enable_blob_garbage_collection_;
+  }
+
+  double blob_garbage_collection_age_cutoff() const {
+    return blob_garbage_collection_age_cutoff_;
+  }
+
+  // start and end are the boundaries of the subcompaction range; null means
+  // unbounded. They are used to filter out input files whose oldest-ancester
+  // time range falls outside the subcompaction.
+  uint64_t MinInputFileOldestAncesterTime(const InternalKey* start,
+                                          const InternalKey* end) const;
+  // Return the minimum epoch number among input files associated
+  // with this compaction
+  uint64_t MinInputFileEpochNumber() const;
+
+  // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of
+  // compaction begin and compaction completion callbacks match.
+  void SetNotifyOnCompactionCompleted() {
+    notify_on_compaction_completion_ = true;
+  }
+
+  bool ShouldNotifyOnCompactionCompleted() const {
+    return notify_on_compaction_completion_;
+  }
+
+  static constexpr int kInvalidLevel = -1;
+
+  // Evaluate the penultimate output level. If the compaction supports the
+  // per_key_placement feature, it returns the penultimate level number.
+  // Otherwise, it returns kInvalidLevel (-1), which means
+  // output_to_penultimate_level is not supported.
+  // Note: even if penultimate level output is supported (PenultimateLevel !=
+  // kInvalidLevel), some key ranges may be unsafe to output to the
+  // penultimate level. The safe key range is populated by
+  // `PopulatePenultimateLevelOutputRange()`, which could potentially disable
+  // all penultimate level output.
+  static int EvaluatePenultimateLevel(const VersionStorageInfo* vstorage,
+                                      const ImmutableOptions& immutable_options,
+                                      const int start_level,
+                                      const int output_level);
+
+ private:
+  // mark (or clear) all files that are being compacted
+  void MarkFilesBeingCompacted(bool mark_as_compacted);
+
+  // get the smallest and largest key present in files to be compacted
+  static void GetBoundaryKeys(VersionStorageInfo* vstorage,
+                              const std::vector<CompactionInputFiles>& inputs,
+                              Slice* smallest_key, Slice* largest_key,
+                              int exclude_level = -1);
+
+  // populate penultimate level output range, which will be used to determine
+  // if a key is safe to output to the penultimate level (for details see
+  // `Compaction::WithinPenultimateLevelOutputRange()`).
+  void PopulatePenultimateLevelOutputRange();
+
+  // Get the atomic file boundaries for all files in the compaction. Necessary
+  // in order to avoid the scenario described in
+  // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and
+  // plumb down appropriate key boundaries to RangeDelAggregator during
+  // compaction.
+  static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
+      VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);
+
+  // helper function to determine if compaction with inputs and storage is
+  // bottommost
+  static bool IsBottommostLevel(
+      int output_level, VersionStorageInfo* vstorage,
+      const std::vector<CompactionInputFiles>& inputs);
+
+  static bool IsFullCompaction(VersionStorageInfo* vstorage,
+                               const std::vector<CompactionInputFiles>& inputs);
+
+  VersionStorageInfo* input_vstorage_;
+
+  const int start_level_;   // the lowest level to be compacted
+  const int output_level_;  // levels to which output files are stored
+  uint64_t target_output_file_size_;
+  uint64_t max_output_file_size_;
+  uint64_t max_compaction_bytes_;
+  uint32_t max_subcompactions_;
+  const ImmutableOptions immutable_options_;
+  const MutableCFOptions mutable_cf_options_;
+  Version* input_version_;
+  VersionEdit edit_;
+  const int number_levels_;
+  ColumnFamilyData* cfd_;
+  Arena arena_;  // Arena used to allocate space for file_levels_
+
+  const uint32_t output_path_id_;
+  CompressionType output_compression_;
+  CompressionOptions output_compression_opts_;
+  Temperature output_temperature_;
+  // If true, then the compaction can be done by simply deleting input files.
+  const bool deletion_compaction_;
+  // should it split the output file using the compact cursor?
+  const InternalKey* output_split_key_;
+
+  // L0 files in LSM-tree might be overlapping. But the compaction picking
+  // logic might pick a subset of the files that aren't overlapping. if
+  // that is the case, set the value to false. Otherwise, set it true.
+  bool l0_files_might_overlap_;
+
+  // Compaction input files organized by level. Constant after construction
Constant after construction + const std::vector inputs_; + + // A copy of inputs_, organized more closely in memory + autovector input_levels_; + + // State used to check for number of overlapping grandparent files + // (grandparent == "output_level_ + 1") + std::vector grandparents_; + const double score_; // score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? + const bool bottommost_level_; + // Does this compaction include all sst files? + const bool is_full_compaction_; + + // Is this compaction requested by the client? + const bool is_manual_compaction_; + + // The data with timestamp > trim_ts_ will be removed + const std::string trim_ts_; + + // True if we can do trivial move in Universal multi level + // compaction + bool is_trivial_move_; + + // Does input compression match the output compression? + bool InputCompressionMatchesOutput() const; + + // table properties of output files + TablePropertiesCollection output_table_properties_; + + // smallest user keys in compaction + // includes timestamp if user-defined timestamp is enabled. + Slice smallest_user_key_; + + // largest user keys in compaction + // includes timestamp if user-defined timestamp is enabled. + Slice largest_user_key_; + + // Reason for compaction + CompactionReason compaction_reason_; + + // Notify on compaction completion only if listener was notified on compaction + // begin. + bool notify_on_compaction_completion_; + + // Enable/disable GC collection for blobs during compaction. + bool enable_blob_garbage_collection_; + + // Blob garbage collection age cutoff. + double blob_garbage_collection_age_cutoff_; + + // only set when per_key_placement feature is enabled, -1 (kInvalidLevel) + // means not supported. + const int penultimate_level_; + + // Key range for penultimate level output + // includes timestamp if user-defined timestamp is enabled. + // penultimate_output_range_type_ shows the range type + Slice penultimate_level_smallest_user_key_; + Slice penultimate_level_largest_user_key_; + PenultimateOutputRangeType penultimate_output_range_type_ = + PenultimateOutputRangeType::kNotSupported; +}; + +#ifndef NDEBUG +// Helper struct only for tests, which contains the data to decide if a key +// should be output to the penultimate level. +// TODO: remove this when the public feature knob is available +struct PerKeyPlacementContext { + const int level; + const Slice key; + const Slice value; + const SequenceNumber seq_num; + + bool& output_to_penultimate_level; + + PerKeyPlacementContext(int _level, Slice _key, Slice _value, + SequenceNumber _seq_num, + bool& _output_to_penultimate_level) + : level(_level), + key(_key), + value(_value), + seq_num(_seq_num), + output_to_penultimate_level(_output_to_penultimate_level) {} +}; +#endif /* !NDEBUG */ + +// Return sum of sizes of all files in `files`. +extern uint64_t TotalFileSize(const std::vector& files); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iteration_stats.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iteration_stats.h new file mode 100644 index 0000000000..1b1c28b57a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iteration_stats.h @@ -0,0 +1,49 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. 
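[Reviewer note] `TotalFileSize`, declared just above, is a plain accumulator over file metadata. A minimal sketch of what it computes, assuming `FileMetaData` exposes the on-disk size via `fd.GetFileSize()` as in upstream RocksDB:

    uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
      uint64_t sum = 0;
      for (const FileMetaData* f : files) {
        sum += f->fd.GetFileSize();  // physical size of the SST file on disk
      }
      return sum;
    }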
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iteration_stats.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iteration_stats.h
new file mode 100644
index 0000000000..1b1c28b57a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iteration_stats.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct CompactionIterationStats {
+  // Compaction statistics
+
+  // Doesn't include records skipped because of
+  // CompactionFilter::Decision::kRemoveAndSkipUntil.
+  int64_t num_record_drop_user = 0;
+
+  int64_t num_record_drop_hidden = 0;
+  int64_t num_record_drop_obsolete = 0;
+  int64_t num_record_drop_range_del = 0;
+  int64_t num_range_del_drop_obsolete = 0;
+  // Deletions obsoleted before bottom level due to file gap optimization.
+  int64_t num_optimized_del_drop_obsolete = 0;
+  uint64_t total_filter_time = 0;
+
+  // Input statistics
+  // TODO(noetzli): The stats are incomplete. They are lacking everything
+  // consumed by MergeHelper.
+  uint64_t num_input_records = 0;
+  uint64_t num_input_deletion_records = 0;
+  uint64_t num_input_corrupt_records = 0;
+  uint64_t total_input_raw_key_bytes = 0;
+  uint64_t total_input_raw_value_bytes = 0;
+
+  // Single-Delete diagnostics for exceptional situations
+  uint64_t num_single_del_fallthru = 0;
+  uint64_t num_single_del_mismatch = 0;
+
+  // Blob related statistics
+  uint64_t num_blobs_read = 0;
+  uint64_t total_blob_bytes_read = 0;
+  uint64_t num_blobs_relocated = 0;
+  uint64_t total_blob_bytes_relocated = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
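[Reviewer note] The struct above is a plain aggregate, so counters collected by parallel subcompactions can be merged by field-wise addition. A hypothetical helper (not part of the vendored sources) illustrating that use:

    void AddIterationStats(CompactionIterationStats& total,
                           const CompactionIterationStats& part) {
      total.num_record_drop_user += part.num_record_drop_user;
      total.num_record_drop_hidden += part.num_record_drop_hidden;
      total.num_record_drop_obsolete += part.num_record_drop_obsolete;
      total.num_input_records += part.num_input_records;
      total.total_input_raw_key_bytes += part.total_input_raw_key_bytes;
      total.total_input_raw_value_bytes += part.total_input_raw_value_bytes;
      // ... the remaining counters are combined the same way.
    }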
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iterator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iterator.cc
new file mode 100644
index 0000000000..5be7b565ac
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iterator.cc
@@ -0,0 +1,1445 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/compaction_iterator.h"
+
+#include <iterator>
+#include <limits>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_file_builder.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/prefetch_buffer_collection.h"
+#include "db/snapshot_checker.h"
+#include "db/wide/wide_column_serialization.h"
+#include "logging/logging.h"
+#include "port/likely.h"
+#include "rocksdb/listener.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+CompactionIterator::CompactionIterator(
+    InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+    SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+    Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+    CompactionRangeDelAggregator* range_del_agg,
+    BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+    bool enforce_single_del_contracts,
+    const std::atomic<bool>& manual_compaction_canceled,
+    const Compaction* compaction, const CompactionFilter* compaction_filter,
+    const std::atomic<bool>* shutting_down,
+    const std::shared_ptr<Logger> info_log,
+    const std::string* full_history_ts_low,
+    const SequenceNumber preserve_time_min_seqno,
+    const SequenceNumber preclude_last_level_min_seqno)
+    : CompactionIterator(
+          input, cmp, merge_helper, last_sequence, snapshots,
+          earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
+          report_detailed_time, expect_valid_internal_key, range_del_agg,
+          blob_file_builder, allow_data_in_errors, enforce_single_del_contracts,
+          manual_compaction_canceled,
+          std::unique_ptr<CompactionProxy>(
+              compaction ? new RealCompaction(compaction) : nullptr),
+          compaction_filter, shutting_down, info_log, full_history_ts_low,
+          preserve_time_min_seqno, preclude_last_level_min_seqno) {}
+
+CompactionIterator::CompactionIterator(
+    InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+    SequenceNumber /*last_sequence*/, std::vector<SequenceNumber>* snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+    Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+    CompactionRangeDelAggregator* range_del_agg,
+    BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+    bool enforce_single_del_contracts,
+    const std::atomic<bool>& manual_compaction_canceled,
+    std::unique_ptr<CompactionProxy> compaction,
+    const CompactionFilter* compaction_filter,
+    const std::atomic<bool>* shutting_down,
+    const std::shared_ptr<Logger> info_log,
+    const std::string* full_history_ts_low,
+    const SequenceNumber preserve_time_min_seqno,
+    const SequenceNumber preclude_last_level_min_seqno)
+    : input_(input, cmp,
+             !compaction || compaction->DoesInputReferenceBlobFiles()),
+      cmp_(cmp),
+      merge_helper_(merge_helper),
+      snapshots_(snapshots),
+      earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+      job_snapshot_(job_snapshot),
+      snapshot_checker_(snapshot_checker),
+      env_(env),
+      clock_(env_->GetSystemClock().get()),
+      report_detailed_time_(report_detailed_time),
+      expect_valid_internal_key_(expect_valid_internal_key),
+      range_del_agg_(range_del_agg),
+      blob_file_builder_(blob_file_builder),
+      compaction_(std::move(compaction)),
+      compaction_filter_(compaction_filter),
+      shutting_down_(shutting_down),
+      manual_compaction_canceled_(manual_compaction_canceled),
+      bottommost_level_(!compaction_ ?
false
+                             : compaction_->bottommost_level() &&
+                                   !compaction_->allow_ingest_behind()),
+      // snapshots_ cannot be nullptr, but we will assert later in the body of
+      // the constructor.
+      visible_at_tip_(snapshots_ ? snapshots_->empty() : false),
+      earliest_snapshot_(!snapshots_ || snapshots_->empty()
+                             ? kMaxSequenceNumber
+                             : snapshots_->at(0)),
+      info_log_(info_log),
+      allow_data_in_errors_(allow_data_in_errors),
+      enforce_single_del_contracts_(enforce_single_del_contracts),
+      timestamp_size_(cmp_ ? cmp_->timestamp_size() : 0),
+      full_history_ts_low_(full_history_ts_low),
+      current_user_key_sequence_(0),
+      current_user_key_snapshot_(0),
+      merge_out_iter_(merge_helper_),
+      blob_garbage_collection_cutoff_file_number_(
+          ComputeBlobGarbageCollectionCutoffFileNumber(compaction_.get())),
+      blob_fetcher_(CreateBlobFetcherIfNeeded(compaction_.get())),
+      prefetch_buffers_(
+          CreatePrefetchBufferCollectionIfNeeded(compaction_.get())),
+      current_key_committed_(false),
+      cmp_with_history_ts_low_(0),
+      level_(compaction_ == nullptr ? 0 : compaction_->level()),
+      preserve_time_min_seqno_(preserve_time_min_seqno),
+      preclude_last_level_min_seqno_(preclude_last_level_min_seqno) {
+  assert(snapshots_ != nullptr);
+  assert(preserve_time_min_seqno_ <= preclude_last_level_min_seqno_);
+
+  if (compaction_ != nullptr) {
+    level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
+  }
+#ifndef NDEBUG
+  // findEarliestVisibleSnapshot assumes this ordering.
+  for (size_t i = 1; i < snapshots_->size(); ++i) {
+    assert(snapshots_->at(i - 1) < snapshots_->at(i));
+  }
+  assert(timestamp_size_ == 0 || !full_history_ts_low_ ||
+         timestamp_size_ == full_history_ts_low_->size());
+#endif
+  input_.SetPinnedItersMgr(&pinned_iters_mgr_);
+  // The default `merge_until_status_` does not need to be checked since it is
+  // overwritten as soon as `MergeUntil()` is called
+  merge_until_status_.PermitUncheckedError();
+  TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
+}
+
+CompactionIterator::~CompactionIterator() {
+  // input_ iterator lifetime is longer than pinned_iters_mgr_ lifetime
+  input_.SetPinnedItersMgr(nullptr);
+}
+
+void CompactionIterator::ResetRecordCounts() {
+  iter_stats_.num_record_drop_user = 0;
+  iter_stats_.num_record_drop_hidden = 0;
+  iter_stats_.num_record_drop_obsolete = 0;
+  iter_stats_.num_record_drop_range_del = 0;
+  iter_stats_.num_range_del_drop_obsolete = 0;
+  iter_stats_.num_optimized_del_drop_obsolete = 0;
+}
+
+void CompactionIterator::SeekToFirst() {
+  NextFromInput();
+  PrepareOutput();
+}
+
+void CompactionIterator::Next() {
+  // If there is a merge output, return it before continuing to process the
+  // input.
+  if (merge_out_iter_.Valid()) {
+    merge_out_iter_.Next();
+
+    // Check if we returned all records of the merge output.
+    if (merge_out_iter_.Valid()) {
+      key_ = merge_out_iter_.key();
+      value_ = merge_out_iter_.value();
+      Status s = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+      // MergeUntil stops when it encounters a corrupt key and does not
+      // include them in the result, so we expect the keys here to be valid.
+      if (!s.ok()) {
+        ROCKS_LOG_FATAL(
+            info_log_, "Invalid ikey %s in compaction. %s",
+            allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden",
+            s.getState());
+        assert(false);
+      }
+
+      // Keep current_key_ in sync.
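+      // (With user-defined timestamps enabled, the timestamp slice must be
+      // passed through UpdateInternalKey as well; sequence and type alone
+      // are not enough to rebuild the internal key.)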
+ if (0 == timestamp_size_) { + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } else { + Slice ts = ikey_.GetTimestamp(timestamp_size_); + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type, &ts); + } + key_ = current_key_.GetInternalKey(); + ikey_.user_key = current_key_.GetUserKey(); + validity_info_.SetValid(ValidContext::kMerge1); + } else { + if (merge_until_status_.IsMergeInProgress()) { + // `Status::MergeInProgress()` tells us that the previous `MergeUntil()` + // produced only merge operands. Those merge operands were accessed and + // written out using `merge_out_iter_`. Since `merge_out_iter_` is + // exhausted at this point, all merge operands have been written out. + // + // Still, there may be a base value (PUT, DELETE, SINGLEDEL, etc.) that + // needs to be written out. Normally, `CompactionIterator` would skip it + // on the basis that it has already output something in the same + // snapshot stripe. To prevent this, we reset `has_current_user_key_` to + // trick the future iteration from finding out the snapshot stripe is + // unchanged. + has_current_user_key_ = false; + } + // We consumed all pinned merge operands, release pinned iterators + pinned_iters_mgr_.ReleasePinnedData(); + // MergeHelper moves the iterator to the first record after the merged + // records, so even though we reached the end of the merge output, we do + // not want to advance the iterator. + NextFromInput(); + } + } else { + // Only advance the input iterator if there is no merge output and the + // iterator is not already at the next record. + if (!at_next_) { + AdvanceInputIter(); + } + NextFromInput(); + } + + if (Valid()) { + // Record that we've outputted a record for the current key. + has_outputted_key_ = true; + } + + PrepareOutput(); +} + +bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, + Slice* skip_until) { + if (!compaction_filter_) { + return true; + } + + if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex && + ikey_.type != kTypeWideColumnEntity) { + return true; + } + + CompactionFilter::Decision decision = + CompactionFilter::Decision::kUndetermined; + CompactionFilter::ValueType value_type = + ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue + : ikey_.type == kTypeBlobIndex + ? CompactionFilter::ValueType::kBlobIndex + : CompactionFilter::ValueType::kWideColumnEntity; + + // Hack: pass internal key to BlobIndexCompactionFilter since it needs + // to get sequence number. + assert(compaction_filter_); + const Slice& filter_key = + (ikey_.type != kTypeBlobIndex || + !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) + ? ikey_.user_key + : key_; + + compaction_filter_value_.clear(); + compaction_filter_skip_until_.Clear(); + + std::vector> new_columns; + + { + StopWatchNano timer(clock_, report_detailed_time_); + + if (ikey_.type == kTypeBlobIndex) { + decision = compaction_filter_->FilterBlobByKey( + level_, filter_key, &compaction_filter_value_, + compaction_filter_skip_until_.rep()); + if (decision == CompactionFilter::Decision::kUndetermined && + !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + if (!compaction_) { + status_ = + Status::Corruption("Unexpected blob index outside of compaction"); + validity_info_.Invalidate(); + return false; + } + + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::InvokeFilterIfNeeded::TamperWithBlobIndex", + &value_); + + // For integrated BlobDB impl, CompactionIterator reads blob value. 
+ // For Stacked BlobDB impl, the corresponding CompactionFilter's + // FilterV2 method should read the blob value. + BlobIndex blob_index; + Status s = blob_index.DecodeFrom(value_); + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + return false; + } + + FilePrefetchBuffer* prefetch_buffer = + prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer( + blob_index.file_number()) + : nullptr; + + uint64_t bytes_read = 0; + + assert(blob_fetcher_); + + s = blob_fetcher_->FetchBlob(ikey_.user_key, blob_index, + prefetch_buffer, &blob_value_, + &bytes_read); + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + return false; + } + + ++iter_stats_.num_blobs_read; + iter_stats_.total_blob_bytes_read += bytes_read; + + value_type = CompactionFilter::ValueType::kValue; + } + } + + if (decision == CompactionFilter::Decision::kUndetermined) { + const Slice* existing_val = nullptr; + const WideColumns* existing_col = nullptr; + + WideColumns existing_columns; + + if (ikey_.type != kTypeWideColumnEntity) { + if (!blob_value_.empty()) { + existing_val = &blob_value_; + } else { + existing_val = &value_; + } + } else { + Slice value_copy = value_; + const Status s = + WideColumnSerialization::Deserialize(value_copy, existing_columns); + + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + return false; + } + + existing_col = &existing_columns; + } + + decision = compaction_filter_->FilterV3( + level_, filter_key, value_type, existing_val, existing_col, + &compaction_filter_value_, &new_columns, + compaction_filter_skip_until_.rep()); + } + + iter_stats_.total_filter_time += + env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0; + } + + if (decision == CompactionFilter::Decision::kUndetermined) { + // Should not reach here, since FilterV2/FilterV3 should never return + // kUndetermined. + status_ = Status::NotSupported( + "FilterV2/FilterV3 should never return kUndetermined"); + validity_info_.Invalidate(); + return false; + } + + if (decision == CompactionFilter::Decision::kRemoveAndSkipUntil && + cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= + 0) { + // Can't skip to a key smaller than the current one. + // Keep the key as per FilterV2/FilterV3 documentation. 
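+    // For example, if the current user key is "b" and the filter asks to
+    // skip until "a" or "b", honoring the request would move the iterator
+    // backwards or leave it in place, so the decision is downgraded to
+    // kKeep instead.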
+ decision = CompactionFilter::Decision::kKeep; + } + + if (decision == CompactionFilter::Decision::kRemove) { + // convert the current key to a delete; key_ is pointing into + // current_key_ at this point, so updating current_key_ updates key() + ikey_.type = kTypeDeletion; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); + // no value associated with delete + value_.clear(); + iter_stats_.num_record_drop_user++; + } else if (decision == CompactionFilter::Decision::kPurge) { + // convert the current key to a single delete; key_ is pointing into + // current_key_ at this point, so updating current_key_ updates key() + ikey_.type = kTypeSingleDeletion; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeSingleDeletion); + // no value associated with single delete + value_.clear(); + iter_stats_.num_record_drop_user++; + } else if (decision == CompactionFilter::Decision::kChangeValue) { + if (ikey_.type != kTypeValue) { + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeValue); + } + + value_ = compaction_filter_value_; + } else if (decision == CompactionFilter::Decision::kRemoveAndSkipUntil) { + *need_skip = true; + compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, + kValueTypeForSeek); + *skip_until = compaction_filter_skip_until_.Encode(); + } else if (decision == CompactionFilter::Decision::kChangeBlobIndex) { + // Only the StackableDB-based BlobDB impl's compaction filter should return + // kChangeBlobIndex. Decision about rewriting blob and changing blob index + // in the integrated BlobDB impl is made in subsequent call to + // PrepareOutput() and its callees. + if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + status_ = Status::NotSupported( + "Only stacked BlobDB's internal compaction filter can return " + "kChangeBlobIndex."); + validity_info_.Invalidate(); + return false; + } + + if (ikey_.type != kTypeBlobIndex) { + ikey_.type = kTypeBlobIndex; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeBlobIndex); + } + + value_ = compaction_filter_value_; + } else if (decision == CompactionFilter::Decision::kIOError) { + if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + status_ = Status::NotSupported( + "CompactionFilter for integrated BlobDB should not return kIOError"); + validity_info_.Invalidate(); + return false; + } + + status_ = Status::IOError("Failed to access blob during compaction filter"); + validity_info_.Invalidate(); + return false; + } else if (decision == CompactionFilter::Decision::kChangeWideColumnEntity) { + WideColumns sorted_columns; + + sorted_columns.reserve(new_columns.size()); + for (const auto& column : new_columns) { + sorted_columns.emplace_back(column.first, column.second); + } + + std::sort(sorted_columns.begin(), sorted_columns.end(), + [](const WideColumn& lhs, const WideColumn& rhs) { + return lhs.name().compare(rhs.name()) < 0; + }); + + { + const Status s = WideColumnSerialization::Serialize( + sorted_columns, compaction_filter_value_); + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + return false; + } + } + + if (ikey_.type != kTypeWideColumnEntity) { + ikey_.type = kTypeWideColumnEntity; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeWideColumnEntity); + } + + value_ = compaction_filter_value_; + } + + return true; +} + +void CompactionIterator::NextFromInput() { + at_next_ = false; + validity_info_.Invalidate(); + + while (!Valid() && input_.Valid() && !IsPausingManualCompaction() && + !IsShuttingDown()) { + key_ 
= input_.key(); + value_ = input_.value(); + blob_value_.Reset(); + iter_stats_.num_input_records++; + is_range_del_ = input_.IsDeleteRangeSentinelKey(); + + Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); + if (!pik_status.ok()) { + iter_stats_.num_input_corrupt_records++; + + // If `expect_valid_internal_key_` is false, return the corrupted key + // and let the caller decide what to do with it. + if (expect_valid_internal_key_) { + status_ = pik_status; + return; + } + key_ = current_key_.SetInternalKey(key_); + has_current_user_key_ = false; + current_user_key_sequence_ = kMaxSequenceNumber; + current_user_key_snapshot_ = 0; + validity_info_.SetValid(ValidContext::kParseKeyError); + break; + } + TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_); + if (is_range_del_) { + validity_info_.SetValid(kRangeDeletion); + break; + } + // Update input statistics + if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion || + ikey_.type == kTypeDeletionWithTimestamp) { + iter_stats_.num_input_deletion_records++; + } + iter_stats_.total_input_raw_key_bytes += key_.size(); + iter_stats_.total_input_raw_value_bytes += value_.size(); + + // If need_skip is true, we should seek the input iterator + // to internal key skip_until and continue from there. + bool need_skip = false; + // Points either into compaction_filter_skip_until_ or into + // merge_helper_->compaction_filter_skip_until_. + Slice skip_until; + + bool user_key_equal_without_ts = false; + int cmp_ts = 0; + if (has_current_user_key_) { + user_key_equal_without_ts = + cmp_->EqualWithoutTimestamp(ikey_.user_key, current_user_key_); + // if timestamp_size_ > 0, then curr_ts_ has been initialized by a + // previous key. + cmp_ts = timestamp_size_ ? cmp_->CompareTimestamp( + ExtractTimestampFromUserKey( + ikey_.user_key, timestamp_size_), + curr_ts_) + : 0; + } + + // Check whether the user key changed. After this if statement current_key_ + // is a copy of the current input key (maybe converted to a delete by the + // compaction filter). ikey_.user_key is pointing to the copy. + if (!has_current_user_key_ || !user_key_equal_without_ts || cmp_ts != 0) { + // First occurrence of this user key + // Copy key for output + key_ = current_key_.SetInternalKey(key_, &ikey_); + + int prev_cmp_with_ts_low = + !full_history_ts_low_ ? 0 + : curr_ts_.empty() + ? 0 + : cmp_->CompareTimestamp(curr_ts_, *full_history_ts_low_); + + // If timestamp_size_ > 0, then copy from ikey_ to curr_ts_ for the use + // in next iteration to compare with the timestamp of next key. + UpdateTimestampAndCompareWithFullHistoryLow(); + + // If + // (1) !has_current_user_key_, OR + // (2) timestamp is disabled, OR + // (3) all history will be preserved, OR + // (4) user key (excluding timestamp) is different from previous key, OR + // (5) timestamp is NO older than *full_history_ts_low_, OR + // (6) timestamp is the largest one older than full_history_ts_low_, + // then current_user_key_ must be treated as a different user key. + // This means, if a user key (excluding ts) is the same as the previous + // user key, and its ts is older than *full_history_ts_low_, then we + // consider this key for GC, e.g. it may be dropped if certain conditions + // match. + if (!has_current_user_key_ || !timestamp_size_ || !full_history_ts_low_ || + !user_key_equal_without_ts || cmp_with_history_ts_low_ >= 0 || + prev_cmp_with_ts_low >= 0) { + // Initialize for future comparison for rule (A) and etc. 
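+        // ("Rule (A)" is the drop-hidden rule applied further below: an
+        // older version of a key may be dropped when a newer version of the
+        // same key is visible within the same snapshot stripe.)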
+ current_user_key_sequence_ = kMaxSequenceNumber; + current_user_key_snapshot_ = 0; + has_current_user_key_ = true; + } + current_user_key_ = ikey_.user_key; + + has_outputted_key_ = false; + + last_key_seq_zeroed_ = false; + + current_key_committed_ = KeyCommitted(ikey_.sequence); + + // Apply the compaction filter to the first committed version of the user + // key. + if (current_key_committed_ && + !InvokeFilterIfNeeded(&need_skip, &skip_until)) { + break; + } + } else { + // Update the current key to reflect the new sequence number/type without + // copying the user key. + // TODO(rven): Compaction filter does not process keys in this path + // Need to have the compaction filter process multiple versions + // if we have versions on both sides of a snapshot + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + key_ = current_key_.GetInternalKey(); + ikey_.user_key = current_key_.GetUserKey(); + + // Note that newer version of a key is ordered before older versions. If a + // newer version of a key is committed, so as the older version. No need + // to query snapshot_checker_ in that case. + if (UNLIKELY(!current_key_committed_)) { + assert(snapshot_checker_ != nullptr); + current_key_committed_ = KeyCommitted(ikey_.sequence); + // Apply the compaction filter to the first committed version of the + // user key. + if (current_key_committed_ && + !InvokeFilterIfNeeded(&need_skip, &skip_until)) { + break; + } + } + } + + if (UNLIKELY(!current_key_committed_)) { + assert(snapshot_checker_ != nullptr); + validity_info_.SetValid(ValidContext::kCurrentKeyUncommitted); + break; + } + + // If there are no snapshots, then this kv affect visibility at tip. + // Otherwise, search though all existing snapshots to find the earliest + // snapshot that is affected by this kv. + SequenceNumber last_sequence = current_user_key_sequence_; + current_user_key_sequence_ = ikey_.sequence; + SequenceNumber last_snapshot = current_user_key_snapshot_; + SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot + current_user_key_snapshot_ = + visible_at_tip_ + ? earliest_snapshot_ + : findEarliestVisibleSnapshot(ikey_.sequence, &prev_snapshot); + + if (need_skip) { + // This case is handled below. + } else if (clear_and_output_next_key_) { + // In the previous iteration we encountered a single delete that we could + // not compact out. We will keep this Put, but can drop it's data. + // (See Optimization 3, below.) 
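+      // The surviving PUT is normalized to kTypeValue and emitted with an
+      // empty value below: only the key must remain visible for
+      // write-conflict checking; its data will never be read.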
+ if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex && + ikey_.type != kTypeWideColumnEntity) { + ROCKS_LOG_FATAL(info_log_, "Unexpected key %s for compaction output", + ikey_.DebugString(allow_data_in_errors_, true).c_str()); + assert(false); + } + if (current_user_key_snapshot_ < last_snapshot) { + ROCKS_LOG_FATAL(info_log_, + "key %s, current_user_key_snapshot_ (%" PRIu64 + ") < last_snapshot (%" PRIu64 ")", + ikey_.DebugString(allow_data_in_errors_, true).c_str(), + current_user_key_snapshot_, last_snapshot); + assert(false); + } + + if (ikey_.type == kTypeBlobIndex || ikey_.type == kTypeWideColumnEntity) { + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + + value_.clear(); + validity_info_.SetValid(ValidContext::kKeepSDAndClearPut); + clear_and_output_next_key_ = false; + } else if (ikey_.type == kTypeSingleDeletion) { + // We can compact out a SingleDelete if: + // 1) We encounter the corresponding PUT -OR- we know that this key + // doesn't appear past this output level + // =AND= + // 2) We've already returned a record in this snapshot -OR- + // there are no earlier earliest_write_conflict_snapshot. + // + // A note about 2) above: + // we try to determine whether there is any earlier write conflict + // checking snapshot by calling DefinitelyInSnapshot() with seq and + // earliest_write_conflict_snapshot as arguments. For write-prepared + // and write-unprepared transactions, if earliest_write_conflict_snapshot + // is evicted from WritePreparedTxnDB::commit_cache, then + // DefinitelyInSnapshot(seq, earliest_write_conflict_snapshot) returns + // false, even if the seq is actually visible within + // earliest_write_conflict_snapshot. Consequently, CompactionIterator + // may try to zero out its sequence number, thus hitting assertion error + // in debug mode or cause incorrect DBIter return result. + // We observe that earliest_write_conflict_snapshot >= earliest_snapshot, + // and the seq zeroing logic depends on + // DefinitelyInSnapshot(seq, earliest_snapshot). Therefore, if we cannot + // determine whether seq is **definitely** in + // earliest_write_conflict_snapshot, then we can additionally check if + // seq is definitely in earliest_snapshot. If the latter holds, then the + // former holds too. + // + // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to + // allow Transactions to do write-conflict checking (if we compacted away + // all keys, then we wouldn't know that a write happened in this + // snapshot). If there is no earlier snapshot, then we know that there + // are no active transactions that need to know about any writes. + // + // Optimization 3: + // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT + // true, then we must output a SingleDelete. In this case, we will decide + // to also output the PUT. While we are compacting less by outputting the + // PUT now, hopefully this will lead to better compaction in the future + // when Rule 2 is later true (Ie, We are hoping we can later compact out + // both the SingleDelete and the Put, while we couldn't if we only + // outputted the SingleDelete now). + // In this case, we can save space by removing the PUT's value as it will + // never be read. + // + // Deletes and Merges are not supported on the same key that has a + // SingleDelete as it is not possible to correctly do any partial + // compaction of such a combination of operations. 
The result of mixing + // those operations for a given key is documented as being undefined. So + // we can choose how to handle such a combinations of operations. We will + // try to compact out as much as we can in these cases. + // We will report counts on these anomalous cases. + // + // Note: If timestamp is enabled, then record will be eligible for + // deletion, only if, along with above conditions (Rule 1 and Rule 2) + // full_history_ts_low_ is specified and timestamp for that key is less + // than *full_history_ts_low_. If it's not eligible for deletion, then we + // will output the SingleDelete. For Optimization 3 also, if + // full_history_ts_low_ is specified and timestamp for the key is less + // than *full_history_ts_low_ then only optimization will be applied. + + // The easiest way to process a SingleDelete during iteration is to peek + // ahead at the next key. + const bool is_timestamp_eligible_for_gc = + (timestamp_size_ == 0 || + (full_history_ts_low_ && cmp_with_history_ts_low_ < 0)); + + ParsedInternalKey next_ikey; + AdvanceInputIter(); + while (input_.Valid() && input_.IsDeleteRangeSentinelKey() && + ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok() && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) { + // skip range tombstone start keys with the same user key + // since they are not "real" point keys. + AdvanceInputIter(); + } + + // Check whether the next key exists, is not corrupt, and is the same key + // as the single delete. + if (input_.Valid() && + ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok() && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) { + assert(!input_.IsDeleteRangeSentinelKey()); +#ifndef NDEBUG + const Compaction* c = + compaction_ ? compaction_->real_compaction() : nullptr; +#endif + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:SingleDelete:1", + const_cast(c)); + if (last_key_seq_zeroed_) { + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_obsolete; + assert(bottommost_level_); + AdvanceInputIter(); + } else if (prev_snapshot == 0 || + DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot)) { + // Check whether the next key belongs to the same snapshot as the + // SingleDelete. + + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:SingleDelete:2", nullptr); + if (next_ikey.type == kTypeSingleDeletion) { + // We encountered two SingleDeletes for same key in a row. This + // could be due to unexpected user input. If write-(un)prepared + // transaction is used, this could also be due to releasing an old + // snapshot between a Put and its matching SingleDelete. + // Skip the first SingleDelete and let the next iteration decide + // how to handle the second SingleDelete. + + // First SingleDelete has been skipped since we already called + // input_.Next(). + ++iter_stats_.num_record_drop_obsolete; + ++iter_stats_.num_single_del_mismatch; + } else if (next_ikey.type == kTypeDeletion) { + std::ostringstream oss; + oss << "Found SD and type: " << static_cast(next_ikey.type) + << " on the same key, violating the contract " + "of SingleDelete. Check your application to make sure the " + "application does not mix SingleDelete and Delete for " + "the same key. If you are using " + "write-prepared/write-unprepared transactions, and use " + "SingleDelete to delete certain keys, then make sure " + "TransactionDBOptions::rollback_deletion_type_callback is " + "configured properly. 
Mixing SD and DEL can lead to " + "undefined behaviors"; + ++iter_stats_.num_record_drop_obsolete; + ++iter_stats_.num_single_del_mismatch; + if (enforce_single_del_contracts_) { + ROCKS_LOG_ERROR(info_log_, "%s", oss.str().c_str()); + validity_info_.Invalidate(); + status_ = Status::Corruption(oss.str()); + return; + } + ROCKS_LOG_WARN(info_log_, "%s", oss.str().c_str()); + } else if (!is_timestamp_eligible_for_gc) { + // We cannot drop the SingleDelete as timestamp is enabled, and + // timestamp of this key is greater than or equal to + // *full_history_ts_low_. We will output the SingleDelete. + validity_info_.SetValid(ValidContext::kKeepTsHistory); + } else if (has_outputted_key_ || + DefinitelyInSnapshot(ikey_.sequence, + earliest_write_conflict_snapshot_) || + (earliest_snapshot_ < earliest_write_conflict_snapshot_ && + DefinitelyInSnapshot(ikey_.sequence, + earliest_snapshot_))) { + // Found a matching value, we can drop the single delete and the + // value. It is safe to drop both records since we've already + // outputted a key in this snapshot, or there is no earlier + // snapshot (Rule 2 above). + + // Note: it doesn't matter whether the second key is a Put or if it + // is an unexpected Merge or Delete. We will compact it out + // either way. We will maintain counts of how many mismatches + // happened + if (next_ikey.type != kTypeValue && + next_ikey.type != kTypeBlobIndex && + next_ikey.type != kTypeWideColumnEntity) { + ++iter_stats_.num_single_del_mismatch; + } + + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_obsolete; + // Already called input_.Next() once. Call it a second time to + // skip past the second key. + AdvanceInputIter(); + } else { + // Found a matching value, but we cannot drop both keys since + // there is an earlier snapshot and we need to leave behind a record + // to know that a write happened in this snapshot (Rule 2 above). + // Clear the value and output the SingleDelete. (The value will be + // outputted on the next iteration.) + + // Setting valid_ to true will output the current SingleDelete + validity_info_.SetValid(ValidContext::kKeepSDForConflictCheck); + + // Set up the Put to be outputted in the next iteration. + // (Optimization 3). + clear_and_output_next_key_ = true; + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:KeepSDForWW", + /*arg=*/nullptr); + } + } else { + // We hit the next snapshot without hitting a put, so the iterator + // returns the single delete. + validity_info_.SetValid(ValidContext::kKeepSDForSnapshot); + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:SingleDelete:3", + const_cast(c)); + } + } else { + // We are at the end of the input, could not parse the next key, or hit + // a different key. The iterator returns the single delete if the key + // possibly exists beyond the current output level. We set + // has_current_user_key to false so that if the iterator is at the next + // key, we do not compare it again against the previous key at the next + // iteration. If the next key is corrupt, we return before the + // comparison, so the value of has_current_user_key does not matter. + has_current_user_key_ = false; + if (compaction_ != nullptr && + DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && + compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, + &level_ptrs_) && + is_timestamp_eligible_for_gc) { + // Key doesn't exist outside of this range. + // Can compact out this SingleDelete. 
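+          // (Safe to drop: the sequence number is definitely visible in the
+          // earliest snapshot and the key does not appear beyond the output
+          // level, so neither readers nor write-conflict checks can still
+          // need this tombstone.)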
+ ++iter_stats_.num_record_drop_obsolete; + ++iter_stats_.num_single_del_fallthru; + if (!bottommost_level_) { + ++iter_stats_.num_optimized_del_drop_obsolete; + } + } else if (last_key_seq_zeroed_) { + // Skip. + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_obsolete; + assert(bottommost_level_); + } else { + // Output SingleDelete + validity_info_.SetValid(ValidContext::kKeepSD); + } + } + + if (Valid()) { + at_next_ = true; + } + } else if (last_snapshot == current_user_key_snapshot_ || + (last_snapshot > 0 && + last_snapshot < current_user_key_snapshot_)) { + // If the earliest snapshot is which this key is visible in + // is the same as the visibility of a previous instance of the + // same key, then this kv is not visible in any snapshot. + // Hidden by an newer entry for same user key + // + // Note: Dropping this key will not affect TransactionDB write-conflict + // checking since there has already been a record returned for this key + // in this snapshot. + if (last_sequence < current_user_key_sequence_) { + ROCKS_LOG_FATAL(info_log_, + "key %s, last_sequence (%" PRIu64 + ") < current_user_key_sequence_ (%" PRIu64 ")", + ikey_.DebugString(allow_data_in_errors_, true).c_str(), + last_sequence, current_user_key_sequence_); + assert(false); + } + + ++iter_stats_.num_record_drop_hidden; // rule (A) + AdvanceInputIter(); + } else if (compaction_ != nullptr && + (ikey_.type == kTypeDeletion || + (ikey_.type == kTypeDeletionWithTimestamp && + cmp_with_history_ts_low_ < 0)) && + DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && + compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, + &level_ptrs_)) { + // TODO(noetzli): This is the only place where we use compaction_ + // (besides the constructor). We should probably get rid of this + // dependency and find a way to do similar filtering during flushes. + // + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger sequence numbers + // (3) data in layers that are being compacted here and have + // smaller sequence numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. + // + // Note: Dropping this Delete will not affect TransactionDB + // write-conflict checking since it is earlier than any snapshot. + // + // It seems that we can also drop deletion later than earliest snapshot + // given that: + // (1) The deletion is earlier than earliest_write_conflict_snapshot, and + // (2) No value exist earlier than the deletion. + // + // Note also that a deletion marker of type kTypeDeletionWithTimestamp + // will be treated as a different user key unless the timestamp is older + // than *full_history_ts_low_. + ++iter_stats_.num_record_drop_obsolete; + if (!bottommost_level_) { + ++iter_stats_.num_optimized_del_drop_obsolete; + } + AdvanceInputIter(); + } else if ((ikey_.type == kTypeDeletion || + (ikey_.type == kTypeDeletionWithTimestamp && + cmp_with_history_ts_low_ < 0)) && + bottommost_level_) { + // Handle the case where we have a delete key at the bottom most level + // We can skip outputting the key iff there are no subsequent puts for + // this key + assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel( + ikey_.user_key, &level_ptrs_)); + ParsedInternalKey next_ikey; + AdvanceInputIter(); +#ifndef NDEBUG + const Compaction* c = + compaction_ ? 
compaction_->real_compaction() : nullptr; +#endif + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:BottommostDelete:1", + const_cast(c)); + // Skip over all versions of this key that happen to occur in the same + // snapshot range as the delete. + // + // Note that a deletion marker of type kTypeDeletionWithTimestamp will be + // considered to have a different user key unless the timestamp is older + // than *full_history_ts_low_. + // + // Range tombstone start keys are skipped as they are not "real" keys. + while (!IsPausingManualCompaction() && !IsShuttingDown() && + input_.Valid() && + (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok()) && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) && + (prev_snapshot == 0 || input_.IsDeleteRangeSentinelKey() || + DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot))) { + AdvanceInputIter(); + } + // If you find you still need to output a row with this key, we need to + // output the delete too + if (input_.Valid() && + (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok()) && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) { + validity_info_.SetValid(ValidContext::kKeepDel); + at_next_ = true; + } + } else if (ikey_.type == kTypeMerge) { + if (!merge_helper_->HasOperator()) { + status_ = Status::InvalidArgument( + "merge_operator is not properly initialized."); + return; + } + + pinned_iters_mgr_.StartPinning(); + + // We know the merge type entry is not hidden, otherwise we would + // have hit (A) + // We encapsulate the merge related state machine in a different + // object to minimize change to the existing flow. + merge_until_status_ = merge_helper_->MergeUntil( + &input_, range_del_agg_, prev_snapshot, bottommost_level_, + allow_data_in_errors_, blob_fetcher_.get(), full_history_ts_low_, + prefetch_buffers_.get(), &iter_stats_); + merge_out_iter_.SeekToFirst(); + + if (!merge_until_status_.ok() && + !merge_until_status_.IsMergeInProgress()) { + status_ = merge_until_status_; + return; + } else if (merge_out_iter_.Valid()) { + // NOTE: key, value, and ikey_ refer to old entries. + // These will be correctly set below. + key_ = merge_out_iter_.key(); + value_ = merge_out_iter_.value(); + pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); + // MergeUntil stops when it encounters a corrupt key and does not + // include them in the result, so we expect the keys here to valid. + if (!pik_status.ok()) { + ROCKS_LOG_FATAL( + info_log_, "Invalid key %s in compaction. %s", + allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden", + pik_status.getState()); + assert(false); + } + // Keep current_key_ in sync. + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + key_ = current_key_.GetInternalKey(); + ikey_.user_key = current_key_.GetUserKey(); + validity_info_.SetValid(ValidContext::kMerge2); + } else { + // all merge operands were filtered out. reset the user key, since the + // batch consumed by the merge operator should not shadow any keys + // coming after the merges + has_current_user_key_ = false; + pinned_iters_mgr_.ReleasePinnedData(); + + if (merge_helper_->FilteredUntil(&skip_until)) { + need_skip = true; + } + } + } else { + // 1. new user key -OR- + // 2. different snapshot stripe + // If user-defined timestamp is enabled, we consider keys for GC if they + // are below history_ts_low_. 
CompactionRangeDelAggregator::ShouldDelete() + // only considers range deletions that are at or below history_ts_low_ and + // trim_ts_. We drop keys here that are below history_ts_low_ and are + // covered by a range tombstone that is at or below history_ts_low_ and + // trim_ts. + bool should_delete = false; + if (!timestamp_size_ || cmp_with_history_ts_low_ < 0) { + should_delete = range_del_agg_->ShouldDelete( + key_, RangeDelPositioningMode::kForwardTraversal); + } + if (should_delete) { + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_range_del; + AdvanceInputIter(); + } else { + validity_info_.SetValid(ValidContext::kNewUserKey); + } + } + + if (need_skip) { + SkipUntil(skip_until); + } + } + + if (!Valid() && IsShuttingDown()) { + status_ = Status::ShutdownInProgress(); + } + + if (IsPausingManualCompaction()) { + status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + + // Propagate corruption status from memtable itereator + if (!input_.Valid() && input_.status().IsCorruption()) { + status_ = input_.status(); + } +} + +bool CompactionIterator::ExtractLargeValueIfNeededImpl() { + if (!blob_file_builder_) { + return false; + } + + blob_index_.clear(); + const Status s = blob_file_builder_->Add(user_key(), value_, &blob_index_); + + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + + return false; + } + + if (blob_index_.empty()) { + return false; + } + + value_ = blob_index_; + + return true; +} + +void CompactionIterator::ExtractLargeValueIfNeeded() { + assert(ikey_.type == kTypeValue); + + if (!ExtractLargeValueIfNeededImpl()) { + return; + } + + ikey_.type = kTypeBlobIndex; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); +} + +void CompactionIterator::GarbageCollectBlobIfNeeded() { + assert(ikey_.type == kTypeBlobIndex); + + if (!compaction_) { + return; + } + + // GC for integrated BlobDB + if (compaction_->enable_blob_garbage_collection()) { + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::GarbageCollectBlobIfNeeded::TamperWithBlobIndex", + &value_); + + BlobIndex blob_index; + + { + const Status s = blob_index.DecodeFrom(value_); + + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + + return; + } + } + + if (blob_index.file_number() >= + blob_garbage_collection_cutoff_file_number_) { + return; + } + + FilePrefetchBuffer* prefetch_buffer = + prefetch_buffers_ ? 
prefetch_buffers_->GetOrCreatePrefetchBuffer( + blob_index.file_number()) + : nullptr; + + uint64_t bytes_read = 0; + + { + assert(blob_fetcher_); + + const Status s = blob_fetcher_->FetchBlob( + user_key(), blob_index, prefetch_buffer, &blob_value_, &bytes_read); + + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + + return; + } + } + + ++iter_stats_.num_blobs_read; + iter_stats_.total_blob_bytes_read += bytes_read; + + ++iter_stats_.num_blobs_relocated; + iter_stats_.total_blob_bytes_relocated += blob_index.size(); + + value_ = blob_value_; + + if (ExtractLargeValueIfNeededImpl()) { + return; + } + + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + + return; + } + + // GC for stacked BlobDB + if (compaction_filter_ && + compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + const auto blob_decision = compaction_filter_->PrepareBlobOutput( + user_key(), value_, &compaction_filter_value_); + + if (blob_decision == CompactionFilter::BlobDecision::kCorruption) { + status_ = + Status::Corruption("Corrupted blob reference encountered during GC"); + validity_info_.Invalidate(); + + return; + } + + if (blob_decision == CompactionFilter::BlobDecision::kIOError) { + status_ = Status::IOError("Could not relocate blob during GC"); + validity_info_.Invalidate(); + + return; + } + + if (blob_decision == CompactionFilter::BlobDecision::kChangeValue) { + value_ = compaction_filter_value_; + + return; + } + } +} + +void CompactionIterator::DecideOutputLevel() { + assert(compaction_->SupportsPerKeyPlacement()); + output_to_penultimate_level_ = false; + // if the key is newer than the cutoff sequence or within the earliest + // snapshot, it should output to the penultimate level. + if (ikey_.sequence > preclude_last_level_min_seqno_ || + ikey_.sequence > earliest_snapshot_) { + output_to_penultimate_level_ = true; + } + +#ifndef NDEBUG + // Could be overridden by unittest + PerKeyPlacementContext context(level_, ikey_.user_key, value_, ikey_.sequence, + output_to_penultimate_level_); + TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context", + &context); + if (ikey_.sequence > earliest_snapshot_) { + output_to_penultimate_level_ = true; + } +#endif // NDEBUG + + if (output_to_penultimate_level_) { + // If it's decided to output to the penultimate level, but unsafe to do so, + // still output to the last level. For example, moving the data from a lower + // level to a higher level outside of the higher-level input key range is + // considered unsafe, because the key may conflict with higher-level SSTs + // not from this compaction. + // TODO: add statistic for declined output_to_penultimate_level + bool safe_to_penultimate_level = + compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key); + if (!safe_to_penultimate_level) { + output_to_penultimate_level_ = false; + // It could happen when disable/enable `last_level_temperature` while + // holding a snapshot. When `last_level_temperature` is not set + // (==kUnknown), the data newer than any snapshot is pushed to the last + // level, but when the per_key_placement feature is enabled on the fly, + // the data later than the snapshot has to be moved to the penultimate + // level, which may or may not be safe. So the user needs to make sure all + // snapshot is released before enabling `last_level_temperature` feature + // We will migrate the feature to `last_level_temperature` and maybe make + // it not dynamically changeable. 
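+      // If the sequence number is still newer than the earliest snapshot at
+      // this point, the key would be demoted below a live snapshot, which is
+      // why a Corruption status is raised below rather than silently
+      // misplacing the data.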
+ if (ikey_.sequence > earliest_snapshot_) { + status_ = Status::Corruption( + "Unsafe to store Seq later than snapshot in the last level if " + "per_key_placement is enabled"); + } + } + } +} + +void CompactionIterator::PrepareOutput() { + if (Valid()) { + if (LIKELY(!is_range_del_)) { + if (ikey_.type == kTypeValue) { + ExtractLargeValueIfNeeded(); + } else if (ikey_.type == kTypeBlobIndex) { + GarbageCollectBlobIfNeeded(); + } + } + + if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) { + DecideOutputLevel(); + } + + // Zeroing out the sequence number leads to better compression. + // If this is the bottommost level (no files in lower levels) + // and the earliest snapshot is larger than this seqno + // and the userkey differs from the last userkey in compaction + // then we can squash the seqno to zero. + // + // This is safe for TransactionDB write-conflict checking since transactions + // only care about sequence number larger than any active snapshots. + // + // Can we do the same for levels above bottom level as long as + // KeyNotExistsBeyondOutputLevel() return true? + if (Valid() && compaction_ != nullptr && + !compaction_->allow_ingest_behind() && bottommost_level_ && + DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && + ikey_.type != kTypeMerge && current_key_committed_ && + !output_to_penultimate_level_ && + ikey_.sequence < preserve_time_min_seqno_ && !is_range_del_) { + if (ikey_.type == kTypeDeletion || + (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) { + ROCKS_LOG_FATAL( + info_log_, + "Unexpected key %s for seq-zero optimization. " + "earliest_snapshot %" PRIu64 + ", earliest_write_conflict_snapshot %" PRIu64 + " job_snapshot %" PRIu64 + ". timestamp_size: %d full_history_ts_low_ %s. validity %x", + ikey_.DebugString(allow_data_in_errors_, true).c_str(), + earliest_snapshot_, earliest_write_conflict_snapshot_, + job_snapshot_, static_cast(timestamp_size_), + full_history_ts_low_ != nullptr + ? Slice(*full_history_ts_low_).ToString(true).c_str() + : "null", + validity_info_.rep); + assert(false); + } + ikey_.sequence = 0; + last_key_seq_zeroed_ = true; + TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq", + &ikey_); + if (!timestamp_size_) { + current_key_.UpdateInternalKey(0, ikey_.type); + } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) { + // We can also zero out timestamp for better compression. + // For the same user key (excluding timestamp), the timestamp-based + // history can be collapsed to save some space if the timestamp is + // older than *full_history_ts_low_. 
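+        // (The minimum timestamp is encoded as timestamp_size_ zero bytes;
+        // keys whose timestamps are older than *full_history_ts_low_ can all
+        // share it, which improves compression.)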
+        const std::string kTsMin(timestamp_size_, static_cast<char>(0));
+        const Slice ts_slice = kTsMin;
+        ikey_.SetTimestamp(ts_slice);
+        current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice);
+      }
+    }
+  }
+}
+
+inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot(
+    SequenceNumber in, SequenceNumber* prev_snapshot) {
+  assert(snapshots_->size());
+  if (snapshots_->size() == 0) {
+    ROCKS_LOG_FATAL(info_log_,
+                    "No snapshot left in findEarliestVisibleSnapshot");
+  }
+  auto snapshots_iter =
+      std::lower_bound(snapshots_->begin(), snapshots_->end(), in);
+  assert(prev_snapshot != nullptr);
+  if (snapshots_iter == snapshots_->begin()) {
+    *prev_snapshot = 0;
+  } else {
+    *prev_snapshot = *std::prev(snapshots_iter);
+    if (*prev_snapshot >= in) {
+      ROCKS_LOG_FATAL(info_log_,
+                      "*prev_snapshot (%" PRIu64 ") >= in (%" PRIu64
+                      ") in findEarliestVisibleSnapshot",
+                      *prev_snapshot, in);
+      assert(false);
+    }
+  }
+  if (snapshot_checker_ == nullptr) {
+    return snapshots_iter != snapshots_->end() ? *snapshots_iter
+                                               : kMaxSequenceNumber;
+  }
+  bool has_released_snapshot = !released_snapshots_.empty();
+  for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) {
+    auto cur = *snapshots_iter;
+    if (in > cur) {
+      ROCKS_LOG_FATAL(info_log_,
+                      "in (%" PRIu64 ") > cur (%" PRIu64
+                      ") in findEarliestVisibleSnapshot",
+                      in, cur);
+      assert(false);
+    }
+    // Skip if cur is in released_snapshots.
+    if (has_released_snapshot && released_snapshots_.count(cur) > 0) {
+      continue;
+    }
+    auto res = snapshot_checker_->CheckInSnapshot(in, cur);
+    if (res == SnapshotCheckerResult::kInSnapshot) {
+      return cur;
+    } else if (res == SnapshotCheckerResult::kSnapshotReleased) {
+      released_snapshots_.insert(cur);
+    }
+    *prev_snapshot = cur;
+  }
+  return kMaxSequenceNumber;
+}
+
+uint64_t CompactionIterator::ComputeBlobGarbageCollectionCutoffFileNumber(
+    const CompactionProxy* compaction) {
+  if (!compaction) {
+    return 0;
+  }
+
+  if (!compaction->enable_blob_garbage_collection()) {
+    return 0;
+  }
+
+  const Version* const version = compaction->input_version();
+  assert(version);
+
+  const VersionStorageInfo* const storage_info = version->storage_info();
+  assert(storage_info);
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+
+  const size_t cutoff_index = static_cast<size_t>(
+      compaction->blob_garbage_collection_age_cutoff() * blob_files.size());
+
+  if (cutoff_index >= blob_files.size()) {
+    return std::numeric_limits<uint64_t>::max();
+  }
+
+  const auto& meta = blob_files[cutoff_index];
+  assert(meta);
+
+  return meta->GetBlobFileNumber();
+}
+
+std::unique_ptr<BlobFetcher> CompactionIterator::CreateBlobFetcherIfNeeded(
+    const CompactionProxy* compaction) {
+  if (!compaction) {
+    return nullptr;
+  }
+
+  const Version* const version = compaction->input_version();
+  if (!version) {
+    return nullptr;
+  }
+
+  ReadOptions read_options;
+  read_options.io_activity = Env::IOActivity::kCompaction;
+  read_options.fill_cache = false;
+
+  return std::unique_ptr<BlobFetcher>(new BlobFetcher(version, read_options));
+}
+
+std::unique_ptr<PrefetchBufferCollection>
+CompactionIterator::CreatePrefetchBufferCollectionIfNeeded(
+    const CompactionProxy* compaction) {
+  if (!compaction) {
+    return nullptr;
+  }
+
+  if (!compaction->input_version()) {
+    return nullptr;
+  }
+
+  if (compaction->allow_mmap_reads()) {
+    return nullptr;
+  }
+
+  const uint64_t readahead_size = compaction->blob_compaction_readahead_size();
+  if (!readahead_size) {
+    return nullptr;
+  }
+
+  return std::unique_ptr<PrefetchBufferCollection>(
+      new PrefetchBufferCollection(readahead_size));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
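[Reviewer note] A quick worked example of the cutoff computation in `ComputeBlobGarbageCollectionCutoffFileNumber` above, with illustrative values only:

    // Suppose the version holds 4 blob files numbered {8, 11, 14, 17} and
    // blob_garbage_collection_age_cutoff is 0.5:
    const std::vector<uint64_t> blob_file_numbers = {8, 11, 14, 17};
    const double age_cutoff = 0.5;
    const size_t cutoff_index =
        static_cast<size_t>(age_cutoff * blob_file_numbers.size());   // == 2
    const uint64_t cutoff_file_number = blob_file_numbers[cutoff_index];  // 14
    // Only blob references with file_number < 14 (files 8 and 11) are
    // relocated by GarbageCollectBlobIfNeeded.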
a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iterator.h
new file mode 100644
index 0000000000..ea2dc062e2
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_iterator.h
@@ -0,0 +1,530 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <algorithm>
+#include <cinttypes>
+#include <deque>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileBuilder;
+class BlobFetcher;
+class PrefetchBufferCollection;
+
+// A wrapper of internal iterator whose purpose is to count how
+// many entries there are in the iterator.
+class SequenceIterWrapper : public InternalIterator {
+ public:
+  SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp,
+                      bool need_count_entries)
+      : icmp_(cmp),
+        inner_iter_(iter),
+        need_count_entries_(need_count_entries) {}
+  bool Valid() const override { return inner_iter_->Valid(); }
+  Status status() const override { return inner_iter_->status(); }
+  void Next() override {
+    num_itered_++;
+    inner_iter_->Next();
+  }
+  void Seek(const Slice& target) override {
+    if (!need_count_entries_) {
+      inner_iter_->Seek(target);
+    } else {
+      // For flush cases, we need to count the total number of entries, so we
+      // do Next() rather than Seek().
+      while (inner_iter_->Valid() &&
+             icmp_.Compare(inner_iter_->key(), target) < 0) {
+        Next();
+      }
+    }
+  }
+  Slice key() const override { return inner_iter_->key(); }
+  Slice value() const override { return inner_iter_->value(); }
+
+  // Unused InternalIterator methods
+  void SeekToFirst() override { assert(false); }
+  void Prev() override { assert(false); }
+  void SeekForPrev(const Slice& /* target */) override { assert(false); }
+  void SeekToLast() override { assert(false); }
+
+  uint64_t num_itered() const { return num_itered_; }
+  bool IsDeleteRangeSentinelKey() const override {
+    assert(Valid());
+    return inner_iter_->IsDeleteRangeSentinelKey();
+  }
+
+ private:
+  InternalKeyComparator icmp_;
+  InternalIterator* inner_iter_;  // not owned
+  uint64_t num_itered_ = 0;
+  bool need_count_entries_;
+};
+
+class CompactionIterator {
+ public:
+  // A wrapper around Compaction. Has a much smaller interface, only what
+  // CompactionIterator uses. Tests can override it.
+  class CompactionProxy {
+   public:
+    virtual ~CompactionProxy() = default;
+
+    virtual int level() const = 0;
+
+    virtual bool KeyNotExistsBeyondOutputLevel(
+        const Slice& user_key, std::vector<size_t>* level_ptrs) const = 0;
+
+    virtual bool bottommost_level() const = 0;
+
+    virtual int number_levels() const = 0;
+
+    // Result includes timestamp if user-defined timestamp is enabled.
+    virtual Slice GetLargestUserKey() const = 0;
+
+    virtual bool allow_ingest_behind() const = 0;
+
+    virtual bool allow_mmap_reads() const = 0;
+
+    virtual bool enable_blob_garbage_collection() const = 0;
+
+    virtual double blob_garbage_collection_age_cutoff() const = 0;
+
+    virtual uint64_t blob_compaction_readahead_size() const = 0;
+
+    virtual const Version* input_version() const = 0;
+
+    virtual bool DoesInputReferenceBlobFiles() const = 0;
+
+    virtual const Compaction* real_compaction() const = 0;
+
+    virtual bool SupportsPerKeyPlacement() const = 0;
+
+    // `key` includes timestamp if user-defined timestamp is enabled.
+    virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0;
+  };
+
+  class RealCompaction : public CompactionProxy {
+   public:
+    explicit RealCompaction(const Compaction* compaction)
+        : compaction_(compaction) {
+      assert(compaction_);
+      assert(compaction_->immutable_options());
+      assert(compaction_->mutable_cf_options());
+    }
+
+    int level() const override { return compaction_->level(); }
+
+    bool KeyNotExistsBeyondOutputLevel(
+        const Slice& user_key, std::vector<size_t>* level_ptrs) const override {
+      return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs);
+    }
+
+    bool bottommost_level() const override {
+      return compaction_->bottommost_level();
+    }
+
+    int number_levels() const override { return compaction_->number_levels(); }
+
+    // Result includes timestamp if user-defined timestamp is enabled.
+    Slice GetLargestUserKey() const override {
+      return compaction_->GetLargestUserKey();
+    }
+
+    bool allow_ingest_behind() const override {
+      return compaction_->immutable_options()->allow_ingest_behind;
+    }
+
+    bool allow_mmap_reads() const override {
+      return compaction_->immutable_options()->allow_mmap_reads;
+    }
+
+    bool enable_blob_garbage_collection() const override {
+      return compaction_->enable_blob_garbage_collection();
+    }
+
+    double blob_garbage_collection_age_cutoff() const override {
+      return compaction_->blob_garbage_collection_age_cutoff();
+    }
+
+    uint64_t blob_compaction_readahead_size() const override {
+      return compaction_->mutable_cf_options()->blob_compaction_readahead_size;
+    }
+
+    const Version* input_version() const override {
+      return compaction_->input_version();
+    }
+
+    bool DoesInputReferenceBlobFiles() const override {
+      return compaction_->DoesInputReferenceBlobFiles();
+    }
+
+    const Compaction* real_compaction() const override { return compaction_; }
+
+    bool SupportsPerKeyPlacement() const override {
+      return compaction_->SupportsPerKeyPlacement();
+    }
+
+    // Check if key is within the penultimate level output range, to see if it's
+    // safe to output to the penultimate level for the per_key_placement feature.
+    // `key` includes timestamp if user-defined timestamp is enabled.
+    bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
+      return compaction_->WithinPenultimateLevelOutputRange(key);
+    }
+
+   private:
+    const Compaction* compaction_;
+  };
+
+  CompactionIterator(
+      InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+      SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+      SequenceNumber earliest_write_conflict_snapshot,
+      SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+      Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+      CompactionRangeDelAggregator* range_del_agg,
+      BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+      bool enforce_single_del_contracts,
+      const std::atomic<bool>& manual_compaction_canceled,
+      const Compaction* compaction = nullptr,
+      const CompactionFilter* compaction_filter = nullptr,
+      const std::atomic<bool>* shutting_down = nullptr,
+      const std::shared_ptr<Logger> info_log = nullptr,
+      const std::string* full_history_ts_low = nullptr,
+      const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber,
+      const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber);
+
+  // Constructor with custom CompactionProxy, used for tests.
+  CompactionIterator(
+      InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+      SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+      SequenceNumber earliest_write_conflict_snapshot,
+      SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+      Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+      CompactionRangeDelAggregator* range_del_agg,
+      BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+      bool enforce_single_del_contracts,
+      const std::atomic<bool>& manual_compaction_canceled,
+      std::unique_ptr<CompactionProxy> compaction,
+      const CompactionFilter* compaction_filter = nullptr,
+      const std::atomic<bool>* shutting_down = nullptr,
+      const std::shared_ptr<Logger> info_log = nullptr,
+      const std::string* full_history_ts_low = nullptr,
+      const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber,
+      const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber);
+
+  ~CompactionIterator();
+
+  void ResetRecordCounts();
+
+  // Seek to the beginning of the compaction iterator output.
+  //
+  // REQUIRED: Call only once.
+  void SeekToFirst();
+
+  // Produces the next record in the compaction.
+  //
+  // REQUIRED: SeekToFirst() has been called.
+  void Next();
+
+  // Getters
+  const Slice& key() const { return key_; }
+  const Slice& value() const { return value_; }
+  const Status& status() const { return status_; }
+  const ParsedInternalKey& ikey() const { return ikey_; }
+  inline bool Valid() const { return validity_info_.IsValid(); }
+  const Slice& user_key() const {
+    if (UNLIKELY(is_range_del_)) {
+      return ikey_.user_key;
+    }
+    return current_user_key_;
+  }
+  const CompactionIterationStats& iter_stats() const { return iter_stats_; }
+  uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
+  // Whether the current key should be placed on the penultimate level; only
+  // valid if per_key_placement is supported
+  bool output_to_penultimate_level() const {
+    return output_to_penultimate_level_;
+  }
+  Status InputStatus() const { return input_.status(); }
+
+  bool IsDeleteRangeSentinelKey() const { return is_range_del_; }
+
+ private:
+  // Processes the input stream to find the next output
+  void NextFromInput();
+
+  // Do final preparations before presenting the output to the callee.
+  void PrepareOutput();
+
+  // Decide whether the current key should be output to the last level or the
+  // penultimate level; only called for compactions that support per-key
+  // placement
+  void DecideOutputLevel();
+
+  // Passes the output value to the blob file builder (if any), and replaces it
+  // with the corresponding blob reference if it has been actually written to a
+  // blob file (i.e. if it passed the value size check). Returns true if the
+  // value got extracted to a blob file, false otherwise.
+  bool ExtractLargeValueIfNeededImpl();
+
+  // Extracts large values as described above, and updates the internal key's
+  // type to kTypeBlobIndex if the value got extracted. Should only be called
+  // for regular values (kTypeValue).
+  void ExtractLargeValueIfNeeded();
+
+  // Relocates valid blobs residing in the oldest blob files if garbage
+  // collection is enabled. Relocated blobs are written to new blob files or
+  // inlined in the LSM tree depending on the current settings (i.e.
+  // enable_blob_files and min_blob_size). Should only be called for blob
+  // references (kTypeBlobIndex).
+  //
+  // Note: the stacked BlobDB implementation's compaction filter based GC
+  // algorithm is also called from here.
+  void GarbageCollectBlobIfNeeded();
+
+  // Invoke compaction filter if needed.
+  // Return true on success, false on failures (e.g.: kIOError).
+  bool InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until);
+
+  // Given a sequence number, return the sequence number of the
+  // earliest snapshot that this sequence number is visible in.
+  // The snapshots themselves are arranged in ascending order of
+  // sequence numbers.
+  // Employ a sequential search because the total number of
+  // snapshots is typically small.
+  inline SequenceNumber findEarliestVisibleSnapshot(
+      SequenceNumber in, SequenceNumber* prev_snapshot);
+
+  inline bool KeyCommitted(SequenceNumber sequence) {
+    return snapshot_checker_ == nullptr ||
+           snapshot_checker_->CheckInSnapshot(sequence, job_snapshot_) ==
+               SnapshotCheckerResult::kInSnapshot;
+  }
+
+  bool DefinitelyInSnapshot(SequenceNumber seq, SequenceNumber snapshot);
+
+  bool DefinitelyNotInSnapshot(SequenceNumber seq, SequenceNumber snapshot);
+
+  // Extract user-defined timestamp from user key if possible and compare it
+  // with *full_history_ts_low_ if applicable.
+  inline void UpdateTimestampAndCompareWithFullHistoryLow() {
+    if (!timestamp_size_) {
+      return;
+    }
+    Slice ts = ExtractTimestampFromUserKey(ikey_.user_key, timestamp_size_);
+    curr_ts_.assign(ts.data(), ts.size());
+    if (full_history_ts_low_) {
+      cmp_with_history_ts_low_ =
+          cmp_->CompareTimestamp(ts, *full_history_ts_low_);
+    }
+  }
+
+  static uint64_t ComputeBlobGarbageCollectionCutoffFileNumber(
+      const CompactionProxy* compaction);
+  static std::unique_ptr<BlobFetcher> CreateBlobFetcherIfNeeded(
+      const CompactionProxy* compaction);
+  static std::unique_ptr<PrefetchBufferCollection>
+  CreatePrefetchBufferCollectionIfNeeded(const CompactionProxy* compaction);
+
+  SequenceIterWrapper input_;
+  const Comparator* cmp_;
+  MergeHelper* merge_helper_;
+  const std::vector<SequenceNumber>* snapshots_;
+  // List of snapshots released during compaction.
+  // findEarliestVisibleSnapshot() finds them via the return value of
+  // snapshot_checker, and makes sure they will not be returned as the
+  // earliest visible snapshot of an older value.
+  // See WritePreparedTransactionTest::ReleaseSnapshotDuringCompaction3.
+  std::unordered_set<SequenceNumber> released_snapshots_;
+  const SequenceNumber earliest_write_conflict_snapshot_;
+  const SequenceNumber job_snapshot_;
+  const SnapshotChecker* const snapshot_checker_;
+  Env* env_;
+  SystemClock* clock_;
+  const bool report_detailed_time_;
+  const bool expect_valid_internal_key_;
+  CompactionRangeDelAggregator* range_del_agg_;
+  BlobFileBuilder* blob_file_builder_;
+  std::unique_ptr<CompactionProxy> compaction_;
+  const CompactionFilter* compaction_filter_;
+  const std::atomic<bool>* shutting_down_;
+  const std::atomic<bool>& manual_compaction_canceled_;
+  const bool bottommost_level_;
+  const bool visible_at_tip_;
+  const SequenceNumber earliest_snapshot_;
+
+  std::shared_ptr<Logger> info_log_;
+
+  const bool allow_data_in_errors_;
+
+  const bool enforce_single_del_contracts_;
+
+  // Comes from comparator.
+  const size_t timestamp_size_;
+
+  // Lower bound timestamp to retain full history in terms of user-defined
+  // timestamp. If a key's timestamp is older than full_history_ts_low_, then
+  // the key *may* be eligible for garbage collection (GC). The skipping logic
+  // is in `NextFromInput()` and `PrepareOutput()`.
+  // If nullptr, NO GC will be performed and all history will be preserved.
+  const std::string* const full_history_ts_low_;
+
+  // State
+  //
+  enum ValidContext : uint8_t {
+    kMerge1 = 0,
+    kMerge2 = 1,
+    kParseKeyError = 2,
+    kCurrentKeyUncommitted = 3,
+    kKeepSDAndClearPut = 4,
+    kKeepTsHistory = 5,
+    kKeepSDForConflictCheck = 6,
+    kKeepSDForSnapshot = 7,
+    kKeepSD = 8,
+    kKeepDel = 9,
+    kNewUserKey = 10,
+    kRangeDeletion = 11,
+  };
+
+  struct ValidityInfo {
+    inline bool IsValid() const { return rep & 1; }
+    ValidContext GetContext() const {
+      return static_cast<ValidContext>(rep >> 1);
+    }
+    inline void SetValid(uint8_t ctx) { rep = (ctx << 1) | 1; }
+    inline void Invalidate() { rep = 0; }
+
+    uint8_t rep{0};
+  } validity_info_;
+
+  // Points to a copy of the current compaction iterator output (current_key_)
+  // if valid.
+  Slice key_;
+  // Points to the value in the underlying iterator that corresponds to the
+  // current output.
+  Slice value_;
+  // The status is OK unless the compaction iterator encounters a merge operand
+  // while not having a merge operator defined.
+  Status status_;
+  // Stores the user key, sequence number and type of the current compaction
+  // iterator output (or current key in the underlying iterator during
+  // NextFromInput()).
+  ParsedInternalKey ikey_;
+  // Stores whether ikey_.user_key is valid. If set to false, the user key is
+  // not compared against the current key in the underlying iterator.
+  bool has_current_user_key_ = false;
+  // If false, the iterator holds a copy of the current compaction iterator
+  // output (or current key in the underlying iterator during NextFromInput()).
+  bool at_next_ = false;
+
+  IterKey current_key_;
+  Slice current_user_key_;
+  std::string curr_ts_;
+  SequenceNumber current_user_key_sequence_;
+  SequenceNumber current_user_key_snapshot_;
+
+  // True if the iterator has already returned a record for the current key.
+  bool has_outputted_key_ = false;
+
+  // If true, truncate the value of the next key and output it without applying
+  // any compaction rules. This is used for outputting a put after a single
+  // delete.
+  bool clear_and_output_next_key_ = false;
+
+  MergeOutputIterator merge_out_iter_;
+  Status merge_until_status_;
+  // PinnedIteratorsManager used to pin input_ Iterator blocks while reading
+  // merge operands and then releasing them after consuming them.
+  PinnedIteratorsManager pinned_iters_mgr_;
+
+  uint64_t blob_garbage_collection_cutoff_file_number_;
+
+  std::unique_ptr<BlobFetcher> blob_fetcher_;
+  std::unique_ptr<PrefetchBufferCollection> prefetch_buffers_;
+
+  std::string blob_index_;
+  PinnableSlice blob_value_;
+  std::string compaction_filter_value_;
+  InternalKey compaction_filter_skip_until_;
+  // "level_ptrs" holds indices that remember which file of an associated
+  // level we were last checking during the last call to compaction->
+  // KeyNotExistsBeyondOutputLevel(). This allows future calls to the function
+  // to pick up where it left off: each subcompaction's key range is
+  // increasing, so a later call to the function must be looking for a key that
+  // is in or beyond the last file checked during the previous call
+  std::vector<size_t> level_ptrs_;
+  CompactionIterationStats iter_stats_;
+
+  // Used to avoid purging uncommitted values. The application can specify
+  // uncommitted values by providing a SnapshotChecker object.
+  bool current_key_committed_;
+
+  // Saved result of ucmp->CompareTimestamp(current_ts_, *full_history_ts_low_)
+  int cmp_with_history_ts_low_;
+
+  const int level_;
+
+  // True if the previous internal key (same user key)'s sequence number has
+  // just been zeroed out during bottommost compaction.
+  bool last_key_seq_zeroed_{false};
+
+  // True if the current key should be output to the penultimate level if
+  // possible; compaction logic makes the final decision on which level to
+  // output to.
+  bool output_to_penultimate_level_{false};
+
+  // Min seqno for preserving the time information.
+  const SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber;
+
+  // Min seqno to preclude the data from the last level. If a key's seqno is
+  // larger than this, it will be output to the penultimate level.
+  const SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+
+  void AdvanceInputIter() { input_.Next(); }
+
+  void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); }
+
+  bool IsShuttingDown() {
+    // This is a best-effort facility, so memory_order_relaxed is sufficient.
+    return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+  }
+
+  bool IsPausingManualCompaction() {
+    // This is a best-effort facility, so memory_order_relaxed is sufficient.
+    return manual_compaction_canceled_.load(std::memory_order_relaxed);
+  }
+
+  // Stores whether the current compaction iterator output
+  // is a range tombstone start key.
+  bool is_range_del_{false};
+};
+
+inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq,
+                                                     SequenceNumber snapshot) {
+  return ((seq) <= (snapshot) &&
+          (snapshot_checker_ == nullptr ||
+           LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
+                  SnapshotCheckerResult::kInSnapshot)));
+}
+
+inline bool CompactionIterator::DefinitelyNotInSnapshot(
+    SequenceNumber seq, SequenceNumber snapshot) {
+  return ((seq) > (snapshot) ||
+          (snapshot_checker_ != nullptr &&
+           UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
+                    SnapshotCheckerResult::kNotInSnapshot)));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_job.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_job.cc
new file mode 100644
index 0000000000..ed152f28c6
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_job.cc
@@ -0,0 +1,2054 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction/compaction_job.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "db/blob/blob_counting_iterator.h" +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_builder.h" +#include "db/builder.h" +#include "db/compaction/clipping_iterator.h" +#include "db/compaction/compaction_state.h" +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "db/history_trimming_iterator.h" +#include "db/log_writer.h" +#include "db/merge_helper.h" +#include "db/range_del_aggregator.h" +#include "db/version_edit.h" +#include "db/version_set.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/sst_file_manager_impl.h" +#include "file/writable_file_writer.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/thread_status_util.h" +#include "options/configurable_helper.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/options_type.h" +#include "table/merging_iterator.h" +#include "table/table_builder.h" +#include "table/unique_id_impl.h" +#include "test_util/sync_point.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +const char* GetCompactionReasonString(CompactionReason compaction_reason) { + switch (compaction_reason) { + case CompactionReason::kUnknown: + return "Unknown"; + case CompactionReason::kLevelL0FilesNum: + return "LevelL0FilesNum"; + case CompactionReason::kLevelMaxLevelSize: + return "LevelMaxLevelSize"; + case CompactionReason::kUniversalSizeAmplification: + return "UniversalSizeAmplification"; + case CompactionReason::kUniversalSizeRatio: + return "UniversalSizeRatio"; + case CompactionReason::kUniversalSortedRunNum: + return "UniversalSortedRunNum"; + case CompactionReason::kFIFOMaxSize: + return "FIFOMaxSize"; + case CompactionReason::kFIFOReduceNumFiles: + return "FIFOReduceNumFiles"; + case CompactionReason::kFIFOTtl: + return "FIFOTtl"; + case CompactionReason::kManualCompaction: + return "ManualCompaction"; + case CompactionReason::kFilesMarkedForCompaction: + return "FilesMarkedForCompaction"; + case CompactionReason::kBottommostFiles: + return "BottommostFiles"; + case CompactionReason::kTtl: + return "Ttl"; + case CompactionReason::kFlush: + return "Flush"; + case CompactionReason::kExternalSstIngestion: + return "ExternalSstIngestion"; + case CompactionReason::kPeriodicCompaction: + return "PeriodicCompaction"; + case CompactionReason::kChangeTemperature: + return "ChangeTemperature"; + case CompactionReason::kForcedBlobGC: + return "ForcedBlobGC"; + case CompactionReason::kRoundRobinTtl: + return "RoundRobinTtl"; + case CompactionReason::kRefitLevel: + return "RefitLevel"; + case CompactionReason::kNumOfReasons: + // fall through + default: + assert(false); + return "Invalid"; + } +} + +const 
char* GetCompactionPenultimateOutputRangeTypeString(
+    Compaction::PenultimateOutputRangeType range_type) {
+  switch (range_type) {
+    case Compaction::PenultimateOutputRangeType::kNotSupported:
+      return "NotSupported";
+    case Compaction::PenultimateOutputRangeType::kFullRange:
+      return "FullRange";
+    case Compaction::PenultimateOutputRangeType::kNonLastRange:
+      return "NonLastRange";
+    case Compaction::PenultimateOutputRangeType::kDisabled:
+      return "Disabled";
+    default:
+      assert(false);
+      return "Invalid";
+  }
+}
+
+CompactionJob::CompactionJob(
+    int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+    const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
+    VersionSet* versions, const std::atomic<bool>* shutting_down,
+    LogBuffer* log_buffer, FSDirectory* db_directory,
+    FSDirectory* output_directory, FSDirectory* blob_output_directory,
+    Statistics* stats, InstrumentedMutex* db_mutex,
+    ErrorHandler* db_error_handler,
+    std::vector<SequenceNumber> existing_snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    const SnapshotChecker* snapshot_checker, JobContext* job_context,
+    std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+    bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname,
+    CompactionJobStats* compaction_job_stats, Env::Priority thread_pri,
+    const std::shared_ptr<IOTracer>& io_tracer,
+    const std::atomic<bool>& manual_compaction_canceled,
+    const std::string& db_id, const std::string& db_session_id,
+    std::string full_history_ts_low, std::string trim_ts,
+    BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled,
+    int* bg_bottom_compaction_scheduled)
+    : compact_(new CompactionState(compaction)),
+      compaction_stats_(compaction->compaction_reason(), 1),
+      db_options_(db_options),
+      mutable_db_options_copy_(mutable_db_options),
+      log_buffer_(log_buffer),
+      output_directory_(output_directory),
+      stats_(stats),
+      bottommost_level_(false),
+      write_hint_(Env::WLTH_NOT_SET),
+      compaction_job_stats_(compaction_job_stats),
+      job_id_(job_id),
+      dbname_(dbname),
+      db_id_(db_id),
+      db_session_id_(db_session_id),
+      file_options_(file_options),
+      env_(db_options.env),
+      io_tracer_(io_tracer),
+      fs_(db_options.fs, io_tracer),
+      file_options_for_read_(
+          fs_->OptimizeForCompactionTableRead(file_options, db_options_)),
+      versions_(versions),
+      shutting_down_(shutting_down),
+      manual_compaction_canceled_(manual_compaction_canceled),
+      db_directory_(db_directory),
+      blob_output_directory_(blob_output_directory),
+      db_mutex_(db_mutex),
+      db_error_handler_(db_error_handler),
+      existing_snapshots_(std::move(existing_snapshots)),
+      earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+      snapshot_checker_(snapshot_checker),
+      job_context_(job_context),
+      table_cache_(std::move(table_cache)),
+      event_logger_(event_logger),
+      paranoid_file_checks_(paranoid_file_checks),
+      measure_io_stats_(measure_io_stats),
+      thread_pri_(thread_pri),
+      full_history_ts_low_(std::move(full_history_ts_low)),
+      trim_ts_(std::move(trim_ts)),
+      blob_callback_(blob_callback),
+      extra_num_subcompaction_threads_reserved_(0),
+      bg_compaction_scheduled_(bg_compaction_scheduled),
+      bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
+  assert(compaction_job_stats_ != nullptr);
+  assert(log_buffer_ != nullptr);
+
+  const auto* cfd = compact_->compaction->column_family_data();
+  ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking);
+  ThreadStatusUtil::SetColumnFamily(cfd);
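+
+  // Illustrative note (a sketch, not vendored code): ReportStartedCompaction()
+  // below packs both levels into the single 64-bit
+  // COMPACTION_INPUT_OUTPUT_LEVEL thread-status property; e.g. for a
+  // compaction from level 2 into level 3:
+  //   uint64_t packed = (static_cast<uint64_t>(2) << 32) + 3;     // 0x200000003
+  //   int input_level  = static_cast<int>(packed >> 32);          // 2
+  //   int output_level = static_cast<int>(packed & 0xffffffffu);  // 3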
+  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+  ReportStartedCompaction(compaction);
+}
+
+CompactionJob::~CompactionJob() {
+  assert(compact_ == nullptr);
+  ThreadStatusUtil::ResetThreadStatus();
+}
+
+void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
+  ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
+                                               job_id_);
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL,
+      (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) +
+          compact_->compaction->output_level());
+
+  // In the current design, a CompactionJob is always created
+  // for non-trivial compaction.
+  assert(compaction->IsTrivialMove() == false ||
+         compaction->is_manual_compaction() == true);
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_PROP_FLAGS,
+      compaction->is_manual_compaction() +
+          (compaction->deletion_compaction() << 1));
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
+      compaction->CalculateTotalInputSize());
+
+  IOSTATS_RESET(bytes_written);
+  IOSTATS_RESET(bytes_read);
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_WRITTEN, 0);
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_READ, 0);
+
+  // Set the thread operation after operation properties
+  // to ensure GetThreadList() can always show them all together.
+  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+  compaction_job_stats_->is_manual_compaction =
+      compaction->is_manual_compaction();
+  compaction_job_stats_->is_full_compaction = compaction->is_full_compaction();
+}
+
+void CompactionJob::Prepare() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_PREPARE);
+
+  // Generate file_levels_ for compaction before making Iterator
+  auto* c = compact_->compaction;
+  ColumnFamilyData* cfd = c->column_family_data();
+  assert(cfd != nullptr);
+  assert(cfd->current()->storage_info()->NumLevelFiles(
+             compact_->compaction->level()) > 0);
+
+  write_hint_ = cfd->CalculateSSTWriteHint(c->output_level());
+  bottommost_level_ = c->bottommost_level();
+
+  if (c->ShouldFormSubcompactions()) {
+    StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME);
+    GenSubcompactionBoundaries();
+  }
+  if (boundaries_.size() > 1) {
+    for (size_t i = 0; i <= boundaries_.size(); i++) {
+      compact_->sub_compact_states.emplace_back(
+          c, (i != 0) ? std::optional<Slice>(boundaries_[i - 1]) : std::nullopt,
+          (i != boundaries_.size()) ? std::optional<Slice>(boundaries_[i])
+                                    : std::nullopt,
+          static_cast<uint32_t>(i));
+      // assert to validate that boundaries don't have the same user keys
+      // (without the timestamp part).
+      assert(i == 0 || i == boundaries_.size() ||
+             cfd->user_comparator()->CompareWithoutTimestamp(
+                 boundaries_[i - 1], boundaries_[i]) < 0);
+    }
+    RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
+                      compact_->sub_compact_states.size());
+  } else {
+    compact_->sub_compact_states.emplace_back(c, std::nullopt, std::nullopt,
+                                              /*sub_job_id*/ 0);
+  }
+
+  // Collect all seqno->time information from the input files, which will be
+  // used to encode seqno->time to the output files.
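+  //
+  // Illustrative sketch (not vendored code; the numbers are made up): if the
+  // merged mapping were
+  //   {seqno 100 -> time 1000, seqno 200 -> time 2000, seqno 300 -> time 3000}
+  // with preserve_time_duration == 1500 and _current_time == 3500, then
+  // preserve_time == 3500 - 1500 == 2000, and preserve_time_min_seqno_
+  // resolves to 200: entries with seqno >= 200 keep their time information,
+  // while older entries may have it dropped.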
+  uint64_t preserve_time_duration =
+      std::max(c->immutable_options()->preserve_internal_time_seconds,
+               c->immutable_options()->preclude_last_level_data_seconds);
+
+  if (preserve_time_duration > 0) {
+    const ReadOptions read_options(Env::IOActivity::kCompaction);
+    // setup seqno_time_mapping_
+    seqno_time_mapping_.SetMaxTimeDuration(preserve_time_duration);
+    for (const auto& each_level : *c->inputs()) {
+      for (const auto& fmd : each_level.files) {
+        std::shared_ptr<const TableProperties> tp;
+        Status s =
+            cfd->current()->GetTableProperties(read_options, &tp, fmd, nullptr);
+        if (s.ok()) {
+          seqno_time_mapping_.Add(tp->seqno_to_time_mapping)
+              .PermitUncheckedError();
+          seqno_time_mapping_.Add(fmd->fd.smallest_seqno,
+                                  fmd->oldest_ancester_time);
+        }
+      }
+    }
+
+    auto status = seqno_time_mapping_.Sort();
+    if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Invalid sequence number to time mapping: Status: %s",
+                     status.ToString().c_str());
+    }
+    int64_t _current_time = 0;
+    status = db_options_.clock->GetCurrentTime(&_current_time);
+    if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Failed to get current time in compaction: Status: %s",
+                     status.ToString().c_str());
+      // preserve all time information
+      preserve_time_min_seqno_ = 0;
+      preclude_last_level_min_seqno_ = 0;
+    } else {
+      seqno_time_mapping_.TruncateOldEntries(_current_time);
+      uint64_t preserve_time =
+          static_cast<uint64_t>(_current_time) > preserve_time_duration
+              ? _current_time - preserve_time_duration
+              : 0;
+      preserve_time_min_seqno_ =
+          seqno_time_mapping_.GetOldestSequenceNum(preserve_time);
+      if (c->immutable_options()->preclude_last_level_data_seconds > 0) {
+        uint64_t preclude_last_level_time =
+            static_cast<uint64_t>(_current_time) >
+                    c->immutable_options()->preclude_last_level_data_seconds
+                ? _current_time -
+                      c->immutable_options()->preclude_last_level_data_seconds
+                : 0;
+        preclude_last_level_min_seqno_ =
+            seqno_time_mapping_.GetOldestSequenceNum(preclude_last_level_time);
+      }
+    }
+  }
+}
+
+uint64_t CompactionJob::GetSubcompactionsLimit() {
+  return extra_num_subcompaction_threads_reserved_ +
+         std::max(
+             std::uint64_t(1),
+             static_cast<uint64_t>(compact_->compaction->max_subcompactions()));
+}
+
+void CompactionJob::AcquireSubcompactionResources(
+    int num_extra_required_subcompactions) {
+  TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:0");
+  TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:1");
+  int max_db_compactions =
+      DBImpl::GetBGJobLimits(
+          mutable_db_options_copy_.max_background_flushes,
+          mutable_db_options_copy_.max_background_compactions,
+          mutable_db_options_copy_.max_background_jobs,
+          versions_->GetColumnFamilySet()
+              ->write_controller()
+              ->NeedSpeedupCompaction())
+          .max_compactions;
+  InstrumentedMutexLock l(db_mutex_);
+  // Apply the min function first since we need to compute the extra
+  // subcompactions against the compaction limits, and then try to reserve
+  // threads for the extra subcompactions. The actual number of reserved
+  // threads could be less than the desired number.
+  int available_bg_compactions_against_db_limit =
+      std::max(max_db_compactions - *bg_compaction_scheduled_ -
+                   *bg_bottom_compaction_scheduled_,
+               0);
+  // Reservation only supports background threads whose priority is between
+  // BOTTOM and HIGH. Need to degrade the priority to HIGH if the original
+  // thread_pri_ is higher than that. Similar to ReleaseThreads().
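+  //
+  // Illustrative sketch (not vendored code; the numbers are made up): with
+  // max_compactions == 8, *bg_compaction_scheduled_ == 5 and
+  // *bg_bottom_compaction_scheduled_ == 1, at most 8 - 5 - 1 == 2 extra
+  // threads can be requested below, and
+  //   env_->ReserveThreads(std::min(num_extra_required_subcompactions, 2),
+  //                        std::min(thread_pri_, Env::Priority::HIGH))
+  // may still return fewer than 2 if the pool cannot spare them.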
+  extra_num_subcompaction_threads_reserved_ =
+      env_->ReserveThreads(std::min(num_extra_required_subcompactions,
+                                    available_bg_compactions_against_db_limit),
+                           std::min(thread_pri_, Env::Priority::HIGH));
+
+  // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
+  // depending on whether this compaction has the bottommost priority
+  if (thread_pri_ == Env::Priority::BOTTOM) {
+    *bg_bottom_compaction_scheduled_ +=
+        extra_num_subcompaction_threads_reserved_;
+  } else {
+    *bg_compaction_scheduled_ += extra_num_subcompaction_threads_reserved_;
+  }
+}
+
+void CompactionJob::ShrinkSubcompactionResources(uint64_t num_extra_resources) {
+  // Do nothing when we have zero resources to shrink
+  if (num_extra_resources == 0) return;
+  db_mutex_->Lock();
+  // We cannot release more threads than what we reserved before
+  int extra_num_subcompaction_threads_released = env_->ReleaseThreads(
+      (int)num_extra_resources, std::min(thread_pri_, Env::Priority::HIGH));
+  // Update the number of reserved threads and the number of background
+  // scheduled compactions for this compaction job
+  extra_num_subcompaction_threads_reserved_ -=
+      extra_num_subcompaction_threads_released;
+  // TODO (zichen): design a test case with new subcompaction partitioning
+  // when the number of actual partitions is less than the number of planned
+  // partitions
+  assert(extra_num_subcompaction_threads_released == (int)num_extra_resources);
+  // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
+  // depending on whether this compaction has the bottommost priority
+  if (thread_pri_ == Env::Priority::BOTTOM) {
+    *bg_bottom_compaction_scheduled_ -=
+        extra_num_subcompaction_threads_released;
+  } else {
+    *bg_compaction_scheduled_ -= extra_num_subcompaction_threads_released;
+  }
+  db_mutex_->Unlock();
+  TEST_SYNC_POINT("CompactionJob::ShrinkSubcompactionResources:0");
+}
+
+void CompactionJob::ReleaseSubcompactionResources() {
+  if (extra_num_subcompaction_threads_reserved_ == 0) {
+    return;
+  }
+  {
+    InstrumentedMutexLock l(db_mutex_);
+    // The number of reserved threads becomes larger than 0 only if the
+    // compaction priority is round robin and there are not sufficient
+    // sub-compactions available
+
+    // The number of scheduled compactions must be no less than 1 + the number
+    // of extra subcompactions using acquired resources, since this compaction
+    // job has not finished yet
+    assert(*bg_bottom_compaction_scheduled_ >=
+               1 + extra_num_subcompaction_threads_reserved_ ||
+           *bg_compaction_scheduled_ >=
+               1 + extra_num_subcompaction_threads_reserved_);
+  }
+  ShrinkSubcompactionResources(extra_num_subcompaction_threads_reserved_);
+}
+
+struct RangeWithSize {
+  Range range;
+  uint64_t size;
+
+  RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0)
+      : range(a, b), size(s) {}
+};
+
+void CompactionJob::GenSubcompactionBoundaries() {
+  // The goal is to find some boundary keys so that we can evenly partition
+  // the compaction input data into max_subcompactions ranges.
+  // For every input file, we ask TableReader to estimate 128 anchor points
+  // that evenly partition the input file into 128 ranges, together with the
+  // range sizes. This can be calculated by scanning index blocks of the file.
+  // Once we have the anchor points for all the input files, we merge them
+  // together and try to find keys dividing the ranges evenly.
+  // For example, if we have two input files, and each returns the following
+  // ranges:
+  //   File1: (a1, 1000), (b1, 1200), (c1, 1100)
+  //   File2: (a2, 1100), (b2, 1000), (c2, 1000)
+  // We sort all the keys, yielding:
+  //   (a1, 1000), (a2, 1100), (b1, 1200), (b2, 1000), (c1, 1100), (c2, 1000)
+  // We calculate the total size by adding up all ranges' sizes, which is 6400.
+  // If we would like to partition into 2 subcompactions, the target of the
+  // range size is 3200. Based on the size, we take "b1" as the partition key
+  // since the first three ranges would hit 3200.
+  //
+  // Note that the ranges are actually overlapping. For example, in the example
+  // above, the range ending with "b1" is overlapping with the range ending
+  // with "b2". So the size 1000+1100+1200 is an underestimation of the data
+  // size up to "b1". In extreme cases where we only compact N L0 files, a
+  // range can overlap with N-1 other ranges. Since we requested a relatively
+  // large number (128) of ranges from each input file, even an N-way range
+  // overlap would cause relatively small inaccuracy.
+  const ReadOptions read_options(Env::IOActivity::kCompaction);
+  auto* c = compact_->compaction;
+  if (c->max_subcompactions() <= 1 &&
+      !(c->immutable_options()->compaction_pri == kRoundRobin &&
+        c->immutable_options()->compaction_style == kCompactionStyleLevel)) {
+    return;
+  }
+  auto* cfd = c->column_family_data();
+  const Comparator* cfd_comparator = cfd->user_comparator();
+  const InternalKeyComparator& icomp = cfd->internal_comparator();
+
+  auto* v = compact_->compaction->input_version();
+  int base_level = v->storage_info()->base_level();
+  InstrumentedMutexUnlock unlock_guard(db_mutex_);
+
+  uint64_t total_size = 0;
+  std::vector<TableReader::Anchor> all_anchors;
+  int start_lvl = c->start_level();
+  int out_lvl = c->output_level();
+
+  for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
+    int lvl = c->level(lvl_idx);
+    if (lvl >= start_lvl && lvl <= out_lvl) {
+      const LevelFilesBrief* flevel = c->input_levels(lvl_idx);
+      size_t num_files = flevel->num_files;
+
+      if (num_files == 0) {
+        continue;
+      }
+
+      for (size_t i = 0; i < num_files; i++) {
+        FileMetaData* f = flevel->files[i].file_metadata;
+        std::vector<TableReader::Anchor> my_anchors;
+        Status s = cfd->table_cache()->ApproximateKeyAnchors(
+            read_options, icomp, *f,
+            c->mutable_cf_options()->block_protection_bytes_per_key,
+            my_anchors);
+        if (!s.ok() || my_anchors.empty()) {
+          my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
+        }
+        for (auto& ac : my_anchors) {
+          // Can be optimized to avoid this loop.
+          total_size += ac.range_size;
+        }
+
+        all_anchors.insert(all_anchors.end(), my_anchors.begin(),
+                           my_anchors.end());
+      }
+    }
+  }
+  // Here we sort all the anchor points across all files and go through them
+  // in the sorted order to find partitioning boundaries.
+  // Not the most efficient implementation. A much more efficient algorithm
+  // probably exists. But it is more complex. If performance turns out to
+  // be a problem, we can optimize.
+  std::sort(
+      all_anchors.begin(), all_anchors.end(),
+      [cfd_comparator](TableReader::Anchor& a, TableReader::Anchor& b) -> bool {
+        return cfd_comparator->CompareWithoutTimestamp(a.user_key, b.user_key) <
+               0;
+      });
+
+  // Remove duplicated entries from boundaries.
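+  // Illustrative sketch (not vendored code): std::unique only collapses
+  // *adjacent* duplicates, which is why the anchors were sorted first. For
+  // user keys {a, b, a} sorted to {a, a, b}, the erase/unique idiom below
+  // leaves {a, b}:
+  //   std::sort(v.begin(), v.end());
+  //   v.erase(std::unique(v.begin(), v.end()), v.end());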
+  all_anchors.erase(
+      std::unique(all_anchors.begin(), all_anchors.end(),
+                  [cfd_comparator](TableReader::Anchor& a,
+                                   TableReader::Anchor& b) -> bool {
+                    return cfd_comparator->CompareWithoutTimestamp(
+                               a.user_key, b.user_key) == 0;
+                  }),
+      all_anchors.end());
+
+  // Get the number of planned subcompactions; this may reserve extra threads
+  // and update extra_num_subcompaction_threads_reserved_ for round-robin
+  uint64_t num_planned_subcompactions;
+  if (c->immutable_options()->compaction_pri == kRoundRobin &&
+      c->immutable_options()->compaction_style == kCompactionStyleLevel) {
+    // For round-robin compaction priority, we need to employ more
+    // subcompactions (may exceed the max_subcompaction limit). The extra
+    // subcompactions will be executed using reserved threads and taken into
+    // account in bg_compaction_scheduled or bg_bottom_compaction_scheduled.
+
+    // Initialized by the number of input files
+    num_planned_subcompactions = static_cast<uint64_t>(c->num_input_files(0));
+    uint64_t max_subcompactions_limit = GetSubcompactionsLimit();
+    if (max_subcompactions_limit < num_planned_subcompactions) {
+      // Assert the two pointers are not null so that we can use extra
+      // subcompactions against db compaction limits
+      assert(bg_bottom_compaction_scheduled_ != nullptr);
+      assert(bg_compaction_scheduled_ != nullptr);
+      // Reserve resources when max_subcompaction is not sufficient
+      AcquireSubcompactionResources(
+          (int)(num_planned_subcompactions - max_subcompactions_limit));
+      // The subcompactions limit changes after acquiring additional resources.
+      // Need to call GetSubcompactionsLimit() again to update the number
+      // of planned subcompactions
+      num_planned_subcompactions =
+          std::min(num_planned_subcompactions, GetSubcompactionsLimit());
+    } else {
+      num_planned_subcompactions = max_subcompactions_limit;
+    }
+  } else {
+    num_planned_subcompactions = GetSubcompactionsLimit();
+  }
+
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:0",
+                           &num_planned_subcompactions);
+  if (num_planned_subcompactions == 1) return;
+
+  // Group the ranges into subcompactions
+  uint64_t target_range_size = std::max(
+      total_size / num_planned_subcompactions,
+      MaxFileSizeForLevel(
+          *(c->mutable_cf_options()), out_lvl,
+          c->immutable_options()->compaction_style, base_level,
+          c->immutable_options()->level_compaction_dynamic_level_bytes));
+
+  if (target_range_size >= total_size) {
+    return;
+  }
+
+  uint64_t next_threshold = target_range_size;
+  uint64_t cumulative_size = 0;
+  uint64_t num_actual_subcompactions = 1U;
+  for (TableReader::Anchor& anchor : all_anchors) {
+    cumulative_size += anchor.range_size;
+    if (cumulative_size > next_threshold) {
+      next_threshold += target_range_size;
+      num_actual_subcompactions++;
+      boundaries_.push_back(anchor.user_key);
+    }
+    if (num_actual_subcompactions == num_planned_subcompactions) {
+      break;
+    }
+  }
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:1",
+                           &num_actual_subcompactions);
+  // Shrink extra subcompaction resources when extra resources were acquired
+  ShrinkSubcompactionResources(
+      std::min((int)(num_planned_subcompactions - num_actual_subcompactions),
+               extra_num_subcompaction_threads_reserved_));
+}
+
+Status CompactionJob::Run() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_RUN);
+  TEST_SYNC_POINT("CompactionJob::Run():Start");
+  log_buffer_->FlushBufferToLog();
+  LogCompaction();
+
+  const size_t num_threads = compact_->sub_compact_states.size();
+  assert(num_threads > 0);
+  const
uint64_t start_micros = db_options_.clock->NowMicros();
+
+  // Launch a thread for each of subcompactions 1...num_threads-1
+  std::vector<port::Thread> thread_pool;
+  thread_pool.reserve(num_threads - 1);
+  for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+    thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this,
+                             &compact_->sub_compact_states[i]);
+  }
+
+  // Always schedule the first subcompaction (whether or not there are also
+  // others) in the current thread to be efficient with resources
+  ProcessKeyValueCompaction(&compact_->sub_compact_states[0]);
+
+  // Wait for all other threads (if there are any) to finish execution
+  for (auto& thread : thread_pool) {
+    thread.join();
+  }
+
+  compaction_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros);
+
+  for (auto& state : compact_->sub_compact_states) {
+    compaction_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros);
+    state.RemoveLastEmptyOutput();
+  }
+
+  RecordTimeToHistogram(stats_, COMPACTION_TIME,
+                        compaction_stats_.stats.micros);
+  RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+                        compaction_stats_.stats.cpu_micros);
+
+  TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
+
+  // Check if any thread encountered an error during execution
+  Status status;
+  IOStatus io_s;
+  bool wrote_new_blob_files = false;
+
+  for (const auto& state : compact_->sub_compact_states) {
+    if (!state.status.ok()) {
+      status = state.status;
+      io_s = state.io_status;
+      break;
+    }
+
+    if (state.Current().HasBlobFileAdditions()) {
+      wrote_new_blob_files = true;
+    }
+  }
+
+  if (io_status_.ok()) {
+    io_status_ = io_s;
+  }
+  if (status.ok()) {
+    constexpr IODebugContext* dbg = nullptr;
+
+    if (output_directory_) {
+      io_s = output_directory_->FsyncWithDirOptions(
+          IOOptions(), dbg,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+    }
+
+    if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ &&
+        blob_output_directory_ != output_directory_) {
+      io_s = blob_output_directory_->FsyncWithDirOptions(
+          IOOptions(), dbg,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+    }
+  }
+  if (io_status_.ok()) {
+    io_status_ = io_s;
+  }
+  if (status.ok()) {
+    status = io_s;
+  }
+  if (status.ok()) {
+    thread_pool.clear();
+    std::vector<const CompactionOutputs::Output*> files_output;
+    for (const auto& state : compact_->sub_compact_states) {
+      for (const auto& output : state.GetOutputs()) {
+        files_output.emplace_back(&output);
+      }
+    }
+    ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+    auto& prefix_extractor =
+        compact_->compaction->mutable_cf_options()->prefix_extractor;
+    std::atomic<size_t> next_file_idx(0);
+    auto verify_table = [&](Status& output_status) {
+      while (true) {
+        size_t file_idx = next_file_idx.fetch_add(1);
+        if (file_idx >= files_output.size()) {
+          break;
+        }
+        // Verify that the table is usable
+        // We set for_compaction to false and don't
+        // OptimizeForCompactionTableRead here because this is a special case
+        // after we finish the table building. No matter whether
+        // use_direct_io_for_flush_and_compaction is true, we will regard this
+        // verification as user reads since the goal is to cache it here for
+        // further user reads
+        const ReadOptions verify_table_read_options(
+            Env::IOActivity::kCompaction);
+        InternalIterator* iter = cfd->table_cache()->NewIterator(
+            verify_table_read_options, file_options_,
+            cfd->internal_comparator(), files_output[file_idx]->meta,
+            /*range_del_agg=*/nullptr, prefix_extractor,
+            /*table_reader_ptr=*/nullptr,
+            cfd->internal_stats()->GetFileReadHist(
compact_->compaction->output_level()), + TableReaderCaller::kCompactionRefill, /*arena=*/nullptr, + /*skip_filters=*/false, compact_->compaction->output_level(), + MaxFileSizeForL0MetaPin( + *compact_->compaction->mutable_cf_options()), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false, + compact_->compaction->mutable_cf_options() + ->block_protection_bytes_per_key); + auto s = iter->status(); + + if (s.ok() && paranoid_file_checks_) { + OutputValidator validator(cfd->internal_comparator(), + /*_enable_order_check=*/true, + /*_enable_hash=*/true); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = validator.Add(iter->key(), iter->value()); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + s = iter->status(); + } + if (s.ok() && + !validator.CompareValidator(files_output[file_idx]->validator)) { + s = Status::Corruption("Paranoid checksums do not match"); + } + } + + delete iter; + + if (!s.ok()) { + output_status = s; + break; + } + } + }; + for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) { + thread_pool.emplace_back( + verify_table, std::ref(compact_->sub_compact_states[i].status)); + } + verify_table(compact_->sub_compact_states[0].status); + for (auto& thread : thread_pool) { + thread.join(); + } + + for (const auto& state : compact_->sub_compact_states) { + if (!state.status.ok()) { + status = state.status; + break; + } + } + } + + ReleaseSubcompactionResources(); + TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0"); + TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1"); + + TablePropertiesCollection tp; + for (const auto& state : compact_->sub_compact_states) { + for (const auto& output : state.GetOutputs()) { + auto fn = + TableFileName(state.compaction->immutable_options()->cf_paths, + output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); + tp[fn] = output.table_properties; + } + } + compact_->compaction->SetOutputTableProperties(std::move(tp)); + + // Finish up all book-keeping to unify the subcompaction results + compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_); + UpdateCompactionStats(); + + RecordCompactionIOStats(); + LogFlush(db_options_.info_log); + TEST_SYNC_POINT("CompactionJob::Run():End"); + + compact_->status = status; + return status; +} + +Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { + assert(compact_); + + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_INSTALL); + db_mutex_->AssertHeld(); + Status status = compact_->status; + + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + assert(cfd); + + int output_level = compact_->compaction->output_level(); + cfd->internal_stats()->AddCompactionStats(output_level, thread_pri_, + compaction_stats_); + + if (status.ok()) { + status = InstallCompactionResults(mutable_cf_options); + } + if (!versions_->io_status().ok()) { + io_status_ = versions_->io_status(); + } + + VersionStorageInfo::LevelSummaryStorage tmp; + auto vstorage = cfd->current()->storage_info(); + const auto& stats = compaction_stats_.stats; + + double read_write_amp = 0.0; + double write_amp = 0.0; + double bytes_read_per_sec = 0; + double bytes_written_per_sec = 0; + + const uint64_t bytes_read_non_output_and_blob = + stats.bytes_read_non_output_levels + stats.bytes_read_blob; + const uint64_t bytes_read_all = + stats.bytes_read_output_level + bytes_read_non_output_and_blob; + const uint64_t bytes_written_all = + 
stats.bytes_written + stats.bytes_written_blob; + + if (bytes_read_non_output_and_blob > 0) { + read_write_amp = (bytes_written_all + bytes_read_all) / + static_cast(bytes_read_non_output_and_blob); + write_amp = + bytes_written_all / static_cast(bytes_read_non_output_and_blob); + } + if (stats.micros > 0) { + bytes_read_per_sec = bytes_read_all / static_cast(stats.micros); + bytes_written_per_sec = + bytes_written_all / static_cast(stats.micros); + } + + const std::string& column_family_name = cfd->GetName(); + + constexpr double kMB = 1048576.0; + + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " + "files in(%d, %d) out(%d +%d blob) " + "MB in(%.1f, %.1f +%.1f blob) out(%.1f +%.1f blob), " + "read-write-amplify(%.1f) write-amplify(%.1f) %s, records in: %" PRIu64 + ", records dropped: %" PRIu64 " output_compression: %s\n", + column_family_name.c_str(), vstorage->LevelSummary(&tmp), + bytes_read_per_sec, bytes_written_per_sec, + compact_->compaction->output_level(), + stats.num_input_files_in_non_output_levels, + stats.num_input_files_in_output_level, stats.num_output_files, + stats.num_output_files_blob, stats.bytes_read_non_output_levels / kMB, + stats.bytes_read_output_level / kMB, stats.bytes_read_blob / kMB, + stats.bytes_written / kMB, stats.bytes_written_blob / kMB, read_write_amp, + write_amp, status.ToString().c_str(), stats.num_input_records, + stats.num_dropped_records, + CompressionTypeToString(compact_->compaction->output_compression()) + .c_str()); + + const auto& blob_files = vstorage->GetBlobFiles(); + if (!blob_files.empty()) { + assert(blob_files.front()); + assert(blob_files.back()); + + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n", + column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(), + blob_files.back()->GetBlobFileNumber()); + } + + if (compaction_stats_.has_penultimate_level_output) { + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] has Penultimate Level output: %" PRIu64 + ", level %d, number of files: %" PRIu64 ", number of records: %" PRIu64, + column_family_name.c_str(), + compaction_stats_.penultimate_level_stats.bytes_written, + compact_->compaction->GetPenultimateLevel(), + compaction_stats_.penultimate_level_stats.num_output_files, + compaction_stats_.penultimate_level_stats.num_output_records); + } + + UpdateCompactionJobStats(stats); + + auto stream = event_logger_->LogToBuffer(log_buffer_, 8192); + stream << "job" << job_id_ << "event" + << "compaction_finished" + << "compaction_time_micros" << stats.micros + << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level" + << compact_->compaction->output_level() << "num_output_files" + << stats.num_output_files << "total_output_size" + << stats.bytes_written; + + if (stats.num_output_files_blob > 0) { + stream << "num_blob_output_files" << stats.num_output_files_blob + << "total_blob_output_size" << stats.bytes_written_blob; + } + + stream << "num_input_records" << stats.num_input_records + << "num_output_records" << stats.num_output_records + << "num_subcompactions" << compact_->sub_compact_states.size() + << "output_compression" + << CompressionTypeToString(compact_->compaction->output_compression()); + + stream << "num_single_delete_mismatches" + << compaction_job_stats_->num_single_del_mismatch; + stream << "num_single_delete_fallthrough" + << compaction_job_stats_->num_single_del_fallthru; + + if (measure_io_stats_) { + stream << "file_write_nanos" << 
compaction_job_stats_->file_write_nanos;
+    stream << "file_range_sync_nanos"
+           << compaction_job_stats_->file_range_sync_nanos;
+    stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos;
+    stream << "file_prepare_write_nanos"
+           << compaction_job_stats_->file_prepare_write_nanos;
+  }
+
+  stream << "lsm_state";
+  stream.StartArray();
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    stream << vstorage->NumLevelFiles(level);
+  }
+  stream.EndArray();
+
+  if (!blob_files.empty()) {
+    assert(blob_files.front());
+    stream << "blob_file_head" << blob_files.front()->GetBlobFileNumber();
+
+    assert(blob_files.back());
+    stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber();
+  }
+
+  if (compaction_stats_.has_penultimate_level_output) {
+    InternalStats::CompactionStats& pl_stats =
+        compaction_stats_.penultimate_level_stats;
+    stream << "penultimate_level_num_output_files" << pl_stats.num_output_files;
+    stream << "penultimate_level_bytes_written" << pl_stats.bytes_written;
+    stream << "penultimate_level_num_output_records"
+           << pl_stats.num_output_records;
+    stream << "penultimate_level_num_output_files_blob"
+           << pl_stats.num_output_files_blob;
+    stream << "penultimate_level_bytes_written_blob"
+           << pl_stats.bytes_written_blob;
+  }
+
+  CleanupCompaction();
+  return status;
+}
+
+void CompactionJob::NotifyOnSubcompactionBegin(
+    SubcompactionState* sub_compact) {
+  Compaction* c = compact_->compaction;
+
+  if (db_options_.listeners.empty()) {
+    return;
+  }
+  if (shutting_down_->load(std::memory_order_acquire)) {
+    return;
+  }
+  if (c->is_manual_compaction() &&
+      manual_compaction_canceled_.load(std::memory_order_acquire)) {
+    return;
+  }
+
+  sub_compact->notify_on_subcompaction_completion = true;
+
+  SubcompactionJobInfo info{};
+  sub_compact->BuildSubcompactionJobInfo(info);
+  info.job_id = static_cast<int>(job_id_);
+  info.thread_id = env_->GetThreadID();
+
+  for (const auto& listener : db_options_.listeners) {
+    listener->OnSubcompactionBegin(info);
+  }
+  info.status.PermitUncheckedError();
+}
+
+void CompactionJob::NotifyOnSubcompactionCompleted(
+    SubcompactionState* sub_compact) {
+  if (db_options_.listeners.empty()) {
+    return;
+  }
+  if (shutting_down_->load(std::memory_order_acquire)) {
+    return;
+  }
+
+  if (sub_compact->notify_on_subcompaction_completion == false) {
+    return;
+  }
+
+  SubcompactionJobInfo info{};
+  sub_compact->BuildSubcompactionJobInfo(info);
+  info.job_id = static_cast<int>(job_id_);
+  info.thread_id = env_->GetThreadID();
+
+  for (const auto& listener : db_options_.listeners) {
+    listener->OnSubcompactionCompleted(info);
+  }
+}
+
+void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
+  assert(sub_compact);
+  assert(sub_compact->compaction);
+  if (db_options_.compaction_service) {
+    CompactionServiceJobStatus comp_status =
+        ProcessKeyValueCompactionWithCompactionService(sub_compact);
+    if (comp_status == CompactionServiceJobStatus::kSuccess ||
+        comp_status == CompactionServiceJobStatus::kFailure) {
+      return;
+    }
+    // fallback to local compaction
+    assert(comp_status == CompactionServiceJobStatus::kUseLocal);
+  }
+
+  uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();
+
+  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+
+  // Create the compaction filter, and fail the compaction if
+  // IgnoreSnapshots() = false because it is not supported anymore
+  const CompactionFilter* compaction_filter =
+      cfd->ioptions()->compaction_filter;
+  std::unique_ptr<CompactionFilter> compaction_filter_from_factory =
nullptr; + if (compaction_filter == nullptr) { + compaction_filter_from_factory = + sub_compact->compaction->CreateCompactionFilter(); + compaction_filter = compaction_filter_from_factory.get(); + } + if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) { + sub_compact->status = Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + return; + } + + NotifyOnSubcompactionBegin(sub_compact); + + auto range_del_agg = std::make_unique( + &cfd->internal_comparator(), existing_snapshots_, &full_history_ts_low_, + &trim_ts_); + + // TODO: since we already use C++17, should use + // std::optional instead. + const std::optional start = sub_compact->start; + const std::optional end = sub_compact->end; + + std::optional start_without_ts; + std::optional end_without_ts; + + ReadOptions read_options; + read_options.verify_checksums = true; + read_options.fill_cache = false; + read_options.rate_limiter_priority = GetRateLimiterPriority(); + read_options.io_activity = Env::IOActivity::kCompaction; + // Compaction iterators shouldn't be confined to a single prefix. + // Compactions use Seek() for + // (a) concurrent compactions, + // (b) CompactionFilter::Decision::kRemoveAndSkipUntil. + read_options.total_order_seek = true; + + // Remove the timestamps from boundaries because boundaries created in + // GenSubcompactionBoundaries doesn't strip away the timestamp. + size_t ts_sz = cfd->user_comparator()->timestamp_size(); + if (start.has_value()) { + read_options.iterate_lower_bound = &start.value(); + if (ts_sz > 0) { + start_without_ts = StripTimestampFromUserKey(start.value(), ts_sz); + read_options.iterate_lower_bound = &start_without_ts.value(); + } + } + if (end.has_value()) { + read_options.iterate_upper_bound = &end.value(); + if (ts_sz > 0) { + end_without_ts = StripTimestampFromUserKey(end.value(), ts_sz); + read_options.iterate_upper_bound = &end_without_ts.value(); + } + } + + // Although the v2 aggregator is what the level iterator(s) know about, + // the AddTombstones calls will be propagated down to the v1 aggregator. + std::unique_ptr raw_input(versions_->MakeInputIterator( + read_options, sub_compact->compaction, range_del_agg.get(), + file_options_for_read_, start, end)); + InternalIterator* input = raw_input.get(); + + IterKey start_ikey; + IterKey end_ikey; + Slice start_slice; + Slice end_slice; + Slice start_user_key{}; + Slice end_user_key{}; + + static constexpr char kMaxTs[] = + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + Slice ts_slice; + std::string max_ts; + if (ts_sz > 0) { + if (ts_sz <= strlen(kMaxTs)) { + ts_slice = Slice(kMaxTs, ts_sz); + } else { + max_ts = std::string(ts_sz, '\xff'); + ts_slice = Slice(max_ts); + } + } + + if (start.has_value()) { + start_ikey.SetInternalKey(start.value(), kMaxSequenceNumber, + kValueTypeForSeek); + if (ts_sz > 0) { + start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek, + &ts_slice); + } + start_slice = start_ikey.GetInternalKey(); + start_user_key = start_ikey.GetUserKey(); + } + if (end.has_value()) { + end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek); + if (ts_sz > 0) { + end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek, + &ts_slice); + } + end_slice = end_ikey.GetInternalKey(); + end_user_key = end_ikey.GetUserKey(); + } + + std::unique_ptr clip; + if (start.has_value() || end.has_value()) { + clip = std::make_unique( + raw_input.get(), start.has_value() ? 
&start_slice : nullptr, + end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator()); + input = clip.get(); + } + + std::unique_ptr blob_counter; + + if (sub_compact->compaction->DoesInputReferenceBlobFiles()) { + BlobGarbageMeter* meter = sub_compact->Current().CreateBlobGarbageMeter(); + blob_counter = std::make_unique(input, meter); + input = blob_counter.get(); + } + + std::unique_ptr trim_history_iter; + if (ts_sz > 0 && !trim_ts_.empty()) { + trim_history_iter = std::make_unique( + input, cfd->user_comparator(), trim_ts_); + input = trim_history_iter.get(); + } + + input->SeekToFirst(); + + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_PROCESS_KV); + + // I/O measurement variables + PerfLevel prev_perf_level = PerfLevel::kEnableTime; + const uint64_t kRecordStatsEvery = 1000; + uint64_t prev_write_nanos = 0; + uint64_t prev_fsync_nanos = 0; + uint64_t prev_range_sync_nanos = 0; + uint64_t prev_prepare_write_nanos = 0; + uint64_t prev_cpu_write_nanos = 0; + uint64_t prev_cpu_read_nanos = 0; + if (measure_io_stats_) { + prev_perf_level = GetPerfLevel(); + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + prev_write_nanos = IOSTATS(write_nanos); + prev_fsync_nanos = IOSTATS(fsync_nanos); + prev_range_sync_nanos = IOSTATS(range_sync_nanos); + prev_prepare_write_nanos = IOSTATS(prepare_write_nanos); + prev_cpu_write_nanos = IOSTATS(cpu_write_nanos); + prev_cpu_read_nanos = IOSTATS(cpu_read_nanos); + } + + MergeHelper merge( + env_, cfd->user_comparator(), cfd->ioptions()->merge_operator.get(), + compaction_filter, db_options_.info_log.get(), + false /* internal key corruption is expected */, + existing_snapshots_.empty() ? 0 : existing_snapshots_.back(), + snapshot_checker_, compact_->compaction->level(), db_options_.stats); + + const MutableCFOptions* mutable_cf_options = + sub_compact->compaction->mutable_cf_options(); + assert(mutable_cf_options); + + std::vector blob_file_paths; + + // TODO: BlobDB to support output_to_penultimate_level compaction, which needs + // 2 builders, so may need to move to `CompactionOutputs` + std::unique_ptr blob_file_builder( + (mutable_cf_options->enable_blob_files && + sub_compact->compaction->output_level() >= + mutable_cf_options->blob_file_starting_level) + ? new BlobFileBuilder( + versions_, fs_.get(), + sub_compact->compaction->immutable_options(), + mutable_cf_options, &file_options_, db_id_, db_session_id_, + job_id_, cfd->GetID(), cfd->GetName(), Env::IOPriority::IO_LOW, + write_hint_, io_tracer_, blob_callback_, + BlobFileCreationReason::kCompaction, &blob_file_paths, + sub_compact->Current().GetBlobFileAdditionsPtr()) + : nullptr); + + TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::Run():PausingManualCompaction:1", + reinterpret_cast( + const_cast*>(&manual_compaction_canceled_))); + + const std::string* const full_history_ts_low = + full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_; + const SequenceNumber job_snapshot_seq = + job_context_ ? 
job_context_->GetJobSnapshotSequence()
+                                     : kMaxSequenceNumber;
+
+  auto c_iter = std::make_unique<CompactionIterator>(
+      input, cfd->user_comparator(), &merge, versions_->LastSequence(),
+      &existing_snapshots_, earliest_write_conflict_snapshot_, job_snapshot_seq,
+      snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_),
+      /*expect_valid_internal_key=*/true, range_del_agg.get(),
+      blob_file_builder.get(), db_options_.allow_data_in_errors,
+      db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
+      sub_compact->compaction, compaction_filter, shutting_down_,
+      db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_,
+      preclude_last_level_min_seqno_);
+  c_iter->SeekToFirst();
+
+  // Assign the range delete aggregator to the target output level, which
+  // makes sure it only outputs to a single level
+  sub_compact->AssignRangeDelAggregator(std::move(range_del_agg));
+
+  const auto& c_iter_stats = c_iter->iter_stats();
+
+  // Define the open and close functions for the compaction files, which will
+  // be used to open/close output files when needed.
+  const CompactionFileOpenFunc open_file_func =
+      [this, sub_compact](CompactionOutputs& outputs) {
+        return this->OpenCompactionOutputFile(sub_compact, outputs);
+      };
+
+  const CompactionFileCloseFunc close_file_func =
+      [this, sub_compact, start_user_key, end_user_key](
+          CompactionOutputs& outputs, const Status& status,
+          const Slice& next_table_min_key) {
+        return this->FinishCompactionOutputFile(
+            status, sub_compact, outputs, next_table_min_key,
+            sub_compact->start.has_value() ? &start_user_key : nullptr,
+            sub_compact->end.has_value() ? &end_user_key : nullptr);
+      };
+
+  Status status;
+  TEST_SYNC_POINT_CALLBACK(
+      "CompactionJob::ProcessKeyValueCompaction()::Processing",
+      reinterpret_cast<void*>(
+          const_cast<Compaction*>(sub_compact->compaction)));
+  while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
+    // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
+    // returns true.
+    assert(!end.has_value() || cfd->user_comparator()->Compare(
+                                   c_iter->user_key(), end.value()) < 0);
+
+    if (c_iter_stats.num_input_records % kRecordStatsEvery ==
+        kRecordStatsEvery - 1) {
+      RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+      c_iter->ResetRecordCounts();
+      RecordCompactionIOStats();
+    }
+
+    // Add the current compaction_iterator key to the target compaction
+    // output; if the output file needs to be closed or opened, this calls
+    // `open_file_func` or `close_file_func` (see the sketch below).
+    // TODO: it would be better to have the compaction file open/close moved
+    // into `CompactionOutputs`, which has the output file information.
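+    // [Editor's note] A minimal standalone sketch of the callback pattern
+    // used here: the output-appending code stays agnostic of file management
+    // and calls back into the open/close hooks defined above when a file
+    // boundary is reached. The names `Outputs`, `OpenFn`, `CloseFn`, and
+    // `AddKeyValue` below are hypothetical, for illustration only.
+#if 0
+    struct Outputs;  // stand-in for CompactionOutputs
+    using OpenFn = std::function<Status(Outputs&)>;
+    using CloseFn =
+        std::function<Status(Outputs&, const Status&, const Slice&)>;
+    Status AddKeyValue(Outputs& out, bool file_full, bool need_new_file,
+                       const Slice& next_min_key, const OpenFn& open_fn,
+                       const CloseFn& close_fn) {
+      Status s;
+      if (file_full) {
+        s = close_fn(out, s, next_min_key);  // seal the current output file
+      }
+      if (s.ok() && need_new_file) {
+        s = open_fn(out);  // lazily open the next output file
+      }
+      // ... append the key/value to the current table builder ...
+      return s;
+    }
+#endif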
+ status = sub_compact->AddToOutput(*c_iter, open_file_func, close_file_func); + if (!status.ok()) { + break; + } + + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::Run():PausingManualCompaction:2", + reinterpret_cast( + const_cast*>(&manual_compaction_canceled_))); + c_iter->Next(); + if (c_iter->status().IsManualCompactionPaused()) { + break; + } + } + + sub_compact->compaction_job_stats.num_blobs_read = + c_iter_stats.num_blobs_read; + sub_compact->compaction_job_stats.total_blob_bytes_read = + c_iter_stats.total_blob_bytes_read; + sub_compact->compaction_job_stats.num_input_deletion_records = + c_iter_stats.num_input_deletion_records; + sub_compact->compaction_job_stats.num_corrupt_keys = + c_iter_stats.num_input_corrupt_records; + sub_compact->compaction_job_stats.num_single_del_fallthru = + c_iter_stats.num_single_del_fallthru; + sub_compact->compaction_job_stats.num_single_del_mismatch = + c_iter_stats.num_single_del_mismatch; + sub_compact->compaction_job_stats.total_input_raw_key_bytes += + c_iter_stats.total_input_raw_key_bytes; + sub_compact->compaction_job_stats.total_input_raw_value_bytes += + c_iter_stats.total_input_raw_value_bytes; + + RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME, + c_iter_stats.total_filter_time); + + if (c_iter_stats.num_blobs_relocated > 0) { + RecordTick(stats_, BLOB_DB_GC_NUM_KEYS_RELOCATED, + c_iter_stats.num_blobs_relocated); + } + if (c_iter_stats.total_blob_bytes_relocated > 0) { + RecordTick(stats_, BLOB_DB_GC_BYTES_RELOCATED, + c_iter_stats.total_blob_bytes_relocated); + } + + RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); + RecordCompactionIOStats(); + + if (status.ok() && cfd->IsDropped()) { + status = + Status::ColumnFamilyDropped("Column family dropped during compaction"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_relaxed)) { + status = Status::ShutdownInProgress("Database shutdown"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + (manual_compaction_canceled_.load(std::memory_order_relaxed))) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + if (status.ok()) { + status = input->status(); + } + if (status.ok()) { + status = c_iter->status(); + } + + // Call FinishCompactionOutputFile() even if status is not ok: it needs to + // close the output files. Open file function is also passed, in case there's + // only range-dels, no file was opened, to save the range-dels, it need to + // create a new output file. 
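+  // [Editor's note] The checks above establish a precedence among failure
+  // causes: column-family drop, then shutdown, then manual-compaction pause,
+  // then input-iterator errors, then compaction-iterator errors. A compact
+  // standalone sketch of the same ordering (names are illustrative, not
+  // RocksDB API):
+#if 0
+  Status ResolveStatus(bool cf_dropped, bool shutting_down, bool manual_paused,
+                       const Status& input_s, const Status& iter_s) {
+    if (cf_dropped) {
+      return Status::ColumnFamilyDropped("Column family dropped");
+    }
+    if (shutting_down) return Status::ShutdownInProgress("Database shutdown");
+    if (manual_paused) {
+      return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+    }
+    if (!input_s.ok()) return input_s;
+    return iter_s;
+  }
+#endif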
+ status = sub_compact->CloseCompactionFiles(status, open_file_func, + close_file_func); + + if (blob_file_builder) { + if (status.ok()) { + status = blob_file_builder->Finish(); + } else { + blob_file_builder->Abandon(status); + } + blob_file_builder.reset(); + sub_compact->Current().UpdateBlobStats(); + } + + sub_compact->compaction_job_stats.cpu_micros = + db_options_.clock->CPUMicros() - prev_cpu_micros; + + if (measure_io_stats_) { + sub_compact->compaction_job_stats.file_write_nanos += + IOSTATS(write_nanos) - prev_write_nanos; + sub_compact->compaction_job_stats.file_fsync_nanos += + IOSTATS(fsync_nanos) - prev_fsync_nanos; + sub_compact->compaction_job_stats.file_range_sync_nanos += + IOSTATS(range_sync_nanos) - prev_range_sync_nanos; + sub_compact->compaction_job_stats.file_prepare_write_nanos += + IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos; + sub_compact->compaction_job_stats.cpu_micros -= + (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos + + IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) / + 1000; + if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) { + SetPerfLevel(prev_perf_level); + } + } +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + if (!status.ok()) { + if (c_iter) { + c_iter->status().PermitUncheckedError(); + } + if (input) { + input->status().PermitUncheckedError(); + } + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + + blob_counter.reset(); + clip.reset(); + raw_input.reset(); + sub_compact->status = status; + NotifyOnSubcompactionCompleted(sub_compact); +} + +uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) const { + return (uint64_t)job_id_ << 32 | sub_compact->sub_job_id; +} + +void CompactionJob::RecordDroppedKeys( + const CompactionIterationStats& c_iter_stats, + CompactionJobStats* compaction_job_stats) { + if (c_iter_stats.num_record_drop_user > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_USER, + c_iter_stats.num_record_drop_user); + } + if (c_iter_stats.num_record_drop_hidden > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, + c_iter_stats.num_record_drop_hidden); + if (compaction_job_stats) { + compaction_job_stats->num_records_replaced += + c_iter_stats.num_record_drop_hidden; + } + } + if (c_iter_stats.num_record_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, + c_iter_stats.num_record_drop_obsolete); + if (compaction_job_stats) { + compaction_job_stats->num_expired_deletion_records += + c_iter_stats.num_record_drop_obsolete; + } + } + if (c_iter_stats.num_record_drop_range_del > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_RANGE_DEL, + c_iter_stats.num_record_drop_range_del); + } + if (c_iter_stats.num_range_del_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE, + c_iter_stats.num_range_del_drop_obsolete); + } + if (c_iter_stats.num_optimized_del_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, + c_iter_stats.num_optimized_del_drop_obsolete); + } +} + +Status CompactionJob::FinishCompactionOutputFile( + const Status& input_status, SubcompactionState* sub_compact, + CompactionOutputs& outputs, const Slice& next_table_min_key, + const Slice* comp_start_user_key, const Slice* comp_end_user_key) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_SYNC_FILE); + assert(sub_compact != nullptr); + assert(outputs.HasBuilder()); + + FileMetaData* meta = outputs.GetMetaData(); + uint64_t output_number = meta->fd.GetNumber(); + assert(output_number != 0); + + ColumnFamilyData* cfd = 
sub_compact->compaction->column_family_data();
+  std::string file_checksum = kUnknownFileChecksum;
+  std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+
+  // Check for iterator errors
+  Status s = input_status;
+
+  // Add range tombstones
+  auto earliest_snapshot = kMaxSequenceNumber;
+  if (existing_snapshots_.size() > 0) {
+    earliest_snapshot = existing_snapshots_[0];
+  }
+  if (s.ok()) {
+    CompactionIterationStats range_del_out_stats;
+    // If the compaction supports per_key_placement, only output range dels to
+    // the penultimate level.
+    // Note: Use `bottommost_level_ = true` for both bottommost and
+    // output_to_penultimate_level compaction here, as it's only used to decide
+    // if range dels could be dropped.
+    if (outputs.HasRangeDel()) {
+      s = outputs.AddRangeDels(comp_start_user_key, comp_end_user_key,
+                               range_del_out_stats, bottommost_level_,
+                               cfd->internal_comparator(), earliest_snapshot,
+                               next_table_min_key, full_history_ts_low_);
+    }
+    RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
+    TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
+  }
+
+  const uint64_t current_entries = outputs.NumEntries();
+
+  s = outputs.Finish(s, seqno_time_mapping_);
+
+  if (s.ok()) {
+    // With accurate smallest and largest keys, we can get a slightly more
+    // accurate oldest ancester time.
+    // This makes the oldest ancester time in the manifest more accurate than
+    // in the table properties. Not sure how to resolve it.
+    if (meta->smallest.size() > 0 && meta->largest.size() > 0) {
+      uint64_t refined_oldest_ancester_time;
+      Slice new_smallest = meta->smallest.user_key();
+      Slice new_largest = meta->largest.user_key();
+      if (!new_largest.empty() && !new_smallest.empty()) {
+        refined_oldest_ancester_time =
+            sub_compact->compaction->MinInputFileOldestAncesterTime(
+                &(meta->smallest), &(meta->largest));
+        if (refined_oldest_ancester_time !=
+            std::numeric_limits<uint64_t>::max()) {
+          meta->oldest_ancester_time = refined_oldest_ancester_time;
+        }
+      }
+    }
+  }
+
+  // Finish and check for file errors
+  IOStatus io_s = outputs.WriterSyncClose(s, db_options_.clock, stats_,
+                                          db_options_.use_fsync);
+
+  if (s.ok() && io_s.ok()) {
+    file_checksum = meta->file_checksum;
+    file_checksum_func_name = meta->file_checksum_func_name;
+  }
+
+  if (s.ok()) {
+    s = io_s;
+  }
+  if (sub_compact->io_status.ok()) {
+    sub_compact->io_status = io_s;
+    // Since this error is really a copy of the
+    // "normal" status, it does not also need to be checked
+    sub_compact->io_status.PermitUncheckedError();
+  }
+
+  TableProperties tp;
+  if (s.ok()) {
+    tp = outputs.GetTableProperties();
+  }
+
+  if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
+    // If there is nothing to output, there is no need to generate an SST
+    // file. This happens when the output level is the bottom level and, at
+    // the same time, the sub_compact outputs nothing.
+    std::string fname =
+        TableFileName(sub_compact->compaction->immutable_options()->cf_paths,
+                      meta->fd.GetNumber(), meta->fd.GetPathId());
+
+    // TODO(AR) it is not clear if there are any larger implications if
+    // DeleteFile fails here
+    Status ds = env_->DeleteFile(fname);
+    if (!ds.ok()) {
+      ROCKS_LOG_WARN(
+          db_options_.info_log,
+          "[%s] [JOB %d] Unable to remove SST file for table #%" PRIu64
+          " at bottom level%s",
+          cfd->GetName().c_str(), job_id_, output_number,
+          meta->marked_for_compaction ? " (need compaction)" : "");
+    }
+
+    // Also need to remove the file from outputs, or it will be added to the
+    // VersionEdit.
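+    // [Editor's note] In short, an output file is kept only if it holds at
+    // least one point entry or range deletion; otherwise the just-created
+    // file is deleted again and dropped from the bookkeeping below. A
+    // hypothetical predicate capturing the decision above:
+#if 0
+    bool KeepOutputFile(uint64_t num_entries, uint64_t num_range_deletions) {
+      return num_entries > 0 || num_range_deletions > 0;
+    }
+#endif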
+ outputs.RemoveLastOutput(); + meta = nullptr; + } + + if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) { + // Output to event logger and fire events. + outputs.UpdateTableProperties(); + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64 + " keys, %" PRIu64 " bytes%s, temperature: %s", + cfd->GetName().c_str(), job_id_, output_number, + current_entries, meta->fd.file_size, + meta->marked_for_compaction ? " (need compaction)" : "", + temperature_to_string[meta->temperature].c_str()); + } + std::string fname; + FileDescriptor output_fd; + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; + Status status_for_listener = s; + if (meta != nullptr) { + fname = GetTableFileName(meta->fd.GetNumber()); + output_fd = meta->fd; + oldest_blob_file_number = meta->oldest_blob_file_number; + } else { + fname = "(nil)"; + if (s.ok()) { + status_for_listener = Status::Aborted("Empty SST file not kept"); + } + } + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, + job_id_, output_fd, oldest_blob_file_number, tp, + TableFileCreationReason::kCompaction, status_for_listener, file_checksum, + file_checksum_func_name); + + // Report new file to SstFileManagerImpl + auto sfm = + static_cast(db_options_.sst_file_manager.get()); + if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) { + Status add_s = sfm->OnAddFile(fname); + if (!add_s.ok() && s.ok()) { + s = add_s; + } + if (sfm->IsMaxAllowedSpaceReached()) { + // TODO(ajkr): should we return OK() if max space was reached by the final + // compaction output file (similarly to how flush works when full)? + s = Status::SpaceLimit("Max allowed space was reached"); + TEST_SYNC_POINT( + "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached"); + InstrumentedMutexLock l(db_mutex_); + db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction); + } + } + + outputs.ResetBuilder(); + return s; +} + +Status CompactionJob::InstallCompactionResults( + const MutableCFOptions& mutable_cf_options) { + assert(compact_); + + db_mutex_->AssertHeld(); + + const ReadOptions read_options(Env::IOActivity::kCompaction); + auto* compaction = compact_->compaction; + assert(compaction); + + { + Compaction::InputLevelSummaryBuffer inputs_summary; + if (compaction_stats_.has_penultimate_level_output) { + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] [JOB %d] Compacted %s => output_to_penultimate_level: %" PRIu64 + " bytes + last: %" PRIu64 " bytes. 
Total: %" PRIu64 " bytes", + compaction->column_family_data()->GetName().c_str(), job_id_, + compaction->InputLevelSummary(&inputs_summary), + compaction_stats_.penultimate_level_stats.bytes_written, + compaction_stats_.stats.bytes_written, + compaction_stats_.TotalBytesWritten()); + } else { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", + compaction->column_family_data()->GetName().c_str(), + job_id_, compaction->InputLevelSummary(&inputs_summary), + compaction_stats_.TotalBytesWritten()); + } + } + + VersionEdit* const edit = compaction->edit(); + assert(edit); + + // Add compaction inputs + compaction->AddInputDeletions(edit); + + std::unordered_map blob_total_garbage; + + for (const auto& sub_compact : compact_->sub_compact_states) { + sub_compact.AddOutputsEdit(edit); + + for (const auto& blob : sub_compact.Current().GetBlobFileAdditions()) { + edit->AddBlobFile(blob); + } + + if (sub_compact.Current().GetBlobGarbageMeter()) { + const auto& flows = sub_compact.Current().GetBlobGarbageMeter()->flows(); + + for (const auto& pair : flows) { + const uint64_t blob_file_number = pair.first; + const BlobGarbageMeter::BlobInOutFlow& flow = pair.second; + + assert(flow.IsValid()); + if (flow.HasGarbage()) { + blob_total_garbage[blob_file_number].Add(flow.GetGarbageCount(), + flow.GetGarbageBytes()); + } + } + } + } + + for (const auto& pair : blob_total_garbage) { + const uint64_t blob_file_number = pair.first; + const BlobGarbageMeter::BlobStats& stats = pair.second; + + edit->AddBlobFileGarbage(blob_file_number, stats.GetCount(), + stats.GetBytes()); + } + + if ((compaction->compaction_reason() == + CompactionReason::kLevelMaxLevelSize || + compaction->compaction_reason() == CompactionReason::kRoundRobinTtl) && + compaction->immutable_options()->compaction_pri == kRoundRobin) { + int start_level = compaction->start_level(); + if (start_level > 0) { + auto vstorage = compaction->input_version()->storage_info(); + edit->AddCompactCursor(start_level, + vstorage->GetNextCompactCursor( + start_level, compaction->num_input_files(0))); + } + } + + return versions_->LogAndApply(compaction->column_family_data(), + mutable_cf_options, read_options, edit, + db_mutex_, db_directory_); +} + +void CompactionJob::RecordCompactionIOStats() { + RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written)); + CompactionReason compaction_reason = + compact_->compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kFilesMarkedForCompaction) { + RecordTick(stats_, COMPACT_READ_BYTES_MARKED, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES_MARKED, IOSTATS(bytes_written)); + } else if (compaction_reason == CompactionReason::kPeriodicCompaction) { + RecordTick(stats_, COMPACT_READ_BYTES_PERIODIC, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES_PERIODIC, IOSTATS(bytes_written)); + } else if (compaction_reason == CompactionReason::kTtl) { + RecordTick(stats_, COMPACT_READ_BYTES_TTL, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES_TTL, IOSTATS(bytes_written)); + } + ThreadStatusUtil::IncreaseThreadOperationProperty( + ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read)); + IOSTATS_RESET(bytes_read); + ThreadStatusUtil::IncreaseThreadOperationProperty( + ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written)); + IOSTATS_RESET(bytes_written); +} + +Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, + 
CompactionOutputs& outputs) { + assert(sub_compact != nullptr); + + // no need to lock because VersionSet::next_file_number_ is atomic + uint64_t file_number = versions_->NewFileNumber(); + std::string fname = GetTableFileName(file_number); + // Fire events. + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + EventHelpers::NotifyTableFileCreationStarted( + cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_, + TableFileCreationReason::kCompaction); + // Make the output file + std::unique_ptr writable_file; +#ifndef NDEBUG + bool syncpoint_arg = file_options_.use_direct_writes; + TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile", + &syncpoint_arg); +#endif + + // Pass temperature of the last level files to FileSystem. + FileOptions fo_copy = file_options_; + Temperature temperature = sub_compact->compaction->output_temperature(); + // only set for the last level compaction and also it's not output to + // penultimate level (when preclude_last_level feature is enabled) + if (temperature == Temperature::kUnknown && + sub_compact->compaction->is_last_level() && + !sub_compact->IsCurrentPenultimateLevel()) { + temperature = + sub_compact->compaction->mutable_cf_options()->last_level_temperature; + } + fo_copy.temperature = temperature; + + Status s; + IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy); + s = io_s; + if (sub_compact->io_status.ok()) { + sub_compact->io_status = io_s; + // Since this error is really a copy of the io_s that is checked below as s, + // it does not also need to be checked. + sub_compact->io_status.PermitUncheckedError(); + } + if (!s.ok()) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64 + " fails at NewWritableFile with status %s", + sub_compact->compaction->column_family_data()->GetName().c_str(), + job_id_, file_number, s.ToString().c_str()); + LogFlush(db_options_.info_log); + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), + fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber, + TableProperties(), TableFileCreationReason::kCompaction, s, + kUnknownFileChecksum, kUnknownFileChecksumFuncName); + return s; + } + + // Try to figure out the output file's oldest ancester time. + int64_t temp_current_time = 0; + auto get_time_status = db_options_.clock->GetCurrentTime(&temp_current_time); + // Safe to proceed even if GetCurrentTime fails. So, log and proceed. + if (!get_time_status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get current time. Status: %s", + get_time_status.ToString().c_str()); + } + uint64_t current_time = static_cast(temp_current_time); + InternalKey tmp_start, tmp_end; + if (sub_compact->start.has_value()) { + tmp_start.SetMinPossibleForUserKey(sub_compact->start.value()); + } + if (sub_compact->end.has_value()) { + tmp_end.SetMinPossibleForUserKey(sub_compact->end.value()); + } + uint64_t oldest_ancester_time = + sub_compact->compaction->MinInputFileOldestAncesterTime( + sub_compact->start.has_value() ? &tmp_start : nullptr, + sub_compact->end.has_value() ? 
&tmp_end : nullptr); + if (oldest_ancester_time == std::numeric_limits::max()) { + oldest_ancester_time = current_time; + } + + // Initialize a SubcompactionState::Output and add it to sub_compact->outputs + uint64_t epoch_number = sub_compact->compaction->MinInputFileEpochNumber(); + { + FileMetaData meta; + meta.fd = FileDescriptor(file_number, + sub_compact->compaction->output_path_id(), 0); + meta.oldest_ancester_time = oldest_ancester_time; + meta.file_creation_time = current_time; + meta.epoch_number = epoch_number; + meta.temperature = temperature; + assert(!db_id_.empty()); + assert(!db_session_id_.empty()); + s = GetSstInternalUniqueId(db_id_, db_session_id_, meta.fd.GetNumber(), + &meta.unique_id); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "[%s] [JOB %d] file #%" PRIu64 + " failed to generate unique id: %s.", + cfd->GetName().c_str(), job_id_, meta.fd.GetNumber(), + s.ToString().c_str()); + return s; + } + + outputs.AddOutput(std::move(meta), cfd->internal_comparator(), + sub_compact->compaction->mutable_cf_options() + ->check_flush_compaction_key_order, + paranoid_file_checks_); + } + + writable_file->SetIOPriority(GetRateLimiterPriority()); + writable_file->SetWriteLifeTimeHint(write_hint_); + FileTypeSet tmp_set = db_options_.checksum_handoff_file_types; + writable_file->SetPreallocationBlockSize(static_cast( + sub_compact->compaction->OutputFilePreallocationSize())); + const auto& listeners = + sub_compact->compaction->immutable_options()->listeners; + outputs.AssignFileWriter(new WritableFileWriter( + std::move(writable_file), fname, fo_copy, db_options_.clock, io_tracer_, + db_options_.stats, listeners, db_options_.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); + + TableBuilderOptions tboptions( + *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()), + cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), + sub_compact->compaction->output_compression(), + sub_compact->compaction->output_compression_opts(), cfd->GetID(), + cfd->GetName(), sub_compact->compaction->output_level(), + bottommost_level_, TableFileCreationReason::kCompaction, + 0 /* oldest_key_time */, current_time, db_id_, db_session_id_, + sub_compact->compaction->max_output_file_size(), file_number); + + outputs.NewBuilder(tboptions); + + LogFlush(db_options_.info_log); + return s; +} + +void CompactionJob::CleanupCompaction() { + for (SubcompactionState& sub_compact : compact_->sub_compact_states) { + sub_compact.Cleanup(table_cache_.get()); + } + delete compact_; + compact_ = nullptr; +} + +namespace { +void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) { + assert(prefix_length > 0); + size_t length = src.size() > prefix_length ? 
prefix_length : src.size(); + dst->assign(src.data(), length); +} +} // namespace + + +void CompactionJob::UpdateCompactionStats() { + assert(compact_); + + Compaction* compaction = compact_->compaction; + compaction_stats_.stats.num_input_files_in_non_output_levels = 0; + compaction_stats_.stats.num_input_files_in_output_level = 0; + for (int input_level = 0; + input_level < static_cast(compaction->num_input_levels()); + ++input_level) { + if (compaction->level(input_level) != compaction->output_level()) { + UpdateCompactionInputStatsHelper( + &compaction_stats_.stats.num_input_files_in_non_output_levels, + &compaction_stats_.stats.bytes_read_non_output_levels, input_level); + } else { + UpdateCompactionInputStatsHelper( + &compaction_stats_.stats.num_input_files_in_output_level, + &compaction_stats_.stats.bytes_read_output_level, input_level); + } + } + + assert(compaction_job_stats_); + compaction_stats_.stats.bytes_read_blob = + compaction_job_stats_->total_blob_bytes_read; + + compaction_stats_.stats.num_dropped_records = + compaction_stats_.DroppedRecords(); +} + +void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files, + uint64_t* bytes_read, + int input_level) { + const Compaction* compaction = compact_->compaction; + auto num_input_files = compaction->num_input_files(input_level); + *num_files += static_cast(num_input_files); + + for (size_t i = 0; i < num_input_files; ++i) { + const auto* file_meta = compaction->input(input_level, i); + *bytes_read += file_meta->fd.GetFileSize(); + compaction_stats_.stats.num_input_records += + static_cast(file_meta->num_entries); + } +} + +void CompactionJob::UpdateCompactionJobStats( + const InternalStats::CompactionStats& stats) const { + compaction_job_stats_->elapsed_micros = stats.micros; + + // input information + compaction_job_stats_->total_input_bytes = + stats.bytes_read_non_output_levels + stats.bytes_read_output_level; + compaction_job_stats_->num_input_records = stats.num_input_records; + compaction_job_stats_->num_input_files = + stats.num_input_files_in_non_output_levels + + stats.num_input_files_in_output_level; + compaction_job_stats_->num_input_files_at_output_level = + stats.num_input_files_in_output_level; + + // output information + compaction_job_stats_->total_output_bytes = stats.bytes_written; + compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob; + compaction_job_stats_->num_output_records = stats.num_output_records; + compaction_job_stats_->num_output_files = stats.num_output_files; + compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob; + + if (stats.num_output_files > 0) { + CopyPrefix(compact_->SmallestUserKey(), + CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->smallest_output_key_prefix); + CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->largest_output_key_prefix); + } +} + +void CompactionJob::LogCompaction() { + Compaction* compaction = compact_->compaction; + ColumnFamilyData* cfd = compaction->column_family_data(); + + // Let's check if anything will get logged. 
Don't prepare all the info if + // we're not logging + if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) { + Compaction::InputLevelSummaryBuffer inputs_summary; + ROCKS_LOG_INFO( + db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f", + cfd->GetName().c_str(), job_id_, + compaction->InputLevelSummary(&inputs_summary), compaction->score()); + char scratch[2345]; + compaction->Summary(scratch, sizeof(scratch)); + ROCKS_LOG_INFO(db_options_.info_log, "[%s]: Compaction start summary: %s\n", + cfd->GetName().c_str(), scratch); + // build event logger report + auto stream = event_logger_->Log(); + stream << "job" << job_id_ << "event" + << "compaction_started" + << "compaction_reason" + << GetCompactionReasonString(compaction->compaction_reason()); + for (size_t i = 0; i < compaction->num_input_levels(); ++i) { + stream << ("files_L" + std::to_string(compaction->level(i))); + stream.StartArray(); + for (auto f : *compaction->inputs(i)) { + stream << f->fd.GetNumber(); + } + stream.EndArray(); + } + stream << "score" << compaction->score() << "input_data_size" + << compaction->CalculateTotalInputSize() << "oldest_snapshot_seqno" + << (existing_snapshots_.empty() + ? int64_t{-1} // Use -1 for "none" + : static_cast(existing_snapshots_[0])); + if (compaction->SupportsPerKeyPlacement()) { + stream << "preclude_last_level_min_seqno" + << preclude_last_level_min_seqno_; + stream << "penultimate_output_level" << compaction->GetPenultimateLevel(); + stream << "penultimate_output_range" + << GetCompactionPenultimateOutputRangeTypeString( + compaction->GetPenultimateOutputRangeType()); + + if (compaction->GetPenultimateOutputRangeType() == + Compaction::PenultimateOutputRangeType::kDisabled) { + ROCKS_LOG_WARN( + db_options_.info_log, + "[%s] [JOB %d] Penultimate level output is disabled, likely " + "because of the range conflict in the penultimate level", + cfd->GetName().c_str(), job_id_); + } + } + } +} + +std::string CompactionJob::GetTableFileName(uint64_t file_number) { + return TableFileName(compact_->compaction->immutable_options()->cf_paths, + file_number, compact_->compaction->output_path_id()); +} + +Env::IOPriority CompactionJob::GetRateLimiterPriority() { + if (versions_ && versions_->GetColumnFamilySet() && + versions_->GetColumnFamilySet()->write_controller()) { + WriteController* write_controller = + versions_->GetColumnFamilySet()->write_controller(); + if (write_controller->NeedsDelay() || write_controller->IsStopped()) { + return Env::IO_USER; + } + } + + return Env::IO_LOW; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_job.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_job.h new file mode 100644 index 0000000000..a930c15f1f --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_job.h @@ -0,0 +1,504 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
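+
+// [Editor's note] The GetRateLimiterPriority() definition at the end of the
+// preceding compaction_job.cc escalates compaction I/O from background to
+// user priority whenever the write controller reports a delay or stop, so
+// stalled writes are unblocked sooner. A standalone sketch of that policy
+// (the WriteControllerLike type here is hypothetical):
+#if 0
+struct WriteControllerLike {
+  bool NeedsDelay() const;
+  bool IsStopped() const;
+};
+inline Env::IOPriority PickCompactionIOPriority(
+    const WriteControllerLike* wc) {
+  // When writes are already stalled, finish compactions as fast as possible;
+  // otherwise keep compaction I/O at low (background) priority.
+  if (wc != nullptr && (wc->NeedsDelay() || wc->IsStopped())) {
+    return Env::IO_USER;
+  }
+  return Env::IO_LOW;
+}
+#endif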
+#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/blob/blob_file_completion_callback.h" +#include "db/column_family.h" +#include "db/compaction/compaction_iterator.h" +#include "db/compaction/compaction_outputs.h" +#include "db/flush_scheduler.h" +#include "db/internal_stats.h" +#include "db/job_context.h" +#include "db/log_writer.h" +#include "db/memtable_list.h" +#include "db/range_del_aggregator.h" +#include "db/seqno_to_time_mapping.h" +#include "db/version_edit.h" +#include "db/write_controller.h" +#include "db/write_thread.h" +#include "logging/event_logger.h" +#include "options/cf_options.h" +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/compaction_job_stats.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/transaction_log.h" +#include "table/scoped_arena_iterator.h" +#include "util/autovector.h" +#include "util/stop_watch.h" +#include "util/thread_local.h" + +namespace ROCKSDB_NAMESPACE { + +class Arena; +class CompactionState; +class ErrorHandler; +class MemTable; +class SnapshotChecker; +class SystemClock; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; + +class SubcompactionState; + +// CompactionJob is responsible for executing the compaction. Each (manual or +// automated) compaction corresponds to a CompactionJob object, and usually +// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob +// will divide the compaction into subcompactions and execute them in parallel +// if needed. +// +// CompactionJob has 2 main stats: +// 1. CompactionJobStats compaction_job_stats_ +// CompactionJobStats is a public data structure which is part of Compaction +// event listener that rocksdb share the job stats with the user. +// Internally it's an aggregation of all the compaction_job_stats from each +// `SubcompactionState`: +// +------------------------+ +// | SubcompactionState | +// | | +// +--------->| compaction_job_stats | +// | | | +// | +------------------------+ +// +------------------------+ | +// | CompactionJob | | +------------------------+ +// | | | | SubcompactionState | +// | compaction_job_stats +-----+ | | +// | | +--------->| compaction_job_stats | +// | | | | | +// +------------------------+ | +------------------------+ +// | +// | +------------------------+ +// | | SubcompactionState | +// | | | +// +--------->+ compaction_job_stats | +// | | | +// | +------------------------+ +// | +// | +------------------------+ +// | | ... | +// +--------->+ | +// +------------------------+ +// +// 2. CompactionStatsFull compaction_stats_ +// `CompactionStatsFull` is an internal stats about the compaction, which +// is eventually sent to `ColumnFamilyData::internal_stats_` and used for +// logging and public metrics. +// Internally, it's an aggregation of stats_ from each `SubcompactionState`. +// It has 2 parts, normal stats about the main compaction information and +// the penultimate level output stats. +// `SubcompactionState` maintains the CompactionOutputs for normal output and +// the penultimate level output if exists, the per_level stats is +// stored with the outputs. 
+// +---------------------------+ +// | SubcompactionState | +// | | +// | +----------------------+ | +// | | CompactionOutputs | | +// | | (normal output) | | +// +---->| stats_ | | +// | | +----------------------+ | +// | | | +// | | +----------------------+ | +// +--------------------------------+ | | | CompactionOutputs | | +// | CompactionJob | | | | (penultimate_level) | | +// | | +--------->| stats_ | | +// | compaction_stats_ | | | | +----------------------+ | +// | +-------------------------+ | | | | | +// | |stats (normal) |------|----+ +---------------------------+ +// | +-------------------------+ | | | +// | | | | +// | +-------------------------+ | | | +---------------------------+ +// | |penultimate_level_stats +------+ | | SubcompactionState | +// | +-------------------------+ | | | | | +// | | | | | +----------------------+ | +// | | | | | | CompactionOutputs | | +// +--------------------------------+ | | | | (normal output) | | +// | +---->| stats_ | | +// | | +----------------------+ | +// | | | +// | | +----------------------+ | +// | | | CompactionOutputs | | +// | | | (penultimate_level) | | +// +--------->| stats_ | | +// | +----------------------+ | +// | | +// +---------------------------+ + +class CompactionJob { + public: + CompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic* shutting_down, LogBuffer* log_buffer, + FSDirectory* db_directory, FSDirectory* output_directory, + FSDirectory* blob_output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, JobContext* job_context, + std::shared_ptr table_cache, EventLogger* event_logger, + bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri, const std::shared_ptr& io_tracer, + const std::atomic& manual_compaction_canceled, + const std::string& db_id = "", const std::string& db_session_id = "", + std::string full_history_ts_low = "", std::string trim_ts = "", + BlobFileCompletionCallback* blob_callback = nullptr, + int* bg_compaction_scheduled = nullptr, + int* bg_bottom_compaction_scheduled = nullptr); + + virtual ~CompactionJob(); + + // no copy/move + CompactionJob(CompactionJob&& job) = delete; + CompactionJob(const CompactionJob& job) = delete; + CompactionJob& operator=(const CompactionJob& job) = delete; + + // REQUIRED: mutex held + // Prepare for the compaction by setting up boundaries for each subcompaction + void Prepare(); + // REQUIRED mutex not held + // Launch threads for each subcompaction and wait for them to finish. After + // that, verify table is usable and finally do bookkeeping to unify + // subcompaction results + Status Run(); + + // REQUIRED: mutex held + // Add compaction input/output to the current version + Status Install(const MutableCFOptions& mutable_cf_options); + + // Return the IO status + IOStatus io_status() const { return io_status_; } + + protected: + void UpdateCompactionStats(); + void LogCompaction(); + virtual void RecordCompactionIOStats(); + void CleanupCompaction(); + + // Call compaction filter. 
Then iterate through input and compact the + // kv-pairs + void ProcessKeyValueCompaction(SubcompactionState* sub_compact); + + CompactionState* compact_; + InternalStats::CompactionStatsFull compaction_stats_; + const ImmutableDBOptions& db_options_; + const MutableDBOptions mutable_db_options_copy_; + LogBuffer* log_buffer_; + FSDirectory* output_directory_; + Statistics* stats_; + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + + Env::WriteLifeTimeHint write_hint_; + + IOStatus io_status_; + + CompactionJobStats* compaction_job_stats_; + + private: + friend class CompactionJobTestBase; + + // Generates a histogram representing potential divisions of key ranges from + // the input. It adds the starting and/or ending keys of certain input files + // to the working set and then finds the approximate size of data in between + // each consecutive pair of slices. Then it divides these ranges into + // consecutive groups such that each group has a similar size. + void GenSubcompactionBoundaries(); + + // Get the number of planned subcompactions based on max_subcompactions and + // extra reserved resources + uint64_t GetSubcompactionsLimit(); + + // Additional reserved threads are reserved and the number is stored in + // extra_num_subcompaction_threads_reserved__. For now, this happens only if + // the compaction priority is round-robin and max_subcompactions is not + // sufficient (extra resources may be needed) + void AcquireSubcompactionResources(int num_extra_required_subcompactions); + + // Additional threads may be reserved during IncreaseSubcompactionResources() + // if num_actual_subcompactions is less than num_planned_subcompactions. + // Additional threads will be released and the bg_compaction_scheduled_ or + // bg_bottom_compaction_scheduled_ will be updated if they are used. + // DB Mutex lock is required. + void ShrinkSubcompactionResources(uint64_t num_extra_resources); + + // Release all reserved threads and update the compaction limits. + void ReleaseSubcompactionResources(); + + CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService( + SubcompactionState* sub_compact); + + // update the thread status for starting a compaction. 
+ void ReportStartedCompaction(Compaction* compaction); + + Status FinishCompactionOutputFile(const Status& input_status, + SubcompactionState* sub_compact, + CompactionOutputs& outputs, + const Slice& next_table_min_key, + const Slice* comp_start_user_key, + const Slice* comp_end_user_key); + Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); + Status OpenCompactionOutputFile(SubcompactionState* sub_compact, + CompactionOutputs& outputs); + void UpdateCompactionJobStats( + const InternalStats::CompactionStats& stats) const; + void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, + CompactionJobStats* compaction_job_stats = nullptr); + + void UpdateCompactionInputStatsHelper(int* num_files, uint64_t* bytes_read, + int input_level); + + void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact); + + void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact); + + uint32_t job_id_; + + // DBImpl state + const std::string& dbname_; + const std::string db_id_; + const std::string db_session_id_; + const FileOptions file_options_; + + Env* env_; + std::shared_ptr io_tracer_; + FileSystemPtr fs_; + // env_option optimized for compaction table reads + FileOptions file_options_for_read_; + VersionSet* versions_; + const std::atomic* shutting_down_; + const std::atomic& manual_compaction_canceled_; + FSDirectory* db_directory_; + FSDirectory* blob_output_directory_; + InstrumentedMutex* db_mutex_; + ErrorHandler* db_error_handler_; + // If there were two snapshots with seq numbers s1 and + // s2 and s1 < s2, and if we find two instances of a key k1 then lies + // entirely within s1 and s2, then the earlier version of k1 can be safely + // deleted because that version is not visible in any snapshot. + std::vector existing_snapshots_; + + // This is the earliest snapshot that could be used for write-conflict + // checking by a transaction. For any user-key newer than this snapshot, we + // should make sure not to remove evidence that a write occurred. + SequenceNumber earliest_write_conflict_snapshot_; + + const SnapshotChecker* const snapshot_checker_; + + JobContext* job_context_; + + std::shared_ptr table_cache_; + + EventLogger* event_logger_; + + bool paranoid_file_checks_; + bool measure_io_stats_; + // Stores the Slices that designate the boundaries for each subcompaction + std::vector boundaries_; + Env::Priority thread_pri_; + std::string full_history_ts_low_; + std::string trim_ts_; + BlobFileCompletionCallback* blob_callback_; + + uint64_t GetCompactionId(SubcompactionState* sub_compact) const; + // Stores the number of reserved threads in shared env_ for the number of + // extra subcompaction in kRoundRobin compaction priority + int extra_num_subcompaction_threads_reserved_; + + // Stores the pointer to bg_compaction_scheduled_, + // bg_bottom_compaction_scheduled_ in DBImpl. Mutex is required when accessing + // or updating it. + int* bg_compaction_scheduled_; + int* bg_bottom_compaction_scheduled_; + + // Stores the sequence number to time mapping gathered from all input files + // it also collects the smallest_seqno -> oldest_ancester_time from the SST. + SeqnoToTimeMapping seqno_time_mapping_; + + // Minimal sequence number for preserving the time information. The time info + // older than this sequence number won't be preserved after the compaction and + // if it's bottommost compaction, the seq num will be zeroed out. 
+  SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber;
+
+  // Minimal sequence number to preclude the data from the last level. If a
+  // key has a bigger (newer) sequence number than this, it will be precluded
+  // from the last level (output to the penultimate level instead).
+  SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+
+  // Get the table file name in the directory it's outputting to, which
+  // should also be in `output_directory_`.
+  virtual std::string GetTableFileName(uint64_t file_number);
+  // The rate limiter priority (io_priority) is determined dynamically here:
+  // compaction reads and writes share the same priority, which varies with
+  // conditions such as a write stall.
+  Env::IOPriority GetRateLimiterPriority();
+};
+
+// CompactionServiceInput is used to pass compaction information between two
+// db instances. It contains the information needed to do a compaction. It
+// doesn't contain the LSM tree information, which is passed through the
+// MANIFEST file.
+struct CompactionServiceInput {
+  ColumnFamilyDescriptor column_family;
+
+  DBOptions db_options;
+
+  std::vector<SequenceNumber> snapshots;
+
+  // SST files for compaction; it should already be expanded to include all
+  // the files needed for this compaction, for both input level files and
+  // output level files.
+  std::vector<std::string> input_files;
+  int output_level;
+
+  // db_id is used to generate a unique id of the sst on the remote compactor
+  std::string db_id;
+
+  // information for subcompaction
+  bool has_begin = false;
+  std::string begin;
+  bool has_end = false;
+  std::string end;
+
+  // serialization interface to read and write the object
+  static Status Read(const std::string& data_str, CompactionServiceInput* obj);
+  Status Write(std::string* output);
+
+  // Initialize a dummy ColumnFamilyDescriptor
+  CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {}
+
+#ifndef NDEBUG
+  bool TEST_Equals(CompactionServiceInput* other);
+  bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch);
+#endif  // NDEBUG
+};
+
+// CompactionServiceOutputFile is the metadata for the output SST file
+struct CompactionServiceOutputFile {
+  std::string file_name;
+  SequenceNumber smallest_seqno;
+  SequenceNumber largest_seqno;
+  std::string smallest_internal_key;
+  std::string largest_internal_key;
+  uint64_t oldest_ancester_time;
+  uint64_t file_creation_time;
+  uint64_t epoch_number;
+  uint64_t paranoid_hash;
+  bool marked_for_compaction;
+  UniqueId64x2 unique_id;
+
+  CompactionServiceOutputFile() = default;
+  CompactionServiceOutputFile(
+      const std::string& name, SequenceNumber smallest, SequenceNumber largest,
+      std::string _smallest_internal_key, std::string _largest_internal_key,
+      uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+      uint64_t _epoch_number, uint64_t _paranoid_hash,
+      bool _marked_for_compaction, UniqueId64x2 _unique_id)
+      : file_name(name),
+        smallest_seqno(smallest),
+        largest_seqno(largest),
+        smallest_internal_key(std::move(_smallest_internal_key)),
+        largest_internal_key(std::move(_largest_internal_key)),
+        oldest_ancester_time(_oldest_ancester_time),
+        file_creation_time(_file_creation_time),
+        epoch_number(_epoch_number),
+        paranoid_hash(_paranoid_hash),
+        marked_for_compaction(_marked_for_compaction),
+        unique_id(std::move(_unique_id)) {}
+};
+
+// CompactionServiceResult contains the compaction result from a different db
+// instance; with this information, the primary db instance with write
+// permission is able to install the result to the DB.
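+//
+// [Editor's note] A hedged sketch of the intended round trip between the
+// primary instance and a remote compaction worker, using only the
+// Read()/Write() serialization hooks declared on these structs. The
+// RunRemoteCompaction and SendToWorkerAndWait names are hypothetical; the
+// transport is out of scope here.
+#if 0
+Status RunRemoteCompaction(CompactionServiceInput& input,
+                           CompactionServiceResult* result) {
+  std::string request;
+  Status s = input.Write(&request);  // serialize the compaction description
+  if (!s.ok()) return s;
+  // The worker runs CompactionServiceCompactionJob::Run() and serializes a
+  // CompactionServiceResult; the primary deserializes and installs it.
+  std::string response = SendToWorkerAndWait(request);  // hypothetical RPC
+  return CompactionServiceResult::Read(response, result);
+}
+#endif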
+struct CompactionServiceResult {
+  Status status;
+  std::vector<CompactionServiceOutputFile> output_files;
+  int output_level;
+
+  // location of the output files
+  std::string output_path;
+
+  // some statistics about the compaction
+  uint64_t num_output_records = 0;
+  uint64_t total_bytes = 0;
+  uint64_t bytes_read = 0;
+  uint64_t bytes_written = 0;
+  CompactionJobStats stats;
+
+  // serialization interface to read and write the object
+  static Status Read(const std::string& data_str,
+                     CompactionServiceResult* obj);
+  Status Write(std::string* output);
+
+#ifndef NDEBUG
+  bool TEST_Equals(CompactionServiceResult* other);
+  bool TEST_Equals(CompactionServiceResult* other, std::string* mismatch);
+#endif  // NDEBUG
+};
+
+// CompactionServiceCompactionJob is a read-only compaction job: it takes
+// input information from `compaction_service_input`, puts result information
+// in `compaction_service_result`, and generates the SST files under
+// `output_path`.
+class CompactionServiceCompactionJob : private CompactionJob {
+ public:
+  CompactionServiceCompactionJob(
+      int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+      const MutableDBOptions& mutable_db_options,
+      const FileOptions& file_options, VersionSet* versions,
+      const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+      FSDirectory* output_directory, Statistics* stats,
+      InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+      std::vector<SequenceNumber> existing_snapshots,
+      std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+      const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+      const std::atomic<bool>& manual_compaction_canceled,
+      const std::string& db_id, const std::string& db_session_id,
+      std::string output_path,
+      const CompactionServiceInput& compaction_service_input,
+      CompactionServiceResult* compaction_service_result);
+
+  // Run the compaction in the current thread and return the result
+  Status Run();
+
+  void CleanupCompaction();
+
+  IOStatus io_status() const { return CompactionJob::io_status(); }
+
+ protected:
+  void RecordCompactionIOStats() override;
+
+ private:
+  // Get the table file name in output_path
+  std::string GetTableFileName(uint64_t file_number) override;
+  // Specifies the compaction output path; otherwise it uses the default DB
+  // path
+  const std::string output_path_;
+
+  // Compaction job input
+  const CompactionServiceInput& compaction_input_;
+
+  // Compaction job result
+  CompactionServiceResult* compaction_result_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_outputs.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_outputs.cc
new file mode 100644
index 0000000000..f6e0dbd7d4
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_outputs.cc
@@ -0,0 +1,784 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
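+//
+// [Editor's note] CompactionOutputs::Finish() below follows the usual
+// TableBuilder contract: exactly one of Finish() or Abandon() is called
+// before the builder is destroyed, depending on whether the input status is
+// OK. A minimal sketch of that pattern (the BuilderLike type is
+// hypothetical):
+#if 0
+template <typename BuilderLike>
+Status FinishOrAbandon(BuilderLike* builder, const Status& input_status) {
+  Status s = input_status;
+  if (s.ok()) {
+    s = builder->Finish();  // writes the footer and flushes pending blocks
+  } else {
+    builder->Abandon();  // releases resources without producing a table
+  }
+  return s;
+}
+#endif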
+ +#include "db/compaction/compaction_outputs.h" + +#include "db/builder.h" + +namespace ROCKSDB_NAMESPACE { + +void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) { + builder_.reset(NewTableBuilder(tboptions, file_writer_.get())); +} + +Status CompactionOutputs::Finish(const Status& intput_status, + const SeqnoToTimeMapping& seqno_time_mapping) { + FileMetaData* meta = GetMetaData(); + assert(meta != nullptr); + Status s = intput_status; + if (s.ok()) { + std::string seqno_time_mapping_str; + seqno_time_mapping.Encode(seqno_time_mapping_str, meta->fd.smallest_seqno, + meta->fd.largest_seqno, meta->file_creation_time); + builder_->SetSeqnoTimeTableProperties(seqno_time_mapping_str, + meta->oldest_ancester_time); + s = builder_->Finish(); + + } else { + builder_->Abandon(); + } + Status io_s = builder_->io_status(); + if (s.ok()) { + s = io_s; + } else { + io_s.PermitUncheckedError(); + } + const uint64_t current_bytes = builder_->FileSize(); + if (s.ok()) { + meta->fd.file_size = current_bytes; + meta->tail_size = builder_->GetTailSize(); + meta->marked_for_compaction = builder_->NeedCompact(); + } + current_output().finished = true; + stats_.bytes_written += current_bytes; + stats_.num_output_files = outputs_.size(); + + return s; +} + +IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status, + SystemClock* clock, + Statistics* statistics, + bool use_fsync) { + IOStatus io_s; + if (input_status.ok()) { + StopWatch sw(clock, statistics, COMPACTION_OUTFILE_SYNC_MICROS); + io_s = file_writer_->Sync(use_fsync); + } + if (input_status.ok() && io_s.ok()) { + io_s = file_writer_->Close(); + } + + if (input_status.ok() && io_s.ok()) { + FileMetaData* meta = GetMetaData(); + meta->file_checksum = file_writer_->GetFileChecksum(); + meta->file_checksum_func_name = file_writer_->GetFileChecksumFuncName(); + } + + file_writer_.reset(); + + return io_s; +} + +bool CompactionOutputs::UpdateFilesToCutForTTLStates( + const Slice& internal_key) { + if (!files_to_cut_for_ttl_.empty()) { + const InternalKeyComparator* icmp = + &compaction_->column_family_data()->internal_comparator(); + if (cur_files_to_cut_for_ttl_ != -1) { + // Previous key is inside the range of a file + if (icmp->Compare(internal_key, + files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_] + ->largest.Encode()) > 0) { + next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1; + cur_files_to_cut_for_ttl_ = -1; + return true; + } + } else { + // Look for the key position + while (next_files_to_cut_for_ttl_ < + static_cast(files_to_cut_for_ttl_.size())) { + if (icmp->Compare(internal_key, + files_to_cut_for_ttl_[next_files_to_cut_for_ttl_] + ->smallest.Encode()) >= 0) { + if (icmp->Compare(internal_key, + files_to_cut_for_ttl_[next_files_to_cut_for_ttl_] + ->largest.Encode()) <= 0) { + // With in the current file + cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_; + return true; + } + // Beyond the current file + next_files_to_cut_for_ttl_++; + } else { + // Still fall into the gap + break; + } + } + } + } + return false; +} + +size_t CompactionOutputs::UpdateGrandparentBoundaryInfo( + const Slice& internal_key) { + size_t curr_key_boundary_switched_num = 0; + const std::vector& grandparents = compaction_->grandparents(); + + if (grandparents.empty()) { + return curr_key_boundary_switched_num; + } + assert(!internal_key.empty()); + InternalKey ikey; + ikey.DecodeFrom(internal_key); + assert(ikey.Valid()); + + const Comparator* ucmp = compaction_->column_family_data()->user_comparator(); + + // Move 
+  // Move the grandparent_index_ to the file containing the current user_key.
+  // If there are multiple files containing the same user_key, make sure the
+  // index points to the last file containing the key.
+  while (grandparent_index_ < grandparents.size()) {
+    if (being_grandparent_gap_) {
+      if (sstableKeyCompare(ucmp, ikey,
+                            grandparents[grandparent_index_]->smallest) < 0) {
+        break;
+      }
+      if (seen_key_) {
+        curr_key_boundary_switched_num++;
+        grandparent_overlapped_bytes_ +=
+            grandparents[grandparent_index_]->fd.GetFileSize();
+        grandparent_boundary_switched_num_++;
+      }
+      being_grandparent_gap_ = false;
+    } else {
+      int cmp_result = sstableKeyCompare(
+          ucmp, ikey, grandparents[grandparent_index_]->largest);
+      // If it's the same key, make sure grandparent_index_ is pointing to the
+      // last one.
+      if (cmp_result < 0 ||
+          (cmp_result == 0 &&
+           (grandparent_index_ == grandparents.size() - 1 ||
+            sstableKeyCompare(ucmp, ikey,
+                              grandparents[grandparent_index_ + 1]->smallest) <
+                0))) {
+        break;
+      }
+      if (seen_key_) {
+        curr_key_boundary_switched_num++;
+        grandparent_boundary_switched_num_++;
+      }
+      being_grandparent_gap_ = true;
+      grandparent_index_++;
+    }
+  }
+
+  // If the first key is in the middle of a grandparent file, add it to the
+  // overlap
+  if (!seen_key_ && !being_grandparent_gap_) {
+    assert(grandparent_overlapped_bytes_ == 0);
+    grandparent_overlapped_bytes_ =
+        GetCurrentKeyGrandparentOverlappedBytes(internal_key);
+  }
+
+  seen_key_ = true;
+  return curr_key_boundary_switched_num;
+}
+
+uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
+    const Slice& internal_key) const {
+  // no overlap with any grandparent file
+  if (being_grandparent_gap_) {
+    return 0;
+  }
+  uint64_t overlapped_bytes = 0;
+
+  const std::vector<FileMetaData*>& grandparents = compaction_->grandparents();
+  const Comparator* ucmp = compaction_->column_family_data()->user_comparator();
+  InternalKey ikey;
+  ikey.DecodeFrom(internal_key);
+#ifndef NDEBUG
+  // make sure the grandparent_index_ is pointing to the last file containing
+  // the current key.
+  int cmp_result =
+      sstableKeyCompare(ucmp, ikey, grandparents[grandparent_index_]->largest);
+  assert(
+      cmp_result < 0 ||
+      (cmp_result == 0 &&
+       (grandparent_index_ == grandparents.size() - 1 ||
+        sstableKeyCompare(
+            ucmp, ikey, grandparents[grandparent_index_ + 1]->smallest) < 0)));
+  assert(sstableKeyCompare(ucmp, ikey,
+                           grandparents[grandparent_index_]->smallest) >= 0);
+#endif
+  overlapped_bytes += grandparents[grandparent_index_]->fd.GetFileSize();
+
+  // go backwards to find all overlapped files, one key can overlap multiple
+  // files. In the following example, if the current output key is `c`, and one
+  // compaction file was cut before `c`, current `c` can overlap with 3 files:
+  //  [a b]          [c...
+  //  [b, b] [c, c]  [c, c] [c, d]
+  for (int64_t i = static_cast<int64_t>(grandparent_index_) - 1;
+       i >= 0 && sstableKeyCompare(ucmp, ikey, grandparents[i]->largest) == 0;
+       i--) {
+    overlapped_bytes += grandparents[i]->fd.GetFileSize();
+  }
+
+  return overlapped_bytes;
+}
+
+bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
+  assert(c_iter.Valid());
+  const Slice& internal_key = c_iter.key();
+#ifndef NDEBUG
+  bool should_stop = false;
+  std::pair<bool*, const Slice> p{&should_stop, internal_key};
+  TEST_SYNC_POINT_CALLBACK(
+      "CompactionOutputs::ShouldStopBefore::manual_decision", (void*)&p);
+  if (should_stop) {
+    return true;
+  }
+#endif  // NDEBUG
+  const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
+  const InternalKeyComparator* icmp =
+      &compaction_->column_family_data()->internal_comparator();
+  size_t num_grandparent_boundaries_crossed = 0;
+  bool should_stop_for_ttl = false;
+  // Always update grandparent information like overlapped file number, size
+  // etc., and TTL states.
+  // If compaction_->output_level() == 0, there is no need to update grandparent
+  // info, and `grandparents` should be empty.
+  if (compaction_->output_level() > 0) {
+    num_grandparent_boundaries_crossed =
+        UpdateGrandparentBoundaryInfo(internal_key);
+    should_stop_for_ttl = UpdateFilesToCutForTTLStates(internal_key);
+  }
+
+  if (!HasBuilder()) {
+    return false;
+  }
+
+  if (should_stop_for_ttl) {
+    return true;
+  }
+
+  // If there's a user-defined partitioner, check that first
+  if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest(
+                          last_key_for_partitioner_, c_iter.user_key(),
+                          current_output_file_size_)) == kRequired) {
+    return true;
+  }
+
+  // files output to Level 0 won't be split
+  if (compaction_->output_level() == 0) {
+    return false;
+  }
+
+  // reached the max file size
+  if (current_output_file_size_ >= compaction_->max_output_file_size()) {
+    return true;
+  }
+
+  // Check if it needs to split for RoundRobin
+  // Invalid local_output_split_key indicates that we do not need to split
+  if (local_output_split_key_ != nullptr && !is_split_) {
+    // Split occurs when the next key is larger than/equal to the cursor
+    if (icmp->Compare(internal_key, local_output_split_key_->Encode()) >= 0) {
+      is_split_ = true;
+      return true;
+    }
+  }
+
+  // only check if the current key is going to cross the grandparent file
+  // boundary (either the file beginning or ending).
+  if (num_grandparent_boundaries_crossed > 0) {
+    // Cut the file before the current key if the size of the current output
+    // file plus its overlapped grandparent files is bigger than
+    // max_compaction_bytes. This prevents a future compaction bigger than
+    // max_compaction_bytes from the current output level.
+    if (grandparent_overlapped_bytes_ + current_output_file_size_ >
+        compaction_->max_compaction_bytes()) {
+      return true;
+    }
+
+    // Cut the file if including the key is going to add a skippable file on
+    // the grandparent level AND its size is reasonably big (1/8 of target file
+    // size). For example, if it's compacting the files L0 + L1:
+    //   L0:  [1,   21]
+    //   L1:    [3,   23]
+    //   L2: [2, 4] [11, 15] [22, 24]
+    // Without this break, it will output as:
+    //   L1: [1,3, 21,23]
+    // With this break, it will output as (assuming [11, 15] at L2 is bigger
+    // than 1/8 of target size):
+    //   L1: [1,3] [21,23]
+    // Then for the future compactions, [11,15] won't be included.
+    // For random datasets (either evenly distributed or skewed), it rarely
+    // triggers this condition, but if the user is adding 2 different datasets
+    // without any overlap, it is likely to happen.
+    // For more details, see PR #1963.
+    const size_t num_skippable_boundaries_crossed =
+        being_grandparent_gap_ ? 2 : 3;
+    if (compaction_->immutable_options()->compaction_style ==
+            kCompactionStyleLevel &&
+        compaction_->immutable_options()->level_compaction_dynamic_file_size &&
+        num_grandparent_boundaries_crossed >=
+            num_skippable_boundaries_crossed &&
+        grandparent_overlapped_bytes_ - previous_overlapped_bytes >
+            compaction_->target_output_file_size() / 8) {
+      return true;
+    }
+
+    // Pre-cut the output file if it's reaching a certain size AND it's at the
+    // boundary of a grandparent file. It can reduce future compaction sizes,
+    // at the cost of having smaller files.
+    // The pre-cut size threshold is based on how many grandparent boundaries
+    // it has seen before. Basically, if it has seen no boundary at all, then
+    // it will pre-cut at 50% of the target file size. Every boundary it has
+    // seen increases the threshold by 5%, capped at 90%, at which it will
+    // always cut. The idea is that if it has seen more boundaries before, it
+    // is more likely to see another boundary (a file-cutting opportunity)
+    // before reaching the target file size. Testing shows it can generate
+    // larger files than a static threshold like 75% and has a similar
+    // write-amplification improvement. (A standalone restatement of this
+    // threshold appears later in this patch.)
+    if (compaction_->immutable_options()->compaction_style ==
+            kCompactionStyleLevel &&
+        compaction_->immutable_options()->level_compaction_dynamic_file_size &&
+        current_output_file_size_ >=
+            ((compaction_->target_output_file_size() + 99) / 100) *
+                (50 + std::min(grandparent_boundary_switched_num_ * 5,
+                               size_t{40}))) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+Status CompactionOutputs::AddToOutput(
+    const CompactionIterator& c_iter,
+    const CompactionFileOpenFunc& open_file_func,
+    const CompactionFileCloseFunc& close_file_func) {
+  Status s;
+  bool is_range_del = c_iter.IsDeleteRangeSentinelKey();
+  if (is_range_del && compaction_->bottommost_level()) {
+    // We don't consider range tombstones for the bottommost level since:
+    // 1. there is no grandparent and hence no overlap to consider
+    // 2. range tombstones may be dropped at the bottommost level.
+    return s;
+  }
+  const Slice& key = c_iter.key();
+  if (ShouldStopBefore(c_iter) && HasBuilder()) {
+    s = close_file_func(*this, c_iter.InputStatus(), key);
+    if (!s.ok()) {
+      return s;
+    }
+    // reset grandparent information
+    grandparent_boundary_switched_num_ = 0;
+    grandparent_overlapped_bytes_ =
+        GetCurrentKeyGrandparentOverlappedBytes(key);
+    if (UNLIKELY(is_range_del)) {
+      // The lower bound for this new output file; this is needed because the
+      // lower bound does not come from the smallest point key in this case.
+      range_tombstone_lower_bound_.DecodeFrom(key);
+    } else {
+      range_tombstone_lower_bound_.Clear();
+    }
+  }
+
+  // Open output file if necessary
+  if (!HasBuilder()) {
+    s = open_file_func(*this);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // c_iter may emit range deletion keys, so update `last_key_for_partitioner_`
+  // here before returning below when `is_range_del` is true
+  if (partitioner_) {
+    last_key_for_partitioner_.assign(c_iter.user_key().data_,
+                                     c_iter.user_key().size_);
+  }
+
+  if (UNLIKELY(is_range_del)) {
+    return s;
+  }
+
+  assert(builder_ != nullptr);
+  const Slice& value = c_iter.value();
+  s = current_output().validator.Add(key, value);
+  if (!s.ok()) {
+    return s;
+  }
+  builder_->Add(key, value);
+
+  stats_.num_output_records++;
+  current_output_file_size_ = builder_->EstimatedFileSize();
+
+  if (blob_garbage_meter_) {
+    s = blob_garbage_meter_->ProcessOutFlow(key, value);
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  const ParsedInternalKey& ikey = c_iter.ikey();
+  s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
+                                             ikey.type);
+
+  return s;
+}
+
+namespace {
+void SetMaxSeqAndTs(InternalKey& internal_key, const Slice& user_key,
+                    const size_t ts_sz) {
+  if (ts_sz) {
+    static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+    if (ts_sz <= strlen(kTsMax)) {
+      internal_key = InternalKey(user_key, kMaxSequenceNumber,
+                                 kTypeRangeDeletion, Slice(kTsMax, ts_sz));
+    } else {
+      internal_key =
+          InternalKey(user_key, kMaxSequenceNumber, kTypeRangeDeletion,
+                      std::string(ts_sz, '\xff'));
+    }
+  } else {
+    internal_key.Set(user_key, kMaxSequenceNumber, kTypeRangeDeletion);
+  }
+}
+}  // namespace
+
+Status CompactionOutputs::AddRangeDels(
+    const Slice* comp_start_user_key, const Slice* comp_end_user_key,
+    CompactionIterationStats& range_del_out_stats, bool bottommost_level,
+    const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
+    const Slice& next_table_min_key, const std::string& full_history_ts_low) {
+  // The following example does not happen since
+  // CompactionOutput::ShouldStopBefore() always returns false for the first
+  // point key. But we should consider removing this dependency. Suppose for
+  // the first compaction output file,
+  // - next_table_min_key.user_key == comp_start_user_key
+  // - no point key is in the output file
+  // - there is a range tombstone @seqno to be added that covers
+  //   comp_start_user_key
+  // Then meta.smallest will be set to comp_start_user_key@seqno
+  // and meta.largest will be set to comp_start_user_key@kMaxSequenceNumber
+  // which violates the assumption that meta.smallest should be <= meta.largest.
+  assert(HasRangeDel());
+  FileMetaData& meta = current_output().meta;
+  const Comparator* ucmp = icmp.user_comparator();
+  InternalKey lower_bound_buf, upper_bound_buf;
+  Slice lower_bound_guard, upper_bound_guard;
+  std::string smallest_user_key;
+  const Slice *lower_bound, *upper_bound;
+
+  // We first determine the internal key lower_bound and upper_bound for
+  // this output file. All and only range tombstones that overlap with
+  // [lower_bound, upper_bound] should be added to this file. File
+  // boundaries (meta.smallest/largest) should be updated accordingly when
+  // extended by range tombstones.
+  size_t output_size = outputs_.size();
+  if (output_size == 1) {
+    // This is the first file in the subcompaction.
+    //
+    // When outputting a range tombstone that spans a subcompaction boundary,
+    // the files on either side of that boundary need to include that
+    // boundary's user key. Otherwise, the spanning range tombstone would lose
+    // coverage.
+    //
+    // To achieve this while preventing files from overlapping in internal key
+    // (an LSM invariant violation), we allow the earlier file to include the
+    // boundary user key up to `kMaxSequenceNumber,kTypeRangeDeletion`. The
+    // later file can begin at the boundary user key at the newest key version
+    // it contains. At this point that version number is unknown since we have
+    // not processed the range tombstones yet, so permit any version. The same
+    // story applies to timestamps, and a non-nullptr `comp_start_user_key`
+    // should have `kMaxTs` here, which similarly permits any timestamp.
+    if (comp_start_user_key) {
+      lower_bound_buf.Set(*comp_start_user_key, kMaxSequenceNumber,
+                          kTypeRangeDeletion);
+      lower_bound_guard = lower_bound_buf.Encode();
+      lower_bound = &lower_bound_guard;
+    } else {
+      lower_bound = nullptr;
+    }
+  } else {
+    // For subsequent output tables, only include range tombstones from min
+    // key onwards since the previous file was extended to contain range
+    // tombstones falling before min key.
+    if (range_tombstone_lower_bound_.size() > 0) {
+      assert(meta.smallest.size() == 0 ||
+             icmp.Compare(range_tombstone_lower_bound_, meta.smallest) < 0);
+      lower_bound_guard = range_tombstone_lower_bound_.Encode();
+    } else {
+      assert(meta.smallest.size() > 0);
+      lower_bound_guard = meta.smallest.Encode();
+    }
+    lower_bound = &lower_bound_guard;
+  }
+
+  const size_t ts_sz = ucmp->timestamp_size();
+  if (next_table_min_key.empty()) {
+    // Last file of the subcompaction.
+    if (comp_end_user_key) {
+      upper_bound_buf.Set(*comp_end_user_key, kMaxSequenceNumber,
+                          kTypeRangeDeletion);
+      upper_bound_guard = upper_bound_buf.Encode();
+      upper_bound = &upper_bound_guard;
+    } else {
+      upper_bound = nullptr;
+    }
+  } else {
+    // There is another file coming whose coverage will begin at
+    // `next_table_min_key`. The current file needs to extend range tombstone
+    // coverage through its own keys (through `meta.largest`) and through user
+    // keys preceding `next_table_min_key`'s user key.
+    ParsedInternalKey next_table_min_key_parsed;
+    ParseInternalKey(next_table_min_key, &next_table_min_key_parsed,
+                     false /* log_err_key */)
+        .PermitUncheckedError();
+    assert(next_table_min_key_parsed.sequence < kMaxSequenceNumber);
+    assert(meta.largest.size() == 0 ||
+           icmp.Compare(meta.largest.Encode(), next_table_min_key) < 0);
+    assert(!lower_bound || icmp.Compare(*lower_bound, next_table_min_key) <= 0);
+    if (meta.largest.size() > 0 &&
+        ucmp->EqualWithoutTimestamp(meta.largest.user_key(),
+                                    next_table_min_key_parsed.user_key)) {
+      // Caution: this assumes meta.largest.Encode() lives longer than
+      // upper_bound, which is only true if meta.largest is never updated.
+      // This just happens to be the case here since meta.largest serves
+      // as the upper_bound.
+      upper_bound_guard = meta.largest.Encode();
+    } else {
+      SetMaxSeqAndTs(upper_bound_buf, next_table_min_key_parsed.user_key,
+                     ts_sz);
+      upper_bound_guard = upper_bound_buf.Encode();
+    }
+    upper_bound = &upper_bound_guard;
+  }
+  if (lower_bound && upper_bound &&
+      icmp.Compare(*lower_bound, *upper_bound) > 0) {
+    assert(meta.smallest.size() == 0 &&
+           ucmp->EqualWithoutTimestamp(ExtractUserKey(*lower_bound),
+                                       ExtractUserKey(*upper_bound)));
+    // This can only happen when lower_bound has the same user key as
+    // next_table_min_key and there is no point key in the current
+    // compaction output file.
+    return Status::OK();
+  }
+  // The end key of the subcompaction must be greater than or equal to the
+  // upper bound. If the end of the subcompaction is null or the upper bound
+  // is null, it means that this file is the last file in the compaction, so
+  // there will be no overlap between this file and others.
+  assert(comp_end_user_key == nullptr || upper_bound == nullptr ||
+         ucmp->CompareWithoutTimestamp(ExtractUserKey(*upper_bound),
+                                       *comp_end_user_key) <= 0);
+  auto it = range_del_agg_->NewIterator(lower_bound, upper_bound);
+  Slice last_tombstone_start_user_key{};
+  bool reached_lower_bound = false;
+  const ReadOptions read_options(Env::IOActivity::kCompaction);
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    auto tombstone = it->Tombstone();
+    auto kv = tombstone.Serialize();
+    InternalKey tombstone_end = tombstone.SerializeEndKey();
+    // TODO: the underlying iterator should support clamping the bounds.
+    // tombstone_end.Encode() is of the form user_key@kMaxSeqno; if it is
+    // equal to lower_bound, there is no need to include such a range
+    // tombstone.
+    if (!reached_lower_bound && lower_bound &&
+        icmp.Compare(tombstone_end.Encode(), *lower_bound) <= 0) {
+      continue;
+    }
+    assert(!lower_bound ||
+           icmp.Compare(*lower_bound, tombstone_end.Encode()) <= 0);
+    reached_lower_bound = true;
+
+    // Garbage collection for range tombstones.
+    // If user-defined timestamp is enabled, range tombstones are dropped if
+    // they are at bottommost_level, below full_history_ts_low and not visible
+    // in any snapshot. trim_ts_ is passed to the constructor for
+    // range_del_agg_, and range_del_agg_ internally drops tombstones above
+    // trim_ts_.
+    if (bottommost_level && tombstone.seq_ <= earliest_snapshot &&
+        (ts_sz == 0 ||
+         (!full_history_ts_low.empty() &&
+          ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0))) {
+      // TODO(andrewkr): tombstones that span multiple output files are
+      // counted for each compaction output file, so lots of double
+      // counting.
+      range_del_out_stats.num_range_del_drop_obsolete++;
+      range_del_out_stats.num_record_drop_obsolete++;
+      continue;
+    }
+
+    assert(lower_bound == nullptr ||
+           ucmp->CompareWithoutTimestamp(ExtractUserKey(*lower_bound),
+                                         kv.second) < 0);
+    InternalKey tombstone_start = kv.first;
+    if (lower_bound &&
+        ucmp->CompareWithoutTimestamp(tombstone_start.user_key(),
+                                      ExtractUserKey(*lower_bound)) < 0) {
+      // This just updates the non-timestamp portion of `tombstone_start`'s
+      // user key. Ideally there would be a simpler API usage.
+      ParsedInternalKey tombstone_start_parsed;
+      ParseInternalKey(tombstone_start.Encode(), &tombstone_start_parsed,
+                       false /* log_err_key */)
+          .PermitUncheckedError();
+      // The timestamp should come from where the sequence number comes from,
+      // which is the tombstone in this case.
+      std::string ts =
+          tombstone_start_parsed.GetTimestamp(ucmp->timestamp_size())
+              .ToString();
+      tombstone_start_parsed.user_key = ExtractUserKey(*lower_bound);
+      tombstone_start.SetFrom(tombstone_start_parsed, ts);
+    }
+    if (upper_bound != nullptr &&
+        icmp.Compare(*upper_bound, tombstone_start.Encode()) < 0) {
+      break;
+    }
+    // Here we show that *only* range tombstones that overlap with
+    // [lower_bound, upper_bound] are added to the current file, and
+    // sanity-check invariants that should hold:
+    // - [tombstone_start, tombstone_end] overlaps with [lower_bound,
+    //   upper_bound]
+    // - meta.smallest <= meta.largest
+    // Corresponding assertions are made; the proof is broken if any of them
+    // fails.
+    // TODO: show that *all* range tombstones that overlap with
+    //  [lower_bound, upper_bound] are added.
+    // TODO: show that invariants about boundaries are correctly updated.
+    //
+    // Note that `tombstone_start` is updated in the if condition above; we use
+    // tombstone_start to refer to its initial value, i.e.,
+    // it->Tombstone().first, and use tombstone_start* to refer to its value
+    // after the update.
+    //
+    // To show [lower_bound, upper_bound] overlaps with [tombstone_start,
+    // tombstone_end]:
+    // lower_bound <= upper_bound from the if condition right after all
+    // bounds are initialized. We assume each tombstone fragment has
+    // start_key.user_key < end_key.user_key, so
+    // tombstone_start < tombstone_end by
+    // FragmentedTombstoneIterator::Tombstone(). So these two ranges are both
+    // non-empty. The flag `reached_lower_bound` and the if logic before it
+    // ensures lower_bound <= tombstone_end. tombstone_start is only updated
+    // if it has a smaller user_key than lower_bound user_key, so
+    // tombstone_start <= tombstone_start*. The above if condition implies
+    // tombstone_start* <= upper_bound. So we have
+    // tombstone_start <= upper_bound and lower_bound <= tombstone_end
+    // and the two ranges overlap.
+    //
+    // To show meta.smallest <= meta.largest:
+    // From the implementation of UpdateBoundariesForRange(), it suffices to
+    // prove that when it is first called in this function, its parameters
+    // satisfy `start <= end`, where start = max(tombstone_start*, lower_bound)
+    // and end = min(tombstone_end, upper_bound). From the above proof we have
+    // lower_bound <= tombstone_end and lower_bound <= upper_bound. We only
+    // need to show that tombstone_start* <= min(tombstone_end, upper_bound).
+    // Note that tombstone_start*.user_key = max(tombstone_start.user_key,
+    // lower_bound.user_key). Assuming tombstone_end always has
+    // kMaxSequenceNumber and lower_bound.seqno < kMaxSequenceNumber.
+    // Since lower_bound <= tombstone_end and lower_bound.seqno <
+    // tombstone_end.seqno (in absolute number order, not internal key order),
+    // lower_bound.user_key < tombstone_end.user_key.
+    // Since lower_bound.user_key < tombstone_end.user_key and
+    // tombstone_start.user_key < tombstone_end.user_key, tombstone_start* <
+    // tombstone_end. Since tombstone_start* <= upper_bound from the above
+    // proof and tombstone_start* < tombstone_end, tombstone_start* <=
+    // min(tombstone_end, upper_bound), so the two ranges overlap.
+
+    // Range tombstone is not supported by output validator yet.
+    builder_->Add(kv.first.Encode(), kv.second);
+    if (lower_bound &&
+        icmp.Compare(tombstone_start.Encode(), *lower_bound) < 0) {
+      tombstone_start.DecodeFrom(*lower_bound);
+    }
+    if (upper_bound && icmp.Compare(*upper_bound, tombstone_end.Encode()) < 0) {
+      tombstone_end.DecodeFrom(*upper_bound);
+    }
+    assert(icmp.Compare(tombstone_start, tombstone_end) <= 0);
+    meta.UpdateBoundariesForRange(tombstone_start, tombstone_end,
+                                  tombstone.seq_, icmp);
+    if (!bottommost_level) {
+      bool start_user_key_changed =
+          last_tombstone_start_user_key.empty() ||
+          ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key,
+                                        it->start_key()) < 0;
+      last_tombstone_start_user_key = it->start_key();
+      if (start_user_key_changed) {
+        // If tombstone_start >= tombstone_end, then either no key range is
+        // covered, or they have the same user key. If they have the same
+        // user key, then the internal key range should only be within this
+        // level, and no keys from older levels are covered.
+        if (ucmp->CompareWithoutTimestamp(tombstone_start.user_key(),
+                                          tombstone_end.user_key()) < 0) {
+          SizeApproximationOptions approx_opts;
+          approx_opts.files_size_error_margin = 0.1;
+          auto approximate_covered_size =
+              compaction_->input_version()->version_set()->ApproximateSize(
+                  approx_opts, read_options, compaction_->input_version(),
+                  tombstone_start.Encode(), tombstone_end.Encode(),
+                  compaction_->output_level() + 1 /* start_level */,
+                  -1 /* end_level */, kCompaction);
+          meta.compensated_range_deletion_size += approximate_covered_size;
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
+void CompactionOutputs::FillFilesToCutForTtl() {
+  if (compaction_->immutable_options()->compaction_style !=
+          kCompactionStyleLevel ||
+      compaction_->immutable_options()->compaction_pri !=
+          kMinOverlappingRatio ||
+      compaction_->mutable_cf_options()->ttl == 0 ||
+      compaction_->num_input_levels() < 2 || compaction_->bottommost_level()) {
+    return;
+  }
+
+  // We define a new file as one whose oldest ancestor time is younger than
+  // 1/4 of the TTL, and an old one as older than 1/2 of the TTL.
+  int64_t temp_current_time;
+  auto get_time_status =
+      compaction_->immutable_options()->clock->GetCurrentTime(
+          &temp_current_time);
+  if (!get_time_status.ok()) {
+    return;
+  }
+
+  auto current_time = static_cast<uint64_t>(temp_current_time);
+  if (current_time < compaction_->mutable_cf_options()->ttl) {
+    return;
+  }
+
+  uint64_t old_age_thres =
+      current_time - compaction_->mutable_cf_options()->ttl / 2;
+  const std::vector<FileMetaData*>& olevel =
+      *(compaction_->inputs(compaction_->num_input_levels() - 1));
+  for (FileMetaData* file : olevel) {
+    // Worth filtering out by start and end?
+    uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+    // We only pick old files that are not too small, to prevent a flood of
+    // small files.
+    if (oldest_ancester_time < old_age_thres &&
+        file->fd.GetFileSize() >
+            compaction_->mutable_cf_options()->target_file_size_base / 2) {
+      files_to_cut_for_ttl_.push_back(file);
+    }
+  }
+}
+
+CompactionOutputs::CompactionOutputs(const Compaction* compaction,
+                                     const bool is_penultimate_level)
+    : compaction_(compaction), is_penultimate_level_(is_penultimate_level) {
+  partitioner_ = compaction->output_level() == 0
+                     ? nullptr
+                     : compaction->CreateSstPartitioner();
+
+  if (compaction->output_level() != 0) {
+    FillFilesToCutForTtl();
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_outputs.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_outputs.h
new file mode 100644
index 0000000000..1b65ee662f
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_outputs.h
@@ -0,0 +1,403 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
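The dynamic pre-cut threshold referenced in ShouldStopBefore() above, restated as a standalone sketch; the PreCutThreshold name is ours, while the arithmetic mirrors the vendored code:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Sketch: the threshold starts at 50% of the target output file size and
    // grows by 5% per grandparent boundary already crossed, capped at 90%.
    // The (x + 99) / 100 term mirrors the integer "one percent" rounding in
    // the vendored code.
    uint64_t PreCutThreshold(uint64_t target_output_file_size,
                             size_t boundaries_seen) {
      const uint64_t one_percent = (target_output_file_size + 99) / 100;
      const uint64_t pct =
          50 + std::min<uint64_t>(boundaries_seen * 5, uint64_t{40});
      return one_percent * pct;
    }

For example, with a 64 MiB target: 0 boundaries gives roughly 32 MiB (50%), 4 boundaries roughly 44.8 MiB (70%), and 8 or more roughly 57.6 MiB (90%).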
+
+#pragma once
+
+#include "db/blob/blob_garbage_meter.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/internal_stats.h"
+#include "db/output_validator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactionOutputs;
+using CompactionFileOpenFunc = std::function<Status(CompactionOutputs&)>;
+using CompactionFileCloseFunc =
+    std::function<Status(CompactionOutputs&, const Status&, const Slice&)>;
+
+// Files produced by a subcompaction; most of the functions are used by the
+// compaction_job Open/Close compaction file functions.
+class CompactionOutputs {
+ public:
+  // compaction output file
+  struct Output {
+    Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp,
+           bool _enable_order_check, bool _enable_hash, bool _finished,
+           uint64_t precalculated_hash)
+        : meta(std::move(_meta)),
+          validator(_icmp, _enable_order_check, _enable_hash,
+                    precalculated_hash),
+          finished(_finished) {}
+    FileMetaData meta;
+    OutputValidator validator;
+    bool finished;
+    std::shared_ptr<const TableProperties> table_properties;
+  };
+
+  CompactionOutputs() = delete;
+
+  explicit CompactionOutputs(const Compaction* compaction,
+                             const bool is_penultimate_level);
+
+  // Add generated output to the list
+  void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp,
+                 bool enable_order_check, bool enable_hash,
+                 bool finished = false, uint64_t precalculated_hash = 0) {
+    outputs_.emplace_back(std::move(meta), icmp, enable_order_check,
+                          enable_hash, finished, precalculated_hash);
+  }
+
+  // Set new table builder for the current output
+  void NewBuilder(const TableBuilderOptions& tboptions);
+
+  // Assign a new WritableFileWriter to the current output
+  void AssignFileWriter(WritableFileWriter* writer) {
+    file_writer_.reset(writer);
+  }
+
+  // TODO: Remove it when remote compaction supports tiered compaction
+  void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; }
+  void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
+
+  // TODO: Move the BlobDB builder into CompactionOutputs
+  const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
+    if (is_penultimate_level_) {
+      assert(blob_file_additions_.empty());
+    }
+    return blob_file_additions_;
+  }
+
+  std::vector<BlobFileAddition>* GetBlobFileAdditionsPtr() {
+    assert(!is_penultimate_level_);
+    return &blob_file_additions_;
+  }
+
+  bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); }
+
+  BlobGarbageMeter* CreateBlobGarbageMeter() {
+    assert(!is_penultimate_level_);
+    blob_garbage_meter_ = std::make_unique<BlobGarbageMeter>();
+    return blob_garbage_meter_.get();
+  }
+
+  BlobGarbageMeter* GetBlobGarbageMeter() const {
+    if (is_penultimate_level_) {
+      // blobdb doesn't support per_key_placement yet
+      assert(blob_garbage_meter_ == nullptr);
+      return nullptr;
+    }
+    return blob_garbage_meter_.get();
+  }
+
+  void UpdateBlobStats() {
+    assert(!is_penultimate_level_);
+    stats_.num_output_files_blob = blob_file_additions_.size();
+    for (const auto& blob : blob_file_additions_) {
+      stats_.bytes_written_blob += blob.GetTotalBlobBytes();
+    }
+  }
+
+  // Finish the current output file
+  Status Finish(const Status& input_status,
+                const SeqnoToTimeMapping& seqno_time_mapping);
+
+  // Update output table properties from the table builder
+  void UpdateTableProperties() {
+    current_output().table_properties =
+        std::make_shared<TableProperties>(GetTableProperties());
+  }
+
+  IOStatus WriterSyncClose(const Status& input_status, SystemClock* clock,
+                           Statistics* statistics, bool use_fsync);
+
+  TableProperties GetTableProperties() {
+    return builder_->GetTableProperties();
+  }
+
+  Slice SmallestUserKey() const {
+    if (!outputs_.empty() && outputs_[0].finished) {
+      return outputs_[0].meta.smallest.user_key();
+    } else {
+      return Slice{nullptr, 0};
+    }
+  }
+
+  Slice LargestUserKey() const {
+    if (!outputs_.empty() && outputs_.back().finished) {
+      return outputs_.back().meta.largest.user_key();
+    } else {
+      return Slice{nullptr, 0};
+    }
+  }
+
+  // Remove the last output file in case it is empty, as it doesn't need to be
+  // kept.
+  void RemoveLastEmptyOutput() {
+    if (!outputs_.empty() && !outputs_.back().meta.fd.file_size) {
+      // An error occurred, so ignore the last output.
+      outputs_.pop_back();
+    }
+  }
+
+  // Remove the last output, for example when the last output doesn't have
+  // data (no entries and no range-dels), but file_size might not be 0, as it
+  // still has SST metadata.
+  void RemoveLastOutput() {
+    assert(!outputs_.empty());
+    outputs_.pop_back();
+  }
+
+  bool HasBuilder() const { return builder_ != nullptr; }
+
+  FileMetaData* GetMetaData() { return &current_output().meta; }
+
+  bool HasOutput() const { return !outputs_.empty(); }
+
+  uint64_t NumEntries() const { return builder_->NumEntries(); }
+
+  void ResetBuilder() {
+    builder_.reset();
+    current_output_file_size_ = 0;
+  }
+
+  // Add range deletions from the range_del_agg_ to the current output file.
+  // Input parameters, `range_tombstone_lower_bound_` and current output's
+  // metadata determine the bounds on range deletions to add. Updates output
+  // file metadata boundary if extended by range tombstones.
+  //
+  // @param comp_start_user_key and comp_end_user_key include timestamp if
+  // user-defined timestamp is enabled. Their timestamp should be max
+  // timestamp.
+  // @param next_table_min_key internal key lower bound for the next compaction
+  // output.
+  // @param full_history_ts_low used for range tombstone garbage collection.
+  Status AddRangeDels(const Slice* comp_start_user_key,
+                      const Slice* comp_end_user_key,
+                      CompactionIterationStats& range_del_out_stats,
+                      bool bottommost_level, const InternalKeyComparator& icmp,
+                      SequenceNumber earliest_snapshot,
+                      const Slice& next_table_min_key,
+                      const std::string& full_history_ts_low);
+
+  // whether the outputs have range deletes; range deletes are also data
+  bool HasRangeDel() const {
+    return range_del_agg_ && !range_del_agg_->IsEmpty();
+  }
+
+ private:
+  friend class SubcompactionState;
+
+  void FillFilesToCutForTtl();
+
+  void SetOutputSlitKey(const std::optional<Slice> start,
+                        const std::optional<Slice> end) {
+    const InternalKeyComparator* icmp =
+        &compaction_->column_family_data()->internal_comparator();
+
+    const InternalKey* output_split_key = compaction_->GetOutputSplitKey();
+    // Invalid output_split_key indicates that we do not need to split
+    if (output_split_key != nullptr) {
+      // We may only split the output when the cursor is in the range. Split
+      if ((!end.has_value() ||
+           icmp->user_comparator()->Compare(
+               ExtractUserKey(output_split_key->Encode()), end.value()) < 0) &&
+          (!start.has_value() || icmp->user_comparator()->Compare(
+                                     ExtractUserKey(output_split_key->Encode()),
+                                     start.value()) > 0)) {
+        local_output_split_key_ = output_split_key;
+      }
+    }
+  }
+
+  // Returns true iff we should stop building the current output
+  // before processing the current key in the compaction iterator.
+  bool ShouldStopBefore(const CompactionIterator& c_iter);
+
+  void Cleanup() {
+    if (builder_ != nullptr) {
+      // May happen if we get a shutdown call in the middle of compaction
+      builder_->Abandon();
+      builder_.reset();
+    }
+  }
+
+  // Updates states related to file cutting for TTL.
+  // Returns a boolean value indicating whether the current
+  // compaction output file should be cut before `internal_key`.
+  //
+  // @param internal_key the current key to be added to the output.
+  bool UpdateFilesToCutForTTLStates(const Slice& internal_key);
+
+  // Updates tracked grandparent information such as the grandparent index,
+  // whether the key is in the gap between 2 grandparent files, the
+  // accumulated grandparent file size, etc.
+  // It returns how many boundaries it crosses by including the current key.
+  size_t UpdateGrandparentBoundaryInfo(const Slice& internal_key);
+
+  // Helper function to get the overlapped grandparent file size; it's only
+  // used for calculating the first key's overlap.
+  uint64_t GetCurrentKeyGrandparentOverlappedBytes(
+      const Slice& internal_key) const;
+
+  // Add the current key from the compaction iterator to the output file. If
+  // needed, close and open a new compaction output with the functions
+  // provided.
+  Status AddToOutput(const CompactionIterator& c_iter,
+                     const CompactionFileOpenFunc& open_file_func,
+                     const CompactionFileCloseFunc& close_file_func);
+
+  // Close the current output. `open_file_func` is needed for creating a new
+  // file for a range-dels-only output file.
+  Status CloseOutput(const Status& curr_status,
+                     const CompactionFileOpenFunc& open_file_func,
+                     const CompactionFileCloseFunc& close_file_func) {
+    Status status = curr_status;
+    // handle subcompaction containing only range deletions
+    if (status.ok() && !HasBuilder() && !HasOutput() && HasRangeDel()) {
+      status = open_file_func(*this);
+    }
+    if (HasBuilder()) {
+      const Slice empty_key{};
+      Status s = close_file_func(*this, status, empty_key);
+      if (!s.ok() && status.ok()) {
+        status = s;
+      }
+    }
+
+    return status;
+  }
+
+  // This subcompaction's output could be empty if compaction was aborted
+  // before this subcompaction had a chance to generate any output files. When
+  // subcompactions are executed sequentially this is more likely, and it is
+  // particularly likely for the later subcompactions to be empty. Once they
+  // are run in parallel, however, it should be much rarer.
+  // It's the caller's responsibility to make sure it's not empty.
+  Output& current_output() {
+    assert(!outputs_.empty());
+    return outputs_.back();
+  }
+
+  // Assign the range_del_agg to the target output level. There's only one
+  // range-del-aggregator per compaction outputs; for
+  // output_to_penultimate_level compaction it is only assigned to the
+  // penultimate level.
+  void AssignRangeDelAggregator(
+      std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
+    assert(range_del_agg_ == nullptr);
+    range_del_agg_ = std::move(range_del_agg);
+  }
+
+  const Compaction* compaction_;
+
+  // current output builder and writer
+  std::unique_ptr<TableBuilder> builder_;
+  std::unique_ptr<WritableFileWriter> file_writer_;
+  uint64_t current_output_file_size_ = 0;
+
+  // all the compaction outputs so far
+  std::vector<Output> outputs_;
+
+  // BlobDB info
+  std::vector<BlobFileAddition> blob_file_additions_;
+  std::unique_ptr<BlobGarbageMeter> blob_garbage_meter_;
+
+  // Basic compaction output stats for this level's outputs
+  InternalStats::CompactionOutputsStats stats_;
+
+  // Indicates whether this CompactionOutputs object is for the penultimate
+  // level; should always be false if the per_key_placement feature is not
+  // enabled.
+  const bool is_penultimate_level_;
+  std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_ = nullptr;
+
+  // partitioner information
+  std::string last_key_for_partitioner_;
+  std::unique_ptr<SstPartitioner> partitioner_;
+
+  // A flag that records whether this subcompaction has been split by the
+  // cursor for RoundRobin compaction
+  bool is_split_ = false;
+
+  // We also maintain the output split key for each subcompaction to avoid
+  // repetitive comparisons in ShouldStopBefore()
+  const InternalKey* local_output_split_key_ = nullptr;
+
+  // Some identified files with an old oldest-ancestor time; the range should
+  // be isolated out so that the output file(s) in that range can be merged
+  // down for TTL and the timestamps for the range cleared.
+  std::vector<FileMetaData*> files_to_cut_for_ttl_;
+  int cur_files_to_cut_for_ttl_ = -1;
+  int next_files_to_cut_for_ttl_ = 0;
+
+  // An index used to speed up ShouldStopBefore().
+  size_t grandparent_index_ = 0;
+
+  // Whether the output key is in the gap between grandparent files, i.e.:
+  //   key > grandparents[grandparent_index_ - 1].largest &&
+  //   key < grandparents[grandparent_index_].smallest
+  bool being_grandparent_gap_ = true;
+
+  // The number of bytes overlapping between the current output and
+  // grandparent files, used in ShouldStopBefore().
+  uint64_t grandparent_overlapped_bytes_ = 0;
+
+  // A flag that records whether the key has been seen in ShouldStopBefore()
+  bool seen_key_ = false;
+
+  // For the current output file, how many file boundaries it has crossed;
+  // basically the number of files overlapped * 2
+  size_t grandparent_boundary_switched_num_ = 0;
+
+  // The smallest key of the current output file; this is set when the current
+  // output file's smallest key is a range tombstone start key.
+  InternalKey range_tombstone_lower_bound_;
+};
+
+// helper struct to concatenate the last level and penultimate level outputs,
+// which could be replaced by std::ranges::join_view() in C++20
+struct OutputIterator {
+ public:
+  explicit OutputIterator(const std::vector<CompactionOutputs::Output>& a,
+                          const std::vector<CompactionOutputs::Output>& b)
+      : a_(a), b_(b) {
+    within_a = !a_.empty();
+    idx_ = 0;
+  }
+
+  OutputIterator begin() { return *this; }
+
+  OutputIterator end() { return *this; }
+
+  size_t size() { return a_.size() + b_.size(); }
+
+  const CompactionOutputs::Output& operator*() const {
+    return within_a ? a_[idx_] : b_[idx_];
+  }
+
+  OutputIterator& operator++() {
+    idx_++;
+    if (within_a && idx_ >= a_.size()) {
+      within_a = false;
+      idx_ = 0;
+    }
+    assert(within_a || idx_ <= b_.size());
+    return *this;
+  }
+
+  bool operator!=(const OutputIterator& /*rhs*/) const {
+    return within_a || idx_ < b_.size();
+  }
+
+ private:
+  const std::vector<CompactionOutputs::Output>& a_;
+  const std::vector<CompactionOutputs::Output>& b_;
+  bool within_a;
+  size_t idx_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker.cc
new file mode 100644
index 0000000000..a8f314a5bb
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker.cc
@@ -0,0 +1,1225 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker.h"
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/statistics_impl.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
+                           size_t min_files_to_compact,
+                           uint64_t max_compact_bytes_per_del_file,
+                           uint64_t max_compaction_bytes,
+                           CompactionInputFiles* comp_inputs) {
+  TEST_SYNC_POINT("FindIntraL0Compaction");
+
+  size_t start = 0;
+
+  if (level_files.size() == 0 || level_files[start]->being_compacted) {
+    return false;
+  }
+
+  size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size);
+  size_t compact_bytes_per_del_file = std::numeric_limits<size_t>::max();
+  // Compaction range will be [start, limit).
+  size_t limit;
+  // Pull in files until the amount of compaction work per deleted file begins
+  // increasing or the maximum total compaction size is reached.
+  size_t new_compact_bytes_per_del_file = 0;
+  for (limit = start + 1; limit < level_files.size(); ++limit) {
+    compact_bytes += static_cast<size_t>(level_files[limit]->fd.file_size);
+    new_compact_bytes_per_del_file = compact_bytes / (limit - start);
+    if (level_files[limit]->being_compacted ||
+        new_compact_bytes_per_del_file > compact_bytes_per_del_file ||
+        compact_bytes > max_compaction_bytes) {
+      break;
+    }
+    compact_bytes_per_del_file = new_compact_bytes_per_del_file;
+  }
+
+  if ((limit - start) >= min_files_to_compact &&
+      compact_bytes_per_del_file < max_compact_bytes_per_del_file) {
+    assert(comp_inputs != nullptr);
+    comp_inputs->level = 0;
+    for (size_t i = start; i < limit; ++i) {
+      comp_inputs->files.push_back(level_files[i]);
+    }
+    return true;
+  }
+  return false;
+}
+
+// Determine the compression type based on user options, the level of the
+// output file, and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   int level, int base_level,
+                                   const bool enable_compression) {
+  if (!enable_compression) {
+    // disable compression
+    return kNoCompression;
+  }
+
+  // If bottommost_compression is set and we are compacting to the
+  // bottommost level then we should use it.
+  if (mutable_cf_options.bottommost_compression != kDisableCompressionOption &&
+      level >= (vstorage->num_non_empty_levels() - 1)) {
+    return mutable_cf_options.bottommost_compression;
+  }
+  // If the user has specified a different compression level for each level,
+  // then pick the compression for that level.
+  if (!mutable_cf_options.compression_per_level.empty()) {
+    assert(level == 0 || level >= base_level);
+    int idx = (level == 0) ? 0 : level - base_level + 1;
+
+    const int n =
+        static_cast<int>(mutable_cf_options.compression_per_level.size()) - 1;
+    // It is possible for level_ to be -1; in that case, we use level
+    // 0's compression. This occurs mostly in backwards compatibility
+    // situations when the builder doesn't know what level the file
+    // belongs to. Likewise, if level is beyond the end of the
+    // specified compression levels, use the last value.
+ return mutable_cf_options + .compression_per_level[std::max(0, std::min(idx, n))]; + } else { + return mutable_cf_options.compression; + } +} + +CompressionOptions GetCompressionOptions(const MutableCFOptions& cf_options, + const VersionStorageInfo* vstorage, + int level, + const bool enable_compression) { + if (!enable_compression) { + return cf_options.compression_opts; + } + // If bottommost_compression_opts is enabled and we are compacting to the + // bottommost level then we should use the specified compression options. + if (level >= (vstorage->num_non_empty_levels() - 1) && + cf_options.bottommost_compression_opts.enabled) { + return cf_options.bottommost_compression_opts; + } + return cf_options.compression_opts; +} + +CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions, + const InternalKeyComparator* icmp) + : ioptions_(ioptions), icmp_(icmp) {} + +CompactionPicker::~CompactionPicker() {} + +// Delete this compaction from the list of running compactions. +void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { + UnregisterCompaction(c); + if (!status.ok()) { + c->ResetNextCompactionIndex(); + } +} + +void CompactionPicker::GetRange(const CompactionInputFiles& inputs, + InternalKey* smallest, + InternalKey* largest) const { + const int level = inputs.level; + assert(!inputs.empty()); + smallest->Clear(); + largest->Clear(); + + if (level == 0) { + for (size_t i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (i == 0) { + *smallest = f->smallest; + *largest = f->largest; + } else { + if (icmp_->Compare(f->smallest, *smallest) < 0) { + *smallest = f->smallest; + } + if (icmp_->Compare(f->largest, *largest) > 0) { + *largest = f->largest; + } + } + } + } else { + *smallest = inputs[0]->smallest; + *largest = inputs[inputs.size() - 1]->largest; + } +} + +void CompactionPicker::GetRange(const CompactionInputFiles& inputs1, + const CompactionInputFiles& inputs2, + InternalKey* smallest, + InternalKey* largest) const { + assert(!inputs1.empty() || !inputs2.empty()); + if (inputs1.empty()) { + GetRange(inputs2, smallest, largest); + } else if (inputs2.empty()) { + GetRange(inputs1, smallest, largest); + } else { + InternalKey smallest1, smallest2, largest1, largest2; + GetRange(inputs1, &smallest1, &largest1); + GetRange(inputs2, &smallest2, &largest2); + *smallest = + icmp_->Compare(smallest1, smallest2) < 0 ? smallest1 : smallest2; + *largest = icmp_->Compare(largest1, largest2) < 0 ? 
largest2 : largest1;
+  }
+}
+
+void CompactionPicker::GetRange(
+    const std::vector<CompactionInputFiles>& inputs, InternalKey* smallest,
+    InternalKey* largest, int exclude_level) const {
+  InternalKey current_smallest;
+  InternalKey current_largest;
+  bool initialized = false;
+  for (const auto& in : inputs) {
+    if (in.empty() || in.level == exclude_level) {
+      continue;
+    }
+    GetRange(in, &current_smallest, &current_largest);
+    if (!initialized) {
+      *smallest = current_smallest;
+      *largest = current_largest;
+      initialized = true;
+    } else {
+      if (icmp_->Compare(current_smallest, *smallest) < 0) {
+        *smallest = current_smallest;
+      }
+      if (icmp_->Compare(current_largest, *largest) > 0) {
+        *largest = current_largest;
+      }
+    }
+  }
+  assert(initialized);
+}
+
+bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/,
+                                              VersionStorageInfo* vstorage,
+                                              CompactionInputFiles* inputs,
+                                              InternalKey** next_smallest) {
+  // This isn't a good compaction
+  assert(!inputs->empty());
+
+  const int level = inputs->level;
+  // GetOverlappingInputs will always do the right thing for level-0.
+  // So we don't need to do any expansion if level == 0.
+  if (level == 0) {
+    return true;
+  }
+
+  InternalKey smallest, largest;
+
+  // Keep expanding inputs until we are sure that there is a "clean cut"
+  // boundary between the files in input and the surrounding files.
+  // This will ensure that no parts of a key are lost during compaction.
+  int hint_index = -1;
+  size_t old_size;
+  do {
+    old_size = inputs->size();
+    GetRange(*inputs, &smallest, &largest);
+    inputs->clear();
+    vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files,
+                                   hint_index, &hint_index, true,
+                                   next_smallest);
+  } while (inputs->size() > old_size);
+
+  // we started off with inputs non-empty and the previous loop only grew
+  // inputs. thus, inputs should be non-empty here
+  assert(!inputs->empty());
+
+  // If, after the expansion, there are files that are already under
+  // compaction, then we must drop/cancel this compaction.
+  if (AreFilesInCompaction(inputs->files)) {
+    return false;
+  }
+  return true;
+}
+
+bool CompactionPicker::RangeOverlapWithCompaction(
+    const Slice& smallest_user_key, const Slice& largest_user_key,
+    int level) const {
+  const Comparator* ucmp = icmp_->user_comparator();
+  for (Compaction* c : compactions_in_progress_) {
+    if (c->output_level() == level &&
+        ucmp->CompareWithoutTimestamp(smallest_user_key,
+                                      c->GetLargestUserKey()) <= 0 &&
+        ucmp->CompareWithoutTimestamp(largest_user_key,
+                                      c->GetSmallestUserKey()) >= 0) {
+      // Overlap
+      return true;
+    }
+    if (c->SupportsPerKeyPlacement()) {
+      if (c->OverlapPenultimateLevelOutputRange(smallest_user_key,
+                                                largest_user_key)) {
+        return true;
+      }
+    }
+  }
+  // Did not overlap with any running compaction in level `level`
+  return false;
+}
+
+bool CompactionPicker::FilesRangeOverlapWithCompaction(
+    const std::vector<CompactionInputFiles>& inputs, int level,
+    int penultimate_level) const {
+  bool is_empty = true;
+  for (auto& in : inputs) {
+    if (!in.empty()) {
+      is_empty = false;
+      break;
+    }
+  }
+  if (is_empty) {
+    // No files in inputs
+    return false;
+  }
+
+  // TODO: Intra-L0 compactions can have overlapping ranges, but the input
+  // files cannot be overlapped in the order of L0 files.
+  InternalKey smallest, largest;
+  GetRange(inputs, &smallest, &largest, Compaction::kInvalidLevel);
+  if (penultimate_level != Compaction::kInvalidLevel) {
+    if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+      if (RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+                                     penultimate_level)) {
+        return true;
+      }
+    } else {
+      InternalKey penultimate_smallest, penultimate_largest;
+      GetRange(inputs, &penultimate_smallest, &penultimate_largest, level);
+      if (RangeOverlapWithCompaction(penultimate_smallest.user_key(),
                                     penultimate_largest.user_key(),
+                                     penultimate_level)) {
+        return true;
+      }
+    }
+  }
+
+  return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+                                    level);
+}
+
+// Returns true if any one of the specified files is being compacted
+bool CompactionPicker::AreFilesInCompaction(
+    const std::vector<FileMetaData*>& files) {
+  for (size_t i = 0; i < files.size(); i++) {
+    if (files[i]->being_compacted) {
+      return true;
+    }
+  }
+  return false;
+}
+
+Compaction* CompactionPicker::CompactFiles(
+    const CompactionOptions& compact_options,
+    const std::vector<CompactionInputFiles>& input_files, int output_level,
+    VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, uint32_t output_path_id) {
+#ifndef NDEBUG
+  assert(input_files.size());
+  // This compaction output should not overlap with a running compaction as
+  // `SanitizeCompactionInputFiles` should've checked earlier and the db mutex
+  // shouldn't have been released since.
+  int start_level = Compaction::kInvalidLevel;
+  for (const auto& in : input_files) {
+    // input_files should already be sorted by level
+    if (!in.empty()) {
+      start_level = in.level;
+      break;
+    }
+  }
+  assert(output_level == 0 ||
+         !FilesRangeOverlapWithCompaction(
+             input_files, output_level,
+             Compaction::EvaluatePenultimateLevel(vstorage, ioptions_,
+                                                  start_level, output_level)));
+#endif /* !NDEBUG */
+
+  CompressionType compression_type;
+  if (compact_options.compression == kDisableCompressionOption) {
+    int base_level;
+    if (ioptions_.compaction_style == kCompactionStyleLevel) {
+      base_level = vstorage->base_level();
+    } else {
+      base_level = 1;
+    }
+    compression_type = GetCompressionType(vstorage, mutable_cf_options,
+                                          output_level, base_level);
+  } else {
+    // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType`
+    // without configurable `CompressionOptions`, which is inconsistent.
+    compression_type = compact_options.compression;
+  }
+  auto c = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, mutable_db_options, input_files,
+      output_level, compact_options.output_file_size_limit,
+      mutable_cf_options.max_compaction_bytes, output_path_id,
+      compression_type,
+      GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+      Temperature::kUnknown, compact_options.max_subcompactions,
+      /* grandparents */ {}, true);
+  RegisterCompaction(c);
+  return c;
+}
+
+Status CompactionPicker::GetCompactionInputsFromFileNumbers(
+    std::vector<CompactionInputFiles>* input_files,
+    std::unordered_set<uint64_t>* input_set, const VersionStorageInfo* vstorage,
+    const CompactionOptions& /*compact_options*/) const {
+  if (input_set->size() == 0U) {
+    return Status::InvalidArgument(
+        "Compaction must include at least one file.");
+  }
+  assert(input_files);
+
+  std::vector<CompactionInputFiles> matched_input_files;
+  matched_input_files.resize(vstorage->num_levels());
+  int first_non_empty_level = -1;
+  int last_non_empty_level = -1;
+  // TODO(yhchiang): use a lazy-initialized mapping from
+  // file_number to FileMetaData in Version.
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    for (auto file : vstorage->LevelFiles(level)) {
+      auto iter = input_set->find(file->fd.GetNumber());
+      if (iter != input_set->end()) {
+        matched_input_files[level].files.push_back(file);
+        input_set->erase(iter);
+        last_non_empty_level = level;
+        if (first_non_empty_level == -1) {
+          first_non_empty_level = level;
+        }
+      }
+    }
+  }
+
+  if (!input_set->empty()) {
+    std::string message(
+        "Cannot find matched SST files for the following file numbers:");
+    for (auto fn : *input_set) {
+      message += " ";
+      message += std::to_string(fn);
+    }
+    return Status::InvalidArgument(message);
+  }
+
+  for (int level = first_non_empty_level; level <= last_non_empty_level;
+       ++level) {
+    matched_input_files[level].level = level;
+    input_files->emplace_back(std::move(matched_input_files[level]));
+  }
+
+  return Status::OK();
+}
+
+// Returns true if any one of the parent files is being compacted
+bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage,
+                                           const InternalKey* smallest,
+                                           const InternalKey* largest,
+                                           int level, int* level_index) {
+  std::vector<FileMetaData*> inputs;
+  assert(level < NumberLevels());
+
+  vstorage->GetOverlappingInputs(level, smallest, largest, &inputs,
+                                 level_index ? *level_index : 0, level_index);
+  return AreFilesInCompaction(inputs);
+}
+
+// Populates the set of inputs of all other levels that overlap with the
+// start level.
+// For now we assume all levels except the start level and output level are
+// empty.
+// Will also attempt to expand the "start level" if that doesn't expand the
+// "output level" or cause "level" to include a file for compaction that has
+// an overlapping user key with another file.
+// REQUIRES: input_level and output_level are different
+// REQUIRES: inputs->empty() == false
+// Returns false if files on the parent level are currently in compaction,
+// which means that we can't compact them
+bool CompactionPicker::SetupOtherInputs(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
+    CompactionInputFiles* output_level_inputs, int* parent_index,
+    int base_index, bool only_expand_towards_right) {
+  assert(!inputs->empty());
+  assert(output_level_inputs->empty());
+  const int input_level = inputs->level;
+  const int output_level = output_level_inputs->level;
+  if (input_level == output_level) {
+    // no possibility of conflict
+    return true;
+  }
+
+  // For now, we only support merging two levels, the start level and the
+  // output level. We need to assert that the other levels are empty.
+  for (int l = input_level + 1; l < output_level; l++) {
+    assert(vstorage->NumLevelFiles(l) == 0);
+  }
+
+  InternalKey smallest, largest;
+
+  // Get the range one last time.
+  GetRange(*inputs, &smallest, &largest);
+
+  // Populate the set of next-level files (inputs_GetOutputLevelInputs()) to
+  // include in compaction
+  vstorage->GetOverlappingInputs(output_level, &smallest, &largest,
+                                 &output_level_inputs->files, *parent_index,
+                                 parent_index);
+  if (AreFilesInCompaction(output_level_inputs->files)) {
+    return false;
+  }
+  if (!output_level_inputs->empty()) {
+    if (!ExpandInputsToCleanCut(cf_name, vstorage, output_level_inputs)) {
+      return false;
+    }
+  }
+
+  // See if we can further grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up. We also choose NOT
+  // to expand if this would cause "level" to include some entries for some
+  // user key, while excluding other entries for the same user key. This
+  // can happen when one user key spans multiple files.
+  if (!output_level_inputs->empty()) {
+    const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+    const uint64_t output_level_inputs_size =
+        TotalFileSize(output_level_inputs->files);
+    const uint64_t inputs_size = TotalFileSize(inputs->files);
+    bool expand_inputs = false;
+
+    CompactionInputFiles expanded_inputs;
+    expanded_inputs.level = input_level;
+    // Get the closed interval of the output level
+    InternalKey all_start, all_limit;
+    GetRange(*inputs, *output_level_inputs, &all_start, &all_limit);
+    bool try_overlapping_inputs = true;
+    if (only_expand_towards_right) {
+      // Round-robin compaction only allows expansion towards the larger side.
+      vstorage->GetOverlappingInputs(input_level, &smallest, &all_limit,
+                                     &expanded_inputs.files, base_index,
+                                     nullptr);
+    } else {
+      vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
+                                     &expanded_inputs.files, base_index,
+                                     nullptr);
+    }
+    uint64_t expanded_inputs_size = TotalFileSize(expanded_inputs.files);
+    if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) {
+      try_overlapping_inputs = false;
+    }
+    if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() &&
+        (mutable_cf_options.ignore_max_compaction_bytes_for_input ||
+         output_level_inputs_size + expanded_inputs_size < limit) &&
+        !AreFilesInCompaction(expanded_inputs.files)) {
+      InternalKey new_start, new_limit;
+      GetRange(expanded_inputs, &new_start, &new_limit);
+      CompactionInputFiles expanded_output_level_inputs;
+      expanded_output_level_inputs.level = output_level;
+      vstorage->GetOverlappingInputs(output_level, &new_start, &new_limit,
+                                     &expanded_output_level_inputs.files,
+                                     *parent_index, parent_index);
+      assert(!expanded_output_level_inputs.empty());
+      if (!AreFilesInCompaction(expanded_output_level_inputs.files) &&
+          ExpandInputsToCleanCut(cf_name, vstorage,
+                                 &expanded_output_level_inputs) &&
+          expanded_output_level_inputs.size() == output_level_inputs->size()) {
+        expand_inputs = true;
+      }
+    }
+    if (!expand_inputs) {
+      vstorage->GetCleanInputsWithinInterval(input_level, &all_start,
+                                             &all_limit, &expanded_inputs.files,
+                                             base_index, nullptr);
+      expanded_inputs_size = TotalFileSize(expanded_inputs.files);
+      if (expanded_inputs.size() > inputs->size() &&
+          (mutable_cf_options.ignore_max_compaction_bytes_for_input ||
+           output_level_inputs_size + expanded_inputs_size < limit) &&
+          !AreFilesInCompaction(expanded_inputs.files)) {
+        expand_inputs = true;
+      }
+    }
+    if (expand_inputs) {
+      ROCKS_LOG_INFO(ioptions_.logger,
+                     "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt
+                     "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt
+                     "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n",
+                     cf_name.c_str(), input_level, inputs->size(),
+                     output_level_inputs->size(), inputs_size,
+                     output_level_inputs_size, expanded_inputs.size(),
+                     output_level_inputs->size(), expanded_inputs_size,
+                     output_level_inputs_size);
+      inputs->files = expanded_inputs.files;
+    }
+  } else {
+    // Likely to be a trivial move. Expand files if they are still trivial
+    // moves, but limit to mutable_cf_options.max_compaction_bytes or 8 files
+    // so that we don't create too much compaction pressure for the next
+    // level.
+ } + return true; +} + +void CompactionPicker::GetGrandparents( + VersionStorageInfo* vstorage, const CompactionInputFiles& inputs, + const CompactionInputFiles& output_level_inputs, + std::vector* grandparents) { + InternalKey start, limit; + GetRange(inputs, output_level_inputs, &start, &limit); + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2 or the first + // level after that has overlapping files) + for (int level = output_level_inputs.level + 1; level < NumberLevels(); + level++) { + vstorage->GetOverlappingInputs(level, &start, &limit, grandparents); + if (!grandparents->empty()) { + break; + } + } +} + +Compaction* CompactionPicker::CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, + const CompactRangeOptions& compact_range_options, const InternalKey* begin, + const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict, + uint64_t max_file_num_to_ignore, const std::string& trim_ts) { + // CompactionPickerFIFO has its own implementation of compact range + assert(ioptions_.compaction_style != kCompactionStyleFIFO); + + if (input_level == ColumnFamilyData::kCompactAllLevels) { + assert(ioptions_.compaction_style == kCompactionStyleUniversal); + + // Universal compaction with more than one level always compacts all the + // files together to the last level. + assert(vstorage->num_levels() > 1); + // DBImpl::CompactRange() set output level to be the last level + if (ioptions_.allow_ingest_behind) { + assert(output_level == vstorage->num_levels() - 2); + } else { + assert(output_level == vstorage->num_levels() - 1); + } + // DBImpl::RunManualCompaction will make full range for universal compaction + assert(begin == nullptr); + assert(end == nullptr); + *compaction_end = nullptr; + + int start_level = 0; + for (; start_level < vstorage->num_levels() && + vstorage->NumLevelFiles(start_level) == 0; + start_level++) { + } + if (start_level == vstorage->num_levels()) { + return nullptr; + } + + if ((start_level == 0) && (!level0_compactions_in_progress_.empty())) { + *manual_conflict = true; + // Only one level 0 compaction allowed + return nullptr; + } + + std::vector inputs(vstorage->num_levels() - + start_level); + for (int level = start_level; level < vstorage->num_levels(); level++) { + inputs[level - start_level].level = level; + auto& files = inputs[level - start_level].files; + for (FileMetaData* f : vstorage->LevelFiles(level)) { + files.push_back(f); + } + if (AreFilesInCompaction(files)) { + *manual_conflict = true; + return nullptr; + } + } + + // 2 non-exclusive manual compactions could run at the same time producing + // overlaping outputs in the same level. + if (FilesRangeOverlapWithCompaction( + inputs, output_level, + Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, + start_level, output_level))) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. 
+ *manual_conflict = true; + return nullptr; + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options, output_level, + ioptions_.compaction_style), + /* max_compaction_bytes */ LLONG_MAX, + compact_range_options.target_path_id, + GetCompressionType(vstorage, mutable_cf_options, output_level, 1), + GetCompressionOptions(mutable_cf_options, vstorage, output_level), + Temperature::kUnknown, compact_range_options.max_subcompactions, + /* grandparents */ {}, /* is manual */ true, trim_ts, /* score */ -1, + /* deletion_compaction */ false, /* l0_files_might_overlap */ true, + CompactionReason::kUnknown, + compact_range_options.blob_garbage_collection_policy, + compact_range_options.blob_garbage_collection_age_cutoff); + + RegisterCompaction(c); + vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options); + return c; + } + + CompactionInputFiles inputs; + inputs.level = input_level; + bool covering_the_whole_range = true; + + // All files are 'overlapping' in universal style compaction. + // We have to compact the entire range in one shot. + if (ioptions_.compaction_style == kCompactionStyleUniversal) { + begin = nullptr; + end = nullptr; + } + + vstorage->GetOverlappingInputs(input_level, begin, end, &inputs.files); + if (inputs.empty()) { + return nullptr; + } + + if ((input_level == 0) && (!level0_compactions_in_progress_.empty())) { + // Only one level 0 compaction allowed + TEST_SYNC_POINT("CompactionPicker::CompactRange:Conflict"); + *manual_conflict = true; + return nullptr; + } + + // Avoid compacting too much in one shot in case the range is large. + // But we cannot do this for level-0 since level-0 files can overlap + // and we must not pick one file and drop another older file if the + // two files overlap. + if (input_level > 0) { + const uint64_t limit = mutable_cf_options.max_compaction_bytes; + uint64_t input_level_total = 0; + int hint_index = -1; + InternalKey* smallest = nullptr; + InternalKey* largest = nullptr; + for (size_t i = 0; i + 1 < inputs.size(); ++i) { + if (!smallest) { + smallest = &inputs[i]->smallest; + } + largest = &inputs[i]->largest; + + uint64_t input_file_size = inputs[i]->fd.GetFileSize(); + uint64_t output_level_total = 0; + if (output_level < vstorage->num_non_empty_levels()) { + std::vector files; + vstorage->GetOverlappingInputsRangeBinarySearch( + output_level, smallest, largest, &files, hint_index, &hint_index); + for (const auto& file : files) { + output_level_total += file->fd.GetFileSize(); + } + } + + input_level_total += input_file_size; + + if (input_level_total + output_level_total >= limit) { + covering_the_whole_range = false; + // still include the current file, so the compaction could be larger + // than max_compaction_bytes, which is also to make sure the compaction + // can make progress even `max_compaction_bytes` is small (e.g. smaller + // than an SST file). + inputs.files.resize(i + 1); + break; + } + } + } + + assert(compact_range_options.target_path_id < + static_cast(ioptions_.cf_paths.size())); + + // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out + // files that are created during the current compaction. 
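+  // (Rationale, as far as the surrounding code shows: under
+  // BottommostLevelCompaction::kForceOptimized a bottommost manual
+  // compaction could otherwise pick up output files it has just written
+  // itself; ignoring file numbers >= max_file_num_to_ignore breaks that
+  // cycle.)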
+ if (compact_range_options.bottommost_level_compaction == + BottommostLevelCompaction::kForceOptimized && + max_file_num_to_ignore != std::numeric_limits::max()) { + assert(input_level == output_level); + // inputs_shrunk holds a continuous subset of input files which were all + // created before the current manual compaction + std::vector inputs_shrunk; + size_t skip_input_index = inputs.size(); + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) { + inputs_shrunk.push_back(inputs[i]); + } else if (!inputs_shrunk.empty()) { + // inputs[i] was created during the current manual compaction and + // need to be skipped + skip_input_index = i; + break; + } + } + if (inputs_shrunk.empty()) { + return nullptr; + } + if (inputs.size() != inputs_shrunk.size()) { + inputs.files.swap(inputs_shrunk); + } + // set covering_the_whole_range to false if there is any file that need to + // be compacted in the range of inputs[skip_input_index+1, inputs.size()) + for (size_t i = skip_input_index + 1; i < inputs.size(); ++i) { + if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) { + covering_the_whole_range = false; + } + } + } + + InternalKey key_storage; + InternalKey* next_smallest = &key_storage; + if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) == + false) { + // manual compaction is now multi-threaded, so it can + // happen that ExpandWhileOverlapping fails + // we handle it higher in RunManualCompaction + *manual_conflict = true; + return nullptr; + } + + if (covering_the_whole_range || !next_smallest) { + *compaction_end = nullptr; + } else { + **compaction_end = *next_smallest; + } + + CompactionInputFiles output_level_inputs; + if (output_level == ColumnFamilyData::kCompactToBaseLevel) { + assert(input_level == 0); + output_level = vstorage->base_level(); + assert(output_level > 0); + } + output_level_inputs.level = output_level; + if (input_level != output_level) { + int parent_index = -1; + if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs, + &output_level_inputs, &parent_index, -1)) { + // manual compaction is now multi-threaded, so it can + // happen that SetupOtherInputs fails + // we handle it higher in RunManualCompaction + *manual_conflict = true; + return nullptr; + } + } + + std::vector compaction_inputs({inputs}); + if (!output_level_inputs.empty()) { + compaction_inputs.push_back(output_level_inputs); + } + for (size_t i = 0; i < compaction_inputs.size(); i++) { + if (AreFilesInCompaction(compaction_inputs[i].files)) { + *manual_conflict = true; + return nullptr; + } + } + + // 2 non-exclusive manual compactions could run at the same time producing + // overlaping outputs in the same level. + if (FilesRangeOverlapWithCompaction( + compaction_inputs, output_level, + Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, input_level, + output_level))) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. 
+ *manual_conflict = true; + return nullptr; + } + + std::vector grandparents; + GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents); + Compaction* compaction = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(compaction_inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options, output_level, + ioptions_.compaction_style, vstorage->base_level(), + ioptions_.level_compaction_dynamic_level_bytes), + mutable_cf_options.max_compaction_bytes, + compact_range_options.target_path_id, + GetCompressionType(vstorage, mutable_cf_options, output_level, + vstorage->base_level()), + GetCompressionOptions(mutable_cf_options, vstorage, output_level), + Temperature::kUnknown, compact_range_options.max_subcompactions, + std::move(grandparents), /* is manual */ true, trim_ts, /* score */ -1, + /* deletion_compaction */ false, /* l0_files_might_overlap */ true, + CompactionReason::kUnknown, + compact_range_options.blob_garbage_collection_policy, + compact_range_options.blob_garbage_collection_age_cutoff); + + TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction); + RegisterCompaction(compaction); + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here + vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options); + + return compaction; +} + +namespace { +// Test whether two files have overlapping key-ranges. +bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a, + const SstFileMetaData& b) { + if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) { + if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) { + // b.smallestkey <= a.smallestkey <= b.largestkey + return true; + } + } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) { + // a.smallestkey < b.smallestkey <= a.largestkey + return true; + } + if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) { + if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) { + // b.smallestkey <= a.largestkey <= b.largestkey + return true; + } + } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) { + // a.smallestkey <= b.largestkey < a.largestkey + return true; + } + return false; +} +} // namespace + +Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( + std::unordered_set* input_files, + const ColumnFamilyMetaData& cf_meta, const int output_level) const { + auto& levels = cf_meta.levels; + auto comparator = icmp_->user_comparator(); + + // TODO(yhchiang): add is_adjustable to CompactionOptions + + // the smallest and largest key of the current compaction input + std::string smallestkey; + std::string largestkey; + // a flag for initializing smallest and largest key + bool is_first = false; + const int kNotFound = -1; + + // For each level, it does the following things: + // 1. Find the first and the last compaction input files + // in the current level. + // 2. Include all files between the first and the last + // compaction input files. + // 3. Update the compaction key-range. + // 4. For all remaining levels, include files that have + // overlapping key-range with the compaction key-range. 
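+  // Worked example (hypothetical layout): if input_files names one L1 file
+  // spanning [f, k] and L2 holds files covering [a, g] and [j, p], step 4
+  // pulls both L2 files into the compaction, since each overlaps part of
+  // the aggregated range [f, k].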
+ for (int l = 0; l <= output_level; ++l) { + auto& current_files = levels[l].files; + int first_included = static_cast(current_files.size()); + int last_included = kNotFound; + + // identify the first and the last compaction input files + // in the current level. + for (size_t f = 0; f < current_files.size(); ++f) { + const uint64_t file_number = TableFileNameToNumber(current_files[f].name); + if (input_files->find(file_number) == input_files->end()) { + continue; + } + first_included = std::min(first_included, static_cast(f)); + last_included = std::max(last_included, static_cast(f)); + if (is_first == false) { + smallestkey = current_files[f].smallestkey; + largestkey = current_files[f].largestkey; + is_first = true; + } + } + if (last_included == kNotFound) { + continue; + } + + if (l != 0) { + // expand the compaction input of the current level if it + // has overlapping key-range with other non-compaction input + // files in the same level. + while (first_included > 0) { + if (comparator->CompareWithoutTimestamp( + current_files[first_included - 1].largestkey, + current_files[first_included].smallestkey) < 0) { + break; + } + first_included--; + } + + while (last_included < static_cast(current_files.size()) - 1) { + if (comparator->CompareWithoutTimestamp( + current_files[last_included + 1].smallestkey, + current_files[last_included].largestkey) > 0) { + break; + } + last_included++; + } + } else if (output_level > 0) { + last_included = static_cast(current_files.size() - 1); + } + + // include all files between the first and the last compaction input files. + for (int f = first_included; f <= last_included; ++f) { + if (current_files[f].being_compacted) { + return Status::Aborted("Necessary compaction input file " + + current_files[f].name + + " is currently being compacted."); + } + + input_files->insert(TableFileNameToNumber(current_files[f].name)); + } + + // update smallest and largest key + if (l == 0) { + for (int f = first_included; f <= last_included; ++f) { + if (comparator->CompareWithoutTimestamp( + smallestkey, current_files[f].smallestkey) > 0) { + smallestkey = current_files[f].smallestkey; + } + if (comparator->CompareWithoutTimestamp( + largestkey, current_files[f].largestkey) < 0) { + largestkey = current_files[f].largestkey; + } + } + } else { + if (comparator->CompareWithoutTimestamp( + smallestkey, current_files[first_included].smallestkey) > 0) { + smallestkey = current_files[first_included].smallestkey; + } + if (comparator->CompareWithoutTimestamp( + largestkey, current_files[last_included].largestkey) < 0) { + largestkey = current_files[last_included].largestkey; + } + } + + SstFileMetaData aggregated_file_meta; + aggregated_file_meta.smallestkey = smallestkey; + aggregated_file_meta.largestkey = largestkey; + + // For all lower levels, include all overlapping files. 
+ // We need to add overlapping files from the current level too because even + // if there no input_files in level l, we would still need to add files + // which overlap with the range containing the input_files in levels 0 to l + // Level 0 doesn't need to be handled this way because files are sorted by + // time and not by key + for (int m = std::max(l, 1); m <= output_level; ++m) { + for (auto& next_lv_file : levels[m].files) { + if (HaveOverlappingKeyRanges(comparator, aggregated_file_meta, + next_lv_file)) { + if (next_lv_file.being_compacted) { + return Status::Aborted( + "File " + next_lv_file.name + + " that has overlapping key range with one of the compaction " + " input file is currently being compacted."); + } + input_files->insert(TableFileNameToNumber(next_lv_file.name)); + } + } + } + } + if (RangeOverlapWithCompaction(smallestkey, largestkey, output_level)) { + return Status::Aborted( + "A running compaction is writing to the same output level in an " + "overlapping key range"); + } + return Status::OK(); +} + +Status CompactionPicker::SanitizeCompactionInputFiles( + std::unordered_set* input_files, + const ColumnFamilyMetaData& cf_meta, const int output_level) const { + assert(static_cast(cf_meta.levels.size()) - 1 == + cf_meta.levels[cf_meta.levels.size() - 1].level); + if (output_level >= static_cast(cf_meta.levels.size())) { + return Status::InvalidArgument( + "Output level for column family " + cf_meta.name + + " must between [0, " + + std::to_string(cf_meta.levels[cf_meta.levels.size() - 1].level) + "]."); + } + + if (output_level > MaxOutputLevel()) { + return Status::InvalidArgument( + "Exceed the maximum output level defined by " + "the current compaction algorithm --- " + + std::to_string(MaxOutputLevel())); + } + + if (output_level < 0) { + return Status::InvalidArgument("Output level cannot be negative."); + } + + if (input_files->size() == 0) { + return Status::InvalidArgument( + "A compaction must contain at least one file."); + } + + Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta, + output_level); + + if (!s.ok()) { + return s; + } + + // for all input files, check whether the file number matches + // any currently-existing files. 
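+  // (This is a pure validity check: every requested file number must still
+  // exist in the column family metadata, must not already be part of a
+  // running compaction, and must not sit above the output level.)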
+ for (auto file_num : *input_files) { + bool found = false; + int input_file_level = -1; + for (const auto& level_meta : cf_meta.levels) { + for (const auto& file_meta : level_meta.files) { + if (file_num == TableFileNameToNumber(file_meta.name)) { + if (file_meta.being_compacted) { + return Status::Aborted("Specified compaction input file " + + MakeTableFileName("", file_num) + + " is already being compacted."); + } + found = true; + input_file_level = level_meta.level; + break; + } + } + if (found) { + break; + } + } + if (!found) { + return Status::InvalidArgument( + "Specified compaction input file " + MakeTableFileName("", file_num) + + " does not exist in column family " + cf_meta.name + "."); + } + if (input_file_level > output_level) { + return Status::InvalidArgument( + "Cannot compact file to up level, input file: " + + MakeTableFileName("", file_num) + " level " + + std::to_string(input_file_level) + " > output level " + + std::to_string(output_level)); + } + } + + return Status::OK(); +} + +void CompactionPicker::RegisterCompaction(Compaction* c) { + if (c == nullptr) { + return; + } + assert(ioptions_.compaction_style != kCompactionStyleLevel || + c->output_level() == 0 || + !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level(), + c->GetPenultimateLevel())); + // CompactionReason::kExternalSstIngestion's start level is just a placeholder + // number without actual meaning as file ingestion technically does not have + // an input level like other compactions + if ((c->start_level() == 0 && + c->compaction_reason() != CompactionReason::kExternalSstIngestion) || + ioptions_.compaction_style == kCompactionStyleUniversal) { + level0_compactions_in_progress_.insert(c); + } + compactions_in_progress_.insert(c); + TEST_SYNC_POINT_CALLBACK("CompactionPicker::RegisterCompaction:Registered", + c); +} + +void CompactionPicker::UnregisterCompaction(Compaction* c) { + if (c == nullptr) { + return; + } + if (c->start_level() == 0 || + ioptions_.compaction_style == kCompactionStyleUniversal) { + level0_compactions_in_progress_.erase(c); + } + compactions_in_progress_.erase(c); +} + +void CompactionPicker::PickFilesMarkedForCompaction( + const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level, + int* output_level, CompactionInputFiles* start_level_inputs) { + if (vstorage->FilesMarkedForCompaction().empty()) { + return; + } + + auto continuation = [&, cf_name](std::pair level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + *start_level = level_file.first; + *output_level = + (*start_level == 0) ? vstorage->base_level() : *start_level + 1; + + if (*start_level == 0 && !level0_compactions_in_progress()->empty()) { + return false; + } + + start_level_inputs->files = {level_file.second}; + start_level_inputs->level = *start_level; + return ExpandInputsToCleanCut(cf_name, vstorage, start_level_inputs); + }; + + // take a chance on a random file first + Random64 rnd(/* seed */ reinterpret_cast(vstorage)); + size_t random_file_index = static_cast(rnd.Uniform( + static_cast(vstorage->FilesMarkedForCompaction().size()))); + TEST_SYNC_POINT_CALLBACK("CompactionPicker::PickFilesMarkedForCompaction", + &random_file_index); + + if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) { + // found the compaction! 
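+      // (The random probe above avoids always starting from the same marked
+      // file; if it fails, the loop below falls back to scanning the whole
+      // FilesMarkedForCompaction() list in order.)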
+    return;
+  }
+
+  for (auto& level_file : vstorage->FilesMarkedForCompaction()) {
+    if (continuation(level_file)) {
+      // found the compaction!
+      return;
+    }
+  }
+  start_level_inputs->files.clear();
+}
+
+bool CompactionPicker::GetOverlappingL0Files(
+    VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs,
+    int output_level, int* parent_index) {
+  // Two level 0 compactions won't run at the same time, so we don't need to
+  // worry about files on level 0 being compacted.
+  assert(level0_compactions_in_progress()->empty());
+  InternalKey smallest, largest;
+  GetRange(*start_level_inputs, &smallest, &largest);
+  // Note that the next call will discard the file we placed in
+  // c->inputs_[0] earlier and replace it with an overlapping set
+  // which will include the picked file.
+  start_level_inputs->files.clear();
+  vstorage->GetOverlappingInputs(0, &smallest, &largest,
+                                 &(start_level_inputs->files));
+
+  // If we include more L0 files in the same compaction run it can
+  // cause the 'smallest' and 'largest' key to get extended to a
+  // larger range. So, re-invoke GetRange to get the new key range.
+  GetRange(*start_level_inputs, &smallest, &largest);
+  if (IsRangeInCompaction(vstorage, &smallest, &largest, output_level,
+                          parent_index)) {
+    return false;
+  }
+  assert(!start_level_inputs->files.empty());
+
+  return true;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker.h
new file mode 100644
index 0000000000..0556e99275
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker.h
@@ -0,0 +1,318 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/version_set.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The file contains an abstract class CompactionPicker, and its two
+// sub-classes LevelCompactionPicker and NullCompactionPicker, as
+// well as some helper functions used by them.
+
+class LogBuffer;
+class Compaction;
+class VersionStorageInfo;
+struct CompactionInputFiles;
+
+// An abstract class to pick compactions from an existing LSM-tree.
+//
+// Each compaction style inherits this class and implements the
+// interface to form automatic compactions. If NeedsCompaction() is true,
+// then call PickCompaction() to find what files need to be compacted
+// and where to put the output files.
+//
+// Non-virtual functions CompactRange() and CompactFiles() are used to
+// pick files to compact based on users' DB::CompactRange() and
+// DB::CompactFiles() requests, respectively. There is little
+// compaction style specific logic for them.
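+// Illustrative call pattern (a sketch only; the exact contracts are in the
+// member comments below):
+//
+//   if (picker->NeedsCompaction(vstorage)) {
+//     Compaction* c = picker->PickCompaction(cf_name, mutable_cf_options,
+//                                            mutable_db_options, vstorage,
+//                                            log_buffer);
+//     if (c != nullptr) {
+//       // ... run the compaction job, then, holding the DB mutex:
+//       picker->ReleaseCompactionFiles(c, status);
+//       delete c;  // PickCompaction() heap-allocates; caller deletes.
+//     }
+//   }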
+class CompactionPicker {
+ public:
+  CompactionPicker(const ImmutableOptions& ioptions,
+                   const InternalKeyComparator* icmp);
+  virtual ~CompactionPicker();
+
+  // Pick level and inputs for a new compaction.
+  //
+  // Returns nullptr if there is no compaction to be done.
+  // Otherwise returns a pointer to a heap-allocated object that
+  // describes the compaction. Caller should delete the result.
+  virtual Compaction* PickCompaction(const std::string& cf_name,
+                                     const MutableCFOptions& mutable_cf_options,
+                                     const MutableDBOptions& mutable_db_options,
+                                     VersionStorageInfo* vstorage,
+                                     LogBuffer* log_buffer) = 0;
+
+  // Return a compaction object for compacting the range [begin,end] in
+  // the specified level. Returns nullptr if there is nothing in that
+  // level that overlaps the specified range. Caller should delete
+  // the result.
+  //
+  // The returned Compaction might not include the whole requested range.
+  // In that case, compaction_end will be set to the next key that needs
+  // compacting. In case the compaction will compact the whole range,
+  // compaction_end will be set to nullptr.
+  // Client is responsible for compaction_end storage -- when called,
+  // *compaction_end should point to a valid InternalKey!
+  virtual Compaction* CompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      int input_level, int output_level,
+      const CompactRangeOptions& compact_range_options,
+      const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end, bool* manual_conflict,
+      uint64_t max_file_num_to_ignore, const std::string& trim_ts);
+
+  // The maximum allowed output level. Default value is NumberLevels() - 1.
+  virtual int MaxOutputLevel() const { return NumberLevels() - 1; }
+
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0;
+
+  // Sanitize the input set of compaction input files.
+  // When the input parameters do not describe a valid compaction, the
+  // function will try to fix the input_files by adding necessary
+  // files. If it's not possible to convert an invalid input_files
+  // into a valid one by adding more files, the function will return a
+  // non-ok status with a specific reason.
+  Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files,
+                                      const ColumnFamilyMetaData& cf_meta,
+                                      const int output_level) const;
+
+  // Free up the files that participated in a compaction
+  //
+  // Requirement: DB mutex held
+  void ReleaseCompactionFiles(Compaction* c, Status status);
+
+  // Returns true if any one of the specified files are being compacted
+  bool AreFilesInCompaction(const std::vector<FileMetaData*>& files);
+
+  // Takes a list of CompactionInputFiles and returns a (manual) Compaction
+  // object.
+  //
+  // Caller must provide a set of input files that has been passed through
+  // `SanitizeCompactionInputFiles` earlier. The lock should not be released
+  // between that call and this one.
+  Compaction* CompactFiles(const CompactionOptions& compact_options,
+                           const std::vector<CompactionInputFiles>& input_files,
+                           int output_level, VersionStorageInfo* vstorage,
+                           const MutableCFOptions& mutable_cf_options,
+                           const MutableDBOptions& mutable_db_options,
+                           uint32_t output_path_id);
+
+  // Converts a set of compaction input file numbers into
+  // a list of CompactionInputFiles.
+  Status GetCompactionInputsFromFileNumbers(
+      std::vector<CompactionInputFiles>* input_files,
+      std::unordered_set<uint64_t>* input_set,
+      const VersionStorageInfo* vstorage,
+      const CompactionOptions& compact_options) const;
+
+  // Is there currently a compaction involving level 0 taking place
+  bool IsLevel0CompactionInProgress() const {
+    return !level0_compactions_in_progress_.empty();
+  }
+
+  // Return true if the passed key range overlaps with a compaction output
+  // that is currently running.
+  bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+                                  const Slice& largest_user_key,
+                                  int level) const;
+
+  // Stores the minimal range that covers all entries in inputs in
+  // *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const CompactionInputFiles& inputs, InternalKey* smallest,
+                InternalKey* largest) const;
+
+  // Stores the minimal range that covers all entries in inputs1 and inputs2
+  // in *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const CompactionInputFiles& inputs1,
+                const CompactionInputFiles& inputs2, InternalKey* smallest,
+                InternalKey* largest) const;
+
+  // Stores the minimal range that covers all entries in inputs
+  // in *smallest, *largest.
+  // REQUIRES: inputs is not empty (at least one entry has one file)
+  void GetRange(const std::vector<CompactionInputFiles>& inputs,
+                InternalKey* smallest, InternalKey* largest,
+                int exclude_level) const;
+
+  int NumberLevels() const { return ioptions_.num_levels; }
+
+  // Add more files to the inputs on "level" to make sure that
+  // no newer version of a key is compacted to "level+1" while leaving an older
+  // version in a "level". Otherwise, any Get() will search "level" first,
+  // and will likely return an old/stale value for the key, since it always
+  // searches in increasing order of level to find the value. This could
+  // also scramble the order of merge operands. This function should be
+  // called any time a new Compaction is created, and its inputs_[0] are
+  // populated.
+  //
+  // Will return false if it is impossible to apply this compaction.
+  bool ExpandInputsToCleanCut(const std::string& cf_name,
+                              VersionStorageInfo* vstorage,
+                              CompactionInputFiles* inputs,
+                              InternalKey** next_smallest = nullptr);
+
+  // Returns true if any one of the parent files are being compacted
+  bool IsRangeInCompaction(VersionStorageInfo* vstorage,
+                           const InternalKey* smallest,
+                           const InternalKey* largest, int level, int* index);
+
+  // Returns true if the key range that `inputs` files cover overlaps with the
+  // key range of a currently running compaction.
+  bool FilesRangeOverlapWithCompaction(
+      const std::vector<CompactionInputFiles>& inputs, int level,
+      int penultimate_level) const;
+
+  bool SetupOtherInputs(const std::string& cf_name,
+                        const MutableCFOptions& mutable_cf_options,
+                        VersionStorageInfo* vstorage,
+                        CompactionInputFiles* inputs,
+                        CompactionInputFiles* output_level_inputs,
+                        int* parent_index, int base_index,
+                        bool only_expand_towards_right = false);
+
+  void GetGrandparents(VersionStorageInfo* vstorage,
+                       const CompactionInputFiles& inputs,
+                       const CompactionInputFiles& output_level_inputs,
+                       std::vector<FileMetaData*>* grandparents);
+
+  void PickFilesMarkedForCompaction(const std::string& cf_name,
+                                    VersionStorageInfo* vstorage,
+                                    int* start_level, int* output_level,
+                                    CompactionInputFiles* start_level_inputs);
+
+  bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
+                             CompactionInputFiles* start_level_inputs,
+                             int output_level, int* parent_index);
+
+  // Register this compaction in the set of running compactions
+  void RegisterCompaction(Compaction* c);
+
+  // Remove this compaction from the set of running compactions
+  void UnregisterCompaction(Compaction* c);
+
+  std::set<Compaction*>* level0_compactions_in_progress() {
+    return &level0_compactions_in_progress_;
+  }
+  std::unordered_set<Compaction*>* compactions_in_progress() {
+    return &compactions_in_progress_;
+  }
+
+  const InternalKeyComparator* icmp() const { return icmp_; }
+
+ protected:
+  const ImmutableOptions& ioptions_;
+
+  // A helper function to SanitizeCompactionInputFiles() that
+  // sanitizes "input_files" by adding necessary files.
+  virtual Status SanitizeCompactionInputFilesForAllLevels(
+      std::unordered_set<uint64_t>* input_files,
+      const ColumnFamilyMetaData& cf_meta, const int output_level) const;
+
+  // Keeps track of all compactions that are running on Level0.
+  // Protected by DB mutex
+  std::set<Compaction*> level0_compactions_in_progress_;
+
+  // Keeps track of all compactions that are running.
+  // Protected by DB mutex
+  std::unordered_set<Compaction*> compactions_in_progress_;
+
+  const InternalKeyComparator* const icmp_;
+};
+
+// A dummy compaction that never triggers any automatic
+// compaction.
+class NullCompactionPicker : public CompactionPicker {
+ public:
+  NullCompactionPicker(const ImmutableOptions& ioptions,
+                       const InternalKeyComparator* icmp)
+      : CompactionPicker(ioptions, icmp) {}
+  virtual ~NullCompactionPicker() {}
+
+  // Always return "nullptr"
+  Compaction* PickCompaction(const std::string& /*cf_name*/,
+                             const MutableCFOptions& /*mutable_cf_options*/,
+                             const MutableDBOptions& /*mutable_db_options*/,
+                             VersionStorageInfo* /*vstorage*/,
+                             LogBuffer* /* log_buffer */) override {
+    return nullptr;
+  }
+
+  // Always return "nullptr"
+  Compaction* CompactRange(const std::string& /*cf_name*/,
+                           const MutableCFOptions& /*mutable_cf_options*/,
+                           const MutableDBOptions& /*mutable_db_options*/,
+                           VersionStorageInfo* /*vstorage*/,
+                           int /*input_level*/, int /*output_level*/,
+                           const CompactRangeOptions& /*compact_range_options*/,
+                           const InternalKey* /*begin*/,
+                           const InternalKey* /*end*/,
+                           InternalKey** /*compaction_end*/,
+                           bool* /*manual_conflict*/,
+                           uint64_t /*max_file_num_to_ignore*/,
+                           const std::string& /*trim_ts*/) override {
+    return nullptr;
+  }
+
+  // Always returns false.
+  virtual bool NeedsCompaction(
+      const VersionStorageInfo* /*vstorage*/) const override {
+    return false;
+  }
+};
+
+// Attempts to find an intra L0 compaction conforming to the given parameters.
+//
+// @param level_files                    Metadata for L0 files.
+// @param min_files_to_compact           Minimum number of files required to
+//                                       do the compaction.
+// @param max_compact_bytes_per_del_file Maximum average size in bytes per
+//                                       file that is going to get deleted by
+//                                       the compaction.
+// @param max_compaction_bytes           Maximum total size in bytes (in terms
+//                                       of compensated file size) for files
+//                                       to be compacted.
+// @param [out] comp_inputs              If a compaction was found, will be
+//                                       initialized with corresponding input
+//                                       files. Cannot be nullptr.
+//
+// @return true iff compaction was found.
+bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
+                           size_t min_files_to_compact,
+                           uint64_t max_compact_bytes_per_del_file,
+                           uint64_t max_compaction_bytes,
+                           CompactionInputFiles* comp_inputs);
+
+CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   int level, int base_level,
+                                   const bool enable_compression = true);
+
+CompressionOptions GetCompressionOptions(
+    const MutableCFOptions& mutable_cf_options,
+    const VersionStorageInfo* vstorage, int level,
+    const bool enable_compression = true);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_fifo.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_fifo.cc
new file mode 100644
index 0000000000..9aa24302e2
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_fifo.cc
@@ -0,0 +1,471 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_fifo.h"
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "options/options_helper.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
+  uint64_t total_size = 0;
+  for (const auto& f : files) {
+    total_size += f->fd.file_size;
+  }
+  return total_size;
+}
+}  // anonymous namespace
+
+bool FIFOCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  const int kLevel0 = 0;
+  return vstorage->CompactionScore(kLevel0) >= 1;
+}
+
+Compaction* FIFOCompactionPicker::PickTTLCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer) {
+  assert(mutable_cf_options.ttl > 0);
+
+  const int kLevel0 = 0;
+  const std::vector<FileMetaData*>& level_files =
+      vstorage->LevelFiles(kLevel0);
+  uint64_t total_size = GetTotalFilesSize(level_files);
+
+  int64_t _current_time;
+  auto status = ioptions_.clock->GetCurrentTime(&_current_time);
+  if (!status.ok()) {
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO compaction: Couldn't get current time: %s. "
+                     "Not doing compactions based on TTL. ",
+                     cf_name.c_str(), status.ToString().c_str());
+    return nullptr;
+  }
+  const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+  if (!level0_compactions_in_progress_.empty()) {
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] FIFO compaction: Already executing compaction. 
No need " + "to run parallel compactions since compactions are very fast", + cf_name.c_str()); + return nullptr; + } + + std::vector inputs; + inputs.emplace_back(); + inputs[0].level = 0; + + // avoid underflow + if (current_time > mutable_cf_options.ttl) { + for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { + FileMetaData* f = *ritr; + assert(f); + if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) { + uint64_t creation_time = + f->fd.table_reader->GetTableProperties()->creation_time; + if (creation_time == 0 || + creation_time >= (current_time - mutable_cf_options.ttl)) { + break; + } + } + total_size -= f->fd.file_size; + inputs[0].files.push_back(f); + } + } + + // Return a nullptr and proceed to size-based FIFO compaction if: + // 1. there are no files older than ttl OR + // 2. there are a few files older than ttl, but deleting them will not bring + // the total size to be less than max_table_files_size threshold. + if (inputs[0].files.empty() || + total_size > + mutable_cf_options.compaction_options_fifo.max_table_files_size) { + return nullptr; + } + + for (const auto& f : inputs[0].files) { + uint64_t creation_time = 0; + assert(f); + if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) { + creation_time = f->fd.table_reader->GetTableProperties()->creation_time; + } + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with creation time %" PRIu64 " for deletion", + cf_name.c_str(), f->fd.GetNumber(), creation_time); + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), 0, 0, 0, 0, kNoCompression, + mutable_cf_options.compression_opts, Temperature::kUnknown, + /* max_subcompactions */ 0, {}, /* is manual */ false, + /* trim_ts */ "", vstorage->CompactionScore(0), + /* is deletion compaction */ true, /* l0_files_might_overlap */ true, + CompactionReason::kFIFOTtl); + return c; +} + +// The size-based compaction picker for FIFO. +// +// When the entire column family size exceeds max_table_files_size, FIFO will +// try to delete the oldest sst file(s) until the resulting column family size +// is smaller than max_table_files_size. +// +// This function also takes care the case where a DB is migrating from level / +// universal compaction to FIFO compaction. During the migration, the column +// family will also have non-L0 files while FIFO can only create L0 files. +// In this case, this function will first purge the sst files in the bottom- +// most non-empty level first, and the DB will eventually converge to the +// regular FIFO case where there're only L0 files. Note that during the +// migration case, the purge order will only be an approximation of "FIFO" +// as entries inside lower-level files might sometimes be newer than some +// entries inside upper-level files. 
+Compaction* FIFOCompactionPicker::PickSizeCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { + // compute the total size and identify the last non-empty level + int last_level = 0; + uint64_t total_size = 0; + for (int level = 0; level < vstorage->num_levels(); ++level) { + auto level_size = GetTotalFilesSize(vstorage->LevelFiles(level)); + total_size += level_size; + if (level_size > 0) { + last_level = level; + } + } + const std::vector& last_level_files = + vstorage->LevelFiles(last_level); + + if (last_level == 0 && + total_size <= + mutable_cf_options.compaction_options_fifo.max_table_files_size) { + // total size not exceeded, try to find intra level 0 compaction if enabled + const std::vector& level0_files = vstorage->LevelFiles(0); + if (mutable_cf_options.compaction_options_fifo.allow_compaction && + level0_files.size() > 0) { + CompactionInputFiles comp_inputs; + // try to prevent same files from being compacted multiple times, which + // could produce large files that may never TTL-expire. Achieve this by + // disallowing compactions with files larger than memtable (inflate its + // size by 10% to account for uncompressed L0 files that may have size + // slightly greater than memtable size limit). + size_t max_compact_bytes_per_del_file = + static_cast(MultiplyCheckOverflow( + static_cast(mutable_cf_options.write_buffer_size), + 1.1)); + if (FindIntraL0Compaction( + level0_files, + mutable_cf_options + .level0_file_num_compaction_trigger /* min_files_to_compact */ + , + max_compact_bytes_per_del_file, + mutable_cf_options.max_compaction_bytes, &comp_inputs)) { + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */, + 0 /* max compaction bytes, not applicable */, + 0 /* output path ID */, mutable_cf_options.compression, + mutable_cf_options.compression_opts, Temperature::kUnknown, + 0 /* max_subcompactions */, {}, /* is manual */ false, + /* trim_ts */ "", vstorage->CompactionScore(0), + /* is deletion compaction */ false, + /* l0_files_might_overlap */ true, + CompactionReason::kFIFOReduceNumFiles); + return c; + } + } + + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: nothing to do. Total size %" PRIu64 + ", max size %" PRIu64 "\n", + cf_name.c_str(), total_size, + mutable_cf_options.compaction_options_fifo.max_table_files_size); + return nullptr; + } + + if (!level0_compactions_in_progress_.empty()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Already executing compaction. No need " + "to run parallel compactions since compactions are very fast", + cf_name.c_str()); + return nullptr; + } + + std::vector inputs; + inputs.emplace_back(); + inputs[0].level = last_level; + + if (last_level == 0) { + // In L0, right-most files are the oldest files. 
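+    // (So walking the level back-to-front below deletes files oldest-first,
+    // stopping once the remaining total fits under max_table_files_size.)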
+ for (auto ritr = last_level_files.rbegin(); ritr != last_level_files.rend(); + ++ritr) { + auto f = *ritr; + total_size -= f->fd.file_size; + inputs[0].files.push_back(f); + char tmp_fsize[16]; + AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with size %s for deletion", + cf_name.c_str(), f->fd.GetNumber(), tmp_fsize); + if (total_size <= + mutable_cf_options.compaction_options_fifo.max_table_files_size) { + break; + } + } + } else if (total_size > + mutable_cf_options.compaction_options_fifo.max_table_files_size) { + // If the last level is non-L0, we actually don't know which file is + // logically the oldest since the file creation time only represents + // when this file was compacted to this level, which is independent + // to when the entries in this file were first inserted. + // + // As a result, we delete files from the left instead. This means the sst + // file with the smallest key will be deleted first. This design decision + // better serves a major type of FIFO use cases where smaller keys are + // associated with older data. + for (const auto& f : last_level_files) { + total_size -= f->fd.file_size; + inputs[0].files.push_back(f); + char tmp_fsize[16]; + AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with size %s for deletion under total size %" PRIu64 + " vs max table files size %" PRIu64, + cf_name.c_str(), f->fd.GetNumber(), tmp_fsize, total_size, + mutable_cf_options.compaction_options_fifo.max_table_files_size); + + if (total_size <= + mutable_cf_options.compaction_options_fifo.max_table_files_size) { + break; + } + } + } else { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: nothing to do. Total size %" PRIu64 + ", max size %" PRIu64 "\n", + cf_name.c_str(), total_size, + mutable_cf_options.compaction_options_fifo.max_table_files_size); + return nullptr; + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), last_level, + /* target_file_size */ 0, + /* max_compaction_bytes */ 0, + /* output_path_id */ 0, kNoCompression, + mutable_cf_options.compression_opts, Temperature::kUnknown, + /* max_subcompactions */ 0, {}, /* is manual */ false, + /* trim_ts */ "", vstorage->CompactionScore(0), + /* is deletion compaction */ true, + /* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize); + return c; +} + +Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { + const std::vector& ages = + mutable_cf_options.compaction_options_fifo + .file_temperature_age_thresholds; + if (ages.empty()) { + return nullptr; + } + + // Does not apply to multi-level FIFO. + if (vstorage->num_levels() > 1) { + return nullptr; + } + + const int kLevel0 = 0; + const std::vector& level_files = vstorage->LevelFiles(kLevel0); + if (level_files.empty()) { + return nullptr; + } + + int64_t _current_time; + auto status = ioptions_.clock->GetCurrentTime(&_current_time); + if (!status.ok()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Couldn't get current time: %s. " + "Not doing compactions based on file temperature-age threshold. 
", + cf_name.c_str(), status.ToString().c_str()); + return nullptr; + } + const uint64_t current_time = static_cast(_current_time); + + if (!level0_compactions_in_progress_.empty()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Already executing compaction. Parallel " + "compactions are not supported", + cf_name.c_str()); + return nullptr; + } + + std::vector inputs; + inputs.emplace_back(); + inputs[0].level = 0; + + // avoid underflow + uint64_t min_age = ages[0].age; + // kLastTemperature means target temperature is to be determined. + Temperature compaction_target_temp = Temperature::kLastTemperature; + if (current_time > min_age) { + uint64_t create_time_threshold = current_time - min_age; + uint64_t compaction_size = 0; + // We will ideally identify a file qualifying for temperature change by + // knowing the timestamp for the youngest entry in the file. However, right + // now we don't have the information. We infer it by looking at timestamp of + // the previous file's (which is just younger) oldest entry's timestamp. + Temperature cur_target_temp; + // avoid index underflow + assert(level_files.size() >= 1); + for (size_t index = level_files.size() - 1; index >= 1; --index) { + // Try to add cur_file to compaction inputs. + FileMetaData* cur_file = level_files[index]; + // prev_file is just younger than cur_file + FileMetaData* prev_file = level_files[index - 1]; + if (cur_file->being_compacted) { + // Should not happen since we check for + // `level0_compactions_in_progress_` above. Here we simply just don't + // schedule anything. + return nullptr; + } + uint64_t oldest_ancestor_time = prev_file->TryGetOldestAncesterTime(); + if (oldest_ancestor_time == kUnknownOldestAncesterTime) { + // Older files might not have enough information. It is possible to + // handle these files by looking at newer files, but maintaining the + // logic isn't worth it. 
+ break; + } + if (oldest_ancestor_time > create_time_threshold) { + // cur_file is too fresh + break; + } + cur_target_temp = ages[0].temperature; + for (size_t i = 1; i < ages.size(); ++i) { + if (current_time >= ages[i].age && + oldest_ancestor_time <= current_time - ages[i].age) { + cur_target_temp = ages[i].temperature; + } + } + if (cur_file->temperature == cur_target_temp) { + if (inputs[0].empty()) { + continue; + } else { + break; + } + } + + // cur_file needs to change temperature + if (compaction_target_temp == Temperature::kLastTemperature) { + assert(inputs[0].empty()); + compaction_target_temp = cur_target_temp; + } else if (cur_target_temp != compaction_target_temp) { + assert(!inputs[0].empty()); + break; + } + if (inputs[0].empty() || compaction_size + cur_file->fd.GetFileSize() <= + mutable_cf_options.max_compaction_bytes) { + inputs[0].files.push_back(cur_file); + compaction_size += cur_file->fd.GetFileSize(); + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with next file's oldest time %" PRIu64 " for temperature %s.", + cf_name.c_str(), cur_file->fd.GetNumber(), oldest_ancestor_time, + temperature_to_string[cur_target_temp].c_str()); + } + if (compaction_size > mutable_cf_options.max_compaction_bytes) { + break; + } + } + } + + if (inputs[0].files.empty()) { + return nullptr; + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), 0, 0 /* output file size limit */, + 0 /* max compaction bytes, not applicable */, 0 /* output path ID */, + mutable_cf_options.compression, mutable_cf_options.compression_opts, + compaction_target_temp, + /* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "", + vstorage->CompactionScore(0), + /* is deletion compaction */ false, /* l0_files_might_overlap */ true, + CompactionReason::kChangeTemperature); + return c; +} + +Compaction* FIFOCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { + Compaction* c = nullptr; + if (mutable_cf_options.ttl > 0) { + c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); + } + if (c == nullptr) { + c = PickSizeCompaction(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); + } + if (c == nullptr) { + c = PickTemperatureChangeCompaction( + cf_name, mutable_cf_options, mutable_db_options, vstorage, log_buffer); + } + RegisterCompaction(c); + return c; +} + +Compaction* FIFOCompactionPicker::CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, + const CompactRangeOptions& /*compact_range_options*/, + const InternalKey* /*begin*/, const InternalKey* /*end*/, + InternalKey** compaction_end, bool* /*manual_conflict*/, + uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/) { +#ifdef NDEBUG + (void)input_level; + (void)output_level; +#endif + assert(input_level == 0); + assert(output_level == 0); + *compaction_end = nullptr; + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger); + Compaction* c = PickCompaction(cf_name, mutable_cf_options, + mutable_db_options, vstorage, &log_buffer); + log_buffer.FlushBufferToLog(); + return c; +} + +} // namespace ROCKSDB_NAMESPACE diff --git 
a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_fifo.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_fifo.h new file mode 100644 index 0000000000..df21a1bde0 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_fifo.h @@ -0,0 +1,60 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/compaction/compaction_picker.h" + +namespace ROCKSDB_NAMESPACE { +class FIFOCompactionPicker : public CompactionPicker { + public: + FIFOCompactionPicker(const ImmutableOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer) override; + + virtual Compaction* CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, + const CompactRangeOptions& compact_range_options, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end, bool* manual_conflict, + uint64_t max_file_num_to_ignore, const std::string& trim_ts) override; + + // The maximum allowed output level. Always returns 0. + virtual int MaxOutputLevel() const override { return 0; } + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; + + private: + Compaction* PickTTLCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); + + Compaction* PickSizeCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); + + Compaction* PickTemperatureChangeCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer); +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_level.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_level.cc new file mode 100644 index 0000000000..c436689bb6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_level.cc @@ -0,0 +1,888 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#include "db/compaction/compaction_picker_level.h" + +#include +#include +#include + +#include "db/version_edit.h" +#include "logging/log_buffer.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +bool LevelCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage) const { + if (!vstorage->ExpiredTtlFiles().empty()) { + return true; + } + if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { + return true; + } + if (!vstorage->BottommostFilesMarkedForCompaction().empty()) { + return true; + } + if (!vstorage->FilesMarkedForCompaction().empty()) { + return true; + } + if (!vstorage->FilesMarkedForForcedBlobGC().empty()) { + return true; + } + for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { + if (vstorage->CompactionScore(i) >= 1) { + return true; + } + } + return false; +} + +namespace { +// A class to build a leveled compaction step-by-step. +class LevelCompactionBuilder { + public: + LevelCompactionBuilder(const std::string& cf_name, + VersionStorageInfo* vstorage, + CompactionPicker* compaction_picker, + LogBuffer* log_buffer, + const MutableCFOptions& mutable_cf_options, + const ImmutableOptions& ioptions, + const MutableDBOptions& mutable_db_options) + : cf_name_(cf_name), + vstorage_(vstorage), + compaction_picker_(compaction_picker), + log_buffer_(log_buffer), + mutable_cf_options_(mutable_cf_options), + ioptions_(ioptions), + mutable_db_options_(mutable_db_options) {} + + // Pick and return a compaction. + Compaction* PickCompaction(); + + // Pick the initial files to compact to the next level. (or together + // in Intra-L0 compactions) + void SetupInitialFiles(); + + // If the initial files are from L0 level, pick other L0 + // files if needed. + bool SetupOtherL0FilesIfNeeded(); + + // Compaction with round-robin compaction priority allows more files to be + // picked to form a large compaction + void SetupOtherFilesWithRoundRobinExpansion(); + // Based on initial files, setup other files need to be compacted + // in this compaction, accordingly. + bool SetupOtherInputsIfNeeded(); + + Compaction* GetCompaction(); + + // From `start_level_`, pick files to compact to `output_level_`. + // Returns false if there is no file to compact. + // If it returns true, inputs->files.size() will be exactly one for + // all compaction priorities except round-robin. For round-robin, + // multiple consecutive files may be put into inputs->files. + // If level is 0 and there is already a compaction on that level, this + // function will return false. + bool PickFileToCompact(); + + // Return true if a L0 trivial move is picked up. + bool TryPickL0TrivialMove(); + + // For L0->L0, picks the longest span of files that aren't currently + // undergoing compaction for which work-per-deleted-file decreases. The span + // always starts from the newest L0 file. + // + // Intra-L0 compaction is independent of all other files, so it can be + // performed even when L0->base_level compactions are blocked. + // + // Returns true if `inputs` is populated with a span of files to be compacted; + // otherwise, returns false. + bool PickIntraL0Compaction(); + + // Return true if TrivialMove is extended. `start_index` is the index of + // the initial file picked, which should already be in `start_level_inputs_`. + bool TryExtendNonL0TrivialMove(int start_index, + bool only_expand_right = false); + + // Picks a file from level_files to compact. 
+
+void LevelCompactionBuilder::PickFileToCompact(
+    const autovector<std::pair<int, FileMetaData*>>& level_files,
+    bool compact_to_next_level) {
+  for (auto& level_file : level_files) {
+    // If it's being compacted it has nothing to do here.
+    // If this assert() fails that means that some function marked some
+    // files as being_compacted, but didn't call ComputeCompactionScore().
+    assert(!level_file.second->being_compacted);
+    start_level_ = level_file.first;
+    if ((compact_to_next_level &&
+         start_level_ == vstorage_->num_non_empty_levels() - 1) ||
+        (start_level_ == 0 &&
+         !compaction_picker_->level0_compactions_in_progress()->empty())) {
+      continue;
+    }
+    if (compact_to_next_level) {
+      output_level_ =
+          (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+    } else {
+      output_level_ = start_level_;
+    }
+    start_level_inputs_.files = {level_file.second};
+    start_level_inputs_.level = start_level_;
+    if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                   &start_level_inputs_)) {
+      return;
+    }
+  }
+  start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::SetupInitialFiles() {
+  // Find the compactions by size on all levels.
+  bool skipped_l0_to_base = false;
+  for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
+    start_level_score_ = vstorage_->CompactionScore(i);
+    start_level_ = vstorage_->CompactionScoreLevel(i);
+    assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
+    if (start_level_score_ >= 1) {
+      if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
+        // If L0->base_level compaction is pending, don't schedule further
+        // compaction from base level. Otherwise L0->base_level compaction
+        // may starve.
+        continue;
+      }
+      output_level_ =
+          (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+      bool picked_file_to_compact = PickFileToCompact();
+      TEST_SYNC_POINT_CALLBACK("PostPickFileToCompact",
+                               &picked_file_to_compact);
+      if (picked_file_to_compact) {
+        // found the compaction!
+ if (start_level_ == 0) { + // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + } else { + // L1+ score = `Level files size` / `MaxBytesForLevel` + compaction_reason_ = CompactionReason::kLevelMaxLevelSize; + } + break; + } else { + // didn't find the compaction, clear the inputs + start_level_inputs_.clear(); + if (start_level_ == 0) { + skipped_l0_to_base = true; + // L0->base_level may be blocked due to ongoing L0->base_level + // compactions. It may also be blocked by an ongoing compaction from + // base_level downwards. + // + // In these cases, to reduce L0 file count and thus reduce likelihood + // of write stalls, we can attempt compacting a span of files within + // L0. + if (PickIntraL0Compaction()) { + output_level_ = 0; + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + break; + } + } + } + } else { + // Compaction scores are sorted in descending order, no further scores + // will be >= 1. + break; + } + } + if (!start_level_inputs_.empty()) { + return; + } + + // if we didn't find a compaction, check if there are any files marked for + // compaction + parent_index_ = base_index_ = -1; + + compaction_picker_->PickFilesMarkedForCompaction( + cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; + return; + } + + // Bottommost Files Compaction on deleting tombstones + PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kBottommostFiles; + return; + } + + // TTL Compaction + if (ioptions_.compaction_pri == kRoundRobin && + !vstorage_->ExpiredTtlFiles().empty()) { + auto expired_files = vstorage_->ExpiredTtlFiles(); + // the expired files list should already be sorted by level + start_level_ = expired_files.front().first; +#ifndef NDEBUG + for (const auto& file : expired_files) { + assert(start_level_ <= file.first); + } +#endif + if (start_level_ > 0) { + output_level_ = start_level_ + 1; + if (PickFileToCompact()) { + compaction_reason_ = CompactionReason::kRoundRobinTtl; + return; + } + } + } + + PickFileToCompact(vstorage_->ExpiredTtlFiles(), true); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kTtl; + return; + } + + // Periodic Compaction + PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kPeriodicCompaction; + return; + } + + // Forced blob garbage collection + PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kForcedBlobGC; + return; + } +} + +bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { + if (start_level_ == 0 && output_level_ != 0 && !is_l0_trivial_move_) { + return compaction_picker_->GetOverlappingL0Files( + vstorage_, &start_level_inputs_, output_level_, &parent_index_); + } + return true; +} + +void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() { + // We only expand when the start level is not L0 under round robin + assert(start_level_ >= 1); + + // For round-robin compaction priority, we have 3 constraints when picking + // multiple files. 
+  // Constraint 1: We can only pick consecutive files
+  //  -> Constraint 1a: When a file is being compacted (or some input files
+  //                    are being compacted after expanding), we cannot
+  //                    choose it and have to stop choosing more files
+  //  -> Constraint 1b: When we reach the last file (with largest keys), we
+  //                    cannot choose more files (the next file will be the
+  //                    first one)
+  // Constraint 2: We should ensure the total compaction bytes (including the
+  //               overlapped files from the next level) is no more than
+  //               mutable_cf_options_.max_compaction_bytes
+  // Constraint 3: We try our best to pick as many files as possible so that
+  //               the post-compaction level size is less than
+  //               MaxBytesForLevel(start_level_)
+  // Constraint 4: We do not expand if it is possible to apply a trivial move
+  // Constraint 5 (TODO): Try to pick minimal files to split into the target
+  //               number of subcompactions
+  TEST_SYNC_POINT("LevelCompactionPicker::RoundRobin");
+
+  // Only expand the inputs when we have selected a file in start_level_inputs_
+  if (start_level_inputs_.size() == 0) return;
+
+  uint64_t start_lvl_bytes_no_compacting = 0;
+  uint64_t curr_bytes_to_compact = 0;
+  uint64_t start_lvl_max_bytes_to_compact = 0;
+  const std::vector<FileMetaData*>& level_files =
+      vstorage_->LevelFiles(start_level_);
+  // Constraint 3 (pre-calculate the ideal max bytes to compact)
+  for (auto f : level_files) {
+    if (!f->being_compacted) {
+      start_lvl_bytes_no_compacting += f->fd.GetFileSize();
+    }
+  }
+  if (start_lvl_bytes_no_compacting >
+      vstorage_->MaxBytesForLevel(start_level_)) {
+    start_lvl_max_bytes_to_compact = start_lvl_bytes_no_compacting -
+                                     vstorage_->MaxBytesForLevel(start_level_);
+  }
+
+  size_t start_index = vstorage_->FilesByCompactionPri(start_level_)[0];
+  InternalKey smallest, largest;
+  // Constraint 4 (No need to check again later)
+  compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+  CompactionInputFiles output_level_inputs;
+  output_level_inputs.level = output_level_;
+  vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+                                  &output_level_inputs.files);
+  if (output_level_inputs.empty()) {
+    if (TryExtendNonL0TrivialMove((int)start_index,
+                                  true /* only_expand_right */)) {
+      return;
+    }
+  }
+  // Constraint 3
+  if (start_level_inputs_[0]->fd.GetFileSize() >=
+      start_lvl_max_bytes_to_compact) {
+    return;
+  }
+  CompactionInputFiles tmp_start_level_inputs;
+  tmp_start_level_inputs = start_level_inputs_;
+  // TODO (zichen): Future parallel round-robin may also need to update this
+  // Constraint 1b (only expand till the end)
+  for (size_t i = start_index + 1; i < level_files.size(); i++) {
+    auto* f = level_files[i];
+    if (f->being_compacted) {
+      // Constraint 1a
+      return;
+    }
+
+    tmp_start_level_inputs.files.push_back(f);
+    if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                    &tmp_start_level_inputs) ||
+        compaction_picker_->FilesRangeOverlapWithCompaction(
+            {tmp_start_level_inputs}, output_level_,
+            Compaction::EvaluatePenultimateLevel(
+                vstorage_, ioptions_, start_level_, output_level_))) {
+      // Constraint 1a
+      tmp_start_level_inputs.clear();
+      return;
+    }
+
+    curr_bytes_to_compact = 0;
+    for (auto start_lvl_f : tmp_start_level_inputs.files) {
+      curr_bytes_to_compact += start_lvl_f->fd.GetFileSize();
+    }
+
+    // Check whether any output level files are locked
+    compaction_picker_->GetRange(tmp_start_level_inputs, &smallest, &largest);
+    vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+                                    &output_level_inputs.files);
+    if
(!output_level_inputs.empty() && + !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &output_level_inputs)) { + // Constraint 1a + tmp_start_level_inputs.clear(); + return; + } + + uint64_t start_lvl_curr_bytes_to_compact = curr_bytes_to_compact; + for (auto output_lvl_f : output_level_inputs.files) { + curr_bytes_to_compact += output_lvl_f->fd.GetFileSize(); + } + if (curr_bytes_to_compact > mutable_cf_options_.max_compaction_bytes) { + // Constraint 2 + tmp_start_level_inputs.clear(); + return; + } + + start_level_inputs_.files = tmp_start_level_inputs.files; + // Constraint 3 + if (start_lvl_curr_bytes_to_compact > start_lvl_max_bytes_to_compact) { + return; + } + } +} + +bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { + // Setup input files from output level. For output to L0, we only compact + // spans of files that do not interact with any pending compactions, so don't + // need to consider other levels. + if (output_level_ != 0) { + output_level_inputs_.level = output_level_; + bool round_robin_expanding = + ioptions_.compaction_pri == kRoundRobin && + compaction_reason_ == CompactionReason::kLevelMaxLevelSize; + if (round_robin_expanding) { + SetupOtherFilesWithRoundRobinExpansion(); + } + if (!is_l0_trivial_move_ && + !compaction_picker_->SetupOtherInputs( + cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_, + &output_level_inputs_, &parent_index_, base_index_, + round_robin_expanding)) { + return false; + } + + compaction_inputs_.push_back(start_level_inputs_); + if (!output_level_inputs_.empty()) { + compaction_inputs_.push_back(output_level_inputs_); + } + + // In some edge cases we could pick a compaction that will be compacting + // a key range that overlap with another running compaction, and both + // of them have the same output level. This could happen if + // (1) we are running a non-exclusive manual compaction + // (2) AddFile ingest a new file into the LSM tree + // We need to disallow this from happening. + if (compaction_picker_->FilesRangeOverlapWithCompaction( + compaction_inputs_, output_level_, + Compaction::EvaluatePenultimateLevel( + vstorage_, ioptions_, start_level_, output_level_))) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. + return false; + } + if (!is_l0_trivial_move_) { + compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_, + output_level_inputs_, &grandparents_); + } + } else { + compaction_inputs_.push_back(start_level_inputs_); + } + return true; +} + +Compaction* LevelCompactionBuilder::PickCompaction() { + // Pick up the first file to start compaction. It may have been extended + // to a clean cut. + SetupInitialFiles(); + if (start_level_inputs_.empty()) { + return nullptr; + } + assert(start_level_ >= 0 && output_level_ >= 0); + + // If it is a L0 -> base level compaction, we need to set up other L0 + // files if needed. + if (!SetupOtherL0FilesIfNeeded()) { + return nullptr; + } + + // Pick files in the output level and expand more files in the start level + // if needed. + if (!SetupOtherInputsIfNeeded()) { + return nullptr; + } + + // Form a compaction object containing the files we picked. + Compaction* c = GetCompaction(); + + TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c); + + return c; +} + +Compaction* LevelCompactionBuilder::GetCompaction() { + // TryPickL0TrivialMove() does not apply to the case when compacting L0 to an + // empty output level. 
So L0 files is picked in PickFileToCompact() by + // compaction score. We may still be able to do trivial move when this file + // does not overlap with other L0s. This happens when + // compaction_inputs_[0].size() == 1 since SetupOtherL0FilesIfNeeded() did not + // pull in more L0s. + assert(!compaction_inputs_.empty()); + bool l0_files_might_overlap = + start_level_ == 0 && !is_l0_trivial_move_ && + (compaction_inputs_.size() > 1 || compaction_inputs_[0].size() > 1); + auto c = new Compaction( + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(compaction_inputs_), output_level_, + MaxFileSizeForLevel(mutable_cf_options_, output_level_, + ioptions_.compaction_style, vstorage_->base_level(), + ioptions_.level_compaction_dynamic_level_bytes), + mutable_cf_options_.max_compaction_bytes, + GetPathId(ioptions_, mutable_cf_options_, output_level_), + GetCompressionType(vstorage_, mutable_cf_options_, output_level_, + vstorage_->base_level()), + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_), + Temperature::kUnknown, + /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, + /* trim_ts */ "", start_level_score_, false /* deletion_compaction */, + l0_files_might_overlap, compaction_reason_); + + // If it's level 0 compaction, make sure we don't execute any other level 0 + // compactions in parallel + compaction_picker_->RegisterCompaction(c); + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + return c; +} + +/* + * Find the optimal path to place a file + * Given a level, finds the path where levels up to it will fit in levels + * up to and including this path + */ +uint32_t LevelCompactionBuilder::GetPathId( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, int level) { + uint32_t p = 0; + assert(!ioptions.cf_paths.empty()); + + // size remaining in the most recent path + uint64_t current_path_size = ioptions.cf_paths[0].target_size; + + uint64_t level_size; + int cur_level = 0; + + // max_bytes_for_level_base denotes L1 size. + // We estimate L0 size to be the same as L1. + level_size = mutable_cf_options.max_bytes_for_level_base; + + // Last path is the fallback + while (p < ioptions.cf_paths.size() - 1) { + if (level_size <= current_path_size) { + if (cur_level == level) { + // Does desired level fit in this path? + return p; + } else { + current_path_size -= level_size; + if (cur_level > 0) { + if (ioptions.level_compaction_dynamic_level_bytes) { + // Currently, level_compaction_dynamic_level_bytes is ignored when + // multiple db paths are specified. https://github.com/facebook/ + // rocksdb/blob/main/db/column_family.cc. 
+            // Still, adding this check to avoid accidentally using
+            // max_bytes_for_level_multiplier_additional
+            level_size = static_cast<uint64_t>(
+                level_size * mutable_cf_options.max_bytes_for_level_multiplier);
+          } else {
+            level_size = static_cast<uint64_t>(
+                level_size * mutable_cf_options.max_bytes_for_level_multiplier *
+                mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
+          }
+        }
+        cur_level++;
+        continue;
+      }
+    }
+    p++;
+    current_path_size = ioptions.cf_paths[p].target_size;
+  }
+  return p;
+}
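Editor's note: `GetPathId()` above walks the configured `cf_paths`, budgeting estimated per-level sizes (L1 = `max_bytes_for_level_base`, growing by the level multiplier) against each path's target size and spilling to the next path when a level no longer fits. Below is a standalone sketch of that walk with invented inputs; it omits the `max_bytes_for_level_multiplier_additional` branch the real code handles.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Returns the index of the path that should hold `level`, assuming a fixed
// level-size multiplier. All inputs here are illustrative, not real options.
uint32_t PickPathId(const std::vector<uint64_t>& path_target_sizes,
                    uint64_t l1_size, double multiplier, int level) {
  uint32_t p = 0;
  uint64_t remaining = path_target_sizes.at(0);
  uint64_t level_size = l1_size;  // L0 is estimated to be the same as L1
  int cur_level = 0;
  while (p + 1 < path_target_sizes.size()) {  // last path is the fallback
    if (level_size <= remaining) {
      if (cur_level == level) return p;  // desired level fits in this path
      remaining -= level_size;
      if (cur_level > 0) {
        level_size = static_cast<uint64_t>(level_size * multiplier);
      }
      ++cur_level;
      continue;
    }
    ++p;  // spill to the next path
    remaining = path_target_sizes[p];
  }
  return p;
}

int main() {
  // Three paths of 1 GiB, 10 GiB, 100 GiB; L1 = 256 MiB, 10x growth.
  std::vector<uint64_t> paths = {1ull << 30, 10ull << 30, 100ull << 30};
  for (int lvl = 0; lvl <= 4; ++lvl) {
    std::cout << "L" << lvl << " -> path "
              << PickPathId(paths, 256ull << 20, 10.0, lvl) << "\n";
  }
}
```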
+
+bool LevelCompactionBuilder::TryPickL0TrivialMove() {
+  if (vstorage_->base_level() <= 0) {
+    return false;
+  }
+  if (start_level_ == 0 && mutable_cf_options_.compression_per_level.empty() &&
+      !vstorage_->LevelFiles(output_level_).empty() &&
+      ioptions_.db_paths.size() <= 1) {
+    // Try to pick a trivial move from L0 to L1. We start from the oldest
+    // file. We keep expanding to newer files if it would form a
+    // trivial move.
+    // For now we don't support it with
+    // mutable_cf_options_.compression_per_level, to avoid the logic of
+    // determining whether L0 can be trivially moved to the next level.
+    // We skip the case where the output level is empty, since in this case,
+    // at least the oldest file would qualify for a trivial move, and this
+    // would be a surprising behavior with few benefits.
+    //
+    // We search from the oldest file to the newest. In theory, there are
+    // files in the middle that can form trivial moves too, but it is probably
+    // uncommon and we ignore these cases for simplicity.
+    const std::vector<FileMetaData*>& level_files =
+        vstorage_->LevelFiles(start_level_);
+
+    InternalKey my_smallest, my_largest;
+    for (auto it = level_files.rbegin(); it != level_files.rend(); ++it) {
+      CompactionInputFiles output_level_inputs;
+      output_level_inputs.level = output_level_;
+      FileMetaData* file = *it;
+      if (it == level_files.rbegin()) {
+        my_smallest = file->smallest;
+        my_largest = file->largest;
+      } else {
+        if (compaction_picker_->icmp()->Compare(file->largest, my_smallest) <
+            0) {
+          my_smallest = file->smallest;
+        } else if (compaction_picker_->icmp()->Compare(file->smallest,
+                                                       my_largest) > 0) {
+          my_largest = file->largest;
+        } else {
+          break;
+        }
+      }
+      vstorage_->GetOverlappingInputs(output_level_, &my_smallest, &my_largest,
+                                      &output_level_inputs.files);
+      if (output_level_inputs.empty()) {
+        assert(!file->being_compacted);
+        start_level_inputs_.files.push_back(file);
+      } else {
+        break;
+      }
+    }
+  }
+
+  if (!start_level_inputs_.empty()) {
+    // Sort files by key range. Not sure it's 100% necessary, but it's cleaner
+    // to always keep files sorted by key when the key ranges don't overlap.
+    std::sort(start_level_inputs_.files.begin(),
+              start_level_inputs_.files.end(),
+              [icmp = compaction_picker_->icmp()](FileMetaData* f1,
+                                                  FileMetaData* f2) -> bool {
+                return (icmp->Compare(f1->smallest, f2->smallest) < 0);
+              });
+
+    is_l0_trivial_move_ = true;
+    return true;
+  }
+  return false;
+}
+
+bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index,
+                                                       bool only_expand_right) {
+  if (start_level_inputs_.size() == 1 &&
+      (ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
+      (mutable_cf_options_.compression_per_level.empty())) {
+    // Only the file at `start_index` is picked, and it is likely a trivial
+    // move. Try to expand if it is still a trivial move, but not beyond
+    // max_compaction_bytes or 4 files, so that we don't create too
+    // much compaction pressure for the next level.
+    // Ignore if there is more than one DB path, as it would be hard
+    // to predict whether it is a trivial move.
+    const std::vector<FileMetaData*>& level_files =
+        vstorage_->LevelFiles(start_level_);
+    const size_t kMaxMultiTrivialMove = 4;
+    FileMetaData* initial_file = start_level_inputs_.files[0];
+    size_t total_size = initial_file->fd.GetFileSize();
+    CompactionInputFiles output_level_inputs;
+    output_level_inputs.level = output_level_;
+    // Expand towards the right
+    for (int i = start_index + 1;
+         i < static_cast<int>(level_files.size()) &&
+         start_level_inputs_.size() < kMaxMultiTrivialMove;
+         i++) {
+      FileMetaData* next_file = level_files[i];
+      if (next_file->being_compacted) {
+        break;
+      }
+      vstorage_->GetOverlappingInputs(output_level_, &(initial_file->smallest),
+                                      &(next_file->largest),
+                                      &output_level_inputs.files);
+      if (!output_level_inputs.empty()) {
+        break;
+      }
+      if (i < static_cast<int>(level_files.size()) - 1 &&
+          compaction_picker_->icmp()
+                  ->user_comparator()
+                  ->CompareWithoutTimestamp(
+                      next_file->largest.user_key(),
+                      level_files[i + 1]->smallest.user_key()) == 0) {
+        TEST_SYNC_POINT_CALLBACK(
+            "LevelCompactionBuilder::TryExtendNonL0TrivialMove:NoCleanCut",
+            nullptr);
+        // Not a clean cut after adding the next file. Skip.
+        break;
+      }
+      total_size += next_file->fd.GetFileSize();
+      if (total_size > mutable_cf_options_.max_compaction_bytes) {
+        break;
+      }
+      start_level_inputs_.files.push_back(next_file);
+    }
+    // Expand towards the left
+    if (!only_expand_right) {
+      for (int i = start_index - 1;
+           i >= 0 && start_level_inputs_.size() < kMaxMultiTrivialMove; i--) {
+        FileMetaData* next_file = level_files[i];
+        if (next_file->being_compacted) {
+          break;
+        }
+        vstorage_->GetOverlappingInputs(output_level_, &(next_file->smallest),
+                                        &(initial_file->largest),
+                                        &output_level_inputs.files);
+        if (!output_level_inputs.empty()) {
+          break;
+        }
+        if (i > 0 && compaction_picker_->icmp()
+                             ->user_comparator()
+                             ->CompareWithoutTimestamp(
+                                 next_file->smallest.user_key(),
+                                 level_files[i - 1]->largest.user_key()) == 0) {
+          // Not a clean cut after adding the next file. Skip.
+          break;
+        }
+        total_size += next_file->fd.GetFileSize();
+        if (total_size > mutable_cf_options_.max_compaction_bytes) {
+          break;
+        }
+        // keep `files` sorted in increasing order by key range
+        start_level_inputs_.files.insert(start_level_inputs_.files.begin(),
+                                         next_file);
+      }
+    }
+    return start_level_inputs_.size() > 1;
+  }
+  return false;
+}
+
+bool LevelCompactionBuilder::PickFileToCompact() {
+  // Level 0 files are overlapping, so we cannot pick more
+  // than one concurrent compaction at this level. This
+  // could be made better by looking at key ranges that are
+  // being compacted at level 0.
+  if (start_level_ == 0 &&
+      !compaction_picker_->level0_compactions_in_progress()->empty()) {
+    TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
+    return false;
+  }
+
+  start_level_inputs_.clear();
+  start_level_inputs_.level = start_level_;
+
+  assert(start_level_ >= 0);
+
+  if (TryPickL0TrivialMove()) {
+    return true;
+  }
+
+  const std::vector<FileMetaData*>& level_files =
+      vstorage_->LevelFiles(start_level_);
+
+  // Pick the file with the highest score in this level that is not already
+  // being compacted.
+  const std::vector<int>& file_scores =
+      vstorage_->FilesByCompactionPri(start_level_);
+
+  unsigned int cmp_idx;
+  for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
+       cmp_idx < file_scores.size(); cmp_idx++) {
+    int index = file_scores[cmp_idx];
+    auto* f = level_files[index];
+
+    // do not pick a file to compact if it is being compacted
+    // from n-1 level.
+    if (f->being_compacted) {
+      if (ioptions_.compaction_pri == kRoundRobin) {
+        // TODO(zichen): this file may be involved in one compaction from
+        // an upper level, cannot advance the cursor for round-robin policy.
+        // Currently, we do not pick any file to compact in this case. We
+        // should fix this later to ensure a compaction is picked but the
+        // cursor shall not be advanced.
+        return false;
+      }
+      continue;
+    }
+
+    start_level_inputs_.files.push_back(f);
+    if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                    &start_level_inputs_) ||
+        compaction_picker_->FilesRangeOverlapWithCompaction(
+            {start_level_inputs_}, output_level_,
+            Compaction::EvaluatePenultimateLevel(
+                vstorage_, ioptions_, start_level_, output_level_))) {
+      // A locked (pending compaction) input-level file was pulled in due to
+      // user-key overlap.
+      start_level_inputs_.clear();
+
+      if (ioptions_.compaction_pri == kRoundRobin) {
+        return false;
+      }
+      continue;
+    }
+
+    // Now that input level is fully expanded, we check whether any output
+    // files are locked due to pending compaction.
+    //
+    // Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
+    // level files are locked, not just the extra ones pulled in for user-key
+    // overlap.
+    InternalKey smallest, largest;
+    compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+    CompactionInputFiles output_level_inputs;
+    output_level_inputs.level = output_level_;
+    vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+                                    &output_level_inputs.files);
+    if (output_level_inputs.empty()) {
+      if (start_level_ > 0 &&
+          TryExtendNonL0TrivialMove(index,
+                                    ioptions_.compaction_pri ==
+                                        kRoundRobin /* only_expand_right */)) {
+        break;
+      }
+    } else {
+      if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                      &output_level_inputs)) {
+        start_level_inputs_.clear();
+        if (ioptions_.compaction_pri == kRoundRobin) {
+          return false;
+        }
+        continue;
+      }
+    }
+
+    base_index_ = index;
+    break;
+  }
+
+  // store where to start the iteration in the next call to PickCompaction
+  if (ioptions_.compaction_pri != kRoundRobin) {
+    vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
+  }
+  return start_level_inputs_.size() > 0;
+}
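Editor's note: `PickFileToCompact()` above follows a cursor protocol: files already being compacted are skipped, and for non-round-robin priorities the scan position is persisted via `SetNextCompactionIndex()` so the next pick resumes where this one stopped. A minimal standalone illustration of that skip-and-resume pattern (simplified types, not the real API):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

struct File {
  bool being_compacted;
};

// Returns the index of the first pickable file at or after `cursor`, or -1.
// `cursor` is updated the way SetNextCompactionIndex() persists it.
int PickFrom(const std::vector<File>& files_by_priority, std::size_t& cursor) {
  for (; cursor < files_by_priority.size(); ++cursor) {
    if (!files_by_priority[cursor].being_compacted) {
      return static_cast<int>(cursor);
    }
  }
  return -1;  // nothing pickable this round
}

int main() {
  std::vector<File> files = {{true}, {true}, {false}, {false}};
  std::size_t cursor = 0;
  std::cout << "picked " << PickFrom(files, cursor) << ", resume at " << cursor
            << "\n";  // picked 2, resume at 2
}
```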
+
+bool LevelCompactionBuilder::PickIntraL0Compaction() {
+  start_level_inputs_.clear();
+  const std::vector<FileMetaData*>& level_files =
+      vstorage_->LevelFiles(0 /* level */);
+  if (level_files.size() <
+          static_cast<size_t>(
+              mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
+      level_files[0]->being_compacted) {
+    // If L0 isn't accumulating many files beyond the regular trigger, don't
+    // resort to L0->L0 compaction yet.
+    return false;
+  }
+  return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
+                               std::numeric_limits<uint64_t>::max(),
+                               mutable_cf_options_.max_compaction_bytes,
+                               &start_level_inputs_);
+}
+}  // namespace
+
+Compaction* LevelCompactionPicker::PickCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer) {
+  LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer,
+                                 mutable_cf_options, ioptions_,
+                                 mutable_db_options);
+  return builder.PickCompaction();
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_level.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_level.h
new file mode 100644
index 0000000000..6eb0f586f4
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_level.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Picking compactions for leveled compaction. See wiki page
+// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
+// for description of Leveled compaction.
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+  LevelCompactionPicker(const ImmutableOptions& ioptions,
+                        const InternalKeyComparator* icmp)
+      : CompactionPicker(ioptions, icmp) {}
+  virtual Compaction* PickCompaction(const std::string& cf_name,
+                                     const MutableCFOptions& mutable_cf_options,
+                                     const MutableDBOptions& mutable_db_options,
+                                     VersionStorageInfo* vstorage,
+                                     LogBuffer* log_buffer) override;
+
+  virtual bool NeedsCompaction(
+      const VersionStorageInfo* vstorage) const override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
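Editor's note: the universal picker vendored next reasons about "sorted runs" rather than individual levels: every L0 file is its own run, and every non-empty level L1+ counts as a single run. A rough standalone analogue of the `CalculateSortedRuns()` helper defined later in that file, using simplified stand-in types rather than the real RocksDB structures:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

struct Run {
  int level;
  uint64_t size;
};

// Each L0 file is one sorted run; each non-empty higher level is one run.
std::vector<Run> SortedRuns(const std::vector<uint64_t>& l0_file_sizes,
                            const std::vector<uint64_t>& level_sizes /* L1.. */) {
  std::vector<Run> runs;
  for (uint64_t s : l0_file_sizes) {
    runs.push_back({0, s});
  }
  for (std::size_t i = 0; i < level_sizes.size(); ++i) {
    if (level_sizes[i] > 0) {
      runs.push_back({static_cast<int>(i) + 1, level_sizes[i]});
    }
  }
  return runs;
}

int main() {
  // Three L0 files plus non-empty L2 and L3 -> five sorted runs.
  auto runs = SortedRuns({4, 4, 8}, {0, 64, 512});
  for (auto& r : runs) {
    std::cout << "L" << r.level << " run, size " << r.size << "\n";
  }
}
```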
+ +#include "db/compaction/compaction_picker_universal.h" + +#include +#include +#include +#include +#include + +#include "db/column_family.h" +#include "file/filename.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "monitoring/statistics_impl.h" +#include "test_util/sync_point.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +// A helper class that form universal compactions. The class is used by +// UniversalCompactionPicker::PickCompaction(). +// The usage is to create the class, and get the compaction object by calling +// PickCompaction(). +class UniversalCompactionBuilder { + public: + UniversalCompactionBuilder( + const ImmutableOptions& ioptions, const InternalKeyComparator* icmp, + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + UniversalCompactionPicker* picker, LogBuffer* log_buffer) + : ioptions_(ioptions), + icmp_(icmp), + cf_name_(cf_name), + mutable_cf_options_(mutable_cf_options), + mutable_db_options_(mutable_db_options), + vstorage_(vstorage), + picker_(picker), + log_buffer_(log_buffer) {} + + // Form and return the compaction object. The caller owns return object. + Compaction* PickCompaction(); + + private: + struct SortedRun { + SortedRun(int _level, FileMetaData* _file, uint64_t _size, + uint64_t _compensated_file_size, bool _being_compacted) + : level(_level), + file(_file), + size(_size), + compensated_file_size(_compensated_file_size), + being_compacted(_being_compacted) { + assert(compensated_file_size > 0); + assert(level != 0 || file != nullptr); + } + + void Dump(char* out_buf, size_t out_buf_size, + bool print_path = false) const; + + // sorted_run_count is added into the string to print + void DumpSizeInfo(char* out_buf, size_t out_buf_size, + size_t sorted_run_count) const; + + int level; + // `file` Will be null for level > 0. For level = 0, the sorted run is + // for this file. + FileMetaData* file; + // For level > 0, `size` and `compensated_file_size` are sum of sizes all + // files in the level. `being_compacted` should be the same for all files + // in a non-zero level. Use the value here. + uint64_t size; + uint64_t compensated_file_size; + bool being_compacted; + }; + + // Pick Universal compaction to limit read amplification + Compaction* PickCompactionToReduceSortedRuns( + unsigned int ratio, unsigned int max_number_of_files_to_compact); + + // Pick Universal compaction to limit space amplification. + Compaction* PickCompactionToReduceSizeAmp(); + + // Try to pick incremental compaction to reduce space amplification. + // It will return null if it cannot find a fanout within the threshold. + // Fanout is defined as + // total size of files to compact at output level + // -------------------------------------------------- + // total size of files to compact at other levels + Compaction* PickIncrementalForReduceSizeAmp(double fanout_threshold); + + Compaction* PickDeleteTriggeredCompaction(); + + // Form a compaction from the sorted run indicated by start_index to the + // oldest sorted run. + // The caller is responsible for making sure that those files are not in + // compaction. + Compaction* PickCompactionToOldest(size_t start_index, + CompactionReason compaction_reason); + + Compaction* PickCompactionWithSortedRunRange( + size_t start_index, size_t end_index, CompactionReason compaction_reason); + + // Try to pick periodic compaction. 
+
+// Used in universal compaction when trivial move is enabled.
+// This structure is used for the construction of a min heap
+// that contains the file metadata, the level of the file,
+// and the index of the file in that level.
+struct InputFileInfo {
+  InputFileInfo() : f(nullptr), level(0), index(0) {}
+
+  FileMetaData* f;
+  size_t level;
+  size_t index;
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This comparator is used for the construction of a min heap
+// based on the smallest key of the file.
+struct SmallestKeyHeapComparator {
+  explicit SmallestKeyHeapComparator(const Comparator* ucmp) { ucmp_ = ucmp; }
+
+  bool operator()(InputFileInfo i1, InputFileInfo i2) const {
+    return (ucmp_->CompareWithoutTimestamp(i1.f->smallest.user_key(),
+                                           i2.f->smallest.user_key()) > 0);
+  }
+
+ private:
+  const Comparator* ucmp_;
+};
+
+using SmallestKeyHeap =
+    std::priority_queue<InputFileInfo, std::vector<InputFileInfo>,
+                        SmallestKeyHeapComparator>;
+
+// This function creates the heap that is used to find if the files are
+// overlapping during universal compaction when allow_trivial_move
+// is set.
+SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
+  SmallestKeyHeap smallest_key_priority_q =
+      SmallestKeyHeap(SmallestKeyHeapComparator(ucmp));
+
+  InputFileInfo input_file;
+
+  for (size_t l = 0; l < c->num_input_levels(); l++) {
+    if (c->num_input_files(l) != 0) {
+      if (l == 0 && c->start_level() == 0) {
+        for (size_t i = 0; i < c->num_input_files(0); i++) {
+          input_file.f = c->input(0, i);
+          input_file.level = 0;
+          input_file.index = i;
+          smallest_key_priority_q.push(std::move(input_file));
+        }
+      } else {
+        input_file.f = c->input(l, 0);
+        input_file.level = l;
+        input_file.index = 0;
+        smallest_key_priority_q.push(std::move(input_file));
+      }
+    }
+  }
+  return smallest_key_priority_q;
+}
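Editor's note: `create_level_heap()` above feeds `IsInputFilesNonOverlapping()` below, which pops files in ascending smallest-key order and declares overlap as soon as a file starts at or before the previous file's largest key. The same invariant can be checked with a plain sort instead of a heap; a simplified sketch with integer keys in place of InternalKeys:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

struct F {
  int smallest, largest;
};

// True iff no two key ranges overlap, checked in smallest-key order.
bool NonOverlapping(std::vector<F> files) {
  std::sort(files.begin(), files.end(),
            [](const F& a, const F& b) { return a.smallest < b.smallest; });
  for (std::size_t i = 1; i < files.size(); ++i) {
    if (files[i].smallest <= files[i - 1].largest) {
      return false;  // this file starts inside the previous one's range
    }
  }
  return true;
}

int main() {
  std::cout << NonOverlapping({{0, 5}, {6, 9}}) << " "   // 1
            << NonOverlapping({{0, 5}, {4, 9}}) << "\n"; // 0
}
```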
+#ifndef NDEBUG
+// smallest_seqno and largest_seqno are set iff `files` is not empty.
+void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files,
+                             SequenceNumber* smallest_seqno,
+                             SequenceNumber* largest_seqno) {
+  bool is_first = true;
+  for (FileMetaData* f : files) {
+    assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+    if (is_first) {
+      is_first = false;
+      *smallest_seqno = f->fd.smallest_seqno;
+      *largest_seqno = f->fd.largest_seqno;
+    } else {
+      if (f->fd.smallest_seqno < *smallest_seqno) {
+        *smallest_seqno = f->fd.smallest_seqno;
+      }
+      if (f->fd.largest_seqno > *largest_seqno) {
+        *largest_seqno = f->fd.largest_seqno;
+      }
+    }
+  }
+}
+#endif
+}  // namespace
+
+// Algorithm that checks to see if there are any overlapping
+// files in the input.
+bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
+  auto comparator = icmp_->user_comparator();
+  int first_iter = 1;
+
+  InputFileInfo prev, curr, next;
+
+  SmallestKeyHeap smallest_key_priority_q =
+      create_level_heap(c, icmp_->user_comparator());
+
+  while (!smallest_key_priority_q.empty()) {
+    curr = smallest_key_priority_q.top();
+    smallest_key_priority_q.pop();
+
+    if (first_iter) {
+      prev = curr;
+      first_iter = 0;
+    } else {
+      if (comparator->CompareWithoutTimestamp(
+              prev.f->largest.user_key(), curr.f->smallest.user_key()) >= 0) {
+        // found overlapping files, return false
+        return false;
+      }
+      assert(comparator->CompareWithoutTimestamp(
+                 curr.f->largest.user_key(), prev.f->largest.user_key()) > 0);
+      prev = curr;
+    }
+
+    next.f = nullptr;
+
+    if (c->level(curr.level) != 0 &&
+        curr.index < c->num_input_files(curr.level) - 1) {
+      next.f = c->input(curr.level, curr.index + 1);
+      next.level = curr.level;
+      next.index = curr.index + 1;
+    }
+
+    if (next.f) {
+      smallest_key_priority_q.push(std::move(next));
+    }
+  }
+  return true;
+}
+
+bool UniversalCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  const int kLevel0 = 0;
+  if (vstorage->CompactionScore(kLevel0) >= 1) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForCompaction().empty()) {
+    return true;
+  }
+  return false;
+}
+
+Compaction* UniversalCompactionPicker::PickCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer) {
+  UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
+                                     mutable_cf_options, mutable_db_options,
+                                     vstorage, this, log_buffer);
+  return builder.PickCompaction();
+}
+
+void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf,
+                                                 size_t out_buf_size,
+                                                 bool print_path) const {
+  if (level == 0) {
+    assert(file != nullptr);
+    if (file->fd.GetPathId() == 0 || !print_path) {
+      snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber());
+    } else {
+      snprintf(out_buf, out_buf_size,
+               "file %" PRIu64
+               "(path "
+               "%" PRIu32 ")",
+               file->fd.GetNumber(), file->fd.GetPathId());
+    }
+  } else {
+    snprintf(out_buf, out_buf_size, "level %d", level);
+  }
+}
+
+void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
+    char* out_buf, size_t out_buf_size, size_t sorted_run_count) const {
+  if (level == 0) {
+    assert(file != nullptr);
+    snprintf(out_buf, out_buf_size,
+             "file %" PRIu64 "[%" ROCKSDB_PRIszt
+             "] "
+             "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+             file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(),
+             file->compensated_file_size);
+  } else {
+    snprintf(out_buf, out_buf_size,
+             "level %d[%" ROCKSDB_PRIszt
+             "] "
+             "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+             level, sorted_run_count, size, compensated_file_size);
+  }
+}
+
+std::vector<UniversalCompactionBuilder::SortedRun>
+UniversalCompactionBuilder::CalculateSortedRuns(
+    const VersionStorageInfo& vstorage) {
+  std::vector<SortedRun> ret;
+  for (FileMetaData* f : vstorage.LevelFiles(0)) {
+    ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
+                     f->being_compacted);
+  }
+  for (int level = 1; level < vstorage.num_levels(); level++) {
+    uint64_t total_compensated_size = 0U;
+    uint64_t total_size = 0U;
+    bool being_compacted = false;
+    for (FileMetaData* f : vstorage.LevelFiles(level)) {
+      total_compensated_size += f->compensated_file_size;
+      total_size += f->fd.GetFileSize();
+      // Size amp, read amp and periodic compactions always include all files
+      // for a non-zero level. However, a delete-triggered compaction and
+      // a trivial move might pick a subset of files in a sorted run. So
+      // always check all files in a sorted run and mark the entire run as
+      // being compacted if one or more files are being compacted.
+      if (f->being_compacted) {
+        being_compacted = f->being_compacted;
+      }
+    }
+    if (total_compensated_size > 0) {
+      ret.emplace_back(level, nullptr, total_size, total_compensated_size,
+                       being_compacted);
+    }
+  }
+  return ret;
+}
+
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact.
+Compaction* UniversalCompactionBuilder::PickCompaction() {
+  const int kLevel0 = 0;
+  score_ = vstorage_->CompactionScore(kLevel0);
+  sorted_runs_ = CalculateSortedRuns(*vstorage_);
+
+  if (sorted_runs_.size() == 0 ||
+      (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
+       vstorage_->FilesMarkedForCompaction().empty() &&
+       sorted_runs_.size() < (unsigned int)mutable_cf_options_
+                                 .level0_file_num_compaction_trigger)) {
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n",
+                     cf_name_.c_str());
+    TEST_SYNC_POINT_CALLBACK(
+        "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+    return nullptr;
+  }
+  VersionStorageInfo::LevelSummaryStorage tmp;
+  ROCKS_LOG_BUFFER_MAX_SZ(
+      log_buffer_, 3072,
+      "[%s] Universal: sorted runs: %" ROCKSDB_PRIszt " files: %s\n",
+      cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp));
+
+  Compaction* c = nullptr;
+  // Periodic compaction has higher priority than other types of compaction
+  // because it's a hard requirement.
+  if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+    // Always need to do a full compaction for periodic compaction.
+    c = PickPeriodicCompaction();
+    TEST_SYNC_POINT_CALLBACK("PostPickPeriodicCompaction", c);
+  }
+
+  // Check for size amplification.
+  if (c == nullptr &&
+      sorted_runs_.size() >=
+          static_cast<size_t>(
+              mutable_cf_options_.level0_file_num_compaction_trigger)) {
+    if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
+      TEST_SYNC_POINT("PickCompactionToReduceSizeAmpReturnNonnullptr");
+      ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n",
+                       cf_name_.c_str());
+    } else {
+      // Size amplification is within limits. Try reducing read
+      // amplification while maintaining file size ratios.
+      unsigned int ratio =
+          mutable_cf_options_.compaction_options_universal.size_ratio;
+
+      if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
+        TEST_SYNC_POINT("PickCompactionToReduceSortedRunsReturnNonnullptr");
+        ROCKS_LOG_BUFFER(log_buffer_,
+                         "[%s] Universal: compacting for size ratio\n",
+                         cf_name_.c_str());
+      } else {
+        // Size amplification and file size ratios are within configured
+        // limits.
+        // If max read amplification exceeds configured limits, then force
+        // compaction without looking at file size ratios and try to reduce
+        // the number of files to fewer than
+        // level0_file_num_compaction_trigger.
+        // This is guaranteed by NeedsCompaction().
+        assert(sorted_runs_.size() >=
+               static_cast<size_t>(
+                   mutable_cf_options_.level0_file_num_compaction_trigger));
+        // Get the total number of sorted runs that are not being compacted
+        int num_sr_not_compacted = 0;
+        for (size_t i = 0; i < sorted_runs_.size(); i++) {
+          if (sorted_runs_[i].being_compacted == false) {
+            num_sr_not_compacted++;
+          }
+        }
+
+        // The number of sorted runs that are not being compacted is greater
+        // than the maximum allowed number of sorted runs
+        if (num_sr_not_compacted >
+            mutable_cf_options_.level0_file_num_compaction_trigger) {
+          unsigned int num_files =
+              num_sr_not_compacted -
+              mutable_cf_options_.level0_file_num_compaction_trigger + 1;
+          if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) !=
+              nullptr) {
+            ROCKS_LOG_BUFFER(log_buffer_,
+                             "[%s] Universal: compacting for file num -- %u\n",
+                             cf_name_.c_str(), num_files);
+          }
+        }
+      }
+    }
+  }
+
+  if (c == nullptr) {
+    if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
+      TEST_SYNC_POINT("PickDeleteTriggeredCompactionReturnNonnullptr");
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: delete triggered compaction\n",
+                       cf_name_.c_str());
+    }
+  }
+
+  if (c == nullptr) {
+    TEST_SYNC_POINT_CALLBACK(
+        "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+    return nullptr;
+  }
+
+  if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
+          true &&
+      c->compaction_reason() != CompactionReason::kPeriodicCompaction) {
+    c->set_is_trivial_move(IsInputFilesNonOverlapping(c));
+  }
+
+// validate that all the chosen files of L0 are non-overlapping in time
+#ifndef NDEBUG
+  bool is_first = true;
+
+  size_t level_index = 0U;
+  if (c->start_level() == 0) {
+    for (auto f : *c->inputs(0)) {
+      assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+      if (is_first) {
+        is_first = false;
+      }
+    }
+    level_index = 1U;
+  }
+  for (; level_index < c->num_input_levels(); level_index++) {
+    if (c->num_input_files(level_index) != 0) {
+      SequenceNumber smallest_seqno = 0U;
+      SequenceNumber largest_seqno = 0U;
+      GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno,
+                              &largest_seqno);
+      if (is_first) {
+        is_first = false;
+      }
+    }
+  }
+#endif
+  // update statistics
+  size_t num_files = 0;
+  for (auto& each_level : *c->inputs()) {
+    num_files += each_level.files.size();
+  }
+  RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION, num_files);
+
+  picker_->RegisterCompaction(c);
+  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+
+  TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return",
+                           c);
+  return c;
+}
+
+uint32_t UniversalCompactionBuilder::GetPathId(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, uint64_t file_size) {
+  // Two conditions need to be satisfied:
+  // (1) the target path needs to be able to hold the file's size
+  // (2) Total size left in this and previous paths need to be not
+  //     smaller than expected future file size before this new file is
+  //     compacted, which is estimated based on size_ratio.
+ // For example, if now we are compacting files of size (1, 1, 2, 4, 8), + // we will make sure the target file, probably with size of 16, will be + // placed in a path so that eventually when new files are generated and + // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or + // before the path we chose. + // + // TODO(sdong): now the case of multiple column families is not + // considered in this algorithm. So the target size can be violated in + // that case. We need to improve it. + uint64_t accumulated_size = 0; + uint64_t future_size = + file_size * + (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100; + uint32_t p = 0; + assert(!ioptions.cf_paths.empty()); + for (; p < ioptions.cf_paths.size() - 1; p++) { + uint64_t target_size = ioptions.cf_paths[p].target_size; + if (target_size > file_size && + accumulated_size + (target_size - file_size) > future_size) { + return p; + } + accumulated_size += target_size; + } + return p; +} + +// +// Consider compaction files based on their size differences with +// the next file in time order. +// +Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( + unsigned int ratio, unsigned int max_number_of_files_to_compact) { + unsigned int min_merge_width = + mutable_cf_options_.compaction_options_universal.min_merge_width; + unsigned int max_merge_width = + mutable_cf_options_.compaction_options_universal.max_merge_width; + + const SortedRun* sr = nullptr; + bool done = false; + size_t start_index = 0; + unsigned int candidate_count = 0; + + unsigned int max_files_to_compact = + std::min(max_merge_width, max_number_of_files_to_compact); + min_merge_width = std::max(min_merge_width, 2U); + + // Caller checks the size before executing this function. This invariant is + // important because otherwise we may have a possible integer underflow when + // dealing with unsigned types. + assert(sorted_runs_.size() > 0); + + // Considers a candidate file only if it is smaller than the + // total size accumulated so far. + for (size_t loop = 0; loop < sorted_runs_.size(); loop++) { + candidate_count = 0; + + // Skip files that are already being compacted + for (sr = nullptr; loop < sorted_runs_.size(); loop++) { + sr = &sorted_runs_[loop]; + + if (!sr->being_compacted) { + candidate_count = 1; + break; + } + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf)); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: %s" + "[%d] being compacted, skipping", + cf_name_.c_str(), file_num_buf, loop); + + sr = nullptr; + } + + // This file is not being compacted. Consider it as the + // first candidate to be compacted. + uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0; + if (sr != nullptr) { + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf), true); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: Possible candidate %s[%d].", + cf_name_.c_str(), file_num_buf, loop); + } + + // Check if the succeeding files need compaction. + for (size_t i = loop + 1; + candidate_count < max_files_to_compact && i < sorted_runs_.size(); + i++) { + const SortedRun* succeeding_sr = &sorted_runs_[i]; + if (succeeding_sr->being_compacted) { + break; + } + // Pick files if the total/last candidate file size (increased by the + // specified ratio) is still larger than the next candidate file. 
+      // candidate_size is the total size of files picked so far with the
+      // default kCompactionStopStyleTotalSize; with
+      // kCompactionStopStyleSimilarSize, it's simply the size of the last
+      // picked file.
+      double sz = candidate_size * (100.0 + ratio) / 100.0;
+      if (sz < static_cast<double>(succeeding_sr->size)) {
+        break;
+      }
+      if (mutable_cf_options_.compaction_options_universal.stop_style ==
+          kCompactionStopStyleSimilarSize) {
+        // Similar-size stopping rule: also check the last picked file isn't
+        // far larger than the next candidate file.
+        sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0;
+        if (sz < static_cast<double>(candidate_size)) {
+          // If the small file we've encountered begins a run of similar-size
+          // files, we'll pick them up on a future iteration of the outer
+          // loop. If it's some lonely straggler, it'll eventually get picked
+          // by the last-resort read amp strategy which disregards size ratios.
+          break;
+        }
+        candidate_size = succeeding_sr->compensated_file_size;
+      } else {  // default kCompactionStopStyleTotalSize
+        candidate_size += succeeding_sr->compensated_file_size;
+      }
+      candidate_count++;
+    }
+
+    // Found a series of consecutive files that need compaction.
+    if (candidate_count >= (unsigned int)min_merge_width) {
+      start_index = loop;
+      done = true;
+      break;
+    } else {
+      for (size_t i = loop;
+           i < loop + candidate_count && i < sorted_runs_.size(); i++) {
+        const SortedRun* skipping_sr = &sorted_runs_[i];
+        char file_num_buf[256];
+        skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+        ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s",
+                         cf_name_.c_str(), file_num_buf);
+      }
+    }
+  }
+  if (!done || candidate_count <= 1) {
+    return nullptr;
+  }
+  size_t first_index_after = start_index + candidate_count;
+  // Compression is enabled if files compacted earlier already reached
+  // the size ratio of compression.
+  bool enable_compression = true;
+  int ratio_to_compress =
+      mutable_cf_options_.compaction_options_universal
+          .compression_size_percent;
+  if (ratio_to_compress >= 0) {
+    uint64_t total_size = 0;
+    for (auto& sorted_run : sorted_runs_) {
+      total_size += sorted_run.compensated_file_size;
+    }
+
+    uint64_t older_file_size = 0;
+    for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) {
+      older_file_size += sorted_runs_[i].size;
+      if (older_file_size * 100L >= total_size * (long)ratio_to_compress) {
+        enable_compression = false;
+        break;
+      }
+    }
+  }
+
+  uint64_t estimated_total_size = 0;
+  for (unsigned int i = 0; i < first_index_after; i++) {
+    estimated_total_size += sorted_runs_[i].size;
+  }
+  uint32_t path_id =
+      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+  int start_level = sorted_runs_[start_index].level;
+  int output_level;
+  if (first_index_after == sorted_runs_.size()) {
+    output_level = vstorage_->num_levels() - 1;
+  } else if (sorted_runs_[first_index_after].level == 0) {
+    output_level = 0;
+  } else {
+    output_level = sorted_runs_[first_index_after].level - 1;
+  }
+
+  // last level is reserved for files ingested behind
+  if (ioptions_.allow_ingest_behind &&
+      (output_level == vstorage_->num_levels() - 1)) {
+    assert(output_level > 1);
+    output_level--;
+  }
+
+  std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs[i].level = start_level + static_cast<int>(i);
+  }
+  for (size_t i = start_index; i < first_index_after; i++) {
+    auto& picking_sr = sorted_runs_[i];
+    if (picking_sr.level == 0) {
+      FileMetaData* picking_file = picking_sr.file;
+      inputs[0].files.push_back(picking_file);
+    } else {
+      auto& files = inputs[picking_sr.level - start_level].files;
+      for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+        files.push_back(f);
+      }
+    }
+    char file_num_buf[256];
+    picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i);
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s",
+                     cf_name_.c_str(), file_num_buf);
+  }
+
+  std::vector<FileMetaData*> grandparents;
+  // Include grandparents for potential file cutting in incremental
+  // mode. It is for aligning file cutting boundaries across levels,
+  // so that subsequent compactions can pick files with aligned
+  // buffer.
+  // Single files are only picked up in incremental mode, so that
+  // there is no need for full range.
+ if (mutable_cf_options_.compaction_options_universal.incremental && + first_index_after < sorted_runs_.size() && + sorted_runs_[first_index_after].level > 1) { + grandparents = vstorage_->LevelFiles(sorted_runs_[first_index_after].level); + } + + if (output_level != 0 && + picker_->FilesRangeOverlapWithCompaction( + inputs, output_level, + Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_, + start_level, output_level))) { + return nullptr; + } + CompactionReason compaction_reason; + if (max_number_of_files_to_compact == UINT_MAX) { + compaction_reason = CompactionReason::kUniversalSizeRatio; + } else { + compaction_reason = CompactionReason::kUniversalSortedRunNum; + } + return new Compaction(vstorage_, ioptions_, mutable_cf_options_, + mutable_db_options_, std::move(inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + GetMaxOverlappingBytes(), path_id, + GetCompressionType(vstorage_, mutable_cf_options_, + output_level, 1, enable_compression), + GetCompressionOptions(mutable_cf_options_, vstorage_, + output_level, enable_compression), + Temperature::kUnknown, + /* max_subcompactions */ 0, grandparents, + /* is manual */ false, /* trim_ts */ "", score_, + false /* deletion_compaction */, + /* l0_files_might_overlap */ true, compaction_reason); +} + +// Look at overall size amplification. If size amplification +// exceeds the configured value, then do a compaction +// of the candidate files all the way upto the earliest +// base file (overrides configured values of file-size ratios, +// min_merge_width and max_merge_width). +// +Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { + // percentage flexibility while reducing size amplification + uint64_t ratio = mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent; + + unsigned int candidate_count = 0; + uint64_t candidate_size = 0; + size_t start_index = 0; + const SortedRun* sr = nullptr; + + assert(!sorted_runs_.empty()); + if (sorted_runs_.back().being_compacted) { + return nullptr; + } + + // Skip files that are already being compacted + for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) { + sr = &sorted_runs_[loop]; + if (!sr->being_compacted) { + start_index = loop; // Consider this as the first candidate. 
+      break;
+    }
+    char file_num_buf[kFormatFileNumberBufSize];
+    sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+    ROCKS_LOG_BUFFER(log_buffer_,
+                     "[%s] Universal: skipping %s[%d] compacted %s",
+                     cf_name_.c_str(), file_num_buf, loop,
+                     " cannot be a candidate to reduce size amp.\n");
+    sr = nullptr;
+  }
+
+  if (sr == nullptr) {
+    return nullptr;  // no candidate files
+  }
+  {
+    char file_num_buf[kFormatFileNumberBufSize];
+    sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s",
+        cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n");
+  }
+
+  // size of the base sorted run for size amp calculation
+  uint64_t base_sr_size = sorted_runs_.back().size;
+  size_t sr_end_idx = sorted_runs_.size() - 1;
+  // If tiered compaction is enabled and the last sorted run is the last level
+  if (ioptions_.preclude_last_level_data_seconds > 0 &&
+      ioptions_.num_levels > 2 &&
+      sorted_runs_.back().level == ioptions_.num_levels - 1 &&
+      sorted_runs_.size() > 1) {
+    sr_end_idx = sorted_runs_.size() - 2;
+    base_sr_size = sorted_runs_[sr_end_idx].size;
+  }
+
+  // keep adding up all the remaining files
+  for (size_t loop = start_index; loop < sr_end_idx; loop++) {
+    sr = &sorted_runs_[loop];
+    if (sr->being_compacted) {
+      // TODO: when incremental compaction is supported, we might want to
+      // schedule some incremental compactions in parallel if needed.
+      char file_num_buf[kFormatFileNumberBufSize];
+      sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+      ROCKS_LOG_BUFFER(
+          log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s",
+          cf_name_.c_str(), file_num_buf, start_index,
+          " is already being compacted. No size amp reduction possible.\n");
+      return nullptr;
+    }
+    candidate_size += sr->compensated_file_size;
+    candidate_count++;
+  }
+  if (candidate_count == 0) {
+    return nullptr;
+  }
+
+  // size amplification = percentage of additional size
+  if (candidate_size * 100 < ratio * base_sr_size) {
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
+        " earliest-file-size %" PRIu64,
+        cf_name_.c_str(), candidate_size, base_sr_size);
+    return nullptr;
+  } else {
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
+        " earliest-file-size %" PRIu64,
+        cf_name_.c_str(), candidate_size, base_sr_size);
+  }
+  // Since incremental compaction can't include more than the second last
+  // level, it can introduce a penalty, compared to full compaction. We
+  // hard code the penalty to be 80%. If we end up with a compaction
+  // fanout higher than 80% of full level compactions, we fall back
+  // to full level compaction.
+  // The 80% threshold is arbitrary and can be adjusted or made
+  // configurable in the future.
+  // This also prevents the case when compaction falls behind and we
+  // need to compact more levels for compactions to catch up.
+  if (mutable_cf_options_.compaction_options_universal.incremental) {
+    double fanout_threshold = static_cast<double>(base_sr_size) /
+                              static_cast<double>(candidate_size) * 1.8;
+    Compaction* picked = PickIncrementalForReduceSizeAmp(fanout_threshold);
+    if (picked != nullptr) {
+      // As the feature is still incremental, picking incremental compaction
+      // might fail and we will fall back to compacting the full level.
+      return picked;
+    }
+  }
+  return PickCompactionWithSortedRunRange(
+      start_index, sr_end_idx, CompactionReason::kUniversalSizeAmplification);
+}
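Editor's note: the core of the size-amplification decision above is a single inequality: compact everything down once the bytes in the newer runs reach the configured percentage of the base (oldest) run. A standalone restatement; the 200% used in the example is RocksDB's usual default for max_size_amplification_percent, stated here as an assumption:

```cpp
#include <cstdint>
#include <iostream>

// Mirrors: candidate_size * 100 >= ratio * base_sr_size
bool SizeAmpTriggered(uint64_t newer_runs_total, uint64_t base_run_size,
                      uint64_t max_size_amplification_percent) {
  return newer_runs_total * 100 >= max_size_amplification_percent * base_run_size;
}

int main() {
  // With an assumed 200% limit, 300 bytes of newer data over a 100-byte
  // base run triggers a full compaction; 100 bytes does not.
  std::cout << SizeAmpTriggered(300, 100, 200) << " "
            << SizeAmpTriggered(100, 100, 200) << "\n";  // 1 0
}
```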
+      return picked;
+    }
+  }
+  return PickCompactionWithSortedRunRange(
+      start_index, sr_end_idx, CompactionReason::kUniversalSizeAmplification);
+}
+
+Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
+    double fanout_threshold) {
+  // Try to find all potential compactions with total size just over
+  // options.max_compaction_size / 2, and take the one with the lowest
+  // fanout (defined in the declaration of the function).
+  // This is done by having a sliding window of the files at the second
+  // lowest level, and keep expanding it while finding overlapping files in
+  // the last level. Once the total size exceeds the size threshold,
+  // calculate the fanout value, and then shrink from the small side of the
+  // window. Keep doing it until the end.
+  // Finally, we try to include upper level files if they fall into
+  // the range.
+  //
+  // Note that it is a similar problem as leveled compaction's
+  // kMinOverlappingRatio priority, but instead of picking single files
+  // we expand to a target compaction size. The reason is that in
+  // leveled compaction, the actual fanout value tends to be high, e.g. 10,
+  // so even with a single file in the down-merging level, the extra size
+  // compacted in boundary files is at a lower ratio. But here users
+  // often size the second last level at 1/4, 1/3 or even 1/2 of the
+  // bottommost level, so picking a single file in the second last
+  // level would cause significant waste, which is not desirable.
+  //
+  // This algorithm has lots of room for improvement in picking more
+  // efficient compactions.
+  assert(sorted_runs_.size() >= 2);
+  int second_last_level = sorted_runs_[sorted_runs_.size() - 2].level;
+  if (second_last_level == 0) {
+    // Can't split Level 0.
+    return nullptr;
+  }
+  int output_level = sorted_runs_.back().level;
+  const std::vector<FileMetaData*>& bottom_files =
+      vstorage_->LevelFiles(output_level);
+  const std::vector<FileMetaData*>& files =
+      vstorage_->LevelFiles(second_last_level);
+  assert(!bottom_files.empty());
+  assert(!files.empty());
+
+  // std::unordered_map<uint64_t, uint64_t> file_to_order;
+
+  int picked_start_idx = 0;
+  int picked_end_idx = 0;
+  double picked_fanout = fanout_threshold;
+
+  // Use half target compaction bytes as anchor to stop growing second last
+  // level files, and reserve growing space for more overlapping bottom level,
+  // clean cut, files from other levels, etc.
+  uint64_t comp_thres_size = mutable_cf_options_.max_compaction_bytes / 2;
+  int start_idx = 0;
+  int bottom_end_idx = 0;
+  int bottom_start_idx = 0;
+  uint64_t non_bottom_size = 0;
+  uint64_t bottom_size = 0;
+  bool end_bottom_size_counted = false;
+  for (int end_idx = 0; end_idx < static_cast<int>(files.size()); end_idx++) {
+    FileMetaData* end_file = files[end_idx];
+
+    // Include bottommost level files smaller than the current second
+    // last level file.
+    int num_skipped = 0;
+    while (bottom_end_idx < static_cast<int>(bottom_files.size()) &&
+           icmp_->Compare(bottom_files[bottom_end_idx]->largest,
+                          end_file->smallest) < 0) {
+      if (!end_bottom_size_counted) {
+        bottom_size += bottom_files[bottom_end_idx]->fd.file_size;
+      }
+      bottom_end_idx++;
+      end_bottom_size_counted = false;
+      num_skipped++;
+    }
+
+    if (num_skipped > 1) {
+      // At least one file in the bottommost level falls into the file gap. No
+      // reason to include the file. We cut the range and start a new sliding
+      // window.
+      start_idx = end_idx;
+    }
+
+    if (start_idx == end_idx) {
+      // new sliding window.
+      non_bottom_size = 0;
+      bottom_size = 0;
+      bottom_start_idx = bottom_end_idx;
+      end_bottom_size_counted = false;
+    }
+
+    non_bottom_size += end_file->fd.file_size;
+
+    // Include all overlapping files in bottom level.
+    while (bottom_end_idx < static_cast<int>(bottom_files.size()) &&
+           icmp_->Compare(bottom_files[bottom_end_idx]->smallest,
+                          end_file->largest) < 0) {
+      if (!end_bottom_size_counted) {
+        bottom_size += bottom_files[bottom_end_idx]->fd.file_size;
+        end_bottom_size_counted = true;
+      }
+      if (icmp_->Compare(bottom_files[bottom_end_idx]->largest,
+                         end_file->largest) > 0) {
+        // The next-level file crosses the largest-key boundary of the
+        // current file.
+        break;
+      }
+      bottom_end_idx++;
+      end_bottom_size_counted = false;
+    }
+
+    if ((non_bottom_size + bottom_size > comp_thres_size ||
+         end_idx == static_cast<int>(files.size()) - 1) &&
+        non_bottom_size > 0) {  // Do we allow 0-size files at all?
+      // If it is a better compaction, remember it in picked* variables.
+      double fanout = static_cast<double>(bottom_size) /
+                      static_cast<double>(non_bottom_size);
+      if (fanout < picked_fanout) {
+        picked_start_idx = start_idx;
+        picked_end_idx = end_idx;
+        picked_fanout = fanout;
+      }
+      // Shrink from the start side to get back under comp_thres_size
+      while (non_bottom_size + bottom_size > comp_thres_size &&
+             start_idx <= end_idx) {
+        non_bottom_size -= files[start_idx]->fd.file_size;
+        start_idx++;
+        if (start_idx < static_cast<int>(files.size())) {
+          while (bottom_start_idx <= bottom_end_idx &&
+                 icmp_->Compare(bottom_files[bottom_start_idx]->largest,
+                                files[start_idx]->smallest) < 0) {
+            bottom_size -= bottom_files[bottom_start_idx]->fd.file_size;
+            bottom_start_idx++;
+          }
+        }
+      }
+    }
+  }
+
+  if (picked_fanout >= fanout_threshold) {
+    assert(picked_fanout == fanout_threshold);
+    return nullptr;
+  }
+
+  std::vector<CompactionInputFiles> inputs;
+  CompactionInputFiles bottom_level_inputs;
+  CompactionInputFiles second_last_level_inputs;
+  second_last_level_inputs.level = second_last_level;
+  bottom_level_inputs.level = output_level;
+  for (int i = picked_start_idx; i <= picked_end_idx; i++) {
+    if (files[i]->being_compacted) {
+      return nullptr;
+    }
+    second_last_level_inputs.files.push_back(files[i]);
+  }
+  assert(!second_last_level_inputs.empty());
+  if (!picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                       &second_last_level_inputs,
+                                       /*next_smallest=*/nullptr)) {
+    return nullptr;
+  }
+  // We might be able to avoid this binary search if we save and expand
+  // from bottom_start_idx and bottom_end_idx, but for now, we use
+  // SetupOtherInputs() for simplicity.
+  int parent_index = -1;  // Create and use bottom_start_idx?
+  if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+                                 &second_last_level_inputs,
+                                 &bottom_level_inputs, &parent_index,
+                                 /*base_index=*/-1)) {
+    return nullptr;
+  }
+
+  // Try to include files in upper levels if they fall into the range.
+  // Since we need to go from the lower level up, which is the reverse of
+  // the level order, we first write to a reversed data structure and
+  // finally copy it to the compaction inputs.
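+  // Worked example (illustrative numbers, not derived from any particular
+  // database): with sorted runs on levels {1, 2, 3, 5, 6}, the loop below
+  // skips the two last runs (levels 6 and 5), visits levels 3, 2, 1 in that
+  // order, and would stop early at a level-0 run. inputs_reverse is then
+  // copied in reverse, so the final inputs are ordered 1, 2, 3, with the
+  // second last level (5) and the bottom level (6) appended afterwards.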
+  InternalKey smallest, largest;
+  picker_->GetRange(second_last_level_inputs, &smallest, &largest);
+  std::vector<CompactionInputFiles> inputs_reverse;
+  for (auto it = ++(++sorted_runs_.rbegin()); it != sorted_runs_.rend(); it++) {
+    SortedRun& sr = *it;
+    if (sr.level == 0) {
+      break;
+    }
+    std::vector<FileMetaData*> level_inputs;
+    vstorage_->GetCleanInputsWithinInterval(sr.level, &smallest, &largest,
+                                            &level_inputs);
+    if (!level_inputs.empty()) {
+      inputs_reverse.push_back({});
+      inputs_reverse.back().level = sr.level;
+      inputs_reverse.back().files = level_inputs;
+      picker_->GetRange(inputs_reverse.back(), &smallest, &largest);
+    }
+  }
+  for (auto it = inputs_reverse.rbegin(); it != inputs_reverse.rend(); it++) {
+    inputs.push_back(*it);
+  }
+
+  inputs.push_back(second_last_level_inputs);
+  inputs.push_back(bottom_level_inputs);
+
+  int start_level = Compaction::kInvalidLevel;
+  for (const auto& in : inputs) {
+    if (!in.empty()) {
+      // inputs should already be sorted by level
+      start_level = in.level;
+      break;
+    }
+  }
+
+  // intra-L0 compaction outputs could have overlap
+  if (output_level != 0 &&
+      picker_->FilesRangeOverlapWithCompaction(
+          inputs, output_level,
+          Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+                                               start_level, output_level))) {
+    return nullptr;
+  }
+
+  // TODO: support multiple paths?
+  uint32_t path_id = 0;
+  return new Compaction(
+      vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+      std::move(inputs), output_level,
+      MaxFileSizeForLevel(mutable_cf_options_, output_level,
+                          kCompactionStyleUniversal),
+      GetMaxOverlappingBytes(), path_id,
+      GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1,
+                         true /* enable_compression */),
+      GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
+                            true /* enable_compression */),
+      Temperature::kUnknown,
+      /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+      /* trim_ts */ "", score_, false /* deletion_compaction */,
+      /* l0_files_might_overlap */ true,
+      CompactionReason::kUniversalSizeAmplification);
+}
+
+// Pick files marked for compaction. Typically, files are marked by
+// CompactOnDeleteCollector due to the presence of tombstones.
+Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
+  CompactionInputFiles start_level_inputs;
+  int output_level;
+  std::vector<CompactionInputFiles> inputs;
+  std::vector<FileMetaData*> grandparents;
+
+  if (vstorage_->num_levels() == 1) {
+    // This is single-level universal. Since we're basically trying to reclaim
+    // space by processing files marked for compaction due to high tombstone
+    // density, let's do the same thing as the compaction to reduce size amp,
+    // which has the same goal.
+    int start_index = -1;
+
+    start_level_inputs.level = 0;
+    start_level_inputs.files.clear();
+    output_level = 0;
+    // Find the first file marked for compaction. Ignore the last file.
+    for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) {
+      SortedRun* sr = &sorted_runs_[loop];
+      if (sr->being_compacted) {
+        continue;
+      }
+      FileMetaData* f = vstorage_->LevelFiles(0)[loop];
+      if (f->marked_for_compaction) {
+        start_level_inputs.files.push_back(f);
+        start_index =
+            static_cast<int>(loop);  // Consider this as the first candidate.
+ break; + } + } + if (start_index < 0) { + // Either no file marked, or they're already being compacted + return nullptr; + } + + for (size_t loop = start_index + 1; loop < sorted_runs_.size(); loop++) { + SortedRun* sr = &sorted_runs_[loop]; + if (sr->being_compacted) { + break; + } + + FileMetaData* f = vstorage_->LevelFiles(0)[loop]; + start_level_inputs.files.push_back(f); + } + if (start_level_inputs.size() <= 1) { + // If only the last file in L0 is marked for compaction, ignore it + return nullptr; + } + inputs.push_back(start_level_inputs); + } else { + int start_level; + + // For multi-level universal, the strategy is to make this look more like + // leveled. We pick one of the files marked for compaction and compact with + // overlapping files in the adjacent level. + picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level, + &output_level, &start_level_inputs); + if (start_level_inputs.empty()) { + return nullptr; + } + + // Pick the first non-empty level after the start_level + for (output_level = start_level + 1; output_level < vstorage_->num_levels(); + output_level++) { + if (vstorage_->NumLevelFiles(output_level) != 0) { + break; + } + } + + // If all higher levels are empty, pick the highest level as output level + if (output_level == vstorage_->num_levels()) { + if (start_level == 0) { + output_level = vstorage_->num_levels() - 1; + } else { + // If start level is non-zero and all higher levels are empty, this + // compaction will translate into a trivial move. Since the idea is + // to reclaim space and trivial move doesn't help with that, we + // skip compaction in this case and return nullptr + return nullptr; + } + } + if (ioptions_.allow_ingest_behind && + output_level == vstorage_->num_levels() - 1) { + assert(output_level > 1); + output_level--; + } + + if (output_level != 0) { + if (start_level == 0) { + if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs, + output_level, nullptr)) { + return nullptr; + } + } + + CompactionInputFiles output_level_inputs; + int parent_index = -1; + + output_level_inputs.level = output_level; + if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_, + &start_level_inputs, &output_level_inputs, + &parent_index, -1)) { + return nullptr; + } + inputs.push_back(start_level_inputs); + if (!output_level_inputs.empty()) { + inputs.push_back(output_level_inputs); + } + if (picker_->FilesRangeOverlapWithCompaction( + inputs, output_level, + Compaction::EvaluatePenultimateLevel( + vstorage_, ioptions_, start_level, output_level))) { + return nullptr; + } + + picker_->GetGrandparents(vstorage_, start_level_inputs, + output_level_inputs, &grandparents); + } else { + inputs.push_back(start_level_inputs); + } + } + + uint64_t estimated_total_size = 0; + // Use size of the output level as estimated file size + for (FileMetaData* f : vstorage_->LevelFiles(output_level)) { + estimated_total_size += f->fd.GetFileSize(); + } + uint32_t path_id = + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); + return new Compaction( + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + /* max_grandparent_overlap_bytes */ GetMaxOverlappingBytes(), path_id, + GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1), + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level), + Temperature::kUnknown, + /* max_subcompactions */ 0, grandparents, /* 
is manual */ false,
+      /* trim_ts */ "", score_, false /* deletion_compaction */,
+      /* l0_files_might_overlap */ true,
+      CompactionReason::kFilesMarkedForCompaction);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
+    size_t start_index, CompactionReason compaction_reason) {
+  return PickCompactionWithSortedRunRange(start_index, sorted_runs_.size() - 1,
+                                          compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
+    size_t start_index, size_t end_index, CompactionReason compaction_reason) {
+  assert(start_index < sorted_runs_.size());
+
+  // Estimate total file size
+  uint64_t estimated_total_size = 0;
+  for (size_t loop = start_index; loop <= end_index; loop++) {
+    estimated_total_size += sorted_runs_[loop].size;
+  }
+  uint32_t path_id =
+      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+  int start_level = sorted_runs_[start_index].level;
+
+  std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs[i].level = start_level + static_cast<int>(i);
+  }
+  for (size_t loop = start_index; loop <= end_index; loop++) {
+    auto& picking_sr = sorted_runs_[loop];
+    if (picking_sr.level == 0) {
+      FileMetaData* f = picking_sr.file;
+      inputs[0].files.push_back(f);
+    } else {
+      auto& files = inputs[picking_sr.level - start_level].files;
+      for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+        files.push_back(f);
+      }
+    }
+    std::string comp_reason_print_string;
+    if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+      comp_reason_print_string = "periodic compaction";
+    } else if (compaction_reason ==
+               CompactionReason::kUniversalSizeAmplification) {
+      comp_reason_print_string = "size amp";
+    } else {
+      assert(false);
+      comp_reason_print_string = "unknown: ";
+      comp_reason_print_string.append(
+          std::to_string(static_cast<int>(compaction_reason)));
+    }
+
+    char file_num_buf[256];
+    picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s",
+                     cf_name_.c_str(), comp_reason_print_string.c_str(),
+                     file_num_buf);
+  }
+
+  int output_level;
+  if (end_index == sorted_runs_.size() - 1) {
+    // output files at the last level, unless it's reserved
+    output_level = vstorage_->num_levels() - 1;
+    // last level is reserved for the files ingested behind
+    if (ioptions_.allow_ingest_behind) {
+      assert(output_level > 1);
+      output_level--;
+    }
+  } else {
+    // If the compaction doesn't include all sorted runs, it can only output
+    // to the level above the `end_index + 1` sorted run.
+    output_level = sorted_runs_[end_index + 1].level - 1;
+  }
+
+  // intra-L0 compaction outputs could have overlap
+  if (output_level != 0 &&
+      picker_->FilesRangeOverlapWithCompaction(
+          inputs, output_level,
+          Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+                                               start_level, output_level))) {
+    return nullptr;
+  }
+
+  // We never check size for
+  // compaction_options_universal.compression_size_percent,
+  // because we always compact all the files, so always compress.
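+  // Worked example (illustrative numbers): with sorted runs on levels
+  // {0, 0, 4, 6} and end_index pointing at the level-4 run, the next sorted
+  // run sits on level 6, so output_level = 6 - 1 = 5. If end_index were the
+  // last sorted run (and allow_ingest_behind unset), the output would go to
+  // num_levels() - 1 instead.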
+  return new Compaction(
+      vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+      std::move(inputs), output_level,
+      MaxFileSizeForLevel(mutable_cf_options_, output_level,
+                          kCompactionStyleUniversal),
+      GetMaxOverlappingBytes(), path_id,
+      GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1,
+                         true /* enable_compression */),
+      GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
+                            true /* enable_compression */),
+      Temperature::kUnknown,
+      /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+      /* trim_ts */ "", score_, false /* deletion_compaction */,
+      /* l0_files_might_overlap */ true, compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
+  ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction",
+                   cf_name_.c_str());
+
+  // In universal compaction, sorted runs containing older data are almost
+  // always generated earlier too. To simplify the problem, we just try to
+  // trigger a full compaction. We start from the oldest sorted run and
+  // include all sorted runs, until we hit a sorted run that is already
+  // being compacted. Since the largest (which is usually the oldest) sorted
+  // run is usually included anyway, doing a full compaction won't increase
+  // write amplification much.
+
+  // Get some information from marked files to check whether a file is
+  // included in the compaction.
+
+  size_t start_index = sorted_runs_.size();
+  while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) {
+    start_index--;
+  }
+  if (start_index == sorted_runs_.size()) {
+    return nullptr;
+  }
+
+  // There is a rare corner case where we can't pick up all the files
+  // because some files are being compacted, and we end up picking files
+  // none of which actually needs periodic compaction. To simplify the
+  // logic, we just execute the compaction anyway, unless it consists of
+  // only the last sorted run (either the last level or the last L0 file),
+  // in which case we verify that it covers a file marked for periodic
+  // compaction.
+  if (start_index == sorted_runs_.size() - 1) {
+    bool included_file_marked = false;
+    int start_level = sorted_runs_[start_index].level;
+    FileMetaData* start_file = sorted_runs_[start_index].file;
+    for (const std::pair<int, FileMetaData*>& level_file_pair :
+         vstorage_->FilesMarkedForPeriodicCompaction()) {
+      if (start_level != 0) {
+        // Last sorted run is a level
+        if (start_level == level_file_pair.first) {
+          included_file_marked = true;
+          break;
+        }
+      } else {
+        // Last sorted run is an L0 file.
+        if (start_file == level_file_pair.second) {
+          included_file_marked = true;
+          break;
+        }
+      }
+    }
+    if (!included_file_marked) {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: Cannot form a compaction covering file "
+                       "marked for periodic compaction",
+                       cf_name_.c_str());
+      return nullptr;
+    }
+  }
+
+  Compaction* c = PickCompactionToOldest(start_index,
+                                         CompactionReason::kPeriodicCompaction);
+
+  TEST_SYNC_POINT_CALLBACK(
+      "UniversalCompactionPicker::PickPeriodicCompaction:Return", c);
+
+  return c;
+}
+
+uint64_t UniversalCompactionBuilder::GetMaxOverlappingBytes() const {
+  if (!mutable_cf_options_.compaction_options_universal.incremental) {
+    return std::numeric_limits<uint64_t>::max();
+  } else {
+    // Try to align the cutting boundary with files at the next level so that
+    // a file doesn't end up with less than 1/2 of the target size, or
+    // overlapping two full-size files at the next level.
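+    // For example (illustrative numbers): with target_file_size_base set to
+    // 64 MB, the expression below permits up to 96 MB (1.5x the target file
+    // size) of overlap with grandparent files before an output file is cut.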
+ return mutable_cf_options_.target_file_size_base / 2 * 3; + } +} +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_universal.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_universal.h new file mode 100644 index 0000000000..cb16059692 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_picker_universal.h @@ -0,0 +1,30 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/compaction/compaction_picker.h" + +namespace ROCKSDB_NAMESPACE { +class UniversalCompactionPicker : public CompactionPicker { + public: + UniversalCompactionPicker(const ImmutableOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override; + virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_service_job.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_service_job.cc new file mode 100644 index 0000000000..3149bb5002 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_service_job.cc @@ -0,0 +1,833 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
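+//
+// Overview: this file contains both sides of the remote-compaction hand-off.
+// The primary instance serializes a CompactionServiceInput, submits it to the
+// configured CompactionService via StartV2()/WaitForCompleteV2(), and
+// installs the returned output files; CompactionServiceCompactionJob is the
+// worker-side job that runs the compaction and fills in a
+// CompactionServiceResult.
+//
+// Minimal configuration sketch (MyCompactionService is a hypothetical name
+// for a user-provided CompactionService subclass; the only field assumed
+// here is DBOptions::compaction_service, which the code below consults):
+//
+//   DBOptions db_options;
+//   db_options.compaction_service = std::make_shared<MyCompactionService>();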
+
+#include "db/compaction/compaction_job.h"
+#include "db/compaction/compaction_state.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SubcompactionState;
+
+CompactionServiceJobStatus
+CompactionJob::ProcessKeyValueCompactionWithCompactionService(
+    SubcompactionState* sub_compact) {
+  assert(sub_compact);
+  assert(sub_compact->compaction);
+  assert(db_options_.compaction_service);
+
+  const Compaction* compaction = sub_compact->compaction;
+  CompactionServiceInput compaction_input;
+  compaction_input.output_level = compaction->output_level();
+  compaction_input.db_id = db_id_;
+
+  const std::vector<CompactionInputFiles>& inputs =
+      *(compact_->compaction->inputs());
+  for (const auto& files_per_level : inputs) {
+    for (const auto& file : files_per_level.files) {
+      compaction_input.input_files.emplace_back(
+          MakeTableFileName(file->fd.GetNumber()));
+    }
+  }
+  compaction_input.column_family.name =
+      compaction->column_family_data()->GetName();
+  compaction_input.column_family.options =
+      compaction->column_family_data()->GetLatestCFOptions();
+  compaction_input.db_options =
+      BuildDBOptions(db_options_, mutable_db_options_copy_);
+  compaction_input.snapshots = existing_snapshots_;
+  compaction_input.has_begin = sub_compact->start.has_value();
+  compaction_input.begin =
+      compaction_input.has_begin ? sub_compact->start->ToString() : "";
+  compaction_input.has_end = sub_compact->end.has_value();
+  compaction_input.end =
+      compaction_input.has_end ? sub_compact->end->ToString() : "";
+
+  std::string compaction_input_binary;
+  Status s = compaction_input.Write(&compaction_input_binary);
+  if (!s.ok()) {
+    sub_compact->status = s;
+    return CompactionServiceJobStatus::kFailure;
+  }
+
+  std::ostringstream input_files_oss;
+  bool is_first_one = true;
+  for (const auto& file : compaction_input.input_files) {
+    input_files_oss << (is_first_one ?
"" : ", ") << file; + is_first_one = false; + } + + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Starting remote compaction (output level: %d): %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_input.output_level, input_files_oss.str().c_str()); + CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_, + GetCompactionId(sub_compact), thread_pri_); + CompactionServiceJobStatus compaction_status = + db_options_.compaction_service->StartV2(info, compaction_input_binary); + switch (compaction_status) { + case CompactionServiceJobStatus::kSuccess: + break; + case CompactionServiceJobStatus::kFailure: + sub_compact->status = Status::Incomplete( + "CompactionService failed to start compaction job."); + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed to start.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + case CompactionServiceJobStatus::kUseLocal: + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Remote compaction fallback to local by API Start.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + default: + assert(false); // unknown status + break; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Waiting for remote compaction...", + compaction_input.column_family.name.c_str(), job_id_); + std::string compaction_result_binary; + compaction_status = db_options_.compaction_service->WaitForCompleteV2( + info, &compaction_result_binary); + + if (compaction_status == CompactionServiceJobStatus::kUseLocal) { + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Remote compaction fallback to local by API " + "WaitForComplete.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + } + + CompactionServiceResult compaction_result; + s = CompactionServiceResult::Read(compaction_result_binary, + &compaction_result); + + if (compaction_status == CompactionServiceJobStatus::kFailure) { + if (s.ok()) { + if (compaction_result.status.ok()) { + sub_compact->status = Status::Incomplete( + "CompactionService failed to run the compaction job (even though " + "the internal status is okay)."); + } else { + // set the current sub compaction status with the status returned from + // remote + sub_compact->status = compaction_result.status; + } + } else { + sub_compact->status = Status::Incomplete( + "CompactionService failed to run the compaction job (and no valid " + "result is returned)."); + compaction_result.status.PermitUncheckedError(); + } + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + } + + if (!s.ok()) { + sub_compact->status = s; + compaction_result.status.PermitUncheckedError(); + return CompactionServiceJobStatus::kFailure; + } + sub_compact->status = compaction_result.status; + + std::ostringstream output_files_oss; + is_first_one = true; + for (const auto& file : compaction_result.output_files) { + output_files_oss << (is_first_one ? 
"" : ", ") << file.file_name; + is_first_one = false; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Receive remote compaction result, output path: " + "%s, files: %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_result.output_path.c_str(), + output_files_oss.str().c_str()); + + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + for (const auto& file : compaction_result.output_files) { + uint64_t file_num = versions_->NewFileNumber(); + auto src_file = compaction_result.output_path + "/" + file.file_name; + auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths, + file_num, compaction->output_path_id()); + s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + FileMetaData meta; + uint64_t file_size; + s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size, + file.smallest_seqno, file.largest_seqno); + meta.smallest.DecodeFrom(file.smallest_internal_key); + meta.largest.DecodeFrom(file.largest_internal_key); + meta.oldest_ancester_time = file.oldest_ancester_time; + meta.file_creation_time = file.file_creation_time; + meta.epoch_number = file.epoch_number; + meta.marked_for_compaction = file.marked_for_compaction; + meta.unique_id = file.unique_id; + + auto cfd = compaction->column_family_data(); + sub_compact->Current().AddOutput(std::move(meta), + cfd->internal_comparator(), false, false, + true, file.paranoid_hash); + } + sub_compact->compaction_job_stats = compaction_result.stats; + sub_compact->Current().SetNumOutputRecords( + compaction_result.num_output_records); + sub_compact->Current().SetTotalBytes(compaction_result.total_bytes); + RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read); + RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES, + compaction_result.bytes_written); + return CompactionServiceJobStatus::kSuccess; +} + +std::string CompactionServiceCompactionJob::GetTableFileName( + uint64_t file_number) { + return MakeTableFileName(output_path_, file_number); +} + +void CompactionServiceCompactionJob::RecordCompactionIOStats() { + compaction_result_->bytes_read += IOSTATS(bytes_read); + compaction_result_->bytes_written += IOSTATS(bytes_written); + CompactionJob::RecordCompactionIOStats(); +} + +CompactionServiceCompactionJob::CompactionServiceCompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, + VersionSet* versions, const std::atomic* shutting_down, + LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + std::shared_ptr table_cache, EventLogger* event_logger, + const std::string& dbname, const std::shared_ptr& io_tracer, + const std::atomic& manual_compaction_canceled, + const std::string& db_id, const std::string& db_session_id, + std::string output_path, + const CompactionServiceInput& compaction_service_input, + CompactionServiceResult* compaction_service_result) + : CompactionJob( + job_id, compaction, db_options, mutable_db_options, file_options, + versions, shutting_down, log_buffer, nullptr, output_directory, + nullptr, stats, 
db_mutex, db_error_handler,
+          std::move(existing_snapshots), kMaxSequenceNumber, nullptr, nullptr,
+          std::move(table_cache), event_logger,
+          compaction->mutable_cf_options()->paranoid_file_checks,
+          compaction->mutable_cf_options()->report_bg_io_stats, dbname,
+          &(compaction_service_result->stats), Env::Priority::USER, io_tracer,
+          manual_compaction_canceled, db_id, db_session_id,
+          compaction->column_family_data()->GetFullHistoryTsLow()),
+      output_path_(std::move(output_path)),
+      compaction_input_(compaction_service_input),
+      compaction_result_(compaction_service_result) {}
+
+Status CompactionServiceCompactionJob::Run() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_RUN);
+
+  auto* c = compact_->compaction;
+  assert(c->column_family_data() != nullptr);
+  assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
+             compact_->compaction->level()) > 0);
+
+  write_hint_ =
+      c->column_family_data()->CalculateSSTWriteHint(c->output_level());
+  bottommost_level_ = c->bottommost_level();
+
+  Slice begin = compaction_input_.begin;
+  Slice end = compaction_input_.end;
+  compact_->sub_compact_states.emplace_back(
+      c,
+      compaction_input_.has_begin ? std::optional<Slice>(begin)
+                                  : std::optional<Slice>(),
+      compaction_input_.has_end ? std::optional<Slice>(end)
+                                : std::optional<Slice>(),
+      /*sub_job_id*/ 0);
+
+  log_buffer_->FlushBufferToLog();
+  LogCompaction();
+  const uint64_t start_micros = db_options_.clock->NowMicros();
+  // Pick the only sub-compaction we should have
+  assert(compact_->sub_compact_states.size() == 1);
+  SubcompactionState* sub_compact = compact_->sub_compact_states.data();
+
+  ProcessKeyValueCompaction(sub_compact);
+
+  compaction_stats_.stats.micros =
+      db_options_.clock->NowMicros() - start_micros;
+  compaction_stats_.stats.cpu_micros =
+      sub_compact->compaction_job_stats.cpu_micros;
+
+  RecordTimeToHistogram(stats_, COMPACTION_TIME,
+                        compaction_stats_.stats.micros);
+  RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+                        compaction_stats_.stats.cpu_micros);
+
+  Status status = sub_compact->status;
+  IOStatus io_s = sub_compact->io_status;
+
+  if (io_status_.ok()) {
+    io_status_ = io_s;
+  }
+
+  if (status.ok()) {
+    constexpr IODebugContext* dbg = nullptr;
+
+    if (output_directory_) {
+      io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg,
+                                                    DirFsyncOptions());
+    }
+  }
+  if (io_status_.ok()) {
+    io_status_ = io_s;
+  }
+  if (status.ok()) {
+    status = io_s;
+  }
+  if (status.ok()) {
+    // TODO: Add verify_table()
+  }
+
+  // Finish up all book-keeping to unify the subcompaction results
+  compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
+  UpdateCompactionStats();
+  RecordCompactionIOStats();
+
+  LogFlush(db_options_.info_log);
+  compact_->status = status;
+  compact_->status.PermitUncheckedError();
+
+  // Build compaction result
+  compaction_result_->output_level = compact_->compaction->output_level();
+  compaction_result_->output_path = output_path_;
+  for (const auto& output_file : sub_compact->GetOutputs()) {
+    auto& meta = output_file.meta;
+    compaction_result_->output_files.emplace_back(
+        MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno,
+        meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
+        meta.largest.Encode().ToString(), meta.oldest_ancester_time,
+        meta.file_creation_time, meta.epoch_number,
+        output_file.validator.GetHash(), meta.marked_for_compaction,
+        meta.unique_id);
+  }
+  InternalStats::CompactionStatsFull compaction_stats;
+  sub_compact->AggregateCompactionStats(compaction_stats);
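+  // Copy the aggregated counters into the serializable result that will be
+  // sent back to the primary instance.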
+  compaction_result_->num_output_records =
+      compaction_stats.stats.num_output_records;
+  compaction_result_->total_bytes = compaction_stats.TotalBytesWritten();
+
+  return status;
+}
+
+void CompactionServiceCompactionJob::CleanupCompaction() {
+  CompactionJob::CleanupCompaction();
+}
+
+// Internal binary format for the input and result data
+enum BinaryFormatVersion : uint32_t {
+  kOptionsString = 1,  // Use string format similar to Option string format
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cfd_type_info = {
+    {"name",
+     {offsetof(struct ColumnFamilyDescriptor, name), OptionType::kEncodedString,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"options",
+     {offsetof(struct ColumnFamilyDescriptor, options),
+      OptionType::kConfigurable, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone,
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto cf_options = static_cast<ColumnFamilyOptions*>(addr);
+        return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(),
+                                                value, cf_options);
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto cf_options = static_cast<const ColumnFamilyOptions*>(addr);
+        std::string result;
+        auto status =
+            GetStringFromColumnFamilyOptions(opts, *cf_options, &result);
+        *value = "{" + result + "}";
+        return status;
+      },
+      [](const ConfigOptions& opts, const std::string& name, const void* addr1,
+         const void* addr2, std::string* mismatch) {
+        const auto this_one = static_cast<const ColumnFamilyOptions*>(addr1);
+        const auto that_one = static_cast<const ColumnFamilyOptions*>(addr2);
+        auto this_conf = CFOptionsAsConfigurable(*this_one);
+        auto that_conf = CFOptionsAsConfigurable(*that_one);
+        std::string mismatch_opt;
+        bool result =
+            this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
+        if (!result) {
+          *mismatch = name + "." + mismatch_opt;
+        }
+        return result;
+      }}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = {
+    {"column_family",
+     OptionTypeInfo::Struct(
+         "column_family", &cfd_type_info,
+         offsetof(struct CompactionServiceInput, column_family),
+         OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+    {"db_options",
+     {offsetof(struct CompactionServiceInput, db_options),
+      OptionType::kConfigurable, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone,
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto options = static_cast<DBOptions*>(addr);
+        return GetDBOptionsFromString(opts, DBOptions(), value, options);
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto options = static_cast<const DBOptions*>(addr);
+        std::string result;
+        auto status = GetStringFromDBOptions(opts, *options, &result);
+        *value = "{" + result + "}";
+        return status;
+      },
+      [](const ConfigOptions& opts, const std::string& name, const void* addr1,
+         const void* addr2, std::string* mismatch) {
+        const auto this_one = static_cast<const DBOptions*>(addr1);
+        const auto that_one = static_cast<const DBOptions*>(addr2);
+        auto this_conf = DBOptionsAsConfigurable(*this_one);
+        auto that_conf = DBOptionsAsConfigurable(*that_one);
+        std::string mismatch_opt;
+        bool result =
+            this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
+        if (!result) {
+          *mismatch = name + "." + mismatch_opt;
+        }
+        return result;
+      }}},
+    {"snapshots", OptionTypeInfo::Vector<uint64_t>(
+                      offsetof(struct CompactionServiceInput, snapshots),
+                      OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+                      {0, OptionType::kUInt64T})},
+    {"input_files", OptionTypeInfo::Vector<std::string>(
+                        offsetof(struct CompactionServiceInput, input_files),
+                        OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+                        {0, OptionType::kEncodedString})},
+    {"output_level",
+     {offsetof(struct CompactionServiceInput, output_level), OptionType::kInt,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"db_id",
+     {offsetof(struct CompactionServiceInput, db_id),
+      OptionType::kEncodedString}},
+    {"has_begin",
+     {offsetof(struct CompactionServiceInput, has_begin), OptionType::kBoolean,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"begin",
+     {offsetof(struct CompactionServiceInput, begin),
+      OptionType::kEncodedString, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"has_end",
+     {offsetof(struct CompactionServiceInput, has_end), OptionType::kBoolean,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"end",
+     {offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    cs_output_file_type_info = {
+        {"file_name",
+         {offsetof(struct CompactionServiceOutputFile, file_name),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"smallest_seqno",
+         {offsetof(struct CompactionServiceOutputFile, smallest_seqno),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"largest_seqno",
+         {offsetof(struct CompactionServiceOutputFile, largest_seqno),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"smallest_internal_key",
+         {offsetof(struct CompactionServiceOutputFile, smallest_internal_key),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"largest_internal_key",
+         {offsetof(struct CompactionServiceOutputFile, largest_internal_key),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"oldest_ancester_time",
+         {offsetof(struct CompactionServiceOutputFile, oldest_ancester_time),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_creation_time",
+         {offsetof(struct CompactionServiceOutputFile, file_creation_time),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"epoch_number",
+         {offsetof(struct CompactionServiceOutputFile, epoch_number),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"paranoid_hash",
+         {offsetof(struct CompactionServiceOutputFile, paranoid_hash),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"marked_for_compaction",
+         {offsetof(struct CompactionServiceOutputFile, marked_for_compaction),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"unique_id",
+         OptionTypeInfo::Array<uint64_t, 2>(
+             offsetof(struct CompactionServiceOutputFile, unique_id),
+             OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+             {0, OptionType::kUInt64T})},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    compaction_job_stats_type_info = {
+        {"elapsed_micros",
+         {offsetof(struct CompactionJobStats, elapsed_micros),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"cpu_micros",
+         
{offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"num_input_records", + {offsetof(struct CompactionJobStats, num_input_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_blobs_read", + {offsetof(struct CompactionJobStats, num_blobs_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files", + {offsetof(struct CompactionJobStats, num_input_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files_at_output_level", + {offsetof(struct CompactionJobStats, num_input_files_at_output_level), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_records", + {offsetof(struct CompactionJobStats, num_output_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files", + {offsetof(struct CompactionJobStats, num_output_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files_blob", + {offsetof(struct CompactionJobStats, num_output_files_blob), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_full_compaction", + {offsetof(struct CompactionJobStats, is_full_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_manual_compaction", + {offsetof(struct CompactionJobStats, is_manual_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_bytes", + {offsetof(struct CompactionJobStats, total_input_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_blob_bytes_read", + {offsetof(struct CompactionJobStats, total_blob_bytes_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes", + {offsetof(struct CompactionJobStats, total_output_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes_blob", + {offsetof(struct CompactionJobStats, total_output_bytes_blob), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_records_replaced", + {offsetof(struct CompactionJobStats, num_records_replaced), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_raw_key_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_key_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_raw_value_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_value_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_deletion_records", + {offsetof(struct CompactionJobStats, num_input_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_expired_deletion_records", + {offsetof(struct CompactionJobStats, num_expired_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_corrupt_keys", + {offsetof(struct CompactionJobStats, num_corrupt_keys), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_write_nanos", + {offsetof(struct CompactionJobStats, file_write_nanos), + 
OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_range_sync_nanos", + {offsetof(struct CompactionJobStats, file_range_sync_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_fsync_nanos", + {offsetof(struct CompactionJobStats, file_fsync_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_prepare_write_nanos", + {offsetof(struct CompactionJobStats, file_prepare_write_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_output_key_prefix", + {offsetof(struct CompactionJobStats, smallest_output_key_prefix), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_output_key_prefix", + {offsetof(struct CompactionJobStats, largest_output_key_prefix), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_single_del_fallthru", + {offsetof(struct CompactionJobStats, num_single_del_fallthru), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_single_del_mismatch", + {offsetof(struct CompactionJobStats, num_single_del_mismatch), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +namespace { +// this is a helper struct to serialize and deserialize class Status, because +// Status's members are not public. +struct StatusSerializationAdapter { + uint8_t code; + uint8_t subcode; + uint8_t severity; + std::string message; + + StatusSerializationAdapter() = default; + explicit StatusSerializationAdapter(const Status& s) { + code = s.code(); + subcode = s.subcode(); + severity = s.severity(); + auto msg = s.getState(); + message = msg ? 
msg : "";
+  }
+
+  Status GetStatus() const {
+    return Status{static_cast<Status::Code>(code),
+                  static_cast<Status::SubCode>(subcode),
+                  static_cast<Status::Severity>(severity), message};
+  }
+};
+}  // namespace
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    status_adapter_type_info = {
+        {"code",
+         {offsetof(struct StatusSerializationAdapter, code),
+          OptionType::kUInt8T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"subcode",
+         {offsetof(struct StatusSerializationAdapter, subcode),
+          OptionType::kUInt8T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"severity",
+         {offsetof(struct StatusSerializationAdapter, severity),
+          OptionType::kUInt8T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"message",
+         {offsetof(struct StatusSerializationAdapter, message),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cs_result_type_info = {
+    {"status",
+     {offsetof(struct CompactionServiceResult, status),
+      OptionType::kCustomizable, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone,
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto status_obj = static_cast<Status*>(addr);
+        StatusSerializationAdapter adapter;
+        Status s = OptionTypeInfo::ParseType(
+            opts, value, status_adapter_type_info, &adapter);
+        *status_obj = adapter.GetStatus();
+        return s;
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto status_obj = static_cast<const Status*>(addr);
+        StatusSerializationAdapter adapter(*status_obj);
+        std::string result;
+        Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info,
+                                                 &adapter, &result);
+        *value = "{" + result + "}";
+        return s;
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr1, const void* addr2, std::string* mismatch) {
+        const auto status1 = static_cast<const Status*>(addr1);
+        const auto status2 = static_cast<const Status*>(addr2);
+
+        StatusSerializationAdapter adapter1(*status1);
+        StatusSerializationAdapter adapter2(*status2);
+        return OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info,
+                                             &adapter1, &adapter2, mismatch);
+      }}},
+    {"output_files",
+     OptionTypeInfo::Vector<CompactionServiceOutputFile>(
+         offsetof(struct CompactionServiceResult, output_files),
+         OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+         OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0,
+                                OptionVerificationType::kNormal,
+                                OptionTypeFlags::kNone))},
+    {"output_level",
+     {offsetof(struct CompactionServiceResult, output_level), OptionType::kInt,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"output_path",
+     {offsetof(struct CompactionServiceResult, output_path),
+      OptionType::kEncodedString, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"num_output_records",
+     {offsetof(struct CompactionServiceResult, num_output_records),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"total_bytes",
+     {offsetof(struct CompactionServiceResult, total_bytes),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"bytes_read",
+     {offsetof(struct CompactionServiceResult, bytes_read),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"bytes_written",
+     {offsetof(struct CompactionServiceResult, bytes_written),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"stats", OptionTypeInfo::Struct(
+                  "stats", &compaction_job_stats_type_info,
+                  offsetof(struct
CompactionServiceResult, stats), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, +}; + +Status CompactionServiceInput::Read(const std::string& data_str, + CompactionServiceInput* obj) { + if (data_str.size() <= sizeof(BinaryFormatVersion)) { + return Status::InvalidArgument("Invalid CompactionServiceInput string"); + } + auto format_version = DecodeFixed32(data_str.data()); + if (format_version == kOptionsString) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + cf.ignore_unknown_options = true; + return OptionTypeInfo::ParseType( + cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info, + obj); + } else { + return Status::NotSupported( + "Compaction Service Input data version not supported: " + + std::to_string(format_version)); + } +} + +Status CompactionServiceInput::Write(std::string* output) { + char buf[sizeof(BinaryFormatVersion)]; + EncodeFixed32(buf, kOptionsString); + output->append(buf, sizeof(BinaryFormatVersion)); + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output); +} + +Status CompactionServiceResult::Read(const std::string& data_str, + CompactionServiceResult* obj) { + if (data_str.size() <= sizeof(BinaryFormatVersion)) { + return Status::InvalidArgument("Invalid CompactionServiceResult string"); + } + auto format_version = DecodeFixed32(data_str.data()); + if (format_version == kOptionsString) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + cf.ignore_unknown_options = true; + return OptionTypeInfo::ParseType( + cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info, + obj); + } else { + return Status::NotSupported( + "Compaction Service Result data version not supported: " + + std::to_string(format_version)); + } +} + +Status CompactionServiceResult::Write(std::string* output) { + char buf[sizeof(BinaryFormatVersion)]; + EncodeFixed32(buf, kOptionsString); + output->append(buf, sizeof(BinaryFormatVersion)); + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output); +} + +#ifndef NDEBUG +bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) { + std::string mismatch; + return TEST_Equals(other, &mismatch); +} + +bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other, + std::string* mismatch) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other, + mismatch); +} + +bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) { + std::string mismatch; + return TEST_Equals(other, &mismatch); +} + +bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other, + std::string* mismatch) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other, + mismatch); +} +#endif // NDEBUG +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_state.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_state.cc new file mode 100644 index 0000000000..ee4b0c1896 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_state.cc @@ -0,0 +1,46 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. 
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_state.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Slice CompactionState::SmallestUserKey() {
+  for (const auto& sub_compact_state : sub_compact_states) {
+    Slice smallest = sub_compact_state.SmallestUserKey();
+    if (!smallest.empty()) {
+      return smallest;
+    }
+  }
+  // If there is no finished output, return an empty slice.
+  return Slice{nullptr, 0};
+}
+
+Slice CompactionState::LargestUserKey() {
+  for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend();
+       ++it) {
+    Slice largest = it->LargestUserKey();
+    if (!largest.empty()) {
+      return largest;
+    }
+  }
+  // If there is no finished output, return an empty slice.
+  return Slice{nullptr, 0};
+}
+
+void CompactionState::AggregateCompactionStats(
+    InternalStats::CompactionStatsFull& compaction_stats,
+    CompactionJobStats& compaction_job_stats) {
+  for (const auto& sc : sub_compact_states) {
+    sc.AggregateCompactionStats(compaction_stats);
+    compaction_job_stats.Add(sc.compaction_job_stats);
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_state.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_state.h
new file mode 100644
index 0000000000..cc5b66c682
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/compaction_state.h
@@ -0,0 +1,42 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/subcompaction_state.h"
+#include "db/internal_stats.h"
+
+// Data structures used for compaction_job and compaction_service_job, which
+// hold the list of sub_compact_states and the aggregated information for
+// the compaction.
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state for the entire compaction
+class CompactionState {
+ public:
+  Compaction* const compaction;
+
+  // REQUIRED: subcompaction states are stored in order of increasing key-range
+  std::vector<SubcompactionState> sub_compact_states;
+  Status status;
+
+  void AggregateCompactionStats(
+      InternalStats::CompactionStatsFull& compaction_stats,
+      CompactionJobStats& compaction_job_stats);
+
+  explicit CompactionState(Compaction* c) : compaction(c) {}
+
+  Slice SmallestUserKey();
+
+  Slice LargestUserKey();
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/file_pri.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/file_pri.h
new file mode 100644
index 0000000000..82dddcf938
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/file_pri.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+#include <algorithm>
+
+#include "db/version_edit.h"
+
+namespace ROCKSDB_NAMESPACE {
+// We boost files that are closer to the TTL limit. This boosting could be
+// done through FileMetaData.compensated_file_size, but that compensated size
+// is widely used as something similar to file size, so dramatically boosting
+// the value might cause unintended consequences.
+//
+// This boosting algorithm can go very fancy, but here we use a simple
+// formula which can satisfy:
+// (1) Different levels are triggered slightly differently to avoid
+//     too many cascading cases
+// (2) Files in the same level get boosted more as the TTL gets closer.
+//
+// Don't do any boosting before half of the TTL has passed. This keeps
+// write amplification lower in the common case. All levels should be
+// fully boosted when the total TTL compaction threshold triggers.
+// Differentiate the boosting ranges of each level by 1/2. This makes the
+// range for each level increase exponentially. We could instead make them
+// equal, or go even fancier. We can adjust it after we observe the
+// behavior in production.
+// The thresholds where boosting starts:
+// +------------------------------------------------------------------+
+//  ^                            ^                ^          ^    ^  ^
+//  Age 0 ...        |           |     second last level threshold
+//                   |           |
+//                   |           third last level
+//                   |
+//                   fourth last level
+//
+// The boost arbitrarily starts at 0 when a file reaches boost_age_start and
+// grows linearly. The ratio is arbitrarily set so that when the next level
+// starts to boost, the previous level's boosting amount is 16.
+class FileTtlBooster {
+ public:
+  FileTtlBooster(uint64_t current_time, uint64_t ttl, int num_non_empty_levels,
+                 int level)
+      : current_time_(current_time) {
+    if (ttl == 0 || level == 0 || level >= num_non_empty_levels - 1) {
+      enabled_ = false;
+      boost_age_start_ = 0;
+      boost_step_ = 1;
+    } else {
+      enabled_ = true;
+      uint64_t all_boost_start_age = ttl / 2;
+      uint64_t all_boost_age_range = (ttl / 32) * 31 - all_boost_start_age;
+      uint64_t boost_age_range =
+          all_boost_age_range >> (num_non_empty_levels - level - 1);
+      boost_age_start_ = all_boost_start_age + boost_age_range;
+      const uint64_t kBoostRatio = 16;
+      // Prevent a zero value to avoid division-by-zero errors.
+      boost_step_ = std::max(boost_age_range / kBoostRatio, uint64_t{1});
+    }
+  }
+
+  uint64_t GetBoostScore(FileMetaData* f) {
+    if (!enabled_) {
+      return 1;
+    }
+    uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+    if (oldest_ancester_time >= current_time_) {
+      return 1;
+    }
+    uint64_t age = current_time_ - oldest_ancester_time;
+    if (age > boost_age_start_) {
+      // Use integer just for convenience.
+      // We could make all file_to_order double if we want.
+      // Technically this can overflow if users override timing and
+      // give a very high current time. Ignore the case for simplicity.
+      // Boosting is an addition to the current value, so +1. This effectively
+      // makes boosting kick in after the first boost_step_ is reached.
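+      // Rough worked example with hypothetical numbers (not taken from any
+      // real configuration): ttl = 64 days, num_non_empty_levels = 4,
+      // level = 2 gives all_boost_start_age = 32d,
+      // all_boost_age_range = 62d - 32d = 30d, boost_age_range = 30d >> 1 =
+      // 15d, boost_age_start_ = 47d and boost_step_ = max(15d / 16, 1) = 1d.
+      // A file aged 55 days then gets a boost score of (55 - 47) / 1 + 1 = 9.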
+      return (age - boost_age_start_) / boost_step_ + 1;
+    }
+    return 1;
+  }
+
+ private:
+  bool enabled_;
+  uint64_t current_time_;
+  uint64_t boost_age_start_;
+  uint64_t boost_step_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/sst_partitioner.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/sst_partitioner.cc
new file mode 100644
index 0000000000..2f4d879357
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/sst_partitioner.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "rocksdb/sst_partitioner.h"
+
+#include <algorithm>
+
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+static std::unordered_map<std::string, OptionTypeInfo>
+    sst_fixed_prefix_type_info = {
+        {"length",
+         {0, OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+};
+
+SstPartitionerFixedPrefixFactory::SstPartitionerFixedPrefixFactory(size_t len)
+    : len_(len) {
+  RegisterOptions("Length", &len_, &sst_fixed_prefix_type_info);
+}
+
+PartitionerResult SstPartitionerFixedPrefix::ShouldPartition(
+    const PartitionerRequest& request) {
+  Slice last_key_fixed(*request.prev_user_key);
+  if (last_key_fixed.size() > len_) {
+    last_key_fixed.size_ = len_;
+  }
+  Slice current_key_fixed(*request.current_user_key);
+  if (current_key_fixed.size() > len_) {
+    current_key_fixed.size_ = len_;
+  }
+  return last_key_fixed.compare(current_key_fixed) != 0 ? kRequired
+                                                        : kNotRequired;
+}
+
+bool SstPartitionerFixedPrefix::CanDoTrivialMove(
+    const Slice& smallest_user_key, const Slice& largest_user_key) {
+  return ShouldPartition(PartitionerRequest(smallest_user_key, largest_user_key,
+                                            0)) == kNotRequired;
+}
+
+std::unique_ptr<SstPartitioner>
+SstPartitionerFixedPrefixFactory::CreatePartitioner(
+    const SstPartitioner::Context& /* context */) const {
+  return std::unique_ptr<SstPartitioner>(new SstPartitionerFixedPrefix(len_));
+}
+
+std::shared_ptr<SstPartitionerFactory> NewSstPartitionerFixedPrefixFactory(
+    size_t prefix_len) {
+  return std::make_shared<SstPartitionerFixedPrefixFactory>(prefix_len);
+}
+
+namespace {
+static int RegisterSstPartitionerFactories(ObjectLibrary& library,
+                                           const std::string& /*arg*/) {
+  library.AddFactory<SstPartitionerFactory>(
+      SstPartitionerFixedPrefixFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<SstPartitionerFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new SstPartitionerFixedPrefixFactory(0));
+        return guard->get();
+      });
+  return 1;
+}
+}  // namespace
+
+Status SstPartitionerFactory::CreateFromString(
+    const ConfigOptions& options, const std::string& value,
+    std::shared_ptr<SstPartitionerFactory>* result) {
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterSstPartitionerFactories(*(ObjectLibrary::Default().get()), "");
+  });
+  return LoadSharedObject<SstPartitionerFactory>(options, value, result);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/subcompaction_state.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/subcompaction_state.cc
new file mode 100644
index 0000000000..0c56471e92
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/subcompaction_state.cc
@@ -0,0 +1,106 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/subcompaction_state.h"
+
+#include "rocksdb/sst_partitioner.h"
+
+namespace ROCKSDB_NAMESPACE {
+void SubcompactionState::AggregateCompactionStats(
+    InternalStats::CompactionStatsFull& compaction_stats) const {
+  compaction_stats.stats.Add(compaction_outputs_.stats_);
+  if (HasPenultimateLevelOutputs()) {
+    compaction_stats.has_penultimate_level_output = true;
+    compaction_stats.penultimate_level_stats.Add(
+        penultimate_level_outputs_.stats_);
+  }
+}
+
+OutputIterator SubcompactionState::GetOutputs() const {
+  return OutputIterator(penultimate_level_outputs_.outputs_,
+                        compaction_outputs_.outputs_);
+}
+
+void SubcompactionState::Cleanup(Cache* cache) {
+  penultimate_level_outputs_.Cleanup();
+  compaction_outputs_.Cleanup();
+
+  if (!status.ok()) {
+    for (const auto& out : GetOutputs()) {
+      // If this file was inserted into the table cache then remove
+      // it here because this compaction was not committed.
+      TableCache::Evict(cache, out.meta.fd.GetNumber());
+    }
+  }
+  // TODO: sub_compact.io_status is not checked like status. Not sure if that's
+  // intentional. So ignoring the io_status as of now.
+  io_status.PermitUncheckedError();
+}
+
+Slice SubcompactionState::SmallestUserKey() const {
+  if (has_penultimate_level_outputs_) {
+    Slice a = compaction_outputs_.SmallestUserKey();
+    Slice b = penultimate_level_outputs_.SmallestUserKey();
+    if (a.empty()) {
+      return b;
+    }
+    if (b.empty()) {
+      return a;
+    }
+    const Comparator* user_cmp =
+        compaction->column_family_data()->user_comparator();
+    if (user_cmp->Compare(a, b) > 0) {
+      return b;
+    } else {
+      return a;
+    }
+  } else {
+    return compaction_outputs_.SmallestUserKey();
+  }
+}
+
+Slice SubcompactionState::LargestUserKey() const {
+  if (has_penultimate_level_outputs_) {
+    Slice a = compaction_outputs_.LargestUserKey();
+    Slice b = penultimate_level_outputs_.LargestUserKey();
+    if (a.empty()) {
+      return b;
+    }
+    if (b.empty()) {
+      return a;
+    }
+    const Comparator* user_cmp =
+        compaction->column_family_data()->user_comparator();
+    if (user_cmp->Compare(a, b) < 0) {
+      return b;
+    } else {
+      return a;
+    }
+  } else {
+    return compaction_outputs_.LargestUserKey();
+  }
+}
+
+Status SubcompactionState::AddToOutput(
+    const CompactionIterator& iter,
+    const CompactionFileOpenFunc& open_file_func,
+    const CompactionFileCloseFunc& close_file_func) {
+  // update target output first
+  is_current_penultimate_level_ = iter.output_to_penultimate_level();
+  current_outputs_ = is_current_penultimate_level_ ? &penultimate_level_outputs_
+                                                   : &compaction_outputs_;
+  if (is_current_penultimate_level_) {
+    has_penultimate_level_outputs_ = true;
+  }
+
+  return Current().AddToOutput(iter, open_file_func, close_file_func);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/subcompaction_state.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/subcompaction_state.h
new file mode 100644
index 0000000000..b933a62a51
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/compaction/subcompaction_state.h
@@ -0,0 +1,220 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <optional>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_garbage_meter.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_outputs.h"
+#include "db/internal_stats.h"
+#include "db/output_validator.h"
+#include "db/range_del_aggregator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state and outputs for each sub-compaction.
+// It contains 2 `CompactionOutputs`:
+// 1. one for the normal output files
+// 2. another for the penultimate level outputs
+// A `current` pointer tracks the current output group: when `AddToOutput()`
+// is called, it checks the output of the current compaction_iterator key and
+// points `current` at the target output group. By default it points to the
+// normal compaction_outputs; if the compaction_iterator key should be placed
+// on the penultimate level, `current` is changed to point to
+// `penultimate_level_outputs`.
+// Later operations use `Current()` to get the target group.
+//
+//  +----------+     +-----------------------------+     +---------+
+//  | *current |---->|     compaction_outputs      |---->| output  |
+//  +----------+     +-----------------------------+     +---------+
+//       |                                               | output  |
+//       |                                               +---------+
+//       |                                               |   ...   |
+//       |
+//       |           +-----------------------------+     +---------+
+//       +---------->|  penultimate_level_outputs  |---->| output  |
+//                   +-----------------------------+     +---------+
+//                                                       |   ...   |
+
+class SubcompactionState {
+ public:
+  const Compaction* compaction;
+
+  // The boundaries of the key-range this compaction is interested in. No two
+  // sub-compactions may have overlapping key-ranges.
+  // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded
+  const std::optional<Slice> start, end;
+
+  // The return status of this sub-compaction
+  Status status;
+
+  // The return IO Status of this sub-compaction
+  IOStatus io_status;
+
+  // Notify on sub-compaction completion only if listener was notified on
+  // sub-compaction begin.
+  bool notify_on_subcompaction_completion = false;
+
+  // compaction job stats for this sub-compaction
+  CompactionJobStats compaction_job_stats;
+
+  // sub-compaction job id, which is used to identify different sub-compactions
+  // within the same compaction job.
+  const uint32_t sub_job_id;
+
+  Slice SmallestUserKey() const;
+
+  Slice LargestUserKey() const;
+
+  // Get all outputs from the subcompaction. For per_key_placement compaction,
+  // it returns both the last level outputs and penultimate level outputs.
+  OutputIterator GetOutputs() const;
+
+  // Assign the range-dels aggregator. Each range_del can only be assigned
+  // to one output level; for per_key_placement, that is going to be the
+  // penultimate level.
+  // TODO: This does not work for per_key_placement + user-defined timestamp +
+  // DeleteRange() combo. If user-defined timestamp is enabled,
+  // it is possible for a range tombstone to belong to the bottommost level (
+  // seqno < earliest snapshot) without being dropped (garbage collection
+  // for user-defined timestamp).
+  void AssignRangeDelAggregator(
+      std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
+    if (compaction->SupportsPerKeyPlacement()) {
+      penultimate_level_outputs_.AssignRangeDelAggregator(
+          std::move(range_del_agg));
+    } else {
+      compaction_outputs_.AssignRangeDelAggregator(std::move(range_del_agg));
+    }
+  }
+
+  void RemoveLastEmptyOutput() {
+    compaction_outputs_.RemoveLastEmptyOutput();
+    penultimate_level_outputs_.RemoveLastEmptyOutput();
+  }
+
+  void BuildSubcompactionJobInfo(
+      SubcompactionJobInfo& subcompaction_job_info) const {
+    const Compaction* c = compaction;
+    const ColumnFamilyData* cfd = c->column_family_data();
+
+    subcompaction_job_info.cf_id = cfd->GetID();
+    subcompaction_job_info.cf_name = cfd->GetName();
+    subcompaction_job_info.status = status;
+    subcompaction_job_info.subcompaction_job_id = static_cast<int>(sub_job_id);
+    subcompaction_job_info.base_input_level = c->start_level();
+    subcompaction_job_info.output_level = c->output_level();
+    subcompaction_job_info.stats = compaction_job_stats;
+  }
+
+  SubcompactionState() = delete;
+  SubcompactionState(const SubcompactionState&) = delete;
+  SubcompactionState& operator=(const SubcompactionState&) = delete;
+
+  SubcompactionState(Compaction* c, const std::optional<Slice> _start,
+                     const std::optional<Slice> _end, uint32_t _sub_job_id)
+      : compaction(c),
+        start(_start),
+        end(_end),
+        sub_job_id(_sub_job_id),
+        compaction_outputs_(c, /*is_penultimate_level=*/false),
+        penultimate_level_outputs_(c, /*is_penultimate_level=*/true) {
+    assert(compaction != nullptr);
+    // Set output split key (used for RoundRobin feature) only for normal
+    // compaction_outputs; the output-to-penultimate-level feature doesn't
+    // support the RoundRobin feature (and may never be supported, because for
+    // RoundRobin the data is mostly naturally sorted by time, so there is no
+    // need for per-key placement with output_to_penultimate_level).
+    compaction_outputs_.SetOutputSlitKey(start, end);
+  }
+
+  SubcompactionState(SubcompactionState&& state) noexcept
+      : compaction(state.compaction),
+        start(state.start),
+        end(state.end),
+        status(std::move(state.status)),
+        io_status(std::move(state.io_status)),
+        notify_on_subcompaction_completion(
+            state.notify_on_subcompaction_completion),
+        compaction_job_stats(std::move(state.compaction_job_stats)),
+        sub_job_id(state.sub_job_id),
+        compaction_outputs_(std::move(state.compaction_outputs_)),
+        penultimate_level_outputs_(std::move(state.penultimate_level_outputs_)),
+        is_current_penultimate_level_(state.is_current_penultimate_level_),
+        has_penultimate_level_outputs_(state.has_penultimate_level_outputs_) {
+    current_outputs_ = is_current_penultimate_level_
+                           ?
&penultimate_level_outputs_ + : &compaction_outputs_; + } + + bool HasPenultimateLevelOutputs() const { + return has_penultimate_level_outputs_ || + penultimate_level_outputs_.HasRangeDel(); + } + + bool IsCurrentPenultimateLevel() const { + return is_current_penultimate_level_; + } + + // Add all the new files from this compaction to version_edit + void AddOutputsEdit(VersionEdit* out_edit) const { + for (const auto& file : penultimate_level_outputs_.outputs_) { + out_edit->AddFile(compaction->GetPenultimateLevel(), file.meta); + } + for (const auto& file : compaction_outputs_.outputs_) { + out_edit->AddFile(compaction->output_level(), file.meta); + } + } + + void Cleanup(Cache* cache); + + void AggregateCompactionStats( + InternalStats::CompactionStatsFull& compaction_stats) const; + + CompactionOutputs& Current() const { + assert(current_outputs_); + return *current_outputs_; + } + + // Add compaction_iterator key/value to the `Current` output group. + Status AddToOutput(const CompactionIterator& iter, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func); + + // Close all compaction output files, both output_to_penultimate_level outputs + // and normal outputs. + Status CloseCompactionFiles(const Status& curr_status, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func) { + // Call FinishCompactionOutputFile() even if status is not ok: it needs to + // close the output file. + // CloseOutput() may open new compaction output files. + is_current_penultimate_level_ = true; + Status s = penultimate_level_outputs_.CloseOutput( + curr_status, open_file_func, close_file_func); + is_current_penultimate_level_ = false; + s = compaction_outputs_.CloseOutput(s, open_file_func, close_file_func); + return s; + } + + private: + // State kept for output being generated + CompactionOutputs compaction_outputs_; + CompactionOutputs penultimate_level_outputs_; + CompactionOutputs* current_outputs_ = &compaction_outputs_; + bool is_current_penultimate_level_ = false; + bool has_penultimate_level_outputs_ = false; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/convenience.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/convenience.cc new file mode 100644 index 0000000000..32cdfafaab --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/convenience.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+//
+
+
+#include "rocksdb/convenience.h"
+
+#include "db/db_impl/db_impl.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CancelAllBackgroundWork(DB* db, bool wait) {
+  (static_cast_with_check<DBImpl>(db->GetRootDB()))
+      ->CancelAllBackgroundWork(wait);
+}
+
+Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
+                          const Slice* begin, const Slice* end,
+                          bool include_end) {
+  RangePtr range(begin, end);
+  return DeleteFilesInRanges(db, column_family, &range, 1, include_end);
+}
+
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+                           const RangePtr* ranges, size_t n, bool include_end) {
+  return (static_cast_with_check<DBImpl>(db->GetRootDB()))
+      ->DeleteFilesInRanges(column_family, ranges, n, include_end);
+}
+
+Status VerifySstFileChecksum(const Options& options,
+                             const EnvOptions& env_options,
+                             const std::string& file_path) {
+  // TODO: plumb Env::IOActivity
+  const ReadOptions read_options;
+  return VerifySstFileChecksum(options, env_options, read_options, file_path);
+}
+Status VerifySstFileChecksum(const Options& options,
+                             const EnvOptions& env_options,
+                             const ReadOptions& read_options,
+                             const std::string& file_path,
+                             const SequenceNumber& largest_seqno) {
+  std::unique_ptr<FSRandomAccessFile> file;
+  uint64_t file_size;
+  InternalKeyComparator internal_comparator(options.comparator);
+  ImmutableOptions ioptions(options);
+
+  Status s = ioptions.fs->NewRandomAccessFile(
+      file_path, FileOptions(env_options), &file, nullptr);
+  if (s.ok()) {
+    s = ioptions.fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
+  } else {
+    return s;
+  }
+  std::unique_ptr<TableReader> table_reader;
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(
+          std::move(file), file_path, ioptions.clock, nullptr /* io_tracer */,
+          ioptions.stats /* stats */,
+          Histograms::SST_READ_MICROS /* hist_type */,
+          nullptr /* file_read_hist */, ioptions.rate_limiter.get()));
+  const bool kImmortal = true;
+  auto reader_options = TableReaderOptions(
+      ioptions, options.prefix_extractor, env_options, internal_comparator,
+      options.block_protection_bytes_per_key, false /* skip_filters */,
+      !kImmortal, false /* force_direct_prefetch */, -1 /* level */);
+  reader_options.largest_seqno = largest_seqno;
+  s = ioptions.table_factory->NewTableReader(
+      reader_options, std::move(file_reader), file_size, &table_reader,
+      false /* prefetch_index_and_filter_in_cache */);
+  if (!s.ok()) {
+    return s;
+  }
+  s = table_reader->VerifyChecksum(read_options,
+                                   TableReaderCaller::kUserVerifyChecksum);
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_filesnapshot.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_filesnapshot.cc
new file mode 100644
index 0000000000..f035041c7b
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_filesnapshot.cc
@@ -0,0 +1,436 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <set>
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/types.h"
+#include "test_util/sync_point.h"
+#include "util/file_checksum_helper.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status DBImpl::FlushForGetLiveFiles() {
+  mutex_.AssertHeld();
+
+  // flush all dirty data to disk.
+  Status status;
+  if (immutable_db_options_.atomic_flush) {
+    mutex_.Unlock();
+    status = AtomicFlushMemTables(FlushOptions(), FlushReason::kGetLiveFiles);
+    if (status.IsColumnFamilyDropped()) {
+      status = Status::OK();
+    }
+    mutex_.Lock();
+  } else {
+    for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      mutex_.Unlock();
+      status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
+      TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
+      TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
+      mutex_.Lock();
+      if (!status.ok() && !status.IsColumnFamilyDropped()) {
+        break;
+      } else if (status.IsColumnFamilyDropped()) {
+        status = Status::OK();
+      }
+    }
+  }
+  return status;
+}
+
+Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
+                            uint64_t* manifest_file_size, bool flush_memtable) {
+  *manifest_file_size = 0;
+
+  mutex_.Lock();
+
+  if (flush_memtable) {
+    Status status = FlushForGetLiveFiles();
+    if (!status.ok()) {
+      mutex_.Unlock();
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
+                      status.ToString().c_str());
+      return status;
+    }
+  }
+
+  // Make a set of all of the live table and blob files
+  std::vector<uint64_t> live_table_files;
+  std::vector<uint64_t> live_blob_files;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
+    cfd->current()->AddLiveFiles(&live_table_files, &live_blob_files);
+  }
+
+  ret.clear();
+  ret.reserve(live_table_files.size() + live_blob_files.size() +
+              3);  // for CURRENT + MANIFEST + OPTIONS
+
+  // Create names of the live files. The names are not absolute
+  // paths; instead they are relative to dbname_.
+  for (const auto& table_file_number : live_table_files) {
+    ret.emplace_back(MakeTableFileName("", table_file_number));
+  }
+
+  for (const auto& blob_file_number : live_blob_files) {
+    ret.emplace_back(BlobFileName("", blob_file_number));
+  }
+
+  ret.emplace_back(CurrentFileName(""));
+  ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number()));
+  // The OPTIONS file number is zero in read-write mode when OPTIONS file
+  // writing failed and the DB was configured with
+  // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
+  // number is zero when no OPTIONS file exists at all. In those cases we do
+  // not record any OPTIONS file in the live file list.
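+  // For illustration, with hypothetical file numbers `ret` now holds
+  // dbname_-relative names such as "/000011.sst", "/000012.blob", "/CURRENT"
+  // and "/MANIFEST-000004"; an OPTIONS entry of the form "/OPTIONS-<number>"
+  // is appended below when one exists.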
+  if (versions_->options_file_number() != 0) {
+    ret.emplace_back(OptionsFileName("", versions_->options_file_number()));
+  }
+
+  // find length of manifest file while holding the mutex lock
+  *manifest_file_size = versions_->manifest_file_size();
+
+  mutex_.Unlock();
+  return Status::OK();
+}
+
+Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
+  // Record tracked WALs as a (minimum) cross-check for directory scan
+  std::vector<uint64_t> required_by_manifest;
+
+  // If caller disabled deletions, this function should return files that are
+  // guaranteed not to be deleted until deletions are re-enabled. We need to
+  // wait for pending purges to finish since WalManager doesn't know which
+  // files are going to be purged. Additional purges won't be scheduled as
+  // long as deletions are disabled (so the below loop must terminate).
+  // Also note that we disable deletions anyway to avoid the case where a
+  // file is deleted in the middle of the scan, causing IO error.
+  Status deletions_disabled = DisableFileDeletions();
+  {
+    InstrumentedMutexLock l(&mutex_);
+    while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) {
+      bg_cv_.Wait();
+    }
+
+    // Record tracked WALs as a (minimum) cross-check for directory scan
+    const auto& manifest_wals = versions_->GetWalSet().GetWals();
+    required_by_manifest.reserve(manifest_wals.size());
+    for (const auto& wal : manifest_wals) {
+      required_by_manifest.push_back(wal.first);
+    }
+  }
+
+  Status s = wal_manager_.GetSortedWalFiles(files);
+
+  // DisableFileDeletions / EnableFileDeletions not supported in read-only DB
+  if (deletions_disabled.ok()) {
+    Status s2 = EnableFileDeletions(/*force*/ false);
+    assert(s2.ok());
+    s2.PermitUncheckedError();
+  } else {
+    assert(deletions_disabled.IsNotSupported());
+  }
+
+  if (s.ok()) {
+    // Verify the scan includes those required by the manifest (one sorted
+    // list is a superset of the other)
+    auto required = required_by_manifest.begin();
+    auto included = files.begin();
+
+    while (required != required_by_manifest.end()) {
+      if (included == files.end() || *required < (*included)->LogNumber()) {
+        // FAIL - did not find
+        return Status::Corruption(
+            "WAL file " + std::to_string(*required) +
+            " required by manifest but not in directory list");
+      }
+      if (*required == (*included)->LogNumber()) {
+        ++required;
+        ++included;
+      } else {
+        assert(*required > (*included)->LogNumber());
+        ++included;
+      }
+    }
+  }
+
+  return s;
+}
+
+Status DBImpl::GetCurrentWalFile(std::unique_ptr<LogFile>* current_log_file) {
+  uint64_t current_logfile_number;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    current_logfile_number = logfile_number_;
+  }
+
+  return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file);
+}
+
+Status DBImpl::GetLiveFilesStorageInfo(
+    const LiveFilesStorageInfoOptions& opts,
+    std::vector<LiveFileStorageInfo>* files) {
+  // To avoid returning partial results, only move results to files on success.
+  assert(files);
+  files->clear();
+  std::vector<LiveFileStorageInfo> results;
+
+  // NOTE: This implementation was largely migrated from Checkpoint.
+
+  Status s;
+  VectorLogPtr live_wal_files;
+  bool flush_memtable = true;
+  if (!immutable_db_options_.allow_2pc) {
+    if (opts.wal_size_for_flush == std::numeric_limits<uint64_t>::max()) {
+      flush_memtable = false;
+    } else if (opts.wal_size_for_flush > 0) {
+      // If the outstanding log files are small, we skip the flush.
+      s = GetSortedWalFiles(live_wal_files);
+
+      if (!s.ok()) {
+        return s;
+      }
+
+      // Don't flush column families if total log size is smaller than
+      // log_size_for_flush. We copy the log files instead.
+ // We may be able to cover 2PC case too. + uint64_t total_wal_size = 0; + for (auto& wal : live_wal_files) { + total_wal_size += wal->SizeFileBytes(); + } + if (total_wal_size < opts.wal_size_for_flush) { + flush_memtable = false; + } + live_wal_files.clear(); + } + } + + // This is a modified version of GetLiveFiles, to get access to more + // metadata. + mutex_.Lock(); + if (flush_memtable) { + Status status = FlushForGetLiveFiles(); + if (!status.ok()) { + mutex_.Unlock(); + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n", + status.ToString().c_str()); + return status; + } + } + + // Make a set of all of the live table and blob files + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + VersionStorageInfo& vsi = *cfd->current()->storage_info(); + auto& cf_paths = cfd->ioptions()->cf_paths; + + auto GetDir = [&](size_t path_id) { + // Matching TableFileName() behavior + if (path_id >= cf_paths.size()) { + assert(false); + return cf_paths.back().path; + } else { + return cf_paths[path_id].path; + } + }; + + for (int level = 0; level < vsi.num_levels(); ++level) { + const auto& level_files = vsi.LevelFiles(level); + for (const auto& meta : level_files) { + assert(meta); + + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = MakeTableFileName(meta->fd.GetNumber()); + info.directory = GetDir(meta->fd.GetPathId()); + info.file_number = meta->fd.GetNumber(); + info.file_type = kTableFile; + info.size = meta->fd.GetFileSize(); + if (opts.include_checksum_info) { + info.file_checksum_func_name = meta->file_checksum_func_name; + info.file_checksum = meta->file_checksum; + if (info.file_checksum_func_name.empty()) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + info.temperature = meta->temperature; + } + } + const auto& blob_files = vsi.GetBlobFiles(); + for (const auto& meta : blob_files) { + assert(meta); + + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = BlobFileName(meta->GetBlobFileNumber()); + info.directory = GetDir(/* path_id */ 0); + info.file_number = meta->GetBlobFileNumber(); + info.file_type = kBlobFile; + info.size = meta->GetBlobFileSize(); + if (opts.include_checksum_info) { + info.file_checksum_func_name = meta->GetChecksumMethod(); + info.file_checksum = meta->GetChecksumValue(); + if (info.file_checksum_func_name.empty()) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + // TODO?: info.temperature + } + } + + // Capture some final info before releasing mutex + const uint64_t manifest_number = versions_->manifest_file_number(); + const uint64_t manifest_size = versions_->manifest_file_size(); + const uint64_t options_number = versions_->options_file_number(); + const uint64_t options_size = versions_->options_file_size_; + const uint64_t min_log_num = MinLogNumberToKeep(); + + mutex_.Unlock(); + + std::string manifest_fname = DescriptorFileName(manifest_number); + { // MANIFEST + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = manifest_fname; + info.directory = GetName(); + info.file_number = manifest_number; + info.file_type = kDescriptorFile; + info.size = manifest_size; + info.trim_to_size = true; + if (opts.include_checksum_info) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = 
kUnknownFileChecksum; + } + } + + { // CURRENT + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = kCurrentFileName; + info.directory = GetName(); + info.file_type = kCurrentFile; + // CURRENT could be replaced so we have to record the contents as needed. + info.replacement_contents = manifest_fname + "\n"; + info.size = manifest_fname.size() + 1; + if (opts.include_checksum_info) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + + // The OPTIONS file number is zero in read-write mode when OPTIONS file + // writing failed and the DB was configured with + // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file + // number is zero when no OPTIONS file exist at all. In those cases we do not + // record any OPTIONS file in the live file list. + if (options_number != 0) { + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = OptionsFileName(options_number); + info.directory = GetName(); + info.file_number = options_number; + info.file_type = kOptionsFile; + info.size = options_size; + if (opts.include_checksum_info) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + + // Some legacy testing stuff TODO: carefully clean up obsolete parts + TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:FlushDone"); + + TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1"); + TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"); + + if (s.ok()) { + // To maximize the effectiveness of track_and_verify_wals_in_manifest, + // sync WAL when it is enabled. + s = FlushWAL( + immutable_db_options_.track_and_verify_wals_in_manifest /* sync */); + if (s.IsNotSupported()) { // read-only DB or similar + s = Status::OK(); + } + } + + TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1"); + TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2"); + + // If we have more than one column family, we also need to get WAL files. + if (s.ok()) { + s = GetSortedWalFiles(live_wal_files); + } + if (!s.ok()) { + return s; + } + + size_t wal_size = live_wal_files.size(); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Number of log files %" ROCKSDB_PRIszt, live_wal_files.size()); + + // Link WAL files. Copy exact size of last one because it is the only one + // that has changes after the last flush. + auto wal_dir = immutable_db_options_.GetWalDir(); + for (size_t i = 0; s.ok() && i < wal_size; ++i) { + if ((live_wal_files[i]->Type() == kAliveLogFile) && + (!flush_memtable || live_wal_files[i]->LogNumber() >= min_log_num)) { + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + auto f = live_wal_files[i]->PathName(); + assert(!f.empty() && f[0] == '/'); + info.relative_filename = f.substr(1); + info.directory = wal_dir; + info.file_number = live_wal_files[i]->LogNumber(); + info.file_type = kWalFile; + info.size = live_wal_files[i]->SizeFileBytes(); + // Only last should need to be trimmed + info.trim_to_size = (i + 1 == wal_size); + if (opts.include_checksum_info) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + } + + if (s.ok()) { + // Only move results to output on success. 
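+    // `results` was built locally, so on any earlier failure the caller sees
+    // the empty `*files` from the clear() at function entry rather than a
+    // partial list.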
+    *files = std::move(results);
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/compacted_db_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/compacted_db_impl.cc
new file mode 100644
index 0000000000..3d824baf29
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/compacted_db_impl.cc
@@ -0,0 +1,260 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/compacted_db_impl.h"
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "logging/logging.h"
+#include "table/get_context.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern void MarkKeyMayExist(void* arg);
+extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
+                      const Slice& v, bool hit_and_return);
+
+CompactedDBImpl::CompactedDBImpl(const DBOptions& options,
+                                 const std::string& dbname)
+    : DBImpl(options, dbname, /*seq_per_batch*/ false, /*batch_per_txn*/ true,
+             /*read_only*/ true),
+      cfd_(nullptr),
+      version_(nullptr),
+      user_comparator_(nullptr) {}
+
+CompactedDBImpl::~CompactedDBImpl() {}
+
+size_t CompactedDBImpl::FindFile(const Slice& key) {
+  size_t right = files_.num_files - 1;
+  auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
+    return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0;
+  };
+  return static_cast<size_t>(
+      std::lower_bound(files_.files, files_.files + right, key, cmp) -
+      files_.files);
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
+                            const Slice& key, PinnableSlice* value) {
+  return Get(options, /*column_family*/ nullptr, key, value,
+             /*timestamp*/ nullptr);
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
+                            const Slice& key, PinnableSlice* value,
+                            std::string* timestamp) {
+  if (options.io_activity != Env::IOActivity::kUnknown) {
+    return Status::InvalidArgument(
+        "Cannot call Get with `ReadOptions::io_activity` != "
+        "`Env::IOActivity::kUnknown`");
+  }
+  assert(user_comparator_);
+  if (options.timestamp) {
+    const Status s = FailIfTsMismatchCf(
+        DefaultColumnFamily(), *(options.timestamp), /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    const Status s = FailIfCfHasTs(DefaultColumnFamily());
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Clear the timestamps for returning results so that we can distinguish
+  // between a tombstone and a key that has never been written
+  if (timestamp) {
+    timestamp->clear();
+  }
+
+  GetWithTimestampReadCallback read_cb(kMaxSequenceNumber);
+  std::string* ts =
+      user_comparator_->timestamp_size() > 0 ? timestamp : nullptr;
+  LookupKey lkey(key, kMaxSequenceNumber, options.timestamp);
+  GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+                         GetContext::kNotFound, lkey.user_key(), value,
+                         /*columns=*/nullptr, ts, nullptr, nullptr, true,
+                         nullptr, nullptr, nullptr, nullptr, &read_cb);
+
+  const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())];
+  if (user_comparator_->CompareWithoutTimestamp(
+          key, /*a_has_ts=*/false,
+          ExtractUserKeyAndStripTimestamp(f.smallest_key,
+                                          user_comparator_->timestamp_size()),
+          /*b_has_ts=*/false) < 0) {
+    return Status::NotFound();
+  }
+  Status s = f.fd.table_reader->Get(options, lkey.internal_key(), &get_context,
+                                    nullptr);
+  if (!s.ok() && !s.IsNotFound()) {
+    return s;
+  }
+  if (get_context.State() == GetContext::kFound) {
+    return Status::OK();
+  }
+  return Status::NotFound();
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(
+    const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  return MultiGet(options, keys, values, /*timestamps*/ nullptr);
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(
+    const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+    const std::vector<Slice>& keys, std::vector<std::string>* values,
+    std::vector<std::string>* timestamps) {
+  assert(user_comparator_);
+  size_t num_keys = keys.size();
+
+  if (options.timestamp) {
+    Status s = FailIfTsMismatchCf(DefaultColumnFamily(), *(options.timestamp),
+                                  /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return std::vector<Status>(num_keys, s);
+    }
+  } else {
+    Status s = FailIfCfHasTs(DefaultColumnFamily());
+    if (!s.ok()) {
+      return std::vector<Status>(num_keys, s);
+    }
+  }
+
+  // Clear the timestamps for returning results so that we can distinguish
+  // between a tombstone and a key that has never been written
+  if (timestamps) {
+    for (auto& ts : *timestamps) {
+      ts.clear();
+    }
+  }
+
+  GetWithTimestampReadCallback read_cb(kMaxSequenceNumber);
+  autovector<TableReader*, 16> reader_list;
+  for (const auto& key : keys) {
+    LookupKey lkey(key, kMaxSequenceNumber, options.timestamp);
+    const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())];
+    if (user_comparator_->CompareWithoutTimestamp(
+            key, /*a_has_ts=*/false,
+            ExtractUserKeyAndStripTimestamp(f.smallest_key,
+                                            user_comparator_->timestamp_size()),
+            /*b_has_ts=*/false) < 0) {
+      reader_list.push_back(nullptr);
+    } else {
+      f.fd.table_reader->Prepare(lkey.internal_key());
+      reader_list.push_back(f.fd.table_reader);
+    }
+  }
+  std::vector<Status> statuses(num_keys, Status::NotFound());
+  values->resize(num_keys);
+  if (timestamps) {
+    timestamps->resize(num_keys);
+  }
+  int idx = 0;
+  for (auto* r : reader_list) {
+    if (r != nullptr) {
+      PinnableSlice pinnable_val;
+      std::string& value = (*values)[idx];
+      LookupKey lkey(keys[idx], kMaxSequenceNumber, options.timestamp);
+      std::string* timestamp = timestamps ? &(*timestamps)[idx] : nullptr;
+      GetContext get_context(
+          user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound,
+          lkey.user_key(), &pinnable_val, /*columns=*/nullptr,
+          user_comparator_->timestamp_size() > 0 ? timestamp : nullptr, nullptr,
+          nullptr, true, nullptr, nullptr, nullptr, nullptr, &read_cb);
+      Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr);
+      assert(static_cast<size_t>(idx) < statuses.size());
+      if (!s.ok() && !s.IsNotFound()) {
+        statuses[idx] = s;
+      } else {
+        value.assign(pinnable_val.data(), pinnable_val.size());
+        if (get_context.State() == GetContext::kFound) {
+          statuses[idx] = Status::OK();
+        }
+      }
+    }
+    ++idx;
+  }
+  return statuses;
+}
+
+Status CompactedDBImpl::Init(const Options& options) {
+  SuperVersionContext sv_context(/* create_superversion */ true);
+  mutex_.Lock();
+  ColumnFamilyDescriptor cf(kDefaultColumnFamilyName,
+                            ColumnFamilyOptions(options));
+  Status s = Recover({cf}, true /* read only */, false, true);
+  if (s.ok()) {
+    cfd_ = static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+               ->cfd();
+    cfd_->InstallSuperVersion(&sv_context, &mutex_);
+  }
+  mutex_.Unlock();
+  sv_context.Clean();
+  if (!s.ok()) {
+    return s;
+  }
+  NewThreadStatusCfInfo(cfd_);
+  version_ = cfd_->GetSuperVersion()->current;
+  user_comparator_ = cfd_->user_comparator();
+  auto* vstorage = version_->storage_info();
+  if (vstorage->num_non_empty_levels() == 0) {
+    return Status::NotSupported("no file exists");
+  }
+  const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0);
+  // L0 should not have files
+  if (l0.num_files > 1) {
+    return Status::NotSupported("L0 contain more than 1 file");
+  }
+  if (l0.num_files == 1) {
+    if (vstorage->num_non_empty_levels() > 1) {
+      return Status::NotSupported("Both L0 and other level contain files");
+    }
+    files_ = l0;
+    return Status::OK();
+  }
+
+  for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) {
+    if (vstorage->LevelFilesBrief(i).num_files > 0) {
+      return Status::NotSupported("Other levels also contain files");
+    }
+  }
+
+  int level = vstorage->num_non_empty_levels() - 1;
+  if (vstorage->LevelFilesBrief(level).num_files > 0) {
+    files_ = vstorage->LevelFilesBrief(level);
+    return Status::OK();
+  }
+  return Status::NotSupported("no file exists");
+}
+
+Status CompactedDBImpl::Open(const Options& options, const std::string& dbname,
+                             DB** dbptr) {
+  *dbptr = nullptr;
+
+  if (options.max_open_files != -1) {
+    return Status::InvalidArgument("require max_open_files = -1");
+  }
+  if (options.merge_operator.get() != nullptr) {
+    return Status::InvalidArgument("merge operator is not supported");
+  }
+  DBOptions db_options(options);
+  std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));
+  Status s = db->Init(options);
+  if (s.ok()) {
+    s = db->StartPeriodicTaskScheduler();
+  }
+  if (s.ok()) {
+    ROCKS_LOG_INFO(db->immutable_db_options_.info_log,
+                   "Opened the db as fully compacted mode");
+    LogFlush(db->immutable_db_options_.info_log);
+    *dbptr = db.release();
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/compacted_db_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/compacted_db_impl.h
new file mode 100644
index 0000000000..9879d81b61
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/compacted_db_impl.h
@@ -0,0 +1,157 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO: Share common structure with DBImplSecondary and DBImplReadOnly
+class CompactedDBImpl : public DBImpl {
+ public:
+  CompactedDBImpl(const DBOptions& options, const std::string& dbname);
+  // No copying allowed
+  CompactedDBImpl(const CompactedDBImpl&) = delete;
+  void operator=(const CompactedDBImpl&) = delete;
+
+  ~CompactedDBImpl() override;
+
+  static Status Open(const Options& options, const std::string& dbname,
+                     DB** dbptr);
+
+  // Implementations of the DB interface
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value,
+             std::string* timestamp) override;
+
+  using DB::MultiGet;
+  // Note that CompactedDBImpl::MultiGet is not the optimized version of
+  // MultiGet to use.
+  // TODO: optimize CompactedDBImpl::MultiGet, see DBImpl::MultiGet for details.
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  std::vector<Status> MultiGet(const ReadOptions& options,
+                               const std::vector<ColumnFamilyHandle*>&,
+                               const std::vector<Slice>& keys,
+                               std::vector<std::string>* values,
+                               std::vector<std::string>* timestamps) override;
+
+  using DBImpl::Put;
+  virtual Status Put(const WriteOptions& /*options*/,
+                     ColumnFamilyHandle* /*column_family*/,
+                     const Slice& /*key*/, const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  using DBImpl::PutEntity;
+  Status PutEntity(const WriteOptions& /* options */,
+                   ColumnFamilyHandle* /* column_family */,
+                   const Slice& /* key */,
+                   const WideColumns& /* columns */) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  using DBImpl::Merge;
+  virtual Status Merge(const WriteOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/,
+                       const Slice& /*key*/, const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  using DBImpl::Delete;
+  virtual Status Delete(const WriteOptions& /*options*/,
+                        ColumnFamilyHandle* /*column_family*/,
+                        const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status Write(const WriteOptions& /*options*/,
+                       WriteBatch* /*updates*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::CompactRange;
+  virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+                              ColumnFamilyHandle* /*column_family*/,
+                              const Slice* /*begin*/,
+                              const Slice* /*end*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  virtual Status DisableFileDeletions() override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status EnableFileDeletions(bool /*force*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status GetLiveFiles(std::vector<std::string>& ret,
+                              uint64_t* manifest_file_size,
+                              bool /*flush_memtable*/) override {
+    return DBImpl::GetLiveFiles(ret, manifest_file_size,
+                                false /* flush_memtable */);
+  }
+  using DBImpl::Flush;
+  virtual Status Flush(const FlushOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  virtual Status SyncWAL() override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  using DB::IngestExternalFile;
+  virtual Status IngestExternalFile(
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*external_files*/,
+      const IngestExternalFileOptions& /*ingestion_options*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& /*options*/,
+      const std::string& /*column_family_name*/,
+      const ImportColumnFamilyOptions& /*import_options*/,
+      const ExportImportFilesMetaData& /*metadata*/,
+      ColumnFamilyHandle** /*handle*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  using DB::ClipColumnFamily;
+  virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/,
+                                  const Slice& /*begin*/,
+                                  const Slice& /*end*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  // FIXME: some missing overrides for more "write" functions
+  // Share with DBImplReadOnly?
+
+ protected:
+  Status FlushForGetLiveFiles() override {
+    // No-op for read-only DB
+    return Status::OK();
+  }
+
+ private:
+  friend class DB;
+  inline size_t FindFile(const Slice& key);
+  Status Init(const Options& options);
+
+  ColumnFamilyData* cfd_;
+  Version* version_;
+  const Comparator* user_comparator_;
+  LevelFilesBrief files_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl.cc
new file mode 100644
index 0000000000..2ed0caf7f2
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl.cc
@@ -0,0 +1,6132 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h" + +#include +#ifdef OS_SOLARIS +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/arena_wrapped_db_iter.h" +#include "db/builder.h" +#include "db/compaction/compaction_job.h" +#include "db/db_info_dumper.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "db/external_sst_file_ingestion_job.h" +#include "db/flush_job.h" +#include "db/forward_iterator.h" +#include "db/import_column_family_job.h" +#include "db/job_context.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/malloc_stats.h" +#include "db/memtable.h" +#include "db/memtable_list.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/periodic_task_scheduler.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/table_cache.h" +#include "db/table_properties_collector.h" +#include "db/transaction_log_impl.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "db/write_callback.h" +#include "env/unique_id_gen.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "file/sst_file_manager_impl.h" +#include "logging/auto_roll_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "monitoring/in_memory_stats_history.h" +#include "monitoring/instrumented_mutex.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" +#include "monitoring/thread_status_updater.h" +#include "monitoring/thread_status_util.h" +#include "options/cf_options.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/statistics.h" +#include "rocksdb/stats_history.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/version.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/get_context.h" +#include "table/merging_iterator.h" +#include "table/multiget_context.h" +#include "table/sst_file_dumper.h" +#include "table/table_builder.h" +#include "table/two_level_iterator.h" +#include "table/unique_id_impl.h" +#include "test_util/sync_point.h" +#include "trace_replay/trace_replay.h" +#include "util/autovector.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/defer.h" +#include "util/distributed_mutex.h" +#include "util/hash_containers.h" +#include "util/mutexlock.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "utilities/trace/replayer_impl.h" + +namespace ROCKSDB_NAMESPACE { + +const std::string kDefaultColumnFamilyName("default"); +const std::string kPersistentStatsColumnFamilyName( + "___rocksdb_stats_history___"); +void DumpRocksDBBuildVersion(Logger* log); + +CompressionType GetCompressionFlush( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options) { + // Compressing memtable flushes might not help unless the sequential load + // optimization is used for leveled compaction. 
+  // latency overhead is not offset by saving much space.
+  if (ioptions.compaction_style == kCompactionStyleUniversal &&
+      mutable_cf_options.compaction_options_universal
+              .compression_size_percent >= 0) {
+    return kNoCompression;
+  }
+  if (mutable_cf_options.compression_per_level.empty()) {
+    return mutable_cf_options.compression;
+  } else {
+    // For leveled compress when min_level_to_compress != 0.
+    return mutable_cf_options.compression_per_level[0];
+  }
+}
+
+namespace {
+void DumpSupportInfo(Logger* logger) {
+  ROCKS_LOG_HEADER(logger, "Compression algorithms supported:");
+  for (auto& compression : OptionsHelper::compression_type_string_map) {
+    if (compression.second != kNoCompression &&
+        compression.second != kDisableCompressionOption) {
+      ROCKS_LOG_HEADER(logger, "\t%s supported: %d", compression.first.c_str(),
+                       CompressionTypeSupported(compression.second));
+    }
+  }
+  ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %s",
+                   crc32c::IsFastCrc32Supported().c_str());
+
+  ROCKS_LOG_HEADER(logger, "DMutex implementation: %s", DMutex::kName());
+}
+}  // namespace
+
+DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
+               const bool seq_per_batch, const bool batch_per_txn,
+               bool read_only)
+    : dbname_(dbname),
+      own_info_log_(options.info_log == nullptr),
+      init_logger_creation_s_(),
+      initial_db_options_(SanitizeOptions(dbname, options, read_only,
+                                          &init_logger_creation_s_)),
+      env_(initial_db_options_.env),
+      io_tracer_(std::make_shared<IOTracer>()),
+      immutable_db_options_(initial_db_options_),
+      fs_(immutable_db_options_.fs, io_tracer_),
+      mutable_db_options_(initial_db_options_),
+      stats_(immutable_db_options_.stats),
+#ifdef COERCE_CONTEXT_SWITCH
+      mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS, &bg_cv_,
+             immutable_db_options_.use_adaptive_mutex),
+#else   // COERCE_CONTEXT_SWITCH
+      mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS,
+             immutable_db_options_.use_adaptive_mutex),
+#endif  // COERCE_CONTEXT_SWITCH
+      default_cf_handle_(nullptr),
+      error_handler_(this, immutable_db_options_, &mutex_),
+      event_logger_(immutable_db_options_.info_log.get()),
+      max_total_in_memory_state_(0),
+      file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+      file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite(
+          file_options_, immutable_db_options_)),
+      seq_per_batch_(seq_per_batch),
+      batch_per_txn_(batch_per_txn),
+      next_job_id_(1),
+      shutting_down_(false),
+      db_lock_(nullptr),
+      manual_compaction_paused_(false),
+      bg_cv_(&mutex_),
+      logfile_number_(0),
+      log_dir_synced_(false),
+      log_empty_(true),
+      persist_stats_cf_handle_(nullptr),
+      log_sync_cv_(&log_write_mutex_),
+      total_log_size_(0),
+      is_snapshot_supported_(true),
+      write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
+      write_thread_(immutable_db_options_),
+      nonmem_write_thread_(immutable_db_options_),
+      write_controller_(mutable_db_options_.delayed_write_rate),
+      last_batch_group_size_(0),
+      unscheduled_flushes_(0),
+      unscheduled_compactions_(0),
+      bg_bottom_compaction_scheduled_(0),
+      bg_compaction_scheduled_(0),
+      num_running_compactions_(0),
+      bg_flush_scheduled_(0),
+      num_running_flushes_(0),
+      bg_purge_scheduled_(0),
+      disable_delete_obsolete_files_(0),
+      pending_purge_obsolete_files_(0),
+      delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()),
+      last_stats_dump_time_microsec_(0),
+      has_unpersisted_data_(false),
+      unable_to_release_oldest_log_(false),
+      num_running_ingest_file_(0),
wal_manager_(immutable_db_options_, file_options_, io_tracer_,
+                   seq_per_batch),
+      bg_work_paused_(0),
+      bg_compaction_paused_(0),
+      refitting_level_(false),
+      opened_successfully_(false),
+      periodic_task_scheduler_(),
+      two_write_queues_(options.two_write_queues),
+      manual_wal_flush_(options.manual_wal_flush),
+      // last_sequence_ is always maintained by the main queue that also
+      // writes to the memtable. When two_write_queues_ is disabled, the last
+      // seq in the memtable is the same as the last seq published to the
+      // readers. When it is enabled but seq_per_batch_ is disabled, the last
+      // seq in the memtable still indicates the last published seq, since
+      // wal-only writes that go to the 2nd queue do not consume a sequence
+      // number. Otherwise writes performed by the 2nd queue could change what
+      // is visible to the readers. In these cases,
+      // last_seq_same_as_publish_seq_ == false and the 2nd queue maintains a
+      // separate variable to indicate the last published sequence.
+      last_seq_same_as_publish_seq_(
+          !(seq_per_batch && options.two_write_queues)),
+      // Since seq_per_batch_ is currently set only by WritePreparedTxn, which
+      // requires a custom gc for compaction, we use that to set use_custom_gc_
+      // as well.
+      use_custom_gc_(seq_per_batch),
+      shutdown_initiated_(false),
+      own_sfm_(options.sst_file_manager == nullptr),
+      closed_(false),
+      atomic_flush_install_cv_(&mutex_),
+      blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_,
+                     &error_handler_, &event_logger_,
+                     immutable_db_options_.listeners, dbname_),
+      lock_wal_count_(0) {
+  // !batch_per_txn_ implies seq_per_batch_ because it is only unset for
+  // WriteUnprepared, which should use seq_per_batch_.
+  assert(batch_per_txn_ || seq_per_batch_);
+
+  // Reserve ten files or so for other uses and give the rest to TableCache.
+  // Give a large number for setting of "infinite" open files.
+  const int table_cache_size = (mutable_db_options_.max_open_files == -1)
+                                   ?
TableCache::kInfiniteCapacity
+                                   : mutable_db_options_.max_open_files - 10;
+  LRUCacheOptions co;
+  co.capacity = table_cache_size;
+  co.num_shard_bits = immutable_db_options_.table_cache_numshardbits;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  // TODO: Consider a non-fixed seed once test fallout (prefetch_test) is
+  // dealt with
+  co.hash_seed = 0;
+  table_cache_ = NewLRUCache(co);
+  SetDbSessionId();
+  assert(!db_session_id_.empty());
+
+  periodic_task_functions_.emplace(PeriodicTaskType::kDumpStats,
+                                   [this]() { this->DumpStats(); });
+  periodic_task_functions_.emplace(PeriodicTaskType::kPersistStats,
+                                   [this]() { this->PersistStats(); });
+  periodic_task_functions_.emplace(PeriodicTaskType::kFlushInfoLog,
+                                   [this]() { this->FlushInfoLog(); });
+  periodic_task_functions_.emplace(
+      PeriodicTaskType::kRecordSeqnoTime,
+      [this]() { this->RecordSeqnoToTimeMapping(); });
+
+  versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_,
+                                 table_cache_.get(), write_buffer_manager_,
+                                 &write_controller_, &block_cache_tracer_,
+                                 io_tracer_, db_id_, db_session_id_));
+  column_family_memtables_.reset(
+      new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+
+  DumpRocksDBBuildVersion(immutable_db_options_.info_log.get());
+  DumpDBFileSummary(immutable_db_options_, dbname_, db_session_id_);
+  immutable_db_options_.Dump(immutable_db_options_.info_log.get());
+  mutable_db_options_.Dump(immutable_db_options_.info_log.get());
+  DumpSupportInfo(immutable_db_options_.info_log.get());
+
+  max_total_wal_size_.store(mutable_db_options_.max_total_wal_size,
+                            std::memory_order_relaxed);
+  if (write_buffer_manager_) {
+    wbm_stall_.reset(new WBMStallInterface());
+  }
+}
+
+Status DBImpl::Resume() {
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Resuming DB");
+
+  InstrumentedMutexLock db_mutex(&mutex_);
+
+  if (!error_handler_.IsDBStopped() && !error_handler_.IsBGWorkStopped()) {
+    // Nothing to do
+    return Status::OK();
+  }
+
+  if (error_handler_.IsRecoveryInProgress()) {
+    // Don't allow a mix of manual and automatic recovery
+    return Status::Busy();
+  }
+
+  mutex_.Unlock();
+  Status s = error_handler_.RecoverFromBGError(true);
+  mutex_.Lock();
+  return s;
+}
+
+// This function implements the guts of recovery from a background error. It
+// is eventually called for both manual as well as automatic recovery. It does
+// the following -
+// 1. Wait for currently scheduled background flush/compaction to exit, in
+//    order to avoid inadvertently causing an error and concluding that
+//    recovery failed
+// 2. Flush memtables if there's any data, for all the CFs. This may result in
+//    another error, which will be saved by error_handler_ and reported later
+//    as the recovery status
+// 3. Find and delete any obsolete files
+// 4. Schedule compactions if needed for all the CFs.
This is needed as the +// flush in the prior step might have been a no-op for some CFs, which +// means a new super version wouldn't have been installed +Status DBImpl::ResumeImpl(DBRecoverContext context) { + mutex_.AssertHeld(); + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + WaitForBackgroundWork(); + + Status s; + if (shutdown_initiated_) { + // Returning shutdown status to SFM during auto recovery will cause it + // to abort the recovery and allow the shutdown to progress + s = Status::ShutdownInProgress(); + } + + if (s.ok()) { + Status bg_error = error_handler_.GetBGError(); + if (bg_error.severity() > Status::Severity::kHardError) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "DB resume requested but failed due to Fatal/Unrecoverable error"); + s = bg_error; + } + } + + // Make sure the IO Status stored in version set is set to OK. + bool file_deletion_disabled = !IsFileDeletionsEnabled(); + if (s.ok()) { + IOStatus io_s = versions_->io_status(); + if (io_s.IsIOError()) { + // If resuming from IOError resulted from MANIFEST write, then assert + // that we must have already set the MANIFEST writer to nullptr during + // clean-up phase MANIFEST writing. We must have also disabled file + // deletions. + assert(!versions_->descriptor_log_); + assert(file_deletion_disabled); + // Since we are trying to recover from MANIFEST write error, we need to + // switch to a new MANIFEST anyway. The old MANIFEST can be corrupted. + // Therefore, force writing a dummy version edit because we do not know + // whether there are flush jobs with non-empty data to flush, triggering + // appends to MANIFEST. + VersionEdit edit; + auto cfh = + static_cast_with_check(default_cf_handle_); + assert(cfh); + ColumnFamilyData* cfd = cfh->cfd(); + const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions(); + s = versions_->LogAndApply(cfd, cf_opts, read_options, &edit, &mutex_, + directories_.GetDbDir()); + if (!s.ok()) { + io_s = versions_->io_status(); + if (!io_s.ok()) { + s = error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWrite); + } + } + } + } + + // We cannot guarantee consistency of the WAL. So force flush Memtables of + // all the column families + if (s.ok()) { + FlushOptions flush_opts; + // We allow flush to stall write since we are trying to resume from error. + flush_opts.allow_write_stall = true; + if (immutable_db_options_.atomic_flush) { + mutex_.Unlock(); + s = AtomicFlushMemTables(flush_opts, context.flush_reason); + mutex_.Lock(); + } else { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + InstrumentedMutexUnlock u(&mutex_); + s = FlushMemTable(cfd, flush_opts, context.flush_reason); + if (!s.ok()) { + break; + } + } + } + if (!s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "DB resume requested but failed due to Flush failure [%s]", + s.ToString().c_str()); + } + } + + JobContext job_context(0); + FindObsoleteFiles(&job_context, true); + mutex_.Unlock(); + + job_context.manifest_file_number = 1; + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + + if (s.ok()) { + assert(versions_->io_status().ok()); + // If we reach here, we should re-enable file deletions if it was disabled + // during previous error handling. 
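+  // Illustrative sketch (not upstream code): from the caller's side, the
+  // manual half of this recovery path is typically driven like so, assuming
+  // a DB* that hit a hard background error such as running out of space:
+  //
+  //   Status s = db->Put(WriteOptions(), key, value);
+  //   if (!s.ok() && s.IsNoSpace()) {
+  //     // free up disk space, then retry recovery explicitly:
+  //     Status r = db->Resume();  // ends up in DBImpl::ResumeImpl()
+  //   }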
+ if (file_deletion_disabled) { + // Always return ok + s = EnableFileDeletions(/*force=*/true); + if (!s.ok()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "DB resume requested but could not enable file deletions [%s]", + s.ToString().c_str()); + assert(false); + } + } + } + + mutex_.Lock(); + if (s.ok()) { + // This will notify and unblock threads waiting for error recovery to + // finish. Those previouly waiting threads can now proceed, which may + // include closing the db. + s = error_handler_.ClearBGError(); + } else { + // NOTE: this is needed to pass ASSERT_STATUS_CHECKED + // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test. + // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952 + error_handler_.GetRecoveryError().PermitUncheckedError(); + } + + if (s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); + } else { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Failed to resume DB [%s]", + s.ToString().c_str()); + } + + // Check for shutdown again before scheduling further compactions, + // since we released and re-acquired the lock above + if (shutdown_initiated_) { + s = Status::ShutdownInProgress(); + } + if (s.ok()) { + for (auto cfd : *versions_->GetColumnFamilySet()) { + SchedulePendingCompaction(cfd); + } + MaybeScheduleFlushOrCompaction(); + } + + // Wake up any waiters - in this case, it could be the shutdown thread + bg_cv_.SignalAll(); + + // No need to check BGError again. If something happened, event listener would + // be notified and the operation causing it would have failed + return s; +} + +void DBImpl::WaitForBackgroundWork() { + // Wait for background work to finish + while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || + bg_flush_scheduled_) { + bg_cv_.Wait(); + } +} + +// Will lock the mutex_, will wait for completion if wait is true +void DBImpl::CancelAllBackgroundWork(bool wait) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Shutdown: canceling all background work"); + + for (uint8_t task_type = 0; + task_type < static_cast(PeriodicTaskType::kMax); task_type++) { + Status s = periodic_task_scheduler_.Unregister( + static_cast(task_type)); + if (!s.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to unregister periodic task %d, status: %s", + task_type, s.ToString().c_str()); + } + } + + InstrumentedMutexLock l(&mutex_); + if (!shutting_down_.load(std::memory_order_acquire) && + has_unpersisted_data_.load(std::memory_order_relaxed) && + !mutable_db_options_.avoid_flush_during_shutdown) { + if (immutable_db_options_.atomic_flush) { + mutex_.Unlock(); + Status s = AtomicFlushMemTables(FlushOptions(), FlushReason::kShutDown); + s.PermitUncheckedError(); //**TODO: What to do on error? + mutex_.Lock(); + } else { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { + if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) { + InstrumentedMutexUnlock u(&mutex_); + Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown); + s.PermitUncheckedError(); //**TODO: What to do on error? 
+ } + } + } + } + + shutting_down_.store(true, std::memory_order_release); + bg_cv_.SignalAll(); + if (!wait) { + return; + } + WaitForBackgroundWork(); +} + +Status DBImpl::MaybeReleaseTimestampedSnapshotsAndCheck() { + size_t num_snapshots = 0; + ReleaseTimestampedSnapshotsOlderThan(std::numeric_limits::max(), + &num_snapshots); + + // If there is unreleased snapshot, fail the close call + if (num_snapshots > 0) { + return Status::Aborted("Cannot close DB with unreleased snapshot."); + } + + return Status::OK(); +} + +Status DBImpl::CloseHelper() { + // Guarantee that there is no background error recovery in progress before + // continuing with the shutdown + mutex_.Lock(); + shutdown_initiated_ = true; + error_handler_.CancelErrorRecovery(); + while (error_handler_.IsRecoveryInProgress()) { + bg_cv_.Wait(); + } + mutex_.Unlock(); + + // Below check is added as recovery_error_ is not checked and it causes crash + // in DBSSTTest.DBWithMaxSpaceAllowedWithBlobFiles when space limit is + // reached. + error_handler_.GetRecoveryError().PermitUncheckedError(); + + // CancelAllBackgroundWork called with false means we just set the shutdown + // marker. After this we do a variant of the waiting and unschedule work + // (to consider: moving all the waiting into CancelAllBackgroundWork(true)) + CancelAllBackgroundWork(false); + + // Cancel manual compaction if there's any + if (HasPendingManualCompaction()) { + DisableManualCompaction(); + } + mutex_.Lock(); + // Unschedule all tasks for this DB + for (uint8_t i = 0; i < static_cast(TaskType::kCount); i++) { + env_->UnSchedule(GetTaskTag(i), Env::Priority::BOTTOM); + env_->UnSchedule(GetTaskTag(i), Env::Priority::LOW); + env_->UnSchedule(GetTaskTag(i), Env::Priority::HIGH); + } + + Status ret = Status::OK(); + + // Wait for background work to finish + while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || + bg_flush_scheduled_ || bg_purge_scheduled_ || + pending_purge_obsolete_files_ || + error_handler_.IsRecoveryInProgress()) { + TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob"); + bg_cv_.Wait(); + } + TEST_SYNC_POINT_CALLBACK("DBImpl::CloseHelper:PendingPurgeFinished", + &files_grabbed_for_purge_); + EraseThreadStatusDbInfo(); + flush_scheduler_.Clear(); + trim_history_scheduler_.Clear(); + + while (!flush_queue_.empty()) { + const FlushRequest& flush_req = PopFirstFromFlushQueue(); + for (const auto& iter : flush_req.cfd_to_max_mem_id_to_persist) { + iter.first->UnrefAndTryDelete(); + } + } + + while (!compaction_queue_.empty()) { + auto cfd = PopFirstFromCompactionQueue(); + cfd->UnrefAndTryDelete(); + } + + if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) { + // we need to delete handle outside of lock because it does its own locking + mutex_.Unlock(); + if (default_cf_handle_) { + delete default_cf_handle_; + default_cf_handle_ = nullptr; + } + if (persist_stats_cf_handle_) { + delete persist_stats_cf_handle_; + persist_stats_cf_handle_ = nullptr; + } + mutex_.Lock(); + } + + // Clean up obsolete files due to SuperVersion release. + // (1) Need to delete to obsolete files before closing because RepairDB() + // scans all existing files in the file system and builds manifest file. + // Keeping obsolete files confuses the repair process. + // (2) Need to check if we Open()/Recover() the DB successfully before + // deleting because if VersionSet recover fails (may be due to corrupted + // manifest file), it is not able to identify live files correctly. 
As a + // result, all "live" files can get deleted by accident. However, corrupted + // manifest is recoverable by RepairDB(). + if (opened_successfully_) { + JobContext job_context(next_job_id_.fetch_add(1)); + FindObsoleteFiles(&job_context, true); + + mutex_.Unlock(); + // manifest number starting from 2 + job_context.manifest_file_number = 1; + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + mutex_.Lock(); + } + { + InstrumentedMutexLock lock(&log_write_mutex_); + for (auto l : logs_to_free_) { + delete l; + } + for (auto& log : logs_) { + uint64_t log_number = log.writer->get_log_number(); + Status s = log.ClearWriter(); + if (!s.ok()) { + ROCKS_LOG_WARN( + immutable_db_options_.info_log, + "Unable to Sync WAL file %s with error -- %s", + LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(), + s.ToString().c_str()); + // Retain the first error + if (ret.ok()) { + ret = s; + } + } + } + logs_.clear(); + } + + // Table cache may have table handles holding blocks from the block cache. + // We need to release them before the block cache is destroyed. The block + // cache may be destroyed inside versions_.reset(), when column family data + // list is destroyed, so leaving handles in table cache after + // versions_.reset() may cause issues. + // Here we clean all unreferenced handles in table cache. + // Now we assume all user queries have finished, so only version set itself + // can possibly hold the blocks from block cache. After releasing unreferenced + // handles here, only handles held by version set left and inside + // versions_.reset(), we will release them. There, we need to make sure every + // time a handle is released, we erase it from the cache too. By doing that, + // we can guarantee that after versions_.reset(), table cache is empty + // so the cache can be safely destroyed. + table_cache_->EraseUnRefEntries(); + + for (auto& txn_entry : recovered_transactions_) { + delete txn_entry.second; + } + + // versions need to be destroyed before table_cache since it can hold + // references to table_cache. + versions_.reset(); + mutex_.Unlock(); + if (db_lock_ != nullptr) { + // TODO: Check for unlock error + env_->UnlockFile(db_lock_).PermitUncheckedError(); + } + + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete"); + LogFlush(immutable_db_options_.info_log); + + // If the sst_file_manager was allocated by us during DB::Open(), ccall + // Close() on it before closing the info_log. Otherwise, background thread + // in SstFileManagerImpl might try to log something + if (immutable_db_options_.sst_file_manager && own_sfm_) { + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + sfm->Close(); + } + + if (immutable_db_options_.info_log && own_info_log_) { + Status s = immutable_db_options_.info_log->Close(); + if (!s.ok() && !s.IsNotSupported() && ret.ok()) { + ret = s; + } + } + + if (write_buffer_manager_ && wbm_stall_) { + write_buffer_manager_->RemoveDBFromQueue(wbm_stall_.get()); + } + + IOStatus io_s = directories_.Close(IOOptions(), nullptr /* dbg */); + if (!io_s.ok()) { + ret = io_s; + } + if (ret.IsAborted()) { + // Reserve IsAborted() error for those where users didn't release + // certain resource and they can release them and come back and + // retry. In this case, we wrap this exception to something else. 
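+  // Illustrative sketch (not upstream code): given the wrapping below, a
+  // caller can treat Incomplete from Close() as "release the offending
+  // resource and retry":
+  //
+  //   Status s = db->Close();
+  //   if (s.IsIncomplete()) {
+  //     // e.g. release remaining snapshots/iterators, then:
+  //     s = db->Close();
+  //   }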
+ return Status::Incomplete(ret.ToString()); + } + + return ret; +} + +Status DBImpl::CloseImpl() { return CloseHelper(); } + +DBImpl::~DBImpl() { + // TODO: remove this. + init_logger_creation_s_.PermitUncheckedError(); + + InstrumentedMutexLock closing_lock_guard(&closing_mutex_); + if (closed_) { + return; + } + + closed_ = true; + + { + const Status s = MaybeReleaseTimestampedSnapshotsAndCheck(); + s.PermitUncheckedError(); + } + + closing_status_ = CloseImpl(); + closing_status_.PermitUncheckedError(); +} + +void DBImpl::MaybeIgnoreError(Status* s) const { + if (s->ok() || immutable_db_options_.paranoid_checks) { + // No change needed + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, "Ignoring error %s", + s->ToString().c_str()); + *s = Status::OK(); + } +} + +const Status DBImpl::CreateArchivalDirectory() { + if (immutable_db_options_.WAL_ttl_seconds > 0 || + immutable_db_options_.WAL_size_limit_MB > 0) { + std::string archivalPath = + ArchivalDirectory(immutable_db_options_.GetWalDir()); + return env_->CreateDirIfMissing(archivalPath); + } + return Status::OK(); +} + +void DBImpl::PrintStatistics() { + auto dbstats = immutable_db_options_.stats; + if (dbstats) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s", + dbstats->ToString().c_str()); + } +} + +Status DBImpl::StartPeriodicTaskScheduler() { + +#ifndef NDEBUG + // It only used by test to disable scheduler + bool disable_scheduler = false; + TEST_SYNC_POINT_CALLBACK( + "DBImpl::StartPeriodicTaskScheduler:DisableScheduler", + &disable_scheduler); + if (disable_scheduler) { + return Status::OK(); + } + + { + InstrumentedMutexLock l(&mutex_); + TEST_SYNC_POINT_CALLBACK("DBImpl::StartPeriodicTaskScheduler:Init", + &periodic_task_scheduler_); + } + +#endif // !NDEBUG + if (mutable_db_options_.stats_dump_period_sec > 0) { + Status s = periodic_task_scheduler_.Register( + PeriodicTaskType::kDumpStats, + periodic_task_functions_.at(PeriodicTaskType::kDumpStats), + mutable_db_options_.stats_dump_period_sec); + if (!s.ok()) { + return s; + } + } + if (mutable_db_options_.stats_persist_period_sec > 0) { + Status s = periodic_task_scheduler_.Register( + PeriodicTaskType::kPersistStats, + periodic_task_functions_.at(PeriodicTaskType::kPersistStats), + mutable_db_options_.stats_persist_period_sec); + if (!s.ok()) { + return s; + } + } + + Status s = periodic_task_scheduler_.Register( + PeriodicTaskType::kFlushInfoLog, + periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog)); + + return s; +} + +Status DBImpl::RegisterRecordSeqnoTimeWorker() { + uint64_t min_time_duration = std::numeric_limits::max(); + uint64_t max_time_duration = std::numeric_limits::min(); + { + InstrumentedMutexLock l(&mutex_); + + for (auto cfd : *versions_->GetColumnFamilySet()) { + // preserve time is the max of 2 options. 
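+      // Worked example (assuming the upstream constant
+      // SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF == 100): one CF with
+      // preserve_internal_time_seconds = 3600 and
+      // preclude_last_level_data_seconds = 0 gives
+      // preserve_time_duration = 3600, so the cadence computed below is
+      // (3600 + 100 - 1) / 100 = 36, i.e. one seqno/time sample roughly
+      // every 36 seconds.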
+      uint64_t preserve_time_duration =
+          std::max(cfd->ioptions()->preserve_internal_time_seconds,
+                   cfd->ioptions()->preclude_last_level_data_seconds);
+      if (!cfd->IsDropped() && preserve_time_duration > 0) {
+        min_time_duration =
+            std::min(preserve_time_duration, min_time_duration);
+        max_time_duration =
+            std::max(preserve_time_duration, max_time_duration);
+      }
+    }
+    if (min_time_duration == std::numeric_limits<uint64_t>::max()) {
+      seqno_time_mapping_.Resize(0, 0);
+    } else {
+      seqno_time_mapping_.Resize(min_time_duration, max_time_duration);
+    }
+  }
+
+  uint64_t seqno_time_cadence = 0;
+  if (min_time_duration != std::numeric_limits<uint64_t>::max()) {
+    // round up to 1 when the time_duration is smaller than
+    // kMaxSeqnoTimePairsPerCF
+    seqno_time_cadence =
+        (min_time_duration + SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF - 1) /
+        SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF;
+  }
+
+  Status s;
+  if (seqno_time_cadence == 0) {
+    s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kRecordSeqnoTime);
+  } else {
+    s = periodic_task_scheduler_.Register(
+        PeriodicTaskType::kRecordSeqnoTime,
+        periodic_task_functions_.at(PeriodicTaskType::kRecordSeqnoTime),
+        seqno_time_cadence);
+  }
+
+  return s;
+}
+
+// Estimate the total size of stats_history_
+size_t DBImpl::EstimateInMemoryStatsHistorySize() const {
+  size_t size_total =
+      sizeof(std::map<uint64_t, std::map<std::string, uint64_t>>);
+  if (stats_history_.size() == 0) return size_total;
+  size_t size_per_slice =
+      sizeof(uint64_t) + sizeof(std::map<std::string, uint64_t>);
+  // non-empty map, stats_history_.begin() guaranteed to exist
+  for (const auto& pairs : stats_history_.begin()->second) {
+    size_per_slice +=
+        pairs.first.capacity() + sizeof(pairs.first) + sizeof(pairs.second);
+  }
+  size_total = size_per_slice * stats_history_.size();
+  return size_total;
+}
+
+void DBImpl::PersistStats() {
+  TEST_SYNC_POINT("DBImpl::PersistStats:Entry");
+  if (shutdown_initiated_) {
+    return;
+  }
+  TEST_SYNC_POINT("DBImpl::PersistStats:StartRunning");
+  uint64_t now_seconds =
+      immutable_db_options_.clock->NowMicros() / kMicrosInSecond;
+
+  Statistics* statistics = immutable_db_options_.stats;
+  if (!statistics) {
+    return;
+  }
+  size_t stats_history_size_limit = 0;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    stats_history_size_limit = mutable_db_options_.stats_history_buffer_size;
+  }
+
+  std::map<std::string, uint64_t> stats_map;
+  if (!statistics->getTickerMap(&stats_map)) {
+    return;
+  }
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "------- PERSISTING STATS -------");
+
+  if (immutable_db_options_.persist_stats_to_disk) {
+    WriteBatch batch;
+    Status s = Status::OK();
+    if (stats_slice_initialized_) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Reading %" ROCKSDB_PRIszt " stats from statistics\n",
+                     stats_slice_.size());
+      for (const auto& stat : stats_map) {
+        if (s.ok()) {
+          char key[100];
+          int length =
+              EncodePersistentStatsKey(now_seconds, stat.first, 100, key);
+          // calculate the delta from last time
+          if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+            uint64_t delta = stat.second - stats_slice_[stat.first];
+            s = batch.Put(persist_stats_cf_handle_,
+                          Slice(key, std::min(100, length)),
+                          std::to_string(delta));
+          }
+        }
+      }
+    }
+    stats_slice_initialized_ = true;
+    std::swap(stats_slice_, stats_map);
+    if (s.ok()) {
+      WriteOptions wo;
+      wo.low_pri = true;
+      wo.no_slowdown = true;
+      wo.sync = false;
+      s = Write(wo, &batch);
+    }
+    if (!s.ok()) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Writing to persistent stats CF failed -- %s",
+                     s.ToString().c_str());
+    } else {
ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 + " to persistent stats CF succeeded", + stats_slice_.size(), now_seconds); + } + // TODO(Zhongyi): add purging for persisted data + } else { + InstrumentedMutexLock l(&stats_history_mutex_); + // calculate the delta from last time + if (stats_slice_initialized_) { + std::map stats_delta; + for (const auto& stat : stats_map) { + if (stats_slice_.find(stat.first) != stats_slice_.end()) { + stats_delta[stat.first] = stat.second - stats_slice_[stat.first]; + } + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 + " to in-memory stats history", + stats_slice_.size(), now_seconds); + stats_history_[now_seconds] = stats_delta; + } + stats_slice_initialized_ = true; + std::swap(stats_slice_, stats_map); + TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied"); + + // delete older stats snapshots to control memory consumption + size_t stats_history_size = EstimateInMemoryStatsHistorySize(); + bool purge_needed = stats_history_size > stats_history_size_limit; + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt + " bytes, slice count: %" ROCKSDB_PRIszt, + stats_history_size, stats_history_.size()); + while (purge_needed && !stats_history_.empty()) { + stats_history_.erase(stats_history_.begin()); + purge_needed = + EstimateInMemoryStatsHistorySize() > stats_history_size_limit; + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt + " bytes, slice count: %" ROCKSDB_PRIszt, + stats_history_size, stats_history_.size()); + } + TEST_SYNC_POINT("DBImpl::PersistStats:End"); +} + +bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time, + uint64_t* new_time, + std::map* stats_map) { + assert(new_time); + assert(stats_map); + if (!new_time || !stats_map) return false; + // lock when search for start_time + { + InstrumentedMutexLock l(&stats_history_mutex_); + auto it = stats_history_.lower_bound(start_time); + if (it != stats_history_.end() && it->first < end_time) { + // make a copy for timestamp and stats_map + *new_time = it->first; + *stats_map = it->second; + return true; + } else { + return false; + } + } +} + +Status DBImpl::GetStatsHistory( + uint64_t start_time, uint64_t end_time, + std::unique_ptr* stats_iterator) { + if (!stats_iterator) { + return Status::InvalidArgument("stats_iterator not preallocated."); + } + if (immutable_db_options_.persist_stats_to_disk) { + stats_iterator->reset( + new PersistentStatsHistoryIterator(start_time, end_time, this)); + } else { + stats_iterator->reset( + new InMemoryStatsHistoryIterator(start_time, end_time, this)); + } + return (*stats_iterator)->status(); +} + +void DBImpl::DumpStats() { + TEST_SYNC_POINT("DBImpl::DumpStats:1"); + std::string stats; + if (shutdown_initiated_) { + return; + } + + // Also probe block cache(s) for problems, dump to info log + UnorderedSet probed_caches; + TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning"); + { + InstrumentedMutexLock l(&mutex_); + for (auto cfd : versions_->GetRefedColumnFamilySet()) { + if (!cfd->initialized()) { + continue; + } + + // Release DB mutex for gathering cache entry stats. Pass over all + // column families for this first so that other stats are dumped + // near-atomically. 
+      InstrumentedMutexUnlock u(&mutex_);
+      cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
+
+      // Probe block cache for problems (if not already done via another CF)
+      if (immutable_db_options_.info_log) {
+        auto* table_factory = cfd->ioptions()->table_factory.get();
+        assert(table_factory != nullptr);
+        Cache* cache =
+            table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
+        if (cache && probed_caches.insert(cache).second) {
+          cache->ReportProblems(immutable_db_options_.info_log);
+        }
+      }
+    }
+
+    const std::string* property = &DB::Properties::kDBStats;
+    const DBPropertyInfo* property_info = GetPropertyInfo(*property);
+    assert(property_info != nullptr);
+    assert(!property_info->need_out_of_mutex);
+    default_cf_internal_stats_->GetStringProperty(*property_info, *property,
+                                                  &stats);
+
+    property = &InternalStats::kPeriodicCFStats;
+    property_info = GetPropertyInfo(*property);
+    assert(property_info != nullptr);
+    assert(!property_info->need_out_of_mutex);
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->initialized()) {
+        cfd->internal_stats()->GetStringProperty(*property_info, *property,
+                                                 &stats);
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::DumpStats:2");
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "------- DUMPING STATS -------");
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+  if (immutable_db_options_.dump_malloc_stats) {
+    stats.clear();
+    DumpMallocStats(&stats);
+    if (!stats.empty()) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "------- Malloc STATS -------");
+      ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+    }
+  }
+
+  PrintStatistics();
+}
+
+// Periodically flush info log out of application buffer at a low frequency.
+// This improves debuggability in case of RocksDB hanging, since it ensures
+// that the log messages leading up to the hang will eventually become
+// visible in the log.
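+// Note that kFlushInfoLog is registered unconditionally in
+// StartPeriodicTaskScheduler() above, unlike kDumpStats/kPersistStats,
+// which are gated on stats_dump_period_sec / stats_persist_period_sec.
+// The wiring follows the pattern already shown in the constructor:
+//
+//   periodic_task_functions_.emplace(PeriodicTaskType::kFlushInfoLog,
+//                                    [this]() { this->FlushInfoLog(); });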
+void DBImpl::FlushInfoLog() { + if (shutdown_initiated_) { + return; + } + TEST_SYNC_POINT("DBImpl::FlushInfoLog:StartRunning"); + LogFlush(immutable_db_options_.info_log); +} + +Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, + int max_entries_to_print, + std::string* out_str) { + auto* cfh = static_cast_with_check(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + + SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + Version* version = super_version->current; + + Status s = + version->TablesRangeTombstoneSummary(max_entries_to_print, out_str); + + CleanupSuperVersion(super_version); + return s; +} + +void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) { + mutex_.AssertHeld(); + if (!job_context->logs_to_free.empty()) { + for (auto l : job_context->logs_to_free) { + AddToLogsToFreeQueue(l); + } + job_context->logs_to_free.clear(); + } +} + +FSDirectory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { + assert(cfd); + FSDirectory* ret_dir = cfd->GetDataDir(path_id); + if (ret_dir == nullptr) { + return directories_.GetDataDir(path_id); + } + return ret_dir; +} + +Status DBImpl::SetOptions( + ColumnFamilyHandle* column_family, + const std::unordered_map& options_map) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto* cfd = + static_cast_with_check(column_family)->cfd(); + if (options_map.empty()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "SetOptions() on column family [%s], empty input", + cfd->GetName().c_str()); + return Status::InvalidArgument("empty input"); + } + + MutableCFOptions new_options; + Status s; + Status persist_options_status; + SuperVersionContext sv_context(/* create_superversion */ true); + { + auto db_options = GetDBOptions(); + InstrumentedMutexLock l(&mutex_); + s = cfd->SetOptions(db_options, options_map); + if (s.ok()) { + new_options = *cfd->GetLatestMutableCFOptions(); + // Append new version to recompute compaction score. + VersionEdit dummy_edit; + s = versions_->LogAndApply(cfd, new_options, read_options, &dummy_edit, + &mutex_, directories_.GetDbDir()); + // Trigger possible flush/compactions. This has to be before we persist + // options to file, otherwise there will be a deadlock with writer + // thread. 
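+      // Illustrative caller-side sketch (option names and values are only
+      // examples):
+      //
+      //   Status s = db->SetOptions(
+      //       cf_handle, {{"write_buffer_size", "134217728"},
+      //                   {"level0_file_num_compaction_trigger", "8"}});
+      //
+      // On success, the code below first installs a fresh SuperVersion
+      // (possibly scheduling flushes/compactions) and only then persists
+      // the OPTIONS file.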
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options); + + persist_options_status = WriteOptionsFile( + false /*need_mutex_lock*/, true /*need_enter_write_thread*/); + bg_cv_.SignalAll(); + } + } + sv_context.Clean(); + + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str()); + for (const auto& o : options_map) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(), + o.second.c_str()); + } + if (s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] SetOptions() succeeded", cfd->GetName().c_str()); + new_options.Dump(immutable_db_options_.info_log.get()); + if (!persist_options_status.ok()) { + // NOTE: WriteOptionsFile already logs on failure + s = persist_options_status; + } + } else { + persist_options_status.PermitUncheckedError(); // less important + ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed", + cfd->GetName().c_str()); + } + LogFlush(immutable_db_options_.info_log); + return s; +} + +Status DBImpl::SetDBOptions( + const std::unordered_map& options_map) { + if (options_map.empty()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "SetDBOptions(), empty input."); + return Status::InvalidArgument("empty input"); + } + + MutableDBOptions new_options; + Status s; + Status persist_options_status = Status::OK(); + bool wal_changed = false; + WriteContext write_context; + { + InstrumentedMutexLock l(&mutex_); + s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map, + &new_options); + + if (new_options.bytes_per_sync == 0) { + new_options.bytes_per_sync = 1024 * 1024; + } + + if (MutableDBOptionsAreEqual(mutable_db_options_, new_options)) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "SetDBOptions(), input option value is not changed, " + "skipping updating."); + persist_options_status.PermitUncheckedError(); + return s; + } + + DBOptions new_db_options = + BuildDBOptions(immutable_db_options_, new_options); + if (s.ok()) { + s = ValidateOptions(new_db_options); + } + if (s.ok()) { + for (auto c : *versions_->GetColumnFamilySet()) { + if (!c->IsDropped()) { + auto cf_options = c->GetLatestCFOptions(); + s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options); + if (!s.ok()) { + break; + } + } + } + } + if (s.ok()) { + const BGJobLimits current_bg_job_limits = + GetBGJobLimits(mutable_db_options_.max_background_flushes, + mutable_db_options_.max_background_compactions, + mutable_db_options_.max_background_jobs, + /* parallelize_compactions */ true); + const BGJobLimits new_bg_job_limits = GetBGJobLimits( + new_options.max_background_flushes, + new_options.max_background_compactions, + new_options.max_background_jobs, /* parallelize_compactions */ true); + + const bool max_flushes_increased = + new_bg_job_limits.max_flushes > current_bg_job_limits.max_flushes; + const bool max_compactions_increased = + new_bg_job_limits.max_compactions > + current_bg_job_limits.max_compactions; + + if (max_flushes_increased || max_compactions_increased) { + if (max_flushes_increased) { + env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_flushes, + Env::Priority::HIGH); + } + + if (max_compactions_increased) { + env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_compactions, + Env::Priority::LOW); + } + + MaybeScheduleFlushOrCompaction(); + } + + mutex_.Unlock(); + if (new_options.stats_dump_period_sec == 0) { + s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kDumpStats); + } else { + s = 
periodic_task_scheduler_.Register( + PeriodicTaskType::kDumpStats, + periodic_task_functions_.at(PeriodicTaskType::kDumpStats), + new_options.stats_dump_period_sec); + } + if (new_options.max_total_wal_size != + mutable_db_options_.max_total_wal_size) { + max_total_wal_size_.store(new_options.max_total_wal_size, + std::memory_order_release); + } + if (s.ok()) { + if (new_options.stats_persist_period_sec == 0) { + s = periodic_task_scheduler_.Unregister( + PeriodicTaskType::kPersistStats); + } else { + s = periodic_task_scheduler_.Register( + PeriodicTaskType::kPersistStats, + periodic_task_functions_.at(PeriodicTaskType::kPersistStats), + new_options.stats_persist_period_sec); + } + } + mutex_.Lock(); + if (!s.ok()) { + return s; + } + + write_controller_.set_max_delayed_write_rate( + new_options.delayed_write_rate); + table_cache_.get()->SetCapacity(new_options.max_open_files == -1 + ? TableCache::kInfiniteCapacity + : new_options.max_open_files - 10); + wal_changed = mutable_db_options_.wal_bytes_per_sync != + new_options.wal_bytes_per_sync; + mutable_db_options_ = new_options; + file_options_for_compaction_ = FileOptions(new_db_options); + file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite( + file_options_for_compaction_, immutable_db_options_); + versions_->ChangeFileOptions(mutable_db_options_); + // TODO(xiez): clarify why apply optimize for read to write options + file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead( + file_options_for_compaction_, immutable_db_options_); + file_options_for_compaction_.compaction_readahead_size = + mutable_db_options_.compaction_readahead_size; + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) { + Status purge_wal_status = SwitchWAL(&write_context); + if (!purge_wal_status.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Unable to purge WAL files in SetDBOptions() -- %s", + purge_wal_status.ToString().c_str()); + } + } + persist_options_status = WriteOptionsFile( + false /*need_mutex_lock*/, false /*need_enter_write_thread*/); + write_thread_.ExitUnbatched(&w); + } else { + // To get here, we must have had invalid options and will not attempt to + // persist the options, which means the status is "OK/Uninitialized. 
+ persist_options_status.PermitUncheckedError(); + } + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:"); + for (const auto& o : options_map) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(), + o.second.c_str()); + } + if (s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded"); + new_options.Dump(immutable_db_options_.info_log.get()); + if (!persist_options_status.ok()) { + if (immutable_db_options_.fail_if_options_file_error) { + s = Status::IOError( + "SetDBOptions() succeeded, but unable to persist options", + persist_options_status.ToString()); + } + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Unable to persist options in SetDBOptions() -- %s", + persist_options_status.ToString().c_str()); + } + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed"); + } + LogFlush(immutable_db_options_.info_log); + return s; +} + +// return the same level if it cannot be moved +int DBImpl::FindMinimumEmptyLevelFitting( + ColumnFamilyData* cfd, const MutableCFOptions& /*mutable_cf_options*/, + int level) { + mutex_.AssertHeld(); + const auto* vstorage = cfd->current()->storage_info(); + int minimum_level = level; + for (int i = level - 1; i > 0; --i) { + // stop if level i is not empty + if (vstorage->NumLevelFiles(i) > 0) break; + // stop if level i is too small (cannot fit the level files) + if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) { + break; + } + + minimum_level = i; + } + return minimum_level; +} + +Status DBImpl::FlushWAL(bool sync) { + if (manual_wal_flush_) { + IOStatus io_s; + { + // We need to lock log_write_mutex_ since logs_ might change concurrently + InstrumentedMutexLock wl(&log_write_mutex_); + log::Writer* cur_log_writer = logs_.back().writer; + io_s = cur_log_writer->WriteBuffer(); + } + if (!io_s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s", + io_s.ToString().c_str()); + // In case there is a fs error we should set it globally to prevent the + // future writes + IOStatusCheck(io_s); + // whether sync or not, we should abort the rest of function upon error + return static_cast(io_s); + } + if (!sync) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false"); + return static_cast(io_s); + } + } + if (!sync) { + return Status::OK(); + } + // sync = true + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true"); + return SyncWAL(); +} + +bool DBImpl::WALBufferIsEmpty() { + InstrumentedMutexLock l(&log_write_mutex_); + log::Writer* cur_log_writer = logs_.back().writer; + auto res = cur_log_writer->BufferIsEmpty(); + return res; +} + +Status DBImpl::SyncWAL() { + TEST_SYNC_POINT("DBImpl::SyncWAL:Begin"); + autovector logs_to_sync; + bool need_log_dir_sync; + uint64_t current_log_number; + + { + InstrumentedMutexLock l(&log_write_mutex_); + assert(!logs_.empty()); + + // This SyncWAL() call only cares about logs up to this number. + current_log_number = logfile_number_; + + while (logs_.front().number <= current_log_number && + logs_.front().IsSyncing()) { + log_sync_cv_.Wait(); + } + // First check that logs are safe to sync in background. + for (auto it = logs_.begin(); + it != logs_.end() && it->number <= current_log_number; ++it) { + if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) { + return Status::NotSupported( + "SyncWAL() is not supported for this implementation of WAL file", + immutable_db_options_.allow_mmap_writes + ? 
"try setting Options::allow_mmap_writes to false" + : Slice()); + } + } + for (auto it = logs_.begin(); + it != logs_.end() && it->number <= current_log_number; ++it) { + auto& log = *it; + log.PrepareForSync(); + logs_to_sync.push_back(log.writer); + } + + need_log_dir_sync = !log_dir_synced_; + } + + TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1"); + RecordTick(stats_, WAL_FILE_SYNCED); + Status status; + IOStatus io_s; + for (log::Writer* log : logs_to_sync) { + io_s = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync); + if (!io_s.ok()) { + status = io_s; + break; + } + } + if (!io_s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL Sync error %s", + io_s.ToString().c_str()); + // In case there is a fs error we should set it globally to prevent the + // future writes + IOStatusCheck(io_s); + } + if (status.ok() && need_log_dir_sync) { + status = directories_.GetWalDir()->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } + TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2"); + + TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1"); + VersionEdit synced_wals; + { + InstrumentedMutexLock l(&log_write_mutex_); + if (status.ok()) { + MarkLogsSynced(current_log_number, need_log_dir_sync, &synced_wals); + } else { + MarkLogsNotSynced(current_log_number); + } + } + if (status.ok() && synced_wals.IsWalAddition()) { + InstrumentedMutexLock l(&mutex_); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status = ApplyWALToManifest(read_options, &synced_wals); + } + + TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); + + return status; +} + +Status DBImpl::ApplyWALToManifest(const ReadOptions& read_options, + VersionEdit* synced_wals) { + // not empty, write to MANIFEST. + mutex_.AssertHeld(); + + Status status = versions_->LogAndApplyToDefaultColumnFamily( + read_options, synced_wals, &mutex_, directories_.GetDbDir()); + if (!status.ok() && versions_->io_status().IsIOError()) { + status = error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + return status; +} + +Status DBImpl::LockWAL() { + { + InstrumentedMutexLock lock(&mutex_); + if (lock_wal_count_ > 0) { + assert(lock_wal_write_token_); + ++lock_wal_count_; + } else { + // NOTE: this will "unnecessarily" wait for other non-LockWAL() write + // stalls to clear before LockWAL returns, however fixing that would + // not be simple because if we notice the primary queue is already + // stalled, that stall might clear while we release DB mutex in + // EnterUnbatched() for the nonmem queue. And if we work around that in + // the naive way, we could deadlock by locking the two queues in different + // orders. 
+ + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + + // NOTE: releasing mutex in EnterUnbatched might mean we are actually + // now lock_wal_count > 0 + if (lock_wal_count_ == 0) { + assert(!lock_wal_write_token_); + lock_wal_write_token_ = write_controller_.GetStopToken(); + } + ++lock_wal_count_; + + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + write_thread_.ExitUnbatched(&w); + } + } + // NOTE: avoid I/O holding DB mutex + Status s = FlushWAL(/*sync=*/false); + if (!s.ok()) { + // Non-OK return should not be in locked state + UnlockWAL().PermitUncheckedError(); + } + return s; +} + +Status DBImpl::UnlockWAL() { + bool signal = false; + uint64_t maybe_stall_begun_count = 0; + uint64_t nonmem_maybe_stall_begun_count = 0; + { + InstrumentedMutexLock lock(&mutex_); + if (lock_wal_count_ == 0) { + return Status::Aborted("No LockWAL() in effect"); + } + --lock_wal_count_; + if (lock_wal_count_ == 0) { + lock_wal_write_token_.reset(); + signal = true; + // For the last UnlockWAL, we don't want to return from UnlockWAL() + // until the thread(s) that called BeginWriteStall() have had a chance to + // call EndWriteStall(), so that no_slowdown writes after UnlockWAL() are + // guaranteed to succeed if there's no other source of stall. + maybe_stall_begun_count = write_thread_.GetBegunCountOfOutstandingStall(); + if (two_write_queues_) { + nonmem_maybe_stall_begun_count = + nonmem_write_thread_.GetBegunCountOfOutstandingStall(); + } + } + } + if (signal) { + // SignalAll outside of mutex for efficiency + bg_cv_.SignalAll(); + } + // Ensure stalls have cleared + if (maybe_stall_begun_count) { + write_thread_.WaitForStallEndedCount(maybe_stall_begun_count); + } + if (nonmem_maybe_stall_begun_count) { + nonmem_write_thread_.WaitForStallEndedCount(nonmem_maybe_stall_begun_count); + } + return Status::OK(); +} + +void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, + VersionEdit* synced_wals) { + log_write_mutex_.AssertHeld(); + if (synced_dir && logfile_number_ == up_to) { + log_dir_synced_ = true; + } + for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) { + auto& wal = *it; + assert(wal.IsSyncing()); + + if (wal.number < logs_.back().number) { + // Inactive WAL + if (immutable_db_options_.track_and_verify_wals_in_manifest && + wal.GetPreSyncSize() > 0) { + synced_wals->AddWal(wal.number, WalMetadata(wal.GetPreSyncSize())); + } + if (wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize()) { + // Fully synced + logs_to_free_.push_back(wal.ReleaseWriter()); + it = logs_.erase(it); + } else { + assert(wal.GetPreSyncSize() < wal.writer->file()->GetFlushedSize()); + wal.FinishSync(); + ++it; + } + } else { + assert(wal.number == logs_.back().number); + // Active WAL + wal.FinishSync(); + ++it; + } + } + log_sync_cv_.SignalAll(); +} + +void DBImpl::MarkLogsNotSynced(uint64_t up_to) { + log_write_mutex_.AssertHeld(); + for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to; + ++it) { + auto& wal = *it; + wal.FinishSync(); + } + log_sync_cv_.SignalAll(); +} + +SequenceNumber DBImpl::GetLatestSequenceNumber() const { + return versions_->LastSequence(); +} + +void DBImpl::SetLastPublishedSequence(SequenceNumber seq) { + versions_->SetLastPublishedSequence(seq); +} + +Status DBImpl::GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) { + if (ts_low == 
nullptr) { + return Status::InvalidArgument("ts_low is nullptr"); + } + ColumnFamilyData* cfd = nullptr; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = static_cast_with_check(column_family); + assert(cfh != nullptr); + cfd = cfh->cfd(); + } + assert(cfd != nullptr && cfd->user_comparator() != nullptr); + if (cfd->user_comparator()->timestamp_size() == 0) { + return Status::InvalidArgument( + "Timestamp is not enabled in this column family"); + } + InstrumentedMutexLock l(&mutex_); + *ts_low = cfd->GetFullHistoryTsLow(); + assert(cfd->user_comparator()->timestamp_size() == ts_low->size()); + return Status::OK(); +} + +InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, + Arena* arena, + SequenceNumber sequence, + ColumnFamilyHandle* column_family, + bool allow_unprepared_value) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = static_cast_with_check(column_family); + cfd = cfh->cfd(); + } + + mutex_.Lock(); + SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); + mutex_.Unlock(); + return NewInternalIterator(read_options, cfd, super_version, arena, sequence, + allow_unprepared_value); +} + +void DBImpl::SchedulePurge() { + mutex_.AssertHeld(); + assert(opened_successfully_); + + // Purge operations are put into High priority queue + bg_purge_scheduled_++; + env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr); +} + +void DBImpl::BackgroundCallPurge() { + mutex_.Lock(); + + while (!logs_to_free_queue_.empty()) { + assert(!logs_to_free_queue_.empty()); + log::Writer* log_writer = *(logs_to_free_queue_.begin()); + logs_to_free_queue_.pop_front(); + mutex_.Unlock(); + delete log_writer; + mutex_.Lock(); + } + while (!superversions_to_free_queue_.empty()) { + assert(!superversions_to_free_queue_.empty()); + SuperVersion* sv = superversions_to_free_queue_.front(); + superversions_to_free_queue_.pop_front(); + mutex_.Unlock(); + delete sv; + mutex_.Lock(); + } + + assert(bg_purge_scheduled_ > 0); + + // Can't use iterator to go over purge_files_ because inside the loop we're + // unlocking the mutex that protects purge_files_. + while (!purge_files_.empty()) { + auto it = purge_files_.begin(); + // Need to make a copy of the PurgeFilesInfo before unlocking the mutex. + PurgeFileInfo purge_file = it->second; + + const std::string& fname = purge_file.fname; + const std::string& dir_to_sync = purge_file.dir_to_sync; + FileType type = purge_file.type; + uint64_t number = purge_file.number; + int job_id = purge_file.job_id; + + purge_files_.erase(it); + + mutex_.Unlock(); + DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number); + mutex_.Lock(); + } + + bg_purge_scheduled_--; + + bg_cv_.SignalAll(); + // IMPORTANT:there should be no code after calling SignalAll. This call may + // signal the DB destructor that it's OK to proceed with destruction. In + // that case, all DB variables will be dealloacated and referencing them + // will cause trouble. + mutex_.Unlock(); +} + +namespace { + +// A `SuperVersionHandle` holds a non-null `SuperVersion*` pointing at a +// `SuperVersion` referenced once for this object. It also contains the state +// needed to clean up the `SuperVersion` reference from outside of `DBImpl` +// using `CleanupSuperVersionHandle()`. +struct SuperVersionHandle { + // `_super_version` must be non-nullptr and `Ref()`'d once as long as the + // `SuperVersionHandle` may use it. 
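+  // Illustrative lifecycle sketch under that Ref()/Unref() contract (not
+  // upstream code):
+  //
+  //   SuperVersion* sv = cfd->GetSuperVersion()->Ref();  // one ref for us
+  //   auto* handle = new SuperVersionHandle(db, mu, sv, background_purge);
+  //   iter->RegisterCleanup(CleanupSuperVersionHandle, handle, nullptr);
+  //   // Deleting `iter` later runs CleanupSuperVersionHandle(), which
+  //   // Unref()s `sv` and tears it down if that was the last reference.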
+  SuperVersionHandle(DBImpl* _db, InstrumentedMutex* _mu,
+                     SuperVersion* _super_version, bool _background_purge)
+      : db(_db),
+        mu(_mu),
+        super_version(_super_version),
+        background_purge(_background_purge) {}
+
+  DBImpl* db;
+  InstrumentedMutex* mu;
+  SuperVersion* super_version;
+  bool background_purge;
+};
+
+static void CleanupSuperVersionHandle(void* arg1, void* /*arg2*/) {
+  SuperVersionHandle* sv_handle = reinterpret_cast<SuperVersionHandle*>(arg1);
+
+  if (sv_handle->super_version->Unref()) {
+    // Job id == 0 means that this is not our background process, but rather
+    // a user thread
+    JobContext job_context(0);
+
+    sv_handle->mu->Lock();
+    sv_handle->super_version->Cleanup();
+    sv_handle->db->FindObsoleteFiles(&job_context, false, true);
+    if (sv_handle->background_purge) {
+      sv_handle->db->ScheduleBgLogWriterClose(&job_context);
+      sv_handle->db->AddSuperVersionsToFreeQueue(sv_handle->super_version);
+      sv_handle->db->SchedulePurge();
+    }
+    sv_handle->mu->Unlock();
+
+    if (!sv_handle->background_purge) {
+      delete sv_handle->super_version;
+    }
+    if (job_context.HaveSomethingToDelete()) {
+      sv_handle->db->PurgeObsoleteFiles(job_context,
+                                        sv_handle->background_purge);
+    }
+    job_context.Clean();
+  }
+
+  delete sv_handle;
+}
+
+struct GetMergeOperandsState {
+  MergeContext merge_context;
+  PinnedIteratorsManager pinned_iters_mgr;
+  SuperVersionHandle* sv_handle;
+};
+
+static void CleanupGetMergeOperandsState(void* arg1, void* /*arg2*/) {
+  GetMergeOperandsState* state = static_cast<GetMergeOperandsState*>(arg1);
+  CleanupSuperVersionHandle(state->sv_handle /* arg1 */, nullptr /* arg2 */);
+  delete state;
+}
+
+}  // namespace
+
+InternalIterator* DBImpl::NewInternalIterator(
+    const ReadOptions& read_options, ColumnFamilyData* cfd,
+    SuperVersion* super_version, Arena* arena, SequenceNumber sequence,
+    bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) {
+  InternalIterator* internal_iter;
+  assert(arena != nullptr);
+  // Need to create internal iterator from the arena.
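+  // Arena placement keeps the merging iterator and all of its children in
+  // one allocation region that is freed together with the owning
+  // ArenaWrappedDBIter. The builder below follows the usual placement-new
+  // pattern, roughly (illustrative, `SomeIterator` is a stand-in name):
+  //
+  //   char* buf = arena->AllocateAligned(sizeof(SomeIterator));
+  //   auto* it = new (buf) SomeIterator(...);  // destroyed, never free()d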
+ MergeIteratorBuilder merge_iter_builder( + &cfd->internal_comparator(), arena, + !read_options.total_order_seek && + super_version->mutable_cf_options.prefix_extractor != nullptr, + read_options.iterate_upper_bound); + // Collect iterator for mutable memtable + auto mem_iter = super_version->mem->NewIterator(read_options, arena); + Status s; + if (!read_options.ignore_range_deletions) { + TruncatedRangeDelIterator* mem_tombstone_iter = nullptr; + auto range_del_iter = super_version->mem->NewRangeTombstoneIterator( + read_options, sequence, false /* immutable_memtable */); + if (range_del_iter == nullptr || range_del_iter->empty()) { + delete range_del_iter; + } else { + mem_tombstone_iter = new TruncatedRangeDelIterator( + std::unique_ptr(range_del_iter), + &cfd->ioptions()->internal_comparator, nullptr /* smallest */, + nullptr /* largest */); + } + merge_iter_builder.AddPointAndTombstoneIterator(mem_iter, + mem_tombstone_iter); + } else { + merge_iter_builder.AddIterator(mem_iter); + } + + // Collect all needed child iterators for immutable memtables + if (s.ok()) { + super_version->imm->AddIterators(read_options, &merge_iter_builder, + !read_options.ignore_range_deletions); + } + TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s); + if (s.ok()) { + // Collect iterators for files in L0 - Ln + if (read_options.read_tier != kMemtableTier) { + super_version->current->AddIterators(read_options, file_options_, + &merge_iter_builder, + allow_unprepared_value); + } + internal_iter = merge_iter_builder.Finish( + read_options.ignore_range_deletions ? nullptr : db_iter); + SuperVersionHandle* cleanup = new SuperVersionHandle( + this, &mutex_, super_version, + read_options.background_purge_on_iterator_cleanup || + immutable_db_options_.avoid_unnecessary_blocking_io); + internal_iter->RegisterCleanup(CleanupSuperVersionHandle, cleanup, nullptr); + + return internal_iter; + } else { + CleanupSuperVersion(super_version); + } + return NewErrorInternalIterator(s, arena); +} + +ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { + return default_cf_handle_; +} + +ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const { + return persist_stats_cf_handle_; +} + +Status DBImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) { + return Get(read_options, column_family, key, value, /*timestamp=*/nullptr); +} + +Status DBImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp) { + assert(value != nullptr); + value->Reset(); + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = value; + get_impl_options.timestamp = timestamp; + Status s = GetImpl(read_options, key, get_impl_options); + return s; +} + +Status DBImpl::GetEntity(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableWideColumns* columns) { + if (!column_family) { + return Status::InvalidArgument( + "Cannot call GetEntity without a column family handle"); + } + + if (!columns) { + return Status::InvalidArgument( + "Cannot call GetEntity without a PinnableWideColumns object"); + } + + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetEntity with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + + columns->Reset(); + + GetImplOptions get_impl_options; + 
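+  // Illustrative caller-side sketch of this wide-column read path (key and
+  // handle names are made up):
+  //
+  //   PinnableWideColumns result;
+  //   Status s = db->GetEntity(ReadOptions(), cfh, "user:42", &result);
+  //   if (s.ok()) {
+  //     for (const WideColumn& col : result.columns()) {
+  //       // use col.name() / col.value()
+  //     }
+  //   }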
get_impl_options.column_family = column_family; + get_impl_options.columns = columns; + + return GetImpl(read_options, key, get_impl_options); +} + +bool DBImpl::ShouldReferenceSuperVersion(const MergeContext& merge_context) { + // If both thresholds are reached, a function returning merge operands as + // `PinnableSlice`s should reference the `SuperVersion` to avoid large and/or + // numerous `memcpy()`s. + // + // The below constants enable the optimization conservatively. They are + // verified to not regress `GetMergeOperands()` latency in the following + // scenarios. + // + // - CPU: two socket Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz + // - `GetMergeOperands()` threads: 1 - 32 + // - Entry size: 32 bytes - 4KB + // - Merges per key: 1 - 16K + // - LSM component: memtable + // + // TODO(ajkr): expand measurement to SST files. + static const size_t kNumBytesForSvRef = 32768; + static const size_t kLog2AvgBytesForSvRef = 8; // 256 bytes + + size_t num_bytes = 0; + for (const Slice& sl : merge_context.GetOperands()) { + num_bytes += sl.size(); + } + return num_bytes >= kNumBytesForSvRef && + (num_bytes >> kLog2AvgBytesForSvRef) >= + merge_context.GetOperands().size(); +} + +Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, + GetImplOptions& get_impl_options) { + assert(get_impl_options.value != nullptr || + get_impl_options.merge_operands != nullptr || + get_impl_options.columns != nullptr); + + assert(get_impl_options.column_family); + + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + + if (read_options.timestamp) { + const Status s = FailIfTsMismatchCf(get_impl_options.column_family, + *(read_options.timestamp), + /*ts_for_read=*/true); + if (!s.ok()) { + return s; + } + } else { + const Status s = FailIfCfHasTs(get_impl_options.column_family); + if (!s.ok()) { + return s; + } + } + + // Clear the timestamps for returning results so that we can distinguish + // between tombstone or key that has never been written + if (get_impl_options.timestamp) { + get_impl_options.timestamp->clear(); + } + + GetWithTimestampReadCallback read_cb(0); // Will call Refresh + + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); + PERF_TIMER_GUARD(get_snapshot_time); + + auto cfh = static_cast_with_check( + get_impl_options.column_family); + auto cfd = cfh->cfd(); + + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? 
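+      // PermitUncheckedError() below marks the tracing Status as inspected
+      // so ASSERT_STATUS_CHECKED builds do not abort; trace failures are
+      // deliberately non-fatal on the read path.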
+      tracer_->Get(get_impl_options.column_family, key).PermitUncheckedError();
+    }
+  }
+
+  if (get_impl_options.get_merge_operands_options != nullptr) {
+    for (int i = 0; i < get_impl_options.get_merge_operands_options
+                            ->expected_max_number_of_operands;
+         ++i) {
+      get_impl_options.merge_operands[i].Reset();
+    }
+  }
+
+  // Acquire SuperVersion
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+  TEST_SYNC_POINT("DBImpl::GetImpl:1");
+  TEST_SYNC_POINT("DBImpl::GetImpl:2");
+
+  SequenceNumber snapshot;
+  if (read_options.snapshot != nullptr) {
+    if (get_impl_options.callback) {
+      // Already calculated based on read_options.snapshot
+      snapshot = get_impl_options.callback->max_visible_seq();
+    } else {
+      snapshot =
+          reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+    }
+  } else {
+    // Note that the snapshot is assigned AFTER referencing the super
+    // version because otherwise a flush happening in between may compact
+    // away data for the snapshot, so the reader would see neither data that
+    // was visible to the snapshot before compaction nor the newer data
+    // inserted afterwards.
+    snapshot = GetLastPublishedSequence();
+    if (get_impl_options.callback) {
+      // The unprep_seqs are not published for write unprepared, so it could
+      // be that max_visible_seq is larger. Seek to the std::max of the two.
+      // However, we still want our callback to contain the actual snapshot
+      // so that it can do the correct visibility filtering.
+      get_impl_options.callback->Refresh(snapshot);
+
+      // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+      // max_visible_seq = max(max_visible_seq, snapshot)
+      //
+      // Currently, the commented out assert is broken by
+      // InvalidSnapshotReadCallback, but if write unprepared recovery
+      // followed the regular transaction flow, then this special read
+      // callback would not be needed.
+      //
+      // assert(callback->max_visible_seq() >= snapshot);
+      snapshot = get_impl_options.callback->max_visible_seq();
+    }
+  }
+  // If a timestamp is used, we use the read callback to ensure that a pair
+  // <s, t> is returned only if t <= read_opts.timestamp and s <= snapshot.
+  // HACK: temporarily overwrite the input struct field, then restore it.
+  SaveAndRestore<ReadCallback*> restore_callback(&get_impl_options.callback);
+  const Comparator* ucmp = get_impl_options.column_family->GetComparator();
+  assert(ucmp);
+  if (ucmp->timestamp_size() > 0) {
+    assert(!get_impl_options
+                .callback);  // timestamp with callback is not supported
+    read_cb.Refresh(snapshot);
+    get_impl_options.callback = &read_cb;
+  }
+  TEST_SYNC_POINT("DBImpl::GetImpl:3");
+  TEST_SYNC_POINT("DBImpl::GetImpl:4");
+
+  // Prepare to store a list of merge operations if merge occurs.
+  MergeContext merge_context;
+  SequenceNumber max_covering_tombstone_seq = 0;
+
+  Status s;
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  LookupKey lkey(key, snapshot, read_options.timestamp);
+  PERF_TIMER_STOP(get_snapshot_time);
+
+  bool skip_memtable = (read_options.read_tier == kPersistedTier &&
+                        has_unpersisted_data_.load(std::memory_order_relaxed));
+  bool done = false;
+  std::string* timestamp =
+      ucmp->timestamp_size() > 0 ? get_impl_options.timestamp : nullptr;
+  if (!skip_memtable) {
+    // Get value associated with key
+    if (get_impl_options.get_value) {
+      if (sv->mem->Get(
+              lkey,
+              get_impl_options.value ?
get_impl_options.value->GetSelf() + : nullptr, + get_impl_options.columns, timestamp, &s, &merge_context, + &max_covering_tombstone_seq, read_options, + false /* immutable_memtable */, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + + if (get_impl_options.value) { + get_impl_options.value->PinSelf(); + } + + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->Get(lkey, + get_impl_options.value + ? get_impl_options.value->GetSelf() + : nullptr, + get_impl_options.columns, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, + read_options, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + + if (get_impl_options.value) { + get_impl_options.value->PinSelf(); + } + + RecordTick(stats_, MEMTABLE_HIT); + } + } else { + // Get Merge Operands associated with key, Merge Operands should not be + // merged and raw values should be returned to the user. + if (sv->mem->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, read_options, + false /* immutable_memtable */, nullptr, nullptr, + false)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->GetMergeOperands(lkey, &s, &merge_context, + &max_covering_tombstone_seq, + read_options)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } + } + if (!done && !s.ok() && !s.IsMergeInProgress()) { + ReturnAndCleanupSuperVersion(cfd, sv); + return s; + } + } + TEST_SYNC_POINT("DBImpl::GetImpl:PostMemTableGet:0"); + TEST_SYNC_POINT("DBImpl::GetImpl:PostMemTableGet:1"); + PinnedIteratorsManager pinned_iters_mgr; + if (!done) { + PERF_TIMER_GUARD(get_from_output_files_time); + sv->current->Get( + read_options, lkey, get_impl_options.value, get_impl_options.columns, + timestamp, &s, &merge_context, &max_covering_tombstone_seq, + &pinned_iters_mgr, + get_impl_options.get_value ? get_impl_options.value_found : nullptr, + nullptr, nullptr, + get_impl_options.get_value ? get_impl_options.callback : nullptr, + get_impl_options.get_value ? get_impl_options.is_blob_index : nullptr, + get_impl_options.get_value); + RecordTick(stats_, MEMTABLE_MISS); + } + + { + PERF_TIMER_GUARD(get_post_process_time); + + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = 0; + if (s.ok()) { + if (get_impl_options.get_value) { + if (get_impl_options.value) { + size = get_impl_options.value->size(); + } else if (get_impl_options.columns) { + size = get_impl_options.columns->serialized_size(); + } + } else { + // Return all merge operands for get_impl_options.key + *get_impl_options.number_of_operands = + static_cast(merge_context.GetNumOperands()); + if (*get_impl_options.number_of_operands > + get_impl_options.get_merge_operands_options + ->expected_max_number_of_operands) { + s = Status::Incomplete( + Status::SubCode::KMergeOperandsInsufficientCapacity); + } else { + // Each operand depends on one of the following resources: `sv`, + // `pinned_iters_mgr`, or `merge_context`. It would be crazy expensive + // to reference `sv` for each operand relying on it because `sv` is + // (un)ref'd in all threads using the DB. Furthermore, we do not track + // on which resource each operand depends. + // + // To solve this, we bundle the resources in a `GetMergeOperandsState` + // and manage them with a `SharedCleanablePtr` shared among the + // `PinnableSlice`s we return. 
This bundle includes one `sv` reference + // and ownership of the `merge_context` and `pinned_iters_mgr` + // objects. + bool ref_sv = ShouldReferenceSuperVersion(merge_context); + if (ref_sv) { + assert(!merge_context.GetOperands().empty()); + SharedCleanablePtr shared_cleanable; + GetMergeOperandsState* state = nullptr; + state = new GetMergeOperandsState(); + state->merge_context = std::move(merge_context); + state->pinned_iters_mgr = std::move(pinned_iters_mgr); + + sv->Ref(); + + state->sv_handle = new SuperVersionHandle( + this, &mutex_, sv, + immutable_db_options_.avoid_unnecessary_blocking_io); + + shared_cleanable.Allocate(); + shared_cleanable->RegisterCleanup(CleanupGetMergeOperandsState, + state /* arg1 */, + nullptr /* arg2 */); + for (size_t i = 0; i < state->merge_context.GetOperands().size(); + ++i) { + const Slice& sl = state->merge_context.GetOperands()[i]; + size += sl.size(); + + get_impl_options.merge_operands->PinSlice( + sl, nullptr /* cleanable */); + if (i == state->merge_context.GetOperands().size() - 1) { + shared_cleanable.MoveAsCleanupTo( + get_impl_options.merge_operands); + } else { + shared_cleanable.RegisterCopyWith( + get_impl_options.merge_operands); + } + get_impl_options.merge_operands++; + } + } else { + for (const Slice& sl : merge_context.GetOperands()) { + size += sl.size(); + get_impl_options.merge_operands->PinSelf(sl); + get_impl_options.merge_operands++; + } + } + } + } + RecordTick(stats_, BYTES_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); + } + + ReturnAndCleanupSuperVersion(cfd, sv); + + RecordInHistogram(stats_, BYTES_PER_READ, size); + } + return s; +} + +std::vector DBImpl::MultiGet( + const ReadOptions& read_options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + return MultiGet(read_options, column_family, keys, values, + /*timestamps=*/nullptr); +} + +std::vector DBImpl::MultiGet( + const ReadOptions& read_options, + const std::vector& column_family, + const std::vector& keys, std::vector* values, + std::vector* timestamps) { + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); + PERF_TIMER_GUARD(get_snapshot_time); + + size_t num_keys = keys.size(); + assert(column_family.size() == num_keys); + std::vector stat_list(num_keys); + + bool should_fail = false; + for (size_t i = 0; i < num_keys; ++i) { + assert(column_family[i]); + if (read_options.timestamp) { + stat_list[i] = FailIfTsMismatchCf( + column_family[i], *(read_options.timestamp), /*ts_for_read=*/true); + if (!stat_list[i].ok()) { + should_fail = true; + } + } else { + stat_list[i] = FailIfCfHasTs(column_family[i]); + if (!stat_list[i].ok()) { + should_fail = true; + } + } + } + + if (should_fail) { + for (auto& s : stat_list) { + if (s.ok()) { + s = Status::Incomplete( + "DB not queried due to invalid argument(s) in the same MultiGet"); + } + } + return stat_list; + } + + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? 
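+      // Usage sketch for this vector-form MultiGet (illustrative only; `db`
+      // and the keys are hypothetical):
+      //
+      //   std::vector<ColumnFamilyHandle*> cfs(2, db->DefaultColumnFamily());
+      //   std::vector<Slice> keys{"k1", "k2"};
+      //   std::vector<std::string> vals;
+      //   std::vector<Status> statuses =
+      //       db->MultiGet(ReadOptions(), cfs, keys, &vals);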
+      tracer_->MultiGet(column_family, keys).PermitUncheckedError();
+    }
+  }
+
+  SequenceNumber consistent_seqnum;
+
+  UnorderedMap<uint32_t, MultiGetColumnFamilyData> multiget_cf_data(
+      column_family.size());
+  for (auto cf : column_family) {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl*>(cf);
+    auto cfd = cfh->cfd();
+    if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
+      multiget_cf_data.emplace(cfd->GetID(),
+                               MultiGetColumnFamilyData(cfh, nullptr));
+    }
+  }
+
+  std::function<MultiGetColumnFamilyData*(
+      UnorderedMap<uint32_t, MultiGetColumnFamilyData>::iterator&)>
+      iter_deref_lambda =
+          [](UnorderedMap<uint32_t, MultiGetColumnFamilyData>::iterator&
+                 cf_iter) { return &cf_iter->second; };
+
+  bool unref_only =
+      MultiCFSnapshot<UnorderedMap<uint32_t, MultiGetColumnFamilyData>>(
+          read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+          &consistent_seqnum);
+
+  TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum1");
+  TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum2");
+
+  // Contains a list of merge operations if a merge occurs.
+  MergeContext merge_context;
+
+  // Note: this always resizes the values array
+  values->resize(num_keys);
+  if (timestamps) {
+    timestamps->resize(num_keys);
+  }
+
+  // Keep track of bytes that we read for statistics-recording later
+  uint64_t bytes_read = 0;
+  PERF_TIMER_STOP(get_snapshot_time);
+
+  // For each of the given keys, apply the entire "get" process as follows:
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  size_t num_found = 0;
+  size_t keys_read;
+  uint64_t curr_value_size = 0;
+
+  GetWithTimestampReadCallback timestamp_read_callback(0);
+  ReadCallback* read_callback = nullptr;
+  if (read_options.timestamp && read_options.timestamp->size() > 0) {
+    timestamp_read_callback.Refresh(consistent_seqnum);
+    read_callback = &timestamp_read_callback;
+  }
+
+  for (keys_read = 0; keys_read < num_keys; ++keys_read) {
+    merge_context.Clear();
+    Status& s = stat_list[keys_read];
+    std::string* value = &(*values)[keys_read];
+    std::string* timestamp = timestamps ?
&(*timestamps)[keys_read] : nullptr; + + LookupKey lkey(keys[keys_read], consistent_seqnum, read_options.timestamp); + auto cfh = static_cast_with_check( + column_family[keys_read]); + SequenceNumber max_covering_tombstone_seq = 0; + auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); + assert(mgd_iter != multiget_cf_data.end()); + auto mgd = mgd_iter->second; + auto super_version = mgd.super_version; + bool skip_memtable = + (read_options.read_tier == kPersistedTier && + has_unpersisted_data_.load(std::memory_order_relaxed)); + bool done = false; + if (!skip_memtable) { + if (super_version->mem->Get( + lkey, value, /*columns=*/nullptr, timestamp, &s, &merge_context, + &max_covering_tombstone_seq, read_options, + false /* immutable_memtable */, read_callback)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } else if (super_version->imm->Get(lkey, value, /*columns=*/nullptr, + timestamp, &s, &merge_context, + &max_covering_tombstone_seq, + read_options, read_callback)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } + } + if (!done) { + PinnableSlice pinnable_val; + PERF_TIMER_GUARD(get_from_output_files_time); + PinnedIteratorsManager pinned_iters_mgr; + super_version->current->Get(read_options, lkey, &pinnable_val, + /*columns=*/nullptr, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, + &pinned_iters_mgr, /*value_found=*/nullptr, + /*key_exists=*/nullptr, + /*seq=*/nullptr, read_callback); + value->assign(pinnable_val.data(), pinnable_val.size()); + RecordTick(stats_, MEMTABLE_MISS); + } + + if (s.ok()) { + bytes_read += value->size(); + num_found++; + curr_value_size += value->size(); + if (curr_value_size > read_options.value_size_soft_limit) { + while (++keys_read < num_keys) { + stat_list[keys_read] = Status::Aborted(); + } + break; + } + } + if (read_options.deadline.count() && + immutable_db_options_.clock->NowMicros() > + static_cast(read_options.deadline.count())) { + break; + } + } + + if (keys_read < num_keys) { + // The only reason to break out of the loop is when the deadline is + // exceeded + assert(immutable_db_options_.clock->NowMicros() > + static_cast(read_options.deadline.count())); + for (++keys_read; keys_read < num_keys; ++keys_read) { + stat_list[keys_read] = Status::TimedOut(); + } + } + + // Post processing (decrement reference counts and record statistics) + PERF_TIMER_GUARD(get_post_process_time); + + for (auto mgd_iter : multiget_cf_data) { + auto mgd = mgd_iter.second; + if (!unref_only) { + ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version); + } else { + mgd.cfd->GetSuperVersion()->Unref(); + } + } + RecordTick(stats_, NUMBER_MULTIGET_CALLS); + RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); + RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found); + RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read); + RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read); + PERF_COUNTER_ADD(multiget_read_bytes, bytes_read); + PERF_TIMER_STOP(get_post_process_time); + + return stat_list; +} + +template +bool DBImpl::MultiCFSnapshot( + const ReadOptions& read_options, ReadCallback* callback, + std::function& + iter_deref_func, + T* cf_list, SequenceNumber* snapshot) { + PERF_TIMER_GUARD(get_snapshot_time); + + bool last_try = false; + if (cf_list->size() == 1) { + // Fast path for a single column family. 
We can simply get the thread local
+    // super version
+    auto cf_iter = cf_list->begin();
+    auto node = iter_deref_func(cf_iter);
+    node->super_version = GetAndRefSuperVersion(node->cfd);
+    if (read_options.snapshot != nullptr) {
+      // Note: In WritePrepared txns this is not necessary but not harmful
+      // either. Because prep_seq > snapshot => commit_seq > snapshot, if a
+      // snapshot is specified we should be fine with skipping seq numbers
+      // that are greater than that.
+      //
+      // In WriteUnprepared, we cannot set the snapshot in the lookup key
+      // because we may skip uncommitted data that should be visible to the
+      // transaction for reading its own writes.
+      *snapshot =
+          static_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+      if (callback) {
+        *snapshot = std::max(*snapshot, callback->max_visible_seq());
+      }
+    } else {
+      // Since we get and reference the super version before getting
+      // the snapshot number, without mutex protection, it is possible
+      // that a memtable switch happened in the middle and not all the
+      // data for this snapshot is available. But it will contain all
+      // the data available in the super version we have, which is also
+      // a valid snapshot to read from.
+      // We shouldn't get the snapshot before finding and referencing the
+      // super version because a flush happening in between may compact away
+      // data for the snapshot, but the snapshot is earlier than the data
+      // overwriting it, so users may see wrong results.
+      *snapshot = GetLastPublishedSequence();
+    }
+  } else {
+    // If we end up with the same issue of a memtable getting sealed during
+    // two consecutive retries, it means the write rate is very high. In
+    // that case it is probably ok to take the mutex on the third try so we
+    // can succeed for sure.
+    constexpr int num_retries = 3;
+    for (int i = 0; i < num_retries; ++i) {
+      last_try = (i == num_retries - 1);
+      bool retry = false;
+
+      if (i > 0) {
+        for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+             ++cf_iter) {
+          auto node = iter_deref_func(cf_iter);
+          SuperVersion* super_version = node->super_version;
+          ColumnFamilyData* cfd = node->cfd;
+          if (super_version != nullptr) {
+            ReturnAndCleanupSuperVersion(cfd, super_version);
+          }
+          node->super_version = nullptr;
+        }
+      }
+      if (read_options.snapshot == nullptr) {
+        if (last_try) {
+          TEST_SYNC_POINT("DBImpl::MultiGet::LastTry");
+          // We're close to the max number of retries. For the last retry,
+          // acquire the lock so we're sure to succeed.
+          mutex_.Lock();
+        }
+        *snapshot = GetLastPublishedSequence();
+      } else {
+        *snapshot =
+            static_cast_with_check<const SnapshotImpl*>(read_options.snapshot)
+                ->number_;
+      }
+      for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+           ++cf_iter) {
+        auto node = iter_deref_func(cf_iter);
+        if (!last_try) {
+          node->super_version = GetAndRefSuperVersion(node->cfd);
+        } else {
+          node->super_version = node->cfd->GetSuperVersion()->Ref();
+        }
+        TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV");
+        if (read_options.snapshot != nullptr || last_try) {
+          // If the user passed a snapshot, then we don't care if a memtable
+          // is sealed or a compaction happens, because the snapshot would
+          // ensure that older key versions are kept around. If this is the
+          // last retry, then we have the lock, so nothing bad can happen.
+          continue;
+        }
+        // We could get the earliest sequence number for the whole list of
+        // memtables, which will include immutable memtables as well, but
+        // that might be tricky to maintain in case we decide, in future, to
+        // do memtable compaction.
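+
+        // Example of the race this loop guards against: after the super
+        // version is referenced, a flush seals the memtable and installs a
+        // new one whose earliest sequence number exceeds *snapshot. The
+        // check below detects that and retries with a fresh super version,
+        // taking the mutex only on the last attempt.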
+ if (!last_try) { + SequenceNumber seq = + node->super_version->mem->GetEarliestSequenceNumber(); + if (seq > *snapshot) { + retry = true; + break; + } + } + } + if (!retry) { + if (last_try) { + mutex_.Unlock(); + } + break; + } + } + } + + // Keep track of bytes that we read for statistics-recording later + PERF_TIMER_STOP(get_snapshot_time); + + return last_try; +} + +void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input) { + MultiGet(read_options, num_keys, column_families, keys, values, + /* timestamps */ nullptr, statuses, sorted_input); +} + +void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool sorted_input) { + MultiGetCommon(read_options, num_keys, column_families, keys, values, + /* columns */ nullptr, timestamps, statuses, sorted_input); +} + +void DBImpl::MultiGetCommon(const ReadOptions& read_options, + const size_t num_keys, + ColumnFamilyHandle** column_families, + const Slice* keys, PinnableSlice* values, + PinnableWideColumns* columns, + std::string* timestamps, Status* statuses, + const bool sorted_input) { + if (num_keys == 0) { + return; + } + + bool should_fail = false; + for (size_t i = 0; i < num_keys; ++i) { + ColumnFamilyHandle* cfh = column_families[i]; + assert(cfh); + if (read_options.timestamp) { + statuses[i] = FailIfTsMismatchCf(cfh, *(read_options.timestamp), + /*ts_for_read=*/true); + if (!statuses[i].ok()) { + should_fail = true; + } + } else { + statuses[i] = FailIfCfHasTs(cfh); + if (!statuses[i].ok()) { + should_fail = true; + } + } + } + if (should_fail) { + for (size_t i = 0; i < num_keys; ++i) { + if (statuses[i].ok()) { + statuses[i] = Status::Incomplete( + "DB not queried due to invalid argument(s) in the same MultiGet"); + } + } + return; + } + + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? + tracer_->MultiGet(num_keys, column_families, keys).PermitUncheckedError(); + } + } + + autovector key_context; + autovector sorted_keys; + sorted_keys.resize(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + PinnableSlice* val = nullptr; + PinnableWideColumns* col = nullptr; + + if (values) { + val = &values[i]; + val->Reset(); + } else { + assert(columns); + + col = &columns[i]; + col->Reset(); + } + + key_context.emplace_back(column_families[i], keys[i], val, col, + timestamps ? 
×tamps[i] : nullptr, + &statuses[i]); + } + for (size_t i = 0; i < num_keys; ++i) { + sorted_keys[i] = &key_context[i]; + } + PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys); + + autovector + multiget_cf_data; + size_t cf_start = 0; + ColumnFamilyHandle* cf = sorted_keys[0]->column_family; + + for (size_t i = 0; i < num_keys; ++i) { + KeyContext* key_ctx = sorted_keys[i]; + if (key_ctx->column_family != cf) { + multiget_cf_data.emplace_back(cf, cf_start, i - cf_start, nullptr); + cf_start = i; + cf = key_ctx->column_family; + } + } + + multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr); + + std::function::iterator&)> + iter_deref_lambda = + [](autovector::iterator& cf_iter) { + return &(*cf_iter); + }; + + SequenceNumber consistent_seqnum; + bool unref_only = MultiCFSnapshot< + autovector>( + read_options, nullptr, iter_deref_lambda, &multiget_cf_data, + &consistent_seqnum); + + GetWithTimestampReadCallback timestamp_read_callback(0); + ReadCallback* read_callback = nullptr; + if (read_options.timestamp && read_options.timestamp->size() > 0) { + timestamp_read_callback.Refresh(consistent_seqnum); + read_callback = ×tamp_read_callback; + } + + Status s; + auto cf_iter = multiget_cf_data.begin(); + for (; cf_iter != multiget_cf_data.end(); ++cf_iter) { + s = MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, + &sorted_keys, cf_iter->super_version, consistent_seqnum, + read_callback); + if (!s.ok()) { + break; + } + } + if (!s.ok()) { + assert(s.IsTimedOut() || s.IsAborted()); + for (++cf_iter; cf_iter != multiget_cf_data.end(); ++cf_iter) { + for (size_t i = cf_iter->start; i < cf_iter->start + cf_iter->num_keys; + ++i) { + *sorted_keys[i]->s = s; + } + } + } + + for (const auto& iter : multiget_cf_data) { + if (!unref_only) { + ReturnAndCleanupSuperVersion(iter.cfd, iter.super_version); + } else { + iter.cfd->GetSuperVersion()->Unref(); + } + } +} + +namespace { +// Order keys by CF ID, followed by key contents +struct CompareKeyContext { + inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) { + ColumnFamilyHandleImpl* cfh = + static_cast(lhs->column_family); + uint32_t cfd_id1 = cfh->cfd()->GetID(); + const Comparator* comparator = cfh->cfd()->user_comparator(); + cfh = static_cast(rhs->column_family); + uint32_t cfd_id2 = cfh->cfd()->GetID(); + + if (cfd_id1 < cfd_id2) { + return true; + } else if (cfd_id1 > cfd_id2) { + return false; + } + + // Both keys are from the same column family + int cmp = comparator->CompareWithoutTimestamp( + *(lhs->key), /*a_has_ts=*/false, *(rhs->key), /*b_has_ts=*/false); + if (cmp < 0) { + return true; + } + return false; + } +}; + +} // anonymous namespace + +void DBImpl::PrepareMultiGetKeys( + size_t num_keys, bool sorted_input, + autovector* sorted_keys) { + if (sorted_input) { +#ifndef NDEBUG + assert(std::is_sorted(sorted_keys->begin(), sorted_keys->end(), + CompareKeyContext())); +#endif + return; + } + + std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys, + CompareKeyContext()); +} + +void DBImpl::MultiGet(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const size_t num_keys, + const Slice* keys, PinnableSlice* values, + Status* statuses, const bool sorted_input) { + MultiGet(read_options, column_family, num_keys, keys, values, + /* timestamps */ nullptr, statuses, sorted_input); +} + +void DBImpl::MultiGet(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const size_t num_keys, + const Slice* keys, PinnableSlice* values, + std::string* 
timestamps, Status* statuses, + const bool sorted_input) { + MultiGetCommon(read_options, column_family, num_keys, keys, values, + /* columns */ nullptr, timestamps, statuses, sorted_input); +} + +void DBImpl::MultiGetCommon(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, PinnableWideColumns* columns, + std::string* timestamps, Status* statuses, + bool sorted_input) { + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? + tracer_->MultiGet(num_keys, column_family, keys).PermitUncheckedError(); + } + } + autovector key_context; + autovector sorted_keys; + sorted_keys.resize(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + PinnableSlice* val = nullptr; + PinnableWideColumns* col = nullptr; + + if (values) { + val = &values[i]; + val->Reset(); + } else { + assert(columns); + + col = &columns[i]; + col->Reset(); + } + + key_context.emplace_back(column_family, keys[i], val, col, + timestamps ? ×tamps[i] : nullptr, + &statuses[i]); + } + for (size_t i = 0; i < num_keys; ++i) { + sorted_keys[i] = &key_context[i]; + } + PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys); + MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys); +} + +void DBImpl::MultiGetWithCallback( + const ReadOptions& read_options, ColumnFamilyHandle* column_family, + ReadCallback* callback, + autovector* sorted_keys) { + std::array multiget_cf_data; + multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr); + std::function::iterator&)> + iter_deref_lambda = + [](std::array::iterator& cf_iter) { + return &(*cf_iter); + }; + + size_t num_keys = sorted_keys->size(); + SequenceNumber consistent_seqnum; + bool unref_only = MultiCFSnapshot>( + read_options, callback, iter_deref_lambda, &multiget_cf_data, + &consistent_seqnum); +#ifndef NDEBUG + assert(!unref_only); +#else + // Silence unused variable warning + (void)unref_only; +#endif // NDEBUG + + if (callback && read_options.snapshot == nullptr) { + // The unprep_seqs are not published for write unprepared, so it could be + // that max_visible_seq is larger. Seek to the std::max of the two. + // However, we still want our callback to contain the actual snapshot so + // that it can do the correct visibility filtering. + callback->Refresh(consistent_seqnum); + + // Internally, WriteUnpreparedTxnReadCallback::Refresh would set + // max_visible_seq = max(max_visible_seq, snapshot) + // + // Currently, the commented out assert is broken by + // InvalidSnapshotReadCallback, but if write unprepared recovery followed + // the regular transaction flow, then this special read callback would not + // be needed. 
+ // + // assert(callback->max_visible_seq() >= snapshot); + consistent_seqnum = callback->max_visible_seq(); + } + + GetWithTimestampReadCallback timestamp_read_callback(0); + ReadCallback* read_callback = callback; + if (read_options.timestamp && read_options.timestamp->size() > 0) { + assert(!read_callback); // timestamp with callback is not supported + timestamp_read_callback.Refresh(consistent_seqnum); + read_callback = ×tamp_read_callback; + } + + Status s = MultiGetImpl(read_options, 0, num_keys, sorted_keys, + multiget_cf_data[0].super_version, consistent_seqnum, + read_callback); + assert(s.ok() || s.IsTimedOut() || s.IsAborted()); + ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd, + multiget_cf_data[0].super_version); +} + +// The actual implementation of batched MultiGet. Parameters - +// start_key - Index in the sorted_keys vector to start processing from +// num_keys - Number of keys to lookup, starting with sorted_keys[start_key] +// sorted_keys - The entire batch of sorted keys for this CF +// +// The per key status is returned in the KeyContext structures pointed to by +// sorted_keys. An overall Status is also returned, with the only possible +// values being Status::OK() and Status::TimedOut(). The latter indicates +// that the call exceeded read_options.deadline +Status DBImpl::MultiGetImpl( + const ReadOptions& read_options, size_t start_key, size_t num_keys, + autovector* sorted_keys, + SuperVersion* super_version, SequenceNumber snapshot, + ReadCallback* callback) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call MultiGet with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); + + assert(sorted_keys); + // Clear the timestamps for returning results so that we can distinguish + // between tombstone or key that has never been written + for (auto* kctx : *sorted_keys) { + assert(kctx); + if (kctx->timestamp) { + kctx->timestamp->clear(); + } + } + + // For each of the given keys, apply the entire "get" process as follows: + // First look in the memtable, then in the immutable memtable (if any). + // s is both in/out. When in, s could either be OK or MergeInProgress. + // merge_operands will contain the sequence of merges in the latter case. + size_t keys_left = num_keys; + Status s; + uint64_t curr_value_size = 0; + while (keys_left) { + if (read_options.deadline.count() && + immutable_db_options_.clock->NowMicros() > + static_cast(read_options.deadline.count())) { + s = Status::TimedOut(); + break; + } + + size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE) + ? 
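+    // Batching example (illustrative): with MAX_BATCH_SIZE == 32, a call
+    // with 70 keys proceeds in batches of 32, 32, and 6; the deadline and
+    // value_size_soft_limit are re-checked between batches.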
MultiGetContext::MAX_BATCH_SIZE + : keys_left; + MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left, + batch_size, snapshot, read_options, GetFileSystem(), + stats_); + MultiGetRange range = ctx.GetMultiGetRange(); + range.AddValueSize(curr_value_size); + bool lookup_current = false; + + keys_left -= batch_size; + for (auto mget_iter = range.begin(); mget_iter != range.end(); + ++mget_iter) { + mget_iter->merge_context.Clear(); + *mget_iter->s = Status::OK(); + } + + bool skip_memtable = + (read_options.read_tier == kPersistedTier && + has_unpersisted_data_.load(std::memory_order_relaxed)); + if (!skip_memtable) { + super_version->mem->MultiGet(read_options, &range, callback, + false /* immutable_memtable */); + if (!range.empty()) { + super_version->imm->MultiGet(read_options, &range, callback); + } + if (!range.empty()) { + lookup_current = true; + uint64_t left = range.KeysLeft(); + RecordTick(stats_, MEMTABLE_MISS, left); + } + } + if (lookup_current) { + PERF_TIMER_GUARD(get_from_output_files_time); + super_version->current->MultiGet(read_options, &range, callback); + } + curr_value_size = range.GetValueSize(); + if (curr_value_size > read_options.value_size_soft_limit) { + s = Status::Aborted(); + break; + } + } + + // Post processing (decrement reference counts and record statistics) + PERF_TIMER_GUARD(get_post_process_time); + size_t num_found = 0; + uint64_t bytes_read = 0; + for (size_t i = start_key; i < start_key + num_keys - keys_left; ++i) { + KeyContext* key = (*sorted_keys)[i]; + assert(key); + assert(key->s); + + if (key->s->ok()) { + if (key->value) { + bytes_read += key->value->size(); + } else { + assert(key->columns); + bytes_read += key->columns->serialized_size(); + } + + num_found++; + } + } + if (keys_left) { + assert(s.IsTimedOut() || s.IsAborted()); + for (size_t i = start_key + num_keys - keys_left; i < start_key + num_keys; + ++i) { + KeyContext* key = (*sorted_keys)[i]; + *key->s = s; + } + } + + RecordTick(stats_, NUMBER_MULTIGET_CALLS); + RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); + RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found); + RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read); + RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read); + PERF_COUNTER_ADD(multiget_read_bytes, bytes_read); + PERF_TIMER_STOP(get_post_process_time); + + return s; +} + +void DBImpl::MultiGetEntity(const ReadOptions& options, size_t num_keys, + ColumnFamilyHandle** column_families, + const Slice* keys, PinnableWideColumns* results, + Status* statuses, bool sorted_input) { + MultiGetCommon(options, num_keys, column_families, keys, /* values */ nullptr, + results, /* timestamps */ nullptr, statuses, sorted_input); +} + +void DBImpl::MultiGetEntity(const ReadOptions& options, + ColumnFamilyHandle* column_family, size_t num_keys, + const Slice* keys, PinnableWideColumns* results, + Status* statuses, bool sorted_input) { + MultiGetCommon(options, column_family, num_keys, keys, /* values */ nullptr, + results, /* timestamps */ nullptr, statuses, sorted_input); +} + +Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, + const std::string& column_family, + ColumnFamilyHandle** handle) { + assert(handle != nullptr); + Status s = CreateColumnFamilyImpl(cf_options, column_family, handle); + if (s.ok()) { + s = WriteOptionsFile(true /*need_mutex_lock*/, + true /*need_enter_write_thread*/); + } + return s; +} + +Status DBImpl::CreateColumnFamilies( + const ColumnFamilyOptions& cf_options, + const std::vector& 
column_family_names, + std::vector* handles) { + assert(handles != nullptr); + handles->clear(); + size_t num_cf = column_family_names.size(); + Status s; + bool success_once = false; + for (size_t i = 0; i < num_cf; i++) { + ColumnFamilyHandle* handle; + s = CreateColumnFamilyImpl(cf_options, column_family_names[i], &handle); + if (!s.ok()) { + break; + } + handles->push_back(handle); + success_once = true; + } + if (success_once) { + Status persist_options_status = WriteOptionsFile( + true /*need_mutex_lock*/, true /*need_enter_write_thread*/); + if (s.ok() && !persist_options_status.ok()) { + s = persist_options_status; + } + } + return s; +} + +Status DBImpl::CreateColumnFamilies( + const std::vector& column_families, + std::vector* handles) { + assert(handles != nullptr); + handles->clear(); + size_t num_cf = column_families.size(); + Status s; + bool success_once = false; + for (size_t i = 0; i < num_cf; i++) { + ColumnFamilyHandle* handle; + s = CreateColumnFamilyImpl(column_families[i].options, + column_families[i].name, &handle); + if (!s.ok()) { + break; + } + handles->push_back(handle); + success_once = true; + } + if (success_once) { + Status persist_options_status = WriteOptionsFile( + true /*need_mutex_lock*/, true /*need_enter_write_thread*/); + if (s.ok() && !persist_options_status.ok()) { + s = persist_options_status; + } + } + return s; +} + +Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + Status s; + *handle = nullptr; + + DBOptions db_options = + BuildDBOptions(immutable_db_options_, mutable_db_options_); + s = ColumnFamilyData::ValidateOptions(db_options, cf_options); + if (s.ok()) { + for (auto& cf_path : cf_options.cf_paths) { + s = env_->CreateDirIfMissing(cf_path.path); + if (!s.ok()) { + break; + } + } + } + if (!s.ok()) { + return s; + } + + SuperVersionContext sv_context(/* create_superversion */ true); + { + InstrumentedMutexLock l(&mutex_); + + if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) != + nullptr) { + return Status::InvalidArgument("Column family already exists"); + } + VersionEdit edit; + edit.AddColumnFamily(column_family_name); + uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); + edit.SetColumnFamily(new_id); + edit.SetLogNumber(logfile_number_); + edit.SetComparatorName(cf_options.comparator->Name()); + + // LogAndApply will both write the creation in MANIFEST and create + // ColumnFamilyData object + { // write thread + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + // LogAndApply will both write the creation in MANIFEST and create + // ColumnFamilyData object + s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), + read_options, &edit, &mutex_, + directories_.GetDbDir(), false, &cf_options); + write_thread_.ExitUnbatched(&w); + } + if (s.ok()) { + auto* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); + assert(cfd != nullptr); + std::map> dummy_created_dirs; + s = cfd->AddDirectories(&dummy_created_dirs); + } + if (s.ok()) { + auto* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); + assert(cfd != nullptr); + InstallSuperVersionAndScheduleWork(cfd, &sv_context, + *cfd->GetLatestMutableCFOptions()); + + if (!cfd->mem()->IsSnapshotSupported()) { + is_snapshot_supported_ = false; + } + + cfd->set_initialized(); + + *handle = new 
ColumnFamilyHandleImpl(cfd, this, &mutex_); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Created column family [%s] (ID %u)", + column_family_name.c_str(), (unsigned)cfd->GetID()); + } else { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Creating column family [%s] FAILED -- %s", + column_family_name.c_str(), s.ToString().c_str()); + } + } // InstrumentedMutexLock l(&mutex_) + + if (cf_options.preserve_internal_time_seconds > 0 || + cf_options.preclude_last_level_data_seconds > 0) { + s = RegisterRecordSeqnoTimeWorker(); + } + sv_context.Clean(); + // this is outside the mutex + if (s.ok()) { + NewThreadStatusCfInfo( + static_cast_with_check(*handle)->cfd()); + } + return s; +} + +Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { + assert(column_family != nullptr); + Status s = DropColumnFamilyImpl(column_family); + if (s.ok()) { + s = WriteOptionsFile(true /*need_mutex_lock*/, + true /*need_enter_write_thread*/); + } + return s; +} + +Status DBImpl::DropColumnFamilies( + const std::vector& column_families) { + Status s; + bool success_once = false; + for (auto* handle : column_families) { + s = DropColumnFamilyImpl(handle); + if (!s.ok()) { + break; + } + success_once = true; + } + if (success_once) { + Status persist_options_status = WriteOptionsFile( + true /*need_mutex_lock*/, true /*need_enter_write_thread*/); + if (s.ok() && !persist_options_status.ok()) { + s = persist_options_status; + } + } + return s; +} + +Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + if (cfd->GetID() == 0) { + return Status::InvalidArgument("Can't drop default column family"); + } + + bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported(); + + VersionEdit edit; + edit.DropColumnFamily(); + edit.SetColumnFamily(cfd->GetID()); + + Status s; + { + InstrumentedMutexLock l(&mutex_); + if (cfd->IsDropped()) { + s = Status::InvalidArgument("Column family already dropped!\n"); + } + if (s.ok()) { + // we drop column family from a single write thread + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + read_options, &edit, &mutex_, + directories_.GetDbDir()); + write_thread_.ExitUnbatched(&w); + } + if (s.ok()) { + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + } + + if (!cf_support_snapshot) { + // Dropped Column Family doesn't support snapshot. Need to recalculate + // is_snapshot_supported_. + bool new_is_snapshot_supported = true; + for (auto c : *versions_->GetColumnFamilySet()) { + if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) { + new_is_snapshot_supported = false; + break; + } + } + is_snapshot_supported_ = new_is_snapshot_supported; + } + bg_cv_.SignalAll(); + } + + if (cfd->ioptions()->preserve_internal_time_seconds > 0 || + cfd->ioptions()->preclude_last_level_data_seconds > 0) { + s = RegisterRecordSeqnoTimeWorker(); + } + + if (s.ok()) { + // Note that here we erase the associated cf_info of the to-be-dropped + // cfd before its ref-count goes to zero to avoid having to erase cf_info + // later inside db_mutex. 
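+    // Usage sketch for the column family lifecycle implemented here
+    // (illustrative only; `db` is hypothetical):
+    //
+    //   ColumnFamilyHandle* cf = nullptr;
+    //   Status s = db->CreateColumnFamily(ColumnFamilyOptions(), "new_cf",
+    //                                     &cf);
+    //   if (s.ok()) {
+    //     s = db->DropColumnFamily(cf);
+    //   }
+    //   if (s.ok()) {
+    //     s = db->DestroyColumnFamilyHandle(cf);
+    //   }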
+    EraseThreadStatusCfInfo(cfd);
+    assert(cfd->IsDropped());
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Dropped column family with id %u\n", cfd->GetID());
+  } else {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Dropping column family with id %u FAILED -- %s\n",
+                    cfd->GetID(), s.ToString().c_str());
+  }
+
+  return s;
+}
+
+bool DBImpl::KeyMayExist(const ReadOptions& read_options,
+                         ColumnFamilyHandle* column_family, const Slice& key,
+                         std::string* value, std::string* timestamp,
+                         bool* value_found) {
+  assert(value != nullptr);
+  assert(read_options.io_activity == Env::IOActivity::kUnknown);
+
+  if (value_found != nullptr) {
+    // falsify later if key-may-exist but can't fetch value
+    *value_found = true;
+  }
+  // TODO: plumb Env::IOActivity
+  ReadOptions roptions = read_options;
+  roptions.read_tier = kBlockCacheTier;  // read from block cache only
+  PinnableSlice pinnable_val;
+  GetImplOptions get_impl_options;
+  get_impl_options.column_family = column_family;
+  get_impl_options.value = &pinnable_val;
+  get_impl_options.value_found = value_found;
+  get_impl_options.timestamp = timestamp;
+  auto s = GetImpl(roptions, key, get_impl_options);
+  value->assign(pinnable_val.data(), pinnable_val.size());
+
+  // If block_cache is enabled and the index block of the table is not
+  // present in block_cache, the return value will be Status::Incomplete.
+  // In this case, the key may still exist in the table.
+  return s.ok() || s.IsIncomplete();
+}
+
+Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
+                              ColumnFamilyHandle* column_family) {
+  if (read_options.managed) {
+    return NewErrorIterator(
+        Status::NotSupported("Managed iterator is not supported anymore."));
+  }
+  Iterator* result = nullptr;
+  if (read_options.read_tier == kPersistedTier) {
+    return NewErrorIterator(Status::NotSupported(
+        "ReadTier::kPersistedData is not yet supported in iterators."));
+  }
+  if (read_options.io_activity != Env::IOActivity::kUnknown) {
+    return NewErrorIterator(Status::InvalidArgument(
+        "Cannot call NewIterator with `ReadOptions::io_activity` != "
+        "`Env::IOActivity::kUnknown`"));
+  }
+
+  assert(column_family);
+
+  if (read_options.timestamp) {
+    const Status s = FailIfTsMismatchCf(
+        column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return NewErrorIterator(s);
+    }
+  } else {
+    const Status s = FailIfCfHasTs(column_family);
+    if (!s.ok()) {
+      return NewErrorIterator(s);
+    }
+  }
+
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl*>(column_family);
+  ColumnFamilyData* cfd = cfh->cfd();
+  assert(cfd != nullptr);
+  ReadCallback* read_callback = nullptr;  // No read callback provided.
+  if (read_options.tailing) {
+    SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+    auto iter = new ForwardIterator(this, read_options, cfd, sv,
+                                    /* allow_unprepared_value */ true);
+    result = NewDBIterator(
+        env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+        cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber,
+        sv->mutable_cf_options.max_sequential_skip_in_iterations,
+        read_callback, this, cfd);
+  } else {
+    // Note: no need to consider the special case of
+    // last_seq_same_as_publish_seq_==false since NewIterator is overridden
+    // in WritePreparedTxnDB
+    result = NewIteratorImpl(read_options, cfd,
+                             (read_options.snapshot != nullptr)
+                                 ?
read_options.snapshot->GetSequenceNumber() + : kMaxSequenceNumber, + read_callback); + } + return result; +} + +ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + ReadCallback* read_callback, + bool expose_blob_index, + bool allow_refresh) { + SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + + TEST_SYNC_POINT("DBImpl::NewIterator:1"); + TEST_SYNC_POINT("DBImpl::NewIterator:2"); + + if (snapshot == kMaxSequenceNumber) { + // Note that the snapshot is assigned AFTER referencing the super + // version because otherwise a flush happening in between may compact away + // data for the snapshot, so the reader would see neither data that was be + // visible to the snapshot before compaction nor the newer data inserted + // afterwards. + // Note that the super version might not contain all the data available + // to this snapshot, but in that case it can see all the data in the + // super version, which is a valid consistent state after the user + // calls NewIterator(). + snapshot = versions_->LastSequence(); + TEST_SYNC_POINT("DBImpl::NewIterator:3"); + TEST_SYNC_POINT("DBImpl::NewIterator:4"); + } + + // Try to generate a DB iterator tree in continuous memory area to be + // cache friendly. Here is an example of result: + // +-------------------------------+ + // | | + // | ArenaWrappedDBIter | + // | + | + // | +---> Inner Iterator ------------+ + // | | | | + // | | +-- -- -- -- -- -- -- --+ | + // | +--- | Arena | | + // | | | | + // | Allocated Memory: | | + // | | +-------------------+ | + // | | | DBIter | <---+ + // | | + | + // | | | +-> iter_ ------------+ + // | | | | | + // | | +-------------------+ | + // | | | MergingIterator | <---+ + // | | + | + // | | | +->child iter1 ------------+ + // | | | | | | + // | | +->child iter2 ----------+ | + // | | | | | | | + // | | | +->child iter3 --------+ | | + // | | | | | | + // | | +-------------------+ | | | + // | | | Iterator1 | <--------+ + // | | +-------------------+ | | + // | | | Iterator2 | <------+ + // | | +-------------------+ | + // | | | Iterator3 | <----+ + // | | +-------------------+ + // | | | + // +-------+-----------------------+ + // + // ArenaWrappedDBIter inlines an arena area where all the iterators in + // the iterator tree are allocated in the order of being accessed when + // querying. + // Laying out the iterators in the order of being accessed makes it more + // likely that any iterator pointer is close to the iterator it points to so + // that they are likely to be in the same cache line and/or page. + ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, sv->current, + snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_callback, this, cfd, expose_blob_index, + read_options.snapshot != nullptr ? 
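+      // Caller-side sketch of the iterator built here (illustrative only;
+      // `db` is hypothetical):
+      //
+      //   std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions()));
+      //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      //     // use it->key() and it->value()
+      //   }
+      //   Status s = it->status();  // check for background errors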
false : allow_refresh); + + InternalIterator* internal_iter = NewInternalIterator( + db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), snapshot, + /* allow_unprepared_value */ true, db_iter); + db_iter->SetIterUnderDBIter(internal_iter); + + return db_iter; +} + +Status DBImpl::NewIterators( + const ReadOptions& read_options, + const std::vector& column_families, + std::vector* iterators) { + if (read_options.managed) { + return Status::NotSupported("Managed iterator is not supported anymore."); + } + if (read_options.read_tier == kPersistedTier) { + return Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators."); + } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call NewIterators with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + + if (read_options.timestamp) { + for (auto* cf : column_families) { + assert(cf); + const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp), + /*ts_for_read=*/true); + if (!s.ok()) { + return s; + } + } + } else { + for (auto* cf : column_families) { + assert(cf); + const Status s = FailIfCfHasTs(cf); + if (!s.ok()) { + return s; + } + } + } + + ReadCallback* read_callback = nullptr; // No read callback provided. + iterators->clear(); + iterators->reserve(column_families.size()); + if (read_options.tailing) { + for (auto cfh : column_families) { + auto cfd = static_cast_with_check(cfh)->cfd(); + SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + auto iter = new ForwardIterator(this, read_options, cfd, sv, + /* allow_unprepared_value */ true); + iterators->push_back(NewDBIterator( + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, + cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + read_callback, this, cfd)); + } + } else { + // Note: no need to consider the special case of + // last_seq_same_as_publish_seq_==false since NewIterators is overridden in + // WritePreparedTxnDB + auto snapshot = read_options.snapshot != nullptr + ? 
read_options.snapshot->GetSequenceNumber() + : versions_->LastSequence(); + for (size_t i = 0; i < column_families.size(); ++i) { + auto* cfd = + static_cast_with_check(column_families[i]) + ->cfd(); + iterators->push_back( + NewIteratorImpl(read_options, cfd, snapshot, read_callback)); + } + } + + return Status::OK(); +} + +const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); } + +const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { + return GetSnapshotImpl(true); +} + +std::pair> +DBImpl::CreateTimestampedSnapshot(SequenceNumber snapshot_seq, uint64_t ts) { + assert(ts != std::numeric_limits::max()); + + auto ret = CreateTimestampedSnapshotImpl(snapshot_seq, ts, /*lock=*/true); + return ret; +} + +std::shared_ptr DBImpl::GetTimestampedSnapshot( + uint64_t ts) const { + InstrumentedMutexLock lock_guard(&mutex_); + return timestamped_snapshots_.GetSnapshot(ts); +} + +void DBImpl::ReleaseTimestampedSnapshotsOlderThan(uint64_t ts, + size_t* remaining_total_ss) { + autovector> snapshots_to_release; + { + InstrumentedMutexLock lock_guard(&mutex_); + timestamped_snapshots_.ReleaseSnapshotsOlderThan(ts, snapshots_to_release); + } + snapshots_to_release.clear(); + + if (remaining_total_ss) { + InstrumentedMutexLock lock_guard(&mutex_); + *remaining_total_ss = static_cast(snapshots_.count()); + } +} + +Status DBImpl::GetTimestampedSnapshots( + uint64_t ts_lb, uint64_t ts_ub, + std::vector>& timestamped_snapshots) const { + if (ts_lb >= ts_ub) { + return Status::InvalidArgument( + "timestamp lower bound must be smaller than upper bound"); + } + timestamped_snapshots.clear(); + InstrumentedMutexLock lock_guard(&mutex_); + timestamped_snapshots_.GetSnapshots(ts_lb, ts_ub, timestamped_snapshots); + return Status::OK(); +} + +SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary, + bool lock) { + int64_t unix_time = 0; + immutable_db_options_.clock->GetCurrentTime(&unix_time) + .PermitUncheckedError(); // Ignore error + SnapshotImpl* s = new SnapshotImpl; + + if (lock) { + mutex_.Lock(); + } else { + mutex_.AssertHeld(); + } + // returns null if the underlying memtable does not support snapshot. + if (!is_snapshot_supported_) { + if (lock) { + mutex_.Unlock(); + } + delete s; + return nullptr; + } + auto snapshot_seq = GetLastPublishedSequence(); + SnapshotImpl* snapshot = + snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary); + if (lock) { + mutex_.Unlock(); + } + return snapshot; +} + +std::pair> +DBImpl::CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts, + bool lock) { + int64_t unix_time = 0; + immutable_db_options_.clock->GetCurrentTime(&unix_time) + .PermitUncheckedError(); // Ignore error + SnapshotImpl* s = new SnapshotImpl; + + const bool need_update_seq = (snapshot_seq != kMaxSequenceNumber); + + if (lock) { + mutex_.Lock(); + } else { + mutex_.AssertHeld(); + } + // returns null if the underlying memtable does not support snapshot. + if (!is_snapshot_supported_) { + if (lock) { + mutex_.Unlock(); + } + delete s; + return std::make_pair( + Status::NotSupported("Memtable does not support snapshot"), nullptr); + } + + // Caller is not write thread, thus didn't provide a valid snapshot_seq. + // Obtain seq from db. + if (!need_update_seq) { + snapshot_seq = GetLastPublishedSequence(); + } + + std::shared_ptr latest = + timestamped_snapshots_.GetSnapshot(std::numeric_limits::max()); + + // If there is already a latest timestamped snapshot, then we need to do some + // checks. 
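+  // For example (illustrative values): with the latest timestamped snapshot
+  // at (seq=100, ts=50), a request for ts=40 fails, a request for
+  // (seq=100, ts=50) reuses the existing snapshot, and a request for
+  // (seq=120, ts=50) fails because newer writes exist at the same timestamp.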
+ if (latest) { + uint64_t latest_snap_ts = latest->GetTimestamp(); + SequenceNumber latest_snap_seq = latest->GetSequenceNumber(); + assert(latest_snap_seq <= snapshot_seq); + bool needs_create_snap = true; + Status status; + std::shared_ptr ret; + if (latest_snap_ts > ts) { + // A snapshot created later cannot have smaller timestamp than a previous + // timestamped snapshot. + needs_create_snap = false; + std::ostringstream oss; + oss << "snapshot exists with larger timestamp " << latest_snap_ts << " > " + << ts; + status = Status::InvalidArgument(oss.str()); + } else if (latest_snap_ts == ts) { + if (latest_snap_seq == snapshot_seq) { + // We are requesting the same sequence number and timestamp, thus can + // safely reuse (share) the current latest timestamped snapshot. + needs_create_snap = false; + ret = latest; + } else if (latest_snap_seq < snapshot_seq) { + // There may have been writes to the database since the latest + // timestamped snapshot, yet we are still requesting the same + // timestamp. In this case, we cannot create the new timestamped + // snapshot. + needs_create_snap = false; + std::ostringstream oss; + oss << "Allocated seq is " << snapshot_seq + << ", while snapshot exists with smaller seq " << latest_snap_seq + << " but same timestamp " << ts; + status = Status::InvalidArgument(oss.str()); + } + } + if (!needs_create_snap) { + if (lock) { + mutex_.Unlock(); + } + delete s; + return std::make_pair(status, ret); + } else { + status.PermitUncheckedError(); + } + } + + SnapshotImpl* snapshot = + snapshots_.New(s, snapshot_seq, unix_time, + /*is_write_conflict_boundary=*/true, ts); + + std::shared_ptr ret( + snapshot, + std::bind(&DBImpl::ReleaseSnapshot, this, std::placeholders::_1)); + timestamped_snapshots_.AddSnapshot(ret); + + // Caller is from write thread, and we need to update database's sequence + // number. + if (need_update_seq) { + assert(versions_); + if (last_seq_same_as_publish_seq_) { + versions_->SetLastSequence(snapshot_seq); + } else { + // TODO: support write-prepared/write-unprepared transactions with two + // write queues. + assert(false); + } + } + + if (lock) { + mutex_.Unlock(); + } + return std::make_pair(Status::OK(), ret); +} + +namespace { +using CfdList = autovector; +bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) { + for (const ColumnFamilyData* t : list) { + if (t == cfd) { + return true; + } + } + return false; +} +} // namespace + +void DBImpl::ReleaseSnapshot(const Snapshot* s) { + if (s == nullptr) { + // DBImpl::GetSnapshot() can return nullptr when snapshot + // not supported by specifying the condition: + // inplace_update_support enabled. + return; + } + const SnapshotImpl* casted_s = reinterpret_cast(s); + { + InstrumentedMutexLock l(&mutex_); + snapshots_.Delete(casted_s); + uint64_t oldest_snapshot; + if (snapshots_.empty()) { + oldest_snapshot = GetLastPublishedSequence(); + } else { + oldest_snapshot = snapshots_.oldest()->number_; + } + // Avoid to go through every column family by checking a global threshold + // first. 
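+    // Example (illustrative values): with snapshots at sequence numbers
+    // {10, 20, 30} and bottommost_files_mark_threshold_ == 25, releasing
+    // the snapshot at 10 leaves oldest_snapshot == 20 <= 25, so the per-CF
+    // scan below is skipped; once only the snapshot at 30 remains,
+    // oldest_snapshot == 30 > 25 and the scan runs.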
+ if (oldest_snapshot > bottommost_files_mark_threshold_) { + CfdList cf_scheduled; + for (auto* cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->ioptions()->allow_ingest_behind) { + cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot); + if (!cfd->current() + ->storage_info() + ->BottommostFilesMarkedForCompaction() + .empty()) { + SchedulePendingCompaction(cfd); + MaybeScheduleFlushOrCompaction(); + cf_scheduled.push_back(cfd); + } + } + } + + // Calculate a new threshold, skipping those CFs where compactions are + // scheduled. We do not do the same pass as the previous loop because + // mutex might be unlocked during the loop, making the result inaccurate. + SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber; + for (auto* cfd : *versions_->GetColumnFamilySet()) { + if (CfdListContains(cf_scheduled, cfd) || + cfd->ioptions()->allow_ingest_behind) { + continue; + } + new_bottommost_files_mark_threshold = std::min( + new_bottommost_files_mark_threshold, + cfd->current()->storage_info()->bottommost_files_mark_threshold()); + } + bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold; + } + } + delete casted_s; +} + +Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) { + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + + // Increment the ref count + mutex_.Lock(); + auto version = cfd->current(); + version->Ref(); + mutex_.Unlock(); + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = version->GetPropertiesOfAllTables(read_options, props); + + // Decrement the ref count + mutex_.Lock(); + version->Unref(); + mutex_.Unlock(); + + return s; +} + +Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, + const Range* range, std::size_t n, + TablePropertiesCollection* props) { + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + + // Increment the ref count + mutex_.Lock(); + auto version = cfd->current(); + version->Ref(); + mutex_.Unlock(); + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = version->GetPropertiesOfTablesInRange(read_options, range, n, props); + + // Decrement the ref count + mutex_.Lock(); + version->Unref(); + mutex_.Unlock(); + + return s; +} + + +const std::string& DBImpl::GetName() const { return dbname_; } + +Env* DBImpl::GetEnv() const { return env_; } + +FileSystem* DB::GetFileSystem() const { + const auto& fs = GetEnv()->GetFileSystem(); + return fs.get(); +} + +FileSystem* DBImpl::GetFileSystem() const { + return immutable_db_options_.fs.get(); +} + +SystemClock* DBImpl::GetSystemClock() const { + return immutable_db_options_.clock; +} + + +Status DBImpl::StartIOTrace(const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + assert(trace_writer != nullptr); + return io_tracer_->StartIOTrace(GetSystemClock(), trace_options, + std::move(trace_writer)); +} + +Status DBImpl::EndIOTrace() { + io_tracer_->EndIOTrace(); + return Status::OK(); +} + + +Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const { + InstrumentedMutexLock l(&mutex_); + auto cfh = static_cast_with_check(column_family); + return Options(BuildDBOptions(immutable_db_options_, mutable_db_options_), + cfh->cfd()->GetLatestCFOptions()); +} + +DBOptions DBImpl::GetDBOptions() const { + InstrumentedMutexLock l(&mutex_); + return BuildDBOptions(immutable_db_options_, mutable_db_options_); +} + +bool 
DBImpl::GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + value->clear(); + auto cfd = + static_cast_with_check(column_family)->cfd(); + if (property_info == nullptr) { + return false; + } else if (property_info->handle_int) { + uint64_t int_value; + bool ret_value = + GetIntPropertyInternal(cfd, *property_info, false, &int_value); + if (ret_value) { + *value = std::to_string(int_value); + } + return ret_value; + } else if (property_info->handle_string) { + if (property_info->need_out_of_mutex) { + return cfd->internal_stats()->GetStringProperty(*property_info, property, + value); + } else { + InstrumentedMutexLock l(&mutex_); + return cfd->internal_stats()->GetStringProperty(*property_info, property, + value); + } + } else if (property_info->handle_string_dbimpl) { + if (property_info->need_out_of_mutex) { + return (this->*(property_info->handle_string_dbimpl))(value); + } else { + InstrumentedMutexLock l(&mutex_); + return (this->*(property_info->handle_string_dbimpl))(value); + } + } + // Shouldn't reach here since exactly one of handle_string and handle_int + // should be non-nullptr. + assert(false); + return false; +} + +bool DBImpl::GetMapProperty(ColumnFamilyHandle* column_family, + const Slice& property, + std::map* value) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + value->clear(); + auto cfd = + static_cast_with_check(column_family)->cfd(); + if (property_info == nullptr) { + return false; + } else if (property_info->handle_map) { + if (property_info->need_out_of_mutex) { + return cfd->internal_stats()->GetMapProperty(*property_info, property, + value); + } else { + InstrumentedMutexLock l(&mutex_); + return cfd->internal_stats()->GetMapProperty(*property_info, property, + value); + } + } + // If we reach this point it means that handle_map is not provided for the + // requested property + return false; +} + +bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family, + const Slice& property, uint64_t* value) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + if (property_info == nullptr || property_info->handle_int == nullptr) { + return false; + } + auto cfd = + static_cast_with_check(column_family)->cfd(); + return GetIntPropertyInternal(cfd, *property_info, false, value); +} + +bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd, + const DBPropertyInfo& property_info, + bool is_locked, uint64_t* value) { + assert(property_info.handle_int != nullptr); + if (!property_info.need_out_of_mutex) { + if (is_locked) { + mutex_.AssertHeld(); + return cfd->internal_stats()->GetIntProperty(property_info, value, this); + } else { + InstrumentedMutexLock l(&mutex_); + return cfd->internal_stats()->GetIntProperty(property_info, value, this); + } + } else { + SuperVersion* sv = nullptr; + if (is_locked) { + mutex_.Unlock(); + } + sv = GetAndRefSuperVersion(cfd); + + bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex( + property_info, sv->current, value); + + ReturnAndCleanupSuperVersion(cfd, sv); + if (is_locked) { + mutex_.Lock(); + } + + return ret; + } +} + +bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) { + assert(value != nullptr); + Statistics* statistics = immutable_db_options_.stats; + if (!statistics) { + return false; + } + *value = statistics->ToString(); + return true; +} + +Status DBImpl::ResetStats() { + InstrumentedMutexLock l(&mutex_); + for (auto* cfd : 
*versions_->GetColumnFamilySet()) { + if (cfd->initialized()) { + cfd->internal_stats()->Clear(); + } + } + return Status::OK(); +} + +bool DBImpl::GetAggregatedIntProperty(const Slice& property, + uint64_t* aggregated_value) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + if (property_info == nullptr || property_info->handle_int == nullptr) { + return false; + } + + uint64_t sum = 0; + bool ret = true; + { + // Needs mutex to protect the list of column families. + InstrumentedMutexLock l(&mutex_); + uint64_t value; + for (auto* cfd : versions_->GetRefedColumnFamilySet()) { + if (!cfd->initialized()) { + continue; + } + ret = GetIntPropertyInternal(cfd, *property_info, true, &value); + // GetIntPropertyInternal may release db mutex and re-acquire it. + mutex_.AssertHeld(); + if (ret) { + sum += value; + } else { + ret = false; + break; + } + } + } + *aggregated_value = sum; + return ret; +} + +SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { + // TODO(ljin): consider using GetReferencedSuperVersion() directly + return cfd->GetThreadLocalSuperVersion(this); +} + +// REQUIRED: this function should only be called on the write thread or if the +// mutex is held. +SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) { + auto column_family_set = versions_->GetColumnFamilySet(); + auto cfd = column_family_set->GetColumnFamily(column_family_id); + if (!cfd) { + return nullptr; + } + + return GetAndRefSuperVersion(cfd); +} + +void DBImpl::CleanupSuperVersion(SuperVersion* sv) { + // Release SuperVersion + if (sv->Unref()) { + bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io; + { + InstrumentedMutexLock l(&mutex_); + sv->Cleanup(); + if (defer_purge) { + AddSuperVersionsToFreeQueue(sv); + SchedulePurge(); + } + } + if (!defer_purge) { + delete sv; + } + RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS); + } + RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES); +} + +void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, + SuperVersion* sv) { + if (!cfd->ReturnThreadLocalSuperVersion(sv)) { + CleanupSuperVersion(sv); + } +} + +// REQUIRED: this function should only be called on the write thread. +void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id, + SuperVersion* sv) { + auto column_family_set = versions_->GetColumnFamilySet(); + auto cfd = column_family_set->GetColumnFamily(column_family_id); + + // If SuperVersion is held, and we successfully fetched a cfd using + // GetAndRefSuperVersion(), it must still exist. + assert(cfd != nullptr); + ReturnAndCleanupSuperVersion(cfd, sv); +} + +// REQUIRED: this function should only be called on the write thread or if the +// mutex is held. +ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) { + ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get(); + + if (!cf_memtables->Seek(column_family_id)) { + return nullptr; + } + + return cf_memtables->GetColumnFamilyHandle(); +} + +// REQUIRED: mutex is NOT held. 
+std::unique_ptr DBImpl::GetColumnFamilyHandleUnlocked( + uint32_t column_family_id) { + InstrumentedMutexLock l(&mutex_); + + auto* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id); + if (cfd == nullptr) { + return nullptr; + } + + return std::unique_ptr( + new ColumnFamilyHandleImpl(cfd, this, &mutex_)); +} + +void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family, + const Range& range, + uint64_t* const count, + uint64_t* const size) { + ColumnFamilyHandleImpl* cfh = + static_cast_with_check(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + SuperVersion* sv = GetAndRefSuperVersion(cfd); + + // Convert user_key into a corresponding internal key. + InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek); + MemTable::MemTableStats memStats = + sv->mem->ApproximateStats(k1.Encode(), k2.Encode()); + MemTable::MemTableStats immStats = + sv->imm->ApproximateStats(k1.Encode(), k2.Encode()); + *count = memStats.count + immStats.count; + *size = memStats.size + immStats.size; + + ReturnAndCleanupSuperVersion(cfd, sv); +} + +Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes) { + if (!options.include_memtables && !options.include_files) { + return Status::InvalidArgument("Invalid options"); + } + + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + + Version* v; + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + SuperVersion* sv = GetAndRefSuperVersion(cfd); + v = sv->current; + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + for (int i = 0; i < n; i++) { + Slice start = range[i].start; + Slice limit = range[i].limit; + + // Add timestamp if needed + std::string start_with_ts, limit_with_ts; + if (ts_sz > 0) { + // Maximum timestamp means including all key with any timestamp + AppendKeyWithMaxTimestamp(&start_with_ts, start, ts_sz); + // Append a maximum timestamp as the range limit is exclusive: + // [start, limit) + AppendKeyWithMaxTimestamp(&limit_with_ts, limit, ts_sz); + start = start_with_ts; + limit = limit_with_ts; + } + // Convert user_key into a corresponding internal key. + InternalKey k1(start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(limit, kMaxSequenceNumber, kValueTypeForSeek); + sizes[i] = 0; + if (options.include_files) { + sizes[i] += versions_->ApproximateSize( + options, read_options, v, k1.Encode(), k2.Encode(), /*start_level=*/0, + /*end_level=*/-1, TableReaderCaller::kUserApproximateSize); + } + if (options.include_memtables) { + sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size; + sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size; + } + } + + ReturnAndCleanupSuperVersion(cfd, sv); + return Status::OK(); +} + +std::list::iterator +DBImpl::CaptureCurrentFileNumberInPendingOutputs() { + // We need to remember the iterator of our insert, because after the + // background job is done, we need to remove that element from + // pending_outputs_. 
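+  // Sketch of the intended pairing (illustrative; this is the pattern the
+  // ingestion code later in this file follows):
+  //
+  //   std::unique_ptr<std::list<uint64_t>::iterator> elem(
+  //       new std::list<uint64_t>::iterator(
+  //           CaptureCurrentFileNumberInPendingOutputs()));
+  //   // ... background job runs and may allocate new file numbers ...
+  //   ReleaseFileNumberFromPendingOutputs(elem);
+  //
+  // A std::list backs pending_outputs_ because its iterators remain valid
+  // while other elements are inserted or erased under the mutex.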
+ pending_outputs_.push_back(versions_->current_next_file_number()); + auto pending_outputs_inserted_elem = pending_outputs_.end(); + --pending_outputs_inserted_elem; + return pending_outputs_inserted_elem; +} + +void DBImpl::ReleaseFileNumberFromPendingOutputs( + std::unique_ptr::iterator>& v) { + if (v.get() != nullptr) { + pending_outputs_.erase(*v.get()); + v.reset(); + } +} + +Status DBImpl::GetUpdatesSince( + SequenceNumber seq, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options) { + RecordTick(stats_, GET_UPDATES_SINCE_CALLS); + if (seq_per_batch_) { + return Status::NotSupported( + "This API is not yet compatible with write-prepared/write-unprepared " + "transactions"); + } + if (seq > versions_->LastSequence()) { + return Status::NotFound("Requested sequence not yet written in the db"); + } + return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get()); +} + +Status DBImpl::DeleteFile(std::string name) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + uint64_t number; + FileType type; + WalFileType log_type; + if (!ParseFileName(name, &number, &type, &log_type) || + (type != kTableFile && type != kWalFile)) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n", + name.c_str()); + return Status::InvalidArgument("Invalid file name"); + } + + if (type == kWalFile) { + // Only allow deleting archived log files + if (log_type != kArchivedLogFile) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "DeleteFile %s failed - not archived log.\n", + name.c_str()); + return Status::NotSupported("Delete only supported for archived logs"); + } + Status status = wal_manager_.DeleteFile(name, number); + if (!status.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "DeleteFile %s failed -- %s.\n", name.c_str(), + status.ToString().c_str()); + } + return status; + } + + Status status; + int level; + FileMetaData* metadata; + ColumnFamilyData* cfd; + VersionEdit edit; + JobContext job_context(next_job_id_.fetch_add(1), true); + { + InstrumentedMutexLock l(&mutex_); + status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); + if (!status.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "DeleteFile %s failed. File not found\n", name.c_str()); + job_context.Clean(); + return Status::InvalidArgument("File not found"); + } + assert(level < cfd->NumberLevels()); + + // If the file is being compacted no need to delete. + if (metadata->being_compacted) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "DeleteFile %s Skipped. File about to be compacted\n", + name.c_str()); + job_context.Clean(); + return Status::OK(); + } + + // Only the files in the last level can be deleted externally. + // This is to make sure that any deletion tombstones are not + // lost. Check that the level passed is the last level. + auto* vstoreage = cfd->current()->storage_info(); + for (int i = level + 1; i < cfd->NumberLevels(); i++) { + if (vstoreage->NumLevelFiles(i) != 0) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "DeleteFile %s FAILED. 
File not in last level\n", + name.c_str()); + job_context.Clean(); + return Status::InvalidArgument("File not in last level"); + } + } + // if level == 0, it has to be the oldest file + if (level == 0 && + vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "DeleteFile %s failed ---" + " target file in level 0 must be the oldest.", + name.c_str()); + job_context.Clean(); + return Status::InvalidArgument("File in level 0, but not oldest"); + } + edit.SetColumnFamily(cfd->GetID()); + edit.DeleteFile(level, number); + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + read_options, &edit, &mutex_, + directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); + } + FindObsoleteFiles(&job_context, false); + } // lock released here + + LogFlush(immutable_db_options_.info_log); + // remove files outside the db-lock + if (job_context.HaveSomethingToDelete()) { + // Call PurgeObsoleteFiles() without holding mutex. + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + return status; +} + +Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, + const RangePtr* ranges, size_t n, + bool include_end) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + Status status = Status::OK(); + auto cfh = static_cast_with_check(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + VersionEdit edit; + std::set deleted_files; + JobContext job_context(next_job_id_.fetch_add(1), true); + { + InstrumentedMutexLock l(&mutex_); + Version* input_version = cfd->current(); + + auto* vstorage = input_version->storage_info(); + for (size_t r = 0; r < n; r++) { + auto begin = ranges[r].start, end = ranges[r].limit; + for (int i = 1; i < cfd->NumberLevels(); i++) { + if (vstorage->LevelFiles(i).empty() || + !vstorage->OverlapInLevel(i, begin, end)) { + continue; + } + std::vector level_files; + InternalKey begin_storage, end_storage, *begin_key, *end_key; + if (begin == nullptr) { + begin_key = nullptr; + } else { + begin_storage.SetMinPossibleForUserKey(*begin); + begin_key = &begin_storage; + } + if (end == nullptr) { + end_key = nullptr; + } else { + end_storage.SetMaxPossibleForUserKey(*end); + end_key = &end_storage; + } + + vstorage->GetCleanInputsWithinInterval( + i, begin_key, end_key, &level_files, -1 /* hint_index */, + nullptr /* file_index */); + FileMetaData* level_file; + for (uint32_t j = 0; j < level_files.size(); j++) { + level_file = level_files[j]; + if (level_file->being_compacted) { + continue; + } + if (deleted_files.find(level_file) != deleted_files.end()) { + continue; + } + if (!include_end && end != nullptr && + cfd->user_comparator()->Compare(level_file->largest.user_key(), + *end) == 0) { + continue; + } + edit.SetColumnFamily(cfd->GetID()); + edit.DeleteFile(i, level_file->fd.GetNumber()); + deleted_files.insert(level_file); + level_file->being_compacted = true; + } + vstorage->ComputeCompactionScore(*cfd->ioptions(), + *cfd->GetLatestMutableCFOptions()); + } + } + if (edit.GetDeletedFiles().empty()) { + job_context.Clean(); + return status; + } + input_version->Ref(); + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + read_options, &edit, &mutex_, + directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); + } + for (auto* 
deleted_file : deleted_files) { + deleted_file->being_compacted = false; + } + input_version->Unref(); + FindObsoleteFiles(&job_context, false); + } // lock released here + + LogFlush(immutable_db_options_.info_log); + // remove files outside the db-lock + if (job_context.HaveSomethingToDelete()) { + // Call PurgeObsoleteFiles() without holding mutex. + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + return status; +} + +void DBImpl::GetLiveFilesMetaData(std::vector* metadata) { + InstrumentedMutexLock l(&mutex_); + versions_->GetLiveFilesMetaData(metadata); +} + +Status DBImpl::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { + InstrumentedMutexLock l(&mutex_); + return versions_->GetLiveFilesChecksumInfo(checksum_list); +} + +void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* cf_meta) { + assert(column_family); + auto* cfd = + static_cast_with_check(column_family)->cfd(); + auto* sv = GetAndRefSuperVersion(cfd); + { + // Without mutex, Version::GetColumnFamilyMetaData will have data race with + // Compaction::MarkFilesBeingCompacted. One solution is to use mutex, but + // this may cause regression. An alternative is to make + // FileMetaData::being_compacted atomic, but it will make FileMetaData + // non-copy-able. Another option is to separate these variables from + // original FileMetaData struct, and this requires re-organization of data + // structures. For now, we take the easy approach. If + // DB::GetColumnFamilyMetaData is not called frequently, the regression + // should not be big. We still need to keep an eye on it. + InstrumentedMutexLock l(&mutex_); + sv->current->GetColumnFamilyMetaData(cf_meta); + } + ReturnAndCleanupSuperVersion(cfd, sv); +} + +void DBImpl::GetAllColumnFamilyMetaData( + std::vector* metadata) { + InstrumentedMutexLock l(&mutex_); + for (auto cfd : *(versions_->GetColumnFamilySet())) { + { + metadata->emplace_back(); + cfd->current()->GetColumnFamilyMetaData(&metadata->back()); + } + } +} + +Status DBImpl::CheckConsistency() { + mutex_.AssertHeld(); + std::vector metadata; + versions_->GetLiveFilesMetaData(&metadata); + TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData"); + + std::string corruption_messages; + + if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) { + // Instead of calling GetFileSize() for each expected file, call + // GetChildren() for the DB directory and check that all expected files + // are listed, without checking their sizes. + // Since sst files might be in different directories, do it for each + // directory separately. + std::map> files_by_directory; + for (const auto& md : metadata) { + // md.name has a leading "/". Remove it. 
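+      // e.g. (illustrative values) md.name == "/000123.sst" with
+      // md.db_path == "/path/to/db" groups "000123.sst" under the
+      // "/path/to/db" entry below.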
+ std::string fname = md.name; + if (!fname.empty() && fname[0] == '/') { + fname = fname.substr(1); + } + files_by_directory[md.db_path].push_back(fname); + } + + IOOptions io_opts; + io_opts.do_not_recurse = true; + for (const auto& dir_files : files_by_directory) { + std::string directory = dir_files.first; + std::vector existing_files; + Status s = fs_->GetChildren(directory, io_opts, &existing_files, + /*IODebugContext*=*/nullptr); + if (!s.ok()) { + corruption_messages += + "Can't list files in " + directory + ": " + s.ToString() + "\n"; + continue; + } + std::sort(existing_files.begin(), existing_files.end()); + + for (const std::string& fname : dir_files.second) { + if (!std::binary_search(existing_files.begin(), existing_files.end(), + fname) && + !std::binary_search(existing_files.begin(), existing_files.end(), + Rocks2LevelTableFileName(fname))) { + corruption_messages += + "Missing sst file " + fname + " in " + directory + "\n"; + } + } + } + } else { + for (const auto& md : metadata) { + // md.name has a leading "/". + std::string file_path = md.db_path + md.name; + + uint64_t fsize = 0; + TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize"); + Status s = env_->GetFileSize(file_path, &fsize); + if (!s.ok() && + env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) { + s = Status::OK(); + } + if (!s.ok()) { + corruption_messages += + "Can't access " + md.name + ": " + s.ToString() + "\n"; + } else if (fsize != md.size) { + corruption_messages += "Sst file size mismatch: " + file_path + + ". Size recorded in manifest " + + std::to_string(md.size) + ", actual size " + + std::to_string(fsize) + "\n"; + } + } + } + + if (corruption_messages.size() == 0) { + return Status::OK(); + } else { + return Status::Corruption(corruption_messages); + } +} + +Status DBImpl::GetDbIdentity(std::string& identity) const { + identity.assign(db_id_); + return Status::OK(); +} + +Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const { + std::string idfilename = IdentityFileName(dbname_); + const FileOptions soptions; + + Status s = ReadFileToString(fs_.get(), idfilename, identity); + if (!s.ok()) { + return s; + } + + // If last character is '\n' remove it from identity. (Old implementations + // of Env::GenerateUniqueId() would include a trailing '\n'.) + if (identity->size() > 0 && identity->back() == '\n') { + identity->pop_back(); + } + return s; +} + +Status DBImpl::GetDbSessionId(std::string& session_id) const { + session_id.assign(db_session_id_); + return Status::OK(); +} + +namespace { +SemiStructuredUniqueIdGen* DbSessionIdGen() { + static SemiStructuredUniqueIdGen gen; + return &gen; +} +} // namespace + +void DBImpl::TEST_ResetDbSessionIdGen() { DbSessionIdGen()->Reset(); } + +std::string DBImpl::GenerateDbSessionId(Env*) { + // See SemiStructuredUniqueIdGen for its desirable properties. 
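+  // Illustrative only: EncodeSessionId turns the 128 bits (hi, lo) into a
+  // short fixed-width alphanumeric string; "ABCDEFGHIJ0123456789" is a
+  // made-up example of the shape. The retry below keeps lo non-zero so that
+  // SST unique IDs derived from the session id cannot degenerate to zero.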
+ auto gen = DbSessionIdGen(); + + uint64_t lo, hi; + gen->GenerateNext(&hi, &lo); + if (lo == 0) { + // Avoid emitting session ID with lo==0, so that SST unique + // IDs can be more easily ensured non-zero + gen->GenerateNext(&hi, &lo); + assert(lo != 0); + } + return EncodeSessionId(hi, lo); +} + +void DBImpl::SetDbSessionId() { + db_session_id_ = GenerateDbSessionId(env_); + TEST_SYNC_POINT_CALLBACK("DBImpl::SetDbSessionId", &db_session_id_); +} + +// Default implementation -- returns not supported status +Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/, + const std::string& /*column_family_name*/, + ColumnFamilyHandle** /*handle*/) { + return Status::NotSupported(""); +} + +Status DB::CreateColumnFamilies( + const ColumnFamilyOptions& /*cf_options*/, + const std::vector& /*column_family_names*/, + std::vector* /*handles*/) { + return Status::NotSupported(""); +} + +Status DB::CreateColumnFamilies( + const std::vector& /*column_families*/, + std::vector* /*handles*/) { + return Status::NotSupported(""); +} + +Status DB::DropColumnFamily(ColumnFamilyHandle* /*column_family*/) { + return Status::NotSupported(""); +} + +Status DB::DropColumnFamilies( + const std::vector& /*column_families*/) { + return Status::NotSupported(""); +} + +Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) { + if (DefaultColumnFamily() == column_family) { + return Status::InvalidArgument( + "Cannot destroy the handle returned by DefaultColumnFamily()"); + } + delete column_family; + return Status::OK(); +} + +DB::~DB() {} + +Status DBImpl::Close() { + InstrumentedMutexLock closing_lock_guard(&closing_mutex_); + if (closed_) { + return closing_status_; + } + + { + const Status s = MaybeReleaseTimestampedSnapshotsAndCheck(); + if (!s.ok()) { + return s; + } + } + + closing_status_ = CloseImpl(); + closed_ = true; + return closing_status_; +} + +Status DB::ListColumnFamilies(const DBOptions& db_options, + const std::string& name, + std::vector* column_families) { + const std::shared_ptr& fs = db_options.env->GetFileSystem(); + return VersionSet::ListColumnFamilies(column_families, name, fs.get()); +} + +Snapshot::~Snapshot() {} + +Status DestroyDB(const std::string& dbname, const Options& options, + const std::vector& column_families) { + ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); + Env* env = soptions.env; + std::vector filenames; + bool wal_in_db_path = soptions.IsWalDirSameAsDBPath(); + + // Reset the logger because it holds a handle to the + // log file and prevents cleanup and directory removal + soptions.info_log.reset(); + IOOptions io_opts; + // Ignore error in case directory does not exist + soptions.fs + ->GetChildren(dbname, io_opts, &filenames, + /*IODebugContext*=*/nullptr) + .PermitUncheckedError(); + + FileLock* lock; + const std::string lockname = LockFileName(dbname); + Status result = env->LockFile(lockname, &lock); + if (result.ok()) { + uint64_t number; + FileType type; + InfoLogPrefix info_log_prefix(!soptions.db_log_dir.empty(), dbname); + for (const auto& fname : filenames) { + if (ParseFileName(fname, &number, info_log_prefix.prefix, &type) && + type != kDBLockFile) { // Lock file will be deleted at end + Status del; + std::string path_to_delete = dbname + "/" + fname; + if (type == kMetaDatabase) { + del = DestroyDB(path_to_delete, options); + } else if (type == kTableFile || type == kWalFile || + type == kBlobFile) { + del = DeleteDBFile( + &soptions, path_to_delete, dbname, + /*force_bg=*/false, + /*force_fg=*/(type == 
kWalFile) ? !wal_in_db_path : false); + } else { + del = env->DeleteFile(path_to_delete); + } + if (!del.ok() && result.ok()) { + result = del; + } + } + } + + std::set paths; + for (const DbPath& db_path : options.db_paths) { + paths.insert(db_path.path); + } + for (const ColumnFamilyDescriptor& cf : column_families) { + for (const DbPath& cf_path : cf.options.cf_paths) { + paths.insert(cf_path.path); + } + } + + for (const auto& path : paths) { + if (soptions.fs + ->GetChildren(path, io_opts, &filenames, + /*IODebugContext*=*/nullptr) + .ok()) { + for (const auto& fname : filenames) { + if (ParseFileName(fname, &number, &type) && + (type == kTableFile || + type == kBlobFile)) { // Lock file will be deleted at end + std::string file_path = path + "/" + fname; + Status del = DeleteDBFile(&soptions, file_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); + if (!del.ok() && result.ok()) { + result = del; + } + } + } + // TODO: Should we return an error if we cannot delete the directory? + env->DeleteDir(path).PermitUncheckedError(); + } + } + + std::vector walDirFiles; + std::string archivedir = ArchivalDirectory(dbname); + bool wal_dir_exists = false; + if (!soptions.IsWalDirSameAsDBPath(dbname)) { + wal_dir_exists = + soptions.fs + ->GetChildren(soptions.wal_dir, io_opts, &walDirFiles, + /*IODebugContext*=*/nullptr) + .ok(); + archivedir = ArchivalDirectory(soptions.wal_dir); + } + + // Archive dir may be inside wal dir or dbname and should be + // processed and removed before those otherwise we have issues + // removing them + std::vector archiveFiles; + if (soptions.fs + ->GetChildren(archivedir, io_opts, &archiveFiles, + /*IODebugContext*=*/nullptr) + .ok()) { + // Delete archival files. + for (const auto& file : archiveFiles) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { + Status del = + DeleteDBFile(&soptions, archivedir + "/" + file, archivedir, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); + if (!del.ok() && result.ok()) { + result = del; + } + } + } + // Ignore error in case dir contains other files + env->DeleteDir(archivedir).PermitUncheckedError(); + } + + // Delete log files in the WAL dir + if (wal_dir_exists) { + for (const auto& file : walDirFiles) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { + Status del = + DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), + soptions.wal_dir, /*force_bg=*/false, + /*force_fg=*/!wal_in_db_path); + if (!del.ok() && result.ok()) { + result = del; + } + } + } + // Ignore error in case dir contains other files + env->DeleteDir(soptions.wal_dir).PermitUncheckedError(); + } + + // Ignore error since state is already gone + env->UnlockFile(lock).PermitUncheckedError(); + env->DeleteFile(lockname).PermitUncheckedError(); + + // sst_file_manager holds a ref to the logger. Make sure the logger is + // gone before trying to remove the directory. 
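+    // (Illustrative) both handles are shared_ptrs, so cleanup is just
+    // dropping the last references, mirroring the info_log reset earlier in
+    // this function:
+    //
+    //   soptions.info_log.reset();          // done above
+    //   soptions.sst_file_manager.reset();  // done below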
+    soptions.sst_file_manager.reset();
+
+    // Ignore error in case dir contains other files
+    env->DeleteDir(dbname).PermitUncheckedError();
+  }
+  return result;
+}
+
+Status DBImpl::WriteOptionsFile(bool need_mutex_lock,
+                                bool need_enter_write_thread) {
+  WriteThread::Writer w;
+  if (need_mutex_lock) {
+    mutex_.Lock();
+  } else {
+    mutex_.AssertHeld();
+  }
+  if (need_enter_write_thread) {
+    write_thread_.EnterUnbatched(&w, &mutex_);
+  }
+
+  std::vector<std::string> cf_names;
+  std::vector<ColumnFamilyOptions> cf_opts;
+
+  // This part requires mutex to protect the column family options
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
+    cf_names.push_back(cfd->GetName());
+    cf_opts.push_back(cfd->GetLatestCFOptions());
+  }
+
+  // Unlock during expensive operations. New writes cannot get here
+  // because the single write thread ensures all new writes get queued.
+  DBOptions db_options =
+      BuildDBOptions(immutable_db_options_, mutable_db_options_);
+  mutex_.Unlock();
+
+  TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1");
+  TEST_SYNC_POINT("DBImpl::WriteOptionsFile:2");
+  TEST_SYNC_POINT_CALLBACK("DBImpl::WriteOptionsFile:PersistOptions",
+                           &db_options);
+
+  std::string file_name =
+      TempOptionsFileName(GetName(), versions_->NewFileNumber());
+  Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name,
+                                   fs_.get());
+
+  if (s.ok()) {
+    s = RenameTempFileToOptionsFile(file_name);
+  }
+
+  if (!s.ok() && GetEnv()->FileExists(file_name).ok()) {
+    if (!GetEnv()->DeleteFile(file_name).ok()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Unable to delete temp options file %s",
+                     file_name.c_str());
+    }
+  }
+
+  // restore lock
+  if (!need_mutex_lock) {
+    mutex_.Lock();
+  }
+  if (need_enter_write_thread) {
+    write_thread_.ExitUnbatched(&w);
+  }
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Unable to persist options -- %s", s.ToString().c_str());
+    if (immutable_db_options_.fail_if_options_file_error) {
+      return Status::IOError("Unable to persist options.",
+                             s.ToString().c_str());
+    }
+  }
+  return Status::OK();
+}
+
+namespace {
+void DeleteOptionsFilesHelper(const std::map<uint64_t, std::string>& filenames,
+                              const size_t num_files_to_keep,
+                              const std::shared_ptr<Logger>& info_log,
+                              Env* env) {
+  if (filenames.size() <= num_files_to_keep) {
+    return;
+  }
+  for (auto iter = std::next(filenames.begin(), num_files_to_keep);
+       iter != filenames.end(); ++iter) {
+    if (!env->DeleteFile(iter->second).ok()) {
+      ROCKS_LOG_WARN(info_log, "Unable to delete options file %s",
+                     iter->second.c_str());
+    }
+  }
+}
+}  // namespace
+
+Status DBImpl::DeleteObsoleteOptionsFiles() {
+  std::vector<std::string> filenames;
+  // Use an ordered map to keep the filenames sorted from the newest
+  // to the oldest.
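+  // Worked example (illustrative numbers): OPTIONS files 000007 and 000009
+  // are inserted with keys UINT64_MAX - 7 and UINT64_MAX - 9. Since
+  // UINT64_MAX - 9 is the smaller key, the newer file sorts first, and
+  // DeleteOptionsFilesHelper above keeps only the first
+  // kNumOptionsFilesKept entries of the map.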
+  std::map<uint64_t, std::string> options_filenames;
+  Status s;
+  IOOptions io_opts;
+  io_opts.do_not_recurse = true;
+  s = fs_->GetChildren(GetName(), io_opts, &filenames,
+                       /*IODebugContext*=*/nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  for (auto& filename : filenames) {
+    uint64_t file_number;
+    FileType type;
+    if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) {
+      options_filenames.insert(
+          {std::numeric_limits<uint64_t>::max() - file_number,
+           GetName() + "/" + filename});
+    }
+  }
+
+  // Keep the latest 2 options files
+  const size_t kNumOptionsFilesKept = 2;
+  DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept,
+                           immutable_db_options_.info_log, GetEnv());
+  return Status::OK();
+}
+
+Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) {
+  Status s;
+
+  uint64_t options_file_number = versions_->NewFileNumber();
+  std::string options_file_name =
+      OptionsFileName(GetName(), options_file_number);
+  uint64_t options_file_size = 0;
+  s = GetEnv()->GetFileSize(file_name, &options_file_size);
+  if (s.ok()) {
+    // Retry if the file name happens to conflict with an existing one.
+    s = GetEnv()->RenameFile(file_name, options_file_name);
+    std::unique_ptr<FSDirectory> dir_obj;
+    if (s.ok()) {
+      s = fs_->NewDirectory(GetName(), IOOptions(), &dir_obj, nullptr);
+    }
+    if (s.ok()) {
+      s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr,
+                                       DirFsyncOptions(options_file_name));
+    }
+    if (s.ok()) {
+      Status temp_s = dir_obj->Close(IOOptions(), nullptr);
+      // The default Close() could return "NotSupported" and we bypass it
+      // if it is not implemented. Detailed explanations can be found in
+      // db/db_impl/db_impl.h
+      if (!temp_s.ok()) {
+        if (temp_s.IsNotSupported()) {
+          temp_s.PermitUncheckedError();
+        } else {
+          s = temp_s;
+        }
+      }
+    }
+  }
+  if (s.ok()) {
+    InstrumentedMutexLock l(&mutex_);
+    versions_->options_file_number_ = options_file_number;
+    versions_->options_file_size_ = options_file_size;
+  }
+
+  if (0 == disable_delete_obsolete_files_) {
+    // TODO: Should we check for errors here?
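+    // (Illustrative) PermitUncheckedError() merely marks the Status as
+    // inspected so builds with status-check enforcement do not assert on an
+    // unchecked Status; it does not alter the outcome:
+    //
+    //   Status del_s = DeleteObsoleteOptionsFiles();
+    //   del_s.PermitUncheckedError();  // same effect as the chained call below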
+ DeleteObsoleteOptionsFiles().PermitUncheckedError(); + } + return s; +} + +#ifdef ROCKSDB_USING_THREAD_STATUS + +void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const { + if (immutable_db_options_.enable_thread_tracking) { + ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(), + cfd->ioptions()->env); + } +} + +void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const { + if (immutable_db_options_.enable_thread_tracking) { + ThreadStatusUtil::EraseColumnFamilyInfo(cfd); + } +} + +void DBImpl::EraseThreadStatusDbInfo() const { + if (immutable_db_options_.enable_thread_tracking) { + ThreadStatusUtil::EraseDatabaseInfo(this); + } +} + +#else +void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {} + +void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {} + +void DBImpl::EraseThreadStatusDbInfo() const {} +#endif // ROCKSDB_USING_THREAD_STATUS + +// +// A global method that can dump out the build version +void DumpRocksDBBuildVersion(Logger* log) { + ROCKS_LOG_HEADER(log, "RocksDB version: %s\n", + GetRocksVersionAsString().c_str()); + const auto& props = GetRocksBuildProperties(); + const auto& sha = props.find("rocksdb_build_git_sha"); + if (sha != props.end()) { + ROCKS_LOG_HEADER(log, "Git sha %s", sha->second.c_str()); + } + const auto date = props.find("rocksdb_build_date"); + if (date != props.end()) { + ROCKS_LOG_HEADER(log, "Compile date %s", date->second.c_str()); + } +} + +SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv, + bool include_history) { + // Find the earliest sequence number that we know we can rely on reading + // from the memtable without needing to check sst files. + SequenceNumber earliest_seq = + sv->imm->GetEarliestSequenceNumber(include_history); + if (earliest_seq == kMaxSequenceNumber) { + earliest_seq = sv->mem->GetEarliestSequenceNumber(); + } + assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq); + + return earliest_seq; +} + +Status DBImpl::GetLatestSequenceForKey( + SuperVersion* sv, const Slice& key, bool cache_only, + SequenceNumber lower_bound_seq, SequenceNumber* seq, std::string* timestamp, + bool* found_record_for_key, bool* is_blob_index) { + Status s; + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + + // TODO: plumb Env::IOActivity + ReadOptions read_options; + SequenceNumber current_seq = versions_->LastSequence(); + + ColumnFamilyData* cfd = sv->cfd; + assert(cfd); + const Comparator* const ucmp = cfd->user_comparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + std::string ts_buf; + if (ts_sz > 0) { + assert(timestamp); + ts_buf.assign(ts_sz, '\xff'); + } else { + assert(!timestamp); + } + Slice ts(ts_buf); + + LookupKey lkey(key, current_seq, ts_sz == 0 ? nullptr : &ts); + + *seq = kMaxSequenceNumber; + *found_record_for_key = false; + + // Check if there is a record for this key in the latest memtable + sv->mem->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, seq, read_options, + false /* immutable_memtable */, nullptr /*read_callback*/, + is_blob_index); + + if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { + // unexpected error reading memtable. 
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Unexpected status returned from MemTable::Get: %s\n", + s.ToString().c_str()); + + return s; + } + assert(!ts_sz || + (*seq != kMaxSequenceNumber && + *timestamp != std::string(ts_sz, '\xff')) || + (*seq == kMaxSequenceNumber && timestamp->empty())); + + TEST_SYNC_POINT_CALLBACK("DBImpl::GetLatestSequenceForKey:mem", timestamp); + + if (*seq != kMaxSequenceNumber) { + // Found a sequence number, no need to check immutable memtables + *found_record_for_key = true; + return Status::OK(); + } + + SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber(); + if (lower_bound_in_mem != kMaxSequenceNumber && + lower_bound_in_mem < lower_bound_seq) { + *found_record_for_key = false; + return Status::OK(); + } + + // Check if there is a record for this key in the immutable memtables + sv->imm->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, seq, read_options, + nullptr /*read_callback*/, is_blob_index); + + if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { + // unexpected error reading memtable. + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Unexpected status returned from MemTableList::Get: %s\n", + s.ToString().c_str()); + + return s; + } + + assert(!ts_sz || + (*seq != kMaxSequenceNumber && + *timestamp != std::string(ts_sz, '\xff')) || + (*seq == kMaxSequenceNumber && timestamp->empty())); + + if (*seq != kMaxSequenceNumber) { + // Found a sequence number, no need to check memtable history + *found_record_for_key = true; + return Status::OK(); + } + + SequenceNumber lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber(); + if (lower_bound_in_imm != kMaxSequenceNumber && + lower_bound_in_imm < lower_bound_seq) { + *found_record_for_key = false; + return Status::OK(); + } + + // Check if there is a record for this key in the immutable memtables + sv->imm->GetFromHistory(lkey, /*value=*/nullptr, /*columns=*/nullptr, + timestamp, &s, &merge_context, + &max_covering_tombstone_seq, seq, read_options, + is_blob_index); + + if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { + // unexpected error reading memtable. + ROCKS_LOG_ERROR( + immutable_db_options_.info_log, + "Unexpected status returned from MemTableList::GetFromHistory: %s\n", + s.ToString().c_str()); + + return s; + } + + assert(!ts_sz || + (*seq != kMaxSequenceNumber && + *timestamp != std::string(ts_sz, '\xff')) || + (*seq == kMaxSequenceNumber && timestamp->empty())); + + if (*seq != kMaxSequenceNumber) { + // Found a sequence number, no need to check SST files + assert(0 == ts_sz || *timestamp != std::string(ts_sz, '\xff')); + *found_record_for_key = true; + return Status::OK(); + } + + // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true) + // check here to skip the history if possible. But currently the caller + // already does that. Maybe we should move the logic here later. + + // TODO(agiardullo): possible optimization: consider checking cached + // SST files if cache_only=true? 
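+  // Caller-side sketch (illustrative; assumes a SuperVersion* sv and Slice
+  // key in scope, and no user-defined timestamps). With cache_only=true the
+  // lookup stops at the memtables, trading possible false negatives for
+  // zero I/O:
+  //
+  //   SequenceNumber seq = kMaxSequenceNumber;
+  //   bool found = false;
+  //   Status st = GetLatestSequenceForKey(sv, key, /*cache_only=*/true,
+  //                                       /*lower_bound_seq=*/0, &seq,
+  //                                       /*timestamp=*/nullptr, &found,
+  //                                       /*is_blob_index=*/nullptr);
+  //   // found == false may still mean "present only in SST files".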
+ if (!cache_only) { + // Check tables + PinnedIteratorsManager pinned_iters_mgr; + sv->current->Get(read_options, lkey, /*value=*/nullptr, /*columns=*/nullptr, + timestamp, &s, &merge_context, &max_covering_tombstone_seq, + &pinned_iters_mgr, nullptr /* value_found */, + found_record_for_key, seq, nullptr /*read_callback*/, + is_blob_index); + + if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { + // unexpected error reading SST files + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Unexpected status returned from Version::Get: %s\n", + s.ToString().c_str()); + } + } + + return s; +} + +Status DBImpl::IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector& external_files, + const IngestExternalFileOptions& ingestion_options) { + IngestExternalFileArg arg; + arg.column_family = column_family; + arg.external_files = external_files; + arg.options = ingestion_options; + return IngestExternalFiles({arg}); +} + +Status DBImpl::IngestExternalFiles( + const std::vector& args) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + if (args.empty()) { + return Status::InvalidArgument("ingestion arg list is empty"); + } + { + std::unordered_set unique_cfhs; + for (const auto& arg : args) { + if (arg.column_family == nullptr) { + return Status::InvalidArgument("column family handle is null"); + } else if (unique_cfhs.count(arg.column_family) > 0) { + return Status::InvalidArgument( + "ingestion args have duplicate column families"); + } + unique_cfhs.insert(arg.column_family); + } + } + // Ingest multiple external SST files atomically. + const size_t num_cfs = args.size(); + for (size_t i = 0; i != num_cfs; ++i) { + if (args[i].external_files.empty()) { + char err_msg[128] = {0}; + snprintf(err_msg, 128, "external_files[%zu] is empty", i); + return Status::InvalidArgument(err_msg); + } + } + for (const auto& arg : args) { + const IngestExternalFileOptions& ingest_opts = arg.options; + if (ingest_opts.ingest_behind && + !immutable_db_options_.allow_ingest_behind) { + return Status::InvalidArgument( + "can't ingest_behind file in DB with allow_ingest_behind=false"); + } + } + + // TODO (yanqin) maybe handle the case in which column_families have + // duplicates + std::unique_ptr::iterator> pending_output_elem; + size_t total = 0; + for (const auto& arg : args) { + total += arg.external_files.size(); + } + uint64_t next_file_number = 0; + Status status = ReserveFileNumbersBeforeIngestion( + static_cast(args[0].column_family)->cfd(), total, + pending_output_elem, &next_file_number); + if (!status.ok()) { + InstrumentedMutexLock l(&mutex_); + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + return status; + } + + std::vector ingestion_jobs; + for (const auto& arg : args) { + auto* cfd = static_cast(arg.column_family)->cfd(); + ingestion_jobs.emplace_back(versions_.get(), cfd, immutable_db_options_, + mutable_db_options_, file_options_, &snapshots_, + arg.options, &directories_, &event_logger_, + io_tracer_); + } + + // TODO(yanqin) maybe make jobs run in parallel + uint64_t start_file_number = next_file_number; + for (size_t i = 1; i != num_cfs; ++i) { + start_file_number += args[i - 1].external_files.size(); + auto* cfd = + static_cast(args[i].column_family)->cfd(); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + Status es = ingestion_jobs[i].Prepare( + args[i].external_files, args[i].files_checksums, + args[i].files_checksum_func_names, args[i].file_temperature, + start_file_number, super_version); + // capture first 
error only + if (!es.ok() && status.ok()) { + status = es; + } + CleanupSuperVersion(super_version); + } + TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0"); + TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"); + { + auto* cfd = + static_cast(args[0].column_family)->cfd(); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + Status es = ingestion_jobs[0].Prepare( + args[0].external_files, args[0].files_checksums, + args[0].files_checksum_func_names, args[0].file_temperature, + next_file_number, super_version); + if (!es.ok()) { + status = es; + } + CleanupSuperVersion(super_version); + } + if (!status.ok()) { + for (size_t i = 0; i != num_cfs; ++i) { + ingestion_jobs[i].Cleanup(status); + } + InstrumentedMutexLock l(&mutex_); + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + return status; + } + + std::vector sv_ctxs; + for (size_t i = 0; i != num_cfs; ++i) { + sv_ctxs.emplace_back(true /* create_superversion */); + } + TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:0"); + TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:1"); + TEST_SYNC_POINT("DBImpl::AddFile:Start"); + { + InstrumentedMutexLock l(&mutex_); + TEST_SYNC_POINT("DBImpl::AddFile:MutexLock"); + + // Stop writes to the DB by entering both write threads + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + + // When unordered_write is enabled, the keys are writing to memtable in an + // unordered way. If the ingestion job checks memtable key range before the + // key landing in memtable, the ingestion job may skip the necessary + // memtable flush. + // So wait here to ensure there is no pending write to memtable. + WaitForPendingWrites(); + + num_running_ingest_file_ += static_cast(num_cfs); + TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter"); + + bool at_least_one_cf_need_flush = false; + std::vector need_flush(num_cfs, false); + for (size_t i = 0; i != num_cfs; ++i) { + auto* cfd = + static_cast(args[i].column_family)->cfd(); + if (cfd->IsDropped()) { + // TODO (yanqin) investigate whether we should abort ingestion or + // proceed with other non-dropped column families. + status = Status::InvalidArgument( + "cannot ingest an external file into a dropped CF"); + break; + } + bool tmp = false; + status = ingestion_jobs[i].NeedsFlush(&tmp, cfd->GetSuperVersion()); + need_flush[i] = tmp; + at_least_one_cf_need_flush = (at_least_one_cf_need_flush || tmp); + if (!status.ok()) { + break; + } + } + TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush", + &at_least_one_cf_need_flush); + + if (status.ok() && at_least_one_cf_need_flush) { + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + if (immutable_db_options_.atomic_flush) { + mutex_.Unlock(); + status = AtomicFlushMemTables( + flush_opts, FlushReason::kExternalFileIngestion, + {} /* provided_candidate_cfds */, true /* entered_write_thread */); + mutex_.Lock(); + } else { + for (size_t i = 0; i != num_cfs; ++i) { + if (need_flush[i]) { + mutex_.Unlock(); + auto* cfd = + static_cast(args[i].column_family) + ->cfd(); + status = FlushMemTable(cfd, flush_opts, + FlushReason::kExternalFileIngestion, + true /* entered_write_thread */); + mutex_.Lock(); + if (!status.ok()) { + break; + } + } + } + } + } + // Run ingestion jobs. 
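+    // (Illustrative) at this point the write threads have been stopped via
+    // EnterUnbatched() and mutex_ is held, so each job's Run() can install
+    // files without racing concurrent writes; e.g. a two-CF atomic call
+    //
+    //   db->IngestExternalFiles({arg_for_cf1, arg_for_cf2});  // made-up args
+    //
+    // reaches here with both jobs already Prepare()d above.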
+ if (status.ok()) { + for (size_t i = 0; i != num_cfs; ++i) { + mutex_.AssertHeld(); + status = ingestion_jobs[i].Run(); + if (!status.ok()) { + break; + } + ingestion_jobs[i].RegisterRange(); + } + } + if (status.ok()) { + autovector cfds_to_commit; + autovector mutable_cf_options_list; + autovector> edit_lists; + uint32_t num_entries = 0; + for (size_t i = 0; i != num_cfs; ++i) { + auto* cfd = + static_cast(args[i].column_family)->cfd(); + if (cfd->IsDropped()) { + continue; + } + cfds_to_commit.push_back(cfd); + mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions()); + autovector edit_list; + edit_list.push_back(ingestion_jobs[i].edit()); + edit_lists.push_back(edit_list); + ++num_entries; + } + // Mark the version edits as an atomic group if the number of version + // edits exceeds 1. + if (cfds_to_commit.size() > 1) { + for (auto& edits : edit_lists) { + assert(edits.size() == 1); + edits[0]->MarkAtomicGroup(--num_entries); + } + assert(0 == num_entries); + } + status = versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list, + read_options, edit_lists, &mutex_, + directories_.GetDbDir()); + // It is safe to update VersionSet last seqno here after LogAndApply since + // LogAndApply persists last sequence number from VersionEdits, + // which are from file's largest seqno and not from VersionSet. + // + // It is necessary to update last seqno here since LogAndApply releases + // mutex when persisting MANIFEST file, and the snapshots taken during + // that period will not be stable if VersionSet last seqno is updated + // before LogAndApply. + int consumed_seqno_count = + ingestion_jobs[0].ConsumedSequenceNumbersCount(); + for (size_t i = 1; i != num_cfs; ++i) { + consumed_seqno_count = + std::max(consumed_seqno_count, + ingestion_jobs[i].ConsumedSequenceNumbersCount()); + } + if (consumed_seqno_count > 0) { + const SequenceNumber last_seqno = versions_->LastSequence(); + versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count); + versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count); + versions_->SetLastSequence(last_seqno + consumed_seqno_count); + } + } + + for (auto& job : ingestion_jobs) { + job.UnregisterRange(); + } + + if (status.ok()) { + for (size_t i = 0; i != num_cfs; ++i) { + auto* cfd = + static_cast(args[i].column_family)->cfd(); + if (!cfd->IsDropped()) { + InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i], + *cfd->GetLatestMutableCFOptions()); +#ifndef NDEBUG + if (0 == i && num_cfs > 1) { + TEST_SYNC_POINT( + "DBImpl::IngestExternalFiles:InstallSVForFirstCF:0"); + TEST_SYNC_POINT( + "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1"); + } +#endif // !NDEBUG + } + } + } else if (versions_->io_status().IsIOError()) { + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + const IOStatus& io_s = versions_->io_status(); + // Should handle return error? 
+ error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite); + } + + // Resume writes to the DB + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + write_thread_.ExitUnbatched(&w); + + if (status.ok()) { + for (auto& job : ingestion_jobs) { + job.UpdateStats(); + } + } + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + num_running_ingest_file_ -= static_cast(num_cfs); + if (0 == num_running_ingest_file_) { + bg_cv_.SignalAll(); + } + TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock"); + } + // mutex_ is unlocked here + + // Cleanup + for (size_t i = 0; i != num_cfs; ++i) { + sv_ctxs[i].Clean(); + // This may rollback jobs that have completed successfully. This is + // intended for atomicity. + ingestion_jobs[i].Cleanup(status); + } + if (status.ok()) { + for (size_t i = 0; i != num_cfs; ++i) { + auto* cfd = + static_cast(args[i].column_family)->cfd(); + if (!cfd->IsDropped()) { + NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]); + } + } + } + return status; +} + +Status DBImpl::CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) { + assert(handle != nullptr); + assert(*handle == nullptr); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + std::string cf_comparator_name = options.comparator->Name(); + if (cf_comparator_name != metadata.db_comparator_name) { + return Status::InvalidArgument("Comparator name mismatch"); + } + + // Create column family. + auto status = CreateColumnFamily(options, column_family_name, handle); + if (!status.ok()) { + return status; + } + + // Import sst files from metadata. + auto cfh = static_cast_with_check(*handle); + auto cfd = cfh->cfd(); + ImportColumnFamilyJob import_job(versions_.get(), cfd, immutable_db_options_, + file_options_, import_options, + metadata.files, io_tracer_); + + SuperVersionContext dummy_sv_ctx(/* create_superversion */ true); + VersionEdit dummy_edit; + uint64_t next_file_number = 0; + std::unique_ptr::iterator> pending_output_elem; + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + if (error_handler_.IsDBStopped()) { + // Don't import files when there is a bg_error + status = error_handler_.GetBGError(); + } + + // Make sure that bg cleanup wont delete the files that we are importing + pending_output_elem.reset(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + + if (status.ok()) { + // If crash happen after a hard link established, Recover function may + // reuse the file number that has already assigned to the internal file, + // and this will overwrite the external file. To protect the external + // file, we have to make sure the file number will never being reused. 
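+      // Caller-side sketch (illustrative; names and paths are made up):
+      //
+      //   // metadata typically comes from Checkpoint::ExportColumnFamily()
+      //   ColumnFamilyHandle* imported = nullptr;
+      //   Status st = db->CreateColumnFamilyWithImport(
+      //       ColumnFamilyOptions(), "imported_cf",
+      //       ImportColumnFamilyOptions(), metadata, &imported);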
+ next_file_number = versions_->FetchAddFileNumber(metadata.files.size()); + auto cf_options = cfd->GetLatestMutableCFOptions(); + status = + versions_->LogAndApply(cfd, *cf_options, read_options, &dummy_edit, + &mutex_, directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); + } + } + } + dummy_sv_ctx.Clean(); + + if (status.ok()) { + SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + status = import_job.Prepare(next_file_number, sv); + CleanupSuperVersion(sv); + } + + if (status.ok()) { + SuperVersionContext sv_context(true /*create_superversion*/); + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + + // Stop writes to the DB by entering both write threads + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + + num_running_ingest_file_++; + assert(!cfd->IsDropped()); + mutex_.AssertHeld(); + status = import_job.Run(); + + // Install job edit [Mutex will be unlocked here] + if (status.ok()) { + auto cf_options = cfd->GetLatestMutableCFOptions(); + status = versions_->LogAndApply(cfd, *cf_options, read_options, + import_job.edit(), &mutex_, + directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options); + } + } + + // Resume writes to the DB + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + write_thread_.ExitUnbatched(&w); + + num_running_ingest_file_--; + if (num_running_ingest_file_ == 0) { + bg_cv_.SignalAll(); + } + } + // mutex_ is unlocked here + + sv_context.Clean(); + } + + { + InstrumentedMutexLock l(&mutex_); + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + } + + import_job.Cleanup(status); + if (!status.ok()) { + Status temp_s = DropColumnFamily(*handle); + if (!temp_s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "DropColumnFamily failed with error %s", + temp_s.ToString().c_str()); + } + // Always returns Status::OK() + temp_s = DestroyColumnFamilyHandle(*handle); + assert(temp_s.ok()); + *handle = nullptr; + } + return status; +} + +Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) { + assert(column_family); + Status status; + // Flush memtable + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + auto* cfd = + static_cast_with_check(column_family)->cfd(); + if (immutable_db_options_.atomic_flush) { + status = AtomicFlushMemTables(flush_opts, FlushReason::kDeleteFiles, + {} /* provided_candidate_cfds */, + false /* entered_write_thread */); + } else { + status = FlushMemTable(cfd, flush_opts, FlushReason::kDeleteFiles, + false /* entered_write_thread */); + } + + if (status.ok()) { + // DeleteFilesInRanges non-overlap files except L0 + std::vector ranges; + ranges.push_back(RangePtr(nullptr, &begin_key)); + ranges.push_back(RangePtr(&end_key, nullptr)); + status = DeleteFilesInRanges(column_family, ranges.data(), ranges.size()); + } + + // DeleteRange the remaining overlapping keys + bool empty_after_delete = false; + if (status.ok()) { + Slice smallest_user_key, largest_user_key; + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + cfd->current()->GetSstFilesBoundaryKeys(&smallest_user_key, + &largest_user_key); + } + // all the files has been deleted after DeleteFilesInRanges; + if (smallest_user_key.empty() && largest_user_key.empty()) { + empty_after_delete = 
true; + } else { + const Comparator* const ucmp = column_family->GetComparator(); + WriteOptions wo; + // Delete [smallest_user_key, clip_begin_key) + if (ucmp->Compare(smallest_user_key, begin_key) < 0) { + status = DeleteRange(wo, column_family, smallest_user_key, begin_key); + } + + if (status.ok()) { + // Delete [clip_end_key, largest_use_key] + if (ucmp->Compare(end_key, largest_user_key) < 0) { + status = DeleteRange(wo, column_family, end_key, largest_user_key); + if (status.ok()) { + status = Delete(wo, column_family, largest_user_key); + } + } + } + } + } + + if (status.ok() && !empty_after_delete) { + // CompactRange delete all the tombstones + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = true; + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + // We could just compact the ranges [null, clip_begin_key] and + // [clip_end_key, null]. But due to how manual compaction calculates the + // last level to compact to and that range tombstones are not dropped + // during non-bottommost compactions, calling CompactRange() on these two + // ranges may not clear all range tombstones. + status = CompactRange(compact_options, nullptr, nullptr); + } + return status; +} + +Status DBImpl::VerifyFileChecksums(const ReadOptions& read_options) { + return VerifyChecksumInternal(read_options, /*use_file_checksum=*/true); +} + +Status DBImpl::VerifyChecksum(const ReadOptions& read_options) { + return VerifyChecksumInternal(read_options, /*use_file_checksum=*/false); +} + +Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, + bool use_file_checksum) { + // `bytes_read` stat is enabled based on compile-time support and cannot + // be dynamically toggled. So we do not need to worry about `PerfLevel` + // here, unlike many other `IOStatsContext` / `PerfContext` stats. + uint64_t prev_bytes_read = IOSTATS(bytes_read); + + Status s; + + if (read_options.io_activity != Env::IOActivity::kUnknown) { + s = Status::InvalidArgument( + "Cannot verify file checksum with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + return s; + } + if (use_file_checksum) { + FileChecksumGenFactory* const file_checksum_gen_factory = + immutable_db_options_.file_checksum_gen_factory.get(); + if (!file_checksum_gen_factory) { + s = Status::InvalidArgument( + "Cannot verify file checksum if options.file_checksum_gen_factory is " + "null"); + return s; + } + } + + // TODO: simplify using GetRefedColumnFamilySet? 
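+  // Caller-side sketch (illustrative): both public entry points funnel into
+  // this function:
+  //
+  //   Status s1 = db->VerifyChecksum(ReadOptions());       // per-block check
+  //   Status s2 = db->VerifyFileChecksums(ReadOptions());  // whole-file check
+  //
+  // The whole-file variant requires a non-null
+  // options.file_checksum_gen_factory, as enforced above.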
+ std::vector cfd_list; + { + InstrumentedMutexLock l(&mutex_); + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped() && cfd->initialized()) { + cfd->Ref(); + cfd_list.push_back(cfd); + } + } + } + std::vector sv_list; + for (auto cfd : cfd_list) { + sv_list.push_back(cfd->GetReferencedSuperVersion(this)); + } + + for (auto& sv : sv_list) { + VersionStorageInfo* vstorage = sv->current->storage_info(); + ColumnFamilyData* cfd = sv->current->cfd(); + Options opts; + if (!use_file_checksum) { + InstrumentedMutexLock l(&mutex_); + opts = Options(BuildDBOptions(immutable_db_options_, mutable_db_options_), + cfd->GetLatestCFOptions()); + } + for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) { + for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok(); + j++) { + const auto& fd_with_krange = vstorage->LevelFilesBrief(i).files[j]; + const auto& fd = fd_with_krange.fd; + const FileMetaData* fmeta = fd_with_krange.file_metadata; + assert(fmeta); + std::string fname = TableFileName(cfd->ioptions()->cf_paths, + fd.GetNumber(), fd.GetPathId()); + if (use_file_checksum) { + s = VerifyFullFileChecksum(fmeta->file_checksum, + fmeta->file_checksum_func_name, fname, + read_options); + } else { + s = ROCKSDB_NAMESPACE::VerifySstFileChecksum( + opts, file_options_, read_options, fname, fd.largest_seqno); + } + RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES, + IOSTATS(bytes_read) - prev_bytes_read); + prev_bytes_read = IOSTATS(bytes_read); + } + } + + if (s.ok() && use_file_checksum) { + const auto& blob_files = vstorage->GetBlobFiles(); + for (const auto& meta : blob_files) { + assert(meta); + + const uint64_t blob_file_number = meta->GetBlobFileNumber(); + + const std::string blob_file_name = BlobFileName( + cfd->ioptions()->cf_paths.front().path, blob_file_number); + s = VerifyFullFileChecksum(meta->GetChecksumValue(), + meta->GetChecksumMethod(), blob_file_name, + read_options); + RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES, + IOSTATS(bytes_read) - prev_bytes_read); + prev_bytes_read = IOSTATS(bytes_read); + if (!s.ok()) { + break; + } + } + } + if (!s.ok()) { + break; + } + } + + bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io; + { + InstrumentedMutexLock l(&mutex_); + for (auto sv : sv_list) { + if (sv && sv->Unref()) { + sv->Cleanup(); + if (defer_purge) { + AddSuperVersionsToFreeQueue(sv); + } else { + delete sv; + } + } + } + if (defer_purge) { + SchedulePurge(); + } + for (auto cfd : cfd_list) { + cfd->UnrefAndTryDelete(); + } + } + RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES, + IOSTATS(bytes_read) - prev_bytes_read); + return s; +} + +Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected, + const std::string& func_name_expected, + const std::string& fname, + const ReadOptions& read_options) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call VerifyChecksum with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + + Status s; + if (file_checksum_expected == kUnknownFileChecksum) { + return s; + } + std::string file_checksum; + std::string func_name; + s = ROCKSDB_NAMESPACE::GenerateOneFileChecksum( + fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(), + func_name_expected, &file_checksum, &func_name, + read_options.readahead_size, immutable_db_options_.allow_mmap_reads, + io_tracer_, immutable_db_options_.rate_limiter.get(), + read_options.rate_limiter_priority); + if (s.ok()) { + 
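+    // GenerateOneFileChecksum was seeded with `func_name_expected`, so on
+    // success the generator it used must report that same name; the assert
+    // below records that invariant before the checksum values are compared.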
assert(func_name_expected == func_name); + if (file_checksum != file_checksum_expected) { + std::ostringstream oss; + oss << fname << " file checksum mismatch, "; + oss << "expecting " + << Slice(file_checksum_expected).ToString(/*hex=*/true); + oss << ", but actual " << Slice(file_checksum).ToString(/*hex=*/true); + s = Status::Corruption(oss.str()); + TEST_SYNC_POINT_CALLBACK("DBImpl::VerifyFullFileChecksum:mismatch", &s); + } + } + return s; +} + +void DBImpl::NotifyOnExternalFileIngested( + ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) { + if (immutable_db_options_.listeners.empty()) { + return; + } + + for (const IngestedFileInfo& f : ingestion_job.files_to_ingest()) { + ExternalFileIngestionInfo info; + info.cf_name = cfd->GetName(); + info.external_file_path = f.external_file_path; + info.internal_file_path = f.internal_file_path; + info.global_seqno = f.assigned_seqno; + info.table_properties = f.table_properties; + for (auto listener : immutable_db_options_.listeners) { + listener->OnExternalFileIngested(this, info); + } + } +} + +Status DBImpl::StartTrace(const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + InstrumentedMutexLock lock(&trace_mutex_); + tracer_.reset(new Tracer(immutable_db_options_.clock, trace_options, + std::move(trace_writer))); + return Status::OK(); +} + +Status DBImpl::EndTrace() { + InstrumentedMutexLock lock(&trace_mutex_); + Status s; + if (tracer_ != nullptr) { + s = tracer_->Close(); + tracer_.reset(); + } else { + s = Status::IOError("No trace file to close"); + } + return s; +} + +Status DBImpl::NewDefaultReplayer( + const std::vector& handles, + std::unique_ptr&& reader, + std::unique_ptr* replayer) { + replayer->reset(new ReplayerImpl(this, handles, std::move(reader))); + return Status::OK(); +} + +Status DBImpl::StartBlockCacheTrace( + const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + BlockCacheTraceOptions block_trace_opts; + block_trace_opts.sampling_frequency = trace_options.sampling_frequency; + + BlockCacheTraceWriterOptions trace_writer_opt; + trace_writer_opt.max_trace_file_size = trace_options.max_trace_file_size; + + std::unique_ptr block_cache_trace_writer = + NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt, + std::move(trace_writer)); + + return block_cache_tracer_.StartTrace(block_trace_opts, + std::move(block_cache_trace_writer)); +} + +Status DBImpl::StartBlockCacheTrace( + const BlockCacheTraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + return block_cache_tracer_.StartTrace(trace_options, std::move(trace_writer)); +} + +Status DBImpl::EndBlockCacheTrace() { + block_cache_tracer_.EndTrace(); + return Status::OK(); +} + +Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound) { + Status s; + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + s = tracer_->IteratorSeek(cf_id, key, lower_bound, upper_bound); + } + } + return s; +} + +Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound) { + Status s; + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + s = tracer_->IteratorSeekForPrev(cf_id, key, lower_bound, upper_bound); + } + } + return s; +} + +Status DBImpl::ReserveFileNumbersBeforeIngestion( + ColumnFamilyData* cfd, uint64_t num, + std::unique_ptr::iterator>& pending_output_elem, + uint64_t* next_file_number) { + // 
TODO: plumb Env::IOActivity + const ReadOptions read_options; + Status s; + SuperVersionContext dummy_sv_ctx(true /* create_superversion */); + assert(nullptr != next_file_number); + InstrumentedMutexLock l(&mutex_); + if (error_handler_.IsDBStopped()) { + // Do not ingest files when there is a bg_error + return error_handler_.GetBGError(); + } + pending_output_elem.reset(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + *next_file_number = versions_->FetchAddFileNumber(static_cast(num)); + auto cf_options = cfd->GetLatestMutableCFOptions(); + VersionEdit dummy_edit; + // If crash happen after a hard link established, Recover function may + // reuse the file number that has already assigned to the internal file, + // and this will overwrite the external file. To protect the external + // file, we have to make sure the file number will never being reused. + s = versions_->LogAndApply(cfd, *cf_options, read_options, &dummy_edit, + &mutex_, directories_.GetDbDir()); + if (s.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); + } + dummy_sv_ctx.Clean(); + return s; +} + +Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) { + if (mutable_db_options_.max_open_files == -1) { + uint64_t oldest_time = std::numeric_limits::max(); + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped()) { + uint64_t ctime; + { + SuperVersion* sv = GetAndRefSuperVersion(cfd); + Version* version = sv->current; + version->GetCreationTimeOfOldestFile(&ctime); + ReturnAndCleanupSuperVersion(cfd, sv); + } + + if (ctime < oldest_time) { + oldest_time = ctime; + } + if (oldest_time == 0) { + break; + } + } + } + *creation_time = oldest_time; + return Status::OK(); + } else { + return Status::NotSupported("This API only works if max_open_files = -1"); + } +} + +void DBImpl::RecordSeqnoToTimeMapping() { + // Get time first then sequence number, so the actual time of seqno is <= + // unix_time recorded + int64_t unix_time = 0; + immutable_db_options_.clock->GetCurrentTime(&unix_time) + .PermitUncheckedError(); // Ignore error + SequenceNumber seqno = GetLatestSequenceNumber(); + bool appended = false; + { + InstrumentedMutexLock l(&mutex_); + appended = seqno_time_mapping_.Append(seqno, unix_time); + } + if (!appended) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to insert sequence number to time entry: %" PRIu64 + " -> %" PRIu64, + seqno, unix_time); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl.h new file mode 100644 index 0000000000..f2da7467db --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl.h @@ -0,0 +1,2841 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <list>
+#include <map>
+#include <set>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_job.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/flush_scheduler.h"
+#include "db/import_column_family_job.h"
+#include "db/internal_stats.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/periodic_task_scheduler.h"
+#include "db/post_memtable_callback.h"
+#include "db/pre_release_callback.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/snapshot_checker.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/version_edit.h"
+#include "db/wal_manager.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/utilities/replayer.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/hash.h"
+#include "util/repeatable_thread.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ArenaWrappedDBIter;
+class InMemoryStatsHistoryIterator;
+class MemTable;
+class PersistentStatsHistoryIterator;
+class TableCache;
+class TaskLimiterToken;
+class Version;
+class VersionEdit;
+class VersionSet;
+class WriteCallback;
+struct JobContext;
+struct ExternalSstFileInfo;
+struct MemTableInfo;
+
+// Class to maintain directories for all database paths other than main one.
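+//
+// For example (a sketch; `directories` stands for the DBImpl member that owns
+// an instance of this class), resolving where a table file lives:
+//
+//   FSDirectory* dir = directories.GetDataDir(file_path_id);
+//   // falls back to GetDbDir() when no directory was opened for that path id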
+class Directories {
+ public:
+  IOStatus SetDirectories(FileSystem* fs, const std::string& dbname,
+                          const std::string& wal_dir,
+                          const std::vector<DbPath>& data_paths);
+
+  FSDirectory* GetDataDir(size_t path_id) const {
+    assert(path_id < data_dirs_.size());
+    FSDirectory* ret_dir = data_dirs_[path_id].get();
+    if (ret_dir == nullptr) {
+      // Should use db_dir_
+      return db_dir_.get();
+    }
+    return ret_dir;
+  }
+
+  FSDirectory* GetWalDir() {
+    if (wal_dir_) {
+      return wal_dir_.get();
+    }
+    return db_dir_.get();
+  }
+
+  FSDirectory* GetDbDir() { return db_dir_.get(); }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) {
+    // close all directories for all database paths
+    IOStatus s = IOStatus::OK();
+
+    // The default implementation for Close() in Directory/FSDirectory
+    // returns a "NotSupported" status; the upper level interface should be
+    // able to handle this error so that Close() does not fail after
+    // upgrading when run on FileSystems that have not implemented
+    // `Directory::Close()` or `FSDirectory::Close()` yet
+
+    if (db_dir_) {
+      IOStatus temp_s = db_dir_->Close(options, dbg);
+      if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+        s = std::move(temp_s);
+      }
+    }
+
+    // Attempt to close everything even if one fails
+    s.PermitUncheckedError();
+
+    if (wal_dir_) {
+      IOStatus temp_s = wal_dir_->Close(options, dbg);
+      if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+        s = std::move(temp_s);
+      }
+    }
+
+    s.PermitUncheckedError();
+
+    for (auto& data_dir_ptr : data_dirs_) {
+      if (data_dir_ptr) {
+        IOStatus temp_s = data_dir_ptr->Close(options, dbg);
+        if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+          s = std::move(temp_s);
+        }
+      }
+    }
+
+    // Ready for caller
+    s.MustCheck();
+    return s;
+  }
+
+ private:
+  std::unique_ptr<FSDirectory> db_dir_;
+  std::vector<std::unique_ptr<FSDirectory>> data_dirs_;
+  std::unique_ptr<FSDirectory> wal_dir_;
+};
+
+// DB is the public interface of RocksDB, and DBImpl is the actual class
+// implementing it. It is the entrance to the core RocksDB engine.
+// All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a
+// DBImpl internally.
+// Other than functions implementing the DB interface, some public
+// functions are there for other internal components to call. For
+// example, TransactionDB directly calls DBImpl::WriteImpl() and
+// BlobDB directly calls DBImpl::GetImpl(). Some other functions
+// are for sub-components to call. For example, ColumnFamilyHandleImpl
+// calls DBImpl::FindObsoleteFiles().
+//
+// Since it's a very large class, the definitions of the functions are
+// divided into several db_impl_*.cc files, besides db_impl.cc.
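+//
+// For instance (a sketch, not vendored code), a wrapper such as
+// TransactionDB reaches these internal entry points through the root DB:
+//
+//   DBImpl* impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+//   WriteBatch batch;
+//   batch.Put("key", "value");
+//   Status s = impl->WriteImpl(WriteOptions(), &batch);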
+class DBImpl : public DB { + public: + DBImpl(const DBOptions& options, const std::string& dbname, + const bool seq_per_batch = false, const bool batch_per_txn = true, + bool read_only = false); + // No copying allowed + DBImpl(const DBImpl&) = delete; + void operator=(const DBImpl&) = delete; + + virtual ~DBImpl(); + + // ---- Implementations of the DB interface ---- + + using DB::Resume; + Status Resume() override; + + using DB::Put; + Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) override; + Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& value) override; + + using DB::PutEntity; + Status PutEntity(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const WideColumns& columns) override; + + using DB::Merge; + Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) override; + Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& value) override; + + using DB::Delete; + Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key) override; + Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts) override; + + using DB::SingleDelete; + Status SingleDelete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) override; + Status SingleDelete(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override; + + using DB::DeleteRange; + Status DeleteRange(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& begin_key, + const Slice& end_key) override; + Status DeleteRange(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& begin_key, + const Slice& end_key, const Slice& ts) override; + + using DB::Write; + virtual Status Write(const WriteOptions& options, + WriteBatch* updates) override; + + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp) override; + + using DB::GetEntity; + Status GetEntity(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableWideColumns* columns) override; + + using DB::GetMergeOperands; + Status GetMergeOperands(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* merge_operands, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) override { + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.merge_operands = merge_operands; + get_impl_options.get_merge_operands_options = get_merge_operands_options; + get_impl_options.number_of_operands = number_of_operands; + get_impl_options.get_value = false; + return GetImpl(options, key, get_impl_options); + } + + using DB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& 
 column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values,
+      std::vector<std::string>* timestamps) override;
+
+  // This MultiGet is a batched version, which may be faster than calling Get
+  // multiple times, especially if the keys have some spatial locality that
+  // enables them to be queried in the same SST files/set of files. The larger
+  // the batch size, the more scope for batching and performance improvement.
+  // The values and statuses parameters are arrays with number of elements
+  // equal to keys.size(). This allows the storage for those to be allocated
+  // by the caller on the stack for small batches.
+  void MultiGet(const ReadOptions& options, ColumnFamilyHandle* column_family,
+                const size_t num_keys, const Slice* keys, PinnableSlice* values,
+                Status* statuses, const bool sorted_input = false) override;
+  void MultiGet(const ReadOptions& options, ColumnFamilyHandle* column_family,
+                const size_t num_keys, const Slice* keys, PinnableSlice* values,
+                std::string* timestamps, Status* statuses,
+                const bool sorted_input = false) override;
+
+  void MultiGet(const ReadOptions& options, const size_t num_keys,
+                ColumnFamilyHandle** column_families, const Slice* keys,
+                PinnableSlice* values, Status* statuses,
+                const bool sorted_input = false) override;
+  void MultiGet(const ReadOptions& options, const size_t num_keys,
+                ColumnFamilyHandle** column_families, const Slice* keys,
+                PinnableSlice* values, std::string* timestamps,
+                Status* statuses, const bool sorted_input = false) override;
+
+  void MultiGetWithCallback(
+      const ReadOptions& options, ColumnFamilyHandle* column_family,
+      ReadCallback* callback,
+      autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys);
+
+  using DB::MultiGetEntity;
+
+  void MultiGetEntity(const ReadOptions& options,
+                      ColumnFamilyHandle* column_family, size_t num_keys,
+                      const Slice* keys, PinnableWideColumns* results,
+                      Status* statuses, bool sorted_input) override;
+
+  void MultiGetEntity(const ReadOptions& options, size_t num_keys,
+                      ColumnFamilyHandle** column_families, const Slice* keys,
+                      PinnableWideColumns* results, Status* statuses,
+                      bool sorted_input) override;
+
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+                                    const std::string& column_family,
+                                    ColumnFamilyHandle** handle) override;
+  virtual Status CreateColumnFamilies(
+      const ColumnFamilyOptions& cf_options,
+      const std::vector<std::string>& column_family_names,
+      std::vector<ColumnFamilyHandle*>* handles) override;
+  virtual Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles) override;
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
+  virtual Status DropColumnFamilies(
+      const std::vector<ColumnFamilyHandle*>& column_families) override;
+
+  // Returns false if key doesn't exist in the database and true if it may.
+  // If value_found is not passed in as null, then return the value if found in
+  // memory. On return, if value was found, then value_found will be set to
+  // true, otherwise false.
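+  //
+  // Example (a sketch; `db`, `cfh` and `key` are placeholders for a caller's
+  // handles): a cheap negative probe before paying for a full Get():
+  //
+  //   std::string val;
+  //   bool found_in_memory = false;
+  //   if (!db->KeyMayExist(ReadOptions(), cfh, key, &val,
+  //                        /*timestamp=*/nullptr, &found_in_memory)) {
+  //     // key is definitely absent; the Get() can be skipped
+  //   }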
+ using DB::KeyMayExist; + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, std::string* timestamp, + bool* value_found = nullptr) override; + + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override; + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override; + + virtual const Snapshot* GetSnapshot() override; + virtual void ReleaseSnapshot(const Snapshot* snapshot) override; + // Create a timestamped snapshot. This snapshot can be shared by multiple + // readers. If any of them uses it for write conflict checking, then + // is_write_conflict_boundary is true. For simplicity, set it to true by + // default. + std::pair> CreateTimestampedSnapshot( + SequenceNumber snapshot_seq, uint64_t ts); + std::shared_ptr GetTimestampedSnapshot(uint64_t ts) const; + void ReleaseTimestampedSnapshotsOlderThan( + uint64_t ts, size_t* remaining_total_ss = nullptr); + Status GetTimestampedSnapshots(uint64_t ts_lb, uint64_t ts_ub, + std::vector>& + timestamped_snapshots) const; + + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) override; + using DB::GetMapProperty; + virtual bool GetMapProperty( + ColumnFamilyHandle* column_family, const Slice& property, + std::map* value) override; + using DB::GetIntProperty; + virtual bool GetIntProperty(ColumnFamilyHandle* column_family, + const Slice& property, uint64_t* value) override; + using DB::GetAggregatedIntProperty; + virtual bool GetAggregatedIntProperty(const Slice& property, + uint64_t* aggregated_value) override; + using DB::GetApproximateSizes; + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, + uint64_t* sizes) override; + using DB::GetApproximateMemTableStats; + virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, + const Range& range, + uint64_t* const count, + uint64_t* const size) override; + using DB::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) override; + + using DB::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) override; + + virtual Status PauseBackgroundWork() override; + virtual Status ContinueBackgroundWork() override; + + virtual Status EnableAutoCompaction( + const std::vector& column_family_handles) override; + + virtual void EnableManualCompaction() override; + virtual void DisableManualCompaction() override; + + using DB::SetOptions; + Status SetOptions( + ColumnFamilyHandle* column_family, + const std::unordered_map& options_map) override; + + virtual Status SetDBOptions( + const std::unordered_map& options_map) override; + + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family) override; + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override; + using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger( + ColumnFamilyHandle* 
column_family) override; + virtual const std::string& GetName() const override; + virtual Env* GetEnv() const override; + virtual FileSystem* GetFileSystem() const override; + using DB::GetOptions; + virtual Options GetOptions(ColumnFamilyHandle* column_family) const override; + using DB::GetDBOptions; + virtual DBOptions GetDBOptions() const override; + using DB::Flush; + virtual Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family) override; + virtual Status Flush( + const FlushOptions& options, + const std::vector& column_families) override; + virtual Status FlushWAL(bool sync) override; + bool WALBufferIsEmpty(); + virtual Status SyncWAL() override; + virtual Status LockWAL() override; + virtual Status UnlockWAL() override; + + virtual SequenceNumber GetLatestSequenceNumber() const override; + + // IncreaseFullHistoryTsLow(ColumnFamilyHandle*, std::string) will acquire + // and release db_mutex + Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string ts_low) override; + + // GetFullHistoryTsLow(ColumnFamilyHandle*, std::string*) will acquire and + // release db_mutex + Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) override; + + virtual Status GetDbIdentity(std::string& identity) const override; + + virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const; + + virtual Status GetDbSessionId(std::string& session_id) const override; + + ColumnFamilyHandle* DefaultColumnFamily() const override; + + ColumnFamilyHandle* PersistentStatsColumnFamily() const; + + virtual Status Close() override; + + virtual Status DisableFileDeletions() override; + + virtual Status EnableFileDeletions(bool force) override; + + virtual bool IsFileDeletionsEnabled() const; + + Status GetStatsHistory( + uint64_t start_time, uint64_t end_time, + std::unique_ptr* stats_iterator) override; + + using DB::ResetStats; + virtual Status ResetStats() override; + // All the returned filenames start with "/" + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) override; + virtual Status GetSortedWalFiles(VectorLogPtr& files) override; + virtual Status GetCurrentWalFile( + std::unique_ptr* current_log_file) override; + virtual Status GetCreationTimeOfOldestFile( + uint64_t* creation_time) override; + + virtual Status GetUpdatesSince( + SequenceNumber seq_number, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options = + TransactionLogIterator::ReadOptions()) override; + virtual Status DeleteFile(std::string name) override; + Status DeleteFilesInRanges(ColumnFamilyHandle* column_family, + const RangePtr* ranges, size_t n, + bool include_end = true); + + virtual void GetLiveFilesMetaData( + std::vector* metadata) override; + + virtual Status GetLiveFilesChecksumInfo( + FileChecksumList* checksum_list) override; + + virtual Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector* files) override; + + // Obtains the meta data of the specified column family of the DB. + // TODO(yhchiang): output parameter is placed in the end in this codebase. 
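+  //
+  // e.g. (sketch; `db` is a placeholder for an opened DB*):
+  //
+  //   ColumnFamilyMetaData meta;
+  //   db->GetColumnFamilyMetaData(db->DefaultColumnFamily(), &meta);
+  //   // meta.levels[n].files now describes the SSTs on each level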
+ virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* metadata) override; + + void GetAllColumnFamilyMetaData( + std::vector* metadata) override; + + Status SuggestCompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) override; + + Status PromoteL0(ColumnFamilyHandle* column_family, + int target_level) override; + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector& external_files, + const IngestExternalFileOptions& ingestion_options) override; + + using DB::IngestExternalFiles; + virtual Status IngestExternalFiles( + const std::vector& args) override; + + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) override; + + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) override; + + using DB::VerifyFileChecksums; + Status VerifyFileChecksums(const ReadOptions& read_options) override; + + using DB::VerifyChecksum; + virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override; + // Verify the checksums of files in db. Currently only tables are checked. + // + // read_options: controls file I/O behavior, e.g. read ahead size while + // reading all the live table files. + // + // use_file_checksum: if false, verify the block checksums of all live table + // in db. Otherwise, obtain the file checksums and compare + // with the MANIFEST. Currently, file checksums are + // recomputed by reading all table files. + // + // Returns: OK if there is no file whose file or block checksum mismatches. 
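+  //
+  // e.g. (sketch) whole-file verification against the MANIFEST checksums:
+  //
+  //   ReadOptions ro;
+  //   Status s = VerifyChecksumInternal(ro, /*use_file_checksum=*/true);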
+ Status VerifyChecksumInternal(const ReadOptions& read_options, + bool use_file_checksum); + + Status VerifyFullFileChecksum(const std::string& file_checksum_expected, + const std::string& func_name_expected, + const std::string& fpath, + const ReadOptions& read_options); + + using DB::StartTrace; + virtual Status StartTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndTrace; + virtual Status EndTrace() override; + + using DB::NewDefaultReplayer; + virtual Status NewDefaultReplayer( + const std::vector& handles, + std::unique_ptr&& reader, + std::unique_ptr* replayer) override; + + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) override; + + Status StartBlockCacheTrace( + const BlockCacheTraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override; + + using DB::StartIOTrace; + Status StartIOTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndIOTrace; + Status EndIOTrace() override; + + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables( + ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) override; + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) override; + + + // ---- End of implementations of the DB interface ---- + SystemClock* GetSystemClock() const; + + struct GetImplOptions { + ColumnFamilyHandle* column_family = nullptr; + PinnableSlice* value = nullptr; + PinnableWideColumns* columns = nullptr; + std::string* timestamp = nullptr; + bool* value_found = nullptr; + ReadCallback* callback = nullptr; + bool* is_blob_index = nullptr; + // If true return value associated with key via value pointer else return + // all merge operands for key via merge_operands pointer + bool get_value = true; + // Pointer to an array of size + // get_merge_operands_options.expected_max_number_of_operands allocated by + // user + PinnableSlice* merge_operands = nullptr; + GetMergeOperandsOptions* get_merge_operands_options = nullptr; + int* number_of_operands = nullptr; + }; + + // Function that Get and KeyMayExist call with no_io true or false + // Note: 'value_found' from KeyMayExist propagates here + // This function is also called by GetMergeOperands + // If get_impl_options.get_value = true get value associated with + // get_impl_options.key via get_impl_options.value + // If get_impl_options.get_value = false get merge operands associated with + // get_impl_options.key via get_impl_options.merge_operands + Status GetImpl(const ReadOptions& options, const Slice& key, + GetImplOptions& get_impl_options); + + // If `snapshot` == kMaxSequenceNumber, set a recent one inside the file. + ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + ReadCallback* read_callback, + bool expose_blob_index = false, + bool allow_refresh = true); + + virtual SequenceNumber GetLastPublishedSequence() const { + if (last_seq_same_as_publish_seq_) { + return versions_->LastSequence(); + } else { + return versions_->LastPublishedSequence(); + } + } + + // REQUIRES: joined the main write queue if two_write_queues is disabled, and + // the second write queue otherwise. 
+  virtual void SetLastPublishedSequence(SequenceNumber seq);
+  // Returns LastSequence in last_seq_same_as_publish_seq_
+  // mode and LastAllocatedSequence otherwise. This is useful when visibility
+  // depends also on data written to the WAL but not to the memtable.
+  SequenceNumber TEST_GetLastVisibleSequence() const;
+
+  // Similar to Write() but will call the callback once on the single write
+  // thread to determine whether it is safe to perform the write.
+  virtual Status WriteWithCallback(const WriteOptions& write_options,
+                                   WriteBatch* my_batch,
+                                   WriteCallback* callback);
+
+  // Returns the sequence number that is guaranteed to be smaller than or equal
+  // to the sequence number of any key that could be inserted into the current
+  // memtables. It can then be assumed that any write with a larger (or equal)
+  // sequence number will be present in this memtable or a later memtable.
+  //
+  // If the earliest sequence number could not be determined,
+  // kMaxSequenceNumber will be returned.
+  //
+  // If include_history=true, will also search Memtables in MemTableList
+  // History.
+  SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+                                                   bool include_history);
+
+  // For a given key, check to see if there are any records for this key
+  // in the memtables, including memtable history. If cache_only is false,
+  // SST files will also be checked.
+  //
+  // `key` should NOT have user-defined timestamp appended to user key even if
+  // timestamp is enabled.
+  //
+  // If a key is found, *found_record_for_key will be set to true and
+  // *seq will be set to the stored sequence number for the latest
+  // operation on this key or kMaxSequenceNumber if unknown. If user-defined
+  // timestamp is enabled for this column family and timestamp is not nullptr,
+  // then *timestamp will be set to the stored timestamp for the latest
+  // operation on this key.
+  // If no key is found, *found_record_for_key will be set to false.
+  //
+  // Note: If cache_only=false, it is possible for *seq to be set to 0 if
+  // the sequence number has been cleared from the record. If the caller is
+  // holding an active db snapshot, we know the missing sequence must be less
+  // than the snapshot's sequence number (sequence numbers are only cleared
+  // when there are no earlier active snapshots).
+  //
+  // If NotFound is returned and found_record_for_key is set to false, then no
+  // record for this key was found. If the caller is holding an active db
+  // snapshot, we know that no key could have existed after this snapshot
+  // (since we do not compact keys that have an earlier snapshot).
+  //
+  // Only records newer than or at `lower_bound_seq` are guaranteed to be
+  // returned. Memtables and files may not be checked if they only contain
+  // data older than `lower_bound_seq`.
+  //
+  // Returns OK or NotFound on success,
+  // other status on unexpected error.
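+  //
+  // A sketch of the memtable-only fast path (`sv` and `user_key` stand for
+  // arguments prepared by the caller as described above):
+  //
+  //   SequenceNumber seq = kMaxSequenceNumber;
+  //   bool found = false;
+  //   Status s = GetLatestSequenceForKey(sv, user_key, /*cache_only=*/true,
+  //                                      /*lower_bound_seq=*/0, &seq,
+  //                                      /*timestamp=*/nullptr, &found,
+  //                                      /*is_blob_index=*/nullptr);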
+ // TODO(andrewkr): this API need to be aware of range deletion operations + Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, + bool cache_only, + SequenceNumber lower_bound_seq, + SequenceNumber* seq, std::string* timestamp, + bool* found_record_for_key, + bool* is_blob_index); + + Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, const Slice upper_bound); + Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound); + + // Similar to GetSnapshot(), but also lets the db know that this snapshot + // will be used for transaction write-conflict checking. The DB can then + // make sure not to compact any keys that would prevent a write-conflict from + // being detected. + const Snapshot* GetSnapshotForWriteConflictBoundary(); + + // checks if all live files exist on file system and that their file sizes + // match to our in-memory records + virtual Status CheckConsistency(); + + // max_file_num_to_ignore allows bottom level compaction to filter out newly + // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will + // disable the filtering + // If `final_output_level` is not nullptr, it is set to manual compaction's + // output level if returned status is OK, and it may or may not be set to + // manual compaction's output level if returned status is not OK. + Status RunManualCompaction(ColumnFamilyData* cfd, int input_level, + int output_level, + const CompactRangeOptions& compact_range_options, + const Slice* begin, const Slice* end, + bool exclusive, bool disallow_trivial_move, + uint64_t max_file_num_to_ignore, + const std::string& trim_ts, + int* final_output_level = nullptr); + + // Return an internal iterator over the current state of the database. + // The keys of this iterator are internal keys (see format.h). + // The returned iterator should be deleted when no longer needed. + // If allow_unprepared_value is true, the returned iterator may defer reading + // the value and so will require PrepareValue() to be called before value(); + // allow_unprepared_value = false is convenient when this optimization is not + // useful, e.g. when reading the whole column family. + // + // read_options.ignore_range_deletions determines whether range tombstones are + // processed in the returned interator internally, i.e., whether range + // tombstone covered keys are in this iterator's output. + // @param read_options Must outlive the returned iterator. + InternalIterator* NewInternalIterator( + const ReadOptions& read_options, Arena* arena, SequenceNumber sequence, + ColumnFamilyHandle* column_family = nullptr, + bool allow_unprepared_value = false); + + // Note: to support DB iterator refresh, memtable range tombstones in the + // underlying merging iterator needs to be refreshed. If db_iter is not + // nullptr, db_iter->SetMemtableRangetombstoneIter() is called with the + // memtable range tombstone iterator used by the underlying merging iterator. + // This range tombstone iterator can be refreshed later by db_iter. + // @param read_options Must outlive the returned iterator. 
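+  //
+  // e.g. (sketch) an internal-key scan over the default column family at the
+  // latest sequence; ScopedArenaIterator (included above) can manage the
+  // lifetime of the arena-allocated iterator:
+  //
+  //   Arena arena;
+  //   ReadOptions ro;  // must outlive the iterator
+  //   InternalIterator* iter =
+  //       NewInternalIterator(ro, &arena, kMaxSequenceNumber);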
+ InternalIterator* NewInternalIterator(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SuperVersion* super_version, + Arena* arena, SequenceNumber sequence, + bool allow_unprepared_value, + ArenaWrappedDBIter* db_iter = nullptr); + + LogsWithPrepTracker* logs_with_prep_tracker() { + return &logs_with_prep_tracker_; + } + + struct BGJobLimits { + int max_flushes; + int max_compactions; + }; + // Returns maximum background flushes and compactions allowed to be scheduled + BGJobLimits GetBGJobLimits() const; + // Need a static version that can be called during SanitizeOptions(). + static BGJobLimits GetBGJobLimits(int max_background_flushes, + int max_background_compactions, + int max_background_jobs, + bool parallelize_compactions); + + // move logs pending closing from job_context to the DB queue and + // schedule a purge + void ScheduleBgLogWriterClose(JobContext* job_context); + + uint64_t MinLogNumberToKeep(); + + // Returns the lower bound file number for SSTs that won't be deleted, even if + // they're obsolete. This lower bound is used internally to prevent newly + // created flush/compaction output files from being deleted before they're + // installed. This technique avoids the need for tracking the exact numbers of + // files pending creation, although it prevents more files than necessary from + // being deleted. + uint64_t MinObsoleteSstNumberToKeep(); + + // Returns the list of live files in 'live' and the list + // of all files in the filesystem in 'candidate_files'. + // If force == false and the last call was less than + // db_options_.delete_obsolete_files_period_micros microseconds ago, + // it will not fill up the job_context + void FindObsoleteFiles(JobContext* job_context, bool force, + bool no_full_scan = false); + + // Diffs the files listed in filenames and those that do not + // belong to live files are possibly removed. Also, removes all the + // files in sst_delete_files and log_delete_files. + // It is not necessary to hold the mutex when invoking this method. + // If FindObsoleteFiles() was run, we need to also run + // PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true + void PurgeObsoleteFiles(JobContext& background_contet, + bool schedule_only = false); + + // Schedule a background job to actually delete obsolete files. + void SchedulePurge(); + + const SnapshotList& snapshots() const { return snapshots_; } + + // load list of snapshots to `snap_vector` that is no newer than `max_seq` + // in ascending order. + // `oldest_write_conflict_snapshot` is filled with the oldest snapshot + // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true. + void LoadSnapshots(std::vector* snap_vector, + SequenceNumber* oldest_write_conflict_snapshot, + const SequenceNumber& max_seq) const { + InstrumentedMutexLock l(mutex()); + snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq); + } + + const ImmutableDBOptions& immutable_db_options() const { + return immutable_db_options_; + } + + // Cancel all background jobs, including flush, compaction, background + // purging, stats dumping threads, etc. If `wait` = true, wait for the + // running jobs to abort or finish before returning. Otherwise, only + // sends the signals. + void CancelAllBackgroundWork(bool wait); + + // Find Super version and reference it. Based on options, it might return + // the thread local cached one. + // Call ReturnAndCleanupSuperVersion() when it is no longer needed. 
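+  //
+  // e.g. (sketch):
+  //
+  //   SuperVersion* sv = GetAndRefSuperVersion(cfd);
+  //   // ... read through sv->mem, sv->imm and sv->current ...
+  //   ReturnAndCleanupSuperVersion(cfd, sv);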
+ SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd); + + // Similar to the previous function but looks up based on a column family id. + // nullptr will be returned if this column family no longer exists. + // REQUIRED: this function should only be called on the write thread or if the + // mutex is held. + SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id); + + // Un-reference the super version and clean it up if it is the last reference. + void CleanupSuperVersion(SuperVersion* sv); + + // Un-reference the super version and return it to thread local cache if + // needed. If it is the last reference of the super version. Clean it up + // after un-referencing it. + void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv); + + // Similar to the previous function but looks up based on a column family id. + // nullptr will be returned if this column family no longer exists. + // REQUIRED: this function should only be called on the write thread. + void ReturnAndCleanupSuperVersion(uint32_t colun_family_id, SuperVersion* sv); + + // REQUIRED: this function should only be called on the write thread or if the + // mutex is held. Return value only valid until next call to this function or + // mutex is released. + ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id); + + // Same as above, should called without mutex held and not on write thread. + std::unique_ptr GetColumnFamilyHandleUnlocked( + uint32_t column_family_id); + + // Returns the number of currently running flushes. + // REQUIREMENT: mutex_ must be held when calling this function. + int num_running_flushes() { + mutex_.AssertHeld(); + return num_running_flushes_; + } + + // Returns the number of currently running compactions. + // REQUIREMENT: mutex_ must be held when calling this function. + int num_running_compactions() { + mutex_.AssertHeld(); + return num_running_compactions_; + } + + const WriteController& write_controller() { return write_controller_; } + + // hollow transactions shell used for recovery. + // these will then be passed to TransactionDB so that + // locks can be reacquired before writing can resume. + struct RecoveredTransaction { + std::string name_; + bool unprepared_; + + struct BatchInfo { + uint64_t log_number_; + // TODO(lth): For unprepared, the memory usage here can be big for + // unprepared transactions. This is only useful for rollbacks, and we + // can in theory just keep keyset for that. + WriteBatch* batch_; + // Number of sub-batches. A new sub-batch is created if txn attempts to + // insert a duplicate key,seq to memtable. This is currently used in + // WritePreparedTxn/WriteUnpreparedTxn. + size_t batch_cnt_; + }; + + // This maps the seq of the first key in the batch to BatchInfo, which + // contains WriteBatch and other information relevant to the batch. + // + // For WriteUnprepared, batches_ can have size greater than 1, but for + // other write policies, it must be of size 1. 
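+    //
+    // For example, a WriteUnprepared transaction recovered from unprepared
+    // batches at seqs 10 and 42 would hold two entries here, each carrying
+    // its own log number, WriteBatch and sub-batch count.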
+ std::map batches_; + + explicit RecoveredTransaction(const uint64_t log, const std::string& name, + WriteBatch* batch, SequenceNumber seq, + size_t batch_cnt, bool unprepared) + : name_(name), unprepared_(unprepared) { + batches_[seq] = {log, batch, batch_cnt}; + } + + ~RecoveredTransaction() { + for (auto& it : batches_) { + delete it.second.batch_; + } + } + + void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch, + size_t batch_cnt, bool unprepared) { + assert(batches_.count(seq) == 0); + batches_[seq] = {log_number, batch, batch_cnt}; + // Prior state must be unprepared, since the prepare batch must be the + // last batch. + assert(unprepared_); + unprepared_ = unprepared; + } + }; + + bool allow_2pc() const { return immutable_db_options_.allow_2pc; } + + std::unordered_map + recovered_transactions() { + return recovered_transactions_; + } + + RecoveredTransaction* GetRecoveredTransaction(const std::string& name) { + auto it = recovered_transactions_.find(name); + if (it == recovered_transactions_.end()) { + return nullptr; + } else { + return it->second; + } + } + + void InsertRecoveredTransaction(const uint64_t log, const std::string& name, + WriteBatch* batch, SequenceNumber seq, + size_t batch_cnt, bool unprepared_batch) { + // For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple + // times for every unprepared batch encountered during recovery. + // + // If the transaction is prepared, then the last call to + // InsertRecoveredTransaction will have unprepared_batch = false. + auto rtxn = recovered_transactions_.find(name); + if (rtxn == recovered_transactions_.end()) { + recovered_transactions_[name] = new RecoveredTransaction( + log, name, batch, seq, batch_cnt, unprepared_batch); + } else { + rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch); + } + logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log); + } + + void DeleteRecoveredTransaction(const std::string& name) { + auto it = recovered_transactions_.find(name); + assert(it != recovered_transactions_.end()); + auto* trx = it->second; + recovered_transactions_.erase(it); + for (const auto& info : trx->batches_) { + logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed( + info.second.log_number_); + } + delete trx; + } + + void DeleteAllRecoveredTransactions() { + for (auto it = recovered_transactions_.begin(); + it != recovered_transactions_.end(); ++it) { + delete it->second; + } + recovered_transactions_.clear(); + } + + void AddToLogsToFreeQueue(log::Writer* log_writer) { + mutex_.AssertHeld(); + logs_to_free_queue_.push_back(log_writer); + } + + void AddSuperVersionsToFreeQueue(SuperVersion* sv) { + superversions_to_free_queue_.push_back(sv); + } + + void SetSnapshotChecker(SnapshotChecker* snapshot_checker); + + // Fill JobContext with snapshot information needed by flush and compaction. + void GetSnapshotContext(JobContext* job_context, + std::vector* snapshot_seqs, + SequenceNumber* earliest_write_conflict_snapshot, + SnapshotChecker** snapshot_checker); + + // Not thread-safe. + void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback); + + InstrumentedMutex* mutex() const { return &mutex_; } + + // Initialize a brand new DB. The DB directory is expected to be empty before + // calling it. Push new manifest file name into `new_filenames`. + Status NewDB(std::vector* new_filenames); + + // This is to be used only by internal rocksdb classes. 
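+  //
+  // e.g. (a sketch; the exact flags depend on the transaction write policy):
+  //
+  //   DB* db = nullptr;
+  //   std::vector<ColumnFamilyHandle*> handles;
+  //   Status s = DBImpl::Open(db_options, dbname, column_families, &handles,
+  //                           &db, /*seq_per_batch=*/true,
+  //                           /*batch_per_txn=*/true);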
+ static Status Open(const DBOptions& db_options, const std::string& name, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + const bool seq_per_batch, const bool batch_per_txn); + + static IOStatus CreateAndNewDirectory( + FileSystem* fs, const std::string& dirname, + std::unique_ptr* directory); + + // find stats map from stats_history_ with smallest timestamp in + // the range of [start_time, end_time) + bool FindStatsByTime(uint64_t start_time, uint64_t end_time, + uint64_t* new_time, + std::map* stats_map); + + // Print information of all tombstones of all iterators to the std::string + // This is only used by ldb. The output might be capped. Tombstones + // printed out are not guaranteed to be in any order. + Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, + int max_entries_to_print, + std::string* out_str); + + VersionSet* GetVersionSet() const { return versions_.get(); } + + // Wait for all flush and compactions jobs to finish. Jobs to wait include the + // unscheduled (queued, but not scheduled yet). If the db is shutting down, + // Status::ShutdownInProgress will be returned. If PauseBackgroundWork() was + // called prior to this, this may potentially wait for unscheduled jobs + // indefinitely. abort_on_pause can be set to true to abort, and + // Status::Aborted will be returned immediately. This may also never return if + // there's sufficient ongoing writes that keeps flush and compaction going + // without stopping. The user would have to cease all the writes to DB to make + // this eventually return in a stable state. + Status WaitForCompact(bool abort_on_pause = false); + +#ifndef NDEBUG + // Compact any files in the named level that overlap [*begin, *end] + Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, + ColumnFamilyHandle* column_family = nullptr, + bool disallow_trivial_move = false); + + Status TEST_SwitchWAL(); + + bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } + + bool TEST_IsLogGettingFlushed() { + return alive_log_files_.begin()->getting_flushed; + } + + Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); + + // Force current memtable contents to be flushed. + Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, + ColumnFamilyHandle* cfh = nullptr); + + Status TEST_FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_opts); + + // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This + // is because in certain cases, we can flush column families, wait for the + // flush to complete, but delete the column family handle before the wait + // finishes. For example in CompactRange. + Status TEST_AtomicFlushMemTables( + const autovector& provided_candidate_cfds, + const FlushOptions& flush_opts); + + // Wait for background threads to complete scheduled work. + Status TEST_WaitForBackgroundWork(); + + // Wait for memtable compaction + Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); + + // Wait for any compaction + Status TEST_WaitForCompact(bool abort_on_pause = false); + + // Wait for any background purge + Status TEST_WaitForPurge(); + + // Get the background error status + Status TEST_GetBGError(); + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + uint64_t TEST_MaxNextLevelOverlappingBytes( + ColumnFamilyHandle* column_family = nullptr); + + // Return the current manifest file no. 
+ uint64_t TEST_Current_Manifest_FileNo(); + + // Returns the number that'll be assigned to the next file that's created. + uint64_t TEST_Current_Next_FileNo(); + + // get total level0 file size. Only for testing. + uint64_t TEST_GetLevel0TotalSize(); + + void TEST_GetFilesMetaData( + ColumnFamilyHandle* column_family, + std::vector>* metadata, + std::vector>* blob_metadata = nullptr); + + void TEST_LockMutex(); + + void TEST_UnlockMutex(); + + void TEST_SignalAllBgCv(); + + // REQUIRES: mutex locked + void* TEST_BeginWrite(); + + // REQUIRES: mutex locked + // pass the pointer that you got from TEST_BeginWrite() + void TEST_EndWrite(void* w); + + uint64_t TEST_MaxTotalInMemoryState() const { + return max_total_in_memory_state_; + } + + size_t TEST_LogsToFreeSize(); + + uint64_t TEST_LogfileNumber(); + + uint64_t TEST_total_log_size() const { return total_log_size_; } + + // Returns column family name to ImmutableCFOptions map. + Status TEST_GetAllImmutableCFOptions( + std::unordered_map* iopts_map); + + // Return the lastest MutableCFOptions of a column family + Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family, + MutableCFOptions* mutable_cf_options); + + Cache* TEST_table_cache() { return table_cache_.get(); } + + WriteController& TEST_write_controler() { return write_controller_; } + + uint64_t TEST_FindMinLogContainingOutstandingPrep(); + uint64_t TEST_FindMinPrepLogReferencedByMemTable(); + size_t TEST_PreparedSectionCompletedSize(); + size_t TEST_LogsWithPrepSize(); + + int TEST_BGCompactionsAllowed() const; + int TEST_BGFlushesAllowed() const; + size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + void TEST_WaitForPeriodicTaskRun(std::function callback) const; + SeqnoToTimeMapping TEST_GetSeqnoToTimeMapping() const; + size_t TEST_EstimateInMemoryStatsHistorySize() const; + + uint64_t TEST_GetCurrentLogNumber() const { + InstrumentedMutexLock l(mutex()); + assert(!logs_.empty()); + return logs_.back().number; + } + + const std::unordered_set& TEST_GetFilesGrabbedForPurge() const { + return files_grabbed_for_purge_; + } + + const PeriodicTaskScheduler& TEST_GetPeriodicTaskScheduler() const; + +#endif // NDEBUG + + // persist stats to column family "_persistent_stats" + void PersistStats(); + + // dump rocksdb.stats to LOG + void DumpStats(); + + // flush LOG out of application buffer + void FlushInfoLog(); + + // record current sequence number to time mapping + void RecordSeqnoToTimeMapping(); + + // Interface to block and signal the DB in case of stalling writes by + // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface. + // When DB needs to be blocked or signalled by WriteBufferManager, + // state_ is changed accordingly. + class WBMStallInterface : public StallInterface { + public: + enum State { + BLOCKED = 0, + RUNNING, + }; + + WBMStallInterface() : state_cv_(&state_mutex_) { + MutexLock lock(&state_mutex_); + state_ = State::RUNNING; + } + + void SetState(State state) { + MutexLock lock(&state_mutex_); + state_ = state; + } + + // Change the state_ to State::BLOCKED and wait until its state is + // changed by WriteBufferManager. When stall is cleared, Signal() is + // called to change the state and unblock the DB. + void Block() override { + MutexLock lock(&state_mutex_); + while (state_ == State::BLOCKED) { + TEST_SYNC_POINT("WBMStallInterface::BlockDB"); + state_cv_.Wait(); + } + } + + // Called from WriteBufferManager. 
This function changes the state_ + // to State::RUNNING indicating the stall is cleared and DB can proceed. + void Signal() override { + { + MutexLock lock(&state_mutex_); + state_ = State::RUNNING; + } + state_cv_.Signal(); + } + + private: + // Conditional variable and mutex to block and + // signal the DB during stalling process. + port::Mutex state_mutex_; + port::CondVar state_cv_; + // state represting whether DB is running or blocked because of stall by + // WriteBufferManager. + State state_; + }; + + static void TEST_ResetDbSessionIdGen(); + static std::string GenerateDbSessionId(Env* env); + + bool seq_per_batch() const { return seq_per_batch_; } + + protected: + const std::string dbname_; + // TODO(peterd): unify with VersionSet::db_id_ + std::string db_id_; + // db_session_id_ is an identifier that gets reset + // every time the DB is opened + std::string db_session_id_; + std::unique_ptr versions_; + // Flag to check whether we allocated and own the info log file + bool own_info_log_; + Status init_logger_creation_s_; + const DBOptions initial_db_options_; + Env* const env_; + std::shared_ptr io_tracer_; + const ImmutableDBOptions immutable_db_options_; + FileSystemPtr fs_; + MutableDBOptions mutable_db_options_; + Statistics* stats_; + std::unordered_map + recovered_transactions_; + std::unique_ptr tracer_; + InstrumentedMutex trace_mutex_; + BlockCacheTracer block_cache_tracer_; + + // constant false canceled flag, used when the compaction is not manual + const std::atomic kManualCompactionCanceledFalse_{false}; + + // State below is protected by mutex_ + // With two_write_queues enabled, some of the variables that accessed during + // WriteToWAL need different synchronization: log_empty_, alive_log_files_, + // logs_, logfile_number_. Refer to the definition of each variable below for + // more description. + // + // `mutex_` can be a hot lock in some workloads, so it deserves dedicated + // cachelines. + mutable CacheAlignedInstrumentedMutex mutex_; + + ColumnFamilyHandleImpl* default_cf_handle_; + InternalStats* default_cf_internal_stats_; + + // table_cache_ provides its own synchronization + std::shared_ptr table_cache_; + + ErrorHandler error_handler_; + + // Unified interface for logging events + EventLogger event_logger_; + + // only used for dynamically adjusting max_total_wal_size. it is a sum of + // [write_buffer_size * max_write_buffer_number] over all column families + std::atomic max_total_in_memory_state_; + + // The options to access storage files + const FileOptions file_options_; + + // Additonal options for compaction and flush + FileOptions file_options_for_compaction_; + + std::unique_ptr column_family_memtables_; + + // Increase the sequence number after writing each batch, whether memtable is + // disabled for that or not. Otherwise the sequence number is increased after + // writing each key into memtable. This implies that when disable_memtable is + // set, the seq is not increased at all. + // + // Default: false + const bool seq_per_batch_; + // This determines during recovery whether we expect one writebatch per + // recovered transaction, or potentially multiple writebatches per + // transaction. For WriteUnprepared, this is set to false, since multiple + // batches can exist per transaction. + // + // Default: true + const bool batch_per_txn_; + + // Each flush or compaction gets its own job id. 
this counter makes sure
+ // they're unique
+ std::atomic<int> next_job_id_;
+
+ std::atomic<bool> shutting_down_;
+
+ // RecoveryContext struct stores the context about version edits along
+ // with corresponding column_family_data and column_family_options.
+ class RecoveryContext {
+ public:
+ ~RecoveryContext() {
+ for (auto& edit_list : edit_lists_) {
+ for (auto* edit : edit_list) {
+ delete edit;
+ }
+ }
+ }
+
+ void UpdateVersionEdits(ColumnFamilyData* cfd, const VersionEdit& edit) {
+ assert(cfd != nullptr);
+ if (map_.find(cfd->GetID()) == map_.end()) {
+ uint32_t size = static_cast<uint32_t>(map_.size());
+ map_.emplace(cfd->GetID(), size);
+ cfds_.emplace_back(cfd);
+ mutable_cf_opts_.emplace_back(cfd->GetLatestMutableCFOptions());
+ edit_lists_.emplace_back(autovector<VersionEdit*>());
+ }
+ uint32_t i = map_[cfd->GetID()];
+ edit_lists_[i].emplace_back(new VersionEdit(edit));
+ }
+
+ std::unordered_map<uint32_t, uint32_t> map_; // cf_id to index;
+ autovector<ColumnFamilyData*> cfds_;
+ autovector<const MutableCFOptions*> mutable_cf_opts_;
+ autovector<autovector<VersionEdit*>> edit_lists_;
+ // files_to_delete_ contains sst files
+ std::unordered_set<std::string> files_to_delete_;
+ };
+
+ // Persist options to the options file.
+ // Except in DB::Open(), this can only be called with the preconditions
+ // implied by its arguments satisfied: if need_mutex_lock = false, the
+ // caller must already hold the DB mutex (otherwise the method locks it
+ // itself); if need_enter_write_thread = false, the caller must already
+ // have entered the write thread (otherwise the method enters it itself).
+ Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread);
+
+ Status CompactRangeInternal(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end,
+ const std::string& trim_ts);
+
+ // The following two functions can only be called when:
+ // 1. WriteThread::Writer::EnterUnbatched() is used.
+ // 2. db_mutex is NOT held
+ Status RenameTempFileToOptionsFile(const std::string& file_name);
+ Status DeleteObsoleteOptionsFiles();
+
+ void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+ const MutableCFOptions& mutable_cf_options,
+ int job_id, FlushReason flush_reason);
+
+ void NotifyOnFlushCompleted(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info);
+
+ void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats, int job_id);
+
+ void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats,
+ int job_id);
+ void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
+ const MemTableInfo& mem_table_info);
+
+ void NotifyOnExternalFileIngested(
+ ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);
+
+ virtual Status FlushForGetLiveFiles();
+
+ void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+ void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+ void EraseThreadStatusDbInfo() const;
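+
+ // Illustrative sketch (not upstream code): recovery code accumulates one
+ // VersionEdit list per column family in a RecoveryContext and later persists
+ // them in one batch via LogAndApplyForRecovery() (declared further below).
+ // A hypothetical caller, assuming an already resolved ColumnFamilyData* cfd
+ // and a hypothetical new_log_number:
+ //
+ //   RecoveryContext recovery_ctx;
+ //   VersionEdit edit;
+ //   edit.SetLogNumber(new_log_number);
+ //   recovery_ctx.UpdateVersionEdits(cfd, edit);  // stores a copy of edit
+ //   Status s = LogAndApplyForRecovery(recovery_ctx);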
+ // If disable_memtable is set the application logic must guarantee that the
+ // batch will still be skipped from memtable during the recovery. An exception
+ // to this is seq_per_batch_ mode, in which since each batch already takes one
+ // seq, it is ok for the batch to write to memtable during recovery as long as
+ // it only takes one sequence number: i.e., no duplicate keys.
+ // In WriteCommitted it is guaranteed since disable_memtable is used for the
+ // prepare batch which will be written to memtable later during the commit,
+ // and in WritePrepared it is guaranteed since it will be used only for WAL
+ // markers which will never be written to memtable. If the commit marker is
+ // accompanied with CommitTimeWriteBatch that is not written to memtable as
+ // long as it has no duplicate keys, it does not violate the one-seq-per-batch
+ // policy.
+ // batch_cnt is expected to be non-zero in seq_per_batch mode and
+ // indicates the number of sub-batches. A sub-batch is a subset of the write
+ // batch that does not have duplicate keys.
+ Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
+ WriteCallback* callback = nullptr,
+ uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+ bool disable_memtable = false, uint64_t* seq_used = nullptr,
+ size_t batch_cnt = 0,
+ PreReleaseCallback* pre_release_callback = nullptr,
+ PostMemTableCallback* post_memtable_callback = nullptr);
+
+ Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
+ WriteCallback* callback = nullptr,
+ uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+ bool disable_memtable = false,
+ uint64_t* seq_used = nullptr);
+
+ // Write only to memtables without joining any write queue
+ Status UnorderedWriteMemtable(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t log_ref, SequenceNumber seq,
+ const size_t sub_batch_cnt);
+
+ // Whether the batch requires to be assigned with an order
+ enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder };
+ // Whether it requires publishing last sequence or not
+ enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq };
+
+ // Join the write_thread to write the batch only to the WAL. It is the
+ // responsibility of the caller to also write the write batch to the memtable
+ // if required.
+ //
+ // sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder
+ // indicating the number of sub-batches in my_batch. A sub-batch is a subset
+ // of the write batch that does not have duplicate keys. When seq_per_batch is
+ // not set, each key is a separate sub-batch. Otherwise each duplicate key
+ // marks the start of a new sub-batch.
+ Status WriteImplWALOnly(
+ WriteThread* write_thread, const WriteOptions& options,
+ WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
+ const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+ PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+ const PublishLastSeq publish_last_seq, const bool disable_memtable);
+
+ // write cached_recoverable_state_ to memtable if it is not empty
+ // The writer must be the leader in write_thread_ and holding mutex_
+ Status WriteRecoverableState();
+
+ // Actual implementation of Close()
+ Status CloseImpl();
+
+ // Recover the descriptor from persistent storage. May do a significant
+ // amount of work to recover recently logged updates. Any changes to
+ // be made to the descriptor are added to *edit.
+ // recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
+ // skipped.
+ // recovery_ctx stores the context about version edits and all those
+ // edits are persisted to new Manifest after successfully syncing the new WAL.
+ virtual Status Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only = false, bool error_if_wal_file_exists = false,
+ bool error_if_data_exists_in_wals = false,
+ uint64_t* recovered_seq = nullptr,
+ RecoveryContext* recovery_ctx = nullptr);
+
+ virtual bool OwnTablesAndLogs() const { return true; }
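+
+ // Illustrative note (not upstream code): with seq_per_batch_, batch_cnt
+ // counts sub-batches, where a duplicate key starts a new sub-batch. For
+ // example, a hypothetical caller writing {k1, k2, k1} would pass
+ // batch_cnt = 2:
+ //
+ //   WriteBatch batch;
+ //   batch.Put("k1", "v1");
+ //   batch.Put("k2", "v2");
+ //   batch.Put("k1", "v3");  // duplicate key; second sub-batch begins
+ //   Status s = WriteImpl(WriteOptions(), &batch, /*callback=*/nullptr,
+ //                        /*log_used=*/nullptr, /*log_ref=*/0,
+ //                        /*disable_memtable=*/false, /*seq_used=*/nullptr,
+ //                        /*batch_cnt=*/2);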
+ // Setup DB identity file, and write DB ID to manifest if necessary.
+ Status SetupDBId(bool read_only, RecoveryContext* recovery_ctx);
+ // Assign db_id_ and write DB ID to manifest if necessary.
+ void SetDBId(std::string&& id, bool read_only, RecoveryContext* recovery_ctx);
+
+ // REQUIRES: db mutex held when calling this function, but the db mutex can
+ // be released and re-acquired. Db mutex will be held when the function
+ // returns.
+ // After recovery, there may be SST files in db/cf paths that are
+ // not referenced in the MANIFEST (e.g.
+ // 1. It's best effort recovery;
+ // 2. The VersionEdits referencing the SST files are appended to
+ // RecoveryContext, DB crashes when syncing the MANIFEST, the VersionEdits are
+ // still not synced to MANIFEST during recovery.)
+ // It stores the SST files to be deleted in RecoveryContext. In the
+ // meantime, we find out the largest file number present in the paths, and
+ // bump up the version set's next_file_number_ to be 1 + largest_file_number.
+ // recovery_ctx stores the context about version edits and files to be
+ // deleted. All those edits are persisted to the new Manifest after
+ // successfully syncing the new WAL.
+ Status DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx);
+
+ // SetDbSessionId() should be called in the constructor DBImpl()
+ // to ensure that db_session_id_ gets updated every time the DB is opened
+ void SetDbSessionId();
+
+ Status FailIfCfHasTs(const ColumnFamilyHandle* column_family) const;
+ Status FailIfTsMismatchCf(ColumnFamilyHandle* column_family, const Slice& ts,
+ bool ts_for_read) const;
+
+ // recovery_ctx stores the context about version edits, and
+ // LogAndApplyForRecovery persists all those edits to the new Manifest after
+ // successfully syncing the new WAL.
+ // LogAndApplyForRecovery should be called only once during recovery, and it
+ // should be called when RocksDB writes the first new MANIFEST of this
+ // recovery.
+ Status LogAndApplyForRecovery(const RecoveryContext& recovery_ctx);
+
+ void InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();
+
+ // Return true to proceed with current WAL record whose content is stored in
+ // `batch`. Return false to skip current WAL record.
+ bool InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number, + const std::string& wal_fname, + log::Reader::Reporter& reporter, + Status& status, bool& stop_replay, + WriteBatch& batch); + + private: + friend class DB; + friend class ErrorHandler; + friend class InternalStats; + friend class PessimisticTransaction; + friend class TransactionBaseImpl; + friend class WriteCommittedTxn; + friend class WritePreparedTxn; + friend class WritePreparedTxnDB; + friend class WriteBatchWithIndex; + friend class WriteUnpreparedTxnDB; + friend class WriteUnpreparedTxn; + + friend class ForwardIterator; + friend struct SuperVersion; + friend class CompactedDBImpl; + friend class DBTest_ConcurrentFlushWAL_Test; + friend class DBTest_MixedSlowdownOptionsStop_Test; + friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test; + friend class DBCompactionTest_CompactionDuringShutdown_Test; + friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test; +#ifndef NDEBUG + friend class DBTest2_ReadCallbackTest_Test; + friend class WriteCallbackPTest_WriteWithCallbackTest_Test; + friend class XFTransactionWriteHandler; + friend class DBBlobIndexTest; + friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; +#endif + + struct CompactionState; + struct PrepickedCompaction; + struct PurgeFileInfo; + + struct WriteContext { + SuperVersionContext superversion_context; + autovector memtables_to_free_; + + explicit WriteContext(bool create_superversion = false) + : superversion_context(create_superversion) {} + + ~WriteContext() { + superversion_context.Clean(); + for (auto& m : memtables_to_free_) { + delete m; + } + } + }; + + struct LogFileNumberSize { + explicit LogFileNumberSize(uint64_t _number) : number(_number) {} + LogFileNumberSize() {} + void AddSize(uint64_t new_size) { size += new_size; } + uint64_t number; + uint64_t size = 0; + bool getting_flushed = false; + }; + + struct LogWriterNumber { + // pass ownership of _writer + LogWriterNumber(uint64_t _number, log::Writer* _writer) + : number(_number), writer(_writer) {} + + log::Writer* ReleaseWriter() { + auto* w = writer; + writer = nullptr; + return w; + } + Status ClearWriter() { + Status s = writer->WriteBuffer(); + delete writer; + writer = nullptr; + return s; + } + + bool IsSyncing() { return getting_synced; } + + uint64_t GetPreSyncSize() { + assert(getting_synced); + return pre_sync_size; + } + + void PrepareForSync() { + assert(!getting_synced); + // Size is expected to be monotonically increasing. + assert(writer->file()->GetFlushedSize() >= pre_sync_size); + getting_synced = true; + pre_sync_size = writer->file()->GetFlushedSize(); + } + + void FinishSync() { + assert(getting_synced); + getting_synced = false; + } + + uint64_t number; + // Visual Studio doesn't support deque's member to be noncopyable because + // of a std::unique_ptr as a member. + log::Writer* writer; // own + + private: + // true for some prefix of logs_ + bool getting_synced = false; + // The size of the file before the sync happens. This amount is guaranteed + // to be persisted even if appends happen during sync so it can be used for + // tracking the synced size in MANIFEST. 
+ uint64_t pre_sync_size = 0; + }; + + struct LogContext { + explicit LogContext(bool need_sync = false) + : need_log_sync(need_sync), need_log_dir_sync(need_sync) {} + bool need_log_sync = false; + bool need_log_dir_sync = false; + log::Writer* writer = nullptr; + LogFileNumberSize* log_file_number_size = nullptr; + }; + + // PurgeFileInfo is a structure to hold information of files to be deleted in + // purge_files_ + struct PurgeFileInfo { + std::string fname; + std::string dir_to_sync; + FileType type; + uint64_t number; + int job_id; + PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num, + int jid) + : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {} + }; + + // Argument required by background flush thread. + struct BGFlushArg { + BGFlushArg() + : cfd_(nullptr), + max_memtable_id_(0), + superversion_context_(nullptr), + flush_reason_(FlushReason::kOthers) {} + BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id, + SuperVersionContext* superversion_context, + FlushReason flush_reason) + : cfd_(cfd), + max_memtable_id_(max_memtable_id), + superversion_context_(superversion_context), + flush_reason_(flush_reason) {} + + // Column family to flush. + ColumnFamilyData* cfd_; + // Maximum ID of memtable to flush. In this column family, memtables with + // IDs smaller than this value must be flushed before this flush completes. + uint64_t max_memtable_id_; + // Pointer to a SuperVersionContext object. After flush completes, RocksDB + // installs a new superversion for the column family. This operation + // requires a SuperVersionContext object (currently embedded in JobContext). + SuperVersionContext* superversion_context_; + FlushReason flush_reason_; + }; + + // Argument passed to flush thread. + struct FlushThreadArg { + DBImpl* db_; + + Env::Priority thread_pri_; + }; + + // Information for a manual compaction + struct ManualCompactionState { + ManualCompactionState(ColumnFamilyData* _cfd, int _input_level, + int _output_level, uint32_t _output_path_id, + bool _exclusive, bool _disallow_trivial_move, + std::atomic* _canceled) + : cfd(_cfd), + input_level(_input_level), + output_level(_output_level), + output_path_id(_output_path_id), + exclusive(_exclusive), + disallow_trivial_move(_disallow_trivial_move), + canceled(_canceled ? *_canceled : canceled_internal_storage) {} + // When _canceled is not provided by ther user, we assign the reference of + // canceled_internal_storage to it to consolidate canceled and + // manual_compaction_paused since DisableManualCompaction() might be + // called + + ColumnFamilyData* cfd; + int input_level; + int output_level; + uint32_t output_path_id; + Status status; + bool done = false; + bool in_progress = false; // compaction request being processed? 
+ bool incomplete = false; // only part of requested range compacted
+ bool exclusive; // current behavior of only one manual
+ bool disallow_trivial_move; // Force actual compaction to run
+ const InternalKey* begin = nullptr; // nullptr means beginning of key range
+ const InternalKey* end = nullptr; // nullptr means end of key range
+ InternalKey* manual_end = nullptr; // how far we are compacting
+ InternalKey tmp_storage; // Used to keep track of compaction progress
+ InternalKey tmp_storage1; // Used to keep track of compaction progress
+
+ // When the user provides a canceled pointer in CompactRangeOptions, the
+ // above variable is the reference of the user-provided
+ // `canceled`, otherwise, it is the reference of canceled_internal_storage
+ std::atomic<bool> canceled_internal_storage = false;
+ std::atomic<bool>& canceled; // Compaction canceled pointer reference
+ };
+ struct PrepickedCompaction {
+ // background compaction takes ownership of `compaction`.
+ Compaction* compaction;
+ // caller retains ownership of `manual_compaction_state` as it is reused
+ // across background compactions.
+ ManualCompactionState* manual_compaction_state; // nullptr if non-manual
+ // task limiter token is requested during compaction picking.
+ std::unique_ptr<TaskLimiterToken> task_token;
+ };
+
+ struct CompactionArg {
+ // caller retains ownership of `db`.
+ DBImpl* db;
+ // background compaction takes ownership of `prepicked_compaction`.
+ PrepickedCompaction* prepicked_compaction;
+ Env::Priority compaction_pri_;
+ };
+
+ // Initialize the built-in column family for persistent stats. Depending on
+ // whether on-disk persistent stats have been enabled before, it may either
+ // create a new column family and column family handle or just a column family
+ // handle.
+ // Required: DB mutex held
+ Status InitPersistStatsColumnFamily();
+
+ // Persistent Stats column family has two format version keys which are used
+ // for compatibility checks. Write the format version if it's created for the
+ // first time, read the format version and check compatibility if recovering
+ // from disk. This function requires DB mutex held at entrance but may
+ // release and re-acquire DB mutex in the process.
+ // Required: DB mutex held
+ Status PersistentStatsProcessFormatVersion();
+
+ Status ResumeImpl(DBRecoverContext context);
+
+ void MaybeIgnoreError(Status* s) const;
+
+ const Status CreateArchivalDirectory();
+
+ Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+ const std::string& cf_name,
+ ColumnFamilyHandle** handle);
+
+ Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
+
+ // Delete any unneeded files and stale in-memory entries.
+ void DeleteObsoleteFiles();
+ // Delete obsolete files and log status and information of file deletion
+ void DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+ const std::string& path_to_sync, FileType type,
+ uint64_t number);
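+
+ // Illustrative note (not upstream code): ManualCompactionState::canceled
+ // above binds either to the user's CompactRangeOptions::canceled atomic or
+ // to canceled_internal_storage, so cancellation is observed through a single
+ // reference. A hypothetical caller-side view:
+ //
+ //   std::atomic<bool> canceled{false};
+ //   CompactRangeOptions opts;
+ //   opts.canceled = &canceled;   // user-provided storage is bound
+ //   db->CompactRange(opts, /*begin=*/nullptr, /*end=*/nullptr);
+ //   canceled.store(true);        // requests cancellation of the compaction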
+ // Background process needs to call
+ //    auto x = CaptureCurrentFileNumberInPendingOutputs()
+ //    auto file_num = versions_->NewFileNumber();
+ //    <do something about file_num>
+ //    ReleaseFileNumberFromPendingOutputs(x)
+ // This will protect any file with number `file_num` or greater from being
+ // deleted while <do something about file_num> is running.
+ // -----------
+ // This function will capture the current file number and append it to
+ // pending_outputs_. This will prevent any background process from deleting
+ // any file created after this point.
+ std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
+ // This function should be called with the result of
+ // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
+ // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
+ // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
+ // and blocked by any other pending_outputs_ calls)
+ void ReleaseFileNumberFromPendingOutputs(
+ std::unique_ptr<std::list<uint64_t>::iterator>& v);
+
+ IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals);
+
+ // Flush the in-memory write buffer to storage. Switches to a new
+ // log-file/memtable and writes a new descriptor iff successful. Then
+ // installs a new super version for the column family.
+ Status FlushMemTableToOutputFile(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ bool* madeProgress, JobContext* job_context, FlushReason flush_reason,
+ SuperVersionContext* superversion_context,
+ std::vector<SequenceNumber>& snapshot_seqs,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+ Env::Priority thread_pri);
+
+ // Flush the memtables of (multiple) column families to multiple files on
+ // persistent storage.
+ Status FlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+ Status AtomicFlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+ // REQUIRES: log_numbers are sorted in ascending order
+ // corrupted_log_found is set to true if we recover from a corrupted log file.
+ Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence, bool read_only,
+ bool* corrupted_log_found,
+ RecoveryContext* recovery_ctx);
+
+ // The following two methods are used to flush a memtable to
+ // storage. The first one is used at database recovery time (when the
+ // database is opened) and is heavyweight because it holds the mutex
+ // for the entire period. The second method WriteLevel0Table supports
+ // concurrent flushing of memtables to storage.
+ Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+ MemTable* mem, VersionEdit* edit);
+
+ // Get the size of a log file and, if truncate is true, truncate the
+ // log file to its actual size, thereby freeing preallocated space.
+ // Return success even if truncate fails
+ Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
+ LogFileNumberSize* log);
+
+ // Restore alive_log_files_ and total_log_size_ after recovery.
+ // It needs to run only when there's no flush during recovery
+ // (e.g. avoid_flush_during_recovery=true). May also trigger flush
+ // in case total_log_size > max_total_wal_size.
+ Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
+
+ // num_bytes: for slowdown case, delay time is calculated based on
+ // `num_bytes` going through.
+ Status DelayWrite(uint64_t num_bytes, WriteThread& write_thread,
+ const WriteOptions& write_options);
+
+ // Begin stalling of writes when memory usage increases beyond a certain
+ // threshold.
+ void WriteBufferManagerStallWrites();
+
+ Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+ WriteBatch* my_batch);
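+
+ // Illustrative sketch (not upstream code) of the pending_outputs_ protocol
+ // documented earlier in this section, as a background job might follow it:
+ //
+ //   auto pending = CaptureCurrentFileNumberInPendingOutputs();
+ //   uint64_t file_num = versions_->NewFileNumber();
+ //   // ... create an SST or blob file numbered file_num ...
+ //   auto it = std::make_unique<std::list<uint64_t>::iterator>(pending);
+ //   ReleaseFileNumberFromPendingOutputs(it);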
+ // REQUIRES: mutex locked and in write thread.
+ Status ScheduleFlushes(WriteContext* context);
+
+ void MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds);
+
+ Status TrimMemtableHistory(WriteContext* context);
+
+ Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
+
+ // Select and output column families qualified for atomic flush in
+ // `selected_cfds`. If `provided_candidate_cfds` is non-empty, it will be used
+ // as candidate CFs to select qualified ones from. Otherwise, all column
+ // families are used as candidates to select from.
+ //
+ // REQUIRES: mutex held
+ void SelectColumnFamiliesForAtomicFlush(
+ autovector<ColumnFamilyData*>* selected_cfds,
+ const autovector<ColumnFamilyData*>& provided_candidate_cfds = {});
+
+ // Force current memtable contents to be flushed.
+ Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
+ FlushReason flush_reason,
+ bool entered_write_thread = false);
+
+ // Atomic-flush memtables from qualified CFs among `provided_candidate_cfds`
+ // (if non-empty) or among all column families, and atomically record the
+ // result to the MANIFEST.
+ Status AtomicFlushMemTables(
+ const FlushOptions& options, FlushReason flush_reason,
+ const autovector<ColumnFamilyData*>& provided_candidate_cfds = {},
+ bool entered_write_thread = false);
+
+ // Wait until flushing this column family won't stall writes
+ Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+ bool* flush_needed);
+
+ // Wait for memtable flushed.
+ // If flush_memtable_id is non-null, wait until the memtable with the ID
+ // gets flushed. Otherwise, wait until the column family doesn't have any
+ // memtable pending flush.
+ // resuming_from_bg_err indicates whether the caller is attempting to resume
+ // from background error.
+ Status WaitForFlushMemTable(ColumnFamilyData* cfd,
+ const uint64_t* flush_memtable_id = nullptr,
+ bool resuming_from_bg_err = false) {
+ return WaitForFlushMemTables({cfd}, {flush_memtable_id},
+ resuming_from_bg_err);
+ }
+ // Wait for memtables to be flushed for multiple column families.
+ Status WaitForFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const uint64_t*>& flush_memtable_ids,
+ bool resuming_from_bg_err);
+
+ inline void WaitForPendingWrites() {
+ mutex_.AssertHeld();
+ TEST_SYNC_POINT("DBImpl::WaitForPendingWrites:BeforeBlock");
+ // If pipelined write is enabled, wait for all pending memtable
+ // writers.
+ if (immutable_db_options_.enable_pipelined_write) {
+ // Memtable writers may call DB::Get in case max_successive_merges > 0,
+ // which may lock mutex. Unlocking mutex here to avoid deadlock.
+ mutex_.Unlock();
+ write_thread_.WaitForMemTableWriters();
+ mutex_.Lock();
+ }
+
+ if (!immutable_db_options_.unordered_write) {
+ // Then the writes are finished before the next write group starts
+ return;
+ }
+
+ // Wait for the ones who already wrote to the WAL to finish their
+ // memtable write.
+ if (pending_memtable_writes_.load() != 0) {
+ std::unique_lock<std::mutex> guard(switch_mutex_);
+ switch_cv_.wait(guard,
+ [&] { return pending_memtable_writes_.load() == 0; });
+ }
+ }
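+
+ // Illustrative sketch (not upstream code): a flush-then-wait sequence built
+ // from the helpers above, for a hypothetical caller holding a resolved
+ // ColumnFamilyData* cfd:
+ //
+ //   FlushOptions fo;
+ //   fo.wait = false;
+ //   Status s = FlushMemTable(cfd, fo, FlushReason::kManualFlush);
+ //   if (s.ok()) {
+ //     s = WaitForFlushMemTable(cfd);  // all pending memtables flushed
+ //   }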
+ // TaskType is used to identify tasks in thread-pool; currently it only
+ // differentiates manual compaction, which could be unscheduled from the
+ // thread-pool.
+ enum class TaskType : uint8_t {
+ kDefault = 0,
+ kManualCompaction = 1,
+ kCount = 2,
+ };
+
+ // Task tag is used to identify tasks in thread-pool, which is
+ // dbImpl obj address + type
+ inline void* GetTaskTag(TaskType type) {
+ return GetTaskTag(static_cast<uint8_t>(type));
+ }
+
+ inline void* GetTaskTag(uint8_t type) {
+ return static_cast<uint8_t*>(static_cast<void*>(this)) + type;
+ }
+
+ // REQUIRES: mutex locked and in write thread.
+ void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status SwitchWAL(WriteContext* write_context);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status HandleWriteBufferManagerFlush(WriteContext* write_context);
+
+ // REQUIRES: mutex locked
+ Status PreprocessWrite(const WriteOptions& write_options,
+ LogContext* log_context, WriteContext* write_context);
+
+ // Merge write batches in the write group into merged_batch.
+ // Returns OK if merge is successful.
+ // Returns Corruption if corruption in write batch is detected.
+ Status MergeBatch(const WriteThread::WriteGroup& write_group,
+ WriteBatch* tmp_batch, WriteBatch** merged_batch,
+ size_t* write_with_wal, WriteBatch** to_be_cached_state);
+
+ // rate_limiter_priority is used to charge `DBOptions::rate_limiter`
+ // for automatic WAL flush (`Options::manual_wal_flush` == false)
+ // associated with this WriteToWAL
+ IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
+ uint64_t* log_used, uint64_t* log_size,
+ Env::IOPriority rate_limiter_priority,
+ LogFileNumberSize& log_file_number_size);
+
+ IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group,
+ log::Writer* log_writer, uint64_t* log_used,
+ bool need_log_sync, bool need_log_dir_sync,
+ SequenceNumber sequence,
+ LogFileNumberSize& log_file_number_size);
+
+ IOStatus ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
+ uint64_t* log_used,
+ SequenceNumber* last_sequence, size_t seq_inc);
+
+ // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+ // Caller must hold mutex_.
+ void WriteStatusCheckOnLocked(const Status& status);
+
+ // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+ void WriteStatusCheck(const Status& status);
+
+ // Used by WriteImpl to update bg_error_ when an IO error happens, e.g.,
+ // writing the WAL or syncing the WAL fails, if paranoid check is enabled.
+ void IOStatusCheck(const IOStatus& status);
+
+ // Used by WriteImpl to update bg_error_ in case of memtable insert error.
+ void MemTableInsertStatusCheck(const Status& memtable_insert_status);
+
+ Status CompactFilesImpl(const CompactionOptions& compact_options,
+ ColumnFamilyData* cfd, Version* version,
+ const std::vector<std::string>& input_file_names,
+ std::vector<std::string>* const output_file_names,
+ const int output_level, int output_path_id,
+ JobContext* job_context, LogBuffer* log_buffer,
+ CompactionJobInfo* compaction_job_info);
+
+ ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
+
+ void MaybeScheduleFlushOrCompaction();
+
+ struct FlushRequest {
+ FlushReason flush_reason;
+ // A map from each column family to flush to the largest memtable id to
+ // persist for it. Once all the memtables whose IDs are smaller than or
+ // equal to this per-column-family specified value are flushed, this flush
+ // request is considered to have completed its work of flushing this
+ // column family. After completing the work for all column families in this
+ // request, this flush is considered complete.
+ std::unordered_map + cfd_to_max_mem_id_to_persist; + }; + + void GenerateFlushRequest(const autovector& cfds, + FlushReason flush_reason, FlushRequest* req); + + void SchedulePendingFlush(const FlushRequest& req); + + void SchedulePendingCompaction(ColumnFamilyData* cfd); + void SchedulePendingPurge(std::string fname, std::string dir_to_sync, + FileType type, uint64_t number, int job_id); + static void BGWorkCompaction(void* arg); + // Runs a pre-chosen universal compaction involving bottom level in a + // separate, bottom-pri thread pool. + static void BGWorkBottomCompaction(void* arg); + static void BGWorkFlush(void* arg); + static void BGWorkPurge(void* arg); + static void UnscheduleCompactionCallback(void* arg); + static void UnscheduleFlushCallback(void* arg); + void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, + Env::Priority thread_pri); + void BackgroundCallFlush(Env::Priority thread_pri); + void BackgroundCallPurge(); + Status BackgroundCompaction(bool* madeProgress, JobContext* job_context, + LogBuffer* log_buffer, + PrepickedCompaction* prepicked_compaction, + Env::Priority thread_pri); + Status BackgroundFlush(bool* madeProgress, JobContext* job_context, + LogBuffer* log_buffer, FlushReason* reason, + Env::Priority thread_pri); + + bool EnoughRoomForCompaction(ColumnFamilyData* cfd, + const std::vector& inputs, + bool* sfm_bookkeeping, LogBuffer* log_buffer); + + // Request compaction tasks token from compaction thread limiter. + // It always succeeds if force = true or limiter is disable. + bool RequestCompactionToken(ColumnFamilyData* cfd, bool force, + std::unique_ptr* token, + LogBuffer* log_buffer); + + // Schedule background tasks + Status StartPeriodicTaskScheduler(); + + Status RegisterRecordSeqnoTimeWorker(); + + void PrintStatistics(); + + size_t EstimateInMemoryStatsHistorySize() const; + + // Return the minimum empty level that could hold the total data in the + // input level. Return the input level, if such level could not be found. + int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, + const MutableCFOptions& mutable_cf_options, + int level); + + // Move the files in the input level to the target level. + // If target_level < 0, automatically calculate the minimum level that could + // hold the data set. + Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1); + + // helper functions for adding and removing from flush & compaction queues + void AddToCompactionQueue(ColumnFamilyData* cfd); + ColumnFamilyData* PopFirstFromCompactionQueue(); + FlushRequest PopFirstFromFlushQueue(); + + // Pick the first unthrottled compaction with task token from queue. + ColumnFamilyData* PickCompactionFromQueue( + std::unique_ptr* token, LogBuffer* log_buffer); + + // helper function to call after some of the logs_ were synced + void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit); + Status ApplyWALToManifest(const ReadOptions& read_options, VersionEdit* edit); + // WALs with log number up to up_to are not synced successfully. + void MarkLogsNotSynced(uint64_t up_to); + + SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary, + bool lock = true); + + // If snapshot_seq != kMaxSequenceNumber, then this function can only be + // called from the write thread that publishes sequence numbers to readers. + // For 1) write-committed, or 2) write-prepared + one-write-queue, this will + // be the write thread performing memtable writes. 
For write-prepared with + // two write queues, this will be the write thread writing commit marker to + // the WAL. + // If snapshot_seq == kMaxSequenceNumber, this function is called by a caller + // ensuring no writes to the database. + std::pair> + CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts, + bool lock = true); + + uint64_t GetMaxTotalWalSize() const; + + FSDirectory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const; + + Status MaybeReleaseTimestampedSnapshotsAndCheck(); + + Status CloseHelper(); + + void WaitForBackgroundWork(); + + // Background threads call this function, which is just a wrapper around + // the InstallSuperVersion() function. Background threads carry + // sv_context which can have new_superversion already + // allocated. + // All ColumnFamily state changes go through this function. Here we analyze + // the new state and we schedule background work if we detect that the new + // state needs flush or compaction. + void InstallSuperVersionAndScheduleWork( + ColumnFamilyData* cfd, SuperVersionContext* sv_context, + const MutableCFOptions& mutable_cf_options); + + bool GetIntPropertyInternal(ColumnFamilyData* cfd, + const DBPropertyInfo& property_info, + bool is_locked, uint64_t* value); + bool GetPropertyHandleOptionsStatistics(std::string* value); + + bool HasPendingManualCompaction(); + bool HasExclusiveManualCompaction(); + void AddManualCompaction(ManualCompactionState* m); + void RemoveManualCompaction(ManualCompactionState* m); + bool ShouldntRunManualCompaction(ManualCompactionState* m); + bool HaveManualCompaction(ColumnFamilyData* cfd); + bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); + void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c, + const Status& st, + const CompactionJobStats& compaction_job_stats, + const int job_id, const Version* current, + CompactionJobInfo* compaction_job_info) const; + // Reserve the next 'num' file numbers for to-be-ingested external SST files, + // and return the current file_number in 'next_file_number'. + // Write a version edit to the MANIFEST. 
+ Status ReserveFileNumbersBeforeIngestion( + ColumnFamilyData* cfd, uint64_t num, + std::unique_ptr::iterator>& pending_output_elem, + uint64_t* next_file_number); + + bool ShouldPurge(uint64_t file_number) const; + void MarkAsGrabbedForPurge(uint64_t file_number); + + size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } + + IOStatus CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, + size_t preallocate_block_size, log::Writer** new_log); + + // Validate self-consistency of DB options + static Status ValidateOptions(const DBOptions& db_options); + // Validate self-consistency of DB options and its consistency with cf options + static Status ValidateOptions( + const DBOptions& db_options, + const std::vector& column_families); + + // Utility function to do some debug validation and sort the given vector + // of MultiGet keys + void PrepareMultiGetKeys( + const size_t num_keys, bool sorted, + autovector* key_ptrs); + + void MultiGetCommon(const ReadOptions& options, + ColumnFamilyHandle* column_family, const size_t num_keys, + const Slice* keys, PinnableSlice* values, + PinnableWideColumns* columns, std::string* timestamps, + Status* statuses, bool sorted_input); + + void MultiGetCommon(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, PinnableWideColumns* columns, + std::string* timestamps, Status* statuses, + bool sorted_input); + + // A structure to hold the information required to process MultiGet of keys + // belonging to one column family. For a multi column family MultiGet, there + // will be a container of these objects. + struct MultiGetColumnFamilyData { + ColumnFamilyHandle* cf; + ColumnFamilyData* cfd; + + // For the batched MultiGet which relies on sorted keys, start specifies + // the index of first key belonging to this column family in the sorted + // list. + size_t start; + + // For the batched MultiGet case, num_keys specifies the number of keys + // belonging to this column family in the sorted list + size_t num_keys; + + // SuperVersion for the column family obtained in a manner that ensures a + // consistent view across all column families in the DB + SuperVersion* super_version; + MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, + SuperVersion* sv) + : cf(column_family), + cfd(static_cast(cf)->cfd()), + start(0), + num_keys(0), + super_version(sv) {} + + MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first, + size_t count, SuperVersion* sv) + : cf(column_family), + cfd(static_cast(cf)->cfd()), + start(first), + num_keys(count), + super_version(sv) {} + + MultiGetColumnFamilyData() = default; + }; + + // A common function to obtain a consistent snapshot, which can be implicit + // if the user doesn't specify a snapshot in read_options, across + // multiple column families for MultiGet. It will attempt to get an implicit + // snapshot without acquiring the db_mutes, but will give up after a few + // tries and acquire the mutex if a memtable flush happens. The template + // allows both the batched and non-batched MultiGet to call this with + // either an std::unordered_map or autovector of column families. 
+ // + // If callback is non-null, the callback is refreshed with the snapshot + // sequence number + // + // A return value of true indicates that the SuperVersions were obtained + // from the ColumnFamilyData, whereas false indicates they are thread + // local + template + bool MultiCFSnapshot( + const ReadOptions& read_options, ReadCallback* callback, + std::function& + iter_deref_func, + T* cf_list, SequenceNumber* snapshot); + + // The actual implementation of the batching MultiGet. The caller is expected + // to have acquired the SuperVersion and pass in a snapshot sequence number + // in order to construct the LookupKeys. The start_key and num_keys specify + // the range of keys in the sorted_keys vector for a single column family. + Status MultiGetImpl( + const ReadOptions& read_options, size_t start_key, size_t num_keys, + autovector* sorted_keys, + SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback); + + Status DisableFileDeletionsWithLock(); + + Status IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, + std::string ts_low); + + bool ShouldReferenceSuperVersion(const MergeContext& merge_context); + + // Lock over the persistent DB state. Non-nullptr iff successfully acquired. + FileLock* db_lock_; + + // In addition to mutex_, log_write_mutex_ protected writes to stats_history_ + InstrumentedMutex stats_history_mutex_; + // In addition to mutex_, log_write_mutex_ protected writes to logs_ and + // logfile_number_. With two_write_queues it also protects alive_log_files_, + // and log_empty_. Refer to the definition of each variable below for more + // details. + // Note: to avoid deadlock, if needed to acquire both log_write_mutex_ and + // mutex_, the order should be first mutex_ and then log_write_mutex_. + InstrumentedMutex log_write_mutex_; + + // If zero, manual compactions are allowed to proceed. If non-zero, manual + // compactions may still be running, but will quickly fail with + // `Status::Incomplete`. The value indicates how many threads have paused + // manual compactions. It is accessed in read mode outside the DB mutex in + // compaction code paths. + std::atomic manual_compaction_paused_; + + // This condition variable is signaled on these conditions: + // * whenever bg_compaction_scheduled_ goes down to 0 + // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't + // made any progress + // * whenever a compaction made any progress + // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases + // (i.e. whenever a flush is done, even if it didn't make any progress) + // * whenever there is an error in background purge, flush or compaction + // * whenever num_running_ingest_file_ goes to 0. + // * whenever pending_purge_obsolete_files_ goes to 0. + // * whenever disable_delete_obsolete_files_ goes to 0. + // * whenever SetOptions successfully updates options. + // * whenever a column family is dropped. + InstrumentedCondVar bg_cv_; + // Writes are protected by locking both mutex_ and log_write_mutex_, and reads + // must be under either mutex_ or log_write_mutex_. Since after ::Open, + // logfile_number_ is currently updated only in write_thread_, it can be read + // from the same write_thread_ without any locks. + uint64_t logfile_number_; + // Log files that we can recycle. Must be protected by db mutex_. + std::deque log_recycle_files_; + // Protected by log_write_mutex_. + bool log_dir_synced_; + // Without two_write_queues, read and writes to log_empty_ are protected by + // mutex_. 
Since it is currently updated/read only in write_thread_, it can be + // accessed from the same write_thread_ without any locks. With + // two_write_queues writes, where it can be updated in different threads, + // read and writes are protected by log_write_mutex_ instead. This is to avoid + // expensive mutex_ lock during WAL write, which update log_empty_. + bool log_empty_; + + ColumnFamilyHandleImpl* persist_stats_cf_handle_; + + bool persistent_stats_cfd_exists_ = true; + + // alive_log_files_ is protected by mutex_ and log_write_mutex_ with details + // as follows: + // 1. read by FindObsoleteFiles() which can be called in either application + // thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are + // held. + // 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_ + // are held. + // 3. push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles() + // (actually called by Open()), only mutex_ is held because at this point, + // the DB::Open() call has not returned success to application, and the + // only other thread(s) that can conflict are bg threads calling + // FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_ + // are held when accessing alive_log_files_. + // 4. read by DBImpl::Open() is protected by mutex_. + // 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are + // held. This is done by the write group leader. Note that in the case of + // two-write-queues, another WAL-only write thread can be writing to the + // WAL concurrently. See 9. + // 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is + // done by write group leader. + // 7. read by ConcurrentWriteToWAL() by the write group leader in the case of + // two-write-queues. Only log_write_mutex_ is held to protect concurrent + // pop_front() by FindObsoleteFiles(). + // 8. read by PreprocessWrite() by the write group leader. log_write_mutex_ + // is held to protect the data structure from concurrent pop_front() by + // FindObsoleteFiles(). + // 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case + // of two-write-queues. Only log_write_mutex_ is held. This suffices to + // protect the data structure from concurrent push_back() by current + // write group leader as well as pop_front() by FindObsoleteFiles(). + std::deque alive_log_files_; + + // Log files that aren't fully synced, and the current log file. + // Synchronization: + // 1. read by FindObsoleteFiles() which can be called either in application + // thread or RocksDB bg threads. log_write_mutex_ is always held, while + // some reads are performed without mutex_. + // 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held. + // 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_. + // 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex. + // Note that at this point, DB::Open() has not returned success to + // application, thus the only other thread(s) that can conflict are bg + // threads calling FindObsoleteFiles(). See 1. + // 5. iteration and clear() from CloseHelper() always hold log_write_mutex + // and mutex_. + // 6. back() called by APIs FlushWAL() and LockWAL() are protected by only + // log_write_mutex_. These two can be called by application threads after + // DB::Open() returns success to applications. + // 7. read by SyncWAL(), another API, protected by only log_write_mutex_. + // 8. read by MarkLogsNotSynced() and MarkLogsSynced() are protected by + // log_write_mutex_. + // 9. 
erase() by MarkLogsSynced() protected by log_write_mutex_. + // 10. read by SyncClosedLogs() protected by only log_write_mutex_. This can + // happen in bg flush threads after DB::Open() returns success to + // applications. + // 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite() + // holds only the log_write_mutex_. This is done by the write group + // leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced() + // can happen concurrently. This is fine because log_write_mutex_ is used + // by all parties. See 2, 5, 9. + // 12. reads, empty(), back() called by SwitchMemtable() hold both mutex_ and + // log_write_mutex_. This happens in the write group leader. + // 13. emplace_back() by SwitchMemtable() hold both mutex_ and + // log_write_mutex_. This happens in the write group leader. Can conflict + // with bg threads calling FindObsoleteFiles(), MarkLogsSynced(), + // SyncClosedLogs(), etc. as well as application threads calling + // FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties + // require at least log_write_mutex_. + // 14. iteration called in WriteToWAL(write_group) protected by + // log_write_mutex_. This is done by write group leader when + // two-write-queues is disabled and write needs to sync logs. + // 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_. + // This can be done by the write group leader if two-write-queues is + // enabled. It can also be done by another WAL-only write thread. + // + // Other observations: + // - back() and items with getting_synced=true are not popped, + // - The same thread that sets getting_synced=true will reset it. + // - it follows that the object referred by back() can be safely read from + // the write_thread_ without using mutex. Note that calling back() without + // mutex may be unsafe because different implementations of deque::back() may + // access other member variables of deque, causing undefined behaviors. + // Generally, do not access stl containers without proper synchronization. + // - it follows that the items with getting_synced=true can be safely read + // from the same thread that has set getting_synced=true + std::deque logs_; + + // Signaled when getting_synced becomes false for some of the logs_. + InstrumentedCondVar log_sync_cv_; + // This is the app-level state that is written to the WAL but will be used + // only during recovery. Using this feature enables not writing the state to + // memtable on normal writes and hence improving the throughput. Each new + // write of the state will replace the previous state entirely even if the + // keys in the two consecutive states do not overlap. + // It is protected by log_write_mutex_ when two_write_queues_ is enabled. + // Otherwise only the heaad of write_thread_ can access it. + WriteBatch cached_recoverable_state_; + std::atomic cached_recoverable_state_empty_ = {true}; + std::atomic total_log_size_; + + // If this is non-empty, we need to delete these log files in background + // threads. Protected by log_write_mutex_. + autovector logs_to_free_; + + bool is_snapshot_supported_; + + std::map> stats_history_; + + std::map stats_slice_; + + bool stats_slice_initialized_ = false; + + Directories directories_; + + WriteBufferManager* write_buffer_manager_; + + WriteThread write_thread_; + WriteBatch tmp_batch_; + // The write thread when the writers have no memtable write. This will be used + // in 2PC to batch the prepares separately from the serial commit. 
+ WriteThread nonmem_write_thread_; + + WriteController write_controller_; + + // Size of the last batch group. In slowdown mode, next write needs to + // sleep if it uses up the quota. + // Note: This is to protect memtable and compaction. If the batch only writes + // to the WAL its size need not to be included in this. + uint64_t last_batch_group_size_; + + FlushScheduler flush_scheduler_; + + TrimHistoryScheduler trim_history_scheduler_; + + SnapshotList snapshots_; + + TimestampedSnapshotList timestamped_snapshots_; + + // For each background job, pending_outputs_ keeps the current file number at + // the time that background job started. + // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has + // number bigger than any of the file number in pending_outputs_. Since file + // numbers grow monotonically, this also means that pending_outputs_ is always + // sorted. After a background job is done executing, its file number is + // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean + // it up. + // State is protected with db mutex. + std::list pending_outputs_; + + // flush_queue_ and compaction_queue_ hold column families that we need to + // flush and compact, respectively. + // A column family is inserted into flush_queue_ when it satisfies condition + // cfd->imm()->IsFlushPending() + // A column family is inserted into compaction_queue_ when it satisfied + // condition cfd->NeedsCompaction() + // Column families in this list are all Ref()-erenced + // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will + // do RAII on ColumnFamilyData + // Column families are in this queue when they need to be flushed or + // compacted. Consumers of these queues are flush and compaction threads. When + // column family is put on this queue, we increase unscheduled_flushes_ and + // unscheduled_compactions_. When these variables are bigger than zero, that + // means we need to schedule background threads for flush and compaction. + // Once the background threads are scheduled, we decrease unscheduled_flushes_ + // and unscheduled_compactions_. That way we keep track of number of + // compaction and flush threads we need to schedule. This scheduling is done + // in MaybeScheduleFlushOrCompaction() + // invariant(column family present in flush_queue_ <==> + // ColumnFamilyData::pending_flush_ == true) + std::deque flush_queue_; + // invariant(column family present in compaction_queue_ <==> + // ColumnFamilyData::pending_compaction_ == true) + std::deque compaction_queue_; + + // A map to store file numbers and filenames of the files to be purged + std::unordered_map purge_files_; + + // A vector to store the file numbers that have been assigned to certain + // JobContext. Current implementation tracks table and blob files only. + std::unordered_set files_grabbed_for_purge_; + + // A queue to store log writers to close. Protected by db mutex_. 
+ std::deque<log::Writer*> logs_to_free_queue_;
+
+ std::deque<SuperVersion*> superversions_to_free_queue_;
+
+ int unscheduled_flushes_;
+
+ int unscheduled_compactions_;
+
+ // count how many background compactions are running or have been scheduled in
+ // the BOTTOM pool
+ int bg_bottom_compaction_scheduled_;
+
+ // count how many background compactions are running or have been scheduled
+ int bg_compaction_scheduled_;
+
+ // stores the number of compactions that are currently running
+ int num_running_compactions_;
+
+ // number of background memtable flush jobs, submitted to the HIGH pool
+ int bg_flush_scheduled_;
+
+ // stores the number of flushes that are currently running
+ int num_running_flushes_;
+
+ // number of background obsolete file purge jobs, submitted to the HIGH pool
+ int bg_purge_scheduled_;
+
+ std::deque<ManualCompactionState*> manual_compaction_dequeue_;
+
+ // Controls deletion of obsolete files:
+ // if 0, deletion is enabled; if non-zero, files will not be deleted.
+ // This enables two different threads to call
+ // EnableFileDeletions() and DisableFileDeletions()
+ // without any synchronization
+ int disable_delete_obsolete_files_;
+
+ // Number of times FindObsoleteFiles has found deletable files and the
+ // corresponding call to PurgeObsoleteFiles has not yet finished.
+ int pending_purge_obsolete_files_;
+
+ // last time when DeleteObsoleteFiles with full scan was executed. Originally
+ // initialized with startup time.
+ uint64_t delete_obsolete_files_last_run_;
+
+ // last time stats were dumped to LOG
+ std::atomic<uint64_t> last_stats_dump_time_microsec_;
+
+ // The thread that wants to switch memtable can wait on this cv until the
+ // pending writes to memtable finish.
+ std::condition_variable switch_cv_;
+ // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
+ std::mutex switch_mutex_;
+ // Number of threads intending to write to memtable
+ std::atomic<size_t> pending_memtable_writes_ = {};
+
+ // A flag indicating whether the current rocksdb database has any
+ // data that is not yet persisted into either WAL or SST file.
+ // Used when disableWAL is true.
+ std::atomic<bool> has_unpersisted_data_;
+
+ // if an attempt was made to flush all column families that
+ // the oldest log depends on but uncommitted data in the oldest
+ // log prevents the log from being released.
+ // We must attempt to free the dependent memtables again
+ // at a later time after the transaction in the oldest
+ // log is fully committed.
+ bool unable_to_release_oldest_log_;
+
+ // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
+ // calls.
+ // REQUIRES: mutex held
+ int num_running_ingest_file_;
+
+ WalManager wal_manager_;
+
+ // A value of > 0 temporarily disables scheduling of background work
+ int bg_work_paused_;
+
+ // A value of > 0 temporarily disables scheduling of background compaction
+ int bg_compaction_paused_;
+
+ // Guard against multiple concurrent refitting
+ bool refitting_level_;
+
+ // Indicate DB was opened successfully
+ bool opened_successfully_;
+
+ // The min threshold to trigger bottommost compaction for removing
+ // garbage, among all column families.
+ SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+ LogsWithPrepTracker logs_with_prep_tracker_;
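+
+ // Illustrative note (not upstream code): disable_delete_obsolete_files_
+ // above is a counter, not a flag, so disable/enable calls nest. A
+ // hypothetical caller-side view:
+ //
+ //   db->DisableFileDeletions();                // counter 0 -> 1
+ //   db->DisableFileDeletions();                // counter 1 -> 2
+ //   db->EnableFileDeletions(/*force=*/false);  // counter 2 -> 1, still off
+ //   db->EnableFileDeletions(/*force=*/false);  // counter 1 -> 0, resumes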
+ // Callback for compaction to check if a key is visible to a snapshot.
+ // REQUIRES: mutex held
+ std::unique_ptr<SnapshotChecker> snapshot_checker_;
+
+ // Callback for when the cached_recoverable_state_ is written to memtable
+ // Only to be set during initialization
+ std::unique_ptr<PreReleaseCallback> recoverable_state_pre_release_callback_;
+
+ // Scheduler to run DumpStats(), PersistStats(), and FlushInfoLog().
+ // Currently, internally it has a global timer instance for running the tasks.
+ PeriodicTaskScheduler periodic_task_scheduler_;
+
+ // It contains the implementations for each periodic task.
+ std::map<PeriodicTaskType, const PeriodicTaskFunc> periodic_task_functions_;
+
+ // When set, we use a separate queue for writes that don't write to memtable.
+ // In 2PC these are the writes at Prepare phase.
+ const bool two_write_queues_;
+ const bool manual_wal_flush_;
+
+ // When set, LastSequence also indicates the last published sequence visible
+ // to the readers. Otherwise LastPublishedSequence should be used.
+ const bool last_seq_same_as_publish_seq_;
+ // It indicates that a customized gc algorithm must be used for
+ // flush/compaction, and if it is not provided via SnapshotChecker, we should
+ // disable gc to be safe.
+ const bool use_custom_gc_;
+ // Flag to indicate that the DB instance shutdown has been initiated. This
+ // is different from the shutting_down_ atomic in that it is set at the
+ // beginning of the shutdown sequence, specifically in order to prevent any
+ // background error recovery from going on in parallel. The latter,
+ // shutting_down_, is set a little later during the shutdown after scheduling
+ // memtable flushes
+ std::atomic<bool> shutdown_initiated_;
+ // Flag to indicate whether sst_file_manager object was allocated in
+ // DB::Open() or passed to us
+ bool own_sfm_;
+
+ // Flag to check whether Close() has been called on this DB
+ bool closed_;
+ // save the closing status, for re-calling the close()
+ Status closing_status_;
+ // mutex for DB::Close()
+ InstrumentedMutex closing_mutex_;
+
+ // Condition variable to coordinate installation of atomic flush results.
+ // With atomic flush, each bg thread installs the result of flushing multiple
+ // column families, and different threads can flush different column
+ // families. It's difficult to rely on one thread to perform batch
+ // installation for all threads. This is different from the non-atomic flush
+ // case.
+ // atomic_flush_install_cv_ makes sure that threads install atomic flush
+ // results sequentially. Flush results of memtables with lower IDs get
+ // installed to MANIFEST first.
+ InstrumentedCondVar atomic_flush_install_cv_;
+
+ bool wal_in_db_path_;
+ std::atomic<uint64_t> max_total_wal_size_;
+
+ BlobFileCompletionCallback blob_callback_;
+
+ // Pointer to WriteBufferManager stalling interface.
+ std::unique_ptr<StallInterface> wbm_stall_;
+
+ // seqno_time_mapping_ stores the sequence number to time mapping; it's not
+ // thread safe, and both reads and writes need the db mutex held.
+ SeqnoToTimeMapping seqno_time_mapping_;
+
+ // Stop write token that is acquired when first LockWAL() is called.
+ // Destroyed when last UnlockWAL() is called. Controlled by DB mutex.
+ // See lock_wal_count_
+ std::unique_ptr<WriteControllerToken> lock_wal_write_token_;
+
+ // The number of LockWAL called without matching UnlockWAL call.
+  // See also lock_wal_write_token_
+  uint32_t lock_wal_count_;
+};
+
+class GetWithTimestampReadCallback : public ReadCallback {
+ public:
+  explicit GetWithTimestampReadCallback(SequenceNumber seq)
+      : ReadCallback(seq) {}
+  bool IsVisibleFullCheck(SequenceNumber seq) override {
+    return seq <= max_visible_seq_;
+  }
+};
+
+extern Options SanitizeOptions(const std::string& db, const Options& src,
+                               bool read_only = false,
+                               Status* logger_creation_s = nullptr);
+
+extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src,
+                                 bool read_only = false,
+                                 Status* logger_creation_s = nullptr);
+
+extern CompressionType GetCompressionFlush(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options);
+
+// Return the earliest log file to keep after the memtable flush is
+// finalized.
+// `cfd_to_flush` is the column family whose memtable (specified in
+// `memtables_to_flush`) will be flushed and thus will not depend on any WAL
+// file.
+// The function is only applicable to 2pc mode.
+extern uint64_t PrecomputeMinLogNumberToKeep2PC(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    const autovector<VersionEdit*>& edit_list,
+    const autovector<MemTable*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker);
+// For atomic flush.
+extern uint64_t PrecomputeMinLogNumberToKeep2PC(
+    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    const autovector<autovector<VersionEdit*>>& edit_lists,
+    const autovector<const autovector<MemTable*>*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker);
+
+// In non-2PC mode, WALs with log number < the returned number can be
+// deleted after the cfd_to_flush column family is flushed successfully.
+extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    const autovector<VersionEdit*>& edit_list);
+// For atomic flush.
+extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    const autovector<autovector<VersionEdit*>>& edit_lists);
+
+// `cfd_to_flush` is the column family whose memtable will be flushed and thus
+// will not depend on any WAL file. nullptr means no memtable is being flushed.
+// The function is only applicable to 2pc mode.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset, const autovector<MemTable*>& memtables_to_flush);
+// For atomic flush.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset,
+    const autovector<const autovector<MemTable*>*>& memtables_to_flush);
+
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
+
+inline Status DBImpl::FailIfCfHasTs(
+    const ColumnFamilyHandle* column_family) const {
+  column_family = column_family ? column_family : DefaultColumnFamily();
+  assert(column_family);
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  if (ucmp->timestamp_size() > 0) {
+    std::ostringstream oss;
+    oss << "cannot call this method on column family "
+        << column_family->GetName() << " that enables timestamp";
+    return Status::InvalidArgument(oss.str());
+  }
+  return Status::OK();
+}
+
+inline Status DBImpl::FailIfTsMismatchCf(ColumnFamilyHandle* column_family,
+                                         const Slice& ts,
+                                         bool ts_for_read) const {
+  if (!column_family) {
+    return Status::InvalidArgument("column family handle cannot be null");
+  }
+  assert(column_family);
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  if (0 == ucmp->timestamp_size()) {
+    std::stringstream oss;
+    oss << "cannot call this method on column family "
+        << column_family->GetName() << " that does not enable timestamp";
+    return Status::InvalidArgument(oss.str());
+  }
+  const size_t ts_sz = ts.size();
+  if (ts_sz != ucmp->timestamp_size()) {
+    std::stringstream oss;
+    oss << "Timestamp sizes mismatch: expect " << ucmp->timestamp_size() << ", "
+        << ts_sz << " given";
+    return Status::InvalidArgument(oss.str());
+  }
+  if (ts_for_read) {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    auto cfd = cfh->cfd();
+    std::string current_ts_low = cfd->GetFullHistoryTsLow();
+    if (!current_ts_low.empty() &&
+        ucmp->CompareTimestamp(ts, current_ts_low) < 0) {
+      std::stringstream oss;
+      oss << "Read timestamp: " << ts.ToString(true)
+          << " is smaller than full_history_ts_low: "
+          << Slice(current_ts_low).ToString(true) << std::endl;
+      return Status::InvalidArgument(oss.str());
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_compaction_flush.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_compaction_flush.cc
new file mode 100644
index 0000000000..9e084ab622
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_compaction_flush.cc
@@ -0,0 +1,3980 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+#include <deque>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/concurrent_task_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool DBImpl::EnoughRoomForCompaction(
+    ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+    bool* sfm_reserved_compact_space, LogBuffer* log_buffer) {
+  // Check if we have enough room to do the compaction
+  bool enough_room = true;
+  auto sfm = static_cast<SstFileManagerImpl*>(
+      immutable_db_options_.sst_file_manager.get());
+  if (sfm) {
+    // Pass the current bg_error_ to SFM so it can decide what checks to
+    // perform. If this DB instance hasn't seen any error yet, the SFM can be
+    // optimistic and not do disk space checks
+    Status bg_error = error_handler_.GetBGError();
+    enough_room = sfm->EnoughRoomForCompaction(cfd, inputs, bg_error);
+    bg_error.PermitUncheckedError();  // bg_error is just a copy of the Status
+                                      // from the error_handler_
+    if (enough_room) {
+      *sfm_reserved_compact_space = true;
+    }
+  }
+  if (!enough_room) {
+    // Just in case tests want to change the value of enough_room
+    TEST_SYNC_POINT_CALLBACK(
+        "DBImpl::BackgroundCompaction():CancelledCompaction", &enough_room);
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "Cancelled compaction because not enough room");
+    RecordTick(stats_, COMPACTION_CANCELLED, 1);
+  }
+  return enough_room;
+}
+
+bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+                                    std::unique_ptr<TaskLimiterToken>* token,
+                                    LogBuffer* log_buffer) {
+  assert(*token == nullptr);
+  auto limiter = static_cast<ConcurrentTaskLimiterImpl*>(
+      cfd->ioptions()->compaction_thread_limiter.get());
+  if (limiter == nullptr) {
+    return true;
+  }
+  *token = limiter->GetToken(force);
+  if (*token != nullptr) {
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "Thread limiter [%s] increase [%s] compaction task, "
+                     "force: %s, tasks after: %d",
+                     limiter->GetName().c_str(), cfd->GetName().c_str(),
+                     force ? "true" : "false", limiter->GetOutstandingTask());
+    return true;
+  }
+  return false;
+}
+
+IOStatus DBImpl::SyncClosedLogs(JobContext* job_context,
+                                VersionEdit* synced_wals) {
+  TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
+  InstrumentedMutexLock l(&log_write_mutex_);
+  autovector<log::Writer*> logs_to_sync;
+  uint64_t current_log_number = logfile_number_;
+  while (logs_.front().number < current_log_number &&
+         logs_.front().IsSyncing()) {
+    log_sync_cv_.Wait();
+  }
+  for (auto it = logs_.begin();
+       it != logs_.end() && it->number < current_log_number; ++it) {
+    auto& log = *it;
+    log.PrepareForSync();
+    logs_to_sync.push_back(log.writer);
+  }
+
+  IOStatus io_s;
+  if (!logs_to_sync.empty()) {
+    log_write_mutex_.Unlock();
+
+    assert(job_context);
+
+    for (log::Writer* log : logs_to_sync) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "[JOB %d] Syncing log #%" PRIu64, job_context->job_id,
+                     log->get_log_number());
+      if (error_handler_.IsRecoveryInProgress()) {
+        log->file()->reset_seen_error();
+      }
+      io_s = log->file()->Sync(immutable_db_options_.use_fsync);
+      if (!io_s.ok()) {
+        break;
+      }
+
+      if (immutable_db_options_.recycle_log_file_num > 0) {
+        if (error_handler_.IsRecoveryInProgress()) {
+          log->file()->reset_seen_error();
+        }
+        io_s = log->Close();
+        if (!io_s.ok()) {
+          break;
+        }
+      }
+    }
+    if (io_s.ok()) {
+      io_s = directories_.GetWalDir()->FsyncWithDirOptions(
+          IOOptions(), nullptr,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+    }
+
+    TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock",
+                             /*arg=*/nullptr);
+    log_write_mutex_.Lock();
+
+    // "number <= current_log_number - 1" is equivalent to
+    // "number < current_log_number".
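+    // Worked example of this cutoff (hypothetical log numbers, added here
+    // purely for illustration): if logfile_number_ == 7 and logs_ holds
+    // writers for logs 5, 6 and 7, only 5 and 6 are closed and synced above,
+    // so the call below marks exactly logs 5 and 6 as synced:
+    //
+    //   MarkLogsSynced(7 - 1 /* current_log_number - 1 */, true, synced_wals);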
+    if (io_s.ok()) {
+      MarkLogsSynced(current_log_number - 1, true, synced_wals);
+    } else {
+      MarkLogsNotSynced(current_log_number - 1);
+    }
+    if (!io_s.ok()) {
+      TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed");
+      return io_s;
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::SyncClosedLogs:end");
+  return io_s;
+}
+
+Status DBImpl::FlushMemTableToOutputFile(
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    bool* made_progress, JobContext* job_context, FlushReason flush_reason,
+    SuperVersionContext* superversion_context,
+    std::vector<SequenceNumber>& snapshot_seqs,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+    Env::Priority thread_pri) {
+  mutex_.AssertHeld();
+  assert(cfd);
+  assert(cfd->imm());
+  assert(cfd->imm()->NumNotFlushed() != 0);
+  assert(cfd->imm()->IsFlushPending());
+  assert(versions_);
+  assert(versions_->GetColumnFamilySet());
+  // If there is more than one column family, we need to make sure that
+  // all the log files except the most recent one are synced. Otherwise if
+  // the host crashes after flushing and before WAL is persistent, the
+  // flushed SST may contain data from write batches whose updates to
+  // other (unflushed) column families are missing.
+  const bool needs_to_sync_closed_wals =
+      logfile_number_ > 0 &&
+      versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1;
+
+  // If needs_to_sync_closed_wals is true, we need to record the current
+  // maximum memtable ID of this column family so that a later PickMemtables()
+  // call will not pick memtables whose IDs are higher. This is due to the fact
+  // that SyncClosedLogs() may release the db mutex, and memtable switch can
+  // happen for this column family in the meantime. The newly created memtables
+  // have their data backed by unsynced WALs, thus they cannot be included in
+  // this flush job.
+  // Another reason why we must record the current maximum memtable ID of this
+  // column family: SyncClosedLogs() may release the db mutex, thus it's
+  // possible for the application to continue to insert into memtables,
+  // increasing the db's sequence number. The application may take a snapshot,
+  // but this snapshot is not included in `snapshot_seqs` which will be passed
+  // to the flush job because `snapshot_seqs` has already been computed before
+  // this function starts. Recording the max memtable ID ensures that the flush
+  // job does not flush a memtable without knowing such snapshot(s).
+  uint64_t max_memtable_id = needs_to_sync_closed_wals
+                                 ? cfd->imm()->GetLatestMemTableID()
+                                 : std::numeric_limits<uint64_t>::max();
+
+  // If needs_to_sync_closed_wals is false, then the flush job will pick ALL
+  // existing memtables of the column family when PickMemTable() is called
+  // later. Although we won't call SyncClosedLogs() in this case, we may still
+  // call the callbacks of the listeners, i.e. NotifyOnFlushBegin() which also
+  // releases and re-acquires the db mutex. In the meantime, the application
+  // can still insert into the memtables and increase the db's sequence number.
+  // The application can take a snapshot, hoping that the latest visible state
+  // to this snapshot is preserved. This is hard to guarantee since the db
+  // mutex is not held. This newly-created snapshot is not included in
+  // `snapshot_seqs` and the flush job is unaware of its presence. Consequently,
+  // the flush job may drop certain keys when generating the L0, causing
+  // incorrect data to be returned for snapshot reads using this snapshot.
+  // To address this, we make sure NotifyOnFlushBegin() executes after memtable
+  // picking so that no new snapshot can be taken between the two functions.
+
+  FlushJob flush_job(
+      dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id,
+      file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_,
+      snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
+      job_context, flush_reason, log_buffer, directories_.GetDbDir(),
+      GetDataDir(cfd, 0U),
+      GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
+      &event_logger_, mutable_cf_options.report_bg_io_stats,
+      true /* sync_output_directory */, true /* write_manifest */, thread_pri,
+      io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
+      cfd->GetFullHistoryTsLow(), &blob_callback_);
+  FileMetaData file_meta;
+
+  Status s;
+  bool need_cancel = false;
+  IOStatus log_io_s = IOStatus::OK();
+  if (needs_to_sync_closed_wals) {
+    // SyncClosedLogs() may unlock and re-lock the log_write_mutex multiple
+    // times.
+    VersionEdit synced_wals;
+    mutex_.Unlock();
+    log_io_s = SyncClosedLogs(job_context, &synced_wals);
+    mutex_.Lock();
+    if (log_io_s.ok() && synced_wals.IsWalAddition()) {
+      const ReadOptions read_options(Env::IOActivity::kFlush);
+      log_io_s =
+          status_to_io_status(ApplyWALToManifest(read_options, &synced_wals));
+      TEST_SYNC_POINT_CALLBACK("DBImpl::FlushMemTableToOutputFile:CommitWal:1",
+                               nullptr);
+    }
+
+    if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
+        !log_io_s.IsColumnFamilyDropped()) {
+      error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
+    }
+  } else {
+    TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip");
+  }
+  s = log_io_s;
+
+  // If the log sync failed, we do not need to pick memtable. Otherwise,
+  // num_flush_not_started_ needs to be rolled back.
+  TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables");
+  if (s.ok()) {
+    flush_job.PickMemTable();
+    need_cancel = true;
+  }
+  TEST_SYNC_POINT_CALLBACK(
+      "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", &flush_job);
+
+  // may temporarily unlock and lock the mutex.
+  NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id,
+                     flush_reason);
+
+  bool switched_to_mempurge = false;
+  // Within flush_job.Run, rocksdb may call event listener to notify
+  // file creation and deletion.
+  //
+  // Note that flush_job.Run will unlock and lock the db_mutex,
+  // and EventListener callback will be called when the db_mutex
+  // is unlocked by the current thread.
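+  // A minimal outline of the unlock-around-callbacks pattern the comments
+  // above describe (an illustration, not the literal call sequence):
+  //
+  //   flush_job.PickMemTable();            // capture memtables under mutex_
+  //   mutex_.Unlock();
+  //   listener->OnFlushBegin(this, info);  // user code runs without the lock
+  //   mutex_.Lock();                       // re-acquire before using DB state
+  //
+  // Anything the flush depends on (max memtable ID, snapshot list) must be
+  // captured before the first unlock.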
+  if (s.ok()) {
+    s = flush_job.Run(&logs_with_prep_tracker_, &file_meta,
+                      &switched_to_mempurge);
+    need_cancel = false;
+  }
+
+  if (!s.ok() && need_cancel) {
+    flush_job.Cancel();
+  }
+
+  if (s.ok()) {
+    InstallSuperVersionAndScheduleWork(cfd, superversion_context,
+                                       mutable_cf_options);
+    if (made_progress) {
+      *made_progress = true;
+    }
+
+    const std::string& column_family_name = cfd->GetName();
+
+    Version* const current = cfd->current();
+    assert(current);
+
+    const VersionStorageInfo* const storage_info = current->storage_info();
+    assert(storage_info);
+
+    VersionStorageInfo::LevelSummaryStorage tmp;
+    ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+                     column_family_name.c_str(),
+                     storage_info->LevelSummary(&tmp));
+
+    const auto& blob_files = storage_info->GetBlobFiles();
+    if (!blob_files.empty()) {
+      assert(blob_files.front());
+      assert(blob_files.back());
+
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+          column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+          blob_files.back()->GetBlobFileNumber());
+    }
+  }
+
+  if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+    if (log_io_s.ok()) {
+      // Error while writing to MANIFEST.
+      // In fact, versions_->io_status() can also be the result of renaming
+      // CURRENT file. With current code, it's just difficult to tell. So just
+      // be pessimistic and try to write to a new MANIFEST.
+      // TODO: distinguish between MANIFEST write and CURRENT renaming
+      if (!versions_->io_status().ok()) {
+        // If WAL sync is successful (either WAL size is 0 or there is no IO
+        // error), all the Manifest writes will be mapped to soft errors.
+        // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor is
+        // needed.
+        error_handler_.SetBGError(s,
+                                  BackgroundErrorReason::kManifestWriteNoWAL);
+      } else {
+        // If WAL sync is successful (either WAL size is 0 or there is no IO
+        // error), all the other SST file write errors will be set as
+        // kFlushNoWAL.
+        error_handler_.SetBGError(s, BackgroundErrorReason::kFlushNoWAL);
+      }
+    } else {
+      assert(s == log_io_s);
+      Status new_bg_error = s;
+      error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+    }
+  }
+  // If flush ran smoothly and no mempurge happened,
+  // install the new SST file path.
+  if (s.ok() && (!switched_to_mempurge)) {
+    // may temporarily unlock and lock the mutex.
+    NotifyOnFlushCompleted(cfd, mutable_cf_options,
+                           flush_job.GetCommittedFlushJobsInfo());
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    if (sfm) {
+      // Notify sst_file_manager that a new file was added
+      std::string file_path = MakeTableFileName(
+          cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber());
+      // TODO (PR7798). We should only add the file to the FileManager if it
+      // exists. Otherwise, some tests may fail. Ignore the error in the
+      // interim.
+      sfm->OnAddFile(file_path).PermitUncheckedError();
+      if (sfm->IsMaxAllowedSpaceReached()) {
+        Status new_bg_error =
+            Status::SpaceLimit("Max allowed space was reached");
+        TEST_SYNC_POINT_CALLBACK(
+            "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+            &new_bg_error);
+        error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:Finish");
+  return s;
+}
+
+Status DBImpl::FlushMemTablesToOutputFiles(
+    const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+    JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+  if (immutable_db_options_.atomic_flush) {
+    return AtomicFlushMemTablesToOutputFiles(
+        bg_flush_args, made_progress, job_context, log_buffer, thread_pri);
+  }
+  assert(bg_flush_args.size() == 1);
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+  const auto& bg_flush_arg = bg_flush_args[0];
+  ColumnFamilyData* cfd = bg_flush_arg.cfd_;
+  // intentional infrequent copy for each flush
+  MutableCFOptions mutable_cf_options_copy = *cfd->GetLatestMutableCFOptions();
+  SuperVersionContext* superversion_context =
+      bg_flush_arg.superversion_context_;
+  FlushReason flush_reason = bg_flush_arg.flush_reason_;
+  Status s = FlushMemTableToOutputFile(
+      cfd, mutable_cf_options_copy, made_progress, job_context, flush_reason,
+      superversion_context, snapshot_seqs, earliest_write_conflict_snapshot,
+      snapshot_checker, log_buffer, thread_pri);
+  return s;
+}
+
+/*
+ * Atomically flushes multiple column families.
+ *
+ * For each column family, all memtables with ID smaller than or equal to the
+ * ID specified in bg_flush_args will be flushed. Only after all column
+ * families finish flush will this function commit to MANIFEST. If any of the
+ * column families are not flushed successfully, this function does not have
+ * any side-effect on the state of the database.
+ */
+Status DBImpl::AtomicFlushMemTablesToOutputFiles(
+    const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+    JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+  mutex_.AssertHeld();
+
+  autovector<ColumnFamilyData*> cfds;
+  for (const auto& arg : bg_flush_args) {
+    cfds.emplace_back(arg.cfd_);
+  }
+
+#ifndef NDEBUG
+  for (const auto cfd : cfds) {
+    assert(cfd->imm()->NumNotFlushed() != 0);
+    assert(cfd->imm()->IsFlushPending());
+  }
+  for (const auto& bg_flush_arg : bg_flush_args) {
+    assert(bg_flush_arg.flush_reason_ == bg_flush_args[0].flush_reason_);
+  }
+#endif /* !NDEBUG */
+
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+
+  autovector<FSDirectory*> distinct_output_dirs;
+  autovector<std::string> distinct_output_dir_paths;
+  std::vector<std::unique_ptr<FlushJob>> jobs;
+  std::vector<MutableCFOptions> all_mutable_cf_options;
+  int num_cfs = static_cast<int>(cfds.size());
+  all_mutable_cf_options.reserve(num_cfs);
+  for (int i = 0; i < num_cfs; ++i) {
+    auto cfd = cfds[i];
+    FSDirectory* data_dir = GetDataDir(cfd, 0U);
+    const std::string& curr_path = cfd->ioptions()->cf_paths[0].path;
+
+    // Add to distinct output directories if eligible. Use linear search. Since
+    // the number of elements in the vector is not large, performance should be
+    // tolerable.
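+    // For contrast, a hash-set version of this dedup would look like the
+    // sketch below (illustrative only; `seen` is hypothetical), but for a
+    // handful of cf_paths the linear scan that follows is cheaper and simpler:
+    //
+    //   std::unordered_set<std::string> seen;
+    //   if (seen.insert(curr_path).second) {
+    //     distinct_output_dir_paths.emplace_back(curr_path);
+    //     distinct_output_dirs.emplace_back(data_dir);
+    //   }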
+    bool found = false;
+    for (const auto& path : distinct_output_dir_paths) {
+      if (path == curr_path) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      distinct_output_dir_paths.emplace_back(curr_path);
+      distinct_output_dirs.emplace_back(data_dir);
+    }
+
+    all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions());
+    const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back();
+    uint64_t max_memtable_id = bg_flush_args[i].max_memtable_id_;
+    FlushReason flush_reason = bg_flush_args[i].flush_reason_;
+    jobs.emplace_back(new FlushJob(
+        dbname_, cfd, immutable_db_options_, mutable_cf_options,
+        max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_,
+        &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
+        snapshot_checker, job_context, flush_reason, log_buffer,
+        directories_.GetDbDir(), data_dir,
+        GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
+        &event_logger_, mutable_cf_options.report_bg_io_stats,
+        false /* sync_output_directory */, false /* write_manifest */,
+        thread_pri, io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
+        cfd->GetFullHistoryTsLow(), &blob_callback_));
+  }
+
+  std::vector<FileMetaData> file_meta(num_cfs);
+  // Use of deque<bool> because vector<bool>
+  // is specific and doesn't allow &v[i].
+  std::deque<bool> switched_to_mempurge(num_cfs, false);
+  Status s;
+  IOStatus log_io_s = IOStatus::OK();
+  assert(num_cfs == static_cast<int>(jobs.size()));
+
+  for (int i = 0; i != num_cfs; ++i) {
+    const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i);
+    // may temporarily unlock and lock the mutex.
+    FlushReason flush_reason = bg_flush_args[i].flush_reason_;
+    NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options,
+                       job_context->job_id, flush_reason);
+  }
+
+  if (logfile_number_ > 0) {
+    // TODO (yanqin) investigate whether we should sync the closed logs for
+    // single column family case.
+    VersionEdit synced_wals;
+    mutex_.Unlock();
+    log_io_s = SyncClosedLogs(job_context, &synced_wals);
+    mutex_.Lock();
+    if (log_io_s.ok() && synced_wals.IsWalAddition()) {
+      const ReadOptions read_options(Env::IOActivity::kFlush);
+      log_io_s =
+          status_to_io_status(ApplyWALToManifest(read_options, &synced_wals));
+    }
+
+    if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
+        !log_io_s.IsColumnFamilyDropped()) {
+      if (total_log_size_ > 0) {
+        error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
+      } else {
+        // If the WAL is empty, we use a different error reason
+        error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlushNoWAL);
+      }
+    }
+  }
+  s = log_io_s;
+
+  // exec_status stores the execution status of flush_jobs as
+  // <bool /* executed */, Status /* status code */>
+  autovector<std::pair<bool, Status>> exec_status;
+  std::vector<bool> pick_status;
+  for (int i = 0; i != num_cfs; ++i) {
+    // Initially all jobs are not executed, with status OK.
+    exec_status.emplace_back(false, Status::OK());
+    pick_status.push_back(false);
+  }
+
+  if (s.ok()) {
+    for (int i = 0; i != num_cfs; ++i) {
+      jobs[i]->PickMemTable();
+      pick_status[i] = true;
+    }
+  }
+
+  if (s.ok()) {
+    assert(switched_to_mempurge.size() ==
+           static_cast<size_t>(num_cfs));
+    // TODO (yanqin): parallelize jobs with threads.
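+    // One possible shape of the parallelization this TODO asks for (a hedged
+    // sketch only; RocksDB spawns threads as port::Thread elsewhere):
+    //
+    //   std::vector<port::Thread> workers;
+    //   for (int i = 1; i != num_cfs; ++i) {
+    //     workers.emplace_back([&, i] {
+    //       exec_status[i].second = jobs[i]->Run(
+    //           &logs_with_prep_tracker_, &file_meta[i],
+    //           &(switched_to_mempurge.at(i)));
+    //       exec_status[i].first = true;
+    //     });
+    //   }
+    //   // ... run jobs[0] inline, then join all workers ...
+    //
+    // Today the jobs run sequentially on this thread, with job 0 run last.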
+    for (int i = 1; i != num_cfs; ++i) {
+      exec_status[i].second =
+          jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i],
+                       &(switched_to_mempurge.at(i)));
+      exec_status[i].first = true;
+    }
+    if (num_cfs > 1) {
+      TEST_SYNC_POINT(
+          "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1");
+      TEST_SYNC_POINT(
+          "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2");
+    }
+    assert(exec_status.size() > 0);
+    assert(!file_meta.empty());
+    exec_status[0].second = jobs[0]->Run(
+        &logs_with_prep_tracker_, file_meta.data() /* &file_meta[0] */,
+        switched_to_mempurge.empty() ? nullptr : &(switched_to_mempurge.at(0)));
+    exec_status[0].first = true;
+
+    Status error_status;
+    for (const auto& e : exec_status) {
+      if (!e.second.ok()) {
+        s = e.second;
+        if (!e.second.IsShutdownInProgress() &&
+            !e.second.IsColumnFamilyDropped()) {
+          // If a flush job did not return OK, and the CF is not dropped, and
+          // the DB is not shutting down, then we have to return this result to
+          // caller later.
+          error_status = e.second;
+        }
+      }
+    }
+
+    s = error_status.ok() ? s : error_status;
+  }
+
+  if (s.IsColumnFamilyDropped()) {
+    s = Status::OK();
+  }
+
+  if (s.ok() || s.IsShutdownInProgress()) {
+    // Sync on all distinct output directories.
+    for (auto dir : distinct_output_dirs) {
+      if (dir != nullptr) {
+        Status error_status = dir->FsyncWithDirOptions(
+            IOOptions(), nullptr,
+            DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+        if (!error_status.ok()) {
+          s = error_status;
+          break;
+        }
+      }
+    }
+  } else {
+    // Need to undo atomic flush if something went wrong, i.e. s is not OK and
+    // it is not because of CF drop.
+    // Have to cancel the flush jobs that have NOT executed because we need to
+    // unref the versions.
+    for (int i = 0; i != num_cfs; ++i) {
+      if (pick_status[i] && !exec_status[i].first) {
+        jobs[i]->Cancel();
+      }
+    }
+    for (int i = 0; i != num_cfs; ++i) {
+      if (exec_status[i].second.ok() && exec_status[i].first) {
+        auto& mems = jobs[i]->GetMemTables();
+        cfds[i]->imm()->RollbackMemtableFlush(mems,
+                                              file_meta[i].fd.GetNumber());
+      }
+    }
+  }
+
+  if (s.ok()) {
+    const auto wait_to_install_func =
+        [&]() -> std::pair<Status, bool> {
+      if (!versions_->io_status().ok()) {
+        // Something went wrong elsewhere, we cannot count on waiting for our
+        // turn to write/sync to MANIFEST or CURRENT. Just return.
+        return std::make_pair(versions_->io_status(), false);
+      } else if (shutting_down_.load(std::memory_order_acquire)) {
+        return std::make_pair(Status::ShutdownInProgress(), false);
+      }
+      bool ready = true;
+      for (size_t i = 0; i != cfds.size(); ++i) {
+        const auto& mems = jobs[i]->GetMemTables();
+        if (cfds[i]->IsDropped()) {
+          // If the column family is dropped, then do not wait.
+          continue;
+        } else if (!mems.empty() &&
+                   cfds[i]->imm()->GetEarliestMemTableID() < mems[0]->GetID()) {
+          // If a flush job needs to install the flush result for mems and
+          // mems[0] is not the earliest memtable, it means another thread must
+          // be installing flush results for the same column family, so the
+          // current thread needs to wait.
+          ready = false;
+          break;
+        } else if (mems.empty() && cfds[i]->imm()->GetEarliestMemTableID() <=
+                                       bg_flush_args[i].max_memtable_id_) {
+          // If a flush job does not need to install flush results, then it has
+          // to wait until all memtables up to max_memtable_id_ (inclusive) are
+          // installed.
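+          // Worked example with hypothetical IDs: if max_memtable_id_ == 5
+          // and GetEarliestMemTableID() == 4, memtables 4 and 5 still await
+          // installation by some other job, so this thread keeps waiting.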
+          ready = false;
+          break;
+        }
+      }
+      return std::make_pair(Status::OK(), !ready);
+    };
+
+    bool resuming_from_bg_err =
+        error_handler_.IsDBStopped() ||
+        (bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecovery ||
+         bg_flush_args[0].flush_reason_ ==
+             FlushReason::kErrorRecoveryRetryFlush);
+    while ((!resuming_from_bg_err || error_handler_.GetRecoveryError().ok())) {
+      std::pair<Status, bool> res = wait_to_install_func();
+
+      TEST_SYNC_POINT_CALLBACK(
+          "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitToCommit", &res);
+
+      if (!res.first.ok()) {
+        s = res.first;
+        break;
+      } else if (!res.second) {
+        break;
+      }
+      atomic_flush_install_cv_.Wait();
+
+      resuming_from_bg_err =
+          error_handler_.IsDBStopped() ||
+          (bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecovery ||
+           bg_flush_args[0].flush_reason_ ==
+               FlushReason::kErrorRecoveryRetryFlush);
+    }
+
+    if (!resuming_from_bg_err) {
+      // If not resuming from bg err, then we determine future action based on
+      // whether we hit background error.
+      if (s.ok()) {
+        s = error_handler_.GetBGError();
+      }
+    } else if (s.ok()) {
+      // If resuming from bg err, we still rely on wait_to_install_func()'s
+      // result to determine future action. If wait_to_install_func() returns
+      // non-ok already, then we should not proceed to flush result
+      // installation.
+      s = error_handler_.GetRecoveryError();
+    }
+  }
+
+  if (s.ok()) {
+    autovector<ColumnFamilyData*> tmp_cfds;
+    autovector<const autovector<MemTable*>*> mems_list;
+    autovector<const MutableCFOptions*> mutable_cf_options_list;
+    autovector<FileMetaData*> tmp_file_meta;
+    autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
+        committed_flush_jobs_info;
+    for (int i = 0; i != num_cfs; ++i) {
+      const auto& mems = jobs[i]->GetMemTables();
+      if (!cfds[i]->IsDropped() && !mems.empty()) {
+        tmp_cfds.emplace_back(cfds[i]);
+        mems_list.emplace_back(&mems);
+        mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]);
+        tmp_file_meta.emplace_back(&file_meta[i]);
+        committed_flush_jobs_info.emplace_back(
+            jobs[i]->GetCommittedFlushJobsInfo());
+      }
+    }
+
+    s = InstallMemtableAtomicFlushResults(
+        nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list,
+        versions_.get(), &logs_with_prep_tracker_, &mutex_, tmp_file_meta,
+        committed_flush_jobs_info, &job_context->memtables_to_free,
+        directories_.GetDbDir(), log_buffer);
+  }
+
+  if (s.ok()) {
+    assert(num_cfs ==
+           static_cast<int>(job_context->superversion_contexts.size()));
+    for (int i = 0; i != num_cfs; ++i) {
+      assert(cfds[i]);
+
+      if (cfds[i]->IsDropped()) {
+        continue;
+      }
+      InstallSuperVersionAndScheduleWork(cfds[i],
+                                         &job_context->superversion_contexts[i],
+                                         all_mutable_cf_options[i]);
+
+      const std::string& column_family_name = cfds[i]->GetName();
+
+      Version* const current = cfds[i]->current();
+      assert(current);
+
+      const VersionStorageInfo* const storage_info = current->storage_info();
+      assert(storage_info);
+
+      VersionStorageInfo::LevelSummaryStorage tmp;
+      ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+                       column_family_name.c_str(),
+                       storage_info->LevelSummary(&tmp));
+
+      const auto& blob_files = storage_info->GetBlobFiles();
+      if (!blob_files.empty()) {
+        assert(blob_files.front());
+        assert(blob_files.back());
+
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+            column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+            blob_files.back()->GetBlobFileNumber());
+      }
+    }
+    if (made_progress) {
+      *made_progress = true;
+    }
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    assert(all_mutable_cf_options.size() == static_cast<size_t>(num_cfs));
+    for (int i = 0; s.ok() && i != num_cfs; ++i) {
+      // If mempurge happened instead of Flush,
+      // no NotifyOnFlushCompleted call (no SST file created).
+      if (switched_to_mempurge[i]) {
+        continue;
+      }
+      if (cfds[i]->IsDropped()) {
+        continue;
+      }
+      NotifyOnFlushCompleted(cfds[i], all_mutable_cf_options[i],
+                             jobs[i]->GetCommittedFlushJobsInfo());
+      if (sfm) {
+        std::string file_path = MakeTableFileName(
+            cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber());
+        // TODO (PR7798). We should only add the file to the FileManager if it
+        // exists. Otherwise, some tests may fail. Ignore the error in the
+        // interim.
+        sfm->OnAddFile(file_path).PermitUncheckedError();
+        if (sfm->IsMaxAllowedSpaceReached() &&
+            error_handler_.GetBGError().ok()) {
+          Status new_bg_error =
+              Status::SpaceLimit("Max allowed space was reached");
+          error_handler_.SetBGError(new_bg_error,
+                                    BackgroundErrorReason::kFlush);
+        }
+      }
+    }
+  }
+
+  // Need to undo atomic flush if something went wrong, i.e. s is not OK and
+  // it is not because of CF drop.
+  if (!s.ok() && !s.IsColumnFamilyDropped()) {
+    if (log_io_s.ok()) {
+      // Error while writing to MANIFEST.
+      // In fact, versions_->io_status() can also be the result of renaming
+      // CURRENT file. With current code, it's just difficult to tell. So just
+      // be pessimistic and try to write to a new MANIFEST.
+      // TODO: distinguish between MANIFEST write and CURRENT renaming
+      if (!versions_->io_status().ok()) {
+        // If WAL sync is successful (either WAL size is 0 or there is no IO
+        // error), all the Manifest writes will be mapped to soft errors.
+        // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor
+        // is needed.
+        error_handler_.SetBGError(s,
+                                  BackgroundErrorReason::kManifestWriteNoWAL);
+      } else {
+        // If WAL sync is successful (either WAL size is 0 or there is no IO
+        // error), all the other SST file write errors will be set as
+        // kFlushNoWAL.
+        error_handler_.SetBGError(s, BackgroundErrorReason::kFlushNoWAL);
+      }
+    } else {
+      assert(s == log_io_s);
+      Status new_bg_error = s;
+      error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+    }
+  }
+
+  return s;
+}
+
+void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+                                const MutableCFOptions& mutable_cf_options,
+                                int job_id, FlushReason flush_reason) {
+  if (immutable_db_options_.listeners.size() == 0U) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+  bool triggered_writes_slowdown =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_slowdown_writes_trigger);
+  bool triggered_writes_stop =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_stop_writes_trigger);
+  // release lock while notifying events
+  mutex_.Unlock();
+  {
+    FlushJobInfo info{};
+    info.cf_id = cfd->GetID();
+    info.cf_name = cfd->GetName();
+    // TODO(yhchiang): make db_paths dynamic in case flush does not
+    // go to L0 in the future.
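+    // For reference, a minimal user-side listener invoked below might look
+    // like the following sketch (the class is illustrative; the full
+    // interface is in include/rocksdb/listener.h):
+    //
+    //   class FlushLogger : public EventListener {
+    //    public:
+    //     void OnFlushBegin(DB* db, const FlushJobInfo& info) override {
+    //       fprintf(stderr, "flush begin: cf=%s\n", info.cf_name.c_str());
+    //     }
+    //   };
+    //
+    // installed through DBOptions::listeners before DB::Open().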
+    const uint64_t file_number = file_meta->fd.GetNumber();
+    info.file_path =
+        MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_number);
+    info.file_number = file_number;
+    info.thread_id = env_->GetThreadID();
+    info.job_id = job_id;
+    info.triggered_writes_slowdown = triggered_writes_slowdown;
+    info.triggered_writes_stop = triggered_writes_stop;
+    info.smallest_seqno = file_meta->fd.smallest_seqno;
+    info.largest_seqno = file_meta->fd.largest_seqno;
+    info.flush_reason = flush_reason;
+    for (auto listener : immutable_db_options_.listeners) {
+      listener->OnFlushBegin(this, info);
+    }
+  }
+  mutex_.Lock();
+  // no need to signal bg_cv_ as it will be signaled at the end of the
+  // flush process.
+}
+
+void DBImpl::NotifyOnFlushCompleted(
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info) {
+  assert(flush_jobs_info != nullptr);
+  if (immutable_db_options_.listeners.size() == 0U) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+  bool triggered_writes_slowdown =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_slowdown_writes_trigger);
+  bool triggered_writes_stop =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_stop_writes_trigger);
+  // release lock while notifying events
+  mutex_.Unlock();
+  {
+    for (auto& info : *flush_jobs_info) {
+      info->triggered_writes_slowdown = triggered_writes_slowdown;
+      info->triggered_writes_stop = triggered_writes_stop;
+      for (auto listener : immutable_db_options_.listeners) {
+        listener->OnFlushCompleted(this, *info);
+      }
+      TEST_SYNC_POINT(
+          "DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted");
+    }
+    flush_jobs_info->clear();
+  }
+  mutex_.Lock();
+  // no need to signal bg_cv_ as it will be signaled at the end of the
+  // flush process.
+}
+
+Status DBImpl::CompactRange(const CompactRangeOptions& options,
+                            ColumnFamilyHandle* column_family,
+                            const Slice* begin_without_ts,
+                            const Slice* end_without_ts) {
+  if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+  if (ts_sz == 0) {
+    return CompactRangeInternal(options, column_family, begin_without_ts,
+                                end_without_ts, "" /*trim_ts*/);
+  }
+
+  std::string begin_str;
+  std::string end_str;
+
+  // CompactRange compacts all keys in [begin, end] inclusively. Add the
+  // maximum timestamp to include all `begin` keys, and add the minimum
+  // timestamp to include all `end` keys.
+  if (begin_without_ts != nullptr) {
+    AppendKeyWithMaxTimestamp(&begin_str, *begin_without_ts, ts_sz);
+  }
+  if (end_without_ts != nullptr) {
+    AppendKeyWithMinTimestamp(&end_str, *end_without_ts, ts_sz);
+  }
+  Slice begin(begin_str);
+  Slice end(end_str);
+
+  Slice* begin_with_ts = begin_without_ts ? &begin : nullptr;
+  Slice* end_with_ts = end_without_ts ? &end : nullptr;
+
+  return CompactRangeInternal(options, column_family, begin_with_ts,
+                              end_with_ts, "" /*trim_ts*/);
+}
+
+Status DBImpl::IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                        std::string ts_low) {
+  ColumnFamilyData* cfd = nullptr;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    assert(cfh != nullptr);
+    cfd = cfh->cfd();
+  }
+  assert(cfd != nullptr && cfd->user_comparator() != nullptr);
+  if (cfd->user_comparator()->timestamp_size() == 0) {
+    return Status::InvalidArgument(
+        "Timestamp is not enabled in this column family");
+  }
+  if (cfd->user_comparator()->timestamp_size() != ts_low.size()) {
+    return Status::InvalidArgument("ts_low size mismatch");
+  }
+  return IncreaseFullHistoryTsLowImpl(cfd, ts_low);
+}
+
+Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd,
+                                            std::string ts_low) {
+  VersionEdit edit;
+  edit.SetColumnFamily(cfd->GetID());
+  edit.SetFullHistoryTsLow(ts_low);
+
+  // TODO: plumb Env::IOActivity
+  const ReadOptions read_options;
+  TEST_SYNC_POINT_CALLBACK("DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit",
+                           &edit);
+
+  InstrumentedMutexLock l(&mutex_);
+  std::string current_ts_low = cfd->GetFullHistoryTsLow();
+  const Comparator* ucmp = cfd->user_comparator();
+  assert(ucmp->timestamp_size() == ts_low.size() && !ts_low.empty());
+  if (!current_ts_low.empty() &&
+      ucmp->CompareTimestamp(ts_low, current_ts_low) < 0) {
+    return Status::InvalidArgument("Cannot decrease full_history_ts_low");
+  }
+
+  Status s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                    read_options, &edit, &mutex_,
+                                    directories_.GetDbDir());
+  if (!s.ok()) {
+    return s;
+  }
+  current_ts_low = cfd->GetFullHistoryTsLow();
+  if (!current_ts_low.empty() &&
+      ucmp->CompareTimestamp(current_ts_low, ts_low) > 0) {
+    std::stringstream oss;
+    oss << "full_history_ts_low: " << Slice(current_ts_low).ToString(true)
+        << " is set to be higher than the requested "
+           "timestamp: "
+        << Slice(ts_low).ToString(true) << std::endl;
+    return Status::TryAgain(oss.str());
+  }
+  return Status::OK();
+}
+
+Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
+                                    ColumnFamilyHandle* column_family,
+                                    const Slice* begin, const Slice* end,
+                                    const std::string& trim_ts) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+
+  if (options.target_path_id >= cfd->ioptions()->cf_paths.size()) {
+    return Status::InvalidArgument("Invalid target path ID");
+  }
+
+  bool flush_needed = true;
+
+  // Update full_history_ts_low if it's set
+  if (options.full_history_ts_low != nullptr &&
+      !options.full_history_ts_low->empty()) {
+    std::string ts_low = options.full_history_ts_low->ToString();
+    if (begin != nullptr || end != nullptr) {
+      return Status::InvalidArgument(
+          "Cannot specify compaction range with full_history_ts_low");
+    }
+    Status s = IncreaseFullHistoryTsLowImpl(cfd, ts_low);
+    if (!s.ok()) {
+      LogFlush(immutable_db_options_.info_log);
+      return s;
+    }
+  }
+
+  Status s;
+  if (begin != nullptr && end != nullptr) {
+    // TODO(ajkr): We could also optimize away the flush in certain cases where
+    // one/both sides of the interval are unbounded. But it requires more
+    // changes to RangesOverlapWithMemtables.
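+    // Illustrative case for the check below: if the only live memtable holds
+    // keys within ["a", "m"] and the manual compaction range is ["x", "z"],
+    // the ranges do not overlap, RangesOverlapWithMemtables() reports
+    // flush_needed == false, and the pre-compaction flush is skipped.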
+    Range range(*begin, *end);
+    SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+    s = cfd->RangesOverlapWithMemtables(
+        {range}, super_version, immutable_db_options_.allow_data_in_errors,
+        &flush_needed);
+    CleanupSuperVersion(super_version);
+  }
+
+  if (s.ok() && flush_needed) {
+    FlushOptions fo;
+    fo.allow_write_stall = options.allow_write_stall;
+    if (immutable_db_options_.atomic_flush) {
+      s = AtomicFlushMemTables(fo, FlushReason::kManualCompaction);
+    } else {
+      s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction);
+    }
+    if (!s.ok()) {
+      LogFlush(immutable_db_options_.info_log);
+      return s;
+    }
+  }
+
+  constexpr int kInvalidLevel = -1;
+  int final_output_level = kInvalidLevel;
+  bool exclusive = options.exclusive_manual_compaction;
+  if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
+      cfd->NumberLevels() > 1) {
+    // Always compact all files together.
+    final_output_level = cfd->NumberLevels() - 1;
+    // if bottommost level is reserved
+    if (immutable_db_options_.allow_ingest_behind) {
+      final_output_level--;
+    }
+    s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
+                            final_output_level, options, begin, end, exclusive,
+                            false /* disable_trivial_move */,
+                            std::numeric_limits<uint64_t>::max(), trim_ts);
+  } else {
+    int first_overlapped_level = kInvalidLevel;
+    int max_overlapped_level = kInvalidLevel;
+    {
+      SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+      Version* current_version = super_version->current;
+
+      // Might need to query the partitioner
+      SstPartitionerFactory* partitioner_factory =
+          current_version->cfd()->ioptions()->sst_partitioner_factory.get();
+      std::unique_ptr<SstPartitioner> partitioner;
+      if (partitioner_factory && begin != nullptr && end != nullptr) {
+        SstPartitioner::Context context;
+        context.is_full_compaction = false;
+        context.is_manual_compaction = true;
+        context.output_level = /*unknown*/ -1;
+        // Small lies about compaction range
+        context.smallest_user_key = *begin;
+        context.largest_user_key = *end;
+        partitioner = partitioner_factory->CreatePartitioner(context);
+      }
+
+      ReadOptions ro;
+      ro.total_order_seek = true;
+      ro.io_activity = Env::IOActivity::kCompaction;
+      bool overlap;
+      for (int level = 0;
+           level < current_version->storage_info()->num_non_empty_levels();
+           level++) {
+        overlap = true;
+
+        // Whether to look at specific keys within files for overlap with
+        // compaction range, other than largest and smallest keys of the file
+        // known in Version metadata.
+        bool check_overlap_within_file = false;
+        if (begin != nullptr && end != nullptr) {
+          // Typically checking overlap within files in this case
+          check_overlap_within_file = true;
+          // WART: Not known why we don't check within file in one-sided bound
+          // cases
+          if (partitioner) {
+            // Especially if the partitioner is new, the manual compaction
+            // might be used to enforce the partitioning. Checking overlap
+            // within files might miss cases where compaction is needed to
+            // partition the files, as in this example:
+            // * File has two keys "001" and "111"
+            // * Compaction range is ["011", "101")
+            // * Partition boundary at "100"
+            // In cases like this, file-level overlap with the compaction
+            // range is sufficient to force any partitioning that is needed
+            // within the compaction range.
+            //
+            // But if there's no partitioning boundary within the compaction
+            // range, we can be sure there's no need to fix partitioning
+            // within that range, thus safe to check overlap within file.
+            //
+            // Use a hypothetical trivial move query to check for partition
+            // boundary in range. (NOTE: in defiance of all conventions,
+            // `begin` and `end` here are both INCLUSIVE bounds, which makes
+            // this analogy to CanDoTrivialMove() accurate even when `end` is
+            // the first key in a partition.)
+            if (!partitioner->CanDoTrivialMove(*begin, *end)) {
+              check_overlap_within_file = false;
+            }
+          }
+        }
+        if (check_overlap_within_file) {
+          Status status = current_version->OverlapWithLevelIterator(
+              ro, file_options_, *begin, *end, level, &overlap);
+          if (!status.ok()) {
+            check_overlap_within_file = false;
+          }
+        }
+        if (!check_overlap_within_file) {
+          overlap = current_version->storage_info()->OverlapInLevel(level,
+                                                                    begin, end);
+        }
+        if (overlap) {
+          if (first_overlapped_level == kInvalidLevel) {
+            first_overlapped_level = level;
+          }
+          max_overlapped_level = level;
+        }
+      }
+      CleanupSuperVersion(super_version);
+    }
+    if (s.ok() && first_overlapped_level != kInvalidLevel) {
+      if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+          cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+        assert(first_overlapped_level == 0);
+        s = RunManualCompaction(
+            cfd, first_overlapped_level, first_overlapped_level, options, begin,
+            end, exclusive, true /* disallow_trivial_move */,
+            std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
+            trim_ts);
+        final_output_level = max_overlapped_level;
+      } else {
+        assert(cfd->ioptions()->compaction_style == kCompactionStyleLevel);
+        uint64_t next_file_number = versions_->current_next_file_number();
+        // Start compaction from `first_overlapped_level`, one level down at a
+        // time, until output level >= max_overlapped_level.
+        // When max_overlapped_level == 0, we will still compact from L0 -> L1
+        // (or LBase), followed by a bottommost level intra-level compaction
+        // at L1 (or LBase), if applicable.
+        int level = first_overlapped_level;
+        final_output_level = level;
+        int output_level = 0, base_level = 0;
+        while (level < max_overlapped_level || level == 0) {
+          output_level = level + 1;
+          if (cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+              level == 0) {
+            output_level = ColumnFamilyData::kCompactToBaseLevel;
+          }
+          // Use max value for `max_file_num_to_ignore` to always compact
+          // files down.
+          s = RunManualCompaction(
+              cfd, level, output_level, options, begin, end, exclusive,
+              !trim_ts.empty() /* disallow_trivial_move */,
+              std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
+              trim_ts,
+              output_level == ColumnFamilyData::kCompactToBaseLevel
+                  ? &base_level
+                  : nullptr);
+          if (!s.ok()) {
+            break;
+          }
+          if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+            assert(base_level > 0);
+            level = base_level;
+          } else {
+            ++level;
+          }
+          final_output_level = level;
+          TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
+          TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
+        }
+        if (s.ok()) {
+          assert(final_output_level > 0);
+          // bottommost level intra-level compaction
+          // TODO(cbi): this preserves earlier behavior where if
+          // max_overlapped_level = 0 and bottommost_level_compaction is
+          // kIfHaveCompactionFilter, we only do a L0 -> LBase compaction
+          // and do not do intra-LBase compaction even when user configures
+          // compaction filter. We may want to still do a LBase -> LBase
+          // compaction in case there is some file in LBase that did not go
+          // through L0 -> LBase compaction, and hence did not go through
+          // compaction filter.
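+          // Caller-side illustration of the option honored below (a hedged
+          // sketch; `db` is hypothetical): forcing a rewrite of the
+          // bottommost level over the whole key space would be
+          //
+          //   CompactRangeOptions cro;
+          //   cro.bottommost_level_compaction =
+          //       BottommostLevelCompaction::kForce;
+          //   db->CompactRange(cro, nullptr, nullptr);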
+          if ((options.bottommost_level_compaction ==
+                   BottommostLevelCompaction::kIfHaveCompactionFilter &&
+               max_overlapped_level != 0 &&
+               (cfd->ioptions()->compaction_filter != nullptr ||
+                cfd->ioptions()->compaction_filter_factory != nullptr)) ||
+              options.bottommost_level_compaction ==
+                  BottommostLevelCompaction::kForceOptimized ||
+              options.bottommost_level_compaction ==
+                  BottommostLevelCompaction::kForce) {
+            // Use `next_file_number` as `max_file_num_to_ignore` to avoid
+            // rewriting newly compacted files when it is kForceOptimized.
+            s = RunManualCompaction(
+                cfd, final_output_level, final_output_level, options, begin,
+                end, exclusive, !trim_ts.empty() /* disallow_trivial_move */,
+                next_file_number /* max_file_num_to_ignore */, trim_ts);
+          }
+        }
+      }
+    }
+  }
+  if (!s.ok() || final_output_level == kInvalidLevel) {
+    LogFlush(immutable_db_options_.info_log);
+    return s;
+  }
+
+  if (options.change_level) {
+    TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:1");
+    TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:2");
+
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[RefitLevel] waiting for background threads to stop");
+    // TODO(hx235): remove `Enable/DisableManualCompaction` and
+    // `Continue/PauseBackgroundWork` once we ensure registering RefitLevel()'s
+    // range is sufficient (if not, what else is needed) for avoiding range
+    // conflicts with other activities (e.g, compaction, flush) that are
+    // currently avoided by `Enable/DisableManualCompaction` and
+    // `Continue/PauseBackgroundWork`.
+    DisableManualCompaction();
+    s = PauseBackgroundWork();
+    if (s.ok()) {
+      TEST_SYNC_POINT("DBImpl::CompactRange:PreRefitLevel");
+      s = ReFitLevel(cfd, final_output_level, options.target_level);
+      TEST_SYNC_POINT("DBImpl::CompactRange:PostRefitLevel");
+      // ContinueBackgroundWork always returns Status::OK().
+      Status temp_s = ContinueBackgroundWork();
+      assert(temp_s.ok());
+    }
+    EnableManualCompaction();
+    TEST_SYNC_POINT(
+        "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled");
+  }
+  LogFlush(immutable_db_options_.info_log);
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    // an automatic compaction that has been scheduled might have been
+    // preempted by the manual compactions. Need to schedule it back.
+    MaybeScheduleFlushOrCompaction();
+  }
+
+  return s;
+}
+
+Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
+                            ColumnFamilyHandle* column_family,
+                            const std::vector<std::string>& input_file_names,
+                            const int output_level, const int output_path_id,
+                            std::vector<std::string>* const output_file_names,
+                            CompactionJobInfo* compaction_job_info) {
+  if (column_family == nullptr) {
+    return Status::InvalidArgument("ColumnFamilyHandle must be non-null.");
+  }
+
+  auto cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  assert(cfd);
+
+  Status s;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+
+  // Perform CompactFiles
+  TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile2");
+  TEST_SYNC_POINT_CALLBACK(
+      "TestCompactFiles:PausingManualCompaction:3",
+      reinterpret_cast<void*>(
+          const_cast<std::atomic<int>*>(&manual_compaction_paused_)));
+  {
+    InstrumentedMutexLock l(&mutex_);
+    auto* current = cfd->current();
+    current->Ref();
+
+    s = CompactFilesImpl(compact_options, cfd, current, input_file_names,
+                         output_file_names, output_level, output_path_id,
+                         &job_context, &log_buffer, compaction_job_info);
+
+    current->Unref();
+  }
+
+  // Find and delete obsolete files
+  {
+    InstrumentedMutexLock l(&mutex_);
+    // If !s.ok(), this means that Compaction failed. In that case, we want
+    // to delete all obsolete files we might have created and we force
+    // FindObsoleteFiles(). This is because job_context does not
+    // catch all created files if compaction failed.
+    FindObsoleteFiles(&job_context, !s.ok());
+  }  // release the mutex
+
+  // delete unnecessary files if any, this is done outside the mutex
+  if (job_context.HaveSomethingToClean() ||
+      job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+    // Have to flush the info logs before bg_compaction_scheduled_--
+    // because if bg_flush_scheduled_ becomes 0 and the lock is
+    // released, the destructor of DB can kick in and destroy all the
+    // states of DB so info_log might not be available after that point.
+    // It also applies to access other states that DB owns.
+    log_buffer.FlushBufferToLog();
+    if (job_context.HaveSomethingToDelete()) {
+      // no mutex is locked here. No need to Unlock() and Lock() here.
+      PurgeObsoleteFiles(job_context);
+    }
+    job_context.Clean();
+  }
+
+  return s;
+}
+
+Status DBImpl::CompactFilesImpl(
+    const CompactionOptions& compact_options, ColumnFamilyData* cfd,
+    Version* version, const std::vector<std::string>& input_file_names,
+    std::vector<std::string>* const output_file_names, const int output_level,
+    int output_path_id, JobContext* job_context, LogBuffer* log_buffer,
+    CompactionJobInfo* compaction_job_info) {
+  mutex_.AssertHeld();
+
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return Status::ShutdownInProgress();
+  }
+  if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  std::unordered_set<uint64_t> input_set;
+  for (const auto& file_name : input_file_names) {
+    input_set.insert(TableFileNameToNumber(file_name));
+  }
+
+  ColumnFamilyMetaData cf_meta;
+  // TODO(yhchiang): can directly use version here if none of the
+  // following function calls are pluggable to external developers.
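+  // For orientation, a hedged sketch of a typical caller path into this
+  // function via the public API (names and output level are illustrative):
+  //
+  //   ColumnFamilyMetaData meta;
+  //   db->GetColumnFamilyMetaData(&meta);
+  //   std::vector<std::string> inputs;
+  //   for (const auto& f : meta.levels[0].files) inputs.push_back(f.name);
+  //   db->CompactFiles(CompactionOptions(), inputs, 1 /* output_level */);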
+  version->GetColumnFamilyMetaData(&cf_meta);
+
+  if (output_path_id < 0) {
+    if (cfd->ioptions()->cf_paths.size() == 1U) {
+      output_path_id = 0;
+    } else {
+      return Status::NotSupported(
+          "Automatic output path selection is not "
+          "yet supported in CompactFiles()");
+    }
+  }
+
+  Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
+      &input_set, cf_meta, output_level);
+  TEST_SYNC_POINT("DBImpl::CompactFilesImpl::PostSanitizeCompactionInputFiles");
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::vector<CompactionInputFiles> input_files;
+  s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, version->storage_info(), compact_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (const auto& inputs : input_files) {
+    if (cfd->compaction_picker()->AreFilesInCompaction(inputs.files)) {
+      return Status::Aborted(
+          "Some of the necessary compaction input "
+          "files are already being compacted");
+    }
+  }
+  bool sfm_reserved_compact_space = false;
+  // First check if we have enough room to do the compaction
+  bool enough_room = EnoughRoomForCompaction(
+      cfd, input_files, &sfm_reserved_compact_space, log_buffer);
+
+  if (!enough_room) {
+    // m's vars will get set properly at the end of this function,
+    // as long as status == CompactionTooLarge
+    return Status::CompactionTooLarge();
+  }
+
+  // At this point, CompactFiles will be run.
+  bg_compaction_scheduled_++;
+
+  std::unique_ptr<Compaction> c;
+  assert(cfd->compaction_picker());
+  c.reset(cfd->compaction_picker()->CompactFiles(
+      compact_options, input_files, output_level, version->storage_info(),
+      *cfd->GetLatestMutableCFOptions(), mutable_db_options_, output_path_id));
+  // we already sanitized the set of input files and checked for conflicts
+  // without releasing the lock, so we're guaranteed a compaction can be formed.
+  assert(c != nullptr);
+
+  c->SetInputVersion(version);
+  // deletion compaction currently not allowed in CompactFiles.
+  assert(!c->deletion_compaction());
+
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+
+  std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+      new std::list<uint64_t>::iterator(
+          CaptureCurrentFileNumberInPendingOutputs()));
+
+  assert(is_snapshot_supported_ || snapshots_.empty());
+  CompactionJobStats compaction_job_stats;
+  CompactionJob compaction_job(
+      job_context->job_id, c.get(), immutable_db_options_, mutable_db_options_,
+      file_options_for_compaction_, versions_.get(), &shutting_down_,
+      log_buffer, directories_.GetDbDir(),
+      GetDataDir(c->column_family_data(), c->output_path_id()),
+      GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_,
+      snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
+      job_context, table_cache_, &event_logger_,
+      c->mutable_cf_options()->paranoid_file_checks,
+      c->mutable_cf_options()->report_bg_io_stats, dbname_,
+      &compaction_job_stats, Env::Priority::USER, io_tracer_,
+      kManualCompactionCanceledFalse_, db_id_, db_session_id_,
+      c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(),
+      &blob_callback_, &bg_compaction_scheduled_,
+      &bg_bottom_compaction_scheduled_);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are already
+  // being compacted). Since we just changed compaction score, we recalculate it
+  // here.
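+  // Rough intuition for the recomputed score (a simplification of
+  // VersionStorageInfo::ComputeCompactionScore(), not the exact formula):
+  // L0's score grows with its file count relative to
+  // level0_file_num_compaction_trigger, and Ln's with its byte size relative
+  // to the level's target size; a score >= 1 marks the level as a compaction
+  // candidate.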
+  version->storage_info()->ComputeCompactionScore(*cfd->ioptions(),
+                                                  *c->mutable_cf_options());
+
+  compaction_job.Prepare();
+
+  mutex_.Unlock();
+  TEST_SYNC_POINT("CompactFilesImpl:0");
+  TEST_SYNC_POINT("CompactFilesImpl:1");
+  // Ignore the status here, as it will be checked in the Install down below...
+  compaction_job.Run().PermitUncheckedError();
+  TEST_SYNC_POINT("CompactFilesImpl:2");
+  TEST_SYNC_POINT("CompactFilesImpl:3");
+  mutex_.Lock();
+
+  Status status = compaction_job.Install(*c->mutable_cf_options());
+  if (status.ok()) {
+    assert(compaction_job.io_status().ok());
+    InstallSuperVersionAndScheduleWork(c->column_family_data(),
+                                       &job_context->superversion_contexts[0],
+                                       *c->mutable_cf_options());
+  }
+  // status above captures any error during compaction_job.Install, so it's OK
+  // not to check compaction_job.io_status() explicitly if we're not calling
+  // SetBGError
+  compaction_job.io_status().PermitUncheckedError();
+  c->ReleaseCompactionFiles(s);
+  // Need to make sure SstFileManager does its bookkeeping
+  auto sfm = static_cast<SstFileManagerImpl*>(
+      immutable_db_options_.sst_file_manager.get());
+  if (sfm && sfm_reserved_compact_space) {
+    sfm->OnCompactionCompletion(c.get());
+  }
+
+  ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+  if (compaction_job_info != nullptr) {
+    BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats,
+                           job_context->job_id, version, compaction_job_info);
+  }
+
+  if (status.ok()) {
+    // Done
+  } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+    // Ignore compaction errors found during shutdown
+  } else if (status.IsManualCompactionPaused()) {
+    // Don't report stopping manual compaction as error
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[%s] [JOB %d] Stopping manual compaction",
+                   c->column_family_data()->GetName().c_str(),
+                   job_context->job_id);
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "[%s] [JOB %d] Compaction error: %s",
+                   c->column_family_data()->GetName().c_str(),
+                   job_context->job_id, status.ToString().c_str());
+    IOStatus io_s = compaction_job.io_status();
+    if (!io_s.ok()) {
+      error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction);
+    } else {
+      error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+    }
+  }
+
+  if (output_file_names != nullptr) {
+    for (const auto& newf : c->edit()->GetNewFiles()) {
+      output_file_names->push_back(TableFileName(
+          c->immutable_options()->cf_paths, newf.second.fd.GetNumber(),
+          newf.second.fd.GetPathId()));
+    }
+
+    for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) {
+      output_file_names->push_back(
+          BlobFileName(c->immutable_options()->cf_paths.front().path,
+                       blob_file.GetBlobFileNumber()));
+    }
+  }
+
+  c.reset();
+
+  bg_compaction_scheduled_--;
+  if (bg_compaction_scheduled_ == 0) {
+    bg_cv_.SignalAll();
+  }
+  MaybeScheduleFlushOrCompaction();
+  TEST_SYNC_POINT("CompactFilesImpl:End");
+
+  return status;
+}
+
+Status DBImpl::PauseBackgroundWork() {
+  InstrumentedMutexLock guard_lock(&mutex_);
+  bg_compaction_paused_++;
+  while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 ||
+         bg_flush_scheduled_ > 0) {
+    bg_cv_.Wait();
+  }
+  bg_work_paused_++;
+  return Status::OK();
+}
+
+Status DBImpl::ContinueBackgroundWork() {
+  InstrumentedMutexLock guard_lock(&mutex_);
+  if (bg_work_paused_ == 0) {
+    return Status::InvalidArgument();
+  }
+  assert(bg_work_paused_ > 0);
+  assert(bg_compaction_paused_ > 0);
+  bg_compaction_paused_--;
+  bg_work_paused_--;
+  // It's sufficient to
check just bg_work_paused_ here since + // bg_work_paused_ is always no greater than bg_compaction_paused_ + if (bg_work_paused_ == 0) { + MaybeScheduleFlushOrCompaction(); + } + return Status::OK(); +} + +void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, + const Status& st, + const CompactionJobStats& job_stats, + int job_id) { + if (immutable_db_options_.listeners.empty()) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + if (c->is_manual_compaction() && + manual_compaction_paused_.load(std::memory_order_acquire) > 0) { + return; + } + + c->SetNotifyOnCompactionCompleted(); + Version* current = cfd->current(); + current->Ref(); + // release lock while notifying events + mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex"); + { + CompactionJobInfo info{}; + BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, current, &info); + for (auto listener : immutable_db_options_.listeners) { + listener->OnCompactionBegin(this, info); + } + info.status.PermitUncheckedError(); + } + mutex_.Lock(); + current->Unref(); +} + +void DBImpl::NotifyOnCompactionCompleted( + ColumnFamilyData* cfd, Compaction* c, const Status& st, + const CompactionJobStats& compaction_job_stats, const int job_id) { + if (immutable_db_options_.listeners.size() == 0U) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + + if (c->ShouldNotifyOnCompactionCompleted() == false) { + return; + } + + Version* current = cfd->current(); + current->Ref(); + // release lock while notifying events + mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex"); + { + CompactionJobInfo info{}; + BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current, + &info); + for (auto listener : immutable_db_options_.listeners) { + listener->OnCompactionCompleted(this, info); + } + } + mutex_.Lock(); + current->Unref(); + // no need to signal bg_cv_ as it will be signaled at the end of the + // flush process. 
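+
+  // Listener-side sketch (illustrative, user code): the callback invoked
+  // above is the public EventListener API, e.g.:
+  //
+  //   class CompactionLogger : public EventListener {
+  //    public:
+  //     void OnCompactionCompleted(DB* /*db*/,
+  //                                const CompactionJobInfo& info) override {
+  //       fprintf(stderr, "[%s] compacted L%d -> L%d\n",
+  //               info.cf_name.c_str(), info.base_input_level,
+  //               info.output_level);
+  //     }
+  //   };
+  //
+  //   // registered before DB::Open() via:
+  //   // options.listeners.push_back(std::make_shared<CompactionLogger>());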
+} + +// REQUIREMENT: block all background work by calling PauseBackgroundWork() +// before calling this function +Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { + assert(level < cfd->NumberLevels()); + if (target_level >= cfd->NumberLevels()) { + return Status::InvalidArgument("Target level exceeds number of levels"); + } + + const ReadOptions read_options(Env::IOActivity::kCompaction); + + SuperVersionContext sv_context(/* create_superversion */ true); + + InstrumentedMutexLock guard_lock(&mutex_); + + auto* vstorage = cfd->current()->storage_info(); + if (vstorage->LevelFiles(level).empty()) { + return Status::OK(); + } + // only allow one thread refitting + if (refitting_level_) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[ReFitLevel] another thread is refitting"); + return Status::NotSupported("another thread is refitting"); + } + refitting_level_ = true; + + const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + // move to a smaller level + int to_level = target_level; + if (target_level < 0) { + to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level); + } + + if (to_level != level) { + std::vector input(1); + input[0].level = level; + for (auto& f : vstorage->LevelFiles(level)) { + input[0].files.push_back(f); + } + InternalKey refit_level_smallest; + InternalKey refit_level_largest; + cfd->compaction_picker()->GetRange(input[0], &refit_level_smallest, + &refit_level_largest); + if (to_level > level) { + if (level == 0) { + refitting_level_ = false; + return Status::NotSupported( + "Cannot change from level 0 to other levels."); + } + // Check levels are empty for a trivial move + for (int l = level + 1; l <= to_level; l++) { + if (vstorage->NumLevelFiles(l) > 0) { + refitting_level_ = false; + return Status::NotSupported( + "Levels between source and target are not empty for a move."); + } + if (cfd->RangeOverlapWithCompaction(refit_level_smallest.user_key(), + refit_level_largest.user_key(), + l)) { + refitting_level_ = false; + return Status::NotSupported( + "Levels between source and target " + "will have some ongoing compaction's output."); + } + } + } else { + // to_level < level + // Check levels are empty for a trivial move + for (int l = to_level; l < level; l++) { + if (vstorage->NumLevelFiles(l) > 0) { + refitting_level_ = false; + return Status::NotSupported( + "Levels between source and target are not empty for a move."); + } + if (cfd->RangeOverlapWithCompaction(refit_level_smallest.user_key(), + refit_level_largest.user_key(), + l)) { + refitting_level_ = false; + return Status::NotSupported( + "Levels between source and target " + "will have some ongoing compaction's output."); + } + } + } + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] Before refitting:\n%s", cfd->GetName().c_str(), + cfd->current()->DebugString().data()); + + std::unique_ptr c(new Compaction( + vstorage, *cfd->ioptions(), mutable_cf_options, mutable_db_options_, + {input}, to_level, + MaxFileSizeForLevel( + mutable_cf_options, to_level, + cfd->ioptions() + ->compaction_style) /* output file size limit, not applicable */ + , + LLONG_MAX /* max compaction bytes, not applicable */, + 0 /* output path ID, not applicable */, mutable_cf_options.compression, + mutable_cf_options.compression_opts, Temperature::kUnknown, + 0 /* max_subcompactions, not applicable */, + {} /* grandparents, not applicable */, false /* is manual */, + "" /* trim_ts */, -1 /* score, not applicable */, + false /* is deletion compaction, 
not applicable */, + false /* l0_files_might_overlap, not applicable */, + CompactionReason::kRefitLevel)); + cfd->compaction_picker()->RegisterCompaction(c.get()); + TEST_SYNC_POINT("DBImpl::ReFitLevel:PostRegisterCompaction"); + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + + for (const auto& f : vstorage->LevelFiles(level)) { + edit.DeleteFile(level, f->fd.GetNumber()); + edit.AddFile( + to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), + f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, + f->oldest_ancester_time, f->file_creation_time, f->epoch_number, + f->file_checksum, f->file_checksum_func_name, f->unique_id, + f->compensated_range_deletion_size, f->tail_size); + } + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), + edit.DebugString().data()); + + Status status = + versions_->LogAndApply(cfd, mutable_cf_options, read_options, &edit, + &mutex_, directories_.GetDbDir()); + + cfd->compaction_picker()->UnregisterCompaction(c.get()); + c.reset(); + + InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options); + + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n", + cfd->GetName().c_str(), status.ToString().data()); + + if (status.ok()) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] After refitting:\n%s", cfd->GetName().c_str(), + cfd->current()->DebugString().data()); + } + sv_context.Clean(); + refitting_level_ = false; + + return status; + } + + refitting_level_ = false; + return Status::OK(); +} + +int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { + auto cfh = static_cast_with_check(column_family); + return cfh->cfd()->NumberLevels(); +} + +int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) { + return 0; +} + +int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { + auto cfh = static_cast_with_check(column_family); + InstrumentedMutexLock l(&mutex_); + return cfh->cfd() + ->GetSuperVersion() + ->mutable_cf_options.level0_stop_writes_trigger; +} + +Status DBImpl::Flush(const FlushOptions& flush_options, + ColumnFamilyHandle* column_family) { + auto cfh = static_cast_with_check(column_family); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.", + cfh->GetName().c_str()); + Status s; + if (immutable_db_options_.atomic_flush) { + s = AtomicFlushMemTables(flush_options, FlushReason::kManualFlush, + {cfh->cfd()}); + } else { + s = FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush); + } + + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] Manual flush finished, status: %s\n", + cfh->GetName().c_str(), s.ToString().c_str()); + return s; +} + +Status DBImpl::Flush(const FlushOptions& flush_options, + const std::vector& column_families) { + Status s; + if (!immutable_db_options_.atomic_flush) { + for (auto cfh : column_families) { + s = Flush(flush_options, cfh); + if (!s.ok()) { + break; + } + } + } else { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Manual atomic flush start.\n" + "=====Column families:====="); + for (auto cfh : column_families) { + auto cfhi = static_cast(cfh); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", + cfhi->GetName().c_str()); + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "=====End of column families list====="); + autovector cfds; + std::for_each(column_families.begin(), column_families.end(), + 
[&cfds](ColumnFamilyHandle* elem) { + auto cfh = static_cast(elem); + cfds.emplace_back(cfh->cfd()); + }); + s = AtomicFlushMemTables(flush_options, FlushReason::kManualFlush, cfds); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Manual atomic flush finished, status: %s\n" + "=====Column families:=====", + s.ToString().c_str()); + for (auto cfh : column_families) { + auto cfhi = static_cast(cfh); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", + cfhi->GetName().c_str()); + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "=====End of column families list====="); + } + return s; +} + +Status DBImpl::RunManualCompaction( + ColumnFamilyData* cfd, int input_level, int output_level, + const CompactRangeOptions& compact_range_options, const Slice* begin, + const Slice* end, bool exclusive, bool disallow_trivial_move, + uint64_t max_file_num_to_ignore, const std::string& trim_ts, + int* final_output_level) { + assert(input_level == ColumnFamilyData::kCompactAllLevels || + input_level >= 0); + + InternalKey begin_storage, end_storage; + CompactionArg* ca = nullptr; + + bool scheduled = false; + bool unscheduled = false; + Env::Priority thread_pool_priority = Env::Priority::TOTAL; + bool manual_conflict = false; + + ManualCompactionState manual( + cfd, input_level, output_level, compact_range_options.target_path_id, + exclusive, disallow_trivial_move, compact_range_options.canceled); + // For universal compaction, we enforce every manual compaction to compact + // all files. + if (begin == nullptr || + cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + manual.begin = nullptr; + } else { + begin_storage.SetMinPossibleForUserKey(*begin); + manual.begin = &begin_storage; + } + if (end == nullptr || + cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + manual.end = nullptr; + } else { + end_storage.SetMaxPossibleForUserKey(*end); + manual.end = &end_storage; + } + + TEST_SYNC_POINT("DBImpl::RunManualCompaction:0"); + TEST_SYNC_POINT("DBImpl::RunManualCompaction:1"); + InstrumentedMutexLock l(&mutex_); + + if (manual_compaction_paused_ > 0) { + // Does not make sense to `AddManualCompaction()` in this scenario since + // `DisableManualCompaction()` just waited for the manual compaction queue + // to drain. So return immediately. + TEST_SYNC_POINT("DBImpl::RunManualCompaction:PausedAtStart"); + manual.status = + Status::Incomplete(Status::SubCode::kManualCompactionPaused); + manual.done = true; + return manual.status; + } + + // When a manual compaction arrives, temporarily disable scheduling of + // non-manual compactions and wait until the number of scheduled compaction + // jobs drops to zero. This used to be needed to ensure that this manual + // compaction can compact any range of keys/files. Now it is optional + // (see `CompactRangeOptions::exclusive_manual_compaction`). The use case for + // `exclusive_manual_compaction=true` is unclear beyond not trusting the code. + // + // HasPendingManualCompaction() is true when at least one thread is inside + // RunManualCompaction(), i.e. during that time no other compaction will + // get scheduled (see MaybeScheduleFlushOrCompaction). + // + // Note that the following loop doesn't stop more that one thread calling + // RunManualCompaction() from getting to the second while loop below. 
+ // However, only one of them will actually schedule compaction, while + // others will wait on a condition variable until it completes. + + AddManualCompaction(&manual); + TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_); + if (exclusive) { + // Limitation: there's no way to wake up the below loop when user sets + // `*manual.canceled`. So `CompactRangeOptions::exclusive_manual_compaction` + // and `CompactRangeOptions::canceled` might not work well together. + while (bg_bottom_compaction_scheduled_ > 0 || + bg_compaction_scheduled_ > 0) { + if (manual_compaction_paused_ > 0 || manual.canceled == true) { + // Pretend the error came from compaction so the below cleanup/error + // handling code can process it. + manual.done = true; + manual.status = + Status::Incomplete(Status::SubCode::kManualCompactionPaused); + break; + } + TEST_SYNC_POINT("DBImpl::RunManualCompaction:WaitScheduled"); + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "[%s] Manual compaction waiting for all other scheduled background " + "compactions to finish", + cfd->GetName().c_str()); + bg_cv_.Wait(); + } + } + + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); + + ROCKS_LOG_BUFFER(&log_buffer, "[%s] Manual compaction starting", + cfd->GetName().c_str()); + + // We don't check bg_error_ here, because if we get the error in compaction, + // the compaction will set manual.status to bg_error_ and set manual.done to + // true. + while (!manual.done) { + assert(HasPendingManualCompaction()); + manual_conflict = false; + Compaction* compaction = nullptr; + if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) || + scheduled || + (((manual.manual_end = &manual.tmp_storage1) != nullptr) && + ((compaction = manual.cfd->CompactRange( + *manual.cfd->GetLatestMutableCFOptions(), mutable_db_options_, + manual.input_level, manual.output_level, compact_range_options, + manual.begin, manual.end, &manual.manual_end, &manual_conflict, + max_file_num_to_ignore, trim_ts)) == nullptr && + manual_conflict))) { + if (!scheduled) { + // There is a conflicting compaction + if (manual_compaction_paused_ > 0 || manual.canceled == true) { + // Stop waiting since it was canceled. Pretend the error came from + // compaction so the below cleanup/error handling code can process it. + manual.done = true; + manual.status = + Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + } + if (!manual.done) { + bg_cv_.Wait(); + } + if (manual_compaction_paused_ > 0 && scheduled && !unscheduled) { + assert(thread_pool_priority != Env::Priority::TOTAL); + // unschedule all manual compactions + auto unscheduled_task_num = env_->UnSchedule( + GetTaskTag(TaskType::kManualCompaction), thread_pool_priority); + if (unscheduled_task_num > 0) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "[%s] Unscheduled %d number of manual compactions from the " + "thread-pool", + cfd->GetName().c_str(), unscheduled_task_num); + // it may unschedule other manual compactions, notify others. + bg_cv_.SignalAll(); + } + unscheduled = true; + TEST_SYNC_POINT("DBImpl::RunManualCompaction:Unscheduled"); + } + if (scheduled && manual.incomplete == true) { + assert(!manual.in_progress); + scheduled = false; + manual.incomplete = false; + } + } else if (!scheduled) { + if (compaction == nullptr) { + manual.done = true; + if (final_output_level) { + // No compaction needed or there is a conflicting compaction. 
+ // Still set `final_output_level` to the level where we would + // have compacted to. + *final_output_level = output_level; + if (output_level == ColumnFamilyData::kCompactToBaseLevel) { + *final_output_level = cfd->current()->storage_info()->base_level(); + } + } + bg_cv_.SignalAll(); + continue; + } + ca = new CompactionArg; + ca->db = this; + ca->prepicked_compaction = new PrepickedCompaction; + ca->prepicked_compaction->manual_compaction_state = &manual; + ca->prepicked_compaction->compaction = compaction; + if (!RequestCompactionToken( + cfd, true, &ca->prepicked_compaction->task_token, &log_buffer)) { + // Don't throttle manual compaction, only count outstanding tasks. + assert(false); + } + manual.incomplete = false; + if (compaction->bottommost_level() && + env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) { + bg_bottom_compaction_scheduled_++; + ca->compaction_pri_ = Env::Priority::BOTTOM; + env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, + Env::Priority::BOTTOM, + GetTaskTag(TaskType::kManualCompaction), + &DBImpl::UnscheduleCompactionCallback); + thread_pool_priority = Env::Priority::BOTTOM; + } else { + bg_compaction_scheduled_++; + ca->compaction_pri_ = Env::Priority::LOW; + env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, + GetTaskTag(TaskType::kManualCompaction), + &DBImpl::UnscheduleCompactionCallback); + thread_pool_priority = Env::Priority::LOW; + } + scheduled = true; + TEST_SYNC_POINT("DBImpl::RunManualCompaction:Scheduled"); + if (final_output_level) { + *final_output_level = compaction->output_level(); + } + } + } + + log_buffer.FlushBufferToLog(); + assert(!manual.in_progress); + assert(HasPendingManualCompaction()); + RemoveManualCompaction(&manual); + // if the manual job is unscheduled, try schedule other jobs in case there's + // any unscheduled compaction job which was blocked by exclusive manual + // compaction. + if (manual.status.IsIncomplete() && + manual.status.subcode() == Status::SubCode::kManualCompactionPaused) { + MaybeScheduleFlushOrCompaction(); + } + bg_cv_.SignalAll(); + return manual.status; +} + +void DBImpl::GenerateFlushRequest(const autovector& cfds, + FlushReason flush_reason, FlushRequest* req) { + assert(req != nullptr); + req->flush_reason = flush_reason; + req->cfd_to_max_mem_id_to_persist.reserve(cfds.size()); + for (const auto cfd : cfds) { + if (nullptr == cfd) { + // cfd may be null, see DBImpl::ScheduleFlushes + continue; + } + uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID(); + req->cfd_to_max_mem_id_to_persist.emplace(cfd, max_memtable_id); + } +} + +Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_options, + FlushReason flush_reason, + bool entered_write_thread) { + // This method should not be called if atomic_flush is true. + assert(!immutable_db_options_.atomic_flush); + if (!flush_options.wait && write_controller_.IsStopped()) { + std::ostringstream oss; + oss << "Writes have been stopped, thus unable to perform manual flush. 
" + "Please try again later after writes are resumed"; + return Status::TryAgain(oss.str()); + } + Status s; + if (!flush_options.allow_write_stall) { + bool flush_needed = true; + s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed); + TEST_SYNC_POINT("DBImpl::FlushMemTable:StallWaitDone"); + if (!s.ok() || !flush_needed) { + return s; + } + } + + const bool needs_to_join_write_thread = !entered_write_thread; + autovector flush_reqs; + autovector memtable_ids_to_wait; + { + WriteContext context; + InstrumentedMutexLock guard_lock(&mutex_); + + WriteThread::Writer w; + WriteThread::Writer nonmem_w; + if (needs_to_join_write_thread) { + write_thread_.EnterUnbatched(&w, &mutex_); + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + } + WaitForPendingWrites(); + + if (flush_reason != FlushReason::kErrorRecoveryRetryFlush && + (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load())) { + // Note that, when flush reason is kErrorRecoveryRetryFlush, during the + // auto retry resume, we want to avoid creating new small memtables. + // Therefore, SwitchMemtable will not be called. Also, since ResumeImpl + // will iterate through all the CFs and call FlushMemtable during auto + // retry resume, it is possible that in some CFs, + // cfd->imm()->NumNotFlushed() = 0. In this case, so no flush request will + // be created and scheduled, status::OK() will be returned. + s = SwitchMemtable(cfd, &context); + } + const uint64_t flush_memtable_id = std::numeric_limits::max(); + if (s.ok()) { + if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || + !cached_recoverable_state_empty_.load()) { + FlushRequest req{flush_reason, {{cfd, flush_memtable_id}}}; + flush_reqs.emplace_back(std::move(req)); + memtable_ids_to_wait.emplace_back(cfd->imm()->GetLatestMemTableID()); + } + if (immutable_db_options_.persist_stats_to_disk && + flush_reason != FlushReason::kErrorRecoveryRetryFlush) { + ColumnFamilyData* cfd_stats = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + if (cfd_stats != nullptr && cfd_stats != cfd && + !cfd_stats->mem()->IsEmpty()) { + // only force flush stats CF when it will be the only CF lagging + // behind after the current flush + bool stats_cf_flush_needed = true; + for (auto* loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd == cfd_stats || loop_cfd == cfd) { + continue; + } + if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) { + stats_cf_flush_needed = false; + } + } + if (stats_cf_flush_needed) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Force flushing stats CF with manual flush of %s " + "to avoid holding old logs", + cfd->GetName().c_str()); + s = SwitchMemtable(cfd_stats, &context); + FlushRequest req{flush_reason, {{cfd_stats, flush_memtable_id}}}; + flush_reqs.emplace_back(std::move(req)); + memtable_ids_to_wait.emplace_back( + cfd_stats->imm()->GetLatestMemTableID()); + } + } + } + } + + if (s.ok() && !flush_reqs.empty()) { + for (const auto& req : flush_reqs) { + assert(req.cfd_to_max_mem_id_to_persist.size() == 1); + ColumnFamilyData* loop_cfd = + req.cfd_to_max_mem_id_to_persist.begin()->first; + loop_cfd->imm()->FlushRequested(); + } + // If the caller wants to wait for this flush to complete, it indicates + // that the caller expects the ColumnFamilyData not to be free'ed by + // other threads which may drop the column family concurrently. + // Therefore, we increase the cfd's ref count. 
+ if (flush_options.wait) { + for (const auto& req : flush_reqs) { + assert(req.cfd_to_max_mem_id_to_persist.size() == 1); + ColumnFamilyData* loop_cfd = + req.cfd_to_max_mem_id_to_persist.begin()->first; + loop_cfd->Ref(); + } + } + for (const auto& req : flush_reqs) { + SchedulePendingFlush(req); + } + MaybeScheduleFlushOrCompaction(); + } + + if (needs_to_join_write_thread) { + write_thread_.ExitUnbatched(&w); + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + } + } + TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush"); + TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush"); + if (s.ok() && flush_options.wait) { + autovector cfds; + autovector flush_memtable_ids; + assert(flush_reqs.size() == memtable_ids_to_wait.size()); + for (size_t i = 0; i < flush_reqs.size(); ++i) { + assert(flush_reqs[i].cfd_to_max_mem_id_to_persist.size() == 1); + cfds.push_back(flush_reqs[i].cfd_to_max_mem_id_to_persist.begin()->first); + flush_memtable_ids.push_back(&(memtable_ids_to_wait[i])); + } + s = WaitForFlushMemTables( + cfds, flush_memtable_ids, + (flush_reason == FlushReason::kErrorRecovery || + flush_reason == FlushReason::kErrorRecoveryRetryFlush)); + InstrumentedMutexLock lock_guard(&mutex_); + for (auto* tmp_cfd : cfds) { + tmp_cfd->UnrefAndTryDelete(); + } + } + TEST_SYNC_POINT("DBImpl::FlushMemTable:FlushMemTableFinished"); + return s; +} + +Status DBImpl::AtomicFlushMemTables( + const FlushOptions& flush_options, FlushReason flush_reason, + const autovector& provided_candidate_cfds, + bool entered_write_thread) { + assert(immutable_db_options_.atomic_flush); + if (!flush_options.wait && write_controller_.IsStopped()) { + std::ostringstream oss; + oss << "Writes have been stopped, thus unable to perform manual flush. 
" + "Please try again later after writes are resumed"; + return Status::TryAgain(oss.str()); + } + Status s; + autovector candidate_cfds; + if (provided_candidate_cfds.empty()) { + // Generate candidate cfds if not provided + { + InstrumentedMutexLock l(&mutex_); + for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped() && cfd->initialized()) { + cfd->Ref(); + candidate_cfds.push_back(cfd); + } + } + } + } else { + candidate_cfds = provided_candidate_cfds; + } + + if (!flush_options.allow_write_stall) { + int num_cfs_to_flush = 0; + for (auto cfd : candidate_cfds) { + bool flush_needed = true; + s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed); + if (!s.ok()) { + // Unref the newly generated candidate cfds (when not provided) in + // `candidate_cfds` + if (provided_candidate_cfds.empty()) { + for (auto candidate_cfd : candidate_cfds) { + candidate_cfd->UnrefAndTryDelete(); + } + } + return s; + } else if (flush_needed) { + ++num_cfs_to_flush; + } + } + if (0 == num_cfs_to_flush) { + // Unref the newly generated candidate cfds (when not provided) in + // `candidate_cfds` + if (provided_candidate_cfds.empty()) { + for (auto candidate_cfd : candidate_cfds) { + candidate_cfd->UnrefAndTryDelete(); + } + } + return s; + } + } + const bool needs_to_join_write_thread = !entered_write_thread; + FlushRequest flush_req; + autovector cfds; + { + WriteContext context; + InstrumentedMutexLock guard_lock(&mutex_); + + WriteThread::Writer w; + WriteThread::Writer nonmem_w; + if (needs_to_join_write_thread) { + write_thread_.EnterUnbatched(&w, &mutex_); + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + } + WaitForPendingWrites(); + + SelectColumnFamiliesForAtomicFlush(&cfds, candidate_cfds); + + // Unref the newly generated candidate cfds (when not provided) in + // `candidate_cfds` + if (provided_candidate_cfds.empty()) { + for (auto candidate_cfd : candidate_cfds) { + candidate_cfd->UnrefAndTryDelete(); + } + } + + for (auto cfd : cfds) { + if ((cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) || + flush_reason == FlushReason::kErrorRecoveryRetryFlush) { + continue; + } + cfd->Ref(); + s = SwitchMemtable(cfd, &context); + cfd->UnrefAndTryDelete(); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + AssignAtomicFlushSeq(cfds); + for (auto cfd : cfds) { + cfd->imm()->FlushRequested(); + } + // If the caller wants to wait for this flush to complete, it indicates + // that the caller expects the ColumnFamilyData not to be free'ed by + // other threads which may drop the column family concurrently. + // Therefore, we increase the cfd's ref count. 
+      if (flush_options.wait) {
+        for (auto cfd : cfds) {
+          cfd->Ref();
+        }
+      }
+      GenerateFlushRequest(cfds, flush_reason, &flush_req);
+      SchedulePendingFlush(flush_req);
+      MaybeScheduleFlushOrCompaction();
+    }
+
+    if (needs_to_join_write_thread) {
+      write_thread_.ExitUnbatched(&w);
+      if (two_write_queues_) {
+        nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush");
+  TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush");
+  if (s.ok() && flush_options.wait) {
+    autovector<const uint64_t*> flush_memtable_ids;
+    for (auto& iter : flush_req.cfd_to_max_mem_id_to_persist) {
+      flush_memtable_ids.push_back(&(iter.second));
+    }
+    s = WaitForFlushMemTables(
+        cfds, flush_memtable_ids,
+        (flush_reason == FlushReason::kErrorRecovery ||
+         flush_reason == FlushReason::kErrorRecoveryRetryFlush));
+    InstrumentedMutexLock lock_guard(&mutex_);
+    for (auto* cfd : cfds) {
+      cfd->UnrefAndTryDelete();
+    }
+  }
+  return s;
+}
+
+// Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine,
+// can cause a write stall, for example if one memtable is being flushed
+// already. This method tries to avoid a write stall (similar to
+// CompactRange() behavior): it emulates how the SuperVersion / LSM would
+// change if the flush happened, checks that against various constraints,
+// and delays the flush if it would cause a write stall.
+// Caller should check status and flush_needed to see if the flush already
+// happened.
+Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+                                                 bool* flush_needed) {
+  {
+    *flush_needed = true;
+    InstrumentedMutexLock l(&mutex_);
+    uint64_t orig_active_memtable_id = cfd->mem()->GetID();
+    WriteStallCondition write_stall_condition = WriteStallCondition::kNormal;
+    do {
+      if (write_stall_condition != WriteStallCondition::kNormal) {
+        // Same error handling as user writes: Don't wait if there's a
+        // background error, even if it's a soft error. We might wait here
+        // indefinitely as the pending flushes/compactions may never finish
+        // successfully, resulting in the stall condition lasting indefinitely
+        if (error_handler_.IsBGWorkStopped()) {
+          return error_handler_.GetBGError();
+        }
+
+        TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait");
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "[%s] WaitUntilFlushWouldNotStallWrites"
+                       " waiting on stall conditions to clear",
+                       cfd->GetName().c_str());
+        bg_cv_.Wait();
+      }
+      if (cfd->IsDropped()) {
+        return Status::ColumnFamilyDropped();
+      }
+      if (shutting_down_.load(std::memory_order_acquire)) {
+        return Status::ShutdownInProgress();
+      }
+
+      uint64_t earliest_memtable_id =
+          std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID());
+      if (earliest_memtable_id > orig_active_memtable_id) {
+        // We waited so long that the memtable we were originally waiting on
+        // was flushed.
+        *flush_needed = false;
+        return Status::OK();
+      }
+
+      const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+      const auto* vstorage = cfd->current()->storage_info();
+
+      // Skip the stalling check if we're below the auto-flush and
+      // auto-compaction triggers. If it stalled in these conditions, that'd
+      // mean the stall triggers are so low that stalling is needed for any
+      // background work. In that case we shouldn't wait since background
+      // work won't be scheduled.
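+      //
+      // Worked example (illustrative, default-style options): with
+      // min_write_buffer_number_to_merge == 1 and
+      // level0_file_num_compaction_trigger == 4, a CF with no unflushed
+      // immutable memtables and 2 L0 files takes this early break: one
+      // more memtable (0 < 1) and one more L0 file (2 < 4) are still
+      // below both triggers, so there is nothing to wait for.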
+      if (cfd->imm()->NumNotFlushed() <
+              cfd->ioptions()->min_write_buffer_number_to_merge &&
+          vstorage->l0_delay_trigger_count() <
+              mutable_cf_options.level0_file_num_compaction_trigger) {
+        break;
+      }
+
+      // check whether one extra immutable memtable or an extra L0 file would
+      // cause write stalling mode to be entered. It could still enter stall
+      // mode due to pending compaction bytes, but that's less common
+      write_stall_condition =
+          ColumnFamilyData::GetWriteStallConditionAndCause(
+              cfd->imm()->NumNotFlushed() + 1,
+              vstorage->l0_delay_trigger_count() + 1,
+              vstorage->estimated_compaction_needed_bytes(),
+              mutable_cf_options, *cfd->ioptions())
+              .first;
+    } while (write_stall_condition != WriteStallCondition::kNormal);
+  }
+  return Status::OK();
+}
+
+// Wait for memtables to be flushed for multiple column families.
+// let N = cfds.size()
+// for i in [0, N),
+//  1) if flush_memtable_ids[i] is not null, then the memtables with lower
+//     IDs have to be flushed for THIS column family;
+//  2) if flush_memtable_ids[i] is null, then all memtables in THIS column
+//     family have to be flushed.
+// Finish waiting when ALL column families finish flushing memtables.
+// resuming_from_bg_err indicates whether the caller is trying to resume from
+// a background error or is in normal processing.
+Status DBImpl::WaitForFlushMemTables(
+    const autovector<ColumnFamilyData*>& cfds,
+    const autovector<const uint64_t*>& flush_memtable_ids,
+    bool resuming_from_bg_err) {
+  int num = static_cast<int>(cfds.size());
+  // Wait until the flush completes
+  InstrumentedMutexLock l(&mutex_);
+  Status s;
+  // If the caller is trying to resume from bg error, then
+  // error_handler_.IsDBStopped() is true.
+  while (resuming_from_bg_err || !error_handler_.IsDBStopped()) {
+    if (shutting_down_.load(std::memory_order_acquire)) {
+      s = Status::ShutdownInProgress();
+      return s;
+    }
+    // If an error has occurred during resumption, then there is no need to
+    // wait. But the flush operation may fail because of this error, so we
+    // need to return the status.
+    if (!error_handler_.GetRecoveryError().ok()) {
+      s = error_handler_.GetRecoveryError();
+      break;
+    }
+    // If BG work is stopped, it indicates a BG error that is 1) a soft
+    // error that requires no BG work, and 2) not being handled by auto
+    // recovery (auto_recovery_)
+    if (!resuming_from_bg_err && error_handler_.IsBGWorkStopped() &&
+        error_handler_.GetBGError().severity() <
+            Status::Severity::kHardError) {
+      s = error_handler_.GetBGError();
+      return s;
+    }
+
+    // Number of column families that have been dropped.
+    int num_dropped = 0;
+    // Number of column families that have finished flush.
+    int num_finished = 0;
+    for (int i = 0; i < num; ++i) {
+      if (cfds[i]->IsDropped()) {
+        ++num_dropped;
+      } else if (cfds[i]->imm()->NumNotFlushed() == 0 ||
+                 (flush_memtable_ids[i] != nullptr &&
+                  cfds[i]->imm()->GetEarliestMemTableID() >
+                      *flush_memtable_ids[i])) {
+        ++num_finished;
+      }
+    }
+    if (1 == num_dropped && 1 == num) {
+      s = Status::ColumnFamilyDropped();
+      return s;
+    }
+    // Column families involved in this flush request have either been
+    // dropped or finished flush. Then it's time to finish waiting.
+    if (num_dropped + num_finished == num) {
+      break;
+    }
+    bg_cv_.Wait();
+  }
+  // If not resuming from bg error, and an error has caused the DB to stop,
+  // then report the bg error to the caller.
+ if (!resuming_from_bg_err && error_handler_.IsDBStopped()) { + s = error_handler_.GetBGError(); + } + return s; +} + +Status DBImpl::EnableAutoCompaction( + const std::vector& column_family_handles) { + Status s; + for (auto cf_ptr : column_family_handles) { + Status status = + this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}}); + if (!status.ok()) { + s = status; + } + } + + return s; +} + +// NOTE: Calling DisableManualCompaction() may overwrite the +// user-provided canceled variable in CompactRangeOptions +void DBImpl::DisableManualCompaction() { + InstrumentedMutexLock l(&mutex_); + manual_compaction_paused_.fetch_add(1, std::memory_order_release); + + // Mark the canceled as true when the cancellation is triggered by + // manual_compaction_paused (may overwrite user-provided `canceled`) + for (const auto& manual_compaction : manual_compaction_dequeue_) { + manual_compaction->canceled = true; + } + + // Wake up manual compactions waiting to start. + bg_cv_.SignalAll(); + + // Wait for any pending manual compactions to finish (typically through + // failing with `Status::Incomplete`) prior to returning. This way we are + // guaranteed no pending manual compaction will commit while manual + // compactions are "disabled". + while (HasPendingManualCompaction()) { + bg_cv_.Wait(); + } +} + +// NOTE: In contrast to DisableManualCompaction(), calling +// EnableManualCompaction() does NOT overwrite the user-provided *canceled +// variable to be false since there is NO CHANCE a canceled compaction +// is uncanceled. In other words, a canceled compaction must have been +// dropped out of the manual compaction queue, when we disable it. +void DBImpl::EnableManualCompaction() { + InstrumentedMutexLock l(&mutex_); + assert(manual_compaction_paused_ > 0); + manual_compaction_paused_.fetch_sub(1, std::memory_order_release); +} + +void DBImpl::MaybeScheduleFlushOrCompaction() { + mutex_.AssertHeld(); + if (!opened_successfully_) { + // Compaction may introduce data race to DB open + return; + } + if (bg_work_paused_ > 0) { + // we paused the background work + return; + } else if (error_handler_.IsBGWorkStopped() && + !error_handler_.IsRecoveryInProgress()) { + // There has been a hard error and this call is not part of the recovery + // sequence. Bail out here so we don't get into an endless loop of + // scheduling BG work which will again call this function + return; + } else if (shutting_down_.load(std::memory_order_acquire)) { + // DB is being deleted; no more background compactions + return; + } + auto bg_job_limits = GetBGJobLimits(); + bool is_flush_pool_empty = + env_->GetBackgroundThreads(Env::Priority::HIGH) == 0; + while (!is_flush_pool_empty && unscheduled_flushes_ > 0 && + bg_flush_scheduled_ < bg_job_limits.max_flushes) { + bg_flush_scheduled_++; + FlushThreadArg* fta = new FlushThreadArg; + fta->db_ = this; + fta->thread_pri_ = Env::Priority::HIGH; + env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::HIGH, this, + &DBImpl::UnscheduleFlushCallback); + --unscheduled_flushes_; + TEST_SYNC_POINT_CALLBACK( + "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0", + &unscheduled_flushes_); + } + + // special case -- if high-pri (flush) thread pool is empty, then schedule + // flushes in low-pri (compaction) thread pool. 
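+  //
+  // (Illustrative: the HIGH pool is empty when the Env was configured with
+  //  no high-priority background threads, e.g. something like
+  //    env->SetBackgroundThreads(0, Env::Priority::HIGH);
+  //  in that case the fallback below runs flushes in the LOW pool.)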
+ if (is_flush_pool_empty) { + while (unscheduled_flushes_ > 0 && + bg_flush_scheduled_ + bg_compaction_scheduled_ < + bg_job_limits.max_flushes) { + bg_flush_scheduled_++; + FlushThreadArg* fta = new FlushThreadArg; + fta->db_ = this; + fta->thread_pri_ = Env::Priority::LOW; + env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::LOW, this, + &DBImpl::UnscheduleFlushCallback); + --unscheduled_flushes_; + } + } + + if (bg_compaction_paused_ > 0) { + // we paused the background compaction + return; + } else if (error_handler_.IsBGWorkStopped()) { + // Compaction is not part of the recovery sequence from a hard error. We + // might get here because recovery might do a flush and install a new + // super version, which will try to schedule pending compactions. Bail + // out here and let the higher level recovery handle compactions + return; + } + + if (HasExclusiveManualCompaction()) { + // only manual compactions are allowed to run. don't schedule automatic + // compactions + TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Conflict"); + return; + } + + while (bg_compaction_scheduled_ + bg_bottom_compaction_scheduled_ < + bg_job_limits.max_compactions && + unscheduled_compactions_ > 0) { + CompactionArg* ca = new CompactionArg; + ca->db = this; + ca->compaction_pri_ = Env::Priority::LOW; + ca->prepicked_compaction = nullptr; + bg_compaction_scheduled_++; + unscheduled_compactions_--; + env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, + &DBImpl::UnscheduleCompactionCallback); + } +} + +DBImpl::BGJobLimits DBImpl::GetBGJobLimits() const { + mutex_.AssertHeld(); + return GetBGJobLimits(mutable_db_options_.max_background_flushes, + mutable_db_options_.max_background_compactions, + mutable_db_options_.max_background_jobs, + write_controller_.NeedSpeedupCompaction()); +} + +DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes, + int max_background_compactions, + int max_background_jobs, + bool parallelize_compactions) { + BGJobLimits res; + if (max_background_flushes == -1 && max_background_compactions == -1) { + // for our first stab implementing max_background_jobs, simply allocate a + // quarter of the threads to flushes. 
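+    //
+    // Worked example (illustrative): max_background_jobs == 8 yields
+    // max_flushes == 2 and max_compactions == 6; if compactions are not
+    // parallelized (see below), max_compactions is clamped to 1.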
+ res.max_flushes = std::max(1, max_background_jobs / 4); + res.max_compactions = std::max(1, max_background_jobs - res.max_flushes); + } else { + // compatibility code in case users haven't migrated to max_background_jobs, + // which automatically computes flush/compaction limits + res.max_flushes = std::max(1, max_background_flushes); + res.max_compactions = std::max(1, max_background_compactions); + } + if (!parallelize_compactions) { + // throttle background compactions until we deem necessary + res.max_compactions = 1; + } + return res; +} + +void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) { + assert(!cfd->queued_for_compaction()); + cfd->Ref(); + compaction_queue_.push_back(cfd); + cfd->set_queued_for_compaction(true); +} + +ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() { + assert(!compaction_queue_.empty()); + auto cfd = *compaction_queue_.begin(); + compaction_queue_.pop_front(); + assert(cfd->queued_for_compaction()); + cfd->set_queued_for_compaction(false); + return cfd; +} + +DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() { + assert(!flush_queue_.empty()); + FlushRequest flush_req = flush_queue_.front(); + flush_queue_.pop_front(); + if (!immutable_db_options_.atomic_flush) { + assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1); + } + for (const auto& elem : flush_req.cfd_to_max_mem_id_to_persist) { + if (!immutable_db_options_.atomic_flush) { + ColumnFamilyData* cfd = elem.first; + assert(cfd); + assert(cfd->queued_for_flush()); + cfd->set_queued_for_flush(false); + } + } + return flush_req; +} + +ColumnFamilyData* DBImpl::PickCompactionFromQueue( + std::unique_ptr* token, LogBuffer* log_buffer) { + assert(!compaction_queue_.empty()); + assert(*token == nullptr); + autovector throttled_candidates; + ColumnFamilyData* cfd = nullptr; + while (!compaction_queue_.empty()) { + auto first_cfd = *compaction_queue_.begin(); + compaction_queue_.pop_front(); + assert(first_cfd->queued_for_compaction()); + if (!RequestCompactionToken(first_cfd, false, token, log_buffer)) { + throttled_candidates.push_back(first_cfd); + continue; + } + cfd = first_cfd; + cfd->set_queued_for_compaction(false); + break; + } + // Add throttled compaction candidates back to queue in the original order. + for (auto iter = throttled_candidates.rbegin(); + iter != throttled_candidates.rend(); ++iter) { + compaction_queue_.push_front(*iter); + } + return cfd; +} + +void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) { + mutex_.AssertHeld(); + if (flush_req.cfd_to_max_mem_id_to_persist.empty()) { + return; + } + if (!immutable_db_options_.atomic_flush) { + // For the non-atomic flush case, we never schedule multiple column + // families in the same flush request. 
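+    //
+    // (Illustrative: a non-atomic request thus carries exactly one entry,
+    //  { {cfd, max_memtable_id} }, while the atomic-flush branch below may
+    //  carry one entry per column family being flushed together.)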
+ assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1); + ColumnFamilyData* cfd = + flush_req.cfd_to_max_mem_id_to_persist.begin()->first; + assert(cfd); + + if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) { + cfd->Ref(); + cfd->set_queued_for_flush(true); + ++unscheduled_flushes_; + flush_queue_.push_back(flush_req); + } + } else { + for (auto& iter : flush_req.cfd_to_max_mem_id_to_persist) { + ColumnFamilyData* cfd = iter.first; + cfd->Ref(); + } + ++unscheduled_flushes_; + flush_queue_.push_back(flush_req); + } +} + +void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { + mutex_.AssertHeld(); + if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) { + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + } +} + +void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync, + FileType type, uint64_t number, int job_id) { + mutex_.AssertHeld(); + PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id); + purge_files_.insert({{number, std::move(file_info)}}); +} + +void DBImpl::BGWorkFlush(void* arg) { + FlushThreadArg fta = *(reinterpret_cast(arg)); + delete reinterpret_cast(arg); + + IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_); + TEST_SYNC_POINT("DBImpl::BGWorkFlush"); + static_cast_with_check(fta.db_)->BackgroundCallFlush(fta.thread_pri_); + TEST_SYNC_POINT("DBImpl::BGWorkFlush:done"); +} + +void DBImpl::BGWorkCompaction(void* arg) { + CompactionArg ca = *(reinterpret_cast(arg)); + delete reinterpret_cast(arg); + IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW); + TEST_SYNC_POINT("DBImpl::BGWorkCompaction"); + auto prepicked_compaction = + static_cast(ca.prepicked_compaction); + static_cast_with_check(ca.db)->BackgroundCallCompaction( + prepicked_compaction, Env::Priority::LOW); + delete prepicked_compaction; +} + +void DBImpl::BGWorkBottomCompaction(void* arg) { + CompactionArg ca = *(static_cast(arg)); + delete static_cast(arg); + IOSTATS_SET_THREAD_POOL_ID(Env::Priority::BOTTOM); + TEST_SYNC_POINT("DBImpl::BGWorkBottomCompaction"); + auto* prepicked_compaction = ca.prepicked_compaction; + assert(prepicked_compaction && prepicked_compaction->compaction); + ca.db->BackgroundCallCompaction(prepicked_compaction, Env::Priority::BOTTOM); + delete prepicked_compaction; +} + +void DBImpl::BGWorkPurge(void* db) { + IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH); + TEST_SYNC_POINT("DBImpl::BGWorkPurge:start"); + reinterpret_cast(db)->BackgroundCallPurge(); + TEST_SYNC_POINT("DBImpl::BGWorkPurge:end"); +} + +void DBImpl::UnscheduleCompactionCallback(void* arg) { + CompactionArg* ca_ptr = reinterpret_cast(arg); + Env::Priority compaction_pri = ca_ptr->compaction_pri_; + if (Env::Priority::BOTTOM == compaction_pri) { + // Decrement bg_bottom_compaction_scheduled_ if priority is BOTTOM + ca_ptr->db->bg_bottom_compaction_scheduled_--; + } else if (Env::Priority::LOW == compaction_pri) { + // Decrement bg_compaction_scheduled_ if priority is LOW + ca_ptr->db->bg_compaction_scheduled_--; + } + CompactionArg ca = *(ca_ptr); + delete reinterpret_cast(arg); + if (ca.prepicked_compaction != nullptr) { + // if it's a manual compaction, set status to ManualCompactionPaused + if (ca.prepicked_compaction->manual_compaction_state) { + ca.prepicked_compaction->manual_compaction_state->done = true; + ca.prepicked_compaction->manual_compaction_state->status = + Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + if (ca.prepicked_compaction->compaction != nullptr) { + ca.prepicked_compaction->compaction->ReleaseCompactionFiles( + 
Status::Incomplete(Status::SubCode::kManualCompactionPaused)); + delete ca.prepicked_compaction->compaction; + } + delete ca.prepicked_compaction; + } + TEST_SYNC_POINT("DBImpl::UnscheduleCompactionCallback"); +} + +void DBImpl::UnscheduleFlushCallback(void* arg) { + // Decrement bg_flush_scheduled_ in flush callback + reinterpret_cast(arg)->db_->bg_flush_scheduled_--; + Env::Priority flush_pri = reinterpret_cast(arg)->thread_pri_; + if (Env::Priority::LOW == flush_pri) { + TEST_SYNC_POINT("DBImpl::UnscheduleLowFlushCallback"); + } else if (Env::Priority::HIGH == flush_pri) { + TEST_SYNC_POINT("DBImpl::UnscheduleHighFlushCallback"); + } + delete reinterpret_cast(arg); + TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback"); +} + +Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, + LogBuffer* log_buffer, FlushReason* reason, + Env::Priority thread_pri) { + mutex_.AssertHeld(); + + Status status; + *reason = FlushReason::kOthers; + // If BG work is stopped due to an error, but a recovery is in progress, + // that means this flush is part of the recovery. So allow it to go through + if (!error_handler_.IsBGWorkStopped()) { + if (shutting_down_.load(std::memory_order_acquire)) { + status = Status::ShutdownInProgress(); + } + } else if (!error_handler_.IsRecoveryInProgress()) { + status = error_handler_.GetBGError(); + } + + if (!status.ok()) { + return status; + } + + autovector bg_flush_args; + std::vector& superversion_contexts = + job_context->superversion_contexts; + autovector column_families_not_to_flush; + while (!flush_queue_.empty()) { + // This cfd is already referenced + const FlushRequest& flush_req = PopFirstFromFlushQueue(); + FlushReason flush_reason = flush_req.flush_reason; + superversion_contexts.clear(); + superversion_contexts.reserve( + flush_req.cfd_to_max_mem_id_to_persist.size()); + + for (const auto& iter : flush_req.cfd_to_max_mem_id_to_persist) { + ColumnFamilyData* cfd = iter.first; + if (cfd->GetMempurgeUsed()) { + // If imm() contains silent memtables (e.g.: because + // MemPurge was activated), requesting a flush will + // mark the imm_needed as true. 
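+        //
+        // (MemPurge is the experimental in-memory garbage-collection path
+        //  enabled via the experimental_mempurge_threshold column family
+        //  option; a "silent" memtable is one that was purged in memory
+        //  rather than flushed to an SST file.)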
+ cfd->imm()->FlushRequested(); + } + + if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) { + // can't flush this CF, try next one + column_families_not_to_flush.push_back(cfd); + continue; + } + superversion_contexts.emplace_back(SuperVersionContext(true)); + bg_flush_args.emplace_back(cfd, iter.second, + &(superversion_contexts.back()), flush_reason); + } + if (!bg_flush_args.empty()) { + break; + } + } + + if (!bg_flush_args.empty()) { + auto bg_job_limits = GetBGJobLimits(); + for (const auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + ROCKS_LOG_BUFFER( + log_buffer, + "Calling FlushMemTableToOutputFile with column " + "family [%s], flush slots available %d, compaction slots available " + "%d, " + "flush slots scheduled %d, compaction slots scheduled %d", + cfd->GetName().c_str(), bg_job_limits.max_flushes, + bg_job_limits.max_compactions, bg_flush_scheduled_, + bg_compaction_scheduled_); + } + status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress, + job_context, log_buffer, thread_pri); + TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush"); +// All the CFD/bg_flush_arg in the FlushReq must have the same flush reason, so +// just grab the first one +#ifndef NDEBUG + for (const auto& bg_flush_arg : bg_flush_args) { + assert(bg_flush_arg.flush_reason_ == bg_flush_args[0].flush_reason_); + } +#endif /* !NDEBUG */ + *reason = bg_flush_args[0].flush_reason_; + for (auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + if (cfd->UnrefAndTryDelete()) { + arg.cfd_ = nullptr; + } + } + } + for (auto cfd : column_families_not_to_flush) { + cfd->UnrefAndTryDelete(); + } + return status; +} + +void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { + bool made_progress = false; + JobContext job_context(next_job_id_.fetch_add(1), true); + + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCallFlush:start", nullptr); + + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:1"); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:2"); + { + InstrumentedMutexLock l(&mutex_); + assert(bg_flush_scheduled_); + num_running_flushes_++; + + std::unique_ptr::iterator> + pending_outputs_inserted_elem(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + FlushReason reason; + + Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer, + &reason, thread_pri); + if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() && + reason != FlushReason::kErrorRecovery) { + // Wait a little bit before retrying background flush in + // case this is an environmental problem and we do not want to + // chew up resources for failed flushes for the duration of + // the problem. + uint64_t error_cnt = + default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + mutex_.Unlock(); + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Waiting after background flush error: %s" + "Accumulated background error counts: %" PRIu64, + s.ToString().c_str(), error_cnt); + log_buffer.FlushBufferToLog(); + LogFlush(immutable_db_options_.info_log); + immutable_db_options_.clock->SleepForMicroseconds(1000000); + mutex_.Lock(); + } + + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0"); + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + + // If flush failed, we want to delete all temporary files that we might have + // created. 
Thus, we force full scan in FindObsoleteFiles()
+    FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+                                        !s.IsColumnFamilyDropped());
+    // delete unnecessary files if any, this is done outside the mutex
+    if (job_context.HaveSomethingToClean() ||
+        job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+      mutex_.Unlock();
+      TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound");
+      // Have to flush the info logs before bg_flush_scheduled_--
+      // because if bg_flush_scheduled_ becomes 0 and the lock is
+      // released, the destructor of DB can kick in and destroy all the
+      // states of DB so info_log might not be available after that point.
+      // It also applies to access other states that DB owns.
+      log_buffer.FlushBufferToLog();
+      if (job_context.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(job_context);
+      }
+      job_context.Clean();
+      mutex_.Lock();
+    }
+    TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp");
+
+    assert(num_running_flushes_ > 0);
+    num_running_flushes_--;
+    bg_flush_scheduled_--;
+    // See if there's more work to be done
+    MaybeScheduleFlushOrCompaction();
+    atomic_flush_install_cv_.SignalAll();
+    bg_cv_.SignalAll();
+    // IMPORTANT: there should be no code after calling SignalAll. This call
+    // may signal the DB destructor that it's OK to proceed with destruction.
+    // In that case, all DB variables will be deallocated and referencing
+    // them will cause trouble.
+  }
+}
+
+void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+                                      Env::Priority bg_thread_pri) {
+  bool made_progress = false;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  TEST_SYNC_POINT("BackgroundCallCompaction:0");
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    num_running_compactions_++;
+
+    std::unique_ptr<std::list<uint64_t>::iterator>
+        pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+            CaptureCurrentFileNumberInPendingOutputs()));
+
+    assert((bg_thread_pri == Env::Priority::BOTTOM &&
+            bg_bottom_compaction_scheduled_) ||
+           (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_));
+    Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer,
+                                    prepicked_compaction, bg_thread_pri);
+    TEST_SYNC_POINT("BackgroundCallCompaction:1");
+    if (s.IsBusy()) {
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      immutable_db_options_.clock->SleepForMicroseconds(
+          10000);  // prevent hot loop
+      mutex_.Lock();
+    } else if (!s.ok() && !s.IsShutdownInProgress() &&
+               !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) {
+      // Wait a little bit before retrying background compaction in
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed compactions for the duration of
+      // the problem.
+      uint64_t error_cnt =
+          default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      log_buffer.FlushBufferToLog();
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Waiting after background compaction error: %s, "
+                      "Accumulated background error counts: %" PRIu64,
+                      s.ToString().c_str(), error_cnt);
+      LogFlush(immutable_db_options_.info_log);
+      immutable_db_options_.clock->SleepForMicroseconds(1000000);
+      mutex_.Lock();
+    } else if (s.IsManualCompactionPaused()) {
+      assert(prepicked_compaction);
+      ManualCompactionState* m = prepicked_compaction->manual_compaction_state;
+      assert(m);
+      ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused",
+                       m->cfd->GetName().c_str(), job_context.job_id);
+    }
+
+    ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+    // If compaction failed, we want to delete all temporary files that we
+    // might have created (they might not be all recorded in job_context in
+    // case of a failure). Thus, we force full scan in FindObsoleteFiles()
+    FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+                                        !s.IsManualCompactionPaused() &&
+                                        !s.IsColumnFamilyDropped() &&
+                                        !s.IsBusy());
+    TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles");
+
+    // delete unnecessary files if any, this is done outside the mutex
+    if (job_context.HaveSomethingToClean() ||
+        job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+      mutex_.Unlock();
+      // Have to flush the info logs before bg_compaction_scheduled_--
+      // because if bg_compaction_scheduled_ becomes 0 and the lock is
+      // released, the destructor of DB can kick in and destroy all the
+      // states of DB so info_log might not be available after that point.
+      // The same applies to accessing other state that the DB owns.
+      log_buffer.FlushBufferToLog();
+      if (job_context.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(job_context);
+        TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles");
+      }
+      job_context.Clean();
+      mutex_.Lock();
+    }
+
+    assert(num_running_compactions_ > 0);
+    num_running_compactions_--;
+
+    if (bg_thread_pri == Env::Priority::LOW) {
+      bg_compaction_scheduled_--;
+    } else {
+      assert(bg_thread_pri == Env::Priority::BOTTOM);
+      bg_bottom_compaction_scheduled_--;
+    }
+
+    // See if there's more work to be done
+    MaybeScheduleFlushOrCompaction();
+
+    if (prepicked_compaction != nullptr &&
+        prepicked_compaction->task_token != nullptr) {
+      // Releasing task tokens affects (and asserts on) the DB state, so
+      // must be done before we potentially signal the DB close process to
+      // proceed below.
+      prepicked_compaction->task_token.reset();
+    }
+
+    if (made_progress ||
+        (bg_compaction_scheduled_ == 0 &&
+         bg_bottom_compaction_scheduled_ == 0) ||
+        HasPendingManualCompaction() || unscheduled_compactions_ == 0) {
+      // signal if
+      // * made_progress -- need to wakeup DelayWrite
+      // * bg_{bottom,}_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl
+      // * HasPendingManualCompaction -- need to wakeup RunManualCompaction
+      // If none of this is true, there is no need to signal since nobody is
+      // waiting for it
+      bg_cv_.SignalAll();
+    }
+    // IMPORTANT: there should be no code after calling SignalAll. This call may
+    // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be deallocated and referencing them
+    // will cause trouble.
+  }
+}
+
+Status DBImpl::BackgroundCompaction(bool* made_progress,
+                                    JobContext* job_context,
+                                    LogBuffer* log_buffer,
+                                    PrepickedCompaction* prepicked_compaction,
+                                    Env::Priority thread_pri) {
+  ManualCompactionState* manual_compaction =
+      prepicked_compaction == nullptr
+          ? nullptr
+          : prepicked_compaction->manual_compaction_state;
+  *made_progress = false;
+  mutex_.AssertHeld();
+  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start");
+
+  const ReadOptions read_options(Env::IOActivity::kCompaction);
+
+  bool is_manual = (manual_compaction != nullptr);
+  std::unique_ptr<Compaction> c;
+  if (prepicked_compaction != nullptr &&
+      prepicked_compaction->compaction != nullptr) {
+    c.reset(prepicked_compaction->compaction);
+  }
+  bool is_prepicked = is_manual || c;
+
+  // (manual_compaction->in_progress == false);
+  bool trivial_move_disallowed =
+      is_manual && manual_compaction->disallow_trivial_move;
+
+  CompactionJobStats compaction_job_stats;
+  Status status;
+  if (!error_handler_.IsBGWorkStopped()) {
+    if (shutting_down_.load(std::memory_order_acquire)) {
+      status = Status::ShutdownInProgress();
+    } else if (is_manual &&
+               manual_compaction->canceled.load(std::memory_order_acquire)) {
+      status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+    }
+  } else {
+    status = error_handler_.GetBGError();
+    // If we get here, it means a hard error happened after this compaction
+    // was scheduled by MaybeScheduleFlushOrCompaction(), but before it got
+    // a chance to execute. Since we didn't pop a cfd from the compaction
+    // queue, increment unscheduled_compactions_
+    unscheduled_compactions_++;
+  }
+
+  if (!status.ok()) {
+    if (is_manual) {
+      manual_compaction->status = status;
+      manual_compaction->done = true;
+      manual_compaction->in_progress = false;
+      manual_compaction = nullptr;
+    }
+    if (c) {
+      c->ReleaseCompactionFiles(status);
+      c.reset();
+    }
+    return status;
+  }
+
+  if (is_manual) {
+    // another thread cannot pick up the same work
+    manual_compaction->in_progress = true;
+  }
+
+  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:InProgress");
+
+  std::unique_ptr<TaskLimiterToken> task_token;
+
+  // InternalKey manual_end_storage;
+  // InternalKey* manual_end = &manual_end_storage;
+  bool sfm_reserved_compact_space = false;
+  if (is_manual) {
+    ManualCompactionState* m = manual_compaction;
+    assert(m->in_progress);
+    if (!c) {
+      m->done = true;
+      m->manual_end = nullptr;
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "[%s] Manual compaction from level-%d from %s .. "
+          "%s; nothing to do\n",
+          m->cfd->GetName().c_str(), m->input_level,
+          (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"),
+          (m->end ? m->end->DebugString(true).c_str() : "(end)"));
+    } else {
+      // First check if we have enough room to do the compaction
+      bool enough_room = EnoughRoomForCompaction(
+          m->cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+      if (!enough_room) {
+        // Then don't do the compaction
+        c->ReleaseCompactionFiles(status);
+        c.reset();
+        // m's vars will get set properly at the end of this function,
+        // as long as status == CompactionTooLarge
+        status = Status::CompactionTooLarge();
+      } else {
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] Manual compaction from level-%d to level-%d from %s .. "
+            "%s; will stop at %s\n",
+            m->cfd->GetName().c_str(), m->input_level, c->output_level(),
+            (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"),
+            (m->end ? m->end->DebugString(true).c_str() : "(end)"),
+            ((m->done || m->manual_end == nullptr)
+                 ?
"(end)" + : m->manual_end->DebugString(true).c_str())); + } + } + } else if (!is_prepicked && !compaction_queue_.empty()) { + if (HasExclusiveManualCompaction()) { + // Can't compact right now, but try again later + TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict"); + + // Stay in the compaction queue. + unscheduled_compactions_++; + + return Status::OK(); + } + + auto cfd = PickCompactionFromQueue(&task_token, log_buffer); + if (cfd == nullptr) { + // Can't find any executable task from the compaction queue. + // All tasks have been throttled by compaction thread limiter. + ++unscheduled_compactions_; + return Status::Busy(); + } + + // We unreference here because the following code will take a Ref() on + // this cfd if it is going to use it (Compaction class holds a + // reference). + // This will all happen under a mutex so we don't have to be afraid of + // somebody else deleting it. + if (cfd->UnrefAndTryDelete()) { + // This was the last reference of the column family, so no need to + // compact. + return Status::OK(); + } + + // Pick up latest mutable CF Options and use it throughout the + // compaction job + // Compaction makes a copy of the latest MutableCFOptions. It should be used + // throughout the compaction procedure to make sure consistency. It will + // eventually be installed into SuperVersion + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) { + // NOTE: try to avoid unnecessary copy of MutableCFOptions if + // compaction is not necessary. Need to make sure mutex is held + // until we make a copy in the following code + TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction"); + c.reset(cfd->PickCompaction(*mutable_cf_options, mutable_db_options_, + log_buffer)); + TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); + + if (c != nullptr) { + bool enough_room = EnoughRoomForCompaction( + cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer); + + if (!enough_room) { + // Then don't do the compaction + c->ReleaseCompactionFiles(status); + c->column_family_data() + ->current() + ->storage_info() + ->ComputeCompactionScore(*(c->immutable_options()), + *(c->mutable_cf_options())); + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + + c.reset(); + // Don't need to sleep here, because BackgroundCallCompaction + // will sleep if !s.ok() + status = Status::CompactionTooLarge(); + } else { + // update statistics + size_t num_files = 0; + for (auto& each_level : *c->inputs()) { + num_files += each_level.files.size(); + } + RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION, num_files); + + // There are three things that can change compaction score: + // 1) When flush or compaction finish. This case is covered by + // InstallSuperVersionAndScheduleWork + // 2) When MutableCFOptions changes. This case is also covered by + // InstallSuperVersionAndScheduleWork, because this is when the new + // options take effect. + // 3) When we Pick a new compaction, we "remove" those files being + // compacted from the calculation, which then influences compaction + // score. Here we check if we need the new compaction even without the + // files that are currently being compacted. If we need another + // compaction, we might be able to execute it in parallel, so we add + // it to the queue and schedule a new thread. + if (cfd->NeedsCompaction()) { + // Yes, we need more compactions! 
+            AddToCompactionQueue(cfd);
+            ++unscheduled_compactions_;
+            MaybeScheduleFlushOrCompaction();
+          }
+        }
+      }
+    }
+  }
+
+  IOStatus io_s;
+  if (!c) {
+    // Nothing to do
+    ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do");
+  } else if (c->deletion_compaction()) {
+    // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old
+    // file if there is an alive snapshot pointing to it
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+                             c->column_family_data());
+    assert(c->num_input_files(1) == 0);
+    assert(c->column_family_data()->ioptions()->compaction_style ==
+           kCompactionStyleFIFO);
+
+    compaction_job_stats.num_input_files = c->num_input_files(0);
+
+    NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+                            compaction_job_stats, job_context->job_id);
+
+    for (const auto& f : *c->inputs(0)) {
+      c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
+    }
+    status = versions_->LogAndApply(
+        c->column_family_data(), *c->mutable_cf_options(), read_options,
+        c->edit(), &mutex_, directories_.GetDbDir());
+    io_s = versions_->io_status();
+    InstallSuperVersionAndScheduleWork(c->column_family_data(),
+                                       &job_context->superversion_contexts[0],
+                                       *c->mutable_cf_options());
+    ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n",
+                     c->column_family_data()->GetName().c_str(),
+                     c->num_input_files(0));
+    *made_progress = true;
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+                             c->column_family_data());
+  } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
+    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+                             c->column_family_data());
+    // Instrument for event update
+    // TODO(yhchiang): add op details for showing trivial-move.
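+    // A trivial move rewrites no data: the input files are reassigned to the
+    // output level purely through a VersionEdit (DeleteFile + AddFile below),
+    // so the only cost is a MANIFEST write.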
+ ThreadStatusUtil::SetColumnFamily(c->column_family_data()); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); + + compaction_job_stats.num_input_files = c->num_input_files(0); + + NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + + // Move files to next level + int32_t moved_files = 0; + int64_t moved_bytes = 0; + for (unsigned int l = 0; l < c->num_input_levels(); l++) { + if (c->level(l) == c->output_level()) { + continue; + } + for (size_t i = 0; i < c->num_input_files(l); i++) { + FileMetaData* f = c->input(l, i); + c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); + c->edit()->AddFile( + c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, + f->fd.largest_seqno, f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->epoch_number, f->file_checksum, + f->file_checksum_func_name, f->unique_id, + f->compensated_range_deletion_size, f->tail_size); + + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n", + c->column_family_data()->GetName().c_str(), f->fd.GetNumber(), + c->output_level(), f->fd.GetFileSize()); + ++moved_files; + moved_bytes += f->fd.GetFileSize(); + } + } + if (c->compaction_reason() == CompactionReason::kLevelMaxLevelSize && + c->immutable_options()->compaction_pri == kRoundRobin) { + int start_level = c->start_level(); + if (start_level > 0) { + auto vstorage = c->input_version()->storage_info(); + c->edit()->AddCompactCursor( + start_level, + vstorage->GetNextCompactCursor(start_level, c->num_input_files(0))); + } + } + status = versions_->LogAndApply( + c->column_family_data(), *c->mutable_cf_options(), read_options, + c->edit(), &mutex_, directories_.GetDbDir()); + io_s = versions_->io_status(); + // Use latest MutableCFOptions + InstallSuperVersionAndScheduleWork(c->column_family_data(), + &job_context->superversion_contexts[0], + *c->mutable_cf_options()); + + VersionStorageInfo::LevelSummaryStorage tmp; + c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(), + moved_bytes); + { + event_logger_.LogToBuffer(log_buffer) + << "job" << job_context->job_id << "event" + << "trivial_move" + << "destination_level" << c->output_level() << "files" << moved_files + << "total_files_size" << moved_bytes; + } + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n", + c->column_family_data()->GetName().c_str(), moved_files, + c->output_level(), moved_bytes, status.ToString().c_str(), + c->column_family_data()->current()->storage_info()->LevelSummary(&tmp)); + *made_progress = true; + + // Clear Instrument + ThreadStatusUtil::ResetThreadStatus(); + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", + c->column_family_data()); + } else if (!is_prepicked && c->output_level() > 0 && + c->output_level() == + c->column_family_data() + ->current() + ->storage_info() + ->MaxOutputLevel( + immutable_db_options_.allow_ingest_behind) && + env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) { + // Forward compactions involving last level to the bottom pool if it exists, + // such that compactions unlikely to contribute to write stalls can be + // delayed or deprioritized. 
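+    // Hand the already-picked compaction off to the bottom-priority thread
+    // pool: wrap it in a PrepickedCompaction, transfer the thread-limiter
+    // token, and reschedule; this thread then returns without running it.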
+    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool");
+    CompactionArg* ca = new CompactionArg;
+    ca->db = this;
+    ca->compaction_pri_ = Env::Priority::BOTTOM;
+    ca->prepicked_compaction = new PrepickedCompaction;
+    ca->prepicked_compaction->compaction = c.release();
+    ca->prepicked_compaction->manual_compaction_state = nullptr;
+    // Transfer requested token, so it doesn't need to be requested again.
+    ca->prepicked_compaction->task_token = std::move(task_token);
+    ++bg_bottom_compaction_scheduled_;
+    env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM,
+                   this, &DBImpl::UnscheduleCompactionCallback);
+  } else {
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+                             c->column_family_data());
+    int output_level __attribute__((__unused__));
+    output_level = c->output_level();
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial",
+                             &output_level);
+    std::vector<SequenceNumber> snapshot_seqs;
+    SequenceNumber earliest_write_conflict_snapshot;
+    SnapshotChecker* snapshot_checker;
+    GetSnapshotContext(job_context, &snapshot_seqs,
+                       &earliest_write_conflict_snapshot, &snapshot_checker);
+    assert(is_snapshot_supported_ || snapshots_.empty());
+
+    CompactionJob compaction_job(
+        job_context->job_id, c.get(), immutable_db_options_,
+        mutable_db_options_, file_options_for_compaction_, versions_.get(),
+        &shutting_down_, log_buffer, directories_.GetDbDir(),
+        GetDataDir(c->column_family_data(), c->output_path_id()),
+        GetDataDir(c->column_family_data(), 0), stats_, &mutex_,
+        &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot,
+        snapshot_checker, job_context, table_cache_, &event_logger_,
+        c->mutable_cf_options()->paranoid_file_checks,
+        c->mutable_cf_options()->report_bg_io_stats, dbname_,
+        &compaction_job_stats, thread_pri, io_tracer_,
+        is_manual ? manual_compaction->canceled
+                  : kManualCompactionCanceledFalse_,
+        db_id_, db_session_id_, c->column_family_data()->GetFullHistoryTsLow(),
+        c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_,
+        &bg_bottom_compaction_scheduled_);
+    compaction_job.Prepare();
+
+    NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+                            compaction_job_stats, job_context->job_id);
+    mutex_.Unlock();
+    TEST_SYNC_POINT_CALLBACK(
+        "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr);
+    // Should handle error?
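+    // Run()'s result is intentionally left unchecked here; the job's
+    // outcome is re-examined by Install() below, which is where failures
+    // surface as `status`.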
+    compaction_job.Run().PermitUncheckedError();
+    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
+    mutex_.Lock();
+
+    status = compaction_job.Install(*c->mutable_cf_options());
+    io_s = compaction_job.io_status();
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWork(
+          c->column_family_data(), &job_context->superversion_contexts[0],
+          *c->mutable_cf_options());
+    }
+    *made_progress = true;
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+                             c->column_family_data());
+  }
+
+  if (status.ok() && !io_s.ok()) {
+    status = io_s;
+  } else {
+    io_s.PermitUncheckedError();
+  }
+
+  if (c != nullptr) {
+    c->ReleaseCompactionFiles(status);
+    *made_progress = true;
+
+    // Need to make sure SstFileManager does its bookkeeping
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    if (sfm && sfm_reserved_compact_space) {
+      sfm->OnCompactionCompletion(c.get());
+    }
+
+    NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status,
+                                compaction_job_stats, job_context->job_id);
+  }
+
+  if (status.ok() || status.IsCompactionTooLarge() ||
+      status.IsManualCompactionPaused()) {
+    // Done
+  } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+    // Ignore compaction errors found during shutting down
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s",
+                   status.ToString().c_str());
+    if (!io_s.ok()) {
+      // Error while writing to MANIFEST.
+      // In fact, versions_->io_status() can also be the result of renaming
+      // CURRENT file. With current code, it's just difficult to tell. So just
+      // be pessimistic and try to write to a new MANIFEST.
+      // TODO: distinguish between MANIFEST write and CURRENT renaming
+      auto err_reason = versions_->io_status().ok()
+                            ? BackgroundErrorReason::kCompaction
+                            : BackgroundErrorReason::kManifestWrite;
+      error_handler_.SetBGError(io_s, err_reason);
+    } else {
+      error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+    }
+    if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) {
+      // Put this cfd back in the compaction queue so we can retry after some
+      // time
+      auto cfd = c->column_family_data();
+      assert(cfd != nullptr);
+      // Since this compaction failed, we need to recompute the score so it
+      // takes the original input files into account
+      c->column_family_data()
+          ->current()
+          ->storage_info()
+          ->ComputeCompactionScore(*(c->immutable_options()),
+                                   *(c->mutable_cf_options()));
+      if (!cfd->queued_for_compaction()) {
+        AddToCompactionQueue(cfd);
+        ++unscheduled_compactions_;
+      }
+    }
+  }
+  // this will unref its input_version and column_family_data
+  c.reset();
+
+  if (is_manual) {
+    ManualCompactionState* m = manual_compaction;
+    if (!status.ok()) {
+      m->status = status;
+      m->done = true;
+    }
+    // For universal compaction:
+    // Because universal compaction always happens at level 0, one
+    // compaction will pick up all overlapped files. No files will be
+    // filtered out due to size limit and left for a successive compaction.
+    // So we can safely conclude the current compaction.
+    //
+    // Also note that, if we don't stop here, then the current compaction
+    // writes a new file back to level 0, which will be used in successive
+    // compaction. Hence the manual compaction will never finish.
+    //
+    // Stop the compaction if manual_end points to nullptr -- this means
+    // that we compacted the whole range. manual_end should always point
+    // to nullptr in case of universal compaction
+    if (m->manual_end == nullptr) {
+      m->done = true;
+    }
+    if (!m->done) {
+      // We only compacted part of the requested range. Update *m
+      // to the range that is left to be compacted.
+      // Universal and FIFO compactions should always compact the whole range
+      assert(m->cfd->ioptions()->compaction_style !=
+                 kCompactionStyleUniversal ||
+             m->cfd->ioptions()->num_levels > 1);
+      assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO);
+      m->tmp_storage = *m->manual_end;
+      m->begin = &m->tmp_storage;
+      m->incomplete = true;
+    }
+    m->in_progress = false;  // not being processed anymore
+  }
+  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish");
+  return status;
+}
+
+bool DBImpl::HasPendingManualCompaction() {
+  return (!manual_compaction_dequeue_.empty());
+}
+
+void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) {
+  assert(manual_compaction_paused_ == 0);
+  manual_compaction_dequeue_.push_back(m);
+}
+
+void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) {
+  // Remove from queue
+  std::deque<ManualCompactionState*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  while (it != manual_compaction_dequeue_.end()) {
+    if (m == (*it)) {
+      it = manual_compaction_dequeue_.erase(it);
+      return;
+    }
+    ++it;
+  }
+  assert(false);
+  return;
+}
+
+bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) {
+  if (m->exclusive) {
+    return (bg_bottom_compaction_scheduled_ > 0 ||
+            bg_compaction_scheduled_ > 0);
+  }
+  std::deque<ManualCompactionState*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  bool seen = false;
+  while (it != manual_compaction_dequeue_.end()) {
+    if (m == (*it)) {
+      ++it;
+      seen = true;
+      continue;
+    } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) {
+      // Consider the other manual compaction *it, conflicts if:
+      // overlaps with m
+      // and (*it) is ahead in the queue and is not yet in progress
+      return true;
+    }
+    ++it;
+  }
+  return false;
+}
+
+bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) {
+  // Remove from priority queue
+  std::deque<ManualCompactionState*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  while (it != manual_compaction_dequeue_.end()) {
+    if ((*it)->exclusive) {
+      return true;
+    }
+    if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) {
+      // Allow automatic compaction if manual compaction is
+      // in progress
+      return true;
+    }
+    ++it;
+  }
+  return false;
+}
+
+bool DBImpl::HasExclusiveManualCompaction() {
+  // Remove from priority queue
+  std::deque<ManualCompactionState*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  while (it != manual_compaction_dequeue_.end()) {
+    if ((*it)->exclusive) {
+      return true;
+    }
+    ++it;
+  }
+  return false;
+}
+
+bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) {
+  if ((m->exclusive) || (m1->exclusive)) {
+    return true;
+  }
+  if (m->cfd != m1->cfd) {
+    return false;
+  }
+  return false;
+}
+
+void DBImpl::BuildCompactionJobInfo(
+    const ColumnFamilyData* cfd, Compaction* c, const Status& st,
+    const CompactionJobStats& compaction_job_stats, const int job_id,
+    const Version* current, CompactionJobInfo* compaction_job_info) const {
+  assert(compaction_job_info != nullptr);
+  compaction_job_info->cf_id = cfd->GetID();
+  compaction_job_info->cf_name = cfd->GetName();
+  compaction_job_info->status = st;
+  compaction_job_info->thread_id = env_->GetThreadID();
+  compaction_job_info->job_id = job_id;
+  compaction_job_info->base_input_level = c->start_level();
+  compaction_job_info->output_level = c->output_level();
+  compaction_job_info->stats = compaction_job_stats;
+  compaction_job_info->table_properties = c->GetOutputTableProperties();
+  compaction_job_info->compaction_reason = c->compaction_reason();
+  compaction_job_info->compression = c->output_compression();
+
+  const ReadOptions read_options(Env::IOActivity::kCompaction);
+  for (size_t i = 0; i < c->num_input_levels(); ++i) {
+    for (const auto fmd : *c->inputs(i)) {
+      const FileDescriptor& desc = fmd->fd;
+      const uint64_t file_number = desc.GetNumber();
+      auto fn = TableFileName(c->immutable_options()->cf_paths, file_number,
+                              desc.GetPathId());
+      compaction_job_info->input_files.push_back(fn);
+      compaction_job_info->input_file_infos.push_back(CompactionFileInfo{
+          static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
+      if (compaction_job_info->table_properties.count(fn) == 0) {
+        std::shared_ptr<const TableProperties> tp;
+        auto s = current->GetTableProperties(read_options, &tp, fmd, &fn);
+        if (s.ok()) {
+          compaction_job_info->table_properties[fn] = tp;
+        }
+      }
+    }
+  }
+  for (const auto& newf : c->edit()->GetNewFiles()) {
+    const FileMetaData& meta = newf.second;
+    const FileDescriptor& desc = meta.fd;
+    const uint64_t file_number = desc.GetNumber();
+    compaction_job_info->output_files.push_back(TableFileName(
+        c->immutable_options()->cf_paths, file_number, desc.GetPathId()));
+    compaction_job_info->output_file_infos.push_back(CompactionFileInfo{
+        newf.first, file_number, meta.oldest_blob_file_number});
+  }
+  compaction_job_info->blob_compression_type =
+      c->mutable_cf_options()->blob_compression_type;
+
+  // Update BlobFilesInfo.
+  for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) {
+    BlobFileAdditionInfo blob_file_addition_info(
+        BlobFileName(c->immutable_options()->cf_paths.front().path,
+                     blob_file.GetBlobFileNumber()) /*blob_file_path*/,
+        blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(),
+        blob_file.GetTotalBlobBytes());
+    compaction_job_info->blob_file_addition_infos.emplace_back(
+        std::move(blob_file_addition_info));
+  }
+
+  // Update BlobFilesGarbageInfo.
+  for (const auto& blob_file : c->edit()->GetBlobFileGarbages()) {
+    BlobFileGarbageInfo blob_file_garbage_info(
+        BlobFileName(c->immutable_options()->cf_paths.front().path,
+                     blob_file.GetBlobFileNumber()) /*blob_file_path*/,
+        blob_file.GetBlobFileNumber(), blob_file.GetGarbageBlobCount(),
+        blob_file.GetGarbageBlobBytes());
+    compaction_job_info->blob_file_garbage_infos.emplace_back(
+        std::move(blob_file_garbage_info));
+  }
+}
+
+// SuperVersionContext gets created and destructed outside of the lock --
+// we use this conveniently to:
+// * malloc one SuperVersion() outside of the lock -- new_superversion
+// * delete SuperVersion()s outside of the lock -- superversions_to_free
+//
+// However, if InstallSuperVersionAndScheduleWork() gets called twice with the
+// same sv_context, we can't reuse the SuperVersion() that got malloced,
+// because the first call already used it. In that rare case, we take a hit
+// and create a new SuperVersion() inside of the mutex. We do a similar thing
+// for superversion_to_free.
+
+void DBImpl::InstallSuperVersionAndScheduleWork(
+    ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+    const MutableCFOptions& mutable_cf_options) {
+  mutex_.AssertHeld();
+
+  // Update max_total_in_memory_state_
+  size_t old_memtable_size = 0;
+  auto* old_sv = cfd->GetSuperVersion();
+  if (old_sv) {
+    old_memtable_size = old_sv->mutable_cf_options.write_buffer_size *
+                        old_sv->mutable_cf_options.max_write_buffer_number;
+  }
+
+  // this branch is unlikely to be taken
+  if (UNLIKELY(sv_context->new_superversion == nullptr)) {
+    sv_context->NewSuperVersion();
+  }
+  cfd->InstallSuperVersion(sv_context, mutable_cf_options);
+
+  // There may be a small data race here. The snapshot tricking bottommost
+  // compaction may already be released here. But assuming there will always
+  // be newer snapshots created and released frequently, the compaction will
+  // be triggered soon anyway.
+  bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+  for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
+    if (!my_cfd->ioptions()->allow_ingest_behind) {
+      bottommost_files_mark_threshold_ = std::min(
+          bottommost_files_mark_threshold_,
+          my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
+    }
+  }
+
+  // Whenever we install a new SuperVersion, we might need to issue new
+  // flushes or compactions.
+  SchedulePendingCompaction(cfd);
+  MaybeScheduleFlushOrCompaction();
+
+  // Update max_total_in_memory_state_
+  max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size +
+                               mutable_cf_options.write_buffer_size *
+                                   mutable_cf_options.max_write_buffer_number;
+}
+
+// ShouldPurge is called by FindObsoleteFiles when doing a full scan,
+// and db mutex (mutex_) should already be held.
+// Actually, the current implementation of FindObsoleteFiles with
+// full_scan=true can issue I/O requests to obtain the list of files in
+// directories, e.g. env_->GetChildren, while holding the db mutex.
+bool DBImpl::ShouldPurge(uint64_t file_number) const {
+  return files_grabbed_for_purge_.find(file_number) ==
+             files_grabbed_for_purge_.end() &&
+         purge_files_.find(file_number) == purge_files_.end();
+}
+
+// MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex
+// (mutex_) should already be held.
+void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) {
+  files_grabbed_for_purge_.insert(file_number);
+}
+
+void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) {
+  InstrumentedMutexLock l(&mutex_);
+  // snapshot_checker_ should only be set once. If we need to set it multiple
+  // times, we need to make sure the old one is not deleted while it is still
+  // in use by a compaction job.
+  assert(!snapshot_checker_);
+  snapshot_checker_.reset(snapshot_checker);
+}
+
+void DBImpl::GetSnapshotContext(
+    JobContext* job_context, std::vector<SequenceNumber>* snapshot_seqs,
+    SequenceNumber* earliest_write_conflict_snapshot,
+    SnapshotChecker** snapshot_checker_ptr) {
+  mutex_.AssertHeld();
+  assert(job_context != nullptr);
+  assert(snapshot_seqs != nullptr);
+  assert(earliest_write_conflict_snapshot != nullptr);
+  assert(snapshot_checker_ptr != nullptr);
+
+  *snapshot_checker_ptr = snapshot_checker_.get();
+  if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) {
+    *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance();
+  }
+  if (*snapshot_checker_ptr != nullptr) {
+    // If snapshot_checker is used, that means the flush/compaction may
+    // contain values not visible to snapshots taken after the
+    // flush/compaction job starts. Take a snapshot and it will appear
+    // in snapshot_seqs and force the compaction iterator to consider such
+    // snapshots.
+    const Snapshot* job_snapshot =
+        GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/);
+    job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot));
+  }
+  *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot);
+}
+
+Status DBImpl::WaitForCompact(bool abort_on_pause) {
+  InstrumentedMutexLock l(&mutex_);
+  for (;;) {
+    if (shutting_down_.load(std::memory_order_acquire)) {
+      return Status::ShutdownInProgress();
+    }
+    if (bg_work_paused_ && abort_on_pause) {
+      return Status::Aborted();
+    }
+    if ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+         bg_flush_scheduled_ || unscheduled_compactions_ ||
+         unscheduled_flushes_) &&
+        (error_handler_.GetBGError().ok())) {
+      bg_cv_.Wait();
+    } else {
+      return error_handler_.GetBGError();
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_debug.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_debug.cc
new file mode 100644
index 0000000000..9856caced3
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_debug.cc
@@ -0,0 +1,314 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef NDEBUG
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/periodic_task_scheduler.h"
+#include "monitoring/thread_status_updater.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+uint64_t DBImpl::TEST_GetLevel0TotalSize() {
+  InstrumentedMutexLock l(&mutex_);
+  return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0);
+}
+
+Status DBImpl::TEST_SwitchWAL() {
+  WriteContext write_context;
+  InstrumentedMutexLock l(&mutex_);
+  void* writer = TEST_BeginWrite();
+  auto s = SwitchWAL(&write_context);
+  TEST_EndWrite(writer);
+  return s;
+}
+
+uint64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
+    ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    cfd = cfh->cfd();
+  }
+  InstrumentedMutexLock l(&mutex_);
+  return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes();
+}
+
+void DBImpl::TEST_GetFilesMetaData(
+    ColumnFamilyHandle* column_family,
+    std::vector<std::vector<FileMetaData>>* metadata,
+    std::vector<std::shared_ptr<BlobFileMetaData>>* blob_metadata) {
+  assert(metadata);
+
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  assert(cfh);
+
+  auto cfd = cfh->cfd();
+  assert(cfd);
+
+  InstrumentedMutexLock l(&mutex_);
+
+  const auto* current = cfd->current();
+  assert(current);
+
+  const auto* vstorage = current->storage_info();
+  assert(vstorage);
+
+  metadata->resize(NumberLevels());
+
+  for (int level = 0; level < NumberLevels(); ++level) {
+    const std::vector<FileMetaData*>& files = vstorage->LevelFiles(level);
+
+    (*metadata)[level].clear();
+    (*metadata)[level].reserve(files.size());
+
+    for (const auto& f : files) {
+      (*metadata)[level].push_back(*f);
+    }
+  }
+
+  if (blob_metadata) {
+    *blob_metadata = vstorage->GetBlobFiles();
+  }
+}
+
+uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+  return versions_->manifest_file_number();
+}
+
+uint64_t DBImpl::TEST_Current_Next_FileNo() {
+  return versions_->current_next_file_number();
+}
+
+Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
+                                 const Slice* end,
+                                 ColumnFamilyHandle* column_family,
+                                 bool disallow_trivial_move) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    cfd = cfh->cfd();
+  }
+  int output_level =
+      (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+       cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
+          ? level
+          : level + 1;
+  return RunManualCompaction(
+      cfd, level, output_level, CompactRangeOptions(), begin, end, true,
+      disallow_trivial_move,
+      std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
+      "" /*trim_ts*/);
+}
+
+Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) {
+  WriteContext write_context;
+  InstrumentedMutexLock l(&mutex_);
+  if (cfd == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  }
+
+  Status s;
+  void* writer = TEST_BeginWrite();
+  if (two_write_queues_) {
+    WriteThread::Writer nonmem_w;
+    nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+    s = SwitchMemtable(cfd, &write_context);
+    nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+  } else {
+    s = SwitchMemtable(cfd, &write_context);
+  }
+  TEST_EndWrite(writer);
+  return s;
+}
+
+Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall,
+                                  ColumnFamilyHandle* cfh) {
+  FlushOptions fo;
+  fo.wait = wait;
+  fo.allow_write_stall = allow_write_stall;
+  ColumnFamilyData* cfd;
+  if (cfh == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(cfh);
+    cfd = cfhi->cfd();
+  }
+  return FlushMemTable(cfd, fo, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd,
+                                  const FlushOptions& flush_opts) {
+  return FlushMemTable(cfd, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_AtomicFlushMemTables(
+    const autovector<ColumnFamilyData*>& provided_candidate_cfds,
+    const FlushOptions& flush_opts) {
+  return AtomicFlushMemTables(flush_opts, FlushReason::kTest,
+                              provided_candidate_cfds);
+}
+
+Status DBImpl::TEST_WaitForBackgroundWork() {
+  InstrumentedMutexLock l(&mutex_);
+  WaitForBackgroundWork();
+  return error_handler_.GetBGError();
+}
+
+Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    cfd = cfh->cfd();
+  }
+  return WaitForFlushMemTable(cfd, nullptr, false);
+}
+
+Status DBImpl::TEST_WaitForCompact(bool abort_on_pause) {
+  // Wait until the compaction completes
+  return WaitForCompact(abort_on_pause);
+}
+
+Status DBImpl::TEST_WaitForPurge() {
+  InstrumentedMutexLock l(&mutex_);
+  while (bg_purge_scheduled_ && error_handler_.GetBGError().ok()) {
+    bg_cv_.Wait();
+  }
+  return error_handler_.GetBGError();
+}
+
+Status DBImpl::TEST_GetBGError() {
+  InstrumentedMutexLock l(&mutex_);
+  return error_handler_.GetBGError();
+}
+
+void DBImpl::TEST_LockMutex() { mutex_.Lock(); }
+
+void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); }
+
+void DBImpl::TEST_SignalAllBgCv() { bg_cv_.SignalAll(); }
+
+void* DBImpl::TEST_BeginWrite() {
+  auto w = new WriteThread::Writer();
+  write_thread_.EnterUnbatched(w, &mutex_);
+  return reinterpret_cast<void*>(w);
+}
+
+void DBImpl::TEST_EndWrite(void* w) {
+  auto writer = reinterpret_cast<WriteThread::Writer*>(w);
+  write_thread_.ExitUnbatched(writer);
+  delete writer;
+}
+
+size_t DBImpl::TEST_LogsToFreeSize() {
+  InstrumentedMutexLock l(&log_write_mutex_);
+  return logs_to_free_.size();
+}
+
+uint64_t DBImpl::TEST_LogfileNumber() {
+  InstrumentedMutexLock l(&mutex_);
+  return logfile_number_;
+}
+
+Status DBImpl::TEST_GetAllImmutableCFOptions(
+    std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) {
+  std::vector<std::string> cf_names;
+  std::vector<const ImmutableCFOptions*> iopts;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      cf_names.push_back(cfd->GetName());
+      iopts.push_back(cfd->ioptions());
+    }
+  }
+  iopts_map->clear();
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    iopts_map->insert({cf_names[i], iopts[i]});
+  }
+
+  return Status::OK();
+}
+
+uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() {
+  return logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+}
+
+size_t DBImpl::TEST_PreparedSectionCompletedSize() {
+  return logs_with_prep_tracker_.TEST_PreparedSectionCompletedSize();
+}
+
+size_t DBImpl::TEST_LogsWithPrepSize() {
+  return logs_with_prep_tracker_.TEST_LogsWithPrepSize();
+}
+
+uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
+  autovector<MemTable*> empty_list;
+  return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list);
+}
+
+Status DBImpl::TEST_GetLatestMutableCFOptions(
+    ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) {
+  InstrumentedMutexLock l(&mutex_);
+
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  *mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions();
+  return Status::OK();
+}
+
+int DBImpl::TEST_BGCompactionsAllowed() const {
+  InstrumentedMutexLock l(&mutex_);
+  return GetBGJobLimits().max_compactions;
+}
+
+int DBImpl::TEST_BGFlushesAllowed() const {
+  InstrumentedMutexLock l(&mutex_);
+  return GetBGJobLimits().max_flushes;
+}
+
+SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const {
+  if (last_seq_same_as_publish_seq_) {
+    return versions_->LastSequence();
+  } else {
+    return versions_->LastAllocatedSequence();
+  }
+}
+
+size_t DBImpl::TEST_GetWalPreallocateBlockSize(
+    uint64_t write_buffer_size) const {
+  InstrumentedMutexLock l(&mutex_);
+  return GetWalPreallocateBlockSize(write_buffer_size);
+}
+
+void DBImpl::TEST_WaitForPeriodicTaskRun(std::function<void()> callback) const {
+  periodic_task_scheduler_.TEST_WaitForRun(callback);
+}
+
+const PeriodicTaskScheduler& DBImpl::TEST_GetPeriodicTaskScheduler() const {
+  return periodic_task_scheduler_;
+}
+
+SeqnoToTimeMapping DBImpl::TEST_GetSeqnoToTimeMapping() const {
+  InstrumentedMutexLock l(&mutex_);
+  return seqno_time_mapping_;
+}
+
+size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
+  return EstimateInMemoryStatsHistorySize();
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // NDEBUG
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_experimental.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_experimental.cc
new file mode 100644
index 0000000000..8d958ffc17
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_experimental.cc
@@ -0,0 +1,159 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "logging/logging.h"
+#include "rocksdb/status.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
+                                   const Slice* begin, const Slice* end) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+  InternalKey start_key, end_key;
+  if (begin != nullptr) {
+    start_key.SetMinPossibleForUserKey(*begin);
+  }
+  if (end != nullptr) {
+    end_key.SetMaxPossibleForUserKey(*end);
+  }
+  {
+    InstrumentedMutexLock l(&mutex_);
+    auto vstorage = cfd->current()->storage_info();
+    for (int level = 0; level < vstorage->num_non_empty_levels() - 1; ++level) {
+      std::vector<FileMetaData*> inputs;
+      vstorage->GetOverlappingInputs(
+          level, begin == nullptr ? nullptr : &start_key,
+          end == nullptr ? nullptr : &end_key, &inputs);
+      for (auto f : inputs) {
+        f->marked_for_compaction = true;
+      }
+    }
+    // Since we have some more files to compact, we should also recompute
+    // compaction score
+    vstorage->ComputeCompactionScore(*cfd->ioptions(),
+                                     *cfd->GetLatestMutableCFOptions());
+    SchedulePendingCompaction(cfd);
+    MaybeScheduleFlushOrCompaction();
+  }
+  return Status::OK();
+}
+
+Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
+  assert(column_family);
+
+  if (target_level < 1) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "PromoteL0 FAILED. Invalid target level %d\n", target_level);
+    return Status::InvalidArgument("Invalid target level");
+  }
+  // TODO: plumb Env::IOActivity
+  const ReadOptions read_options;
+  Status status;
+  VersionEdit edit;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    auto* cfd = static_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+    const auto* vstorage = cfd->current()->storage_info();
+
+    if (target_level >= vstorage->num_levels()) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "PromoteL0 FAILED. Target level %d does not exist\n",
+                     target_level);
+      job_context.Clean();
+      status = Status::InvalidArgument("Target level does not exist");
+      return status;
+    }
+
+    // Sort L0 files by range.
+    const InternalKeyComparator* icmp = &cfd->internal_comparator();
+    auto l0_files = vstorage->LevelFiles(0);
+    std::sort(l0_files.begin(), l0_files.end(),
+              [icmp](FileMetaData* f1, FileMetaData* f2) {
+                return icmp->Compare(f1->largest, f2->largest) < 0;
+              });
+
+    // Check that no L0 file is being compacted and that they have
+    // non-overlapping ranges.
+    for (size_t i = 0; i < l0_files.size(); ++i) {
+      auto f = l0_files[i];
+      if (f->being_compacted) {
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "PromoteL0 FAILED. File %" PRIu64 " being compacted\n",
+                       f->fd.GetNumber());
+        job_context.Clean();
+        status =
+            Status::InvalidArgument("PromoteL0 called during L0 compaction");
+        return status;
+      }
+
+      if (i == 0) continue;
+      auto prev_f = l0_files[i - 1];
+      if (icmp->Compare(prev_f->largest, f->smallest) >= 0) {
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "PromoteL0 FAILED. Files %" PRIu64 " and %" PRIu64
Files %" PRIu64 " and %" PRIu64 + " have overlapping ranges\n", + prev_f->fd.GetNumber(), f->fd.GetNumber()); + job_context.Clean(); + status = Status::InvalidArgument("L0 has overlapping files"); + return status; + } + } + + // Check that all levels up to target_level are empty. + for (int level = 1; level <= target_level; ++level) { + if (vstorage->NumLevelFiles(level) > 0) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "PromoteL0 FAILED. Level %d not empty\n", level); + job_context.Clean(); + status = Status::InvalidArgument( + "All levels up to target_level " + "must be empty"); + return status; + } + } + + edit.SetColumnFamily(cfd->GetID()); + for (const auto& f : l0_files) { + edit.DeleteFile(0, f->fd.GetNumber()); + edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, + f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->epoch_number, f->file_checksum, + f->file_checksum_func_name, f->unique_id, + f->compensated_range_deletion_size, f->tail_size); + } + + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + read_options, &edit, &mutex_, + directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); + } + } // lock released here + LogFlush(immutable_db_options_.info_log); + job_context.Clean(); + + return status; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_files.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_files.cc new file mode 100644 index 0000000000..0998c94551 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_files.cc @@ -0,0 +1,1014 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/event_helpers.h" +#include "db/memtable_list.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/sst_file_manager_impl.h" +#include "logging/logging.h" +#include "port/port.h" +#include "util/autovector.h" +#include "util/defer.h" + +namespace ROCKSDB_NAMESPACE { + +uint64_t DBImpl::MinLogNumberToKeep() { + return versions_->min_log_number_to_keep(); +} + +uint64_t DBImpl::MinObsoleteSstNumberToKeep() { + mutex_.AssertHeld(); + if (!pending_outputs_.empty()) { + return *pending_outputs_.begin(); + } + return std::numeric_limits::max(); +} + +Status DBImpl::DisableFileDeletions() { + Status s; + int my_disable_delete_obsolete_files; + { + InstrumentedMutexLock l(&mutex_); + s = DisableFileDeletionsWithLock(); + my_disable_delete_obsolete_files = disable_delete_obsolete_files_; + } + if (my_disable_delete_obsolete_files == 1) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled"); + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "File Deletions Disabled, but already disabled. 
Counter: %d", + my_disable_delete_obsolete_files); + } + return s; +} + +// FIXME: can be inconsistent with DisableFileDeletions in cases like +// DBImplReadOnly +Status DBImpl::DisableFileDeletionsWithLock() { + mutex_.AssertHeld(); + ++disable_delete_obsolete_files_; + return Status::OK(); +} + +Status DBImpl::EnableFileDeletions(bool force) { + // Job id == 0 means that this is not our background process, but rather + // user thread + JobContext job_context(0); + int saved_counter; // initialize on all paths + { + InstrumentedMutexLock l(&mutex_); + if (force) { + // if force, we need to enable file deletions right away + disable_delete_obsolete_files_ = 0; + } else if (disable_delete_obsolete_files_ > 0) { + --disable_delete_obsolete_files_; + } + saved_counter = disable_delete_obsolete_files_; + if (saved_counter == 0) { + FindObsoleteFiles(&job_context, true); + bg_cv_.SignalAll(); + } + } + if (saved_counter == 0) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "File Deletions Enable, but not really enabled. Counter: %d", + saved_counter); + } + job_context.Clean(); + LogFlush(immutable_db_options_.info_log); + return Status::OK(); +} + +bool DBImpl::IsFileDeletionsEnabled() const { + return 0 == disable_delete_obsolete_files_; +} + +// * Returns the list of live files in 'sst_live' and 'blob_live'. +// If it's doing full scan: +// * Returns the list of all files in the filesystem in +// 'full_scan_candidate_files'. +// Otherwise, gets obsolete files from VersionSet. +// no_full_scan = true -- never do the full scan using GetChildren() +// force = false -- don't force the full scan, except every +// mutable_db_options_.delete_obsolete_files_period_micros +// force = true -- force the full scan +void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, + bool no_full_scan) { + mutex_.AssertHeld(); + + // if deletion is disabled, do nothing + if (disable_delete_obsolete_files_ > 0) { + return; + } + + bool doing_the_full_scan = false; + + // logic for figuring out if we're doing the full scan + if (no_full_scan) { + doing_the_full_scan = false; + } else if (force || + mutable_db_options_.delete_obsolete_files_period_micros == 0) { + doing_the_full_scan = true; + } else { + const uint64_t now_micros = immutable_db_options_.clock->NowMicros(); + if ((delete_obsolete_files_last_run_ + + mutable_db_options_.delete_obsolete_files_period_micros) < + now_micros) { + doing_the_full_scan = true; + delete_obsolete_files_last_run_ = now_micros; + } + } + + // don't delete files that might be currently written to from compaction + // threads + // Since job_context->min_pending_output is set, until file scan finishes, + // mutex_ cannot be released. Otherwise, we might see no min_pending_output + // here but later find newer generated unfinalized files while scanning. + job_context->min_pending_output = MinObsoleteSstNumberToKeep(); + + // Get obsolete files. This function will also update the list of + // pending files in VersionSet(). 
+  versions_->GetObsoleteFiles(
+      &job_context->sst_delete_files, &job_context->blob_delete_files,
+      &job_context->manifest_delete_files, job_context->min_pending_output);
+
+  // Mark the elements in job_context->sst_delete_files and
+  // job_context->blob_delete_files as "grabbed for purge" so that other
+  // threads calling FindObsoleteFiles with full_scan=true will not add these
+  // files to the candidate list for purge.
+  for (const auto& sst_to_del : job_context->sst_delete_files) {
+    MarkAsGrabbedForPurge(sst_to_del.metadata->fd.GetNumber());
+  }
+
+  for (const auto& blob_file : job_context->blob_delete_files) {
+    MarkAsGrabbedForPurge(blob_file.GetBlobFileNumber());
+  }
+
+  // store the current filenum, lognum, etc
+  job_context->manifest_file_number = versions_->manifest_file_number();
+  job_context->pending_manifest_file_number =
+      versions_->pending_manifest_file_number();
+  job_context->log_number = MinLogNumberToKeep();
+  job_context->prev_log_number = versions_->prev_log_number();
+
+  if (doing_the_full_scan) {
+    versions_->AddLiveFiles(&job_context->sst_live, &job_context->blob_live);
+    InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+                                  dbname_);
+    std::set<std::string> paths;
+    for (size_t path_id = 0; path_id < immutable_db_options_.db_paths.size();
+         path_id++) {
+      paths.insert(immutable_db_options_.db_paths[path_id].path);
+    }
+
+    // Note that if cf_paths is not specified in the ColumnFamilyOptions
+    // of a particular column family, we use db_paths as the cf_paths
+    // setting. Hence, there can be multiple duplicates of files from db_paths
+    // in the following code. The duplicates are removed while identifying
+    // unique files in PurgeObsoleteFiles.
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      for (size_t path_id = 0; path_id < cfd->ioptions()->cf_paths.size();
+           path_id++) {
+        auto& path = cfd->ioptions()->cf_paths[path_id].path;
+
+        if (paths.find(path) == paths.end()) {
+          paths.insert(path);
+        }
+      }
+    }
+
+    IOOptions io_opts;
+    io_opts.do_not_recurse = true;
+    for (auto& path : paths) {
+      // set of all files in the directory. We'll exclude files that are still
+      // alive in the subsequent processings.
+      std::vector<std::string> files;
+      Status s = immutable_db_options_.fs->GetChildren(
+          path, io_opts, &files, /*IODebugContext*=*/nullptr);
+      s.PermitUncheckedError();  // TODO: What should we do on error?
+      for (const std::string& file : files) {
+        uint64_t number;
+        FileType type;
+        // 1. If we cannot parse the file name, we skip;
+        // 2. If the file with file_number equals number has already been
+        // grabbed for purge by another compaction job, or it has already been
+        // scheduled for purge, we also skip it if we
+        // are doing full scan in order to avoid double deletion of the same
+        // file under race conditions. See
+        // https://github.com/facebook/rocksdb/issues/3573
+        if (!ParseFileName(file, &number, info_log_prefix.prefix, &type) ||
+            !ShouldPurge(number)) {
+          continue;
+        }
+
+        // TODO(icanadi) clean up this mess to avoid having one-off "/"
+        // prefixes
+        job_context->full_scan_candidate_files.emplace_back("/" + file, path);
+      }
+    }
+
+    // Add log files in wal_dir
+    if (!immutable_db_options_.IsWalDirSameAsDBPath(dbname_)) {
+      std::vector<std::string> log_files;
+      Status s = immutable_db_options_.fs->GetChildren(
+          immutable_db_options_.wal_dir, io_opts, &log_files,
+          /*IODebugContext*=*/nullptr);
+      s.PermitUncheckedError();  // TODO: What should we do on error?
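+      // Unlike the db-path scan above, WAL candidates are added without a
+      // ShouldPurge() check; they are parsed and filtered later in
+      // PurgeObsoleteFiles.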
+      for (const std::string& log_file : log_files) {
+        job_context->full_scan_candidate_files.emplace_back(
+            log_file, immutable_db_options_.wal_dir);
+      }
+    }
+
+    // Add info log files in db_log_dir
+    if (!immutable_db_options_.db_log_dir.empty() &&
+        immutable_db_options_.db_log_dir != dbname_) {
+      std::vector<std::string> info_log_files;
+      Status s = immutable_db_options_.fs->GetChildren(
+          immutable_db_options_.db_log_dir, io_opts, &info_log_files,
+          /*IODebugContext*=*/nullptr);
+      s.PermitUncheckedError();  // TODO: What should we do on error?
+      for (std::string& log_file : info_log_files) {
+        job_context->full_scan_candidate_files.emplace_back(
+            log_file, immutable_db_options_.db_log_dir);
+      }
+    }
+  } else {
+    // Instead of filling job_context->sst_live and job_context->blob_live,
+    // directly remove files that show up in any Version. This is because
+    // candidate files tend to be a small percentage of all files, so it is
+    // usually cheaper to check them against every version, compared to
+    // building a map for all files.
+    versions_->RemoveLiveFiles(job_context->sst_delete_files,
+                               job_context->blob_delete_files);
+  }
+
+  // Before potentially releasing mutex and waiting on condvar, increment
+  // pending_purge_obsolete_files_ so that another thread executing
+  // `GetSortedWals` will wait until this thread finishes execution since the
+  // other thread will be waiting for `pending_purge_obsolete_files_`.
+  // pending_purge_obsolete_files_ MUST be decremented if there is nothing to
+  // delete.
+  ++pending_purge_obsolete_files_;
+
+  Defer cleanup([job_context, this]() {
+    assert(job_context != nullptr);
+    if (!job_context->HaveSomethingToDelete()) {
+      mutex_.AssertHeld();
+      --pending_purge_obsolete_files_;
+    }
+  });
+
+  // logs_ is empty when called during recovery, in which case there can't yet
+  // be any tracked obsolete logs
+  log_write_mutex_.Lock();
+
+  if (alive_log_files_.empty() || logs_.empty()) {
+    mutex_.AssertHeld();
+    // We may reach here if the db is DBImplSecondary
+    log_write_mutex_.Unlock();
+    return;
+  }
+
+  bool mutex_unlocked = false;
+  if (!alive_log_files_.empty() && !logs_.empty()) {
+    uint64_t min_log_number = job_context->log_number;
+    size_t num_alive_log_files = alive_log_files_.size();
+    // find newly obsoleted log files
+    while (alive_log_files_.begin()->number < min_log_number) {
+      auto& earliest = *alive_log_files_.begin();
+      if (immutable_db_options_.recycle_log_file_num >
+          log_recycle_files_.size()) {
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "adding log %" PRIu64 " to recycle list\n",
+                       earliest.number);
+        log_recycle_files_.push_back(earliest.number);
+      } else {
+        job_context->log_delete_files.push_back(earliest.number);
+      }
+      if (job_context->size_log_to_delete == 0) {
+        job_context->prev_total_log_size = total_log_size_;
+        job_context->num_alive_log_files = num_alive_log_files;
+      }
+      job_context->size_log_to_delete += earliest.size;
+      total_log_size_ -= earliest.size;
+      alive_log_files_.pop_front();
+
+      // Current log should always stay alive since it can't have
+      // number < MinLogNumber().
+      assert(alive_log_files_.size());
+    }
+    log_write_mutex_.Unlock();
+    mutex_.Unlock();
+    mutex_unlocked = true;
+    TEST_SYNC_POINT_CALLBACK("FindObsoleteFiles::PostMutexUnlock", nullptr);
+    log_write_mutex_.Lock();
+    while (!logs_.empty() && logs_.front().number < min_log_number) {
+      auto& log = logs_.front();
+      if (log.IsSyncing()) {
+        log_sync_cv_.Wait();
+        // logs_ could have changed while we were waiting.
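+        // Restart the check from the (possibly new) front of logs_ rather
+        // than assuming the entry we were looking at is still there.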
+ continue; + } + logs_to_free_.push_back(log.ReleaseWriter()); + logs_.pop_front(); + } + // Current log cannot be obsolete. + assert(!logs_.empty()); + } + + // We're just cleaning up for DB::Write(). + assert(job_context->logs_to_free.empty()); + job_context->logs_to_free = logs_to_free_; + + logs_to_free_.clear(); + log_write_mutex_.Unlock(); + if (mutex_unlocked) { + mutex_.Lock(); + } + job_context->log_recycle_files.assign(log_recycle_files_.begin(), + log_recycle_files_.end()); +} + +// Delete obsolete files and log status and information of file deletion +void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, + const std::string& path_to_sync, + FileType type, uint64_t number) { + TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl::BeforeDeletion", + const_cast(&fname)); + + Status file_deletion_status; + if (type == kTableFile || type == kBlobFile || type == kWalFile) { + // Rate limit WAL deletion only if its in the DB dir + file_deletion_status = DeleteDBFile( + &immutable_db_options_, fname, path_to_sync, + /*force_bg=*/false, + /*force_fg=*/(type == kWalFile) ? !wal_in_db_path_ : false); + } else { + file_deletion_status = env_->DeleteFile(fname); + } + TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion", + &file_deletion_status); + if (file_deletion_status.ok()) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id, + fname.c_str(), type, number, + file_deletion_status.ToString().c_str()); + } else if (env_->FileExists(fname).IsNotFound()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64 + " -- %s\n", + job_id, fname.c_str(), type, number, + file_deletion_status.ToString().c_str()); + } else { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n", + job_id, fname.c_str(), type, number, + file_deletion_status.ToString().c_str()); + } + if (type == kTableFile) { + EventHelpers::LogAndNotifyTableFileDeletion( + &event_logger_, job_id, number, fname, file_deletion_status, GetName(), + immutable_db_options_.listeners); + } + if (type == kBlobFile) { + EventHelpers::LogAndNotifyBlobFileDeletion( + &event_logger_, immutable_db_options_.listeners, job_id, number, fname, + file_deletion_status, GetName()); + } +} + +// Diffs the files listed in filenames and those that do not +// belong to live files are possibly removed. Also, removes all the +// files in sst_delete_files and log_delete_files. +// It is not necessary to hold the mutex when invoking this method. +void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { + TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:Begin"); + // we'd better have sth to delete + assert(state.HaveSomethingToDelete()); + + // FindObsoleteFiles() should've populated this so nonzero + assert(state.manifest_file_number != 0); + + // Now, convert lists to unordered sets, WITHOUT mutex held; set is slow. 
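+  // For a rough sense of why (informal): the keep/delete loop below performs
+  // one membership probe per candidate, e.g.
+  //   keep = sst_live_set.find(number) != sst_live_set.end();
+  // which is O(1) on average with std::unordered_set, versus O(log n) per
+  // probe with an ordered std::set.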
+  std::unordered_set<uint64_t> sst_live_set(state.sst_live.begin(),
+                                            state.sst_live.end());
+  std::unordered_set<uint64_t> blob_live_set(state.blob_live.begin(),
+                                             state.blob_live.end());
+  std::unordered_set<uint64_t> log_recycle_files_set(
+      state.log_recycle_files.begin(), state.log_recycle_files.end());
+
+  auto candidate_files = state.full_scan_candidate_files;
+  candidate_files.reserve(
+      candidate_files.size() + state.sst_delete_files.size() +
+      state.blob_delete_files.size() + state.log_delete_files.size() +
+      state.manifest_delete_files.size());
+  // We may ignore the dbname when generating the file names.
+  for (auto& file : state.sst_delete_files) {
+    if (!file.only_delete_metadata) {
+      candidate_files.emplace_back(
+          MakeTableFileName(file.metadata->fd.GetNumber()), file.path);
+    }
+    if (file.metadata->table_reader_handle) {
+      table_cache_->Release(file.metadata->table_reader_handle);
+    }
+    file.DeleteMetadata();
+  }
+
+  for (const auto& blob_file : state.blob_delete_files) {
+    candidate_files.emplace_back(BlobFileName(blob_file.GetBlobFileNumber()),
+                                 blob_file.GetPath());
+  }
+
+  auto wal_dir = immutable_db_options_.GetWalDir();
+  for (auto file_num : state.log_delete_files) {
+    if (file_num > 0) {
+      candidate_files.emplace_back(LogFileName(file_num), wal_dir);
+    }
+  }
+  for (const auto& filename : state.manifest_delete_files) {
+    candidate_files.emplace_back(filename, dbname_);
+  }
+
+  // dedup state.candidate_files so we don't try to delete the same
+  // file twice
+  std::sort(candidate_files.begin(), candidate_files.end(),
+            [](const JobContext::CandidateFileInfo& lhs,
+               const JobContext::CandidateFileInfo& rhs) {
+              if (lhs.file_name > rhs.file_name) {
+                return true;
+              } else if (lhs.file_name < rhs.file_name) {
+                return false;
+              } else {
+                return (lhs.file_path > rhs.file_path);
+              }
+            });
+  candidate_files.erase(
+      std::unique(candidate_files.begin(), candidate_files.end()),
+      candidate_files.end());
+
+  if (state.prev_total_log_size > 0) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[JOB %d] Try to delete WAL files size %" PRIu64
+                   ", prev total WAL file size %" PRIu64
+                   ", number of live WAL files %" ROCKSDB_PRIszt ".\n",
+                   state.job_id, state.size_log_to_delete,
+                   state.prev_total_log_size, state.num_alive_log_files);
+  }
+
+  std::vector<std::string> old_info_log_files;
+  InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+                                dbname_);
+
+  // File numbers of the two most recent OPTIONS files in candidate_files
+  // (found in the previous FindObsoleteFiles(full_scan=true)).
+  // At this point, there must not be any duplicate file numbers in
+  // candidate_files.
+  uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min();
+  uint64_t optsfile_num2 = std::numeric_limits<uint64_t>::min();
+  for (const auto& candidate_file : candidate_files) {
+    const std::string& fname = candidate_file.file_name;
+    uint64_t number;
+    FileType type;
+    if (!ParseFileName(fname, &number, info_log_prefix.prefix, &type) ||
+        type != kOptionsFile) {
+      continue;
+    }
+    if (number > optsfile_num1) {
+      optsfile_num2 = optsfile_num1;
+      optsfile_num1 = number;
+    } else if (number > optsfile_num2) {
+      optsfile_num2 = number;
+    }
+  }
+
+  // Close WALs before trying to delete them.
+  for (const auto w : state.logs_to_free) {
+    // TODO: maybe check the return value of Close.
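+    // (Presumably the files are closed first because some platforms, e.g.
+    // Windows, do not allow deleting a file that still has an open handle;
+    // the Close() status is ignored below since the file is about to be
+    // deleted anyway.)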
+    auto s = w->Close();
+    s.PermitUncheckedError();
+  }
+
+  bool own_files = OwnTablesAndLogs();
+  std::unordered_set<uint64_t> files_to_del;
+  for (const auto& candidate_file : candidate_files) {
+    const std::string& to_delete = candidate_file.file_name;
+    uint64_t number;
+    FileType type;
+    // Ignore file if we cannot recognize it.
+    if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) {
+      continue;
+    }
+
+    bool keep = true;
+    switch (type) {
+      case kWalFile:
+        keep = ((number >= state.log_number) ||
+                (number == state.prev_log_number) ||
+                (log_recycle_files_set.find(number) !=
+                 log_recycle_files_set.end()));
+        break;
+      case kDescriptorFile:
+        // Keep my manifest file, and any newer incarnations'
+        // (can happen during manifest roll)
+        keep = (number >= state.manifest_file_number);
+        break;
+      case kTableFile:
+        // If the second condition is not there, this makes
+        // DontDeletePendingOutputs fail
+        keep = (sst_live_set.find(number) != sst_live_set.end()) ||
+               number >= state.min_pending_output;
+        if (!keep) {
+          files_to_del.insert(number);
+        }
+        break;
+      case kBlobFile:
+        keep = number >= state.min_pending_output ||
+               (blob_live_set.find(number) != blob_live_set.end());
+        if (!keep) {
+          files_to_del.insert(number);
+        }
+        break;
+      case kTempFile:
+        // Any temp files that are currently being written to must
+        // be recorded in pending_outputs_, which is inserted into "live".
+        // Also, SetCurrentFile creates a temp file when writing out the new
+        // manifest, which is equal to state.pending_manifest_file_number. We
+        // should not delete that file
+        //
+        // TODO(yhchiang): carefully modify the third condition to safely
+        //                 remove the temp options files.
+        keep = (sst_live_set.find(number) != sst_live_set.end()) ||
+               (blob_live_set.find(number) != blob_live_set.end()) ||
+               (number == state.pending_manifest_file_number) ||
+               (to_delete.find(kOptionsFileNamePrefix) != std::string::npos);
+        break;
+      case kInfoLogFile:
+        keep = true;
+        if (number != 0) {
+          old_info_log_files.push_back(to_delete);
+        }
+        break;
+      case kOptionsFile:
+        keep = (number >= optsfile_num2);
+        break;
+      case kCurrentFile:
+      case kDBLockFile:
+      case kIdentityFile:
+      case kMetaDatabase:
+        keep = true;
+        break;
+    }
+
+    if (keep) {
+      continue;
+    }
+
+    std::string fname;
+    std::string dir_to_sync;
+    if (type == kTableFile) {
+      // evict from cache
+      TableCache::Evict(table_cache_.get(), number);
+      fname = MakeTableFileName(candidate_file.file_path, number);
+      dir_to_sync = candidate_file.file_path;
+    } else if (type == kBlobFile) {
+      fname = BlobFileName(candidate_file.file_path, number);
+      dir_to_sync = candidate_file.file_path;
+    } else {
+      dir_to_sync = (type == kWalFile) ? wal_dir : dbname_;
+      fname = dir_to_sync +
+              ((!dir_to_sync.empty() && dir_to_sync.back() == '/') ||
+                       (!to_delete.empty() && to_delete.front() == '/')
+                   ? ""
+                   : "/") +
+              to_delete;
+    }
+
+    if (type == kWalFile && (immutable_db_options_.WAL_ttl_seconds > 0 ||
+                             immutable_db_options_.WAL_size_limit_MB > 0)) {
+      wal_manager_.ArchiveWALFile(fname, number);
+      continue;
+    }
+
+    // If I do not own these files, e.g. secondary instance with
+    // max_open_files = -1, then there is no need to delete or schedule
+    // deletion of these files since they
+    // will be removed by their owner, e.g. the primary instance.
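+    // (OwnTablesAndLogs() is expected to be false only for such non-owning
+    // instances; a regular primary returns true and proceeds to delete or
+    // schedule the purge below.)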
+    if (!own_files) {
+      continue;
+    }
+    if (schedule_only) {
+      InstrumentedMutexLock guard_lock(&mutex_);
+      SchedulePendingPurge(fname, dir_to_sync, type, number, state.job_id);
+    } else {
+      DeleteObsoleteFileImpl(state.job_id, fname, dir_to_sync, type, number);
+    }
+  }
+
+  {
+    // After purging obsolete files, remove them from files_grabbed_for_purge_.
+    InstrumentedMutexLock guard_lock(&mutex_);
+    autovector<uint64_t> to_be_removed;
+    for (auto fn : files_grabbed_for_purge_) {
+      if (files_to_del.count(fn) != 0) {
+        to_be_removed.emplace_back(fn);
+      }
+    }
+    for (auto fn : to_be_removed) {
+      files_grabbed_for_purge_.erase(fn);
+    }
+  }
+
+  // Delete old info log files.
+  size_t old_info_log_file_count = old_info_log_files.size();
+  if (old_info_log_file_count != 0 &&
+      old_info_log_file_count >= immutable_db_options_.keep_log_file_num) {
+    std::sort(old_info_log_files.begin(), old_info_log_files.end());
+    size_t end =
+        old_info_log_file_count - immutable_db_options_.keep_log_file_num;
+    for (unsigned int i = 0; i <= end; i++) {
+      std::string& to_delete = old_info_log_files.at(i);
+      std::string full_path_to_delete =
+          (immutable_db_options_.db_log_dir.empty()
+               ? dbname_
+               : immutable_db_options_.db_log_dir) +
+          "/" + to_delete;
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "[JOB %d] Delete info log file %s\n", state.job_id,
+                     full_path_to_delete.c_str());
+      Status s = env_->DeleteFile(full_path_to_delete);
+      if (!s.ok()) {
+        if (env_->FileExists(full_path_to_delete).IsNotFound()) {
+          ROCKS_LOG_INFO(
+              immutable_db_options_.info_log,
+              "[JOB %d] Tried to delete non-existing info log file %s FAILED "
+              "-- %s\n",
+              state.job_id, to_delete.c_str(), s.ToString().c_str());
+        } else {
+          ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                          "[JOB %d] Delete info log file %s FAILED -- %s\n",
+                          state.job_id, to_delete.c_str(),
+                          s.ToString().c_str());
+        }
+      }
+    }
+  }
+  wal_manager_.PurgeObsoleteWALFiles();
+  LogFlush(immutable_db_options_.info_log);
+  InstrumentedMutexLock l(&mutex_);
+  --pending_purge_obsolete_files_;
+  assert(pending_purge_obsolete_files_ >= 0);
+  if (schedule_only) {
+    // Must change from pending_purge_obsolete_files_ to bg_purge_scheduled_
+    // while holding mutex (for GetSortedWalFiles() etc.)
+    SchedulePurge();
+  }
+  if (pending_purge_obsolete_files_ == 0) {
+    bg_cv_.SignalAll();
+  }
+  TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:End");
+}
+
+void DBImpl::DeleteObsoleteFiles() {
+  mutex_.AssertHeld();
+  JobContext job_context(next_job_id_.fetch_add(1));
+  FindObsoleteFiles(&job_context, true);
+
+  mutex_.Unlock();
+  if (job_context.HaveSomethingToDelete()) {
+    bool defer_purge = immutable_db_options_.avoid_unnecessary_blocking_io;
+    PurgeObsoleteFiles(job_context, defer_purge);
+  }
+  job_context.Clean();
+  mutex_.Lock();
+}
+
+uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset, const autovector<MemTable*>& memtables_to_flush) {
+  uint64_t min_log = 0;
+
+  // we must look through the memtables for two phase transactions
+  // that have been committed but not yet flushed
+  std::unordered_set<MemTable*> memtables_to_flush_set(
+      memtables_to_flush.begin(), memtables_to_flush.end());
+  for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+    if (loop_cfd->IsDropped()) {
+      continue;
+    }
+
+    auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+        &memtables_to_flush_set);
+
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+
+    log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+  }
+
+  return min_log;
+}
+
+uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset,
+    const autovector<const autovector<MemTable*>*>& memtables_to_flush) {
+  uint64_t min_log = 0;
+
+  std::unordered_set<MemTable*> memtables_to_flush_set;
+  for (const autovector<MemTable*>* memtables : memtables_to_flush) {
+    memtables_to_flush_set.insert(memtables->begin(), memtables->end());
+  }
+  for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+    if (loop_cfd->IsDropped()) {
+      continue;
+    }
+
+    auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+        &memtables_to_flush_set);
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+
+    log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+  }
+
+  return min_log;
+}
+
+uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    const autovector<VersionEdit*>& edit_list) {
+  assert(vset != nullptr);
+
+  // Precompute the min log number containing unflushed data for the column
+  // family being flushed (`cfd_to_flush`).
+  uint64_t cf_min_log_number_to_keep = 0;
+  for (auto& e : edit_list) {
+    if (e->HasLogNumber()) {
+      cf_min_log_number_to_keep =
+          std::max(cf_min_log_number_to_keep, e->GetLogNumber());
+    }
+  }
+  if (cf_min_log_number_to_keep == 0) {
+    // No version edit contains information on the log number. The log number
+    // for this column family should stay the same as it is.
+    cf_min_log_number_to_keep = cfd_to_flush.GetLogNumber();
+  }
+
+  // Get min log number containing unflushed data for other column families.
+  uint64_t min_log_number_to_keep =
+      vset->PreComputeMinLogNumberWithUnflushedData(&cfd_to_flush);
+  if (cf_min_log_number_to_keep != 0) {
+    min_log_number_to_keep =
+        std::min(cf_min_log_number_to_keep, min_log_number_to_keep);
+  }
+  return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    const autovector<autovector<VersionEdit*>>& edit_lists) {
+  assert(vset != nullptr);
+  assert(!cfds_to_flush.empty());
+  assert(cfds_to_flush.size() == edit_lists.size());
+
+  uint64_t min_log_number_to_keep = std::numeric_limits<uint64_t>::max();
+  for (const auto& edit_list : edit_lists) {
+    uint64_t log = 0;
+    for (const auto& e : edit_list) {
+      if (e->HasLogNumber()) {
+        log = std::max(log, e->GetLogNumber());
+      }
+    }
+    if (log != 0) {
+      min_log_number_to_keep = std::min(min_log_number_to_keep, log);
+    }
+  }
+  if (min_log_number_to_keep == std::numeric_limits<uint64_t>::max()) {
+    min_log_number_to_keep = cfds_to_flush[0]->GetLogNumber();
+    for (size_t i = 1; i < cfds_to_flush.size(); i++) {
+      min_log_number_to_keep =
+          std::min(min_log_number_to_keep, cfds_to_flush[i]->GetLogNumber());
+    }
+  }
+
+  std::unordered_set<const ColumnFamilyData*> flushed_cfds(
+      cfds_to_flush.begin(), cfds_to_flush.end());
+  min_log_number_to_keep =
+      std::min(min_log_number_to_keep,
+               vset->PreComputeMinLogNumberWithUnflushedData(flushed_cfds));
+
+  return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeep2PC(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    const autovector<VersionEdit*>& edit_list,
+    const autovector<MemTable*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker) {
+  assert(vset != nullptr);
+  assert(prep_tracker != nullptr);
+  // Calculate updated min_log_number_to_keep
+  // Since the function should only be called in 2pc mode, the log number in
+  // the version edit should be sufficient.
+
+  uint64_t min_log_number_to_keep =
+      PrecomputeMinLogNumberToKeepNon2PC(vset, cfd_to_flush, edit_list);
+
+  // Since we are in 2pc mode, we must consider logs containing prepared
+  // sections of outstanding transactions.
+  //
+  // We must check min logs with outstanding prep before we check
+  // logs referenced by memtables because a log referenced by the
+  // first data structure could transition to the second under us.
+  //
+  // TODO: iterating over all column families under db mutex.
+  // should find a more optimal solution
+  auto min_log_in_prep_heap =
+      prep_tracker->FindMinLogContainingOutstandingPrep();
+
+  if (min_log_in_prep_heap != 0 &&
+      min_log_in_prep_heap < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_in_prep_heap;
+  }
+
+  uint64_t min_log_refed_by_mem =
+      FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);
+
+  if (min_log_refed_by_mem != 0 &&
+      min_log_refed_by_mem < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_refed_by_mem;
+  }
+  return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeep2PC(
+    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    const autovector<autovector<VersionEdit*>>& edit_lists,
+    const autovector<const autovector<MemTable*>*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker) {
+  assert(vset != nullptr);
+  assert(prep_tracker != nullptr);
+  assert(cfds_to_flush.size() == edit_lists.size());
+  assert(cfds_to_flush.size() == memtables_to_flush.size());
+
+  uint64_t min_log_number_to_keep =
+      PrecomputeMinLogNumberToKeepNon2PC(vset, cfds_to_flush, edit_lists);
+
+  uint64_t min_log_in_prep_heap =
+      prep_tracker->FindMinLogContainingOutstandingPrep();
+
+  if (min_log_in_prep_heap != 0 &&
+      min_log_in_prep_heap < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_in_prep_heap;
+  }
+
+  uint64_t min_log_refed_by_mem =
+      FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);
+
+  if (min_log_refed_by_mem != 0 &&
+      min_log_refed_by_mem < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_refed_by_mem;
+  }
+
+  return min_log_number_to_keep;
+}
+
+void DBImpl::SetDBId(std::string&& id, bool read_only,
+                     RecoveryContext* recovery_ctx) {
+  assert(db_id_.empty());
+  assert(!id.empty());
+  db_id_ = std::move(id);
+  if (!read_only && immutable_db_options_.write_dbid_to_manifest) {
+    assert(recovery_ctx != nullptr);
+    assert(versions_->GetColumnFamilySet() != nullptr);
+    VersionEdit edit;
+    edit.SetDBId(db_id_);
+    versions_->db_id_ = db_id_;
+    recovery_ctx->UpdateVersionEdits(
+        versions_->GetColumnFamilySet()->GetDefault(), edit);
+  }
+}
+
+Status DBImpl::SetupDBId(bool read_only, RecoveryContext* recovery_ctx) {
+  Status s;
+  // Check for the IDENTITY file and create it if not there or
+  // broken or not matching manifest
+  std::string db_id_in_file;
+  s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
+  if (s.ok()) {
+    s = GetDbIdentityFromIdentityFile(&db_id_in_file);
+    if (s.ok() && !db_id_in_file.empty()) {
+      if (db_id_.empty()) {
+        // Loaded from file and wasn't already known from manifest
+        SetDBId(std::move(db_id_in_file), read_only, recovery_ctx);
+        return s;
+      } else if (db_id_ == db_id_in_file) {
+        // Loaded from file and matches manifest
+        return s;
+      }
+    }
+  }
+  if (s.IsNotFound()) {
+    s = Status::OK();
+  }
+  if (!s.ok()) {
+    assert(s.IsIOError());
+    return s;
+  }
+  // Otherwise IDENTITY file is missing or no good.
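+  // Informally, the repair path below mirrors first-time setup: synthesize a
+  // fresh id when neither the MANIFEST nor the IDENTITY file supplied one,
+  // then rewrite IDENTITY so the on-disk file and db_id_ agree again.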
+  // Generate new id if needed
+  if (db_id_.empty()) {
+    SetDBId(env_->GenerateUniqueId(), read_only, recovery_ctx);
+  }
+  // Persist it to the IDENTITY file if allowed
+  if (!read_only) {
+    s = SetIdentityFile(env_, dbname_, db_id_);
+  }
+  return s;
+}
+
+Status DBImpl::DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx) {
+  mutex_.AssertHeld();
+  std::vector<std::string> paths;
+  paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator)));
+  for (const auto& db_path : immutable_db_options_.db_paths) {
+    paths.push_back(
+        NormalizePath(db_path.path + std::string(1, kFilePathSeparator)));
+  }
+  for (const auto* cfd : *versions_->GetColumnFamilySet()) {
+    for (const auto& cf_path : cfd->ioptions()->cf_paths) {
+      paths.push_back(
+          NormalizePath(cf_path.path + std::string(1, kFilePathSeparator)));
+    }
+  }
+  // Dedup paths
+  std::sort(paths.begin(), paths.end());
+  paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+
+  uint64_t next_file_number = versions_->current_next_file_number();
+  uint64_t largest_file_number = next_file_number;
+  Status s;
+  for (const auto& path : paths) {
+    std::vector<std::string> files;
+    s = env_->GetChildren(path, &files);
+    if (!s.ok()) {
+      break;
+    }
+    for (const auto& fname : files) {
+      uint64_t number = 0;
+      FileType type;
+      if (!ParseFileName(fname, &number, &type)) {
+        continue;
+      }
+      // path ends with '/' or '\\'
+      const std::string normalized_fpath = path + fname;
+      largest_file_number = std::max(largest_file_number, number);
+      if (type == kTableFile && number >= next_file_number &&
+          recovery_ctx->files_to_delete_.find(normalized_fpath) ==
+              recovery_ctx->files_to_delete_.end()) {
+        recovery_ctx->files_to_delete_.emplace(normalized_fpath);
+      }
+    }
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (largest_file_number >= next_file_number) {
+    versions_->next_file_number_.store(largest_file_number + 1);
+  }
+
+  VersionEdit edit;
+  edit.SetNextFile(versions_->next_file_number_.load());
+  assert(versions_->GetColumnFamilySet());
+  ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault();
+  assert(default_cfd);
+  recovery_ctx->UpdateVersionEdits(default_cfd, edit);
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_open.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_open.cc
new file mode 100644
index 0000000000..6bc8ae4c30
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_open.cc
@@ -0,0 +1,2202 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/periodic_task_scheduler.h"
+#include "env/composite_env_wrapper.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/persistent_stats_history.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "rocksdb/table.h"
+#include "rocksdb/wal_filter.h"
+#include "test_util/sync_point.h"
+#include "util/rate_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+Options SanitizeOptions(const std::string& dbname, const Options& src,
+                        bool read_only, Status* logger_creation_s) {
+  auto db_options =
+      SanitizeOptions(dbname, DBOptions(src), read_only, logger_creation_s);
+  ImmutableDBOptions immutable_db_options(db_options);
+  auto cf_options =
+      SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
+  return Options(db_options, cf_options);
+}
+
+DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src,
+                          bool read_only, Status* logger_creation_s) {
+  DBOptions result(src);
+
+  if (result.env == nullptr) {
+    result.env = Env::Default();
+  }
+
+  // result.max_open_files of -1 means an "infinite" number of open files.
+  if (result.max_open_files != -1) {
+    int max_max_open_files = port::GetMaxOpenFiles();
+    if (max_max_open_files == -1) {
+      max_max_open_files = 0x400000;
+    }
+    ClipToRange(&result.max_open_files, 20, max_max_open_files);
+    TEST_SYNC_POINT_CALLBACK("SanitizeOptions::AfterChangeMaxOpenFiles",
+                             &result.max_open_files);
+  }
+
+  if (result.info_log == nullptr && !read_only) {
+    Status s = CreateLoggerFromOptions(dbname, result, &result.info_log);
+    if (!s.ok()) {
+      // No place suitable for logging
+      result.info_log = nullptr;
+      if (logger_creation_s) {
+        *logger_creation_s = s;
+      }
+    }
+  }
+
+  if (!result.write_buffer_manager) {
+    result.write_buffer_manager.reset(
+        new WriteBufferManager(result.db_write_buffer_size));
+  }
+  auto bg_job_limits = DBImpl::GetBGJobLimits(
+      result.max_background_flushes, result.max_background_compactions,
+      result.max_background_jobs, true /* parallelize_compactions */);
+  result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions,
+                                           Env::Priority::LOW);
+  result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes,
+                                           Env::Priority::HIGH);
+
+  if (result.rate_limiter.get() != nullptr) {
+    if (result.bytes_per_sync == 0) {
+      result.bytes_per_sync = 1024 * 1024;
+    }
+  }
+
+  if (result.delayed_write_rate == 0) {
+    if (result.rate_limiter.get() != nullptr) {
+      result.delayed_write_rate = result.rate_limiter->GetBytesPerSecond();
+    }
+    if (result.delayed_write_rate == 0) {
+      result.delayed_write_rate = 16 * 1024 * 1024;
+    }
+  }
+
+  if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) {
+    result.recycle_log_file_num = 0;
+  }
+
+  if (result.recycle_log_file_num &&
+      (result.wal_recovery_mode ==
+           WALRecoveryMode::kTolerateCorruptedTailRecords ||
+       result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery ||
+       result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) {
+    // - kTolerateCorruptedTailRecords is inconsistent with the recycle log
+    //   file feature. WAL recycling expects recovery success upon
+    //   encountering a corrupt record at the point where new data ends and
+    //   recycled data remains at the tail. However,
+    //   `kTolerateCorruptedTailRecords` must fail upon encountering any such
+    //   corrupt record, as it cannot differentiate between this and a real
+    //   corruption, which would cause committed updates to be truncated -- a
+    //   violation of the recovery guarantee.
+    // - kPointInTimeRecovery and kAbsoluteConsistency are incompatible with
+    //   the recycle log file feature temporarily due to a bug found
+    //   introducing a hole in the recovered data
+    //   (https://github.com/facebook/rocksdb/pull/7252#issuecomment-673766236).
+    //   Besides this bug, we believe the features are fundamentally
+    //   compatible.
+    result.recycle_log_file_num = 0;
+  }
+
+  if (result.db_paths.size() == 0) {
+    result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
+  } else if (result.wal_dir.empty()) {
+    // Use dbname as default
+    result.wal_dir = dbname;
+  }
+  if (!result.wal_dir.empty()) {
+    // If there is a wal_dir already set, check to see if the wal_dir is the
+    // same as the dbname AND the same as db_paths[0] (which must exist from
+    // a few lines ago). If the wal_dir matches both of these values, then
+    // clear the wal_dir value, which will make wal_dir == dbname. Most
+    // likely this condition was the result of reading an old options file
+    // where we forced wal_dir to be set (to dbname).
+    auto npath = NormalizePath(dbname + "/");
+    if (npath == NormalizePath(result.wal_dir + "/") &&
+        npath == NormalizePath(result.db_paths[0].path + "/")) {
+      result.wal_dir.clear();
+    }
+  }
+
+  if (!result.wal_dir.empty() && result.wal_dir.back() == '/') {
+    result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
+  }
+
+  if (result.use_direct_reads && result.compaction_readahead_size == 0) {
+    TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr);
+    result.compaction_readahead_size = 1024 * 1024 * 2;
+  }
+
+  // Force flush on DB open if 2PC is enabled, since with 2PC we have no
+  // guarantee that consecutive log files have consecutive sequence ids,
+  // which makes recovery complicated.
+  if (result.allow_2pc) {
+    result.avoid_flush_during_recovery = false;
+  }
+
+  ImmutableDBOptions immutable_db_options(result);
+  if (!immutable_db_options.IsWalDirSameAsDBPath()) {
+    // Either the WAL dir and db_paths[0]/db_name are not the same, or we
+    // cannot tell for sure. In either case, assume they're different and
+    // explicitly clean up the trash log files (bypass DeleteScheduler).
+    // Do this first so even if we end up calling
+    // DeleteScheduler::CleanupDirectory on the same dir later, it will be
+    // safe
+    std::vector<std::string> filenames;
+    IOOptions io_opts;
+    io_opts.do_not_recurse = true;
+    auto wal_dir = immutable_db_options.GetWalDir();
+    Status s = immutable_db_options.fs->GetChildren(
+        wal_dir, io_opts, &filenames, /*IODebugContext*=*/nullptr);
+    s.PermitUncheckedError();  // TODO: What to do on error?
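+    // The find() below is effectively an ends-with(".log.trash") test:
+    // starting the search at length() - strlen(".log.trash") can only match
+    // at the suffix position. If a filename is shorter than the suffix, the
+    // unsigned subtraction wraps and find() starts past the end, returning
+    // npos, so short names are safely skipped.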
+    for (std::string& filename : filenames) {
+      if (filename.find(".log.trash",
+                        filename.length() -
+                            std::string(".log.trash").length()) !=
+          std::string::npos) {
+        std::string trash_file = wal_dir + "/" + filename;
+        result.env->DeleteFile(trash_file).PermitUncheckedError();
+      }
+    }
+  }
+  // When the DB is stopped, it's possible that there are some .trash files
+  // that were not deleted yet; when we open the DB we will find these .trash
+  // files and schedule them to be deleted (or delete them immediately if
+  // SstFileManager was not used)
+  auto sfm = static_cast<SstFileManagerImpl*>(result.sst_file_manager.get());
+  for (size_t i = 0; i < result.db_paths.size(); i++) {
+    DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path)
+        .PermitUncheckedError();
+  }
+
+  // Create a default SstFileManager for purposes of tracking compaction size
+  // and facilitating recovery from out of space errors.
+  if (result.sst_file_manager.get() == nullptr) {
+    std::shared_ptr<SstFileManager> sst_file_manager(
+        NewSstFileManager(result.env, result.info_log));
+    result.sst_file_manager = sst_file_manager;
+  }
+
+  // Supported wal compression types
+  if (!StreamingCompressionTypeSupported(result.wal_compression)) {
+    result.wal_compression = kNoCompression;
+    ROCKS_LOG_WARN(result.info_log,
+                   "wal_compression is disabled since only zstd is supported");
+  }
+
+  if (!result.paranoid_checks) {
+    result.skip_checking_sst_file_sizes_on_db_open = true;
+    ROCKS_LOG_INFO(result.info_log,
+                   "file size check will be skipped during open.");
+  }
+
+  return result;
+}
+
+namespace {
+Status ValidateOptionsByTable(
+    const DBOptions& db_opts,
+    const std::vector<ColumnFamilyDescriptor>& column_families) {
+  Status s;
+  for (auto& cf : column_families) {
+    s = ValidateOptions(db_opts, cf.options);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+}  // namespace
+
+Status DBImpl::ValidateOptions(
+    const DBOptions& db_options,
+    const std::vector<ColumnFamilyDescriptor>& column_families) {
+  Status s;
+  for (auto& cfd : column_families) {
+    s = ColumnFamilyData::ValidateOptions(db_options, cfd.options);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  s = ValidateOptions(db_options);
+  return s;
+}
+
+Status DBImpl::ValidateOptions(const DBOptions& db_options) {
+  if (db_options.db_paths.size() > 4) {
+    return Status::NotSupported(
+        "More than four DB paths are not supported yet. ");
+  }
+
+  if (db_options.allow_mmap_reads && db_options.use_direct_reads) {
+    // Protect against assert in PosixMMapReadableFile constructor
+    return Status::NotSupported(
+        "If memory mapped reads (allow_mmap_reads) are enabled "
+        "then direct I/O reads (use_direct_reads) must be disabled. ");
+  }
+
+  if (db_options.allow_mmap_writes &&
+      db_options.use_direct_io_for_flush_and_compaction) {
+    return Status::NotSupported(
+        "If memory mapped writes (allow_mmap_writes) are enabled "
+        "then direct I/O writes (use_direct_io_for_flush_and_compaction) "
+        "must be disabled. ");
+  }
+
+  if (db_options.keep_log_file_num == 0) {
+    return Status::InvalidArgument("keep_log_file_num must be greater than 0");
+  }
+
+  if (db_options.unordered_write &&
+      !db_options.allow_concurrent_memtable_write) {
+    return Status::InvalidArgument(
+        "unordered_write is incompatible with "
+        "!allow_concurrent_memtable_write");
+  }
+
+  if (db_options.unordered_write && db_options.enable_pipelined_write) {
+    return Status::InvalidArgument(
+        "unordered_write is incompatible with enable_pipelined_write");
+  }
+
+  if (db_options.atomic_flush && db_options.enable_pipelined_write) {
+    return Status::InvalidArgument(
+        "atomic_flush is incompatible with enable_pipelined_write");
+  }
+
+  // TODO remove this restriction
+  if (db_options.atomic_flush && db_options.best_efforts_recovery) {
+    return Status::InvalidArgument(
+        "atomic_flush is currently incompatible with best-efforts recovery");
+  }
+
+  if (db_options.use_direct_io_for_flush_and_compaction &&
+      0 == db_options.writable_file_max_buffer_size) {
+    return Status::InvalidArgument(
+        "writes in direct IO require writable_file_max_buffer_size > 0");
+  }
+
+  return Status::OK();
+}
+
+Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
+  VersionEdit new_db;
+  Status s = SetIdentityFile(env_, dbname_);
+  if (!s.ok()) {
+    return s;
+  }
+  if (immutable_db_options_.write_dbid_to_manifest) {
+    std::string temp_db_id;
+    GetDbIdentityFromIdentityFile(&temp_db_id);
+    new_db.SetDBId(temp_db_id);
+  }
+  new_db.SetLogNumber(0);
+  new_db.SetNextFile(2);
+  new_db.SetLastSequence(0);
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n");
+  const std::string manifest = DescriptorFileName(dbname_, 1);
+  {
+    if (fs_->FileExists(manifest, IOOptions(), nullptr).ok()) {
+      fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError();
+    }
+    std::unique_ptr<FSWritableFile> file;
+    FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
+    s = NewWritableFile(fs_.get(), manifest, &file, file_options);
+    if (!s.ok()) {
+      return s;
+    }
+    FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
+    file->SetPreallocationBlockSize(
+        immutable_db_options_.manifest_preallocation_size);
+    std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+        std::move(file), manifest, file_options, immutable_db_options_.clock,
+        io_tracer_, nullptr /* stats */, immutable_db_options_.listeners,
+        nullptr, tmp_set.Contains(FileType::kDescriptorFile),
+        tmp_set.Contains(FileType::kDescriptorFile)));
+    log::Writer log(std::move(file_writer), 0, false);
+    std::string record;
+    new_db.EncodeTo(&record);
+    s = log.AddRecord(record);
+    if (s.ok()) {
+      s = SyncManifest(&immutable_db_options_, log.file());
+    }
+  }
+  if (s.ok()) {
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(fs_.get(), dbname_, 1, directories_.GetDbDir());
+    if (new_filenames) {
+      new_filenames->emplace_back(
+          manifest.substr(manifest.find_last_of("/\\") + 1));
+    }
+  } else {
+    fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError();
+  }
+  return s;
+}
+
+IOStatus DBImpl::CreateAndNewDirectory(
+    FileSystem* fs, const std::string& dirname,
+    std::unique_ptr<FSDirectory>* directory) {
+  // We call CreateDirIfMissing() as the directory may already exist (if we
+  // are reopening a DB), when this happens we don't want creating the
+  // directory to cause an error. However, we need to check if creating the
+  // directory fails or else we may get an obscure message about the lock
+  // file not existing. One real-world example of this occurring is if
+  // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
+  // when dbname_ is "dir/db" but "dir" doesn't exist.
+  IOStatus io_s = fs->CreateDirIfMissing(dirname, IOOptions(), nullptr);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  return fs->NewDirectory(dirname, IOOptions(), directory, nullptr);
+}
+
+IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname,
+                                     const std::string& wal_dir,
+                                     const std::vector<DbPath>& data_paths) {
+  IOStatus io_s = DBImpl::CreateAndNewDirectory(fs, dbname, &db_dir_);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  if (!wal_dir.empty() && dbname != wal_dir) {
+    io_s = DBImpl::CreateAndNewDirectory(fs, wal_dir, &wal_dir_);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+  }
+
+  data_dirs_.clear();
+  for (auto& p : data_paths) {
+    const std::string db_path = p.path;
+    if (db_path == dbname) {
+      data_dirs_.emplace_back(nullptr);
+    } else {
+      std::unique_ptr<FSDirectory> path_directory;
+      io_s = DBImpl::CreateAndNewDirectory(fs, db_path, &path_directory);
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      data_dirs_.emplace_back(path_directory.release());
+    }
+  }
+  assert(data_dirs_.size() == data_paths.size());
+  return IOStatus::OK();
+}
+
+Status DBImpl::Recover(
+    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+    bool error_if_wal_file_exists, bool error_if_data_exists_in_wals,
+    uint64_t* recovered_seq, RecoveryContext* recovery_ctx) {
+  mutex_.AssertHeld();
+
+  bool is_new_db = false;
+  assert(db_lock_ == nullptr);
+  std::vector<std::string> files_in_dbname;
+  if (!read_only) {
+    Status s = directories_.SetDirectories(fs_.get(), dbname_,
+                                           immutable_db_options_.wal_dir,
+                                           immutable_db_options_.db_paths);
+    if (!s.ok()) {
+      return s;
+    }
+
+    s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    std::string current_fname = CurrentFileName(dbname_);
+    // Path to any MANIFEST file in the db dir. It does not matter which one.
+    // Since best-efforts recovery ignores the CURRENT file, the existence of
+    // a MANIFEST indicates that recovery should recover the existing db. If
+    // no MANIFEST can be found, a new db will be created.
+    std::string manifest_path;
+    if (!immutable_db_options_.best_efforts_recovery) {
+      s = env_->FileExists(current_fname);
+    } else {
+      s = Status::NotFound();
+      IOOptions io_opts;
+      io_opts.do_not_recurse = true;
+      Status io_s = immutable_db_options_.fs->GetChildren(
+          dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr);
+      if (!io_s.ok()) {
+        s = io_s;
+        files_in_dbname.clear();
+      }
+      for (const std::string& file : files_in_dbname) {
+        uint64_t number = 0;
+        FileType type = kWalFile;  // initialize
+        if (ParseFileName(file, &number, &type) && type == kDescriptorFile) {
+          uint64_t bytes;
+          s = env_->GetFileSize(DescriptorFileName(dbname_, number), &bytes);
+          if (s.ok() && bytes != 0) {
+            // Found a non-empty MANIFEST (descriptor log), thus best-efforts
+            // recovery does not have to treat the db as empty.
+            manifest_path = dbname_ + "/" + file;
+            break;
+          }
+        }
+      }
+    }
+    if (s.IsNotFound()) {
+      if (immutable_db_options_.create_if_missing) {
+        s = NewDB(&files_in_dbname);
+        is_new_db = true;
+        if (!s.ok()) {
+          return s;
+        }
+      } else {
+        return Status::InvalidArgument(
+            current_fname, "does not exist (create_if_missing is false)");
+      }
+    } else if (s.ok()) {
+      if (immutable_db_options_.error_if_exists) {
+        return Status::InvalidArgument(dbname_,
+                                       "exists (error_if_exists is true)");
+      }
+    } else {
+      // Unexpected error reading file
+      assert(s.IsIOError());
+      return s;
+    }
+    // Verify compatibility of file_options_ and filesystem
+    {
+      std::unique_ptr<FSRandomAccessFile> idfile;
+      FileOptions customized_fs(file_options_);
+      customized_fs.use_direct_reads |=
+          immutable_db_options_.use_direct_io_for_flush_and_compaction;
+      const std::string& fname =
+          manifest_path.empty() ? current_fname : manifest_path;
+      s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr);
+      if (!s.ok()) {
+        std::string error_str = s.ToString();
+        // Check if unsupported Direct I/O is the root cause
+        customized_fs.use_direct_reads = false;
+        s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr);
+        if (s.ok()) {
+          return Status::InvalidArgument(
+              "Direct I/O is not supported by the specified DB.");
+        } else {
+          return Status::InvalidArgument(
+              "Found options incompatible with filesystem", error_str.c_str());
+        }
+      }
+    }
+  } else if (immutable_db_options_.best_efforts_recovery) {
+    assert(files_in_dbname.empty());
+    IOOptions io_opts;
+    io_opts.do_not_recurse = true;
+    Status s = immutable_db_options_.fs->GetChildren(
+        dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr);
+    if (s.IsNotFound()) {
+      return Status::InvalidArgument(dbname_,
+                                     "does not exist (open for read only)");
+    } else if (s.IsIOError()) {
+      return s;
+    }
+    assert(s.ok());
+  }
+  assert(db_id_.empty());
+  Status s;
+  bool missing_table_file = false;
+  if (!immutable_db_options_.best_efforts_recovery) {
+    s = versions_->Recover(column_families, read_only, &db_id_);
+  } else {
+    assert(!files_in_dbname.empty());
+    s = versions_->TryRecover(column_families, read_only, files_in_dbname,
+                              &db_id_, &missing_table_file);
+    if (s.ok()) {
+      // TryRecover may delete the previous column_family_set_.
+      column_family_memtables_.reset(
+          new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+    }
+  }
+  if (!s.ok()) {
+    return s;
+  }
+  if (s.ok() && !read_only) {
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      // Try to trivially move files down the LSM tree to start from the
+      // bottommost level when level_compaction_dynamic_level_bytes is
+      // enabled. This should only be useful when a user is migrating to
+      // turning on this option. If a user is migrating from Level Compaction
+      // with a smaller level multiplier or from Universal Compaction, there
+      // may be too many non-empty levels and the trivial moves here are not
+      // sufficient for migration. Additional compactions are needed to drain
+      // unnecessary levels.
+      //
+      // Note that this step moves files down LSM without consulting
+      // SSTPartitioner. Further compactions are still needed if
+      // the user wants to partition SST files.
+      // Note that files moved in this step may not respect the compression
+      // option in target level.
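+      // A worked example (informal): with num_levels = 7 and files initially
+      // in L1 and L2 only, i.e. lsm_state [0,2,3,0,0,0,0], the loop below
+      // walks from_level from the bottom up and trivially moves L2 -> L6,
+      // then L1 -> L5, yielding [0,0,0,0,0,2,3] without rewriting any data.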
+      if (cfd->ioptions()->compaction_style ==
+              CompactionStyle::kCompactionStyleLevel &&
+          cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+          !cfd->GetLatestMutableCFOptions()->disable_auto_compactions) {
+        int to_level = cfd->ioptions()->num_levels - 1;
+        // last level is reserved
+        // allow_ingest_behind does not support Level Compaction,
+        // and per_key_placement can have an infinite compaction loop for
+        // Level Compaction. Adjust to_level here just to be safe.
+        if (cfd->ioptions()->allow_ingest_behind ||
+            cfd->ioptions()->preclude_last_level_data_seconds > 0) {
+          to_level -= 1;
+        }
+        // Whether this column family has a level trivially moved
+        bool moved = false;
+        // Fill the LSM starting from to_level and going up one level at a
+        // time. Some loop invariants (when last level is not reserved):
+        // - levels in (from_level, to_level] are empty, and
+        // - levels in (to_level, last_level] are non-empty.
+        for (int from_level = to_level; from_level >= 0; --from_level) {
+          const std::vector<FileMetaData*>& level_files =
+              cfd->current()->storage_info()->LevelFiles(from_level);
+          if (level_files.empty() || from_level == 0) {
+            continue;
+          }
+          assert(from_level <= to_level);
+          // Trivially move files from `from_level` to `to_level`
+          if (from_level < to_level) {
+            if (!moved) {
+              // lsm_state will look like "[1,2,3,4,5,6,0]" for an LSM with
+              // 7 levels
+              std::string lsm_state = "[";
+              for (int i = 0; i < cfd->ioptions()->num_levels; ++i) {
+                lsm_state += std::to_string(
+                    cfd->current()->storage_info()->NumLevelFiles(i));
+                if (i < cfd->ioptions()->num_levels - 1) {
+                  lsm_state += ",";
+                }
+              }
+              lsm_state += "]";
+              ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                             "[%s] Trivially move files down the LSM when open "
+                             "with level_compaction_dynamic_level_bytes=true,"
+                             " lsm_state: %s (Files are moved only if DB "
+                             "Recovery is successful).",
+                             cfd->GetName().c_str(), lsm_state.c_str());
+              moved = true;
+            }
+            ROCKS_LOG_WARN(
+                immutable_db_options_.info_log,
+                "[%s] Moving %zu files from level-%d to level-%d",
+                cfd->GetName().c_str(), level_files.size(), from_level,
+                to_level);
+            VersionEdit edit;
+            edit.SetColumnFamily(cfd->GetID());
+            for (const FileMetaData* f : level_files) {
+              edit.DeleteFile(from_level, f->fd.GetNumber());
+              edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(),
+                           f->fd.GetFileSize(), f->smallest, f->largest,
+                           f->fd.smallest_seqno, f->fd.largest_seqno,
+                           f->marked_for_compaction,
+                           f->temperature,  // this can be different from
+                                            // `last_level_temperature`
+                           f->oldest_blob_file_number, f->oldest_ancester_time,
+                           f->file_creation_time, f->epoch_number,
+                           f->file_checksum, f->file_checksum_func_name,
+                           f->unique_id, f->compensated_range_deletion_size,
+                           f->tail_size);
+              ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                             "[%s] Moving #%" PRIu64
+                             " from level-%d to level-%d %" PRIu64 " bytes\n",
+                             cfd->GetName().c_str(), f->fd.GetNumber(),
+                             from_level, to_level, f->fd.GetFileSize());
+            }
+            recovery_ctx->UpdateVersionEdits(cfd, edit);
+          }
+          --to_level;
+        }
+      }
+    }
+  }
+  s = SetupDBId(read_only, recovery_ctx);
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str());
+  if (s.ok() && !read_only) {
+    s = DeleteUnreferencedSstFiles(recovery_ctx);
+  }
+
+  if (immutable_db_options_.paranoid_checks && s.ok()) {
+    s = CheckConsistency();
+  }
+  if (s.ok() && !read_only) {
+    // TODO: share file descriptors (FSDirectory) with SetDirectories above
+    std::map<std::string, std::shared_ptr<FSDirectory>> created_dirs;
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      s = cfd->AddDirectories(&created_dirs);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  std::vector<std::string> files_in_wal_dir;
+  if (s.ok()) {
+    // Initial value of max_total_in_memory_state_ before recovering wals.
+    // Log recovery may check this value to decide whether to flush.
+    max_total_in_memory_state_ = 0;
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+      max_total_in_memory_state_ +=
+          mutable_cf_options->write_buffer_size *
+          mutable_cf_options->max_write_buffer_number;
+    }
+
+    SequenceNumber next_sequence(kMaxSequenceNumber);
+    default_cf_handle_ = new ColumnFamilyHandleImpl(
+        versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+    default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+
+    // Recover from all log files newer than the ones named in the
+    // descriptor (new log files may have been added by the previous
+    // incarnation without registering them in the descriptor).
+    //
+    // Note that prev_log_number() is no longer used, but we pay
+    // attention to it in case we are recovering a database
+    // produced by an older version of rocksdb.
+    auto wal_dir = immutable_db_options_.GetWalDir();
+    if (!immutable_db_options_.best_efforts_recovery) {
+      IOOptions io_opts;
+      io_opts.do_not_recurse = true;
+      s = immutable_db_options_.fs->GetChildren(
+          wal_dir, io_opts, &files_in_wal_dir, /*IODebugContext*=*/nullptr);
+    }
+    if (s.IsNotFound()) {
+      return Status::InvalidArgument("wal_dir not found", wal_dir);
+    } else if (!s.ok()) {
+      return s;
+    }
+
+    std::unordered_map<uint64_t, std::string> wal_files;
+    for (const auto& file : files_in_wal_dir) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type) && type == kWalFile) {
+        if (is_new_db) {
+          return Status::Corruption(
+              "While creating a new Db, wal_dir contains "
+              "existing log file: ",
+              file);
+        } else {
+          wal_files[number] = LogFileName(wal_dir, number);
+        }
+      }
+    }
+
+    if (immutable_db_options_.track_and_verify_wals_in_manifest) {
+      if (!immutable_db_options_.best_efforts_recovery) {
+        // Verify WALs in MANIFEST.
+        s = versions_->GetWalSet().CheckWals(env_, wal_files);
+      }  // else since best effort recovery does not recover from WALs, no
+         // need to check WALs.
+    } else if (!versions_->GetWalSet().GetWals().empty()) {
+      // Tracking is disabled, clear previously tracked WALs from MANIFEST;
+      // otherwise, in the future, if WAL tracking is enabled again,
+      // since the WALs deleted when WAL tracking is disabled are not
+      // persisted into MANIFEST, the WAL check may fail.
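+      // (DeleteWalsBefore(n) records in the MANIFEST that all tracked WALs
+      // numbered below n are obsolete, so using max_wal_number + 1 clears
+      // every previously tracked WAL in a single version edit.)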
+      VersionEdit edit;
+      WalNumber max_wal_number =
+          versions_->GetWalSet().GetWals().rbegin()->first;
+      edit.DeleteWalsBefore(max_wal_number + 1);
+      assert(recovery_ctx != nullptr);
+      assert(versions_->GetColumnFamilySet() != nullptr);
+      recovery_ctx->UpdateVersionEdits(
+          versions_->GetColumnFamilySet()->GetDefault(), edit);
+    }
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (!wal_files.empty()) {
+      if (error_if_wal_file_exists) {
+        return Status::Corruption(
+            "The db was opened in readonly mode with the "
+            "error_if_wal_file_exists flag but a WAL file already exists");
+      } else if (error_if_data_exists_in_wals) {
+        for (auto& wal_file : wal_files) {
+          uint64_t bytes;
+          s = env_->GetFileSize(wal_file.second, &bytes);
+          if (s.ok()) {
+            if (bytes > 0) {
+              return Status::Corruption(
+                  "error_if_data_exists_in_wals is set but there is data "
+                  "in WAL files.");
+            }
+          }
+        }
+      }
+    }
+
+    if (!wal_files.empty()) {
+      // Recover in the order in which the wals were generated
+      std::vector<uint64_t> wals;
+      wals.reserve(wal_files.size());
+      for (const auto& wal_file : wal_files) {
+        wals.push_back(wal_file.first);
+      }
+      std::sort(wals.begin(), wals.end());
+
+      bool corrupted_wal_found = false;
+      s = RecoverLogFiles(wals, &next_sequence, read_only,
+                          &corrupted_wal_found, recovery_ctx);
+      if (corrupted_wal_found && recovered_seq != nullptr) {
+        *recovered_seq = next_sequence;
+      }
+      if (!s.ok()) {
+        // Clear memtables if recovery failed
+        for (auto cfd : *versions_->GetColumnFamilySet()) {
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 kMaxSequenceNumber);
+        }
+      }
+    }
+  }
+
+  if (read_only) {
+    // If we are opening as read-only, we need to update options_file_number_
+    // to reflect the most recent OPTIONS file. It does not matter for
+    // regular read-write db instances because options_file_number_ will
+    // later be updated to versions_->NewFileNumber() in
+    // RenameTempFileToOptionsFile.
+    std::vector<std::string> filenames;
+    if (s.ok()) {
+      const std::string normalized_dbname = NormalizePath(dbname_);
+      const std::string normalized_wal_dir =
+          NormalizePath(immutable_db_options_.GetWalDir());
+      if (immutable_db_options_.best_efforts_recovery) {
+        filenames = std::move(files_in_dbname);
+      } else if (normalized_dbname == normalized_wal_dir) {
+        filenames = std::move(files_in_wal_dir);
+      } else {
+        IOOptions io_opts;
+        io_opts.do_not_recurse = true;
+        s = immutable_db_options_.fs->GetChildren(
+            GetName(), io_opts, &filenames, /*IODebugContext*=*/nullptr);
+      }
+    }
+    if (s.ok()) {
+      uint64_t number = 0;
+      uint64_t options_file_number = 0;
+      FileType type;
+      for (const auto& fname : filenames) {
+        if (ParseFileName(fname, &number, &type) && type == kOptionsFile) {
+          options_file_number = std::max(number, options_file_number);
+        }
+      }
+      versions_->options_file_number_ = options_file_number;
+      uint64_t options_file_size = 0;
+      if (options_file_number > 0) {
+        s = env_->GetFileSize(OptionsFileName(GetName(), options_file_number),
+                              &options_file_size);
+      }
+      versions_->options_file_size_ = options_file_size;
+    }
+  }
+  return s;
+}
+
+Status DBImpl::PersistentStatsProcessFormatVersion() {
+  mutex_.AssertHeld();
+  Status s;
+  // persist the version when the stats CF doesn't exist
+  bool should_persist_format_version = !persistent_stats_cfd_exists_;
+  mutex_.Unlock();
+  if (persistent_stats_cfd_exists_) {
+    // Check persistent stats format version compatibility. Drop and recreate
+    // the persistent stats CF if the format version is incompatible.
+    uint64_t format_version_recovered = 0;
+    Status s_format = DecodePersistentStatsVersionNumber(
+        this, StatsVersionKeyType::kFormatVersion, &format_version_recovered);
+    uint64_t compatible_version_recovered = 0;
+    Status s_compatible = DecodePersistentStatsVersionNumber(
+        this, StatsVersionKeyType::kCompatibleVersion,
+        &compatible_version_recovered);
+    // abort reading from the existing stats CF if any of the following is
+    // true:
+    // 1. failed to read the format version or compatible version from disk
+    // 2. the sst's format version is greater than the current format
+    //    version, meaning this sst is encoded with a newer RocksDB release,
+    //    and the current compatible version is below the sst's compatible
+    //    version
+    if (!s_format.ok() || !s_compatible.ok() ||
+        (kStatsCFCurrentFormatVersion < format_version_recovered &&
+         kStatsCFCompatibleFormatVersion < compatible_version_recovered)) {
+      if (!s_format.ok() || !s_compatible.ok()) {
+        ROCKS_LOG_WARN(
+            immutable_db_options_.info_log,
+            "Recreating persistent stats column family since reading "
+            "persistent stats version key failed. Format key: %s, compatible "
+            "key: %s",
+            s_format.ToString().c_str(), s_compatible.ToString().c_str());
+      } else {
+        ROCKS_LOG_WARN(
+            immutable_db_options_.info_log,
+            "Recreating persistent stats column family due to corrupted or "
+            "incompatible format version. Recovered format: %" PRIu64
+            "; recovered format compatible since: %" PRIu64 "\n",
+            format_version_recovered, compatible_version_recovered);
+      }
+      s = DropColumnFamily(persist_stats_cf_handle_);
+      if (s.ok()) {
+        s = DestroyColumnFamilyHandle(persist_stats_cf_handle_);
+      }
+      ColumnFamilyHandle* handle = nullptr;
+      if (s.ok()) {
+        ColumnFamilyOptions cfo;
+        OptimizeForPersistentStats(&cfo);
+        s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+      }
+      if (s.ok()) {
+        persist_stats_cf_handle_ =
+            static_cast<ColumnFamilyHandleImpl*>(handle);
+        // should also persist the version here because the old stats CF is
+        // discarded
+        should_persist_format_version = true;
+      }
+    }
+  }
+  if (should_persist_format_version) {
+    // Persistent stats CF being created for the first time, need to write
+    // the format version key
+    WriteBatch batch;
+    if (s.ok()) {
+      s = batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString,
+                    std::to_string(kStatsCFCurrentFormatVersion));
+    }
+    if (s.ok()) {
+      s = batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString,
+                    std::to_string(kStatsCFCompatibleFormatVersion));
+    }
+    if (s.ok()) {
+      WriteOptions wo;
+      wo.low_pri = true;
+      wo.no_slowdown = true;
+      wo.sync = false;
+      s = Write(wo, &batch);
+    }
+  }
+  mutex_.Lock();
+  return s;
+}
+
+Status DBImpl::InitPersistStatsColumnFamily() {
+  mutex_.AssertHeld();
+  assert(!persist_stats_cf_handle_);
+  ColumnFamilyData* persistent_stats_cfd =
+      versions_->GetColumnFamilySet()->GetColumnFamily(
+          kPersistentStatsColumnFamilyName);
+  persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr;
+
+  Status s;
+  if (persistent_stats_cfd != nullptr) {
+    // We are recovering from a DB which already contains the persistent
+    // stats CF; the CF is already created in
+    // VersionSet::ApplyOneVersionEdit, but the column family handle was not.
+    // Need to explicitly create the handle here.
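+    // (The handle is constructed directly rather than via
+    // CreateColumnFamily() because the CF itself already exists in the
+    // VersionSet; only the in-memory handle wrapper is missing here.)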
+ persist_stats_cf_handle_ = + new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_); + } else { + mutex_.Unlock(); + ColumnFamilyHandle* handle = nullptr; + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + persist_stats_cf_handle_ = static_cast(handle); + mutex_.Lock(); + } + return s; +} + +Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) { + mutex_.AssertHeld(); + assert(versions_->descriptor_log_ == nullptr); + const ReadOptions read_options(Env::IOActivity::kDBOpen); + Status s = versions_->LogAndApply( + recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_, read_options, + recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir()); + if (s.ok() && !(recovery_ctx.files_to_delete_.empty())) { + mutex_.Unlock(); + for (const auto& fname : recovery_ctx.files_to_delete_) { + s = env_->DeleteFile(fname); + if (!s.ok()) { + break; + } + } + mutex_.Lock(); + } + return s; +} + +void DBImpl::InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap() { + if (immutable_db_options_.wal_filter == nullptr) { + return; + } + assert(immutable_db_options_.wal_filter != nullptr); + WalFilter& wal_filter = *(immutable_db_options_.wal_filter); + + std::map cf_name_id_map; + std::map cf_lognumber_map; + assert(versions_); + assert(versions_->GetColumnFamilySet()); + for (auto cfd : *versions_->GetColumnFamilySet()) { + assert(cfd); + cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID())); + cf_lognumber_map.insert(std::make_pair(cfd->GetID(), cfd->GetLogNumber())); + } + + wal_filter.ColumnFamilyLogNumberMap(cf_lognumber_map, cf_name_id_map); +} + +bool DBImpl::InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number, + const std::string& wal_fname, + log::Reader::Reporter& reporter, + Status& status, + bool& stop_replay, + WriteBatch& batch) { + if (immutable_db_options_.wal_filter == nullptr) { + return true; + } + assert(immutable_db_options_.wal_filter != nullptr); + WalFilter& wal_filter = *(immutable_db_options_.wal_filter); + + WriteBatch new_batch; + bool batch_changed = false; + + bool process_current_record = true; + + WalFilter::WalProcessingOption wal_processing_option = + wal_filter.LogRecordFound(wal_number, wal_fname, batch, &new_batch, + &batch_changed); + + switch (wal_processing_option) { + case WalFilter::WalProcessingOption::kContinueProcessing: + // do nothing, proceeed normally + break; + case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: + // skip current record + process_current_record = false; + break; + case WalFilter::WalProcessingOption::kStopReplay: + // skip current record and stop replay + process_current_record = false; + stop_replay = true; + break; + case WalFilter::WalProcessingOption::kCorruptedRecord: { + status = Status::Corruption("Corruption reported by Wal Filter ", + wal_filter.Name()); + MaybeIgnoreError(&status); + if (!status.ok()) { + process_current_record = false; + reporter.Corruption(batch.GetDataSize(), status); + } + break; + } + default: { + // logical error which should not happen. If RocksDB throws, we would + // just do `throw std::logic_error`. + assert(false); + status = Status::NotSupported( + "Unknown WalProcessingOption returned by Wal Filter ", + wal_filter.Name()); + MaybeIgnoreError(&status); + if (!status.ok()) { + // Ignore the error with current record processing. 
+ stop_replay = true; + } + break; + } + } + + if (!process_current_record) { + return false; + } + + if (batch_changed) { + // Make sure that the count in the new batch is + // within the orignal count. + int new_count = WriteBatchInternal::Count(&new_batch); + int original_count = WriteBatchInternal::Count(&batch); + if (new_count > original_count) { + ROCKS_LOG_FATAL( + immutable_db_options_.info_log, + "Recovering log #%" PRIu64 + " mode %d log filter %s returned " + "more records (%d) than original (%d) which is not allowed. " + "Aborting recovery.", + wal_number, static_cast(immutable_db_options_.wal_recovery_mode), + wal_filter.Name(), new_count, original_count); + status = Status::NotSupported( + "More than original # of records " + "returned by Wal Filter ", + wal_filter.Name()); + return false; + } + // Set the same sequence number in the new_batch + // as the original batch. + WriteBatchInternal::SetSequence(&new_batch, + WriteBatchInternal::Sequence(&batch)); + batch = new_batch; + } + return true; +} + +// REQUIRES: wal_numbers are sorted in ascending order +Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, + SequenceNumber* next_sequence, bool read_only, + bool* corrupted_wal_found, + RecoveryContext* recovery_ctx) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + const char* fname; + Status* status; // nullptr if immutable_db_options_.paranoid_checks==false + void Corruption(size_t bytes, const Status& s) override { + ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s", + (status == nullptr ? "(ignoring error) " : ""), fname, + static_cast(bytes), s.ToString().c_str()); + if (status != nullptr && status->ok()) { + *status = s; + } + } + }; + + mutex_.AssertHeld(); + Status status; + std::unordered_map version_edits; + // no need to refcount because iteration is under mutex + for (auto cfd : *versions_->GetColumnFamilySet()) { + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + version_edits.insert({cfd->GetID(), edit}); + } + int job_id = next_job_id_.fetch_add(1); + { + auto stream = event_logger_.Log(); + stream << "job" << job_id << "event" + << "recovery_started"; + stream << "wal_files"; + stream.StartArray(); + for (auto wal_number : wal_numbers) { + stream << wal_number; + } + stream.EndArray(); + } + + // No-op for immutable_db_options_.wal_filter == nullptr. + InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap(); + + bool stop_replay_by_wal_filter = false; + bool stop_replay_for_corruption = false; + bool flushed = false; + uint64_t corrupted_wal_number = kMaxSequenceNumber; + uint64_t min_wal_number = MinLogNumberToKeep(); + if (!allow_2pc()) { + // In non-2pc mode, we skip WALs that do not back unflushed data. + min_wal_number = + std::max(min_wal_number, versions_->MinLogNumberWithUnflushedData()); + } + for (auto wal_number : wal_numbers) { + if (wal_number < min_wal_number) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Skipping log #%" PRIu64 + " since it is older than min log to keep #%" PRIu64, + wal_number, min_wal_number); + continue; + } + // The previous incarnation may not have written any MANIFEST + // records after allocating this log number. So we manually + // update the file number allocation counter in VersionSet. 
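+    // Without this bump, VersionSet::NewFileNumber() could later hand out a
+    // file number that collides with this WAL.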
+ versions_->MarkFileNumberUsed(wal_number); + // Open the log file + std::string fname = + LogFileName(immutable_db_options_.GetWalDir(), wal_number); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Recovering log #%" PRIu64 " mode %d", wal_number, + static_cast(immutable_db_options_.wal_recovery_mode)); + auto logFileDropped = [this, &fname]() { + uint64_t bytes; + if (env_->GetFileSize(fname, &bytes).ok()) { + auto info_log = immutable_db_options_.info_log.get(); + ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(), + static_cast(bytes)); + } + }; + if (stop_replay_by_wal_filter) { + logFileDropped(); + continue; + } + + std::unique_ptr file_reader; + { + std::unique_ptr file; + status = fs_->NewSequentialFile( + fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr); + if (!status.ok()) { + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } else { + // Fail with one log file, but that's ok. + // Try next one. + continue; + } + } + file_reader.reset(new SequentialFileReader( + std::move(file), fname, immutable_db_options_.log_readahead_size, + io_tracer_)); + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = immutable_db_options_.info_log.get(); + reporter.fname = fname.c_str(); + if (!immutable_db_options_.paranoid_checks || + immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kSkipAnyCorruptedRecords) { + reporter.status = nullptr; + } else { + reporter.status = &status; + } + // We intentially make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). + log::Reader reader(immutable_db_options_.info_log, std::move(file_reader), + &reporter, true /*checksum*/, wal_number); + + // Determine if we should tolerate incomplete records at the tail end of the + // Read all the records and add to a memtable + std::string scratch; + Slice record; + + TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal", + /*arg=*/nullptr); + uint64_t record_checksum; + while (!stop_replay_by_wal_filter && + reader.ReadRecord(&record, &scratch, + immutable_db_options_.wal_recovery_mode, + &record_checksum) && + status.ok()) { + if (record.size() < WriteBatchInternal::kHeader) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + continue; + } + + // We create a new batch and initialize with a valid prot_info_ to store + // the data checksums + WriteBatch batch; + + status = WriteBatchInternal::SetContents(&batch, record); + if (!status.ok()) { + return status; + } + TEST_SYNC_POINT_CALLBACK( + "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", &batch); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum", + &record_checksum); + status = WriteBatchInternal::UpdateProtectionInfo( + &batch, 8 /* bytes_per_key */, &record_checksum); + if (!status.ok()) { + return status; + } + + SequenceNumber sequence = WriteBatchInternal::Sequence(&batch); + + if (immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kPointInTimeRecovery) { + // In point-in-time recovery mode, if sequence id of log files are + // consecutive, we continue recovery despite corruption. This could + // happen when we open and write to a corrupted DB, where sequence id + // will start from the last sequence id we recovered. 
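+      // Worked example: if the last applied batch ended at seqno 41, then
+      // *next_sequence is 42; a batch that starts at 42 proves the WALs are
+      // seqno-consecutive, so replay may resume after an earlier corruption.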
+ if (sequence == *next_sequence) { + stop_replay_for_corruption = false; + } + if (stop_replay_for_corruption) { + logFileDropped(); + break; + } + } + + // For the default case of wal_filter == nullptr, always performs no-op + // and returns true. + if (!InvokeWalFilterIfNeededOnWalRecord(wal_number, fname, reporter, + status, stop_replay_by_wal_filter, + batch)) { + continue; + } + + // If column family was not found, it might mean that the WAL write + // batch references to the column family that was dropped after the + // insert. We don't want to fail the whole write batch in that case -- + // we just ignore the update. + // That's why we set ignore missing column families to true + bool has_valid_writes = false; + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), &flush_scheduler_, + &trim_history_scheduler_, true, wal_number, this, + false /* concurrent_memtable_writes */, next_sequence, + &has_valid_writes, seq_per_batch_, batch_per_txn_); + MaybeIgnoreError(&status); + if (!status.ok()) { + // We are treating this as a failure while reading since we read valid + // blocks that do not form coherent data + reporter.Corruption(record.size(), status); + continue; + } + + if (has_valid_writes && !read_only) { + // we can do this because this is called before client has access to the + // DB and there is only a single thread operating on DB + ColumnFamilyData* cfd; + + while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { + cfd->UnrefAndTryDelete(); + // If this asserts, it means that InsertInto failed in + // filtering updates to already-flushed column families + assert(cfd->GetLogNumber() <= wal_number); + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + VersionEdit* edit = &iter->second; + status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit); + if (!status.ok()) { + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + return status; + } + flushed = true; + + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), + *next_sequence); + } + } + } + + if (!status.ok()) { + if (status.IsNotSupported()) { + // We should not treat NotSupported as corruption. It is rather a clear + // sign that we are processing a WAL that is produced by an incompatible + // version of the code. + return status; + } + if (immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kSkipAnyCorruptedRecords) { + // We should ignore all errors unconditionally + status = Status::OK(); + } else if (immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kPointInTimeRecovery) { + if (status.IsIOError()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "IOError during point-in-time reading log #%" PRIu64 + " seq #%" PRIu64 + ". %s. 
This likely means loss of synced WAL, "
+                          "thus recovery fails.",
+                          wal_number, *next_sequence,
+                          status.ToString().c_str());
+          return status;
+        }
+        // We should ignore the error but not continue replaying.
+        status = Status::OK();
+        stop_replay_for_corruption = true;
+        corrupted_wal_number = wal_number;
+        if (corrupted_wal_found != nullptr) {
+          *corrupted_wal_found = true;
+        }
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "Point in time recovered to log #%" PRIu64
+                       " seq #%" PRIu64,
+                       wal_number, *next_sequence);
+      } else {
+        assert(immutable_db_options_.wal_recovery_mode ==
+                   WALRecoveryMode::kTolerateCorruptedTailRecords ||
+               immutable_db_options_.wal_recovery_mode ==
+                   WALRecoveryMode::kAbsoluteConsistency);
+        return status;
+      }
+    }
+
+    flush_scheduler_.Clear();
+    trim_history_scheduler_.Clear();
+    auto last_sequence = *next_sequence - 1;
+    if ((*next_sequence != kMaxSequenceNumber) &&
+        (versions_->LastSequence() <= last_sequence)) {
+      versions_->SetLastAllocatedSequence(last_sequence);
+      versions_->SetLastPublishedSequence(last_sequence);
+      versions_->SetLastSequence(last_sequence);
+    }
+  }
+  // Compare the corrupted log number to each column family's current log
+  // number. Abort Open() if any column family's log number is greater than
+  // the corrupted log number, which means the CF contains data beyond the
+  // point of corruption. This can happen during PIT recovery when the WAL is
+  // corrupted and some (but not all) CFs are flushed.
+  // Exclude the PIT case where no log is dropped after the corruption point.
+  // This covers the case of empty WALs after a corrupted log, in which we
+  // don't reset stop_replay_for_corruption.
+  if (stop_replay_for_corruption == true &&
+      (immutable_db_options_.wal_recovery_mode ==
+           WALRecoveryMode::kPointInTimeRecovery ||
+       immutable_db_options_.wal_recovery_mode ==
+           WALRecoveryMode::kTolerateCorruptedTailRecords)) {
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      // One special case causes cfd->GetLogNumber() > corrupted_wal_number
+      // while the CF is still consistent: if a new column family is created
+      // during the flush and the WAL sync fails at the same time, the new CF
+      // points to the new WAL but the old WAL is corrupted. Since the new CF
+      // is empty, it is still consistent. We add the check of CF SST file
+      // size to avoid a false positive alert.
+
+      // Note that the check of (cfd->GetLiveSstFilesSize() > 0) may lead to
+      // ignoring a very rare inconsistency case caused by data deletion: a
+      // CF can be empty because of KV deletions, but those operations are in
+      // the WAL. If the WAL is corrupted, the state of this CF might not be
+      // consistent with the others, yet the consistency check is bypassed
+      // because the CF is empty.
+      // TODO: a better and complete implementation is needed to ensure a
+      // strict consistency check in WAL recovery, including handling the
+      // tailing issues.
+ if (cfd->GetLogNumber() > corrupted_wal_number && + cfd->GetLiveSstFilesSize() > 0) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Column family inconsistency: SST file contains data" + " beyond the point of corruption."); + return Status::Corruption("SST file is ahead of WALs in CF " + + cfd->GetName()); + } + } + } + + // True if there's any data in the WALs; if not, we can skip re-processing + // them later + bool data_seen = false; + if (!read_only) { + // no need to refcount since client still doesn't have access + // to the DB and can not drop column families while we iterate + const WalNumber max_wal_number = wal_numbers.back(); + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + VersionEdit* edit = &iter->second; + + if (cfd->GetLogNumber() > max_wal_number) { + // Column family cfd has already flushed the data + // from all wals. Memtable has to be empty because + // we filter the updates based on wal_number + // (in WriteBatch::InsertInto) + assert(cfd->mem()->GetFirstSequenceNumber() == 0); + assert(edit->NumEntries() == 0); + continue; + } + + TEST_SYNC_POINT_CALLBACK( + "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr); + + // flush the final memtable (if non-empty) + if (cfd->mem()->GetFirstSequenceNumber() != 0) { + // If flush happened in the middle of recovery (e.g. due to memtable + // being full), we flush at the end. Otherwise we'll need to record + // where we were on last flush, which make the logic complicated. + if (flushed || !immutable_db_options_.avoid_flush_during_recovery) { + status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit); + if (!status.ok()) { + // Recovery failed + break; + } + flushed = true; + + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), + versions_->LastSequence()); + } + data_seen = true; + } + + // Update the log number info in the version edit corresponding to this + // column family. Note that the version edits will be written to MANIFEST + // together later. + // writing wal_number in the manifest means that any log file + // with number strongly less than (wal_number + 1) is already + // recovered and should be ignored on next reincarnation. + // Since we already recovered max_wal_number, we want all wals + // with numbers `<= max_wal_number` (includes this one) to be ignored + if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) { + edit->SetLogNumber(max_wal_number + 1); + } + } + if (status.ok()) { + // we must mark the next log number as used, even though it's + // not actually used. that is because VersionSet assumes + // VersionSet::next_file_number_ always to be strictly greater than any + // log number + versions_->MarkFileNumberUsed(max_wal_number + 1); + assert(recovery_ctx != nullptr); + + for (auto* cfd : *versions_->GetColumnFamilySet()) { + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + recovery_ctx->UpdateVersionEdits(cfd, iter->second); + } + + if (flushed || !data_seen) { + VersionEdit wal_deletion; + if (immutable_db_options_.track_and_verify_wals_in_manifest) { + wal_deletion.DeleteWalsBefore(max_wal_number + 1); + } + if (!allow_2pc()) { + // In non-2pc mode, flushing the memtables of the column families + // means we can advance min_log_number_to_keep. 
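+          // (In 2PC mode this would be unsafe: prepared-but-uncommitted
+          // transactions may still live only in older WALs, hence the
+          // allow_2pc() guard above.)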
+          wal_deletion.SetMinLogNumberToKeep(max_wal_number + 1);
+        }
+        assert(versions_->GetColumnFamilySet() != nullptr);
+        recovery_ctx->UpdateVersionEdits(
+            versions_->GetColumnFamilySet()->GetDefault(), wal_deletion);
+      }
+    }
+  }
+
+  if (status.ok()) {
+    if (data_seen && !flushed) {
+      status = RestoreAliveLogFiles(wal_numbers);
+    } else if (!wal_numbers.empty()) {
+      // If there's no data in the WAL, or we flushed all the data, still
+      // truncate the log file. If the process goes into a crash loop before
+      // the file is deleted, the preallocated space will never get freed.
+      const bool truncate = !read_only;
+      GetLogSizeAndMaybeTruncate(wal_numbers.back(), truncate, nullptr)
+          .PermitUncheckedError();
+    }
+  }
+
+  event_logger_.Log() << "job" << job_id << "event"
+                      << "recovery_finished";
+
+  return status;
+}
+
+Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
+                                          LogFileNumberSize* log_ptr) {
+  LogFileNumberSize log(wal_number);
+  std::string fname =
+      LogFileName(immutable_db_options_.GetWalDir(), wal_number);
+  Status s;
+  // This gets the apparent size of the WALs, not including preallocated
+  // space.
+  s = env_->GetFileSize(fname, &log.size);
+  TEST_SYNC_POINT_CALLBACK("DBImpl::GetLogSizeAndMaybeTruncate:0", /*arg=*/&s);
+  if (s.ok() && truncate) {
+    std::unique_ptr<FSWritableFile> last_log;
+    Status truncate_status = fs_->ReopenWritableFile(
+        fname,
+        fs_->OptimizeForLogWrite(
+            file_options_,
+            BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+        &last_log, nullptr);
+    if (truncate_status.ok()) {
+      truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr);
+    }
+    if (truncate_status.ok()) {
+      truncate_status = last_log->Close(IOOptions(), nullptr);
+    }
+    // Not a critical error if we fail to truncate.
+    if (!truncate_status.ok() && !truncate_status.IsNotSupported()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Failed to truncate log #%" PRIu64 ": %s", wal_number,
+                     truncate_status.ToString().c_str());
+    }
+  }
+  if (log_ptr) {
+    *log_ptr = log;
+  }
+  return s;
+}
+
+Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
+  if (wal_numbers.empty()) {
+    return Status::OK();
+  }
+  Status s;
+  mutex_.AssertHeld();
+  assert(immutable_db_options_.avoid_flush_during_recovery);
+  // Mark these as alive so they'll be considered for deletion later by
+  // FindObsoleteFiles()
+  total_log_size_ = 0;
+  log_empty_ = false;
+  uint64_t min_wal_with_unflushed_data =
+      versions_->MinLogNumberWithUnflushedData();
+  for (auto wal_number : wal_numbers) {
+    if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) {
+      // In non-2pc mode, WAL files that do not back unflushed data are not
+      // alive, and thus should not be added to alive_log_files_.
+      continue;
+    }
+    // We preallocate space for WALs, but after a crash and restart that
+    // preallocated space is no longer needed. It is likely that only the
+    // last log has such preallocated space, so we only truncate the last
+    // log.
+    LogFileNumberSize log;
+    s = GetLogSizeAndMaybeTruncate(
+        wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log);
+    if (!s.ok()) {
+      break;
+    }
+    total_log_size_ += log.size;
+    alive_log_files_.push_back(log);
+  }
+  return s;
+}
+
+Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+                                           MemTable* mem, VersionEdit* edit) {
+  mutex_.AssertHeld();
+  assert(cfd);
+  assert(cfd->imm());
+  // The immutable memtable list must be empty.
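+  // (Editorial note: GetEarliestMemTableID() reports max uint64 exactly when
+  // the immutable list is empty, which is what the assert below checks.)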
+  assert(std::numeric_limits<uint64_t>::max() ==
+         cfd->imm()->GetEarliestMemTableID());
+
+  const uint64_t start_micros = immutable_db_options_.clock->NowMicros();
+
+  FileMetaData meta;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+      new std::list<uint64_t>::iterator(
+          CaptureCurrentFileNumberInPendingOutputs()));
+  meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+  ReadOptions ro;
+  ro.total_order_seek = true;
+  ro.io_activity = Env::IOActivity::kDBOpen;
+  Arena arena;
+  Status s;
+  TableProperties table_properties;
+  {
+    ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                    "[%s] [WriteLevel0TableForRecovery]"
+                    " Level-0 table #%" PRIu64 ": started",
+                    cfd->GetName().c_str(), meta.fd.GetNumber());
+
+    // Get the latest mutable cf options while the mutex is still locked
+    const MutableCFOptions mutable_cf_options =
+        *cfd->GetLatestMutableCFOptions();
+    bool paranoid_file_checks =
+        cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
+
+    int64_t _current_time = 0;
+    immutable_db_options_.clock->GetCurrentTime(&_current_time)
+        .PermitUncheckedError();  // ignore error
+    const uint64_t current_time = static_cast<uint64_t>(_current_time);
+    meta.oldest_ancester_time = current_time;
+    meta.epoch_number = cfd->NewEpochNumber();
+    {
+      auto write_hint = cfd->CalculateSSTWriteHint(0);
+      mutex_.Unlock();
+
+      SequenceNumber earliest_write_conflict_snapshot;
+      std::vector<SequenceNumber> snapshot_seqs =
+          snapshots_.GetAll(&earliest_write_conflict_snapshot);
+      auto snapshot_checker = snapshot_checker_.get();
+      if (use_custom_gc_ && snapshot_checker == nullptr) {
+        snapshot_checker = DisableGCSnapshotChecker::Instance();
+      }
+      std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+          range_del_iters;
+      auto range_del_iter =
+          // This is called during recovery, where a live memtable is flushed
+          // directly. In this case, no fragmented tombstone list is cached in
+          // this memtable yet.
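+          // Hence immutable_memtable=false below: tombstones are read
+          // straight from the live memtable rather than from a cached
+          // fragmented list.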
+ mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber, + false /* immutable_memtable */); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } + + IOStatus io_s; + TableBuilderOptions tboptions( + *cfd->ioptions(), mutable_cf_options, cfd->internal_comparator(), + cfd->int_tbl_prop_collector_factories(), + GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), + mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(), + 0 /* level */, false /* is_bottommost */, + TableFileCreationReason::kRecovery, 0 /* oldest_key_time */, + 0 /* file_creation_time */, db_id_, db_session_id_, + 0 /* target_file_size */, meta.fd.GetNumber()); + SeqnoToTimeMapping empty_seqno_time_mapping; + Version* version = cfd->current(); + version->Ref(); + const ReadOptions read_option(Env::IOActivity::kDBOpen); + s = BuildTable( + dbname_, versions_.get(), immutable_db_options_, tboptions, + file_options_for_compaction_, read_option, cfd->table_cache(), + iter.get(), std::move(range_del_iters), &meta, &blob_file_additions, + snapshot_seqs, earliest_write_conflict_snapshot, kMaxSequenceNumber, + snapshot_checker, paranoid_file_checks, cfd->internal_stats(), &io_s, + io_tracer_, BlobFileCreationReason::kRecovery, + empty_seqno_time_mapping, &event_logger_, job_id, Env::IO_HIGH, + nullptr /* table_properties */, write_hint, + nullptr /*full_history_ts_low*/, &blob_callback_, version); + version->Unref(); + LogFlush(immutable_db_options_.info_log); + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] [WriteLevel0TableForRecovery]" + " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", + cfd->GetName().c_str(), meta.fd.GetNumber(), + meta.fd.GetFileSize(), s.ToString().c_str()); + mutex_.Lock(); + + // TODO(AR) is this ok? + if (!io_s.ok() && s.ok()) { + s = io_s; + } + } + } + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + + // Note that if file_size is zero, the file has been deleted and + // should not be added to the manifest. 
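+  // (Editorial note: this relies on BuildTable cleaning up an output file
+  // that received no entries, leaving the recorded size at zero.)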
+ const bool has_output = meta.fd.GetFileSize() > 0; + + constexpr int level = 0; + + if (s.ok() && has_output) { + edit->AddFile( + level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), + meta.smallest, meta.largest, meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature, + meta.oldest_blob_file_number, meta.oldest_ancester_time, + meta.file_creation_time, meta.epoch_number, meta.file_checksum, + meta.file_checksum_func_name, meta.unique_id, + meta.compensated_range_deletion_size, meta.tail_size); + + for (const auto& blob : blob_file_additions) { + edit->AddBlobFile(blob); + } + } + + InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); + stats.micros = immutable_db_options_.clock->NowMicros() - start_micros; + + if (has_output) { + stats.bytes_written = meta.fd.GetFileSize(); + stats.num_output_files = 1; + } + + const auto& blobs = edit->GetBlobFileAdditions(); + for (const auto& blob : blobs) { + stats.bytes_written_blob += blob.GetTotalBlobBytes(); + } + + stats.num_output_files_blob = static_cast(blobs.size()); + + cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats); + cfd->internal_stats()->AddCFStats( + InternalStats::BYTES_FLUSHED, + stats.bytes_written + stats.bytes_written_blob); + RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); + return s; +} + +Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + if (db_options.persist_stats_to_disk) { + column_families.push_back( + ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options)); + } + std::vector handles; + Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); + if (s.ok()) { + if (db_options.persist_stats_to_disk) { + assert(handles.size() == 2); + } else { + assert(handles.size() == 1); + } + // i can delete the handle since DBImpl is always holding a reference to + // default column family + if (db_options.persist_stats_to_disk && handles[1] != nullptr) { + delete handles[1]; + } + delete handles[0]; + } + return s; +} + +Status DB::Open(const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr) { + const bool kSeqPerBatch = true; + const bool kBatchPerTxn = true; + ThreadStatusUtil::SetEnableTracking(db_options.enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_DBOPEN); + Status s = DBImpl::Open(db_options, dbname, column_families, handles, dbptr, + !kSeqPerBatch, kBatchPerTxn); + ThreadStatusUtil::ResetThreadStatus(); + return s; +} + +// TODO: Implement the trimming in flush code path. +// TODO: Perform trimming before inserting into memtable during recovery. +// TODO: Pick files with max_timestamp > trim_ts by each file's timestamp meta +// info, and handle only these files to reduce io. 
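+// A usage sketch (editorial; `cf_descs` and `trim_ts` are hypothetical
+// caller-side names). Only column families whose comparator has a non-zero
+// timestamp size are compacted:
+//
+//   std::vector<ColumnFamilyHandle*> handles;
+//   DB* db = nullptr;
+//   Status s = DB::OpenAndTrimHistory(db_options, dbname, cf_descs,
+//                                     &handles, &db, trim_ts);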
+Status DB::OpenAndTrimHistory( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + std::string trim_ts) { + assert(dbptr != nullptr); + assert(handles != nullptr); + auto validate_options = [&db_options] { + if (db_options.avoid_flush_during_recovery) { + return Status::InvalidArgument( + "avoid_flush_during_recovery incompatible with " + "OpenAndTrimHistory"); + } + return Status::OK(); + }; + auto s = validate_options(); + if (!s.ok()) { + return s; + } + + DB* db = nullptr; + s = DB::Open(db_options, dbname, column_families, handles, &db); + if (!s.ok()) { + return s; + } + assert(db); + CompactRangeOptions options; + options.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + auto db_impl = static_cast_with_check(db); + for (auto handle : *handles) { + assert(handle != nullptr); + auto cfh = static_cast_with_check(handle); + auto cfd = cfh->cfd(); + assert(cfd != nullptr); + // Only compact column families with timestamp enabled + if (cfd->user_comparator() != nullptr && + cfd->user_comparator()->timestamp_size() > 0) { + s = db_impl->CompactRangeInternal(options, handle, nullptr, nullptr, + trim_ts); + if (!s.ok()) { + break; + } + } + } + auto clean_op = [&handles, &db] { + for (auto handle : *handles) { + auto temp_s = db->DestroyColumnFamilyHandle(handle); + assert(temp_s.ok()); + } + handles->clear(); + delete db; + }; + if (!s.ok()) { + clean_op(); + return s; + } + + *dbptr = db; + return s; +} + +IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, + size_t preallocate_block_size, + log::Writer** new_log) { + IOStatus io_s; + std::unique_ptr lfile; + + DBOptions db_options = + BuildDBOptions(immutable_db_options_, mutable_db_options_); + FileOptions opt_file_options = + fs_->OptimizeForLogWrite(file_options_, db_options); + std::string wal_dir = immutable_db_options_.GetWalDir(); + std::string log_fname = LogFileName(wal_dir, log_file_num); + + if (recycle_log_number) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "reusing log %" PRIu64 " from recycle list\n", + recycle_log_number); + std::string old_log_fname = LogFileName(wal_dir, recycle_log_number); + TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1"); + TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2"); + io_s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options, + &lfile, /*dbg=*/nullptr); + } else { + io_s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options); + } + + if (io_s.ok()) { + lfile->SetWriteLifeTimeHint(CalculateWALWriteHint()); + lfile->SetPreallocationBlockSize(preallocate_block_size); + + const auto& listeners = immutable_db_options_.listeners; + FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; + std::unique_ptr file_writer(new WritableFileWriter( + std::move(lfile), log_fname, opt_file_options, + immutable_db_options_.clock, io_tracer_, nullptr /* stats */, listeners, + nullptr, tmp_set.Contains(FileType::kWalFile), + tmp_set.Contains(FileType::kWalFile))); + *new_log = new log::Writer(std::move(file_writer), log_file_num, + immutable_db_options_.recycle_log_file_num > 0, + immutable_db_options_.manual_wal_flush, + immutable_db_options_.wal_compression); + io_s = (*new_log)->AddCompressionTypeRecord(); + } + return io_s; +} + +Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + const bool 
seq_per_batch, const bool batch_per_txn) { + Status s = ValidateOptionsByTable(db_options, column_families); + if (!s.ok()) { + return s; + } + + s = ValidateOptions(db_options, column_families); + if (!s.ok()) { + return s; + } + + *dbptr = nullptr; + assert(handles); + handles->clear(); + + size_t max_write_buffer_size = 0; + for (auto cf : column_families) { + max_write_buffer_size = + std::max(max_write_buffer_size, cf.options.write_buffer_size); + } + + DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn); + if (!impl->immutable_db_options_.info_log) { + s = impl->init_logger_creation_s_; + delete impl; + return s; + } else { + assert(impl->init_logger_creation_s_.ok()); + } + s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.GetWalDir()); + if (s.ok()) { + std::vector paths; + for (auto& db_path : impl->immutable_db_options_.db_paths) { + paths.emplace_back(db_path.path); + } + for (auto& cf : column_families) { + for (auto& cf_path : cf.options.cf_paths) { + paths.emplace_back(cf_path.path); + } + } + for (auto& path : paths) { + s = impl->env_->CreateDirIfMissing(path); + if (!s.ok()) { + break; + } + } + + // For recovery from NoSpace() error, we can only handle + // the case where the database is stored in a single path + if (paths.size() <= 1) { + impl->error_handler_.EnableAutoRecovery(); + } + } + if (s.ok()) { + s = impl->CreateArchivalDirectory(); + } + if (!s.ok()) { + delete impl; + return s; + } + + impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath(); + RecoveryContext recovery_ctx; + impl->mutex_.Lock(); + + // Handles create_if_missing, error_if_exists + uint64_t recovered_seq(kMaxSequenceNumber); + s = impl->Recover(column_families, false /* read_only */, + false /* error_if_wal_file_exists */, + false /* error_if_data_exists_in_wals */, &recovered_seq, + &recovery_ctx); + if (s.ok()) { + uint64_t new_log_number = impl->versions_->NewFileNumber(); + log::Writer* new_log = nullptr; + const size_t preallocate_block_size = + impl->GetWalPreallocateBlockSize(max_write_buffer_size); + s = impl->CreateWAL(new_log_number, 0 /*recycle_log_number*/, + preallocate_block_size, &new_log); + if (s.ok()) { + InstrumentedMutexLock wl(&impl->log_write_mutex_); + impl->logfile_number_ = new_log_number; + assert(new_log != nullptr); + assert(impl->logs_.empty()); + impl->logs_.emplace_back(new_log_number, new_log); + } + + if (s.ok()) { + impl->alive_log_files_.push_back( + DBImpl::LogFileNumberSize(impl->logfile_number_)); + // In WritePrepared there could be gap in sequence numbers. This breaks + // the trick we use in kPointInTimeRecovery which assumes the first seq in + // the log right after the corrupted log is one larger than the last seq + // we read from the wals. To let this trick keep working, we add a dummy + // entry with the expected sequence to the first log right after recovery. + // In non-WritePrepared case also the new log after recovery could be + // empty, and thus missing the consecutive seq hint to distinguish + // middle-log corruption to corrupted-log-remained-after-recovery. This + // case also will be addressed by a dummy write. 
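+      // (Editorial note: recovered_seq keeps its kMaxSequenceNumber default
+      // when Recover() found no gap to paper over, so the dummy write below
+      // is skipped.)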
+ if (recovered_seq != kMaxSequenceNumber) { + WriteBatch empty_batch; + WriteBatchInternal::SetSequence(&empty_batch, recovered_seq); + WriteOptions write_options; + uint64_t log_used, log_size; + log::Writer* log_writer = impl->logs_.back().writer; + LogFileNumberSize& log_file_number_size = impl->alive_log_files_.back(); + + assert(log_writer->get_log_number() == log_file_number_size.number); + impl->mutex_.AssertHeld(); + s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size, + Env::IO_TOTAL, log_file_number_size); + if (s.ok()) { + // Need to fsync, otherwise it might get lost after a power reset. + s = impl->FlushWAL(false); + TEST_SYNC_POINT_CALLBACK("DBImpl::Open::BeforeSyncWAL", /*arg=*/&s); + if (s.ok()) { + s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync); + } + } + } + } + } + if (s.ok()) { + s = impl->LogAndApplyForRecovery(recovery_ctx); + } + + if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { + impl->mutex_.AssertHeld(); + s = impl->InitPersistStatsColumnFamily(); + } + + if (s.ok()) { + // set column family handles + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (cfd != nullptr) { + handles->push_back( + new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + impl->NewThreadStatusCfInfo(cfd); + } else { + if (db_options.create_missing_column_families) { + // missing column family, create it + ColumnFamilyHandle* handle = nullptr; + impl->mutex_.Unlock(); + s = impl->CreateColumnFamily(cf.options, cf.name, &handle); + impl->mutex_.Lock(); + if (s.ok()) { + handles->push_back(handle); + } else { + break; + } + } else { + s = Status::InvalidArgument("Column family not found", cf.name); + break; + } + } + } + } + + if (s.ok()) { + SuperVersionContext sv_context(/* create_superversion */ true); + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + impl->InstallSuperVersionAndScheduleWork( + cfd, &sv_context, *cfd->GetLatestMutableCFOptions()); + } + sv_context.Clean(); + } + + if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { + // try to read format version + s = impl->PersistentStatsProcessFormatVersion(); + } + + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + if (!cfd->mem()->IsSnapshotSupported()) { + impl->is_snapshot_supported_ = false; + } + if (cfd->ioptions()->merge_operator != nullptr && + !cfd->mem()->IsMergeOperatorSupported()) { + s = Status::InvalidArgument( + "The memtable of column family %s does not support merge operator " + "its options.merge_operator is non-null", + cfd->GetName().c_str()); + } + if (!s.ok()) { + break; + } + } + } + TEST_SYNC_POINT("DBImpl::Open:Opened"); + Status persist_options_status; + if (s.ok()) { + // Persist RocksDB Options before scheduling the compaction. + // The WriteOptionsFile() will release and lock the mutex internally. + persist_options_status = impl->WriteOptionsFile( + false /*need_mutex_lock*/, false /*need_enter_write_thread*/); + + *dbptr = impl; + impl->opened_successfully_ = true; + impl->DeleteObsoleteFiles(); + TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFiles"); + impl->MaybeScheduleFlushOrCompaction(); + } else { + persist_options_status.PermitUncheckedError(); + } + impl->mutex_.Unlock(); + + auto sfm = static_cast( + impl->immutable_db_options_.sst_file_manager.get()); + if (s.ok() && sfm) { + // Set Statistics ptr for SstFileManager to dump the stats of + // DeleteScheduler. 
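+    // (This whole block runs only when the user installed an SstFileManager;
+    // see the null check above.)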
+    sfm->SetStatisticsPtr(impl->immutable_db_options_.statistics);
+    ROCKS_LOG_INFO(impl->immutable_db_options_.info_log,
+                   "SstFileManager instance %p", sfm);
+
+    // Notify SstFileManager about all sst files that already exist in
+    // db_paths[0] and cf_paths[0] when the DB is opened.
+
+    // SstFileManagerImpl needs to know the sizes of the files. For files
+    // whose size we already know (sst files that appear in the manifest -
+    // typically the vast majority of all files), we'll pass the size to
+    // SstFileManager. For all other files SstFileManager will query the size
+    // from the filesystem.
+
+    std::vector<ColumnFamilyMetaData> metadata;
+    impl->GetAllColumnFamilyMetaData(&metadata);
+
+    std::unordered_map<std::string, uint64_t> known_file_sizes;
+    for (const auto& md : metadata) {
+      for (const auto& lmd : md.levels) {
+        for (const auto& fmd : lmd.files) {
+          known_file_sizes[fmd.relative_filename] = fmd.size;
+        }
+      }
+      for (const auto& bmd : md.blob_files) {
+        std::string name = bmd.blob_file_name;
+        // The BlobMetaData.blob_file_name may start with "/".
+        if (!name.empty() && name[0] == '/') {
+          name = name.substr(1);
+        }
+        known_file_sizes[name] = bmd.blob_file_size;
+      }
+    }
+
+    std::vector<std::string> paths;
+    paths.emplace_back(impl->immutable_db_options_.db_paths[0].path);
+    for (auto& cf : column_families) {
+      if (!cf.options.cf_paths.empty()) {
+        paths.emplace_back(cf.options.cf_paths[0].path);
+      }
+    }
+    // Remove duplicate paths.
+    std::sort(paths.begin(), paths.end());
+    paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+    IOOptions io_opts;
+    io_opts.do_not_recurse = true;
+    for (auto& path : paths) {
+      std::vector<std::string> existing_files;
+      impl->immutable_db_options_.fs
+          ->GetChildren(path, io_opts, &existing_files,
+                        /*IODebugContext*=*/nullptr)
+          .PermitUncheckedError();  // TODO: what to do on error?
+      for (auto& file_name : existing_files) {
+        uint64_t file_number;
+        FileType file_type;
+        std::string file_path = path + "/" + file_name;
+        if (ParseFileName(file_name, &file_number, &file_type) &&
+            (file_type == kTableFile || file_type == kBlobFile)) {
+          // TODO: Check for errors from OnAddFile?
+          if (known_file_sizes.count(file_name)) {
+            // We're assuming that each sst file name exists in at most one of
+            // the paths.
+            sfm->OnAddFile(file_path, known_file_sizes.at(file_name))
+                .PermitUncheckedError();
+          } else {
+            sfm->OnAddFile(file_path).PermitUncheckedError();
+          }
+        }
+      }
+    }
+
+    // Reserve some disk buffer space. This is a heuristic: when we run out
+    // of disk space, it ensures that there is at least write_buffer_size
+    // worth of free space before we resume DB writes. In low-disk-space
+    // conditions, we want to avoid a lot of small L0 files due to frequent
+    // WAL write failures and the resultant forced flushes.
+    sfm->ReserveDiskBuffer(max_write_buffer_size,
+                           impl->immutable_db_options_.db_paths[0].path);
+  }
+
+  if (s.ok()) {
+    ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p",
+                     impl);
+    LogFlush(impl->immutable_db_options_.info_log);
+    if (!impl->WALBufferIsEmpty()) {
+      s = impl->FlushWAL(false);
+      if (s.ok()) {
+        // Sync is needed, otherwise WAL-buffered data might get lost after a
+        // power reset.
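+        // FlushWAL(false) above only moved the buffer into the OS; the
+        // Sync() below is what makes it durable.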
+ log::Writer* log_writer = impl->logs_.back().writer; + s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync); + } + } + if (s.ok() && !persist_options_status.ok()) { + s = Status::IOError( + "DB::Open() failed --- Unable to persist Options file", + persist_options_status.ToString()); + } + } + if (!s.ok()) { + ROCKS_LOG_WARN(impl->immutable_db_options_.info_log, + "DB::Open() failed: %s", s.ToString().c_str()); + } + if (s.ok()) { + s = impl->StartPeriodicTaskScheduler(); + } + + if (s.ok()) { + s = impl->RegisterRecordSeqnoTimeWorker(); + } + if (!s.ok()) { + for (auto* h : *handles) { + delete h; + } + handles->clear(); + delete impl; + *dbptr = nullptr; + } + return s; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_readonly.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_readonly.cc new file mode 100644 index 0000000000..871cf80852 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_readonly.cc @@ -0,0 +1,334 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_impl/db_impl_readonly.h" + +#include "db/arena_wrapped_db_iter.h" +#include "db/db_impl/compacted_db_impl.h" +#include "db/db_impl/db_impl.h" +#include "db/db_iter.h" +#include "db/merge_context.h" +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + + +DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, + const std::string& dbname) + : DBImpl(db_options, dbname, /*seq_per_batch*/ false, + /*batch_per_txn*/ true, /*read_only*/ true) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Opening the db in read only mode"); + LogFlush(immutable_db_options_.info_log); +} + +DBImplReadOnly::~DBImplReadOnly() {} + +// Implementations of the DB interface +Status DBImplReadOnly::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val) { + return Get(read_options, column_family, key, pinnable_val, + /*timestamp*/ nullptr); +} + +Status DBImplReadOnly::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val, + std::string* timestamp) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + assert(pinnable_val != nullptr); + // TODO: stopwatch DB_GET needed?, perf timer needed? + PERF_TIMER_GUARD(get_snapshot_time); + + assert(column_family); + if (read_options.timestamp) { + const Status s = FailIfTsMismatchCf( + column_family, *(read_options.timestamp), /*ts_for_read=*/true); + if (!s.ok()) { + return s; + } + } else { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + } + + // Clear the timestamps for returning results so that we can distinguish + // between tombstone or key that has never been written + if (timestamp) { + timestamp->clear(); + } + + const Comparator* ucmp = column_family->GetComparator(); + assert(ucmp); + std::string* ts = ucmp->timestamp_size() > 0 ? 
timestamp : nullptr; + + Status s; + SequenceNumber snapshot = versions_->LastSequence(); + GetWithTimestampReadCallback read_cb(snapshot); + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + } + } + SuperVersion* super_version = cfd->GetSuperVersion(); + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + LookupKey lkey(key, snapshot, read_options.timestamp); + PERF_TIMER_STOP(get_snapshot_time); + if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), + /*columns=*/nullptr, ts, &s, &merge_context, + &max_covering_tombstone_seq, read_options, + false /* immutable_memtable */, &read_cb)) { + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else { + PERF_TIMER_GUARD(get_from_output_files_time); + PinnedIteratorsManager pinned_iters_mgr; + super_version->current->Get( + read_options, lkey, pinnable_val, /*columns=*/nullptr, ts, &s, + &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr, + /*value_found*/ nullptr, + /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb, + /*is_blob*/ nullptr, + /*do_merge*/ true); + RecordTick(stats_, MEMTABLE_MISS); + } + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + RecordInHistogram(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); + return s; +} + +Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } + assert(column_family); + if (read_options.timestamp) { + const Status s = FailIfTsMismatchCf( + column_family, *(read_options.timestamp), /*ts_for_read=*/true); + if (!s.ok()) { + return NewErrorIterator(s); + } + } else { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return NewErrorIterator(s); + } + } + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); + SequenceNumber latest_snapshot = versions_->LastSequence(); + SequenceNumber read_seq = + read_options.snapshot != nullptr + ? reinterpret_cast(read_options.snapshot) + ->number_ + : latest_snapshot; + ReadCallback* read_callback = nullptr; // No read callback provided. 
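+  // (Editorial note: no read callback is needed in read-only mode; with no
+  // concurrent writers, the latest sequence already gives a consistent
+  // snapshot.)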
+ auto db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, + super_version->current, read_seq, + super_version->mutable_cf_options.max_sequential_skip_in_iterations, + super_version->version_number, read_callback); + auto internal_iter = NewInternalIterator( + db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(), + read_seq, /* allow_unprepared_value */ true, db_iter); + db_iter->SetIterUnderDBIter(internal_iter); + return db_iter; +} + +Status DBImplReadOnly::NewIterators( + const ReadOptions& read_options, + const std::vector& column_families, + std::vector* iterators) { + if (read_options.timestamp) { + for (auto* cf : column_families) { + assert(cf); + const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp), + /*ts_for_read=*/true); + if (!s.ok()) { + return s; + } + } + } else { + for (auto* cf : column_families) { + assert(cf); + const Status s = FailIfCfHasTs(cf); + if (!s.ok()) { + return s; + } + } + } + + ReadCallback* read_callback = nullptr; // No read callback provided. + if (iterators == nullptr) { + return Status::InvalidArgument("iterators not allowed to be nullptr"); + } + iterators->clear(); + iterators->reserve(column_families.size()); + SequenceNumber latest_snapshot = versions_->LastSequence(); + SequenceNumber read_seq = + read_options.snapshot != nullptr + ? reinterpret_cast(read_options.snapshot) + ->number_ + : latest_snapshot; + + for (auto cfh : column_families) { + auto* cfd = static_cast_with_check(cfh)->cfd(); + auto* sv = cfd->GetSuperVersion()->Ref(); + auto* db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, + sv->current, read_seq, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_callback); + auto* internal_iter = NewInternalIterator( + db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), read_seq, + /* allow_unprepared_value */ true, db_iter); + db_iter->SetIterUnderDBIter(internal_iter); + iterators->push_back(db_iter); + } + + return Status::OK(); +} + +namespace { +// Return OK if dbname exists in the file system or create it if +// create_if_missing +Status OpenForReadOnlyCheckExistence(const DBOptions& db_options, + const std::string& dbname) { + Status s; + if (!db_options.create_if_missing) { + // Attempt to read "CURRENT" file + const std::shared_ptr& fs = db_options.env->GetFileSystem(); + std::string manifest_path; + uint64_t manifest_file_number; + s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path, + &manifest_file_number); + } else { + // Historic behavior that doesn't necessarily make sense + s = db_options.env->CreateDirIfMissing(dbname); + } + return s; +} +} // namespace + +Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, + DB** dbptr, bool /*error_if_wal_file_exists*/) { + Status s = OpenForReadOnlyCheckExistence(options, dbname); + if (!s.ok()) { + return s; + } + + *dbptr = nullptr; + + // Try to first open DB as fully compacted DB + s = CompactedDBImpl::Open(options, dbname, dbptr); + if (s.ok()) { + return s; + } + + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + + s = DBImplReadOnly::OpenForReadOnlyWithoutCheck( + db_options, dbname, column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + // i can delete the 
handle since DBImpl is always holding a + // reference to default column family + delete handles[0]; + } + return s; +} + +Status DB::OpenForReadOnly( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + bool error_if_wal_file_exists) { + // If dbname does not exist in the file system, should not do anything + Status s = OpenForReadOnlyCheckExistence(db_options, dbname); + if (!s.ok()) { + return s; + } + + return DBImplReadOnly::OpenForReadOnlyWithoutCheck( + db_options, dbname, column_families, handles, dbptr, + error_if_wal_file_exists); +} + +Status DBImplReadOnly::OpenForReadOnlyWithoutCheck( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + bool error_if_wal_file_exists) { + *dbptr = nullptr; + handles->clear(); + + SuperVersionContext sv_context(/* create_superversion */ true); + DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname); + impl->mutex_.Lock(); + Status s = impl->Recover(column_families, true /* read only */, + error_if_wal_file_exists); + if (s.ok()) { + // set column family handles + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (cfd == nullptr) { + s = Status::InvalidArgument("Column family not found", cf.name); + break; + } + handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + } + } + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + sv_context.NewSuperVersion(); + cfd->InstallSuperVersion(&sv_context, &impl->mutex_); + } + } + impl->mutex_.Unlock(); + sv_context.Clean(); + if (s.ok()) { + *dbptr = impl; + for (auto* h : *handles) { + impl->NewThreadStatusCfInfo( + static_cast_with_check(h)->cfd()); + } + } else { + for (auto h : *handles) { + delete h; + } + handles->clear(); + delete impl; + } + return s; +} + + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_readonly.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_readonly.h new file mode 100644 index 0000000000..a694acc003 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_readonly.h @@ -0,0 +1,173 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + + +#include +#include + +#include "db/db_impl/db_impl.h" + +namespace ROCKSDB_NAMESPACE { + +// TODO: Share common structure with CompactedDBImpl and DBImplSecondary +class DBImplReadOnly : public DBImpl { + public: + DBImplReadOnly(const DBOptions& options, const std::string& dbname); + // No copying allowed + DBImplReadOnly(const DBImplReadOnly&) = delete; + void operator=(const DBImplReadOnly&) = delete; + + virtual ~DBImplReadOnly(); + + // Implementations of the DB interface + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value, + std::string* timestamp) override; + + // TODO: Implement ReadOnly MultiGet? 
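+  // (With no override here, MultiGet calls resolve to the inherited DBImpl
+  // implementation.)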
+ + using DBImpl::NewIterator; + virtual Iterator* NewIterator(const ReadOptions&, + ColumnFamilyHandle* column_family) override; + + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override; + + using DBImpl::Put; + virtual Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::PutEntity; + Status PutEntity(const WriteOptions& /* options */, + ColumnFamilyHandle* /* column_family */, + const Slice& /* key */, + const WideColumns& /* columns */) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::Merge; + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Delete; + virtual Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::SingleDelete; + virtual Status SingleDelete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& /*compact_options*/, + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*input_file_names*/, + const int /*output_level*/, const int /*output_path_id*/ = -1, + std::vector* const /*output_file_names*/ = nullptr, + CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + virtual Status DisableFileDeletions() override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + virtual Status EnableFileDeletions(bool /*force*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool /*flush_memtable*/) override { + return DBImpl::GetLiveFiles(ret, manifest_file_size, + false /* flush_memtable */); + } + + using DBImpl::Flush; + virtual Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::SyncWAL; + virtual Status SyncWAL() override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*external_files*/, + const IngestExternalFileOptions& /*ingestion_options*/) override { + return 
Status::NotSupported("Not supported operation in read only mode."); + } + + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + // FIXME: some missing overrides for more "write" functions + + protected: + Status FlushForGetLiveFiles() override { + // No-op for read-only DB + return Status::OK(); + } + + private: + // A "helper" function for DB::OpenForReadOnly without column families + // to reduce unnecessary I/O + // It has the same functionality as DB::OpenForReadOnly with column families + // but does not check the existence of dbname in the file system + static Status OpenForReadOnlyWithoutCheck( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + bool error_if_wal_file_exists = false); + friend class DB; +}; +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_secondary.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_secondary.cc new file mode 100644 index 0000000000..c6fcefddc7 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_secondary.cc @@ -0,0 +1,964 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_impl/db_impl_secondary.h" + +#include + +#include "db/arena_wrapped_db_iter.h" +#include "db/merge_context.h" +#include "logging/auto_roll_logger.h" +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/configurable.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + +DBImplSecondary::DBImplSecondary(const DBOptions& db_options, + const std::string& dbname, + std::string secondary_path) + : DBImpl(db_options, dbname, false, true, true), + secondary_path_(std::move(secondary_path)) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Opening the db in secondary mode"); + LogFlush(immutable_db_options_.info_log); +} + +DBImplSecondary::~DBImplSecondary() {} + +Status DBImplSecondary::Recover( + const std::vector& column_families, + bool /*readonly*/, bool /*error_if_wal_file_exists*/, + bool /*error_if_data_exists_in_wals*/, uint64_t*, + RecoveryContext* /*recovery_ctx*/) { + mutex_.AssertHeld(); + + JobContext job_context(0); + Status s; + s = static_cast(versions_.get()) + ->Recover(column_families, &manifest_reader_, &manifest_reporter_, + &manifest_reader_status_); + if (!s.ok()) { + if (manifest_reader_status_) { + manifest_reader_status_->PermitUncheckedError(); + } + return s; + } + if (immutable_db_options_.paranoid_checks && s.ok()) { + s = CheckConsistency(); + } + // Initial max_total_in_memory_state_ before recovery logs. 
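+  // (Recomputed below as the sum over all CFs of write_buffer_size *
+  // max_write_buffer_number.)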
+ max_total_in_memory_state_ = 0; + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + } + if (s.ok()) { + default_cf_handle_ = new ColumnFamilyHandleImpl( + versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); + default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); + + std::unordered_set cfds_changed; + s = FindAndRecoverLogFiles(&cfds_changed, &job_context); + } + + if (s.IsPathNotFound()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Secondary tries to read WAL, but WAL file(s) have already " + "been purged by primary."); + s = Status::OK(); + } + // TODO: update options_file_number_ needed? + + job_context.Clean(); + return s; +} + +// find new WAL and apply them in order to the secondary instance +Status DBImplSecondary::FindAndRecoverLogFiles( + std::unordered_set* cfds_changed, + JobContext* job_context) { + assert(nullptr != cfds_changed); + assert(nullptr != job_context); + Status s; + std::vector logs; + s = FindNewLogNumbers(&logs); + if (s.ok() && !logs.empty()) { + SequenceNumber next_sequence(kMaxSequenceNumber); + s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context); + } + return s; +} + +// List wal_dir and find all new WALs, return these log numbers +Status DBImplSecondary::FindNewLogNumbers(std::vector* logs) { + assert(logs != nullptr); + std::vector filenames; + Status s; + IOOptions io_opts; + io_opts.do_not_recurse = true; + s = immutable_db_options_.fs->GetChildren(immutable_db_options_.GetWalDir(), + io_opts, &filenames, + /*IODebugContext*=*/nullptr); + if (s.IsNotFound()) { + return Status::InvalidArgument("Failed to open wal_dir", + immutable_db_options_.GetWalDir()); + } else if (!s.ok()) { + return s; + } + + // if log_readers_ is non-empty, it means we have applied all logs with log + // numbers smaller than the smallest log in log_readers_, so there is no + // need to pass these logs to RecoverLogFiles + uint64_t log_number_min = 0; + if (!log_readers_.empty()) { + log_number_min = log_readers_.begin()->first; + } + for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; + if (ParseFileName(filenames[i], &number, &type) && type == kWalFile && + number >= log_number_min) { + logs->push_back(number); + } + } + // Recover logs in the order that they were generated + if (!logs->empty()) { + std::sort(logs->begin(), logs->end()); + } + return s; +} + +Status DBImplSecondary::MaybeInitLogReader( + uint64_t log_number, log::FragmentBufferedReader** log_reader) { + auto iter = log_readers_.find(log_number); + // make sure the log file is still present + if (iter == log_readers_.end() || + iter->second->reader_->GetLogNumber() != log_number) { + // delete the obsolete log reader if log number mismatch + if (iter != log_readers_.end()) { + log_readers_.erase(iter); + } + // initialize log reader from log_number + // TODO: min_log_number_to_keep_2pc check needed? 
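+    // Note: readers are cached in log_readers_ keyed by WAL number, so a
+    // later TryCatchUpWithPrimary() resumes tailing a partially replayed WAL
+    // instead of re-reading it from the beginning.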
+    // Open the log file
+    std::string fname =
+        LogFileName(immutable_db_options_.GetWalDir(), log_number);
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Recovering log #%" PRIu64 " mode %d", log_number,
+                   static_cast<int>(immutable_db_options_.wal_recovery_mode));
+
+    std::unique_ptr<SequentialFileReader> file_reader;
+    {
+      std::unique_ptr<FSSequentialFile> file;
+      Status status = fs_->NewSequentialFile(
+          fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+      if (!status.ok()) {
+        *log_reader = nullptr;
+        return status;
+      }
+      file_reader.reset(new SequentialFileReader(
+          std::move(file), fname, immutable_db_options_.log_readahead_size,
+          io_tracer_));
+    }
+
+    // Create the log reader.
+    LogReaderContainer* log_reader_container = new LogReaderContainer(
+        env_, immutable_db_options_.info_log, std::move(fname),
+        std::move(file_reader), log_number);
+    log_readers_.insert(std::make_pair(
+        log_number,
+        std::unique_ptr<LogReaderContainer>(log_reader_container)));
+  }
+  iter = log_readers_.find(log_number);
+  assert(iter != log_readers_.end());
+  *log_reader = iter->second->reader_;
+  return Status::OK();
+}
+
+// After manifest recovery, replay WALs and refresh log_readers_ if necessary
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImplSecondary::RecoverLogFiles(
+    const std::vector<uint64_t>& log_numbers, SequenceNumber* next_sequence,
+    std::unordered_set<ColumnFamilyData*>* cfds_changed,
+    JobContext* job_context) {
+  assert(nullptr != cfds_changed);
+  assert(nullptr != job_context);
+  mutex_.AssertHeld();
+  Status status;
+  for (auto log_number : log_numbers) {
+    log::FragmentBufferedReader* reader = nullptr;
+    status = MaybeInitLogReader(log_number, &reader);
+    if (!status.ok()) {
+      return status;
+    }
+    assert(reader != nullptr);
+  }
+  for (auto log_number : log_numbers) {
+    auto it = log_readers_.find(log_number);
+    assert(it != log_readers_.end());
+    log::FragmentBufferedReader* reader = it->second->reader_;
+    Status* wal_read_status = it->second->status_;
+    assert(wal_read_status);
+    // Manually update the file number allocation counter in VersionSet.
+    versions_->MarkFileNumberUsed(log_number);
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+
+    while (reader->ReadRecord(&record, &scratch,
+                              immutable_db_options_.wal_recovery_mode) &&
+           wal_read_status->ok() && status.ok()) {
+      if (record.size() < WriteBatchInternal::kHeader) {
+        reader->GetReporter()->Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      status = WriteBatchInternal::SetContents(&batch, record);
+      if (!status.ok()) {
+        break;
+      }
+      SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch);
+      std::vector<uint32_t> column_family_ids;
+      status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids);
+      if (status.ok()) {
+        for (const auto id : column_family_ids) {
+          ColumnFamilyData* cfd =
+              versions_->GetColumnFamilySet()->GetColumnFamily(id);
+          if (cfd == nullptr) {
+            continue;
+          }
+          if (cfds_changed->count(cfd) == 0) {
+            cfds_changed->insert(cfd);
+          }
+          const std::vector<FileMetaData*>& l0_files =
+              cfd->current()->storage_info()->LevelFiles(0);
+          SequenceNumber seq =
+              l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno;
+          // If the write batch's sequence number is no larger than the
+          // largest sequence number persisted for this column family, then
+          // its data must reside in an SST that has already been added in
+          // the prior MANIFEST replay.
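+          // Illustration: if the largest seqno persisted in L0 for this
+          // column family is 100 and this batch carries seqno 90, the batch
+          // is already covered by the SSTs recovered from the MANIFEST and
+          // is skipped below.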
+ if (seq_of_batch <= seq) { + continue; + } + auto curr_log_num = std::numeric_limits::max(); + if (cfd_to_current_log_.count(cfd) > 0) { + curr_log_num = cfd_to_current_log_[cfd]; + } + // If the active memtable contains records added by replaying an + // earlier WAL, then we need to seal the memtable, add it to the + // immutable memtable list and create a new active memtable. + if (!cfd->mem()->IsEmpty() && + (curr_log_num == std::numeric_limits::max() || + curr_log_num != log_number)) { + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + MemTable* new_mem = + cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch); + cfd->mem()->SetNextLogNumber(log_number); + cfd->mem()->ConstructFragmentedRangeTombstones(); + cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free); + new_mem->Ref(); + cfd->SetMemtable(new_mem); + } + } + bool has_valid_writes = false; + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), + nullptr /* flush_scheduler */, nullptr /* trim_history_scheduler*/, + true, log_number, this, false /* concurrent_memtable_writes */, + next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); + } + // If column family was not found, it might mean that the WAL write + // batch references to the column family that was dropped after the + // insert. We don't want to fail the whole write batch in that case -- + // we just ignore the update. + // That's why we set ignore missing column families to true + // passing null flush_scheduler will disable memtable flushing which is + // needed for secondary instances + if (status.ok()) { + for (const auto id : column_family_ids) { + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(id); + if (cfd == nullptr) { + continue; + } + std::unordered_map::iterator iter = + cfd_to_current_log_.find(cfd); + if (iter == cfd_to_current_log_.end()) { + cfd_to_current_log_.insert({cfd, log_number}); + } else if (log_number > iter->second) { + iter->second = log_number; + } + } + auto last_sequence = *next_sequence - 1; + if ((*next_sequence != kMaxSequenceNumber) && + (versions_->LastSequence() <= last_sequence)) { + versions_->SetLastAllocatedSequence(last_sequence); + versions_->SetLastPublishedSequence(last_sequence); + versions_->SetLastSequence(last_sequence); + } + } else { + // We are treating this as a failure while reading since we read valid + // blocks that do not form coherent data + reader->GetReporter()->Corruption(record.size(), status); + } + } + if (status.ok() && !wal_read_status->ok()) { + status = *wal_read_status; + } + if (!status.ok()) { + return status; + } + } + // remove logreaders from map after successfully recovering the WAL + if (log_readers_.size() > 1) { + auto erase_iter = log_readers_.begin(); + std::advance(erase_iter, log_readers_.size() - 1); + log_readers_.erase(log_readers_.begin(), erase_iter); + } + return status; +} + +// Implementation of the DB interface +Status DBImplSecondary::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) { + return GetImpl(read_options, column_family, key, value, + /*timestamp*/ nullptr); +} + +Status DBImplSecondary::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp) { + return GetImpl(read_options, column_family, key, value, timestamp); +} + +Status DBImplSecondary::GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* 
column_family, + const Slice& key, PinnableSlice* pinnable_val, + std::string* timestamp) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + assert(pinnable_val != nullptr); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); + PERF_TIMER_GUARD(get_snapshot_time); + + assert(column_family); + if (read_options.timestamp) { + const Status s = FailIfTsMismatchCf( + column_family, *(read_options.timestamp), /*ts_for_read=*/true); + if (!s.ok()) { + return s; + } + } else { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + } + + // Clear the timestamp for returning results so that we can distinguish + // between tombstone or key that has never been written later. + if (timestamp) { + timestamp->clear(); + } + + auto cfh = static_cast(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + } + } + // Acquire SuperVersion + SuperVersion* super_version = GetAndRefSuperVersion(cfd); + SequenceNumber snapshot = versions_->LastSequence(); + GetWithTimestampReadCallback read_cb(snapshot); + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + Status s; + LookupKey lkey(key, snapshot, read_options.timestamp); + PERF_TIMER_STOP(get_snapshot_time); + + bool done = false; + const Comparator* ucmp = column_family->GetComparator(); + assert(ucmp); + std::string* ts = ucmp->timestamp_size() > 0 ? timestamp : nullptr; + if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), + /*columns=*/nullptr, ts, &s, &merge_context, + &max_covering_tombstone_seq, read_options, + false /* immutable_memtable */, &read_cb)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + super_version->imm->Get( + lkey, pinnable_val->GetSelf(), /*columns=*/nullptr, ts, &s, + &merge_context, &max_covering_tombstone_seq, read_options, + &read_cb)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } + if (!done && !s.ok() && !s.IsMergeInProgress()) { + ReturnAndCleanupSuperVersion(cfd, super_version); + return s; + } + if (!done) { + PERF_TIMER_GUARD(get_from_output_files_time); + PinnedIteratorsManager pinned_iters_mgr; + super_version->current->Get( + read_options, lkey, pinnable_val, /*columns=*/nullptr, ts, &s, + &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr, + /*value_found*/ nullptr, + /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb, /*is_blob*/ nullptr, + /*do_merge*/ true); + RecordTick(stats_, MEMTABLE_MISS); + } + { + PERF_TIMER_GUARD(get_post_process_time); + ReturnAndCleanupSuperVersion(cfd, super_version); + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + RecordTimeToHistogram(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); + } + return s; +} + +Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + if (read_options.managed) { + return NewErrorIterator( + Status::NotSupported("Managed iterator is not supported anymore.")); + } + if (read_options.read_tier == kPersistedTier) { + return NewErrorIterator(Status::NotSupported( + "ReadTier::kPersistedData is not 
yet supported in iterators.")); + } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } + + assert(column_family); + if (read_options.timestamp) { + const Status s = FailIfTsMismatchCf( + column_family, *(read_options.timestamp), /*ts_for_read=*/true); + if (!s.ok()) { + return NewErrorIterator(s); + } + } else { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return NewErrorIterator(s); + } + } + + Iterator* result = nullptr; + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + ReadCallback* read_callback = nullptr; // No read callback provided. + if (read_options.tailing) { + return NewErrorIterator(Status::NotSupported( + "tailing iterator not supported in secondary mode")); + } else if (read_options.snapshot != nullptr) { + // TODO (yanqin) support snapshot. + return NewErrorIterator( + Status::NotSupported("snapshot not supported in secondary mode")); + } else { + SequenceNumber snapshot(kMaxSequenceNumber); + result = NewIteratorImpl(read_options, cfd, snapshot, read_callback); + } + return result; +} + +ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( + const ReadOptions& read_options, ColumnFamilyData* cfd, + SequenceNumber snapshot, ReadCallback* read_callback, + bool expose_blob_index, bool allow_refresh) { + assert(nullptr != cfd); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + assert(snapshot == kMaxSequenceNumber); + snapshot = versions_->LastSequence(); + assert(snapshot != kMaxSequenceNumber); + auto db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, + super_version->current, snapshot, + super_version->mutable_cf_options.max_sequential_skip_in_iterations, + super_version->version_number, read_callback, this, cfd, + expose_blob_index, read_options.snapshot ? false : allow_refresh); + auto internal_iter = NewInternalIterator( + db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(), + snapshot, /* allow_unprepared_value */ true, db_iter); + db_iter->SetIterUnderDBIter(internal_iter); + return db_iter; +} + +Status DBImplSecondary::NewIterators( + const ReadOptions& read_options, + const std::vector& column_families, + std::vector* iterators) { + if (read_options.managed) { + return Status::NotSupported("Managed iterator is not supported anymore."); + } + if (read_options.read_tier == kPersistedTier) { + return Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators."); + } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call NewIterators with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + ReadCallback* read_callback = nullptr; // No read callback provided. 
+  if (iterators == nullptr) {
+    return Status::InvalidArgument("iterators not allowed to be nullptr");
+  }
+
+  if (read_options.timestamp) {
+    for (auto* cf : column_families) {
+      assert(cf);
+      const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp),
+                                          /*ts_for_read=*/true);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  } else {
+    for (auto* cf : column_families) {
+      assert(cf);
+      const Status s = FailIfCfHasTs(cf);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+  iterators->clear();
+  iterators->reserve(column_families.size());
+  if (read_options.tailing) {
+    return Status::NotSupported(
+        "tailing iterator not supported in secondary mode");
+  } else if (read_options.snapshot != nullptr) {
+    // TODO (yanqin) support snapshot.
+    return Status::NotSupported("snapshot not supported in secondary mode");
+  } else {
+    SequenceNumber read_seq(kMaxSequenceNumber);
+    for (auto cfh : column_families) {
+      ColumnFamilyData* cfd = static_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+      iterators->push_back(
+          NewIteratorImpl(read_options, cfd, read_seq, read_callback));
+    }
+  }
+  return Status::OK();
+}
+
+Status DBImplSecondary::CheckConsistency() {
+  mutex_.AssertHeld();
+  Status s = DBImpl::CheckConsistency();
+  // If DBImpl::CheckConsistency(), which is stricter, returns success, then
+  // we do not need to give it a second chance.
+  if (s.ok()) {
+    return s;
+  }
+  // It's possible that DBImpl::CheckConsistency() fails because the primary
+  // may have removed certain files, causing the GetFileSize(name) call to
+  // fail and return PathNotFound. In this case, we take a best-effort
+  // approach and just proceed.
+  TEST_SYNC_POINT_CALLBACK(
+      "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
+
+  if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+    return Status::OK();
+  }
+
+  std::vector<LiveFileMetaData> metadata;
+  versions_->GetLiveFilesMetaData(&metadata);
+
+  std::string corruption_messages;
+  for (const auto& md : metadata) {
+    // md.name has a leading "/".
+    std::string file_path = md.db_path + md.name;
+
+    uint64_t fsize = 0;
+    s = env_->GetFileSize(file_path, &fsize);
+    if (!s.ok() &&
+        (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() ||
+         s.IsPathNotFound())) {
+      s = Status::OK();
+    }
+    if (!s.ok()) {
+      corruption_messages +=
+          "Can't access " + md.name + ": " + s.ToString() + "\n";
+    }
+  }
+  return corruption_messages.empty() ?
Status::OK() + : Status::Corruption(corruption_messages); +} + +Status DBImplSecondary::TryCatchUpWithPrimary() { + assert(versions_.get() != nullptr); + assert(manifest_reader_.get() != nullptr); + Status s; + // read the manifest and apply new changes to the secondary instance + std::unordered_set cfds_changed; + JobContext job_context(0, true /*create_superversion*/); + { + InstrumentedMutexLock lock_guard(&mutex_); + s = static_cast_with_check(versions_.get()) + ->ReadAndApply(&mutex_, &manifest_reader_, + manifest_reader_status_.get(), &cfds_changed); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64, + static_cast(versions_->LastSequence())); + for (ColumnFamilyData* cfd : cfds_changed) { + if (cfd->IsDropped()) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n", + cfd->GetName().c_str()); + continue; + } + VersionStorageInfo::LevelSummaryStorage tmp; + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] Level summary: %s\n", cfd->GetName().c_str(), + cfd->current()->storage_info()->LevelSummary(&tmp)); + } + + // list wal_dir to discover new WALs and apply new changes to the secondary + // instance + if (s.ok()) { + s = FindAndRecoverLogFiles(&cfds_changed, &job_context); + } + if (s.IsPathNotFound()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Secondary tries to read WAL, but WAL file(s) have already " + "been purged by primary."); + s = Status::OK(); + } + if (s.ok()) { + for (auto cfd : cfds_changed) { + cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(), + &job_context.memtables_to_free); + auto& sv_context = job_context.superversion_contexts.back(); + cfd->InstallSuperVersion(&sv_context, &mutex_); + sv_context.NewSuperVersion(); + } + } + } + job_context.Clean(); + + // Cleanup unused, obsolete files. + JobContext purge_files_job_context(0); + { + InstrumentedMutexLock lock_guard(&mutex_); + // Currently, secondary instance does not own the database files, thus it + // is unnecessary for the secondary to force full scan. + FindObsoleteFiles(&purge_files_job_context, /*force=*/false); + } + if (purge_files_job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(purge_files_job_context); + } + purge_files_job_context.Clean(); + return s; +} + +Status DB::OpenAsSecondary(const Options& options, const std::string& dbname, + const std::string& secondary_path, DB** dbptr) { + *dbptr = nullptr; + + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); + std::vector handles; + + Status s = DB::OpenAsSecondary(db_options, dbname, secondary_path, + column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + delete handles[0]; + } + return s; +} + +Status DB::OpenAsSecondary( + const DBOptions& db_options, const std::string& dbname, + const std::string& secondary_path, + const std::vector& column_families, + std::vector* handles, DB** dbptr) { + *dbptr = nullptr; + + DBOptions tmp_opts(db_options); + Status s; + if (nullptr == tmp_opts.info_log) { + s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log); + if (!s.ok()) { + tmp_opts.info_log = nullptr; + return s; + } + } + + assert(tmp_opts.info_log != nullptr); + if (db_options.max_open_files != -1) { + std::ostringstream oss; + oss << "The primary instance may delete all types of files after they " + "become obsolete. 
The application can coordinate the primary and " + "secondary so that primary does not delete/rename files that are " + "currently being used by the secondary. Alternatively, a custom " + "Env/FS can be provided such that files become inaccessible only " + "after all primary and secondaries indicate that they are obsolete " + "and deleted. If the above two are not possible, you can open the " + "secondary instance with `max_open_files==-1` so that secondary " + "will eagerly keep all table files open. Even if a file is deleted, " + "its content can still be accessed via a prior open file " + "descriptor. This is a hacky workaround for only table files. If " + "none of the above is done, then point lookup or " + "range scan via the secondary instance can result in IOError: file " + "not found. This can be resolved by retrying " + "TryCatchUpWithPrimary()."; + ROCKS_LOG_WARN(tmp_opts.info_log, "%s", oss.str().c_str()); + } + + handles->clear(); + DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path); + impl->versions_.reset(new ReactiveVersionSet( + dbname, &impl->immutable_db_options_, impl->file_options_, + impl->table_cache_.get(), impl->write_buffer_manager_, + &impl->write_controller_, impl->io_tracer_)); + impl->column_family_memtables_.reset( + new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); + impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath(); + + impl->mutex_.Lock(); + s = impl->Recover(column_families, true, false, false); + if (s.ok()) { + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (nullptr == cfd) { + s = Status::InvalidArgument("Column family not found", cf.name); + break; + } + handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + } + } + SuperVersionContext sv_context(true /* create_superversion */); + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + sv_context.NewSuperVersion(); + cfd->InstallSuperVersion(&sv_context, &impl->mutex_); + } + } + impl->mutex_.Unlock(); + sv_context.Clean(); + if (s.ok()) { + *dbptr = impl; + for (auto h : *handles) { + impl->NewThreadStatusCfInfo( + static_cast_with_check(h)->cfd()); + } + } else { + for (auto h : *handles) { + delete h; + } + handles->clear(); + delete impl; + } + return s; +} + +Status DBImplSecondary::CompactWithoutInstallation( + const OpenAndCompactOptions& options, ColumnFamilyHandle* cfh, + const CompactionServiceInput& input, CompactionServiceResult* result) { + if (options.canceled && options.canceled->load(std::memory_order_acquire)) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + InstrumentedMutexLock l(&mutex_); + auto cfd = static_cast_with_check(cfh)->cfd(); + if (!cfd) { + return Status::InvalidArgument("Cannot find column family" + + cfh->GetName()); + } + + std::unordered_set input_set; + for (const auto& file_name : input.input_files) { + input_set.insert(TableFileNameToNumber(file_name)); + } + + auto* version = cfd->current(); + + ColumnFamilyMetaData cf_meta; + version->GetColumnFamilyMetaData(&cf_meta); + + const MutableCFOptions* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + ColumnFamilyOptions cf_options = cfd->GetLatestCFOptions(); + VersionStorageInfo* vstorage = version->storage_info(); + + // Use comp_options to reuse some CompactFiles functions + CompactionOptions comp_options; + comp_options.compression = kDisableCompressionOption; + 
comp_options.output_file_size_limit = MaxFileSizeForLevel( + *mutable_cf_options, input.output_level, cf_options.compaction_style, + vstorage->base_level(), cf_options.level_compaction_dynamic_level_bytes); + + std::vector input_files; + Status s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage, comp_options); + if (!s.ok()) { + return s; + } + + std::unique_ptr c; + assert(cfd->compaction_picker()); + c.reset(cfd->compaction_picker()->CompactFiles( + comp_options, input_files, input.output_level, vstorage, + *mutable_cf_options, mutable_db_options_, 0)); + assert(c != nullptr); + + c->SetInputVersion(version); + + // Create output directory if it's not existed yet + std::unique_ptr output_dir; + s = CreateAndNewDirectory(fs_.get(), secondary_path_, &output_dir); + if (!s.ok()) { + return s; + } + + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); + + const int job_id = next_job_id_.fetch_add(1); + + // use primary host's db_id for running the compaction, but db_session_id is + // using the local one, which is to make sure the unique id is unique from + // the remote compactors. Because the id is generated from db_id, + // db_session_id and orig_file_number, unlike the local compaction, remote + // compaction cannot guarantee the uniqueness of orig_file_number, the file + // number is only assigned when compaction is done. + CompactionServiceCompactionJob compaction_job( + job_id, c.get(), immutable_db_options_, mutable_db_options_, + file_options_for_compaction_, versions_.get(), &shutting_down_, + &log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_, + input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_, + options.canceled ? *options.canceled : kManualCompactionCanceledFalse_, + input.db_id, db_session_id_, secondary_path_, input, result); + + mutex_.Unlock(); + s = compaction_job.Run(); + mutex_.Lock(); + + // clean up + compaction_job.io_status().PermitUncheckedError(); + compaction_job.CleanupCompaction(); + c->ReleaseCompactionFiles(s); + c.reset(); + + TEST_SYNC_POINT_CALLBACK("DBImplSecondary::CompactWithoutInstallation::End", + &s); + result->status = s; + return s; +} + +Status DB::OpenAndCompact( + const OpenAndCompactOptions& options, const std::string& name, + const std::string& output_directory, const std::string& input, + std::string* output, + const CompactionServiceOptionsOverride& override_options) { + if (options.canceled && options.canceled->load(std::memory_order_acquire)) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + CompactionServiceInput compaction_input; + Status s = CompactionServiceInput::Read(input, &compaction_input); + if (!s.ok()) { + return s; + } + + compaction_input.db_options.max_open_files = -1; + compaction_input.db_options.compaction_service = nullptr; + if (compaction_input.db_options.statistics) { + compaction_input.db_options.statistics.reset(); + } + compaction_input.db_options.env = override_options.env; + compaction_input.db_options.file_checksum_gen_factory = + override_options.file_checksum_gen_factory; + compaction_input.db_options.statistics = override_options.statistics; + compaction_input.column_family.options.comparator = + override_options.comparator; + compaction_input.column_family.options.merge_operator = + override_options.merge_operator; + compaction_input.column_family.options.compaction_filter = + override_options.compaction_filter; + 
compaction_input.column_family.options.compaction_filter_factory =
+      override_options.compaction_filter_factory;
+  compaction_input.column_family.options.prefix_extractor =
+      override_options.prefix_extractor;
+  compaction_input.column_family.options.table_factory =
+      override_options.table_factory;
+  compaction_input.column_family.options.sst_partitioner_factory =
+      override_options.sst_partitioner_factory;
+  compaction_input.column_family.options.table_properties_collector_factories =
+      override_options.table_properties_collector_factories;
+  compaction_input.db_options.listeners = override_options.listeners;
+
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(compaction_input.column_family);
+  // TODO: we have to open the default CF because of an implementation
+  // limitation. Currently we just reuse the CF options from the input, which
+  // is not correct, and the open may fail.
+  if (compaction_input.column_family.name != kDefaultColumnFamilyName) {
+    column_families.emplace_back(kDefaultColumnFamilyName,
+                                 compaction_input.column_family.options);
+  }
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+
+  s = DB::OpenAsSecondary(compaction_input.db_options, name, output_directory,
+                          column_families, &handles, &db);
+  if (!s.ok()) {
+    return s;
+  }
+
+  CompactionServiceResult compaction_result;
+  DBImplSecondary* db_secondary = static_cast_with_check<DBImplSecondary>(db);
+  assert(handles.size() > 0);
+  s = db_secondary->CompactWithoutInstallation(
+      options, handles[0], compaction_input, &compaction_result);
+
+  Status serialization_status = compaction_result.Write(output);
+
+  for (auto& handle : handles) {
+    delete handle;
+  }
+  delete db;
+  if (s.ok()) {
+    return serialization_status;
+  }
+  return s;
+}
+
+Status DB::OpenAndCompact(
+    const std::string& name, const std::string& output_directory,
+    const std::string& input, std::string* output,
+    const CompactionServiceOptionsOverride& override_options) {
+  return OpenAndCompact(OpenAndCompactOptions(), name, output_directory, input,
+                        output, override_options);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_secondary.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_secondary.h
new file mode 100644
index 0000000000..3c21986221
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_secondary.h
@@ -0,0 +1,406 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A wrapper class to hold log reader, log reporter, log status.
+class LogReaderContainer {
+ public:
+  LogReaderContainer()
+      : reader_(nullptr), reporter_(nullptr), status_(nullptr) {}
+  LogReaderContainer(Env* env, std::shared_ptr<Logger> info_log,
+                     std::string fname,
+                     std::unique_ptr<SequentialFileReader>&& file_reader,
+                     uint64_t log_number) {
+    LogReporter* reporter = new LogReporter();
+    status_ = new Status();
+    reporter->env = env;
+    reporter->info_log = info_log.get();
+    reporter->fname = std::move(fname);
+    reporter->status = status_;
+    reporter_ = reporter;
+    // We intentionally make log::Reader do checksumming even if
+    // paranoid_checks==false so that corruptions cause entire commits
+    // to be skipped instead of propagating bad information (like overly
+    // large sequence numbers).
+    reader_ = new log::FragmentBufferedReader(info_log, std::move(file_reader),
+                                              reporter, true /*checksum*/,
+                                              log_number);
+  }
+  log::FragmentBufferedReader* reader_;
+  log::Reader::Reporter* reporter_;
+  Status* status_;
+  ~LogReaderContainer() {
+    delete reader_;
+    delete reporter_;
+    delete status_;
+  }
+
+ private:
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    std::string fname;
+    Status* status;  // nullptr if immutable_db_options_.paranoid_checks==false
+    void Corruption(size_t bytes, const Status& s) override {
+      ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+                     (this->status == nullptr ? "(ignoring error) " : ""),
+                     fname.c_str(), static_cast<int>(bytes),
+                     s.ToString().c_str());
+      if (this->status != nullptr && this->status->ok()) {
+        *this->status = s;
+      }
+    }
+  };
+};
+
+// The secondary instance shares access to the same storage as the primary.
+// The secondary is able to read and replay changes described in both the
+// MANIFEST and the WAL files without coordination with the primary.
+// The secondary instance can be opened using `DB::OpenAsSecondary`. After
+// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make best
+// effort attempts to catch up with the primary.
+// TODO: Share common structure with CompactedDBImpl and DBImplReadOnly
+class DBImplSecondary : public DBImpl {
+ public:
+  DBImplSecondary(const DBOptions& options, const std::string& dbname,
+                  std::string secondary_path);
+  ~DBImplSecondary() override;
+
+  // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_
+  // and log_readers_ to facilitate future operations.
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 bool read_only, bool error_if_wal_file_exists,
+                 bool error_if_data_exists_in_wals, uint64_t* = nullptr,
+                 RecoveryContext* recovery_ctx = nullptr) override;
+
+  // Implementations of the DB interface.
+  using DB::Get;
+  // Can return IOError due to files being deleted by the primary. To avoid
+  // IOError in this case, the application can coordinate the primary and the
+  // secondaries so that the primary does not delete files that are currently
+  // being used by the secondaries. The application can also provide a custom
+  // FS/Env implementation so that files remain present until all primary and
+  // secondaries indicate that they can be deleted. As a partial hacky
+  // workaround, the secondaries can be opened with `max_open_files=-1` so
+  // that they eagerly keep all table files open and are able to access the
+  // contents of deleted files via previously opened file descriptors.
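+  // A caller-side sketch of the retry pattern described above (illustrative
+  // only; `db` is assumed to be a DB* opened with DB::OpenAsSecondary):
+  //
+  //   std::string value;
+  //   Status s = db->Get(ReadOptions(), key, &value);
+  //   if (s.IsIOError()) {
+  //     // The primary may have deleted files under us; catch up and retry.
+  //     s = db->TryCatchUpWithPrimary();
+  //     if (s.ok()) {
+  //       s = db->Get(ReadOptions(), key, &value);
+  //     }
+  //   }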
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value) override;
+
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value,
+             std::string* timestamp) override;
+
+  Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
+                 const Slice& key, PinnableSlice* value,
+                 std::string* timestamp);
+
+  using DBImpl::NewIterator;
+  // Operations on the created iterators can return IOError due to files being
+  // deleted by the primary. To avoid IOError in this case, the application can
+  // coordinate the primary and the secondaries so that the primary does not
+  // delete files that are currently being used by the secondaries. The
+  // application can also provide a custom FS/Env implementation so that files
+  // remain present until all primary and secondaries indicate that they can
+  // be deleted. As a partial hacky workaround, the secondaries can be opened
+  // with `max_open_files=-1` so that they eagerly keep all table files open
+  // and are able to access the contents of deleted files via previously
+  // opened file descriptors.
+  Iterator* NewIterator(const ReadOptions&,
+                        ColumnFamilyHandle* column_family) override;
+
+  ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options,
+                                      ColumnFamilyData* cfd,
+                                      SequenceNumber snapshot,
+                                      ReadCallback* read_callback,
+                                      bool expose_blob_index = false,
+                                      bool allow_refresh = true);
+
+  Status NewIterators(const ReadOptions& options,
+                      const std::vector<ColumnFamilyHandle*>& column_families,
+                      std::vector<Iterator*>* iterators) override;
+
+  using DBImpl::Put;
+  Status Put(const WriteOptions& /*options*/,
+             ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+             const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::PutEntity;
+  Status PutEntity(const WriteOptions& /* options */,
+                   ColumnFamilyHandle* /* column_family */,
+                   const Slice& /* key */,
+                   const WideColumns& /* columns */) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::Merge;
+  Status Merge(const WriteOptions& /*options*/,
+               ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+               const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::Delete;
+  Status Delete(const WriteOptions& /*options*/,
+                ColumnFamilyHandle* /*column_family*/,
+                const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SingleDelete;
+  Status SingleDelete(const WriteOptions& /*options*/,
+                      ColumnFamilyHandle* /*column_family*/,
+                      const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  Status Write(const WriteOptions& /*options*/,
+               WriteBatch* /*updates*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::CompactRange;
+  Status CompactRange(const CompactRangeOptions& /*options*/,
+                      ColumnFamilyHandle* /*column_family*/,
+                      const Slice* /*begin*/, const Slice* /*end*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::CompactFiles;
+  Status CompactFiles(
+      const CompactionOptions& /*compact_options*/,
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*input_file_names*/,
+      const int /*output_level*/, const int /*output_path_id*/ = -1,
+      std::vector<std::string>* const /*output_file_names*/ = nullptr,
+      CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  Status DisableFileDeletions() override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  Status EnableFileDeletions(bool /*force*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  Status GetLiveFiles(std::vector<std::string>&,
+                      uint64_t* /*manifest_file_size*/,
+                      bool /*flush_memtable*/ = true) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::Flush;
+  Status Flush(const FlushOptions& /*options*/,
+               ColumnFamilyHandle* /*column_family*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SetDBOptions;
+  Status SetDBOptions(const std::unordered_map<std::string, std::string>&
+                      /*options_map*/) override {
+    // Currently not supported because changing certain options may cause
+    // flush/compaction.
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SetOptions;
+  Status SetOptions(
+      ColumnFamilyHandle* /*cfd*/,
+      const std::unordered_map<std::string, std::string>& /*options_map*/)
+      override {
+    // Currently not supported because changing certain options may cause
+    // flush/compaction and/or write to MANIFEST.
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SyncWAL;
+  Status SyncWAL() override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DB::IngestExternalFile;
+  Status IngestExternalFile(
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*external_files*/,
+      const IngestExternalFileOptions& /*ingestion_options*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  // Try to catch up with the primary by reading as much as possible from the
+  // log files until there is nothing more to read or an error is encountered.
+  // If the amount of information in the log files to process is huge, this
+  // method can take a long time due to all the I/O and CPU costs.
+  Status TryCatchUpWithPrimary() override;
+
+  // Try to find the log reader for log_number in the log_readers_ map;
+  // initialize it if it doesn't exist.
+  Status MaybeInitLogReader(uint64_t log_number,
+                            log::FragmentBufferedReader** log_reader);
+
+  // Check that all live files exist on the file system and that their file
+  // sizes match the in-memory records. It is possible that some live files
+  // may have been deleted by the primary. In this case, CheckConsistency()
+  // does not flag the missing file as an inconsistency.
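+  // For example, if the primary unlinked an SST that is still listed in the
+  // secondary's current Version, GetFileSize() returns PathNotFound; that
+  // case is tolerated rather than reported as corruption.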
+ Status CheckConsistency() override; + +#ifndef NDEBUG + Status TEST_CompactWithoutInstallation(const OpenAndCompactOptions& options, + ColumnFamilyHandle* cfh, + const CompactionServiceInput& input, + CompactionServiceResult* result) { + return CompactWithoutInstallation(options, cfh, input, result); + } +#endif // NDEBUG + + protected: + Status FlushForGetLiveFiles() override { + // No-op for read-only DB + return Status::OK(); + } + + // ColumnFamilyCollector is a write batch handler which does nothing + // except recording unique column family IDs + class ColumnFamilyCollector : public WriteBatch::Handler { + std::unordered_set column_family_ids_; + + Status AddColumnFamilyId(uint32_t column_family_id) { + if (column_family_ids_.find(column_family_id) == + column_family_ids_.end()) { + column_family_ids_.insert(column_family_id); + } + return Status::OK(); + } + + public: + explicit ColumnFamilyCollector() {} + + ~ColumnFamilyCollector() override {} + + Status PutCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteRangeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status MergeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status PutBlobIndexCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status MarkBeginPrepare(bool) override { return Status::OK(); } + + Status MarkEndPrepare(const Slice&) override { return Status::OK(); } + + Status MarkRollback(const Slice&) override { return Status::OK(); } + + Status MarkCommit(const Slice&) override { return Status::OK(); } + + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + return Status::OK(); + } + + Status MarkNoop(bool) override { return Status::OK(); } + + const std::unordered_set& column_families() const { + return column_family_ids_; + } + }; + + Status CollectColumnFamilyIdsFromWriteBatch( + const WriteBatch& batch, std::vector* column_family_ids) { + assert(column_family_ids != nullptr); + column_family_ids->clear(); + ColumnFamilyCollector handler; + Status s = batch.Iterate(&handler); + if (s.ok()) { + for (const auto& cf : handler.column_families()) { + column_family_ids->push_back(cf); + } + } + return s; + } + + bool OwnTablesAndLogs() const override { + // Currently, the secondary instance does not own the database files. It + // simply opens the files of the primary instance and tracks their file + // descriptors until they become obsolete. In the future, the secondary may + // create links to database files. OwnTablesAndLogs will return true then. 
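+    // Returning false keeps obsolete-file purging on the secondary away from
+    // table and log files that are still owned by the primary.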
+ return false; + } + + private: + friend class DB; + + // No copying allowed + DBImplSecondary(const DBImplSecondary&); + void operator=(const DBImplSecondary&); + + using DBImpl::Recover; + + Status FindAndRecoverLogFiles( + std::unordered_set* cfds_changed, + JobContext* job_context); + Status FindNewLogNumbers(std::vector* logs); + // After manifest recovery, replay WALs and refresh log_readers_ if necessary + // REQUIRES: log_numbers are sorted in ascending order + Status RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* next_sequence, + std::unordered_set* cfds_changed, + JobContext* job_context); + + // Run compaction without installation, the output files will be placed in the + // secondary DB path. The LSM tree won't be changed, the secondary DB is still + // in read-only mode. + Status CompactWithoutInstallation(const OpenAndCompactOptions& options, + ColumnFamilyHandle* cfh, + const CompactionServiceInput& input, + CompactionServiceResult* result); + + std::unique_ptr manifest_reader_; + std::unique_ptr manifest_reporter_; + std::unique_ptr manifest_reader_status_; + + // Cache log readers for each log number, used for continue WAL replay + // after recovery + std::map> log_readers_; + + // Current WAL number replayed for each column family. + std::unordered_map cfd_to_current_log_; + + const std::string secondary_path_; +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_write.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_write.cc new file mode 100644 index 0000000000..fb74434dd4 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_impl/db_impl_write.cc @@ -0,0 +1,2487 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
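+
+// Informal overview: this file implements the write path of DBImpl. The
+// convenience methods below (Put, Merge, Delete, SingleDelete, DeleteRange)
+// wrap a single key into a WriteBatch and funnel into WriteImpl(), the
+// group-commit entry point, so an explicit batched write such as
+//
+//   WriteBatch batch;
+//   batch.Put("key", "value");
+//   batch.Delete("stale-key");
+//   Status s = db->Write(WriteOptions(), &batch);
+//
+// exercises the same code path.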
+#include + +#include "db/db_impl/db_impl.h" +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "options/options_helper.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { +// Convenience methods +Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + return DB::Put(o, column_family, key, val); +} + +Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& val) { + const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + if (!s.ok()) { + return s; + } + return DB::Put(o, column_family, key, ts, val); +} + +Status DBImpl::PutEntity(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const WideColumns& columns) { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + + return DB::PutEntity(options, column_family, key, columns); +} + +Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + auto cfh = static_cast_with_check(column_family); + if (!cfh->cfd()->ioptions()->merge_operator) { + return Status::NotSupported("Provide a merge_operator when opening DB"); + } else { + return DB::Merge(o, column_family, key, val); + } +} + +Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& val) { + const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + if (!s.ok()) { + return s; + } + return DB::Merge(o, column_family, key, ts, val); +} + +Status DBImpl::Delete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, const Slice& key) { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + return DB::Delete(write_options, column_family, key); +} + +Status DBImpl::Delete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) { + const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + if (!s.ok()) { + return s; + } + return DB::Delete(write_options, column_family, key, ts); +} + +Status DBImpl::SingleDelete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, + const Slice& key) { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + return DB::SingleDelete(write_options, column_family, key); +} + +Status DBImpl::SingleDelete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) { + const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + if (!s.ok()) { + return s; + } + return DB::SingleDelete(write_options, column_family, key, ts); +} + +Status DBImpl::DeleteRange(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) { + const Status s = FailIfCfHasTs(column_family); + if (!s.ok()) { + return s; + } + return DB::DeleteRange(write_options, column_family, begin_key, end_key); +} + +Status DBImpl::DeleteRange(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, + const Slice& begin_key, const 
Slice& end_key, + const Slice& ts) { + const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + if (!s.ok()) { + return s; + } + return DB::DeleteRange(write_options, column_family, begin_key, end_key, ts); +} + +void DBImpl::SetRecoverableStatePreReleaseCallback( + PreReleaseCallback* callback) { + recoverable_state_pre_release_callback_.reset(callback); +} + +Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { + Status s; + if (write_options.protection_bytes_per_key > 0) { + s = WriteBatchInternal::UpdateProtectionInfo( + my_batch, write_options.protection_bytes_per_key); + } + if (s.ok()) { + s = WriteImpl(write_options, my_batch, /*callback=*/nullptr, + /*log_used=*/nullptr); + } + return s; +} + +Status DBImpl::WriteWithCallback(const WriteOptions& write_options, + WriteBatch* my_batch, + WriteCallback* callback) { + Status s; + if (write_options.protection_bytes_per_key > 0) { + s = WriteBatchInternal::UpdateProtectionInfo( + my_batch, write_options.protection_bytes_per_key); + } + if (s.ok()) { + s = WriteImpl(write_options, my_batch, callback, nullptr); + } + return s; +} + +// The main write queue. This is the only write queue that updates LastSequence. +// When using one write queue, the same sequence also indicates the last +// published sequence. +Status DBImpl::WriteImpl(const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, + uint64_t* log_used, uint64_t log_ref, + bool disable_memtable, uint64_t* seq_used, + size_t batch_cnt, + PreReleaseCallback* pre_release_callback, + PostMemTableCallback* post_memtable_callback) { + assert(!seq_per_batch_ || batch_cnt != 0); + assert(my_batch == nullptr || my_batch->Count() == 0 || + write_options.protection_bytes_per_key == 0 || + write_options.protection_bytes_per_key == + my_batch->GetProtectionBytesPerKey()); + if (my_batch == nullptr) { + return Status::InvalidArgument("Batch is nullptr!"); + } else if (!disable_memtable && + WriteBatchInternal::TimestampsUpdateNeeded(*my_batch)) { + // If writing to memtable, then we require the caller to set/update the + // timestamps for the keys in the write batch. + // Otherwise, it means we are just writing to the WAL, and we allow + // timestamps unset for the keys in the write batch. This can happen if we + // use TransactionDB with write-committed policy, and we currently do not + // support user-defined timestamp with other policies. + // In the prepare phase, a transaction can write the batch to the WAL + // without inserting to memtable. The keys in the batch do not have to be + // assigned timestamps because they will be used only during recovery if + // there is a commit marker which includes their commit timestamp. 
+ return Status::InvalidArgument("write batch must have timestamp(s) set"); + } else if (write_options.rate_limiter_priority != Env::IO_TOTAL && + write_options.rate_limiter_priority != Env::IO_USER) { + return Status::InvalidArgument( + "WriteOptions::rate_limiter_priority only allows " + "Env::IO_TOTAL and Env::IO_USER due to implementation constraints"); + } else if (write_options.rate_limiter_priority != Env::IO_TOTAL && + (write_options.disableWAL || manual_wal_flush_)) { + return Status::InvalidArgument( + "WriteOptions::rate_limiter_priority currently only supports " + "rate-limiting automatic WAL flush, which requires " + "`WriteOptions::disableWAL` and " + "`DBOptions::manual_wal_flush` both set to false"); + } else if (write_options.protection_bytes_per_key != 0 && + write_options.protection_bytes_per_key != 8) { + return Status::InvalidArgument( + "`WriteOptions::protection_bytes_per_key` must be zero or eight"); + } + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_ && !tracer_->IsWriteOrderPreserved()) { + // We don't have to preserve write order so can trace anywhere. It's more + // efficient to trace here than to add latency to a phase of the log/apply + // pipeline. + // TODO: maybe handle the tracing status? + tracer_->Write(my_batch).PermitUncheckedError(); + } + } + if (write_options.sync && write_options.disableWAL) { + return Status::InvalidArgument("Sync writes has to enable WAL."); + } + if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) { + return Status::NotSupported( + "pipelined_writes is not compatible with concurrent prepares"); + } + if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) { + // TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt + return Status::NotSupported( + "pipelined_writes is not compatible with seq_per_batch"); + } + if (immutable_db_options_.unordered_write && + immutable_db_options_.enable_pipelined_write) { + return Status::NotSupported( + "pipelined_writes is not compatible with unordered_write"); + } + if (immutable_db_options_.enable_pipelined_write && + post_memtable_callback != nullptr) { + return Status::NotSupported( + "pipelined write currently does not honor post_memtable_callback"); + } + if (seq_per_batch_ && post_memtable_callback != nullptr) { + return Status::NotSupported( + "seq_per_batch currently does not honor post_memtable_callback"); + } + // Otherwise IsLatestPersistentState optimization does not make sense + assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) || + disable_memtable); + + if (write_options.low_pri) { + Status s = ThrottleLowPriWritesIfNeeded(write_options, my_batch); + if (!s.ok()) { + return s; + } + } + + if (two_write_queues_ && disable_memtable) { + AssignOrder assign_order = + seq_per_batch_ ? kDoAssignOrder : kDontAssignOrder; + // Otherwise it is WAL-only Prepare batches in WriteCommitted policy and + // they don't consume sequence. + return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch, + callback, log_used, log_ref, seq_used, batch_cnt, + pre_release_callback, assign_order, + kDontPublishLastSeq, disable_memtable); + } + + if (immutable_db_options_.unordered_write) { + const size_t sub_batch_cnt = batch_cnt != 0 + ? 
batch_cnt
+                                     // every key is a sub-batch consuming a seq
+                                     : WriteBatchInternal::Count(my_batch);
+    uint64_t seq = 0;
+    // Use a write thread to i) optimize for WAL write, ii) publish last
+    // sequence in increasing order, iii) call pre_release_callback serially
+    Status status = WriteImplWALOnly(
+        &write_thread_, write_options, my_batch, callback, log_used, log_ref,
+        &seq, sub_batch_cnt, pre_release_callback, kDoAssignOrder,
+        kDoPublishLastSeq, disable_memtable);
+    TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL");
+    if (!status.ok()) {
+      return status;
+    }
+    if (seq_used) {
+      *seq_used = seq;
+    }
+    if (!disable_memtable) {
+      TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeUnorderedWriteMemtable");
+      status = UnorderedWriteMemtable(write_options, my_batch, callback,
+                                      log_ref, seq, sub_batch_cnt);
+    }
+    return status;
+  }
+
+  if (immutable_db_options_.enable_pipelined_write) {
+    return PipelinedWriteImpl(write_options, my_batch, callback, log_used,
+                              log_ref, disable_memtable, seq_used);
+  }
+
+  PERF_TIMER_GUARD(write_pre_and_post_process_time);
+  WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+                        disable_memtable, batch_cnt, pre_release_callback,
+                        post_memtable_callback);
+  StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+  write_thread_.JoinBatchGroup(&w);
+  if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+    // we are a non-leader in a parallel group
+
+    if (w.ShouldWriteToMemtable()) {
+      PERF_TIMER_STOP(write_pre_and_post_process_time);
+      PERF_TIMER_GUARD(write_memtable_time);
+
+      ColumnFamilyMemTablesImpl column_family_memtables(
+          versions_->GetColumnFamilySet());
+      w.status = WriteBatchInternal::InsertInto(
+          &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+          &trim_history_scheduler_,
+          write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+          true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt,
+          batch_per_txn_, write_options.memtable_insert_hint_per_batch);
+
+      PERF_TIMER_START(write_pre_and_post_process_time);
+    }
+
+    if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+      // we're responsible for exit batch group
+      // TODO(myabandeh): propagate status to write_group
+      auto last_sequence = w.write_group->last_sequence;
+      for (auto* tmp_w : *(w.write_group)) {
+        assert(tmp_w);
+        if (tmp_w->post_memtable_callback) {
+          Status tmp_s =
+              (*tmp_w->post_memtable_callback)(last_sequence, disable_memtable);
+          // TODO: propagate the execution status of post_memtable_callback to
+          // caller.
+          assert(tmp_s.ok());
+        }
+      }
+      versions_->SetLastSequence(last_sequence);
+      MemTableInsertStatusCheck(w.status);
+      write_thread_.ExitAsBatchGroupFollower(&w);
+    }
+    assert(w.state == WriteThread::STATE_COMPLETED);
+    // STATE_COMPLETED conditional below handles exit
+  }
+  if (w.state == WriteThread::STATE_COMPLETED) {
+    if (log_used != nullptr) {
+      *log_used = w.log_used;
+    }
+    if (seq_used != nullptr) {
+      *seq_used = w.sequence;
+    }
+    // write is complete and leader has updated sequence
+    return w.FinalStatus();
+  }
+  // else we are the leader of the write batch group
+  assert(w.state == WriteThread::STATE_GROUP_LEADER);
+  Status status;
+  // Once it reaches this point, the current writer "w" will try to do its
+  // write job. It may also pick up some of the remaining writers in
+  // "writers_" when it finds it suitable, and finish them in the same write
+  // batch. This is how a write job can be completed by a writer other than
+  // the one that submitted it.
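+  // For illustration (not part of this change): this is the group-commit
+  // path. If several threads call, e.g., db->Put(WriteOptions(), "k1", "v1")
+  // and db->Put(WriteOptions(), "k2", "v2") concurrently, one thread becomes
+  // the leader and persists both batches with a single WAL append, while the
+  // others block in JoinBatchGroup() and simply return the group outcome via
+  // FinalStatus().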
+  WriteContext write_context;
+  LogContext log_context(write_options.sync);
+  WriteThread::WriteGroup write_group;
+  bool in_parallel_group = false;
+  uint64_t last_sequence = kMaxSequenceNumber;
+
+  assert(!two_write_queues_ || !disable_memtable);
+  {
+    // With concurrent writes we do preprocess only in the write thread that
+    // also does write to memtable to avoid sync issue on shared data structure
+    // with the other thread
+
+    // PreprocessWrite does its own perf timing.
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+    status = PreprocessWrite(write_options, &log_context, &write_context);
+    if (!two_write_queues_) {
+      // Assign it after ::PreprocessWrite since the sequence might advance
+      // inside it by WriteRecoverableState
+      last_sequence = versions_->LastSequence();
+    }
+
+    PERF_TIMER_START(write_pre_and_post_process_time);
+  }
+
+  // Add to log and apply to memtable. We can release the lock
+  // during this phase since &w is currently responsible for logging
+  // and protects against concurrent loggers and concurrent writes
+  // into memtables
+
+  TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters");
+  last_batch_group_size_ =
+      write_thread_.EnterAsBatchGroupLeader(&w, &write_group);
+
+  IOStatus io_s;
+  Status pre_release_cb_status;
+  if (status.ok()) {
+    // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+    // grabs but does not seem thread-safe.
+    if (tracer_) {
+      InstrumentedMutexLock lock(&trace_mutex_);
+      if (tracer_ && tracer_->IsWriteOrderPreserved()) {
+        for (auto* writer : write_group) {
+          // TODO: maybe handle the tracing status?
+          tracer_->Write(writer->batch).PermitUncheckedError();
+        }
+      }
+    }
+    // Rules for when we can update the memtable concurrently
+    // 1. supported by memtable
+    // 2. Puts are not okay if inplace_update_support
+    // 3. Merges are not okay
+    //
+    // Rules 1..2 are enforced by checking the options
+    // during startup (CheckConcurrentWritesSupported), so if
+    // options.allow_concurrent_memtable_write is true then they can be
+    // assumed to be true. Rule 3 is checked for each batch. We could
+    // relax rule 2 if we could prevent write batches from referring
+    // more than once to a particular key.
+    bool parallel = immutable_db_options_.allow_concurrent_memtable_write &&
+                    write_group.size > 1;
+    size_t total_count = 0;
+    size_t valid_batches = 0;
+    size_t total_byte_size = 0;
+    size_t pre_release_callback_cnt = 0;
+    for (auto* writer : write_group) {
+      assert(writer);
+      if (writer->CheckCallback(this)) {
+        valid_batches += writer->batch_cnt;
+        if (writer->ShouldWriteToMemtable()) {
+          total_count += WriteBatchInternal::Count(writer->batch);
+          parallel = parallel && !writer->batch->HasMerge();
+        }
+        total_byte_size = WriteBatchInternal::AppendedByteSize(
+            total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+        if (writer->pre_release_callback) {
+          pre_release_callback_cnt++;
+        }
+      }
+    }
+    // Note about seq_per_batch_: either disableWAL is set for the entire write
+    // group or not. In either case we inc seq for each write batch with no
+    // failed callback. This means that there could be a batch with
+    // disable_memtable in between; although we do not write this batch to
+    // memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc
+    // the seq per valid written key to mem.
+    size_t seq_inc = seq_per_batch_ ?
valid_batches : total_count; + + const bool concurrent_update = two_write_queues_; + // Update stats while we are an exclusive group leader, so we know + // that nobody else can be writing to these particular stats. + // We're optimistic, updating the stats before we successfully + // commit. That lets us release our leader status early. + auto stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count, + concurrent_update); + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size, + concurrent_update); + RecordTick(stats_, BYTES_WRITTEN, total_byte_size); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1, + concurrent_update); + RecordTick(stats_, WRITE_DONE_BY_SELF); + auto write_done_by_other = write_group.size - 1; + if (write_done_by_other > 0) { + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, + write_done_by_other, concurrent_update); + RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other); + } + RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); + + if (write_options.disableWAL) { + has_unpersisted_data_.store(true, std::memory_order_relaxed); + } + + PERF_TIMER_STOP(write_pre_and_post_process_time); + + if (!two_write_queues_) { + if (status.ok() && !write_options.disableWAL) { + assert(log_context.log_file_number_size); + LogFileNumberSize& log_file_number_size = + *(log_context.log_file_number_size); + PERF_TIMER_GUARD(write_wal_time); + io_s = + WriteToWAL(write_group, log_context.writer, log_used, + log_context.need_log_sync, log_context.need_log_dir_sync, + last_sequence + 1, log_file_number_size); + } + } else { + if (status.ok() && !write_options.disableWAL) { + PERF_TIMER_GUARD(write_wal_time); + // LastAllocatedSequence is increased inside WriteToWAL under + // wal_write_mutex_ to ensure ordered events in WAL + io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, + seq_inc); + } else { + // Otherwise we inc seq number for memtable writes + last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); + } + } + status = io_s; + assert(last_sequence != kMaxSequenceNumber); + const SequenceNumber current_sequence = last_sequence + 1; + last_sequence += seq_inc; + + // PreReleaseCallback is called after WAL write and before memtable write + if (status.ok()) { + SequenceNumber next_sequence = current_sequence; + size_t index = 0; + // Note: the logic for advancing seq here must be consistent with the + // logic in WriteBatchInternal::InsertInto(write_group...) as well as + // with WriteBatchInternal::InsertInto(write_batch...) that is called on + // the merged batch during recovery from the WAL. 
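+    // Worked example (illustrative, not part of this change): with
+    // !seq_per_batch_ and a group of three batches holding 2, 1, and 4 keys,
+    // seq_inc = total_count = 7; the writers below get sequences s+1, s+3,
+    // and s+4, where s is last_sequence before this group, and last_sequence
+    // ends at s+7. With seq_per_batch_, each valid batch instead consumes one
+    // sequence per sub-batch (its batch_cnt).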
+      for (auto* writer : write_group) {
+        if (writer->CallbackFailed()) {
+          continue;
+        }
+        writer->sequence = next_sequence;
+        if (writer->pre_release_callback) {
+          Status ws = writer->pre_release_callback->Callback(
+              writer->sequence, disable_memtable, writer->log_used, index++,
+              pre_release_callback_cnt);
+          if (!ws.ok()) {
+            status = pre_release_cb_status = ws;
+            break;
+          }
+        }
+        if (seq_per_batch_) {
+          assert(writer->batch_cnt);
+          next_sequence += writer->batch_cnt;
+        } else if (writer->ShouldWriteToMemtable()) {
+          next_sequence += WriteBatchInternal::Count(writer->batch);
+        }
+      }
+    }
+
+    if (status.ok()) {
+      PERF_TIMER_GUARD(write_memtable_time);
+
+      if (!parallel) {
+        // w.sequence will be set inside InsertInto
+        w.status = WriteBatchInternal::InsertInto(
+            write_group, current_sequence, column_family_memtables_.get(),
+            &flush_scheduler_, &trim_history_scheduler_,
+            write_options.ignore_missing_column_families,
+            0 /*recovery_log_number*/, this, parallel, seq_per_batch_,
+            batch_per_txn_);
+      } else {
+        write_group.last_sequence = last_sequence;
+        write_thread_.LaunchParallelMemTableWriters(&write_group);
+        in_parallel_group = true;
+
+        // Each parallel follower does its own memtable writes. The leader
+        // should also do its own.
+        if (w.ShouldWriteToMemtable()) {
+          ColumnFamilyMemTablesImpl column_family_memtables(
+              versions_->GetColumnFamilySet());
+          assert(w.sequence == current_sequence);
+          w.status = WriteBatchInternal::InsertInto(
+              &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+              &trim_history_scheduler_,
+              write_options.ignore_missing_column_families, 0 /*log_number*/,
+              this, true /*concurrent_memtable_writes*/, seq_per_batch_,
+              w.batch_cnt, batch_per_txn_,
+              write_options.memtable_insert_hint_per_batch);
+        }
+      }
+      if (seq_used != nullptr) {
+        *seq_used = w.sequence;
+      }
+    }
+  }
+  PERF_TIMER_START(write_pre_and_post_process_time);
+
+  if (!io_s.ok()) {
+    // Check WriteToWAL status
+    IOStatusCheck(io_s);
+  }
+  if (!w.CallbackFailed()) {
+    if (!io_s.ok()) {
+      assert(pre_release_cb_status.ok());
+    } else {
+      WriteStatusCheck(pre_release_cb_status);
+    }
+  } else {
+    assert(pre_release_cb_status.ok());
+  }
+
+  if (log_context.need_log_sync) {
+    VersionEdit synced_wals;
+    log_write_mutex_.Lock();
+    if (status.ok()) {
+      MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+                     &synced_wals);
+    } else {
+      MarkLogsNotSynced(logfile_number_);
+    }
+    log_write_mutex_.Unlock();
+    if (status.ok() && synced_wals.IsWalAddition()) {
+      InstrumentedMutexLock l(&mutex_);
+      // TODO: plumb Env::IOActivity
+      const ReadOptions read_options;
+      status = ApplyWALToManifest(read_options, &synced_wals);
+    }
+
+    // Requesting sync with two_write_queues_ is expected to be very rare. We
+    // hence provide a simple implementation that is not necessarily efficient.
+    if (two_write_queues_) {
+      if (manual_wal_flush_) {
+        status = FlushWAL(true);
+      } else {
+        status = SyncWAL();
+      }
+    }
+  }
+
+  bool should_exit_batch_group = true;
+  if (in_parallel_group) {
+    // CompleteParallelWorker returns true if this thread should
+    // handle exit, false means somebody else did
+    should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
+  }
+  if (should_exit_batch_group) {
+    if (status.ok()) {
+      for (auto* tmp_w : write_group) {
+        assert(tmp_w);
+        if (tmp_w->post_memtable_callback) {
+          Status tmp_s =
+              (*tmp_w->post_memtable_callback)(last_sequence, disable_memtable);
+          // TODO: propagate the execution status of post_memtable_callback to
+          // caller.
+          assert(tmp_s.ok());
+        }
+      }
+      // Note: if we are to resume after non-OK statuses we need to revisit how
+      // we react to non-OK statuses here.
+      versions_->SetLastSequence(last_sequence);
+    }
+    MemTableInsertStatusCheck(w.status);
+    write_thread_.ExitAsBatchGroupLeader(write_group, status);
+  }
+
+  if (status.ok()) {
+    status = w.FinalStatus();
+  }
+  return status;
+}
+
+Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
+                                  WriteBatch* my_batch, WriteCallback* callback,
+                                  uint64_t* log_used, uint64_t log_ref,
+                                  bool disable_memtable, uint64_t* seq_used) {
+  PERF_TIMER_GUARD(write_pre_and_post_process_time);
+  StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+  WriteContext write_context;
+
+  WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+                        disable_memtable, /*_batch_cnt=*/0,
+                        /*_pre_release_callback=*/nullptr);
+  write_thread_.JoinBatchGroup(&w);
+  TEST_SYNC_POINT("DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup");
+  if (w.state == WriteThread::STATE_GROUP_LEADER) {
+    WriteThread::WriteGroup wal_write_group;
+    if (w.callback && !w.callback->AllowWriteBatching()) {
+      write_thread_.WaitForMemTableWriters();
+    }
+    LogContext log_context(!write_options.disableWAL && write_options.sync);
+    // PreprocessWrite does its own perf timing.
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+    w.status = PreprocessWrite(write_options, &log_context, &write_context);
+    PERF_TIMER_START(write_pre_and_post_process_time);
+
+    // This can set a non-OK status if a callback fails.
+    last_batch_group_size_ =
+        write_thread_.EnterAsBatchGroupLeader(&w, &wal_write_group);
+    const SequenceNumber current_sequence =
+        write_thread_.UpdateLastSequence(versions_->LastSequence()) + 1;
+    size_t total_count = 0;
+    size_t total_byte_size = 0;
+
+    if (w.status.ok()) {
+      // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+      // grabs but does not seem thread-safe.
+      if (tracer_) {
+        InstrumentedMutexLock lock(&trace_mutex_);
+        if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
+          for (auto* writer : wal_write_group) {
+            // TODO: maybe handle the tracing status?
+            tracer_->Write(writer->batch).PermitUncheckedError();
+          }
+        }
+      }
+      SequenceNumber next_sequence = current_sequence;
+      for (auto* writer : wal_write_group) {
+        assert(writer);
+        if (writer->CheckCallback(this)) {
+          if (writer->ShouldWriteToMemtable()) {
+            writer->sequence = next_sequence;
+            size_t count = WriteBatchInternal::Count(writer->batch);
+            next_sequence += count;
+            total_count += count;
+          }
+          total_byte_size = WriteBatchInternal::AppendedByteSize(
+              total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+        }
+      }
+      if (w.disable_wal) {
+        has_unpersisted_data_.store(true, std::memory_order_relaxed);
+      }
+      write_thread_.UpdateLastSequence(current_sequence + total_count - 1);
+    }
+
+    auto stats = default_cf_internal_stats_;
+    stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+    RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+    stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size);
+    RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+    RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+    IOStatus io_s;
+    io_s.PermitUncheckedError();  // Allow io_s to be uninitialized
+
+    if (w.status.ok() && !write_options.disableWAL) {
+      PERF_TIMER_GUARD(write_wal_time);
+      stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1);
+      RecordTick(stats_, WRITE_DONE_BY_SELF, 1);
+      if (wal_write_group.size > 1) {
+        stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+                          wal_write_group.size - 1);
+        RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
+      }
+      assert(log_context.log_file_number_size);
+      LogFileNumberSize& log_file_number_size =
+          *(log_context.log_file_number_size);
+      io_s =
+          WriteToWAL(wal_write_group, log_context.writer, log_used,
+                     log_context.need_log_sync, log_context.need_log_dir_sync,
+                     current_sequence, log_file_number_size);
+      w.status = io_s;
+    }
+
+    if (!io_s.ok()) {
+      // Check WriteToWAL status
+      IOStatusCheck(io_s);
+    } else if (!w.CallbackFailed()) {
+      WriteStatusCheck(w.status);
+    }
+
+    VersionEdit synced_wals;
+    if (log_context.need_log_sync) {
+      InstrumentedMutexLock l(&log_write_mutex_);
+      if (w.status.ok()) {
+        MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+                       &synced_wals);
+      } else {
+        MarkLogsNotSynced(logfile_number_);
+      }
+    }
+    if (w.status.ok() && synced_wals.IsWalAddition()) {
+      InstrumentedMutexLock l(&mutex_);
+      // TODO: plumb Env::IOActivity
+      const ReadOptions read_options;
+      w.status = ApplyWALToManifest(read_options, &synced_wals);
+    }
+    write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
+  }
+
+  // NOTE: the memtable_write_group is declared before the following
+  // `if` statement because its lifetime needs to be longer than the
+  // inner context of the `if`, as a reference to it may be used further
+  // below within the outer write thread.
+  WriteThread::WriteGroup memtable_write_group;
+
+  if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) {
+    PERF_TIMER_GUARD(write_memtable_time);
+    assert(w.ShouldWriteToMemtable());
+    write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group);
+    if (memtable_write_group.size > 1 &&
+        immutable_db_options_.allow_concurrent_memtable_write) {
+      write_thread_.LaunchParallelMemTableWriters(&memtable_write_group);
+    } else {
+      memtable_write_group.status = WriteBatchInternal::InsertInto(
+          memtable_write_group, w.sequence, column_family_memtables_.get(),
+          &flush_scheduler_, &trim_history_scheduler_,
+          write_options.ignore_missing_column_families,
0 /*log_number*/, this, + false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_); + versions_->SetLastSequence(memtable_write_group.last_sequence); + write_thread_.ExitAsMemTableWriter(&w, memtable_write_group); + } + } else { + // NOTE: the memtable_write_group is never really used, + // so we need to set its status to pass ASSERT_STATUS_CHECKED + memtable_write_group.status.PermitUncheckedError(); + } + + if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { + assert(w.ShouldWriteToMemtable()); + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, write_options.ignore_missing_column_families, + 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, + false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/, + write_options.memtable_insert_hint_per_batch); + if (write_thread_.CompleteParallelMemTableWriter(&w)) { + MemTableInsertStatusCheck(w.status); + versions_->SetLastSequence(w.write_group->last_sequence); + write_thread_.ExitAsMemTableWriter(&w, *w.write_group); + } + } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } + + assert(w.state == WriteThread::STATE_COMPLETED); + return w.FinalStatus(); +} + +Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, + WriteBatch* my_batch, + WriteCallback* callback, uint64_t log_ref, + SequenceNumber seq, + const size_t sub_batch_cnt) { + PERF_TIMER_GUARD(write_pre_and_post_process_time); + StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); + + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + false /*disable_memtable*/); + + if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) { + w.sequence = seq; + size_t total_count = WriteBatchInternal::Count(my_batch); + InternalStats* stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count); + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, write_options.ignore_missing_column_families, + 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, + seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/, + write_options.memtable_insert_hint_per_batch); + if (write_options.disableWAL) { + has_unpersisted_data_.store(true, std::memory_order_relaxed); + } + } + + size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1; + if (pending_cnt == 0) { + // switch_cv_ waits until pending_memtable_writes_ = 0. Locking its mutex + // before notify ensures that cv is in waiting state when it is notified + // thus not missing the update to pending_memtable_writes_ even though it is + // not modified under the mutex. + std::lock_guard lck(switch_mutex_); + switch_cv_.notify_all(); + } + WriteStatusCheck(w.status); + + if (!w.FinalStatus().ok()) { + return w.FinalStatus(); + } + return Status::OK(); +} + +// The 2nd write queue. If enabled it will be used only for WAL-only writes. +// This is the only queue that updates LastPublishedSequence which is only +// applicable in a two-queue setting. 
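+// For illustration (not part of this change): with two_write_queues_, a
+// WAL-only write such as a prepare batch under the WriteCommitted policy
+// takes this queue (nonmem_write_thread_) and can proceed concurrently with
+// memtable writes in the main queue; as noted earlier, such prepares do not
+// consume a sequence number unless seq_per_batch_ is set.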
+Status DBImpl::WriteImplWALOnly( + WriteThread* write_thread, const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, + const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, + PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, + const PublishLastSeq publish_last_seq, const bool disable_memtable) { + PERF_TIMER_GUARD(write_pre_and_post_process_time); + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + disable_memtable, sub_batch_cnt, pre_release_callback); + StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); + + write_thread->JoinBatchGroup(&w); + assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER); + if (w.state == WriteThread::STATE_COMPLETED) { + if (log_used != nullptr) { + *log_used = w.log_used; + } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } + return w.FinalStatus(); + } + // else we are the leader of the write batch group + assert(w.state == WriteThread::STATE_GROUP_LEADER); + + if (publish_last_seq == kDoPublishLastSeq) { + Status status; + + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + WriteContext write_context; + if (error_handler_.IsDBStopped()) { + status = error_handler_.GetBGError(); + } + // TODO(myabandeh): Make preliminary checks thread-safe so we could do them + // without paying the cost of obtaining the mutex. + if (status.ok()) { + LogContext log_context; + status = PreprocessWrite(write_options, &log_context, &write_context); + WriteStatusCheckOnLocked(status); + } + if (!status.ok()) { + WriteThread::WriteGroup write_group; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + write_thread->ExitAsBatchGroupLeader(write_group, status); + return status; + } + } else { + InstrumentedMutexLock lock(&mutex_); + Status status = + DelayWrite(/*num_bytes=*/0ull, *write_thread, write_options); + if (!status.ok()) { + WriteThread::WriteGroup write_group; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + write_thread->ExitAsBatchGroupLeader(write_group, status); + return status; + } + } + + WriteThread::WriteGroup write_group; + uint64_t last_sequence; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + // Note: no need to update last_batch_group_size_ here since the batch writes + // to WAL only + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) { + for (auto* writer : write_group) { + // TODO: maybe handle the tracing status? + tracer_->Write(writer->batch).PermitUncheckedError(); + } + } + } + + size_t pre_release_callback_cnt = 0; + size_t total_byte_size = 0; + for (auto* writer : write_group) { + assert(writer); + if (writer->CheckCallback(this)) { + total_byte_size = WriteBatchInternal::AppendedByteSize( + total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + if (writer->pre_release_callback) { + pre_release_callback_cnt++; + } + } + } + + const bool concurrent_update = true; + // Update stats while we are an exclusive group leader, so we know + // that nobody else can be writing to these particular stats. + // We're optimistic, updating the stats before we successfully + // commit. That lets us release our leader status early. 
+  auto stats = default_cf_internal_stats_;
+  stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+                    concurrent_update);
+  RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+  stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+                    concurrent_update);
+  RecordTick(stats_, WRITE_DONE_BY_SELF);
+  auto write_done_by_other = write_group.size - 1;
+  if (write_done_by_other > 0) {
+    stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+                      write_done_by_other, concurrent_update);
+    RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+  }
+  RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+  PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+  PERF_TIMER_GUARD(write_wal_time);
+  // LastAllocatedSequence is increased inside WriteToWAL under
+  // wal_write_mutex_ to ensure ordered events in WAL
+  size_t seq_inc = 0 /* total_count */;
+  if (assign_order == kDoAssignOrder) {
+    size_t total_batch_cnt = 0;
+    for (auto* writer : write_group) {
+      assert(writer->batch_cnt || !seq_per_batch_);
+      if (!writer->CallbackFailed()) {
+        total_batch_cnt += writer->batch_cnt;
+      }
+    }
+    seq_inc = total_batch_cnt;
+  }
+  Status status;
+  if (!write_options.disableWAL) {
+    IOStatus io_s =
+        ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
+    status = io_s;
+    // last_sequence may not be set if there is an error
+    // This error checking and return is moved up to avoid using uninitialized
+    // last_sequence.
+    if (!io_s.ok()) {
+      IOStatusCheck(io_s);
+      write_thread->ExitAsBatchGroupLeader(write_group, status);
+      return status;
+    }
+  } else {
+    // Otherwise we inc seq number to do solely the seq allocation
+    last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+  }
+
+  size_t memtable_write_cnt = 0;
+  auto curr_seq = last_sequence + 1;
+  for (auto* writer : write_group) {
+    if (writer->CallbackFailed()) {
+      continue;
+    }
+    writer->sequence = curr_seq;
+    if (assign_order == kDoAssignOrder) {
+      assert(writer->batch_cnt || !seq_per_batch_);
+      curr_seq += writer->batch_cnt;
+    }
+    if (!writer->disable_memtable) {
+      memtable_write_cnt++;
+    }
+    // else seq advances only by memtable writes
+  }
+  if (status.ok() && write_options.sync) {
+    assert(!write_options.disableWAL);
+    // Requesting sync with two_write_queues_ is expected to be very rare. We
+    // hence provide a simple implementation that is not necessarily efficient.
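+    // Note (illustrative, not part of this change): FlushWAL(true) first
+    // moves any records buffered by manual_wal_flush_ into the file and then
+    // syncs it, whereas SyncWAL() only syncs data that has already reached
+    // the file; that is why the manual-flush case below cannot call SyncWAL()
+    // directly.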
+    if (manual_wal_flush_) {
+      status = FlushWAL(true);
+    } else {
+      status = SyncWAL();
+    }
+  }
+  PERF_TIMER_START(write_pre_and_post_process_time);
+
+  if (!w.CallbackFailed()) {
+    WriteStatusCheck(status);
+  }
+  if (status.ok()) {
+    size_t index = 0;
+    for (auto* writer : write_group) {
+      if (!writer->CallbackFailed() && writer->pre_release_callback) {
+        assert(writer->sequence != kMaxSequenceNumber);
+        Status ws = writer->pre_release_callback->Callback(
+            writer->sequence, disable_memtable, writer->log_used, index++,
+            pre_release_callback_cnt);
+        if (!ws.ok()) {
+          status = ws;
+          break;
+        }
+      }
+    }
+  }
+  if (publish_last_seq == kDoPublishLastSeq) {
+    versions_->SetLastSequence(last_sequence + seq_inc);
+    // Currently we only use kDoPublishLastSeq in unordered_write
+    assert(immutable_db_options_.unordered_write);
+  }
+  if (immutable_db_options_.unordered_write && status.ok()) {
+    pending_memtable_writes_ += memtable_write_cnt;
+  }
+  write_thread->ExitAsBatchGroupLeader(write_group, status);
+  if (status.ok()) {
+    status = w.FinalStatus();
+  }
+  if (seq_used != nullptr) {
+    *seq_used = w.sequence;
+  }
+  return status;
+}
+
+void DBImpl::WriteStatusCheckOnLocked(const Status& status) {
+  // Is setting bg_error_ enough here? This will at least stop
+  // compaction and fail any further writes.
+  InstrumentedMutexLock l(&mutex_);
+  assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
+  if (immutable_db_options_.paranoid_checks && !status.ok() &&
+      !status.IsBusy() && !status.IsIncomplete()) {
+    // Maybe change the return status to void?
+    error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
+  }
+}
+
+void DBImpl::WriteStatusCheck(const Status& status) {
+  // Is setting bg_error_ enough here? This will at least stop
+  // compaction and fail any further writes.
+  assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
+  if (immutable_db_options_.paranoid_checks && !status.ok() &&
+      !status.IsBusy() && !status.IsIncomplete()) {
+    mutex_.Lock();
+    // Maybe change the return status to void?
+    error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
+    mutex_.Unlock();
+  }
+}
+
+void DBImpl::IOStatusCheck(const IOStatus& io_status) {
+  // Is setting bg_error_ enough here? This will at least stop
+  // compaction and fail any further writes.
+  if ((immutable_db_options_.paranoid_checks && !io_status.ok() &&
+       !io_status.IsBusy() && !io_status.IsIncomplete()) ||
+      io_status.IsIOFenced()) {
+    mutex_.Lock();
+    // Maybe change the return status to void?
+    error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback);
+    mutex_.Unlock();
+  } else {
+    // Force the writable file to continue being writable.
+    logs_.back().writer->file()->reset_seen_error();
+  }
+}
+
+void DBImpl::MemTableInsertStatusCheck(const Status& status) {
+  // A non-OK status here indicates that the state implied by the
+  // WAL has diverged from the in-memory state. This could be
+  // because of a corrupt write_batch (very bad), or because the
+  // client specified an invalid column family and didn't specify
+  // ignore_missing_column_families.
+  if (!status.ok()) {
+    mutex_.Lock();
+    assert(!error_handler_.IsBGWorkStopped());
+    // Maybe change the return status to void?
+    error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable)
+        .PermitUncheckedError();
+    mutex_.Unlock();
+  }
+}
+
+Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
+                               LogContext* log_context,
+                               WriteContext* write_context) {
+  assert(write_context != nullptr && log_context != nullptr);
+  Status status;
+
+  if (error_handler_.IsDBStopped()) {
+    InstrumentedMutexLock l(&mutex_);
+    status = error_handler_.GetBGError();
+  }
+
+  PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
+
+  if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) {
+    assert(versions_);
+    InstrumentedMutexLock l(&mutex_);
+    const ColumnFamilySet* const column_families =
+        versions_->GetColumnFamilySet();
+    assert(column_families);
+    size_t num_cfs = column_families->NumberOfColumnFamilies();
+    assert(num_cfs >= 1);
+    if (num_cfs > 1) {
+      WaitForPendingWrites();
+      status = SwitchWAL(write_context);
+    }
+  }
+
+  if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) {
+    // Before a new memtable is added in SwitchMemtable(),
+    // write_buffer_manager_->ShouldFlush() will keep returning true. If
+    // another thread is writing to another DB with the same write buffer,
+    // they may also be flushed. We may end up flushing many more DBs than
+    // needed. It's suboptimal but still correct.
+    InstrumentedMutexLock l(&mutex_);
+    WaitForPendingWrites();
+    status = HandleWriteBufferManagerFlush(write_context);
+  }
+
+  if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) {
+    InstrumentedMutexLock l(&mutex_);
+    status = TrimMemtableHistory(write_context);
+  }
+
+  if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
+    InstrumentedMutexLock l(&mutex_);
+    WaitForPendingWrites();
+    status = ScheduleFlushes(write_context);
+  }
+
+  PERF_TIMER_STOP(write_scheduling_flushes_compactions_time);
+  PERF_TIMER_GUARD(write_pre_and_post_process_time);
+
+  if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
+                               write_controller_.NeedsDelay()))) {
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+    PERF_TIMER_GUARD(write_delay_time);
+    // We don't know the size of the current batch, so we always use the size
+    // of the previous one. This might create a fairness issue in that
+    // expiration might happen for smaller writes while larger writes can go
+    // through. Can optimize it if it becomes an issue.
+    InstrumentedMutexLock l(&mutex_);
+    status = DelayWrite(last_batch_group_size_, write_thread_, write_options);
+    PERF_TIMER_START(write_pre_and_post_process_time);
+  }
+
+  // If memory usage has exceeded a certain threshold,
+  // write_buffer_manager_->ShouldStall() returns true to all threads writing
+  // to all DBs and writers will be stalled.
+  // It does a soft check because WriteBufferManager::buffer_limit_ has
+  // already been exceeded at this point, so no new write (including the
+  // current one) will go through until memory usage is decreased.
+  if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) {
+    default_cf_internal_stats_->AddDBStats(
+        InternalStats::kIntStatsWriteBufferManagerLimitStopsCounts, 1,
+        true /* concurrent */);
+    if (write_options.no_slowdown) {
+      status = Status::Incomplete("Write stall");
+    } else {
+      InstrumentedMutexLock l(&mutex_);
+      WriteBufferManagerStallWrites();
+    }
+  }
+  InstrumentedMutexLock l(&log_write_mutex_);
+  if (status.ok() && log_context->need_log_sync) {
+    // Wait until the parallel syncs are finished. Any sync process has to
+    // sync the front log too, so it is enough to check the status of front().
+    // We do a while loop since log_sync_cv_ is signalled when any sync is
+    // finished.
+    // Note: there does not seem to be a reason to wait for parallel sync at
+    // this early step, but it is not important since parallel sync (SyncWAL)
+    // and need_log_sync are usually not used together.
+    while (logs_.front().IsSyncing()) {
+      log_sync_cv_.Wait();
+    }
+    for (auto& log : logs_) {
+      // This is just to prevent the logs from being synced by a parallel
+      // SyncWAL call. We will do the actual syncing later, after we write to
+      // the WAL.
+      // Note: there does not seem to be a reason to set this early before we
+      // actually write to the WAL
+      log.PrepareForSync();
+    }
+  } else {
+    log_context->need_log_sync = false;
+  }
+  log_context->writer = logs_.back().writer;
+  log_context->need_log_dir_sync =
+      log_context->need_log_dir_sync && !log_dir_synced_;
+  log_context->log_file_number_size = std::addressof(alive_log_files_.back());
+
+  return status;
+}
+
+Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
+                          WriteBatch* tmp_batch, WriteBatch** merged_batch,
+                          size_t* write_with_wal,
+                          WriteBatch** to_be_cached_state) {
+  assert(write_with_wal != nullptr);
+  assert(tmp_batch != nullptr);
+  assert(*to_be_cached_state == nullptr);
+  *write_with_wal = 0;
+  auto* leader = write_group.leader;
+  assert(!leader->disable_wal);  // Same holds for all in the batch group
+  if (write_group.size == 1 && !leader->CallbackFailed() &&
+      leader->batch->GetWalTerminationPoint().is_cleared()) {
+    // we simply write the first WriteBatch to WAL if the group only
+    // contains one batch, that batch should be written to the WAL,
+    // and the batch is not wanting to be truncated
+    *merged_batch = leader->batch;
+    if (WriteBatchInternal::IsLatestPersistentState(*merged_batch)) {
+      *to_be_cached_state = *merged_batch;
+    }
+    *write_with_wal = 1;
+  } else {
+    // WAL needs all of the batches flattened into a single batch.
+    // We could avoid copying here with an iov-like AddRecord
+    // interface
+    *merged_batch = tmp_batch;
+    for (auto writer : write_group) {
+      if (!writer->CallbackFailed()) {
+        Status s = WriteBatchInternal::Append(*merged_batch, writer->batch,
+                                              /*WAL_only*/ true);
+        if (!s.ok()) {
+          tmp_batch->Clear();
+          return s;
+        }
+        if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) {
+          // We only need to cache the last of such write batch
+          *to_be_cached_state = writer->batch;
+        }
+        (*write_with_wal)++;
+      }
+    }
+  }
+  // return merged_batch;
+  return Status::OK();
+}
+
+// When two_write_queues_ is disabled, this function is called from the only
+// write thread. Otherwise this must be called holding log_write_mutex_.
+IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
+                            log::Writer* log_writer, uint64_t* log_used,
+                            uint64_t* log_size,
+                            Env::IOPriority rate_limiter_priority,
+                            LogFileNumberSize& log_file_number_size) {
+  assert(log_size != nullptr);
+
+  Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
+  TEST_SYNC_POINT_CALLBACK("DBImpl::WriteToWAL:log_entry", &log_entry);
+  auto s = merged_batch.VerifyChecksum();
+  if (!s.ok()) {
+    return status_to_io_status(std::move(s));
+  }
+  *log_size = log_entry.size();
+  // When two_write_queues_ is enabled, WriteToWAL has to be protected from
+  // concurrent calls from the two queues anyway, and log_write_mutex_ is
+  // already held. Otherwise, if manual_wal_flush_ is enabled, we need to
+  // protect log_writer->AddRecord from possible concurrent calls via FlushWAL
+  // by the application.
+  const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+  // Due to performance concerns about missed branch prediction, we penalize
+  // the newer manual_wal_flush_ feature (with UNLIKELY) instead of the more
+  // common case when we do not need any locking.
+  if (UNLIKELY(needs_locking)) {
+    log_write_mutex_.Lock();
+  }
+  IOStatus io_s = log_writer->AddRecord(log_entry, rate_limiter_priority);
+
+  if (UNLIKELY(needs_locking)) {
+    log_write_mutex_.Unlock();
+  }
+  if (log_used != nullptr) {
+    *log_used = logfile_number_;
+  }
+  total_log_size_ += log_entry.size();
+  log_file_number_size.AddSize(*log_size);
+  log_empty_ = false;
+  return io_s;
+}
+
+IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
+                            log::Writer* log_writer, uint64_t* log_used,
+                            bool need_log_sync, bool need_log_dir_sync,
+                            SequenceNumber sequence,
+                            LogFileNumberSize& log_file_number_size) {
+  IOStatus io_s;
+  assert(!two_write_queues_);
+  assert(!write_group.leader->disable_wal);
+  // Same holds for all in the batch group
+  size_t write_with_wal = 0;
+  WriteBatch* to_be_cached_state = nullptr;
+  WriteBatch* merged_batch;
+  io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch_, &merged_batch,
+                                        &write_with_wal, &to_be_cached_state));
+  if (UNLIKELY(!io_s.ok())) {
+    return io_s;
+  }
+
+  if (merged_batch == write_group.leader->batch) {
+    write_group.leader->log_used = logfile_number_;
+  } else if (write_with_wal > 1) {
+    for (auto writer : write_group) {
+      writer->log_used = logfile_number_;
+    }
+  }
+
+  WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+  uint64_t log_size;
+  io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size,
+                    write_group.leader->rate_limiter_priority,
+                    log_file_number_size);
+  if (to_be_cached_state) {
+    cached_recoverable_state_ = *to_be_cached_state;
+    cached_recoverable_state_empty_ = false;
+  }
+
+  if (io_s.ok() && need_log_sync) {
+    StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS);
+    // It's safe to access logs_ with unlocked mutex_ here because:
+    //  - we've set getting_synced=true for all logs,
+    //    so other threads won't pop from logs_ while we're here,
+    //  - only writer thread can push to logs_, and we're in
+    //    writer thread, so no one will push to logs_,
+    //  - as long as other threads don't modify it, it's safe to read
+    //    from std::deque from multiple threads concurrently.
+    //
+    // Sync operation should work with locked log_write_mutex_, because:
+    // when DBOptions.manual_wal_flush_ is set,
+    // FlushWAL function will be invoked by another thread.
+    // Without log_write_mutex_ locked, the log file may suffer data
+    // corruption.
+
+    const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+    if (UNLIKELY(needs_locking)) {
+      log_write_mutex_.Lock();
+    }
+
+    for (auto& log : logs_) {
+      io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync);
+      if (!io_s.ok()) {
+        break;
+      }
+    }
+
+    if (UNLIKELY(needs_locking)) {
+      log_write_mutex_.Unlock();
+    }
+
+    if (io_s.ok() && need_log_dir_sync) {
+      // We only sync the WAL directory the first time WAL syncing is
+      // requested, so that in case users never turn on WAL sync,
+      // we can avoid the disk I/O in the write code path.
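+      // Note (illustrative, not part of this change): syncing the WAL
+      // directory makes the directory entry of a newly created WAL file
+      // durable; without it, a crash could lose the file's name even though
+      // the file's contents were already fsynced.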
+ io_s = directories_.GetWalDir()->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } + } + + if (merged_batch == &tmp_batch_) { + tmp_batch_.Clear(); + } + if (io_s.ok()) { + auto stats = default_cf_internal_stats_; + if (need_log_sync) { + stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1); + RecordTick(stats_, WAL_FILE_SYNCED); + } + stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size); + RecordTick(stats_, WAL_FILE_BYTES, log_size); + stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal); + RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); + } + return io_s; +} + +IOStatus DBImpl::ConcurrentWriteToWAL( + const WriteThread::WriteGroup& write_group, uint64_t* log_used, + SequenceNumber* last_sequence, size_t seq_inc) { + IOStatus io_s; + + assert(two_write_queues_ || immutable_db_options_.unordered_write); + assert(!write_group.leader->disable_wal); + // Same holds for all in the batch group + WriteBatch tmp_batch; + size_t write_with_wal = 0; + WriteBatch* to_be_cached_state = nullptr; + WriteBatch* merged_batch; + io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch, &merged_batch, + &write_with_wal, &to_be_cached_state)); + if (UNLIKELY(!io_s.ok())) { + return io_s; + } + + // We need to lock log_write_mutex_ since logs_ and alive_log_files might be + // pushed back concurrently + log_write_mutex_.Lock(); + if (merged_batch == write_group.leader->batch) { + write_group.leader->log_used = logfile_number_; + } else if (write_with_wal > 1) { + for (auto writer : write_group) { + writer->log_used = logfile_number_; + } + } + *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); + auto sequence = *last_sequence + 1; + WriteBatchInternal::SetSequence(merged_batch, sequence); + + log::Writer* log_writer = logs_.back().writer; + LogFileNumberSize& log_file_number_size = alive_log_files_.back(); + + assert(log_writer->get_log_number() == log_file_number_size.number); + + uint64_t log_size; + io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size, + write_group.leader->rate_limiter_priority, + log_file_number_size); + if (to_be_cached_state) { + cached_recoverable_state_ = *to_be_cached_state; + cached_recoverable_state_empty_ = false; + } + log_write_mutex_.Unlock(); + + if (io_s.ok()) { + const bool concurrent = true; + auto stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size, + concurrent); + RecordTick(stats_, WAL_FILE_BYTES, log_size); + stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal, + concurrent); + RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); + } + return io_s; +} + +Status DBImpl::WriteRecoverableState() { + mutex_.AssertHeld(); + if (!cached_recoverable_state_empty_) { + bool dont_care_bool; + SequenceNumber next_seq; + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + SequenceNumber seq; + if (two_write_queues_) { + seq = versions_->FetchAddLastAllocatedSequence(0); + } else { + seq = versions_->LastSequence(); + } + WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1); + auto status = WriteBatchInternal::InsertInto( + &cached_recoverable_state_, column_family_memtables_.get(), + &flush_scheduler_, &trim_history_scheduler_, true, + 0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */, + &next_seq, &dont_care_bool, seq_per_batch_); + auto last_seq = next_seq - 1; + if (two_write_queues_) { + 
versions_->FetchAddLastAllocatedSequence(last_seq - seq);
+      versions_->SetLastPublishedSequence(last_seq);
+    }
+    versions_->SetLastSequence(last_seq);
+    if (two_write_queues_) {
+      log_write_mutex_.Unlock();
+    }
+    if (status.ok() && recoverable_state_pre_release_callback_) {
+      const bool DISABLE_MEMTABLE = true;
+      for (uint64_t sub_batch_seq = seq + 1;
+           sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) {
+        uint64_t const no_log_num = 0;
+        // Unlock it since the callback might end up locking mutex. e.g.,
+        // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB
+        mutex_.Unlock();
+        status = recoverable_state_pre_release_callback_->Callback(
+            sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1);
+        mutex_.Lock();
+      }
+    }
+    if (status.ok()) {
+      cached_recoverable_state_.Clear();
+      cached_recoverable_state_empty_ = true;
+    }
+    return status;
+  }
+  return Status::OK();
+}
+
+void DBImpl::SelectColumnFamiliesForAtomicFlush(
+    autovector<ColumnFamilyData*>* selected_cfds,
+    const autovector<ColumnFamilyData*>& provided_candidate_cfds) {
+  mutex_.AssertHeld();
+  assert(selected_cfds);
+
+  autovector<ColumnFamilyData*> candidate_cfds;
+
+  // Generate candidate cfds if not provided
+  if (provided_candidate_cfds.empty()) {
+    for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
+      if (!cfd->IsDropped() && cfd->initialized()) {
+        cfd->Ref();
+        candidate_cfds.push_back(cfd);
+      }
+    }
+  } else {
+    candidate_cfds = provided_candidate_cfds;
+  }
+
+  for (ColumnFamilyData* cfd : candidate_cfds) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
+    if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+        !cached_recoverable_state_empty_.load()) {
+      selected_cfds->push_back(cfd);
+    }
+  }
+
+  // Unref the newly generated candidate cfds (when not provided) in
+  // `candidate_cfds`
+  if (provided_candidate_cfds.empty()) {
+    for (auto candidate_cfd : candidate_cfds) {
+      candidate_cfd->UnrefAndTryDelete();
+    }
+  }
+}
+
+// Assign sequence number for atomic flush.
+void DBImpl::AssignAtomicFlushSeq(
+    const autovector<ColumnFamilyData*>& cfds) {
+  assert(immutable_db_options_.atomic_flush);
+  auto seq = versions_->LastSequence();
+  for (auto cfd : cfds) {
+    cfd->imm()->AssignAtomicFlushSeq(seq);
+  }
+}
+
+Status DBImpl::SwitchWAL(WriteContext* write_context) {
+  mutex_.AssertHeld();
+  assert(write_context != nullptr);
+  Status status;
+
+  if (alive_log_files_.begin()->getting_flushed) {
+    return status;
+  }
+
+  auto oldest_alive_log = alive_log_files_.begin()->number;
+  bool flush_wont_release_oldest_log = false;
+  if (allow_2pc()) {
+    auto oldest_log_with_uncommitted_prep =
+        logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+
+    assert(oldest_log_with_uncommitted_prep == 0 ||
+           oldest_log_with_uncommitted_prep >= oldest_alive_log);
+    if (oldest_log_with_uncommitted_prep > 0 &&
+        oldest_log_with_uncommitted_prep == oldest_alive_log) {
+      if (unable_to_release_oldest_log_) {
+        // we already attempted to flush all column families dependent on
+        // the oldest alive log but the log still contained uncommitted
+        // transactions so there is still nothing that we can do.
+        return status;
+      } else {
+        ROCKS_LOG_WARN(
+            immutable_db_options_.info_log,
+            "Unable to release oldest log due to uncommitted transaction");
+        unable_to_release_oldest_log_ = true;
+        flush_wont_release_oldest_log = true;
+      }
+    }
+  }
+  if (!flush_wont_release_oldest_log) {
+    // we only mark this log as getting flushed if we have successfully
+    // flushed all data in this log. If this log contains outstanding prepared
+    // transactions then we cannot flush this log until those transactions are
+    // committed.
+    unable_to_release_oldest_log_ = false;
+    alive_log_files_.begin()->getting_flushed = true;
+  }
+
+  ROCKS_LOG_INFO(
+      immutable_db_options_.info_log,
+      "Flushing all column families with data in WAL number %" PRIu64
+      ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
+      oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize());
+  // no need to refcount because drop is happening in write thread, so can't
+  // happen while we're in the write thread
+  autovector<ColumnFamilyData*> cfds;
+  if (immutable_db_options_.atomic_flush) {
+    SelectColumnFamiliesForAtomicFlush(&cfds);
+  } else {
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (cfd->OldestLogToKeep() <= oldest_alive_log) {
+        cfds.push_back(cfd);
+      }
+    }
+    MaybeFlushStatsCF(&cfds);
+  }
+  WriteThread::Writer nonmem_w;
+  if (two_write_queues_) {
+    nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+  }
+
+  for (const auto cfd : cfds) {
+    cfd->Ref();
+    status = SwitchMemtable(cfd, write_context);
+    cfd->UnrefAndTryDelete();
+    if (!status.ok()) {
+      break;
+    }
+  }
+  if (two_write_queues_) {
+    nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+  }
+
+  if (status.ok()) {
+    if (immutable_db_options_.atomic_flush) {
+      AssignAtomicFlushSeq(cfds);
+    }
+    for (auto cfd : cfds) {
+      cfd->imm()->FlushRequested();
+      if (!immutable_db_options_.atomic_flush) {
+        FlushRequest flush_req;
+        GenerateFlushRequest({cfd}, FlushReason::kWalFull, &flush_req);
+        SchedulePendingFlush(flush_req);
+      }
+    }
+    if (immutable_db_options_.atomic_flush) {
+      FlushRequest flush_req;
+      GenerateFlushRequest(cfds, FlushReason::kWalFull, &flush_req);
+      SchedulePendingFlush(flush_req);
+    }
+    MaybeScheduleFlushOrCompaction();
+  }
+  return status;
+}
+
+Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
+  mutex_.AssertHeld();
+  assert(write_context != nullptr);
+  Status status;
+
+  // Before a new memtable is added in SwitchMemtable(),
+  // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+  // thread is writing to another DB with the same write buffer, they may also
+  // be flushed. We may end up flushing many more DBs than needed. It's
+  // suboptimal but still correct.
+  // no need to refcount because drop is happening in write thread, so can't
+  // happen while we're in the write thread
+  autovector<ColumnFamilyData*> cfds;
+  if (immutable_db_options_.atomic_flush) {
+    SelectColumnFamiliesForAtomicFlush(&cfds);
+  } else {
+    ColumnFamilyData* cfd_picked = nullptr;
+    SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber;
+
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (!cfd->mem()->IsEmpty() && !cfd->imm()->IsFlushPendingOrRunning()) {
+        // We only consider flush on CFs with bytes in the mutable memtable,
+        // and no immutable memtables for which flush has yet to finish. If
+        // we triggered flush on CFs already trying to flush, we would risk
+        // creating too many immutable memtables leading to write stalls.
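+        // Worked example (illustrative, not part of this change): if three
+        // flush-eligible CFs have mutable memtables created at sequences 100,
+        // 42, and 7, the CF with creation seq 7 is picked below, since
+        // flushing the oldest memtable frees up the most WAL data.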
+ uint64_t seq = cfd->mem()->GetCreationSeq(); + if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) { + cfd_picked = cfd; + seq_num_for_cf_picked = seq; + } + } + } + if (cfd_picked != nullptr) { + cfds.push_back(cfd_picked); + } + MaybeFlushStatsCF(&cfds); + } + if (!cfds.empty()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Flushing triggered to alleviate write buffer memory usage. Write " + "buffer is using %" ROCKSDB_PRIszt + " bytes out of a total of %" ROCKSDB_PRIszt ".", + write_buffer_manager_->memory_usage(), + write_buffer_manager_->buffer_size()); + } + + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + for (const auto cfd : cfds) { + if (cfd->mem()->IsEmpty()) { + continue; + } + cfd->Ref(); + status = SwitchMemtable(cfd, write_context); + cfd->UnrefAndTryDelete(); + if (!status.ok()) { + break; + } + } + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + + if (status.ok()) { + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(cfds); + } + for (const auto cfd : cfds) { + cfd->imm()->FlushRequested(); + if (!immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, FlushReason::kWriteBufferManager, + &flush_req); + SchedulePendingFlush(flush_req); + } + } + if (immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest(cfds, FlushReason::kWriteBufferManager, &flush_req); + SchedulePendingFlush(flush_req); + } + MaybeScheduleFlushOrCompaction(); + } + return status; +} + +uint64_t DBImpl::GetMaxTotalWalSize() const { + uint64_t max_total_wal_size = + max_total_wal_size_.load(std::memory_order_acquire); + if (max_total_wal_size > 0) { + return max_total_wal_size; + } + return 4 * max_total_in_memory_state_.load(std::memory_order_acquire); +} + +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the leader for write_thread +Status DBImpl::DelayWrite(uint64_t num_bytes, WriteThread& write_thread, + const WriteOptions& write_options) { + mutex_.AssertHeld(); + uint64_t time_delayed = 0; + bool delayed = false; + { + StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL, + Histograms::HISTOGRAM_ENUM_MAX, &time_delayed); + // To avoid parallel timed delays (bad throttling), only support them + // on the primary write queue. + uint64_t delay; + if (&write_thread == &write_thread_) { + delay = + write_controller_.GetDelay(immutable_db_options_.clock, num_bytes); + } else { + assert(num_bytes == 0); + delay = 0; + } + TEST_SYNC_POINT("DBImpl::DelayWrite:Start"); + if (delay > 0) { + if (write_options.no_slowdown) { + return Status::Incomplete("Write stall"); + } + TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep"); + + // Notify write_thread about the stall so it can setup a barrier and + // fail any pending writers with no_slowdown + write_thread.BeginWriteStall(); + mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone"); + // We will delay the write until we have slept for `delay` microseconds + // or we don't need a delay anymore. We check for cancellation every 1ms + // (slightly longer because WriteController minimum delay is 1ms, in + // case of sleep imprecision, rounding, etc.) 
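+      // Worked example (illustrative, not part of this change): for
+      // delay = 5000 microseconds the loop below sleeps in ~1ms slices
+      // (kDelayInterval = 1001), i.e. about five iterations, re-checking
+      // write_controller_.NeedsDelay() after each slice so that a cleared
+      // stall is noticed within roughly a millisecond.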
+      const uint64_t kDelayInterval = 1001;
+      uint64_t stall_end = sw.start_time() + delay;
+      while (write_controller_.NeedsDelay()) {
+        if (immutable_db_options_.clock->NowMicros() >= stall_end) {
+          // We already delayed this write `delay` microseconds
+          break;
+        }
+
+        delayed = true;
+        // Sleep for 0.001 seconds
+        immutable_db_options_.clock->SleepForMicroseconds(kDelayInterval);
+      }
+      mutex_.Lock();
+      write_thread.EndWriteStall();
+    }
+
+    // Don't wait if there's a background error, even if it's a soft error. We
+    // might wait here indefinitely as the background compaction may never
+    // finish successfully, resulting in the stall condition lasting
+    // indefinitely
+    while (error_handler_.GetBGError().ok() && write_controller_.IsStopped() &&
+           !shutting_down_.load(std::memory_order_relaxed)) {
+      if (write_options.no_slowdown) {
+        return Status::Incomplete("Write stall");
+      }
+      delayed = true;
+
+      // Notify write_thread about the stall so it can setup a barrier and
+      // fail any pending writers with no_slowdown
+      write_thread.BeginWriteStall();
+      if (&write_thread == &write_thread_) {
+        TEST_SYNC_POINT("DBImpl::DelayWrite:Wait");
+      } else {
+        TEST_SYNC_POINT("DBImpl::DelayWrite:NonmemWait");
+      }
+      bg_cv_.Wait();
+      TEST_SYNC_POINT_CALLBACK("DBImpl::DelayWrite:AfterWait", &mutex_);
+      write_thread.EndWriteStall();
+    }
+  }
+  assert(!delayed || !write_options.no_slowdown);
+  if (delayed) {
+    default_cf_internal_stats_->AddDBStats(
+        InternalStats::kIntStatsWriteStallMicros, time_delayed);
+    RecordTick(stats_, STALL_MICROS, time_delayed);
+  }
+
+  // If DB is not in read-only mode and write_controller is not stopping
+  // writes, we can ignore any background errors and allow the write to
+  // proceed
+  Status s;
+  if (write_controller_.IsStopped()) {
+    if (!shutting_down_.load(std::memory_order_relaxed)) {
+      // If writes are still stopped and db not shutdown, it means we bailed
+      // due to a background error
+      s = Status::Incomplete(error_handler_.GetBGError().ToString());
+    } else {
+      s = Status::ShutdownInProgress("stalled writes");
+    }
+  }
+  if (error_handler_.IsDBStopped()) {
+    s = error_handler_.GetBGError();
+  }
+  return s;
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+void DBImpl::WriteBufferManagerStallWrites() {
+  mutex_.AssertHeld();
+  // First block future writer threads who want to add themselves to the queue
+  // of WriteThread.
+  write_thread_.BeginWriteStall();
+  mutex_.Unlock();
+
+  // Change the state to State::Blocked.
+  static_cast<WBMStallInterface*>(wbm_stall_.get())
+      ->SetState(WBMStallInterface::State::BLOCKED);
+  // Then WriteBufferManager will add DB instance to its queue
+  // and block this thread by calling WBMStallInterface::Block().
+  write_buffer_manager_->BeginWriteStall(wbm_stall_.get());
+  wbm_stall_->Block();
+
+  mutex_.Lock();
+  // Stall has ended. Signal writer threads so that they can add
+  // themselves to the WriteThread queue for writes.
+  write_thread_.EndWriteStall();
+}
+
+Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+                                            WriteBatch* my_batch) {
+  assert(write_options.low_pri);
+  // This is called outside the DB mutex. Although it is safe to make the call,
+  // the consistency condition is not guaranteed to hold. It's OK to live with
+  // it in this case.
+  // If we need to speed up compaction, it means compaction has fallen behind
+  // and we start to throttle low-pri writes.
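+  // For illustration (not part of this change): a caller opts into this
+  // throttling with
+  //   WriteOptions wo;
+  //   wo.low_pri = true;
+  //   db->Put(wo, "key", "value");
+  // and is rate limited below only while compaction needs to catch up.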
+  if (write_controller_.NeedSpeedupCompaction()) {
+    if (allow_2pc() && (my_batch->HasCommit() || my_batch->HasRollback())) {
+      // For 2PC, we only rate limit prepare, not commit.
+      return Status::OK();
+    }
+    if (write_options.no_slowdown) {
+      return Status::Incomplete("Low priority write stall");
+    } else {
+      assert(my_batch != nullptr);
+      // Rate limit those writes. The reason that we don't completely wait
+      // is that in case the write is heavy, low pri writes may never have
+      // a chance to run. Now we guarantee we are still slowly making
+      // progress.
+      PERF_TIMER_GUARD(write_delay_time);
+      write_controller_.low_pri_rate_limiter()->Request(
+          my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */,
+          RateLimiter::OpType::kWrite);
+    }
+  }
+  return Status::OK();
+}
+
+void DBImpl::MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds) {
+  assert(cfds != nullptr);
+  if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) {
+    ColumnFamilyData* cfd_stats =
+        versions_->GetColumnFamilySet()->GetColumnFamily(
+            kPersistentStatsColumnFamilyName);
+    if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) {
+      for (ColumnFamilyData* cfd : *cfds) {
+        if (cfd == cfd_stats) {
+          // stats CF already included in cfds
+          return;
+        }
+      }
+      // force flush stats CF when its log number is less than all other CF's
+      // log numbers
+      bool force_flush_stats_cf = true;
+      for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+        if (loop_cfd == cfd_stats) {
+          continue;
+        }
+        if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+          force_flush_stats_cf = false;
+        }
+      }
+      if (force_flush_stats_cf) {
+        cfds->push_back(cfd_stats);
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "Force flushing stats CF with automated flush "
+                       "to avoid holding old logs");
+      }
+    }
+  }
+}
+
+Status DBImpl::TrimMemtableHistory(WriteContext* context) {
+  autovector<ColumnFamilyData*> cfds;
+  ColumnFamilyData* tmp_cfd;
+  while ((tmp_cfd = trim_history_scheduler_.TakeNextColumnFamily()) !=
+         nullptr) {
+    cfds.push_back(tmp_cfd);
+  }
+  for (auto& cfd : cfds) {
+    autovector<MemTable*> to_delete;
+    bool trimmed = cfd->imm()->TrimHistory(&context->memtables_to_free_,
+                                           cfd->mem()->MemoryAllocatedBytes());
+    if (trimmed) {
+      context->superversion_context.NewSuperVersion();
+      assert(context->superversion_context.new_superversion.get() != nullptr);
+      cfd->InstallSuperVersion(&context->superversion_context, &mutex_);
+    }
+
+    if (cfd->UnrefAndTryDelete()) {
+      cfd = nullptr;
+    }
+  }
+  return Status::OK();
+}
+
+Status DBImpl::ScheduleFlushes(WriteContext* context) {
+  autovector<ColumnFamilyData*> cfds;
+  if (immutable_db_options_.atomic_flush) {
+    SelectColumnFamiliesForAtomicFlush(&cfds);
+    for (auto cfd : cfds) {
+      cfd->Ref();
+    }
+    flush_scheduler_.Clear();
+  } else {
+    ColumnFamilyData* tmp_cfd;
+    while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+      cfds.push_back(tmp_cfd);
+    }
+    MaybeFlushStatsCF(&cfds);
+  }
+  Status status;
+  WriteThread::Writer nonmem_w;
+  if (two_write_queues_) {
+    nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+  }
+
+  for (auto& cfd : cfds) {
+    if (!cfd->mem()->IsEmpty()) {
+      status = SwitchMemtable(cfd, context);
+    }
+    if (cfd->UnrefAndTryDelete()) {
+      cfd = nullptr;
+    }
+    if (!status.ok()) {
+      break;
+    }
+  }
+
+  if (two_write_queues_) {
+    nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+  }
+
+  if (status.ok()) {
+    if (immutable_db_options_.atomic_flush) {
+      AssignAtomicFlushSeq(cfds);
+      FlushRequest flush_req;
+      GenerateFlushRequest(cfds, FlushReason::kWriteBufferFull, &flush_req);
+      SchedulePendingFlush(flush_req);
+    } else {
+      for (auto* cfd : cfds) {
+        FlushRequest flush_req;
+        GenerateFlushRequest({cfd}, FlushReason::kWriteBufferFull, &flush_req);
+        SchedulePendingFlush(flush_req);
+      }
+    }
+    MaybeScheduleFlushOrCompaction();
+  }
+  return status;
+}
+
+void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/,
+                                    const MemTableInfo& mem_table_info) {
+  if (immutable_db_options_.listeners.size() == 0U) {
+    return;
+  }
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+
+  mutex_.Unlock();
+  for (auto listener : immutable_db_options_.listeners) {
+    listener->OnMemTableSealed(mem_table_info);
+  }
+  mutex_.Lock();
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+// REQUIRES: this thread is currently at the front of the 2nd writer queue if
+// two_write_queues_ is true (This is to simplify the reasoning.)
+Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
+  mutex_.AssertHeld();
+  // TODO: plumb Env::IOActivity
+  const ReadOptions read_options;
+  log::Writer* new_log = nullptr;
+  MemTable* new_mem = nullptr;
+  IOStatus io_s;
+
+  // Recoverable state is persisted in WAL. After memtable switch, WAL might
+  // be deleted, so we write the state to memtable to be persisted as well.
+  Status s = WriteRecoverableState();
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Attempt to switch to a new memtable and trigger flush of old.
+  // Do this without holding the dbmutex lock.
+  assert(versions_->prev_log_number() == 0);
+  if (two_write_queues_) {
+    log_write_mutex_.Lock();
+  }
+  bool creating_new_log = !log_empty_;
+  if (two_write_queues_) {
+    log_write_mutex_.Unlock();
+  }
+  uint64_t recycle_log_number = 0;
+  if (creating_new_log && immutable_db_options_.recycle_log_file_num &&
+      !log_recycle_files_.empty()) {
+    recycle_log_number = log_recycle_files_.front();
+  }
+  uint64_t new_log_number =
+      creating_new_log ? versions_->NewFileNumber() : logfile_number_;
+  const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+
+  // Set memtable_info for memtable sealed callback
+  MemTableInfo memtable_info;
+  memtable_info.cf_name = cfd->GetName();
+  memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber();
+  memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
+  memtable_info.num_entries = cfd->mem()->num_entries();
+  memtable_info.num_deletes = cfd->mem()->num_deletes();
+  // Log this later after lock release. It may be outdated, e.g., if background
+  // flush happens before logging, but that should be ok.
+  int num_imm_unflushed = cfd->imm()->NumNotFlushed();
+  const auto preallocate_block_size =
+      GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size);
+  mutex_.Unlock();
+  if (creating_new_log) {
+    // TODO: Write buffer size passed in should be max of all CF's instead
+    // of mutable_cf_options.write_buffer_size.
+    io_s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size,
+                     &new_log);
+    if (s.ok()) {
+      s = io_s;
+    }
+  }
+  if (s.ok()) {
+    SequenceNumber seq = versions_->LastSequence();
+    new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
+    context->superversion_context.NewSuperVersion();
+  }
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "[%s] New memtable created with log file: #%" PRIu64
+                 ". Immutable memtables: %d.\n",
+                 cfd->GetName().c_str(), new_log_number, num_imm_unflushed);
+  // There should be no concurrent write as the thread is at the front of
+  // writer queue
+  cfd->mem()->ConstructFragmentedRangeTombstones();
+
+  mutex_.Lock();
+  if (recycle_log_number != 0) {
+    // Since renaming the file is done outside DB mutex, we need to ensure
+    // concurrent full purges don't delete the file while we're recycling it.
+    // To achieve that we hold the old log number in the recyclable list until
+    // after it has been renamed.
+    assert(log_recycle_files_.front() == recycle_log_number);
+    log_recycle_files_.pop_front();
+  }
+  if (s.ok() && creating_new_log) {
+    InstrumentedMutexLock l(&log_write_mutex_);
+    assert(new_log != nullptr);
+    if (!logs_.empty()) {
+      // Always flush the buffer of the last log before switching to a new one
+      log::Writer* cur_log_writer = logs_.back().writer;
+      if (error_handler_.IsRecoveryInProgress()) {
+        // In recovery path, we force another try of writing WAL buffer.
+        cur_log_writer->file()->reset_seen_error();
+      }
+      io_s = cur_log_writer->WriteBuffer();
+      if (s.ok()) {
+        s = io_s;
+      }
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                       "[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64
+                       " WAL file\n",
+                       cfd->GetName().c_str(), cur_log_writer->get_log_number(),
+                       new_log_number);
+      }
+    }
+    if (s.ok()) {
+      logfile_number_ = new_log_number;
+      log_empty_ = true;
+      log_dir_synced_ = false;
+      logs_.emplace_back(logfile_number_, new_log);
+      alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+    }
+  }
+
+  if (!s.ok()) {
+    // how do we fail if we're not creating new log?
+    assert(creating_new_log);
+    delete new_mem;
+    delete new_log;
+    context->superversion_context.new_superversion.reset();
+    // We may have lost data from the WritableFileBuffer in-memory buffer for
+    // the current log, so treat it as a fatal error and set bg_error
+    if (!io_s.ok()) {
+      error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable);
+    } else {
+      error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
+    }
+    // Read back bg_error in order to get the right severity
+    s = error_handler_.GetBGError();
+    return s;
+  }
+
+  bool empty_cf_updated = false;
+  if (immutable_db_options_.track_and_verify_wals_in_manifest &&
+      !immutable_db_options_.allow_2pc && creating_new_log) {
+    // In non-2pc mode, WALs become obsolete if they do not contain unflushed
+    // data. Updating the empty CF's log number might cause some WALs to become
+    // obsolete. So we should track the WAL obsoletion event before actually
+    // updating the empty CF's log number.
+    uint64_t min_wal_number_to_keep =
+        versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_);
+    if (min_wal_number_to_keep >
+        versions_->GetWalSet().GetMinWalNumberToKeep()) {
+      // Get a snapshot of the empty column families. LogAndApply may release
+      // and reacquire the db mutex; during that period a column family may
+      // become empty (e.g. its flush succeeds), which would affect the
+      // computed min_log_number_to_keep, so we take a snapshot for
+      // consistency of the column family data status. If a column family
+      // becomes non-empty afterwards, its active log should still be the
+      // newly created log, so the min_log_number_to_keep is not affected.
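+      // Illustrative scenario (assumed, not from upstream docs): with CFs
+      // {a, b} where only b is empty, advancing b's log number to the new
+      // WAL can raise min_wal_number_to_keep; the obsoletion below is
+      // recorded in the manifest first, then the numbers are bumped.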
+      autovector<ColumnFamilyData*> empty_cfs;
+      for (auto cf : *versions_->GetColumnFamilySet()) {
+        if (cf->IsEmpty()) {
+          empty_cfs.push_back(cf);
+        }
+      }
+
+      VersionEdit wal_deletion;
+      wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
+      s = versions_->LogAndApplyToDefaultColumnFamily(
+          read_options, &wal_deletion, &mutex_, directories_.GetDbDir());
+      if (!s.ok() && versions_->io_status().IsIOError()) {
+        s = error_handler_.SetBGError(versions_->io_status(),
+                                      BackgroundErrorReason::kManifestWrite);
+      }
+      if (!s.ok()) {
+        return s;
+      }
+
+      for (auto cf : empty_cfs) {
+        if (cf->IsEmpty()) {
+          cf->SetLogNumber(logfile_number_);
+          // MEMPURGE: No need to change this, because new adds
+          // should still receive new sequence numbers.
+          cf->mem()->SetCreationSeq(versions_->LastSequence());
+        }  // cf may become non-empty.
+      }
+      empty_cf_updated = true;
+    }
+  }
+  if (!empty_cf_updated) {
+    for (auto cf : *versions_->GetColumnFamilySet()) {
+      // all this is just optimization to delete logs that
+      // are no longer needed -- if CF is empty, that means it
+      // doesn't need that particular log to stay alive, so we just
+      // advance the log number. no need to persist this in the manifest
+      if (cf->IsEmpty()) {
+        if (creating_new_log) {
+          cf->SetLogNumber(logfile_number_);
+        }
+        cf->mem()->SetCreationSeq(versions_->LastSequence());
+      }
+    }
+  }
+
+  cfd->mem()->SetNextLogNumber(logfile_number_);
+  assert(new_mem != nullptr);
+  cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
+  new_mem->Ref();
+  cfd->SetMemtable(new_mem);
+  InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
+                                     mutable_cf_options);
+
+  // Notify client that memtable is sealed, now that we have successfully
+  // installed a new memtable
+  NotifyOnMemTableSealed(cfd, memtable_info);
+  // It is possible that we got here without checking the value of io_s, but
+  // that is okay. If we did, it most likely means that s was already an error.
+  // In any case, ignore any unchecked error for io_s here.
+  io_s.PermitUncheckedError();
+  return s;
+}
+
+size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const {
+  mutex_.AssertHeld();
+  size_t bsize =
+      static_cast<size_t>(write_buffer_size / 10 + write_buffer_size);
+  // Some users might set very high write_buffer_size and rely on
+  // max_total_wal_size or other parameters to control the WAL size.
+  if (mutable_db_options_.max_total_wal_size > 0) {
+    bsize = std::min(
+        bsize, static_cast<size_t>(mutable_db_options_.max_total_wal_size));
+  }
+  if (immutable_db_options_.db_write_buffer_size > 0) {
+    bsize = std::min(bsize, immutable_db_options_.db_write_buffer_size);
+  }
+  if (immutable_db_options_.write_buffer_manager &&
+      immutable_db_options_.write_buffer_manager->enabled()) {
+    bsize = std::min(
+        bsize, immutable_db_options_.write_buffer_manager->buffer_size());
+  }
+
+  return bsize;
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+               const Slice& key, const Slice& value) {
+  // Pre-allocate size of write batch conservatively.
+  // 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
+  // and we allocate 11 extra bytes for key length, as well as value length.
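+  // That is, 8 (header) + 4 (count) + 1 (type tag) + 11 (room for the
+  // varint key and value lengths) = 24 bytes of fixed overhead on top of
+  // key.size() + value.size() below.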
+  WriteBatch batch(key.size() + value.size() + 24, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+  Status s = batch.Put(column_family, key, value);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+               const Slice& key, const Slice& ts, const Slice& value) {
+  ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+  assert(default_cf);
+  const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+  assert(default_cf_ucmp);
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key,
+                   default_cf_ucmp->timestamp_size());
+  Status s = batch.Put(column_family, key, ts, value);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::PutEntity(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const WideColumns& columns) {
+  const ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+  assert(default_cf);
+
+  const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+  assert(default_cf_ucmp);
+
+  WriteBatch batch(/* reserved_bytes */ 0, /* max_bytes */ 0,
+                   options.protection_bytes_per_key,
+                   default_cf_ucmp->timestamp_size());
+
+  const Status s = batch.PutEntity(column_family, key, columns);
+  if (!s.ok()) {
+    return s;
+  }
+
+  return Write(options, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                  const Slice& key) {
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+  Status s = batch.Delete(column_family, key);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                  const Slice& key, const Slice& ts) {
+  ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+  assert(default_cf);
+  const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+  assert(default_cf_ucmp);
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key,
+                   default_cf_ucmp->timestamp_size());
+  Status s = batch.Delete(column_family, key, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+                        ColumnFamilyHandle* column_family, const Slice& key) {
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+  Status s = batch.SingleDelete(column_family, key);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+                        ColumnFamilyHandle* column_family, const Slice& key,
+                        const Slice& ts) {
+  ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+  assert(default_cf);
+  const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+  assert(default_cf_ucmp);
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key,
+                   default_cf_ucmp->timestamp_size());
+  Status s = batch.SingleDelete(column_family, key, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+                       ColumnFamilyHandle* column_family,
+                       const Slice& begin_key, const Slice& end_key) {
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+  Status s = batch.DeleteRange(column_family, begin_key, end_key);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+                       ColumnFamilyHandle* column_family,
+                       const Slice& begin_key, const Slice& end_key,
+                       const Slice& ts) {
+  ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+  assert(default_cf);
+  const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+  assert(default_cf_ucmp);
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key,
+                   default_cf_ucmp->timestamp_size());
+  Status s = batch.DeleteRange(column_family, begin_key, end_key, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                 const Slice& key, const Slice& value) {
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+  Status s = batch.Merge(column_family, key, value);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                 const Slice& key, const Slice& ts, const Slice& value) {
+  ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+  assert(default_cf);
+  const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+  assert(default_cf_ucmp);
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key,
+                   default_cf_ucmp->timestamp_size());
+  Status s = batch.Merge(column_family, key, ts, value);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_info_dumper.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_info_dumper.cc
new file mode 100644
index 0000000000..be8d5bee1c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_info_dumper.cc
@@ -0,0 +1,147 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_info_dumper.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "file/filename.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+                       const std::string& dbname,
+                       const std::string& session_id) {
+  if (options.info_log == nullptr) {
+    return;
+  }
+
+  auto* env = options.env;
+  uint64_t number = 0;
+  FileType type = kInfoLogFile;
+
+  std::vector<std::string> files;
+  uint64_t file_num = 0;
+  uint64_t file_size;
+  std::string file_info, wal_info;
+
+  Header(options.info_log, "DB SUMMARY\n");
+  Header(options.info_log, "DB Session ID: %s\n", session_id.c_str());
+
+  Status s;
+  // Get files in dbname dir
+  s = env->GetChildren(dbname, &files);
+  if (!s.ok()) {
+    Error(options.info_log, "Error when reading %s dir %s\n", dbname.c_str(),
+          s.ToString().c_str());
+  }
+  std::sort(files.begin(), files.end());
+  for (const std::string& file : files) {
+    if (!ParseFileName(file, &number, &type)) {
+      continue;
+    }
+    switch (type) {
+      case kCurrentFile:
+        Header(options.info_log, "CURRENT file: %s\n", file.c_str());
+        break;
+      case kIdentityFile:
+        Header(options.info_log, "IDENTITY file: %s\n", file.c_str());
+        break;
+      case kDescriptorFile:
+        s = env->GetFileSize(dbname + "/" + file, &file_size);
+        if (s.ok()) {
+          Header(options.info_log,
+                 "MANIFEST file: %s size: %" PRIu64 " Bytes\n", file.c_str(),
+                 file_size);
+        } else {
+          Error(options.info_log,
+                "Error when reading MANIFEST file: %s/%s %s\n", dbname.c_str(),
+                file.c_str(), s.ToString().c_str());
+        }
+        break;
+      case kWalFile:
+        s = env->GetFileSize(dbname + "/" + file, &file_size);
+        if (s.ok()) {
+          wal_info.append(file)
+              .append(" size: ")
+              .append(std::to_string(file_size))
+              .append(" ; ");
+        } else {
+          Error(options.info_log, "Error when reading LOG file: %s/%s %s\n",
+                dbname.c_str(), file.c_str(), s.ToString().c_str());
+        }
+        break;
+      case kTableFile:
+        if (++file_num < 10) {
+          file_info.append(file).append(" ");
+        }
+        break;
+      default:
+        break;
+    }
+  }
+
+  // Get sst files in db_path dir
+  for (auto& db_path : options.db_paths) {
+    if (dbname.compare(db_path.path) != 0) {
+      s = env->GetChildren(db_path.path, &files);
+      if (!s.ok()) {
+        Error(options.info_log, "Error when reading %s dir %s\n",
+              db_path.path.c_str(), s.ToString().c_str());
+        continue;
+      }
+      std::sort(files.begin(), files.end());
+      for (const std::string& file : files) {
+        if (ParseFileName(file, &number, &type)) {
+          if (type == kTableFile && ++file_num < 10) {
+            file_info.append(file).append(" ");
+          }
+        }
+      }
+    }
+    Header(options.info_log,
+           "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n",
+           db_path.path.c_str(), file_num, file_info.c_str());
+    file_num = 0;
+    file_info.clear();
+  }
+
+  // Get wal file in wal_dir
+  const auto& wal_dir = options.GetWalDir(dbname);
+  if (!options.IsWalDirSameAsDBPath(dbname)) {
+    s = env->GetChildren(wal_dir, &files);
+    if (!s.ok()) {
+      Error(options.info_log, "Error when reading %s dir %s\n", wal_dir.c_str(),
+            s.ToString().c_str());
+      return;
+    }
+    wal_info.clear();
+    for (const std::string& file : files) {
+      if (ParseFileName(file, &number, &type)) {
+        if (type == kWalFile) {
+          s = env->GetFileSize(wal_dir + "/" + file, &file_size);
+          if (s.ok()) {
+            wal_info.append(file)
+                .append(" size: ")
+                .append(std::to_string(file_size))
+                .append(" ; ");
+          } else {
+            Error(options.info_log, "Error when reading LOG file %s/%s %s\n",
+                  wal_dir.c_str(), file.c_str(), s.ToString().c_str());
+          }
+        }
+      }
+    }
+  }
+  Header(options.info_log, "Write Ahead Log file in %s: %s\n", wal_dir.c_str(),
+         wal_info.c_str());
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_info_dumper.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_info_dumper.h
new file mode 100644
index 0000000000..f518e840fd
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_info_dumper.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+
+#include "options/db_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+                       const std::string& dbname,
+                       const std::string& session_id = "");
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_iter.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_iter.cc
new file mode 100644
index 0000000000..3d980c8786
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_iter.cc
@@ -0,0 +1,1708 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+
+#include <iostream>
+#include <limits>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/wide/wide_column_serialization.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "memory/arena.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/system_clock.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "trace_replay/trace_replay.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+DBIter::DBIter(Env* _env, const ReadOptions& read_options,
+               const ImmutableOptions& ioptions,
+               const MutableCFOptions& mutable_cf_options,
+               const Comparator* cmp, InternalIterator* iter,
+               const Version* version, SequenceNumber s, bool arena_mode,
+               uint64_t max_sequential_skip_in_iterations,
+               ReadCallback* read_callback, DBImpl* db_impl,
+               ColumnFamilyData* cfd, bool expose_blob_index)
+    : prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+      env_(_env),
+      clock_(ioptions.clock),
+      logger_(ioptions.logger),
+      user_comparator_(cmp),
+      merge_operator_(ioptions.merge_operator.get()),
+      iter_(iter),
+      version_(version),
+      read_callback_(read_callback),
+      sequence_(s),
+      statistics_(ioptions.stats),
+      max_skip_(max_sequential_skip_in_iterations),
+      max_skippable_internal_keys_(read_options.max_skippable_internal_keys),
+      num_internal_keys_skipped_(0),
+      iterate_lower_bound_(read_options.iterate_lower_bound),
+      iterate_upper_bound_(read_options.iterate_upper_bound),
+      direction_(kForward),
+      valid_(false),
+      current_entry_is_merged_(false),
+      is_key_seqnum_zero_(false),
+      prefix_same_as_start_(mutable_cf_options.prefix_extractor
+                                ? read_options.prefix_same_as_start
+                                : false),
+      pin_thru_lifetime_(read_options.pin_data),
+      expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
+                                     read_options.total_order_seek ||
+                                     read_options.auto_prefix_mode),
+      read_tier_(read_options.read_tier),
+      fill_cache_(read_options.fill_cache),
+      verify_checksums_(read_options.verify_checksums),
+      expose_blob_index_(expose_blob_index),
+      is_blob_(false),
+      arena_mode_(arena_mode),
+      db_impl_(db_impl),
+      cfd_(cfd),
+      timestamp_ub_(read_options.timestamp),
+      timestamp_lb_(read_options.iter_start_ts),
+      timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) {
+  RecordTick(statistics_, NO_ITERATOR_CREATED);
+  if (pin_thru_lifetime_) {
+    pinned_iters_mgr_.StartPinning();
+  }
+  if (iter_.iter()) {
+    iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
+  }
+  status_.PermitUncheckedError();
+  assert(timestamp_size_ ==
+         user_comparator_.user_comparator()->timestamp_size());
+}
+
+Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
+  if (prop == nullptr) {
+    return Status::InvalidArgument("prop is nullptr");
+  }
+  if (prop_name == "rocksdb.iterator.super-version-number") {
+    // First try to pass the value returned from inner iterator.
+    return iter_.iter()->GetProperty(prop_name, prop);
+  } else if (prop_name == "rocksdb.iterator.is-key-pinned") {
+    if (valid_) {
+      *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0";
+    } else {
+      *prop = "Iterator is not valid.";
+    }
+    return Status::OK();
+  } else if (prop_name == "rocksdb.iterator.internal-key") {
+    *prop = saved_key_.GetUserKey().ToString();
+    return Status::OK();
+  }
+  return Status::InvalidArgument("Unidentified property.");
+}
+
+bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+  Status s = ParseInternalKey(iter_.key(), ikey, false /* log_err_key */);
+  if (!s.ok()) {
+    status_ = Status::Corruption("In DBIter: ", s.getState());
+    valid_ = false;
+    ROCKS_LOG_ERROR(logger_, "In DBIter: %s", status_.getState());
+    return false;
+  } else {
+    return true;
+  }
+}
+
+void DBIter::Next() {
+  assert(valid_);
+  assert(status_.ok());
+
+  PERF_COUNTER_ADD(iter_next_count, 1);
+  PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_);
+  // Release temporarily pinned blocks from last operation
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  local_stats_.skip_count_ += num_internal_keys_skipped_;
+  local_stats_.skip_count_--;
+  num_internal_keys_skipped_ = 0;
+  bool ok = true;
+  if (direction_ == kReverse) {
+    is_key_seqnum_zero_ = false;
+    if (!ReverseToForward()) {
+      ok = false;
+    }
+  } else if (!current_entry_is_merged_) {
+    // If the current value is not a merge, the iter position is the
+    // current key, which is already returned. We can safely issue a
+    // Next() without checking the current key.
+    // If the current key is a merge, very likely iter already points
+    // to the next internal position.
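+    // Illustrative: after yielding a merged result for user key k, iter_
+    // already rests on the first entry past k's merge operands, so calling
+    // Next() here as well could skip an entire user key.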
+ assert(iter_.Valid()); + iter_.Next(); + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } + + local_stats_.next_count_++; + if (ok && iter_.Valid()) { + ClearSavedValue(); + + if (prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + const Slice prefix = prefix_.GetUserKey(); + FindNextUserEntry(true /* skipping the current user key */, &prefix); + } else { + FindNextUserEntry(true /* skipping the current user key */, nullptr); + } + } else { + is_key_seqnum_zero_ = false; + valid_ = false; + } + if (statistics_ != nullptr && valid_) { + local_stats_.next_found_count_++; + local_stats_.bytes_read_ += (key().size() + value().size()); + } +} + +bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, + const Slice& blob_index) { + assert(!is_blob_); + assert(blob_value_.empty()); + + if (expose_blob_index_) { // Stacked BlobDB implementation + is_blob_ = true; + return true; + } + + if (!version_) { + status_ = Status::Corruption("Encountered unexpected blob index."); + valid_ = false; + return false; + } + + // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to + // avoid having to copy options back and forth. + // TODO: plumb Env::IOActivity + ReadOptions read_options; + read_options.read_tier = read_tier_; + read_options.fill_cache = fill_cache_; + read_options.verify_checksums = verify_checksums_; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr uint64_t* bytes_read = nullptr; + + const Status s = version_->GetBlob(read_options, user_key, blob_index, + prefetch_buffer, &blob_value_, bytes_read); + + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + is_blob_ = true; + return true; +} + +bool DBIter::SetValueAndColumnsFromEntity(Slice slice) { + assert(value_.empty()); + assert(wide_columns_.empty()); + + const Status s = WideColumnSerialization::Deserialize(slice, wide_columns_); + + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + if (!wide_columns_.empty() && + wide_columns_[0].name() == kDefaultWideColumnName) { + value_ = wide_columns_[0].value(); + } + + return true; +} + +// PRE: saved_key_ has the current user key if skipping_saved_key +// POST: saved_key_ should have the next user key if valid_, +// if the current entry is a result of merge +// current_entry_is_merged_ => true +// saved_value_ => the merged value +// +// NOTE: In between, saved_key_ can point to a user key that has +// a delete marker or a sequence number higher than sequence_ +// saved_key_ MUST have a proper user_key before calling this function +// +// The prefix parameter, if not null, indicates that we need to iterate +// within the prefix, and the iterator needs to be made invalid, if no +// more entry for the prefix can be found. +bool DBIter::FindNextUserEntry(bool skipping_saved_key, const Slice* prefix) { + PERF_TIMER_GUARD(find_next_user_entry_time); + return FindNextUserEntryInternal(skipping_saved_key, prefix); +} + +// Actual implementation of DBIter::FindNextUserEntry() +bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, + const Slice* prefix) { + // Loop until we hit an acceptable entry to yield + assert(iter_.Valid()); + assert(status_.ok()); + assert(direction_ == kForward); + current_entry_is_merged_ = false; + + // How many times in a row we have skipped an entry with user key less than + // or equal to saved_key_. We could skip these entries either because + // sequence numbers were too high or because skipping_saved_key = true. 
+ // What saved_key_ contains throughout this method: + // - if skipping_saved_key : saved_key_ contains the key that we need + // to skip, and we haven't seen any keys greater + // than that, + // - if num_skipped > 0 : saved_key_ contains the key that we have skipped + // num_skipped times, and we haven't seen any keys + // greater than that, + // - none of the above : saved_key_ can contain anything, it doesn't + // matter. + uint64_t num_skipped = 0; + // For write unprepared, the target sequence number in reseek could be larger + // than the snapshot, and thus needs to be skipped again. This could result in + // an infinite loop of reseeks. To avoid that, we limit the number of reseeks + // to one. + bool reseek_done = false; + + do { + // Will update is_key_seqnum_zero_ as soon as we parsed the current key + // but we need to save the previous value to be used in the loop. + bool is_prev_key_seqnum_zero = is_key_seqnum_zero_; + if (!ParseKey(&ikey_)) { + is_key_seqnum_zero_ = false; + return false; + } + Slice user_key_without_ts = + StripTimestampFromUserKey(ikey_.user_key, timestamp_size_); + + is_key_seqnum_zero_ = (ikey_.sequence == 0); + + assert(iterate_upper_bound_ == nullptr || + iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound || + user_comparator_.CompareWithoutTimestamp( + user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_, + /*b_has_ts=*/false) < 0); + if (iterate_upper_bound_ != nullptr && + iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound && + user_comparator_.CompareWithoutTimestamp( + user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_, + /*b_has_ts=*/false) >= 0) { + break; + } + + assert(prefix == nullptr || prefix_extractor_ != nullptr); + if (prefix != nullptr && + prefix_extractor_->Transform(user_key_without_ts).compare(*prefix) != + 0) { + assert(prefix_same_as_start_); + break; + } + + if (TooManyInternalKeysSkipped()) { + return false; + } + + assert(ikey_.user_key.size() >= timestamp_size_); + Slice ts = timestamp_size_ > 0 ? ExtractTimestampFromUserKey( + ikey_.user_key, timestamp_size_) + : Slice(); + bool more_recent = false; + if (IsVisible(ikey_.sequence, ts, &more_recent)) { + // If the previous entry is of seqnum 0, the current entry will not + // possibly be skipped. This condition can potentially be relaxed to + // prev_key.seq <= ikey_.sequence. We are cautious because it will be more + // prone to bugs causing the same user key with the same sequence number. + // Note that with current timestamp implementation, the same user key can + // have different timestamps and zero sequence number on the bottommost + // level. This may change in the future. + if ((!is_prev_key_seqnum_zero || timestamp_size_ > 0) && + skipping_saved_key && + CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) { + num_skipped++; // skip this entry + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } else { + assert(!skipping_saved_key || + CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0); + num_skipped = 0; + reseek_done = false; + switch (ikey_.type) { + case kTypeDeletion: + case kTypeDeletionWithTimestamp: + case kTypeSingleDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. 
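+            // Illustrative stream, newest first: (k, seq=7, kTypeDeletion)
+            // (k, seq=5, kTypeValue) -- once the deletion is chosen, the
+            // older value at seq=5 must be skipped too, which is what
+            // skipping_saved_key arranges below.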
+            if (timestamp_lb_) {
+              saved_key_.SetInternalKey(ikey_);
+              valid_ = true;
+              return true;
+            } else {
+              saved_key_.SetUserKey(
+                  ikey_.user_key, !pin_thru_lifetime_ ||
+                                      !iter_.iter()->IsKeyPinned() /* copy */);
+              skipping_saved_key = true;
+              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+            }
+            break;
+          case kTypeValue:
+          case kTypeBlobIndex:
+          case kTypeWideColumnEntity:
+            if (!iter_.PrepareValue()) {
+              assert(!iter_.status().ok());
+              valid_ = false;
+              return false;
+            }
+            if (timestamp_lb_) {
+              saved_key_.SetInternalKey(ikey_);
+            } else {
+              saved_key_.SetUserKey(
+                  ikey_.user_key, !pin_thru_lifetime_ ||
+                                      !iter_.iter()->IsKeyPinned() /* copy */);
+            }
+
+            if (ikey_.type == kTypeBlobIndex) {
+              if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) {
+                return false;
+              }
+
+              SetValueAndColumnsFromPlain(expose_blob_index_ ? iter_.value()
+                                                             : blob_value_);
+            } else if (ikey_.type == kTypeWideColumnEntity) {
+              if (!SetValueAndColumnsFromEntity(iter_.value())) {
+                return false;
+              }
+            } else {
+              assert(ikey_.type == kTypeValue);
+              SetValueAndColumnsFromPlain(iter_.value());
+            }
+
+            valid_ = true;
+            return true;
+            break;
+          case kTypeMerge:
+            if (!iter_.PrepareValue()) {
+              assert(!iter_.status().ok());
+              valid_ = false;
+              return false;
+            }
+            saved_key_.SetUserKey(
+                ikey_.user_key,
+                !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
+            // By now, we are sure the current ikey is going to yield a value
+            current_entry_is_merged_ = true;
+            valid_ = true;
+            return MergeValuesNewToOld();  // Go to a different state machine
+            break;
+          default:
+            valid_ = false;
+            status_ = Status::Corruption(
+                "Unknown value type: " +
+                std::to_string(static_cast<unsigned char>(ikey_.type)));
+            return false;
+        }
+      }
+    } else {
+      if (more_recent) {
+        PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+      }
+
+      // This key was inserted after our snapshot was taken or skipped by
+      // timestamp range. If this happens too many times in a row for the same
+      // user key, we want to seek to the target sequence number.
+      int cmp = user_comparator_.CompareWithoutTimestamp(
+          ikey_.user_key, saved_key_.GetUserKey());
+      if (cmp == 0 || (skipping_saved_key && cmp < 0)) {
+        num_skipped++;
+      } else {
+        saved_key_.SetUserKey(
+            ikey_.user_key,
+            !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+        skipping_saved_key = false;
+        num_skipped = 0;
+        reseek_done = false;
+      }
+    }
+
+    // If we have sequentially iterated via numerous equal keys, then it's
+    // better to seek so that we can avoid too many key comparisons.
+    //
+    // To avoid infinite loops, do not reseek if we have already attempted to
+    // reseek previously.
+    //
+    // TODO(lth): If we reseek to sequence number greater than ikey_.sequence,
+    // then it does not make sense to reseek as we would actually land further
+    // away from the desired key. There is opportunity for optimization here.
+    if (num_skipped > max_skip_ && !reseek_done) {
+      is_key_seqnum_zero_ = false;
+      num_skipped = 0;
+      reseek_done = true;
+      std::string last_key;
+      if (skipping_saved_key) {
+        // We're looking for the next user-key but all we see are the same
+        // user-key with decreasing sequence numbers. Fast forward to
+        // sequence number 0 and type deletion (the smallest type).
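+        // Illustrative: internal keys sort by user key ascending, then
+        // sequence descending, so (saved_key_, seq=0, kTypeDeletion) sorts
+        // after every other entry of saved_key_ and the Seek() below lands
+        // at (or just past) the end of that key's entries.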
+ if (timestamp_size_ == 0) { + AppendInternalKey( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion)); + } else { + const std::string kTsMin(timestamp_size_, '\0'); + AppendInternalKeyWithDifferentTimestamp( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion), + kTsMin); + } + // Don't set skipping_saved_key = false because we may still see more + // user-keys equal to saved_key_. + } else { + // We saw multiple entries with this user key and sequence numbers + // higher than sequence_. Fast forward to sequence_. + // Note that this only covers a case when a higher key was overwritten + // many times since our snapshot was taken, not the case when a lot of + // different keys were inserted after our snapshot was taken. + if (timestamp_size_ == 0) { + AppendInternalKey( + &last_key, ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek)); + } else { + AppendInternalKeyWithDifferentTimestamp( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek), + *timestamp_ub_); + } + } + iter_.Seek(last_key); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + } else { + iter_.Next(); + } + } while (iter_.Valid()); + + valid_ = false; + return iter_.status().ok(); +} + +// Merge values of the same user key starting from the current iter_ position +// Scan from the newer entries to older entries. +// PRE: iter_.key() points to the first merge type entry +// saved_key_ stores the user key +// iter_.PrepareValue() has been called +// POST: saved_value_ has the merged value for the user key +// iter_ points to the next entry (or invalid) +bool DBIter::MergeValuesNewToOld() { + if (!merge_operator_) { + ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null."); + status_ = Status::InvalidArgument("merge_operator_ must be set."); + valid_ = false; + return false; + } + + // Temporarily pin the blocks that hold merge operands + TempPinData(); + merge_context_.Clear(); + // Start the merge process by pushing the first operand + merge_context_.PushOperand( + iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); + PERF_COUNTER_ADD(internal_merge_count, 1); + + TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand"); + + ParsedInternalKey ikey; + for (iter_.Next(); iter_.Valid(); iter_.Next()) { + TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand"); + if (!ParseKey(&ikey)) { + return false; + } + + if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { + // hit the next user key, stop right here + break; + } + if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type || + kTypeDeletionWithTimestamp == ikey.type) { + // hit a delete with the same user key, stop right here + // iter_ is positioned after delete + iter_.Next(); + break; + } + if (!iter_.PrepareValue()) { + valid_ = false; + return false; + } + + if (kTypeValue == ikey.type) { + // hit a put, merge the put value with operands and store the + // final result in saved_value_. We are done! + const Slice val = iter_.value(); + if (!Merge(&val, ikey.user_key)) { + return false; + } + // iter_ is positioned after put + iter_.Next(); + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + return true; + } else if (kTypeMerge == ikey.type) { + // hit a merge, add the value as an operand and run associative merge. + // when complete, add result to operands and continue. 
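+      // Illustrative run for a counter-style merge operator (assumed),
+      // scanning newest to oldest: MERGE +1, MERGE +2, PUT 10. Both MERGE
+      // operands are pushed here; the PUT branch above then folds them over
+      // the base value, yielding 10 + 2 + 1 = 13.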
+      merge_context_.PushOperand(
+          iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+      PERF_COUNTER_ADD(internal_merge_count, 1);
+    } else if (kTypeBlobIndex == ikey.type) {
+      if (expose_blob_index_) {
+        status_ =
+            Status::NotSupported("BlobDB does not support merge operator.");
+        valid_ = false;
+        return false;
+      }
+      // hit a put, merge the put value with operands and store the
+      // final result in saved_value_. We are done!
+      if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
+        return false;
+      }
+      valid_ = true;
+      if (!Merge(&blob_value_, ikey.user_key)) {
+        return false;
+      }
+
+      ResetBlobValue();
+
+      // iter_ is positioned after put
+      iter_.Next();
+      if (!iter_.status().ok()) {
+        valid_ = false;
+        return false;
+      }
+      return true;
+    } else if (kTypeWideColumnEntity == ikey.type) {
+      if (!MergeEntity(iter_.value(), ikey.user_key)) {
+        return false;
+      }
+
+      // iter_ is positioned after put
+      iter_.Next();
+      if (!iter_.status().ok()) {
+        valid_ = false;
+        return false;
+      }
+
+      return true;
+    } else {
+      valid_ = false;
+      status_ = Status::Corruption(
+          "Unrecognized value type: " +
+          std::to_string(static_cast<unsigned char>(ikey.type)));
+      return false;
+    }
+  }
+
+  if (!iter_.status().ok()) {
+    valid_ = false;
+    return false;
+  }
+
+  // we either exhausted all internal keys under this user key, or hit
+  // a deletion marker.
+  // feed null as the existing value to the merge operator, such that
+  // client can differentiate this scenario and do things accordingly.
+  if (!Merge(nullptr, saved_key_.GetUserKey())) {
+    return false;
+  }
+  assert(status_.ok());
+  return true;
+}
+
+void DBIter::Prev() {
+  assert(valid_);
+  assert(status_.ok());
+
+  PERF_COUNTER_ADD(iter_prev_count, 1);
+  PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_);
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  ResetInternalKeysSkippedCounter();
+  bool ok = true;
+  if (direction_ == kForward) {
+    if (!ReverseToBackward()) {
+      ok = false;
+    }
+  }
+  if (ok) {
+    ClearSavedValue();
+
+    Slice prefix;
+    if (prefix_same_as_start_) {
+      assert(prefix_extractor_ != nullptr);
+      prefix = prefix_.GetUserKey();
+    }
+    PrevInternal(prefix_same_as_start_ ? &prefix : nullptr);
+  }
+
+  if (statistics_ != nullptr) {
+    local_stats_.prev_count_++;
+    if (valid_) {
+      local_stats_.prev_found_count_++;
+      local_stats_.bytes_read_ += (key().size() + value().size());
+    }
+  }
+}
+
+bool DBIter::ReverseToForward() {
+  assert(iter_.status().ok());
+
+  // When moving backwards, iter_ is positioned on _previous_ key, which may
+  // not exist or may have different prefix than the current key().
+  // If that's the case, seek iter_ to current key.
+  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+    IterKey last_key;
+    ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber,
+                            kValueTypeForSeek);
+    if (timestamp_size_ > 0) {
+      // TODO: pre-create kTsMax.
+      const std::string kTsMax(timestamp_size_, '\xff');
+      pikey.SetTimestamp(kTsMax);
+    }
+    last_key.SetInternalKey(pikey);
+    iter_.Seek(last_key.GetInternalKey());
+    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+  }
+
+  direction_ = kForward;
+  // Skip keys less than the current key() (a.k.a. saved_key_).
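+  // Illustrative: if saved_key_ is "k2" and the reseek above landed among
+  // "k1"'s entries, the loop below advances until ikey.user_key >= "k2".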
+ while (iter_.Valid()) { + ParsedInternalKey ikey; + if (!ParseKey(&ikey)) { + return false; + } + if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) >= 0) { + return true; + } + iter_.Next(); + } + + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + + return true; +} + +// Move iter_ to the key before saved_key_. +bool DBIter::ReverseToBackward() { + assert(iter_.status().ok()); + + // When current_entry_is_merged_ is true, iter_ may be positioned on the next + // key, which may not exist or may have prefix different from current. + // If that's the case, seek to saved_key_. + if (current_entry_is_merged_ && + (!expect_total_order_inner_iter() || !iter_.Valid())) { + IterKey last_key; + // Using kMaxSequenceNumber and kValueTypeForSeek + // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller + // than saved_key_. + last_key.SetInternalKey(ParsedInternalKey( + saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + if (!expect_total_order_inner_iter()) { + iter_.SeekForPrev(last_key.GetInternalKey()); + } else { + // Some iterators may not support SeekForPrev(), so we avoid using it + // when prefix seek mode is disabled. This is somewhat expensive + // (an extra Prev(), as well as an extra change of direction of iter_), + // so we may need to reconsider it later. + iter_.Seek(last_key.GetInternalKey()); + if (!iter_.Valid() && iter_.status().ok()) { + iter_.SeekToLast(); + } + } + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + } + + direction_ = kReverse; + return FindUserKeyBeforeSavedKey(); +} + +void DBIter::PrevInternal(const Slice* prefix) { + while (iter_.Valid()) { + saved_key_.SetUserKey( + ExtractUserKey(iter_.key()), + !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); + + assert(prefix == nullptr || prefix_extractor_ != nullptr); + if (prefix != nullptr && + prefix_extractor_ + ->Transform(StripTimestampFromUserKey(saved_key_.GetUserKey(), + timestamp_size_)) + .compare(*prefix) != 0) { + assert(prefix_same_as_start_); + // Current key does not have the same prefix as start + valid_ = false; + return; + } + + assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, + *iterate_lower_bound_, /*b_has_ts=*/false) >= 0); + if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_, + /*b_has_ts=*/false) < 0) { + // We've iterated earlier than the user-specified lower bound. + valid_ = false; + return; + } + + if (!FindValueForCurrentKey()) { // assigns valid_ + return; + } + + // Whether or not we found a value for current key, we need iter_ to end up + // on a smaller key. + if (!FindUserKeyBeforeSavedKey()) { + return; + } + + if (valid_) { + // Found the value. + return; + } + + if (TooManyInternalKeysSkipped(false)) { + return; + } + } + + // We haven't found any key - iterator is not valid + valid_ = false; +} + +// Used for backwards iteration. +// Looks at the entries with user key saved_key_ and finds the most up-to-date +// value for it, or executes a merge, or determines that the value was deleted. +// Sets valid_ to true if the value is found and is ready to be presented to +// the user through value(). +// Sets valid_ to false if the value was deleted, and we should try another key. +// Returns false if an error occurred, and !status().ok() and !valid_. 
+// +// PRE: iter_ is positioned on the last entry with user key equal to saved_key_. +// POST: iter_ is positioned on one of the entries equal to saved_key_, or on +// the entry just before them, or on the entry just after them. +bool DBIter::FindValueForCurrentKey() { + assert(iter_.Valid()); + merge_context_.Clear(); + current_entry_is_merged_ = false; + // last entry before merge (could be kTypeDeletion, + // kTypeDeletionWithTimestamp, kTypeSingleDeletion, kTypeValue, + // kTypeBlobIndex, or kTypeWideColumnEntity) + ValueType last_not_merge_type = kTypeDeletion; + ValueType last_key_entry_type = kTypeDeletion; + + // If false, it indicates that we have not seen any valid entry, even though + // last_key_entry_type is initialized to kTypeDeletion. + bool valid_entry_seen = false; + + // Temporarily pin blocks that hold (merge operands / the value) + ReleaseTempPinnedData(); + TempPinData(); + size_t num_skipped = 0; + while (iter_.Valid()) { + ParsedInternalKey ikey; + if (!ParseKey(&ikey)) { + return false; + } + + if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { + // Found a smaller user key, thus we are done with current user key. + break; + } + + assert(ikey.user_key.size() >= timestamp_size_); + Slice ts; + if (timestamp_size_ > 0) { + ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_, + timestamp_size_); + } + + bool visible = IsVisible(ikey.sequence, ts); + if (!visible && + (timestamp_lb_ == nullptr || + user_comparator_.CompareTimestamp(ts, *timestamp_ub_) > 0)) { + // Found an invisible version of the current user key, and it must have + // a higher sequence number or timestamp. Therefore, we are done with the + // current user key. + break; + } + + if (!ts.empty()) { + saved_timestamp_.assign(ts.data(), ts.size()); + } + + if (TooManyInternalKeysSkipped()) { + return false; + } + + // This user key has lots of entries. + // We're going from old to new, and it's taking too long. Let's do a Seek() + // and go from new to old. This helps when a key was overwritten many times. + if (num_skipped >= max_skip_) { + return FindValueForCurrentKeyUsingSeek(); + } + + if (!iter_.PrepareValue()) { + valid_ = false; + return false; + } + + if (timestamp_lb_ != nullptr) { + // Only needed when timestamp_lb_ is not null + [[maybe_unused]] const bool ret = ParseKey(&ikey_); + // Since the preceding ParseKey(&ikey) succeeds, so must this. 
+      assert(ret);
+      saved_key_.SetInternalKey(ikey);
+    } else if (user_comparator_.Compare(ikey.user_key,
+                                        saved_key_.GetUserKey()) < 0) {
+      saved_key_.SetUserKey(
+          ikey.user_key,
+          !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
+    }
+
+    valid_entry_seen = true;
+    last_key_entry_type = ikey.type;
+    switch (last_key_entry_type) {
+      case kTypeValue:
+      case kTypeBlobIndex:
+      case kTypeWideColumnEntity:
+        if (iter_.iter()->IsValuePinned()) {
+          pinned_value_ = iter_.value();
+        } else {
+          valid_ = false;
+          status_ = Status::NotSupported(
+              "Backward iteration not supported if underlying iterator's value "
+              "cannot be pinned.");
+        }
+        merge_context_.Clear();
+        last_not_merge_type = last_key_entry_type;
+        if (!status_.ok()) {
+          return false;
+        }
+        break;
+      case kTypeDeletion:
+      case kTypeDeletionWithTimestamp:
+      case kTypeSingleDeletion:
+        merge_context_.Clear();
+        last_not_merge_type = last_key_entry_type;
+        PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+        break;
+      case kTypeMerge: {
+        assert(merge_operator_ != nullptr);
+        merge_context_.PushOperandBack(
+            iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+        PERF_COUNTER_ADD(internal_merge_count, 1);
+      } break;
+      default:
+        valid_ = false;
+        status_ = Status::Corruption(
+            "Unknown value type: " +
+            std::to_string(static_cast<unsigned char>(last_key_entry_type)));
+        return false;
+    }
+
+    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+    iter_.Prev();
+    ++num_skipped;
+
+    if (visible && timestamp_lb_ != nullptr) {
+      // If timestamp_lb_ is not nullptr, we do not have to look further for
+      // another internal key. We can return this current internal key. Yet we
+      // still keep the invariant that iter_ is positioned before the returned
+      // key.
+      break;
+    }
+  }
+
+  if (!iter_.status().ok()) {
+    valid_ = false;
+    return false;
+  }
+
+  if (!valid_entry_seen) {
+    // Since we haven't seen any valid entry, last_key_entry_type remains
+    // unchanged and the same as its initial value.
+    assert(last_key_entry_type == kTypeDeletion);
+    assert(last_not_merge_type == kTypeDeletion);
+    valid_ = false;
+    return true;
+  }
+
+  if (timestamp_lb_ != nullptr) {
+    assert(last_key_entry_type == ikey_.type);
+  }
+
+  switch (last_key_entry_type) {
+    case kTypeDeletion:
+    case kTypeDeletionWithTimestamp:
+    case kTypeSingleDeletion:
+      if (timestamp_lb_ == nullptr) {
+        valid_ = false;
+      } else {
+        valid_ = true;
+      }
+      return true;
+    case kTypeMerge:
+      current_entry_is_merged_ = true;
+      if (last_not_merge_type == kTypeDeletion ||
+          last_not_merge_type == kTypeSingleDeletion ||
+          last_not_merge_type == kTypeDeletionWithTimestamp) {
+        if (!Merge(nullptr, saved_key_.GetUserKey())) {
+          return false;
+        }
+        return true;
+      } else if (last_not_merge_type == kTypeBlobIndex) {
+        if (expose_blob_index_) {
+          status_ =
+              Status::NotSupported("BlobDB does not support merge operator.");
+          valid_ = false;
+          return false;
+        }
+        if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
+          return false;
+        }
+        valid_ = true;
+        if (!Merge(&blob_value_, saved_key_.GetUserKey())) {
+          return false;
+        }
+
+        ResetBlobValue();
+
+        return true;
+      } else if (last_not_merge_type == kTypeWideColumnEntity) {
+        if (!MergeEntity(pinned_value_, saved_key_.GetUserKey())) {
+          return false;
+        }
+
+        return true;
+      } else {
+        assert(last_not_merge_type == kTypeValue);
+        if (!Merge(&pinned_value_, saved_key_.GetUserKey())) {
+          return false;
+        }
+        return true;
+      }
+      break;
+    case kTypeValue:
+      SetValueAndColumnsFromPlain(pinned_value_);
+
+      break;
+    case kTypeBlobIndex:
+      if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
+        return false;
+      }
+
+      SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
+                                                     : blob_value_);
+
+      break;
+    case kTypeWideColumnEntity:
+      if (!SetValueAndColumnsFromEntity(pinned_value_)) {
+        return false;
+      }
+      break;
+    default:
+      valid_ = false;
+      status_ = Status::Corruption(
+          "Unknown value type: " +
+          std::to_string(static_cast<unsigned char>(last_key_entry_type)));
+      return false;
+  }
+  valid_ = true;
+  return true;
+}
+
+// This function is used in FindValueForCurrentKey.
+// We use Seek() function instead of Prev() to find necessary value
+// TODO: This is very similar to FindNextUserEntry() and MergeValuesNewToOld().
+// Would be nice to reuse some code.
+bool DBIter::FindValueForCurrentKeyUsingSeek() {
+  // FindValueForCurrentKey will enable pinning before calling
+  // FindValueForCurrentKeyUsingSeek()
+  assert(pinned_iters_mgr_.PinningEnabled());
+  std::string last_key;
+  if (0 == timestamp_size_) {
+    AppendInternalKey(&last_key,
+                      ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+                                        kValueTypeForSeek));
+  } else {
+    AppendInternalKeyWithDifferentTimestamp(
+        &last_key,
+        ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+                          kValueTypeForSeek),
+        timestamp_lb_ == nullptr ? *timestamp_ub_ : *timestamp_lb_);
+  }
+  iter_.Seek(last_key);
+  RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+
+  // In case a read_callback is present, the value we seek to may not be
+  // visible. Find the next value that's visible.
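+  // Illustrative: the Seek() above lands on the newest entry with
+  // seq <= sequence_, but a ReadCallback (e.g. for an uncommitted
+  // write-prepared transaction) may still reject it, so the loop below
+  // advances until IsVisible() accepts a version or the user key changes.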
+ ParsedInternalKey ikey; + + while (true) { + if (!iter_.Valid()) { + valid_ = false; + return iter_.status().ok(); + } + + if (!ParseKey(&ikey)) { + return false; + } + assert(ikey.user_key.size() >= timestamp_size_); + Slice ts; + if (timestamp_size_ > 0) { + ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_, + timestamp_size_); + } + + if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { + // No visible values for this key, even though FindValueForCurrentKey() + // has seen some. This is possible if we're using a tailing iterator, and + // the entries were discarded in a compaction. + valid_ = false; + return true; + } + + if (IsVisible(ikey.sequence, ts)) { + break; + } + + iter_.Next(); + } + + if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion || + kTypeDeletionWithTimestamp == ikey.type) { + if (timestamp_lb_ == nullptr) { + valid_ = false; + } else { + valid_ = true; + saved_key_.SetInternalKey(ikey); + } + return true; + } + if (!iter_.PrepareValue()) { + valid_ = false; + return false; + } + if (timestamp_size_ > 0) { + Slice ts = ExtractTimestampFromUserKey(ikey.user_key, timestamp_size_); + saved_timestamp_.assign(ts.data(), ts.size()); + } + if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex || + ikey.type == kTypeWideColumnEntity) { + assert(iter_.iter()->IsValuePinned()); + pinned_value_ = iter_.value(); + if (ikey.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) { + return false; + } + + SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_ + : blob_value_); + } else if (ikey.type == kTypeWideColumnEntity) { + if (!SetValueAndColumnsFromEntity(pinned_value_)) { + return false; + } + } else { + assert(ikey.type == kTypeValue); + SetValueAndColumnsFromPlain(pinned_value_); + } + + if (timestamp_lb_ != nullptr) { + saved_key_.SetInternalKey(ikey); + } + + valid_ = true; + return true; + } + + // kTypeMerge. 
+  // We need to collect all kTypeMerge values and save them
+  // in operands
+  assert(ikey.type == kTypeMerge);
+  current_entry_is_merged_ = true;
+  merge_context_.Clear();
+  merge_context_.PushOperand(
+      iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+  PERF_COUNTER_ADD(internal_merge_count, 1);
+
+  while (true) {
+    iter_.Next();
+
+    if (!iter_.Valid()) {
+      if (!iter_.status().ok()) {
+        valid_ = false;
+        return false;
+      }
+      break;
+    }
+    if (!ParseKey(&ikey)) {
+      return false;
+    }
+    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+                                                saved_key_.GetUserKey())) {
+      break;
+    }
+    if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
+        ikey.type == kTypeDeletionWithTimestamp) {
+      break;
+    }
+    if (!iter_.PrepareValue()) {
+      valid_ = false;
+      return false;
+    }
+
+    if (ikey.type == kTypeValue) {
+      const Slice val = iter_.value();
+      if (!Merge(&val, saved_key_.GetUserKey())) {
+        return false;
+      }
+      return true;
+    } else if (ikey.type == kTypeMerge) {
+      merge_context_.PushOperand(
+          iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+      PERF_COUNTER_ADD(internal_merge_count, 1);
+    } else if (ikey.type == kTypeBlobIndex) {
+      if (expose_blob_index_) {
+        status_ =
+            Status::NotSupported("BlobDB does not support merge operator.");
+        valid_ = false;
+        return false;
+      }
+      if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
+        return false;
+      }
+      valid_ = true;
+      if (!Merge(&blob_value_, saved_key_.GetUserKey())) {
+        return false;
+      }
+
+      ResetBlobValue();
+
+      return true;
+    } else if (ikey.type == kTypeWideColumnEntity) {
+      if (!MergeEntity(iter_.value(), saved_key_.GetUserKey())) {
+        return false;
+      }
+
+      return true;
+    } else {
+      valid_ = false;
+      status_ = Status::Corruption(
+          "Unknown value type: " +
+          std::to_string(static_cast<unsigned char>(ikey.type)));
+      return false;
+    }
+  }
+
+  if (!Merge(nullptr, saved_key_.GetUserKey())) {
+    return false;
+  }
+
+  // Make sure we leave iter_ in a good state. If it's valid and we don't care
+  // about prefixes, that's already good enough. Otherwise it needs to be
+  // seeked to the current key.
+  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+    if (!expect_total_order_inner_iter()) {
+      iter_.SeekForPrev(last_key);
+    } else {
+      iter_.Seek(last_key);
+      if (!iter_.Valid() && iter_.status().ok()) {
+        iter_.SeekToLast();
+      }
+    }
+    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+  }
+
+  valid_ = true;
+  return true;
+}
+
+bool DBIter::Merge(const Slice* val, const Slice& user_key) {
+  // `op_failure_scope` (an output parameter) is not provided (set to nullptr)
+  // since a failure must be propagated regardless of its value.
+  Status s = MergeHelper::TimedFullMerge(
+      merge_operator_, user_key, val, merge_context_.GetOperands(),
+      &saved_value_, logger_, statistics_, clock_, &pinned_value_,
+      /* update_num_ops_stats */ true,
+      /* op_failure_scope */ nullptr);
+  if (!s.ok()) {
+    valid_ = false;
+    status_ = s;
+    return false;
+  }
+
+  SetValueAndColumnsFromPlain(pinned_value_.data() ? pinned_value_
+                                                   : saved_value_);
+
+  valid_ = true;
+  return true;
+}
+
+bool DBIter::MergeEntity(const Slice& entity, const Slice& user_key) {
+  // `op_failure_scope` (an output parameter) is not provided (set to nullptr)
+  // since a failure must be propagated regardless of its value.
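+  // (Editorial sketch, not authoritative: the call below folds the operands
+  // collected in merge_context_ into the wide-column base `entity` and
+  // serializes the result into saved_value_, mirroring DBIter::Merge().)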
+ Status s = MergeHelper::TimedFullMergeWithEntity( + merge_operator_, user_key, entity, merge_context_.GetOperands(), + &saved_value_, logger_, statistics_, clock_, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + if (!s.ok()) { + valid_ = false; + status_ = s; + return false; + } + + if (!SetValueAndColumnsFromEntity(saved_value_)) { + return false; + } + + valid_ = true; + return true; +} + +// Move backwards until the key smaller than saved_key_. +// Changes valid_ only if return value is false. +bool DBIter::FindUserKeyBeforeSavedKey() { + assert(status_.ok()); + size_t num_skipped = 0; + while (iter_.Valid()) { + ParsedInternalKey ikey; + if (!ParseKey(&ikey)) { + return false; + } + + if (CompareKeyForSkip(ikey.user_key, saved_key_.GetUserKey()) < 0) { + return true; + } + + if (TooManyInternalKeysSkipped()) { + return false; + } + + assert(ikey.sequence != kMaxSequenceNumber); + assert(ikey.user_key.size() >= timestamp_size_); + Slice ts; + if (timestamp_size_ > 0) { + ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_, + timestamp_size_); + } + if (!IsVisible(ikey.sequence, ts)) { + PERF_COUNTER_ADD(internal_recent_skipped_count, 1); + } else { + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } + + if (num_skipped >= max_skip_) { + num_skipped = 0; + IterKey last_key; + ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber, + kValueTypeForSeek); + if (timestamp_size_ > 0) { + // TODO: pre-create kTsMax. + const std::string kTsMax(timestamp_size_, '\xff'); + pikey.SetTimestamp(kTsMax); + } + last_key.SetInternalKey(pikey); + // It would be more efficient to use SeekForPrev() here, but some + // iterators may not support it. + iter_.Seek(last_key.GetInternalKey()); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + if (!iter_.Valid()) { + break; + } + } else { + ++num_skipped; + } + + iter_.Prev(); + } + + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + + return true; +} + +bool DBIter::TooManyInternalKeysSkipped(bool increment) { + if ((max_skippable_internal_keys_ > 0) && + (num_internal_keys_skipped_ > max_skippable_internal_keys_)) { + valid_ = false; + status_ = Status::Incomplete("Too many internal keys skipped."); + return true; + } else if (increment) { + num_internal_keys_skipped_++; + } + return false; +} + +bool DBIter::IsVisible(SequenceNumber sequence, const Slice& ts, + bool* more_recent) { + // Remember that comparator orders preceding timestamp as larger. + // TODO(yanqin): support timestamp in read_callback_. + bool visible_by_seq = (read_callback_ == nullptr) + ? sequence <= sequence_ + : read_callback_->IsVisible(sequence); + + bool visible_by_ts = + (timestamp_ub_ == nullptr || + user_comparator_.CompareTimestamp(ts, *timestamp_ub_) <= 0) && + (timestamp_lb_ == nullptr || + user_comparator_.CompareTimestamp(ts, *timestamp_lb_) >= 0); + + if (more_recent) { + *more_recent = !visible_by_seq; + } + return visible_by_seq && visible_by_ts; +} + +void DBIter::SetSavedKeyToSeekTarget(const Slice& target) { + is_key_seqnum_zero_ = false; + SequenceNumber seq = sequence_; + saved_key_.Clear(); + saved_key_.SetInternalKey(target, seq, kValueTypeForSeek, timestamp_ub_); + + if (iterate_lower_bound_ != nullptr && + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_, + /*b_has_ts=*/false) < 0) { + // Seek key is smaller than the lower bound. 
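+    // (Editorial note: the target is clamped up to iterate_lower_bound_ so a
+    // forward seek never starts below the configured lower bound.)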
+ saved_key_.Clear(); + saved_key_.SetInternalKey(*iterate_lower_bound_, seq, kValueTypeForSeek, + timestamp_ub_); + } +} + +void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) { + is_key_seqnum_zero_ = false; + saved_key_.Clear(); + // now saved_key is used to store internal key. + saved_key_.SetInternalKey(target, 0 /* sequence_number */, + kValueTypeForSeekForPrev, timestamp_ub_); + + if (timestamp_size_ > 0) { + const std::string kTsMin(timestamp_size_, '\0'); + Slice ts = kTsMin; + saved_key_.UpdateInternalKey( + /*seq=*/0, kValueTypeForSeekForPrev, + timestamp_lb_ == nullptr ? &ts : timestamp_lb_); + } + + if (iterate_upper_bound_ != nullptr && + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_upper_bound_, + /*b_has_ts=*/false) >= 0) { + saved_key_.Clear(); + saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber, + kValueTypeForSeekForPrev, timestamp_ub_); + if (timestamp_size_ > 0) { + const std::string kTsMax(timestamp_size_, '\xff'); + Slice ts = kTsMax; + saved_key_.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeekForPrev, + &ts); + } + } +} + +void DBIter::Seek(const Slice& target) { + PERF_COUNTER_ADD(iter_seek_count, 1); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); + StopWatch sw(clock_, statistics_, DB_SEEK); + + if (db_impl_ != nullptr && cfd_ != nullptr) { + // TODO: What do we do if this returns an error? + Slice lower_bound, upper_bound; + if (iterate_lower_bound_ != nullptr) { + lower_bound = *iterate_lower_bound_; + } else { + lower_bound = Slice(""); + } + if (iterate_upper_bound_ != nullptr) { + upper_bound = *iterate_upper_bound_; + } else { + upper_bound = Slice(""); + } + db_impl_->TraceIteratorSeek(cfd_->GetID(), target, lower_bound, upper_bound) + .PermitUncheckedError(); + } + + status_ = Status::OK(); + ReleaseTempPinnedData(); + ResetBlobValue(); + ResetValueAndColumns(); + ResetInternalKeysSkippedCounter(); + + // Seek the inner iterator based on the target key. + { + PERF_TIMER_GUARD(seek_internal_seek_time); + + SetSavedKeyToSeekTarget(target); + iter_.Seek(saved_key_.GetInternalKey()); + + RecordTick(statistics_, NUMBER_DB_SEEK); + } + if (!iter_.Valid()) { + valid_ = false; + return; + } + direction_ = kForward; + + // Now the inner iterator is placed to the target position. From there, + // we need to find out the next key that is visible to the user. + ClearSavedValue(); + if (prefix_same_as_start_) { + // The case where the iterator needs to be invalidated if it has exhausted + // keys within the same prefix of the seek key. + assert(prefix_extractor_ != nullptr); + Slice target_prefix = prefix_extractor_->Transform(target); + FindNextUserEntry(false /* not skipping saved_key */, + &target_prefix /* prefix */); + if (valid_) { + // Remember the prefix of the seek key for the future Next() call to + // check. + prefix_.SetUserKey(target_prefix); + } + } else { + FindNextUserEntry(false /* not skipping saved_key */, nullptr); + } + if (!valid_) { + return; + } + + // Updating stats and perf context counters. 
+  if (statistics_ != nullptr) {
+    // Decrement since we don't want to count this key as skipped
+    RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+    RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+  }
+  PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+}
+
+void DBIter::SeekForPrev(const Slice& target) {
+  PERF_COUNTER_ADD(iter_seek_count, 1);
+  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+  StopWatch sw(clock_, statistics_, DB_SEEK);
+
+  if (db_impl_ != nullptr && cfd_ != nullptr) {
+    // TODO: What do we do if this returns an error?
+    Slice lower_bound, upper_bound;
+    if (iterate_lower_bound_ != nullptr) {
+      lower_bound = *iterate_lower_bound_;
+    } else {
+      lower_bound = Slice("");
+    }
+    if (iterate_upper_bound_ != nullptr) {
+      upper_bound = *iterate_upper_bound_;
+    } else {
+      upper_bound = Slice("");
+    }
+    db_impl_
+        ->TraceIteratorSeekForPrev(cfd_->GetID(), target, lower_bound,
+                                   upper_bound)
+        .PermitUncheckedError();
+  }
+
+  status_ = Status::OK();
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  ResetInternalKeysSkippedCounter();
+
+  // Seek the inner iterator based on the target key.
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    SetSavedKeyToSeekForPrevTarget(target);
+    iter_.SeekForPrev(saved_key_.GetInternalKey());
+    RecordTick(statistics_, NUMBER_DB_SEEK);
+  }
+  if (!iter_.Valid()) {
+    valid_ = false;
+    return;
+  }
+  direction_ = kReverse;
+
+  // Now the inner iterator is placed to the target position. From there,
+  // we need to find out the first key that is visible to the user in the
+  // backward direction.
+  ClearSavedValue();
+  if (prefix_same_as_start_) {
+    // The case where the iterator needs to be invalidated if it has exhausted
+    // keys within the same prefix of the seek key.
+    assert(prefix_extractor_ != nullptr);
+    Slice target_prefix = prefix_extractor_->Transform(target);
+    PrevInternal(&target_prefix);
+    if (valid_) {
+      // Remember the prefix of the seek key for the future Prev() call to
+      // check.
+      prefix_.SetUserKey(target_prefix);
+    }
+  } else {
+    PrevInternal(nullptr);
+  }
+
+  // Report stats and perf context.
+  if (statistics_ != nullptr && valid_) {
+    RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+    RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+    PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+  }
+}
+
+void DBIter::SeekToFirst() {
+  if (iterate_lower_bound_ != nullptr) {
+    Seek(*iterate_lower_bound_);
+    return;
+  }
+  PERF_COUNTER_ADD(iter_seek_count, 1);
+  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+  // Don't use iter_::Seek() if we set a prefix extractor
+  // because prefix seek will be used.
+  if (!expect_total_order_inner_iter()) {
+    max_skip_ = std::numeric_limits<uint64_t>::max();
+  }
+  status_ = Status::OK();
+  // if iterator is empty, this status_ could be unchecked.
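+  // (Editorial note: PermitUncheckedError() marks status_ as inspected so
+  // that ASSERT_STATUS_CHECKED builds do not complain if the caller never
+  // reads status() on an empty iterator.)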
+  status_.PermitUncheckedError();
+  direction_ = kForward;
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  ResetInternalKeysSkippedCounter();
+  ClearSavedValue();
+  is_key_seqnum_zero_ = false;
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    iter_.SeekToFirst();
+  }
+
+  RecordTick(statistics_, NUMBER_DB_SEEK);
+  if (iter_.Valid()) {
+    saved_key_.SetUserKey(
+        ExtractUserKey(iter_.key()),
+        !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+    FindNextUserEntry(false /* not skipping saved_key */,
+                      nullptr /* no prefix check */);
+    if (statistics_ != nullptr) {
+      if (valid_) {
+        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+        RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+        PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+      }
+    }
+  } else {
+    valid_ = false;
+  }
+  if (valid_ && prefix_same_as_start_) {
+    assert(prefix_extractor_ != nullptr);
+    prefix_.SetUserKey(prefix_extractor_->Transform(
+        StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
+  }
+}
+
+void DBIter::SeekToLast() {
+  if (iterate_upper_bound_ != nullptr) {
+    // Seek to last key strictly less than ReadOptions.iterate_upper_bound.
+    SeekForPrev(*iterate_upper_bound_);
+#ifndef NDEBUG
+    Slice k = Valid() ? key() : Slice();
+    if (Valid() && timestamp_size_ > 0 && timestamp_lb_) {
+      k.remove_suffix(kNumInternalBytes + timestamp_size_);
+    }
+    assert(!Valid() || user_comparator_.CompareWithoutTimestamp(
+                           k, /*a_has_ts=*/false, *iterate_upper_bound_,
+                           /*b_has_ts=*/false) < 0);
+#endif
+    return;
+  }
+
+  PERF_COUNTER_ADD(iter_seek_count, 1);
+  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+  // Don't use iter_::Seek() if we set a prefix extractor
+  // because prefix seek will be used.
+  if (!expect_total_order_inner_iter()) {
+    max_skip_ = std::numeric_limits<uint64_t>::max();
+  }
+  status_ = Status::OK();
+  // if iterator is empty, this status_ could be unchecked.
+  status_.PermitUncheckedError();
+  direction_ = kReverse;
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  ResetInternalKeysSkippedCounter();
+  ClearSavedValue();
+  is_key_seqnum_zero_ = false;
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    iter_.SeekToLast();
+  }
+  PrevInternal(nullptr);
+  if (statistics_ != nullptr) {
+    RecordTick(statistics_, NUMBER_DB_SEEK);
+    if (valid_) {
+      RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+      RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+      PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+    }
+  }
+  if (valid_ && prefix_same_as_start_) {
+    assert(prefix_extractor_ != nullptr);
+    prefix_.SetUserKey(prefix_extractor_->Transform(
+        StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
+  }
+}
+
+Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
+                        const ImmutableOptions& ioptions,
+                        const MutableCFOptions& mutable_cf_options,
+                        const Comparator* user_key_comparator,
+                        InternalIterator* internal_iter, const Version* version,
+                        const SequenceNumber& sequence,
+                        uint64_t max_sequential_skip_in_iterations,
+                        ReadCallback* read_callback, DBImpl* db_impl,
+                        ColumnFamilyData* cfd, bool expose_blob_index) {
+  DBIter* db_iter =
+      new DBIter(env, read_options, ioptions, mutable_cf_options,
+                 user_key_comparator, internal_iter, version, sequence, false,
+                 max_sequential_skip_in_iterations, read_callback, db_impl, cfd,
+                 expose_blob_index);
+  return db_iter;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_iter.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_iter.h
new file mode 100644
index 0000000000..e1663bb7ed
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_iter.h
@@ -0,0 +1,411 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/wide_columns.h"
+#include "table/iterator_wrapper.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Version;
+
+// This file declares the factory functions of DBIter, in its original form
+// or a wrapped form with class ArenaWrappedDBIter, which is defined here.
+// Class DBIter, which is declared and implemented inside db_iter.cc, is
+// an iterator that converts internal keys (yielded by an InternalIterator)
+// that were live at the specified sequence number into appropriate user
+// keys.
+// Each internal key consists of a user key, a sequence number, and a value
+// type. DBIter deals with multiple key versions, tombstones, merge operands,
+// etc, and exposes an Iterator.
+// For example, DBIter may wrap following InternalIterator: +// user key: AAA value: v3 seqno: 100 type: Put +// user key: AAA value: v2 seqno: 97 type: Put +// user key: AAA value: v1 seqno: 95 type: Put +// user key: BBB value: v1 seqno: 90 type: Put +// user key: BBC value: N/A seqno: 98 type: Delete +// user key: BBC value: v1 seqno: 95 type: Put +// If the snapshot passed in is 102, then the DBIter is expected to +// expose the following iterator: +// key: AAA value: v3 +// key: BBB value: v1 +// If the snapshot passed in is 96, then it should expose: +// key: AAA value: v1 +// key: BBB value: v1 +// key: BBC value: v1 +// + +// Memtables and sstables that make the DB representation contain +// (userkey,seq,type) => uservalue entries. DBIter +// combines multiple entries for the same userkey found in the DB +// representation into a single entry while accounting for sequence +// numbers, deletion markers, overwrites, etc. +class DBIter final : public Iterator { + public: + // The following is grossly complicated. TODO: clean it up + // Which direction is the iterator currently moving? + // (1) When moving forward: + // (1a) if current_entry_is_merged_ = false, the internal iterator is + // positioned at the exact entry that yields this->key(), this->value() + // (1b) if current_entry_is_merged_ = true, the internal iterator is + // positioned immediately after the last entry that contributed to the + // current this->value(). That entry may or may not have key equal to + // this->key(). + // (2) When moving backwards, the internal iterator is positioned + // just before all entries whose user key == this->key(). + enum Direction : uint8_t { kForward, kReverse }; + + // LocalStatistics contain Statistics counters that will be aggregated per + // each iterator instance and then will be sent to the global statistics when + // the iterator is destroyed. + // + // The purpose of this approach is to avoid perf regression happening + // when multiple threads bump the atomic counters from a DBIter::Next(). 
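+  // (Editorial note: the counters below are plain uint64_t fields rather
+  // than atomics; each DBIter owns one LocalStatistics instance and flushes
+  // it into the shared Statistics object via BumpGlobalStatistics(), which
+  // the destructor invokes.)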
+ struct LocalStatistics { + explicit LocalStatistics() { ResetCounters(); } + + void ResetCounters() { + next_count_ = 0; + next_found_count_ = 0; + prev_count_ = 0; + prev_found_count_ = 0; + bytes_read_ = 0; + skip_count_ = 0; + } + + void BumpGlobalStatistics(Statistics* global_statistics) { + RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_); + RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_); + RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_); + RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_); + RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_); + RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_); + PERF_COUNTER_ADD(iter_read_bytes, bytes_read_); + ResetCounters(); + } + + // Map to Tickers::NUMBER_DB_NEXT + uint64_t next_count_; + // Map to Tickers::NUMBER_DB_NEXT_FOUND + uint64_t next_found_count_; + // Map to Tickers::NUMBER_DB_PREV + uint64_t prev_count_; + // Map to Tickers::NUMBER_DB_PREV_FOUND + uint64_t prev_found_count_; + // Map to Tickers::ITER_BYTES_READ + uint64_t bytes_read_; + // Map to Tickers::NUMBER_ITER_SKIP + uint64_t skip_count_; + }; + + DBIter(Env* _env, const ReadOptions& read_options, + const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Comparator* cmp, + InternalIterator* iter, const Version* version, SequenceNumber s, + bool arena_mode, uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool expose_blob_index); + + // No copying allowed + DBIter(const DBIter&) = delete; + void operator=(const DBIter&) = delete; + + ~DBIter() override { + // Release pinned data if any + if (pinned_iters_mgr_.PinningEnabled()) { + pinned_iters_mgr_.ReleasePinnedData(); + } + RecordTick(statistics_, NO_ITERATOR_DELETED); + ResetInternalKeysSkippedCounter(); + local_stats_.BumpGlobalStatistics(statistics_); + iter_.DeleteIter(arena_mode_); + } + void SetIter(InternalIterator* iter) { + assert(iter_.iter() == nullptr); + iter_.Set(iter); + iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_); + } + + bool Valid() const override { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + if (valid_) { + status_.PermitUncheckedError(); + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + return valid_; + } + Slice key() const override { + assert(valid_); + if (timestamp_lb_) { + return saved_key_.GetInternalKey(); + } else { + const Slice ukey_and_ts = saved_key_.GetUserKey(); + return Slice(ukey_and_ts.data(), ukey_and_ts.size() - timestamp_size_); + } + } + Slice value() const override { + assert(valid_); + + return value_; + } + + const WideColumns& columns() const override { + assert(valid_); + + return wide_columns_; + } + + Status status() const override { + if (status_.ok()) { + return iter_.status(); + } else { + assert(!valid_); + return status_; + } + } + Slice timestamp() const override { + assert(valid_); + assert(timestamp_size_ > 0); + if (direction_ == kReverse) { + return saved_timestamp_; + } + const Slice ukey_and_ts = saved_key_.GetUserKey(); + assert(timestamp_size_ < ukey_and_ts.size()); + return ExtractTimestampFromUserKey(ukey_and_ts, timestamp_size_); + } + bool IsBlob() const { + assert(valid_); + return is_blob_; + } + + Status GetProperty(std::string prop_name, std::string* prop) override; + + void Next() final override; + void Prev() final override; + // 'target' does not contain timestamp, even if user timestamp feature is + // enabled. 
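+  // (Editorial note: the timestamp, when enabled, is attached internally
+  // from timestamp_ub_/timestamp_lb_; see SetSavedKeyToSeekTarget() and
+  // SetSavedKeyToSeekForPrevTarget() in db_iter.cc.)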
+ void Seek(const Slice& target) final override; + void SeekForPrev(const Slice& target) final override; + void SeekToFirst() final override; + void SeekToLast() final override; + Env* env() const { return env_; } + void set_sequence(uint64_t s) { + sequence_ = s; + if (read_callback_) { + read_callback_->Refresh(s); + } + } + void set_valid(bool v) { valid_ = v; } + + private: + // For all methods in this block: + // PRE: iter_->Valid() && status_.ok() + // Return false if there was an error, and status() is non-ok, valid_ = false; + // in this case callers would usually stop what they were doing and return. + bool ReverseToForward(); + bool ReverseToBackward(); + // Set saved_key_ to the seek key to target, with proper sequence number set. + // It might get adjusted if the seek key is smaller than iterator lower bound. + // target does not have timestamp. + void SetSavedKeyToSeekTarget(const Slice& target); + // Set saved_key_ to the seek key to target, with proper sequence number set. + // It might get adjusted if the seek key is larger than iterator upper bound. + // target does not have timestamp. + void SetSavedKeyToSeekForPrevTarget(const Slice& target); + bool FindValueForCurrentKey(); + bool FindValueForCurrentKeyUsingSeek(); + bool FindUserKeyBeforeSavedKey(); + // If `skipping_saved_key` is true, the function will keep iterating until it + // finds a user key that is larger than `saved_key_`. + // If `prefix` is not null, the iterator needs to stop when all keys for the + // prefix are exhausted and the iterator is set to invalid. + bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix); + // Internal implementation of FindNextUserEntry(). + bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix); + bool ParseKey(ParsedInternalKey* key); + bool MergeValuesNewToOld(); + + // If prefix is not null, we need to set the iterator to invalid if no more + // entry can be found within the prefix. + void PrevInternal(const Slice* prefix); + bool TooManyInternalKeysSkipped(bool increment = true); + bool IsVisible(SequenceNumber sequence, const Slice& ts, + bool* more_recent = nullptr); + + // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData() + // is called + void TempPinData() { + if (!pin_thru_lifetime_) { + pinned_iters_mgr_.StartPinning(); + } + } + + // Release blocks pinned by TempPinData() + void ReleaseTempPinnedData() { + if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) { + pinned_iters_mgr_.ReleasePinnedData(); + } + } + + inline void ClearSavedValue() { + if (saved_value_.capacity() > 1048576) { + std::string empty; + swap(empty, saved_value_); + } else { + saved_value_.clear(); + } + } + + inline void ResetInternalKeysSkippedCounter() { + local_stats_.skip_count_ += num_internal_keys_skipped_; + if (valid_) { + local_stats_.skip_count_--; + } + num_internal_keys_skipped_ = 0; + } + + bool expect_total_order_inner_iter() { + assert(expect_total_order_inner_iter_ || prefix_extractor_ != nullptr); + return expect_total_order_inner_iter_; + } + + // If lower bound of timestamp is given by ReadOptions.iter_start_ts, we need + // to return versions of the same key. We cannot just skip if the key value + // is the same but timestamps are different but fall in timestamp range. + inline int CompareKeyForSkip(const Slice& a, const Slice& b) { + return timestamp_lb_ != nullptr + ? 
user_comparator_.Compare(a, b) + : user_comparator_.CompareWithoutTimestamp(a, b); + } + + // Retrieves the blob value for the specified user key using the given blob + // index when using the integrated BlobDB implementation. + bool SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index); + + void ResetBlobValue() { + is_blob_ = false; + blob_value_.Reset(); + } + + void SetValueAndColumnsFromPlain(const Slice& slice) { + assert(value_.empty()); + assert(wide_columns_.empty()); + + value_ = slice; + wide_columns_.emplace_back(kDefaultWideColumnName, slice); + } + + bool SetValueAndColumnsFromEntity(Slice slice); + + void ResetValueAndColumns() { + value_.clear(); + wide_columns_.clear(); + } + + // If user-defined timestamp is enabled, `user_key` includes timestamp. + bool Merge(const Slice* val, const Slice& user_key); + bool MergeEntity(const Slice& entity, const Slice& user_key); + + const SliceTransform* prefix_extractor_; + Env* const env_; + SystemClock* clock_; + Logger* logger_; + UserComparatorWrapper user_comparator_; + const MergeOperator* const merge_operator_; + IteratorWrapper iter_; + const Version* version_; + ReadCallback* read_callback_; + // Max visible sequence number. It is normally the snapshot seq unless we have + // uncommitted data in db as in WriteUnCommitted. + SequenceNumber sequence_; + + IterKey saved_key_; + // Reusable internal key data structure. This is only used inside one function + // and should not be used across functions. Reusing this object can reduce + // overhead of calling construction of the function if creating it each time. + ParsedInternalKey ikey_; + std::string saved_value_; + Slice pinned_value_; + // for prefix seek mode to support prev() + PinnableSlice blob_value_; + // Value of the default column + Slice value_; + // All columns (i.e. name-value pairs) + WideColumns wide_columns_; + Statistics* statistics_; + uint64_t max_skip_; + uint64_t max_skippable_internal_keys_; + uint64_t num_internal_keys_skipped_; + const Slice* iterate_lower_bound_; + const Slice* iterate_upper_bound_; + + // The prefix of the seek key. It is only used when prefix_same_as_start_ + // is true and prefix extractor is not null. In Next() or Prev(), current keys + // will be checked against this prefix, so that the iterator can be + // invalidated if the keys in this prefix has been exhausted. Set it using + // SetUserKey() and use it using GetUserKey(). + IterKey prefix_; + + Status status_; + Direction direction_; + bool valid_; + bool current_entry_is_merged_; + // True if we know that the current entry's seqnum is 0. + // This information is used as that the next entry will be for another + // user key. + bool is_key_seqnum_zero_; + const bool prefix_same_as_start_; + // Means that we will pin all data blocks we read as long the Iterator + // is not deleted, will be true if ReadOptions::pin_data is true + const bool pin_thru_lifetime_; + // Expect the inner iterator to maintain a total order. + // prefix_extractor_ must be non-NULL if the value is false. + const bool expect_total_order_inner_iter_; + ReadTier read_tier_; + bool fill_cache_; + bool verify_checksums_; + // Whether the iterator is allowed to expose blob references. Set to true when + // the stacked BlobDB implementation is used, false otherwise. + bool expose_blob_index_; + bool is_blob_; + bool arena_mode_; + // List of operands for merge operator. 
+  MergeContext merge_context_;
+  LocalStatistics local_stats_;
+  PinnedIteratorsManager pinned_iters_mgr_;
+  DBImpl* db_impl_;
+  ColumnFamilyData* cfd_;
+  const Slice* const timestamp_ub_;
+  const Slice* const timestamp_lb_;
+  const size_t timestamp_size_;
+  std::string saved_timestamp_;
+};
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified `sequence` number
+// into appropriate user keys.
+extern Iterator* NewDBIterator(
+    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options,
+    const Comparator* user_key_comparator, InternalIterator* internal_iter,
+    const Version* version, const SequenceNumber& sequence,
+    uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback,
+    DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr,
+    bool expose_blob_index = false);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_test_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_test_util.h
new file mode 100644
index 0000000000..5b35c9907b
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_test_util.h
@@ -0,0 +1,1340 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <fcntl.h>
+
+#include <algorithm>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "file/filename.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "table/mock_table.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+// In case defined by Windows headers
+#undef small
+
+namespace ROCKSDB_NAMESPACE {
+class MockEnv;
+
+namespace anon {
+class AtomicCounter {
+ public:
+  explicit AtomicCounter(Env* env = NULL)
+      : env_(env), cond_count_(&mu_), count_(0) {}
+
+  void Increment() {
+    MutexLock l(&mu_);
+    count_++;
+    cond_count_.SignalAll();
+  }
+
+  int Read() {
+    MutexLock l(&mu_);
+    return count_;
+  }
+
+  bool WaitFor(int count) {
+    MutexLock l(&mu_);
+
+    uint64_t start = env_->NowMicros();
+    while (count_ < count) {
+      uint64_t now = env_->NowMicros();
+      cond_count_.TimedWait(now + /*1s*/ 1 * 1000 * 1000);
+      if (env_->NowMicros() - start > /*10s*/ 10 * 1000 * 1000) {
+        return false;
+      }
+      if (count_ < count) {
+        GTEST_LOG_(WARNING) << "WaitFor is taking more time than usual";
+      }
+    }
+
+    return true;
+  }
+
+  void Reset() {
+    MutexLock l(&mu_);
+    count_ = 0;
+    cond_count_.SignalAll();
+  }
+
+ private:
+  Env* env_;
+  port::Mutex mu_;
+  port::CondVar cond_count_;
+  int count_;
+};
+
+struct OptionsOverride {
+  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+  // These will be used only if filter_policy is set
+  bool partition_filters = false;
+  // Force using a default block cache. (Setting to false allows an ASAN build
+  // to use a trivially small block cache for better UAF error detection.)
+  bool full_block_cache = false;
+  uint64_t metadata_block_size = 1024;
+
+  // Used as a bit mask of individual enums in which to skip an XF test point
+  int skip_policy = 0;
+};
+
+}  // namespace anon
+
+enum SkipPolicy { kSkipNone = 0, kSkipNoSnapshot = 1, kSkipNoPrefix = 2 };
+
+// Special Env used to delay background operations
+class SpecialEnv : public EnvWrapper {
+ public:
+  explicit SpecialEnv(Env* base, bool time_elapse_only_sleep = false);
+
+  static const char* kClassName() { return "SpecialEnv"; }
+  const char* Name() const override { return kClassName(); }
+
+  Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+                         const EnvOptions& soptions) override {
+    class SSTableFile : public WritableFile {
+     private:
+      SpecialEnv* env_;
+      std::unique_ptr<WritableFile> base_;
+
+     public:
+      SSTableFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& base)
+          : env_(env), base_(std::move(base)) {}
+      Status Append(const Slice& data) override {
+        if (env_->table_write_callback_) {
+          (*env_->table_write_callback_)();
+        }
+        if (env_->drop_writes_.load(std::memory_order_acquire)) {
+          // Drop writes on the floor
+          return Status::OK();
+        } else if (env_->no_space_.load(std::memory_order_acquire)) {
+          return Status::NoSpace("No space left on device");
+        } else {
+          env_->bytes_written_ += data.size();
+          return base_->Append(data);
+        }
+      }
+      Status Append(
+          const Slice& data,
+          const DataVerificationInfo& /* verification_info */) override {
+        return Append(data);
+      }
+      Status PositionedAppend(const Slice& data, uint64_t offset) override {
+        if (env_->table_write_callback_) {
+          (*env_->table_write_callback_)();
+        }
+        if (env_->drop_writes_.load(std::memory_order_acquire)) {
+          // Drop writes on the floor
+          return Status::OK();
+        } else if (env_->no_space_.load(std::memory_order_acquire)) {
+          return Status::NoSpace("No space left on device");
+        } else {
+          env_->bytes_written_ += data.size();
+          return base_->PositionedAppend(data, offset);
+        }
+      }
+      Status PositionedAppend(
+          const Slice& data, uint64_t offset,
+          const DataVerificationInfo& /* verification_info */) override {
+        return PositionedAppend(data, offset);
+      }
+      Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+      Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+        Status s = base_->RangeSync(offset, nbytes);
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::RangeSync", &s);
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+        return s;
+      }
+      Status Close() override {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        // Check preallocation size
+        // preallocation size is never passed to base file.
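+        // (Editorial note: TEST_SYNC_POINT_CALLBACK hands the observed
+        // preallocation size to any callback a test registered under this
+        // name; the macro compiles to a no-op in release builds.)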
+        size_t preallocation_size = preallocation_block_size();
+        TEST_SYNC_POINT_CALLBACK("DBTestWritableFile.GetPreallocationStatus",
+                                 &preallocation_size);
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+        Status s = base_->Close();
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Close", &s);
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+        return s;
+      }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        ++env_->sync_counter_;
+        while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) {
+          env_->SleepForMicroseconds(100000);
+        }
+        Status s;
+        if (!env_->skip_fsync_) {
+          s = base_->Sync();
+        }
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Sync", &s);
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+        return s;
+      }
+      void SetIOPriority(Env::IOPriority pri) override {
+        base_->SetIOPriority(pri);
+      }
+      Env::IOPriority GetIOPriority() override {
+        return base_->GetIOPriority();
+      }
+      bool use_direct_io() const override { return base_->use_direct_io(); }
+      Status Allocate(uint64_t offset, uint64_t len) override {
+        return base_->Allocate(offset, len);
+      }
+      size_t GetUniqueId(char* id, size_t max_size) const override {
+        return base_->GetUniqueId(id, max_size);
+      }
+    };
+    class ManifestFile : public WritableFile {
+     public:
+      ManifestFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+          : env_(env), base_(std::move(b)) {}
+      Status Append(const Slice& data) override {
+        if (env_->manifest_write_error_.load(std::memory_order_acquire)) {
+          return Status::IOError("simulated writer error");
+        } else {
+          return base_->Append(data);
+        }
+      }
+      Status Append(
+          const Slice& data,
+          const DataVerificationInfo& /*verification_info*/) override {
+        return Append(data);
+      }
+
+      Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+      Status Close() override { return base_->Close(); }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        ++env_->sync_counter_;
+        if (env_->manifest_sync_error_.load(std::memory_order_acquire)) {
+          return Status::IOError("simulated sync error");
+        } else {
+          if (env_->skip_fsync_) {
+            return Status::OK();
+          } else {
+            return base_->Sync();
+          }
+        }
+      }
+      uint64_t GetFileSize() override { return base_->GetFileSize(); }
+      Status Allocate(uint64_t offset, uint64_t len) override {
+        return base_->Allocate(offset, len);
+      }
+
+     private:
+      SpecialEnv* env_;
+      std::unique_ptr<WritableFile> base_;
+    };
+    class WalFile : public WritableFile {
+     public:
+      WalFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+          : env_(env), base_(std::move(b)) {
+        env_->num_open_wal_file_.fetch_add(1);
+      }
+      virtual ~WalFile() { env_->num_open_wal_file_.fetch_add(-1); }
+      Status Append(const Slice& data) override {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        TEST_SYNC_POINT("SpecialEnv::WalFile::Append:1");
+#endif
+        Status s;
+        if (env_->log_write_error_.load(std::memory_order_acquire)) {
+          s = Status::IOError("simulated writer error");
+        } else {
+          int slowdown =
+              env_->log_write_slowdown_.load(std::memory_order_acquire);
+          if (slowdown > 0) {
+            env_->SleepForMicroseconds(slowdown);
+          }
+          s = base_->Append(data);
+        }
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        TEST_SYNC_POINT("SpecialEnv::WalFile::Append:2");
+#endif
+        return s;
+      }
+      Status Append(
+          const Slice& data,
+          const DataVerificationInfo& /* verification_info */) override {
+        return Append(data);
+      }
+      Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+      void PrepareWrite(size_t offset, size_t len) override {
+        base_->PrepareWrite(offset, len);
+      }
+      void SetPreallocationBlockSize(size_t size) override {
+        base_->SetPreallocationBlockSize(size);
+      }
+      Status Close() override {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        // Check preallocation size
+        size_t block_size, last_allocated_block;
+        base_->GetPreallocationStatus(&block_size, &last_allocated_block);
+        TEST_SYNC_POINT_CALLBACK("DBTestWalFile.GetPreallocationStatus",
+                                 &block_size);
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+
+        return base_->Close();
+      }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        ++env_->sync_counter_;
+        if (env_->corrupt_in_sync_) {
+          EXPECT_OK(Append(std::string(33000, ' ')));
+          return Status::IOError("Ingested Sync Failure");
+        }
+        if (env_->skip_fsync_) {
+          return Status::OK();
+        } else {
+          return base_->Sync();
+        }
+      }
+      bool IsSyncThreadSafe() const override {
+        return env_->is_wal_sync_thread_safe_.load();
+      }
+      Status Allocate(uint64_t offset, uint64_t len) override {
+        return base_->Allocate(offset, len);
+      }
+
+     private:
+      SpecialEnv* env_;
+      std::unique_ptr<WritableFile> base_;
+    };
+    class OtherFile : public WritableFile {
+     public:
+      OtherFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+          : env_(env), base_(std::move(b)) {}
+      Status Append(const Slice& data) override { return base_->Append(data); }
+      Status Append(
+          const Slice& data,
+          const DataVerificationInfo& /*verification_info*/) override {
+        return Append(data);
+      }
+      Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+      Status Close() override { return base_->Close(); }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        if (env_->skip_fsync_) {
+          return Status::OK();
+        } else {
+          return base_->Sync();
+        }
+      }
+      uint64_t GetFileSize() override { return base_->GetFileSize(); }
+      Status Allocate(uint64_t offset, uint64_t len) override {
+        return base_->Allocate(offset, len);
+      }
+
+     private:
+      SpecialEnv* env_;
+      std::unique_ptr<WritableFile> base_;
+    };
+
+    if (no_file_overwrite_.load(std::memory_order_acquire) &&
+        target()->FileExists(f).ok()) {
+      return Status::NotSupported("SpecialEnv::no_file_overwrite_ is true.");
+    }
+
+    if (non_writeable_rate_.load(std::memory_order_acquire) > 0) {
+      uint32_t random_number;
+      {
+        MutexLock l(&rnd_mutex_);
+        random_number = rnd_.Uniform(100);
+      }
+      if (random_number < non_writeable_rate_.load()) {
+        return Status::IOError("simulated random write error");
+      }
+    }
+
+    new_writable_count_++;
+
+    if (non_writable_count_.load() > 0) {
+      non_writable_count_--;
+      return Status::IOError("simulated write error");
+    }
+
+    EnvOptions optimized = soptions;
+    if (strstr(f.c_str(), "MANIFEST") != nullptr ||
+        strstr(f.c_str(), "log") != nullptr) {
+      optimized.use_mmap_writes = false;
+      optimized.use_direct_writes = false;
+    }
+
+    Status s = target()->NewWritableFile(f, r, optimized);
+    if (s.ok()) {
+      if (strstr(f.c_str(), ".sst") != nullptr) {
+        r->reset(new SSTableFile(this, std::move(*r)));
+      } else if (strstr(f.c_str(), "MANIFEST") != nullptr) {
+        r->reset(new ManifestFile(this, std::move(*r)));
+      } else if (strstr(f.c_str(), "log") != nullptr) {
+        r->reset(new WalFile(this, std::move(*r)));
+      } else {
+        r->reset(new OtherFile(this, std::move(*r)));
+      }
+    }
+    return s;
+  }
+
+  Status NewRandomAccessFile(const std::string& f,
+                             std::unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& soptions) override {
+    class CountingFile : public RandomAccessFile {
+     public:
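+      // (Editorial note: wraps a RandomAccessFile and bumps the shared
+      // counters on every Read()/Prefetch(); used when count_random_reads_
+      // is set so tests can assert on read amplification.)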
+      CountingFile(std::unique_ptr<RandomAccessFile>&& target,
+                   anon::AtomicCounter* counter,
+                   std::atomic<size_t>* bytes_read)
+          : target_(std::move(target)),
+            counter_(counter),
+            bytes_read_(bytes_read) {}
+      virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                          char* scratch) const override {
+        counter_->Increment();
+        Status s = target_->Read(offset, n, result, scratch);
+        *bytes_read_ += result->size();
+        return s;
+      }
+
+      virtual Status Prefetch(uint64_t offset, size_t n) override {
+        Status s = target_->Prefetch(offset, n);
+        *bytes_read_ += n;
+        return s;
+      }
+
+     private:
+      std::unique_ptr<RandomAccessFile> target_;
+      anon::AtomicCounter* counter_;
+      std::atomic<size_t>* bytes_read_;
+    };
+
+    class RandomFailureFile : public RandomAccessFile {
+     public:
+      RandomFailureFile(std::unique_ptr<RandomAccessFile>&& target,
+                        std::atomic<uint64_t>* failure_cnt, uint32_t fail_odd)
+          : target_(std::move(target)),
+            fail_cnt_(failure_cnt),
+            fail_odd_(fail_odd) {}
+      virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                          char* scratch) const override {
+        if (Random::GetTLSInstance()->OneIn(fail_odd_)) {
+          fail_cnt_->fetch_add(1);
+          return Status::IOError("random error");
+        }
+        return target_->Read(offset, n, result, scratch);
+      }
+
+      virtual Status Prefetch(uint64_t offset, size_t n) override {
+        return target_->Prefetch(offset, n);
+      }
+
+     private:
+      std::unique_ptr<RandomAccessFile> target_;
+      std::atomic<uint64_t>* fail_cnt_;
+      uint32_t fail_odd_;
+    };
+
+    Status s = target()->NewRandomAccessFile(f, r, soptions);
+    random_file_open_counter_++;
+    if (s.ok()) {
+      if (count_random_reads_) {
+        r->reset(new CountingFile(std::move(*r), &random_read_counter_,
+                                  &random_read_bytes_counter_));
+      } else if (rand_reads_fail_odd_ > 0) {
+        r->reset(new RandomFailureFile(std::move(*r), &num_reads_fails_,
+                                       rand_reads_fail_odd_));
+      }
+    }
+
+    if (s.ok() && soptions.compaction_readahead_size > 0) {
+      compaction_readahead_size_ = soptions.compaction_readahead_size;
+    }
+    return s;
+  }
+
+  virtual Status NewSequentialFile(const std::string& f,
+                                   std::unique_ptr<SequentialFile>* r,
+                                   const EnvOptions& soptions) override {
+    class CountingFile : public SequentialFile {
+     public:
+      CountingFile(std::unique_ptr<SequentialFile>&& target,
+                   anon::AtomicCounter* counter)
+          : target_(std::move(target)), counter_(counter) {}
+      virtual Status Read(size_t n, Slice* result, char* scratch) override {
+        counter_->Increment();
+        return target_->Read(n, result, scratch);
+      }
+      virtual Status Skip(uint64_t n) override { return target_->Skip(n); }
+
+     private:
+      std::unique_ptr<SequentialFile> target_;
+      anon::AtomicCounter* counter_;
+    };
+
+    Status s = target()->NewSequentialFile(f, r, soptions);
+    if (s.ok() && count_sequential_reads_) {
+      r->reset(new CountingFile(std::move(*r), &sequential_read_counter_));
+    }
+    return s;
+  }
+
+  virtual void SleepForMicroseconds(int micros) override {
+    sleep_counter_.Increment();
+    if (no_slowdown_ || time_elapse_only_sleep_) {
+      addon_microseconds_.fetch_add(micros);
+    }
+    if (!no_slowdown_) {
+      target()->SleepForMicroseconds(micros);
+    }
+  }
+
+  void MockSleepForMicroseconds(int64_t micros) {
+    sleep_counter_.Increment();
+    assert(no_slowdown_);
+    addon_microseconds_.fetch_add(micros);
+  }
+
+  void MockSleepForSeconds(int64_t seconds) {
+    sleep_counter_.Increment();
+    assert(no_slowdown_);
+    addon_microseconds_.fetch_add(seconds * 1000000);
+  }
+
+  virtual Status GetCurrentTime(int64_t* unix_time) override {
+    Status s;
+    if (time_elapse_only_sleep_) {
+      *unix_time = maybe_starting_time_;
+    } else {
+      s = target()->GetCurrentTime(unix_time);
+    }
+    if (s.ok()) {
+      // mock microseconds elapsed to seconds of time
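+      // (Editorial note: integer division, so sub-second remainders of
+      // the mocked clock are dropped here.)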
+      *unix_time += addon_microseconds_.load() / 1000000;
+    }
+    return s;
+  }
+
+  virtual uint64_t NowCPUNanos() override {
+    now_cpu_count_.fetch_add(1);
+    return target()->NowCPUNanos();
+  }
+
+  virtual uint64_t NowNanos() override {
+    return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) +
+           addon_microseconds_.load() * 1000;
+  }
+
+  virtual uint64_t NowMicros() override {
+    return (time_elapse_only_sleep_ ? 0 : target()->NowMicros()) +
+           addon_microseconds_.load();
+  }
+
+  virtual Status DeleteFile(const std::string& fname) override {
+    delete_count_.fetch_add(1);
+    return target()->DeleteFile(fname);
+  }
+
+  void SetMockSleep(bool enabled = true) { no_slowdown_ = enabled; }
+
+  Status NewDirectory(const std::string& name,
+                      std::unique_ptr<Directory>* result) override {
+    if (!skip_fsync_) {
+      return target()->NewDirectory(name, result);
+    } else {
+      class NoopDirectory : public Directory {
+       public:
+        NoopDirectory() {}
+        ~NoopDirectory() {}
+
+        Status Fsync() override { return Status::OK(); }
+        Status Close() override { return Status::OK(); }
+      };
+
+      result->reset(new NoopDirectory());
+      return Status::OK();
+    }
+  }
+
+  Status RenameFile(const std::string& src, const std::string& dest) override {
+    rename_count_.fetch_add(1);
+    if (rename_error_.load(std::memory_order_acquire)) {
+      return Status::NotSupported("Simulated `RenameFile()` error.");
+    }
+    return target()->RenameFile(src, dest);
+  }
+
+  // Something to return when mocking current time
+  const int64_t maybe_starting_time_;
+
+  Random rnd_;
+  port::Mutex rnd_mutex_;  // Lock to protect rnd_
+
+  // sstable Sync() calls are blocked while this flag is set.
+  std::atomic<bool> delay_sstable_sync_;
+
+  // Drop writes on the floor while this flag is set.
+  std::atomic<bool> drop_writes_;
+
+  // Simulate no-space errors while this flag is set.
+  std::atomic<bool> no_space_;
+
+  // Simulate a non-writable file system while this flag is set
+  std::atomic<bool> non_writable_;
+
+  // Force sync of manifest files to fail while this flag is set
+  std::atomic<bool> manifest_sync_error_;
+
+  // Force write to manifest files to fail while this flag is set
+  std::atomic<bool> manifest_write_error_;
+
+  // Force write to log files to fail while this flag is set
+  std::atomic<bool> log_write_error_;
+
+  // Force `RenameFile()` to fail while this flag is set
+  std::atomic<bool> rename_error_{false};
+
+  // Slow down every log write, in micro-seconds.
+  std::atomic<int> log_write_slowdown_;
+
+  // If true, returns Status::NotSupported for file overwrite.
+  std::atomic<bool> no_file_overwrite_;
+
+  // Number of WAL files that are still open for write.
+  std::atomic<int> num_open_wal_file_;
+
+  bool count_random_reads_;
+  uint32_t rand_reads_fail_odd_ = 0;
+  std::atomic<uint64_t> num_reads_fails_;
+  anon::AtomicCounter random_read_counter_;
+  std::atomic<size_t> random_read_bytes_counter_;
+  std::atomic<int32_t> random_file_open_counter_;
+
+  bool count_sequential_reads_;
+  anon::AtomicCounter sequential_read_counter_;
+
+  anon::AtomicCounter sleep_counter_;
+
+  std::atomic<int64_t> bytes_written_;
+
+  std::atomic<int> sync_counter_;
+
+  // If true, all fsync to files and directories are skipped.
+  bool skip_fsync_ = false;
+
+  // If true, ingest the corruption to file during sync.
+  bool corrupt_in_sync_ = false;
+
+  std::atomic<uint32_t> non_writeable_rate_;
+
+  std::atomic<uint32_t> new_writable_count_;
+
+  std::atomic<uint32_t> non_writable_count_;
+
+  std::function<void()>* table_write_callback_;
+
+  std::atomic<int64_t> now_cpu_count_;
+
+  std::atomic<int> delete_count_;
+
+  std::atomic<int> rename_count_{0};
+
+  std::atomic<bool> is_wal_sync_thread_safe_{true};
+
+  std::atomic<size_t> compaction_readahead_size_{};
+
+ private:  // accessing these directly is prone to error
+  friend class DBTestBase;
+
+  std::atomic<int64_t> addon_microseconds_{0};
+
+  // Do not modify in the env of a running DB (could cause deadlock)
+  std::atomic<bool> time_elapse_only_sleep_;
+
+  bool no_slowdown_;
+};
+
+class FileTemperatureTestFS : public FileSystemWrapper {
+ public:
+  explicit FileTemperatureTestFS(const std::shared_ptr<FileSystem>& fs)
+      : FileSystemWrapper(fs) {}
+
+  static const char* kClassName() { return "FileTemperatureTestFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewSequentialFile(const std::string& fname, const FileOptions& opts,
+                             std::unique_ptr<FSSequentialFile>* result,
+                             IODebugContext* dbg) override {
+    IOStatus s = target()->NewSequentialFile(fname, opts, result, dbg);
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(GetFileName(fname), &number, &type) &&
+        type == kTableFile) {
+      MutexLock lock(&mu_);
+      requested_sst_file_temperatures_.emplace_back(number, opts.temperature);
+      if (s.ok()) {
+        if (opts.temperature != Temperature::kUnknown) {
+          // Be extra picky and don't open if a wrong non-unknown temperature is
+          // provided
+          auto e = current_sst_file_temperatures_.find(number);
+          if (e != current_sst_file_temperatures_.end() &&
+              e->second != opts.temperature) {
+            result->reset();
+            return IOStatus::PathNotFound("Temperature mismatch on " + fname);
+          }
+        }
+        *result = WrapWithTemperature<FSSequentialFileOwnerWrapper>(
+            number, std::move(*result));
+      }
+    }
+    return s;
+  }
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    IOStatus s = target()->NewRandomAccessFile(fname, opts, result, dbg);
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(GetFileName(fname), &number, &type) &&
+        type == kTableFile) {
+      MutexLock lock(&mu_);
+      requested_sst_file_temperatures_.emplace_back(number, opts.temperature);
+      if (s.ok()) {
+        if (opts.temperature != Temperature::kUnknown) {
+          // Be extra picky and don't open if a wrong non-unknown temperature is
+          // provided
+          auto e = current_sst_file_temperatures_.find(number);
+          if (e != current_sst_file_temperatures_.end() &&
+              e->second != opts.temperature) {
+            result->reset();
+            return IOStatus::PathNotFound("Temperature mismatch on " + fname);
+          }
+        }
+        *result = WrapWithTemperature<FSRandomAccessFileOwnerWrapper>(
+            number, std::move(*result));
+      }
+    }
+    return s;
+  }
+
+  void PopRequestedSstFileTemperatures(
+      std::vector<std::pair<uint64_t, Temperature>>* out = nullptr) {
+    MutexLock lock(&mu_);
+    if (out) {
+      *out = std::move(requested_sst_file_temperatures_);
+      assert(requested_sst_file_temperatures_.empty());
+    } else {
+      requested_sst_file_temperatures_.clear();
+    }
+  }
+
+  IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(GetFileName(fname), &number, &type) &&
+        type == kTableFile) {
+      MutexLock lock(&mu_);
+      current_sst_file_temperatures_[number] = opts.temperature;
+    }
+    return target()->NewWritableFile(fname, opts, result, dbg);
+  }
+
+  void CopyCurrentSstFileTemperatures(std::map<uint64_t, Temperature>* out) {
+    MutexLock lock(&mu_);
+    *out = current_sst_file_temperatures_;
+  }
+
+  void OverrideSstFileTemperature(uint64_t number, Temperature temp) {
+    MutexLock lock(&mu_);
+    current_sst_file_temperatures_[number] = temp;
+  }
+
+ protected:
+  port::Mutex mu_;
+  std::vector<std::pair<uint64_t, Temperature>>
+      requested_sst_file_temperatures_;
+  std::map<uint64_t, Temperature> current_sst_file_temperatures_;
+
+  std::string GetFileName(const std::string& fname) {
+    auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
+    // Workaround, for Windows only: the file path could contain both the
+    // Windows FilePathSeparator and '/'
+    filename = filename.substr(filename.find_last_of('/') + 1);
+    return filename;
+  }
+
+  template <class FileOwnerWrapperT, /*inferred*/ class FileT>
+  std::unique_ptr<FileT> WrapWithTemperature(uint64_t number,
+                                             std::unique_ptr<FileT>&& t) {
+    class FileWithTemp : public FileOwnerWrapperT {
+     public:
+      FileWithTemp(FileTemperatureTestFS* fs, uint64_t number,
+                   std::unique_ptr<FileT>&& t)
+          : FileOwnerWrapperT(std::move(t)), fs_(fs), number_(number) {}
+
+      Temperature GetTemperature() const override {
+        MutexLock lock(&fs_->mu_);
+        return fs_->current_sst_file_temperatures_[number_];
+      }
+
+     private:
+      FileTemperatureTestFS* fs_;
+      uint64_t number_;
+    };
+    return std::make_unique<FileWithTemp>(this, number, std::move(t));
+  }
+};
+
+class OnFileDeletionListener : public EventListener {
+ public:
+  OnFileDeletionListener() : matched_count_(0), expected_file_name_("") {}
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "OnFileDeletionListener"; }
+
+  void SetExpectedFileName(const std::string file_name) {
+    expected_file_name_ = file_name;
+  }
+
+  void VerifyMatchedCount(size_t expected_value) {
+    ASSERT_EQ(matched_count_, expected_value);
+  }
+
+  void OnTableFileDeleted(const TableFileDeletionInfo& info) override {
+    if (expected_file_name_ != "") {
+      ASSERT_EQ(expected_file_name_, info.file_path);
+      expected_file_name_ = "";
+      matched_count_++;
+    }
+  }
+
+ private:
+  size_t matched_count_;
+  std::string expected_file_name_;
+};
+
+class FlushCounterListener : public EventListener {
+ public:
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "FlushCounterListener"; }
+  std::atomic<int> count{0};
+  std::atomic<FlushReason> expected_flush_reason{FlushReason::kOthers};
+
+  void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override {
+    count++;
+    ASSERT_EQ(expected_flush_reason.load(), flush_job_info.flush_reason);
+  }
+};
+
+// A test merge operator that mimics Put but fails if one of the merge
+// operands is "corrupted", "corrupted_try_merge", or "corrupted_must_merge".
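+// A hedged usage sketch (illustrative only; assumes a DBTestBase-style
+// fixture providing CurrentOptions()/Reopen() and a db_ handle):
+//   Options options = CurrentOptions();
+//   options.merge_operator = std::make_shared<TestPutOperator>();
+//   Reopen(options);
+//   ASSERT_OK(db_->Merge(WriteOptions(), "k", "v1"));  // behaves like Put
+//   ASSERT_OK(db_->Merge(WriteOptions(), "k", "v2"));  // last operand wins
+//   // Writing the literal operand "corrupted" makes the merge (and thus
+//   // subsequent reads of "k") fail with OpFailureScope::kDefault.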
+class TestPutOperator : public MergeOperator { + public: + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + static const std::map + bad_operand_to_op_failure_scope = { + {"corrupted", MergeOperator::OpFailureScope::kDefault}, + {"corrupted_try_merge", MergeOperator::OpFailureScope::kTryMerge}, + {"corrupted_must_merge", + MergeOperator::OpFailureScope::kMustMerge}}; + auto check_operand = + [](Slice operand_val, + MergeOperator::OpFailureScope* op_failure_scope) -> bool { + auto iter = bad_operand_to_op_failure_scope.find(operand_val.ToString()); + if (iter != bad_operand_to_op_failure_scope.end()) { + *op_failure_scope = iter->second; + return false; + } + return true; + }; + if (merge_in.existing_value != nullptr && + !check_operand(*merge_in.existing_value, + &merge_out->op_failure_scope)) { + return false; + } + for (auto value : merge_in.operand_list) { + if (!check_operand(value, &merge_out->op_failure_scope)) { + return false; + } + } + merge_out->existing_operand = merge_in.operand_list.back(); + return true; + } + + virtual const char* Name() const override { return "TestPutOperator"; } +}; + +/* + * A cache wrapper that tracks certain CacheEntryRole's cache charge, its + * peaks and increments + * + * p0 + * / \ p1 + * / \ /\ + * / \/ \ + * a / b \ + * peaks = {p0, p1} + * increments = {p1-a, p2-b} + */ +template +class TargetCacheChargeTrackingCache : public CacheWrapper { + public: + explicit TargetCacheChargeTrackingCache(std::shared_ptr target); + + const char* Name() const override { return "TargetCacheChargeTrackingCache"; } + + Status Insert(const Slice& key, ObjectPtr value, + const CacheItemHelper* helper, size_t charge, + Handle** handle = nullptr, + Priority priority = Priority::LOW) override; + + using Cache::Release; + bool Release(Handle* handle, bool erase_if_last_ref = false) override; + + std::size_t GetCacheCharge() { return cur_cache_charge_; } + + std::deque GetChargedCachePeaks() { return cache_charge_peaks_; } + + std::size_t GetChargedCacheIncrementSum() { + return cache_charge_increments_sum_; + } + + private: + static const Cache::CacheItemHelper* kCrmHelper; + + std::size_t cur_cache_charge_; + std::size_t cache_charge_peak_; + std::size_t cache_charge_increment_; + bool last_peak_tracked_; + std::deque cache_charge_peaks_; + std::size_t cache_charge_increments_sum_; +}; + +class DBTestBase : public testing::Test { + public: + // Sequence of option configurations to try + enum OptionConfig : int { + kDefault = 0, + kBlockBasedTableWithPrefixHashIndex = 1, + kBlockBasedTableWithWholeKeyHashIndex = 2, + kPlainTableFirstBytePrefix = 3, + kPlainTableCappedPrefix = 4, + kPlainTableCappedPrefixNonMmap = 5, + kPlainTableAllBytesPrefix = 6, + kVectorRep = 7, + kHashLinkList = 8, + kMergePut = 9, + kFilter = 10, + kFullFilterWithNewTableReaderForCompactions = 11, + kUncompressed = 12, + kNumLevel_3 = 13, + kDBLogDir = 14, + kWalDirAndMmapReads = 15, + kManifestFileSize = 16, + kPerfOptions = 17, + kHashSkipList = 18, + kUniversalCompaction = 19, + kUniversalCompactionMultiLevel = 20, + kInfiniteMaxOpenFiles = 21, + kCRC32cChecksum = 22, + kFIFOCompaction = 23, + kOptimizeFiltersForHits = 24, + kRowCache = 25, + kRecycleLogFiles = 26, + kConcurrentSkipList = 27, + kPipelinedWrite = 28, + kConcurrentWALWrites = 29, + kDirectIO, + kLevelSubcompactions, + kBlockBasedTableWithIndexRestartInterval, + kBlockBasedTableWithPartitionedIndex, + kBlockBasedTableWithPartitionedIndexFormat4, + 
kBlockBasedTableWithLatestFormat, + kPartitionedFilterWithNewTableReaderForCompactions, + kUniversalSubcompactions, + kUnorderedWrite, + // This must be the last line + kEnd, + }; + + public: + std::string dbname_; + std::string alternative_wal_dir_; + std::string alternative_db_log_dir_; + MockEnv* mem_env_; + Env* encrypted_env_; + SpecialEnv* env_; + std::shared_ptr env_guard_; + DB* db_; + std::vector handles_; + + int option_config_; + Options last_options_; + + // Skip some options, as they may not be applicable to a specific test. + // To add more skip constants, use values 4, 8, 16, etc. + enum OptionSkip { + kNoSkip = 0, + kSkipDeletesFilterFirst = 1, + kSkipUniversalCompaction = 2, + kSkipMergePut = 4, + kSkipPlainTable = 8, + kSkipHashIndex = 16, + kSkipNoSeekToLast = 32, + kSkipFIFOCompaction = 128, + kSkipMmapReads = 256, + }; + + const int kRangeDelSkipConfigs = + // Plain tables do not support range deletions. + kSkipPlainTable | + // MmapReads disables the iterator pinning that RangeDelAggregator + // requires. + kSkipMmapReads; + + // `env_do_fsync` decides whether the special Env would do real + // fsync for files and directories. Skipping fsync can speed up + // tests, but won't cover the exact fsync logic. + DBTestBase(const std::string path, bool env_do_fsync); + + ~DBTestBase(); + + static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key%06d", i); + return std::string(buf); + } + + static bool ShouldSkipOptions(int option_config, int skip_mask = kNoSkip); + + // Switch to a fresh database with the next option configuration to + // test. Return false if there are no more configurations to test. + bool ChangeOptions(int skip_mask = kNoSkip); + + // Switch between different compaction styles. + bool ChangeCompactOptions(); + + // Switch between different WAL-realted options. + bool ChangeWalOptions(); + + // Switch between different filter policy + // Jump from kDefault to kFilter to kFullFilter + bool ChangeFilterOptions(); + + // Switch between different DB options for file ingestion tests. + bool ChangeOptionsForFileIngestionTest(); + + // Return the current option configuration. 
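// Editorial sketch, not part of the vendored file: tests usually sweep the
// OptionConfig sequence above by re-running their body until ChangeOptions()
// reports that no configuration is left, e.g.:
//
//   do {
//     Options options = CurrentOptions();
//     DestroyAndReopen(options);
//     ASSERT_OK(Put("foo", "v1"));
//     ASSERT_EQ("v1", Get("foo"));
//   } while (ChangeOptions());
//
// CurrentOptions(), declared next, materializes the Options for the active
// configuration: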
+  Options CurrentOptions(const anon::OptionsOverride& options_override =
+                             anon::OptionsOverride()) const;
+
+  Options CurrentOptions(const Options& default_options,
+                         const anon::OptionsOverride& options_override =
+                             anon::OptionsOverride()) const;
+
+  Options GetDefaultOptions() const;
+
+  Options GetOptions(int option_config) const {
+    return GetOptions(option_config, GetDefaultOptions());
+  }
+
+  Options GetOptions(int option_config, const Options& default_options,
+                     const anon::OptionsOverride& options_override =
+                         anon::OptionsOverride()) const;
+
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+  void CreateColumnFamilies(const std::vector<std::string>& cfs,
+                            const Options& options);
+
+  void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                             const Options& options);
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const std::vector<Options>& options);
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const Options& options);
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const std::vector<Options>& options);
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const Options& options);
+
+  void Reopen(const Options& options);
+
+  void Close();
+
+  void DestroyAndReopen(const Options& options);
+
+  void Destroy(const Options& options, bool delete_cf_paths = false);
+
+  Status ReadOnlyReopen(const Options& options);
+
+  Status TryReopen(const Options& options);
+
+  bool IsDirectIOSupported();
+
+  bool IsMemoryMappedAccessSupported() const;
+
+  Status Flush(int cf = 0);
+
+  Status Flush(const std::vector<int>& cf_ids);
+
+  Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions());
+
+  Status Put(int cf, const Slice& k, const Slice& v,
+             WriteOptions wo = WriteOptions());
+
+  Status Merge(const Slice& k, const Slice& v,
+               WriteOptions wo = WriteOptions());
+
+  Status Merge(int cf, const Slice& k, const Slice& v,
+               WriteOptions wo = WriteOptions());
+
+  Status Delete(const std::string& k);
+
+  Status Delete(int cf, const std::string& k);
+
+  Status SingleDelete(const std::string& k);
+
+  Status SingleDelete(int cf, const std::string& k);
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr);
+
+  std::string Get(int cf, const std::string& k,
+                  const Snapshot* snapshot = nullptr);
+
+  Status Get(const std::string& k, PinnableSlice* v);
+
+  std::vector<std::string> MultiGet(std::vector<int> cfs,
+                                    const std::vector<std::string>& k,
+                                    const Snapshot* snapshot,
+                                    const bool batched,
+                                    const bool async = false);
+
+  std::vector<std::string> MultiGet(const std::vector<std::string>& k,
+                                    const Snapshot* snapshot = nullptr,
+                                    const bool async = false);
+
+  uint64_t GetNumSnapshots();
+
+  uint64_t GetTimeOldestSnapshots();
+
+  uint64_t GetSequenceOldestSnapshots();
+
+  // Return a string that contains all key,value pairs in order,
+  // formatted like "(k1->v1)(k2->v2)".
+  std::string Contents(int cf = 0);
+
+  std::string AllEntriesFor(const Slice& user_key, int cf = 0);
+
+  // Similar to AllEntriesFor but this function also covers reopen with fifo.
+  // Note that test cases with snapshots or entries in memtable should simply
+  // use AllEntriesFor instead as snapshots and entries in memtable will
+  // survive after db reopen.
+ void CheckAllEntriesWithFifoReopen(const std::string& expected_value, + const Slice& user_key, int cf, + const std::vector& cfs, + const Options& options); + + int NumSortedRuns(int cf = 0); + + uint64_t TotalSize(int cf = 0); + + uint64_t SizeAtLevel(int level); + + size_t TotalLiveFiles(int cf = 0); + + size_t TotalLiveFilesAtPath(int cf, const std::string& path); + + size_t CountLiveFiles(); + + int NumTableFilesAtLevel(int level, int cf = 0); + + double CompressionRatioAtLevel(int level, int cf = 0); + + int TotalTableFiles(int cf = 0, int levels = -1); + + std::vector GetBlobFileNumbers(); + + // Return spread of files per level + std::string FilesPerLevel(int cf = 0); + + size_t CountFiles(); + + Status CountFiles(size_t* count); + + Status Size(const Slice& start, const Slice& limit, uint64_t* size) { + return Size(start, limit, 0, size); + } + + Status Size(const Slice& start, const Slice& limit, int cf, uint64_t* size); + + void Compact(int cf, const Slice& start, const Slice& limit, + uint32_t target_path_id); + + void Compact(int cf, const Slice& start, const Slice& limit); + + void Compact(const Slice& start, const Slice& limit); + + // Do n memtable compactions, each of which produces an sstable + // covering the range [small,large]. + void MakeTables(int n, const std::string& small, const std::string& large, + int cf = 0); + + // Prevent pushing of new sstables into deeper levels by adding + // tables that cover a specified range to all levels. + void FillLevels(const std::string& smallest, const std::string& largest, + int cf); + + void MoveFilesToLevel(int level, int cf = 0); + + void DumpFileCounts(const char* label); + + std::string DumpSSTableList(); + + static void GetSstFiles(Env* env, std::string path, + std::vector* files); + + int GetSstFileCount(std::string path); + + // this will generate non-overlapping files since it keeps increasing key_idx + void GenerateNewFile(Random* rnd, int* key_idx, bool nowait = false); + + void GenerateNewFile(int fd, Random* rnd, int* key_idx, bool nowait = false); + + static const int kNumKeysByGenerateNewRandomFile; + static const int KNumKeysByGenerateNewFile = 100; + + void GenerateNewRandomFile(Random* rnd, bool nowait = false); + + std::string IterStatus(Iterator* iter); + + Options OptionsForLogIterTest(); + + std::string DummyString(size_t len, char c = 'a'); + + void VerifyIterLast(std::string expected_key, int cf = 0); + + // Used to test InplaceUpdate + + // If previous value is nullptr or delta is > than previous value, + // sets newValue with delta + // If previous value is not empty, + // updates previous value with 'b' string of previous value size - 1. 
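// Editorial sketch, not part of the vendored file: an
// Options::inplace_callback compatible with the helpers declared below has
// this shape (UpdateStatus and the parameter contract come from RocksDB's
// options headers):
//
//   UpdateStatus ExampleInplaceCallback(char* existing_value,
//                                       uint32_t* existing_value_size,
//                                       Slice delta_value,
//                                       std::string* merged_value) {
//     if (existing_value == nullptr ||
//         *existing_value_size < delta_value.size()) {
//       // Not enough room to update in place; hand back a new value.
//       merged_value->assign(delta_value.data(), delta_value.size());
//       return UpdateStatus::UPDATED;
//     }
//     memcpy(existing_value, delta_value.data(), delta_value.size());
//     *existing_value_size = static_cast<uint32_t>(delta_value.size());
//     return UpdateStatus::UPDATED_INPLACE;
//   }
//
// The static helpers below implement several such variants: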
+  static UpdateStatus updateInPlaceSmallerSize(char* prevValue,
+                                               uint32_t* prevSize, Slice delta,
+                                               std::string* newValue);
+
+  static UpdateStatus updateInPlaceSmallerVarintSize(char* prevValue,
+                                                     uint32_t* prevSize,
+                                                     Slice delta,
+                                                     std::string* newValue);
+
+  static UpdateStatus updateInPlaceLargerSize(char* prevValue,
+                                              uint32_t* prevSize, Slice delta,
+                                              std::string* newValue);
+
+  static UpdateStatus updateInPlaceNoAction(char* prevValue, uint32_t* prevSize,
+                                            Slice delta, std::string* newValue);
+
+  // Utility method to test InplaceUpdate
+  void validateNumberOfEntries(int numValues, int cf = 0);
+
+  void CopyFile(const std::string& source, const std::string& destination,
+                uint64_t size = 0);
+
+  Status GetAllDataFiles(const FileType file_type,
+                         std::unordered_map<std::string, uint64_t>* sst_files,
+                         uint64_t* total_size = nullptr);
+
+  std::vector<uint64_t> ListTableFiles(Env* env, const std::string& path);
+
+  void VerifyDBFromMap(
+      std::map<std::string, std::string> true_data,
+      size_t* total_reads_res = nullptr, bool tailing_iter = false,
+      std::map<std::string, Status> status = std::map<std::string, Status>());
+
+  void VerifyDBInternal(
+      std::vector<std::pair<std::string, std::string>> true_data);
+
+  uint64_t GetNumberOfSstFilesForColumnFamily(DB* db,
+                                              std::string column_family_name);
+
+  uint64_t GetSstSizeHelper(Temperature temperature);
+
+  uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) {
+    return options.statistics->getTickerCount(ticker_type);
+  }
+
+  uint64_t TestGetAndResetTickerCount(const Options& options,
+                                      Tickers ticker_type) {
+    return options.statistics->getAndResetTickerCount(ticker_type);
+  }
+  // Short name for TestGetAndResetTickerCount
+  uint64_t PopTicker(const Options& options, Tickers ticker_type) {
+    return options.statistics->getAndResetTickerCount(ticker_type);
+  }
+
+  // Note: reverting this setting within the same test run is not yet
+  // supported
+  void SetTimeElapseOnlySleepOnReopen(DBOptions* options);
+
+ private:  // Prone to error on direct use
+  void MaybeInstallTimeElapseOnlySleep(const DBOptions& options);
+
+  bool time_elapse_only_sleep_on_reopen_ = false;
+};
+
+// For verifying that all files generated by current version have SST
+// unique ids.
+void VerifySstUniqueIds(const TablePropertiesCollection& props);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_with_timestamp_test_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_with_timestamp_test_util.h
new file mode 100644
index 0000000000..8a0d8e4e31
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/db_with_timestamp_test_util.h
@@ -0,0 +1,126 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#pragma once + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { +class DBBasicTestWithTimestampBase : public DBTestBase { + public: + explicit DBBasicTestWithTimestampBase(const std::string& dbname) + : DBTestBase(dbname, /*env_do_fsync=*/true) {} + + protected: + static std::string Key1(uint64_t k); + + static std::string KeyWithPrefix(std::string prefix, uint64_t k); + + static std::vector ConvertStrToSlice( + std::vector& strings); + + class TestComparator : public Comparator { + private: + const Comparator* cmp_without_ts_; + + public: + explicit TestComparator(size_t ts_sz) + : Comparator(ts_sz), cmp_without_ts_(nullptr) { + cmp_without_ts_ = BytewiseComparator(); + } + + const char* Name() const override { return "TestComparator"; } + + void FindShortSuccessor(std::string*) const override {} + + void FindShortestSeparator(std::string*, const Slice&) const override {} + + int Compare(const Slice& a, const Slice& b) const override { + int r = CompareWithoutTimestamp(a, b); + if (r != 0 || 0 == timestamp_size()) { + return r; + } + return -CompareTimestamp( + Slice(a.data() + a.size() - timestamp_size(), timestamp_size()), + Slice(b.data() + b.size() - timestamp_size(), timestamp_size())); + } + + using Comparator::CompareWithoutTimestamp; + int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b, + bool b_has_ts) const override { + if (a_has_ts) { + assert(a.size() >= timestamp_size()); + } + if (b_has_ts) { + assert(b.size() >= timestamp_size()); + } + Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, timestamp_size()) : a; + Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, timestamp_size()) : b; + return cmp_without_ts_->Compare(lhs, rhs); + } + + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { + if (!ts1.data() && !ts2.data()) { + return 0; + } else if (ts1.data() && !ts2.data()) { + return 1; + } else if (!ts1.data() && ts2.data()) { + return -1; + } + assert(ts1.size() == ts2.size()); + uint64_t low1 = 0; + uint64_t low2 = 0; + uint64_t high1 = 0; + uint64_t high2 = 0; + const size_t kSize = ts1.size(); + std::unique_ptr ts1_buf(new char[kSize]); + memcpy(ts1_buf.get(), ts1.data(), ts1.size()); + std::unique_ptr ts2_buf(new char[kSize]); + memcpy(ts2_buf.get(), ts2.data(), ts2.size()); + Slice ts1_copy = Slice(ts1_buf.get(), kSize); + Slice ts2_copy = Slice(ts2_buf.get(), kSize); + auto* ptr1 = const_cast(&ts1_copy); + auto* ptr2 = const_cast(&ts2_copy); + if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) || + !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) { + assert(false); + } + if (high1 < high2) { + return -1; + } else if (high1 > high2) { + return 1; + } + if (low1 < low2) { + return -1; + } else if (low1 > low2) { + return 1; + } + return 0; + } + }; + + std::string Timestamp(uint64_t low, uint64_t high); + + void CheckIterUserEntry(const Iterator* it, const Slice& expected_key, + ValueType expected_value_type, + const Slice& expected_value, + const Slice& expected_ts) const; + + void CheckIterEntry(const Iterator* it, const Slice& expected_ukey, + SequenceNumber expected_seq, ValueType expected_val_type, + const Slice& expected_value, + const Slice& expected_ts) const; + + void CheckIterEntry(const Iterator* it, const Slice& expected_ukey, + ValueType expected_val_type, const Slice& expected_value, + const Slice& expected_ts) const; +}; +} // namespace ROCKSDB_NAMESPACE diff --git 
a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/dbformat.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/dbformat.cc new file mode 100644 index 0000000000..2c3581ca00 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/dbformat.cc @@ -0,0 +1,213 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "db/dbformat.h" + +#include + +#include + +#include "db/lookup_key.h" +#include "monitoring/perf_context_imp.h" +#include "port/port.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// kValueTypeForSeek defines the ValueType that should be passed when +// constructing a ParsedInternalKey object for seeking to a particular +// sequence number (since we sort sequence numbers in decreasing order +// and the value type is embedded as the low 8 bits in the sequence +// number in internal keys, we need to use the highest-numbered +// ValueType, not the lowest). +const ValueType kValueTypeForSeek = kTypeWideColumnEntity; +const ValueType kValueTypeForSeekForPrev = kTypeDeletion; +const std::string kDisableUserTimestamp(""); + +EntryType GetEntryType(ValueType value_type) { + switch (value_type) { + case kTypeValue: + return kEntryPut; + case kTypeDeletion: + return kEntryDelete; + case kTypeDeletionWithTimestamp: + return kEntryDeleteWithTimestamp; + case kTypeSingleDeletion: + return kEntrySingleDelete; + case kTypeMerge: + return kEntryMerge; + case kTypeRangeDeletion: + return kEntryRangeDeletion; + case kTypeBlobIndex: + return kEntryBlobIndex; + case kTypeWideColumnEntity: + return kEntryWideColumnEntity; + default: + return kEntryOther; + } +} + +void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { + result->append(key.user_key.data(), key.user_key.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + +void AppendInternalKeyWithDifferentTimestamp(std::string* result, + const ParsedInternalKey& key, + const Slice& ts) { + assert(key.user_key.size() >= ts.size()); + result->append(key.user_key.data(), key.user_key.size() - ts.size()); + result->append(ts.data(), ts.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + +void AppendInternalKeyFooter(std::string* result, SequenceNumber s, + ValueType t) { + PutFixed64(result, PackSequenceAndType(s, t)); +} + +void AppendKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + const std::string kTsMin(ts_sz, static_cast(0)); + result->append(key.data(), key.size()); + result->append(kTsMin.data(), ts_sz); +} + +void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + const std::string kTsMax(ts_sz, static_cast(0xff)); + result->append(key.data(), key.size()); + result->append(kTsMax.data(), ts_sz); +} + +void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + result->append(key.data(), key.size() - ts_sz); + + static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + if (ts_sz < strlen(kTsMax)) { + 
result->append(kTsMax, ts_sz); + } else { + result->append(std::string(ts_sz, '\xff')); + } +} + +std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const { + std::string result = "'"; + if (log_err_key) { + result += user_key.ToString(hex); + } else { + result += ""; + } + + char buf[50]; + snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence, + static_cast(type)); + + result += buf; + return result; +} + +std::string InternalKey::DebugString(bool hex) const { + std::string result; + ParsedInternalKey parsed; + if (ParseInternalKey(rep_, &parsed, false /* log_err_key */).ok()) { + result = parsed.DebugString(true /* log_err_key */, hex); // TODO + } else { + result = "(bad)"; + result.append(EscapeString(rep_)); + } + return result; +} + +int InternalKeyComparator::Compare(const ParsedInternalKey& a, + const ParsedInternalKey& b) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_.Compare(a.user_key, b.user_key); + if (r == 0) { + if (a.sequence > b.sequence) { + r = -1; + } else if (a.sequence < b.sequence) { + r = +1; + } else if (a.type > b.type) { + r = -1; + } else if (a.type < b.type) { + r = +1; + } + } + return r; +} + +int InternalKeyComparator::Compare(const Slice& a, + const ParsedInternalKey& b) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_.Compare(ExtractUserKey(a), b.user_key); + if (r == 0) { + const uint64_t anum = + DecodeFixed64(a.data() + a.size() - kNumInternalBytes); + const uint64_t bnum = (b.sequence << 8) | b.type; + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +int InternalKeyComparator::Compare(const ParsedInternalKey& a, + const Slice& b) const { + return -Compare(b, a); +} + +LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s, + const Slice* ts) { + size_t usize = _user_key.size(); + size_t ts_sz = (nullptr == ts) ? 0 : ts->size(); + size_t needed = usize + ts_sz + 13; // A conservative estimate + char* dst; + if (needed <= sizeof(space_)) { + dst = space_; + } else { + dst = new char[needed]; + } + start_ = dst; + // NOTE: We don't support users keys of more than 2GB :) + dst = EncodeVarint32(dst, static_cast(usize + ts_sz + 8)); + kstart_ = dst; + memcpy(dst, _user_key.data(), usize); + dst += usize; + if (nullptr != ts) { + memcpy(dst, ts->data(), ts_sz); + dst += ts_sz; + } + EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek)); + dst += 8; + end_ = dst; +} + +void IterKey::EnlargeBuffer(size_t key_size) { + // If size is smaller than buffer size, continue using current buffer, + // or the static allocated one, as default + assert(key_size > buf_size_); + // Need to enlarge the buffer. + ResetBuffer(); + buf_ = new char[key_size]; + buf_size_ = key_size; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/dbformat.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/dbformat.h new file mode 100644 index 0000000000..3a6edc1bf0 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/dbformat.h @@ -0,0 +1,869 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdio.h>
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The file declares data structures and functions that deal with internal
+// keys.
+// Each internal key contains a user key, a sequence number (SequenceNumber)
+// and a type (ValueType), and they are usually encoded together.
+// There are some related helper classes here.
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+// The highest bit of the value type needs to be reserved to SST tables
+// for them to do more flexible encoding.
+enum ValueType : unsigned char {
+  kTypeDeletion = 0x0,
+  kTypeValue = 0x1,
+  kTypeMerge = 0x2,
+  kTypeLogData = 0x3,               // WAL only.
+  kTypeColumnFamilyDeletion = 0x4,  // WAL only.
+  kTypeColumnFamilyValue = 0x5,     // WAL only.
+  kTypeColumnFamilyMerge = 0x6,     // WAL only.
+  kTypeSingleDeletion = 0x7,
+  kTypeColumnFamilySingleDeletion = 0x8,  // WAL only.
+  kTypeBeginPrepareXID = 0x9,             // WAL only.
+  kTypeEndPrepareXID = 0xA,               // WAL only.
+  kTypeCommitXID = 0xB,                   // WAL only.
+  kTypeRollbackXID = 0xC,                 // WAL only.
+  kTypeNoop = 0xD,                        // WAL only.
+  kTypeColumnFamilyRangeDeletion = 0xE,   // WAL only.
+  kTypeRangeDeletion = 0xF,               // meta block
+  kTypeColumnFamilyBlobIndex = 0x10,      // Blob DB only
+  kTypeBlobIndex = 0x11,                  // Blob DB only
+  // When the prepared record is also persisted in db, we use a different
+  // record. This is to ensure that the WAL that is generated by a WritePolicy
+  // is not mistakenly read by another, which would result into data
+  // inconsistency.
+  kTypeBeginPersistedPrepareXID = 0x12,  // WAL only.
+  // Similar to kTypeBeginPersistedPrepareXID, this is to ensure that WAL
+  // generated by WriteUnprepared write policy is not mistakenly read by
+  // another.
+  kTypeBeginUnprepareXID = 0x13,  // WAL only.
+  kTypeDeletionWithTimestamp = 0x14,
+  kTypeCommitXIDAndTimestamp = 0x15,  // WAL only
+  kTypeWideColumnEntity = 0x16,
+  kTypeColumnFamilyWideColumnEntity = 0x17,  // WAL only
+  kTypeMaxValid,    // Should be after the last valid type, only used for
+                    // validation
+  kMaxValue = 0x7F  // Not used for storing records.
+};
+
+// Defined in dbformat.cc
+extern const ValueType kValueTypeForSeek;
+extern const ValueType kValueTypeForSeekForPrev;
+
+// Checks whether a type is an inline value type
+// (i.e. a type used in memtable skiplist and sst file datablock).
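// Editorial note, not part of the vendored file: concretely, the inline set
// accepted by IsValueType() below is kTypeDeletion, kTypeValue, kTypeMerge,
// kTypeSingleDeletion, kTypeBlobIndex, kTypeDeletionWithTimestamp and
// kTypeWideColumnEntity. WAL-only record types (kTypeNoop, the XID markers,
// the per-column-family variants) never appear in memtables or data blocks:
//
//   // IsValueType(kTypeValue)         == true
//   // IsValueType(kTypeBlobIndex)     == true
//   // IsValueType(kTypeNoop)          == false  (WAL only)
//   // IsValueType(kTypeRangeDeletion) == false  (meta block; covered by
//   //                                    IsExtendedValueType() below)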
+inline bool IsValueType(ValueType t) { + return t <= kTypeMerge || kTypeSingleDeletion == t || kTypeBlobIndex == t || + kTypeDeletionWithTimestamp == t || kTypeWideColumnEntity == t; +} + +// Checks whether a type is from user operation +// kTypeRangeDeletion is in meta block so this API is separated from above +// kTypeMaxValid can be from keys generated by +// TruncatedRangeDelIterator::start_key() +inline bool IsExtendedValueType(ValueType t) { + return IsValueType(t) || t == kTypeRangeDeletion || t == kTypeMaxValid; +} + +// We leave eight bits empty at the bottom so a type and sequence# +// can be packed together into 64-bits. +static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1); + +static const SequenceNumber kDisableGlobalSequenceNumber = + std::numeric_limits::max(); + +constexpr uint64_t kNumInternalBytes = 8; + +// Defined in dbformat.cc +extern const std::string kDisableUserTimestamp; + +// The data structure that represents an internal key in the way that user_key, +// sequence number and type are stored in separated forms. +struct ParsedInternalKey { + Slice user_key; + SequenceNumber sequence; + ValueType type; + + ParsedInternalKey() + : sequence(kMaxSequenceNumber), + type(kTypeDeletion) // Make code analyzer happy + {} // Intentionally left uninitialized (for speed) + // u contains timestamp if user timestamp feature is enabled. + ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) + : user_key(u), sequence(seq), type(t) {} + std::string DebugString(bool log_err_key, bool hex) const; + + void clear() { + user_key.clear(); + sequence = 0; + type = kTypeDeletion; + } + + void SetTimestamp(const Slice& ts) { + assert(ts.size() <= user_key.size()); + const char* addr = user_key.data() + user_key.size() - ts.size(); + memcpy(const_cast(addr), ts.data(), ts.size()); + } + + Slice GetTimestamp(size_t ts_sz) { + assert(ts_sz <= user_key.size()); + const char* addr = user_key.data() + user_key.size() - ts_sz; + return Slice(const_cast(addr), ts_sz); + } +}; + +// Return the length of the encoding of "key". +inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { + return key.user_key.size() + kNumInternalBytes; +} + +// Pack a sequence number and a ValueType into a uint64_t +inline uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { + assert(seq <= kMaxSequenceNumber); + // kTypeMaxValid is used in TruncatedRangeDelIterator, see its constructor. + assert(IsExtendedValueType(t) || t == kTypeMaxValid); + return (seq << 8) | t; +} + +// Given the result of PackSequenceAndType, store the sequence number in *seq +// and the ValueType in *t. +inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, + ValueType* t) { + *seq = packed >> 8; + *t = static_cast(packed & 0xff); + + // Commented the following two assertions in order to test key-value checksum + // on corrupted keys without crashing ("DbKvChecksumTest"). + // assert(*seq <= kMaxSequenceNumber); + // assert(IsExtendedValueType(*t)); +} + +EntryType GetEntryType(ValueType value_type); + +// Append the serialization of "key" to *result. +extern void AppendInternalKey(std::string* result, + const ParsedInternalKey& key); + +// Append the serialization of "key" to *result, replacing the original +// timestamp with argument ts. +extern void AppendInternalKeyWithDifferentTimestamp( + std::string* result, const ParsedInternalKey& key, const Slice& ts); + +// Serialized internal key consists of user key followed by footer. 
+// This function appends the footer to *result, assuming that *result already +// contains the user key at the end. +extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s, + ValueType t); + +// Append the key and a minimal timestamp to *result +extern void AppendKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + +// Append the key and a maximal timestamp to *result +extern void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + +// `key` is a user key with timestamp. Append the user key without timestamp +// and the maximal timestamp to *result. +extern void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + +// Attempt to parse an internal key from "internal_key". On success, +// stores the parsed data in "*result", and returns true. +// +// On error, returns false, leaves "*result" in an undefined state. +extern Status ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result, bool log_err_key); + +// Returns the user key portion of an internal key. +inline Slice ExtractUserKey(const Slice& internal_key) { + assert(internal_key.size() >= kNumInternalBytes); + return Slice(internal_key.data(), internal_key.size() - kNumInternalBytes); +} + +inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key, + size_t ts_sz) { + Slice ret = internal_key; + ret.remove_suffix(kNumInternalBytes + ts_sz); + return ret; +} + +inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { + Slice ret = user_key; + ret.remove_suffix(ts_sz); + return ret; +} + +inline Slice ExtractTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { + assert(user_key.size() >= ts_sz); + return Slice(user_key.data() + user_key.size() - ts_sz, ts_sz); +} + +inline Slice ExtractTimestampFromKey(const Slice& internal_key, size_t ts_sz) { + const size_t key_size = internal_key.size(); + assert(key_size >= kNumInternalBytes + ts_sz); + return Slice(internal_key.data() + key_size - ts_sz - kNumInternalBytes, + ts_sz); +} + +inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) { + assert(internal_key.size() >= kNumInternalBytes); + const size_t n = internal_key.size(); + return DecodeFixed64(internal_key.data() + n - kNumInternalBytes); +} + +inline ValueType ExtractValueType(const Slice& internal_key) { + uint64_t num = ExtractInternalKeyFooter(internal_key); + unsigned char c = num & 0xff; + return static_cast(c); +} + +// A comparator for internal keys that uses a specified comparator for +// the user key portion and breaks ties by decreasing sequence number. +class InternalKeyComparator +#ifdef NDEBUG + final +#endif + : public CompareInterface { + private: + UserComparatorWrapper user_comparator_; + + public: + // `InternalKeyComparator`s constructed with the default constructor are not + // usable and will segfault on any attempt to use them for comparisons. + InternalKeyComparator() = default; + + // @param named If true, assign a name to this comparator based on the + // underlying comparator's name. This involves an allocation and copy in + // this constructor to precompute the result of `Name()`. To avoid this + // overhead, set `named` to false. In that case, `Name()` will return a + // generic name that is non-specific to the underlying comparator. 
+ explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) {} + virtual ~InternalKeyComparator() {} + + int Compare(const Slice& a, const Slice& b) const override; + + bool Equal(const Slice& a, const Slice& b) const { + // TODO Use user_comparator_.Equal(). Perhaps compare seqno before + // comparing the user key too. + return Compare(a, b) == 0; + } + + // Same as Compare except that it excludes the value type from comparison + int CompareKeySeq(const Slice& a, const Slice& b) const; + + const Comparator* user_comparator() const { + return user_comparator_.user_comparator(); + } + + int Compare(const InternalKey& a, const InternalKey& b) const; + int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; + int Compare(const Slice& a, const ParsedInternalKey& b) const; + int Compare(const ParsedInternalKey& a, const Slice& b) const; + // In this `Compare()` overload, the sequence numbers provided in + // `a_global_seqno` and `b_global_seqno` override the sequence numbers in `a` + // and `b`, respectively. To disable sequence number override(s), provide the + // value `kDisableGlobalSequenceNumber`. + int Compare(const Slice& a, SequenceNumber a_global_seqno, const Slice& b, + SequenceNumber b_global_seqno) const; +}; + +// The class represent the internal key in encoded form. +class InternalKey { + private: + std::string rep_; + + public: + InternalKey() {} // Leave rep_ as empty to indicate it is invalid + InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) { + AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t)); + } + InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t, Slice ts) { + AppendInternalKeyWithDifferentTimestamp( + &rep_, ParsedInternalKey(_user_key, s, t), ts); + } + + // sets the internal key to be bigger or equal to all internal keys with this + // user key + void SetMaxPossibleForUserKey(const Slice& _user_key) { + AppendInternalKey( + &rep_, ParsedInternalKey(_user_key, 0, static_cast(0))); + } + + // sets the internal key to be smaller or equal to all internal keys with this + // user key + void SetMinPossibleForUserKey(const Slice& _user_key) { + AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber, + kValueTypeForSeek)); + } + + bool Valid() const { + ParsedInternalKey parsed; + return (ParseInternalKey(Slice(rep_), &parsed, false /* log_err_key */) + .ok()); // TODO + } + + void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } + Slice Encode() const { + assert(!rep_.empty()); + return rep_; + } + + Slice user_key() const { return ExtractUserKey(rep_); } + size_t size() const { return rep_.size(); } + + void Set(const Slice& _user_key, SequenceNumber s, ValueType t) { + SetFrom(ParsedInternalKey(_user_key, s, t)); + } + + void Set(const Slice& _user_key_with_ts, SequenceNumber s, ValueType t, + const Slice& ts) { + ParsedInternalKey pik = ParsedInternalKey(_user_key_with_ts, s, t); + // Should not call pik.SetTimestamp() directly as it overwrites the buffer + // containing _user_key. + SetFrom(pik, ts); + } + + void SetFrom(const ParsedInternalKey& p) { + rep_.clear(); + AppendInternalKey(&rep_, p); + } + + void SetFrom(const ParsedInternalKey& p, const Slice& ts) { + rep_.clear(); + AppendInternalKeyWithDifferentTimestamp(&rep_, p, ts); + } + + void Clear() { rep_.clear(); } + + // The underlying representation. + // Intended only to be used together with ConvertFromUserKey(). 
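// Editorial sketch, not part of the vendored file: rep() and
// ConvertFromUserKey() let a caller build the user key directly inside the
// internal buffer and then stamp the 8-byte footer in place, saving a copy:
//
//   InternalKey ikey;
//   ikey.rep()->assign(user_key.data(), user_key.size());
//   ikey.ConvertFromUserKey(seq, kTypeValue);
//   // rep_ now holds user_key followed by PackSequenceAndType(seq, kTypeValue)
//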
+ std::string* rep() { return &rep_; } + + // Assuming that *rep() contains a user key, this method makes internal key + // out of it in-place. This saves a memcpy compared to Set()/SetFrom(). + void ConvertFromUserKey(SequenceNumber s, ValueType t) { + AppendInternalKeyFooter(&rep_, s, t); + } + + std::string DebugString(bool hex) const; +}; + +inline int InternalKeyComparator::Compare(const InternalKey& a, + const InternalKey& b) const { + return Compare(a.Encode(), b.Encode()); +} + +inline Status ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result, bool log_err_key) { + const size_t n = internal_key.size(); + + if (n < kNumInternalBytes) { + return Status::Corruption("Corrupted Key: Internal Key too small. Size=" + + std::to_string(n) + ". "); + } + + uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes); + unsigned char c = num & 0xff; + result->sequence = num >> 8; + result->type = static_cast(c); + assert(result->type <= ValueType::kMaxValue); + result->user_key = Slice(internal_key.data(), n - kNumInternalBytes); + + if (IsExtendedValueType(result->type)) { + return Status::OK(); + } else { + return Status::Corruption("Corrupted Key", + result->DebugString(log_err_key, true)); + } +} + +// Update the sequence number in the internal key. +// Guarantees not to invalidate ikey.data(). +inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) { + size_t ikey_sz = ikey->size(); + assert(ikey_sz >= kNumInternalBytes); + uint64_t newval = (seq << 8) | t; + + // Note: Since C++11, strings are guaranteed to be stored contiguously and + // string::operator[]() is guaranteed not to change ikey.data(). + EncodeFixed64(&(*ikey)[ikey_sz - kNumInternalBytes], newval); +} + +// Get the sequence number from the internal key +inline uint64_t GetInternalKeySeqno(const Slice& internal_key) { + const size_t n = internal_key.size(); + assert(n >= kNumInternalBytes); + uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes); + return num >> 8; +} + +// The class to store keys in an efficient way. It allows: +// 1. Users can either copy the key into it, or have it point to an unowned +// address. +// 2. For copied key, a short inline buffer is kept to reduce memory +// allocation for smaller keys. +// 3. It tracks user key or internal key, and allow conversion between them. +class IterKey { + public: + IterKey() + : buf_(space_), + key_(buf_), + key_size_(0), + buf_size_(sizeof(space_)), + is_user_key_(true) {} + // No copying allowed + IterKey(const IterKey&) = delete; + void operator=(const IterKey&) = delete; + + ~IterKey() { ResetBuffer(); } + + // The bool will be picked up by the next calls to SetKey + void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; } + + // Returns the key in whichever format that was provided to KeyIter + // If user-defined timestamp is enabled, then timestamp is included in the + // return result. 
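// Editorial sketch, not part of the vendored file: the copy/pin distinction
// works as follows.
//
//   IterKey k;
//   k.SetUserKey(Slice("abc"));                  // copied into buf_
//   assert(!k.IsKeyPinned());
//   k.SetUserKey(Slice("abc"), /*copy=*/false);  // points at caller memory
//   assert(k.IsKeyPinned());
//   k.OwnKey();                                  // copies the pinned key in
//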
+ Slice GetKey() const { return Slice(key_, key_size_); } + + Slice GetInternalKey() const { + assert(!IsUserKey()); + return Slice(key_, key_size_); + } + + // If user-defined timestamp is enabled, then timestamp is included in the + // return result of GetUserKey(); + Slice GetUserKey() const { + if (IsUserKey()) { + return Slice(key_, key_size_); + } else { + assert(key_size_ >= kNumInternalBytes); + return Slice(key_, key_size_ - kNumInternalBytes); + } + } + + size_t Size() const { return key_size_; } + + void Clear() { key_size_ = 0; } + + // Append "non_shared_data" to its back, from "shared_len" + // This function is used in Block::Iter::ParseNextKey + // shared_len: bytes in [0, shard_len-1] would be remained + // non_shared_data: data to be append, its length must be >= non_shared_len + void TrimAppend(const size_t shared_len, const char* non_shared_data, + const size_t non_shared_len) { + assert(shared_len <= key_size_); + size_t total_size = shared_len + non_shared_len; + + if (IsKeyPinned() /* key is not in buf_ */) { + // Copy the key from external memory to buf_ (copy shared_len bytes) + EnlargeBufferIfNeeded(total_size); + memcpy(buf_, key_, shared_len); + } else if (total_size > buf_size_) { + // Need to allocate space, delete previous space + char* p = new char[total_size]; + memcpy(p, key_, shared_len); + + if (buf_ != space_) { + delete[] buf_; + } + + buf_ = p; + buf_size_ = total_size; + } + + memcpy(buf_ + shared_len, non_shared_data, non_shared_len); + key_ = buf_; + key_size_ = total_size; + } + + Slice SetKey(const Slice& key, bool copy = true) { + // is_user_key_ expected to be set already via SetIsUserKey + return SetKeyImpl(key, copy); + } + + // If user-defined timestamp is enabled, then `key` includes timestamp. + // TODO(yanqin) this is also used to set prefix, which do not include + // timestamp. Should be handled. + Slice SetUserKey(const Slice& key, bool copy = true) { + is_user_key_ = true; + return SetKeyImpl(key, copy); + } + + Slice SetInternalKey(const Slice& key, bool copy = true) { + is_user_key_ = false; + return SetKeyImpl(key, copy); + } + + // Copies the content of key, updates the reference to the user key in ikey + // and returns a Slice referencing the new copy. + Slice SetInternalKey(const Slice& key, ParsedInternalKey* ikey) { + size_t key_n = key.size(); + assert(key_n >= kNumInternalBytes); + SetInternalKey(key); + ikey->user_key = Slice(key_, key_n - kNumInternalBytes); + return Slice(key_, key_n); + } + + // Copy the key into IterKey own buf_ + void OwnKey() { + assert(IsKeyPinned() == true); + + Reserve(key_size_); + memcpy(buf_, key_, key_size_); + key_ = buf_; + } + + // Update the sequence number in the internal key. Guarantees not to + // invalidate slices to the key (and the user key). + void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) { + assert(!IsKeyPinned()); + assert(key_size_ >= kNumInternalBytes); + if (ts) { + assert(key_size_ >= kNumInternalBytes + ts->size()); + memcpy(&buf_[key_size_ - kNumInternalBytes - ts->size()], ts->data(), + ts->size()); + } + uint64_t newval = (seq << 8) | t; + EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval); + } + + bool IsKeyPinned() const { return (key_ != buf_); } + + // If `ts` is provided, user_key should not contain timestamp, + // and `ts` is appended after user_key. + // TODO: more efficient storage for timestamp. 
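// Editorial note, not part of the vendored file: the buffer assembled by the
// overload below is laid out as
//
//   [key_prefix][user_key][ts][PackSequenceAndType(s, value_type)]
//
// with the packed sequence/type footer occupying the trailing 8 bytes
// (kNumInternalBytes).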
+ void SetInternalKey(const Slice& key_prefix, const Slice& user_key, + SequenceNumber s, + ValueType value_type = kValueTypeForSeek, + const Slice* ts = nullptr) { + size_t psize = key_prefix.size(); + size_t usize = user_key.size(); + size_t ts_sz = (ts != nullptr ? ts->size() : 0); + EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t) + ts_sz); + if (psize > 0) { + memcpy(buf_, key_prefix.data(), psize); + } + memcpy(buf_ + psize, user_key.data(), usize); + if (ts) { + memcpy(buf_ + psize + usize, ts->data(), ts_sz); + } + EncodeFixed64(buf_ + usize + psize + ts_sz, + PackSequenceAndType(s, value_type)); + + key_ = buf_; + key_size_ = psize + usize + sizeof(uint64_t) + ts_sz; + is_user_key_ = false; + } + + void SetInternalKey(const Slice& user_key, SequenceNumber s, + ValueType value_type = kValueTypeForSeek, + const Slice* ts = nullptr) { + SetInternalKey(Slice(), user_key, s, value_type, ts); + } + + void Reserve(size_t size) { + EnlargeBufferIfNeeded(size); + key_size_ = size; + } + + void SetInternalKey(const ParsedInternalKey& parsed_key) { + SetInternalKey(Slice(), parsed_key); + } + + void SetInternalKey(const Slice& key_prefix, + const ParsedInternalKey& parsed_key_suffix) { + SetInternalKey(key_prefix, parsed_key_suffix.user_key, + parsed_key_suffix.sequence, parsed_key_suffix.type); + } + + void EncodeLengthPrefixedKey(const Slice& key) { + auto size = key.size(); + EnlargeBufferIfNeeded(size + static_cast(VarintLength(size))); + char* ptr = EncodeVarint32(buf_, static_cast(size)); + memcpy(ptr, key.data(), size); + key_ = buf_; + is_user_key_ = true; + } + + bool IsUserKey() const { return is_user_key_; } + + private: + char* buf_; + const char* key_; + size_t key_size_; + size_t buf_size_; + char space_[32]; // Avoid allocation for short keys + bool is_user_key_; + + Slice SetKeyImpl(const Slice& key, bool copy) { + size_t size = key.size(); + if (copy) { + // Copy key to buf_ + EnlargeBufferIfNeeded(size); + memcpy(buf_, key.data(), size); + key_ = buf_; + } else { + // Update key_ to point to external memory + key_ = key.data(); + } + key_size_ = size; + return Slice(key_, key_size_); + } + + void ResetBuffer() { + if (buf_ != space_) { + delete[] buf_; + buf_ = space_; + } + buf_size_ = sizeof(space_); + key_size_ = 0; + } + + // Enlarge the buffer size if needed based on key_size. + // By default, static allocated buffer is used. Once there is a key + // larger than the static allocated buffer, another buffer is dynamically + // allocated, until a larger key buffer is requested. In that case, we + // reallocate buffer and delete the old one. + void EnlargeBufferIfNeeded(size_t key_size) { + // If size is smaller than buffer size, continue using current buffer, + // or the static allocated one, as default + if (key_size > buf_size_) { + EnlargeBuffer(key_size); + } + } + + void EnlargeBuffer(size_t key_size); +}; + +// Convert from a SliceTransform of user keys, to a SliceTransform of +// internal keys. 
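// Editorial sketch, not part of the vendored file: the adapter below lets a
// user-key prefix extractor be applied to internal keys by first stripping
// the footer, e.g.:
//
//   const SliceTransform* user_pe = NewFixedPrefixTransform(3);
//   InternalKeySliceTransform internal_pe(user_pe);
//   // internal_pe.Transform(ikey) == user_pe->Transform(ExtractUserKey(ikey))
//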
+class InternalKeySliceTransform : public SliceTransform { + public: + explicit InternalKeySliceTransform(const SliceTransform* transform) + : transform_(transform) {} + + virtual const char* Name() const override { return transform_->Name(); } + + virtual Slice Transform(const Slice& src) const override { + auto user_key = ExtractUserKey(src); + return transform_->Transform(user_key); + } + + virtual bool InDomain(const Slice& src) const override { + auto user_key = ExtractUserKey(src); + return transform_->InDomain(user_key); + } + + virtual bool InRange(const Slice& dst) const override { + auto user_key = ExtractUserKey(dst); + return transform_->InRange(user_key); + } + + const SliceTransform* user_prefix_extractor() const { return transform_; } + + private: + // Like comparator, InternalKeySliceTransform will not take care of the + // deletion of transform_ + const SliceTransform* const transform_; +}; + +// Read the key of a record from a write batch. +// if this record represent the default column family then cf_record +// must be passed as false, otherwise it must be passed as true. +extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, + bool cf_record); + +// Read record from a write batch piece from input. +// tag, column_family, key, value and blob are return values. Callers own the +// slice they point to. +// Tag is defined as ValueType. +// input will be advanced to after the record. +// If user-defined timestamp is enabled for a column family, then the `key` +// resulting from this call will include timestamp. +extern Status ReadRecordFromWriteBatch(Slice* input, char* tag, + uint32_t* column_family, Slice* key, + Slice* value, Slice* blob, Slice* xid); + +// When user call DeleteRange() to delete a range of keys, +// we will store a serialized RangeTombstone in MemTable and SST. +// the struct here is an easy-understood form +// start/end_key_ is the start/end user key of the range to be deleted +struct RangeTombstone { + Slice start_key_; + Slice end_key_; + SequenceNumber seq_; + // TODO: we should optimize the storage here when user-defined timestamp + // is NOT enabled: they currently take up (16 + 32 + 32) bytes per tombstone. + Slice ts_; + std::string pinned_start_key_; + std::string pinned_end_key_; + + RangeTombstone() = default; + RangeTombstone(Slice sk, Slice ek, SequenceNumber sn) + : start_key_(sk), end_key_(ek), seq_(sn) {} + + // User-defined timestamp is enabled, `sk` and `ek` should be user key + // with timestamp, `ts` will replace the timestamps in `sk` and + // `ek`. 
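// Editorial sketch, not part of the vendored file: given sk == "key1" + old_ts
// and ts == new_ts, the constructor below pins "key1" + new_ts as start_key_
// (and likewise for ek); the timestamps carried in sk/ek are discarded:
//
//   RangeTombstone t(start_ukey_with_ts, end_ukey_with_ts, /*sn=*/100,
//                    new_ts);  // variable names here are illustrative
//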
+ RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts) + : seq_(sn), ts_(ts) { + assert(!ts.empty()); + pinned_start_key_.reserve(sk.size()); + pinned_start_key_.append(sk.data(), sk.size() - ts.size()); + pinned_start_key_.append(ts.data(), ts.size()); + pinned_end_key_.reserve(ek.size()); + pinned_end_key_.append(ek.data(), ek.size() - ts.size()); + pinned_end_key_.append(ts.data(), ts.size()); + start_key_ = pinned_start_key_; + end_key_ = pinned_end_key_; + } + + RangeTombstone(ParsedInternalKey parsed_key, Slice value) { + start_key_ = parsed_key.user_key; + seq_ = parsed_key.sequence; + end_key_ = value; + } + + // be careful to use Serialize(), allocates new memory + std::pair Serialize() const { + auto key = InternalKey(start_key_, seq_, kTypeRangeDeletion); + return std::make_pair(std::move(key), end_key_); + } + + // be careful to use SerializeKey(), allocates new memory + InternalKey SerializeKey() const { + return InternalKey(start_key_, seq_, kTypeRangeDeletion); + } + + // The tombstone end-key is exclusive, so we generate an internal-key here + // which has a similar property. Using kMaxSequenceNumber guarantees that + // the returned internal-key will compare less than any other internal-key + // with the same user-key. This in turn guarantees that the serialized + // end-key for a tombstone such as [a-b] will compare less than the key "b". + // + // be careful to use SerializeEndKey(), allocates new memory + InternalKey SerializeEndKey() const { + if (!ts_.empty()) { + static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + if (ts_.size() <= strlen(kTsMax)) { + return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion, + Slice(kTsMax, ts_.size())); + } else { + return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion, + std::string(ts_.size(), '\xff')); + } + } + return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion); + } +}; + +inline int InternalKeyComparator::Compare(const Slice& akey, + const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + if (r == 0) { + const uint64_t anum = + DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes); + const uint64_t bnum = + DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes); + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +inline int InternalKeyComparator::CompareKeySeq(const Slice& akey, + const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + if (r == 0) { + // Shift the number to exclude the last byte which contains the value type + const uint64_t anum = + DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes) >> 8; + const uint64_t bnum = + DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes) >> 8; + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +inline int InternalKeyComparator::Compare(const Slice& a, + SequenceNumber a_global_seqno, + const Slice& b, + SequenceNumber b_global_seqno) const { + int r = user_comparator_.Compare(ExtractUserKey(a), ExtractUserKey(b)); + if (r == 0) { + uint64_t a_footer, b_footer; + if (a_global_seqno == 
kDisableGlobalSequenceNumber) { + a_footer = ExtractInternalKeyFooter(a); + } else { + a_footer = PackSequenceAndType(a_global_seqno, ExtractValueType(a)); + } + if (b_global_seqno == kDisableGlobalSequenceNumber) { + b_footer = ExtractInternalKeyFooter(b); + } else { + b_footer = PackSequenceAndType(b_global_seqno, ExtractValueType(b)); + } + if (a_footer > b_footer) { + r = -1; + } else if (a_footer < b_footer) { + r = +1; + } + } + return r; +} + +// Wrap InternalKeyComparator as a comparator class for ParsedInternalKey. +struct ParsedInternalKeyComparator { + explicit ParsedInternalKeyComparator(const InternalKeyComparator* c) + : cmp(c) {} + + bool operator()(const ParsedInternalKey& a, + const ParsedInternalKey& b) const { + return cmp->Compare(a, b) < 0; + } + + const InternalKeyComparator* cmp; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/error_handler.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/error_handler.cc new file mode 100644 index 0000000000..98c3e82d5f --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/error_handler.cc @@ -0,0 +1,792 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "db/error_handler.h" + +#include "db/db_impl/db_impl.h" +#include "db/event_helpers.h" +#include "file/sst_file_manager_impl.h" +#include "logging/logging.h" +#include "port/lang.h" + +namespace ROCKSDB_NAMESPACE { + +// Maps to help decide the severity of an error based on the +// BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks +// is set or not. There are 3 maps, going from most specific to least specific +// (i.e from all 4 fields in a tuple to only the BackgroundErrorReason and +// paranoid_checks). The less specific map serves as a catch all in case we miss +// a specific error code or subcode. 
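// Editorial sketch, not part of the vendored file: the intended lookup
// cascade over the three maps is roughly
//
//   auto it = ErrorSeverityMap.find(
//       std::make_tuple(reason, code, subcode, paranoid_checks));
//   // if absent, retry DefaultErrorSeverityMap keyed on (reason, code,
//   // paranoid_checks); if still absent, fall back to DefaultReasonMap
//   // keyed on (reason, paranoid_checks) alone.
//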
+std::map, + Status::Severity> + ErrorSeverityMap = { + // Errors during BG compaction + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kSoftError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, Status::SubCode::kSpaceLimit, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, + // Errors during BG flush + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + Status::SubCode::kNoSpace, true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + Status::SubCode::kNoSpace, false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + Status::SubCode::kSpaceLimit, true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + Status::SubCode::kIOFenced, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + Status::SubCode::kIOFenced, false), + Status::Severity::kFatalError}, + // Errors during Write + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, + // Errors during MANIFEST write + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, + // Errors during BG flush with WAL disabled + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kSpaceLimit, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + 
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, Status::Code::kIOError,
+                         Status::SubCode::kIOFenced, false), Status::Severity::kFatalError},
+        // Errors during MANIFEST write when WAL is disabled
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, Status::Code::kIOError,
+                         Status::SubCode::kNoSpace, true), Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, Status::Code::kIOError,
+                         Status::SubCode::kNoSpace, false), Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, Status::Code::kIOError,
+                         Status::SubCode::kIOFenced, true), Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, Status::Code::kIOError,
+                         Status::SubCode::kIOFenced, false), Status::Severity::kFatalError},
+
+};
+
+std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>, Status::Severity>
+    DefaultErrorSeverityMap = {
+        // Errors during BG compaction
+        {std::make_tuple(BackgroundErrorReason::kCompaction, Status::Code::kCorruption,
+                         true), Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction, Status::Code::kCorruption,
+                         false), Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction, Status::Code::kIOError,
+                         true), Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction, Status::Code::kIOError,
+                         false), Status::Severity::kNoError},
+        // Errors during BG flush
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kCorruption,
+                         true), Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kCorruption,
+                         false), Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         true), Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         false), Status::Severity::kNoError},
+        // Errors during Write
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kCorruption,
+                         true), Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kCorruption,
+                         false), Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kIOError,
+                         true), Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kIOError,
+                         false), Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite, Status::Code::kIOError,
+                         true), Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite, Status::Code::kIOError,
+                         false), Status::Severity::kFatalError},
+        // Errors during BG flush with WAL disabled
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, Status::Code::kCorruption,
+                         true), Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, Status::Code::kCorruption,
+                         false), Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, Status::Code::kIOError,
+                         true), Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, Status::Code::kIOError,
+                         false), Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, Status::Code::kIOError,
+                         true), Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, Status::Code::kIOError,
+                         false), Status::Severity::kFatalError},
+};
+
+std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity>
+    DefaultReasonMap = {
+        // Errors during BG compaction
+        {std::make_tuple(BackgroundErrorReason::kCompaction, true), Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction, false), Status::Severity::kNoError},
+        // Errors during BG flush
+        {std::make_tuple(BackgroundErrorReason::kFlush, true), Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, false), Status::Severity::kNoError},
+        // Errors during Write
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback, true), Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback, false), Status::Severity::kFatalError},
+        // Errors during Memtable update
+        {std::make_tuple(BackgroundErrorReason::kMemTable, true), Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kMemTable, false), Status::Severity::kFatalError},
+};
+
+void ErrorHandler::CancelErrorRecovery() {
+  db_mutex_->AssertHeld();
+
+  // We'll release the lock before calling sfm, so make sure no new
+  // recovery gets scheduled at that point
+  auto_recovery_ = false;
+  SstFileManagerImpl* sfm = reinterpret_cast<SstFileManagerImpl*>(
+      db_options_.sst_file_manager.get());
+  if (sfm) {
+    // This may or may not cancel a pending recovery
+    db_mutex_->Unlock();
+    bool cancelled = sfm->CancelErrorRecovery(this);
+    db_mutex_->Lock();
+    if (cancelled) {
+      recovery_in_prog_ = false;
+    }
+  }
+
+  // If auto recovery is also running to resume from the retryable error,
+  // we should wait and end the auto recovery.
+  EndAutoRecovery();
+}
+
+STATIC_AVOID_DESTRUCTION(const Status, kOkStatus){Status::OK()};
+
+// This is the main function for looking at an error during a background
+// operation and deciding the severity, and error recovery strategy. The high
+// level algorithm is as follows -
+// 1. Classify the severity of the error based on the ErrorSeverityMap,
+//    DefaultErrorSeverityMap and DefaultReasonMap defined earlier
+// 2. Call a Status code specific override function to adjust the severity
+//    if needed. The reason for this is our ability to recover may depend on
+//    the exact options enabled in DBOptions
+// 3. Determine if auto recovery is possible. A listener notification callback
+//    is called, which can disable the auto recovery even if we decide it's
+//    feasible
+// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control
+//    the actual recovery. If no sst file manager is specified in DBOptions,
+//    a default one is allocated during DB::Open(), so there will always be
+//    one.
+// This can also get called as part of a recovery operation.
In that case, we +// also track the error separately in recovery_error_ so we can tell in the +// end whether recovery succeeded or not +const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err, + BackgroundErrorReason reason) { + db_mutex_->AssertHeld(); + if (bg_err.ok()) { + return kOkStatus; + } + + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + } + ROCKS_LOG_INFO(db_options_.info_log, + "ErrorHandler: Set regular background error\n"); + + bool paranoid = db_options_.paranoid_checks; + Status::Severity sev = Status::Severity::kFatalError; + Status new_bg_err; + DBRecoverContext context; + bool found = false; + + { + auto entry = ErrorSeverityMap.find( + std::make_tuple(reason, bg_err.code(), bg_err.subcode(), paranoid)); + if (entry != ErrorSeverityMap.end()) { + sev = entry->second; + found = true; + } + } + + if (!found) { + auto entry = DefaultErrorSeverityMap.find( + std::make_tuple(reason, bg_err.code(), paranoid)); + if (entry != DefaultErrorSeverityMap.end()) { + sev = entry->second; + found = true; + } + } + + if (!found) { + auto entry = DefaultReasonMap.find(std::make_tuple(reason, paranoid)); + if (entry != DefaultReasonMap.end()) { + sev = entry->second; + } + } + + new_bg_err = Status(bg_err, sev); + + // Check if recovery is currently in progress. If it is, we will save this + // error so we can check it at the end to see if recovery succeeded or not + if (recovery_in_prog_ && recovery_error_.ok()) { + recovery_error_ = new_bg_err; + } + + bool auto_recovery = auto_recovery_; + if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) { + auto_recovery = false; + } + + // Allow some error specific overrides + if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace || + new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) { + new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery); + } + + if (!new_bg_err.ok()) { + Status s = new_bg_err; + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s, + db_mutex_, &auto_recovery); + if (!s.ok() && (s.severity() > bg_error_.severity())) { + bg_error_ = s; + } else { + // This error is less severe than previously encountered error. Don't + // take any further action + return bg_error_; + } + } + + recover_context_ = context; + if (auto_recovery) { + recovery_in_prog_ = true; + + // Kick-off error specific recovery + if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace || + new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) { + RecoverFromNoSpace(); + } + } + if (bg_error_.severity() >= Status::Severity::kHardError) { + is_db_stopped_.store(true, std::memory_order_release); + } + return bg_error_; +} + +// This is the main function for looking at IO related error during the +// background operations. The main logic is: +// 1) File scope IO error is treated as retryable IO error in the write +// path. In RocksDB, If a file has write IO error and it is at file scope, +// RocksDB never write to the same file again. RocksDB will create a new +// file and rewrite the whole content. Thus, it is retryable. +// 1) if the error is caused by data loss, the error is mapped to +// unrecoverable error. Application/user must take action to handle +// this situation (File scope case is excluded). 
+// 2) if the error is a Retryable IO error (i.e., it is a file scope IO error, +// or its retryable flag is set and not a data loss error), auto resume +// will be called and the auto resume can be controlled by resume count +// and resume interval options. There are three sub-cases: +// a) if the error happens during compaction, it is mapped to a soft error. +// the compaction thread will reschedule a new compaction. +// b) if the error happens during flush and also WAL is empty, it is mapped +// to a soft error. Note that, it includes the case that IO error happens +// in SST or manifest write during flush. +// c) all other errors are mapped to hard error. +// 3) for other cases, SetBGError(const Status& bg_err, BackgroundErrorReason +// reason) will be called to handle other error cases. +const Status& ErrorHandler::SetBGError(const Status& bg_status, + BackgroundErrorReason reason) { + db_mutex_->AssertHeld(); + Status tmp_status = bg_status; + IOStatus bg_io_err = status_to_io_status(std::move(tmp_status)); + + if (bg_io_err.ok()) { + return kOkStatus; + } + ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s", + bg_io_err.ToString().c_str()); + + if (recovery_in_prog_ && recovery_io_error_.ok()) { + recovery_io_error_ = bg_io_err; + } + if (BackgroundErrorReason::kManifestWrite == reason || + BackgroundErrorReason::kManifestWriteNoWAL == reason) { + // Always returns ok + ROCKS_LOG_INFO(db_options_.info_log, "Disabling File Deletions"); + db_->DisableFileDeletionsWithLock().PermitUncheckedError(); + } + + Status new_bg_io_err = bg_io_err; + DBRecoverContext context; + if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile && + bg_io_err.GetDataLoss()) { + // First, data loss (non file scope) is treated as unrecoverable error. So + // it can directly overwrite any existing bg_error_. + bool auto_recovery = false; + Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError); + CheckAndSetRecoveryAndBGError(bg_err); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + } + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Set background IO error as unrecoverable error\n"); + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, + &bg_err, db_mutex_, &auto_recovery); + recover_context_ = context; + return bg_error_; + } else if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace && + (bg_io_err.GetScope() == + IOStatus::IOErrorScope::kIOErrorScopeFile || + bg_io_err.GetRetryable())) { + // Second, check if the error is a retryable IO error (file scope IO error + // is also treated as retryable IO error in RocksDB write path). if it is + // retryable error and its severity is higher than bg_error_, overwrite the + // bg_error_ with new error. In current stage, for retryable IO error of + // compaction, treat it as soft error. In other cases, treat the retryable + // IO error as hard error. Note that, all the NoSpace error should be + // handled by the SstFileManager::StartErrorRecovery(). Therefore, no matter + // it is retryable or file scope, this logic will be bypassed. 
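Restating the branch structure that this comment describes as one standalone function may help when reading the interleaved if/else blocks that follow. The enums and names below are stand-ins, not the vendored types; this is an illustration of the policy, not the implementation.

// Sketch of the retryable-IO-error policy described above: where the error
// came from decides the severity, whether other background work pauses, and
// whether an auto-resume thread is started. Illustrative only.
enum class Reason { kCompaction, kFlushNoWAL, kManifestWriteNoWAL, kOther };
enum class Severity { kSoftError, kHardError };

struct Decision {
  Severity severity;
  bool stop_other_bg_work;  // mirrors soft_error_no_bg_work_ in the handler
  bool start_auto_resume;   // mirrors StartRecoverFromRetryableBGIOError()
};

Decision ClassifyRetryableIOError(Reason reason) {
  if (reason == Reason::kCompaction) {
    // Compaction reschedules itself, so no BG error is recorded at all.
    return {Severity::kSoftError, false, false};
  }
  if (reason == Reason::kFlushNoWAL || reason == Reason::kManifestWriteNoWAL) {
    // Soft error, but pause all other background work until resume succeeds.
    return {Severity::kSoftError, true, true};
  }
  // All remaining retryable cases are hard errors with auto resume.
  return {Severity::kHardError, false, true};
}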
+ bool auto_recovery = false; + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, + &new_bg_io_err, db_mutex_, + &auto_recovery); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT); + } + ROCKS_LOG_INFO(db_options_.info_log, + "ErrorHandler: Set background retryable IO error\n"); + if (BackgroundErrorReason::kCompaction == reason) { + // We map the retryable IO error during compaction to soft error. Since + // compaction can reschedule by itself. We will not set the BG error in + // this case + // TODO: a better way to set or clean the retryable IO error which + // happens during compaction SST file write. + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT); + } + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Compaction will schedule by itself to resume\n"); + return bg_error_; + } else if (BackgroundErrorReason::kFlushNoWAL == reason || + BackgroundErrorReason::kManifestWriteNoWAL == reason) { + // When the BG Retryable IO error reason is flush without WAL, + // We map it to a soft error. At the same time, all the background work + // should be stopped except the BG work from recovery. Therefore, we + // set the soft_error_no_bg_work_ to true. At the same time, since DB + // continues to receive writes when BG error is soft error, to avoid + // to many small memtable being generated during auto resume, the flush + // reason is set to kErrorRecoveryRetryFlush. + Status bg_err(new_bg_io_err, Status::Severity::kSoftError); + CheckAndSetRecoveryAndBGError(bg_err); + soft_error_no_bg_work_ = true; + context.flush_reason = FlushReason::kErrorRecoveryRetryFlush; + recover_context_ = context; + return StartRecoverFromRetryableBGIOError(bg_io_err); + } else { + Status bg_err(new_bg_io_err, Status::Severity::kHardError); + CheckAndSetRecoveryAndBGError(bg_err); + recover_context_ = context; + return StartRecoverFromRetryableBGIOError(bg_io_err); + } + } else { + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + } + // HandleKnownErrors() will use recovery_error_, so ignore + // recovery_io_error_. + // TODO: Do some refactoring and use only one recovery_error_ + recovery_io_error_.PermitUncheckedError(); + return HandleKnownErrors(new_bg_io_err, reason); + } +} + +Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error, + bool* auto_recovery) { + if (bg_error.severity() >= Status::Severity::kFatalError) { + return bg_error; + } + + if (db_options_.sst_file_manager.get() == nullptr) { + // We rely on SFM to poll for enough disk space and recover + *auto_recovery = false; + return bg_error; + } + + if (db_options_.allow_2pc && + (bg_error.severity() <= Status::Severity::kSoftError)) { + // Don't know how to recover, as the contents of the current WAL file may + // be inconsistent, and it may be needed for 2PC. 
If 2PC is not enabled,
+    // we can just flush the memtable and discard the log
+    *auto_recovery = false;
+    return Status(bg_error, Status::Severity::kFatalError);
+  }
+
+  {
+    uint64_t free_space;
+    if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path,
+                                      &free_space) == Status::NotSupported()) {
+      *auto_recovery = false;
+    }
+  }
+
+  return bg_error;
+}
+
+void ErrorHandler::RecoverFromNoSpace() {
+  SstFileManagerImpl* sfm = reinterpret_cast<SstFileManagerImpl*>(
+      db_options_.sst_file_manager.get());
+
+  // Inform SFM of the error, so it can kick-off the recovery
+  if (sfm) {
+    sfm->StartErrorRecovery(this, bg_error_);
+  }
+}
+
+Status ErrorHandler::ClearBGError() {
+  db_mutex_->AssertHeld();
+
+  // Signal that recovery succeeded
+  if (recovery_error_.ok()) {
+    Status old_bg_error = bg_error_;
+    // old_bg_error is only for notifying listeners, so may not be checked
+    old_bg_error.PermitUncheckedError();
+    // Clear and check the recovery IO and BG error
+    bg_error_ = Status::OK();
+    recovery_io_error_ = IOStatus::OK();
+    bg_error_.PermitUncheckedError();
+    recovery_io_error_.PermitUncheckedError();
+    recovery_in_prog_ = false;
+    soft_error_no_bg_work_ = false;
+    EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error,
+                                           bg_error_, db_mutex_);
+  }
+  return recovery_error_;
+}
+
+Status ErrorHandler::RecoverFromBGError(bool is_manual) {
+  InstrumentedMutexLock l(db_mutex_);
+  bool no_bg_work_original_flag = soft_error_no_bg_work_;
+  if (is_manual) {
+    // If it's a manual recovery and there's a background recovery in progress
+    // return busy status
+    if (recovery_in_prog_) {
+      return Status::Busy();
+    }
+    recovery_in_prog_ = true;
+
+    // In manual resume, we allow the bg work to run. If it is an auto resume,
+    // the bg work should follow this tag.
+    soft_error_no_bg_work_ = false;
+
+    // In manual resume, if the bg error is a soft error and also requires
+    // no bg work, the error must be recovered by calling the flush with
+    // flush reason: kErrorRecoveryRetryFlush. In other cases, the flush
+    // reason is set to kErrorRecovery.
+    if (no_bg_work_original_flag) {
+      recover_context_.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
+    } else {
+      recover_context_.flush_reason = FlushReason::kErrorRecovery;
+    }
+  }
+
+  if (bg_error_.severity() == Status::Severity::kSoftError &&
+      recover_context_.flush_reason == FlushReason::kErrorRecovery) {
+    // Simply clear the background error and return
+    recovery_error_ = Status::OK();
+    return ClearBGError();
+  }
+
+  // Reset recovery_error_. We will use this to record any errors that happen
+  // during the recovery process. While recovering, the only operations that
+  // can generate background errors should be the flush operations
+  recovery_error_ = Status::OK();
+  recovery_error_.PermitUncheckedError();
+  Status s = db_->ResumeImpl(recover_context_);
+  if (s.ok()) {
+    soft_error_no_bg_work_ = false;
+  } else {
+    soft_error_no_bg_work_ = no_bg_work_original_flag;
+  }
+
+  // For manual recover, shutdown, and fatal error cases, set
+  // recovery_in_prog_ to false.
+  // For automatic background recovery, leave it as is regardless of success
+  // or failure as it will be retried
+  if (is_manual || s.IsShutdownInProgress() ||
+      bg_error_.severity() >= Status::Severity::kFatalError) {
+    recovery_in_prog_ = false;
+  }
+  return s;
+}
+
+const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
+    const IOStatus& io_error) {
+  db_mutex_->AssertHeld();
+  if (bg_error_.ok()) {
+    return bg_error_;
+  } else if (io_error.ok()) {
+    return kOkStatus;
+  } else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) {
+    // Auto resume BG error is not enabled, directly return bg_error_.
+    return bg_error_;
+  }
+  if (bg_error_stats_ != nullptr) {
+    RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
+  }
+  ROCKS_LOG_INFO(
+      db_options_.info_log,
+      "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n");
+  if (recovery_thread_) {
+    // In this case, if recovery_in_prog_ is false, the current thread should
+    // wait for the previous recovery thread to finish and create a new thread
+    // to recover from the bg error.
+    db_mutex_->Unlock();
+    recovery_thread_->join();
+    db_mutex_->Lock();
+  }
+
+  recovery_in_prog_ = true;
+  recovery_thread_.reset(
+      new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this));
+
+  if (recovery_io_error_.ok() && recovery_error_.ok()) {
+    return recovery_error_;
+  } else {
+    return bg_error_;
+  }
+}
+
+// Automatic recover from Retryable BG IO error. Must be called after db
+// mutex is released.
+void ErrorHandler::RecoverFromRetryableBGIOError() {
+  TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
+  InstrumentedMutexLock l(db_mutex_);
+  if (end_recovery_) {
+    EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                           Status::ShutdownInProgress(),
+                                           db_mutex_);
+    return;
+  }
+  DBRecoverContext context = recover_context_;
+  int resume_count = db_options_.max_bgerror_resume_count;
+  uint64_t wait_interval = db_options_.bgerror_resume_retry_interval;
+  uint64_t retry_count = 0;
+  // Recover from the retryable error. Create a separate thread to do it.
+  while (resume_count > 0) {
+    if (end_recovery_) {
+      EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                             Status::ShutdownInProgress(),
+                                             db_mutex_);
+      return;
+    }
+    TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0");
+    TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1");
+    recovery_io_error_ = IOStatus::OK();
+    recovery_error_ = Status::OK();
+    retry_count++;
+    Status s = db_->ResumeImpl(context);
+    if (bg_error_stats_ != nullptr) {
+      RecordTick(bg_error_stats_.get(),
+                 ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT);
+    }
+    if (s.IsShutdownInProgress() ||
+        bg_error_.severity() >= Status::Severity::kFatalError) {
+      // If DB shutdown in progress or the error severity is higher than
+      // Hard Error, stop auto resume and returns.
+      recovery_in_prog_ = false;
+      if (bg_error_stats_ != nullptr) {
+        RecordInHistogram(bg_error_stats_.get(),
+                          ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+      }
+      EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                             bg_error_, db_mutex_);
+      return;
+    }
+    if (!recovery_io_error_.ok() &&
+        recovery_error_.severity() <= Status::Severity::kHardError &&
+        recovery_io_error_.GetRetryable()) {
+      // If a new BG IO error happens during auto recovery and it is retryable
+      // and its severity is Hard Error or lower, the auto resume sleeps for
+      // a period of time and redoes auto resume if it is allowed.
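The loop that continues below is easier to see in isolation. This compiling sketch mirrors its shape (bounded attempts, a timed wait between retryable failures); the TryResume callback is a hypothetical stand-in for DBImpl::ResumeImpl, and none of this is the vendored implementation itself.

#include <chrono>
#include <condition_variable>
#include <mutex>

// Up to max_resume_count attempts; between attempts, sleep retry_interval on
// a condition variable so a shutdown can interrupt the wait, just as the
// real loop does with InstrumentedCondVar::TimedWait and SignalAll.
bool RetryWithInterval(int max_resume_count,
                       std::chrono::microseconds retry_interval,
                       bool (*TryResume)()) {
  std::mutex mu;
  std::condition_variable cv;
  for (int attempt = 0; attempt < max_resume_count; ++attempt) {
    if (TryResume()) {
      return true;  // recovered; the caller clears the background error
    }
    std::unique_lock<std::mutex> lk(mu);
    cv.wait_for(lk, retry_interval);  // interruptible sleep between attempts
  }
  return false;  // exceeded the resume retry count; give up
}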
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait0"); + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1"); + int64_t wait_until = db_options_.clock->NowMicros() + wait_interval; + cv_.TimedWait(wait_until); + } else { + // There are three possibility: 1) recover_io_error is set during resume + // and the error is not retryable, 2) recover is successful, 3) other + // error happens during resume and cannot be resumed here. + if (recovery_io_error_.ok() && recovery_error_.ok() && s.ok()) { + // recover from the retryable IO error and no other BG errors. Clean + // the bg_error and notify user. + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess"); + Status old_bg_error = bg_error_; + is_db_stopped_.store(false, std::memory_order_release); + bg_error_ = Status::OK(); + bg_error_.PermitUncheckedError(); + EventHelpers::NotifyOnErrorRecoveryEnd( + db_options_.listeners, old_bg_error, bg_error_, db_mutex_); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT); + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } + recovery_in_prog_ = false; + if (soft_error_no_bg_work_) { + soft_error_no_bg_work_ = false; + } + return; + } else { + // In this case: 1) recovery_io_error is more serious or not retryable + // 2) other Non IO recovery_error happens. The auto recovery stops. + recovery_in_prog_ = false; + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } + EventHelpers::NotifyOnErrorRecoveryEnd( + db_options_.listeners, bg_error_, + !recovery_io_error_.ok() + ? recovery_io_error_ + : (!recovery_error_.ok() ? recovery_error_ : s), + db_mutex_); + return; + } + } + resume_count--; + } + recovery_in_prog_ = false; + EventHelpers::NotifyOnErrorRecoveryEnd( + db_options_.listeners, bg_error_, + Status::Aborted("Exceeded resume retry count"), db_mutex_); + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut"); + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } + return; +} + +void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) { + if (recovery_in_prog_ && recovery_error_.ok()) { + recovery_error_ = bg_err; + } + if (bg_err.severity() > bg_error_.severity()) { + bg_error_ = bg_err; + } + if (bg_error_.severity() >= Status::Severity::kHardError) { + is_db_stopped_.store(true, std::memory_order_release); + } + return; +} + +void ErrorHandler::EndAutoRecovery() { + db_mutex_->AssertHeld(); + if (!end_recovery_) { + end_recovery_ = true; + } + cv_.SignalAll(); + db_mutex_->Unlock(); + if (recovery_thread_) { + recovery_thread_->join(); + } + db_mutex_->Lock(); + return; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/error_handler.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/error_handler.h new file mode 100644 index 0000000000..34e08a525d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/error_handler.h @@ -0,0 +1,124 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
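Between the implementation above and its header below, it is worth noting how an embedder actually tunes that retry loop: max_bgerror_resume_count and bgerror_resume_retry_interval are the DBOptions fields it reads. A small usage sketch follows; the path and values are placeholders, not anything prescribed by this patch.

#include <rocksdb/db.h>

#include <cassert>

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Allow at most five automatic resume attempts per retryable BG error...
  options.max_bgerror_resume_count = 5;
  // ...spaced one second apart (the interval is in microseconds).
  options.bgerror_resume_retry_interval = 1000000;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/testdb", &db);
  assert(s.ok());
  delete db;
}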
+#pragma once
+
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+
+// This structure is used to store the DB recovery context. The context is
+// the information related to the recovery actions. For example, it contains
+// FlushReason, which tells the flush job why this flush is called.
+struct DBRecoverContext {
+  FlushReason flush_reason;
+
+  DBRecoverContext() : flush_reason(FlushReason::kErrorRecovery) {}
+
+  DBRecoverContext(FlushReason reason) : flush_reason(reason) {}
+};
+
+class ErrorHandler {
+ public:
+  ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options,
+               InstrumentedMutex* db_mutex)
+      : db_(db),
+        db_options_(db_options),
+        cv_(db_mutex),
+        end_recovery_(false),
+        recovery_thread_(nullptr),
+        db_mutex_(db_mutex),
+        auto_recovery_(false),
+        recovery_in_prog_(false),
+        soft_error_no_bg_work_(false),
+        is_db_stopped_(false),
+        bg_error_stats_(db_options.statistics) {
+    // Clear the checked flag for uninitialized errors
+    bg_error_.PermitUncheckedError();
+    recovery_error_.PermitUncheckedError();
+    recovery_io_error_.PermitUncheckedError();
+  }
+
+  void EnableAutoRecovery() { auto_recovery_ = true; }
+
+  Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
+                                    Status::Code code, Status::SubCode subcode);
+
+  const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason);
+
+  Status GetBGError() const { return bg_error_; }
+
+  Status GetRecoveryError() const { return recovery_error_; }
+
+  Status ClearBGError();
+
+  bool IsDBStopped() { return is_db_stopped_.load(std::memory_order_acquire); }
+
+  bool IsBGWorkStopped() {
+    assert(db_mutex_);
+    db_mutex_->AssertHeld();
+    return !bg_error_.ok() &&
+           (bg_error_.severity() >= Status::Severity::kHardError ||
+            !auto_recovery_ || soft_error_no_bg_work_);
+  }
+
+  bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; }
+
+  bool IsRecoveryInProgress() { return recovery_in_prog_; }
+
+  Status RecoverFromBGError(bool is_manual = false);
+  void CancelErrorRecovery();
+
+  void EndAutoRecovery();
+
+ private:
+  DBImpl* db_;
+  const ImmutableDBOptions& db_options_;
+  Status bg_error_;
+  // A separate Status variable used to record any errors during the
+  // recovery process from hard errors
+  Status recovery_error_;
+  // A separate IO Status variable used to record any IO errors during
+  // the recovery process. At the same time, recovery_error_ is also set.
+  IOStatus recovery_io_error_;
+  // The condition variable used with db_mutex_ during auto resume for timed
+  // waits.
+  InstrumentedCondVar cv_;
+  bool end_recovery_;
+  std::unique_ptr<port::Thread> recovery_thread_;
+
+  InstrumentedMutex* db_mutex_;
+  // A flag indicating whether automatic recovery from errors is enabled
+  bool auto_recovery_;
+  bool recovery_in_prog_;
+  // A flag indicating that, for a soft error, no background work should be
+  // allowed except work that is part of recovery.
+  bool soft_error_no_bg_work_;
+
+  // Used to store the context for recovery, such as the flush reason.
+  DBRecoverContext recover_context_;
+  std::atomic<bool> is_db_stopped_;
+
+  // The pointer to the DB statistics.
+  std::shared_ptr<Statistics> bg_error_stats_;
+
+  const Status& HandleKnownErrors(const Status& bg_err,
+                                  BackgroundErrorReason reason);
+  Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery);
+  void RecoverFromNoSpace();
+  const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error);
+  void RecoverFromRetryableBGIOError();
+  // First, if recovery is in progress and recovery_error_ is ok, set
+  // recovery_error_ to bg_err. Second, if the severity is higher than the
+  // current bg_error_, overwrite it.
+  void CheckAndSetRecoveryAndBGError(const Status& bg_err);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/event_helpers.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/event_helpers.cc
new file mode 100644
index 0000000000..4360144ece
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/event_helpers.cc
@@ -0,0 +1,323 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/event_helpers.h"
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/utilities/customizable_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status EventListener::CreateFromString(const ConfigOptions& config_options,
+                                       const std::string& id,
+                                       std::shared_ptr<EventListener>* result) {
+  return LoadSharedObject<EventListener>(config_options, id, result);
+}
+
+namespace {
+template <typename T>
+inline T SafeDivide(T a, T b) {
+  return b == 0 ? 0 : a / b;
+}
+}  // anonymous namespace
+
+void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) {
+  *jwriter << "time_micros"
+           << std::chrono::duration_cast<std::chrono::microseconds>(
+                  std::chrono::system_clock::now().time_since_epoch())
+                  .count();
+}
+
+void EventHelpers::NotifyTableFileCreationStarted(
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    const std::string& db_name, const std::string& cf_name,
+    const std::string& file_path, int job_id, TableFileCreationReason reason) {
+  if (listeners.empty()) {
+    return;
+  }
+  TableFileCreationBriefInfo info;
+  info.db_name = db_name;
+  info.cf_name = cf_name;
+  info.file_path = file_path;
+  info.job_id = job_id;
+  info.reason = reason;
+  for (auto& listener : listeners) {
+    listener->OnTableFileCreationStarted(info);
+  }
+}
+
+void EventHelpers::NotifyOnBackgroundError(
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex,
+    bool* auto_recovery) {
+  if (listeners.empty()) {
+    return;
+  }
+  db_mutex->AssertHeld();
+  // release lock while notifying events
+  db_mutex->Unlock();
+  for (auto& listener : listeners) {
+    listener->OnBackgroundError(reason, bg_error);
+    bg_error->PermitUncheckedError();
+    if (*auto_recovery) {
+      listener->OnErrorRecoveryBegin(reason, *bg_error, auto_recovery);
+    }
+  }
+  db_mutex->Lock();
+}
+
+void EventHelpers::LogAndNotifyTableFileCreationFinished(
+    EventLogger* event_logger,
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    const std::string& db_name, const std::string& cf_name,
+    const std::string& file_path, int job_id, const FileDescriptor& fd,
+    uint64_t oldest_blob_file_number, const TableProperties& table_properties,
+    TableFileCreationReason reason, const Status& s,
+    const std::string& file_checksum,
+    const std::string& file_checksum_func_name) {
+  if (s.ok() && event_logger) {
+    JSONWriter jwriter;
+    AppendCurrentTime(&jwriter);
+    jwriter << "cf_name" << cf_name << "job" <<
job_id << "event" + << "table_file_creation" + << "file_number" << fd.GetNumber() << "file_size" + << fd.GetFileSize() << "file_checksum" + << Slice(file_checksum).ToString(true) << "file_checksum_func_name" + << file_checksum_func_name << "smallest_seqno" << fd.smallest_seqno + << "largest_seqno" << fd.largest_seqno; + + // table_properties + { + jwriter << "table_properties"; + jwriter.StartObject(); + + // basic properties: + jwriter << "data_size" << table_properties.data_size << "index_size" + << table_properties.index_size << "index_partitions" + << table_properties.index_partitions << "top_level_index_size" + << table_properties.top_level_index_size + << "index_key_is_user_key" + << table_properties.index_key_is_user_key + << "index_value_is_delta_encoded" + << table_properties.index_value_is_delta_encoded << "filter_size" + << table_properties.filter_size << "raw_key_size" + << table_properties.raw_key_size << "raw_average_key_size" + << SafeDivide(table_properties.raw_key_size, + table_properties.num_entries) + << "raw_value_size" << table_properties.raw_value_size + << "raw_average_value_size" + << SafeDivide(table_properties.raw_value_size, + table_properties.num_entries) + << "num_data_blocks" << table_properties.num_data_blocks + << "num_entries" << table_properties.num_entries + << "num_filter_entries" << table_properties.num_filter_entries + << "num_deletions" << table_properties.num_deletions + << "num_merge_operands" << table_properties.num_merge_operands + << "num_range_deletions" << table_properties.num_range_deletions + << "format_version" << table_properties.format_version + << "fixed_key_len" << table_properties.fixed_key_len + << "filter_policy" << table_properties.filter_policy_name + << "column_family_name" << table_properties.column_family_name + << "column_family_id" << table_properties.column_family_id + << "comparator" << table_properties.comparator_name + << "merge_operator" << table_properties.merge_operator_name + << "prefix_extractor_name" + << table_properties.prefix_extractor_name << "property_collectors" + << table_properties.property_collectors_names << "compression" + << table_properties.compression_name << "compression_options" + << table_properties.compression_options << "creation_time" + << table_properties.creation_time << "oldest_key_time" + << table_properties.oldest_key_time << "file_creation_time" + << table_properties.file_creation_time + << "slow_compression_estimated_data_size" + << table_properties.slow_compression_estimated_data_size + << "fast_compression_estimated_data_size" + << table_properties.fast_compression_estimated_data_size + << "db_id" << table_properties.db_id << "db_session_id" + << table_properties.db_session_id << "orig_file_number" + << table_properties.orig_file_number << "seqno_to_time_mapping"; + + if (table_properties.seqno_to_time_mapping.empty()) { + jwriter << "N/A"; + } else { + SeqnoToTimeMapping tmp; + Status status = tmp.Add(table_properties.seqno_to_time_mapping); + if (status.ok()) { + jwriter << tmp.ToHumanString(); + } else { + jwriter << "Invalid"; + } + } + + // user collected properties + for (const auto& prop : table_properties.readable_properties) { + jwriter << prop.first << prop.second; + } + jwriter.EndObject(); + } + + if (oldest_blob_file_number != kInvalidBlobFileNumber) { + jwriter << "oldest_blob_file_number" << oldest_blob_file_number; + } + + jwriter.EndObject(); + + event_logger->Log(jwriter); + } + + if (listeners.empty()) { + return; + } + TableFileCreationInfo info; + info.db_name = 
db_name;
+  info.cf_name = cf_name;
+  info.file_path = file_path;
+  info.file_size = fd.file_size;
+  info.job_id = job_id;
+  info.table_properties = table_properties;
+  info.reason = reason;
+  info.status = s;
+  info.file_checksum = file_checksum;
+  info.file_checksum_func_name = file_checksum_func_name;
+  for (auto& listener : listeners) {
+    listener->OnTableFileCreated(info);
+  }
+  info.status.PermitUncheckedError();
+}
+
+void EventHelpers::LogAndNotifyTableFileDeletion(
+    EventLogger* event_logger, int job_id, uint64_t file_number,
+    const std::string& file_path, const Status& status,
+    const std::string& dbname,
+    const std::vector<std::shared_ptr<EventListener>>& listeners) {
+  JSONWriter jwriter;
+  AppendCurrentTime(&jwriter);
+
+  jwriter << "job" << job_id << "event"
+          << "table_file_deletion"
+          << "file_number" << file_number;
+  if (!status.ok()) {
+    jwriter << "status" << status.ToString();
+  }
+
+  jwriter.EndObject();
+
+  event_logger->Log(jwriter);
+
+  if (listeners.empty()) {
+    return;
+  }
+  TableFileDeletionInfo info;
+  info.db_name = dbname;
+  info.job_id = job_id;
+  info.file_path = file_path;
+  info.status = status;
+  for (auto& listener : listeners) {
+    listener->OnTableFileDeleted(info);
+  }
+  info.status.PermitUncheckedError();
+}
+
+void EventHelpers::NotifyOnErrorRecoveryEnd(
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    const Status& old_bg_error, const Status& new_bg_error,
+    InstrumentedMutex* db_mutex) {
+  if (!listeners.empty()) {
+    db_mutex->AssertHeld();
+    // release lock while notifying events
+    db_mutex->Unlock();
+    for (auto& listener : listeners) {
+      BackgroundErrorRecoveryInfo info;
+      info.old_bg_error = old_bg_error;
+      info.new_bg_error = new_bg_error;
+      listener->OnErrorRecoveryCompleted(old_bg_error);
+      listener->OnErrorRecoveryEnd(info);
+      info.old_bg_error.PermitUncheckedError();
+      info.new_bg_error.PermitUncheckedError();
+    }
+    db_mutex->Lock();
+  }
+}
+
+void EventHelpers::NotifyBlobFileCreationStarted(
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    const std::string& db_name, const std::string& cf_name,
+    const std::string& file_path, int job_id,
+    BlobFileCreationReason creation_reason) {
+  if (listeners.empty()) {
+    return;
+  }
+  BlobFileCreationBriefInfo info(db_name, cf_name, file_path, job_id,
+                                 creation_reason);
+  for (const auto& listener : listeners) {
+    listener->OnBlobFileCreationStarted(info);
+  }
+}
+
+void EventHelpers::LogAndNotifyBlobFileCreationFinished(
+    EventLogger* event_logger,
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    const std::string& db_name, const std::string& cf_name,
+    const std::string& file_path, int job_id, uint64_t file_number,
+    BlobFileCreationReason creation_reason, const Status& s,
+    const std::string& file_checksum,
+    const std::string& file_checksum_func_name, uint64_t total_blob_count,
+    uint64_t total_blob_bytes) {
+  if (s.ok() && event_logger) {
+    JSONWriter jwriter;
+    AppendCurrentTime(&jwriter);
+    jwriter << "cf_name" << cf_name << "job" << job_id << "event"
+            << "blob_file_creation"
+            << "file_number" << file_number << "total_blob_count"
+            << total_blob_count << "total_blob_bytes" << total_blob_bytes
+            << "file_checksum" << file_checksum << "file_checksum_func_name"
+            << file_checksum_func_name << "status" << s.ToString();
+
+    jwriter.EndObject();
+    event_logger->Log(jwriter);
+  }
+
+  if (listeners.empty()) {
+    return;
+  }
+  BlobFileCreationInfo info(db_name, cf_name, file_path, job_id,
+                            creation_reason, total_blob_count, total_blob_bytes,
+                            s, file_checksum, file_checksum_func_name);
+  for (const auto& listener : listeners) {
+    listener->OnBlobFileCreated(info);
+  }
+  info.status.PermitUncheckedError();
+}
+
+void EventHelpers::LogAndNotifyBlobFileDeletion(
+    EventLogger* event_logger,
+    const std::vector<std::shared_ptr<EventListener>>& listeners, int job_id,
+    uint64_t file_number, const std::string& file_path, const Status& status,
+    const std::string& dbname) {
+  if (event_logger) {
+    JSONWriter jwriter;
+    AppendCurrentTime(&jwriter);
+
+    jwriter << "job" << job_id << "event"
+            << "blob_file_deletion"
+            << "file_number" << file_number;
+    if (!status.ok()) {
+      jwriter << "status" << status.ToString();
+    }
+
+    jwriter.EndObject();
+    event_logger->Log(jwriter);
+  }
+  if (listeners.empty()) {
+    return;
+  }
+  BlobFileDeletionInfo info(dbname, file_path, job_id, status);
+  for (const auto& listener : listeners) {
+    listener->OnBlobFileDeleted(info);
+  }
+  info.status.PermitUncheckedError();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/event_helpers.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/event_helpers.h
new file mode 100644
index 0000000000..a1331d8a9a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/event_helpers.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "logging/event_logger.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EventHelpers {
+ public:
+  static void AppendCurrentTime(JSONWriter* json_writer);
+  static void NotifyTableFileCreationStarted(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id, TableFileCreationReason reason);
+  static void NotifyOnBackgroundError(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      BackgroundErrorReason reason, Status* bg_error,
+      InstrumentedMutex* db_mutex, bool* auto_recovery);
+  static void LogAndNotifyTableFileCreationFinished(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id, const FileDescriptor& fd,
+      uint64_t oldest_blob_file_number, const TableProperties& table_properties,
+      TableFileCreationReason reason, const Status& s,
+      const std::string& file_checksum,
+      const std::string& file_checksum_func_name);
+  static void LogAndNotifyTableFileDeletion(
+      EventLogger* event_logger, int job_id, uint64_t file_number,
+      const std::string& file_path, const Status& status,
+      const std::string& db_name,
+      const std::vector<std::shared_ptr<EventListener>>& listeners);
+  static void NotifyOnErrorRecoveryEnd(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const Status& old_bg_error, const Status& new_bg_error,
+      InstrumentedMutex* db_mutex);
+
+  static void NotifyBlobFileCreationStarted(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id,
+      BlobFileCreationReason creation_reason);
+
+  static void LogAndNotifyBlobFileCreationFinished(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id, uint64_t file_number,
+      BlobFileCreationReason creation_reason, const Status& s,
+      const std::string& file_checksum,
+      const
std::string& file_checksum_func_name, uint64_t total_blob_count,
+      uint64_t total_blob_bytes);
+
+  static void LogAndNotifyBlobFileDeletion(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners, int job_id,
+      uint64_t file_number, const std::string& file_path, const Status& status,
+      const std::string& db_name);
+
+ private:
+  static void LogAndNotifyTableFileCreation(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const FileDescriptor& fd, const TableFileCreationInfo& info);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/experimental.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/experimental.cc
new file mode 100644
index 0000000000..cb5fb31794
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/experimental.cc
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/experimental.h"
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_util.h"
+#include "logging/logging.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace experimental {
+
+
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+                           const Slice* begin, const Slice* end) {
+  if (db == nullptr) {
+    return Status::InvalidArgument("DB is empty");
+  }
+
+  return db->SuggestCompactRange(column_family, begin, end);
+}
+
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) {
+  if (db == nullptr) {
+    return Status::InvalidArgument("Didn't recognize DB object");
+  }
+  return db->PromoteL0(column_family, target_level);
+}
+
+
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end) {
+  return SuggestCompactRange(db, db->DefaultColumnFamily(), begin, end);
+}
+
+Status UpdateManifestForFilesState(
+    const DBOptions& db_opts, const std::string& db_name,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    const UpdateManifestForFilesStateOptions& opts) {
+  // TODO: plumb Env::IOActivity
+  const ReadOptions read_options;
+  OfflineManifestWriter w(db_opts, db_name);
+  Status s = w.Recover(column_families);
+
+  size_t files_updated = 0;
+  size_t cfs_updated = 0;
+  auto fs = db_opts.env->GetFileSystem();
+
+  for (auto cfd : *w.Versions().GetColumnFamilySet()) {
+    if (!s.ok()) {
+      break;
+    }
+    assert(cfd);
+
+    if (cfd->IsDropped() || !cfd->initialized()) {
+      continue;
+    }
+
+    const auto* current = cfd->current();
+    assert(current);
+
+    const auto* vstorage = current->storage_info();
+    assert(vstorage);
+
+    VersionEdit edit;
+    edit.SetColumnFamily(cfd->GetID());
+
+    /* SST files */
+    for (int level = 0; level < cfd->NumberLevels(); level++) {
+      if (!s.ok()) {
+        break;
+      }
+      const auto& level_files = vstorage->LevelFiles(level);
+
+      for (const auto& lf : level_files) {
+        assert(lf);
+
+        uint64_t number = lf->fd.GetNumber();
+        std::string fname =
+            TableFileName(w.IOptions().db_paths, number, lf->fd.GetPathId());
+
+        std::unique_ptr<FSSequentialFile> f;
+        FileOptions fopts;
+        // Use kUnknown to signal the FileSystem to search all tiers for the
+        // file.
+        fopts.temperature = Temperature::kUnknown;
+
+        IOStatus file_ios =
+            fs->NewSequentialFile(fname, fopts, &f, /*dbg*/ nullptr);
+        if (file_ios.ok()) {
+          if (opts.update_temperatures) {
+            Temperature temp = f->GetTemperature();
+            if (temp != Temperature::kUnknown && temp != lf->temperature) {
+              // Current state inconsistent with manifest
+              ++files_updated;
+              edit.DeleteFile(level, number);
+              edit.AddFile(
+                  level, number, lf->fd.GetPathId(), lf->fd.GetFileSize(),
+                  lf->smallest, lf->largest, lf->fd.smallest_seqno,
+                  lf->fd.largest_seqno, lf->marked_for_compaction, temp,
+                  lf->oldest_blob_file_number, lf->oldest_ancester_time,
+                  lf->file_creation_time, lf->epoch_number, lf->file_checksum,
+                  lf->file_checksum_func_name, lf->unique_id,
+                  lf->compensated_range_deletion_size, lf->tail_size);
+            }
+          }
+        } else {
+          s = file_ios;
+          break;
+        }
+      }
+    }
+
+    if (s.ok() && edit.NumEntries() > 0) {
+      std::unique_ptr<FSDirectory> db_dir;
+      s = fs->NewDirectory(db_name, IOOptions(), &db_dir, nullptr);
+      if (s.ok()) {
+        s = w.LogAndApply(read_options, cfd, &edit, db_dir.get());
+      }
+      if (s.ok()) {
+        ++cfs_updated;
+      }
+    }
+  }
+
+  if (cfs_updated > 0) {
+    ROCKS_LOG_INFO(db_opts.info_log,
+                   "UpdateManifestForFilesState: updated %zu files in %zu CFs",
+                   files_updated, cfs_updated);
+  } else if (s.ok()) {
+    ROCKS_LOG_INFO(db_opts.info_log,
+                   "UpdateManifestForFilesState: no updates needed");
+  }
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_opts.info_log, "UpdateManifestForFilesState failed: %s",
+                    s.ToString().c_str());
+  }
+
+  return s;
+}
+
+}  // namespace experimental
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/external_sst_file_ingestion_job.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/external_sst_file_ingestion_job.cc
new file mode 100644
index 0000000000..f8716e5f48
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/external_sst_file_ingestion_job.cc
@@ -0,0 +1,1104 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
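Before the ingestion job internals below, a short usage sketch of the public API they implement may be useful; DB::IngestExternalFile and SstFileWriter are the standard entry points, and the paths here are placeholders.

#include <rocksdb/db.h>
#include <rocksdb/sst_file_writer.h>

#include <cassert>

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::DB* db = nullptr;
  assert(rocksdb::DB::Open(options, "/tmp/testdb", &db).ok());

  // Build one external SST file; keys must be added in sorted order.
  rocksdb::SstFileWriter writer(rocksdb::EnvOptions(), options);
  assert(writer.Open("/tmp/file1.sst").ok());
  assert(writer.Put("key1", "value1").ok());
  assert(writer.Finish().ok());

  // Prepare() below will try a hard link first and, because
  // failed_move_fall_back_to_copy defaults to true, copy if linking fails.
  rocksdb::IngestExternalFileOptions ifo;
  ifo.move_files = true;
  assert(db->IngestExternalFile({"/tmp/file1.sst"}, ifo).ok());
  delete db;
}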
+
+
+#include "db/external_sst_file_ingestion_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ExternalSstFileIngestionJob::Prepare(
+    const std::vector<std::string>& external_files_paths,
+    const std::vector<std::string>& files_checksums,
+    const std::vector<std::string>& files_checksum_func_names,
+    const Temperature& file_temperature, uint64_t next_file_number,
+    SuperVersion* sv) {
+  Status status;
+
+  // Read the information of files we are ingesting
+  for (const std::string& file_path : external_files_paths) {
+    IngestedFileInfo file_to_ingest;
+    status =
+        GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv);
+    if (!status.ok()) {
+      return status;
+    }
+
+    if (file_to_ingest.cf_id !=
+            TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
+        file_to_ingest.cf_id != cfd_->GetID()) {
+      return Status::InvalidArgument(
+          "External file column family id don't match");
+    }
+
+    if (file_to_ingest.num_entries == 0 &&
+        file_to_ingest.num_range_deletions == 0) {
+      return Status::InvalidArgument("File contain no entries");
+    }
+
+    if (!file_to_ingest.smallest_internal_key.Valid() ||
+        !file_to_ingest.largest_internal_key.Valid()) {
+      return Status::Corruption("Generated table have corrupted keys");
+    }
+
+    files_to_ingest_.emplace_back(std::move(file_to_ingest));
+  }
+
+  const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+  auto num_files = files_to_ingest_.size();
+  if (num_files == 0) {
+    return Status::InvalidArgument("The list of files is empty");
+  } else if (num_files > 1) {
+    // Verify that passed files don't have overlapping ranges
+    autovector<const IngestedFileInfo*> sorted_files;
+    for (size_t i = 0; i < num_files; i++) {
+      sorted_files.push_back(&files_to_ingest_[i]);
+    }
+
+    std::sort(
+        sorted_files.begin(), sorted_files.end(),
+        [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
+          return sstableKeyCompare(ucmp, info1->smallest_internal_key,
+                                   info2->smallest_internal_key) < 0;
+        });
+
+    for (size_t i = 0; i + 1 < num_files; i++) {
+      if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key,
+                            sorted_files[i + 1]->smallest_internal_key) >= 0) {
+        files_overlap_ = true;
+        break;
+      }
+    }
+  }
+
+  // Handle the file temperature
+  for (size_t i = 0; i < num_files; i++) {
+    files_to_ingest_[i].file_temperature = file_temperature;
+  }
+
+  if (ingestion_options_.ingest_behind && files_overlap_) {
+    return Status::NotSupported("Files have overlapping ranges");
+  }
+
+  // Copy/Move external files into DB
+  std::unordered_set<uint32_t> ingestion_path_ids;
+  for (IngestedFileInfo& f : files_to_ingest_) {
+    f.copy_file = false;
+    const std::string path_outside_db = f.external_file_path;
+    const std::string path_inside_db = TableFileName(
+        cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+    if (ingestion_options_.move_files) {
+      status =
+          fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+      if (status.ok()) {
+        // It is unsafe to assume the application had synced the file and file
+        // directory before ingesting the file. For integrity of RocksDB we
+        // need to sync the file.
+        std::unique_ptr<FSWritableFile> file_to_sync;
+        Status s = fs_->ReopenWritableFile(path_inside_db, env_options_,
+                                           &file_to_sync, nullptr);
+        TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen",
+                                 &s);
+        // Some file systems (especially remote/distributed) don't support
+        // reopening a file for writing and don't require reopening and
+        // syncing the file. Ignore the NotSupported error in that case.
+        if (!s.IsNotSupported()) {
+          status = s;
+          if (status.ok()) {
+            TEST_SYNC_POINT(
+                "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
+            status = SyncIngestedFile(file_to_sync.get());
+            TEST_SYNC_POINT(
+                "ExternalSstFileIngestionJob::AfterSyncIngestedFile");
+            if (!status.ok()) {
+              ROCKS_LOG_WARN(db_options_.info_log,
+                             "Failed to sync ingested file %s: %s",
+                             path_inside_db.c_str(), status.ToString().c_str());
+            }
+          }
+        }
+      } else if (status.IsNotSupported() &&
+                 ingestion_options_.failed_move_fall_back_to_copy) {
+        // Original file is on a different FS, use copy instead of hard linking.
+        f.copy_file = true;
+        ROCKS_LOG_INFO(db_options_.info_log,
+                       "Tried to link file %s but it's not supported : %s",
+                       path_outside_db.c_str(), status.ToString().c_str());
+      }
+    } else {
+      f.copy_file = true;
+    }
+
+    if (f.copy_file) {
+      TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
+                               nullptr);
+      // CopyFile also syncs the new file.
+      status =
+          CopyFile(fs_.get(), path_outside_db, path_inside_db, 0,
+                   db_options_.use_fsync, io_tracer_, Temperature::kUnknown);
+    }
+    TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
+    if (!status.ok()) {
+      break;
+    }
+    f.internal_file_path = path_inside_db;
+    // Initialize the checksum information of ingested files.
+    f.file_checksum = kUnknownFileChecksum;
+    f.file_checksum_func_name = kUnknownFileChecksumFuncName;
+    ingestion_path_ids.insert(f.fd.GetPathId());
+  }
+
+  TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir");
+  if (status.ok()) {
+    for (auto path_id : ingestion_path_ids) {
+      status = directories_->GetDataDir(path_id)->FsyncWithDirOptions(
+          IOOptions(), nullptr,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "Failed to sync directory %" ROCKSDB_PRIszt
+                       " while ingest file: %s",
+                       path_id, status.ToString().c_str());
+        break;
+      }
+    }
+  }
+  TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir");
+
+  // Generate and check the sst file checksum. Note that, if
+  // IngestExternalFileOptions::write_global_seqno is true, we will not update
+  // the checksum information in the files_to_ingests_ here, since the file is
+  // updated with the new global_seqno. After global_seqno is updated, DB will
+  // generate the new checksum and store it in the Manifest. In all other cases
+  // if ingestion_options_.write_global_seqno == true and
+  // verify_file_checksum is false, we only check the checksum function name.
+  if (status.ok() && db_options_.file_checksum_gen_factory != nullptr) {
+    if (ingestion_options_.verify_file_checksum == false &&
+        files_checksums.size() == files_to_ingest_.size() &&
+        files_checksum_func_names.size() == files_to_ingest_.size()) {
+      // Only when verify_file_checksum == false and the checksum for ingested
+      // files are provided, DB will use the provided checksum and does not
+      // generate the checksum for ingested files.
+      need_generate_file_checksum_ = false;
+    } else {
+      need_generate_file_checksum_ = true;
+    }
+    FileChecksumGenContext gen_context;
+    std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
+        db_options_.file_checksum_gen_factory->CreateFileChecksumGenerator(
+            gen_context);
+    std::vector<std::string> generated_checksums;
+    std::vector<std::string> generated_checksum_func_names;
+    // Step 1: generate the checksum for ingested sst file.
+    if (need_generate_file_checksum_) {
+      for (size_t i = 0; i < files_to_ingest_.size(); i++) {
+        std::string generated_checksum;
+        std::string generated_checksum_func_name;
+        std::string requested_checksum_func_name;
+        // TODO: rate limit file reads for checksum calculation during file
+        // ingestion.
+        IOStatus io_s = GenerateOneFileChecksum(
+            fs_.get(), files_to_ingest_[i].internal_file_path,
+            db_options_.file_checksum_gen_factory.get(),
+            requested_checksum_func_name, &generated_checksum,
+            &generated_checksum_func_name,
+            ingestion_options_.verify_checksums_readahead_size,
+            db_options_.allow_mmap_reads, io_tracer_,
+            db_options_.rate_limiter.get(),
+            Env::IO_TOTAL /* rate_limiter_priority */);
+        if (!io_s.ok()) {
+          status = io_s;
+          ROCKS_LOG_WARN(db_options_.info_log,
+                         "Sst file checksum generation of file: %s failed: %s",
+                         files_to_ingest_[i].internal_file_path.c_str(),
+                         status.ToString().c_str());
+          break;
+        }
+        if (ingestion_options_.write_global_seqno == false) {
+          files_to_ingest_[i].file_checksum = generated_checksum;
+          files_to_ingest_[i].file_checksum_func_name =
+              generated_checksum_func_name;
+        }
+        generated_checksums.push_back(generated_checksum);
+        generated_checksum_func_names.push_back(generated_checksum_func_name);
+      }
+    }
+
+    // Step 2: based on the verify_file_checksum and ingested checksum
+    // information, do the verification.
+    if (status.ok()) {
+      if (files_checksums.size() == files_to_ingest_.size() &&
+          files_checksum_func_names.size() == files_to_ingest_.size()) {
+        // Verify the checksum and checksum function name.
+        if (ingestion_options_.verify_file_checksum) {
+          for (size_t i = 0; i < files_to_ingest_.size(); i++) {
+            if (files_checksum_func_names[i] !=
+                generated_checksum_func_names[i]) {
+              status = Status::InvalidArgument(
+                  "Checksum function name does not match with the checksum "
+                  "function name of this DB");
+              ROCKS_LOG_WARN(
+                  db_options_.info_log,
+                  "Sst file checksum verification of file: %s failed: %s",
+                  external_files_paths[i].c_str(), status.ToString().c_str());
+              break;
+            }
+            if (files_checksums[i] != generated_checksums[i]) {
+              status = Status::Corruption(
+                  "Ingested checksum does not match with the generated "
+                  "checksum");
+              ROCKS_LOG_WARN(
+                  db_options_.info_log,
+                  "Sst file checksum verification of file: %s failed: %s",
+                  files_to_ingest_[i].internal_file_path.c_str(),
+                  status.ToString().c_str());
+              break;
+            }
+          }
+        } else {
+          // If verify_file_checksum is not enabled, we only verify the
+          // checksum function name. If it does not match, fail the ingestion.
+          // If it matches, we trust the ingested checksum information and
+          // store it in the Manifest.
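Condensing the decision the two comments above describe into one predicate may help; the function below uses stand-in types and is an illustration only: checksums are generated by the DB unless full verification is disabled and the caller supplied a complete checksum list.

#include <string>
#include <vector>

// Mirrors need_generate_file_checksum_ above: trust caller-provided checksums
// only when verify_file_checksum is off AND there is exactly one checksum and
// one checksum-function name per ingested file.
bool NeedGenerateChecksum(bool verify_file_checksum, size_t num_files,
                          const std::vector<std::string>& checksums,
                          const std::vector<std::string>& func_names) {
  const bool provided_complete =
      checksums.size() == num_files && func_names.size() == num_files;
  return verify_file_checksum || !provided_complete;
}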
+          for (size_t i = 0; i < files_to_ingest_.size(); i++) {
+            if (files_checksum_func_names[i] != file_checksum_gen->Name()) {
+              status = Status::InvalidArgument(
+                  "Checksum function name does not match with the checksum "
+                  "function name of this DB");
+              ROCKS_LOG_WARN(
+                  db_options_.info_log,
+                  "Sst file checksum verification of file: %s failed: %s",
+                  external_files_paths[i].c_str(), status.ToString().c_str());
+              break;
+            }
+            files_to_ingest_[i].file_checksum = files_checksums[i];
+            files_to_ingest_[i].file_checksum_func_name =
+                files_checksum_func_names[i];
+          }
+        }
+      } else if (files_checksums.size() != files_checksum_func_names.size() ||
+                 (files_checksums.size() == files_checksum_func_names.size() &&
+                  files_checksums.size() != 0)) {
+        // The checksum and checksum function name vectors are not both empty,
+        // yet they are incomplete.
+        status = Status::InvalidArgument(
+            "The checksum information of ingested sst files is nonempty and "
+            "the size of checksums or the size of the checksum function "
+            "names does not match with the number of ingested sst files");
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "The ingested sst files checksum information is incomplete: %s",
+            status.ToString().c_str());
+      }
+    }
+  }
+
+  // TODO: The following is duplicated with Cleanup().
+  if (!status.ok()) {
+    IOOptions io_opts;
+    // We failed, remove all files that we copied into the db
+    for (IngestedFileInfo& f : files_to_ingest_) {
+      if (f.internal_file_path.empty()) {
+        continue;
+      }
+      Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+
+  return status;
+}
+
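Prepare()'s checksum handling boils down to: generate checksums only when they were not supplied or when verification is requested, reject incomplete checksum metadata, then compare values before trusting anything. A standalone sketch of that generate-then-verify flow, with FNV-1a standing in for the pluggable FileChecksumGenerator (all names here are illustrative, not RocksDB's):

    #include <cstdint>
    #include <fstream>
    #include <string>
    #include <vector>

    // Stand-in checksum: FNV-1a over the whole file. Only the
    // generate-then-compare flow is the point here.
    uint64_t FileFnv1a(const std::string& path) {
      std::ifstream in(path, std::ios::binary);
      uint64_t h = 1469598103934665603ull;
      for (char c; in.get(c);) {
        h = (h ^ static_cast<unsigned char>(c)) * 1099511628211ull;
      }
      return h;
    }

    // Returns true iff the provided checksum list is complete and every
    // entry matches the checksum generated from the file on disk.
    bool VerifyAll(const std::vector<std::string>& files,
                   const std::vector<uint64_t>& provided) {
      if (provided.size() != files.size()) return false;  // incomplete info
      for (size_t i = 0; i < files.size(); i++) {
        if (FileFnv1a(files[i]) != provided[i]) return false;  // mismatch
      }
      return true;
    }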
+Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
+                                               SuperVersion* super_version) {
+  autovector<Range> ranges;
+  autovector<std::string> keys;
+  size_t ts_sz = cfd_->user_comparator()->timestamp_size();
+  if (ts_sz) {
+    // Check all ranges [begin, end] inclusively. Add the maximum
+    // timestamp to include all `begin` keys, and add the minimal timestamp to
+    // include all `end` keys.
+    for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+      std::string begin_str;
+      std::string end_str;
+      AppendUserKeyWithMaxTimestamp(
+          &begin_str, file_to_ingest.smallest_internal_key.user_key(), ts_sz);
+      AppendKeyWithMinTimestamp(
+          &end_str, file_to_ingest.largest_internal_key.user_key(), ts_sz);
+      keys.emplace_back(std::move(begin_str));
+      keys.emplace_back(std::move(end_str));
+    }
+    for (size_t i = 0; i < files_to_ingest_.size(); ++i) {
+      ranges.emplace_back(keys[2 * i], keys[2 * i + 1]);
+    }
+  } else {
+    for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+      ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(),
+                          file_to_ingest.largest_internal_key.user_key());
+    }
+  }
+  Status status = cfd_->RangesOverlapWithMemtables(
+      ranges, super_version, db_options_.allow_data_in_errors, flush_needed);
+  if (status.ok() && *flush_needed &&
+      !ingestion_options_.allow_blocking_flush) {
+    status = Status::InvalidArgument("External file requires flush");
+  }
+  return status;
+}
+
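NeedsFlush() pads user keys with synthetic timestamps so the memtable overlap check covers every timestamped version of the boundary keys. A sketch of the padding idea, assuming a fixed 8-byte timestamp suffix (the real encoding and its ordering are defined by the column family's comparator, so treat this as illustration only):

    #include <string>

    // Pad a user key with a synthetic 8-byte timestamp so the padded
    // [begin, end] range covers every timestamped version of the user keys.
    // Sketch only: RocksDB's timestamp encoding is comparator-defined.
    std::string WithTimestamp(const std::string& user_key, bool max_ts) {
      std::string padded = user_key;
      padded.append(8, max_ts ? '\xff' : '\x00');
      return padded;
    }

    // As in NeedsFlush(): `begin` gets the maximum timestamp and `end` the
    // minimum, e.g.
    //   auto begin = WithTimestamp(smallest_user_key, /*max_ts=*/true);
    //   auto end   = WithTimestamp(largest_user_key, /*max_ts=*/false);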
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ExternalSstFileIngestionJob::Run() {
+  Status status;
+  SuperVersion* super_version = cfd_->GetSuperVersion();
+#ifndef NDEBUG
+  // We should never run the job with a memtable that is overlapping
+  // with the files we are ingesting
+  bool need_flush = false;
+  status = NeedsFlush(&need_flush, super_version);
+  if (!status.ok()) {
+    return status;
+  }
+  if (need_flush) {
+    return Status::TryAgain();
+  }
+  assert(status.ok() && need_flush == false);
+#endif
+
+  bool force_global_seqno = false;
+
+  if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) {
+    // We need to assign a global sequence number to all the files even
+    // if they don't overlap with any ranges since we have snapshots
+    force_global_seqno = true;
+  }
+  // It is safe to use this instead of LastAllocatedSequence since we are
+  // the only active writer, and hence they are equal
+  SequenceNumber last_seqno = versions_->LastSequence();
+  edit_.SetColumnFamily(cfd_->GetID());
+  // The levels that the files will be ingested into
+
+  for (IngestedFileInfo& f : files_to_ingest_) {
+    SequenceNumber assigned_seqno = 0;
+    if (ingestion_options_.ingest_behind) {
+      status = CheckLevelForIngestedBehindFile(&f);
+    } else {
+      status = AssignLevelAndSeqnoForIngestedFile(
+          super_version, force_global_seqno,
+          cfd_->ioptions()->compaction_style, last_seqno, &f, &assigned_seqno);
+    }
+
+    // Modify the smallest/largest internal key to include the sequence number
+    // that we just learned. Only overwrite sequence number zero. There could
+    // be a nonzero sequence number already to indicate a range tombstone's
+    // exclusive endpoint.
+    ParsedInternalKey smallest_parsed, largest_parsed;
+    if (status.ok()) {
+      status = ParseInternalKey(*f.smallest_internal_key.rep(),
+                                &smallest_parsed, false /* log_err_key */);
+    }
+    if (status.ok()) {
+      status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed,
+                                false /* log_err_key */);
+    }
+    if (!status.ok()) {
+      return status;
+    }
+    if (smallest_parsed.sequence == 0) {
+      UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno,
+                        smallest_parsed.type);
+    }
+    if (largest_parsed.sequence == 0) {
+      UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno,
+                        largest_parsed.type);
+    }
+
+    status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno);
+    TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
+                             &assigned_seqno);
+    if (assigned_seqno > last_seqno) {
+      assert(assigned_seqno == last_seqno + 1);
+      last_seqno = assigned_seqno;
+      ++consumed_seqno_count_;
+    }
+    if (!status.ok()) {
+      return status;
+    }
+
+    status = GenerateChecksumForIngestedFile(&f);
+    if (!status.ok()) {
+      return status;
+    }
+
+    // We use the import time as the ancestor time. This is the time the data
+    // is written to the database.
+    int64_t temp_current_time = 0;
+    uint64_t current_time = kUnknownFileCreationTime;
+    uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+    if (clock_->GetCurrentTime(&temp_current_time).ok()) {
+      current_time = oldest_ancester_time =
+          static_cast<uint64_t>(temp_current_time);
+    }
+    uint64_t tail_size = 0;
+    bool contain_no_data_blocks = f.table_properties.num_entries > 0 &&
+                                  (f.table_properties.num_entries ==
+                                   f.table_properties.num_range_deletions);
+    if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) {
+      uint64_t file_size = f.fd.GetFileSize();
+      assert(f.table_properties.tail_start_offset <= file_size);
+      tail_size = file_size - f.table_properties.tail_start_offset;
+    }
+
+    FileMetaData f_metadata(
+        f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(),
+        f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
+        f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber,
+        oldest_ancester_time, current_time,
+        ingestion_options_.ingest_behind
+            ? kReservedEpochNumberForFileIngestedBehind
+            : cfd_->NewEpochNumber(),
+        f.file_checksum, f.file_checksum_func_name, f.unique_id, 0, tail_size);
+    f_metadata.temperature = f.file_temperature;
+    edit_.AddFile(f.picked_level, f_metadata);
+  }
+
+  CreateEquivalentFileIngestingCompactions();
+  return status;
+}
+
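Run() can overwrite the sequence number inside the file's smallest/largest internal keys because RocksDB internal keys end in a fixed 64-bit little-endian trailer packing (seqno << 8) | type. A sketch of that trailer rewrite (it assumes a little-endian host for brevity; RocksDB's own EncodeFixed64/UpdateInternalKey handle this portably):

    #include <cstdint>
    #include <cstring>
    #include <string>

    // RocksDB internal keys end in a little-endian u64 trailer packing
    // (seqno << 8) | type. Overwrite the sequence number in place, keeping
    // the value-type byte. Assumes ikey->size() >= 8 and seqno < 2^56.
    void SetTrailerSeqno(std::string* ikey, uint64_t seqno) {
      uint64_t trailer;
      std::memcpy(&trailer, ikey->data() + ikey->size() - 8, 8);
      trailer = (seqno << 8) | (trailer & 0xff);  // keep the type byte
      std::memcpy(&(*ikey)[ikey->size() - 8], &trailer, 8);
    }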
+void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() {
+  // A map from output level to input of compactions equivalent to this
+  // ingestion job.
+  // TODO: simplify the logic below to create one compaction per ingested file
+  // instead of per output level, once we figure out how to treat ingested
+  // files with adjacent range deletion tombstones going to the same output
+  // level in the same job as non-overlapping compactions.
+  std::map<int, CompactionInputFiles>
+      output_level_to_file_ingesting_compaction_input;
+
+  for (const auto& pair : edit_.GetNewFiles()) {
+    int output_level = pair.first;
+    const FileMetaData& f_metadata = pair.second;
+
+    CompactionInputFiles& input =
+        output_level_to_file_ingesting_compaction_input[output_level];
+    if (input.files.empty()) {
+      // Treat the source level of ingested files as level 0
+      input.level = 0;
+    }
+
+    compaction_input_metdatas_.push_back(new FileMetaData(f_metadata));
+    input.files.push_back(compaction_input_metdatas_.back());
+  }
+
+  for (const auto& pair : output_level_to_file_ingesting_compaction_input) {
+    int output_level = pair.first;
+    const CompactionInputFiles& input = pair.second;
+
+    const auto& mutable_cf_options = *(cfd_->GetLatestMutableCFOptions());
+    file_ingesting_compactions_.push_back(new Compaction(
+        cfd_->current()->storage_info(), *cfd_->ioptions(), mutable_cf_options,
+        mutable_db_options_, {input}, output_level,
+        MaxFileSizeForLevel(
+            mutable_cf_options, output_level,
+            cfd_->ioptions()->compaction_style) /* output file size limit,
+                                                 * not applicable */
+        ,
+        LLONG_MAX /* max compaction bytes, not applicable */,
+        0 /* output path ID, not applicable */, mutable_cf_options.compression,
+        mutable_cf_options.compression_opts, Temperature::kUnknown,
+        0 /* max_subcompaction, not applicable */,
+        {} /* grandparents, not applicable */, false /* is manual */,
+        "" /* trim_ts */, -1 /* score, not applicable */,
+        false /* is deletion compaction, not applicable */,
+        files_overlap_ /* l0_files_might_overlap, not applicable */,
+        CompactionReason::kExternalSstIngestion));
+  }
+}
+
+void ExternalSstFileIngestionJob::RegisterRange() {
+  for (const auto& c : file_ingesting_compactions_) {
+    cfd_->compaction_picker()->RegisterCompaction(c);
+  }
+}
+
+void ExternalSstFileIngestionJob::UnregisterRange() {
+  for (const auto& c : file_ingesting_compactions_) {
+    cfd_->compaction_picker()->UnregisterCompaction(c);
+    delete c;
+  }
+  file_ingesting_compactions_.clear();
+
+  for (const auto& f : compaction_input_metdatas_) {
+    delete f;
+  }
+  compaction_input_metdatas_.clear();
+}
+
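CreateEquivalentFileIngestingCompactions() above groups the newly added files by output level before building one Compaction per level. The grouping itself is an ordinary std::map fold; a reduced sketch with placeholder types in place of RocksDB's FileMetaData and CompactionInputFiles:

    #include <map>
    #include <utility>
    #include <vector>

    struct FileMeta { int file_number; };  // placeholder for FileMetaData

    // Group (output_level, file) pairs into per-level compaction inputs, as
    // the ingestion job does before registering range conflicts.
    std::map<int, std::vector<FileMeta>> GroupByOutputLevel(
        const std::vector<std::pair<int, FileMeta>>& new_files) {
      std::map<int, std::vector<FileMeta>> inputs;
      for (const auto& [level, meta] : new_files) {
        inputs[level].push_back(meta);  // source level is treated as L0
      }
      return inputs;
    }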
+void ExternalSstFileIngestionJob::UpdateStats() {
+  // Update internal stats for new ingested files
+  uint64_t total_keys = 0;
+  uint64_t total_l0_files = 0;
+  uint64_t total_time = clock_->NowMicros() - job_start_time_;
+
+  EventLoggerStream stream = event_logger_->Log();
+  stream << "event"
+         << "ingest_finished";
+  stream << "files_ingested";
+  stream.StartArray();
+
+  for (IngestedFileInfo& f : files_to_ingest_) {
+    InternalStats::CompactionStats stats(
+        CompactionReason::kExternalSstIngestion, 1);
+    stats.micros = total_time;
+    // If an actual copy occurred for this file, then we need to count the
+    // file size as the actual bytes written. If the file was linked, then we
+    // ignore the bytes written for file metadata.
+    // TODO (yanqin) maybe account for file metadata bytes for exact accuracy?
+    if (f.copy_file) {
+      stats.bytes_written = f.fd.GetFileSize();
+    } else {
+      stats.bytes_moved = f.fd.GetFileSize();
+    }
+    stats.num_output_files = 1;
+    cfd_->internal_stats()->AddCompactionStats(f.picked_level,
+                                               Env::Priority::USER, stats);
+    cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE,
+                                       f.fd.GetFileSize());
+    total_keys += f.num_entries;
+    if (f.picked_level == 0) {
+      total_l0_files += 1;
+    }
+    ROCKS_LOG_INFO(
+        db_options_.info_log,
+        "[AddFile] External SST file %s was ingested in L%d with path %s "
+        "(global_seqno=%" PRIu64 ")\n",
+        f.external_file_path.c_str(), f.picked_level,
+        f.internal_file_path.c_str(), f.assigned_seqno);
+    stream << "file" << f.internal_file_path << "level" << f.picked_level;
+  }
+  stream.EndArray();
+
+  stream << "lsm_state";
+  stream.StartArray();
+  auto vstorage = cfd_->current()->storage_info();
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    stream << vstorage->NumLevelFiles(level);
+  }
+  stream.EndArray();
+
+  cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL,
+                                     total_keys);
+  cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL,
+                                     files_to_ingest_.size());
+  cfd_->internal_stats()->AddCFStats(
+      InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files);
+}
+
+void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
+  IOOptions io_opts;
+  if (!status.ok()) {
+    // We failed to add the files to the database
+    // remove all the files we copied
+    for (IngestedFileInfo& f : files_to_ingest_) {
+      if (f.internal_file_path.empty()) {
+        continue;
+      }
+      Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+    consumed_seqno_count_ = 0;
+    files_overlap_ = false;
+  } else if (status.ok() && ingestion_options_.move_files) {
+    // The files were moved and added successfully, remove original file links
+    for (IngestedFileInfo& f : files_to_ingest_) {
+      Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "%s was added to DB successfully but failed to remove original "
+            "file link : %s",
+            f.external_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+}
+
+Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
+    const std::string& external_file, uint64_t new_file_number,
+    IngestedFileInfo* file_to_ingest, SuperVersion* sv) {
+  file_to_ingest->external_file_path = external_file;
+
+  // Get external file size
+  Status status = fs_->GetFileSize(external_file, IOOptions(),
+                                   &file_to_ingest->file_size, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Assign FD with number
+  file_to_ingest->fd =
+      FileDescriptor(new_file_number, 0, file_to_ingest->file_size);
+
+  // Create TableReader for external file
+  std::unique_ptr<TableReader> table_reader;
+  std::unique_ptr<FSRandomAccessFile> sst_file;
+  std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+  status = fs_->NewRandomAccessFile(external_file, env_options_, &sst_file,
+                                    nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+  sst_file_reader.reset(new RandomAccessFileReader(
+      std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_));
+
+  status = cfd_->ioptions()->table_factory->NewTableReader(
+      TableReaderOptions(
+          *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
+          env_options_, cfd_->internal_comparator(),
+          sv->mutable_cf_options.block_protection_bytes_per_key,
+          /*skip_filters*/ false, /*immortal*/ false,
+          /*force_direct_prefetch*/ false, /*level*/ -1,
+          /*block_cache_tracer*/ nullptr,
+          /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(),
+          /*cur_file_num*/ new_file_number),
+      std::move(sst_file_reader), file_to_ingest->file_size, &table_reader);
+  if (!status.ok()) {
+    return status;
+  }
+
+  if (ingestion_options_.verify_checksums_before_ingest) {
+    // If customized readahead size is needed, we can pass a user option
+    // all the way to here. Right now we just rely on the default readahead
+    // to keep things simple.
+    // TODO: plumb Env::IOActivity
+    ReadOptions ro;
+    ro.readahead_size = ingestion_options_.verify_checksums_readahead_size;
+    status = table_reader->VerifyChecksum(
+        ro, TableReaderCaller::kExternalSSTIngestion);
+  }
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Get the external file properties
+  auto props = table_reader->GetTableProperties();
+  const auto& uprops = props->user_collected_properties;
+
+  // Get table version
+  auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
+  if (version_iter == uprops.end()) {
+    return Status::Corruption("External file version not found");
+  }
+  file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
+
+  auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
+  if (file_to_ingest->version == 2) {
+    // version 2 implies that we have a global sequence number
+    if (seqno_iter == uprops.end()) {
+      return Status::Corruption(
+          "External file global sequence number not found");
+    }
+
+    // Set the global sequence number
+    file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str());
+    if (props->external_sst_file_global_seqno_offset == 0) {
+      file_to_ingest->global_seqno_offset = 0;
+      return Status::Corruption("Was not able to find file global seqno field");
+    }
+    file_to_ingest->global_seqno_offset =
+        static_cast<size_t>(props->external_sst_file_global_seqno_offset);
+  } else if (file_to_ingest->version == 1) {
+    // SST file V1 should not have a global seqno field
+    assert(seqno_iter == uprops.end());
+    file_to_ingest->original_seqno = 0;
+    if (ingestion_options_.allow_blocking_flush ||
+        ingestion_options_.allow_global_seqno) {
+      return Status::InvalidArgument(
+          "External SST file V1 does not support global seqno");
+    }
+  } else {
+    return Status::InvalidArgument("External file version is not supported");
+  }
+  // Get number of entries in table
+  file_to_ingest->num_entries = props->num_entries;
+  file_to_ingest->num_range_deletions = props->num_range_deletions;
+
+  ParsedInternalKey key;
+  // TODO: plumb Env::IOActivity
+  ReadOptions ro;
+  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+  std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+      table_reader->NewRangeTombstoneIterator(ro));
+
+  // Get first (smallest) and last (largest) key from file.
+  file_to_ingest->smallest_internal_key =
+      InternalKey("", 0, ValueType::kTypeValue);
+  file_to_ingest->largest_internal_key =
+      InternalKey("", 0, ValueType::kTypeValue);
+  bool bounds_set = false;
+  bool allow_data_in_errors = db_options_.allow_data_in_errors;
+  iter->SeekToFirst();
+  if (iter->Valid()) {
+    Status pik_status =
+        ParseInternalKey(iter->key(), &key, allow_data_in_errors);
+    if (!pik_status.ok()) {
+      return Status::Corruption("Corrupted key in external file. ",
", + pik_status.getState()); + } + if (key.sequence != 0) { + return Status::Corruption("External file has non zero sequence number"); + } + file_to_ingest->smallest_internal_key.SetFrom(key); + + iter->SeekToLast(); + pik_status = ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + if (key.sequence != 0) { + return Status::Corruption("External file has non zero sequence number"); + } + file_to_ingest->largest_internal_key.SetFrom(key); + + bounds_set = true; + } + + // We may need to adjust these key bounds, depending on whether any range + // deletion tombstones extend past them. + const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); + if (range_del_iter != nullptr) { + for (range_del_iter->SeekToFirst(); range_del_iter->Valid(); + range_del_iter->Next()) { + Status pik_status = + ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + RangeTombstone tombstone(key, range_del_iter->value()); + + InternalKey start_key = tombstone.SerializeKey(); + if (!bounds_set || + sstableKeyCompare(ucmp, start_key, + file_to_ingest->smallest_internal_key) < 0) { + file_to_ingest->smallest_internal_key = start_key; + } + InternalKey end_key = tombstone.SerializeEndKey(); + if (!bounds_set || + sstableKeyCompare(ucmp, end_key, + file_to_ingest->largest_internal_key) > 0) { + file_to_ingest->largest_internal_key = end_key; + } + bounds_set = true; + } + } + + file_to_ingest->cf_id = static_cast(props->column_family_id); + + file_to_ingest->table_properties = *props; + + auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id, + props->orig_file_number, + &(file_to_ingest->unique_id)); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get SST unique id for file %s", + file_to_ingest->internal_file_path.c_str()); + file_to_ingest->unique_id = kNullUniqueId64x2; + } + + return status; +} + +Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( + SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style, + SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest, + SequenceNumber* assigned_seqno) { + Status status; + *assigned_seqno = 0; + if (force_global_seqno) { + *assigned_seqno = last_seqno + 1; + if (compaction_style == kCompactionStyleUniversal || files_overlap_) { + if (ingestion_options_.fail_if_not_bottommost_level) { + status = Status::TryAgain( + "Files cannot be ingested to Lmax. 
Please make sure key range of " + "Lmax does not overlap with files to ingest."); + return status; + } + file_to_ingest->picked_level = 0; + return status; + } + } + + bool overlap_with_db = false; + Arena arena; + // TODO: plumb Env::IOActivity + ReadOptions ro; + ro.total_order_seek = true; + int target_level = 0; + auto* vstorage = cfd_->current()->storage_info(); + + for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) { + if (lvl > 0 && lvl < vstorage->base_level()) { + continue; + } + if (cfd_->RangeOverlapWithCompaction( + file_to_ingest->smallest_internal_key.user_key(), + file_to_ingest->largest_internal_key.user_key(), lvl)) { + // We must use L0 or any level higher than `lvl` to be able to overwrite + // the compaction output keys that we overlap with in this level, We also + // need to assign this file a seqno to overwrite the compaction output + // keys in level `lvl` + overlap_with_db = true; + break; + } else if (vstorage->NumLevelFiles(lvl) > 0) { + bool overlap_with_level = false; + status = sv->current->OverlapWithLevelIterator( + ro, env_options_, file_to_ingest->smallest_internal_key.user_key(), + file_to_ingest->largest_internal_key.user_key(), lvl, + &overlap_with_level); + if (!status.ok()) { + return status; + } + if (overlap_with_level) { + // We must use L0 or any level higher than `lvl` to be able to overwrite + // the keys that we overlap with in this level, We also need to assign + // this file a seqno to overwrite the existing keys in level `lvl` + overlap_with_db = true; + break; + } + + if (compaction_style == kCompactionStyleUniversal && lvl != 0) { + const std::vector& level_files = + vstorage->LevelFiles(lvl); + const SequenceNumber level_largest_seqno = + (*std::max_element(level_files.begin(), level_files.end(), + [](FileMetaData* f1, FileMetaData* f2) { + return f1->fd.largest_seqno < + f2->fd.largest_seqno; + })) + ->fd.largest_seqno; + // should only assign seqno to current level's largest seqno when + // the file fits + if (level_largest_seqno != 0 && + IngestedFileFitInLevel(file_to_ingest, lvl)) { + *assigned_seqno = level_largest_seqno; + } else { + continue; + } + } + } else if (compaction_style == kCompactionStyleUniversal) { + continue; + } + + // We don't overlap with any keys in this level, but we still need to check + // if our file can fit in it + if (IngestedFileFitInLevel(file_to_ingest, lvl)) { + target_level = lvl; + } + } + // If files overlap, we have to ingest them at level 0 and assign the newest + // sequence number + if (files_overlap_) { + target_level = 0; + *assigned_seqno = last_seqno + 1; + } + + if (ingestion_options_.fail_if_not_bottommost_level && + target_level < cfd_->NumberLevels() - 1) { + status = Status::TryAgain( + "Files cannot be ingested to Lmax. 
Please make sure key range of Lmax " + "and ongoing compaction's output to Lmax" + "does not overlap with files to ingest."); + return status; + } + + TEST_SYNC_POINT_CALLBACK( + "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile", + &overlap_with_db); + file_to_ingest->picked_level = target_level; + if (overlap_with_db && *assigned_seqno == 0) { + *assigned_seqno = last_seqno + 1; + } + return status; +} + +Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile( + IngestedFileInfo* file_to_ingest) { + auto* vstorage = cfd_->current()->storage_info(); + // First, check if new files fit in the bottommost level + int bottom_lvl = cfd_->NumberLevels() - 1; + if (!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) { + return Status::InvalidArgument( + "Can't ingest_behind file as it doesn't fit " + "at the bottommost level!"); + } + + // Second, check if despite allow_ingest_behind=true we still have 0 seqnums + // at some upper level + for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) { + for (auto file : vstorage->LevelFiles(lvl)) { + if (file->fd.smallest_seqno == 0) { + return Status::InvalidArgument( + "Can't ingest_behind file as despite allow_ingest_behind=true " + "there are files with 0 seqno in database at upper levels!"); + } + } + } + + file_to_ingest->picked_level = bottom_lvl; + return Status::OK(); +} + +Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( + IngestedFileInfo* file_to_ingest, SequenceNumber seqno) { + if (file_to_ingest->original_seqno == seqno) { + // This file already have the correct global seqno + return Status::OK(); + } else if (!ingestion_options_.allow_global_seqno) { + return Status::InvalidArgument("Global seqno is required, but disabled"); + } else if (file_to_ingest->global_seqno_offset == 0) { + return Status::InvalidArgument( + "Trying to set global seqno for a file that don't have a global seqno " + "field"); + } + + if (ingestion_options_.write_global_seqno) { + // Determine if we can write global_seqno to a given offset of file. + // If the file system does not support random write, then we should not. + // Otherwise we should. 
+Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
+    IngestedFileInfo* file_to_ingest) {
+  auto* vstorage = cfd_->current()->storage_info();
+  // First, check if new files fit in the bottommost level
+  int bottom_lvl = cfd_->NumberLevels() - 1;
+  if (!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) {
+    return Status::InvalidArgument(
+        "Can't ingest_behind file as it doesn't fit "
+        "at the bottommost level!");
+  }
+
+  // Second, check if despite allow_ingest_behind=true we still have 0 seqnums
+  // at some upper level
+  for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
+    for (auto file : vstorage->LevelFiles(lvl)) {
+      if (file->fd.smallest_seqno == 0) {
+        return Status::InvalidArgument(
+            "Can't ingest_behind file as despite allow_ingest_behind=true "
+            "there are files with 0 seqno in database at upper levels!");
+      }
+    }
+  }
+
+  file_to_ingest->picked_level = bottom_lvl;
+  return Status::OK();
+}
+
+Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
+    IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
+  if (file_to_ingest->original_seqno == seqno) {
+    // This file already has the correct global seqno
+    return Status::OK();
+  } else if (!ingestion_options_.allow_global_seqno) {
+    return Status::InvalidArgument("Global seqno is required, but disabled");
+  } else if (file_to_ingest->global_seqno_offset == 0) {
+    return Status::InvalidArgument(
+        "Trying to set global seqno for a file that doesn't have a global "
+        "seqno field");
+  }
+
+  if (ingestion_options_.write_global_seqno) {
+    // Determine if we can write the global_seqno to the given offset of the
+    // file. If the file system does not support random writes, then we
+    // should not. Otherwise we should.
+    std::unique_ptr<FSRandomRWFile> rwfile;
+    Status status = fs_->NewRandomRWFile(file_to_ingest->internal_file_path,
+                                         env_options_, &rwfile, nullptr);
+    TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::NewRandomRWFile",
+                             &status);
+    if (status.ok()) {
+      FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_,
+                              file_to_ingest->internal_file_path);
+      std::string seqno_val;
+      PutFixed64(&seqno_val, seqno);
+      status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val,
+                            IOOptions(), nullptr);
+      if (status.ok()) {
+        TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno");
+        status = SyncIngestedFile(fsptr.get());
+        TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno");
+        if (!status.ok()) {
+          ROCKS_LOG_WARN(db_options_.info_log,
+                         "Failed to sync ingested file %s after writing "
+                         "global sequence number: %s",
+                         file_to_ingest->internal_file_path.c_str(),
+                         status.ToString().c_str());
+        }
+      }
+      if (!status.ok()) {
+        return status;
+      }
+    } else if (!status.IsNotSupported()) {
+      return status;
+    }
+  }
+
+  file_to_ingest->assigned_seqno = seqno;
+  return Status::OK();
+}
+
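Writing the global seqno in place is just an 8-byte little-endian write at the recorded offset. A sketch of the same fixed64-at-offset update using std::fstream in place of FSRandomRWFile (path and offset are caller-supplied; this mirrors PutFixed64 plus the random write above):

    #include <cstdint>
    #include <fstream>
    #include <string>

    // Encode `seqno` as a little-endian fixed64 and overwrite it at `offset`
    // in an existing file. Returns false on any I/O failure.
    bool WriteGlobalSeqno(const std::string& path, uint64_t offset,
                          uint64_t seqno) {
      std::string buf(8, '\0');
      for (int i = 0; i < 8; i++) {
        buf[i] = static_cast<char>((seqno >> (8 * i)) & 0xff);
      }
      std::fstream f(path, std::ios::in | std::ios::out | std::ios::binary);
      if (!f) return false;
      f.seekp(static_cast<std::streamoff>(offset));
      f.write(buf.data(), 8);
      f.flush();
      return f.good();
    }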
+IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile(
+    IngestedFileInfo* file_to_ingest) {
+  if (db_options_.file_checksum_gen_factory == nullptr ||
+      need_generate_file_checksum_ == false ||
+      ingestion_options_.write_global_seqno == false) {
+    // If file_checksum_gen_factory is not set, we are not able to generate
+    // the checksum. If write_global_seqno is false, it means we will use the
+    // file checksum generated during Prepare(). This step will be skipped.
+    return IOStatus::OK();
+  }
+  std::string file_checksum;
+  std::string file_checksum_func_name;
+  std::string requested_checksum_func_name;
+  // TODO: rate limit file reads for checksum calculation during file
+  // ingestion.
+  IOStatus io_s = GenerateOneFileChecksum(
+      fs_.get(), file_to_ingest->internal_file_path,
+      db_options_.file_checksum_gen_factory.get(),
+      requested_checksum_func_name, &file_checksum, &file_checksum_func_name,
+      ingestion_options_.verify_checksums_readahead_size,
+      db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(),
+      Env::IO_TOTAL /* rate_limiter_priority */);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  file_to_ingest->file_checksum = file_checksum;
+  file_to_ingest->file_checksum_func_name = file_checksum_func_name;
+  return IOStatus::OK();
+}
+
+bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
+    const IngestedFileInfo* file_to_ingest, int level) {
+  if (level == 0) {
+    // Files can always fit in L0
+    return true;
+  }
+
+  auto* vstorage = cfd_->current()->storage_info();
+  Slice file_smallest_user_key(
+      file_to_ingest->smallest_internal_key.user_key());
+  Slice file_largest_user_key(file_to_ingest->largest_internal_key.user_key());
+
+  if (vstorage->OverlapInLevel(level, &file_smallest_user_key,
+                               &file_largest_user_key)) {
+    // File overlaps with other files in this level, we cannot
+    // add it to this level
+    return false;
+  }
+
+  // File did not overlap with level files, nor compaction output
+  return true;
+}
+
+template <typename TWritableFile>
+Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
+  assert(file != nullptr);
+  if (db_options_.use_fsync) {
+    return file->Fsync(IOOptions(), nullptr);
+  } else {
+    return file->Sync(IOOptions(), nullptr);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/external_sst_file_ingestion_job.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/external_sst_file_ingestion_job.h
new file mode 100644
index 0000000000..49bb1e31e5
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/external_sst_file_ingestion_job.h
@@ -0,0 +1,238 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/internal_stats.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "env/file_system_tracer.h"
+#include "logging/event_logger.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Directories;
+class SystemClock;
+
+struct IngestedFileInfo {
+  // External file path
+  std::string external_file_path;
+  // Smallest internal key in external file
+  InternalKey smallest_internal_key;
+  // Largest internal key in external file
+  InternalKey largest_internal_key;
+  // Sequence number for keys in external file
+  SequenceNumber original_seqno;
+  // Offset of the global sequence number field in the file, will
+  // be zero if version is 1 (global seqno is not supported)
+  size_t global_seqno_offset;
+  // External file size
+  uint64_t file_size;
+  // total number of keys in external file
+  uint64_t num_entries;
+  // total number of range deletions in external file
+  uint64_t num_range_deletions;
+  // Id of column family this file should be ingested into
+  uint32_t cf_id;
+  // TableProperties read from external file
+  TableProperties table_properties;
+  // Version of external file
+  int version;
+
+  // FileDescriptor for the file inside the DB
+  FileDescriptor fd;
+  // file path that we picked for file inside the DB
+  std::string internal_file_path;
+  // Global sequence number that we picked for the file inside the DB
+  SequenceNumber assigned_seqno = 0;
+  // Level inside the DB we picked for the external file.
+  int picked_level = 0;
+  // Whether to copy or link the external sst file. copy_file will be set to
+  // false if ingestion_options.move_files is true and the underlying FS
+  // supports the link operation. A default value is needed to keep the
+  // undefined-behavior sanity check of llvm happy. Since
+  // ingestion_options.move_files is false by default, copy_file is true
+  // by default.
+  bool copy_file = true;
+  // The checksum of the ingested file
+  std::string file_checksum;
+  // The name of the checksum function that generated the checksum
+  std::string file_checksum_func_name;
+  // The temperature of the file to be ingested
+  Temperature file_temperature = Temperature::kUnknown;
+  // Unique id of the file to be ingested
+  UniqueId64x2 unique_id{};
+};
+
+class ExternalSstFileIngestionJob {
+ public:
+  ExternalSstFileIngestionJob(
+      VersionSet* versions, ColumnFamilyData* cfd,
+      const ImmutableDBOptions& db_options,
+      const MutableDBOptions& mutable_db_options,
+      const EnvOptions& env_options, SnapshotList* db_snapshots,
+      const IngestExternalFileOptions& ingestion_options,
+      Directories* directories, EventLogger* event_logger,
+      const std::shared_ptr<IOTracer>& io_tracer)
+      : clock_(db_options.clock),
+        fs_(db_options.fs, io_tracer),
+        versions_(versions),
+        cfd_(cfd),
+        db_options_(db_options),
+        mutable_db_options_(mutable_db_options),
+        env_options_(env_options),
+        db_snapshots_(db_snapshots),
+        ingestion_options_(ingestion_options),
+        directories_(directories),
+        event_logger_(event_logger),
+        job_start_time_(clock_->NowMicros()),
+        consumed_seqno_count_(0),
+        io_tracer_(io_tracer) {
+    assert(directories != nullptr);
+  }
+
+  ~ExternalSstFileIngestionJob() {
+    for (const auto& c : file_ingesting_compactions_) {
+      cfd_->compaction_picker()->UnregisterCompaction(c);
+      delete c;
+    }
+
+    for (const auto& f : compaction_input_metdatas_) {
+      delete f;
+    }
+  }
+
+  // Prepare the job by copying external files into the DB.
+  Status Prepare(const std::vector<std::string>& external_files_paths,
+                 const std::vector<std::string>& files_checksums,
+                 const std::vector<std::string>& files_checksum_func_names,
+                 const Temperature& file_temperature,
+                 uint64_t next_file_number, SuperVersion* sv);
+
+  // Check if we need to flush the memtable before running the ingestion job.
+  // This will be true if the files we are ingesting are overlapping with any
+  // key range in the memtable.
+  //
+  // @param super_version A referenced SuperVersion that will be held for the
+  //    duration of this function.
+  //
+  // Thread-safe
+  Status NeedsFlush(bool* flush_needed, SuperVersion* super_version);
+
+  // Will execute the ingestion job and prepare edit() to be applied.
+  // REQUIRES: Mutex held
+  Status Run();
+
+  // Register key range involved in this ingestion job
+  // to prevent key range conflict with other ongoing compaction/file ingestion
+  // REQUIRES: Mutex held
+  void RegisterRange();
+
+  // Unregister key range registered for this ingestion job
+  // REQUIRES: Mutex held
+  void UnregisterRange();
+
+  // Update column family stats.
+  // REQUIRES: Mutex held
+  void UpdateStats();
+
+  // Cleanup after successful/failed job
+  void Cleanup(const Status& status);
+
+  VersionEdit* edit() { return &edit_; }
+
+  const autovector<IngestedFileInfo>& files_to_ingest() const {
+    return files_to_ingest_;
+  }
+
+  // How many sequence numbers did we consume as part of the ingest job?
+  int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; }
+
+ private:
+  // Open the external file and populate `file_to_ingest` with all the
+  // external information we need to ingest this file.
+  Status GetIngestedFileInfo(const std::string& external_file,
+                             uint64_t new_file_number,
+                             IngestedFileInfo* file_to_ingest,
+                             SuperVersion* sv);
+
+  // Assign `file_to_ingest` the appropriate sequence number and the lowest
+  // possible level that it can be ingested into according to
+  // compaction_style.
+  // REQUIRES: Mutex held
+  Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv,
+                                            bool force_global_seqno,
+                                            CompactionStyle compaction_style,
+                                            SequenceNumber last_seqno,
+                                            IngestedFileInfo* file_to_ingest,
+                                            SequenceNumber* assigned_seqno);
+
+  // A file that we want to ingest behind always goes to the lowest level;
+  // we just check that it fits in the level, that the DB allows
+  // ingest_behind, and that we don't have 0 seqnums at the upper levels.
+  // REQUIRES: Mutex held
+  Status CheckLevelForIngestedBehindFile(IngestedFileInfo* file_to_ingest);
+
+  // Set the file global sequence number to `seqno`
+  Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest,
+                                          SequenceNumber seqno);
+  // Generate the file checksum and store it in the IngestedFileInfo
+  IOStatus GenerateChecksumForIngestedFile(IngestedFileInfo* file_to_ingest);
+
+  // Check if `file_to_ingest` can fit in level `level`
+  // REQUIRES: Mutex held
+  bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest,
+                              int level);
+
+  // Helper method to sync the given file.
+  template <typename TWritableFile>
+  Status SyncIngestedFile(TWritableFile* file);
+
+  // Create `Compaction` objects equivalent to this file ingestion job,
+  // which will be used to check range conflicts with other ongoing
+  // compactions.
+  void CreateEquivalentFileIngestingCompactions();
+
+  SystemClock* clock_;
+  FileSystemPtr fs_;
+  VersionSet* versions_;
+  ColumnFamilyData* cfd_;
+  const ImmutableDBOptions& db_options_;
+  const MutableDBOptions& mutable_db_options_;
+  const EnvOptions& env_options_;
+  SnapshotList* db_snapshots_;
+  autovector<IngestedFileInfo> files_to_ingest_;
+  const IngestExternalFileOptions& ingestion_options_;
+  Directories* directories_;
+  EventLogger* event_logger_;
+  VersionEdit edit_;
+  uint64_t job_start_time_;
+  int consumed_seqno_count_;
+  // Set in ExternalSstFileIngestionJob::Prepare(); if true, all files are
+  // ingested into L0
+  bool files_overlap_{false};
+  // Set in ExternalSstFileIngestionJob::Prepare(); if true and the DB's
+  // file_checksum_gen_factory is set, DB will generate a checksum for each
+  // file.
+  bool need_generate_file_checksum_{true};
+  std::shared_ptr<IOTracer> io_tracer_;
+
+  // Below are variables used in (un)registering range for this ingestion job
+  //
+  // FileMetaData used in inputs of compactions equivalent to this ingestion
+  // job
+  std::vector<FileMetaData*> compaction_input_metdatas_;
+  // Compactions equivalent to this ingestion job
+  std::vector<Compaction*> file_ingesting_compactions_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
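For orientation, the job declared above is the engine behind RocksDB's public ingestion API: a typical caller builds an external SST with SstFileWriter (keys added in sorted order) and hands it to DB::IngestExternalFile. A sketch with hypothetical paths and status checks elided for brevity:

    #include <rocksdb/db.h>
    #include <rocksdb/options.h>
    #include <rocksdb/sst_file_writer.h>

    using namespace ROCKSDB_NAMESPACE;

    int main() {
      Options options;
      options.create_if_missing = true;

      // 1. Build an external SST (keys must be added in sorted order).
      SstFileWriter writer(EnvOptions(), options);
      writer.Open("/tmp/external.sst");
      writer.Put("k1", "v1");
      writer.Put("k2", "v2");
      writer.Finish();

      // 2. Ingest it; move_files enables the link-instead-of-copy path that
      // Prepare() implements above.
      DB* db = nullptr;
      DB::Open(options, "/tmp/ingest_demo_db", &db);
      IngestExternalFileOptions ifo;
      ifo.move_files = true;
      db->IngestExternalFile({"/tmp/external.sst"}, ifo);
      delete db;
    }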
+ +#include "db/file_indexer.h" + +#include +#include + +#include "db/version_edit.h" +#include "rocksdb/comparator.h" + +namespace ROCKSDB_NAMESPACE { + +FileIndexer::FileIndexer(const Comparator* ucmp) + : num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {} + +size_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); } + +size_t FileIndexer::LevelIndexSize(size_t level) const { + if (level >= next_level_index_.size()) { + return 0; + } + return next_level_index_[level].num_index; +} + +void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index, + const int cmp_smallest, + const int cmp_largest, int32_t* left_bound, + int32_t* right_bound) const { + assert(level > 0); + + // Last level, no hint + if (level == num_levels_ - 1) { + *left_bound = 0; + *right_bound = -1; + return; + } + + assert(level < num_levels_ - 1); + assert(static_cast(file_index) <= level_rb_[level]); + + const IndexUnit* index_units = next_level_index_[level].index_units; + const auto& index = index_units[file_index]; + + if (cmp_smallest < 0) { + *left_bound = (level > 0 && file_index > 0) + ? index_units[file_index - 1].largest_lb + : 0; + *right_bound = index.smallest_rb; + } else if (cmp_smallest == 0) { + *left_bound = index.smallest_lb; + *right_bound = index.smallest_rb; + } else if (cmp_smallest > 0 && cmp_largest < 0) { + *left_bound = index.smallest_lb; + *right_bound = index.largest_rb; + } else if (cmp_largest == 0) { + *left_bound = index.largest_lb; + *right_bound = index.largest_rb; + } else if (cmp_largest > 0) { + *left_bound = index.largest_lb; + *right_bound = level_rb_[level + 1]; + } else { + assert(false); + } + + assert(*left_bound >= 0); + assert(*left_bound <= *right_bound + 1); + assert(*right_bound <= level_rb_[level + 1]); +} + +void FileIndexer::UpdateIndex(Arena* arena, const size_t num_levels, + std::vector* const files) { + if (files == nullptr) { + return; + } + if (num_levels == 0) { // uint_32 0-1 would cause bad behavior + num_levels_ = num_levels; + return; + } + assert(level_rb_ == nullptr); // level_rb_ should be init here + + num_levels_ = num_levels; + next_level_index_.resize(num_levels); + + char* mem = arena->AllocateAligned(num_levels_ * sizeof(int32_t)); + level_rb_ = new (mem) int32_t[num_levels_]; + for (size_t i = 0; i < num_levels_; i++) { + level_rb_[i] = -1; + } + + // L1 - Ln-1 + for (size_t level = 1; level < num_levels_ - 1; ++level) { + const auto& upper_files = files[level]; + const int32_t upper_size = static_cast(upper_files.size()); + const auto& lower_files = files[level + 1]; + level_rb_[level] = static_cast(upper_files.size()) - 1; + if (upper_size == 0) { + continue; + } + IndexLevel& index_level = next_level_index_[level]; + index_level.num_index = upper_size; + mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit)); + index_level.index_units = new (mem) IndexUnit[upper_size]; + + CalculateLB( + upper_files, lower_files, &index_level, + [this](const FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(), + b->largest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { index->smallest_lb = f_idx; }); + CalculateLB( + upper_files, lower_files, &index_level, + [this](const FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->CompareWithoutTimestamp(a->largest.user_key(), + b->largest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { index->largest_lb = f_idx; }); + CalculateRB( + upper_files, lower_files, &index_level, + 
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(),
+                                                b->smallest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) { index->smallest_rb = f_idx; });
+    CalculateRB(
+        upper_files, lower_files, &index_level,
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->CompareWithoutTimestamp(a->largest.user_key(),
+                                                b->smallest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) { index->largest_rb = f_idx; });
+  }
+
+  level_rb_[num_levels_ - 1] =
+      static_cast<int32_t>(files[num_levels_ - 1].size()) - 1;
+}
+
+void FileIndexer::CalculateLB(
+    const std::vector<FileMetaData*>& upper_files,
+    const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+    std::function<void(IndexUnit*, int32_t)> set_index) {
+  const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+  const int32_t lower_size = static_cast<int32_t>(lower_files.size());
+  int32_t upper_idx = 0;
+  int32_t lower_idx = 0;
+
+  IndexUnit* index = index_level->index_units;
+  while (upper_idx < upper_size && lower_idx < lower_size) {
+    int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+    if (cmp == 0) {
+      set_index(&index[upper_idx], lower_idx);
+      ++upper_idx;
+    } else if (cmp > 0) {
+      // Lower level's file (largest) is smaller, a key won't hit in that
+      // file. Move to next lower file
+      ++lower_idx;
+    } else {
+      // Lower level's file becomes larger, update the index, and
+      // move to the next upper file
+      set_index(&index[upper_idx], lower_idx);
+      ++upper_idx;
+    }
+  }
+
+  while (upper_idx < upper_size) {
+    // Lower files are exhausted, that means the remaining upper files are
+    // greater than any lower files. Set the index to be the lower level size.
+    set_index(&index[upper_idx], lower_size);
+    ++upper_idx;
+  }
+}
+
+void FileIndexer::CalculateRB(
+    const std::vector<FileMetaData*>& upper_files,
+    const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+    std::function<void(IndexUnit*, int32_t)> set_index) {
+  const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+  const int32_t lower_size = static_cast<int32_t>(lower_files.size());
+  int32_t upper_idx = upper_size - 1;
+  int32_t lower_idx = lower_size - 1;
+
+  IndexUnit* index = index_level->index_units;
+  while (upper_idx >= 0 && lower_idx >= 0) {
+    int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+    if (cmp == 0) {
+      set_index(&index[upper_idx], lower_idx);
+      --upper_idx;
+    } else if (cmp < 0) {
+      // Lower level's file (smallest) is larger, a key won't hit in that
+      // file. Move to next lower file.
+      --lower_idx;
+    } else {
+      // Lower level's file becomes smaller, update the index, and move to
+      // the next upper file
+      set_index(&index[upper_idx], lower_idx);
+      --upper_idx;
+    }
+  }
+  while (upper_idx >= 0) {
+    // Lower files are exhausted, that means the remaining upper files are
+    // smaller than any lower files. Set it to -1.
+    set_index(&index[upper_idx], -1);
+    --upper_idx;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/file_indexer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/file_indexer.h
new file mode 100644
index 0000000000..45cb136150
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/file_indexer.h
@@ -0,0 +1,140 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <vector>
+
+#include "memory/arena.h"
+#include "port/port.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+struct FileMetaData;
+struct FdWithKeyRange;
+struct FileLevel;
+
+// The file tree structure in Version is prebuilt and the range of each file
+// is known. On Version::Get(), it uses binary search to find a potential file
+// and then checks if a target key can be found in the file by comparing the
+// key to each file's smallest and largest key. The results of these
+// comparisons can be reused beyond checking if a key falls into a file's
+// range. With some pre-calculated knowledge, each key comparison that has
+// been done can serve as a hint to narrow down further searches: if a key
+// compares smaller than a file's smallest or largest key, that comparison can
+// be used to find the right bound of the next binary search. Similarly, if a
+// key compares larger than a file's smallest or largest key, it can be
+// utilized to find the left bound of the next binary search.
+// With these hints, the range of a binary search can be greatly reduced,
+// especially for bottom levels, given that one file most likely overlaps with
+// only N files from the level below (where N is
+// max_bytes_for_level_multiplier). So on level L, we will only look at ~N
+// files instead of N^L files with the naive approach.
+class FileIndexer {
+ public:
+  explicit FileIndexer(const Comparator* ucmp);
+
+  size_t NumLevelIndex() const;
+
+  size_t LevelIndexSize(size_t level) const;
+
+  // Return a file index range in the next level to search for a key based on
+  // smallest and largest key comparisons for the current file specified by
+  // level and file_index. When *left_index < *right_index, both indices
+  // should be valid and fit in the vector size.
+  void GetNextLevelIndex(const size_t level, const size_t file_index,
+                         const int cmp_smallest, const int cmp_largest,
+                         int32_t* left_bound, int32_t* right_bound) const;
+
+  void UpdateIndex(Arena* arena, const size_t num_levels,
+                   std::vector<FileMetaData*>* const files);
+
+  enum { kLevelMaxIndex = std::numeric_limits<int32_t>::max() };
+
+ private:
+  size_t num_levels_;
+  const Comparator* ucmp_;
+
+  struct IndexUnit {
+    IndexUnit()
+        : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {}
+    // During file search, a key is compared against smallest and largest
+    // from a FileMetaData. It can have 3 possible outcomes:
+    // (1) key is smaller than smallest, implying it is also smaller than
+    //     largest. Precalculated index based on "smallest < smallest" can
+    //     be used to provide a right bound.
+    // (2) key is in between smallest and largest.
+    //     Precalculated index based on "smallest > largest" can be used to
+    //     provide a left bound.
+    //     Precalculated index based on "largest < smallest" can be used to
+    //     provide a right bound.
+    // (3) key is larger than largest, implying it is also larger than
+    //     smallest. Precalculated index based on "largest > largest" can be
+    //     used to provide a left bound.
+    //
+    // As a result, we will need to do:
+    // Compare smallest (<=) and largest keys from upper level file with
+    // smallest key from lower level to get a right bound.
+    // Compare smallest (>=) and largest keys from upper level file with
+    // largest key from lower level to get a left bound.
+    //
+    // Example:
+    //    level 1:              [50 - 60]
+    //    level 2:        [1 - 40], [45 - 55], [58 - 80]
+    // A key 35, compared to be less than 50, 3rd file on level 2 can be
+    // skipped according to rule (1). LB = 0, RB = 1.
+    // A key 53, sits in the middle of 50 and 60. 1st file on level 2 can be
+    // skipped according to rule (2), but the 3rd file cannot be skipped
+    // because 60 is greater than 58. LB = 1, RB = 2.
+    // A key 70, compared to be larger than 60. 1st and 2nd files can be
+    // skipped according to rule (3). LB = 2, RB = 2.
+    //
+    // Point to a left most file in a lower level that may contain a key,
+    // which compares greater than smallest of a FileMetaData (upper level)
+    int32_t smallest_lb;
+    // Point to a left most file in a lower level that may contain a key,
+    // which compares greater than largest of a FileMetaData (upper level)
+    int32_t largest_lb;
+    // Point to a right most file in a lower level that may contain a key,
+    // which compares smaller than smallest of a FileMetaData (upper level)
+    int32_t smallest_rb;
+    // Point to a right most file in a lower level that may contain a key,
+    // which compares smaller than largest of a FileMetaData (upper level)
+    int32_t largest_rb;
+  };
+
+  // Data structure to store IndexUnits in a whole level
+  struct IndexLevel {
+    size_t num_index;
+    IndexUnit* index_units;
+
+    IndexLevel() : num_index(0), index_units(nullptr) {}
+  };
+
+  void CalculateLB(
+      const std::vector<FileMetaData*>& upper_files,
+      const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+      std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+      std::function<void(IndexUnit*, int32_t)> set_index);
+
+  void CalculateRB(
+      const std::vector<FileMetaData*>& upper_files,
+      const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+      std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+      std::function<void(IndexUnit*, int32_t)> set_index);
+
+  autovector<IndexLevel> next_level_index_;
+  int32_t* level_rb_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
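The two-pointer sweeps in CalculateLB()/CalculateRB() and the worked example in the header comment can be checked with a small standalone model. This sketch computes, for each upper-level file, the leftmost lower-level file whose largest key is not smaller than the upper file's smallest key (integers stand in for user keys):

    #include <iostream>
    #include <utility>
    #include <vector>

    using Range = std::pair<int, int>;  // [smallest, largest], sorted, disjoint

    // Two-pointer sweep in the spirit of CalculateLB: for every upper file,
    // the index of the leftmost lower file with lower.largest >= upper.smallest.
    std::vector<int> SmallestLB(const std::vector<Range>& upper,
                                const std::vector<Range>& lower) {
      std::vector<int> lb(upper.size());
      size_t u = 0, l = 0;
      while (u < upper.size() && l < lower.size()) {
        if (lower[l].second < upper[u].first) {
          ++l;  // this lower file ends before the upper file begins
        } else {
          lb[u++] = static_cast<int>(l);  // first candidate found
        }
      }
      while (u < upper.size()) lb[u++] = static_cast<int>(lower.size());
      return lb;
    }

    int main() {
      // Mirrors the example in file_indexer.h: level 1 vs level 2.
      std::vector<Range> upper{{50, 60}};
      std::vector<Range> lower{{1, 40}, {45, 55}, {58, 80}};
      std::cout << SmallestLB(upper, lower)[0] << "\n";  // prints 1
    }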
+ +#include "db/flush_job.h" + +#include +#include +#include + +#include "db/builder.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/event_helpers.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/memtable_list.h" +#include "db/merge_context.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/version_edit.h" +#include "db/version_set.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "logging/event_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/thread_status_util.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "table/merging_iterator.h" +#include "table/table_builder.h" +#include "table/two_level_iterator.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/mutexlock.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +const char* GetFlushReasonString(FlushReason flush_reason) { + switch (flush_reason) { + case FlushReason::kOthers: + return "Other Reasons"; + case FlushReason::kGetLiveFiles: + return "Get Live Files"; + case FlushReason::kShutDown: + return "Shut down"; + case FlushReason::kExternalFileIngestion: + return "External File Ingestion"; + case FlushReason::kManualCompaction: + return "Manual Compaction"; + case FlushReason::kWriteBufferManager: + return "Write Buffer Manager"; + case FlushReason::kWriteBufferFull: + return "Write Buffer Full"; + case FlushReason::kTest: + return "Test"; + case FlushReason::kDeleteFiles: + return "Delete Files"; + case FlushReason::kAutoCompaction: + return "Auto Compaction"; + case FlushReason::kManualFlush: + return "Manual Flush"; + case FlushReason::kErrorRecovery: + return "Error Recovery"; + case FlushReason::kWalFull: + return "WAL Full"; + default: + return "Invalid"; + } +} + +FlushJob::FlushJob( + const std::string& dbname, ColumnFamilyData* cfd, + const ImmutableDBOptions& db_options, + const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, + const FileOptions& file_options, VersionSet* versions, + InstrumentedMutex* db_mutex, std::atomic* shutting_down, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SnapshotChecker* snapshot_checker, JobContext* job_context, + FlushReason flush_reason, LogBuffer* log_buffer, FSDirectory* db_directory, + FSDirectory* output_file_directory, CompressionType output_compression, + Statistics* stats, EventLogger* event_logger, bool measure_io_stats, + const bool sync_output_directory, const bool write_manifest, + Env::Priority thread_pri, const std::shared_ptr& io_tracer, + const SeqnoToTimeMapping& seqno_time_mapping, const std::string& db_id, + const std::string& db_session_id, std::string full_history_ts_low, + BlobFileCompletionCallback* blob_callback) + : dbname_(dbname), + db_id_(db_id), + db_session_id_(db_session_id), + cfd_(cfd), + db_options_(db_options), + mutable_cf_options_(mutable_cf_options), + max_memtable_id_(max_memtable_id), + file_options_(file_options), + versions_(versions), + db_mutex_(db_mutex), + shutting_down_(shutting_down), + existing_snapshots_(std::move(existing_snapshots)), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), + snapshot_checker_(snapshot_checker), + job_context_(job_context), + 
+      flush_reason_(flush_reason),
+      log_buffer_(log_buffer),
+      db_directory_(db_directory),
+      output_file_directory_(output_file_directory),
+      output_compression_(output_compression),
+      stats_(stats),
+      event_logger_(event_logger),
+      measure_io_stats_(measure_io_stats),
+      sync_output_directory_(sync_output_directory),
+      write_manifest_(write_manifest),
+      edit_(nullptr),
+      base_(nullptr),
+      pick_memtable_called(false),
+      thread_pri_(thread_pri),
+      io_tracer_(io_tracer),
+      clock_(db_options_.clock),
+      full_history_ts_low_(std::move(full_history_ts_low)),
+      blob_callback_(blob_callback),
+      db_impl_seqno_time_mapping_(seqno_time_mapping) {
+  // Update the thread status to indicate flush.
+  ReportStartedFlush();
+  TEST_SYNC_POINT("FlushJob::FlushJob()");
+}
+
+FlushJob::~FlushJob() { ThreadStatusUtil::ResetThreadStatus(); }
+
+void FlushJob::ReportStartedFlush() {
+  ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking);
+  ThreadStatusUtil::SetColumnFamily(cfd_);
+  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH);
+  ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
+                                               job_context_->job_id);
+
+  IOSTATS_RESET(bytes_written);
+}
+
+void FlushJob::ReportFlushInputSize(const autovector<MemTable*>& mems) {
+  uint64_t input_size = 0;
+  for (auto* mem : mems) {
+    input_size += mem->ApproximateMemoryUsage();
+  }
+  ThreadStatusUtil::IncreaseThreadOperationProperty(
+      ThreadStatus::FLUSH_BYTES_MEMTABLES, input_size);
+}
+
+void FlushJob::RecordFlushIOStats() {
+  RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written));
+  ThreadStatusUtil::IncreaseThreadOperationProperty(
+      ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
+  IOSTATS_RESET(bytes_written);
+}
+void FlushJob::PickMemTable() {
+  db_mutex_->AssertHeld();
+  assert(!pick_memtable_called);
+  pick_memtable_called = true;
+
+  // Maximum "NextLogNumber" of the memtables to flush.
+  // When the mempurge feature is turned off, this variable is useless
+  // because the memtables are implicitly sorted by increasing order of
+  // creation time. Therefore mems_->back()->GetNextLogNumber() is already
+  // equal to max_next_log_number. However when mempurge is on, the memtables
+  // are no longer sorted by increasing order of creation time. Therefore this
+  // variable becomes necessary because mems_->back()->GetNextLogNumber() is
+  // no longer necessarily equal to max_next_log_number.
+  uint64_t max_next_log_number = 0;
+
+  // Save the contents of the earliest memtable as a new Table
+  cfd_->imm()->PickMemtablesToFlush(max_memtable_id_, &mems_,
+                                    &max_next_log_number);
+  if (mems_.empty()) {
+    return;
+  }
+
+  ReportFlushInputSize(mems_);
+
+  // The memtables in mems_ are (implicitly) sorted in ascending order by
+  // their creation time. We will use the first memtable's `edit` to keep the
+  // meta info for this flush.
+  MemTable* m = mems_[0];
+  edit_ = m->GetEdits();
+  edit_->SetPrevLogNumber(0);
+  // SetLogNumber(log_num) indicates logs with number smaller than log_num
+  // will no longer be picked up for recovery.
+  edit_->SetLogNumber(max_next_log_number);
+  edit_->SetColumnFamily(cfd_->GetID());
+
+  // path 0 for level 0 file.
+ meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); + meta_.epoch_number = cfd_->NewEpochNumber(); + + base_ = cfd_->current(); + base_->Ref(); // it is likely that we do not need this reference +} + +Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, + bool* switched_to_mempurge) { + TEST_SYNC_POINT("FlushJob::Start"); + db_mutex_->AssertHeld(); + assert(pick_memtable_called); + // Mempurge threshold can be dynamically changed. + // For sake of consistency, mempurge_threshold is + // saved locally to maintain consistency in each + // FlushJob::Run call. + double mempurge_threshold = + mutable_cf_options_.experimental_mempurge_threshold; + + AutoThreadOperationStageUpdater stage_run(ThreadStatus::STAGE_FLUSH_RUN); + if (mems_.empty()) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush", + cfd_->GetName().c_str()); + return Status::OK(); + } + + // I/O measurement variables + PerfLevel prev_perf_level = PerfLevel::kEnableTime; + uint64_t prev_write_nanos = 0; + uint64_t prev_fsync_nanos = 0; + uint64_t prev_range_sync_nanos = 0; + uint64_t prev_prepare_write_nanos = 0; + uint64_t prev_cpu_write_nanos = 0; + uint64_t prev_cpu_read_nanos = 0; + if (measure_io_stats_) { + prev_perf_level = GetPerfLevel(); + SetPerfLevel(PerfLevel::kEnableTime); + prev_write_nanos = IOSTATS(write_nanos); + prev_fsync_nanos = IOSTATS(fsync_nanos); + prev_range_sync_nanos = IOSTATS(range_sync_nanos); + prev_prepare_write_nanos = IOSTATS(prepare_write_nanos); + prev_cpu_write_nanos = IOSTATS(cpu_write_nanos); + prev_cpu_read_nanos = IOSTATS(cpu_read_nanos); + } + Status mempurge_s = Status::NotFound("No MemPurge."); + if ((mempurge_threshold > 0.0) && + (flush_reason_ == FlushReason::kWriteBufferFull) && (!mems_.empty()) && + MemPurgeDecider(mempurge_threshold) && !(db_options_.atomic_flush)) { + cfd_->SetMempurgeUsed(); + mempurge_s = MemPurge(); + if (!mempurge_s.ok()) { + // Mempurge is typically aborted when the output + // bytes cannot be contained onto a single output memtable. + if (mempurge_s.IsAborted()) { + ROCKS_LOG_INFO(db_options_.info_log, "Mempurge process aborted: %s\n", + mempurge_s.ToString().c_str()); + } else { + // However the mempurge process can also fail for + // other reasons (eg: new_mem->Add() fails). + ROCKS_LOG_WARN(db_options_.info_log, "Mempurge process failed: %s\n", + mempurge_s.ToString().c_str()); + } + } else { + if (switched_to_mempurge) { + *switched_to_mempurge = true; + } else { + // The mempurge process was successful, but no switch_to_mempurge + // pointer provided so no way to propagate the state of flush job. + ROCKS_LOG_WARN(db_options_.info_log, + "Mempurge process succeeded" + "but no 'switched_to_mempurge' ptr provided.\n"); + } + } + } + Status s; + if (mempurge_s.ok()) { + base_->Unref(); + s = Status::OK(); + } else { + // This will release and re-acquire the mutex. 
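For context (illustrative, not part of this patch): the gate above only fires for FlushReason::kWriteBufferFull flushes when the user has opted in through the column-family option mirrored by mutable_cf_options_.experimental_mempurge_threshold, and only when atomic flush is off. A minimal sketch, assuming the public RocksDB options API:

    #include "rocksdb/options.h"

    rocksdb::Options MempurgeEnabledOptions() {
      rocksdb::Options options;
      // A value > 0.0 opts in to MemPurgeDecider(); see the condition above.
      options.experimental_mempurge_threshold = 1.0;
      // Mempurge is skipped entirely when atomic flush is enabled.
      options.atomic_flush = false;  // the default
      return options;
    }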
+ s = WriteLevel0Table(); + } + + if (s.ok() && cfd_->IsDropped()) { + s = Status::ColumnFamilyDropped("Column family dropped during compaction"); + } + if ((s.ok() || s.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_acquire)) { + s = Status::ShutdownInProgress("Database shutdown"); + } + + if (!s.ok()) { + cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber()); + } else if (write_manifest_) { + TEST_SYNC_POINT("FlushJob::InstallResults"); + // Replace immutable memtable with the generated Table + s = cfd_->imm()->TryInstallMemtableFlushResults( + cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_, + meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_, + log_buffer_, &committed_flush_jobs_info_, + !(mempurge_s.ok()) /* write_edit : true if no mempurge happened (or if aborted), + but 'false' if mempurge successful: no new min log number + or new level 0 file path to write to manifest. */); + } + + if (s.ok() && file_meta != nullptr) { + *file_meta = meta_; + } + RecordFlushIOStats(); + + // When measure_io_stats_ is true, the default 512 bytes is not enough. + auto stream = event_logger_->LogToBuffer(log_buffer_, 1024); + stream << "job" << job_context_->job_id << "event" + << "flush_finished"; + stream << "output_compression" + << CompressionTypeToString(output_compression_); + stream << "lsm_state"; + stream.StartArray(); + auto vstorage = cfd_->current()->storage_info(); + for (int level = 0; level < vstorage->num_levels(); ++level) { + stream << vstorage->NumLevelFiles(level); + } + stream.EndArray(); + + const auto& blob_files = vstorage->GetBlobFiles(); + if (!blob_files.empty()) { + assert(blob_files.front()); + stream << "blob_file_head" << blob_files.front()->GetBlobFileNumber(); + + assert(blob_files.back()); + stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber(); + } + + stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed(); + + if (measure_io_stats_) { + if (prev_perf_level != PerfLevel::kEnableTime) { + SetPerfLevel(prev_perf_level); + } + stream << "file_write_nanos" << (IOSTATS(write_nanos) - prev_write_nanos); + stream << "file_range_sync_nanos" + << (IOSTATS(range_sync_nanos) - prev_range_sync_nanos); + stream << "file_fsync_nanos" << (IOSTATS(fsync_nanos) - prev_fsync_nanos); + stream << "file_prepare_write_nanos" + << (IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos); + stream << "file_cpu_write_nanos" + << (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos); + stream << "file_cpu_read_nanos" + << (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos); + } + + return s; +} + +void FlushJob::Cancel() { + db_mutex_->AssertHeld(); + assert(base_ != nullptr); + base_->Unref(); +} + +Status FlushJob::MemPurge() { + Status s; + db_mutex_->AssertHeld(); + db_mutex_->Unlock(); + assert(!mems_.empty()); + + // Measure purging time. + const uint64_t start_micros = clock_->NowMicros(); + const uint64_t start_cpu_micros = clock_->CPUMicros(); + + MemTable* new_mem = nullptr; + // For performance/log investigation purposes: + // look at how much useful payload we harvest in the new_mem. + // This value is then printed to the DB log. + double new_mem_capacity = 0.0; + + // Create two iterators, one for the memtable data (contains + // info from puts + deletes), and one for the memtable + // Range Tombstones (from DeleteRanges). 
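A hypothetical call-site sketch (types from this file, helper invented) of the lifecycle contract asserted above and in flush_job.h: PickMemTable() runs under the DB mutex and must be followed by exactly one of Run() or Cancel():

    void RunOrCancelFlush(FlushJob& job, InstrumentedMutex* db_mutex,
                          bool abort) {
      db_mutex->Lock();
      job.PickMemTable();  // takes a ref on the current Version
      if (abort) {
        job.Cancel();  // releases the Version ref without flushing
      } else {
        FileMetaData file_meta;
        Status s = job.Run(/*prep_tracker=*/nullptr, &file_meta,
                           /*switched_to_mempurge=*/nullptr);
        s.PermitUncheckedError();  // illustration only; real callers check s
      }
      db_mutex->Unlock();
    }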
+ // TODO: plumb Env::IOActivity + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + std::vector memtables; + std::vector> + range_del_iters; + for (MemTable* m : mems_) { + memtables.push_back(m->NewIterator(ro, &arena)); + auto* range_del_iter = m->NewRangeTombstoneIterator( + ro, kMaxSequenceNumber, true /* immutable_memtable */); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } + } + + assert(!memtables.empty()); + SequenceNumber first_seqno = kMaxSequenceNumber; + SequenceNumber earliest_seqno = kMaxSequenceNumber; + // Pick first and earliest seqno as min of all first_seqno + // and earliest_seqno of the mempurged memtables. + for (const auto& mem : mems_) { + first_seqno = mem->GetFirstSequenceNumber() < first_seqno + ? mem->GetFirstSequenceNumber() + : first_seqno; + earliest_seqno = mem->GetEarliestSequenceNumber() < earliest_seqno + ? mem->GetEarliestSequenceNumber() + : earliest_seqno; + } + + ScopedArenaIterator iter( + NewMergingIterator(&(cfd_->internal_comparator()), memtables.data(), + static_cast(memtables.size()), &arena)); + + auto* ioptions = cfd_->ioptions(); + + // Place iterator at the First (meaning most recent) key node. + iter->SeekToFirst(); + + const std::string* const full_history_ts_low = &(cfd_->GetFullHistoryTsLow()); + std::unique_ptr range_del_agg( + new CompactionRangeDelAggregator(&(cfd_->internal_comparator()), + existing_snapshots_, + full_history_ts_low)); + for (auto& rd_iter : range_del_iters) { + range_del_agg->AddTombstones(std::move(rd_iter)); + } + + // If there is valid data in the memtable, + // or at least range tombstones, copy over the info + // to the new memtable. + if (iter->Valid() || !range_del_agg->IsEmpty()) { + // MaxSize is the size of a memtable. + size_t maxSize = mutable_cf_options_.write_buffer_size; + std::unique_ptr compaction_filter; + if (ioptions->compaction_filter_factory != nullptr && + ioptions->compaction_filter_factory->ShouldFilterTableFileCreation( + TableFileCreationReason::kFlush)) { + CompactionFilter::Context ctx; + ctx.is_full_compaction = false; + ctx.is_manual_compaction = false; + ctx.column_family_id = cfd_->GetID(); + ctx.reason = TableFileCreationReason::kFlush; + compaction_filter = + ioptions->compaction_filter_factory->CreateCompactionFilter(ctx); + if (compaction_filter != nullptr && + !compaction_filter->IgnoreSnapshots()) { + s = Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + return s; + } + } + + new_mem = new MemTable((cfd_->internal_comparator()), *(cfd_->ioptions()), + mutable_cf_options_, cfd_->write_buffer_mgr(), + earliest_seqno, cfd_->GetID()); + assert(new_mem != nullptr); + + Env* env = db_options_.env; + assert(env); + MergeHelper merge( + env, (cfd_->internal_comparator()).user_comparator(), + (ioptions->merge_operator).get(), compaction_filter.get(), + ioptions->logger, true /* internal key corruption is not ok */, + existing_snapshots_.empty() ? 
0 : existing_snapshots_.back(), + snapshot_checker_); + assert(job_context_); + SequenceNumber job_snapshot_seq = job_context_->GetJobSnapshotSequence(); + const std::atomic kManualCompactionCanceledFalse{false}; + CompactionIterator c_iter( + iter.get(), (cfd_->internal_comparator()).user_comparator(), &merge, + kMaxSequenceNumber, &existing_snapshots_, + earliest_write_conflict_snapshot_, job_snapshot_seq, snapshot_checker_, + env, ShouldReportDetailedTime(env, ioptions->stats), + true /* internal key corruption is not ok */, range_del_agg.get(), + nullptr, ioptions->allow_data_in_errors, + ioptions->enforce_single_del_contracts, + /*manual_compaction_canceled=*/kManualCompactionCanceledFalse, + /*compaction=*/nullptr, compaction_filter.get(), + /*shutting_down=*/nullptr, ioptions->info_log, full_history_ts_low); + + // Set earliest sequence number in the new memtable + // to be equal to the earliest sequence number of the + // memtable being flushed (See later if there is a need + // to update this number!). + new_mem->SetEarliestSequenceNumber(earliest_seqno); + // Likewise for first seq number. + new_mem->SetFirstSequenceNumber(first_seqno); + SequenceNumber new_first_seqno = kMaxSequenceNumber; + + c_iter.SeekToFirst(); + + // Key transfer + for (; c_iter.Valid(); c_iter.Next()) { + const ParsedInternalKey ikey = c_iter.ikey(); + const Slice value = c_iter.value(); + new_first_seqno = + ikey.sequence < new_first_seqno ? ikey.sequence : new_first_seqno; + + // Should we update "OldestKeyTime" ???? -> timestamp appear + // to still be an "experimental" feature. + s = new_mem->Add( + ikey.sequence, ikey.type, ikey.user_key, value, + nullptr, // KV protection info set as nullptr since it + // should only be useful for the first add to + // the original memtable. + false, // : allow concurrent_memtable_writes_ + // Not seen as necessary for now. + nullptr, // get_post_process_info(m) must be nullptr + // when concurrent_memtable_writes is switched off. + nullptr); // hint, only used when concurrent_memtable_writes_ + // is switched on. + if (!s.ok()) { + break; + } + + // If new_mem has size greater than maxSize, + // then rollback to regular flush operation, + // and destroy new_mem. + if (new_mem->ApproximateMemoryUsage() > maxSize) { + s = Status::Aborted("Mempurge filled more than one memtable."); + new_mem_capacity = 1.0; + break; + } + } + + // Check status and propagate + // potential error status from c_iter + if (!s.ok()) { + c_iter.status().PermitUncheckedError(); + } else if (!c_iter.status().ok()) { + s = c_iter.status(); + } + + // Range tombstone transfer. + if (s.ok()) { + auto range_del_it = range_del_agg->NewIterator(); + for (range_del_it->SeekToFirst(); range_del_it->Valid(); + range_del_it->Next()) { + auto tombstone = range_del_it->Tombstone(); + new_first_seqno = + tombstone.seq_ < new_first_seqno ? tombstone.seq_ : new_first_seqno; + s = new_mem->Add( + tombstone.seq_, // Sequence number + kTypeRangeDeletion, // KV type + tombstone.start_key_, // Key is start key. + tombstone.end_key_, // Value is end key. + nullptr, // KV protection info set as nullptr since it + // should only be useful for the first add to + // the original memtable. + false, // : allow concurrent_memtable_writes_ + // Not seen as necessary for now. + nullptr, // get_post_process_info(m) must be nullptr + // when concurrent_memtable_writes is switched off. + nullptr); // hint, only used when concurrent_memtable_writes_ + // is switched on. 
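Restating the guard that both transfer loops apply (a sketch only; upstream compares MemTable::ApproximateMemoryUsage() against mutable_cf_options_.write_buffer_size):

    #include <cstddef>

    // Mempurge may fill at most one output memtable: crossing the configured
    // write buffer size aborts the purge and falls back to a regular flush.
    bool MempurgeOverflowed(size_t approx_memory_usage,
                            size_t write_buffer_size) {
      return approx_memory_usage > write_buffer_size;
    }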
+ + if (!s.ok()) { + break; + } + + // If new_mem has size greater than maxSize, + // then rollback to regular flush operation, + // and destroy new_mem. + if (new_mem->ApproximateMemoryUsage() > maxSize) { + s = Status::Aborted(Slice("Mempurge filled more than one memtable.")); + new_mem_capacity = 1.0; + break; + } + } + } + + // If everything happened smoothly and new_mem contains valid data, + // decide if it is flushed to storage or kept in the imm() + // memtable list (memory). + if (s.ok() && (new_first_seqno != kMaxSequenceNumber)) { + // Rectify the first sequence number, which (unlike the earliest seq + // number) needs to be present in the new memtable. + new_mem->SetFirstSequenceNumber(new_first_seqno); + + // The new_mem is added to the list of immutable memtables + // only if it filled at less than 100% capacity and isn't flagged + // as in need of being flushed. + if (new_mem->ApproximateMemoryUsage() < maxSize && + !(new_mem->ShouldFlushNow())) { + // Construct fragmented memtable range tombstones without mutex + new_mem->ConstructFragmentedRangeTombstones(); + db_mutex_->Lock(); + uint64_t new_mem_id = mems_[0]->GetID(); + + new_mem->SetID(new_mem_id); + new_mem->SetNextLogNumber(mems_[0]->GetNextLogNumber()); + + // This addition will not trigger another flush, because + // we do not call SchedulePendingFlush(). + cfd_->imm()->Add(new_mem, &job_context_->memtables_to_free); + new_mem->Ref(); + // Piggyback FlushJobInfo on the first flushed memtable. + db_mutex_->AssertHeld(); + meta_.fd.file_size = 0; + mems_[0]->SetFlushJobInfo(GetFlushJobInfo()); + db_mutex_->Unlock(); + } else { + s = Status::Aborted(Slice("Mempurge filled more than one memtable.")); + new_mem_capacity = 1.0; + if (new_mem) { + job_context_->memtables_to_free.push_back(new_mem); + } + } + } else { + // In this case, the newly allocated new_mem is empty. + assert(new_mem != nullptr); + job_context_->memtables_to_free.push_back(new_mem); + } + } + + // Reacquire the mutex for WriteLevel0 function. + db_mutex_->Lock(); + + // If mempurge successful, don't write input tables to level0, + // but write any full output table to level0. + if (s.ok()) { + TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeSuccessful"); + } else { + TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeUnsuccessful"); + } + const uint64_t micros = clock_->NowMicros() - start_micros; + const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros; + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Mempurge lasted %" PRIu64 + " microseconds, and %" PRIu64 + " cpu " + "microseconds. Status is %s ok. Perc capacity: %f\n", + cfd_->GetName().c_str(), job_context_->job_id, micros, + cpu_micros, s.ok() ? "" : "not", new_mem_capacity); + + return s; +} + +bool FlushJob::MemPurgeDecider(double threshold) { + // Never trigger mempurge if threshold is not a strictly positive value. + if (!(threshold > 0.0)) { + return false; + } + if (threshold > (1.0 * mems_.size())) { + return true; + } + // Payload and useful_payload (in bytes). + // The useful payload ratio of a given MemTable + // is estimated to be useful_payload/payload. + uint64_t payload = 0, useful_payload = 0, entry_size = 0; + + // Local variables used repetitively inside the for-loop + // when iterating over the sampled entries. 
+ Slice key_slice, value_slice; + ParsedInternalKey res; + SnapshotImpl min_snapshot; + std::string vget; + Status mget_s, parse_s; + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0, sqno = 0, + min_seqno_snapshot = 0; + bool get_res, can_be_useful_payload, not_in_next_mems; + + // If estimated_useful_payload is > threshold, + // then flush to storage, else MemPurge. + double estimated_useful_payload = 0.0; + // Cochran formula for determining sample size. + // 95% confidence interval, 7% precision. + // n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0 + // TODO: plumb Env::IOActivity + double n0 = 196.0; + ReadOptions ro; + ro.total_order_seek = true; + + // Iterate over each memtable of the set. + for (auto mem_iter = std::begin(mems_); mem_iter != std::end(mems_); + mem_iter++) { + MemTable* mt = *mem_iter; + + // Else sample from the table. + uint64_t nentries = mt->num_entries(); + // Corrected Cochran formula for small populations + // (converges to n0 for large populations). + uint64_t target_sample_size = + static_cast(ceil(n0 / (1.0 + (n0 / nentries)))); + std::unordered_set sentries = {}; + // Populate sample entries set. + mt->UniqueRandomSample(target_sample_size, &sentries); + + // Estimate the garbage ratio by comparing if + // each sample corresponds to a valid entry. + for (const char* ss : sentries) { + key_slice = GetLengthPrefixedSlice(ss); + parse_s = ParseInternalKey(key_slice, &res, true /*log_err_key*/); + if (!parse_s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Memtable Decider: ParseInternalKey did not parse " + "key_slice %s successfully.", + key_slice.data()); + } + + // Size of the entry is "key size (+ value size if KV entry)" + entry_size = key_slice.size(); + if (res.type == kTypeValue) { + value_slice = + GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); + entry_size += value_slice.size(); + } + + // Count entry bytes as payload. + payload += entry_size; + + LookupKey lkey(res.user_key, kMaxSequenceNumber); + + // Paranoia: zero out these values just in case. + max_covering_tombstone_seq = 0; + sqno = 0; + + // Pick the oldest existing snapshot that is more recent + // than the sequence number of the sampled entry. + min_seqno_snapshot = kMaxSequenceNumber; + for (SequenceNumber seq_num : existing_snapshots_) { + if (seq_num > res.sequence && seq_num < min_seqno_snapshot) { + min_seqno_snapshot = seq_num; + } + } + min_snapshot.number_ = min_seqno_snapshot; + ro.snapshot = + min_seqno_snapshot < kMaxSequenceNumber ? &min_snapshot : nullptr; + + // Estimate if the sample entry is valid or not. + get_res = mt->Get(lkey, &vget, /*columns=*/nullptr, /*timestamp=*/nullptr, + &mget_s, &merge_context, &max_covering_tombstone_seq, + &sqno, ro, true /* immutable_memtable */); + if (!get_res) { + ROCKS_LOG_WARN( + db_options_.info_log, + "Memtable Get returned false when Get(sampled entry). " + "Yet each sample entry should exist somewhere in the memtable, " + "unrelated to whether it has been deleted or not."); + } + + // TODO(bjlemaire): evaluate typeMerge. + // This is where the sampled entry is estimated to be + // garbage or not. Note that this is a garbage *estimation* + // because we do not include certain items such as + // CompactionFitlers triggered at flush, or if the same delete + // has been inserted twice or more in the memtable. + + // Evaluate if the entry can be useful payload + // Situation #1: entry is a KV entry, was found in the memtable mt + // and the sequence numbers match. 
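A worked instance (not in the patch) of the corrected Cochran formula used above, where n0 = (1.96 * 1.96) * 0.25 / (0.07 * 0.07) = 196.0 for a 95% confidence interval at 7% precision:

    #include <cmath>
    #include <cstdint>

    uint64_t TargetSampleSize(uint64_t num_entries) {
      const double n0 = 196.0;
      // The small-population correction shrinks the sample toward the
      // population size; for large memtables it converges back to n0.
      return static_cast<uint64_t>(std::ceil(n0 / (1.0 + n0 / num_entries)));
    }

    // TargetSampleSize(100)     == 67   (small memtable: sample most of it)
    // TargetSampleSize(10000)   == 193
    // TargetSampleSize(1000000) == 196  (converges to n0)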
+ can_be_useful_payload = (res.type == kTypeValue) && get_res && + mget_s.ok() && (sqno == res.sequence); + + // Situation #2: entry is a delete entry, was found in the memtable mt + // (because gres==true) and no valid KV entry is found. + // (note: duplicate delete entries are also taken into + // account here, because the sequence number 'sqno' + // in memtable->Get(&sqno) operation is set to be equal + // to the most recent delete entry as well). + can_be_useful_payload |= + ((res.type == kTypeDeletion) || (res.type == kTypeSingleDeletion)) && + mget_s.IsNotFound() && get_res && (sqno == res.sequence); + + // If there is a chance that the entry is useful payload + // Verify that the entry does not appear in the following memtables + // (memtables with greater memtable ID/larger sequence numbers). + if (can_be_useful_payload) { + not_in_next_mems = true; + for (auto next_mem_iter = mem_iter + 1; + next_mem_iter != std::end(mems_); next_mem_iter++) { + if ((*next_mem_iter) + ->Get(lkey, &vget, /*columns=*/nullptr, /*timestamp=*/nullptr, + &mget_s, &merge_context, &max_covering_tombstone_seq, + &sqno, ro, true /* immutable_memtable */)) { + not_in_next_mems = false; + break; + } + } + if (not_in_next_mems) { + useful_payload += entry_size; + } + } + } + if (payload > 0) { + // We use the estimated useful payload ratio to + // evaluate how many of the memtable bytes are useful bytes. + estimated_useful_payload += + (mt->ApproximateMemoryUsage()) * (useful_payload * 1.0 / payload); + + ROCKS_LOG_INFO(db_options_.info_log, + "Mempurge sampling [CF %s] - found garbage ratio from " + "sampling: %f. Threshold is %f\n", + cfd_->GetName().c_str(), + (payload - useful_payload) * 1.0 / payload, threshold); + } else { + ROCKS_LOG_WARN(db_options_.info_log, + "Mempurge sampling: null payload measured, and collected " + "sample size is %zu\n.", + sentries.size()); + } + } + // We convert the total number of useful payload bytes + // into the proportion of memtable necessary to store all these bytes. + // We compare this proportion with the threshold value. + return ((estimated_useful_payload / mutable_cf_options_.write_buffer_size) < + threshold); +} + +Status FlushJob::WriteLevel0Table() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_FLUSH_WRITE_L0); + db_mutex_->AssertHeld(); + const uint64_t start_micros = clock_->NowMicros(); + const uint64_t start_cpu_micros = clock_->CPUMicros(); + Status s; + + SequenceNumber smallest_seqno = mems_.front()->GetEarliestSequenceNumber(); + if (!db_impl_seqno_time_mapping_.Empty()) { + // make a local copy, as the seqno_time_mapping from db_impl is not thread + // safe, which will be used while not holding the db_mutex. + seqno_to_time_mapping_ = db_impl_seqno_time_mapping_.Copy(smallest_seqno); + } + + std::vector blob_file_additions; + + { + auto write_hint = cfd_->CalculateSSTWriteHint(0); + Env::IOPriority io_priority = GetRateLimiterPriorityForWrite(); + db_mutex_->Unlock(); + if (log_buffer_) { + log_buffer_->FlushBufferToLog(); + } + // memtables and range_del_iters store internal iterators over each data + // memtable and its associated range deletion memtable, respectively, at + // corresponding indexes. 
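A numeric illustration (values invented) of the final comparison above, which expresses the estimated useful payload in units of one write buffer:

    #include <cstddef>

    bool ShouldMempurge(double estimated_useful_payload,  // bytes
                        size_t write_buffer_size, double threshold) {
      return (estimated_useful_payload / write_buffer_size) < threshold;
    }

    // E.g. two 64 MiB memtables sampled at a 30% useful-payload ratio yield
    // about 38.4 MiB, i.e. 0.6 write buffers of live data: mempurged at
    // threshold 1.0, flushed at threshold 0.5.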
+ std::vector memtables; + std::vector> + range_del_iters; + ReadOptions ro; + ro.total_order_seek = true; + ro.io_activity = Env::IOActivity::kFlush; + Arena arena; + uint64_t total_num_entries = 0, total_num_deletes = 0; + uint64_t total_data_size = 0; + size_t total_memory_usage = 0; + // Used for testing: + uint64_t mems_size = mems_.size(); + (void)mems_size; // avoids unused variable error when + // TEST_SYNC_POINT_CALLBACK not used. + TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:num_memtables", + &mems_size); + assert(job_context_); + for (MemTable* m : mems_) { + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n", + cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber()); + memtables.push_back(m->NewIterator(ro, &arena)); + auto* range_del_iter = m->NewRangeTombstoneIterator( + ro, kMaxSequenceNumber, true /* immutable_memtable */); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } + total_num_entries += m->num_entries(); + total_num_deletes += m->num_deletes(); + total_data_size += m->get_data_size(); + total_memory_usage += m->ApproximateMemoryUsage(); + } + + event_logger_->Log() << "job" << job_context_->job_id << "event" + << "flush_started" + << "num_memtables" << mems_.size() << "num_entries" + << total_num_entries << "num_deletes" + << total_num_deletes << "total_data_size" + << total_data_size << "memory_usage" + << total_memory_usage << "flush_reason" + << GetFlushReasonString(flush_reason_); + + { + ScopedArenaIterator iter( + NewMergingIterator(&cfd_->internal_comparator(), memtables.data(), + static_cast(memtables.size()), &arena)); + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started", + cfd_->GetName().c_str(), job_context_->job_id, + meta_.fd.GetNumber()); + + TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression", + &output_compression_); + int64_t _current_time = 0; + auto status = clock_->GetCurrentTime(&_current_time); + // Safe to proceed even if GetCurrentTime fails. So, log and proceed. + if (!status.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "Failed to get current time to populate creation_time property. " + "Status: %s", + status.ToString().c_str()); + } + const uint64_t current_time = static_cast(_current_time); + + uint64_t oldest_key_time = mems_.front()->ApproximateOldestKeyTime(); + + // It's not clear whether oldest_key_time is always available. In case + // it is not available, use current_time. + uint64_t oldest_ancester_time = std::min(current_time, oldest_key_time); + + TEST_SYNC_POINT_CALLBACK( + "FlushJob::WriteLevel0Table:oldest_ancester_time", + &oldest_ancester_time); + meta_.oldest_ancester_time = oldest_ancester_time; + meta_.file_creation_time = current_time; + + uint64_t num_input_entries = 0; + uint64_t memtable_payload_bytes = 0; + uint64_t memtable_garbage_bytes = 0; + IOStatus io_s; + + const std::string* const full_history_ts_low = + (full_history_ts_low_.empty()) ? 
nullptr : &full_history_ts_low_; + TableBuilderOptions tboptions( + *cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(), + cfd_->int_tbl_prop_collector_factories(), output_compression_, + mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(), + 0 /* level */, false /* is_bottommost */, + TableFileCreationReason::kFlush, oldest_key_time, current_time, + db_id_, db_session_id_, 0 /* target_file_size */, + meta_.fd.GetNumber()); + const SequenceNumber job_snapshot_seq = + job_context_->GetJobSnapshotSequence(); + const ReadOptions read_options(Env::IOActivity::kFlush); + s = BuildTable(dbname_, versions_, db_options_, tboptions, file_options_, + read_options, cfd_->table_cache(), iter.get(), + std::move(range_del_iters), &meta_, &blob_file_additions, + existing_snapshots_, earliest_write_conflict_snapshot_, + job_snapshot_seq, snapshot_checker_, + mutable_cf_options_.paranoid_file_checks, + cfd_->internal_stats(), &io_s, io_tracer_, + BlobFileCreationReason::kFlush, seqno_to_time_mapping_, + event_logger_, job_context_->job_id, io_priority, + &table_properties_, write_hint, full_history_ts_low, + blob_callback_, base_, &num_input_entries, + &memtable_payload_bytes, &memtable_garbage_bytes); + // TODO: Cleanup io_status in BuildTable and table builders + assert(!s.ok() || io_s.ok()); + io_s.PermitUncheckedError(); + if (num_input_entries != total_num_entries && s.ok()) { + std::string msg = "Expected " + std::to_string(total_num_entries) + + " entries in memtables, but read " + + std::to_string(num_input_entries); + ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s", + cfd_->GetName().c_str(), job_context_->job_id, + msg.c_str()); + if (db_options_.flush_verify_memtable_count) { + s = Status::Corruption(msg); + } + } + if (tboptions.reason == TableFileCreationReason::kFlush) { + TEST_SYNC_POINT("DBImpl::FlushJob:Flush"); + RecordTick(stats_, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + memtable_payload_bytes); + RecordTick(stats_, MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + memtable_garbage_bytes); + } + LogFlush(db_options_.info_log); + } + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64 + " bytes %s" + "%s", + cfd_->GetName().c_str(), job_context_->job_id, + meta_.fd.GetNumber(), meta_.fd.GetFileSize(), + s.ToString().c_str(), + meta_.marked_for_compaction ? " (needs compaction)" : ""); + + if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) { + s = output_file_directory_->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } + TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_); + db_mutex_->Lock(); + } + base_->Unref(); + + // Note that if file_size is zero, the file has been deleted and + // should not be added to the manifest. + const bool has_output = meta_.fd.GetFileSize() > 0; + + if (s.ok() && has_output) { + TEST_SYNC_POINT("DBImpl::FlushJob:SSTFileCreated"); + // if we have more than 1 background thread, then we cannot + // insert files directly into higher levels because some other + // threads could be concurrently producing compacted files for + // that key range. 
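For context (illustrative): the entry-count cross-check above is gated by an immutable DB option; when it is off, a mismatch is only logged as a warning.

    #include "rocksdb/options.h"

    rocksdb::Options StrictFlushOptions() {
      rocksdb::Options options;
      // When true (the default in recent releases), the mismatch above is
      // escalated to Status::Corruption instead of a log line.
      options.flush_verify_memtable_count = true;
      return options;
    }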
+ // Add file to L0 + edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(), + meta_.fd.GetFileSize(), meta_.smallest, meta_.largest, + meta_.fd.smallest_seqno, meta_.fd.largest_seqno, + meta_.marked_for_compaction, meta_.temperature, + meta_.oldest_blob_file_number, meta_.oldest_ancester_time, + meta_.file_creation_time, meta_.epoch_number, + meta_.file_checksum, meta_.file_checksum_func_name, + meta_.unique_id, meta_.compensated_range_deletion_size, + meta_.tail_size); + edit_->SetBlobFileAdditions(std::move(blob_file_additions)); + } + // Piggyback FlushJobInfo on the first first flushed memtable. + mems_[0]->SetFlushJobInfo(GetFlushJobInfo()); + + // Note that here we treat flush as level 0 compaction in internal stats + InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); + const uint64_t micros = clock_->NowMicros() - start_micros; + const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros; + stats.micros = micros; + stats.cpu_micros = cpu_micros; + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Flush lasted %" PRIu64 + " microseconds, and %" PRIu64 " cpu microseconds.\n", + cfd_->GetName().c_str(), job_context_->job_id, micros, + cpu_micros); + + if (has_output) { + stats.bytes_written = meta_.fd.GetFileSize(); + stats.num_output_files = 1; + } + + const auto& blobs = edit_->GetBlobFileAdditions(); + for (const auto& blob : blobs) { + stats.bytes_written_blob += blob.GetTotalBlobBytes(); + } + + stats.num_output_files_blob = static_cast(blobs.size()); + + RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros); + cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats); + cfd_->internal_stats()->AddCFStats( + InternalStats::BYTES_FLUSHED, + stats.bytes_written + stats.bytes_written_blob); + RecordFlushIOStats(); + + return s; +} + +Env::IOPriority FlushJob::GetRateLimiterPriorityForWrite() { + if (versions_ && versions_->GetColumnFamilySet() && + versions_->GetColumnFamilySet()->write_controller()) { + WriteController* write_controller = + versions_->GetColumnFamilySet()->write_controller(); + if (write_controller->IsStopped() || write_controller->NeedsDelay()) { + return Env::IO_USER; + } + } + + return Env::IO_HIGH; +} + +std::unique_ptr FlushJob::GetFlushJobInfo() const { + db_mutex_->AssertHeld(); + std::unique_ptr info(new FlushJobInfo{}); + info->cf_id = cfd_->GetID(); + info->cf_name = cfd_->GetName(); + + const uint64_t file_number = meta_.fd.GetNumber(); + info->file_path = + MakeTableFileName(cfd_->ioptions()->cf_paths[0].path, file_number); + info->file_number = file_number; + info->oldest_blob_file_number = meta_.oldest_blob_file_number; + info->thread_id = db_options_.env->GetThreadID(); + info->job_id = job_context_->job_id; + info->smallest_seqno = meta_.fd.smallest_seqno; + info->largest_seqno = meta_.fd.largest_seqno; + info->table_properties = table_properties_; + info->flush_reason = flush_reason_; + info->blob_compression_type = mutable_cf_options_.blob_compression_type; + + // Update BlobFilesInfo. 
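Context for GetRateLimiterPriorityForWrite() above (a sketch, not part of the patch): the escalation to Env::IO_USER only matters when a rate limiter is installed, for example:

    #include <memory>

    #include "rocksdb/options.h"
    #include "rocksdb/rate_limiter.h"

    rocksdb::Options RateLimitedOptions() {
      rocksdb::Options options;
      // 100 MiB/s shared by flushes and compactions. IO_USER requests,
      // issued above when the WriteController reports a stop or delay, are
      // served ahead of IO_HIGH/IO_LOW traffic, so a flush that unblocks
      // stalled foreground writes waits as little as possible.
      options.rate_limiter.reset(rocksdb::NewGenericRateLimiter(100 << 20));
      return options;
    }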
+ for (const auto& blob_file : edit_->GetBlobFileAdditions()) { + BlobFileAdditionInfo blob_file_addition_info( + BlobFileName(cfd_->ioptions()->cf_paths.front().path, + blob_file.GetBlobFileNumber()) /*blob_file_path*/, + blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(), + blob_file.GetTotalBlobBytes()); + info->blob_file_addition_infos.emplace_back( + std::move(blob_file_addition_info)); + } + return info; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_job.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_job.h new file mode 100644 index 0000000000..d3902f0bd0 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_job.h @@ -0,0 +1,200 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/blob/blob_file_completion_callback.h" +#include "db/column_family.h" +#include "db/flush_scheduler.h" +#include "db/internal_stats.h" +#include "db/job_context.h" +#include "db/log_writer.h" +#include "db/logs_with_prep_tracker.h" +#include "db/memtable_list.h" +#include "db/seqno_to_time_mapping.h" +#include "db/snapshot_impl.h" +#include "db/version_edit.h" +#include "db/write_controller.h" +#include "db/write_thread.h" +#include "logging/event_logger.h" +#include "monitoring/instrumented_mutex.h" +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/listener.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/transaction_log.h" +#include "table/scoped_arena_iterator.h" +#include "util/autovector.h" +#include "util/stop_watch.h" +#include "util/thread_local.h" + +namespace ROCKSDB_NAMESPACE { + +class DBImpl; +class MemTable; +class SnapshotChecker; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; +class Arena; + +class FlushJob { + public: + // TODO(icanadi) make effort to reduce number of parameters here + // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive + FlushJob(const std::string& dbname, ColumnFamilyData* cfd, + const ImmutableDBOptions& db_options, + const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, + const FileOptions& file_options, VersionSet* versions, + InstrumentedMutex* db_mutex, std::atomic* shutting_down, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SnapshotChecker* snapshot_checker, JobContext* job_context, + FlushReason flush_reason, LogBuffer* log_buffer, + FSDirectory* db_directory, FSDirectory* output_file_directory, + CompressionType output_compression, Statistics* stats, + EventLogger* event_logger, bool measure_io_stats, + const bool sync_output_directory, const bool write_manifest, + Env::Priority thread_pri, const std::shared_ptr& io_tracer, + const SeqnoToTimeMapping& seq_time_mapping, + const std::string& db_id = "", const std::string& db_session_id = "", + std::string full_history_ts_low = "", + BlobFileCompletionCallback* blob_callback = nullptr); + + 
  ~FlushJob();
+
+  // Require db_mutex held.
+  // Once PickMemTable() is called, either Run() or Cancel() has to be called.
+  void PickMemTable();
+  Status Run(LogsWithPrepTracker* prep_tracker = nullptr,
+             FileMetaData* file_meta = nullptr,
+             bool* switched_to_mempurge = nullptr);
+  void Cancel();
+  const autovector<MemTable*>& GetMemTables() const { return mems_; }
+
+  std::list<std::unique_ptr<FlushJobInfo>>* GetCommittedFlushJobsInfo() {
+    return &committed_flush_jobs_info_;
+  }
+
+ private:
+  friend class FlushJobTest_GetRateLimiterPriorityForWrite_Test;
+
+  void ReportStartedFlush();
+  void ReportFlushInputSize(const autovector<MemTable*>& mems);
+  void RecordFlushIOStats();
+  Status WriteLevel0Table();
+
+  // Memtable Garbage Collection algorithm: a MemPurge takes the list
+  // of immutable memtables and filters the outdated bytes (or "purges"
+  // them) out of it. The output (the filtered bytes, or "useful payload")
+  // is then transferred into a new memtable. If this memtable is filled,
+  // the mempurge is aborted and rerouted to a regular flush process.
+  // Otherwise, depending on the heuristics, it is placed onto the
+  // immutable memtable list.
+  // The addition to the imm list will not trigger a flush operation. The
+  // flush of the imm list will instead be triggered once the mutable memtable
+  // is added to the imm list.
+  // This process is typically intended for workloads with heavy overwrites
+  // when we want to avoid SSD writes (and reads) as much as possible.
+  // "MemPurge" is an experimental feature still at a very early stage
+  // of development. At the moment it is only compatible with the Get, Put,
+  // and Delete operations, as well as Iterators and CompactionFilters.
+  // For this early version, "MemPurge" is enabled by setting the
+  // options.experimental_mempurge_threshold value to > 0.0. When this is
+  // the case, ALL automatic flush operations (FlushReason::kWriteBufferFull)
+  // will first go through the MemPurge process. Therefore, we strongly
+  // recommend leaving this option at its default (disabled) value until
+  // the MemPurge process has matured.
+  Status MemPurge();
+  bool MemPurgeDecider(double threshold);
+  // The rate limiter priority (io_priority) is determined dynamically here.
+  Env::IOPriority GetRateLimiterPriorityForWrite();
+  std::unique_ptr<FlushJobInfo> GetFlushJobInfo() const;
+
+  const std::string& dbname_;
+  const std::string db_id_;
+  const std::string db_session_id_;
+  ColumnFamilyData* cfd_;
+  const ImmutableDBOptions& db_options_;
+  const MutableCFOptions& mutable_cf_options_;
+  // A variable storing the largest memtable id to flush in this
+  // flush job. RocksDB uses this variable to select the memtables to flush in
+  // this job. All memtables in this column family with an ID smaller than or
+  // equal to max_memtable_id_ will be selected for flush.
+  uint64_t max_memtable_id_;
+  const FileOptions file_options_;
+  VersionSet* versions_;
+  InstrumentedMutex* db_mutex_;
+  std::atomic<bool>* shutting_down_;
+  std::vector<SequenceNumber> existing_snapshots_;
+  SequenceNumber earliest_write_conflict_snapshot_;
+  SnapshotChecker* snapshot_checker_;
+  JobContext* job_context_;
+  FlushReason flush_reason_;
+  LogBuffer* log_buffer_;
+  FSDirectory* db_directory_;
+  FSDirectory* output_file_directory_;
+  CompressionType output_compression_;
+  Statistics* stats_;
+  EventLogger* event_logger_;
+  TableProperties table_properties_;
+  bool measure_io_stats_;
+  // True if this flush job should call fsync on the output directory. False
+  // otherwise.
+  // Usually sync_output_directory_ is true.
A flush job needs to call sync on + // the output directory before committing to the MANIFEST. + // However, an individual flush job does not have to call sync on the output + // directory if it is part of an atomic flush. After all flush jobs in the + // atomic flush succeed, call sync once on each distinct output directory. + const bool sync_output_directory_; + // True if this flush job should write to MANIFEST after successfully + // flushing memtables. False otherwise. + // Usually write_manifest_ is true. A flush job commits to the MANIFEST after + // flushing the memtables. + // However, an individual flush job cannot rashly write to the MANIFEST + // immediately after it finishes the flush if it is part of an atomic flush. + // In this case, only after all flush jobs succeed in flush can RocksDB + // commit to the MANIFEST. + const bool write_manifest_; + // The current flush job can commit flush result of a concurrent flush job. + // We collect FlushJobInfo of all jobs committed by current job and fire + // OnFlushCompleted for them. + std::list> committed_flush_jobs_info_; + + // Variables below are set by PickMemTable(): + FileMetaData meta_; + autovector mems_; + VersionEdit* edit_; + Version* base_; + bool pick_memtable_called; + Env::Priority thread_pri_; + + const std::shared_ptr io_tracer_; + SystemClock* clock_; + + const std::string full_history_ts_low_; + BlobFileCompletionCallback* blob_callback_; + + // reference to the seqno_time_mapping_ in db_impl.h, not safe to read without + // db mutex + const SeqnoToTimeMapping& db_impl_seqno_time_mapping_; + SeqnoToTimeMapping seqno_to_time_mapping_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_scheduler.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_scheduler.cc new file mode 100644 index 0000000000..6f4d3e1a5e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_scheduler.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/flush_scheduler.h" + +#include + +#include "db/column_family.h" + +namespace ROCKSDB_NAMESPACE { + +void FlushScheduler::ScheduleWork(ColumnFamilyData* cfd) { +#ifndef NDEBUG + { + std::lock_guard lock(checking_mutex_); + assert(checking_set_.count(cfd) == 0); + checking_set_.insert(cfd); + } +#endif // NDEBUG + cfd->Ref(); +// Suppress false positive clang analyzer warnings. +#ifndef __clang_analyzer__ + Node* node = new Node{cfd, head_.load(std::memory_order_relaxed)}; + while (!head_.compare_exchange_strong( + node->next, node, std::memory_order_relaxed, std::memory_order_relaxed)) { + // failing CAS updates the first param, so we are already set for + // retry. 
TakeNextColumnFamily won't happen until after another
+    // inter-thread synchronization, so we don't even need release
+    // semantics for this CAS.
+  }
+#endif  // __clang_analyzer__
+}
+
+ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
+  while (true) {
+    if (head_.load(std::memory_order_relaxed) == nullptr) {
+      return nullptr;
+    }
+
+    // dequeue the head
+    Node* node = head_.load(std::memory_order_relaxed);
+    head_.store(node->next, std::memory_order_relaxed);
+    ColumnFamilyData* cfd = node->column_family;
+    delete node;
+
+#ifndef NDEBUG
+    {
+      std::lock_guard<std::mutex> lock(checking_mutex_);
+      auto iter = checking_set_.find(cfd);
+      assert(iter != checking_set_.end());
+      checking_set_.erase(iter);
+    }
+#endif  // NDEBUG
+
+    if (!cfd->IsDropped()) {
+      // success
+      return cfd;
+    }
+
+    // no longer relevant, retry
+    cfd->UnrefAndTryDelete();
+  }
+}
+
+bool FlushScheduler::Empty() {
+  auto rv = head_.load(std::memory_order_relaxed) == nullptr;
+#ifndef NDEBUG
+  std::lock_guard<std::mutex> lock(checking_mutex_);
+  // Empty is allowed to be called concurrently with ScheduleWork. It would
+  // only miss the most recent schedules.
+  assert((rv == checking_set_.empty()) || rv);
+#endif  // NDEBUG
+  return rv;
+}
+
+void FlushScheduler::Clear() {
+  ColumnFamilyData* cfd;
+  while ((cfd = TakeNextColumnFamily()) != nullptr) {
+    cfd->UnrefAndTryDelete();
+  }
+  assert(head_.load(std::memory_order_relaxed) == nullptr);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_scheduler.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_scheduler.h
new file mode 100644
index 0000000000..eb03f3e114
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/flush_scheduler.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <mutex>
+#include <set>
+
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// FlushScheduler keeps track of all column families whose memtable may
+// be full and require flushing. Unless otherwise noted, all methods on
+// FlushScheduler should be called only with the DB mutex held or from
+// a single-threaded recovery context.
+class FlushScheduler {
+ public:
+  FlushScheduler() : head_(nullptr) {}
+
+  // May be called from multiple threads at once, but not concurrent with
+  // any other method calls on this instance.
+  void ScheduleWork(ColumnFamilyData* cfd);
+
+  // Removes and returns a Ref()-ed column family. The client needs to
+  // Unref(). Filters out column families that have been dropped.
+  ColumnFamilyData* TakeNextColumnFamily();
+
+  // This can be called concurrently with ScheduleWork, but it would miss
+  // all the flushes scheduled after the last synchronization. This results
+  // in less precise enforcement of memtable sizes, but should not matter much.
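The lock-free push in ScheduleWork() above is a classic Treiber-stack insertion: a failed compare_exchange_strong rewrites node->next with the freshly observed head, so the retry loop body stays empty. A self-contained generic sketch of the same pattern (not RocksDB API):

    #include <atomic>

    template <typename T>
    struct IntrusiveStack {
      struct Node {
        T value;
        Node* next;
      };
      std::atomic<Node*> head{nullptr};

      void Push(T value) {
        Node* node = new Node{value, head.load(std::memory_order_relaxed)};
        while (!head.compare_exchange_strong(node->next, node,
                                             std::memory_order_relaxed,
                                             std::memory_order_relaxed)) {
          // node->next already holds the newly observed head; just retry.
        }
      }
    };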
+ bool Empty(); + + void Clear(); + + private: + struct Node { + ColumnFamilyData* column_family; + Node* next; + }; + + std::atomic head_; +#ifndef NDEBUG + std::mutex checking_mutex_; + std::set checking_set_; +#endif // NDEBUG +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/forward_iterator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/forward_iterator.cc new file mode 100644 index 0000000000..75a7c599b8 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/forward_iterator.cc @@ -0,0 +1,1070 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/forward_iterator.h" + +#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/job_context.h" +#include "db/range_del_aggregator.h" +#include "db/range_tombstone_fragmenter.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/merging_iterator.h" +#include "test_util/sync_point.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// Usage: +// ForwardLevelIterator iter; +// iter.SetFileIndex(file_index); +// iter.Seek(target); // or iter.SeekToFirst(); +// iter.Next() +class ForwardLevelIterator : public InternalIterator { + public: + ForwardLevelIterator( + const ColumnFamilyData* const cfd, const ReadOptions& read_options, + const std::vector& files, + const std::shared_ptr& prefix_extractor, + bool allow_unprepared_value, uint8_t block_protection_bytes_per_key) + : cfd_(cfd), + read_options_(read_options), + files_(files), + valid_(false), + file_index_(std::numeric_limits::max()), + file_iter_(nullptr), + pinned_iters_mgr_(nullptr), + prefix_extractor_(prefix_extractor), + allow_unprepared_value_(allow_unprepared_value), + block_protection_bytes_per_key_(block_protection_bytes_per_key) { + status_.PermitUncheckedError(); // Allow uninitialized status through + } + + ~ForwardLevelIterator() override { + // Reset current pointer + if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { + pinned_iters_mgr_->PinIterator(file_iter_); + } else { + delete file_iter_; + } + } + + void SetFileIndex(uint32_t file_index) { + assert(file_index < files_.size()); + status_ = Status::OK(); + if (file_index != file_index_) { + file_index_ = file_index; + Reset(); + } + } + void Reset() { + assert(file_index_ < files_.size()); + + // Reset current pointer + if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { + pinned_iters_mgr_->PinIterator(file_iter_); + } else { + delete file_iter_; + } + + ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); + file_iter_ = cfd_->table_cache()->NewIterator( + read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), + *files_[file_index_], + read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, + prefix_extractor_, /*table_reader_ptr=*/nullptr, + /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, + /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + /*max_file_size_for_l0_meta_pin=*/0, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + block_protection_bytes_per_key_); + file_iter_->SetPinnedItersMgr(pinned_iters_mgr_); + valid_ = false; + if (!range_del_agg.IsEmpty()) { + status_ = Status::NotSupported( + "Range tombstones unsupported with ForwardIterator"); + } + } + void SeekToLast() override { + status_ = Status::NotSupported("ForwardLevelIterator::SeekToLast()"); + valid_ = false; + } + void Prev() override { + status_ = Status::NotSupported("ForwardLevelIterator::Prev()"); + valid_ = false; + } + bool Valid() const override { return valid_; } + void SeekToFirst() override { + assert(file_iter_ != nullptr); + if (!status_.ok()) { + assert(!valid_); + return; + } + file_iter_->SeekToFirst(); + valid_ = file_iter_->Valid(); + } + void Seek(const Slice& internal_key) override { + assert(file_iter_ != nullptr); + + // This deviates from the usual convention for InternalIterator::Seek() in + // that it doesn't discard pre-existing error status. That's because this + // Seek() is only supposed to be called immediately after SetFileIndex() + // (which discards pre-existing error status), and SetFileIndex() may set + // an error status, which we shouldn't discard. + if (!status_.ok()) { + assert(!valid_); + return; + } + + file_iter_->Seek(internal_key); + valid_ = file_iter_->Valid(); + } + void SeekForPrev(const Slice& /*internal_key*/) override { + status_ = Status::NotSupported("ForwardLevelIterator::SeekForPrev()"); + valid_ = false; + } + void Next() override { + assert(valid_); + file_iter_->Next(); + for (;;) { + valid_ = file_iter_->Valid(); + if (!file_iter_->status().ok()) { + assert(!valid_); + return; + } + if (valid_) { + return; + } + if (file_index_ + 1 >= files_.size()) { + valid_ = false; + return; + } + SetFileIndex(file_index_ + 1); + if (!status_.ok()) { + assert(!valid_); + return; + } + file_iter_->SeekToFirst(); + } + } + Slice key() const override { + assert(valid_); + return file_iter_->key(); + } + Slice value() const override { + assert(valid_); + return file_iter_->value(); + } + Status status() const override { + if (!status_.ok()) { + return status_; + } else if (file_iter_) { + return file_iter_->status(); + } + return Status::OK(); + } + bool PrepareValue() override { + assert(valid_); + if (file_iter_->PrepareValue()) { + return true; + } + + assert(!file_iter_->Valid()); + valid_ = false; + return false; + } + bool IsKeyPinned() const override { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + file_iter_->IsKeyPinned(); + } + bool IsValuePinned() const override { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + file_iter_->IsValuePinned(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + if (file_iter_) { + file_iter_->SetPinnedItersMgr(pinned_iters_mgr_); + } + } + + private: + const ColumnFamilyData* const cfd_; + const ReadOptions& read_options_; + const std::vector& files_; + + bool valid_; + uint32_t file_index_; + Status status_; + InternalIterator* file_iter_; + PinnedIteratorsManager* pinned_iters_mgr_; + // Kept alive by ForwardIterator::sv_->mutable_cf_options + const std::shared_ptr& prefix_extractor_; + const bool 
allow_unprepared_value_; + const uint8_t block_protection_bytes_per_key_; +}; + +ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, + ColumnFamilyData* cfd, + SuperVersion* current_sv, + bool allow_unprepared_value) + : db_(db), + read_options_(read_options), + cfd_(cfd), + prefix_extractor_(current_sv->mutable_cf_options.prefix_extractor.get()), + user_comparator_(cfd->user_comparator()), + allow_unprepared_value_(allow_unprepared_value), + immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())), + sv_(current_sv), + mutable_iter_(nullptr), + current_(nullptr), + valid_(false), + status_(Status::OK()), + immutable_status_(Status::OK()), + has_iter_trimmed_for_upper_bound_(false), + current_over_upper_bound_(false), + is_prev_set_(false), + is_prev_inclusive_(false), + pinned_iters_mgr_(nullptr) { + if (sv_) { + RebuildIterators(false); + } + if (!cfd_->ioptions()->env->GetFileSystem()->use_async_io()) { + read_options_.async_io = false; + } + + // immutable_status_ is a local aggregation of the + // status of the immutable Iterators. + // We have to PermitUncheckedError in case it is never + // used, otherwise it will fail ASSERT_STATUS_CHECKED. + immutable_status_.PermitUncheckedError(); +} + +ForwardIterator::~ForwardIterator() { Cleanup(true); } + +void ForwardIterator::SVCleanup(DBImpl* db, SuperVersion* sv, + bool background_purge_on_iterator_cleanup) { + if (sv->Unref()) { + // Job id == 0 means that this is not our background process, but rather + // user thread + JobContext job_context(0); + db->mutex_.Lock(); + sv->Cleanup(); + db->FindObsoleteFiles(&job_context, false, true); + if (background_purge_on_iterator_cleanup) { + db->ScheduleBgLogWriterClose(&job_context); + db->AddSuperVersionsToFreeQueue(sv); + db->SchedulePurge(); + } + db->mutex_.Unlock(); + if (!background_purge_on_iterator_cleanup) { + delete sv; + } + if (job_context.HaveSomethingToDelete()) { + db->PurgeObsoleteFiles(job_context, background_purge_on_iterator_cleanup); + } + job_context.Clean(); + } +} + +namespace { +struct SVCleanupParams { + DBImpl* db; + SuperVersion* sv; + bool background_purge_on_iterator_cleanup; +}; +} // anonymous namespace + +// Used in PinnedIteratorsManager to release pinned SuperVersion +void ForwardIterator::DeferredSVCleanup(void* arg) { + auto d = reinterpret_cast(arg); + ForwardIterator::SVCleanup(d->db, d->sv, + d->background_purge_on_iterator_cleanup); + delete d; +} + +void ForwardIterator::SVCleanup() { + if (sv_ == nullptr) { + return; + } + bool background_purge = + read_options_.background_purge_on_iterator_cleanup || + db_->immutable_db_options().avoid_unnecessary_blocking_io; + if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { + // pinned_iters_mgr_ tells us to make sure that all visited key-value slices + // are alive until pinned_iters_mgr_->ReleasePinnedData() is called. + // The slices may point into some memtables owned by sv_, so we need to keep + // sv_ referenced until pinned_iters_mgr_ unpins everything. 
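The branch above parks the SuperVersion reference with the pinned-iterators manager instead of cleaning up eagerly. A toy stand-in (invented names) for that pin-pointer-with-callback idiom, mirroring how PinnedIteratorsManager::PinPtr() is used here:

    #include <utility>
    #include <vector>

    struct DeferredReleases {
      using ReleaseFn = void (*)(void*);
      std::vector<std::pair<void*, ReleaseFn>> pinned;

      void PinPtr(void* p, ReleaseFn release) {
        pinned.emplace_back(p, release);
      }

      // Analogue of ReleasePinnedData(): run every callback, e.g.
      // ForwardIterator::DeferredSVCleanup above.
      void ReleaseAll() {
        for (auto& entry : pinned) entry.second(entry.first);
        pinned.clear();
      }
    };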
+ auto p = new SVCleanupParams{db_, sv_, background_purge}; + pinned_iters_mgr_->PinPtr(p, &ForwardIterator::DeferredSVCleanup); + } else { + SVCleanup(db_, sv_, background_purge); + } +} + +void ForwardIterator::Cleanup(bool release_sv) { + if (mutable_iter_ != nullptr) { + DeleteIterator(mutable_iter_, true /* is_arena */); + } + + for (auto* m : imm_iters_) { + DeleteIterator(m, true /* is_arena */); + } + imm_iters_.clear(); + + for (auto* f : l0_iters_) { + DeleteIterator(f); + } + l0_iters_.clear(); + + for (auto* l : level_iters_) { + DeleteIterator(l); + } + level_iters_.clear(); + + if (release_sv) { + SVCleanup(); + } +} + +bool ForwardIterator::Valid() const { + // See UpdateCurrent(). + return valid_ ? !current_over_upper_bound_ : false; +} + +void ForwardIterator::SeekToFirst() { + if (sv_ == nullptr) { + RebuildIterators(true); + } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) { + RenewIterators(); + } else if (immutable_status_.IsIncomplete()) { + ResetIncompleteIterators(); + } + SeekInternal(Slice(), true, false); +} + +bool ForwardIterator::IsOverUpperBound(const Slice& internal_key) const { + return !(read_options_.iterate_upper_bound == nullptr || + cfd_->internal_comparator().user_comparator()->Compare( + ExtractUserKey(internal_key), + *read_options_.iterate_upper_bound) < 0); +} + +void ForwardIterator::Seek(const Slice& internal_key) { + if (sv_ == nullptr) { + RebuildIterators(true); + } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) { + RenewIterators(); + } else if (immutable_status_.IsIncomplete()) { + ResetIncompleteIterators(); + } + + SeekInternal(internal_key, false, false); + if (read_options_.async_io) { + SeekInternal(internal_key, false, true); + } +} + +// In case of async_io, SeekInternal is called twice with seek_after_async_io +// enabled in second call which only does seeking part to retrieve the blocks. +void ForwardIterator::SeekInternal(const Slice& internal_key, + bool seek_to_first, + bool seek_after_async_io) { + assert(mutable_iter_); + // mutable + if (!seek_after_async_io) { + seek_to_first ? mutable_iter_->SeekToFirst() + : mutable_iter_->Seek(internal_key); + } + + // immutable + // TODO(ljin): NeedToSeekImmutable has negative impact on performance + // if it turns to need to seek immutable often. We probably want to have + // an option to turn it off. + if (seek_to_first || seek_after_async_io || + NeedToSeekImmutable(internal_key)) { + if (!seek_after_async_io) { + immutable_status_ = Status::OK(); + if (has_iter_trimmed_for_upper_bound_ && + ( + // prev_ is not set yet + is_prev_set_ == false || + // We are doing SeekToFirst() and internal_key.size() = 0 + seek_to_first || + // prev_key_ > internal_key + cfd_->internal_comparator().InternalKeyComparator::Compare( + prev_key_.GetInternalKey(), internal_key) > 0)) { + // Some iterators are trimmed. Need to rebuild. + RebuildIterators(true); + // Already seeked mutable iter, so seek again + seek_to_first ? mutable_iter_->SeekToFirst() + : mutable_iter_->Seek(internal_key); + } + { + auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator())); + immutable_min_heap_.swap(tmp); + } + for (size_t i = 0; i < imm_iters_.size(); i++) { + auto* m = imm_iters_[i]; + seek_to_first ? 
m->SeekToFirst() : m->Seek(internal_key);
+        if (!m->status().ok()) {
+          immutable_status_ = m->status();
+        } else if (m->Valid()) {
+          immutable_min_heap_.push(m);
+        }
+      }
+    }
+
+    Slice target_user_key;
+    if (!seek_to_first) {
+      target_user_key = ExtractUserKey(internal_key);
+    }
+    const VersionStorageInfo* vstorage = sv_->current->storage_info();
+    const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+    for (size_t i = 0; i < l0.size(); ++i) {
+      if (!l0_iters_[i]) {
+        continue;
+      }
+      if (seek_after_async_io) {
+        if (!l0_iters_[i]->status().IsTryAgain()) {
+          continue;
+        }
+      }
+
+      if (seek_to_first) {
+        l0_iters_[i]->SeekToFirst();
+      } else {
+        // If the target key passes over the largest key, we are sure Next()
+        // won't go over this file.
+        if (seek_after_async_io == false &&
+            user_comparator_->Compare(target_user_key,
+                                      l0[i]->largest.user_key()) > 0) {
+          if (read_options_.iterate_upper_bound != nullptr) {
+            has_iter_trimmed_for_upper_bound_ = true;
+            DeleteIterator(l0_iters_[i]);
+            l0_iters_[i] = nullptr;
+          }
+          continue;
+        }
+        l0_iters_[i]->Seek(internal_key);
+      }
+
+      if (l0_iters_[i]->status().IsTryAgain()) {
+        assert(!seek_after_async_io);
+        continue;
+      } else if (!l0_iters_[i]->status().ok()) {
+        immutable_status_ = l0_iters_[i]->status();
+      } else if (l0_iters_[i]->Valid() &&
+                 !IsOverUpperBound(l0_iters_[i]->key())) {
+        immutable_min_heap_.push(l0_iters_[i]);
+      } else {
+        has_iter_trimmed_for_upper_bound_ = true;
+        DeleteIterator(l0_iters_[i]);
+        l0_iters_[i] = nullptr;
+      }
+    }
+
+    for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+      const std::vector<FileMetaData*>& level_files =
+          vstorage->LevelFiles(level);
+      if (level_files.empty()) {
+        continue;
+      }
+      if (level_iters_[level - 1] == nullptr) {
+        continue;
+      }
+
+      if (seek_after_async_io) {
+        if (!level_iters_[level - 1]->status().IsTryAgain()) {
+          continue;
+        }
+      }
+      uint32_t f_idx = 0;
+      if (!seek_to_first && !seek_after_async_io) {
+        f_idx = FindFileInRange(level_files, internal_key, 0,
+                                static_cast<uint32_t>(level_files.size()));
+      }
+
+      // Seek
+      if (seek_after_async_io || f_idx < level_files.size()) {
+        if (!seek_after_async_io) {
+          level_iters_[level - 1]->SetFileIndex(f_idx);
+        }
+        seek_to_first ? level_iters_[level - 1]->SeekToFirst()
+                      : level_iters_[level - 1]->Seek(internal_key);
+
+        if (level_iters_[level - 1]->status().IsTryAgain()) {
+          assert(!seek_after_async_io);
+          continue;
+        } else if (!level_iters_[level - 1]->status().ok()) {
+          immutable_status_ = level_iters_[level - 1]->status();
+        } else if (level_iters_[level - 1]->Valid() &&
+                   !IsOverUpperBound(level_iters_[level - 1]->key())) {
+          immutable_min_heap_.push(level_iters_[level - 1]);
+        } else {
+          // Nothing in this level is interesting. Remove.
+          has_iter_trimmed_for_upper_bound_ = true;
+          DeleteIterator(level_iters_[level - 1]);
+          level_iters_[level - 1] = nullptr;
+        }
+      }
+    }
+
+    if (seek_to_first) {
+      is_prev_set_ = false;
+    } else {
+      prev_key_.SetInternalKey(internal_key);
+      is_prev_set_ = true;
+      is_prev_inclusive_ = true;
+    }
+
+    TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Immutable", this);
+  } else if (current_ && current_ != mutable_iter_) {
+    // current_ is one of immutable iterators, push it back to the heap
+    immutable_min_heap_.push(current_);
+  }
+
+  // For async_io, it should be updated when seek_after_async_io is true (in
+  // second call).
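+  // Concretely (a sketch of the two-pass protocol described above): with
+  // async_io enabled, a user-level Seek(k) executes
+  //   SeekInternal(k, false, /*seek_after_async_io=*/false);  // issue reads
+  //   SeekInternal(k, false, /*seek_after_async_io=*/true);   // re-seek only
+  //                                                           // TryAgain iters
+  // so current_ is refreshed only at the end of the second pass.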
+ if (seek_to_first || !read_options_.async_io || seek_after_async_io) { + UpdateCurrent(); + } + TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Return", this); +} + +void ForwardIterator::Next() { + assert(valid_); + bool update_prev_key = false; + + if (sv_ == nullptr || sv_->version_number != cfd_->GetSuperVersionNumber()) { + std::string current_key = key().ToString(); + Slice old_key(current_key.data(), current_key.size()); + + if (sv_ == nullptr) { + RebuildIterators(true); + } else { + RenewIterators(); + } + + SeekInternal(old_key, false, false); + if (read_options_.async_io) { + SeekInternal(old_key, false, true); + } + + if (!valid_ || key().compare(old_key) != 0) { + return; + } + } else if (current_ != mutable_iter_) { + // It is going to advance immutable iterator + + if (is_prev_set_ && prefix_extractor_) { + // advance prev_key_ to current_ only if they share the same prefix + update_prev_key = + prefix_extractor_->Transform(prev_key_.GetUserKey()) + .compare(prefix_extractor_->Transform(current_->key())) == 0; + } else { + update_prev_key = true; + } + + if (update_prev_key) { + prev_key_.SetInternalKey(current_->key()); + is_prev_set_ = true; + is_prev_inclusive_ = false; + } + } + + current_->Next(); + if (current_ != mutable_iter_) { + if (!current_->status().ok()) { + immutable_status_ = current_->status(); + } else if ((current_->Valid()) && (!IsOverUpperBound(current_->key()))) { + immutable_min_heap_.push(current_); + } else { + if ((current_->Valid()) && (IsOverUpperBound(current_->key()))) { + // remove the current iterator + DeleteCurrentIter(); + current_ = nullptr; + } + if (update_prev_key) { + mutable_iter_->Seek(prev_key_.GetInternalKey()); + } + } + } + UpdateCurrent(); + TEST_SYNC_POINT_CALLBACK("ForwardIterator::Next:Return", this); +} + +Slice ForwardIterator::key() const { + assert(valid_); + return current_->key(); +} + +Slice ForwardIterator::value() const { + assert(valid_); + return current_->value(); +} + +Status ForwardIterator::status() const { + if (!status_.ok()) { + return status_; + } else if (!mutable_iter_->status().ok()) { + return mutable_iter_->status(); + } + + return immutable_status_; +} + +bool ForwardIterator::PrepareValue() { + assert(valid_); + if (current_->PrepareValue()) { + return true; + } + + assert(!current_->Valid()); + assert(!current_->status().ok()); + assert(current_ != mutable_iter_); // memtable iterator can't fail + assert(immutable_status_.ok()); + + valid_ = false; + immutable_status_ = current_->status(); + return false; +} + +Status ForwardIterator::GetProperty(std::string prop_name, std::string* prop) { + assert(prop != nullptr); + if (prop_name == "rocksdb.iterator.super-version-number") { + *prop = std::to_string(sv_->version_number); + return Status::OK(); + } + return Status::InvalidArgument(); +} + +void ForwardIterator::SetPinnedItersMgr( + PinnedIteratorsManager* pinned_iters_mgr) { + pinned_iters_mgr_ = pinned_iters_mgr; + UpdateChildrenPinnedItersMgr(); +} + +void ForwardIterator::UpdateChildrenPinnedItersMgr() { + // Set PinnedIteratorsManager for mutable memtable iterator. + if (mutable_iter_) { + mutable_iter_->SetPinnedItersMgr(pinned_iters_mgr_); + } + + // Set PinnedIteratorsManager for immutable memtable iterators. + for (InternalIterator* child_iter : imm_iters_) { + if (child_iter) { + child_iter->SetPinnedItersMgr(pinned_iters_mgr_); + } + } + + // Set PinnedIteratorsManager for L0 files iterators. 
+ for (InternalIterator* child_iter : l0_iters_) { + if (child_iter) { + child_iter->SetPinnedItersMgr(pinned_iters_mgr_); + } + } + + // Set PinnedIteratorsManager for L1+ levels iterators. + for (ForwardLevelIterator* child_iter : level_iters_) { + if (child_iter) { + child_iter->SetPinnedItersMgr(pinned_iters_mgr_); + } + } +} + +bool ForwardIterator::IsKeyPinned() const { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + current_->IsKeyPinned(); +} + +bool ForwardIterator::IsValuePinned() const { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + current_->IsValuePinned(); +} + +void ForwardIterator::RebuildIterators(bool refresh_sv) { + // Clean up + Cleanup(refresh_sv); + if (refresh_sv) { + // New + sv_ = cfd_->GetReferencedSuperVersion(db_); + } + ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); + mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); + sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); + if (!read_options_.ignore_range_deletions) { + std::unique_ptr range_del_iter( + sv_->mem->NewRangeTombstoneIterator( + read_options_, sv_->current->version_set()->LastSequence(), + false /* immutable_memtable */)); + range_del_agg.AddTombstones(std::move(range_del_iter)); + // Always return Status::OK(). + Status temp_s = sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_, + &range_del_agg); + assert(temp_s.ok()); + } + has_iter_trimmed_for_upper_bound_ = false; + + const auto* vstorage = sv_->current->storage_info(); + const auto& l0_files = vstorage->LevelFiles(0); + l0_iters_.reserve(l0_files.size()); + for (const auto* l0 : l0_files) { + if ((read_options_.iterate_upper_bound != nullptr) && + cfd_->internal_comparator().user_comparator()->Compare( + l0->smallest.user_key(), *read_options_.iterate_upper_bound) > 0) { + // No need to set has_iter_trimmed_for_upper_bound_: this ForwardIterator + // will never be interested in files with smallest key above + // iterate_upper_bound, since iterate_upper_bound can't be changed. + l0_iters_.push_back(nullptr); + continue; + } + l0_iters_.push_back(cfd_->table_cache()->NewIterator( + read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0, + read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, + sv_->mutable_cf_options.prefix_extractor, + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + sv_->mutable_cf_options.block_protection_bytes_per_key)); + } + BuildLevelIterators(vstorage, sv_); + current_ = nullptr; + is_prev_set_ = false; + + UpdateChildrenPinnedItersMgr(); + if (!range_del_agg.IsEmpty()) { + status_ = Status::NotSupported( + "Range tombstones unsupported with ForwardIterator"); + valid_ = false; + } +} + +void ForwardIterator::RenewIterators() { + SuperVersion* svnew; + assert(sv_); + svnew = cfd_->GetReferencedSuperVersion(db_); + + if (mutable_iter_ != nullptr) { + DeleteIterator(mutable_iter_, true /* is_arena */); + } + for (auto* m : imm_iters_) { + DeleteIterator(m, true /* is_arena */); + } + imm_iters_.clear(); + + mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_); + svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_); + ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); + if (!read_options_.ignore_range_deletions) { + std::unique_ptr range_del_iter( + svnew->mem->NewRangeTombstoneIterator( + read_options_, sv_->current->version_set()->LastSequence(), + false /* immutable_memtable */)); + range_del_agg.AddTombstones(std::move(range_del_iter)); + // Always return Status::OK(). + Status temp_s = svnew->imm->AddRangeTombstoneIterators( + read_options_, &arena_, &range_del_agg); + assert(temp_s.ok()); + } + + const auto* vstorage = sv_->current->storage_info(); + const auto& l0_files = vstorage->LevelFiles(0); + const auto* vstorage_new = svnew->current->storage_info(); + const auto& l0_files_new = vstorage_new->LevelFiles(0); + size_t iold, inew; + bool found; + std::vector l0_iters_new; + l0_iters_new.reserve(l0_files_new.size()); + + for (inew = 0; inew < l0_files_new.size(); inew++) { + found = false; + for (iold = 0; iold < l0_files.size(); iold++) { + if (l0_files[iold] == l0_files_new[inew]) { + found = true; + break; + } + } + if (found) { + if (l0_iters_[iold] == nullptr) { + l0_iters_new.push_back(nullptr); + TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Null", this); + } else { + l0_iters_new.push_back(l0_iters_[iold]); + l0_iters_[iold] = nullptr; + TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Copy", this); + } + continue; + } + l0_iters_new.push_back(cfd_->table_cache()->NewIterator( + read_options_, *cfd_->soptions(), cfd_->internal_comparator(), + *l0_files_new[inew], + read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, + svnew->mutable_cf_options.prefix_extractor, + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + MaxFileSizeForL0MetaPin(svnew->mutable_cf_options), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + svnew->mutable_cf_options.block_protection_bytes_per_key)); + } + + for (auto* f : l0_iters_) { + DeleteIterator(f); + } + l0_iters_.clear(); + l0_iters_ = l0_iters_new; + + for (auto* l : level_iters_) { + DeleteIterator(l); + } + level_iters_.clear(); + BuildLevelIterators(vstorage_new, svnew); + current_ = nullptr; + is_prev_set_ = false; + SVCleanup(); + sv_ = svnew; + + UpdateChildrenPinnedItersMgr(); + if (!range_del_agg.IsEmpty()) { + status_ = Status::NotSupported( + "Range tombstones unsupported with ForwardIterator"); + valid_ = false; + } +} + +void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage, + SuperVersion* sv) { + level_iters_.reserve(vstorage->num_levels() - 1); + for (int32_t level = 1; level < vstorage->num_levels(); ++level) { + const auto& level_files = vstorage->LevelFiles(level); + if ((level_files.empty()) || + ((read_options_.iterate_upper_bound != nullptr) && + (user_comparator_->Compare(*read_options_.iterate_upper_bound, + level_files[0]->smallest.user_key()) < + 0))) { + level_iters_.push_back(nullptr); + if (!level_files.empty()) { + has_iter_trimmed_for_upper_bound_ = true; + } + } else { + level_iters_.push_back(new ForwardLevelIterator( + cfd_, read_options_, level_files, + sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_, + sv->mutable_cf_options.block_protection_bytes_per_key)); + } + } +} + +void ForwardIterator::ResetIncompleteIterators() { + const auto& l0_files = sv_->current->storage_info()->LevelFiles(0); + for (size_t i = 0; i < l0_iters_.size(); ++i) { + assert(i < l0_files.size()); + if (!l0_iters_[i] || !l0_iters_[i]->status().IsIncomplete()) { + continue; + } + DeleteIterator(l0_iters_[i]); + l0_iters_[i] = cfd_->table_cache()->NewIterator( + read_options_, *cfd_->soptions(), cfd_->internal_comparator(), + *l0_files[i], /*range_del_agg=*/nullptr, + sv_->mutable_cf_options.prefix_extractor, + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + sv_->mutable_cf_options.block_protection_bytes_per_key); + l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); + } + + for (auto* level_iter : level_iters_) { + if (level_iter && level_iter->status().IsIncomplete()) { + level_iter->Reset(); + } + } + + current_ = nullptr; + is_prev_set_ = false; +} + +void ForwardIterator::UpdateCurrent() { + if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) { + current_ = nullptr; + } else if (immutable_min_heap_.empty()) { + current_ = mutable_iter_; + } else if (!mutable_iter_->Valid()) { + current_ = immutable_min_heap_.top(); + immutable_min_heap_.pop(); + } else { + current_ = immutable_min_heap_.top(); + assert(current_ != nullptr); + assert(current_->Valid()); + int cmp = cfd_->internal_comparator().InternalKeyComparator::Compare( + mutable_iter_->key(), current_->key()); + assert(cmp != 0); + if (cmp > 0) { + immutable_min_heap_.pop(); + } else { + current_ = mutable_iter_; + } + } + 
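+  // Whichever branch was taken, current_ now points at the iterator holding
+  // the smaller internal key (or is nullptr once everything is exhausted);
+  // a tie is impossible because internal keys embed distinct sequence
+  // numbers, which is what the cmp != 0 assertion above checks.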
valid_ = current_ != nullptr && immutable_status_.ok(); + if (!status_.ok()) { + status_ = Status::OK(); + } + + // Upper bound doesn't apply to the memtable iterator. We want Valid() to + // return false when all iterators are over iterate_upper_bound, but can't + // just set valid_ to false, as that would effectively disable the tailing + // optimization (Seek() would be called on all immutable iterators regardless + // of whether the target key is greater than prev_key_). + current_over_upper_bound_ = valid_ && IsOverUpperBound(current_->key()); +} + +bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { + // We maintain the interval (prev_key_, immutable_min_heap_.top()->key()) + // such that there are no records with keys within that range in + // immutable_min_heap_. Since immutable structures (SST files and immutable + // memtables) can't change in this version, we don't need to do a seek if + // 'target' belongs to that interval (immutable_min_heap_.top() is already + // at the correct position). + + if (!valid_ || !current_ || !is_prev_set_ || !immutable_status_.ok()) { + return true; + } + Slice prev_key = prev_key_.GetInternalKey(); + if (prefix_extractor_ && prefix_extractor_->Transform(target).compare( + prefix_extractor_->Transform(prev_key)) != 0) { + return true; + } + if (cfd_->internal_comparator().InternalKeyComparator::Compare( + prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) { + return true; + } + + if (immutable_min_heap_.empty() && current_ == mutable_iter_) { + // Nothing to seek on. + return false; + } + if (cfd_->internal_comparator().InternalKeyComparator::Compare( + target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key() + : current_->key()) > 0) { + return true; + } + return false; +} + +void ForwardIterator::DeleteCurrentIter() { + const VersionStorageInfo* vstorage = sv_->current->storage_info(); + const std::vector& l0 = vstorage->LevelFiles(0); + for (size_t i = 0; i < l0.size(); ++i) { + if (!l0_iters_[i]) { + continue; + } + if (l0_iters_[i] == current_) { + has_iter_trimmed_for_upper_bound_ = true; + DeleteIterator(l0_iters_[i]); + l0_iters_[i] = nullptr; + return; + } + } + + for (int32_t level = 1; level < vstorage->num_levels(); ++level) { + if (level_iters_[level - 1] == nullptr) { + continue; + } + if (level_iters_[level - 1] == current_) { + has_iter_trimmed_for_upper_bound_ = true; + DeleteIterator(level_iters_[level - 1]); + level_iters_[level - 1] = nullptr; + } + } +} + +bool ForwardIterator::TEST_CheckDeletedIters(int* pdeleted_iters, + int* pnum_iters) { + bool retval = false; + int deleted_iters = 0; + int num_iters = 0; + + const VersionStorageInfo* vstorage = sv_->current->storage_info(); + const std::vector& l0 = vstorage->LevelFiles(0); + for (size_t i = 0; i < l0.size(); ++i) { + if (!l0_iters_[i]) { + retval = true; + deleted_iters++; + } else { + num_iters++; + } + } + + for (int32_t level = 1; level < vstorage->num_levels(); ++level) { + if ((level_iters_[level - 1] == nullptr) && + (!vstorage->LevelFiles(level).empty())) { + retval = true; + deleted_iters++; + } else if (!vstorage->LevelFiles(level).empty()) { + num_iters++; + } + } + if ((!retval) && num_iters <= 1) { + retval = true; + } + if (pdeleted_iters) { + *pdeleted_iters = deleted_iters; + } + if (pnum_iters) { + *pnum_iters = num_iters; + } + return retval; +} + +uint32_t ForwardIterator::FindFileInRange( + const std::vector& files, const Slice& internal_key, + uint32_t left, uint32_t right) { + auto cmp = [&](const FileMetaData* f, const 
Slice& k) -> bool {
+    return cfd_->internal_comparator().InternalKeyComparator::Compare(
+               f->largest.Encode(), k) < 0;
+  };
+  const auto& b = files.begin();
+  return static_cast<uint32_t>(
+      std::lower_bound(b + left, b + right, internal_key, cmp) - b);
+}
+
+void ForwardIterator::DeleteIterator(InternalIterator* iter, bool is_arena) {
+  if (iter == nullptr) {
+    return;
+  }
+
+  if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+    pinned_iters_mgr_->PinIterator(iter, is_arena);
+  } else {
+    if (is_arena) {
+      iter->~InternalIterator();
+    } else {
+      delete iter;
+    }
+  }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/forward_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/forward_iterator.h
new file mode 100644
index 0000000000..cb418aeeb0
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/forward_iterator.h
@@ -0,0 +1,166 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/comparator.h"
+
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "memory/arena.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class Env;
+struct SuperVersion;
+class ColumnFamilyData;
+class ForwardLevelIterator;
+class VersionStorageInfo;
+struct FileMetaData;
+
+class MinIterComparator {
+ public:
+  explicit MinIterComparator(const CompareInterface* comparator)
+      : comparator_(comparator) {}
+
+  bool operator()(InternalIterator* a, InternalIterator* b) {
+    return comparator_->Compare(a->key(), b->key()) > 0;
+  }
+
+ private:
+  const CompareInterface* comparator_;
+};
+
+using MinIterHeap =
+    std::priority_queue<InternalIterator*, std::vector<InternalIterator*>,
+                        MinIterComparator>;
+
+/**
+ * ForwardIterator is a special type of iterator that only supports Seek()
+ * and Next(). It is expected to perform better than TailingIterator by
+ * removing the encapsulation and making all information accessible within
+ * the iterator. In the current implementation, a snapshot is taken at the
+ * time Seek() is called; subsequent Next() calls do not see entries written
+ * after that point.
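+ *
+ * Sketch of intended use (in practice this class backs tailing iterators,
+ * i.e. ReadOptions::tailing == true, and is created by DBImpl rather than
+ * constructed directly; ProcessEntry is a hypothetical callback):
+ *
+ *   ForwardIterator iter(db, read_options, cfd);
+ *   for (iter.Seek(start_internal_key); iter.Valid(); iter.Next()) {
+ *     ProcessEntry(iter.key(), iter.value());
+ *   }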
+ */ +class ForwardIterator : public InternalIterator { + public: + ForwardIterator(DBImpl* db, const ReadOptions& read_options, + ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr, + bool allow_unprepared_value = false); + virtual ~ForwardIterator(); + + void SeekForPrev(const Slice& /*target*/) override { + status_ = Status::NotSupported("ForwardIterator::SeekForPrev()"); + valid_ = false; + } + void SeekToLast() override { + status_ = Status::NotSupported("ForwardIterator::SeekToLast()"); + valid_ = false; + } + void Prev() override { + status_ = Status::NotSupported("ForwardIterator::Prev"); + valid_ = false; + } + + virtual bool Valid() const override; + void SeekToFirst() override; + virtual void Seek(const Slice& target) override; + virtual void Next() override; + virtual Slice key() const override; + virtual Slice value() const override; + virtual Status status() const override; + virtual bool PrepareValue() override; + virtual Status GetProperty(std::string prop_name, std::string* prop) override; + virtual void SetPinnedItersMgr( + PinnedIteratorsManager* pinned_iters_mgr) override; + virtual bool IsKeyPinned() const override; + virtual bool IsValuePinned() const override; + + bool TEST_CheckDeletedIters(int* deleted_iters, int* num_iters); + + private: + void Cleanup(bool release_sv); + // Unreference and, if needed, clean up the current SuperVersion. This is + // either done immediately or deferred until this iterator is unpinned by + // PinnedIteratorsManager. + void SVCleanup(); + static void SVCleanup(DBImpl* db, SuperVersion* sv, + bool background_purge_on_iterator_cleanup); + static void DeferredSVCleanup(void* arg); + + void RebuildIterators(bool refresh_sv); + void RenewIterators(); + void BuildLevelIterators(const VersionStorageInfo* vstorage, + SuperVersion* sv); + void ResetIncompleteIterators(); + void SeekInternal(const Slice& internal_key, bool seek_to_first, + bool seek_after_async_io); + + void UpdateCurrent(); + bool NeedToSeekImmutable(const Slice& internal_key); + void DeleteCurrentIter(); + uint32_t FindFileInRange(const std::vector& files, + const Slice& internal_key, uint32_t left, + uint32_t right); + + bool IsOverUpperBound(const Slice& internal_key) const; + + // Set PinnedIteratorsManager for all children Iterators, this function should + // be called whenever we update children Iterators or pinned_iters_mgr_. + void UpdateChildrenPinnedItersMgr(); + + // A helper function that will release iter in the proper manner, or pass it + // to pinned_iters_mgr_ to release it later if pinning is enabled. + void DeleteIterator(InternalIterator* iter, bool is_arena = false); + + DBImpl* const db_; + ReadOptions read_options_; + ColumnFamilyData* const cfd_; + const SliceTransform* const prefix_extractor_; + const Comparator* user_comparator_; + const bool allow_unprepared_value_; + MinIterHeap immutable_min_heap_; + + SuperVersion* sv_; + InternalIterator* mutable_iter_; + std::vector imm_iters_; + std::vector l0_iters_; + std::vector level_iters_; + InternalIterator* current_; + bool valid_; + + // Internal iterator status; set only by one of the unsupported methods. + Status status_; + // Status of immutable iterators, maintained here to avoid iterating over + // all of them in status(). + Status immutable_status_; + // Indicates that at least one of the immutable iterators pointed to a key + // larger than iterate_upper_bound and was therefore destroyed. Seek() may + // need to rebuild such iterators. 
+ bool has_iter_trimmed_for_upper_bound_; + // Is current key larger than iterate_upper_bound? If so, makes Valid() + // return false. + bool current_over_upper_bound_; + + // Left endpoint of the range of keys that immutable iterators currently + // cover. When Seek() is called with a key that's within that range, immutable + // iterators don't need to be moved; see NeedToSeekImmutable(). This key is + // included in the range after a Seek(), but excluded when advancing the + // iterator using Next(). + IterKey prev_key_; + bool is_prev_set_; + bool is_prev_inclusive_; + + PinnedIteratorsManager* pinned_iters_mgr_; + Arena arena_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/history_trimming_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/history_trimming_iterator.h new file mode 100644 index 0000000000..4af5cde720 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/history_trimming_iterator.h @@ -0,0 +1,95 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include + +#include "db/dbformat.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice.h" +#include "table/internal_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +class HistoryTrimmingIterator : public InternalIterator { + public: + explicit HistoryTrimmingIterator(InternalIterator* input, + const Comparator* cmp, const std::string& ts) + : input_(input), filter_ts_(ts), cmp_(cmp) { + assert(cmp_->timestamp_size() > 0 && !ts.empty()); + } + + bool filter() const { + if (!input_->Valid()) { + return true; + } + Slice current_ts = ExtractTimestampFromKey(key(), cmp_->timestamp_size()); + return cmp_->CompareTimestamp(current_ts, Slice(filter_ts_)) <= 0; + } + + bool Valid() const override { return input_->Valid(); } + + void SeekToFirst() override { + input_->SeekToFirst(); + while (!filter()) { + input_->Next(); + } + } + + void SeekToLast() override { + input_->SeekToLast(); + while (!filter()) { + input_->Prev(); + } + } + + void Seek(const Slice& target) override { + input_->Seek(target); + while (!filter()) { + input_->Next(); + } + } + + void SeekForPrev(const Slice& target) override { + input_->SeekForPrev(target); + while (!filter()) { + input_->Prev(); + } + } + + void Next() override { + do { + input_->Next(); + } while (!filter()); + } + + void Prev() override { + do { + input_->Prev(); + } while (!filter()); + } + + Slice key() const override { return input_->key(); } + + Slice value() const override { return input_->value(); } + + Status status() const override { return input_->status(); } + + bool IsKeyPinned() const override { return input_->IsKeyPinned(); } + + bool IsValuePinned() const override { return input_->IsValuePinned(); } + + bool IsDeleteRangeSentinelKey() const override { + return input_->IsDeleteRangeSentinelKey(); + } + + private: + InternalIterator* input_; + const std::string filter_ts_; + const Comparator* const cmp_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/import_column_family_job.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/import_column_family_job.cc new file mode 100644 index 0000000000..f76d0ac1b9 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/import_column_family_job.cc @@ -0,0 
+1,364 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/version_builder.h"
+
+#include "db/import_column_family_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number,
+                                      SuperVersion* sv) {
+  Status status;
+
+  // Read the information of files we are importing
+  for (const auto& file_metadata : metadata_) {
+    const auto file_path = file_metadata.db_path + "/" + file_metadata.name;
+    IngestedFileInfo file_to_import;
+    status = GetIngestedFileInfo(file_path, next_file_number++, sv,
+                                 file_metadata, &file_to_import);
+    if (!status.ok()) {
+      return status;
+    }
+    files_to_import_.push_back(file_to_import);
+  }
+
+  auto num_files = files_to_import_.size();
+  if (num_files == 0) {
+    status = Status::InvalidArgument("The list of files is empty");
+    return status;
+  }
+
+  for (const auto& f : files_to_import_) {
+    if (f.num_entries == 0) {
+      status = Status::InvalidArgument("File contains no entries");
+      return status;
+    }
+
+    if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) {
+      status = Status::Corruption("File has corrupted keys");
+      return status;
+    }
+  }
+
+  // Copy/Move external files into DB
+  auto hardlink_files = import_options_.move_files;
+  for (auto& f : files_to_import_) {
+    const auto path_outside_db = f.external_file_path;
+    const auto path_inside_db = TableFileName(
+        cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+
+    if (hardlink_files) {
+      status =
+          fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+      if (status.IsNotSupported()) {
+        // Original file is on a different FS, use copy instead of hard linking
+        hardlink_files = false;
+        ROCKS_LOG_INFO(db_options_.info_log,
+                       "Tried to link file %s but it's not supported : %s",
+                       f.internal_file_path.c_str(), status.ToString().c_str());
+      }
+    }
+    if (!hardlink_files) {
+      status =
+          CopyFile(fs_.get(), path_outside_db, path_inside_db, 0,
+                   db_options_.use_fsync, io_tracer_, Temperature::kUnknown);
+    }
+    if (!status.ok()) {
+      break;
+    }
+    f.copy_file = !hardlink_files;
+    f.internal_file_path = path_inside_db;
+  }
+
+  if (!status.ok()) {
+    // We failed, remove all files that we copied into the db
+    for (const auto& f : files_to_import_) {
+      if (f.internal_file_path.empty()) {
+        break;
+      }
+      const auto s =
+          fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+
+  return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ImportColumnFamilyJob::Run() {
+  // We use the import time as the ancestor time. This is the time the data
+  // is written to the database.
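+  // (The dummy VersionBuilder/VersionStorageInfo constructed below exist only
+  // to replay the imported files through a VersionEdit so that any missing
+  // epoch numbers can be recovered before the real edit is recorded.)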
+  int64_t temp_current_time = 0;
+  uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+  uint64_t current_time = kUnknownOldestAncesterTime;
+  if (clock_->GetCurrentTime(&temp_current_time).ok()) {
+    current_time = oldest_ancester_time =
+        static_cast<uint64_t>(temp_current_time);
+  }
+
+  // Recover files' epoch number using dummy VersionStorageInfo
+  VersionBuilder dummy_version_builder(
+      cfd_->current()->version_set()->file_options(), cfd_->ioptions(),
+      cfd_->table_cache(), cfd_->current()->storage_info(),
+      cfd_->current()->version_set(),
+      cfd_->GetFileMetadataCacheReservationManager());
+  VersionStorageInfo dummy_vstorage(
+      &cfd_->internal_comparator(), cfd_->user_comparator(),
+      cfd_->NumberLevels(), cfd_->ioptions()->compaction_style,
+      nullptr /* src_vstorage */, cfd_->ioptions()->force_consistency_checks,
+      EpochNumberRequirement::kMightMissing);
+  Status s;
+  for (size_t i = 0; s.ok() && i < files_to_import_.size(); ++i) {
+    const auto& f = files_to_import_[i];
+    const auto& file_metadata = metadata_[i];
+
+    uint64_t tail_size = 0;
+    bool contain_no_data_blocks = f.table_properties.num_entries > 0 &&
+                                  (f.table_properties.num_entries ==
+                                   f.table_properties.num_range_deletions);
+    if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) {
+      uint64_t file_size = f.fd.GetFileSize();
+      assert(f.table_properties.tail_start_offset <= file_size);
+      tail_size = file_size - f.table_properties.tail_start_offset;
+    }
+
+    VersionEdit dummy_version_edit;
+    dummy_version_edit.AddFile(
+        file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
+        f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key,
+        file_metadata.smallest_seqno, file_metadata.largest_seqno, false,
+        file_metadata.temperature, kInvalidBlobFileNumber, oldest_ancester_time,
+        current_time, file_metadata.epoch_number, kUnknownFileChecksum,
+        kUnknownFileChecksumFuncName, f.unique_id, 0, tail_size);
+    s = dummy_version_builder.Apply(&dummy_version_edit);
+  }
+  if (s.ok()) {
+    s = dummy_version_builder.SaveTo(&dummy_vstorage);
+  }
+  if (s.ok()) {
+    dummy_vstorage.RecoverEpochNumbers(cfd_);
+  }
+
+  // Record changes from this CF import in VersionEdit, including files with
+  // recovered epoch numbers
+  if (s.ok()) {
+    edit_.SetColumnFamily(cfd_->GetID());
+
+    for (int level = 0; level < dummy_vstorage.num_levels(); level++) {
+      for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) {
+        edit_.AddFile(level, *file_meta);
+        // If the incoming sequence number is higher, update the local
+        // sequence number.
+        if (file_meta->fd.largest_seqno > versions_->LastSequence()) {
+          versions_->SetLastAllocatedSequence(file_meta->fd.largest_seqno);
+          versions_->SetLastPublishedSequence(file_meta->fd.largest_seqno);
+          versions_->SetLastSequence(file_meta->fd.largest_seqno);
+        }
+      }
+    }
+  }
+
+  // Release resources occupied by the dummy VersionStorageInfo
+  for (int level = 0; level < dummy_vstorage.num_levels(); level++) {
+    for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) {
+      file_meta->refs--;
+      if (file_meta->refs <= 0) {
+        delete file_meta;
+      }
+    }
+  }
+  return s;
+}
+
+void ImportColumnFamilyJob::Cleanup(const Status& status) {
+  if (!status.ok()) {
+    // We failed to add files to the database; remove all the files we copied.
+    for (const auto& f : files_to_import_) {
+      const auto s =
+          fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  } else if (status.ok() && import_options_.move_files) {
+    // The files were moved and added successfully, remove original file links
+    for (IngestedFileInfo& f : files_to_import_) {
+      const auto s =
+          fs_->DeleteFile(f.external_file_path, IOOptions(), nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "%s was added to DB successfully but failed to remove original "
+            "file link : %s",
+            f.external_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+}
+
+Status ImportColumnFamilyJob::GetIngestedFileInfo(
+    const std::string& external_file, uint64_t new_file_number,
+    SuperVersion* sv, const LiveFileMetaData& file_meta,
+    IngestedFileInfo* file_to_import) {
+  file_to_import->external_file_path = external_file;
+  Status status;
+  if (file_meta.size > 0) {
+    file_to_import->file_size = file_meta.size;
+  } else {
+    // Get external file size
+    status = fs_->GetFileSize(external_file, IOOptions(),
+                              &file_to_import->file_size, nullptr);
+    if (!status.ok()) {
+      return status;
+    }
+  }
+  // Assign FD with number
+  file_to_import->fd =
+      FileDescriptor(new_file_number, 0, file_to_import->file_size);
+
+  // Create TableReader for external file
+  std::unique_ptr<TableReader> table_reader;
+  std::unique_ptr<FSRandomAccessFile> sst_file;
+  std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+  status =
+      fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+  sst_file_reader.reset(new RandomAccessFileReader(
+      std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_));
+
+  status = cfd_->ioptions()->table_factory->NewTableReader(
+      TableReaderOptions(
+          *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
+          env_options_, cfd_->internal_comparator(),
+          sv->mutable_cf_options.block_protection_bytes_per_key,
+          /*skip_filters*/ false, /*immortal*/ false,
+          /*force_direct_prefetch*/ false, /*level*/ -1,
+          /*block_cache_tracer*/ nullptr,
+          /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(),
+          /*cur_file_num*/ new_file_number),
+      std::move(sst_file_reader), file_to_import->file_size, &table_reader);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Get the external file properties
+  auto props = table_reader->GetTableProperties();
+
+  // Set original_seqno to 0.
+  file_to_import->original_seqno = 0;
+
+  // Get number of entries in table
+  file_to_import->num_entries = props->num_entries;
+
+  // If the importing files were exported with Checkpoint::ExportColumnFamily(),
+  // we cannot simply recompute smallest and largest used to truncate range
+  // tombstones from file content, and we expect smallest and largest populated
+  // in file_meta.
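+  // Note that when the bounds do have to be recomputed, both point keys and
+  // range tombstones are consulted below, since a range tombstone may extend
+  // beyond the first or last point key in the file.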
+ if (file_meta.smallest.empty()) { + assert(file_meta.largest.empty()); + // TODO: plumb Env::IOActivity + ReadOptions ro; + std::unique_ptr iter(table_reader->NewIterator( + ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); + + // Get first (smallest) key from file + iter->SeekToFirst(); + bool bound_set = false; + if (iter->Valid()) { + file_to_import->smallest_internal_key.DecodeFrom(iter->key()); + iter->SeekToLast(); + file_to_import->largest_internal_key.DecodeFrom(iter->key()); + bound_set = true; + } + + std::unique_ptr range_del_iter{ + table_reader->NewRangeTombstoneIterator(ro)}; + if (range_del_iter != nullptr) { + range_del_iter->SeekToFirst(); + if (range_del_iter->Valid()) { + ParsedInternalKey key; + Status pik_status = ParseInternalKey(range_del_iter->key(), &key, + db_options_.allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + RangeTombstone first_tombstone(key, range_del_iter->value()); + InternalKey start_key = first_tombstone.SerializeKey(); + const InternalKeyComparator* icmp = &cfd_->internal_comparator(); + if (!bound_set || + icmp->Compare(start_key, file_to_import->smallest_internal_key) < + 0) { + file_to_import->smallest_internal_key = start_key; + } + + range_del_iter->SeekToLast(); + pik_status = ParseInternalKey(range_del_iter->key(), &key, + db_options_.allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + RangeTombstone last_tombstone(key, range_del_iter->value()); + InternalKey end_key = last_tombstone.SerializeEndKey(); + if (!bound_set || + icmp->Compare(end_key, file_to_import->largest_internal_key) > 0) { + file_to_import->largest_internal_key = end_key; + } + bound_set = true; + } + } + assert(bound_set); + } else { + assert(!file_meta.largest.empty()); + file_to_import->smallest_internal_key.DecodeFrom(file_meta.smallest); + file_to_import->largest_internal_key.DecodeFrom(file_meta.largest); + } + + file_to_import->cf_id = static_cast(props->column_family_id); + + file_to_import->table_properties = *props; + + auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id, + props->orig_file_number, + &(file_to_import->unique_id)); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get SST unique id for file %s", + file_to_import->internal_file_path.c_str()); + } + + return status; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/import_column_family_job.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/import_column_family_job.h new file mode 100644 index 0000000000..d0e2f9c296 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/import_column_family_job.h @@ -0,0 +1,82 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/snapshot_impl.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct EnvOptions;
+class SystemClock;
+
+// Imports a set of sst files as is into a new column family. Logic is similar
+// to ExternalSstFileIngestionJob.
+class ImportColumnFamilyJob {
+ public:
+  ImportColumnFamilyJob(VersionSet* versions, ColumnFamilyData* cfd,
+                        const ImmutableDBOptions& db_options,
+                        const EnvOptions& env_options,
+                        const ImportColumnFamilyOptions& import_options,
+                        const std::vector<LiveFileMetaData>& metadata,
+                        const std::shared_ptr<IOTracer>& io_tracer)
+      : clock_(db_options.clock),
+        versions_(versions),
+        cfd_(cfd),
+        db_options_(db_options),
+        fs_(db_options_.fs, io_tracer),
+        env_options_(env_options),
+        import_options_(import_options),
+        metadata_(metadata),
+        io_tracer_(io_tracer) {}
+
+  // Prepare the job by copying external files into the DB.
+  Status Prepare(uint64_t next_file_number, SuperVersion* sv);
+
+  // Will execute the import job and prepare edit() to be applied.
+  // REQUIRES: Mutex held
+  Status Run();
+
+  // Cleanup after successful/failed job
+  void Cleanup(const Status& status);
+
+  VersionEdit* edit() { return &edit_; }
+
+  const autovector<IngestedFileInfo>& files_to_import() const {
+    return files_to_import_;
+  }
+
+ private:
+  // Open the external file and populate `file_to_import` with all the
+  // external information we need to import this file.
+  Status GetIngestedFileInfo(const std::string& external_file,
+                             uint64_t new_file_number, SuperVersion* sv,
+                             const LiveFileMetaData& file_meta,
+                             IngestedFileInfo* file_to_import);
+
+  SystemClock* clock_;
+  VersionSet* versions_;
+  ColumnFamilyData* cfd_;
+  const ImmutableDBOptions& db_options_;
+  const FileSystemPtr fs_;
+  const EnvOptions& env_options_;
+  autovector<IngestedFileInfo> files_to_import_;
+  VersionEdit edit_;
+  const ImportColumnFamilyOptions& import_options_;
+  std::vector<LiveFileMetaData> metadata_;
+  const std::shared_ptr<IOTracer> io_tracer_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/internal_stats.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/internal_stats.cc
new file mode 100644
index 0000000000..ca28542659
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/internal_stats.cc
@@ -0,0 +1,2121 @@
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "db/internal_stats.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cache/cache_entry_roles.h" +#include "cache/cache_entry_stats.h" +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/write_stall_stats.h" +#include "port/port.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/table.h" +#include "table/block_based/cachable_entry.h" +#include "util/hash_containers.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + + +const std::map InternalStats::compaction_level_stats = + { + {LevelStatType::NUM_FILES, LevelStat{"NumFiles", "Files"}}, + {LevelStatType::COMPACTED_FILES, + LevelStat{"CompactedFiles", "CompactedFiles"}}, + {LevelStatType::SIZE_BYTES, LevelStat{"SizeBytes", "Size"}}, + {LevelStatType::SCORE, LevelStat{"Score", "Score"}}, + {LevelStatType::READ_GB, LevelStat{"ReadGB", "Read(GB)"}}, + {LevelStatType::RN_GB, LevelStat{"RnGB", "Rn(GB)"}}, + {LevelStatType::RNP1_GB, LevelStat{"Rnp1GB", "Rnp1(GB)"}}, + {LevelStatType::WRITE_GB, LevelStat{"WriteGB", "Write(GB)"}}, + {LevelStatType::W_NEW_GB, LevelStat{"WnewGB", "Wnew(GB)"}}, + {LevelStatType::MOVED_GB, LevelStat{"MovedGB", "Moved(GB)"}}, + {LevelStatType::WRITE_AMP, LevelStat{"WriteAmp", "W-Amp"}}, + {LevelStatType::READ_MBPS, LevelStat{"ReadMBps", "Rd(MB/s)"}}, + {LevelStatType::WRITE_MBPS, LevelStat{"WriteMBps", "Wr(MB/s)"}}, + {LevelStatType::COMP_SEC, LevelStat{"CompSec", "Comp(sec)"}}, + {LevelStatType::COMP_CPU_SEC, + LevelStat{"CompMergeCPU", "CompMergeCPU(sec)"}}, + {LevelStatType::COMP_COUNT, LevelStat{"CompCount", "Comp(cnt)"}}, + {LevelStatType::AVG_SEC, LevelStat{"AvgSec", "Avg(sec)"}}, + {LevelStatType::KEY_IN, LevelStat{"KeyIn", "KeyIn"}}, + {LevelStatType::KEY_DROP, LevelStat{"KeyDrop", "KeyDrop"}}, + {LevelStatType::R_BLOB_GB, LevelStat{"RblobGB", "Rblob(GB)"}}, + {LevelStatType::W_BLOB_GB, LevelStat{"WblobGB", "Wblob(GB)"}}, +}; + +const std::map + InternalStats::db_stats_type_to_info = { + {InternalStats::kIntStatsWalFileBytes, + DBStatInfo{"db.wal_bytes_written"}}, + {InternalStats::kIntStatsWalFileSynced, DBStatInfo{"db.wal_syncs"}}, + {InternalStats::kIntStatsBytesWritten, + DBStatInfo{"db.user_bytes_written"}}, + {InternalStats::kIntStatsNumKeysWritten, + DBStatInfo{"db.user_keys_written"}}, + {InternalStats::kIntStatsWriteDoneByOther, + DBStatInfo{"db.user_writes_by_other"}}, + {InternalStats::kIntStatsWriteDoneBySelf, + DBStatInfo{"db.user_writes_by_self"}}, + {InternalStats::kIntStatsWriteWithWal, + DBStatInfo{"db.user_writes_with_wal"}}, + {InternalStats::kIntStatsWriteStallMicros, + DBStatInfo{"db.user_write_stall_micros"}}, + {InternalStats::kIntStatsWriteBufferManagerLimitStopsCounts, + DBStatInfo{WriteStallStatsMapKeys::CauseConditionCount( + WriteStallCause::kWriteBufferManagerLimit, + WriteStallCondition::kStopped)}}, +}; + +namespace { +const double kMB = 1048576.0; +const double kGB = kMB * 1024; +const double kMicrosInSec = 1000000.0; + +void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name, + const std::string& group_by) { + int written_size = + snprintf(buf, len, "\n** Compaction Stats [%s] **\n", cf_name.c_str()); + written_size = std::min(written_size, static_cast(len)); + auto hdr = [](LevelStatType t) { + return InternalStats::compaction_level_stats.at(t).header_name.c_str(); + }; + int line_size = snprintf( + buf + written_size, len - written_size, + "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s " + "%s\n", + // Note that we skip COMPACTED_FILES 
and merge it with Files column + group_by.c_str(), hdr(LevelStatType::NUM_FILES), + hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE), + hdr(LevelStatType::READ_GB), hdr(LevelStatType::RN_GB), + hdr(LevelStatType::RNP1_GB), hdr(LevelStatType::WRITE_GB), + hdr(LevelStatType::W_NEW_GB), hdr(LevelStatType::MOVED_GB), + hdr(LevelStatType::WRITE_AMP), hdr(LevelStatType::READ_MBPS), + hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC), + hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT), + hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN), + hdr(LevelStatType::KEY_DROP), hdr(LevelStatType::R_BLOB_GB), + hdr(LevelStatType::W_BLOB_GB)); + + written_size += line_size; + written_size = std::min(written_size, static_cast(len)); + snprintf(buf + written_size, len - written_size, "%s\n", + std::string(line_size, '-').c_str()); +} + +void PrepareLevelStats(std::map* level_stats, + int num_files, int being_compacted, + double total_file_size, double score, double w_amp, + const InternalStats::CompactionStats& stats) { + const uint64_t bytes_read = stats.bytes_read_non_output_levels + + stats.bytes_read_output_level + + stats.bytes_read_blob; + const uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob; + const int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level; + const double elapsed = (stats.micros + 1) / kMicrosInSec; + + (*level_stats)[LevelStatType::NUM_FILES] = num_files; + (*level_stats)[LevelStatType::COMPACTED_FILES] = being_compacted; + (*level_stats)[LevelStatType::SIZE_BYTES] = total_file_size; + (*level_stats)[LevelStatType::SCORE] = score; + (*level_stats)[LevelStatType::READ_GB] = bytes_read / kGB; + (*level_stats)[LevelStatType::RN_GB] = + stats.bytes_read_non_output_levels / kGB; + (*level_stats)[LevelStatType::RNP1_GB] = stats.bytes_read_output_level / kGB; + (*level_stats)[LevelStatType::WRITE_GB] = stats.bytes_written / kGB; + (*level_stats)[LevelStatType::W_NEW_GB] = bytes_new / kGB; + (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB; + (*level_stats)[LevelStatType::WRITE_AMP] = w_amp; + (*level_stats)[LevelStatType::READ_MBPS] = bytes_read / kMB / elapsed; + (*level_stats)[LevelStatType::WRITE_MBPS] = bytes_written / kMB / elapsed; + (*level_stats)[LevelStatType::COMP_SEC] = stats.micros / kMicrosInSec; + (*level_stats)[LevelStatType::COMP_CPU_SEC] = stats.cpu_micros / kMicrosInSec; + (*level_stats)[LevelStatType::COMP_COUNT] = stats.count; + (*level_stats)[LevelStatType::AVG_SEC] = + stats.count == 0 ? 
0 : stats.micros / kMicrosInSec / stats.count; + (*level_stats)[LevelStatType::KEY_IN] = + static_cast(stats.num_input_records); + (*level_stats)[LevelStatType::KEY_DROP] = + static_cast(stats.num_dropped_records); + (*level_stats)[LevelStatType::R_BLOB_GB] = stats.bytes_read_blob / kGB; + (*level_stats)[LevelStatType::W_BLOB_GB] = stats.bytes_written_blob / kGB; +} + +void PrintLevelStats(char* buf, size_t len, const std::string& name, + const std::map& stat_value) { + snprintf( + buf, len, + "%4s " /* Level */ + "%6d/%-3d " /* Files */ + "%8s " /* Size */ + "%5.1f " /* Score */ + "%8.1f " /* Read(GB) */ + "%7.1f " /* Rn(GB) */ + "%8.1f " /* Rnp1(GB) */ + "%9.1f " /* Write(GB) */ + "%8.1f " /* Wnew(GB) */ + "%9.1f " /* Moved(GB) */ + "%5.1f " /* W-Amp */ + "%8.1f " /* Rd(MB/s) */ + "%8.1f " /* Wr(MB/s) */ + "%9.2f " /* Comp(sec) */ + "%17.2f " /* CompMergeCPU(sec) */ + "%9d " /* Comp(cnt) */ + "%8.3f " /* Avg(sec) */ + "%7s " /* KeyIn */ + "%6s " /* KeyDrop */ + "%9.1f " /* Rblob(GB) */ + "%9.1f\n", /* Wblob(GB) */ + name.c_str(), static_cast(stat_value.at(LevelStatType::NUM_FILES)), + static_cast(stat_value.at(LevelStatType::COMPACTED_FILES)), + BytesToHumanString( + static_cast(stat_value.at(LevelStatType::SIZE_BYTES))) + .c_str(), + stat_value.at(LevelStatType::SCORE), + stat_value.at(LevelStatType::READ_GB), + stat_value.at(LevelStatType::RN_GB), + stat_value.at(LevelStatType::RNP1_GB), + stat_value.at(LevelStatType::WRITE_GB), + stat_value.at(LevelStatType::W_NEW_GB), + stat_value.at(LevelStatType::MOVED_GB), + stat_value.at(LevelStatType::WRITE_AMP), + stat_value.at(LevelStatType::READ_MBPS), + stat_value.at(LevelStatType::WRITE_MBPS), + stat_value.at(LevelStatType::COMP_SEC), + stat_value.at(LevelStatType::COMP_CPU_SEC), + static_cast(stat_value.at(LevelStatType::COMP_COUNT)), + stat_value.at(LevelStatType::AVG_SEC), + NumberToHumanString( + static_cast(stat_value.at(LevelStatType::KEY_IN))) + .c_str(), + NumberToHumanString( + static_cast(stat_value.at(LevelStatType::KEY_DROP))) + .c_str(), + stat_value.at(LevelStatType::R_BLOB_GB), + stat_value.at(LevelStatType::W_BLOB_GB)); +} + +void PrintLevelStats(char* buf, size_t len, const std::string& name, + int num_files, int being_compacted, double total_file_size, + double score, double w_amp, + const InternalStats::CompactionStats& stats) { + std::map level_stats; + PrepareLevelStats(&level_stats, num_files, being_compacted, total_file_size, + score, w_amp, stats); + PrintLevelStats(buf, len, name, level_stats); +} + +// Assumes that trailing numbers represent an optional argument. This requires +// property names to not end with numbers. 
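+// For example, "rocksdb.num-files-at-level5" splits into the pair
+// {"rocksdb.num-files-at-level", "5"}.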
+std::pair GetPropertyNameAndArg(const Slice& property) { + Slice name = property, arg = property; + size_t sfx_len = 0; + while (sfx_len < property.size() && + isdigit(property[property.size() - sfx_len - 1])) { + ++sfx_len; + } + name.remove_suffix(sfx_len); + arg.remove_prefix(property.size() - sfx_len); + return {name, arg}; +} +} // anonymous namespace + +static const std::string rocksdb_prefix = "rocksdb."; + +static const std::string num_files_at_level_prefix = "num-files-at-level"; +static const std::string compression_ratio_at_level_prefix = + "compression-ratio-at-level"; +static const std::string allstats = "stats"; +static const std::string sstables = "sstables"; +static const std::string cfstats = "cfstats"; +static const std::string cfstats_no_file_histogram = + "cfstats-no-file-histogram"; +static const std::string cf_file_histogram = "cf-file-histogram"; +static const std::string cf_write_stall_stats = "cf-write-stall-stats"; +static const std::string dbstats = "dbstats"; +static const std::string db_write_stall_stats = "db-write-stall-stats"; +static const std::string levelstats = "levelstats"; +static const std::string block_cache_entry_stats = "block-cache-entry-stats"; +static const std::string fast_block_cache_entry_stats = + "fast-block-cache-entry-stats"; +static const std::string num_immutable_mem_table = "num-immutable-mem-table"; +static const std::string num_immutable_mem_table_flushed = + "num-immutable-mem-table-flushed"; +static const std::string mem_table_flush_pending = "mem-table-flush-pending"; +static const std::string compaction_pending = "compaction-pending"; +static const std::string background_errors = "background-errors"; +static const std::string cur_size_active_mem_table = + "cur-size-active-mem-table"; +static const std::string cur_size_all_mem_tables = "cur-size-all-mem-tables"; +static const std::string size_all_mem_tables = "size-all-mem-tables"; +static const std::string num_entries_active_mem_table = + "num-entries-active-mem-table"; +static const std::string num_entries_imm_mem_tables = + "num-entries-imm-mem-tables"; +static const std::string num_deletes_active_mem_table = + "num-deletes-active-mem-table"; +static const std::string num_deletes_imm_mem_tables = + "num-deletes-imm-mem-tables"; +static const std::string estimate_num_keys = "estimate-num-keys"; +static const std::string estimate_table_readers_mem = + "estimate-table-readers-mem"; +static const std::string is_file_deletions_enabled = + "is-file-deletions-enabled"; +static const std::string num_snapshots = "num-snapshots"; +static const std::string oldest_snapshot_time = "oldest-snapshot-time"; +static const std::string oldest_snapshot_sequence = "oldest-snapshot-sequence"; +static const std::string num_live_versions = "num-live-versions"; +static const std::string current_version_number = + "current-super-version-number"; +static const std::string estimate_live_data_size = "estimate-live-data-size"; +static const std::string min_log_number_to_keep_str = "min-log-number-to-keep"; +static const std::string min_obsolete_sst_number_to_keep_str = + "min-obsolete-sst-number-to-keep"; +static const std::string base_level_str = "base-level"; +static const std::string total_sst_files_size = "total-sst-files-size"; +static const std::string live_sst_files_size = "live-sst-files-size"; +static const std::string live_sst_files_size_at_temperature = + "live-sst-files-size-at-temperature"; +static const std::string estimate_pending_comp_bytes = + "estimate-pending-compaction-bytes"; +static 
const std::string aggregated_table_properties = + "aggregated-table-properties"; +static const std::string aggregated_table_properties_at_level = + aggregated_table_properties + "-at-level"; +static const std::string num_running_compactions = "num-running-compactions"; +static const std::string num_running_flushes = "num-running-flushes"; +static const std::string actual_delayed_write_rate = + "actual-delayed-write-rate"; +static const std::string is_write_stopped = "is-write-stopped"; +static const std::string estimate_oldest_key_time = "estimate-oldest-key-time"; +static const std::string block_cache_capacity = "block-cache-capacity"; +static const std::string block_cache_usage = "block-cache-usage"; +static const std::string block_cache_pinned_usage = "block-cache-pinned-usage"; +static const std::string options_statistics = "options-statistics"; +static const std::string num_blob_files = "num-blob-files"; +static const std::string blob_stats = "blob-stats"; +static const std::string total_blob_file_size = "total-blob-file-size"; +static const std::string live_blob_file_size = "live-blob-file-size"; +static const std::string live_blob_file_garbage_size = + "live-blob-file-garbage-size"; +static const std::string blob_cache_capacity = "blob-cache-capacity"; +static const std::string blob_cache_usage = "blob-cache-usage"; +static const std::string blob_cache_pinned_usage = "blob-cache-pinned-usage"; + +const std::string DB::Properties::kNumFilesAtLevelPrefix = + rocksdb_prefix + num_files_at_level_prefix; +const std::string DB::Properties::kCompressionRatioAtLevelPrefix = + rocksdb_prefix + compression_ratio_at_level_prefix; +const std::string DB::Properties::kStats = rocksdb_prefix + allstats; +const std::string DB::Properties::kSSTables = rocksdb_prefix + sstables; +const std::string DB::Properties::kCFStats = rocksdb_prefix + cfstats; +const std::string DB::Properties::kCFStatsNoFileHistogram = + rocksdb_prefix + cfstats_no_file_histogram; +const std::string DB::Properties::kCFFileHistogram = + rocksdb_prefix + cf_file_histogram; +const std::string DB::Properties::kCFWriteStallStats = + rocksdb_prefix + cf_write_stall_stats; +const std::string DB::Properties::kDBWriteStallStats = + rocksdb_prefix + db_write_stall_stats; +const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats; +const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats; +const std::string DB::Properties::kBlockCacheEntryStats = + rocksdb_prefix + block_cache_entry_stats; +const std::string DB::Properties::kFastBlockCacheEntryStats = + rocksdb_prefix + fast_block_cache_entry_stats; +const std::string DB::Properties::kNumImmutableMemTable = + rocksdb_prefix + num_immutable_mem_table; +const std::string DB::Properties::kNumImmutableMemTableFlushed = + rocksdb_prefix + num_immutable_mem_table_flushed; +const std::string DB::Properties::kMemTableFlushPending = + rocksdb_prefix + mem_table_flush_pending; +const std::string DB::Properties::kCompactionPending = + rocksdb_prefix + compaction_pending; +const std::string DB::Properties::kNumRunningCompactions = + rocksdb_prefix + num_running_compactions; +const std::string DB::Properties::kNumRunningFlushes = + rocksdb_prefix + num_running_flushes; +const std::string DB::Properties::kBackgroundErrors = + rocksdb_prefix + background_errors; +const std::string DB::Properties::kCurSizeActiveMemTable = + rocksdb_prefix + cur_size_active_mem_table; +const std::string DB::Properties::kCurSizeAllMemTables = + rocksdb_prefix + cur_size_all_mem_tables; 
+const std::string DB::Properties::kSizeAllMemTables = + rocksdb_prefix + size_all_mem_tables; +const std::string DB::Properties::kNumEntriesActiveMemTable = + rocksdb_prefix + num_entries_active_mem_table; +const std::string DB::Properties::kNumEntriesImmMemTables = + rocksdb_prefix + num_entries_imm_mem_tables; +const std::string DB::Properties::kNumDeletesActiveMemTable = + rocksdb_prefix + num_deletes_active_mem_table; +const std::string DB::Properties::kNumDeletesImmMemTables = + rocksdb_prefix + num_deletes_imm_mem_tables; +const std::string DB::Properties::kEstimateNumKeys = + rocksdb_prefix + estimate_num_keys; +const std::string DB::Properties::kEstimateTableReadersMem = + rocksdb_prefix + estimate_table_readers_mem; +const std::string DB::Properties::kIsFileDeletionsEnabled = + rocksdb_prefix + is_file_deletions_enabled; +const std::string DB::Properties::kNumSnapshots = + rocksdb_prefix + num_snapshots; +const std::string DB::Properties::kOldestSnapshotTime = + rocksdb_prefix + oldest_snapshot_time; +const std::string DB::Properties::kOldestSnapshotSequence = + rocksdb_prefix + oldest_snapshot_sequence; +const std::string DB::Properties::kNumLiveVersions = + rocksdb_prefix + num_live_versions; +const std::string DB::Properties::kCurrentSuperVersionNumber = + rocksdb_prefix + current_version_number; +const std::string DB::Properties::kEstimateLiveDataSize = + rocksdb_prefix + estimate_live_data_size; +const std::string DB::Properties::kMinLogNumberToKeep = + rocksdb_prefix + min_log_number_to_keep_str; +const std::string DB::Properties::kMinObsoleteSstNumberToKeep = + rocksdb_prefix + min_obsolete_sst_number_to_keep_str; +const std::string DB::Properties::kTotalSstFilesSize = + rocksdb_prefix + total_sst_files_size; +const std::string DB::Properties::kLiveSstFilesSize = + rocksdb_prefix + live_sst_files_size; +const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level_str; +const std::string DB::Properties::kEstimatePendingCompactionBytes = + rocksdb_prefix + estimate_pending_comp_bytes; +const std::string DB::Properties::kAggregatedTableProperties = + rocksdb_prefix + aggregated_table_properties; +const std::string DB::Properties::kAggregatedTablePropertiesAtLevel = + rocksdb_prefix + aggregated_table_properties_at_level; +const std::string DB::Properties::kActualDelayedWriteRate = + rocksdb_prefix + actual_delayed_write_rate; +const std::string DB::Properties::kIsWriteStopped = + rocksdb_prefix + is_write_stopped; +const std::string DB::Properties::kEstimateOldestKeyTime = + rocksdb_prefix + estimate_oldest_key_time; +const std::string DB::Properties::kBlockCacheCapacity = + rocksdb_prefix + block_cache_capacity; +const std::string DB::Properties::kBlockCacheUsage = + rocksdb_prefix + block_cache_usage; +const std::string DB::Properties::kBlockCachePinnedUsage = + rocksdb_prefix + block_cache_pinned_usage; +const std::string DB::Properties::kOptionsStatistics = + rocksdb_prefix + options_statistics; +const std::string DB::Properties::kLiveSstFilesSizeAtTemperature = + rocksdb_prefix + live_sst_files_size_at_temperature; +const std::string DB::Properties::kNumBlobFiles = + rocksdb_prefix + num_blob_files; +const std::string DB::Properties::kBlobStats = rocksdb_prefix + blob_stats; +const std::string DB::Properties::kTotalBlobFileSize = + rocksdb_prefix + total_blob_file_size; +const std::string DB::Properties::kLiveBlobFileSize = + rocksdb_prefix + live_blob_file_size; +const std::string DB::Properties::kLiveBlobFileGarbageSize = + rocksdb_prefix + 
live_blob_file_garbage_size; +const std::string DB::Properties::kBlobCacheCapacity = + rocksdb_prefix + blob_cache_capacity; +const std::string DB::Properties::kBlobCacheUsage = + rocksdb_prefix + blob_cache_usage; +const std::string DB::Properties::kBlobCachePinnedUsage = + rocksdb_prefix + blob_cache_pinned_usage; + +const std::string InternalStats::kPeriodicCFStats = + DB::Properties::kCFStats + ".periodic"; +const int InternalStats::kMaxNoChangePeriodSinceDump = 8; + +const UnorderedMap + InternalStats::ppt_name_to_info = { + {DB::Properties::kNumFilesAtLevelPrefix, + {false, &InternalStats::HandleNumFilesAtLevel, nullptr, nullptr, + nullptr}}, + {DB::Properties::kCompressionRatioAtLevelPrefix, + {false, &InternalStats::HandleCompressionRatioAtLevelPrefix, nullptr, + nullptr, nullptr}}, + {DB::Properties::kLevelStats, + {false, &InternalStats::HandleLevelStats, nullptr, nullptr, nullptr}}, + {DB::Properties::kStats, + {false, &InternalStats::HandleStats, nullptr, nullptr, nullptr}}, + {DB::Properties::kCFStats, + {false, &InternalStats::HandleCFStats, nullptr, + &InternalStats::HandleCFMapStats, nullptr}}, + {InternalStats::kPeriodicCFStats, + {false, &InternalStats::HandleCFStatsPeriodic, nullptr, nullptr, + nullptr}}, + {DB::Properties::kCFStatsNoFileHistogram, + {false, &InternalStats::HandleCFStatsNoFileHistogram, nullptr, nullptr, + nullptr}}, + {DB::Properties::kCFFileHistogram, + {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr, + nullptr}}, + {DB::Properties::kCFWriteStallStats, + {false, &InternalStats::HandleCFWriteStallStats, nullptr, + &InternalStats::HandleCFWriteStallStatsMap, nullptr}}, + {DB::Properties::kDBStats, + {false, &InternalStats::HandleDBStats, nullptr, + &InternalStats::HandleDBMapStats, nullptr}}, + {DB::Properties::kDBWriteStallStats, + {false, &InternalStats::HandleDBWriteStallStats, nullptr, + &InternalStats::HandleDBWriteStallStatsMap, nullptr}}, + {DB::Properties::kBlockCacheEntryStats, + {true, &InternalStats::HandleBlockCacheEntryStats, nullptr, + &InternalStats::HandleBlockCacheEntryStatsMap, nullptr}}, + {DB::Properties::kFastBlockCacheEntryStats, + {true, &InternalStats::HandleFastBlockCacheEntryStats, nullptr, + &InternalStats::HandleFastBlockCacheEntryStatsMap, nullptr}}, + {DB::Properties::kSSTables, + {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}}, + {DB::Properties::kAggregatedTableProperties, + {false, &InternalStats::HandleAggregatedTableProperties, nullptr, + &InternalStats::HandleAggregatedTablePropertiesMap, nullptr}}, + {DB::Properties::kAggregatedTablePropertiesAtLevel, + {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel, + nullptr, &InternalStats::HandleAggregatedTablePropertiesAtLevelMap, + nullptr}}, + {DB::Properties::kNumImmutableMemTable, + {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr, + nullptr}}, + {DB::Properties::kNumImmutableMemTableFlushed, + {false, nullptr, &InternalStats::HandleNumImmutableMemTableFlushed, + nullptr, nullptr}}, + {DB::Properties::kMemTableFlushPending, + {false, nullptr, &InternalStats::HandleMemTableFlushPending, nullptr, + nullptr}}, + {DB::Properties::kCompactionPending, + {false, nullptr, &InternalStats::HandleCompactionPending, nullptr, + nullptr}}, + {DB::Properties::kBackgroundErrors, + {false, nullptr, &InternalStats::HandleBackgroundErrors, nullptr, + nullptr}}, + {DB::Properties::kCurSizeActiveMemTable, + {false, nullptr, &InternalStats::HandleCurSizeActiveMemTable, nullptr, + nullptr}}, + 
{DB::Properties::kCurSizeAllMemTables, + {false, nullptr, &InternalStats::HandleCurSizeAllMemTables, nullptr, + nullptr}}, + {DB::Properties::kSizeAllMemTables, + {false, nullptr, &InternalStats::HandleSizeAllMemTables, nullptr, + nullptr}}, + {DB::Properties::kNumEntriesActiveMemTable, + {false, nullptr, &InternalStats::HandleNumEntriesActiveMemTable, + nullptr, nullptr}}, + {DB::Properties::kNumEntriesImmMemTables, + {false, nullptr, &InternalStats::HandleNumEntriesImmMemTables, nullptr, + nullptr}}, + {DB::Properties::kNumDeletesActiveMemTable, + {false, nullptr, &InternalStats::HandleNumDeletesActiveMemTable, + nullptr, nullptr}}, + {DB::Properties::kNumDeletesImmMemTables, + {false, nullptr, &InternalStats::HandleNumDeletesImmMemTables, nullptr, + nullptr}}, + {DB::Properties::kEstimateNumKeys, + {false, nullptr, &InternalStats::HandleEstimateNumKeys, nullptr, + nullptr}}, + {DB::Properties::kEstimateTableReadersMem, + {true, nullptr, &InternalStats::HandleEstimateTableReadersMem, nullptr, + nullptr}}, + {DB::Properties::kIsFileDeletionsEnabled, + {false, nullptr, &InternalStats::HandleIsFileDeletionsEnabled, nullptr, + nullptr}}, + {DB::Properties::kNumSnapshots, + {false, nullptr, &InternalStats::HandleNumSnapshots, nullptr, + nullptr}}, + {DB::Properties::kOldestSnapshotTime, + {false, nullptr, &InternalStats::HandleOldestSnapshotTime, nullptr, + nullptr}}, + {DB::Properties::kOldestSnapshotSequence, + {false, nullptr, &InternalStats::HandleOldestSnapshotSequence, nullptr, + nullptr}}, + {DB::Properties::kNumLiveVersions, + {false, nullptr, &InternalStats::HandleNumLiveVersions, nullptr, + nullptr}}, + {DB::Properties::kCurrentSuperVersionNumber, + {false, nullptr, &InternalStats::HandleCurrentSuperVersionNumber, + nullptr, nullptr}}, + {DB::Properties::kEstimateLiveDataSize, + {true, nullptr, &InternalStats::HandleEstimateLiveDataSize, nullptr, + nullptr}}, + {DB::Properties::kMinLogNumberToKeep, + {false, nullptr, &InternalStats::HandleMinLogNumberToKeep, nullptr, + nullptr}}, + {DB::Properties::kMinObsoleteSstNumberToKeep, + {false, nullptr, &InternalStats::HandleMinObsoleteSstNumberToKeep, + nullptr, nullptr}}, + {DB::Properties::kBaseLevel, + {false, nullptr, &InternalStats::HandleBaseLevel, nullptr, nullptr}}, + {DB::Properties::kTotalSstFilesSize, + {false, nullptr, &InternalStats::HandleTotalSstFilesSize, nullptr, + nullptr}}, + {DB::Properties::kLiveSstFilesSize, + {false, nullptr, &InternalStats::HandleLiveSstFilesSize, nullptr, + nullptr}}, + {DB::Properties::kLiveSstFilesSizeAtTemperature, + {false, &InternalStats::HandleLiveSstFilesSizeAtTemperature, nullptr, + nullptr, nullptr}}, + {DB::Properties::kEstimatePendingCompactionBytes, + {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes, + nullptr, nullptr}}, + {DB::Properties::kNumRunningFlushes, + {false, nullptr, &InternalStats::HandleNumRunningFlushes, nullptr, + nullptr}}, + {DB::Properties::kNumRunningCompactions, + {false, nullptr, &InternalStats::HandleNumRunningCompactions, nullptr, + nullptr}}, + {DB::Properties::kActualDelayedWriteRate, + {false, nullptr, &InternalStats::HandleActualDelayedWriteRate, nullptr, + nullptr}}, + {DB::Properties::kIsWriteStopped, + {false, nullptr, &InternalStats::HandleIsWriteStopped, nullptr, + nullptr}}, + {DB::Properties::kEstimateOldestKeyTime, + {false, nullptr, &InternalStats::HandleEstimateOldestKeyTime, nullptr, + nullptr}}, + {DB::Properties::kBlockCacheCapacity, + {false, nullptr, &InternalStats::HandleBlockCacheCapacity, nullptr, + nullptr}}, + 
{DB::Properties::kBlockCacheUsage, + {false, nullptr, &InternalStats::HandleBlockCacheUsage, nullptr, + nullptr}}, + {DB::Properties::kBlockCachePinnedUsage, + {false, nullptr, &InternalStats::HandleBlockCachePinnedUsage, nullptr, + nullptr}}, + {DB::Properties::kOptionsStatistics, + {true, nullptr, nullptr, nullptr, + &DBImpl::GetPropertyHandleOptionsStatistics}}, + {DB::Properties::kNumBlobFiles, + {false, nullptr, &InternalStats::HandleNumBlobFiles, nullptr, + nullptr}}, + {DB::Properties::kBlobStats, + {false, &InternalStats::HandleBlobStats, nullptr, nullptr, nullptr}}, + {DB::Properties::kTotalBlobFileSize, + {false, nullptr, &InternalStats::HandleTotalBlobFileSize, nullptr, + nullptr}}, + {DB::Properties::kLiveBlobFileSize, + {false, nullptr, &InternalStats::HandleLiveBlobFileSize, nullptr, + nullptr}}, + {DB::Properties::kLiveBlobFileGarbageSize, + {false, nullptr, &InternalStats::HandleLiveBlobFileGarbageSize, + nullptr, nullptr}}, + {DB::Properties::kBlobCacheCapacity, + {false, nullptr, &InternalStats::HandleBlobCacheCapacity, nullptr, + nullptr}}, + {DB::Properties::kBlobCacheUsage, + {false, nullptr, &InternalStats::HandleBlobCacheUsage, nullptr, + nullptr}}, + {DB::Properties::kBlobCachePinnedUsage, + {false, nullptr, &InternalStats::HandleBlobCachePinnedUsage, nullptr, + nullptr}}, +}; + +InternalStats::InternalStats(int num_levels, SystemClock* clock, + ColumnFamilyData* cfd) + : db_stats_{}, + cf_stats_value_{}, + cf_stats_count_{}, + comp_stats_(num_levels), + comp_stats_by_pri_(Env::Priority::TOTAL), + file_read_latency_(num_levels), + has_cf_change_since_dump_(true), + bg_error_count_(0), + number_levels_(num_levels), + clock_(clock), + cfd_(cfd), + started_at_(clock->NowMicros()) { + Cache* block_cache = GetBlockCacheForStats(); + if (block_cache) { + // Extract or create stats collector. Could fail in rare cases. + Status s = CacheEntryStatsCollector::GetShared( + block_cache, clock_, &cache_entry_stats_collector_); + if (s.ok()) { + assert(cache_entry_stats_collector_); + } else { + assert(!cache_entry_stats_collector_); + } + } +} + +void InternalStats::TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats, + bool foreground) { + CollectCacheEntryStats(foreground); + if (cache_entry_stats_collector_) { + cache_entry_stats_collector_->GetStats(stats); + } +} + +void InternalStats::CollectCacheEntryStats(bool foreground) { + // This function is safe to call from any thread because + // cache_entry_stats_collector_ field is const after constructor + // and ->GetStats does its own synchronization, which also suffices for + // cache_entry_stats_. + + if (!cache_entry_stats_collector_) { + return; // nothing to do (e.g. no block cache) + } + + // For "background" collections, strictly cap the collection time by + // expanding effective cache TTL. For foreground, be more aggressive about + // getting latest data. + int min_interval_seconds = foreground ? 10 : 180; + // 1/500 = max of 0.2% of one CPU thread + int min_interval_factor = foreground ? 10 : 500; + cache_entry_stats_collector_->CollectStats(min_interval_seconds, + min_interval_factor); +} + +std::function +InternalStats::CacheEntryRoleStats::GetEntryCallback() { + return [&](const Slice& /*key*/, Cache::ObjectPtr /*value*/, size_t charge, + const Cache::CacheItemHelper* helper) -> void { + size_t role_idx = + static_cast(helper ? 
helper->role : CacheEntryRole::kMisc); + entry_counts[role_idx]++; + total_charges[role_idx] += charge; + }; +} + +void InternalStats::CacheEntryRoleStats::BeginCollection( + Cache* cache, SystemClock*, uint64_t start_time_micros) { + Clear(); + last_start_time_micros_ = start_time_micros; + ++collection_count; + std::ostringstream str; + str << cache->Name() << "@" << static_cast(cache) << "#" + << port::GetProcessID(); + cache_id = str.str(); + cache_capacity = cache->GetCapacity(); + cache_usage = cache->GetUsage(); + table_size = cache->GetTableAddressCount(); + occupancy = cache->GetOccupancyCount(); + hash_seed = cache->GetHashSeed(); +} + +void InternalStats::CacheEntryRoleStats::EndCollection( + Cache*, SystemClock*, uint64_t end_time_micros) { + last_end_time_micros_ = end_time_micros; +} + +void InternalStats::CacheEntryRoleStats::SkippedCollection() { + ++copies_of_last_collection; +} + +uint64_t InternalStats::CacheEntryRoleStats::GetLastDurationMicros() const { + if (last_end_time_micros_ > last_start_time_micros_) { + return last_end_time_micros_ - last_start_time_micros_; + } else { + return 0U; + } +} + +std::string InternalStats::CacheEntryRoleStats::ToString( + SystemClock* clock) const { + std::ostringstream str; + str << "Block cache " << cache_id + << " capacity: " << BytesToHumanString(cache_capacity) + << " seed: " << hash_seed << " usage: " << BytesToHumanString(cache_usage) + << " table_size: " << table_size << " occupancy: " << occupancy + << " collections: " << collection_count + << " last_copies: " << copies_of_last_collection + << " last_secs: " << (GetLastDurationMicros() / 1000000.0) + << " secs_since: " + << ((clock->NowMicros() - last_end_time_micros_) / 1000000U) << "\n"; + str << "Block cache entry stats(count,size,portion):"; + for (size_t i = 0; i < kNumCacheEntryRoles; ++i) { + if (entry_counts[i] > 0) { + str << " " << kCacheEntryRoleToCamelString[i] << "(" << entry_counts[i] + << "," << BytesToHumanString(total_charges[i]) << "," + << (100.0 * total_charges[i] / cache_capacity) << "%)"; + } + } + str << "\n"; + return str.str(); +} + +void InternalStats::CacheEntryRoleStats::ToMap( + std::map* values, SystemClock* clock) const { + values->clear(); + auto& v = *values; + v[BlockCacheEntryStatsMapKeys::CacheId()] = cache_id; + v[BlockCacheEntryStatsMapKeys::CacheCapacityBytes()] = + std::to_string(cache_capacity); + v[BlockCacheEntryStatsMapKeys::LastCollectionDurationSeconds()] = + std::to_string(GetLastDurationMicros() / 1000000.0); + v[BlockCacheEntryStatsMapKeys::LastCollectionAgeSeconds()] = + std::to_string((clock->NowMicros() - last_end_time_micros_) / 1000000U); + for (size_t i = 0; i < kNumCacheEntryRoles; ++i) { + auto role = static_cast(i); + v[BlockCacheEntryStatsMapKeys::EntryCount(role)] = + std::to_string(entry_counts[i]); + v[BlockCacheEntryStatsMapKeys::UsedBytes(role)] = + std::to_string(total_charges[i]); + v[BlockCacheEntryStatsMapKeys::UsedPercent(role)] = + std::to_string(100.0 * total_charges[i] / cache_capacity); + } +} + +bool InternalStats::HandleBlockCacheEntryStatsInternal(std::string* value, + bool fast) { + if (!cache_entry_stats_collector_) { + return false; + } + CollectCacheEntryStats(!fast /* foreground */); + CacheEntryRoleStats stats; + cache_entry_stats_collector_->GetStats(&stats); + *value = stats.ToString(clock_); + return true; +} + +bool InternalStats::HandleBlockCacheEntryStatsMapInternal( + std::map* values, bool fast) { + if (!cache_entry_stats_collector_) { + return false; + } + CollectCacheEntryStats(!fast 
/* foreground */); + CacheEntryRoleStats stats; + cache_entry_stats_collector_->GetStats(&stats); + stats.ToMap(values, clock_); + return true; +} + +bool InternalStats::HandleBlockCacheEntryStats(std::string* value, + Slice /*suffix*/) { + return HandleBlockCacheEntryStatsInternal(value, false /* fast */); +} + +bool InternalStats::HandleBlockCacheEntryStatsMap( + std::map* values, Slice /*suffix*/) { + return HandleBlockCacheEntryStatsMapInternal(values, false /* fast */); +} + +bool InternalStats::HandleFastBlockCacheEntryStats(std::string* value, + Slice /*suffix*/) { + return HandleBlockCacheEntryStatsInternal(value, true /* fast */); +} + +bool InternalStats::HandleFastBlockCacheEntryStatsMap( + std::map* values, Slice /*suffix*/) { + return HandleBlockCacheEntryStatsMapInternal(values, true /* fast */); +} + +bool InternalStats::HandleLiveSstFilesSizeAtTemperature(std::string* value, + Slice suffix) { + uint64_t temperature; + bool ok = ConsumeDecimalNumber(&suffix, &temperature) && suffix.empty(); + if (!ok) { + return false; + } + + uint64_t size = 0; + const auto* vstorage = cfd_->current()->storage_info(); + for (int level = 0; level < vstorage->num_levels(); level++) { + for (const auto& file_meta : vstorage->LevelFiles(level)) { + if (static_cast(file_meta->temperature) == temperature) { + size += file_meta->fd.GetFileSize(); + } + } + } + + *value = std::to_string(size); + return true; +} + +bool InternalStats::HandleNumBlobFiles(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + assert(value); + assert(cfd_); + + const auto* current = cfd_->current(); + assert(current); + + const auto* vstorage = current->storage_info(); + assert(vstorage); + + const auto& blob_files = vstorage->GetBlobFiles(); + + *value = blob_files.size(); + + return true; +} + +bool InternalStats::HandleBlobStats(std::string* value, Slice /*suffix*/) { + assert(value); + assert(cfd_); + + const auto* current = cfd_->current(); + assert(current); + + const auto* vstorage = current->storage_info(); + assert(vstorage); + + const auto blob_st = vstorage->GetBlobStats(); + + std::ostringstream oss; + + oss << "Number of blob files: " << vstorage->GetBlobFiles().size() + << "\nTotal size of blob files: " << blob_st.total_file_size + << "\nTotal size of garbage in blob files: " << blob_st.total_garbage_size + << "\nBlob file space amplification: " << blob_st.space_amp << '\n'; + + value->append(oss.str()); + + return true; +} + +bool InternalStats::HandleTotalBlobFileSize(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + assert(value); + assert(cfd_); + + *value = cfd_->GetTotalBlobFileSize(); + + return true; +} + +bool InternalStats::HandleLiveBlobFileSize(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + assert(value); + assert(cfd_); + + const auto* current = cfd_->current(); + assert(current); + + const auto* vstorage = current->storage_info(); + assert(vstorage); + + *value = vstorage->GetBlobStats().total_file_size; + + return true; +} + +bool InternalStats::HandleLiveBlobFileGarbageSize(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { + assert(value); + assert(cfd_); + + const auto* current = cfd_->current(); + assert(current); + + const auto* vstorage = current->storage_info(); + assert(vstorage); + + *value = vstorage->GetBlobStats().total_garbage_size; + + return true; +} + +Cache* InternalStats::GetBlobCacheForStats() { + return cfd_->ioptions()->blob_cache.get(); +} + +bool InternalStats::HandleBlobCacheCapacity(uint64_t* value, DBImpl* /*db*/, + Version* 
/*version*/) { + Cache* blob_cache = GetBlobCacheForStats(); + if (blob_cache) { + *value = static_cast(blob_cache->GetCapacity()); + return true; + } + return false; +} + +bool InternalStats::HandleBlobCacheUsage(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + Cache* blob_cache = GetBlobCacheForStats(); + if (blob_cache) { + *value = static_cast(blob_cache->GetUsage()); + return true; + } + return false; +} + +bool InternalStats::HandleBlobCachePinnedUsage(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + Cache* blob_cache = GetBlobCacheForStats(); + if (blob_cache) { + *value = static_cast(blob_cache->GetPinnedUsage()); + return true; + } + return false; +} + +const DBPropertyInfo* GetPropertyInfo(const Slice& property) { + std::string ppt_name = GetPropertyNameAndArg(property).first.ToString(); + auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name); + if (ppt_info_iter == InternalStats::ppt_name_to_info.end()) { + return nullptr; + } + return &ppt_info_iter->second; +} + +bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info, + const Slice& property, + std::string* value) { + assert(value != nullptr); + assert(property_info.handle_string != nullptr); + Slice arg = GetPropertyNameAndArg(property).second; + return (this->*(property_info.handle_string))(value, arg); +} + +bool InternalStats::GetMapProperty(const DBPropertyInfo& property_info, + const Slice& property, + std::map* value) { + assert(value != nullptr); + assert(property_info.handle_map != nullptr); + Slice arg = GetPropertyNameAndArg(property).second; + return (this->*(property_info.handle_map))(value, arg); +} + +bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info, + uint64_t* value, DBImpl* db) { + assert(value != nullptr); + assert(property_info.handle_int != nullptr && + !property_info.need_out_of_mutex); + db->mutex_.AssertHeld(); + return (this->*(property_info.handle_int))(value, db, nullptr /* version */); +} + +bool InternalStats::GetIntPropertyOutOfMutex( + const DBPropertyInfo& property_info, Version* version, uint64_t* value) { + assert(value != nullptr); + assert(property_info.handle_int != nullptr && + property_info.need_out_of_mutex); + return (this->*(property_info.handle_int))(value, nullptr /* db */, version); +} + +bool InternalStats::HandleNumFilesAtLevel(std::string* value, Slice suffix) { + uint64_t level; + const auto* vstorage = cfd_->current()->storage_info(); + bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); + if (!ok || static_cast(level) >= number_levels_) { + return false; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "%d", + vstorage->NumLevelFiles(static_cast(level))); + *value = buf; + return true; + } +} + +bool InternalStats::HandleCompressionRatioAtLevelPrefix(std::string* value, + Slice suffix) { + uint64_t level; + const auto* vstorage = cfd_->current()->storage_info(); + bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); + if (!ok || level >= static_cast(number_levels_)) { + return false; + } + *value = std::to_string( + vstorage->GetEstimatedCompressionRatioAtLevel(static_cast(level))); + return true; +} + +bool InternalStats::HandleLevelStats(std::string* value, Slice /*suffix*/) { + char buf[1000]; + const auto* vstorage = cfd_->current()->storage_info(); + snprintf(buf, sizeof(buf), + "Level Files Size(MB)\n" + "--------------------\n"); + value->append(buf); + + for (int level = 0; level < number_levels_; level++) { + snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", 
level, + vstorage->NumLevelFiles(level), + vstorage->NumLevelBytes(level) / kMB); + value->append(buf); + } + return true; +} + +bool InternalStats::HandleStats(std::string* value, Slice suffix) { + if (!HandleCFStats(value, suffix)) { + return false; + } + if (!HandleDBStats(value, suffix)) { + return false; + } + return true; +} + +bool InternalStats::HandleCFMapStats( + std::map* cf_stats, Slice /*suffix*/) { + DumpCFMapStats(cf_stats); + return true; +} + +bool InternalStats::HandleCFStats(std::string* value, Slice /*suffix*/) { + DumpCFStats(value); + return true; +} + +bool InternalStats::HandleCFStatsPeriodic(std::string* value, + Slice /*suffix*/) { + bool has_change = has_cf_change_since_dump_; + if (!has_change) { + // If file histogram changes, there is activity in this period too. + uint64_t new_histogram_num = 0; + for (int level = 0; level < number_levels_; level++) { + new_histogram_num += file_read_latency_[level].num(); + } + new_histogram_num += blob_file_read_latency_.num(); + if (new_histogram_num != last_histogram_num) { + has_change = true; + last_histogram_num = new_histogram_num; + } + } + if (has_change) { + no_cf_change_period_since_dump_ = 0; + has_cf_change_since_dump_ = false; + } else if (no_cf_change_period_since_dump_++ > 0) { + // Not ready to sync + if (no_cf_change_period_since_dump_ == kMaxNoChangePeriodSinceDump) { + // Next periodic, we need to dump stats even if there is no change. + no_cf_change_period_since_dump_ = 0; + } + return true; + } + + DumpCFStatsNoFileHistogram(/*is_periodic=*/true, value); + DumpCFFileHistogram(value); + return true; +} + +bool InternalStats::HandleCFStatsNoFileHistogram(std::string* value, + Slice /*suffix*/) { + DumpCFStatsNoFileHistogram(/*is_periodic=*/false, value); + return true; +} + +bool InternalStats::HandleCFFileHistogram(std::string* value, + Slice /*suffix*/) { + DumpCFFileHistogram(value); + return true; +} + +bool InternalStats::HandleCFWriteStallStats(std::string* value, + Slice /*suffix*/) { + DumpCFStatsWriteStall(value); + return true; +} + +bool InternalStats::HandleCFWriteStallStatsMap( + std::map* value, Slice /*suffix*/) { + DumpCFMapStatsWriteStall(value); + return true; +} + +bool InternalStats::HandleDBMapStats( + std::map* db_stats, Slice /*suffix*/) { + DumpDBMapStats(db_stats); + return true; +} + +bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) { + DumpDBStats(value); + return true; +} + +bool InternalStats::HandleDBWriteStallStats(std::string* value, + Slice /*suffix*/) { + DumpDBStatsWriteStall(value); + return true; +} + +bool InternalStats::HandleDBWriteStallStatsMap( + std::map* value, Slice /*suffix*/) { + DumpDBMapStatsWriteStall(value); + return true; +} + +bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) { + auto* current = cfd_->current(); + *value = current->DebugString(true, true); + return true; +} + +bool InternalStats::HandleAggregatedTableProperties(std::string* value, + Slice /*suffix*/) { + std::shared_ptr tp; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = cfd_->current()->GetAggregatedTableProperties(read_options, &tp); + if (!s.ok()) { + return false; + } + *value = tp->ToString(); + return true; +} + +static std::map MapUint64ValuesToString( + const std::map& from) { + std::map to; + for (const auto& e : from) { + to[e.first] = std::to_string(e.second); + } + return to; +} + +bool InternalStats::HandleAggregatedTablePropertiesMap( + std::map* values, Slice /*suffix*/) { + std::shared_ptr tp; + 
// TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = cfd_->current()->GetAggregatedTableProperties(read_options, &tp); + if (!s.ok()) { + return false; + } + *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap()); + return true; +} + +bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values, + Slice suffix) { + uint64_t level; + bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); + if (!ok || static_cast(level) >= number_levels_) { + return false; + } + std::shared_ptr tp; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = cfd_->current()->GetAggregatedTableProperties( + read_options, &tp, static_cast(level)); + if (!s.ok()) { + return false; + } + *values = tp->ToString(); + return true; +} + +bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap( + std::map* values, Slice suffix) { + uint64_t level; + bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); + if (!ok || static_cast(level) >= number_levels_) { + return false; + } + std::shared_ptr tp; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = cfd_->current()->GetAggregatedTableProperties( + read_options, &tp, static_cast(level)); + if (!s.ok()) { + return false; + } + *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap()); + return true; +} + +bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + *value = cfd_->imm()->NumNotFlushed(); + return true; +} + +bool InternalStats::HandleNumImmutableMemTableFlushed(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { + *value = cfd_->imm()->NumFlushed(); + return true; +} + +bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + *value = (cfd_->imm()->IsFlushPending() ? 1 : 0); + return true; +} + +bool InternalStats::HandleNumRunningFlushes(uint64_t* value, DBImpl* db, + Version* /*version*/) { + *value = db->num_running_flushes(); + return true; +} + +bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + // 1 if the system already determines at least one compaction is needed. + // 0 otherwise, + const auto* vstorage = cfd_->current()->storage_info(); + *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0); + return true; +} + +bool InternalStats::HandleNumRunningCompactions(uint64_t* value, DBImpl* db, + Version* /*version*/) { + *value = db->num_running_compactions_; + return true; +} + +bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + // Accumulated number of errors in background flushes or compactions. 
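+  // (Backed by the bg_error_count_ member, which the constructor above
+  // zero-initializes.)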
+ *value = GetBackgroundErrorCount(); + return true; +} + +bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + // Current size of the active memtable + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = cfd_->mem()->ApproximateMemoryUsageFast(); + return true; +} + +bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + // Current size of the active memtable + immutable memtables + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = cfd_->mem()->ApproximateMemoryUsageFast() + + cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage(); + return true; +} + +bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = cfd_->mem()->ApproximateMemoryUsageFast() + + cfd_->imm()->ApproximateMemoryUsage(); + return true; +} + +bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { + // Current number of entires in the active memtable + *value = cfd_->mem()->num_entries(); + return true; +} + +bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { + // Current number of entries in the immutable memtables + *value = cfd_->imm()->current()->GetTotalNumEntries(); + return true; +} + +bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { + // Current number of entires in the active memtable + *value = cfd_->mem()->num_deletes(); + return true; +} + +bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { + // Current number of entries in the immutable memtables + *value = cfd_->imm()->current()->GetTotalNumDeletes(); + return true; +} + +bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + // Estimate number of entries in the column family: + // Use estimated entries in tables + total entries in memtables. + const auto* vstorage = cfd_->current()->storage_info(); + uint64_t estimate_keys = cfd_->mem()->num_entries() + + cfd_->imm()->current()->GetTotalNumEntries() + + vstorage->GetEstimatedActiveKeys(); + uint64_t estimate_deletes = + cfd_->mem()->num_deletes() + cfd_->imm()->current()->GetTotalNumDeletes(); + *value = estimate_keys > estimate_deletes * 2 + ? 
estimate_keys - (estimate_deletes * 2) + : 0; + return true; +} + +bool InternalStats::HandleNumSnapshots(uint64_t* value, DBImpl* db, + Version* /*version*/) { + *value = db->snapshots().count(); + return true; +} + +bool InternalStats::HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, + Version* /*version*/) { + *value = static_cast(db->snapshots().GetOldestSnapshotTime()); + return true; +} + +bool InternalStats::HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db, + Version* /*version*/) { + *value = static_cast(db->snapshots().GetOldestSnapshotSequence()); + return true; +} + +bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + *value = cfd_->GetNumLiveVersions(); + return true; +} + +bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { + *value = cfd_->GetSuperVersionNumber(); + return true; +} + +bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db, + Version* /*version*/) { + *value = db->IsFileDeletionsEnabled() ? 1 : 0; + return true; +} + +bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + const auto* vstorage = cfd_->current()->storage_info(); + *value = vstorage->base_level(); + return true; +} + +bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + *value = cfd_->GetTotalSstFilesSize(); + return true; +} + +bool InternalStats::HandleLiveSstFilesSize(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + *value = cfd_->GetLiveSstFilesSize(); + return true; +} + +bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { + const auto* vstorage = cfd_->current()->storage_info(); + *value = vstorage->estimated_compaction_needed_bytes(); + return true; +} + +bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value, + DBImpl* /*db*/, + Version* version) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + *value = (version == nullptr) + ? 0 + : version->GetMemoryUsageByTableReaders(read_options); + return true; +} + +bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* /*db*/, + Version* version) { + const auto* vstorage = version->storage_info(); + *value = vstorage->EstimateLiveDataSize(); + return true; +} + +bool InternalStats::HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, + Version* /*version*/) { + *value = db->MinLogNumberToKeep(); + return true; +} + +bool InternalStats::HandleMinObsoleteSstNumberToKeep(uint64_t* value, + DBImpl* db, + Version* /*version*/) { + *value = db->MinObsoleteSstNumberToKeep(); + return true; +} + +bool InternalStats::HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db, + Version* /*version*/) { + const WriteController& wc = db->write_controller(); + if (!wc.NeedsDelay()) { + *value = 0; + } else { + *value = wc.delayed_write_rate(); + } + return true; +} + +bool InternalStats::HandleIsWriteStopped(uint64_t* value, DBImpl* db, + Version* /*version*/) { + *value = db->write_controller().IsStopped() ? 1 : 0; + return true; +} + +bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + // TODO(yiwu): The property is currently available for fifo compaction + // with allow_compaction = false. This is because we don't propagate + // oldest_key_time on compaction. 
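+  // In other words, the check below reports the property as unavailable
+  // (returns false) for every configuration other than FIFO compaction with
+  // allow_compaction = false.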
+ if (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO || + cfd_->GetCurrentMutableCFOptions() + ->compaction_options_fifo.allow_compaction) { + return false; + } + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + TablePropertiesCollection collection; + auto s = cfd_->current()->GetPropertiesOfAllTables(read_options, &collection); + if (!s.ok()) { + return false; + } + *value = std::numeric_limits::max(); + for (auto& p : collection) { + *value = std::min(*value, p.second->oldest_key_time); + if (*value == 0) { + break; + } + } + if (*value > 0) { + *value = std::min({cfd_->mem()->ApproximateOldestKeyTime(), + cfd_->imm()->ApproximateOldestKeyTime(), *value}); + } + return *value > 0 && *value < std::numeric_limits::max(); +} + +Cache* InternalStats::GetBlockCacheForStats() { + auto* table_factory = cfd_->ioptions()->table_factory.get(); + assert(table_factory != nullptr); + return table_factory->GetOptions(TableFactory::kBlockCacheOpts()); +} + +bool InternalStats::HandleBlockCacheCapacity(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + Cache* block_cache = GetBlockCacheForStats(); + if (block_cache) { + *value = static_cast(block_cache->GetCapacity()); + return true; + } + return false; +} + +bool InternalStats::HandleBlockCacheUsage(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + Cache* block_cache = GetBlockCacheForStats(); + if (block_cache) { + *value = static_cast(block_cache->GetUsage()); + return true; + } + return false; +} + +bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + Cache* block_cache = GetBlockCacheForStats(); + if (block_cache) { + *value = static_cast(block_cache->GetPinnedUsage()); + return true; + } + return false; +} + +void InternalStats::DumpDBMapStats( + std::map* db_stats) { + for (int i = 0; i < static_cast(kIntStatsNumMax); ++i) { + InternalDBStatsType type = static_cast(i); + (*db_stats)[db_stats_type_to_info.at(type).property_name] = + std::to_string(GetDBStats(type)); + } + double seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec; + (*db_stats)["db.uptime"] = std::to_string(seconds_up); +} + +void InternalStats::DumpDBStats(std::string* value) { + char buf[1000]; + // DB-level stats, only available from default column family + double seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec; + double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up; + snprintf(buf, sizeof(buf), + "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n", + seconds_up, interval_seconds_up); + value->append(buf); + // Cumulative + uint64_t user_bytes_written = + GetDBStats(InternalStats::kIntStatsBytesWritten); + uint64_t num_keys_written = + GetDBStats(InternalStats::kIntStatsNumKeysWritten); + uint64_t write_other = GetDBStats(InternalStats::kIntStatsWriteDoneByOther); + uint64_t write_self = GetDBStats(InternalStats::kIntStatsWriteDoneBySelf); + uint64_t wal_bytes = GetDBStats(InternalStats::kIntStatsWalFileBytes); + uint64_t wal_synced = GetDBStats(InternalStats::kIntStatsWalFileSynced); + uint64_t write_with_wal = GetDBStats(InternalStats::kIntStatsWriteWithWal); + uint64_t write_stall_micros = + GetDBStats(InternalStats::kIntStatsWriteStallMicros); + + const int kHumanMicrosLen = 32; + char human_micros[kHumanMicrosLen]; + + // Data + // writes: total number of write requests. + // keys: total number of key updates issued by all the write requests + // commit groups: number of group commits issued to the DB. 
Each group can
+  //                contain one or more writes.
+  // so writes/keys is the average number of put in multi-put or put
+  // writes/groups is the average group commit size.
+  //
+  // The format is the same for interval stats.
+  snprintf(buf, sizeof(buf),
+           "Cumulative writes: %s writes, %s keys, %s commit groups, "
+           "%.1f writes per commit group, ingest: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(write_other + write_self).c_str(),
+           NumberToHumanString(num_keys_written).c_str(),
+           NumberToHumanString(write_self).c_str(),
+           (write_other + write_self) /
+               std::max(1.0, static_cast<double>(write_self)),
+           user_bytes_written / kGB,
+           user_bytes_written / kMB / std::max(seconds_up, 0.001));
+  value->append(buf);
+  // WAL
+  snprintf(buf, sizeof(buf),
+           "Cumulative WAL: %s writes, %s syncs, "
+           "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(write_with_wal).c_str(),
+           NumberToHumanString(wal_synced).c_str(),
+           write_with_wal / std::max(1.0, static_cast<double>(wal_synced)),
+           wal_bytes / kGB, wal_bytes / kMB / std::max(seconds_up, 0.001));
+  value->append(buf);
+  // Stall
+  AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true);
+  snprintf(buf, sizeof(buf), "Cumulative stall: %s, %.1f percent\n",
+           human_micros,
+           // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+           write_stall_micros / 10000.0 / std::max(seconds_up, 0.001));
+  value->append(buf);
+
+  // Interval
+  uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other;
+  uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self;
+  uint64_t interval_num_keys_written =
+      num_keys_written - db_stats_snapshot_.num_keys_written;
+  snprintf(
+      buf, sizeof(buf),
+      "Interval writes: %s writes, %s keys, %s commit groups, "
+      "%.1f writes per commit group, ingest: %.2f MB, %.2f MB/s\n",
+      NumberToHumanString(interval_write_other + interval_write_self).c_str(),
+      NumberToHumanString(interval_num_keys_written).c_str(),
+      NumberToHumanString(interval_write_self).c_str(),
+      static_cast<double>(interval_write_other + interval_write_self) /
+          std::max(1.0, static_cast<double>(interval_write_self)),
+      (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB,
+      (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB /
+          std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+  uint64_t interval_write_with_wal =
+      write_with_wal - db_stats_snapshot_.write_with_wal;
+  uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced;
+  uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes;
+
+  snprintf(buf, sizeof(buf),
+           "Interval WAL: %s writes, %s syncs, "
+           "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(interval_write_with_wal).c_str(),
+           NumberToHumanString(interval_wal_synced).c_str(),
+           interval_write_with_wal /
+               std::max(1.0, static_cast<double>(interval_wal_synced)),
+           interval_wal_bytes / kGB,
+           interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+  // Stall
+  AppendHumanMicros(write_stall_micros - db_stats_snapshot_.write_stall_micros,
+                    human_micros, kHumanMicrosLen, true);
+  snprintf(buf, sizeof(buf), "Interval stall: %s, %.1f percent\n", human_micros,
+           // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+           (write_stall_micros - db_stats_snapshot_.write_stall_micros) /
+               10000.0 / std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+  std::string write_stall_stats;
+  DumpDBStatsWriteStall(&write_stall_stats);
+  value->append(write_stall_stats);
+
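+  // Roll the cumulative counters into db_stats_snapshot_ so the next dump
+  // can compute its interval deltas relative to this point.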
db_stats_snapshot_.seconds_up = seconds_up; + db_stats_snapshot_.ingest_bytes = user_bytes_written; + db_stats_snapshot_.write_other = write_other; + db_stats_snapshot_.write_self = write_self; + db_stats_snapshot_.num_keys_written = num_keys_written; + db_stats_snapshot_.wal_bytes = wal_bytes; + db_stats_snapshot_.wal_synced = wal_synced; + db_stats_snapshot_.write_with_wal = write_with_wal; + db_stats_snapshot_.write_stall_micros = write_stall_micros; +} + +void InternalStats::DumpDBMapStatsWriteStall( + std::map* value) { + constexpr uint32_t max_db_scope_write_stall_cause = + static_cast(WriteStallCause::kDBScopeWriteStallCauseEnumMax); + + for (uint32_t i = + max_db_scope_write_stall_cause - kNumDBScopeWriteStallCauses; + i < max_db_scope_write_stall_cause; ++i) { + for (uint32_t j = 0; + j < static_cast(WriteStallCondition::kNormal); ++j) { + WriteStallCause cause = static_cast(i); + WriteStallCondition condition = static_cast(j); + InternalStats::InternalDBStatsType internal_db_stat = + InternalDBStat(cause, condition); + + if (internal_db_stat == InternalStats::kIntStatsNumMax) { + continue; + } + + std::string name = + WriteStallStatsMapKeys::CauseConditionCount(cause, condition); + uint64_t stat = + db_stats_[static_cast(internal_db_stat)].load( + std::memory_order_relaxed); + (*value)[name] = std::to_string(stat); + } + } +} + +void InternalStats::DumpDBStatsWriteStall(std::string* value) { + assert(value); + + std::map write_stall_stats_map; + DumpDBMapStatsWriteStall(&write_stall_stats_map); + + std::ostringstream str; + str << "Write Stall (count): "; + + for (auto write_stall_stats_map_iter = write_stall_stats_map.begin(); + write_stall_stats_map_iter != write_stall_stats_map.end(); + write_stall_stats_map_iter++) { + const auto& name_and_stat = *write_stall_stats_map_iter; + str << name_and_stat.first << ": " << name_and_stat.second; + if (std::next(write_stall_stats_map_iter) == write_stall_stats_map.end()) { + str << "\n"; + } else { + str << ", "; + } + } + *value = str.str(); +} + +/** + * Dump Compaction Level stats to a map of stat name with "compaction." prefix + * to value in double as string. The level in stat name is represented with + * a prefix "Lx" where "x" is the level number. A special level "Sum" + * represents the sum of a stat for all levels. + * The result also contains IO stall counters which keys start with "io_stalls." + * and values represent uint64 encoded as strings. + */ +void InternalStats::DumpCFMapStats( + std::map* cf_stats) { + const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); + CompactionStats compaction_stats_sum; + std::map> levels_stats; + DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum); + for (auto const& level_ent : levels_stats) { + auto level_str = + level_ent.first == -1 ? "Sum" : "L" + std::to_string(level_ent.first); + for (auto const& stat_ent : level_ent.second) { + auto stat_type = stat_ent.first; + auto key_str = + "compaction." + level_str + "." + + InternalStats::compaction_level_stats.at(stat_type).property_name; + (*cf_stats)[key_str] = std::to_string(stat_ent.second); + } + } + + DumpCFMapStatsWriteStall(cf_stats); +} + +void InternalStats::DumpCFMapStats( + const VersionStorageInfo* vstorage, + std::map>* levels_stats, + CompactionStats* compaction_stats_sum) { + assert(vstorage); + + int num_levels_to_check = + (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO) + ? vstorage->num_levels() - 1 + : 1; + + // Compaction scores are sorted based on its value. 
Restore them to the
+  // level order
+  std::vector<double> compaction_score(number_levels_, 0);
+  for (int i = 0; i < num_levels_to_check; ++i) {
+    compaction_score[vstorage->CompactionScoreLevel(i)] =
+        vstorage->CompactionScore(i);
+  }
+  // Count # of files being compacted for each level
+  std::vector<int> files_being_compacted(number_levels_, 0);
+  for (int level = 0; level < number_levels_; ++level) {
+    for (auto* f : vstorage->LevelFiles(level)) {
+      if (f->being_compacted) {
+        ++files_being_compacted[level];
+      }
+    }
+  }
+
+  int total_files = 0;
+  int total_files_being_compacted = 0;
+  double total_file_size = 0;
+  uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+  uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+  uint64_t curr_ingest = flush_ingest + add_file_ingest;
+  for (int level = 0; level < number_levels_; level++) {
+    int files = vstorage->NumLevelFiles(level);
+    total_files += files;
+    total_files_being_compacted += files_being_compacted[level];
+    if (comp_stats_[level].micros > 0 || comp_stats_[level].cpu_micros > 0 ||
+        files > 0) {
+      compaction_stats_sum->Add(comp_stats_[level]);
+      total_file_size += vstorage->NumLevelBytes(level);
+      uint64_t input_bytes;
+      if (level == 0) {
+        input_bytes = curr_ingest;
+      } else {
+        input_bytes = comp_stats_[level].bytes_read_non_output_levels +
+                      comp_stats_[level].bytes_read_blob;
+      }
+      double w_amp =
+          (input_bytes == 0)
+              ? 0.0
+              : static_cast<double>(comp_stats_[level].bytes_written +
+                                    comp_stats_[level].bytes_written_blob) /
+                    input_bytes;
+      std::map<LevelStatType, double> level_stats;
+      PrepareLevelStats(&level_stats, files, files_being_compacted[level],
+                        static_cast<double>(vstorage->NumLevelBytes(level)),
+                        compaction_score[level], w_amp, comp_stats_[level]);
+      (*levels_stats)[level] = level_stats;
+    }
+  }
+  // Cumulative summary
+  double w_amp = (0 == curr_ingest)
+                     ?
0.0 + : (compaction_stats_sum->bytes_written + + compaction_stats_sum->bytes_written_blob) / + static_cast(curr_ingest); + // Stats summary across levels + std::map sum_stats; + PrepareLevelStats(&sum_stats, total_files, total_files_being_compacted, + total_file_size, 0, w_amp, *compaction_stats_sum); + (*levels_stats)[-1] = sum_stats; // -1 is for the Sum level +} + +void InternalStats::DumpCFMapStatsByPriority( + std::map>* priorities_stats) { + for (size_t priority = 0; priority < comp_stats_by_pri_.size(); priority++) { + if (comp_stats_by_pri_[priority].micros > 0) { + std::map priority_stats; + PrepareLevelStats(&priority_stats, 0 /* num_files */, + 0 /* being_compacted */, 0 /* total_file_size */, + 0 /* compaction_score */, 0 /* w_amp */, + comp_stats_by_pri_[priority]); + (*priorities_stats)[static_cast(priority)] = priority_stats; + } + } +} + +void InternalStats::DumpCFMapStatsWriteStall( + std::map* value) { + uint64_t total_delays = 0; + uint64_t total_stops = 0; + constexpr uint32_t max_cf_scope_write_stall_cause = + static_cast(WriteStallCause::kCFScopeWriteStallCauseEnumMax); + + for (uint32_t i = + max_cf_scope_write_stall_cause - kNumCFScopeWriteStallCauses; + i < max_cf_scope_write_stall_cause; ++i) { + for (uint32_t j = 0; + j < static_cast(WriteStallCondition::kNormal); ++j) { + WriteStallCause cause = static_cast(i); + WriteStallCondition condition = static_cast(j); + InternalStats::InternalCFStatsType internal_cf_stat = + InternalCFStat(cause, condition); + + if (internal_cf_stat == InternalStats::INTERNAL_CF_STATS_ENUM_MAX) { + continue; + } + + std::string name = + WriteStallStatsMapKeys::CauseConditionCount(cause, condition); + uint64_t stat = + cf_stats_count_[static_cast(internal_cf_stat)]; + (*value)[name] = std::to_string(stat); + + if (condition == WriteStallCondition::kDelayed) { + total_delays += stat; + } else if (condition == WriteStallCondition::kStopped) { + total_stops += stat; + } + } + } + + (*value)[WriteStallStatsMapKeys:: + CFL0FileCountLimitDelaysWithOngoingCompaction()] = + std::to_string( + cf_stats_count_[L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION]); + (*value)[WriteStallStatsMapKeys:: + CFL0FileCountLimitStopsWithOngoingCompaction()] = + std::to_string( + cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION]); + + (*value)[WriteStallStatsMapKeys::TotalStops()] = std::to_string(total_stops); + (*value)[WriteStallStatsMapKeys::TotalDelays()] = + std::to_string(total_delays); +} + +void InternalStats::DumpCFStatsWriteStall(std::string* value, + uint64_t* total_stall_count) { + assert(value); + + std::map write_stall_stats_map; + DumpCFMapStatsWriteStall(&write_stall_stats_map); + + std::ostringstream str; + str << "Write Stall (count): "; + + for (auto write_stall_stats_map_iter = write_stall_stats_map.begin(); + write_stall_stats_map_iter != write_stall_stats_map.end(); + write_stall_stats_map_iter++) { + const auto& name_and_stat = *write_stall_stats_map_iter; + str << name_and_stat.first << ": " << name_and_stat.second; + if (std::next(write_stall_stats_map_iter) == write_stall_stats_map.end()) { + str << "\n"; + } else { + str << ", "; + } + } + + if (total_stall_count) { + *total_stall_count = + ParseUint64( + write_stall_stats_map[WriteStallStatsMapKeys::TotalStops()]) + + ParseUint64( + write_stall_stats_map[WriteStallStatsMapKeys::TotalDelays()]); + if (*total_stall_count > 0) { + str << "interval: " << *total_stall_count - cf_stats_snapshot_.stall_count + << " total count\n"; + } + } + *value = str.str(); +} + +void 
InternalStats::DumpCFStats(std::string* value) { + DumpCFStatsNoFileHistogram(/*is_periodic=*/false, value); + DumpCFFileHistogram(value); +} + +void InternalStats::DumpCFStatsNoFileHistogram(bool is_periodic, + std::string* value) { + char buf[2000]; + // Per-ColumnFamily stats + PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Level"); + value->append(buf); + + // Print stats for each level + const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); + std::map> levels_stats; + CompactionStats compaction_stats_sum; + DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum); + for (int l = 0; l < number_levels_; ++l) { + if (levels_stats.find(l) != levels_stats.end()) { + PrintLevelStats(buf, sizeof(buf), "L" + std::to_string(l), + levels_stats[l]); + value->append(buf); + } + } + + // Print sum of level stats + PrintLevelStats(buf, sizeof(buf), "Sum", levels_stats[-1]); + value->append(buf); + + uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED]; + uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE]; + uint64_t ingest_files_addfile = cf_stats_value_[INGESTED_NUM_FILES_TOTAL]; + uint64_t ingest_l0_files_addfile = + cf_stats_value_[INGESTED_LEVEL0_NUM_FILES_TOTAL]; + uint64_t ingest_keys_addfile = cf_stats_value_[INGESTED_NUM_KEYS_TOTAL]; + // Interval summary + uint64_t interval_flush_ingest = + flush_ingest - cf_stats_snapshot_.ingest_bytes_flush; + uint64_t interval_add_file_inget = + add_file_ingest - cf_stats_snapshot_.ingest_bytes_addfile; + uint64_t interval_ingest = + interval_flush_ingest + interval_add_file_inget + 1; + CompactionStats interval_stats(compaction_stats_sum); + interval_stats.Subtract(cf_stats_snapshot_.comp_stats); + double w_amp = + (interval_stats.bytes_written + interval_stats.bytes_written_blob) / + static_cast(interval_ingest); + PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats); + value->append(buf); + + PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Priority"); + value->append(buf); + std::map> priorities_stats; + DumpCFMapStatsByPriority(&priorities_stats); + for (size_t priority = 0; priority < comp_stats_by_pri_.size(); ++priority) { + if (priorities_stats.find(static_cast(priority)) != + priorities_stats.end()) { + PrintLevelStats( + buf, sizeof(buf), + Env::PriorityToString(static_cast(priority)), + priorities_stats[static_cast(priority)]); + value->append(buf); + } + } + + const auto blob_st = vstorage->GetBlobStats(); + + snprintf(buf, sizeof(buf), + "\nBlob file count: %" ROCKSDB_PRIszt + ", total size: %.1f GB, garbage size: %.1f GB, space amp: %.1f\n\n", + vstorage->GetBlobFiles().size(), blob_st.total_file_size / kGB, + blob_st.total_garbage_size / kGB, blob_st.space_amp); + value->append(buf); + + uint64_t now_micros = clock_->NowMicros(); + double seconds_up = (now_micros - started_at_) / kMicrosInSec; + double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up; + snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n", + seconds_up, interval_seconds_up); + value->append(buf); + snprintf(buf, sizeof(buf), "Flush(GB): cumulative %.3f, interval %.3f\n", + flush_ingest / kGB, interval_flush_ingest / kGB); + value->append(buf); + snprintf(buf, sizeof(buf), "AddFile(GB): cumulative %.3f, interval %.3f\n", + add_file_ingest / kGB, interval_add_file_inget / kGB); + value->append(buf); + + uint64_t interval_ingest_files_addfile = + ingest_files_addfile - cf_stats_snapshot_.ingest_files_addfile; + snprintf(buf, sizeof(buf), + "AddFile(Total 
Files): cumulative %" PRIu64 ", interval %" PRIu64 + "\n", + ingest_files_addfile, interval_ingest_files_addfile); + value->append(buf); + + uint64_t interval_ingest_l0_files_addfile = + ingest_l0_files_addfile - cf_stats_snapshot_.ingest_l0_files_addfile; + snprintf(buf, sizeof(buf), + "AddFile(L0 Files): cumulative %" PRIu64 ", interval %" PRIu64 "\n", + ingest_l0_files_addfile, interval_ingest_l0_files_addfile); + value->append(buf); + + uint64_t interval_ingest_keys_addfile = + ingest_keys_addfile - cf_stats_snapshot_.ingest_keys_addfile; + snprintf(buf, sizeof(buf), + "AddFile(Keys): cumulative %" PRIu64 ", interval %" PRIu64 "\n", + ingest_keys_addfile, interval_ingest_keys_addfile); + value->append(buf); + + // Compact + uint64_t compact_bytes_read = 0; + uint64_t compact_bytes_write = 0; + uint64_t compact_micros = 0; + for (int level = 0; level < number_levels_; level++) { + compact_bytes_read += comp_stats_[level].bytes_read_output_level + + comp_stats_[level].bytes_read_non_output_levels + + comp_stats_[level].bytes_read_blob; + compact_bytes_write += comp_stats_[level].bytes_written + + comp_stats_[level].bytes_written_blob; + compact_micros += comp_stats_[level].micros; + } + + snprintf(buf, sizeof(buf), + "Cumulative compaction: %.2f GB write, %.2f MB/s write, " + "%.2f GB read, %.2f MB/s read, %.1f seconds\n", + compact_bytes_write / kGB, + compact_bytes_write / kMB / std::max(seconds_up, 0.001), + compact_bytes_read / kGB, + compact_bytes_read / kMB / std::max(seconds_up, 0.001), + compact_micros / kMicrosInSec); + value->append(buf); + + // Compaction interval + uint64_t interval_compact_bytes_write = + compact_bytes_write - cf_stats_snapshot_.compact_bytes_write; + uint64_t interval_compact_bytes_read = + compact_bytes_read - cf_stats_snapshot_.compact_bytes_read; + uint64_t interval_compact_micros = + compact_micros - cf_stats_snapshot_.compact_micros; + + snprintf( + buf, sizeof(buf), + "Interval compaction: %.2f GB write, %.2f MB/s write, " + "%.2f GB read, %.2f MB/s read, %.1f seconds\n", + interval_compact_bytes_write / kGB, + interval_compact_bytes_write / kMB / std::max(interval_seconds_up, 0.001), + interval_compact_bytes_read / kGB, + interval_compact_bytes_read / kMB / std::max(interval_seconds_up, 0.001), + interval_compact_micros / kMicrosInSec); + value->append(buf); + if (is_periodic) { + cf_stats_snapshot_.compact_bytes_write = compact_bytes_write; + cf_stats_snapshot_.compact_bytes_read = compact_bytes_read; + cf_stats_snapshot_.compact_micros = compact_micros; + } + + std::string write_stall_stats; + uint64_t total_stall_count; + DumpCFStatsWriteStall(&write_stall_stats, &total_stall_count); + value->append(write_stall_stats); + + if (is_periodic) { + cf_stats_snapshot_.seconds_up = seconds_up; + cf_stats_snapshot_.ingest_bytes_flush = flush_ingest; + cf_stats_snapshot_.ingest_bytes_addfile = add_file_ingest; + cf_stats_snapshot_.ingest_files_addfile = ingest_files_addfile; + cf_stats_snapshot_.ingest_l0_files_addfile = ingest_l0_files_addfile; + cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile; + cf_stats_snapshot_.comp_stats = compaction_stats_sum; + cf_stats_snapshot_.stall_count = total_stall_count; + } + + // Do not gather cache entry stats during CFStats because DB + // mutex is held. 
+  // mutex is held. Only dump last cached collection (rely on DB
+  // periodic stats dump to update)
+  if (cache_entry_stats_collector_) {
+    CacheEntryRoleStats stats;
+    // thread safe
+    cache_entry_stats_collector_->GetStats(&stats);
+
+    constexpr uint64_t kDayInMicros = uint64_t{86400} * 1000000U;
+
+    // Skip if stats are extremely old (> 1 day, incl not yet populated)
+    if (now_micros - stats.last_end_time_micros_ < kDayInMicros) {
+      value->append(stats.ToString(clock_));
+    }
+  }
+}
+
+void InternalStats::DumpCFFileHistogram(std::string* value) {
+  assert(value);
+  assert(cfd_);
+
+  std::ostringstream oss;
+  oss << "\n** File Read Latency Histogram By Level [" << cfd_->GetName()
+      << "] **\n";
+
+  for (int level = 0; level < number_levels_; level++) {
+    if (!file_read_latency_[level].Empty()) {
+      oss << "** Level " << level << " read latency histogram (micros):\n"
+          << file_read_latency_[level].ToString() << '\n';
+    }
+  }
+
+  if (!blob_file_read_latency_.Empty()) {
+    oss << "** Blob file read latency histogram (micros):\n"
+        << blob_file_read_latency_.ToString() << '\n';
+  }
+
+  value->append(oss.str());
+}
+
+
+}  // namespace ROCKSDB_NAMESPACE
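The report assembled by the dump functions above surfaces through the public DB::GetProperty API. A minimal sketch of pulling the same text from application code (illustrative only; the database path is hypothetical):

    #include <cassert>
    #include <iostream>
    #include <string>
    #include "rocksdb/db.h"

    int main() {
      rocksdb::DB* db = nullptr;
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/cfstats-demo", &db);
      assert(s.ok());
      std::string stats;
      // "rocksdb.cfstats" is routed to InternalStats::HandleCFStats, which
      // calls DumpCFStats (and thus the two functions above).
      if (db->GetProperty("rocksdb.cfstats", &stats)) {
        std::cout << stats << '\n';
      }
      delete db;
      return 0;
    }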
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/internal_stats.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/internal_stats.h
new file mode 100644
index 0000000000..a65ef629aa
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/internal_stats.h
@@ -0,0 +1,875 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "cache/cache_entry_roles.h"
+#include "db/version_set.h"
+#include "rocksdb/system_clock.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <class Stats>
+class CacheEntryStatsCollector;
+class DBImpl;
+class MemTableList;
+
+// Config for retrieving a property's value.
+struct DBPropertyInfo {
+  bool need_out_of_mutex;
+
+  // gcc had an internal error for initializing union of pointer-to-member-
+  // functions. Workaround is to populate exactly one of the following function
+  // pointers with a non-nullptr value.
+
+  // @param value Value-result argument for storing the property's string value
+  // @param suffix Argument portion of the property. For example, suffix would
+  //      be "5" for the property "rocksdb.num-files-at-level5". So far, only
+  //      certain string properties take an argument.
+  bool (InternalStats::*handle_string)(std::string* value, Slice suffix);
+
+  // @param value Value-result argument for storing the property's uint64 value
+  // @param db Many of the int properties rely on DBImpl methods.
+  // @param version Version is needed in case the property is retrieved without
+  //      holding db mutex, which is only supported for int properties.
+  bool (InternalStats::*handle_int)(uint64_t* value, DBImpl* db,
+                                    Version* version);
+
+  // @param props Map of general properties to populate
+  // @param suffix Argument portion of the property. (see handle_string)
+  bool (InternalStats::*handle_map)(std::map<std::string, std::string>* props,
+                                    Slice suffix);
+
+  // handle the string type properties rely on DBImpl methods
+  // @param value Value-result argument for storing the property's string value
+  bool (DBImpl::*handle_string_dbimpl)(std::string* value);
+};
+
+extern const DBPropertyInfo* GetPropertyInfo(const Slice& property);
+
+#undef SCORE
+enum class LevelStatType {
+  INVALID = 0,
+  NUM_FILES,
+  COMPACTED_FILES,
+  SIZE_BYTES,
+  SCORE,
+  READ_GB,
+  RN_GB,
+  RNP1_GB,
+  WRITE_GB,
+  W_NEW_GB,
+  MOVED_GB,
+  WRITE_AMP,
+  READ_MBPS,
+  WRITE_MBPS,
+  COMP_SEC,
+  COMP_CPU_SEC,
+  COMP_COUNT,
+  AVG_SEC,
+  KEY_IN,
+  KEY_DROP,
+  R_BLOB_GB,
+  W_BLOB_GB,
+  TOTAL  // total number of types
+};
+
+struct LevelStat {
+  // This what will be L?.property_name in the flat map returned to the user
+  std::string property_name;
+  // This will be what we will print in the header in the cli
+  std::string header_name;
+};
+
+struct DBStatInfo {
+  // This what will be property_name in the flat map returned to the user
+  std::string property_name;
+};
+
+class InternalStats {
+ public:
+  static const std::map<LevelStatType, LevelStat> compaction_level_stats;
+
+  enum InternalCFStatsType {
+    MEMTABLE_LIMIT_DELAYS,
+    MEMTABLE_LIMIT_STOPS,
+    L0_FILE_COUNT_LIMIT_DELAYS,
+    L0_FILE_COUNT_LIMIT_STOPS,
+    PENDING_COMPACTION_BYTES_LIMIT_DELAYS,
+    PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+    // Write slowdown caused by l0 file count limit while there is ongoing L0
+    // compaction
+    L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION,
+    // Write stop caused by l0 file count limit while there is ongoing L0
+    // compaction
+    L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION,
+    WRITE_STALLS_ENUM_MAX,
+    // End of all write stall stats
+    BYTES_FLUSHED,
+    BYTES_INGESTED_ADD_FILE,
+    INGESTED_NUM_FILES_TOTAL,
+    INGESTED_LEVEL0_NUM_FILES_TOTAL,
+    INGESTED_NUM_KEYS_TOTAL,
+    INTERNAL_CF_STATS_ENUM_MAX,
+  };
+
+  enum InternalDBStatsType {
+    kIntStatsWalFileBytes,
+    kIntStatsWalFileSynced,
+    kIntStatsBytesWritten,
+    kIntStatsNumKeysWritten,
+    kIntStatsWriteDoneByOther,
+    kIntStatsWriteDoneBySelf,
+    kIntStatsWriteWithWal,
+    // TODO(hx235): Currently `kIntStatsWriteStallMicros` only measures
+    // "delayed" time of CF-scope write stalls, not including the "stopped"
+    // time nor any DB-scope write stalls (e.g, ones triggered by
+    // `WriteBufferManager`).
+    //
+    // However, the word "write stall" includes both "delayed" and "stopped"
+    // (see `WriteStallCondition`) and DB-scope writes stalls (see
+    // `WriteStallCause`).
+    //
+    // So we should improve, rename or clarify it
+    kIntStatsWriteStallMicros,
+    kIntStatsWriteBufferManagerLimitStopsCounts,
+    kIntStatsNumMax,
+  };
+
+  static const std::map<InternalDBStatsType, DBStatInfo> db_stats_type_to_info;
+
+  InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd);
+
+  // Per level compaction stats
+  struct CompactionOutputsStats {
+    uint64_t num_output_records = 0;
+    uint64_t bytes_written = 0;
+    uint64_t bytes_written_blob = 0;
+    uint64_t num_output_files = 0;
+    uint64_t num_output_files_blob = 0;
+
+    void Add(const CompactionOutputsStats& stats) {
+      this->num_output_records += stats.num_output_records;
+      this->bytes_written += stats.bytes_written;
+      this->bytes_written_blob += stats.bytes_written_blob;
+      this->num_output_files += stats.num_output_files;
+      this->num_output_files_blob += stats.num_output_files_blob;
+    }
+  };
+
+  // Per level compaction stats.  comp_stats_[level] stores the stats for
+  // compactions that produced data for the specified "level".
+  struct CompactionStats {
+    uint64_t micros;
+    uint64_t cpu_micros;
+
+    // The number of bytes read from all non-output levels (table files)
+    uint64_t bytes_read_non_output_levels;
+
+    // The number of bytes read from the compaction output level (table files)
+    uint64_t bytes_read_output_level;
+
+    // The number of bytes read from blob files
+    uint64_t bytes_read_blob;
+
+    // Total number of bytes written to table files during compaction
+    uint64_t bytes_written;
+
+    // Total number of bytes written to blob files during compaction
+    uint64_t bytes_written_blob;
+
+    // Total number of bytes moved to the output level (table files)
+    uint64_t bytes_moved;
+
+    // The number of compaction input files in all non-output levels (table
+    // files)
+    int num_input_files_in_non_output_levels;
+
+    // The number of compaction input files in the output level (table files)
+    int num_input_files_in_output_level;
+
+    // The number of compaction output files (table files)
+    int num_output_files;
+
+    // The number of compaction output files (blob files)
+    int num_output_files_blob;
+
+    // Total incoming entries during compaction between levels N and N+1
+    uint64_t num_input_records;
+
+    // Accumulated diff number of entries
+    // (num input entries - num output entries) for compaction levels N and N+1
+    uint64_t num_dropped_records;
+
+    // Total output entries from compaction
+    uint64_t num_output_records;
+
+    // Number of compactions done
+    int count;
+
+    // Number of compactions done per CompactionReason
+    int counts[static_cast<int>(CompactionReason::kNumOfReasons)]{};
+
+    explicit CompactionStats()
+        : micros(0),
+          cpu_micros(0),
+          bytes_read_non_output_levels(0),
+          bytes_read_output_level(0),
+          bytes_read_blob(0),
+          bytes_written(0),
+          bytes_written_blob(0),
+          bytes_moved(0),
+          num_input_files_in_non_output_levels(0),
+          num_input_files_in_output_level(0),
+          num_output_files(0),
+          num_output_files_blob(0),
+          num_input_records(0),
+          num_dropped_records(0),
+          num_output_records(0),
+          count(0) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+    }
+
+    explicit CompactionStats(CompactionReason reason, int c)
+        : micros(0),
+          cpu_micros(0),
+          bytes_read_non_output_levels(0),
+          bytes_read_output_level(0),
+          bytes_read_blob(0),
+          bytes_written(0),
+          bytes_written_blob(0),
+          bytes_moved(0),
+          num_input_files_in_non_output_levels(0),
+          num_input_files_in_output_level(0),
+          num_output_files(0),
+          num_output_files_blob(0),
+          num_input_records(0),
+          num_dropped_records(0),
+          num_output_records(0),
+          count(c) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+      int r = static_cast<int>(reason);
+      if (r >= 0 && r < num_of_reasons) {
+        counts[r] = c;
+      } else {
+        count = 0;
+      }
+    }
+
+    CompactionStats(const CompactionStats& c)
+        : micros(c.micros),
+          cpu_micros(c.cpu_micros),
+          bytes_read_non_output_levels(c.bytes_read_non_output_levels),
+          bytes_read_output_level(c.bytes_read_output_level),
+          bytes_read_blob(c.bytes_read_blob),
+          bytes_written(c.bytes_written),
+          bytes_written_blob(c.bytes_written_blob),
+          bytes_moved(c.bytes_moved),
+          num_input_files_in_non_output_levels(
+              c.num_input_files_in_non_output_levels),
+          num_input_files_in_output_level(c.num_input_files_in_output_level),
+          num_output_files(c.num_output_files),
+          num_output_files_blob(c.num_output_files_blob),
+          num_input_records(c.num_input_records),
+          num_dropped_records(c.num_dropped_records),
+          num_output_records(c.num_output_records),
+          count(c.count) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = c.counts[i];
+      }
+    }
+
+    CompactionStats& operator=(const CompactionStats& c) {
+      micros = c.micros;
+      cpu_micros = c.cpu_micros;
+      bytes_read_non_output_levels = c.bytes_read_non_output_levels;
+      bytes_read_output_level = c.bytes_read_output_level;
+      bytes_read_blob = c.bytes_read_blob;
+      bytes_written = c.bytes_written;
+      bytes_written_blob = c.bytes_written_blob;
+      bytes_moved = c.bytes_moved;
+      num_input_files_in_non_output_levels =
+          c.num_input_files_in_non_output_levels;
+      num_input_files_in_output_level = c.num_input_files_in_output_level;
+      num_output_files = c.num_output_files;
+      num_output_files_blob = c.num_output_files_blob;
+      num_input_records = c.num_input_records;
+      num_dropped_records = c.num_dropped_records;
+      num_output_records = c.num_output_records;
+      count = c.count;
+
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = c.counts[i];
+      }
+      return *this;
+    }
+
+    void Clear() {
+      this->micros = 0;
+      this->cpu_micros = 0;
+      this->bytes_read_non_output_levels = 0;
+      this->bytes_read_output_level = 0;
+      this->bytes_read_blob = 0;
+      this->bytes_written = 0;
+      this->bytes_written_blob = 0;
+      this->bytes_moved = 0;
+      this->num_input_files_in_non_output_levels = 0;
+      this->num_input_files_in_output_level = 0;
+      this->num_output_files = 0;
+      this->num_output_files_blob = 0;
+      this->num_input_records = 0;
+      this->num_dropped_records = 0;
+      this->num_output_records = 0;
+      this->count = 0;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+    }
+
+    void Add(const CompactionStats& c) {
+      this->micros += c.micros;
+      this->cpu_micros += c.cpu_micros;
+      this->bytes_read_non_output_levels += c.bytes_read_non_output_levels;
+      this->bytes_read_output_level += c.bytes_read_output_level;
+      this->bytes_read_blob += c.bytes_read_blob;
+      this->bytes_written += c.bytes_written;
+      this->bytes_written_blob += c.bytes_written_blob;
+      this->bytes_moved += c.bytes_moved;
+      this->num_input_files_in_non_output_levels +=
+          c.num_input_files_in_non_output_levels;
+      this->num_input_files_in_output_level +=
+          c.num_input_files_in_output_level;
+      this->num_output_files += c.num_output_files;
+      this->num_output_files_blob += c.num_output_files_blob;
+      this->num_input_records += c.num_input_records;
+      this->num_dropped_records += c.num_dropped_records;
+      this->num_output_records += c.num_output_records;
+      this->count += c.count;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] += c.counts[i];
+      }
+    }
+
+    void Add(const CompactionOutputsStats& stats) {
+      this->num_output_files += static_cast<int>(stats.num_output_files);
+      this->num_output_records += stats.num_output_records;
+      this->bytes_written += stats.bytes_written;
+      this->bytes_written_blob += stats.bytes_written_blob;
+      this->num_output_files_blob +=
+          static_cast<int>(stats.num_output_files_blob);
+    }
+
+    void Subtract(const CompactionStats& c) {
+      this->micros -= c.micros;
+      this->cpu_micros -= c.cpu_micros;
+      this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels;
+      this->bytes_read_output_level -= c.bytes_read_output_level;
+      this->bytes_read_blob -= c.bytes_read_blob;
+      this->bytes_written -= c.bytes_written;
+      this->bytes_written_blob -= c.bytes_written_blob;
+      this->bytes_moved -= c.bytes_moved;
+      this->num_input_files_in_non_output_levels -=
+          c.num_input_files_in_non_output_levels;
+      this->num_input_files_in_output_level -=
+          c.num_input_files_in_output_level;
+      this->num_output_files -= c.num_output_files;
+      this->num_output_files_blob -= c.num_output_files_blob;
+      this->num_input_records -= c.num_input_records;
+      this->num_dropped_records -= c.num_dropped_records;
+      this->num_output_records -= c.num_output_records;
+      this->count -= c.count;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] -= c.counts[i];
+      }
+    }
+
+    void ResetCompactionReason(CompactionReason reason) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      assert(count == 1);  // only support update one compaction reason
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+      int r = static_cast<int>(reason);
+      assert(r >= 0 && r < num_of_reasons);
+      counts[r] = 1;
+    }
+  };
+
+  // Compaction stats, for per_key_placement compaction, it includes 2 levels
+  // stats: the last level and the penultimate level.
+  struct CompactionStatsFull {
+    // the stats for the target primary output level
+    CompactionStats stats;
+
+    // stats for penultimate level output if exist
+    bool has_penultimate_level_output = false;
+    CompactionStats penultimate_level_stats;
+
+    explicit CompactionStatsFull() : stats(), penultimate_level_stats() {}
+
+    explicit CompactionStatsFull(CompactionReason reason, int c)
+        : stats(reason, c), penultimate_level_stats(reason, c){};
+
+    uint64_t TotalBytesWritten() const {
+      uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob;
+      if (has_penultimate_level_output) {
+        bytes_written += penultimate_level_stats.bytes_written +
+                         penultimate_level_stats.bytes_written_blob;
+      }
+      return bytes_written;
+    }
+
+    uint64_t DroppedRecords() {
+      uint64_t output_records = stats.num_output_records;
+      if (has_penultimate_level_output) {
+        output_records += penultimate_level_stats.num_output_records;
+      }
+      if (stats.num_input_records > output_records) {
+        return stats.num_input_records - output_records;
+      }
+      return 0;
+    }
+
+    void SetMicros(uint64_t val) {
+      stats.micros = val;
+      penultimate_level_stats.micros = val;
+    }
+
+    void AddCpuMicros(uint64_t val) {
+      stats.cpu_micros += val;
+      penultimate_level_stats.cpu_micros += val;
+    }
+  };
+
+  // For use with CacheEntryStatsCollector
+  struct CacheEntryRoleStats {
+    uint64_t cache_capacity = 0;
+    uint64_t cache_usage = 0;
+    size_t table_size = 0;
+    size_t occupancy = 0;
+    std::string cache_id;
+    std::array<uint64_t, kNumCacheEntryRoles> total_charges;
+    std::array<size_t, kNumCacheEntryRoles> entry_counts;
+    uint32_t collection_count = 0;
+    uint32_t copies_of_last_collection = 0;
+    uint64_t last_start_time_micros_ = 0;
+    uint64_t last_end_time_micros_ = 0;
+    uint32_t hash_seed = 0;
+
+    void Clear() {
+      // Wipe everything except collection_count
+      uint32_t saved_collection_count = collection_count;
+      *this = CacheEntryRoleStats();
+      collection_count = saved_collection_count;
+    }
+
+    void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros);
+    std::function<void(const Slice& key, Cache::ObjectPtr value, size_t charge,
+                       const Cache::CacheItemHelper* helper)>
+    GetEntryCallback();
+    void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros);
+    void SkippedCollection();
+
+    std::string ToString(SystemClock* clock) const;
+    void ToMap(std::map<std::string, std::string>* values,
+               SystemClock* clock) const;
+
+   private:
+    uint64_t GetLastDurationMicros() const;
+  };
+
+  void Clear() {
+    for (int i = 0; i < kIntStatsNumMax; i++) {
+      db_stats_[i].store(0);
+    }
+    for (int i = 0; i < INTERNAL_CF_STATS_ENUM_MAX; i++) {
+      cf_stats_count_[i] = 0;
+      cf_stats_value_[i] = 0;
+    }
+    for (auto& comp_stat : comp_stats_) {
+      comp_stat.Clear();
+    }
+    per_key_placement_comp_stats_.Clear();
+    for (auto& h : file_read_latency_) {
+      h.Clear();
+    }
+    blob_file_read_latency_.Clear();
+    cf_stats_snapshot_.Clear();
+    db_stats_snapshot_.Clear();
+    bg_error_count_ = 0;
+    started_at_ = clock_->NowMicros();
+    has_cf_change_since_dump_ = true;
+  }
+
+  void AddCompactionStats(int level, Env::Priority thread_pri,
+                          const CompactionStats& stats) {
+    comp_stats_[level].Add(stats);
+    comp_stats_by_pri_[thread_pri].Add(stats);
+  }
+
+  void AddCompactionStats(int level, Env::Priority thread_pri,
+                          const CompactionStatsFull& comp_stats_full) {
+    AddCompactionStats(level, thread_pri, comp_stats_full.stats);
+    if (comp_stats_full.has_penultimate_level_output) {
+      per_key_placement_comp_stats_.Add(
+          comp_stats_full.penultimate_level_stats);
+    }
+  }
+
+  void IncBytesMoved(int level, uint64_t amount) {
+    comp_stats_[level].bytes_moved += amount;
+  }
+
+  void AddCFStats(InternalCFStatsType type, uint64_t value) {
+    has_cf_change_since_dump_ = true;
+    cf_stats_value_[type] += value;
+    ++cf_stats_count_[type];
+  }
+
+  void AddDBStats(InternalDBStatsType type, uint64_t value,
+                  bool concurrent = false) {
+    auto& v = db_stats_[type];
+    if (concurrent) {
+      v.fetch_add(value, std::memory_order_relaxed);
+    } else {
+      v.store(v.load(std::memory_order_relaxed) + value,
+              std::memory_order_relaxed);
+    }
+  }
+
+  uint64_t GetDBStats(InternalDBStatsType type) {
+    return db_stats_[type].load(std::memory_order_relaxed);
+  }
+
+  HistogramImpl* GetFileReadHist(int level) {
+    return &file_read_latency_[level];
+  }
+
+  HistogramImpl* GetBlobFileReadHist() { return &blob_file_read_latency_; }
+
+  uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
+
+  uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
+
+  bool GetStringProperty(const DBPropertyInfo& property_info,
+                         const Slice& property, std::string* value);
+
+  bool GetMapProperty(const DBPropertyInfo& property_info,
+                      const Slice& property,
+                      std::map<std::string, std::string>* value);
+
+  bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value,
+                      DBImpl* db);
+
+  bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info,
+                                Version* version, uint64_t* value);
+
+  // Unless there is a recent enough collection of the stats, collect and
+  // saved new cache entry stats. If `foreground`, require data to be more
+  // recent to skip re-collection.
+  //
+  // This should only be called while NOT holding the DB mutex.
+  void CollectCacheEntryStats(bool foreground);
+
+  const uint64_t* TEST_GetCFStatsValue() const { return cf_stats_value_; }
+
+  const std::vector<CompactionStats>& TEST_GetCompactionStats() const {
+    return comp_stats_;
+  }
+
+  const CompactionStats& TEST_GetPerKeyPlacementCompactionStats() const {
+    return per_key_placement_comp_stats_;
+  }
+
+  void TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats, bool foreground);
+
+  // Store a mapping from the user-facing DB::Properties string to our
+  // DBPropertyInfo struct used internally for retrieving properties.
+  static const UnorderedMap<std::string, DBPropertyInfo> ppt_name_to_info;
+
+  static const std::string kPeriodicCFStats;
+
+ private:
+  void DumpDBMapStats(std::map<std::string, std::string>* db_stats);
+  void DumpDBStats(std::string* value);
+
+  void DumpDBMapStatsWriteStall(std::map<std::string, std::string>* value);
+  void DumpDBStatsWriteStall(std::string* value);
+
+  void DumpCFMapStats(std::map<std::string, std::string>* cf_stats);
+  void DumpCFMapStats(
+      const VersionStorageInfo* vstorage,
+      std::map<int, std::map<LevelStatType, double>>* level_stats,
+      CompactionStats* compaction_stats_sum);
+  void DumpCFMapStatsByPriority(
+      std::map<int, std::map<LevelStatType, double>>* priorities_stats);
+  void DumpCFStats(std::string* value);
+  // if is_periodic = true, it is an internal call by RocksDB periodically to
+  // dump the status.
+  void DumpCFStatsNoFileHistogram(bool is_periodic, std::string* value);
+  // if is_periodic = true, it is an internal call by RocksDB periodically to
+  // dump the status.
+  void DumpCFFileHistogram(std::string* value);
+
+  void DumpCFMapStatsWriteStall(std::map<std::string, std::string>* value);
+  void DumpCFStatsWriteStall(std::string* value,
+                             uint64_t* total_stall_count = nullptr);
+
+  Cache* GetBlockCacheForStats();
+  Cache* GetBlobCacheForStats();
+
+  // Per-DB stats
+  std::atomic<uint64_t> db_stats_[kIntStatsNumMax];
+  // Per-ColumnFamily stats
+  uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX];
+  uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
+  // Initialize/reference the collector in constructor so that we don't need
+  // additional synchronization in InternalStats, relying on synchronization
+  // in CacheEntryStatsCollector::GetStats. This collector is pinned in cache
+  // (through a shared_ptr) so that it does not get immediately ejected from
+  // a full cache, which would force a re-scan on the next GetStats.
+  std::shared_ptr<CacheEntryStatsCollector<CacheEntryRoleStats>>
+      cache_entry_stats_collector_;
+  // Per-ColumnFamily/level compaction stats
+  std::vector<CompactionStats> comp_stats_;
+  std::vector<CompactionStats> comp_stats_by_pri_;
+  CompactionStats per_key_placement_comp_stats_;
+  std::vector<HistogramImpl> file_read_latency_;
+  HistogramImpl blob_file_read_latency_;
+  bool has_cf_change_since_dump_;
+  // How many periods of no change since the last time stats are dumped for
+  // a periodic dump.
+  int no_cf_change_period_since_dump_ = 0;
+  uint64_t last_histogram_num = std::numeric_limits<uint64_t>::max();
+  static const int kMaxNoChangePeriodSinceDump;
+
+  // Used to compute per-interval statistics
+  struct CFStatsSnapshot {
+    // ColumnFamily-level stats
+    CompactionStats comp_stats;
+    uint64_t ingest_bytes_flush;  // Bytes written to L0 (Flush)
+    uint64_t stall_count;         // Total counts of CF-scope write stalls
+    // Stats from compaction jobs - bytes written, bytes read, duration.
+    uint64_t compact_bytes_write;
+    uint64_t compact_bytes_read;
+    uint64_t compact_micros;
+    double seconds_up;
+
+    // AddFile specific stats
+    uint64_t ingest_bytes_addfile;     // Total Bytes ingested
+    uint64_t ingest_files_addfile;     // Total number of files ingested
+    uint64_t ingest_l0_files_addfile;  // Total number of files ingested to L0
+    uint64_t ingest_keys_addfile;      // Total number of keys ingested
+
+    CFStatsSnapshot()
+        : ingest_bytes_flush(0),
+          stall_count(0),
+          compact_bytes_write(0),
+          compact_bytes_read(0),
+          compact_micros(0),
+          seconds_up(0),
+          ingest_bytes_addfile(0),
+          ingest_files_addfile(0),
+          ingest_l0_files_addfile(0),
+          ingest_keys_addfile(0) {}
+
+    void Clear() {
+      comp_stats.Clear();
+      ingest_bytes_flush = 0;
+      stall_count = 0;
+      compact_bytes_write = 0;
+      compact_bytes_read = 0;
+      compact_micros = 0;
+      seconds_up = 0;
+      ingest_bytes_addfile = 0;
+      ingest_files_addfile = 0;
+      ingest_l0_files_addfile = 0;
+      ingest_keys_addfile = 0;
+    }
+  } cf_stats_snapshot_;
+
+  struct DBStatsSnapshot {
+    // DB-level stats
+    uint64_t ingest_bytes;    // Bytes written by user
+    uint64_t wal_bytes;       // Bytes written to WAL
+    uint64_t wal_synced;      // Number of times WAL is synced
+    uint64_t write_with_wal;  // Number of writes that request WAL
+    // These count the number of writes processed by the calling thread or
+    // another thread.
+    uint64_t write_other;
+    uint64_t write_self;
+    // Total number of keys written. write_self and write_other measure number
+    // of write requests written, Each of the write request can contain updates
+    // to multiple keys. num_keys_written is total number of keys updated by
+    // all those writes.
+    uint64_t num_keys_written;
+    // Total time writes delayed by stalls.
+    uint64_t write_stall_micros;
+    double seconds_up;
+
+    DBStatsSnapshot()
+        : ingest_bytes(0),
+          wal_bytes(0),
+          wal_synced(0),
+          write_with_wal(0),
+          write_other(0),
+          write_self(0),
+          num_keys_written(0),
+          write_stall_micros(0),
+          seconds_up(0) {}
+
+    void Clear() {
+      ingest_bytes = 0;
+      wal_bytes = 0;
+      wal_synced = 0;
+      write_with_wal = 0;
+      write_other = 0;
+      write_self = 0;
+      num_keys_written = 0;
+      write_stall_micros = 0;
+      seconds_up = 0;
+    }
+  } db_stats_snapshot_;
+
+  // Handler functions for getting property values. They use "value" as a
+  // value-result argument, and return true upon successfully setting "value".
+  bool HandleNumFilesAtLevel(std::string* value, Slice suffix);
+  bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix);
+  bool HandleLevelStats(std::string* value, Slice suffix);
+  bool HandleStats(std::string* value, Slice suffix);
+  bool HandleCFMapStats(std::map<std::string, std::string>* compaction_stats,
+                        Slice suffix);
+  bool HandleCFStats(std::string* value, Slice suffix);
+  bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix);
+  bool HandleCFFileHistogram(std::string* value, Slice suffix);
+  bool HandleCFStatsPeriodic(std::string* value, Slice suffix);
+  bool HandleCFWriteStallStats(std::string* value, Slice suffix);
+  bool HandleCFWriteStallStatsMap(std::map<std::string, std::string>* values,
+                                  Slice suffix);
+  bool HandleDBMapStats(std::map<std::string, std::string>* compaction_stats,
+                        Slice suffix);
+  bool HandleDBStats(std::string* value, Slice suffix);
+  bool HandleDBWriteStallStats(std::string* value, Slice suffix);
+  bool HandleDBWriteStallStatsMap(std::map<std::string, std::string>* values,
+                                  Slice suffix);
+  bool HandleSsTables(std::string* value, Slice suffix);
+  bool HandleAggregatedTableProperties(std::string* value, Slice suffix);
+  bool HandleAggregatedTablePropertiesAtLevel(std::string* value,
+                                              Slice suffix);
+  bool HandleAggregatedTablePropertiesMap(
+      std::map<std::string, std::string>* values, Slice suffix);
+  bool HandleAggregatedTablePropertiesAtLevelMap(
+      std::map<std::string, std::string>* values, Slice suffix);
+  bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db,
+                                  Version* version);
+  bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db,
+                                         Version* version);
+  bool HandleMemTableFlushPending(uint64_t* value, DBImpl* db,
+                                  Version* version);
+  bool HandleNumRunningFlushes(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleCompactionPending(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleBackgroundErrors(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db,
+                                 Version* version);
+  bool HandleSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db,
+                                      Version* version);
+  bool HandleNumEntriesImmMemTables(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db,
+                                      Version* version);
+  bool HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleEstimateNumKeys(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleNumSnapshots(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleNumLiveVersions(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db,
+                                       Version* version);
+  bool HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleBaseLevel(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleLiveSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* db,
+                                            Version* version);
+  bool HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db,
+                                     Version* version);
+  bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db,
+                                  Version* version);
+  bool HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleMinObsoleteSstNumberToKeep(uint64_t* value, DBImpl* db,
+                                        Version* version);
+  bool HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleIsWriteStopped(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleBlockCacheCapacity(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleBlockCacheEntryStatsInternal(std::string* value, bool fast);
+  bool HandleBlockCacheEntryStatsMapInternal(
+      std::map<std::string, std::string>* values, bool fast);
+  bool HandleBlockCacheEntryStats(std::string* value, Slice suffix);
+  bool HandleBlockCacheEntryStatsMap(
+      std::map<std::string, std::string>* values, Slice suffix);
+  bool HandleFastBlockCacheEntryStats(std::string* value, Slice suffix);
+  bool HandleFastBlockCacheEntryStatsMap(
+      std::map<std::string, std::string>* values, Slice suffix);
+  bool HandleLiveSstFilesSizeAtTemperature(std::string* value, Slice suffix);
+  bool HandleNumBlobFiles(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlobStats(std::string* value, Slice suffix);
+  bool HandleTotalBlobFileSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleLiveBlobFileSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleLiveBlobFileGarbageSize(uint64_t* value, DBImpl* db,
+                                     Version* version);
+  bool HandleBlobCacheCapacity(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlobCacheUsage(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlobCachePinnedUsage(uint64_t* value, DBImpl* db,
+                                  Version* version);
+
+  // Total number of background errors encountered. Every time a flush task
+  // or compaction task fails, this counter is incremented. The failure can
+  // be caused by any possible reason, including file system errors, out of
+  // resources, or input file corruption. Failing when retrying the same flush
+  // or compaction will cause the counter to increase too.
+  uint64_t bg_error_count_;
+
+  const int number_levels_;
+  SystemClock* clock_;
+  ColumnFamilyData* cfd_;
+  uint64_t started_at_;
+};
+
+
+}  // namespace ROCKSDB_NAMESPACE
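For orientation, the DBPropertyInfo table above is consulted roughly as follows when a property is read. This is a simplified sketch, not the actual DBImpl code; GetIntPropertySketch is a hypothetical free function:

    bool GetIntPropertySketch(InternalStats* stats, DBImpl* db,
                              const Slice& property, uint64_t* out) {
      const DBPropertyInfo* info = GetPropertyInfo(property);
      if (info == nullptr || info->handle_int == nullptr) {
        return false;  // unknown name, or not an integer-valued property
      }
      // Exactly one of the handler members is populated per property;
      // integer handlers take the DB and (optionally) a pinned Version.
      return (stats->*(info->handle_int))(out, db, /*version=*/nullptr);
    }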
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/job_context.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/job_context.h
new file mode 100644
index 0000000000..a550ba2d46
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/job_context.h
@@ -0,0 +1,237 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/log_writer.h"
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+struct SuperVersion;
+
+struct SuperVersionContext {
+  struct WriteStallNotification {
+    WriteStallInfo write_stall_info;
+    const ImmutableOptions* immutable_options;
+  };
+
+  autovector<SuperVersion*> superversions_to_free;
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+  autovector<WriteStallNotification> write_stall_notifications;
+#endif
+  std::unique_ptr<SuperVersion>
+      new_superversion;  // if nullptr no new superversion
+
+  explicit SuperVersionContext(bool create_superversion = false)
+      : new_superversion(create_superversion ? new SuperVersion() : nullptr) {}
+
+  explicit SuperVersionContext(SuperVersionContext&& other) noexcept
+      : superversions_to_free(std::move(other.superversions_to_free)),
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+        write_stall_notifications(std::move(other.write_stall_notifications)),
+#endif
+        new_superversion(std::move(other.new_superversion)) {
+  }
+  // No copies
+  SuperVersionContext(const SuperVersionContext& other) = delete;
+  void operator=(const SuperVersionContext& other) = delete;
+
+  void NewSuperVersion() {
+    new_superversion = std::unique_ptr<SuperVersion>(new SuperVersion());
+  }
+
+  inline bool HaveSomethingToDelete() const {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+    return !superversions_to_free.empty() ||
+           !write_stall_notifications.empty();
+#else
+    return !superversions_to_free.empty();
+#endif
+  }
+
+  void PushWriteStallNotification(WriteStallCondition old_cond,
+                                  WriteStallCondition new_cond,
+                                  const std::string& name,
+                                  const ImmutableOptions* ioptions) {
+#if !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+    WriteStallNotification notif;
+    notif.write_stall_info.cf_name = name;
+    notif.write_stall_info.condition.prev = old_cond;
+    notif.write_stall_info.condition.cur = new_cond;
+    notif.immutable_options = ioptions;
+    write_stall_notifications.push_back(notif);
+#else
+    (void)old_cond;
+    (void)new_cond;
+    (void)name;
+    (void)ioptions;
+#endif  // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+  }
+
+  void Clean() {
+#if !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+    // notify listeners on changed write stall conditions
+    for (auto& notif : write_stall_notifications) {
+      for (auto& listener : notif.immutable_options->listeners) {
+        listener->OnStallConditionsChanged(notif.write_stall_info);
+      }
+    }
+    write_stall_notifications.clear();
+#endif
+    // free superversions
+    for (auto s : superversions_to_free) {
+      delete s;
+    }
+    superversions_to_free.clear();
+  }
+
+  ~SuperVersionContext() {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+    assert(write_stall_notifications.empty());
+#endif
+    assert(superversions_to_free.empty());
+  }
+};
+
+struct JobContext {
+  inline bool HaveSomethingToDelete() const {
+    return !(full_scan_candidate_files.empty() && sst_delete_files.empty() &&
+             blob_delete_files.empty() && log_delete_files.empty() &&
+             manifest_delete_files.empty());
+  }
+
+  inline bool HaveSomethingToClean() const {
+    bool sv_have_sth = false;
+    for (const auto& sv_ctx : superversion_contexts) {
+      if (sv_ctx.HaveSomethingToDelete()) {
+        sv_have_sth = true;
+        break;
+      }
+    }
+    return memtables_to_free.size() > 0 || logs_to_free.size() > 0 ||
+           job_snapshot != nullptr || sv_have_sth;
+  }
+
+  SequenceNumber GetJobSnapshotSequence() const {
+    if (job_snapshot) {
+      assert(job_snapshot->snapshot());
+      return job_snapshot->snapshot()->GetSequenceNumber();
+    }
+    return kMaxSequenceNumber;
+  }
+
+  // Structure to store information for candidate files to delete.
+  struct CandidateFileInfo {
+    std::string file_name;
+    std::string file_path;
+    CandidateFileInfo(std::string name, std::string path)
+        : file_name(std::move(name)), file_path(std::move(path)) {}
+    bool operator==(const CandidateFileInfo& other) const {
+      return file_name == other.file_name && file_path == other.file_path;
+    }
+  };
+
+  // Unique job id
+  int job_id;
+
+  // a list of all files that we'll consider deleting
+  // (every once in a while this is filled up with all files
+  // in the DB directory)
+  // (filled only if we're doing full scan)
+  std::vector<std::string> full_scan_candidate_files;
+
+  // the list of all live sst files that cannot be deleted
+  std::vector<uint64_t> sst_live;
+
+  // the list of sst files that we need to delete
+  std::vector<ObsoleteFileInfo> sst_delete_files;
+
+  // the list of all live blob files that cannot be deleted
+  std::vector<uint64_t> blob_live;
+
+  // the list of blob files that we need to delete
+  std::vector<ObsoleteBlobFileInfo> blob_delete_files;
+
+  // a list of log files that we need to delete
+  std::vector<uint64_t> log_delete_files;
+
+  // a list of log files that we need to preserve during full purge since they
+  // will be reused later
+  std::vector<uint64_t> log_recycle_files;
+
+  // a list of manifest files that we need to delete
+  std::vector<std::string> manifest_delete_files;
+
+  // a list of memtables to be freed
+  autovector<MemTable*> memtables_to_free;
+
+  // contexts for installing superversions for multiple column families
+  std::vector<SuperVersionContext> superversion_contexts;
+
+  autovector<log::Writer*> logs_to_free;
+
+  // the current manifest_file_number, log_number and prev_log_number
+  // that corresponds to the set of files in 'live'.
+  uint64_t manifest_file_number;
+  uint64_t pending_manifest_file_number;
+  uint64_t log_number;
+  uint64_t prev_log_number;
+
+  uint64_t min_pending_output = 0;
+  uint64_t prev_total_log_size = 0;
+  size_t num_alive_log_files = 0;
+  uint64_t size_log_to_delete = 0;
+
+  // Snapshot taken before flush/compaction job.
+  std::unique_ptr<ManagedSnapshot> job_snapshot;
+
+  explicit JobContext(int _job_id, bool create_superversion = false) {
+    job_id = _job_id;
+    manifest_file_number = 0;
+    pending_manifest_file_number = 0;
+    log_number = 0;
+    prev_log_number = 0;
+    superversion_contexts.emplace_back(
+        SuperVersionContext(create_superversion));
+  }
+
+  // For non-empty JobContext Clean() has to be called at least once
+  // before destruction (see asserts in ~JobContext()). Should be called with
+  // unlocked DB mutex. Destructor doesn't call Clean() to avoid accidentally
+  // doing potentially slow Clean() with locked DB mutex.
+  void Clean() {
+    // free superversions
+    for (auto& sv_context : superversion_contexts) {
+      sv_context.Clean();
+    }
+    // free pending memtables
+    for (auto m : memtables_to_free) {
+      delete m;
+    }
+    for (auto l : logs_to_free) {
+      delete l;
+    }
+
+    memtables_to_free.clear();
+    logs_to_free.clear();
+    job_snapshot.reset();
+  }
+
+  ~JobContext() {
+    assert(memtables_to_free.size() == 0);
+    assert(logs_to_free.size() == 0);
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
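A sketch of the lifecycle contract encoded in JobContext's asserts (hypothetical caller; the real flush/compaction paths in DBImpl follow the same shape):

    void BackgroundJobSketch() {
      JobContext job_context(/*_job_id=*/1, /*create_superversion=*/true);
      // ... run the job; FindObsoleteFiles() and friends fill the vectors ...
      if (job_context.HaveSomethingToDelete()) {
        // purge obsolete files here, with the DB mutex released
      }
      job_context.Clean();  // mandatory before ~JobContext() for non-empty
                            // contexts, otherwise its asserts fire
    }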
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/kv_checksum.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/kv_checksum.h
new file mode 100644
index 0000000000..53c02485ff
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/kv_checksum.h
@@ -0,0 +1,484 @@
+// Copyright (c) 2020-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file contains classes containing fields to protect individual entries.
+// The classes are named "ProtectionInfo<suffix>", where <suffix> indicates the
+// combination of fields that are covered. Each field has a single letter
+// abbreviation as follows.
+//
+// K = key
+// V = value
+// O = optype aka value type
+// S = seqno
+// C = CF ID
+//
+// Then, for example, a class that protects an entry consisting of key, value,
+// optype, and CF ID (i.e., a `WriteBatch` entry) would be named
+// `ProtectionInfoKVOC`.
+//
+// The `ProtectionInfo.*` classes are templated on the integer type used to
+// hold the XOR of hashes for each field. Only unsigned integer types are
+// supported, and the maximum supported integer width is 64 bits. When the
+// integer type is narrower than the hash values, we lop off the most
+// significant bits to make them fit.
+//
+// The `ProtectionInfo.*` classes are all intended to be non-persistent. We do
+// not currently make the byte order consistent for integer fields before
+// hashing them, so the resulting values are endianness-dependent.
+
+#pragma once
+
+#include <array>
+
+#include "db/dbformat.h"
+#include "rocksdb/types.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <typename T>
+class ProtectionInfo;
+template <typename T>
+class ProtectionInfoKVO;
+template <typename T>
+class ProtectionInfoKVOC;
+template <typename T>
+class ProtectionInfoKVOS;
+template <typename T>
+class ProtectionInfoKV;
+
+// Aliases for 64-bit protection infos.
+using ProtectionInfo64 = ProtectionInfo<uint64_t>;
+using ProtectionInfoKVO64 = ProtectionInfoKVO<uint64_t>;
+using ProtectionInfoKVOC64 = ProtectionInfoKVOC<uint64_t>;
+using ProtectionInfoKVOS64 = ProtectionInfoKVOS<uint64_t>;
+
+template <typename T>
+class ProtectionInfo {
+ public:
+  ProtectionInfo() = default;
+
+  Status GetStatus() const;
+  ProtectionInfoKVO<T> ProtectKVO(const Slice& key, const Slice& value,
+                                  ValueType op_type) const;
+  ProtectionInfoKVO<T> ProtectKVO(const SliceParts& key,
+                                  const SliceParts& value,
+                                  ValueType op_type) const;
+  ProtectionInfoKV<T> ProtectKV(const Slice& key, const Slice& value) const;
+
+ private:
+  friend class ProtectionInfoKVO<T>;
+  friend class ProtectionInfoKVOS<T>;
+  friend class ProtectionInfoKVOC<T>;
+  friend class ProtectionInfoKV<T>;
+
+  // Each field is hashed with an independent value so we can catch fields
+  // being swapped. Per the `NPHash64()` docs, using consecutive seeds is a
+  // pitfall, and we should instead vary our seeds by a large odd number.
+  // This value by which we increment (0xD28AAD72F49BD50B) was taken from
+  // `head -c8 /dev/urandom | hexdump`, run repeatedly until it yielded an
+  // odd number. The values are computed manually since the Windows C++
+  // compiler complains about the overflow when adding constants.
+  static const uint64_t kSeedK = 0;
+  static const uint64_t kSeedV = 0xD28AAD72F49BD50B;
+  static const uint64_t kSeedO = 0xA5155AE5E937AA16;
+  static const uint64_t kSeedS = 0x77A00858DDD37F21;
+  static const uint64_t kSeedC = 0x4A2AB5CBD26F542C;
+
+  ProtectionInfo(T val) : val_(val) {
+    static_assert(sizeof(ProtectionInfo<T>) == sizeof(T), "");
+  }
+
+  T GetVal() const { return val_; }
+  void SetVal(T val) { val_ = val; }
+
+  void Encode(uint8_t len, char* dst) const {
+    assert(sizeof(val_) >= len);
+    switch (len) {
+      case 1:
+        dst[0] = static_cast<uint8_t>(val_);
+        break;
+      case 2:
+        EncodeFixed16(dst, static_cast<uint16_t>(val_));
+        break;
+      case 4:
+        EncodeFixed32(dst, static_cast<uint32_t>(val_));
+        break;
+      case 8:
+        EncodeFixed64(dst, static_cast<uint64_t>(val_));
+        break;
+      default:
+        assert(false);
+    }
+  }
+
+  bool Verify(uint8_t len, const char* checksum_ptr) const {
+    assert(sizeof(val_) >= len);
+    switch (len) {
+      case 1:
+        return static_cast<uint8_t>(checksum_ptr[0]) ==
+               static_cast<uint8_t>(val_);
+      case 2:
+        return DecodeFixed16(checksum_ptr) == static_cast<uint16_t>(val_);
+      case 4:
+        return DecodeFixed32(checksum_ptr) == static_cast<uint32_t>(val_);
+      case 8:
+        return DecodeFixed64(checksum_ptr) == static_cast<uint64_t>(val_);
+      default:
+        assert(false);
+        return false;
+    }
+  }
+
+  T val_ = 0;
+};
+
+template <typename T>
+class ProtectionInfoKVO {
+ public:
+  ProtectionInfoKVO() = default;
+
+  ProtectionInfo<T> StripKVO(const Slice& key, const Slice& value,
+                             ValueType op_type) const;
+  ProtectionInfo<T> StripKVO(const SliceParts& key, const SliceParts& value,
+                             ValueType op_type) const;
+
+  ProtectionInfoKVOC<T> ProtectC(ColumnFamilyId column_family_id) const;
+  ProtectionInfoKVOS<T> ProtectS(SequenceNumber sequence_number) const;
+
+  void UpdateK(const Slice& old_key, const Slice& new_key);
+  void UpdateK(const SliceParts& old_key, const SliceParts& new_key);
+  void UpdateV(const Slice& old_value, const Slice& new_value);
+  void UpdateV(const SliceParts& old_value, const SliceParts& new_value);
+  void UpdateO(ValueType old_op_type, ValueType new_op_type);
+
+  // Encode this protection info into `len` bytes and stores them in `dst`.
+  void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); }
+  // Verify this protection info against the protection info encoded by
+  // Encode() at the first `len` bytes of `checksum_ptr`.
+  // Returns true iff the verification is successful.
+  bool Verify(uint8_t len, const char* checksum_ptr) const {
+    return info_.Verify(len, checksum_ptr);
+  }
+
+ private:
+  friend class ProtectionInfo<T>;
+  friend class ProtectionInfoKVOS<T>;
+  friend class ProtectionInfoKVOC<T>;
+
+  explicit ProtectionInfoKVO(T val) : info_(val) {
+    static_assert(sizeof(ProtectionInfoKVO<T>) == sizeof(T), "");
+  }
+
+  T GetVal() const { return info_.GetVal(); }
+  void SetVal(T val) { info_.SetVal(val); }
+
+  ProtectionInfo<T> info_;
+};
+
+template <typename T>
+class ProtectionInfoKVOC {
+ public:
+  ProtectionInfoKVOC() = default;
+
+  ProtectionInfoKVO<T> StripC(ColumnFamilyId column_family_id) const;
+
+  void UpdateK(const Slice& old_key, const Slice& new_key) {
+    kvo_.UpdateK(old_key, new_key);
+  }
+  void UpdateK(const SliceParts& old_key, const SliceParts& new_key) {
+    kvo_.UpdateK(old_key, new_key);
+  }
+  void UpdateV(const Slice& old_value, const Slice& new_value) {
+    kvo_.UpdateV(old_value, new_value);
+  }
+  void UpdateV(const SliceParts& old_value, const SliceParts& new_value) {
+    kvo_.UpdateV(old_value, new_value);
+  }
+  void UpdateO(ValueType old_op_type, ValueType new_op_type) {
+    kvo_.UpdateO(old_op_type, new_op_type);
+  }
+  void UpdateC(ColumnFamilyId old_column_family_id,
+               ColumnFamilyId new_column_family_id);
+
+  void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); }
+  bool Verify(uint8_t len, const char* checksum_ptr) const {
+    return kvo_.Verify(len, checksum_ptr);
+  }
+
+ private:
+  friend class ProtectionInfoKVO<T>;
+
+  explicit ProtectionInfoKVOC(T val) : kvo_(val) {
+    static_assert(sizeof(ProtectionInfoKVOC<T>) == sizeof(T), "");
+  }
+
+  T GetVal() const { return kvo_.GetVal(); }
+  void SetVal(T val) { kvo_.SetVal(val); }
+
+  ProtectionInfoKVO<T> kvo_;
+};
+
+template <typename T>
+class ProtectionInfoKVOS {
+ public:
+  ProtectionInfoKVOS() = default;
+
+  ProtectionInfoKVO<T> StripS(SequenceNumber sequence_number) const;
+
+  void UpdateK(const Slice& old_key, const Slice& new_key) {
+    kvo_.UpdateK(old_key, new_key);
+  }
+  void UpdateK(const SliceParts& old_key, const SliceParts& new_key) {
+    kvo_.UpdateK(old_key, new_key);
+  }
+  void UpdateV(const Slice& old_value, const Slice& new_value) {
+    kvo_.UpdateV(old_value, new_value);
+  }
+  void UpdateV(const SliceParts& old_value, const SliceParts& new_value) {
+    kvo_.UpdateV(old_value, new_value);
+  }
+  void UpdateO(ValueType old_op_type, ValueType new_op_type) {
+    kvo_.UpdateO(old_op_type, new_op_type);
+  }
+  void UpdateS(SequenceNumber old_sequence_number,
+               SequenceNumber new_sequence_number);
+
+  void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); }
+  bool Verify(uint8_t len, const char* checksum_ptr) const {
+    return kvo_.Verify(len, checksum_ptr);
+  }
+
+ private:
+  friend class ProtectionInfoKVO<T>;
+
+  explicit ProtectionInfoKVOS(T val) : kvo_(val) {
+    static_assert(sizeof(ProtectionInfoKVOS<T>) == sizeof(T), "");
+  }
+
+  T GetVal() const { return kvo_.GetVal(); }
+  void SetVal(T val) { kvo_.SetVal(val); }
+
+  ProtectionInfoKVO<T> kvo_;
+};
+
+template <typename T>
+class ProtectionInfoKV {
+ public:
+  ProtectionInfoKV() = default;
+
+  void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); }
+  bool Verify(uint8_t len, const char* checksum_ptr) const {
+    return info_.Verify(len, checksum_ptr);
+  }
+
+ private:
+  friend class ProtectionInfo<T>;
+
+  explicit ProtectionInfoKV(T val) : info_(val) {
+    static_assert(sizeof(ProtectionInfoKV<T>) == sizeof(T));
+  }
+
+  ProtectionInfo<T> info_;
+};
+
+template <typename T>
+Status ProtectionInfo<T>::GetStatus() const {
+  if (val_ != 0) {
+    return Status::Corruption("ProtectionInfo mismatch");
+  }
+  return Status::OK();
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfo<T>::ProtectKVO(const Slice& key,
+                                                   const Slice& value,
+                                                   ValueType op_type) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo<T>::kSeedK));
+  val =
+      val ^ static_cast<T>(GetSliceNPHash64(value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(NPHash64(reinterpret_cast<const char*>(&op_type),
+                                sizeof(op_type), ProtectionInfo<T>::kSeedO));
+  return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfo<T>::ProtectKVO(const SliceParts& key,
+                                                   const SliceParts& value,
+                                                   ValueType op_type) const {
+  T val = GetVal();
+  val = val ^
+        static_cast<T>(GetSlicePartsNPHash64(key, ProtectionInfo<T>::kSeedK));
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(NPHash64(reinterpret_cast<const char*>(&op_type),
+                                sizeof(op_type), ProtectionInfo<T>::kSeedO));
+  return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKV<T> ProtectionInfo<T>::ProtectKV(const Slice& key,
+                                                 const Slice& value) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo<T>::kSeedK));
+  val =
+      val ^ static_cast<T>(GetSliceNPHash64(value, ProtectionInfo<T>::kSeedV));
+  return ProtectionInfoKV<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateK(const Slice& old_key,
+                                   const Slice& new_key) {
+  T val = GetVal();
+  val = val ^
+        static_cast<T>(GetSliceNPHash64(old_key, ProtectionInfo<T>::kSeedK));
+  val = val ^
+        static_cast<T>(GetSliceNPHash64(new_key, ProtectionInfo<T>::kSeedK));
+  SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateK(const SliceParts& old_key,
+                                   const SliceParts& new_key) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(old_key, ProtectionInfo<T>::kSeedK));
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(new_key, ProtectionInfo<T>::kSeedK));
+  SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateV(const Slice& old_value,
+                                   const Slice& new_value) {
+  T val = GetVal();
+  val = val ^
+        static_cast<T>(GetSliceNPHash64(old_value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(GetSliceNPHash64(new_value, ProtectionInfo<T>::kSeedV));
+  SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateV(const SliceParts& old_value,
+                                   const SliceParts& new_value) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(old_value, ProtectionInfo<T>::kSeedV));
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(new_value, ProtectionInfo<T>::kSeedV));
+  SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateO(ValueType old_op_type,
+                                   ValueType new_op_type) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<const char*>(&old_op_type),
+                  sizeof(old_op_type), ProtectionInfo<T>::kSeedO));
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<const char*>(&new_op_type),
+                  sizeof(new_op_type), ProtectionInfo<T>::kSeedO));
+  SetVal(val);
+}
+
+template <typename T>
+ProtectionInfo<T> ProtectionInfoKVO<T>::StripKVO(const Slice& key,
+                                                 const Slice& value,
+                                                 ValueType op_type) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo<T>::kSeedK));
+  val =
+      val ^ static_cast<T>(GetSliceNPHash64(value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(NPHash64(reinterpret_cast<const char*>(&op_type),
+                                sizeof(op_type), ProtectionInfo<T>::kSeedO));
+  return ProtectionInfo<T>(val);
+}
+
+template <typename T>
+ProtectionInfo<T> ProtectionInfoKVO<T>::StripKVO(const SliceParts& key,
+                                                 const SliceParts& value,
+                                                 ValueType op_type) const {
+  T val = GetVal();
+  val = val ^
+        static_cast<T>(GetSlicePartsNPHash64(key, ProtectionInfo<T>::kSeedK));
+  val = val ^
+        static_cast<T>(GetSlicePartsNPHash64(value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(NPHash64(reinterpret_cast<const char*>(&op_type),
+                                sizeof(op_type), ProtectionInfo<T>::kSeedO));
+  return ProtectionInfo<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVOC<T> ProtectionInfoKVO<T>::ProtectC(
+    ColumnFamilyId column_family_id) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<const char*>(&column_family_id),
+                  sizeof(column_family_id), ProtectionInfo<T>::kSeedC));
+  return ProtectionInfoKVOC<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfoKVOC<T>::StripC(
+    ColumnFamilyId column_family_id) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<const char*>(&column_family_id),
+                  sizeof(column_family_id), ProtectionInfo<T>::kSeedC));
+  return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVOC<T>::UpdateC(ColumnFamilyId old_column_family_id,
+                                    ColumnFamilyId new_column_family_id) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<const char*>(&old_column_family_id),
+                  sizeof(old_column_family_id), ProtectionInfo<T>::kSeedC));
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<const char*>(&new_column_family_id),
+                  sizeof(new_column_family_id), ProtectionInfo<T>::kSeedC));
+  SetVal(val);
+}
+
+template <typename T>
+ProtectionInfoKVOS<T> ProtectionInfoKVO<T>::ProtectS(
+    SequenceNumber sequence_number) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<const char*>(&sequence_number),
+                  sizeof(sequence_number), ProtectionInfo<T>::kSeedS));
+  return ProtectionInfoKVOS<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfoKVOS<T>::StripS(
+    SequenceNumber sequence_number) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<const char*>(&sequence_number),
+                  sizeof(sequence_number), ProtectionInfo<T>::kSeedS));
+  return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVOS<T>::UpdateS(SequenceNumber old_sequence_number,
+                                    SequenceNumber new_sequence_number) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<const char*>(&old_sequence_number),
+                  sizeof(old_sequence_number), ProtectionInfo<T>::kSeedS));
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<const char*>(&new_sequence_number),
+                  sizeof(new_sequence_number), ProtectionInfo<T>::kSeedS));
+  SetVal(val);
+}
+}  // namespace ROCKSDB_NAMESPACE
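A sketch of the intended protect/strip round trip (illustrative; CheckEntrySketch is not part of the file): XOR-ing the same field hashes back out returns the accumulator to zero, so GetStatus() reports corruption only if a covered field changed in between.

    Status CheckEntrySketch(const Slice& key, const Slice& value) {
      ProtectionInfoKVO64 kvo =
          ProtectionInfo64().ProtectKVO(key, value, kTypeValue);
      // ... the entry travels through a WriteBatch / memtable ...
      return kvo.StripKVO(key, value, kTypeValue).GetStatus();
    }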
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_format.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_format.h
new file mode 100644
index 0000000000..a976b3f9ea
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_format.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+enum RecordType {
+  // Zero is reserved for preallocated files
+  kZeroType = 0,
+  kFullType = 1,
+
+  // For fragments
+  kFirstType = 2,
+  kMiddleType = 3,
+  kLastType = 4,
+
+  // For recycled log files
+  kRecyclableFullType = 5,
+  kRecyclableFirstType = 6,
+  kRecyclableMiddleType = 7,
+  kRecyclableLastType = 8,
+
+  // Compression Type
+  kSetCompressionType = 9,
+
+  // User-defined timestamp sizes
+  kUserDefinedTimestampSizeType = 10,
+  kRecyclableUserDefinedTimestampSizeType = 11,
+};
+constexpr int kMaxRecordType = kRecyclableUserDefinedTimestampSizeType;
+
+constexpr unsigned int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), length (2 bytes), type (1 byte)
+constexpr int kHeaderSize = 4 + 2 + 1;
+
+// Recyclable header is checksum (4 bytes), length (2 bytes), type (1 byte),
+// log number (4 bytes).
+constexpr int kRecyclableHeaderSize = 4 + 2 + 1 + 4;
+
+}  // namespace log
+}  // namespace ROCKSDB_NAMESPACE
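These constants imply the writer's fragmentation arithmetic: every fragment, including its 7-byte header, must fit in the remainder of the current 32 KiB block. A sketch (hypothetical helper; non-recyclable format assumed):

    size_t FragmentPayloadSketch(size_t block_offset) {
      const size_t leftover = log::kBlockSize - block_offset;
      if (leftover < static_cast<size_t>(log::kHeaderSize)) {
        return 0;  // too small for even a header; the writer pads to block end
      }
      return leftover - log::kHeaderSize;  // payload available in this block
    }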
+// +// TODO krad: Evaluate if we need to move to a more strict mode where we +// restrict the inconsistency to only the last log +bool Reader::ReadRecord(Slice* record, std::string* scratch, + WALRecoveryMode wal_recovery_mode, + uint64_t* record_checksum) { + scratch->clear(); + record->clear(); + if (record_checksum != nullptr) { + if (hash_state_ == nullptr) { + hash_state_ = XXH3_createState(); + } + XXH3_64bits_reset(hash_state_); + } + if (uncompress_) { + uncompress_->Reset(); + } + bool in_fragmented_record = false; + // Record offset of the logical record that we're reading + // 0 is a dummy value to make compilers happy + uint64_t prospective_record_offset = 0; + + Slice fragment; + while (true) { + uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); + size_t drop_size = 0; + const unsigned int record_type = + ReadPhysicalRecord(&fragment, &drop_size, record_checksum); + switch (record_type) { + case kFullType: + case kRecyclableFullType: + if (in_fragmented_record && !scratch->empty()) { + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at the tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. + ReportCorruption(scratch->size(), "partial record without end(1)"); + } + // No need to compute record_checksum since the record + // consists of a single fragment and the checksum is computed + // in ReadPhysicalRecord() if WAL compression is enabled + if (record_checksum != nullptr && uncompress_ == nullptr) { + // No need to stream since the record is a single fragment + *record_checksum = XXH3_64bits(fragment.data(), fragment.size()); + } + prospective_record_offset = physical_record_offset; + scratch->clear(); + *record = fragment; + last_record_offset_ = prospective_record_offset; + first_record_read_ = true; + return true; + + case kFirstType: + case kRecyclableFirstType: + if (in_fragmented_record && !scratch->empty()) { + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at the tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. 
+ ReportCorruption(scratch->size(), "partial record without end(2)"); + XXH3_64bits_reset(hash_state_); + } + if (record_checksum != nullptr) { + XXH3_64bits_update(hash_state_, fragment.data(), fragment.size()); + } + prospective_record_offset = physical_record_offset; + scratch->assign(fragment.data(), fragment.size()); + in_fragmented_record = true; + break; + + case kMiddleType: + case kRecyclableMiddleType: + if (!in_fragmented_record) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(1)"); + } else { + if (record_checksum != nullptr) { + XXH3_64bits_update(hash_state_, fragment.data(), fragment.size()); + } + scratch->append(fragment.data(), fragment.size()); + } + break; + + case kLastType: + case kRecyclableLastType: + if (!in_fragmented_record) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(2)"); + } else { + if (record_checksum != nullptr) { + XXH3_64bits_update(hash_state_, fragment.data(), fragment.size()); + *record_checksum = XXH3_64bits_digest(hash_state_); + } + scratch->append(fragment.data(), fragment.size()); + *record = Slice(*scratch); + last_record_offset_ = prospective_record_offset; + first_record_read_ = true; + return true; + } + break; + + case kSetCompressionType: { + if (compression_type_record_read_) { + ReportCorruption(fragment.size(), + "read multiple SetCompressionType records"); + } + if (first_record_read_) { + ReportCorruption(fragment.size(), + "SetCompressionType not the first record"); + } + prospective_record_offset = physical_record_offset; + scratch->clear(); + last_record_offset_ = prospective_record_offset; + CompressionTypeRecord compression_record(kNoCompression); + Status s = compression_record.DecodeFrom(&fragment); + if (!s.ok()) { + ReportCorruption(fragment.size(), + "could not decode SetCompressionType record"); + } else { + InitCompression(compression_record); + } + break; + } + case kUserDefinedTimestampSizeType: + case kRecyclableUserDefinedTimestampSizeType: { + if (in_fragmented_record && !scratch->empty()) { + ReportCorruption( + scratch->size(), + "user-defined timestamp size record interspersed partial record"); + } + prospective_record_offset = physical_record_offset; + scratch->clear(); + last_record_offset_ = prospective_record_offset; + UserDefinedTimestampSizeRecord ts_record; + Status s = ts_record.DecodeFrom(&fragment); + if (!s.ok()) { + ReportCorruption( + fragment.size(), + "could not decode user-defined timestamp size record"); + } else { + s = UpdateRecordedTimestampSize( + ts_record.GetUserDefinedTimestampSize()); + if (!s.ok()) { + ReportCorruption(fragment.size(), s.getState()); + } + } + break; + } + + case kBadHeader: + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. Report an error here, which + // higher layers can choose to ignore when it's provable there is no + // hole. + ReportCorruption(drop_size, "truncated header"); + } + FALLTHROUGH_INTENDED; + + case kEof: + if (in_fragmented_record) { + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. 
Report an error here, which + // higher layers can choose to ignore when it's provable there is no + // hole. + ReportCorruption(scratch->size(), "error reading trailing data"); + } + // This can be caused by the writer dying immediately after + // writing a physical record but before completing the next; don't + // treat it as a corruption, just ignore the entire logical record. + scratch->clear(); + } + return false; + + case kOldRecord: + if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) { + // Treat a record from a previous instance of the log as EOF. + if (in_fragmented_record) { + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. Report an error here, + // which higher layers can choose to ignore when it's provable + // there is no hole. + ReportCorruption(scratch->size(), "error reading trailing data"); + } + // This can be caused by the writer dying immediately after + // writing a physical record but before completing the next; don't + // treat it as a corruption, just ignore the entire logical record. + scratch->clear(); + } + return false; + } + FALLTHROUGH_INTENDED; + + case kBadRecord: + if (in_fragmented_record) { + ReportCorruption(scratch->size(), "error in middle of record"); + in_fragmented_record = false; + scratch->clear(); + } + break; + + case kBadRecordLen: + if (eof_) { + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. Report an error here, which + // higher layers can choose to ignore when it's provable there is no + // hole. + ReportCorruption(drop_size, "truncated record body"); + } + return false; + } + FALLTHROUGH_INTENDED; + + case kBadRecordChecksum: + if (recycled_ && wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords) { + scratch->clear(); + return false; + } + if (record_type == kBadRecordLen) { + ReportCorruption(drop_size, "bad record length"); + } else { + ReportCorruption(drop_size, "checksum mismatch"); + } + if (in_fragmented_record) { + ReportCorruption(scratch->size(), "error in middle of record"); + in_fragmented_record = false; + scratch->clear(); + } + break; + + default: { + char buf[40]; + snprintf(buf, sizeof(buf), "unknown record type %u", record_type); + ReportCorruption( + (fragment.size() + (in_fragmented_record ? scratch->size() : 0)), + buf); + in_fragmented_record = false; + scratch->clear(); + break; + } + } + } + return false; +} + +uint64_t Reader::LastRecordOffset() { return last_record_offset_; } + +uint64_t Reader::LastRecordEnd() { + return end_of_buffer_offset_ - buffer_.size(); +} + +void Reader::UnmarkEOF() { + if (read_error_) { + return; + } + eof_ = false; + if (eof_offset_ == 0) { + return; + } + UnmarkEOFInternal(); +} + +void Reader::UnmarkEOFInternal() { + // If the EOF was in the middle of a block (a partial block was read) we have + // to read the rest of the block as ReadPhysicalRecord can only read full + // blocks and expects the file position indicator to be aligned to the start + // of a block. 
+  //
+  //    consumed_bytes + buffer_size() + remaining == kBlockSize
+
+  size_t consumed_bytes = eof_offset_ - buffer_.size();
+  size_t remaining = kBlockSize - eof_offset_;
+
+  // backing_store_ is used to concatenate what is left in buffer_ and
+  // the remainder of the block. If buffer_ already uses backing_store_,
+  // we just append the new data.
+  if (buffer_.data() != backing_store_ + consumed_bytes) {
+    // Buffer_ does not use backing_store_ for storage.
+    // Copy what is left in buffer_ to backing_store.
+    memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size());
+  }
+
+  Slice read_buffer;
+  // TODO: rate limit log reader with appropriate priority.
+  // TODO: avoid overcharging rate limiter:
+  // Note that the Read here might overcharge SequentialFileReader's internal
+  // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
+  // content left until EOF to read.
+  Status status =
+      file_->Read(remaining, &read_buffer, backing_store_ + eof_offset_,
+                  Env::IO_TOTAL /* rate_limiter_priority */);
+
+  size_t added = read_buffer.size();
+  end_of_buffer_offset_ += added;
+
+  if (!status.ok()) {
+    if (added > 0) {
+      ReportDrop(added, status);
+    }
+
+    read_error_ = true;
+    return;
+  }
+
+  if (read_buffer.data() != backing_store_ + eof_offset_) {
+    // Read did not write to backing_store_
+    memmove(backing_store_ + eof_offset_, read_buffer.data(),
+            read_buffer.size());
+  }
+
+  buffer_ = Slice(backing_store_ + consumed_bytes,
+                  eof_offset_ + added - consumed_bytes);
+
+  if (added < remaining) {
+    eof_ = true;
+    eof_offset_ += added;
+  } else {
+    eof_offset_ = 0;
+  }
+}
+
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+  ReportDrop(bytes, Status::Corruption(reason));
+}
+
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
+  if (reporter_ != nullptr) {
+    reporter_->Corruption(bytes, reason);
+  }
+}
+
+bool Reader::ReadMore(size_t* drop_size, int* error) {
+  if (!eof_ && !read_error_) {
+    // Last read was a full read, so this is a trailer to skip
+    buffer_.clear();
+    // TODO: rate limit log reader with appropriate priority.
+    // TODO: avoid overcharging rate limiter:
+    // Note that the Read here might overcharge SequentialFileReader's internal
+    // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
+    // content left until EOF to read.
+    Status status = file_->Read(kBlockSize, &buffer_, backing_store_,
+                                Env::IO_TOTAL /* rate_limiter_priority */);
+    TEST_SYNC_POINT_CALLBACK("LogReader::ReadMore:AfterReadFile", &status);
+    end_of_buffer_offset_ += buffer_.size();
+    if (!status.ok()) {
+      buffer_.clear();
+      ReportDrop(kBlockSize, status);
+      read_error_ = true;
+      *error = kEof;
+      return false;
+    } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
+      eof_ = true;
+      eof_offset_ = buffer_.size();
+    }
+    return true;
+  } else {
+    // Note that if buffer_ is non-empty, we have a truncated header at the
+    // end of the file, which can be caused by the writer crashing in the
+    // middle of writing the header. Unless explicitly requested we don't
+    // consider this an error, just report EOF.
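+    // (Editorial note: ReadRecord() escalates the resulting kBadHeader to a
+    // reported corruption only under kAbsoluteConsistency or
+    // kPointInTimeRecovery; other recovery modes treat it as a clean EOF.)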
+ if (buffer_.size()) { + *drop_size = buffer_.size(); + buffer_.clear(); + *error = kBadHeader; + return false; + } + buffer_.clear(); + *error = kEof; + return false; + } +} + +unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size, + uint64_t* fragment_checksum) { + while (true) { + // We need at least the minimum header size + if (buffer_.size() < static_cast(kHeaderSize)) { + // the default value of r is meaningless because ReadMore will overwrite + // it if it returns false; in case it returns true, the return value will + // not be used anyway + int r = kEof; + if (!ReadMore(drop_size, &r)) { + return r; + } + continue; + } + + // Parse the header + const char* header = buffer_.data(); + const uint32_t a = static_cast(header[4]) & 0xff; + const uint32_t b = static_cast(header[5]) & 0xff; + const unsigned int type = header[6]; + const uint32_t length = a | (b << 8); + int header_size = kHeaderSize; + if ((type >= kRecyclableFullType && type <= kRecyclableLastType) || + type == kRecyclableUserDefinedTimestampSizeType) { + if (end_of_buffer_offset_ - buffer_.size() == 0) { + recycled_ = true; + } + header_size = kRecyclableHeaderSize; + // We need enough for the larger header + if (buffer_.size() < static_cast(kRecyclableHeaderSize)) { + int r = kEof; + if (!ReadMore(drop_size, &r)) { + return r; + } + continue; + } + const uint32_t log_num = DecodeFixed32(header + 7); + if (log_num != log_number_) { + return kOldRecord; + } + } + if (header_size + length > buffer_.size()) { + assert(buffer_.size() >= static_cast(header_size)); + *drop_size = buffer_.size(); + buffer_.clear(); + // If the end of the read has been reached without seeing + // `header_size + length` bytes of payload, report a corruption. The + // higher layers can decide how to handle it based on the recovery mode, + // whether this occurred at EOF, whether this is the final WAL, etc. + return kBadRecordLen; + } + + if (type == kZeroType && length == 0) { + // Skip zero length record without reporting any drops since + // such records are produced by the mmap based writing code in + // env_posix.cc that preallocates file regions. + // NOTE: this should never happen in DB written by new RocksDB versions, + // since we turn off mmap writes to manifest and log files + buffer_.clear(); + return kBadRecord; + } + + // Check crc + if (checksum_) { + uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); + uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6); + if (actual_crc != expected_crc) { + // Drop the rest of the buffer since "length" itself may have + // been corrupted and if we trust it, we could find some + // fragment of a real log record that just happens to look + // like a valid log record. 
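+        // (Editorial note: the checked span [header + 6, header +
+        // header_size + length) covers the type byte, the log number when
+        // the recyclable format is in use, and the payload -- exactly the
+        // bytes the writer's EmitPhysicalRecord() ran its CRC over.)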
+ *drop_size = buffer_.size(); + buffer_.clear(); + return kBadRecordChecksum; + } + } + + buffer_.remove_prefix(header_size + length); + + if (!uncompress_ || type == kSetCompressionType || + type == kUserDefinedTimestampSizeType || + type == kRecyclableUserDefinedTimestampSizeType) { + *result = Slice(header + header_size, length); + return type; + } else { + // Uncompress compressed records + uncompressed_record_.clear(); + if (fragment_checksum != nullptr) { + if (uncompress_hash_state_ == nullptr) { + uncompress_hash_state_ = XXH3_createState(); + } + XXH3_64bits_reset(uncompress_hash_state_); + } + + size_t uncompressed_size = 0; + int remaining = 0; + const char* input = header + header_size; + do { + remaining = uncompress_->Uncompress( + input, length, uncompressed_buffer_.get(), &uncompressed_size); + input = nullptr; + if (remaining < 0) { + buffer_.clear(); + return kBadRecord; + } + if (uncompressed_size > 0) { + if (fragment_checksum != nullptr) { + XXH3_64bits_update(uncompress_hash_state_, + uncompressed_buffer_.get(), uncompressed_size); + } + uncompressed_record_.append(uncompressed_buffer_.get(), + uncompressed_size); + } + } while (remaining > 0 || uncompressed_size == kBlockSize); + + if (fragment_checksum != nullptr) { + // We can remove this check by updating hash_state_ directly, + // but that requires resetting hash_state_ for full and first types + // for edge cases like consecutive fist type records. + // Leaving the check as is since it is cleaner and can revert to the + // above approach if it causes performance impact. + *fragment_checksum = XXH3_64bits_digest(uncompress_hash_state_); + uint64_t actual_checksum = XXH3_64bits(uncompressed_record_.data(), + uncompressed_record_.size()); + if (*fragment_checksum != actual_checksum) { + // uncompressed_record_ contains bad content that does not match + // actual decompressed content + return kBadRecord; + } + } + *result = Slice(uncompressed_record_); + return type; + } + } +} + +// Initialize uncompress related fields +void Reader::InitCompression(const CompressionTypeRecord& compression_record) { + compression_type_ = compression_record.GetCompressionType(); + compression_type_record_read_ = true; + constexpr uint32_t compression_format_version = 2; + uncompress_ = StreamingUncompress::Create( + compression_type_, compression_format_version, kBlockSize); + assert(uncompress_ != nullptr); + uncompressed_buffer_ = std::unique_ptr(new char[kBlockSize]); + assert(uncompressed_buffer_); +} + +Status Reader::UpdateRecordedTimestampSize( + const std::vector>& cf_to_ts_sz) { + for (const auto& [cf, ts_sz] : cf_to_ts_sz) { + // Zero user-defined timestamp size are not recorded. + if (ts_sz == 0) { + return Status::Corruption( + "User-defined timestamp size record contains zero timestamp size."); + } + // The user-defined timestamp size record for a column family should not be + // updated in the same log file. 
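+    // (Editorial example: if an earlier record in this WAL already declared
+    // {cf: 1, ts_sz: 8}, a later record that mentions cf 1 again -- even
+    // with the same size -- is treated as corruption below.)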
+ if (recorded_cf_to_ts_sz_.count(cf) != 0) { + return Status::Corruption( + "User-defined timestamp size record contains update to " + "recorded column family."); + } + recorded_cf_to_ts_sz_.insert(std::make_pair(cf, ts_sz)); + } + return Status::OK(); +} + +bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch, + WALRecoveryMode /*unused*/, + uint64_t* /* checksum */) { + assert(record != nullptr); + assert(scratch != nullptr); + record->clear(); + scratch->clear(); + if (uncompress_) { + uncompress_->Reset(); + } + + uint64_t prospective_record_offset = 0; + uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); + size_t drop_size = 0; + unsigned int fragment_type_or_err = 0; // Initialize to make compiler happy + Slice fragment; + while (TryReadFragment(&fragment, &drop_size, &fragment_type_or_err)) { + switch (fragment_type_or_err) { + case kFullType: + case kRecyclableFullType: + if (in_fragmented_record_ && !fragments_.empty()) { + ReportCorruption(fragments_.size(), "partial record without end(1)"); + } + fragments_.clear(); + *record = fragment; + prospective_record_offset = physical_record_offset; + last_record_offset_ = prospective_record_offset; + first_record_read_ = true; + in_fragmented_record_ = false; + return true; + + case kFirstType: + case kRecyclableFirstType: + if (in_fragmented_record_ || !fragments_.empty()) { + ReportCorruption(fragments_.size(), "partial record without end(2)"); + } + prospective_record_offset = physical_record_offset; + fragments_.assign(fragment.data(), fragment.size()); + in_fragmented_record_ = true; + break; + + case kMiddleType: + case kRecyclableMiddleType: + if (!in_fragmented_record_) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(1)"); + } else { + fragments_.append(fragment.data(), fragment.size()); + } + break; + + case kLastType: + case kRecyclableLastType: + if (!in_fragmented_record_) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(2)"); + } else { + fragments_.append(fragment.data(), fragment.size()); + scratch->assign(fragments_.data(), fragments_.size()); + fragments_.clear(); + *record = Slice(*scratch); + last_record_offset_ = prospective_record_offset; + first_record_read_ = true; + in_fragmented_record_ = false; + return true; + } + break; + + case kSetCompressionType: { + if (compression_type_record_read_) { + ReportCorruption(fragment.size(), + "read multiple SetCompressionType records"); + } + if (first_record_read_) { + ReportCorruption(fragment.size(), + "SetCompressionType not the first record"); + } + fragments_.clear(); + prospective_record_offset = physical_record_offset; + last_record_offset_ = prospective_record_offset; + in_fragmented_record_ = false; + CompressionTypeRecord compression_record(kNoCompression); + Status s = compression_record.DecodeFrom(&fragment); + if (!s.ok()) { + ReportCorruption(fragment.size(), + "could not decode SetCompressionType record"); + } else { + InitCompression(compression_record); + } + break; + } + + case kUserDefinedTimestampSizeType: + case kRecyclableUserDefinedTimestampSizeType: { + if (in_fragmented_record_ && !scratch->empty()) { + ReportCorruption( + scratch->size(), + "user-defined timestamp size record interspersed partial record"); + } + fragments_.clear(); + prospective_record_offset = physical_record_offset; + last_record_offset_ = prospective_record_offset; + in_fragmented_record_ = false; + UserDefinedTimestampSizeRecord ts_record; + Status s = 
ts_record.DecodeFrom(&fragment); + if (!s.ok()) { + ReportCorruption( + fragment.size(), + "could not decode user-defined timestamp size record"); + } else { + s = UpdateRecordedTimestampSize( + ts_record.GetUserDefinedTimestampSize()); + if (!s.ok()) { + ReportCorruption(fragment.size(), s.getState()); + } + } + break; + } + + case kBadHeader: + case kBadRecord: + case kEof: + case kOldRecord: + if (in_fragmented_record_) { + ReportCorruption(fragments_.size(), "error in middle of record"); + in_fragmented_record_ = false; + fragments_.clear(); + } + break; + + case kBadRecordChecksum: + if (recycled_) { + fragments_.clear(); + return false; + } + ReportCorruption(drop_size, "checksum mismatch"); + if (in_fragmented_record_) { + ReportCorruption(fragments_.size(), "error in middle of record"); + in_fragmented_record_ = false; + fragments_.clear(); + } + break; + + default: { + char buf[40]; + snprintf(buf, sizeof(buf), "unknown record type %u", + fragment_type_or_err); + ReportCorruption( + fragment.size() + (in_fragmented_record_ ? fragments_.size() : 0), + buf); + in_fragmented_record_ = false; + fragments_.clear(); + break; + } + } + } + return false; +} + +void FragmentBufferedReader::UnmarkEOF() { + if (read_error_) { + return; + } + eof_ = false; + UnmarkEOFInternal(); +} + +bool FragmentBufferedReader::TryReadMore(size_t* drop_size, int* error) { + if (!eof_ && !read_error_) { + // Last read was a full read, so this is a trailer to skip + buffer_.clear(); + // TODO: rate limit log reader with approriate priority. + // TODO: avoid overcharging rate limiter: + // Note that the Read here might overcharge SequentialFileReader's internal + // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough + // content left until EOF to read. + Status status = file_->Read(kBlockSize, &buffer_, backing_store_, + Env::IO_TOTAL /* rate_limiter_priority */); + end_of_buffer_offset_ += buffer_.size(); + if (!status.ok()) { + buffer_.clear(); + ReportDrop(kBlockSize, status); + read_error_ = true; + *error = kEof; + return false; + } else if (buffer_.size() < static_cast(kBlockSize)) { + eof_ = true; + eof_offset_ = buffer_.size(); + TEST_SYNC_POINT_CALLBACK( + "FragmentBufferedLogReader::TryReadMore:FirstEOF", nullptr); + } + return true; + } else if (!read_error_) { + UnmarkEOF(); + } + if (!read_error_) { + return true; + } + *error = kEof; + *drop_size = buffer_.size(); + if (buffer_.size() > 0) { + *error = kBadHeader; + } + buffer_.clear(); + return false; +} + +// return true if the caller should process the fragment_type_or_err. 
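+//
+// A hedged usage sketch (editorial; this mirrors the loop in
+// FragmentBufferedReader::ReadRecord above):
+//
+//   Slice fragment;
+//   size_t drop_size = 0;
+//   unsigned int type_or_err = 0;
+//   while (TryReadFragment(&fragment, &drop_size, &type_or_err)) {
+//     // dispatch on type_or_err, which may also hold kEof, kBadRecord, ...
+//   }
+//   // false: either no further progress is possible yet, or a terminal
+//   // condition was stored in type_or_err by TryReadMore().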
+bool FragmentBufferedReader::TryReadFragment( + Slice* fragment, size_t* drop_size, unsigned int* fragment_type_or_err) { + assert(fragment != nullptr); + assert(drop_size != nullptr); + assert(fragment_type_or_err != nullptr); + + while (buffer_.size() < static_cast(kHeaderSize)) { + size_t old_size = buffer_.size(); + int error = kEof; + if (!TryReadMore(drop_size, &error)) { + *fragment_type_or_err = error; + return false; + } else if (old_size == buffer_.size()) { + return false; + } + } + const char* header = buffer_.data(); + const uint32_t a = static_cast(header[4]) & 0xff; + const uint32_t b = static_cast(header[5]) & 0xff; + const unsigned int type = header[6]; + const uint32_t length = a | (b << 8); + int header_size = kHeaderSize; + if ((type >= kRecyclableFullType && type <= kRecyclableLastType) || + type == kRecyclableUserDefinedTimestampSizeType) { + if (end_of_buffer_offset_ - buffer_.size() == 0) { + recycled_ = true; + } + header_size = kRecyclableHeaderSize; + while (buffer_.size() < static_cast(kRecyclableHeaderSize)) { + size_t old_size = buffer_.size(); + int error = kEof; + if (!TryReadMore(drop_size, &error)) { + *fragment_type_or_err = error; + return false; + } else if (old_size == buffer_.size()) { + return false; + } + } + const uint32_t log_num = DecodeFixed32(header + 7); + if (log_num != log_number_) { + *fragment_type_or_err = kOldRecord; + return true; + } + } + + while (header_size + length > buffer_.size()) { + size_t old_size = buffer_.size(); + int error = kEof; + if (!TryReadMore(drop_size, &error)) { + *fragment_type_or_err = error; + return false; + } else if (old_size == buffer_.size()) { + return false; + } + } + + if (type == kZeroType && length == 0) { + buffer_.clear(); + *fragment_type_or_err = kBadRecord; + return true; + } + + if (checksum_) { + uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); + uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6); + if (actual_crc != expected_crc) { + *drop_size = buffer_.size(); + buffer_.clear(); + *fragment_type_or_err = kBadRecordChecksum; + return true; + } + } + + buffer_.remove_prefix(header_size + length); + + if (!uncompress_ || type == kSetCompressionType || + type == kUserDefinedTimestampSizeType || + type == kRecyclableUserDefinedTimestampSizeType) { + *fragment = Slice(header + header_size, length); + *fragment_type_or_err = type; + return true; + } else { + // Uncompress compressed records + uncompressed_record_.clear(); + size_t uncompressed_size = 0; + int remaining = 0; + const char* input = header + header_size; + do { + remaining = uncompress_->Uncompress( + input, length, uncompressed_buffer_.get(), &uncompressed_size); + input = nullptr; + if (remaining < 0) { + buffer_.clear(); + *fragment_type_or_err = kBadRecord; + return true; + } + if (uncompressed_size > 0) { + uncompressed_record_.append(uncompressed_buffer_.get(), + uncompressed_size); + } + } while (remaining > 0 || uncompressed_size == kBlockSize); + *fragment = Slice(std::move(uncompressed_record_)); + *fragment_type_or_err = type; + return true; + } +} + +} // namespace log +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_reader.h new file mode 100644 index 0000000000..b7b298e58a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_reader.h @@ -0,0 +1,241 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include +#include +#include + +#include "db/log_format.h" +#include "file/sequence_file_reader.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "util/compression.h" +#include "util/udt_util.h" +#include "util/xxhash.h" + +namespace ROCKSDB_NAMESPACE { +class Logger; + +namespace log { + +/** + * Reader is a general purpose log stream reader implementation. The actual job + * of reading from the device is implemented by the SequentialFile interface. + * + * Please see Writer for details on the file and record layout. + */ +class Reader { + public: + // Interface for reporting errors. + class Reporter { + public: + virtual ~Reporter(); + + // Some corruption was detected. "size" is the approximate number + // of bytes dropped due to the corruption. + virtual void Corruption(size_t bytes, const Status& status) = 0; + }; + + // Create a reader that will return log records from "*file". + // "*file" must remain live while this Reader is in use. + // + // If "reporter" is non-nullptr, it is notified whenever some data is + // dropped due to a detected corruption. "*reporter" must remain + // live while this Reader is in use. + // + // If "checksum" is true, verify checksums if available. + Reader(std::shared_ptr info_log, + std::unique_ptr&& file, Reporter* reporter, + bool checksum, uint64_t log_num); + // No copying allowed + Reader(const Reader&) = delete; + void operator=(const Reader&) = delete; + + virtual ~Reader(); + + // Read the next record into *record. Returns true if read + // successfully, false if we hit end of the input. May use + // "*scratch" as temporary storage. The contents filled in *record + // will only be valid until the next mutating operation on this + // reader or the next mutation to *scratch. + // If record_checksum is not nullptr, then this function will calculate the + // checksum of the record read and set record_checksum to it. The checksum is + // calculated from the original buffers that contain the contents of the + // record. + virtual bool ReadRecord(Slice* record, std::string* scratch, + WALRecoveryMode wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords, + uint64_t* record_checksum = nullptr); + + // Return the recorded user-defined timestamp size that have been read so + // far. This only applies to WAL logs. + const std::unordered_map& GetRecordedTimestampSize() const { + return recorded_cf_to_ts_sz_; + } + + // Returns the physical offset of the last record returned by ReadRecord. + // + // Undefined before the first call to ReadRecord. + uint64_t LastRecordOffset(); + + // Returns the first physical offset after the last record returned by + // ReadRecord, or zero before first call to ReadRecord. This can also be + // thought of as the "current" position in processing the file bytes. + uint64_t LastRecordEnd(); + + // returns true if the reader has encountered an eof condition. + bool IsEOF() { return eof_; } + + // returns true if the reader has encountered read error. 
+  bool hasReadError() const { return read_error_; }
+
+  // When we know more data has been written to the file, we can use this
+  // function to force the reader to look again in the file.
+  // Also aligns the file position indicator to the start of the next block
+  // by reading the rest of the data from the EOF position to the end of the
+  // block that was partially read.
+  virtual void UnmarkEOF();
+
+  SequentialFileReader* file() { return file_.get(); }
+
+  Reporter* GetReporter() const { return reporter_; }
+
+  uint64_t GetLogNumber() const { return log_number_; }
+
+  size_t GetReadOffset() const {
+    return static_cast<size_t>(end_of_buffer_offset_);
+  }
+
+  bool IsCompressedAndEmptyFile() {
+    return !first_record_read_ && compression_type_record_read_;
+  }
+
+ protected:
+  std::shared_ptr<Logger> info_log_;
+  const std::unique_ptr<SequentialFileReader> file_;
+  Reporter* const reporter_;
+  bool const checksum_;
+  char* const backing_store_;
+
+  // Internal state variables used for reading records
+  Slice buffer_;
+  bool eof_;         // Last Read() indicated EOF by returning < kBlockSize
+  bool read_error_;  // Error occurred while reading from file
+
+  // Offset of the file position indicator within the last block when an
+  // EOF was detected.
+  size_t eof_offset_;
+
+  // Offset of the last record returned by ReadRecord.
+  uint64_t last_record_offset_;
+  // Offset of the first location past the end of buffer_.
+  uint64_t end_of_buffer_offset_;
+
+  // which log number this is
+  uint64_t const log_number_;
+
+  // Whether this is a recycled log file
+  bool recycled_;
+
+  // Whether the first record has been read or not.
+  bool first_record_read_;
+  // Type of compression used
+  CompressionType compression_type_;
+  // Track whether the compression type record has been read or not.
+  bool compression_type_record_read_;
+  StreamingUncompress* uncompress_;
+  // Reusable uncompressed output buffer
+  std::unique_ptr<char[]> uncompressed_buffer_;
+  // Reusable uncompressed record
+  std::string uncompressed_record_;
+  // Used for stream hashing fragment content in ReadRecord()
+  XXH3_state_t* hash_state_;
+  // Used for stream hashing uncompressed buffer in ReadPhysicalRecord()
+  XXH3_state_t* uncompress_hash_state_;
+
+  // The recorded user-defined timestamp sizes that have been read so far. This
+  // is only for WAL logs.
+  std::unordered_map<uint32_t, size_t> recorded_cf_to_ts_sz_;
+
+  // Extend record types with the following special values
+  enum {
+    kEof = kMaxRecordType + 1,
+    // Returned whenever we find an invalid physical record.
+    // Currently there are two situations in which this happens:
+    // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
+    // * The record is a 0-length record (No drop is reported)
+    kBadRecord = kMaxRecordType + 2,
+    // Returned when we fail to read a valid header.
+    kBadHeader = kMaxRecordType + 3,
+    // Returned when we read an old record from a previous user of the log.
+    kOldRecord = kMaxRecordType + 4,
+    // Returned when we get a bad record length
+    kBadRecordLen = kMaxRecordType + 5,
+    // Returned when we get a bad record checksum
+    kBadRecordChecksum = kMaxRecordType + 6,
+  };
+
+  // Return type, or one of the preceding special values.
+  // If WAL compression is enabled, fragment_checksum is the checksum of the
+  // fragment computed from the original buffer containing the uncompressed
+  // fragment.
+ unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size, + uint64_t* fragment_checksum = nullptr); + + // Read some more + bool ReadMore(size_t* drop_size, int* error); + + void UnmarkEOFInternal(); + + // Reports dropped bytes to the reporter. + // buffer_ must be updated to remove the dropped bytes prior to invocation. + void ReportCorruption(size_t bytes, const char* reason); + void ReportDrop(size_t bytes, const Status& reason); + + void InitCompression(const CompressionTypeRecord& compression_record); + + Status UpdateRecordedTimestampSize( + const std::vector>& cf_to_ts_sz); +}; + +class FragmentBufferedReader : public Reader { + public: + FragmentBufferedReader(std::shared_ptr info_log, + std::unique_ptr&& _file, + Reporter* reporter, bool checksum, uint64_t log_num) + : Reader(info_log, std::move(_file), reporter, checksum, log_num), + fragments_(), + in_fragmented_record_(false) {} + ~FragmentBufferedReader() override {} + bool ReadRecord(Slice* record, std::string* scratch, + WALRecoveryMode wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords, + uint64_t* record_checksum = nullptr) override; + void UnmarkEOF() override; + + private: + std::string fragments_; + bool in_fragmented_record_; + + bool TryReadFragment(Slice* result, size_t* drop_size, + unsigned int* fragment_type_or_err); + + bool TryReadMore(size_t* drop_size, int* error); + + // No copy allowed + FragmentBufferedReader(const FragmentBufferedReader&); + void operator=(const FragmentBufferedReader&); +}; + +} // namespace log +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_writer.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_writer.cc new file mode 100644 index 0000000000..1ddf0a26c8 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_writer.cc @@ -0,0 +1,279 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_writer.h" + +#include + +#include "file/writable_file_writer.h" +#include "rocksdb/env.h" +#include "rocksdb/io_status.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/udt_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace log { + +Writer::Writer(std::unique_ptr&& dest, uint64_t log_number, + bool recycle_log_files, bool manual_flush, + CompressionType compression_type) + : dest_(std::move(dest)), + block_offset_(0), + log_number_(log_number), + recycle_log_files_(recycle_log_files), + manual_flush_(manual_flush), + compression_type_(compression_type), + compress_(nullptr) { + for (int i = 0; i <= kMaxRecordType; i++) { + char t = static_cast(i); + type_crc_[i] = crc32c::Value(&t, 1); + } +} + +Writer::~Writer() { + if (dest_) { + WriteBuffer().PermitUncheckedError(); + } + if (compress_) { + delete compress_; + } +} + +IOStatus Writer::WriteBuffer() { + if (dest_->seen_error()) { + return IOStatus::IOError("Seen error. 
Skip writing buffer."); + } + return dest_->Flush(); +} + +IOStatus Writer::Close() { + IOStatus s; + if (dest_) { + s = dest_->Close(); + dest_.reset(); + } + return s; +} + +IOStatus Writer::AddRecord(const Slice& slice, + Env::IOPriority rate_limiter_priority) { + const char* ptr = slice.data(); + size_t left = slice.size(); + + // Header size varies depending on whether we are recycling or not. + const int header_size = + recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize; + + // Fragment the record if necessary and emit it. Note that if slice + // is empty, we still want to iterate once to emit a single + // zero-length record + bool begin = true; + int compress_remaining = 0; + bool compress_start = false; + if (compress_) { + compress_->Reset(); + compress_start = true; + } + + IOStatus s; + do { + const int64_t leftover = kBlockSize - block_offset_; + assert(leftover >= 0); + if (leftover < header_size) { + // Switch to a new block + if (leftover > 0) { + // Fill the trailer (literal below relies on kHeaderSize and + // kRecyclableHeaderSize being <= 11) + assert(header_size <= 11); + s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + static_cast(leftover)), + 0 /* crc32c_checksum */, rate_limiter_priority); + if (!s.ok()) { + break; + } + } + block_offset_ = 0; + } + + // Invariant: we never leave < header_size bytes in a block. + assert(static_cast(kBlockSize - block_offset_) >= header_size); + + const size_t avail = kBlockSize - block_offset_ - header_size; + + // Compress the record if compression is enabled. + // Compress() is called at least once (compress_start=true) and after the + // previous generated compressed chunk is written out as one or more + // physical records (left=0). + if (compress_ && (compress_start || left == 0)) { + compress_remaining = compress_->Compress(slice.data(), slice.size(), + compressed_buffer_.get(), &left); + + if (compress_remaining < 0) { + // Set failure status + s = IOStatus::IOError("Unexpected WAL compression error"); + s.SetDataLoss(true); + break; + } else if (left == 0) { + // Nothing left to compress + if (!compress_start) { + break; + } + } + compress_start = false; + ptr = compressed_buffer_.get(); + } + + const size_t fragment_length = (left < avail) ? left : avail; + + RecordType type; + const bool end = (left == fragment_length && compress_remaining == 0); + if (begin && end) { + type = recycle_log_files_ ? kRecyclableFullType : kFullType; + } else if (begin) { + type = recycle_log_files_ ? kRecyclableFirstType : kFirstType; + } else if (end) { + type = recycle_log_files_ ? kRecyclableLastType : kLastType; + } else { + type = recycle_log_files_ ? 
kRecyclableMiddleType : kMiddleType;
+    }
+
+    s = EmitPhysicalRecord(type, ptr, fragment_length, rate_limiter_priority);
+    ptr += fragment_length;
+    left -= fragment_length;
+    begin = false;
+  } while (s.ok() && (left > 0 || compress_remaining > 0));
+
+  if (s.ok()) {
+    if (!manual_flush_) {
+      s = dest_->Flush(rate_limiter_priority);
+    }
+  }
+
+  return s;
+}
+
+IOStatus Writer::AddCompressionTypeRecord() {
+  // Should be the first record
+  assert(block_offset_ == 0);
+
+  if (compression_type_ == kNoCompression) {
+    // No need to add a record
+    return IOStatus::OK();
+  }
+
+  CompressionTypeRecord record(compression_type_);
+  std::string encode;
+  record.EncodeTo(&encode);
+  IOStatus s =
+      EmitPhysicalRecord(kSetCompressionType, encode.data(), encode.size());
+  if (s.ok()) {
+    if (!manual_flush_) {
+      s = dest_->Flush();
+    }
+    // Initialize fields required for compression
+    const size_t max_output_buffer_len =
+        kBlockSize - (recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize);
+    CompressionOptions opts;
+    constexpr uint32_t compression_format_version = 2;
+    compress_ = StreamingCompress::Create(compression_type_, opts,
+                                          compression_format_version,
+                                          max_output_buffer_len);
+    assert(compress_ != nullptr);
+    compressed_buffer_ =
+        std::unique_ptr<char[]>(new char[max_output_buffer_len]);
+    assert(compressed_buffer_);
+  } else {
+    // Disable compression if the record could not be added.
+    compression_type_ = kNoCompression;
+  }
+  return s;
+}
+
+IOStatus Writer::MaybeAddUserDefinedTimestampSizeRecord(
+    const std::unordered_map<uint32_t, size_t>& cf_to_ts_sz,
+    Env::IOPriority rate_limiter_priority) {
+  std::vector<std::pair<uint32_t, size_t>> ts_sz_to_record;
+  for (const auto& [cf_id, ts_sz] : cf_to_ts_sz) {
+    if (recorded_cf_to_ts_sz_.count(cf_id) != 0) {
+      // A column family's user-defined timestamp size should not be
+      // updated while DB is running.
+      assert(recorded_cf_to_ts_sz_[cf_id] == ts_sz);
+    } else if (ts_sz != 0) {
+      ts_sz_to_record.emplace_back(cf_id, ts_sz);
+      recorded_cf_to_ts_sz_.insert(std::make_pair(cf_id, ts_sz));
+    }
+  }
+  if (ts_sz_to_record.empty()) {
+    return IOStatus::OK();
+  }
+
+  UserDefinedTimestampSizeRecord record(std::move(ts_sz_to_record));
+  std::string encoded;
+  record.EncodeTo(&encoded);
+  RecordType type = recycle_log_files_ ? kRecyclableUserDefinedTimestampSizeType
+                                       : kUserDefinedTimestampSizeType;
+  return EmitPhysicalRecord(type, encoded.data(), encoded.size(),
+                            rate_limiter_priority);
+}
+
+bool Writer::BufferIsEmpty() { return dest_->BufferIsEmpty(); }
+
+IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n,
+                                    Env::IOPriority rate_limiter_priority) {
+  assert(n <= 0xffff);  // Must fit in two bytes
+
+  size_t header_size;
+  char buf[kRecyclableHeaderSize];
+
+  // Format the header
+  buf[4] = static_cast<char>(n & 0xff);
+  buf[5] = static_cast<char>(n >> 8);
+  buf[6] = static_cast<char>(t);
+
+  uint32_t crc = type_crc_[t];
+  if (t < kRecyclableFullType || t == kSetCompressionType ||
+      t == kUserDefinedTimestampSizeType) {
+    // Legacy record format
+    assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+    header_size = kHeaderSize;
+  } else {
+    // Recyclable record format
+    assert(block_offset_ + kRecyclableHeaderSize + n <= kBlockSize);
+    header_size = kRecyclableHeaderSize;
+
+    // Only encode low 32-bits of the 64-bit log number. This means
+    // we will fail to detect an old record if we recycled a log from
+    // ~4 billion logs ago, but that is effectively impossible, and
+    // even if it were we'd be far more likely to see a false positive
+    // on the 32-bit CRC.
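+    // (Editorial sketch of the checksum assembly below, using the crc32c
+    // helpers from util/crc32c.h that this file already calls:
+    //
+    //   crc = crc32c::Extend(type_crc_[t], buf + 7, 4);              // + log number
+    //   crc = crc32c::Crc32cCombine(crc, crc32c::Value(ptr, n), n);  // + payload
+    //   EncodeFixed32(buf, crc32c::Mask(crc));                       // masked, LE
+    // )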
+ EncodeFixed32(buf + 7, static_cast(log_number_)); + crc = crc32c::Extend(crc, buf + 7, 4); + } + + // Compute the crc of the record type and the payload. + uint32_t payload_crc = crc32c::Value(ptr, n); + crc = crc32c::Crc32cCombine(crc, payload_crc, n); + crc = crc32c::Mask(crc); // Adjust for storage + TEST_SYNC_POINT_CALLBACK("LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum", + &crc); + EncodeFixed32(buf, crc); + + // Write the header and the payload + IOStatus s = dest_->Append(Slice(buf, header_size), 0 /* crc32c_checksum */, + rate_limiter_priority); + if (s.ok()) { + s = dest_->Append(Slice(ptr, n), payload_crc, rate_limiter_priority); + } + block_offset_ += header_size + n; + return s; +} + +} // namespace log +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_writer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_writer.h new file mode 100644 index 0000000000..75e44d1a4e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/log_writer.h @@ -0,0 +1,144 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include +#include +#include + +#include "db/log_format.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/env.h" +#include "rocksdb/io_status.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +class WritableFileWriter; + +namespace log { + +/** + * Writer is a general purpose log stream writer. It provides an append-only + * abstraction for writing data. The details of the how the data is written is + * handled by the WritableFile sub-class implementation. + * + * File format: + * + * File is broken down into variable sized records. The format of each record + * is described below. + * +-----+-------------+--+----+----------+------+-- ... ----+ + * File | r0 | r1 |P | r2 | r3 | r4 | | + * +-----+-------------+--+----+----------+------+-- ... ----+ + * <--- kBlockSize ------>|<-- kBlockSize ------>| + * rn = variable size records + * P = Padding + * + * Data is written out in kBlockSize chunks. If next record does not fit + * into the space left, the leftover space will be padded with \0. + * + * Legacy record format: + * + * +---------+-----------+-----------+--- ... ---+ + * |CRC (4B) | Size (2B) | Type (1B) | Payload | + * +---------+-----------+-----------+--- ... ---+ + * + * CRC = 32bit hash computed over the record type and payload using CRC + * Size = Length of the payload data + * Type = Type of record + * (kZeroType, kFullType, kFirstType, kLastType, kMiddleType ) + * The type is used to group a bunch of records together to represent + * blocks that are larger than kBlockSize + * Payload = Byte stream as long as specified by the payload size + * + * Recyclable record format: + * + * +---------+-----------+-----------+----------------+--- ... ---+ + * |CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload | + * +---------+-----------+-----------+----------------+--- ... 
---+ + * + * Same as above, with the addition of + * Log number = 32bit log file number, so that we can distinguish between + * records written by the most recent log writer vs a previous one. + */ +class Writer { + public: + // Create a writer that will append data to "*dest". + // "*dest" must be initially empty. + // "*dest" must remain live while this Writer is in use. + explicit Writer(std::unique_ptr&& dest, + uint64_t log_number, bool recycle_log_files, + bool manual_flush = false, + CompressionType compressionType = kNoCompression); + // No copying allowed + Writer(const Writer&) = delete; + void operator=(const Writer&) = delete; + + ~Writer(); + + IOStatus AddRecord(const Slice& slice, + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL); + IOStatus AddCompressionTypeRecord(); + + // If there are column families in `cf_to_ts_sz` not included in + // `recorded_cf_to_ts_sz_` and its user-defined timestamp size is non-zero, + // adds a record of type kUserDefinedTimestampSizeType or + // kRecyclableUserDefinedTimestampSizeType for these column families. + // This timestamp size record applies to all subsequent records. + IOStatus MaybeAddUserDefinedTimestampSizeRecord( + const std::unordered_map& cf_to_ts_sz, + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL); + + WritableFileWriter* file() { return dest_.get(); } + const WritableFileWriter* file() const { return dest_.get(); } + + uint64_t get_log_number() const { return log_number_; } + + IOStatus WriteBuffer(); + + IOStatus Close(); + + bool BufferIsEmpty(); + + private: + std::unique_ptr dest_; + size_t block_offset_; // Current offset in block + uint64_t log_number_; + bool recycle_log_files_; + + // crc32c values for all supported record types. These are + // pre-computed to reduce the overhead of computing the crc of the + // record type stored in the header. + uint32_t type_crc_[kMaxRecordType + 1]; + + IOStatus EmitPhysicalRecord( + RecordType type, const char* ptr, size_t length, + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL); + + // If true, it does not flush after each write. Instead it relies on the upper + // layer to manually does the flush by calling ::WriteBuffer() + bool manual_flush_; + + // Compression Type + CompressionType compression_type_; + StreamingCompress* compress_; + // Reusable compressed output buffer + std::unique_ptr compressed_buffer_; + + // The recorded user-defined timestamp size that have been written so far. + // Since the user-defined timestamp size cannot be changed while the DB is + // running, existing entry in this map cannot be updated. + std::unordered_map recorded_cf_to_ts_sz_; +}; + +} // namespace log +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/logs_with_prep_tracker.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/logs_with_prep_tracker.cc new file mode 100644 index 0000000000..ff98155c45 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/logs_with_prep_tracker.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
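+//
+// (Editorial usage sketch, not part of upstream; it illustrates the call
+// pattern the tracker below is designed for:
+//
+//   LogsWithPrepTracker tracker;
+//   tracker.MarkLogAsContainingPrepSection(7);    // 2PC prepare hits WAL 7
+//   assert(tracker.FindMinLogContainingOutstandingPrep() == 7);
+//   tracker.MarkLogAsHavingPrepSectionFlushed(7);  // commit/rollback seen
+//   assert(tracker.FindMinLogContainingOutstandingPrep() == 0);  // none left
+// )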
+// +#include "db/logs_with_prep_tracker.h" + +#include "port/likely.h" + +namespace ROCKSDB_NAMESPACE { +void LogsWithPrepTracker::MarkLogAsHavingPrepSectionFlushed(uint64_t log) { + assert(log != 0); + std::lock_guard lock(prepared_section_completed_mutex_); + auto it = prepared_section_completed_.find(log); + if (UNLIKELY(it == prepared_section_completed_.end())) { + prepared_section_completed_[log] = 1; + } else { + it->second += 1; + } +} + +void LogsWithPrepTracker::MarkLogAsContainingPrepSection(uint64_t log) { + assert(log != 0); + std::lock_guard lock(logs_with_prep_mutex_); + + auto rit = logs_with_prep_.rbegin(); + bool updated = false; + // Most probably the last log is the one that is being marked for + // having a prepare section; so search from the end. + for (; rit != logs_with_prep_.rend() && rit->log >= log; ++rit) { + if (rit->log == log) { + rit->cnt++; + updated = true; + break; + } + } + if (!updated) { + // We are either at the start, or at a position with rit->log < log + logs_with_prep_.insert(rit.base(), {log, 1}); + } +} + +uint64_t LogsWithPrepTracker::FindMinLogContainingOutstandingPrep() { + std::lock_guard lock(logs_with_prep_mutex_); + auto it = logs_with_prep_.begin(); + // start with the smallest log + for (; it != logs_with_prep_.end();) { + auto min_log = it->log; + { + std::lock_guard lock2(prepared_section_completed_mutex_); + auto completed_it = prepared_section_completed_.find(min_log); + if (completed_it == prepared_section_completed_.end() || + completed_it->second < it->cnt) { + return min_log; + } + assert(completed_it != prepared_section_completed_.end() && + completed_it->second == it->cnt); + prepared_section_completed_.erase(completed_it); + } + // erase from beginning in vector is not efficient but this function is not + // on the fast path. + it = logs_with_prep_.erase(it); + } + // no such log found + return 0; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/logs_with_prep_tracker.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/logs_with_prep_tracker.h new file mode 100644 index 0000000000..f72f0ca078 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/logs_with_prep_tracker.h @@ -0,0 +1,62 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// This class is used to track the log files with outstanding prepare entries. +class LogsWithPrepTracker { + public: + // Called when a transaction prepared in `log` has been committed or aborted. + void MarkLogAsHavingPrepSectionFlushed(uint64_t log); + // Called when a transaction is prepared in `log`. + void MarkLogAsContainingPrepSection(uint64_t log); + // Return the earliest log file with outstanding prepare entries. + uint64_t FindMinLogContainingOutstandingPrep(); + size_t TEST_PreparedSectionCompletedSize() { + return prepared_section_completed_.size(); + } + size_t TEST_LogsWithPrepSize() { return logs_with_prep_.size(); } + + private: + // REQUIRES: logs_with_prep_mutex_ held + // + // sorted list of log numbers still containing prepared data. 
+ // this is used by FindObsoleteFiles to determine which + // flushed logs we must keep around because they still + // contain prepared data which has not been committed or rolled back + struct LogCnt { + uint64_t log; // the log number + uint64_t cnt; // number of prepared sections in the log + }; + std::vector logs_with_prep_; + std::mutex logs_with_prep_mutex_; + + // REQUIRES: prepared_section_completed_mutex_ held + // + // to be used in conjunction with logs_with_prep_. + // once a transaction with data in log L is committed or rolled back + // rather than updating logs_with_prep_ directly we keep track of that + // in prepared_section_completed_ which maps LOG -> instance_count. This helps + // avoiding contention between a commit thread and the prepare threads. + // + // when trying to determine the minimum log still active we first + // consult logs_with_prep_. while that root value maps to + // an equal value in prepared_section_completed_ we erase the log from + // both logs_with_prep_ and prepared_section_completed_. + std::unordered_map prepared_section_completed_; + std::mutex prepared_section_completed_mutex_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/lookup_key.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/lookup_key.h new file mode 100644 index 0000000000..68851bddd1 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/lookup_key.h @@ -0,0 +1,68 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include "rocksdb/slice.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +// A helper class useful for DBImpl::Get() +class LookupKey { + public: + // Initialize *this for looking up user_key at a snapshot with + // the specified sequence number. + LookupKey(const Slice& _user_key, SequenceNumber sequence, + const Slice* ts = nullptr); + + ~LookupKey(); + + // Return a key suitable for lookup in a MemTable. + Slice memtable_key() const { + return Slice(start_, static_cast(end_ - start_)); + } + + // Return an internal key (suitable for passing to an internal iterator) + Slice internal_key() const { + return Slice(kstart_, static_cast(end_ - kstart_)); + } + + // Return the user key. + // If user-defined timestamp is enabled, then timestamp is included in the + // result. + Slice user_key() const { + return Slice(kstart_, static_cast(end_ - kstart_ - 8)); + } + + private: + // We construct a char array of the form: + // klength varint32 <-- start_ + // userkey char[klength] <-- kstart_ + // tag uint64 + // <-- end_ + // The array is a suitable MemTable key. + // The suffix starting with "userkey" can be used as an InternalKey. 
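+  //
+  // (Editorial example: for user key "foo" at sequence 42 with a value-type
+  // tag, klength is 3 + 8 = 11, so the buffer holds varint32(11), then
+  // "foo", then the 8-byte tag (42 << 8) | type; memtable_key() spans the
+  // whole buffer while internal_key() skips the length prefix.)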
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/malloc_stats.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/malloc_stats.cc
new file mode 100644
index 0000000000..641e01f9a3
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/malloc_stats.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/malloc_stats.h"
+
+#include <string.h>
+
+#include <memory>
+
+#include "port/jemalloc_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_JEMALLOC
+
+struct MallocStatus {
+  char* cur;
+  char* end;
+};
+
+static void GetJemallocStatus(void* mstat_arg, const char* status) {
+  MallocStatus* mstat = reinterpret_cast<MallocStatus*>(mstat_arg);
+  size_t status_len = status ? strlen(status) : 0;
+  size_t buf_size = (size_t)(mstat->end - mstat->cur);
+  if (!status_len || status_len > buf_size) {
+    return;
+  }
+
+  snprintf(mstat->cur, buf_size, "%s", status);
+  mstat->cur += status_len;
+}
+void DumpMallocStats(std::string* stats) {
+  if (!HasJemalloc()) {
+    return;
+  }
+  MallocStatus mstat;
+  const unsigned int kMallocStatusLen = 1000000;
+  std::unique_ptr<char[]> buf{new char[kMallocStatusLen + 1]};
+  mstat.cur = buf.get();
+  mstat.end = buf.get() + kMallocStatusLen;
+  malloc_stats_print(GetJemallocStatus, &mstat, "");
+  stats->append(buf.get());
+}
+#else
+void DumpMallocStats(std::string*) {}
+#endif  // ROCKSDB_JEMALLOC
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/malloc_stats.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/malloc_stats.h
new file mode 100644
index 0000000000..1cca8a9522
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/malloc_stats.h
@@ -0,0 +1,22 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DumpMallocStats(std::string*);
+
+}
+
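DumpMallocStats relies on jemalloc's malloc_stats_print, which streams its report through a caller-supplied callback; the MallocStatus cursor above simply copies each chunk that still fits and silently drops the rest. A self-contained sketch of the same cursor-and-end pattern, with hypothetical names; only the truncation logic mirrors GetJemallocStatus (using >= so snprintf's trailing NUL always fits):

#include <cstdio>
#include <cstring>

struct CursorBuf {
  char* cur;
  char* end;
};

static void AppendChunk(void* arg, const char* chunk) {
  CursorBuf* buf = static_cast<CursorBuf*>(arg);
  size_t len = chunk ? strlen(chunk) : 0;
  size_t room = static_cast<size_t>(buf->end - buf->cur);
  if (len == 0 || len >= room) {
    return;  // drop chunks that would overflow, as the vendored code does
  }
  snprintf(buf->cur, room, "%s", chunk);
  buf->cur += len;
}

int main() {
  char storage[16];
  CursorBuf buf{storage, storage + sizeof(storage)};
  AppendChunk(&buf, "arena: ");
  AppendChunk(&buf, "ok\n");
  AppendChunk(&buf, "this chunk is far too long to fit and is dropped");
  printf("%s", storage);  // prints "arena: ok" and a newline
  return 0;
}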
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable.cc
new file mode 100644
index 0000000000..3801b67a9b
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable.cc
@@ -0,0 +1,1675 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <memory>
+
+#include "db/dbformat.h"
+#include "db/kv_checksum.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/read_callback.h"
+#include "db/wide/wide_column_serialization.h"
+#include "logging/logging.h"
+#include "memory/arena.h"
+#include "memory/memory_usage.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics_impl.h"
+#include "port/lang.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "table/merging_iterator.h"
+#include "util/autovector.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ImmutableMemTableOptions::ImmutableMemTableOptions(
+    const ImmutableOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options)
+    : arena_block_size(mutable_cf_options.arena_block_size),
+      memtable_prefix_bloom_bits(
+          static_cast<uint32_t>(
+              static_cast<double>(mutable_cf_options.write_buffer_size) *
+              mutable_cf_options.memtable_prefix_bloom_size_ratio) *
+          8u),
+      memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size),
+      memtable_whole_key_filtering(
+          mutable_cf_options.memtable_whole_key_filtering),
+      inplace_update_support(ioptions.inplace_update_support),
+      inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
+      inplace_callback(ioptions.inplace_callback),
+      max_successive_merges(mutable_cf_options.max_successive_merges),
+      statistics(ioptions.stats),
+      merge_operator(ioptions.merge_operator.get()),
+      info_log(ioptions.logger),
+      allow_data_in_errors(ioptions.allow_data_in_errors),
+      protection_bytes_per_key(
+          mutable_cf_options.memtable_protection_bytes_per_key) {}
+
+MemTable::MemTable(const InternalKeyComparator& cmp,
+                   const ImmutableOptions& ioptions,
+                   const MutableCFOptions& mutable_cf_options,
+                   WriteBufferManager* write_buffer_manager,
+                   SequenceNumber latest_seq, uint32_t column_family_id)
+    : comparator_(cmp),
+      moptions_(ioptions, mutable_cf_options),
+      refs_(0),
+      kArenaBlockSize(Arena::OptimizeBlockSize(moptions_.arena_block_size)),
+      mem_tracker_(write_buffer_manager),
+      arena_(moptions_.arena_block_size,
+             (write_buffer_manager != nullptr &&
+              (write_buffer_manager->enabled() ||
+               write_buffer_manager->cost_to_cache()))
+                 ? &mem_tracker_
+                 : nullptr,
+             mutable_cf_options.memtable_huge_page_size),
+      table_(ioptions.memtable_factory->CreateMemTableRep(
+          comparator_, &arena_, mutable_cf_options.prefix_extractor.get(),
+          ioptions.logger, column_family_id)),
+      range_del_table_(SkipListFactory().CreateMemTableRep(
+          comparator_, &arena_, nullptr /* transform */, ioptions.logger,
+          column_family_id)),
+      is_range_del_table_empty_(true),
+      data_size_(0),
+      num_entries_(0),
+      num_deletes_(0),
+      write_buffer_size_(mutable_cf_options.write_buffer_size),
+      flush_in_progress_(false),
+      flush_completed_(false),
+      file_number_(0),
+      first_seqno_(0),
+      earliest_seqno_(latest_seq),
+      creation_seq_(latest_seq),
+      mem_next_logfile_number_(0),
+      min_prep_log_referenced_(0),
+      locks_(moptions_.inplace_update_support
+                 ? moptions_.inplace_update_num_locks
+                 : 0),
+      prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+      flush_state_(FLUSH_NOT_REQUESTED),
+      clock_(ioptions.clock),
+      insert_with_hint_prefix_extractor_(
+          ioptions.memtable_insert_with_hint_prefix_extractor.get()),
+      oldest_key_time_(std::numeric_limits<uint64_t>::max()),
+      atomic_flush_seqno_(kMaxSequenceNumber),
+      approximate_memory_usage_(0) {
+  UpdateFlushState();
+  // something went wrong if we need to flush before inserting anything
+  assert(!ShouldScheduleFlush());
+
+  // use bloom_filter_ for both whole key and prefix bloom filter
+  if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) &&
+      moptions_.memtable_prefix_bloom_bits > 0) {
+    bloom_filter_.reset(
+        new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits,
+                         6 /* hard coded 6 probes */,
+                         moptions_.memtable_huge_page_size, ioptions.logger));
+  }
+  // Initialize cached_range_tombstone_ here since it could
+  // be read before it is constructed in MemTable::Add(), which could also lead
+  // to a data race on the global mutex table backing atomic shared_ptr.
+  auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
+  size_t size = cached_range_tombstone_.Size();
+  for (size_t i = 0; i < size; ++i) {
+    std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
+        cached_range_tombstone_.AccessAtCore(i);
+    auto new_local_cache_ref = std::make_shared<
+        const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+    std::atomic_store_explicit(
+        local_cache_ref_ptr,
+        std::shared_ptr<FragmentedRangeTombstoneListCache>(new_local_cache_ref,
+                                                           new_cache.get()),
+        std::memory_order_relaxed);
+  }
+}
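The bloom sizing fed into this constructor is worth spelling out: memtable_prefix_bloom_size_ratio carves a byte budget out of the write buffer, and the filter gets 8 bits per budgeted byte. A back-of-the-envelope check of that arithmetic with assumed option values (a 64 MiB buffer and ratio 0.1; not from the sources):

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t write_buffer_size = 64ull << 20;       // 64 MiB memtable budget
  double memtable_prefix_bloom_size_ratio = 0.1;  // 10% of the budget
  uint32_t bloom_bits =
      static_cast<uint32_t>(static_cast<double>(write_buffer_size) *
                            memtable_prefix_bloom_size_ratio) *
      8u;
  // 67108864 * 0.1 = 6710886 bytes; * 8 = 53687088 bits (~6.4 MiB of filter)
  printf("bloom bits: %u\n", bloom_bits);
  return 0;
}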
+
+MemTable::~MemTable() {
+  mem_tracker_.FreeMem();
+  assert(refs_ == 0);
+}
+
+size_t MemTable::ApproximateMemoryUsage() {
+  autovector<size_t> usages = {
+      arena_.ApproximateMemoryUsage(), table_->ApproximateMemoryUsage(),
+      range_del_table_->ApproximateMemoryUsage(),
+      ROCKSDB_NAMESPACE::ApproximateMemoryUsage(insert_hints_)};
+  size_t total_usage = 0;
+  for (size_t usage : usages) {
+    // If usage + total_usage >= kMaxSizet, return kMaxSizet.
+    // the following variation is to avoid numeric overflow.
+    if (usage >= std::numeric_limits<size_t>::max() - total_usage) {
+      return std::numeric_limits<size_t>::max();
+    }
+    total_usage += usage;
+  }
+  approximate_memory_usage_.store(total_usage, std::memory_order_relaxed);
+  // otherwise, return the actual usage
+  return total_usage;
+}
+
+bool MemTable::ShouldFlushNow() {
+  size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
+  // Often we cannot allocate arena blocks that exactly match the buffer
+  // size. Thus we have to decide if we should over-allocate or
+  // under-allocate.
+  // This constant variable can be interpreted as: if we still have more than
+  // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to over
+  // allocate one more block.
+  const double kAllowOverAllocationRatio = 0.6;
+
+  // If the arena still has room for new block allocation, we can safely say
+  // it shouldn't flush.
+  auto allocated_memory = table_->ApproximateMemoryUsage() +
+                          range_del_table_->ApproximateMemoryUsage() +
+                          arena_.MemoryAllocatedBytes();
+
+  approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed);
+
+  // if we can still allocate one more block without exceeding the
+  // over-allocation ratio, then we should not flush.
+  if (allocated_memory + kArenaBlockSize <
+      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+    return false;
+  }
+
+  // if the user keeps adding entries that exceed write_buffer_size, we need
+  // to flush earlier even though we still have much available memory left.
+  if (allocated_memory >
+      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+    return true;
+  }
+
+  // In this code path, Arena has already allocated its "last block", which
+  // means the total allocated memory size is either:
+  //  (1) "moderately" over-allocated the memory (no more than `0.6 * arena
+  //  block size`). Or,
+  //  (2) the allocated memory is less than write buffer size, but we'll stop
+  //  here since if we allocate a new arena block, we'll over-allocate too much
+  //  more (half of the arena block size) memory.
+  //
+  // In either case, to avoid over-allocating, the last block will stop
+  // allocation when its usage reaches a certain ratio, which we carefully
+  // choose "0.75 full" as the stop condition because it addresses the
+  // following issue with great simplicity: What if the next inserted entry's
+  // size is bigger than AllocatedAndUnused()?
+  //
+  // The answer is: if the entry size is also bigger than 0.25 *
+  // kArenaBlockSize, a dedicated block will be allocated for it; otherwise
+  // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
+  // and regular block. In either case, we *overly* over-allocated.
+  //
+  // Therefore, setting the last block to be at most "0.75 full" avoids both
+  // cases.
+  //
+  // NOTE: the average percentage of wasted space of this approach can be
+  // counted as: "arena block size * 0.25 / write buffer size". Users who
+  // specify a small write buffer size and/or big arena block size may suffer.
+  return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
+}
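Plugging assumed numbers into ShouldFlushNow() makes the three bands visible: with a 64 MiB write buffer and 8 MiB arena blocks (roughly the 1/8-of-buffer default), anything under about 60.8 MiB allocated keeps writing, anything over about 68.8 MiB flushes, and the band in between defers to how full the last arena block is. A sketch of the same arithmetic outside the class, with made-up inputs:

#include <cstdint>
#include <cstdio>

int main() {
  const double kAllowOverAllocationRatio = 0.6;
  uint64_t write_buffer_size = 64ull << 20;  // 64 MiB
  uint64_t arena_block_size = 8ull << 20;    // 8 MiB
  uint64_t allocated = 62ull << 20;          // current arena footprint
  uint64_t unused = 3ull << 20;              // AllocatedAndUnused()

  if (allocated + arena_block_size <
      write_buffer_size + arena_block_size * kAllowOverAllocationRatio) {
    printf("room for another block: keep writing\n");  // below ~60.8 MiB
  } else if (allocated > write_buffer_size +
                             arena_block_size * kAllowOverAllocationRatio) {
    printf("over budget: flush\n");                    // above ~68.8 MiB
  } else {
    // Middle band: flush once the last block is roughly 75% full.
    printf(unused < arena_block_size / 4 ? "flush\n" : "keep writing\n");
  }
  return 0;
}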
+
+void MemTable::UpdateFlushState() {
+  auto state = flush_state_.load(std::memory_order_relaxed);
+  if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) {
+    // ignore CAS failure, because that means somebody else requested
+    // a flush
+    flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED,
+                                         std::memory_order_relaxed,
+                                         std::memory_order_relaxed);
+  }
+}
+
+void MemTable::UpdateOldestKeyTime() {
+  uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed);
+  if (oldest_key_time == std::numeric_limits<uint64_t>::max()) {
+    int64_t current_time = 0;
+    auto s = clock_->GetCurrentTime(&current_time);
+    if (s.ok()) {
+      assert(current_time >= 0);
+      // If fail, the timestamp is already set.
+ oldest_key_time_.compare_exchange_strong( + oldest_key_time, static_cast(current_time), + std::memory_order_relaxed, std::memory_order_relaxed); + } + } +} + +Status MemTable::VerifyEntryChecksum(const char* entry, + uint32_t protection_bytes_per_key, + bool allow_data_in_errors) { + if (protection_bytes_per_key == 0) { + return Status::OK(); + } + uint32_t key_length; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (key_ptr == nullptr) { + return Status::Corruption("Unable to parse internal key length"); + } + if (key_length < 8) { + return Status::Corruption("Memtable entry internal key length too short."); + } + Slice user_key = Slice(key_ptr, key_length - 8); + + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + ValueType type; + SequenceNumber seq; + UnPackSequenceAndType(tag, &seq, &type); + + uint32_t value_length = 0; + const char* value_ptr = GetVarint32Ptr( + key_ptr + key_length, key_ptr + key_length + 5, &value_length); + if (value_ptr == nullptr) { + return Status::Corruption("Unable to parse internal key value"); + } + Slice value = Slice(value_ptr, value_length); + + const char* checksum_ptr = value_ptr + value_length; + bool match = + ProtectionInfo64() + .ProtectKVO(user_key, value, type) + .ProtectS(seq) + .Verify(static_cast(protection_bytes_per_key), checksum_ptr); + if (!match) { + std::string msg( + "Corrupted memtable entry, per key-value checksum verification " + "failed."); + if (allow_data_in_errors) { + msg.append("Unrecognized value type: " + + std::to_string(static_cast(type)) + ". "); + msg.append("User key: " + user_key.ToString(/*hex=*/true) + ". "); + msg.append("seq: " + std::to_string(seq) + "."); + } + return Status::Corruption(msg.c_str()); + } + return Status::OK(); +} + +int MemTable::KeyComparator::operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const { + // Internal keys are encoded as length-prefixed strings. + Slice k1 = GetLengthPrefixedSlice(prefix_len_key1); + Slice k2 = GetLengthPrefixedSlice(prefix_len_key2); + return comparator.CompareKeySeq(k1, k2); +} + +int MemTable::KeyComparator::operator()( + const char* prefix_len_key, const KeyComparator::DecodedType& key) const { + // Internal keys are encoded as length-prefixed strings. + Slice a = GetLengthPrefixedSlice(prefix_len_key); + return comparator.CompareKeySeq(a, key); +} + +void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) { + throw std::runtime_error("concurrent insert not supported"); +} + +Slice MemTableRep::UserKey(const char* key) const { + Slice slice = GetLengthPrefixedSlice(key); + return Slice(slice.data(), slice.size() - 8); +} + +KeyHandle MemTableRep::Allocate(const size_t len, char** buf) { + *buf = allocator_->Allocate(len); + return static_cast(*buf); +} + +// Encode a suitable internal key target for "target" and return it. +// Uses *scratch as scratch space, and the returned pointer will point +// into this scratch space. 
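+// For example, for target "abc" the scratch buffer ends up holding the four
+// bytes {0x03, 'a', 'b', 'c'}: a one-byte varint32 length followed by the
+// key bytes themselves.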
+const char* EncodeKey(std::string* scratch, const Slice& target) { + scratch->clear(); + PutVarint32(scratch, static_cast(target.size())); + scratch->append(target.data(), target.size()); + return scratch->data(); +} + +class MemTableIterator : public InternalIterator { + public: + MemTableIterator(const MemTable& mem, const ReadOptions& read_options, + Arena* arena, bool use_range_del_table = false) + : bloom_(nullptr), + prefix_extractor_(mem.prefix_extractor_), + comparator_(mem.comparator_), + valid_(false), + arena_mode_(arena != nullptr), + value_pinned_( + !mem.GetImmutableMemTableOptions()->inplace_update_support), + protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key), + status_(Status::OK()), + logger_(mem.moptions_.info_log) { + if (use_range_del_table) { + iter_ = mem.range_del_table_->GetIterator(arena); + } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek && + !read_options.auto_prefix_mode) { + // Auto prefix mode is not implemented in memtable yet. + bloom_ = mem.bloom_filter_.get(); + iter_ = mem.table_->GetDynamicPrefixIterator(arena); + } else { + iter_ = mem.table_->GetIterator(arena); + } + status_.PermitUncheckedError(); + } + // No copying allowed + MemTableIterator(const MemTableIterator&) = delete; + void operator=(const MemTableIterator&) = delete; + + ~MemTableIterator() override { +#ifndef NDEBUG + // Assert that the MemTableIterator is never deleted while + // Pinning is Enabled. + assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled()); +#endif + if (arena_mode_) { + iter_->~Iterator(); + } else { + delete iter_; + } + } + +#ifndef NDEBUG + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; +#endif + + bool Valid() const override { return valid_ && status_.ok(); } + void Seek(const Slice& k) override { + PERF_TIMER_GUARD(seek_on_memtable_time); + PERF_COUNTER_ADD(seek_on_memtable_count, 1); + if (bloom_) { + // iterator should only use prefix bloom filter + auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + if (prefix_extractor_->InDomain(user_k_without_ts)) { + if (!bloom_->MayContain( + prefix_extractor_->Transform(user_k_without_ts))) { + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + valid_ = false; + return; + } else { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + } + } + iter_->Seek(k, nullptr); + valid_ = iter_->Valid(); + VerifyEntryChecksum(); + } + void SeekForPrev(const Slice& k) override { + PERF_TIMER_GUARD(seek_on_memtable_time); + PERF_COUNTER_ADD(seek_on_memtable_count, 1); + if (bloom_) { + auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + if (prefix_extractor_->InDomain(user_k_without_ts)) { + if (!bloom_->MayContain( + prefix_extractor_->Transform(user_k_without_ts))) { + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + valid_ = false; + return; + } else { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + } + } + iter_->Seek(k, nullptr); + valid_ = iter_->Valid(); + VerifyEntryChecksum(); + if (!Valid() && status().ok()) { + SeekToLast(); + } + while (Valid() && comparator_.comparator.Compare(k, key()) < 0) { + Prev(); + } + } + void SeekToFirst() override { + iter_->SeekToFirst(); + valid_ = iter_->Valid(); + VerifyEntryChecksum(); + } + void SeekToLast() override { + 
iter_->SeekToLast(); + valid_ = iter_->Valid(); + VerifyEntryChecksum(); + } + void Next() override { + PERF_COUNTER_ADD(next_on_memtable_count, 1); + assert(Valid()); + iter_->Next(); + TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_); + valid_ = iter_->Valid(); + VerifyEntryChecksum(); + } + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->bound_check_result = IterBoundCheck::kUnknown; + result->value_prepared = true; + } + return is_valid; + } + void Prev() override { + PERF_COUNTER_ADD(prev_on_memtable_count, 1); + assert(Valid()); + iter_->Prev(); + valid_ = iter_->Valid(); + VerifyEntryChecksum(); + } + Slice key() const override { + assert(Valid()); + return GetLengthPrefixedSlice(iter_->key()); + } + Slice value() const override { + assert(Valid()); + Slice key_slice = GetLengthPrefixedSlice(iter_->key()); + return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); + } + + Status status() const override { return status_; } + + bool IsKeyPinned() const override { + // memtable data is always pinned + return true; + } + + bool IsValuePinned() const override { + // memtable value is always pinned, except if we allow inplace update. + return value_pinned_; + } + + private: + DynamicBloom* bloom_; + const SliceTransform* const prefix_extractor_; + const MemTable::KeyComparator comparator_; + MemTableRep::Iterator* iter_; + bool valid_; + bool arena_mode_; + bool value_pinned_; + uint32_t protection_bytes_per_key_; + Status status_; + Logger* logger_; + + void VerifyEntryChecksum() { + if (protection_bytes_per_key_ > 0 && Valid()) { + status_ = MemTable::VerifyEntryChecksum(iter_->key(), + protection_bytes_per_key_); + if (!status_.ok()) { + ROCKS_LOG_ERROR(logger_, "In MemtableIterator: %s", status_.getState()); + } + } + } +}; + +InternalIterator* MemTable::NewIterator(const ReadOptions& read_options, + Arena* arena) { + assert(arena != nullptr); + auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); + return new (mem) MemTableIterator(*this, read_options, arena); +} + +FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( + const ReadOptions& read_options, SequenceNumber read_seq, + bool immutable_memtable) { + if (read_options.ignore_range_deletions || + is_range_del_table_empty_.load(std::memory_order_relaxed)) { + return nullptr; + } + return NewRangeTombstoneIteratorInternal(read_options, read_seq, + immutable_memtable); +} + +FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal( + const ReadOptions& read_options, SequenceNumber read_seq, + bool immutable_memtable) { + if (immutable_memtable) { + // Note that caller should already have verified that + // !is_range_del_table_empty_ + assert(IsFragmentedRangeTombstonesConstructed()); + return new FragmentedRangeTombstoneIterator( + fragmented_range_tombstone_list_.get(), comparator_.comparator, + read_seq, read_options.timestamp); + } + + // takes current cache + std::shared_ptr cache = + std::atomic_load_explicit(cached_range_tombstone_.Access(), + std::memory_order_relaxed); + // construct fragmented tombstone list if necessary + if (!cache->initialized.load(std::memory_order_acquire)) { + cache->reader_mutex.lock(); + if (!cache->tombstones) { + auto* unfragmented_iter = + new MemTableIterator(*this, read_options, nullptr /* arena */, + true /* use_range_del_table */); + cache->tombstones.reset(new FragmentedRangeTombstoneList( + std::unique_ptr(unfragmented_iter), + 
comparator_.comparator)); + cache->initialized.store(true, std::memory_order_release); + } + cache->reader_mutex.unlock(); + } + + auto* fragmented_iter = new FragmentedRangeTombstoneIterator( + cache, comparator_.comparator, read_seq, read_options.timestamp); + return fragmented_iter; +} + +void MemTable::ConstructFragmentedRangeTombstones() { + assert(!IsFragmentedRangeTombstonesConstructed(false)); + // There should be no concurrent Construction + if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) { + // TODO: plumb Env::IOActivity + auto* unfragmented_iter = + new MemTableIterator(*this, ReadOptions(), nullptr /* arena */, + true /* use_range_del_table */); + + fragmented_range_tombstone_list_ = + std::make_unique( + std::unique_ptr(unfragmented_iter), + comparator_.comparator); + } +} + +port::RWMutex* MemTable::GetLock(const Slice& key) { + return &locks_[GetSliceRangedNPHash(key, locks_.size())]; +} + +MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey, + const Slice& end_ikey) { + uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey); + entry_count += range_del_table_->ApproximateNumEntries(start_ikey, end_ikey); + if (entry_count == 0) { + return {0, 0}; + } + uint64_t n = num_entries_.load(std::memory_order_relaxed); + if (n == 0) { + return {0, 0}; + } + if (entry_count > n) { + // (range_del_)table_->ApproximateNumEntries() is just an estimate so it can + // be larger than actual entries we have. Cap it to entries we have to limit + // the inaccuracy. + entry_count = n; + } + uint64_t data_size = data_size_.load(std::memory_order_relaxed); + return {entry_count * (data_size / n), entry_count}; +} + +Status MemTable::VerifyEncodedEntry(Slice encoded, + const ProtectionInfoKVOS64& kv_prot_info) { + uint32_t ikey_len = 0; + if (!GetVarint32(&encoded, &ikey_len)) { + return Status::Corruption("Unable to parse internal key length"); + } + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + if (ikey_len < 8 + ts_sz) { + return Status::Corruption("Internal key length too short"); + } + if (ikey_len > encoded.size()) { + return Status::Corruption("Internal key length too long"); + } + uint32_t value_len = 0; + const size_t user_key_len = ikey_len - 8; + Slice key(encoded.data(), user_key_len); + encoded.remove_prefix(user_key_len); + + uint64_t packed = DecodeFixed64(encoded.data()); + ValueType value_type = kMaxValue; + SequenceNumber sequence_number = kMaxSequenceNumber; + UnPackSequenceAndType(packed, &sequence_number, &value_type); + encoded.remove_prefix(8); + + if (!GetVarint32(&encoded, &value_len)) { + return Status::Corruption("Unable to parse value length"); + } + if (value_len < encoded.size()) { + return Status::Corruption("Value length too short"); + } + if (value_len > encoded.size()) { + return Status::Corruption("Value length too long"); + } + Slice value(encoded.data(), value_len); + + return kv_prot_info.StripS(sequence_number) + .StripKVO(key, value, value_type) + .GetStatus(); +} + +void MemTable::UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info, + const Slice& key, const Slice& value, + ValueType type, SequenceNumber s, + char* checksum_ptr) { + if (moptions_.protection_bytes_per_key == 0) { + return; + } + + if (kv_prot_info == nullptr) { + ProtectionInfo64() + .ProtectKVO(key, value, type) + .ProtectS(s) + .Encode(static_cast(moptions_.protection_bytes_per_key), + checksum_ptr); + } else { + kv_prot_info->Encode( + static_cast(moptions_.protection_bytes_per_key), 
checksum_ptr); + } +} + +Status MemTable::Add(SequenceNumber s, ValueType type, + const Slice& key, /* user key */ + const Slice& value, + const ProtectionInfoKVOS64* kv_prot_info, + bool allow_concurrent, + MemTablePostProcessInfo* post_process_info, void** hint) { + // Format of an entry is concatenation of: + // key_size : varint32 of internal_key.size() + // key bytes : char[internal_key.size()] + // value_size : varint32 of value.size() + // value bytes : char[value.size()] + // checksum : char[moptions_.protection_bytes_per_key] + uint32_t key_size = static_cast(key.size()); + uint32_t val_size = static_cast(value.size()); + uint32_t internal_key_size = key_size + 8; + const uint32_t encoded_len = VarintLength(internal_key_size) + + internal_key_size + VarintLength(val_size) + + val_size + moptions_.protection_bytes_per_key; + char* buf = nullptr; + std::unique_ptr& table = + type == kTypeRangeDeletion ? range_del_table_ : table_; + KeyHandle handle = table->Allocate(encoded_len, &buf); + + char* p = EncodeVarint32(buf, internal_key_size); + memcpy(p, key.data(), key_size); + Slice key_slice(p, key_size); + p += key_size; + uint64_t packed = PackSequenceAndType(s, type); + EncodeFixed64(p, packed); + p += 8; + p = EncodeVarint32(p, val_size); + memcpy(p, value.data(), val_size); + assert((unsigned)(p + val_size - buf + moptions_.protection_bytes_per_key) == + (unsigned)encoded_len); + + UpdateEntryChecksum(kv_prot_info, key, value, type, s, + buf + encoded_len - moptions_.protection_bytes_per_key); + Slice encoded(buf, encoded_len - moptions_.protection_bytes_per_key); + if (kv_prot_info != nullptr) { + TEST_SYNC_POINT_CALLBACK("MemTable::Add:Encoded", &encoded); + Status status = VerifyEncodedEntry(encoded, *kv_prot_info); + if (!status.ok()) { + return status; + } + } + + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + Slice key_without_ts = StripTimestampFromUserKey(key, ts_sz); + + if (!allow_concurrent) { + // Extract prefix for insert with hint. 
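+    // Keys that share a prefix tend to arrive together; the per-prefix hint
+    // remembers where the previous insert landed so the memtable rep can
+    // resume its search from there instead of from the top.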
+ if (insert_with_hint_prefix_extractor_ != nullptr && + insert_with_hint_prefix_extractor_->InDomain(key_slice)) { + Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice); + bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]); + if (UNLIKELY(!res)) { + return Status::TryAgain("key+seq exists"); + } + } else { + bool res = table->InsertKey(handle); + if (UNLIKELY(!res)) { + return Status::TryAgain("key+seq exists"); + } + } + + // this is a bit ugly, but is the way to avoid locked instructions + // when incrementing an atomic + num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1, + std::memory_order_relaxed); + data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len, + std::memory_order_relaxed); + if (type == kTypeDeletion || type == kTypeSingleDeletion || + type == kTypeDeletionWithTimestamp) { + num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1, + std::memory_order_relaxed); + } + + if (bloom_filter_ && prefix_extractor_ && + prefix_extractor_->InDomain(key_without_ts)) { + bloom_filter_->Add(prefix_extractor_->Transform(key_without_ts)); + } + if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { + bloom_filter_->Add(key_without_ts); + } + + // The first sequence number inserted into the memtable + assert(first_seqno_ == 0 || s >= first_seqno_); + if (first_seqno_ == 0) { + first_seqno_.store(s, std::memory_order_relaxed); + + if (earliest_seqno_ == kMaxSequenceNumber) { + earliest_seqno_.store(GetFirstSequenceNumber(), + std::memory_order_relaxed); + } + assert(first_seqno_.load() >= earliest_seqno_.load()); + } + assert(post_process_info == nullptr); + UpdateFlushState(); + } else { + bool res = (hint == nullptr) + ? table->InsertKeyConcurrently(handle) + : table->InsertKeyWithHintConcurrently(handle, hint); + if (UNLIKELY(!res)) { + return Status::TryAgain("key+seq exists"); + } + + assert(post_process_info != nullptr); + post_process_info->num_entries++; + post_process_info->data_size += encoded_len; + if (type == kTypeDeletion) { + post_process_info->num_deletes++; + } + + if (bloom_filter_ && prefix_extractor_ && + prefix_extractor_->InDomain(key_without_ts)) { + bloom_filter_->AddConcurrently( + prefix_extractor_->Transform(key_without_ts)); + } + if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { + bloom_filter_->AddConcurrently(key_without_ts); + } + + // atomically update first_seqno_ and earliest_seqno_. + uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed); + while ((cur_seq_num == 0 || s < cur_seq_num) && + !first_seqno_.compare_exchange_weak(cur_seq_num, s)) { + } + uint64_t cur_earliest_seqno = + earliest_seqno_.load(std::memory_order_relaxed); + while ( + (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) && + !first_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) { + } + } + if (type == kTypeRangeDeletion) { + auto new_cache = std::make_shared(); + size_t size = cached_range_tombstone_.Size(); + if (allow_concurrent) { + range_del_mutex_.lock(); + } + for (size_t i = 0; i < size; ++i) { + std::shared_ptr* local_cache_ref_ptr = + cached_range_tombstone_.AccessAtCore(i); + auto new_local_cache_ref = std::make_shared< + const std::shared_ptr>(new_cache); + // It is okay for some reader to load old cache during invalidation as + // the new sequence number is not published yet. 
+ // Each core will have a shared_ptr to a shared_ptr to the cached + // fragmented range tombstones, so that ref count is maintianed locally + // per-core using the per-core shared_ptr. + std::atomic_store_explicit( + local_cache_ref_ptr, + std::shared_ptr( + new_local_cache_ref, new_cache.get()), + std::memory_order_relaxed); + } + if (allow_concurrent) { + range_del_mutex_.unlock(); + } + is_range_del_table_empty_.store(false, std::memory_order_relaxed); + } + UpdateOldestKeyTime(); + + TEST_SYNC_POINT_CALLBACK("MemTable::Add:BeforeReturn:Encoded", &encoded); + return Status::OK(); +} + +// Callback from MemTable::Get() +namespace { + +struct Saver { + Status* status; + const LookupKey* key; + bool* found_final_value; // Is value set correctly? Used by KeyMayExist + bool* merge_in_progress; + std::string* value; + PinnableWideColumns* columns; + SequenceNumber seq; + std::string* timestamp; + const MergeOperator* merge_operator; + // the merge operations encountered; + MergeContext* merge_context; + SequenceNumber max_covering_tombstone_seq; + MemTable* mem; + Logger* logger; + Statistics* statistics; + bool inplace_update_support; + bool do_merge; + SystemClock* clock; + + ReadCallback* callback_; + bool* is_blob_index; + bool allow_data_in_errors; + uint32_t protection_bytes_per_key; + bool CheckCallback(SequenceNumber _seq) { + if (callback_) { + return callback_->IsVisible(_seq); + } + return true; + } +}; +} // anonymous namespace + +static bool SaveValue(void* arg, const char* entry) { + TEST_SYNC_POINT_CALLBACK("Memtable::SaveValue:Begin:entry", &entry); + Saver* s = reinterpret_cast(arg); + assert(s != nullptr); + assert(!s->value || !s->columns); + + if (s->protection_bytes_per_key > 0) { + *(s->status) = MemTable::VerifyEntryChecksum( + entry, s->protection_bytes_per_key, s->allow_data_in_errors); + if (!s->status->ok()) { + ROCKS_LOG_ERROR(s->logger, "In SaveValue: %s", s->status->getState()); + // Memtable entry corrupted + return false; + } + } + + MergeContext* merge_context = s->merge_context; + SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq; + const MergeOperator* merge_operator = s->merge_operator; + + assert(merge_context != nullptr); + + // Refer to comments under MemTable::Add() for entry format. + // Check that it belongs to same user key. + uint32_t key_length = 0; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + assert(key_length >= 8); + Slice user_key_slice = Slice(key_ptr, key_length - 8); + const Comparator* user_comparator = + s->mem->GetInternalKeyComparator().user_comparator(); + size_t ts_sz = user_comparator->timestamp_size(); + if (ts_sz && s->timestamp && max_covering_tombstone_seq > 0) { + // timestamp should already be set to range tombstone timestamp + assert(s->timestamp->size() == ts_sz); + } + if (user_comparator->EqualWithoutTimestamp(user_key_slice, + s->key->user_key())) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + ValueType type; + SequenceNumber seq; + UnPackSequenceAndType(tag, &seq, &type); + // If the value is not in the snapshot, skip it + if (!s->CheckCallback(seq)) { + return true; // to continue to the next seq + } + + if (s->seq == kMaxSequenceNumber) { + s->seq = seq; + if (s->seq > max_covering_tombstone_seq) { + if (ts_sz && s->timestamp != nullptr) { + // `timestamp` was set to range tombstone's timestamp before + // `SaveValue` is ever called. 
This key has a higher sequence number + // than range tombstone, and is the key with the highest seqno across + // all keys with this user_key, so we update timestamp here. + Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz); + s->timestamp->assign(ts.data(), ts_sz); + } + } else { + s->seq = max_covering_tombstone_seq; + } + } + + if (ts_sz > 0 && s->timestamp != nullptr) { + if (!s->timestamp->empty()) { + assert(ts_sz == s->timestamp->size()); + } + // TODO optimize for smaller size ts + const std::string kMaxTs(ts_sz, '\xff'); + if (s->timestamp->empty() || + user_comparator->CompareTimestamp(*(s->timestamp), kMaxTs) == 0) { + Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz); + s->timestamp->assign(ts.data(), ts_sz); + } + } + + if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex || + type == kTypeWideColumnEntity || type == kTypeDeletion || + type == kTypeSingleDeletion || type == kTypeDeletionWithTimestamp) && + max_covering_tombstone_seq > seq) { + type = kTypeRangeDeletion; + } + switch (type) { + case kTypeBlobIndex: { + if (!s->do_merge) { + *(s->status) = Status::NotSupported( + "GetMergeOperands not supported by stacked BlobDB"); + *(s->found_final_value) = true; + return false; + } + + if (*(s->merge_in_progress)) { + *(s->status) = Status::NotSupported( + "Merge operator not supported by stacked BlobDB"); + *(s->found_final_value) = true; + return false; + } + + if (s->is_blob_index == nullptr) { + ROCKS_LOG_ERROR(s->logger, "Encountered unexpected blob index."); + *(s->status) = Status::NotSupported( + "Encountered unexpected blob index. Please open DB with " + "ROCKSDB_NAMESPACE::blob_db::BlobDB."); + *(s->found_final_value) = true; + return false; + } + + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadLock(); + } + + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + + *(s->status) = Status::OK(); + + if (s->value) { + s->value->assign(v.data(), v.size()); + } else if (s->columns) { + s->columns->SetPlainValue(v); + } + + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadUnlock(); + } + + *(s->found_final_value) = true; + *(s->is_blob_index) = true; + + return false; + } + case kTypeValue: { + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadLock(); + } + + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + + *(s->status) = Status::OK(); + + if (!s->do_merge) { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + // TODO(yanqin) update MergeContext so that timestamps information + // can also be retained. + + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); + } else if (*(s->merge_in_progress)) { + assert(s->do_merge); + + if (s->value || s->columns) { + std::string result; + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its + // value. 
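+            // TimedFullMerge resolves the base value `v` against the
+            // operands accumulated in merge_context, timing the operation
+            // for the statistics counters.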
+ *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), &v, + merge_context->GetOperands(), &result, s->logger, s->statistics, + s->clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + + if (s->status->ok()) { + if (s->value) { + *(s->value) = std::move(result); + } else { + assert(s->columns); + s->columns->SetPlainValue(std::move(result)); + } + } + } + } else if (s->value) { + s->value->assign(v.data(), v.size()); + } else if (s->columns) { + s->columns->SetPlainValue(v); + } + + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadUnlock(); + } + + *(s->found_final_value) = true; + + if (s->is_blob_index != nullptr) { + *(s->is_blob_index) = false; + } + + return false; + } + case kTypeWideColumnEntity: { + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadLock(); + } + + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + + *(s->status) = Status::OK(); + + if (!s->do_merge) { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + + Slice value_of_default; + *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn( + v, value_of_default); + + if (s->status->ok()) { + merge_context->PushOperand( + value_of_default, + s->inplace_update_support == false /* operand_pinned */); + } + } else if (*(s->merge_in_progress)) { + assert(s->do_merge); + + if (s->value) { + Slice value_of_default; + *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn( + v, value_of_default); + if (s->status->ok()) { + // `op_failure_scope` (an output parameter) is not provided (set + // to nullptr) since a failure must be propagated regardless of + // its value. + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), &value_of_default, + merge_context->GetOperands(), s->value, s->logger, + s->statistics, s->clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + } + } else if (s->columns) { + std::string result; + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its + // value. + *(s->status) = MergeHelper::TimedFullMergeWithEntity( + merge_operator, s->key->user_key(), v, + merge_context->GetOperands(), &result, s->logger, s->statistics, + s->clock, /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + + if (s->status->ok()) { + *(s->status) = s->columns->SetWideColumnValue(std::move(result)); + } + } + } else if (s->value) { + Slice value_of_default; + *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn( + v, value_of_default); + if (s->status->ok()) { + s->value->assign(value_of_default.data(), value_of_default.size()); + } + } else if (s->columns) { + *(s->status) = s->columns->SetWideColumnValue(v); + } + + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadUnlock(); + } + + *(s->found_final_value) = true; + + if (s->is_blob_index != nullptr) { + *(s->is_blob_index) = false; + } + + return false; + } + case kTypeDeletion: + case kTypeDeletionWithTimestamp: + case kTypeSingleDeletion: + case kTypeRangeDeletion: { + if (*(s->merge_in_progress)) { + if (s->value || s->columns) { + std::string result; + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its + // value. 
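+            // The existing-value argument is nullptr here: the deletion
+            // means the merge chain has no base value to fold in.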
+ *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), nullptr, + merge_context->GetOperands(), &result, s->logger, s->statistics, + s->clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + + if (s->status->ok()) { + if (s->value) { + *(s->value) = std::move(result); + } else { + assert(s->columns); + s->columns->SetPlainValue(std::move(result)); + } + } + } else { + // We have found a final value (a base deletion) and have newer + // merge operands that we do not intend to merge. Nothing remains + // to be done so assign status to OK. + *(s->status) = Status::OK(); + } + } else { + *(s->status) = Status::NotFound(); + } + *(s->found_final_value) = true; + return false; + } + case kTypeMerge: { + if (!merge_operator) { + *(s->status) = Status::InvalidArgument( + "merge_operator is not properly initialized."); + // Normally we continue the loop (return true) when we see a merge + // operand. But in case of an error, we should stop the loop + // immediately and pretend we have found the value to stop further + // seek. Otherwise, the later call will override this error status. + *(s->found_final_value) = true; + return false; + } + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + *(s->merge_in_progress) = true; + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); + PERF_COUNTER_ADD(internal_merge_point_lookup_count, 1); + + if (s->do_merge && merge_operator->ShouldMerge( + merge_context->GetOperandsDirectionBackward())) { + if (s->value || s->columns) { + std::string result; + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its + // value. + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), nullptr, + merge_context->GetOperands(), &result, s->logger, s->statistics, + s->clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + + if (s->status->ok()) { + if (s->value) { + *(s->value) = std::move(result); + } else { + assert(s->columns); + s->columns->SetPlainValue(std::move(result)); + } + } + } + + *(s->found_final_value) = true; + return false; + } + return true; + } + default: { + std::string msg("Corrupted value not expected."); + if (s->allow_data_in_errors) { + msg.append("Unrecognized value type: " + + std::to_string(static_cast(type)) + ". "); + msg.append("User key: " + user_key_slice.ToString(/*hex=*/true) + + ". "); + msg.append("seq: " + std::to_string(seq) + "."); + } + *(s->status) = Status::Corruption(msg.c_str()); + return false; + } + } + } + + // s->state could be Corrupt, merge or notfound + return false; +} + +bool MemTable::Get(const LookupKey& key, std::string* value, + PinnableWideColumns* columns, std::string* timestamp, + Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, + bool immutable_memtable, ReadCallback* callback, + bool* is_blob_index, bool do_merge) { + // The sequence number is updated synchronously in version_set.h + if (IsEmpty()) { + // Avoiding recording stats for speed. 
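+    // An empty memtable cannot contain the key, so report a miss without
+    // touching the perf counters below.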
+ return false; + } + PERF_TIMER_GUARD(get_from_memtable_time); + + std::unique_ptr range_del_iter( + NewRangeTombstoneIterator(read_opts, + GetInternalKeySeqno(key.internal_key()), + immutable_memtable)); + if (range_del_iter != nullptr) { + SequenceNumber covering_seq = + range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key()); + if (covering_seq > *max_covering_tombstone_seq) { + *max_covering_tombstone_seq = covering_seq; + if (timestamp) { + // Will be overwritten in SaveValue() if there is a point key with + // a higher seqno. + timestamp->assign(range_del_iter->timestamp().data(), + range_del_iter->timestamp().size()); + } + } + } + + bool found_final_value = false; + bool merge_in_progress = s->IsMergeInProgress(); + bool may_contain = true; + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz); + bool bloom_checked = false; + if (bloom_filter_) { + // when both memtable_whole_key_filtering and prefix_extractor_ are set, + // only do whole key filtering for Get() to save CPU + if (moptions_.memtable_whole_key_filtering) { + may_contain = bloom_filter_->MayContain(user_key_without_ts); + bloom_checked = true; + } else { + assert(prefix_extractor_); + if (prefix_extractor_->InDomain(user_key_without_ts)) { + may_contain = bloom_filter_->MayContain( + prefix_extractor_->Transform(user_key_without_ts)); + bloom_checked = true; + } + } + } + + if (bloom_filter_ && !may_contain) { + // iter is null if prefix bloom says the key does not exist + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + *seq = kMaxSequenceNumber; + } else { + if (bloom_checked) { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback, + is_blob_index, value, columns, timestamp, s, merge_context, + seq, &found_final_value, &merge_in_progress); + } + + // No change to value, since we have not yet found a Put/Delete + // Propagate corruption error + if (!found_final_value && merge_in_progress && !s->IsCorruption()) { + *s = Status::MergeInProgress(); + } + PERF_COUNTER_ADD(get_from_memtable_count, 1); + return found_final_value; +} + +void MemTable::GetFromTable(const LookupKey& key, + SequenceNumber max_covering_tombstone_seq, + bool do_merge, ReadCallback* callback, + bool* is_blob_index, std::string* value, + PinnableWideColumns* columns, + std::string* timestamp, Status* s, + MergeContext* merge_context, SequenceNumber* seq, + bool* found_final_value, bool* merge_in_progress) { + Saver saver; + saver.status = s; + saver.found_final_value = found_final_value; + saver.merge_in_progress = merge_in_progress; + saver.key = &key; + saver.value = value; + saver.columns = columns; + saver.timestamp = timestamp; + saver.seq = kMaxSequenceNumber; + saver.mem = this; + saver.merge_context = merge_context; + saver.max_covering_tombstone_seq = max_covering_tombstone_seq; + saver.merge_operator = moptions_.merge_operator; + saver.logger = moptions_.info_log; + saver.inplace_update_support = moptions_.inplace_update_support; + saver.statistics = moptions_.statistics; + saver.clock = clock_; + saver.callback_ = callback; + saver.is_blob_index = is_blob_index; + saver.do_merge = do_merge; + saver.allow_data_in_errors = moptions_.allow_data_in_errors; + saver.protection_bytes_per_key = moptions_.protection_bytes_per_key; + table_->Get(key, &saver, SaveValue); + *seq = saver.seq; +} + +void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* 
range, + ReadCallback* callback, bool immutable_memtable) { + // The sequence number is updated synchronously in version_set.h + if (IsEmpty()) { + // Avoiding recording stats for speed. + return; + } + PERF_TIMER_GUARD(get_from_memtable_time); + + // For now, memtable Bloom filter is effectively disabled if there are any + // range tombstones. This is the simplest way to ensure range tombstones are + // handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0 + bool no_range_del = read_options.ignore_range_deletions || + is_range_del_table_empty_.load(std::memory_order_relaxed); + MultiGetRange temp_range(*range, range->begin(), range->end()); + if (bloom_filter_ && no_range_del) { + bool whole_key = + !prefix_extractor_ || moptions_.memtable_whole_key_filtering; + std::array bloom_keys; + std::array may_match; + std::array range_indexes; + int num_keys = 0; + for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { + if (whole_key) { + bloom_keys[num_keys] = iter->ukey_without_ts; + range_indexes[num_keys++] = iter.index(); + } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) { + bloom_keys[num_keys] = + prefix_extractor_->Transform(iter->ukey_without_ts); + range_indexes[num_keys++] = iter.index(); + } + } + bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]); + for (int i = 0; i < num_keys; ++i) { + if (!may_match[i]) { + temp_range.SkipIndex(range_indexes[i]); + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + } else { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + } + } + for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { + bool found_final_value{false}; + bool merge_in_progress = iter->s->IsMergeInProgress(); + if (!no_range_del) { + std::unique_ptr range_del_iter( + NewRangeTombstoneIteratorInternal( + read_options, GetInternalKeySeqno(iter->lkey->internal_key()), + immutable_memtable)); + SequenceNumber covering_seq = + range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key()); + if (covering_seq > iter->max_covering_tombstone_seq) { + iter->max_covering_tombstone_seq = covering_seq; + if (iter->timestamp) { + // Will be overwritten in SaveValue() if there is a point key with + // a higher seqno. + iter->timestamp->assign(range_del_iter->timestamp().data(), + range_del_iter->timestamp().size()); + } + } + } + SequenceNumber dummy_seq; + GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true, + callback, &iter->is_blob_index, + iter->value ? 
iter->value->GetSelf() : nullptr, iter->columns, + iter->timestamp, iter->s, &(iter->merge_context), &dummy_seq, + &found_final_value, &merge_in_progress); + + if (!found_final_value && merge_in_progress) { + *(iter->s) = Status::MergeInProgress(); + } + + if (found_final_value) { + if (iter->value) { + iter->value->PinSelf(); + range->AddValueSize(iter->value->size()); + } else { + assert(iter->columns); + range->AddValueSize(iter->columns->serialized_size()); + } + + range->MarkKeyDone(iter); + RecordTick(moptions_.statistics, MEMTABLE_HIT); + if (range->GetValueSize() > read_options.value_size_soft_limit) { + // Set all remaining keys in range to Abort + for (auto range_iter = range->begin(); range_iter != range->end(); + ++range_iter) { + range->MarkKeyDone(range_iter); + *(range_iter->s) = Status::Aborted(); + } + break; + } + } + } + PERF_COUNTER_ADD(get_from_memtable_count, 1); +} + +Status MemTable::Update(SequenceNumber seq, ValueType value_type, + const Slice& key, const Slice& value, + const ProtectionInfoKVOS64* kv_prot_info) { + LookupKey lkey(key, seq); + Slice mem_key = lkey.memtable_key(); + + std::unique_ptr iter( + table_->GetDynamicPrefixIterator()); + iter->Seek(lkey.internal_key(), mem_key.data()); + + if (iter->Valid()) { + // Refer to comments under MemTable::Add() for entry format. + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. + const char* entry = iter->key(); + uint32_t key_length = 0; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (comparator_.comparator.user_comparator()->Equal( + Slice(key_ptr, key_length - 8), lkey.user_key())) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + ValueType type; + SequenceNumber existing_seq; + UnPackSequenceAndType(tag, &existing_seq, &type); + assert(existing_seq != seq); + if (type == value_type) { + Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); + uint32_t prev_size = static_cast(prev_value.size()); + uint32_t new_size = static_cast(value.size()); + + // Update value, if new value size <= previous value size + if (new_size <= prev_size) { + char* p = + EncodeVarint32(const_cast(key_ptr) + key_length, new_size); + WriteLock wl(GetLock(lkey.user_key())); + memcpy(p, value.data(), value.size()); + assert((unsigned)((p + value.size()) - entry) == + (unsigned)(VarintLength(key_length) + key_length + + VarintLength(value.size()) + value.size())); + RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); + if (kv_prot_info != nullptr) { + ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info); + // `seq` is swallowed and `existing_seq` prevails. 
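+            // UpdateS() swaps the sequence-number component of the checksum
+            // from `seq` to `existing_seq`, keeping the protection info in
+            // sync with the entry that was just rewritten in place.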
+ updated_kv_prot_info.UpdateS(seq, existing_seq); + UpdateEntryChecksum(&updated_kv_prot_info, key, value, type, + existing_seq, p + value.size()); + Slice encoded(entry, p + value.size() - entry); + return VerifyEncodedEntry(encoded, updated_kv_prot_info); + } else { + UpdateEntryChecksum(nullptr, key, value, type, existing_seq, + p + value.size()); + } + return Status::OK(); + } + } + } + } + + // The latest value is not value_type or key doesn't exist + return Add(seq, value_type, key, value, kv_prot_info); +} + +Status MemTable::UpdateCallback(SequenceNumber seq, const Slice& key, + const Slice& delta, + const ProtectionInfoKVOS64* kv_prot_info) { + LookupKey lkey(key, seq); + Slice memkey = lkey.memtable_key(); + + std::unique_ptr iter( + table_->GetDynamicPrefixIterator()); + iter->Seek(lkey.internal_key(), memkey.data()); + + if (iter->Valid()) { + // Refer to comments under MemTable::Add() for entry format. + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. + const char* entry = iter->key(); + uint32_t key_length = 0; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (comparator_.comparator.user_comparator()->Equal( + Slice(key_ptr, key_length - 8), lkey.user_key())) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + ValueType type; + uint64_t existing_seq; + UnPackSequenceAndType(tag, &existing_seq, &type); + if (type == kTypeValue) { + Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); + uint32_t prev_size = static_cast(prev_value.size()); + + char* prev_buffer = const_cast(prev_value.data()); + uint32_t new_prev_size = prev_size; + + std::string str_value; + WriteLock wl(GetLock(lkey.user_key())); + auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size, + delta, &str_value); + if (status == UpdateStatus::UPDATED_INPLACE) { + // Value already updated by callback. + assert(new_prev_size <= prev_size); + if (new_prev_size < prev_size) { + // overwrite the new prev_size + char* p = EncodeVarint32(const_cast(key_ptr) + key_length, + new_prev_size); + if (VarintLength(new_prev_size) < VarintLength(prev_size)) { + // shift the value buffer as well. + memcpy(p, prev_buffer, new_prev_size); + prev_buffer = p; + } + } + RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); + UpdateFlushState(); + Slice new_value(prev_buffer, new_prev_size); + if (kv_prot_info != nullptr) { + ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info); + // `seq` is swallowed and `existing_seq` prevails. 
+ updated_kv_prot_info.UpdateS(seq, existing_seq); + updated_kv_prot_info.UpdateV(delta, new_value); + Slice encoded(entry, prev_buffer + new_prev_size - entry); + UpdateEntryChecksum(&updated_kv_prot_info, key, new_value, type, + existing_seq, prev_buffer + new_prev_size); + return VerifyEncodedEntry(encoded, updated_kv_prot_info); + } else { + UpdateEntryChecksum(nullptr, key, new_value, type, existing_seq, + prev_buffer + new_prev_size); + } + return Status::OK(); + } else if (status == UpdateStatus::UPDATED) { + Status s; + if (kv_prot_info != nullptr) { + ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info); + updated_kv_prot_info.UpdateV(delta, str_value); + s = Add(seq, kTypeValue, key, Slice(str_value), + &updated_kv_prot_info); + } else { + s = Add(seq, kTypeValue, key, Slice(str_value), + nullptr /* kv_prot_info */); + } + RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN); + UpdateFlushState(); + return s; + } else if (status == UpdateStatus::UPDATE_FAILED) { + // `UPDATE_FAILED` is named incorrectly. It indicates no update + // happened. It does not indicate a failure happened. + UpdateFlushState(); + return Status::OK(); + } + } + } + } + // The latest value is not `kTypeValue` or key doesn't exist + return Status::NotFound(); +} + +size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { + Slice memkey = key.memtable_key(); + + // A total ordered iterator is costly for some memtablerep (prefix aware + // reps). By passing in the user key, we allow efficient iterator creation. + // The iterator only needs to be ordered within the same user key. + std::unique_ptr iter( + table_->GetDynamicPrefixIterator()); + iter->Seek(key.internal_key(), memkey.data()); + + size_t num_successive_merges = 0; + + for (; iter->Valid(); iter->Next()) { + const char* entry = iter->key(); + uint32_t key_length = 0; + const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (!comparator_.comparator.user_comparator()->Equal( + Slice(iter_key_ptr, key_length - 8), key.user_key())) { + break; + } + + const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8); + ValueType type; + uint64_t unused; + UnPackSequenceAndType(tag, &unused, &type); + if (type != kTypeMerge) { + break; + } + + ++num_successive_merges; + } + + return num_successive_merges; +} + +void MemTableRep::Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) { + auto iter = GetDynamicPrefixIterator(); + for (iter->Seek(k.internal_key(), k.memtable_key().data()); + iter->Valid() && callback_func(callback_args, iter->key()); + iter->Next()) { + } +} + +void MemTable::RefLogContainingPrepSection(uint64_t log) { + assert(log > 0); + auto cur = min_prep_log_referenced_.load(); + while ((log < cur || cur == 0) && + !min_prep_log_referenced_.compare_exchange_strong(cur, log)) { + cur = min_prep_log_referenced_.load(); + } +} + +uint64_t MemTable::GetMinLogContainingPrepSection() { + return min_prep_log_referenced_.load(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable.h new file mode 100644 index 0000000000..eefabcf88d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable.h @@ -0,0 +1,660 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/kv_checksum.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/read_callback.h" +#include "db/version_edit.h" +#include "memory/allocator.h" +#include "memory/concurrent_arena.h" +#include "monitoring/instrumented_mutex.h" +#include "options/cf_options.h" +#include "rocksdb/db.h" +#include "rocksdb/memtablerep.h" +#include "table/multiget_context.h" +#include "util/dynamic_bloom.h" +#include "util/hash.h" +#include "util/hash_containers.h" + +namespace ROCKSDB_NAMESPACE { + +struct FlushJobInfo; +class Mutex; +class MemTableIterator; +class MergeContext; +class SystemClock; + +struct ImmutableMemTableOptions { + explicit ImmutableMemTableOptions(const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options); + size_t arena_block_size; + uint32_t memtable_prefix_bloom_bits; + size_t memtable_huge_page_size; + bool memtable_whole_key_filtering; + bool inplace_update_support; + size_t inplace_update_num_locks; + UpdateStatus (*inplace_callback)(char* existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value); + size_t max_successive_merges; + Statistics* statistics; + MergeOperator* merge_operator; + Logger* info_log; + bool allow_data_in_errors; + uint32_t protection_bytes_per_key; +}; + +// Batched counters to updated when inserting keys in one write batch. +// In post process of the write batch, these can be updated together. +// Only used in concurrent memtable insert case. +struct MemTablePostProcessInfo { + uint64_t data_size = 0; + uint64_t num_entries = 0; + uint64_t num_deletes = 0; +}; + +using MultiGetRange = MultiGetContext::Range; +// Note: Many of the methods in this class have comments indicating that +// external synchronization is required as these methods are not thread-safe. +// It is up to higher layers of code to decide how to prevent concurrent +// invocation of these methods. This is usually done by acquiring either +// the db mutex or the single writer thread. +// +// Some of these methods are documented to only require external +// synchronization if this memtable is immutable. Calling MarkImmutable() is +// not sufficient to guarantee immutability. It is up to higher layers of +// code to determine if this MemTable can still be modified by other threads. +// Eg: The Superversion stores a pointer to the current MemTable (that can +// be modified) and a separate list of the MemTables that can no longer be +// written to (aka the 'immutable memtables'). +class MemTable { + public: + struct KeyComparator : public MemTableRep::KeyComparator { + const InternalKeyComparator comparator; + explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {} + virtual int operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const override; + virtual int operator()(const char* prefix_len_key, + const DecodedType& key) const override; + }; + + // MemTables are reference counted. 
The initial reference count
+  // is zero and the caller must call Ref() at least once.
+  //
+  // earliest_seq should be the current SequenceNumber in the db such that any
+  // key inserted into this memtable will have an equal or larger seq number.
+  // (When a db is first created, the earliest sequence number will be 0).
+  // If the earliest sequence number is not known, kMaxSequenceNumber may be
+  // used, but this may prevent some transactions from succeeding until the
+  // first key is inserted into the memtable.
+  explicit MemTable(const InternalKeyComparator& comparator,
+                    const ImmutableOptions& ioptions,
+                    const MutableCFOptions& mutable_cf_options,
+                    WriteBufferManager* write_buffer_manager,
+                    SequenceNumber earliest_seq, uint32_t column_family_id);
+  // No copying allowed
+  MemTable(const MemTable&) = delete;
+  MemTable& operator=(const MemTable&) = delete;
+
+  // Do not delete this MemTable unless Unref() indicates it is not in use.
+  ~MemTable();
+
+  // Increase reference count.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  void Ref() { ++refs_; }
+
+  // Drop reference count.
+  // If the refcount goes to zero return this memtable, otherwise return null.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  MemTable* Unref() {
+    --refs_;
+    assert(refs_ >= 0);
+    if (refs_ <= 0) {
+      return this;
+    }
+    return nullptr;
+  }
+
+  // Returns an estimate of the number of bytes of data in use by this
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  size_t ApproximateMemoryUsage();
+
+  // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
+  // require external synchronization. The value may be less accurate, though.
+  size_t ApproximateMemoryUsageFast() const {
+    return approximate_memory_usage_.load(std::memory_order_relaxed);
+  }
+
+  // used by MemTableListVersion::MemoryAllocatedBytesExcludingLast
+  size_t MemoryAllocatedBytes() const {
+    return table_->ApproximateMemoryUsage() +
+           range_del_table_->ApproximateMemoryUsage() +
+           arena_.MemoryAllocatedBytes();
+  }
+
+  // Returns a vector of unique random memtable entries of size 'sample_size'.
+  //
+  // Note: the entries are stored in the unordered_set as length-prefixed keys,
+  // hence their representation in the set as "const char*".
+  // Note2: the size of the output set 'entries' is not enforced to be strictly
+  // equal to 'target_sample_size'. Its final size might be slightly
+  // greater or slightly less than 'target_sample_size'.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  // REQUIRES: SkipList memtable representation. This function is not
+  // implemented for any other type of memtable representation (vectorrep,
+  // hashskiplist,...).
+  void UniqueRandomSample(const uint64_t& target_sample_size,
+                          std::unordered_set<const char*>* entries) {
+    // TODO(bjlemaire): at the moment, only supported by skiplistrep.
+    // Extend it to all other memtable representations.
+    table_->UniqueRandomSample(num_entries(), target_sample_size, entries);
+  }
+
+  // This method heuristically determines if the memtable should continue to
+  // host more data.
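Editor's note: the heuristic mentioned just above feeds a three-state flag (see `FlushStateEnum` in the private section of this class), and `MarkFlushScheduled()`, defined next, claims the FLUSH_REQUESTED -> FLUSH_SCHEDULED transition with a compare-and-swap so that exactly one caller becomes responsible for scheduling the flush. A minimal standalone sketch of that handshake; the names below are illustrative stand-ins, not the actual RocksDB declarations:

```cpp
#include <atomic>
#include <cassert>

// Hypothetical mirror of MemTable's flush-state machine.
enum FlushState { kNotRequested, kRequested, kScheduled };

struct FlushFlag {
  std::atomic<FlushState> state{kNotRequested};

  // Analogous to ShouldScheduleFlush(): a cheap relaxed read.
  bool should_schedule() const {
    return state.load(std::memory_order_relaxed) == kRequested;
  }

  // Analogous to MarkFlushScheduled(): exactly one caller wins the
  // REQUESTED -> SCHEDULED transition and schedules the flush job.
  bool mark_scheduled() {
    FlushState expected = kRequested;
    return state.compare_exchange_strong(expected, kScheduled,
                                         std::memory_order_relaxed,
                                         std::memory_order_relaxed);
  }
};

int main() {
  FlushFlag f;
  f.state.store(kRequested, std::memory_order_relaxed);
  assert(f.mark_scheduled());   // first caller wins
  assert(!f.mark_scheduled());  // later callers do nothing
  return 0;
}
```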
+ bool ShouldScheduleFlush() const { + return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED; + } + + // Returns true if a flush should be scheduled and the caller should + // be the one to schedule it + bool MarkFlushScheduled() { + auto before = FLUSH_REQUESTED; + return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED, + std::memory_order_relaxed, + std::memory_order_relaxed); + } + + // Return an iterator that yields the contents of the memtable. + // + // The caller must ensure that the underlying MemTable remains live + // while the returned iterator is live. The keys returned by this + // iterator are internal keys encoded by AppendInternalKey in the + // db/dbformat.{h,cc} module. + // + // By default, it returns an iterator for prefix seek if prefix_extractor + // is configured in Options. + // arena: If not null, the arena needs to be used to allocate the Iterator. + // Calling ~Iterator of the iterator will destroy all the states but + // those allocated in arena. + InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena); + + // Returns an iterator that yields the range tombstones of the memtable. + // The caller must ensure that the underlying MemTable remains live + // while the returned iterator is live. + // @param immutable_memtable Whether this memtable is an immutable memtable. + // This information is not stored in memtable itself, so it needs to be + // specified by the caller. This flag is used internally to decide whether a + // cached fragmented range tombstone list can be returned. This cached version + // is constructed when a memtable becomes immutable. Setting the flag to false + // will always yield correct result, but may incur performance penalty as it + // always creates a new fragmented range tombstone list. + FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( + const ReadOptions& read_options, SequenceNumber read_seq, + bool immutable_memtable); + + Status VerifyEncodedEntry(Slice encoded, + const ProtectionInfoKVOS64& kv_prot_info); + + // Add an entry into memtable that maps key to value at the + // specified sequence number and with the specified type. + // Typically value will be empty if type==kTypeDeletion. + // + // REQUIRES: if allow_concurrent = false, external synchronization to prevent + // simultaneous operations on the same MemTable. + // + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. + Status Add(SequenceNumber seq, ValueType type, const Slice& key, + const Slice& value, const ProtectionInfoKVOS64* kv_prot_info, + bool allow_concurrent = false, + MemTablePostProcessInfo* post_process_info = nullptr, + void** hint = nullptr); + + // Used to Get value associated with key or Get Merge Operands associated + // with key. + // If do_merge = true the default behavior which is Get value for key is + // executed. Expected behavior is described right below. + // If memtable contains a value for key, store it in *value and return true. + // If memtable contains a deletion for key, store a NotFound() error + // in *status and return true. + // If memtable contains Merge operation as the most recent entry for a key, + // and the merge process does not stop (not reaching a value or delete), + // prepend the current merge operand to *operands. + // store MergeInProgress in s, and return false. + // Else, return false. 
+ // If any operation was found, its most recent sequence number + // will be stored in *seq on success (regardless of whether true/false is + // returned). Otherwise, *seq will be set to kMaxSequenceNumber. + // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other + // status returned indicates a corruption or other unexpected error. + // If do_merge = false then any Merge Operands encountered for key are simply + // stored in merge_context.operands_list and never actually merged to get a + // final value. The raw Merge Operands are eventually returned to the user. + // @param immutable_memtable Whether this memtable is immutable. Used + // internally by NewRangeTombstoneIterator(). See comment above + // NewRangeTombstoneIterator() for more detail. + bool Get(const LookupKey& key, std::string* value, + PinnableWideColumns* columns, std::string* timestamp, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, + const ReadOptions& read_opts, bool immutable_memtable, + ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, + bool do_merge = true); + + bool Get(const LookupKey& key, std::string* value, + PinnableWideColumns* columns, std::string* timestamp, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts, bool immutable_memtable, + ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, + bool do_merge = true) { + SequenceNumber seq; + return Get(key, value, columns, timestamp, s, merge_context, + max_covering_tombstone_seq, &seq, read_opts, immutable_memtable, + callback, is_blob_index, do_merge); + } + + // @param immutable_memtable Whether this memtable is immutable. Used + // internally by NewRangeTombstoneIterator(). See comment above + // NewRangeTombstoneIterator() for more detail. + void MultiGet(const ReadOptions& read_options, MultiGetRange* range, + ReadCallback* callback, bool immutable_memtable); + + // If `key` exists in current memtable with type value_type and the existing + // value is at least as large as the new value, updates it in-place. Otherwise + // adds the new value to the memtable out-of-place. + // + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. + // + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. + Status Update(SequenceNumber seq, ValueType value_type, const Slice& key, + const Slice& value, const ProtectionInfoKVOS64* kv_prot_info); + + // If `key` exists in current memtable with type `kTypeValue` and the existing + // value is at least as large as the new value, updates it in-place. Otherwise + // if `key` exists in current memtable with type `kTypeValue`, adds the new + // value to the memtable out-of-place. + // + // Returns `Status::NotFound` if `key` does not exist in current memtable or + // the latest version of `key` does not have `kTypeValue`. + // + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. + // + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. 
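Editor's note: the Update()/UpdateCallback() contract spelled out above is what AdvancedColumnFamilyOptions::inplace_update_support and inplace_callback expose to users (the callback type also appears in ImmutableMemTableOptions near the top of this header). A hedged client-side sketch, assuming a scratch DB path; the replace-if-smaller policy is invented purely for illustration:

```cpp
#include <cstdint>
#include <cstring>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

using ROCKSDB_NAMESPACE::DB;
using ROCKSDB_NAMESPACE::Options;
using ROCKSDB_NAMESPACE::Slice;
using ROCKSDB_NAMESPACE::UpdateStatus;
using ROCKSDB_NAMESPACE::WriteOptions;

// Illustrative policy: treat `delta_value` as the full replacement value.
// If it fits in the existing slot, shrink-and-overwrite (UPDATED_INPLACE,
// applied by MemTable::UpdateCallback under the per-key lock); otherwise
// hand back a merged value so the memtable does an out-of-place Add()
// (UPDATED).
static UpdateStatus ReplaceIfSmaller(char* existing_value,
                                     uint32_t* existing_value_size,
                                     Slice delta_value,
                                     std::string* merged_value) {
  if (delta_value.size() <= *existing_value_size) {
    std::memcpy(existing_value, delta_value.data(), delta_value.size());
    *existing_value_size = static_cast<uint32_t>(delta_value.size());
    return UpdateStatus::UPDATED_INPLACE;
  }
  merged_value->assign(delta_value.data(), delta_value.size());
  return UpdateStatus::UPDATED;
}

int main() {
  Options options;
  options.create_if_missing = true;
  options.inplace_update_support = true;  // Put() now routes through Update*()
  options.inplace_callback = ReplaceIfSmaller;
  // In-place updates are incompatible with concurrent memtable writes.
  options.allow_concurrent_memtable_write = false;
  DB* db = nullptr;
  if (DB::Open(options, "/tmp/inplace-demo", &db).ok()) {  // path is arbitrary
    db->Put(WriteOptions(), "k", "a-reasonably-long-initial-value");
    db->Put(WriteOptions(), "k", "tiny");  // small enough: rewritten in place
    delete db;
  }
  return 0;
}
```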
+ Status UpdateCallback(SequenceNumber seq, const Slice& key, + const Slice& delta, + const ProtectionInfoKVOS64* kv_prot_info); + + // Returns the number of successive merge entries starting from the newest + // entry for the key up to the last non-merge entry or last entry for the + // key in the memtable. + size_t CountSuccessiveMergeEntries(const LookupKey& key); + + // Update counters and flush status after inserting a whole write batch + // Used in concurrent memtable inserts. + void BatchPostProcess(const MemTablePostProcessInfo& update_counters) { + num_entries_.fetch_add(update_counters.num_entries, + std::memory_order_relaxed); + data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed); + if (update_counters.num_deletes != 0) { + num_deletes_.fetch_add(update_counters.num_deletes, + std::memory_order_relaxed); + } + UpdateFlushState(); + } + + // Get total number of entries in the mem table. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). + uint64_t num_entries() const { + return num_entries_.load(std::memory_order_relaxed); + } + + // Get total number of deletes in the mem table. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). + uint64_t num_deletes() const { + return num_deletes_.load(std::memory_order_relaxed); + } + + uint64_t get_data_size() const { + return data_size_.load(std::memory_order_relaxed); + } + + // Dynamically change the memtable's capacity. If set below the current usage, + // the next key added will trigger a flush. Can only increase size when + // memtable prefix bloom is disabled, since we can't easily allocate more + // space. + void UpdateWriteBufferSize(size_t new_write_buffer_size) { + if (bloom_filter_ == nullptr || + new_write_buffer_size < write_buffer_size_) { + write_buffer_size_.store(new_write_buffer_size, + std::memory_order_relaxed); + } + } + + // Returns the edits area that is needed for flushing the memtable + VersionEdit* GetEdits() { return &edit_; } + + // Returns if there is no entry inserted to the mem table. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). + bool IsEmpty() const { return first_seqno_ == 0; } + + // Returns the sequence number of the first element that was inserted + // into the memtable. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). + SequenceNumber GetFirstSequenceNumber() { + return first_seqno_.load(std::memory_order_relaxed); + } + + // Returns the sequence number of the first element that was inserted + // into the memtable. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). + void SetFirstSequenceNumber(SequenceNumber first_seqno) { + return first_seqno_.store(first_seqno, std::memory_order_relaxed); + } + + // Returns the sequence number that is guaranteed to be smaller than or equal + // to the sequence number of any key that could be inserted into this + // memtable. It can then be assumed that any write with a larger(or equal) + // sequence number will be present in this memtable or a later memtable. + // + // If the earliest sequence number could not be determined, + // kMaxSequenceNumber will be returned. 
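Editor's note: the guarantee documented above for GetEarliestSequenceNumber() (any write with a larger or equal sequence number lands in this memtable or a newer one) is what lets a reader skip an entire memtable whose earliest possible sequence number already exceeds its snapshot. A toy model of that pruning rule, with invented types rather than the engine's actual read path:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

using SequenceNumber = uint64_t;  // matches the typedef RocksDB uses

// Toy stand-in: all we keep per memtable is its earliest possible seqno.
struct MemView {
  SequenceNumber earliest_seq;
};

// A read at `snapshot_seq` can skip any memtable whose earliest possible
// entry is already newer than the snapshot: nothing in it can be visible.
size_t CountSearchable(const std::vector<MemView>& memtables,
                       SequenceNumber snapshot_seq) {
  size_t n = 0;
  for (const MemView& m : memtables) {
    if (m.earliest_seq <= snapshot_seq) {
      ++n;
    }
  }
  return n;
}

int main() {
  std::vector<MemView> mems = {{10}, {55}, {90}};
  assert(CountSearchable(mems, /*snapshot_seq=*/60) == 2);
  return 0;
}
```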
+  SequenceNumber GetEarliestSequenceNumber() {
+    return earliest_seqno_.load(std::memory_order_relaxed);
+  }
+
+  // Sets the sequence number that is guaranteed to be smaller than or equal
+  // to the sequence number of any key that could be inserted into this
+  // memtable. It can then be assumed that any write with a larger (or equal)
+  // sequence number will be present in this memtable or a later memtable.
+  // Used only for the MemPurge operation.
+  void SetEarliestSequenceNumber(SequenceNumber earliest_seqno) {
+    return earliest_seqno_.store(earliest_seqno, std::memory_order_relaxed);
+  }
+
+  // DB's latest sequence ID when the memtable is created. This number
+  // may be updated to a more recent one before any key is inserted.
+  SequenceNumber GetCreationSeq() const { return creation_seq_; }
+
+  void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }
+
+  // Returns the next active logfile number when this memtable is about to
+  // be flushed to storage.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
+
+  // Sets the next active logfile number when this memtable is about to
+  // be flushed to storage.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+
+  // If this memtable contains data from a committed
+  // two phase transaction we must take note of the
+  // log which contains that data so we can know
+  // when to release that log.
+  void RefLogContainingPrepSection(uint64_t log);
+  uint64_t GetMinLogContainingPrepSection();
+
+  // Notify the underlying storage that no more items will be added.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  // After MarkImmutable() is called, you should not attempt to
+  // write anything to this MemTable(). (I.e. do not call Add() or Update()).
+  void MarkImmutable() {
+    table_->MarkReadOnly();
+    mem_tracker_.DoneAllocating();
+  }
+
+  // Notify the underlying storage that all data it contained has been
+  // persisted.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  void MarkFlushed() { table_->MarkFlushed(); }
+
+  // Return true if the current MemTableRep supports merge operator.
+  bool IsMergeOperatorSupported() const {
+    return table_->IsMergeOperatorSupported();
+  }
+
+  // Return true if the current MemTableRep supports snapshots.
+  // In-place update prevents snapshots.
+  bool IsSnapshotSupported() const {
+    return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
+  }
+
+  struct MemTableStats {
+    uint64_t size;
+    uint64_t count;
+  };
+
+  MemTableStats ApproximateStats(const Slice& start_ikey,
+                                 const Slice& end_ikey);
+
+  // Get the lock associated with the key.
+  port::RWMutex* GetLock(const Slice& key);
+
+  const InternalKeyComparator& GetInternalKeyComparator() const {
+    return comparator_.comparator;
+  }
+
+  const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
+    return &moptions_;
+  }
+
+  uint64_t ApproximateOldestKeyTime() const {
+    return oldest_key_time_.load(std::memory_order_relaxed);
+  }
+
+  // REQUIRES: db_mutex held.
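Editor's note: the two-phase-commit bookkeeping declared a few lines up (RefLogContainingPrepSection / GetMinLogContainingPrepSection) is implemented earlier in this patch, in memtable.cc, as a lock-free keep-the-minimum loop in which 0 means "unset". The same pattern in isolation, lightly simplified because compare_exchange_strong already refreshes its expected value on failure:

```cpp
#include <atomic>
#include <cassert>
#include <cstdint>

// Keep the smallest non-zero value ever offered; 0 means "no value yet".
// Mirrors the CAS loop in MemTable::RefLogContainingPrepSection().
void RefMinLog(std::atomic<uint64_t>& min_log, uint64_t log) {
  assert(log > 0);
  uint64_t cur = min_log.load();
  while ((log < cur || cur == 0) &&
         !min_log.compare_exchange_strong(cur, log)) {
    // On failure, compare_exchange_strong stored the current value in `cur`,
    // so the loop condition re-tests against fresh state.
  }
}

int main() {
  std::atomic<uint64_t> min_log{0};
  RefMinLog(min_log, 7);
  RefMinLog(min_log, 3);
  RefMinLog(min_log, 9);  // a larger log number leaves the minimum alone
  assert(min_log.load() == 3);
  return 0;
}
```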
+ void SetID(uint64_t id) { id_ = id; } + + uint64_t GetID() const { return id_; } + + void SetFlushCompleted(bool completed) { flush_completed_ = completed; } + + uint64_t GetFileNumber() const { return file_number_; } + + void SetFileNumber(uint64_t file_num) { file_number_ = file_num; } + + void SetFlushInProgress(bool in_progress) { + flush_in_progress_ = in_progress; + } + + void SetFlushJobInfo(std::unique_ptr&& info) { + flush_job_info_ = std::move(info); + } + + std::unique_ptr ReleaseFlushJobInfo() { + return std::move(flush_job_info_); + } + + // Returns a heuristic flush decision + bool ShouldFlushNow(); + + void ConstructFragmentedRangeTombstones(); + + // Returns whether a fragmented range tombstone list is already constructed + // for this memtable. It should be constructed right before a memtable is + // added to an immutable memtable list. Note that if a memtable does not have + // any range tombstone, then no range tombstone list will ever be constructed. + // @param allow_empty Specifies whether a memtable with no range tombstone is + // considered to have its fragmented range tombstone list constructed. + bool IsFragmentedRangeTombstonesConstructed(bool allow_empty = true) const { + if (allow_empty) { + return fragmented_range_tombstone_list_.get() != nullptr || + is_range_del_table_empty_; + } else { + return fragmented_range_tombstone_list_.get() != nullptr; + } + } + + // Returns Corruption status if verification fails. + static Status VerifyEntryChecksum(const char* entry, + uint32_t protection_bytes_per_key, + bool allow_data_in_errors = false); + + private: + enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED }; + + friend class MemTableIterator; + friend class MemTableBackwardIterator; + friend class MemTableList; + + KeyComparator comparator_; + const ImmutableMemTableOptions moptions_; + int refs_; + const size_t kArenaBlockSize; + AllocTracker mem_tracker_; + ConcurrentArena arena_; + std::unique_ptr table_; + std::unique_ptr range_del_table_; + std::atomic_bool is_range_del_table_empty_; + + // Total data size of all data inserted + std::atomic data_size_; + std::atomic num_entries_; + std::atomic num_deletes_; + + // Dynamically changeable memtable option + std::atomic write_buffer_size_; + + // These are used to manage memtable flushes to storage + bool flush_in_progress_; // started the flush + bool flush_completed_; // finished the flush + uint64_t file_number_; // filled up after flush is complete + + // The updates to be applied to the transaction log when this + // memtable is flushed to storage. + VersionEdit edit_; + + // The sequence number of the kv that was inserted first + std::atomic first_seqno_; + + // The db sequence number at the time of creation or kMaxSequenceNumber + // if not set. + std::atomic earliest_seqno_; + + SequenceNumber creation_seq_; + + // The log files earlier than this number can be deleted. + uint64_t mem_next_logfile_number_; + + // the earliest log containing a prepared section + // which has been inserted into this memtable. + std::atomic min_prep_log_referenced_; + + // rw locks for inplace updates + std::vector locks_; + + const SliceTransform* const prefix_extractor_; + std::unique_ptr bloom_filter_; + + std::atomic flush_state_; + + SystemClock* clock_; + + // Extract sequential insert prefixes. + const SliceTransform* insert_with_hint_prefix_extractor_; + + // Insert hints for each prefix. 
+ UnorderedMapH insert_hints_; + + // Timestamp of oldest key + std::atomic oldest_key_time_; + + // Memtable id to track flush. + uint64_t id_ = 0; + + // Sequence number of the atomic flush that is responsible for this memtable. + // The sequence number of atomic flush is a seq, such that no writes with + // sequence numbers greater than or equal to seq are flushed, while all + // writes with sequence number smaller than seq are flushed. + SequenceNumber atomic_flush_seqno_; + + // keep track of memory usage in table_, arena_, and range_del_table_. + // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow` + std::atomic approximate_memory_usage_; + + // Flush job info of the current memtable. + std::unique_ptr flush_job_info_; + + // Updates flush_state_ using ShouldFlushNow() + void UpdateFlushState(); + + void UpdateOldestKeyTime(); + + void GetFromTable(const LookupKey& key, + SequenceNumber max_covering_tombstone_seq, bool do_merge, + ReadCallback* callback, bool* is_blob_index, + std::string* value, PinnableWideColumns* columns, + std::string* timestamp, Status* s, + MergeContext* merge_context, SequenceNumber* seq, + bool* found_final_value, bool* merge_in_progress); + + // Always returns non-null and assumes certain pre-checks (e.g., + // is_range_del_table_empty_) are done. This is only valid during the lifetime + // of the underlying memtable. + // read_seq and read_options.timestamp will be used as the upper bound + // for range tombstones. + FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal( + const ReadOptions& read_options, SequenceNumber read_seq, + bool immutable_memtable); + + // The fragmented range tombstones of this memtable. + // This is constructed when this memtable becomes immutable + // if !is_range_del_table_empty_. + std::unique_ptr + fragmented_range_tombstone_list_; + + // makes sure there is a single range tombstone writer to invalidate cache + std::mutex range_del_mutex_; + CoreLocalArray> + cached_range_tombstone_; + + void UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info, + const Slice& key, const Slice& value, ValueType type, + SequenceNumber s, char* checksum_ptr); +}; + +extern const char* EncodeKey(std::string* scratch, const Slice& target); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable_list.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable_list.cc new file mode 100644 index 0000000000..ee1563f016 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable_list.cc @@ -0,0 +1,987 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#include "db/memtable_list.h" + +#include +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/memtable.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/version_set.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "monitoring/thread_status_util.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "table/merging_iterator.h" +#include "test_util/sync_point.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +class InternalKeyComparator; +class Mutex; +class VersionSet; + +void MemTableListVersion::AddMemTable(MemTable* m) { + memlist_.push_front(m); + *parent_memtable_list_memory_usage_ += m->ApproximateMemoryUsage(); +} + +void MemTableListVersion::UnrefMemTable(autovector* to_delete, + MemTable* m) { + if (m->Unref()) { + to_delete->push_back(m); + assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage()); + *parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage(); + } +} + +MemTableListVersion::MemTableListVersion( + size_t* parent_memtable_list_memory_usage, const MemTableListVersion& old) + : max_write_buffer_number_to_maintain_( + old.max_write_buffer_number_to_maintain_), + max_write_buffer_size_to_maintain_( + old.max_write_buffer_size_to_maintain_), + parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) { + memlist_ = old.memlist_; + for (auto& m : memlist_) { + m->Ref(); + } + + memlist_history_ = old.memlist_history_; + for (auto& m : memlist_history_) { + m->Ref(); + } +} + +MemTableListVersion::MemTableListVersion( + size_t* parent_memtable_list_memory_usage, + int max_write_buffer_number_to_maintain, + int64_t max_write_buffer_size_to_maintain) + : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain), + max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain), + parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {} + +void MemTableListVersion::Ref() { ++refs_; } + +// called by superversion::clean() +void MemTableListVersion::Unref(autovector* to_delete) { + assert(refs_ >= 1); + --refs_; + if (refs_ == 0) { + // if to_delete is equal to nullptr it means we're confident + // that refs_ will not be zero + assert(to_delete != nullptr); + for (const auto& m : memlist_) { + UnrefMemTable(to_delete, m); + } + for (const auto& m : memlist_history_) { + UnrefMemTable(to_delete, m); + } + delete this; + } +} + +int MemTableList::NumNotFlushed() const { + int size = static_cast(current_->memlist_.size()); + assert(num_flush_not_started_ <= size); + return size; +} + +int MemTableList::NumFlushed() const { + return static_cast(current_->memlist_history_.size()); +} + +// Search all the memtables starting from the most recent one. +// Return the most recent value found, if any. +// Operands stores the list of merge operations to apply, so far. 
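Editor's note: the Get() entry point defined next delegates to GetFromList(), whose contract (visible further down) is the newest-first scan described above: record the first definitive answer and stop, because older memtables can only hold stale versions. Stripped of merge operands, tombstone sequencing, and read callbacks, the control flow reduces to this sketch with toy types, not the real MemTable interface:

```cpp
#include <cassert>
#include <list>
#include <map>
#include <optional>
#include <string>

// Toy stand-in for an immutable memtable: key -> value, where nullopt
// models a deletion tombstone.
using MiniMem = std::map<std::string, std::optional<std::string>>;

// Newest-first scan, modeled on MemTableListVersion::GetFromList(): the
// first memtable holding any entry for the key gives the definitive answer.
std::optional<std::string> GetNewestFirst(
    const std::list<const MiniMem*>& newest_first, const std::string& key) {
  for (const MiniMem* mem : newest_first) {
    auto it = mem->find(key);
    if (it != mem->end()) {
      return it->second;  // a live value, or nullopt for "deleted"
    }
  }
  return std::nullopt;  // absent here; the real engine consults SSTs next
}

int main() {
  MiniMem newer{{"a", std::nullopt}};          // newer memtable deleted "a"
  MiniMem older{{"a", std::string("stale")}};  // must be shadowed
  std::list<const MiniMem*> memlist{&newer, &older};
  assert(!GetNewestFirst(memlist, "a").has_value());
  return 0;
}
```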
+bool MemTableListVersion::Get(const LookupKey& key, std::string* value, + PinnableWideColumns* columns, + std::string* timestamp, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, + ReadCallback* callback, bool* is_blob_index) { + return GetFromList(&memlist_, key, value, columns, timestamp, s, + merge_context, max_covering_tombstone_seq, seq, read_opts, + callback, is_blob_index); +} + +void MemTableListVersion::MultiGet(const ReadOptions& read_options, + MultiGetRange* range, + ReadCallback* callback) { + for (auto memtable : memlist_) { + memtable->MultiGet(read_options, range, callback, + true /* immutable_memtable */); + if (range->empty()) { + return; + } + } +} + +bool MemTableListVersion::GetMergeOperands( + const LookupKey& key, Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) { + for (MemTable* memtable : memlist_) { + bool done = memtable->Get( + key, /*value=*/nullptr, /*columns=*/nullptr, /*timestamp=*/nullptr, s, + merge_context, max_covering_tombstone_seq, read_opts, + true /* immutable_memtable */, nullptr, nullptr, false); + if (done) { + return true; + } + } + return false; +} + +bool MemTableListVersion::GetFromHistory( + const LookupKey& key, std::string* value, PinnableWideColumns* columns, + std::string* timestamp, Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, + const ReadOptions& read_opts, bool* is_blob_index) { + return GetFromList(&memlist_history_, key, value, columns, timestamp, s, + merge_context, max_covering_tombstone_seq, seq, read_opts, + nullptr /*read_callback*/, is_blob_index); +} + +bool MemTableListVersion::GetFromList( + std::list* list, const LookupKey& key, std::string* value, + PinnableWideColumns* columns, std::string* timestamp, Status* s, + MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback, + bool* is_blob_index) { + *seq = kMaxSequenceNumber; + + for (auto& memtable : *list) { + assert(memtable->IsFragmentedRangeTombstonesConstructed()); + SequenceNumber current_seq = kMaxSequenceNumber; + + bool done = + memtable->Get(key, value, columns, timestamp, s, merge_context, + max_covering_tombstone_seq, ¤t_seq, read_opts, + true /* immutable_memtable */, callback, is_blob_index); + if (*seq == kMaxSequenceNumber) { + // Store the most recent sequence number of any operation on this key. + // Since we only care about the most recent change, we only need to + // return the first operation found when searching memtables in + // reverse-chronological order. + // current_seq would be equal to kMaxSequenceNumber if the value was to be + // skipped. This allows seq to be assigned again when the next value is + // read. + *seq = current_seq; + } + + if (done) { + assert(*seq != kMaxSequenceNumber || s->IsNotFound()); + return true; + } + if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) { + return false; + } + } + return false; +} + +Status MemTableListVersion::AddRangeTombstoneIterators( + const ReadOptions& read_opts, Arena* /*arena*/, + RangeDelAggregator* range_del_agg) { + assert(range_del_agg != nullptr); + // Except for snapshot read, using kMaxSequenceNumber is OK because these + // are immutable memtables. + SequenceNumber read_seq = read_opts.snapshot != nullptr + ? 
read_opts.snapshot->GetSequenceNumber() + : kMaxSequenceNumber; + for (auto& m : memlist_) { + assert(m->IsFragmentedRangeTombstonesConstructed()); + std::unique_ptr range_del_iter( + m->NewRangeTombstoneIterator(read_opts, read_seq, + true /* immutable_memtable */)); + range_del_agg->AddTombstones(std::move(range_del_iter)); + } + return Status::OK(); +} + +void MemTableListVersion::AddIterators( + const ReadOptions& options, std::vector* iterator_list, + Arena* arena) { + for (auto& m : memlist_) { + iterator_list->push_back(m->NewIterator(options, arena)); + } +} + +void MemTableListVersion::AddIterators(const ReadOptions& options, + MergeIteratorBuilder* merge_iter_builder, + bool add_range_tombstone_iter) { + for (auto& m : memlist_) { + auto mem_iter = m->NewIterator(options, merge_iter_builder->GetArena()); + if (!add_range_tombstone_iter || options.ignore_range_deletions) { + merge_iter_builder->AddIterator(mem_iter); + } else { + // Except for snapshot read, using kMaxSequenceNumber is OK because these + // are immutable memtables. + SequenceNumber read_seq = options.snapshot != nullptr + ? options.snapshot->GetSequenceNumber() + : kMaxSequenceNumber; + TruncatedRangeDelIterator* mem_tombstone_iter = nullptr; + auto range_del_iter = m->NewRangeTombstoneIterator( + options, read_seq, true /* immutale_memtable */); + if (range_del_iter == nullptr || range_del_iter->empty()) { + delete range_del_iter; + } else { + mem_tombstone_iter = new TruncatedRangeDelIterator( + std::unique_ptr(range_del_iter), + &m->GetInternalKeyComparator(), nullptr /* smallest */, + nullptr /* largest */); + } + merge_iter_builder->AddPointAndTombstoneIterator(mem_iter, + mem_tombstone_iter); + } + } +} + +uint64_t MemTableListVersion::GetTotalNumEntries() const { + uint64_t total_num = 0; + for (auto& m : memlist_) { + total_num += m->num_entries(); + } + return total_num; +} + +MemTable::MemTableStats MemTableListVersion::ApproximateStats( + const Slice& start_ikey, const Slice& end_ikey) { + MemTable::MemTableStats total_stats = {0, 0}; + for (auto& m : memlist_) { + auto mStats = m->ApproximateStats(start_ikey, end_ikey); + total_stats.size += mStats.size; + total_stats.count += mStats.count; + } + return total_stats; +} + +uint64_t MemTableListVersion::GetTotalNumDeletes() const { + uint64_t total_num = 0; + for (auto& m : memlist_) { + total_num += m->num_deletes(); + } + return total_num; +} + +SequenceNumber MemTableListVersion::GetEarliestSequenceNumber( + bool include_history) const { + if (include_history && !memlist_history_.empty()) { + return memlist_history_.back()->GetEarliestSequenceNumber(); + } else if (!memlist_.empty()) { + return memlist_.back()->GetEarliestSequenceNumber(); + } else { + return kMaxSequenceNumber; + } +} + +SequenceNumber MemTableListVersion::GetFirstSequenceNumber() const { + SequenceNumber min_first_seqno = kMaxSequenceNumber; + // The first memtable in the list might not be the oldest one with mempurge + for (const auto& m : memlist_) { + min_first_seqno = std::min(m->GetFirstSequenceNumber(), min_first_seqno); + } + return min_first_seqno; +} + +// caller is responsible for referencing m +void MemTableListVersion::Add(MemTable* m, autovector* to_delete) { + assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable + AddMemTable(m); + // m->MemoryAllocatedBytes() is added in MemoryAllocatedBytesExcludingLast + TrimHistory(to_delete, 0); +} + +// Removes m from list of memtables not flushed. Caller should NOT Unref m. 
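Editor's note: Remove(), defined next, either parks a flushed memtable in memlist_history_ (when a write-buffer-to-maintain budget is configured) or drops its reference immediately; TrimHistory()/MemtableLimitExceeded(), a little further down, then enforce the byte budget. A toy model of the byte-budget variant with invented names and deliberately simplified accounting (the real check folds the mutable memtables' usage into the same total):

```cpp
#include <cassert>
#include <cstddef>
#include <deque>

// Toy model of the byte-budget history trim: drop flushed memtables from
// the back (oldest) of the history while the retained total would still
// exceed the budget.
struct Trimmer {
  std::deque<size_t> history_sizes;  // front = newest flushed, back = oldest
  size_t budget_bytes;

  // Total retained, assuming the oldest history entry is dropped.
  size_t TotalExcludingOldest() const {
    size_t total = 0;
    for (size_t s : history_sizes) total += s;
    return history_sizes.empty() ? total : total - history_sizes.back();
  }

  // `mutable_usage` plays the role of the live memtables' usage passed in.
  void TrimHistory(size_t mutable_usage) {
    while (!history_sizes.empty() &&
           TotalExcludingOldest() + mutable_usage >= budget_bytes) {
      history_sizes.pop_back();  // unref the oldest flushed memtable
    }
  }
};

int main() {
  Trimmer t{{100, 100, 100}, /*budget_bytes=*/250};
  t.TrimHistory(/*mutable_usage=*/100);
  assert(t.history_sizes.size() == 2);  // the oldest entry was dropped
  return 0;
}
```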
+void MemTableListVersion::Remove(MemTable* m, + autovector* to_delete) { + assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable + memlist_.remove(m); + + m->MarkFlushed(); + if (max_write_buffer_size_to_maintain_ > 0 || + max_write_buffer_number_to_maintain_ > 0) { + memlist_history_.push_front(m); + // Unable to get size of mutable memtable at this point, pass 0 to + // TrimHistory as a best effort. + TrimHistory(to_delete, 0); + } else { + UnrefMemTable(to_delete, m); + } +} + +// return the total memory usage assuming the oldest flushed memtable is dropped +size_t MemTableListVersion::MemoryAllocatedBytesExcludingLast() const { + size_t total_memtable_size = 0; + for (auto& memtable : memlist_) { + total_memtable_size += memtable->MemoryAllocatedBytes(); + } + for (auto& memtable : memlist_history_) { + total_memtable_size += memtable->MemoryAllocatedBytes(); + } + if (!memlist_history_.empty()) { + total_memtable_size -= memlist_history_.back()->MemoryAllocatedBytes(); + } + return total_memtable_size; +} + +bool MemTableListVersion::MemtableLimitExceeded(size_t usage) { + if (max_write_buffer_size_to_maintain_ > 0) { + // calculate the total memory usage after dropping the oldest flushed + // memtable, compare with max_write_buffer_size_to_maintain_ to decide + // whether to trim history + return MemoryAllocatedBytesExcludingLast() + usage >= + static_cast(max_write_buffer_size_to_maintain_); + } else if (max_write_buffer_number_to_maintain_ > 0) { + return memlist_.size() + memlist_history_.size() > + static_cast(max_write_buffer_number_to_maintain_); + } else { + return false; + } +} + +// Make sure we don't use up too much space in history +bool MemTableListVersion::TrimHistory(autovector* to_delete, + size_t usage) { + bool ret = false; + while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) { + MemTable* x = memlist_history_.back(); + memlist_history_.pop_back(); + + UnrefMemTable(to_delete, x); + ret = true; + } + return ret; +} + +// Returns true if there is at least one memtable on which flush has +// not yet started. +bool MemTableList::IsFlushPending() const { + if ((flush_requested_ && num_flush_not_started_ > 0) || + (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) { + assert(imm_flush_needed.load(std::memory_order_relaxed)); + return true; + } + return false; +} + +bool MemTableList::IsFlushPendingOrRunning() const { + if (current_->memlist_.size() - num_flush_not_started_ > 0) { + // Flush is already running on at least one memtable + return true; + } + return IsFlushPending(); +} + +// Returns the memtables that need to be flushed. +void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id, + autovector* ret, + uint64_t* max_next_log_number) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH); + const auto& memlist = current_->memlist_; + bool atomic_flush = false; + + // Note: every time MemTableList::Add(mem) is called, it adds the new mem + // at the FRONT of the memlist (memlist.push_front(mem)). Therefore, by + // iterating through the memlist starting at the end, the vector + // ret is filled with memtables already sorted in increasing MemTable ID. + // However, when the mempurge feature is activated, new memtables with older + // IDs will be added to the memlist. 
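Editor's note: the loop that follows implements the ordering just described; its key subtlety is the else-if break that refuses to pick a non-consecutive run of memtables. Reduced to plain structs (atomic-flush and counter bookkeeping omitted):

```cpp
#include <cassert>
#include <cstdint>
#include <list>
#include <vector>

struct ToyMem {
  uint64_t id;
  bool flush_in_progress;
};

// Reduced model of PickMemtablesToFlush(): scan oldest -> newest and pick a
// *consecutive* run of memtables not yet being flushed, stopping either at
// max_id or at the first in-progress memtable once something was picked.
std::vector<uint64_t> Pick(const std::list<ToyMem>& newest_first,
                           uint64_t max_id) {
  std::vector<uint64_t> picked;
  for (auto it = newest_first.rbegin(); it != newest_first.rend(); ++it) {
    if (it->id > max_id) break;
    if (!it->flush_in_progress) {
      picked.push_back(it->id);
    } else if (!picked.empty()) {
      break;  // never pick a non-consecutive run
    }
  }
  return picked;
}

int main() {
  // Front is newest, as in MemTableList (Add() push_front's new memtables).
  std::list<ToyMem> memlist{{4, false}, {3, true}, {2, false}, {1, false}};
  auto picked = Pick(memlist, /*max_id=*/4);
  assert((picked == std::vector<uint64_t>{1, 2}));  // stops at in-progress #3
  return 0;
}
```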
+ for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* m = *it; + if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) { + atomic_flush = true; + } + if (m->GetID() > max_memtable_id) { + break; + } + if (!m->flush_in_progress_) { + assert(!m->flush_completed_); + num_flush_not_started_--; + if (num_flush_not_started_ == 0) { + imm_flush_needed.store(false, std::memory_order_release); + } + m->flush_in_progress_ = true; // flushing will start very soon + if (max_next_log_number) { + *max_next_log_number = + std::max(m->GetNextLogNumber(), *max_next_log_number); + } + ret->push_back(m); + } else if (!ret->empty()) { + // This `break` is necessary to prevent picking non-consecutive memtables + // in case `memlist` has one or more entries with + // `flush_in_progress_ == true` sandwiched between entries with + // `flush_in_progress_ == false`. This could happen after parallel flushes + // are picked and the one flushing older memtables is rolled back. + break; + } + } + if (!atomic_flush || num_flush_not_started_ == 0) { + flush_requested_ = false; // start-flush request is complete + } +} + +void MemTableList::RollbackMemtableFlush(const autovector& mems, + uint64_t /*file_number*/) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_MEMTABLE_ROLLBACK); + assert(!mems.empty()); + + // If the flush was not successful, then just reset state. + // Maybe a succeeding attempt to flush will be successful. + for (MemTable* m : mems) { + assert(m->flush_in_progress_); + assert(m->file_number_ == 0); + + m->flush_in_progress_ = false; + m->flush_completed_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + } + imm_flush_needed.store(true, std::memory_order_release); +} + +// Try record a successful flush in the manifest file. It might just return +// Status::OK letting a concurrent flush to do actual the recording.. +Status MemTableList::TryInstallMemtableFlushResults( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + const autovector& mems, LogsWithPrepTracker* prep_tracker, + VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number, + autovector* to_delete, FSDirectory* db_directory, + LogBuffer* log_buffer, + std::list>* committed_flush_jobs_info, + bool write_edits) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); + mu->AssertHeld(); + + const ReadOptions read_options(Env::IOActivity::kFlush); + + // Flush was successful + // Record the status on the memtable object. Either this call or a call by a + // concurrent flush thread will read the status and write it to manifest. + for (size_t i = 0; i < mems.size(); ++i) { + // All the edits are associated with the first memtable of this batch. + assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0); + + mems[i]->flush_completed_ = true; + mems[i]->file_number_ = file_number; + } + + // if some other thread is already committing, then return + Status s; + if (commit_in_progress_) { + TEST_SYNC_POINT("MemTableList::TryInstallMemtableFlushResults:InProgress"); + return s; + } + + // Only a single thread can be executing this piece of code + commit_in_progress_ = true; + + // Retry until all completed flushes are committed. New flushes can finish + // while the current thread is writing manifest where mutex is released. + while (s.ok()) { + auto& memlist = current_->memlist_; + // The back is the oldest; if flush_completed_ is not set to it, it means + // that we were assigned a more recent memtable. 
The memtables' flushes must + // be recorded in manifest in order. A concurrent flush thread, who is + // assigned to flush the oldest memtable, will later wake up and does all + // the pending writes to manifest, in order. + if (memlist.empty() || !memlist.back()->flush_completed_) { + break; + } + // scan all memtables from the earliest, and commit those + // (in that order) that have finished flushing. Memtables + // are always committed in the order that they were created. + uint64_t batch_file_number = 0; + size_t batch_count = 0; + autovector edit_list; + autovector memtables_to_flush; + // enumerate from the last (earliest) element to see how many batch finished + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* m = *it; + if (!m->flush_completed_) { + break; + } + if (it == memlist.rbegin() || batch_file_number != m->file_number_) { + batch_file_number = m->file_number_; + if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 " started", + cfd->GetName().c_str(), m->file_number_); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files) started", + cfd->GetName().c_str(), m->file_number_, + m->edit_.GetBlobFileAdditions().size()); + } + + edit_list.push_back(&m->edit_); + memtables_to_flush.push_back(m); + std::unique_ptr info = m->ReleaseFlushJobInfo(); + if (info != nullptr) { + committed_flush_jobs_info->push_back(std::move(info)); + } + } + batch_count++; + } + + // TODO(myabandeh): Not sure how batch_count could be 0 here. + if (batch_count > 0) { + uint64_t min_wal_number_to_keep = 0; + assert(edit_list.size() > 0); + if (vset->db_options()->allow_2pc) { + // Note that if mempurge is successful, the edit_list will + // not be applicable (contains info of new min_log number to keep, + // and level 0 file path of SST file created during normal flush, + // so both pieces of information are irrelevant after a successful + // mempurge operation). + min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC( + vset, *cfd, edit_list, memtables_to_flush, prep_tracker); + + // We piggyback the information of earliest log file to keep in the + // manifest entry for the last file flushed. + } else { + min_wal_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list); + } + + VersionEdit wal_deletion; + wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep); + if (vset->db_options()->track_and_verify_wals_in_manifest) { + if (min_wal_number_to_keep > + vset->GetWalSet().GetMinWalNumberToKeep()) { + wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); + } + TEST_SYNC_POINT_CALLBACK( + "MemTableList::TryInstallMemtableFlushResults:" + "AfterComputeMinWalToKeep", + nullptr); + } + edit_list.push_back(&wal_deletion); + + const auto manifest_write_cb = [this, cfd, batch_count, log_buffer, + to_delete, mu](const Status& status) { + RemoveMemTablesOrRestoreFlags(status, cfd, batch_count, log_buffer, + to_delete, mu); + }; + if (write_edits) { + // this can release and reacquire the mutex. + s = vset->LogAndApply(cfd, mutable_cf_options, read_options, edit_list, + mu, db_directory, /*new_descriptor_log=*/false, + /*column_family_options=*/nullptr, + manifest_write_cb); + } else { + // If write_edit is false (e.g: successful mempurge), + // then remove old memtables, wake up manifest write queue threads, + // and don't commit anything to the manifest file. 
+        RemoveMemTablesOrRestoreFlags(s, cfd, batch_count, log_buffer,
+                                      to_delete, mu);
+        // Note: cfd->SetLogNumber is only called when a VersionEdit
+        // is written to MANIFEST. When mempurge is successful, we skip
+        // this step, therefore cfd->GetLogNumber is always the
+        // earliest log with data unflushed.
+        // Notify new head of manifest write queue.
+        // wake up all the waiting writers
+        // TODO(bjlemaire): explain full reason WakeUpWaitingManifestWriters
+        // needed or investigate more.
+        vset->WakeUpWaitingManifestWriters();
+      }
+    }
+  }
+  commit_in_progress_ = false;
+  return s;
+}
+
+// New memtables are inserted at the front of the list.
+void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+  assert(static_cast<int>(current_->memlist_.size()) >= num_flush_not_started_);
+  InstallNewVersion();
+  // this method is used to move mutable memtable into an immutable list.
+  // since mutable memtable is already refcounted by the DBImpl,
+  // and when moving to the immutable list we don't unref it,
+  // we don't have to ref the memtable here. we just take over the
+  // reference from the DBImpl.
+  current_->Add(m, to_delete);
+  m->MarkImmutable();
+  num_flush_not_started_++;
+  if (num_flush_not_started_ == 1) {
+    imm_flush_needed.store(true, std::memory_order_release);
+  }
+  UpdateCachedValuesFromMemTableListVersion();
+  ResetTrimHistoryNeeded();
+}
+
+bool MemTableList::TrimHistory(autovector<MemTable*>* to_delete, size_t usage) {
+  InstallNewVersion();
+  bool ret = current_->TrimHistory(to_delete, usage);
+  UpdateCachedValuesFromMemTableListVersion();
+  ResetTrimHistoryNeeded();
+  return ret;
+}
+
+// Returns an estimate of the number of bytes of data in use.
+size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() {
+  size_t total_size = 0;
+  for (auto& memtable : current_->memlist_) {
+    total_size += memtable->ApproximateMemoryUsage();
+  }
+  return total_size;
+}
+
+size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; }
+
+size_t MemTableList::MemoryAllocatedBytesExcludingLast() const {
+  const size_t usage = current_memory_allocted_bytes_excluding_last_.load(
+      std::memory_order_relaxed);
+  return usage;
+}
+
+bool MemTableList::HasHistory() const {
+  const bool has_history = current_has_history_.load(std::memory_order_relaxed);
+  return has_history;
+}
+
+void MemTableList::UpdateCachedValuesFromMemTableListVersion() {
+  const size_t total_memtable_size =
+      current_->MemoryAllocatedBytesExcludingLast();
+  current_memory_allocted_bytes_excluding_last_.store(
+      total_memtable_size, std::memory_order_relaxed);
+
+  const bool has_history = current_->HasHistory();
+  current_has_history_.store(has_history, std::memory_order_relaxed);
+}
+
+uint64_t MemTableList::ApproximateOldestKeyTime() const {
+  if (!current_->memlist_.empty()) {
+    return current_->memlist_.back()->ApproximateOldestKeyTime();
+  }
+  return std::numeric_limits<uint64_t>::max();
+}
+
+void MemTableList::InstallNewVersion() {
+  if (current_->refs_ == 1) {
+    // we're the only one using the version, just keep using it
+  } else {
+    // somebody else holds the current version, we need to create new one
+    MemTableListVersion* version = current_;
+    current_ = new MemTableListVersion(&current_memory_usage_, *version);
+    current_->Ref();
+    version->Unref();
+  }
+}
+
+void MemTableList::RemoveMemTablesOrRestoreFlags(
+    const Status& s, ColumnFamilyData* cfd, size_t batch_count,
+    LogBuffer* log_buffer, autovector<MemTable*>* to_delete,
+    InstrumentedMutex* mu) {
+  assert(mu);
+  mu->AssertHeld();
+  assert(to_delete);
+  // we will be
changing the version in the next code path, + // so we better create a new one, since versions are immutable + InstallNewVersion(); + + // All the later memtables that have the same filenum + // are part of the same batch. They can be committed now. + uint64_t mem_id = 1; // how many memtables have been flushed. + + // commit new state only if the column family is NOT dropped. + // The reason is as follows (refer to + // ColumnFamilyTest.FlushAndDropRaceCondition). + // If the column family is dropped, then according to LogAndApply, its + // corresponding flush operation is NOT written to the MANIFEST. This + // means the DB is not aware of the L0 files generated from the flush. + // By committing the new state, we remove the memtable from the memtable + // list. Creating an iterator on this column family will not be able to + // read full data since the memtable is removed, and the DB is not aware + // of the L0 files, causing MergingIterator unable to build child + // iterators. RocksDB contract requires that the iterator can be created + // on a dropped column family, and we must be able to + // read full data as long as column family handle is not deleted, even if + // the column family is dropped. + if (s.ok() && !cfd->IsDropped()) { // commit new state + while (batch_count-- > 0) { + MemTable* m = current_->memlist_.back(); + if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, + m->edit_.GetBlobFileAdditions().size(), mem_id); + } + + assert(m->file_number_ > 0); + current_->Remove(m, to_delete); + UpdateCachedValuesFromMemTableListVersion(); + ResetTrimHistoryNeeded(); + ++mem_id; + } + } else { + for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { + MemTable* m = *it; + // commit failed. setup state so that we can flush again. + if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "Level-0 commit table #%" PRIu64 ": memtable #%" PRIu64 + " failed", + m->file_number_, mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " failed", + m->file_number_, + m->edit_.GetBlobFileAdditions().size(), mem_id); + } + + m->flush_completed_ = false; + m->flush_in_progress_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + m->file_number_ = 0; + imm_flush_needed.store(true, std::memory_order_release); + ++mem_id; + } + } +} + +uint64_t MemTableList::PrecomputeMinLogContainingPrepSection( + const std::unordered_set* memtables_to_flush) { + uint64_t min_log = 0; + + for (auto& m : current_->memlist_) { + if (memtables_to_flush && memtables_to_flush->count(m)) { + continue; + } + + auto log = m->GetMinLogContainingPrepSection(); + + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + } + + return min_log; +} + +// Commit a successful atomic flush in the manifest file. 
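Editor's note: InstallMemtableAtomicFlushResults(), defined next, stamps the collected per-column-family version edits as one atomic group via MarkAtomicGroup(--num_entries), i.e. each edit records how many edits of the group still follow it. A toy rendering of that countdown (a sketch; the real VersionEdit internals are elided):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Toy VersionEdit: remaining_entries == how many edits of the same atomic
// group still follow this one; 0 marks the group's final edit.
struct ToyEdit {
  uint32_t remaining_entries = 0;
};

// Mirrors the MarkAtomicGroup(--num_entries) countdown in
// InstallMemtableAtomicFlushResults().
void MarkGroup(std::vector<ToyEdit*>& edits) {
  uint32_t num_entries = static_cast<uint32_t>(edits.size());
  for (ToyEdit* e : edits) {
    e->remaining_entries = --num_entries;
  }
}

int main() {
  ToyEdit a, b, c;
  std::vector<ToyEdit*> group{&a, &b, &c};
  MarkGroup(group);
  assert(a.remaining_entries == 2);
  assert(b.remaining_entries == 1);
  assert(c.remaining_entries == 0);  // recovery applies the group only once
                                     // it has read down to the 0 marker
  return 0;
}
```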
+Status InstallMemtableAtomicFlushResults( + const autovector* imm_lists, + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, VersionSet* vset, + LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu, + const autovector& file_metas, + const autovector>*>& + committed_flush_jobs_info, + autovector* to_delete, FSDirectory* db_directory, + LogBuffer* log_buffer) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); + mu->AssertHeld(); + + const ReadOptions read_options(Env::IOActivity::kFlush); + + size_t num = mems_list.size(); + assert(cfds.size() == num); + if (imm_lists != nullptr) { + assert(imm_lists->size() == num); + } + if (num == 0) { + return Status::OK(); + } + + for (size_t k = 0; k != num; ++k) { +#ifndef NDEBUG + const auto* imm = + (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k); + if (!mems_list[k]->empty()) { + assert((*mems_list[k])[0]->GetID() == imm->GetEarliestMemTableID()); + } +#endif + assert(nullptr != file_metas[k]); + for (size_t i = 0; i != mems_list[k]->size(); ++i) { + assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0); + (*mems_list[k])[i]->SetFlushCompleted(true); + (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber()); + } + if (committed_flush_jobs_info[k]) { + assert(!mems_list[k]->empty()); + assert((*mems_list[k])[0]); + std::unique_ptr flush_job_info = + (*mems_list[k])[0]->ReleaseFlushJobInfo(); + committed_flush_jobs_info[k]->push_back(std::move(flush_job_info)); + } + } + + Status s; + + autovector> edit_lists; + uint32_t num_entries = 0; + for (const auto mems : mems_list) { + assert(mems != nullptr); + autovector edits; + assert(!mems->empty()); + edits.emplace_back((*mems)[0]->GetEdits()); + ++num_entries; + edit_lists.emplace_back(edits); + } + + WalNumber min_wal_number_to_keep = 0; + if (vset->db_options()->allow_2pc) { + min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC( + vset, cfds, edit_lists, mems_list, prep_tracker); + } else { + min_wal_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists); + } + + VersionEdit wal_deletion; + wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep); + if (vset->db_options()->track_and_verify_wals_in_manifest && + min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) { + wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); + } + edit_lists.back().push_back(&wal_deletion); + ++num_entries; + + // Mark the version edits as an atomic group if the number of version edits + // exceeds 1. + if (cfds.size() > 1) { + for (size_t i = 0; i < edit_lists.size(); i++) { + assert((edit_lists[i].size() == 1) || + ((edit_lists[i].size() == 2) && (i == edit_lists.size() - 1))); + for (auto& e : edit_lists[i]) { + e->MarkAtomicGroup(--num_entries); + } + } + assert(0 == num_entries); + } + + // this can release and reacquire the mutex. + s = vset->LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, + mu, db_directory); + + for (size_t k = 0; k != cfds.size(); ++k) { + auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k); + imm->InstallNewVersion(); + } + + if (s.ok() || s.IsColumnFamilyDropped()) { + for (size_t i = 0; i != cfds.size(); ++i) { + if (cfds[i]->IsDropped()) { + continue; + } + auto* imm = (imm_lists == nullptr) ? 
cfds[i]->imm() : imm_lists->at(i); + for (auto m : *mems_list[i]) { + assert(m->GetFileNumber() > 0); + uint64_t mem_id = m->GetID(); + + const VersionEdit* const edit = m->GetEdits(); + assert(edit); + + if (edit->GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " done", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + edit->GetBlobFileAdditions().size(), mem_id); + } + + imm->current_->Remove(m, to_delete); + imm->UpdateCachedValuesFromMemTableListVersion(); + imm->ResetTrimHistoryNeeded(); + } + } + } else { + for (size_t i = 0; i != cfds.size(); ++i) { + auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i); + for (auto m : *mems_list[i]) { + uint64_t mem_id = m->GetID(); + + const VersionEdit* const edit = m->GetEdits(); + assert(edit); + + if (edit->GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " failed", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " failed", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + edit->GetBlobFileAdditions().size(), mem_id); + } + + m->SetFlushCompleted(false); + m->SetFlushInProgress(false); + m->GetEdits()->Clear(); + m->SetFileNumber(0); + imm->num_flush_not_started_++; + } + imm->imm_flush_needed.store(true, std::memory_order_release); + } + } + + return s; +} + +void MemTableList::RemoveOldMemTables(uint64_t log_number, + autovector* to_delete) { + assert(to_delete != nullptr); + InstallNewVersion(); + auto& memlist = current_->memlist_; + autovector old_memtables; + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* mem = *it; + if (mem->GetNextLogNumber() > log_number) { + break; + } + old_memtables.push_back(mem); + } + + for (auto it = old_memtables.begin(); it != old_memtables.end(); ++it) { + MemTable* mem = *it; + current_->Remove(mem, to_delete); + --num_flush_not_started_; + if (0 == num_flush_not_started_) { + imm_flush_needed.store(false, std::memory_order_release); + } + } + + UpdateCachedValuesFromMemTableListVersion(); + ResetTrimHistoryNeeded(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable_list.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable_list.h new file mode 100644 index 0000000000..1ad28a59e2 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/memtable_list.h @@ -0,0 +1,471 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "db/logs_with_prep_tracker.h" +#include "db/memtable.h" +#include "db/range_del_aggregator.h" +#include "file/filename.h" +#include "logging/log_buffer.h" +#include "monitoring/instrumented_mutex.h" +#include "rocksdb/db.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/types.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyData; +class InternalKeyComparator; +class InstrumentedMutex; +class MergeIteratorBuilder; +class MemTableList; + +struct FlushJobInfo; + +// keeps a list of immutable memtables in a vector. the list is immutable +// if refcount is bigger than one. It is used as a state for Get() and +// Iterator code paths +// +// This class is not thread-safe. External synchronization is required +// (such as holding the db mutex or being on the write thread). +class MemTableListVersion { + public: + explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, + const MemTableListVersion& old); + explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, + int max_write_buffer_number_to_maintain, + int64_t max_write_buffer_size_to_maintain); + + void Ref(); + void Unref(autovector* to_delete = nullptr); + + // Search all the memtables starting from the most recent one. + // Return the most recent value found, if any. + // + // If any operation was found for this key, its most recent sequence number + // will be stored in *seq on success (regardless of whether true/false is + // returned). Otherwise, *seq will be set to kMaxSequenceNumber. + bool Get(const LookupKey& key, std::string* value, + PinnableWideColumns* columns, std::string* timestamp, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, + const ReadOptions& read_opts, ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr); + + bool Get(const LookupKey& key, std::string* value, + PinnableWideColumns* columns, std::string* timestamp, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts, ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr) { + SequenceNumber seq; + return Get(key, value, columns, timestamp, s, merge_context, + max_covering_tombstone_seq, &seq, read_opts, callback, + is_blob_index); + } + + void MultiGet(const ReadOptions& read_options, MultiGetRange* range, + ReadCallback* callback); + + // Returns all the merge operands corresponding to the key by searching all + // memtables starting from the most recent one. + bool GetMergeOperands(const LookupKey& key, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts); + + // Similar to Get(), but searches the Memtable history of memtables that + // have already been flushed. Should only be used from in-memory only + // queries (such as Transaction validation) as the history may contain + // writes that are also present in the SST files. 
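+  //
+  // Illustrative use (hedged sketch; `lkey`, `snapshot_seq` and the output
+  // arguments are assumed caller context, not part of this header):
+  // transaction validation can detect a conflicting write even after the
+  // memtable holding it has been flushed:
+  //
+  //   SequenceNumber seq = kMaxSequenceNumber;
+  //   if (version->GetFromHistory(lkey, &value, &columns, &ts, &s, &ctx,
+  //                               &max_covering_tombstone_seq, &seq,
+  //                               read_opts) &&
+  //       seq > snapshot_seq) {
+  //     // write conflict
+  //   }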
+ bool GetFromHistory(const LookupKey& key, std::string* value, + PinnableWideColumns* columns, std::string* timestamp, + Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, + bool* is_blob_index = nullptr); + bool GetFromHistory(const LookupKey& key, std::string* value, + PinnableWideColumns* columns, std::string* timestamp, + Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts, + bool* is_blob_index = nullptr) { + SequenceNumber seq; + return GetFromHistory(key, value, columns, timestamp, s, merge_context, + max_covering_tombstone_seq, &seq, read_opts, + is_blob_index); + } + + Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena, + RangeDelAggregator* range_del_agg); + + void AddIterators(const ReadOptions& options, + std::vector* iterator_list, + Arena* arena); + + void AddIterators(const ReadOptions& options, + MergeIteratorBuilder* merge_iter_builder, + bool add_range_tombstone_iter); + + uint64_t GetTotalNumEntries() const; + + uint64_t GetTotalNumDeletes() const; + + MemTable::MemTableStats ApproximateStats(const Slice& start_ikey, + const Slice& end_ikey); + + // Returns the value of MemTable::GetEarliestSequenceNumber() on the most + // recent MemTable in this list or kMaxSequenceNumber if the list is empty. + // If include_history=true, will also search Memtables in MemTableList + // History. + SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const; + + // Return the first sequence number from the memtable list, which is the + // smallest sequence number of all FirstSequenceNumber. + // Return kMaxSequenceNumber if the list is empty. + SequenceNumber GetFirstSequenceNumber() const; + + private: + friend class MemTableList; + + friend Status InstallMemtableAtomicFlushResults( + const autovector* imm_lists, + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, + VersionSet* vset, LogsWithPrepTracker* prep_tracker, + InstrumentedMutex* mu, const autovector& file_meta, + const autovector>*>& + committed_flush_jobs_info, + autovector* to_delete, FSDirectory* db_directory, + LogBuffer* log_buffer); + + // REQUIRE: m is an immutable memtable + void Add(MemTable* m, autovector* to_delete); + // REQUIRE: m is an immutable memtable + void Remove(MemTable* m, autovector* to_delete); + + // Return true if memtable is trimmed + bool TrimHistory(autovector* to_delete, size_t usage); + + bool GetFromList(std::list* list, const LookupKey& key, + std::string* value, PinnableWideColumns* columns, + std::string* timestamp, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, + ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr); + + void AddMemTable(MemTable* m); + + void UnrefMemTable(autovector* to_delete, MemTable* m); + + // Calculate the total amount of memory used by memlist_ and memlist_history_ + // excluding the last MemTable in memlist_history_. The reason for excluding + // the last MemTable is to see if dropping the last MemTable will keep total + // memory usage above or equal to max_write_buffer_size_to_maintain_ + size_t MemoryAllocatedBytesExcludingLast() const; + + // Whether this version contains flushed memtables that are only kept around + // for transaction conflict checking. 
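+  // Illustrative trim arithmetic (made-up numbers, following the rule stated
+  // for MemoryAllocatedBytesExcludingLast() above): with
+  // max_write_buffer_size_to_maintain_ = 64MB and 90MB allocated in total, a
+  // 16MB oldest history memtable may be dropped, since 90 - 16 = 74 >= 64
+  // keeps usage at or above the limit; at 70MB total it would be kept,
+  // because 70 - 16 = 54 < 64.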
+  bool HasHistory() const { return !memlist_history_.empty(); }
+
+  bool MemtableLimitExceeded(size_t usage);
+
+  // Immutable MemTables that have not yet been flushed.
+  std::list<MemTable*> memlist_;
+
+  // MemTables that have already been flushed
+  // (used during Transaction validation)
+  std::list<MemTable*> memlist_history_;
+
+  // Maximum number of MemTables to keep in memory (including both flushed
+  // and not-yet-flushed tables).
+  const int max_write_buffer_number_to_maintain_;
+  // Maximum size of MemTables to keep in memory (including both flushed
+  // and not-yet-flushed tables).
+  const int64_t max_write_buffer_size_to_maintain_;
+
+  int refs_ = 0;
+
+  size_t* parent_memtable_list_memory_usage_;
+};
+
+// This class stores references to all the immutable memtables.
+// The memtables are flushed to L0 as soon as possible and in
+// any order. If there is more than one immutable memtable, their
+// flushes can occur concurrently. However, they are 'committed'
+// to the manifest in FIFO order to maintain correctness and
+// recoverability from a crash.
+//
+// Other than imm_flush_needed and imm_trim_needed, this class is not
+// thread-safe and requires external synchronization (such as holding the db
+// mutex or being on the write thread.)
+class MemTableList {
+ public:
+  // A list of memtables.
+  explicit MemTableList(int min_write_buffer_number_to_merge,
+                        int max_write_buffer_number_to_maintain,
+                        int64_t max_write_buffer_size_to_maintain)
+      : imm_flush_needed(false),
+        imm_trim_needed(false),
+        min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
+        current_(new MemTableListVersion(&current_memory_usage_,
+                                         max_write_buffer_number_to_maintain,
+                                         max_write_buffer_size_to_maintain)),
+        num_flush_not_started_(0),
+        commit_in_progress_(false),
+        flush_requested_(false),
+        current_memory_usage_(0),
+        current_memory_allocted_bytes_excluding_last_(0),
+        current_has_history_(false) {
+    current_->Ref();
+  }
+
+  // Should not delete MemTableList without making sure MemTableList::current()
+  // is Unref()'d.
+  ~MemTableList() {}
+
+  MemTableListVersion* current() const { return current_; }
+
+  // so that background threads can detect whether there is anything more to
+  // start flushing.
+  std::atomic<bool> imm_flush_needed;
+
+  std::atomic<bool> imm_trim_needed;
+
+  // Returns the total number of memtables in the list that haven't yet
+  // been flushed and logged.
+  int NumNotFlushed() const;
+
+  // Returns total number of memtables in the list that have been
+  // completely flushed and logged.
+  int NumFlushed() const;
+
+  // Returns true if there is at least one memtable on which flush has
+  // not yet started.
+  bool IsFlushPending() const;
+
+  // Returns true if there is at least one memtable that is pending flush or
+  // flushing.
+  bool IsFlushPendingOrRunning() const;
+
+  // Returns the earliest memtables that need to be flushed. The returned
+  // memtables are guaranteed to be in the ascending order of created time.
+  void PickMemtablesToFlush(uint64_t max_memtable_id,
+                            autovector<MemTable*>* mems,
+                            uint64_t* max_next_log_number = nullptr);
+
+  // Reset status of the given memtables back to the pending state so that
+  // they can get picked up again on the next round of flush.
+  void RollbackMemtableFlush(const autovector<MemTable*>& mems,
+                             uint64_t file_number);
+
+  // Try to commit a successful flush in the manifest file. It might just
+  // return Status::OK, letting a concurrent flush do the actual recording.
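+  //
+  // Hedged sketch of the surrounding flush-thread flow (names such as
+  // `picked` are illustrative; the db mutex is held around both calls):
+  //
+  //   autovector<MemTable*> picked;
+  //   imm->PickMemtablesToFlush(max_memtable_id, &picked);
+  //   // ... mutex released, picked memtables written to an L0 file ...
+  //   s = imm->TryInstallMemtableFlushResults(
+  //       cfd, mutable_cf_options, picked, prep_tracker, vset, &mu,
+  //       file_number, &to_delete, db_directory, &log_buffer,
+  //       &committed_flush_jobs_info);
+  //
+  // If the L0 write itself failed, RollbackMemtableFlush() above re-queues
+  // the picked memtables for a later attempt instead.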
+  Status TryInstallMemtableFlushResults(
+      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+      const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
+      VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+      LogBuffer* log_buffer,
+      std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
+      bool write_edits = true);
+
+  // New memtables are inserted at the front of the list.
+  // Takes ownership of the reference held on *m by the caller of Add().
+  // By default, adding memtables will flag that the memtable list needs to be
+  // flushed, but in certain situations, like after a mempurge, we may want to
+  // avoid flushing the memtable list upon addition of a memtable.
+  void Add(MemTable* m, autovector<MemTable*>* to_delete);
+
+  // Returns an estimate of the number of bytes of data in use.
+  size_t ApproximateMemoryUsage();
+
+  // Returns the cached current_memory_allocted_bytes_excluding_last_ value.
+  size_t MemoryAllocatedBytesExcludingLast() const;
+
+  // Returns the cached current_has_history_ value.
+  bool HasHistory() const;
+
+  // Updates current_memory_allocted_bytes_excluding_last_ and
+  // current_has_history_ from MemTableListVersion. Must be called whenever
+  // InstallNewVersion is called.
+  void UpdateCachedValuesFromMemTableListVersion();
+
+  // `usage` is the current size of the mutable Memtable. When
+  // max_write_buffer_size_to_maintain is used, total size of mutable and
+  // immutable memtables is checked against it to decide whether to trim
+  // memtable list.
+  //
+  // Return true if memtable is trimmed
+  bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+  // Returns an estimate of the number of bytes of data used by
+  // the unflushed mem-tables.
+  size_t ApproximateUnflushedMemTablesMemoryUsage();
+
+  // Returns an estimate of the timestamp of the earliest key.
+  uint64_t ApproximateOldestKeyTime() const;
+
+  // Request a flush of all existing memtables to storage. This will
+  // cause future calls to IsFlushPending() to return true if this list is
+  // non-empty (regardless of the min_write_buffer_number_to_merge
+  // parameter). This flush request will persist until the next time
+  // PickMemtablesToFlush() is called.
+  void FlushRequested() {
+    flush_requested_ = true;
+    // If there are some memtables stored in imm() that don't trigger
+    // flush (e.g. mempurge output memtable), then update imm_flush_needed.
+    // Note: if a race condition causes imm_flush_needed to be set to true
+    // when num_flush_not_started_ == 0, there is no impact whatsoever.
+    // imm_flush_needed is only used in an assert in IsFlushPending().
+    if (num_flush_not_started_ > 0) {
+      imm_flush_needed.store(true, std::memory_order_release);
+    }
+  }
+
+  bool HasFlushRequested() { return flush_requested_; }
+
+  // Returns true if a trim history should be scheduled and the caller should
+  // be the one to schedule it
+  bool MarkTrimHistoryNeeded() {
+    auto expected = false;
+    return imm_trim_needed.compare_exchange_strong(
+        expected, true, std::memory_order_relaxed, std::memory_order_relaxed);
+  }
+
+  void ResetTrimHistoryNeeded() {
+    auto expected = true;
+    imm_trim_needed.compare_exchange_strong(
+        expected, false, std::memory_order_relaxed, std::memory_order_relaxed);
+  }
+
+  // No copying allowed
+  // MemTableList(const MemTableList&);
+  // void operator=(const MemTableList&);
+
+  size_t* current_memory_usage() { return &current_memory_usage_; }
+
+  // Returns the min log containing the prep section after memtables listed in
+  // `memtables_to_flush` are flushed and their status is persisted in
+  // manifest.
+  uint64_t PrecomputeMinLogContainingPrepSection(
+      const std::unordered_set<MemTable*>* memtables_to_flush = nullptr);
+
+  uint64_t GetEarliestMemTableID() const {
+    auto& memlist = current_->memlist_;
+    if (memlist.empty()) {
+      return std::numeric_limits<uint64_t>::max();
+    }
+    return memlist.back()->GetID();
+  }
+
+  uint64_t GetLatestMemTableID() const {
+    auto& memlist = current_->memlist_;
+    if (memlist.empty()) {
+      return 0;
+    }
+    return memlist.front()->GetID();
+  }
+
+  void AssignAtomicFlushSeq(const SequenceNumber& seq) {
+    const auto& memlist = current_->memlist_;
+    // Scan the memtable list from new to old
+    for (auto it = memlist.begin(); it != memlist.end(); ++it) {
+      MemTable* mem = *it;
+      if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
+        mem->atomic_flush_seqno_ = seq;
+      } else {
+        // Earlier memtables must have been assigned an atomic flush seq; no
+        // need to continue the scan.
+        break;
+      }
+    }
+  }
+
+  // Used only by DBImplSecondary during log replay.
+  // Remove memtables whose data were written before the WAL with log_number
+  // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are
+  // not freed, but put into a vector for future deref and reclamation.
+  void RemoveOldMemTables(uint64_t log_number,
+                          autovector<MemTable*>* to_delete);
+
+ private:
+  friend Status InstallMemtableAtomicFlushResults(
+      const autovector<MemTableList*>* imm_lists,
+      const autovector<ColumnFamilyData*>& cfds,
+      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+      const autovector<const autovector<MemTable*>*>& mems_list,
+      VersionSet* vset, LogsWithPrepTracker* prep_tracker,
+      InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+      const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+          committed_flush_jobs_info,
+      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+      LogBuffer* log_buffer);
+
+  // DB mutex held
+  void InstallNewVersion();
+
+  // DB mutex held
+  // Called after writing to MANIFEST
+  void RemoveMemTablesOrRestoreFlags(const Status& s, ColumnFamilyData* cfd,
+                                     size_t batch_count, LogBuffer* log_buffer,
+                                     autovector<MemTable*>* to_delete,
+                                     InstrumentedMutex* mu);
+
+  const int min_write_buffer_number_to_merge_;
+
+  MemTableListVersion* current_;
+
+  // the number of elements that still need flushing
+  int num_flush_not_started_;
+
+  // committing in progress
+  bool commit_in_progress_;
+
+  // Requested a flush of memtables to storage. It's possible to request that
+  // a subset of memtables be flushed.
+  bool flush_requested_;
+
+  // The current memory usage.
+  size_t current_memory_usage_;
+
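+  // The cached copies kept in the two atomics below let hot paths read these
+  // values without taking the DB mutex; they are refreshed by
+  // UpdateCachedValuesFromMemTableListVersion() whenever a new version is
+  // installed. Hypothetical mutex-free reader (illustrative only, not an
+  // actual call site in this file):
+  //
+  //   if (imm->HasHistory() &&
+  //       imm->MemoryAllocatedBytesExcludingLast() + mutable_usage >
+  //           size_to_maintain) {
+  //     imm->MarkTrimHistoryNeeded();
+  //   }
+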
+  // Cached value of current_->MemoryAllocatedBytesExcludingLast().
+  std::atomic<size_t> current_memory_allocted_bytes_excluding_last_;
+
+  // Cached value of current_->HasHistory().
+  std::atomic<bool> current_has_history_;
+};
+
+// Installs memtable atomic flush results.
+// In most cases, imm_lists is nullptr, and the function simply uses the
+// immutable memtable lists associated with the cfds. There are unit tests
+// that install flush results for external immutable memtable lists other
+// than the cfds' own immutable memtable lists, e.g. MemTableListTest. In
+// this case, the imm_lists parameter is not nullptr.
+extern Status InstallMemtableAtomicFlushResults(
+    const autovector<MemTableList*>* imm_lists,
+    const autovector<ColumnFamilyData*>& cfds,
+    const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+    const autovector<const autovector<MemTable*>*>& mems_list,
+    VersionSet* vset, LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu,
+    const autovector<FileMetaData*>& file_meta,
+    const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+        committed_flush_jobs_info,
+    autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+    LogBuffer* log_buffer);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_context.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_context.h
new file mode 100644
index 0000000000..8a7b072902
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_context.h
@@ -0,0 +1,147 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::vector<Slice> empty_operand_list;
+
+// The merge context for merging a user key.
+// When doing a Get(), DB will create such a class and pass it when
+// issuing Get() operation to memtables and version_set. The operands
+// will be fetched from the context when issuing partial or full merge.
+class MergeContext {
+ public:
+  // Clear all the operands
+  void Clear() {
+    if (operand_list_) {
+      operand_list_->clear();
+      copied_operands_->clear();
+    }
+  }
+
+  // Push a merge operand
+  void PushOperand(const Slice& operand_slice, bool operand_pinned = false) {
+    Initialize();
+    SetDirectionBackward();
+
+    if (operand_pinned) {
+      operand_list_->push_back(operand_slice);
+    } else {
+      // We need to have our own copy of the operand since it's not pinned
+      copied_operands_->emplace_back(
+          new std::string(operand_slice.data(), operand_slice.size()));
+      operand_list_->push_back(*copied_operands_->back());
+    }
+  }
+
+  // Push back a merge operand
+  void PushOperandBack(const Slice& operand_slice,
+                       bool operand_pinned = false) {
+    Initialize();
+    SetDirectionForward();
+
+    if (operand_pinned) {
+      operand_list_->push_back(operand_slice);
+    } else {
+      // We need to have our own copy of the operand since it's not pinned
+      copied_operands_->emplace_back(
+          new std::string(operand_slice.data(), operand_slice.size()));
+      operand_list_->push_back(*copied_operands_->back());
+    }
+  }
+
+  // return total number of operands in the list
+  size_t GetNumOperands() const {
+    if (!operand_list_) {
+      return 0;
+    }
+    return operand_list_->size();
+  }
+
+  // Get the operand at the index.
+  Slice GetOperand(int index) const {
+    assert(operand_list_);
+
+    SetDirectionForward();
+    return (*operand_list_)[index];
+  }
+
+  // Same as GetOperandsDirectionForward
+  //
+  // Note that the returned reference is only good until another call
+  // to this MergeContext.
+  // If the returned value is needed for longer,
+  // a copy must be made.
+  const std::vector<Slice>& GetOperands() const {
+    return GetOperandsDirectionForward();
+  }
+
+  // Return all the operands in the order as they were merged (passed to
+  // FullMerge or FullMergeV2)
+  //
+  // Note that the returned reference is only good until another call
+  // to this MergeContext. If the returned value is needed for longer,
+  // a copy must be made.
+  const std::vector<Slice>& GetOperandsDirectionForward() const {
+    if (!operand_list_) {
+      return empty_operand_list;
+    }
+
+    SetDirectionForward();
+    return *operand_list_;
+  }
+
+  // Return all the operands in the reversed order relative to how they were
+  // merged (passed to FullMerge or FullMergeV2)
+  //
+  // Note that the returned reference is only good until another call
+  // to this MergeContext. If the returned value is needed for longer,
+  // a copy must be made.
+  const std::vector<Slice>& GetOperandsDirectionBackward() const {
+    if (!operand_list_) {
+      return empty_operand_list;
+    }
+
+    SetDirectionBackward();
+    return *operand_list_;
+  }
+
+ private:
+  void Initialize() {
+    if (!operand_list_) {
+      operand_list_.reset(new std::vector<Slice>());
+      copied_operands_.reset(new std::vector<std::unique_ptr<std::string>>());
+    }
+  }
+
+  void SetDirectionForward() const {
+    if (operands_reversed_ == true) {
+      std::reverse(operand_list_->begin(), operand_list_->end());
+      operands_reversed_ = false;
+    }
+  }
+
+  void SetDirectionBackward() const {
+    if (operands_reversed_ == false) {
+      std::reverse(operand_list_->begin(), operand_list_->end());
+      operands_reversed_ = true;
+    }
+  }
+
+  // List of operands
+  mutable std::unique_ptr<std::vector<Slice>> operand_list_;
+  // Copy of operands that are not pinned.
+  std::unique_ptr<std::vector<std::unique_ptr<std::string>>> copied_operands_;
+  mutable bool operands_reversed_ = true;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_helper.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_helper.cc
new file mode 100644
index 0000000000..110ac9622e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_helper.cc
@@ -0,0 +1,606 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#include "db/merge_helper.h" + +#include + +#include "db/blob/blob_fetcher.h" +#include "db/blob/blob_index.h" +#include "db/blob/prefetch_buffer_collection.h" +#include "db/compaction/compaction_iteration_stats.h" +#include "db/dbformat.h" +#include "db/wide/wide_column_serialization.h" +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics_impl.h" +#include "port/likely.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/system_clock.h" +#include "table/format.h" +#include "table/internal_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator, + const MergeOperator* user_merge_operator, + const CompactionFilter* compaction_filter, + Logger* logger, bool assert_valid_internal_key, + SequenceNumber latest_snapshot, + const SnapshotChecker* snapshot_checker, int level, + Statistics* stats, + const std::atomic* shutting_down) + : env_(env), + clock_(env->GetSystemClock().get()), + user_comparator_(user_comparator), + user_merge_operator_(user_merge_operator), + compaction_filter_(compaction_filter), + shutting_down_(shutting_down), + logger_(logger), + assert_valid_internal_key_(assert_valid_internal_key), + allow_single_operand_(false), + latest_snapshot_(latest_snapshot), + snapshot_checker_(snapshot_checker), + level_(level), + keys_(), + filter_timer_(clock_), + total_filter_time_(0U), + stats_(stats) { + assert(user_comparator_ != nullptr); + if (user_merge_operator_) { + allow_single_operand_ = user_merge_operator_->AllowSingleOperand(); + } +} + +Status MergeHelper::TimedFullMerge( + const MergeOperator* merge_operator, const Slice& key, const Slice* value, + const std::vector& operands, std::string* result, Logger* logger, + Statistics* statistics, SystemClock* clock, Slice* result_operand, + bool update_num_ops_stats, + MergeOperator::OpFailureScope* op_failure_scope) { + assert(merge_operator != nullptr); + + if (operands.empty()) { + assert(value != nullptr && result != nullptr); + result->assign(value->data(), value->size()); + return Status::OK(); + } + + if (update_num_ops_stats) { + RecordInHistogram(statistics, READ_NUM_MERGE_OPERANDS, + static_cast(operands.size())); + } + + bool success = false; + Slice tmp_result_operand(nullptr, 0); + const MergeOperator::MergeOperationInput merge_in(key, value, operands, + logger); + MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand); + { + // Setup to time the merge + StopWatchNano timer(clock, statistics != nullptr); + PERF_TIMER_GUARD(merge_operator_time_nanos); + + // Do the merge + success = merge_operator->FullMergeV2(merge_in, &merge_out); + + if (tmp_result_operand.data()) { + // FullMergeV2 result is an existing operand + if (result_operand != nullptr) { + *result_operand = tmp_result_operand; + } else { + result->assign(tmp_result_operand.data(), tmp_result_operand.size()); + } + } else if (result_operand) { + *result_operand = Slice(nullptr, 0); + } + + RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME, + statistics ? 
timer.ElapsedNanos() : 0); + } + + if (op_failure_scope != nullptr) { + *op_failure_scope = merge_out.op_failure_scope; + // Apply default per merge_operator.h + if (*op_failure_scope == MergeOperator::OpFailureScope::kDefault) { + *op_failure_scope = MergeOperator::OpFailureScope::kTryMerge; + } + } + + if (!success) { + RecordTick(statistics, NUMBER_MERGE_FAILURES); + return Status::Corruption(Status::SubCode::kMergeOperatorFailed); + } + + return Status::OK(); +} + +Status MergeHelper::TimedFullMergeWithEntity( + const MergeOperator* merge_operator, const Slice& key, Slice base_entity, + const std::vector& operands, std::string* result, Logger* logger, + Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, + MergeOperator::OpFailureScope* op_failure_scope) { + WideColumns base_columns; + + { + const Status s = + WideColumnSerialization::Deserialize(base_entity, base_columns); + if (!s.ok()) { + return s; + } + } + + const bool has_default_column = + !base_columns.empty() && base_columns[0].name() == kDefaultWideColumnName; + + Slice value_of_default; + if (has_default_column) { + value_of_default = base_columns[0].value(); + } + + std::string merge_result; + + { + const Status s = TimedFullMerge(merge_operator, key, &value_of_default, + operands, &merge_result, logger, statistics, + clock, nullptr /* result_operand */, + update_num_ops_stats, op_failure_scope); + if (!s.ok()) { + return s; + } + } + + if (has_default_column) { + base_columns[0].value() = merge_result; + + const Status s = WideColumnSerialization::Serialize(base_columns, *result); + if (!s.ok()) { + return s; + } + } else { + const Status s = + WideColumnSerialization::Serialize(merge_result, base_columns, *result); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); +} + +// PRE: iter points to the first merge type entry +// POST: iter points to the first entry beyond the merge process (or the end) +// keys_, operands_ are updated to reflect the merge result. +// keys_ stores the list of keys encountered while merging. +// operands_ stores the list of merge operands encountered while merging. +// keys_[i] corresponds to operands_[i] for each i. +// +// TODO: Avoid the snapshot stripe map lookup in CompactionRangeDelAggregator +// and just pass the StripeRep corresponding to the stripe being merged. +Status MergeHelper::MergeUntil(InternalIterator* iter, + CompactionRangeDelAggregator* range_del_agg, + const SequenceNumber stop_before, + const bool at_bottom, + const bool allow_data_in_errors, + const BlobFetcher* blob_fetcher, + const std::string* const full_history_ts_low, + PrefetchBufferCollection* prefetch_buffers, + CompactionIterationStats* c_iter_stats) { + // Get a copy of the internal key, before it's invalidated by iter->Next() + // Also maintain the list of merge operands seen. + assert(HasOperator()); + keys_.clear(); + merge_context_.Clear(); + has_compaction_filter_skip_until_ = false; + assert(user_merge_operator_); + assert(user_comparator_); + const size_t ts_sz = user_comparator_->timestamp_size(); + if (full_history_ts_low) { + assert(ts_sz > 0); + assert(ts_sz == full_history_ts_low->size()); + } + bool first_key = true; + + // We need to parse the internal key again as the parsed key is + // backed by the internal key! + // Assume no internal key corruption as it has been successfully parsed + // by the caller. 
+ // original_key_is_iter variable is just caching the information: + // original_key_is_iter == (iter->key().ToString() == original_key) + bool original_key_is_iter = true; + std::string original_key = iter->key().ToString(); + // Important: + // orig_ikey is backed by original_key if keys_.empty() + // orig_ikey is backed by keys_.back() if !keys_.empty() + ParsedInternalKey orig_ikey; + + Status s = ParseInternalKey(original_key, &orig_ikey, allow_data_in_errors); + assert(s.ok()); + if (!s.ok()) return s; + + assert(kTypeMerge == orig_ikey.type); + + bool hit_the_next_user_key = false; + int cmp_with_full_history_ts_low = 0; + for (; iter->Valid(); iter->Next(), original_key_is_iter = false) { + if (IsShuttingDown()) { + s = Status::ShutdownInProgress(); + return s; + } + // Skip range tombstones emitted by the compaction iterator. + if (iter->IsDeleteRangeSentinelKey()) { + continue; + } + + ParsedInternalKey ikey; + assert(keys_.size() == merge_context_.GetNumOperands()); + + Status pik_status = + ParseInternalKey(iter->key(), &ikey, allow_data_in_errors); + Slice ts; + if (pik_status.ok()) { + ts = ExtractTimestampFromUserKey(ikey.user_key, ts_sz); + if (full_history_ts_low) { + cmp_with_full_history_ts_low = + user_comparator_->CompareTimestamp(ts, *full_history_ts_low); + } + } + if (!pik_status.ok()) { + // stop at corrupted key + if (assert_valid_internal_key_) { + return pik_status; + } + break; + } else if (first_key) { + // If user-defined timestamp is enabled, we expect both user key and + // timestamps are equal, as a sanity check. + assert(user_comparator_->Equal(ikey.user_key, orig_ikey.user_key)); + first_key = false; + } else if (!user_comparator_->EqualWithoutTimestamp(ikey.user_key, + orig_ikey.user_key) || + (ts_sz > 0 && + !user_comparator_->Equal(ikey.user_key, orig_ikey.user_key) && + cmp_with_full_history_ts_low >= 0)) { + // 1) hit a different user key, or + // 2) user-defined timestamp is enabled, and hit a version of user key NOT + // eligible for GC, then stop right here. + hit_the_next_user_key = true; + break; + } else if (stop_before > 0 && ikey.sequence <= stop_before && + LIKELY(snapshot_checker_ == nullptr || + snapshot_checker_->CheckInSnapshot(ikey.sequence, + stop_before) != + SnapshotCheckerResult::kNotInSnapshot)) { + // hit an entry that's possibly visible by the previous snapshot, can't + // touch that + break; + } + + // At this point we are guaranteed that we need to process this key. + + assert(IsValueType(ikey.type)); + if (ikey.type != kTypeMerge) { + // hit a put/delete/single delete + // => merge the put value or a nullptr with operands_ + // => store result in operands_.back() (and update keys_.back()) + // => change the entry type to kTypeValue for keys_.back() + // We are done! Success! + + // If there are no operands, just return the Status::OK(). That will cause + // the compaction iterator to write out the key we're currently at, which + // is the put/delete we just encountered. + if (keys_.empty()) { + return s; + } + + // TODO: if we're in compaction and it's a put, it would be nice to run + // compaction filter on it. 
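+      // The final merge below chooses its base value from the entry we
+      // stopped at: an entry covered by a range tombstone (or a delete-like
+      // entry) merges against nullptr, kTypeValue merges against
+      // iter->value(), kTypeBlobIndex against the fetched blob value, and
+      // kTypeWideColumnEntity is routed through TimedFullMergeWithEntity().
+      // Schematically:
+      //
+      //   base = covered_by_range_del    ? nullptr
+      //        : type == kTypeValue      ? iter->value()
+      //        : type == kTypeBlobIndex  ? fetched_blob_value
+      //        : /* delete-like */         nullptr;
+      //   s = TimedFullMerge(user_merge_operator_, ikey.user_key, base,
+      //                      merge_context_.GetOperands(), &merge_result,
+      //                      ...);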
+ std::string merge_result; + MergeOperator::OpFailureScope op_failure_scope; + + if (range_del_agg && + range_del_agg->ShouldDelete( + ikey, RangeDelPositioningMode::kForwardTraversal)) { + s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr, + merge_context_.GetOperands(), &merge_result, logger_, + stats_, clock_, + /* result_operand */ nullptr, + /* update_num_ops_stats */ false, &op_failure_scope); + } else if (ikey.type == kTypeValue) { + const Slice val = iter->value(); + + s = TimedFullMerge(user_merge_operator_, ikey.user_key, &val, + merge_context_.GetOperands(), &merge_result, logger_, + stats_, clock_, + /* result_operand */ nullptr, + /* update_num_ops_stats */ false, &op_failure_scope); + } else if (ikey.type == kTypeBlobIndex) { + BlobIndex blob_index; + + s = blob_index.DecodeFrom(iter->value()); + if (!s.ok()) { + return s; + } + + FilePrefetchBuffer* prefetch_buffer = + prefetch_buffers ? prefetch_buffers->GetOrCreatePrefetchBuffer( + blob_index.file_number()) + : nullptr; + + uint64_t bytes_read = 0; + + assert(blob_fetcher); + + PinnableSlice blob_value; + s = blob_fetcher->FetchBlob(ikey.user_key, blob_index, prefetch_buffer, + &blob_value, &bytes_read); + if (!s.ok()) { + return s; + } + + if (c_iter_stats) { + ++c_iter_stats->num_blobs_read; + c_iter_stats->total_blob_bytes_read += bytes_read; + } + + s = TimedFullMerge(user_merge_operator_, ikey.user_key, &blob_value, + merge_context_.GetOperands(), &merge_result, logger_, + stats_, clock_, + /* result_operand */ nullptr, + /* update_num_ops_stats */ false, &op_failure_scope); + } else if (ikey.type == kTypeWideColumnEntity) { + s = TimedFullMergeWithEntity( + user_merge_operator_, ikey.user_key, iter->value(), + merge_context_.GetOperands(), &merge_result, logger_, stats_, + clock_, /* update_num_ops_stats */ false, &op_failure_scope); + } else { + s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr, + merge_context_.GetOperands(), &merge_result, logger_, + stats_, clock_, + /* result_operand */ nullptr, + /* update_num_ops_stats */ false, &op_failure_scope); + } + + // We store the result in keys_.back() and operands_.back() + // if nothing went wrong (i.e.: no operand corruption on disk) + if (s.ok()) { + // The original key encountered + original_key = std::move(keys_.back()); + orig_ikey.type = ikey.type == kTypeWideColumnEntity + ? kTypeWideColumnEntity + : kTypeValue; + UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type); + keys_.clear(); + merge_context_.Clear(); + keys_.emplace_front(std::move(original_key)); + merge_context_.PushOperand(merge_result); + + // move iter to the next entry + iter->Next(); + } else if (op_failure_scope == + MergeOperator::OpFailureScope::kMustMerge) { + // Change to `Status::MergeInProgress()` to denote output consists of + // merge operands only. Leave `iter` at the non-merge entry so it will + // be output after. + s = Status::MergeInProgress(); + } + return s; + } else { + // hit a merge + // => if there is a compaction filter, apply it. + // => check for range tombstones covering the operand + // => merge the operand into the front of the operands_ list + // if not filtered + // => then continue because we haven't yet seen a Put/Delete. + // + // Keep queuing keys and operands until we either meet a put / delete + // request or later did a partial merge. + + Slice value_slice = iter->value(); + // add an operand to the list if: + // 1) it's included in one of the snapshots. 
in that case we *must* write + // it out, no matter what compaction filter says + // 2) it's not filtered by a compaction filter + CompactionFilter::Decision filter = + ikey.sequence <= latest_snapshot_ + ? CompactionFilter::Decision::kKeep + : FilterMerge(orig_ikey.user_key, value_slice); + if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil && + range_del_agg != nullptr && + range_del_agg->ShouldDelete( + iter->key(), RangeDelPositioningMode::kForwardTraversal)) { + filter = CompactionFilter::Decision::kRemove; + } + if (filter == CompactionFilter::Decision::kKeep || + filter == CompactionFilter::Decision::kChangeValue) { + if (original_key_is_iter) { + // this is just an optimization that saves us one memcpy + keys_.emplace_front(original_key); + } else { + keys_.emplace_front(iter->key().ToString()); + } + if (keys_.size() == 1) { + // we need to re-anchor the orig_ikey because it was anchored by + // original_key before + pik_status = + ParseInternalKey(keys_.back(), &orig_ikey, allow_data_in_errors); + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); + } + if (filter == CompactionFilter::Decision::kKeep) { + merge_context_.PushOperand( + value_slice, iter->IsValuePinned() /* operand_pinned */); + } else { + assert(filter == CompactionFilter::Decision::kChangeValue); + // Compaction filter asked us to change the operand from value_slice + // to compaction_filter_value_. + merge_context_.PushOperand(compaction_filter_value_, false); + } + } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { + // Compaction filter asked us to remove this key altogether + // (not just this operand), along with some keys following it. + keys_.clear(); + merge_context_.Clear(); + has_compaction_filter_skip_until_ = true; + return s; + } + } + } + + if (cmp_with_full_history_ts_low >= 0) { + size_t num_merge_operands = merge_context_.GetNumOperands(); + if (ts_sz && num_merge_operands > 1) { + // We do not merge merge operands with different timestamps if they are + // not eligible for GC. + ROCKS_LOG_ERROR(logger_, "ts_sz=%d, %d merge oprands", + static_cast(ts_sz), + static_cast(num_merge_operands)); + assert(false); + } + } + + if (merge_context_.GetNumOperands() == 0) { + // we filtered out all the merge operands + return s; + } + + // We are sure we have seen this key's entire history if: + // at_bottom == true (this does not necessarily mean it is the bottommost + // layer, but rather that we are confident the key does not appear on any of + // the lower layers, at_bottom == false doesn't mean it does appear, just + // that we can't be sure, see Compaction::IsBottommostLevel for details) + // AND + // we have either encountered another key or end of key history on this + // layer. + // Note that if user-defined timestamp is enabled, we need some extra caution + // here: if full_history_ts_low is nullptr, or it's not null but the key's + // timestamp is greater than or equal to full_history_ts_low, it means this + // key cannot be dropped. We may not have seen the beginning of the key. + // + // When these conditions are true we are able to merge all the keys + // using full merge. + // + // For these cases we are not sure about, we simply miss the opportunity + // to combine the keys. Since VersionSet::SetupOtherInputs() always makes + // sure that all merge-operands on the same level get compacted together, + // this will simply lead to these merge operands moving to the next level. 
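+  //
+  // Worked example (schematic): if this key's entries here are
+  // [Merge(o2), Merge(o1)] and we are at the bottom having surely seen the
+  // key's beginning, the branch below emits a single Put whose value is
+  // FullMerge(nullptr, [o1, o2]). Otherwise the operands can at best be
+  // combined associatively via PartialMergeMulti() and remain a Merge,
+  // reported through Status::MergeInProgress().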
+ bool surely_seen_the_beginning = + (hit_the_next_user_key || !iter->Valid()) && at_bottom && + (ts_sz == 0 || cmp_with_full_history_ts_low < 0); + if (surely_seen_the_beginning) { + // do a final merge with nullptr as the existing value and say + // bye to the merge type (it's now converted to a Put) + assert(kTypeMerge == orig_ikey.type); + assert(merge_context_.GetNumOperands() >= 1); + assert(merge_context_.GetNumOperands() == keys_.size()); + std::string merge_result; + MergeOperator::OpFailureScope op_failure_scope; + s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, nullptr, + merge_context_.GetOperands(), &merge_result, logger_, + stats_, clock_, + /* result_operand */ nullptr, + /* update_num_ops_stats */ false, &op_failure_scope); + if (s.ok()) { + // The original key encountered + // We are certain that keys_ is not empty here (see assertions couple of + // lines before). + original_key = std::move(keys_.back()); + orig_ikey.type = kTypeValue; + UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type); + keys_.clear(); + merge_context_.Clear(); + keys_.emplace_front(std::move(original_key)); + merge_context_.PushOperand(merge_result); + } else if (op_failure_scope == MergeOperator::OpFailureScope::kMustMerge) { + // Change to `Status::MergeInProgress()` to denote output consists of + // merge operands only. + s = Status::MergeInProgress(); + } + } else { + // We haven't seen the beginning of the key nor a Put/Delete. + // Attempt to use the user's associative merge function to + // merge the stacked merge operands into a single operand. + s = Status::MergeInProgress(); + if (merge_context_.GetNumOperands() >= 2 || + (allow_single_operand_ && merge_context_.GetNumOperands() == 1)) { + bool merge_success = false; + std::string merge_result; + { + StopWatchNano timer(clock_, stats_ != nullptr); + PERF_TIMER_GUARD(merge_operator_time_nanos); + merge_success = user_merge_operator_->PartialMergeMulti( + orig_ikey.user_key, + std::deque(merge_context_.GetOperands().begin(), + merge_context_.GetOperands().end()), + &merge_result, logger_); + RecordTick(stats_, MERGE_OPERATION_TOTAL_TIME, + stats_ ? timer.ElapsedNanosSafe() : 0); + } + if (merge_success) { + // Merging of operands (associative merge) was successful. 
+ // Replace operands with the merge result + merge_context_.Clear(); + merge_context_.PushOperand(merge_result); + keys_.erase(keys_.begin(), keys_.end() - 1); + } + } + } + + return s; +} + +MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper) + : merge_helper_(merge_helper) { + it_keys_ = merge_helper_->keys().rend(); + it_values_ = merge_helper_->values().rend(); +} + +void MergeOutputIterator::SeekToFirst() { + const auto& keys = merge_helper_->keys(); + const auto& values = merge_helper_->values(); + assert(keys.size() == values.size()); + it_keys_ = keys.rbegin(); + it_values_ = values.rbegin(); +} + +void MergeOutputIterator::Next() { + ++it_keys_; + ++it_values_; +} + +CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key, + const Slice& value_slice) { + if (compaction_filter_ == nullptr) { + return CompactionFilter::Decision::kKeep; + } + if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) { + filter_timer_.Start(); + } + compaction_filter_value_.clear(); + compaction_filter_skip_until_.Clear(); + auto ret = compaction_filter_->FilterV3( + level_, user_key, CompactionFilter::ValueType::kMergeOperand, + &value_slice, /* existing_columns */ nullptr, &compaction_filter_value_, + /* new_columns */ nullptr, compaction_filter_skip_until_.rep()); + if (ret == CompactionFilter::Decision::kRemoveAndSkipUntil) { + if (user_comparator_->Compare(*compaction_filter_skip_until_.rep(), + user_key) <= 0) { + // Invalid skip_until returned from compaction filter. + // Keep the key as per FilterV2/FilterV3 documentation. + ret = CompactionFilter::Decision::kKeep; + } else { + compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, + kValueTypeForSeek); + } + } + if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) { + total_filter_time_ += filter_timer_.ElapsedNanosSafe(); + } + return ret; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_helper.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_helper.h new file mode 100644 index 0000000000..7f624b7432 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_helper.h @@ -0,0 +1,224 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#pragma once + +#include +#include +#include + +#include "db/merge_context.h" +#include "db/range_del_aggregator.h" +#include "db/snapshot_checker.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "rocksdb/wide_columns.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +class Comparator; +class Iterator; +class Logger; +class MergeOperator; +class Statistics; +class SystemClock; +class BlobFetcher; +class PrefetchBufferCollection; +struct CompactionIterationStats; + +class MergeHelper { + public: + MergeHelper(Env* env, const Comparator* user_comparator, + const MergeOperator* user_merge_operator, + const CompactionFilter* compaction_filter, Logger* logger, + bool assert_valid_internal_key, SequenceNumber latest_snapshot, + const SnapshotChecker* snapshot_checker = nullptr, int level = 0, + Statistics* stats = nullptr, + const std::atomic* shutting_down = nullptr); + + // Wrapper around MergeOperator::FullMergeV2() that records perf statistics. + // Result of merge will be written to result if status returned is OK. + // If operands is empty, the value will simply be copied to result. + // Set `update_num_ops_stats` to true if it is from a user read, so that + // the latency is sensitive. + // Returns one of the following statuses: + // - OK: Entries were successfully merged. + // - Corruption: Merge operator reported unsuccessful merge. The scope of the + // damage will be stored in `*op_failure_scope` when `op_failure_scope` is + // not nullptr + static Status TimedFullMerge(const MergeOperator* merge_operator, + const Slice& key, const Slice* value, + const std::vector& operands, + std::string* result, Logger* logger, + Statistics* statistics, SystemClock* clock, + Slice* result_operand, bool update_num_ops_stats, + MergeOperator::OpFailureScope* op_failure_scope); + + static Status TimedFullMergeWithEntity( + const MergeOperator* merge_operator, const Slice& key, Slice base_entity, + const std::vector& operands, std::string* result, Logger* logger, + Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, + MergeOperator::OpFailureScope* op_failure_scope); + + // During compaction, merge entries until we hit + // - a corrupted key + // - a Put/Delete, + // - a different user key, + // - a specific sequence number (snapshot boundary), + // - REMOVE_AND_SKIP_UNTIL returned from compaction filter, + // or - the end of iteration + // + // The result(s) of the merge can be accessed in `MergeHelper::keys()` and + // `MergeHelper::values()`, which are invalidated the next time `MergeUntil()` + // is called. `MergeOutputIterator` is specially designed to iterate the + // results of a `MergeHelper`'s most recent `MergeUntil()`. + // + // iter: (IN) points to the first merge type entry + // (OUT) points to the first entry not included in the merge process + // range_del_agg: (IN) filters merge operands covered by range tombstones. + // stop_before: (IN) a sequence number that merge should not cross. + // 0 means no restriction + // at_bottom: (IN) true if the iterator covers the bottem level, which means + // we could reach the start of the history of this user key. + // allow_data_in_errors: (IN) if true, data details will be displayed in + // error/log messages. + // blob_fetcher: (IN) blob fetcher object for the compaction's input version. + // prefetch_buffers: (IN/OUT) a collection of blob file prefetch buffers + // used for compaction readahead. 
+ // c_iter_stats: (OUT) compaction iteration statistics. + // + // Returns one of the following statuses: + // - OK: Entries were successfully merged. + // - MergeInProgress: Output consists of merge operands only. + // - Corruption: Merge operator reported unsuccessful merge or a corrupted + // key has been encountered and not expected (applies only when compiling + // with asserts removed). + // - ShutdownInProgress: interrupted by shutdown (*shutting_down == true). + // + // REQUIRED: The first key in the input is not corrupted. + Status MergeUntil(InternalIterator* iter, + CompactionRangeDelAggregator* range_del_agg, + const SequenceNumber stop_before, const bool at_bottom, + const bool allow_data_in_errors, + const BlobFetcher* blob_fetcher, + const std::string* const full_history_ts_low, + PrefetchBufferCollection* prefetch_buffers, + CompactionIterationStats* c_iter_stats); + + // Filters a merge operand using the compaction filter specified + // in the constructor. Returns the decision that the filter made. + // Uses compaction_filter_value_ and compaction_filter_skip_until_ for the + // optional outputs of compaction filter. + // user_key includes timestamp if user-defined timestamp is enabled. + CompactionFilter::Decision FilterMerge(const Slice& user_key, + const Slice& value_slice); + + // Query the merge result + // These are valid until the next MergeUntil call + // If the merging was successful: + // - keys() contains a single element with the latest sequence number of + // the merges. The type will be Put or Merge. See IMPORTANT 1 note, below. + // - values() contains a single element with the result of merging all the + // operands together + // + // IMPORTANT 1: the key type could change after the MergeUntil call. + // Put/Delete + Merge + ... + Merge => Put + // Merge + ... + Merge => Merge + // + // If the merge operator is not associative, and if a Put/Delete is not found + // then the merging will be unsuccessful. In this case: + // - keys() contains the list of internal keys seen in order of iteration. + // - values() contains the list of values (merges) seen in the same order. + // values() is parallel to keys() so that the first entry in + // keys() is the key associated with the first entry in values() + // and so on. These lists will be the same length. + // All of these pairs will be merges over the same user key. + // See IMPORTANT 2 note below. + // + // IMPORTANT 2: The entries were traversed in order from BACK to FRONT. + // So keys().back() was the first key seen by iterator. + // TODO: Re-style this comment to be like the first one + const std::deque& keys() const { return keys_; } + const std::vector& values() const { + return merge_context_.GetOperands(); + } + uint64_t TotalFilterTime() const { return total_filter_time_; } + bool HasOperator() const { return user_merge_operator_ != nullptr; } + + // If compaction filter returned REMOVE_AND_SKIP_UNTIL, this method will + // return true and fill *until with the key to which we should skip. + // If true, keys() and values() are empty. 
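+  //
+  // Hypothetical caller sketch (the compaction iterator does something of
+  // this shape; `input` is assumed context, not part of this class):
+  //
+  //   Slice skip_until;
+  //   if (merge_helper.FilteredUntil(&skip_until)) {
+  //     input->Seek(skip_until);  // drop this key and fast-forward
+  //   }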
+ bool FilteredUntil(Slice* skip_until) const { + if (!has_compaction_filter_skip_until_) { + return false; + } + assert(compaction_filter_ != nullptr); + assert(skip_until != nullptr); + assert(compaction_filter_skip_until_.Valid()); + *skip_until = compaction_filter_skip_until_.Encode(); + return true; + } + + private: + Env* env_; + SystemClock* clock_; + const Comparator* user_comparator_; + const MergeOperator* user_merge_operator_; + const CompactionFilter* compaction_filter_; + const std::atomic* shutting_down_; + Logger* logger_; + bool assert_valid_internal_key_; // enforce no internal key corruption? + bool allow_single_operand_; + SequenceNumber latest_snapshot_; + const SnapshotChecker* const snapshot_checker_; + int level_; + + // the scratch area that holds the result of MergeUntil + // valid up to the next MergeUntil call + + // Keeps track of the sequence of keys seen + std::deque keys_; + // Parallel with keys_; stores the operands + mutable MergeContext merge_context_; + + StopWatchNano filter_timer_; + uint64_t total_filter_time_; + Statistics* stats_; + + bool has_compaction_filter_skip_until_ = false; + std::string compaction_filter_value_; + InternalKey compaction_filter_skip_until_; + + bool IsShuttingDown() { + // This is a best-effort facility, so memory_order_relaxed is sufficient. + return shutting_down_ && shutting_down_->load(std::memory_order_relaxed); + } +}; + +// MergeOutputIterator can be used to iterate over the result of a merge. +class MergeOutputIterator { + public: + // The MergeOutputIterator is bound to a MergeHelper instance. + explicit MergeOutputIterator(const MergeHelper* merge_helper); + + // Seeks to the first record in the output. + void SeekToFirst(); + // Advances to the next record in the output. + void Next(); + + Slice key() { return Slice(*it_keys_); } + Slice value() { return Slice(*it_values_); } + bool Valid() { return it_keys_ != merge_helper_->keys().rend(); } + + private: + const MergeHelper* merge_helper_; + std::deque::const_reverse_iterator it_keys_; + std::vector::const_reverse_iterator it_values_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_operator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_operator.cc new file mode 100644 index 0000000000..d325856406 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/merge_operator.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +/** + * Back-end implementation details specific to the Merge Operator. + */ + +#include "rocksdb/merge_operator.h" + +namespace ROCKSDB_NAMESPACE { + +bool MergeOperator::FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + // If FullMergeV2 is not implemented, we convert the operand_list to + // std::deque and pass it to FullMerge + std::deque operand_list_str; + for (auto& op : merge_in.operand_list) { + operand_list_str.emplace_back(op.data(), op.size()); + } + return FullMerge(merge_in.key, merge_in.existing_value, operand_list_str, + &merge_out->new_value, merge_in.logger); +} + +// The default implementation of PartialMergeMulti, which invokes +// PartialMerge multiple times internally and merges two operands at +// a time. 
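+// With operands [o1, o2, o3] the default implementation computes a left
+// fold:
+//
+//   r1 = PartialMerge(key, o1, o2);
+//   r2 = PartialMerge(key, r1, o3);  // r2 ends up in *new_value
+//
+// and it returns false as soon as any pairwise PartialMerge fails.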
+bool MergeOperator::PartialMergeMulti(const Slice& key,
+                                      const std::deque<Slice>& operand_list,
+                                      std::string* new_value,
+                                      Logger* logger) const {
+  assert(operand_list.size() >= 2);
+  // Simply loop through the operands
+  Slice temp_slice(operand_list[0]);
+
+  for (size_t i = 1; i < operand_list.size(); ++i) {
+    auto& operand = operand_list[i];
+    std::string temp_value;
+    if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
+      return false;
+    }
+    swap(temp_value, *new_value);
+    temp_slice = Slice(*new_value);
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
+
+// Given a "real" merge from the library, call the user's
+// associative merge function one-by-one on each of the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::FullMergeV2(
+    const MergeOperationInput& merge_in,
+    MergeOperationOutput* merge_out) const {
+  // Simply loop through the operands
+  Slice temp_existing;
+  const Slice* existing_value = merge_in.existing_value;
+  for (const auto& operand : merge_in.operand_list) {
+    std::string temp_value;
+    if (!Merge(merge_in.key, existing_value, operand, &temp_value,
+               merge_in.logger)) {
+      return false;
+    }
+    swap(temp_value, merge_out->new_value);
+    temp_existing = Slice(merge_out->new_value);
+    existing_value = &temp_existing;
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
+
+// Call the user-defined simple merge on the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::PartialMerge(const Slice& key,
+                                            const Slice& left_operand,
+                                            const Slice& right_operand,
+                                            std::string* new_value,
+                                            Logger* logger) const {
+  return Merge(key, &left_operand, right_operand, new_value, logger);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/output_validator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/output_validator.cc
new file mode 100644
index 0000000000..e93e2d68c4
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/output_validator.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/output_validator.h"
+
+#include "test_util/sync_point.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status OutputValidator::Add(const Slice& key, const Slice& value) {
+  if (enable_hash_) {
+    // Generate a rolling 64-bit hash of the key and values
+    paranoid_hash_ = NPHash64(key.data(), key.size(), paranoid_hash_);
+    paranoid_hash_ = NPHash64(value.data(), value.size(), paranoid_hash_);
+  }
+  if (enable_order_check_) {
+    TEST_SYNC_POINT_CALLBACK("OutputValidator::Add:order_check",
+                             /*arg=*/nullptr);
+    if (key.size() < kNumInternalBytes) {
+      return Status::Corruption(
+          "Compaction tries to write a key without internal bytes.");
+    }
+    // prev_key_ starts with empty.
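+    // For illustration: adding user keys "a" then "b" passes the check
+    // below, while "b" then "a" returns Corruption (hypothetical keys).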
+    if (!prev_key_.empty() && icmp_.Compare(key, prev_key_) < 0) {
+      return Status::Corruption("Compaction sees out-of-order keys.");
+    }
+    prev_key_.assign(key.data(), key.size());
+  }
+  return Status::OK();
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/output_validator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/output_validator.h
new file mode 100644
index 0000000000..40635f9c44
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/output_validator.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A class that validates the key/value pairs inserted into an SST file.
+// Pass every key/value of the file to OutputValidator::Add(); the class
+// validates the key order and optionally calculates a hash of all the keys
+// and values.
+class OutputValidator {
+ public:
+  explicit OutputValidator(const InternalKeyComparator& icmp,
+                           bool enable_order_check, bool enable_hash,
+                           uint64_t precalculated_hash = 0)
+      : icmp_(icmp),
+        paranoid_hash_(precalculated_hash),
+        enable_order_check_(enable_order_check),
+        enable_hash_(enable_hash) {}
+
+  // Add a key to the KV sequence, and return whether the key follows
+  // criteria, e.g. key is ordered.
+  Status Add(const Slice& key, const Slice& value);
+
+  // Compares whether the keys fed to two validators hash to the same result.
+  // It can be used to compare the keys inserted into a file with what is
+  // read back. Returns true if the validation passes.
+  bool CompareValidator(const OutputValidator& other_validator) {
+    return GetHash() == other_validator.GetHash();
+  }
+
+  // Not (yet) intended to be persisted, so subject to change
+  // without notice between releases.
+  uint64_t GetHash() const { return paranoid_hash_; }
+
+ private:
+  const InternalKeyComparator& icmp_;
+  std::string prev_key_;
+  uint64_t paranoid_hash_ = 0;
+  bool enable_order_check_;
+  bool enable_hash_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/periodic_task_scheduler.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/periodic_task_scheduler.cc
new file mode 100644
index 0000000000..1306f45da6
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/periodic_task_scheduler.cc
@@ -0,0 +1,111 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/periodic_task_scheduler.h"
+
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// `timer_mutex` is a global mutex that currently serves 3 purposes:
+// (1) to ensure calls to `Start()` and `Shutdown()` are serialized, as
+//     they are currently not implemented in a thread-safe way; and
+// (2) to ensure the `Timer::Add()`s and `Timer::Start()` run atomically, and
+//     the `Timer::Cancel()`s and `Timer::Shutdown()` run atomically.
+// (3) to protect `tasks_map_` in PeriodicTaskScheduler.
+// Note: It's not efficient to have a static global mutex, but for
+// PeriodicTaskScheduler it should be okay, as the operations are called
+// infrequently.
+static port::Mutex timer_mutex;
+
+static const std::map<PeriodicTaskType, uint64_t> kDefaultPeriodSeconds = {
+    {PeriodicTaskType::kDumpStats, kInvalidPeriodSec},
+    {PeriodicTaskType::kPersistStats, kInvalidPeriodSec},
+    {PeriodicTaskType::kFlushInfoLog, 10},
+    {PeriodicTaskType::kRecordSeqnoTime, kInvalidPeriodSec},
+};
+
+static const std::map<PeriodicTaskType, std::string> kPeriodicTaskTypeNames = {
+    {PeriodicTaskType::kDumpStats, "dump_st"},
+    {PeriodicTaskType::kPersistStats, "pst_st"},
+    {PeriodicTaskType::kFlushInfoLog, "flush_info_log"},
+    {PeriodicTaskType::kRecordSeqnoTime, "record_seq_time"},
+};
+
+Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
+                                       const PeriodicTaskFunc& fn) {
+  return Register(task_type, fn, kDefaultPeriodSeconds.at(task_type));
+}
+
+Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
+                                       const PeriodicTaskFunc& fn,
+                                       uint64_t repeat_period_seconds) {
+  MutexLock l(&timer_mutex);
+  static std::atomic<uint64_t> initial_delay(0);
+
+  if (repeat_period_seconds == kInvalidPeriodSec) {
+    return Status::InvalidArgument("Invalid task repeat period");
+  }
+  auto it = tasks_map_.find(task_type);
+  if (it != tasks_map_.end()) {
+    // the task already exists and it's the same, no update needed
+    if (it->second.repeat_every_sec == repeat_period_seconds) {
+      return Status::OK();
+    }
+    // cancel the existing one before registering the new one
+    timer_->Cancel(it->second.name);
+    tasks_map_.erase(it);
+  }
+
+  timer_->Start();
+  // put the task type name as a prefix, for easy debugging
+  std::string unique_id =
+      kPeriodicTaskTypeNames.at(task_type) + std::to_string(id_++);
+
+  bool succeeded = timer_->Add(
+      fn, unique_id,
+      (initial_delay.fetch_add(1) % repeat_period_seconds) * kMicrosInSecond,
+      repeat_period_seconds * kMicrosInSecond);
+  if (!succeeded) {
+    return Status::Aborted("Failed to register periodic task");
+  }
+  auto result = tasks_map_.try_emplace(
+      task_type, TaskInfo{unique_id, repeat_period_seconds});
+  if (!result.second) {
+    return Status::Aborted("Failed to add periodic task");
+  }
+  return Status::OK();
+}
+
+Status PeriodicTaskScheduler::Unregister(PeriodicTaskType task_type) {
+  MutexLock l(&timer_mutex);
+  auto it = tasks_map_.find(task_type);
+  if (it != tasks_map_.end()) {
+    timer_->Cancel(it->second.name);
+    tasks_map_.erase(it);
+  }
+  if (!timer_->HasPendingTask()) {
+    timer_->Shutdown();
+  }
+  return Status::OK();
+}
+
+Timer* PeriodicTaskScheduler::Default() {
+  static Timer timer(SystemClock::Default().get());
+  return &timer;
+}
+
+#ifndef NDEBUG
+void PeriodicTaskScheduler::TEST_OverrideTimer(SystemClock* clock) {
+  static Timer test_timer(clock);
+  test_timer.TEST_OverrideTimer(clock);
+  MutexLock l(&timer_mutex);
+  timer_ = &test_timer;
+}
+#endif  // NDEBUG
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/periodic_task_scheduler.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/periodic_task_scheduler.h
new file mode 100644
index 0000000000..4d129a6797
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/periodic_task_scheduler.h
@@ -0,0 +1,108 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
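+//
+// Usage sketch (illustrative only; the identifiers in the body are
+// hypothetical, not part of this header): a DB instance registers its
+// stats-dump task roughly like
+//
+//   PeriodicTaskScheduler scheduler;
+//   scheduler.Register(PeriodicTaskType::kDumpStats,
+//                      [db] { db->DumpStats(); },
+//                      /*repeat_period_seconds=*/600);
+//   ...
+//   scheduler.Unregister(PeriodicTaskType::kDumpStats);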
+
+#pragma once
+
+#include "util/timer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+
+using PeriodicTaskFunc = std::function<void()>;
+
+constexpr uint64_t kInvalidPeriodSec = 0;
+
+// List of task types
+enum class PeriodicTaskType : uint8_t {
+  kDumpStats = 0,
+  kPersistStats,
+  kFlushInfoLog,
+  kRecordSeqnoTime,
+  kMax,
+};
+
+// PeriodicTaskScheduler contains the periodic tasks scheduled from the DB
+// instance. It's used to schedule/unschedule DumpStats(), PersistStats(),
+// FlushInfoLog(), etc. Each type of task can only have one instance;
+// re-registering the same task type only updates the repeat period.
+//
+// Internally, it uses a global, single-threaded timer object to run the
+// periodic task functions. The timer thread is always started, since the
+// info log flushing cannot be disabled.
+class PeriodicTaskScheduler {
+ public:
+  explicit PeriodicTaskScheduler() = default;
+
+  PeriodicTaskScheduler(const PeriodicTaskScheduler&) = delete;
+  PeriodicTaskScheduler(PeriodicTaskScheduler&&) = delete;
+  PeriodicTaskScheduler& operator=(const PeriodicTaskScheduler&) = delete;
+  PeriodicTaskScheduler& operator=(PeriodicTaskScheduler&&) = delete;
+
+  // Register a task with its default repeat period
+  Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn);
+
+  // Register a task with the specified repeat period. 0 is an invalid
+  // argument (kInvalidPeriodSec). To stop the task, use Unregister()
+  // specifically.
+  Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn,
+                  uint64_t repeat_period_seconds);
+
+  // Unregister the task
+  Status Unregister(PeriodicTaskType task_type);
+
+#ifndef NDEBUG
+  // Override the timer for the unittest
+  void TEST_OverrideTimer(SystemClock* clock);
+
+  // Call Timer TEST_WaitForRun(), which waits until the Timer starts waiting.
+  void TEST_WaitForRun(const std::function<void()>& callback) const {
+    if (timer_ != nullptr) {
+      timer_->TEST_WaitForRun(callback);
+    }
+  }
+
+  // Get the number of valid tasks in the global Timer
+  size_t TEST_GetValidTaskNum() const {
+    if (timer_ != nullptr) {
+      return timer_->TEST_GetPendingTaskNum();
+    }
+    return 0;
+  }
+
+  // Whether it has the specified task type registered
+  bool TEST_HasTask(PeriodicTaskType task_type) const {
+    auto it = tasks_map_.find(task_type);
+    return it != tasks_map_.end();
+  }
+#endif  // NDEBUG
+
+ private:
+  // default global Timer instance
+  static Timer* Default();
+
+  // Internal structure to store task information
+  struct TaskInfo {
+    TaskInfo(std::string _name, uint64_t _repeat_every_sec)
+        : name(std::move(_name)), repeat_every_sec(_repeat_every_sec) {}
+    std::string name;
+    uint64_t repeat_every_sec;
+  };
+
+  // Internal tasks map
+  std::map<PeriodicTaskType, TaskInfo> tasks_map_;
+
+  // Global timer pointer, which doesn't support adding/canceling tasks
+  // synchronously, hence the global `timer_mutex` for add/cancel.
+  Timer* timer_ = Default();
+
+  // Global task id, protected by the global `timer_mutex`
+  inline static uint64_t id_;
+
+  static constexpr uint64_t kMicrosInSecond = 1000U * 1000U;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/pinned_iterators_manager.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/pinned_iterators_manager.h
new file mode 100644
index 0000000000..0fcf231dad
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/pinned_iterators_manager.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PinnedIteratorsManager will be notified whenever we need to pin an Iterator
+// and it will be responsible for deleting pinned Iterators when they are
+// not needed anymore.
+class PinnedIteratorsManager : public Cleanable {
+ public:
+  PinnedIteratorsManager() : pinning_enabled(false) {}
+  ~PinnedIteratorsManager() {
+    if (pinning_enabled) {
+      ReleasePinnedData();
+    }
+  }
+
+  // Move constructor and move assignment are allowed.
+  PinnedIteratorsManager(PinnedIteratorsManager&& other) noexcept = default;
+  PinnedIteratorsManager& operator=(PinnedIteratorsManager&& other) noexcept =
+      default;
+
+  // Enable Iterators pinning
+  void StartPinning() {
+    assert(pinning_enabled == false);
+    pinning_enabled = true;
+  }
+
+  // Is pinning enabled?
+  bool PinningEnabled() { return pinning_enabled; }
+
+  // Take ownership of iter and delete it when ReleasePinnedData() is called
+  void PinIterator(InternalIterator* iter, bool arena = false) {
+    if (arena) {
+      PinPtr(iter, &PinnedIteratorsManager::ReleaseArenaInternalIterator);
+    } else {
+      PinPtr(iter, &PinnedIteratorsManager::ReleaseInternalIterator);
+    }
+  }
+
+  using ReleaseFunction = void (*)(void* arg1);
+  void PinPtr(void* ptr, ReleaseFunction release_func) {
+    assert(pinning_enabled);
+    if (ptr == nullptr) {
+      return;
+    }
+    pinned_ptrs_.emplace_back(ptr, release_func);
+  }
+
+  // Release pinned Iterators
+  inline void ReleasePinnedData() {
+    assert(pinning_enabled == true);
+    pinning_enabled = false;
+
+    // Remove duplicate pointers
+    std::sort(pinned_ptrs_.begin(), pinned_ptrs_.end());
+    auto unique_end = std::unique(pinned_ptrs_.begin(), pinned_ptrs_.end());
+
+    for (auto i = pinned_ptrs_.begin(); i != unique_end; ++i) {
+      void* ptr = i->first;
+      ReleaseFunction release_func = i->second;
+      release_func(ptr);
+    }
+    pinned_ptrs_.clear();
+    // Also do cleanups from the base Cleanable
+    Cleanable::Reset();
+  }
+
+ private:
+  static void ReleaseInternalIterator(void* ptr) {
+    delete reinterpret_cast<InternalIterator*>(ptr);
+  }
+
+  static void ReleaseArenaInternalIterator(void* ptr) {
+    reinterpret_cast<InternalIterator*>(ptr)->~InternalIterator();
+  }
+
+  bool pinning_enabled;
+  std::vector<std::pair<void*, ReleaseFunction>> pinned_ptrs_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/post_memtable_callback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/post_memtable_callback.h
new file mode 100644
index 0000000000..fbf2fbe869
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/post_memtable_callback.h
@@ -0,0 +1,25 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Callback invoked after finishing writing to the memtable but before
+// publishing the sequence number to readers.
+// Note that with write-prepared/write-unprepared transactions with
+// two-write-queues, PreReleaseCallback is called before publishing the
+// sequence numbers to readers.
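+//
+// An illustrative no-op implementation (hypothetical, not part of this
+// header) would be:
+//   class NoopPostMemTableCallback : public PostMemTableCallback {
+//     Status operator()(SequenceNumber /*seq*/, bool /*disable_memtable*/)
+//         override {
+//       return Status::OK();
+//     }
+//   };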
+class PostMemTableCallback {
+ public:
+  virtual ~PostMemTableCallback() {}
+
+  virtual Status operator()(SequenceNumber seq, bool disable_memtable) = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/pre_release_callback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/pre_release_callback.h
new file mode 100644
index 0000000000..6b90394877
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/pre_release_callback.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PreReleaseCallback {
+ public:
+  virtual ~PreReleaseCallback() {}
+
+  // Will be called while on the write thread after the write to the WAL and
+  // before the write to the memtable. This is useful if any operation needs
+  // to be done before the write gets visible to the readers, or if we want to
+  // reduce the overhead of locking by updating something sequentially while
+  // we are on the write thread. If the callback fails, this function returns
+  // a non-OK status, the sequence number will not be released, and the same
+  // status will be propagated to all the writers in the write group.
+  // seq is the sequence number that is used for this write and will be
+  // released.
+  // is_mem_disabled is currently used for debugging purposes to assert that
+  // the callback is done from the right write queue.
+  // If non-zero, log_number indicates the WAL log to which we wrote.
+  // index >= 0 specifies the order of callback in the same write thread.
+  // total > index specifies the total number of callbacks in the same write
+  // thread. Together with index, it could be used to reduce redundant
+  // operations among the callbacks.
+  virtual Status Callback(SequenceNumber seq, bool is_mem_disabled,
+                          uint64_t log_number, size_t index, size_t total) = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_del_aggregator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_del_aggregator.cc
new file mode 100644
index 0000000000..6e76f9c725
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_del_aggregator.cc
@@ -0,0 +1,555 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_del_aggregator.h"
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TruncatedRangeDelIterator::TruncatedRangeDelIterator(
+    std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+    const InternalKeyComparator* icmp, const InternalKey* smallest,
+    const InternalKey* largest)
+    : iter_(std::move(iter)),
+      icmp_(icmp),
+      smallest_ikey_(smallest),
+      largest_ikey_(largest) {
+  // Set up bounds such that range tombstones from this iterator are
+  // truncated to range [smallest_, largest_).
+  if (smallest != nullptr) {
+    pinned_bounds_.emplace_back();
+    auto& parsed_smallest = pinned_bounds_.back();
+    Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest,
+                                         false /* log_err_key */);  // TODO
+    pik_status.PermitUncheckedError();
+    parsed_smallest.type = kTypeMaxValid;
+    assert(pik_status.ok());
+    smallest_ = &parsed_smallest;
+  }
+  if (largest != nullptr) {
+    pinned_bounds_.emplace_back();
+    auto& parsed_largest = pinned_bounds_.back();
+
+    Status pik_status = ParseInternalKey(largest->Encode(), &parsed_largest,
+                                         false /* log_err_key */);  // TODO
+    pik_status.PermitUncheckedError();
+    assert(pik_status.ok());
+
+    if (parsed_largest.type == kTypeRangeDeletion &&
+        parsed_largest.sequence == kMaxSequenceNumber) {
+      // The file boundary has been artificially extended by a range tombstone.
+      // We do not need to adjust largest to properly truncate range
+      // tombstones that extend past the boundary.
+    } else if (parsed_largest.sequence == 0) {
+      // The largest key in the sstable has a sequence number of 0. Since we
+      // guarantee that no internal keys with the same user key and sequence
+      // number can exist in a DB, we know that the largest key in this
+      // sstable cannot exist as the smallest key in the next sstable. This
+      // further implies that no range tombstone in this sstable covers
+      // largest; otherwise, the file boundary would have been artificially
+      // extended.
+      //
+      // Therefore, we will never truncate a range tombstone at largest, so we
+      // can leave it unchanged.
+      // TODO: maybe use kMaxValid here to ensure range tombstone having
+      // distinct key from point keys.
+    } else {
+      // The same user key may straddle two sstable boundaries. To ensure that
+      // the truncated end key can cover the largest key in this sstable,
+      // reduce its sequence number by 1.
+      parsed_largest.sequence -= 1;
+      // This line is not needed for correctness, but it ensures that the
+      // truncated end key is not covering keys from the next SST file.
+      parsed_largest.type = kTypeMaxValid;
+    }
+    largest_ = &parsed_largest;
+  }
+}
+
+bool TruncatedRangeDelIterator::Valid() const {
+  assert(iter_ != nullptr);
+  return iter_->Valid() &&
+         (smallest_ == nullptr ||
+          icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) &&
+         (largest_ == nullptr ||
+          icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0);
+}
+
+// NOTE: target is a user key, with timestamp if enabled.
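+// For illustration: with file bounds [b, d), a fragment [a, z)@7 from the
+// underlying iterator is exposed as [b, d)@7, so Seek("c") positions on it
+// while Seek("e") invalidates the iterator (hypothetical keys).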
+void TruncatedRangeDelIterator::Seek(const Slice& target) {
+  if (largest_ != nullptr &&
+      icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber,
+                                                  kTypeRangeDeletion)) <= 0) {
+    iter_->Invalidate();
+    return;
+  }
+  if (smallest_ != nullptr &&
+      icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) {
+    iter_->Seek(smallest_->user_key);
+    return;
+  }
+  iter_->Seek(target);
+}
+
+void TruncatedRangeDelIterator::SeekInternalKey(const Slice& target) {
+  if (largest_ && icmp_->Compare(*largest_, target) <= 0) {
+    iter_->Invalidate();
+    return;
+  }
+  if (smallest_ && icmp_->Compare(target, *smallest_) < 0) {
+    // Since target < smallest, target < largest_.
+    // This seek must land on a range tombstone where end_key() > target,
+    // so there is no need to check again.
+    iter_->Seek(smallest_->user_key);
+  } else {
+    iter_->Seek(ExtractUserKey(target));
+    while (Valid() && icmp_->Compare(end_key(), target) <= 0) {
+      Next();
+    }
+  }
+}
+
+// NOTE: target is a user key, with timestamp if enabled.
+void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) {
+  if (smallest_ != nullptr &&
+      icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion),
+                     *smallest_) < 0) {
+    iter_->Invalidate();
+    return;
+  }
+  if (largest_ != nullptr &&
+      icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) {
+    iter_->SeekForPrev(largest_->user_key);
+    return;
+  }
+  iter_->SeekForPrev(target);
+}
+
+void TruncatedRangeDelIterator::SeekToFirst() {
+  if (smallest_ != nullptr) {
+    iter_->Seek(smallest_->user_key);
+    return;
+  }
+  iter_->SeekToTopFirst();
+}
+
+void TruncatedRangeDelIterator::SeekToLast() {
+  if (largest_ != nullptr) {
+    iter_->SeekForPrev(largest_->user_key);
+    return;
+  }
+  iter_->SeekToTopLast();
+}
+
+std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+TruncatedRangeDelIterator::SplitBySnapshot(
+    const std::vector<SequenceNumber>& snapshots) {
+  using FragmentedIterPair =
+      std::pair<const SequenceNumber,
+                std::unique_ptr<FragmentedRangeTombstoneIterator>>;
+
+  auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots);
+  std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+      split_truncated_iters;
+  std::for_each(
+      split_untruncated_iters.begin(), split_untruncated_iters.end(),
+      [&](FragmentedIterPair& iter_pair) {
+        auto truncated_iter = std::make_unique<TruncatedRangeDelIterator>(
+            std::move(iter_pair.second), icmp_, smallest_ikey_, largest_ikey_);
+        split_truncated_iters.emplace(iter_pair.first,
+                                      std::move(truncated_iter));
+      });
+  return split_truncated_iters;
+}
+
+ForwardRangeDelIterator::ForwardRangeDelIterator(
+    const InternalKeyComparator* icmp)
+    : icmp_(icmp),
+      unused_idx_(0),
+      active_seqnums_(SeqMaxComparator()),
+      active_iters_(EndKeyMinComparator(icmp)),
+      inactive_iters_(StartKeyMinComparator(icmp)) {}
+
+bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
+  // Move active iterators that end before parsed.
+  while (!active_iters_.empty() &&
+         icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) {
+    TruncatedRangeDelIterator* iter = PopActiveIter();
+    do {
+      iter->Next();
+    } while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0);
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  // Move inactive iterators that start before parsed.
+  while (!inactive_iters_.empty() &&
+         icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) {
+    TruncatedRangeDelIterator* iter = PopInactiveIter();
+    while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) {
+      iter->Next();
+    }
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  return active_seqnums_.empty()
+             ? false
+             : (*active_seqnums_.begin())->seq() > parsed.sequence;
+}
+
+void ForwardRangeDelIterator::Invalidate() {
+  unused_idx_ = 0;
+  active_iters_.clear();
+  active_seqnums_.clear();
+  inactive_iters_.clear();
+}
+
+ReverseRangeDelIterator::ReverseRangeDelIterator(
+    const InternalKeyComparator* icmp)
+    : icmp_(icmp),
+      unused_idx_(0),
+      active_seqnums_(SeqMaxComparator()),
+      active_iters_(StartKeyMaxComparator(icmp)),
+      inactive_iters_(EndKeyMaxComparator(icmp)) {}
+
+bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
+  // Move active iterators that start after parsed.
+  while (!active_iters_.empty() &&
+         icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) {
+    TruncatedRangeDelIterator* iter = PopActiveIter();
+    do {
+      iter->Prev();
+    } while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0);
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  // Move inactive iterators that end after parsed.
+  while (!inactive_iters_.empty() &&
+         icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) {
+    TruncatedRangeDelIterator* iter = PopInactiveIter();
+    while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) {
+      iter->Prev();
+    }
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  return active_seqnums_.empty()
+             ? false
+             : (*active_seqnums_.begin())->seq() > parsed.sequence;
+}
+
+void ReverseRangeDelIterator::Invalidate() {
+  unused_idx_ = 0;
+  active_iters_.clear();
+  active_seqnums_.clear();
+  inactive_iters_.clear();
+}
+
+bool RangeDelAggregator::StripeRep::ShouldDelete(
+    const ParsedInternalKey& parsed, RangeDelPositioningMode mode) {
+  if (!InStripe(parsed.sequence) || IsEmpty()) {
+    return false;
+  }
+  switch (mode) {
+    case RangeDelPositioningMode::kForwardTraversal:
+      InvalidateReverseIter();
+
+      // Pick up previously unseen iterators.
+      for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx());
+           it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) {
+        auto& iter = *it;
+        forward_iter_.AddNewIter(iter.get(), parsed);
+      }
+
+      return forward_iter_.ShouldDelete(parsed);
+    case RangeDelPositioningMode::kBackwardTraversal:
+      InvalidateForwardIter();
+
+      // Pick up previously unseen iterators.
+      for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx());
+           it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) {
+        auto& iter = *it;
+        reverse_iter_.AddNewIter(iter.get(), parsed);
+      }
+
+      return reverse_iter_.ShouldDelete(parsed);
+    default:
+      assert(false);
+      return false;
+  }
+}
+
+bool RangeDelAggregator::StripeRep::IsRangeOverlapped(const Slice& start,
+                                                      const Slice& end) {
+  Invalidate();
+
+  // Set the internal start/end keys so that:
+  // - if start_ikey has the same user key and sequence number as the
+  //   current end key, start_ikey will be considered greater; and
+  // - if end_ikey has the same user key and sequence number as the current
+  //   start key, end_ikey will be considered greater.
+  ParsedInternalKey start_ikey(start, kMaxSequenceNumber,
+                               static_cast<ValueType>(0));
+  ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
+  for (auto& iter : iters_) {
+    bool checked_candidate_tombstones = false;
+    for (iter->SeekForPrev(start);
+         iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0;
+         iter->Next()) {
+      checked_candidate_tombstones = true;
+      if (icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+          icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+        return true;
+      }
+    }
+
+    if (!checked_candidate_tombstones) {
+      // Do an additional check for when the end of the range is the begin
+      // key of a tombstone, which we missed earlier since SeekForPrev'ing
+      // to the start was invalid.
+      iter->SeekForPrev(end);
+      if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+          icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+void ReadRangeDelAggregator::AddTombstones(
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+    const InternalKey* smallest, const InternalKey* largest) {
+  if (input_iter == nullptr || input_iter->empty()) {
+    return;
+  }
+  rep_.AddTombstones(std::make_unique<TruncatedRangeDelIterator>(
+      std::move(input_iter), icmp_, smallest, largest));
+}
+
+bool ReadRangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed,
+                                              RangeDelPositioningMode mode) {
+  return rep_.ShouldDelete(parsed, mode);
+}
+
+bool ReadRangeDelAggregator::IsRangeOverlapped(const Slice& start,
+                                               const Slice& end) {
+  InvalidateRangeDelMapPositions();
+  return rep_.IsRangeOverlapped(start, end);
+}
+
+void CompactionRangeDelAggregator::AddTombstones(
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+    const InternalKey* smallest, const InternalKey* largest) {
+  if (input_iter == nullptr || input_iter->empty()) {
+    return;
+  }
+  // This bounds the output of CompactionRangeDelAggregator::NewIterator.
+  if (!trim_ts_.empty()) {
+    assert(icmp_->user_comparator()->timestamp_size() > 0);
+    input_iter->SetTimestampUpperBound(&trim_ts_);
+  }
+
+  assert(input_iter->lower_bound() == 0);
+  assert(input_iter->upper_bound() == kMaxSequenceNumber);
+  parent_iters_.emplace_back(new TruncatedRangeDelIterator(
+      std::move(input_iter), icmp_, smallest, largest));
+
+  Slice* ts_upper_bound = nullptr;
+  if (!ts_upper_bound_.empty()) {
+    assert(icmp_->user_comparator()->timestamp_size() > 0);
+    ts_upper_bound = &ts_upper_bound_;
+  }
+  auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_);
+  for (auto& split_iter : split_iters) {
+    auto it = reps_.find(split_iter.first);
+    if (it == reps_.end()) {
+      bool inserted;
+      SequenceNumber upper_bound = split_iter.second->upper_bound();
+      SequenceNumber lower_bound = split_iter.second->lower_bound();
+      std::tie(it, inserted) = reps_.emplace(
+          split_iter.first, StripeRep(icmp_, upper_bound, lower_bound));
+      assert(inserted);
+    }
+    assert(it != reps_.end());
+    // ts_upper_bound is used to bound ShouldDelete() to only consider range
+    // tombstones under full_history_ts_low_ and trim_ts_. Keys covered by
+    // range tombstones that are above full_history_ts_low_ should not be
+    // dropped prematurely: the user may read with a timestamp between the
+    // range tombstone and the covered key. Note that we cannot set the
+    // timestamp upper bound on the original `input_iter`, since the
+    // `input_iter`s are later used in
+    // CompactionRangeDelAggregator::NewIterator to output range tombstones
+    // for persistence. We do not want to only persist range tombstones with
+    // timestamp lower than ts_upper_bound.
+    split_iter.second->SetTimestampUpperBound(ts_upper_bound);
+    it->second.AddTombstones(std::move(split_iter.second));
+  }
+}
+
+bool CompactionRangeDelAggregator::ShouldDelete(
+    const ParsedInternalKey& parsed, RangeDelPositioningMode mode) {
+  auto it = reps_.lower_bound(parsed.sequence);
+  if (it == reps_.end()) {
+    return false;
+  }
+  return it->second.ShouldDelete(parsed, mode);
+}
+
+namespace {
+
+// Produce a sorted (by start internal key) stream of range tombstones from
+// `children`. lower_bound and upper_bound on the internal key can optionally
+// be specified. Range tombstones that end before lower_bound or start after
+// upper_bound are excluded.
+// If user-defined timestamp is enabled, lower_bound and upper_bound should
+// contain timestamp.
+class TruncatedRangeDelMergingIter : public InternalIterator {
+ public:
+  TruncatedRangeDelMergingIter(
+      const InternalKeyComparator* icmp, const Slice* lower_bound,
+      const Slice* upper_bound,
+      const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>& children)
+      : icmp_(icmp),
+        lower_bound_(lower_bound),
+        upper_bound_(upper_bound),
+        heap_(StartKeyMinComparator(icmp)),
+        ts_sz_(icmp_->user_comparator()->timestamp_size()) {
+    for (auto& child : children) {
+      if (child != nullptr) {
+        assert(child->lower_bound() == 0);
+        assert(child->upper_bound() == kMaxSequenceNumber);
+        children_.push_back(child.get());
+      }
+    }
+  }
+
+  bool Valid() const override {
+    return !heap_.empty() && !AfterEndKey(heap_.top());
+  }
+  Status status() const override { return Status::OK(); }
+
+  void SeekToFirst() override {
+    heap_.clear();
+    for (auto& child : children_) {
+      if (lower_bound_ != nullptr) {
+        child->Seek(ExtractUserKey(*lower_bound_));
+        // Since the above `Seek()` operates on a user key while
+        // `lower_bound_` is an internal key, we may need to advance `child`
+        // farther for it to be in bounds.
+        while (child->Valid() && BeforeStartKey(child)) {
+          child->InternalNext();
+        }
+      } else {
+        child->SeekToFirst();
+      }
+      if (child->Valid()) {
+        heap_.push(child);
+      }
+    }
+  }
+
+  void Next() override {
+    auto* top = heap_.top();
+    top->InternalNext();
+    if (top->Valid()) {
+      heap_.replace_top(top);
+    } else {
+      heap_.pop();
+    }
+  }
+
+  Slice key() const override {
+    auto* top = heap_.top();
+    if (ts_sz_) {
+      cur_start_key_.Set(top->start_key().user_key, top->seq(),
+                         kTypeRangeDeletion, top->timestamp());
+    } else {
+      cur_start_key_.Set(top->start_key().user_key, top->seq(),
+                         kTypeRangeDeletion);
+    }
+    assert(top->start_key().user_key.size() >= ts_sz_);
+    return cur_start_key_.Encode();
+  }
+
+  Slice value() const override {
+    auto* top = heap_.top();
+    if (!ts_sz_) {
+      return top->end_key().user_key;
+    }
+    assert(top->timestamp().size() == ts_sz_);
+    cur_end_key_.clear();
+    cur_end_key_.append(top->end_key().user_key.data(),
+                        top->end_key().user_key.size() - ts_sz_);
+    cur_end_key_.append(top->timestamp().data(), ts_sz_);
+    return cur_end_key_;
+  }
+
+  // Unused InternalIterator methods
+  void Prev() override { assert(false); }
+  void Seek(const Slice& /* target */) override { assert(false); }
+  void SeekForPrev(const Slice& /* target */) override { assert(false); }
+  void SeekToLast() override { assert(false); }
+
+ private:
+  bool BeforeStartKey(const TruncatedRangeDelIterator* iter) const {
+    if (lower_bound_ == nullptr) {
+      return false;
+    }
+    return icmp_->Compare(iter->end_key(), *lower_bound_) <= 0;
+  }
+
+  bool AfterEndKey(const TruncatedRangeDelIterator* iter) const {
+    if (upper_bound_ == nullptr) {
+      return false;
+    }
+    return icmp_->Compare(iter->start_key(), *upper_bound_) > 0;
+  }
+
+  const InternalKeyComparator* icmp_;
+  const Slice* lower_bound_;
+  const Slice* upper_bound_;
+  BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> heap_;
+  std::vector<TruncatedRangeDelIterator*> children_;
+
+  mutable InternalKey cur_start_key_;
+  mutable std::string cur_end_key_;
+  size_t ts_sz_;
+};
+
+}  // anonymous namespace
+
+std::unique_ptr<FragmentedRangeTombstoneIterator>
+CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound,
+                                          const Slice* upper_bound) {
+  InvalidateRangeDelMapPositions();
+  auto merging_iter = std::make_unique<TruncatedRangeDelMergingIter>(
+      icmp_, lower_bound, upper_bound, parent_iters_);
+
+  auto fragmented_tombstone_list =
+      std::make_shared<FragmentedRangeTombstoneList>(
+          std::move(merging_iter), *icmp_, true /* for_compaction */,
+          *snapshots_);
+
+  return std::make_unique<FragmentedRangeTombstoneIterator>(
+      fragmented_tombstone_list, *icmp_, kMaxSequenceNumber /* upper_bound */);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_del_aggregator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_del_aggregator.h
new file mode 100644
index 0000000000..dc1e730389
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_del_aggregator.h
@@ -0,0 +1,478 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TruncatedRangeDelIterator {
+ public:
+  TruncatedRangeDelIterator(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+      const InternalKeyComparator* icmp, const InternalKey* smallest,
+      const InternalKey* largest);
+
+  bool Valid() const;
+
+  void Next() { iter_->TopNext(); }
+  void Prev() { iter_->TopPrev(); }
+
+  void InternalNext() { iter_->Next(); }
+
+  // Seeks to the tombstone with the highest visible sequence number that
+  // covers target (a user key). If no such tombstone exists, the position
+  // will be at the earliest tombstone that ends after target.
+  // REQUIRES: target is a user key.
+  void Seek(const Slice& target);
+
+  // Seeks to the first range tombstone with end_key() > target.
+  void SeekInternalKey(const Slice& target);
+
+  // Seeks to the tombstone with the highest visible sequence number that
+  // covers target (a user key). If no such tombstone exists, the position
+  // will be at the latest tombstone that starts before target.
+  void SeekForPrev(const Slice& target);
+
+  void SeekToFirst();
+  void SeekToLast();
+
+  ParsedInternalKey start_key() const {
+    return (smallest_ == nullptr ||
+            icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0)
+               ? iter_->parsed_start_key()
+               : *smallest_;
+  }
+
+  ParsedInternalKey end_key() const {
+    return (largest_ == nullptr ||
+            icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0)
+               ? iter_->parsed_end_key()
+               : *largest_;
+  }
+
+  SequenceNumber seq() const { return iter_->seq(); }
+  Slice timestamp() const {
+    assert(icmp_->user_comparator()->timestamp_size());
+    return iter_->timestamp();
+  }
+  void SetTimestampUpperBound(const Slice* ts_upper_bound) {
+    iter_->SetTimestampUpperBound(ts_upper_bound);
+  }
+
+  std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+  SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+  SequenceNumber upper_bound() const { return iter_->upper_bound(); }
+
+  SequenceNumber lower_bound() const { return iter_->lower_bound(); }
+
+ private:
+  std::unique_ptr<FragmentedRangeTombstoneIterator> iter_;
+  const InternalKeyComparator* icmp_;
+  const ParsedInternalKey* smallest_ = nullptr;
+  const ParsedInternalKey* largest_ = nullptr;
+  std::list<ParsedInternalKey> pinned_bounds_;
+
+  const InternalKey* smallest_ikey_;
+  const InternalKey* largest_ikey_;
+};
+
+struct SeqMaxComparator {
+  bool operator()(const TruncatedRangeDelIterator* a,
+                  const TruncatedRangeDelIterator* b) const {
+    return a->seq() > b->seq();
+  }
+};
+
+struct StartKeyMinComparator {
+  explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+  bool operator()(const TruncatedRangeDelIterator* a,
+                  const TruncatedRangeDelIterator* b) const {
+    return icmp->Compare(a->start_key(), b->start_key()) > 0;
+  }
+
+  const InternalKeyComparator* icmp;
+};
+
+class ForwardRangeDelIterator {
+ public:
+  explicit ForwardRangeDelIterator(const InternalKeyComparator* icmp);
+
+  bool ShouldDelete(const ParsedInternalKey& parsed);
+  void Invalidate();
+
+  void AddNewIter(TruncatedRangeDelIterator* iter,
+                  const ParsedInternalKey& parsed) {
+    iter->Seek(parsed.user_key);
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  size_t UnusedIdx() const { return unused_idx_; }
+  void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+  using ActiveSeqSet =
+      std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+  struct EndKeyMinComparator {
+    explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+    bool operator()(const ActiveSeqSet::const_iterator& a,
+                    const ActiveSeqSet::const_iterator& b) const {
+      return icmp->Compare((*a)->end_key(), (*b)->end_key()) > 0;
+    }
+
+    const InternalKeyComparator* icmp;
+  };
+
+  void PushIter(TruncatedRangeDelIterator* iter,
+                const ParsedInternalKey& parsed) {
+    if (!iter->Valid()) {
+      // The iterator has been fully consumed, so we don't need to add it to
+      // either of the heaps.
+      return;
+    }
+    int cmp = icmp_->Compare(parsed, iter->start_key());
+    if (cmp < 0) {
+      PushInactiveIter(iter);
+    } else {
+      PushActiveIter(iter);
+    }
+  }
+
+  void PushActiveIter(TruncatedRangeDelIterator* iter) {
+    auto seq_pos = active_seqnums_.insert(iter);
+    active_iters_.push(seq_pos);
+  }
+
+  TruncatedRangeDelIterator* PopActiveIter() {
+    auto active_top = active_iters_.top();
+    auto iter = *active_top;
+    active_iters_.pop();
+    active_seqnums_.erase(active_top);
+    return iter;
+  }
+
+  void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+    inactive_iters_.push(iter);
+  }
+
+  TruncatedRangeDelIterator* PopInactiveIter() {
+    auto* iter = inactive_iters_.top();
+    inactive_iters_.pop();
+    return iter;
+  }
+
+  const InternalKeyComparator* icmp_;
+  size_t unused_idx_;
+  ActiveSeqSet active_seqnums_;
+  BinaryHeap<ActiveSeqSet::const_iterator, EndKeyMinComparator> active_iters_;
+  BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator>
+      inactive_iters_;
+};
+
+class ReverseRangeDelIterator {
+ public:
+  explicit ReverseRangeDelIterator(const InternalKeyComparator* icmp);
+
+  bool ShouldDelete(const ParsedInternalKey& parsed);
+  void Invalidate();
+
+  void AddNewIter(TruncatedRangeDelIterator* iter,
+                  const ParsedInternalKey& parsed) {
+    iter->SeekForPrev(parsed.user_key);
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  size_t UnusedIdx() const { return unused_idx_; }
+  void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+  using ActiveSeqSet =
+      std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+  struct EndKeyMaxComparator {
+    explicit EndKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+    bool operator()(const TruncatedRangeDelIterator* a,
+                    const TruncatedRangeDelIterator* b) const {
+      return icmp->Compare(a->end_key(), b->end_key()) < 0;
+    }
+
+    const InternalKeyComparator* icmp;
+  };
+  struct StartKeyMaxComparator {
+    explicit StartKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+    bool operator()(const ActiveSeqSet::const_iterator& a,
+                    const ActiveSeqSet::const_iterator& b) const {
+      return icmp->Compare((*a)->start_key(), (*b)->start_key()) < 0;
+    }
+
+    const InternalKeyComparator* icmp;
+  };
+
+  void PushIter(TruncatedRangeDelIterator* iter,
+                const ParsedInternalKey& parsed) {
+    if (!iter->Valid()) {
+      // The iterator has been fully consumed, so we don't need to add it to
+      // either of the heaps.
+    } else if (icmp_->Compare(iter->end_key(), parsed) <= 0) {
+      PushInactiveIter(iter);
+    } else {
+      PushActiveIter(iter);
+    }
+  }
+
+  void PushActiveIter(TruncatedRangeDelIterator* iter) {
+    auto seq_pos = active_seqnums_.insert(iter);
+    active_iters_.push(seq_pos);
+  }
+
+  TruncatedRangeDelIterator* PopActiveIter() {
+    auto active_top = active_iters_.top();
+    auto iter = *active_top;
+    active_iters_.pop();
+    active_seqnums_.erase(active_top);
+    return iter;
+  }
+
+  void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+    inactive_iters_.push(iter);
+  }
+
+  TruncatedRangeDelIterator* PopInactiveIter() {
+    auto* iter = inactive_iters_.top();
+    inactive_iters_.pop();
+    return iter;
+  }
+
+  const InternalKeyComparator* icmp_;
+  size_t unused_idx_;
+  ActiveSeqSet active_seqnums_;
+  BinaryHeap<ActiveSeqSet::const_iterator, StartKeyMaxComparator>
+      active_iters_;
+  BinaryHeap<TruncatedRangeDelIterator*, EndKeyMaxComparator> inactive_iters_;
+};
+
+enum class RangeDelPositioningMode { kForwardTraversal, kBackwardTraversal };
+
+class RangeDelAggregator {
+ public:
+  explicit RangeDelAggregator(const InternalKeyComparator* icmp)
+      : icmp_(icmp) {}
+  virtual ~RangeDelAggregator() {}
+
+  virtual void AddTombstones(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+      const InternalKey* smallest = nullptr,
+      const InternalKey* largest = nullptr) = 0;
+
+  bool ShouldDelete(const Slice& ikey, RangeDelPositioningMode mode) {
+    ParsedInternalKey parsed;
+
+    Status pik_status =
+        ParseInternalKey(ikey, &parsed, false /* log_err_key */);  // TODO
+    assert(pik_status.ok());
+    if (!pik_status.ok()) {
+      return false;
+    }
+
+    return ShouldDelete(parsed, mode);
+  }
+  virtual bool ShouldDelete(const ParsedInternalKey& parsed,
+                            RangeDelPositioningMode mode) = 0;
+
+  virtual void InvalidateRangeDelMapPositions() = 0;
+
+  virtual bool IsEmpty() const = 0;
+
+  bool AddFile(uint64_t file_number) {
+    return files_seen_.insert(file_number).second;
+  }
+
+ protected:
+  class StripeRep {
+   public:
+    StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound,
+              SequenceNumber lower_bound)
+        : icmp_(icmp),
+          forward_iter_(icmp),
+          reverse_iter_(icmp),
+          upper_bound_(upper_bound),
+          lower_bound_(lower_bound) {}
+
+    void AddTombstones(
+        std::unique_ptr<TruncatedRangeDelIterator> input_iter) {
+      iters_.push_back(std::move(input_iter));
+    }
+
+    bool IsEmpty() const { return iters_.empty(); }
+
+    bool ShouldDelete(const ParsedInternalKey& parsed,
+                      RangeDelPositioningMode mode);
+
+    void Invalidate() {
+      if (!IsEmpty()) {
+        InvalidateForwardIter();
+        InvalidateReverseIter();
+      }
+    }
+
+    // If user-defined timestamp is enabled, `start` and `end` are user keys
+    // with timestamp.
+    bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+   private:
+    bool InStripe(SequenceNumber seq) const {
+      return lower_bound_ <= seq && seq <= upper_bound_;
+    }
+
+    void InvalidateForwardIter() { forward_iter_.Invalidate(); }
+
+    void InvalidateReverseIter() { reverse_iter_.Invalidate(); }
+
+    const InternalKeyComparator* icmp_;
+    std::vector<std::unique_ptr<TruncatedRangeDelIterator>> iters_;
+    ForwardRangeDelIterator forward_iter_;
+    ReverseRangeDelIterator reverse_iter_;
+    SequenceNumber upper_bound_;
+    SequenceNumber lower_bound_;
+  };
+
+  const InternalKeyComparator* icmp_;
+
+ private:
+  std::set<uint64_t> files_seen_;
+};
+
+class ReadRangeDelAggregator final : public RangeDelAggregator {
+ public:
+  ReadRangeDelAggregator(const InternalKeyComparator* icmp,
+                         SequenceNumber upper_bound)
+      : RangeDelAggregator(icmp),
+        rep_(icmp, upper_bound, 0 /* lower_bound */) {}
+  ~ReadRangeDelAggregator() override {}
+
+  using RangeDelAggregator::ShouldDelete;
+  void AddTombstones(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+      const InternalKey* smallest = nullptr,
+      const InternalKey* largest = nullptr) override;
+
+  bool ShouldDelete(const ParsedInternalKey& parsed,
+                    RangeDelPositioningMode mode) final override {
+    if (rep_.IsEmpty()) {
+      return false;
+    }
+    return ShouldDeleteImpl(parsed, mode);
+  }
+
+  bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+  void InvalidateRangeDelMapPositions() override { rep_.Invalidate(); }
+
+  bool IsEmpty() const override { return rep_.IsEmpty(); }
+
+ private:
+  StripeRep rep_;
+
+  bool ShouldDeleteImpl(const ParsedInternalKey& parsed,
+                        RangeDelPositioningMode mode);
+};
+
+class CompactionRangeDelAggregator : public RangeDelAggregator {
+ public:
+  CompactionRangeDelAggregator(
+      const InternalKeyComparator* icmp,
+      const std::vector<SequenceNumber>& snapshots,
+      const std::string* full_history_ts_low = nullptr,
+      const std::string* trim_ts = nullptr)
+      : RangeDelAggregator(icmp), snapshots_(&snapshots) {
+    if (full_history_ts_low) {
+      ts_upper_bound_ = *full_history_ts_low;
+    }
+    if (trim_ts) {
+      trim_ts_ = *trim_ts;
+      // Range tombstones newer than `trim_ts` or `full_history_ts_low`
+      // should not be considered in ShouldDelete().
+      if (ts_upper_bound_.empty()) {
+        ts_upper_bound_ = trim_ts_;
+      } else if (!trim_ts_.empty() &&
+                 icmp->user_comparator()->CompareTimestamp(
+                     trim_ts_, ts_upper_bound_) < 0) {
+        ts_upper_bound_ = trim_ts_;
+      }
+    }
+  }
+  ~CompactionRangeDelAggregator() override {}
+
+  void AddTombstones(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+      const InternalKey* smallest = nullptr,
+      const InternalKey* largest = nullptr) override;
+
+  using RangeDelAggregator::ShouldDelete;
+  bool ShouldDelete(const ParsedInternalKey& parsed,
+                    RangeDelPositioningMode mode) override;
+
+  bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+  void InvalidateRangeDelMapPositions() override {
+    for (auto& rep : reps_) {
+      rep.second.Invalidate();
+    }
+  }
+
+  bool IsEmpty() const override {
+    for (const auto& rep : reps_) {
+      if (!rep.second.IsEmpty()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Creates an iterator over all the range tombstones in the aggregator, for
+  // use in compaction.
+  //
+  // NOTE: the internal key boundaries are used for optimization purposes to
+  // reduce the number of tombstones that are passed to the fragmenter; they
+  // do not guarantee that the resulting iterator only contains range
+  // tombstones that cover keys in the provided range. If required, these
+  // bounds must be enforced during iteration.
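+  //
+  // Illustrative caller sketch (hypothetical, not part of this class):
+  //
+  //   auto range_del_iter = range_del_agg.NewIterator(lower, upper);
+  //   for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
+  //        range_del_iter->Next()) {
+  //     // emit range_del_iter->key() / value() into the output file
+  //   }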
+  std::unique_ptr<FragmentedRangeTombstoneIterator> NewIterator(
+      const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr);
+
+ private:
+  std::vector<std::unique_ptr<TruncatedRangeDelIterator>> parent_iters_;
+  std::map<SequenceNumber, StripeRep> reps_;
+
+  const std::vector<SequenceNumber>* snapshots_;
+  // min over full_history_ts_low and trim_ts_
+  Slice ts_upper_bound_{};
+  Slice trim_ts_{};
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_tombstone_fragmenter.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_tombstone_fragmenter.cc
new file mode 100644
index 0000000000..7e7cedeca4
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_tombstone_fragmenter.cc
@@ -0,0 +1,502 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+#include <functional>
+#include <set>
+
+#include "util/autovector.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
+    std::unique_ptr<InternalIterator> unfragmented_tombstones,
+    const InternalKeyComparator& icmp, bool for_compaction,
+    const std::vector<SequenceNumber>& snapshots) {
+  if (unfragmented_tombstones == nullptr) {
+    return;
+  }
+  bool is_sorted = true;
+  InternalKey pinned_last_start_key;
+  Slice last_start_key;
+  num_unfragmented_tombstones_ = 0;
+  total_tombstone_payload_bytes_ = 0;
+  for (unfragmented_tombstones->SeekToFirst();
+       unfragmented_tombstones->Valid();
+       unfragmented_tombstones->Next(), num_unfragmented_tombstones_++) {
+    total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() +
+                                      unfragmented_tombstones->value().size();
+    if (num_unfragmented_tombstones_ > 0 &&
+        icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
+      is_sorted = false;
+      break;
+    }
+    if (unfragmented_tombstones->IsKeyPinned()) {
+      last_start_key = unfragmented_tombstones->key();
+    } else {
+      pinned_last_start_key.DecodeFrom(unfragmented_tombstones->key());
+      last_start_key = pinned_last_start_key.Encode();
+    }
+  }
+  if (is_sorted) {
+    FragmentTombstones(std::move(unfragmented_tombstones), icmp,
+                       for_compaction, snapshots);
+    return;
+  }
+
+  // Sort the tombstones before fragmenting them.
+  std::vector<std::string> keys, values;
+  keys.reserve(num_unfragmented_tombstones_);
+  values.reserve(num_unfragmented_tombstones_);
+  // Reset the counter to zero for the next iteration over keys.
+  total_tombstone_payload_bytes_ = 0;
+  for (unfragmented_tombstones->SeekToFirst();
+       unfragmented_tombstones->Valid(); unfragmented_tombstones->Next()) {
+    total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() +
+                                      unfragmented_tombstones->value().size();
+    keys.emplace_back(unfragmented_tombstones->key().data(),
+                      unfragmented_tombstones->key().size());
+    values.emplace_back(unfragmented_tombstones->value().data(),
+                        unfragmented_tombstones->value().size());
+  }
+  // VectorIterator implicitly sorts by key during construction.
+  auto iter = std::make_unique<VectorIterator>(std::move(keys),
+                                               std::move(values), &icmp);
+  FragmentTombstones(std::move(iter), icmp, for_compaction, snapshots);
+}
+
+void FragmentedRangeTombstoneList::FragmentTombstones(
+    std::unique_ptr<InternalIterator> unfragmented_tombstones,
+    const InternalKeyComparator& icmp, bool for_compaction,
+    const std::vector<SequenceNumber>& snapshots) {
+  Slice cur_start_key(nullptr, 0);
+  auto cmp = ParsedInternalKeyComparator(&icmp);
+
+  // Stores the end keys and sequence numbers of range tombstones with a start
+  // key less than or equal to cur_start_key. Provides an ordering by end key
+  // for use in flush_current_tombstones.
+  std::set<ParsedInternalKey, ParsedInternalKeyComparator> cur_end_keys(cmp);
+
+  size_t ts_sz = icmp.user_comparator()->timestamp_size();
+  // Given the next start key in unfragmented_tombstones,
+  // flush_current_tombstones writes every tombstone fragment that starts
+  // and ends with a key before next_start_key, and starts with a key greater
+  // than or equal to cur_start_key.
+  auto flush_current_tombstones = [&](const Slice& next_start_key) {
+    auto it = cur_end_keys.begin();
+    bool reached_next_start_key = false;
+    for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
+      Slice cur_end_key = it->user_key;
+      if (icmp.user_comparator()->CompareWithoutTimestamp(cur_start_key,
+                                                          cur_end_key) == 0) {
+        // Empty tombstone.
+        continue;
+      }
+      if (icmp.user_comparator()->CompareWithoutTimestamp(next_start_key,
+                                                          cur_end_key) <= 0) {
+        // All the end keys in [it, cur_end_keys.end()) are after
+        // next_start_key, so the tombstones they represent can be used in
+        // fragments that start with keys greater than or equal to
+        // next_start_key. However, the end keys we already passed will not be
+        // used in any more tombstone fragments.
+        //
+        // Remove the fully fragmented tombstones and stop iteration after a
+        // final round of flushing to preserve the tombstones we can create
+        // more fragments from.
+        reached_next_start_key = true;
+        cur_end_keys.erase(cur_end_keys.begin(), it);
+        cur_end_key = next_start_key;
+      }
+
+      // Flush a range tombstone fragment [cur_start_key, cur_end_key), which
+      // should not overlap with the last-flushed tombstone fragment.
+      assert(tombstones_.empty() ||
+             icmp.user_comparator()->CompareWithoutTimestamp(
+                 tombstones_.back().end_key, cur_start_key) <= 0);
+
+      // Sort the sequence numbers of the tombstones being fragmented in
+      // descending order, and then flush them in that order.
+      autovector<SequenceNumber> seqnums_to_flush;
+      autovector<Slice> timestamps_to_flush;
+      for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
+        seqnums_to_flush.push_back(flush_it->sequence);
+        if (ts_sz) {
+          timestamps_to_flush.push_back(
+              ExtractTimestampFromUserKey(flush_it->user_key, ts_sz));
+        }
+      }
+      // TODO: bind the two sorting together to be more efficient
+      std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
+                std::greater<SequenceNumber>());
+      if (ts_sz) {
+        std::sort(timestamps_to_flush.begin(), timestamps_to_flush.end(),
+                  [icmp](const Slice& ts1, const Slice& ts2) {
+                    return icmp.user_comparator()->CompareTimestamp(ts1, ts2) >
+                           0;
+                  });
+      }
+
+      size_t start_idx = tombstone_seqs_.size();
+      size_t end_idx = start_idx + seqnums_to_flush.size();
+
+      // If user-defined timestamp is enabled, we should not drop tombstones
+      // from any snapshot stripe. Garbage collection of range tombstones
+      // happens in CompactionOutputs::AddRangeDels().
+      if (for_compaction && ts_sz == 0) {
+        // Drop all tombstone seqnums that are not preserved by a snapshot.
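+        // For illustration: with snapshots {5, 10} and seqnums
+        // {12, 8, 7, 3} (descending), 12 is kept for the stripe above 10,
+        // 8 for (5, 10], and 3 for <= 5; 7 is dropped since 8 already
+        // covers its stripe (hypothetical numbers).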
+ SequenceNumber next_snapshot = kMaxSequenceNumber; + for (auto seq : seqnums_to_flush) { + if (seq <= next_snapshot) { + // This seqnum is visible by a lower snapshot. + tombstone_seqs_.push_back(seq); + auto upper_bound_it = + std::lower_bound(snapshots.begin(), snapshots.end(), seq); + if (upper_bound_it == snapshots.begin()) { + // This seqnum is the topmost one visible by the earliest + // snapshot. None of the seqnums below it will be visible, so we + // can skip them. + break; + } + next_snapshot = *std::prev(upper_bound_it); + } + } + end_idx = tombstone_seqs_.size(); + } else { + // The fragmentation is being done for reads, so preserve all seqnums. + tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(), + seqnums_to_flush.end()); + if (ts_sz) { + tombstone_timestamps_.insert(tombstone_timestamps_.end(), + timestamps_to_flush.begin(), + timestamps_to_flush.end()); + } + } + + assert(start_idx < end_idx); + if (ts_sz) { + std::string start_key_with_max_ts; + AppendUserKeyWithMaxTimestamp(&start_key_with_max_ts, cur_start_key, + ts_sz); + pinned_slices_.emplace_back(std::move(start_key_with_max_ts)); + Slice start_key = pinned_slices_.back(); + + std::string end_key_with_max_ts; + AppendUserKeyWithMaxTimestamp(&end_key_with_max_ts, cur_end_key, ts_sz); + pinned_slices_.emplace_back(std::move(end_key_with_max_ts)); + Slice end_key = pinned_slices_.back(); + + // RangeTombstoneStack expects start_key and end_key to have max + // timestamp. + tombstones_.emplace_back(start_key, end_key, start_idx, end_idx); + } else { + tombstones_.emplace_back(cur_start_key, cur_end_key, start_idx, + end_idx); + } + + cur_start_key = cur_end_key; + } + if (!reached_next_start_key) { + // There is a gap between the last flushed tombstone fragment and + // the next tombstone's start key. Remove all the end keys in + // the working set, since we have fully fragmented their corresponding + // tombstones. + cur_end_keys.clear(); + } + cur_start_key = next_start_key; + }; + + pinned_iters_mgr_.StartPinning(); + + bool no_tombstones = true; + for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid(); + unfragmented_tombstones->Next()) { + const Slice& ikey = unfragmented_tombstones->key(); + Slice tombstone_start_key = ExtractUserKey(ikey); + SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey); + if (!unfragmented_tombstones->IsKeyPinned()) { + pinned_slices_.emplace_back(tombstone_start_key.data(), + tombstone_start_key.size()); + tombstone_start_key = pinned_slices_.back(); + } + no_tombstones = false; + + Slice tombstone_end_key = unfragmented_tombstones->value(); + if (!unfragmented_tombstones->IsValuePinned()) { + pinned_slices_.emplace_back(tombstone_end_key.data(), + tombstone_end_key.size()); + tombstone_end_key = pinned_slices_.back(); + } + if (!cur_end_keys.empty() && + icmp.user_comparator()->CompareWithoutTimestamp( + cur_start_key, tombstone_start_key) != 0) { + // The start key has changed. Flush all tombstones that start before + // this new start key. 
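+      // Illustration of the sweep (not from the source): tombstones
+      // [a, e)@4 and [c, g)@2 fragment into [a, c)@{4}, [c, e)@{4, 2} and
+      // [e, g)@{2}. Reaching start key c here triggers a flush of everything
+      // that starts before it, i.e. the [a, c)@{4} fragment.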
+ flush_current_tombstones(tombstone_start_key); + } + cur_start_key = tombstone_start_key; + + cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion); + } + if (!cur_end_keys.empty()) { + ParsedInternalKey last_end_key = *std::prev(cur_end_keys.end()); + flush_current_tombstones(last_end_key.user_key); + } + + if (!no_tombstones) { + pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(), + false /* arena */); + } +} + +bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower, + SequenceNumber upper) { + std::call_once(seq_set_init_once_flag_, [this]() { + for (auto s : tombstone_seqs_) { + seq_set_.insert(s); + } + }); + auto seq_it = seq_set_.lower_bound(lower); + return seq_it != seq_set_.end() && *seq_it <= upper; +} + +FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator( + FragmentedRangeTombstoneList* tombstones, const InternalKeyComparator& icmp, + SequenceNumber _upper_bound, const Slice* ts_upper_bound, + SequenceNumber _lower_bound) + : tombstone_start_cmp_(icmp.user_comparator()), + tombstone_end_cmp_(icmp.user_comparator()), + icmp_(&icmp), + ucmp_(icmp.user_comparator()), + tombstones_(tombstones), + upper_bound_(_upper_bound), + lower_bound_(_lower_bound), + ts_upper_bound_(ts_upper_bound) { + assert(tombstones_ != nullptr); + Invalidate(); +} + +FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator( + const std::shared_ptr& tombstones, + const InternalKeyComparator& icmp, SequenceNumber _upper_bound, + const Slice* ts_upper_bound, SequenceNumber _lower_bound) + : tombstone_start_cmp_(icmp.user_comparator()), + tombstone_end_cmp_(icmp.user_comparator()), + icmp_(&icmp), + ucmp_(icmp.user_comparator()), + tombstones_ref_(tombstones), + tombstones_(tombstones_ref_.get()), + upper_bound_(_upper_bound), + lower_bound_(_lower_bound), + ts_upper_bound_(ts_upper_bound) { + assert(tombstones_ != nullptr); + Invalidate(); +} + +FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator( + const std::shared_ptr& tombstones_cache, + const InternalKeyComparator& icmp, SequenceNumber _upper_bound, + const Slice* ts_upper_bound, SequenceNumber _lower_bound) + : tombstone_start_cmp_(icmp.user_comparator()), + tombstone_end_cmp_(icmp.user_comparator()), + icmp_(&icmp), + ucmp_(icmp.user_comparator()), + tombstones_cache_ref_(tombstones_cache), + tombstones_(tombstones_cache_ref_->tombstones.get()), + upper_bound_(_upper_bound), + lower_bound_(_lower_bound) { + assert(tombstones_ != nullptr); + if (!ts_upper_bound || ts_upper_bound->empty()) { + ts_upper_bound_ = nullptr; + } else { + ts_upper_bound_ = ts_upper_bound; + } + Invalidate(); +} + +void FragmentedRangeTombstoneIterator::SeekToFirst() { + pos_ = tombstones_->begin(); + seq_pos_ = tombstones_->seq_begin(); +} + +void FragmentedRangeTombstoneIterator::SeekToTopFirst() { + if (tombstones_->empty()) { + Invalidate(); + return; + } + pos_ = tombstones_->begin(); + SetMaxVisibleSeqAndTimestamp(); + ScanForwardToVisibleTombstone(); +} + +void FragmentedRangeTombstoneIterator::SeekToLast() { + pos_ = std::prev(tombstones_->end()); + seq_pos_ = std::prev(tombstones_->seq_end()); +} + +void FragmentedRangeTombstoneIterator::SeekToTopLast() { + if (tombstones_->empty()) { + Invalidate(); + return; + } + pos_ = std::prev(tombstones_->end()); + SetMaxVisibleSeqAndTimestamp(); + ScanBackwardToVisibleTombstone(); +} + +// @param `target` is a user key, with timestamp if user-defined timestamp is +// enabled. 
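+// For example (illustrative): with fragments [a, c) and [e, g), Seek("b")
+// positions on [a, c), which covers "b", while Seek("d") positions on
+// [e, g), the earliest fragment that ends after "d".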
+void FragmentedRangeTombstoneIterator::Seek(const Slice& target) { + if (tombstones_->empty()) { + Invalidate(); + return; + } + SeekToCoveringTombstone(target); + ScanForwardToVisibleTombstone(); +} + +void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) { + if (tombstones_->empty()) { + Invalidate(); + return; + } + SeekForPrevToCoveringTombstone(target); + ScanBackwardToVisibleTombstone(); +} + +void FragmentedRangeTombstoneIterator::SeekToCoveringTombstone( + const Slice& target) { + pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target, + tombstone_end_cmp_); + if (pos_ == tombstones_->end()) { + // All tombstones end before target. + seq_pos_ = tombstones_->seq_end(); + return; + } + SetMaxVisibleSeqAndTimestamp(); +} + +void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone( + const Slice& target) { + if (tombstones_->empty()) { + Invalidate(); + return; + } + pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target, + tombstone_start_cmp_); + if (pos_ == tombstones_->begin()) { + // All tombstones start after target. + Invalidate(); + return; + } + --pos_; + SetMaxVisibleSeqAndTimestamp(); +} + +void FragmentedRangeTombstoneIterator::ScanForwardToVisibleTombstone() { + while (pos_ != tombstones_->end() && + (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) || + *seq_pos_ < lower_bound_)) { + ++pos_; + if (pos_ == tombstones_->end()) { + Invalidate(); + return; + } + SetMaxVisibleSeqAndTimestamp(); + } +} + +void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() { + while (pos_ != tombstones_->end() && + (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) || + *seq_pos_ < lower_bound_)) { + if (pos_ == tombstones_->begin()) { + Invalidate(); + return; + } + --pos_; + SetMaxVisibleSeqAndTimestamp(); + } +} + +void FragmentedRangeTombstoneIterator::Next() { + ++seq_pos_; + if (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) { + ++pos_; + } +} + +void FragmentedRangeTombstoneIterator::TopNext() { + ++pos_; + if (pos_ == tombstones_->end()) { + return; + } + SetMaxVisibleSeqAndTimestamp(); + ScanForwardToVisibleTombstone(); +} + +void FragmentedRangeTombstoneIterator::Prev() { + if (seq_pos_ == tombstones_->seq_begin()) { + Invalidate(); + return; + } + --seq_pos_; + if (pos_ == tombstones_->end() || + seq_pos_ == tombstones_->seq_iter(pos_->seq_start_idx - 1)) { + --pos_; + } +} + +void FragmentedRangeTombstoneIterator::TopPrev() { + if (pos_ == tombstones_->begin()) { + Invalidate(); + return; + } + --pos_; + SetMaxVisibleSeqAndTimestamp(); + ScanBackwardToVisibleTombstone(); +} + +bool FragmentedRangeTombstoneIterator::Valid() const { + return tombstones_ != nullptr && pos_ != tombstones_->end(); +} + +SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum( + const Slice& target_user_key) { + SeekToCoveringTombstone(target_user_key); + return ValidPos() && ucmp_->CompareWithoutTimestamp(start_key(), + target_user_key) <= 0 + ? 
seq() + : 0; +} + +std::map> +FragmentedRangeTombstoneIterator::SplitBySnapshot( + const std::vector& snapshots) { + std::map> + splits; + SequenceNumber lower = 0; + SequenceNumber upper; + for (size_t i = 0; i <= snapshots.size(); i++) { + if (i >= snapshots.size()) { + upper = kMaxSequenceNumber; + } else { + upper = snapshots[i]; + } + if (tombstones_->ContainsRange(lower, upper)) { + splits.emplace(upper, + std::make_unique( + tombstones_, *icmp_, upper, ts_upper_bound_, lower)); + } + lower = upper + 1; + } + return splits; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_tombstone_fragmenter.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_tombstone_fragmenter.h new file mode 100644 index 0000000000..8c7d982972 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/range_tombstone_fragmenter.h @@ -0,0 +1,356 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "rocksdb/status.h" +#include "table/internal_iterator.h" + +namespace ROCKSDB_NAMESPACE { +struct FragmentedRangeTombstoneList; + +struct FragmentedRangeTombstoneListCache { + // ensure only the first reader needs to initialize l + std::mutex reader_mutex; + std::unique_ptr tombstones = nullptr; + // readers will first check this bool to avoid + std::atomic initialized = false; +}; + +struct FragmentedRangeTombstoneList { + public: + // A compact representation of a "stack" of range tombstone fragments, which + // start and end at the same user keys but have different sequence numbers. + // The members seq_start_idx and seq_end_idx are intended to be parameters to + // seq_iter(). + // If user-defined timestamp is enabled, `start` and `end` should be user keys + // with timestamp, and the timestamps are set to max timestamp to be returned + // by parsed_start_key()/parsed_end_key(). seq_start_idx and seq_end_idx will + // also be used as parameters to ts_iter(). + struct RangeTombstoneStack { + RangeTombstoneStack(const Slice& start, const Slice& end, size_t start_idx, + size_t end_idx) + : start_key(start), + end_key(end), + seq_start_idx(start_idx), + seq_end_idx(end_idx) {} + Slice start_key; + Slice end_key; + size_t seq_start_idx; + size_t seq_end_idx; + }; + // Assumes unfragmented_tombstones->key() and unfragmented_tombstones->value() + // both contain timestamp if enabled. 
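+  // A typical construction (a sketch; `raw_range_dels`, `icmp` and
+  // `snapshot_seq` are assumed from context, not defined in this file):
+  //
+  //   // `raw_range_dels` is an unfragmented InternalIterator over a
+  //   // range-del meta block:
+  //   auto list = std::make_shared<FragmentedRangeTombstoneList>(
+  //       std::move(raw_range_dels), icmp);
+  //   FragmentedRangeTombstoneIterator it(list, icmp,
+  //                                       snapshot_seq /* upper_bound */);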
+  FragmentedRangeTombstoneList(
+      std::unique_ptr<InternalIterator> unfragmented_tombstones,
+      const InternalKeyComparator& icmp, bool for_compaction = false,
+      const std::vector<SequenceNumber>& snapshots = {});
+
+  std::vector<RangeTombstoneStack>::const_iterator begin() const {
+    return tombstones_.begin();
+  }
+
+  std::vector<RangeTombstoneStack>::const_iterator end() const {
+    return tombstones_.end();
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_iter(size_t idx) const {
+    return std::next(tombstone_seqs_.begin(), idx);
+  }
+
+  std::vector<Slice>::const_iterator ts_iter(size_t idx) const {
+    return std::next(tombstone_timestamps_.begin(), idx);
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_begin() const {
+    return tombstone_seqs_.begin();
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_end() const {
+    return tombstone_seqs_.end();
+  }
+
+  bool empty() const { return tombstones_.empty(); }
+
+  // Returns true if the stored tombstones contain one with a sequence
+  // number in [lower, upper].
+  // This method is not const as it lazily initializes a set of
+  // sequence numbers (`seq_set_`).
+  bool ContainsRange(SequenceNumber lower, SequenceNumber upper);
+
+  uint64_t num_unfragmented_tombstones() const {
+    return num_unfragmented_tombstones_;
+  }
+
+  uint64_t total_tombstone_payload_bytes() const {
+    return total_tombstone_payload_bytes_;
+  }
+
+ private:
+  // Given an ordered range tombstone iterator unfragmented_tombstones,
+  // "fragment" the tombstones into non-overlapping pieces. Each
+  // "non-overlapping piece" is a RangeTombstoneStack in tombstones_, which
+  // contains start_key, end_key, and indices that point to sequence numbers
+  // (in tombstone_seqs_) and timestamps (in tombstone_timestamps_). If
+  // for_compaction is true, then `snapshots` should be provided. Range
+  // tombstone fragments are dropped if they are not visible in any snapshot
+  // and user-defined timestamp is not enabled. That is, for each snapshot
+  // stripe [lower, upper], only the range tombstone fragment with the
+  // largest seqno in [lower, upper] is preserved; all the other range
+  // tombstones are dropped.
+  void FragmentTombstones(
+      std::unique_ptr<InternalIterator> unfragmented_tombstones,
+      const InternalKeyComparator& icmp, bool for_compaction,
+      const std::vector<SequenceNumber>& snapshots);
+
+  std::vector<RangeTombstoneStack> tombstones_;
+  std::vector<SequenceNumber> tombstone_seqs_;
+  std::vector<Slice> tombstone_timestamps_;
+  std::once_flag seq_set_init_once_flag_;
+  std::set<SequenceNumber> seq_set_;
+  std::list<std::string> pinned_slices_;
+  PinnedIteratorsManager pinned_iters_mgr_;
+  uint64_t num_unfragmented_tombstones_;
+  uint64_t total_tombstone_payload_bytes_;
+};
+
+// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
+// meta block into an iterator over non-overlapping tombstone fragments. The
+// tombstone fragmentation process should be more efficient than the range
+// tombstone collapsing algorithm in RangeDelAggregator because this leverages
+// the internal key ordering already provided by the input iterator, if
+// applicable (when the iterator is unsorted, a new sorted iterator is created
+// before proceeding). If there are few overlaps, creating a
+// FragmentedRangeTombstoneIterator should be O(n), while the
+// RangeDelAggregator tombstone collapsing is always O(n log n).
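+//
+// A sketch of draining the fragments at the "top" visible level (names taken
+// from the API below; illustrative only):
+//
+//   for (it.SeekToTopFirst(); it.Valid(); it.TopNext()) {
+//     RangeTombstone t = it.Tombstone();  // [start_key, end_key) @ seq
+//     // ... consume t ...
+//   }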
+class FragmentedRangeTombstoneIterator : public InternalIterator { + public: + FragmentedRangeTombstoneIterator(FragmentedRangeTombstoneList* tombstones, + const InternalKeyComparator& icmp, + SequenceNumber upper_bound, + const Slice* ts_upper_bound = nullptr, + SequenceNumber lower_bound = 0); + FragmentedRangeTombstoneIterator( + const std::shared_ptr& tombstones, + const InternalKeyComparator& icmp, SequenceNumber upper_bound, + const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0); + FragmentedRangeTombstoneIterator( + const std::shared_ptr& tombstones, + const InternalKeyComparator& icmp, SequenceNumber upper_bound, + const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0); + + void SeekToFirst() override; + void SeekToLast() override; + + void SeekToTopFirst(); + void SeekToTopLast(); + + // NOTE: Seek and SeekForPrev do not behave in the way InternalIterator + // seeking should behave. This is OK because they are not currently used, but + // eventually FragmentedRangeTombstoneIterator should no longer implement + // InternalIterator. + // + // Seeks to the range tombstone that covers target at a seqnum in the + // snapshot. If no such tombstone exists, seek to the earliest tombstone in + // the snapshot that ends after target. + void Seek(const Slice& target) override; + // Seeks to the range tombstone that covers target at a seqnum in the + // snapshot. If no such tombstone exists, seek to the latest tombstone in the + // snapshot that starts before target. + void SeekForPrev(const Slice& target) override; + + void Next() override; + void Prev() override; + + void TopNext(); + void TopPrev(); + + bool Valid() const override; + // Note that key() and value() do not return correct timestamp. + // Caller should call timestamp() to get the current timestamp. + Slice key() const override { + MaybePinKey(); + return current_start_key_.Encode(); + } + Slice value() const override { return pos_->end_key; } + bool IsKeyPinned() const override { return false; } + bool IsValuePinned() const override { return true; } + Status status() const override { return Status::OK(); } + + bool empty() const { return tombstones_->empty(); } + void Invalidate() { + pos_ = tombstones_->end(); + seq_pos_ = tombstones_->seq_end(); + pinned_pos_ = tombstones_->end(); + pinned_seq_pos_ = tombstones_->seq_end(); + } + + RangeTombstone Tombstone() const { + assert(Valid()); + if (icmp_->user_comparator()->timestamp_size()) { + return RangeTombstone(start_key(), end_key(), seq(), timestamp()); + } + return RangeTombstone(start_key(), end_key(), seq()); + } + // Note that start_key() and end_key() are not guaranteed to have the + // correct timestamp. User can call timestamp() to get the correct + // timestamp(). + Slice start_key() const { return pos_->start_key; } + Slice end_key() const { return pos_->end_key; } + SequenceNumber seq() const { return *seq_pos_; } + Slice timestamp() const { + // seqno and timestamp are stored in the same order. + return *tombstones_->ts_iter(seq_pos_ - tombstones_->seq_begin()); + } + // Current use case is by CompactionRangeDelAggregator to set + // full_history_ts_low_. 
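+  // e.g. (sketch): iter->SetTimestampUpperBound(&full_history_ts_low);
+  // after which SetMaxVisibleSeqAndTimestamp() only surfaces tombstones
+  // whose timestamp is <= full_history_ts_low.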
+ void SetTimestampUpperBound(const Slice* ts_upper_bound) { + ts_upper_bound_ = ts_upper_bound; + } + + ParsedInternalKey parsed_start_key() const { + return ParsedInternalKey(pos_->start_key, seq(), kTypeRangeDeletion); + } + ParsedInternalKey parsed_end_key() const { + return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber, + kTypeRangeDeletion); + } + + // Return the max sequence number of a range tombstone that covers + // the given user key. + // If there is no covering tombstone, then 0 is returned. + SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key); + + // Splits the iterator into n+1 iterators (where n is the number of + // snapshots), each providing a view over a "stripe" of sequence numbers. The + // iterators are keyed by the upper bound of their ranges (the provided + // snapshots + kMaxSequenceNumber). + // + // NOTE: the iterators in the returned map are no longer valid if their + // parent iterator is deleted, since they do not modify the refcount of the + // underlying tombstone list. Therefore, this map should be deleted before + // the parent iterator. + std::map> + SplitBySnapshot(const std::vector& snapshots); + + SequenceNumber upper_bound() const { return upper_bound_; } + SequenceNumber lower_bound() const { return lower_bound_; } + + uint64_t num_unfragmented_tombstones() const { + return tombstones_->num_unfragmented_tombstones(); + } + uint64_t total_tombstone_payload_bytes() const { + return tombstones_->total_tombstone_payload_bytes(); + } + + private: + using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack; + + struct RangeTombstoneStackStartComparator { + explicit RangeTombstoneStackStartComparator(const Comparator* c) : cmp(c) {} + + bool operator()(const RangeTombstoneStack& a, + const RangeTombstoneStack& b) const { + return cmp->CompareWithoutTimestamp(a.start_key, b.start_key) < 0; + } + + bool operator()(const RangeTombstoneStack& a, const Slice& b) const { + return cmp->CompareWithoutTimestamp(a.start_key, b) < 0; + } + + bool operator()(const Slice& a, const RangeTombstoneStack& b) const { + return cmp->CompareWithoutTimestamp(a, b.start_key) < 0; + } + + const Comparator* cmp; + }; + + struct RangeTombstoneStackEndComparator { + explicit RangeTombstoneStackEndComparator(const Comparator* c) : cmp(c) {} + + bool operator()(const RangeTombstoneStack& a, + const RangeTombstoneStack& b) const { + return cmp->CompareWithoutTimestamp(a.end_key, b.end_key) < 0; + } + + bool operator()(const RangeTombstoneStack& a, const Slice& b) const { + return cmp->CompareWithoutTimestamp(a.end_key, b) < 0; + } + + bool operator()(const Slice& a, const RangeTombstoneStack& b) const { + return cmp->CompareWithoutTimestamp(a, b.end_key) < 0; + } + + const Comparator* cmp; + }; + + void MaybePinKey() const { + if (pos_ != tombstones_->end() && seq_pos_ != tombstones_->seq_end() && + (pinned_pos_ != pos_ || pinned_seq_pos_ != seq_pos_)) { + current_start_key_.Set(pos_->start_key, *seq_pos_, kTypeRangeDeletion); + pinned_pos_ = pos_; + pinned_seq_pos_ = seq_pos_; + } + } + + void SeekToCoveringTombstone(const Slice& key); + void SeekForPrevToCoveringTombstone(const Slice& key); + void ScanForwardToVisibleTombstone(); + void ScanBackwardToVisibleTombstone(); + bool ValidPos() const { + return Valid() && seq_pos_ != tombstones_->seq_iter(pos_->seq_end_idx); + } + + const RangeTombstoneStackStartComparator tombstone_start_cmp_; + const RangeTombstoneStackEndComparator tombstone_end_cmp_; + const InternalKeyComparator* icmp_; + const 
Comparator* ucmp_;
+  std::shared_ptr<FragmentedRangeTombstoneList> tombstones_ref_;
+  std::shared_ptr<FragmentedRangeTombstoneListCache> tombstones_cache_ref_;
+  FragmentedRangeTombstoneList* tombstones_;
+  SequenceNumber upper_bound_;
+  SequenceNumber lower_bound_;
+  // Only consider timestamps <= ts_upper_bound_.
+  const Slice* ts_upper_bound_;
+  std::vector<RangeTombstoneStack>::const_iterator pos_;
+  std::vector<SequenceNumber>::const_iterator seq_pos_;
+  mutable std::vector<RangeTombstoneStack>::const_iterator pinned_pos_;
+  mutable std::vector<SequenceNumber>::const_iterator pinned_seq_pos_;
+  mutable InternalKey current_start_key_;
+
+  // Check the current RangeTombstoneStack `pos_` against timestamp
+  // upper bound `ts_upper_bound_` and sequence number upper bound
+  // `upper_bound_`. Update the sequence number (and timestamp) pointer
+  // `seq_pos_` to the first valid position satisfying both bounds.
+  void SetMaxVisibleSeqAndTimestamp() {
+    seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                                tombstones_->seq_iter(pos_->seq_end_idx),
+                                upper_bound_, std::greater<SequenceNumber>());
+    if (ts_upper_bound_ && !ts_upper_bound_->empty()) {
+      auto ts_pos = std::lower_bound(
+          tombstones_->ts_iter(pos_->seq_start_idx),
+          tombstones_->ts_iter(pos_->seq_end_idx), *ts_upper_bound_,
+          [this](const Slice& s1, const Slice& s2) {
+            return ucmp_->CompareTimestamp(s1, s2) > 0;
+          });
+      auto ts_idx = ts_pos - tombstones_->ts_iter(pos_->seq_start_idx);
+      auto seq_idx = seq_pos_ - tombstones_->seq_iter(pos_->seq_start_idx);
+      if (seq_idx < ts_idx) {
+        // seq and ts are ordered in non-increasing order. Only update
+        // seq_pos_ to a larger index for a smaller sequence number and
+        // timestamp.
+        seq_pos_ = tombstones_->seq_iter(pos_->seq_start_idx + ts_idx);
+      }
+    }
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/read_callback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/read_callback.h
new file mode 100644
index 0000000000..c042352db0
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/read_callback.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "db/dbformat.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ReadCallback {
+ public:
+  explicit ReadCallback(SequenceNumber last_visible_seq)
+      : max_visible_seq_(last_visible_seq) {}
+  ReadCallback(SequenceNumber last_visible_seq, SequenceNumber min_uncommitted)
+      : max_visible_seq_(last_visible_seq),
+        min_uncommitted_(min_uncommitted) {}
+
+  virtual ~ReadCallback() {}
+
+  // Will be called to see if the seq number is visible; if not, it moves on
+  // to the next seq number.
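+  // For instance, a minimal snapshot-style override (a sketch, not part of
+  // this file) could be:
+  //
+  //   class SnapshotReadCallback : public ReadCallback {
+  //    public:
+  //     using ReadCallback::ReadCallback;
+  //     bool IsVisibleFullCheck(SequenceNumber seq) override {
+  //       return seq <= max_visible_seq();
+  //     }
+  //   };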
+ virtual bool IsVisibleFullCheck(SequenceNumber seq) = 0; + + inline bool IsVisible(SequenceNumber seq) { + assert(min_uncommitted_ > 0); + assert(min_uncommitted_ >= kMinUnCommittedSeq); + if (seq < min_uncommitted_) { // handles seq == 0 as well + assert(seq <= max_visible_seq_); + return true; + } else if (max_visible_seq_ < seq) { + assert(seq != 0); + return false; + } else { + assert(seq != 0); // already handled in the first if-then clause + return IsVisibleFullCheck(seq); + } + } + + inline SequenceNumber max_visible_seq() { return max_visible_seq_; } + + // Refresh to a more recent visible seq + virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; } + + protected: + // The max visible seq, it is usually the snapshot but could be larger if + // transaction has its own writes written to db. + SequenceNumber max_visible_seq_ = kMaxSequenceNumber; + // Any seq less than min_uncommitted_ is committed. + const SequenceNumber min_uncommitted_ = kMinUnCommittedSeq; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/repair.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/repair.cc new file mode 100644 index 0000000000..67513cacc4 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/repair.cc @@ -0,0 +1,839 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Repairer does best effort recovery to recover as much data as possible after +// a disaster without compromising consistency. It does not guarantee bringing +// the database to a time consistent state. +// +// Repair process is broken into 4 phases: +// (a) Find files +// (b) Convert logs to tables +// (c) Extract metadata +// (d) Write Descriptor +// +// (a) Find files +// +// The repairer goes through all the files in the directory, and classifies them +// based on their file name. Any file that cannot be identified by name will be +// ignored. +// +// (b) Convert logs to table +// +// Every log file that is active is replayed. All sections of the file where the +// checksum does not match is skipped over. We intentionally give preference to +// data consistency. +// +// (c) Extract metadata +// +// We scan every table to compute +// (1) smallest/largest for the table +// (2) largest sequence number in the table +// (3) oldest blob file referred to by the table (if applicable) +// +// If we are unable to scan the file, then we ignore the table. +// +// (d) Write Descriptor +// +// We generate descriptor contents: +// - log number is set to zero +// - next-file-number is set to 1 + largest file number we found +// - last-sequence-number is set to largest sequence# found across +// all tables (see 2c) +// - compaction pointers are cleared +// - every table file is added at level 0 +// +// Possible optimization 1: +// (a) Compute total size and use to pick appropriate max-level M +// (b) Sort tables by largest sequence# in the table +// (c) For each table: if it overlaps earlier table, place in level-0, +// else place in level-M. 
+// (d) We can provide options for time consistent recovery and unsafe recovery +// (ignore checksum failure when applicable) +// Possible optimization 2: +// Store per-table metadata (smallest, largest, largest-seq#, ...) +// in the table's meta section to speed up ScanTable. + +#include "db/version_builder.h" + +#include + +#include "db/builder.h" +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "db/write_batch_internal.h" +#include "file/filename.h" +#include "file/writable_file_writer.h" +#include "logging/logging.h" +#include "options/cf_options.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/scoped_arena_iterator.h" +#include "table/unique_id_impl.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +class Repairer { + public: + Repairer(const std::string& dbname, const DBOptions& db_options, + const std::vector& column_families, + const ColumnFamilyOptions& default_cf_opts, + const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs) + : dbname_(dbname), + db_session_id_(DBImpl::GenerateDbSessionId(db_options.env)), + env_(db_options.env), + file_options_(), + db_options_(SanitizeOptions(dbname_, db_options)), + immutable_db_options_(ImmutableDBOptions(db_options_)), + icmp_(default_cf_opts.comparator), + default_cf_opts_( + SanitizeOptions(immutable_db_options_, default_cf_opts)), + default_iopts_( + ImmutableOptions(immutable_db_options_, default_cf_opts_)), + unknown_cf_opts_( + SanitizeOptions(immutable_db_options_, unknown_cf_opts)), + create_unknown_cfs_(create_unknown_cfs), + raw_table_cache_( + // TableCache can be small since we expect each table to be opened + // once. + NewLRUCache(10, db_options_.table_cache_numshardbits)), + table_cache_(new TableCache(default_iopts_, &file_options_, + raw_table_cache_.get(), + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, db_session_id_)), + wb_(db_options_.db_write_buffer_size), + wc_(db_options_.delayed_write_rate), + vset_(dbname_, &immutable_db_options_, file_options_, + raw_table_cache_.get(), &wb_, &wc_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", db_session_id_), + next_file_number_(1), + db_lock_(nullptr), + closed_(false) { + for (const auto& cfd : column_families) { + cf_name_to_opts_[cfd.name] = cfd.options; + } + } + + const ColumnFamilyOptions* GetColumnFamilyOptions( + const std::string& cf_name) { + if (cf_name_to_opts_.find(cf_name) == cf_name_to_opts_.end()) { + if (create_unknown_cfs_) { + return &unknown_cf_opts_; + } + return nullptr; + } + return &cf_name_to_opts_[cf_name]; + } + + // Adds a column family to the VersionSet with cf_options_ and updates + // manifest. 
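+  // Example flow (illustrative values): ScanTable() below finds a table
+  // whose properties name column family "cf1" with id 3; the id is unknown
+  // to the VersionSet, so AddColumnFamily("cf1", 3) creates it, falling back
+  // to unknown_cf_opts_ when "cf1" was not passed in by the caller.
+  //
+  //   // Top-level usage (sketch; see the RepairDB() entry points at the
+  //   // end of this file):
+  //   Status s = RepairDB("/path/to/db", Options());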
+ Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) { + // TODO: plumb Env::IOActivity; + const ReadOptions read_options; + const auto* cf_opts = GetColumnFamilyOptions(cf_name); + if (cf_opts == nullptr) { + return Status::Corruption("Encountered unknown column family with name=" + + cf_name + ", id=" + std::to_string(cf_id)); + } + Options opts(db_options_, *cf_opts); + MutableCFOptions mut_cf_opts(opts); + + VersionEdit edit; + edit.SetComparatorName(opts.comparator->Name()); + edit.SetLogNumber(0); + edit.SetColumnFamily(cf_id); + ColumnFamilyData* cfd; + cfd = nullptr; + edit.AddColumnFamily(cf_name); + + mutex_.Lock(); + std::unique_ptr db_dir; + Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), + &db_dir, nullptr); + if (status.ok()) { + status = vset_.LogAndApply(cfd, mut_cf_opts, read_options, &edit, &mutex_, + db_dir.get(), false /* new_descriptor_log */, + cf_opts); + } + mutex_.Unlock(); + return status; + } + + Status Close() { + Status s = Status::OK(); + if (!closed_) { + if (db_lock_ != nullptr) { + s = env_->UnlockFile(db_lock_); + db_lock_ = nullptr; + } + closed_ = true; + } + return s; + } + + ~Repairer() { Close().PermitUncheckedError(); } + + Status Run() { + Status status = env_->LockFile(LockFileName(dbname_), &db_lock_); + if (!status.ok()) { + return status; + } + status = FindFiles(); + DBImpl* db_impl = nullptr; + if (status.ok()) { + // Discard older manifests and start a fresh one + for (size_t i = 0; i < manifests_.size(); i++) { + ArchiveFile(dbname_ + "/" + manifests_[i]); + } + // Just create a DBImpl temporarily so we can reuse NewDB() + db_impl = new DBImpl(db_options_, dbname_); + status = db_impl->NewDB(/*new_filenames=*/nullptr); + } + delete db_impl; + + if (status.ok()) { + // Recover using the fresh manifest created by NewDB() + status = + vset_.Recover({{kDefaultColumnFamilyName, default_cf_opts_}}, false); + } + if (status.ok()) { + // Need to scan existing SST files first so the column families are + // created before we process WAL files + ExtractMetaData(); + + // ExtractMetaData() uses table_fds_ to know which SST files' metadata to + // extract -- we need to clear it here since metadata for existing SST + // files has been extracted already + table_fds_.clear(); + ConvertLogFilesToTables(); + ExtractMetaData(); + status = AddTables(); + } + if (status.ok()) { + uint64_t bytes = 0; + for (size_t i = 0; i < tables_.size(); i++) { + bytes += tables_[i].meta.fd.GetFileSize(); + } + ROCKS_LOG_WARN(db_options_.info_log, + "**** Repaired rocksdb %s; " + "recovered %" ROCKSDB_PRIszt " files; %" PRIu64 + " bytes. " + "Some data may have been lost. 
" + "****", + dbname_.c_str(), tables_.size(), bytes); + } + return status; + } + + private: + struct TableInfo { + FileMetaData meta; + uint32_t column_family_id; + std::string column_family_name; + }; + + std::string const dbname_; + std::string db_session_id_; + Env* const env_; + const FileOptions file_options_; + const DBOptions db_options_; + const ImmutableDBOptions immutable_db_options_; + const InternalKeyComparator icmp_; + const ColumnFamilyOptions default_cf_opts_; + const ImmutableOptions default_iopts_; // table_cache_ holds reference + const ColumnFamilyOptions unknown_cf_opts_; + const bool create_unknown_cfs_; + std::shared_ptr raw_table_cache_; + std::unique_ptr table_cache_; + WriteBufferManager wb_; + WriteController wc_; + VersionSet vset_; + std::unordered_map cf_name_to_opts_; + InstrumentedMutex mutex_; + + std::vector manifests_; + std::vector table_fds_; + std::vector logs_; + std::vector tables_; + uint64_t next_file_number_; + // Lock over the persistent DB state. Non-nullptr iff successfully + // acquired. + FileLock* db_lock_; + bool closed_; + + Status FindFiles() { + std::vector filenames; + bool found_file = false; + std::vector to_search_paths; + + for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) { + to_search_paths.push_back(db_options_.db_paths[path_id].path); + } + + // search wal_dir if user uses a customize wal_dir + bool same = immutable_db_options_.IsWalDirSameAsDBPath(dbname_); + if (!same) { + to_search_paths.push_back(immutable_db_options_.wal_dir); + } + + for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) { + ROCKS_LOG_INFO(db_options_.info_log, "Searching path %s\n", + to_search_paths[path_id].c_str()); + Status status = env_->GetChildren(to_search_paths[path_id], &filenames); + if (!status.ok()) { + return status; + } + if (!filenames.empty()) { + found_file = true; + } + + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { + if (type == kDescriptorFile) { + manifests_.push_back(filenames[i]); + } else { + if (number + 1 > next_file_number_) { + next_file_number_ = number + 1; + } + if (type == kWalFile) { + logs_.push_back(number); + } else if (type == kTableFile) { + table_fds_.emplace_back(number, static_cast(path_id), + 0); + } else { + // Ignore other files + } + } + } + } + } + if (!found_file) { + return Status::Corruption(dbname_, "repair found no files"); + } + return Status::OK(); + } + + void ConvertLogFilesToTables() { + const auto& wal_dir = immutable_db_options_.GetWalDir(); + for (size_t i = 0; i < logs_.size(); i++) { + // we should use LogFileName(wal_dir, logs_[i]) here. user might uses + // wal_dir option. + std::string logname = LogFileName(wal_dir, logs_[i]); + Status status = ConvertLogToTable(wal_dir, logs_[i]); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Log #%" PRIu64 ": ignoring conversion error: %s", + logs_[i], status.ToString().c_str()); + } + ArchiveFile(logname); + } + } + + Status ConvertLogToTable(const std::string& wal_dir, uint64_t log) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + std::shared_ptr info_log; + uint64_t lognum; + void Corruption(size_t bytes, const Status& s) override { + // We print error messages for corruption, but continue repairing. 
+ ROCKS_LOG_ERROR(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s", + lognum, static_cast(bytes), s.ToString().c_str()); + } + }; + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + + // Open the log file + std::string logname = LogFileName(wal_dir, log); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr lfile_reader; + Status status = SequentialFileReader::Create( + fs, logname, fs->OptimizeForLogRead(file_options_), &lfile_reader, + nullptr /* dbg */, nullptr /* rate limiter */); + if (!status.ok()) { + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = db_options_.info_log; + reporter.lognum = log; + // We intentionally make log::Reader do checksumming so that + // corruptions cause entire commits to be skipped instead of + // propagating bad information (like overly large sequence + // numbers). + log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter, + true /*enable checksum*/, log); + + // Initialize per-column family memtables + for (auto* cfd : *vset_.GetColumnFamilySet()) { + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), + kMaxSequenceNumber); + } + auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet()); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + int counter = 0; + while (reader.ReadRecord(&record, &scratch)) { + if (record.size() < WriteBatchInternal::kHeader) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + continue; + } + Status record_status = WriteBatchInternal::SetContents(&batch, record); + if (record_status.ok()) { + record_status = + WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr); + } + if (record_status.ok()) { + counter += WriteBatchInternal::Count(&batch); + } else { + ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring %s", + log, record_status.ToString().c_str()); + } + } + + // Dump a table for each column family with entries in this log file. + for (auto* cfd : *vset_.GetColumnFamilySet()) { + // Do not record a version edit for this conversion to a Table + // since ExtractMetaData() will also generate edits. 
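+      // Per-CF flow at this point (sketch): WAL records were replayed into
+      // each column family's memtable above; any non-empty memtable is now
+      // flushed to a fresh level-0 table via BuildTable():
+      //
+      //   WAL record -> WriteBatch -> InsertInto(memtable)
+      //              -> BuildTable() -> SST file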
+ MemTable* mem = cfd->mem(); + if (mem->IsEmpty()) { + continue; + } + + FileMetaData meta; + meta.fd = FileDescriptor(next_file_number_++, 0, 0); + // TODO: plumb Env::IOActivity + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + int64_t _current_time = 0; + immutable_db_options_.clock->GetCurrentTime(&_current_time) + .PermitUncheckedError(); // ignore error + const uint64_t current_time = static_cast(_current_time); + meta.file_creation_time = current_time; + SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance(); + + auto write_hint = cfd->CalculateSSTWriteHint(0); + std::vector> + range_del_iters; + auto range_del_iter = mem->NewRangeTombstoneIterator( + ro, kMaxSequenceNumber, false /* immutable_memtable */); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } + + IOStatus io_s; + CompressionOptions default_compression; + TableBuilderOptions tboptions( + *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), + cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), + kNoCompression, default_compression, cfd->GetID(), cfd->GetName(), + -1 /* level */, false /* is_bottommost */, + TableFileCreationReason::kRecovery, 0 /* oldest_key_time */, + 0 /* file_creation_time */, "DB Repairer" /* db_id */, db_session_id_, + 0 /*target_file_size*/, meta.fd.GetNumber()); + + SeqnoToTimeMapping empty_seqno_time_mapping; + status = BuildTable( + dbname_, /* versions */ nullptr, immutable_db_options_, tboptions, + file_options_, read_options, table_cache_.get(), iter.get(), + std::move(range_del_iters), &meta, nullptr /* blob_file_additions */, + {}, kMaxSequenceNumber, kMaxSequenceNumber, snapshot_checker, + false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s, + nullptr /*IOTracer*/, BlobFileCreationReason::kRecovery, + empty_seqno_time_mapping, nullptr /* event_logger */, 0 /* job_id */, + Env::IO_HIGH, nullptr /* table_properties */, write_hint); + ROCKS_LOG_INFO(db_options_.info_log, + "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", + log, counter, meta.fd.GetNumber(), + status.ToString().c_str()); + if (status.ok()) { + if (meta.fd.GetFileSize() > 0) { + table_fds_.push_back(meta.fd); + } + } else { + break; + } + } + delete cf_mems; + return status; + } + + void ExtractMetaData() { + for (size_t i = 0; i < table_fds_.size(); i++) { + TableInfo t; + t.meta.fd = table_fds_[i]; + Status status = ScanTable(&t); + if (!status.ok()) { + std::string fname = TableFileName( + db_options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId()); + char file_num_buf[kFormatFileNumberBufSize]; + FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(), + file_num_buf, sizeof(file_num_buf)); + ROCKS_LOG_WARN(db_options_.info_log, "Table #%s: ignoring %s", + file_num_buf, status.ToString().c_str()); + ArchiveFile(fname); + } else { + tables_.push_back(t); + } + } + } + + Status ScanTable(TableInfo* t) { + std::string fname = TableFileName( + db_options_.db_paths, t->meta.fd.GetNumber(), t->meta.fd.GetPathId()); + int counter = 0; + uint64_t file_size; + Status status = env_->GetFileSize(fname, &file_size); + t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(), + file_size); + std::shared_ptr props; + if (status.ok()) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status = table_cache_->GetTableProperties( + file_options_, read_options, icmp_, t->meta, &props, + 0 /* block_protection_bytes_per_key 
*/); + } + if (status.ok()) { + auto s = + GetSstInternalUniqueId(props->db_id, props->db_session_id, + props->orig_file_number, &t->meta.unique_id); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Table #%" PRIu64 + ": unable to get unique id, default to Unknown.", + t->meta.fd.GetNumber()); + } + t->column_family_id = static_cast(props->column_family_id); + if (t->column_family_id == + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) { + ROCKS_LOG_WARN( + db_options_.info_log, + "Table #%" PRIu64 + ": column family unknown (probably due to legacy format); " + "adding to default column family id 0.", + t->meta.fd.GetNumber()); + t->column_family_id = 0; + } + + if (vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id) == + nullptr) { + status = + AddColumnFamily(props->column_family_name, t->column_family_id); + } + t->meta.oldest_ancester_time = props->creation_time; + } + if (status.ok()) { + uint64_t tail_size = 0; + bool contain_no_data_blocks = + props->num_entries > 0 && + (props->num_entries == props->num_range_deletions); + if (props->tail_start_offset > 0 || contain_no_data_blocks) { + assert(props->tail_start_offset <= file_size); + tail_size = file_size - props->tail_start_offset; + } + t->meta.tail_size = tail_size; + } + ColumnFamilyData* cfd = nullptr; + if (status.ok()) { + cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id); + if (cfd->GetName() != props->column_family_name) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Table #%" PRIu64 + ": inconsistent column family name '%s'; expected '%s' for column " + "family id %" PRIu32 ".", + t->meta.fd.GetNumber(), props->column_family_name.c_str(), + cfd->GetName().c_str(), t->column_family_id); + status = Status::Corruption(dbname_, "inconsistent column family name"); + } + } + if (status.ok()) { + // TODO: plumb Env::IOActivity + ReadOptions ropts; + ropts.total_order_seek = true; + InternalIterator* iter = table_cache_->NewIterator( + ropts, file_options_, cfd->internal_comparator(), t->meta, + nullptr /* range_del_agg */, + cfd->GetLatestMutableCFOptions()->prefix_extractor, + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false, + /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false, + cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key); + ParsedInternalKey parsed; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + Slice key = iter->key(); + Status pik_status = + ParseInternalKey(key, &parsed, db_options_.allow_data_in_errors); + if (!pik_status.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Table #%" PRIu64 ": unparsable key - %s", + t->meta.fd.GetNumber(), pik_status.getState()); + continue; + } + + counter++; + + status = t->meta.UpdateBoundaries(key, iter->value(), parsed.sequence, + parsed.type); + if (!status.ok()) { + break; + } + } + if (status.ok() && !iter->status().ok()) { + status = iter->status(); + } + delete iter; + + ROCKS_LOG_INFO(db_options_.info_log, "Table #%" PRIu64 ": %d entries %s", + t->meta.fd.GetNumber(), counter, + status.ToString().c_str()); + } + if (status.ok()) { + // XXX/FIXME: This is just basic, naive handling of range tombstones, + // like call to UpdateBoundariesForRange in builder.cc where we assume + // an SST file is a full sorted run. 
This probably needs the extra logic + // from compaction_job.cc around call to UpdateBoundariesForRange (to + // handle range tombstones extendingg beyond range of other entries). + // TODO: plumb Env::IOActivity + ReadOptions ropts; + std::unique_ptr r_iter; + status = table_cache_->GetRangeTombstoneIterator( + ropts, cfd->internal_comparator(), t->meta, + cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key, + &r_iter); + + if (r_iter) { + r_iter->SeekToFirst(); + + while (r_iter->Valid()) { + auto tombstone = r_iter->Tombstone(); + auto kv = tombstone.Serialize(); + t->meta.UpdateBoundariesForRange( + kv.first, tombstone.SerializeEndKey(), tombstone.seq_, + cfd->internal_comparator()); + r_iter->Next(); + } + } + } + return status; + } + + Status AddTables() { + // TODO: plumb Env::IOActivity; + const ReadOptions read_options; + std::unordered_map> cf_id_to_tables; + SequenceNumber max_sequence = 0; + for (size_t i = 0; i < tables_.size(); i++) { + cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]); + if (max_sequence < tables_[i].meta.fd.largest_seqno) { + max_sequence = tables_[i].meta.fd.largest_seqno; + } + } + vset_.SetLastAllocatedSequence(max_sequence); + vset_.SetLastPublishedSequence(max_sequence); + vset_.SetLastSequence(max_sequence); + + for (const auto& cf_id_and_tables : cf_id_to_tables) { + auto* cfd = + vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first); + + // Recover files' epoch number using dummy VersionStorageInfo + VersionBuilder dummy_version_builder( + cfd->current()->version_set()->file_options(), cfd->ioptions(), + cfd->table_cache(), cfd->current()->storage_info(), + cfd->current()->version_set(), + cfd->GetFileMetadataCacheReservationManager()); + VersionStorageInfo dummy_vstorage( + &cfd->internal_comparator(), cfd->user_comparator(), + cfd->NumberLevels(), cfd->ioptions()->compaction_style, + nullptr /* src_vstorage */, cfd->ioptions()->force_consistency_checks, + EpochNumberRequirement::kMightMissing); + Status s; + VersionEdit dummy_edit; + for (const auto* table : cf_id_and_tables.second) { + // TODO(opt): separate out into multiple levels + dummy_edit.AddFile( + 0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(), + table->meta.fd.GetFileSize(), table->meta.smallest, + table->meta.largest, table->meta.fd.smallest_seqno, + table->meta.fd.largest_seqno, table->meta.marked_for_compaction, + table->meta.temperature, table->meta.oldest_blob_file_number, + table->meta.oldest_ancester_time, table->meta.file_creation_time, + table->meta.epoch_number, table->meta.file_checksum, + table->meta.file_checksum_func_name, table->meta.unique_id, + table->meta.compensated_range_deletion_size, table->meta.tail_size); + } + s = dummy_version_builder.Apply(&dummy_edit); + if (s.ok()) { + s = dummy_version_builder.SaveTo(&dummy_vstorage); + } + if (s.ok()) { + dummy_vstorage.RecoverEpochNumbers(cfd); + } + if (s.ok()) { + // Record changes from this repair in VersionEdit, including files with + // recovered epoch numbers + VersionEdit edit; + edit.SetComparatorName(cfd->user_comparator()->Name()); + edit.SetLogNumber(0); + edit.SetNextFile(next_file_number_); + edit.SetColumnFamily(cfd->GetID()); + for (int level = 0; level < dummy_vstorage.num_levels(); ++level) { + for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) { + edit.AddFile(level, *file_meta); + } + } + + // Release resources occupied by the dummy VersionStorageInfo + for (int level = 0; level < dummy_vstorage.num_levels(); ++level) { + for 
(FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) { + file_meta->refs--; + if (file_meta->refs <= 0) { + delete file_meta; + } + } + } + + // Persist record of changes + assert(next_file_number_ > 0); + vset_.MarkFileNumberUsed(next_file_number_ - 1); + mutex_.Lock(); + std::unique_ptr db_dir; + s = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), &db_dir, + nullptr); + if (s.ok()) { + s = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + read_options, &edit, &mutex_, db_dir.get(), + false /* new_descriptor_log */); + } + mutex_.Unlock(); + } + if (!s.ok()) { + return s; + } + } + return Status::OK(); + } + + void ArchiveFile(const std::string& fname) { + // Move into another directory. E.g., for + // dir/foo + // rename to + // dir/lost/foo + const char* slash = strrchr(fname.c_str(), '/'); + std::string new_dir; + if (slash != nullptr) { + new_dir.assign(fname.data(), slash - fname.data()); + } + new_dir.append("/lost"); + env_->CreateDir(new_dir).PermitUncheckedError(); // Ignore error + std::string new_file = new_dir; + new_file.append("/"); + new_file.append((slash == nullptr) ? fname.c_str() : slash + 1); + Status s = env_->RenameFile(fname, new_file); + ROCKS_LOG_INFO(db_options_.info_log, "Archiving %s: %s\n", fname.c_str(), + s.ToString().c_str()); + } +}; + +Status GetDefaultCFOptions( + const std::vector& column_families, + ColumnFamilyOptions* res) { + assert(res != nullptr); + auto iter = std::find_if(column_families.begin(), column_families.end(), + [](const ColumnFamilyDescriptor& cfd) { + return cfd.name == kDefaultColumnFamilyName; + }); + if (iter == column_families.end()) { + return Status::InvalidArgument( + "column_families", "Must contain entry for default column family"); + } + *res = iter->options; + return Status::OK(); +} +} // anonymous namespace + +Status RepairDB(const std::string& dbname, const DBOptions& db_options, + const std::vector& column_families) { + ColumnFamilyOptions default_cf_opts; + Status status = GetDefaultCFOptions(column_families, &default_cf_opts); + if (!status.ok()) { + return status; + } + + Repairer repairer(dbname, db_options, column_families, default_cf_opts, + ColumnFamilyOptions() /* unknown_cf_opts */, + false /* create_unknown_cfs */); + status = repairer.Run(); + if (status.ok()) { + status = repairer.Close(); + } + return status; +} + +Status RepairDB(const std::string& dbname, const DBOptions& db_options, + const std::vector& column_families, + const ColumnFamilyOptions& unknown_cf_opts) { + ColumnFamilyOptions default_cf_opts; + Status status = GetDefaultCFOptions(column_families, &default_cf_opts); + if (!status.ok()) { + return status; + } + + Repairer repairer(dbname, db_options, column_families, default_cf_opts, + unknown_cf_opts, true /* create_unknown_cfs */); + status = repairer.Run(); + if (status.ok()) { + status = repairer.Close(); + } + return status; +} + +Status RepairDB(const std::string& dbname, const Options& options) { + Options opts(options); + DBOptions db_options(opts); + ColumnFamilyOptions cf_options(opts); + + Repairer repairer(dbname, db_options, {}, cf_options /* default_cf_opts */, + cf_options /* unknown_cf_opts */, + true /* create_unknown_cfs */); + Status status = repairer.Run(); + if (status.ok()) { + status = repairer.Close(); + } + return status; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/seqno_to_time_mapping.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/seqno_to_time_mapping.cc new file mode 100644 
index 0000000000..c692099294 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/seqno_to_time_mapping.cc @@ -0,0 +1,341 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/seqno_to_time_mapping.h" + +#include "db/version_edit.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +uint64_t SeqnoToTimeMapping::GetOldestApproximateTime( + const SequenceNumber seqno) const { + assert(is_sorted_); + auto it = std::upper_bound(seqno_time_mapping_.begin(), + seqno_time_mapping_.end(), seqno); + if (it == seqno_time_mapping_.begin()) { + return 0; + } + it--; + return it->time; +} + +void SeqnoToTimeMapping::Add(SequenceNumber seqno, uint64_t time) { + if (seqno == 0) { + return; + } + is_sorted_ = false; + seqno_time_mapping_.emplace_back(seqno, time); +} + +void SeqnoToTimeMapping::TruncateOldEntries(const uint64_t now) { + assert(is_sorted_); + + if (max_time_duration_ == 0) { + return; + } + + const uint64_t cut_off_time = + now > max_time_duration_ ? now - max_time_duration_ : 0; + assert(cut_off_time <= now); // no overflow + + auto it = std::upper_bound( + seqno_time_mapping_.begin(), seqno_time_mapping_.end(), cut_off_time, + [](uint64_t target, const SeqnoTimePair& other) -> bool { + return target < other.time; + }); + if (it == seqno_time_mapping_.begin()) { + return; + } + it--; + seqno_time_mapping_.erase(seqno_time_mapping_.begin(), it); +} + +SequenceNumber SeqnoToTimeMapping::GetOldestSequenceNum(uint64_t time) { + assert(is_sorted_); + + auto it = std::upper_bound( + seqno_time_mapping_.begin(), seqno_time_mapping_.end(), time, + [](uint64_t target, const SeqnoTimePair& other) -> bool { + return target < other.time; + }); + if (it == seqno_time_mapping_.begin()) { + return 0; + } + it--; + return it->seqno; +} + +// The encoded format is: +// [num_of_entries][[seqno][time],[seqno][time],...] +// ^ ^ +// var_int delta_encoded (var_int) +void SeqnoToTimeMapping::Encode(std::string& dest, const SequenceNumber start, + const SequenceNumber end, const uint64_t now, + const uint64_t output_size) const { + assert(is_sorted_); + if (start > end) { + // It could happen when the SST file is empty, the initial value of min + // sequence number is kMaxSequenceNumber and max is 0. + // The empty output file will be removed in the final step of compaction. + return; + } + + auto start_it = std::upper_bound(seqno_time_mapping_.begin(), + seqno_time_mapping_.end(), start); + if (start_it != seqno_time_mapping_.begin()) { + start_it--; + } + + auto end_it = std::upper_bound(seqno_time_mapping_.begin(), + seqno_time_mapping_.end(), end); + if (end_it == seqno_time_mapping_.begin()) { + return; + } + if (start_it >= end_it) { + return; + } + + // truncate old entries that are not needed + if (max_time_duration_ > 0) { + const uint64_t cut_off_time = + now > max_time_duration_ ? now - max_time_duration_ : 0; + while (start_it < end_it && start_it->time < cut_off_time) { + start_it++; + } + } + // to include the first element + if (start_it != seqno_time_mapping_.begin()) { + start_it--; + } + + // If there are more data than needed, pick the entries for encoding. + // It's not the most optimized algorithm for selecting the best representative + // entries over the time. 
+ // It starts from the beginning and makes sure the distance is larger than + // `(end - start) / size` before selecting the number. For example, for the + // following list, pick 3 entries (it will pick seqno #1, #6, #8): + // 1 -> 10 + // 5 -> 17 + // 6 -> 25 + // 8 -> 30 + // first, it always picks the first one, then there are 2 num_entries_to_fill + // and the time difference between current one vs. the last one is + // (30 - 10) = 20. 20/2 = 10. So it will skip until 10+10 = 20. => it skips + // #5 and pick #6. + // But the most optimized solution is picking #1 #5 #8, as it will be more + // evenly distributed for time. Anyway the following algorithm is simple and + // may over-select new data, which is good. We do want more accurate time + // information for recent data. + std::deque output_copy; + if (std::distance(start_it, end_it) > static_cast(output_size)) { + int64_t num_entries_to_fill = static_cast(output_size); + auto last_it = end_it; + last_it--; + uint64_t end_time = last_it->time; + uint64_t skip_until_time = 0; + for (auto it = start_it; it < end_it; it++) { + // skip if it's not reach the skip_until_time yet + if (std::distance(it, end_it) > num_entries_to_fill && + it->time < skip_until_time) { + continue; + } + output_copy.push_back(*it); + num_entries_to_fill--; + if (std::distance(it, end_it) > num_entries_to_fill && + num_entries_to_fill > 0) { + // If there are more entries than we need, re-calculate the + // skip_until_time, which means skip until that time + skip_until_time = + it->time + ((end_time - it->time) / num_entries_to_fill); + } + } + + // Make sure all entries are filled + assert(num_entries_to_fill == 0); + start_it = output_copy.begin(); + end_it = output_copy.end(); + } + + // Delta encode the data + uint64_t size = std::distance(start_it, end_it); + PutVarint64(&dest, size); + SeqnoTimePair base; + for (auto it = start_it; it < end_it; it++) { + assert(base < *it); + SeqnoTimePair val = *it - base; + base = *it; + val.Encode(dest); + } +} + +Status SeqnoToTimeMapping::Add(const std::string& seqno_time_mapping_str) { + Slice input(seqno_time_mapping_str); + if (input.empty()) { + return Status::OK(); + } + uint64_t size; + if (!GetVarint64(&input, &size)) { + return Status::Corruption("Invalid sequence number time size"); + } + is_sorted_ = false; + SeqnoTimePair base; + for (uint64_t i = 0; i < size; i++) { + SeqnoTimePair val; + Status s = val.Decode(input); + if (!s.ok()) { + return s; + } + val.Add(base); + seqno_time_mapping_.emplace_back(val); + base = val; + } + return Status::OK(); +} + +void SeqnoToTimeMapping::SeqnoTimePair::Encode(std::string& dest) const { + PutVarint64Varint64(&dest, seqno, time); +} + +Status SeqnoToTimeMapping::SeqnoTimePair::Decode(Slice& input) { + if (!GetVarint64(&input, &seqno)) { + return Status::Corruption("Invalid sequence number"); + } + if (!GetVarint64(&input, &time)) { + return Status::Corruption("Invalid time"); + } + return Status::OK(); +} + +bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) { + assert(is_sorted_); + + // skip seq number 0, which may have special meaning, like zeroed out data + if (seqno == 0) { + return false; + } + if (!Empty()) { + if (seqno < Last().seqno || time < Last().time) { + return false; + } + if (seqno == Last().seqno) { + Last().time = time; + return true; + } + if (time == Last().time) { + // new sequence has the same time as old one, no need to add new mapping + return false; + } + } + + seqno_time_mapping_.emplace_back(seqno, time); + + if 
(seqno_time_mapping_.size() > max_capacity_) { + seqno_time_mapping_.pop_front(); + } + return true; +} + +bool SeqnoToTimeMapping::Resize(uint64_t min_time_duration, + uint64_t max_time_duration) { + uint64_t new_max_capacity = + CalculateMaxCapacity(min_time_duration, max_time_duration); + if (new_max_capacity == max_capacity_) { + return false; + } else if (new_max_capacity < seqno_time_mapping_.size()) { + uint64_t delta = seqno_time_mapping_.size() - new_max_capacity; + seqno_time_mapping_.erase(seqno_time_mapping_.begin(), + seqno_time_mapping_.begin() + delta); + } + max_capacity_ = new_max_capacity; + return true; +} + +Status SeqnoToTimeMapping::Sort() { + if (is_sorted_) { + return Status::OK(); + } + if (seqno_time_mapping_.empty()) { + is_sorted_ = true; + return Status::OK(); + } + + std::deque copy = std::move(seqno_time_mapping_); + + std::sort(copy.begin(), copy.end()); + + seqno_time_mapping_.clear(); + + // remove seqno = 0, which may have special meaning, like zeroed out data + while (copy.front().seqno == 0) { + copy.pop_front(); + } + + SeqnoTimePair prev = copy.front(); + for (const auto& it : copy) { + // If sequence number is the same, pick the one with larger time, which is + // more accurate than the older time. + if (it.seqno == prev.seqno) { + assert(it.time >= prev.time); + prev.time = it.time; + } else { + assert(it.seqno > prev.seqno); + // If a larger sequence number has an older time which is not useful, skip + if (it.time > prev.time) { + seqno_time_mapping_.push_back(prev); + prev = it; + } + } + } + seqno_time_mapping_.emplace_back(prev); + + is_sorted_ = true; + return Status::OK(); +} + +std::string SeqnoToTimeMapping::ToHumanString() const { + std::string ret; + for (const auto& seq_time : seqno_time_mapping_) { + AppendNumberTo(&ret, seq_time.seqno); + ret.append("->"); + AppendNumberTo(&ret, seq_time.time); + ret.append(","); + } + return ret; +} + +SeqnoToTimeMapping SeqnoToTimeMapping::Copy( + SequenceNumber smallest_seqno) const { + SeqnoToTimeMapping ret; + auto it = std::upper_bound(seqno_time_mapping_.begin(), + seqno_time_mapping_.end(), smallest_seqno); + if (it != seqno_time_mapping_.begin()) { + it--; + } + std::copy(it, seqno_time_mapping_.end(), + std::back_inserter(ret.seqno_time_mapping_)); + return ret; +} + +uint64_t SeqnoToTimeMapping::CalculateMaxCapacity(uint64_t min_time_duration, + uint64_t max_time_duration) { + if (min_time_duration == 0) { + return 0; + } + return std::min( + kMaxSeqnoToTimeEntries, + max_time_duration * kMaxSeqnoTimePairsPerCF / min_time_duration); +} + +SeqnoToTimeMapping::SeqnoTimePair SeqnoToTimeMapping::SeqnoTimePair::operator-( + const SeqnoTimePair& other) const { + SeqnoTimePair res; + res.seqno = seqno - other.seqno; + res.time = time - other.time; + return res; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/seqno_to_time_mapping.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/seqno_to_time_mapping.h new file mode 100644 index 0000000000..4ffc9c1992 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/seqno_to_time_mapping.h @@ -0,0 +1,189 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
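
The Encode/Add implementations above store a varint count followed by delta-encoded (seqno, time) pairs. A self-contained sketch of the same scheme follows; EncodeVarint here is a local stand-in for RocksDB's PutVarint64, written out so the example compiles on its own:

    #include <cstdint>
    #include <string>
    #include <vector>

    // LEB128-style varint, the same shape PutVarint64 produces.
    static void EncodeVarint(std::string* out, uint64_t v) {
      while (v >= 0x80) {
        out->push_back(static_cast<char>(v | 0x80));
        v >>= 7;
      }
      out->push_back(static_cast<char>(v));
    }

    struct Pair { uint64_t seqno; uint64_t time; };

    // Mirrors SeqnoToTimeMapping::Encode's output layout:
    // [num_of_entries][seqno delta][time delta]...
    std::string EncodeDeltas(const std::vector<Pair>& pairs) {
      std::string dest;
      EncodeVarint(&dest, pairs.size());
      Pair base{0, 0};
      for (const Pair& p : pairs) {
        EncodeVarint(&dest, p.seqno - base.seqno);  // delta vs. previous pair
        EncodeVarint(&dest, p.time - base.time);
        base = p;
      }
      return dest;
    }
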
+ +#pragma once + +#include <algorithm> +#include <cinttypes> +#include <deque> +#include <functional> +#include <iterator> +#include <string> + +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +constexpr uint64_t kUnknownSeqnoTime = 0; + +// SeqnoToTimeMapping stores the sequence number to time mapping, so given a +// sequence number it can estimate the oldest possible time for that sequence +// number. For example: +// 10 -> 100 +// 50 -> 300 +// then if a key has seqno 19, the OldestApproximateTime would be 100, and for +// 51 it would be 300. +// As it's a sorted list, new entries are inserted at the back. Old entries +// are popped from the front when they're no longer needed. +// +// Note: this data structure is not thread safe; both reads and writes need to +// be synchronized by the caller. +class SeqnoToTimeMapping { + public: + // Maximum number of entries that can be encoded into an SST. The data is + // delta encoded, so the maximum data usage for each SST is < 0.3K. + static constexpr uint64_t kMaxSeqnoTimePairsPerSST = 100; + + // Maximum number of entries per CF. If there's only one CF with this + // feature on, the max duration is divided by this number to get the + // sampling cadence; for example, if + // preclude_last_level_data_seconds = 100000 (~1 day), then it will sample + // the seqno -> time mapping every 1000 seconds (~17 minutes), so the + // maximum number of entries it needs is 100. + // When there are multiple CFs with this feature on, the sampling cadence is + // determined by the smallest setting and the capacity by the largest + // setting; it is also capped at kMaxSeqnoTimePairsPerCF * 10. + static constexpr uint64_t kMaxSeqnoTimePairsPerCF = 100; + + // A simple struct for a sequence number to time pair + struct SeqnoTimePair { + SequenceNumber seqno = 0; + uint64_t time = 0; + + SeqnoTimePair() = default; + SeqnoTimePair(SequenceNumber _seqno, uint64_t _time) + : seqno(_seqno), time(_time) {} + + // Encode to dest string + void Encode(std::string& dest) const; + + // Decode the value from the input Slice and remove it from the input + Status Decode(Slice& input); + + // Subtraction of two SeqnoTimePairs + SeqnoTimePair operator-(const SeqnoTimePair& other) const; + + // Add two values together + void Add(const SeqnoTimePair& obj) { + seqno += obj.seqno; + time += obj.time; + } + + // Compare a SeqnoTimePair with a sequence number, used for binary + // searching a sequence number in a list of SeqnoTimePairs + bool operator<(const SequenceNumber& other) const { return seqno < other; } + + // Compare two SeqnoTimePairs + bool operator<(const SeqnoTimePair& other) const { + return std::tie(seqno, time) < std::tie(other.seqno, other.time); + } + + // Check if two SeqnoTimePairs are the same + bool operator==(const SeqnoTimePair& other) const { + return std::tie(seqno, time) == std::tie(other.seqno, other.time); + } + }; + + // Constructor of SeqnoToTimeMapping. + // max_time_duration is the maximum time it should track. For example, if + // preclude_last_level_data_seconds is 1 day, then entries older than 1 day + // can be removed. + // max_capacity is the maximum number of entries it can hold. For a single + // CF, it's capped at 100 (kMaxSeqnoTimePairsPerCF); otherwise + // kMaxSeqnoTimePairsPerCF * 10. + // If it's set to 0, it won't truncate any old data. + explicit SeqnoToTimeMapping(uint64_t max_time_duration = 0, + uint64_t max_capacity = 0) + : max_time_duration_(max_time_duration), max_capacity_(max_capacity) {} + + // Append a new entry to the list. The new entry should be newer than the + // existing ones. It keeps the list sorted. + bool Append(SequenceNumber seqno, uint64_t time); + + // Given a sequence number, estimate its oldest time + uint64_t GetOldestApproximateTime(SequenceNumber seqno) const; + + // Truncate the old entries based on the current time and max_time_duration_ + void TruncateOldEntries(uint64_t now); + + // Given a time, return its oldest possible sequence number + SequenceNumber GetOldestSequenceNum(uint64_t time); + + // Encode to a binary string + void Encode(std::string& des, SequenceNumber start, SequenceNumber end, + uint64_t now, + uint64_t output_size = kMaxSeqnoTimePairsPerSST) const; + + // Add a new entry. Unlike Append(), it can be any data, but it also leaves + // the list unsorted. + void Add(SequenceNumber seqno, uint64_t time); + + // Decode and add the entries to the current object. The list will be + // unsorted afterwards. + Status Add(const std::string& seqno_time_mapping_str); + + // Return the number of entries + size_t Size() const { return seqno_time_mapping_.size(); } + + // Reduce the size of the internal list + bool Resize(uint64_t min_time_duration, uint64_t max_time_duration); + + // Override the max_time_duration_ + void SetMaxTimeDuration(uint64_t max_time_duration) { + max_time_duration_ = max_time_duration; + } + + uint64_t GetCapacity() const { return max_capacity_; } + + // Sort the list and remove redundant or useless entries, making sure that + // both the seqnos and the times end up sorted. + Status Sort(); + + // Copy the current object, starting from the given smallest_seqno. + SeqnoToTimeMapping Copy(SequenceNumber smallest_seqno) const; + + // Whether the internal list is empty + bool Empty() const { return seqno_time_mapping_.empty(); } + + // Clear all entries + void Clear() { seqno_time_mapping_.clear(); } + + // Return a string for user messages + // Note: not efficient; okay for printing + std::string ToHumanString() const; + +#ifndef NDEBUG + const std::deque<SeqnoTimePair>& TEST_GetInternalMapping() const { + return seqno_time_mapping_; + } +#endif + + private: + static constexpr uint64_t kMaxSeqnoToTimeEntries = + kMaxSeqnoTimePairsPerCF * 10; + + uint64_t max_time_duration_; + uint64_t max_capacity_; + + std::deque<SeqnoTimePair> seqno_time_mapping_; + + bool is_sorted_ = true; + + static uint64_t CalculateMaxCapacity(uint64_t min_time_duration, + uint64_t max_time_duration); + + SeqnoTimePair& Last() { + assert(!Empty()); + return seqno_time_mapping_.back(); + } +}; + +// For binary searching a sequence number in a SeqnoToTimeMapping +inline bool operator<(const SequenceNumber& seqno, + const SeqnoToTimeMapping::SeqnoTimePair& other) { + return seqno < other.seqno; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/snapshot_checker.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/snapshot_checker.h new file mode 100644 index 0000000000..b7ff1df8c0 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/snapshot_checker.h @@ -0,0 +1,58 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +enum class SnapshotCheckerResult : int { + kInSnapshot = 0, + kNotInSnapshot = 1, + // In case the snapshot is released and the checker has no clue whether + // the given sequence is visible to the snapshot.
+ kSnapshotReleased = 2, +}; + +// Callback class that control GC of duplicate keys in flush/compaction. +class SnapshotChecker { + public: + virtual ~SnapshotChecker() {} + virtual SnapshotCheckerResult CheckInSnapshot( + SequenceNumber sequence, SequenceNumber snapshot_sequence) const = 0; +}; + +class DisableGCSnapshotChecker : public SnapshotChecker { + public: + virtual ~DisableGCSnapshotChecker() {} + virtual SnapshotCheckerResult CheckInSnapshot( + SequenceNumber /*sequence*/, + SequenceNumber /*snapshot_sequence*/) const override { + // By returning kNotInSnapshot, we prevent all the values from being GCed + return SnapshotCheckerResult::kNotInSnapshot; + } + static DisableGCSnapshotChecker* Instance(); + + protected: + explicit DisableGCSnapshotChecker() {} +}; + +class WritePreparedTxnDB; + +// Callback class created by WritePreparedTxnDB to check if a key +// is visible by a snapshot. +class WritePreparedSnapshotChecker : public SnapshotChecker { + public: + explicit WritePreparedSnapshotChecker(WritePreparedTxnDB* txn_db); + virtual ~WritePreparedSnapshotChecker() {} + + virtual SnapshotCheckerResult CheckInSnapshot( + SequenceNumber sequence, SequenceNumber snapshot_sequence) const override; + + private: + const WritePreparedTxnDB* const txn_db_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/snapshot_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/snapshot_impl.cc new file mode 100644 index 0000000000..98b4754634 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/snapshot_impl.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/db.h" +#include "rocksdb/snapshot.h" + +namespace ROCKSDB_NAMESPACE { + +ManagedSnapshot::ManagedSnapshot(DB* db) + : db_(db), snapshot_(db->GetSnapshot()) {} + +ManagedSnapshot::ManagedSnapshot(DB* db, const Snapshot* _snapshot) + : db_(db), snapshot_(_snapshot) {} + +ManagedSnapshot::~ManagedSnapshot() { + if (snapshot_) { + db_->ReleaseSnapshot(snapshot_); + } +} + +const Snapshot* ManagedSnapshot::snapshot() { return snapshot_; } + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/snapshot_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/snapshot_impl.h new file mode 100644 index 0000000000..23e5e98cd2 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/snapshot_impl.h @@ -0,0 +1,239 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include "db/dbformat.h" +#include "rocksdb/db.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class SnapshotList; + +// Snapshots are kept in a doubly-linked list in the DB. +// Each SnapshotImpl corresponds to a particular sequence number. 
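
The SnapshotImpl/SnapshotList pair below implements a circular doubly-linked list with a sentinel head, so oldest() is head.next, newest() is head.prev, and insertion and removal are O(1). A minimal standalone sketch of that shape (the names here are illustrative, not the RocksDB ones):

    #include <cstdint>

    struct Node {
      Node* prev;
      Node* next;
      uint64_t number;
    };

    // The sentinel points at itself when the list is empty.
    void Init(Node& head) {
      head.prev = &head;
      head.next = &head;
    }

    // Mirrors the splice in SnapshotList::New: insert just before the
    // sentinel, making the node the newest entry.
    void PushNewest(Node& head, Node* n) {
      n->next = &head;
      n->prev = head.prev;
      n->prev->next = n;
      n->next->prev = n;
    }
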
+class SnapshotImpl : public Snapshot { + public: + SequenceNumber number_; // const after creation + // It indicates the smallest uncommitted data at the time the snapshot was + // taken. This is currently used by WritePrepared transactions to limit the + // scope of queries to IsInSnapshot. + SequenceNumber min_uncommitted_ = kMinUnCommittedSeq; + + SequenceNumber GetSequenceNumber() const override { return number_; } + + int64_t GetUnixTime() const override { return unix_time_; } + + uint64_t GetTimestamp() const override { return timestamp_; } + + private: + friend class SnapshotList; + + // SnapshotImpl is kept in a doubly-linked circular list + SnapshotImpl* prev_; + SnapshotImpl* next_; + + SnapshotList* list_; // just for sanity checks + + int64_t unix_time_; + + uint64_t timestamp_; + + // Will this snapshot be used by a Transaction to do write-conflict checking? + bool is_write_conflict_boundary_; +}; + +class SnapshotList { + public: + SnapshotList() { + list_.prev_ = &list_; + list_.next_ = &list_; + list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging + // Set all the variables to make UBSAN happy. + list_.list_ = nullptr; + list_.unix_time_ = 0; + list_.timestamp_ = 0; + list_.is_write_conflict_boundary_ = false; + count_ = 0; + } + + // No copy-construct. + SnapshotList(const SnapshotList&) = delete; + + bool empty() const { + assert(list_.next_ != &list_ || 0 == count_); + return list_.next_ == &list_; + } + SnapshotImpl* oldest() const { + assert(!empty()); + return list_.next_; + } + SnapshotImpl* newest() const { + assert(!empty()); + return list_.prev_; + } + + SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, uint64_t unix_time, + bool is_write_conflict_boundary, + uint64_t ts = std::numeric_limits::max()) { + s->number_ = seq; + s->unix_time_ = unix_time; + s->timestamp_ = ts; + s->is_write_conflict_boundary_ = is_write_conflict_boundary; + s->list_ = this; + s->next_ = &list_; + s->prev_ = list_.prev_; + s->prev_->next_ = s; + s->next_->prev_ = s; + count_++; + return s; + } + + // Do not responsible to free the object. + void Delete(const SnapshotImpl* s) { + assert(s->list_ == this); + s->prev_->next_ = s->next_; + s->next_->prev_ = s->prev_; + count_--; + } + + // retrieve all snapshot numbers up until max_seq. They are sorted in + // ascending order (with no duplicates). 
+ std::vector GetAll( + SequenceNumber* oldest_write_conflict_snapshot = nullptr, + const SequenceNumber& max_seq = kMaxSequenceNumber) const { + std::vector ret; + GetAll(&ret, oldest_write_conflict_snapshot, max_seq); + return ret; + } + + void GetAll(std::vector* snap_vector, + SequenceNumber* oldest_write_conflict_snapshot = nullptr, + const SequenceNumber& max_seq = kMaxSequenceNumber) const { + std::vector& ret = *snap_vector; + // So far we have no use case that would pass a non-empty vector + assert(ret.size() == 0); + + if (oldest_write_conflict_snapshot != nullptr) { + *oldest_write_conflict_snapshot = kMaxSequenceNumber; + } + + if (empty()) { + return; + } + const SnapshotImpl* s = &list_; + while (s->next_ != &list_) { + if (s->next_->number_ > max_seq) { + break; + } + // Avoid duplicates + if (ret.empty() || ret.back() != s->next_->number_) { + ret.push_back(s->next_->number_); + } + + if (oldest_write_conflict_snapshot != nullptr && + *oldest_write_conflict_snapshot == kMaxSequenceNumber && + s->next_->is_write_conflict_boundary_) { + // If this is the first write-conflict boundary snapshot in the list, + // it is the oldest + *oldest_write_conflict_snapshot = s->next_->number_; + } + + s = s->next_; + } + return; + } + + // get the sequence number of the most recent snapshot + SequenceNumber GetNewest() { + if (empty()) { + return 0; + } + return newest()->number_; + } + + int64_t GetOldestSnapshotTime() const { + if (empty()) { + return 0; + } else { + return oldest()->unix_time_; + } + } + + int64_t GetOldestSnapshotSequence() const { + if (empty()) { + return 0; + } else { + return oldest()->GetSequenceNumber(); + } + } + + uint64_t count() const { return count_; } + + private: + // Dummy head of doubly-linked list of snapshots + SnapshotImpl list_; + uint64_t count_; +}; + +// All operations on TimestampedSnapshotList must be protected by db mutex. +class TimestampedSnapshotList { + public: + explicit TimestampedSnapshotList() = default; + + std::shared_ptr GetSnapshot(uint64_t ts) const { + if (ts == std::numeric_limits::max() && !snapshots_.empty()) { + auto it = snapshots_.rbegin(); + assert(it != snapshots_.rend()); + return it->second; + } + auto it = snapshots_.find(ts); + if (it == snapshots_.end()) { + return std::shared_ptr(); + } + return it->second; + } + + void GetSnapshots( + uint64_t ts_lb, uint64_t ts_ub, + std::vector>& snapshots) const { + assert(ts_lb < ts_ub); + auto it_low = snapshots_.lower_bound(ts_lb); + auto it_high = snapshots_.lower_bound(ts_ub); + for (auto it = it_low; it != it_high; ++it) { + snapshots.emplace_back(it->second); + } + } + + void AddSnapshot(const std::shared_ptr& snapshot) { + assert(snapshot); + snapshots_.try_emplace(snapshot->GetTimestamp(), snapshot); + } + + // snapshots_to_release: the container to where the timestamped snapshots will + // be moved so that it retains the last reference to the snapshots and the + // snapshots won't be actually released which requires db mutex. The + // snapshots will be released by caller of ReleaseSnapshotsOlderThan(). 
+ void ReleaseSnapshotsOlderThan( + uint64_t ts, + autovector>& snapshots_to_release) { + auto ub = snapshots_.lower_bound(ts); + for (auto it = snapshots_.begin(); it != ub; ++it) { + snapshots_to_release.emplace_back(it->second); + } + snapshots_.erase(snapshots_.begin(), ub); + } + + private: + std::map> snapshots_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_cache.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_cache.cc new file mode 100644 index 0000000000..73fa07c6d5 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_cache.cc @@ -0,0 +1,710 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/table_cache.h" + +#include "db/dbformat.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/snapshot_impl.h" +#include "db/version_edit.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/advanced_options.h" +#include "rocksdb/statistics.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/iterator_wrapper.h" +#include "table/multiget_context.h" +#include "table/table_builder.h" +#include "table/table_reader.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/stop_watch.h" + +// Generate the regular and coroutine versions of some methods by +// including table_cache_sync_and_async.h twice +// Macros in the header will expand differently based on whether +// WITH_COROUTINES or WITHOUT_COROUTINES is defined +// clang-format off +#define WITHOUT_COROUTINES +#include "db/table_cache_sync_and_async.h" +#undef WITHOUT_COROUTINES +#define WITH_COROUTINES +#include "db/table_cache_sync_and_async.h" +#undef WITH_COROUTINES +// clang-format on + +namespace ROCKSDB_NAMESPACE { + +namespace { + +static Slice GetSliceForFileNumber(const uint64_t* file_number) { + return Slice(reinterpret_cast(file_number), + sizeof(*file_number)); +} + + +void AppendVarint64(IterKey* key, uint64_t v) { + char buf[10]; + auto ptr = EncodeVarint64(buf, v); + key->TrimAppend(key->Size(), buf, ptr - buf); +} + + +} // anonymous namespace + +const int kLoadConcurency = 128; + +TableCache::TableCache(const ImmutableOptions& ioptions, + const FileOptions* file_options, Cache* const cache, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id) + : ioptions_(ioptions), + file_options_(*file_options), + cache_(cache), + immortal_tables_(false), + block_cache_tracer_(block_cache_tracer), + loader_mutex_(kLoadConcurency, kGetSliceNPHash64UnseededFnPtr), + io_tracer_(io_tracer), + db_session_id_(db_session_id) { + if (ioptions_.row_cache) { + // If the same cache is shared by multiple instances, we need to + // disambiguate its entries. 
+ PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId()); + } +} + +TableCache::~TableCache() {} + +Status TableCache::GetTableReader( + const ReadOptions& ro, const FileOptions& file_options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, bool sequential_mode, bool record_read_stats, + uint8_t block_protection_bytes_per_key, HistogramImpl* file_read_hist, + std::unique_ptr* table_reader, + const std::shared_ptr& prefix_extractor, + bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, + size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { + std::string fname = TableFileName( + ioptions_.cf_paths, file_meta.fd.GetNumber(), file_meta.fd.GetPathId()); + std::unique_ptr file; + FileOptions fopts = file_options; + fopts.temperature = file_temperature; + Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); + TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile", + const_cast(&s)); + if (s.ok()) { + s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr); + } + if (s.ok()) { + RecordTick(ioptions_.stats, NO_FILE_OPENS); + } else if (s.IsPathNotFound()) { + fname = Rocks2LevelTableFileName(fname); + s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); + if (s.ok()) { + s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, + nullptr); + } + if (s.ok()) { + RecordTick(ioptions_.stats, NO_FILE_OPENS); + } + } + + if (s.ok()) { + if (!sequential_mode && ioptions_.advise_random_on_open) { + file->Hint(FSRandomAccessFile::kRandom); + } + StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS); + std::unique_ptr file_reader( + new RandomAccessFileReader( + std::move(file), fname, ioptions_.clock, io_tracer_, + record_read_stats ? 
ioptions_.stats : nullptr, SST_READ_MICROS, + file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners, + file_temperature, level == ioptions_.num_levels - 1)); + UniqueId64x2 expected_unique_id; + if (ioptions_.verify_sst_unique_id_in_manifest) { + expected_unique_id = file_meta.unique_id; + } else { + expected_unique_id = kNullUniqueId64x2; // null ID == no verification + } + s = ioptions_.table_factory->NewTableReader( + ro, + TableReaderOptions( + ioptions_, prefix_extractor, file_options, internal_comparator, + block_protection_bytes_per_key, skip_filters, immortal_tables_, + false /* force_direct_prefetch */, level, block_cache_tracer_, + max_file_size_for_l0_meta_pin, db_session_id_, + file_meta.fd.GetNumber(), expected_unique_id, + file_meta.fd.largest_seqno, file_meta.tail_size), + std::move(file_reader), file_meta.fd.GetFileSize(), table_reader, + prefetch_index_and_filter_in_cache); + TEST_SYNC_POINT("TableCache::GetTableReader:0"); + } + return s; +} + +Status TableCache::FindTable( + const ReadOptions& ro, const FileOptions& file_options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, TypedHandle** handle, + uint8_t block_protection_bytes_per_key, + const std::shared_ptr& prefix_extractor, + const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist, + bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, + size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { + PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock); + uint64_t number = file_meta.fd.GetNumber(); + Slice key = GetSliceForFileNumber(&number); + *handle = cache_.Lookup(key); + TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0", + const_cast(&no_io)); + + if (*handle == nullptr) { + if (no_io) { + return Status::Incomplete("Table not found in table_cache, no_io is set"); + } + MutexLock load_lock(loader_mutex_.get(key)); + // We check the cache again under loading mutex + *handle = cache_.Lookup(key); + if (*handle != nullptr) { + return Status::OK(); + } + + std::unique_ptr table_reader; + Status s = GetTableReader(ro, file_options, internal_comparator, file_meta, + false /* sequential mode */, record_read_stats, + block_protection_bytes_per_key, file_read_hist, + &table_reader, prefix_extractor, skip_filters, + level, prefetch_index_and_filter_in_cache, + max_file_size_for_l0_meta_pin, file_temperature); + if (!s.ok()) { + assert(table_reader == nullptr); + RecordTick(ioptions_.stats, NO_FILE_ERRORS); + // We do not cache error results so that if the error is transient, + // or somebody repairs the file, we recover automatically. + } else { + s = cache_.Insert(key, table_reader.get(), 1, handle); + if (s.ok()) { + // Release ownership of table reader. 
+ table_reader.release(); + } + } + return s; + } + return Status::OK(); +} + +InternalIterator* TableCache::NewIterator( + const ReadOptions& options, const FileOptions& file_options, + const InternalKeyComparator& icomparator, const FileMetaData& file_meta, + RangeDelAggregator* range_del_agg, + const std::shared_ptr& prefix_extractor, + TableReader** table_reader_ptr, HistogramImpl* file_read_hist, + TableReaderCaller caller, Arena* arena, bool skip_filters, int level, + size_t max_file_size_for_l0_meta_pin, + const InternalKey* smallest_compaction_key, + const InternalKey* largest_compaction_key, bool allow_unprepared_value, + uint8_t block_protection_bytes_per_key, + TruncatedRangeDelIterator** range_del_iter) { + PERF_TIMER_GUARD(new_table_iterator_nanos); + + Status s; + TableReader* table_reader = nullptr; + TypedHandle* handle = nullptr; + if (table_reader_ptr != nullptr) { + *table_reader_ptr = nullptr; + } + bool for_compaction = caller == TableReaderCaller::kCompaction; + auto& fd = file_meta.fd; + table_reader = fd.table_reader; + if (table_reader == nullptr) { + s = FindTable(options, file_options, icomparator, file_meta, &handle, + block_protection_bytes_per_key, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + !for_compaction /* record_read_stats */, file_read_hist, + skip_filters, level, + true /* prefetch_index_and_filter_in_cache */, + max_file_size_for_l0_meta_pin, file_meta.temperature); + if (s.ok()) { + table_reader = cache_.Value(handle); + } + } + InternalIterator* result = nullptr; + if (s.ok()) { + if (options.table_filter && + !options.table_filter(*table_reader->GetTableProperties())) { + result = NewEmptyInternalIterator(arena); + } else { + result = table_reader->NewIterator( + options, prefix_extractor.get(), arena, skip_filters, caller, + file_options.compaction_readahead_size, allow_unprepared_value); + } + if (handle != nullptr) { + cache_.RegisterReleaseAsCleanup(handle, *result); + handle = nullptr; // prevent from releasing below + } + + if (for_compaction) { + table_reader->SetupForCompaction(); + } + if (table_reader_ptr != nullptr) { + *table_reader_ptr = table_reader; + } + } + if (s.ok() && !options.ignore_range_deletions) { + if (range_del_iter != nullptr) { + auto new_range_del_iter = + table_reader->NewRangeTombstoneIterator(options); + if (new_range_del_iter == nullptr || new_range_del_iter->empty()) { + delete new_range_del_iter; + *range_del_iter = nullptr; + } else { + *range_del_iter = new TruncatedRangeDelIterator( + std::unique_ptr( + new_range_del_iter), + &icomparator, &file_meta.smallest, &file_meta.largest); + } + } + if (range_del_agg != nullptr) { + if (range_del_agg->AddFile(fd.GetNumber())) { + std::unique_ptr new_range_del_iter( + static_cast( + table_reader->NewRangeTombstoneIterator(options))); + if (new_range_del_iter != nullptr) { + s = new_range_del_iter->status(); + } + if (s.ok()) { + const InternalKey* smallest = &file_meta.smallest; + const InternalKey* largest = &file_meta.largest; + if (smallest_compaction_key != nullptr) { + smallest = smallest_compaction_key; + } + if (largest_compaction_key != nullptr) { + largest = largest_compaction_key; + } + range_del_agg->AddTombstones(std::move(new_range_del_iter), smallest, + largest); + } + } + } + } + + if (handle != nullptr) { + cache_.Release(handle); + } + if (!s.ok()) { + assert(result == nullptr); + result = NewErrorInternalIterator(s, arena); + } + return result; +} + +Status TableCache::GetRangeTombstoneIterator( + const ReadOptions& options, 
+ const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, + std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) { + assert(out_iter); + const FileDescriptor& fd = file_meta.fd; + Status s; + TableReader* t = fd.table_reader; + TypedHandle* handle = nullptr; + if (t == nullptr) { + s = FindTable(options, file_options_, internal_comparator, file_meta, + &handle, block_protection_bytes_per_key); + if (s.ok()) { + t = cache_.Value(handle); + } + } + if (s.ok()) { + // Note: NewRangeTombstoneIterator could return nullptr + out_iter->reset(t->NewRangeTombstoneIterator(options)); + } + if (handle) { + if (*out_iter) { + cache_.RegisterReleaseAsCleanup(handle, **out_iter); + } else { + cache_.Release(handle); + } + } + return s; +} + +void TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options, + const FileDescriptor& fd, + const Slice& internal_key, + GetContext* get_context, + IterKey& row_cache_key) { + uint64_t fd_number = fd.GetNumber(); + // We use the user key as the cache key instead of the internal key; + // otherwise the whole cache would be invalidated every time the + // sequence number increases. However, to support caching snapshot + // reads, we append the sequence number (incremented by 1 to + // distinguish from 0) only in this case. + // If the snapshot is larger than the largest seqno in the file, + // all data should be exposed to the snapshot, so we treat it + // the same as if there were no snapshot. The exception is that if + // a seq-checking callback is registered, some internal keys + // may still be filtered out. + uint64_t seq_no = 0; + // Maybe we can include the whole file if snapshot == fd.largest_seqno. + if (options.snapshot != nullptr && + (get_context->has_callback() || + static_cast_with_check<const SnapshotImpl>(options.snapshot) + ->GetSequenceNumber() <= fd.largest_seqno)) { + // We should consider using options.snapshot->GetSequenceNumber() + // instead of GetInternalKeySeqno(k), which would make the code + // easier to understand. + seq_no = 1 + GetInternalKeySeqno(internal_key); + } + + // Compute the row cache key. + row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(), + row_cache_id_.size()); + AppendVarint64(&row_cache_key, fd_number); + AppendVarint64(&row_cache_key, seq_no); +} + +bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, + size_t prefix_size, GetContext* get_context) { + bool found = false; + + row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size()); + RowCacheInterface row_cache{ioptions_.row_cache.get()}; + if (auto row_handle = row_cache.Lookup(row_cache_key.GetUserKey())) { + // Cleanable routine to release the cache entry + Cleanable value_pinner; + // If we get here, the value is located in the cache. + // found_row_cache_entry points to the value in the cache, + // and value_pinner has the cleanup procedure for the cached entry. + // After replayGetContextLog() returns, get_context.pinnable_slice_ + // will point to the cache entry buffer (or a copy based on it) and + // the cleanup routine under value_pinner will be delegated to + // get_context.pinnable_slice_. The cache entry is released when + // get_context.pinnable_slice_ is reset.
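
As an aside, the row-cache key whose prefix is built above is just a concatenation of three fields plus the user key; an illustrative sketch (the real code uses IterKey and varint encoding, the decimal formatting below is only for readability):

    #include <cstdint>
    #include <string>

    // Layout: [row_cache_id][fd_number][seq_no] form the prefix; the
    // user key is appended per lookup.
    std::string MakeRowCacheKey(const std::string& row_cache_id,
                                uint64_t fd_number, uint64_t seq_no,
                                const std::string& user_key) {
      std::string key = row_cache_id;      // disambiguates shared caches
      key += std::to_string(fd_number);    // SST file number
      key += std::to_string(seq_no);       // 0, or snapshot seqno + 1
      key += user_key;
      return key;
    }
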
+ row_cache.RegisterReleaseAsCleanup(row_handle, value_pinner); + replayGetContextLog(*row_cache.Value(row_handle), user_key, get_context, + &value_pinner); + RecordTick(ioptions_.stats, ROW_CACHE_HIT); + found = true; + } else { + RecordTick(ioptions_.stats, ROW_CACHE_MISS); + } + return found; +} + +Status TableCache::Get( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + uint8_t block_protection_bytes_per_key, + const std::shared_ptr& prefix_extractor, + HistogramImpl* file_read_hist, bool skip_filters, int level, + size_t max_file_size_for_l0_meta_pin) { + auto& fd = file_meta.fd; + std::string* row_cache_entry = nullptr; + bool done = false; + IterKey row_cache_key; + std::string row_cache_entry_buffer; + + // Check row cache if enabled. Since row cache does not currently store + // sequence numbers, we cannot use it if we need to fetch the sequence. + if (ioptions_.row_cache && !get_context->NeedToReadSequence()) { + auto user_key = ExtractUserKey(k); + CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key); + done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(), + get_context); + if (!done) { + row_cache_entry = &row_cache_entry_buffer; + } + } + Status s; + TableReader* t = fd.table_reader; + TypedHandle* handle = nullptr; + if (!done) { + assert(s.ok()); + if (t == nullptr) { + s = FindTable(options, file_options_, internal_comparator, file_meta, + &handle, block_protection_bytes_per_key, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + true /* record_read_stats */, file_read_hist, skip_filters, + level, true /* prefetch_index_and_filter_in_cache */, + max_file_size_for_l0_meta_pin, file_meta.temperature); + if (s.ok()) { + t = cache_.Value(handle); + } + } + SequenceNumber* max_covering_tombstone_seq = + get_context->max_covering_tombstone_seq(); + if (s.ok() && max_covering_tombstone_seq != nullptr && + !options.ignore_range_deletions) { + std::unique_ptr range_del_iter( + t->NewRangeTombstoneIterator(options)); + if (range_del_iter != nullptr) { + SequenceNumber seq = + range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k)); + if (seq > *max_covering_tombstone_seq) { + *max_covering_tombstone_seq = seq; + if (get_context->NeedTimestamp()) { + get_context->SetTimestampFromRangeTombstone( + range_del_iter->timestamp()); + } + } + } + } + if (s.ok()) { + get_context->SetReplayLog(row_cache_entry); // nullptr if no cache. + s = t->Get(options, k, get_context, prefix_extractor.get(), skip_filters); + get_context->SetReplayLog(nullptr); + } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { + // Couldn't find Table in cache but treat as kFound if no_io set + get_context->MarkKeyMayExist(); + s = Status::OK(); + done = true; + } + } + + // Put the replay log in row cache only if something was found. + if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) { + RowCacheInterface row_cache{ioptions_.row_cache.get()}; + size_t charge = row_cache_entry->capacity() + sizeof(std::string); + auto row_ptr = new std::string(std::move(*row_cache_entry)); + // If row cache is full, it's OK to continue. 
+ row_cache.Insert(row_cache_key.GetUserKey(), row_ptr, charge) + .PermitUncheckedError(); + } + + if (handle != nullptr) { + cache_.Release(handle); + } + return s; +} + +void TableCache::UpdateRangeTombstoneSeqnums( + const ReadOptions& options, TableReader* t, + MultiGetContext::Range& table_range) { + std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter( + t->NewRangeTombstoneIterator(options)); + if (range_del_iter != nullptr) { + for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) { + SequenceNumber* max_covering_tombstone_seq = + iter->get_context->max_covering_tombstone_seq(); + SequenceNumber seq = + range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts); + if (seq > *max_covering_tombstone_seq) { + *max_covering_tombstone_seq = seq; + if (iter->get_context->NeedTimestamp()) { + iter->get_context->SetTimestampFromRangeTombstone( + range_del_iter->timestamp()); + } + } + } + } +} + +Status TableCache::MultiGetFilter( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + const std::shared_ptr<const SliceTransform>& prefix_extractor, + HistogramImpl* file_read_hist, int level, + MultiGetContext::Range* mget_range, TypedHandle** table_handle, + uint8_t block_protection_bytes_per_key) { + auto& fd = file_meta.fd; + IterKey row_cache_key; + std::string row_cache_entry_buffer; + + // Check if we need to use the row cache. If yes, then we cannot do the + // filtering here, since the filtering needs to happen after the row cache + // lookup. + KeyContext& first_key = *mget_range->begin(); + if (ioptions_.row_cache && !first_key.get_context->NeedToReadSequence()) { + return Status::NotSupported(); + } + Status s; + TableReader* t = fd.table_reader; + TypedHandle* handle = nullptr; + MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(), + mget_range->end()); + if (t == nullptr) { + s = FindTable(options, file_options_, internal_comparator, file_meta, + &handle, block_protection_bytes_per_key, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + true /* record_read_stats */, file_read_hist, + /*skip_filters=*/false, level, + true /* prefetch_index_and_filter_in_cache */, + /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature); + if (s.ok()) { + t = cache_.Value(handle); + } + *table_handle = handle; + } + if (s.ok()) { + s = t->MultiGetFilter(options, prefix_extractor.get(), mget_range); + } + if (s.ok() && !options.ignore_range_deletions) { + // Update the range tombstone sequence numbers for the keys here + // as TableCache::MultiGet may or may not be called, and even if it + // is, it may be called with fewer keys in the range due to filtering. + UpdateRangeTombstoneSeqnums(options, t, tombstone_range); + } + if (mget_range->empty() && handle) { + cache_.Release(handle); + *table_handle = nullptr; + } + + return s; +} + +Status TableCache::GetTableProperties( + const FileOptions& file_options, const ReadOptions& read_options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::shared_ptr<const TableProperties>* properties, + uint8_t block_protection_bytes_per_key, + const std::shared_ptr<const SliceTransform>& prefix_extractor, bool no_io) { + auto table_reader = file_meta.fd.table_reader; + // Has the table already been pre-loaded?
+ if (table_reader) { + *properties = table_reader->GetTableProperties(); + + return Status::OK(); + } + + TypedHandle* table_handle = nullptr; + Status s = FindTable(read_options, file_options, internal_comparator, + file_meta, &table_handle, block_protection_bytes_per_key, + prefix_extractor, no_io); + if (!s.ok()) { + return s; + } + assert(table_handle); + auto table = cache_.Value(table_handle); + *properties = table->GetTableProperties(); + cache_.Release(table_handle); + return s; +} + +Status TableCache::ApproximateKeyAnchors( + const ReadOptions& ro, const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, + std::vector& anchors) { + Status s; + TableReader* t = file_meta.fd.table_reader; + TypedHandle* handle = nullptr; + if (t == nullptr) { + s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle, + block_protection_bytes_per_key); + if (s.ok()) { + t = cache_.Value(handle); + } + } + if (s.ok() && t != nullptr) { + s = t->ApproximateKeyAnchors(ro, anchors); + } + if (handle != nullptr) { + cache_.Release(handle); + } + return s; +} + +size_t TableCache::GetMemoryUsageByTableReader( + const FileOptions& file_options, const ReadOptions& read_options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, + const std::shared_ptr& prefix_extractor) { + auto table_reader = file_meta.fd.table_reader; + // table already been pre-loaded? + if (table_reader) { + return table_reader->ApproximateMemoryUsage(); + } + + TypedHandle* table_handle = nullptr; + Status s = FindTable(read_options, file_options, internal_comparator, + file_meta, &table_handle, block_protection_bytes_per_key, + prefix_extractor, true /* no_io */); + if (!s.ok()) { + return 0; + } + assert(table_handle); + auto table = cache_.Value(table_handle); + auto ret = table->ApproximateMemoryUsage(); + cache_.Release(table_handle); + return ret; +} + +void TableCache::Evict(Cache* cache, uint64_t file_number) { + cache->Erase(GetSliceForFileNumber(&file_number)); +} + +uint64_t TableCache::ApproximateOffsetOf( + const ReadOptions& read_options, const Slice& key, + const FileMetaData& file_meta, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, + const std::shared_ptr& prefix_extractor) { + uint64_t result = 0; + TableReader* table_reader = file_meta.fd.table_reader; + TypedHandle* table_handle = nullptr; + if (table_reader == nullptr) { + const bool for_compaction = (caller == TableReaderCaller::kCompaction); + Status s = FindTable( + read_options, file_options_, internal_comparator, file_meta, + &table_handle, block_protection_bytes_per_key, prefix_extractor, + false /* no_io */, !for_compaction /* record_read_stats */); + if (s.ok()) { + table_reader = cache_.Value(table_handle); + } + } + + if (table_reader != nullptr) { + result = table_reader->ApproximateOffsetOf(read_options, key, caller); + } + if (table_handle != nullptr) { + cache_.Release(table_handle); + } + + return result; +} + +uint64_t TableCache::ApproximateSize( + const ReadOptions& read_options, const Slice& start, const Slice& end, + const FileMetaData& file_meta, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, + const std::shared_ptr& prefix_extractor) { + uint64_t result = 0; + TableReader* table_reader = file_meta.fd.table_reader; + TypedHandle* table_handle = 
nullptr; + if (table_reader == nullptr) { + const bool for_compaction = (caller == TableReaderCaller::kCompaction); + Status s = FindTable( + read_options, file_options_, internal_comparator, file_meta, + &table_handle, block_protection_bytes_per_key, prefix_extractor, + false /* no_io */, !for_compaction /* record_read_stats */); + if (s.ok()) { + table_reader = cache_.Value(table_handle); + } + } + + if (table_reader != nullptr) { + result = table_reader->ApproximateSize(read_options, start, end, caller); + } + if (table_handle != nullptr) { + cache_.Release(table_handle); + } + + return result; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_cache.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_cache.h new file mode 100644 index 0000000000..41201eea8a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_cache.h @@ -0,0 +1,283 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread-safe (provides internal synchronization) + +#pragma once +#include +#include +#include + +#include "cache/typed_cache.h" +#include "db/dbformat.h" +#include "db/range_del_aggregator.h" +#include "options/cf_options.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/table_reader.h" +#include "trace_replay/block_cache_tracer.h" +#include "util/coro_utils.h" + +namespace ROCKSDB_NAMESPACE { + +class Env; +class Arena; +struct FileDescriptor; +class GetContext; +class HistogramImpl; + +// Manages caching for TableReader objects for a column family. The actual +// cache is allocated separately and passed to the constructor. TableCache +// wraps around the underlying SST file readers by providing Get(), +// MultiGet() and NewIterator() methods that hide the instantiation, +// caching and access to the TableReader. The main purpose of this is +// performance - by caching the TableReader, it avoids unnecessary file opens +// and object allocation and instantiation. One exception is compaction, where +// a new TableReader may be instantiated - see NewIterator() comments +// +// Another service provided by TableCache is managing the row cache - if the +// DB is configured with a row cache, and the lookup key is present in the row +// cache, lookup is very fast. The row cache is obtained from +// ioptions.row_cache +class TableCache { + public: + TableCache(const ImmutableOptions& ioptions, + const FileOptions* storage_options, Cache* cache, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id); + ~TableCache(); + + // Cache interface for table cache + using CacheInterface = + BasicTypedCacheInterface; + using TypedHandle = CacheInterface::TypedHandle; + + // Cache interface for row cache + using RowCacheInterface = + BasicTypedCacheInterface; + using RowHandle = RowCacheInterface::TypedHandle; + + // Return an iterator for the specified file number (the corresponding + // file length must be exactly "file_size" bytes). 
If "table_reader_ptr" + // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object + // underlying the returned iterator, or nullptr if no Table object underlies + // the returned iterator. The returned "*table_reader_ptr" object is owned + // by the cache and should not be deleted, and is valid for as long as the + // returned iterator is live. + // If !options.ignore_range_deletions, and range_del_iter is non-nullptr, + // then range_del_iter is set to a TruncatedRangeDelIterator for range + // tombstones in the SST file corresponding to the specified file number. The + // upper/lower bounds for the TruncatedRangeDelIterator are set to the SST + // file's boundary. + // @param options Must outlive the returned iterator. + // @param range_del_agg If non-nullptr, adds range deletions to the + // aggregator. If an error occurs, returns it in a NewErrorInternalIterator + // @param for_compaction If true, a new TableReader may be allocated (but + // not cached), depending on the CF options + // @param skip_filters Disables loading/accessing the filter block + // @param level The level this table is at, -1 for "not set / don't know" + InternalIterator* NewIterator( + const ReadOptions& options, const FileOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, + const std::shared_ptr& prefix_extractor, + TableReader** table_reader_ptr, HistogramImpl* file_read_hist, + TableReaderCaller caller, Arena* arena, bool skip_filters, int level, + size_t max_file_size_for_l0_meta_pin, + const InternalKey* smallest_compaction_key, + const InternalKey* largest_compaction_key, bool allow_unprepared_value, + uint8_t protection_bytes_per_key, + TruncatedRangeDelIterator** range_del_iter = nullptr); + + // If a seek to internal key "k" in specified file finds an entry, + // call get_context->SaveValue() repeatedly until + // it returns false. As a side effect, it will insert the TableReader + // into the cache and potentially evict another entry + // @param get_context Context for get operation. The result of the lookup + // can be retrieved by calling get_context->State() + // @param file_read_hist If non-nullptr, the file reader statistics are + // recorded + // @param skip_filters Disables loading/accessing the filter block + // @param level The level this table is at, -1 for "not set / don't know" + Status Get( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + uint8_t block_protection_bytes_per_key, + const std::shared_ptr& prefix_extractor = nullptr, + HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, + int level = -1, size_t max_file_size_for_l0_meta_pin = 0); + + // Return the range delete tombstone iterator of the file specified by + // `file_meta`. + Status GetRangeTombstoneIterator( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, + std::unique_ptr* out_iter); + + // Call table reader's MultiGetFilter to use the bloom filter to filter out + // keys. Returns Status::NotSupported() if row cache needs to be checked. + // If the table cache is looked up to get the table reader, the cache handle + // is returned in table_handle. This handle should be passed back to + // MultiGet() so it can be released. 
+ Status MultiGetFilter( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + const std::shared_ptr& prefix_extractor, + HistogramImpl* file_read_hist, int level, + MultiGetContext::Range* mget_range, TypedHandle** table_handle, + uint8_t block_protection_bytes_per_key); + + // If a seek to internal key "k" in specified file finds an entry, + // call get_context->SaveValue() repeatedly until + // it returns false. As a side effect, it will insert the TableReader + // into the cache and potentially evict another entry + // @param mget_range Pointer to the structure describing a batch of keys to + // be looked up in this table file. The result is stored + // in the embedded GetContext + // @param skip_filters Disables loading/accessing the filter block + // @param level The level this table is at, -1 for "not set / don't know" + DECLARE_SYNC_AND_ASYNC( + Status, MultiGet, const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, + uint8_t block_protection_bytes_per_key, + const std::shared_ptr& prefix_extractor = nullptr, + HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, + bool skip_range_deletions = false, int level = -1, + TypedHandle* table_handle = nullptr); + + // Evict any entry for the specified file number + static void Evict(Cache* cache, uint64_t file_number); + + // Find table reader + // @param skip_filters Disables loading/accessing the filter block + // @param level == -1 means not specified + Status FindTable( + const ReadOptions& ro, const FileOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, TypedHandle**, + uint8_t block_protection_bytes_per_key, + const std::shared_ptr& prefix_extractor = nullptr, + const bool no_io = false, bool record_read_stats = true, + HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, + int level = -1, bool prefetch_index_and_filter_in_cache = true, + size_t max_file_size_for_l0_meta_pin = 0, + Temperature file_temperature = Temperature::kUnknown); + + // Get the table properties of a given table. + // @no_io: indicates if we should load table to the cache if it is not present + // in table cache yet. + // @returns: `properties` will be reset on success. Please note that we will + // return Status::Incomplete() if table is not present in cache and + // we set `no_io` to be true. + Status GetTableProperties( + const FileOptions& toptions, const ReadOptions& read_options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::shared_ptr* properties, + uint8_t block_protection_bytes_per_key, + const std::shared_ptr& prefix_extractor = nullptr, + bool no_io = false); + + Status ApproximateKeyAnchors(const ReadOptions& ro, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + uint8_t block_protection_bytes_per_key, + std::vector& anchors); + + // Return total memory usage of the table reader of the file. + // 0 if table reader of the file is not loaded. + size_t GetMemoryUsageByTableReader( + const FileOptions& toptions, const ReadOptions& read_options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, + const std::shared_ptr& prefix_extractor = nullptr); + + // Returns approximated offset of a key in a file represented by fd. 
+ uint64_t ApproximateOffsetOf( + const ReadOptions& read_options, const Slice& key, + const FileMetaData& file_meta, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, + const std::shared_ptr& prefix_extractor = nullptr); + + // Returns approximated data size between start and end keys in a file + // represented by fd (the start key must not be greater than the end key). + uint64_t ApproximateSize( + const ReadOptions& read_options, const Slice& start, const Slice& end, + const FileMetaData& file_meta, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, + const std::shared_ptr& prefix_extractor = nullptr); + + CacheInterface& get_cache() { return cache_; } + + // Capacity of the backing Cache that indicates infinite TableCache capacity. + // For example when max_open_files is -1 we set the backing Cache to this. + static const int kInfiniteCapacity = 0x400000; + + // The tables opened with this TableCache will be immortal, i.e., their + // lifetime is as long as that of the DB. + void SetTablesAreImmortal() { + if (cache_.get()->GetCapacity() >= kInfiniteCapacity) { + immortal_tables_ = true; + } + } + + private: + // Build a table reader + Status GetTableReader( + const ReadOptions& ro, const FileOptions& file_options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, bool sequential_mode, + bool record_read_stats, uint8_t block_protection_bytes_per_key, + HistogramImpl* file_read_hist, std::unique_ptr* table_reader, + const std::shared_ptr& prefix_extractor = nullptr, + bool skip_filters = false, int level = -1, + bool prefetch_index_and_filter_in_cache = true, + size_t max_file_size_for_l0_meta_pin = 0, + Temperature file_temperature = Temperature::kUnknown); + + // Update the max_covering_tombstone_seq in the GetContext for each key based + // on the range deletions in the table + void UpdateRangeTombstoneSeqnums(const ReadOptions& options, TableReader* t, + MultiGetContext::Range& table_range); + + // Create a key prefix for looking up the row cache. The prefix is of the + // format row_cache_id + fd_number + seq_no. Later, the user key can be + // appended to form the full key + void CreateRowCacheKeyPrefix(const ReadOptions& options, + const FileDescriptor& fd, + const Slice& internal_key, + GetContext* get_context, IterKey& row_cache_key); + + // Helper function to lookup the row cache for a key. It appends the + // user key to row_cache_key at offset prefix_size + bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, + size_t prefix_size, GetContext* get_context); + + const ImmutableOptions& ioptions_; + const FileOptions& file_options_; + CacheInterface cache_; + std::string row_cache_id_; + bool immortal_tables_; + BlockCacheTracer* const block_cache_tracer_; + Striped loader_mutex_; + std::shared_ptr io_tracer_; + std::string db_session_id_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_cache_sync_and_async.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_cache_sync_and_async.h new file mode 100644 index 0000000000..df8e9337f6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_cache_sync_and_async.h @@ -0,0 +1,130 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. 
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "util/coro_utils.h"
+
+#if defined(WITHOUT_COROUTINES) || \
+    (defined(USE_COROUTINES) && defined(WITH_COROUTINES))
+namespace ROCKSDB_NAMESPACE {
+
+#if defined(WITHOUT_COROUTINES)
+#endif
+
+// Batched version of TableCache::MultiGet.
+DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
+(const ReadOptions& options, const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
+ uint8_t block_protection_bytes_per_key,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ HistogramImpl* file_read_hist, bool skip_filters, bool skip_range_deletions,
+ int level, TypedHandle* handle) {
+  auto& fd = file_meta.fd;
+  Status s;
+  TableReader* t = fd.table_reader;
+  MultiGetRange table_range(*mget_range, mget_range->begin(),
+                            mget_range->end());
+  if (handle != nullptr && t == nullptr) {
+    t = cache_.Value(handle);
+  }
+  autovector<std::string, MultiGetContext::MAX_BATCH_SIZE> row_cache_entries;
+  IterKey row_cache_key;
+  size_t row_cache_key_prefix_size = 0;
+  KeyContext& first_key = *table_range.begin();
+  bool lookup_row_cache =
+      ioptions_.row_cache && !first_key.get_context->NeedToReadSequence();
+
+  // Check row cache if enabled. Since row cache does not currently store
+  // sequence numbers, we cannot use it if we need to fetch the sequence.
+  if (lookup_row_cache) {
+    GetContext* first_context = first_key.get_context;
+    CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context,
+                            row_cache_key);
+    row_cache_key_prefix_size = row_cache_key.Size();
+
+    for (auto miter = table_range.begin(); miter != table_range.end();
+         ++miter) {
+      const Slice& user_key = miter->ukey_with_ts;
+
+      GetContext* get_context = miter->get_context;
+
+      if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size,
+                          get_context)) {
+        table_range.SkipKey(miter);
+      } else {
+        row_cache_entries.emplace_back();
+        get_context->SetReplayLog(&(row_cache_entries.back()));
+      }
+    }
+  }
+
+  // Check that table_range is not empty. It's possible all keys may have been
+  // found in the row cache and thus the range may now be empty
+  if (s.ok() && !table_range.empty()) {
+    if (t == nullptr) {
+      assert(handle == nullptr);
+      s = FindTable(options, file_options_, internal_comparator, file_meta,
+                    &handle, block_protection_bytes_per_key, prefix_extractor,
+                    options.read_tier == kBlockCacheTier /* no_io */,
+                    true /* record_read_stats */, file_read_hist, skip_filters,
+                    level, true /* prefetch_index_and_filter_in_cache */,
+                    0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature);
+      TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s);
+      if (s.ok()) {
+        t = cache_.Value(handle);
+        assert(t);
+      }
+    }
+    if (s.ok() && !options.ignore_range_deletions && !skip_range_deletions) {
+      UpdateRangeTombstoneSeqnums(options, t, table_range);
+    }
+    if (s.ok()) {
+      CO_AWAIT(t->MultiGet)
+      (options, &table_range, prefix_extractor.get(), skip_filters);
+    } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+      for (auto iter = table_range.begin(); iter != table_range.end();
+           ++iter) {
+        Status* status = iter->s;
+        if (status->IsIncomplete()) {
+          // Couldn't find Table in cache but treat as kFound if no_io set
+          iter->get_context->MarkKeyMayExist();
+          s = Status::OK();
+        }
+      }
+    }
+  }
+
+  if (lookup_row_cache) {
+    size_t row_idx = 0;
+    RowCacheInterface row_cache{ioptions_.row_cache.get()};
+
+    for (auto miter = table_range.begin(); miter != table_range.end();
+         ++miter) {
+      std::string& row_cache_entry = row_cache_entries[row_idx++];
+      const Slice& user_key = miter->ukey_with_ts;
+      GetContext* get_context = miter->get_context;
+
+      get_context->SetReplayLog(nullptr);
+      // Compute row cache key.
+      row_cache_key.TrimAppend(row_cache_key_prefix_size, user_key.data(),
+                               user_key.size());
+      // Put the replay log in row cache only if something was found.
+      if (s.ok() && !row_cache_entry.empty()) {
+        size_t charge = row_cache_entry.capacity() + sizeof(std::string);
+        auto row_ptr = new std::string(std::move(row_cache_entry));
+        // If row cache is full, it's OK.
+        row_cache.Insert(row_cache_key.GetUserKey(), row_ptr, charge)
+            .PermitUncheckedError();
+      }
+    }
+  }
+
+  if (handle != nullptr) {
+    cache_.Release(handle);
+  }
+  CO_RETURN s;
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_properties_collector.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_properties_collector.cc
new file mode 100644
index 0000000000..edb9a1b63a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_properties_collector.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/table_properties_collector.h"
+
+#include "db/dbformat.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+uint64_t GetUint64Property(const UserCollectedProperties& props,
+                           const std::string& property_name,
+                           bool* property_present) {
+  auto pos = props.find(property_name);
+  if (pos == props.end()) {
+    *property_present = false;
+    return 0;
+  }
+  Slice raw = pos->second;
+  uint64_t val = 0;
+  *property_present = true;
+  return GetVarint64(&raw, &val) ?
val : 0; +} + +} // anonymous namespace + +Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key, + const Slice& value, + uint64_t file_size) { + ParsedInternalKey ikey; + Status s = ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO + if (!s.ok()) { + return s; + } + + return collector_->AddUserKey(ikey.user_key, value, GetEntryType(ikey.type), + ikey.sequence, file_size); +} + +void UserKeyTablePropertiesCollector::BlockAdd( + uint64_t block_uncomp_bytes, uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) { + return collector_->BlockAdd(block_uncomp_bytes, block_compressed_bytes_fast, + block_compressed_bytes_slow); +} + +Status UserKeyTablePropertiesCollector::Finish( + UserCollectedProperties* properties) { + return collector_->Finish(properties); +} + +UserCollectedProperties UserKeyTablePropertiesCollector::GetReadableProperties() + const { + return collector_->GetReadableProperties(); +} + +uint64_t GetDeletedKeys(const UserCollectedProperties& props) { + bool property_present_ignored; + return GetUint64Property(props, TablePropertiesNames::kDeletedKeys, + &property_present_ignored); +} + +uint64_t GetMergeOperands(const UserCollectedProperties& props, + bool* property_present) { + return GetUint64Property(props, TablePropertiesNames::kMergeOperands, + property_present); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_properties_collector.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_properties_collector.h new file mode 100644 index 0000000000..968115c3d7 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/table_properties_collector.h @@ -0,0 +1,177 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file defines a collection of statistics collectors. +#pragma once + +#include +#include +#include + +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { + +// Base class for internal table properties collector. +class IntTblPropCollector { + public: + virtual ~IntTblPropCollector() {} + virtual Status Finish(UserCollectedProperties* properties) = 0; + + virtual const char* Name() const = 0; + + // @params key the user key that is inserted into the table. + // @params value the value that is inserted into the table. + virtual Status InternalAdd(const Slice& key, const Slice& value, + uint64_t file_size) = 0; + + virtual void BlockAdd(uint64_t block_uncomp_bytes, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) = 0; + + virtual UserCollectedProperties GetReadableProperties() const = 0; + + virtual bool NeedCompact() const { return false; } +}; + +// Factory for internal table properties collector. +class IntTblPropCollectorFactory { + public: + virtual ~IntTblPropCollectorFactory() {} + // has to be thread-safe + virtual IntTblPropCollector* CreateIntTblPropCollector( + uint32_t column_family_id, int level_at_creation) = 0; + + // The name of the properties collector can be used for debugging purpose. 
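These adapters wrap user-defined collectors from the public API, and GetUint64Property() above shows that numeric properties are stored varint-encoded. A hedged sketch of a user-side collector (class and property names are illustrative; PutVarint64 comes from the internal util/coding.h header):

#include <cstdint>
#include <string>

#include "rocksdb/table_properties.h"
#include "util/coding.h"  // internal header, provides PutVarint64

// Counts tombstones and publishes the count as a varint-encoded property,
// matching the encoding that GetUint64Property() above expects.
class DeleteCountingCollector : public rocksdb::TablePropertiesCollector {
 public:
  rocksdb::Status AddUserKey(const rocksdb::Slice& /*key*/,
                             const rocksdb::Slice& /*value*/,
                             rocksdb::EntryType type,
                             rocksdb::SequenceNumber /*seq*/,
                             uint64_t /*file_size*/) override {
    if (type == rocksdb::kEntryDelete) {
      ++deleted_;
    }
    return rocksdb::Status::OK();
  }
  rocksdb::Status Finish(rocksdb::UserCollectedProperties* props) override {
    std::string encoded;
    rocksdb::PutVarint64(&encoded, deleted_);
    props->insert({"example.deleted-keys", encoded});
    return rocksdb::Status::OK();
  }
  rocksdb::UserCollectedProperties GetReadableProperties() const override {
    return {{"example.deleted-keys", std::to_string(deleted_)}};
  }
  const char* Name() const override { return "DeleteCountingCollector"; }

 private:
  uint64_t deleted_ = 0;
};

Registering such a collector goes through a TablePropertiesCollectorFactory pushed onto ColumnFamilyOptions::table_properties_collector_factories, which is what UserKeyTablePropertiesCollectorFactory below adapts to the internal interface.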
+ virtual const char* Name() const = 0; +}; + +using IntTblPropCollectorFactories = + std::vector>; + +// When rocksdb creates a new table, it will encode all "user keys" into +// "internal keys", which contains meta information of a given entry. +// +// This class extracts user key from the encoded internal key when Add() is +// invoked. +class UserKeyTablePropertiesCollector : public IntTblPropCollector { + public: + // transfer of ownership + explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector) + : collector_(collector) {} + + virtual ~UserKeyTablePropertiesCollector() {} + + virtual Status InternalAdd(const Slice& key, const Slice& value, + uint64_t file_size) override; + + virtual void BlockAdd(uint64_t block_uncomp_bytes, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) override; + + virtual Status Finish(UserCollectedProperties* properties) override; + + virtual const char* Name() const override { return collector_->Name(); } + + UserCollectedProperties GetReadableProperties() const override; + + virtual bool NeedCompact() const override { + return collector_->NeedCompact(); + } + + protected: + std::unique_ptr collector_; +}; + +class UserKeyTablePropertiesCollectorFactory + : public IntTblPropCollectorFactory { + public: + explicit UserKeyTablePropertiesCollectorFactory( + std::shared_ptr user_collector_factory) + : user_collector_factory_(user_collector_factory) {} + virtual IntTblPropCollector* CreateIntTblPropCollector( + uint32_t column_family_id, int level_at_creation) override { + TablePropertiesCollectorFactory::Context context; + context.column_family_id = column_family_id; + context.level_at_creation = level_at_creation; + return new UserKeyTablePropertiesCollector( + user_collector_factory_->CreateTablePropertiesCollector(context)); + } + + virtual const char* Name() const override { + return user_collector_factory_->Name(); + } + + private: + std::shared_ptr user_collector_factory_; +}; + +// When rocksdb creates a newtable, it will encode all "user keys" into +// "internal keys". This class collects min/max timestamp from the encoded +// internal key when Add() is invoked. +// +// @param cmp the user comparator to compare the timestamps in internal key. 
+class TimestampTablePropertiesCollector : public IntTblPropCollector { + public: + explicit TimestampTablePropertiesCollector(const Comparator* cmp) + : cmp_(cmp), + timestamp_min_(kDisableUserTimestamp), + timestamp_max_(kDisableUserTimestamp) {} + + Status InternalAdd(const Slice& key, const Slice& /* value */, + uint64_t /* file_size */) override { + auto user_key = ExtractUserKey(key); + assert(cmp_ && cmp_->timestamp_size() > 0); + if (user_key.size() < cmp_->timestamp_size()) { + return Status::Corruption( + "User key size mismatch when comparing to timestamp size."); + } + auto timestamp_in_key = + ExtractTimestampFromUserKey(user_key, cmp_->timestamp_size()); + if (timestamp_max_ == kDisableUserTimestamp || + cmp_->CompareTimestamp(timestamp_in_key, timestamp_max_) > 0) { + timestamp_max_.assign(timestamp_in_key.data(), timestamp_in_key.size()); + } + if (timestamp_min_ == kDisableUserTimestamp || + cmp_->CompareTimestamp(timestamp_min_, timestamp_in_key) > 0) { + timestamp_min_.assign(timestamp_in_key.data(), timestamp_in_key.size()); + } + return Status::OK(); + } + + void BlockAdd(uint64_t /* block_uncomp_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { + return; + } + + Status Finish(UserCollectedProperties* properties) override { + // timestamp is empty is table is empty + assert(timestamp_min_.size() == timestamp_max_.size() && + (timestamp_min_.empty() || + timestamp_max_.size() == cmp_->timestamp_size())); + properties->insert({"rocksdb.timestamp_min", timestamp_min_}); + properties->insert({"rocksdb.timestamp_max", timestamp_max_}); + return Status::OK(); + } + + const char* Name() const override { + return "TimestampTablePropertiesCollector"; + } + + UserCollectedProperties GetReadableProperties() const override { + return {{"rocksdb.timestamp_min", Slice(timestamp_min_).ToString(true)}, + {"rocksdb.timestamp_max", Slice(timestamp_max_).ToString(true)}}; + } + + protected: + const Comparator* const cmp_; + std::string timestamp_min_; + std::string timestamp_max_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/transaction_log_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/transaction_log_impl.cc new file mode 100644 index 0000000000..8841b8cf3b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/transaction_log_impl.cc @@ -0,0 +1,296 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
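The timestamp collector above only engages for column families whose comparator reports a nonzero timestamp_size(), in which case it records rocksdb.timestamp_min and rocksdb.timestamp_max per SST file. A hedged sketch of enabling user-defined timestamps with the built-in 64-bit timestamp comparator:

#include <string>

#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status OpenWithTimestamps(const std::string& path,
                                   rocksdb::DB** db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Bytewise ordering on the user key plus an 8-byte u64 timestamp suffix;
  // timestamp_size() is 8 for this comparator, which activates the
  // min/max timestamp properties collected above.
  options.comparator = rocksdb::BytewiseComparatorWithU64Ts();
  return rocksdb::DB::Open(options, path, db);
}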
+ + +#include "db/transaction_log_impl.h" + +#include + +#include "db/write_batch_internal.h" +#include "file/sequence_file_reader.h" +#include "util/defer.h" + +namespace ROCKSDB_NAMESPACE { + +TransactionLogIteratorImpl::TransactionLogIteratorImpl( + const std::string& dir, const ImmutableDBOptions* options, + const TransactionLogIterator::ReadOptions& read_options, + const EnvOptions& soptions, const SequenceNumber seq, + std::unique_ptr files, VersionSet const* const versions, + const bool seq_per_batch, const std::shared_ptr& io_tracer) + : dir_(dir), + options_(options), + read_options_(read_options), + soptions_(soptions), + starting_sequence_number_(seq), + files_(std::move(files)), + versions_(versions), + seq_per_batch_(seq_per_batch), + io_tracer_(io_tracer), + started_(false), + is_valid_(false), + current_file_index_(0), + current_batch_seq_(0), + current_last_seq_(0) { + assert(files_ != nullptr); + assert(versions_ != nullptr); + assert(!seq_per_batch_); + current_status_.PermitUncheckedError(); // Clear on start + reporter_.env = options_->env; + reporter_.info_log = options_->info_log.get(); + SeekToStartSequence(); // Seek till starting sequence +} + +Status TransactionLogIteratorImpl::OpenLogFile( + const LogFile* log_file, + std::unique_ptr* file_reader) { + FileSystemPtr fs(options_->fs, io_tracer_); + std::unique_ptr file; + std::string fname; + Status s; + EnvOptions optimized_env_options = fs->OptimizeForLogRead(soptions_); + if (log_file->Type() == kArchivedLogFile) { + fname = ArchivedLogFileName(dir_, log_file->LogNumber()); + s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr); + } else { + fname = LogFileName(dir_, log_file->LogNumber()); + s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr); + if (!s.ok()) { + // If cannot open file in DB directory. + // Try the archive dir, as it could have moved in the meanwhile. + fname = ArchivedLogFileName(dir_, log_file->LogNumber()); + s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr); + } + } + if (s.ok()) { + file_reader->reset(new SequentialFileReader(std::move(file), fname, + io_tracer_, options_->listeners, + options_->rate_limiter.get())); + } + return s; +} + +BatchResult TransactionLogIteratorImpl::GetBatch() { + assert(is_valid_); // cannot call in a non valid state. + BatchResult result; + result.sequence = current_batch_seq_; + result.writeBatchPtr = std::move(current_batch_); + return result; +} + +Status TransactionLogIteratorImpl::status() { return current_status_; } + +bool TransactionLogIteratorImpl::Valid() { return started_ && is_valid_; } + +bool TransactionLogIteratorImpl::RestrictedRead(Slice* record) { + // Don't read if no more complete entries to read from logs + if (current_last_seq_ >= versions_->LastSequence()) { + return false; + } + return current_log_reader_->ReadRecord(record, &scratch_); +} + +void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index, + bool strict) { + Slice record; + started_ = false; + is_valid_ = false; + // Check invariant of TransactionLogIterator when SeekToStartSequence() + // succeeds. 
+ const Defer defer([this]() { + if (is_valid_) { + assert(current_status_.ok()); + if (starting_sequence_number_ > current_batch_seq_) { + assert(current_batch_seq_ < current_last_seq_); + assert(current_last_seq_ >= starting_sequence_number_); + } + } + }); + if (files_->size() <= start_file_index) { + return; + } else if (!current_status_.ok()) { + return; + } + Status s = + OpenLogReader(files_->at(static_cast(start_file_index)).get()); + if (!s.ok()) { + current_status_ = s; + reporter_.Info(current_status_.ToString().c_str()); + return; + } + while (RestrictedRead(&record)) { + if (record.size() < WriteBatchInternal::kHeader) { + reporter_.Corruption(record.size(), + Status::Corruption("very small log record")); + continue; + } + UpdateCurrentWriteBatch(record); + if (current_last_seq_ >= starting_sequence_number_) { + if (strict && current_batch_seq_ != starting_sequence_number_) { + current_status_ = Status::Corruption( + "Gap in sequence number. Could not " + "seek to required sequence number"); + reporter_.Info(current_status_.ToString().c_str()); + return; + } else if (strict) { + reporter_.Info( + "Could seek required sequence number. Iterator will " + "continue."); + } + is_valid_ = true; + started_ = true; // set started_ as we could seek till starting sequence + return; + } else { + is_valid_ = false; + } + } + + // Could not find start sequence in first file. Normally this must be the + // only file. Otherwise log the error and let the iterator return next entry + // If strict is set, we want to seek exactly till the start sequence and it + // should have been present in the file we scanned above + if (strict) { + current_status_ = Status::Corruption( + "Gap in sequence number. Could not " + "seek to required sequence number"); + reporter_.Info(current_status_.ToString().c_str()); + } else if (files_->size() != 1) { + current_status_ = Status::Corruption( + "Start sequence was not found, " + "skipping to the next available"); + reporter_.Info(current_status_.ToString().c_str()); + // Let NextImpl find the next available entry. 
started_ remains false + // because we don't want to check for gaps while moving to start sequence + NextImpl(true); + } +} + +void TransactionLogIteratorImpl::Next() { + if (!current_status_.ok()) { + return; + } + return NextImpl(false); +} + +void TransactionLogIteratorImpl::NextImpl(bool internal) { + Slice record; + is_valid_ = false; + if (!internal && !started_) { + // Runs every time until we can seek to the start sequence + SeekToStartSequence(); + } + while (true) { + assert(current_log_reader_); + if (current_log_reader_->IsEOF()) { + current_log_reader_->UnmarkEOF(); + } + while (RestrictedRead(&record)) { + if (record.size() < WriteBatchInternal::kHeader) { + reporter_.Corruption(record.size(), + Status::Corruption("very small log record")); + continue; + } else { + // started_ should be true if called by application + assert(internal || started_); + // started_ should be false if called internally + assert(!internal || !started_); + UpdateCurrentWriteBatch(record); + if (internal && !started_) { + started_ = true; + } + return; + } + } + + // Open the next file + if (current_file_index_ < files_->size() - 1) { + ++current_file_index_; + Status s = OpenLogReader(files_->at(current_file_index_).get()); + if (!s.ok()) { + is_valid_ = false; + current_status_ = s; + return; + } + } else { + is_valid_ = false; + if (current_last_seq_ == versions_->LastSequence()) { + current_status_ = Status::OK(); + } else { + const char* msg = "Create a new iterator to fetch the new tail."; + current_status_ = Status::TryAgain(msg); + } + return; + } + } +} + +bool TransactionLogIteratorImpl::IsBatchExpected( + const WriteBatch* batch, const SequenceNumber expected_seq) { + assert(batch); + SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch); + if (batchSeq != expected_seq) { + char buf[200]; + snprintf(buf, sizeof(buf), + "Discontinuity in log records. Got seq=%" PRIu64 + ", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64 + ".Log iterator will reseek the correct batch.", + batchSeq, expected_seq, versions_->LastSequence()); + reporter_.Info(buf); + return false; + } + return true; +} + +void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { + std::unique_ptr batch(new WriteBatch()); + Status s = WriteBatchInternal::SetContents(batch.get(), record); + s.PermitUncheckedError(); // TODO: What should we do with this error? + + SequenceNumber expected_seq = current_last_seq_ + 1; + // If the iterator has started, then confirm that we get continuous batches + if (started_ && !IsBatchExpected(batch.get(), expected_seq)) { + // Seek to the batch having expected sequence number + if (expected_seq < files_->at(current_file_index_)->StartSequence()) { + // Expected batch must lie in the previous log file + // Avoid underflow. + if (current_file_index_ != 0) { + current_file_index_--; + } + } + starting_sequence_number_ = expected_seq; + // currentStatus_ will be set to Ok if reseek succeeds + // Note: this is still ok in seq_pre_batch_ && two_write_queuesp_ mode + // that allows gaps in the WAL since it will still skip over the gap. 
+ current_status_ = Status::NotFound("Gap in sequence numbers"); + // In seq_per_batch_ mode, gaps in the seq are possible so the strict mode + // should be disabled + return SeekToStartSequence(current_file_index_, !seq_per_batch_); + } + + current_batch_seq_ = WriteBatchInternal::Sequence(batch.get()); + assert(!seq_per_batch_); + current_last_seq_ = + current_batch_seq_ + WriteBatchInternal::Count(batch.get()) - 1; + // currentBatchSeq_ can only change here + assert(current_last_seq_ <= versions_->LastSequence()); + + current_batch_ = std::move(batch); + is_valid_ = true; + current_status_ = Status::OK(); +} + +Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* log_file) { + std::unique_ptr file; + Status s = OpenLogFile(log_file, &file); + if (!s.ok()) { + return s; + } + assert(file); + current_log_reader_.reset( + new log::Reader(options_->info_log, std::move(file), &reporter_, + read_options_.verify_checksums_, log_file->LogNumber())); + return Status::OK(); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/transaction_log_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/transaction_log_impl.h new file mode 100644 index 0000000000..6568de23f6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/transaction_log_impl.h @@ -0,0 +1,128 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include + +#include "db/log_reader.h" +#include "db/version_set.h" +#include "file/filename.h" +#include "logging/logging.h" +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/transaction_log.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class LogFileImpl : public LogFile { + public: + LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq, + uint64_t sizeBytes) + : logNumber_(logNum), + type_(logType), + startSequence_(startSeq), + sizeFileBytes_(sizeBytes) {} + + std::string PathName() const override { + if (type_ == kArchivedLogFile) { + return ArchivedLogFileName("", logNumber_); + } + return LogFileName("", logNumber_); + } + + uint64_t LogNumber() const override { return logNumber_; } + + WalFileType Type() const override { return type_; } + + SequenceNumber StartSequence() const override { return startSequence_; } + + uint64_t SizeFileBytes() const override { return sizeFileBytes_; } + + bool operator<(const LogFile& that) const { + return LogNumber() < that.LogNumber(); + } + + private: + uint64_t logNumber_; + WalFileType type_; + SequenceNumber startSequence_; + uint64_t sizeFileBytes_; +}; + +class TransactionLogIteratorImpl : public TransactionLogIterator { + public: + TransactionLogIteratorImpl( + const std::string& dir, const ImmutableDBOptions* options, + const TransactionLogIterator::ReadOptions& read_options, + const EnvOptions& soptions, const SequenceNumber seqNum, + std::unique_ptr files, VersionSet const* const versions, + const bool seq_per_batch, const std::shared_ptr& io_tracer); + + virtual bool Valid() override; + + virtual void Next() override; + + virtual Status status() override; + + virtual BatchResult GetBatch() override; + + private: + const std::string& dir_; + const ImmutableDBOptions* options_; + const TransactionLogIterator::ReadOptions 
read_options_; + const EnvOptions& soptions_; + SequenceNumber starting_sequence_number_; + std::unique_ptr files_; + // Used only to get latest seq. num + // TODO(icanadi) can this be just a callback? + VersionSet const* const versions_; + const bool seq_per_batch_; + std::shared_ptr io_tracer_; + + // State variables + bool started_; + bool is_valid_; // not valid when it starts of. + Status current_status_; + size_t current_file_index_; + std::unique_ptr current_batch_; + std::unique_ptr current_log_reader_; + std::string scratch_; + Status OpenLogFile(const LogFile* log_file, + std::unique_ptr* file); + + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + virtual void Corruption(size_t bytes, const Status& s) override { + ROCKS_LOG_ERROR(info_log, "dropping %" ROCKSDB_PRIszt " bytes; %s", bytes, + s.ToString().c_str()); + } + virtual void Info(const char* s) { ROCKS_LOG_INFO(info_log, "%s", s); } + } reporter_; + + SequenceNumber + current_batch_seq_; // sequence number at start of current batch + SequenceNumber current_last_seq_; // last sequence in the current batch + // Reads from transaction log only if the writebatch record has been written + bool RestrictedRead(Slice* record); + // Seeks to starting_sequence_number_ reading from start_file_index in files_. + // If strict is set, then must get a batch starting with + // starting_sequence_number_. + void SeekToStartSequence(uint64_t start_file_index = 0, bool strict = false); + // Implementation of Next. SeekToStartSequence calls it internally with + // internal=true to let it find next entry even if it has to jump gaps because + // the iterator may start off from the first available entry but promises to + // be continuous after that + void NextImpl(bool internal = false); + // Check if batch is expected, else return false + bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expected_seq); + // Update current batch if a continuous batch is found. + void UpdateCurrentWriteBatch(const Slice& record); + Status OpenLogReader(const LogFile* file); +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/trim_history_scheduler.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/trim_history_scheduler.cc new file mode 100644 index 0000000000..d7ca0899f1 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/trim_history_scheduler.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
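The TransactionLogIteratorImpl machinery above is exposed through the public DB::GetUpdatesSince. A hedged tailing sketch; note that WAL files must be retained (for example via DBOptions::WAL_ttl_seconds) for the iterator to have anything to read:

#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/transaction_log.h"

// Replays every write batch with sequence number >= since.
rocksdb::Status TailWal(rocksdb::DB* db, rocksdb::SequenceNumber since) {
  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
  rocksdb::Status s = db->GetUpdatesSince(since, &iter);
  if (!s.ok()) {
    return s;
  }
  while (iter->Valid()) {
    rocksdb::BatchResult batch = iter->GetBatch();
    // batch.sequence is the starting sequence number of this batch;
    // batch.writeBatchPtr owns the replayed WriteBatch.
    iter->Next();
  }
  // A TryAgain status means the tail moved; create a new iterator to
  // continue, as the implementation above suggests.
  return iter->status();
}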
+ +#include "db/trim_history_scheduler.h" + +#include + +#include "db/column_family.h" + +namespace ROCKSDB_NAMESPACE { + +void TrimHistoryScheduler::ScheduleWork(ColumnFamilyData* cfd) { + std::lock_guard lock(checking_mutex_); + cfd->Ref(); + cfds_.push_back(cfd); + is_empty_.store(false, std::memory_order_relaxed); +} + +ColumnFamilyData* TrimHistoryScheduler::TakeNextColumnFamily() { + std::lock_guard lock(checking_mutex_); + while (true) { + if (cfds_.empty()) { + return nullptr; + } + ColumnFamilyData* cfd = cfds_.back(); + cfds_.pop_back(); + if (cfds_.empty()) { + is_empty_.store(true, std::memory_order_relaxed); + } + + if (!cfd->IsDropped()) { + // success + return cfd; + } + cfd->UnrefAndTryDelete(); + } +} + +bool TrimHistoryScheduler::Empty() { + bool is_empty = is_empty_.load(std::memory_order_relaxed); + return is_empty; +} + +void TrimHistoryScheduler::Clear() { + ColumnFamilyData* cfd; + while ((cfd = TakeNextColumnFamily()) != nullptr) { + cfd->UnrefAndTryDelete(); + } + assert(Empty()); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/trim_history_scheduler.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/trim_history_scheduler.h new file mode 100644 index 0000000000..252802a7ae --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/trim_history_scheduler.h @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include +#include + +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyData; + +// Similar to FlushScheduler, TrimHistoryScheduler is a FIFO queue that keeps +// track of column families whose flushed immutable memtables may need to be +// removed (aka trimmed). The actual trimming may be slightly delayed. Due to +// the use of the mutex and atomic variable, ScheduleWork, +// TakeNextColumnFamily, and, Empty can be called concurrently. +class TrimHistoryScheduler { + public: + TrimHistoryScheduler() : is_empty_(true) {} + + // When a column family needs history trimming, add cfd to the FIFO queue + void ScheduleWork(ColumnFamilyData* cfd); + + // Remove the column family from the queue, the caller is responsible for + // calling `MemtableList::TrimHistory` + ColumnFamilyData* TakeNextColumnFamily(); + + bool Empty(); + + void Clear(); + + // Not on critical path, use mutex to ensure thread safety + private: + std::atomic is_empty_; + autovector cfds_; + std::mutex checking_mutex_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_builder.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_builder.cc new file mode 100644 index 0000000000..d87ef94494 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_builder.cc @@ -0,0 +1,1426 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/version_builder.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cache/cache_reservation_manager.h" +#include "db/blob/blob_file_meta.h" +#include "db/dbformat.h" +#include "db/internal_stats.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "db/version_set.h" +#include "port/port.h" +#include "table/table_reader.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +class VersionBuilder::Rep { + class NewestFirstByEpochNumber { + private: + inline static const NewestFirstBySeqNo seqno_cmp; + + public: + bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { + assert(lhs); + assert(rhs); + + if (lhs->epoch_number != rhs->epoch_number) { + return lhs->epoch_number > rhs->epoch_number; + } else { + return seqno_cmp(lhs, rhs); + } + } + }; + class BySmallestKey { + public: + explicit BySmallestKey(const InternalKeyComparator* cmp) : cmp_(cmp) {} + + bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { + assert(lhs); + assert(rhs); + assert(cmp_); + + const int r = cmp_->Compare(lhs->smallest, rhs->smallest); + if (r != 0) { + return (r < 0); + } + + // Break ties by file number + return (lhs->fd.GetNumber() < rhs->fd.GetNumber()); + } + + private: + const InternalKeyComparator* cmp_; + }; + + struct LevelState { + std::unordered_set deleted_files; + // Map from file number to file meta data. + std::unordered_map added_files; + }; + + // A class that represents the accumulated changes (like additional garbage or + // newly linked/unlinked SST files) for a given blob file after applying a + // series of VersionEdits. + class BlobFileMetaDataDelta { + public: + bool IsEmpty() const { + return !additional_garbage_count_ && !additional_garbage_bytes_ && + newly_linked_ssts_.empty() && newly_unlinked_ssts_.empty(); + } + + uint64_t GetAdditionalGarbageCount() const { + return additional_garbage_count_; + } + + uint64_t GetAdditionalGarbageBytes() const { + return additional_garbage_bytes_; + } + + const std::unordered_set& GetNewlyLinkedSsts() const { + return newly_linked_ssts_; + } + + const std::unordered_set& GetNewlyUnlinkedSsts() const { + return newly_unlinked_ssts_; + } + + void AddGarbage(uint64_t count, uint64_t bytes) { + additional_garbage_count_ += count; + additional_garbage_bytes_ += bytes; + } + + void LinkSst(uint64_t sst_file_number) { + assert(newly_linked_ssts_.find(sst_file_number) == + newly_linked_ssts_.end()); + + // Reconcile with newly unlinked SSTs on the fly. (Note: an SST can be + // linked to and unlinked from the same blob file in the case of a trivial + // move.) + auto it = newly_unlinked_ssts_.find(sst_file_number); + + if (it != newly_unlinked_ssts_.end()) { + newly_unlinked_ssts_.erase(it); + } else { + newly_linked_ssts_.emplace(sst_file_number); + } + } + + void UnlinkSst(uint64_t sst_file_number) { + assert(newly_unlinked_ssts_.find(sst_file_number) == + newly_unlinked_ssts_.end()); + + // Reconcile with newly linked SSTs on the fly. (Note: an SST can be + // linked to and unlinked from the same blob file in the case of a trivial + // move.) 
+ auto it = newly_linked_ssts_.find(sst_file_number); + + if (it != newly_linked_ssts_.end()) { + newly_linked_ssts_.erase(it); + } else { + newly_unlinked_ssts_.emplace(sst_file_number); + } + } + + private: + uint64_t additional_garbage_count_ = 0; + uint64_t additional_garbage_bytes_ = 0; + std::unordered_set newly_linked_ssts_; + std::unordered_set newly_unlinked_ssts_; + }; + + // A class that represents the state of a blob file after applying a series of + // VersionEdits. In addition to the resulting state, it also contains the + // delta (see BlobFileMetaDataDelta above). The resulting state can be used to + // identify obsolete blob files, while the delta makes it possible to + // efficiently detect trivial moves. + class MutableBlobFileMetaData { + public: + // To be used for brand new blob files + explicit MutableBlobFileMetaData( + std::shared_ptr&& shared_meta) + : shared_meta_(std::move(shared_meta)) {} + + // To be used for pre-existing blob files + explicit MutableBlobFileMetaData( + const std::shared_ptr& meta) + : shared_meta_(meta->GetSharedMeta()), + linked_ssts_(meta->GetLinkedSsts()), + garbage_blob_count_(meta->GetGarbageBlobCount()), + garbage_blob_bytes_(meta->GetGarbageBlobBytes()) {} + + const std::shared_ptr& GetSharedMeta() const { + return shared_meta_; + } + + uint64_t GetBlobFileNumber() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileNumber(); + } + + bool HasDelta() const { return !delta_.IsEmpty(); } + + const std::unordered_set& GetLinkedSsts() const { + return linked_ssts_; + } + + uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } + + uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } + + bool AddGarbage(uint64_t count, uint64_t bytes) { + assert(shared_meta_); + + if (garbage_blob_count_ + count > shared_meta_->GetTotalBlobCount() || + garbage_blob_bytes_ + bytes > shared_meta_->GetTotalBlobBytes()) { + return false; + } + + delta_.AddGarbage(count, bytes); + + garbage_blob_count_ += count; + garbage_blob_bytes_ += bytes; + + return true; + } + + void LinkSst(uint64_t sst_file_number) { + delta_.LinkSst(sst_file_number); + + assert(linked_ssts_.find(sst_file_number) == linked_ssts_.end()); + linked_ssts_.emplace(sst_file_number); + } + + void UnlinkSst(uint64_t sst_file_number) { + delta_.UnlinkSst(sst_file_number); + + assert(linked_ssts_.find(sst_file_number) != linked_ssts_.end()); + linked_ssts_.erase(sst_file_number); + } + + private: + std::shared_ptr shared_meta_; + // Accumulated changes + BlobFileMetaDataDelta delta_; + // Resulting state after applying the changes + BlobFileMetaData::LinkedSsts linked_ssts_; + uint64_t garbage_blob_count_ = 0; + uint64_t garbage_blob_bytes_ = 0; + }; + + const FileOptions& file_options_; + const ImmutableCFOptions* const ioptions_; + TableCache* table_cache_; + VersionStorageInfo* base_vstorage_; + VersionSet* version_set_; + int num_levels_; + LevelState* levels_; + // Store sizes of levels larger than num_levels_. We do this instead of + // storing them in levels_ to avoid regression in case there are no files + // on invalid levels. The version is not consistent if in the end the files + // on invalid levels don't cancel out. + std::unordered_map invalid_level_sizes_; + // Whether there are invalid new files or invalid deletion on levels larger + // than num_levels_. + bool has_invalid_levels_; + // Current levels of table files affected by additions/deletions. 
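The LinkSst/UnlinkSst pair above cancels a new link against a pending unlink (and vice versa), so a trivial move that relinks the same SST produces no net delta. A self-contained sketch of that reconciliation idiom:

#include <cassert>
#include <cstdint>
#include <unordered_set>

// Net effect of link/unlink events: an unlink cancels a pending link
// instead of being recorded, so Link(f) followed by Unlink(f) is a no-op.
struct LinkDelta {
  std::unordered_set<uint64_t> linked;
  std::unordered_set<uint64_t> unlinked;

  void Link(uint64_t f) {
    if (unlinked.erase(f) == 0) {
      linked.insert(f);
    }
  }
  void Unlink(uint64_t f) {
    if (linked.erase(f) == 0) {
      unlinked.insert(f);
    }
  }
  bool Empty() const { return linked.empty() && unlinked.empty(); }
};

// Usage: a trivial move unlinks and relinks the same SST file.
// LinkDelta d; d.Unlink(42); d.Link(42); assert(d.Empty());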
+ std::unordered_map table_file_levels_; + // Current compact cursors that should be changed after the last compaction + std::unordered_map updated_compact_cursors_; + NewestFirstByEpochNumber level_zero_cmp_by_epochno_; + NewestFirstBySeqNo level_zero_cmp_by_seqno_; + BySmallestKey level_nonzero_cmp_; + + // Mutable metadata objects for all blob files affected by the series of + // version edits. + std::map mutable_blob_file_metas_; + + std::shared_ptr file_metadata_cache_res_mgr_; + + public: + Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions, + TableCache* table_cache, VersionStorageInfo* base_vstorage, + VersionSet* version_set, + std::shared_ptr file_metadata_cache_res_mgr) + : file_options_(file_options), + ioptions_(ioptions), + table_cache_(table_cache), + base_vstorage_(base_vstorage), + version_set_(version_set), + num_levels_(base_vstorage->num_levels()), + has_invalid_levels_(false), + level_nonzero_cmp_(base_vstorage_->InternalComparator()), + file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) { + assert(ioptions_); + + levels_ = new LevelState[num_levels_]; + } + + ~Rep() { + for (int level = 0; level < num_levels_; level++) { + const auto& added = levels_[level].added_files; + for (auto& pair : added) { + UnrefFile(pair.second); + } + } + + delete[] levels_; + } + + void UnrefFile(FileMetaData* f) { + f->refs--; + if (f->refs <= 0) { + if (f->table_reader_handle) { + assert(table_cache_ != nullptr); + // NOTE: have to release in raw cache interface to avoid using a + // TypedHandle for FileMetaData::table_reader_handle + table_cache_->get_cache().get()->Release(f->table_reader_handle); + f->table_reader_handle = nullptr; + } + + if (file_metadata_cache_res_mgr_) { + Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation( + f->ApproximateMemoryUsage(), false /* increase */); + s.PermitUncheckedError(); + } + delete f; + } + } + + // Mapping used for checking the consistency of links between SST files and + // blob files. It is built using the forward links (table file -> blob file), + // and is subsequently compared with the inverse mapping stored in the + // BlobFileMetaData objects. 
+ using ExpectedLinkedSsts = + std::unordered_map; + + static void UpdateExpectedLinkedSsts( + uint64_t table_file_number, uint64_t blob_file_number, + ExpectedLinkedSsts* expected_linked_ssts) { + assert(expected_linked_ssts); + + if (blob_file_number == kInvalidBlobFileNumber) { + return; + } + + (*expected_linked_ssts)[blob_file_number].emplace(table_file_number); + } + + template + Status CheckConsistencyDetailsForLevel( + const VersionStorageInfo* vstorage, int level, Checker checker, + const std::string& sync_point, + ExpectedLinkedSsts* expected_linked_ssts) const { +#ifdef NDEBUG + (void)sync_point; +#endif + + assert(vstorage); + assert(level >= 0 && level < num_levels_); + assert(expected_linked_ssts); + + const auto& level_files = vstorage->LevelFiles(level); + + if (level_files.empty()) { + return Status::OK(); + } + + assert(level_files[0]); + UpdateExpectedLinkedSsts(level_files[0]->fd.GetNumber(), + level_files[0]->oldest_blob_file_number, + expected_linked_ssts); + + for (size_t i = 1; i < level_files.size(); ++i) { + assert(level_files[i]); + UpdateExpectedLinkedSsts(level_files[i]->fd.GetNumber(), + level_files[i]->oldest_blob_file_number, + expected_linked_ssts); + + auto lhs = level_files[i - 1]; + auto rhs = level_files[i]; + +#ifndef NDEBUG + auto pair = std::make_pair(&lhs, &rhs); + TEST_SYNC_POINT_CALLBACK(sync_point, &pair); +#endif + + const Status s = checker(lhs, rhs); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); + } + + // Make sure table files are sorted correctly and that the links between + // table files and blob files are consistent. + Status CheckConsistencyDetails(const VersionStorageInfo* vstorage) const { + assert(vstorage); + + ExpectedLinkedSsts expected_linked_ssts; + + if (num_levels_ > 0) { + const InternalKeyComparator* const icmp = vstorage->InternalComparator(); + EpochNumberRequirement epoch_number_requirement = + vstorage->GetEpochNumberRequirement(); + assert(icmp); + // Check L0 + { + auto l0_checker = [this, epoch_number_requirement, icmp]( + const FileMetaData* lhs, + const FileMetaData* rhs) { + assert(lhs); + assert(rhs); + + if (epoch_number_requirement == + EpochNumberRequirement::kMightMissing) { + if (!level_zero_cmp_by_seqno_(lhs, rhs)) { + std::ostringstream oss; + oss << "L0 files are not sorted properly: files #" + << lhs->fd.GetNumber() << " with seqnos (largest, smallest) " + << lhs->fd.largest_seqno << " , " << lhs->fd.smallest_seqno + << ", #" << rhs->fd.GetNumber() + << " with seqnos (largest, smallest) " + << rhs->fd.largest_seqno << " , " << rhs->fd.smallest_seqno; + return Status::Corruption("VersionBuilder", oss.str()); + } + } else if (epoch_number_requirement == + EpochNumberRequirement::kMustPresent) { + if (lhs->epoch_number == rhs->epoch_number) { + bool range_overlapped = + icmp->Compare(lhs->smallest, rhs->largest) <= 0 && + icmp->Compare(lhs->largest, rhs->smallest) >= 0; + + if (range_overlapped) { + std::ostringstream oss; + oss << "L0 files of same epoch number but overlapping range #" + << lhs->fd.GetNumber() + << " , smallest key: " << lhs->smallest.DebugString(true) + << " , largest key: " << lhs->largest.DebugString(true) + << " , epoch number: " << lhs->epoch_number << " vs. 
file #" + << rhs->fd.GetNumber() + << " , smallest key: " << rhs->smallest.DebugString(true) + << " , largest key: " << rhs->largest.DebugString(true) + << " , epoch number: " << rhs->epoch_number; + return Status::Corruption("VersionBuilder", oss.str()); + } + } + + if (!level_zero_cmp_by_epochno_(lhs, rhs)) { + std::ostringstream oss; + oss << "L0 files are not sorted properly: files #" + << lhs->fd.GetNumber() << " with epoch number " + << lhs->epoch_number << ", #" << rhs->fd.GetNumber() + << " with epoch number " << rhs->epoch_number; + return Status::Corruption("VersionBuilder", oss.str()); + } + } + + return Status::OK(); + }; + + const Status s = CheckConsistencyDetailsForLevel( + vstorage, /* level */ 0, l0_checker, + "VersionBuilder::CheckConsistency0", &expected_linked_ssts); + if (!s.ok()) { + return s; + } + } + + // Check L1 and up + + for (int level = 1; level < num_levels_; ++level) { + auto checker = [this, level, icmp](const FileMetaData* lhs, + const FileMetaData* rhs) { + assert(lhs); + assert(rhs); + + if (!level_nonzero_cmp_(lhs, rhs)) { + std::ostringstream oss; + oss << 'L' << level << " files are not sorted properly: files #" + << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber(); + + return Status::Corruption("VersionBuilder", oss.str()); + } + + // Make sure there is no overlap in level + if (icmp->Compare(lhs->largest, rhs->smallest) >= 0) { + std::ostringstream oss; + oss << 'L' << level << " has overlapping ranges: file #" + << lhs->fd.GetNumber() + << " largest key: " << lhs->largest.DebugString(true) + << " vs. file #" << rhs->fd.GetNumber() + << " smallest key: " << rhs->smallest.DebugString(true); + + return Status::Corruption("VersionBuilder", oss.str()); + } + + return Status::OK(); + }; + + const Status s = CheckConsistencyDetailsForLevel( + vstorage, level, checker, "VersionBuilder::CheckConsistency1", + &expected_linked_ssts); + if (!s.ok()) { + return s; + } + } + } + + // Make sure that all blob files in the version have non-garbage data and + // the links between them and the table files are consistent. 
+ const auto& blob_files = vstorage->GetBlobFiles(); + for (const auto& blob_file_meta : blob_files) { + assert(blob_file_meta); + + const uint64_t blob_file_number = blob_file_meta->GetBlobFileNumber(); + + if (blob_file_meta->GetGarbageBlobCount() >= + blob_file_meta->GetTotalBlobCount()) { + std::ostringstream oss; + oss << "Blob file #" << blob_file_number + << " consists entirely of garbage"; + + return Status::Corruption("VersionBuilder", oss.str()); + } + + if (blob_file_meta->GetLinkedSsts() != + expected_linked_ssts[blob_file_number]) { + std::ostringstream oss; + oss << "Links are inconsistent between table files and blob file #" + << blob_file_number; + + return Status::Corruption("VersionBuilder", oss.str()); + } + } + + Status ret_s; + TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistencyBeforeReturn", + &ret_s); + return ret_s; + } + + Status CheckConsistency(const VersionStorageInfo* vstorage) const { + assert(vstorage); + + // Always run consistency checks in debug build +#ifdef NDEBUG + if (!vstorage->force_consistency_checks()) { + return Status::OK(); + } +#endif + Status s = CheckConsistencyDetails(vstorage); + if (s.IsCorruption() && s.getState()) { + // Make it clear the error is due to force_consistency_checks = 1 or + // debug build +#ifdef NDEBUG + auto prefix = "force_consistency_checks"; +#else + auto prefix = "force_consistency_checks(DEBUG)"; +#endif + s = Status::Corruption(prefix, s.getState()); + } else { + // was only expecting corruption with message, or OK + assert(s.ok()); + } + return s; + } + + bool CheckConsistencyForNumLevels() const { + // Make sure there are no files on or beyond num_levels(). + if (has_invalid_levels_) { + return false; + } + + for (const auto& pair : invalid_level_sizes_) { + const size_t level_size = pair.second; + if (level_size != 0) { + return false; + } + } + + return true; + } + + bool IsBlobFileInVersion(uint64_t blob_file_number) const { + auto mutable_it = mutable_blob_file_metas_.find(blob_file_number); + if (mutable_it != mutable_blob_file_metas_.end()) { + return true; + } + + assert(base_vstorage_); + const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number); + + return !!meta; + } + + MutableBlobFileMetaData* GetOrCreateMutableBlobFileMetaData( + uint64_t blob_file_number) { + auto mutable_it = mutable_blob_file_metas_.find(blob_file_number); + if (mutable_it != mutable_blob_file_metas_.end()) { + return &mutable_it->second; + } + + assert(base_vstorage_); + const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number); + + if (meta) { + mutable_it = mutable_blob_file_metas_ + .emplace(blob_file_number, MutableBlobFileMetaData(meta)) + .first; + return &mutable_it->second; + } + + return nullptr; + } + + Status ApplyBlobFileAddition(const BlobFileAddition& blob_file_addition) { + const uint64_t blob_file_number = blob_file_addition.GetBlobFileNumber(); + + if (IsBlobFileInVersion(blob_file_number)) { + std::ostringstream oss; + oss << "Blob file #" << blob_file_number << " already added"; + + return Status::Corruption("VersionBuilder", oss.str()); + } + + // Note: we use C++11 for now but in C++14, this could be done in a more + // elegant way using generalized lambda capture. 
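The blob file bookkeeping handled here (additions, garbage accounting, obsolete-file scheduling) is driven by the integrated BlobDB feature, which is switched on through public column family options. A configuration sketch with illustrative thresholds:

#include "rocksdb/options.h"

rocksdb::Options MakeBlobOptions() {
  rocksdb::Options options;
  options.enable_blob_files = true;
  // Values of at least 1 KiB are written to blob files; the SST keeps a
  // reference, which is the SST<->blob link tracked by VersionBuilder.
  options.min_blob_size = 1024;
  // Compactions relocate live blobs out of the oldest 25% of blob files
  // and report the remainder as garbage via BlobFileGarbage edits, which
  // ApplyBlobFileGarbage above accumulates.
  options.enable_blob_garbage_collection = true;
  options.blob_garbage_collection_age_cutoff = 0.25;
  return options;
}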
+ VersionSet* const vs = version_set_; + const ImmutableCFOptions* const ioptions = ioptions_; + + auto deleter = [vs, ioptions](SharedBlobFileMetaData* shared_meta) { + if (vs) { + assert(ioptions); + assert(!ioptions->cf_paths.empty()); + assert(shared_meta); + + vs->AddObsoleteBlobFile(shared_meta->GetBlobFileNumber(), + ioptions->cf_paths.front().path); + } + + delete shared_meta; + }; + + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, blob_file_addition.GetTotalBlobCount(), + blob_file_addition.GetTotalBlobBytes(), + blob_file_addition.GetChecksumMethod(), + blob_file_addition.GetChecksumValue(), deleter); + + mutable_blob_file_metas_.emplace( + blob_file_number, MutableBlobFileMetaData(std::move(shared_meta))); + + return Status::OK(); + } + + Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) { + const uint64_t blob_file_number = blob_file_garbage.GetBlobFileNumber(); + + MutableBlobFileMetaData* const mutable_meta = + GetOrCreateMutableBlobFileMetaData(blob_file_number); + + if (!mutable_meta) { + std::ostringstream oss; + oss << "Blob file #" << blob_file_number << " not found"; + + return Status::Corruption("VersionBuilder", oss.str()); + } + + if (!mutable_meta->AddGarbage(blob_file_garbage.GetGarbageBlobCount(), + blob_file_garbage.GetGarbageBlobBytes())) { + std::ostringstream oss; + oss << "Garbage overflow for blob file #" << blob_file_number; + return Status::Corruption("VersionBuilder", oss.str()); + } + + return Status::OK(); + } + + int GetCurrentLevelForTableFile(uint64_t file_number) const { + auto it = table_file_levels_.find(file_number); + if (it != table_file_levels_.end()) { + return it->second; + } + + assert(base_vstorage_); + return base_vstorage_->GetFileLocation(file_number).GetLevel(); + } + + uint64_t GetOldestBlobFileNumberForTableFile(int level, + uint64_t file_number) const { + assert(level < num_levels_); + + const auto& added_files = levels_[level].added_files; + + auto it = added_files.find(file_number); + if (it != added_files.end()) { + const FileMetaData* const meta = it->second; + assert(meta); + + return meta->oldest_blob_file_number; + } + + assert(base_vstorage_); + const FileMetaData* const meta = + base_vstorage_->GetFileMetaDataByNumber(file_number); + assert(meta); + + return meta->oldest_blob_file_number; + } + + Status ApplyFileDeletion(int level, uint64_t file_number) { + assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel()); + + const int current_level = GetCurrentLevelForTableFile(file_number); + + if (level != current_level) { + if (level >= num_levels_) { + has_invalid_levels_ = true; + } + + std::ostringstream oss; + oss << "Cannot delete table file #" << file_number << " from level " + << level << " since it is "; + if (current_level == + VersionStorageInfo::FileLocation::Invalid().GetLevel()) { + oss << "not in the LSM tree"; + } else { + oss << "on level " << current_level; + } + + return Status::Corruption("VersionBuilder", oss.str()); + } + + if (level >= num_levels_) { + assert(invalid_level_sizes_[level] > 0); + --invalid_level_sizes_[level]; + + table_file_levels_[file_number] = + VersionStorageInfo::FileLocation::Invalid().GetLevel(); + + return Status::OK(); + } + + const uint64_t blob_file_number = + GetOldestBlobFileNumberForTableFile(level, file_number); + + if (blob_file_number != kInvalidBlobFileNumber) { + MutableBlobFileMetaData* const mutable_meta = + GetOrCreateMutableBlobFileMetaData(blob_file_number); + if (mutable_meta) { + 
mutable_meta->UnlinkSst(file_number); + } + } + + auto& level_state = levels_[level]; + + auto& add_files = level_state.added_files; + auto add_it = add_files.find(file_number); + if (add_it != add_files.end()) { + UnrefFile(add_it->second); + add_files.erase(add_it); + } + + auto& del_files = level_state.deleted_files; + assert(del_files.find(file_number) == del_files.end()); + del_files.emplace(file_number); + + table_file_levels_[file_number] = + VersionStorageInfo::FileLocation::Invalid().GetLevel(); + + return Status::OK(); + } + + Status ApplyFileAddition(int level, const FileMetaData& meta) { + assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel()); + + const uint64_t file_number = meta.fd.GetNumber(); + + const int current_level = GetCurrentLevelForTableFile(file_number); + + if (current_level != + VersionStorageInfo::FileLocation::Invalid().GetLevel()) { + if (level >= num_levels_) { + has_invalid_levels_ = true; + } + + std::ostringstream oss; + oss << "Cannot add table file #" << file_number << " to level " << level + << " since it is already in the LSM tree on level " << current_level; + return Status::Corruption("VersionBuilder", oss.str()); + } + + if (level >= num_levels_) { + ++invalid_level_sizes_[level]; + table_file_levels_[file_number] = level; + + return Status::OK(); + } + + auto& level_state = levels_[level]; + + auto& del_files = level_state.deleted_files; + auto del_it = del_files.find(file_number); + if (del_it != del_files.end()) { + del_files.erase(del_it); + } + + FileMetaData* const f = new FileMetaData(meta); + f->refs = 1; + + if (file_metadata_cache_res_mgr_) { + Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation( + f->ApproximateMemoryUsage(), true /* increase */); + if (!s.ok()) { + delete f; + s = Status::MemoryLimit( + "Can't allocate " + + kCacheEntryRoleToCamelString[static_cast( + CacheEntryRole::kFileMetadata)] + + " due to exceeding the memory limit " + "based on " + "cache capacity"); + return s; + } + } + + auto& add_files = level_state.added_files; + assert(add_files.find(file_number) == add_files.end()); + add_files.emplace(file_number, f); + + const uint64_t blob_file_number = f->oldest_blob_file_number; + + if (blob_file_number != kInvalidBlobFileNumber) { + MutableBlobFileMetaData* const mutable_meta = + GetOrCreateMutableBlobFileMetaData(blob_file_number); + if (mutable_meta) { + mutable_meta->LinkSst(file_number); + } + } + + table_file_levels_[file_number] = level; + + return Status::OK(); + } + + Status ApplyCompactCursors(int level, + const InternalKey& smallest_uncompacted_key) { + if (level < 0) { + std::ostringstream oss; + oss << "Cannot add compact cursor (" << level << "," + << smallest_uncompacted_key.Encode().ToString() + << " due to invalid level (level = " << level << ")"; + return Status::Corruption("VersionBuilder", oss.str()); + } + if (level < num_levels_) { + // Omit levels (>= num_levels_) when re-open with shrinking num_levels_ + updated_compact_cursors_[level] = smallest_uncompacted_key; + } + return Status::OK(); + } + + // Apply all of the edits in *edit to the current state. + Status Apply(const VersionEdit* edit) { + { + const Status s = CheckConsistency(base_vstorage_); + if (!s.ok()) { + return s; + } + } + + // Note: we process the blob file related changes first because the + // table file addition/deletion logic depends on the blob files + // already being there. 
+ + // Add new blob files + for (const auto& blob_file_addition : edit->GetBlobFileAdditions()) { + const Status s = ApplyBlobFileAddition(blob_file_addition); + if (!s.ok()) { + return s; + } + } + + // Increase the amount of garbage for blob files affected by GC + for (const auto& blob_file_garbage : edit->GetBlobFileGarbages()) { + const Status s = ApplyBlobFileGarbage(blob_file_garbage); + if (!s.ok()) { + return s; + } + } + + // Delete table files + for (const auto& deleted_file : edit->GetDeletedFiles()) { + const int level = deleted_file.first; + const uint64_t file_number = deleted_file.second; + + const Status s = ApplyFileDeletion(level, file_number); + if (!s.ok()) { + return s; + } + } + + // Add new table files + for (const auto& new_file : edit->GetNewFiles()) { + const int level = new_file.first; + const FileMetaData& meta = new_file.second; + + const Status s = ApplyFileAddition(level, meta); + if (!s.ok()) { + return s; + } + } + + // Populate compact cursors for round-robin compaction, leave + // the cursor to be empty to indicate it is invalid + for (const auto& cursor : edit->GetCompactCursors()) { + const int level = cursor.first; + const InternalKey smallest_uncompacted_key = cursor.second; + const Status s = ApplyCompactCursors(level, smallest_uncompacted_key); + if (!s.ok()) { + return s; + } + } + return Status::OK(); + } + + // Helper function template for merging the blob file metadata from the base + // version with the mutable metadata representing the state after applying the + // edits. The function objects process_base and process_mutable are + // respectively called to handle a base version object when there is no + // matching mutable object, and a mutable object when there is no matching + // base version object. process_both is called to perform the merge when a + // given blob file appears both in the base version and the mutable list. The + // helper stops processing objects if a function object returns false. Blob + // files with a file number below first_blob_file are not processed. 
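+  // A rough call-shape sketch, inferred from the call sites below rather than
+  // from upstream documentation:
+  //
+  //   auto on_base = [](const std::shared_ptr<BlobFileMetaData>& m) {
+  //     return true;  // false would stop the merge early
+  //   };
+  //   auto on_mutable = [](const MutableBlobFileMetaData& m) { return true; };
+  //   auto on_both = [](const std::shared_ptr<BlobFileMetaData>& b,
+  //                     const MutableBlobFileMetaData& m) { return true; };
+  //   MergeBlobFileMetas(kInvalidBlobFileNumber, on_base, on_mutable, on_both);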
+  template <typename ProcessBase, typename ProcessMutable,
+            typename ProcessBoth>
+  void MergeBlobFileMetas(uint64_t first_blob_file, ProcessBase process_base,
+                          ProcessMutable process_mutable,
+                          ProcessBoth process_both) const {
+    assert(base_vstorage_);
+
+    auto base_it = base_vstorage_->GetBlobFileMetaDataLB(first_blob_file);
+    const auto base_it_end = base_vstorage_->GetBlobFiles().end();
+
+    auto mutable_it = mutable_blob_file_metas_.lower_bound(first_blob_file);
+    const auto mutable_it_end = mutable_blob_file_metas_.end();
+
+    while (base_it != base_it_end && mutable_it != mutable_it_end) {
+      const auto& base_meta = *base_it;
+      assert(base_meta);
+
+      const uint64_t base_blob_file_number = base_meta->GetBlobFileNumber();
+      const uint64_t mutable_blob_file_number = mutable_it->first;
+
+      if (base_blob_file_number < mutable_blob_file_number) {
+        if (!process_base(base_meta)) {
+          return;
+        }
+
+        ++base_it;
+      } else if (mutable_blob_file_number < base_blob_file_number) {
+        const auto& mutable_meta = mutable_it->second;
+
+        if (!process_mutable(mutable_meta)) {
+          return;
+        }
+
+        ++mutable_it;
+      } else {
+        assert(base_blob_file_number == mutable_blob_file_number);
+
+        const auto& mutable_meta = mutable_it->second;
+
+        if (!process_both(base_meta, mutable_meta)) {
+          return;
+        }
+
+        ++base_it;
+        ++mutable_it;
+      }
+    }
+
+    while (base_it != base_it_end) {
+      const auto& base_meta = *base_it;
+
+      if (!process_base(base_meta)) {
+        return;
+      }
+
+      ++base_it;
+    }
+
+    while (mutable_it != mutable_it_end) {
+      const auto& mutable_meta = mutable_it->second;
+
+      if (!process_mutable(mutable_meta)) {
+        return;
+      }
+
+      ++mutable_it;
+    }
+  }
+
+  // Helper function template for finding the first blob file that has linked
+  // SSTs.
+  template <typename Meta>
+  static bool CheckLinkedSsts(const Meta& meta,
+                              uint64_t* min_oldest_blob_file_num) {
+    assert(min_oldest_blob_file_num);
+
+    if (!meta.GetLinkedSsts().empty()) {
+      assert(*min_oldest_blob_file_num == kInvalidBlobFileNumber);
+
+      *min_oldest_blob_file_num = meta.GetBlobFileNumber();
+
+      return false;
+    }
+
+    return true;
+  }
+
+  // Find the oldest blob file that has linked SSTs.
+  uint64_t GetMinOldestBlobFileNumber() const {
+    uint64_t min_oldest_blob_file_num = kInvalidBlobFileNumber;
+
+    auto process_base =
+        [&min_oldest_blob_file_num](
+            const std::shared_ptr<BlobFileMetaData>& base_meta) {
+          assert(base_meta);
+
+          return CheckLinkedSsts(*base_meta, &min_oldest_blob_file_num);
+        };
+
+    auto process_mutable = [&min_oldest_blob_file_num](
+                               const MutableBlobFileMetaData& mutable_meta) {
+      return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num);
+    };
+
+    auto process_both = [&min_oldest_blob_file_num](
+                            const std::shared_ptr<BlobFileMetaData>& base_meta,
+                            const MutableBlobFileMetaData& mutable_meta) {
+#ifndef NDEBUG
+      assert(base_meta);
+      assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta());
+#else
+      (void)base_meta;
+#endif
+
+      // Look at mutable_meta since it supersedes *base_meta
+      return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num);
+    };
+
+    MergeBlobFileMetas(kInvalidBlobFileNumber, process_base, process_mutable,
+                       process_both);
+
+    return min_oldest_blob_file_num;
+  }
+
+  static std::shared_ptr<BlobFileMetaData> CreateBlobFileMetaData(
+      const MutableBlobFileMetaData& mutable_meta) {
+    return BlobFileMetaData::Create(
+        mutable_meta.GetSharedMeta(), mutable_meta.GetLinkedSsts(),
+        mutable_meta.GetGarbageBlobCount(), mutable_meta.GetGarbageBlobBytes());
+  }
+
+  // Add the blob file specified by meta to *vstorage if it is determined to
+  // contain valid data (blobs).
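+  // That is, a blob file is only skipped when no SSTs link to it and all of
+  // its blobs are garbage (GetGarbageBlobCount() >= GetTotalBlobCount()).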
+  template <typename Meta>
+  static void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta) {
+    assert(vstorage);
+    assert(meta);
+
+    if (meta->GetLinkedSsts().empty() &&
+        meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) {
+      return;
+    }
+
+    vstorage->AddBlobFile(std::forward<Meta>(meta));
+  }
+
+  // Merge the blob file metadata from the base version with the changes
+  // (edits) applied, and save the result into *vstorage.
+  void SaveBlobFilesTo(VersionStorageInfo* vstorage) const {
+    assert(vstorage);
+
+    assert(base_vstorage_);
+    vstorage->ReserveBlob(base_vstorage_->GetBlobFiles().size() +
+                          mutable_blob_file_metas_.size());
+
+    const uint64_t oldest_blob_file_with_linked_ssts =
+        GetMinOldestBlobFileNumber();
+
+    auto process_base =
+        [vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) {
+          assert(base_meta);
+
+          AddBlobFileIfNeeded(vstorage, base_meta);
+
+          return true;
+        };
+
+    auto process_mutable =
+        [vstorage](const MutableBlobFileMetaData& mutable_meta) {
+          AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
+
+          return true;
+        };
+
+    auto process_both = [vstorage](
+                            const std::shared_ptr<BlobFileMetaData>& base_meta,
+                            const MutableBlobFileMetaData& mutable_meta) {
+      assert(base_meta);
+      assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta());
+
+      if (!mutable_meta.HasDelta()) {
+        assert(base_meta->GetGarbageBlobCount() ==
+               mutable_meta.GetGarbageBlobCount());
+        assert(base_meta->GetGarbageBlobBytes() ==
+               mutable_meta.GetGarbageBlobBytes());
+        assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts());
+
+        AddBlobFileIfNeeded(vstorage, base_meta);
+
+        return true;
+      }
+
+      AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
+
+      return true;
+    };
+
+    MergeBlobFileMetas(oldest_blob_file_with_linked_ssts, process_base,
+                       process_mutable, process_both);
+  }
+
+  void MaybeAddFile(VersionStorageInfo* vstorage, int level,
+                    FileMetaData* f) const {
+    const uint64_t file_number = f->fd.GetNumber();
+
+    const auto& level_state = levels_[level];
+
+    const auto& del_files = level_state.deleted_files;
+    const auto del_it = del_files.find(file_number);
+
+    if (del_it != del_files.end()) {
+      // f is to-be-deleted table file
+      vstorage->RemoveCurrentStats(f);
+    } else {
+      const auto& add_files = level_state.added_files;
+      const auto add_it = add_files.find(file_number);
+
+      // Note: if the file appears both in the base version and in the added
+      // list, the added FileMetaData supersedes the one in the base version.
+      if (add_it != add_files.end() && add_it->second != f) {
+        vstorage->RemoveCurrentStats(f);
+      } else {
+        vstorage->AddFile(level, f);
+      }
+    }
+  }
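+
+  // (MaybeAddFile above thus implements: deleted by an edit -> drop and
+  // update stats; superseded by a re-added FileMetaData for the same file
+  // number -> drop the stale object; otherwise -> keep the file in *vstorage.)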
+
+  template <typename Cmp>
+  void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level, Cmp cmp) const {
+    // Merge the set of added files with the set of pre-existing files.
+    // Drop any deleted files. Store the result in *vstorage.
+    const auto& base_files = base_vstorage_->LevelFiles(level);
+    const auto& unordered_added_files = levels_[level].added_files;
+    vstorage->Reserve(level, base_files.size() + unordered_added_files.size());
+
+    // Sort added files for the level.
+    std::vector<FileMetaData*> added_files;
+    added_files.reserve(unordered_added_files.size());
+    for (const auto& pair : unordered_added_files) {
+      added_files.push_back(pair.second);
+    }
+    std::sort(added_files.begin(), added_files.end(), cmp);
+
+    auto base_iter = base_files.begin();
+    auto base_end = base_files.end();
+    auto added_iter = added_files.begin();
+    auto added_end = added_files.end();
+    while (added_iter != added_end || base_iter != base_end) {
+      if (base_iter == base_end ||
+          (added_iter != added_end && cmp(*added_iter, *base_iter))) {
+        MaybeAddFile(vstorage, level, *added_iter++);
+      } else {
+        MaybeAddFile(vstorage, level, *base_iter++);
+      }
+    }
+  }
+
+  bool PromoteEpochNumberRequirementIfNeeded(
+      VersionStorageInfo* vstorage) const {
+    if (vstorage->HasMissingEpochNumber()) {
+      return false;
+    }
+
+    for (int level = 0; level < num_levels_; ++level) {
+      for (const auto& pair : levels_[level].added_files) {
+        const FileMetaData* f = pair.second;
+        if (f->epoch_number == kUnknownEpochNumber) {
+          return false;
+        }
+      }
+    }
+
+    vstorage->SetEpochNumberRequirement(EpochNumberRequirement::kMustPresent);
+    return true;
+  }
+
+  void SaveSSTFilesTo(VersionStorageInfo* vstorage) const {
+    assert(vstorage);
+
+    if (!num_levels_) {
+      return;
+    }
+
+    EpochNumberRequirement epoch_number_requirement =
+        vstorage->GetEpochNumberRequirement();
+
+    if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) {
+      bool promoted = PromoteEpochNumberRequirementIfNeeded(vstorage);
+      if (promoted) {
+        epoch_number_requirement = vstorage->GetEpochNumberRequirement();
+      }
+    }
+
+    if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) {
+      SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_seqno_);
+    } else {
+      SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_epochno_);
+    }
+
+    for (int level = 1; level < num_levels_; ++level) {
+      SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_);
+    }
+  }
+
+  void SaveCompactCursorsTo(VersionStorageInfo* vstorage) const {
+    for (auto iter = updated_compact_cursors_.begin();
+         iter != updated_compact_cursors_.end(); iter++) {
+      vstorage->AddCursorForOneLevel(iter->first, iter->second);
+    }
+  }
+
+  // Save the current state in *vstorage.
+  Status SaveTo(VersionStorageInfo* vstorage) const {
+    Status s;
+
+#ifndef NDEBUG
+    // The same check is done within Apply() so we skip it in release mode.
+    s = CheckConsistency(base_vstorage_);
+    if (!s.ok()) {
+      return s;
+    }
+#endif  // NDEBUG
+
+    s = CheckConsistency(vstorage);
+    if (!s.ok()) {
+      return s;
+    }
+
+    SaveSSTFilesTo(vstorage);
+
+    SaveBlobFilesTo(vstorage);
+
+    SaveCompactCursorsTo(vstorage);
+
+    s = CheckConsistency(vstorage);
+    return s;
+  }
+
+  Status LoadTableHandlers(
+      InternalStats* internal_stats, int max_threads,
+      bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
+      uint8_t block_protection_bytes_per_key) {
+    assert(table_cache_ != nullptr);
+
+    size_t table_cache_capacity =
+        table_cache_->get_cache().get()->GetCapacity();
+    bool always_load = (table_cache_capacity == TableCache::kInfiniteCapacity);
+    size_t max_load = std::numeric_limits<size_t>::max();
+
+    if (!always_load) {
+      // If it is initial loading and not set to always loading all the
+      // files, we only load up to kInitialLoadLimit files, to limit the
+      // time reopening the DB.
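+      // Illustrative arithmetic (numbers assumed, not from upstream): with a
+      // table cache capacity of 64, load_limit is min(16, 64 / 4) = 16 on an
+      // initial load and 64 / 4 = 16 otherwise; with 10 handles already in
+      // use, at most 16 - 10 = 6 new table readers are opened here.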
+      const size_t kInitialLoadLimit = 16;
+      size_t load_limit;
+      // If the table cache is not 1/4 full, we pin the table handle to
+      // file metadata to avoid the cache read costs when reading the file.
+      // The downside of pinning those files is that LRU won't be followed
+      // for those files. This doesn't matter much because if number of files
+      // of the DB exceeds table cache capacity, eventually no table reader
+      // will be pinned and LRU will be followed.
+      if (is_initial_load) {
+        load_limit = std::min(kInitialLoadLimit, table_cache_capacity / 4);
+      } else {
+        load_limit = table_cache_capacity / 4;
+      }
+
+      size_t table_cache_usage = table_cache_->get_cache().get()->GetUsage();
+      if (table_cache_usage >= load_limit) {
+        // TODO (yanqin) find a suitable status code.
+        return Status::OK();
+      } else {
+        max_load = load_limit - table_cache_usage;
+      }
+    }
+
+    // <file metadata, level>
+    std::vector<std::pair<FileMetaData*, int>> files_meta;
+    std::vector<Status> statuses;
+    for (int level = 0; level < num_levels_; level++) {
+      for (auto& file_meta_pair : levels_[level].added_files) {
+        auto* file_meta = file_meta_pair.second;
+        // If the file has been opened before, just skip it.
+        if (!file_meta->table_reader_handle) {
+          files_meta.emplace_back(file_meta, level);
+          statuses.emplace_back(Status::OK());
+        }
+        if (files_meta.size() >= max_load) {
+          break;
+        }
+      }
+      if (files_meta.size() >= max_load) {
+        break;
+      }
+    }
+
+    std::atomic<size_t> next_file_meta_idx(0);
+    std::function<void()> load_handlers_func([&]() {
+      while (true) {
+        size_t file_idx = next_file_meta_idx.fetch_add(1);
+        if (file_idx >= files_meta.size()) {
+          break;
+        }
+
+        auto* file_meta = files_meta[file_idx].first;
+        int level = files_meta[file_idx].second;
+        TableCache::TypedHandle* handle = nullptr;
+        statuses[file_idx] = table_cache_->FindTable(
+            read_options, file_options_,
+            *(base_vstorage_->InternalComparator()), *file_meta, &handle,
+            block_protection_bytes_per_key, prefix_extractor,
+            false /* no_io */, true /* record_read_stats */,
+            internal_stats->GetFileReadHist(level), false, level,
+            prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin,
+            file_meta->temperature);
+        if (handle != nullptr) {
+          file_meta->table_reader_handle = handle;
+          // Load table_reader
+          file_meta->fd.table_reader = table_cache_->get_cache().Value(handle);
+        }
+      }
+    });
+
+    std::vector<port::Thread> threads;
+    for (int i = 1; i < max_threads; i++) {
+      threads.emplace_back(load_handlers_func);
+    }
+    load_handlers_func();
+    for (auto& t : threads) {
+      t.join();
+    }
+    Status ret;
+    for (const auto& s : statuses) {
+      if (!s.ok()) {
+        if (ret.ok()) {
+          ret = s;
+        }
+      }
+    }
+    return ret;
+  }
+};
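+
+// The loop in Rep::LoadTableHandlers above hands out work via a shared atomic
+// counter: each of max_threads workers repeatedly claims the next index with
+// fetch_add(1) until the vector is exhausted. A minimal standalone sketch of
+// the same pattern (illustrative only, not RocksDB code):
+//
+//   std::vector<std::function<void()>> tasks = /* work items (assumed) */;
+//   std::atomic<size_t> next{0};
+//   auto worker = [&] {
+//     for (size_t i = next.fetch_add(1); i < tasks.size();
+//          i = next.fetch_add(1)) {
+//       tasks[i]();
+//     }
+//   };
+//   // run `worker` on max_threads - 1 extra threads, then join
+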
+VersionBuilder::VersionBuilder(
+    const FileOptions& file_options, const ImmutableCFOptions* ioptions,
+    TableCache* table_cache, VersionStorageInfo* base_vstorage,
+    VersionSet* version_set,
+    std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
+    : rep_(new Rep(file_options, ioptions, table_cache, base_vstorage,
+                   version_set, file_metadata_cache_res_mgr)) {}
+
+VersionBuilder::~VersionBuilder() = default;
+
+bool VersionBuilder::CheckConsistencyForNumLevels() {
+  return rep_->CheckConsistencyForNumLevels();
+}
+
+Status VersionBuilder::Apply(const VersionEdit* edit) {
+  return rep_->Apply(edit);
+}
+
+Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) const {
+  return rep_->SaveTo(vstorage);
+}
+
+Status VersionBuilder::LoadTableHandlers(
+    InternalStats* internal_stats, int max_threads,
+    bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
+    uint8_t block_protection_bytes_per_key) {
+  return rep_->LoadTableHandlers(
+      internal_stats, max_threads, prefetch_index_and_filter_in_cache,
+      is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin,
+      read_options, block_protection_bytes_per_key);
+}
+
+uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const {
+  return rep_->GetMinOldestBlobFileNumber();
+}
+
+BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
+    ColumnFamilyData* cfd)
+    : version_builder_(new VersionBuilder(
+          cfd->current()->version_set()->file_options(), cfd->ioptions(),
+          cfd->table_cache(), cfd->current()->storage_info(),
+          cfd->current()->version_set(),
+          cfd->GetFileMetadataCacheReservationManager())),
+      version_(cfd->current()) {
+  version_->Ref();
+}
+
+BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
+    ColumnFamilyData* cfd, Version* v)
+    : version_builder_(new VersionBuilder(
+          cfd->current()->version_set()->file_options(), cfd->ioptions(),
+          cfd->table_cache(), v->storage_info(), v->version_set(),
+          cfd->GetFileMetadataCacheReservationManager())),
+      version_(v) {
+  assert(version_ != cfd->current());
+}
+
+BaseReferencedVersionBuilder::~BaseReferencedVersionBuilder() {
+  version_->Unref();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_builder.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_builder.h
new file mode 100644
index 0000000000..fb2a304a84
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_builder.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+
+#include <memory>
+
+#include "db/version_edit.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ImmutableCFOptions;
+class TableCache;
+class VersionStorageInfo;
+class VersionEdit;
+struct FileMetaData;
+class InternalStats;
+class Version;
+class VersionSet;
+class ColumnFamilyData;
+class CacheReservationManager;
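+
+// Rough usage sketch (assumed from the interface below, not upstream docs):
+//
+//   VersionBuilder builder(file_opts, &ioptions, table_cache,
+//                          base_version_storage_info, version_set);
+//   Status s = builder.Apply(&edit);            // repeat per VersionEdit
+//   if (s.ok()) s = builder.SaveTo(&new_vstorage);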
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionBuilder {
+ public:
+  VersionBuilder(const FileOptions& file_options,
+                 const ImmutableCFOptions* ioptions, TableCache* table_cache,
+                 VersionStorageInfo* base_vstorage, VersionSet* version_set,
+                 std::shared_ptr<CacheReservationManager>
+                     file_metadata_cache_res_mgr = nullptr);
+  ~VersionBuilder();
+
+  bool CheckConsistencyForNumLevels();
+  Status Apply(const VersionEdit* edit);
+  Status SaveTo(VersionStorageInfo* vstorage) const;
+  Status LoadTableHandlers(
+      InternalStats* internal_stats, int max_threads,
+      bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
+      uint8_t block_protection_bytes_per_key);
+  uint64_t GetMinOldestBlobFileNumber() const;
+
+ private:
+  class Rep;
+  std::unique_ptr<Rep> rep_;
+};
+
+// A wrapper of version builder which references the current version in
+// constructor and unref it in the destructor.
+// Both of the constructor and destructor need to be called inside DB Mutex.
+class BaseReferencedVersionBuilder {
+ public:
+  explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd);
+  BaseReferencedVersionBuilder(ColumnFamilyData* cfd, Version* v);
+  ~BaseReferencedVersionBuilder();
+  VersionBuilder* version_builder() const { return version_builder_.get(); }
+
+ private:
+  std::unique_ptr<VersionBuilder> version_builder_;
+  Version* version_;
+};
+
+class NewestFirstBySeqNo {
+ public:
+  bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
+    assert(lhs);
+    assert(rhs);
+
+    if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) {
+      return lhs->fd.largest_seqno > rhs->fd.largest_seqno;
+    }
+
+    if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) {
+      return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno;
+    }
+
+    // Break ties by file number
+    return lhs->fd.GetNumber() > rhs->fd.GetNumber();
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit.cc
new file mode 100644
index 0000000000..4f1ae80d21
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit.cc
@@ -0,0 +1,1059 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "db/version_edit.h" + +#include "db/blob/blob_index.h" +#include "db/version_set.h" +#include "logging/event_logger.h" +#include "rocksdb/slice.h" +#include "table/unique_id_impl.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace {} // anonymous namespace + +uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) { + assert(number <= kFileNumberMask); + return number | (path_id * (kFileNumberMask + 1)); +} + +Status FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value, + SequenceNumber seqno, + ValueType value_type) { + if (value_type == kTypeBlobIndex) { + BlobIndex blob_index; + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + return s; + } + + if (!blob_index.IsInlined() && !blob_index.HasTTL()) { + if (blob_index.file_number() == kInvalidBlobFileNumber) { + return Status::Corruption("Invalid blob file number"); + } + + if (oldest_blob_file_number == kInvalidBlobFileNumber || + oldest_blob_file_number > blob_index.file_number()) { + oldest_blob_file_number = blob_index.file_number(); + } + } + } + + if (smallest.size() == 0) { + smallest.DecodeFrom(key); + } + largest.DecodeFrom(key); + fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); + fd.largest_seqno = std::max(fd.largest_seqno, seqno); + + return Status::OK(); +} + +void VersionEdit::Clear() { + max_level_ = 0; + db_id_.clear(); + comparator_.clear(); + log_number_ = 0; + prev_log_number_ = 0; + next_file_number_ = 0; + max_column_family_ = 0; + min_log_number_to_keep_ = 0; + last_sequence_ = 0; + has_db_id_ = false; + has_comparator_ = false; + has_log_number_ = false; + has_prev_log_number_ = false; + has_next_file_number_ = false; + has_max_column_family_ = false; + has_min_log_number_to_keep_ = false; + has_last_sequence_ = false; + compact_cursors_.clear(); + deleted_files_.clear(); + new_files_.clear(); + blob_file_additions_.clear(); + blob_file_garbages_.clear(); + wal_additions_.clear(); + wal_deletion_.Reset(); + column_family_ = 0; + is_column_family_add_ = false; + is_column_family_drop_ = false; + column_family_name_.clear(); + is_in_atomic_group_ = false; + remaining_entries_ = 0; + full_history_ts_low_.clear(); +} + +bool VersionEdit::EncodeTo(std::string* dst) const { + if (has_db_id_) { + PutVarint32(dst, kDbId); + PutLengthPrefixedSlice(dst, db_id_); + } + if (has_comparator_) { + PutVarint32(dst, kComparator); + PutLengthPrefixedSlice(dst, comparator_); + } + if (has_log_number_) { + PutVarint32Varint64(dst, kLogNumber, log_number_); + } + if (has_prev_log_number_) { + PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_); + } + if (has_next_file_number_) { + PutVarint32Varint64(dst, kNextFileNumber, next_file_number_); + } + if (has_max_column_family_) { + PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_); + } + if (has_min_log_number_to_keep_) { + PutVarint32Varint64(dst, kMinLogNumberToKeep, min_log_number_to_keep_); + } + if (has_last_sequence_) { + PutVarint32Varint64(dst, kLastSequence, last_sequence_); + } + for (size_t i = 0; i < compact_cursors_.size(); i++) { + if (compact_cursors_[i].second.Valid()) { + PutVarint32(dst, kCompactCursor); + PutVarint32(dst, compact_cursors_[i].first); // level + PutLengthPrefixedSlice(dst, compact_cursors_[i].second.Encode()); + } + } + for (const auto& deleted : deleted_files_) { + PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */, + deleted.second /* file number */); + } + + bool 
min_log_num_written = false;
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    if (!f.smallest.Valid() || !f.largest.Valid() ||
+        f.epoch_number == kUnknownEpochNumber) {
+      return false;
+    }
+    PutVarint32(dst, kNewFile4);
+    PutVarint32Varint64(dst, new_files_[i].first /* level */,
+                        f.fd.GetNumber());
+    PutVarint64(dst, f.fd.GetFileSize());
+    PutLengthPrefixedSlice(dst, f.smallest.Encode());
+    PutLengthPrefixedSlice(dst, f.largest.Encode());
+    PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
+    // Customized fields' format:
+    // +-----------------------------+
+    // | 1st field's tag (varint32)  |
+    // +-----------------------------+
+    // | 1st field's size (varint32) |
+    // +-----------------------------+
+    // | bytes for 1st field         |
+    // | (based on size decoded)     |
+    // +-----------------------------+
+    // |                             |
+    // | ......                      |
+    // |                             |
+    // +-----------------------------+
+    // | last field's size (varint32)|
+    // +-----------------------------+
+    // | bytes for last field        |
+    // | (based on size decoded)     |
+    // +-----------------------------+
+    // | terminating tag (varint32)  |
+    // +-----------------------------+
+    //
+    // Customized encoding for fields:
+    //   tag kPathId: 1 byte as path_id
+    //   tag kNeedCompaction:
+    //        now only can take one char value 1 indicating need-compaction
+    //
+    PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime);
+    std::string varint_oldest_ancester_time;
+    PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
+    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
+                             &varint_oldest_ancester_time);
+    PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
+
+    PutVarint32(dst, NewFileCustomTag::kFileCreationTime);
+    std::string varint_file_creation_time;
+    PutVarint64(&varint_file_creation_time, f.file_creation_time);
+    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
+                             &varint_file_creation_time);
+    PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
+
+    PutVarint32(dst, NewFileCustomTag::kEpochNumber);
+    std::string varint_epoch_number;
+    PutVarint64(&varint_epoch_number, f.epoch_number);
+    PutLengthPrefixedSlice(dst, Slice(varint_epoch_number));
+
+    PutVarint32(dst, NewFileCustomTag::kFileChecksum);
+    PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
+
+    PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName);
+    PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
+
+    if (f.fd.GetPathId() != 0) {
+      PutVarint32(dst, NewFileCustomTag::kPathId);
+      char p = static_cast<char>(f.fd.GetPathId());
+      PutLengthPrefixedSlice(dst, Slice(&p, 1));
+    }
+    if (f.temperature != Temperature::kUnknown) {
+      PutVarint32(dst, NewFileCustomTag::kTemperature);
+      char p = static_cast<char>(f.temperature);
+      PutLengthPrefixedSlice(dst, Slice(&p, 1));
+    }
+    if (f.marked_for_compaction) {
+      PutVarint32(dst, NewFileCustomTag::kNeedCompaction);
+      char p = static_cast<char>(1);
+      PutLengthPrefixedSlice(dst, Slice(&p, 1));
+    }
+    if (has_min_log_number_to_keep_ && !min_log_num_written) {
+      PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack);
+      std::string varint_log_number;
+      PutFixed64(&varint_log_number, min_log_number_to_keep_);
+      PutLengthPrefixedSlice(dst, Slice(varint_log_number));
+      min_log_num_written = true;
+    }
+    if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+      PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber);
+      std::string oldest_blob_file_number;
+      PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
+ PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number)); + } + UniqueId64x2 unique_id = f.unique_id; + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id); + if (unique_id != kNullUniqueId64x2) { + PutVarint32(dst, NewFileCustomTag::kUniqueId); + std::string unique_id_str = EncodeUniqueIdBytes(&unique_id); + PutLengthPrefixedSlice(dst, Slice(unique_id_str)); + } + if (f.compensated_range_deletion_size) { + PutVarint32(dst, kCompensatedRangeDeletionSize); + std::string compensated_range_deletion_size; + PutVarint64(&compensated_range_deletion_size, + f.compensated_range_deletion_size); + PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size)); + } + if (f.tail_size) { + PutVarint32(dst, NewFileCustomTag::kTailSize); + std::string varint_tail_size; + PutVarint64(&varint_tail_size, f.tail_size); + PutLengthPrefixedSlice(dst, Slice(varint_tail_size)); + } + + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", + dst); + + PutVarint32(dst, NewFileCustomTag::kTerminate); + } + + for (const auto& blob_file_addition : blob_file_additions_) { + PutVarint32(dst, kBlobFileAddition); + blob_file_addition.EncodeTo(dst); + } + + for (const auto& blob_file_garbage : blob_file_garbages_) { + PutVarint32(dst, kBlobFileGarbage); + blob_file_garbage.EncodeTo(dst); + } + + for (const auto& wal_addition : wal_additions_) { + PutVarint32(dst, kWalAddition2); + std::string encoded; + wal_addition.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); + } + + if (!wal_deletion_.IsEmpty()) { + PutVarint32(dst, kWalDeletion2); + std::string encoded; + wal_deletion_.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); + } + + // 0 is default and does not need to be explicitly written + if (column_family_ != 0) { + PutVarint32Varint32(dst, kColumnFamily, column_family_); + } + + if (is_column_family_add_) { + PutVarint32(dst, kColumnFamilyAdd); + PutLengthPrefixedSlice(dst, Slice(column_family_name_)); + } + + if (is_column_family_drop_) { + PutVarint32(dst, kColumnFamilyDrop); + } + + if (is_in_atomic_group_) { + PutVarint32(dst, kInAtomicGroup); + PutVarint32(dst, remaining_entries_); + } + + if (HasFullHistoryTsLow()) { + PutVarint32(dst, kFullHistoryTsLow); + PutLengthPrefixedSlice(dst, full_history_ts_low_); + } + return true; +} + +static bool GetInternalKey(Slice* input, InternalKey* dst) { + Slice str; + if (GetLengthPrefixedSlice(input, &str)) { + dst->DecodeFrom(str); + return dst->Valid(); + } else { + return false; + } +} + +bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) { + uint32_t v = 0; + if (GetVarint32(input, &v)) { + *level = v; + if (max_level_ < *level) { + max_level_ = *level; + } + return true; + } else { + return false; + } +} + +const char* VersionEdit::DecodeNewFile4From(Slice* input) { + const char* msg = nullptr; + int level = 0; + FileMetaData f; + uint64_t number = 0; + uint32_t path_id = 0; + uint64_t file_size = 0; + SequenceNumber smallest_seqno = 0; + SequenceNumber largest_seqno = kMaxSequenceNumber; + if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) && + GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) && + GetInternalKey(input, &f.largest) && + GetVarint64(input, &smallest_seqno) && + GetVarint64(input, &largest_seqno)) { + // See comments in VersionEdit::EncodeTo() for format of customized fields + while (true) { + uint32_t custom_tag = 0; + Slice field; + if (!GetVarint32(input, &custom_tag)) { + return "new-file4 custom field"; + } + if 
(custom_tag == kTerminate) {
+        break;
+      }
+      if (!GetLengthPrefixedSlice(input, &field)) {
+        return "new-file4 custom field length prefixed slice error";
+      }
+      switch (custom_tag) {
+        case kPathId:
+          if (field.size() != 1) {
+            return "path_id field wrong size";
+          }
+          path_id = field[0];
+          if (path_id > 3) {
+            return "path_id wrong value";
+          }
+          break;
+        case kOldestAncesterTime:
+          if (!GetVarint64(&field, &f.oldest_ancester_time)) {
+            return "invalid oldest ancester time";
+          }
+          break;
+        case kFileCreationTime:
+          if (!GetVarint64(&field, &f.file_creation_time)) {
+            return "invalid file creation time";
+          }
+          break;
+        case kEpochNumber:
+          if (!GetVarint64(&field, &f.epoch_number)) {
+            return "invalid epoch number";
+          }
+          break;
+        case kFileChecksum:
+          f.file_checksum = field.ToString();
+          break;
+        case kFileChecksumFuncName:
+          f.file_checksum_func_name = field.ToString();
+          break;
+        case kNeedCompaction:
+          if (field.size() != 1) {
+            return "need_compaction field wrong size";
+          }
+          f.marked_for_compaction = (field[0] == 1);
+          break;
+        case kMinLogNumberToKeepHack:
+          // This is a hack to encode kMinLogNumberToKeep in a
+          // forward-compatible fashion.
+          if (!GetFixed64(&field, &min_log_number_to_keep_)) {
+            return "deleted log number malformatted";
+          }
+          has_min_log_number_to_keep_ = true;
+          break;
+        case kOldestBlobFileNumber:
+          if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
+            return "invalid oldest blob file number";
+          }
+          break;
+        case kTemperature:
+          if (field.size() != 1) {
+            return "temperature field wrong size";
+          } else {
+            Temperature casted_field = static_cast<Temperature>(field[0]);
+            if (casted_field <= Temperature::kCold) {
+              f.temperature = casted_field;
+            }
+          }
+          break;
+        case kUniqueId:
+          if (!DecodeUniqueIdBytes(field.ToString(), &f.unique_id).ok()) {
+            f.unique_id = kNullUniqueId64x2;
+            return "invalid unique id";
+          }
+          break;
+        case kCompensatedRangeDeletionSize:
+          if (!GetVarint64(&field, &f.compensated_range_deletion_size)) {
+            return "Invalid compensated range deletion size";
+          }
+          break;
+        case kTailSize:
+          if (!GetVarint64(&field, &f.tail_size)) {
+            return "invalid tail start offset";
+          }
+          break;
+        default:
+          if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
+            // Should not proceed if cannot understand it
+            return "new-file4 custom field not supported";
+          }
+          break;
+      }
+    }
+  } else {
+    return "new-file4 entry";
+  }
+  f.fd =
+      FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
+  new_files_.push_back(std::make_pair(level, f));
+  return nullptr;
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+  Clear();
+#ifndef NDEBUG
+  bool ignore_ignorable_tags = false;
+  TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:IgnoreIgnorableTags",
+                           &ignore_ignorable_tags);
+#endif
+  Slice input = src;
+  const char* msg = nullptr;
+  uint32_t tag = 0;
+
+  // Temporary storage for parsing
+  int level = 0;
+  FileMetaData f;
+  Slice str;
+  InternalKey key;
+  while (msg == nullptr && GetVarint32(&input, &tag)) {
+#ifndef NDEBUG
+    if (ignore_ignorable_tags && tag > kTagSafeIgnoreMask) {
+      tag = kTagSafeIgnoreMask;
+    }
+#endif
+    switch (tag) {
+      case kDbId:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          db_id_ = str.ToString();
+          has_db_id_ = true;
+        } else {
+          msg = "db id";
+        }
+        break;
+      case kComparator:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          comparator_ = str.ToString();
+          has_comparator_ = true;
+        } else {
+          msg = "comparator name";
+        }
+        break;
+
+      case kLogNumber:
+        if (GetVarint64(&input, &log_number_)) {
+          has_log_number_ = true;
+        } else {
+          msg = "log number";
+        }
+        break;
+
+      case kPrevLogNumber:
+        if (GetVarint64(&input, &prev_log_number_)) {
+          has_prev_log_number_ = true;
+        } else {
+          msg = "previous log number";
+        }
+        break;
+
+      case kNextFileNumber:
+        if (GetVarint64(&input, &next_file_number_)) {
+          has_next_file_number_ = true;
+        } else {
+          msg = "next file number";
+        }
+        break;
+
+      case kMaxColumnFamily:
+        if (GetVarint32(&input, &max_column_family_)) {
+          has_max_column_family_ = true;
+        } else {
+          msg = "max column family";
+        }
+        break;
+
+      case kMinLogNumberToKeep:
+        if (GetVarint64(&input, &min_log_number_to_keep_)) {
+          has_min_log_number_to_keep_ = true;
+        } else {
+          msg = "min log number to keep";
+        }
+        break;
+
+      case kLastSequence:
+        if (GetVarint64(&input, &last_sequence_)) {
+          has_last_sequence_ = true;
+        } else {
+          msg = "last sequence number";
+        }
+        break;
+
+      case kCompactCursor:
+        if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) {
+          // Here we re-use the output format of compact pointer in LevelDB
+          // to persist compact_cursors_
+          compact_cursors_.push_back(std::make_pair(level, key));
+        } else {
+          if (!msg) {
+            msg = "compaction cursor";
+          }
+        }
+        break;
+
+      case kDeletedFile: {
+        uint64_t number = 0;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
+          deleted_files_.insert(std::make_pair(level, number));
+        } else {
+          if (!msg) {
+            msg = "deleted file";
+          }
+        }
+        break;
+      }
+
+      case kNewFile: {
+        uint64_t number = 0;
+        uint64_t file_size = 0;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+            GetVarint64(&input, &file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest)) {
+          f.fd = FileDescriptor(number, 0, file_size);
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file entry";
+          }
+        }
+        break;
+      }
+      case kNewFile2: {
+        uint64_t number = 0;
+        uint64_t file_size = 0;
+        SequenceNumber smallest_seqno = 0;
+        SequenceNumber largest_seqno = kMaxSequenceNumber;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+            GetVarint64(&input, &file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest) &&
+            GetVarint64(&input, &smallest_seqno) &&
+            GetVarint64(&input, &largest_seqno)) {
+          f.fd = FileDescriptor(number, 0, file_size, smallest_seqno,
+                                largest_seqno);
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file2 entry";
+          }
+        }
+        break;
+      }
+
+      case kNewFile3: {
+        uint64_t number = 0;
+        uint32_t path_id = 0;
+        uint64_t file_size = 0;
+        SequenceNumber smallest_seqno = 0;
+        SequenceNumber largest_seqno = kMaxSequenceNumber;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+            GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest) &&
+            GetVarint64(&input, &smallest_seqno) &&
+            GetVarint64(&input, &largest_seqno)) {
+          f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno,
+                                largest_seqno);
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file3 entry";
+          }
+        }
+        break;
+      }
+
+      case kNewFile4: {
+        msg = DecodeNewFile4From(&input);
+        break;
+      }
+
+      case kBlobFileAddition:
+      case kBlobFileAddition_DEPRECATED: {
+        BlobFileAddition blob_file_addition;
+        const Status s = blob_file_addition.DecodeFrom(&input);
+        if (!s.ok()) {
+          return s;
+        }
+
+        AddBlobFile(std::move(blob_file_addition));
+        break;
+      }
+
+      case kBlobFileGarbage:
+      case
kBlobFileGarbage_DEPRECATED: { + BlobFileGarbage blob_file_garbage; + const Status s = blob_file_garbage.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + AddBlobFileGarbage(std::move(blob_file_garbage)); + break; + } + + case kWalAddition: { + WalAddition wal_addition; + const Status s = wal_addition.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + wal_additions_.emplace_back(std::move(wal_addition)); + break; + } + + case kWalAddition2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalAddition not prefixed by length"; + break; + } + + WalAddition wal_addition; + const Status s = wal_addition.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_additions_.emplace_back(std::move(wal_addition)); + break; + } + + case kWalDeletion: { + WalDeletion wal_deletion; + const Status s = wal_deletion.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + wal_deletion_ = std::move(wal_deletion); + break; + } + + case kWalDeletion2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalDeletion not prefixed by length"; + break; + } + + WalDeletion wal_deletion; + const Status s = wal_deletion.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_deletion_ = std::move(wal_deletion); + break; + } + + case kColumnFamily: + if (!GetVarint32(&input, &column_family_)) { + if (!msg) { + msg = "set column family id"; + } + } + break; + + case kColumnFamilyAdd: + if (GetLengthPrefixedSlice(&input, &str)) { + is_column_family_add_ = true; + column_family_name_ = str.ToString(); + } else { + if (!msg) { + msg = "column family add"; + } + } + break; + + case kColumnFamilyDrop: + is_column_family_drop_ = true; + break; + + case kInAtomicGroup: + is_in_atomic_group_ = true; + if (!GetVarint32(&input, &remaining_entries_)) { + if (!msg) { + msg = "remaining entries"; + } + } + break; + + case kFullHistoryTsLow: + if (!GetLengthPrefixedSlice(&input, &str)) { + msg = "full_history_ts_low"; + } else if (str.empty()) { + msg = "full_history_ts_low: empty"; + } else { + full_history_ts_low_.assign(str.data(), str.size()); + } + break; + + default: + if (tag & kTagSafeIgnoreMask) { + // Tag from future which can be safely ignored. + // The next field must be the length of the entry. 
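+          // E.g. a record carrying some future tag F (with kTagSafeIgnoreMask
+          // set) arrives as varint32(F), varint32(len), then len payload
+          // bytes, so this branch can skip the payload wholesale.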
+          uint32_t field_len;
+          if (!GetVarint32(&input, &field_len) ||
+              static_cast<uint64_t>(field_len) > input.size()) {
+            if (!msg) {
+              msg = "safely ignorable tag length error";
+            }
+          } else {
+            input.remove_prefix(static_cast<size_t>(field_len));
+          }
+        } else {
+          msg = "unknown tag";
+        }
+        break;
+    }
+  }
+
+  if (msg == nullptr && !input.empty()) {
+    msg = "invalid tag";
+  }
+
+  Status result;
+  if (msg != nullptr) {
+    result = Status::Corruption("VersionEdit", msg);
+  }
+  return result;
+}
+
+std::string VersionEdit::DebugString(bool hex_key) const {
+  std::string r;
+  r.append("VersionEdit {");
+  if (has_db_id_) {
+    r.append("\n  DB ID: ");
+    r.append(db_id_);
+  }
+  if (has_comparator_) {
+    r.append("\n  Comparator: ");
+    r.append(comparator_);
+  }
+  if (has_log_number_) {
+    r.append("\n  LogNumber: ");
+    AppendNumberTo(&r, log_number_);
+  }
+  if (has_prev_log_number_) {
+    r.append("\n  PrevLogNumber: ");
+    AppendNumberTo(&r, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    r.append("\n  NextFileNumber: ");
+    AppendNumberTo(&r, next_file_number_);
+  }
+  if (has_max_column_family_) {
+    r.append("\n  MaxColumnFamily: ");
+    AppendNumberTo(&r, max_column_family_);
+  }
+  if (has_min_log_number_to_keep_) {
+    r.append("\n  MinLogNumberToKeep: ");
+    AppendNumberTo(&r, min_log_number_to_keep_);
+  }
+  if (has_last_sequence_) {
+    r.append("\n  LastSeq: ");
+    AppendNumberTo(&r, last_sequence_);
+  }
+  for (const auto& level_and_compact_cursor : compact_cursors_) {
+    r.append("\n  CompactCursor: ");
+    AppendNumberTo(&r, level_and_compact_cursor.first);
+    r.append(" ");
+    r.append(level_and_compact_cursor.second.DebugString(hex_key));
+  }
+  for (const auto& deleted_file : deleted_files_) {
+    r.append("\n  DeleteFile: ");
+    AppendNumberTo(&r, deleted_file.first);
+    r.append(" ");
+    AppendNumberTo(&r, deleted_file.second);
+  }
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    r.append("\n  AddFile: ");
+    AppendNumberTo(&r, new_files_[i].first);
+    r.append(" ");
+    AppendNumberTo(&r, f.fd.GetNumber());
+    r.append(" ");
+    AppendNumberTo(&r, f.fd.GetFileSize());
+    r.append(" ");
+    r.append(f.smallest.DebugString(hex_key));
+    r.append(" .. ");
+    r.append(f.largest.DebugString(hex_key));
+    if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+      r.append(" blob_file:");
+      AppendNumberTo(&r, f.oldest_blob_file_number);
+    }
+    r.append(" oldest_ancester_time:");
+    AppendNumberTo(&r, f.oldest_ancester_time);
+    r.append(" file_creation_time:");
+    AppendNumberTo(&r, f.file_creation_time);
+    r.append(" epoch_number:");
+    AppendNumberTo(&r, f.epoch_number);
+    r.append(" file_checksum:");
+    r.append(Slice(f.file_checksum).ToString(true));
+    r.append(" file_checksum_func_name: ");
+    r.append(f.file_checksum_func_name);
+    if (f.temperature != Temperature::kUnknown) {
+      r.append(" temperature: ");
+      // Maybe change to human readable format when the feature becomes
+      // permanent
+      r.append(std::to_string(static_cast<int>(f.temperature)));
+    }
+    if (f.unique_id != kNullUniqueId64x2) {
+      r.append(" unique_id(internal): ");
+      UniqueId64x2 id = f.unique_id;
+      r.append(InternalUniqueIdToHumanString(&id));
+      r.append(" public_unique_id: ");
+      InternalUniqueIdToExternal(&id);
+      r.append(UniqueIdToHumanString(EncodeUniqueIdBytes(&id)));
+    }
+    r.append(" tail size:");
+    AppendNumberTo(&r, f.tail_size);
+  }
+
+  for (const auto& blob_file_addition : blob_file_additions_) {
+    r.append("\n  BlobFileAddition: ");
+    r.append(blob_file_addition.DebugString());
+  }
+
+  for (const auto& blob_file_garbage : blob_file_garbages_) {
+    r.append("\n  BlobFileGarbage: ");
+    r.append(blob_file_garbage.DebugString());
+  }
+
+  for (const auto& wal_addition : wal_additions_) {
+    r.append("\n  WalAddition: ");
+    r.append(wal_addition.DebugString());
+  }
+
+  if (!wal_deletion_.IsEmpty()) {
+    r.append("\n  WalDeletion: ");
+    r.append(wal_deletion_.DebugString());
+  }
+
+  r.append("\n  ColumnFamily: ");
+  AppendNumberTo(&r, column_family_);
+  if (is_column_family_add_) {
+    r.append("\n  ColumnFamilyAdd: ");
+    r.append(column_family_name_);
+  }
+  if (is_column_family_drop_) {
+    r.append("\n  ColumnFamilyDrop");
+  }
+  if (is_in_atomic_group_) {
+    r.append("\n  AtomicGroup: ");
+    AppendNumberTo(&r, remaining_entries_);
+    r.append(" entries remains");
+  }
+  if (HasFullHistoryTsLow()) {
+    r.append("\n  FullHistoryTsLow: ");
+    r.append(Slice(full_history_ts_low_).ToString(hex_key));
+  }
+  r.append("\n}\n");
+  return r;
+}
+
+std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
+  JSONWriter jw;
+  jw << "EditNumber" << edit_num;
+
+  if (has_db_id_) {
+    jw << "DB ID" << db_id_;
+  }
+  if (has_comparator_) {
+    jw << "Comparator" << comparator_;
+  }
+  if (has_log_number_) {
+    jw << "LogNumber" << log_number_;
+  }
+  if (has_prev_log_number_) {
+    jw << "PrevLogNumber" << prev_log_number_;
+  }
+  if (has_next_file_number_) {
+    jw << "NextFileNumber" << next_file_number_;
+  }
+  if (has_max_column_family_) {
+    jw << "MaxColumnFamily" << max_column_family_;
+  }
+  if (has_min_log_number_to_keep_) {
+    jw << "MinLogNumberToKeep" << min_log_number_to_keep_;
+  }
+  if (has_last_sequence_) {
+    jw << "LastSeq" << last_sequence_;
+  }
+
+  if (!deleted_files_.empty()) {
+    jw << "DeletedFiles";
+    jw.StartArray();
+
+    for (const auto& deleted_file : deleted_files_) {
+      jw.StartArrayedObject();
+      jw << "Level" << deleted_file.first;
+      jw << "FileNumber" << deleted_file.second;
+      jw.EndArrayedObject();
+    }
+
+    jw.EndArray();
+  }
+
+  if (!new_files_.empty()) {
+    jw << "AddedFiles";
+    jw.StartArray();
+
+    for (size_t i = 0; i < new_files_.size(); i++) {
+      jw.StartArrayedObject();
+      jw << "Level" << new_files_[i].first;
+      const FileMetaData& f = new_files_[i].second;
+      jw << "FileNumber" << f.fd.GetNumber();
+      jw << "FileSize" << f.fd.GetFileSize();
+      jw << "SmallestIKey" << f.smallest.DebugString(hex_key);
+      jw << "LargestIKey" << f.largest.DebugString(hex_key);
+      jw << "OldestAncesterTime" << f.oldest_ancester_time;
+      jw << "FileCreationTime" << f.file_creation_time;
+      jw << "EpochNumber" << f.epoch_number;
+      jw << "FileChecksum" << Slice(f.file_checksum).ToString(true);
+      jw << "FileChecksumFuncName" << f.file_checksum_func_name;
+      if (f.temperature != Temperature::kUnknown) {
+        jw << "temperature" << std::to_string(static_cast<int>(f.temperature));
+      }
+      if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+        jw << "OldestBlobFile" << f.oldest_blob_file_number;
+      }
+      if (f.temperature != Temperature::kUnknown) {
+        // Maybe change to human readable format when the feature becomes
+        // permanent
+        jw << "Temperature" << static_cast<int>(f.temperature);
+      }
+      jw << "TailSize" << f.tail_size;
+      jw.EndArrayedObject();
+    }
+
+    jw.EndArray();
+  }
+
+  if (!blob_file_additions_.empty()) {
+    jw << "BlobFileAdditions";
+
+    jw.StartArray();
+
+    for (const auto& blob_file_addition : blob_file_additions_) {
+      jw.StartArrayedObject();
+      jw << blob_file_addition;
+      jw.EndArrayedObject();
+    }
+
+    jw.EndArray();
+  }
+
+  if (!blob_file_garbages_.empty()) {
+    jw << "BlobFileGarbages";
+
+    jw.StartArray();
+
+    for (const auto& blob_file_garbage : blob_file_garbages_) {
+      jw.StartArrayedObject();
+      jw << blob_file_garbage;
+      jw.EndArrayedObject();
+    }
+
+    jw.EndArray();
+  }
+
+  if (!wal_additions_.empty()) {
+    jw << "WalAdditions";
+
+    jw.StartArray();
+
+    for (const auto& wal_addition : wal_additions_) {
+      jw.StartArrayedObject();
+      jw << wal_addition;
+      jw.EndArrayedObject();
+    }
+
+    jw.EndArray();
+  }
+
+  if (!wal_deletion_.IsEmpty()) {
+    jw << "WalDeletion";
+    jw.StartObject();
+    jw << wal_deletion_;
+    jw.EndObject();
+  }
+
+  jw << "ColumnFamily" << column_family_;
+
+  if (is_column_family_add_) {
+    jw << "ColumnFamilyAdd" << column_family_name_;
+  }
+  if (is_column_family_drop_) {
+    jw << "ColumnFamilyDrop" << column_family_name_;
+  }
+  if (is_in_atomic_group_) {
+    jw << "AtomicGroup" << remaining_entries_;
+  }
+
+  if (HasFullHistoryTsLow()) {
+    jw << "FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key);
+  }
+
+  jw.EndObject();
+
+  return jw.Get();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit.h
new file mode 100644
index 0000000000..07e8f37742
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit.h
@@ -0,0 +1,702 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <algorithm>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_garbage.h"
+#include "db/dbformat.h"
+#include "db/wal_edit.h"
+#include "memory/arena.h"
+#include "port/malloc.h"
+#include "rocksdb/advanced_cache.h"
+#include "rocksdb/advanced_options.h"
+#include "table/table_reader.h"
+#include "table/unique_id_impl.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tag numbers for serialized VersionEdit. These numbers are written to
+// disk and should not be changed. The number should be forward compatible so
+// users can down-grade RocksDB safely. A future Tag is ignored by doing '&'
+// between Tag and kTagSafeIgnoreMask field.
+enum Tag : uint32_t {
+  kComparator = 1,
+  kLogNumber = 2,
+  kNextFileNumber = 3,
+  kLastSequence = 4,
+  kCompactCursor = 5,
+  kDeletedFile = 6,
+  kNewFile = 7,
+  // 8 was used for large value refs
+  kPrevLogNumber = 9,
+  kMinLogNumberToKeep = 10,
+
+  // these are new formats divergent from open source leveldb
+  kNewFile2 = 100,
+  kNewFile3 = 102,
+  kNewFile4 = 103,      // 4th (the latest) format version of adding files
+  kColumnFamily = 200,  // specify column family for version edit
+  kColumnFamilyAdd = 201,
+  kColumnFamilyDrop = 202,
+  kMaxColumnFamily = 203,
+
+  kInAtomicGroup = 300,
+
+  kBlobFileAddition = 400,
+  kBlobFileGarbage,
+
+  // Mask for an unidentified tag from the future which can be safely ignored.
+  kTagSafeIgnoreMask = 1 << 13,
+
+  // Forward compatible (aka ignorable) records
+  kDbId,
+  kBlobFileAddition_DEPRECATED,
+  kBlobFileGarbage_DEPRECATED,
+  kWalAddition,
+  kWalDeletion,
+  kFullHistoryTsLow,
+  kWalAddition2,
+  kWalDeletion2,
+};
+
+enum NewFileCustomTag : uint32_t {
+  kTerminate = 1,  // The end of customized fields
+  kNeedCompaction = 2,
+  // Since Manifest is not entirely forward-compatible, we currently encode
+  // kMinLogNumberToKeep as part of NewFile as a hack. This should be removed
+  // when manifest becomes forward-compatible.
+  kMinLogNumberToKeepHack = 3,
+  kOldestBlobFileNumber = 4,
+  kOldestAncesterTime = 5,
+  kFileCreationTime = 6,
+  kFileChecksum = 7,
+  kFileChecksumFuncName = 8,
+  kTemperature = 9,
+  kMinTimestamp = 10,
+  kMaxTimestamp = 11,
+  kUniqueId = 12,
+  kEpochNumber = 13,
+  kCompensatedRangeDeletionSize = 14,
+  kTailSize = 15,
+
+  // If this bit for the custom tag is set, opening DB should fail if
+  // we don't know this field.
+  kCustomTagNonSafeIgnoreMask = 1 << 6,
+
+  // Forward incompatible (aka unignorable) fields
+  kPathId,
+};
+
+class VersionSet;
+
+constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
+constexpr uint64_t kUnknownOldestAncesterTime = 0;
+constexpr uint64_t kUnknownFileCreationTime = 0;
+constexpr uint64_t kUnknownEpochNumber = 0;
+// If `Options::allow_ingest_behind` is true, this epoch number
+// will be dedicated to files ingested behind.
+constexpr uint64_t kReservedEpochNumberForFileIngestedBehind = 1;
+
+extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
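+
+// Packing sketch: the file number occupies the low 62 bits and the path id
+// the high 2 bits, so PackFileNumberAndPathId(5, 1) == 5 | (1 *
+// (kFileNumberMask + 1)) == 5 + (1ull << 62); GetNumber() and GetPathId()
+// below invert this.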
+
+// A copyable structure contains information needed to read data from an SST
+// file. It can contain a pointer to a table reader opened for the file, or
+// file number and size, which can be used to create a new table reader for it.
+// The behavior is undefined when a copy of the structure is used when the
+// file is not in any live version any more.
+struct FileDescriptor {
+  // Table reader in table_reader_handle
+  TableReader* table_reader;
+  uint64_t packed_number_and_path_id;
+  uint64_t file_size;             // File size in bytes
+  SequenceNumber smallest_seqno;  // The smallest seqno in this file
+  SequenceNumber largest_seqno;   // The largest seqno in this file
+
+  FileDescriptor() : FileDescriptor(0, 0, 0) {}
+
+  FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
+      : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {}
+
+  FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size,
+                 SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno)
+      : table_reader(nullptr),
+        packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
+        file_size(_file_size),
+        smallest_seqno(_smallest_seqno),
+        largest_seqno(_largest_seqno) {}
+
+  FileDescriptor(const FileDescriptor& fd) { *this = fd; }
+
+  FileDescriptor& operator=(const FileDescriptor& fd) {
+    table_reader = fd.table_reader;
+    packed_number_and_path_id = fd.packed_number_and_path_id;
+    file_size = fd.file_size;
+    smallest_seqno = fd.smallest_seqno;
+    largest_seqno = fd.largest_seqno;
+    return *this;
+  }
+
+  uint64_t GetNumber() const {
+    return packed_number_and_path_id & kFileNumberMask;
+  }
+  uint32_t GetPathId() const {
+    return static_cast<uint32_t>(packed_number_and_path_id /
+                                 (kFileNumberMask + 1));
+  }
+  uint64_t GetFileSize() const { return file_size; }
+};
+
+struct FileSampledStats {
+  FileSampledStats() : num_reads_sampled(0) {}
+  FileSampledStats(const FileSampledStats& other) { *this = other; }
+  FileSampledStats& operator=(const FileSampledStats& other) {
+    num_reads_sampled = other.num_reads_sampled.load();
+    return *this;
+  }
+
+  // number of user reads to this file.
+  mutable std::atomic<uint64_t> num_reads_sampled;
+};
+
+struct FileMetaData {
+  FileDescriptor fd;
+  InternalKey smallest;  // Smallest internal key served by table
+  InternalKey largest;   // Largest internal key served by table
+
+  // Needs to be disposed when refs becomes 0.
+  Cache::Handle* table_reader_handle = nullptr;
+
+  FileSampledStats stats;
+
+  // Stats for compensating deletion entries during compaction
+
+  // File size compensated by deletion entry.
+  // This is used to compute a file's compaction priority, and is updated in
+  // Version::ComputeCompensatedSizes() first time when the file is created or
+  // loaded. After it is updated (!= 0), it is immutable.
+  uint64_t compensated_file_size = 0;
+  // These values can mutate, but they can only be read or written from
+  // single-threaded LogAndApply thread
+  uint64_t num_entries = 0;  // the number of entries.
+  // The number of deletion entries, including range deletions.
+  uint64_t num_deletions = 0;
+  uint64_t raw_key_size = 0;    // total uncompressed key size.
+  uint64_t raw_value_size = 0;  // total uncompressed value size.
+  uint64_t num_range_deletions = 0;
+  // This is computed during Flush/Compaction, and is added to
+  // `compensated_file_size`. Currently, this estimates the size of keys in the
+  // next level covered by range tombstones in this file.
+  uint64_t compensated_range_deletion_size = 0;
+
+  int refs = 0;  // Reference count
+
+  bool being_compacted = false;       // Is this file undergoing compaction?
+  bool init_stats_from_file = false;  // true if the data-entry stats of this
+                                      // file has initialized from file.
+
+  bool marked_for_compaction = false;  // True if client asked us nicely to
+                                       // compact this file.
+  Temperature temperature = Temperature::kUnknown;
+
+  // Used only in BlobDB.
The file number of the oldest blob file this SST file + // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1. + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; + + // The file could be the compaction output from other SST files, which could + // in turn be outputs for compact older SST files. We track the memtable + // flush timestamp for the oldest SST file that eventually contribute data + // to this file. 0 means the information is not available. + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; + + // Unix time when the SST file is created. + uint64_t file_creation_time = kUnknownFileCreationTime; + + // The order of a file being flushed or ingested/imported. + // Compaction output file will be assigned with the minimum `epoch_number` + // among input files'. + // For L0, larger `epoch_number` indicates newer L0 file. + uint64_t epoch_number = kUnknownEpochNumber; + + // File checksum + std::string file_checksum = kUnknownFileChecksum; + + // File checksum function name + std::string file_checksum_func_name = kUnknownFileChecksumFuncName; + + // SST unique id + UniqueId64x2 unique_id{}; + + // Size of the "tail" part of a SST file + // "Tail" refers to all blocks after data blocks till the end of the SST file + uint64_t tail_size = 0; + + FileMetaData() = default; + + FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size, + const InternalKey& smallest_key, const InternalKey& largest_key, + const SequenceNumber& smallest_seq, + const SequenceNumber& largest_seq, bool marked_for_compact, + Temperature _temperature, uint64_t oldest_blob_file, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time, + uint64_t _epoch_number, const std::string& _file_checksum, + const std::string& _file_checksum_func_name, + UniqueId64x2 _unique_id, + const uint64_t _compensated_range_deletion_size, + uint64_t _tail_size) + : fd(file, file_path_id, file_size, smallest_seq, largest_seq), + smallest(smallest_key), + largest(largest_key), + compensated_range_deletion_size(_compensated_range_deletion_size), + marked_for_compaction(marked_for_compact), + temperature(_temperature), + oldest_blob_file_number(oldest_blob_file), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time), + epoch_number(_epoch_number), + file_checksum(_file_checksum), + file_checksum_func_name(_file_checksum_func_name), + unique_id(std::move(_unique_id)), + tail_size(_tail_size) { + TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); + } + + // REQUIRED: Keys must be given to the function in sorted order (it expects + // the last key to be the largest). + Status UpdateBoundaries(const Slice& key, const Slice& value, + SequenceNumber seqno, ValueType value_type); + + // Unlike UpdateBoundaries, ranges do not need to be presented in any + // particular order. + void UpdateBoundariesForRange(const InternalKey& start, + const InternalKey& end, SequenceNumber seqno, + const InternalKeyComparator& icmp) { + if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) { + smallest = start; + } + if (largest.size() == 0 || icmp.Compare(largest, end) < 0) { + largest = end; + } + assert(icmp.Compare(smallest, largest) <= 0); + fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); + fd.largest_seqno = std::max(fd.largest_seqno, seqno); + } + + // Try to get oldest ancester time from the class itself or table properties + // if table reader is already pinned. + // 0 means the information is not available. 
+  uint64_t TryGetOldestAncesterTime() {
+    if (oldest_ancester_time != kUnknownOldestAncesterTime) {
+      return oldest_ancester_time;
+    } else if (fd.table_reader != nullptr &&
+               fd.table_reader->GetTableProperties() != nullptr) {
+      return fd.table_reader->GetTableProperties()->creation_time;
+    }
+    return kUnknownOldestAncesterTime;
+  }
+
+  uint64_t TryGetFileCreationTime() {
+    if (file_creation_time != kUnknownFileCreationTime) {
+      return file_creation_time;
+    } else if (fd.table_reader != nullptr &&
+               fd.table_reader->GetTableProperties() != nullptr) {
+      return fd.table_reader->GetTableProperties()->file_creation_time;
+    }
+    return kUnknownFileCreationTime;
+  }
+
+  // WARNING: a manual update to this function is needed
+  // whenever a new string property is added to FileMetaData,
+  // to reduce approximation error.
+  //
+  // TODO: eliminate the need of manually updating this function
+  // for new string properties
+  size_t ApproximateMemoryUsage() const {
+    size_t usage = 0;
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    usage += malloc_usable_size(const_cast<FileMetaData*>(this));
+#else
+    usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    usage += smallest.size() + largest.size() + file_checksum.size() +
+             file_checksum_func_name.size();
+    return usage;
+  }
+};
+
+// A compressed copy of file meta data that contains just the minimum data
+// needed to serve read operations, while still keeping the pointer to the
+// full metadata of the file in case it is needed.
+struct FdWithKeyRange {
+  FileDescriptor fd;
+  FileMetaData* file_metadata;  // Points to all metadata
+  Slice smallest_key;           // slice that contains the smallest key
+  Slice largest_key;            // slice that contains the largest key
+
+  FdWithKeyRange()
+      : fd(), file_metadata(nullptr), smallest_key(), largest_key() {}
+
+  FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
+                 FileMetaData* _file_metadata)
+      : fd(_fd),
+        file_metadata(_file_metadata),
+        smallest_key(_smallest_key),
+        largest_key(_largest_key) {}
+};
+
+// Data structure to store an array of FdWithKeyRange in one level
+// Actual data is guaranteed to be stored closely
+struct LevelFilesBrief {
+  size_t num_files;
+  FdWithKeyRange* files;
+  LevelFilesBrief() {
+    num_files = 0;
+    files = nullptr;
+  }
+};
+
+// The state of a DB at any given time is referred to as a Version.
+// Any modification to the Version is considered a Version Edit. A Version is
+// constructed by joining a sequence of Version Edits. Version Edits are
+// written to the MANIFEST file.
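+//
+// As a rough illustration (a minimal sketch, not upstream code): recovering
+// the current Version amounts to folding the edits recorded in the MANIFEST
+// over an empty starting state, e.g.
+//
+//   VersionStorageInfo state;                // empty
+//   for (const VersionEdit& e : manifest) {  // in record order
+//     apply(e, &state);                      // add/delete files, blobs, WALs
+//   }
+//   // `state` now describes the LSM tree as of the last record.
+//
+// The class below records one such edit; the handlers in
+// version_edit_handler.h implement the replay with error checking, atomic
+// groups, and column families.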
+class VersionEdit {
+ public:
+  void Clear();
+
+  void SetDBId(const std::string& db_id) {
+    has_db_id_ = true;
+    db_id_ = db_id;
+  }
+  bool HasDbId() const { return has_db_id_; }
+  const std::string& GetDbId() const { return db_id_; }
+
+  void SetComparatorName(const Slice& name) {
+    has_comparator_ = true;
+    comparator_ = name.ToString();
+  }
+  bool HasComparatorName() const { return has_comparator_; }
+  const std::string& GetComparatorName() const { return comparator_; }
+
+  void SetLogNumber(uint64_t num) {
+    has_log_number_ = true;
+    log_number_ = num;
+  }
+  bool HasLogNumber() const { return has_log_number_; }
+  uint64_t GetLogNumber() const { return log_number_; }
+
+  void SetPrevLogNumber(uint64_t num) {
+    has_prev_log_number_ = true;
+    prev_log_number_ = num;
+  }
+  bool HasPrevLogNumber() const { return has_prev_log_number_; }
+  uint64_t GetPrevLogNumber() const { return prev_log_number_; }
+
+  void SetNextFile(uint64_t num) {
+    has_next_file_number_ = true;
+    next_file_number_ = num;
+  }
+  bool HasNextFile() const { return has_next_file_number_; }
+  uint64_t GetNextFile() const { return next_file_number_; }
+
+  void SetMaxColumnFamily(uint32_t max_column_family) {
+    has_max_column_family_ = true;
+    max_column_family_ = max_column_family;
+  }
+  bool HasMaxColumnFamily() const { return has_max_column_family_; }
+  uint32_t GetMaxColumnFamily() const { return max_column_family_; }
+
+  void SetMinLogNumberToKeep(uint64_t num) {
+    has_min_log_number_to_keep_ = true;
+    min_log_number_to_keep_ = num;
+  }
+  bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; }
+  uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; }
+
+  void SetLastSequence(SequenceNumber seq) {
+    has_last_sequence_ = true;
+    last_sequence_ = seq;
+  }
+  bool HasLastSequence() const { return has_last_sequence_; }
+  SequenceNumber GetLastSequence() const { return last_sequence_; }
+
+  // Delete the specified table file from the specified level.
+  void DeleteFile(int level, uint64_t file) {
+    deleted_files_.emplace(level, file);
+  }
+
+  // Retrieve the table files deleted as well as their associated levels.
+  using DeletedFiles = std::set<std::pair<int, uint64_t>>;
+  const DeletedFiles& GetDeletedFiles() const { return deleted_files_; }
+
+  // Add the specified table file at the specified level.
+  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+  // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file
+  // referred to by this file if any, kInvalidBlobFileNumber otherwise.
+  void AddFile(int level, uint64_t file, uint32_t file_path_id,
+               uint64_t file_size, const InternalKey& smallest,
+               const InternalKey& largest, const SequenceNumber& smallest_seqno,
+               const SequenceNumber& largest_seqno, bool marked_for_compaction,
+               Temperature temperature, uint64_t oldest_blob_file_number,
+               uint64_t oldest_ancester_time, uint64_t file_creation_time,
+               uint64_t epoch_number, const std::string& file_checksum,
+               const std::string& file_checksum_func_name,
+               const UniqueId64x2& unique_id,
+               const uint64_t compensated_range_deletion_size,
+               uint64_t tail_size) {
+    assert(smallest_seqno <= largest_seqno);
+    new_files_.emplace_back(
+        level,
+        FileMetaData(file, file_path_id, file_size, smallest, largest,
+                     smallest_seqno, largest_seqno, marked_for_compaction,
+                     temperature, oldest_blob_file_number, oldest_ancester_time,
+                     file_creation_time, epoch_number, file_checksum,
+                     file_checksum_func_name, unique_id,
+                     compensated_range_deletion_size, tail_size));
+    if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
+      SetLastSequence(largest_seqno);
+    }
+  }
+
+  void AddFile(int level, const FileMetaData& f) {
+    assert(f.fd.smallest_seqno <= f.fd.largest_seqno);
+    new_files_.emplace_back(level, f);
+    if (!HasLastSequence() || f.fd.largest_seqno > GetLastSequence()) {
+      SetLastSequence(f.fd.largest_seqno);
+    }
+  }
+
+  // Retrieve the table files added as well as their associated levels.
+  using NewFiles = std::vector<std::pair<int, FileMetaData>>;
+  const NewFiles& GetNewFiles() const { return new_files_; }
+
+  // Retrieve all the compact cursors
+  using CompactCursors = std::vector<std::pair<int, InternalKey>>;
+  const CompactCursors& GetCompactCursors() const { return compact_cursors_; }
+  void AddCompactCursor(int level, const InternalKey& cursor) {
+    compact_cursors_.push_back(std::make_pair(level, cursor));
+  }
+  void SetCompactCursors(
+      const std::vector<InternalKey>& compact_cursors_by_level) {
+    compact_cursors_.clear();
+    compact_cursors_.reserve(compact_cursors_by_level.size());
+    for (int i = 0; i < (int)compact_cursors_by_level.size(); i++) {
+      if (compact_cursors_by_level[i].Valid()) {
+        compact_cursors_.push_back(
+            std::make_pair(i, compact_cursors_by_level[i]));
+      }
+    }
+  }
+
+  // Add a new blob file.
+  void AddBlobFile(uint64_t blob_file_number, uint64_t total_blob_count,
+                   uint64_t total_blob_bytes, std::string checksum_method,
+                   std::string checksum_value) {
+    blob_file_additions_.emplace_back(
+        blob_file_number, total_blob_count, total_blob_bytes,
+        std::move(checksum_method), std::move(checksum_value));
+  }
+
+  void AddBlobFile(BlobFileAddition blob_file_addition) {
+    blob_file_additions_.emplace_back(std::move(blob_file_addition));
+  }
+
+  // Retrieve all the blob files added.
+  using BlobFileAdditions = std::vector<BlobFileAddition>;
+  const BlobFileAdditions& GetBlobFileAdditions() const {
+    return blob_file_additions_;
+  }
+
+  void SetBlobFileAdditions(BlobFileAdditions blob_file_additions) {
+    assert(blob_file_additions_.empty());
+    blob_file_additions_ = std::move(blob_file_additions);
+  }
+
+  // Add garbage for an existing blob file.
+  void AddBlobFileGarbage(uint64_t blob_file_number,
+                          uint64_t garbage_blob_count,
+                          uint64_t garbage_blob_bytes) {
+    blob_file_garbages_.emplace_back(blob_file_number, garbage_blob_count,
+                                     garbage_blob_bytes);
+  }
+
+  void AddBlobFileGarbage(BlobFileGarbage blob_file_garbage) {
+    blob_file_garbages_.emplace_back(std::move(blob_file_garbage));
+  }
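+
+  // For orientation, a minimal sketch (with hypothetical file numbers and
+  // levels, not upstream code) of how a compaction might record its result
+  // in a VersionEdit:
+  //
+  //   VersionEdit edit;
+  //   edit.DeleteFile(/*level=*/1, /*file=*/7);  // compaction inputs
+  //   edit.DeleteFile(/*level=*/2, /*file=*/8);
+  //   edit.AddFile(/*level=*/2, output_meta);    // compaction output
+  //   // the edit is then encoded and appended to the MANIFEST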
+
+  // Retrieve all the blob file garbage added.
+  using BlobFileGarbages = std::vector<BlobFileGarbage>;
+  const BlobFileGarbages& GetBlobFileGarbages() const {
+    return blob_file_garbages_;
+  }
+
+  void SetBlobFileGarbages(BlobFileGarbages blob_file_garbages) {
+    assert(blob_file_garbages_.empty());
+    blob_file_garbages_ = std::move(blob_file_garbages);
+  }
+
+  // Add a WAL (either just created or closed).
+  // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit.
+  void AddWal(WalNumber number, WalMetadata metadata = WalMetadata()) {
+    assert(NumEntries() == wal_additions_.size());
+    wal_additions_.emplace_back(number, std::move(metadata));
+  }
+
+  // Retrieve all the added WALs.
+  const WalAdditions& GetWalAdditions() const { return wal_additions_; }
+
+  bool IsWalAddition() const { return !wal_additions_.empty(); }
+
+  // Delete a WAL (either directly deleted or archived).
+  // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit.
+  void DeleteWalsBefore(WalNumber number) {
+    assert((NumEntries() == 1) == !wal_deletion_.IsEmpty());
+    wal_deletion_ = WalDeletion(number);
+  }
+
+  const WalDeletion& GetWalDeletion() const { return wal_deletion_; }
+
+  bool IsWalDeletion() const { return !wal_deletion_.IsEmpty(); }
+
+  bool IsWalManipulation() const {
+    size_t entries = NumEntries();
+    return (entries > 0) && ((entries == wal_additions_.size()) ||
+                             (entries == !wal_deletion_.IsEmpty()));
+  }
+
+  // Number of edits
+  size_t NumEntries() const {
+    return new_files_.size() + deleted_files_.size() +
+           blob_file_additions_.size() + blob_file_garbages_.size() +
+           wal_additions_.size() + !wal_deletion_.IsEmpty();
+  }
+
+  void SetColumnFamily(uint32_t column_family_id) {
+    column_family_ = column_family_id;
+  }
+  uint32_t GetColumnFamily() const { return column_family_; }
+
+  // set column family ID by calling SetColumnFamily()
+  void AddColumnFamily(const std::string& name) {
+    assert(!is_column_family_drop_);
+    assert(!is_column_family_add_);
+    assert(NumEntries() == 0);
+    is_column_family_add_ = true;
+    column_family_name_ = name;
+  }
+
+  // set column family ID by calling SetColumnFamily()
+  void DropColumnFamily() {
+    assert(!is_column_family_drop_);
+    assert(!is_column_family_add_);
+    assert(NumEntries() == 0);
+    is_column_family_drop_ = true;
+  }
+
+  bool IsColumnFamilyManipulation() const {
+    return is_column_family_add_ || is_column_family_drop_;
+  }
+
+  bool IsColumnFamilyAdd() const { return is_column_family_add_; }
+
+  bool IsColumnFamilyDrop() const { return is_column_family_drop_; }
+
+  void MarkAtomicGroup(uint32_t remaining_entries) {
+    is_in_atomic_group_ = true;
+    remaining_entries_ = remaining_entries;
+  }
+  bool IsInAtomicGroup() const { return is_in_atomic_group_; }
+  uint32_t GetRemainingEntries() const { return remaining_entries_; }
+
+  bool HasFullHistoryTsLow() const { return !full_history_ts_low_.empty(); }
+  const std::string& GetFullHistoryTsLow() const {
+    assert(HasFullHistoryTsLow());
+    return full_history_ts_low_;
+  }
+  void SetFullHistoryTsLow(std::string full_history_ts_low) {
+    assert(!full_history_ts_low.empty());
+    full_history_ts_low_ = std::move(full_history_ts_low);
+  }
+
+  // return true on success.
+  bool EncodeTo(std::string* dst) const;
+  Status DecodeFrom(const Slice& src);
+
+  std::string DebugString(bool hex_key = false) const;
+  std::string DebugJSON(int edit_num, bool hex_key = false) const;
+
+ private:
+  friend class ReactiveVersionSet;
+  friend class VersionEditHandlerBase;
+  friend class ListColumnFamiliesHandler;
+  friend class VersionEditHandler;
+  friend class VersionEditHandlerPointInTime;
+  friend class DumpManifestHandler;
+  friend class VersionSet;
+  friend class Version;
+  friend class AtomicGroupReadBuffer;
+
+  bool GetLevel(Slice* input, int* level, const char** msg);
+
+  const char* DecodeNewFile4From(Slice* input);
+
+  int max_level_ = 0;
+  std::string db_id_;
+  std::string comparator_;
+  uint64_t log_number_ = 0;
+  uint64_t prev_log_number_ = 0;
+  uint64_t next_file_number_ = 0;
+  uint32_t max_column_family_ = 0;
+  // The most recent WAL log number that is deleted
+  uint64_t min_log_number_to_keep_ = 0;
+  SequenceNumber last_sequence_ = 0;
+  bool has_db_id_ = false;
+  bool has_comparator_ = false;
+  bool has_log_number_ = false;
+  bool has_prev_log_number_ = false;
+  bool has_next_file_number_ = false;
+  bool has_max_column_family_ = false;
+  bool has_min_log_number_to_keep_ = false;
+  bool has_last_sequence_ = false;
+
+  // Compaction cursors for round-robin compaction policy
+  CompactCursors compact_cursors_;
+
+  DeletedFiles deleted_files_;
+  NewFiles new_files_;
+
+  BlobFileAdditions blob_file_additions_;
+  BlobFileGarbages blob_file_garbages_;
+
+  WalAdditions wal_additions_;
+  WalDeletion wal_deletion_;
+
+  // Each version edit record should have column_family_ set
+  // If it's not set, it is default (0)
+  uint32_t column_family_ = 0;
+  // a version edit can be either column_family add or
+  // column_family drop. If it's column family add,
+  // it also includes column family name.
+  bool is_column_family_drop_ = false;
+  bool is_column_family_add_ = false;
+  std::string column_family_name_;
+
+  bool is_in_atomic_group_ = false;
+  uint32_t remaining_entries_ = 0;
+
+  std::string full_history_ts_low_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit_handler.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit_handler.cc
new file mode 100644
index 0000000000..d507c4b0c8
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit_handler.cc
@@ -0,0 +1,1017 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
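+
+// A rough sketch of the control flow this file implements (illustrative,
+// simplified): log::Reader yields raw MANIFEST records, each record decodes
+// into a VersionEdit, and a handler subclass applies it to in-memory state.
+//
+//   log::Reader reader(...);          // positioned at the MANIFEST start
+//   VersionEditHandler handler(...);  // or a subclass, see below
+//   Status log_read_status;
+//   handler.Iterate(reader, &log_read_status);  // decode + apply records
+//   Status s = handler.status();                // overall result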
+ +#include "db/version_edit_handler.h" + +#include +#include + +#include "db/blob/blob_file_reader.h" +#include "db/blob/blob_source.h" +#include "db/version_edit.h" +#include "logging/logging.h" +#include "monitoring/persistent_stats_history.h" + +namespace ROCKSDB_NAMESPACE { + +void VersionEditHandlerBase::Iterate(log::Reader& reader, + Status* log_read_status) { + Slice record; + std::string scratch; + assert(log_read_status); + assert(log_read_status->ok()); + + [[maybe_unused]] size_t recovered_edits = 0; + Status s = Initialize(); + while (reader.LastRecordEnd() < max_manifest_read_size_ && s.ok() && + reader.ReadRecord(&record, &scratch) && log_read_status->ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + + s = read_buffer_.AddEdit(&edit); + if (!s.ok()) { + break; + } + ColumnFamilyData* cfd = nullptr; + if (edit.is_in_atomic_group_) { + if (read_buffer_.IsFull()) { + for (auto& e : read_buffer_.replay_buffer()) { + s = ApplyVersionEdit(e, &cfd); + if (!s.ok()) { + break; + } + ++recovered_edits; + } + if (!s.ok()) { + break; + } + read_buffer_.Clear(); + } + } else { + s = ApplyVersionEdit(edit, &cfd); + if (s.ok()) { + ++recovered_edits; + } + } + } + if (!log_read_status->ok()) { + s = *log_read_status; + } + + CheckIterationResult(reader, &s); + + if (!s.ok()) { + if (s.IsCorruption()) { + // when we find a Corruption error, something is + // wrong with the underlying file. in this case we + // want to report the filename, so in here we append + // the filename to the Corruption message + assert(reader.file()); + + // build a new error message + std::stringstream message; + // append previous dynamic state message + const char* state = s.getState(); + if (state != nullptr) { + message << state; + message << ' '; + } + // append the filename to the corruption message + message << "in file " << reader.file()->file_name(); + // overwrite the status with the extended status + s = Status(s.code(), s.subcode(), s.severity(), message.str()); + } + status_ = s; + } + TEST_SYNC_POINT_CALLBACK("VersionEditHandlerBase::Iterate:Finish", + &recovered_edits); +} + +Status ListColumnFamiliesHandler::ApplyVersionEdit( + VersionEdit& edit, ColumnFamilyData** /*unused*/) { + Status s; + if (edit.is_column_family_add_) { + if (column_family_names_.find(edit.column_family_) != + column_family_names_.end()) { + s = Status::Corruption("Manifest adding the same column family twice"); + } else { + column_family_names_.insert( + {edit.column_family_, edit.column_family_name_}); + } + } else if (edit.is_column_family_drop_) { + if (column_family_names_.find(edit.column_family_) == + column_family_names_.end()) { + s = Status::Corruption("Manifest - dropping non-existing column family"); + } else { + column_family_names_.erase(edit.column_family_); + } + } + return s; +} + +Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** /*unused*/) { + for (const auto& deleted_file : edit.GetDeletedFiles()) { + Status s = file_checksum_list_.RemoveOneFileChecksum(deleted_file.second); + if (!s.ok()) { + return s; + } + } + for (const auto& new_file : edit.GetNewFiles()) { + Status s = file_checksum_list_.InsertOneFileChecksum( + new_file.second.fd.GetNumber(), new_file.second.file_checksum, + new_file.second.file_checksum_func_name); + if (!s.ok()) { + return s; + } + } + for (const auto& new_blob_file : edit.GetBlobFileAdditions()) { + std::string checksum_value = new_blob_file.GetChecksumValue(); + std::string checksum_method = 
+    std::string checksum_method = new_blob_file.GetChecksumMethod();
+    assert(checksum_value.empty() == checksum_method.empty());
+    if (checksum_method.empty()) {
+      checksum_value = kUnknownFileChecksum;
+      checksum_method = kUnknownFileChecksumFuncName;
+    }
+    Status s = file_checksum_list_.InsertOneFileChecksum(
+        new_blob_file.GetBlobFileNumber(), checksum_value, checksum_method);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
+VersionEditHandler::VersionEditHandler(
+    bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+    VersionSet* version_set, bool track_missing_files,
+    bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer,
+    const ReadOptions& read_options, bool skip_load_table_files,
+    EpochNumberRequirement epoch_number_requirement)
+    : VersionEditHandlerBase(read_options),
+      read_only_(read_only),
+      column_families_(std::move(column_families)),
+      version_set_(version_set),
+      track_missing_files_(track_missing_files),
+      no_error_if_files_missing_(no_error_if_files_missing),
+      io_tracer_(io_tracer),
+      skip_load_table_files_(skip_load_table_files),
+      initialized_(false),
+      epoch_number_requirement_(epoch_number_requirement) {
+  assert(version_set_ != nullptr);
+}
+
+Status VersionEditHandler::Initialize() {
+  Status s;
+  if (!initialized_) {
+    for (const auto& cf_desc : column_families_) {
+      name_to_options_.emplace(cf_desc.name, cf_desc.options);
+    }
+    auto default_cf_iter = name_to_options_.find(kDefaultColumnFamilyName);
+    if (default_cf_iter == name_to_options_.end()) {
+      s = Status::InvalidArgument("Default column family not specified");
+    }
+    if (s.ok()) {
+      VersionEdit default_cf_edit;
+      default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
+      default_cf_edit.SetColumnFamily(0);
+      ColumnFamilyData* cfd =
+          CreateCfAndInit(default_cf_iter->second, default_cf_edit);
+      assert(cfd != nullptr);
+#ifdef NDEBUG
+      (void)cfd;
+#endif
+      initialized_ = true;
+    }
+  }
+  return s;
+}
+
+Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit,
+                                            ColumnFamilyData** cfd) {
+  Status s;
+  if (edit.is_column_family_add_) {
+    s = OnColumnFamilyAdd(edit, cfd);
+  } else if (edit.is_column_family_drop_) {
+    s = OnColumnFamilyDrop(edit, cfd);
+  } else if (edit.IsWalAddition()) {
+    s = OnWalAddition(edit);
+  } else if (edit.IsWalDeletion()) {
+    s = OnWalDeletion(edit);
+  } else {
+    s = OnNonCfOperation(edit, cfd);
+  }
+  if (s.ok()) {
+    assert(cfd != nullptr);
+    s = ExtractInfoFromVersionEdit(*cfd, edit);
+  }
+  return s;
+}
+
+Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
+                                             ColumnFamilyData** cfd) {
+  bool cf_in_not_found = false;
+  bool cf_in_builders = false;
+  CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+
+  assert(cfd != nullptr);
+  *cfd = nullptr;
+  Status s;
+  if (cf_in_builders || cf_in_not_found) {
+    s = Status::Corruption("MANIFEST adding the same column family twice: " +
+                           edit.column_family_name_);
+  }
+  if (s.ok()) {
+    auto cf_options = name_to_options_.find(edit.column_family_name_);
+    // implicitly add persistent_stats column family without requiring user
+    // to specify
+    ColumnFamilyData* tmp_cfd = nullptr;
+    bool is_persistent_stats_column_family =
+        edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0;
+    if (cf_options == name_to_options_.end() &&
+        !is_persistent_stats_column_family) {
+      column_families_not_found_.emplace(edit.column_family_,
+                                         edit.column_family_name_);
+    } else {
+      if (is_persistent_stats_column_family) {
+        ColumnFamilyOptions cfo;
+        OptimizeForPersistentStats(&cfo);
+        tmp_cfd = CreateCfAndInit(cfo, edit);
+      } else {
+        tmp_cfd = CreateCfAndInit(cf_options->second, edit);
+      }
+      *cfd = tmp_cfd;
+    }
+  }
+  return s;
+}
+
+Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit,
+                                              ColumnFamilyData** cfd) {
+  bool cf_in_not_found = false;
+  bool cf_in_builders = false;
+  CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+
+  assert(cfd != nullptr);
+  *cfd = nullptr;
+  ColumnFamilyData* tmp_cfd = nullptr;
+  Status s;
+  if (cf_in_builders) {
+    tmp_cfd = DestroyCfAndCleanup(edit);
+  } else if (cf_in_not_found) {
+    column_families_not_found_.erase(edit.column_family_);
+  } else {
+    s = Status::Corruption("MANIFEST - dropping non-existing column family");
+  }
+  *cfd = tmp_cfd;
+  return s;
+}
+
+Status VersionEditHandler::OnWalAddition(VersionEdit& edit) {
+  assert(edit.IsWalAddition());
+  return version_set_->wals_.AddWals(edit.GetWalAdditions());
+}
+
+Status VersionEditHandler::OnWalDeletion(VersionEdit& edit) {
+  assert(edit.IsWalDeletion());
+  return version_set_->wals_.DeleteWalsBefore(
+      edit.GetWalDeletion().GetLogNumber());
+}
+
+Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit,
+                                            ColumnFamilyData** cfd) {
+  bool cf_in_not_found = false;
+  bool cf_in_builders = false;
+  CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+
+  assert(cfd != nullptr);
+  *cfd = nullptr;
+  Status s;
+  if (!cf_in_not_found) {
+    if (!cf_in_builders) {
+      s = Status::Corruption(
+          "MANIFEST record referencing unknown column family");
+    }
+    ColumnFamilyData* tmp_cfd = nullptr;
+    if (s.ok()) {
+      auto builder_iter = builders_.find(edit.column_family_);
+      assert(builder_iter != builders_.end());
+      tmp_cfd = version_set_->GetColumnFamilySet()->GetColumnFamily(
+          edit.column_family_);
+      assert(tmp_cfd != nullptr);
+      s = MaybeCreateVersion(edit, tmp_cfd, /*force_create_version=*/false);
+      if (s.ok()) {
+        s = builder_iter->second->version_builder()->Apply(&edit);
+      }
+    }
+    *cfd = tmp_cfd;
+  }
+  return s;
+}
+
+// TODO maybe cache the computation result
+bool VersionEditHandler::HasMissingFiles() const {
+  bool ret = false;
+  for (const auto& elem : cf_to_missing_files_) {
+    const auto& missing_files = elem.second;
+    if (!missing_files.empty()) {
+      ret = true;
+      break;
+    }
+  }
+  if (!ret) {
+    for (const auto& elem : cf_to_missing_blob_files_high_) {
+      if (elem.second != kInvalidBlobFileNumber) {
+        ret = true;
+        break;
+      }
+    }
+  }
+  return ret;
+}
+
+void VersionEditHandler::CheckColumnFamilyId(const VersionEdit& edit,
+                                             bool* cf_in_not_found,
+                                             bool* cf_in_builders) const {
+  assert(cf_in_not_found != nullptr);
+  assert(cf_in_builders != nullptr);
+  // Not found means that the user didn't supply that column
+  // family option AND we encountered a column family add
+  // record. Once we encounter a column family drop record,
+  // we will delete the column family from
+  // column_families_not_found.
+  bool in_not_found = column_families_not_found_.find(edit.column_family_) !=
+                      column_families_not_found_.end();
+  // in builders means that the user supplied that column family
+  // option AND that we encountered a column family add record
+  bool in_builders = builders_.find(edit.column_family_) != builders_.end();
+  // They cannot both be true
+  assert(!(in_not_found && in_builders));
+  *cf_in_not_found = in_not_found;
+  *cf_in_builders = in_builders;
+}
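+
+// To summarize the bookkeeping above (an informal recap, not upstream
+// documentation), for a column family ID seen in a MANIFEST record:
+//
+//   in column_families_not_found_ | in builders_ | meaning
+//   ------------------------------+--------------+--------------------------
+//   yes                           | no           | added in MANIFEST but no
+//                                 |              | options were supplied
+//   no                            | yes          | known and being rebuilt
+//   no                            | no           | unknown (corruption, or a
+//                                 |              | CF add record is pending)
+//
+// The two maps are mutually exclusive by construction.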
+
+void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
+                                              Status* s) {
+  assert(s != nullptr);
+  if (!s->ok()) {
+    // Do nothing here.
+  } else if (!version_edit_params_.has_log_number_ ||
+             !version_edit_params_.has_next_file_number_ ||
+             !version_edit_params_.has_last_sequence_) {
+    std::string msg("no ");
+    if (!version_edit_params_.has_log_number_) {
+      msg.append("log_file_number, ");
+    }
+    if (!version_edit_params_.has_next_file_number_) {
+      msg.append("next_file_number, ");
+    }
+    if (!version_edit_params_.has_last_sequence_) {
+      msg.append("last_sequence, ");
+    }
+    msg = msg.substr(0, msg.size() - 2);
+    msg.append(" entry in MANIFEST");
+    *s = Status::Corruption(msg);
+  }
+  // There were some column families in the MANIFEST that weren't specified
+  // in the argument. This is OK in read_only mode
+  if (s->ok() && MustOpenAllColumnFamilies() &&
+      !column_families_not_found_.empty()) {
+    std::string msg;
+    for (const auto& cf : column_families_not_found_) {
+      msg.append(", ");
+      msg.append(cf.second);
+    }
+    msg = msg.substr(2);
+    *s = Status::InvalidArgument("Column families not opened: " + msg);
+  }
+  if (s->ok()) {
+    version_set_->GetColumnFamilySet()->UpdateMaxColumnFamily(
+        version_edit_params_.max_column_family_);
+    version_set_->MarkMinLogNumberToKeep(
+        version_edit_params_.min_log_number_to_keep_);
+    version_set_->MarkFileNumberUsed(version_edit_params_.prev_log_number_);
+    version_set_->MarkFileNumberUsed(version_edit_params_.log_number_);
+    for (auto* cfd : *(version_set_->GetColumnFamilySet())) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      auto builder_iter = builders_.find(cfd->GetID());
+      assert(builder_iter != builders_.end());
+      auto* builder = builder_iter->second->version_builder();
+      if (!builder->CheckConsistencyForNumLevels()) {
+        *s = Status::InvalidArgument(
+            "db has more levels than options.num_levels");
+        break;
+      }
+    }
+  }
+  if (s->ok()) {
+    for (auto* cfd : *(version_set_->GetColumnFamilySet())) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (read_only_) {
+        cfd->table_cache()->SetTablesAreImmortal();
+      }
+      *s = LoadTables(cfd, /*prefetch_index_and_filter_in_cache=*/false,
+                      /*is_initial_load=*/true);
+      if (!s->ok()) {
+        // If s is IOError::PathNotFound, then we mark the db as corrupted.
+        if (s->IsPathNotFound()) {
+          *s = Status::Corruption("Corruption: " + s->ToString());
+        }
+        break;
+      }
+    }
+  }
+
+  if (s->ok()) {
+    for (auto* cfd : *(version_set_->column_family_set_)) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      assert(cfd->initialized());
+      VersionEdit edit;
+      *s = MaybeCreateVersion(edit, cfd, /*force_create_version=*/true);
+      if (!s->ok()) {
+        break;
+      }
+    }
+  }
+  if (s->ok()) {
+    version_set_->manifest_file_size_ = reader.GetReadOffset();
+    assert(version_set_->manifest_file_size_ > 0);
+    version_set_->next_file_number_.store(
+        version_edit_params_.next_file_number_ + 1);
+    SequenceNumber last_seq = version_edit_params_.last_sequence_;
+    assert(last_seq != kMaxSequenceNumber);
+    if (last_seq != kMaxSequenceNumber &&
+        last_seq > version_set_->last_allocated_sequence_.load()) {
+      version_set_->last_allocated_sequence_.store(last_seq);
+    }
+    if (last_seq != kMaxSequenceNumber &&
+        last_seq > version_set_->last_published_sequence_.load()) {
+      version_set_->last_published_sequence_.store(last_seq);
+    }
+    if (last_seq != kMaxSequenceNumber &&
+        last_seq > version_set_->last_sequence_.load()) {
+      version_set_->last_sequence_.store(last_seq);
+    }
+    if (last_seq != kMaxSequenceNumber &&
+        last_seq > version_set_->descriptor_last_sequence_) {
+      // This is the maximum last sequence of all `VersionEdit`s iterated.
+      // It may be greater than the maximum `largest_seqno` of all files in
+      // case the newest data referred to by the MANIFEST has been dropped or
+      // had its sequence number zeroed through compaction.
+      version_set_->descriptor_last_sequence_ = last_seq;
+    }
+    version_set_->prev_log_number_ = version_edit_params_.prev_log_number_;
+  }
+}
+
+ColumnFamilyData* VersionEditHandler::CreateCfAndInit(
+    const ColumnFamilyOptions& cf_options, const VersionEdit& edit) {
+  ColumnFamilyData* cfd =
+      version_set_->CreateColumnFamily(cf_options, read_options_, &edit);
+  assert(cfd != nullptr);
+  cfd->set_initialized();
+  assert(builders_.find(edit.column_family_) == builders_.end());
+  builders_.emplace(edit.column_family_,
+                    VersionBuilderUPtr(new BaseReferencedVersionBuilder(cfd)));
+  if (track_missing_files_) {
+    cf_to_missing_files_.emplace(edit.column_family_,
+                                 std::unordered_set<uint64_t>());
+    cf_to_missing_blob_files_high_.emplace(edit.column_family_,
+                                           kInvalidBlobFileNumber);
+  }
+  return cfd;
+}
+
+ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup(
+    const VersionEdit& edit) {
+  auto builder_iter = builders_.find(edit.column_family_);
+  assert(builder_iter != builders_.end());
+  builders_.erase(builder_iter);
+  if (track_missing_files_) {
+    auto missing_files_iter = cf_to_missing_files_.find(edit.column_family_);
+    assert(missing_files_iter != cf_to_missing_files_.end());
+    cf_to_missing_files_.erase(missing_files_iter);
+
+    auto missing_blob_files_high_iter =
+        cf_to_missing_blob_files_high_.find(edit.column_family_);
+    assert(missing_blob_files_high_iter !=
+           cf_to_missing_blob_files_high_.end());
+    cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter);
+  }
+  ColumnFamilyData* ret =
+      version_set_->GetColumnFamilySet()->GetColumnFamily(edit.column_family_);
+  assert(ret != nullptr);
+  ret->SetDropped();
+  ret->UnrefAndTryDelete();
+  ret = nullptr;
+  return ret;
+}
+
+Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/,
+                                              ColumnFamilyData* cfd,
+                                              bool force_create_version) {
+  assert(cfd->initialized());
+  Status s;
+  if (force_create_version) {
+    auto builder_iter = builders_.find(cfd->GetID());
+    assert(builder_iter != builders_.end());
+    auto* builder = builder_iter->second->version_builder();
+    auto* v = new Version(cfd, version_set_, version_set_->file_options_,
+                          *cfd->GetLatestMutableCFOptions(), io_tracer_,
+                          version_set_->current_version_number_++,
+                          epoch_number_requirement_);
+    s = builder->SaveTo(v->storage_info());
+    if (s.ok()) {
+      // Install new version
+      v->PrepareAppend(
+          *cfd->GetLatestMutableCFOptions(), read_options_,
+          !(version_set_->db_options_->skip_stats_update_on_db_open));
+      version_set_->AppendVersion(cfd, v);
+    } else {
+      delete v;
+    }
+  }
+  return s;
+}
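+
+// The install sequence above is the recurring pattern in this file (a
+// simplified recap, not upstream documentation): a VersionBuilder accumulates
+// edits, SaveTo() materializes them into a new Version's storage info, and
+// AppendVersion() makes that Version current. On failure the half-built
+// Version is deleted rather than installed. Roughly:
+//
+//   builder->Apply(&edit1);  // possibly many edits
+//   Version* v = new Version(...);
+//   if (builder->SaveTo(v->storage_info()).ok()) {
+//     version_set->AppendVersion(cfd, v);
+//   } else {
+//     delete v;
+//   }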
version_set_->db_options_->max_file_opening_threads, + prefetch_index_and_filter_in_cache, is_initial_load, + moptions->prefix_extractor, MaxFileSizeForL0MetaPin(*moptions), + read_options_, moptions->block_protection_bytes_per_key); + if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) { + s = Status::OK(); + } + if (!s.ok() && !version_set_->db_options_->paranoid_checks) { + s = Status::OK(); + } + return s; +} + +Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, + const VersionEdit& edit) { + Status s; + if (edit.has_db_id_) { + version_set_->db_id_ = edit.GetDbId(); + version_edit_params_.SetDBId(edit.db_id_); + } + if (cfd != nullptr) { + if (edit.has_log_number_) { + if (cfd->GetLogNumber() > edit.log_number_) { + ROCKS_LOG_WARN( + version_set_->db_options()->info_log, + "MANIFEST corruption detected, but ignored - Log numbers in " + "records NOT monotonically increasing"); + } else { + cfd->SetLogNumber(edit.log_number_); + version_edit_params_.SetLogNumber(edit.log_number_); + } + } + if (edit.has_comparator_ && + edit.comparator_ != cfd->user_comparator()->Name()) { + if (!cf_to_cmp_names_) { + s = Status::InvalidArgument( + cfd->user_comparator()->Name(), + "does not match existing comparator " + edit.comparator_); + } else { + cf_to_cmp_names_->emplace(cfd->GetID(), edit.comparator_); + } + } + if (edit.HasFullHistoryTsLow()) { + const std::string& new_ts = edit.GetFullHistoryTsLow(); + cfd->SetFullHistoryTsLow(new_ts); + } + } + + if (s.ok()) { + if (edit.has_prev_log_number_) { + version_edit_params_.SetPrevLogNumber(edit.prev_log_number_); + } + if (edit.has_next_file_number_) { + version_edit_params_.SetNextFile(edit.next_file_number_); + } + if (edit.has_max_column_family_) { + version_edit_params_.SetMaxColumnFamily(edit.max_column_family_); + } + if (edit.has_min_log_number_to_keep_) { + version_edit_params_.min_log_number_to_keep_ = + std::max(version_edit_params_.min_log_number_to_keep_, + edit.min_log_number_to_keep_); + } + if (edit.has_last_sequence_) { + // `VersionEdit::last_sequence_`s are assumed to be non-decreasing. This + // is legacy behavior that cannot change without breaking downgrade + // compatibility. 
+      assert(!version_edit_params_.has_last_sequence_ ||
+             version_edit_params_.last_sequence_ <= edit.last_sequence_);
+      version_edit_params_.SetLastSequence(edit.last_sequence_);
+    }
+    if (!version_edit_params_.has_prev_log_number_) {
+      version_edit_params_.SetPrevLogNumber(0);
+    }
+  }
+  return s;
+}
+
+VersionEditHandlerPointInTime::VersionEditHandlerPointInTime(
+    bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+    VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer,
+    const ReadOptions& read_options,
+    EpochNumberRequirement epoch_number_requirement)
+    : VersionEditHandler(read_only, column_families, version_set,
+                         /*track_missing_files=*/true,
+                         /*no_error_if_files_missing=*/true, io_tracer,
+                         read_options, epoch_number_requirement) {}
+
+VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() {
+  for (const auto& elem : versions_) {
+    delete elem.second;
+  }
+  versions_.clear();
+}
+
+void VersionEditHandlerPointInTime::CheckIterationResult(
+    const log::Reader& reader, Status* s) {
+  VersionEditHandler::CheckIterationResult(reader, s);
+  assert(s != nullptr);
+  if (s->ok()) {
+    for (auto* cfd : *(version_set_->column_family_set_)) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      assert(cfd->initialized());
+      auto v_iter = versions_.find(cfd->GetID());
+      if (v_iter != versions_.end()) {
+        assert(v_iter->second != nullptr);
+
+        version_set_->AppendVersion(cfd, v_iter->second);
+        versions_.erase(v_iter);
+      }
+    }
+  } else {
+    for (const auto& elem : versions_) {
+      delete elem.second;
+    }
+    versions_.clear();
+  }
+}
+
+ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup(
+    const VersionEdit& edit) {
+  ColumnFamilyData* cfd = VersionEditHandler::DestroyCfAndCleanup(edit);
+  auto v_iter = versions_.find(edit.column_family_);
+  if (v_iter != versions_.end()) {
+    delete v_iter->second;
+    versions_.erase(v_iter);
+  }
+  return cfd;
+}
+
+Status VersionEditHandlerPointInTime::MaybeCreateVersion(
+    const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) {
+  assert(cfd != nullptr);
+  if (!force_create_version) {
+    assert(edit.column_family_ == cfd->GetID());
+  }
+  auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID());
+  assert(missing_files_iter != cf_to_missing_files_.end());
+  std::unordered_set<uint64_t>& missing_files = missing_files_iter->second;
+
+  auto missing_blob_files_high_iter =
+      cf_to_missing_blob_files_high_.find(cfd->GetID());
+  assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end());
+  const uint64_t prev_missing_blob_file_high =
+      missing_blob_files_high_iter->second;
+
+  VersionBuilder* builder = nullptr;
+
+  if (prev_missing_blob_file_high != kInvalidBlobFileNumber) {
+    auto builder_iter = builders_.find(cfd->GetID());
+    assert(builder_iter != builders_.end());
+    builder = builder_iter->second->version_builder();
+    assert(builder != nullptr);
+  }
+
+  // At this point, we have not yet applied the new version edits read from the
+  // MANIFEST. We check whether we have any missing table and blob files.
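+  // In pseudocode, the decision made below is roughly (an illustrative
+  // summary, not upstream documentation):
+  //
+  //   missing_before = had missing table/blob files prior to this edit
+  //   missing_after  = will have missing files once this edit is applied
+  //   if (!missing_before && missing_after) {
+  //     // save a Version now: the last consistent point in time
+  //   } else if (!missing_after && force_create_version) {
+  //     // everything is present; create the Version on request
+  //   }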
+  const bool prev_has_missing_files =
+      !missing_files.empty() ||
+      (prev_missing_blob_file_high != kInvalidBlobFileNumber &&
+       prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber());
+
+  for (const auto& file : edit.GetDeletedFiles()) {
+    uint64_t file_num = file.second;
+    auto fiter = missing_files.find(file_num);
+    if (fiter != missing_files.end()) {
+      missing_files.erase(fiter);
+    }
+  }
+
+  assert(!cfd->ioptions()->cf_paths.empty());
+  Status s;
+  for (const auto& elem : edit.GetNewFiles()) {
+    int level = elem.first;
+    const FileMetaData& meta = elem.second;
+    const FileDescriptor& fd = meta.fd;
+    uint64_t file_num = fd.GetNumber();
+    const std::string fpath =
+        MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num);
+    s = VerifyFile(cfd, fpath, level, meta);
+    if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
+      missing_files.insert(file_num);
+      s = Status::OK();
+    } else if (!s.ok()) {
+      break;
+    }
+  }
+
+  uint64_t missing_blob_file_num = prev_missing_blob_file_high;
+  for (const auto& elem : edit.GetBlobFileAdditions()) {
+    uint64_t file_num = elem.GetBlobFileNumber();
+    s = VerifyBlobFile(cfd, file_num, elem);
+    if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
+      missing_blob_file_num = std::max(missing_blob_file_num, file_num);
+      s = Status::OK();
+    } else if (!s.ok()) {
+      break;
+    }
+  }
+
+  bool has_missing_blob_files = false;
+  if (missing_blob_file_num != kInvalidBlobFileNumber &&
+      missing_blob_file_num >= prev_missing_blob_file_high) {
+    missing_blob_files_high_iter->second = missing_blob_file_num;
+    has_missing_blob_files = true;
+  } else if (missing_blob_file_num < prev_missing_blob_file_high) {
+    assert(false);
+  }
+
+  // We still have not applied the new version edit, but have tried to add new
+  // table and blob files after verifying their presence and consistency.
+  // Therefore, we know whether we will see new missing table and blob files
+  // later after actually applying the version edit. We perform the check here
+  // and record the result.
+  const bool has_missing_files =
+      !missing_files.empty() || has_missing_blob_files;
+
+  bool missing_info = !version_edit_params_.has_log_number_ ||
+                      !version_edit_params_.has_next_file_number_ ||
+                      !version_edit_params_.has_last_sequence_;
+
+  // Create the version before applying the edit. The version will represent
+  // the state before applying the version edit.
+  // A new version will be created if:
+  // 1) no error has occurred so far, and
+  // 2) log_number_, next_file_number_ and last_sequence_ are known, and
+  // 3) any of the following:
+  //   a) no missing file before, but will have missing file(s) after applying
+  //      this version edit.
+  //   b) no missing file after applying the version edit, and the caller
+  //      explicitly requests that a new version be created.
+ if (s.ok() && !missing_info && + ((has_missing_files && !prev_has_missing_files) || + (!has_missing_files && force_create_version))) { + if (!builder) { + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + builder = builder_iter->second->version_builder(); + assert(builder); + } + + const MutableCFOptions* cf_opts_ptr = cfd->GetLatestMutableCFOptions(); + auto* version = new Version(cfd, version_set_, version_set_->file_options_, + *cf_opts_ptr, io_tracer_, + version_set_->current_version_number_++, + epoch_number_requirement_); + s = builder->LoadTableHandlers( + cfd->internal_stats(), + version_set_->db_options_->max_file_opening_threads, false, true, + cf_opts_ptr->prefix_extractor, MaxFileSizeForL0MetaPin(*cf_opts_ptr), + read_options_, cf_opts_ptr->block_protection_bytes_per_key); + if (!s.ok()) { + delete version; + if (s.IsCorruption()) { + s = Status::OK(); + } + return s; + } + s = builder->SaveTo(version->storage_info()); + if (s.ok()) { + version->PrepareAppend( + *cfd->GetLatestMutableCFOptions(), read_options_, + !version_set_->db_options_->skip_stats_update_on_db_open); + auto v_iter = versions_.find(cfd->GetID()); + if (v_iter != versions_.end()) { + delete v_iter->second; + v_iter->second = version; + } else { + versions_.emplace(cfd->GetID(), version); + } + } else { + delete version; + } + } + return s; +} + +Status VersionEditHandlerPointInTime::VerifyFile(ColumnFamilyData* cfd, + const std::string& fpath, + int level, + const FileMetaData& fmeta) { + return version_set_->VerifyFileMetadata(read_options_, cfd, fpath, level, + fmeta); +} + +Status VersionEditHandlerPointInTime::VerifyBlobFile( + ColumnFamilyData* cfd, uint64_t blob_file_num, + const BlobFileAddition& blob_addition) { + BlobSource* blob_source = cfd->blob_source(); + assert(blob_source); + CacheHandleGuard blob_file_reader; + + Status s = blob_source->GetBlobFileReader(read_options_, blob_file_num, + &blob_file_reader); + if (!s.ok()) { + return s; + } + // TODO: verify checksum + (void)blob_addition; + return s; +} + +Status VersionEditHandlerPointInTime::LoadTables( + ColumnFamilyData* /*cfd*/, bool /*prefetch_index_and_filter_in_cache*/, + bool /*is_initial_load*/) { + return Status::OK(); +} + +Status ManifestTailer::Initialize() { + if (Mode::kRecovery == mode_) { + return VersionEditHandler::Initialize(); + } + assert(Mode::kCatchUp == mode_); + Status s; + if (!initialized_) { + ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet(); + assert(cfd_set); + ColumnFamilyData* default_cfd = cfd_set->GetDefault(); + assert(default_cfd); + auto builder_iter = builders_.find(default_cfd->GetID()); + assert(builder_iter != builders_.end()); + + Version* dummy_version = default_cfd->dummy_versions(); + assert(dummy_version); + Version* base_version = dummy_version->Next(); + assert(base_version); + base_version->Ref(); + VersionBuilderUPtr new_builder( + new BaseReferencedVersionBuilder(default_cfd, base_version)); + builder_iter->second = std::move(new_builder); + + initialized_ = true; + } + return s; +} + +Status ManifestTailer::ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** cfd) { + Status s = VersionEditHandler::ApplyVersionEdit(edit, cfd); + if (s.ok()) { + assert(cfd); + if (*cfd) { + cfds_changed_.insert(*cfd); + } + } + return s; +} + +Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit, + ColumnFamilyData** cfd) { + if (Mode::kRecovery == mode_) { + return VersionEditHandler::OnColumnFamilyAdd(edit, cfd); + } + 
+
+Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit,
+                                         ColumnFamilyData** cfd) {
+  if (Mode::kRecovery == mode_) {
+    return VersionEditHandler::OnColumnFamilyAdd(edit, cfd);
+  }
+  assert(Mode::kCatchUp == mode_);
+  ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet();
+  assert(cfd_set);
+  ColumnFamilyData* tmp_cfd = cfd_set->GetColumnFamily(edit.GetColumnFamily());
+  assert(cfd);
+  *cfd = tmp_cfd;
+  if (!tmp_cfd) {
+    // For now, ignore new column families created after Recover() succeeds.
+    return Status::OK();
+  }
+  auto builder_iter = builders_.find(edit.GetColumnFamily());
+  assert(builder_iter != builders_.end());
+
+  Version* dummy_version = tmp_cfd->dummy_versions();
+  assert(dummy_version);
+  Version* base_version = dummy_version->Next();
+  assert(base_version);
+  base_version->Ref();
+  VersionBuilderUPtr new_builder(
+      new BaseReferencedVersionBuilder(tmp_cfd, base_version));
+  builder_iter->second = std::move(new_builder);
+
+#ifndef NDEBUG
+  auto version_iter = versions_.find(edit.GetColumnFamily());
+  assert(version_iter == versions_.end());
+#endif  // !NDEBUG
+  return Status::OK();
+}
+
+void ManifestTailer::CheckIterationResult(const log::Reader& reader,
+                                          Status* s) {
+  VersionEditHandlerPointInTime::CheckIterationResult(reader, s);
+  assert(s);
+  if (s->ok()) {
+    if (Mode::kRecovery == mode_) {
+      mode_ = Mode::kCatchUp;
+    } else {
+      assert(Mode::kCatchUp == mode_);
+    }
+  }
+}
+
+Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd,
+                                  const std::string& fpath, int level,
+                                  const FileMetaData& fmeta) {
+  Status s =
+      VersionEditHandlerPointInTime::VerifyFile(cfd, fpath, level, fmeta);
+  // TODO: Open file or create hard link to prevent the file from being
+  // deleted.
+  return s;
+}
+
+void DumpManifestHandler::CheckIterationResult(const log::Reader& reader,
+                                               Status* s) {
+  VersionEditHandler::CheckIterationResult(reader, s);
+  if (!s->ok()) {
+    fprintf(stdout, "%s\n", s->ToString().c_str());
+    return;
+  }
+  assert(cf_to_cmp_names_);
+  for (auto* cfd : *(version_set_->column_family_set_)) {
+    fprintf(stdout,
+            "--------------- Column family \"%s\" (ID %" PRIu32
+            ") --------------\n",
+            cfd->GetName().c_str(), cfd->GetID());
+    fprintf(stdout, "log number: %" PRIu64 "\n", cfd->GetLogNumber());
+    auto it = cf_to_cmp_names_->find(cfd->GetID());
+    if (it != cf_to_cmp_names_->end()) {
+      fprintf(stdout,
+              "comparator: <%s>, but the comparator object is not available.\n",
+              it->second.c_str());
+    } else {
+      fprintf(stdout, "comparator: %s\n", cfd->user_comparator()->Name());
+    }
+    assert(cfd->current());
+
+    // Print out DebugStrings. Can include non-terminating null characters.
+    fwrite(cfd->current()->DebugString(hex_).data(), sizeof(char),
+           cfd->current()->DebugString(hex_).size(), stdout);
+  }
+  fprintf(stdout,
+          "next_file_number %" PRIu64 " last_sequence %" PRIu64
+          " prev_log_number %" PRIu64 " max_column_family %" PRIu32
+          " min_log_number_to_keep %" PRIu64 "\n",
+          version_set_->current_next_file_number(),
+          version_set_->LastSequence(), version_set_->prev_log_number(),
+          version_set_->column_family_set_->GetMaxColumnFamily(),
+          version_set_->min_log_number_to_keep());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit_handler.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit_handler.h
new file mode 100644
index 0000000000..4b9f195420
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_edit_handler.h
@@ -0,0 +1,334 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/version_builder.h"
+#include "db/version_edit.h"
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FileMetaData;
+
+class VersionEditHandlerBase {
+ public:
+  explicit VersionEditHandlerBase(const ReadOptions& read_options)
+      : read_options_(read_options),
+        max_manifest_read_size_(std::numeric_limits<uint64_t>::max()) {}
+
+  virtual ~VersionEditHandlerBase() {}
+
+  void Iterate(log::Reader& reader, Status* log_read_status);
+
+  const Status& status() const { return status_; }
+
+  AtomicGroupReadBuffer& GetReadBuffer() { return read_buffer_; }
+
+ protected:
+  explicit VersionEditHandlerBase(const ReadOptions& read_options,
+                                  uint64_t max_read_size)
+      : read_options_(read_options), max_manifest_read_size_(max_read_size) {}
+  virtual Status Initialize() { return Status::OK(); }
+
+  virtual Status ApplyVersionEdit(VersionEdit& edit,
+                                  ColumnFamilyData** cfd) = 0;
+
+  virtual void CheckIterationResult(const log::Reader& /*reader*/,
+                                    Status* /*s*/) {}
+
+  void ClearReadBuffer() { read_buffer_.Clear(); }
+
+  Status status_;
+
+  const ReadOptions& read_options_;
+
+ private:
+  AtomicGroupReadBuffer read_buffer_;
+  const uint64_t max_manifest_read_size_;
+};
+
+class ListColumnFamiliesHandler : public VersionEditHandlerBase {
+ public:
+  explicit ListColumnFamiliesHandler(const ReadOptions& read_options)
+      : VersionEditHandlerBase(read_options) {}
+
+  ~ListColumnFamiliesHandler() override {}
+
+  const std::map<uint32_t, std::string> GetColumnFamilyNames() const {
+    return column_family_names_;
+  }
+
+ protected:
+  Status ApplyVersionEdit(VersionEdit& edit,
+                          ColumnFamilyData** /*unused*/) override;
+
+ private:
+  // default column family is always implicitly there
+  std::map<uint32_t, std::string> column_family_names_{
+      {0, kDefaultColumnFamilyName}};
+};
+
+class FileChecksumRetriever : public VersionEditHandlerBase {
+ public:
+  FileChecksumRetriever(const ReadOptions& read_options, uint64_t max_read_size,
+                        FileChecksumList& file_checksum_list)
+      : VersionEditHandlerBase(read_options, max_read_size),
+        file_checksum_list_(file_checksum_list) {}
+
+  ~FileChecksumRetriever() override {}
+
+ protected:
+  Status ApplyVersionEdit(VersionEdit& edit,
+                          ColumnFamilyData** /*unused*/) override;
+
+ private:
+  FileChecksumList& file_checksum_list_;
+};
+
+using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
+
+// A class used for scanning MANIFEST file.
+// VersionEditHandler reads a MANIFEST file, parses the version edits, and
+// builds the version set's in-memory state, e.g. the version storage info for
+// the versions of column families.
+// To use this class and its subclasses,
+// 1. Create an object of VersionEditHandler or its subclasses.
+//    VersionEditHandler handler(read_only, column_families, version_set,
+//                               track_missing_files,
+//                               no_error_if_files_missing);
+// 2. Status s = handler.Iterate(reader, &db_id);
+// 3. Check s and handle possible errors.
+//
+// Not thread-safe, external synchronization is necessary if an object of
+// VersionEditHandler is shared by multiple threads.
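+//
+// For example (an illustrative sketch; the argument values are hypothetical):
+//
+//   std::vector<ColumnFamilyDescriptor> cfs = /* descriptors */;
+//   VersionEditHandler handler(/*read_only=*/false, cfs, versions,
+//                              /*track_missing_files=*/false,
+//                              /*no_error_if_files_missing=*/false,
+//                              io_tracer, read_options);
+//   handler.Iterate(reader, &log_read_status);
+//   Status s = handler.status();  // overall result of the replay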
+class VersionEditHandler : public VersionEditHandlerBase {
+ public:
+  explicit VersionEditHandler(
+      bool read_only,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      VersionSet* version_set, bool track_missing_files,
+      bool no_error_if_files_missing,
+      const std::shared_ptr<IOTracer>& io_tracer,
+      const ReadOptions& read_options,
+      EpochNumberRequirement epoch_number_requirement =
+          EpochNumberRequirement::kMustPresent)
+      : VersionEditHandler(
+            read_only, column_families, version_set, track_missing_files,
+            no_error_if_files_missing, io_tracer, read_options,
+            /*skip_load_table_files=*/false, epoch_number_requirement) {}
+
+  ~VersionEditHandler() override {}
+
+  const VersionEditParams& GetVersionEditParams() const {
+    return version_edit_params_;
+  }
+
+  bool HasMissingFiles() const;
+
+  void GetDbId(std::string* db_id) const {
+    if (db_id && version_edit_params_.has_db_id_) {
+      *db_id = version_edit_params_.db_id_;
+    }
+  }
+
+ protected:
+  explicit VersionEditHandler(
+      bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+      VersionSet* version_set, bool track_missing_files,
+      bool no_error_if_files_missing,
+      const std::shared_ptr<IOTracer>& io_tracer,
+      const ReadOptions& read_options, bool skip_load_table_files,
+      EpochNumberRequirement epoch_number_requirement =
+          EpochNumberRequirement::kMustPresent);
+
+  Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+  virtual Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd);
+
+  Status OnColumnFamilyDrop(VersionEdit& edit, ColumnFamilyData** cfd);
+
+  Status OnNonCfOperation(VersionEdit& edit, ColumnFamilyData** cfd);
+
+  Status OnWalAddition(VersionEdit& edit);
+
+  Status OnWalDeletion(VersionEdit& edit);
+
+  Status Initialize() override;
+
+  void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found,
+                           bool* cf_in_builders) const;
+
+  void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+  ColumnFamilyData* CreateCfAndInit(const ColumnFamilyOptions& cf_options,
+                                    const VersionEdit& edit);
+
+  virtual ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit);
+
+  virtual Status MaybeCreateVersion(const VersionEdit& edit,
+                                    ColumnFamilyData* cfd,
+                                    bool force_create_version);
+
+  virtual Status LoadTables(ColumnFamilyData* cfd,
+                            bool prefetch_index_and_filter_in_cache,
+                            bool is_initial_load);
+
+  virtual bool MustOpenAllColumnFamilies() const { return !read_only_; }
+
+  const bool read_only_;
+  std::vector<ColumnFamilyDescriptor> column_families_;
+  VersionSet* version_set_;
+  std::unordered_map<uint32_t, VersionBuilderUPtr> builders_;
+  std::unordered_map<std::string, ColumnFamilyOptions> name_to_options_;
+  // Keeps track of column families in manifest that were not found in
+  // column families parameters. If those column families are not dropped
+  // by subsequent manifest records, Recover() will return failure status.
+  std::unordered_map<uint32_t, std::string> column_families_not_found_;
+  VersionEditParams version_edit_params_;
+  const bool track_missing_files_;
+  std::unordered_map<uint32_t, std::unordered_set<uint64_t>>
+      cf_to_missing_files_;
+  std::unordered_map<uint32_t, uint64_t> cf_to_missing_blob_files_high_;
+  bool no_error_if_files_missing_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  bool skip_load_table_files_;
+  bool initialized_;
+  std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_;
+  EpochNumberRequirement epoch_number_requirement_;
+
+ private:
+  Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
+                                    const VersionEdit& edit);
+};
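+
+// An informal overview of how the handler subclasses below relate:
+//
+//   VersionEditHandlerBase
+//   └── VersionEditHandler              -- full recovery of the version set
+//       ├── VersionEditHandlerPointInTime
+//       │   └── ManifestTailer          -- follows a live MANIFEST
+//       └── DumpManifestHandler         -- prints edits for debugging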
+
+// A class similar to its base class, i.e. VersionEditHandler.
+// VersionEditHandlerPointInTime restores the versions to the most recent
+// point in time such that at this point, the version does not have missing
+// files.
+//
+// Not thread-safe, external synchronization is necessary if an object of
+// VersionEditHandlerPointInTime is shared by multiple threads.
+class VersionEditHandlerPointInTime : public VersionEditHandler {
+ public:
+  VersionEditHandlerPointInTime(
+      bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+      VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer,
+      const ReadOptions& read_options,
+      EpochNumberRequirement epoch_number_requirement =
+          EpochNumberRequirement::kMustPresent);
+  ~VersionEditHandlerPointInTime() override;
+
+ protected:
+  void CheckIterationResult(const log::Reader& reader, Status* s) override;
+  ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override;
+  Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd,
+                            bool force_create_version) override;
+  virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
+                            int level, const FileMetaData& fmeta);
+  virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
+                                const BlobFileAddition& blob_addition);
+
+  Status LoadTables(ColumnFamilyData* cfd,
+                    bool prefetch_index_and_filter_in_cache,
+                    bool is_initial_load) override;
+
+  std::unordered_map<uint32_t, Version*> versions_;
+};
+
+class ManifestTailer : public VersionEditHandlerPointInTime {
+ public:
+  explicit ManifestTailer(std::vector<ColumnFamilyDescriptor> column_families,
+                          VersionSet* version_set,
+                          const std::shared_ptr<IOTracer>& io_tracer,
+                          const ReadOptions& read_options,
+                          EpochNumberRequirement epoch_number_requirement =
+                              EpochNumberRequirement::kMustPresent)
+      : VersionEditHandlerPointInTime(/*read_only=*/false, column_families,
+                                      version_set, io_tracer, read_options,
+                                      epoch_number_requirement),
+        mode_(Mode::kRecovery) {}
+
+  void PrepareToReadNewManifest() {
+    initialized_ = false;
+    ClearReadBuffer();
+  }
+
+  std::unordered_set<ColumnFamilyData*>& GetUpdatedColumnFamilies() {
+    return cfds_changed_;
+  }
+
+ protected:
+  Status Initialize() override;
+
+  bool MustOpenAllColumnFamilies() const override { return false; }
+
+  Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+  Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+  void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+  Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
+                    const FileMetaData& fmeta) override;
+
+  enum Mode : uint8_t {
+    kRecovery = 0,
+    kCatchUp = 1,
+  };
+
+  Mode mode_;
+  std::unordered_set<ColumnFamilyData*> cfds_changed_;
+};
+
+class DumpManifestHandler : public VersionEditHandler {
+ public:
+  DumpManifestHandler(std::vector<ColumnFamilyDescriptor> column_families,
+                      VersionSet* version_set,
+                      const std::shared_ptr<IOTracer>& io_tracer,
+                      const ReadOptions& read_options, bool verbose, bool hex,
+                      bool json)
+      : VersionEditHandler(
+            /*read_only=*/true, column_families, version_set,
+            /*track_missing_files=*/false,
+            /*no_error_if_files_missing=*/false, io_tracer, read_options,
+            /*skip_load_table_files=*/true),
+        verbose_(verbose),
+        hex_(hex),
+        json_(json),
+        count_(0) {
+    cf_to_cmp_names_.reset(new std::unordered_map<uint32_t, std::string>());
+  }
+
+  ~DumpManifestHandler() override {}
+
+  Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override {
+    // Write out each individual edit
+    if (verbose_ && !json_) {
+      // Print out DebugStrings. Can include non-terminating null characters.
+      fwrite(edit.DebugString(hex_).data(), sizeof(char),
+             edit.DebugString(hex_).size(), stdout);
+    } else if (json_) {
+      // Print out DebugJSON. Can include non-terminating null characters.
+      fwrite(edit.DebugJSON(count_, hex_).data(), sizeof(char),
+             edit.DebugJSON(count_, hex_).size(), stdout);
+    }
+    ++count_;
+    return VersionEditHandler::ApplyVersionEdit(edit, cfd);
+  }
+
+  void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+ private:
+  const bool verbose_;
+  const bool hex_;
+  const bool json_;
+  int count_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_set.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_set.cc
new file mode 100644
index 0000000000..674c0e4aa9
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_set.cc
@@ -0,0 +1,7286 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <cstdio>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_source.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/file_pri.h"
+#include "db/dbformat.h"
+#include "db/internal_stats.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/table_cache.h"
+#include "db/version_builder.h"
+#include "db/version_edit.h"
+#include "db/version_edit_handler.h"
+#include "table/compaction_merging_iterator.h"
+
+#if USE_COROUTINES
+#include "folly/experimental/coro/BlockingWait.h"
+#include "folly/experimental/coro/Collect.h"
+#endif
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/file_read_sample.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/persistent_stats_history.h"
+#include "options/options_helper.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/merging_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/multiget_context.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_reader.h"
+#include "table/two_level_iterator.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/coro_utils.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/user_comparator_wrapper.h"
+
+// Generate the regular and coroutine versions of some methods by
+// including version_set_sync_and_async.h twice
+// Macros in the header will expand differently based on whether
+// WITH_COROUTINES or WITHOUT_COROUTINES is defined
+// clang-format off
+#define WITHOUT_COROUTINES
+#include "db/version_set_sync_and_async.h"
+#undef WITHOUT_COROUTINES
+#define WITH_COROUTINES
+#include 
"db/version_set_sync_and_async.h" +#undef WITH_COROUTINES +// clang-format on + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Find File in LevelFilesBrief data structure +// Within an index range defined by left and right +int FindFileInRange(const InternalKeyComparator& icmp, + const LevelFilesBrief& file_level, const Slice& key, + uint32_t left, uint32_t right) { + auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { + return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0; + }; + const auto& b = file_level.files; + return static_cast(std::lower_bound(b + left, b + right, key, cmp) - b); +} + +Status OverlapWithIterator(const Comparator* ucmp, + const Slice& smallest_user_key, + const Slice& largest_user_key, + InternalIterator* iter, bool* overlap) { + InternalKey range_start(smallest_user_key, kMaxSequenceNumber, + kValueTypeForSeek); + iter->Seek(range_start.Encode()); + if (!iter->status().ok()) { + return iter->status(); + } + + *overlap = false; + if (iter->Valid()) { + ParsedInternalKey seek_result; + Status s = ParseInternalKey(iter->key(), &seek_result, + false /* log_err_key */); // TODO + if (!s.ok()) return s; + + if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <= + 0) { + *overlap = true; + } + } + + return iter->status(); +} + +// Class to help choose the next file to search for the particular key. +// Searches and returns files level by level. +// We can search level-by-level since entries never hop across +// levels. Therefore we are guaranteed that if we find data +// in a smaller level, later levels are irrelevant (unless we +// are MergeInProgress). +class FilePicker { + public: + FilePicker(const Slice& user_key, const Slice& ikey, + autovector* file_levels, unsigned int num_levels, + FileIndexer* file_indexer, const Comparator* user_comparator, + const InternalKeyComparator* internal_comparator) + : num_levels_(num_levels), + curr_level_(static_cast(-1)), + returned_file_level_(static_cast(-1)), + hit_file_level_(static_cast(-1)), + search_left_bound_(0), + search_right_bound_(FileIndexer::kLevelMaxIndex), + level_files_brief_(file_levels), + is_hit_file_last_in_level_(false), + curr_file_level_(nullptr), + user_key_(user_key), + ikey_(ikey), + file_indexer_(file_indexer), + user_comparator_(user_comparator), + internal_comparator_(internal_comparator) { + // Setup member variables to search first level. + search_ended_ = !PrepareNextLevel(); + if (!search_ended_) { + // Prefetch Level 0 table data to avoid cache miss if possible. + for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) { + auto* r = (*level_files_brief_)[0].files[i].fd.table_reader; + if (r) { + r->Prepare(ikey); + } + } + } + } + + int GetCurrentLevel() const { return curr_level_; } + + FdWithKeyRange* GetNextFile() { + while (!search_ended_) { // Loops over different levels. + while (curr_index_in_curr_level_ < curr_file_level_->num_files) { + // Loops over all files in current level. + FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_]; + hit_file_level_ = curr_level_; + is_hit_file_last_in_level_ = + curr_index_in_curr_level_ == curr_file_level_->num_files - 1; + int cmp_largest = -1; + + // Do key range filtering of files or/and fractional cascading if: + // (1) not all the files are in level 0, or + // (2) there are more than 3 current level files + // If there are only 3 or less current level files in the system, we + // skip the key range filtering. 
In this case, more likely, the system + // is highly tuned to minimize number of tables queried by each query, + // so it is unlikely that key range filtering is more efficient than + // querying the files. + if (num_levels_ > 1 || curr_file_level_->num_files > 3) { + // Check if key is within a file's range. If search left bound and + // right bound point to the same find, we are sure key falls in + // range. + assert(curr_level_ == 0 || + curr_index_in_curr_level_ == start_index_in_curr_level_ || + user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)) <= 0); + + int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)); + if (cmp_smallest >= 0) { + cmp_largest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->largest_key)); + } + + // Setup file search bound for the next level based on the + // comparison results + if (curr_level_ > 0) { + file_indexer_->GetNextLevelIndex( + curr_level_, curr_index_in_curr_level_, cmp_smallest, + cmp_largest, &search_left_bound_, &search_right_bound_); + } + // Key falls out of current file's range + if (cmp_smallest < 0 || cmp_largest > 0) { + if (curr_level_ == 0) { + ++curr_index_in_curr_level_; + continue; + } else { + // Search next level. + break; + } + } + } + + returned_file_level_ = curr_level_; + if (curr_level_ > 0 && cmp_largest < 0) { + // No more files to search in this level. + search_ended_ = !PrepareNextLevel(); + } else { + ++curr_index_in_curr_level_; + } + return f; + } + // Start searching next level. + search_ended_ = !PrepareNextLevel(); + } + // Search ended. + return nullptr; + } + + // getter for current file level + // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts + unsigned int GetHitFileLevel() { return hit_file_level_; } + + // Returns true if the most recent "hit file" (i.e., one returned by + // GetNextFile()) is at the last index in its level. + bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; } + + private: + unsigned int num_levels_; + unsigned int curr_level_; + unsigned int returned_file_level_; + unsigned int hit_file_level_; + int32_t search_left_bound_; + int32_t search_right_bound_; + autovector* level_files_brief_; + bool search_ended_; + bool is_hit_file_last_in_level_; + LevelFilesBrief* curr_file_level_; + unsigned int curr_index_in_curr_level_; + unsigned int start_index_in_curr_level_; + Slice user_key_; + Slice ikey_; + FileIndexer* file_indexer_; + const Comparator* user_comparator_; + const InternalKeyComparator* internal_comparator_; + + // Setup local variables to search next level. + // Returns false if there are no more levels to search. + bool PrepareNextLevel() { + curr_level_++; + while (curr_level_ < num_levels_) { + curr_file_level_ = &(*level_files_brief_)[curr_level_]; + if (curr_file_level_->num_files == 0) { + // When current level is empty, the search bound generated from upper + // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is + // also empty. + assert(search_left_bound_ == 0); + assert(search_right_bound_ == -1 || + search_right_bound_ == FileIndexer::kLevelMaxIndex); + // Since current level is empty, it will need to search all files in + // the next level + search_left_bound_ = 0; + search_right_bound_ = FileIndexer::kLevelMaxIndex; + curr_level_++; + continue; + } + + // Some files may overlap each other. We find + // all files that overlap user_key and process them in order from + // newest to oldest. 
In the context of merge-operator, this can occur at + // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes + // are always compacted into a single entry). + int32_t start_index; + if (curr_level_ == 0) { + // On Level-0, we read through all files to check for overlap. + start_index = 0; + } else { + // On Level-n (n>=1), files are sorted. Binary search to find the + // earliest file whose largest key >= ikey. Search left bound and + // right bound are used to narrow the range. + if (search_left_bound_ <= search_right_bound_) { + if (search_right_bound_ == FileIndexer::kLevelMaxIndex) { + search_right_bound_ = + static_cast(curr_file_level_->num_files) - 1; + } + // `search_right_bound_` is an inclusive upper-bound, but since it was + // determined based on user key, it is still possible the lookup key + // falls to the right of `search_right_bound_`'s corresponding file. + // So, pass a limit one higher, which allows us to detect this case. + start_index = + FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_, + static_cast(search_left_bound_), + static_cast(search_right_bound_) + 1); + if (start_index == search_right_bound_ + 1) { + // `ikey_` comes after `search_right_bound_`. The lookup key does + // not exist on this level, so let's skip this level and do a full + // binary search on the next level. + search_left_bound_ = 0; + search_right_bound_ = FileIndexer::kLevelMaxIndex; + curr_level_++; + continue; + } + } else { + // search_left_bound > search_right_bound, key does not exist in + // this level. Since no comparison is done in this level, it will + // need to search all files in the next level. + search_left_bound_ = 0; + search_right_bound_ = FileIndexer::kLevelMaxIndex; + curr_level_++; + continue; + } + } + start_index_in_curr_level_ = start_index; + curr_index_in_curr_level_ = start_index; + + return true; + } + // curr_level_ = num_levels_. So, no more levels to search. + return false; + } +}; +} // anonymous namespace + +class FilePickerMultiGet { + private: + struct FilePickerContext; + + public: + FilePickerMultiGet(MultiGetRange* range, + autovector* file_levels, + unsigned int num_levels, FileIndexer* file_indexer, + const Comparator* user_comparator, + const InternalKeyComparator* internal_comparator) + : num_levels_(num_levels), + curr_level_(static_cast(-1)), + returned_file_level_(static_cast(-1)), + hit_file_level_(static_cast(-1)), + range_(*range, range->begin(), range->end()), + maybe_repeat_key_(false), + current_level_range_(*range, range->begin(), range->end()), + current_file_range_(*range, range->begin(), range->end()), + batch_iter_(range->begin()), + batch_iter_prev_(range->begin()), + upper_key_(range->begin()), + level_files_brief_(file_levels), + is_hit_file_last_in_level_(false), + curr_file_level_(nullptr), + file_indexer_(file_indexer), + user_comparator_(user_comparator), + internal_comparator_(internal_comparator), + hit_file_(nullptr) { + for (auto iter = range_.begin(); iter != range_.end(); ++iter) { + fp_ctx_array_[iter.index()] = + FilePickerContext(0, FileIndexer::kLevelMaxIndex); + } + + // Setup member variables to search first level. + search_ended_ = !PrepareNextLevel(); + if (!search_ended_) { + // REVISIT + // Prefetch Level 0 table data to avoid cache miss if possible. + // As of now, only PlainTableReader and CuckooTableReader do any + // prefetching. 
This may not be necessary anymore once we implement + // batching in those table readers + for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) { + auto* r = (*level_files_brief_)[0].files[i].fd.table_reader; + if (r) { + for (auto iter = range_.begin(); iter != range_.end(); ++iter) { + r->Prepare(iter->ikey); + } + } + } + } + } + + FilePickerMultiGet(MultiGetRange* range, const FilePickerMultiGet& other) + : num_levels_(other.num_levels_), + curr_level_(other.curr_level_), + returned_file_level_(other.returned_file_level_), + hit_file_level_(other.hit_file_level_), + fp_ctx_array_(other.fp_ctx_array_), + range_(*range, range->begin(), range->end()), + maybe_repeat_key_(false), + current_level_range_(*range, range->begin(), range->end()), + current_file_range_(*range, range->begin(), range->end()), + batch_iter_(range->begin()), + batch_iter_prev_(range->begin()), + upper_key_(range->begin()), + level_files_brief_(other.level_files_brief_), + is_hit_file_last_in_level_(false), + curr_file_level_(other.curr_file_level_), + file_indexer_(other.file_indexer_), + user_comparator_(other.user_comparator_), + internal_comparator_(other.internal_comparator_), + hit_file_(nullptr) { + PrepareNextLevelForSearch(); + } + + int GetCurrentLevel() const { return curr_level_; } + + void PrepareNextLevelForSearch() { search_ended_ = !PrepareNextLevel(); } + + FdWithKeyRange* GetNextFileInLevel() { + if (batch_iter_ == current_level_range_.end() || search_ended_) { + hit_file_ = nullptr; + return nullptr; + } else { + if (maybe_repeat_key_) { + maybe_repeat_key_ = false; + // Check if we found the final value for the last key in the + // previous lookup range. If we did, then there's no need to look + // any further for that key, so advance batch_iter_. Else, keep + // batch_iter_ positioned on that key so we look it up again in + // the next file + // For L0, always advance the key because we will look in the next + // file regardless for all keys not found yet + if (current_level_range_.CheckKeyDone(batch_iter_) || + curr_level_ == 0) { + batch_iter_ = upper_key_; + } + } + // batch_iter_prev_ will become the start key for the next file + // lookup + batch_iter_prev_ = batch_iter_; + } + + MultiGetRange next_file_range(current_level_range_, batch_iter_prev_, + current_level_range_.end()); + size_t curr_file_index = + (batch_iter_ != current_level_range_.end()) + ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level + : curr_file_level_->num_files; + FdWithKeyRange* f; + bool is_last_key_in_file; + if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f, + &is_last_key_in_file)) { + hit_file_ = nullptr; + return nullptr; + } else { + if (is_last_key_in_file) { + // Since cmp_largest is 0, batch_iter_ still points to the last key + // that falls in this file, instead of the next one. 
Increment
+        // the file index for all keys between batch_iter_ and upper_key_
+        auto tmp_iter = batch_iter_;
+        while (tmp_iter != upper_key_) {
+          ++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level);
+          ++tmp_iter;
+        }
+        maybe_repeat_key_ = true;
+      }
+      // Set the range for this file
+      current_file_range_ =
+          MultiGetRange(next_file_range, batch_iter_prev_, upper_key_);
+      returned_file_level_ = curr_level_;
+      hit_file_level_ = curr_level_;
+      is_hit_file_last_in_level_ =
+          curr_file_index == curr_file_level_->num_files - 1;
+      hit_file_ = f;
+      return f;
+    }
+  }
+
+  // getter for current file level
+  // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
+  unsigned int GetHitFileLevel() { return hit_file_level_; }
+
+  FdWithKeyRange* GetHitFile() { return hit_file_; }
+
+  // Returns true if the most recent "hit file" (i.e., one returned by
+  // GetNextFile()) is at the last index in its level.
+  bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
+
+  bool KeyMaySpanNextFile() { return maybe_repeat_key_; }
+
+  bool IsSearchEnded() { return search_ended_; }
+
+  const MultiGetRange& CurrentFileRange() { return current_file_range_; }
+
+  bool RemainingOverlapInLevel() {
+    return !current_level_range_.Suffix(current_file_range_).empty();
+  }
+
+  MultiGetRange& GetRange() { return range_; }
+
+  void ReplaceRange(const MultiGetRange& other) {
+    assert(hit_file_ == nullptr);
+    range_ = other;
+    current_level_range_ = other;
+  }
+
+  FilePickerMultiGet(FilePickerMultiGet&& other)
+      : num_levels_(other.num_levels_),
+        curr_level_(other.curr_level_),
+        returned_file_level_(other.returned_file_level_),
+        hit_file_level_(other.hit_file_level_),
+        fp_ctx_array_(std::move(other.fp_ctx_array_)),
+        range_(std::move(other.range_)),
+        maybe_repeat_key_(other.maybe_repeat_key_),
+        current_level_range_(std::move(other.current_level_range_)),
+        current_file_range_(std::move(other.current_file_range_)),
+        batch_iter_(other.batch_iter_, &current_level_range_),
+        batch_iter_prev_(other.batch_iter_prev_, &current_level_range_),
+        upper_key_(other.upper_key_, &current_level_range_),
+        level_files_brief_(other.level_files_brief_),
+        search_ended_(other.search_ended_),
+        is_hit_file_last_in_level_(other.is_hit_file_last_in_level_),
+        curr_file_level_(other.curr_file_level_),
+        file_indexer_(other.file_indexer_),
+        user_comparator_(other.user_comparator_),
+        internal_comparator_(other.internal_comparator_),
+        hit_file_(other.hit_file_) {}
+
+ private:
+  unsigned int num_levels_;
+  unsigned int curr_level_;
+  unsigned int returned_file_level_;
+  unsigned int hit_file_level_;
+
+  struct FilePickerContext {
+    int32_t search_left_bound;
+    int32_t search_right_bound;
+    unsigned int curr_index_in_curr_level;
+    unsigned int start_index_in_curr_level;
+
+    FilePickerContext(int32_t left, int32_t right)
+        : search_left_bound(left),
+          search_right_bound(right),
+          curr_index_in_curr_level(0),
+          start_index_in_curr_level(0) {}
+
+    FilePickerContext() = default;
+  };
+  std::array<FilePickerContext, MultiGetContext::MAX_BATCH_SIZE> fp_ctx_array_;
+  MultiGetRange range_;
+  bool maybe_repeat_key_;
+  MultiGetRange current_level_range_;
+  MultiGetRange current_file_range_;
+  // Iterator to iterate through the keys in a MultiGet batch, that gets reset
+  // at the beginning of each level. 
Each call to GetNextFile() will position + // batch_iter_ at or right after the last key that was found in the returned + // SST file + MultiGetRange::Iterator batch_iter_; + // An iterator that records the previous position of batch_iter_, i.e last + // key found in the previous SST file, in order to serve as the start of + // the batch key range for the next SST file + MultiGetRange::Iterator batch_iter_prev_; + MultiGetRange::Iterator upper_key_; + autovector* level_files_brief_; + bool search_ended_; + bool is_hit_file_last_in_level_; + LevelFilesBrief* curr_file_level_; + FileIndexer* file_indexer_; + const Comparator* user_comparator_; + const InternalKeyComparator* internal_comparator_; + FdWithKeyRange* hit_file_; + + // Iterates through files in the current level until it finds a file that + // contains at least one key from the MultiGet batch + bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range, + size_t* file_index, FdWithKeyRange** fd, + bool* is_last_key_in_file) { + size_t curr_file_index = *file_index; + FdWithKeyRange* f = nullptr; + bool file_hit = false; + int cmp_largest = -1; + if (curr_file_index >= curr_file_level_->num_files) { + // In the unlikely case the next key is a duplicate of the current key, + // and the current key is the last in the level and the internal key + // was not found, we need to skip lookup for the remaining keys and + // reset the search bounds + if (batch_iter_ != current_level_range_.end()) { + ++batch_iter_; + for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) { + struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; + fp_ctx.search_left_bound = 0; + fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex; + } + } + return false; + } + // Loops over keys in the MultiGet batch until it finds a file with + // atleast one of the keys. Then it keeps moving forward until the + // last key in the batch that falls in that file + while (batch_iter_ != current_level_range_.end() && + (fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level == + curr_file_index || + !file_hit)) { + struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; + f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level]; + Slice& user_key = batch_iter_->ukey_without_ts; + + // Do key range filtering of files or/and fractional cascading if: + // (1) not all the files are in level 0, or + // (2) there are more than 3 current level files + // If there are only 3 or less current level files in the system, we + // skip the key range filtering. In this case, more likely, the system + // is highly tuned to minimize number of tables queried by each query, + // so it is unlikely that key range filtering is more efficient than + // querying the files. + if (num_levels_ > 1 || curr_file_level_->num_files > 3) { + // Check if key is within a file's range. If search left bound and + // right bound point to the same find, we are sure key falls in + // range. 
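+      // Worked example (hypothetical keys, not from the code): for a file
+      // covering user keys [b, f], key "a" yields cmp_smallest < 0 (before
+      // the file), key "c" yields cmp_smallest >= 0 with cmp_largest <= 0
+      // (inside the range), and key "g" yields cmp_largest > 0 (past the
+      // file). Only the middle case can possibly be served by this file.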
+ int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + user_key, false, ExtractUserKey(f->smallest_key), true); + + assert(curr_level_ == 0 || + fp_ctx.curr_index_in_curr_level == + fp_ctx.start_index_in_curr_level || + cmp_smallest <= 0); + + if (cmp_smallest >= 0) { + cmp_largest = user_comparator_->CompareWithoutTimestamp( + user_key, false, ExtractUserKey(f->largest_key), true); + } else { + cmp_largest = -1; + } + + // Setup file search bound for the next level based on the + // comparison results + if (curr_level_ > 0) { + file_indexer_->GetNextLevelIndex( + curr_level_, fp_ctx.curr_index_in_curr_level, cmp_smallest, + cmp_largest, &fp_ctx.search_left_bound, + &fp_ctx.search_right_bound); + } + // Key falls out of current file's range + if (cmp_smallest < 0 || cmp_largest > 0) { + next_file_range->SkipKey(batch_iter_); + } else { + file_hit = true; + } + } else { + file_hit = true; + } + if (cmp_largest == 0) { + // cmp_largest is 0, which means the next key will not be in this + // file, so stop looking further. However, its possible there are + // duplicates in the batch, so find the upper bound for the batch + // in this file (upper_key_) by skipping past the duplicates. We + // leave batch_iter_ as is since we may have to pick up from there + // for the next file, if this file has a merge value rather than + // final value + upper_key_ = batch_iter_; + ++upper_key_; + while (upper_key_ != current_level_range_.end() && + user_comparator_->CompareWithoutTimestamp( + batch_iter_->ukey_without_ts, false, + upper_key_->ukey_without_ts, false) == 0) { + ++upper_key_; + } + break; + } else { + if (curr_level_ == 0) { + // We need to look through all files in level 0 + ++fp_ctx.curr_index_in_curr_level; + } + ++batch_iter_; + } + if (!file_hit) { + curr_file_index = + (batch_iter_ != current_level_range_.end()) + ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level + : curr_file_level_->num_files; + } + } + + *fd = f; + *file_index = curr_file_index; + *is_last_key_in_file = cmp_largest == 0; + if (!*is_last_key_in_file) { + // If the largest key in the batch overlapping the file is not the + // largest key in the file, upper_ley_ would not have been updated so + // update it here + upper_key_ = batch_iter_; + } + return file_hit; + } + + // Setup local variables to search next level. + // Returns false if there are no more levels to search. + bool PrepareNextLevel() { + if (curr_level_ == 0) { + MultiGetRange::Iterator mget_iter = current_level_range_.begin(); + if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level < + curr_file_level_->num_files) { + batch_iter_prev_ = current_level_range_.begin(); + upper_key_ = batch_iter_ = current_level_range_.begin(); + return true; + } + } + + curr_level_++; + // Reset key range to saved value + while (curr_level_ < num_levels_) { + bool level_contains_keys = false; + curr_file_level_ = &(*level_files_brief_)[curr_level_]; + if (curr_file_level_->num_files == 0) { + // When current level is empty, the search bound generated from upper + // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is + // also empty. 
+ + for (auto mget_iter = current_level_range_.begin(); + mget_iter != current_level_range_.end(); ++mget_iter) { + struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()]; + + assert(fp_ctx.search_left_bound == 0); + assert(fp_ctx.search_right_bound == -1 || + fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex); + // Since current level is empty, it will need to search all files in + // the next level + fp_ctx.search_left_bound = 0; + fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex; + } + // Skip all subsequent empty levels + do { + ++curr_level_; + } while ((curr_level_ < num_levels_) && + (*level_files_brief_)[curr_level_].num_files == 0); + continue; + } + + // Some files may overlap each other. We find + // all files that overlap user_key and process them in order from + // newest to oldest. In the context of merge-operator, this can occur at + // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes + // are always compacted into a single entry). + int32_t start_index = -1; + current_level_range_ = + MultiGetRange(range_, range_.begin(), range_.end()); + for (auto mget_iter = current_level_range_.begin(); + mget_iter != current_level_range_.end(); ++mget_iter) { + struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()]; + if (curr_level_ == 0) { + // On Level-0, we read through all files to check for overlap. + start_index = 0; + level_contains_keys = true; + } else { + // On Level-n (n>=1), files are sorted. Binary search to find the + // earliest file whose largest key >= ikey. Search left bound and + // right bound are used to narrow the range. + if (fp_ctx.search_left_bound <= fp_ctx.search_right_bound) { + if (fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex) { + fp_ctx.search_right_bound = + static_cast(curr_file_level_->num_files) - 1; + } + // `search_right_bound_` is an inclusive upper-bound, but since it + // was determined based on user key, it is still possible the lookup + // key falls to the right of `search_right_bound_`'s corresponding + // file. So, pass a limit one higher, which allows us to detect this + // case. + Slice& ikey = mget_iter->ikey; + start_index = FindFileInRange( + *internal_comparator_, *curr_file_level_, ikey, + static_cast(fp_ctx.search_left_bound), + static_cast(fp_ctx.search_right_bound) + 1); + if (start_index == fp_ctx.search_right_bound + 1) { + // `ikey_` comes after `search_right_bound_`. The lookup key does + // not exist on this level, so let's skip this level and do a full + // binary search on the next level. + fp_ctx.search_left_bound = 0; + fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex; + current_level_range_.SkipKey(mget_iter); + continue; + } else { + level_contains_keys = true; + } + } else { + // search_left_bound > search_right_bound, key does not exist in + // this level. Since no comparison is done in this level, it will + // need to search all files in the next level. + fp_ctx.search_left_bound = 0; + fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex; + current_level_range_.SkipKey(mget_iter); + continue; + } + } + fp_ctx.start_index_in_curr_level = start_index; + fp_ctx.curr_index_in_curr_level = start_index; + } + if (level_contains_keys) { + batch_iter_prev_ = current_level_range_.begin(); + upper_key_ = batch_iter_ = current_level_range_.begin(); + return true; + } + curr_level_++; + } + // curr_level_ = num_levels_. So, no more levels to search. 
+ return false; + } +}; + +VersionStorageInfo::~VersionStorageInfo() { delete[] files_; } + +Version::~Version() { + assert(refs_ == 0); + + // Remove from linked list + prev_->next_ = next_; + next_->prev_ = prev_; + + // Drop references to files + for (int level = 0; level < storage_info_.num_levels_; level++) { + for (size_t i = 0; i < storage_info_.files_[level].size(); i++) { + FileMetaData* f = storage_info_.files_[level][i]; + assert(f->refs > 0); + f->refs--; + if (f->refs <= 0) { + assert(cfd_ != nullptr); + uint32_t path_id = f->fd.GetPathId(); + assert(path_id < cfd_->ioptions()->cf_paths.size()); + vset_->obsolete_files_.push_back( + ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path, + cfd_->GetFileMetadataCacheReservationManager())); + } + } + } +} + +int FindFile(const InternalKeyComparator& icmp, + const LevelFilesBrief& file_level, const Slice& key) { + return FindFileInRange(icmp, file_level, key, 0, + static_cast(file_level.num_files)); +} + +void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, + const std::vector& files, + Arena* arena) { + assert(file_level); + assert(arena); + + size_t num = files.size(); + file_level->num_files = num; + char* mem = arena->AllocateAligned(num * sizeof(FdWithKeyRange)); + file_level->files = new (mem) FdWithKeyRange[num]; + + for (size_t i = 0; i < num; i++) { + Slice smallest_key = files[i]->smallest.Encode(); + Slice largest_key = files[i]->largest.Encode(); + + // Copy key slice to sequential memory + size_t smallest_size = smallest_key.size(); + size_t largest_size = largest_key.size(); + mem = arena->AllocateAligned(smallest_size + largest_size); + memcpy(mem, smallest_key.data(), smallest_size); + memcpy(mem + smallest_size, largest_key.data(), largest_size); + + FdWithKeyRange& f = file_level->files[i]; + f.fd = files[i]->fd; + f.file_metadata = files[i]; + f.smallest_key = Slice(mem, smallest_size); + f.largest_key = Slice(mem + smallest_size, largest_size); + } +} + +static bool AfterFile(const Comparator* ucmp, const Slice* user_key, + const FdWithKeyRange* f) { + // nullptr user_key occurs before all keys and is therefore never after *f + return (user_key != nullptr && + ucmp->CompareWithoutTimestamp(*user_key, + ExtractUserKey(f->largest_key)) > 0); +} + +static bool BeforeFile(const Comparator* ucmp, const Slice* user_key, + const FdWithKeyRange* f) { + // nullptr user_key occurs after all keys and is therefore never before *f + return (user_key != nullptr && + ucmp->CompareWithoutTimestamp(*user_key, + ExtractUserKey(f->smallest_key)) < 0); +} + +bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, + bool disjoint_sorted_files, + const LevelFilesBrief& file_level, + const Slice* smallest_user_key, + const Slice* largest_user_key) { + const Comparator* ucmp = icmp.user_comparator(); + if (!disjoint_sorted_files) { + // Need to check against all files + for (size_t i = 0; i < file_level.num_files; i++) { + const FdWithKeyRange* f = &(file_level.files[i]); + if (AfterFile(ucmp, smallest_user_key, f) || + BeforeFile(ucmp, largest_user_key, f)) { + // No overlap + } else { + return true; // Overlap + } + } + return false; + } + + // Binary search over file list + uint32_t index = 0; + if (smallest_user_key != nullptr) { + // Find the leftmost possible internal key for smallest_user_key + InternalKey small; + small.SetMinPossibleForUserKey(*smallest_user_key); + index = FindFile(icmp, file_level, small.Encode()); + } + + if (index >= file_level.num_files) { + // beginning of range is after all files, 
so no overlap.
+    return false;
+  }
+
+  return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]);
+}
+
+namespace {
+
+class LevelIterator final : public InternalIterator {
+ public:
+  // @param read_options Must outlive this iterator.
+  LevelIterator(
+      TableCache* table_cache, const ReadOptions& read_options,
+      const FileOptions& file_options, const InternalKeyComparator& icomparator,
+      const LevelFilesBrief* flevel,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      bool should_sample, HistogramImpl* file_read_hist,
+      TableReaderCaller caller, bool skip_filters, int level,
+      uint8_t block_protection_bytes_per_key, RangeDelAggregator* range_del_agg,
+      const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries =
+          nullptr,
+      bool allow_unprepared_value = false,
+      TruncatedRangeDelIterator**** range_tombstone_iter_ptr_ = nullptr)
+      : table_cache_(table_cache),
+        read_options_(read_options),
+        file_options_(file_options),
+        icomparator_(icomparator),
+        user_comparator_(icomparator.user_comparator()),
+        flevel_(flevel),
+        prefix_extractor_(prefix_extractor),
+        file_read_hist_(file_read_hist),
+        should_sample_(should_sample),
+        caller_(caller),
+        skip_filters_(skip_filters),
+        allow_unprepared_value_(allow_unprepared_value),
+        file_index_(flevel_->num_files),
+        level_(level),
+        range_del_agg_(range_del_agg),
+        pinned_iters_mgr_(nullptr),
+        compaction_boundaries_(compaction_boundaries),
+        is_next_read_sequential_(false),
+        block_protection_bytes_per_key_(block_protection_bytes_per_key),
+        range_tombstone_iter_(nullptr),
+        to_return_sentinel_(false) {
+    // Empty level is not supported.
+    assert(flevel_ != nullptr && flevel_->num_files > 0);
+    if (range_tombstone_iter_ptr_) {
+      *range_tombstone_iter_ptr_ = &range_tombstone_iter_;
+    }
+  }
+
+  ~LevelIterator() override { delete file_iter_.Set(nullptr); }
+
+  // Seek to the first file with a key >= target.
+  // If range_tombstone_iter_ is not nullptr, then we pretend that file
+  // boundaries are fake keys (sentinel keys). These keys are used to keep range
+  // tombstones alive even when all point keys in an SST file are exhausted.
+  // These sentinel keys will be skipped in merging iterator.
+  void Seek(const Slice& target) override;
+  void SeekForPrev(const Slice& target) override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Next() final override;
+  bool NextAndGetResult(IterateResult* result) override;
+  void Prev() override;
+
+  // In addition to valid and invalid state (!file_iter.Valid() and
+  // status.ok()), a third state of the iterator is when !file_iter_.Valid() and
+  // to_return_sentinel_. This means we are at the end of a file, and a sentinel
+  // key (the file boundary that we pretend as a key) is to be returned next.
+  // file_iter_.Valid() and to_return_sentinel_ should not both be true.
+  bool Valid() const override {
+    assert(!(file_iter_.Valid() && to_return_sentinel_));
+    return file_iter_.Valid() || to_return_sentinel_;
+  }
+  Slice key() const override {
+    assert(Valid());
+    if (to_return_sentinel_) {
+      // Sentinel should be returned after file_iter_ reaches the end of the
+      // file
+      assert(!file_iter_.Valid());
+      return sentinel_;
+    }
+    return file_iter_.key();
+  }
+
+  Slice value() const override {
+    assert(Valid());
+    assert(!to_return_sentinel_);
+    return file_iter_.value();
+  }
+
+  Status status() const override {
+    return file_iter_.iter() ? 
file_iter_.status() : Status::OK(); + } + + bool PrepareValue() override { return file_iter_.PrepareValue(); } + + inline bool MayBeOutOfLowerBound() override { + assert(Valid()); + return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); + } + + inline IterBoundCheck UpperBoundCheckResult() override { + if (Valid()) { + return file_iter_.UpperBoundCheckResult(); + } else { + return IterBoundCheck::kUnknown; + } + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + if (file_iter_.iter()) { + file_iter_.SetPinnedItersMgr(pinned_iters_mgr); + } + } + + bool IsKeyPinned() const override { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + file_iter_.iter() && file_iter_.IsKeyPinned(); + } + + bool IsValuePinned() const override { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + file_iter_.iter() && file_iter_.IsValuePinned(); + } + + bool IsDeleteRangeSentinelKey() const override { return to_return_sentinel_; } + + private: + // Return true if at least one invalid file is seen and skipped. + bool SkipEmptyFileForward(); + void SkipEmptyFileBackward(); + void SetFileIterator(InternalIterator* iter); + void InitFileIterator(size_t new_file_index); + + const Slice& file_smallest_key(size_t file_index) { + assert(file_index < flevel_->num_files); + return flevel_->files[file_index].smallest_key; + } + + const Slice& file_largest_key(size_t file_index) { + assert(file_index < flevel_->num_files); + return flevel_->files[file_index].largest_key; + } + + bool KeyReachedUpperBound(const Slice& internal_key) { + return read_options_.iterate_upper_bound != nullptr && + user_comparator_.CompareWithoutTimestamp( + ExtractUserKey(internal_key), /*a_has_ts=*/true, + *read_options_.iterate_upper_bound, /*b_has_ts=*/false) >= 0; + } + + void ClearRangeTombstoneIter() { + if (range_tombstone_iter_ && *range_tombstone_iter_) { + delete *range_tombstone_iter_; + *range_tombstone_iter_ = nullptr; + } + } + + // Move file_iter_ to the file at file_index_. + // range_tombstone_iter_ is updated with a range tombstone iterator + // into the new file. Old range tombstone iterator is cleared. + InternalIterator* NewFileIterator() { + assert(file_index_ < flevel_->num_files); + auto file_meta = flevel_->files[file_index_]; + if (should_sample_) { + sample_file_read_inc(file_meta.file_metadata); + } + + const InternalKey* smallest_compaction_key = nullptr; + const InternalKey* largest_compaction_key = nullptr; + if (compaction_boundaries_ != nullptr) { + smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; + largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; + } + CheckMayBeOutOfLowerBound(); + ClearRangeTombstoneIter(); + return table_cache_->NewIterator( + read_options_, file_options_, icomparator_, *file_meta.file_metadata, + range_del_agg_, prefix_extractor_, + nullptr /* don't need reference to table */, file_read_hist_, caller_, + /*arena=*/nullptr, skip_filters_, level_, + /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key, + largest_compaction_key, allow_unprepared_value_, + block_protection_bytes_per_key_, range_tombstone_iter_); + } + + // Check if current file being fully within iterate_lower_bound. + // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update may_be_out_of_lower_bound_ accordingly. 
+  void CheckMayBeOutOfLowerBound() {
+    if (read_options_.iterate_lower_bound != nullptr &&
+        file_index_ < flevel_->num_files) {
+      may_be_out_of_lower_bound_ =
+          user_comparator_.CompareWithoutTimestamp(
+              ExtractUserKey(file_smallest_key(file_index_)), /*a_has_ts=*/true,
+              *read_options_.iterate_lower_bound, /*b_has_ts=*/false) < 0;
+    }
+  }
+
+  TableCache* table_cache_;
+  const ReadOptions& read_options_;
+  const FileOptions& file_options_;
+  const InternalKeyComparator& icomparator_;
+  const UserComparatorWrapper user_comparator_;
+  const LevelFilesBrief* flevel_;
+  mutable FileDescriptor current_value_;
+  // `prefix_extractor_` may be non-null even for total order seek. Checking
+  // this variable is not the right way to identify whether prefix iterator
+  // is used.
+  const std::shared_ptr<const SliceTransform>& prefix_extractor_;
+
+  HistogramImpl* file_read_hist_;
+  bool should_sample_;
+  TableReaderCaller caller_;
+  bool skip_filters_;
+  bool allow_unprepared_value_;
+  bool may_be_out_of_lower_bound_ = true;
+  size_t file_index_;
+  int level_;
+  RangeDelAggregator* range_del_agg_;
+  IteratorWrapper file_iter_;  // May be nullptr
+  PinnedIteratorsManager* pinned_iters_mgr_;
+
+  // To be propagated to RangeDelAggregator in order to safely truncate range
+  // tombstones.
+  const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
+
+  bool is_next_read_sequential_;
+
+  uint8_t block_protection_bytes_per_key_;
+
+  // This is set when this level iterator is used under a merging iterator
+  // that processes range tombstones. range_tombstone_iter_ points to where the
+  // merging iterator stores the range tombstones iterator for this level. When
+  // this level iterator moves to a new SST file, it updates the range
+  // tombstones accordingly through this pointer. So the merging iterator always
+  // has access to the current SST file's range tombstones.
+  //
+  // The level iterator treats file boundary as fake keys (sentinel keys) to
+  // keep range tombstones alive if needed and make upper level, i.e. merging
+  // iterator, aware of file changes (when level iterator moves to a new SST
+  // file, there is some bookkeeping work that needs to be done at merging
+  // iterator end).
+  //
+  // *range_tombstone_iter_ points to range tombstones of the current SST file
+  TruncatedRangeDelIterator** range_tombstone_iter_;
+
+  // Whether next/prev key is a sentinel key.
+  bool to_return_sentinel_ = false;
+  // The sentinel key to be returned
+  Slice sentinel_;
+  // Sets flags for if we should return the sentinel key next.
+  // The condition for returning sentinel is reaching the end of current
+  // file_iter_: !Valid() && status().ok().
+  void TrySetDeleteRangeSentinel(const Slice& boundary_key);
+  void ClearSentinel() { to_return_sentinel_ = false; }
+
+  // Set in Seek() when a prefix seek reaches end of the current file,
+  // and the next file has a different prefix. SkipEmptyFileForward()
+  // will not move to next file when this flag is set. 
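+  // Example (hypothetical keys, with a first-byte prefix extractor): after
+  // Seek("b3"), if the current file only holds "b1".."b2" and the next file
+  // starts at "c0", the prefix "b" is exhausted; setting this flag avoids
+  // opening the "c" file just to invalidate the iterator immediately.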
+ bool prefix_exhausted_ = false; +}; + +void LevelIterator::TrySetDeleteRangeSentinel(const Slice& boundary_key) { + assert(range_tombstone_iter_); + if (file_iter_.iter() != nullptr && !file_iter_.Valid() && + file_iter_.status().ok()) { + to_return_sentinel_ = true; + sentinel_ = boundary_key; + } +} + +void LevelIterator::Seek(const Slice& target) { + prefix_exhausted_ = false; + ClearSentinel(); + // Check whether the seek key fall under the same file + bool need_to_reseek = true; + if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) { + const FdWithKeyRange& cur_file = flevel_->files[file_index_]; + if (icomparator_.InternalKeyComparator::Compare( + target, cur_file.largest_key) <= 0 && + icomparator_.InternalKeyComparator::Compare( + target, cur_file.smallest_key) >= 0) { + need_to_reseek = false; + assert(static_cast(FindFile(icomparator_, *flevel_, target)) == + file_index_); + } + } + if (need_to_reseek) { + TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile"); + size_t new_file_index = FindFile(icomparator_, *flevel_, target); + InitFileIterator(new_file_index); + } + + if (file_iter_.iter() != nullptr) { + file_iter_.Seek(target); + // Status::TryAgain indicates asynchronous request for retrieval of data + // blocks has been submitted. So it should return at this point and Seek + // should be called again to retrieve the requested block and execute the + // remaining code. + if (file_iter_.status() == Status::TryAgain()) { + return; + } + if (!file_iter_.Valid() && file_iter_.status().ok() && + prefix_extractor_ != nullptr && !read_options_.total_order_seek && + !read_options_.auto_prefix_mode && + file_index_ < flevel_->num_files - 1) { + size_t ts_sz = user_comparator_.user_comparator()->timestamp_size(); + Slice target_user_key_without_ts = + ExtractUserKeyAndStripTimestamp(target, ts_sz); + Slice next_file_first_user_key_without_ts = + ExtractUserKeyAndStripTimestamp(file_smallest_key(file_index_ + 1), + ts_sz); + if (prefix_extractor_->InDomain(target_user_key_without_ts) && + (!prefix_extractor_->InDomain(next_file_first_user_key_without_ts) || + user_comparator_.CompareWithoutTimestamp( + prefix_extractor_->Transform(target_user_key_without_ts), false, + prefix_extractor_->Transform( + next_file_first_user_key_without_ts), + false) != 0)) { + // SkipEmptyFileForward() will not advance to next file when this flag + // is set for reason detailed below. + // + // The file we initially positioned to has no keys under the target + // prefix, and the next file's smallest key has a different prefix than + // target. When doing prefix iterator seek, when keys for one prefix + // have been exhausted, it can jump to any key that is larger. Here we + // are enforcing a stricter contract than that, in order to make it + // easier for higher layers (merging and DB iterator) to reason the + // correctness: + // 1. Within the prefix, the result should be accurate. + // 2. If keys for the prefix is exhausted, it is either positioned to + // the next key after the prefix, or make the iterator invalid. + // A side benefit will be that it invalidates the iterator earlier so + // that the upper level merging iterator can merge fewer child + // iterators. + // + // The flag is cleared in Seek*() calls. There is no need to clear the + // flag in Prev() since Prev() will not be called when the flag is set + // for reasons explained below. If range_tombstone_iter_ is nullptr, + // then there is no file boundary sentinel key. 
Since + // !file_iter_.Valid() from the if condition above, this level iterator + // is !Valid(), so Prev() will not be called. If range_tombstone_iter_ + // is not nullptr, there are two cases depending on if this level + // iterator reaches top of the heap in merging iterator (the upper + // layer). + // If so, merging iterator will see the sentinel key, call + // NextAndGetResult() and the call to NextAndGetResult() will skip the + // sentinel key and makes this level iterator invalid. If not, then it + // could be because the upper layer is done before any method of this + // level iterator is called or another Seek*() call is invoked. Either + // way, Prev() is never called before Seek*(). + // The flag should not be cleared at the beginning of + // Next/NextAndGetResult() since it is used in SkipEmptyFileForward() + // called in Next/NextAndGetResult(). + prefix_exhausted_ = true; + } + } + + if (range_tombstone_iter_) { + TrySetDeleteRangeSentinel(file_largest_key(file_index_)); + } + } + SkipEmptyFileForward(); + CheckMayBeOutOfLowerBound(); +} + +void LevelIterator::SeekForPrev(const Slice& target) { + prefix_exhausted_ = false; + ClearSentinel(); + size_t new_file_index = FindFile(icomparator_, *flevel_, target); + // Seek beyond this level's smallest key + if (new_file_index == 0 && + icomparator_.Compare(target, file_smallest_key(0)) < 0) { + SetFileIterator(nullptr); + ClearRangeTombstoneIter(); + CheckMayBeOutOfLowerBound(); + return; + } + if (new_file_index >= flevel_->num_files) { + new_file_index = flevel_->num_files - 1; + } + + InitFileIterator(new_file_index); + if (file_iter_.iter() != nullptr) { + file_iter_.SeekForPrev(target); + if (range_tombstone_iter_ && + icomparator_.Compare(target, file_smallest_key(file_index_)) >= 0) { + // In SeekForPrev() case, it is possible that the target is less than + // file's lower boundary since largest key is used to determine file index + // (FindFile()). When target is less than file's lower boundary, sentinel + // key should not be set so that SeekForPrev() does not result in a key + // larger than target. This is correct in that there is no need to keep + // the range tombstones in this file alive as they only cover keys + // starting from the file's lower boundary, which is after `target`. + TrySetDeleteRangeSentinel(file_smallest_key(file_index_)); + } + SkipEmptyFileBackward(); + } + CheckMayBeOutOfLowerBound(); +} + +void LevelIterator::SeekToFirst() { + prefix_exhausted_ = false; + ClearSentinel(); + InitFileIterator(0); + if (file_iter_.iter() != nullptr) { + file_iter_.SeekToFirst(); + if (range_tombstone_iter_) { + // We do this in SeekToFirst() and SeekToLast() since + // we could have an empty file with only range tombstones. 
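+      // E.g. (hypothetical) a file created solely by DeleteRange() has no
+      // point keys, so file_iter_ is immediately !Valid() here; the sentinel
+      // keeps the file's tombstones visible to the merging iterator rather
+      // than letting SkipEmptyFileForward() drop the file entirely.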
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_)); + } + } + SkipEmptyFileForward(); + CheckMayBeOutOfLowerBound(); +} + +void LevelIterator::SeekToLast() { + prefix_exhausted_ = false; + ClearSentinel(); + InitFileIterator(flevel_->num_files - 1); + if (file_iter_.iter() != nullptr) { + file_iter_.SeekToLast(); + if (range_tombstone_iter_) { + TrySetDeleteRangeSentinel(file_smallest_key(file_index_)); + } + } + SkipEmptyFileBackward(); + CheckMayBeOutOfLowerBound(); +} + +void LevelIterator::Next() { + assert(Valid()); + if (to_return_sentinel_) { + // file_iter_ is at EOF already when to_return_sentinel_ + ClearSentinel(); + } else { + file_iter_.Next(); + if (range_tombstone_iter_) { + TrySetDeleteRangeSentinel(file_largest_key(file_index_)); + } + } + SkipEmptyFileForward(); +} + +bool LevelIterator::NextAndGetResult(IterateResult* result) { + assert(Valid()); + // file_iter_ is at EOF already when to_return_sentinel_ + bool is_valid = !to_return_sentinel_ && file_iter_.NextAndGetResult(result); + if (!is_valid) { + if (to_return_sentinel_) { + ClearSentinel(); + } else if (range_tombstone_iter_) { + TrySetDeleteRangeSentinel(file_largest_key(file_index_)); + } + is_next_read_sequential_ = true; + SkipEmptyFileForward(); + is_next_read_sequential_ = false; + is_valid = Valid(); + if (is_valid) { + // This could be set in TrySetDeleteRangeSentinel() or + // SkipEmptyFileForward() above. + if (to_return_sentinel_) { + result->key = sentinel_; + result->bound_check_result = IterBoundCheck::kUnknown; + result->value_prepared = true; + } else { + result->key = key(); + result->bound_check_result = file_iter_.UpperBoundCheckResult(); + // Ideally, we should return the real file_iter_.value_prepared but the + // information is not here. It would casue an extra PrepareValue() + // for the first key of a file. + result->value_prepared = !allow_unprepared_value_; + } + } + } + return is_valid; +} + +void LevelIterator::Prev() { + assert(Valid()); + if (to_return_sentinel_) { + ClearSentinel(); + } else { + file_iter_.Prev(); + if (range_tombstone_iter_) { + TrySetDeleteRangeSentinel(file_smallest_key(file_index_)); + } + } + SkipEmptyFileBackward(); +} + +bool LevelIterator::SkipEmptyFileForward() { + bool seen_empty_file = false; + // Pause at sentinel key + while (!to_return_sentinel_ && + (file_iter_.iter() == nullptr || + (!file_iter_.Valid() && file_iter_.status().ok() && + file_iter_.iter()->UpperBoundCheckResult() != + IterBoundCheck::kOutOfBound))) { + seen_empty_file = true; + // Move to next file + if (file_index_ >= flevel_->num_files - 1 || + KeyReachedUpperBound(file_smallest_key(file_index_ + 1)) || + prefix_exhausted_) { + SetFileIterator(nullptr); + ClearRangeTombstoneIter(); + break; + } + // may init a new *range_tombstone_iter + InitFileIterator(file_index_ + 1); + // We moved to a new SST file + // Seek range_tombstone_iter_ to reset its !Valid() default state. + // We do not need to call range_tombstone_iter_.Seek* in + // LevelIterator::Seek* since when the merging iterator calls + // LevelIterator::Seek*, it should also call Seek* into the corresponding + // range tombstone iterator. 
+ if (file_iter_.iter() != nullptr) { + file_iter_.SeekToFirst(); + if (range_tombstone_iter_) { + if (*range_tombstone_iter_) { + (*range_tombstone_iter_)->SeekToFirst(); + } + TrySetDeleteRangeSentinel(file_largest_key(file_index_)); + } + } + } + return seen_empty_file; +} + +void LevelIterator::SkipEmptyFileBackward() { + // Pause at sentinel key + while (!to_return_sentinel_ && + (file_iter_.iter() == nullptr || + (!file_iter_.Valid() && file_iter_.status().ok()))) { + // Move to previous file + if (file_index_ == 0) { + // Already the first file + SetFileIterator(nullptr); + ClearRangeTombstoneIter(); + return; + } + InitFileIterator(file_index_ - 1); + // We moved to a new SST file + // Seek range_tombstone_iter_ to reset its !Valid() default state. + if (file_iter_.iter() != nullptr) { + file_iter_.SeekToLast(); + if (range_tombstone_iter_) { + if (*range_tombstone_iter_) { + (*range_tombstone_iter_)->SeekToLast(); + } + TrySetDeleteRangeSentinel(file_smallest_key(file_index_)); + if (to_return_sentinel_) { + break; + } + } + } + } +} + +void LevelIterator::SetFileIterator(InternalIterator* iter) { + if (pinned_iters_mgr_ && iter) { + iter->SetPinnedItersMgr(pinned_iters_mgr_); + } + + InternalIterator* old_iter = file_iter_.Set(iter); + + // Update the read pattern for PrefetchBuffer. + if (is_next_read_sequential_) { + file_iter_.UpdateReadaheadState(old_iter); + } + + if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { + pinned_iters_mgr_->PinIterator(old_iter); + } else { + delete old_iter; + } +} + +void LevelIterator::InitFileIterator(size_t new_file_index) { + if (new_file_index >= flevel_->num_files) { + file_index_ = new_file_index; + SetFileIterator(nullptr); + ClearRangeTombstoneIter(); + return; + } else { + // If the file iterator shows incomplete, we try it again if users seek + // to the same file, as this time we may go to a different data block + // which is cached in block cache. + // + if (file_iter_.iter() != nullptr && !file_iter_.status().IsIncomplete() && + new_file_index == file_index_) { + // file_iter_ is already constructed with this iterator, so + // no need to change anything + } else { + file_index_ = new_file_index; + InternalIterator* iter = NewFileIterator(); + SetFileIterator(iter); + } + } +} +} // anonymous namespace + +Status Version::GetTableProperties(const ReadOptions& read_options, + std::shared_ptr* tp, + const FileMetaData* file_meta, + const std::string* fname) const { + auto table_cache = cfd_->table_cache(); + auto ioptions = cfd_->ioptions(); + Status s = table_cache->GetTableProperties( + file_options_, read_options, cfd_->internal_comparator(), *file_meta, tp, + mutable_cf_options_.block_protection_bytes_per_key, + mutable_cf_options_.prefix_extractor, true /* no io */); + if (s.ok()) { + return s; + } + + // We only ignore error type `Incomplete` since it's by design that we + // disallow table when it's not in table cache. + if (!s.IsIncomplete()) { + return s; + } + + // 2. Table is not present in table cache, we'll read the table properties + // directly from the properties block in the file. 
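+  // (Illustrative usage, not upstream code; `v` and `meta` are assumed to be
+  // a live Version and one of its FileMetaData entries:
+  //   std::shared_ptr<const TableProperties> tp;
+  //   Status s = v->GetTableProperties(read_options, &tp, meta);
+  // A caller only pays for the direct file read below on a table-cache miss.)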
+  std::unique_ptr<FSRandomAccessFile> file;
+  std::string file_name;
+  if (fname != nullptr) {
+    file_name = *fname;
+  } else {
+    file_name = TableFileName(ioptions->cf_paths, file_meta->fd.GetNumber(),
+                              file_meta->fd.GetPathId());
+  }
+  s = ioptions->fs->NewRandomAccessFile(file_name, file_options_, &file,
+                                        nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // By setting the magic number to kNullTableMagicNumber, we can bypass
+  // the magic number check in the footer.
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(
+          std::move(file), file_name, ioptions->clock /* clock */, io_tracer_,
+          ioptions->stats /* stats */,
+          Histograms::SST_READ_MICROS /* hist_type */,
+          nullptr /* file_read_hist */, nullptr /* rate_limiter */,
+          ioptions->listeners));
+  std::unique_ptr<TableProperties> props;
+  s = ReadTableProperties(
+      file_reader.get(), file_meta->fd.GetFileSize(),
+      Footer::kNullTableMagicNumber /* table's magic number */, *ioptions,
+      read_options, &props);
+  if (!s.ok()) {
+    return s;
+  }
+  *tp = std::move(props);
+  RecordTick(ioptions->stats, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
+  return s;
+}
+
+Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options,
+                                         TablePropertiesCollection* props) {
+  Status s;
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    s = GetPropertiesOfAllTables(read_options, props, level);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
+                                            std::string* out_str) {
+  if (max_entries_to_print <= 0) {
+    return Status::OK();
+  }
+  int num_entries_left = max_entries_to_print;
+
+  std::stringstream ss;
+
+  // TODO: plumb Env::IOActivity
+  const ReadOptions read_options;
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    for (const auto& file_meta : storage_info_.files_[level]) {
+      auto fname =
+          TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+                        file_meta->fd.GetPathId());
+
+      ss << "=== file : " << fname << " ===\n";
+
+      TableCache* table_cache = cfd_->table_cache();
+      std::unique_ptr<FragmentedRangeTombstoneIterator> tombstone_iter;
+
+      Status s = table_cache->GetRangeTombstoneIterator(
+          read_options, cfd_->internal_comparator(), *file_meta,
+          cfd_->GetLatestMutableCFOptions()->block_protection_bytes_per_key,
+          &tombstone_iter);
+      if (!s.ok()) {
+        return s;
+      }
+      if (tombstone_iter) {
+        tombstone_iter->SeekToFirst();
+
+        // TODO: print timestamp
+        while (tombstone_iter->Valid() && num_entries_left > 0) {
+          ss << "start: " << tombstone_iter->start_key().ToString(true)
+             << " end: " << tombstone_iter->end_key().ToString(true)
+             << " seq: " << tombstone_iter->seq() << '\n';
+          tombstone_iter->Next();
+          num_entries_left--;
+        }
+        if (num_entries_left <= 0) {
+          break;
+        }
+      }
+    }
+    if (num_entries_left <= 0) {
+      break;
+    }
+  }
+  assert(num_entries_left >= 0);
+  if (num_entries_left <= 0) {
+    ss << "(results may not be complete)\n";
+  }
+
+  *out_str = ss.str();
+  return Status::OK();
+}
+
+Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options,
+                                         TablePropertiesCollection* props,
+                                         int level) {
+  for (const auto& file_meta : storage_info_.files_[level]) {
+    auto fname =
+        TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+                      file_meta->fd.GetPathId());
+    // 1. If the table is already present in table cache, load table
+    // properties from there. 
+    std::shared_ptr<const TableProperties> table_properties;
+    Status s =
+        GetTableProperties(read_options, &table_properties, file_meta, &fname);
+    if (s.ok()) {
+      props->insert({fname, table_properties});
+    } else {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Version::GetPropertiesOfTablesInRange(
+    const ReadOptions& read_options, const Range* range, std::size_t n,
+    TablePropertiesCollection* props) const {
+  for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+    for (decltype(n) i = 0; i < n; i++) {
+      // Convert user_key into a corresponding internal key.
+      InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+      InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+      std::vector<FileMetaData*> files;
+      storage_info_.GetOverlappingInputs(level, &k1, &k2, &files, -1, nullptr,
+                                         false);
+      for (const auto& file_meta : files) {
+        auto fname =
+            TableFileName(cfd_->ioptions()->cf_paths,
+                          file_meta->fd.GetNumber(), file_meta->fd.GetPathId());
+        if (props->count(fname) == 0) {
+          // 1. If the table is already present in table cache, load table
+          // properties from there.
+          std::shared_ptr<const TableProperties> table_properties;
+          Status s = GetTableProperties(read_options, &table_properties,
+                                        file_meta, &fname);
+          if (s.ok()) {
+            props->insert({fname, table_properties});
+          } else {
+            return s;
+          }
+        }
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Version::GetAggregatedTableProperties(
+    const ReadOptions& read_options,
+    std::shared_ptr<const TableProperties>* tp, int level) {
+  TablePropertiesCollection props;
+  Status s;
+  if (level < 0) {
+    s = GetPropertiesOfAllTables(read_options, &props);
+  } else {
+    s = GetPropertiesOfAllTables(read_options, &props, level);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  auto* new_tp = new TableProperties();
+  for (const auto& item : props) {
+    new_tp->Add(*item.second);
+  }
+  tp->reset(new_tp);
+  return Status::OK();
+}
+
+size_t Version::GetMemoryUsageByTableReaders(const ReadOptions& read_options) {
+  size_t total_usage = 0;
+  for (auto& file_level : storage_info_.level_files_brief_) {
+    for (size_t i = 0; i < file_level.num_files; i++) {
+      total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
+          file_options_, read_options, cfd_->internal_comparator(),
+          *file_level.files[i].file_metadata,
+          mutable_cf_options_.block_protection_bytes_per_key,
+          mutable_cf_options_.prefix_extractor);
+    }
+  }
+  return total_usage;
+}
+
+void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
+  assert(cf_meta);
+  assert(cfd_);
+
+  cf_meta->name = cfd_->GetName();
+  cf_meta->size = 0;
+  cf_meta->file_count = 0;
+  cf_meta->levels.clear();
+
+  cf_meta->blob_file_size = 0;
+  cf_meta->blob_file_count = 0;
+  cf_meta->blob_files.clear();
+
+  auto* ioptions = cfd_->ioptions();
+  auto* vstorage = storage_info();
+
+  for (int level = 0; level < cfd_->NumberLevels(); level++) {
+    uint64_t level_size = 0;
+    cf_meta->file_count += vstorage->LevelFiles(level).size();
+    std::vector<SstFileMetaData> files;
+    for (const auto& file : vstorage->LevelFiles(level)) {
+      uint32_t path_id = file->fd.GetPathId();
+      std::string file_path;
+      if (path_id < ioptions->cf_paths.size()) {
+        file_path = ioptions->cf_paths[path_id].path;
+      } else {
+        assert(!ioptions->cf_paths.empty());
+        file_path = ioptions->cf_paths.back().path;
+      }
+      const uint64_t file_number = file->fd.GetNumber();
+      files.emplace_back(
+          MakeTableFileName("", file_number), file_number, file_path,
+          file->fd.GetFileSize(), file->fd.smallest_seqno,
+          file->fd.largest_seqno, file->smallest.user_key().ToString(),
+          file->largest.user_key().ToString(),
+          file->stats.num_reads_sampled.load(std::memory_order_relaxed),
+          file->being_compacted, file->temperature,
+          file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
+          file->TryGetFileCreationTime(), file->epoch_number,
+          file->file_checksum, file->file_checksum_func_name);
+      files.back().num_entries = file->num_entries;
+      files.back().num_deletions = file->num_deletions;
+      files.back().smallest = file->smallest.Encode().ToString();
+      files.back().largest = file->largest.Encode().ToString();
+      level_size += file->fd.GetFileSize();
+    }
+    cf_meta->levels.emplace_back(level, level_size, std::move(files));
+    cf_meta->size += level_size;
+  }
+  for (const auto& meta : vstorage->GetBlobFiles()) {
+    assert(meta);
+
+    cf_meta->blob_files.emplace_back(
+        meta->GetBlobFileNumber(), BlobFileName("", meta->GetBlobFileNumber()),
+        ioptions->cf_paths.front().path, meta->GetBlobFileSize(),
+        meta->GetTotalBlobCount(), meta->GetTotalBlobBytes(),
+        meta->GetGarbageBlobCount(), meta->GetGarbageBlobBytes(),
+        meta->GetChecksumMethod(), meta->GetChecksumValue());
+    ++cf_meta->blob_file_count;
+    cf_meta->blob_file_size += meta->GetBlobFileSize();
+  }
+}
+
+uint64_t Version::GetSstFilesSize() {
+  uint64_t sst_files_size = 0;
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    for (const auto& file_meta : storage_info_.LevelFiles(level)) {
+      sst_files_size += file_meta->fd.GetFileSize();
+    }
+  }
+  return sst_files_size;
+}
+
+void Version::GetSstFilesBoundaryKeys(Slice* smallest_user_key,
+                                      Slice* largest_user_key) {
+  smallest_user_key->clear();
+  largest_user_key->clear();
+  bool initialized = false;
+  const Comparator* ucmp = storage_info_.user_comparator_;
+  for (int level = 0; level < cfd_->NumberLevels(); level++) {
+    if (storage_info_.LevelFiles(level).size() == 0) {
+      continue;
+    }
+    if (level == 0) {
+      // we need to consider all files on level 0
+      for (const auto& file : storage_info_.LevelFiles(level)) {
+        const Slice& start_user_key = file->smallest.user_key();
+        if (!initialized ||
+            ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+          *smallest_user_key = start_user_key;
+        }
+        const Slice& end_user_key = file->largest.user_key();
+        if (!initialized ||
+            ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+          *largest_user_key = end_user_key;
+        }
+        initialized = true;
+      }
+    } else {
+      // we only need to consider the first and last file
+      const Slice& start_user_key =
+          storage_info_.LevelFiles(level)[0]->smallest.user_key();
+      if (!initialized ||
+          ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+        *smallest_user_key = start_user_key;
+      }
+      const Slice& end_user_key =
+          storage_info_.LevelFiles(level).back()->largest.user_key();
+      if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+        *largest_user_key = end_user_key;
+      }
+      initialized = true;
+    }
+  }
+}
+
+void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+  uint64_t oldest_time = std::numeric_limits<uint64_t>::max();
+  for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
+    for (FileMetaData* meta : storage_info_.LevelFiles(level)) {
+      assert(meta->fd.table_reader != nullptr);
+      uint64_t file_creation_time = meta->TryGetFileCreationTime();
+      if (file_creation_time == kUnknownFileCreationTime) {
+        *creation_time = 0;
+        return;
+      }
+      if (file_creation_time < oldest_time) {
+        oldest_time = file_creation_time;
+      }
+    }
+  }
+  *creation_time = oldest_time;
+}
+
+InternalIterator* Version::TEST_GetLevelIterator(
+    const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder,
+    int level, bool allow_unprepared_value) {
+  auto* arena = merge_iter_builder->GetArena();
+  auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
+  TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
+  auto level_iter = new (mem) LevelIterator(
+      cfd_->table_cache(), read_options, file_options_,
+      cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+      mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+      cfd_->internal_stats()->GetFileReadHist(level),
+      TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+      mutable_cf_options_.block_protection_bytes_per_key,
+      nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
+      allow_unprepared_value, &tombstone_iter_ptr);
+  if (read_options.ignore_range_deletions) {
+    merge_iter_builder->AddIterator(level_iter);
+  } else {
+    merge_iter_builder->AddPointAndTombstoneIterator(
+        level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
+  }
+  return level_iter;
+}
+
+uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
+  // Estimation will be inaccurate when:
+  // (1) there exist merge keys
+  // (2) keys are directly overwritten
+  // (3) deletion on non-existing keys
+  // (4) low number of samples
+  if (current_num_samples_ == 0) {
+    return 0;
+  }
+
+  if (current_num_non_deletions_ <= current_num_deletions_) {
+    return 0;
+  }
+
+  uint64_t est = current_num_non_deletions_ - current_num_deletions_;
+
+  uint64_t file_count = 0;
+  for (int level = 0; level < num_levels_; ++level) {
+    file_count += files_[level].size();
+  }
+
+  if (current_num_samples_ < file_count) {
+    // casting to avoid overflowing
+    return static_cast<uint64_t>(
+        (est * static_cast<double>(file_count) / current_num_samples_));
+  } else {
+    return est;
+  }
+}
+
+double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel(
+    int level) const {
+  assert(level < num_levels_);
+  uint64_t sum_file_size_bytes = 0;
+  uint64_t sum_data_size_bytes = 0;
+  for (auto* file_meta : files_[level]) {
+    sum_file_size_bytes += file_meta->fd.GetFileSize();
+    sum_data_size_bytes += file_meta->raw_key_size + file_meta->raw_value_size;
+  }
+  if (sum_file_size_bytes == 0) {
+    return -1.0;
+  }
+  return static_cast<double>(sum_data_size_bytes) / sum_file_size_bytes;
+}
+
+void Version::AddIterators(const ReadOptions& read_options,
+                           const FileOptions& soptions,
+                           MergeIteratorBuilder* merge_iter_builder,
+                           bool allow_unprepared_value) {
+  assert(storage_info_.finalized_);
+
+  for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+    AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level,
+                         allow_unprepared_value);
+  }
+}
+
+void Version::AddIteratorsForLevel(const ReadOptions& read_options,
+                                   const FileOptions& soptions,
+                                   MergeIteratorBuilder* merge_iter_builder,
+                                   int level, bool allow_unprepared_value) {
+  assert(storage_info_.finalized_);
+  if (level >= storage_info_.num_non_empty_levels()) {
+    // This is an empty level
+    return;
+  } else if (storage_info_.LevelFilesBrief(level).num_files == 0) {
+    // No files in this level
+    return;
+  }
+
+  bool should_sample = should_sample_file_read();
+
+  auto* arena = merge_iter_builder->GetArena();
+  if (level == 0) {
+    // Merge all level zero files together since they may overlap
+    TruncatedRangeDelIterator* tombstone_iter = nullptr;
+    for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+      const auto& file = storage_info_.LevelFilesBrief(0).files[i];
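+      // NewIterator() also fills in tombstone_iter (its last argument) with
+      // the file's range tombstones, which are wired into the merging
+      // iterator below.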
+      auto table_iter = cfd_->table_cache()->NewIterator(
+          read_options, soptions, cfd_->internal_comparator(),
+          *file.file_metadata, /*range_del_agg=*/nullptr,
+          mutable_cf_options_.prefix_extractor, nullptr,
+          cfd_->internal_stats()->GetFileReadHist(0),
+          TableReaderCaller::kUserIterator, arena,
+          /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
+          /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key=*/nullptr, allow_unprepared_value,
+          mutable_cf_options_.block_protection_bytes_per_key, &tombstone_iter);
+      if (read_options.ignore_range_deletions) {
+        merge_iter_builder->AddIterator(table_iter);
+      } else {
+        merge_iter_builder->AddPointAndTombstoneIterator(table_iter,
+                                                         tombstone_iter);
+      }
+    }
+    if (should_sample) {
+      // Count one sample for every L0 file. This is done per iterator
+      // creation rather than per Seek(), while files in other levels are
+      // recorded per seek. If users execute one range query per iterator,
+      // there may be some discrepancy here.
+      for (FileMetaData* meta : storage_info_.LevelFiles(0)) {
+        sample_file_read_inc(meta);
+      }
+    }
+  } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+    // For levels > 0, we can use a concatenating iterator that sequentially
+    // walks through the non-overlapping files in the level, opening them
+    // lazily.
+    auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
+    TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
+    auto level_iter = new (mem) LevelIterator(
+        cfd_->table_cache(), read_options, soptions,
+        cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+        mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+        cfd_->internal_stats()->GetFileReadHist(level),
+        TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+        mutable_cf_options_.block_protection_bytes_per_key,
+        /*range_del_agg=*/nullptr,
+        /*compaction_boundaries=*/nullptr, allow_unprepared_value,
+        &tombstone_iter_ptr);
+    if (read_options.ignore_range_deletions) {
+      merge_iter_builder->AddIterator(level_iter);
+    } else {
+      merge_iter_builder->AddPointAndTombstoneIterator(
+          level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
+    }
+  }
+}
+
+Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
+                                         const FileOptions& file_options,
+                                         const Slice& smallest_user_key,
+                                         const Slice& largest_user_key,
+                                         int level, bool* overlap) {
+  assert(storage_info_.finalized_);
+
+  auto icmp = cfd_->internal_comparator();
+  auto ucmp = icmp.user_comparator();
+
+  Arena arena;
+  Status status;
+  ReadRangeDelAggregator range_del_agg(&icmp,
+                                       kMaxSequenceNumber /* upper_bound */);
+
+  *overlap = false;
+
+  if (level == 0) {
+    for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+      const auto file = &storage_info_.LevelFilesBrief(0).files[i];
+      if (AfterFile(ucmp, &smallest_user_key, file) ||
+          BeforeFile(ucmp, &largest_user_key, file)) {
+        continue;
+      }
+      ScopedArenaIterator iter(cfd_->table_cache()->NewIterator(
+          read_options, file_options, cfd_->internal_comparator(),
+          *file->file_metadata, &range_del_agg,
+          mutable_cf_options_.prefix_extractor, nullptr,
+          cfd_->internal_stats()->GetFileReadHist(0),
+          TableReaderCaller::kUserIterator, &arena,
+          /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
+          /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key=*/nullptr,
+          /*allow_unprepared_value=*/false,
+          mutable_cf_options_.block_protection_bytes_per_key));
+      status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
+                                   iter.get(), overlap);
+      if (!status.ok() || *overlap) {
+        break;
+      }
+    }
+  } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+    auto mem = arena.AllocateAligned(sizeof(LevelIterator));
+    ScopedArenaIterator iter(new (mem) LevelIterator(
+        cfd_->table_cache(), read_options, file_options,
+        cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+        mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+        cfd_->internal_stats()->GetFileReadHist(level),
+        TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+        mutable_cf_options_.block_protection_bytes_per_key, &range_del_agg,
+        nullptr, false));
+    status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
+                                 iter.get(), overlap);
+  }
+
+  if (status.ok() && *overlap == false &&
+      range_del_agg.IsRangeOverlapped(smallest_user_key, largest_user_key)) {
+    *overlap = true;
+  }
+  return status;
+}
+
+VersionStorageInfo::VersionStorageInfo(
+    const InternalKeyComparator* internal_comparator,
+    const Comparator* user_comparator, int levels,
+    CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage,
+    bool _force_consistency_checks,
+    EpochNumberRequirement epoch_number_requirement)
+    : internal_comparator_(internal_comparator),
+      user_comparator_(user_comparator),
+      // cfd is nullptr if Version is dummy
+      num_levels_(levels),
+      num_non_empty_levels_(0),
+      file_indexer_(user_comparator),
+      compaction_style_(compaction_style),
+      files_(new std::vector<FileMetaData*>[num_levels_]),
+      base_level_(num_levels_ == 1 ? -1 : 1),
+      lowest_unnecessary_level_(-1),
+      level_multiplier_(0.0),
+      files_by_compaction_pri_(num_levels_),
+      level0_non_overlapping_(false),
+      next_file_to_compact_by_size_(num_levels_),
+      compaction_score_(num_levels_),
+      compaction_level_(num_levels_),
+      l0_delay_trigger_count_(0),
+      compact_cursor_(num_levels_),
+      accumulated_file_size_(0),
+      accumulated_raw_key_size_(0),
+      accumulated_raw_value_size_(0),
+      accumulated_num_non_deletions_(0),
+      accumulated_num_deletions_(0),
+      current_num_non_deletions_(0),
+      current_num_deletions_(0),
+      current_num_samples_(0),
+      estimated_compaction_needed_bytes_(0),
+      finalized_(false),
+      force_consistency_checks_(_force_consistency_checks),
+      epoch_number_requirement_(epoch_number_requirement) {
+  if (ref_vstorage != nullptr) {
+    accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
+    accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
+    accumulated_raw_value_size_ = ref_vstorage->accumulated_raw_value_size_;
+    accumulated_num_non_deletions_ =
+        ref_vstorage->accumulated_num_non_deletions_;
+    accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_;
+    current_num_non_deletions_ = ref_vstorage->current_num_non_deletions_;
+    current_num_deletions_ = ref_vstorage->current_num_deletions_;
+    current_num_samples_ = ref_vstorage->current_num_samples_;
+    oldest_snapshot_seqnum_ = ref_vstorage->oldest_snapshot_seqnum_;
+    compact_cursor_ = ref_vstorage->compact_cursor_;
+    compact_cursor_.resize(num_levels_);
+  }
+}
+
+Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
+                 const FileOptions& file_opt,
+                 const MutableCFOptions mutable_cf_options,
+                 const std::shared_ptr<IOTracer>& io_tracer,
+                 uint64_t version_number,
+                 EpochNumberRequirement epoch_number_requirement)
+    : env_(vset->env_),
+      clock_(vset->clock_),
+      cfd_(column_family_data),
+      info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->logger),
+      db_statistics_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->stats),
+      table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()),
+      blob_source_(cfd_ ? cfd_->blob_source() : nullptr),
+      merge_operator_(
+          (cfd_ == nullptr) ? nullptr
+                            : cfd_->ioptions()->merge_operator.get()),
+      storage_info_(
+          (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(),
+          (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(),
+          cfd_ == nullptr ? 0 : cfd_->NumberLevels(),
+          cfd_ == nullptr ? kCompactionStyleLevel
+                          : cfd_->ioptions()->compaction_style,
+          (cfd_ == nullptr || cfd_->current() == nullptr)
+              ? nullptr
+              : cfd_->current()->storage_info(),
+          cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks,
+          epoch_number_requirement),
+      vset_(vset),
+      next_(this),
+      prev_(this),
+      refs_(0),
+      file_options_(file_opt),
+      mutable_cf_options_(mutable_cf_options),
+      max_file_size_for_l0_meta_pin_(
+          MaxFileSizeForL0MetaPin(mutable_cf_options_)),
+      version_number_(version_number),
+      io_tracer_(io_tracer),
+      use_async_io_(env_->GetFileSystem()->use_async_io()) {}
+
+Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
+                        const Slice& blob_index_slice,
+                        FilePrefetchBuffer* prefetch_buffer,
+                        PinnableSlice* value, uint64_t* bytes_read) const {
+  BlobIndex blob_index;
+
+  {
+    Status s = blob_index.DecodeFrom(blob_index_slice);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return GetBlob(read_options, user_key, blob_index, prefetch_buffer, value,
+                 bytes_read);
+}
+
+Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
+                        const BlobIndex& blob_index,
+                        FilePrefetchBuffer* prefetch_buffer,
+                        PinnableSlice* value, uint64_t* bytes_read) const {
+  assert(value);
+
+  if (blob_index.HasTTL() || blob_index.IsInlined()) {
+    return Status::Corruption("Unexpected TTL/inlined blob index");
+  }
+
+  const uint64_t blob_file_number = blob_index.file_number();
+
+  auto blob_file_meta = storage_info_.GetBlobFileMetaData(blob_file_number);
+  if (!blob_file_meta) {
+    return Status::Corruption("Invalid blob file number");
+  }
+
+  assert(blob_source_);
+  value->Reset();
+  const Status s = blob_source_->GetBlob(
+      read_options, user_key, blob_file_number, blob_index.offset(),
+      blob_file_meta->GetBlobFileSize(), blob_index.size(),
+      blob_index.compression(), prefetch_buffer, value, bytes_read);
+
+  return s;
+}
+
+void Version::MultiGetBlob(
+    const ReadOptions& read_options, MultiGetRange& range,
+    std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs) {
+  assert(!blob_ctxs.empty());
+
+  autovector<BlobFileReadRequests> blob_reqs;
+
+  for (auto& ctx : blob_ctxs) {
+    const auto file_number = ctx.first;
+    const auto blob_file_meta = storage_info_.GetBlobFileMetaData(file_number);
+
+    autovector<BlobReadRequest> blob_reqs_in_file;
+    BlobReadContexts& blobs_in_file = ctx.second;
+    for (auto& blob : blobs_in_file) {
+      const BlobIndex& blob_index = blob.blob_index;
+      const KeyContext* const key_context = blob.key_context;
+      assert(key_context);
+      assert(key_context->get_context);
+      assert(key_context->s);
+
+      if (key_context->value) {
+        key_context->value->Reset();
+      } else {
+        assert(key_context->columns);
+        key_context->columns->Reset();
+      }
+
+      if (!blob_file_meta) {
+        *key_context->s = Status::Corruption("Invalid blob file number");
+        continue;
+      }
+
+      if (blob_index.HasTTL() || blob_index.IsInlined()) {
+        *key_context->s =
+            Status::Corruption("Unexpected TTL/inlined blob index");
+        continue;
+      }
+
+      blob_reqs_in_file.emplace_back(
+          key_context->get_context->ukey_to_get_blob_value(),
+          blob_index.offset(), blob_index.size(), blob_index.compression(),
+          &blob.result, key_context->s);
+    }
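+    // Submit a read batch for this blob file only if at least one of its
+    // blobs passed the validation above.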
+    if (blob_reqs_in_file.size() > 0) {
+      const auto file_size = blob_file_meta->GetBlobFileSize();
+      blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file);
+    }
+  }
+
+  if (blob_reqs.size() > 0) {
+    blob_source_->MultiGetBlob(read_options, blob_reqs,
+                               /*bytes_read=*/nullptr);
+  }
+
+  for (auto& ctx : blob_ctxs) {
+    BlobReadContexts& blobs_in_file = ctx.second;
+    for (auto& blob : blobs_in_file) {
+      const KeyContext* const key_context = blob.key_context;
+      assert(key_context);
+      assert(key_context->get_context);
+      assert(key_context->s);
+
+      if (key_context->s->ok()) {
+        if (key_context->value) {
+          *key_context->value = std::move(blob.result);
+          range.AddValueSize(key_context->value->size());
+        } else {
+          assert(key_context->columns);
+          key_context->columns->SetPlainValue(std::move(blob.result));
+          range.AddValueSize(key_context->columns->serialized_size());
+        }
+
+        if (range.GetValueSize() > read_options.value_size_soft_limit) {
+          *key_context->s = Status::Aborted();
+        }
+      } else if (key_context->s->IsIncomplete()) {
+        // read_options.read_tier == kBlockCacheTier
+        // Cannot read blob(s): no disk I/O allowed
+        auto& get_context = *(key_context->get_context);
+        get_context.MarkKeyMayExist();
+      }
+    }
+  }
+}
+
+void Version::Get(const ReadOptions& read_options, const LookupKey& k,
+                  PinnableSlice* value, PinnableWideColumns* columns,
+                  std::string* timestamp, Status* status,
+                  MergeContext* merge_context,
+                  SequenceNumber* max_covering_tombstone_seq,
+                  PinnedIteratorsManager* pinned_iters_mgr, bool* value_found,
+                  bool* key_exists, SequenceNumber* seq, ReadCallback* callback,
+                  bool* is_blob, bool do_merge) {
+  Slice ikey = k.internal_key();
+  Slice user_key = k.user_key();
+
+  assert(status->ok() || status->IsMergeInProgress());
+
+  if (key_exists != nullptr) {
+    // will falsify below if not found
+    *key_exists = true;
+  }
+
+  uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId;
+  if (vset_ && vset_->block_cache_tracer_ &&
+      vset_->block_cache_tracer_->is_tracing_enabled()) {
+    tracing_get_id = vset_->block_cache_tracer_->NextGetId();
+  }
+
+  // Note: the old StackableDB-based BlobDB passes in
+  // GetImplOptions::is_blob_index; for the integrated BlobDB implementation,
+  // we need to provide it here.
+  bool is_blob_index = false;
+  bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index;
+  BlobFetcher blob_fetcher(this, read_options);
+
+  assert(pinned_iters_mgr);
+  GetContext get_context(
+      user_comparator(), merge_operator_, info_log_, db_statistics_,
+      status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
+      do_merge ? value : nullptr, do_merge ? columns : nullptr,
+      do_merge ? timestamp : nullptr, value_found, merge_context, do_merge,
+      max_covering_tombstone_seq, clock_, seq,
+      merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use,
+      tracing_get_id, &blob_fetcher);
+
+  // Pin blocks that we read to hold merge operands
+  if (merge_operator_) {
+    pinned_iters_mgr->StartPinning();
+  }
+
+  FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_,
+                storage_info_.num_non_empty_levels_,
+                &storage_info_.file_indexer_, user_comparator(),
+                internal_comparator());
+  FdWithKeyRange* f = fp.GetNextFile();
+
+  while (f != nullptr) {
+    if (*max_covering_tombstone_seq > 0) {
+      // The remaining files we look at will only contain covered keys, so
+      // we stop here.
+      break;
+    }
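+    // Account for this read in the file's read-sampling statistics.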
+    if (get_context.sample()) {
+      sample_file_read_inc(f->file_metadata);
+    }
+
+    bool timer_enabled =
+        GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+        get_perf_context()->per_level_perf_context_enabled;
+    StopWatchNano timer(clock_, timer_enabled /* auto_start */);
+    *status = table_cache_->Get(
+        read_options, *internal_comparator(), *f->file_metadata, ikey,
+        &get_context, mutable_cf_options_.block_protection_bytes_per_key,
+        mutable_cf_options_.prefix_extractor,
+        cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+        IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                        fp.IsHitFileLastInLevel()),
+        fp.GetHitFileLevel(), max_file_size_for_l0_meta_pin_);
+    // TODO: examine the behavior for corrupted key
+    if (timer_enabled) {
+      PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+                                fp.GetHitFileLevel());
+    }
+    if (!status->ok()) {
+      if (db_statistics_ != nullptr) {
+        get_context.ReportCounters();
+      }
+      return;
+    }
+
+    // report the counters before returning
+    if (get_context.State() != GetContext::kNotFound &&
+        get_context.State() != GetContext::kMerge &&
+        db_statistics_ != nullptr) {
+      get_context.ReportCounters();
+    }
+    switch (get_context.State()) {
+      case GetContext::kNotFound:
+        // Keep searching in other files
+        break;
+      case GetContext::kMerge:
+        // TODO: update per-level perfcontext user_key_return_count for kMerge
+        break;
+      case GetContext::kFound:
+        if (fp.GetHitFileLevel() == 0) {
+          RecordTick(db_statistics_, GET_HIT_L0);
+        } else if (fp.GetHitFileLevel() == 1) {
+          RecordTick(db_statistics_, GET_HIT_L1);
+        } else if (fp.GetHitFileLevel() >= 2) {
+          RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+        }
+
+        PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1,
+                                  fp.GetHitFileLevel());
+
+        if (is_blob_index && do_merge && (value || columns)) {
+          assert(!columns ||
+                 (!columns->columns().empty() &&
+                  columns->columns().front().name() == kDefaultWideColumnName));
+
+          Slice blob_index =
+              value ? *value : columns->columns().front().value();
+
+          TEST_SYNC_POINT_CALLBACK("Version::Get::TamperWithBlobIndex",
+                                   &blob_index);
+
+          constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+
+          PinnableSlice result;
+
+          constexpr uint64_t* bytes_read = nullptr;
+
+          *status = GetBlob(read_options, get_context.ukey_to_get_blob_value(),
+                            blob_index, prefetch_buffer, &result, bytes_read);
+          if (!status->ok()) {
+            if (status->IsIncomplete()) {
+              get_context.MarkKeyMayExist();
+            }
+            return;
+          }
+
+          if (value) {
+            *value = std::move(result);
+          } else {
+            assert(columns);
+            columns->SetPlainValue(std::move(result));
+          }
+        }
+
+        return;
+      case GetContext::kDeleted:
+        // Use empty error message for speed
+        *status = Status::NotFound();
+        return;
+      case GetContext::kCorrupt:
+        *status = Status::Corruption("corrupted key for ", user_key);
+        return;
+      case GetContext::kUnexpectedBlobIndex:
+        ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+        *status = Status::NotSupported(
+            "Encounter unexpected blob index. Please open DB with "
+            "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+        return;
+      case GetContext::kMergeOperatorFailed:
+        *status = Status::Corruption(Status::SubCode::kMergeOperatorFailed);
+        return;
+    }
+    f = fp.GetNextFile();
+  }
+  if (db_statistics_ != nullptr) {
+    get_context.ReportCounters();
+  }
+  if (GetContext::kMerge == get_context.State()) {
+    if (!do_merge) {
+      *status = Status::OK();
+      return;
+    }
+    if (!merge_operator_) {
+      *status = Status::InvalidArgument(
+          "merge_operator is not properly initialized.");
+      return;
+    }
+    // merge_operands are in saver and we hit the beginning of the key
+    // history; do a final merge of nullptr and operands.
+    if (value || columns) {
+      std::string result;
+      // `op_failure_scope` (an output parameter) is not provided (set to
+      // nullptr) since a failure must be propagated regardless of its value.
+      *status = MergeHelper::TimedFullMerge(
+          merge_operator_, user_key, nullptr, merge_context->GetOperands(),
+          &result, info_log_, db_statistics_, clock_,
+          /* result_operand */ nullptr, /* update_num_ops_stats */ true,
+          /* op_failure_scope */ nullptr);
+      if (status->ok()) {
+        if (LIKELY(value != nullptr)) {
+          *(value->GetSelf()) = std::move(result);
+          value->PinSelf();
+        } else {
+          assert(columns != nullptr);
+          columns->SetPlainValue(std::move(result));
+        }
+      }
+    }
+  } else {
+    if (key_exists != nullptr) {
+      *key_exists = false;
+    }
+    *status = Status::NotFound();  // Use an empty error message for speed
+  }
+}
+
+void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+                       ReadCallback* callback) {
+  PinnedIteratorsManager pinned_iters_mgr;
+
+  // Pin blocks that we read to hold merge operands
+  if (merge_operator_) {
+    pinned_iters_mgr.StartPinning();
+  }
+  uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
+
+  if (vset_ && vset_->block_cache_tracer_ &&
+      vset_->block_cache_tracer_->is_tracing_enabled()) {
+    tracing_mget_id = vset_->block_cache_tracer_->NextGetId();
+  }
+  // Even though we know the batch size won't be > MAX_BATCH_SIZE,
+  // use autovector in order to avoid unnecessary construction of GetContext
+  // objects, which is expensive
+  autovector<GetContext, 16> get_ctx;
+  BlobFetcher blob_fetcher(this, read_options);
+  for (auto iter = range->begin(); iter != range->end(); ++iter) {
+    assert(iter->s->ok() || iter->s->IsMergeInProgress());
+    get_ctx.emplace_back(
+        user_comparator(), merge_operator_, info_log_, db_statistics_,
+        iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge,
+        iter->ukey_with_ts, iter->value, iter->columns, iter->timestamp,
+        nullptr, &(iter->merge_context), true,
+        &iter->max_covering_tombstone_seq, clock_, nullptr,
+        merge_operator_ ? &pinned_iters_mgr : nullptr, callback,
+        &iter->is_blob_index, tracing_mget_id, &blob_fetcher);
+    // MergeInProgress status, if set, has been transferred to the get_context
+    // state, so we set status to ok here. From now on, the iter status will
+    // be used for IO errors, and get_context state will be used for any
+    // key level errors
+    *(iter->s) = Status::OK();
+  }
+  int get_ctx_index = 0;
+  for (auto iter = range->begin(); iter != range->end();
+       ++iter, get_ctx_index++) {
+    iter->get_context = &(get_ctx[get_ctx_index]);
+  }
+
+  Status s;
+  // blob_file => [[blob_idx, it], ...]
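+  // Each entry of blob_ctxs groups the blob reads that point into one blob
+  // file, so MultiGetBlob() can batch them per file.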
+  std::unordered_map<uint64_t, BlobReadContexts> blob_ctxs;
+  MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
+#if USE_COROUTINES
+  if (read_options.async_io && read_options.optimize_multiget_for_io &&
+      using_coroutines() && use_async_io_) {
+    s = MultiGetAsync(read_options, range, &blob_ctxs);
+  } else
+#endif  // USE_COROUTINES
+  {
+    MultiGetRange file_picker_range(*range, range->begin(), range->end());
+    FilePickerMultiGet fp(&file_picker_range,
+                          &storage_info_.level_files_brief_,
+                          storage_info_.num_non_empty_levels_,
+                          &storage_info_.file_indexer_, user_comparator(),
+                          internal_comparator());
+    FdWithKeyRange* f = fp.GetNextFileInLevel();
+    uint64_t num_index_read = 0;
+    uint64_t num_filter_read = 0;
+    uint64_t num_sst_read = 0;
+    uint64_t num_level_read = 0;
+
+    int prev_level = -1;
+
+    while (!fp.IsSearchEnded()) {
+      // This will be set to true later if we actually look up in a file in L0.
+      // For per level stats purposes, an L0 file is treated as a level
+      bool dump_stats_for_l0_file = false;
+
+      // Avoid using the coroutine version if we're looking in a L0 file, since
+      // L0 files won't be parallelized anyway. The regular synchronous version
+      // is faster.
+      if (!read_options.async_io || !using_coroutines() || !use_async_io_ ||
+          fp.GetHitFileLevel() == 0 || !fp.RemainingOverlapInLevel()) {
+        if (f) {
+          bool skip_filters =
+              IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                              fp.IsHitFileLastInLevel());
+          // Call MultiGetFromSST for looking up a single file
+          s = MultiGetFromSST(read_options, fp.CurrentFileRange(),
+                              fp.GetHitFileLevel(), skip_filters,
+                              /*skip_range_deletions=*/false, f, blob_ctxs,
+                              /*table_handle=*/nullptr, num_filter_read,
+                              num_index_read, num_sst_read);
+          if (fp.GetHitFileLevel() == 0) {
+            dump_stats_for_l0_file = true;
+          }
+        }
+        if (s.ok()) {
+          f = fp.GetNextFileInLevel();
+        }
+#if USE_COROUTINES
+      } else {
+        std::vector<folly::coro::Task<Status>> mget_tasks;
+        while (f != nullptr) {
+          MultiGetRange file_range = fp.CurrentFileRange();
+          TableCache::TypedHandle* table_handle = nullptr;
+          bool skip_filters =
+              IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                              fp.IsHitFileLastInLevel());
+          bool skip_range_deletions = false;
+          if (!skip_filters) {
+            Status status = table_cache_->MultiGetFilter(
+                read_options, *internal_comparator(), *f->file_metadata,
+                mutable_cf_options_.prefix_extractor,
+                cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+                fp.GetHitFileLevel(), &file_range, &table_handle,
+                mutable_cf_options_.block_protection_bytes_per_key);
+            skip_range_deletions = true;
+            if (status.ok()) {
+              skip_filters = true;
+            } else if (!status.IsNotSupported()) {
+              s = status;
+            }
+          }
+
+          if (!s.ok()) {
+            break;
+          }
+
+          if (!file_range.empty()) {
+            mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
+                read_options, file_range, fp.GetHitFileLevel(), skip_filters,
+                skip_range_deletions, f, blob_ctxs, table_handle,
+                num_filter_read, num_index_read, num_sst_read));
+          }
+          if (fp.KeyMaySpanNextFile()) {
+            break;
+          }
+          f = fp.GetNextFileInLevel();
+        }
+        if (mget_tasks.size() > 0) {
+          RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
+                     mget_tasks.size());
+          // Collect all results so far
+          std::vector<Status> statuses = folly::coro::blockingWait(
+              folly::coro::collectAllRange(std::move(mget_tasks))
+                  .scheduleOn(&range->context()->executor()));
+          if (s.ok()) {
+            for (Status stat : statuses) {
+              if (!stat.ok()) {
+                s = std::move(stat);
+                break;
+              }
+            }
+          }
+
+          if (s.ok() && fp.KeyMaySpanNextFile()) {
+            f = fp.GetNextFileInLevel();
+          }
+        }
+#endif  // USE_COROUTINES
+      }
+      // If we got a bad status, or we found the final result for all the
+      // keys, stop searching.
+      if (!s.ok() || file_picker_range.empty()) {
+        break;
+      }
+      if (!f) {
+        // Reached the end of this level. Prepare the next level
+        fp.PrepareNextLevelForSearch();
+        if (!fp.IsSearchEnded()) {
+          // It's possible there is no overlap on this level and f is nullptr
+          f = fp.GetNextFileInLevel();
+        }
+        if (dump_stats_for_l0_file ||
+            (prev_level != 0 && prev_level != (int)fp.GetHitFileLevel())) {
+          // Dump the stats if the search has moved to the next level and
+          // reset for next level.
+          if (num_filter_read + num_index_read) {
+            RecordInHistogram(db_statistics_,
+                              NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+                              num_index_read + num_filter_read);
+          }
+          if (num_sst_read) {
+            RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL,
+                              num_sst_read);
+            num_level_read++;
+          }
+          num_filter_read = 0;
+          num_index_read = 0;
+          num_sst_read = 0;
+        }
+        prev_level = fp.GetHitFileLevel();
+      }
+    }
+
+    // Dump stats for most recent level
+    if (num_filter_read + num_index_read) {
+      RecordInHistogram(db_statistics_,
+                        NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+                        num_index_read + num_filter_read);
+    }
+    if (num_sst_read) {
+      RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
+      num_level_read++;
+    }
+    if (num_level_read) {
+      RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET,
+                        num_level_read);
+    }
+  }
+
+  if (s.ok() && !blob_ctxs.empty()) {
+    MultiGetBlob(read_options, keys_with_blobs_range, blob_ctxs);
+  }
+
+  // Process any left over keys
+  for (auto iter = range->begin(); s.ok() && iter != range->end(); ++iter) {
+    GetContext& get_context = *iter->get_context;
+    Status* status = iter->s;
+    Slice user_key = iter->lkey->user_key();
+
+    if (db_statistics_ != nullptr) {
+      get_context.ReportCounters();
+    }
+    if (GetContext::kMerge == get_context.State()) {
+      if (!merge_operator_) {
+        *status = Status::InvalidArgument(
+            "merge_operator is not properly initialized.");
+        range->MarkKeyDone(iter);
+        continue;
+      }
+      // merge_operands are in saver and we hit the beginning of the key
+      // history; do a final merge of nullptr and operands.
+      std::string result;
+
+      // `op_failure_scope` (an output parameter) is not provided (set to
+      // nullptr) since a failure must be propagated regardless of its value.
+      *status = MergeHelper::TimedFullMerge(
+          merge_operator_, user_key, nullptr,
+          iter->merge_context.GetOperands(), &result, info_log_,
+          db_statistics_, clock_,
+          /* result_operand */ nullptr, /* update_num_ops_stats */ true,
+          /* op_failure_scope */ nullptr);
+      if (LIKELY(iter->value != nullptr)) {
+        *iter->value->GetSelf() = std::move(result);
+        iter->value->PinSelf();
+        range->AddValueSize(iter->value->size());
+      } else {
+        assert(iter->columns);
+        iter->columns->SetPlainValue(std::move(result));
+        range->AddValueSize(iter->columns->serialized_size());
+      }
+
+      range->MarkKeyDone(iter);
+      if (range->GetValueSize() > read_options.value_size_soft_limit) {
+        s = Status::Aborted();
+        break;
+      }
+    } else {
+      range->MarkKeyDone(iter);
+      *status = Status::NotFound();  // Use an empty error message for speed
+    }
+  }
+
+  for (auto iter = range->begin(); iter != range->end(); ++iter) {
+    range->MarkKeyDone(iter);
+    *(iter->s) = s;
+  }
+}
+
+#ifdef USE_COROUTINES
+Status Version::ProcessBatch(
+    const ReadOptions& read_options, FilePickerMultiGet* batch,
+    std::vector<folly::coro::Task<Status>>& mget_tasks,
+    std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
+    autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting,
+    std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
+    std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>&
+        mget_stats) {
+  FilePickerMultiGet& fp = *batch;
+  MultiGetRange range = fp.GetRange();
+  // Initialize a new empty range. Any keys that are not in this level will
+  // eventually become part of the new range.
+  MultiGetRange leftover(range, range.begin(), range.begin());
+  FdWithKeyRange* f = nullptr;
+  Status s;
+
+  f = fp.GetNextFileInLevel();
+  while (!f) {
+    fp.PrepareNextLevelForSearch();
+    if (!fp.IsSearchEnded()) {
+      f = fp.GetNextFileInLevel();
+    } else {
+      break;
+    }
+  }
+  while (f) {
+    MultiGetRange file_range = fp.CurrentFileRange();
+    TableCache::TypedHandle* table_handle = nullptr;
+    bool skip_filters = IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                                        fp.IsHitFileLastInLevel());
+    bool skip_range_deletions = false;
+    if (!skip_filters) {
+      Status status = table_cache_->MultiGetFilter(
+          read_options, *internal_comparator(), *f->file_metadata,
+          mutable_cf_options_.prefix_extractor,
+          cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+          fp.GetHitFileLevel(), &file_range, &table_handle,
+          mutable_cf_options_.block_protection_bytes_per_key);
+      if (status.ok()) {
+        skip_filters = true;
+        skip_range_deletions = true;
+      } else if (!status.IsNotSupported()) {
+        s = status;
+      }
+    }
+    if (!s.ok()) {
+      break;
+    }
+    // At this point, file_range contains any keys that are likely in this
+    // file. It may have false positives, but that's ok since higher level
+    // lookups for the key are dependent on this lookup anyway.
+    // Add the complement of file_range to leftover. That's the set of keys
+    // definitely not in this level.
+    // Subtract the complement of file_range from range, since they will be
+    // processed in a separate batch in parallel.
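+    // (MultiGetRange supports set-style operators here: ~ complements a
+    // range, += adds one, and -= removes one.)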
+    leftover += ~file_range;
+    range -= ~file_range;
+    if (!file_range.empty()) {
+      int level = fp.GetHitFileLevel();
+      auto stat = mget_stats.find(level);
+      if (stat == mget_stats.end()) {
+        auto entry = mget_stats.insert({level, {0, 0, 0}});
+        assert(entry.second);
+        stat = entry.first;
+      }
+
+      if (waiting.empty() && to_process.empty() &&
+          !fp.RemainingOverlapInLevel() && leftover.empty() &&
+          mget_tasks.empty()) {
+        // All keys are in one SST file, so take the fast path
+        s = MultiGetFromSST(read_options, file_range, fp.GetHitFileLevel(),
+                            skip_filters, skip_range_deletions, f, *blob_ctxs,
+                            table_handle, std::get<0>(stat->second),
+                            std::get<1>(stat->second),
+                            std::get<2>(stat->second));
+      } else {
+        mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
+            read_options, file_range, fp.GetHitFileLevel(), skip_filters,
+            skip_range_deletions, f, *blob_ctxs, table_handle,
+            std::get<0>(stat->second), std::get<1>(stat->second),
+            std::get<2>(stat->second)));
+        ++num_tasks_queued;
+      }
+    }
+    if (fp.KeyMaySpanNextFile() && !file_range.empty()) {
+      break;
+    }
+    f = fp.GetNextFileInLevel();
+  }
+  // Split the current batch only if some keys are likely in this level and
+  // some are not. Only split if we're done with this level, i.e. f is null.
+  // Otherwise, it means there are more files in this level to look at.
+  if (s.ok() && !f && !leftover.empty() && !range.empty()) {
+    fp.ReplaceRange(range);
+    batches.emplace_back(&leftover, fp);
+    to_process.emplace_back(batches.size() - 1);
+  }
+  // 1. If f is non-null, that means we might not be done with this level.
+  //    This can happen if one of the keys is the last key in the file, i.e.
+  //    fp.KeyMaySpanNextFile() is true.
+  // 2. If range is empty, then we're done with this range and there is no
+  //    need to prepare the next level.
+  // 3. If some tasks were queued for this range, then the next level will be
+  //    prepared after executing those tasks.
+  if (!f && !range.empty() && !num_tasks_queued) {
+    fp.PrepareNextLevelForSearch();
+  }
+  return s;
+}
+
+Status Version::MultiGetAsync(
+    const ReadOptions& options, MultiGetRange* range,
+    std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs) {
+  autovector<FilePickerMultiGet, 4> batches;
+  std::deque<size_t> waiting;
+  std::deque<size_t> to_process;
+  Status s;
+  std::vector<folly::coro::Task<Status>> mget_tasks;
+  std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>> mget_stats;
+
+  // Create the initial batch with the input range
+  batches.emplace_back(range, &storage_info_.level_files_brief_,
+                       storage_info_.num_non_empty_levels_,
+                       &storage_info_.file_indexer_, user_comparator(),
+                       internal_comparator());
+  to_process.emplace_back(0);
+
+  while (!to_process.empty()) {
+    // As we process a batch, it may get split into two. So reserve space for
+    // an additional batch in the autovector in order to prevent later moves
+    // of elements in ProcessBatch().
+    batches.reserve(batches.size() + 1);
+
+    size_t idx = to_process.front();
+    FilePickerMultiGet* batch = &batches.at(idx);
+    unsigned int num_tasks_queued = 0;
+    to_process.pop_front();
+    if (batch->IsSearchEnded() || batch->GetRange().empty()) {
+      // If to_process is empty, i.e. no more batches to look at, then we need
+      // to schedule the enqueued coroutines and wait for them. Otherwise, we
+      // skip this batch and move to the next one in to_process.
+      if (!to_process.empty()) {
+        continue;
+      }
+    } else {
+      // Look through one level. This may split the batch and enqueue it to
+      // to_process
+      s = ProcessBatch(options, batch, mget_tasks, blob_ctxs, batches, waiting,
+                       to_process, num_tasks_queued, mget_stats);
+      // If ProcessBatch didn't enqueue any coroutine tasks, it means all
+      // keys were filtered out. So put the batch back in to_process to
+      // lookup in the next level
+      if (!num_tasks_queued && !batch->IsSearchEnded()) {
+        // Put this back in the processing queue
+        to_process.emplace_back(idx);
+      } else if (num_tasks_queued) {
+        waiting.emplace_back(idx);
+      }
+    }
+    // If ProcessBatch() returned an error, then schedule the enqueued
+    // coroutines and wait for them, then abort the MultiGet.
+    if (to_process.empty() || !s.ok()) {
+      if (mget_tasks.size() > 0) {
+        assert(waiting.size());
+        RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
+                   mget_tasks.size());
+        // Collect all results so far
+        std::vector<Status> statuses = folly::coro::blockingWait(
+            folly::coro::collectAllRange(std::move(mget_tasks))
+                .scheduleOn(&range->context()->executor()));
+        mget_tasks.clear();
+        if (s.ok()) {
+          for (Status stat : statuses) {
+            if (!stat.ok()) {
+              s = std::move(stat);
+              break;
+            }
+          }
+        }
+
+        if (!s.ok()) {
+          break;
+        }
+
+        for (size_t wait_idx : waiting) {
+          FilePickerMultiGet& fp = batches.at(wait_idx);
+          // 1. If fp.GetHitFile() is non-null, then there could be more
+          //    overlap in this level. So skip preparing next level.
+          // 2. If fp.GetRange() is empty, then this batch is completed
+          //    and no need to prepare the next level.
+          if (!fp.GetHitFile() && !fp.GetRange().empty()) {
+            fp.PrepareNextLevelForSearch();
+          }
+        }
+        to_process.swap(waiting);
+      } else {
+        assert(!s.ok() || waiting.size() == 0);
+      }
+    }
+    if (!s.ok()) {
+      break;
+    }
+  }
+
+  uint64_t num_levels = 0;
+  for (auto& stat : mget_stats) {
+    if (stat.first == 0) {
+      num_levels += std::get<2>(stat.second);
+    } else {
+      num_levels++;
+    }
+
+    uint64_t num_meta_reads =
+        std::get<0>(stat.second) + std::get<1>(stat.second);
+    uint64_t num_sst_reads = std::get<2>(stat.second);
+    if (num_meta_reads > 0) {
+      RecordInHistogram(db_statistics_,
+                        NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+                        num_meta_reads);
+    }
+    if (num_sst_reads > 0) {
+      RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_reads);
+    }
+  }
+  if (num_levels > 0) {
+    RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET, num_levels);
+  }
+
+  return s;
+}
+#endif
+
+bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) {
+  // Reaching the bottom level implies misses at all upper levels, so we'll
+  // skip checking the filters when we predict a hit.
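+  // In other words, only the bottommost non-empty level skips its filters;
+  // for L0 this additionally requires the file to be the last one in the
+  // level.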
+  return cfd_->ioptions()->optimize_filters_for_hits &&
+         (level > 0 || is_file_last_in_level) &&
+         level == storage_info_.num_non_empty_levels() - 1;
+}
+
+void VersionStorageInfo::GenerateLevelFilesBrief() {
+  level_files_brief_.resize(num_non_empty_levels_);
+  for (int level = 0; level < num_non_empty_levels_; level++) {
+    DoGenerateLevelFilesBrief(&level_files_brief_[level], files_[level],
+                              &arena_);
+  }
+}
+
+void VersionStorageInfo::PrepareForVersionAppend(
+    const ImmutableOptions& immutable_options,
+    const MutableCFOptions& mutable_cf_options) {
+  ComputeCompensatedSizes();
+  UpdateNumNonEmptyLevels();
+  CalculateBaseBytes(immutable_options, mutable_cf_options);
+  UpdateFilesByCompactionPri(immutable_options, mutable_cf_options);
+  GenerateFileIndexer();
+  GenerateLevelFilesBrief();
+  GenerateLevel0NonOverlapping();
+  if (!immutable_options.allow_ingest_behind) {
+    GenerateBottommostFiles();
+  }
+  GenerateFileLocationIndex();
+}
+
+void Version::PrepareAppend(const MutableCFOptions& mutable_cf_options,
+                            const ReadOptions& read_options,
+                            bool update_stats) {
+  TEST_SYNC_POINT_CALLBACK(
+      "Version::PrepareAppend:forced_check",
+      reinterpret_cast<void*>(&storage_info_.force_consistency_checks_));
+
+  if (update_stats) {
+    UpdateAccumulatedStats(read_options);
+  }
+
+  storage_info_.PrepareForVersionAppend(*cfd_->ioptions(), mutable_cf_options);
+}
+
+bool Version::MaybeInitializeFileMetaData(const ReadOptions& read_options,
+                                          FileMetaData* file_meta) {
+  if (file_meta->init_stats_from_file ||
+      file_meta->compensated_file_size > 0) {
+    return false;
+  }
+  std::shared_ptr<const TableProperties> tp;
+  Status s = GetTableProperties(read_options, &tp, file_meta);
+  file_meta->init_stats_from_file = true;
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(vset_->db_options_->info_log,
+                    "Unable to load table properties for file %" PRIu64
+                    " --- %s\n",
+                    file_meta->fd.GetNumber(), s.ToString().c_str());
+    return false;
+  }
+  if (tp.get() == nullptr) return false;
+  file_meta->num_entries = tp->num_entries;
+  file_meta->num_deletions = tp->num_deletions;
+  file_meta->raw_value_size = tp->raw_value_size;
+  file_meta->raw_key_size = tp->raw_key_size;
+  file_meta->num_range_deletions = tp->num_range_deletions;
+  return true;
+}
+
+void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
+  TEST_SYNC_POINT_CALLBACK("VersionStorageInfo::UpdateAccumulatedStats",
+                           nullptr);
+
+  assert(file_meta->init_stats_from_file);
+  accumulated_file_size_ += file_meta->fd.GetFileSize();
+  accumulated_raw_key_size_ += file_meta->raw_key_size;
+  accumulated_raw_value_size_ += file_meta->raw_value_size;
+  accumulated_num_non_deletions_ +=
+      file_meta->num_entries - file_meta->num_deletions;
+  accumulated_num_deletions_ += file_meta->num_deletions;
+
+  current_num_non_deletions_ +=
+      file_meta->num_entries - file_meta->num_deletions;
+  current_num_deletions_ += file_meta->num_deletions;
+  current_num_samples_++;
+}
+
+void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) {
+  if (file_meta->init_stats_from_file) {
+    current_num_non_deletions_ -=
+        file_meta->num_entries - file_meta->num_deletions;
+    current_num_deletions_ -= file_meta->num_deletions;
+    current_num_samples_--;
+  }
+}
+
+void Version::UpdateAccumulatedStats(const ReadOptions& read_options) {
+  // maximum number of table properties loaded from files.
+  const int kMaxInitCount = 20;
+  int init_count = 0;
+  // Here, only the first kMaxInitCount files which haven't been
+  // initialized from file will be updated with num_deletions.
+  // The motivation here is to cap the maximum I/O per Version creation.
+  // The reason for choosing files from lower levels instead of higher
+  // levels is that such a design is able to propagate the initialization
+  // from lower levels to higher levels: when the num_deletions of
+  // lower-level files are updated, the lower-level files get an accurate
+  // compensated_file_size, so lower-level to higher-level compactions
+  // will be triggered, which creates higher-level files whose
+  // num_deletions will be updated here.
+  for (int level = 0;
+       level < storage_info_.num_levels_ && init_count < kMaxInitCount;
+       ++level) {
+    for (auto* file_meta : storage_info_.files_[level]) {
+      if (MaybeInitializeFileMetaData(read_options, file_meta)) {
+        // each FileMeta will be initialized only once.
+        storage_info_.UpdateAccumulatedStats(file_meta);
+        // when option "max_open_files" is -1, all the file metadata has
+        // already been read, so MaybeInitializeFileMetaData() won't incur
+        // any I/O cost. "max_open_files=-1" means that the table cache passed
+        // to the VersionSet and then to the ColumnFamilySet has a size of
+        // TableCache::kInfiniteCapacity
+        if (vset_->GetColumnFamilySet()->get_table_cache()->GetCapacity() ==
+            TableCache::kInfiniteCapacity) {
+          continue;
+        }
+        if (++init_count >= kMaxInitCount) {
+          break;
+        }
+      }
+    }
+  }
+  // In case all sampled files contain only deletion entries, we load the
+  // table properties of a file in a higher level to initialize that value.
+  for (int level = storage_info_.num_levels_ - 1;
+       storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
+    for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
+         storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
+      if (MaybeInitializeFileMetaData(read_options,
+                                      storage_info_.files_[level][i])) {
+        storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+      }
+    }
+  }
+}
+
+void VersionStorageInfo::ComputeCompensatedSizes() {
+  static const int kDeletionWeightOnCompaction = 2;
+  uint64_t average_value_size = GetAverageValueSize();
+
+  // compute the compensated size
+  for (int level = 0; level < num_levels_; level++) {
+    for (auto* file_meta : files_[level]) {
+      // Here we only compute compensated_file_size for those file_meta
+      // which compensated_file_size is uninitialized (== 0). This is true only
+      // for files that have been created right now and no other thread has
+      // access to them. That's why we can safely mutate compensated_file_size.
+      if (file_meta->compensated_file_size == 0) {
+        file_meta->compensated_file_size = file_meta->fd.GetFileSize();
+        // Here we only boost the size of deletion entries of a file
+        // when the number of deletion entries is greater than the number of
+        // non-deletion entries in the file. The motivation here is that in
+        // a stable workload, the number of deletion entries should be roughly
+        // equal to the number of non-deletion entries. If we compensated the
+        // size of deletion entries in a stable workload, the deletion
+        // compensation logic might introduce unwanted effects which change
+        // the shape of the LSM tree.
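+        // Example: a file with num_entries = 100 of which 80 are point
+        // deletions (and none are range deletions) gets
+        // (80 * 2 - 100) * average_value_size * 2 = 120 * average_value_size
+        // extra bytes of compensated size from the branch below.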
+        if ((file_meta->num_deletions - file_meta->num_range_deletions) * 2 >=
+            file_meta->num_entries) {
+          file_meta->compensated_file_size +=
+              ((file_meta->num_deletions - file_meta->num_range_deletions) *
+                   2 -
+               file_meta->num_entries) *
+              average_value_size * kDeletionWeightOnCompaction;
+        }
+        file_meta->compensated_file_size +=
+            file_meta->compensated_range_deletion_size;
+      }
+    }
+  }
+}
+
+int VersionStorageInfo::MaxInputLevel() const {
+  if (compaction_style_ == kCompactionStyleLevel) {
+    return num_levels() - 2;
+  }
+  return 0;
+}
+
+int VersionStorageInfo::MaxOutputLevel(bool allow_ingest_behind) const {
+  if (allow_ingest_behind) {
+    assert(num_levels() > 1);
+    return num_levels() - 2;
+  }
+  return num_levels() - 1;
+}
+
+void VersionStorageInfo::EstimateCompactionBytesNeeded(
+    const MutableCFOptions& mutable_cf_options) {
+  // Only implemented for level-based compaction
+  if (compaction_style_ != kCompactionStyleLevel) {
+    estimated_compaction_needed_bytes_ = 0;
+    return;
+  }
+
+  // Start from Level 0. If level 0 qualifies for compaction to level 1,
+  // we estimate the size of that compaction.
+  // Then we move on to the next level and see whether it qualifies for
+  // compaction to the next level. The size of the level is estimated as the
+  // actual size on the level plus the input bytes from the previous level,
+  // if there are any. If that exceeds the level target, take the excess
+  // bytes as compaction input and add the compaction size to the total.
+  // We keep doing this for Level 2, 3, etc., until the last level, and
+  // return the accumulated bytes.
+
+  uint64_t bytes_compact_to_next_level = 0;
+  uint64_t level_size = 0;
+  for (auto* f : files_[0]) {
+    level_size += f->fd.GetFileSize();
+  }
+  // Level 0
+  bool level0_compact_triggered = false;
+  if (static_cast<int>(files_[0].size()) >=
+          mutable_cf_options.level0_file_num_compaction_trigger ||
+      level_size >= mutable_cf_options.max_bytes_for_level_base) {
+    level0_compact_triggered = true;
+    estimated_compaction_needed_bytes_ = level_size;
+    bytes_compact_to_next_level = level_size;
+  } else {
+    estimated_compaction_needed_bytes_ = 0;
+  }
+
+  // Level 1 and up.
+  uint64_t bytes_next_level = 0;
+  for (int level = base_level(); level <= MaxInputLevel(); level++) {
+    level_size = 0;
+    if (bytes_next_level > 0) {
+#ifndef NDEBUG
+      uint64_t level_size2 = 0;
+      for (auto* f : files_[level]) {
+        level_size2 += f->fd.GetFileSize();
+      }
+      assert(level_size2 == bytes_next_level);
+#endif
+      level_size = bytes_next_level;
+      bytes_next_level = 0;
+    } else {
+      for (auto* f : files_[level]) {
+        level_size += f->fd.GetFileSize();
+      }
+    }
+    if (level == base_level() && level0_compact_triggered) {
+      // Add base level size to compaction if level0 compaction triggered.
+      estimated_compaction_needed_bytes_ += level_size;
+    }
+    // Add size added by previous compaction
+    level_size += bytes_compact_to_next_level;
+    bytes_compact_to_next_level = 0;
+    uint64_t level_target = MaxBytesForLevel(level);
+    if (level_size > level_target) {
+      bytes_compact_to_next_level = level_size - level_target;
+      // Estimate the actual compaction fan-out ratio as the size ratio
+      // between the two levels.
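+      // Example: if this level overflows its target by 10 MB and the next
+      // level holds 5x the data of this one, roughly 10 MB * (5 + 1) = 60 MB
+      // of compaction I/O is added to the estimate below.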
+
+      assert(bytes_next_level == 0);
+      if (level + 1 < num_levels_) {
+        for (auto* f : files_[level + 1]) {
+          bytes_next_level += f->fd.GetFileSize();
+        }
+      }
+      if (bytes_next_level > 0) {
+        assert(level_size > 0);
+        estimated_compaction_needed_bytes_ += static_cast<uint64_t>(
+            static_cast<double>(bytes_compact_to_next_level) *
+            (static_cast<double>(bytes_next_level) /
+                 static_cast<double>(level_size) +
+             1));
+      }
+    }
+  }
+}
+
+namespace {
+uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions,
+                                 const MutableCFOptions& mutable_cf_options,
+                                 const std::vector<FileMetaData*>& files) {
+  uint32_t ttl_expired_files_count = 0;
+
+  int64_t _current_time;
+  auto status = ioptions.clock->GetCurrentTime(&_current_time);
+  if (status.ok()) {
+    const uint64_t current_time = static_cast<uint64_t>(_current_time);
+    for (FileMetaData* f : files) {
+      if (!f->being_compacted) {
+        uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+        if (oldest_ancester_time != 0 &&
+            oldest_ancester_time < (current_time - mutable_cf_options.ttl)) {
+          ttl_expired_files_count++;
+        }
+      }
+    }
+  }
+  return ttl_expired_files_count;
+}
+
+bool ShouldChangeFileTemperature(const ImmutableOptions& ioptions,
+                                 const MutableCFOptions& mutable_cf_options,
+                                 const std::vector<FileMetaData*>& files) {
+  const std::vector<FileTemperatureAge>& ages =
+      mutable_cf_options.compaction_options_fifo
+          .file_temperature_age_thresholds;
+  if (ages.empty()) {
+    return false;
+  }
+  if (files.empty()) {
+    return false;
+  }
+  int64_t _current_time;
+  auto status = ioptions.clock->GetCurrentTime(&_current_time);
+  const uint64_t current_time = static_cast<uint64_t>(_current_time);
+  // We use the oldest_ancestor_time of a file as the estimated age of
+  // the file just older than it. This is the same logic used in
+  // FIFOCompactionPicker::PickTemperatureChangeCompaction().
+  if (status.ok() && current_time >= ages[0].age) {
+    uint64_t create_time_threshold = current_time - ages[0].age;
+    Temperature target_temp;
+    assert(files.size() >= 1);
+    for (size_t index = files.size() - 1; index >= 1; --index) {
+      FileMetaData* cur_file = files[index];
+      FileMetaData* prev_file = files[index - 1];
+      if (!cur_file->being_compacted) {
+        uint64_t oldest_ancestor_time = prev_file->TryGetOldestAncesterTime();
+        if (oldest_ancestor_time == kUnknownOldestAncesterTime) {
+          return false;
+        }
+        if (oldest_ancestor_time > create_time_threshold) {
+          return false;
+        }
+        target_temp = ages[0].temperature;
+        for (size_t i = 1; i < ages.size(); ++i) {
+          if (current_time >= ages[i].age &&
+              oldest_ancestor_time <= current_time - ages[i].age) {
+            target_temp = ages[i].temperature;
+          }
+        }
+        if (cur_file->temperature != target_temp) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+}  // anonymous namespace
+
+void VersionStorageInfo::ComputeCompactionScore(
+    const ImmutableOptions& immutable_options,
+    const MutableCFOptions& mutable_cf_options) {
+  double total_downcompact_bytes = 0.0;
+  // Historically, score is defined as actual bytes in a level divided by
+  // the level's target size, and 1.0 is the threshold for triggering
+  // compaction. Higher score means higher prioritization.
+  // Now we keep the compaction triggering condition, but consider more
+  // factors for prioritization, while still keeping the 1.0 threshold.
+  // In order to provide flexibility for reducing score while still
+  // maintaining it to be over 1.0, we scale the original score by 10x
+  // if it is larger than 1.0.
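+  // Example: a level at 1.5x its target size scores 1.5 and is scaled to
+  // 15.0, which leaves room for de-prioritization below while staying above
+  // the 1.0 trigger threshold.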
+ const double kScoreScale = 10.0; + for (int level = 0; level <= MaxInputLevel(); level++) { + double score; + if (level == 0) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). + int num_sorted_runs = 0; + uint64_t total_size = 0; + for (auto* f : files_[level]) { + total_downcompact_bytes += static_cast(f->fd.GetFileSize()); + if (!f->being_compacted) { + total_size += f->compensated_file_size; + num_sorted_runs++; + } + } + if (compaction_style_ == kCompactionStyleUniversal) { + // For universal compaction, we use level0 score to indicate + // compaction score for the whole DB. Adding other levels as if + // they are L0 files. + for (int i = 1; i < num_levels(); i++) { + // It's possible that a subset of the files in a level may be in a + // compaction, due to delete triggered compaction or trivial move. + // In that case, the below check may not catch a level being + // compacted as it only checks the first file. The worst that can + // happen is a scheduled compaction thread will find nothing to do. + if (!files_[i].empty() && !files_[i][0]->being_compacted) { + num_sorted_runs++; + } + } + } + + if (compaction_style_ == kCompactionStyleFIFO) { + score = static_cast(total_size) / + mutable_cf_options.compaction_options_fifo.max_table_files_size; + if (score < 1 && + mutable_cf_options.compaction_options_fifo.allow_compaction) { + score = std::max( + static_cast(num_sorted_runs) / + mutable_cf_options.level0_file_num_compaction_trigger, + score); + } + if (score < 1 && mutable_cf_options.ttl > 0) { + score = + std::max(static_cast(GetExpiredTtlFilesCount( + immutable_options, mutable_cf_options, files_[0])), + score); + } + if (score < 1 && + ShouldChangeFileTemperature(immutable_options, mutable_cf_options, + files_[0])) { + // For FIFO, just need a large enough score to trigger compaction. + const double kScoreForNeedCompaction = 1.1; + score = kScoreForNeedCompaction; + } + } else { + score = static_cast(num_sorted_runs) / + mutable_cf_options.level0_file_num_compaction_trigger; + if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) { + // Level-based involves L0->L0 compactions that can lead to oversized + // L0 files. Take into account size as well to avoid later giant + // compactions to the base level. + // If score in L0 is always too high, L0->L1 will always be + // prioritized over L1->L2 compaction and L1 will accumulate to + // too large. But if L0 score isn't high enough, L0 will accumulate + // and data is not moved to L1 fast enough. With potential L0->L0 + // compaction, number of L0 files aren't always an indication of + // L0 oversizing, and we also need to consider total size of L0. + if (immutable_options.level_compaction_dynamic_level_bytes) { + if (total_size >= mutable_cf_options.max_bytes_for_level_base) { + // When calculating estimated_compaction_needed_bytes, we assume + // L0 is qualified as pending compactions. We will need to make + // sure that it qualifies for compaction. 
+              // It might be guaranteed by logic below anyway, but we are
+              // explicit here to make sure we don't stop writes with no
+              // compaction scheduled.
+              score = std::max(score, 1.01);
+            }
+            if (total_size > level_max_bytes_[base_level_]) {
+              // In this case, we compare L0 size with the actual L1 size and
+              // make sure score is more than 1.0 (10.0 after being scaled) if
+              // L0 is larger than L1. Since in this case L1's score is lower
+              // than 10.0, L0->L1 is prioritized over L1->L2.
+              uint64_t base_level_size = 0;
+              for (auto f : files_[base_level_]) {
+                base_level_size += f->compensated_file_size;
+              }
+              score = std::max(score, static_cast<double>(total_size) /
+                                          static_cast<double>(std::max(
+                                              base_level_size,
+                                              level_max_bytes_[base_level_])));
+            }
+            if (score > 1.0) {
+              score *= kScoreScale;
+            }
+          } else {
+            score = std::max(score,
+                             static_cast<double>(total_size) /
+                                 mutable_cf_options.max_bytes_for_level_base);
+          }
+        }
+      }
+    } else {  // level > 0
+      // Compute the ratio of current size to size limit.
+      uint64_t level_bytes_no_compacting = 0;
+      uint64_t level_total_bytes = 0;
+      for (auto f : files_[level]) {
+        level_total_bytes += f->fd.GetFileSize();
+        if (!f->being_compacted) {
+          level_bytes_no_compacting += f->compensated_file_size;
+        }
+      }
+      if (!immutable_options.level_compaction_dynamic_level_bytes) {
+        score = static_cast<double>(level_bytes_no_compacting) /
+                MaxBytesForLevel(level);
+      } else {
+        if (level_bytes_no_compacting < MaxBytesForLevel(level)) {
+          score = static_cast<double>(level_bytes_no_compacting) /
+                  MaxBytesForLevel(level);
+        } else {
+          // If a large amount of data is about to be compacted down to the
+          // current level soon, we de-prioritize compaction from a level
+          // where the incoming data would make up a large ratio. We do this
+          // by dividing the level size not by the target level size alone,
+          // but by the target size plus the incoming compaction bytes.
+          score = static_cast<double>(level_bytes_no_compacting) /
+                  (MaxBytesForLevel(level) + total_downcompact_bytes) *
+                  kScoreScale;
+        }
+        // Drain unnecessary levels, but with lower priority compared to
+        // when L0 is eligible. Only non-empty levels can be unnecessary.
+        // If there are no unnecessary levels, lowest_unnecessary_level_ is
+        // -1.
+        if (level_bytes_no_compacting > 0 &&
+            level <= lowest_unnecessary_level_) {
+          score = std::max(
+              score,
+              kScoreScale *
+                  (1.001 + 0.001 * (lowest_unnecessary_level_ - level)));
+        }
+      }
+      if (level <= lowest_unnecessary_level_) {
+        total_downcompact_bytes += level_total_bytes;
+      } else if (level_total_bytes > MaxBytesForLevel(level)) {
+        total_downcompact_bytes +=
+            static_cast<double>(level_total_bytes - MaxBytesForLevel(level));
+      }
+    }
+    compaction_level_[level] = level;
+    compaction_score_[level] = score;
+  }
+
+  // Sort all the levels based on their score. Higher scores get listed
+  // first. Use bubble sort because the number of entries is small.
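+  // (Illustrative: with num_levels() == 4 and scores {L0: 0.5, L1: 12.0,
+  // L2: 3.0}, the pass below yields the order L1, L2, L0, so the highest
+  // scoring level is considered for compaction first.)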
+ for (int i = 0; i < num_levels() - 2; i++) { + for (int j = i + 1; j < num_levels() - 1; j++) { + if (compaction_score_[i] < compaction_score_[j]) { + double score = compaction_score_[i]; + int level = compaction_level_[i]; + compaction_score_[i] = compaction_score_[j]; + compaction_level_[i] = compaction_level_[j]; + compaction_score_[j] = score; + compaction_level_[j] = level; + } + } + } + ComputeFilesMarkedForCompaction(); + if (!immutable_options.allow_ingest_behind) { + ComputeBottommostFilesMarkedForCompaction(); + } + if (mutable_cf_options.ttl > 0) { + ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl); + } + if (mutable_cf_options.periodic_compaction_seconds > 0) { + ComputeFilesMarkedForPeriodicCompaction( + immutable_options, mutable_cf_options.periodic_compaction_seconds); + } + + if (mutable_cf_options.enable_blob_garbage_collection && + mutable_cf_options.blob_garbage_collection_age_cutoff > 0.0 && + mutable_cf_options.blob_garbage_collection_force_threshold < 1.0) { + ComputeFilesMarkedForForcedBlobGC( + mutable_cf_options.blob_garbage_collection_age_cutoff, + mutable_cf_options.blob_garbage_collection_force_threshold); + } + + EstimateCompactionBytesNeeded(mutable_cf_options); +} + +void VersionStorageInfo::ComputeFilesMarkedForCompaction() { + files_marked_for_compaction_.clear(); + int last_qualify_level = 0; + + // Do not include files from the last level with data + // If table properties collector suggests a file on the last level, + // we should not move it to a new level. + for (int level = num_levels() - 1; level >= 1; level--) { + if (!files_[level].empty()) { + last_qualify_level = level - 1; + break; + } + } + + for (int level = 0; level <= last_qualify_level; level++) { + for (auto* f : files_[level]) { + if (!f->being_compacted && f->marked_for_compaction) { + files_marked_for_compaction_.emplace_back(level, f); + } + } + } +} + +void VersionStorageInfo::ComputeExpiredTtlFiles( + const ImmutableOptions& ioptions, const uint64_t ttl) { + assert(ttl > 0); + + expired_ttl_files_.clear(); + + int64_t _current_time; + auto status = ioptions.clock->GetCurrentTime(&_current_time); + if (!status.ok()) { + return; + } + const uint64_t current_time = static_cast(_current_time); + + for (int level = 0; level < num_levels() - 1; level++) { + for (FileMetaData* f : files_[level]) { + if (!f->being_compacted) { + uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); + if (oldest_ancester_time > 0 && + oldest_ancester_time < (current_time - ttl)) { + expired_ttl_files_.emplace_back(level, f); + } + } + } + } +} + +void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( + const ImmutableOptions& ioptions, + const uint64_t periodic_compaction_seconds) { + assert(periodic_compaction_seconds > 0); + + files_marked_for_periodic_compaction_.clear(); + + int64_t temp_current_time; + auto status = ioptions.clock->GetCurrentTime(&temp_current_time); + if (!status.ok()) { + return; + } + const uint64_t current_time = static_cast(temp_current_time); + + // If periodic_compaction_seconds is larger than current time, periodic + // compaction can't possibly be triggered. + if (periodic_compaction_seconds > current_time) { + return; + } + + const uint64_t allowed_time_limit = + current_time - periodic_compaction_seconds; + + for (int level = 0; level < num_levels(); level++) { + for (auto f : files_[level]) { + if (!f->being_compacted) { + // Compute a file's modification time in the following order: + // 1. 
Use file_creation_time table property if it is > 0. + // 2. Use creation_time table property if it is > 0. + // 3. Use file's mtime metadata if the above two table properties are 0. + // Don't consider the file at all if the modification time cannot be + // correctly determined based on the above conditions. + uint64_t file_modification_time = f->TryGetFileCreationTime(); + if (file_modification_time == kUnknownFileCreationTime) { + file_modification_time = f->TryGetOldestAncesterTime(); + } + if (file_modification_time == kUnknownOldestAncesterTime) { + auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(), + f->fd.GetPathId()); + status = ioptions.env->GetFileModificationTime( + file_path, &file_modification_time); + if (!status.ok()) { + ROCKS_LOG_WARN(ioptions.logger, + "Can't get file modification time: %s: %s", + file_path.c_str(), status.ToString().c_str()); + continue; + } + } + if (file_modification_time > 0 && + file_modification_time < allowed_time_limit) { + files_marked_for_periodic_compaction_.emplace_back(level, f); + } + } + } + } +} + +void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC( + double blob_garbage_collection_age_cutoff, + double blob_garbage_collection_force_threshold) { + files_marked_for_forced_blob_gc_.clear(); + + if (blob_files_.empty()) { + return; + } + + // Number of blob files eligible for GC based on age + const size_t cutoff_count = static_cast( + blob_garbage_collection_age_cutoff * blob_files_.size()); + if (!cutoff_count) { + return; + } + + // Compute the sum of total and garbage bytes over the oldest batch of blob + // files. The oldest batch is defined as the set of blob files which are + // kept alive by the same SSTs as the very oldest one. Here is a toy example. + // Let's assume we have three SSTs 1, 2, and 3, and four blob files 10, 11, + // 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob file 10 and + // potentially some higher-numbered ones, while SST 3 relies on blob file 12 + // and potentially some higher-numbered ones. Then, the SST to oldest blob + // file mapping is as follows: + // + // SST file number Oldest blob file number + // 1 10 + // 2 10 + // 3 12 + // + // This is what the same thing looks like from the blob files' POV. (Note that + // the linked SSTs simply denote the inverse mapping of the above.) + // + // Blob file number Linked SST set + // 10 {1, 2} + // 11 {} + // 12 {3} + // 13 {} + // + // Then, the oldest batch of blob files consists of blob files 10 and 11, + // and we can get rid of them by forcing the compaction of SSTs 1 and 2. + // + // Note that the overall ratio of garbage computed for the batch has to exceed + // blob_garbage_collection_force_threshold and the entire batch has to be + // eligible for GC according to blob_garbage_collection_age_cutoff in order + // for us to schedule any compactions. 
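+  // (Illustrative, hypothetical numbers: if the oldest batch totals 100MB of
+  // blob data of which 60MB is garbage, and
+  // blob_garbage_collection_force_threshold is 0.5, then 60MB >= 50MB and
+  // the SSTs linked to the batch are marked for compaction below.)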
+ const auto& oldest_meta = blob_files_.front(); + assert(oldest_meta); + + const auto& linked_ssts = oldest_meta->GetLinkedSsts(); + assert(!linked_ssts.empty()); + + size_t count = 1; + uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes(); + uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes(); + + assert(cutoff_count <= blob_files_.size()); + + for (; count < cutoff_count; ++count) { + const auto& meta = blob_files_[count]; + assert(meta); + + if (!meta->GetLinkedSsts().empty()) { + // Found the beginning of the next batch of blob files + break; + } + + sum_total_blob_bytes += meta->GetTotalBlobBytes(); + sum_garbage_blob_bytes += meta->GetGarbageBlobBytes(); + } + + if (count < blob_files_.size()) { + const auto& meta = blob_files_[count]; + assert(meta); + + if (meta->GetLinkedSsts().empty()) { + // Some files in the oldest batch are not eligible for GC + return; + } + } + + if (sum_garbage_blob_bytes < + blob_garbage_collection_force_threshold * sum_total_blob_bytes) { + return; + } + + for (uint64_t sst_file_number : linked_ssts) { + const FileLocation location = GetFileLocation(sst_file_number); + assert(location.IsValid()); + + const int level = location.GetLevel(); + assert(level >= 0); + + const size_t pos = location.GetPosition(); + + FileMetaData* const sst_meta = files_[level][pos]; + assert(sst_meta); + + if (sst_meta->being_compacted) { + continue; + } + + files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta); + } +} + +namespace { + +// used to sort files by size +struct Fsize { + size_t index; + FileMetaData* file; +}; + +// Comparator that is used to sort files based on their size +// In normal mode: descending size +bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { + return (first.file->compensated_file_size > + second.file->compensated_file_size); +} +} // anonymous namespace + +void VersionStorageInfo::AddFile(int level, FileMetaData* f) { + auto& level_files = files_[level]; + level_files.push_back(f); + + f->refs++; +} + +void VersionStorageInfo::AddBlobFile( + std::shared_ptr blob_file_meta) { + assert(blob_file_meta); + + assert(blob_files_.empty() || + (blob_files_.back() && blob_files_.back()->GetBlobFileNumber() < + blob_file_meta->GetBlobFileNumber())); + + blob_files_.emplace_back(std::move(blob_file_meta)); +} + +VersionStorageInfo::BlobFiles::const_iterator +VersionStorageInfo::GetBlobFileMetaDataLB(uint64_t blob_file_number) const { + return std::lower_bound( + blob_files_.begin(), blob_files_.end(), blob_file_number, + [](const std::shared_ptr& lhs, uint64_t rhs) { + assert(lhs); + return lhs->GetBlobFileNumber() < rhs; + }); +} + +void VersionStorageInfo::SetFinalized() { + finalized_ = true; + +#ifndef NDEBUG + if (compaction_style_ != kCompactionStyleLevel) { + // Not level based compaction. 
+ return; + } + assert(base_level_ < 0 || num_levels() == 1 || + (base_level_ >= 1 && base_level_ < num_levels())); + // Verify all levels newer than base_level are empty except L0 + for (int level = 1; level < base_level(); level++) { + assert(NumLevelBytes(level) == 0); + } + uint64_t max_bytes_prev_level = 0; + for (int level = base_level(); level < num_levels() - 1; level++) { + if (LevelFiles(level).size() == 0) { + continue; + } + assert(MaxBytesForLevel(level) >= max_bytes_prev_level); + max_bytes_prev_level = MaxBytesForLevel(level); + } + for (int level = 0; level < num_levels(); level++) { + assert(LevelFiles(level).size() == 0 || + LevelFiles(level).size() == LevelFilesBrief(level).num_files); + if (LevelFiles(level).size() > 0) { + assert(level < num_non_empty_levels()); + } + } + assert(compaction_level_.size() > 0); + assert(compaction_level_.size() == compaction_score_.size()); +#endif +} + +void VersionStorageInfo::UpdateNumNonEmptyLevels() { + num_non_empty_levels_ = num_levels_; + for (int i = num_levels_ - 1; i >= 0; i--) { + if (files_[i].size() != 0) { + return; + } else { + num_non_empty_levels_ = i; + } + } +} + +namespace { +// Sort `temp` based on ratio of overlapping size over file size +void SortFileByOverlappingRatio( + const InternalKeyComparator& icmp, const std::vector& files, + const std::vector& next_level_files, SystemClock* clock, + int level, int num_non_empty_levels, uint64_t ttl, + std::vector* temp) { + std::unordered_map file_to_order; + auto next_level_it = next_level_files.begin(); + + int64_t curr_time; + Status status = clock->GetCurrentTime(&curr_time); + if (!status.ok()) { + // If we can't get time, disable TTL. + ttl = 0; + } + + FileTtlBooster ttl_booster(static_cast(curr_time), ttl, + num_non_empty_levels, level); + + for (auto& file : files) { + uint64_t overlapping_bytes = 0; + // Skip files in next level that is smaller than current file + while (next_level_it != next_level_files.end() && + icmp.Compare((*next_level_it)->largest, file->smallest) < 0) { + next_level_it++; + } + + while (next_level_it != next_level_files.end() && + icmp.Compare((*next_level_it)->smallest, file->largest) < 0) { + overlapping_bytes += (*next_level_it)->fd.file_size; + + if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) { + // next level file cross large boundary of current file. + break; + } + next_level_it++; + } + + uint64_t ttl_boost_score = (ttl > 0) ? ttl_booster.GetBoostScore(file) : 1; + assert(ttl_boost_score > 0); + assert(file->compensated_file_size != 0); + file_to_order[file->fd.GetNumber()] = overlapping_bytes * 1024U / + file->compensated_file_size / + ttl_boost_score; + } + + size_t num_to_sort = temp->size() > VersionStorageInfo::kNumberFilesToSort + ? VersionStorageInfo::kNumberFilesToSort + : temp->size(); + + std::partial_sort(temp->begin(), temp->begin() + num_to_sort, temp->end(), + [&](const Fsize& f1, const Fsize& f2) -> bool { + // If score is the same, pick file with smaller keys. + // This makes the algorithm more deterministic, and also + // help the trivial move case to have more files to + // extend. 
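+                      // (Illustrative: when two files tie on the integer
+                      // order value computed above, the smallest-key
+                      // comparison below keeps the result deterministic.)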
+                      if (file_to_order[f1.file->fd.GetNumber()] ==
+                          file_to_order[f2.file->fd.GetNumber()]) {
+                        return icmp.Compare(f1.file->smallest,
+                                            f2.file->smallest) < 0;
+                      }
+                      return file_to_order[f1.file->fd.GetNumber()] <
+                             file_to_order[f2.file->fd.GetNumber()];
+                    });
+}
+
+void SortFileByRoundRobin(const InternalKeyComparator& icmp,
+                          std::vector<InternalKey>* compact_cursor,
+                          bool level0_non_overlapping, int level,
+                          std::vector<Fsize>* temp) {
+  if (level == 0 && !level0_non_overlapping) {
+    // Using kOldestSmallestSeqFirst when level == 0, since the
+    // files may overlap (not fully sorted)
+    std::sort(temp->begin(), temp->end(),
+              [](const Fsize& f1, const Fsize& f2) -> bool {
+                return f1.file->fd.smallest_seqno < f2.file->fd.smallest_seqno;
+              });
+    return;
+  }
+
+  bool should_move_files =
+      compact_cursor->at(level).size() > 0 && temp->size() > 1;
+
+  // The iterator points to the Fsize with the smallest key larger than or
+  // equal to the given cursor
+  std::vector<Fsize>::iterator current_file_iter;
+  if (should_move_files) {
+    // Find the file whose smallest key is larger than or equal to
+    // the cursor (the smallest key in the successor file of the last
+    // chosen file); skip this if the cursor is invalid or there is only
+    // one file in this level
+    current_file_iter = std::lower_bound(
+        temp->begin(), temp->end(), compact_cursor->at(level),
+        [&](const Fsize& f, const InternalKey& cursor) -> bool {
+          return icmp.Compare(cursor, f.file->smallest) > 0;
+        });
+
+    should_move_files =
+        current_file_iter != temp->end() && current_file_iter != temp->begin();
+  }
+  if (should_move_files) {
+    // Construct a local temporary vector
+    std::vector<Fsize> local_temp;
+    local_temp.reserve(temp->size());
+    // Move the selected file into the first position and its successors
+    // into the second, third, ..., positions
+    for (auto iter = current_file_iter; iter != temp->end(); iter++) {
+      local_temp.push_back(*iter);
+    }
+    // Move the original predecessors of the selected file in a round-robin
+    // manner
+    for (auto iter = temp->begin(); iter != current_file_iter; iter++) {
+      local_temp.push_back(*iter);
+    }
+    // Replace all the items in temp
+    for (size_t i = 0; i < local_temp.size(); i++) {
+      temp->at(i) = local_temp[i];
+    }
+  }
+}
+}  // anonymous namespace
+
+void VersionStorageInfo::UpdateFilesByCompactionPri(
+    const ImmutableOptions& ioptions, const MutableCFOptions& options) {
+  if (compaction_style_ == kCompactionStyleNone ||
+      compaction_style_ == kCompactionStyleFIFO ||
+      compaction_style_ == kCompactionStyleUniversal) {
+    // don't need this
+    return;
+  }
+  // No need to sort the highest level because it is never compacted.
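+  // (Illustrative: with kRoundRobin and files {a, b, c, d} where the cursor
+  // points just before c, SortFileByRoundRobin above rotates the order to
+  // {c, d, a, b}, so picking resumes at the cursor and wraps around.)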
+ for (int level = 0; level < num_levels() - 1; level++) { + const std::vector& files = files_[level]; + auto& files_by_compaction_pri = files_by_compaction_pri_[level]; + assert(files_by_compaction_pri.size() == 0); + + // populate a temp vector for sorting based on size + std::vector temp(files.size()); + for (size_t i = 0; i < files.size(); i++) { + temp[i].index = i; + temp[i].file = files[i]; + } + + // sort the top number_of_files_to_sort_ based on file size + size_t num = VersionStorageInfo::kNumberFilesToSort; + if (num > temp.size()) { + num = temp.size(); + } + switch (ioptions.compaction_pri) { + case kByCompensatedSize: + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareCompensatedSizeDescending); + break; + case kOldestLargestSeqFirst: + std::sort(temp.begin(), temp.end(), + [](const Fsize& f1, const Fsize& f2) -> bool { + return f1.file->fd.largest_seqno < + f2.file->fd.largest_seqno; + }); + break; + case kOldestSmallestSeqFirst: + std::sort(temp.begin(), temp.end(), + [](const Fsize& f1, const Fsize& f2) -> bool { + return f1.file->fd.smallest_seqno < + f2.file->fd.smallest_seqno; + }); + break; + case kMinOverlappingRatio: + SortFileByOverlappingRatio(*internal_comparator_, files_[level], + files_[level + 1], ioptions.clock, level, + num_non_empty_levels_, options.ttl, &temp); + break; + case kRoundRobin: + SortFileByRoundRobin(*internal_comparator_, &compact_cursor_, + level0_non_overlapping_, level, &temp); + break; + default: + assert(false); + } + assert(temp.size() == files.size()); + + // initialize files_by_compaction_pri_ + for (size_t i = 0; i < temp.size(); i++) { + files_by_compaction_pri.push_back(static_cast(temp[i].index)); + } + next_file_to_compact_by_size_[level] = 0; + assert(files_[level].size() == files_by_compaction_pri_[level].size()); + } +} + +void VersionStorageInfo::GenerateLevel0NonOverlapping() { + assert(!finalized_); + level0_non_overlapping_ = true; + if (level_files_brief_.size() == 0) { + return; + } + + // A copy of L0 files sorted by smallest key + std::vector level0_sorted_file( + level_files_brief_[0].files, + level_files_brief_[0].files + level_files_brief_[0].num_files); + std::sort(level0_sorted_file.begin(), level0_sorted_file.end(), + [this](const FdWithKeyRange& f1, const FdWithKeyRange& f2) -> bool { + return (internal_comparator_->Compare(f1.smallest_key, + f2.smallest_key) < 0); + }); + + for (size_t i = 1; i < level0_sorted_file.size(); ++i) { + FdWithKeyRange& f = level0_sorted_file[i]; + FdWithKeyRange& prev = level0_sorted_file[i - 1]; + if (internal_comparator_->Compare(prev.largest_key, f.smallest_key) >= 0) { + level0_non_overlapping_ = false; + break; + } + } +} + +void VersionStorageInfo::GenerateBottommostFiles() { + assert(!finalized_); + assert(bottommost_files_.empty()); + for (size_t level = 0; level < level_files_brief_.size(); ++level) { + for (size_t file_idx = 0; file_idx < level_files_brief_[level].num_files; + ++file_idx) { + const FdWithKeyRange& f = level_files_brief_[level].files[file_idx]; + int l0_file_idx; + if (level == 0) { + l0_file_idx = static_cast(file_idx); + } else { + l0_file_idx = -1; + } + Slice smallest_user_key = ExtractUserKey(f.smallest_key); + Slice largest_user_key = ExtractUserKey(f.largest_key); + if (!RangeMightExistAfterSortedRun(smallest_user_key, largest_user_key, + static_cast(level), + l0_file_idx)) { + bottommost_files_.emplace_back(static_cast(level), + f.file_metadata); + } + } + } +} + +void VersionStorageInfo::GenerateFileLocationIndex() { + size_t 
num_files = 0; + + for (int level = 0; level < num_levels_; ++level) { + num_files += files_[level].size(); + } + + file_locations_.reserve(num_files); + + for (int level = 0; level < num_levels_; ++level) { + for (size_t pos = 0; pos < files_[level].size(); ++pos) { + const FileMetaData* const meta = files_[level][pos]; + assert(meta); + + const uint64_t file_number = meta->fd.GetNumber(); + + assert(file_locations_.find(file_number) == file_locations_.end()); + file_locations_.emplace(file_number, FileLocation(level, pos)); + } + } +} + +void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum) { + assert(seqnum >= oldest_snapshot_seqnum_); + oldest_snapshot_seqnum_ = seqnum; + if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) { + ComputeBottommostFilesMarkedForCompaction(); + } +} + +void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() { + bottommost_files_marked_for_compaction_.clear(); + bottommost_files_mark_threshold_ = kMaxSequenceNumber; + for (auto& level_and_file : bottommost_files_) { + if (!level_and_file.second->being_compacted && + level_and_file.second->fd.largest_seqno != 0) { + // largest_seqno might be nonzero due to containing the final key in an + // earlier compaction, whose seqnum we didn't zero out. Multiple deletions + // ensures the file really contains deleted or overwritten keys. + if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) { + bottommost_files_marked_for_compaction_.push_back(level_and_file); + } else { + bottommost_files_mark_threshold_ = + std::min(bottommost_files_mark_threshold_, + level_and_file.second->fd.largest_seqno); + } + } + } +} + +void Version::Ref() { ++refs_; } + +bool Version::Unref() { + assert(refs_ >= 1); + --refs_; + if (refs_ == 0) { + delete this; + return true; + } + return false; +} + +bool VersionStorageInfo::OverlapInLevel(int level, + const Slice* smallest_user_key, + const Slice* largest_user_key) { + if (level >= num_non_empty_levels_) { + // empty level, no overlap + return false; + } + return SomeFileOverlapsRange(*internal_comparator_, (level > 0), + level_files_brief_[level], smallest_user_key, + largest_user_key); +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +// If hint_index is specified, then it points to a file in the +// overlapping range. +// The file_index returns a pointer to any file in an overlapping range. +void VersionStorageInfo::GetOverlappingInputs( + int level, const InternalKey* begin, const InternalKey* end, + std::vector* inputs, int hint_index, int* file_index, + bool expand_range, InternalKey** next_smallest) const { + if (level >= num_non_empty_levels_) { + // this level is empty, no overlapping inputs + return; + } + + inputs->clear(); + if (file_index) { + *file_index = -1; + } + const Comparator* user_cmp = user_comparator_; + if (level > 0) { + GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index, + file_index, false, next_smallest); + return; + } + + if (next_smallest) { + // next_smallest key only makes sense for non-level 0, where files are + // non-overlapping + *next_smallest = nullptr; + } + + Slice user_begin, user_end; + if (begin != nullptr) { + user_begin = begin->user_key(); + } + if (end != nullptr) { + user_end = end->user_key(); + } + + // index stores the file index need to check. 
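+  // (That is, the indices of L0 files not yet classified; overlapping files
+  // are erased as they are added to the inputs, and the scan repeats until a
+  // full pass finds no new overlap with the possibly expanded range.)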
+  std::list<size_t> index;
+  for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
+    index.emplace_back(i);
+  }
+
+  while (!index.empty()) {
+    bool found_overlapping_file = false;
+    auto iter = index.begin();
+    while (iter != index.end()) {
+      FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]);
+      const Slice file_start = ExtractUserKey(f->smallest_key);
+      const Slice file_limit = ExtractUserKey(f->largest_key);
+      if (begin != nullptr &&
+          user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) {
+        // "f" is completely before the specified range; skip it
+        iter++;
+      } else if (end != nullptr &&
+                 user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) {
+        // "f" is completely after the specified range; skip it
+        iter++;
+      } else {
+        // "f" overlaps the range
+        inputs->emplace_back(files_[level][*iter]);
+        found_overlapping_file = true;
+        // record the first file index.
+        if (file_index && *file_index == -1) {
+          *file_index = static_cast<int>(*iter);
+        }
+        // this file overlaps; erase it to avoid checking it again.
+        iter = index.erase(iter);
+        if (expand_range) {
+          if (begin != nullptr &&
+              user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) {
+            user_begin = file_start;
+          }
+          if (end != nullptr &&
+              user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) {
+            user_end = file_limit;
+          }
+        }
+      }
+    }
+    // if none of the remaining files overlap, stop scanning
+    if (!found_overlapping_file) {
+      break;
+    }
+  }
+}
+
+// Store in "*inputs" the files in "level" that are within the range
+// [begin,end], guaranteeing a "clean cut" boundary between the files in
+// inputs and the surrounding files, with the maximum number of files.
+// This will ensure that no parts of a key are lost during compaction.
+// If hint_index is specified, then it points to a file in the range.
+// The file_index returns a pointer to any file in an overlapping range.
+void VersionStorageInfo::GetCleanInputsWithinInterval(
+    int level, const InternalKey* begin, const InternalKey* end,
+    std::vector<FileMetaData*>* inputs, int hint_index,
+    int* file_index) const {
+  inputs->clear();
+  if (file_index) {
+    *file_index = -1;
+  }
+  if (level >= num_non_empty_levels_ || level == 0 ||
+      level_files_brief_[level].num_files == 0) {
+    // this level is empty, no inputs within range
+    // also don't support clean input interval within L0
+    return;
+  }
+
+  GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index,
+                                        file_index,
+                                        true /* within_interval */);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end].
+// Employ binary search to find at least one file that overlaps the
+// specified range. From that file, iterate backwards and
+// forwards to find all overlapping files.
+// If within_interval is set, then only store the maximum clean inputs
+// within the range [begin, end]. "Clean" means there is a boundary
+// between the files in "*inputs" and the surrounding files.
+void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
+    int level, const InternalKey* begin, const InternalKey* end,
+    std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+    bool within_interval, InternalKey** next_smallest) const {
+  assert(level > 0);
+
+  auto user_cmp = user_comparator_;
+  const FdWithKeyRange* files = level_files_brief_[level].files;
+  const int num_files = static_cast<int>(level_files_brief_[level].num_files);
+
+  // Use binary search to find the lower bound and the upper bound.
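+  // (Illustrative: if a user key is split across two adjacent files, i.e.
+  // files[i - 1].largest equals files[i].smallest, the within_interval
+  // adjustments below step past the pair so that no user key is cut in half
+  // at the "clean cut" boundary.)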
+  int start_index = 0;
+  int end_index = num_files;
+
+  if (begin != nullptr) {
+    // If within_interval is true, use the file's smallest key as file_key so
+    // that std::lower_bound lands on non-overlapping ranges.
+    auto cmp = [&user_cmp, &within_interval](const FdWithKeyRange& f,
+                                             const InternalKey* k) {
+      auto& file_key = within_interval ? f.file_metadata->smallest
+                                       : f.file_metadata->largest;
+      return sstableKeyCompare(user_cmp, file_key, *k) < 0;
+    };
+
+    start_index = static_cast<int>(
+        std::lower_bound(files,
+                         files + (hint_index == -1 ? num_files : hint_index),
+                         begin, cmp) -
+        files);
+
+    if (start_index > 0 && within_interval) {
+      bool is_overlapping = true;
+      while (is_overlapping && start_index < num_files) {
+        auto& pre_limit = files[start_index - 1].file_metadata->largest;
+        auto& cur_start = files[start_index].file_metadata->smallest;
+        is_overlapping = sstableKeyCompare(user_cmp, pre_limit, cur_start) == 0;
+        start_index += is_overlapping;
+      }
+    }
+  }
+
+  if (end != nullptr) {
+    // If within_interval is true, use the file's largest key as file_key so
+    // that std::upper_bound lands on non-overlapping ranges.
+    auto cmp = [&user_cmp, &within_interval](const InternalKey* k,
+                                             const FdWithKeyRange& f) {
+      auto& file_key = within_interval ? f.file_metadata->largest
+                                       : f.file_metadata->smallest;
+      return sstableKeyCompare(user_cmp, *k, file_key) < 0;
+    };
+
+    end_index = static_cast<int>(
+        std::upper_bound(files + start_index, files + num_files, end, cmp) -
+        files);
+
+    if (end_index < num_files && within_interval) {
+      bool is_overlapping = true;
+      while (is_overlapping && end_index > start_index) {
+        auto& next_start = files[end_index].file_metadata->smallest;
+        auto& cur_limit = files[end_index - 1].file_metadata->largest;
+        is_overlapping =
+            sstableKeyCompare(user_cmp, cur_limit, next_start) == 0;
+        end_index -= is_overlapping;
+      }
+    }
+  }
+
+  assert(start_index <= end_index);
+
+  // If there were no overlapping files, return immediately.
+ if (start_index == end_index) { + if (next_smallest) { + *next_smallest = nullptr; + } + return; + } + + assert(start_index < end_index); + + // returns the index where an overlap is found + if (file_index) { + *file_index = start_index; + } + + // insert overlapping files into vector + for (int i = start_index; i < end_index; i++) { + inputs->push_back(files_[level][i]); + } + + if (next_smallest != nullptr) { + // Provide the next key outside the range covered by inputs + if (end_index < static_cast(files_[level].size())) { + **next_smallest = files_[level][end_index]->smallest; + } else { + *next_smallest = nullptr; + } + } +} + +uint64_t VersionStorageInfo::NumLevelBytes(int level) const { + assert(level >= 0); + assert(level < num_levels()); + return TotalFileSize(files_[level]); +} + +const char* VersionStorageInfo::LevelSummary( + LevelSummaryStorage* scratch) const { + int len = 0; + if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) { + assert(base_level_ < static_cast(level_max_bytes_.size())); + if (level_multiplier_ != 0.0) { + len = snprintf( + scratch->buffer, sizeof(scratch->buffer), + "base level %d level multiplier %.2f max bytes base %" PRIu64 " ", + base_level_, level_multiplier_, level_max_bytes_[base_level_]); + } + } + len += + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files["); + for (int i = 0; i < num_levels(); i++) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); + if (ret < 0 || ret >= sz) break; + len += ret; + } + if (len > 0) { + // overwrite the last space + --len; + } + len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + "] max score %.2f", compaction_score_[0]); + + if (!files_marked_for_compaction_.empty()) { + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + " (%" ROCKSDB_PRIszt " files need compaction)", + files_marked_for_compaction_.size()); + } + + return scratch->buffer; +} + +const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch, + int level) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); + for (const auto& f : files_[level]) { + int sz = sizeof(scratch->buffer) - len; + char sztxt[16]; + AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt)); + int ret = snprintf(scratch->buffer + len, sz, + "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ", + f->fd.GetNumber(), f->fd.smallest_seqno, sztxt, + static_cast(f->being_compacted)); + if (ret < 0 || ret >= sz) break; + len += ret; + } + // overwrite the last space (only if files_[level].size() is non-zero) + if (files_[level].size() && len > 0) { + --len; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +bool VersionStorageInfo::HasMissingEpochNumber() const { + for (int level = 0; level < num_levels_; ++level) { + for (const FileMetaData* f : files_[level]) { + if (f->epoch_number == kUnknownEpochNumber) { + return true; + } + } + } + return false; +} + +uint64_t VersionStorageInfo::GetMaxEpochNumberOfFiles() const { + uint64_t max_epoch_number = kUnknownEpochNumber; + for (int level = 0; level < num_levels_; ++level) { + for (const FileMetaData* f : files_[level]) { + max_epoch_number = std::max(max_epoch_number, f->epoch_number); + } + } + return max_epoch_number; +} + +void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd) { + cfd->ResetNextEpochNumber(); + + bool reserve_epoch_num_for_file_ingested_behind = + 
cfd->ioptions()->allow_ingest_behind; + if (reserve_epoch_num_for_file_ingested_behind) { + uint64_t reserved_epoch_number = cfd->NewEpochNumber(); + assert(reserved_epoch_number == kReservedEpochNumberForFileIngestedBehind); + ROCKS_LOG_INFO(cfd->ioptions()->info_log.get(), + "[%s]CF has reserved epoch number %" PRIu64 + " for files ingested " + "behind since `Options::allow_ingest_behind` is true", + cfd->GetName().c_str(), reserved_epoch_number); + } + + if (HasMissingEpochNumber()) { + assert(epoch_number_requirement_ == EpochNumberRequirement::kMightMissing); + assert(num_levels_ >= 1); + + for (int level = num_levels_ - 1; level >= 1; --level) { + auto& files_at_level = files_[level]; + if (files_at_level.empty()) { + continue; + } + uint64_t next_epoch_number = cfd->NewEpochNumber(); + for (FileMetaData* f : files_at_level) { + f->epoch_number = next_epoch_number; + } + } + + for (auto file_meta_iter = files_[0].rbegin(); + file_meta_iter != files_[0].rend(); file_meta_iter++) { + FileMetaData* f = *file_meta_iter; + f->epoch_number = cfd->NewEpochNumber(); + } + + ROCKS_LOG_WARN(cfd->ioptions()->info_log.get(), + "[%s]CF's epoch numbers are inferred based on seqno", + cfd->GetName().c_str()); + epoch_number_requirement_ = EpochNumberRequirement::kMustPresent; + } else { + assert(epoch_number_requirement_ == EpochNumberRequirement::kMustPresent); + cfd->SetNextEpochNumber( + std::max(GetMaxEpochNumberOfFiles() + 1, cfd->GetNextEpochNumber())); + } +} + +uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { + uint64_t result = 0; + std::vector overlaps; + for (int level = 1; level < num_levels() - 1; level++) { + for (const auto& f : files_[level]) { + GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > result) { + result = sum; + } + } + } + return result; +} + +uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const { + // Note: the result for level zero is not really used since we set + // the level-0 compaction threshold based on number of files. + assert(level >= 0); + assert(level < static_cast(level_max_bytes_.size())); + return level_max_bytes_[level]; +} + +void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, + const MutableCFOptions& options) { + // Special logic to set number of sorted runs. + // It is to match the previous behavior when all files are in L0. + int num_l0_count = static_cast(files_[0].size()); + if (compaction_style_ == kCompactionStyleUniversal) { + // For universal compaction, we use level0 score to indicate + // compaction score for the whole DB. Adding other levels as if + // they are L0 files. + for (int i = 1; i < num_levels(); i++) { + if (!files_[i].empty()) { + num_l0_count++; + } + } + } + set_l0_delay_trigger_count(num_l0_count); + + level_max_bytes_.resize(ioptions.num_levels); + if (!ioptions.level_compaction_dynamic_level_bytes) { + base_level_ = (ioptions.compaction_style == kCompactionStyleLevel) ? 
1 : -1; + + // Calculate for static bytes base case + for (int i = 0; i < ioptions.num_levels; ++i) { + if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) { + level_max_bytes_[i] = options.max_bytes_for_level_base; + } else if (i > 1) { + level_max_bytes_[i] = MultiplyCheckOverflow( + MultiplyCheckOverflow(level_max_bytes_[i - 1], + options.max_bytes_for_level_multiplier), + options.MaxBytesMultiplerAdditional(i - 1)); + } else { + level_max_bytes_[i] = options.max_bytes_for_level_base; + } + } + } else { + assert(ioptions.compaction_style == kCompactionStyleLevel); + uint64_t max_level_size = 0; + + int first_non_empty_level = -1; + // Find size of non-L0 level of most data. + // Cannot use the size of the last level because it can be empty or less + // than previous levels after compaction. + for (int i = 1; i < num_levels_; i++) { + uint64_t total_size = 0; + for (const auto& f : files_[i]) { + total_size += f->fd.GetFileSize(); + } + if (total_size > 0 && first_non_empty_level == -1) { + first_non_empty_level = i; + } + if (total_size > max_level_size) { + max_level_size = total_size; + } + } + + // Prefill every level's max bytes to disallow compaction from there. + for (int i = 0; i < num_levels_; i++) { + level_max_bytes_[i] = std::numeric_limits::max(); + } + + lowest_unnecessary_level_ = -1; + if (max_level_size == 0) { + // No data for L1 and up. L0 compacts to last level directly. + // No compaction from L1+ needs to be scheduled. + base_level_ = num_levels_ - 1; + } else { + assert(first_non_empty_level >= 1); + uint64_t base_bytes_max = options.max_bytes_for_level_base; + uint64_t base_bytes_min = static_cast( + base_bytes_max / options.max_bytes_for_level_multiplier); + + // Try whether we can make last level's target size to be max_level_size + uint64_t cur_level_size = max_level_size; + for (int i = num_levels_ - 2; i >= first_non_empty_level; i--) { + // Round up after dividing + cur_level_size = static_cast( + cur_level_size / options.max_bytes_for_level_multiplier); + if (lowest_unnecessary_level_ == -1 && + cur_level_size <= base_bytes_min && + (ioptions.preclude_last_level_data_seconds == 0 || + i < num_levels_ - 2)) { + // When per_key_placement is enabled, the penultimate level is + // necessary. + lowest_unnecessary_level_ = i; + } + } + + // Calculate base level and its size. + uint64_t base_level_size; + if (cur_level_size <= base_bytes_min) { + // If per_key_placement is not enabled, + // either there is only one non-empty level after level 0, + // which can less than base_bytes_min AND necessary, + // or there is some unnecessary level. + assert(first_non_empty_level == num_levels_ - 1 || + ioptions.preclude_last_level_data_seconds > 0 || + lowest_unnecessary_level_ != -1); + // Case 1. If we make target size of last level to be max_level_size, + // target size of the first non-empty level would be smaller than + // base_bytes_min. We set it be base_bytes_min. + base_level_size = base_bytes_min + 1U; + base_level_ = first_non_empty_level; + if (base_level_ < num_levels_ - 1) { + ROCKS_LOG_INFO( + ioptions.logger, + "More existing levels in DB than needed: all non-zero " + "levels <= level %d are unnecessary. " + "max_bytes_for_level_multiplier may not be guaranteed.", + lowest_unnecessary_level_); + } + } else { + assert(lowest_unnecessary_level_ == -1); + // Find base level (where L0 data is compacted to). 
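+        // (Illustrative, hypothetical sizes: with cur_level_size = 5GB at
+        // first_non_empty_level = 4, a multiplier of 10, and base_bytes_max =
+        // 256MB, the loop below walks 5GB -> 500MB -> 50MB and settles on
+        // base_level_ = 2 with a base level size of 50MB.)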
+ base_level_ = first_non_empty_level; + while (base_level_ > 1 && cur_level_size > base_bytes_max) { + --base_level_; + cur_level_size = static_cast( + cur_level_size / options.max_bytes_for_level_multiplier); + } + if (cur_level_size > base_bytes_max) { + // Even L1 will be too large + assert(base_level_ == 1); + base_level_size = base_bytes_max; + } else { + base_level_size = cur_level_size; + } + } + + level_multiplier_ = options.max_bytes_for_level_multiplier; + assert(base_level_size > 0); + + uint64_t level_size = base_level_size; + for (int i = base_level_; i < num_levels_; i++) { + if (i > base_level_) { + level_size = MultiplyCheckOverflow(level_size, level_multiplier_); + } + // Don't set any level below base_bytes_max. Otherwise, the LSM can + // assume an hourglass shape where L1+ sizes are smaller than L0. This + // causes compaction scoring, which depends on level sizes, to favor L1+ + // at the expense of L0, which may fill up and stall. + level_max_bytes_[i] = std::max(level_size, base_bytes_max); + } + } + } +} + +uint64_t VersionStorageInfo::EstimateLiveDataSize() const { + // Estimate the live data size by adding up the size of a maximal set of + // sst files with no range overlap in same or higher level. The less + // compacted, the more optimistic (smaller) this estimate is. Also, + // for multiple sorted runs within a level, file order will matter. + uint64_t size = 0; + + auto ikey_lt = [this](InternalKey* x, InternalKey* y) { + return internal_comparator_->Compare(*x, *y) < 0; + }; + // (Ordered) map of largest keys in files being included in size estimate + std::map ranges(ikey_lt); + + for (int l = num_levels_ - 1; l >= 0; l--) { + bool found_end = false; + for (auto file : files_[l]) { + // Find the first file already included with largest key is larger than + // the smallest key of `file`. If that file does not overlap with the + // current file, none of the files in the map does. If there is + // no potential overlap, we can safely insert the rest of this level + // (if the level is not 0) into the map without checking again because + // the elements in the level are sorted and non-overlapping. + auto lb = (found_end && l != 0) ? ranges.end() + : ranges.lower_bound(&file->smallest); + found_end = (lb == ranges.end()); + if (found_end || internal_comparator_->Compare( + file->largest, (*lb).second->smallest) < 0) { + ranges.emplace_hint(lb, &file->largest, file); + size += file->fd.file_size; + } + } + } + + // For BlobDB, the result also includes the exact value of live bytes in the + // blob files of the version. + for (const auto& meta : blob_files_) { + assert(meta); + + size += meta->GetTotalBlobBytes(); + size -= meta->GetGarbageBlobBytes(); + } + + return size; +} + +bool VersionStorageInfo::RangeMightExistAfterSortedRun( + const Slice& smallest_user_key, const Slice& largest_user_key, + int last_level, int last_l0_idx) { + assert((last_l0_idx != -1) == (last_level == 0)); + // TODO(ajkr): this preserves earlier behavior where we considered an L0 file + // bottommost only if it's the oldest L0 file and there are no files on older + // levels. It'd be better to consider it bottommost if there's no overlap in + // older levels/files. + if (last_level == 0 && + last_l0_idx != static_cast(LevelFiles(0).size() - 1)) { + return true; + } + + // Checks whether there are files living beyond the `last_level`. If lower + // levels have files, it checks for overlap between [`smallest_key`, + // `largest_key`] and those files. 
Bottomlevel optimizations can be made if + // there are no files in lower levels or if there is no overlap with the files + // in the lower levels. + for (int level = last_level + 1; level < num_levels(); level++) { + // The range is not in the bottommost level if there are files in lower + // levels when the `last_level` is 0 or if there are files in lower levels + // which overlap with [`smallest_key`, `largest_key`]. + if (files_[level].size() > 0 && + (last_level == 0 || + OverlapInLevel(level, &smallest_user_key, &largest_user_key))) { + return true; + } + } + return false; +} + +void Version::AddLiveFiles(std::vector* live_table_files, + std::vector* live_blob_files) const { + assert(live_table_files); + assert(live_blob_files); + + for (int level = 0; level < storage_info_.num_levels(); ++level) { + const auto& level_files = storage_info_.LevelFiles(level); + for (const auto& meta : level_files) { + assert(meta); + + live_table_files->emplace_back(meta->fd.GetNumber()); + } + } + + const auto& blob_files = storage_info_.GetBlobFiles(); + for (const auto& meta : blob_files) { + assert(meta); + + live_blob_files->emplace_back(meta->GetBlobFileNumber()); + } +} + +void Version::RemoveLiveFiles( + std::vector& sst_delete_candidates, + std::vector& blob_delete_candidates) const { + for (ObsoleteFileInfo& fi : sst_delete_candidates) { + if (!fi.only_delete_metadata && + storage_info()->GetFileLocation(fi.metadata->fd.GetNumber()) != + VersionStorageInfo::FileLocation::Invalid()) { + fi.only_delete_metadata = true; + } + } + + blob_delete_candidates.erase( + std::remove_if( + blob_delete_candidates.begin(), blob_delete_candidates.end(), + [this](ObsoleteBlobFileInfo& x) { + return storage_info()->GetBlobFileMetaData(x.GetBlobFileNumber()); + }), + blob_delete_candidates.end()); +} + +std::string Version::DebugString(bool hex, bool print_stats) const { + std::string r; + for (int level = 0; level < storage_info_.num_levels_; level++) { + // E.g., + // --- level 1 --- + // 17:123[1 .. 124]['a' .. 'd'] + // 20:43[124 .. 128]['e' .. 'g'] + // + // if print_stats=true: + // 17:123[1 .. 124]['a' .. 'd'](4096) + r.append("--- level "); + AppendNumberTo(&r, level); + r.append(" --- version# "); + AppendNumberTo(&r, version_number_); + if (storage_info_.compact_cursor_[level].Valid()) { + r.append(" --- compact_cursor: "); + r.append(storage_info_.compact_cursor_[level].DebugString(hex)); + } + r.append(" ---\n"); + const std::vector& files = storage_info_.files_[level]; + for (size_t i = 0; i < files.size(); i++) { + r.push_back(' '); + AppendNumberTo(&r, files[i]->fd.GetNumber()); + r.push_back(':'); + AppendNumberTo(&r, files[i]->fd.GetFileSize()); + r.append("["); + AppendNumberTo(&r, files[i]->fd.smallest_seqno); + r.append(" .. "); + AppendNumberTo(&r, files[i]->fd.largest_seqno); + r.append("]"); + r.append("["); + r.append(files[i]->smallest.DebugString(hex)); + r.append(" .. 
"); + r.append(files[i]->largest.DebugString(hex)); + r.append("]"); + if (files[i]->oldest_blob_file_number != kInvalidBlobFileNumber) { + r.append(" blob_file:"); + AppendNumberTo(&r, files[i]->oldest_blob_file_number); + } + if (print_stats) { + r.append("("); + r.append(std::to_string( + files[i]->stats.num_reads_sampled.load(std::memory_order_relaxed))); + r.append(")"); + } + r.append("\n"); + } + } + + const auto& blob_files = storage_info_.GetBlobFiles(); + if (!blob_files.empty()) { + r.append("--- blob files --- version# "); + AppendNumberTo(&r, version_number_); + r.append(" ---\n"); + for (const auto& blob_file_meta : blob_files) { + assert(blob_file_meta); + + r.append(blob_file_meta->DebugString()); + r.push_back('\n'); + } + } + + return r; +} + +// this is used to batch writes to the manifest file +struct VersionSet::ManifestWriter { + Status status; + bool done; + InstrumentedCondVar cv; + ColumnFamilyData* cfd; + const MutableCFOptions mutable_cf_options; + const autovector& edit_list; + const std::function manifest_write_callback; + + explicit ManifestWriter( + InstrumentedMutex* mu, ColumnFamilyData* _cfd, + const MutableCFOptions& cf_options, const autovector& e, + const std::function& manifest_wcb) + : done(false), + cv(mu), + cfd(_cfd), + mutable_cf_options(cf_options), + edit_list(e), + manifest_write_callback(manifest_wcb) {} + ~ManifestWriter() { status.PermitUncheckedError(); } + + bool IsAllWalEdits() const { + bool all_wal_edits = true; + for (const auto& e : edit_list) { + if (!e->IsWalManipulation()) { + all_wal_edits = false; + break; + } + } + return all_wal_edits; + } +}; + +Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) { + assert(edit); + if (edit->is_in_atomic_group_) { + TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup"); + if (replay_buffer_.empty()) { + replay_buffer_.resize(edit->remaining_entries_ + 1); + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit); + } + read_edits_in_atomic_group_++; + if (read_edits_in_atomic_group_ + edit->remaining_entries_ != + static_cast(replay_buffer_.size())) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit); + return Status::Corruption("corrupted atomic group"); + } + replay_buffer_[read_edits_in_atomic_group_ - 1] = *edit; + if (read_edits_in_atomic_group_ == replay_buffer_.size()) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit); + return Status::OK(); + } + return Status::OK(); + } + + // A normal edit. 
+ if (!replay_buffer().empty()) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit); + return Status::Corruption("corrupted atomic group"); + } + return Status::OK(); +} + +bool AtomicGroupReadBuffer::IsFull() const { + return read_edits_in_atomic_group_ == replay_buffer_.size(); +} + +bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); } + +void AtomicGroupReadBuffer::Clear() { + read_edits_in_atomic_group_ = 0; + replay_buffer_.clear(); +} + +VersionSet::VersionSet(const std::string& dbname, + const ImmutableDBOptions* _db_options, + const FileOptions& storage_options, Cache* table_cache, + WriteBufferManager* write_buffer_manager, + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_id, + const std::string& db_session_id) + : column_family_set_(new ColumnFamilySet( + dbname, _db_options, storage_options, table_cache, + write_buffer_manager, write_controller, block_cache_tracer, io_tracer, + db_id, db_session_id)), + table_cache_(table_cache), + env_(_db_options->env), + fs_(_db_options->fs, io_tracer), + clock_(_db_options->clock), + dbname_(dbname), + db_options_(_db_options), + next_file_number_(2), + manifest_file_number_(0), // Filled by Recover() + options_file_number_(0), + options_file_size_(0), + pending_manifest_file_number_(0), + last_sequence_(0), + last_allocated_sequence_(0), + last_published_sequence_(0), + prev_log_number_(0), + current_version_number_(0), + manifest_file_size_(0), + file_options_(storage_options), + block_cache_tracer_(block_cache_tracer), + io_tracer_(io_tracer), + db_session_id_(db_session_id) {} + +VersionSet::~VersionSet() { + // we need to delete column_family_set_ because its destructor depends on + // VersionSet + column_family_set_.reset(); + for (auto& file : obsolete_files_) { + if (file.metadata->table_reader_handle) { + table_cache_->Release(file.metadata->table_reader_handle); + TableCache::Evict(table_cache_, file.metadata->fd.GetNumber()); + } + file.DeleteMetadata(); + } + obsolete_files_.clear(); + io_status_.PermitUncheckedError(); +} + +void VersionSet::Reset() { + if (column_family_set_) { + WriteBufferManager* wbm = column_family_set_->write_buffer_manager(); + WriteController* wc = column_family_set_->write_controller(); + // db_id becomes the source of truth after DBImpl::Recover(): + // https://github.com/facebook/rocksdb/blob/v7.3.1/db/db_impl/db_impl_open.cc#L527 + // Note: we may not be able to recover db_id from MANIFEST if + // options.write_dbid_to_manifest is false (default). 
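+    // Rebuild the ColumnFamilySet in place, reusing the write buffer manager
+    // and write controller captured above.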
+    column_family_set_.reset(new ColumnFamilySet(
+        dbname_, db_options_, file_options_, table_cache_, wbm, wc,
+        block_cache_tracer_, io_tracer_, db_id_, db_session_id_));
+  }
+  db_id_.clear();
+  next_file_number_.store(2);
+  min_log_number_to_keep_.store(0);
+  manifest_file_number_ = 0;
+  options_file_number_ = 0;
+  pending_manifest_file_number_ = 0;
+  last_sequence_.store(0);
+  last_allocated_sequence_.store(0);
+  last_published_sequence_.store(0);
+  prev_log_number_ = 0;
+  descriptor_log_.reset();
+  current_version_number_ = 0;
+  manifest_writers_.clear();
+  manifest_file_size_ = 0;
+  obsolete_files_.clear();
+  obsolete_manifests_.clear();
+  wals_.Reset();
+}
+
+void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
+                               Version* v) {
+  // compute new compaction score
+  v->storage_info()->ComputeCompactionScore(
+      *column_family_data->ioptions(),
+      *column_family_data->GetLatestMutableCFOptions());
+
+  // Mark v finalized
+  v->storage_info_.SetFinalized();
+
+  // Make "v" current
+  assert(v->refs_ == 0);
+  Version* current = column_family_data->current();
+  assert(v != current);
+  if (current != nullptr) {
+    assert(current->refs_ > 0);
+    current->Unref();
+  }
+  column_family_data->SetCurrent(v);
+  v->Ref();
+
+  // Append to linked list
+  v->prev_ = column_family_data->dummy_versions()->prev_;
+  v->next_ = column_family_data->dummy_versions();
+  v->prev_->next_ = v;
+  v->next_->prev_ = v;
+}
+
+Status VersionSet::ProcessManifestWrites(
+    std::deque<ManifestWriter>& writers, InstrumentedMutex* mu,
+    FSDirectory* dir_contains_current_file, bool new_descriptor_log,
+    const ColumnFamilyOptions* new_cf_options,
+    const ReadOptions& read_options) {
+  mu->AssertHeld();
+  assert(!writers.empty());
+  ManifestWriter& first_writer = writers.front();
+  ManifestWriter* last_writer = &first_writer;
+
+  assert(!manifest_writers_.empty());
+  assert(manifest_writers_.front() == &first_writer);
+
+  autovector<VersionEdit*> batch_edits;
+  autovector<Version*> versions;
+  autovector<const MutableCFOptions*> mutable_cf_options_ptrs;
+  std::vector<std::unique_ptr<BaseReferencedVersionBuilder>> builder_guards;
+
+  // Tracking `max_last_sequence` is needed to ensure we write
+  // `VersionEdit::last_sequence_`s in non-decreasing order according to the
+  // recovery code's requirement. It also allows us to defer updating
+  // `descriptor_last_sequence_` until the apply phase, after the log phase
+  // succeeds.
+  SequenceNumber max_last_sequence = descriptor_last_sequence_;
+
+  if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+    // No group commits for column family add or drop
+    LogAndApplyCFHelper(first_writer.edit_list.front(), &max_last_sequence);
+    batch_edits.push_back(first_writer.edit_list.front());
+  } else {
+    auto it = manifest_writers_.cbegin();
+    size_t group_start = std::numeric_limits<size_t>::max();
+    while (it != manifest_writers_.cend()) {
+      if ((*it)->edit_list.front()->IsColumnFamilyManipulation()) {
+        // no group commits for column family add or drop
+        break;
+      }
+      last_writer = *(it++);
+      assert(last_writer != nullptr);
+      assert(last_writer->cfd != nullptr);
+      if (last_writer->cfd->IsDropped()) {
+        // If we detect a dropped CF at this point, and the corresponding
+        // version edits belong to an atomic group, then we need to find out
+        // the preceding version edits in the same atomic group, and update
+        // their `remaining_entries_` member variable because we are NOT going
+        // to write the version edits of the dropped CF to the MANIFEST. If we
+        // don't update them, then Recover can report a corrupted atomic group
+        // because the `remaining_entries_` do not match.
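+        // (Illustrative: for an atomic group of four edits with
+        // remaining_entries_ = 3, 2, 1, 0 where the dropped CF contributed
+        // the last two, k below becomes 2 and the surviving edits are
+        // rewritten to 1, 0, so recovery still sees a complete group.)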
+        if (!batch_edits.empty()) {
+          if (batch_edits.back()->is_in_atomic_group_ &&
+              batch_edits.back()->remaining_entries_ > 0) {
+            assert(group_start < batch_edits.size());
+            const auto& edit_list = last_writer->edit_list;
+            size_t k = 0;
+            while (k < edit_list.size()) {
+              if (!edit_list[k]->is_in_atomic_group_) {
+                break;
+              } else if (edit_list[k]->remaining_entries_ == 0) {
+                ++k;
+                break;
+              }
+              ++k;
+            }
+            for (auto i = group_start; i < batch_edits.size(); ++i) {
+              assert(static_cast<uint32_t>(k) <=
+                     batch_edits.back()->remaining_entries_);
+              batch_edits[i]->remaining_entries_ -= static_cast<uint32_t>(k);
+            }
+          }
+        }
+        continue;
+      }
+      // We do a linear search on versions because versions is small.
+      // TODO(yanqin) maybe consider unordered_map
+      Version* version = nullptr;
+      VersionBuilder* builder = nullptr;
+      for (int i = 0; i != static_cast<int>(versions.size()); ++i) {
+        uint32_t cf_id = last_writer->cfd->GetID();
+        if (versions[i]->cfd()->GetID() == cf_id) {
+          version = versions[i];
+          assert(!builder_guards.empty() &&
+                 builder_guards.size() == versions.size());
+          builder = builder_guards[i]->version_builder();
+          TEST_SYNC_POINT_CALLBACK(
+              "VersionSet::ProcessManifestWrites:SameColumnFamily", &cf_id);
+          break;
+        }
+      }
+      if (version == nullptr) {
+        // WAL manipulations do not need to be applied to versions.
+        if (!last_writer->IsAllWalEdits()) {
+          version = new Version(last_writer->cfd, this, file_options_,
+                                last_writer->mutable_cf_options, io_tracer_,
+                                current_version_number_++);
+          versions.push_back(version);
+          mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options);
+          builder_guards.emplace_back(
+              new BaseReferencedVersionBuilder(last_writer->cfd));
+          builder = builder_guards.back()->version_builder();
+        }
+        assert(last_writer->IsAllWalEdits() || builder);
+        assert(last_writer->IsAllWalEdits() || version);
+        TEST_SYNC_POINT_CALLBACK("VersionSet::ProcessManifestWrites:NewVersion",
+                                 version);
+      }
+      for (const auto& e : last_writer->edit_list) {
+        if (e->is_in_atomic_group_) {
+          if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ ||
+              (batch_edits.back()->is_in_atomic_group_ &&
+               batch_edits.back()->remaining_entries_ == 0)) {
+            group_start = batch_edits.size();
+          }
+        } else if (group_start != std::numeric_limits<size_t>::max()) {
+          group_start = std::numeric_limits<size_t>::max();
+        }
+        Status s = LogAndApplyHelper(last_writer->cfd, builder, e,
+                                     &max_last_sequence, mu);
+        if (!s.ok()) {
+          // free up the allocated memory
+          for (auto v : versions) {
+            delete v;
+          }
+          return s;
+        }
+        batch_edits.push_back(e);
+      }
+    }
+    for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+      assert(!builder_guards.empty() &&
+             builder_guards.size() == versions.size());
+      auto* builder = builder_guards[i]->version_builder();
+      Status s = builder->SaveTo(versions[i]->storage_info());
+      if (!s.ok()) {
+        // free up the allocated memory
+        for (auto v : versions) {
+          delete v;
+        }
+        return s;
+      }
+    }
+  }
+
+#ifndef NDEBUG
+  // Verify that version edits of atomic groups have correct
+  // remaining_entries_.
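+  // The walk below checks each group [k, i): every member j must satisfy
+  // remaining_entries_[j] == remaining_entries_[k] - (j - k), and the last
+  // member must reach exactly 0.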
+  size_t k = 0;
+  while (k < batch_edits.size()) {
+    while (k < batch_edits.size() && !batch_edits[k]->is_in_atomic_group_) {
+      ++k;
+    }
+    if (k == batch_edits.size()) {
+      break;
+    }
+    size_t i = k;
+    while (i < batch_edits.size()) {
+      if (!batch_edits[i]->is_in_atomic_group_) {
+        break;
+      }
+      assert(i - k + batch_edits[i]->remaining_entries_ ==
+             batch_edits[k]->remaining_entries_);
+      if (batch_edits[i]->remaining_entries_ == 0) {
+        ++i;
+        break;
+      }
+      ++i;
+    }
+    assert(batch_edits[i - 1]->is_in_atomic_group_);
+    assert(0 == batch_edits[i - 1]->remaining_entries_);
+    std::vector<VersionEdit*> tmp;
+    for (size_t j = k; j != i; ++j) {
+      tmp.emplace_back(batch_edits[j]);
+    }
+    TEST_SYNC_POINT_CALLBACK(
+        "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", &tmp);
+    k = i;
+  }
+#endif  // NDEBUG
+
+  assert(pending_manifest_file_number_ == 0);
+  if (!descriptor_log_ ||
+      manifest_file_size_ > db_options_->max_manifest_file_size) {
+    TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
+    new_descriptor_log = true;
+  } else {
+    pending_manifest_file_number_ = manifest_file_number_;
+  }
+
+  // Local cached copy of state variable(s). WriteCurrentStateToManifest()
+  // reads its content after releasing db mutex to avoid race with
+  // SwitchMemtable().
+  std::unordered_map<uint32_t, MutableCFState> curr_state;
+  VersionEdit wal_additions;
+  if (new_descriptor_log) {
+    pending_manifest_file_number_ = NewFileNumber();
+    batch_edits.back()->SetNextFile(next_file_number_.load());
+
+    // if we are writing out new snapshot make sure to persist max column
+    // family.
+    if (column_family_set_->GetMaxColumnFamily() > 0) {
+      first_writer.edit_list.front()->SetMaxColumnFamily(
+          column_family_set_->GetMaxColumnFamily());
+    }
+    for (const auto* cfd : *column_family_set_) {
+      assert(curr_state.find(cfd->GetID()) == curr_state.end());
+      curr_state.emplace(std::make_pair(
+          cfd->GetID(),
+          MutableCFState(cfd->GetLogNumber(), cfd->GetFullHistoryTsLow())));
+    }
+
+    for (const auto& wal : wals_.GetWals()) {
+      wal_additions.AddWal(wal.first, wal.second);
+    }
+  }
+
+  uint64_t new_manifest_file_size = 0;
+  Status s;
+  IOStatus io_s;
+  IOStatus manifest_io_status;
+  {
+    FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
+    mu->Unlock();
+    TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart");
+    TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr);
+    if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+      for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+        assert(!builder_guards.empty() &&
+               builder_guards.size() == versions.size());
+        assert(!mutable_cf_options_ptrs.empty() &&
+               builder_guards.size() == versions.size());
+        ColumnFamilyData* cfd = versions[i]->cfd_;
+        s = builder_guards[i]->version_builder()->LoadTableHandlers(
+            cfd->internal_stats(), 1 /* max_threads */,
+            true /* prefetch_index_and_filter_in_cache */,
+            false /* is_initial_load */,
+            mutable_cf_options_ptrs[i]->prefix_extractor,
+            MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options,
+            mutable_cf_options_ptrs[i]->block_protection_bytes_per_key);
+        if (!s.ok()) {
+          if (db_options_->paranoid_checks) {
+            break;
+          }
+          s = Status::OK();
+        }
+      }
+    }
+
+    if (s.ok() && new_descriptor_log) {
+      // This is fine because everything inside of this block is serialized --
+      // only one thread can be here at the same time
+      // create new manifest file
+      ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
+                     pending_manifest_file_number_);
+      std::string descriptor_fname =
+          DescriptorFileName(dbname_, pending_manifest_file_number_);
+      std::unique_ptr<FSWritableFile> descriptor_file;
+      io_s = NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file,
+                             opt_file_opts);
+      if (io_s.ok()) {
+        descriptor_file->SetPreallocationBlockSize(
+            db_options_->manifest_preallocation_size);
+        FileTypeSet tmp_set = db_options_->checksum_handoff_file_types;
+        std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+            std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_,
+            io_tracer_, nullptr, db_options_->listeners, nullptr,
+            tmp_set.Contains(FileType::kDescriptorFile),
+            tmp_set.Contains(FileType::kDescriptorFile)));
+        descriptor_log_.reset(
+            new log::Writer(std::move(file_writer), 0, false));
+        s = WriteCurrentStateToManifest(curr_state, wal_additions,
+                                        descriptor_log_.get(), io_s);
+      } else {
+        manifest_io_status = io_s;
+        s = io_s;
+      }
+    }
+
+    if (s.ok()) {
+      if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+        constexpr bool update_stats = true;
+
+        for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+          versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], read_options,
+                                     update_stats);
+        }
+      }
+
+      // Write new records to MANIFEST log
+#ifndef NDEBUG
+      size_t idx = 0;
+#endif
+      for (auto& e : batch_edits) {
+        std::string record;
+        if (!e->EncodeTo(&record)) {
+          s = Status::Corruption("Unable to encode VersionEdit:" +
+                                 e->DebugString(true));
+          break;
+        }
+        TEST_KILL_RANDOM_WITH_WEIGHT("VersionSet::LogAndApply:BeforeAddRecord",
+                                     REDUCE_ODDS2);
+#ifndef NDEBUG
+        if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
+          TEST_SYNC_POINT_CALLBACK(
+              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+              nullptr);
+          TEST_SYNC_POINT(
+              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1");
+        }
+        ++idx;
+#endif /* !NDEBUG */
+        io_s = descriptor_log_->AddRecord(record);
+        if (!io_s.ok()) {
+          s = io_s;
+          manifest_io_status = io_s;
+          break;
+        }
+      }
+
+      if (s.ok()) {
+        io_s = SyncManifest(db_options_, descriptor_log_->file());
+        manifest_io_status = io_s;
+        TEST_SYNC_POINT_CALLBACK(
+            "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s);
+      }
+      if (!io_s.ok()) {
+        s = io_s;
+        ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n",
+                        s.ToString().c_str());
+      }
+    }
+
+    // If we just created a new descriptor file, install it by writing a
+    // new CURRENT file that points to it.
+    if (s.ok()) {
+      assert(manifest_io_status.ok());
+    }
+    if (s.ok() && new_descriptor_log) {
+      io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_,
+                            dir_contains_current_file);
+      if (!io_s.ok()) {
+        s = io_s;
+      }
+    }
+
+    if (s.ok()) {
+      // find offset in manifest file where this version is stored.
+      new_manifest_file_size = descriptor_log_->file()->GetFileSize();
+    }
+
+    if (first_writer.edit_list.front()->is_column_family_drop_) {
+      TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0");
+      TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1");
+      TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2");
+    }
+
+    LogFlush(db_options_->info_log);
+    TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestDone");
+    mu->Lock();
+  }
+
+  if (s.ok()) {
+    // Apply WAL edits, DB mutex must be held.
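+    // Note that the in-memory WAL set is only mutated after the MANIFEST
+    // write succeeded; a failed AddRecord above leaves wals_ untouched, so
+    // memory and the manifest cannot disagree about which WALs are tracked.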
+    for (auto& e : batch_edits) {
+      if (e->IsWalAddition()) {
+        s = wals_.AddWals(e->GetWalAdditions());
+      } else if (e->IsWalDeletion()) {
+        s = wals_.DeleteWalsBefore(e->GetWalDeletion().GetLogNumber());
+      }
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+
+  if (!io_s.ok()) {
+    if (io_status_.ok()) {
+      io_status_ = io_s;
+    }
+  } else if (!io_status_.ok()) {
+    io_status_ = io_s;
+  }
+
+  // Append the old manifest file to the obsolete_manifests_ list to be
+  // deleted by PurgeObsoleteFiles later.
+  if (s.ok() && new_descriptor_log) {
+    obsolete_manifests_.emplace_back(
+        DescriptorFileName("", manifest_file_number_));
+  }
+
+  // Install the new versions
+  if (s.ok()) {
+    if (first_writer.edit_list.front()->is_column_family_add_) {
+      assert(batch_edits.size() == 1);
+      assert(new_cf_options != nullptr);
+      assert(max_last_sequence == descriptor_last_sequence_);
+      CreateColumnFamily(*new_cf_options, read_options,
+                         first_writer.edit_list.front());
+    } else if (first_writer.edit_list.front()->is_column_family_drop_) {
+      assert(batch_edits.size() == 1);
+      assert(max_last_sequence == descriptor_last_sequence_);
+      first_writer.cfd->SetDropped();
+      first_writer.cfd->UnrefAndTryDelete();
+    } else {
+      // Each version in versions corresponds to a column family.
+      // For each column family, update its log number indicating that logs
+      // with number smaller than this should be ignored.
+      uint64_t last_min_log_number_to_keep = 0;
+      for (const auto& e : batch_edits) {
+        ColumnFamilyData* cfd = nullptr;
+        if (!e->IsColumnFamilyManipulation()) {
+          cfd = column_family_set_->GetColumnFamily(e->column_family_);
+          // e would not have been added to batch_edits if its corresponding
+          // column family is dropped.
+          assert(cfd);
+        }
+        if (cfd) {
+          if (e->has_log_number_ && e->log_number_ > cfd->GetLogNumber()) {
+            cfd->SetLogNumber(e->log_number_);
+          }
+          if (e->HasFullHistoryTsLow()) {
+            cfd->SetFullHistoryTsLow(e->GetFullHistoryTsLow());
+          }
+        }
+        if (e->has_min_log_number_to_keep_) {
+          last_min_log_number_to_keep =
+              std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_);
+        }
+      }
+
+      if (last_min_log_number_to_keep != 0) {
+        MarkMinLogNumberToKeep(last_min_log_number_to_keep);
+      }
+
+      for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+        ColumnFamilyData* cfd = versions[i]->cfd_;
+        AppendVersion(cfd, versions[i]);
+      }
+    }
+    assert(max_last_sequence >= descriptor_last_sequence_);
+    descriptor_last_sequence_ = max_last_sequence;
+    manifest_file_number_ = pending_manifest_file_number_;
+    manifest_file_size_ = new_manifest_file_size;
+    prev_log_number_ = first_writer.edit_list.front()->prev_log_number_;
+  } else {
+    std::string version_edits;
+    for (auto& e : batch_edits) {
+      version_edits += ("\n" + e->DebugString(true));
+    }
+    ROCKS_LOG_ERROR(db_options_->info_log,
+                    "Error in committing version edit to MANIFEST: %s",
+                    version_edits.c_str());
+    for (auto v : versions) {
+      delete v;
+    }
+    if (manifest_io_status.ok()) {
+      manifest_file_number_ = pending_manifest_file_number_;
+      manifest_file_size_ = new_manifest_file_size;
+    }
+    // If manifest append failed for whatever reason, the file could be
+    // corrupted. So we need to force the next version update to start a
+    // new manifest file.
+    descriptor_log_.reset();
+    // If manifest operations failed, then we know the CURRENT file still
+    // points to the original MANIFEST. Therefore, we can safely delete the
+    // new MANIFEST.
+    // If manifest operations succeeded, and we are here, then it is possible
+    // that renaming tmp file to CURRENT failed.
+    //
+    // On a local POSIX-compliant FS, the CURRENT must point to the original
+    // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also
+    // keep it. Future recovery will ignore this MANIFEST. It's also ok for the
+    // process not to crash and continue using the db. Any future LogAndApply()
+    // call will switch to a new MANIFEST and update CURRENT, still ignoring
+    // this one.
+    //
+    // On a non-local FS, it is
+    // possible that the rename operation succeeded on the server (remote)
+    // side, but the client somehow returns a non-ok status to RocksDB. Note
+    // that this does not violate atomicity. Should we delete the new MANIFEST
+    // successfully, a subsequent recovery attempt will likely see the CURRENT
+    // pointing to the new MANIFEST, thus fail. We will not be able to open the
+    // DB again. Therefore, if manifest operations succeed, we should keep the
+    // new MANIFEST. If the process proceeds, any future LogAndApply() call
+    // will switch to a new MANIFEST and update CURRENT. If the user tries to
+    // re-open the DB,
+    // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present.
+    // b) CURRENT points to the original MANIFEST, and the original MANIFEST
+    //    also exists.
+    if (new_descriptor_log && !manifest_io_status.ok()) {
+      ROCKS_LOG_INFO(db_options_->info_log,
+                     "Deleting manifest %" PRIu64 " current manifest %" PRIu64
+                     "\n",
+                     pending_manifest_file_number_, manifest_file_number_);
+      Status manifest_del_status = env_->DeleteFile(
+          DescriptorFileName(dbname_, pending_manifest_file_number_));
+      if (!manifest_del_status.ok()) {
+        ROCKS_LOG_WARN(db_options_->info_log,
+                       "Failed to delete manifest %" PRIu64 ": %s",
+                       pending_manifest_file_number_,
+                       manifest_del_status.ToString().c_str());
+      }
+    }
+  }
+
+  pending_manifest_file_number_ = 0;
+
+#ifndef NDEBUG
+  // This is here kind of awkwardly because there are no other consistency
+  // checks on `VersionSet`'s updates for the new `Version`s. We might want
+  // to move it to a dedicated function, or remove it if we gain enough
+  // confidence in `descriptor_last_sequence_`.
+  if (s.ok()) {
+    for (const auto* v : versions) {
+      const auto* vstorage = v->storage_info();
+      for (int level = 0; level < vstorage->num_levels(); ++level) {
+        for (const auto& file : vstorage->LevelFiles(level)) {
+          assert(file->fd.largest_seqno <= descriptor_last_sequence_);
+        }
+      }
+    }
+  }
+#endif  // NDEBUG
+
+  // wake up all the waiting writers
+  while (true) {
+    ManifestWriter* ready = manifest_writers_.front();
+    manifest_writers_.pop_front();
+    bool need_signal = true;
+    for (const auto& w : writers) {
+      if (&w == ready) {
+        need_signal = false;
+        break;
+      }
+    }
+    ready->status = s;
+    ready->done = true;
+    if (ready->manifest_write_callback) {
+      (ready->manifest_write_callback)(s);
+    }
+    if (need_signal) {
+      ready->cv.Signal();
+    }
+    if (ready == last_writer) {
+      break;
+    }
+  }
+  if (!manifest_writers_.empty()) {
+    manifest_writers_.front()->cv.Signal();
+  }
+  return s;
+}
+
+void VersionSet::WakeUpWaitingManifestWriters() {
+  // wake up all the waiting writers
+  // Notify new head of manifest write queue.
+  if (!manifest_writers_.empty()) {
+    manifest_writers_.front()->cv.Signal();
+  }
+}
+
+// 'datas' is grammatically incorrect. We still use this notation to indicate
+// that this variable represents a collection of column_family_data.
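+// Group commit sketch: the head of manifest_writers_ becomes the leader and
+// commits the edit lists of every compatible follower in a single MANIFEST
+// write via ProcessManifestWrites(); followers just wait on their condition
+// variable and read the shared status back out of their ManifestWriter.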
+Status VersionSet::LogAndApply(
+    const autovector<ColumnFamilyData*>& column_family_datas,
+    const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+    const ReadOptions& read_options,
+    const autovector<autovector<VersionEdit*>>& edit_lists,
+    InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+    bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options,
+    const std::vector<std::function<void(const Status&)>>& manifest_wcbs) {
+  mu->AssertHeld();
+  int num_edits = 0;
+  for (const auto& elist : edit_lists) {
+    num_edits += static_cast<int>(elist.size());
+  }
+  if (num_edits == 0) {
+    return Status::OK();
+  } else if (num_edits > 1) {
+#ifndef NDEBUG
+    for (const auto& edit_list : edit_lists) {
+      for (const auto& edit : edit_list) {
+        assert(!edit->IsColumnFamilyManipulation());
+      }
+    }
+#endif /* ! NDEBUG */
+  }
+
+  int num_cfds = static_cast<int>(column_family_datas.size());
+  if (num_cfds == 1 && column_family_datas[0] == nullptr) {
+    assert(edit_lists.size() == 1 && edit_lists[0].size() == 1);
+    assert(edit_lists[0][0]->is_column_family_add_);
+    assert(new_cf_options != nullptr);
+  }
+  std::deque<ManifestWriter> writers;
+  if (num_cfds > 0) {
+    assert(static_cast<size_t>(num_cfds) == mutable_cf_options_list.size());
+    assert(static_cast<size_t>(num_cfds) == edit_lists.size());
+  }
+  for (int i = 0; i < num_cfds; ++i) {
+    const auto wcb =
+        manifest_wcbs.empty() ? [](const Status&) {} : manifest_wcbs[i];
+    writers.emplace_back(mu, column_family_datas[i],
+                         *mutable_cf_options_list[i], edit_lists[i], wcb);
+    manifest_writers_.push_back(&writers[i]);
+  }
+  assert(!writers.empty());
+  ManifestWriter& first_writer = writers.front();
+  TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:BeforeWriterWaiting",
+                           nullptr);
+  while (!first_writer.done && &first_writer != manifest_writers_.front()) {
+    first_writer.cv.Wait();
+  }
+  if (first_writer.done) {
+    // All non-CF-manipulation operations can be grouped together and committed
+    // to MANIFEST. They should all have finished. The status code is stored in
+    // the first manifest writer.
+#ifndef NDEBUG
+    for (const auto& writer : writers) {
+      assert(writer.done);
+    }
+    TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WakeUpAndDone", mu);
+#endif /* !NDEBUG */
+    return first_writer.status;
+  }
+
+  int num_undropped_cfds = 0;
+  for (auto cfd : column_family_datas) {
+    // if cfd == nullptr, it is a column family add.
+    if (cfd == nullptr || !cfd->IsDropped()) {
+      ++num_undropped_cfds;
+    }
+  }
+  if (0 == num_undropped_cfds) {
+    for (int i = 0; i != num_cfds; ++i) {
+      manifest_writers_.pop_front();
+    }
+    // Notify new head of manifest write queue.
+ if (!manifest_writers_.empty()) { + manifest_writers_.front()->cv.Signal(); + } + return Status::ColumnFamilyDropped(); + } + return ProcessManifestWrites(writers, mu, dir_contains_current_file, + new_descriptor_log, new_cf_options, + read_options); +} + +void VersionSet::LogAndApplyCFHelper(VersionEdit* edit, + SequenceNumber* max_last_sequence) { + assert(max_last_sequence != nullptr); + assert(edit->IsColumnFamilyManipulation()); + edit->SetNextFile(next_file_number_.load()); + assert(!edit->HasLastSequence()); + edit->SetLastSequence(*max_last_sequence); + if (edit->is_column_family_drop_) { + // if we drop column family, we have to make sure to save max column family, + // so that we don't reuse existing ID + edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily()); + } +} + +Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, + VersionBuilder* builder, VersionEdit* edit, + SequenceNumber* max_last_sequence, + InstrumentedMutex* mu) { +#ifdef NDEBUG + (void)cfd; +#endif + mu->AssertHeld(); + assert(!edit->IsColumnFamilyManipulation()); + assert(max_last_sequence != nullptr); + + if (edit->has_log_number_) { + assert(edit->log_number_ >= cfd->GetLogNumber()); + assert(edit->log_number_ < next_file_number_.load()); + } + + if (!edit->has_prev_log_number_) { + edit->SetPrevLogNumber(prev_log_number_); + } + edit->SetNextFile(next_file_number_.load()); + if (edit->HasLastSequence() && edit->GetLastSequence() > *max_last_sequence) { + *max_last_sequence = edit->GetLastSequence(); + } else { + edit->SetLastSequence(*max_last_sequence); + } + + // The builder can be nullptr only if edit is WAL manipulation, + // because WAL edits do not need to be applied to versions, + // we return Status::OK() in this case. + assert(builder || edit->IsWalManipulation()); + return builder ? 
builder->Apply(edit) : Status::OK(); +} + +Status VersionSet::GetCurrentManifestPath(const std::string& dbname, + FileSystem* fs, + std::string* manifest_path, + uint64_t* manifest_file_number) { + assert(fs != nullptr); + assert(manifest_path != nullptr); + assert(manifest_file_number != nullptr); + + std::string fname; + Status s = ReadFileToString(fs, CurrentFileName(dbname), &fname); + if (!s.ok()) { + return s; + } + if (fname.empty() || fname.back() != '\n') { + return Status::Corruption("CURRENT file does not end with newline"); + } + // remove the trailing '\n' + fname.resize(fname.size() - 1); + FileType type; + bool parse_ok = ParseFileName(fname, manifest_file_number, &type); + if (!parse_ok || type != kDescriptorFile) { + return Status::Corruption("CURRENT file corrupted"); + } + *manifest_path = dbname; + if (dbname.back() != '/') { + manifest_path->push_back('/'); + } + manifest_path->append(fname); + return Status::OK(); +} + +Status VersionSet::Recover( + const std::vector& column_families, bool read_only, + std::string* db_id, bool no_error_if_files_missing) { + const ReadOptions read_options(Env::IOActivity::kDBOpen); + // Read "CURRENT" file, which contains a pointer to the current manifest + // file + std::string manifest_path; + Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path, + &manifest_file_number_); + if (!s.ok()) { + return s; + } + + ROCKS_LOG_INFO(db_options_->info_log, "Recovering from manifest file: %s\n", + manifest_path.c_str()); + + std::unique_ptr manifest_file_reader; + { + std::unique_ptr manifest_file; + s = fs_->NewSequentialFile(manifest_path, + fs_->OptimizeForManifestRead(file_options_), + &manifest_file, nullptr); + if (!s.ok()) { + return s; + } + manifest_file_reader.reset(new SequentialFileReader( + std::move(manifest_file), manifest_path, + db_options_->log_readahead_size, io_tracer_, db_options_->listeners)); + } + uint64_t current_manifest_file_size = 0; + uint64_t log_number = 0; + { + VersionSet::LogReporter reporter; + Status log_read_status; + reporter.status = &log_read_status; + log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, + true /* checksum */, 0 /* log_number */); + VersionEditHandler handler( + read_only, column_families, const_cast(this), + /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_, + read_options, EpochNumberRequirement::kMightMissing); + handler.Iterate(reader, &log_read_status); + s = handler.status(); + if (s.ok()) { + log_number = handler.GetVersionEditParams().log_number_; + current_manifest_file_size = reader.GetReadOffset(); + assert(current_manifest_file_size != 0); + handler.GetDbId(db_id); + } + if (s.ok()) { + RecoverEpochNumbers(); + } + } + + if (s.ok()) { + manifest_file_size_ = current_manifest_file_size; + ROCKS_LOG_INFO( + db_options_->info_log, + "Recovered from manifest file:%s succeeded," + "manifest_file_number is %" PRIu64 ", next_file_number is %" PRIu64 + ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 + ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 + ",min_log_number_to_keep is %" PRIu64 "\n", + manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), + last_sequence_.load(), log_number, prev_log_number_, + column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep()); + + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + ROCKS_LOG_INFO(db_options_->info_log, + "Column family [%s] (ID %" PRIu32 + "), log number is %" PRIu64 "\n", + 
cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
+    }
+  }
+
+  return s;
+}
+
+namespace {
+class ManifestPicker {
+ public:
+  explicit ManifestPicker(const std::string& dbname,
+                          const std::vector<std::string>& files_in_dbname);
+  // REQUIRES Valid() == true
+  std::string GetNextManifest(uint64_t* file_number, std::string* file_name);
+  bool Valid() const { return manifest_file_iter_ != manifest_files_.end(); }
+
+ private:
+  const std::string& dbname_;
+  // MANIFEST file name(s)
+  std::vector<std::string> manifest_files_;
+  std::vector<std::string>::const_iterator manifest_file_iter_;
+};
+
+ManifestPicker::ManifestPicker(const std::string& dbname,
+                               const std::vector<std::string>& files_in_dbname)
+    : dbname_(dbname) {
+  // populate manifest files
+  assert(!files_in_dbname.empty());
+  for (const auto& fname : files_in_dbname) {
+    uint64_t file_num = 0;
+    FileType file_type;
+    bool parse_ok = ParseFileName(fname, &file_num, &file_type);
+    if (parse_ok && file_type == kDescriptorFile) {
+      manifest_files_.push_back(fname);
+    }
+  }
+  // seek to first manifest
+  std::sort(manifest_files_.begin(), manifest_files_.end(),
+            [](const std::string& lhs, const std::string& rhs) {
+              uint64_t num1 = 0;
+              uint64_t num2 = 0;
+              FileType type1;
+              FileType type2;
+              bool parse_ok1 = ParseFileName(lhs, &num1, &type1);
+              bool parse_ok2 = ParseFileName(rhs, &num2, &type2);
+#ifndef NDEBUG
+              assert(parse_ok1);
+              assert(parse_ok2);
+#else
+              (void)parse_ok1;
+              (void)parse_ok2;
+#endif
+              return num1 > num2;
+            });
+  manifest_file_iter_ = manifest_files_.begin();
+}
+
+std::string ManifestPicker::GetNextManifest(uint64_t* number,
+                                            std::string* file_name) {
+  assert(Valid());
+  std::string ret;
+  if (manifest_file_iter_ != manifest_files_.end()) {
+    ret.assign(dbname_);
+    if (ret.back() != kFilePathSeparator) {
+      ret.push_back(kFilePathSeparator);
+    }
+    ret.append(*manifest_file_iter_);
+    if (number) {
+      FileType type;
+      bool parse = ParseFileName(*manifest_file_iter_, number, &type);
+      assert(type == kDescriptorFile);
+#ifndef NDEBUG
+      assert(parse);
+#else
+      (void)parse;
+#endif
+    }
+    if (file_name) {
+      *file_name = *manifest_file_iter_;
+    }
+    ++manifest_file_iter_;
+  }
+  return ret;
+}
+}  // anonymous namespace
+
+Status VersionSet::TryRecover(
+    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+    const std::vector<std::string>& files_in_dbname, std::string* db_id,
+    bool* has_missing_table_file) {
+  ManifestPicker manifest_picker(dbname_, files_in_dbname);
+  if (!manifest_picker.Valid()) {
+    return Status::Corruption("Cannot locate MANIFEST file in " + dbname_);
+  }
+  Status s;
+  std::string manifest_path =
+      manifest_picker.GetNextManifest(&manifest_file_number_, nullptr);
+  while (!manifest_path.empty()) {
+    s = TryRecoverFromOneManifest(manifest_path, column_families, read_only,
+                                  db_id, has_missing_table_file);
+    if (s.ok() || !manifest_picker.Valid()) {
+      break;
+    }
+    Reset();
+    manifest_path =
+        manifest_picker.GetNextManifest(&manifest_file_number_, nullptr);
+  }
+  return s;
+}
+
+Status VersionSet::TryRecoverFromOneManifest(
+    const std::string& manifest_path,
+    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+    std::string* db_id, bool* has_missing_table_file) {
+  const ReadOptions read_options(Env::IOActivity::kDBOpen);
+  ROCKS_LOG_INFO(db_options_->info_log, "Trying to recover from manifest: %s\n",
+                 manifest_path.c_str());
+  std::unique_ptr<SequentialFileReader> manifest_file_reader;
+  Status s;
+  {
+    std::unique_ptr<FSSequentialFile> manifest_file;
+    s = fs_->NewSequentialFile(manifest_path,
+                               fs_->OptimizeForManifestRead(file_options_),
+                               &manifest_file, nullptr);
+    if (!s.ok()) {
+
return s; + } + manifest_file_reader.reset(new SequentialFileReader( + std::move(manifest_file), manifest_path, + db_options_->log_readahead_size, io_tracer_, db_options_->listeners)); + } + + assert(s.ok()); + VersionSet::LogReporter reporter; + reporter.status = &s; + log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, + /*checksum=*/true, /*log_num=*/0); + VersionEditHandlerPointInTime handler_pit( + read_only, column_families, const_cast(this), io_tracer_, + read_options, EpochNumberRequirement::kMightMissing); + + handler_pit.Iterate(reader, &s); + + handler_pit.GetDbId(db_id); + + assert(nullptr != has_missing_table_file); + *has_missing_table_file = handler_pit.HasMissingFiles(); + + s = handler_pit.status(); + if (s.ok()) { + RecoverEpochNumbers(); + } + return s; +} + +void VersionSet::RecoverEpochNumbers() { + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + cfd->RecoverEpochNumbers(); + } +} + +Status VersionSet::ListColumnFamilies(std::vector* column_families, + const std::string& dbname, + FileSystem* fs) { + // Read "CURRENT" file, which contains a pointer to the current manifest file + std::string manifest_path; + uint64_t manifest_file_number; + Status s = + GetCurrentManifestPath(dbname, fs, &manifest_path, &manifest_file_number); + if (!s.ok()) { + return s; + } + return ListColumnFamiliesFromManifest(manifest_path, fs, column_families); +} + +Status VersionSet::ListColumnFamiliesFromManifest( + const std::string& manifest_path, FileSystem* fs, + std::vector* column_families) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + std::unique_ptr file_reader; + Status s; + { + std::unique_ptr file; + // these are just for performance reasons, not correctness, + // so we're fine using the defaults + s = fs->NewSequentialFile(manifest_path, FileOptions(), &file, nullptr); + if (!s.ok()) { + return s; + } + file_reader = std::make_unique( + std::move(file), manifest_path, /*io_tracer=*/nullptr); + } + + VersionSet::LogReporter reporter; + reporter.status = &s; + log::Reader reader(nullptr, std::move(file_reader), &reporter, + true /* checksum */, 0 /* log_number */); + + ListColumnFamiliesHandler handler(read_options); + handler.Iterate(reader, &s); + + assert(column_families); + column_families->clear(); + if (handler.status().ok()) { + for (const auto& iter : handler.GetColumnFamilyNames()) { + column_families->push_back(iter.second); + } + } + + return handler.status(); +} + +Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, + const Options* options, + const FileOptions& file_options, + int new_levels) { + if (new_levels <= 1) { + return Status::InvalidArgument( + "Number of levels needs to be bigger than 1"); + } + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + + ImmutableDBOptions db_options(*options); + ColumnFamilyOptions cf_options(*options); + std::shared_ptr tc(NewLRUCache(options->max_open_files - 10, + options->table_cache_numshardbits)); + WriteController wc(options->delayed_write_rate); + WriteBufferManager wb(options->db_write_buffer_size); + VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc, + nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/, + /*db_id*/ "", + /*db_session_id*/ ""); + Status status; + + std::vector dummy; + ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, + ColumnFamilyOptions(*options)); + dummy.push_back(dummy_descriptor); + status = versions.Recover(dummy); + if 
(!status.ok()) {
+    return status;
+  }
+
+  Version* current_version =
+      versions.GetColumnFamilySet()->GetDefault()->current();
+  auto* vstorage = current_version->storage_info();
+  int current_levels = vstorage->num_levels();
+
+  if (current_levels <= new_levels) {
+    return Status::OK();
+  }
+
+  // Make sure there are files only on one level from
+  // (new_levels-1) to (current_levels-1)
+  int first_nonempty_level = -1;
+  int first_nonempty_level_filenum = 0;
+  for (int i = new_levels - 1; i < current_levels; i++) {
+    int file_num = vstorage->NumLevelFiles(i);
+    if (file_num != 0) {
+      if (first_nonempty_level < 0) {
+        first_nonempty_level = i;
+        first_nonempty_level_filenum = file_num;
+      } else {
+        char msg[255];
+        snprintf(msg, sizeof(msg),
+                 "Found at least two levels containing files: "
+                 "[%d:%d],[%d:%d].\n",
+                 first_nonempty_level, first_nonempty_level_filenum, i,
+                 file_num);
+        return Status::InvalidArgument(msg);
+      }
+    }
+  }
+
+  // we need to allocate an array with the old number of levels size to
+  // avoid SIGSEGV in WriteCurrentStateToManifest()
+  // however, all levels bigger or equal to new_levels will be empty
+  std::vector<FileMetaData*>* new_files_list =
+      new std::vector<FileMetaData*>[current_levels];
+  for (int i = 0; i < new_levels - 1; i++) {
+    new_files_list[i] = vstorage->LevelFiles(i);
+  }
+
+  if (first_nonempty_level > 0) {
+    auto& new_last_level = new_files_list[new_levels - 1];
+
+    new_last_level = vstorage->LevelFiles(first_nonempty_level);
+
+    for (size_t i = 0; i < new_last_level.size(); ++i) {
+      const FileMetaData* const meta = new_last_level[i];
+      assert(meta);
+
+      const uint64_t file_number = meta->fd.GetNumber();
+
+      vstorage->file_locations_[file_number] =
+          VersionStorageInfo::FileLocation(new_levels - 1, i);
+    }
+  }
+
+  delete[] vstorage->files_;
+  vstorage->files_ = new_files_list;
+  vstorage->num_levels_ = new_levels;
+  vstorage->ResizeCompactCursors(new_levels);
+
+  MutableCFOptions mutable_cf_options(*options);
+  VersionEdit ve;
+  InstrumentedMutex dummy_mutex;
+  InstrumentedMutexLock l(&dummy_mutex);
+  return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(),
+                              mutable_cf_options, read_options, &ve,
+                              &dummy_mutex, nullptr, true);
+}
+
+// Get the checksum information including the checksum and checksum function
+// name of all SST and blob files in VersionSet. Store the information in
+// FileChecksumList which contains a map from file number to its checksum info.
+// If DB is not running, make sure to call VersionSet::Recover() to load the
+// file metadata from the MANIFEST into the VersionSet before calling this
+// function.
+Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
+  // Clean the previously stored checksum information if any.
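+  // Usage sketch (illustrative; NewFileChecksumList() is the factory declared
+  // in include/rocksdb/file_checksum.h):
+  //   std::unique_ptr<FileChecksumList> list(NewFileChecksumList());
+  //   Status s = version_set->GetLiveFilesChecksumInfo(list.get());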
+ Status s; + if (checksum_list == nullptr) { + s = Status::InvalidArgument("checksum_list is nullptr"); + return s; + } + checksum_list->reset(); + + for (auto cfd : *column_family_set_) { + assert(cfd); + + if (cfd->IsDropped() || !cfd->initialized()) { + continue; + } + + const auto* current = cfd->current(); + assert(current); + + const auto* vstorage = current->storage_info(); + assert(vstorage); + + /* SST files */ + for (int level = 0; level < cfd->NumberLevels(); level++) { + const auto& level_files = vstorage->LevelFiles(level); + + for (const auto& file : level_files) { + assert(file); + + s = checksum_list->InsertOneFileChecksum(file->fd.GetNumber(), + file->file_checksum, + file->file_checksum_func_name); + if (!s.ok()) { + return s; + } + } + } + + /* Blob files */ + const auto& blob_files = vstorage->GetBlobFiles(); + for (const auto& meta : blob_files) { + assert(meta); + + std::string checksum_value = meta->GetChecksumValue(); + std::string checksum_method = meta->GetChecksumMethod(); + assert(checksum_value.empty() == checksum_method.empty()); + if (meta->GetChecksumMethod().empty()) { + checksum_value = kUnknownFileChecksum; + checksum_method = kUnknownFileChecksumFuncName; + } + + s = checksum_list->InsertOneFileChecksum(meta->GetBlobFileNumber(), + checksum_value, checksum_method); + if (!s.ok()) { + return s; + } + } + } + + return s; +} + +Status VersionSet::DumpManifest(Options& options, std::string& dscname, + bool verbose, bool hex, bool json) { + assert(options.env); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + + std::vector column_families; + Status s = ListColumnFamiliesFromManifest( + dscname, options.env->GetFileSystem().get(), &column_families); + if (!s.ok()) { + return s; + } + + // Open the specified manifest file. + std::unique_ptr file_reader; + { + std::unique_ptr file; + const std::shared_ptr& fs = options.env->GetFileSystem(); + s = fs->NewSequentialFile( + dscname, fs->OptimizeForManifestRead(file_options_), &file, nullptr); + if (!s.ok()) { + return s; + } + file_reader = std::make_unique( + std::move(file), dscname, db_options_->log_readahead_size, io_tracer_); + } + + std::vector cf_descs; + for (const auto& cf : column_families) { + cf_descs.emplace_back(cf, options); + } + + DumpManifestHandler handler(cf_descs, this, io_tracer_, read_options, verbose, + hex, json); + { + VersionSet::LogReporter reporter; + reporter.status = &s; + log::Reader reader(nullptr, std::move(file_reader), &reporter, + true /* checksum */, 0 /* log_number */); + handler.Iterate(reader, &s); + } + + return handler.status(); +} + +void VersionSet::MarkFileNumberUsed(uint64_t number) { + // only called during recovery and repair which are single threaded, so this + // works because there can't be concurrent calls + if (next_file_number_.load(std::memory_order_relaxed) <= number) { + next_file_number_.store(number + 1, std::memory_order_relaxed); + } +} +// Called only either from ::LogAndApply which is protected by mutex or during +// recovery which is single-threaded. +void VersionSet::MarkMinLogNumberToKeep(uint64_t number) { + if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) { + min_log_number_to_keep_.store(number, std::memory_order_relaxed); + } +} + +Status VersionSet::WriteCurrentStateToManifest( + const std::unordered_map& curr_state, + const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s) { + // TODO: Break up into multiple records to reduce memory usage on recovery? 
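+  // Record sequence written to a fresh MANIFEST (summarizing the body below):
+  // an optional db_id edit, the WAL additions, one WAL-deletion rollover
+  // record, then per column family a descriptor edit followed by an edit
+  // listing all of its live files.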
+ + // WARNING: This method doesn't hold a mutex!! + + // This is done without DB mutex lock held, but only within single-threaded + // LogAndApply. Column family manipulations can only happen within LogAndApply + // (the same single thread), so we're safe to iterate. + + assert(io_s.ok()); + if (db_options_->write_dbid_to_manifest) { + VersionEdit edit_for_db_id; + assert(!db_id_.empty()); + edit_for_db_id.SetDBId(db_id_); + std::string db_id_record; + if (!edit_for_db_id.EncodeTo(&db_id_record)) { + return Status::Corruption("Unable to Encode VersionEdit:" + + edit_for_db_id.DebugString(true)); + } + io_s = log->AddRecord(db_id_record); + if (!io_s.ok()) { + return io_s; + } + } + + // Save WALs. + if (!wal_additions.GetWalAdditions().empty()) { + TEST_SYNC_POINT_CALLBACK("VersionSet::WriteCurrentStateToManifest:SaveWal", + const_cast(&wal_additions)); + std::string record; + if (!wal_additions.EncodeTo(&record)) { + return Status::Corruption("Unable to Encode VersionEdit: " + + wal_additions.DebugString(true)); + } + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; + } + } + + // New manifest should rollover the WAL deletion record from previous + // manifest. Otherwise, when an addition record of a deleted WAL gets added to + // this new manifest later (which can happens in e.g, SyncWAL()), this new + // manifest creates an illusion that such WAL hasn't been deleted. + VersionEdit wal_deletions; + wal_deletions.DeleteWalsBefore(min_log_number_to_keep()); + std::string wal_deletions_record; + if (!wal_deletions.EncodeTo(&wal_deletions_record)) { + return Status::Corruption("Unable to Encode VersionEdit: " + + wal_deletions.DebugString(true)); + } + io_s = log->AddRecord(wal_deletions_record); + if (!io_s.ok()) { + return io_s; + } + + for (auto cfd : *column_family_set_) { + assert(cfd); + + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + { + // Store column family info + VersionEdit edit; + if (cfd->GetID() != 0) { + // default column family is always there, + // no need to explicitly write it + edit.AddColumnFamily(cfd->GetName()); + edit.SetColumnFamily(cfd->GetID()); + } + edit.SetComparatorName( + cfd->internal_comparator().user_comparator()->Name()); + std::string record; + if (!edit.EncodeTo(&record)) { + return Status::Corruption("Unable to Encode VersionEdit:" + + edit.DebugString(true)); + } + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; + } + } + + { + // Save files + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + + const auto* current = cfd->current(); + assert(current); + + const auto* vstorage = current->storage_info(); + assert(vstorage); + + for (int level = 0; level < cfd->NumberLevels(); level++) { + const auto& level_files = vstorage->LevelFiles(level); + + for (const auto& f : level_files) { + assert(f); + + edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, + f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->epoch_number, f->file_checksum, + f->file_checksum_func_name, f->unique_id, + f->compensated_range_deletion_size, f->tail_size); + } + } + + edit.SetCompactCursors(vstorage->GetCompactCursors()); + + const auto& blob_files = vstorage->GetBlobFiles(); + for (const auto& meta : blob_files) { + assert(meta); + + const uint64_t blob_file_number = meta->GetBlobFileNumber(); + + edit.AddBlobFile(blob_file_number, 
meta->GetTotalBlobCount(), + meta->GetTotalBlobBytes(), meta->GetChecksumMethod(), + meta->GetChecksumValue()); + if (meta->GetGarbageBlobCount() > 0) { + edit.AddBlobFileGarbage(blob_file_number, meta->GetGarbageBlobCount(), + meta->GetGarbageBlobBytes()); + } + } + + const auto iter = curr_state.find(cfd->GetID()); + assert(iter != curr_state.end()); + uint64_t log_number = iter->second.log_number; + edit.SetLogNumber(log_number); + + if (cfd->GetID() == 0) { + // min_log_number_to_keep is for the whole db, not for specific column + // family. So it does not need to be set for every column family, just + // need to be set once. Since default CF can never be dropped, we set + // the min_log to the default CF here. + uint64_t min_log = min_log_number_to_keep(); + if (min_log != 0) { + edit.SetMinLogNumberToKeep(min_log); + } + } + + const std::string& full_history_ts_low = iter->second.full_history_ts_low; + if (!full_history_ts_low.empty()) { + edit.SetFullHistoryTsLow(full_history_ts_low); + } + + edit.SetLastSequence(descriptor_last_sequence_); + + std::string record; + if (!edit.EncodeTo(&record)) { + return Status::Corruption("Unable to Encode VersionEdit:" + + edit.DebugString(true)); + } + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; + } + } + } + return Status::OK(); +} + +// TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this +// function is called repeatedly with consecutive pairs of slices. For example +// if the slice list is [a, b, c, d] this function is called with arguments +// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where +// we avoid doing binary search for the keys b and c twice and instead somehow +// maintain state of where they first appear in the files. +uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, + const ReadOptions& read_options, + Version* v, const Slice& start, + const Slice& end, int start_level, + int end_level, TableReaderCaller caller) { + const auto& icmp = v->cfd_->internal_comparator(); + + // pre-condition + assert(icmp.Compare(start, end) <= 0); + + uint64_t total_full_size = 0; + const auto* vstorage = v->storage_info(); + const int num_non_empty_levels = vstorage->num_non_empty_levels(); + end_level = (end_level == -1) ? num_non_empty_levels + : std::min(end_level, num_non_empty_levels); + if (end_level <= start_level) { + return 0; + } + + // Outline of the optimization that uses options.files_size_error_margin. + // When approximating the files total size that is used to store a keys range, + // we first sum up the sizes of the files that fully fall into the range. + // Then we sum up the sizes of all the files that may intersect with the range + // (this includes all files in L0 as well). Then, if total_intersecting_size + // is smaller than total_full_size * options.files_size_error_margin - we can + // infer that the intersecting files have a sufficiently negligible + // contribution to the total size, and we can approximate the storage required + // for the keys in range as just half of the intersecting_files_size. + // E.g., if the value of files_size_error_margin is 0.1, then the error of the + // approximation is limited to only ~10% of the total size of files that fully + // fall into the keys range. In such case, this helps to avoid a costly + // process of binary searching the intersecting files that is required only + // for a more precise calculation of the total size. 
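+  // Worked example (hypothetical numbers): with files_size_error_margin = 0.1,
+  // total_full_size = 100 GB and total_intersecting_size = 4 GB, the margin
+  // check below passes (4 GB < 10 GB), the boundary files contribute
+  // 4 GB / 2 = 2 GB without any binary search, and the error versus the exact
+  // answer is bounded by that 2 GB.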
+ + autovector first_files; + autovector last_files; + + // scan all the levels + for (int level = start_level; level < end_level; ++level) { + const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level); + if (files_brief.num_files == 0) { + // empty level, skip exploration + continue; + } + + if (level == 0) { + // level 0 files are not in sorted order, we need to iterate through + // the list to compute the total bytes that require scanning, + // so handle the case explicitly (similarly to first_files case) + for (size_t i = 0; i < files_brief.num_files; i++) { + first_files.push_back(&files_brief.files[i]); + } + continue; + } + + assert(level > 0); + assert(files_brief.num_files > 0); + + // identify the file position for start key + const int idx_start = + FindFileInRange(icmp, files_brief, start, 0, + static_cast(files_brief.num_files - 1)); + assert(static_cast(idx_start) < files_brief.num_files); + + // identify the file position for end key + int idx_end = idx_start; + if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) { + idx_end = + FindFileInRange(icmp, files_brief, end, idx_start, + static_cast(files_brief.num_files - 1)); + } + assert(idx_end >= idx_start && + static_cast(idx_end) < files_brief.num_files); + + // scan all files from the starting index to the ending index + // (inferred from the sorted order) + + // first scan all the intermediate full files (excluding first and last) + for (int i = idx_start + 1; i < idx_end; ++i) { + uint64_t file_size = files_brief.files[i].fd.GetFileSize(); + // The entire file falls into the range, so we can just take its size. + assert(file_size == ApproximateSize(read_options, v, files_brief.files[i], + start, end, caller)); + total_full_size += file_size; + } + + // save the first and the last files (which may be the same file), so we + // can scan them later. + first_files.push_back(&files_brief.files[idx_start]); + if (idx_start != idx_end) { + // we need to estimate size for both files, only if they are different + last_files.push_back(&files_brief.files[idx_end]); + } + } + + // The sum of all file sizes that intersect the [start, end] keys range. + uint64_t total_intersecting_size = 0; + for (const auto* file_ptr : first_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } + for (const auto* file_ptr : last_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } + + // Now scan all the first & last files at each level, and estimate their size. + // If the total_intersecting_size is less than X% of the total_full_size - we + // want to approximate the result in order to avoid the costly binary search + // inside ApproximateSize. We use half of file size as an approximation below. + + const double margin = options.files_size_error_margin; + if (margin > 0 && total_intersecting_size < + static_cast(total_full_size * margin)) { + total_full_size += total_intersecting_size / 2; + } else { + // Estimate for all the first files (might also be last files), at each + // level + for (const auto file_ptr : first_files) { + total_full_size += + ApproximateSize(read_options, v, *file_ptr, start, end, caller); + } + + // Estimate for all the last files, at each level + for (const auto file_ptr : last_files) { + // We could use ApproximateSize here, but calling ApproximateOffsetOf + // directly is just more efficient. 
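+      // For a last file, `start` is already known to precede the file's
+      // smallest key, so the bytes in range are exactly the bytes before
+      // `end`: the offset of `end` within the file equals the contribution.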
+ total_full_size += + ApproximateOffsetOf(read_options, v, *file_ptr, end, caller); + } + } + + return total_full_size; +} + +uint64_t VersionSet::ApproximateOffsetOf(const ReadOptions& read_options, + Version* v, const FdWithKeyRange& f, + const Slice& key, + TableReaderCaller caller) { + // pre-condition + assert(v); + const auto& icmp = v->cfd_->internal_comparator(); + + uint64_t result = 0; + if (icmp.Compare(f.largest_key, key) <= 0) { + // Entire file is before "key", so just add the file size + result = f.fd.GetFileSize(); + } else if (icmp.Compare(f.smallest_key, key) > 0) { + // Entire file is after "key", so ignore + result = 0; + } else { + // "key" falls in the range for this table. Add the + // approximate offset of "key" within the table. + TableCache* table_cache = v->cfd_->table_cache(); + const MutableCFOptions& cf_opts = v->GetMutableCFOptions(); + if (table_cache != nullptr) { + result = table_cache->ApproximateOffsetOf( + read_options, key, *f.file_metadata, caller, icmp, + cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor); + } + } + return result; +} + +uint64_t VersionSet::ApproximateSize(const ReadOptions& read_options, + Version* v, const FdWithKeyRange& f, + const Slice& start, const Slice& end, + TableReaderCaller caller) { + // pre-condition + assert(v); + const auto& icmp = v->cfd_->internal_comparator(); + assert(icmp.Compare(start, end) <= 0); + + if (icmp.Compare(f.largest_key, start) <= 0 || + icmp.Compare(f.smallest_key, end) > 0) { + // Entire file is before or after the start/end keys range + return 0; + } + + if (icmp.Compare(f.smallest_key, start) >= 0) { + // Start of the range is before the file start - approximate by end offset + return ApproximateOffsetOf(read_options, v, f, end, caller); + } + + if (icmp.Compare(f.largest_key, end) < 0) { + // End of the range is after the file end - approximate by subtracting + // start offset from the file size + uint64_t start_offset = + ApproximateOffsetOf(read_options, v, f, start, caller); + assert(f.fd.GetFileSize() >= start_offset); + return f.fd.GetFileSize() - start_offset; + } + + // The interval falls entirely in the range for this file. + TableCache* table_cache = v->cfd_->table_cache(); + if (table_cache == nullptr) { + return 0; + } + const MutableCFOptions& cf_opts = v->GetMutableCFOptions(); + return table_cache->ApproximateSize( + read_options, start, end, *f.file_metadata, caller, icmp, + cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor); +} + +void VersionSet::RemoveLiveFiles( + std::vector& sst_delete_candidates, + std::vector& blob_delete_candidates) const { + assert(column_family_set_); + for (auto cfd : *column_family_set_) { + assert(cfd); + if (!cfd->initialized()) { + continue; + } + + auto* current = cfd->current(); + bool found_current = false; + + Version* const dummy_versions = cfd->dummy_versions(); + assert(dummy_versions); + + for (Version* v = dummy_versions->next_; v != dummy_versions; + v = v->next_) { + v->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates); + if (v == current) { + found_current = true; + } + } + + if (!found_current && current != nullptr) { + // Should never happen unless it is a bug. 
+ assert(false); + current->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates); + } + } +} + +void VersionSet::AddLiveFiles(std::vector* live_table_files, + std::vector* live_blob_files) const { + assert(live_table_files); + assert(live_blob_files); + + // pre-calculate space requirement + size_t total_table_files = 0; + size_t total_blob_files = 0; + + assert(column_family_set_); + for (auto cfd : *column_family_set_) { + assert(cfd); + + if (!cfd->initialized()) { + continue; + } + + Version* const dummy_versions = cfd->dummy_versions(); + assert(dummy_versions); + + for (Version* v = dummy_versions->next_; v != dummy_versions; + v = v->next_) { + assert(v); + + const auto* vstorage = v->storage_info(); + assert(vstorage); + + for (int level = 0; level < vstorage->num_levels(); ++level) { + total_table_files += vstorage->LevelFiles(level).size(); + } + + total_blob_files += vstorage->GetBlobFiles().size(); + } + } + + // just one time extension to the right size + live_table_files->reserve(live_table_files->size() + total_table_files); + live_blob_files->reserve(live_blob_files->size() + total_blob_files); + + assert(column_family_set_); + for (auto cfd : *column_family_set_) { + assert(cfd); + if (!cfd->initialized()) { + continue; + } + + auto* current = cfd->current(); + bool found_current = false; + + Version* const dummy_versions = cfd->dummy_versions(); + assert(dummy_versions); + + for (Version* v = dummy_versions->next_; v != dummy_versions; + v = v->next_) { + v->AddLiveFiles(live_table_files, live_blob_files); + if (v == current) { + found_current = true; + } + } + + if (!found_current && current != nullptr) { + // Should never happen unless it is a bug. + assert(false); + current->AddLiveFiles(live_table_files, live_blob_files); + } + } +} + +InternalIterator* VersionSet::MakeInputIterator( + const ReadOptions& read_options, const Compaction* c, + RangeDelAggregator* range_del_agg, + const FileOptions& file_options_compactions, + const std::optional& start, + const std::optional& end) { + auto cfd = c->column_family_data(); + // Level-0 files have to be merged together. For other levels, + // we will make a concatenating iterator per level. + // TODO(opt): use concatenating iterator for level-0 if there is no overlap + const size_t space = (c->level() == 0 ? c->input_levels(0)->num_files + + c->num_input_levels() - 1 + : c->num_input_levels()); + InternalIterator** list = new InternalIterator*[space]; + // First item in the pair is a pointer to range tombstones. + // Second item is a pointer to a member of a LevelIterator, + // that will be initialized to where CompactionMergingIterator stores + // pointer to its range tombstones. This is used by LevelIterator + // to update pointer to range tombstones as it traverse different SST files. + std::vector< + std::pair> + range_tombstones; + size_t num = 0; + for (size_t which = 0; which < c->num_input_levels(); which++) { + if (c->input_levels(which)->num_files != 0) { + if (c->level(which) == 0) { + const LevelFilesBrief* flevel = c->input_levels(which); + for (size_t i = 0; i < flevel->num_files; i++) { + const FileMetaData& fmd = *flevel->files[i].file_metadata; + if (start.has_value() && + cfd->user_comparator()->CompareWithoutTimestamp( + start.value(), fmd.largest.user_key()) > 0) { + continue; + } + // We should be able to filter out the case where the end key + // equals to the end boundary, since the end key is exclusive. + // We try to be extra safe here. 
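+          // e.g. with end == "k", an input file whose smallest user key is
+          // also "k" is still kept (the comparison below is a strict '<'),
+          // which is the conservative behavior the comment above describes.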
+ if (end.has_value() && + cfd->user_comparator()->CompareWithoutTimestamp( + end.value(), fmd.smallest.user_key()) < 0) { + continue; + } + TruncatedRangeDelIterator* range_tombstone_iter = nullptr; + list[num++] = cfd->table_cache()->NewIterator( + read_options, file_options_compactions, + cfd->internal_comparator(), fmd, range_del_agg, + c->mutable_cf_options()->prefix_extractor, + /*table_reader_ptr=*/nullptr, + /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction, + /*arena=*/nullptr, + /*skip_filters=*/false, + /*level=*/static_cast(c->level(which)), + MaxFileSizeForL0MetaPin(*c->mutable_cf_options()), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false, + c->mutable_cf_options()->block_protection_bytes_per_key, + /*range_del_iter=*/&range_tombstone_iter); + range_tombstones.emplace_back(range_tombstone_iter, nullptr); + } + } else { + // Create concatenating iterator for the files from this level + TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr; + list[num++] = new LevelIterator( + cfd->table_cache(), read_options, file_options_compactions, + cfd->internal_comparator(), c->input_levels(which), + c->mutable_cf_options()->prefix_extractor, + /*should_sample=*/false, + /*no per level latency histogram=*/nullptr, + TableReaderCaller::kCompaction, /*skip_filters=*/false, + /*level=*/static_cast(c->level(which)), + c->mutable_cf_options()->block_protection_bytes_per_key, + range_del_agg, c->boundaries(which), false, &tombstone_iter_ptr); + range_tombstones.emplace_back(nullptr, tombstone_iter_ptr); + } + } + } + assert(num <= space); + InternalIterator* result = NewCompactionMergingIterator( + &c->column_family_data()->internal_comparator(), list, + static_cast(num), range_tombstones); + delete[] list; + return result; +} + +Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, + FileMetaData** meta, + ColumnFamilyData** cfd) { + for (auto cfd_iter : *column_family_set_) { + if (!cfd_iter->initialized()) { + continue; + } + Version* version = cfd_iter->current(); + const auto* vstorage = version->storage_info(); + for (int level = 0; level < vstorage->num_levels(); level++) { + for (const auto& file : vstorage->LevelFiles(level)) { + if (file->fd.GetNumber() == number) { + *meta = file; + *filelevel = level; + *cfd = cfd_iter; + return Status::OK(); + } + } + } + } + return Status::NotFound("File not present in any level"); +} + +void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped() || !cfd->initialized()) { + continue; + } + for (int level = 0; level < cfd->NumberLevels(); level++) { + for (const auto& file : + cfd->current()->storage_info()->LevelFiles(level)) { + LiveFileMetaData filemetadata; + filemetadata.column_family_name = cfd->GetName(); + uint32_t path_id = file->fd.GetPathId(); + if (path_id < cfd->ioptions()->cf_paths.size()) { + filemetadata.db_path = cfd->ioptions()->cf_paths[path_id].path; + } else { + assert(!cfd->ioptions()->cf_paths.empty()); + filemetadata.db_path = cfd->ioptions()->cf_paths.back().path; + } + filemetadata.directory = filemetadata.db_path; + const uint64_t file_number = file->fd.GetNumber(); + filemetadata.name = MakeTableFileName("", file_number); + filemetadata.relative_filename = filemetadata.name.substr(1); + filemetadata.file_number = file_number; + filemetadata.level = level; + filemetadata.size = file->fd.GetFileSize(); + filemetadata.smallestkey = 
+            file->smallest.user_key().ToString();
+        filemetadata.largestkey = file->largest.user_key().ToString();
+        filemetadata.smallest_seqno = file->fd.smallest_seqno;
+        filemetadata.largest_seqno = file->fd.largest_seqno;
+        filemetadata.num_reads_sampled =
+            file->stats.num_reads_sampled.load(std::memory_order_relaxed);
+        filemetadata.being_compacted = file->being_compacted;
+        filemetadata.num_entries = file->num_entries;
+        filemetadata.num_deletions = file->num_deletions;
+        filemetadata.oldest_blob_file_number = file->oldest_blob_file_number;
+        filemetadata.file_checksum = file->file_checksum;
+        filemetadata.file_checksum_func_name = file->file_checksum_func_name;
+        filemetadata.temperature = file->temperature;
+        filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime();
+        filemetadata.file_creation_time = file->TryGetFileCreationTime();
+        filemetadata.epoch_number = file->epoch_number;
+        metadata->push_back(filemetadata);
+      }
+    }
+  }
+}
+
+void VersionSet::GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
+                                  std::vector<ObsoleteBlobFileInfo>* blob_files,
+                                  std::vector<std::string>* manifest_filenames,
+                                  uint64_t min_pending_output) {
+  assert(files);
+  assert(blob_files);
+  assert(manifest_filenames);
+  assert(files->empty());
+  assert(blob_files->empty());
+  assert(manifest_filenames->empty());
+
+  std::vector<ObsoleteFileInfo> pending_files;
+  for (auto& f : obsolete_files_) {
+    if (f.metadata->fd.GetNumber() < min_pending_output) {
+      files->emplace_back(std::move(f));
+    } else {
+      pending_files.emplace_back(std::move(f));
+    }
+  }
+  obsolete_files_.swap(pending_files);
+
+  std::vector<ObsoleteBlobFileInfo> pending_blob_files;
+  for (auto& blob_file : obsolete_blob_files_) {
+    if (blob_file.GetBlobFileNumber() < min_pending_output) {
+      blob_files->emplace_back(std::move(blob_file));
+    } else {
+      pending_blob_files.emplace_back(std::move(blob_file));
+    }
+  }
+  obsolete_blob_files_.swap(pending_blob_files);
+
+  obsolete_manifests_.swap(*manifest_filenames);
+}
+
+ColumnFamilyData* VersionSet::CreateColumnFamily(
+    const ColumnFamilyOptions& cf_options, const ReadOptions& read_options,
+    const VersionEdit* edit) {
+  assert(edit->is_column_family_add_);
+
+  MutableCFOptions dummy_cf_options;
+  Version* dummy_versions =
+      new Version(nullptr, this, file_options_, dummy_cf_options, io_tracer_);
+  // Ref() dummy version once so that later we can call Unref() to delete it
+  // by avoiding calling "delete" explicitly (~Version is private)
+  dummy_versions->Ref();
+  auto new_cfd = column_family_set_->CreateColumnFamily(
+      edit->column_family_name_, edit->column_family_, dummy_versions,
+      cf_options);
+
+  Version* v = new Version(new_cfd, this, file_options_,
+                           *new_cfd->GetLatestMutableCFOptions(), io_tracer_,
+                           current_version_number_++);
+
+  constexpr bool update_stats = false;
+
+  v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), read_options,
+                   update_stats);
+
+  AppendVersion(new_cfd, v);
+  // GetLatestMutableCFOptions() is safe here without mutex since the
+  // cfd is not available to client
+  new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions(),
+                             LastSequence());
+  new_cfd->SetLogNumber(edit->log_number_);
+  return new_cfd;
+}
+
+uint64_t VersionSet::GetNumLiveVersions(Version* dummy_versions) {
+  uint64_t count = 0;
+  for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+    count++;
+  }
+  return count;
+}
+
+uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) {
+  std::unordered_set<uint64_t> unique_files;
+  uint64_t total_files_size = 0;
+  for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
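+    // Note (editorial illustration, not upstream commentary): live versions
+    // in this list can share FileMetaData, e.g. after a trivial move the same
+    // physical SST appears in several Versions. Deduplicating on
+    // packed_number_and_path_id ensures each file's size is counted once.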
+    VersionStorageInfo* storage_info = v->storage_info();
+    for (int level = 0; level < storage_info->num_levels_; level++) {
+      for (const auto& file_meta : storage_info->LevelFiles(level)) {
+        if (unique_files.find(file_meta->fd.packed_number_and_path_id) ==
+            unique_files.end()) {
+          unique_files.insert(file_meta->fd.packed_number_and_path_id);
+          total_files_size += file_meta->fd.GetFileSize();
+        }
+      }
+    }
+  }
+  return total_files_size;
+}
+
+uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) {
+  std::unordered_set<uint64_t> unique_blob_files;
+
+  uint64_t all_versions_blob_file_size = 0;
+
+  for (auto* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+    // iterate all the versions
+    const auto* vstorage = v->storage_info();
+    assert(vstorage);
+
+    const auto& blob_files = vstorage->GetBlobFiles();
+
+    for (const auto& meta : blob_files) {
+      assert(meta);
+
+      const uint64_t blob_file_number = meta->GetBlobFileNumber();
+
+      if (unique_blob_files.find(blob_file_number) == unique_blob_files.end()) {
+        // found a blob file that has not been counted yet
+        unique_blob_files.insert(blob_file_number);
+        all_versions_blob_file_size += meta->GetBlobFileSize();
+      }
+    }
+  }
+
+  return all_versions_blob_file_size;
+}
+
+Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options,
+                                      ColumnFamilyData* cfd,
+                                      const std::string& fpath, int level,
+                                      const FileMetaData& meta) {
+  uint64_t fsize = 0;
+  Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr);
+  if (status.ok()) {
+    if (fsize != meta.fd.GetFileSize()) {
+      status = Status::Corruption("File size mismatch: " + fpath);
+    }
+  }
+  if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) {
+    assert(cfd);
+    TableCache* table_cache = cfd->table_cache();
+    assert(table_cache);
+
+    const MutableCFOptions* const cf_opts = cfd->GetLatestMutableCFOptions();
+    assert(cf_opts);
+    std::shared_ptr<const SliceTransform> pe = cf_opts->prefix_extractor;
+    size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(*cf_opts);
+
+    const FileOptions& file_opts = file_options();
+
+    Version* version = cfd->current();
+    assert(version);
+    VersionStorageInfo& storage_info = version->storage_info_;
+    const InternalKeyComparator* icmp = storage_info.InternalComparator();
+    assert(icmp);
+
+    InternalStats* internal_stats = cfd->internal_stats();
+
+    TableCache::TypedHandle* handle = nullptr;
+    FileMetaData meta_copy = meta;
+    status = table_cache->FindTable(
+        read_options, file_opts, *icmp, meta_copy, &handle,
+        cf_opts->block_protection_bytes_per_key, pe,
+        /*no_io=*/false, /*record_read_stats=*/true,
+        internal_stats->GetFileReadHist(level), false, level,
+        /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin,
+        meta_copy.temperature);
+    if (handle) {
+      table_cache->get_cache().Release(handle);
+    }
+  }
+  return status;
+}
+
+ReactiveVersionSet::ReactiveVersionSet(
+    const std::string& dbname, const ImmutableDBOptions* _db_options,
+    const FileOptions& _file_options, Cache* table_cache,
+    WriteBufferManager* write_buffer_manager, WriteController* write_controller,
+    const std::shared_ptr<IOTracer>& io_tracer)
+    : VersionSet(dbname, _db_options, _file_options, table_cache,
+                 write_buffer_manager, write_controller,
+                 /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "",
+                 /*db_session_id*/ "") {}
+
+ReactiveVersionSet::~ReactiveVersionSet() {}
+
+Status ReactiveVersionSet::Recover(
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+    std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
+    std::unique_ptr<Status>* manifest_reader_status) {
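+  // Usage sketch (editorial illustration, not part of this file):
+  // ReactiveVersionSet backs read-only secondary instances that tail the
+  // primary's MANIFEST, roughly:
+  //
+  //   DB* db = nullptr;
+  //   Status s = DB::OpenAsSecondary(options, "/path/to/primary",
+  //                                  "/path/to/secondary", &db);
+  //   if (s.ok()) s = db->TryCatchUpWithPrimary();  // re-reads the MANIFEST
+  //
+  // DB::OpenAsSecondary and TryCatchUpWithPrimary are the public entry
+  // points; the exact call chain down to Recover() is assumed here for
+  // illustration.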
+  assert(manifest_reader != nullptr);
+  assert(manifest_reporter != nullptr);
+  assert(manifest_reader_status != nullptr);
+
+  manifest_reader_status->reset(new Status());
+  manifest_reporter->reset(new LogReporter());
+  static_cast_with_check<LogReporter>(manifest_reporter->get())->status =
+      manifest_reader_status->get();
+  Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader);
+  if (!s.ok()) {
+    return s;
+  }
+  log::Reader* reader = manifest_reader->get();
+  assert(reader);
+
+  manifest_tailer_.reset(new ManifestTailer(
+      column_families, const_cast<ReactiveVersionSet*>(this), io_tracer_,
+      read_options_, EpochNumberRequirement::kMightMissing));
+
+  manifest_tailer_->Iterate(*reader, manifest_reader_status->get());
+
+  s = manifest_tailer_->status();
+  if (s.ok()) {
+    RecoverEpochNumbers();
+  }
+  return s;
+}
+
+Status ReactiveVersionSet::ReadAndApply(
+    InstrumentedMutex* mu,
+    std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+    Status* manifest_read_status,
+    std::unordered_set<ColumnFamilyData*>* cfds_changed) {
+  assert(manifest_reader != nullptr);
+  assert(cfds_changed != nullptr);
+  mu->AssertHeld();
+
+  Status s;
+  log::Reader* reader = manifest_reader->get();
+  assert(reader);
+  s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader);
+  if (!s.ok()) {
+    return s;
+  }
+  manifest_tailer_->Iterate(*(manifest_reader->get()), manifest_read_status);
+  s = manifest_tailer_->status();
+  if (s.ok()) {
+    *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies());
+  }
+
+  return s;
+}
+
+Status ReactiveVersionSet::MaybeSwitchManifest(
+    log::Reader::Reporter* reporter,
+    std::unique_ptr<log::FragmentBufferedReader>* manifest_reader) {
+  assert(manifest_reader != nullptr);
+  Status s;
+  std::string manifest_path;
+  s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
+                             &manifest_file_number_);
+  if (!s.ok()) {
+    return s;
+  }
+  std::unique_ptr<FSSequentialFile> manifest_file;
+  if (manifest_reader->get() != nullptr &&
+      manifest_reader->get()->file()->file_name() == manifest_path) {
+    // CURRENT points to the same MANIFEST as before, no need to switch
+    // MANIFEST.
+    return s;
+  }
+  assert(nullptr == manifest_reader->get() ||
+         manifest_reader->get()->file()->file_name() != manifest_path);
+  s = fs_->FileExists(manifest_path, IOOptions(), nullptr);
+  if (s.IsNotFound()) {
+    return Status::TryAgain(
+        "The primary may have switched to a new MANIFEST and deleted the old "
+        "one.");
+  } else if (!s.ok()) {
+    return s;
+  }
+  TEST_SYNC_POINT(
+      "ReactiveVersionSet::MaybeSwitchManifest:"
+      "AfterGetCurrentManifestPath:0");
+  TEST_SYNC_POINT(
+      "ReactiveVersionSet::MaybeSwitchManifest:"
+      "AfterGetCurrentManifestPath:1");
+  // The primary can also delete the MANIFEST while the secondary is reading
+  // it. This is OK on POSIX. For other file systems, maybe create a hard link
+  // to MANIFEST. The hard link should be cleaned up later by the secondary.
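+  // Why this is OK on POSIX (editorial note): unlink() only removes the
+  // directory entry; the inode and its data stay alive until the last open
+  // file descriptor is closed, so a reader that already opened the MANIFEST
+  // can keep reading after the primary deletes it:
+  //
+  //   int fd = open("MANIFEST-000005", O_RDONLY);  // secondary opens
+  //   unlink("MANIFEST-000005");                   // primary deletes
+  //   read(fd, buf, sizeof(buf));                  // still succeeds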
+  s = fs_->NewSequentialFile(manifest_path,
+                             fs_->OptimizeForManifestRead(file_options_),
+                             &manifest_file, nullptr);
+  std::unique_ptr<SequentialFileReader> manifest_file_reader;
+  if (s.ok()) {
+    manifest_file_reader.reset(new SequentialFileReader(
+        std::move(manifest_file), manifest_path,
+        db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
+    manifest_reader->reset(new log::FragmentBufferedReader(
+        nullptr, std::move(manifest_file_reader), reporter, true /* checksum */,
+        0 /* log_number */));
+    ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n",
+                   manifest_path.c_str());
+    if (manifest_tailer_) {
+      manifest_tailer_->PrepareToReadNewManifest();
+    }
+  } else if (s.IsPathNotFound()) {
+    // This can happen if the primary switches to a new MANIFEST after the
+    // secondary reads the CURRENT file but before the secondary actually tries
+    // to open the MANIFEST.
+    s = Status::TryAgain(
+        "The primary may have switched to a new MANIFEST and deleted the old "
+        "one.");
+  }
+  return s;
+}
+
+#ifndef NDEBUG
+uint64_t ReactiveVersionSet::TEST_read_edits_in_atomic_group() const {
+  assert(manifest_tailer_);
+  return manifest_tailer_->GetReadBuffer().TEST_read_edits_in_atomic_group();
+}
+#endif  // !NDEBUG
+
+std::vector<VersionEdit>& ReactiveVersionSet::replay_buffer() {
+  assert(manifest_tailer_);
+  return manifest_tailer_->GetReadBuffer().replay_buffer();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_set.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_set.h
new file mode 100644
index 0000000000..25ad365837
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_set.h
@@ -0,0 +1,1714 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions. The
+// newest version is called "current". Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of table files per level, as well as a
+// set of blob files. The entire set of versions is maintained in a
+// VersionSet.
+//
+// Version, VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
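+//
+// A sketch of what "external synchronization" means in practice (editorial
+// illustration; DBImpl is the real caller and the mutex name is assumed):
+//
+//   InstrumentedMutexLock l(&db_mutex);  // the DB-wide mutex
+//   Version* current = cfd->current();
+//   current->Ref();                      // safe: mutex held
+//   // ... use current ...
+//   current->Unref();                    // again under the mutex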
+
+#pragma once
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <list>
+#include <map>
+#include <memory>
+#include <optional>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "cache/cache_helpers.h"
+#include "db/blob/blob_file_meta.h"
+#include "db/blob/blob_index.h"
+#include "db/column_family.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker.h"
+#include "db/dbformat.h"
+#include "db/file_indexer.h"
+#include "db/log_reader.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/table_cache.h"
+#include "db/version_builder.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "env/file_system_tracer.h"
+#if USE_COROUTINES
+#include "folly/experimental/coro/BlockingWait.h"
+#include "folly/experimental/coro/Collect.h"
+#endif
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/autovector.h"
+#include "util/coro_utils.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace log {
+class Writer;
+}
+
+class BlobIndex;
+class Compaction;
+class LogBuffer;
+class LookupKey;
+class MemTable;
+class Version;
+class VersionSet;
+class WriteBufferManager;
+class MergeContext;
+class ColumnFamilySet;
+class MergeIteratorBuilder;
+class SystemClock;
+class ManifestTailer;
+class FilePickerMultiGet;
+
+// VersionEdit is always supposed to be valid and it is used to point at
+// entries in Manifest. Ideally it should not be used as a container to
+// carry around a few of its fields as function params because it can cause
+// readers to think it's a valid entry from Manifest. To avoid that confusion,
+// VersionEditParams is introduced to simply carry around multiple VersionEdit
+// params. It need not point to a valid record in Manifest.
+using VersionEditParams = VersionEdit;
+
+// Return the smallest index i such that file_level.files[i]->largest >= key.
+// Return file_level.num_files if there is no such file.
+// REQUIRES: "file_level.files" contains a sorted list of
+// non-overlapping files.
+extern int FindFile(const InternalKeyComparator& icmp,
+                    const LevelFilesBrief& file_level, const Slice& key);
+
+// Returns true iff some file in "files" overlaps the user key range
+// [*smallest,*largest].
+// smallest==nullptr represents a key smaller than all keys in the DB.
+// largest==nullptr represents a key larger than all keys in the DB.
+// REQUIRES: If disjoint_sorted_files, file_level.files[]
+// contains disjoint ranges in sorted order.
+extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
+                                  bool disjoint_sorted_files,
+                                  const LevelFilesBrief& file_level,
+                                  const Slice* smallest_user_key,
+                                  const Slice* largest_user_key);
+
+// Generate LevelFilesBrief from vector<FileMetaData*>
+// Would copy smallest_key and largest_key data to sequential memory
+// arena: Arena used to allocate the memory
+extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+                                      const std::vector<FileMetaData*>& files,
+                                      Arena* arena);
+enum EpochNumberRequirement {
+  kMightMissing,
+  kMustPresent,
+};
+
+// Information of the storage associated with each Version, including number of
+// levels of LSM tree, files information at each level, files marked for
+// compaction, blob files, etc.
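+// For intuition (editorial illustration, not upstream documentation), a
+// three-level Version might be tracked here as:
+//
+//   files_[0] = {12.sst, 9.sst}          // L0 files may overlap each other
+//   files_[1] = {3.sst, 7.sst}           // L1+: sorted, non-overlapping
+//   files_[2] = {2.sst, 4.sst, 5.sst}
+//   blob_files_ = {10.blob, 11.blob}     // sorted by blob file number
+//
+// plus derived data (compaction scores, file indexer, bottommost files)
+// computed by PrepareForVersionAppend()/ComputeCompactionScore().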
+class VersionStorageInfo { + public: + VersionStorageInfo(const InternalKeyComparator* internal_comparator, + const Comparator* user_comparator, int num_levels, + CompactionStyle compaction_style, + VersionStorageInfo* src_vstorage, + bool _force_consistency_checks, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent); + // No copying allowed + VersionStorageInfo(const VersionStorageInfo&) = delete; + void operator=(const VersionStorageInfo&) = delete; + ~VersionStorageInfo(); + + void Reserve(int level, size_t size) { files_[level].reserve(size); } + + void AddFile(int level, FileMetaData* f); + + // Resize/Initialize the space for compact_cursor_ + void ResizeCompactCursors(int level) { + compact_cursor_.resize(level, InternalKey()); + } + + const std::vector& GetCompactCursors() const { + return compact_cursor_; + } + + // REQUIRES: ResizeCompactCursors has been called + void AddCursorForOneLevel(int level, + const InternalKey& smallest_uncompacted_key) { + compact_cursor_[level] = smallest_uncompacted_key; + } + + // REQUIRES: lock is held + // Update the compact cursor and advance the file index using increment + // so that it can point to the next cursor (increment means the number of + // input files in this level of the last compaction) + const InternalKey& GetNextCompactCursor(int level, size_t increment) { + int cmp_idx = next_file_to_compact_by_size_[level] + (int)increment; + assert(cmp_idx <= (int)files_by_compaction_pri_[level].size()); + // TODO(zichen): may need to update next_file_to_compact_by_size_ + // for parallel compaction. + InternalKey new_cursor; + if (cmp_idx >= (int)files_by_compaction_pri_[level].size()) { + cmp_idx = 0; + } + // TODO(zichen): rethink if this strategy gives us some good guarantee + return files_[level][files_by_compaction_pri_[level][cmp_idx]]->smallest; + } + + void ReserveBlob(size_t size) { blob_files_.reserve(size); } + + void AddBlobFile(std::shared_ptr blob_file_meta); + + void PrepareForVersionAppend(const ImmutableOptions& immutable_options, + const MutableCFOptions& mutable_cf_options); + + // REQUIRES: PrepareForVersionAppend has been called + void SetFinalized(); + + // Update the accumulated stats from a file-meta. + void UpdateAccumulatedStats(FileMetaData* file_meta); + + // Decrease the current stat from a to-be-deleted file-meta + void RemoveCurrentStats(FileMetaData* file_meta); + + // Updates internal structures that keep track of compaction scores + // We use compaction scores to figure out which compaction to do next + // REQUIRES: db_mutex held!! + // TODO find a better way to pass compaction_options_fifo. 
+ void ComputeCompactionScore(const ImmutableOptions& immutable_options, + const MutableCFOptions& mutable_cf_options); + + // Estimate est_comp_needed_bytes_ + void EstimateCompactionBytesNeeded( + const MutableCFOptions& mutable_cf_options); + + // This computes files_marked_for_compaction_ and is called by + // ComputeCompactionScore() + void ComputeFilesMarkedForCompaction(); + + // This computes ttl_expired_files_ and is called by + // ComputeCompactionScore() + void ComputeExpiredTtlFiles(const ImmutableOptions& ioptions, + const uint64_t ttl); + + // This computes files_marked_for_periodic_compaction_ and is called by + // ComputeCompactionScore() + void ComputeFilesMarkedForPeriodicCompaction( + const ImmutableOptions& ioptions, + const uint64_t periodic_compaction_seconds); + + // This computes bottommost_files_marked_for_compaction_ and is called by + // ComputeCompactionScore() or UpdateOldestSnapshot(). + // + // Among bottommost files (assumes they've already been computed), marks the + // ones that have keys that would be eliminated if recompacted, according to + // the seqnum of the oldest existing snapshot. Must be called every time + // oldest snapshot changes as that is when bottom-level files can become + // eligible for compaction. + // + // REQUIRES: DB mutex held + void ComputeBottommostFilesMarkedForCompaction(); + + // This computes files_marked_for_forced_blob_gc_ and is called by + // ComputeCompactionScore() + // + // REQUIRES: DB mutex held + void ComputeFilesMarkedForForcedBlobGC( + double blob_garbage_collection_age_cutoff, + double blob_garbage_collection_force_threshold); + + bool level0_non_overlapping() const { return level0_non_overlapping_; } + + // Updates the oldest snapshot and related internal state, like the bottommost + // files marked for compaction. + // REQUIRES: DB mutex held + void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum); + + int MaxInputLevel() const; + int MaxOutputLevel(bool allow_ingest_behind) const; + + // Return level number that has idx'th highest score + int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; } + + // Return idx'th highest score + double CompactionScore(int idx) const { return compaction_score_[idx]; } + + void GetOverlappingInputs( + int level, const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys + std::vector* inputs, + int hint_index = -1, // index of overlap file + int* file_index = nullptr, // return index of overlap file + bool expand_range = true, // if set, returns files which overlap the + // range and overlap each other. 
If false,
+                                   // then just files intersecting the range
+      InternalKey** next_smallest = nullptr)  // if non-null, returns the
+      const;  // smallest key of next file not included
+  void GetCleanInputsWithinInterval(
+      int level, const InternalKey* begin,  // nullptr means before all keys
+      const InternalKey* end,               // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      int hint_index = -1,        // index of overlap file
+      int* file_index = nullptr)  // return index of overlap file
+      const;
+
+  void GetOverlappingInputsRangeBinarySearch(
+      int level,                 // level > 0
+      const InternalKey* begin,  // nullptr means before all keys
+      const InternalKey* end,    // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      int hint_index,   // index of overlap file
+      int* file_index,  // return index of overlap file
+      bool within_interval = false,  // if set, force the inputs within interval
+      InternalKey** next_smallest = nullptr)  // if non-null, returns the
+      const;  // smallest key of next file not included
+
+  // Returns true iff some file in the specified level overlaps
+  // some part of [*smallest_user_key,*largest_user_key].
+  // smallest_user_key==NULL represents a key smaller than all keys in the DB.
+  // largest_user_key==NULL represents a key larger than all keys in the DB.
+  bool OverlapInLevel(int level, const Slice* smallest_user_key,
+                      const Slice* largest_user_key);
+
+  // Returns true iff the first or last file in inputs contains
+  // an overlapping user key to the file "just outside" of it (i.e.
+  // just after the last file, or just before the first file)
+  // REQUIRES: "*inputs" is a sorted list of non-overlapping files
+  bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
+                             int level);
+
+  int num_levels() const { return num_levels_; }
+
+  // REQUIRES: PrepareForVersionAppend has been called
+  int num_non_empty_levels() const {
+    assert(finalized_);
+    return num_non_empty_levels_;
+  }
+
+  // REQUIRES: PrepareForVersionAppend has been called
+  // This may or may not return the number of level files. It is to keep
+  // backward compatible behavior in universal compaction.
+  int l0_delay_trigger_count() const { return l0_delay_trigger_count_; }
+
+  void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; }
+
+  // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+  int NumLevelFiles(int level) const {
+    assert(finalized_);
+    return static_cast<int>(files_[level].size());
+  }
+
+  // Return the combined file size of all files at the specified level.
+  uint64_t NumLevelBytes(int level) const;
+
+  // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+  const std::vector<FileMetaData*>& LevelFiles(int level) const {
+    return files_[level];
+  }
+
+  bool HasMissingEpochNumber() const;
+  uint64_t GetMaxEpochNumberOfFiles() const;
+  EpochNumberRequirement GetEpochNumberRequirement() const {
+    return epoch_number_requirement_;
+  }
+  void SetEpochNumberRequirement(
+      EpochNumberRequirement epoch_number_requirement) {
+    epoch_number_requirement_ = epoch_number_requirement;
+  }
+  void RecoverEpochNumbers(ColumnFamilyData* cfd);
+
+  class FileLocation {
+   public:
+    FileLocation() = default;
+    FileLocation(int level, size_t position)
+        : level_(level), position_(position) {}
+
+    int GetLevel() const { return level_; }
+    size_t GetPosition() const { return position_; }
+
+    bool IsValid() const { return level_ >= 0; }
+
+    bool operator==(const FileLocation& rhs) const {
+      return level_ == rhs.level_ && position_ == rhs.position_;
+    }
+
+    bool operator!=(const FileLocation& rhs) const { return !(*this == rhs); }
+
+    static FileLocation Invalid() { return FileLocation(); }
+
+   private:
+    int level_ = -1;
+    size_t position_ = 0;
+  };
+
+  // REQUIRES: PrepareForVersionAppend has been called
+  FileLocation GetFileLocation(uint64_t file_number) const {
+    const auto it = file_locations_.find(file_number);
+
+    if (it == file_locations_.end()) {
+      return FileLocation::Invalid();
+    }
+
+    assert(it->second.GetLevel() < num_levels_);
+    assert(it->second.GetPosition() < files_[it->second.GetLevel()].size());
+    assert(files_[it->second.GetLevel()][it->second.GetPosition()]);
+    assert(files_[it->second.GetLevel()][it->second.GetPosition()]
+               ->fd.GetNumber() == file_number);
+
+    return it->second;
+  }
+
+  // REQUIRES: PrepareForVersionAppend has been called
+  FileMetaData* GetFileMetaDataByNumber(uint64_t file_number) const {
+    auto location = GetFileLocation(file_number);
+
+    if (!location.IsValid()) {
+      return nullptr;
+    }
+
+    return files_[location.GetLevel()][location.GetPosition()];
+  }
+
+  // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+  using BlobFiles = std::vector<std::shared_ptr<BlobFileMetaData>>;
+  const BlobFiles& GetBlobFiles() const { return blob_files_; }
+
+  // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+  BlobFiles::const_iterator GetBlobFileMetaDataLB(
+      uint64_t blob_file_number) const;
+
+  // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+  std::shared_ptr<BlobFileMetaData> GetBlobFileMetaData(
+      uint64_t blob_file_number) const {
+    const auto it = GetBlobFileMetaDataLB(blob_file_number);
+
+    assert(it == blob_files_.end() || *it);
+
+    if (it != blob_files_.end() &&
+        (*it)->GetBlobFileNumber() == blob_file_number) {
+      return *it;
+    }
+
+    return std::shared_ptr<BlobFileMetaData>();
+  }
+
+  // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+  struct BlobStats {
+    uint64_t total_file_size = 0;
+    uint64_t total_garbage_size = 0;
+    double space_amp = 0.0;
+  };
+
+  BlobStats GetBlobStats() const {
+    uint64_t total_file_size = 0;
+    uint64_t total_garbage_size = 0;
+
+    for (const auto& meta : blob_files_) {
+      assert(meta);
+
+      total_file_size += meta->GetBlobFileSize();
+      total_garbage_size += meta->GetGarbageBlobBytes();
+    }
+
+    double space_amp = 0.0;
+    if (total_file_size > total_garbage_size) {
+      space_amp = static_cast<double>(total_file_size) /
+                  (total_file_size - total_garbage_size);
+    }
+
+    return BlobStats{total_file_size, total_garbage_size, space_amp};
+  }
+
+  const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level)
+      const {
+    assert(level < static_cast<int>(level_files_brief_.size()));
+    return level_files_brief_[level];
+  }
+
+  // REQUIRES: PrepareForVersionAppend has been called
+  const std::vector<int>& FilesByCompactionPri(int level) const {
+    assert(finalized_);
+    return files_by_compaction_pri_[level];
+  }
+
+  // REQUIRES: ComputeCompactionScore has been called
+  // REQUIRES: DB mutex held during access
+  const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction()
+      const {
+    assert(finalized_);
+    return files_marked_for_compaction_;
+  }
+
+  void TEST_AddFileMarkedForCompaction(int level, FileMetaData* f) {
+    f->marked_for_compaction = true;
+    files_marked_for_compaction_.emplace_back(level, f);
+  }
+
+  // REQUIRES: ComputeCompactionScore has been called
+  // REQUIRES: DB mutex held during access
+  const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const {
+    assert(finalized_);
+    return expired_ttl_files_;
+  }
+
+  // REQUIRES: ComputeCompactionScore has been called
+  // REQUIRES: DB mutex held during access
+  const autovector<std::pair<int, FileMetaData*>>&
+  FilesMarkedForPeriodicCompaction() const {
+    assert(finalized_);
+    return files_marked_for_periodic_compaction_;
+  }
+
+  void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) {
+    files_marked_for_periodic_compaction_.emplace_back(level, f);
+  }
+
+  // REQUIRES: ComputeCompactionScore has been called
+  // REQUIRES: DB mutex held during access
+  const autovector<std::pair<int, FileMetaData*>>&
+  BottommostFilesMarkedForCompaction() const {
+    assert(finalized_);
+    return bottommost_files_marked_for_compaction_;
+  }
+
+  // REQUIRES: ComputeCompactionScore has been called
+  // REQUIRES: DB mutex held during access
+  const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForForcedBlobGC()
+      const {
+    assert(finalized_);
+    return files_marked_for_forced_blob_gc_;
+  }
+
+  int base_level() const { return base_level_; }
+  double level_multiplier() const { return level_multiplier_; }
+
+  // REQUIRES: lock is held
+  // Set the index that is used to offset into files_by_compaction_pri_ to find
+  // the next compaction candidate file.
+  void SetNextCompactionIndex(int level, int index) {
+    next_file_to_compact_by_size_[level] = index;
+  }
+
+  // REQUIRES: lock is held
+  int NextCompactionIndex(int level) const {
+    return next_file_to_compact_by_size_[level];
+  }
+
+  // REQUIRES: PrepareForVersionAppend has been called
+  const FileIndexer& file_indexer() const {
+    assert(finalized_);
+    return file_indexer_;
+  }
+
+  // Only the first few entries of files_by_compaction_pri_ are sorted.
+  // There is no need to sort all the files because it is likely
+  // that on a running system, we need to look at only the first
+  // few largest files because a new version is created every few
+  // seconds/minutes (because of concurrent compactions).
+  static const size_t kNumberFilesToSort = 50;
+
+  // Return a human-readable short (single-line) summary of the number
+  // of files per level. Uses *scratch as backing store.
+  struct LevelSummaryStorage {
+    char buffer[1000];
+  };
+  struct FileSummaryStorage {
+    char buffer[3000];
+  };
+  const char* LevelSummary(LevelSummaryStorage* scratch) const;
+  // Return a human-readable short (single-line) summary of files
+  // in a specified level. Uses *scratch as backing store.
+  const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  uint64_t MaxNextLevelOverlappingBytes();
+
+  // Return a human readable string that describes this version's contents.
+  std::string DebugString(bool hex = false) const;
+
+  uint64_t GetAverageValueSize() const {
+    if (accumulated_num_non_deletions_ == 0) {
+      return 0;
+    }
+    assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
+    assert(accumulated_file_size_ > 0);
+    return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
+           accumulated_file_size_ /
+           (accumulated_raw_key_size_ + accumulated_raw_value_size_);
+  }
+
+  uint64_t GetEstimatedActiveKeys() const;
+
+  double GetEstimatedCompressionRatioAtLevel(int level) const;
+
+  // Re-initializes the index that is used to offset into
+  // files_by_compaction_pri_ to find the next compaction candidate file.
+  void ResetNextCompactionIndex(int level) {
+    next_file_to_compact_by_size_[level] = 0;
+  }
+
+  const InternalKeyComparator* InternalComparator() const {
+    return internal_comparator_;
+  }
+
+  // Returns maximum total bytes of data on a given level.
+  uint64_t MaxBytesForLevel(int level) const;
+
+  // Returns an estimate of the amount of live data in bytes.
+  uint64_t EstimateLiveDataSize() const;
+
+  uint64_t estimated_compaction_needed_bytes() const {
+    return estimated_compaction_needed_bytes_;
+  }
+
+  void TEST_set_estimated_compaction_needed_bytes(uint64_t v) {
+    estimated_compaction_needed_bytes_ = v;
+  }
+
+  bool force_consistency_checks() const { return force_consistency_checks_; }
+
+  SequenceNumber bottommost_files_mark_threshold() const {
+    return bottommost_files_mark_threshold_;
+  }
+
+  // Returns whether any key in [`smallest_user_key`, `largest_user_key`]
+  // could appear in an older L0 file than `last_l0_idx` or in a greater level
+  // than `last_level`
+  //
+  // @param last_level Level after which we check for overlap
+  // @param last_l0_idx If `last_level == 0`, index of L0 file after which we
+  // check for overlap; otherwise, must be -1
+  bool RangeMightExistAfterSortedRun(const Slice& smallest_user_key,
+                                     const Slice& largest_user_key,
+                                     int last_level, int last_l0_idx);
+
+ private:
+  void ComputeCompensatedSizes();
+  void UpdateNumNonEmptyLevels();
+  void CalculateBaseBytes(const ImmutableOptions& ioptions,
+                          const MutableCFOptions& options);
+  void UpdateFilesByCompactionPri(const ImmutableOptions& immutable_options,
+                                  const MutableCFOptions& mutable_cf_options);
+
+  void GenerateFileIndexer() {
+    file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
+  }
+
+  void GenerateLevelFilesBrief();
+  void GenerateLevel0NonOverlapping();
+  void GenerateBottommostFiles();
+  void GenerateFileLocationIndex();
+
+  const InternalKeyComparator* internal_comparator_;
+  const Comparator* user_comparator_;
+  int num_levels_;            // Number of levels
+  int num_non_empty_levels_;  // Number of levels. Any level larger than it
+                              // is guaranteed to be empty.
+  // Per-level max bytes
+  std::vector<uint64_t> level_max_bytes_;
+
+  // A short brief metadata of files per level
+  autovector<ROCKSDB_NAMESPACE::LevelFilesBrief> level_files_brief_;
+  FileIndexer file_indexer_;
+  Arena arena_;  // Used to allocate space for file_levels_
+
+  CompactionStyle compaction_style_;
+
+  // List of files per level, files in each level are arranged
+  // in increasing order of keys
+  std::vector<FileMetaData*>* files_;
+
+  // Map of all table files in version. Maps file number to (level, position on
+  // level).
+  using FileLocations = UnorderedMap<uint64_t, FileLocation>;
+  FileLocations file_locations_;
+
+  // Vector of blob files in version sorted by blob file number.
+  BlobFiles blob_files_;
+
+  // Level that L0 data should be compacted to. All levels < base_level_ should
+  // be empty.
+  // -1 if it is not level-based compaction, in which case it is not
+  // applicable.
+  int base_level_;
+
+  // Applies to level compaction when
+  // `level_compaction_dynamic_level_bytes=true`. All non-empty levels <=
+  // lowest_unnecessary_level_ are not needed and will be drained
+  // automatically. -1 if there is no unnecessary level.
+  int lowest_unnecessary_level_;
+
+  double level_multiplier_;
+
+  // A list for the same set of files that are stored in files_,
+  // but files in each level are now sorted based on file
+  // size. The file with the largest size is at the front.
+  // This vector stores the index of the file from files_.
+  std::vector<std::vector<int>> files_by_compaction_pri_;
+
+  // If true, means that files in L0 have keys with non-overlapping ranges
+  bool level0_non_overlapping_;
+
+  // An index into files_by_compaction_pri_ that specifies the first
+  // file that is not yet compacted
+  std::vector<int> next_file_to_compact_by_size_;
+
+  // Only the first few entries of files_by_compaction_pri_ are sorted.
+  // There is no need to sort all the files because it is likely
+  // that on a running system, we need to look at only the first
+  // few largest files because a new version is created every few
+  // seconds/minutes (because of concurrent compactions).
+  static const size_t number_of_files_to_sort_ = 50;
+
+  // This vector contains the list of files that are marked for compaction and
+  // are not currently being compacted. It is protected by the DB mutex. It is
+  // calculated in ComputeCompactionScore().
+  autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;
+
+  autovector<std::pair<int, FileMetaData*>> expired_ttl_files_;
+
+  autovector<std::pair<int, FileMetaData*>>
+      files_marked_for_periodic_compaction_;
+
+  // These files are considered bottommost because none of their keys can exist
+  // at lower levels. They are not necessarily all in the same level. The
+  // marked ones are eligible for compaction because they contain duplicate key
+  // versions that are no longer protected by a snapshot. These variables are
+  // protected by the DB mutex and are calculated in
+  // `GenerateBottommostFiles()` and
+  // `ComputeBottommostFilesMarkedForCompaction()`.
+  autovector<std::pair<int, FileMetaData*>> bottommost_files_;
+  autovector<std::pair<int, FileMetaData*>>
+      bottommost_files_marked_for_compaction_;
+
+  autovector<std::pair<int, FileMetaData*>> files_marked_for_forced_blob_gc_;
+
+  // Threshold for needing to mark another bottommost file. Maintain it so we
+  // can quickly check when releasing a snapshot whether more bottommost files
+  // became eligible for compaction. It's defined as the min of the max nonzero
+  // seqnums of unmarked bottommost files.
+  SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+  // Monotonically increases as we release old snapshots. Zero indicates no
+  // snapshots have been released yet. When no snapshots remain we set it to
+  // the current seqnum, which needs to be protected as a snapshot can still
+  // be created that references it.
+  SequenceNumber oldest_snapshot_seqnum_ = 0;
+
+  // Level that should be compacted next and its compaction score.
+  // Score < 1 means compaction is not strictly needed. These fields
+  // are initialized by ComputeCompactionScore.
+  // The most critical level to be compacted is listed first.
+  // These are used to pick the best compaction level.
+  std::vector<double> compaction_score_;
+  std::vector<int> compaction_level_;
+  int l0_delay_trigger_count_ = 0;  // Count used to trigger slow down and
+                                    // stop for number of L0 files.
+
+  // Compact cursors for round-robin compactions in each level
+  std::vector<InternalKey> compact_cursor_;
+
+  // the following are the sampled temporary stats.
+  // the current accumulated size of sampled files.
+  uint64_t accumulated_file_size_;
+  // the current accumulated size of all raw keys based on the sampled files.
+  uint64_t accumulated_raw_key_size_;
+  // the current accumulated size of all raw values based on the sampled files.
+  uint64_t accumulated_raw_value_size_;
+  // total number of non-deletion entries
+  uint64_t accumulated_num_non_deletions_;
+  // total number of deletion entries
+  uint64_t accumulated_num_deletions_;
+  // current number of non-deletion entries
+  uint64_t current_num_non_deletions_;
+  // current number of deletion entries
+  uint64_t current_num_deletions_;
+  // current number of file samples
+  uint64_t current_num_samples_;
+  // Estimated bytes needed to be compacted until all levels' size is down to
+  // target sizes.
+  uint64_t estimated_compaction_needed_bytes_;
+
+  bool finalized_;
+
+  // If set to true, we will run consistency checks even if RocksDB
+  // is compiled in release mode
+  bool force_consistency_checks_;
+
+  EpochNumberRequirement epoch_number_requirement_;
+
+  friend class Version;
+  friend class VersionSet;
+};
+
+struct ObsoleteFileInfo {
+  FileMetaData* metadata;
+  std::string path;
+  // If true, the FileMetaData should be destroyed but the file should
+  // not be deleted. This is because another FileMetaData still references
+  // the file, usually because the file was trivially moved so two
+  // FileMetaData objects are managing the file.
+  bool only_delete_metadata = false;
+
+  ObsoleteFileInfo() noexcept
+      : metadata(nullptr), only_delete_metadata(false) {}
+  ObsoleteFileInfo(FileMetaData* f, const std::string& file_path,
+                   std::shared_ptr<CacheReservationManager>
+                       file_metadata_cache_res_mgr_arg = nullptr)
+      : metadata(f),
+        path(file_path),
+        only_delete_metadata(false),
+        file_metadata_cache_res_mgr(file_metadata_cache_res_mgr_arg) {}
+
+  ObsoleteFileInfo(const ObsoleteFileInfo&) = delete;
+  ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete;
+
+  ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept : ObsoleteFileInfo() {
+    *this = std::move(rhs);
+  }
+
+  ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept {
+    path = std::move(rhs.path);
+    metadata = rhs.metadata;
+    rhs.metadata = nullptr;
+    file_metadata_cache_res_mgr = rhs.file_metadata_cache_res_mgr;
+    rhs.file_metadata_cache_res_mgr = nullptr;
+
+    return *this;
+  }
+  void DeleteMetadata() {
+    if (file_metadata_cache_res_mgr) {
+      Status s = file_metadata_cache_res_mgr->UpdateCacheReservation(
+          metadata->ApproximateMemoryUsage(), false /* increase */);
+      s.PermitUncheckedError();
+    }
+    delete metadata;
+    metadata = nullptr;
+  }
+
+ private:
+  std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr;
+};
+
+class ObsoleteBlobFileInfo {
+ public:
+  ObsoleteBlobFileInfo(uint64_t blob_file_number, std::string path)
+      : blob_file_number_(blob_file_number), path_(std::move(path)) {}
+
+  uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+  const std::string& GetPath() const { return path_; }
+
+ private:
+  uint64_t blob_file_number_;
+  std::string path_;
+};
+
+using MultiGetRange = MultiGetContext::Range;
+// A column family's version consists of the table and blob files owned by
+// the column family at a certain point in time.
+class Version {
+ public:
+  // Append to *iters a sequence of iterators that will
+  // yield the contents of this Version when merged together.
+  // @param read_options Must outlive any iterator built by
+  // `merger_iter_builder`.
+ void AddIterators(const ReadOptions& read_options, + const FileOptions& soptions, + MergeIteratorBuilder* merger_iter_builder, + bool allow_unprepared_value); + + // @param read_options Must outlive any iterator built by + // `merger_iter_builder`. + void AddIteratorsForLevel(const ReadOptions& read_options, + const FileOptions& soptions, + MergeIteratorBuilder* merger_iter_builder, + int level, bool allow_unprepared_value); + + Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&, + const Slice& smallest_user_key, + const Slice& largest_user_key, int level, + bool* overlap); + + // Lookup the value for key or get all merge operands for key. + // If do_merge = true (default) then lookup value for key. + // Behavior if do_merge = true: + // If found, store it in *value and + // return OK. Else return a non-OK status. + // Uses *operands to store merge_operator operations to apply later. + // + // If the ReadOptions.read_tier is set to do a read-only fetch, then + // *value_found will be set to false if it cannot be determined whether + // this value exists without doing IO. + // + // If the key is Deleted, *status will be set to NotFound and + // *key_exists will be set to true. + // If no key was found, *status will be set to NotFound and + // *key_exists will be set to false. + // If seq is non-null, *seq will be set to the sequence number found + // for the key if a key was found. + // Behavior if do_merge = false + // If the key has any merge operands then store them in + // merge_context.operands_list and don't merge the operands + // REQUIRES: lock is not held + // REQUIRES: pinned_iters_mgr != nullptr + void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, + PinnableWideColumns* columns, std::string* timestamp, Status* status, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + PinnedIteratorsManager* pinned_iters_mgr, + bool* value_found = nullptr, bool* key_exists = nullptr, + SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, + bool* is_blob = nullptr, bool do_merge = true); + + void MultiGet(const ReadOptions&, MultiGetRange* range, + ReadCallback* callback = nullptr); + + // Interprets blob_index_slice as a blob reference, and (assuming the + // corresponding blob file is part of this Version) retrieves the blob and + // saves it in *value. + // REQUIRES: blob_index_slice stores an encoded blob reference + Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + const Slice& blob_index_slice, + FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value, + uint64_t* bytes_read) const; + + // Retrieves a blob using a blob reference and saves it in *value, + // assuming the corresponding blob file is part of this Version. + Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + const BlobIndex& blob_index, + FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value, + uint64_t* bytes_read) const; + + struct BlobReadContext { + BlobReadContext(const BlobIndex& blob_idx, const KeyContext* key_ctx) + : blob_index(blob_idx), key_context(key_ctx) {} + + BlobIndex blob_index; + const KeyContext* key_context; + PinnableSlice result; + }; + + using BlobReadContexts = std::vector; + void MultiGetBlob(const ReadOptions& read_options, MultiGetRange& range, + std::unordered_map& blob_ctxs); + + // Loads some stats information from files (if update_stats is set) and + // populates derived data structures. Call without mutex held. 
+  // It needs to be called before appending the version to the version set.
+  void PrepareAppend(const MutableCFOptions& mutable_cf_options,
+                     const ReadOptions& read_options, bool update_stats);
+
+  // Reference count management (so Versions do not disappear out from
+  // under live iterators)
+  void Ref();
+  // Decrease reference count. Delete the object if no reference left
+  // and return true. Otherwise, return false.
+  bool Unref();
+
+  // Add all files listed in the current version to *live_table_files and
+  // *live_blob_files.
+  void AddLiveFiles(std::vector<uint64_t>* live_table_files,
+                    std::vector<uint64_t>* live_blob_files) const;
+
+  // Remove live files that are in the delete candidate lists.
+  void RemoveLiveFiles(
+      std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+      std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const;
+
+  // Return a human readable string that describes this version's contents.
+  std::string DebugString(bool hex = false, bool print_stats = false) const;
+
+  // Returns the version number of this version
+  uint64_t GetVersionNumber() const { return version_number_; }
+
+  // REQUIRES: lock is held
+  // On success, "tp" will contain the table properties of the file
+  // specified in "file_meta". If the file name of "file_meta" is
+  // known ahead, passing it by a non-null "fname" can save a
+  // file-name conversion.
+  Status GetTableProperties(const ReadOptions& read_options,
+                            std::shared_ptr<const TableProperties>* tp,
+                            const FileMetaData* file_meta,
+                            const std::string* fname = nullptr) const;
+
+  // REQUIRES: lock is held
+  // On success, *props will be populated with all SSTables' table properties.
+  // The keys of `props` are the sst file name, the values of `props` are the
+  // tables' properties, represented as
+  // std::shared_ptr<const TableProperties>.
+  Status GetPropertiesOfAllTables(const ReadOptions& read_options,
+                                  TablePropertiesCollection* props);
+  Status GetPropertiesOfAllTables(const ReadOptions& read_options,
+                                  TablePropertiesCollection* props, int level);
+  Status GetPropertiesOfTablesInRange(const ReadOptions& read_options,
+                                      const Range* range, std::size_t n,
+                                      TablePropertiesCollection* props) const;
+
+  // Print summary of range delete tombstones in SST files into out_str,
+  // with maximum max_entries_to_print entries printed out.
+  Status TablesRangeTombstoneSummary(int max_entries_to_print,
+                                     std::string* out_str);
+
+  // REQUIRES: lock is held
+  // On success, "tp" will contain the aggregated table property among
+  // the table properties of all sst files in this version.
+  Status GetAggregatedTableProperties(
+      const ReadOptions& read_options,
+      std::shared_ptr<const TableProperties>* tp, int level = -1);
+
+  uint64_t GetEstimatedActiveKeys() {
+    return storage_info_.GetEstimatedActiveKeys();
+  }
+
+  size_t GetMemoryUsageByTableReaders(const ReadOptions& read_options);
+
+  ColumnFamilyData* cfd() const { return cfd_; }
+
+  // Return the next Version in the linked list.
+  Version* Next() const { return next_; }
+
+  int TEST_refs() const { return refs_; }
+
+  VersionStorageInfo* storage_info() { return &storage_info_; }
+  const VersionStorageInfo* storage_info() const { return &storage_info_; }
+
+  VersionSet* version_set() { return vset_; }
+
+  void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
+
+  void GetSstFilesBoundaryKeys(Slice* smallest_user_key,
+                               Slice* largest_user_key);
+
+  uint64_t GetSstFilesSize();
+
+  // Retrieves the file_creation_time of the oldest file in the DB.
+ // Prerequisite for this API is max_open_files = -1 + void GetCreationTimeOfOldestFile(uint64_t* creation_time); + + const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; } + + InternalIterator* TEST_GetLevelIterator( + const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder, + int level, bool allow_unprepared_value); + + private: + Env* env_; + SystemClock* clock_; + + friend class ReactiveVersionSet; + friend class VersionSet; + friend class VersionEditHandler; + friend class VersionEditHandlerPointInTime; + + const InternalKeyComparator* internal_comparator() const { + return storage_info_.internal_comparator_; + } + const Comparator* user_comparator() const { + return storage_info_.user_comparator_; + } + + // Returns true if the filter blocks in the specified level will not be + // checked during read operations. In certain cases (trivial move or preload), + // the filter block may already be cached, but we still do not access it such + // that it eventually expires from the cache. + bool IsFilterSkipped(int level, bool is_file_last_in_level = false); + + // The helper function of UpdateAccumulatedStats, which may fill the missing + // fields of file_meta from its associated TableProperties. + // Returns true if it does initialize FileMetaData. + bool MaybeInitializeFileMetaData(const ReadOptions& read_options, + FileMetaData* file_meta); + + // Update the accumulated stats associated with the current version. + // This accumulated stats will be used in compaction. + void UpdateAccumulatedStats(const ReadOptions& read_options); + + DECLARE_SYNC_AND_ASYNC( + /* ret_type */ Status, /* func_name */ MultiGetFromSST, + const ReadOptions& read_options, MultiGetRange file_range, + int hit_file_level, bool skip_filters, bool skip_range_deletions, + FdWithKeyRange* f, + std::unordered_map& blob_ctxs, + TableCache::TypedHandle* table_handle, uint64_t& num_filter_read, + uint64_t& num_index_read, uint64_t& num_sst_read); + +#ifdef USE_COROUTINES + // MultiGet using async IO to read data blocks from SST files in parallel + // within and across levels + Status MultiGetAsync( + const ReadOptions& options, MultiGetRange* range, + std::unordered_map* blob_ctxs); + + // A helper function to lookup a batch of keys in a single level. It will + // queue coroutine tasks to mget_tasks. It may also split the input batch + // by creating a new batch with keys definitely not in this level and + // enqueuing it to to_process. + Status ProcessBatch( + const ReadOptions& read_options, FilePickerMultiGet* batch, + std::vector>& mget_tasks, + std::unordered_map* blob_ctxs, + autovector& batches, std::deque& waiting, + std::deque& to_process, unsigned int& num_tasks_queued, + std::unordered_map>& + mget_stats); +#endif + + ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs + Logger* info_log_; + Statistics* db_statistics_; + TableCache* table_cache_; + BlobSource* blob_source_; + const MergeOperator* merge_operator_; + + VersionStorageInfo storage_info_; + VersionSet* vset_; // VersionSet to which this Version belongs + Version* next_; // Next version in linked list + Version* prev_; // Previous version in linked list + int refs_; // Number of live refs to this version + const FileOptions file_options_; + const MutableCFOptions mutable_cf_options_; + // Cached value to avoid recomputing it on every read. + const size_t max_file_size_for_l0_meta_pin_; + + // A version number that uniquely represents this version. 
This is + // used for debugging and logging purposes only. + uint64_t version_number_; + std::shared_ptr io_tracer_; + bool use_async_io_; + + Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt, + MutableCFOptions mutable_cf_options, + const std::shared_ptr& io_tracer, + uint64_t version_number = 0, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent); + + ~Version(); + + // No copying allowed + Version(const Version&) = delete; + void operator=(const Version&) = delete; +}; + +class BaseReferencedVersionBuilder; + +class AtomicGroupReadBuffer { + public: + AtomicGroupReadBuffer() = default; + Status AddEdit(VersionEdit* edit); + void Clear(); + bool IsFull() const; + bool IsEmpty() const; + + uint64_t TEST_read_edits_in_atomic_group() const { + return read_edits_in_atomic_group_; + } + std::vector& replay_buffer() { return replay_buffer_; } + + private: + uint64_t read_edits_in_atomic_group_ = 0; + std::vector replay_buffer_; +}; + +// VersionSet is the collection of versions of all the column families of the +// database. Each database owns one VersionSet. A VersionSet has access to all +// column families via ColumnFamilySet, i.e. set of the column families. +class VersionSet { + public: + VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options, + const FileOptions& file_options, Cache* table_cache, + WriteBufferManager* write_buffer_manager, + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_id, const std::string& db_session_id); + // No copying allowed + VersionSet(const VersionSet&) = delete; + void operator=(const VersionSet&) = delete; + + virtual ~VersionSet(); + + Status LogAndApplyToDefaultColumnFamily( + const ReadOptions& read_options, VersionEdit* edit, InstrumentedMutex* mu, + FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = nullptr) { + ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault(); + const MutableCFOptions* cf_options = + default_cf->GetLatestMutableCFOptions(); + return LogAndApply(default_cf, *cf_options, read_options, edit, mu, + dir_contains_current_file, new_descriptor_log, + column_family_options); + } + + // Apply *edit to the current version to form a new descriptor that + // is both saved to persistent state and installed as the new + // current version. Will release *mu while actually writing to the file. + // column_family_options has to be set if edit is column family add + // REQUIRES: *mu is held on entry. + // REQUIRES: no other thread concurrently calls LogAndApply() + Status LogAndApply( + ColumnFamilyData* column_family_data, + const MutableCFOptions& mutable_cf_options, + const ReadOptions& read_options, VersionEdit* edit, InstrumentedMutex* mu, + FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = nullptr) { + autovector cfds; + cfds.emplace_back(column_family_data); + autovector mutable_cf_options_list; + mutable_cf_options_list.emplace_back(&mutable_cf_options); + autovector> edit_lists; + autovector edit_list; + edit_list.emplace_back(edit); + edit_lists.emplace_back(edit_list); + return LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, + mu, dir_contains_current_file, new_descriptor_log, + column_family_options); + } + // The batch version. 
+  // If edit_list.size() > 1, the caller must ensure that no edit in the
+  // list is a column family add or drop.
+  Status LogAndApply(
+      ColumnFamilyData* column_family_data,
+      const MutableCFOptions& mutable_cf_options,
+      const ReadOptions& read_options,
+      const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
+      FSDirectory* dir_contains_current_file, bool new_descriptor_log = false,
+      const ColumnFamilyOptions* column_family_options = nullptr,
+      const std::function<void(const Status&)>& manifest_wcb = {}) {
+    autovector<ColumnFamilyData*> cfds;
+    cfds.emplace_back(column_family_data);
+    autovector<const MutableCFOptions*> mutable_cf_options_list;
+    mutable_cf_options_list.emplace_back(&mutable_cf_options);
+    autovector<autovector<VersionEdit*>> edit_lists;
+    edit_lists.emplace_back(edit_list);
+    return LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists,
+                       mu, dir_contains_current_file, new_descriptor_log,
+                       column_family_options, {manifest_wcb});
+  }
+
+  // The across-multi-cf batch version. If edit_lists contains more than
+  // one version edit, the caller must ensure that no edit in the list is a
+  // column family manipulation.
+  virtual Status LogAndApply(
+      const autovector<ColumnFamilyData*>& cfds,
+      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+      const ReadOptions& read_options,
+      const autovector<autovector<VersionEdit*>>& edit_lists,
+      InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+      bool new_descriptor_log = false,
+      const ColumnFamilyOptions* new_cf_options = nullptr,
+      const std::vector<std::function<void(const Status&)>>& manifest_wcbs =
+          {});
+
+  static Status GetCurrentManifestPath(const std::string& dbname,
+                                       FileSystem* fs,
+                                       std::string* manifest_filename,
+                                       uint64_t* manifest_file_number);
+  void WakeUpWaitingManifestWriters();
+
+  // Recover the last saved descriptor (MANIFEST) from persistent storage.
+  // If read_only == true, Recover() will not complain if some column families
+  // are not opened
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 bool read_only = false, std::string* db_id = nullptr,
+                 bool no_error_if_files_missing = false);
+
+  Status TryRecover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                    bool read_only,
+                    const std::vector<std::string>& files_in_dbname,
+                    std::string* db_id, bool* has_missing_table_file);
+
+  // Try to recover the version set to the most recent consistent state
+  // recorded in the specified manifest.
+  Status TryRecoverFromOneManifest(
+      const std::string& manifest_path,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      bool read_only, std::string* db_id, bool* has_missing_table_file);
+
+  // Recover the next epoch number of each CF and the epoch number
+  // of their files (if missing)
+  void RecoverEpochNumbers();
+
+  // Reads a manifest file and returns a list of column families in
+  // column_families.
+  static Status ListColumnFamilies(std::vector<std::string>* column_families,
+                                   const std::string& dbname, FileSystem* fs);
+  static Status ListColumnFamiliesFromManifest(
+      const std::string& manifest_path, FileSystem* fs,
+      std::vector<std::string>* column_families);
+
+  // Try to reduce the number of levels. This call is valid only when a single
+  // level among the levels from the new max level to the old max level
+  // contains files.
+  // The call is static, since the number of levels is immutable during
+  // the lifetime of a RocksDB instance. It reduces the number of levels
+  // in a DB by applying changes to the manifest.
+  // For example, if a db currently has 7 levels [0-6], a call to reduce
+  // to 5 [0-4] can only be executed when only one level
+  // among [4-6] contains files.
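+  // (Editorial note: this machinery is what the ldb tool's reduce_levels
+  // subcommand drives, e.g. something like
+  //   ldb reduce_levels --db=/path/to/db --new_levels=5
+  // The exact flag spelling is an assumption; consult the tool's help text.)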
+ static Status ReduceNumberOfLevels(const std::string& dbname, + const Options* options, + const FileOptions& file_options, + int new_levels); + + // Get the checksum information of all live files + Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list); + + // printf contents (for debugging) + Status DumpManifest(Options& options, std::string& manifestFileName, + bool verbose, bool hex = false, bool json = false); + + + const std::string& DbSessionId() const { return db_session_id_; } + + // Return the current manifest file number + uint64_t manifest_file_number() const { return manifest_file_number_; } + + uint64_t options_file_number() const { return options_file_number_; } + + uint64_t pending_manifest_file_number() const { + return pending_manifest_file_number_; + } + + uint64_t current_next_file_number() const { return next_file_number_.load(); } + + uint64_t min_log_number_to_keep() const { + return min_log_number_to_keep_.load(); + } + + // Allocate and return a new file number + uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); } + + // Fetch And Add n new file number + uint64_t FetchAddFileNumber(uint64_t n) { + return next_file_number_.fetch_add(n); + } + + // Return the last sequence number. + uint64_t LastSequence() const { + return last_sequence_.load(std::memory_order_acquire); + } + + // Note: memory_order_acquire must be sufficient. + uint64_t LastAllocatedSequence() const { + return last_allocated_sequence_.load(std::memory_order_seq_cst); + } + + // Note: memory_order_acquire must be sufficient. + uint64_t LastPublishedSequence() const { + return last_published_sequence_.load(std::memory_order_seq_cst); + } + + // Set the last sequence number to s. + void SetLastSequence(uint64_t s) { + assert(s >= last_sequence_); + // Last visible sequence must always be less than last written seq + assert(!db_options_->two_write_queues || s <= last_allocated_sequence_); + last_sequence_.store(s, std::memory_order_release); + } + + // Note: memory_order_release must be sufficient + void SetLastPublishedSequence(uint64_t s) { + assert(s >= last_published_sequence_); + last_published_sequence_.store(s, std::memory_order_seq_cst); + } + + // Note: memory_order_release must be sufficient + void SetLastAllocatedSequence(uint64_t s) { + assert(s >= last_allocated_sequence_); + last_allocated_sequence_.store(s, std::memory_order_seq_cst); + } + + // Note: memory_order_release must be sufficient + uint64_t FetchAddLastAllocatedSequence(uint64_t s) { + return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst); + } + + // Mark the specified file number as used. + // REQUIRED: this is only called during single-threaded recovery or repair. + void MarkFileNumberUsed(uint64_t number); + + // Mark the specified log number as deleted + // REQUIRED: this is only called during single-threaded recovery or repair, or + // from ::LogAndApply where the global mutex is held. + void MarkMinLogNumberToKeep(uint64_t number); + + // Return the log file number for the log file that is currently + // being compacted, or zero if there is no such log file. + uint64_t prev_log_number() const { return prev_log_number_; } + + // Returns the minimum log number which still has data not flushed to any SST + // file. + // In non-2PC mode, all the log numbers smaller than this number can be safely + // deleted, although we still use `min_log_number_to_keep_` to determine when + // to delete a WAL file. 
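+  //
+  // Illustrative sketch (editor's addition): deciding whether a WAL still
+  // holds data that must be kept, where `versions` is a hypothetical
+  // VersionSet* and `wal_number` a hypothetical local:
+  //
+  //   if (wal_number < versions->MinLogNumberWithUnflushedData()) {
+  //     // every column family has flushed past this WAL
+  //   }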
+ uint64_t MinLogNumberWithUnflushedData() const { + return PreComputeMinLogNumberWithUnflushedData(nullptr); + } + + // Returns the minimum log number which still has data not flushed to any SST + // file. + // Empty column families' log number is considered to be + // new_log_number_for_empty_cf. + uint64_t PreComputeMinLogNumberWithUnflushedData( + uint64_t new_log_number_for_empty_cf) const { + uint64_t min_log_num = std::numeric_limits::max(); + for (auto cfd : *column_family_set_) { + // It's safe to ignore dropped column families here: + // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. + uint64_t num = + cfd->IsEmpty() ? new_log_number_for_empty_cf : cfd->GetLogNumber(); + if (min_log_num > num && !cfd->IsDropped()) { + min_log_num = num; + } + } + return min_log_num; + } + // Returns the minimum log number which still has data not flushed to any SST + // file, except data from `cfd_to_skip`. + uint64_t PreComputeMinLogNumberWithUnflushedData( + const ColumnFamilyData* cfd_to_skip) const { + uint64_t min_log_num = std::numeric_limits::max(); + for (auto cfd : *column_family_set_) { + if (cfd == cfd_to_skip) { + continue; + } + // It's safe to ignore dropped column families here: + // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. + if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) { + min_log_num = cfd->GetLogNumber(); + } + } + return min_log_num; + } + // Returns the minimum log number which still has data not flushed to any SST + // file, except data from `cfds_to_skip`. + uint64_t PreComputeMinLogNumberWithUnflushedData( + const std::unordered_set& cfds_to_skip) const { + uint64_t min_log_num = std::numeric_limits::max(); + for (auto cfd : *column_family_set_) { + if (cfds_to_skip.count(cfd)) { + continue; + } + // It's safe to ignore dropped column families here: + // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. + if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) { + min_log_num = cfd->GetLogNumber(); + } + } + return min_log_num; + } + + // Create an iterator that reads over the compaction inputs for "*c". + // The caller should delete the iterator when no longer needed. + // @param read_options Must outlive the returned iterator. + // @param start, end indicates compaction range + InternalIterator* MakeInputIterator( + const ReadOptions& read_options, const Compaction* c, + RangeDelAggregator* range_del_agg, + const FileOptions& file_options_compactions, + const std::optional& start, + const std::optional& end); + + // Add all files listed in any live version to *live_table_files and + // *live_blob_files. Note that these lists may contain duplicates. + void AddLiveFiles(std::vector* live_table_files, + std::vector* live_blob_files) const; + + // Remove live files that are in the delete candidate lists. + void RemoveLiveFiles( + std::vector& sst_delete_candidates, + std::vector& blob_delete_candidates) const; + + // Return the approximate size of data to be scanned for range [start, end) + // in levels [start_level, end_level). 
If end_level == -1 it will search
+  // through all non-empty levels.
+  uint64_t ApproximateSize(const SizeApproximationOptions& options,
+                           const ReadOptions& read_options, Version* v,
+                           const Slice& start, const Slice& end,
+                           int start_level, int end_level,
+                           TableReaderCaller caller);
+
+  // Return the size of the current manifest file
+  uint64_t manifest_file_size() const { return manifest_file_size_; }
+
+  Status GetMetadataForFile(uint64_t number, int* filelevel,
+                            FileMetaData** metadata, ColumnFamilyData** cfd);
+
+  // This function doesn't support leveldb SST filenames
+  void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
+
+  void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) {
+    assert(table_cache_);
+
+    table_cache_->Erase(GetSliceForKey(&blob_file_number));
+
+    obsolete_blob_files_.emplace_back(blob_file_number, std::move(path));
+  }
+
+  void GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
+                        std::vector<ObsoleteBlobFileInfo>* blob_files,
+                        std::vector<std::string>* manifest_filenames,
+                        uint64_t min_pending_output);
+
+  ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
+  RefedColumnFamilySet GetRefedColumnFamilySet() {
+    return RefedColumnFamilySet(GetColumnFamilySet());
+  }
+
+  const FileOptions& file_options() { return file_options_; }
+  void ChangeFileOptions(const MutableDBOptions& new_options) {
+    file_options_.writable_file_max_buffer_size =
+        new_options.writable_file_max_buffer_size;
+  }
+
+  const ImmutableDBOptions* db_options() const { return db_options_; }
+
+  static uint64_t GetNumLiveVersions(Version* dummy_versions);
+
+  static uint64_t GetTotalSstFilesSize(Version* dummy_versions);
+
+  static uint64_t GetTotalBlobFileSize(Version* dummy_versions);
+
+  // Get the IO Status returned by written Manifest.
+  const IOStatus& io_status() const { return io_status_; }
+
+  // The returned WalSet needs to be accessed with DB mutex held.
+  const WalSet& GetWalSet() const { return wals_; }
+
+  void TEST_CreateAndAppendVersion(ColumnFamilyData* cfd) {
+    assert(cfd);
+
+    const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+    Version* const version =
+        new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_);
+
+    constexpr bool update_stats = false;
+    const ReadOptions read_options;
+    version->PrepareAppend(mutable_cf_options, read_options, update_stats);
+    AppendVersion(cfd, version);
+  }
+
+ protected:
+  using VersionBuilderMap =
+      UnorderedMap<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>;
+
+  struct ManifestWriter;
+
+  friend class Version;
+  friend class VersionEditHandler;
+  friend class VersionEditHandlerPointInTime;
+  friend class DumpManifestHandler;
+  friend class DBImpl;
+  friend class DBImplReadOnly;
+
+  struct LogReporter : public log::Reader::Reporter {
+    Status* status;
+    virtual void Corruption(size_t /*bytes*/, const Status& s) override {
+      if (status->ok()) {
+        *status = s;
+      }
+    }
+  };
+
+  void Reset();
+
+  // Returns the approximated offset of a key in a file for a given version.
+  uint64_t ApproximateOffsetOf(const ReadOptions& read_options, Version* v,
+                               const FdWithKeyRange& f, const Slice& key,
+                               TableReaderCaller caller);
+
+  // Returns the approximated data size between start and end keys in a file
+  // for a given version.
+  uint64_t ApproximateSize(const ReadOptions& read_options, Version* v,
+                           const FdWithKeyRange& f, const Slice& start,
+                           const Slice& end, TableReaderCaller caller);
+
+  struct MutableCFState {
+    uint64_t log_number;
+    std::string full_history_ts_low;
+
+    explicit MutableCFState() = default;
+    explicit MutableCFState(uint64_t _log_number, std::string ts_low)
+        : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {}
+  };
+
+  // Save current contents to *log
+  Status WriteCurrentStateToManifest(
+      const std::unordered_map<uint32_t, MutableCFState>& curr_state,
+      const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s);
+
+  void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
+
+  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+                                       const ReadOptions& read_options,
+                                       const VersionEdit* edit);
+
+  Status VerifyFileMetadata(const ReadOptions& read_options,
+                            ColumnFamilyData* cfd, const std::string& fpath,
+                            int level, const FileMetaData& meta);
+
+  // Protected by DB mutex.
+  WalSet wals_;
+
+  std::unique_ptr<ColumnFamilySet> column_family_set_;
+  Cache* table_cache_;
+  Env* const env_;
+  FileSystemPtr const fs_;
+  SystemClock* const clock_;
+  const std::string dbname_;
+  std::string db_id_;
+  const ImmutableDBOptions* const db_options_;
+  std::atomic<uint64_t> next_file_number_;
+  // Any WAL number smaller than this should be ignored during recovery,
+  // and is qualified for being deleted.
+  std::atomic<uint64_t> min_log_number_to_keep_ = {0};
+  uint64_t manifest_file_number_;
+  uint64_t options_file_number_;
+  uint64_t options_file_size_;
+  uint64_t pending_manifest_file_number_;
+  // The last seq visible to reads. It normally indicates the last sequence in
+  // the memtable, but when using two write queues it could also indicate the
+  // last sequence in the WAL visible to reads.
+  std::atomic<uint64_t> last_sequence_;
+  // The last sequence number of data committed to the descriptor (manifest
+  // file).
+  SequenceNumber descriptor_last_sequence_ = 0;
+  // The last seq that is already allocated. It is applicable only when we have
+  // two write queues. In that case the seq might or might not have appeared in
+  // the memtable, but it is expected to appear in the WAL.
+  // We have last_sequence_ <= last_allocated_sequence_
+  std::atomic<uint64_t> last_allocated_sequence_;
+  // The last allocated sequence that is also published to the readers. This is
+  // applicable only when last_seq_same_as_publish_seq_ is not set. Otherwise
+  // last_sequence_ also indicates the last published seq.
+  // We have last_sequence_ <= last_published_sequence_ <=
+  // last_allocated_sequence_
+  std::atomic<uint64_t> last_published_sequence_;
+  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted
+
+  // Opened lazily
+  std::unique_ptr<log::Writer> descriptor_log_;
+
+  // generates an increasing version number for every new version
+  uint64_t current_version_number_;
+
+  // Queue of writers to the manifest file
+  std::deque<ManifestWriter*> manifest_writers_;
+
+  // Current size of manifest file
+  uint64_t manifest_file_size_;
+
+  std::vector<ObsoleteFileInfo> obsolete_files_;
+  std::vector<ObsoleteBlobFileInfo> obsolete_blob_files_;
+  std::vector<std::string> obsolete_manifests_;
+
+  // env options for all reads and writes except compactions
+  FileOptions file_options_;
+
+  BlockCacheTracer* const block_cache_tracer_;
+
+  // Store the IO status when Manifest is written
+  IOStatus io_status_;
+
+  std::shared_ptr<IOTracer> io_tracer_;
+
+  std::string db_session_id_;
+
+ private:
+  // REQUIRES db mutex at beginning.
may release and re-acquire db mutex + Status ProcessManifestWrites(std::deque& writers, + InstrumentedMutex* mu, + FSDirectory* dir_contains_current_file, + bool new_descriptor_log, + const ColumnFamilyOptions* new_cf_options, + const ReadOptions& read_options); + + void LogAndApplyCFHelper(VersionEdit* edit, + SequenceNumber* max_last_sequence); + Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, + VersionEdit* edit, SequenceNumber* max_last_sequence, + InstrumentedMutex* mu); +}; + +// ReactiveVersionSet represents a collection of versions of the column +// families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary, +// need to replay the MANIFEST (description log in older terms) in order to +// reconstruct and install versions. +class ReactiveVersionSet : public VersionSet { + public: + ReactiveVersionSet(const std::string& dbname, + const ImmutableDBOptions* _db_options, + const FileOptions& _file_options, Cache* table_cache, + WriteBufferManager* write_buffer_manager, + WriteController* write_controller, + const std::shared_ptr& io_tracer); + + ~ReactiveVersionSet() override; + + Status ReadAndApply( + InstrumentedMutex* mu, + std::unique_ptr* manifest_reader, + Status* manifest_read_status, + std::unordered_set* cfds_changed); + + Status Recover(const std::vector& column_families, + std::unique_ptr* manifest_reader, + std::unique_ptr* manifest_reporter, + std::unique_ptr* manifest_reader_status); +#ifndef NDEBUG + uint64_t TEST_read_edits_in_atomic_group() const; +#endif //! NDEBUG + + std::vector& replay_buffer(); + + protected: + // REQUIRES db mutex + Status ApplyOneVersionEditToBuilder( + VersionEdit& edit, std::unordered_set* cfds_changed, + VersionEdit* version_edit); + + Status MaybeSwitchManifest( + log::Reader::Reporter* reporter, + std::unique_ptr* manifest_reader); + + private: + std::unique_ptr manifest_tailer_; + // TODO: plumb Env::IOActivity + const ReadOptions read_options_; + using VersionSet::LogAndApply; + using VersionSet::Recover; + + Status LogAndApply( + const autovector& /*cfds*/, + const autovector& /*mutable_cf_options_list*/, + const ReadOptions& /* read_options */, + const autovector>& /*edit_lists*/, + InstrumentedMutex* /*mu*/, FSDirectory* /*dir_contains_current_file*/, + bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/, + const std::vector>& /*manifest_wcbs*/) + override { + return Status::NotSupported("not supported in reactive mode"); + } + + // No copy allowed + ReactiveVersionSet(const ReactiveVersionSet&); + ReactiveVersionSet& operator=(const ReactiveVersionSet&); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_set_sync_and_async.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_set_sync_and_async.h new file mode 100644 index 0000000000..2507762e8c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_set_sync_and_async.h @@ -0,0 +1,172 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
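+
+// Editor's note (illustrative, not upstream text): this file is compiled in
+// two passes via util/coro_utils.h. DEFINE_SYNC_AND_ASYNC emits a plain
+// synchronous definition in one pass and, when coroutine support is enabled,
+// an async variant with a "Coroutine" suffix in the other; CO_AWAIT and
+// CO_RETURN expand to ordinary calls/returns in the sync pass and to
+// co_await/co_return in the async pass. A sketch of the expansion pattern:
+//
+//   // sync pass:  Status Version::MultiGetFromSST(...) { ... return s; }
+//   // async pass: (coroutine) Version::MultiGetFromSSTCoroutine(...)
+//   //             { ... co_return s; }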
+ +#include "util/coro_utils.h" + +#if defined(WITHOUT_COROUTINES) || \ + (defined(USE_COROUTINES) && defined(WITH_COROUTINES)) + +namespace ROCKSDB_NAMESPACE { + +// Lookup a batch of keys in a single SST file +DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST) +(const ReadOptions& read_options, MultiGetRange file_range, int hit_file_level, + bool skip_filters, bool skip_range_deletions, FdWithKeyRange* f, + std::unordered_map& blob_ctxs, + TableCache::TypedHandle* table_handle, uint64_t& num_filter_read, + uint64_t& num_index_read, uint64_t& num_sst_read) { + bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex && + get_perf_context()->per_level_perf_context_enabled; + + Status s; + StopWatchNano timer(clock_, timer_enabled /* auto_start */); + s = CO_AWAIT(table_cache_->MultiGet)( + read_options, *internal_comparator(), *f->file_metadata, &file_range, + mutable_cf_options_.block_protection_bytes_per_key, + mutable_cf_options_.prefix_extractor, + cfd_->internal_stats()->GetFileReadHist(hit_file_level), skip_filters, + skip_range_deletions, hit_file_level, table_handle); + // TODO: examine the behavior for corrupted key + if (timer_enabled) { + PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(), + hit_file_level); + } + if (!s.ok()) { + // TODO: Set status for individual keys appropriately + for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) { + *iter->s = s; + file_range.MarkKeyDone(iter); + } + CO_RETURN s; + } + uint64_t batch_size = 0; + for (auto iter = file_range.begin(); s.ok() && iter != file_range.end(); + ++iter) { + GetContext& get_context = *iter->get_context; + Status* status = iter->s; + // The Status in the KeyContext takes precedence over GetContext state + // Status may be an error if there were any IO errors in the table + // reader. 
We never expect Status to be NotFound(), as that is + // determined by get_context + assert(!status->IsNotFound()); + if (!status->ok()) { + file_range.MarkKeyDone(iter); + continue; + } + + if (get_context.sample()) { + sample_file_read_inc(f->file_metadata); + } + batch_size++; + num_index_read += get_context.get_context_stats_.num_index_read; + num_filter_read += get_context.get_context_stats_.num_filter_read; + num_sst_read += get_context.get_context_stats_.num_sst_read; + // Reset these stats since they're specific to a level + get_context.get_context_stats_.num_index_read = 0; + get_context.get_context_stats_.num_filter_read = 0; + get_context.get_context_stats_.num_sst_read = 0; + + // report the counters before returning + if (get_context.State() != GetContext::kNotFound && + get_context.State() != GetContext::kMerge && + db_statistics_ != nullptr) { + get_context.ReportCounters(); + } else { + if (iter->max_covering_tombstone_seq > 0) { + // The remaining files we look at will only contain covered keys, so + // we stop here for this key + file_range.SkipKey(iter); + } + } + switch (get_context.State()) { + case GetContext::kNotFound: + // Keep searching in other files + break; + case GetContext::kMerge: + // TODO: update per-level perfcontext user_key_return_count for kMerge + break; + case GetContext::kFound: + if (hit_file_level == 0) { + RecordTick(db_statistics_, GET_HIT_L0); + } else if (hit_file_level == 1) { + RecordTick(db_statistics_, GET_HIT_L1); + } else if (hit_file_level >= 2) { + RecordTick(db_statistics_, GET_HIT_L2_AND_UP); + } + + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, hit_file_level); + + file_range.MarkKeyDone(iter); + + if (iter->is_blob_index) { + BlobIndex blob_index; + Status tmp_s; + + if (iter->value) { + TEST_SYNC_POINT_CALLBACK("Version::MultiGet::TamperWithBlobIndex", + &(*iter)); + + tmp_s = blob_index.DecodeFrom(*(iter->value)); + + } else { + assert(iter->columns); + assert(!iter->columns->columns().empty()); + assert(iter->columns->columns().front().name() == + kDefaultWideColumnName); + + tmp_s = + blob_index.DecodeFrom(iter->columns->columns().front().value()); + } + + if (tmp_s.ok()) { + const uint64_t blob_file_num = blob_index.file_number(); + blob_ctxs[blob_file_num].emplace_back(blob_index, &*iter); + } else { + *(iter->s) = tmp_s; + } + } else { + if (iter->value) { + file_range.AddValueSize(iter->value->size()); + } else { + assert(iter->columns); + file_range.AddValueSize(iter->columns->serialized_size()); + } + + if (file_range.GetValueSize() > read_options.value_size_soft_limit) { + s = Status::Aborted(); + break; + } + } + continue; + case GetContext::kDeleted: + // Use empty error message for speed + *status = Status::NotFound(); + file_range.MarkKeyDone(iter); + continue; + case GetContext::kCorrupt: + *status = + Status::Corruption("corrupted key for ", iter->lkey->user_key()); + file_range.MarkKeyDone(iter); + continue; + case GetContext::kUnexpectedBlobIndex: + ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index."); + *status = Status::NotSupported( + "Encounter unexpected blob index. 
Please open DB with " + "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); + file_range.MarkKeyDone(iter); + continue; + case GetContext::kMergeOperatorFailed: + *status = Status::Corruption(Status::SubCode::kMergeOperatorFailed); + file_range.MarkKeyDone(iter); + continue; + } + } + + RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size); + CO_RETURN s; +} +} // namespace ROCKSDB_NAMESPACE +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_util.h new file mode 100644 index 0000000000..e39f255719 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/version_util.h @@ -0,0 +1,72 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "db/version_set.h" + +namespace ROCKSDB_NAMESPACE { + +// Instead of opening a `DB` to perform certain manifest updates, this +// uses the underlying `VersionSet` API to read and modify the MANIFEST. This +// allows us to use the user's real options, while not having to worry about +// the DB persisting new SST files via flush/compaction or attempting to read/ +// compact files which may fail, particularly for the file we intend to remove +// (the user may want to remove an already deleted file from MANIFEST). +class OfflineManifestWriter { + public: + OfflineManifestWriter(const DBOptions& options, const std::string& db_path) + : wc_(options.delayed_write_rate), + wb_(options.db_write_buffer_size), + immutable_db_options_(WithDbPath(options, db_path)), + tc_(NewLRUCache(1 << 20 /* capacity */, + options.table_cache_numshardbits)), + versions_(db_path, &immutable_db_options_, sopt_, tc_.get(), &wb_, &wc_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ "") {} + + Status Recover(const std::vector& column_families) { + return versions_.Recover(column_families, /*read_only*/ false, + /*db_id*/ nullptr, + /*no_error_if_files_missing*/ true); + } + + Status LogAndApply(const ReadOptions& read_options, ColumnFamilyData* cfd, + VersionEdit* edit, + FSDirectory* dir_contains_current_file) { + // Use `mutex` to imitate a locked DB mutex when calling `LogAndApply()`. + InstrumentedMutex mutex; + mutex.Lock(); + Status s = versions_.LogAndApply( + cfd, *cfd->GetLatestMutableCFOptions(), read_options, edit, &mutex, + dir_contains_current_file, false /* new_descriptor_log */); + mutex.Unlock(); + return s; + } + + VersionSet& Versions() { return versions_; } + const ImmutableDBOptions& IOptions() { return immutable_db_options_; } + + private: + WriteController wc_; + WriteBufferManager wb_; + ImmutableDBOptions immutable_db_options_; + std::shared_ptr tc_; + EnvOptions sopt_; + VersionSet versions_; + + static ImmutableDBOptions WithDbPath(const DBOptions& options, + const std::string& db_path) { + ImmutableDBOptions rv(options); + if (rv.db_paths.empty()) { + // `VersionSet` expects options that have been through + // `SanitizeOptions()`, which would sanitize an empty `db_paths`. 
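+      // (Editor's note) e.g. for db_path == "/data/db" this falls back to a
+      // single entry {"/data/db", /*target_size=*/0}, so that `VersionSet`
+      // always has at least one path to work with.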
+ rv.db_paths.emplace_back(db_path, 0 /* target_size */); + } + return rv; + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_edit.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_edit.cc new file mode 100644 index 0000000000..2525be610b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_edit.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/wal_edit.h" + +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +void WalAddition::EncodeTo(std::string* dst) const { + PutVarint64(dst, number_); + + if (metadata_.HasSyncedSize()) { + PutVarint32(dst, static_cast(WalAdditionTag::kSyncedSize)); + PutVarint64(dst, metadata_.GetSyncedSizeInBytes()); + } + + PutVarint32(dst, static_cast(WalAdditionTag::kTerminate)); +} + +Status WalAddition::DecodeFrom(Slice* src) { + constexpr char class_name[] = "WalAddition"; + + if (!GetVarint64(src, &number_)) { + return Status::Corruption(class_name, "Error decoding WAL log number"); + } + + while (true) { + uint32_t tag_value = 0; + if (!GetVarint32(src, &tag_value)) { + return Status::Corruption(class_name, "Error decoding tag"); + } + WalAdditionTag tag = static_cast(tag_value); + switch (tag) { + case WalAdditionTag::kSyncedSize: { + uint64_t size = 0; + if (!GetVarint64(src, &size)) { + return Status::Corruption(class_name, "Error decoding WAL file size"); + } + metadata_.SetSyncedSizeInBytes(size); + break; + } + // TODO: process future tags such as checksum. + case WalAdditionTag::kTerminate: + return Status::OK(); + default: { + std::stringstream ss; + ss << "Unknown tag " << tag_value; + return Status::Corruption(class_name, ss.str()); + } + } + } +} + +JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal) { + jw << "LogNumber" << wal.GetLogNumber() << "SyncedSizeInBytes" + << wal.GetMetadata().GetSyncedSizeInBytes(); + return jw; +} + +std::ostream& operator<<(std::ostream& os, const WalAddition& wal) { + os << "log_number: " << wal.GetLogNumber() + << " synced_size_in_bytes: " << wal.GetMetadata().GetSyncedSizeInBytes(); + return os; +} + +std::string WalAddition::DebugString() const { + std::ostringstream oss; + oss << *this; + return oss.str(); +} + +void WalDeletion::EncodeTo(std::string* dst) const { + PutVarint64(dst, number_); +} + +Status WalDeletion::DecodeFrom(Slice* src) { + constexpr char class_name[] = "WalDeletion"; + + if (!GetVarint64(src, &number_)) { + return Status::Corruption(class_name, "Error decoding WAL log number"); + } + + return Status::OK(); +} + +JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal) { + jw << "LogNumber" << wal.GetLogNumber(); + return jw; +} + +std::ostream& operator<<(std::ostream& os, const WalDeletion& wal) { + os << "log_number: " << wal.GetLogNumber(); + return os; +} + +std::string WalDeletion::DebugString() const { + std::ostringstream oss; + oss << *this; + return oss.str(); +} + +Status WalSet::AddWal(const WalAddition& wal) { + if (wal.GetLogNumber() < min_wal_number_to_keep_) { + // The WAL has been obsolete, ignore it. 
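+    // (Editor's note) A stale record for an already-deleted WAL can still
+    // arrive here, e.g. because edits carrying synced sizes may be committed
+    // out of order (see the comment further below).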
+ return Status::OK(); + } + + auto it = wals_.lower_bound(wal.GetLogNumber()); + bool existing = it != wals_.end() && it->first == wal.GetLogNumber(); + + if (!existing) { + wals_.insert(it, {wal.GetLogNumber(), wal.GetMetadata()}); + return Status::OK(); + } + + assert(existing); + if (!wal.GetMetadata().HasSyncedSize()) { + std::stringstream ss; + ss << "WAL " << wal.GetLogNumber() << " is created more than once"; + return Status::Corruption("WalSet::AddWal", ss.str()); + } + + assert(wal.GetMetadata().HasSyncedSize()); + if (it->second.HasSyncedSize() && wal.GetMetadata().GetSyncedSizeInBytes() <= + it->second.GetSyncedSizeInBytes()) { + // This is possible because version edits with different synced WAL sizes + // for the same WAL can be committed out-of-order. For example, thread + // 1 synces the first 10 bytes of 1.log, while thread 2 synces the first 20 + // bytes of 1.log. It's possible that thread 1 calls LogAndApply() after + // thread 2. + // In this case, just return ok. + return Status::OK(); + } + + // Update synced size for the given WAL. + it->second.SetSyncedSizeInBytes(wal.GetMetadata().GetSyncedSizeInBytes()); + return Status::OK(); +} + +Status WalSet::AddWals(const WalAdditions& wals) { + Status s; + for (const WalAddition& wal : wals) { + s = AddWal(wal); + if (!s.ok()) { + break; + } + } + return s; +} + +Status WalSet::DeleteWalsBefore(WalNumber wal) { + if (wal > min_wal_number_to_keep_) { + min_wal_number_to_keep_ = wal; + wals_.erase(wals_.begin(), wals_.lower_bound(wal)); + } + return Status::OK(); +} + +void WalSet::Reset() { + wals_.clear(); + min_wal_number_to_keep_ = 0; +} + +Status WalSet::CheckWals( + Env* env, + const std::unordered_map& logs_on_disk) const { + assert(env != nullptr); + + Status s; + for (const auto& wal : wals_) { + const uint64_t log_number = wal.first; + const WalMetadata& wal_meta = wal.second; + + if (!wal_meta.HasSyncedSize()) { + // The WAL and WAL directory is not even synced, + // so the WAL's inode may not be persisted, + // then the WAL might not show up when listing WAL directory. + continue; + } + + if (logs_on_disk.find(log_number) == logs_on_disk.end()) { + std::stringstream ss; + ss << "Missing WAL with log number: " << log_number << "."; + s = Status::Corruption(ss.str()); + break; + } + + uint64_t log_file_size = 0; + s = env->GetFileSize(logs_on_disk.at(log_number), &log_file_size); + if (!s.ok()) { + break; + } + if (log_file_size < wal_meta.GetSyncedSizeInBytes()) { + std::stringstream ss; + ss << "Size mismatch: WAL (log number: " << log_number + << ") in MANIFEST is " << wal_meta.GetSyncedSizeInBytes() + << " bytes , but actually is " << log_file_size << " bytes on disk."; + s = Status::Corruption(ss.str()); + break; + } + } + + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_edit.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_edit.h new file mode 100644 index 0000000000..bb5c5e292e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_edit.h @@ -0,0 +1,177 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// WAL related classes used in VersionEdit and VersionSet. +// Modifications to WalAddition and WalDeletion may need to update +// VersionEdit and its related tests. 
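+//
+// Illustrative encode/decode roundtrip (editor's sketch, not upstream text):
+//
+//   WalAddition add(/*number=*/7, WalMetadata(/*synced_size_bytes=*/4096));
+//   std::string buf;
+//   add.EncodeTo(&buf);   // varint log number, tagged fields, kTerminate
+//   WalAddition decoded;
+//   Slice input(buf);
+//   Status s = decoded.DecodeFrom(&input);
+//   assert(s.ok() && decoded.GetLogNumber() == 7);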
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "logging/event_logger.h" +#include "port/port.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class JSONWriter; +class Slice; +class Status; + +using WalNumber = uint64_t; + +// Metadata of a WAL. +class WalMetadata { + public: + WalMetadata() = default; + + explicit WalMetadata(uint64_t synced_size_bytes) + : synced_size_bytes_(synced_size_bytes) {} + + bool HasSyncedSize() const { return synced_size_bytes_ != kUnknownWalSize; } + + void SetSyncedSizeInBytes(uint64_t bytes) { synced_size_bytes_ = bytes; } + + uint64_t GetSyncedSizeInBytes() const { return synced_size_bytes_; } + + private: + friend bool operator==(const WalMetadata& lhs, const WalMetadata& rhs); + friend bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs); + // The size of WAL is unknown, used when the WAL is not synced yet or is + // empty. + constexpr static uint64_t kUnknownWalSize = + std::numeric_limits::max(); + + // Size of the most recently synced WAL in bytes. + uint64_t synced_size_bytes_ = kUnknownWalSize; +}; + +inline bool operator==(const WalMetadata& lhs, const WalMetadata& rhs) { + return lhs.synced_size_bytes_ == rhs.synced_size_bytes_; +} + +inline bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs) { + return !(lhs == rhs); +} + +// These tags are persisted to MANIFEST, so it's part of the user API. +enum class WalAdditionTag : uint32_t { + // Indicates that there are no more tags. + kTerminate = 1, + // Synced Size in bytes. + kSyncedSize = 2, + // Add tags in the future, such as checksum? +}; + +// Records the event of adding a WAL in VersionEdit. +class WalAddition { + public: + WalAddition() : number_(0), metadata_() {} + + explicit WalAddition(WalNumber number) : number_(number), metadata_() {} + + WalAddition(WalNumber number, WalMetadata meta) + : number_(number), metadata_(std::move(meta)) {} + + WalNumber GetLogNumber() const { return number_; } + + const WalMetadata& GetMetadata() const { return metadata_; } + + void EncodeTo(std::string* dst) const; + + Status DecodeFrom(Slice* src); + + std::string DebugString() const; + + private: + WalNumber number_; + WalMetadata metadata_; +}; + +std::ostream& operator<<(std::ostream& os, const WalAddition& wal); +JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal); + +using WalAdditions = std::vector; + +// Records the event of deleting WALs before the specified log number. +class WalDeletion { + public: + WalDeletion() : number_(kEmpty) {} + + explicit WalDeletion(WalNumber number) : number_(number) {} + + WalNumber GetLogNumber() const { return number_; } + + void EncodeTo(std::string* dst) const; + + Status DecodeFrom(Slice* src); + + std::string DebugString() const; + + bool IsEmpty() const { return number_ == kEmpty; } + + void Reset() { number_ = kEmpty; } + + private: + static constexpr WalNumber kEmpty = 0; + + WalNumber number_; +}; + +std::ostream& operator<<(std::ostream& os, const WalDeletion& wal); +JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal); + +// Used in VersionSet to keep the current set of WALs. +// +// When a WAL is synced or becomes obsoleted, +// a VersionEdit is logged to MANIFEST and +// the WAL is added to or deleted from WalSet. +// +// Not thread safe, needs external synchronization such as holding DB mutex. +class WalSet { + public: + // Add WAL(s). 
+ // If the WAL is closed, + // then there must be an existing unclosed WAL, + // otherwise, return Status::Corruption. + // Can happen when applying a VersionEdit or recovering from MANIFEST. + Status AddWal(const WalAddition& wal); + Status AddWals(const WalAdditions& wals); + + // Delete WALs with log number smaller than the specified wal number. + // Can happen when applying a VersionEdit or recovering from MANIFEST. + Status DeleteWalsBefore(WalNumber wal); + + // Resets the internal state. + void Reset(); + + // WALs with number less than MinWalNumberToKeep should not exist in WalSet. + WalNumber GetMinWalNumberToKeep() const { return min_wal_number_to_keep_; } + + const std::map& GetWals() const { return wals_; } + + // Checks whether there are missing or corrupted WALs. + // Returns Status::OK if there is no missing nor corrupted WAL, + // otherwise returns Status::Corruption. + // logs_on_disk is a map from log number to the log filename. + // Note that logs_on_disk may contain logs that is obsolete but + // haven't been deleted from disk. + Status CheckWals( + Env* env, + const std::unordered_map& logs_on_disk) const; + + private: + std::map wals_; + // WAL number < min_wal_number_to_keep_ should not exist in wals_. + // It's monotonically increasing, in-memory only, not written to MANIFEST. + WalNumber min_wal_number_to_keep_ = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_manager.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_manager.cc new file mode 100644 index 0000000000..400b2e58b6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_manager.cc @@ -0,0 +1,527 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/wal_manager.h" + +#include +#include +#include +#include + +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/transaction_log_impl.h" +#include "db/write_batch_internal.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/sequence_file_reader.h" +#include "logging/logging.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/write_batch.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/mutexlock.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + + +Status WalManager::DeleteFile(const std::string& fname, uint64_t number) { + auto s = env_->DeleteFile(wal_dir_ + "/" + fname); + if (s.ok()) { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(number); + } + return s; +} + +Status WalManager::GetSortedWalFiles(VectorLogPtr& files) { + // First get sorted files in db dir, then get sorted files from archived + // dir, to avoid a race condition where a log file is moved to archived + // dir in between. + Status s; + // list wal files in main db dir. 
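+  // (Editor's note) The ordering is what prevents lost files: if a WAL is
+  // archived between the two directory listings, listing the live dir first
+  // means it is seen at least once (possibly in both dirs, which the merge
+  // below resolves); listing the archive first could miss it entirely.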
+ VectorLogPtr logs; + s = GetSortedWalsOfType(wal_dir_, logs, kAliveLogFile); + if (!s.ok()) { + return s; + } + + // Reproduce the race condition where a log file is moved + // to archived dir, between these two sync points, used in + // (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::GetSortedWalFiles:1"); + TEST_SYNC_POINT("WalManager::GetSortedWalFiles:2"); + + files.clear(); + // list wal files in archive dir. + std::string archivedir = ArchivalDirectory(wal_dir_); + Status exists = env_->FileExists(archivedir); + if (exists.ok()) { + s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); + if (!s.ok()) { + return s; + } + } else if (!exists.IsNotFound()) { + assert(s.IsIOError()); + return s; + } + + uint64_t latest_archived_log_number = 0; + if (!files.empty()) { + latest_archived_log_number = files.back()->LogNumber(); + ROCKS_LOG_INFO(db_options_.info_log, "Latest Archived log: %" PRIu64, + latest_archived_log_number); + } + + files.reserve(files.size() + logs.size()); + for (auto& log : logs) { + if (log->LogNumber() > latest_archived_log_number) { + files.push_back(std::move(log)); + } else { + // When the race condition happens, we could see the + // same log in both db dir and archived dir. Simply + // ignore the one in db dir. Note that, if we read + // archived dir first, we would have missed the log file. + ROCKS_LOG_WARN(db_options_.info_log, "%s already moved to archive", + log->PathName().c_str()); + } + } + + return s; +} + +Status WalManager::GetUpdatesSince( + SequenceNumber seq, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options, + VersionSet* version_set) { + if (seq_per_batch_) { + return Status::NotSupported(); + } + + assert(!seq_per_batch_); + + // Get all sorted Wal Files. + // Do binary search and open files and find the seq number. + + std::unique_ptr wal_files(new VectorLogPtr); + Status s = GetSortedWalFiles(*wal_files); + if (!s.ok()) { + return s; + } + + s = RetainProbableWalFiles(*wal_files, seq); + if (!s.ok()) { + return s; + } + iter->reset(new TransactionLogIteratorImpl( + wal_dir_, &db_options_, read_options, file_options_, seq, + std::move(wal_files), version_set, seq_per_batch_, io_tracer_)); + return (*iter)->status(); +} + +// 1. Go through all archived files and +// a. if ttl is enabled, delete outdated files +// b. if archive size limit is enabled, delete empty files, +// compute file number and size. +// 2. If size limit is enabled: +// a. compute how many files should be deleted +// b. get sorted non-empty archived logs +// c. delete what should be deleted +void WalManager::PurgeObsoleteWALFiles() { + bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0; + if (!ttl_enabled && !size_limit_enabled) { + return; + } + + int64_t current_time = 0; + Status s = db_options_.clock->GetCurrentTime(¤t_time); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, "Can't get current time: %s", + s.ToString().c_str()); + assert(false); + return; + } + uint64_t const now_seconds = static_cast(current_time); + uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) + ? 
db_options_.WAL_ttl_seconds / 2 + : kDefaultIntervalToDeleteObsoleteWAL; + + if (purge_wal_files_last_run_ + time_to_check > now_seconds) { + return; + } + + purge_wal_files_last_run_ = now_seconds; + + std::string archival_dir = ArchivalDirectory(wal_dir_); + std::vector files; + s = env_->GetChildren(archival_dir, &files); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, "Can't get archive files: %s", + s.ToString().c_str()); + assert(false); + return; + } + + size_t log_files_num = 0; + uint64_t log_file_size = 0; + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kWalFile) { + std::string const file_path = archival_dir + "/" + f; + if (ttl_enabled) { + uint64_t file_m_time; + s = env_->GetFileModificationTime(file_path, &file_m_time); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Can't get file mod time: %s: %s", file_path.c_str(), + s.ToString().c_str()); + continue; + } + if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, "Can't delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(number); + } + continue; + } + } + + if (size_limit_enabled) { + uint64_t file_size; + s = env_->GetFileSize(file_path, &file_size); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Unable to get file size: %s: %s", file_path.c_str(), + s.ToString().c_str()); + return; + } else { + if (file_size > 0) { + log_file_size = std::max(log_file_size, file_size); + ++log_files_num; + } else { + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Unable to delete file: %s: %s", file_path.c_str(), + s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(number); + } + } + } + } + } + } + + if (0 == log_files_num || !size_limit_enabled) { + return; + } + + size_t const files_keep_num = static_cast( + db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size); + if (log_files_num <= files_keep_num) { + return; + } + + size_t files_del_num = log_files_num - files_keep_num; + VectorLogPtr archived_logs; + s = GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Unable to get archived WALs from: %s: %s", + archival_dir.c_str(), s.ToString().c_str()); + files_del_num = 0; + } else if (files_del_num > archived_logs.size()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Trying to delete more archived log files than " + "exist. 
Deleting all"); + files_del_num = archived_logs.size(); + } + + for (size_t i = 0; i < files_del_num; ++i) { + std::string const file_path = archived_logs[i]->PathName(); + s = DeleteDBFile(&db_options_, wal_dir_ + "/" + file_path, wal_dir_, false, + /*force_fg=*/!wal_in_db_path_); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(archived_logs[i]->LogNumber()); + } + } +} + +void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) { + auto archived_log_name = ArchivedLogFileName(wal_dir_, number); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1"); + Status s = env_->RenameFile(fname, archived_log_name); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2"); + ROCKS_LOG_INFO(db_options_.info_log, "Move log file %s to %s -- %s\n", + fname.c_str(), archived_log_name.c_str(), + s.ToString().c_str()); +} + +Status WalManager::GetSortedWalsOfType(const std::string& path, + VectorLogPtr& log_files, + WalFileType log_type) { + std::vector all_files; + const Status status = env_->GetChildren(path, &all_files); + if (!status.ok()) { + return status; + } + log_files.reserve(all_files.size()); + for (const auto& f : all_files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kWalFile) { + SequenceNumber sequence; + Status s = ReadFirstRecord(log_type, number, &sequence); + if (!s.ok()) { + return s; + } + if (sequence == 0) { + // empty file + continue; + } + + // Reproduce the race condition where a log file is moved + // to archived dir, between these two sync points, used in + // (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:1"); + TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:2"); + + uint64_t size_bytes; + s = env_->GetFileSize(LogFileName(path, number), &size_bytes); + // re-try in case the alive log file has been moved to archive. + if (!s.ok() && log_type == kAliveLogFile) { + std::string archived_file = ArchivedLogFileName(path, number); + if (env_->FileExists(archived_file).ok()) { + s = env_->GetFileSize(archived_file, &size_bytes); + if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) { + // oops, the file just got deleted from archived dir! move on + s = Status::OK(); + continue; + } + } + } + if (!s.ok()) { + return s; + } + + log_files.push_back(std::unique_ptr( + new LogFileImpl(number, log_type, sequence, size_bytes))); + } + } + std::sort( + log_files.begin(), log_files.end(), + [](const std::unique_ptr& a, const std::unique_ptr& b) { + LogFileImpl* a_impl = static_cast_with_check(a.get()); + LogFileImpl* b_impl = static_cast_with_check(b.get()); + return *a_impl < *b_impl; + }); + return status; +} + +Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs, + const SequenceNumber target) { + int64_t start = 0; // signed to avoid overflow when target is < first file. + int64_t end = static_cast(all_logs.size()) - 1; + // Binary Search. avoid opening all files. + while (end >= start) { + int64_t mid = start + (end - start) / 2; // Avoid overflow. 
+ SequenceNumber current_seq_num = + all_logs.at(static_cast(mid))->StartSequence(); + if (current_seq_num == target) { + end = mid; + break; + } else if (current_seq_num < target) { + start = mid + 1; + } else { + end = mid - 1; + } + } + // end could be -ve. + size_t start_index = + static_cast(std::max(static_cast(0), end)); + // The last wal file is always included + all_logs.erase(all_logs.begin(), all_logs.begin() + start_index); + return Status::OK(); +} + +Status WalManager::ReadFirstRecord(const WalFileType type, + const uint64_t number, + SequenceNumber* sequence) { + *sequence = 0; + if (type != kAliveLogFile && type != kArchivedLogFile) { + ROCKS_LOG_ERROR(db_options_.info_log, "[WalManger] Unknown file type %s", + std::to_string(type).c_str()); + return Status::NotSupported("File Type Not Known " + std::to_string(type)); + } + { + MutexLock l(&read_first_record_cache_mutex_); + auto itr = read_first_record_cache_.find(number); + if (itr != read_first_record_cache_.end()) { + *sequence = itr->second; + return Status::OK(); + } + } + Status s; + if (type == kAliveLogFile) { + std::string fname = LogFileName(wal_dir_, number); + s = ReadFirstLine(fname, number, sequence); + if (!s.ok() && env_->FileExists(fname).ok()) { + // return any error that is not caused by non-existing file + return s; + } + } + + if (type == kArchivedLogFile || !s.ok()) { + // check if the file got moved to archive. + std::string archived_file = ArchivedLogFileName(wal_dir_, number); + s = ReadFirstLine(archived_file, number, sequence); + // maybe the file was deleted from archive dir. If that's the case, return + // Status::OK(). The caller with identify this as empty file because + // *sequence == 0 + if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) { + return Status::OK(); + } + } + + if (s.ok() && *sequence != 0) { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.insert({number, *sequence}); + } + return s; +} + +Status WalManager::GetLiveWalFile(uint64_t number, + std::unique_ptr* log_file) { + if (!log_file) { + return Status::InvalidArgument("log_file not preallocated."); + } + + if (!number) { + return Status::PathNotFound("log file not available"); + } + + Status s; + + uint64_t size_bytes; + s = env_->GetFileSize(LogFileName(wal_dir_, number), &size_bytes); + + if (!s.ok()) { + return s; + } + + log_file->reset(new LogFileImpl(number, kAliveLogFile, + 0, // SequenceNumber + size_bytes)); + + return Status::OK(); +} + +// the function returns status.ok() and sequence == 0 if the file exists, but is +// empty +Status WalManager::ReadFirstLine(const std::string& fname, + const uint64_t number, + SequenceNumber* sequence) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + const char* fname; + + Status* status; + bool ignore_error; // true if db_options_.paranoid_checks==false + void Corruption(size_t bytes, const Status& s) override { + ROCKS_LOG_WARN(info_log, "[WalManager] %s%s: dropping %d bytes; %s", + (this->ignore_error ? 
"(ignoring error) " : ""), fname, + static_cast(bytes), s.ToString().c_str()); + if (this->status->ok()) { + // only keep the first error + *this->status = s; + } + } + }; + + std::unique_ptr file; + Status status = fs_->NewSequentialFile( + fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr); + std::unique_ptr file_reader( + new SequentialFileReader(std::move(file), fname, io_tracer_)); + + if (!status.ok()) { + return status; + } + + LogReporter reporter; + reporter.env = env_; + reporter.info_log = db_options_.info_log.get(); + reporter.fname = fname.c_str(); + reporter.status = &status; + reporter.ignore_error = !db_options_.paranoid_checks; + log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter, + true /*checksum*/, number); + std::string scratch; + Slice record; + + if (reader.ReadRecord(&record, &scratch) && + (status.ok() || !db_options_.paranoid_checks)) { + if (record.size() < WriteBatchInternal::kHeader) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + // TODO read record's till the first no corrupt entry? + } else { + WriteBatch batch; + // We can overwrite an existing non-OK Status since it'd only reach here + // with `paranoid_checks == false`. + status = WriteBatchInternal::SetContents(&batch, record); + if (status.ok()) { + *sequence = WriteBatchInternal::Sequence(&batch); + return status; + } + } + } + + if (status.ok() && reader.IsCompressedAndEmptyFile()) { + // In case of wal_compression, it writes a `kSetCompressionType` record + // which is not associated with any sequence number. As result for an empty + // file, GetSortedWalsOfType() will skip these WALs causing the operations + // to fail. + // Therefore, in order to avoid that failure, it sets sequence_number to 1 + // indicating those WALs should be included. + *sequence = 1; + } else { + // ReadRecord might have returned false on EOF, which means that the log + // file is empty. Or, a failure may have occurred while processing the first + // entry. In any case, return status and set sequence number to 0. + *sequence = 0; + } + return status; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_manager.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_manager.h new file mode 100644 index 0000000000..ab79bf0023 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wal_manager.h @@ -0,0 +1,136 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/version_set.h" +#include "file/file_util.h" +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "rocksdb/transaction_log.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + + +// WAL manager provides the abstraction for reading the WAL files as a single +// unit. Internally, it opens and reads the files using Reader or Writer +// abstraction. 
+class WalManager {
+ public:
+  WalManager(const ImmutableDBOptions& db_options,
+             const FileOptions& file_options,
+             const std::shared_ptr<IOTracer>& io_tracer,
+             const bool seq_per_batch = false)
+      : db_options_(db_options),
+        file_options_(file_options),
+        env_(db_options.env),
+        fs_(db_options.fs, io_tracer),
+        purge_wal_files_last_run_(0),
+        seq_per_batch_(seq_per_batch),
+        wal_dir_(db_options_.GetWalDir()),
+        wal_in_db_path_(db_options_.IsWalDirSameAsDBPath()),
+        io_tracer_(io_tracer) {}
+
+  Status GetSortedWalFiles(VectorLogPtr& files);
+
+  // Allow the user to tail the transaction log to find all recent changes to
+  // the database that are newer than `seq_number`.
+  Status GetUpdatesSince(
+      SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options,
+      VersionSet* version_set);
+
+  void PurgeObsoleteWALFiles();
+
+  void ArchiveWALFile(const std::string& fname, uint64_t number);
+
+  Status DeleteFile(const std::string& fname, uint64_t number);
+
+  Status GetLiveWalFile(uint64_t number, std::unique_ptr<LogFile>* log_file);
+
+  Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
+                              SequenceNumber* sequence) {
+    return ReadFirstRecord(type, number, sequence);
+  }
+
+  Status TEST_ReadFirstLine(const std::string& fname, const uint64_t number,
+                            SequenceNumber* sequence) {
+    return ReadFirstLine(fname, number, sequence);
+  }
+
+ private:
+  Status GetSortedWalsOfType(const std::string& path, VectorLogPtr& log_files,
+                             WalFileType type);
+  // Requires: all_logs should be sorted with earliest log file first
+  // Retains all log files in all_logs which contain updates with sequence
+  // numbers greater than or equal to the requested SequenceNumber.
+  Status RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                const SequenceNumber target);
+
+  // ReadFirstRecord checks the read_first_record_cache_ to see if the entry
+  // exists or not. If not, it will read the WAL file.
+  // In case of wal_compression, the WAL contains a `kSetCompressionType`
+  // record which is not associated with any sequence number. So the
+  // sequence_number is set to 1 if that WAL doesn't include any other record
+  // (i.e., it is basically empty), in order to include that WAL, and the
+  // result is inserted in read_first_record_cache_.
+  // Therefore, sequence_number is used as a boolean indicating whether the
+  // WAL should be included, and that sequence_number shouldn't be used for
+  // any other purpose.
+  Status ReadFirstRecord(const WalFileType type, const uint64_t number,
+                         SequenceNumber* sequence);
+
+  // In case of no wal_compression, ReadFirstLine returns status.ok() and
+  // sequence == 0 if the file exists, but is empty.
+  // In case of wal_compression, the WAL contains a
+  // `kSetCompressionType` record which is not associated with any sequence
+  // number if that WAL doesn't include any other record (basically empty). As
+  // a result, for an empty file, GetSortedWalsOfType() would skip these WALs,
+  // causing the operations to fail. To avoid that, it sets sequence_number to
+  // 1 in order to include that WAL.
+  Status ReadFirstLine(const std::string& fname, const uint64_t number,
+                       SequenceNumber* sequence);
+
+  // ------- state from DBImpl ------
+  const ImmutableDBOptions& db_options_;
+  const FileOptions file_options_;
+  Env* env_;
+  const FileSystemPtr fs_;
+
+  // ------- WalManager state -------
+  // cache for ReadFirstRecord() calls
+  std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
+  port::Mutex read_first_record_cache_mutex_;
+
+  // last time when PurgeObsoleteWALFiles ran.
+  uint64_t purge_wal_files_last_run_;
+
+  bool seq_per_batch_;
+
+  const std::string& wal_dir_;
+
+  bool wal_in_db_path_;
+
+  // Obsolete files will be deleted every kDefaultIntervalToDeleteObsoleteWAL
+  // seconds if TTL-based deletion is enabled and the archive size limit is
+  // disabled.
+  static constexpr uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600;
+
+  std::shared_ptr<IOTracer> io_tracer_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_column_serialization.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_column_serialization.cc
new file mode 100644
index 0000000000..f62143c405
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_column_serialization.cc
@@ -0,0 +1,182 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wide/wide_column_serialization.h"
+
+#include <algorithm>
+#include <cassert>
+#include <limits>
+
+#include "rocksdb/slice.h"
+#include "util/autovector.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status WideColumnSerialization::SerializeImpl(const Slice* value_of_default,
+                                              const WideColumns& columns,
+                                              std::string& output) {
+  const size_t num_columns =
+      value_of_default ? columns.size() + 1 : columns.size();
+
+  if (num_columns >
+      static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+    return Status::InvalidArgument("Too many wide columns");
+  }
+
+  PutVarint32(&output, kCurrentVersion);
+
+  PutVarint32(&output, static_cast<uint32_t>(num_columns));
+
+  const Slice* prev_name = nullptr;
+  if (value_of_default) {
+    if (value_of_default->size() >
+        static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+      return Status::InvalidArgument("Wide column value too long");
+    }
+
+    PutLengthPrefixedSlice(&output, kDefaultWideColumnName);
+    PutVarint32(&output, static_cast<uint32_t>(value_of_default->size()));
+
+    prev_name = &kDefaultWideColumnName;
+  }
+
+  for (size_t i = 0; i < columns.size(); ++i) {
+    const WideColumn& column = columns[i];
+
+    const Slice& name = column.name();
+    if (name.size() >
+        static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+      return Status::InvalidArgument("Wide column name too long");
+    }
+
+    if (prev_name && prev_name->compare(name) >= 0) {
+      return Status::Corruption("Wide columns out of order");
+    }
+
+    const Slice& value = column.value();
+    if (value.size() >
+        static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+      return Status::InvalidArgument("Wide column value too long");
+    }
+
+    PutLengthPrefixedSlice(&output, name);
+    PutVarint32(&output, static_cast<uint32_t>(value.size()));
+
+    prev_name = &name;
+  }
+
+  if (value_of_default) {
+    output.append(value_of_default->data(), value_of_default->size());
+  }
+
+  for (const auto& column : columns) {
+    const Slice& value = column.value();
+
+    output.append(value.data(), value.size());
+  }
+
+  return Status::OK();
+}
+
+Status WideColumnSerialization::Deserialize(Slice& input,
+                                            WideColumns& columns) {
+  assert(columns.empty());
+
+  uint32_t version = 0;
+  if (!GetVarint32(&input, &version)) {
+    return Status::Corruption("Error decoding wide column version");
+  }
+
+  if (version > kCurrentVersion) {
+    return Status::NotSupported("Unsupported wide column version");
+  }
+
+  uint32_t num_columns = 0;
+  if (!GetVarint32(&input, &num_columns)) {
+    return Status::Corruption("Error decoding number of wide columns");
+  }
+
+  if (!num_columns) {
+    return Status::OK();
+  }
+
+  columns.reserve(num_columns);
+
+  autovector<uint32_t, 16> column_value_sizes;
+  column_value_sizes.reserve(num_columns);
+
+  for (uint32_t i = 0; i < num_columns; ++i) {
+    Slice name;
+    if (!GetLengthPrefixedSlice(&input, &name)) {
+      return Status::Corruption("Error decoding wide column name");
+    }
+
+    if (!columns.empty() && columns.back().name().compare(name) >= 0) {
+      return Status::Corruption("Wide columns out of order");
+    }
+
+    columns.emplace_back(name, Slice());
+
+    uint32_t value_size = 0;
+    if (!GetVarint32(&input, &value_size)) {
+      return Status::Corruption("Error decoding wide column value size");
+    }
+
+    column_value_sizes.emplace_back(value_size);
+  }
+
+  const Slice data(input);
+  size_t pos = 0;
+
+  for (uint32_t i = 0; i < num_columns; ++i) {
+    const uint32_t value_size = column_value_sizes[i];
+
+    if (pos + value_size > data.size()) {
+      return Status::Corruption("Error decoding wide column value payload");
+    }
+
+    columns[i].value() = Slice(data.data() + pos, value_size);
+
+    pos += value_size;
+  }
+
+  return Status::OK();
+}
+
+WideColumns::const_iterator WideColumnSerialization::Find(
+    const WideColumns& columns, const Slice& column_name) {
+  const auto it =
+      std::lower_bound(columns.cbegin(), columns.cend(), column_name,
+                       [](const WideColumn& lhs, const Slice& rhs) {
+                         return lhs.name().compare(rhs) < 0;
+                       });
+
+  if (it == columns.cend() || it->name() != column_name) {
+    return columns.cend();
+  }
+
+  return it;
+}
+
+Status WideColumnSerialization::GetValueOfDefaultColumn(Slice& input,
+                                                        Slice& value) {
+  WideColumns columns;
+
+  const Status s = Deserialize(input, columns);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (columns.empty() || columns[0].name() != kDefaultWideColumnName) {
+    value.clear();
+    return Status::OK();
+  }
+
+  value = columns[0].value();
+
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_column_serialization.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_column_serialization.h
new file mode 100644
index 0000000000..f0ffbd3924
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_column_serialization.h
@@ -0,0 +1,77 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "rocksdb/wide_columns.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// Wide-column serialization/deserialization primitives.
+//
+// The two main parts of the layout are 1) a sorted index containing the column
+// names and column value sizes and 2) the column values themselves. Keeping
+// the index and the values separate will enable selectively reading column
+// values down the line. Note that currently the index has to be fully parsed
+// in order to find out the offset of each column value.
+//
+// Legend: cn = column name, cv = column value, cns = column name size, cvs =
+// column value size.
+//
+// +----------+--------------+----------+-------+----------+---...
+// | version  | # of columns |  cns 1   | cn 1  |  cvs 1   |
+// +----------+--------------+----------+-------+----------+---...
+// | varint32 |   varint32   | varint32 | bytes | varint32 |
+// +----------+--------------+----------+-------+----------+---...
+//
+// ... continued ...
+// +// ...---+----------+-------+----------+-------+---...---+-------+ +// | cns N | cn N | cvs N | cv 1 | | cv N | +// ...---+----------+-------+----------+-------+---...---+-------+ +// | varint32 | bytes | varint32 | bytes | | bytes | +// ...---+----------+-------+----------+-------+---...---+-------+ + +class WideColumnSerialization { + public: + static Status Serialize(const WideColumns& columns, std::string& output); + static Status Serialize(const Slice& value_of_default, + const WideColumns& other_columns, + std::string& output); + + static Status Deserialize(Slice& input, WideColumns& columns); + + static WideColumns::const_iterator Find(const WideColumns& columns, + const Slice& column_name); + static Status GetValueOfDefaultColumn(Slice& input, Slice& value); + + static constexpr uint32_t kCurrentVersion = 1; + + private: + static Status SerializeImpl(const Slice* value_of_default, + const WideColumns& columns, std::string& output); +}; + +inline Status WideColumnSerialization::Serialize(const WideColumns& columns, + std::string& output) { + constexpr Slice* value_of_default = nullptr; + + return SerializeImpl(value_of_default, columns, output); +} + +inline Status WideColumnSerialization::Serialize( + const Slice& value_of_default, const WideColumns& other_columns, + std::string& output) { + return SerializeImpl(&value_of_default, other_columns, output); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_columns.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_columns.cc new file mode 100644 index 0000000000..186be7f854 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/wide/wide_columns.cc @@ -0,0 +1,22 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/wide_columns.h" + +#include "db/wide/wide_column_serialization.h" + +namespace ROCKSDB_NAMESPACE { + +const Slice kDefaultWideColumnName; + +const WideColumns kNoWideColumns; + +Status PinnableWideColumns::CreateIndexForWideColumns() { + Slice value_copy = value_; + + return WideColumnSerialization::Deserialize(value_copy, columns_); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch.cc new file mode 100644 index 0000000000..726b14f098 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch.cc @@ -0,0 +1,3142 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
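+//
+// Editorial note: as a concrete instance of the rep_ layout described below,
+// a batch containing a single Put of key "k" and value "v" to the default
+// column family, with a hypothetical sequence number of 0, is encoded as:
+//
+//   sequence (fixed64, little-endian): 00 00 00 00 00 00 00 00
+//   count    (fixed32, little-endian): 01 00 00 00
+//   record: kTypeValue (01), varstring "k" (01 6b), varstring "v" (01 76)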
+// +// WriteBatch::rep_ := +// sequence: fixed64 +// count: fixed32 +// data: record[count] +// record := +// kTypeValue varstring varstring +// kTypeDeletion varstring +// kTypeSingleDeletion varstring +// kTypeRangeDeletion varstring varstring +// kTypeMerge varstring varstring +// kTypeColumnFamilyValue varint32 varstring varstring +// kTypeColumnFamilyDeletion varint32 varstring +// kTypeColumnFamilySingleDeletion varint32 varstring +// kTypeColumnFamilyRangeDeletion varint32 varstring varstring +// kTypeColumnFamilyMerge varint32 varstring varstring +// kTypeBeginPrepareXID +// kTypeEndPrepareXID varstring +// kTypeCommitXID varstring +// kTypeCommitXIDAndTimestamp varstring varstring +// kTypeRollbackXID varstring +// kTypeBeginPersistedPrepareXID +// kTypeBeginUnprepareXID +// kTypeWideColumnEntity varstring varstring +// kTypeColumnFamilyWideColumnEntity varint32 varstring varstring +// kTypeNoop +// varstring := +// len: varint32 +// data: uint8[len] + +#include "rocksdb/write_batch.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "db/flush_scheduler.h" +#include "db/kv_checksum.h" +#include "db/memtable.h" +#include "db/merge_context.h" +#include "db/snapshot_impl.h" +#include "db/trim_history_scheduler.h" +#include "db/wide/wide_column_serialization.h" +#include "db/write_batch_internal.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics_impl.h" +#include "port/lang.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/system_clock.h" +#include "util/autovector.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/duplicate_detector.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// anon namespace for file-local types +namespace { + +enum ContentFlags : uint32_t { + DEFERRED = 1 << 0, + HAS_PUT = 1 << 1, + HAS_DELETE = 1 << 2, + HAS_SINGLE_DELETE = 1 << 3, + HAS_MERGE = 1 << 4, + HAS_BEGIN_PREPARE = 1 << 5, + HAS_END_PREPARE = 1 << 6, + HAS_COMMIT = 1 << 7, + HAS_ROLLBACK = 1 << 8, + HAS_DELETE_RANGE = 1 << 9, + HAS_BLOB_INDEX = 1 << 10, + HAS_BEGIN_UNPREPARE = 1 << 11, + HAS_PUT_ENTITY = 1 << 12, +}; + +struct BatchContentClassifier : public WriteBatch::Handler { + uint32_t content_flags = 0; + + Status PutCF(uint32_t, const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_PUT; + return Status::OK(); + } + + Status PutEntityCF(uint32_t /* column_family_id */, const Slice& /* key */, + const Slice& /* entity */) override { + content_flags |= ContentFlags::HAS_PUT_ENTITY; + return Status::OK(); + } + + Status DeleteCF(uint32_t, const Slice&) override { + content_flags |= ContentFlags::HAS_DELETE; + return Status::OK(); + } + + Status SingleDeleteCF(uint32_t, const Slice&) override { + content_flags |= ContentFlags::HAS_SINGLE_DELETE; + return Status::OK(); + } + + Status DeleteRangeCF(uint32_t, const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_DELETE_RANGE; + return Status::OK(); + } + + Status MergeCF(uint32_t, const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_MERGE; + return Status::OK(); + } + + Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_BLOB_INDEX; + return Status::OK(); + } + + Status MarkBeginPrepare(bool unprepare) override { + content_flags |= ContentFlags::HAS_BEGIN_PREPARE; + if (unprepare) { + content_flags |= 
ContentFlags::HAS_BEGIN_UNPREPARE;
+    }
+    return Status::OK();
+  }
+
+  Status MarkEndPrepare(const Slice&) override {
+    content_flags |= ContentFlags::HAS_END_PREPARE;
+    return Status::OK();
+  }
+
+  Status MarkCommit(const Slice&) override {
+    content_flags |= ContentFlags::HAS_COMMIT;
+    return Status::OK();
+  }
+
+  Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+    content_flags |= ContentFlags::HAS_COMMIT;
+    return Status::OK();
+  }
+
+  Status MarkRollback(const Slice&) override {
+    content_flags |= ContentFlags::HAS_ROLLBACK;
+    return Status::OK();
+  }
+};
+
+}  // anonymous namespace
+
+struct SavePoints {
+  std::stack<SavePoint, autovector<SavePoint>> stack;
+};
+
+WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes,
+                       size_t protection_bytes_per_key,
+                       size_t default_cf_ts_sz)
+    : content_flags_(0),
+      max_bytes_(max_bytes),
+      default_cf_ts_sz_(default_cf_ts_sz),
+      rep_() {
+  // Currently `protection_bytes_per_key` can only be enabled at 8 bytes per
+  // entry.
+  assert(protection_bytes_per_key == 0 || protection_bytes_per_key == 8);
+  if (protection_bytes_per_key != 0) {
+    prot_info_.reset(new WriteBatch::ProtectionInfo());
+  }
+  rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
+                   ? reserved_bytes
+                   : WriteBatchInternal::kHeader);
+  rep_.resize(WriteBatchInternal::kHeader);
+}
+
+WriteBatch::WriteBatch(const std::string& rep)
+    : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), rep_(rep) {}
+
+WriteBatch::WriteBatch(std::string&& rep)
+    : content_flags_(ContentFlags::DEFERRED),
+      max_bytes_(0),
+      rep_(std::move(rep)) {}
+
+WriteBatch::WriteBatch(const WriteBatch& src)
+    : wal_term_point_(src.wal_term_point_),
+      content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
+      max_bytes_(src.max_bytes_),
+      default_cf_ts_sz_(src.default_cf_ts_sz_),
+      rep_(src.rep_) {
+  if (src.save_points_ != nullptr) {
+    save_points_.reset(new SavePoints());
+    save_points_->stack = src.save_points_->stack;
+  }
+  if (src.prot_info_ != nullptr) {
+    prot_info_.reset(new WriteBatch::ProtectionInfo());
+    prot_info_->entries_ = src.prot_info_->entries_;
+  }
+}
+
+WriteBatch::WriteBatch(WriteBatch&& src) noexcept
+    : save_points_(std::move(src.save_points_)),
+      wal_term_point_(std::move(src.wal_term_point_)),
+      content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
+      max_bytes_(src.max_bytes_),
+      prot_info_(std::move(src.prot_info_)),
+      default_cf_ts_sz_(src.default_cf_ts_sz_),
+      rep_(std::move(src.rep_)) {}
+
+WriteBatch& WriteBatch::operator=(const WriteBatch& src) {
+  if (&src != this) {
+    this->~WriteBatch();
+    new (this) WriteBatch(src);
+  }
+  return *this;
+}
+
+WriteBatch& WriteBatch::operator=(WriteBatch&& src) {
+  if (&src != this) {
+    this->~WriteBatch();
+    new (this) WriteBatch(std::move(src));
+  }
+  return *this;
+}
+
+WriteBatch::~WriteBatch() {}
+
+WriteBatch::Handler::~Handler() {}
+
+void WriteBatch::Handler::LogData(const Slice& /*blob*/) {
+  // If the user has not specified something to do with blobs, then we ignore
+  // them.
+} + +bool WriteBatch::Handler::Continue() { return true; } + +void WriteBatch::Clear() { + rep_.clear(); + rep_.resize(WriteBatchInternal::kHeader); + + content_flags_.store(0, std::memory_order_relaxed); + + if (save_points_ != nullptr) { + while (!save_points_->stack.empty()) { + save_points_->stack.pop(); + } + } + + if (prot_info_ != nullptr) { + prot_info_->entries_.clear(); + } + wal_term_point_.clear(); + default_cf_ts_sz_ = 0; +} + +uint32_t WriteBatch::Count() const { return WriteBatchInternal::Count(this); } + +uint32_t WriteBatch::ComputeContentFlags() const { + auto rv = content_flags_.load(std::memory_order_relaxed); + if ((rv & ContentFlags::DEFERRED) != 0) { + BatchContentClassifier classifier; + // Should we handle status here? + Iterate(&classifier).PermitUncheckedError(); + rv = classifier.content_flags; + + // this method is conceptually const, because it is performing a lazy + // computation that doesn't affect the abstract state of the batch. + // content_flags_ is marked mutable so that we can perform the + // following assignment + content_flags_.store(rv, std::memory_order_relaxed); + } + return rv; +} + +void WriteBatch::MarkWalTerminationPoint() { + wal_term_point_.size = GetDataSize(); + wal_term_point_.count = Count(); + wal_term_point_.content_flags = content_flags_; +} + +size_t WriteBatch::GetProtectionBytesPerKey() const { + if (prot_info_ != nullptr) { + return prot_info_->GetBytesPerKey(); + } + return 0; +} + +bool WriteBatch::HasPut() const { + return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0; +} + +bool WriteBatch::HasPutEntity() const { + return (ComputeContentFlags() & ContentFlags::HAS_PUT_ENTITY) != 0; +} + +bool WriteBatch::HasDelete() const { + return (ComputeContentFlags() & ContentFlags::HAS_DELETE) != 0; +} + +bool WriteBatch::HasSingleDelete() const { + return (ComputeContentFlags() & ContentFlags::HAS_SINGLE_DELETE) != 0; +} + +bool WriteBatch::HasDeleteRange() const { + return (ComputeContentFlags() & ContentFlags::HAS_DELETE_RANGE) != 0; +} + +bool WriteBatch::HasMerge() const { + return (ComputeContentFlags() & ContentFlags::HAS_MERGE) != 0; +} + +bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record) { + assert(input != nullptr && key != nullptr); + // Skip tag byte + input->remove_prefix(1); + + if (cf_record) { + // Skip column_family bytes + uint32_t cf; + if (!GetVarint32(input, &cf)) { + return false; + } + } + + // Extract key + return GetLengthPrefixedSlice(input, key); +} + +bool WriteBatch::HasBeginPrepare() const { + return (ComputeContentFlags() & ContentFlags::HAS_BEGIN_PREPARE) != 0; +} + +bool WriteBatch::HasEndPrepare() const { + return (ComputeContentFlags() & ContentFlags::HAS_END_PREPARE) != 0; +} + +bool WriteBatch::HasCommit() const { + return (ComputeContentFlags() & ContentFlags::HAS_COMMIT) != 0; +} + +bool WriteBatch::HasRollback() const { + return (ComputeContentFlags() & ContentFlags::HAS_ROLLBACK) != 0; +} + +Status ReadRecordFromWriteBatch(Slice* input, char* tag, + uint32_t* column_family, Slice* key, + Slice* value, Slice* blob, Slice* xid) { + assert(key != nullptr && value != nullptr); + *tag = (*input)[0]; + input->remove_prefix(1); + *column_family = 0; // default + switch (*tag) { + case kTypeColumnFamilyValue: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch Put"); + } + FALLTHROUGH_INTENDED; + case kTypeValue: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad 
WriteBatch Put"); + } + break; + case kTypeColumnFamilyDeletion: + case kTypeColumnFamilySingleDeletion: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch Delete"); + } + FALLTHROUGH_INTENDED; + case kTypeDeletion: + case kTypeSingleDeletion: + if (!GetLengthPrefixedSlice(input, key)) { + return Status::Corruption("bad WriteBatch Delete"); + } + break; + case kTypeColumnFamilyRangeDeletion: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch DeleteRange"); + } + FALLTHROUGH_INTENDED; + case kTypeRangeDeletion: + // for range delete, "key" is begin_key, "value" is end_key + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch DeleteRange"); + } + break; + case kTypeColumnFamilyMerge: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch Merge"); + } + FALLTHROUGH_INTENDED; + case kTypeMerge: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch Merge"); + } + break; + case kTypeColumnFamilyBlobIndex: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch BlobIndex"); + } + FALLTHROUGH_INTENDED; + case kTypeBlobIndex: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch BlobIndex"); + } + break; + case kTypeLogData: + assert(blob != nullptr); + if (!GetLengthPrefixedSlice(input, blob)) { + return Status::Corruption("bad WriteBatch Blob"); + } + break; + case kTypeNoop: + case kTypeBeginPrepareXID: + // This indicates that the prepared batch is also persisted in the db. + // This is used in WritePreparedTxn + case kTypeBeginPersistedPrepareXID: + // This is used in WriteUnpreparedTxn + case kTypeBeginUnprepareXID: + break; + case kTypeEndPrepareXID: + if (!GetLengthPrefixedSlice(input, xid)) { + return Status::Corruption("bad EndPrepare XID"); + } + break; + case kTypeCommitXIDAndTimestamp: + if (!GetLengthPrefixedSlice(input, key)) { + return Status::Corruption("bad commit timestamp"); + } + FALLTHROUGH_INTENDED; + case kTypeCommitXID: + if (!GetLengthPrefixedSlice(input, xid)) { + return Status::Corruption("bad Commit XID"); + } + break; + case kTypeRollbackXID: + if (!GetLengthPrefixedSlice(input, xid)) { + return Status::Corruption("bad Rollback XID"); + } + break; + case kTypeColumnFamilyWideColumnEntity: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch PutEntity"); + } + FALLTHROUGH_INTENDED; + case kTypeWideColumnEntity: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch PutEntity"); + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + return Status::OK(); +} + +Status WriteBatch::Iterate(Handler* handler) const { + if (rep_.size() < WriteBatchInternal::kHeader) { + return Status::Corruption("malformed WriteBatch (too small)"); + } + + return WriteBatchInternal::Iterate(this, handler, WriteBatchInternal::kHeader, + rep_.size()); +} + +Status WriteBatchInternal::Iterate(const WriteBatch* wb, + WriteBatch::Handler* handler, size_t begin, + size_t end) { + if (begin > wb->rep_.size() || end > wb->rep_.size() || end < begin) { + return Status::Corruption("Invalid start/end bounds for Iterate"); + } + assert(begin <= end); + Slice input(wb->rep_.data() + begin, 
static_cast(end - begin)); + bool whole_batch = + (begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size()); + + Slice key, value, blob, xid; + + // Sometimes a sub-batch starts with a Noop. We want to exclude such Noops as + // the batch boundary symbols otherwise we would mis-count the number of + // batches. We do that by checking whether the accumulated batch is empty + // before seeing the next Noop. + bool empty_batch = true; + uint32_t found = 0; + Status s; + char tag = 0; + uint32_t column_family = 0; // default + bool last_was_try_again = false; + bool handler_continue = true; + while (((s.ok() && !input.empty()) || UNLIKELY(s.IsTryAgain()))) { + handler_continue = handler->Continue(); + if (!handler_continue) { + break; + } + + if (LIKELY(!s.IsTryAgain())) { + last_was_try_again = false; + tag = 0; + column_family = 0; // default + + s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value, + &blob, &xid); + if (!s.ok()) { + return s; + } + } else { + assert(s.IsTryAgain()); + assert(!last_was_try_again); // to detect infinite loop bugs + if (UNLIKELY(last_was_try_again)) { + return Status::Corruption( + "two consecutive TryAgain in WriteBatch handler; this is either a " + "software bug or data corruption."); + } + last_was_try_again = true; + s = Status::OK(); + } + + switch (tag) { + case kTypeColumnFamilyValue: + case kTypeValue: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_PUT)); + s = handler->PutCF(column_family, key, value); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilyDeletion: + case kTypeDeletion: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE)); + s = handler->DeleteCF(column_family, key); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilySingleDeletion: + case kTypeSingleDeletion: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE)); + s = handler->SingleDeleteCF(column_family, key); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilyRangeDeletion: + case kTypeRangeDeletion: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE)); + s = handler->DeleteRangeCF(column_family, key, value); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilyMerge: + case kTypeMerge: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE)); + s = handler->MergeCF(column_family, key, value); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilyBlobIndex: + case kTypeBlobIndex: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX)); + s = handler->PutBlobIndexCF(column_family, key, value); + if (LIKELY(s.ok())) { + found++; + } + break; + case kTypeLogData: + handler->LogData(blob); + // A batch might have nothing but LogData. It is still a batch. 
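+        // (Editorial note: LogData blobs, added via WriteBatch::PutLogData(),
+        // are WAL-only payloads; they are handed to the handler here but are
+        // never inserted into a memtable, which is why `found` is not
+        // incremented for them.)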
+ empty_batch = false; + break; + case kTypeBeginPrepareXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); + s = handler->MarkBeginPrepare(); + assert(s.ok()); + empty_batch = false; + if (handler->WriteAfterCommit() == + WriteBatch::Handler::OptionState::kDisabled) { + s = Status::NotSupported( + "WriteCommitted txn tag when write_after_commit_ is disabled (in " + "WritePrepared/WriteUnprepared mode). If it is not due to " + "corruption, the WAL must be emptied before changing the " + "WritePolicy."); + } + if (handler->WriteBeforePrepare() == + WriteBatch::Handler::OptionState::kEnabled) { + s = Status::NotSupported( + "WriteCommitted txn tag when write_before_prepare_ is enabled " + "(in WriteUnprepared mode). If it is not due to corruption, the " + "WAL must be emptied before changing the WritePolicy."); + } + break; + case kTypeBeginPersistedPrepareXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); + s = handler->MarkBeginPrepare(); + assert(s.ok()); + empty_batch = false; + if (handler->WriteAfterCommit() == + WriteBatch::Handler::OptionState::kEnabled) { + s = Status::NotSupported( + "WritePrepared/WriteUnprepared txn tag when write_after_commit_ " + "is enabled (in default WriteCommitted mode). If it is not due " + "to corruption, the WAL must be emptied before changing the " + "WritePolicy."); + } + break; + case kTypeBeginUnprepareXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE)); + s = handler->MarkBeginPrepare(true /* unprepared */); + assert(s.ok()); + empty_batch = false; + if (handler->WriteAfterCommit() == + WriteBatch::Handler::OptionState::kEnabled) { + s = Status::NotSupported( + "WriteUnprepared txn tag when write_after_commit_ is enabled (in " + "default WriteCommitted mode). If it is not due to corruption, " + "the WAL must be emptied before changing the WritePolicy."); + } + if (handler->WriteBeforePrepare() == + WriteBatch::Handler::OptionState::kDisabled) { + s = Status::NotSupported( + "WriteUnprepared txn tag when write_before_prepare_ is disabled " + "(in WriteCommitted/WritePrepared mode). If it is not due to " + "corruption, the WAL must be emptied before changing the " + "WritePolicy."); + } + break; + case kTypeEndPrepareXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE)); + s = handler->MarkEndPrepare(xid); + assert(s.ok()); + empty_batch = true; + break; + case kTypeCommitXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); + s = handler->MarkCommit(xid); + assert(s.ok()); + empty_batch = true; + break; + case kTypeCommitXIDAndTimestamp: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); + // key stores the commit timestamp. 
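+        // (Editorial note: for kTypeCommitXIDAndTimestamp,
+        // ReadRecordFromWriteBatch() decodes the timestamp into `key` before
+        // falling through to read the XID, so a valid record implies a
+        // non-empty `key` here.)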
+ assert(!key.empty()); + s = handler->MarkCommitWithTimestamp(xid, key); + if (LIKELY(s.ok())) { + empty_batch = true; + } + break; + case kTypeRollbackXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK)); + s = handler->MarkRollback(xid); + assert(s.ok()); + empty_batch = true; + break; + case kTypeNoop: + s = handler->MarkNoop(empty_batch); + assert(s.ok()); + empty_batch = true; + break; + case kTypeWideColumnEntity: + case kTypeColumnFamilyWideColumnEntity: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_PUT_ENTITY)); + s = handler->PutEntityCF(column_family, key, value); + if (LIKELY(s.ok())) { + empty_batch = false; + ++found; + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + } + if (!s.ok()) { + return s; + } + if (handler_continue && whole_batch && + found != WriteBatchInternal::Count(wb)) { + return Status::Corruption("WriteBatch has wrong count"); + } else { + return Status::OK(); + } +} + +bool WriteBatchInternal::IsLatestPersistentState(const WriteBatch* b) { + return b->is_latest_persistent_state_; +} + +void WriteBatchInternal::SetAsLatestPersistentState(WriteBatch* b) { + b->is_latest_persistent_state_ = true; +} + +uint32_t WriteBatchInternal::Count(const WriteBatch* b) { + return DecodeFixed32(b->rep_.data() + 8); +} + +void WriteBatchInternal::SetCount(WriteBatch* b, uint32_t n) { + EncodeFixed32(&b->rep_[8], n); +} + +SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { + return SequenceNumber(DecodeFixed64(b->rep_.data())); +} + +void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { + EncodeFixed64(&b->rep_[0], seq); +} + +size_t WriteBatchInternal::GetFirstOffset(WriteBatch* /*b*/) { + return WriteBatchInternal::kHeader; +} + +std::tuple +WriteBatchInternal::GetColumnFamilyIdAndTimestampSize( + WriteBatch* b, ColumnFamilyHandle* column_family) { + uint32_t cf_id = GetColumnFamilyID(column_family); + size_t ts_sz = 0; + Status s; + if (column_family) { + const Comparator* const ucmp = column_family->GetComparator(); + if (ucmp) { + ts_sz = ucmp->timestamp_size(); + if (0 == cf_id && b->default_cf_ts_sz_ != ts_sz) { + s = Status::InvalidArgument("Default cf timestamp size mismatch"); + } + } + } else if (b->default_cf_ts_sz_ > 0) { + ts_sz = b->default_cf_ts_sz_; + } + return std::make_tuple(s, cf_id, ts_sz); +} + +namespace { +Status CheckColumnFamilyTimestampSize(ColumnFamilyHandle* column_family, + const Slice& ts) { + if (!column_family) { + return Status::InvalidArgument("column family handle cannot be null"); + } + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t cf_ts_sz = ucmp->timestamp_size(); + if (0 == cf_ts_sz) { + return Status::InvalidArgument("timestamp disabled"); + } + if (cf_ts_sz != ts.size()) { + return Status::InvalidArgument("timestamp size mismatch"); + } + return Status::OK(); +} +} // anonymous namespace + +Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, + const Slice& key, const Slice& value) { + if (key.size() > size_t{std::numeric_limits::max()}) { + return Status::InvalidArgument("key is too large"); + } + if (value.size() > size_t{std::numeric_limits::max()}) { + return Status::InvalidArgument("value is too large"); + } + + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + 
b->rep_.push_back(static_cast(kTypeValue)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, value); + b->content_flags_.store( + b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // Technically the optype could've been `kTypeColumnFamilyValue` with the + // CF ID encoded in the `WriteBatch`. That distinction is unimportant + // however since we verify CF ID is correct, as well as all other fields + // (a missing/extra encoded CF ID would corrupt another field). It is + // convenient to consolidate on `kTypeValue` here as that is what will be + // inserted into memtable. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeValue) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (0 == ts_sz) { + return WriteBatchInternal::Put(this, cf_id, key, value); + } + + needs_in_place_update_ts_ = true; + has_key_with_ts_ = true; + std::string dummy_ts(ts_sz, '\0'); + std::array key_with_ts{{key, dummy_ts}}; + return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2), + SliceParts(&value, 1)); +} + +Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts, const Slice& value) { + const Status s = CheckColumnFamilyTimestampSize(column_family, ts); + if (!s.ok()) { + return s; + } + has_key_with_ts_ = true; + assert(column_family); + uint32_t cf_id = column_family->GetID(); + std::array key_with_ts{{key, ts}}; + return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2), + SliceParts(&value, 1)); +} + +Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key, + const SliceParts& value) { + size_t total_key_bytes = 0; + for (int i = 0; i < key.num_parts; ++i) { + total_key_bytes += key.parts[i].size(); + } + if (total_key_bytes >= size_t{std::numeric_limits::max()}) { + return Status::InvalidArgument("key is too large"); + } + + size_t total_value_bytes = 0; + for (int i = 0; i < value.num_parts; ++i) { + total_value_bytes += value.parts[i].size(); + } + if (total_value_bytes >= size_t{std::numeric_limits::max()}) { + return Status::InvalidArgument("value is too large"); + } + return Status::OK(); +} + +Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, + const SliceParts& key, const SliceParts& value) { + Status s = CheckSlicePartsLength(key, value); + if (!s.ok()) { + return s; + } + + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeValue)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSliceParts(&b->rep_, key); + PutLengthPrefixedSliceParts(&b->rep_, value); + b->content_flags_.store( + b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // 
`ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeValue) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (ts_sz == 0) { + return WriteBatchInternal::Put(this, cf_id, key, value); + } + + return Status::InvalidArgument( + "Cannot call this method on column family enabling timestamp"); +} + +Status WriteBatchInternal::PutEntity(WriteBatch* b, uint32_t column_family_id, + const Slice& key, + const WideColumns& columns) { + assert(b); + + if (key.size() > size_t{std::numeric_limits::max()}) { + return Status::InvalidArgument("key is too large"); + } + + WideColumns sorted_columns(columns); + std::sort(sorted_columns.begin(), sorted_columns.end(), + [](const WideColumn& lhs, const WideColumn& rhs) { + return lhs.name().compare(rhs.name()) < 0; + }); + + std::string entity; + const Status s = WideColumnSerialization::Serialize(sorted_columns, entity); + if (!s.ok()) { + return s; + } + + if (entity.size() > size_t{std::numeric_limits::max()}) { + return Status::InvalidArgument("wide column entity is too large"); + } + + LocalSavePoint save(b); + + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeWideColumnEntity)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyWideColumnEntity)); + PutVarint32(&b->rep_, column_family_id); + } + + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, entity); + + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_PUT_ENTITY, + std::memory_order_relaxed); + + if (b->prot_info_ != nullptr) { + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, entity, kTypeWideColumnEntity) + .ProtectC(column_family_id)); + } + + return save.commit(); +} + +Status WriteBatch::PutEntity(ColumnFamilyHandle* column_family, + const Slice& key, const WideColumns& columns) { + if (!column_family) { + return Status::InvalidArgument( + "Cannot call this method without a column family handle"); + } + + Status s; + uint32_t cf_id = 0; + size_t ts_sz = 0; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (ts_sz) { + return Status::InvalidArgument( + "Cannot call this method on column family enabling timestamp"); + } + + return WriteBatchInternal::PutEntity(this, cf_id, key, columns); +} + +Status WriteBatchInternal::InsertNoop(WriteBatch* b) { + b->rep_.push_back(static_cast(kTypeNoop)); + return Status::OK(); +} + +Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid, + bool write_after_commit, + bool unprepared_batch) { + // a manually constructed batch can only contain one prepare section + assert(b->rep_[12] == static_cast(kTypeNoop)); + + // all savepoints up to this point are cleared + if (b->save_points_ != nullptr) { + while (!b->save_points_->stack.empty()) { + b->save_points_->stack.pop(); + } + } + + // rewrite noop as begin marker + b->rep_[12] = static_cast( + write_after_commit ? kTypeBeginPrepareXID + : (unprepared_batch ? 
kTypeBeginUnprepareXID + : kTypeBeginPersistedPrepareXID)); + b->rep_.push_back(static_cast(kTypeEndPrepareXID)); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_END_PREPARE | + ContentFlags::HAS_BEGIN_PREPARE, + std::memory_order_relaxed); + if (unprepared_batch) { + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_BEGIN_UNPREPARE, + std::memory_order_relaxed); + } + return Status::OK(); +} + +Status WriteBatchInternal::MarkCommit(WriteBatch* b, const Slice& xid) { + b->rep_.push_back(static_cast(kTypeCommitXID)); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_COMMIT, + std::memory_order_relaxed); + return Status::OK(); +} + +Status WriteBatchInternal::MarkCommitWithTimestamp(WriteBatch* b, + const Slice& xid, + const Slice& commit_ts) { + assert(!commit_ts.empty()); + b->rep_.push_back(static_cast(kTypeCommitXIDAndTimestamp)); + PutLengthPrefixedSlice(&b->rep_, commit_ts); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_COMMIT, + std::memory_order_relaxed); + return Status::OK(); +} + +Status WriteBatchInternal::MarkRollback(WriteBatch* b, const Slice& xid) { + b->rep_.push_back(static_cast(kTypeRollbackXID)); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_ROLLBACK, + std::memory_order_relaxed); + return Status::OK(); +} + +Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, + const Slice& key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeDeletion)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_DELETE, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. 
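+    // (Editorial note: the protection entry covers the key, the value (empty
+    // for a plain delete), the normalized operation type, and the column
+    // family id, so a flipped byte in any of those fields is caught when the
+    // entry is later checksum-verified.)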
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, "" /* value */, kTypeDeletion) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (0 == ts_sz) { + return WriteBatchInternal::Delete(this, cf_id, key); + } + + needs_in_place_update_ts_ = true; + has_key_with_ts_ = true; + std::string dummy_ts(ts_sz, '\0'); + std::array key_with_ts{{key, dummy_ts}}; + return WriteBatchInternal::Delete(this, cf_id, + SliceParts(key_with_ts.data(), 2)); +} + +Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) { + const Status s = CheckColumnFamilyTimestampSize(column_family, ts); + if (!s.ok()) { + return s; + } + assert(column_family); + has_key_with_ts_ = true; + uint32_t cf_id = column_family->GetID(); + std::array key_with_ts{{key, ts}}; + return WriteBatchInternal::Delete(this, cf_id, + SliceParts(key_with_ts.data(), 2)); +} + +Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, + const SliceParts& key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeDeletion)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSliceParts(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_DELETE, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, + SliceParts(nullptr /* _parts */, 0 /* _num_parts */), + kTypeDeletion) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (0 == ts_sz) { + return WriteBatchInternal::Delete(this, cf_id, key); + } + + return Status::InvalidArgument( + "Cannot call this method on column family enabling timestamp"); +} + +Status WriteBatchInternal::SingleDelete(WriteBatch* b, + uint32_t column_family_id, + const Slice& key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeSingleDeletion)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilySingleDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_SINGLE_DELETE, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. 
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, "" /* value */, kTypeSingleDeletion) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (0 == ts_sz) { + return WriteBatchInternal::SingleDelete(this, cf_id, key); + } + + needs_in_place_update_ts_ = true; + has_key_with_ts_ = true; + std::string dummy_ts(ts_sz, '\0'); + std::array key_with_ts{{key, dummy_ts}}; + return WriteBatchInternal::SingleDelete(this, cf_id, + SliceParts(key_with_ts.data(), 2)); +} + +Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts) { + const Status s = CheckColumnFamilyTimestampSize(column_family, ts); + if (!s.ok()) { + return s; + } + has_key_with_ts_ = true; + assert(column_family); + uint32_t cf_id = column_family->GetID(); + std::array key_with_ts{{key, ts}}; + return WriteBatchInternal::SingleDelete(this, cf_id, + SliceParts(key_with_ts.data(), 2)); +} + +Status WriteBatchInternal::SingleDelete(WriteBatch* b, + uint32_t column_family_id, + const SliceParts& key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeSingleDeletion)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilySingleDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSliceParts(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_SINGLE_DELETE, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. 
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, + SliceParts(nullptr /* _parts */, + 0 /* _num_parts */) /* value */, + kTypeSingleDeletion) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (0 == ts_sz) { + return WriteBatchInternal::SingleDelete(this, cf_id, key); + } + + return Status::InvalidArgument( + "Cannot call this method on column family enabling timestamp"); +} + +Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id, + const Slice& begin_key, + const Slice& end_key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeRangeDeletion)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyRangeDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, begin_key); + PutLengthPrefixedSlice(&b->rep_, end_key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_DELETE_RANGE, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + // In `DeleteRange()`, the end key is treated as the value. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(begin_key, end_key, kTypeRangeDeletion) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (0 == ts_sz) { + return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key); + } + + needs_in_place_update_ts_ = true; + has_key_with_ts_ = true; + std::string dummy_ts(ts_sz, '\0'); + std::array begin_key_with_ts{{begin_key, dummy_ts}}; + std::array end_key_with_ts{{end_key, dummy_ts}}; + return WriteBatchInternal::DeleteRange( + this, cf_id, SliceParts(begin_key_with_ts.data(), 2), + SliceParts(end_key_with_ts.data(), 2)); +} + +Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key, + const Slice& ts) { + const Status s = CheckColumnFamilyTimestampSize(column_family, ts); + if (!s.ok()) { + return s; + } + assert(column_family); + has_key_with_ts_ = true; + uint32_t cf_id = column_family->GetID(); + std::array key_with_ts{{begin_key, ts}}; + std::array end_key_with_ts{{end_key, ts}}; + return WriteBatchInternal::DeleteRange(this, cf_id, + SliceParts(key_with_ts.data(), 2), + SliceParts(end_key_with_ts.data(), 2)); +} + +Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id, + const SliceParts& begin_key, + const SliceParts& end_key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeRangeDeletion)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyRangeDeletion)); + 
PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSliceParts(&b->rep_, begin_key); + PutLengthPrefixedSliceParts(&b->rep_, end_key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_DELETE_RANGE, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + // In `DeleteRange()`, the end key is treated as the value. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(begin_key, end_key, kTypeRangeDeletion) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family, + const SliceParts& begin_key, + const SliceParts& end_key) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (0 == ts_sz) { + return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key); + } + + return Status::InvalidArgument( + "Cannot call this method on column family enabling timestamp"); +} + +Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, + const Slice& key, const Slice& value) { + if (key.size() > size_t{std::numeric_limits::max()}) { + return Status::InvalidArgument("key is too large"); + } + if (value.size() > size_t{std::numeric_limits::max()}) { + return Status::InvalidArgument("value is too large"); + } + + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeMerge)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyMerge)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, value); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_MERGE, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. 
+ b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeMerge) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (0 == ts_sz) { + return WriteBatchInternal::Merge(this, cf_id, key, value); + } + + needs_in_place_update_ts_ = true; + has_key_with_ts_ = true; + std::string dummy_ts(ts_sz, '\0'); + std::array key_with_ts{{key, dummy_ts}}; + + return WriteBatchInternal::Merge( + this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1)); +} + +Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts, const Slice& value) { + const Status s = CheckColumnFamilyTimestampSize(column_family, ts); + if (!s.ok()) { + return s; + } + has_key_with_ts_ = true; + assert(column_family); + uint32_t cf_id = column_family->GetID(); + std::array key_with_ts{{key, ts}}; + return WriteBatchInternal::Merge( + this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1)); +} + +Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, + const SliceParts& key, + const SliceParts& value) { + Status s = CheckSlicePartsLength(key, value); + if (!s.ok()) { + return s; + } + + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeMerge)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyMerge)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSliceParts(&b->rep_, key); + PutLengthPrefixedSliceParts(&b->rep_, value); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_MERGE, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. 
+ b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeMerge) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::Merge(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } + + if (0 == ts_sz) { + return WriteBatchInternal::Merge(this, cf_id, key, value); + } + + return Status::InvalidArgument( + "Cannot call this method on column family enabling timestamp"); +} + +Status WriteBatchInternal::PutBlobIndex(WriteBatch* b, + uint32_t column_family_id, + const Slice& key, const Slice& value) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeBlobIndex)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyBlobIndex)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, value); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_BLOB_INDEX, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, value, kTypeBlobIndex) + .ProtectC(column_family_id)); + } + return save.commit(); +} + +Status WriteBatch::PutLogData(const Slice& blob) { + LocalSavePoint save(this); + rep_.push_back(static_cast(kTypeLogData)); + PutLengthPrefixedSlice(&rep_, blob); + return save.commit(); +} + +void WriteBatch::SetSavePoint() { + if (save_points_ == nullptr) { + save_points_.reset(new SavePoints()); + } + // Record length and count of current batch of writes. 
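+  // (Editorial sketch of the savepoint protocol, with hypothetical names:
+  //
+  //   WriteBatch wb;
+  //   wb.Put(cf, "a", "1");
+  //   wb.SetSavePoint();
+  //   wb.Put(cf, "b", "2");
+  //   wb.RollbackToSavePoint();  // drops the Put of "b"; "a" is kept
+  //
+  // The push below records exactly the state needed for such a rollback.)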
+void WriteBatch::SetSavePoint() {
+  if (save_points_ == nullptr) {
+    save_points_.reset(new SavePoints());
+  }
+  // Record length and count of current batch of writes.
+  save_points_->stack.push(SavePoint(
+      GetDataSize(), Count(), content_flags_.load(std::memory_order_relaxed)));
+}
+
+Status WriteBatch::RollbackToSavePoint() {
+  if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+    return Status::NotFound();
+  }
+
+  // Pop the most recent savepoint off the stack
+  SavePoint savepoint = save_points_->stack.top();
+  save_points_->stack.pop();
+
+  assert(savepoint.size <= rep_.size());
+  assert(static_cast<uint32_t>(savepoint.count) <= Count());
+
+  if (savepoint.size == rep_.size()) {
+    // No changes to rollback
+  } else if (savepoint.size == 0) {
+    // Rollback everything
+    Clear();
+  } else {
+    rep_.resize(savepoint.size);
+    if (prot_info_ != nullptr) {
+      prot_info_->entries_.resize(savepoint.count);
+    }
+    WriteBatchInternal::SetCount(this, savepoint.count);
+    content_flags_.store(savepoint.content_flags, std::memory_order_relaxed);
+  }
+
+  return Status::OK();
+}
+
+Status WriteBatch::PopSavePoint() {
+  if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+    return Status::NotFound();
+  }
+
+  // Pop the most recent savepoint off the stack
+  save_points_->stack.pop();
+
+  return Status::OK();
+}
+
+Status WriteBatch::UpdateTimestamps(
+    const Slice& ts, std::function<size_t(uint32_t)> ts_sz_func) {
+  TimestampUpdater<decltype(ts_sz_func)> ts_updater(prot_info_.get(),
+                                                    std::move(ts_sz_func), ts);
+  const Status s = Iterate(&ts_updater);
+  if (s.ok()) {
+    needs_in_place_update_ts_ = false;
+  }
+  return s;
+}
+
+Status WriteBatch::VerifyChecksum() const {
+  if (prot_info_ == nullptr) {
+    return Status::OK();
+  }
+  Slice input(rep_.data() + WriteBatchInternal::kHeader,
+              rep_.size() - WriteBatchInternal::kHeader);
+  Slice key, value, blob, xid;
+  char tag = 0;
+  uint32_t column_family = 0;  // default
+  Status s;
+  size_t prot_info_idx = 0;
+  bool checksum_protected = true;
+  while (!input.empty() && prot_info_idx < prot_info_->entries_.size()) {
+    // In case key/value/column_family are not updated by
+    // ReadRecordFromWriteBatch
+    key.clear();
+    value.clear();
+    column_family = 0;
+    s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
+                                 &blob, &xid);
+    if (!s.ok()) {
+      return s;
+    }
+    checksum_protected = true;
+    // Write batch checksum uses op_type without ColumnFamily (e.g., if op_type
+    // in the write batch is kTypeColumnFamilyValue, kTypeValue is used to
+    // compute the checksum), and encodes column family id separately. See
+    // comment in first `WriteBatchInternal::Put()` for more detail.
+    switch (tag) {
+      case kTypeColumnFamilyValue:
+      case kTypeValue:
+        tag = kTypeValue;
+        break;
+      case kTypeColumnFamilyDeletion:
+      case kTypeDeletion:
+        tag = kTypeDeletion;
+        break;
+      case kTypeColumnFamilySingleDeletion:
+      case kTypeSingleDeletion:
+        tag = kTypeSingleDeletion;
+        break;
+      case kTypeColumnFamilyRangeDeletion:
+      case kTypeRangeDeletion:
+        tag = kTypeRangeDeletion;
+        break;
+      case kTypeColumnFamilyMerge:
+      case kTypeMerge:
+        tag = kTypeMerge;
+        break;
+      case kTypeColumnFamilyBlobIndex:
+      case kTypeBlobIndex:
+        tag = kTypeBlobIndex;
+        break;
+      case kTypeLogData:
+      case kTypeBeginPrepareXID:
+      case kTypeEndPrepareXID:
+      case kTypeCommitXID:
+      case kTypeRollbackXID:
+      case kTypeNoop:
+      case kTypeBeginPersistedPrepareXID:
+      case kTypeBeginUnprepareXID:
+      case kTypeDeletionWithTimestamp:
+      case kTypeCommitXIDAndTimestamp:
+        checksum_protected = false;
+        break;
+      case kTypeColumnFamilyWideColumnEntity:
+      case kTypeWideColumnEntity:
+        tag = kTypeWideColumnEntity;
+        break;
+      default:
+        return Status::Corruption(
+            "unknown WriteBatch tag",
+            std::to_string(static_cast<unsigned int>(tag)));
+    }
+    if (checksum_protected) {
+      s = prot_info_->entries_[prot_info_idx++]
+              .StripC(column_family)
+              .StripKVO(key, value, static_cast<ValueType>(tag))
+              .GetStatus();
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  if (prot_info_idx != WriteBatchInternal::Count(this)) {
+    return Status::Corruption("WriteBatch has wrong count");
+  }
+  assert(WriteBatchInternal::Count(this) == prot_info_->entries_.size());
+  return Status::OK();
+}
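VerifyChecksum() above is a no-op unless the batch carries per-key protection info, which is currently fixed at eight bytes per key. A caller-side sketch of enabling it through the public constructor (the zero defaults and argument names are shown only for clarity):

    #include <cassert>

    #include "rocksdb/write_batch.h"

    void ProtectedBatchSketch() {
      rocksdb::WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
                                /*protection_bytes_per_key=*/8,
                                /*default_cf_ts_sz=*/0);
      assert(batch.Put("k", "v").ok());
      assert(batch.Merge("k", "-suffix").ok());
      // Recomputes each entry's checksum over the serialized batch and
      // compares it with the stored protection info.
      assert(batch.VerifyChecksum().ok());
    }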
+namespace {
+
+class MemTableInserter : public WriteBatch::Handler {
+  SequenceNumber sequence_;
+  ColumnFamilyMemTables* const cf_mems_;
+  FlushScheduler* const flush_scheduler_;
+  TrimHistoryScheduler* const trim_history_scheduler_;
+  const bool ignore_missing_column_families_;
+  const uint64_t recovering_log_number_;
+  // log number that all Memtables inserted into should reference
+  uint64_t log_number_ref_;
+  DBImpl* db_;
+  const bool concurrent_memtable_writes_;
+  bool post_info_created_;
+  const WriteBatch::ProtectionInfo* prot_info_;
+  size_t prot_info_idx_;
+
+  bool* has_valid_writes_;
+  // On some (!) platforms just default creating
+  // a map is too expensive in the Write() path as they
+  // cause memory allocations though unused.
+  // Make creation optional but do not incur
+  // std::unique_ptr additional allocation
+  using MemPostInfoMap = std::map<MemTable*, MemTablePostProcessInfo>;
+  using PostMapType = std::aligned_storage<sizeof(MemPostInfoMap)>::type;
+  PostMapType mem_post_info_map_;
+  // current recovered transaction we are rebuilding (recovery)
+  WriteBatch* rebuilding_trx_;
+  SequenceNumber rebuilding_trx_seq_;
+  // Increase seq number once per each write batch. Otherwise increase it once
+  // per key.
+  bool seq_per_batch_;
+  // Whether the memtable write will be done only after the commit
+  bool write_after_commit_;
+  // Whether memtable write can be done before prepare
+  bool write_before_prepare_;
+  // Whether this batch was unprepared or not
+  bool unprepared_batch_;
+  using DupDetector = std::aligned_storage<sizeof(DuplicateDetector)>::type;
+  DupDetector duplicate_detector_;
+  bool dup_dectector_on_;
+
+  bool hint_per_batch_;
+  bool hint_created_;
+  // Hints for this batch
+  using HintMap = std::unordered_map<MemTable*, void*>;
+  using HintMapType = std::aligned_storage<sizeof(HintMap)>::type;
+  HintMapType hint_;
+
+  HintMap& GetHintMap() {
+    assert(hint_per_batch_);
+    if (!hint_created_) {
+      new (&hint_) HintMap();
+      hint_created_ = true;
+    }
+    return *reinterpret_cast<HintMap*>(&hint_);
+  }
+
+  MemPostInfoMap& GetPostMap() {
+    assert(concurrent_memtable_writes_);
+    if (!post_info_created_) {
+      new (&mem_post_info_map_) MemPostInfoMap();
+      post_info_created_ = true;
+    }
+    return *reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_);
+  }
+
+  bool IsDuplicateKeySeq(uint32_t column_family_id, const Slice& key) {
+    assert(!write_after_commit_);
+    assert(rebuilding_trx_ != nullptr);
+    if (!dup_dectector_on_) {
+      new (&duplicate_detector_) DuplicateDetector(db_);
+      dup_dectector_on_ = true;
+    }
+    return reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
+        ->IsDuplicateKeySeq(column_family_id, key, sequence_);
+  }
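+  // A sketch of the lazy-construction idiom shared by the three accessors
+  // above: inline aligned storage plus placement new keeps the common write
+  // path free of container allocations, at the cost of manual destruction
+  // (see the destructor below), e.g.:
+  //
+  //   std::aligned_storage<sizeof(Map), alignof(Map)>::type storage_;
+  //   bool created_ = false;
+  //   Map& Get() {
+  //     if (!created_) { new (&storage_) Map(); created_ = true; }
+  //     return *reinterpret_cast<Map*>(&storage_);
+  //   }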
+  const ProtectionInfoKVOC64* NextProtectionInfo() {
+    const ProtectionInfoKVOC64* res = nullptr;
+    if (prot_info_ != nullptr) {
+      assert(prot_info_idx_ < prot_info_->entries_.size());
+      res = &prot_info_->entries_[prot_info_idx_];
+      ++prot_info_idx_;
+    }
+    return res;
+  }
+
+  void DecrementProtectionInfoIdxForTryAgain() {
+    if (prot_info_ != nullptr) --prot_info_idx_;
+  }
+
+  void ResetProtectionInfo() {
+    prot_info_idx_ = 0;
+    prot_info_ = nullptr;
+  }
+
+ protected:
+  Handler::OptionState WriteBeforePrepare() const override {
+    return write_before_prepare_ ? Handler::OptionState::kEnabled
+                                 : Handler::OptionState::kDisabled;
+  }
+  Handler::OptionState WriteAfterCommit() const override {
+    return write_after_commit_ ? Handler::OptionState::kEnabled
+                               : Handler::OptionState::kDisabled;
+  }
+
+ public:
+  // cf_mems should not be shared with concurrent inserters
+  MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems,
+                   FlushScheduler* flush_scheduler,
+                   TrimHistoryScheduler* trim_history_scheduler,
+                   bool ignore_missing_column_families,
+                   uint64_t recovering_log_number, DB* db,
+                   bool concurrent_memtable_writes,
+                   const WriteBatch::ProtectionInfo* prot_info,
+                   bool* has_valid_writes = nullptr, bool seq_per_batch = false,
+                   bool batch_per_txn = true, bool hint_per_batch = false)
+      : sequence_(_sequence),
+        cf_mems_(cf_mems),
+        flush_scheduler_(flush_scheduler),
+        trim_history_scheduler_(trim_history_scheduler),
+        ignore_missing_column_families_(ignore_missing_column_families),
+        recovering_log_number_(recovering_log_number),
+        log_number_ref_(0),
+        db_(static_cast_with_check<DBImpl>(db)),
+        concurrent_memtable_writes_(concurrent_memtable_writes),
+        post_info_created_(false),
+        prot_info_(prot_info),
+        prot_info_idx_(0),
+        has_valid_writes_(has_valid_writes),
+        rebuilding_trx_(nullptr),
+        rebuilding_trx_seq_(0),
+        seq_per_batch_(seq_per_batch),
+        // Write after commit currently uses one seq per key (instead of per
+        // batch). So seq_per_batch being false indicates write_after_commit
+        // approach.
+        write_after_commit_(!seq_per_batch),
+        // WriteUnprepared can write WriteBatches per transaction, so
+        // batch_per_txn being false indicates write_before_prepare.
+        write_before_prepare_(!batch_per_txn),
+        unprepared_batch_(false),
+        duplicate_detector_(),
+        dup_dectector_on_(false),
+        hint_per_batch_(hint_per_batch),
+        hint_created_(false) {
+    assert(cf_mems_);
+  }
+
+  ~MemTableInserter() override {
+    if (dup_dectector_on_) {
+      reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
+          ->~DuplicateDetector();
+    }
+    if (post_info_created_) {
+      reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_)
+          ->~MemPostInfoMap();
+    }
+    if (hint_created_) {
+      for (auto iter : GetHintMap()) {
+        delete[] reinterpret_cast<char*>(iter.second);
+      }
+      reinterpret_cast<HintMap*>(&hint_)->~HintMap();
+    }
+    delete rebuilding_trx_;
+  }
+
+  MemTableInserter(const MemTableInserter&) = delete;
+  MemTableInserter& operator=(const MemTableInserter&) = delete;
+
+  // The batch seq is regularly restarted; in normal mode it is set when
+  // MemTableInserter is constructed in the write thread and in recovery mode
+  // it is set when a batch, which is tagged with seq, is read from the WAL.
+  // Within a sequenced batch, which could be a merge of multiple batches, we
+  // have two policies to advance the seq: i) seq_per_key (default) and ii)
+  // seq_per_batch. To implement the latter we need to mark the boundary
+  // between the individual batches. The approach is this: 1) Use the
+  // terminating markers to indicate the boundary (kTypeEndPrepareXID,
+  // kTypeCommitXID, kTypeRollbackXID) 2) Terminate a batch with kTypeNoop in
+  // the absence of a natural boundary marker.
+  void MaybeAdvanceSeq(bool batch_boundary = false) {
+    if (batch_boundary == seq_per_batch_) {
+      sequence_++;
+    }
+  }
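+  // Worked example of the rule above for a batch of three keys:
+  //   seq_per_batch_ == false: the per-key calls MaybeAdvanceSeq(false)
+  //     advance sequence_ three times, once per key;
+  //   seq_per_batch_ == true: the per-key calls are no-ops and only the
+  //     boundary call MaybeAdvanceSeq(true) advances sequence_, once for
+  //     the whole batch.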
+  void set_log_number_ref(uint64_t log) { log_number_ref_ = log; }
+  void set_prot_info(const WriteBatch::ProtectionInfo* prot_info) {
+    prot_info_ = prot_info;
+    prot_info_idx_ = 0;
+  }
+
+  SequenceNumber sequence() const { return sequence_; }
+
+  void PostProcess() {
+    assert(concurrent_memtable_writes_);
+    // If post info was not created there is nothing
+    // to process and no need to create on demand
+    if (post_info_created_) {
+      for (auto& pair : GetPostMap()) {
+        pair.first->BatchPostProcess(pair.second);
+      }
+    }
+  }
+
+  bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
+    // If we are in a concurrent mode, it is the caller's responsibility
+    // to clone the original ColumnFamilyMemTables so that each thread
+    // has its own instance. Otherwise, it must be guaranteed that there
+    // is no concurrent access
+    bool found = cf_mems_->Seek(column_family_id);
+    if (!found) {
+      if (ignore_missing_column_families_) {
+        *s = Status::OK();
+      } else {
+        *s = Status::InvalidArgument(
+            "Invalid column family specified in write batch");
+      }
+      return false;
+    }
+    if (recovering_log_number_ != 0 &&
+        recovering_log_number_ < cf_mems_->GetLogNumber()) {
+      // This is true only in recovery environment (recovering_log_number_ is
+      // always 0 in non-recovery, regular write code-path)
+      // * If recovering_log_number_ < cf_mems_->GetLogNumber(), this means
+      // that column family already contains updates from this log.
We can't apply + // updates twice because of update-in-place or merge workloads -- ignore + // the update + *s = Status::OK(); + return false; + } + + if (has_valid_writes_ != nullptr) { + *has_valid_writes_ = true; + } + + if (log_number_ref_ > 0) { + cf_mems_->GetMemTable()->RefLogContainingPrepSection(log_number_ref_); + } + + return true; + } + + Status PutCFImpl(uint32_t column_family_id, const Slice& key, + const Slice& value, ValueType value_type, + const ProtectionInfoKVOS64* kv_prot_info) { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + return WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, + value); + // else insert the values to the memtable right away + } + + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id, + key, value); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); + } + return ret_status; + } + assert(ret_status.ok()); + + MemTable* mem = cf_mems_->GetMemTable(); + auto* moptions = mem->GetImmutableMemTableOptions(); + // inplace_update_support is inconsistent with snapshots, and therefore with + // any kind of transactions including the ones that use seq_per_batch + assert(!seq_per_batch_ || !moptions->inplace_update_support); + if (!moptions->inplace_update_support) { + ret_status = + mem->Add(sequence_, value_type, key, value, kv_prot_info, + concurrent_memtable_writes_, get_post_process_info(mem), + hint_per_batch_ ? &GetHintMap()[mem] : nullptr); + } else if (moptions->inplace_callback == nullptr || + value_type != kTypeValue) { + assert(!concurrent_memtable_writes_); + ret_status = mem->Update(sequence_, value_type, key, value, kv_prot_info); + } else { + assert(!concurrent_memtable_writes_); + assert(value_type == kTypeValue); + ret_status = mem->UpdateCallback(sequence_, key, value, kv_prot_info); + if (ret_status.IsNotFound()) { + // key not found in memtable. Do sst get, update, add + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + // TODO: plumb Env::IOActivity + ReadOptions ropts; + // it's going to be overwritten for sure, so no point caching data block + // containing the old version + ropts.fill_cache = false; + ropts.snapshot = &read_from_snapshot; + + std::string prev_value; + std::string merged_value; + + auto cf_handle = cf_mems_->GetColumnFamilyHandle(); + Status get_status = Status::NotSupported(); + if (db_ != nullptr && recovering_log_number_ == 0) { + if (cf_handle == nullptr) { + cf_handle = db_->DefaultColumnFamily(); + } + // TODO (yanqin): fix when user-defined timestamp is enabled. + get_status = db_->Get(ropts, cf_handle, key, &prev_value); + } + // Intentionally overwrites the `NotFound` in `ret_status`. 
+      if (!get_status.ok() && !get_status.IsNotFound()) {
+        ret_status = get_status;
+      } else {
+        ret_status = Status::OK();
+      }
+      if (ret_status.ok()) {
+        UpdateStatus update_status;
+        char* prev_buffer = const_cast<char*>(prev_value.c_str());
+        uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+        if (get_status.ok()) {
+          update_status = moptions->inplace_callback(prev_buffer, &prev_size,
+                                                     value, &merged_value);
+        } else {
+          update_status = moptions->inplace_callback(
+              nullptr /* existing_value */, nullptr /* existing_value_size */,
+              value, &merged_value);
+        }
+        if (update_status == UpdateStatus::UPDATED_INPLACE) {
+          assert(get_status.ok());
+          if (kv_prot_info != nullptr) {
+            ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+            updated_kv_prot_info.UpdateV(value,
+                                         Slice(prev_buffer, prev_size));
+            // prev_value is updated in-place with final value.
+            ret_status = mem->Add(sequence_, value_type, key,
+                                  Slice(prev_buffer, prev_size),
+                                  &updated_kv_prot_info);
+          } else {
+            ret_status = mem->Add(sequence_, value_type, key,
+                                  Slice(prev_buffer, prev_size),
+                                  nullptr /* kv_prot_info */);
+          }
+          if (ret_status.ok()) {
+            RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+          }
+        } else if (update_status == UpdateStatus::UPDATED) {
+          if (kv_prot_info != nullptr) {
+            ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+            updated_kv_prot_info.UpdateV(value, merged_value);
+            // merged_value contains the final value.
+            ret_status = mem->Add(sequence_, value_type, key,
+                                  Slice(merged_value), &updated_kv_prot_info);
+          } else {
+            // merged_value contains the final value.
+            ret_status =
+                mem->Add(sequence_, value_type, key, Slice(merged_value),
+                         nullptr /* kv_prot_info */);
+          }
+          if (ret_status.ok()) {
+            RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+          }
+        }
+      }
+    }
+  }
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      assert(seq_per_batch_);
+      const bool kBatchBoundary = true;
+      MaybeAdvanceSeq(kBatchBoundary);
+    } else if (ret_status.ok()) {
+      MaybeAdvanceSeq();
+      CheckMemtableFull();
+    }
+    // optimize for non-recovery mode
+    // If `ret_status` is `TryAgain` then the next (successful) try will add
+    // the key to the rebuilding transaction object. If `ret_status` is
+    // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+    // away. So we only need to add to it when `ret_status.ok()`.
+    if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+      assert(!write_after_commit_);
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id,
+                                           key, value);
+    }
+    return ret_status;
+  }
+
+  Status PutCF(uint32_t column_family_id, const Slice& key,
+               const Slice& value) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+    Status ret_status;
+    if (kv_prot_info != nullptr) {
+      // Memtable needs seqno, doesn't need CF ID
+      auto mem_kv_prot_info =
+          kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+      ret_status = PutCFImpl(column_family_id, key, value, kTypeValue,
+                             &mem_kv_prot_info);
+    } else {
+      ret_status = PutCFImpl(column_family_id, key, value, kTypeValue,
+                             nullptr /* kv_prot_info */);
+    }
+    // TODO: this assumes that if TryAgain status is returned to the caller,
+    // the operation is actually tried again.
The proper way to do this is to + // pass a `try_again` parameter to the operation itself and decrement + // prot_info_idx_ based on that + if (UNLIKELY(ret_status.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + return ret_status; + } + + Status PutEntityCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + const auto* kv_prot_info = NextProtectionInfo(); + + Status s; + if (kv_prot_info) { + // Memtable needs seqno, doesn't need CF ID + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + s = PutCFImpl(column_family_id, key, value, kTypeWideColumnEntity, + &mem_kv_prot_info); + } else { + s = PutCFImpl(column_family_id, key, value, kTypeWideColumnEntity, + /* kv_prot_info */ nullptr); + } + + if (UNLIKELY(s.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + + return s; + } + + Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key, + const Slice& value, ValueType delete_type, + const ProtectionInfoKVOS64* kv_prot_info) { + Status ret_status; + MemTable* mem = cf_mems_->GetMemTable(); + ret_status = + mem->Add(sequence_, delete_type, key, value, kv_prot_info, + concurrent_memtable_writes_, get_post_process_info(mem), + hint_per_batch_ ? &GetHintMap()[mem] : nullptr); + if (UNLIKELY(ret_status.IsTryAgain())) { + assert(seq_per_batch_); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); + } + return ret_status; + } + + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + const auto* kv_prot_info = NextProtectionInfo(); + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + return WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + // else insert the values to the memtable right away + } + + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); + } + if (UNLIKELY(ret_status.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + return ret_status; + } + + ColumnFamilyData* cfd = cf_mems_->current(); + assert(!cfd || cfd->user_comparator()); + const size_t ts_sz = (cfd && cfd->user_comparator()) + ? cfd->user_comparator()->timestamp_size() + : 0; + const ValueType delete_type = + (0 == ts_sz) ? kTypeDeletion : kTypeDeletionWithTimestamp; + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + mem_kv_prot_info.UpdateO(kTypeDeletion, delete_type); + ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type, + &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type, + nullptr /* kv_prot_info */); + } + // optimize for non-recovery mode + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. 
If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + } + if (UNLIKELY(ret_status.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + return ret_status; + } + + Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { + const auto* kv_prot_info = NextProtectionInfo(); + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + return WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, + key); + // else insert the values to the memtable right away + } + + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_, + column_family_id, key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); + } + if (UNLIKELY(ret_status.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + return ret_status; + } + assert(ret_status.ok()); + + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = DeleteImpl(column_family_id, key, Slice(), + kTypeSingleDeletion, &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, key, Slice(), + kTypeSingleDeletion, nullptr /* kv_prot_info */); + } + // optimize for non-recovery mode + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_, + column_family_id, key); + } + if (UNLIKELY(ret_status.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + return ret_status; + } + + Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, + const Slice& end_key) override { + const auto* kv_prot_info = NextProtectionInfo(); + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + return WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, + begin_key, end_key); + // else insert the values to the memtable right away + } + + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. 
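+      // (Note on the non-recovery path further below: before writing the
+      //  range tombstone, DeleteRange validates the endpoints, e.g.
+      //    batch.DeleteRange(cfh, "b", "a");  // InvalidArgument
+      //    batch.DeleteRange(cfh, "a", "a");  // empty range: OK, not applied
+      //  and it returns NotSupported for table types without range-deletion
+      //  support.)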
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::DeleteRange( + rebuilding_trx_, column_family_id, begin_key, end_key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, begin_key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); + } + if (UNLIKELY(ret_status.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + return ret_status; + } + assert(ret_status.ok()); + + if (db_ != nullptr) { + auto cf_handle = cf_mems_->GetColumnFamilyHandle(); + if (cf_handle == nullptr) { + cf_handle = db_->DefaultColumnFamily(); + } + auto* cfd = + static_cast_with_check(cf_handle)->cfd(); + if (!cfd->is_delete_range_supported()) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); + return Status::NotSupported( + std::string("DeleteRange not supported for table type ") + + cfd->ioptions()->table_factory->Name() + " in CF " + + cfd->GetName()); + } + int cmp = + cfd->user_comparator()->CompareWithoutTimestamp(begin_key, end_key); + if (cmp > 0) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); + // It's an empty range where endpoints appear mistaken. Don't bother + // applying it to the DB, and return an error to the user. + return Status::InvalidArgument("end key comes before start key"); + } else if (cmp == 0) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); + // It's an empty range. Don't bother applying it to the DB. + return Status::OK(); + } + } + + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = DeleteImpl(column_family_id, begin_key, end_key, + kTypeRangeDeletion, &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, begin_key, end_key, + kTypeRangeDeletion, nullptr /* kv_prot_info */); + } + // optimize for non-recovery mode + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::DeleteRange( + rebuilding_trx_, column_family_id, begin_key, end_key); + } + if (UNLIKELY(ret_status.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + return ret_status; + } + + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + const auto* kv_prot_info = NextProtectionInfo(); + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + return WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, + value); + // else insert the values to the memtable right away + } + + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. 
+ ret_status = WriteBatchInternal::Merge(rebuilding_trx_, + column_family_id, key, value); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); + } + if (UNLIKELY(ret_status.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + return ret_status; + } + assert(ret_status.ok()); + + MemTable* mem = cf_mems_->GetMemTable(); + auto* moptions = mem->GetImmutableMemTableOptions(); + if (moptions->merge_operator == nullptr) { + return Status::InvalidArgument( + "Merge requires `ColumnFamilyOptions::merge_operator != nullptr`"); + } + bool perform_merge = false; + assert(!concurrent_memtable_writes_ || + moptions->max_successive_merges == 0); + + // If we pass DB through and options.max_successive_merges is hit + // during recovery, Get() will be issued which will try to acquire + // DB mutex and cause deadlock, as DB mutex is already held. + // So we disable merge in recovery + if (moptions->max_successive_merges > 0 && db_ != nullptr && + recovering_log_number_ == 0) { + assert(!concurrent_memtable_writes_); + LookupKey lkey(key, sequence_); + + // Count the number of successive merges at the head + // of the key in the memtable + size_t num_merges = mem->CountSuccessiveMergeEntries(lkey); + + if (num_merges >= moptions->max_successive_merges) { + perform_merge = true; + } + } + + if (perform_merge) { + // 1) Get the existing value + std::string get_value; + + // Pass in the sequence number so that we also include previous merge + // operations in the same batch. + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + // TODO: plumb Env::IOActivity + ReadOptions read_options; + read_options.snapshot = &read_from_snapshot; + + auto cf_handle = cf_mems_->GetColumnFamilyHandle(); + if (cf_handle == nullptr) { + cf_handle = db_->DefaultColumnFamily(); + } + Status get_status = db_->Get(read_options, cf_handle, key, &get_value); + if (!get_status.ok()) { + // Failed to read a key we know exists. Store the delta in memtable. + perform_merge = false; + } else { + Slice get_value_slice = Slice(get_value); + + // 2) Apply this merge + auto merge_operator = moptions->merge_operator; + assert(merge_operator); + + std::string new_value; + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its value. + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator, key, &get_value_slice, {value}, &new_value, + moptions->info_log, moptions->statistics, + SystemClock::Default().get(), /* result_operand */ nullptr, + /* update_num_ops_stats */ false, + /* op_failure_scope */ nullptr); + + if (!merge_status.ok()) { + // Failed to merge! 
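+        // (Context, not upstream code: this eager-merge path is reached only
+        //  when ColumnFamilyOptions::max_successive_merges is set and the key
+        //  already has that many consecutive merge operands in the memtable,
+        //  e.g. with
+        //    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+        //    options.max_successive_merges = 8;
+        //  On success the operands are folded into a single kTypeValue entry
+        //  below; on failure the operand is stored as a plain merge record
+        //  instead.)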
+ // Store the delta in memtable + perform_merge = false; + } else { + // 3) Add value to memtable + assert(!concurrent_memtable_writes_); + if (kv_prot_info != nullptr) { + auto merged_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + merged_kv_prot_info.UpdateV(value, new_value); + merged_kv_prot_info.UpdateO(kTypeMerge, kTypeValue); + ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + &merged_kv_prot_info); + } else { + ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + nullptr /* kv_prot_info */); + } + } + } + } + + if (!perform_merge) { + assert(ret_status.ok()); + // Add merge operand to memtable + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = + mem->Add(sequence_, kTypeMerge, key, value, &mem_kv_prot_info, + concurrent_memtable_writes_, get_post_process_info(mem)); + } else { + ret_status = mem->Add( + sequence_, kTypeMerge, key, value, nullptr /* kv_prot_info */, + concurrent_memtable_writes_, get_post_process_info(mem)); + } + } + + if (UNLIKELY(ret_status.IsTryAgain())) { + assert(seq_per_batch_); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); + } + // optimize for non-recovery mode + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, + key, value); + } + if (UNLIKELY(ret_status.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + return ret_status; + } + + Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + const auto* kv_prot_info = NextProtectionInfo(); + Status ret_status; + if (kv_prot_info != nullptr) { + // Memtable needs seqno, doesn't need CF ID + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + // Same as PutCF except for value type. 
+ ret_status = PutCFImpl(column_family_id, key, value, kTypeBlobIndex, + &mem_kv_prot_info); + } else { + ret_status = PutCFImpl(column_family_id, key, value, kTypeBlobIndex, + nullptr /* kv_prot_info */); + } + if (UNLIKELY(ret_status.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + return ret_status; + } + + void CheckMemtableFull() { + if (flush_scheduler_ != nullptr) { + auto* cfd = cf_mems_->current(); + assert(cfd != nullptr); + if (cfd->mem()->ShouldScheduleFlush() && + cfd->mem()->MarkFlushScheduled()) { + // MarkFlushScheduled only returns true if we are the one that + // should take action, so no need to dedup further + flush_scheduler_->ScheduleWork(cfd); + } + } + // check if memtable_list size exceeds max_write_buffer_size_to_maintain + if (trim_history_scheduler_ != nullptr) { + auto* cfd = cf_mems_->current(); + + assert(cfd); + assert(cfd->ioptions()); + + const size_t size_to_maintain = static_cast( + cfd->ioptions()->max_write_buffer_size_to_maintain); + + if (size_to_maintain > 0) { + MemTableList* const imm = cfd->imm(); + assert(imm); + + if (imm->HasHistory()) { + const MemTable* const mem = cfd->mem(); + assert(mem); + + if (mem->MemoryAllocatedBytes() + + imm->MemoryAllocatedBytesExcludingLast() >= + size_to_maintain && + imm->MarkTrimHistoryNeeded()) { + trim_history_scheduler_->ScheduleWork(cfd); + } + } + } + } + } + + // The write batch handler calls MarkBeginPrepare with unprepare set to true + // if it encounters the kTypeBeginUnprepareXID marker. + Status MarkBeginPrepare(bool unprepare) override { + assert(rebuilding_trx_ == nullptr); + assert(db_); + + if (recovering_log_number_ != 0) { + db_->mutex()->AssertHeld(); + // during recovery we rebuild a hollow transaction + // from all encountered prepare sections of the wal + if (db_->allow_2pc() == false) { + return Status::NotSupported( + "WAL contains prepared transactions. Open with " + "TransactionDB::Open()."); + } + + // we are now iterating through a prepared section + rebuilding_trx_ = new WriteBatch(); + rebuilding_trx_seq_ = sequence_; + // Verify that we have matching MarkBeginPrepare/MarkEndPrepare markers. + // unprepared_batch_ should be false because it is false by default, and + // gets reset to false in MarkEndPrepare. + assert(!unprepared_batch_); + unprepared_batch_ = unprepare; + + if (has_valid_writes_ != nullptr) { + *has_valid_writes_ = true; + } + } + + return Status::OK(); + } + + Status MarkEndPrepare(const Slice& name) override { + assert(db_); + assert((rebuilding_trx_ != nullptr) == (recovering_log_number_ != 0)); + + if (recovering_log_number_ != 0) { + db_->mutex()->AssertHeld(); + assert(db_->allow_2pc()); + size_t batch_cnt = + write_after_commit_ + ? 0 // 0 will disable further checks + : static_cast(sequence_ - rebuilding_trx_seq_ + 1); + db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(), + rebuilding_trx_, rebuilding_trx_seq_, + batch_cnt, unprepared_batch_); + unprepared_batch_ = false; + rebuilding_trx_ = nullptr; + } else { + assert(rebuilding_trx_ == nullptr); + } + const bool batch_boundry = true; + MaybeAdvanceSeq(batch_boundry); + + return Status::OK(); + } + + Status MarkNoop(bool empty_batch) override { + if (recovering_log_number_ != 0) { + db_->mutex()->AssertHeld(); + } + // A hack in pessimistic transaction could result into a noop at the start + // of the write batch, that should be ignored. + if (!empty_batch) { + // In the absence of Prepare markers, a kTypeNoop tag indicates the end of + // a batch. 
This happens when write batch commits skipping the prepare + // phase. + const bool batch_boundry = true; + MaybeAdvanceSeq(batch_boundry); + } + return Status::OK(); + } + + Status MarkCommit(const Slice& name) override { + assert(db_); + + Status s; + + if (recovering_log_number_ != 0) { + // We must hold db mutex in recovery. + db_->mutex()->AssertHeld(); + // in recovery when we encounter a commit marker + // we lookup this transaction in our set of rebuilt transactions + // and commit. + auto trx = db_->GetRecoveredTransaction(name.ToString()); + + // the log containing the prepared section may have + // been released in the last incarnation because the + // data was flushed to L0 + if (trx != nullptr) { + // at this point individual CF lognumbers will prevent + // duplicate re-insertion of values. + assert(log_number_ref_ == 0); + if (write_after_commit_) { + // write_after_commit_ can only have one batch in trx. + assert(trx->batches_.size() == 1); + const auto& batch_info = trx->batches_.begin()->second; + // all inserts must reference this trx log number + log_number_ref_ = batch_info.log_number_; + ResetProtectionInfo(); + s = batch_info.batch_->Iterate(this); + log_number_ref_ = 0; + } + // else the values are already inserted before the commit + + if (s.ok()) { + db_->DeleteRecoveredTransaction(name.ToString()); + } + if (has_valid_writes_ != nullptr) { + *has_valid_writes_ = true; + } + } + } else { + // When writes are not delayed until commit, there is no disconnect + // between a memtable write and the WAL that supports it. So the commit + // need not reference any log as the only log to which it depends. + assert(!write_after_commit_ || log_number_ref_ > 0); + } + const bool batch_boundry = true; + MaybeAdvanceSeq(batch_boundry); + + if (UNLIKELY(s.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + + return s; + } + + Status MarkCommitWithTimestamp(const Slice& name, + const Slice& commit_ts) override { + assert(db_); + + Status s; + + if (recovering_log_number_ != 0) { + // In recovery, db mutex must be held. + db_->mutex()->AssertHeld(); + // in recovery when we encounter a commit marker + // we lookup this transaction in our set of rebuilt transactions + // and commit. + auto trx = db_->GetRecoveredTransaction(name.ToString()); + // the log containing the prepared section may have + // been released in the last incarnation because the + // data was flushed to L0 + if (trx) { + // at this point individual CF lognumbers will prevent + // duplicate re-insertion of values. + assert(0 == log_number_ref_); + if (write_after_commit_) { + // write_after_commit_ can only have one batch in trx. 
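+        // (Recovery-path sketch: in the WAL a two-phase transaction appears
+        //  roughly as
+        //    kTypeBeginPrepareXID, <puts/deletes/merges>,
+        //    kTypeEndPrepareXID(xid), ..., kTypeCommitXID(xid)
+        //  MarkBeginPrepare()/MarkEndPrepare() stashed the prepared section
+        //  in rebuilding_trx_; it is replayed into the memtables below only
+        //  once this commit marker is seen.)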
+ assert(trx->batches_.size() == 1); + const auto& batch_info = trx->batches_.begin()->second; + // all inserts must reference this trx log number + log_number_ref_ = batch_info.log_number_; + + s = batch_info.batch_->UpdateTimestamps( + commit_ts, [this](uint32_t cf) { + assert(db_); + VersionSet* const vset = db_->GetVersionSet(); + assert(vset); + ColumnFamilySet* const cf_set = vset->GetColumnFamilySet(); + assert(cf_set); + ColumnFamilyData* cfd = cf_set->GetColumnFamily(cf); + assert(cfd); + const auto* const ucmp = cfd->user_comparator(); + assert(ucmp); + return ucmp->timestamp_size(); + }); + if (s.ok()) { + ResetProtectionInfo(); + s = batch_info.batch_->Iterate(this); + log_number_ref_ = 0; + } + } + // else the values are already inserted before the commit + + if (s.ok()) { + db_->DeleteRecoveredTransaction(name.ToString()); + } + if (has_valid_writes_) { + *has_valid_writes_ = true; + } + } + } else { + // When writes are not delayed until commit, there is no connection + // between a memtable write and the WAL that supports it. So the commit + // need not reference any log as the only log to which it depends. + assert(!write_after_commit_ || log_number_ref_ > 0); + } + constexpr bool batch_boundary = true; + MaybeAdvanceSeq(batch_boundary); + + if (UNLIKELY(s.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + + return s; + } + + Status MarkRollback(const Slice& name) override { + assert(db_); + + if (recovering_log_number_ != 0) { + auto trx = db_->GetRecoveredTransaction(name.ToString()); + + // the log containing the transactions prep section + // may have been released in the previous incarnation + // because we knew it had been rolled back + if (trx != nullptr) { + db_->DeleteRecoveredTransaction(name.ToString()); + } + } else { + // in non recovery we simply ignore this tag + } + + const bool batch_boundry = true; + MaybeAdvanceSeq(batch_boundry); + + return Status::OK(); + } + + private: + MemTablePostProcessInfo* get_post_process_info(MemTable* mem) { + if (!concurrent_memtable_writes_) { + // No need to batch counters locally if we don't use concurrent mode. + return nullptr; + } + return &GetPostMap()[mem]; + } +}; + +} // anonymous namespace + +// This function can only be called in these conditions: +// 1) During Recovery() +// 2) During Write(), in a single-threaded write thread +// 3) During Write(), in a concurrent context where memtables has been cloned +// The reason is that it calls memtables->Seek(), which has a stateful cache +Status WriteBatchInternal::InsertInto( + WriteThread::WriteGroup& write_group, SequenceNumber sequence, + ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, + bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db, + bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) { + MemTableInserter inserter( + sequence, memtables, flush_scheduler, trim_history_scheduler, + ignore_missing_column_families, recovery_log_number, db, + concurrent_memtable_writes, nullptr /* prot_info */, + nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn); + for (auto w : write_group) { + if (w->CallbackFailed()) { + continue; + } + w->sequence = inserter.sequence(); + if (!w->ShouldWriteToMemtable()) { + // In seq_per_batch_ mode this advances the seq by one. 
+ inserter.MaybeAdvanceSeq(true); + continue; + } + SetSequence(w->batch, inserter.sequence()); + inserter.set_log_number_ref(w->log_ref); + inserter.set_prot_info(w->batch->prot_info_.get()); + w->status = w->batch->Iterate(&inserter); + if (!w->status.ok()) { + return w->status; + } + assert(!seq_per_batch || w->batch_cnt != 0); + assert(!seq_per_batch || inserter.sequence() - w->sequence == w->batch_cnt); + } + return Status::OK(); +} + +Status WriteBatchInternal::InsertInto( + WriteThread::Writer* writer, SequenceNumber sequence, + ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, + bool ignore_missing_column_families, uint64_t log_number, DB* db, + bool concurrent_memtable_writes, bool seq_per_batch, size_t batch_cnt, + bool batch_per_txn, bool hint_per_batch) { +#ifdef NDEBUG + (void)batch_cnt; +#endif + assert(writer->ShouldWriteToMemtable()); + MemTableInserter inserter(sequence, memtables, flush_scheduler, + trim_history_scheduler, + ignore_missing_column_families, log_number, db, + concurrent_memtable_writes, nullptr /* prot_info */, + nullptr /*has_valid_writes*/, seq_per_batch, + batch_per_txn, hint_per_batch); + SetSequence(writer->batch, sequence); + inserter.set_log_number_ref(writer->log_ref); + inserter.set_prot_info(writer->batch->prot_info_.get()); + Status s = writer->batch->Iterate(&inserter); + assert(!seq_per_batch || batch_cnt != 0); + assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt); + if (concurrent_memtable_writes) { + inserter.PostProcess(); + } + return s; +} + +Status WriteBatchInternal::InsertInto( + const WriteBatch* batch, ColumnFamilyMemTables* memtables, + FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, + bool ignore_missing_column_families, uint64_t log_number, DB* db, + bool concurrent_memtable_writes, SequenceNumber* next_seq, + bool* has_valid_writes, bool seq_per_batch, bool batch_per_txn) { + MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler, + trim_history_scheduler, + ignore_missing_column_families, log_number, db, + concurrent_memtable_writes, batch->prot_info_.get(), + has_valid_writes, seq_per_batch, batch_per_txn); + Status s = batch->Iterate(&inserter); + if (next_seq != nullptr) { + *next_seq = inserter.sequence(); + } + if (concurrent_memtable_writes) { + inserter.PostProcess(); + } + return s; +} + +namespace { + +// This class updates protection info for a WriteBatch. 
+class ProtectionInfoUpdater : public WriteBatch::Handler { + public: + explicit ProtectionInfoUpdater(WriteBatch::ProtectionInfo* prot_info) + : prot_info_(prot_info) {} + + ~ProtectionInfoUpdater() override {} + + Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override { + return UpdateProtInfo(cf, key, val, kTypeValue); + } + + Status PutEntityCF(uint32_t cf, const Slice& key, + const Slice& entity) override { + return UpdateProtInfo(cf, key, entity, kTypeWideColumnEntity); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return UpdateProtInfo(cf, key, "", kTypeDeletion); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return UpdateProtInfo(cf, key, "", kTypeSingleDeletion); + } + + Status DeleteRangeCF(uint32_t cf, const Slice& begin_key, + const Slice& end_key) override { + return UpdateProtInfo(cf, begin_key, end_key, kTypeRangeDeletion); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override { + return UpdateProtInfo(cf, key, val, kTypeMerge); + } + + Status PutBlobIndexCF(uint32_t cf, const Slice& key, + const Slice& val) override { + return UpdateProtInfo(cf, key, val, kTypeBlobIndex); + } + + Status MarkBeginPrepare(bool /* unprepare */) override { + return Status::OK(); + } + + Status MarkEndPrepare(const Slice& /* xid */) override { + return Status::OK(); + } + + Status MarkCommit(const Slice& /* xid */) override { return Status::OK(); } + + Status MarkCommitWithTimestamp(const Slice& /* xid */, + const Slice& /* ts */) override { + return Status::OK(); + } + + Status MarkRollback(const Slice& /* xid */) override { return Status::OK(); } + + Status MarkNoop(bool /* empty_batch */) override { return Status::OK(); } + + private: + Status UpdateProtInfo(uint32_t cf, const Slice& key, const Slice& val, + const ValueType op_type) { + if (prot_info_) { + prot_info_->entries_.emplace_back( + ProtectionInfo64().ProtectKVO(key, val, op_type).ProtectC(cf)); + } + return Status::OK(); + } + + // No copy or move. 
+ ProtectionInfoUpdater(const ProtectionInfoUpdater&) = delete; + ProtectionInfoUpdater(ProtectionInfoUpdater&&) = delete; + ProtectionInfoUpdater& operator=(const ProtectionInfoUpdater&) = delete; + ProtectionInfoUpdater& operator=(ProtectionInfoUpdater&&) = delete; + + WriteBatch::ProtectionInfo* const prot_info_ = nullptr; +}; + +} // anonymous namespace + +Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { + assert(contents.size() >= WriteBatchInternal::kHeader); + assert(b->prot_info_ == nullptr); + + b->rep_.assign(contents.data(), contents.size()); + b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed); + return Status::OK(); +} + +Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src, + const bool wal_only) { + assert(dst->Count() == 0 || + (dst->prot_info_ == nullptr) == (src->prot_info_ == nullptr)); + if ((src->prot_info_ != nullptr && + src->prot_info_->entries_.size() != src->Count()) || + (dst->prot_info_ != nullptr && + dst->prot_info_->entries_.size() != dst->Count())) { + return Status::Corruption( + "Write batch has inconsistent count and number of checksums"); + } + + size_t src_len; + int src_count; + uint32_t src_flags; + + const SavePoint& batch_end = src->GetWalTerminationPoint(); + + if (wal_only && !batch_end.is_cleared()) { + src_len = batch_end.size - WriteBatchInternal::kHeader; + src_count = batch_end.count; + src_flags = batch_end.content_flags; + } else { + src_len = src->rep_.size() - WriteBatchInternal::kHeader; + src_count = Count(src); + src_flags = src->content_flags_.load(std::memory_order_relaxed); + } + + if (src->prot_info_ != nullptr) { + if (dst->prot_info_ == nullptr) { + dst->prot_info_.reset(new WriteBatch::ProtectionInfo()); + } + std::copy(src->prot_info_->entries_.begin(), + src->prot_info_->entries_.begin() + src_count, + std::back_inserter(dst->prot_info_->entries_)); + } else if (dst->prot_info_ != nullptr) { + // dst has empty prot_info->entries + // In this special case, we allow write batch without prot_info to + // be appende to write batch with empty prot_info + dst->prot_info_ = nullptr; + } + SetCount(dst, Count(dst) + src_count); + assert(src->rep_.size() >= WriteBatchInternal::kHeader); + dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len); + dst->content_flags_.store( + dst->content_flags_.load(std::memory_order_relaxed) | src_flags, + std::memory_order_relaxed); + return Status::OK(); +} + +size_t WriteBatchInternal::AppendedByteSize(size_t leftByteSize, + size_t rightByteSize) { + if (leftByteSize == 0 || rightByteSize == 0) { + return leftByteSize + rightByteSize; + } else { + return leftByteSize + rightByteSize - WriteBatchInternal::kHeader; + } +} + +Status WriteBatchInternal::UpdateProtectionInfo(WriteBatch* wb, + size_t bytes_per_key, + uint64_t* checksum) { + if (bytes_per_key == 0) { + if (wb->prot_info_ != nullptr) { + wb->prot_info_.reset(); + return Status::OK(); + } else { + // Already not protected. + return Status::OK(); + } + } else if (bytes_per_key == 8) { + if (wb->prot_info_ == nullptr) { + wb->prot_info_.reset(new WriteBatch::ProtectionInfo()); + ProtectionInfoUpdater prot_info_updater(wb->prot_info_.get()); + Status s = wb->Iterate(&prot_info_updater); + if (s.ok() && checksum != nullptr) { + uint64_t expected_hash = XXH3_64bits(wb->rep_.data(), wb->rep_.size()); + if (expected_hash != *checksum) { + return Status::Corruption("Write batch content corrupted."); + } + } + return s; + } else { + // Already protected. 
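+      // (Usage sketch, an assumption rather than upstream code: a caller
+      //  that tracked a whole-batch hash can have it re-checked while the
+      //  per-key protection info is built in the branch above, e.g.
+      //    uint64_t sum = XXH3_64bits(rep.data(), rep.size());
+      //    ...
+      //    WriteBatchInternal::UpdateProtectionInfo(&wb, 8, &sum);
+      //  Passing nullptr for the checksum skips that verification.)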
+ return Status::OK(); + } + } + return Status::NotSupported( + "WriteBatch protection info must be zero or eight bytes/key"); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch_base.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch_base.cc new file mode 100644 index 0000000000..e4c0e74bd6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch_base.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/write_batch_base.h" + +#include + +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// Simple implementation of SlicePart variants of Put(). Child classes +// can override these method with more performant solutions if they choose. +Status WriteBatchBase::Put(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value) { + std::string key_buf, value_buf; + Slice key_slice(key, &key_buf); + Slice value_slice(value, &value_buf); + + return Put(column_family, key_slice, value_slice); +} + +Status WriteBatchBase::Put(const SliceParts& key, const SliceParts& value) { + std::string key_buf, value_buf; + Slice key_slice(key, &key_buf); + Slice value_slice(value, &value_buf); + + return Put(key_slice, value_slice); +} + +Status WriteBatchBase::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) { + std::string key_buf; + Slice key_slice(key, &key_buf); + return Delete(column_family, key_slice); +} + +Status WriteBatchBase::Delete(const SliceParts& key) { + std::string key_buf; + Slice key_slice(key, &key_buf); + return Delete(key_slice); +} + +Status WriteBatchBase::SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key) { + std::string key_buf; + Slice key_slice(key, &key_buf); + return SingleDelete(column_family, key_slice); +} + +Status WriteBatchBase::SingleDelete(const SliceParts& key) { + std::string key_buf; + Slice key_slice(key, &key_buf); + return SingleDelete(key_slice); +} + +Status WriteBatchBase::DeleteRange(ColumnFamilyHandle* column_family, + const SliceParts& begin_key, + const SliceParts& end_key) { + std::string begin_key_buf, end_key_buf; + Slice begin_key_slice(begin_key, &begin_key_buf); + Slice end_key_slice(end_key, &end_key_buf); + return DeleteRange(column_family, begin_key_slice, end_key_slice); +} + +Status WriteBatchBase::DeleteRange(const SliceParts& begin_key, + const SliceParts& end_key) { + std::string begin_key_buf, end_key_buf; + Slice begin_key_slice(begin_key, &begin_key_buf); + Slice end_key_slice(end_key, &end_key_buf); + return DeleteRange(begin_key_slice, end_key_slice); +} + +Status WriteBatchBase::Merge(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value) { + std::string key_buf, value_buf; + Slice key_slice(key, &key_buf); + Slice value_slice(value, &value_buf); + + return Merge(column_family, key_slice, value_slice); +} + +Status WriteBatchBase::Merge(const SliceParts& key, const SliceParts& value) { + std::string key_buf, value_buf; + Slice key_slice(key, &key_buf); + Slice value_slice(value, &value_buf); + + return Merge(key_slice, value_slice); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch_internal.h 
b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch_internal.h new file mode 100644 index 0000000000..1be0bd1400 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_batch_internal.h @@ -0,0 +1,401 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include "db/flush_scheduler.h" +#include "db/kv_checksum.h" +#include "db/trim_history_scheduler.h" +#include "db/write_thread.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" +#include "util/autovector.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + +class MemTable; +class FlushScheduler; +class ColumnFamilyData; + +class ColumnFamilyMemTables { + public: + virtual ~ColumnFamilyMemTables() {} + virtual bool Seek(uint32_t column_family_id) = 0; + // returns true if the update to memtable should be ignored + // (useful when recovering from log whose updates have already + // been processed) + virtual uint64_t GetLogNumber() const = 0; + virtual MemTable* GetMemTable() const = 0; + virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0; + virtual ColumnFamilyData* current() { return nullptr; } +}; + +class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { + public: + explicit ColumnFamilyMemTablesDefault(MemTable* mem) + : ok_(false), mem_(mem) {} + + bool Seek(uint32_t column_family_id) override { + ok_ = (column_family_id == 0); + return ok_; + } + + uint64_t GetLogNumber() const override { return 0; } + + MemTable* GetMemTable() const override { + assert(ok_); + return mem_; + } + + ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; } + + private: + bool ok_; + MemTable* mem_; +}; + +struct WriteBatch::ProtectionInfo { + // `WriteBatch` usually doesn't contain a huge number of keys so protecting + // with a fixed, non-configurable eight bytes per key may work well enough. + autovector entries_; + + size_t GetBytesPerKey() const { return 8; } +}; + +// WriteBatchInternal provides static methods for manipulating a +// WriteBatch that we don't want in the public WriteBatch interface. +class WriteBatchInternal { + public: + // WriteBatch header has an 8-byte sequence number followed by a 4-byte count. 
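+  // Layout sketch of the serialized batch (the 12-byte header is declared
+  // just below; fixed-width integers are little-endian, and varstring is a
+  // varint32 length followed by that many bytes):
+  //
+  //   rep_ :=
+  //      sequence: fixed64
+  //      count: fixed32
+  //      data: record[count]
+  //   record :=
+  //      kTypeValue varstring varstring
+  //      kTypeDeletion varstring
+  //      kTypeColumnFamilyValue varint32 varstring varstring
+  //      ...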
+ static constexpr size_t kHeader = 12; + + // WriteBatch methods with column_family_id instead of ColumnFamilyHandle* + static Status Put(WriteBatch* batch, uint32_t column_family_id, + const Slice& key, const Slice& value); + + static Status Put(WriteBatch* batch, uint32_t column_family_id, + const SliceParts& key, const SliceParts& value); + + static Status PutEntity(WriteBatch* batch, uint32_t column_family_id, + const Slice& key, const WideColumns& columns); + + static Status Delete(WriteBatch* batch, uint32_t column_family_id, + const SliceParts& key); + + static Status Delete(WriteBatch* batch, uint32_t column_family_id, + const Slice& key); + + static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id, + const SliceParts& key); + + static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id, + const Slice& key); + + static Status DeleteRange(WriteBatch* b, uint32_t column_family_id, + const Slice& begin_key, const Slice& end_key); + + static Status DeleteRange(WriteBatch* b, uint32_t column_family_id, + const SliceParts& begin_key, + const SliceParts& end_key); + + static Status Merge(WriteBatch* batch, uint32_t column_family_id, + const Slice& key, const Slice& value); + + static Status Merge(WriteBatch* batch, uint32_t column_family_id, + const SliceParts& key, const SliceParts& value); + + static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id, + const Slice& key, const Slice& value); + + static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid, + const bool write_after_commit = true, + const bool unprepared_batch = false); + + static Status MarkRollback(WriteBatch* batch, const Slice& xid); + + static Status MarkCommit(WriteBatch* batch, const Slice& xid); + + static Status MarkCommitWithTimestamp(WriteBatch* batch, const Slice& xid, + const Slice& commit_ts); + + static Status InsertNoop(WriteBatch* batch); + + // Return the number of entries in the batch. + static uint32_t Count(const WriteBatch* batch); + + // Set the count for the number of entries in the batch. + static void SetCount(WriteBatch* batch, uint32_t n); + + // Return the sequence number for the start of this batch. + static SequenceNumber Sequence(const WriteBatch* batch); + + // Store the specified number as the sequence number for the start of + // this batch. + static void SetSequence(WriteBatch* batch, SequenceNumber seq); + + // Returns the offset of the first entry in the batch. + // This offset is only valid if the batch is not empty. + static size_t GetFirstOffset(WriteBatch* batch); + + static Slice Contents(const WriteBatch* batch) { return Slice(batch->rep_); } + + static size_t ByteSize(const WriteBatch* batch) { return batch->rep_.size(); } + + static Status SetContents(WriteBatch* batch, const Slice& contents); + + static Status CheckSlicePartsLength(const SliceParts& key, + const SliceParts& value); + + // Inserts batches[i] into memtable, for i in 0..num_batches-1 inclusive. + // + // If ignore_missing_column_families == true. WriteBatch + // referencing non-existing column family will be ignored. + // If ignore_missing_column_families == false, processing of the + // batches will be stopped if a reference is found to a non-existing + // column family and InvalidArgument() will be returned. The writes + // in batches may be only partially applied at that point. + // + // If log_number is non-zero, the memtable will be updated only if + // memtables->GetLogNumber() >= log_number. 
+  //
+  // If flush_scheduler is non-null, it will be invoked if the memtable
+  // should be flushed.
+  //
+  // Under concurrent use, the caller is responsible for making sure that
+  // the memtables object itself is thread-local.
+  static Status InsertInto(
+      WriteThread::WriteGroup& write_group, SequenceNumber sequence,
+      ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+      TrimHistoryScheduler* trim_history_scheduler,
+      bool ignore_missing_column_families = false, uint64_t log_number = 0,
+      DB* db = nullptr, bool concurrent_memtable_writes = false,
+      bool seq_per_batch = false, bool batch_per_txn = true);
+
+  // Convenience form of InsertInto when you have only one batch.
+  // next_seq returns the sequence number after the last sequence number
+  // used in the MemTable insert.
+  static Status InsertInto(
+      const WriteBatch* batch, ColumnFamilyMemTables* memtables,
+      FlushScheduler* flush_scheduler,
+      TrimHistoryScheduler* trim_history_scheduler,
+      bool ignore_missing_column_families = false, uint64_t log_number = 0,
+      DB* db = nullptr, bool concurrent_memtable_writes = false,
+      SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr,
+      bool seq_per_batch = false, bool batch_per_txn = true);
+
+  static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence,
+                           ColumnFamilyMemTables* memtables,
+                           FlushScheduler* flush_scheduler,
+                           TrimHistoryScheduler* trim_history_scheduler,
+                           bool ignore_missing_column_families = false,
+                           uint64_t log_number = 0, DB* db = nullptr,
+                           bool concurrent_memtable_writes = false,
+                           bool seq_per_batch = false, size_t batch_cnt = 0,
+                           bool batch_per_txn = true,
+                           bool hint_per_batch = false);
+
+  // Appends the src write batch to the dst write batch and updates the count
+  // in dst. Returns OK if the append is successful. Checks the number of
+  // checksums against the entry counts in the dst and src write batches, and
+  // returns Corruption if they are inconsistent.
+  static Status Append(WriteBatch* dst, const WriteBatch* src,
+                       const bool WAL_only = false);
+
+  // Returns the byte size of appending a WriteBatch with ByteSize
+  // leftByteSize and a WriteBatch with ByteSize rightByteSize
+  static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize);
+
+  // Iterate over the [begin, end) range of a write batch
+  static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler,
+                        size_t begin, size_t end);
+
+  // This write batch includes the latest state that should be persisted. Such
+  // state is meant to be used only during recovery.
+  static void SetAsLatestPersistentState(WriteBatch* b);
+  static bool IsLatestPersistentState(const WriteBatch* b);
+
+  static std::tuple GetColumnFamilyIdAndTimestampSize(
+      WriteBatch* b, ColumnFamilyHandle* column_family);
+
+  static bool TimestampsUpdateNeeded(const WriteBatch& wb) {
+    return wb.needs_in_place_update_ts_;
+  }
+
+  static bool HasKeyWithTimestamp(const WriteBatch& wb) {
+    return wb.has_key_with_ts_;
+  }
+
+  // Update per-key-value protection information on this write batch.
+  // If a checksum is provided, the batch content is verified against the
+  // checksum.
+  static Status UpdateProtectionInfo(WriteBatch* wb, size_t bytes_per_key,
+                                     uint64_t* checksum = nullptr);
+};
+
+// LocalSavePoint is similar to a scope guard
+class LocalSavePoint {
+ public:
+  explicit LocalSavePoint(WriteBatch* batch)
+      : batch_(batch),
+        savepoint_(batch->GetDataSize(), batch->Count(),
+                   batch->content_flags_.load(std::memory_order_relaxed))
+#ifndef NDEBUG
+        ,
+        committed_(false)
+#endif
+  {
+  }
+
+#ifndef NDEBUG
+  ~LocalSavePoint() { assert(committed_); }
+#endif
+  Status commit() {
+#ifndef NDEBUG
+    committed_ = true;
+#endif
+    if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) {
+      batch_->rep_.resize(savepoint_.size);
+      WriteBatchInternal::SetCount(batch_, savepoint_.count);
+      if (batch_->prot_info_ != nullptr) {
+        batch_->prot_info_->entries_.resize(savepoint_.count);
+      }
+      batch_->content_flags_.store(savepoint_.content_flags,
+                                   std::memory_order_relaxed);
+      return Status::MemoryLimit();
+    }
+    return Status::OK();
+  }
+
+ private:
+  WriteBatch* batch_;
+  SavePoint savepoint_;
+#ifndef NDEBUG
+  bool committed_;
+#endif
+};
+
+template <typename TimestampSizeFuncType>
+class TimestampUpdater : public WriteBatch::Handler {
+ public:
+  explicit TimestampUpdater(WriteBatch::ProtectionInfo* prot_info,
+                            TimestampSizeFuncType&& ts_sz_func, const Slice& ts)
+      : prot_info_(prot_info),
+        ts_sz_func_(std::move(ts_sz_func)),
+        timestamp_(ts) {
+    assert(!timestamp_.empty());
+  }
+
+  ~TimestampUpdater() override {}
+
+  Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status DeleteCF(uint32_t cf, const Slice& key) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
+                       const Slice& end_key) override {
+    Status s = UpdateTimestamp(cf, begin_key, true /* is_key */);
+    if (s.ok()) {
+      s = UpdateTimestamp(cf, end_key, false /* is_key */);
+    }
+    return s;
+  }
+
+  Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status PutBlobIndexCF(uint32_t cf, const Slice& key, const Slice&) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status MarkBeginPrepare(bool) override { return Status::OK(); }
+
+  Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+
+  Status MarkCommit(const Slice&) override { return Status::OK(); }
+
+  Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+    return Status::OK();
+  }
+
+  Status MarkRollback(const Slice&) override { return Status::OK(); }
+
+  Status MarkNoop(bool /*empty_batch*/) override { return Status::OK(); }
+
+ private:
+  // @param is_key specifies whether the update is for key or value.
+  Status UpdateTimestamp(uint32_t cf, const Slice& buf, bool is_key = true) {
+    Status s = UpdateTimestampImpl(cf, buf, idx_, is_key);
+    ++idx_;
+    return s;
+  }
+
+  Status UpdateTimestampImpl(uint32_t cf, const Slice& buf, size_t /*idx*/,
+                             bool is_key) {
+    if (timestamp_.empty()) {
+      return Status::InvalidArgument("Timestamp is empty");
+    }
+    size_t cf_ts_sz = ts_sz_func_(cf);
+    if (0 == cf_ts_sz) {
+      // Skip this column family.
+      return Status::OK();
+    } else if (std::numeric_limits<size_t>::max() == cf_ts_sz) {
+      // Column family timestamp info not found.
+      return Status::NotFound();
+    } else if (cf_ts_sz != timestamp_.size()) {
+      return Status::InvalidArgument("timestamp size mismatch");
+    }
+    UpdateProtectionInformationIfNeeded(buf, timestamp_, is_key);
+
+    char* ptr = const_cast<char*>(buf.data() + buf.size() - cf_ts_sz);
+    assert(ptr);
+    memcpy(ptr, timestamp_.data(), timestamp_.size());
+    return Status::OK();
+  }
+
+  void UpdateProtectionInformationIfNeeded(const Slice& buf, const Slice& ts,
+                                           bool is_key) {
+    if (prot_info_ != nullptr) {
+      const size_t ts_sz = ts.size();
+      SliceParts old(&buf, 1);
+      Slice old_no_ts(buf.data(), buf.size() - ts_sz);
+      std::array<Slice, 2> new_key_cmpts{{old_no_ts, ts}};
+      SliceParts new_parts(new_key_cmpts.data(), 2);
+      if (is_key) {
+        prot_info_->entries_[idx_].UpdateK(old, new_parts);
+      } else {
+        prot_info_->entries_[idx_].UpdateV(old, new_parts);
+      }
+    }
+  }
+
+  // No copy or move.
+  TimestampUpdater(const TimestampUpdater&) = delete;
+  TimestampUpdater(TimestampUpdater&&) = delete;
+  TimestampUpdater& operator=(const TimestampUpdater&) = delete;
+  TimestampUpdater& operator=(TimestampUpdater&&) = delete;
+
+  WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
+  const TimestampSizeFuncType ts_sz_func_{};
+  const Slice timestamp_;
+  size_t idx_ = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_callback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_callback.h
new file mode 100644
index 0000000000..106d020417
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_callback.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+
+class WriteCallback {
+ public:
+  virtual ~WriteCallback() {}
+
+  // Will be called while on the write thread before the write executes. If
+  // this function returns a non-OK status, the write will be aborted and this
+  // status will be returned to the caller of DB::Write().
+  virtual Status Callback(DB* db) = 0;
+
+  // return true if writes with this callback can be batched with other writes
+  virtual bool AllowWriteBatching() = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_controller.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_controller.cc
new file mode 100644
index 0000000000..c5f7443752
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_controller.cc
@@ -0,0 +1,121 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_controller.h"
+
+#include
+#include
+#include
+#include
+
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<WriteControllerToken> WriteController::GetStopToken() {
+  ++total_stopped_;
+  return std::unique_ptr<WriteControllerToken>(new StopWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken(
+    uint64_t write_rate) {
+  if (0 == total_delayed_++) {
+    // Starting delay, so reset counters.
+    next_refill_time_ = 0;
+    credit_in_bytes_ = 0;
+  }
+  // NOTE: for simplicity, any current credit_in_bytes_ or "debt" in
+  // next_refill_time_ will be based on an old rate. This rate will apply
+  // for subsequent additional debts and for the next refill.
+  set_delayed_write_rate(write_rate);
+  return std::unique_ptr<WriteControllerToken>(new DelayWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken>
+WriteController::GetCompactionPressureToken() {
+  ++total_compaction_pressure_;
+  return std::unique_ptr<WriteControllerToken>(
+      new CompactionPressureToken(this));
+}
+
+bool WriteController::IsStopped() const {
+  return total_stopped_.load(std::memory_order_relaxed) > 0;
+}
+// This is inside the DB mutex, so we can't sleep and need to minimize
+// the frequency of getting the time.
+// If it turns out to be a performance issue, we can redesign the thread
+// synchronization model here.
+// The function trusts that the caller will sleep for the number of
+// microseconds it returns.
+uint64_t WriteController::GetDelay(SystemClock* clock, uint64_t num_bytes) {
+  if (total_stopped_.load(std::memory_order_relaxed) > 0) {
+    return 0;
+  }
+  if (total_delayed_.load(std::memory_order_relaxed) == 0) {
+    return 0;
+  }
+
+  if (credit_in_bytes_ >= num_bytes) {
+    credit_in_bytes_ -= num_bytes;
+    return 0;
+  }
+  // The frequency to get time inside DB mutex is less than one per refill
+  // interval.
+  auto time_now = NowMicrosMonotonic(clock);
+
+  const uint64_t kMicrosPerSecond = 1000000;
+  // Refill every 1 ms
+  const uint64_t kMicrosPerRefill = 1000;
+
+  if (next_refill_time_ == 0) {
+    // Start with an initial allotment of bytes for one interval
+    next_refill_time_ = time_now;
+  }
+  if (next_refill_time_ <= time_now) {
+    // Refill based on time interval plus any extra elapsed
+    uint64_t elapsed = time_now - next_refill_time_ + kMicrosPerRefill;
+    credit_in_bytes_ += static_cast<uint64_t>(
+        1.0 * elapsed / kMicrosPerSecond * delayed_write_rate_ + 0.999999);
+    next_refill_time_ = time_now + kMicrosPerRefill;
+
+    if (credit_in_bytes_ >= num_bytes) {
+      // Avoid delay if possible, to reduce DB mutex release & re-acquire.
+      credit_in_bytes_ -= num_bytes;
+      return 0;
+    }
+  }
+
+  // We need to delay to avoid exceeding the write rate.
+  assert(num_bytes > credit_in_bytes_);
+  uint64_t bytes_over_budget = num_bytes - credit_in_bytes_;
+  uint64_t needed_delay = static_cast<uint64_t>(
+      1.0 * bytes_over_budget / delayed_write_rate_ * kMicrosPerSecond);
+
+  credit_in_bytes_ = 0;
+  next_refill_time_ += needed_delay;
+
+  // Minimum delay of one refill interval, to reduce DB mutex contention.
+  return std::max(next_refill_time_ - time_now, kMicrosPerRefill);
+}
+
+uint64_t WriteController::NowMicrosMonotonic(SystemClock* clock) {
+  return clock->NowNanos() / std::milli::den;
+}
+
+StopWriteToken::~StopWriteToken() {
+  assert(controller_->total_stopped_ >= 1);
+  --controller_->total_stopped_;
+}
+
+DelayWriteToken::~DelayWriteToken() {
+  controller_->total_delayed_--;
+  assert(controller_->total_delayed_.load() >= 0);
+}
+
+CompactionPressureToken::~CompactionPressureToken() {
+  controller_->total_compaction_pressure_--;
+  assert(controller_->total_compaction_pressure_ >= 0);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_controller.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_controller.h
new file mode 100644
index 0000000000..bcead165b3
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_controller.h
@@ -0,0 +1,148 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include
+
+#include
+#include
+
+#include "rocksdb/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SystemClock;
+class WriteControllerToken;
+
+// WriteController controls write stalls in our write code path. Write
+// stalls happen when compaction can't keep up with the write rate.
+// All of the methods here (including WriteControllerToken's destructors) need
+// to be called while holding the DB mutex
+class WriteController {
+ public:
+  explicit WriteController(uint64_t _delayed_write_rate = 1024u * 1024u * 32u,
+                           int64_t low_pri_rate_bytes_per_sec = 1024 * 1024)
+      : total_stopped_(0),
+        total_delayed_(0),
+        total_compaction_pressure_(0),
+        credit_in_bytes_(0),
+        next_refill_time_(0),
+        low_pri_rate_limiter_(
+            NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) {
+    set_max_delayed_write_rate(_delayed_write_rate);
+  }
+  ~WriteController() = default;
+
+  // When an actor (column family) requests a stop token, all writes will be
+  // stopped until the stop token is released (deleted)
+  std::unique_ptr<WriteControllerToken> GetStopToken();
+  // When an actor (column family) requests a delay token, the total delay for
+  // all writes to the DB will be controlled under the delayed write rate.
+  // Every write needs to call GetDelay() with the number of bytes to be
+  // written to the DB, which returns the number of microseconds to sleep.
+  std::unique_ptr<WriteControllerToken> GetDelayToken(
+      uint64_t delayed_write_rate);
+  // When an actor (column family) requests a compaction pressure token,
+  // compaction threads will be increased
+  std::unique_ptr<WriteControllerToken> GetCompactionPressureToken();
+
+  // these three methods query the state of the WriteController
+  bool IsStopped() const;
+  bool NeedsDelay() const { return total_delayed_.load() > 0; }
+  bool NeedSpeedupCompaction() const {
+    return IsStopped() || NeedsDelay() || total_compaction_pressure_.load() > 0;
+  }
+  // returns how many microseconds the caller needs to sleep after the call
+  // num_bytes: the number of bytes to put into the DB.
+  // Prerequisite: DB mutex held.
+  uint64_t GetDelay(SystemClock* clock, uint64_t num_bytes);
+  void set_delayed_write_rate(uint64_t write_rate) {
+    // avoid divide-by-0
+    if (write_rate == 0) {
+      write_rate = 1u;
+    } else if (write_rate > max_delayed_write_rate()) {
+      write_rate = max_delayed_write_rate();
+    }
+    delayed_write_rate_ = write_rate;
+  }
+
+  void set_max_delayed_write_rate(uint64_t write_rate) {
+    // avoid divide-by-0
+    if (write_rate == 0) {
+      write_rate = 1u;
+    }
+    max_delayed_write_rate_ = write_rate;
+    // update delayed_write_rate_ as well
+    delayed_write_rate_ = write_rate;
+  }
+
+  uint64_t delayed_write_rate() const { return delayed_write_rate_; }
+
+  uint64_t max_delayed_write_rate() const { return max_delayed_write_rate_; }
+
+  RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); }
+
+ private:
+  uint64_t NowMicrosMonotonic(SystemClock* clock);
+
+  friend class WriteControllerToken;
+  friend class StopWriteToken;
+  friend class DelayWriteToken;
+  friend class CompactionPressureToken;
+
+  std::atomic<int> total_stopped_;
+  std::atomic<int> total_delayed_;
+  std::atomic<int> total_compaction_pressure_;
+
+  // Number of bytes allowed to write without delay
+  uint64_t credit_in_bytes_;
+  // Next time that we can add more credit of bytes
+  uint64_t next_refill_time_;
+  // Write rate set at initialization or by `DBImpl::SetDBOptions`
+  uint64_t max_delayed_write_rate_;
+  // Current write rate (bytes / second)
+  uint64_t delayed_write_rate_;
+
+  std::unique_ptr<RateLimiter> low_pri_rate_limiter_;
+};
+
+class WriteControllerToken {
+ public:
+  explicit WriteControllerToken(WriteController* controller)
+      : controller_(controller) {}
+  virtual ~WriteControllerToken() {}
+
+ protected:
+  WriteController* controller_;
+
+ private:
+  // no copying allowed
+  WriteControllerToken(const WriteControllerToken&) = delete;
+  void operator=(const WriteControllerToken&) = delete;
+};
+
+class StopWriteToken : public WriteControllerToken {
+ public:
+  explicit StopWriteToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~StopWriteToken();
+};
+
+class DelayWriteToken : public WriteControllerToken {
+ public:
+  explicit DelayWriteToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~DelayWriteToken();
+};
+
+class CompactionPressureToken : public WriteControllerToken {
+ public:
+  explicit CompactionPressureToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~CompactionPressureToken();
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_stall_stats.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_stall_stats.cc
new file mode 100644
index 0000000000..3973df7685
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_stall_stats.cc
@@ -0,0 +1,179 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
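The delay-token path in write_controller.cc amounts to a token bucket: every 1 ms refill interval adds roughly `delayed_write_rate_ / 1000` bytes of credit, and `GetDelay` either spends credit or returns a sleep time that pushes `next_refill_time_` forward. The following usage sketch is illustrative only and not part of the vendored sources; it assumes RocksDB's `SystemClock::Default()`, and `SleepMicros` is a hypothetical stand-in for the sleep the real write path performs while not holding the mutex:

    // Illustrative sketch only -- not part of the vendored sources.
    #include <cstdint>
    #include <memory>

    #include "db/write_controller.h"
    #include "rocksdb/system_clock.h"

    using ROCKSDB_NAMESPACE::SystemClock;
    using ROCKSDB_NAMESPACE::WriteController;

    void SleepMicros(uint64_t micros);  // hypothetical sleep helper

    void WriteWithBackpressure(WriteController& controller,
                               uint64_t num_bytes) {
      // A stop token outstanding anywhere means no writes are admitted.
      if (controller.IsStopped()) {
        return;  // the real write path parks the writer instead
      }
      // GetDelay spends byte credit; a non-zero result is how long to sleep.
      // (The real caller holds the DB mutex here, per the comment above.)
      uint64_t delay_us =
          controller.GetDelay(SystemClock::Default().get(), num_bytes);
      if (delay_us > 0) {
        SleepMicros(delay_us);
      }
      // ... proceed to append num_bytes to the WAL/memtable ...
    }

A column family under compaction debt would pair this with a `controller.GetDelayToken(rate)` held for the duration of the stall; deleting the token lifts the delay.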
+
+#include "db/write_stall_stats.h"
+
+namespace ROCKSDB_NAMESPACE {
+const std::string& InvalidWriteStallHyphenString() {
+  static const std::string kInvalidWriteStallHyphenString = "invalid";
+  return kInvalidWriteStallHyphenString;
+}
+
+const std::string& WriteStallCauseToHyphenString(WriteStallCause cause) {
+  static const std::string kMemtableLimit = "memtable-limit";
+  static const std::string kL0FileCountLimit = "l0-file-count-limit";
+  static const std::string kPendingCompactionBytes = "pending-compaction-bytes";
+  static const std::string kWriteBufferManagerLimit =
+      "write-buffer-manager-limit";
+  switch (cause) {
+    case WriteStallCause::kMemtableLimit:
+      return kMemtableLimit;
+    case WriteStallCause::kL0FileCountLimit:
+      return kL0FileCountLimit;
+    case WriteStallCause::kPendingCompactionBytes:
+      return kPendingCompactionBytes;
+    case WriteStallCause::kWriteBufferManagerLimit:
+      return kWriteBufferManagerLimit;
+    default:
+      break;
+  }
+  return InvalidWriteStallHyphenString();
+}
+
+const std::string& WriteStallConditionToHyphenString(
+    WriteStallCondition condition) {
+  static const std::string kDelayed = "delays";
+  static const std::string kStopped = "stops";
+  switch (condition) {
+    case WriteStallCondition::kDelayed:
+      return kDelayed;
+    case WriteStallCondition::kStopped:
+      return kStopped;
+    default:
+      break;
+  }
+  return InvalidWriteStallHyphenString();
+}
+
+InternalStats::InternalCFStatsType InternalCFStat(
+    WriteStallCause cause, WriteStallCondition condition) {
+  switch (cause) {
+    case WriteStallCause::kMemtableLimit: {
+      switch (condition) {
+        case WriteStallCondition::kDelayed:
+          return InternalStats::MEMTABLE_LIMIT_DELAYS;
+        case WriteStallCondition::kStopped:
+          return InternalStats::MEMTABLE_LIMIT_STOPS;
+        case WriteStallCondition::kNormal:
+          break;
+      }
+      break;
+    }
+    case WriteStallCause::kL0FileCountLimit: {
+      switch (condition) {
+        case WriteStallCondition::kDelayed:
+          return InternalStats::L0_FILE_COUNT_LIMIT_DELAYS;
+        case WriteStallCondition::kStopped:
+          return InternalStats::L0_FILE_COUNT_LIMIT_STOPS;
+        case WriteStallCondition::kNormal:
+          break;
+      }
+      break;
+    }
+    case WriteStallCause::kPendingCompactionBytes: {
+      switch (condition) {
+        case WriteStallCondition::kDelayed:
+          return InternalStats::PENDING_COMPACTION_BYTES_LIMIT_DELAYS;
+        case WriteStallCondition::kStopped:
+          return InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS;
+        case WriteStallCondition::kNormal:
+          break;
+      }
+      break;
+    }
+    default:
+      break;
+  }
+  return InternalStats::INTERNAL_CF_STATS_ENUM_MAX;
+}
+
+InternalStats::InternalDBStatsType InternalDBStat(
+    WriteStallCause cause, WriteStallCondition condition) {
+  switch (cause) {
+    case WriteStallCause::kWriteBufferManagerLimit: {
+      switch (condition) {
+        case WriteStallCondition::kStopped:
+          return InternalStats::kIntStatsWriteBufferManagerLimitStopsCounts;
+        default:
+          break;
+      }
+      break;
+    }
+    default:
+      break;
+  }
+  return InternalStats::kIntStatsNumMax;
+}
+
+bool isCFScopeWriteStallCause(WriteStallCause cause) {
+  uint32_t int_cause = static_cast<uint32_t>(cause);
+  uint32_t lower_bound =
+      static_cast<uint32_t>(WriteStallCause::kCFScopeWriteStallCauseEnumMax) -
+      kNumCFScopeWriteStallCauses;
+  uint32_t upper_bound =
+      static_cast<uint32_t>(WriteStallCause::kCFScopeWriteStallCauseEnumMax) -
+      1;
+  return lower_bound <= int_cause && int_cause <= upper_bound;
+}
+
+bool isDBScopeWriteStallCause(WriteStallCause cause) {
+  uint32_t int_cause = static_cast<uint32_t>(cause);
+  uint32_t lower_bound =
+      static_cast<uint32_t>(WriteStallCause::kDBScopeWriteStallCauseEnumMax) -
+      kNumDBScopeWriteStallCauses;
+  uint32_t upper_bound =
+      static_cast<uint32_t>(WriteStallCause::kDBScopeWriteStallCauseEnumMax) -
+      1;
+  return lower_bound <= int_cause && int_cause <= upper_bound;
+}
+
+const std::string& WriteStallStatsMapKeys::TotalStops() {
+  static const std::string kTotalStops = "total-stops";
+  return kTotalStops;
+}
+
+const std::string& WriteStallStatsMapKeys::TotalDelays() {
+  static const std::string kTotalDelays = "total-delays";
+  return kTotalDelays;
+}
+
+const std::string&
+WriteStallStatsMapKeys::CFL0FileCountLimitDelaysWithOngoingCompaction() {
+  static const std::string ret =
+      "cf-l0-file-count-limit-delays-with-ongoing-compaction";
+  return ret;
+}
+
+const std::string&
+WriteStallStatsMapKeys::CFL0FileCountLimitStopsWithOngoingCompaction() {
+  static const std::string ret =
+      "cf-l0-file-count-limit-stops-with-ongoing-compaction";
+  return ret;
+}
+
+std::string WriteStallStatsMapKeys::CauseConditionCount(
+    WriteStallCause cause, WriteStallCondition condition) {
+  std::string cause_condition_count_name;
+
+  std::string cause_name;
+  if (isCFScopeWriteStallCause(cause) || isDBScopeWriteStallCause(cause)) {
+    cause_name = WriteStallCauseToHyphenString(cause);
+  } else {
+    assert(false);
+    return "";
+  }
+
+  const std::string& condition_name =
+      WriteStallConditionToHyphenString(condition);
+
+  cause_condition_count_name.reserve(cause_name.size() + 1 +
+                                     condition_name.size());
+  cause_condition_count_name.append(cause_name);
+  cause_condition_count_name.append("-");
+  cause_condition_count_name.append(condition_name);
+
+  return cause_condition_count_name;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_stall_stats.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_stall_stats.h
new file mode 100644
index 0000000000..6394abb0a8
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_stall_stats.h
@@ -0,0 +1,47 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
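Stall counters are exported as a string-keyed map, and `CauseConditionCount` in write_stall_stats.cc composes each key from the hyphenated cause and condition names. A small illustrative example, not part of the patch, of the strings these helpers produce:

    // Illustrative only -- not part of the vendored sources.
    #include <iostream>

    #include "db/write_stall_stats.h"

    using namespace ROCKSDB_NAMESPACE;

    int main() {
      // "memtable-limit" + "-" + "stops" => "memtable-limit-stops"
      std::cout << WriteStallStatsMapKeys::CauseConditionCount(
                       WriteStallCause::kMemtableLimit,
                       WriteStallCondition::kStopped)
                << "\n";
      // "l0-file-count-limit" + "-" + "delays" => "l0-file-count-limit-delays"
      std::cout << WriteStallStatsMapKeys::CauseConditionCount(
                       WriteStallCause::kL0FileCountLimit,
                       WriteStallCondition::kDelayed)
                << "\n";
      // Aggregate keys are fixed strings.
      std::cout << WriteStallStatsMapKeys::TotalStops() << "\n";
      return 0;
    }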
+
+#pragma once
+
+#include <string>
+
+#include "db/internal_stats.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+extern const std::string& InvalidWriteStallHyphenString();
+
+extern const std::string& WriteStallCauseToHyphenString(WriteStallCause cause);
+
+extern const std::string& WriteStallConditionToHyphenString(
+    WriteStallCondition condition);
+
+// REQUIRES:
+// `cause` is a CF-scope `WriteStallCause`, see `WriteStallCause` for more
+//
+// REQUIRES:
+// `condition` != `WriteStallCondition::kNormal`
+extern InternalStats::InternalCFStatsType InternalCFStat(
+    WriteStallCause cause, WriteStallCondition condition);
+
+// REQUIRES:
+// `cause` is a DB-scope `WriteStallCause`, see `WriteStallCause` for more
+//
+// REQUIRES:
+// `condition` != `WriteStallCondition::kNormal`
+extern InternalStats::InternalDBStatsType InternalDBStat(
+    WriteStallCause cause, WriteStallCondition condition);
+
+extern bool isCFScopeWriteStallCause(WriteStallCause cause);
+extern bool isDBScopeWriteStallCause(WriteStallCause cause);
+
+constexpr uint32_t kNumCFScopeWriteStallCauses =
+    static_cast<uint32_t>(WriteStallCause::kCFScopeWriteStallCauseEnumMax) -
+    static_cast<uint32_t>(WriteStallCause::kMemtableLimit);
+
+constexpr uint32_t kNumDBScopeWriteStallCauses =
+    static_cast<uint32_t>(WriteStallCause::kDBScopeWriteStallCauseEnumMax) -
+    static_cast<uint32_t>(WriteStallCause::kWriteBufferManagerLimit);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_thread.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_thread.cc
new file mode 100644
index 0000000000..7987007752
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_thread.cc
@@ -0,0 +1,844 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_thread.h"
+
+#include <chrono>
+#include <thread>
+
+#include "db/column_family.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+WriteThread::WriteThread(const ImmutableDBOptions& db_options)
+    : max_yield_usec_(db_options.enable_write_thread_adaptive_yield
+                          ? db_options.write_thread_max_yield_usec
+                          : 0),
+      slow_yield_usec_(db_options.write_thread_slow_yield_usec),
+      allow_concurrent_memtable_write_(
+          db_options.allow_concurrent_memtable_write),
+      enable_pipelined_write_(db_options.enable_pipelined_write),
+      max_write_batch_group_size_bytes(
+          db_options.max_write_batch_group_size_bytes),
+      newest_writer_(nullptr),
+      newest_memtable_writer_(nullptr),
+      last_sequence_(0),
+      write_stall_dummy_(),
+      stall_mu_(),
+      stall_cv_(&stall_mu_) {}
+
+uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) {
+  // We're going to block. Lazily create the mutex. We guarantee
+  // propagation of this construction to the waker via the
+  // STATE_LOCKED_WAITING state. The waker won't try to touch the mutex
+  // or the condvar unless they CAS away the STATE_LOCKED_WAITING that
+  // we install below.
+  w->CreateMutex();
+
+  auto state = w->state.load(std::memory_order_acquire);
+  assert(state != STATE_LOCKED_WAITING);
+  if ((state & goal_mask) == 0 &&
+      w->state.compare_exchange_strong(state, STATE_LOCKED_WAITING)) {
+    // we have permission (and an obligation) to use StateMutex
+    std::unique_lock<std::mutex> guard(w->StateMutex());
+    w->StateCV().wait(guard, [w] {
+      return w->state.load(std::memory_order_relaxed) != STATE_LOCKED_WAITING;
+    });
+    state = w->state.load(std::memory_order_relaxed);
+  }
+  // else tricky. Goal is met or CAS failed. In the latter case the waker
+  // must have changed the state, and compare_exchange_strong has updated
+  // our local variable with the new one. At the moment WriteThread never
+  // waits for a transition across intermediate states, so we know that
+  // since a state change has occurred the goal must have been met.
+  assert((state & goal_mask) != 0);
+  return state;
+}
+
+uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
+                                AdaptationContext* ctx) {
+  uint8_t state = 0;
+
+  // 1. Busy loop using "pause" for 1 micro sec
+  // 2. Else SOMETIMES busy loop using "yield" for 100 micro sec (default)
+  // 3. Else blocking wait
+
+  // On a modern Xeon each loop takes about 7 nanoseconds (most of which
+  // is the effect of the pause instruction), so 200 iterations is a bit
+  // more than a microsecond. This is long enough that waits longer than
+  // this can amortize the cost of accessing the clock and yielding.
+  for (uint32_t tries = 0; tries < 200; ++tries) {
+    state = w->state.load(std::memory_order_acquire);
+    if ((state & goal_mask) != 0) {
+      return state;
+    }
+    port::AsmVolatilePause();
+  }
+
+  // This is below the fast path, so that the stat is zero when all writes are
+  // from the same thread.
+  PERF_TIMER_GUARD(write_thread_wait_nanos);
+
+  // If we're only going to end up waiting a short period of time,
+  // it can be a lot more efficient to call std::this_thread::yield()
+  // in a loop than to block in StateMutex(). For reference, on my 4.0
+  // SELinux test server with support for syscall auditing enabled, the
+  // minimum latency between FUTEX_WAKE to returning from FUTEX_WAIT is
+  // 2.7 usec, and the average is more like 10 usec. That can be a big
+  // drag on RocksDB's single-writer design. Of course, spinning is a
+  // bad idea if other threads are waiting to run or if we're going to
+  // wait for a long time. How do we decide?
+  //
+  // We break waiting into 3 categories: short-uncontended,
+  // short-contended, and long. If we had an oracle, then we would always
+  // spin for short-uncontended, always block for long, and our choice for
+  // short-contended might depend on whether we were trying to optimize
+  // RocksDB throughput or avoid being greedy with system resources.
+  //
+  // Bucketing into short or long is easy by measuring elapsed time.
+  // Differentiating short-uncontended from short-contended is a bit
+  // trickier, but not too bad. We could look for involuntary context
+  // switches using getrusage(RUSAGE_THREAD, ..), but it's less work
+  // (portability code and CPU) to just look for yield calls that take
+  // longer than we expect. sched_yield() doesn't actually result in any
+  // context switch overhead if there are no other runnable processes
+  // on the current core, in which case it usually takes less than
+  // a microsecond.
+  //
+  // There are two primary tunables here: the threshold between "short"
+  // and "long" waits, and the threshold at which we suspect that a yield
+  // is slow enough to indicate we should probably block. If these
+  // thresholds are chosen well then CPU-bound workloads that don't
+  // have more threads than cores will experience few context switches
+  // (voluntary or involuntary), and the total number of context switches
+  // (voluntary and involuntary) will not be dramatically larger (maybe
+  // 2x) than the number of voluntary context switches that occur when
+  // --max_yield_wait_micros=0.
+  //
+  // There's another constant, which is the number of slow yields we will
+  // tolerate before reversing our previous decision. Solitary slow
+  // yields are pretty common (low-priority small jobs ready to run),
+  // so this should be at least 2. We set this conservatively to 3 so
+  // that we can also immediately schedule a ctx adaptation, rather than
+  // waiting for the next update_ctx.
+
+  const size_t kMaxSlowYieldsWhileSpinning = 3;
+
+  // Whether the yield approach has any credit in this context. Credit is
+  // added when a yield is successful before timing out, and decreased
+  // otherwise.
+  auto& yield_credit = ctx->value;
+  // Update the yield_credit based on sample runs or right after a hard failure
+  bool update_ctx = false;
+  // Should we reinforce the yield credit
+  bool would_spin_again = false;
+  // The sampling base for updating the yield credit. The sampling rate is
+  // 1/sampling_base.
+  const int sampling_base = 256;
+
+  if (max_yield_usec_ > 0) {
+    update_ctx = Random::GetTLSInstance()->OneIn(sampling_base);
+
+    if (update_ctx || yield_credit.load(std::memory_order_relaxed) >= 0) {
+      // we're updating the adaptation statistics, or spinning has >
+      // 50% chance of being shorter than max_yield_usec_ and causing no
+      // involuntary context switches
+      auto spin_begin = std::chrono::steady_clock::now();
+
+      // this variable doesn't include the final yield (if any) that
+      // causes the goal to be met
+      size_t slow_yield_count = 0;
+
+      auto iter_begin = spin_begin;
+      while ((iter_begin - spin_begin) <=
+             std::chrono::microseconds(max_yield_usec_)) {
+        std::this_thread::yield();
+
+        state = w->state.load(std::memory_order_acquire);
+        if ((state & goal_mask) != 0) {
+          // success
+          would_spin_again = true;
+          break;
+        }
+
+        auto now = std::chrono::steady_clock::now();
+        if (now == iter_begin ||
+            now - iter_begin >= std::chrono::microseconds(slow_yield_usec_)) {
+          // conservatively count it as a slow yield if our clock isn't
+          // accurate enough to measure the yield duration
+          ++slow_yield_count;
+          if (slow_yield_count >= kMaxSlowYieldsWhileSpinning) {
+            // Not just one ivcsw, but several. Immediately update
+            // yield_credit and fall back to blocking
+            update_ctx = true;
+            break;
+          }
+        }
+        iter_begin = now;
+      }
+    }
+  }
+
+  if ((state & goal_mask) == 0) {
+    TEST_SYNC_POINT_CALLBACK("WriteThread::AwaitState:BlockingWaiting", w);
+    state = BlockingAwaitState(w, goal_mask);
+  }
+
+  if (update_ctx) {
+    // Since our update is sample based, it is ok if a thread overwrites the
+    // updates by other threads. Thus the update does not have to be atomic.
+    auto v = yield_credit.load(std::memory_order_relaxed);
+    // fixed point exponential decay with decay constant 1/1024, with +1
+    // and -1 scaled to avoid overflow for int32_t
+    //
+    // On each update the positive credit is decayed by a factor of 1/1024
+    // (i.e., 0.1%). If the sampled yield was successful, the credit is also
+    // increased by X. Setting X=2^17 ensures that the credit never exceeds
+    // 2^17*2^10=2^27, which is lower than 2^31, the upper bound of int32_t.
+    // The same logic applies to negative credits.
+    v = v - (v / 1024) + (would_spin_again ? 1 : -1) * 131072;
+    yield_credit.store(v, std::memory_order_relaxed);
+  }
+
+  assert((state & goal_mask) != 0);
+  return state;
+}
+
+void WriteThread::SetState(Writer* w, uint8_t new_state) {
+  assert(w);
+  auto state = w->state.load(std::memory_order_acquire);
+  if (state == STATE_LOCKED_WAITING ||
+      !w->state.compare_exchange_strong(state, new_state)) {
+    assert(state == STATE_LOCKED_WAITING);
+
+    std::lock_guard<std::mutex> guard(w->StateMutex());
+    assert(w->state.load(std::memory_order_relaxed) != new_state);
+    w->state.store(new_state, std::memory_order_relaxed);
+    w->StateCV().notify_one();
+  }
+}
+
+bool WriteThread::LinkOne(Writer* w, std::atomic<Writer*>* newest_writer) {
+  assert(newest_writer != nullptr);
+  assert(w->state == STATE_INIT);
+  Writer* writers = newest_writer->load(std::memory_order_relaxed);
+  while (true) {
+    assert(writers != w);
+    // If a write stall is in effect and w->no_slowdown is not true,
+    // block here until the stall is cleared. If it's true, then return
+    // immediately
+    if (writers == &write_stall_dummy_) {
+      if (w->no_slowdown) {
+        w->status = Status::Incomplete("Write stall");
+        SetState(w, STATE_COMPLETED);
+        return false;
+      }
+      // Since no_slowdown is false, wait here to be notified of the write
+      // stall clearing
+      {
+        MutexLock lock(&stall_mu_);
+        writers = newest_writer->load(std::memory_order_relaxed);
+        if (writers == &write_stall_dummy_) {
+          TEST_SYNC_POINT_CALLBACK("WriteThread::WriteStall::Wait", w);
+          stall_cv_.Wait();
+          // Load newest_writers_ again since it may have changed
+          writers = newest_writer->load(std::memory_order_relaxed);
+          continue;
+        }
+      }
+    }
+    w->link_older = writers;
+    if (newest_writer->compare_exchange_weak(writers, w)) {
+      return (writers == nullptr);
+    }
+  }
+}
+
+bool WriteThread::LinkGroup(WriteGroup& write_group,
+                            std::atomic<Writer*>* newest_writer) {
+  assert(newest_writer != nullptr);
+  Writer* leader = write_group.leader;
+  Writer* last_writer = write_group.last_writer;
+  Writer* w = last_writer;
+  while (true) {
+    // Unset link_newer pointers to make sure that when we call
+    // CreateMissingNewerLinks later it creates all missing links.
+    w->link_newer = nullptr;
+    w->write_group = nullptr;
+    if (w == leader) {
+      break;
+    }
+    w = w->link_older;
+  }
+  Writer* newest = newest_writer->load(std::memory_order_relaxed);
+  while (true) {
+    leader->link_older = newest;
+    if (newest_writer->compare_exchange_weak(newest, last_writer)) {
+      return (newest == nullptr);
+    }
+  }
+}
+
+void WriteThread::CreateMissingNewerLinks(Writer* head) {
+  while (true) {
+    Writer* next = head->link_older;
+    if (next == nullptr || next->link_newer != nullptr) {
+      assert(next == nullptr || next->link_newer == head);
+      break;
+    }
+    next->link_newer = head;
+    head = next;
+  }
+}
+
+void WriteThread::CompleteLeader(WriteGroup& write_group) {
+  assert(write_group.size > 0);
+  Writer* leader = write_group.leader;
+  if (write_group.size == 1) {
+    write_group.leader = nullptr;
+    write_group.last_writer = nullptr;
+  } else {
+    assert(leader->link_newer != nullptr);
+    leader->link_newer->link_older = nullptr;
+    write_group.leader = leader->link_newer;
+  }
+  write_group.size -= 1;
+  SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::CompleteFollower(Writer* w, WriteGroup& write_group) {
+  assert(write_group.size > 1);
+  assert(w != write_group.leader);
+  if (w == write_group.last_writer) {
+    w->link_older->link_newer = nullptr;
+    write_group.last_writer = w->link_older;
+  } else {
+    w->link_older->link_newer = w->link_newer;
+    w->link_newer->link_older = w->link_older;
+  }
+  write_group.size -= 1;
+  SetState(w, STATE_COMPLETED);
+}
+
+void WriteThread::BeginWriteStall() {
+  ++stall_begun_count_;
+  LinkOne(&write_stall_dummy_, &newest_writer_);
+
+  // Walk the writer list until w->write_group != nullptr. The current write
+  // group will not have a mix of slowdown/no_slowdown, so it's ok to stop at
+  // that point
+  Writer* w = write_stall_dummy_.link_older;
+  Writer* prev = &write_stall_dummy_;
+  while (w != nullptr && w->write_group == nullptr) {
+    if (w->no_slowdown) {
+      prev->link_older = w->link_older;
+      w->status = Status::Incomplete("Write stall");
+      SetState(w, STATE_COMPLETED);
+      // Only update `link_newer` if it's already set.
+      // `CreateMissingNewerLinks()` will update the nullptr `link_newer`
+      // later, which assumes the first non-nullptr `link_newer` is the last
+      // nullptr link in the writer list.
+      // If `link_newer` is set here, `CreateMissingNewerLinks()` may stop
+      // updating the whole list when it sees the first non-nullptr link.
+      if (prev->link_older && prev->link_older->link_newer) {
+        prev->link_older->link_newer = prev;
+      }
+      w = prev->link_older;
+    } else {
+      prev = w;
+      w = w->link_older;
+    }
+  }
+}
+
+void WriteThread::EndWriteStall() {
+  MutexLock lock(&stall_mu_);
+
+  // Unlink write_stall_dummy_ from the write queue. This will unblock
+  // pending write threads to enqueue themselves
+  assert(newest_writer_.load(std::memory_order_relaxed) == &write_stall_dummy_);
+  // write_stall_dummy_.link_older can be nullptr only if LockWAL() has been
+  // called.
+  if (write_stall_dummy_.link_older) {
+    write_stall_dummy_.link_older->link_newer = write_stall_dummy_.link_newer;
+  }
+  newest_writer_.exchange(write_stall_dummy_.link_older);
+
+  ++stall_ended_count_;
+
+  // Wake up writers
+  stall_cv_.SignalAll();
+}
+
+uint64_t WriteThread::GetBegunCountOfOutstandingStall() {
+  if (stall_begun_count_ > stall_ended_count_) {
+    // Outstanding stall in queue
+    assert(newest_writer_.load(std::memory_order_relaxed) ==
+           &write_stall_dummy_);
+    return stall_begun_count_;
+  } else {
+    // No stall in queue
+    assert(newest_writer_.load(std::memory_order_relaxed) !=
+           &write_stall_dummy_);
+    return 0;
+  }
+}
+
+void WriteThread::WaitForStallEndedCount(uint64_t stall_count) {
+  MutexLock lock(&stall_mu_);
+
+  while (stall_ended_count_ < stall_count) {
+    stall_cv_.Wait();
+  }
+}
+
+static WriteThread::AdaptationContext jbg_ctx("JoinBatchGroup");
+void WriteThread::JoinBatchGroup(Writer* w) {
+  TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Start", w);
+  assert(w->batch != nullptr);
+
+  bool linked_as_leader = LinkOne(w, &newest_writer_);
+
+  if (linked_as_leader) {
+    SetState(w, STATE_GROUP_LEADER);
+  }
+
+  TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait", w);
+  TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait2", w);
+
+  if (!linked_as_leader) {
+    /**
+     * Wait until:
+     * 1) An existing leader picks us as the new leader when it finishes
+     * 2) An existing leader picks us as its follower and
+     * 2.1) finishes the memtable writes on our behalf
+     * 2.2) Or tells us to finish the memtable writes in parallel
+     * 3) (pipelined write) An existing leader picks us as its follower and
+     * finishes book-keeping and the WAL write for us, enqueues us as a
+     * pending memtable writer, and
+     * 3.1) we become memtable writer group leader, or
+     * 3.2) an existing memtable writer group leader tells us to finish
+     * memtable writes in parallel.
+     */
+    TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:BeganWaiting", w);
+    AwaitState(w,
+               STATE_GROUP_LEADER | STATE_MEMTABLE_WRITER_LEADER |
+                   STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
+               &jbg_ctx);
+    TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w);
+  }
+}
+
+size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader,
+                                            WriteGroup* write_group) {
+  assert(leader->link_older == nullptr);
+  assert(leader->batch != nullptr);
+  assert(write_group != nullptr);
+
+  size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+  // Allow the group to grow up to a maximum size, but if the
+  // original write is small, limit the growth so we do not slow
+  // down the small write too much.
+  size_t max_size = max_write_batch_group_size_bytes;
+  const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+  if (size <= min_batch_size_bytes) {
+    max_size = size + min_batch_size_bytes;
+  }
+
+  leader->write_group = write_group;
+  write_group->leader = leader;
+  write_group->last_writer = leader;
+  write_group->size = 1;
+  Writer* newest_writer = newest_writer_.load(std::memory_order_acquire);
+
+  // This is safe regardless of any db mutex status of the caller. Previous
+  // calls to ExitAsGroupLeader either didn't call CreateMissingNewerLinks
+  // (they emptied the list and then we added ourself as leader) or had to
+  // explicitly wake us up (the list was non-empty when we added ourself,
+  // so we have already received our MarkJoined).
+  CreateMissingNewerLinks(newest_writer);
+
+  // Tricky. Iteration start (leader) is exclusive and finish
+  // (newest_writer) is inclusive.
Iteration goes from old to new. + Writer* w = leader; + while (w != newest_writer) { + assert(w->link_newer); + w = w->link_newer; + + if (w->sync && !leader->sync) { + // Do not include a sync write into a batch handled by a non-sync write. + break; + } + + if (w->no_slowdown != leader->no_slowdown) { + // Do not mix writes that are ok with delays with the ones that + // request fail on delays. + break; + } + + if (w->disable_wal != leader->disable_wal) { + // Do not mix writes that enable WAL with the ones whose + // WAL disabled. + break; + } + + if (w->protection_bytes_per_key != leader->protection_bytes_per_key) { + // Do not mix writes with different levels of integrity protection. + break; + } + + if (w->rate_limiter_priority != leader->rate_limiter_priority) { + // Do not mix writes with different rate limiter priorities. + break; + } + + if (w->batch == nullptr) { + // Do not include those writes with nullptr batch. Those are not writes, + // those are something else. They want to be alone + break; + } + + if (w->callback != nullptr && !w->callback->AllowWriteBatching()) { + // don't batch writes that don't want to be batched + break; + } + + auto batch_size = WriteBatchInternal::ByteSize(w->batch); + if (size + batch_size > max_size) { + // Do not make batch too big + break; + } + + w->write_group = write_group; + size += batch_size; + write_group->last_writer = w; + write_group->size++; + } + TEST_SYNC_POINT_CALLBACK("WriteThread::EnterAsBatchGroupLeader:End", w); + return size; +} + +void WriteThread::EnterAsMemTableWriter(Writer* leader, + WriteGroup* write_group) { + assert(leader != nullptr); + assert(leader->link_older == nullptr); + assert(leader->batch != nullptr); + assert(write_group != nullptr); + + size_t size = WriteBatchInternal::ByteSize(leader->batch); + + // Allow the group to grow up to a maximum size, but if the + // original write is small, limit the growth so we do not slow + // down the small write too much. 
+ size_t max_size = max_write_batch_group_size_bytes; + const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8; + if (size <= min_batch_size_bytes) { + max_size = size + min_batch_size_bytes; + } + + leader->write_group = write_group; + write_group->leader = leader; + write_group->size = 1; + Writer* last_writer = leader; + + if (!allow_concurrent_memtable_write_ || !leader->batch->HasMerge()) { + Writer* newest_writer = newest_memtable_writer_.load(); + CreateMissingNewerLinks(newest_writer); + + Writer* w = leader; + while (w != newest_writer) { + assert(w->link_newer); + w = w->link_newer; + + if (w->batch == nullptr) { + break; + } + + if (w->batch->HasMerge()) { + break; + } + + if (!allow_concurrent_memtable_write_) { + auto batch_size = WriteBatchInternal::ByteSize(w->batch); + if (size + batch_size > max_size) { + // Do not make batch too big + break; + } + size += batch_size; + } + + w->write_group = write_group; + last_writer = w; + write_group->size++; + } + } + + write_group->last_writer = last_writer; + write_group->last_sequence = + last_writer->sequence + WriteBatchInternal::Count(last_writer->batch) - 1; +} + +void WriteThread::ExitAsMemTableWriter(Writer* /*self*/, + WriteGroup& write_group) { + Writer* leader = write_group.leader; + Writer* last_writer = write_group.last_writer; + + Writer* newest_writer = last_writer; + if (!newest_memtable_writer_.compare_exchange_strong(newest_writer, + nullptr)) { + CreateMissingNewerLinks(newest_writer); + Writer* next_leader = last_writer->link_newer; + assert(next_leader != nullptr); + next_leader->link_older = nullptr; + SetState(next_leader, STATE_MEMTABLE_WRITER_LEADER); + } + Writer* w = leader; + while (true) { + if (!write_group.status.ok()) { + w->status = write_group.status; + } + Writer* next = w->link_newer; + if (w != leader) { + SetState(w, STATE_COMPLETED); + } + if (w == last_writer) { + break; + } + assert(next); + w = next; + } + // Note that leader has to exit last, since it owns the write group. + SetState(leader, STATE_COMPLETED); +} + +void WriteThread::LaunchParallelMemTableWriters(WriteGroup* write_group) { + assert(write_group != nullptr); + write_group->running.store(write_group->size); + for (auto w : *write_group) { + SetState(w, STATE_PARALLEL_MEMTABLE_WRITER); + } +} + +static WriteThread::AdaptationContext cpmtw_ctx( + "CompleteParallelMemTableWriter"); +// This method is called by both the leader and parallel followers +bool WriteThread::CompleteParallelMemTableWriter(Writer* w) { + auto* write_group = w->write_group; + if (!w->status.ok()) { + std::lock_guard guard(write_group->leader->StateMutex()); + write_group->status = w->status; + } + + if (write_group->running-- > 1) { + // we're not the last one + AwaitState(w, STATE_COMPLETED, &cpmtw_ctx); + return false; + } + // else we're the last parallel worker and should perform exit duties. + w->status = write_group->status; + // Callers of this function must ensure w->status is checked. 
+ write_group->status.PermitUncheckedError(); + return true; +} + +void WriteThread::ExitAsBatchGroupFollower(Writer* w) { + auto* write_group = w->write_group; + + assert(w->state == STATE_PARALLEL_MEMTABLE_WRITER); + assert(write_group->status.ok()); + ExitAsBatchGroupLeader(*write_group, write_group->status); + assert(w->status.ok()); + assert(w->state == STATE_COMPLETED); + SetState(write_group->leader, STATE_COMPLETED); +} + +static WriteThread::AdaptationContext eabgl_ctx("ExitAsBatchGroupLeader"); +void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group, + Status& status) { + TEST_SYNC_POINT_CALLBACK("WriteThread::ExitAsBatchGroupLeader:Start", + &write_group); + + Writer* leader = write_group.leader; + Writer* last_writer = write_group.last_writer; + assert(leader->link_older == nullptr); + + // If status is non-ok already, then write_group.status won't have the chance + // of being propagated to caller. + if (!status.ok()) { + write_group.status.PermitUncheckedError(); + } + + // Propagate memtable write error to the whole group. + if (status.ok() && !write_group.status.ok()) { + status = write_group.status; + } + + if (enable_pipelined_write_) { + // We insert a dummy Writer right before our current write_group. This + // allows us to unlink our write_group without the risk that a subsequent + // writer becomes a new leader and might overtake us and add itself to the + // memtable-writer-list before we can do so. This ensures that writers are + // added to the memtable-writer-list in the exact same order in which they + // were in the newest_writer list. + // This must happen before completing the writers from our group to prevent + // a race where the owning thread of one of these writers can start a new + // write operation. + Writer dummy; + Writer* head = newest_writer_.load(std::memory_order_acquire); + if (head != last_writer || + !newest_writer_.compare_exchange_strong(head, &dummy)) { + // Either last_writer wasn't the head during the load(), or it was the + // head during the load() but somebody else pushed onto the list before + // we did the compare_exchange_strong (causing it to fail). In the latter + // case compare_exchange_strong has the effect of re-reading its first + // param (head). No need to retry a failing CAS, because only a departing + // leader (which we are at the moment) can remove nodes from the list. + assert(head != last_writer); + + // After walking link_older starting from head (if not already done) we + // will be able to traverse w->link_newer below. + CreateMissingNewerLinks(head); + assert(last_writer->link_newer != nullptr); + last_writer->link_newer->link_older = &dummy; + dummy.link_newer = last_writer->link_newer; + } + + // Complete writers that don't write to memtable + for (Writer* w = last_writer; w != leader;) { + Writer* next = w->link_older; + w->status = status; + if (!w->ShouldWriteToMemtable()) { + CompleteFollower(w, write_group); + } + w = next; + } + if (!leader->ShouldWriteToMemtable()) { + CompleteLeader(write_group); + } + + TEST_SYNC_POINT_CALLBACK( + "WriteThread::ExitAsBatchGroupLeader:AfterCompleteWriters", + &write_group); + + // Link the remaining of the group to memtable writer list. + // We have to link our group to memtable writer queue before wake up the + // next leader or set newest_writer_ to null, otherwise the next leader + // can run ahead of us and link to memtable writer queue before we do. 
+ if (write_group.size > 0) { + if (LinkGroup(write_group, &newest_memtable_writer_)) { + // The leader can now be different from current writer. + SetState(write_group.leader, STATE_MEMTABLE_WRITER_LEADER); + } + } + + // Unlink the dummy writer from the list and identify the new leader + head = newest_writer_.load(std::memory_order_acquire); + if (head != &dummy || + !newest_writer_.compare_exchange_strong(head, nullptr)) { + CreateMissingNewerLinks(head); + Writer* new_leader = dummy.link_newer; + assert(new_leader != nullptr); + new_leader->link_older = nullptr; + SetState(new_leader, STATE_GROUP_LEADER); + } + + AwaitState(leader, + STATE_MEMTABLE_WRITER_LEADER | STATE_PARALLEL_MEMTABLE_WRITER | + STATE_COMPLETED, + &eabgl_ctx); + } else { + Writer* head = newest_writer_.load(std::memory_order_acquire); + if (head != last_writer || + !newest_writer_.compare_exchange_strong(head, nullptr)) { + // Either last_writer wasn't the head during the load(), or it was the + // head during the load() but somebody else pushed onto the list before + // we did the compare_exchange_strong (causing it to fail). In the + // latter case compare_exchange_strong has the effect of re-reading + // its first param (head). No need to retry a failing CAS, because + // only a departing leader (which we are at the moment) can remove + // nodes from the list. + assert(head != last_writer); + + // After walking link_older starting from head (if not already done) + // we will be able to traverse w->link_newer below. This function + // can only be called from an active leader, only a leader can + // clear newest_writer_, we didn't, and only a clear newest_writer_ + // could cause the next leader to start their work without a call + // to MarkJoined, so we can definitely conclude that no other leader + // work is going on here (with or without db mutex). + CreateMissingNewerLinks(head); + assert(last_writer->link_newer != nullptr); + assert(last_writer->link_newer->link_older == last_writer); + last_writer->link_newer->link_older = nullptr; + + // Next leader didn't self-identify, because newest_writer_ wasn't + // nullptr when they enqueued (we were definitely enqueued before them + // and are still in the list). That means leader handoff occurs when + // we call MarkJoined + SetState(last_writer->link_newer, STATE_GROUP_LEADER); + } + // else nobody else was waiting, although there might already be a new + // leader now + + while (last_writer != leader) { + assert(last_writer); + last_writer->status = status; + // we need to read link_older before calling SetState, because as soon + // as it is marked committed the other thread's Await may return and + // deallocate the Writer. 
+      auto next = last_writer->link_older;
+      SetState(last_writer, STATE_COMPLETED);
+
+      last_writer = next;
+    }
+  }
+}
+
+static WriteThread::AdaptationContext eu_ctx("EnterUnbatched");
+void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) {
+  assert(w != nullptr && w->batch == nullptr);
+  mu->Unlock();
+  bool linked_as_leader = LinkOne(w, &newest_writer_);
+  if (!linked_as_leader) {
+    TEST_SYNC_POINT("WriteThread::EnterUnbatched:Wait");
+    // Last leader will not pick us as a follower since our batch is nullptr
+    AwaitState(w, STATE_GROUP_LEADER, &eu_ctx);
+  }
+  if (enable_pipelined_write_) {
+    WaitForMemTableWriters();
+  }
+  mu->Lock();
+}
+
+void WriteThread::ExitUnbatched(Writer* w) {
+  assert(w != nullptr);
+  Writer* newest_writer = w;
+  if (!newest_writer_.compare_exchange_strong(newest_writer, nullptr)) {
+    CreateMissingNewerLinks(newest_writer);
+    Writer* next_leader = w->link_newer;
+    assert(next_leader != nullptr);
+    next_leader->link_older = nullptr;
+    SetState(next_leader, STATE_GROUP_LEADER);
+  }
+}
+
+static WriteThread::AdaptationContext wfmw_ctx("WaitForMemTableWriters");
+void WriteThread::WaitForMemTableWriters() {
+  assert(enable_pipelined_write_);
+  if (newest_memtable_writer_.load() == nullptr) {
+    return;
+  }
+  Writer w;
+  if (!LinkOne(&w, &newest_memtable_writer_)) {
+    AwaitState(&w, STATE_MEMTABLE_WRITER_LEADER, &wfmw_ctx);
+  }
+  newest_memtable_writer_.store(nullptr);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_thread.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_thread.h
new file mode 100644
index 0000000000..6e5805e376
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db/write_thread.h
@@ -0,0 +1,464 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <condition_variable>
+#include <cstdint>
+#include <mutex>
+#include <type_traits>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/post_memtable_callback.h"
+#include "db/pre_release_callback.h"
+#include "db/write_callback.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteThread {
+ public:
+  enum State : uint8_t {
+    // The initial state of a writer. This is a Writer that is
+    // waiting in JoinBatchGroup. This state can be left when another
+    // thread informs the waiter that it has become a group leader
+    // (-> STATE_GROUP_LEADER), when a leader that has chosen to be
+    // non-parallel informs a follower that its writes have been committed
+    // (-> STATE_COMPLETED), or when a leader that has chosen to perform
+    // updates in parallel and needs this Writer to apply its batch (->
+    // STATE_PARALLEL_MEMTABLE_WRITER).
+    STATE_INIT = 1,
+
+    // The state used to inform a waiting Writer that it has become the
+    // leader, and it should now build a write batch group. Tricky:
+    // this state is not used if newest_writer_ is empty when a writer
+    // enqueues itself, because there is no need to wait (or even to
+    // create the mutex and condvar used to wait) in that case. This is
+    // a terminal state unless the leader chooses to make this a parallel
+    // batch, in which case the last parallel worker to finish will move
+    // the leader to STATE_COMPLETED.
+    STATE_GROUP_LEADER = 2,
+
+    // The state used to inform a waiting writer that it has become the
+    // leader of a memtable writer group. The leader will either write the
+    // memtable for the whole group, or launch a parallel group write
+    // to the memtable by calling LaunchParallelMemTableWrite.
+    STATE_MEMTABLE_WRITER_LEADER = 4,
+
+    // The state used to inform a waiting writer that it has become a
+    // parallel memtable writer. It can be the group leader that launched the
+    // parallel writer group, or one of the followers. The writer should then
+    // apply its batch to the memtable concurrently and call
+    // CompleteParallelMemTableWriter.
+    STATE_PARALLEL_MEMTABLE_WRITER = 8,
+
+    // A follower whose writes have been applied, or a parallel leader
+    // whose followers have all finished their work. This is a terminal
+    // state.
+    STATE_COMPLETED = 16,
+
+    // A state indicating that the thread may be waiting using StateMutex()
+    // and StateCondVar()
+    STATE_LOCKED_WAITING = 32,
+  };
+
+  struct Writer;
+
+  struct WriteGroup {
+    Writer* leader = nullptr;
+    Writer* last_writer = nullptr;
+    SequenceNumber last_sequence;
+    // before running goes to zero, status needs leader->StateMutex()
+    Status status;
+    std::atomic<size_t> running;
+    size_t size = 0;
+
+    struct Iterator {
+      Writer* writer;
+      Writer* last_writer;
+
+      explicit Iterator(Writer* w, Writer* last)
+          : writer(w), last_writer(last) {}
+
+      Writer* operator*() const { return writer; }
+
+      Iterator& operator++() {
+        assert(writer != nullptr);
+        if (writer == last_writer) {
+          writer = nullptr;
+        } else {
+          writer = writer->link_newer;
+        }
+        return *this;
+      }
+
+      bool operator!=(const Iterator& other) const {
+        return writer != other.writer;
+      }
+    };
+
+    Iterator begin() const { return Iterator(leader, last_writer); }
+    Iterator end() const { return Iterator(nullptr, nullptr); }
+  };
+
+  // Information kept for every waiting writer.
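+  // [Editorial note, not part of the vendored source] A hedged sketch of how
+  // a caller such as DBImpl::WriteImpl typically drives the Writer defined
+  // below through this state machine (names and arguments are illustrative):
+  //
+  //   WriteThread::Writer w(write_options, my_batch, /*callback=*/nullptr,
+  //                         /*log_ref=*/0, /*disable_memtable=*/false);
+  //   write_thread_.JoinBatchGroup(&w);
+  //   if (w.state == WriteThread::STATE_GROUP_LEADER) {
+  //     WriteThread::WriteGroup group;
+  //     write_thread_.EnterAsBatchGroupLeader(&w, &group);
+  //     // ... write the WAL and memtable for the whole group ...
+  //     write_thread_.ExitAsBatchGroupLeader(group, status);
+  //   }  // non-leaders typically return with their write already done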
+  struct Writer {
+    WriteBatch* batch;
+    bool sync;
+    bool no_slowdown;
+    bool disable_wal;
+    Env::IOPriority rate_limiter_priority;
+    bool disable_memtable;
+    size_t batch_cnt;  // if non-zero, number of sub-batches in the write batch
+    size_t protection_bytes_per_key;
+    PreReleaseCallback* pre_release_callback;
+    PostMemTableCallback* post_memtable_callback;
+    uint64_t log_used;  // log number that this batch was inserted into
+    uint64_t log_ref;   // log number that memtable insert should reference
+    WriteCallback* callback;
+    bool made_waitable;          // records lazy construction of mutex and cv
+    std::atomic<uint8_t> state;  // write under StateMutex() or pre-link
+    WriteGroup* write_group;
+    SequenceNumber sequence;  // the sequence number to use for the first key
+    Status status;
+    Status callback_status;  // status returned by callback->Callback()
+
+    std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes;
+    std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes;
+    Writer* link_older;  // read/write only before linking, or as leader
+    Writer* link_newer;  // lazy, read/write only before linking, or as leader
+
+    Writer()
+        : batch(nullptr),
+          sync(false),
+          no_slowdown(false),
+          disable_wal(false),
+          rate_limiter_priority(Env::IOPriority::IO_TOTAL),
+          disable_memtable(false),
+          batch_cnt(0),
+          protection_bytes_per_key(0),
+          pre_release_callback(nullptr),
+          post_memtable_callback(nullptr),
+          log_used(0),
+          log_ref(0),
+          callback(nullptr),
+          made_waitable(false),
+          state(STATE_INIT),
+          write_group(nullptr),
+          sequence(kMaxSequenceNumber),
+          link_older(nullptr),
+          link_newer(nullptr) {}
+
+    Writer(const WriteOptions& write_options, WriteBatch* _batch,
+           WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable,
+           size_t _batch_cnt = 0,
+           PreReleaseCallback* _pre_release_callback = nullptr,
+           PostMemTableCallback* _post_memtable_callback = nullptr)
+        : batch(_batch),
+          sync(write_options.sync),
+          no_slowdown(write_options.no_slowdown),
+          disable_wal(write_options.disableWAL),
+          rate_limiter_priority(write_options.rate_limiter_priority),
+          disable_memtable(_disable_memtable),
+          batch_cnt(_batch_cnt),
+          protection_bytes_per_key(_batch->GetProtectionBytesPerKey()),
+          pre_release_callback(_pre_release_callback),
+          post_memtable_callback(_post_memtable_callback),
+          log_used(0),
+          log_ref(_log_ref),
+          callback(_callback),
+          made_waitable(false),
+          state(STATE_INIT),
+          write_group(nullptr),
+          sequence(kMaxSequenceNumber),
+          link_older(nullptr),
+          link_newer(nullptr) {}
+
+    ~Writer() {
+      if (made_waitable) {
+        StateMutex().~mutex();
+        StateCV().~condition_variable();
+      }
+      status.PermitUncheckedError();
+      callback_status.PermitUncheckedError();
+    }
+
+    bool CheckCallback(DB* db) {
+      if (callback != nullptr) {
+        callback_status = callback->Callback(db);
+      }
+      return callback_status.ok();
+    }
+
+    void CreateMutex() {
+      if (!made_waitable) {
+        // Note that made_waitable is tracked separately from state
+        // transitions, because we can't atomically create the mutex and
+        // link into the list.
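+        // [Editorial note, not part of the vendored source] The idiom used
+        // below, shown standalone:
+        //
+        //   std::aligned_storage<sizeof(std::mutex)>::type buf;
+        //   std::mutex* mu = new (&buf) std::mutex;  // placement new, no heap
+        //   mu->~mutex();  // destroyed manually, cf. ~Writer() above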
+        made_waitable = true;
+        new (&state_mutex_bytes) std::mutex;
+        new (&state_cv_bytes) std::condition_variable;
+      }
+    }
+
+    // returns the aggregate status of this Writer
+    Status FinalStatus() {
+      if (!status.ok()) {
+        // a non-ok memtable write status takes precedence
+        assert(callback == nullptr || callback_status.ok());
+        return status;
+      } else if (!callback_status.ok()) {
+        // if the callback failed then that is the status we want
+        // because a memtable insert should not have been attempted
+        assert(callback != nullptr);
+        assert(status.ok());
+        return callback_status;
+      } else {
+        // if there is no callback then we only care about
+        // the memtable insert status
+        assert(callback == nullptr || callback_status.ok());
+        return status;
+      }
+    }
+
+    bool CallbackFailed() {
+      return (callback != nullptr) && !callback_status.ok();
+    }
+
+    bool ShouldWriteToMemtable() {
+      return status.ok() && !CallbackFailed() && !disable_memtable;
+    }
+
+    bool ShouldWriteToWAL() {
+      return status.ok() && !CallbackFailed() && !disable_wal;
+    }
+
+    // No other mutexes may be acquired while holding StateMutex(); it is
+    // always last in the lock order
+    std::mutex& StateMutex() {
+      assert(made_waitable);
+      return *static_cast<std::mutex*>(static_cast<void*>(&state_mutex_bytes));
+    }
+
+    std::condition_variable& StateCV() {
+      assert(made_waitable);
+      return *static_cast<std::condition_variable*>(
+          static_cast<void*>(&state_cv_bytes));
+    }
+  };
+
+  struct AdaptationContext {
+    const char* name;
+    std::atomic<int32_t> value;
+
+    explicit AdaptationContext(const char* name0) : name(name0), value(0) {}
+  };
+
+  explicit WriteThread(const ImmutableDBOptions& db_options);
+
+  virtual ~WriteThread() = default;
+
+  // IMPORTANT: None of the methods in this class rely on the db mutex
+  // for correctness. All of the methods except JoinBatchGroup and
+  // EnterUnbatched may be called either with or without the db mutex held.
+  // Correctness is maintained by ensuring that only a single thread is
+  // a leader at a time.
+
+  // Registers w as ready to become part of a batch group, waits until the
+  // caller should perform some work, and returns the current state of the
+  // writer. If w has become the leader of a write batch group, returns
+  // STATE_GROUP_LEADER. If w has been made part of a sequential batch
+  // group and the leader has performed the write, returns STATE_COMPLETED.
+  // If w has been made part of a parallel batch group and is responsible
+  // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER.
+  //
+  // The db mutex SHOULD NOT be held when calling this function, because
+  // it will block.
+  //
+  // Writer* w: Writer to be executed as part of a batch group
+  void JoinBatchGroup(Writer* w);
+
+  // Constructs a write batch group led by leader, which should be a
+  // Writer passed to JoinBatchGroup on the current thread.
+  //
+  // Writer* leader: Writer that is STATE_GROUP_LEADER
+  // WriteGroup* write_group: Out-param of group members
+  // returns: Total batch group byte size
+  size_t EnterAsBatchGroupLeader(Writer* leader, WriteGroup* write_group);
+
+  // Unlinks the Writer-s in a batch group, wakes up the non-leaders,
+  // and wakes up the next leader (if any).
+  //
+  // WriteGroup* write_group: the write group
+  // Status status: Status of write operation
+  void ExitAsBatchGroupLeader(WriteGroup& write_group, Status& status);
+
+  // Exit batch group on behalf of batch group leader.
+  void ExitAsBatchGroupFollower(Writer* w);
+
+  // Constructs a write batch group led by leader from the
+  // newest_memtable_writers_ list. The leader should either write the
+  // memtable for the whole group and call ExitAsMemTableWriter, or launch
+  // a parallel memtable write through LaunchParallelMemTableWriters.
+  void EnterAsMemTableWriter(Writer* leader, WriteGroup* write_group);
+
+  // The memtable writer group leader, or the last finished writer in a
+  // parallel write group, exits from the newest_memtable_writers_ list and
+  // wakes up the next leader if needed.
+  void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group);
+
+  // Causes JoinBatchGroup to return STATE_PARALLEL_MEMTABLE_WRITER for all of
+  // the non-leader members of this write batch group. Sets Writer::sequence
+  // before waking them up.
+  //
+  // WriteGroup* write_group: Extra state used to coordinate the parallel add
+  void LaunchParallelMemTableWriters(WriteGroup* write_group);
+
+  // Reports the completion of w's batch to the parallel group leader, and
+  // waits for the rest of the parallel batch to complete. Returns true
+  // if this thread is the last to complete, and hence should advance
+  // the sequence number and then call EarlyExitParallelGroup, false if
+  // someone else has already taken responsibility for that.
+  bool CompleteParallelMemTableWriter(Writer* w);
+
+  // Waits for all preceding writers (unlocking mu while waiting), then
+  // registers w as the currently proceeding writer.
+  //
+  // Writer* w: A Writer not eligible for batching
+  // InstrumentedMutex* mu: The db mutex, to unlock while waiting
+  // REQUIRES: db mutex held
+  void EnterUnbatched(Writer* w, InstrumentedMutex* mu);
+
+  // Completes a Writer begun with EnterUnbatched, unblocking subsequent
+  // writers.
+  void ExitUnbatched(Writer* w);
+
+  // Wait for all parallel memtable writers to finish, in case pipelined
+  // write is enabled.
+  void WaitForMemTableWriters();
+
+  SequenceNumber UpdateLastSequence(SequenceNumber sequence) {
+    if (sequence > last_sequence_) {
+      last_sequence_ = sequence;
+    }
+    return last_sequence_;
+  }
+
+  // Insert a dummy writer at the tail of the write queue to indicate a write
+  // stall, and fail any writers in the queue with no_slowdown set to true
+  // REQUIRES: db mutex held, no other stall on this queue outstanding
+  void BeginWriteStall();
+
+  // Remove the dummy writer and wake up waiting writers
+  // REQUIRES: db mutex held
+  void EndWriteStall();
+
+  // Number of BeginWriteStall(), or 0 if there is no active stall in the
+  // write queue.
+  // REQUIRES: db mutex held
+  uint64_t GetBegunCountOfOutstandingStall();
+
+  // Wait for number of completed EndWriteStall() to reach >= `stall_count`,
+  // which will generally have come from GetBegunCountOfOutstandingStall().
+  // (Does not require db mutex held)
+  void WaitForStallEndedCount(uint64_t stall_count);
+
+ private:
+  // See AwaitState.
+  const uint64_t max_yield_usec_;
+  const uint64_t slow_yield_usec_;
+
+  // Allow multiple writers to write to the memtable concurrently.
+  const bool allow_concurrent_memtable_write_;
+
+  // Enable pipelined write to WAL and memtable.
+  const bool enable_pipelined_write_;
+
+  // The upper limit on the number of bytes that are written in a single batch
+  // of WAL or memtable write. It is enforced when the leader's write size
+  // is larger than 1/8 of this limit.
+  const uint64_t max_write_batch_group_size_bytes;
+
+  // Points to the newest pending writer. Only the leader can remove
+  // elements; adding can be done lock-free by anybody.
+  std::atomic<Writer*> newest_writer_;
+
+  // Points to the newest pending memtable writer. Used only when pipelined
+  // write is enabled.
+  std::atomic<Writer*> newest_memtable_writer_;
+
+  // The last sequence that has been consumed by a writer. The sequence
+  // is not necessarily visible to reads because the writer can be ongoing.
+  SequenceNumber last_sequence_;
+
+  // A dummy writer to indicate a write stall condition. This will be inserted
+  // at the tail of the writer queue by the leader, so newer writers can just
+  // check for this and bail
+  Writer write_stall_dummy_;
+
+  // Mutex and condvar for writers to block on a write stall. During a write
+  // stall, writers with no_slowdown set to false will wait on this rather
+  // than on the writer queue
+  port::Mutex stall_mu_;
+  port::CondVar stall_cv_;
+
+  // Count the number of stalls begun, so that we can check whether
+  // a particular stall has cleared (even if caught in another stall).
+  // Controlled by DB mutex.
+  // Because of the contract on BeginWriteStall() / EndWriteStall(),
+  // stall_ended_count_ <= stall_begun_count_ <= stall_ended_count_ + 1.
+  uint64_t stall_begun_count_ = 0;
+  // Count the number of stalls ended, so that we can check whether
+  // a particular stall has cleared (even if caught in another stall).
+  // Writes controlled by DB mutex + stall_mu_, signalled by stall_cv_.
+  // Read with stall_mu or DB mutex.
+  uint64_t stall_ended_count_ = 0;
+
+  // Waits for w->state & goal_mask using w->StateMutex(). Returns
+  // the state that satisfies goal_mask.
+  uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask);
+
+  // Blocks until w->state & goal_mask, returning the state value
+  // that satisfied the predicate. Uses ctx to adaptively use
+  // std::this_thread::yield() to avoid mutex overheads. ctx should be
+  // a context-dependent static.
+  uint8_t AwaitState(Writer* w, uint8_t goal_mask, AdaptationContext* ctx);
+
+  // Set writer state and wake the writer up if it is waiting.
+  void SetState(Writer* w, uint8_t new_state);
+
+  // Links w into the newest_writer list. Returns true if w was linked directly
+  // into the leader position. Safe to call from multiple threads without
+  // external locking.
+  bool LinkOne(Writer* w, std::atomic<Writer*>* newest_writer);
+
+  // Links a write group into the newest_writer list as a whole, while keeping
+  // the order of the writers unchanged. Returns true if the group was linked
+  // directly into the leader position.
+  bool LinkGroup(WriteGroup& write_group, std::atomic<Writer*>* newest_writer);
+
+  // Computes any missing link_newer links. Should not be called
+  // concurrently with itself.
+  void CreateMissingNewerLinks(Writer* head);
+
+  // Set the leader in write_group to completed state and remove it from the
+  // write group.
+  void CompleteLeader(WriteGroup& write_group);
+
+  // Set a follower in write_group to completed state and remove it from the
+  // write group.
+  void CompleteFollower(Writer* w, WriteGroup& write_group);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_common.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_common.h
new file mode 100644
index 0000000000..8756932a7e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_common.h
@@ -0,0 +1,674 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// The test uses an array to compare against values written to the database. +// Keys written to the array are in 1:1 correspondence to the actual values in +// the database according to the formula in the function GenerateValue. + +// Space is reserved in the array from 0 to FLAGS_max_key and values are +// randomly written/deleted/read from those positions. During verification we +// compare all the positions in the array. To shorten/elongate the running +// time, you could change the settings: FLAGS_max_key, FLAGS_ops_per_thread, +// (sometimes also FLAGS_threads). +// +// NOTE that if FLAGS_test_batches_snapshots is set, the test will have +// different behavior. See comment of the flag for details. + +#ifdef GFLAGS +#pragma once +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/version_set.h" +#include "db_stress_tool/db_stress_env_wrapper.h" +#include "db_stress_tool/db_stress_listener.h" +#include "db_stress_tool/db_stress_shared_state.h" +#include "db_stress_tool/db_stress_test_base.h" +#include "logging/logging.h" +#include "monitoring/histogram.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" +#include "rocksdb/utilities/backup_engine.h" +#include "rocksdb/utilities/checkpoint.h" +#include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/debug.h" +#include "rocksdb/utilities/options_util.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/write_batch.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/gflags_compat.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/string_util.h" +#include "utilities/blob_db/blob_db.h" +#include "utilities/fault_injection_fs.h" +#include "utilities/merge_operators.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::RegisterFlagValidator; +using GFLAGS_NAMESPACE::SetUsageMessage; + +DECLARE_uint64(seed); +DECLARE_bool(read_only); +DECLARE_int64(max_key); +DECLARE_double(hot_key_alpha); +DECLARE_int32(max_key_len); +DECLARE_string(key_len_percent_dist); +DECLARE_int32(key_window_scale_factor); +DECLARE_int32(column_families); +DECLARE_string(options_file); +DECLARE_int64(active_width); +DECLARE_bool(test_batches_snapshots); +DECLARE_bool(atomic_flush); +DECLARE_int32(manual_wal_flush_one_in); +DECLARE_int32(lock_wal_one_in); +DECLARE_bool(test_cf_consistency); +DECLARE_bool(test_multi_ops_txns); +DECLARE_int32(threads); +DECLARE_int32(ttl); +DECLARE_int32(value_size_mult); +DECLARE_int32(compaction_readahead_size); +DECLARE_bool(enable_pipelined_write); +DECLARE_bool(verify_before_write); +DECLARE_bool(histogram); +DECLARE_bool(destroy_db_initially); +DECLARE_bool(verbose); +DECLARE_bool(progress_reports); +DECLARE_uint64(db_write_buffer_size); +DECLARE_int32(write_buffer_size); +DECLARE_int32(max_write_buffer_number); +DECLARE_int32(min_write_buffer_number_to_merge); +DECLARE_int32(max_write_buffer_number_to_maintain); 
+DECLARE_int64(max_write_buffer_size_to_maintain); +DECLARE_double(memtable_prefix_bloom_size_ratio); +DECLARE_bool(memtable_whole_key_filtering); +DECLARE_int32(open_files); +DECLARE_int64(compressed_cache_size); +DECLARE_int32(compressed_cache_numshardbits); +DECLARE_int32(compaction_style); +DECLARE_int32(compaction_pri); +DECLARE_int32(num_levels); +DECLARE_int32(level0_file_num_compaction_trigger); +DECLARE_int32(level0_slowdown_writes_trigger); +DECLARE_int32(level0_stop_writes_trigger); +DECLARE_int32(block_size); +DECLARE_int32(format_version); +DECLARE_int32(index_block_restart_interval); +DECLARE_bool(disable_auto_compactions); +DECLARE_int32(max_background_compactions); +DECLARE_int32(num_bottom_pri_threads); +DECLARE_int32(compaction_thread_pool_adjust_interval); +DECLARE_int32(compaction_thread_pool_variations); +DECLARE_int32(max_background_flushes); +DECLARE_int32(universal_size_ratio); +DECLARE_int32(universal_min_merge_width); +DECLARE_int32(universal_max_merge_width); +DECLARE_int32(universal_max_size_amplification_percent); +DECLARE_int32(clear_column_family_one_in); +DECLARE_int32(get_live_files_one_in); +DECLARE_int32(get_sorted_wal_files_one_in); +DECLARE_int32(get_current_wal_file_one_in); +DECLARE_int32(set_options_one_in); +DECLARE_int32(set_in_place_one_in); +DECLARE_int64(cache_size); +DECLARE_int32(cache_numshardbits); +DECLARE_bool(cache_index_and_filter_blocks); +DECLARE_bool(charge_compression_dictionary_building_buffer); +DECLARE_bool(charge_filter_construction); +DECLARE_bool(charge_table_reader); +DECLARE_bool(charge_file_metadata); +DECLARE_bool(charge_blob_cache); +DECLARE_int32(top_level_index_pinning); +DECLARE_int32(partition_pinning); +DECLARE_int32(unpartitioned_pinning); +DECLARE_string(cache_type); +DECLARE_uint64(subcompactions); +DECLARE_uint64(periodic_compaction_seconds); +DECLARE_uint64(compaction_ttl); +DECLARE_bool(fifo_allow_compaction); +DECLARE_bool(allow_concurrent_memtable_write); +DECLARE_double(experimental_mempurge_threshold); +DECLARE_bool(enable_write_thread_adaptive_yield); +DECLARE_int32(reopen); +DECLARE_double(bloom_bits); +DECLARE_int32(ribbon_starting_level); +DECLARE_bool(partition_filters); +DECLARE_bool(optimize_filters_for_memory); +DECLARE_bool(detect_filter_construct_corruption); +DECLARE_int32(index_type); +DECLARE_int32(data_block_index_type); +DECLARE_string(db); +DECLARE_string(secondaries_base); +DECLARE_bool(test_secondary); +DECLARE_string(expected_values_dir); +DECLARE_bool(verify_checksum); +DECLARE_bool(mmap_read); +DECLARE_bool(mmap_write); +DECLARE_bool(use_direct_reads); +DECLARE_bool(use_direct_io_for_flush_and_compaction); +DECLARE_bool(mock_direct_io); +DECLARE_bool(statistics); +DECLARE_bool(sync); +DECLARE_bool(use_fsync); +DECLARE_uint64(stats_dump_period_sec); +DECLARE_uint64(bytes_per_sync); +DECLARE_uint64(wal_bytes_per_sync); +DECLARE_int32(kill_random_test); +DECLARE_string(kill_exclude_prefixes); +DECLARE_bool(disable_wal); +DECLARE_uint64(recycle_log_file_num); +DECLARE_int64(target_file_size_base); +DECLARE_int32(target_file_size_multiplier); +DECLARE_uint64(max_bytes_for_level_base); +DECLARE_double(max_bytes_for_level_multiplier); +DECLARE_int32(range_deletion_width); +DECLARE_uint64(rate_limiter_bytes_per_sec); +DECLARE_bool(rate_limit_bg_reads); +DECLARE_bool(rate_limit_user_ops); +DECLARE_bool(rate_limit_auto_wal_flush); +DECLARE_uint64(sst_file_manager_bytes_per_sec); +DECLARE_uint64(sst_file_manager_bytes_per_truncate); +DECLARE_bool(use_txn); +DECLARE_uint64(txn_write_policy); 
+DECLARE_bool(unordered_write); +DECLARE_int32(backup_one_in); +DECLARE_uint64(backup_max_size); +DECLARE_int32(checkpoint_one_in); +DECLARE_int32(ingest_external_file_one_in); +DECLARE_int32(ingest_external_file_width); +DECLARE_int32(compact_files_one_in); +DECLARE_int32(compact_range_one_in); +DECLARE_int32(mark_for_compaction_one_file_in); +DECLARE_int32(flush_one_in); +DECLARE_int32(pause_background_one_in); +DECLARE_int32(compact_range_width); +DECLARE_int32(acquire_snapshot_one_in); +DECLARE_bool(compare_full_db_state_snapshot); +DECLARE_uint64(snapshot_hold_ops); +DECLARE_bool(long_running_snapshots); +DECLARE_bool(use_multiget); +DECLARE_bool(use_get_entity); +DECLARE_bool(use_multi_get_entity); +DECLARE_int32(readpercent); +DECLARE_int32(prefixpercent); +DECLARE_int32(writepercent); +DECLARE_int32(delpercent); +DECLARE_int32(delrangepercent); +DECLARE_int32(nooverwritepercent); +DECLARE_int32(iterpercent); +DECLARE_uint64(num_iterations); +DECLARE_int32(customopspercent); +DECLARE_string(compression_type); +DECLARE_string(bottommost_compression_type); +DECLARE_int32(compression_max_dict_bytes); +DECLARE_int32(compression_zstd_max_train_bytes); +DECLARE_int32(compression_parallel_threads); +DECLARE_uint64(compression_max_dict_buffer_bytes); +DECLARE_bool(compression_use_zstd_dict_trainer); +DECLARE_string(checksum_type); +DECLARE_string(env_uri); +DECLARE_string(fs_uri); +DECLARE_uint64(ops_per_thread); +DECLARE_uint64(log2_keys_per_lock); +DECLARE_uint64(max_manifest_file_size); +DECLARE_bool(in_place_update); +DECLARE_string(memtablerep); +DECLARE_int32(prefix_size); +DECLARE_bool(use_merge); +DECLARE_uint32(use_put_entity_one_in); +DECLARE_bool(use_full_merge_v1); +DECLARE_int32(sync_wal_one_in); +DECLARE_bool(avoid_unnecessary_blocking_io); +DECLARE_bool(write_dbid_to_manifest); +DECLARE_bool(avoid_flush_during_recovery); +DECLARE_uint64(max_write_batch_group_size_bytes); +DECLARE_bool(level_compaction_dynamic_level_bytes); +DECLARE_int32(verify_checksum_one_in); +DECLARE_int32(verify_db_one_in); +DECLARE_int32(continuous_verification_interval); +DECLARE_int32(get_property_one_in); +DECLARE_string(file_checksum_impl); + +// Options for StackableDB-based BlobDB +DECLARE_bool(use_blob_db); +DECLARE_uint64(blob_db_min_blob_size); +DECLARE_uint64(blob_db_bytes_per_sync); +DECLARE_uint64(blob_db_file_size); +DECLARE_bool(blob_db_enable_gc); +DECLARE_double(blob_db_gc_cutoff); + +// Options for integrated BlobDB +DECLARE_bool(allow_setting_blob_options_dynamically); +DECLARE_bool(enable_blob_files); +DECLARE_uint64(min_blob_size); +DECLARE_uint64(blob_file_size); +DECLARE_string(blob_compression_type); +DECLARE_bool(enable_blob_garbage_collection); +DECLARE_double(blob_garbage_collection_age_cutoff); +DECLARE_double(blob_garbage_collection_force_threshold); +DECLARE_uint64(blob_compaction_readahead_size); +DECLARE_int32(blob_file_starting_level); +DECLARE_bool(use_blob_cache); +DECLARE_bool(use_shared_block_and_blob_cache); +DECLARE_uint64(blob_cache_size); +DECLARE_int32(blob_cache_numshardbits); +DECLARE_int32(prepopulate_blob_cache); + +DECLARE_int32(approximate_size_one_in); +DECLARE_bool(sync_fault_injection); + +DECLARE_bool(best_efforts_recovery); +DECLARE_bool(skip_verifydb); +DECLARE_bool(enable_compaction_filter); +DECLARE_bool(paranoid_file_checks); +DECLARE_bool(fail_if_options_file_error); +DECLARE_uint64(batch_protection_bytes_per_key); +DECLARE_uint32(memtable_protection_bytes_per_key); +DECLARE_uint32(block_protection_bytes_per_key); + 
+DECLARE_uint64(user_timestamp_size);
+DECLARE_string(secondary_cache_uri);
+DECLARE_int32(secondary_cache_fault_one_in);
+
+DECLARE_int32(prepopulate_block_cache);
+
+DECLARE_bool(two_write_queues);
+DECLARE_bool(use_only_the_last_commit_time_batch_for_recovery);
+DECLARE_uint64(wp_snapshot_cache_bits);
+DECLARE_uint64(wp_commit_cache_bits);
+
+DECLARE_bool(adaptive_readahead);
+DECLARE_bool(async_io);
+DECLARE_string(wal_compression);
+DECLARE_bool(verify_sst_unique_id_in_manifest);
+
+DECLARE_int32(create_timestamped_snapshot_one_in);
+
+DECLARE_bool(allow_data_in_errors);
+
+DECLARE_bool(enable_thread_tracking);
+
+// Tiered storage
+DECLARE_bool(enable_tiered_storage);  // set last_level_temperature
+DECLARE_int64(preclude_last_level_data_seconds);
+DECLARE_int64(preserve_internal_time_seconds);
+
+DECLARE_int32(verify_iterator_with_expected_state_one_in);
+DECLARE_bool(preserve_unverified_changes);
+
+DECLARE_uint64(readahead_size);
+DECLARE_uint64(initial_auto_readahead_size);
+DECLARE_uint64(max_auto_readahead_size);
+DECLARE_uint64(num_file_reads_for_auto_readahead);
+DECLARE_bool(use_io_uring);
+
+constexpr long KB = 1024;
+constexpr int kRandomValueMaxFactor = 3;
+constexpr int kValueMaxLen = 100;
+
+// wrapped posix environment
+extern ROCKSDB_NAMESPACE::Env* db_stress_env;
+extern ROCKSDB_NAMESPACE::Env* db_stress_listener_env;
+extern std::shared_ptr<ROCKSDB_NAMESPACE::FaultInjectionTestFS> fault_fs_guard;
+
+extern enum ROCKSDB_NAMESPACE::CompressionType compression_type_e;
+extern enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e;
+extern enum ROCKSDB_NAMESPACE::ChecksumType checksum_type_e;
+
+enum RepFactory { kSkipList, kHashSkipList, kVectorRep };
+
+inline enum RepFactory StringToRepFactory(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "skip_list"))
+    return kSkipList;
+  else if (!strcasecmp(ctype, "prefix_hash"))
+    return kHashSkipList;
+  else if (!strcasecmp(ctype, "vector"))
+    return kVectorRep;
+
+  fprintf(stdout, "Cannot parse memtablerep %s\n", ctype);
+  return kSkipList;
+}
+
+extern enum RepFactory FLAGS_rep_factory;
+
+namespace ROCKSDB_NAMESPACE {
+inline enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
+    const char* ctype) {
+  assert(ctype);
+
+  ROCKSDB_NAMESPACE::CompressionType ret_compression_type;
+
+  if (!strcasecmp(ctype, "disable")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kDisableCompressionOption;
+  } else if (!strcasecmp(ctype, "none")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kNoCompression;
+  } else if (!strcasecmp(ctype, "snappy")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kSnappyCompression;
+  } else if (!strcasecmp(ctype, "zlib")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kZlibCompression;
+  } else if (!strcasecmp(ctype, "bzip2")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kBZip2Compression;
+  } else if (!strcasecmp(ctype, "lz4")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kLZ4Compression;
+  } else if (!strcasecmp(ctype, "lz4hc")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kLZ4HCCompression;
+  } else if (!strcasecmp(ctype, "xpress")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kXpressCompression;
+  } else if (!strcasecmp(ctype, "zstd")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kZSTD;
+  } else {
+    fprintf(stderr, "Cannot parse compression type '%s'\n", ctype);
+    ret_compression_type =
+        ROCKSDB_NAMESPACE::kSnappyCompression;  // default value
+  }
+  if (ret_compression_type != ROCKSDB_NAMESPACE::kDisableCompressionOption &&
+      !CompressionTypeSupported(ret_compression_type)) {
+    // Using no compression would be more portable, but considering this is
+    // only a stress test and snappy is widely available, use snappy here.
+    ret_compression_type = ROCKSDB_NAMESPACE::kSnappyCompression;
+  }
+  return ret_compression_type;
+}
+
+inline enum ROCKSDB_NAMESPACE::ChecksumType StringToChecksumType(
+    const char* ctype) {
+  assert(ctype);
+  auto iter = ROCKSDB_NAMESPACE::checksum_type_string_map.find(ctype);
+  if (iter != ROCKSDB_NAMESPACE::checksum_type_string_map.end()) {
+    return iter->second;
+  }
+  fprintf(stderr, "Cannot parse checksum type '%s'\n", ctype);
+  return ROCKSDB_NAMESPACE::kCRC32c;
+}
+
+inline std::string ChecksumTypeToString(ROCKSDB_NAMESPACE::ChecksumType ctype) {
+  auto iter = std::find_if(
+      ROCKSDB_NAMESPACE::checksum_type_string_map.begin(),
+      ROCKSDB_NAMESPACE::checksum_type_string_map.end(),
+      [&](const std::pair<std::string, ROCKSDB_NAMESPACE::ChecksumType>&
+              name_and_enum_val) { return name_and_enum_val.second == ctype; });
+  assert(iter != ROCKSDB_NAMESPACE::checksum_type_string_map.end());
+  return iter->first;
+}
+
+inline std::vector<std::string> SplitString(std::string src) {
+  std::vector<std::string> ret;
+  if (src.empty()) {
+    return ret;
+  }
+  size_t pos = 0;
+  size_t pos_comma;
+  while ((pos_comma = src.find(',', pos)) != std::string::npos) {
+    ret.push_back(src.substr(pos, pos_comma - pos));
+    pos = pos_comma + 1;
+  }
+  ret.push_back(src.substr(pos, src.length()));
+  return ret;
+}
+
+#ifdef _MSC_VER
+#pragma warning(push)
+// truncation of constant value on static_cast
+#pragma warning(disable : 4309)
+#endif
+inline bool GetNextPrefix(const ROCKSDB_NAMESPACE::Slice& src, std::string* v) {
+  std::string ret = src.ToString();
+  for (int i = static_cast<int>(ret.size()) - 1; i >= 0; i--) {
+    if (ret[i] != static_cast<char>(255)) {
+      ret[i] = ret[i] + 1;
+      break;
+    } else if (i != 0) {
+      ret[i] = 0;
+    } else {
+      // all FF. No next prefix
+      return false;
+    }
+  }
+  *v = ret;
+  return true;
+}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// Append `val` to `*key` in fixed-width big-endian format
+extern inline void AppendIntToString(uint64_t val, std::string* key) {
+  // PutFixed64 uses little endian
+  PutFixed64(key, val);
+  // Reverse to get big endian
+  char* int_data = &((*key)[key->size() - sizeof(uint64_t)]);
+  for (size_t i = 0; i < sizeof(uint64_t) / 2; ++i) {
+    std::swap(int_data[i], int_data[sizeof(uint64_t) - 1 - i]);
+  }
+}
+
+// A struct for maintaining the parameters for generating variable length keys
+struct KeyGenContext {
+  // Number of adjacent keys in one cycle of key lengths
+  uint64_t window;
+  // Number of keys of each possible length in a given window
+  std::vector<uint64_t> weights;
+};
+extern KeyGenContext key_gen_ctx;
+
+// Generate a variable length key string from the given int64 val. The
+// order of the keys is preserved. The key could be anywhere from 8 to
+// max_key_len * 8 bytes.
+// The algorithm picks the length based on the
+// offset of the val within a configured window and the distribution of the
+// number of keys of various lengths in that window. For example, if x, y, z
+// are the weights assigned to each possible key length, the keys generated
+// would be
+// - {0}...{x-1}
+// {(x-1),0}..{(x-1),(y-1)},{(x-1),(y-1),0}..{(x-1),(y-1),(z-1)} and so on.
+// Additionally, a trailer of 0-7 bytes could be appended.
+extern inline std::string Key(int64_t val) {
+  uint64_t window = key_gen_ctx.window;
+  size_t levels = key_gen_ctx.weights.size();
+  std::string key;
+  // Over-reserve and for now do not bother `shrink_to_fit()` since the key
+  // strings are transient.
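+  // [Editorial note, not part of the vendored source] Worked example of the
+  // scheme described above: with window = 5 and weights = {2, 3}, consecutive
+  // values of val map to the prefix sequences
+  //   0 -> {0}, 1 -> {1}, 2 -> {1,0}, 3 -> {1,1}, 4 -> {1,2},
+  //   5 -> {2}, 6 -> {3}, 7 -> {3,0}, ...
+  // i.e. each window yields 2 one-level keys followed by 3 two-level keys,
+  // and every prefix is emitted as 8 big-endian bytes, so ordering holds.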
+  key.reserve(FLAGS_max_key_len * 8);
+
+  uint64_t window_idx = static_cast<uint64_t>(val) / window;
+  uint64_t offset = static_cast<uint64_t>(val) % window;
+  for (size_t level = 0; level < levels; ++level) {
+    uint64_t weight = key_gen_ctx.weights[level];
+    uint64_t pfx;
+    if (level == 0) {
+      pfx = window_idx * weight;
+    } else {
+      pfx = 0;
+    }
+    pfx += offset >= weight ? weight - 1 : offset;
+    AppendIntToString(pfx, &key);
+    if (offset < weight) {
+      // Use the bottom 3 bits of offset as the number of trailing 'x's in the
+      // key. If the next key is going to be of the next level, then skip the
+      // trailer as it would break ordering. If the key length is already at
+      // max, skip the trailer.
+      if (offset < weight - 1 && level < levels - 1) {
+        size_t trailer_len = offset & 0x7;
+        key.append(trailer_len, 'x');
+      }
+      break;
+    }
+    offset -= weight;
+  }
+
+  return key;
+}
+
+// Given a string key, map it to an index into the expected values buffer
+extern inline bool GetIntVal(std::string big_endian_key, uint64_t* key_p) {
+  size_t size_key = big_endian_key.size();
+  std::vector<uint64_t> prefixes;
+
+  assert(size_key <= key_gen_ctx.weights.size() * sizeof(uint64_t));
+
+  std::string little_endian_key;
+  little_endian_key.resize(size_key);
+  for (size_t start = 0; start + sizeof(uint64_t) <= size_key;
+       start += sizeof(uint64_t)) {
+    size_t end = start + sizeof(uint64_t);
+    for (size_t i = 0; i < sizeof(uint64_t); ++i) {
+      little_endian_key[start + i] = big_endian_key[end - 1 - i];
+    }
+    Slice little_endian_slice =
+        Slice(&little_endian_key[start], sizeof(uint64_t));
+    uint64_t pfx;
+    if (!GetFixed64(&little_endian_slice, &pfx)) {
+      return false;
+    }
+    prefixes.emplace_back(pfx);
+  }
+
+  uint64_t key = 0;
+  for (size_t i = 0; i < prefixes.size(); ++i) {
+    uint64_t pfx = prefixes[i];
+    key += (pfx / key_gen_ctx.weights[i]) * key_gen_ctx.window +
+           pfx % key_gen_ctx.weights[i];
+    if (i < prefixes.size() - 1) {
+      // The encoding writes a `key_gen_ctx.weights[i] - 1` that counts for
+      // `key_gen_ctx.weights[i]` when there are more prefixes to come. So we
+      // need to add back the one here as we're at a non-last prefix.
+      ++key;
+    }
+  }
+  *key_p = key;
+  return true;
+}
+
+// Given a string prefix, map it to the first corresponding index in the
+// expected values buffer.
+inline bool GetFirstIntValInPrefix(std::string big_endian_prefix,
+                                   uint64_t* key_p) {
+  size_t size_key = big_endian_prefix.size();
+  // Pad with zeros to make it a multiple of 8. This function may be called
+  // with a prefix, in which case we return the first index that falls
+  // inside or outside that prefix, depending on whether the prefix is
+  // the start or the upper bound of a scan
+  unsigned int pad = sizeof(uint64_t) - (size_key % sizeof(uint64_t));
+  if (pad < sizeof(uint64_t)) {
+    big_endian_prefix.append(pad, '\0');
+  }
+  return GetIntVal(std::move(big_endian_prefix), key_p);
+}
+
+extern inline uint64_t GetPrefixKeyCount(const std::string& prefix,
+                                         const std::string& ub) {
+  uint64_t start = 0;
+  uint64_t end = 0;
+
+  if (!GetFirstIntValInPrefix(prefix, &start) ||
+      !GetFirstIntValInPrefix(ub, &end)) {
+    return 0;
+  }
+
+  return end - start;
+}
+
+extern inline std::string StringToHex(const std::string& str) {
+  std::string result = "0x";
+  result.append(Slice(str).ToString(true));
+  return result;
+}
+
+inline std::string WideColumnsToHex(const WideColumns& columns) {
+  if (columns.empty()) {
+    return std::string();
+  }
+
+  std::ostringstream oss;
+
+  oss << std::hex;
+
+  auto it = columns.begin();
+  oss << *it;
+  for (++it; it != columns.end(); ++it) {
+    oss << ' ' << *it;
+  }
+
+  return oss.str();
+}
+
+// Unified output format for double parameters
+extern inline std::string FormatDoubleParam(double param) {
+  return std::to_string(param);
+}
+
+// Make sure that double parameter is a value we can reproduce by
+// re-inputting the value printed.
+extern inline void SanitizeDoubleParam(double* param) {
+  *param = std::atof(FormatDoubleParam(*param).c_str());
+}
+
+extern void PoolSizeChangeThread(void* v);
+
+extern void DbVerificationThread(void* v);
+
+extern void TimestampedSnapshotsThread(void* v);
+
+extern void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz);
+
+extern int64_t GenerateOneKey(ThreadState* thread, uint64_t iteration);
+
+extern std::vector<int64_t> GenerateNKeys(ThreadState* thread, int num_keys,
+                                          uint64_t iteration);
+
+extern size_t GenerateValue(uint32_t rand, char* v, size_t max_sz);
+extern uint32_t GetValueBase(Slice s);
+
+extern WideColumns GenerateWideColumns(uint32_t value_base, const Slice& slice);
+extern WideColumns GenerateExpectedWideColumns(uint32_t value_base,
+                                               const Slice& slice);
+extern bool VerifyWideColumns(const Slice& value, const WideColumns& columns);
+extern bool VerifyWideColumns(const WideColumns& columns);
+
+extern StressTest* CreateCfConsistencyStressTest();
+extern StressTest* CreateBatchedOpsStressTest();
+extern StressTest* CreateNonBatchedOpsStressTest();
+extern StressTest* CreateMultiOpsTxnsStressTest();
+extern void CheckAndSetOptionsForMultiOpsTxnStressTest();
+extern void InitializeHotKeyGenerator(double alpha);
+extern int64_t GetOneHotKeyID(double rand_seed, int64_t max_key);
+
+extern std::string GetNowNanos();
+
+std::shared_ptr<FileChecksumGenFactory> GetFileChecksumImpl(
+    const std::string& name);
+
+Status DeleteFilesInDirectory(const std::string& dirname);
+Status SaveFilesInDirectory(const std::string& src_dirname,
+                            const std::string& dst_dirname);
+Status DestroyUnverifiedSubdir(const std::string& dirname);
+Status InitUnverifiedSubdir(const std::string& dirname);
+} // namespace ROCKSDB_NAMESPACE
+#endif // GFLAGS
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_compaction_filter.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_compaction_filter.h
new file mode 100644
index 0000000000..408bb48f3e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_compaction_filter.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db_stress_tool/db_stress_common.h"
+#include "db_stress_tool/db_stress_shared_state.h"
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// DbStressCompactionFilter is safe to use with db_stress as it does not perform
+// any mutation. It only makes `kRemove` decisions for keys that are already
+// non-existent according to the `SharedState`.
+class DbStressCompactionFilter : public CompactionFilter {
+ public:
+  DbStressCompactionFilter(SharedState* state, int cf_id)
+      : state_(state), cf_id_(cf_id) {}
+
+  Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/,
+                    const Slice& /*existing_value*/, std::string* /*new_value*/,
+                    std::string* /*skip_until*/) const override {
+    if (state_ == nullptr) {
+      return Decision::kKeep;
+    }
+    if (key.empty() || ('0' <= key[0] && key[0] <= '9')) {
+      // It is likely leftover from a test_batches_snapshots run. Below this
+      // conditional, the test_batches_snapshots key format is not handled
+      // properly. Just keep it to be safe.
+      return Decision::kKeep;
+    }
+    uint64_t key_num = 0;
+    {
+      Slice ukey_without_ts = key;
+      assert(ukey_without_ts.size() >= FLAGS_user_timestamp_size);
+      ukey_without_ts.remove_suffix(FLAGS_user_timestamp_size);
+      [[maybe_unused]] bool ok =
+          GetIntVal(ukey_without_ts.ToString(), &key_num);
+      assert(ok);
+    }
+    port::Mutex* key_mutex = state_->GetMutexForKey(cf_id_, key_num);
+    if (!key_mutex->TryLock()) {
+      return Decision::kKeep;
+    }
+    // Reaching here means we acquired the lock.
+
+    bool key_exists = state_->Exists(cf_id_, key_num);
+    const bool allow_overwrite = state_->AllowsOverwrite(key_num);
+
+    key_mutex->Unlock();
+
+    if (!key_exists) {
+      return allow_overwrite ? Decision::kRemove : Decision::kPurge;
+    }
+    return Decision::kKeep;
+  }
+
+  const char* Name() const override { return "DbStressCompactionFilter"; }
+
+ private:
+  SharedState* const state_;
+  const int cf_id_;
+};
+
+class DbStressCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  DbStressCompactionFilterFactory() : state_(nullptr) {}
+
+  void SetSharedState(SharedState* state) {
+    MutexLock state_mutex_guard(&state_mutex_);
+    state_ = state;
+  }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    MutexLock state_mutex_guard(&state_mutex_);
+    return std::unique_ptr<CompactionFilter>(
+        new DbStressCompactionFilter(state_, context.column_family_id));
+  }
+
+  const char* Name() const override {
+    return "DbStressCompactionFilterFactory";
+  }
+
+ private:
+  port::Mutex state_mutex_;
+  SharedState* state_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_driver.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_driver.h
new file mode 100644
index 0000000000..a173470ff7
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_driver.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db_stress_tool/db_stress_shared_state.h"
+#ifdef GFLAGS
+#pragma once
+#include "db_stress_tool/db_stress_test_base.h"
+namespace ROCKSDB_NAMESPACE {
+extern void ThreadBody(void* /*thread_state*/);
+extern bool RunStressTest(SharedState*);
+} // namespace ROCKSDB_NAMESPACE
+#endif // GFLAGS
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_env_wrapper.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_env_wrapper.h
new file mode 100644
index 0000000000..612d9fc6b9
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_env_wrapper.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef GFLAGS
+#pragma once
+#include "db_stress_tool/db_stress_common.h"
+#include "monitoring/thread_status_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DbStressRandomAccessFileWrapper : public FSRandomAccessFileOwnerWrapper {
+ public:
+  explicit DbStressRandomAccessFileWrapper(
+      std::unique_ptr<FSRandomAccessFile>&& target)
+      : FSRandomAccessFileOwnerWrapper(std::move(target)) {}
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+#ifndef NDEBUG
+    const ThreadStatus::OperationType thread_op =
+        ThreadStatusUtil::GetThreadOperation();
+    Env::IOActivity io_activity =
+        ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op);
+    assert(io_activity == Env::IOActivity::kUnknown ||
+           io_activity == options.io_activity);
+#endif
+    return target()->Read(offset, n, options, result, scratch, dbg);
+  }
+};
+
+class DbStressFSWrapper : public FileSystemWrapper {
+ public:
+  explicit DbStressFSWrapper(const std::shared_ptr<FileSystem>& t)
+      : FileSystemWrapper(t) {}
+  static const char* kClassName() { return "DbStressFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewRandomAccessFile(const std::string& f,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* r,
+                               IODebugContext* dbg) override {
+    std::unique_ptr<FSRandomAccessFile> file;
+    IOStatus s = target()->NewRandomAccessFile(f, file_opts, &file, dbg);
+    if (s.ok()) {
+      r->reset(new DbStressRandomAccessFileWrapper(std::move(file)));
+    }
+    return s;
+  }
+
+  IOStatus DeleteFile(const std::string& f, const IOOptions& opts,
+                      IODebugContext* dbg) override {
+    // We determine whether a file is a manifest by searching for the string
+    // "MANIFEST-" in its path, so a false positive is possible if the
+    // directory path happens to contain the keyword, but that is unlikely.
+    // Checkpoint, backup, and restore directories need to be exempted.
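+    // [Editorial note, not part of the vendored source] Illustrative
+    // outcomes of this heuristic (paths are hypothetical):
+    //   "/db/MANIFEST-000005"              -> renamed, history preserved
+    //   "/db/checkpoint_1/MANIFEST-000007" -> deleted (exempt directory)
+    //   "/db/000123.sst"                   -> deleted (no "MANIFEST-")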
+    if (!if_preserve_all_manifests ||
+        f.find("MANIFEST-") == std::string::npos ||
+        f.find("checkpoint") != std::string::npos ||
+        f.find(".backup") != std::string::npos ||
+        f.find(".restore") != std::string::npos) {
+      return target()->DeleteFile(f, opts, dbg);
+    }
+    // Rename the file instead of deleting it, to keep the history while
+    // making it invisible to RocksDB.
+    return target()->RenameFile(f, f + "_renamed_", opts, dbg);
+  }
+
+  // If true, manifest files are not deleted in DeleteFile().
+  bool if_preserve_all_manifests = true;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // GFLAGS
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_listener.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_listener.h
new file mode 100644
index 0000000000..97bbdaefa4
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_listener.h
@@ -0,0 +1,269 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifdef GFLAGS
+#pragma once
+
+#include <mutex>
+#include <unordered_set>
+
+#include "file/filename.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/unique_id.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+
+DECLARE_int32(compact_files_one_in);
+
+namespace ROCKSDB_NAMESPACE {
+
+// Verify across process executions that all seen IDs are unique
+class UniqueIdVerifier {
+ public:
+  explicit UniqueIdVerifier(const std::string& db_name, Env* env);
+  ~UniqueIdVerifier();
+
+  void Verify(const std::string& id);
+
+ private:
+  void VerifyNoWrite(const std::string& id);
+
+ private:
+  std::mutex mutex_;
+  // IDs persisted to a hidden file inside DB dir
+  std::string path_;
+  std::unique_ptr<WritableFileWriter> data_file_writer_;
+  // Starting byte for which 8 bytes to check in memory within 24 byte ID
+  size_t offset_;
+  // Working copy of the set of 8 byte pieces
+  std::unordered_set<uint64_t> id_set_;
+};
+
+class DbStressListener : public EventListener {
+ public:
+  DbStressListener(const std::string& db_name,
+                   const std::vector<DbPath>& db_paths,
+                   const std::vector<ColumnFamilyDescriptor>& column_families,
+                   Env* env)
+      : db_name_(db_name),
+        db_paths_(db_paths),
+        column_families_(column_families),
+        num_pending_file_creations_(0),
+        unique_ids_(db_name, env) {}
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "DBStressListener"; }
+
+  ~DbStressListener() override { assert(num_pending_file_creations_ == 0); }
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    assert(IsValidColumnFamilyName(info.cf_name));
+    VerifyFilePath(info.file_path);
+    // pretend we are doing some work here
+    RandomSleep();
+  }
+
+  void OnFlushBegin(DB* /*db*/,
+                    const FlushJobInfo& /*flush_job_info*/) override {
+    RandomSleep();
+  }
+
+  void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) override {
+    RandomSleep();
+  }
+
+  void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) override {
+    RandomSleep();
+  }
+
+  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+    assert(IsValidColumnFamilyName(ci.cf_name));
+    assert(ci.input_files.size() + ci.output_files.size() > 0U);
+    for (const auto& file_path : ci.input_files) {
+      VerifyFilePath(file_path);
+    }
+    for (const auto& file_path : ci.output_files) {
+      VerifyFilePath(file_path);
+    }
+    // pretend we are doing some work here
+    RandomSleep();
+  }
+
+  void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& /*info*/) override {
+    ++num_pending_file_creations_;
+  }
+
+  void OnTableFileCreated(const TableFileCreationInfo& info) override {
+    assert(info.db_name == db_name_);
+    assert(IsValidColumnFamilyName(info.cf_name));
+    assert(info.job_id > 0 || FLAGS_compact_files_one_in > 0);
+    if (info.status.ok()) {
+      assert(info.file_size > 0);
+      VerifyFilePath(info.file_path);
+      assert(info.table_properties.data_size > 0 ||
+             info.table_properties.num_range_deletions > 0);
+      assert(info.table_properties.raw_key_size > 0);
+      assert(info.table_properties.num_entries > 0);
+      VerifyTableFileUniqueId(info.table_properties, info.file_path);
+    }
+    --num_pending_file_creations_;
+  }
+
+  void OnMemTableSealed(const MemTableInfo& /*info*/) override {
+    RandomSleep();
+  }
+
+  void OnColumnFamilyHandleDeletionStarted(
+      ColumnFamilyHandle* /*handle*/) override {
+    RandomSleep();
+  }
+
+  void OnExternalFileIngested(DB* /*db*/,
+                              const ExternalFileIngestionInfo& info) override {
+    RandomSleep();
+    // Here we assume that each generated external file is ingested
+    // exactly once (or thrown away in case of crash)
+    VerifyTableFileUniqueId(info.table_properties, info.internal_file_path);
+  }
+
+  void OnBackgroundError(BackgroundErrorReason /* reason */,
+                         Status* /* bg_error */) override {
+    RandomSleep();
+  }
+
+  void OnStallConditionsChanged(const WriteStallInfo& /*info*/) override {
+    RandomSleep();
+  }
+
+  void OnFileReadFinish(const FileOperationInfo& info) override {
+    // Even an empty callback is valuable because sometimes some locks are
+    // released in order to make the callback.
+
+    // Sleep carefully here, as this is a frequent operation and we don't want
+    // to slow down the tests. We always sleep when the read is large.
+    // When the read is small, sleep only with a small probability.
+    size_t length_read = info.length;
+    if (length_read >= 1000000 || Random::GetTLSInstance()->OneIn(1000)) {
+      RandomSleep();
+    }
+  }
+
+  void OnFileWriteFinish(const FileOperationInfo& info) override {
+    // Even an empty callback is valuable because sometimes some locks are
+    // released in order to make the callback.
+
+    // Sleep carefully here, as this is a frequent operation and we don't want
+    // to slow down the tests. When the write is large, always sleep.
+    // Otherwise, sleep with a relatively small probability.
+    size_t length_write = info.length;
+    if (length_write >= 1000000 || Random::GetTLSInstance()->OneIn(64)) {
+      RandomSleep();
+    }
+  }
+
+  bool ShouldBeNotifiedOnFileIO() override {
+    RandomSleep();
+    return static_cast<bool>(Random::GetTLSInstance()->OneIn(1));
+  }
+
+  void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */,
+                            Status /* bg_error */,
+                            bool* /* auto_recovery */) override {
+    RandomSleep();
+  }
+
+  void OnErrorRecoveryCompleted(Status /* old_bg_error */) override {
+    RandomSleep();
+  }
+
+ protected:
+  bool IsValidColumnFamilyName(const std::string& cf_name) const {
+    if (cf_name == kDefaultColumnFamilyName) {
+      return true;
+    }
+    // The column family names in the stress tests are numbers.
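+    // [Editorial note, not part of the vendored source] e.g. "0" and "42"
+    // pass the digit check below; "cf42" does not.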
+ for (size_t i = 0; i < cf_name.size(); ++i) { + if (cf_name[i] < '0' || cf_name[i] > '9') { + return false; + } + } + return true; + } + + void VerifyFileDir(const std::string& file_dir) { +#ifndef NDEBUG + if (db_name_ == file_dir) { + return; + } + for (const auto& db_path : db_paths_) { + if (db_path.path == file_dir) { + return; + } + } + for (auto& cf : column_families_) { + for (const auto& cf_path : cf.options.cf_paths) { + if (cf_path.path == file_dir) { + return; + } + } + } + assert(false); +#else + (void)file_dir; +#endif // !NDEBUG + } + + void VerifyFileName(const std::string& file_name) { +#ifndef NDEBUG + uint64_t file_number; + FileType file_type; + bool result = ParseFileName(file_name, &file_number, &file_type); + assert(result); + assert(file_type == kTableFile); +#else + (void)file_name; +#endif // !NDEBUG + } + + void VerifyFilePath(const std::string& file_path) { +#ifndef NDEBUG + size_t pos = file_path.find_last_of("/"); + if (pos == std::string::npos) { + VerifyFileName(file_path); + } else { + if (pos > 0) { + VerifyFileDir(file_path.substr(0, pos)); + } + VerifyFileName(file_path.substr(pos)); + } +#else + (void)file_path; +#endif // !NDEBUG + } + + // Unique id is verified using the TableProperties. file_path is only used + // for reporting. + void VerifyTableFileUniqueId(const TableProperties& new_file_properties, + const std::string& file_path); + + void RandomSleep() { + std::this_thread::sleep_for( + std::chrono::microseconds(Random::GetTLSInstance()->Uniform(5000))); + } + + private: + std::string db_name_; + std::vector db_paths_; + std::vector column_families_; + std::atomic num_pending_file_creations_; + UniqueIdVerifier unique_ids_; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // GFLAGS diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_shared_state.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_shared_state.h new file mode 100644 index 0000000000..0137f0b2e4 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_shared_state.h @@ -0,0 +1,437 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors + +#ifdef GFLAGS +#pragma once + +#include "db_stress_tool/db_stress_stat.h" +#include "db_stress_tool/expected_state.h" +// SyncPoint is not supported in Released Windows Mode. 
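+// [Editorial note, not part of the vendored source] The guard below includes
+// sync_point.h except in release (NDEBUG) builds on Windows; by De Morgan,
+// !(defined NDEBUG) || !defined(OS_WIN)  ==  !(NDEBUG && OS_WIN).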
+#if !(defined NDEBUG) || !defined(OS_WIN) +#include "test_util/sync_point.h" +#endif // !(defined NDEBUG) || !defined(OS_WIN) +#include "util/gflags_compat.h" + +DECLARE_uint64(seed); +DECLARE_int64(max_key); +DECLARE_uint64(log2_keys_per_lock); +DECLARE_int32(threads); +DECLARE_int32(column_families); +DECLARE_int32(nooverwritepercent); +DECLARE_string(expected_values_dir); +DECLARE_int32(clear_column_family_one_in); +DECLARE_bool(test_batches_snapshots); +DECLARE_int32(compaction_thread_pool_adjust_interval); +DECLARE_int32(continuous_verification_interval); +DECLARE_int32(read_fault_one_in); +DECLARE_int32(write_fault_one_in); +DECLARE_int32(open_metadata_write_fault_one_in); +DECLARE_int32(open_write_fault_one_in); +DECLARE_int32(open_read_fault_one_in); + +DECLARE_int32(injest_error_severity); + +namespace ROCKSDB_NAMESPACE { +class StressTest; + +// State shared by all concurrent executions of the same benchmark. +class SharedState { + public: + // Errors when reading filter blocks are ignored, so we use a thread + // local variable updated via sync points to keep track of errors injected + // while reading filter blocks in order to ignore the Get/MultiGet result + // for those calls + static thread_local bool ignore_read_error; + + SharedState(Env* /*env*/, StressTest* stress_test) + : cv_(&mu_), + seed_(static_cast<uint32_t>(FLAGS_seed)), + max_key_(FLAGS_max_key), + log2_keys_per_lock_(static_cast<uint32_t>(FLAGS_log2_keys_per_lock)), + num_threads_(0), + num_initialized_(0), + num_populated_(0), + vote_reopen_(0), + num_done_(0), + start_(false), + start_verify_(false), + num_bg_threads_(0), + should_stop_bg_thread_(false), + bg_thread_finished_(0), + stress_test_(stress_test), + verification_failure_(false), + should_stop_test_(false), + no_overwrite_ids_(GenerateNoOverwriteIds()), + expected_state_manager_(nullptr), + printing_verification_results_(false), + start_timestamp_(Env::Default()->NowNanos()) { + Status status; + // TODO: We should introduce a way to explicitly disable verification + // during shutdown. When that is disabled and FLAGS_expected_values_dir + // is empty (disabling verification at startup), we can skip tracking + // expected state. Only then should we permit bypassing the below feature + // compatibility checks.
+ if (!FLAGS_expected_values_dir.empty()) { + if (!std::atomic<uint32_t>{}.is_lock_free()) { + status = Status::InvalidArgument( + "Cannot use --expected_values_dir on platforms without lock-free " + "std::atomic"); + } + if (status.ok() && FLAGS_clear_column_family_one_in > 0) { + status = Status::InvalidArgument( + "Cannot use --expected_values_dir when " + "--clear_column_family_one_in is greater than zero."); + } + } + if (status.ok()) { + if (FLAGS_expected_values_dir.empty()) { + expected_state_manager_.reset( + new AnonExpectedStateManager(FLAGS_max_key, FLAGS_column_families)); + } else { + expected_state_manager_.reset(new FileExpectedStateManager( + FLAGS_max_key, FLAGS_column_families, FLAGS_expected_values_dir)); + } + status = expected_state_manager_->Open(); + } + if (!status.ok()) { + fprintf(stderr, "Failed setting up expected state with error: %s\n", + status.ToString().c_str()); + exit(1); + } + + if (FLAGS_test_batches_snapshots) { + fprintf(stdout, "No lock creation because test_batches_snapshots set\n"); + return; + } + + long num_locks = static_cast<long>(max_key_ >> log2_keys_per_lock_); + if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) { + num_locks++; + } + fprintf(stdout, "Creating %ld locks\n", num_locks * FLAGS_column_families); + key_locks_.resize(FLAGS_column_families); + + for (int i = 0; i < FLAGS_column_families; ++i) { + key_locks_[i].reset(new port::Mutex[num_locks]); + } + if (FLAGS_read_fault_one_in) { +#ifdef NDEBUG + // Unsupported in release mode because it relies on + // `IGNORE_STATUS_IF_ERROR` to distinguish faults not expected to lead to + // failure. + fprintf(stderr, + "Cannot set nonzero value for --read_fault_one_in in " + "release mode."); + exit(1); +#else // NDEBUG + SyncPoint::GetInstance()->SetCallBack("FaultInjectionIgnoreError", + IgnoreReadErrorCallback); + SyncPoint::GetInstance()->EnableProcessing(); +#endif // NDEBUG + } + } + + ~SharedState() { +#ifndef NDEBUG + if (FLAGS_read_fault_one_in) { + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + } +#endif + } + + port::Mutex* GetMutex() { return &mu_; } + + port::CondVar* GetCondVar() { return &cv_; } + + StressTest* GetStressTest() const { return stress_test_; } + + int64_t GetMaxKey() const { return max_key_; } + + uint32_t GetNumThreads() const { return num_threads_; } + + void SetThreads(int num_threads) { num_threads_ = num_threads; } + + void IncInitialized() { num_initialized_++; } + + void IncOperated() { num_populated_++; } + + void IncDone() { num_done_++; } + + void IncVotedReopen() { vote_reopen_ = (vote_reopen_ + 1) % num_threads_; } + + bool AllInitialized() const { return num_initialized_ >= num_threads_; } + + bool AllOperated() const { return num_populated_ >= num_threads_; } + + bool AllDone() const { return num_done_ >= num_threads_; } + + bool AllVotedReopen() { return (vote_reopen_ == 0); } + + void SetStart() { start_ = true; } + + void SetStartVerify() { start_verify_ = true; } + + bool Started() const { return start_; } + + bool VerifyStarted() const { return start_verify_; } + + void SetVerificationFailure() { verification_failure_.store(true); } + + bool HasVerificationFailedYet() const { return verification_failure_.load(); } + + void SetShouldStopTest() { should_stop_test_.store(true); } + + bool ShouldStopTest() const { return should_stop_test_.load(); } + + // Returns a lock covering `key` in `cf`.
+ port::Mutex* GetMutexForKey(int cf, int64_t key) { + return &key_locks_[cf][key >> log2_keys_per_lock_]; + } + + // Acquires locks for all keys in `cf`. + void LockColumnFamily(int cf) { + for (int i = 0; i < max_key_ >> log2_keys_per_lock_; ++i) { + key_locks_[cf][i].Lock(); + } + } + + // Releases locks for all keys in `cf`. + void UnlockColumnFamily(int cf) { + for (int i = 0; i < max_key_ >> log2_keys_per_lock_; ++i) { + key_locks_[cf][i].Unlock(); + } + } + + // Returns a collection of mutex locks covering the key range [start, end) in + // `cf`. + std::vector<std::unique_ptr<MutexLock>> GetLocksForKeyRange(int cf, + int64_t start, + int64_t end) { + std::vector<std::unique_ptr<MutexLock>> range_locks; + + if (start >= end) { + return range_locks; + } + + const int64_t start_idx = start >> log2_keys_per_lock_; + + int64_t end_idx = end >> log2_keys_per_lock_; + if ((end & ((1 << log2_keys_per_lock_) - 1)) == 0) { + --end_idx; + } + + for (int64_t idx = start_idx; idx <= end_idx; ++idx) { + range_locks.emplace_back( + std::make_unique<MutexLock>(&key_locks_[cf][idx])); + } + + return range_locks; + } + + Status SaveAtAndAfter(DB* db) { + return expected_state_manager_->SaveAtAndAfter(db); + } + + bool HasHistory() { return expected_state_manager_->HasHistory(); } + + Status Restore(DB* db) { return expected_state_manager_->Restore(db); } + + // Requires external locking covering all keys in `cf`. + void ClearColumnFamily(int cf) { + return expected_state_manager_->ClearColumnFamily(cf); + } + + // Prepare a Put that will be started but not finished yet + // This is useful for crash-recovery testing when the process may crash + // before updating the corresponding expected value + // + // Requires external locking covering `key` in `cf` to prevent concurrent + // write or delete to the same `key`. + PendingExpectedValue PreparePut(int cf, int64_t key) { + return expected_state_manager_->PreparePut(cf, key); + } + + // Does not require external locking. + ExpectedValue Get(int cf, int64_t key) { + return expected_state_manager_->Get(cf, key); + } + + // Prepare a Delete that will be started but not finished yet + // This is useful for crash-recovery testing when the process may crash + // before updating the corresponding expected value + // + // Requires external locking covering `key` in `cf` to prevent concurrent + // write or delete to the same `key`. + PendingExpectedValue PrepareDelete(int cf, int64_t key) { + return expected_state_manager_->PrepareDelete(cf, key); + } + + // Requires external locking covering `key` in `cf` to prevent concurrent + // write or delete to the same `key`. + PendingExpectedValue PrepareSingleDelete(int cf, int64_t key) { + return expected_state_manager_->PrepareSingleDelete(cf, key); + } + + // Requires external locking covering keys in `[begin_key, end_key)` in `cf` + // to prevent concurrent write or delete to the same `key`. + std::vector<PendingExpectedValue> PrepareDeleteRange(int cf, + int64_t begin_key, + int64_t end_key) { + return expected_state_manager_->PrepareDeleteRange(cf, begin_key, end_key); + } + + bool AllowsOverwrite(int64_t key) const { + return no_overwrite_ids_.find(key) == no_overwrite_ids_.end(); + }
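As a quick sanity check on the two lock-striping computations above (the rounded-up stripe count in the `SharedState` constructor and the exclusive-bound handling in `GetLocksForKeyRange()`), here is a minimal standalone sketch with assumed values; it uses no RocksDB types:

```cpp
#include <cassert>
#include <cstdint>

// Standalone sketch: 2^log2_keys_per_lock consecutive keys share one mutex
// "stripe". The constructor rounds the stripe count up, and the range-lock
// helper must back off the last stripe when the exclusive upper bound lands
// exactly on a stripe boundary.
int main() {
  const uint32_t log2_keys_per_lock = 2;  // 4 keys per stripe
  const int64_t max_key = 1001;

  // Stripe count, rounded up for the final partial group of keys.
  int64_t num_locks = max_key >> log2_keys_per_lock;  // 250
  if (max_key & ((int64_t{1} << log2_keys_per_lock) - 1)) {
    num_locks++;                                      // 251
  }
  assert(num_locks == 251);

  // GetMutexForKey(): the stripe index is the key with its low bits dropped.
  assert((int64_t{999} >> log2_keys_per_lock) == 249);

  // GetLocksForKeyRange() on [5, 8): keys 5,6,7. Since 8 is exclusive and
  // falls on a stripe boundary (8 % 4 == 0), stripe 2 must NOT be locked.
  int64_t start = 5, end = 8;
  int64_t start_idx = start >> log2_keys_per_lock;    // 1
  int64_t end_idx = end >> log2_keys_per_lock;        // 2
  if ((end & ((1 << log2_keys_per_lock) - 1)) == 0) {
    --end_idx;                                        // back to 1
  }
  assert(start_idx == 1 && end_idx == 1);             // only stripe 1 locked
  return 0;
}
```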
+ // Requires external locking covering `key` in `cf` to prevent concurrent + // delete to the same `key`. + bool Exists(int cf, int64_t key) { + return expected_state_manager_->Exists(cf, key); + } + + // Sync the `value_base` to the corresponding expected value + void SyncPut(int cf, int64_t key, uint32_t value_base) { + return expected_state_manager_->SyncPut(cf, key, value_base); + } + + // Sync the corresponding expected value to be pending Put + void SyncPendingPut(int cf, int64_t key) { + return expected_state_manager_->SyncPendingPut(cf, key); + } + + // Sync the corresponding expected value to be deleted + void SyncDelete(int cf, int64_t key) { + return expected_state_manager_->SyncDelete(cf, key); + } + + uint32_t GetSeed() const { return seed_; } + + void SetShouldStopBgThread() { should_stop_bg_thread_ = true; } + + bool ShouldStopBgThread() { return should_stop_bg_thread_; } + + void IncBgThreads() { ++num_bg_threads_; } + + void IncBgThreadsFinished() { ++bg_thread_finished_; } + + bool BgThreadsFinished() const { + return bg_thread_finished_ == num_bg_threads_; + } + + bool ShouldVerifyAtBeginning() const { + return !FLAGS_expected_values_dir.empty(); + } + + bool PrintingVerificationResults() { + bool tmp = false; + return !printing_verification_results_.compare_exchange_strong( + tmp, true, std::memory_order_relaxed); + } + + void FinishPrintingVerificationResults() { + printing_verification_results_.store(false, std::memory_order_relaxed); + } + + uint64_t GetStartTimestamp() const { return start_timestamp_; } + + private: + static void IgnoreReadErrorCallback(void*) { ignore_read_error = true; } + + // Pick random keys in each column family that will not experience overwrite. + std::unordered_set<int64_t> GenerateNoOverwriteIds() const { + fprintf(stdout, "Choosing random keys with no overwrite\n"); + // Start with the identity permutation. Each iteration of the for loop + // below continues from the partially shuffled permutation left by the + // previous iteration. + std::vector<int64_t> permutation(max_key_); + for (int64_t i = 0; i < max_key_; ++i) { + permutation[i] = i; + } + // Now do the Knuth shuffle + const int64_t num_no_overwrite_keys = + (max_key_ * FLAGS_nooverwritepercent) / 100; + // Only need to figure out first num_no_overwrite_keys of permutation + std::unordered_set<int64_t> ret; + ret.reserve(num_no_overwrite_keys); + Random64 rnd(seed_); + for (int64_t i = 0; i < num_no_overwrite_keys; i++) { + assert(i < max_key_); + int64_t rand_index = i + rnd.Next() % (max_key_ - i); + // Swap i and rand_index; + int64_t temp = permutation[i]; + permutation[i] = permutation[rand_index]; + permutation[rand_index] = temp; + // Fill no_overwrite_ids_ with the first num_no_overwrite_keys of + // permutation + ret.insert(permutation[i]); + } + return ret; + } + + port::Mutex mu_; + port::CondVar cv_; + const uint32_t seed_; + const int64_t max_key_; + const uint32_t log2_keys_per_lock_; + int num_threads_; + long num_initialized_; + long num_populated_; + long vote_reopen_; + long num_done_; + bool start_; + bool start_verify_; + int num_bg_threads_; + bool should_stop_bg_thread_; + int bg_thread_finished_; + StressTest* stress_test_; + std::atomic<bool> verification_failure_; + std::atomic<bool> should_stop_test_; + + // Keys that should not be overwritten + const std::unordered_set<int64_t> no_overwrite_ids_; + + std::unique_ptr<ExpectedStateManager> expected_state_manager_;
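The `GenerateNoOverwriteIds()` shuffle above only ever fixes the first `num_no_overwrite_keys` slots of the permutation, which is all that is needed to draw that many distinct keys. A minimal standalone sketch of the same partial Fisher-Yates selection, with `std::mt19937_64` substituted for RocksDB's `Random64` and all values assumed:

```cpp
#include <cassert>
#include <cstdint>
#include <numeric>
#include <random>
#include <unordered_set>
#include <vector>

// Draw k distinct keys from [0, max_key) by fixing only the first k
// positions of a permutation, mirroring GenerateNoOverwriteIds() above.
std::unordered_set<int64_t> SampleDistinctKeys(int64_t max_key, int64_t k,
                                               uint64_t seed) {
  std::vector<int64_t> perm(max_key);
  std::iota(perm.begin(), perm.end(), int64_t{0});  // identity permutation
  std::mt19937_64 rng(seed);
  std::unordered_set<int64_t> ret;
  ret.reserve(k);
  for (int64_t i = 0; i < k; ++i) {
    // Choose uniformly from the not-yet-fixed suffix [i, max_key), exactly
    // like `i + rnd.Next() % (max_key_ - i)` in the code above.
    int64_t rand_index = i + static_cast<int64_t>(rng() % (max_key - i));
    std::swap(perm[i], perm[rand_index]);
    ret.insert(perm[i]);
  }
  return ret;
}

int main() {
  auto keys = SampleDistinctKeys(1000, 100, 42);
  assert(keys.size() == 100);  // k distinct keys, each in [0, 1000)
  return 0;
}
```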
+ // Cannot store `port::Mutex` directly in vector since it is not copyable + // and storing it in the container may require copying depending on the impl. + std::vector<std::unique_ptr<port::Mutex[]>> key_locks_; + std::atomic<bool> printing_verification_results_; + const uint64_t start_timestamp_; +}; + +// Per-thread state for concurrent executions of the same benchmark. +struct ThreadState { + uint32_t tid; // 0..n-1 + Random rand; // Has different seeds for different threads + SharedState* shared; + Stats stats; + struct SnapshotState { + const Snapshot* snapshot; + // The cf from which we did a Get at this snapshot + int cf_at; + // The name of the cf at the time that we did a read + std::string cf_at_name; + // The key with which we did a Get at this snapshot + std::string key; + // The status of the Get + Status status; + // The value of the Get + std::string value; + // optional state of all keys in the db + std::vector<bool>* key_vec; + + std::string timestamp; + }; + std::queue<std::pair<uint64_t, SnapshotState>> snapshot_queue; + + ThreadState(uint32_t index, SharedState* _shared) + : tid(index), rand(1000 + index + _shared->GetSeed()), shared(_shared) {} +}; +} // namespace ROCKSDB_NAMESPACE +#endif // GFLAGS diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_stat.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_stat.h new file mode 100644 index 0000000000..5b38c6e2bb --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_stat.h @@ -0,0 +1,219 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include +#include +#include +#include + +#include "monitoring/histogram.h" +#include "port/port.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" +#include "util/gflags_compat.h" +#include "util/random.h" + +DECLARE_bool(histogram); +DECLARE_bool(progress_reports); + +namespace ROCKSDB_NAMESPACE { + +// Database statistics +extern std::shared_ptr<Statistics> dbstats; +extern std::shared_ptr<Statistics> dbstats_secondaries; + +class Stats { + private: + uint64_t start_; + uint64_t finish_; + double seconds_; + long done_; + long gets_; + long prefixes_; + long writes_; + long deletes_; + size_t single_deletes_; + long iterator_size_sums_; + long founds_; + long iterations_; + long range_deletions_; + long covered_by_range_deletions_; + long errors_; + long verified_errors_; + long num_compact_files_succeed_; + long num_compact_files_failed_; + int next_report_; + size_t bytes_; + uint64_t last_op_finish_; + HistogramImpl hist_; + + public: + Stats() {} + + void Start() { + next_report_ = 100; + hist_.Clear(); + done_ = 0; + gets_ = 0; + prefixes_ = 0; + writes_ = 0; + deletes_ = 0; + single_deletes_ = 0; + iterator_size_sums_ = 0; + founds_ = 0; + iterations_ = 0; + range_deletions_ = 0; + covered_by_range_deletions_ = 0; + errors_ = 0; + verified_errors_ = 0; + bytes_ = 0; + seconds_ = 0; + num_compact_files_succeed_ = 0; + num_compact_files_failed_ = 0; + start_ = SystemClock::Default()->NowMicros(); + last_op_finish_ = start_; + finish_ = start_; + } + + void Merge(const Stats& other) { + hist_.Merge(other.hist_); + done_ += other.done_; + gets_ += other.gets_; + prefixes_ += other.prefixes_; + writes_ += other.writes_; + deletes_ += other.deletes_; + single_deletes_ += other.single_deletes_; + iterator_size_sums_ += other.iterator_size_sums_; + founds_ += other.founds_; + iterations_ += other.iterations_; + range_deletions_ += other.range_deletions_; +
covered_by_range_deletions_ += other.covered_by_range_deletions_; + errors_ += other.errors_; + verified_errors_ += other.verified_errors_; + bytes_ += other.bytes_; + seconds_ += other.seconds_; + num_compact_files_succeed_ += other.num_compact_files_succeed_; + num_compact_files_failed_ += other.num_compact_files_failed_; + if (other.start_ < start_) start_ = other.start_; + if (other.finish_ > finish_) finish_ = other.finish_; + } + + void Stop() { + finish_ = SystemClock::Default()->NowMicros(); + seconds_ = (finish_ - start_) * 1e-6; + } + + void FinishedSingleOp() { + if (FLAGS_histogram) { + auto now = SystemClock::Default()->NowMicros(); + auto micros = now - last_op_finish_; + hist_.Add(micros); + if (micros > 20000) { + fprintf(stdout, "long op: %" PRIu64 " micros%30s\r", micros, ""); + } + last_op_finish_ = now; + } + + done_++; + if (FLAGS_progress_reports) { + if (done_ >= next_report_) { + if (next_report_ < 1000) + next_report_ += 100; + else if (next_report_ < 5000) + next_report_ += 500; + else if (next_report_ < 10000) + next_report_ += 1000; + else if (next_report_ < 50000) + next_report_ += 5000; + else if (next_report_ < 100000) + next_report_ += 10000; + else if (next_report_ < 500000) + next_report_ += 50000; + else + next_report_ += 100000; + fprintf(stdout, "... finished %ld ops%30s\r", done_, ""); + } + } + } + + void AddBytesForWrites(long nwrites, size_t nbytes) { + writes_ += nwrites; + bytes_ += nbytes; + } + + void AddGets(long ngets, long nfounds) { + founds_ += nfounds; + gets_ += ngets; + } + + void AddPrefixes(long nprefixes, long count) { + prefixes_ += nprefixes; + iterator_size_sums_ += count; + } + + void AddIterations(long n) { iterations_ += n; } + + void AddDeletes(long n) { deletes_ += n; } + + void AddSingleDeletes(size_t n) { single_deletes_ += n; } + + void AddRangeDeletions(long n) { range_deletions_ += n; } + + void AddCoveredByRangeDeletions(long n) { covered_by_range_deletions_ += n; } + + void AddErrors(long n) { errors_ += n; } + + void AddVerifiedErrors(long n) { verified_errors_ += n; } + + void AddNumCompactFilesSucceed(long n) { num_compact_files_succeed_ += n; } + + void AddNumCompactFilesFailed(long n) { num_compact_files_failed_ += n; } + + void Report(const char* name) { + std::string extra; + if (bytes_ < 1 || done_ < 1) { + fprintf(stderr, "No writes or ops?\n"); + return; + } + + double elapsed = (finish_ - start_) * 1e-6; + double bytes_mb = bytes_ / 1048576.0; + double rate = bytes_mb / elapsed; + double throughput = (double)done_ / elapsed; + + fprintf(stdout, "%-12s: ", name); + fprintf(stdout, "%.3f micros/op %ld ops/sec\n", seconds_ * 1e6 / done_, + (long)throughput); + fprintf(stdout, "%-12s: Wrote %.2f MB (%.2f MB/sec) (%ld%% of %ld ops)\n", + "", bytes_mb, rate, (100 * writes_) / done_, done_); + fprintf(stdout, "%-12s: Wrote %ld times\n", "", writes_); + fprintf(stdout, "%-12s: Deleted %ld times\n", "", deletes_); + fprintf(stdout, "%-12s: Single deleted %" ROCKSDB_PRIszt " times\n", "", + single_deletes_); + fprintf(stdout, "%-12s: %ld read and %ld found the key\n", "", gets_, + founds_); + fprintf(stdout, "%-12s: Prefix scanned %ld times\n", "", prefixes_); + fprintf(stdout, "%-12s: Iterator size sum is %ld\n", "", + iterator_size_sums_); + fprintf(stdout, "%-12s: Iterated %ld times\n", "", iterations_); + fprintf(stdout, "%-12s: Deleted %ld key-ranges\n", "", range_deletions_); + fprintf(stdout, "%-12s: Range deletions covered %ld keys\n", "", + covered_by_range_deletions_); + + fprintf(stdout, "%-12s: Got errors
%ld times\n", "", errors_); + fprintf(stdout, "%-12s: %ld CompactFiles() succeed\n", "", + num_compact_files_succeed_); + fprintf(stdout, "%-12s: %ld CompactFiles() did not succeed\n", "", + num_compact_files_failed_); + + if (FLAGS_histogram) { + fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); + } + fflush(stdout); + } +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_table_properties_collector.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_table_properties_collector.h new file mode 100644 index 0000000000..d1758cbb4c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_table_properties_collector.h @@ -0,0 +1,65 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/table.h" +#include "util/gflags_compat.h" +#include "util/random.h" + +DECLARE_int32(mark_for_compaction_one_file_in); + +namespace ROCKSDB_NAMESPACE { + +// A `DbStressTablePropertiesCollector` ignores what keys/values were added to +// the table, adds no properties to the table, and decides at random whether the +// table will be marked for compaction according to +// `FLAGS_mark_for_compaction_one_file_in`. +class DbStressTablePropertiesCollector : public TablePropertiesCollector { + public: + DbStressTablePropertiesCollector() + : need_compact_(Random::GetTLSInstance()->OneInOpt( + FLAGS_mark_for_compaction_one_file_in)) {} + + virtual Status AddUserKey(const Slice& /* key */, const Slice& /* value */, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + return Status::OK(); + } + + virtual Status Finish(UserCollectedProperties* /* properties */) override { + return Status::OK(); + } + + virtual UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{}; + } + + virtual const char* Name() const override { + return "DbStressTablePropertiesCollector"; + } + + virtual bool NeedCompact() const override { return need_compact_; } + + private: + const bool need_compact_; +}; + +// A `DbStressTablePropertiesCollectorFactory` creates +// `DbStressTablePropertiesCollector`s. +class DbStressTablePropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + public: + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /* context */) override { + return new DbStressTablePropertiesCollector(); + } + + virtual const char* Name() const override { + return "DbStressTablePropertiesCollectorFactory"; + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_test_base.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_test_base.h new file mode 100644 index 0000000000..f7ee247a74 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/db_stress_test_base.h @@ -0,0 +1,327 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors.
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifdef GFLAGS +#pragma once + +#include "db_stress_tool/db_stress_common.h" +#include "db_stress_tool/db_stress_shared_state.h" + +namespace ROCKSDB_NAMESPACE { +class SystemClock; +class Transaction; +class TransactionDB; +struct TransactionDBOptions; + +class StressTest { + public: + StressTest(); + + virtual ~StressTest(); + + std::shared_ptr<Cache> NewCache(size_t capacity, int32_t num_shard_bits); + + static std::vector<std::string> GetBlobCompressionTags(); + + bool BuildOptionsTable(); + + void InitDb(SharedState*); + // The initialization work is split into two parts to avoid a circular + // dependency with `SharedState`. + virtual void FinishInitDb(SharedState*); + void TrackExpectedState(SharedState* shared); + void OperateDb(ThreadState* thread); + virtual void VerifyDb(ThreadState* thread) const = 0; + virtual void ContinuouslyVerifyDb(ThreadState* /*thread*/) const = 0; + void PrintStatistics(); + + protected: + Status AssertSame(DB* db, ColumnFamilyHandle* cf, + ThreadState::SnapshotState& snap_state); + + // Currently PreloadDb has to be single-threaded. + void PreloadDbAndReopenAsReadOnly(int64_t number_of_keys, + SharedState* shared); + + Status SetOptions(ThreadState* thread); + + // For TransactionDB, there can be txns that were prepared but not yet + // committed right before the previous stress run crashed. + // They will be recovered and processed through + // ProcessRecoveredPreparedTxnsHelper at the start of the current stress run. + void ProcessRecoveredPreparedTxns(SharedState* shared); + + // The default implementation will first update the ExpectedState to + // `SharedState::UNKNOWN` for each key in `txn` and then randomly + // commit or roll back `txn`. + virtual void ProcessRecoveredPreparedTxnsHelper(Transaction* txn, + SharedState* shared); + + Status NewTxn(WriteOptions& write_opts, Transaction** txn); + + Status CommitTxn(Transaction* txn, ThreadState* thread = nullptr); + + Status RollbackTxn(Transaction* txn); + + virtual void MaybeClearOneColumnFamily(ThreadState* /* thread */) {} + + virtual bool ShouldAcquireMutexOnKey() const { return false; }
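For context on `ProcessRecoveredPreparedTxns()` above: after a crash, two-phase transactions that were prepared but never committed come back from recovery still in the prepared state and must be resolved. A sketch of the essential shape (not the patch's code); `TransactionDB::GetAllPreparedTransactions()` is the real RocksDB API, while the coin-flip policy and the caller-side cleanup are my reading of the comment above:

```cpp
#include <cassert>
#include <cstdlib>
#include <vector>

#include "rocksdb/utilities/transaction_db.h"

// Resolve every transaction recovered in the prepared state, randomly
// committing or rolling back, mirroring the behavior described above.
void ResolveRecoveredTxns(ROCKSDB_NAMESPACE::TransactionDB* txn_db) {
  std::vector<ROCKSDB_NAMESPACE::Transaction*> recovered;
  txn_db->GetAllPreparedTransactions(&recovered);
  for (ROCKSDB_NAMESPACE::Transaction* txn : recovered) {
    ROCKSDB_NAMESPACE::Status s =
        (std::rand() & 1) ? txn->Commit() : txn->Rollback();
    assert(s.ok());
    delete txn;  // recovered transactions are owned by the caller
  }
}
```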
+ // Returns true if DB state is tracked by the stress test. + virtual bool IsStateTracked() const = 0; + + virtual std::vector<int> GenerateColumnFamilies( + const int /* num_column_families */, int rand_column_family) const { + return {rand_column_family}; + } + + virtual std::vector<int64_t> GenerateKeys(int64_t rand_key) const { + return {rand_key}; + } + + virtual Status TestGet(ThreadState* thread, const ReadOptions& read_opts, + const std::vector<int>& rand_column_families, + const std::vector<int64_t>& rand_keys) = 0; + + virtual std::vector<Status> TestMultiGet( + ThreadState* thread, const ReadOptions& read_opts, + const std::vector<int>& rand_column_families, + const std::vector<int64_t>& rand_keys) = 0; + + virtual void TestGetEntity(ThreadState* thread, const ReadOptions& read_opts, + const std::vector<int>& rand_column_families, + const std::vector<int64_t>& rand_keys) = 0; + + virtual void TestMultiGetEntity(ThreadState* thread, + const ReadOptions& read_opts, + const std::vector<int>& rand_column_families, + const std::vector<int64_t>& rand_keys) = 0; + + virtual Status TestPrefixScan(ThreadState* thread, + const ReadOptions& read_opts, + const std::vector<int>& rand_column_families, + const std::vector<int64_t>& rand_keys) = 0; + + virtual Status TestPut(ThreadState* thread, WriteOptions& write_opts, + const ReadOptions& read_opts, + const std::vector<int>& cf_ids, + const std::vector<int64_t>& keys, + char (&value)[100]) = 0; + + virtual Status TestDelete(ThreadState* thread, WriteOptions& write_opts, + const std::vector<int>& rand_column_families, + const std::vector<int64_t>& rand_keys) = 0; + + virtual Status TestDeleteRange(ThreadState* thread, WriteOptions& write_opts, + const std::vector<int>& rand_column_families, + const std::vector<int64_t>& rand_keys) = 0; + + virtual void TestIngestExternalFile( + ThreadState* thread, const std::vector<int>& rand_column_families, + const std::vector<int64_t>& rand_keys) = 0; + + // Issues a compact range starting at start_key, whose integer value + // is rand_key. + virtual void TestCompactRange(ThreadState* thread, int64_t rand_key, + const Slice& start_key, + ColumnFamilyHandle* column_family); + + // Calculates a hash value for all keys in the range [start_key, end_key] + // at a certain snapshot. + uint32_t GetRangeHash(ThreadState* thread, const Snapshot* snapshot, + ColumnFamilyHandle* column_family, + const Slice& start_key, const Slice& end_key); + + // Returns a column family handle that mirrors the one pointed to by + // `column_family_id`; it is used to validate that data is correct. + // By default, the column family itself is returned. + virtual ColumnFamilyHandle* GetControlCfh(ThreadState* /* thread*/, + int column_family_id) { + return column_families_[column_family_id]; + } + + // Generates a list of keys close to the boundaries of SST keys. + // If there isn't any SST file in the DB, returns an empty list. + std::vector<std::string> GetWhiteBoxKeys(ThreadState* thread, DB* db, + ColumnFamilyHandle* cfh, + size_t num_keys); + + // Given a key K, this creates an iterator which scans to K and then + // does a random sequence of Next/Prev operations. + virtual Status TestIterate(ThreadState* thread, const ReadOptions& read_opts, + const std::vector<int>& rand_column_families, + const std::vector<int64_t>& rand_keys); + + virtual Status TestIterateAgainstExpected( + ThreadState* /* thread */, const ReadOptions& /* read_opts */, + const std::vector<int>& /* rand_column_families */, + const std::vector<int64_t>& /* rand_keys */) { + return Status::NotSupported(); + } + + // Enum used by VerifyIterator() to identify the mode to validate.
+ enum LastIterateOp { + kLastOpSeek, + kLastOpSeekForPrev, + kLastOpNextOrPrev, + kLastOpSeekToFirst, + kLastOpSeekToLast + }; + + // Compares the two iterators. iter and cmp_iter should be at the same + // position, unless iter may have been invalidated or left undefined + // because of upper or lower bounds, or the prefix extractor. + // Flags a failure if the verification fails. + // diverged is set to true once the two iterators have diverged. + // op_logs is the information to print when validation fails. + void VerifyIterator(ThreadState* thread, ColumnFamilyHandle* cmp_cfh, + const ReadOptions& ro, Iterator* iter, Iterator* cmp_iter, + LastIterateOp op, const Slice& seek_key, + const std::string& op_logs, bool* diverged); + + virtual Status TestBackupRestore(ThreadState* thread, + const std::vector<int>& rand_column_families, + const std::vector<int64_t>& rand_keys); + + virtual Status TestCheckpoint(ThreadState* thread, + const std::vector<int>& rand_column_families, + const std::vector<int64_t>& rand_keys); + + void TestCompactFiles(ThreadState* thread, ColumnFamilyHandle* column_family); + + Status TestFlush(const std::vector<int>& rand_column_families); + + Status TestPauseBackground(ThreadState* thread); + + void TestAcquireSnapshot(ThreadState* thread, int rand_column_family, + const std::string& keystr, uint64_t i); + + Status MaybeReleaseSnapshots(ThreadState* thread, uint64_t i); + Status VerifyGetLiveFiles() const; + Status VerifyGetSortedWalFiles() const; + Status VerifyGetCurrentWalFile() const; + void TestGetProperty(ThreadState* thread) const; + + virtual Status TestApproximateSize( + ThreadState* thread, uint64_t iteration, + const std::vector<int>& rand_column_families, + const std::vector<int64_t>& rand_keys); + + virtual Status TestCustomOperations( + ThreadState* /*thread*/, + const std::vector<int>& /*rand_column_families*/) { + return Status::NotSupported("TestCustomOperations() must be overridden"); + } + + void VerificationAbort(SharedState* shared, std::string msg, Status s) const; + + void VerificationAbort(SharedState* shared, std::string msg, int cf, + int64_t key) const; + + void VerificationAbort(SharedState* shared, std::string msg, int cf, + int64_t key, Slice value_from_db, + Slice value_from_expected) const; + + void VerificationAbort(SharedState* shared, int cf, int64_t key, + const Slice& value, const WideColumns& columns) const; + + static std::string DebugString(const Slice& value, + const WideColumns& columns); + + void PrintEnv() const; + + void Open(SharedState* shared); + + void Reopen(ThreadState* thread); + + virtual void RegisterAdditionalListeners() {} + + virtual void PrepareTxnDbOptions(SharedState* /*shared*/, + TransactionDBOptions& /*txn_db_opts*/) {}
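The cross-checking idea behind `VerifyIterator()` above can be reduced to a hypothetical helper: drive the iterator under test and a control iterator through the same `Seek` and the same steps, comparing validity, key, and value at each position. Note this sketch omits what the real method additionally tolerates, namely divergence legitimately caused by iterate upper/lower bounds and the prefix extractor:

```cpp
#include "rocksdb/iterator.h"
#include "rocksdb/slice.h"

// Hypothetical helper, not the VerifyIterator() declared above: returns
// true when both iterators agree at every step, false once they diverge.
bool CrossCheckIterators(ROCKSDB_NAMESPACE::Iterator* iter,
                         ROCKSDB_NAMESPACE::Iterator* cmp_iter,
                         const ROCKSDB_NAMESPACE::Slice& seek_key,
                         int steps) {
  iter->Seek(seek_key);
  cmp_iter->Seek(seek_key);
  for (int i = 0; i < steps; ++i) {
    if (iter->Valid() != cmp_iter->Valid()) {
      return false;  // diverged: one is exhausted, the other is not
    }
    if (!iter->Valid()) {
      return true;  // both exhausted at the same point
    }
    if (iter->key().compare(cmp_iter->key()) != 0 ||
        iter->value().compare(cmp_iter->value()) != 0) {
      return false;  // same position, different contents
    }
    iter->Next();
    cmp_iter->Next();
  }
  return true;
}
```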
+ // Returns whether the timestamp of read_opts is updated. + bool MaybeUseOlderTimestampForPointLookup(ThreadState* thread, + std::string& ts_str, + Slice& ts_slice, + ReadOptions& read_opts); + + void MaybeUseOlderTimestampForRangeScan(ThreadState* thread, + std::string& ts_str, Slice& ts_slice, + ReadOptions& read_opts); + + std::shared_ptr<Cache> cache_; + std::shared_ptr<Cache> compressed_cache_; + std::shared_ptr<const FilterPolicy> filter_policy_; + DB* db_; + TransactionDB* txn_db_; + + // Currently only used in MultiOpsTxnsStressTest + std::atomic<DB*> db_aptr_; + + Options options_; + SystemClock* clock_; + std::vector<ColumnFamilyHandle*> column_families_; + std::vector<std::string> column_family_names_; + std::atomic<int> new_column_family_name_; + int num_times_reopened_; + std::unordered_map<std::string, std::vector<std::string>> options_table_; + std::vector<std::string> options_index_; + std::atomic<bool> db_preload_finished_; + + // Fields used for continuous verification from another thread + DB* cmp_db_; + std::vector<ColumnFamilyHandle*> cmp_cfhs_; + bool is_db_stopped_; +}; + +// Load options from OPTIONS file and populate `options`. +extern bool InitializeOptionsFromFile(Options& options); + +// Initialize `options` using command line arguments. +// When this function is called, `cache`, `block_cache_compressed`, +// `filter_policy` have all been initialized. Therefore, we just pass them as +// input arguments. +extern void InitializeOptionsFromFlags( + const std::shared_ptr<Cache>& cache, + const std::shared_ptr<const FilterPolicy>& filter_policy, Options& options); + +// Initialize `options` on which `InitializeOptionsFromFile()` and +// `InitializeOptionsFromFlags()` have both been called already. +// There are two cases. +// Case 1: OPTIONS file is not specified. Command line arguments have been used +// to initialize `options`. InitializeOptionsGeneral() will use +// `cache` and `filter_policy` to initialize +// corresponding fields of `options`. InitializeOptionsGeneral() will +// also set up other fields of `options` so that stress test can run. +// Examples include `create_if_missing` and +// `create_missing_column_families`, etc. +// Case 2: OPTIONS file is specified. It is possible that, after loading from +// the given OPTIONS files, some shared object fields are still not +// initialized because they are not set in the OPTIONS file. In this +// case, if command line arguments indicate that the user wants to set +// up such shared objects, e.g. block cache, compressed block cache, +// row cache, filter policy, then InitializeOptionsGeneral() will honor +// the user's choice, thus passing `cache`, +// `filter_policy` as input arguments. +// +// InitializeOptionsGeneral() must not overwrite fields of `options` loaded +// from OPTIONS file. +extern void InitializeOptionsGeneral( + const std::shared_ptr<Cache>& cache, + const std::shared_ptr<const FilterPolicy>& filter_policy, Options& options); + +// If no OPTIONS file is specified, set up `options` so that we can test +// user-defined timestamp which requires `-user_timestamp_size=8`. +// This function also checks for known (currently) incompatible features with +// user-defined timestamp. +extern void CheckAndSetOptionsForUserTimestamp(Options& options); + +} // namespace ROCKSDB_NAMESPACE +#endif // GFLAGS diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/expected_state.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/expected_state.h new file mode 100644 index 0000000000..f3f924cb88 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/expected_state.h @@ -0,0 +1,329 @@ +// Copyright (c) 2021-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS + +#pragma once + +#include + +#include +#include + +#include "db/dbformat.h" +#include "db_stress_tool/expected_value.h" +#include "file/file_util.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/types.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +// `ExpectedState` provides read/write access to the expected value stored +// for every key. +class ExpectedState { + public: + explicit ExpectedState(size_t max_key, size_t num_column_families); + + virtual ~ExpectedState() {} + + // Requires external locking preventing concurrent execution with any other + // member function. + virtual Status Open(bool create) = 0; + + // Requires external locking covering all keys in `cf`. + void ClearColumnFamily(int cf); + + // Prepare a Put that will be started but not finished yet + // This is useful for crash-recovery testing when the process may crash + // before updating the corresponding expected value + // + // Requires external locking covering `key` in `cf` to prevent concurrent + // write or delete to the same `key`. + PendingExpectedValue PreparePut(int cf, int64_t key); + + // Does not require external locking. + ExpectedValue Get(int cf, int64_t key); + + // Prepare a Delete that will be started but not finished yet + // This is useful for crash-recovery testing when the process may crash + // before updating the corresponding expected value + // + // Requires external locking covering `key` in `cf` to prevent concurrent + // write or delete to the same `key`. + PendingExpectedValue PrepareDelete(int cf, int64_t key, + bool* prepared = nullptr); + + // Requires external locking covering `key` in `cf` to prevent concurrent + // write or delete to the same `key`. + PendingExpectedValue PrepareSingleDelete(int cf, int64_t key); + + // Requires external locking covering keys in `[begin_key, end_key)` in `cf` + // to prevent concurrent write or delete to the same `key`. + std::vector<PendingExpectedValue> PrepareDeleteRange(int cf, + int64_t begin_key, + int64_t end_key); + + // Update the expected value for the start of an incomplete write or delete + // operation on the key associated with this expected value + void Precommit(int cf, int64_t key, const ExpectedValue& value);
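The point of the `PreparePut()`/`PendingExpectedValue` pairing above is ordering: mark the expected value "pending" before the DB write and publish it after, so a crash in between leaves a pending marker rather than a wrong expected value. A hypothetical usage sketch (not part of the patch and not self-contained: it assumes the types declared in these headers and a running stress test; real callers also handle the failure path more carefully):

```cpp
// Hypothetical helper illustrating the two-step protocol.
Status CrashSafePut(SharedState* shared, DB* db, ColumnFamilyHandle* cfh,
                    int cf, int64_t key, const Slice& key_slice,
                    const Slice& value_slice, const WriteOptions& wo) {
  // 1. Flag the key as having a pending write, BEFORE touching the DB.
  PendingExpectedValue pending = shared->PreparePut(cf, key);
  // 2. Do the real write.
  Status s = db->Put(wo, cfh, key_slice, value_slice);
  // 3. Publish the final expected value only on success. A crash between
  //    steps 2 and 3 leaves the pending bit set, so verification accepts
  //    either the old or the new value instead of reporting corruption.
  if (s.ok()) {
    pending.Commit();
  }
  return s;
}
```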
+ // Requires external locking covering `key` in `cf` to prevent concurrent + // delete to the same `key`. + bool Exists(int cf, int64_t key); + + // Sync the `value_base` to the corresponding expected value + // + // Requires external locking covering `key` in `cf` or be in single thread + // to prevent concurrent write or delete to the same `key` + void SyncPut(int cf, int64_t key, uint32_t value_base); + + // Sync the corresponding expected value to be pending Put + // + // Requires external locking covering `key` in `cf` or be in single thread + // to prevent concurrent write or delete to the same `key` + void SyncPendingPut(int cf, int64_t key); + + // Sync the corresponding expected value to be deleted + // + // Requires external locking covering `key` in `cf` or be in single thread + // to prevent concurrent write or delete to the same `key` + void SyncDelete(int cf, int64_t key); + + // Sync the corresponding expected values to be deleted + // + // Requires external locking covering keys in `[begin_key, end_key)` in `cf` + // to prevent concurrent write or delete to the same `key` + void SyncDeleteRange(int cf, int64_t begin_key, int64_t end_key); + + private: + // Does not require external locking. + std::atomic<uint32_t>& Value(int cf, int64_t key) const { + return values_[cf * max_key_ + key]; + } + + // Does not require external locking + ExpectedValue Load(int cf, int64_t key) const { + return ExpectedValue(Value(cf, key).load()); + } + + const size_t max_key_; + const size_t num_column_families_; + + protected: + size_t GetValuesLen() const { + return sizeof(std::atomic<uint32_t>) * num_column_families_ * max_key_; + } + + // Requires external locking preventing concurrent execution with any other + // member function. + void Reset(); + + std::atomic<uint32_t>* values_; +}; + +// A `FileExpectedState` implements `ExpectedState` backed by a file. +class FileExpectedState : public ExpectedState { + public: + explicit FileExpectedState(std::string expected_state_file_path, + size_t max_key, size_t num_column_families); + + // Requires external locking preventing concurrent execution with any other + // member function. + Status Open(bool create) override; + + private: + const std::string expected_state_file_path_; + std::unique_ptr<MemoryMappedFileBuffer> expected_state_mmap_buffer_; +}; + +// An `AnonExpectedState` implements `ExpectedState` backed by a memory +// allocation. +class AnonExpectedState : public ExpectedState { + public: + explicit AnonExpectedState(size_t max_key, size_t num_column_families); + + // Requires external locking preventing concurrent execution with any other + // member function. + Status Open(bool create) override; + + private: + std::unique_ptr<std::atomic<uint32_t>[]> values_allocation_; +}; + +// An `ExpectedStateManager` manages data about the expected state of the +// database. It exposes operations for reading and modifying the latest +// expected state. +class ExpectedStateManager { + public: + explicit ExpectedStateManager(size_t max_key, size_t num_column_families); + + virtual ~ExpectedStateManager(); + + // Requires external locking preventing concurrent execution with any other + // member function. + virtual Status Open() = 0;
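A standalone sketch of the storage layout behind `ExpectedState::Value()` above: one `std::atomic<uint32_t>` per (column family, key) pair, kept in a single flat allocation indexed as `cf * max_key + key`. One flat buffer is what lets `FileExpectedState` mmap the whole state and `AnonExpectedState` keep it on the heap, interchangeably. Values below are assumed:

```cpp
#include <atomic>
#include <cassert>
#include <cstdint>
#include <memory>

int main() {
  const size_t num_column_families = 3;
  const size_t max_key = 1000;
  // One contiguous array of atomics for all (cf, key) slots.
  std::unique_ptr<std::atomic<uint32_t>[]> values(
      new std::atomic<uint32_t>[num_column_families * max_key]());
  auto value_at = [&](size_t cf, size_t key) -> std::atomic<uint32_t>& {
    return values[cf * max_key + key];  // flat 2-D indexing
  };
  value_at(2, 999).store(0x7fffu);  // last slot: still in bounds
  assert(value_at(2, 999).load() == 0x7fffu);
  assert(value_at(0, 0).load() == 0u);
  return 0;
}
```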
+ // Saves expected values for the current state of `db` and begins tracking + // changes. Following a successful `SaveAtAndAfter()`, `Restore()` can be + // called on the same DB, as long as its state does not roll back to before + // its current state. + // + // Requires external locking preventing concurrent execution with any other + // member function. Furthermore, `db` must not be mutated while this function + // is executing. + virtual Status SaveAtAndAfter(DB* db) = 0; + + // Returns true if at least one state of historical expected values can be + // restored. + // + // Requires external locking preventing concurrent execution with any other + // member function. + virtual bool HasHistory() = 0; + + // Restores expected values according to the current state of `db`. See + // `SaveAtAndAfter()` for conditions where this can be called. + // + // Requires external locking preventing concurrent execution with any other + // member function. Furthermore, `db` must not be mutated while this function + // is executing. + virtual Status Restore(DB* db) = 0; + + // Requires external locking covering all keys in `cf`. + void ClearColumnFamily(int cf) { return latest_->ClearColumnFamily(cf); } + + // See ExpectedState::PreparePut() + PendingExpectedValue PreparePut(int cf, int64_t key) { + return latest_->PreparePut(cf, key); + } + + // See ExpectedState::Get() + ExpectedValue Get(int cf, int64_t key) { return latest_->Get(cf, key); } + + // See ExpectedState::PrepareDelete() + PendingExpectedValue PrepareDelete(int cf, int64_t key) { + return latest_->PrepareDelete(cf, key); + } + + // See ExpectedState::PrepareSingleDelete() + PendingExpectedValue PrepareSingleDelete(int cf, int64_t key) { + return latest_->PrepareSingleDelete(cf, key); + } + + // See ExpectedState::PrepareDeleteRange() + std::vector<PendingExpectedValue> PrepareDeleteRange(int cf, + int64_t begin_key, + int64_t end_key) { + return latest_->PrepareDeleteRange(cf, begin_key, end_key); + } + + // See ExpectedState::Exists() + bool Exists(int cf, int64_t key) { return latest_->Exists(cf, key); } + + // See ExpectedState::SyncPut() + void SyncPut(int cf, int64_t key, uint32_t value_base) { + return latest_->SyncPut(cf, key, value_base); + } + + // See ExpectedState::SyncPendingPut() + void SyncPendingPut(int cf, int64_t key) { + return latest_->SyncPendingPut(cf, key); + } + + // See ExpectedState::SyncDelete() + void SyncDelete(int cf, int64_t key) { return latest_->SyncDelete(cf, key); } + + // See ExpectedState::SyncDeleteRange() + void SyncDeleteRange(int cf, int64_t begin_key, int64_t end_key) { + return latest_->SyncDeleteRange(cf, begin_key, end_key); + } + + protected: + const size_t max_key_; + const size_t num_column_families_; + std::unique_ptr<ExpectedState> latest_; +}; + +// A `FileExpectedStateManager` implements an `ExpectedStateManager` backed by +// a directory of files containing data about the expected state of the +// database. +class FileExpectedStateManager : public ExpectedStateManager { + public: + explicit FileExpectedStateManager(size_t max_key, size_t num_column_families, + std::string expected_state_dir_path); + + // Requires external locking preventing concurrent execution with any other + // member function. + Status Open() override; + + // See `ExpectedStateManager::SaveAtAndAfter()` API doc. + // + // This implementation makes a copy of "LATEST.state" into + // "<seqno>.state", and starts a trace in "<seqno>.trace". + // Due to using external files, a following `Restore()` can happen even + // from a different process. + Status SaveAtAndAfter(DB* db) override; + + // See `ExpectedStateManager::HasHistory()` API doc. + bool HasHistory() override; + + // See `ExpectedStateManager::Restore()` API doc. + // + // Say `db->GetLatestSequenceNumber()` was `a` last time `SaveAtAndAfter()` + // was called and now it is `b`. Then this function replays `b - a` write + // operations from "`a`.trace" onto "`a`.state", and then copies the resulting + // file into "LATEST.state".
+ Status Restore(DB* db) override; + + private: + // Requires external locking preventing concurrent execution with any other + // member function. + Status Clean(); + + std::string GetTempPathForFilename(const std::string& filename); + std::string GetPathForFilename(const std::string& filename); + + static const std::string kLatestBasename; + static const std::string kStateFilenameSuffix; + static const std::string kTraceFilenameSuffix; + static const std::string kTempFilenamePrefix; + static const std::string kTempFilenameSuffix; + + const std::string expected_state_dir_path_; + SequenceNumber saved_seqno_ = kMaxSequenceNumber; +}; + +// An `AnonExpectedStateManager` implements an `ExpectedStateManager` backed by +// a memory allocation containing data about the expected state of the database. +class AnonExpectedStateManager : public ExpectedStateManager { + public: + explicit AnonExpectedStateManager(size_t max_key, size_t num_column_families); + + // See `ExpectedStateManager::SaveAtAndAfter()` API doc. + // + // This implementation returns `Status::NotSupported` since we do not + // currently have a need to keep history of expected state within a process. + Status SaveAtAndAfter(DB* /* db */) override { + return Status::NotSupported(); + } + + // See `ExpectedStateManager::HasHistory()` API doc. + bool HasHistory() override { return false; } + + // See `ExpectedStateManager::Restore()` API doc. + // + // This implementation returns `Status::NotSupported` since we do not + // currently have a need to keep history of expected state within a process. + Status Restore(DB* /* db */) override { return Status::NotSupported(); } + + // Requires external locking preventing concurrent execution with any other + // member function. + Status Open() override; +}; +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/expected_value.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/expected_value.h new file mode 100644 index 0000000000..2e41b130cd --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/expected_value.h @@ -0,0 +1,206 @@ +// Copyright (c) 2021-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS + +#pragma once + +#include + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +// `ExpectedValue` represents the expected value of a key used in db stress. +// It provides APIs to obtain various information, e.g., value base, +// existence, and pending operation status, as well as APIs to edit the +// expected value. +// +// This class is not thread-safe.
+class ExpectedValue { + public: + static uint32_t GetValueBaseMask() { return VALUE_BASE_MASK; } + static uint32_t GetValueBaseDelta() { return VALUE_BASE_DELTA; } + static uint32_t GetDelCounterDelta() { return DEL_COUNTER_DELTA; } + static uint32_t GetDelMask() { return DEL_MASK; } + static bool IsValueBaseValid(uint32_t value_base) { + return IsValuePartValid(value_base, VALUE_BASE_MASK); + } + + explicit ExpectedValue(uint32_t expected_value) + : expected_value_(expected_value) {} + + bool Exists() const { return PendingWrite() || !IsDeleted(); } + + uint32_t Read() const { return expected_value_; } + + void Put(bool pending); + + bool Delete(bool pending); + + void SyncPut(uint32_t value_base); + + void SyncPendingPut(); + + void SyncDelete(); + + uint32_t GetValueBase() const { return GetValuePart(VALUE_BASE_MASK); } + + uint32_t NextValueBase() const { + return GetIncrementedValuePart(VALUE_BASE_MASK, VALUE_BASE_DELTA); + } + + void SetValueBase(uint32_t new_value_base) { + SetValuePart(VALUE_BASE_MASK, new_value_base); + } + + bool PendingWrite() const { + const uint32_t pending_write = GetValuePart(PENDING_WRITE_MASK); + return pending_write != 0; + } + + void SetPendingWrite() { + SetValuePart(PENDING_WRITE_MASK, PENDING_WRITE_MASK); + } + + void ClearPendingWrite() { ClearValuePart(PENDING_WRITE_MASK); } + + uint32_t GetDelCounter() const { return GetValuePart(DEL_COUNTER_MASK); } + + uint32_t NextDelCounter() const { + return GetIncrementedValuePart(DEL_COUNTER_MASK, DEL_COUNTER_DELTA); + } + + void SetDelCounter(uint32_t new_del_counter) { + SetValuePart(DEL_COUNTER_MASK, new_del_counter); + } + + bool PendingDelete() const { + const uint32_t pending_del = GetValuePart(PENDING_DEL_MASK); + return pending_del != 0; + } + + void SetPendingDel() { SetValuePart(PENDING_DEL_MASK, PENDING_DEL_MASK); } + + void ClearPendingDel() { ClearValuePart(PENDING_DEL_MASK); } + + bool IsDeleted() const { + const uint32_t deleted = GetValuePart(DEL_MASK); + return deleted != 0; + } + + void SetDeleted() { SetValuePart(DEL_MASK, DEL_MASK); } + + void ClearDeleted() { ClearValuePart(DEL_MASK); } + + uint32_t GetFinalValueBase() const; + + uint32_t GetFinalDelCounter() const; + + private: + static bool IsValuePartValid(uint32_t value_part, uint32_t value_part_mask) { + return (value_part & (~value_part_mask)) == 0; + } + + // The 32-bit expected_value_ is divided into following parts: + // Bit 0 - 14: value base + static constexpr uint32_t VALUE_BASE_MASK = 0x7fff; + static constexpr uint32_t VALUE_BASE_DELTA = 1; + // Bit 15: whether write to this value base is pending (0 equals `false`) + static constexpr uint32_t PENDING_WRITE_MASK = (uint32_t)1 << 15; + // Bit 16 - 29: deletion counter (i.e, number of times this value base has + // been deleted) + static constexpr uint32_t DEL_COUNTER_MASK = 0x3fff0000; + static constexpr uint32_t DEL_COUNTER_DELTA = (uint32_t)1 << 16; + // Bit 30: whether deletion of this value base is pending (0 equals `false`) + static constexpr uint32_t PENDING_DEL_MASK = (uint32_t)1 << 30; + // Bit 31: whether this value base is deleted (0 equals `false`) + static constexpr uint32_t DEL_MASK = (uint32_t)1 << 31; + + uint32_t GetValuePart(uint32_t value_part_mask) const { + return expected_value_ & value_part_mask; + } + + uint32_t GetIncrementedValuePart(uint32_t value_part_mask, + uint32_t value_part_delta) const { + uint32_t current_value_part = GetValuePart(value_part_mask); + ExpectedValue temp_expected_value(current_value_part + value_part_delta); + return 
temp_expected_value.GetValuePart(value_part_mask); + } + + void SetValuePart(uint32_t value_part_mask, uint32_t new_value_part) { + assert(IsValuePartValid(new_value_part, value_part_mask)); + ClearValuePart(value_part_mask); + expected_value_ |= new_value_part; + } + + void ClearValuePart(uint32_t value_part_mask) { + expected_value_ &= (~value_part_mask); + } + + uint32_t expected_value_; +}; + +// `PendingExpectedValue` represents the expected value of a key undergoing a +// pending operation in db stress. +// +// This class is not thread-safe. +class PendingExpectedValue { + public: + explicit PendingExpectedValue(std::atomic<uint32_t>* value_ptr, + ExpectedValue orig_value, + ExpectedValue final_value) + : value_ptr_(value_ptr), + orig_value_(orig_value), + final_value_(final_value) {} + + void Commit() { + // To prevent low-level instruction reordering that could result in the + // expected value being set before the db write + std::atomic_thread_fence(std::memory_order_release); + value_ptr_->store(final_value_.Read()); + } + + uint32_t GetFinalValueBase() { return final_value_.GetValueBase(); } + + private: + std::atomic<uint32_t>* const value_ptr_; + const ExpectedValue orig_value_; + const ExpectedValue final_value_; +}; + +// `ExpectedValueHelper` provides utils to parse `ExpectedValue` to obtain +// useful info about it in db stress +class ExpectedValueHelper { + public: + // Return whether the key associated with `pre_read_expected_value` and + // `post_read_expected_value` is expected not to exist from the beginning + // till the end of the read + // + // The negation of `MustHaveNotExisted()` is "may have not existed". + // To assert some key must have existed, please use `MustHaveExisted()` + static bool MustHaveNotExisted(ExpectedValue pre_read_expected_value, + ExpectedValue post_read_expected_value); + + // Return whether the key associated with `pre_read_expected_value` and + // `post_read_expected_value` is expected to exist from the beginning till + // the end of the read. + // + // The negation of `MustHaveExisted()` is "may have existed". + // To assert some key must not have existed, please use + // `MustHaveNotExisted()` + static bool MustHaveExisted(ExpectedValue pre_read_expected_value, + ExpectedValue post_read_expected_value); + + // Return whether the `value_base` falls within the expected value base + static bool InExpectedValueBaseRange(uint32_t value_base, + ExpectedValue pre_read_expected_value, + ExpectedValue post_read_expected_value); +}; +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/multi_ops_txns_stress.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/multi_ops_txns_stress.h new file mode 100644 index 0000000000..12c45aaa32 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/db_stress_tool/multi_ops_txns_stress.h @@ -0,0 +1,446 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
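For reference, a standalone recap of the 32-bit layout documented in `ExpectedValue` above: bits 0-14 hold the value base, bit 15 flags a pending write, bits 16-29 hold the delete counter, bit 30 flags a pending delete, and bit 31 flags deletion. The constants mirror the ones in the class; the manipulations mirror `SetValuePart()`/`ClearValuePart()`:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t kValueBaseMask = 0x7fff;               // bits 0-14
  const uint32_t kPendingWriteMask = uint32_t{1} << 15; // bit 15
  const uint32_t kDelCounterMask = 0x3fff0000;          // bits 16-29
  const uint32_t kDelCounterDelta = uint32_t{1} << 16;
  const uint32_t kDelMask = uint32_t{1} << 31;          // bit 31

  uint32_t v = 0;
  v = (v & ~kValueBaseMask) | 123u;  // SetValueBase(123)
  v |= kPendingWriteMask;            // SetPendingWrite()
  v &= ~kPendingWriteMask;           // ClearPendingWrite()
  uint32_t bumped =                  // NextDelCounter(): wrap is masked off
      ((v & kDelCounterMask) + kDelCounterDelta) & kDelCounterMask;
  v = (v & ~kDelCounterMask) | bumped;
  v |= kDelMask;                     // SetDeleted()

  assert((v & kValueBaseMask) == 123u);
  assert((v & kDelCounterMask) == kDelCounterDelta);  // counter is now 1
  assert((v & kDelMask) != 0u);  // deleted, base and counter preserved
  return 0;
}
```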
+ +#ifdef GFLAGS +#include "db_stress_tool/db_stress_common.h" + +namespace ROCKSDB_NAMESPACE { + +// This file defines MultiOpsTxnsStressTest so that we can stress test RocksDB +// transactions on a simple, emulated relational table. +// +// The record format is similar to the example found at +// https://github.com/facebook/mysql-5.6/wiki/MyRocks-record-format. +// +// The table is created by +// ``` +// create table t1 ( +// a int primary key, +// b int, +// c int, +// key(c), +// ) +// ``` +// +// (For simplicity, we use uint32_t for int here.) +// +// For this table, there is a primary index using `a`, as well as a secondary +// index using `c` and `a`. +// +// Primary key format: +// | index id | M(a) | +// Primary index value: +// | b | c | +// M(a) represents the big-endian format of a. +// +// Secondary key format: +// | index id | M(c) | M(a) | +// Secondary index value: +// | crc32 | +// Similarly to M(a), M(c) is the big-endian format of c. +// +// The in-memory representation of a record is defined in class +// MultiOpsTxnsStressTest::Record, which includes a number of helper methods to +// encode/decode primary index keys, primary index values, secondary index keys, +// secondary index values, etc. +// +// Sometimes the primary index and secondary index reside in different column +// families, but sometimes they colocate in the same column family. The current +// implementation puts them in the same (default) column family, and this is +// subject to future change if we find it interesting to test the other case. +// +// Class MultiOpsTxnsStressTest has the following transactions for testing. +// +// 1. Primary key update +// UPDATE t1 SET a = 3 WHERE a = 2; +// ``` +// tx->GetForUpdate(primary key a=2) +// tx->GetForUpdate(primary key a=3) +// tx->Delete(primary key a=2) +// tx->Put(primary key a=3, value) +// tx->batch->SingleDelete(secondary key a=2) +// tx->batch->Put(secondary key a=3, value) +// tx->Prepare() +// tx->Commit() +// ``` +// +// 2. Secondary key update +// UPDATE t1 SET c = 3 WHERE c = 2; +// ``` +// iter->Seek(secondary key) +// // Get corresponding primary key value(s) from iterator +// tx->GetForUpdate(primary key) +// tx->Put(primary key, value c=3) +// tx->batch->SingleDelete(secondary key c=2) +// tx->batch->Put(secondary key c=3) +// tx->Prepare() +// tx->Commit() +// ``` +// +// 3. Primary index value update +// UPDATE t1 SET b = b + 1 WHERE a = 2; +// ``` +// tx->GetForUpdate(primary key a=2) +// tx->Put(primary key a=2, value b=b+1) +// tx->Prepare() +// tx->Commit() +// ``` +// +// 4. Point lookup +// SELECT * FROM t1 WHERE a = 3; +// ``` +// tx->Get(primary key a=3) +// tx->Commit() +// ``` +// +// 5. Range scan +// SELECT * FROM t1 WHERE c = 2; +// ``` +// it = tx->GetIterator() +// it->Seek(secondary key c=2) +// tx->Commit() +// ``` + +class MultiOpsTxnsStressTest : public StressTest { + public: + class Record { + public: + static constexpr uint32_t kMetadataPrefix = 0; + static constexpr uint32_t kPrimaryIndexId = 1; + static constexpr uint32_t kSecondaryIndexId = 2; + + static constexpr size_t kPrimaryIndexEntrySize = 8 + 8; + static constexpr size_t kSecondaryIndexEntrySize = 12 + 4; + + static_assert(kPrimaryIndexId < kSecondaryIndexId, + "kPrimaryIndexId must be smaller than kSecondaryIndexId"); + + static_assert(sizeof(kPrimaryIndexId) == sizeof(uint32_t), + "kPrimaryIndexId must be 4 bytes"); + static_assert(sizeof(kSecondaryIndexId) == sizeof(uint32_t), + "kSecondaryIndexId must be 4 bytes");
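A standalone sketch of the key encodings described above (RocksDB's own encoding helpers are deliberately not used, and the concrete values are assumed). Fixed-width big-endian integers make bytewise key order agree with numeric order, which is what lets the secondary index be range-scanned by `c`; index ids follow the `Record` constants, 1 for primary and 2 for secondary:

```cpp
#include <cassert>
#include <cstdint>
#include <string>

// M(v): 4-byte big-endian encoding of a uint32_t.
std::string BigEndian32(uint32_t v) {
  std::string out(4, '\0');
  out[0] = static_cast<char>(v >> 24);
  out[1] = static_cast<char>(v >> 16);
  out[2] = static_cast<char>(v >> 8);
  out[3] = static_cast<char>(v);
  return out;
}

int main() {
  // Primary key | index id | M(a) |: 8 bytes, half of kPrimaryIndexEntrySize.
  std::string pk_a2 = BigEndian32(1) + BigEndian32(2);
  std::string pk_a3 = BigEndian32(1) + BigEndian32(3);
  assert(pk_a2.size() == 8);
  assert(pk_a2 < pk_a3);  // bytewise order matches numeric order of `a`

  // Secondary key | index id | M(c) | M(a) |: 12 bytes; the shared
  // | index id | M(c) | prefix is what a range scan on `c` seeks to.
  std::string sk = BigEndian32(2) + BigEndian32(7) + BigEndian32(2);
  assert(sk.size() == 12);
  return 0;
}
```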
+    // Used for generating search key to probe primary index.
+    static std::string EncodePrimaryKey(uint32_t a);
+    // Used for generating search prefix to probe secondary index.
+    static std::string EncodeSecondaryKey(uint32_t c);
+    // Used for generating search key to probe secondary index.
+    static std::string EncodeSecondaryKey(uint32_t c, uint32_t a);
+
+    static std::tuple<Status, uint32_t, uint32_t> DecodePrimaryIndexValue(
+        Slice primary_index_value);
+
+    static std::pair<Status, uint32_t> DecodeSecondaryIndexValue(
+        Slice secondary_index_value);
+
+    Record() = default;
+    Record(uint32_t _a, uint32_t _b, uint32_t _c) : a_(_a), b_(_b), c_(_c) {}
+
+    bool operator==(const Record& other) const {
+      return a_ == other.a_ && b_ == other.b_ && c_ == other.c_;
+    }
+
+    bool operator!=(const Record& other) const { return !(*this == other); }
+
+    std::pair<std::string, std::string> EncodePrimaryIndexEntry() const;
+
+    std::string EncodePrimaryKey() const;
+
+    std::string EncodePrimaryIndexValue() const;
+
+    std::pair<std::string, std::string> EncodeSecondaryIndexEntry() const;
+
+    std::string EncodeSecondaryKey() const;
+
+    Status DecodePrimaryIndexEntry(Slice primary_index_key,
+                                   Slice primary_index_value);
+
+    Status DecodeSecondaryIndexEntry(Slice secondary_index_key,
+                                     Slice secondary_index_value);
+
+    uint32_t a_value() const { return a_; }
+    uint32_t b_value() const { return b_; }
+    uint32_t c_value() const { return c_; }
+
+    void SetA(uint32_t _a) { a_ = _a; }
+    void SetB(uint32_t _b) { b_ = _b; }
+    void SetC(uint32_t _c) { c_ = _c; }
+
+    std::string ToString() const {
+      std::string ret("(");
+      ret.append(std::to_string(a_));
+      ret.append(",");
+      ret.append(std::to_string(b_));
+      ret.append(",");
+      ret.append(std::to_string(c_));
+      ret.append(")");
+      return ret;
+    }
+
+   private:
+    friend class InvariantChecker;
+
+    uint32_t a_{0};
+    uint32_t b_{0};
+    uint32_t c_{0};
+  };
+
+  MultiOpsTxnsStressTest() {}
+
+  ~MultiOpsTxnsStressTest() override {}
+
+  void FinishInitDb(SharedState*) override;
+
+  void ReopenAndPreloadDbIfNeeded(SharedState* shared);
+
+  bool IsStateTracked() const override { return false; }
+
+  Status TestGet(ThreadState* thread, const ReadOptions& read_opts,
+                 const std::vector<int>& rand_column_families,
+                 const std::vector<int64_t>& rand_keys) override;
+
+  std::vector<Status> TestMultiGet(
+      ThreadState* thread, const ReadOptions& read_opts,
+      const std::vector<int>& rand_column_families,
+      const std::vector<int64_t>& rand_keys) override;
+
+  void TestGetEntity(ThreadState* thread, const ReadOptions& read_opts,
+                     const std::vector<int>& rand_column_families,
+                     const std::vector<int64_t>& rand_keys) override;
+
+  void TestMultiGetEntity(ThreadState* thread, const ReadOptions& read_opts,
+                          const std::vector<int>& rand_column_families,
+                          const std::vector<int64_t>& rand_keys) override;
+
+  Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts,
+                        const std::vector<int>& rand_column_families,
+                        const std::vector<int64_t>& rand_keys) override;
+
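+  // Illustrative round-trip of the Record codec above (assumed usage, for
+  // exposition only; not part of the vendored source):
+  //
+  //   Record r(/*a=*/1, /*b=*/2, /*c=*/3);
+  //   std::string k, v;
+  //   std::tie(k, v) = r.EncodePrimaryIndexEntry();
+  //   Record decoded;
+  //   Status s = decoded.DecodePrimaryIndexEntry(k, v);
+  //   assert(s.ok() && decoded == r);
+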
+  // Given a key K, this creates an iterator which scans to K and then
+  // does a random sequence of Next/Prev operations.
+  Status TestIterate(ThreadState* thread, const ReadOptions& read_opts,
+                     const std::vector<int>& rand_column_families,
+                     const std::vector<int64_t>& rand_keys) override;
+
+  Status TestPut(ThreadState* thread, WriteOptions& write_opts,
+                 const ReadOptions& read_opts, const std::vector<int>& cf_ids,
+                 const std::vector<int64_t>& keys,
+                 char (&value)[100]) override;
+
+  Status TestDelete(ThreadState* thread, WriteOptions& write_opts,
+                    const std::vector<int>& rand_column_families,
+                    const std::vector<int64_t>& rand_keys) override;
+
+  Status TestDeleteRange(ThreadState* thread, WriteOptions& write_opts,
+                         const std::vector<int>& rand_column_families,
+                         const std::vector<int64_t>& rand_keys) override;
+
+  void TestIngestExternalFile(ThreadState* thread,
+                              const std::vector<int>& rand_column_families,
+                              const std::vector<int64_t>& rand_keys) override;
+
+  void TestCompactRange(ThreadState* thread, int64_t rand_key,
+                        const Slice& start_key,
+                        ColumnFamilyHandle* column_family) override;
+
+  Status TestBackupRestore(ThreadState* thread,
+                           const std::vector<int>& rand_column_families,
+                           const std::vector<int64_t>& rand_keys) override;
+
+  Status TestCheckpoint(ThreadState* thread,
+                        const std::vector<int>& rand_column_families,
+                        const std::vector<int64_t>& rand_keys) override;
+
+  Status TestApproximateSize(ThreadState* thread, uint64_t iteration,
+                             const std::vector<int>& rand_column_families,
+                             const std::vector<int64_t>& rand_keys) override;
+
+  Status TestCustomOperations(
+      ThreadState* thread,
+      const std::vector<int>& rand_column_families) override;
+
+  void RegisterAdditionalListeners() override;
+
+  void PrepareTxnDbOptions(SharedState* /*shared*/,
+                           TransactionDBOptions& txn_db_opts) override;
+
+  Status PrimaryKeyUpdateTxn(ThreadState* thread, uint32_t old_a,
+                             uint32_t old_a_pos, uint32_t new_a);
+
+  Status SecondaryKeyUpdateTxn(ThreadState* thread, uint32_t old_c,
+                               uint32_t old_c_pos, uint32_t new_c);
+
+  Status UpdatePrimaryIndexValueTxn(ThreadState* thread, uint32_t a,
+                                    uint32_t b_delta);
+
+  Status PointLookupTxn(ThreadState* thread, ReadOptions ropts, uint32_t a);
+
+  Status RangeScanTxn(ThreadState* thread, ReadOptions ropts, uint32_t c);
+
+  void VerifyDb(ThreadState* thread) const override;
+
+  void ContinuouslyVerifyDb(ThreadState* thread) const override {
+    VerifyDb(thread);
+  }
+
+  void VerifyPkSkFast(const ReadOptions& read_options, int job_id);
+
+ protected:
+  class Counter {
+   public:
+    uint64_t Next() { return value_.fetch_add(1); }
+
+   private:
+    std::atomic<uint64_t> value_ = Env::Default()->NowNanos();
+  };
+
+  using KeySet = std::set<uint32_t>;
+  class KeyGenerator {
+   public:
+    explicit KeyGenerator(uint32_t s, uint32_t low, uint32_t high,
+                          KeySet&& existing_uniq, KeySet&& non_existing_uniq)
+        : rand_(s),
+          low_(low),
+          high_(high),
+          existing_uniq_(std::move(existing_uniq)),
+          non_existing_uniq_(std::move(non_existing_uniq)) {}
+    ~KeyGenerator() {
+      assert(!existing_uniq_.empty());
+      assert(!non_existing_uniq_.empty());
+    }
+    void FinishInit();
+
+    std::pair<uint32_t, uint32_t> ChooseExisting();
+    void Replace(uint32_t old_val, uint32_t old_pos, uint32_t new_val);
+    uint32_t Allocate();
+    void UndoAllocation(uint32_t new_val);
+
+    std::string ToString() const {
+      std::ostringstream oss;
+      oss << "[" << low_ << ", " << high_ << "): " << existing_.size()
+          << " elements, " << existing_uniq_.size() << " unique values, "
+          << non_existing_uniq_.size() << " unique non-existing values";
+      return oss.str();
+    }
+
+   private:
+    Random rand_;
+    uint32_t low_ = 0;
+    uint32_t high_ = 0;
+    std::vector<uint32_t> existing_{};
+    KeySet existing_uniq_{};
+    KeySet non_existing_uniq_{};
+    bool initialized_ = false;
+  };
+
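+  // Illustrative use of KeyGenerator in a write path (an assumed flow, for
+  // exposition only; not part of the vendored source): allocate a
+  // not-yet-existing key, attempt the transaction, and undo the allocation
+  // if the transaction cannot commit:
+  //
+  //   uint32_t new_a = key_gen->Allocate();
+  //   Status s = /* run a txn that inserts new_a */;
+  //   if (s.ok()) {
+  //     key_gen->Replace(old_a, old_a_pos, new_a);
+  //   } else {
+  //     key_gen->UndoAllocation(new_a);
+  //   }
+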
+  // Return <a, pos>
+  std::pair<uint32_t, uint32_t> ChooseExistingA(ThreadState* thread);
+
+  uint32_t GenerateNextA(ThreadState* thread);
+
+  // Return <c, pos>
+  std::pair<uint32_t, uint32_t> ChooseExistingC(ThreadState* thread);
+
+  uint32_t GenerateNextC(ThreadState* thread);
+
+  // Randomly commit or rollback `txn`
+  void ProcessRecoveredPreparedTxnsHelper(Transaction* txn,
+                                          SharedState*) override;
+
+  // Some applications, e.g. MyRocks, write a KV pair to the database via a
+  // commit-time-write-batch (ctwb) in addition to the transaction's regular
+  // write batch. The key is usually constant, representing some system
+  // metadata, while the value is monotonically increasing and represents the
+  // actual value of the metadata. Method WriteToCommitTimeWriteBatch()
+  // emulates this scenario.
+  Status WriteToCommitTimeWriteBatch(Transaction& txn);
+
+  Status CommitAndCreateTimestampedSnapshotIfNeeded(ThreadState* thread,
+                                                    Transaction& txn);
+
+  void SetupSnapshot(ThreadState* thread, ReadOptions& read_opts,
+                     Transaction& txn,
+                     std::shared_ptr<const Snapshot>& snapshot);
+
+  std::vector<std::unique_ptr<KeyGenerator>> key_gen_for_a_;
+  std::vector<std::unique_ptr<KeyGenerator>> key_gen_for_c_;
+
+  Counter counter_{};
+
+ private:
+  struct KeySpaces {
+    uint32_t lb_a = 0;
+    uint32_t ub_a = 0;
+    uint32_t lb_c = 0;
+    uint32_t ub_c = 0;
+
+    explicit KeySpaces() = default;
+    explicit KeySpaces(uint32_t _lb_a, uint32_t _ub_a, uint32_t _lb_c,
+                       uint32_t _ub_c)
+        : lb_a(_lb_a), ub_a(_ub_a), lb_c(_lb_c), ub_c(_ub_c) {}
+
+    std::string EncodeTo() const;
+    bool DecodeFrom(Slice data);
+  };
+
+  void PersistKeySpacesDesc(const std::string& key_spaces_path, uint32_t lb_a,
+                            uint32_t ub_a, uint32_t lb_c, uint32_t ub_c);
+
+  KeySpaces ReadKeySpacesDesc(const std::string& key_spaces_path);
+
+  void PreloadDb(SharedState* shared, int threads, uint32_t lb_a,
+                 uint32_t ub_a, uint32_t lb_c, uint32_t ub_c);
+
+  void ScanExistingDb(SharedState* shared, int threads);
+};
+
+class InvariantChecker {
+ public:
+  static_assert(sizeof(MultiOpsTxnsStressTest::Record().a_) ==
+                    sizeof(uint32_t),
+                "MultiOpsTxnsStressTest::Record::a_ must be 4 bytes");
+  static_assert(sizeof(MultiOpsTxnsStressTest::Record().b_) ==
+                    sizeof(uint32_t),
+                "MultiOpsTxnsStressTest::Record::b_ must be 4 bytes");
+  static_assert(sizeof(MultiOpsTxnsStressTest::Record().c_) ==
+                    sizeof(uint32_t),
+                "MultiOpsTxnsStressTest::Record::c_ must be 4 bytes");
+};
+
+class MultiOpsTxnsStressListener : public EventListener {
+ public:
+  explicit MultiOpsTxnsStressListener(MultiOpsTxnsStressTest* stress_test)
+      : stress_test_(stress_test) {
+    assert(stress_test_);
+  }
+
+  ~MultiOpsTxnsStressListener() override {}
+
+  void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+    assert(db);
+#ifdef NDEBUG
+    (void)db;
+#endif
+    assert(info.cf_id == 0);
+    const ReadOptions read_options(Env::IOActivity::kFlush);
+    stress_test_->VerifyPkSkFast(read_options, info.job_id);
+  }
+
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override {
+    assert(db);
+#ifdef NDEBUG
+    (void)db;
+#endif
+    assert(info.cf_id == 0);
+    const ReadOptions read_options(Env::IOActivity::kCompaction);
+    stress_test_->VerifyPkSkFast(read_options, info.job_id);
+  }
+
+ private:
+  MultiOpsTxnsStressTest* const stress_test_ = nullptr;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // GFLAGS
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/composite_env.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/composite_env.cc
new file mode 100644
index 0000000000..8ddc9a1a6c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/composite_env.cc
@@ -0,0 +1,534 @@
+// Copyright
(c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "env/composite_env_wrapper.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +// The CompositeEnvWrapper class provides an interface that is compatible +// with the old monolithic Env API, and an implementation that wraps around +// the new Env that provides threading and other OS related functionality, and +// the new FileSystem API that provides storage functionality. By +// providing the old Env interface, it allows the rest of RocksDB code to +// be agnostic of whether the underlying Env implementation is a monolithic +// Env or an Env + FileSystem. In the former case, the user will specify +// Options::env only, whereas in the latter case, the user will specify +// Options::env and Options::file_system. + +class CompositeSequentialFileWrapper : public SequentialFile { + public: + explicit CompositeSequentialFileWrapper( + std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Read(size_t n, Slice* result, char* scratch) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(n, io_opts, result, scratch, &dbg); + } + Status Skip(uint64_t n) override { return target_->Skip(n); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedRead(offset, n, io_opts, result, scratch, &dbg); + } + + private: + std::unique_ptr target_; +}; + +class CompositeRandomAccessFileWrapper : public RandomAccessFile { + public: + explicit CompositeRandomAccessFileWrapper( + std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(offset, n, io_opts, result, scratch, &dbg); + } + Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { + IOOptions io_opts; + IODebugContext dbg; + std::vector fs_reqs; + Status status; + + fs_reqs.resize(num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + fs_reqs[i].offset = reqs[i].offset; + fs_reqs[i].len = reqs[i].len; + fs_reqs[i].scratch = reqs[i].scratch; + fs_reqs[i].status = IOStatus::OK(); + } + status = target_->MultiRead(fs_reqs.data(), num_reqs, io_opts, &dbg); + for (size_t i = 0; i < num_reqs; ++i) { + reqs[i].result = fs_reqs[i].result; + reqs[i].status = fs_reqs[i].status; + } + return status; + } + Status Prefetch(uint64_t offset, size_t n) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Prefetch(offset, n, io_opts, &dbg); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { + target_->Hint((FSRandomAccessFile::AccessPattern)pattern); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return 
target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + private: + std::unique_ptr target_; +}; + +class CompositeWritableFileWrapper : public WritableFile { + public: + explicit CompositeWritableFileWrapper(std::unique_ptr& t) + : target_(std::move(t)) {} + + Status Append(const Slice& data) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Append(data, io_opts, &dbg); + } + Status Append(const Slice& data, + const DataVerificationInfo& verification_info) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Append(data, io_opts, verification_info, &dbg); + } + Status PositionedAppend(const Slice& data, uint64_t offset) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedAppend(data, offset, io_opts, &dbg); + } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& verification_info) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedAppend(data, offset, io_opts, verification_info, + &dbg); + } + Status Truncate(uint64_t size) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Truncate(size, io_opts, &dbg); + } + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Close(io_opts, &dbg); + } + Status Flush() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Flush(io_opts, &dbg); + } + Status Sync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Sync(io_opts, &dbg); + } + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->GetFileSize(io_opts, &dbg); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->RangeSync(offset, nbytes, io_opts, &dbg); + } + + void PrepareWrite(size_t offset, size_t len) override { + IOOptions io_opts; + IODebugContext dbg; + target_->PrepareWrite(offset, len, io_opts, &dbg); + } + + Status Allocate(uint64_t offset, uint64_t len) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Allocate(offset, len, io_opts, &dbg); + } + + std::unique_ptr* target() { return &target_; } + + private: + std::unique_ptr target_; +}; + +class CompositeRandomRWFileWrapper : public RandomRWFile { + 
public: + explicit CompositeRandomRWFileWrapper(std::unique_ptr& target) + : target_(std::move(target)) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status Write(uint64_t offset, const Slice& data) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Write(offset, data, io_opts, &dbg); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(offset, n, io_opts, result, scratch, &dbg); + } + Status Flush() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Flush(io_opts, &dbg); + } + Status Sync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Sync(io_opts, &dbg); + } + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Close(io_opts, &dbg); + } + + private: + std::unique_ptr target_; +}; + +class CompositeDirectoryWrapper : public Directory { + public: + explicit CompositeDirectoryWrapper(std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->FsyncWithDirOptions(io_opts, &dbg, DirFsyncOptions()); + } + + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Close(io_opts, &dbg); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr target_; +}; +} // namespace + +Status CompositeEnv::NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewSequentialFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeSequentialFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewRandomAccessFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeRandomAccessFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewWritableFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = file_system_->NewWritableFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + IODebugContext dbg; + Status status; + std::unique_ptr file; + status = file_system_->ReopenWritableFile(fname, FileOptions(options), &file, + &dbg); + if (status.ok()) { + result->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + Status status; + std::unique_ptr file; + status = file_system_->ReuseWritableFile(fname, old_fname, + FileOptions(options), &file, &dbg); + if 
(status.ok()) { + r->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewRandomRWFile(fname, FileOptions(options), &file, &dbg); + if (status.ok()) { + result->reset(new CompositeRandomRWFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewDirectory(const std::string& name, + std::unique_ptr* result) { + IOOptions io_opts; + IODebugContext dbg; + std::unique_ptr dir; + Status status; + status = file_system_->NewDirectory(name, io_opts, &dir, &dbg); + if (status.ok()) { + result->reset(new CompositeDirectoryWrapper(dir)); + } + return status; +} + +namespace { +static std::unordered_map env_wrapper_type_info = { + {"target", + OptionTypeInfo(0, OptionType::kUnknown, OptionVerificationType::kByName, + OptionTypeFlags::kDontSerialize) + .SetParseFunc([](const ConfigOptions& opts, + const std::string& /*name*/, const std::string& value, + void* addr) { + auto target = static_cast(addr); + return Env::CreateFromString(opts, value, &(target->env), + &(target->guard)); + }) + .SetEqualsFunc([](const ConfigOptions& opts, + const std::string& /*name*/, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto target1 = static_cast(addr1); + const auto target2 = static_cast(addr2); + if (target1->env != nullptr) { + return target1->env->AreEquivalent(opts, target2->env, mismatch); + } else { + return (target2->env == nullptr); + } + }) + .SetPrepareFunc([](const ConfigOptions& opts, + const std::string& /*name*/, void* addr) { + auto target = static_cast(addr); + if (target->guard.get() != nullptr) { + target->env = target->guard.get(); + } else if (target->env == nullptr) { + target->env = Env::Default(); + } + return target->env->PrepareOptions(opts); + }) + .SetValidateFunc([](const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts, + const std::string& /*name*/, const void* addr) { + const auto target = static_cast(addr); + if (target->env == nullptr) { + return Status::InvalidArgument("Target Env not specified"); + } else { + return target->env->ValidateOptions(db_opts, cf_opts); + } + })}, +}; +static std::unordered_map + composite_fs_wrapper_type_info = { + {"file_system", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, +}; + +static std::unordered_map + composite_clock_wrapper_type_info = { + {"clock", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, +}; + +} // namespace + +std::unique_ptr NewCompositeEnv(const std::shared_ptr& fs) { + return std::unique_ptr(new CompositeEnvWrapper(Env::Default(), fs)); +} + +CompositeEnvWrapper::CompositeEnvWrapper(Env* env, + const std::shared_ptr& fs, + const std::shared_ptr& sc) + : CompositeEnv(fs, sc), target_(env) { + RegisterOptions("", &target_, &env_wrapper_type_info); + RegisterOptions("", &file_system_, &composite_fs_wrapper_type_info); + RegisterOptions("", &system_clock_, &composite_clock_wrapper_type_info); +} + +CompositeEnvWrapper::CompositeEnvWrapper(const std::shared_ptr& env, + const std::shared_ptr& fs, + const std::shared_ptr& sc) + : CompositeEnv(fs, sc), target_(env) { + RegisterOptions("", &target_, &env_wrapper_type_info); + RegisterOptions("", &file_system_, &composite_fs_wrapper_type_info); + RegisterOptions("", &system_clock_, 
&composite_clock_wrapper_type_info); +} + +Status CompositeEnvWrapper::PrepareOptions(const ConfigOptions& options) { + target_.Prepare(); + if (file_system_ == nullptr) { + file_system_ = target_.env->GetFileSystem(); + } + if (system_clock_ == nullptr) { + system_clock_ = target_.env->GetSystemClock(); + } + return Env::PrepareOptions(options); +} + +std::string CompositeEnvWrapper::SerializeOptions( + const ConfigOptions& config_options, const std::string& header) const { + auto options = CompositeEnv::SerializeOptions(config_options, header); + if (target_.env != nullptr && target_.env != Env::Default()) { + options.append("target="); + options.append(target_.env->ToString(config_options)); + } + return options; +} + +EnvWrapper::EnvWrapper(Env* t) : target_(t) { + RegisterOptions("", &target_, &env_wrapper_type_info); +} + +EnvWrapper::EnvWrapper(std::unique_ptr&& t) : target_(std::move(t)) { + RegisterOptions("", &target_, &env_wrapper_type_info); +} + +EnvWrapper::EnvWrapper(const std::shared_ptr& t) : target_(t) { + RegisterOptions("", &target_, &env_wrapper_type_info); +} + +EnvWrapper::~EnvWrapper() {} + +Status EnvWrapper::PrepareOptions(const ConfigOptions& options) { + target_.Prepare(); + return Env::PrepareOptions(options); +} + +std::string EnvWrapper::SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const { + auto parent = Env::SerializeOptions(config_options, ""); + if (config_options.IsShallow() || target_.env == nullptr || + target_.env == Env::Default()) { + return parent; + } else { + std::string result = header; + if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) { + result.append(OptionTypeInfo::kIdPropName()).append("="); + } + result.append(parent); + if (!EndsWith(result, config_options.delimiter)) { + result.append(config_options.delimiter); + } + result.append("target=").append(target_.env->ToString(config_options)); + return result; + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/composite_env_wrapper.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/composite_env_wrapper.h new file mode 100644 index 0000000000..003c50658b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/composite_env_wrapper.h @@ -0,0 +1,378 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
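+//
+// Usage sketch (illustrative, not part of the vendored source): a custom
+// FileSystem can be combined with the default Env's threading and clock by
+// building a composite Env, e.g. via the NewCompositeEnv() helper defined in
+// composite_env.cc. MyFileSystem below is a hypothetical FileSystem subclass:
+//
+//   std::shared_ptr<FileSystem> fs = std::make_shared<MyFileSystem>();
+//   std::unique_ptr<Env> env = NewCompositeEnv(fs);
+//   Options options;
+//   options.env = env.get();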
+ +#pragma once + +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef DeleteFile +#undef GetCurrentTime +#undef LoadLibrary +#endif + +namespace ROCKSDB_NAMESPACE { + +class CompositeEnv : public Env { + public: + // Initialize a CompositeEnvWrapper that delegates all thread/time related + // calls to env, and all file operations to fs + explicit CompositeEnv(const std::shared_ptr& fs, + const std::shared_ptr& clock) + : Env(fs, clock) {} + + Status RegisterDbPaths(const std::vector& paths) override { + return file_system_->RegisterDbPaths(paths); + } + Status UnregisterDbPaths(const std::vector& paths) override { + return file_system_->UnregisterDbPaths(paths); + } + + // The following text is boilerplate that forwards all methods to target() + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) override; + + Status NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) override; + + Status NewWritableFile(const std::string& f, std::unique_ptr* r, + const EnvOptions& options) override; + + Status ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* r, + const EnvOptions& options) override; + + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + Status NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result) override { + return file_system_->NewMemoryMappedFileBuffer(fname, result); + } + + Status NewDirectory(const std::string& name, + std::unique_ptr* result) override; + + Status FileExists(const std::string& f) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->FileExists(f, io_opts, &dbg); + } + Status GetChildren(const std::string& dir, + std::vector* r) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetChildren(dir, io_opts, r, &dbg); + } + Status GetChildrenFileAttributes( + const std::string& dir, std::vector* result) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetChildrenFileAttributes(dir, io_opts, result, &dbg); + } + Status DeleteFile(const std::string& f) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->DeleteFile(f, io_opts, &dbg); + } + Status Truncate(const std::string& fname, size_t size) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->Truncate(fname, size, io_opts, &dbg); + } + Status CreateDir(const std::string& d) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->CreateDir(d, io_opts, &dbg); + } + Status CreateDirIfMissing(const std::string& d) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->CreateDirIfMissing(d, io_opts, &dbg); + } + Status DeleteDir(const std::string& d) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->DeleteDir(d, io_opts, &dbg); + } + Status GetFileSize(const std::string& f, uint64_t* s) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetFileSize(f, io_opts, s, &dbg); + } + + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetFileModificationTime(fname, 
io_opts, file_mtime, + &dbg); + } + + Status RenameFile(const std::string& s, const std::string& t) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->RenameFile(s, t, io_opts, &dbg); + } + + Status LinkFile(const std::string& s, const std::string& t) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->LinkFile(s, t, io_opts, &dbg); + } + + Status NumFileLinks(const std::string& fname, uint64_t* count) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->NumFileLinks(fname, io_opts, count, &dbg); + } + + Status AreFilesSame(const std::string& first, const std::string& second, + bool* res) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->AreFilesSame(first, second, io_opts, res, &dbg); + } + + Status LockFile(const std::string& f, FileLock** l) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->LockFile(f, io_opts, l, &dbg); + } + + Status UnlockFile(FileLock* l) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->UnlockFile(l, io_opts, &dbg); + } + + Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetAbsolutePath(db_path, io_opts, output_path, &dbg); + } + + Status NewLogger(const std::string& fname, + std::shared_ptr* result) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->NewLogger(fname, io_opts, result, &dbg); + } + + Status IsDirectory(const std::string& path, bool* is_dir) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->IsDirectory(path, io_opts, is_dir, &dbg); + } + + Status GetTestDirectory(std::string* path) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetTestDirectory(io_opts, path, &dbg); + } + + EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override { + return file_system_->OptimizeForLogRead(FileOptions(env_options)); + } + + EnvOptions OptimizeForManifestRead( + const EnvOptions& env_options) const override { + return file_system_->OptimizeForManifestRead(FileOptions(env_options)); + } + + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const override { + return file_system_->OptimizeForLogWrite(FileOptions(env_options), + db_options); + } + + EnvOptions OptimizeForManifestWrite( + const EnvOptions& env_options) const override { + return file_system_->OptimizeForManifestWrite(FileOptions(env_options)); + } + + EnvOptions OptimizeForCompactionTableWrite( + const EnvOptions& env_options, + const ImmutableDBOptions& immutable_ops) const override { + return file_system_->OptimizeForCompactionTableWrite( + FileOptions(env_options), immutable_ops); + } + EnvOptions OptimizeForCompactionTableRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return file_system_->OptimizeForCompactionTableRead( + FileOptions(env_options), db_options); + } + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return file_system_->OptimizeForBlobFileRead(FileOptions(env_options), + db_options); + } + // This seems to clash with a macro on Windows, so #undef it here +#ifdef GetFreeSpace +#undef GetFreeSpace +#endif + Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { + IOOptions io_opts; + IODebugContext dbg; + return 
file_system_->GetFreeSpace(path, io_opts, diskfree, &dbg); + } + uint64_t NowMicros() override { return system_clock_->NowMicros(); } + uint64_t NowNanos() override { return system_clock_->NowNanos(); } + + uint64_t NowCPUNanos() override { return system_clock_->CPUNanos(); } + + void SleepForMicroseconds(int micros) override { + system_clock_->SleepForMicroseconds(micros); + } + + Status GetCurrentTime(int64_t* unix_time) override { + return system_clock_->GetCurrentTime(unix_time); + } + std::string TimeToString(uint64_t time) override { + return system_clock_->TimeToString(time); + } +}; + +class CompositeEnvWrapper : public CompositeEnv { + public: + // Initialize a CompositeEnvWrapper that delegates all thread/time related + // calls to env, and all file operations to fs + explicit CompositeEnvWrapper(Env* env) + : CompositeEnvWrapper(env, env->GetFileSystem(), env->GetSystemClock()) {} + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& fs) + : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {} + + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& sc) + : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {} + + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& fs, + const std::shared_ptr& sc); + + explicit CompositeEnvWrapper(const std::shared_ptr& env, + const std::shared_ptr& fs) + : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {} + + explicit CompositeEnvWrapper(const std::shared_ptr& env, + const std::shared_ptr& sc) + : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {} + + explicit CompositeEnvWrapper(const std::shared_ptr& env, + const std::shared_ptr& fs, + const std::shared_ptr& sc); + + static const char* kClassName() { return "CompositeEnv"; } + const char* Name() const override { return kClassName(); } + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return CompositeEnv::IsInstanceOf(name); + } + } + const Customizable* Inner() const override { return target_.env; } + + Status PrepareOptions(const ConfigOptions& options) override; + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; + + // Return the target to which this Env forwards all calls + Env* env_target() const { return target_.env; } + +#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) + Status LoadLibrary(const std::string& lib_name, + const std::string& search_path, + std::shared_ptr* result) override { + return target_.env->LoadLibrary(lib_name, search_path, result); + } +#endif + + void Schedule(void (*f)(void* arg), void* a, Priority pri, + void* tag = nullptr, void (*u)(void* arg) = nullptr) override { + return target_.env->Schedule(f, a, pri, tag, u); + } + + int UnSchedule(void* tag, Priority pri) override { + return target_.env->UnSchedule(tag, pri); + } + + void StartThread(void (*f)(void*), void* a) override { + return target_.env->StartThread(f, a); + } + void WaitForJoin() override { return target_.env->WaitForJoin(); } + unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { + return target_.env->GetThreadPoolQueueLen(pri); + } + + int ReserveThreads(int threads_to_be_reserved, Priority pri) override { + return target_.env->ReserveThreads(threads_to_be_reserved, pri); + } + + int ReleaseThreads(int threads_to_be_released, Priority pri) override { + return target_.env->ReleaseThreads(threads_to_be_released, pri); + } + + Status GetHostName(char* name, uint64_t len) override { + return 
target_.env->GetHostName(name, len);
+  }
+  void SetBackgroundThreads(int num, Priority pri) override {
+    return target_.env->SetBackgroundThreads(num, pri);
+  }
+  int GetBackgroundThreads(Priority pri) override {
+    return target_.env->GetBackgroundThreads(pri);
+  }
+
+  Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override {
+    return target_.env->SetAllowNonOwnerAccess(allow_non_owner_access);
+  }
+
+  void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
+    return target_.env->IncBackgroundThreadsIfNeeded(num, pri);
+  }
+
+  void LowerThreadPoolIOPriority(Priority pool) override {
+    target_.env->LowerThreadPoolIOPriority(pool);
+  }
+
+  void LowerThreadPoolCPUPriority(Priority pool) override {
+    target_.env->LowerThreadPoolCPUPriority(pool);
+  }
+
+  Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override {
+    return target_.env->LowerThreadPoolCPUPriority(pool, pri);
+  }
+
+  Status GetThreadList(std::vector<ThreadStatus>* thread_list) override {
+    return target_.env->GetThreadList(thread_list);
+  }
+
+  ThreadStatusUpdater* GetThreadStatusUpdater() const override {
+    return target_.env->GetThreadStatusUpdater();
+  }
+
+  uint64_t GetThreadID() const override { return target_.env->GetThreadID(); }
+
+  std::string GenerateUniqueId() override {
+    return target_.env->GenerateUniqueId();
+  }
+
+ private:
+  EnvWrapper::Target target_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/emulated_clock.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/emulated_clock.h
new file mode 100644
index 0000000000..6227376350
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/emulated_clock.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <string>
+
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A SystemClock that can "mock" sleep and counts its operations.
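+//
+// Illustrative use (an assumption for exposition, not part of the vendored
+// source): with mock sleep enabled, sleeping advances the reported time
+// without blocking the calling thread:
+//
+//   auto clock =
+//       std::make_shared<EmulatedSystemClock>(SystemClock::Default());
+//   clock->SetMockSleep();
+//   int64_t t0 = 0, t1 = 0;
+//   clock->GetCurrentTime(&t0);
+//   clock->MockSleepForSeconds(100);
+//   clock->GetCurrentTime(&t1);
+//   assert(t1 - t0 == 100);  // no real time was spent sleeping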
+class EmulatedSystemClock : public SystemClockWrapper {
+ private:
+  // Something to return when mocking current time
+  const int64_t maybe_starting_time_;
+  std::atomic<int> sleep_counter_{0};
+  std::atomic<int> cpu_counter_{0};
+  std::atomic<int64_t> addon_microseconds_{0};
+  // Do not modify in the env of a running DB (could cause deadlock)
+  std::atomic<bool> time_elapse_only_sleep_;
+  bool no_slowdown_;
+
+ public:
+  explicit EmulatedSystemClock(const std::shared_ptr<SystemClock>& base,
+                               bool time_elapse_only_sleep = false);
+
+  static const char* kClassName() { return "TimeEmulatedSystemClock"; }
+  const char* Name() const override { return kClassName(); }
+
+  virtual void SleepForMicroseconds(int micros) override {
+    sleep_counter_++;
+    if (no_slowdown_ || time_elapse_only_sleep_) {
+      addon_microseconds_.fetch_add(micros);
+    }
+    if (!no_slowdown_) {
+      SystemClockWrapper::SleepForMicroseconds(micros);
+    }
+  }
+
+  void MockSleepForMicroseconds(int64_t micros) {
+    sleep_counter_++;
+    assert(no_slowdown_);
+    addon_microseconds_.fetch_add(micros);
+  }
+
+  void MockSleepForSeconds(int64_t seconds) {
+    sleep_counter_++;
+    assert(no_slowdown_);
+    addon_microseconds_.fetch_add(seconds * 1000000);
+  }
+
+  void SetTimeElapseOnlySleep(bool enabled) {
+    // We cannot set these before destroying the last DB because they might
+    // cause a deadlock or similar without the appropriate options set in
+    // the DB.
+    time_elapse_only_sleep_ = enabled;
+    no_slowdown_ = enabled;
+  }
+
+  bool IsTimeElapseOnlySleep() const { return time_elapse_only_sleep_.load(); }
+  void SetMockSleep(bool enabled = true) { no_slowdown_ = enabled; }
+  bool IsMockSleepEnabled() const { return no_slowdown_; }
+
+  int GetSleepCounter() const { return sleep_counter_.load(); }
+
+  virtual Status GetCurrentTime(int64_t* unix_time) override {
+    Status s;
+    if (time_elapse_only_sleep_) {
+      *unix_time = maybe_starting_time_;
+    } else {
+      s = SystemClockWrapper::GetCurrentTime(unix_time);
+    }
+    if (s.ok()) {
+      // mock microseconds elapsed to seconds of time
+      *unix_time += addon_microseconds_.load() / 1000000;
+    }
+    return s;
+  }
+
+  virtual uint64_t CPUNanos() override {
+    cpu_counter_++;
+    return SystemClockWrapper::CPUNanos();
+  }
+
+  virtual uint64_t CPUMicros() override {
+    cpu_counter_++;
+    return SystemClockWrapper::CPUMicros();
+  }
+
+  virtual uint64_t NowNanos() override {
+    return (time_elapse_only_sleep_ ? 0 : SystemClockWrapper::NowNanos()) +
+           addon_microseconds_.load() * 1000;
+  }
+
+  virtual uint64_t NowMicros() override {
+    return (time_elapse_only_sleep_ ? 0 : SystemClockWrapper::NowMicros()) +
+           addon_microseconds_.load();
+  }
+
+  int GetCpuCounter() const { return cpu_counter_.load(); }
+
+  void ResetCounters() {
+    cpu_counter_.store(0);
+    sleep_counter_.store(0);
+  }
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env.cc
new file mode 100644
index 0000000000..2137738c7b
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env.cc
@@ -0,0 +1,1233 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/env.h"
+
+#include <thread>
+
+#include "env/composite_env_wrapper.h"
+#include "env/emulated_clock.h"
+#include "env/mock_env.h"
+#include "env/unique_id_gen.h"
+#include "logging/env_logger.h"
+#include "memory/arena.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/options.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+static int RegisterBuiltinEnvs(ObjectLibrary& library,
+                               const std::string& /*arg*/) {
+  library.AddFactory<Env>(MockEnv::kClassName(),
+                          [](const std::string& /*uri*/,
+                             std::unique_ptr<Env>* guard,
+                             std::string* /* errmsg */) {
+                            guard->reset(MockEnv::Create(Env::Default()));
+                            return guard->get();
+                          });
+  library.AddFactory<Env>(
+      CompositeEnvWrapper::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<Env>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new CompositeEnvWrapper(Env::Default()));
+        return guard->get();
+      });
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+
+static void RegisterSystemEnvs() {
+  static std::once_flag loaded;
+  std::call_once(loaded, [&]() {
+    RegisterBuiltinEnvs(*(ObjectLibrary::Default().get()), "");
+  });
+}
+
+class LegacySystemClock : public SystemClock {
+ private:
+  Env* env_;
+
+ public:
+  explicit LegacySystemClock(Env* env) : env_(env) {}
+  const char* Name() const override { return "LegacySystemClock"; }
+
+  // Returns the number of micro-seconds since some fixed point in time.
+  // It is often used as system time such as in GenericRateLimiter
+  // and other places so a port needs to return system time in order to work.
+  uint64_t NowMicros() override { return env_->NowMicros(); }
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros.
+  // In platform-specific implementations, NowNanos() should return time points
+  // that are MONOTONIC.
+  uint64_t NowNanos() override { return env_->NowNanos(); }
+
+  uint64_t CPUMicros() override { return CPUNanos() / 1000; }
+  uint64_t CPUNanos() override { return env_->NowCPUNanos(); }
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  void SleepForMicroseconds(int micros) override {
+    env_->SleepForMicroseconds(micros);
+  }
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  // Only overwrites *unix_time on success.
+  Status GetCurrentTime(int64_t* unix_time) override {
+    return env_->GetCurrentTime(unix_time);
+  }
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  std::string TimeToString(uint64_t time) override {
+    return env_->TimeToString(time);
+  }
+
+  std::string SerializeOptions(const ConfigOptions& /*config_options*/,
+                               const std::string& /*prefix*/) const override {
+    // We do not want the LegacySystemClock to appear in the serialized output.
+    // This clock is an internal class for those who do not implement one and
+    // would be part of the Env. As such, do not serialize it here.
+ return ""; + } +}; + +class LegacySequentialFileWrapper : public FSSequentialFile { + public: + explicit LegacySequentialFileWrapper( + std::unique_ptr&& _target) + : target_(std::move(_target)) {} + + IOStatus Read(size_t n, const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Read(n, result, scratch)); + } + IOStatus Skip(uint64_t n) override { + return status_to_io_status(target_->Skip(n)); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + IOStatus PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + return status_to_io_status( + target_->PositionedRead(offset, n, result, scratch)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyRandomAccessFileWrapper : public FSRandomAccessFile { + public: + explicit LegacyRandomAccessFileWrapper( + std::unique_ptr&& target) + : target_(std::move(target)) {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { + return status_to_io_status(target_->Read(offset, n, result, scratch)); + } + + IOStatus MultiRead(FSReadRequest* fs_reqs, size_t num_reqs, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + std::vector reqs; + Status status; + + reqs.reserve(num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + ReadRequest req; + + req.offset = fs_reqs[i].offset; + req.len = fs_reqs[i].len; + req.scratch = fs_reqs[i].scratch; + req.status = Status::OK(); + + reqs.emplace_back(req); + } + status = target_->MultiRead(reqs.data(), num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + fs_reqs[i].result = reqs[i].result; + fs_reqs[i].status = status_to_io_status(std::move(reqs[i].status)); + } + return status_to_io_status(std::move(status)); + } + + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Prefetch(offset, n)); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { + target_->Hint((RandomAccessFile::AccessPattern)pattern); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyRandomRWFileWrapper : public FSRandomRWFile { + public: + explicit LegacyRandomRWFileWrapper(std::unique_ptr&& target) + : target_(std::move(target)) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Write(offset, data)); + } + IOStatus Read(uint64_t offset, size_t n, const 
IOOptions& /*options*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { + return status_to_io_status(target_->Read(offset, n, result, scratch)); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Flush()); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Sync()); + } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + + private: + std::unique_ptr target_; +}; + +class LegacyWritableFileWrapper : public FSWritableFile { + public: + explicit LegacyWritableFileWrapper(std::unique_ptr&& _target) + : target_(std::move(_target)) {} + + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Append(data)); + } + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Append(data)); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->PositionedAppend(data, offset)); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->PositionedAppend(data, offset)); + } + IOStatus Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Truncate(size)); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Flush()); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Sync()); + } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return target_->GetFileSize(); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + IOStatus InvalidateCache(size_t offset, size_t 
length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->RangeSync(offset, nbytes)); + } + + void PrepareWrite(size_t offset, size_t len, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + target_->PrepareWrite(offset, len); + } + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Allocate(offset, len)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyDirectoryWrapper : public FSDirectory { + public: + explicit LegacyDirectoryWrapper(std::unique_ptr&& target) + : target_(std::move(target)) {} + + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr target_; +}; + +class LegacyFileSystemWrapper : public FileSystem { + public: + // Initialize an EnvWrapper that delegates all calls to *t + explicit LegacyFileSystemWrapper(Env* t) : target_(t) {} + ~LegacyFileSystemWrapper() override {} + + static const char* kClassName() { return "LegacyFileSystem"; } + const char* Name() const override { return kClassName(); } + + // Return the target to which this Env forwards all calls + Env* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewSequentialFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacySequentialFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewRandomAccessFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyRandomAccessFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewWritableFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->ReopenWritableFile(fname, &file, file_opts); + if (s.ok()) { + result->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->ReuseWritableFile(fname, old_fname, &file, file_opts); + if 
(s.ok()) { + r->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewRandomRWFile(fname, &file, file_opts); + if (s.ok()) { + result->reset(new LegacyRandomRWFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result) override { + return status_to_io_status( + target_->NewMemoryMappedFileBuffer(fname, result)); + } + IOStatus NewDirectory(const std::string& name, const IOOptions& /*io_opts*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr dir; + Status s = target_->NewDirectory(name, &dir); + if (s.ok()) { + result->reset(new LegacyDirectoryWrapper(std::move(dir))); + } + return status_to_io_status(std::move(s)); + } + IOStatus FileExists(const std::string& f, const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->FileExists(f)); + } + IOStatus GetChildren(const std::string& dir, const IOOptions& /*io_opts*/, + std::vector* r, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetChildren(dir, r)); + } + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& /*options*/, + std::vector* result, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetChildrenFileAttributes(dir, result)); + } + IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->DeleteFile(f)); + } + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Truncate(fname, size)); + } + IOStatus CreateDir(const std::string& d, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->CreateDir(d)); + } + IOStatus CreateDirIfMissing(const std::string& d, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->CreateDirIfMissing(d)); + } + IOStatus DeleteDir(const std::string& d, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->DeleteDir(d)); + } + IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, + uint64_t* s, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetFileSize(f, s)); + } + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* file_mtime, + IODebugContext* /*dbg*/) override { + return status_to_io_status( + target_->GetFileModificationTime(fname, file_mtime)); + } + + IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetAbsolutePath(db_path, output_path)); + } + + IOStatus RenameFile(const std::string& s, const std::string& t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->RenameFile(s, t)); + } + + IOStatus LinkFile(const std::string& s, const std::string& t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return 
status_to_io_status(target_->LinkFile(s, t)); + } + + IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/, + uint64_t* count, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->NumFileLinks(fname, count)); + } + + IOStatus AreFilesSame(const std::string& first, const std::string& second, + const IOOptions& /*options*/, bool* res, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->AreFilesSame(first, second, res)); + } + + IOStatus LockFile(const std::string& f, const IOOptions& /*options*/, + FileLock** l, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->LockFile(f, l)); + } + + IOStatus UnlockFile(FileLock* l, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->UnlockFile(l)); + } + + IOStatus GetTestDirectory(const IOOptions& /*options*/, std::string* path, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetTestDirectory(path)); + } + IOStatus NewLogger(const std::string& fname, const IOOptions& /*options*/, + std::shared_ptr* result, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->NewLogger(fname, result)); + } + + void SanitizeFileOptions(FileOptions* opts) const override { + target_->SanitizeEnvOptions(opts); + } + + FileOptions OptimizeForLogRead( + const FileOptions& file_options) const override { + return target_->OptimizeForLogRead(file_options); + } + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestRead(file_options); + } + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override { + return target_->OptimizeForLogWrite(file_options, db_options); + } + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestWrite(file_options); + } + FileOptions OptimizeForCompactionTableWrite( + const FileOptions& file_options, + const ImmutableDBOptions& immutable_ops) const override { + return target_->OptimizeForCompactionTableWrite(file_options, + immutable_ops); + } + FileOptions OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForCompactionTableRead(file_options, db_options); + } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } + +#ifdef GetFreeSpace +#undef GetFreeSpace +#endif + IOStatus GetFreeSpace(const std::string& path, const IOOptions& /*options*/, + uint64_t* diskfree, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetFreeSpace(path, diskfree)); + } + IOStatus IsDirectory(const std::string& path, const IOOptions& /*options*/, + bool* is_dir, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->IsDirectory(path, is_dir)); + } + + std::string SerializeOptions(const ConfigOptions& /*config_options*/, + const std::string& /*prefix*/) const override { + // We do not want the LegacyFileSystem to appear in the serialized output. + // This clock is an internal class for those who do not implement one and + // would be part of the Env. As such, do not serialize it here. 
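The Legacy*Wrapper classes in this file are a mechanical adapter layer: each FileSystem-level call forwards to the wrapped Env-era object and lifts its Status result into an IOStatus via status_to_io_status. A minimal, self-contained sketch of the same shape (the Status/IOStatus types and the LiftStatus helper here are illustrative stand-ins, not RocksDB's):

#include <memory>
#include <string>
#include <utility>

// Illustrative stand-ins for the two status types.
struct Status { bool ok = true; std::string msg; };
struct IOStatus { bool ok = true; std::string msg; };

// The lifting helper: wraps a legacy Status into the richer IO type.
IOStatus LiftStatus(Status&& s) { return IOStatus{s.ok, std::move(s.msg)}; }

struct LegacyFile {            // old, Status-based interface
  Status Flush() { return {}; }
};

struct NewFile {               // new, IOStatus-based interface
  explicit NewFile(std::unique_ptr<LegacyFile> t) : target_(std::move(t)) {}
  // Pure forwarding plus status conversion, exactly the wrapper pattern above.
  IOStatus Flush() { return LiftStatus(target_->Flush()); }
 private:
  std::unique_ptr<LegacyFile> target_;
};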
+ return ""; + } + private: + Env* target_; +}; +} // end anonymous namespace + +Env::Env() : thread_status_updater_(nullptr) { + file_system_ = std::make_shared(this); + system_clock_ = std::make_shared(this); +} + +Env::Env(const std::shared_ptr& fs) + : thread_status_updater_(nullptr), file_system_(fs) { + system_clock_ = std::make_shared(this); +} + +Env::Env(const std::shared_ptr& fs, + const std::shared_ptr& clock) + : thread_status_updater_(nullptr), file_system_(fs), system_clock_(clock) {} + +Env::~Env() {} + +Status Env::NewLogger(const std::string& fname, + std::shared_ptr* result) { + return NewEnvLogger(fname, this, result); +} + +Status Env::CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result) { + Env* base = Env::Default(); + if (value.empty() || base->IsInstanceOf(value)) { + *result = base; + return Status::OK(); + } else { + RegisterSystemEnvs(); + Env* env = *result; + Status s = LoadStaticObject(config_options, value, &env); + if (s.ok()) { + *result = env; + } + return s; + } +} + +Status Env::CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result, + std::shared_ptr* guard) { + assert(result); + assert(guard != nullptr); + std::unique_ptr uniq; + + Env* env = *result; + std::string id; + std::unordered_map opt_map; + + Status status = + Customizable::GetOptionsMap(config_options, env, value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } + Env* base = Env::Default(); + if (id.empty() || base->IsInstanceOf(id)) { + env = base; + status = Status::OK(); + } else { + RegisterSystemEnvs(); + // First, try to load the Env as a unique object. + status = config_options.registry->NewObject(id, &env, &uniq); + } + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + status = Status::OK(); + } else if (status.ok()) { + status = Customizable::ConfigureNewObject(config_options, env, opt_map); + } + if (status.ok()) { + guard->reset(uniq.release()); + *result = env; + } + return status; +} + +Status Env::CreateFromUri(const ConfigOptions& config_options, + const std::string& env_uri, const std::string& fs_uri, + Env** result, std::shared_ptr* guard) { + *result = config_options.env; + if (env_uri.empty() && fs_uri.empty()) { + // Neither specified. Use the default + guard->reset(); + return Status::OK(); + } else if (!env_uri.empty() && !fs_uri.empty()) { + // Both specified. Cannot choose. Return Invalid + return Status::InvalidArgument("cannot specify both fs_uri and env_uri"); + } else if (fs_uri.empty()) { // Only have an ENV URI. 
Create an Env from it + return CreateFromString(config_options, env_uri, result, guard); + } else { + std::shared_ptr fs; + Status s = FileSystem::CreateFromString(config_options, fs_uri, &fs); + if (s.ok()) { + guard->reset(new CompositeEnvWrapper(*result, fs)); + *result = guard->get(); + } + return s; + } +} + +std::string Env::PriorityToString(Env::Priority priority) { + switch (priority) { + case Env::Priority::BOTTOM: + return "Bottom"; + case Env::Priority::LOW: + return "Low"; + case Env::Priority::HIGH: + return "High"; + case Env::Priority::USER: + return "User"; + case Env::Priority::TOTAL: + assert(false); + } + return "Invalid"; +} + +uint64_t Env::GetThreadID() const { + std::hash hasher; + return hasher(std::this_thread::get_id()); +} + +Status Env::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* result, + const EnvOptions& options) { + Status s = RenameFile(old_fname, fname); + if (!s.ok()) { + return s; + } + return NewWritableFile(fname, result, options); +} + +Status Env::GetChildrenFileAttributes(const std::string& dir, + std::vector* result) { + assert(result != nullptr); + std::vector child_fnames; + Status s = GetChildren(dir, &child_fnames); + if (!s.ok()) { + return s; + } + result->resize(child_fnames.size()); + size_t result_size = 0; + for (size_t i = 0; i < child_fnames.size(); ++i) { + const std::string path = dir + "/" + child_fnames[i]; + if (!(s = GetFileSize(path, &(*result)[result_size].size_bytes)).ok()) { + if (FileExists(path).IsNotFound()) { + // The file may have been deleted since we listed the directory + continue; + } + return s; + } + (*result)[result_size].name = std::move(child_fnames[i]); + result_size++; + } + result->resize(result_size); + return Status::OK(); +} + +Status Env::GetHostNameString(std::string* result) { + std::array hostname_buf{}; + Status s = GetHostName(hostname_buf.data(), hostname_buf.size()); + if (s.ok()) { + hostname_buf[hostname_buf.size() - 1] = '\0'; + result->assign(hostname_buf.data()); + } + return s; +} + +std::string Env::GenerateUniqueId() { + std::string result; + bool success = port::GenerateRfcUuid(&result); + if (!success) { + // Fall back on our own way of generating a unique ID and adapt it to + // RFC 4122 variant 1 version 4 (a random ID). 
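Env::GetChildrenFileAttributes above composes GetChildren with a per-file GetFileSize, and treats IsNotFound on an individual child as a benign race: the file was deleted between the listing and the stat. The same tolerance, sketched with std::filesystem rather than the Env API:

#include <cstdint>
#include <filesystem>
#include <string>
#include <system_error>
#include <vector>

struct FileAttrs { std::string name; std::uintmax_t size_bytes; };

// List a directory, skipping entries that vanish between listing and stat.
std::vector<FileAttrs> ListTolerant(const std::filesystem::path& dir) {
  std::vector<FileAttrs> out;
  std::error_code ec;
  for (const auto& entry : std::filesystem::directory_iterator(dir, ec)) {
    std::error_code stat_ec;
    auto sz = std::filesystem::file_size(entry.path(), stat_ec);
    if (stat_ec) continue;  // deleted (or unreadable) since listing: skip, don't fail
    out.push_back({entry.path().filename().string(), sz});
  }
  return out;
}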
+ // https://en.wikipedia.org/wiki/Universally_unique_identifier + // We already tried GenerateRfcUuid so no need to try it again in + // GenerateRawUniqueId + constexpr bool exclude_port_uuid = true; + uint64_t upper, lower; + GenerateRawUniqueId(&upper, &lower, exclude_port_uuid); + + // Set 4-bit version to 4 + upper = (upper & (~uint64_t{0xf000})) | 0x4000; + // Set unary-encoded variant to 1 (0b10) + lower = (lower & (~(uint64_t{3} << 62))) | (uint64_t{2} << 62); + + // Use 36 character format of RFC 4122 + result.resize(36U); + char* buf = &result[0]; + PutBaseChars<16>(&buf, 8, upper >> 32, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 4, upper >> 16, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 4, upper, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 4, lower >> 48, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 12, lower, /*!uppercase*/ false); + assert(buf == &result[36]); + + // Verify variant 1 version 4 + assert(result[14] == '4'); + assert(result[19] == '8' || result[19] == '9' || result[19] == 'a' || + result[19] == 'b'); + } + return result; +} + +SequentialFile::~SequentialFile() {} + +RandomAccessFile::~RandomAccessFile() {} + +WritableFile::~WritableFile() {} + +MemoryMappedFileBuffer::~MemoryMappedFileBuffer() {} + +Logger::~Logger() {} + +Status Logger::Close() { + if (!closed_) { + closed_ = true; + return CloseImpl(); + } else { + return Status::OK(); + } +} + +Status Logger::CloseImpl() { return Status::NotSupported(); } + +FileLock::~FileLock() {} + +void LogFlush(Logger* info_log) { + if (info_log) { + info_log->Flush(); + } +} + +static void Logv(Logger* info_log, const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) { + info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap); + } +} + +void Log(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Logv(info_log, format, ap); + va_end(ap); +} + +void Logger::Logv(const InfoLogLevel log_level, const char* format, + va_list ap) { + static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN", "ERROR", + "FATAL"}; + if (log_level < log_level_) { + return; + } + + if (log_level == InfoLogLevel::INFO_LEVEL) { + // Doesn't print log level if it is INFO level. + // This is to avoid unexpected performance regression after we add + // the feature of log level. All the logs before we add the feature + // are INFO level. We don't want to add extra costs to those existing + // logging. + Logv(format, ap); + } else if (log_level == InfoLogLevel::HEADER_LEVEL) { + LogHeader(format, ap); + } else { + char new_format[500]; + snprintf(new_format, sizeof(new_format) - 1, "[%s] %s", + kInfoLogLevelNames[log_level], format); + Logv(new_format, ap); + } + + if (log_level >= InfoLogLevel::WARN_LEVEL && + log_level != InfoLogLevel::HEADER_LEVEL) { + // Log messages with severity of warning or higher should be rare and are + // sometimes followed by an unclean crash. We want to be sure important + // messages are not lost in an application buffer when that happens. 
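That flush-on-severity policy is easy to isolate. A minimal leveled logger mirroring the behavior described above (INFO lines carry no level tag, and anything at WARN or above is flushed immediately so the message survives a crash that follows it); the Level enum and LogLine helper are illustrative, not RocksDB's API:

#include <cstdio>

enum class Level { kDebug, kInfo, kWarn, kError, kFatal };

void LogLine(Level lvl, const char* msg) {
  static const char* kNames[] = {"DEBUG", "INFO", "WARN", "ERROR", "FATAL"};
  if (lvl == Level::kInfo) {
    std::fprintf(stderr, "%s\n", msg);   // INFO: no tag, keeps the hot path cheap
  } else {
    std::fprintf(stderr, "[%s] %s\n", kNames[static_cast<int>(lvl)], msg);
  }
  if (lvl >= Level::kWarn) std::fflush(stderr);  // don't lose urgent messages
}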
+ Flush(); + } +} + +static void Logv(const InfoLogLevel log_level, Logger* info_log, + const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= log_level) { + if (log_level == InfoLogLevel::HEADER_LEVEL) { + info_log->LogHeader(format, ap); + } else { + info_log->Logv(log_level, format, ap); + } + } +} + +void Log(const InfoLogLevel log_level, Logger* info_log, const char* format, + ...) { + va_list ap; + va_start(ap, format); + Logv(log_level, info_log, format, ap); + va_end(ap); +} + +static void Headerv(Logger* info_log, const char* format, va_list ap) { + if (info_log) { + info_log->LogHeader(format, ap); + } +} + +void Header(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Headerv(info_log, format, ap); + va_end(ap); +} + +static void Debugv(Logger* info_log, const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) { + info_log->Logv(InfoLogLevel::DEBUG_LEVEL, format, ap); + } +} + +void Debug(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Debugv(info_log, format, ap); + va_end(ap); +} + +static void Infov(Logger* info_log, const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) { + info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap); + } +} + +void Info(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Infov(info_log, format, ap); + va_end(ap); +} + +static void Warnv(Logger* info_log, const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::WARN_LEVEL) { + info_log->Logv(InfoLogLevel::WARN_LEVEL, format, ap); + } +} + +void Warn(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Warnv(info_log, format, ap); + va_end(ap); +} + +static void Errorv(Logger* info_log, const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::ERROR_LEVEL) { + info_log->Logv(InfoLogLevel::ERROR_LEVEL, format, ap); + } +} + +void Error(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Errorv(info_log, format, ap); + va_end(ap); +} + +static void Fatalv(Logger* info_log, const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::FATAL_LEVEL) { + info_log->Logv(InfoLogLevel::FATAL_LEVEL, format, ap); + } +} + +void Fatal(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Fatalv(info_log, format, ap); + va_end(ap); +} + +void LogFlush(const std::shared_ptr& info_log) { + LogFlush(info_log.get()); +} + +void Log(const InfoLogLevel log_level, const std::shared_ptr& info_log, + const char* format, ...) { + va_list ap; + va_start(ap, format); + Logv(log_level, info_log.get(), format, ap); + va_end(ap); +} + +void Header(const std::shared_ptr& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Headerv(info_log.get(), format, ap); + va_end(ap); +} + +void Debug(const std::shared_ptr& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Debugv(info_log.get(), format, ap); + va_end(ap); +} + +void Info(const std::shared_ptr& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Infov(info_log.get(), format, ap); + va_end(ap); +} + +void Warn(const std::shared_ptr& info_log, const char* format, ...) 
{ + va_list ap; + va_start(ap, format); + Warnv(info_log.get(), format, ap); + va_end(ap); +} + +void Error(const std::shared_ptr& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Errorv(info_log.get(), format, ap); + va_end(ap); +} + +void Fatal(const std::shared_ptr& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Fatalv(info_log.get(), format, ap); + va_end(ap); +} + +void Log(const std::shared_ptr& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Logv(info_log.get(), format, ap); + va_end(ap); +} + +Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, + bool should_sync) { + const auto& fs = env->GetFileSystem(); + return WriteStringToFile(fs.get(), data, fname, should_sync); +} + +Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { + const auto& fs = env->GetFileSystem(); + return ReadFileToString(fs.get(), fname, data); +} + +namespace { // anonymous namespace + +void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) { + env_options->use_mmap_reads = options.allow_mmap_reads; + env_options->use_mmap_writes = options.allow_mmap_writes; + env_options->use_direct_reads = options.use_direct_reads; + env_options->set_fd_cloexec = options.is_fd_close_on_exec; + env_options->bytes_per_sync = options.bytes_per_sync; + env_options->compaction_readahead_size = options.compaction_readahead_size; + env_options->random_access_max_buffer_size = + options.random_access_max_buffer_size; + env_options->rate_limiter = options.rate_limiter.get(); + env_options->writable_file_max_buffer_size = + options.writable_file_max_buffer_size; + env_options->allow_fallocate = options.allow_fallocate; + env_options->strict_bytes_per_sync = options.strict_bytes_per_sync; + options.env->SanitizeEnvOptions(env_options); +} + +} // namespace + +EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.bytes_per_sync = db_options.wal_bytes_per_sync; + optimized_env_options.writable_file_max_buffer_size = + db_options.writable_file_max_buffer_size; + return optimized_env_options; +} + +EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const { + return env_options; +} + +EnvOptions Env::OptimizeForLogRead(const EnvOptions& env_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_reads = false; + return optimized_env_options; +} + +EnvOptions Env::OptimizeForManifestRead(const EnvOptions& env_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_reads = false; + return optimized_env_options; +} + +EnvOptions Env::OptimizeForCompactionTableWrite( + const EnvOptions& env_options, const ImmutableDBOptions& db_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_writes = + db_options.use_direct_io_for_flush_and_compaction; + return optimized_env_options; +} + +EnvOptions Env::OptimizeForCompactionTableRead( + const EnvOptions& env_options, const ImmutableDBOptions& db_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_reads = db_options.use_direct_reads; + return optimized_env_options; +} +EnvOptions Env::OptimizeForBlobFileRead( + const EnvOptions& env_options, const ImmutableDBOptions& db_options) const { + EnvOptions 
optimized_env_options(env_options); + optimized_env_options.use_direct_reads = db_options.use_direct_reads; + return optimized_env_options; +} + +EnvOptions::EnvOptions(const DBOptions& options) { + AssignEnvOptions(this, options); +} + +EnvOptions::EnvOptions() { + DBOptions options; + AssignEnvOptions(this, options); +} + +Status NewEnvLogger(const std::string& fname, Env* env, + std::shared_ptr* result) { + FileOptions options; + // TODO: Tune the buffer size. + options.writable_file_max_buffer_size = 1024 * 1024; + std::unique_ptr writable_file; + const auto status = env->GetFileSystem()->NewWritableFile( + fname, options, &writable_file, nullptr); + if (!status.ok()) { + return status; + } + + *result = std::make_shared(std::move(writable_file), fname, + options, env); + return Status::OK(); +} + +const std::shared_ptr& Env::GetFileSystem() const { + return file_system_; +} + +const std::shared_ptr& Env::GetSystemClock() const { + return system_clock_; +} +namespace { +static std::unordered_map sc_wrapper_type_info = { + {"target", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)}, +}; + +} // namespace +SystemClockWrapper::SystemClockWrapper(const std::shared_ptr& t) + : target_(t) { + RegisterOptions("", &target_, &sc_wrapper_type_info); +} + +Status SystemClockWrapper::PrepareOptions(const ConfigOptions& options) { + if (target_ == nullptr) { + target_ = SystemClock::Default(); + } + return SystemClock::PrepareOptions(options); +} + +std::string SystemClockWrapper::SerializeOptions( + const ConfigOptions& config_options, const std::string& header) const { + auto parent = SystemClock::SerializeOptions(config_options, ""); + if (config_options.IsShallow() || target_ == nullptr || + target_->IsInstanceOf(SystemClock::kDefaultName())) { + return parent; + } else { + std::string result = header; + if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) { + result.append(OptionTypeInfo::kIdPropName()).append("="); + } + result.append(parent); + if (!EndsWith(result, config_options.delimiter)) { + result.append(config_options.delimiter); + } + result.append("target=").append(target_->ToString(config_options)); + return result; + } +} + +static int RegisterBuiltinSystemClocks(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + EmulatedSystemClock::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new EmulatedSystemClock(SystemClock::Default())); + return guard->get(); + }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} + +Status SystemClock::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + auto clock = SystemClock::Default(); + if (clock->IsInstanceOf(value)) { + *result = clock; + return Status::OK(); + } else { + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinSystemClocks(*(ObjectLibrary::Default().get()), ""); + }); + return LoadSharedObject(config_options, value, result); + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_chroot.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_chroot.cc new file mode 100644 index 0000000000..5ff32a7e44 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_chroot.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#if !defined(OS_WIN) + +#include "env/env_chroot.h" + +#include // errno +#include // realpath, free +#include // geteuid + +#include "env/composite_env_wrapper.h" +#include "env/fs_remap.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" // errnoStr + +namespace ROCKSDB_NAMESPACE { +namespace { +static std::unordered_map chroot_fs_type_info = { + {"chroot_dir", {0, OptionType::kString}}}; +} // namespace +ChrootFileSystem::ChrootFileSystem(const std::shared_ptr& base, + const std::string& chroot_dir) + : RemapFileSystem(base), chroot_dir_(chroot_dir) { + RegisterOptions("chroot_dir", &chroot_dir_, &chroot_fs_type_info); +} + +Status ChrootFileSystem::PrepareOptions(const ConfigOptions& options) { + Status s = FileSystemWrapper::PrepareOptions(options); + if (!s.ok()) { + return s; + } else if (chroot_dir_.empty()) { + s = Status::InvalidArgument("ChRootFileSystem requires a chroot dir"); + } else { + s = target_->FileExists(chroot_dir_, IOOptions(), nullptr); + } + if (s.ok()) { +#if defined(OS_AIX) + char resolvedName[PATH_MAX]; + char* real_chroot_dir = realpath(chroot_dir_.c_str(), resolvedName); +#else + char* real_chroot_dir = realpath(chroot_dir_.c_str(), nullptr); +#endif + // chroot_dir must exist so realpath() returns non-nullptr. + assert(real_chroot_dir != nullptr); + chroot_dir_ = real_chroot_dir; +#if !defined(OS_AIX) + free(real_chroot_dir); +#endif + } + return s; +} + +IOStatus ChrootFileSystem::GetTestDirectory(const IOOptions& options, + std::string* path, + IODebugContext* dbg) { + // Adapted from PosixEnv's implementation since it doesn't provide a way to + // create directory in the chroot. + char buf[256]; + snprintf(buf, sizeof(buf), "/rocksdbtest-%d", static_cast(geteuid())); + *path = buf; + + // Directory may already exist, so ignore return + return CreateDirIfMissing(*path, options, dbg); +} + +// Returns status and expanded absolute path including the chroot directory. +// Checks whether the provided path breaks out of the chroot. If it returns +// non-OK status, the returned path should not be used. +std::pair ChrootFileSystem::EncodePath( + const std::string& path) { + if (path.empty() || path[0] != '/') { + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; + } + std::pair res; + res.second = chroot_dir_ + path; +#if defined(OS_AIX) + char resolvedName[PATH_MAX]; + char* normalized_path = realpath(res.second.c_str(), resolvedName); +#else + char* normalized_path = realpath(res.second.c_str(), nullptr); +#endif + if (normalized_path == nullptr) { + res.first = IOStatus::NotFound(res.second, errnoStr(errno).c_str()); + } else if (strlen(normalized_path) < chroot_dir_.size() || + strncmp(normalized_path, chroot_dir_.c_str(), + chroot_dir_.size()) != 0) { + res.first = IOStatus::IOError(res.second, + "Attempted to access path outside chroot"); + } else { + res.first = IOStatus::OK(); + } +#if !defined(OS_AIX) + free(normalized_path); +#endif + return res; +} + +// Similar to EncodePath() except assumes the basename in the path hasn't been +// created yet. 
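ChrootFileSystem::EncodePath above defends against path escapes by canonicalizing with realpath(3) and then requiring the chroot directory to be a literal prefix of the result, so symlinks pointing outside the jail are rejected after resolution. A portable sketch of the same check, with std::filesystem::weakly_canonical standing in for realpath:

#include <filesystem>
#include <string>
#include <system_error>

namespace fs = std::filesystem;

// Returns true iff `path` (absolute, as seen inside the jail) canonicalizes
// to a location under `jail`. The plain prefix compare deliberately mirrors
// the strncmp above, and shares its caveats.
bool InsideJail(const fs::path& jail, const std::string& path) {
  if (path.empty() || path[0] != '/') return false;   // must be absolute
  std::error_code ec;
  fs::path full = fs::weakly_canonical(jail / path.substr(1), ec);
  if (ec) return false;
  fs::path jail_canon = fs::weakly_canonical(jail, ec);
  if (ec) return false;
  const std::string j = jail_canon.string();
  return full.string().compare(0, j.size(), j) == 0;
}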
+std::pair ChrootFileSystem::EncodePathWithNewBasename( + const std::string& path) { + if (path.empty() || path[0] != '/') { + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; + } + // Basename may be followed by trailing slashes + size_t final_idx = path.find_last_not_of('/'); + if (final_idx == std::string::npos) { + // It's only slashes so no basename to extract + return EncodePath(path); + } + + // Pull off the basename temporarily since realname(3) (used by + // EncodePath()) requires a path that exists + size_t base_sep = path.rfind('/', final_idx); + auto status_and_enc_path = EncodePath(path.substr(0, base_sep + 1)); + status_and_enc_path.second.append(path.substr(base_sep + 1)); + return status_and_enc_path; +} + +std::shared_ptr NewChrootFileSystem( + const std::shared_ptr& base, const std::string& chroot_dir) { + auto chroot_fs = std::make_shared(base, chroot_dir); + Status s = chroot_fs->PrepareOptions(ConfigOptions()); + if (s.ok()) { + return chroot_fs; + } else { + return nullptr; + } +} + +Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir) { + if (!base_env->FileExists(chroot_dir).ok()) { + return nullptr; + } + auto chroot_fs = NewChrootFileSystem(base_env->GetFileSystem(), chroot_dir); + if (chroot_fs != nullptr) { + return new CompositeEnvWrapper(base_env, chroot_fs); + } else { + return nullptr; + } +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(OS_WIN) diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_chroot.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_chroot.h new file mode 100644 index 0000000000..9cead1561b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_chroot.h @@ -0,0 +1,55 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#if !defined(OS_WIN) + +#include + +#include "env/fs_remap.h" +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { +class ChrootFileSystem : public RemapFileSystem { + public: + ChrootFileSystem(const std::shared_ptr& base, + const std::string& chroot_dir); + + static const char* kClassName() { return "ChrootFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + + Status PrepareOptions(const ConfigOptions& options) override; + + protected: + // Returns status and expanded absolute path including the chroot directory. + // Checks whether the provided path breaks out of the chroot. If it returns + // non-OK status, the returned path should not be used. + std::pair EncodePath(const std::string& path) override; + + // Similar to EncodePath() except assumes the basename in the path hasn't been + // created yet. + std::pair EncodePathWithNewBasename( + const std::string& path) override; + + private: + std::string chroot_dir_; +}; + +// Returns an Env that translates paths such that the root directory appears to +// be chroot_dir. chroot_dir should refer to an existing directory. +// +// This class has not been fully analyzed for providing strong security +// guarantees. 
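Because realpath(3) only works on paths that exist, EncodePathWithNewBasename peels the not-yet-created basename off, canonicalizes the existing parent, and re-appends the basename. The string surgery in isolation, as a small sketch:

#include <string>
#include <utility>

// Split "/a/b/newfile" into ("/a/b/", "newfile") so the existing parent can
// be canonicalized separately from the not-yet-created basename. Trailing
// slashes are ignored when locating the basename, as above.
std::pair<std::string, std::string> SplitForNewBasename(const std::string& path) {
  size_t final_idx = path.find_last_not_of('/');
  if (final_idx == std::string::npos) return {path, ""};  // only slashes
  size_t base_sep = path.rfind('/', final_idx);
  return {path.substr(0, base_sep + 1), path.substr(base_sep + 1)};
}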
+Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir); +std::shared_ptr NewChrootFileSystem( + const std::shared_ptr& base, const std::string& chroot_dir); + +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(OS_WIN) diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_encryption.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_encryption.cc new file mode 100644 index 0000000000..beb31a9cf9 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_encryption.cc @@ -0,0 +1,1346 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#include "rocksdb/env_encryption.h" + +#include +#include +#include +#include + +#include "env/composite_env_wrapper.h" +#include "env/env_encryption_ctr.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/convenience.h" +#include "rocksdb/io_status.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/options_type.h" +#include "util/aligned_buffer.h" +#include "util/coding.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +std::shared_ptr EncryptionProvider::NewCTRProvider( + const std::shared_ptr& cipher) { + return std::make_shared(cipher); +} + +// Read up to "n" bytes from the file. "scratch[0..n-1]" may be +// written by this routine. Sets "*result" to the data that was +// read (including if fewer than "n" bytes were successfully read). +// May set "*result" to point at data in "scratch[0..n-1]", so +// "scratch[0..n-1]" must be live when "*result" is used. +// If an error was encountered, returns a non-OK status. +// +// REQUIRES: External synchronization +IOStatus EncryptedSequentialFile::Read(size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + assert(scratch); + IOStatus io_s = file_->Read(n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; + } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + io_s = status_to_io_status( + stream_->Decrypt(offset_, (char*)result->data(), result->size())); + } + if (io_s.ok()) { + offset_ += result->size(); // We've already ready data from disk, so update + // offset_ even if decryption fails. + } + return io_s; +} + +// Skip "n" bytes from the file. This is guaranteed to be no +// slower that reading the same data, but may be faster. +// +// If end of file is reached, skipping will stop at the end of the +// file, and Skip will return OK. +// +// REQUIRES: External synchronization +IOStatus EncryptedSequentialFile::Skip(uint64_t n) { + auto status = file_->Skip(n); + if (!status.ok()) { + return status; + } + offset_ += n; + return status; +} + +// Indicates the upper layers if the current SequentialFile implementation +// uses direct IO. +bool EncryptedSequentialFile::use_direct_io() const { + return file_->use_direct_io(); +} + +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O +size_t EncryptedSequentialFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} + +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. 
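EncryptedSequentialFile::Read below reads ciphertext into the caller's scratch buffer and then decrypts it in place at the current logical offset. A toy XOR "cipher stream" shows the in-place shape and why decryption is its own inverse (illustrative only, not RocksDB's CTR scheme):

#include <cstddef>
#include <cstdint>

// Toy cipher stream: XOR each byte with a keystream derived from its
// absolute file offset, so any byte can be decrypted independently.
void XorCryptInPlace(uint64_t offset, char* data, size_t n, uint8_t key) {
  for (size_t i = 0; i < n; ++i) {
    data[i] = static_cast<char>(data[i] ^ (key + ((offset + i) & 0xff)));
  }
}

int main() {
  char buf[4] = {'a', 'b', 'c', 'd'};
  XorCryptInPlace(100, buf, 4, 0x5a);  // "encrypt"
  XorCryptInPlace(100, buf, 4, 0x5a);  // same keystream: decrypt restores
  return buf[0] == 'a' ? 0 : 1;
}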
+IOStatus EncryptedSequentialFile::InvalidateCache(size_t offset, + size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} + +// Positioned Read for direct I/O +// If Direct I/O enabled, offset, n, and scratch should be properly aligned +IOStatus EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + assert(scratch); + offset += prefixLength_; // Skip prefix + auto io_s = file_->PositionedRead(offset, n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; + } + offset_ = offset + result->size(); + { + PERF_TIMER_GUARD(decrypt_data_nanos); + io_s = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); + } + return io_s; +} + +// Read up to "n" bytes from the file starting at "offset". +// "scratch[0..n-1]" may be written by this routine. Sets "*result" +// to the data that was read (including if fewer than "n" bytes were +// successfully read). May set "*result" to point at data in +// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when +// "*result" is used. If an error was encountered, returns a non-OK +// status. +// +// Safe for concurrent use by multiple threads. +// If Direct I/O enabled, offset, n, and scratch should be aligned properly. +IOStatus EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const { + assert(scratch); + offset += prefixLength_; + auto io_s = file_->Read(offset, n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; + } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + io_s = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); + } + return io_s; +} + +// Readahead the file starting from offset by n bytes for caching. +IOStatus EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n, + const IOOptions& options, + IODebugContext* dbg) { + // return Status::OK(); + return file_->Prefetch(offset + prefixLength_, n, options, dbg); +} + +// Tries to get an unique ID for this file that will be the same each time +// the file is opened (and will stay the same while the file is open). +// Furthermore, it tries to make this ID at most "max_size" bytes. If such an +// ID can be created this function returns the length of the ID and places it +// in "id"; otherwise, this function returns 0, in which case "id" +// may not have been modified. +// +// This function guarantees, for IDs from a given environment, two unique ids +// cannot be made equal to each other by adding arbitrary bytes to one of +// them. That is, no unique ID is the prefix of another. +// +// This function guarantees that the returned ID will not be interpretable as +// a single varint. +// +// Note: these IDs are only valid for the duration of the process. +size_t EncryptedRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + return file_->GetUniqueId(id, max_size); +}; + +void EncryptedRandomAccessFile::Hint(AccessPattern pattern) { + file_->Hint(pattern); +} + +// Indicates the upper layers if the current RandomAccessFile implementation +// uses direct IO. 
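Note how every file-position parameter crossing these wrappers is shifted by prefixLength_: callers see logical offsets starting at payload byte 0, while the physical file stores an encryption prefix first. The arithmetic is uniform, as this small sketch of the convention shows:

#include <cstdint>

// Logical <-> physical translation around a fixed-size file prefix,
// following the `offset += prefixLength_` pattern used throughout.
struct PrefixedLayout {
  uint64_t prefix_len;
  uint64_t ToPhysical(uint64_t logical) const { return logical + prefix_len; }
  uint64_t ToLogicalSize(uint64_t physical_size) const {
    return physical_size - prefix_len;  // cf. EncryptedWritableFile::GetFileSize
  }
};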
+bool EncryptedRandomAccessFile::use_direct_io() const { + return file_->use_direct_io(); +} + +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O +size_t EncryptedRandomAccessFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} + +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. +IOStatus EncryptedRandomAccessFile::InvalidateCache(size_t offset, + size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +IOStatus EncryptedWritableFile::Append(const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + AlignedBuffer buf; + Slice dataToAppend(data); + if (data.size() > 0) { + auto offset = file_->GetFileSize(options, dbg); // size including prefix + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + // TODO (sagar0): Modify AlignedBuffer.Append to allow doing a memmove + // so that the next two lines can be replaced with buf.Append(). + memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + IOStatus io_s; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); + } + if (!io_s.ok()) { + return io_s; + } + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); + } + return file_->Append(dataToAppend, options, dbg); +} + +IOStatus EncryptedWritableFile::PositionedAppend(const Slice& data, + uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) { + AlignedBuffer buf; + Slice dataToAppend(data); + offset += prefixLength_; + if (data.size() > 0) { + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + IOStatus io_s; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); + } + if (!io_s.ok()) { + return io_s; + } + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); + } + return file_->PositionedAppend(dataToAppend, offset, options, dbg); +} + +// Indicates the upper layers if the current WritableFile implementation +// uses direct IO. +bool EncryptedWritableFile::use_direct_io() const { + return file_->use_direct_io(); +} + +// true if Sync() and Fsync() are safe to call concurrently with Append() +// and Flush(). +bool EncryptedWritableFile::IsSyncThreadSafe() const { + return file_->IsSyncThreadSafe(); +} + +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O +size_t EncryptedWritableFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} + +/* + * Get the size of valid data in the file. + */ +uint64_t EncryptedWritableFile::GetFileSize(const IOOptions& options, + IODebugContext* dbg) { + return file_->GetFileSize(options, dbg) - prefixLength_; +} + +// Truncate is necessary to trim the file to the correct size +// before closing. It is not always possible to keep track of the file +// size due to whole pages writes. 
The behavior is undefined if called +// with other writes to follow. +IOStatus EncryptedWritableFile::Truncate(uint64_t size, + const IOOptions& options, + IODebugContext* dbg) { + return file_->Truncate(size + prefixLength_, options, dbg); +} + +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. +// This call has no effect on dirty pages in the cache. +IOStatus EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} + +// Sync a file range with disk. +// offset is the starting byte of the file range to be synchronized. +// nbytes specifies the length of the range to be synchronized. +// This asks the OS to initiate flushing the cached data to disk, +// without waiting for completion. +// Default implementation does nothing. +IOStatus EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& options, + IODebugContext* dbg) { + return file_->RangeSync(offset + prefixLength_, nbytes, options, dbg); +} + +// PrepareWrite performs any necessary preparation for a write +// before the write actually occurs. This allows for pre-allocation +// of space on devices where it can result in less file +// fragmentation and/or less waste from over-zealous filesystem +// pre-allocation. +void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len, + const IOOptions& options, + IODebugContext* dbg) { + file_->PrepareWrite(offset + prefixLength_, len, options, dbg); +} + +void EncryptedWritableFile::SetPreallocationBlockSize(size_t size) { + // the size here doesn't need to include prefixLength_, as it's a + // configuration will be use for `PrepareWrite()`. + file_->SetPreallocationBlockSize(size); +} + +void EncryptedWritableFile::GetPreallocationStatus( + size_t* block_size, size_t* last_allocated_block) { + file_->GetPreallocationStatus(block_size, last_allocated_block); +} + +// Pre-allocates space for a file. +IOStatus EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& options, + IODebugContext* dbg) { + return file_->Allocate(offset + prefixLength_, len, options, dbg); +} + +IOStatus EncryptedWritableFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + return file_->Flush(options, dbg); +} + +IOStatus EncryptedWritableFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Sync(options, dbg); +} + +IOStatus EncryptedWritableFile::Close(const IOOptions& options, + IODebugContext* dbg) { + return file_->Close(options, dbg); +} + +// A file abstraction for random reading and writing. + +// Indicates if the class makes use of direct I/O +// If false you must pass aligned buffer to Write() +bool EncryptedRandomRWFile::use_direct_io() const { + return file_->use_direct_io(); +} + +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O +size_t EncryptedRandomRWFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} + +// Write bytes in `data` at offset `offset`, Returns Status::OK() on success. +// Pass aligned buffer when use_direct_io() returns true. 
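Append, PositionedAppend, and Write never encrypt the caller's Slice in place: each copies the payload into an alignment-aware scratch buffer, encrypts the copy, and hands the clone to the underlying file. A compact sketch of that copy-encrypt-write shape (ByteSink and the XOR keystream are stand-ins):

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// Stand-in for the underlying file.
struct ByteSink {
  std::string bytes;
  void Append(const char* p, size_t n) { bytes.append(p, n); }
};

// Copy the caller's data, encrypt the copy, write the copy: the caller's
// buffer is left untouched, as in EncryptedWritableFile::Append above.
void EncryptedAppend(ByteSink& sink, uint64_t offset, const char* data,
                     size_t n, uint8_t key) {
  std::vector<char> clone(data, data + n);        // the cloned buffer
  for (size_t i = 0; i < n; ++i) {
    clone[i] = static_cast<char>(clone[i] ^ (key + ((offset + i) & 0xff)));
  }
  sink.Append(clone.data(), n);
}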
+IOStatus EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + AlignedBuffer buf; + Slice dataToWrite(data); + offset += prefixLength_; + if (data.size() > 0) { + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + IOStatus io_s; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); + } + if (!io_s.ok()) { + return io_s; + } + dataToWrite = Slice(buf.BufferStart(), buf.CurrentSize()); + } + return file_->Write(offset, dataToWrite, options, dbg); +} + +// Read up to `n` bytes starting from offset `offset` and store them in +// result, provided `scratch` size should be at least `n`. +// Returns Status::OK() on success. +IOStatus EncryptedRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) const { + assert(scratch); + offset += prefixLength_; + auto status = file_->Read(offset, n, options, result, scratch, dbg); + if (!status.ok()) { + return status; + } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + status = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); + } + return status; +} + +IOStatus EncryptedRandomRWFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + return file_->Flush(options, dbg); +} + +IOStatus EncryptedRandomRWFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Sync(options, dbg); +} + +IOStatus EncryptedRandomRWFile::Fsync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Fsync(options, dbg); +} + +IOStatus EncryptedRandomRWFile::Close(const IOOptions& options, + IODebugContext* dbg) { + return file_->Close(options, dbg); +} + +namespace { +static std::unordered_map encrypted_fs_type_info = + { + {"provider", + OptionTypeInfo::AsCustomSharedPtr( + 0 /* No offset, whole struct*/, OptionVerificationType::kByName, + OptionTypeFlags::kNone)}, +}; +// EncryptedFileSystemImpl implements an FileSystemWrapper that adds encryption +// to files stored on disk. +class EncryptedFileSystemImpl : public EncryptedFileSystem { + public: + const char* Name() const override { + return EncryptedFileSystem::kClassName(); + } + // Returns the raw encryption provider that should be used to write the input + // encrypted file. If there is no such provider, NotFound is returned. + IOStatus GetWritableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { + if (provider_) { + *result = provider_.get(); + return IOStatus::OK(); + } else { + *result = nullptr; + return IOStatus::NotFound("No WriteProvider specified"); + } + } + + // Returns the raw encryption provider that should be used to read the input + // encrypted file. If there is no such provider, NotFound is returned. + IOStatus GetReadableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { + if (provider_) { + *result = provider_.get(); + return IOStatus::OK(); + } else { + *result = nullptr; + return IOStatus::NotFound("No Provider specified"); + } + } + + // Creates a CipherStream for the underlying file/name using the options + // If a writable provider is found and encryption is enabled, uses + // this provider to create a cipher stream. 
+ // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // should be encrypted + // @return OK on success, non-OK on failure. + template + IOStatus CreateWritableCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { + EncryptionProvider* provider = nullptr; + *prefix_length = 0; + IOStatus status = GetWritableProvider(fname, &provider); + if (!status.ok()) { + return status; + } else if (provider != nullptr) { + // Initialize & write prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider->GetPrefixLength(); + if (*prefix_length > 0) { + // Initialize prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + status = status_to_io_status(provider->CreateNewPrefix( + fname, buffer.BufferStart(), *prefix_length)); + if (status.ok()) { + buffer.Size(*prefix_length); + prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); + // Write prefix + status = underlying->Append(prefix, options.io_options, dbg); + } + if (!status.ok()) { + return status; + } + } + // Create cipher stream + status = status_to_io_status( + provider->CreateCipherStream(fname, options, prefix, stream)); + } + return status; + } + + template + IOStatus CreateWritableEncryptedFile(const std::string& fname, + std::unique_ptr& underlying, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) { + // Create cipher stream + std::unique_ptr stream; + size_t prefix_length; + IOStatus status = CreateWritableCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + if (status.ok()) { + if (stream) { + result->reset(new EncryptedWritableFile( + std::move(underlying), std::move(stream), prefix_length)); + } else { + result->reset(underlying.release()); + } + } + return status; + } + + // Creates a CipherStream for the underlying file/name using the options + // If a writable provider is found and encryption is enabled, uses + // this provider to create a cipher stream. + // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // should be encrypted + // @return OK on success, non-OK on failure. 
+ template + IOStatus CreateRandomWriteCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { + EncryptionProvider* provider = nullptr; + *prefix_length = 0; + IOStatus io_s = GetWritableProvider(fname, &provider); + if (!io_s.ok()) { + return io_s; + } else if (provider != nullptr) { + // Initialize & write prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider->GetPrefixLength(); + if (*prefix_length > 0) { + // Initialize prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + io_s = status_to_io_status(provider->CreateNewPrefix( + fname, buffer.BufferStart(), *prefix_length)); + if (io_s.ok()) { + buffer.Size(*prefix_length); + prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); + // Write prefix + io_s = underlying->Write(0, prefix, options.io_options, dbg); + } + if (!io_s.ok()) { + return io_s; + } + } + // Create cipher stream + io_s = status_to_io_status( + provider->CreateCipherStream(fname, options, prefix, stream)); + } + return io_s; + } + + // Creates a CipherStream for the underlying file/name using the options + // If a readable provider is found and the file is encrypted, uses + // this provider to create a cipher stream. + // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // is encrypted + // @return OK on success, non-OK on failure. + template + IOStatus CreateSequentialCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { + // Read prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider_->GetPrefixLength(); + if (*prefix_length > 0) { + // Read prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + IOStatus status = underlying->Read(*prefix_length, options.io_options, + &prefix, buffer.BufferStart(), dbg); + if (!status.ok()) { + return status; + } + buffer.Size(*prefix_length); + } + return status_to_io_status( + provider_->CreateCipherStream(fname, options, prefix, stream)); + } + + // Creates a CipherStream for the underlying file/name using the options + // If a readable provider is found and the file is encrypted, uses + // this provider to create a cipher stream. + // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // is encrypted + // @return OK on success, non-OK on failure. 
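All the Create*CipherStream helpers follow one protocol: on the write side, generate a fresh prefix and persist it at offset 0 before any payload; on the read side, load the prefix first and derive the stream from it. A schematic of the writer half, with a hypothetical Provider and an assumed File::WriteAt API (neither is RocksDB's interface):

#include <cstddef>
#include <memory>
#include <string>

struct CipherStream {};  // state derived from the prefix

// Hypothetical provider protocol mirroring the helpers above.
struct Provider {
  size_t PrefixLength() const { return 16; }
  std::string NewPrefix() const { return std::string(PrefixLength(), '\0'); }
  std::unique_ptr<CipherStream> FromPrefix(const std::string&) const {
    return std::make_unique<CipherStream>();
  }
};

// Writer side: the prefix is created and written before any payload bytes,
// so a later reader can always reconstruct the stream from offset 0.
template <typename File>
std::unique_ptr<CipherStream> OpenForWrite(Provider& p, File& f) {
  std::string prefix = p.NewPrefix();
  f.WriteAt(0, prefix);                  // assumed File API
  return p.FromPrefix(prefix);
}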
+ template + IOStatus CreateRandomReadCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { + // Read prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider_->GetPrefixLength(); + if (*prefix_length > 0) { + // Read prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + IOStatus status = underlying->Read(0, *prefix_length, options.io_options, + &prefix, buffer.BufferStart(), dbg); + if (!status.ok()) { + return status; + } + buffer.Size(*prefix_length); + } + return status_to_io_status( + provider_->CreateCipherStream(fname, options, prefix, stream)); + } + + public: + EncryptedFileSystemImpl(const std::shared_ptr& base, + const std::shared_ptr& provider) + : EncryptedFileSystem(base) { + provider_ = provider; + RegisterOptions("EncryptionProvider", &provider_, &encrypted_fs_type_info); + } + + Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) override { + return provider_->AddCipher(descriptor, cipher, len, for_write); + } + + // NewSequentialFile opens a file for sequential reading. + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { + result->reset(); + if (options.use_mmap_reads) { + return IOStatus::InvalidArgument(); + } + // Open file using underlying Env implementation + std::unique_ptr underlying; + auto status = + FileSystemWrapper::NewSequentialFile(fname, options, &underlying, dbg); + if (!status.ok()) { + return status; + } + uint64_t file_size; + status = FileSystemWrapper::GetFileSize(fname, options.io_options, + &file_size, dbg); + if (!status.ok()) { + return status; + } + if (!file_size) { + *result = std::move(underlying); + return status; + } + // Create cipher stream + std::unique_ptr stream; + size_t prefix_length; + status = CreateSequentialCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + if (status.ok()) { + result->reset(new EncryptedSequentialFile( + std::move(underlying), std::move(stream), prefix_length)); + } + return status; + } + + // NewRandomAccessFile opens a file for random read access. + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { + result->reset(); + if (options.use_mmap_reads) { + return IOStatus::InvalidArgument(); + } + // Open file using underlying Env implementation + std::unique_ptr underlying; + auto status = FileSystemWrapper::NewRandomAccessFile(fname, options, + &underlying, dbg); + if (!status.ok()) { + return status; + } + std::unique_ptr stream; + size_t prefix_length; + status = CreateRandomReadCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + if (status.ok()) { + if (stream) { + result->reset(new EncryptedRandomAccessFile( + std::move(underlying), std::move(stream), prefix_length)); + } else { + result->reset(underlying.release()); + } + } + return status; + } + + // NewWritableFile opens a file for sequential writing. 
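One detail worth noticing in NewSequentialFile and NewRandomAccessFile above: when no cipher stream is produced (an empty file, or no provider), the wrapper releases the plain file straight into the result rather than stacking a no-op decorator. That conditional-wrapping shape in miniature:

#include <memory>
#include <utility>

struct File { virtual ~File() = default; };

struct DecryptingFile : File {
  explicit DecryptingFile(std::unique_ptr<File> inner)
      : inner_(std::move(inner)) {}
 private:
  std::unique_ptr<File> inner_;
};

// Wrap only when there is something to decrypt; otherwise pass through,
// as `result->reset(underlying.release())` does above.
std::unique_ptr<File> MaybeWrap(std::unique_ptr<File> underlying, bool encrypted) {
  if (encrypted) return std::make_unique<DecryptingFile>(std::move(underlying));
  return underlying;
}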
+  IOStatus NewWritableFile(const std::string& fname,
+                           const FileOptions& options,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_writes) {
+      return IOStatus::InvalidArgument();
+    }
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSWritableFile> underlying;
+    IOStatus status =
+        FileSystemWrapper::NewWritableFile(fname, options, &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    return CreateWritableEncryptedFile(fname, underlying, options, result,
+                                       dbg);
+  }
+
+  // Create an object that writes to a new file with the specified
+  // name. Deletes any existing file with the same name and creates a
+  // new file. On success, stores a pointer to the new file in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& options,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_writes) {
+      return IOStatus::InvalidArgument();
+    }
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSWritableFile> underlying;
+    IOStatus status = FileSystemWrapper::ReopenWritableFile(fname, options,
+                                                            &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    return CreateWritableEncryptedFile(fname, underlying, options, result,
+                                       dbg);
+  }
+
+  // Reuse an existing file by renaming it and opening it as writable.
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& options,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_writes) {
+      return IOStatus::InvalidArgument();
+    }
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSWritableFile> underlying;
+    auto status = FileSystemWrapper::ReuseWritableFile(
+        fname, old_fname, options, &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    return CreateWritableEncryptedFile(fname, underlying, options, result,
+                                       dbg);
+  }
+  // Open `fname` for random read and write; if the file doesn't exist it
+  // will be created. On success, stores a pointer to the new file in
+  // *result and returns OK. On failure returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  IOStatus NewRandomRWFile(const std::string& fname,
+                           const FileOptions& options,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_reads || options.use_mmap_writes) {
+      return IOStatus::InvalidArgument();
+    }
+    // Check file exists
+    bool isNewFile = !FileExists(fname, options.io_options, dbg).ok();
+
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSRandomRWFile> underlying;
+    auto status =
+        FileSystemWrapper::NewRandomRWFile(fname, options, &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    // Create cipher stream
+    std::unique_ptr<BlockAccessCipherStream> stream;
+    size_t prefix_length = 0;
+    if (!isNewFile) {
+      // File already exists, read prefix
+      status = CreateRandomReadCipherStream(fname, underlying, options,
+                                            &prefix_length, &stream, dbg);
+    } else {
+      status = CreateRandomWriteCipherStream(fname, underlying, options,
+                                             &prefix_length, &stream, dbg);
+    }
+    if (status.ok()) {
+      if (stream) {
+        result->reset(new EncryptedRandomRWFile(
+            std::move(underlying), std::move(stream), prefix_length));
+      } else {
+        result->reset(underlying.release());
+      }
+    }
+    return status;
+  }
+
+  // Store in *result the attributes of the children of the specified
+  // directory.
+  // In case the implementation lists the directory prior to iterating the
+  // files, and files are concurrently deleted, the deleted files will be
+  // omitted from result.
+  // The name attributes are relative to "dir".
+  // Original contents of *results are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not
+  //                  have permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  IOStatus GetChildrenFileAttributes(const std::string& dir,
+                                     const IOOptions& options,
+                                     std::vector<FileAttributes>* result,
+                                     IODebugContext* dbg) override {
+    auto status = FileSystemWrapper::GetChildrenFileAttributes(dir, options,
+                                                               result, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    for (auto it = std::begin(*result); it != std::end(*result); ++it) {
+      // assert(it->size_bytes >= prefixLength);
+      //  breaks env_basic_test when called on a directory containing
+      //  directories, which makes subtraction of prefixLength worrisome
+      //  since FileAttributes does not identify directories
+      EncryptionProvider* provider;
+      status = GetReadableProvider(it->name, &provider);
+      if (!status.ok()) {
+        return status;
+      } else if (provider != nullptr) {
+        it->size_bytes -= provider->GetPrefixLength();
+      }
+    }
+    return IOStatus::OK();
+  }
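+  // Worked example of the prefix-size accounting used above and in
+  // GetFileSize below (numbers are illustrative): with a provider whose
+  // GetPrefixLength() is 4096, a file holding 6144 bytes of user data
+  // occupies 4096 + 6144 = 10240 physical bytes, so the sizes reported to
+  // callers are shrunk back from 10240 to 6144.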
+  // Store the size of fname in *file_size.
+  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
+                       uint64_t* file_size, IODebugContext* dbg) override {
+    auto status =
+        FileSystemWrapper::GetFileSize(fname, options, file_size, dbg);
+    if (!status.ok() || !(*file_size)) {
+      return status;
+    }
+    EncryptionProvider* provider;
+    status = GetReadableProvider(fname, &provider);
+    if (provider != nullptr && status.ok()) {
+      size_t prefixLength = provider->GetPrefixLength();
+      assert(*file_size >= prefixLength);
+      *file_size -= prefixLength;
+    }
+    return status;
+  }
+
+ private:
+  std::shared_ptr<EncryptionProvider> provider_;
+};
+}  // namespace
+
+Status NewEncryptedFileSystemImpl(
+    const std::shared_ptr<FileSystem>& base,
+    const std::shared_ptr<EncryptionProvider>& provider,
+    std::unique_ptr<FileSystem>* result) {
+  result->reset(new EncryptedFileSystemImpl(base, provider));
+  return Status::OK();
+}
+
+std::shared_ptr<FileSystem> NewEncryptedFS(
+    const std::shared_ptr<FileSystem>& base,
+    const std::shared_ptr<EncryptionProvider>& provider) {
+  std::unique_ptr<FileSystem> efs;
+  Status s = NewEncryptedFileSystemImpl(base, provider, &efs);
+  if (s.ok()) {
+    s = efs->PrepareOptions(ConfigOptions());
+  }
+  if (s.ok()) {
+    std::shared_ptr<FileSystem> result(efs.release());
+    return result;
+  } else {
+    return nullptr;
+  }
+}
+
+// Returns an Env that encrypts data when stored on disk and decrypts data when
+// read from disk.
+Env* NewEncryptedEnv(Env* base_env,
+                     const std::shared_ptr<EncryptionProvider>& provider) {
+  return new CompositeEnvWrapper(
+      base_env, NewEncryptedFS(base_env->GetFileSystem(), provider));
+}
+
+// Encrypt one or more (partial) blocks of data at the file offset.
+// Length of data is given in dataSize.
+Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char* data,
+                                        size_t dataSize) {
+  // Calculate block index
+  auto blockSize = BlockSize();
+  uint64_t blockIndex = fileOffset / blockSize;
+  size_t blockOffset = fileOffset % blockSize;
+  std::unique_ptr<char[]> blockBuffer;
+
+  std::string scratch;
+  AllocateScratch(scratch);
+
+  // Encrypt individual blocks.
+  while (1) {
+    char* block = data;
+    size_t n = std::min(dataSize, blockSize - blockOffset);
+    if (n != blockSize) {
+      // We're not encrypting a full block.
+      // Copy data to blockBuffer
+      if (!blockBuffer.get()) {
+        // Allocate buffer
+        blockBuffer = std::unique_ptr<char[]>(new char[blockSize]);
+      }
+      block = blockBuffer.get();
+      // Copy plain data to block buffer
+      memmove(block + blockOffset, data, n);
+    }
+    auto status = EncryptBlock(blockIndex, block, (char*)scratch.data());
+    if (!status.ok()) {
+      return status;
+    }
+    if (block != data) {
+      // Copy encrypted data back to `data`.
+      memmove(data, block + blockOffset, n);
+    }
+    dataSize -= n;
+    if (dataSize == 0) {
+      return Status::OK();
+    }
+    data += n;
+    blockOffset = 0;
+    blockIndex++;
+  }
+}
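+// Worked example of the block arithmetic above: with BlockSize() == 16 and
+// fileOffset == 100, blockIndex = 100 / 16 = 6 and blockOffset = 100 % 16 = 4,
+// so the first iteration handles n = min(dataSize, 16 - 4) = at most 12 bytes
+// through the partial-block path; blockOffset then resets to 0 and the rest
+// of the data is processed a full block at a time.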
+// Decrypt one or more (partial) blocks of data at the file offset.
+// Length of data is given in dataSize.
+Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char* data,
+                                        size_t dataSize) {
+  // Calculate block index
+  auto blockSize = BlockSize();
+  uint64_t blockIndex = fileOffset / blockSize;
+  size_t blockOffset = fileOffset % blockSize;
+  std::unique_ptr<char[]> blockBuffer;
+
+  std::string scratch;
+  AllocateScratch(scratch);
+
+  // Decrypt individual blocks.
+  while (1) {
+    char* block = data;
+    size_t n = std::min(dataSize, blockSize - blockOffset);
+    if (n != blockSize) {
+      // We're not decrypting a full block.
+      // Copy data to blockBuffer
+      if (!blockBuffer.get()) {
+        // Allocate buffer
+        blockBuffer = std::unique_ptr<char[]>(new char[blockSize]);
+      }
+      block = blockBuffer.get();
+      // Copy encrypted data to block buffer
+      memmove(block + blockOffset, data, n);
+    }
+    auto status = DecryptBlock(blockIndex, block, (char*)scratch.data());
+    if (!status.ok()) {
+      return status;
+    }
+    if (block != data) {
+      // Copy decrypted data back to `data`.
+      memmove(data, block + blockOffset, n);
+    }
+
+    // Simply decrementing dataSize by n could cause it to underflow,
+    // which will very likely make it read over the original bounds later
+    assert(dataSize >= n);
+    if (dataSize < n) {
+      return Status::Corruption("Cannot decrypt data at given offset");
+    }
+
+    dataSize -= n;
+    if (dataSize == 0) {
+      return Status::OK();
+    }
+    data += n;
+    blockOffset = 0;
+    blockIndex++;
+  }
+}
+
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo>
+    rot13_block_cipher_type_info = {
+        {"block_size",
+         {0 /* No offset, whole struct*/, OptionType::kInt,
+          OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+};
+// Implements a BlockCipher using ROT13.
+//
+// Note: This is a sample implementation of BlockCipher,
+// it is NOT considered safe and should NOT be used in production.
+class ROT13BlockCipher : public BlockCipher {
+ private:
+  size_t blockSize_;
+
+ public:
+  explicit ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) {
+    RegisterOptions("ROT13BlockCipherOptions", &blockSize_,
+                    &rot13_block_cipher_type_info);
+  }
+
+  static const char* kClassName() { return "ROT13"; }
+  const char* Name() const override { return kClassName(); }
+  // BlockSize returns the size of each block supported by this cipher stream.
+  size_t BlockSize() override { return blockSize_; }
+
+  // Encrypt a block of data.
+  // Length of data is equal to BlockSize().
+  Status Encrypt(char* data) override {
+    for (size_t i = 0; i < blockSize_; ++i) {
+      data[i] += 13;
+    }
+    return Status::OK();
+  }
+
+  // Decrypt a block of data.
+  // Length of data is equal to BlockSize().
+  Status Decrypt(char* data) override { return Encrypt(data); }
+};
+static const std::unordered_map<std::string, OptionTypeInfo>
+    ctr_encryption_provider_type_info = {
+        {"cipher",
+         OptionTypeInfo::AsCustomSharedPtr<BlockCipher>(
+             0 /* No offset, whole struct*/, OptionVerificationType::kByName,
+             OptionTypeFlags::kNone)},
+};
+}  // anonymous namespace
+
+// Allocate scratch space which is passed to EncryptBlock/DecryptBlock.
+void CTRCipherStream::AllocateScratch(std::string& scratch) {
+  auto blockSize = cipher_->BlockSize();
+  scratch.reserve(blockSize);
+}
+
+// Encrypt a block of data at the given block index.
+// Length of data is equal to BlockSize();
+Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char* data,
+                                     char* scratch) {
+  // Create nonce + counter
+  auto blockSize = cipher_->BlockSize();
+  memmove(scratch, iv_.data(), blockSize);
+  EncodeFixed64(scratch, blockIndex + initialCounter_);
+
+  // Encrypt nonce+counter
+  auto status = cipher_->Encrypt(scratch);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // XOR data with ciphertext.
+  for (size_t i = 0; i < blockSize; i++) {
+    data[i] = data[i] ^ scratch[i];
+  }
+  return Status::OK();
+}
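+// Because EncryptBlock() only XORs the data with an encrypted nonce+counter
+// keystream, applying it twice with the same block index restores the
+// original bytes; DecryptBlock() below relies on exactly this property.
+// A minimal sketch, assuming some std::shared_ptr<BlockCipher> `cipher`
+// with a 16-byte block size (the IV value is illustrative):
+//
+//   char iv[16] = {0};
+//   CTRCipherStream s(cipher, iv, /*initialCounter=*/0);
+//   char buf[16] = "fifteen bytes!!";
+//   s.Encrypt(0, buf, sizeof(buf));  // XOR with keystream
+//   s.Decrypt(0, buf, sizeof(buf));  // XOR again: buf is restored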
+// Decrypt a block of data at the given block index.
+// Length of data is equal to BlockSize();
+Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char* data,
+                                     char* scratch) {
+  // For CTR decryption & encryption are the same
+  return EncryptBlock(blockIndex, data, scratch);
+}
+
+CTREncryptionProvider::CTREncryptionProvider(
+    const std::shared_ptr<BlockCipher>& c)
+    : cipher_(c) {
+  RegisterOptions("Cipher", &cipher_, &ctr_encryption_provider_type_info);
+}
+
+bool CTREncryptionProvider::IsInstanceOf(const std::string& name) const {
+  // Special case for test purposes.
+  if (name == "1://test" && cipher_ != nullptr) {
+    return cipher_->IsInstanceOf(ROT13BlockCipher::kClassName());
+  } else {
+    return EncryptionProvider::IsInstanceOf(name);
+  }
+}
+
+// GetPrefixLength returns the length of the prefix that is added to every file
+// and used for storing encryption options.
+// For optimal performance, the prefix length should be a multiple of
+// the page size.
+size_t CTREncryptionProvider::GetPrefixLength() const {
+  return defaultPrefixLength;
+}
+
+Status CTREncryptionProvider::AddCipher(const std::string& /*descriptor*/,
+                                        const char* cipher, size_t len,
+                                        bool /*for_write*/) {
+  if (cipher_) {
+    return Status::NotSupported("Cannot add keys to CTREncryptionProvider");
+  } else if (strcmp(ROT13BlockCipher::kClassName(), cipher) == 0) {
+    cipher_.reset(new ROT13BlockCipher(len));
+    return Status::OK();
+  } else {
+    return BlockCipher::CreateFromString(ConfigOptions(), std::string(cipher),
+                                         &cipher_);
+  }
+}
+
+// decodeCTRParameters decodes the initial counter & IV from the given
+// (plain text) prefix.
+static void decodeCTRParameters(const char* prefix, size_t blockSize,
+                                uint64_t& initialCounter, Slice& iv) {
+  // First block contains 64-bit initial counter
+  initialCounter = DecodeFixed64(prefix);
+  // Second block contains IV
+  iv = Slice(prefix + blockSize, blockSize);
+}
+
+// CreateNewPrefix initializes an allocated block of prefix memory
+// for a new file.
+Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/,
+                                              char* prefix,
+                                              size_t prefixLength) const {
+  if (!cipher_) {
+    return Status::InvalidArgument("Encryption Cipher is missing");
+  }
+  // Create & seed rnd.
+  Random rnd((uint32_t)SystemClock::Default()->NowMicros());
+  // Fill entire prefix block with random values.
+  for (size_t i = 0; i < prefixLength; i++) {
+    prefix[i] = rnd.Uniform(256) & 0xFF;
+  }
+  // Take random data to extract initial counter & IV
+  auto blockSize = cipher_->BlockSize();
+  uint64_t initialCounter;
+  Slice prefixIV;
+  decodeCTRParameters(prefix, blockSize, initialCounter, prefixIV);
+
+  // Now populate the rest of the prefix, starting from the third block.
+  PopulateSecretPrefixPart(prefix + (2 * blockSize),
+                           prefixLength - (2 * blockSize), blockSize);
+
+  // Encrypt the prefix, starting from block 2 (leave block 0, 1 with initial
+  // counter & IV unencrypted)
+  CTRCipherStream cipherStream(cipher_, prefixIV.data(), initialCounter);
+  Status status;
+  {
+    PERF_TIMER_GUARD(encrypt_data_nanos);
+    status = cipherStream.Encrypt(0, prefix + (2 * blockSize),
+                                  prefixLength - (2 * blockSize));
+  }
+  if (!status.ok()) {
+    return status;
+  }
+  return Status::OK();
+}
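+// Sketch of the prefix layout produced by CreateNewPrefix() above, with
+// bs = cipher block size (every region starts out as random bytes):
+//
+//   [0, 8)         initial counter (plain, as read by DecodeFixed64)
+//   [8, bs)        remainder of the counter block (plain, random padding)
+//   [bs, 2*bs)     IV (plain)
+//   [2*bs, len)    secret prefix part, CTR-encrypted with the counter/IV above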
+// PopulateSecretPrefixPart initializes the data into a new prefix block
+// in plain text.
+// Returns the amount of space (starting from the start of the prefix)
+// that has been initialized.
+size_t CTREncryptionProvider::PopulateSecretPrefixPart(
+    char* /*prefix*/, size_t /*prefixLength*/, size_t /*blockSize*/) const {
+  // Nothing to do here, put in custom data in override when needed.
+  return 0;
+}
+
+Status CTREncryptionProvider::CreateCipherStream(
+    const std::string& fname, const EnvOptions& options, Slice& prefix,
+    std::unique_ptr<BlockAccessCipherStream>* result) {
+  if (!cipher_) {
+    return Status::InvalidArgument("Encryption Cipher is missing");
+  }
+  // Read plain text part of prefix.
+  auto blockSize = cipher_->BlockSize();
+  uint64_t initialCounter;
+  Slice iv;
+  decodeCTRParameters(prefix.data(), blockSize, initialCounter, iv);
+
+  // If the prefix is smaller than twice the block size, we would below read a
+  // very large chunk of the file (and very likely read over the bounds)
+  assert(prefix.size() >= 2 * blockSize);
+  if (prefix.size() < 2 * blockSize) {
+    return Status::Corruption("Unable to read from file " + fname +
+                              ": read attempt would read beyond file bounds");
+  }
+
+  // Decrypt the encrypted part of the prefix, starting from block 2 (block 0,
+  // 1 with initial counter & IV are unencrypted)
+  CTRCipherStream cipherStream(cipher_, iv.data(), initialCounter);
+  Status status;
+  {
+    PERF_TIMER_GUARD(decrypt_data_nanos);
+    status = cipherStream.Decrypt(0, (char*)prefix.data() + (2 * blockSize),
+                                  prefix.size() - (2 * blockSize));
+  }
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Create cipher stream
+  return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv,
+                                      prefix, result);
+}
+
+// CreateCipherStreamFromPrefix creates a block access cipher stream for a
+// file given its name and options. The given prefix is already decrypted.
+Status CTREncryptionProvider::CreateCipherStreamFromPrefix(
+    const std::string& /*fname*/, const EnvOptions& /*options*/,
+    uint64_t initialCounter, const Slice& iv, const Slice& /*prefix*/,
+    std::unique_ptr<BlockAccessCipherStream>* result) {
+  (*result) = std::unique_ptr<BlockAccessCipherStream>(
+      new CTRCipherStream(cipher_, iv.data(), initialCounter));
+  return Status::OK();
+}
+
+namespace {
+static void RegisterEncryptionBuiltins() {
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    auto lib = ObjectRegistry::Default()->AddLibrary("encryption");
+    // Match "CTR" or "CTR://test"
+    lib->AddFactory<EncryptionProvider>(
+        ObjectLibrary::PatternEntry(CTREncryptionProvider::kClassName(), true)
+            .AddSuffix("://test"),
+        [](const std::string& uri, std::unique_ptr<EncryptionProvider>* guard,
+           std::string* /*errmsg*/) {
+          if (EndsWith(uri, "://test")) {
+            std::shared_ptr<BlockCipher> cipher =
+                std::make_shared<ROT13BlockCipher>(32);
+            guard->reset(new CTREncryptionProvider(cipher));
+          } else {
+            guard->reset(new CTREncryptionProvider());
+          }
+          return guard->get();
+        });
+
+    lib->AddFactory<EncryptionProvider>(
+        "1://test", [](const std::string& /*uri*/,
+                       std::unique_ptr<EncryptionProvider>* guard,
+                       std::string* /*errmsg*/) {
+          std::shared_ptr<BlockCipher> cipher =
+              std::make_shared<ROT13BlockCipher>(32);
+          guard->reset(new CTREncryptionProvider(cipher));
+          return guard->get();
+        });
+
+    // Match "ROT13" or "ROT13:[0-9]+"
+    lib->AddFactory<BlockCipher>(
+        ObjectLibrary::PatternEntry(ROT13BlockCipher::kClassName(), true)
+            .AddNumber(":"),
+        [](const std::string& uri, std::unique_ptr<BlockCipher>* guard,
+           std::string* /* errmsg */) {
+          size_t colon = uri.find(':');
+          if (colon != std::string::npos) {
+            size_t block_size = ParseSizeT(uri.substr(colon + 1));
+            guard->reset(new ROT13BlockCipher(block_size));
+          } else {
+            guard->reset(new ROT13BlockCipher(32));
+          }
+
+          return guard->get();
+        });
+  });
+}
+}  // namespace
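+// The factories registered above make both types constructible from
+// URI-style strings via the CreateFromString entry points below. A minimal
+// sketch (these are the test-only ciphers; not safe for production data):
+//
+//   std::shared_ptr<BlockCipher> cipher;
+//   Status s = BlockCipher::CreateFromString(ConfigOptions(), "ROT13:32",
+//                                            &cipher);  // 32-byte blocks
+//   std::shared_ptr<EncryptionProvider> provider;
+//   if (s.ok()) {
+//     s = EncryptionProvider::CreateFromString(ConfigOptions(), "CTR://test",
+//                                              &provider);
+//   }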
+Status BlockCipher::CreateFromString(const ConfigOptions& config_options,
+                                     const std::string& value,
+                                     std::shared_ptr<BlockCipher>* result) {
+  RegisterEncryptionBuiltins();
+  return LoadSharedObject<BlockCipher>(config_options, value, result);
+}
+
+Status EncryptionProvider::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<EncryptionProvider>* result) {
+  RegisterEncryptionBuiltins();
+  return LoadSharedObject<EncryptionProvider>(config_options, value, result);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_encryption_ctr.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_encryption_ctr.h
new file mode 100644
index 0000000000..fab1da1fbf
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_encryption_ctr.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2016-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/env_encryption.h"
+
+namespace ROCKSDB_NAMESPACE {
+// CTRCipherStream implements BlockAccessCipherStream using a
+// counter (CTR) mode of operation.
+// See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation
+//
+// Note: This is a possible implementation of BlockAccessCipherStream,
+// it is considered suitable for use.
+class CTRCipherStream final : public BlockAccessCipherStream {
+ private:
+  std::shared_ptr<BlockCipher> cipher_;
+  std::string iv_;
+  uint64_t initialCounter_;
+
+ public:
+  CTRCipherStream(const std::shared_ptr<BlockCipher>& c, const char* iv,
+                  uint64_t initialCounter)
+      : cipher_(c), iv_(iv, c->BlockSize()), initialCounter_(initialCounter){};
+  virtual ~CTRCipherStream(){};
+
+  // BlockSize returns the size of each block supported by this cipher stream.
+  size_t BlockSize() override { return cipher_->BlockSize(); }
+
+ protected:
+  // Allocate scratch space which is passed to EncryptBlock/DecryptBlock.
+  void AllocateScratch(std::string&) override;
+
+  // Encrypt a block of data at the given block index.
+  // Length of data is equal to BlockSize();
+  Status EncryptBlock(uint64_t blockIndex, char* data, char* scratch) override;
+
+  // Decrypt a block of data at the given block index.
+  // Length of data is equal to BlockSize();
+  Status DecryptBlock(uint64_t blockIndex, char* data, char* scratch) override;
+};
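+// End-to-end sketch of wiring a provider into an Env (illustrative only;
+// the ROT13-backed "CTR://test" provider exists for tests and is not safe
+// for real data):
+//
+//   std::shared_ptr<EncryptionProvider> provider;
+//   Status s = EncryptionProvider::CreateFromString(ConfigOptions(),
+//                                                   "CTR://test", &provider);
+//   Env* encrypted_env =
+//       s.ok() ? NewEncryptedEnv(Env::Default(), provider) : nullptr;
+//   // hand encrypted_env to the DB, e.g. via Options::env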
+
+// This encryption provider uses a CTR cipher stream, with a given block
+// cipher and IV.
+//
+// Note: This is a possible implementation of EncryptionProvider,
+// it is considered suitable for use, provided a safe BlockCipher is used.
+class CTREncryptionProvider : public EncryptionProvider {
+ private:
+  std::shared_ptr<BlockCipher> cipher_;
+
+ protected:
+  // For optimal performance when using direct IO, the prefix length should be
+  // a multiple of the page size. This size is to ensure the first real data
+  // byte is placed at the largest known alignment point for direct io.
+  const static size_t defaultPrefixLength = 4096;
+
+ public:
+  explicit CTREncryptionProvider(
+      const std::shared_ptr<BlockCipher>& c = nullptr);
+  virtual ~CTREncryptionProvider() {}
+
+  static const char* kClassName() { return "CTR"; }
+  const char* Name() const override { return kClassName(); }
+  bool IsInstanceOf(const std::string& name) const override;
+  // GetPrefixLength returns the length of the prefix that is added to every
+  // file and used for storing encryption options.
+  // For optimal performance when using direct IO, the prefix length should be
+  // a multiple of the page size.
+  size_t GetPrefixLength() const override;
+
+  // CreateNewPrefix initializes an allocated block of prefix memory
+  // for a new file.
+  Status CreateNewPrefix(const std::string& fname, char* prefix,
+                         size_t prefixLength) const override;
+
+  // CreateCipherStream creates a block access cipher stream for a file given
+  // its name and options.
+  Status CreateCipherStream(
+      const std::string& fname, const EnvOptions& options, Slice& prefix,
+      std::unique_ptr<BlockAccessCipherStream>* result) override;
+
+  Status AddCipher(const std::string& descriptor, const char* /*cipher*/,
+                   size_t /*len*/, bool /*for_write*/) override;
+
+ protected:
+  // PopulateSecretPrefixPart initializes the data into a new prefix block
+  // that will be encrypted. This function will store the data in plain text.
+  // It will be encrypted later (before being written to disk).
+  // Returns the amount of space (starting from the start of the prefix)
+  // that has been initialized.
+  virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength,
+                                          size_t blockSize) const;
+
+  // CreateCipherStreamFromPrefix creates a block access cipher stream for a
+  // file given its name and options. The given prefix is already decrypted.
+  virtual Status CreateCipherStreamFromPrefix(
+      const std::string& fname, const EnvOptions& options,
+      uint64_t initialCounter, const Slice& iv, const Slice& prefix,
+      std::unique_ptr<BlockAccessCipherStream>* result);
+};
+
+Status NewEncryptedFileSystemImpl(
+    const std::shared_ptr<FileSystem>& base_fs,
+    const std::shared_ptr<EncryptionProvider>& provider,
+    std::unique_ptr<FileSystem>* fs);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_posix.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_posix.cc
new file mode 100644
index 0000000000..400b8ac6ee
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/env_posix.cc
@@ -0,0 +1,524 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors
+
+#include "port/lang.h"
+#if !defined(OS_WIN)
+
+#include <dirent.h>
+#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
+#include <dlfcn.h>
+#endif
+#include <errno.h>
+#include <fcntl.h>
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+#include <liburing.h>
+#endif
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID)
+#include <sys/statfs.h>
+#endif
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#if defined(ROCKSDB_IOURING_PRESENT)
+#include <sys/uio.h>
+#endif
+#include <time.h>
+#include <unistd.h>
+
+#include <algorithm>
+// Get nano time includes
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD)
+#elif defined(__MACH__)
+#include <Availability.h>
+#include <mach/clock.h>
+#include <mach/mach.h>
+#else
+#include <chrono>
+#endif
+#include <deque>
+#include <set>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "env/io_posix.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "port/port.h"
+#include "port/sys_time.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/compression_context_cache.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "util/thread_local.h"
+#include "util/threadpool_imp.h"
+
+#if !defined(TMPFS_MAGIC)
+#define TMPFS_MAGIC 0x01021994
+#endif
+#if !defined(XFS_SUPER_MAGIC)
+#define XFS_SUPER_MAGIC 0x58465342
+#endif
+#if !defined(EXT4_SUPER_MAGIC)
+#define EXT4_SUPER_MAGIC 0xEF53
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+#if defined(OS_WIN)
+static const std::string kSharedLibExt = ".dll";
+static const char kPathSeparator = ';';
+#else
+static const char kPathSeparator = ':';
+#if defined(OS_MACOSX)
+static const std::string kSharedLibExt = ".dylib";
+#else
+static const std::string kSharedLibExt = ".so";
+#endif
+#endif
+
+namespace {
+
+ThreadStatusUpdater* CreateThreadStatusUpdater() {
+  return new ThreadStatusUpdater();
+}
+
+#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
+class PosixDynamicLibrary : public DynamicLibrary {
+ public:
+  PosixDynamicLibrary(const std::string& name, void* handle)
+      : name_(name), handle_(handle) {}
+  ~PosixDynamicLibrary() override { dlclose(handle_); }
+
+  Status LoadSymbol(const std::string& sym_name, void** func) override {
+    assert(nullptr != func);
+    dlerror();  // Clear any old error
+    *func = dlsym(handle_, sym_name.c_str());
+    if (*func != nullptr) {
+      return Status::OK();
+    } else {
+      char* err = dlerror();
+      return Status::NotFound("Error finding symbol: " + sym_name, err);
+    }
+  }
+
+  const char* Name() const override { return name_.c_str(); }
+
+ private:
+  std::string name_;
+  void* handle_;
+};
+#endif  // !ROCKSDB_NO_DYNAMIC_EXTENSION
+
+class PosixClock : public SystemClock {
+ public:
+  static const char* kClassName() { return "PosixClock"; }
+  const char* Name() const override { return kDefaultName(); }
+  const char* NickName() const override { return kClassName(); }
+
+  uint64_t NowMicros() override {
+    port::TimeVal tv;
+    port::GetTimeOfDay(&tv, nullptr);
+    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  }
+
+  uint64_t NowNanos() override {
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \
+    defined(OS_AIX)
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#elif defined(OS_SOLARIS)
+    return gethrtime();
+#elif defined(__MACH__)
+    clock_serv_t cclock;
+    mach_timespec_t ts;
+    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
clock_get_time(cclock, &ts); + mach_port_deallocate(mach_task_self(), cclock); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#else + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); +#endif + } + + uint64_t CPUMicros() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return (static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000; +#endif + return 0; + } + + uint64_t CPUNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#endif + return 0; + } + + void SleepForMicroseconds(int micros) override { usleep(micros); } + + Status GetCurrentTime(int64_t* unix_time) override { + time_t ret = time(nullptr); + if (ret == (time_t)-1) { + return IOError("GetCurrentTime", "", errno); + } + *unix_time = (int64_t)ret; + return Status::OK(); + } + + std::string TimeToString(uint64_t secondsSince1970) override { + const time_t seconds = (time_t)secondsSince1970; + struct tm t; + int maxsize = 64; + std::string dummy; + dummy.reserve(maxsize); + dummy.resize(maxsize); + char* p = &dummy[0]; + port::LocalTimeR(&seconds, &t); + snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec); + return dummy; + } +}; + +class PosixEnv : public CompositeEnv { + public: + static const char* kClassName() { return "PosixEnv"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } + + struct JoinThreadsOnExit { + explicit JoinThreadsOnExit(PosixEnv& _deflt) : deflt(_deflt) {} + ~JoinThreadsOnExit() { + for (const auto tid : deflt.threads_to_join_) { + pthread_join(tid, nullptr); + } + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + deflt.thread_pools_[pool_id].JoinAllThreads(); + } + // Do not delete the thread_status_updater_ in order to avoid the + // free after use when Env::Default() is destructed while some other + // child threads are still trying to update thread status. All + // PosixEnv instances use the same thread_status_updater_, so never + // explicitly delete it. + } + PosixEnv& deflt; + }; + + void SetFD_CLOEXEC(int fd, const EnvOptions* options) { + if ((options == nullptr || options->set_fd_cloexec) && fd > 0) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + } + +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION + // Loads the named library into the result. + // If the input name is empty, the current executable is loaded + // On *nix systems, a "lib" prefix is added to the name if one is not supplied + // Comparably, the appropriate shared library extension is added to the name + // if not supplied. 
If search_path is not specified, the shared library will + // be loaded using the default path (LD_LIBRARY_PATH) If search_path is + // specified, the shared library will be searched for in the directories + // provided by the search path + Status LoadLibrary(const std::string& name, const std::string& path, + std::shared_ptr* result) override { + assert(result != nullptr); + if (name.empty()) { + void* hndl = dlopen(NULL, RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(name, hndl)); + return Status::OK(); + } + } else { + std::string library_name = name; + if (library_name.find(kSharedLibExt) == std::string::npos) { + library_name = library_name + kSharedLibExt; + } +#if !defined(OS_WIN) + if (library_name.find('/') == std::string::npos && + library_name.compare(0, 3, "lib") != 0) { + library_name = "lib" + library_name; + } +#endif + if (path.empty()) { + void* hndl = dlopen(library_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(library_name, hndl)); + return Status::OK(); + } + } else { + std::string local_path; + std::stringstream ss(path); + while (getline(ss, local_path, kPathSeparator)) { + if (!path.empty()) { + std::string full_name = local_path + "/" + library_name; + void* hndl = dlopen(full_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(full_name, hndl)); + return Status::OK(); + } + } + } + } + } + return Status::IOError( + IOErrorMsg("Failed to open shared library: xs", name), dlerror()); + } +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + + void Schedule(void (*function)(void* arg1), void* arg, Priority pri = LOW, + void* tag = nullptr, + void (*unschedFunction)(void* arg) = nullptr) override; + + int UnSchedule(void* arg, Priority pri) override; + + void StartThread(void (*function)(void* arg), void* arg) override; + + void WaitForJoin() override; + + unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override; + + int ReserveThreads(int threads_to_be_reserved, Priority pri) override; + + int ReleaseThreads(int threads_to_be_released, Priority pri) override; + + Status GetThreadList(std::vector* thread_list) override { + assert(thread_status_updater_); + return thread_status_updater_->GetThreadList(thread_list); + } + + uint64_t GetThreadID() const override { + uint64_t thread_id = 0; +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 30) + thread_id = ::gettid(); +#else // __GLIBC_PREREQ(2, 30) + pthread_t tid = pthread_self(); + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); +#endif // __GLIBC_PREREQ(2, 30) +#else // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) + pthread_t tid = pthread_self(); + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); +#endif // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) + return thread_id; + } + + Status GetHostName(char* name, uint64_t len) override { + int ret = gethostname(name, static_cast(len)); + if (ret < 0) { + if (errno == EFAULT || errno == EINVAL) { + return Status::InvalidArgument(errnoStr(errno).c_str()); + } else { + return IOError("GetHostName", name, errno); + } + } + return Status::OK(); + } + + ThreadStatusUpdater* GetThreadStatusUpdater() const override { + return Env::GetThreadStatusUpdater(); + } + + std::string GenerateUniqueId() override { return Env::GenerateUniqueId(); } + + // Allow increasing the number of worker threads. 
+ void SetBackgroundThreads(int num, Priority pri) override { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + thread_pools_[pri].SetBackgroundThreads(num); + } + + int GetBackgroundThreads(Priority pri) override { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + return thread_pools_[pri].GetBackgroundThreads(); + } + + Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { + allow_non_owner_access_ = allow_non_owner_access; + return Status::OK(); + } + + // Allow increasing the number of worker threads. + void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); + } + + void LowerThreadPoolIOPriority(Priority pool) override { + assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); +#ifdef OS_LINUX + thread_pools_[pool].LowerIOPriority(); +#else + (void)pool; +#endif + } + + void LowerThreadPoolCPUPriority(Priority pool) override { + assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); + thread_pools_[pool].LowerCPUPriority(CpuPriority::kLow); + } + + Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override { + assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); + thread_pools_[pool].LowerCPUPriority(pri); + return Status::OK(); + } + + private: + friend Env* Env::Default(); + // Constructs the default Env, a singleton + PosixEnv(); + + // The below 4 members are only used by the default PosixEnv instance. + // Non-default instances simply maintain references to the backing + // members in te default instance + std::vector thread_pools_storage_; + pthread_mutex_t mu_storage_; + std::vector threads_to_join_storage_; + bool allow_non_owner_access_storage_; + + std::vector& thread_pools_; + pthread_mutex_t& mu_; + std::vector& threads_to_join_; + // If true, allow non owner read access for db files. Otherwise, non-owner + // has no access to db files. + bool& allow_non_owner_access_; +}; + +PosixEnv::PosixEnv() + : CompositeEnv(FileSystem::Default(), SystemClock::Default()), + thread_pools_storage_(Priority::TOTAL), + allow_non_owner_access_storage_(true), + thread_pools_(thread_pools_storage_), + mu_(mu_storage_), + threads_to_join_(threads_to_join_storage_), + allow_non_owner_access_(allow_non_owner_access_storage_) { + ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].SetThreadPriority( + static_cast(pool_id)); + // This allows later initializing the thread-local-env of each thread. 
+ thread_pools_[pool_id].SetHostEnv(this); + } + thread_status_updater_ = CreateThreadStatusUpdater(); +} + +void PosixEnv::Schedule(void (*function)(void* arg1), void* arg, Priority pri, + void* tag, void (*unschedFunction)(void* arg)) { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); +} + +int PosixEnv::UnSchedule(void* arg, Priority pri) { + return thread_pools_[pri].UnSchedule(arg); +} + +unsigned int PosixEnv::GetThreadPoolQueueLen(Priority pri) const { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + return thread_pools_[pri].GetQueueLen(); +} + +int PosixEnv::ReserveThreads(int threads_to_reserved, Priority pri) { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + return thread_pools_[pri].ReserveThreads(threads_to_reserved); +} + +int PosixEnv::ReleaseThreads(int threads_to_released, Priority pri) { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + return thread_pools_[pri].ReleaseThreads(threads_to_released); +} + +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; + +static void* StartThreadWrapper(void* arg) { + StartThreadState* state = reinterpret_cast(arg); + state->user_function(state->arg); + delete state; + return nullptr; +} + +void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { + pthread_t t; + StartThreadState* state = new StartThreadState; + state->user_function = function; + state->arg = arg; + ThreadPoolImpl::PthreadCall( + "start thread", pthread_create(&t, nullptr, &StartThreadWrapper, state)); + ThreadPoolImpl::PthreadCall("lock", pthread_mutex_lock(&mu_)); + threads_to_join_.push_back(t); + ThreadPoolImpl::PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +void PosixEnv::WaitForJoin() { + for (const auto tid : threads_to_join_) { + pthread_join(tid, nullptr); + } + threads_to_join_.clear(); +} + +} // namespace + +// +// Default Posix Env +// +Env* Env::Default() { + // The following function call initializes the singletons of ThreadLocalPtr + // right before the static default_env. This guarantees default_env will + // always being destructed before the ThreadLocalPtr singletons get + // destructed as C++ guarantees that the destructions of static variables + // is in the reverse order of their constructions. + // + // Since static members are destructed in the reverse order + // of their construction, having this call here guarantees that + // the destructor of static PosixEnv will go first, then the + // the singletons of ThreadLocalPtr. + ThreadLocalPtr::InitSingletons(); + CompressionContextCache::InitSingleton(); + INIT_SYNC_POINT_SINGLETONS(); + // Avoid problems with accessing most members of Env::Default() during + // static destruction. + STATIC_AVOID_DESTRUCTION(PosixEnv, default_env); + // This destructor must be called on exit + static PosixEnv::JoinThreadsOnExit thread_joiner(default_env); + return &default_env; +} + +// +// Default Posix SystemClock +// +const std::shared_ptr& SystemClock::Default() { + STATIC_AVOID_DESTRUCTION(std::shared_ptr, instance) + (std::make_shared()); + return instance; +} +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system.cc new file mode 100644 index 0000000000..71fb4d5bc7 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system.cc @@ -0,0 +1,277 @@ +// Copyright (c) 2019-present, Facebook, Inc. 
All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "rocksdb/file_system.h" + +#include "env/composite_env_wrapper.h" +#include "env/env_chroot.h" +#include "env/env_encryption_ctr.h" +#include "env/fs_readonly.h" +#include "env/mock_env.h" +#include "logging/env_logger.h" +#include "options/db_options.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" +#include "utilities/counted_fs.h" +#include "utilities/env_timed.h" + +namespace ROCKSDB_NAMESPACE { + +FileSystem::FileSystem() {} + +FileSystem::~FileSystem() {} + +static int RegisterBuiltinFileSystems(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + TimedFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new TimedFileSystem(nullptr)); + return guard->get(); + }); + library.AddFactory( + ReadOnlyFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new ReadOnlyFileSystem(nullptr)); + return guard->get(); + }); + library.AddFactory( + EncryptedFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* errmsg) { + Status s = NewEncryptedFileSystemImpl(nullptr, nullptr, guard); + if (!s.ok()) { + *errmsg = s.ToString(); + } + return guard->get(); + }); + library.AddFactory( + CountedFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new CountedFileSystem(FileSystem::Default())); + return guard->get(); + }); + library.AddFactory( + MockFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new MockFileSystem(SystemClock::Default())); + return guard->get(); + }); +#ifndef OS_WIN + library.AddFactory( + ChrootFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new ChrootFileSystem(nullptr, "")); + return guard->get(); + }); +#endif // OS_WIN + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} + +Status FileSystem::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + auto default_fs = FileSystem::Default(); + if (default_fs->IsInstanceOf(value)) { + *result = default_fs; + return Status::OK(); + } else { + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinFileSystems(*(ObjectLibrary::Default().get()), ""); + }); + return LoadSharedObject(config_options, value, result); + } +} + +IOStatus FileSystem::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) { + IOStatus s = RenameFile(old_fname, fname, opts.io_options, dbg); + if (!s.ok()) { + return s; + } + return NewWritableFile(fname, opts, result, dbg); +} + +IOStatus FileSystem::NewLogger(const std::string& fname, + const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg) { + FileOptions options; + options.io_options = io_opts; + // TODO: Tune the buffer size. 
+ options.writable_file_max_buffer_size = 1024 * 1024; + std::unique_ptr writable_file; + const IOStatus status = NewWritableFile(fname, options, &writable_file, dbg); + if (!status.ok()) { + return status; + } + + *result = std::make_shared(std::move(writable_file), fname, + options, Env::Default()); + return IOStatus::OK(); +} + +FileOptions FileSystem::OptimizeForLogRead( + const FileOptions& file_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_reads = false; + return optimized_file_options; +} + +FileOptions FileSystem::OptimizeForManifestRead( + const FileOptions& file_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_reads = false; + return optimized_file_options; +} + +FileOptions FileSystem::OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.bytes_per_sync = db_options.wal_bytes_per_sync; + optimized_file_options.writable_file_max_buffer_size = + db_options.writable_file_max_buffer_size; + return optimized_file_options; +} + +FileOptions FileSystem::OptimizeForManifestWrite( + const FileOptions& file_options) const { + return file_options; +} + +FileOptions FileSystem::OptimizeForCompactionTableWrite( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_writes = + db_options.use_direct_io_for_flush_and_compaction; + return optimized_file_options; +} + +FileOptions FileSystem::OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_reads = db_options.use_direct_reads; + return optimized_file_options; +} + +FileOptions FileSystem::OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_reads = db_options.use_direct_reads; + return optimized_file_options; +} + +IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, + const std::string& fname, bool should_sync) { + std::unique_ptr file; + EnvOptions soptions; + IOStatus s = fs->NewWritableFile(fname, soptions, &file, nullptr); + if (!s.ok()) { + return s; + } + s = file->Append(data, IOOptions(), nullptr); + if (s.ok() && should_sync) { + s = file->Sync(IOOptions(), nullptr); + } + if (!s.ok()) { + fs->DeleteFile(fname, IOOptions(), nullptr); + } + return s; +} + +IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, + std::string* data) { + FileOptions soptions; + data->clear(); + std::unique_ptr file; + IOStatus s = status_to_io_status( + fs->NewSequentialFile(fname, soptions, &file, nullptr)); + if (!s.ok()) { + return s; + } + static const int kBufferSize = 8192; + char* space = new char[kBufferSize]; + while (true) { + Slice fragment; + s = file->Read(kBufferSize, IOOptions(), &fragment, space, nullptr); + if (!s.ok()) { + break; + } + data->append(fragment.data(), fragment.size()); + if (fragment.empty()) { + break; + } + } + delete[] space; + return s; +} + +namespace { +static std::unordered_map fs_wrapper_type_info = { + {"target", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)}, +}; +} // namespace 
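+// Usage sketch for the two helpers above (illustrative; the path is a
+// placeholder and error handling is abbreviated):
+//
+//   std::shared_ptr<FileSystem> fs = FileSystem::Default();
+//   IOStatus s = WriteStringToFile(fs.get(), Slice("hello"), "/tmp/f",
+//                                  /*should_sync=*/true);
+//   std::string read_back;
+//   if (s.ok()) {
+//     s = ReadFileToString(fs.get(), "/tmp/f", &read_back);
+//   }
+//   // on success, read_back == "hello"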
+FileSystemWrapper::FileSystemWrapper(const std::shared_ptr& t) + : target_(t) { + RegisterOptions("", &target_, &fs_wrapper_type_info); +} + +Status FileSystemWrapper::PrepareOptions(const ConfigOptions& options) { + if (target_ == nullptr) { + target_ = FileSystem::Default(); + } + return FileSystem::PrepareOptions(options); +} + +std::string FileSystemWrapper::SerializeOptions( + const ConfigOptions& config_options, const std::string& header) const { + auto parent = FileSystem::SerializeOptions(config_options, ""); + if (config_options.IsShallow() || target_ == nullptr || + target_->IsInstanceOf(FileSystem::kDefaultName())) { + return parent; + } else { + std::string result = header; + if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) { + result.append(OptionTypeInfo::kIdPropName()).append("="); + } + result.append(parent); + if (!EndsWith(result, config_options.delimiter)) { + result.append(config_options.delimiter); + } + result.append("target=").append(target_->ToString(config_options)); + return result; + } +} + +DirFsyncOptions::DirFsyncOptions() { reason = kDefault; } + +DirFsyncOptions::DirFsyncOptions(std::string file_renamed_new_name) { + reason = kFileRenamed; + renamed_new_name = file_renamed_new_name; +} + +DirFsyncOptions::DirFsyncOptions(FsyncReason fsync_reason) { + assert(fsync_reason != kFileRenamed); + reason = fsync_reason; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system_tracer.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system_tracer.cc new file mode 100644 index 0000000000..d0c45c57ee --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system_tracer.cc @@ -0,0 +1,564 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "env/file_system_tracer.h" + +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/trace_record.h" + +namespace ROCKSDB_NAMESPACE { + +IOStatus FileSystemTracingWrapper::NewSequentialFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewSequentialFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::ReopenWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->ReopenWritableFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& file_opts, std::unique_ptr* result, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = + target()->ReuseWritableFile(fname, old_fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewRandomRWFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewRandomRWFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; 
+} + +IOStatus FileSystemTracingWrapper::NewDirectory( + const std::string& name, const IOOptions& io_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewDirectory(name, io_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + name.substr(name.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::GetChildren(const std::string& dir, + const IOOptions& io_opts, + std::vector* r, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->GetChildren(dir, io_opts, r, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dir.substr(dir.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->DeleteFile(fname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::CreateDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->CreateDir(dirname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::CreateDirIfMissing( + const std::string& dirname, const IOOptions& options, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->CreateDirIfMissing(dirname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::DeleteDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->DeleteDir(dirname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::GetFileSize(const std::string& fname, + const IOOptions& options, + uint64_t* file_size, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->GetFileSize(fname, options, file_size, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record( + clock_->NowNanos(), 
TraceType::kIOTracer, io_op_data, __func__, elapsed, + s.ToString(), fname.substr(fname.find_last_of("/\\") + 1), *file_size); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::Truncate(const std::string& fname, + size_t size, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Truncate(fname, size, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1), size); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSSequentialFileTracingWrapper::Read(size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Read(n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + result->size(), 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSSequentialFileTracingWrapper::InvalidateCache(size_t offset, + size_t length) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->InvalidateCache(offset, length); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, + offset); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); + return s; +} + +IOStatus FSSequentialFileTracingWrapper::PositionedRead( + uint64_t offset, size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = + target()->PositionedRead(offset, n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + result->size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Read(offset, n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::MultiRead(FSReadRequest* reqs, + size_t num_reqs, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->MultiRead(reqs, num_reqs, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + 
uint64_t latency = elapsed;
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  for (size_t i = 0; i < num_reqs; i++) {
+    IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                            io_op_data, __func__, latency,
+                            reqs[i].status.ToString(), file_name_, reqs[i].len,
+                            reqs[i].offset);
+    io_tracer_->WriteIOOp(io_record, dbg);
+  }
+  return s;
+}
+
+IOStatus FSRandomAccessFileTracingWrapper::Prefetch(uint64_t offset, size_t n,
+                                                    const IOOptions& options,
+                                                    IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Prefetch(offset, n, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_, n,
+                          offset);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSRandomAccessFileTracingWrapper::InvalidateCache(size_t offset,
+                                                           size_t length) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->InvalidateCache(offset, length);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_, length,
+                          static_cast<uint64_t>(offset));
+  io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/);
+  return s;
+}
+
+IOStatus FSRandomAccessFileTracingWrapper::ReadAsync(
+    FSReadRequest& req, const IOOptions& opts,
+    std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+    void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
+  // Create a callback and populate info.
+  auto read_async_callback =
+      std::bind(&FSRandomAccessFileTracingWrapper::ReadAsyncCallback, this,
+                std::placeholders::_1, std::placeholders::_2);
+  ReadAsyncCallbackInfo* read_async_cb_info = new ReadAsyncCallbackInfo;
+  read_async_cb_info->cb_ = cb;
+  read_async_cb_info->cb_arg_ = cb_arg;
+  read_async_cb_info->start_time_ = clock_->NowNanos();
+  read_async_cb_info->file_op_ = __func__;
+
+  IOStatus s = target()->ReadAsync(req, opts, read_async_callback,
+                                   read_async_cb_info, io_handle, del_fn, dbg);
+
+  if (!s.ok()) {
+    delete read_async_cb_info;
+  }
+  return s;
+}
+
+void FSRandomAccessFileTracingWrapper::ReadAsyncCallback(
+    const FSReadRequest& req, void* cb_arg) {
+  ReadAsyncCallbackInfo* read_async_cb_info =
+      static_cast<ReadAsyncCallbackInfo*>(cb_arg);
+  assert(read_async_cb_info);
+  assert(read_async_cb_info->cb_);
+
+  uint64_t elapsed = clock_->NowNanos() - read_async_cb_info->start_time_;
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          read_async_cb_info->file_op_, elapsed,
+                          req.status.ToString(), file_name_, req.result.size(),
+                          req.offset);
+  io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/);
+
+  // call the underlying callback.
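+  // The heap-allocated ReadAsyncCallbackInfo is owned by this wrapper: it is
+  // freed here after the user callback runs, mirroring the delete on the
+  // error path in ReadAsync() above.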
+  read_async_cb_info->cb_(req, read_async_cb_info->cb_arg_);
+  delete read_async_cb_info;
+}
+
+IOStatus FSWritableFileTracingWrapper::Append(const Slice& data,
+                                              const IOOptions& options,
+                                              IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Append(data, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_,
+                          data.size(), 0 /*Offset*/);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSWritableFileTracingWrapper::PositionedAppend(
+    const Slice& data, uint64_t offset, const IOOptions& options,
+    IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->PositionedAppend(data, offset, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_,
+                          data.size(), offset);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSWritableFileTracingWrapper::Truncate(uint64_t size,
+                                                const IOOptions& options,
+                                                IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Truncate(size, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_, size,
+                          0 /*Offset*/);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSWritableFileTracingWrapper::Close(const IOOptions& options,
+                                             IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Close(options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          file_name_);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+uint64_t FSWritableFileTracingWrapper::GetFileSize(const IOOptions& options,
+                                                   IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  uint64_t file_size = target()->GetFileSize(options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOFileSize);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, "OK", file_name_, file_size);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return file_size;
+}
+
+IOStatus FSWritableFileTracingWrapper::InvalidateCache(size_t offset,
+                                                       size_t length) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->InvalidateCache(offset, length);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_, length,
+                          static_cast<uint64_t>(offset));
+  io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/);
+  return s;
+}
+
+IOStatus FSRandomRWFileTracingWrapper::Write(uint64_t offset, const Slice& data,
+                                             const IOOptions& options,
+                                             IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s =
target()->Write(offset, data, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Read(offset, n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Flush(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Flush(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Close(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Close(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Sync(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Sync(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Fsync(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Fsync(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system_tracer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system_tracer.h new file mode 100644 index 0000000000..979a0bf120 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/file_system_tracer.h @@ -0,0 +1,461 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
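+//
+// The classes below layer an IOTracer over the FileSystem and FSFile
+// abstractions so that individual I/O calls can be timed and recorded. A
+// minimal usage sketch (illustrative only; `tracer` is a placeholder name,
+// not part of this header):
+//
+//   std::shared_ptr<IOTracer> tracer = std::make_shared<IOTracer>();
+//   FileSystemPtr fs(FileSystem::Default(), tracer);
+//   // Calls made through `fs->` are traced only while tracing is enabled:
+//   fs->CreateDirIfMissing("/tmp/example-db", IOOptions(), nullptr);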
+
+#pragma once
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "trace_replay/io_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// FileSystemTracingWrapper is a wrapper class above FileSystem that forwards
+// the call to the underlying storage system. It then invokes IOTracer to
+// record file operations and other contextual information in a binary format
+// for tracing. It overrides methods we are interested in tracing and extends
+// FileSystemWrapper, which forwards all methods that are not explicitly
+// overridden.
+class FileSystemTracingWrapper : public FileSystemWrapper {
+ public:
+  FileSystemTracingWrapper(const std::shared_ptr<FileSystem>& t,
+                           const std::shared_ptr<IOTracer>& io_tracer)
+      : FileSystemWrapper(t),
+        io_tracer_(io_tracer),
+        clock_(SystemClock::Default().get()) {}
+
+  ~FileSystemTracingWrapper() override {}
+
+  static const char* kClassName() { return "FileSystemTracing"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewSequentialFile(const std::string& fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSSequentialFile>* result,
+                             IODebugContext* dbg) override;
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override;
+
+  IOStatus NewWritableFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override;
+
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& file_opts,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override;
+
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* dbg) override;
+
+  IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override;
+
+  IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override;
+
+  IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+                       std::vector<std::string>* r,
+                       IODebugContext* dbg) override;
+
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override;
+
+  IOStatus CreateDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  IOStatus CreateDirIfMissing(const std::string& dirname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) override;
+
+  IOStatus DeleteDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
+                       uint64_t* file_size, IODebugContext* dbg) override;
+
+  IOStatus Truncate(const std::string& fname, size_t size,
+                    const IOOptions& options, IODebugContext* dbg) override;
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  SystemClock* clock_;
+};
+
+// The FileSystemPtr is a wrapper class that takes pointer to storage systems
+// (such as posix filesystems). It overloads operator -> and returns a pointer
+// of either FileSystem or FileSystemTracingWrapper based on whether tracing is
+// enabled or not. It is added to bypass FileSystemTracingWrapper when tracing
+// is disabled.
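+//
+// For example (illustrative): with `FileSystemPtr fs(base, tracer)`, a call
+// such as `fs->DeleteFile(...)` is routed through FileSystemTracingWrapper
+// while `tracer->is_tracing_enabled()` holds, and straight to `base`
+// otherwise, so disabled tracing costs no more than the branch below.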
+class FileSystemPtr {
+ public:
+  FileSystemPtr(std::shared_ptr<FileSystem> fs,
+                const std::shared_ptr<IOTracer>& io_tracer)
+      : fs_(fs), io_tracer_(io_tracer) {
+    fs_tracer_ = std::make_shared<FileSystemTracingWrapper>(fs_, io_tracer_);
+  }
+
+  std::shared_ptr<FileSystem> operator->() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return fs_tracer_;
+    } else {
+      return fs_;
+    }
+  }
+
+  /* Returns the underlying File System pointer */
+  FileSystem* get() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return fs_tracer_.get();
+    } else {
+      return fs_.get();
+    }
+  }
+
+ private:
+  std::shared_ptr<FileSystem> fs_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  std::shared_ptr<FileSystemTracingWrapper> fs_tracer_;
+};
+
+// FSSequentialFileTracingWrapper is a wrapper class above FSSequentialFile
+// that forwards the call to the underlying storage system. It then invokes
+// IOTracer to record file operations and other contextual information in a
+// binary format for tracing. It overrides methods we are interested in
+// tracing and extends FSSequentialFileWrapper, which forwards all methods
+// that are not explicitly overridden.
+class FSSequentialFileTracingWrapper : public FSSequentialFileOwnerWrapper {
+ public:
+  FSSequentialFileTracingWrapper(std::unique_ptr<FSSequentialFile>&& t,
+                                 std::shared_ptr<IOTracer> io_tracer,
+                                 const std::string& file_name)
+      : FSSequentialFileOwnerWrapper(std::move(t)),
+        io_tracer_(io_tracer),
+        clock_(SystemClock::Default().get()),
+        file_name_(file_name) {}
+
+  ~FSSequentialFileTracingWrapper() override {}
+
+  IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                char* scratch, IODebugContext* dbg) override;
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override;
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  SystemClock* clock_;
+  std::string file_name_;
+};
+
+// The FSSequentialFilePtr is a wrapper class that takes pointer to storage
+// systems (such as posix filesystems). It overloads operator -> and returns a
+// pointer of either FSSequentialFile or FSSequentialFileTracingWrapper based
+// on whether tracing is enabled or not. It is added to bypass
+// FSSequentialFileTracingWrapper when tracing is disabled.
+class FSSequentialFilePtr {
+ public:
+  FSSequentialFilePtr() = delete;
+  FSSequentialFilePtr(std::unique_ptr<FSSequentialFile>&& fs,
+                      const std::shared_ptr<IOTracer>& io_tracer,
+                      const std::string& file_name)
+      : io_tracer_(io_tracer),
+        fs_tracer_(std::move(fs), io_tracer_,
+                   file_name.substr(file_name.find_last_of("/\\") +
+                                    1) /* pass file name */) {}
+
+  FSSequentialFile* operator->() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return const_cast<FSSequentialFileTracingWrapper*>(&fs_tracer_);
+    } else {
+      return fs_tracer_.target();
+    }
+  }
+
+  FSSequentialFile* get() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return const_cast<FSSequentialFileTracingWrapper*>(&fs_tracer_);
+    } else {
+      return fs_tracer_.target();
+    }
+  }
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  FSSequentialFileTracingWrapper fs_tracer_;
+};
+
+// FSRandomAccessFileTracingWrapper is a wrapper class above FSRandomAccessFile
+// that forwards the call to the underlying storage system. It then invokes
+// IOTracer to record file operations and other contextual information in a
+// binary format for tracing. It overrides methods we are interested in tracing
+// and extends FSRandomAccessFileWrapper, which forwards all methods that are
+// not explicitly overridden.
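+// Note that MultiRead() is recorded as one trace record per FSReadRequest,
+// and ReadAsync() is timed from submission until its completion callback
+// fires, so the recorded latency for async reads includes queueing time.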
+class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileOwnerWrapper {
+ public:
+  FSRandomAccessFileTracingWrapper(std::unique_ptr<FSRandomAccessFile>&& t,
+                                   std::shared_ptr<IOTracer> io_tracer,
+                                   const std::string& file_name)
+      : FSRandomAccessFileOwnerWrapper(std::move(t)),
+        io_tracer_(io_tracer),
+        clock_(SystemClock::Default().get()),
+        file_name_(file_name) {}
+
+  ~FSRandomAccessFileTracingWrapper() override {}
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+                     std::function<void(const FSReadRequest&, void*)> cb,
+                     void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+                     IODebugContext* dbg) override;
+
+  void ReadAsyncCallback(const FSReadRequest& req, void* cb_arg);
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  SystemClock* clock_;
+  // Stores file name instead of full path.
+  std::string file_name_;
+
+  struct ReadAsyncCallbackInfo {
+    uint64_t start_time_;
+    std::function<void(const FSReadRequest&, void*)> cb_;
+    void* cb_arg_;
+    std::string file_op_;
+  };
+};
+
+// The FSRandomAccessFilePtr is a wrapper class that takes pointer to storage
+// systems (such as posix filesystems). It overloads operator -> and returns a
+// pointer of either FSRandomAccessFile or FSRandomAccessFileTracingWrapper
+// based on whether tracing is enabled or not. It is added to bypass
+// FSRandomAccessFileTracingWrapper when tracing is disabled.
+class FSRandomAccessFilePtr {
+ public:
+  FSRandomAccessFilePtr(std::unique_ptr<FSRandomAccessFile>&& fs,
+                        const std::shared_ptr<IOTracer>& io_tracer,
+                        const std::string& file_name)
+      : io_tracer_(io_tracer),
+        fs_tracer_(std::move(fs), io_tracer_,
+                   file_name.substr(file_name.find_last_of("/\\") +
+                                    1) /* pass file name */) {}
+
+  FSRandomAccessFile* operator->() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return const_cast<FSRandomAccessFileTracingWrapper*>(&fs_tracer_);
+    } else {
+      return fs_tracer_.target();
+    }
+  }
+
+  FSRandomAccessFile* get() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return const_cast<FSRandomAccessFileTracingWrapper*>(&fs_tracer_);
+    } else {
+      return fs_tracer_.target();
+    }
+  }
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  FSRandomAccessFileTracingWrapper fs_tracer_;
+};
+
+// FSWritableFileTracingWrapper is a wrapper class above FSWritableFile that
+// forwards the call to the underlying storage system. It then invokes IOTracer
+// to record file operations and other contextual information in a binary
+// format for tracing. It overrides methods we are interested in tracing and
+// extends FSWritableFileWrapper, which forwards all methods that are not
+// explicitly overridden.
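+// Append() records only the data length (the offset field is logged as 0),
+// PositionedAppend() records both length and offset, and GetFileSize() is
+// logged with a literal "OK" status because it does not return an IOStatus.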
+class FSWritableFileTracingWrapper : public FSWritableFileOwnerWrapper {
+ public:
+  FSWritableFileTracingWrapper(std::unique_ptr<FSWritableFile>&& t,
+                               std::shared_ptr<IOTracer> io_tracer,
+                               const std::string& file_name)
+      : FSWritableFileOwnerWrapper(std::move(t)),
+        io_tracer_(io_tracer),
+        clock_(SystemClock::Default().get()),
+        file_name_(file_name) {}
+
+  ~FSWritableFileTracingWrapper() override {}
+
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override;
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  const DataVerificationInfo& /*verification_info*/,
+                  IODebugContext* dbg) override {
+    return Append(data, options, dbg);
+  }
+
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override;
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            const DataVerificationInfo& /*verification_info*/,
+                            IODebugContext* dbg) override {
+    return PositionedAppend(data, offset, options, dbg);
+  }
+
+  IOStatus Truncate(uint64_t size, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+
+  uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  SystemClock* clock_;
+  // Stores file name instead of full path.
+  std::string file_name_;
+};
+
+// The FSWritableFilePtr is a wrapper class that takes pointer to storage
+// systems (such as posix filesystems). It overloads operator -> and returns a
+// pointer of either FSWritableFile or FSWritableFileTracingWrapper based on
+// whether tracing is enabled or not. It is added to bypass
+// FSWritableFileTracingWrapper when tracing is disabled.
+class FSWritableFilePtr {
+ public:
+  FSWritableFilePtr(std::unique_ptr<FSWritableFile>&& fs,
+                    const std::shared_ptr<IOTracer>& io_tracer,
+                    const std::string& file_name)
+      : io_tracer_(io_tracer) {
+    fs_tracer_.reset(new FSWritableFileTracingWrapper(
+        std::move(fs), io_tracer_,
+        file_name.substr(file_name.find_last_of("/\\") +
+                         1) /* pass file name */));
+  }
+
+  FSWritableFile* operator->() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return fs_tracer_.get();
+    } else {
+      return fs_tracer_->target();
+    }
+  }
+
+  FSWritableFile* get() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return fs_tracer_.get();
+    } else if (fs_tracer_) {
+      return fs_tracer_->target();
+    } else {
+      return nullptr;
+    }
+  }
+
+  void reset() {
+    fs_tracer_.reset();
+    io_tracer_ = nullptr;
+  }
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  std::unique_ptr<FSWritableFileTracingWrapper> fs_tracer_;
+};
+
+// FSRandomRWFileTracingWrapper is a wrapper class above FSRandomRWFile that
+// forwards the call to the underlying storage system. It then invokes IOTracer
+// to record file operations and other contextual information in a binary
+// format for tracing. It overrides methods we are interested in tracing and
+// extends FSRandomRWFileWrapper, which forwards all methods that are not
+// explicitly overridden.
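+// Flush(), Close(), Sync() and Fsync() carry no length/offset payload, so
+// they are recorded with io_op_data == 0 and only the elapsed time and
+// status are meaningful in those records.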
+class FSRandomRWFileTracingWrapper : public FSRandomRWFileOwnerWrapper {
+ public:
+  FSRandomRWFileTracingWrapper(std::unique_ptr<FSRandomRWFile>&& t,
+                               std::shared_ptr<IOTracer> io_tracer,
+                               const std::string& file_name)
+      : FSRandomRWFileOwnerWrapper(std::move(t)),
+        io_tracer_(io_tracer),
+        clock_(SystemClock::Default().get()),
+        file_name_(file_name) {}
+
+  ~FSRandomRWFileTracingWrapper() override {}
+
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+                 IODebugContext* dbg) override;
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  SystemClock* clock_;
+  // Stores file name instead of full path.
+  std::string file_name_;
+};
+
+// The FSRandomRWFilePtr is a wrapper class that takes pointer to storage
+// systems (such as posix filesystems). It overloads operator -> and returns a
+// pointer of either FSRandomRWFile or FSRandomRWFileTracingWrapper based on
+// whether tracing is enabled or not. It is added to bypass
+// FSRandomRWFileTracingWrapper when tracing is disabled.
+class FSRandomRWFilePtr {
+ public:
+  FSRandomRWFilePtr(std::unique_ptr<FSRandomRWFile>&& fs,
+                    std::shared_ptr<IOTracer> io_tracer,
+                    const std::string& file_name)
+      : io_tracer_(io_tracer),
+        fs_tracer_(std::move(fs), io_tracer_,
+                   file_name.substr(file_name.find_last_of("/\\") +
+                                    1) /* pass file name */) {}
+
+  FSRandomRWFile* operator->() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return const_cast<FSRandomRWFileTracingWrapper*>(&fs_tracer_);
+    } else {
+      return fs_tracer_.target();
+    }
+  }
+
+  FSRandomRWFile* get() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return const_cast<FSRandomRWFileTracingWrapper*>(&fs_tracer_);
+    } else {
+      return fs_tracer_.target();
+    }
+  }
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  FSRandomRWFileTracingWrapper fs_tracer_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_posix.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_posix.cc
new file mode 100644
index 0000000000..c2e8cfaf04
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_posix.cc
@@ -0,0 +1,1284 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if !defined(OS_WIN)
+
+#include <dirent.h>
+#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
+#include <dlfcn.h>
+#endif
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID)
+#include <sys/statfs.h>
+#include <sys/sysmacros.h>
+#endif
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <algorithm>
+// Get nano time includes
+#if defined(OS_LINUX) || defined(OS_FREEBSD)
+#elif defined(__MACH__)
+#include <Availability.h>
+#include <mach/clock.h>
+#include <mach/mach.h>
+#else
+#include <chrono>
+#endif
+#include <deque>
+#include <set>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "env/io_posix.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "port/lang.h"
+#include "port/port.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/compression_context_cache.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "util/thread_local.h"
+#include "util/threadpool_imp.h"
+
+#if !defined(TMPFS_MAGIC)
+#define TMPFS_MAGIC 0x01021994
+#endif
+#if !defined(XFS_SUPER_MAGIC)
+#define XFS_SUPER_MAGIC 0x58465342
+#endif
+#if !defined(EXT4_SUPER_MAGIC)
+#define EXT4_SUPER_MAGIC 0xEF53
+#endif
+
+extern "C" bool RocksDbIOUringEnable() __attribute__((__weak__));
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+inline mode_t GetDBFileMode(bool allow_non_owner_access) {
+  return allow_non_owner_access ? 0644 : 0600;
+}
+
+// list of pathnames that are locked
+// Only used for error message.
+struct LockHoldingInfo {
+  int64_t acquire_time;
+  uint64_t acquiring_thread;
+};
+static std::map<std::string, LockHoldingInfo> locked_files;
+static port::Mutex mutex_locked_files;
+
+static int LockOrUnlock(int fd, bool lock) {
+  errno = 0;
+  struct flock f;
+  memset(&f, 0, sizeof(f));
+  f.l_type = (lock ? F_WRLCK : F_UNLCK);
+  f.l_whence = SEEK_SET;
+  f.l_start = 0;
+  f.l_len = 0;  // Lock/unlock entire file
+  int value = fcntl(fd, F_SETLK, &f);
+
+  return value;
+}
+
+class PosixFileLock : public FileLock {
+ public:
+  int fd_ = /*invalid*/ -1;
+  std::string filename;
+
+  void Clear() {
+    fd_ = -1;
+    filename.clear();
+  }
+
+  virtual ~PosixFileLock() override {
+    // Check for destruction without UnlockFile
+    assert(fd_ == -1);
+  }
+};
+
+int cloexec_flags(int flags, const EnvOptions* options) {
+  // If the system supports opening the file with cloexec enabled,
+  // do so, as this avoids a race condition if a db is opened around
+  // the same time that a child process is forked
+#ifdef O_CLOEXEC
+  if (options == nullptr || options->set_fd_cloexec) {
+    flags |= O_CLOEXEC;
+  }
+#else
+  (void)options;
+#endif
+  return flags;
+}
+
+class PosixFileSystem : public FileSystem {
+ public:
+  PosixFileSystem();
+
+  static const char* kClassName() { return "PosixFileSystem"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kDefaultName(); }
+
+  ~PosixFileSystem() override {}
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == "posix") {
+      return true;
+    } else {
+      return FileSystem::IsInstanceOf(name);
+    }
+  }
+
+  void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
+    if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
+      fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
+    }
+  }
+
+  IOStatus NewSequentialFile(const std::string& fname,
+                             const FileOptions& options,
+                             std::unique_ptr<FSSequentialFile>* result,
+                             IODebugContext* /*dbg*/) override {
+    result->reset();
+    int fd = -1;
+    int flags = cloexec_flags(O_RDONLY, &options);
+    FILE* file = nullptr;
+
+    if (options.use_direct_reads && !options.use_mmap_reads) {
+#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
+      flags |= O_DIRECT;
+      TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags);
+#endif
+    }
+
+    do {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
+    } while (fd < 0 && errno == EINTR);
+    if (fd < 0) {
+      return IOError("While opening a file for sequentially reading", fname,
+                     errno);
+    }
+
+    SetFD_CLOEXEC(fd, &options);
+
+    if (options.use_direct_reads && !options.use_mmap_reads) {
+#ifdef OS_MACOSX
+      if (fcntl(fd, F_NOCACHE, 1) == -1) {
+        close(fd);
+        return IOError("While fcntl NoCache", fname, errno);
+      }
+#endif
+    } else {
+      do {
+        IOSTATS_TIMER_GUARD(open_nanos);
+        file = fdopen(fd, "r");
+      } while (file == nullptr && errno == EINTR);
+      if (file == nullptr) {
+        close(fd);
+        return IOError("While opening file for sequentially read", fname,
+                       errno);
+      }
+    }
+    result->reset(new PosixSequentialFile(
+        fname, file, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd),
+        options));
+    return IOStatus::OK();
+  }
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& options,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* /*dbg*/) override {
+    result->reset();
+    IOStatus s = IOStatus::OK();
+    int fd;
+    int flags = cloexec_flags(O_RDONLY, &options);
+
+    if (options.use_direct_reads && !options.use_mmap_reads) {
+#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
+      flags |= O_DIRECT;
+      TEST_SYNC_POINT_CALLBACK("NewRandomAccessFile:O_DIRECT", &flags);
+#endif
+    }
+
+    do {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
+    } while (fd < 0 && errno == EINTR);
+    if (fd < 0) {
+      s = IOError("While open a file for random read", fname, errno);
+      return s;
+    }
+    SetFD_CLOEXEC(fd, &options);
+
+    if (options.use_mmap_reads) {
+      // Use of mmap for random reads has been removed because it
+      // kills performance when storage is fast.
+      // Use mmap when virtual address-space is plentiful.
+      uint64_t size;
+      IOOptions opts;
+      s = GetFileSize(fname, opts, &size, nullptr);
+      if (s.ok()) {
+        void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
+        if (base != MAP_FAILED) {
+          result->reset(
+              new PosixMmapReadableFile(fd, fname, base, size, options));
+        } else {
+          s = IOError("while mmap file for read", fname, errno);
+          close(fd);
+        }
+      } else {
+        close(fd);
+      }
+    } else {
+      if (options.use_direct_reads && !options.use_mmap_reads) {
+#ifdef OS_MACOSX
+        if (fcntl(fd, F_NOCACHE, 1) == -1) {
+          close(fd);
+          return IOError("while fcntl NoCache", fname, errno);
+        }
+#endif
+      }
+      result->reset(new PosixRandomAccessFile(
+          fname, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd),
+          options
+#if defined(ROCKSDB_IOURING_PRESENT)
+          ,
+          !IsIOUringEnabled() ? nullptr : thread_local_io_urings_.get()
+#endif
+          ));
+    }
+    return s;
+  }
+
+  virtual IOStatus OpenWritableFile(const std::string& fname,
+                                    const FileOptions& options, bool reopen,
+                                    std::unique_ptr<FSWritableFile>* result,
+                                    IODebugContext* /*dbg*/) {
+    result->reset();
+    IOStatus s;
+    int fd = -1;
+    int flags = (reopen) ? (O_CREAT | O_APPEND) : (O_CREAT | O_TRUNC);
+    // Direct IO mode with O_DIRECT flag or F_NOCACHE (MAC OSX)
+    if (options.use_direct_writes && !options.use_mmap_writes) {
+      // Note: we should avoid O_APPEND here due to the following bug:
+      // POSIX requires that opening a file with the O_APPEND flag should
+      // have no effect on the location at which pwrite() writes data.
+      // However, on Linux, if a file is opened with O_APPEND, pwrite()
+      // appends data to the end of the file, regardless of the value of
+      // offset.
+      // More info here: https://linux.die.net/man/2/pwrite
+      flags |= O_WRONLY;
+#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
+      flags |= O_DIRECT;
+#endif
+      TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
+    } else if (options.use_mmap_writes) {
+      // non-direct I/O
+      flags |= O_RDWR;
+    } else {
+      flags |= O_WRONLY;
+    }
+
+    flags = cloexec_flags(flags, &options);
+
+    do {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
+    } while (fd < 0 && errno == EINTR);
+
+    if (fd < 0) {
+      s = IOError("While open a file for appending", fname, errno);
+      return s;
+    }
+    SetFD_CLOEXEC(fd, &options);
+
+    if (options.use_mmap_writes) {
+      MaybeForceDisableMmap(fd);
+    }
+    if (options.use_mmap_writes && !forceMmapOff_) {
+      result->reset(new PosixMmapFile(fname, fd, page_size_, options));
+    } else if (options.use_direct_writes && !options.use_mmap_writes) {
+#ifdef OS_MACOSX
+      if (fcntl(fd, F_NOCACHE, 1) == -1) {
+        close(fd);
+        s = IOError("While fcntl NoCache an opened file for appending", fname,
+                    errno);
+        return s;
+      }
+#elif defined(OS_SOLARIS)
+      if (directio(fd, DIRECTIO_ON) == -1) {
+        if (errno != ENOTTY) {  // ZFS filesystems don't support DIRECTIO_ON
+          close(fd);
+          s = IOError("While calling directio()", fname, errno);
+          return s;
+        }
+      }
+#endif
+      result->reset(new PosixWritableFile(
+          fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
+          options));
+    } else {
+      // disable mmap writes
+      EnvOptions no_mmap_writes_options = options;
+      no_mmap_writes_options.use_mmap_writes = false;
+      result->reset(
+          new PosixWritableFile(fname, fd,
+                                GetLogicalBlockSizeForWriteIfNeeded(
+                                    no_mmap_writes_options, fname, fd),
+                                no_mmap_writes_options));
+    }
+    return s;
+  }
+
+  IOStatus NewWritableFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override {
+    return OpenWritableFile(fname, options, false, result, dbg);
+  }
+
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& options,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override {
+    return OpenWritableFile(fname, options, true, result, dbg);
+  }
+
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& options,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* /*dbg*/) override {
+    result->reset();
+    IOStatus s;
+    int fd = -1;
+
+    int flags = 0;
+    // Direct IO mode with O_DIRECT flag or F_NOCACHE (MAC OSX)
+    if (options.use_direct_writes && !options.use_mmap_writes) {
+      flags |= O_WRONLY;
+#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
+      flags |= O_DIRECT;
+#endif
+      TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
+    } else if (options.use_mmap_writes) {
+      // mmap needs O_RDWR mode
+      flags |= O_RDWR;
+    } else {
+      flags |= O_WRONLY;
+    }
+
+    flags = cloexec_flags(flags, &options);
+
+    do {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(old_fname.c_str(), flags,
+                GetDBFileMode(allow_non_owner_access_));
+    } while (fd < 0 && errno == EINTR);
+    if (fd < 0) {
+      s = IOError("while reopen file for write", fname, errno);
+      return s;
+    }
+
+    SetFD_CLOEXEC(fd, &options);
+    // rename into place
+    if (rename(old_fname.c_str(), fname.c_str()) != 0) {
+      s = IOError("while rename file to " + fname, old_fname, errno);
+      close(fd);
+      return s;
+    }
+
+    if (options.use_mmap_writes) {
+      MaybeForceDisableMmap(fd);
+    }
+    if (options.use_mmap_writes && !forceMmapOff_) {
+      result->reset(new PosixMmapFile(fname, fd, page_size_, options));
+    } else if (options.use_direct_writes && !options.use_mmap_writes) {
+#ifdef OS_MACOSX
+      if (fcntl(fd, F_NOCACHE, 1) == -1) {
+        close(fd);
+        s = IOError("while fcntl NoCache for reopened file for append", fname,
+                    errno);
+        return s;
+      }
+#elif defined(OS_SOLARIS)
+      if (directio(fd, DIRECTIO_ON) == -1) {
+        if (errno != ENOTTY) {  // ZFS filesystems don't support DIRECTIO_ON
+          close(fd);
+          s = IOError("while calling directio()", fname, errno);
+          return s;
+        }
+      }
+#endif
+      result->reset(new PosixWritableFile(
+          fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
+          options));
+    } else {
+      // disable mmap writes
+      FileOptions no_mmap_writes_options = options;
+      no_mmap_writes_options.use_mmap_writes = false;
+      result->reset(
+          new PosixWritableFile(fname, fd,
+                                GetLogicalBlockSizeForWriteIfNeeded(
+                                    no_mmap_writes_options, fname, fd),
+                                no_mmap_writes_options));
+    }
+    return s;
+  }
+
+  IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* /*dbg*/) override {
+    int fd = -1;
+    int flags = cloexec_flags(O_RDWR, &options);
+
+    while (fd < 0) {
+      IOSTATS_TIMER_GUARD(open_nanos);
+
+      fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
+      if (fd < 0) {
+        // Error while opening the file
+        if (errno == EINTR) {
+          continue;
+        }
+        return IOError("While open file for random read/write", fname, errno);
+      }
+    }
+
+    SetFD_CLOEXEC(fd, &options);
+    result->reset(new PosixRandomRWFile(fname, fd, options));
+    return IOStatus::OK();
+  }
+
+  IOStatus NewMemoryMappedFileBuffer(
+      const std::string& fname,
+      std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+    int fd = -1;
+    IOStatus status;
+    int flags = cloexec_flags(O_RDWR, nullptr);
+
+    while (fd < 0) {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(fname.c_str(), flags, 0644);
+      if (fd < 0) {
+        // Error while opening the file
+        if (errno == EINTR) {
+          continue;
+        }
+        status =
+            IOError("While open file for raw mmap buffer access", fname, errno);
+        break;
+      }
+    }
+    uint64_t size;
+    if (status.ok()) {
+      IOOptions opts;
+      status = GetFileSize(fname, opts, &size, nullptr);
+    }
+    void* base = nullptr;
+    if (status.ok()) {
+      base = mmap(nullptr, static_cast<size_t>(size), PROT_READ | PROT_WRITE,
+                  MAP_SHARED, fd, 0);
+      if (base == MAP_FAILED) {
+        status = IOError("while mmap file for read", fname, errno);
+      }
+    }
+    if (status.ok()) {
+      result->reset(
+          new PosixMemoryMappedFileBuffer(base, static_cast<size_t>(size)));
+    }
+    if (fd >= 0) {
+      // don't need to keep it open after mmap has been called
+      close(fd);
+    }
+    return status;
+  }
+
+  IOStatus NewDirectory(const std::string& name, const IOOptions& /*opts*/,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* /*dbg*/) override {
+    result->reset();
+    int fd;
+    int flags = cloexec_flags(0, nullptr);
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(name.c_str(), flags);
+    }
+    if (fd < 0) {
+      return IOError("While open directory", name, errno);
+    } else {
+      result->reset(new PosixDirectory(fd, name));
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus FileExists(const std::string& fname, const IOOptions& /*opts*/,
+                      IODebugContext* /*dbg*/) override {
+    int result = access(fname.c_str(), F_OK);
+
+    if (result == 0) {
+      return IOStatus::OK();
+    }
+
+    int err = errno;
+    switch (err) {
+      case EACCES:
+      case ELOOP:
+      case ENAMETOOLONG:
+      case ENOENT:
+      case ENOTDIR:
+        return IOStatus::NotFound();
+      default:
+        assert(err == EIO || err == ENOMEM);
+        return IOStatus::IOError("Unexpected error(" + std::to_string(err) +
+                                 ") accessing file `" + fname + "' ");
+    }
+  }
+
+  IOStatus GetChildren(const std::string& dir, const IOOptions& opts,
+                       std::vector<std::string>* result,
+                       IODebugContext* /*dbg*/) override {
+    result->clear();
+
+    DIR* d = opendir(dir.c_str());
+    if (d == nullptr) {
+      switch (errno) {
+        case EACCES:
+        case ENOENT:
+        case ENOTDIR:
+          return IOStatus::NotFound();
+        default:
+          return IOError("While opendir", dir, errno);
+      }
+    }
+
+    // reset errno before calling readdir()
+    errno = 0;
+    struct dirent* entry;
+
+    while ((entry = readdir(d)) != nullptr) {
+      // filter out '.' and '..' directory entries
+      // which appear only on some platforms
+      const bool ignore =
+          entry->d_type == DT_DIR &&
+          (strcmp(entry->d_name, ".") == 0 ||
+           strcmp(entry->d_name, "..") == 0
+#ifndef ASSERT_STATUS_CHECKED
+           // In case of ASSERT_STATUS_CHECKED, GetChildren support older
+           // version of API for debugging purpose.
+           || opts.do_not_recurse
+#endif
+          );
+      if (!ignore) {
+        result->push_back(entry->d_name);
+      }
+      errno = 0;  // reset errno if readdir() success
+    }
+
+    // always attempt to close the dir
+    const auto pre_close_errno = errno;  // errno may be modified by closedir
+    const int close_result = closedir(d);
+
+    if (pre_close_errno != 0) {
+      // error occurred during readdir
+      return IOError("While readdir", dir, pre_close_errno);
+    }
+
+    if (close_result != 0) {
+      // error occurred during closedir
+      return IOError("While closedir", dir, errno);
+    }
+
+    return IOStatus::OK();
+  }
+
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& /*opts*/,
+                      IODebugContext* /*dbg*/) override {
+    IOStatus result;
+    if (unlink(fname.c_str()) != 0) {
+      result = IOError("while unlink() file", fname, errno);
+    }
+    return result;
+  }
+
+  IOStatus CreateDir(const std::string& name, const IOOptions& /*opts*/,
+                     IODebugContext* /*dbg*/) override {
+    if (mkdir(name.c_str(), 0755) != 0) {
+      return IOError("While mkdir", name, errno);
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus CreateDirIfMissing(const std::string& name,
+                              const IOOptions& /*opts*/,
+                              IODebugContext* /*dbg*/) override {
+    if (mkdir(name.c_str(), 0755) != 0) {
+      if (errno != EEXIST) {
+        return IOError("While mkdir if missing", name, errno);
+      } else if (!DirExists(name)) {  // Check that name is actually a
+                                      // directory.
+        // Message is taken from mkdir
+        return IOStatus::IOError("`" + name +
+                                 "' exists but is not a directory");
+      }
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus DeleteDir(const std::string& name, const IOOptions& /*opts*/,
+                     IODebugContext* /*dbg*/) override {
+    if (rmdir(name.c_str()) != 0) {
+      return IOError("file rmdir", name, errno);
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/,
+                       uint64_t* size, IODebugContext* /*dbg*/) override {
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {
+      *size = 0;
+      return IOError("while stat a file for size", fname, errno);
+    } else {
+      *size = sbuf.st_size;
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus GetFileModificationTime(const std::string& fname,
+                                   const IOOptions& /*opts*/,
+                                   uint64_t* file_mtime,
+                                   IODebugContext* /*dbg*/) override {
+    struct stat s;
+    if (stat(fname.c_str(), &s) != 0) {
+      return IOError("while stat a file for modification time", fname, errno);
+    }
+    *file_mtime = static_cast<uint64_t>(s.st_mtime);
+    return IOStatus::OK();
+  }
+
+  IOStatus RenameFile(const std::string& src, const std::string& target,
+                      const IOOptions& /*opts*/,
+                      IODebugContext* /*dbg*/) override {
+    if (rename(src.c_str(), target.c_str()) != 0) {
+      return IOError("While renaming a file to " + target, src, errno);
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus LinkFile(const std::string& src, const std::string& target,
+                    const IOOptions& /*opts*/,
+                    IODebugContext* /*dbg*/) override {
+    if (link(src.c_str(), target.c_str()) != 0) {
+      if (errno == EXDEV || errno == ENOTSUP) {
+        return IOStatus::NotSupported(errno == EXDEV
+                                          ? "No cross FS links allowed"
+                                          : "Links not supported by FS");
+      }
+      return IOError("while link file to " + target, src, errno);
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*opts*/,
+                        uint64_t* count, IODebugContext* /*dbg*/) override {
+    struct stat s;
+    if (stat(fname.c_str(), &s) != 0) {
+      return IOError("while stat a file for num file links", fname, errno);
+    }
+    *count = static_cast<uint64_t>(s.st_nlink);
+    return IOStatus::OK();
+  }
+
+  IOStatus AreFilesSame(const std::string& first, const std::string& second,
+                        const IOOptions& /*opts*/, bool* res,
+                        IODebugContext* /*dbg*/) override {
+    struct stat statbuf[2];
+    if (stat(first.c_str(), &statbuf[0]) != 0) {
+      return IOError("stat file", first, errno);
+    }
+    if (stat(second.c_str(), &statbuf[1]) != 0) {
+      return IOError("stat file", second, errno);
+    }
+
+    if (major(statbuf[0].st_dev) != major(statbuf[1].st_dev) ||
+        minor(statbuf[0].st_dev) != minor(statbuf[1].st_dev) ||
+        statbuf[0].st_ino != statbuf[1].st_ino) {
+      *res = false;
+    } else {
+      *res = true;
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus LockFile(const std::string& fname, const IOOptions& /*opts*/,
+                    FileLock** lock, IODebugContext* /*dbg*/) override {
+    *lock = nullptr;
+
+    LockHoldingInfo lhi;
+    int64_t current_time = 0;
+    // Ignore status code as the time is only used for error message.
+    SystemClock::Default()
+        ->GetCurrentTime(&current_time)
+        .PermitUncheckedError();
+    lhi.acquire_time = current_time;
+    lhi.acquiring_thread = Env::Default()->GetThreadID();
+
+    mutex_locked_files.Lock();
+    // If it already exists in the locked_files set, then it is already locked,
+    // and fail this lock attempt. Otherwise, insert it into locked_files.
+    // This check is needed because fcntl() does not detect lock conflict
+    // if the fcntl is issued by the same thread that earlier acquired
+    // this lock.
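+    // (For example, two LockFile() calls on the same path from one process
+    // would both succeed at the fcntl() level; only the locked_files
+    // bookkeeping rejects the second one.)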
+    // We must do this check *before* opening the file:
+    // Otherwise, we will open a new file descriptor. Locks are associated with
+    // a process, not a file descriptor and when *any* file descriptor is
+    // closed, all locks the process holds for that *file* are released
+    const auto it_success = locked_files.insert({fname, lhi});
+    if (it_success.second == false) {
+      LockHoldingInfo prev_info = it_success.first->second;
+      mutex_locked_files.Unlock();
+      errno = ENOLCK;
+      // Note that the thread ID printed is the same one as the one in
+      // posix logger, but posix logger prints it in hex format.
+      return IOError("lock held by current process, acquire time " +
+                         std::to_string(prev_info.acquire_time) +
+                         " acquiring thread " +
+                         std::to_string(prev_info.acquiring_thread),
+                     fname, errno);
+    }
+
+    IOStatus result = IOStatus::OK();
+    int fd;
+    int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr);
+
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(fname.c_str(), flags, 0644);
+    }
+    if (fd < 0) {
+      result = IOError("while open a file for lock", fname, errno);
+    } else if (LockOrUnlock(fd, true) == -1) {
+      result = IOError("While lock file", fname, errno);
+      close(fd);
+    } else {
+      SetFD_CLOEXEC(fd, nullptr);
+      PosixFileLock* my_lock = new PosixFileLock;
+      my_lock->fd_ = fd;
+      my_lock->filename = fname;
+      *lock = my_lock;
+    }
+    if (!result.ok()) {
+      // If there is an error in locking, then remove the pathname from
+      // locked_files. (If we got this far, it did not exist in locked_files
+      // before this call.)
+      locked_files.erase(fname);
+    }
+
+    mutex_locked_files.Unlock();
+    return result;
+  }
+
+  IOStatus UnlockFile(FileLock* lock, const IOOptions& /*opts*/,
+                      IODebugContext* /*dbg*/) override {
+    PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
+    IOStatus result;
+    mutex_locked_files.Lock();
+    // If we are unlocking, then verify that we had locked it earlier,
+    // it should already exist in locked_files. Remove it from locked_files.
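+    // An erase() count of 0 means this process never recorded a lock on the
+    // file, so the unlock attempt is rejected before touching fcntl().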
+ if (locked_files.erase(my_lock->filename) != 1) { + errno = ENOLCK; + result = IOError("unlock", my_lock->filename, errno); + } else if (LockOrUnlock(my_lock->fd_, false) == -1) { + result = IOError("unlock", my_lock->filename, errno); + } + close(my_lock->fd_); + my_lock->Clear(); + delete my_lock; + mutex_locked_files.Unlock(); + return result; + } + + IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& /*opts*/, std::string* output_path, + IODebugContext* /*dbg*/) override { + if (!db_path.empty() && db_path[0] == '/') { + *output_path = db_path; + return IOStatus::OK(); + } + + char the_path[4096]; + char* ret = getcwd(the_path, 4096); + if (ret == nullptr) { + return IOStatus::IOError(errnoStr(errno).c_str()); + } + + *output_path = ret; + return IOStatus::OK(); + } + + IOStatus GetTestDirectory(const IOOptions& /*opts*/, std::string* result, + IODebugContext* /*dbg*/) override { + const char* env = getenv("TEST_TMPDIR"); + if (env && env[0] != '\0') { + *result = env; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid())); + *result = buf; + } + // Directory may already exist + { + IOOptions opts; + return CreateDirIfMissing(*result, opts, nullptr); + } + return IOStatus::OK(); + } + + IOStatus GetFreeSpace(const std::string& fname, const IOOptions& /*opts*/, + uint64_t* free_space, + IODebugContext* /*dbg*/) override { + struct statvfs sbuf; + + if (statvfs(fname.c_str(), &sbuf) < 0) { + return IOError("While doing statvfs", fname, errno); + } + + // sbuf.bfree is total free space available to root + // sbuf.bavail is total free space available to unprivileged user + // sbuf.bavail <= sbuf.bfree ... pick correct based upon effective user id + if (geteuid()) { + // non-zero user is unprivileged, or -1 if error. take more conservative + // size + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bavail); + } else { + // root user can access all disk space + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + } + return IOStatus::OK(); + } + + IOStatus IsDirectory(const std::string& path, const IOOptions& /*opts*/, + bool* is_dir, IODebugContext* /*dbg*/) override { + // First open + int fd = -1; + int flags = cloexec_flags(O_RDONLY, nullptr); + { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(path.c_str(), flags); + } + if (fd < 0) { + return IOError("While open for IsDirectory()", path, errno); + } + IOStatus io_s; + struct stat sbuf; + if (fstat(fd, &sbuf) < 0) { + io_s = IOError("While doing stat for IsDirectory()", path, errno); + } + close(fd); + if (io_s.ok() && nullptr != is_dir) { + *is_dir = S_ISDIR(sbuf.st_mode); + } + return io_s; + } + + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override { + FileOptions optimized = file_options; + optimized.use_mmap_writes = false; + optimized.use_direct_writes = false; + optimized.bytes_per_sync = db_options.wal_bytes_per_sync; + // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it + // breaks TransactionLogIteratorStallAtLastRecord unit test. 
Fix the unit + // test and make this false + optimized.fallocate_with_keep_size = true; + optimized.writable_file_max_buffer_size = + db_options.writable_file_max_buffer_size; + return optimized; + } + + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override { + FileOptions optimized = file_options; + optimized.use_mmap_writes = false; + optimized.use_direct_writes = false; + optimized.fallocate_with_keep_size = true; + return optimized; + } +#ifdef OS_LINUX + Status RegisterDbPaths(const std::vector& paths) override { + return logical_block_size_cache_.RefAndCacheLogicalBlockSize(paths); + } + Status UnregisterDbPaths(const std::vector& paths) override { + logical_block_size_cache_.UnrefAndTryRemoveCachedLogicalBlockSize(paths); + return Status::OK(); + } +#endif + private: + bool forceMmapOff_ = false; // do we override Env options? + + // Returns true iff the named directory exists and is a directory. + virtual bool DirExists(const std::string& dname) { + struct stat statbuf; + if (stat(dname.c_str(), &statbuf) == 0) { + return S_ISDIR(statbuf.st_mode); + } + return false; // stat() failed return false + } + + bool SupportsFastAllocate(int fd) { +#ifdef ROCKSDB_FALLOCATE_PRESENT + struct statfs s; + if (fstatfs(fd, &s)) { + return false; + } + switch (s.f_type) { + case EXT4_SUPER_MAGIC: + return true; + case XFS_SUPER_MAGIC: + return true; + case TMPFS_MAGIC: + return true; + default: + return false; + } +#else + (void)fd; + return false; +#endif + } + + void MaybeForceDisableMmap(int fd) { + static std::once_flag s_check_disk_for_mmap_once; + assert(this == FileSystem::Default().get()); + std::call_once( + s_check_disk_for_mmap_once, + [this](int fdesc) { + // this will be executed once in the program's lifetime. + // do not use mmapWrite on non ext-3/xfs/tmpfs systems. + if (!SupportsFastAllocate(fdesc)) { + forceMmapOff_ = true; + } + }, + fd); + } + +#ifdef ROCKSDB_IOURING_PRESENT + bool IsIOUringEnabled() { + if (RocksDbIOUringEnable && RocksDbIOUringEnable()) { + return true; + } else { + return false; + } + } +#endif // ROCKSDB_IOURING_PRESENT + + // EXPERIMENTAL + // + // TODO akankshamahajan: + // 1. Update Poll API to take into account min_completions + // and returns if number of handles in io_handles (any order) completed is + // equal to atleast min_completions. + // 2. Currently in case of direct_io, Read API is called because of which call + // to Poll API fails as it expects IOHandle to be populated. + virtual IOStatus Poll(std::vector& io_handles, + size_t /*min_completions*/) override { +#if defined(ROCKSDB_IOURING_PRESENT) + // io_uring_queue_init. + struct io_uring* iu = nullptr; + if (thread_local_io_urings_) { + iu = static_cast(thread_local_io_urings_->Get()); + } + + // Init failed, platform doesn't support io_uring. + if (iu == nullptr) { + return IOStatus::NotSupported("Poll"); + } + + for (size_t i = 0; i < io_handles.size(); i++) { + // The request has been completed in earlier runs. + if ((static_cast(io_handles[i]))->is_finished) { + continue; + } + // Loop until IO for io_handles[i] is completed. + while (true) { + // io_uring_wait_cqe. + struct io_uring_cqe* cqe = nullptr; + ssize_t ret = io_uring_wait_cqe(iu, &cqe); + if (ret) { + // abort as it shouldn't be in indeterminate state and there is no + // good way currently to handle this error. + abort(); + } + + // Step 3: Populate the request. 
+        assert(cqe != nullptr);
+        Posix_IOHandle* posix_handle =
+            static_cast<Posix_IOHandle*>(io_uring_cqe_get_data(cqe));
+        assert(posix_handle->iu == iu);
+        if (posix_handle->iu != iu) {
+          return IOStatus::IOError("");
+        }
+        // Reset cqe data to catch any stray reuse of it
+        static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
+
+        FSReadRequest req;
+        req.scratch = posix_handle->scratch;
+        req.offset = posix_handle->offset;
+        req.len = posix_handle->len;
+
+        size_t finished_len = 0;
+        size_t bytes_read = 0;
+        bool read_again = false;
+        UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len,
+                     true /*async_read*/, posix_handle->use_direct_io,
+                     posix_handle->alignment, finished_len, &req, bytes_read,
+                     read_again);
+        posix_handle->is_finished = true;
+        io_uring_cqe_seen(iu, cqe);
+        posix_handle->cb(req, posix_handle->cb_arg);
+
+        (void)finished_len;
+        (void)bytes_read;
+        (void)read_again;
+
+        if (static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
+          break;
+        }
+      }
+    }
+    return IOStatus::OK();
+#else
+    (void)io_handles;
+    return IOStatus::NotSupported("Poll");
+#endif
+  }
+
+  virtual IOStatus AbortIO(std::vector<void*>& io_handles) override {
+#if defined(ROCKSDB_IOURING_PRESENT)
+    // io_uring_queue_init.
+    struct io_uring* iu = nullptr;
+    if (thread_local_io_urings_) {
+      iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+    }
+
+    // Init failed, platform doesn't support io_uring.
+    // If Poll is not supported then it didn't submit any request and it should
+    // return OK.
+    if (iu == nullptr) {
+      return IOStatus::OK();
+    }
+
+    for (size_t i = 0; i < io_handles.size(); i++) {
+      Posix_IOHandle* posix_handle =
+          static_cast<Posix_IOHandle*>(io_handles[i]);
+      if (posix_handle->is_finished == true) {
+        continue;
+      }
+      assert(posix_handle->iu == iu);
+      if (posix_handle->iu != iu) {
+        return IOStatus::IOError("");
+      }
+
+      // Prepare the cancel request.
+      struct io_uring_sqe* sqe;
+      sqe = io_uring_get_sqe(iu);
+
+      // In order to cancel the request, sqe->addr of cancel request should
+      // match with the read request submitted which is posix_handle->iov.
+      io_uring_prep_cancel(sqe, &posix_handle->iov, 0);
+      // Sets sqe->user_data to posix_handle.
+      io_uring_sqe_set_data(sqe, posix_handle);
+
+      // submit the request.
+      ssize_t ret = io_uring_submit(iu);
+      if (ret < 0) {
+        fprintf(stderr, "io_uring_submit error: %ld\n", long(ret));
+        return IOStatus::IOError("io_uring_submit() requested but returned " +
+                                 std::to_string(ret));
+      }
+    }
+
+    // After submitting the requests, wait for the requests.
+    for (size_t i = 0; i < io_handles.size(); i++) {
+      if ((static_cast<Posix_IOHandle*>(io_handles[i]))->is_finished) {
+        continue;
+      }
+
+      while (true) {
+        struct io_uring_cqe* cqe = nullptr;
+        ssize_t ret = io_uring_wait_cqe(iu, &cqe);
+        if (ret) {
+          // abort as it shouldn't be in indeterminate state and there is no
+          // good way currently to handle this error.
+          abort();
+        }
+        assert(cqe != nullptr);
+
+        // Returns cqe->user_data.
+        Posix_IOHandle* posix_handle =
+            static_cast<Posix_IOHandle*>(io_uring_cqe_get_data(cqe));
+        assert(posix_handle->iu == iu);
+        if (posix_handle->iu != iu) {
+          return IOStatus::IOError("");
+        }
+        posix_handle->req_count++;
+
+        // Reset cqe data to catch any stray reuse of it
+        static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
+        io_uring_cqe_seen(iu, cqe);
+
+        // - If the request is cancelled successfully, the original request is
+        //   completed with -ECANCELED and the cancel request is completed
+        //   with a result of 0.
+        // - If the request was already running, the original may or
+        //   may not complete in error.
The cancel request will complete with + // -EALREADY for that case. + // - And finally, if the request to cancel wasn't + // found, the cancel request is completed with -ENOENT. + // + // Every handle has to wait for 2 requests completion: original one and + // the cancel request which is tracked by PosixHandle::req_count. + if (posix_handle->req_count == 2 && + static_cast(io_handles[i]) == posix_handle) { + posix_handle->is_finished = true; + FSReadRequest req; + req.status = IOStatus::Aborted(); + posix_handle->cb(req, posix_handle->cb_arg); + + break; + } + } + } + return IOStatus::OK(); +#else + // If Poll is not supported then it didn't submit any request and it should + // return OK. + (void)io_handles; + return IOStatus::OK(); +#endif + } + + bool use_async_io() override { +#if defined(ROCKSDB_IOURING_PRESENT) + return IsIOUringEnabled(); +#else + return false; +#endif + } + +#if defined(ROCKSDB_IOURING_PRESENT) + // io_uring instance + std::unique_ptr thread_local_io_urings_; +#endif + + size_t page_size_; + + // If true, allow non owner read access for db files. Otherwise, non-owner + // has no access to db files. + bool allow_non_owner_access_; + +#ifdef OS_LINUX + static LogicalBlockSizeCache logical_block_size_cache_; +#endif + static size_t GetLogicalBlockSize(const std::string& fname, int fd); + // In non-direct IO mode, this directly returns kDefaultPageSize. + // Otherwise call GetLogicalBlockSize. + static size_t GetLogicalBlockSizeForReadIfNeeded(const EnvOptions& options, + const std::string& fname, + int fd); + static size_t GetLogicalBlockSizeForWriteIfNeeded(const EnvOptions& options, + const std::string& fname, + int fd); +}; + +#ifdef OS_LINUX +LogicalBlockSizeCache PosixFileSystem::logical_block_size_cache_; +#endif + +size_t PosixFileSystem::GetLogicalBlockSize(const std::string& fname, int fd) { +#ifdef OS_LINUX + return logical_block_size_cache_.GetLogicalBlockSize(fname, fd); +#else + (void)fname; + return PosixHelper::GetLogicalBlockSizeOfFd(fd); +#endif +} + +size_t PosixFileSystem::GetLogicalBlockSizeForReadIfNeeded( + const EnvOptions& options, const std::string& fname, int fd) { + return options.use_direct_reads + ? PosixFileSystem::GetLogicalBlockSize(fname, fd) + : kDefaultPageSize; +} + +size_t PosixFileSystem::GetLogicalBlockSizeForWriteIfNeeded( + const EnvOptions& options, const std::string& fname, int fd) { + return options.use_direct_writes + ? PosixFileSystem::GetLogicalBlockSize(fname, fd) + : kDefaultPageSize; +} + +PosixFileSystem::PosixFileSystem() + : forceMmapOff_(false), + page_size_(getpagesize()), + allow_non_owner_access_(true) { +#if defined(ROCKSDB_IOURING_PRESENT) + // Test whether IOUring is supported, and if it does, create a managing + // object for thread local point so that in the future thread-local + // io_uring can be created. 
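+  // The ring created below is only a capability probe: if CreateIOUring
+  // succeeds, a ThreadLocalPtr (with DeleteIOUring as its destructor) is
+  // installed so each thread can lazily create its own ring later, and the
+  // probe ring itself is deleted immediately.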
+ struct io_uring* new_io_uring = CreateIOUring(); + if (new_io_uring != nullptr) { + thread_local_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring)); + delete new_io_uring; + } +#endif +} + +} // namespace + +// +// Default Posix FileSystem +// +std::shared_ptr FileSystem::Default() { + STATIC_AVOID_DESTRUCTION(std::shared_ptr, instance) + (std::make_shared()); + return instance; +} + +static FactoryFunc posix_filesystem_reg = + ObjectLibrary::Default()->AddFactory( + ObjectLibrary::PatternEntry("posix").AddSeparator("://", false), + [](const std::string& /* uri */, std::unique_ptr* f, + std::string* /* errmsg */) { + f->reset(new PosixFileSystem()); + return f->get(); + }); + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_readonly.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_readonly.h new file mode 100644 index 0000000000..7a04aea080 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_readonly.h @@ -0,0 +1,105 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + + +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// A FileSystem wrapper that only allows read-only operation. +// +// This class has not been fully analyzed for providing strong security +// guarantees. +class ReadOnlyFileSystem : public FileSystemWrapper { + static inline IOStatus FailReadOnly() { + IOStatus s = IOStatus::IOError("Attempted write to ReadOnlyFileSystem"); + assert(s.GetRetryable() == false); + return s; + } + + public: + explicit ReadOnlyFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + + static const char* kClassName() { return "ReadOnlyFileSystem"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewWritableFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus ReuseWritableFile(const std::string& /*fname*/, + const std::string& /*old_fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewRandomRWFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewDirectory(const std::string& /*dir*/, + const IOOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus DeleteFile(const std::string& /*fname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus CreateDir(const std::string& /*dirname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override { + // Allow if dir already exists + bool is_dir = false; + IOStatus s = IsDirectory(dirname, options, &is_dir, dbg); + if (s.ok() && is_dir) { + return s; + } else { + return FailReadOnly(); + } + } + IOStatus DeleteDir(const std::string& /*dirname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); 
+ } + IOStatus RenameFile(const std::string& /*src*/, const std::string& /*dest*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus LinkFile(const std::string& /*src*/, const std::string& /*dest*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus LockFile(const std::string& /*fname*/, const IOOptions& /*options*/, + FileLock** /*lock*/, IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewLogger(const std::string& /*fname*/, const IOOptions& /*options*/, + std::shared_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_remap.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_remap.cc new file mode 100644 index 0000000000..b9832e6cba --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_remap.cc @@ -0,0 +1,341 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#include "env/fs_remap.h" + +namespace ROCKSDB_NAMESPACE { + +RemapFileSystem::RemapFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + +std::pair RemapFileSystem::EncodePathWithNewBasename( + const std::string& path) { + // No difference by default + return EncodePath(path); +} + +Status RemapFileSystem::RegisterDbPaths(const std::vector& paths) { + std::vector encoded_paths; + encoded_paths.reserve(paths.size()); + for (auto& path : paths) { + auto status_and_enc_path = EncodePathWithNewBasename(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + encoded_paths.emplace_back(status_and_enc_path.second); + } + return FileSystemWrapper::RegisterDbPaths(encoded_paths); +} + +Status RemapFileSystem::UnregisterDbPaths( + const std::vector& paths) { + std::vector encoded_paths; + encoded_paths.reserve(paths.size()); + for (auto& path : paths) { + auto status_and_enc_path = EncodePathWithNewBasename(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + encoded_paths.emplace_back(status_and_enc_path.second); + } + return FileSystemWrapper::UnregisterDbPaths(encoded_paths); +} + +IOStatus RemapFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewSequentialFile(status_and_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewRandomAccessFile(status_and_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + 
return status_and_enc_path.first; + } + return FileSystemWrapper::NewWritableFile(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + auto status_and_old_enc_path = EncodePath(old_fname); + if (!status_and_old_enc_path.first.ok()) { + return status_and_old_enc_path.first; + } + return FileSystemWrapper::ReuseWritableFile(status_and_old_enc_path.second, + status_and_old_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewRandomRWFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewRandomRWFile(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::NewDirectory(const std::string& dir, + const IOOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) { + // A hassle to remap DirFsyncOptions::renamed_new_name + class RemapFSDirectory : public FSDirectoryWrapper { + public: + RemapFSDirectory(RemapFileSystem* fs, std::unique_ptr&& t) + : FSDirectoryWrapper(std::move(t)), fs_(fs) {} + IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_fsync_options) override { + if (dir_fsync_options.renamed_new_name.empty()) { + return FSDirectoryWrapper::FsyncWithDirOptions(options, dbg, + dir_fsync_options); + } else { + auto status_and_enc_path = + fs_->EncodePath(dir_fsync_options.renamed_new_name); + if (status_and_enc_path.first.ok()) { + DirFsyncOptions mapped_options = dir_fsync_options; + mapped_options.renamed_new_name = status_and_enc_path.second; + return FSDirectoryWrapper::FsyncWithDirOptions(options, dbg, + mapped_options); + } else { + return status_and_enc_path.first; + } + } + } + + private: + RemapFileSystem* const fs_; + }; + + auto status_and_enc_path = EncodePathWithNewBasename(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + IOStatus ios = FileSystemWrapper::NewDirectory(status_and_enc_path.second, + options, result, dbg); + if (ios.ok()) { + *result = std::make_unique(this, std::move(*result)); + } + return ios; +} + +IOStatus RemapFileSystem::FileExists(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::FileExists(status_and_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::GetChildren(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetChildren(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::GetChildrenFileAttributes( + const std::string& dir, const IOOptions& options, + std::vector* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dir); + if (!status_and_enc_path.first.ok()) { + return 
status_and_enc_path.first; + } + return FileSystemWrapper::GetChildrenFileAttributes( + status_and_enc_path.second, options, result, dbg); +} + +IOStatus RemapFileSystem::DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::DeleteFile(status_and_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::CreateDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::CreateDir(status_and_enc_path.second, options, dbg); +} + +IOStatus RemapFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::CreateDirIfMissing(status_and_enc_path.second, + options, dbg); +} + +IOStatus RemapFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::DeleteDir(status_and_enc_path.second, options, dbg); +} + +IOStatus RemapFileSystem::GetFileSize(const std::string& fname, + const IOOptions& options, + uint64_t* file_size, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetFileSize(status_and_enc_path.second, options, + file_size, dbg); +} + +IOStatus RemapFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetFileModificationTime(status_and_enc_path.second, + options, file_mtime, dbg); +} + +IOStatus RemapFileSystem::IsDirectory(const std::string& path, + const IOOptions& options, bool* is_dir, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::IsDirectory(status_and_enc_path.second, options, + is_dir, dbg); +} + +IOStatus RemapFileSystem::RenameFile(const std::string& src, + const std::string& dest, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_src_enc_path = EncodePath(src); + if (!status_and_src_enc_path.first.ok()) { + if (status_and_src_enc_path.first.IsNotFound()) { + const IOStatus& s = status_and_src_enc_path.first; + status_and_src_enc_path.first = IOStatus::PathNotFound(s.ToString()); + } + return status_and_src_enc_path.first; + } + auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); + if (!status_and_dest_enc_path.first.ok()) { + return status_and_dest_enc_path.first; + } + return FileSystemWrapper::RenameFile(status_and_src_enc_path.second, + status_and_dest_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::LinkFile(const std::string& src, + const std::string& dest, + const IOOptions& options, + IODebugContext* dbg) { + auto 
status_and_src_enc_path = EncodePath(src); + if (!status_and_src_enc_path.first.ok()) { + return status_and_src_enc_path.first; + } + auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); + if (!status_and_dest_enc_path.first.ok()) { + return status_and_dest_enc_path.first; + } + return FileSystemWrapper::LinkFile(status_and_src_enc_path.second, + status_and_dest_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::LockFile(const std::string& fname, + const IOOptions& options, FileLock** lock, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + // FileLock subclasses may store path (e.g., PosixFileLock stores it). We + // can skip stripping the chroot directory from this path because callers + // shouldn't use it. + return FileSystemWrapper::LockFile(status_and_enc_path.second, options, lock, + dbg); +} + +IOStatus RemapFileSystem::NewLogger(const std::string& fname, + const IOOptions& options, + std::shared_ptr* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewLogger(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(db_path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetAbsolutePath(status_and_enc_path.second, options, + output_path, dbg); +} + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_remap.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_remap.h new file mode 100644 index 0000000000..a3c998262c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/fs_remap.h @@ -0,0 +1,137 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + + +#include + +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// An abstract FileSystem wrapper that creates a view of an existing +// FileSystem by remapping names in some way. +// +// This class has not been fully analyzed for providing strong security +// guarantees. +class RemapFileSystem : public FileSystemWrapper { + public: + explicit RemapFileSystem(const std::shared_ptr& base); + + protected: + // Returns status and mapped-to path in the wrapped filesystem. + // If it returns non-OK status, the returned path should not be used. + virtual std::pair EncodePath( + const std::string& path) = 0; + + // Similar to EncodePath() except used in cases in which it is OK for + // no file or directory on 'path' to already exist, such as if the + // operation would create one. However, the parent of 'path' is expected + // to exist for the operation to succeed. + // Default implementation: call EncodePath + virtual std::pair EncodePathWithNewBasename( + const std::string& path); + + public: + // Left abstract: + // const char* Name() const override { ... 
} + static const char* kClassName() { return "RemapFileSystem"; } + bool IsInstanceOf(const std::string& id) const override { + if (id == kClassName()) { + return true; + } else { + return FileSystemWrapper::IsInstanceOf(id); + } + } + + Status RegisterDbPaths(const std::vector& paths) override; + + Status UnregisterDbPaths(const std::vector& paths) override; + + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewDirectory(const std::string& dir, const IOOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus FileExists(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + + IOStatus IsDirectory(const std::string& path, const IOOptions& options, + bool* is_dir, IODebugContext* dbg) override; + + IOStatus RenameFile(const std::string& src, const std::string& dest, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LinkFile(const std::string& src, const std::string& dest, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + + IOStatus NewLogger(const std::string& fname, const IOOptions& options, + std::shared_ptr* result, + IODebugContext* dbg) override; + + IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override; +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/io_posix.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/io_posix.cc new file mode 100644 index 0000000000..0ec0e9c83b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/io_posix.cc @@ -0,0 +1,1733 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef ROCKSDB_LIB_IO_POSIX
+#include "env/io_posix.h"
+
+#include <errno.h>
+#include <fcntl.h>
+
+#include <algorithm>
+#if defined(OS_LINUX)
+#include <linux/fs.h>
+#ifndef FALLOC_FL_KEEP_SIZE
+#include <linux/falloc.h>
+#endif
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#ifdef OS_LINUX
+#include <sys/statfs.h>
+#include <sys/sysmacros.h>
+#endif
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/slice.h"
+#include "test_util/sync_point.h"
+#include "util/autovector.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
+#define F_LINUX_SPECIFIC_BASE 1024
+#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+std::string IOErrorMsg(const std::string& context,
+                       const std::string& file_name) {
+  if (file_name.empty()) {
+    return context;
+  }
+  return context + ": " + file_name;
+}
+
+// file_name can be left empty if it is not known.
+IOStatus IOError(const std::string& context, const std::string& file_name,
+                 int err_number) {
+  switch (err_number) {
+    case ENOSPC: {
+      IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name),
+                                     errnoStr(err_number).c_str());
+      s.SetRetryable(true);
+      return s;
+    }
+    case ESTALE:
+      return IOStatus::IOError(IOStatus::kStaleFile);
+    case ENOENT:
+      return IOStatus::PathNotFound(IOErrorMsg(context, file_name),
+                                    errnoStr(err_number).c_str());
+    default:
+      return IOStatus::IOError(IOErrorMsg(context, file_name),
+                               errnoStr(err_number).c_str());
+  }
+}
+
+// A wrapper for fadvise; if the platform doesn't support fadvise,
+// it simply returns 0.
+int Fadvise(int fd, off_t offset, size_t len, int advice) {
+#ifdef OS_LINUX
+  return posix_fadvise(fd, offset, len, advice);
+#else
+  (void)fd;
+  (void)offset;
+  (void)len;
+  (void)advice;
+  return 0;  // simply do nothing.
+#endif
+}
+
+// A wrapper for madvise; if the platform doesn't support madvise,
+// it simply returns 0.
+int Madvise(void* addr, size_t len, int advice) {
+#ifdef OS_LINUX
+  return posix_madvise(addr, len, advice);
+#else
+  (void)addr;
+  (void)len;
+  (void)advice;
+  return 0;  // simply do nothing.
+#endif
+}
+
+namespace {
+
+// On MacOS (and probably *BSD), the posix write and pwrite calls do not
+// support buffers larger than 2^31-1 bytes. These two wrappers fix this
+// issue by cutting the buffer into 1GB chunks. We use this chunk size to
+// be sure to keep the writes aligned.
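+// Both wrappers also restart short and EINTR-interrupted writes until the
+// whole buffer has been written, so callers may treat them as all-or-nothing.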
+ +bool PosixWrite(int fd, const char* buf, size_t nbyte) { + const size_t kLimit1Gb = 1UL << 30; + + const char* src = buf; + size_t left = nbyte; + + while (left != 0) { + size_t bytes_to_write = std::min(left, kLimit1Gb); + + ssize_t done = write(fd, src, bytes_to_write); + if (done < 0) { + if (errno == EINTR) { + continue; + } + return false; + } + left -= done; + src += done; + } + return true; +} + +bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) { + const size_t kLimit1Gb = 1UL << 30; + + const char* src = buf; + size_t left = nbyte; + + while (left != 0) { + size_t bytes_to_write = std::min(left, kLimit1Gb); + + ssize_t done = pwrite(fd, src, bytes_to_write, offset); + if (done < 0) { + if (errno == EINTR) { + continue; + } + return false; + } + left -= done; + offset += done; + src += done; + } + + return true; +} + +#ifdef ROCKSDB_RANGESYNC_PRESENT + +#if !defined(ZFS_SUPER_MAGIC) +// The magic number for ZFS was not exposed until recently. It should be fixed +// forever so we can just copy the magic number here. +#define ZFS_SUPER_MAGIC 0x2fc12fc1 +#endif + +bool IsSyncFileRangeSupported(int fd) { + // This function tracks and checks for cases where we know `sync_file_range` + // definitely will not work properly despite passing the compile-time check + // (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or if any of the checks + // fail in unexpected ways, we allow `sync_file_range` to be used. This way + // should minimize risk of impacting existing use cases. + struct statfs buf; + int ret = fstatfs(fd, &buf); + assert(ret == 0); + if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) { + // Testing on ZFS showed the writeback did not happen asynchronously when + // `sync_file_range` was called, even though it returned success. Avoid it + // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`, + // even though this'll incur extra I/O for metadata. + return false; + } + + ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */); + assert(!(ret == -1 && errno != ENOSYS)); + if (ret == -1 && errno == ENOSYS) { + // `sync_file_range` is not implemented on all platforms even if + // compile-time checks pass and a supported filesystem is in-use. For + // example, using ext4 on WSL (Windows Subsystem for Linux), + // `sync_file_range()` returns `ENOSYS` + // ("Function not implemented"). + return false; + } + // None of the known cases matched, so allow `sync_file_range` use. 
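+  // Reaching this point means fstatfs did not report a known-problematic
+  // filesystem and the trial sync_file_range call did not fail with ENOSYS.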
+ return true; +} + +#undef ZFS_SUPER_MAGIC + +#endif // ROCKSDB_RANGESYNC_PRESENT + +} // anonymous namespace + +/* + * PosixSequentialFile + */ +PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file, + int fd, size_t logical_block_size, + const EnvOptions& options) + : filename_(fname), + file_(file), + fd_(fd), + use_direct_io_(options.use_direct_reads), + logical_sector_size_(logical_block_size) { + assert(!options.use_direct_reads || !options.use_mmap_reads); +} + +PosixSequentialFile::~PosixSequentialFile() { + if (!use_direct_io()) { + assert(file_); + fclose(file_); + } else { + assert(fd_); + close(fd_); + } +} + +IOStatus PosixSequentialFile::Read(size_t n, const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { + assert(result != nullptr && !use_direct_io()); + IOStatus s; + size_t r = 0; + do { + clearerr(file_); + r = fread_unlocked(scratch, 1, n, file_); + } while (r == 0 && ferror(file_) && errno == EINTR); + *result = Slice(scratch, r); + if (r < n) { + if (feof(file_)) { + // We leave status as ok if we hit the end of the file + // We also clear the error so that the reads can continue + // if a new data is written to the file + clearerr(file_); + } else { + // A partial read with an error: return a non-ok status + s = IOError("While reading file sequentially", filename_, errno); + } + } + return s; +} + +IOStatus PosixSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { + assert(use_direct_io()); + assert(IsSectorAligned(offset, GetRequiredBufferAlignment())); + assert(IsSectorAligned(n, GetRequiredBufferAlignment())); + assert(IsSectorAligned(scratch, GetRequiredBufferAlignment())); + + IOStatus s; + ssize_t r = -1; + size_t left = n; + char* ptr = scratch; + while (left > 0) { + r = pread(fd_, ptr, left, static_cast(offset)); + if (r <= 0) { + if (r == -1 && errno == EINTR) { + continue; + } + break; + } + ptr += r; + offset += r; + left -= r; + if (!IsSectorAligned(r, GetRequiredBufferAlignment())) { + // Bytes reads don't fill sectors. Should only happen at the end + // of the file. + break; + } + } + if (r < 0) { + // An error: return a non-ok status + s = IOError("While pread " + std::to_string(n) + " bytes from offset " + + std::to_string(offset), + filename_, errno); + } + *result = Slice(scratch, (r < 0) ? 
0 : n - left); + return s; +} + +IOStatus PosixSequentialFile::Skip(uint64_t n) { + if (fseek(file_, static_cast(n), SEEK_CUR)) { + return IOError("While fseek to skip " + std::to_string(n) + " bytes", + filename_, errno); + } + return IOStatus::OK(); +} + +IOStatus PosixSequentialFile::InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + (void)offset; + (void)length; + return IOStatus::OK(); +#else + if (!use_direct_io()) { + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret != 0) { + return IOError("While fadvise NotNeeded offset " + + std::to_string(offset) + " len " + + std::to_string(length), + filename_, errno); + } + } + return IOStatus::OK(); +#endif +} + +/* + * PosixRandomAccessFile + */ +#if defined(OS_LINUX) +size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) { + if (max_size < kMaxVarint64Length * 3) { + return 0; + } + + struct stat buf; + int result = fstat(fd, &buf); + if (result == -1) { + return 0; + } + + long version = 0; + result = ioctl(fd, FS_IOC_GETVERSION, &version); + TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result); + if (result == -1) { + return 0; + } + uint64_t uversion = (uint64_t)version; + + char* rid = id; + rid = EncodeVarint64(rid, buf.st_dev); + rid = EncodeVarint64(rid, buf.st_ino); + rid = EncodeVarint64(rid, uversion); + assert(rid >= id); + return static_cast(rid - id); +} +#endif + +#if defined(OS_MACOSX) || defined(OS_AIX) +size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) { + if (max_size < kMaxVarint64Length * 3) { + return 0; + } + + struct stat buf; + int result = fstat(fd, &buf); + if (result == -1) { + return 0; + } + + char* rid = id; + rid = EncodeVarint64(rid, buf.st_dev); + rid = EncodeVarint64(rid, buf.st_ino); + rid = EncodeVarint64(rid, buf.st_gen); + assert(rid >= id); + return static_cast(rid - id); +} +#endif + +#ifdef OS_LINUX +std::string RemoveTrailingSlash(const std::string& path) { + std::string p = path; + if (p.size() > 1 && p.back() == '/') { + p.pop_back(); + } + return p; +} + +Status LogicalBlockSizeCache::RefAndCacheLogicalBlockSize( + const std::vector& directories) { + std::vector dirs; + dirs.reserve(directories.size()); + for (auto& d : directories) { + dirs.emplace_back(RemoveTrailingSlash(d)); + } + + std::map dir_sizes; + { + ReadLock lock(&cache_mutex_); + for (const auto& dir : dirs) { + if (cache_.find(dir) == cache_.end()) { + dir_sizes.emplace(dir, 0); + } + } + } + + Status s; + for (auto& dir_size : dir_sizes) { + s = get_logical_block_size_of_directory_(dir_size.first, &dir_size.second); + if (!s.ok()) { + return s; + } + } + + WriteLock lock(&cache_mutex_); + for (const auto& dir : dirs) { + auto& v = cache_[dir]; + v.ref++; + auto dir_size = dir_sizes.find(dir); + if (dir_size != dir_sizes.end()) { + v.size = dir_size->second; + } + } + return s; +} + +void LogicalBlockSizeCache::UnrefAndTryRemoveCachedLogicalBlockSize( + const std::vector& directories) { + std::vector dirs; + dirs.reserve(directories.size()); + for (auto& dir : directories) { + dirs.emplace_back(RemoveTrailingSlash(dir)); + } + + WriteLock lock(&cache_mutex_); + for (const auto& dir : dirs) { + auto it = cache_.find(dir); + if (it != cache_.end() && !(--(it->second.ref))) { + cache_.erase(it); + } + } +} + +size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname, + int fd) { + std::string dir = fname.substr(0, fname.find_last_of("/")); + if (dir.empty()) { + dir = "/"; + } + { + ReadLock 
lock(&cache_mutex_); + auto it = cache_.find(dir); + if (it != cache_.end()) { + return it->second.size; + } + } + return get_logical_block_size_of_fd_(fd); +} +#endif + +Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory, + size_t* size) { + int fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY); + if (fd == -1) { + close(fd); + return Status::IOError("Cannot open directory " + directory); + } + *size = PosixHelper::GetLogicalBlockSizeOfFd(fd); + close(fd); + return Status::OK(); +} + +size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) { +#ifdef OS_LINUX + struct stat buf; + int result = fstat(fd, &buf); + if (result == -1) { + return kDefaultPageSize; + } + if (major(buf.st_dev) == 0) { + // Unnamed devices (e.g. non-device mounts), reserved as null device number. + // These don't have an entry in /sys/dev/block/. Return a sensible default. + return kDefaultPageSize; + } + + // Reading queue/logical_block_size does not require special permissions. + const int kBufferSize = 100; + char path[kBufferSize]; + char real_path[PATH_MAX + 1]; + snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev), + minor(buf.st_dev)); + if (realpath(path, real_path) == nullptr) { + return kDefaultPageSize; + } + std::string device_dir(real_path); + if (!device_dir.empty() && device_dir.back() == '/') { + device_dir.pop_back(); + } + // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda + // and nvme0n1 have it. + // $ ls -al '/sys/dev/block/8:3' + // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 -> + // ../../block/sda/sda3 + // $ ls -al '/sys/dev/block/259:4' + // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 -> + // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1 + size_t parent_end = device_dir.rfind('/', device_dir.length() - 1); + if (parent_end == std::string::npos) { + return kDefaultPageSize; + } + size_t parent_begin = device_dir.rfind('/', parent_end - 1); + if (parent_begin == std::string::npos) { + return kDefaultPageSize; + } + std::string parent = + device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1); + std::string child = device_dir.substr(parent_end + 1, std::string::npos); + if (parent != "block" && + (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) { + device_dir = device_dir.substr(0, parent_end); + } + std::string fname = device_dir + "/queue/logical_block_size"; + FILE* fp; + size_t size = 0; + fp = fopen(fname.c_str(), "r"); + if (fp != nullptr) { + char* line = nullptr; + size_t len = 0; + if (getline(&line, &len, fp) != -1) { + sscanf(line, "%zu", &size); + } + free(line); + fclose(fp); + } + if (size != 0 && (size & (size - 1)) == 0) { + return size; + } +#endif + (void)fd; + return kDefaultPageSize; +} + +/* + * PosixRandomAccessFile + * + * pread() based random-access + */ +PosixRandomAccessFile::PosixRandomAccessFile( + const std::string& fname, int fd, size_t logical_block_size, + const EnvOptions& options +#if defined(ROCKSDB_IOURING_PRESENT) + , + ThreadLocalPtr* thread_local_io_urings +#endif + ) + : filename_(fname), + fd_(fd), + use_direct_io_(options.use_direct_reads), + logical_sector_size_(logical_block_size) +#if defined(ROCKSDB_IOURING_PRESENT) + , + thread_local_io_urings_(thread_local_io_urings) +#endif +{ + assert(!options.use_direct_reads || !options.use_mmap_reads); + assert(!options.use_mmap_reads); +} + +PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); } + +IOStatus 
PosixRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& /*opts*/, Slice* result, + char* scratch, + IODebugContext* /*dbg*/) const { + if (use_direct_io()) { + assert(IsSectorAligned(offset, GetRequiredBufferAlignment())); + assert(IsSectorAligned(n, GetRequiredBufferAlignment())); + assert(IsSectorAligned(scratch, GetRequiredBufferAlignment())); + } + IOStatus s; + ssize_t r = -1; + size_t left = n; + char* ptr = scratch; + while (left > 0) { + r = pread(fd_, ptr, left, static_cast(offset)); + if (r <= 0) { + if (r == -1 && errno == EINTR) { + continue; + } + break; + } + ptr += r; + offset += r; + left -= r; + if (use_direct_io() && + r % static_cast(GetRequiredBufferAlignment()) != 0) { + // Bytes reads don't fill sectors. Should only happen at the end + // of the file. + break; + } + } + if (r < 0) { + // An error: return a non-ok status + s = IOError("While pread offset " + std::to_string(offset) + " len " + + std::to_string(n), + filename_, errno); + } + *result = Slice(scratch, (r < 0) ? 0 : n - left); + return s; +} + +IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, + IODebugContext* dbg) { + if (use_direct_io()) { + for (size_t i = 0; i < num_reqs; i++) { + assert(IsSectorAligned(reqs[i].offset, GetRequiredBufferAlignment())); + assert(IsSectorAligned(reqs[i].len, GetRequiredBufferAlignment())); + assert(IsSectorAligned(reqs[i].scratch, GetRequiredBufferAlignment())); + } + } + +#if defined(ROCKSDB_IOURING_PRESENT) + struct io_uring* iu = nullptr; + if (thread_local_io_urings_) { + iu = static_cast(thread_local_io_urings_->Get()); + if (iu == nullptr) { + iu = CreateIOUring(); + if (iu != nullptr) { + thread_local_io_urings_->Reset(iu); + } + } + } + + // Init failed, platform doesn't support io_uring. 
Fall back to
+  // serialized reads
+  if (iu == nullptr) {
+    return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
+  }
+
+  IOStatus ios = IOStatus::OK();
+
+  struct WrappedReadRequest {
+    FSReadRequest* req;
+    struct iovec iov;
+    size_t finished_len;
+    explicit WrappedReadRequest(FSReadRequest* r) : req(r), finished_len(0) {}
+  };
+
+  autovector<WrappedReadRequest, 32> req_wraps;
+  autovector<WrappedReadRequest*, 4> incomplete_rq_list;
+  std::unordered_set<WrappedReadRequest*> wrap_cache;
+
+  for (size_t i = 0; i < num_reqs; i++) {
+    req_wraps.emplace_back(&reqs[i]);
+  }
+
+  size_t reqs_off = 0;
+  while (num_reqs > reqs_off || !incomplete_rq_list.empty()) {
+    size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size();
+
+    // If the requests exceed the ring depth, split them into batches
+    if (this_reqs > kIoUringDepth) this_reqs = kIoUringDepth;
+
+    assert(incomplete_rq_list.size() <= this_reqs);
+    for (size_t i = 0; i < this_reqs; i++) {
+      WrappedReadRequest* rep_to_submit;
+      if (i < incomplete_rq_list.size()) {
+        rep_to_submit = incomplete_rq_list[i];
+      } else {
+        rep_to_submit = &req_wraps[reqs_off++];
+      }
+      assert(rep_to_submit->req->len > rep_to_submit->finished_len);
+      rep_to_submit->iov.iov_base =
+          rep_to_submit->req->scratch + rep_to_submit->finished_len;
+      rep_to_submit->iov.iov_len =
+          rep_to_submit->req->len - rep_to_submit->finished_len;
+
+      struct io_uring_sqe* sqe;
+      sqe = io_uring_get_sqe(iu);
+      io_uring_prep_readv(
+          sqe, fd_, &rep_to_submit->iov, 1,
+          rep_to_submit->req->offset + rep_to_submit->finished_len);
+      io_uring_sqe_set_data(sqe, rep_to_submit);
+      wrap_cache.emplace(rep_to_submit);
+    }
+    incomplete_rq_list.clear();
+
+    ssize_t ret =
+        io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs));
+    TEST_SYNC_POINT_CALLBACK(
+        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
+        &ret);
+    TEST_SYNC_POINT_CALLBACK(
+        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
+        iu);
+
+    if (static_cast<size_t>(ret) != this_reqs) {
+      fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret,
+              (long)this_reqs);
+      // If an error happens and we submitted fewer sqes than expected, it is
+      // an exceptional case and we don't retry here. We should still consume
+      // what is submitted in the ring.
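+      // Drain the completions of whatever was submitted so stale cqes are
+      // not left in the ring to be misattributed by a later MultiRead call.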
+ for (ssize_t i = 0; i < ret; i++) { + struct io_uring_cqe* cqe = nullptr; + io_uring_wait_cqe(iu, &cqe); + if (cqe != nullptr) { + io_uring_cqe_seen(iu, cqe); + } + } + return IOStatus::IOError("io_uring_submit_and_wait() requested " + + std::to_string(this_reqs) + " but returned " + + std::to_string(ret)); + } + + for (size_t i = 0; i < this_reqs; i++) { + struct io_uring_cqe* cqe = nullptr; + WrappedReadRequest* req_wrap; + + // We could use the peek variant here, but this seems safer in terms + // of our initial wait not reaping all completions + ret = io_uring_wait_cqe(iu, &cqe); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret); + if (ret) { + ios = IOStatus::IOError("io_uring_wait_cqe() returns " + + std::to_string(ret)); + + if (cqe != nullptr) { + io_uring_cqe_seen(iu, cqe); + } + continue; + } + + req_wrap = static_cast(io_uring_cqe_get_data(cqe)); + // Reset cqe data to catch any stray reuse of it + static_cast(cqe)->user_data = 0xd5d5d5d5d5d5d5d5; + // Check that we got a valid unique cqe data + auto wrap_check = wrap_cache.find(req_wrap); + if (wrap_check == wrap_cache.end()) { + fprintf(stderr, + "PosixRandomAccessFile::MultiRead: " + "Bad cqe data from IO uring - %p\n", + req_wrap); + port::PrintStack(); + ios = IOStatus::IOError("io_uring_cqe_get_data() returned " + + std::to_string((uint64_t)req_wrap)); + continue; + } + wrap_cache.erase(wrap_check); + + FSReadRequest* req = req_wrap->req; + size_t bytes_read = 0; + bool read_again = false; + UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len, + false /*async_read*/, use_direct_io(), + GetRequiredBufferAlignment(), req_wrap->finished_len, req, + bytes_read, read_again); + int32_t res = cqe->res; + if (res >= 0) { + if (bytes_read == 0) { + if (read_again) { + Slice tmp_slice; + req->status = + Read(req->offset + req_wrap->finished_len, + req->len - req_wrap->finished_len, options, &tmp_slice, + req->scratch + req_wrap->finished_len, dbg); + req->result = + Slice(req->scratch, req_wrap->finished_len + tmp_slice.size()); + } + // else It means EOF so no need to do anything. 
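+          // A short read (bytes_read > 0 but less than iov.iov_len) is
+          // handled in the branch below: the wrapper is requeued so the
+          // next batch issues another readv for the remaining tail, using
+          // the finished_len that UpdateResult just advanced.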
+ } else if (bytes_read < req_wrap->iov.iov_len) { + incomplete_rq_list.push_back(req_wrap); + } + } + io_uring_cqe_seen(iu, cqe); + } + wrap_cache.clear(); + } + return ios; +#else + return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); +#endif +} + +IOStatus PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus s; + if (!use_direct_io()) { + ssize_t r = 0; +#ifdef OS_LINUX + r = readahead(fd_, offset, n); +#endif +#ifdef OS_MACOSX + radvisory advice; + advice.ra_offset = static_cast(offset); + advice.ra_count = static_cast(n); + r = fcntl(fd_, F_RDADVISE, &advice); +#endif + if (r == -1) { + s = IOError("While prefetching offset " + std::to_string(offset) + + " len " + std::to_string(n), + filename_, errno); + } + } + return s; +} + +#if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX) +size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size); +} +#endif + +void PosixRandomAccessFile::Hint(AccessPattern pattern) { + if (use_direct_io()) { + return; + } + switch (pattern) { + case kNormal: + Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL); + break; + case kRandom: + Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM); + break; + case kSequential: + Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); + break; + case kWillNeed: + Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED); + break; + case kWontNeed: + Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); + break; + default: + assert(false); + break; + } +} + +IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) { + if (use_direct_io()) { + return IOStatus::OK(); + } +#ifndef OS_LINUX + (void)offset; + (void)length; + return IOStatus::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return IOStatus::OK(); + } + return IOError("While fadvise NotNeeded offset " + std::to_string(offset) + + " len " + std::to_string(length), + filename_, errno); +#endif +} + +IOStatus PosixRandomAccessFile::ReadAsync( + FSReadRequest& req, const IOOptions& /*opts*/, + std::function cb, void* cb_arg, + void** io_handle, IOHandleDeleter* del_fn, IODebugContext* /*dbg*/) { + if (use_direct_io()) { + assert(IsSectorAligned(req.offset, GetRequiredBufferAlignment())); + assert(IsSectorAligned(req.len, GetRequiredBufferAlignment())); + assert(IsSectorAligned(req.scratch, GetRequiredBufferAlignment())); + } + +#if defined(ROCKSDB_IOURING_PRESENT) + // io_uring_queue_init. + struct io_uring* iu = nullptr; + if (thread_local_io_urings_) { + iu = static_cast(thread_local_io_urings_->Get()); + if (iu == nullptr) { + iu = CreateIOUring(); + if (iu != nullptr) { + thread_local_io_urings_->Reset(iu); + } + } + } + + // Init failed, platform doesn't support io_uring. + if (iu == nullptr) { + return IOStatus::NotSupported("ReadAsync"); + } + + // Allocate io_handle. + IOHandleDeleter deletefn = [](void* args) -> void { + delete (static_cast(args)); + args = nullptr; + }; + + // Initialize Posix_IOHandle. 
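+  // The handle must carry everything Poll/AbortIO later need: the ring
+  // pointer, the completion callback and its argument, the request geometry,
+  // and the iovec, which has to stay alive until the kernel completes the
+  // readv.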
+ Posix_IOHandle* posix_handle = + new Posix_IOHandle(iu, cb, cb_arg, req.offset, req.len, req.scratch, + use_direct_io(), GetRequiredBufferAlignment()); + posix_handle->iov.iov_base = req.scratch; + posix_handle->iov.iov_len = req.len; + + *io_handle = static_cast(posix_handle); + *del_fn = deletefn; + + // Step 3: io_uring_sqe_set_data + struct io_uring_sqe* sqe; + sqe = io_uring_get_sqe(iu); + + io_uring_prep_readv(sqe, fd_, /*sqe->addr=*/&posix_handle->iov, + /*sqe->len=*/1, /*sqe->offset=*/posix_handle->offset); + + // Sets sqe->user_data to posix_handle. + io_uring_sqe_set_data(sqe, posix_handle); + + // Step 4: io_uring_submit + ssize_t ret = io_uring_submit(iu); + if (ret < 0) { + fprintf(stderr, "io_uring_submit error: %ld\n", long(ret)); + return IOStatus::IOError("io_uring_submit() requested but returned " + + std::to_string(ret)); + } + return IOStatus::OK(); +#else + (void)req; + (void)cb; + (void)cb_arg; + (void)io_handle; + (void)del_fn; + return IOStatus::NotSupported("ReadAsync"); +#endif +} + +/* + * PosixMmapReadableFile + * + * mmap() based random-access + */ +// base[0,length-1] contains the mmapped contents of the file. +PosixMmapReadableFile::PosixMmapReadableFile(const int fd, + const std::string& fname, + void* base, size_t length, + const EnvOptions& options) + : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) { +#ifdef NDEBUG + (void)options; +#endif + fd_ = fd_ + 0; // suppress the warning for used variables + assert(options.use_mmap_reads); + assert(!options.use_direct_reads); +} + +PosixMmapReadableFile::~PosixMmapReadableFile() { + int ret = munmap(mmapped_region_, length_); + if (ret != 0) { + fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n", + mmapped_region_, length_); + } + close(fd_); +} + +IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n, + const IOOptions& /*opts*/, Slice* result, + char* /*scratch*/, + IODebugContext* /*dbg*/) const { + IOStatus s; + if (offset > length_) { + *result = Slice(); + return IOError("While mmap read offset " + std::to_string(offset) + + " larger than file length " + std::to_string(length_), + filename_, EINVAL); + } else if (offset + n > length_) { + n = static_cast(length_ - offset); + } + *result = Slice(reinterpret_cast(mmapped_region_) + offset, n); + return s; +} + +void PosixMmapReadableFile::Hint(AccessPattern pattern) { + switch (pattern) { + case kNormal: + Madvise(mmapped_region_, length_, POSIX_MADV_NORMAL); + break; + case kRandom: + Madvise(mmapped_region_, length_, POSIX_MADV_RANDOM); + break; + case kSequential: + Madvise(mmapped_region_, length_, POSIX_MADV_SEQUENTIAL); + break; + case kWillNeed: + Madvise(mmapped_region_, length_, POSIX_MADV_WILLNEED); + break; + case kWontNeed: + Madvise(mmapped_region_, length_, POSIX_MADV_DONTNEED); + break; + default: + assert(false); + break; + } +} + +IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + (void)offset; + (void)length; + return IOStatus::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return IOStatus::OK(); + } + return IOError("While fadvise not needed. Offset " + std::to_string(offset) + + " len" + std::to_string(length), + filename_, errno); +#endif +} + +/* + * PosixMmapFile + * + * We preallocate up to an extra megabyte and use memcpy to append new + * data to the file. 
This is safe since we either properly close the + * file before reading from it, or for log files, the reading code + * knows enough to skip zero suffixes. + */ +IOStatus PosixMmapFile::UnmapCurrentRegion() { + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); + if (base_ != nullptr) { + int munmap_status = munmap(base_, limit_ - base_); + if (munmap_status != 0) { + return IOError("While munmap", filename_, munmap_status); + } + file_offset_ += limit_ - base_; + base_ = nullptr; + limit_ = nullptr; + last_sync_ = nullptr; + dst_ = nullptr; + + // Increase the amount we map the next time, but capped at 1MB + if (map_size_ < (1 << 20)) { + map_size_ *= 2; + } + } + return IOStatus::OK(); +} + +IOStatus PosixMmapFile::MapNewRegion() { +#ifdef ROCKSDB_FALLOCATE_PRESENT + assert(base_ == nullptr); + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); + // we can't fallocate with FALLOC_FL_KEEP_SIZE here + if (allow_fallocate_) { + IOSTATS_TIMER_GUARD(allocate_nanos); + int alloc_status = fallocate(fd_, 0, file_offset_, map_size_); + if (alloc_status != 0) { + // fallback to posix_fallocate + alloc_status = posix_fallocate(fd_, file_offset_, map_size_); + } + if (alloc_status != 0) { + return IOStatus::IOError("Error allocating space to file : " + filename_ + + "Error : " + errnoStr(alloc_status).c_str()); + } + } + + TEST_KILL_RANDOM("PosixMmapFile::Append:1"); + void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, + file_offset_); + if (ptr == MAP_FAILED) { + return IOStatus::IOError("MMap failed on " + filename_); + } + TEST_KILL_RANDOM("PosixMmapFile::Append:2"); + + base_ = reinterpret_cast(ptr); + limit_ = base_ + map_size_; + dst_ = base_; + last_sync_ = base_; + return IOStatus::OK(); +#else + return IOStatus::NotSupported("This platform doesn't support fallocate()"); +#endif +} + +IOStatus PosixMmapFile::Msync() { + if (dst_ == last_sync_) { + return IOStatus::OK(); + } + // Find the beginnings of the pages that contain the first and last + // bytes to be synced. 
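+  // p1 is the page-aligned offset of the first unsynced byte and p2 that of
+  // the last written byte, so the msync below covers [p1, p2 + page_size_).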
+ size_t p1 = TruncateToPageBoundary(last_sync_ - base_); + size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); + last_sync_ = dst_; + TEST_KILL_RANDOM("PosixMmapFile::Msync:0"); + if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { + return IOError("While msync", filename_, errno); + } + return IOStatus::OK(); +} + +PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size, + const EnvOptions& options) + : filename_(fname), + fd_(fd), + page_size_(page_size), + map_size_(Roundup(65536, page_size)), + base_(nullptr), + limit_(nullptr), + dst_(nullptr), + last_sync_(nullptr), + file_offset_(0) { +#ifdef ROCKSDB_FALLOCATE_PRESENT + allow_fallocate_ = options.allow_fallocate; + fallocate_with_keep_size_ = options.fallocate_with_keep_size; +#else + (void)options; +#endif + assert((page_size & (page_size - 1)) == 0); + assert(options.use_mmap_writes); + assert(!options.use_direct_writes); +} + +PosixMmapFile::~PosixMmapFile() { + if (fd_ >= 0) { + IOStatus s = PosixMmapFile::Close(IOOptions(), nullptr); + s.PermitUncheckedError(); + } +} + +IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + const char* src = data.data(); + size_t left = data.size(); + while (left > 0) { + assert(base_ <= dst_); + assert(dst_ <= limit_); + size_t avail = limit_ - dst_; + if (avail == 0) { + IOStatus s = UnmapCurrentRegion(); + if (!s.ok()) { + return s; + } + s = MapNewRegion(); + if (!s.ok()) { + return s; + } + TEST_KILL_RANDOM("PosixMmapFile::Append:0"); + } + + size_t n = (left <= avail) ? left : avail; + assert(dst_); + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + } + return IOStatus::OK(); +} + +IOStatus PosixMmapFile::Close(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus s; + size_t unused = limit_ - dst_; + + s = UnmapCurrentRegion(); + if (!s.ok()) { + s = IOError("While closing mmapped file", filename_, errno); + } else if (unused > 0) { + // Trim the extra space at the end of the file + if (ftruncate(fd_, file_offset_ - unused) < 0) { + s = IOError("While ftruncating mmaped file", filename_, errno); + } + } + + if (close(fd_) < 0) { + if (s.ok()) { + s = IOError("While closing mmapped file", filename_, errno); + } + } + + fd_ = -1; + base_ = nullptr; + limit_ = nullptr; + return s; +} + +IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLSYNC) mmapped file", filename_, errno); + } +#else // HAVE_FULLFSYNC + if (fdatasync(fd_) < 0) { + return IOError("While fdatasync mmapped file", filename_, errno); + } +#endif // HAVE_FULLFSYNC + + return Msync(); +} + +/** + * Flush data as well as metadata to stable storage. + */ +IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("While fcntl(F_FULLSYNC) on mmaped file", filename_, errno); + } +#else // HAVE_FULLFSYNC + if (fsync(fd_) < 0) { + return IOError("While fsync mmaped file", filename_, errno); + } +#endif // HAVE_FULLFSYNC + + return Msync(); +} + +/** + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. 
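+ * The valid size is therefore file_offset_ (bytes in regions that were
+ * already unmapped) plus dst_ - base_ (bytes appended to the currently
+ * mapped region).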
+/**
+ * Get the size of valid data in the file. This will not match the
+ * size that is returned from the filesystem because we use mmap
+ * to extend the file by map_size_ every time.
+ */
+uint64_t PosixMmapFile::GetFileSize(const IOOptions& /*opts*/,
+                                    IODebugContext* /*dbg*/) {
+  size_t used = dst_ - base_;
+  return file_offset_ + used;
+}
+
+IOStatus PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+  (void)offset;
+  (void)length;
+  return IOStatus::OK();
+#else
+  // free OS pages
+  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+  if (ret == 0) {
+    return IOStatus::OK();
+  }
+  return IOError("While fadvise NotNeeded mmapped file", filename_, errno);
+#endif
+}
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len,
+                                 const IOOptions& /*opts*/,
+                                 IODebugContext* /*dbg*/) {
+  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  TEST_KILL_RANDOM("PosixMmapFile::Allocate:0");
+  int alloc_status = 0;
+  if (allow_fallocate_) {
+    alloc_status =
+        fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
+                  static_cast<off_t>(offset), static_cast<off_t>(len));
+  }
+  if (alloc_status == 0) {
+    return IOStatus::OK();
+  } else {
+    return IOError("While fallocate offset " + std::to_string(offset) +
+                       " len " + std::to_string(len),
+                   filename_, errno);
+  }
+}
+#endif
+
+/*
+ * PosixWritableFile
+ *
+ * Use posix write to write data to a file.
+ */
+PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
+                                     size_t logical_block_size,
+                                     const EnvOptions& options)
+    : FSWritableFile(options),
+      filename_(fname),
+      use_direct_io_(options.use_direct_writes),
+      fd_(fd),
+      filesize_(0),
+      logical_sector_size_(logical_block_size) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  allow_fallocate_ = options.allow_fallocate;
+  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
+#endif
+#ifdef ROCKSDB_RANGESYNC_PRESENT
+  sync_file_range_supported_ = IsSyncFileRangeSupported(fd_);
+#endif  // ROCKSDB_RANGESYNC_PRESENT
+  assert(!options.use_mmap_writes);
+}
+
+PosixWritableFile::~PosixWritableFile() {
+  if (fd_ >= 0) {
+    IOStatus s = PosixWritableFile::Close(IOOptions(), nullptr);
+    s.PermitUncheckedError();
+  }
+}
+
+IOStatus PosixWritableFile::Append(const Slice& data, const IOOptions& /*opts*/,
+                                   IODebugContext* /*dbg*/) {
+  if (use_direct_io()) {
+    assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
+  }
+  const char* src = data.data();
+  size_t nbytes = data.size();
+
+  if (!PosixWrite(fd_, src, nbytes)) {
+    return IOError("While appending to file", filename_, errno);
+  }
+
+  filesize_ += nbytes;
+  return IOStatus::OK();
+}
+
+IOStatus PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
+                                             const IOOptions& /*opts*/,
+                                             IODebugContext* /*dbg*/) {
+  if (use_direct_io()) {
+    assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
+  }
+  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  const char* src = data.data();
+  size_t nbytes = data.size();
+  if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
+    return IOError("While pwrite to file at offset " + std::to_string(offset),
+                   filename_, errno);
+  }
+  filesize_ = offset + nbytes;
+  return IOStatus::OK();
+}
+
+IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/,
+                                     IODebugContext* /*dbg*/) {
+  IOStatus s;
+  int r = ftruncate(fd_, size);
+  if (r < 0) {
+    s = IOError("While ftruncate 
file to size " + std::to_string(size), + filename_, errno); + } else { + filesize_ = size; + } + return s; +} + +IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus s; + + size_t block_size; + size_t last_allocated_block; + GetPreallocationStatus(&block_size, &last_allocated_block); + TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block); + if (last_allocated_block > 0) { + // trim the extra space preallocated at the end of the file + // NOTE(ljin): we probably don't want to surface failure as an IOError, + // but it will be nice to log these errors. + int dummy __attribute__((__unused__)); + dummy = ftruncate(fd_, filesize_); +#if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE) + // in some file systems, ftruncate only trims trailing space if the + // new file size is smaller than the current size. Calling fallocate + // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused + // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following + // filesystems: + // XFS (since Linux 2.6.38) + // ext4 (since Linux 3.0) + // Btrfs (since Linux 3.7) + // tmpfs (since Linux 3.5) + // We ignore error since failure of this operation does not affect + // correctness. + struct stat file_stats; + int result = fstat(fd_, &file_stats); + // After ftruncate, we check whether ftruncate has the correct behavior. + // If not, we should hack it with FALLOC_FL_PUNCH_HOLE + if (result == 0 && + (file_stats.st_size + file_stats.st_blksize - 1) / + file_stats.st_blksize != + file_stats.st_blocks / (file_stats.st_blksize / 512)) { + IOSTATS_TIMER_GUARD(allocate_nanos); + if (allow_fallocate_) { + fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_, + block_size * last_allocated_block - filesize_); + } + } +#endif + } + + if (close(fd_) < 0) { + s = IOError("While closing file after writing", filename_, errno); + } + fd_ = -1; + return s; +} + +// write out the cached data to the OS cache +IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", filename_, errno); + } +#else // HAVE_FULLFSYNC + if (fdatasync(fd_) < 0) { + return IOError("While fdatasync", filename_, errno); + } +#endif // HAVE_FULLFSYNC + return IOStatus::OK(); +} + +IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", filename_, errno); + } +#else // HAVE_FULLFSYNC + if (fsync(fd_) < 0) { + return IOError("While fsync", filename_, errno); + } +#endif // HAVE_FULLFSYNC + return IOStatus::OK(); +} + +bool PosixWritableFile::IsSyncThreadSafe() const { return true; } + +uint64_t PosixWritableFile::GetFileSize(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + return filesize_; +} + +void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { +#ifdef OS_LINUX +// Suppress Valgrind "Unimplemented functionality" error. 
+#ifndef ROCKSDB_VALGRIND_RUN
+  if (hint == write_hint_) {
+    return;
+  }
+  if (fcntl(fd_, F_SET_RW_HINT, &hint) == 0) {
+    write_hint_ = hint;
+  }
+#else
+  (void)hint;
+#endif  // ROCKSDB_VALGRIND_RUN
+#else
+  (void)hint;
+#endif  // OS_LINUX
+}
+
+IOStatus PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
+  if (use_direct_io()) {
+    return IOStatus::OK();
+  }
+#ifndef OS_LINUX
+  (void)offset;
+  (void)length;
+  return IOStatus::OK();
+#else
+  // free OS pages
+  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+  if (ret == 0) {
+    return IOStatus::OK();
+  }
+  return IOError("While fadvise NotNeeded", filename_, errno);
+#endif
+}
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len,
+                                     const IOOptions& /*opts*/,
+                                     IODebugContext* /*dbg*/) {
+  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  TEST_KILL_RANDOM("PosixWritableFile::Allocate:0");
+  IOSTATS_TIMER_GUARD(allocate_nanos);
+  int alloc_status = 0;
+  if (allow_fallocate_) {
+    alloc_status =
+        fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
+                  static_cast<off_t>(offset), static_cast<off_t>(len));
+  }
+  if (alloc_status == 0) {
+    return IOStatus::OK();
+  } else {
+    return IOError("While fallocate offset " + std::to_string(offset) +
+                       " len " + std::to_string(len),
+                   filename_, errno);
+  }
+}
+#endif
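RangeSync() just below chooses between two sync_file_range(2) flag combinations. A Linux-only sketch of the two modes (ours, not from the vendored sources; glibc needs _GNU_SOURCE for the declaration):

#include <fcntl.h>
#include <unistd.h>

// strict == true mirrors the strict_bytes_per_sync_ branch: first wait for
// any writeback already in flight on [0, offset + nbytes), then start a new
// pass; otherwise just kick off asynchronous writeback of the new range.
int StartRangeWriteback(int fd, off_t offset, off_t nbytes, bool strict) {
  if (strict) {
    return sync_file_range(fd, 0, offset + nbytes,
                           SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE);
  }
  return sync_file_range(fd, offset, nbytes, SYNC_FILE_RANGE_WRITE);
}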
+IOStatus PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
+                                      const IOOptions& opts,
+                                      IODebugContext* dbg) {
+#ifdef ROCKSDB_RANGESYNC_PRESENT
+  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  assert(nbytes <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  if (sync_file_range_supported_) {
+    int ret;
+    if (strict_bytes_per_sync_) {
+      // Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length
+      // that spans all bytes written so far tells `sync_file_range` to wait for
+      // any outstanding writeback requests to finish before issuing a new one.
+      ret =
+          sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes),
+                          SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE);
+    } else {
+      ret = sync_file_range(fd_, static_cast<off_t>(offset),
+                            static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE);
+    }
+    if (ret != 0) {
+      return IOError("While sync_file_range returned " + std::to_string(ret),
+                     filename_, errno);
+    }
+    return IOStatus::OK();
+  }
+#endif  // ROCKSDB_RANGESYNC_PRESENT
+  return FSWritableFile::RangeSync(offset, nbytes, opts, dbg);
+}
+
+#ifdef OS_LINUX
+size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
+  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
+}
+#endif
+
+/*
+ * PosixRandomRWFile
+ */
+
+PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd,
+                                     const EnvOptions& /*options*/)
+    : filename_(fname), fd_(fd) {}
+
+PosixRandomRWFile::~PosixRandomRWFile() {
+  if (fd_ >= 0) {
+    IOStatus s = Close(IOOptions(), nullptr);
+    s.PermitUncheckedError();
+  }
+}
+
+IOStatus PosixRandomRWFile::Write(uint64_t offset, const Slice& data,
+                                  const IOOptions& /*opts*/,
+                                  IODebugContext* /*dbg*/) {
+  const char* src = data.data();
+  size_t nbytes = data.size();
+  if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
+    return IOError("While writing random read/write file at offset " +
+                       std::to_string(offset),
+                   filename_, errno);
+  }
+
+  return IOStatus::OK();
+}
+
+IOStatus PosixRandomRWFile::Read(uint64_t offset, size_t n,
+                                 const IOOptions& /*opts*/, Slice* result,
+                                 char* scratch, IODebugContext* /*dbg*/) const {
+  size_t left = n;
+  char* ptr = scratch;
+  while (left > 0) {
+    ssize_t done = pread(fd_, ptr, left, offset);
+    if (done < 0) {
+      // error while reading from file
+      if (errno == EINTR) {
+        // read was interrupted, try again.
+        continue;
+      }
+      return IOError("While reading random read/write file offset " +
+                         std::to_string(offset) + " len " + std::to_string(n),
+                     filename_, errno);
+    } else if (done == 0) {
+      // Nothing more to read
+      break;
+    }
+
+    // Read `done` bytes
+    ptr += done;
+    offset += done;
+    left -= done;
+  }
+
+  *result = Slice(scratch, n - left);
+  return IOStatus::OK();
+}
+
+IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/,
+                                  IODebugContext* /*dbg*/) {
+  return IOStatus::OK();
+}
+
+IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/,
+                                 IODebugContext* /*dbg*/) {
+#ifdef HAVE_FULLFSYNC
+  if (::fcntl(fd_, F_FULLFSYNC) < 0) {
+    return IOError("While fcntl(F_FULLFSYNC) random rw file", filename_, errno);
+  }
+#else  // HAVE_FULLFSYNC
+  if (fdatasync(fd_) < 0) {
+    return IOError("While fdatasync random read/write file", filename_, errno);
+  }
+#endif  // HAVE_FULLFSYNC
+  return IOStatus::OK();
+}
+
+IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/,
+                                  IODebugContext* /*dbg*/) {
+#ifdef HAVE_FULLFSYNC
+  if (::fcntl(fd_, F_FULLFSYNC) < 0) {
+    return IOError("While fcntl(F_FULLFSYNC) random rw file", filename_, errno);
+  }
+#else  // HAVE_FULLFSYNC
+  if (fsync(fd_) < 0) {
+    return IOError("While fsync random read/write file", filename_, errno);
+  }
+#endif  // HAVE_FULLFSYNC
+  return IOStatus::OK();
+}
+
+IOStatus PosixRandomRWFile::Close(const IOOptions& /*opts*/,
+                                  IODebugContext* /*dbg*/) {
+  if (close(fd_) < 0) {
+    return IOError("While close random read/write file", filename_, errno);
+  }
+  fd_ = -1;
+  return IOStatus::OK();
+}
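PosixPositionedWrite(), used by Write() above, is defined earlier in io_posix.cc and falls outside this hunk. A sketch of the standard full-write loop such a helper implements, covering short writes and EINTR (ours; the vendored helper's exact shape may differ):

#include <unistd.h>
#include <cerrno>
#include <cstddef>

bool PWriteAll(int fd, const char* src, size_t left, off_t offset) {
  while (left > 0) {
    ssize_t done = pwrite(fd, src, left, offset);
    if (done < 0) {
      if (errno == EINTR) {
        continue;  // interrupted by a signal: retry the same range
      }
      return false;  // real error; the caller reads errno
    }
    // Short write: advance past the written prefix and keep going.
    src += done;
    offset += done;
    left -= static_cast<size_t>(done);
  }
  return true;
}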
+PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
+  // TODO should have error handling though not much we can do...
+  munmap(this->base_, length_);
+}
+
+/*
+ * PosixDirectory
+ */
+#if !defined(BTRFS_SUPER_MAGIC)
+// The magic number for btrfs is fixed; if it's not defined, define it here.
+#define BTRFS_SUPER_MAGIC 0x9123683E
+#endif
+PosixDirectory::PosixDirectory(int fd, const std::string& directory_name)
+    : fd_(fd), directory_name_(directory_name) {
+  is_btrfs_ = false;
+#ifdef OS_LINUX
+  struct statfs buf;
+  int ret = fstatfs(fd, &buf);
+  is_btrfs_ = (ret == 0 && buf.f_type == static_cast<decltype(buf.f_type)>(
+                               BTRFS_SUPER_MAGIC));
+#endif
+}
+
+PosixDirectory::~PosixDirectory() {
+  if (fd_ >= 0) {
+    IOStatus s = PosixDirectory::Close(IOOptions(), nullptr);
+    s.PermitUncheckedError();
+  }
+}
+
+IOStatus PosixDirectory::Fsync(const IOOptions& opts, IODebugContext* dbg) {
+  return FsyncWithDirOptions(opts, dbg, DirFsyncOptions());
+}
+
+// Users who want the directory's file entries synced must call Fsync() or
+// FsyncWithDirOptions() before Close().
+IOStatus PosixDirectory::Close(const IOOptions& /*opts*/,
+                               IODebugContext* /*dbg*/) {
+  IOStatus s = IOStatus::OK();
+  if (close(fd_) < 0) {
+    s = IOError("While closing directory ", directory_name_, errno);
+  } else {
+    fd_ = -1;
+  }
+  return s;
+}
+
+IOStatus PosixDirectory::FsyncWithDirOptions(
+    const IOOptions& /*opts*/, IODebugContext* /*dbg*/,
+    const DirFsyncOptions& dir_fsync_options) {
+  assert(fd_ >= 0);  // Check use after close
+  IOStatus s = IOStatus::OK();
+#ifndef OS_AIX
+  if (is_btrfs_) {
+    // skip dir fsync for new file creation, which is not needed for btrfs
+    if (dir_fsync_options.reason == DirFsyncOptions::kNewFileSynced) {
+      return s;
+    }
+    // skip dir fsync for file renames; only the new file needs to be synced
+    if (dir_fsync_options.reason == DirFsyncOptions::kFileRenamed) {
+      std::string new_name = dir_fsync_options.renamed_new_name;
+      assert(!new_name.empty());
+      int fd;
+      do {
+        IOSTATS_TIMER_GUARD(open_nanos);
+        fd = open(new_name.c_str(), O_RDONLY);
+      } while (fd < 0 && errno == EINTR);
+      if (fd < 0) {
+        s = IOError("While open renaming file", new_name, errno);
+      } else if (fsync(fd) < 0) {
+        s = IOError("While fsync renaming file", new_name, errno);
+      }
+      if (close(fd) < 0) {
+        s = IOError("While closing file after fsync", new_name, errno);
+      }
+      return s;
+    }
+    // fall back to dir-fsync for kDefault, kDirRenamed and kFileDeleted
+  }
+
+  // skip fsync/fcntl when fd_ == -1, since the file descriptor has then been
+  // closed in either the destructor or Close(); data must have been fsync-ed
+  // before the destructor or Close() was called
+#ifdef HAVE_FULLFSYNC
+  // btrfs is a Linux filesystem, while F_FULLFSYNC is currently only
+  // available on Mac OS.
+  assert(!is_btrfs_);
+  if (fd_ != -1 && ::fcntl(fd_, F_FULLFSYNC) < 0) {
+    return IOError("while fcntl(F_FULLFSYNC)", "a directory", errno);
+  }
+#else   // HAVE_FULLFSYNC
+  if (fd_ != -1 && fsync(fd_) == -1) {
+    s = IOError("While fsync", "a directory", errno);
+  }
+#endif  // HAVE_FULLFSYNC
+#endif  // OS_AIX
+  return s;
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif
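The PosixDirectory constructor above detects btrfs by comparing statfs::f_type against the filesystem magic number. A standalone sketch of the same probe, run on the current directory (ours, not from the vendored sources):

#include <sys/vfs.h>  // fstatfs, struct statfs (Linux)
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

#ifndef BTRFS_SUPER_MAGIC
#define BTRFS_SUPER_MAGIC 0x9123683E
#endif

int main() {
  int fd = open(".", O_RDONLY);
  if (fd < 0) return 1;
  struct statfs buf;
  // f_type's integer type varies across platforms, hence the decltype cast.
  bool is_btrfs =
      fstatfs(fd, &buf) == 0 &&
      buf.f_type == static_cast<decltype(buf.f_type)>(BTRFS_SUPER_MAGIC);
  std::printf("btrfs: %s\n", is_btrfs ? "yes" : "no");
  close(fd);
  return 0;
}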
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/io_posix.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/io_posix.h
new file mode 100644
index 0000000000..f129668ea5
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/io_posix.h
@@ -0,0 +1,523 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include <errno.h>
+#if defined(ROCKSDB_IOURING_PRESENT)
+#include <liburing.h>
+#include <sys/uio.h>
+#endif
+#include <unistd.h>
+
+#include <atomic>
+#include <functional>
+#include <map>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_status.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+#include "util/thread_local.h"
+
+// For non-Linux platforms, the following macros are used only as
+// placeholders.
+#if !(defined OS_LINUX) && !(defined CYGWIN) && !(defined OS_AIX)
+#define POSIX_FADV_NORMAL 0     /* [MC1] no further special treatment */
+#define POSIX_FADV_RANDOM 1     /* [MC1] expect random page refs */
+#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
+#define POSIX_FADV_WILLNEED 3   /* [MC1] will need these pages */
+#define POSIX_FADV_DONTNEED 4   /* [MC1] don't need these pages */
+
+#define POSIX_MADV_NORMAL 0     /* [MC1] no further special treatment */
+#define POSIX_MADV_RANDOM 1     /* [MC1] expect random page refs */
+#define POSIX_MADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
+#define POSIX_MADV_WILLNEED 3   /* [MC1] will need these pages */
+#define POSIX_MADV_DONTNEED 4   /* [MC1] don't need these pages */
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+std::string IOErrorMsg(const std::string& context,
+                       const std::string& file_name);
+// file_name can be left empty if it is unknown.
+IOStatus IOError(const std::string& context, const std::string& file_name,
+                 int err_number);
+
+class PosixHelper {
+ public:
+  static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size);
+  static size_t GetLogicalBlockSizeOfFd(int fd);
+  static Status GetLogicalBlockSizeOfDirectory(const std::string& directory,
+                                               size_t* size);
+};
+
+/*
+ * DirectIOHelper
+ */
+inline bool IsSectorAligned(const size_t off, size_t sector_size) {
+  assert((sector_size & (sector_size - 1)) == 0);
+  return (off & (sector_size - 1)) == 0;
+}
+
+#ifndef NDEBUG
+inline bool IsSectorAligned(const void* ptr, size_t sector_size) {
+  return uintptr_t(ptr) % sector_size == 0;
+}
+#endif
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+struct Posix_IOHandle {
+  Posix_IOHandle(struct io_uring* _iu,
+                 std::function<void(const FSReadRequest&, void*)> _cb,
+                 void* _cb_arg, uint64_t _offset, size_t _len, char* _scratch,
+                 bool _use_direct_io, size_t _alignment)
+      : iu(_iu),
+        cb(_cb),
+        cb_arg(_cb_arg),
+        offset(_offset),
+        len(_len),
+        scratch(_scratch),
+        use_direct_io(_use_direct_io),
+        alignment(_alignment),
+        is_finished(false),
+        req_count(0) {}
+
+  struct iovec iov;
+  struct io_uring* iu;
+  std::function<void(const FSReadRequest&, void*)> cb;
+  void* cb_arg;
+  uint64_t offset;
+  size_t len;
+  char* scratch;
+  bool use_direct_io;
+  size_t alignment;
+  bool is_finished;
+  // req_count is used by the AbortIO API to keep track of the number of
+  // outstanding requests.
+  uint32_t req_count;
+};
+
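Posix_IOHandle above carries the state for one io_uring read whose completion is decoded by UpdateResult() below. For orientation, here is a minimal synchronous liburing read cycle (ours, not the vendored code's async path; error handling is reduced to early returns):

#include <liburing.h>
#include <fcntl.h>
#include <unistd.h>

// Returns the raw cqe->res: < 0 is -errno, 0 is EOF (or, rarely, a partial
// result -- the case UpdateResult() below has to disambiguate), > 0 is the
// number of bytes read.
int ReadOnce(const char* path, char* buf, unsigned len, off_t off) {
  struct io_uring ring;
  if (io_uring_queue_init(8, &ring, 0) != 0) return -1;
  int fd = open(path, O_RDONLY);
  if (fd < 0) { io_uring_queue_exit(&ring); return -1; }
  struct io_uring_sqe* sqe = io_uring_get_sqe(&ring);
  io_uring_prep_read(sqe, fd, buf, len, off);  // queue one read
  io_uring_submit(&ring);                      // hand it to the kernel
  struct io_uring_cqe* cqe = nullptr;
  io_uring_wait_cqe(&ring, &cqe);              // block for the completion
  int res = cqe->res;
  io_uring_cqe_seen(&ring, cqe);               // retire the completion entry
  close(fd);
  io_uring_queue_exit(&ring);
  return res;
}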
+inline void UpdateResult(struct io_uring_cqe* cqe, const std::string& file_name,
+                         size_t len, size_t iov_len, bool async_read,
+                         bool use_direct_io, size_t alignment,
+                         size_t& finished_len, FSReadRequest* req,
+                         size_t& bytes_read, bool& read_again) {
+  read_again = false;
+  if (cqe->res < 0) {
+    req->result = Slice(req->scratch, 0);
+    req->status = IOError("Req failed", file_name, cqe->res);
+  } else {
+    bytes_read = static_cast<size_t>(cqe->res);
+    TEST_SYNC_POINT_CALLBACK("UpdateResults::io_uring_result", &bytes_read);
+    if (bytes_read == iov_len) {
+      req->result = Slice(req->scratch, req->len);
+      req->status = IOStatus::OK();
+    } else if (bytes_read == 0) {
+      // cqe->res == 0 can mean EOF, or it can mean partial results; see the
+      // comment at
+      // https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435
+      // Fall back to pread in this case.
+      if (use_direct_io && !IsSectorAligned(finished_len, alignment)) {
+        // The bytes read don't fill whole sectors. This should only happen at
+        // the end of the file.
+        req->result = Slice(req->scratch, finished_len);
+        req->status = IOStatus::OK();
+      } else {
+        if (async_read) {
+          // No bytes read. This can mean EOF. In case of partial results, it
+          // is the caller's responsibility to call read/readasync again.
+          req->result = Slice(req->scratch, 0);
+          req->status = IOStatus::OK();
+        } else {
+          read_again = true;
+        }
+      }
+    } else if (bytes_read < iov_len) {
+      assert(bytes_read > 0);
+      if (async_read) {
+        req->result = Slice(req->scratch, bytes_read);
+        req->status = IOStatus::OK();
+      } else {
+        assert(bytes_read + finished_len < len);
+        finished_len += bytes_read;
+      }
+    } else {
+      req->result = Slice(req->scratch, 0);
+      req->status = IOError("Req returned more bytes than requested", file_name,
+                            cqe->res);
+    }
+  }
+#ifdef NDEBUG
+  (void)len;
+#endif
+}
+#endif
+
+#ifdef OS_LINUX
+// Files under a specific directory have the same logical block size.
+// This class caches the logical block size for the specified directories to
+// save the CPU cost of computing the size.
+// Safe for concurrent access from multiple threads without any external
+// synchronization.
+class LogicalBlockSizeCache {
+ public:
+  LogicalBlockSizeCache(
+      std::function<size_t(int)> get_logical_block_size_of_fd =
+          PosixHelper::GetLogicalBlockSizeOfFd,
+      std::function<Status(const std::string&, size_t*)>
+          get_logical_block_size_of_directory =
+              PosixHelper::GetLogicalBlockSizeOfDirectory)
+      : get_logical_block_size_of_fd_(get_logical_block_size_of_fd),
+        get_logical_block_size_of_directory_(
+            get_logical_block_size_of_directory) {}
+
+  // Takes the following actions:
+  // 1. Increases the reference count of the directories;
+  // 2. If a directory's logical block size is not cached,
+  //    computes the size and caches the result.
+  Status RefAndCacheLogicalBlockSize(
+      const std::vector<std::string>& directories);
+
+  // Takes the following actions:
+  // 1. Decreases the reference count of the directories;
+  // 2. If the reference count of a directory reaches 0, removes the
+  //    directory from the cache.
+  void UnrefAndTryRemoveCachedLogicalBlockSize(
+      const std::vector<std::string>& directories);
+
+  // Returns the logical block size for the file.
+  //
+  // If the file is under a cached directory, returns the cached size.
+  // Otherwise, the size is computed.
+  size_t GetLogicalBlockSize(const std::string& fname, int fd);
+
+  int GetRefCount(const std::string& dir) {
+    ReadLock lock(&cache_mutex_);
+    auto it = cache_.find(dir);
+    if (it == cache_.end()) {
+      return 0;
+    }
+    return it->second.ref;
+  }
+
+  size_t Size() const { return cache_.size(); }
+
+  bool Contains(const std::string& dir) {
+    ReadLock lock(&cache_mutex_);
+    return cache_.find(dir) != cache_.end();
+  }
+
+ private:
+  struct CacheValue {
+    CacheValue() : size(0), ref(0) {}
+
+    // Logical block size of the directory.
+    size_t size;
+    // Reference count of the directory.
+    int ref;
+  };
+
+  std::function<size_t(int)> get_logical_block_size_of_fd_;
+  std::function<Status(const std::string&, size_t*)>
+      get_logical_block_size_of_directory_;
+
+  std::map<std::string, CacheValue> cache_;
+  port::RWMutex cache_mutex_;
+};
+#endif
+
+class PosixSequentialFile : public FSSequentialFile {
+ private:
+  std::string filename_;
+  FILE* file_;
+  int fd_;
+  bool use_direct_io_;
+  size_t logical_sector_size_;
+
+ public:
+  PosixSequentialFile(const std::string& fname, FILE* file, int fd,
+                      size_t logical_block_size, const EnvOptions& options);
+  virtual ~PosixSequentialFile();
+
+  virtual IOStatus Read(size_t n, const IOOptions& opts, Slice* result,
+                        char* scratch, IODebugContext* dbg) override;
+  virtual IOStatus PositionedRead(uint64_t offset, size_t n,
+                                  const IOOptions& opts, Slice* result,
+                                  char* scratch, IODebugContext* dbg) override;
+  virtual IOStatus Skip(uint64_t n) override;
+  virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
+  virtual bool use_direct_io() const override { return use_direct_io_; }
+  virtual size_t GetRequiredBufferAlignment() const override {
+    return logical_sector_size_;
+  }
+};
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+// io_uring instance queue depth
+const unsigned int kIoUringDepth = 256;
+
+inline void DeleteIOUring(void* p) {
+  struct io_uring* iu = static_cast<struct io_uring*>(p);
+  delete iu;
+}
+
+inline struct io_uring* CreateIOUring() {
+  struct io_uring* new_io_uring = new struct io_uring;
+  int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, 0);
+  if (ret) {
+    delete new_io_uring;
+    new_io_uring = nullptr;
+  }
+  return new_io_uring;
+}
+#endif  // defined(ROCKSDB_IOURING_PRESENT)
+
+class PosixRandomAccessFile : public FSRandomAccessFile {
+ protected:
+  std::string filename_;
+  int fd_;
+  bool use_direct_io_;
+  size_t logical_sector_size_;
+#if defined(ROCKSDB_IOURING_PRESENT)
+  ThreadLocalPtr* thread_local_io_urings_;
+#endif
+
+ public:
+  PosixRandomAccessFile(const std::string& fname, int fd,
+                        size_t logical_block_size, const EnvOptions& options
+#if defined(ROCKSDB_IOURING_PRESENT)
+                        ,
+                        ThreadLocalPtr* thread_local_io_urings
+#endif
+  );
+  virtual ~PosixRandomAccessFile();
+
+  virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts,
+                        Slice* result, char* scratch,
+                        IODebugContext* dbg) const override;
+
+  virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                             const IOOptions& options,
+                             IODebugContext* dbg) override;
+
+  virtual IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& opts,
+                            IODebugContext* dbg) override;
+
+#if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override;
+#endif
+  virtual void Hint(AccessPattern pattern) override;
+  virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
+  virtual bool use_direct_io() const override { return use_direct_io_; }
+  virtual size_t GetRequiredBufferAlignment() const override {
+    return logical_sector_size_;
+  }
+  // EXPERIMENTAL
+  virtual IOStatus ReadAsync(
+      FSReadRequest& req, const IOOptions& opts,
+      std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+      void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) override;
+};
+
+class PosixWritableFile : public FSWritableFile {
+ protected:
+  const std::string filename_;
+  const bool use_direct_io_;
+  int fd_;
+  uint64_t filesize_;
+  size_t logical_sector_size_;
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  bool allow_fallocate_;
+  bool fallocate_with_keep_size_;
+#endif
+#ifdef ROCKSDB_RANGESYNC_PRESENT
+  // Even if the syscall is present, the filesystem may still not properly
+  // support it, so we need to do a dynamic check too.
+  bool sync_file_range_supported_;
+#endif  // ROCKSDB_RANGESYNC_PRESENT
+
+ public:
+  explicit PosixWritableFile(const std::string& fname, int fd,
+                             size_t logical_block_size,
+                             const EnvOptions& options);
+  virtual ~PosixWritableFile();
+
+  // Need to implement this so the file is truncated correctly
+  // with direct I/O
+  virtual IOStatus Truncate(uint64_t size, const IOOptions& opts,
+                            IODebugContext* dbg) override;
+  virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual IOStatus Append(const Slice& data, const IOOptions& opts,
+                          IODebugContext* dbg) override;
+  virtual IOStatus Append(const Slice& data, const IOOptions& opts,
+                          const DataVerificationInfo& /* verification_info */,
+                          IODebugContext* dbg) override {
+    return Append(data, opts, dbg);
+  }
+  virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                                    const IOOptions& opts,
+                                    IODebugContext* dbg) override;
+  virtual IOStatus PositionedAppend(
+      const Slice& data, uint64_t offset, const IOOptions& opts,
+      const DataVerificationInfo& /* verification_info */,
+      IODebugContext* dbg) override {
+    return PositionedAppend(data, offset, opts, dbg);
+  }
+  virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual bool IsSyncThreadSafe() const override;
+  virtual bool use_direct_io() const override { return use_direct_io_; }
+  virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override;
+  virtual uint64_t GetFileSize(const IOOptions& opts,
+                               IODebugContext* dbg) override;
+  virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
+  virtual size_t GetRequiredBufferAlignment() const override {
+    return logical_sector_size_;
+  }
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  virtual IOStatus Allocate(uint64_t offset, uint64_t len,
+                            const IOOptions& opts,
+                            IODebugContext* dbg) override;
+#endif
+  virtual IOStatus RangeSync(uint64_t offset, uint64_t nbytes,
+                             const IOOptions& opts,
+                             IODebugContext* dbg) override;
+#ifdef OS_LINUX
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override;
+#endif
+};
+
+// mmap() based random-access
+class PosixMmapReadableFile : public FSRandomAccessFile {
+ private:
+  int fd_;
+  std::string filename_;
+  void* mmapped_region_;
+  size_t length_;
+
+ public:
+  PosixMmapReadableFile(const int fd, const std::string& fname, void* base,
+                        size_t length, const EnvOptions& options);
+  virtual ~PosixMmapReadableFile();
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, Slice* result,
+                char* scratch, IODebugContext* dbg) const override;
+  void Hint(AccessPattern pattern) override;
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+};
+
+class PosixMmapFile : public FSWritableFile {
+ private: + std::string filename_; + int fd_; + size_t page_size_; + size_t map_size_; // How much extra memory to map at a time + char* base_; // The mapped region + char* limit_; // Limit of the mapped region + char* dst_; // Where to write next (in range [base_,limit_]) + char* last_sync_; // Where have we synced up to + uint64_t file_offset_; // Offset of base_ in file +#ifdef ROCKSDB_FALLOCATE_PRESENT + bool allow_fallocate_; // If false, fallocate calls are bypassed + bool fallocate_with_keep_size_; +#endif + + // Roundup x to a multiple of y + static size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; } + + size_t TruncateToPageBoundary(size_t s) { + s -= (s & (page_size_ - 1)); + assert((s % page_size_) == 0); + return s; + } + + IOStatus MapNewRegion(); + IOStatus UnmapCurrentRegion(); + IOStatus Msync(); + + public: + PosixMmapFile(const std::string& fname, int fd, size_t page_size, + const EnvOptions& options); + ~PosixMmapFile(); + + // Means Close() will properly take care of truncate + // and it does not need any additional information + virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Append(const Slice& data, const IOOptions& opts, + IODebugContext* dbg) override; + virtual IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } + virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + virtual uint64_t GetFileSize(const IOOptions& opts, + IODebugContext* dbg) override; + virtual IOStatus InvalidateCache(size_t offset, size_t length) override; +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual IOStatus Allocate(uint64_t offset, uint64_t len, + const IOOptions& opts, + IODebugContext* dbg) override; +#endif +}; + +class PosixRandomRWFile : public FSRandomRWFile { + public: + explicit PosixRandomRWFile(const std::string& fname, int fd, + const EnvOptions& options); + virtual ~PosixRandomRWFile(); + + virtual IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& opts, IODebugContext* dbg) override; + + virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; + + private: + const std::string filename_; + int fd_; +}; + +struct PosixMemoryMappedFileBuffer : public MemoryMappedFileBuffer { + PosixMemoryMappedFileBuffer(void* _base, size_t _length) + : MemoryMappedFileBuffer(_base, _length) {} + virtual ~PosixMemoryMappedFileBuffer(); +}; + +class PosixDirectory : public FSDirectory { + public: + explicit PosixDirectory(int fd, const std::string& directory_name); + ~PosixDirectory(); + virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + + virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; + + virtual IOStatus 
FsyncWithDirOptions( + const IOOptions&, IODebugContext*, + const DirFsyncOptions& dir_fsync_options) override; + + private: + int fd_; + bool is_btrfs_; + const std::string directory_name_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/mock_env.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/mock_env.cc new file mode 100644 index 0000000000..c232af61eb --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/mock_env.cc @@ -0,0 +1,1058 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "env/mock_env.h" + +#include +#include + +#include "env/emulated_clock.h" +#include "file/filename.h" +#include "port/sys_time.h" +#include "rocksdb/file_system.h" +#include "rocksdb/utilities/options_type.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/hash.h" +#include "util/random.h" +#include "util/rate_limiter_impl.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +int64_t MaybeCurrentTime(const std::shared_ptr& clock) { + int64_t time = 1337346000; // arbitrary fallback default + clock->GetCurrentTime(&time).PermitUncheckedError(); + return time; +} + +static std::unordered_map time_elapse_type_info = { + {"time_elapse_only_sleep", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + auto clock = static_cast(addr); + clock->SetTimeElapseOnlySleep(ParseBoolean("", value)); + return Status::OK(); + }, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto clock = static_cast(addr); + *value = clock->IsTimeElapseOnlySleep() ? "true" : "false"; + return Status::OK(); + }, + nullptr}}, +}; +static std::unordered_map mock_sleep_type_info = { + {"mock_sleep", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + auto clock = static_cast(addr); + clock->SetMockSleep(ParseBoolean("", value)); + return Status::OK(); + }, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto clock = static_cast(addr); + *value = clock->IsMockSleepEnabled() ? 
"true" : "false"; + return Status::OK(); + }, + nullptr}}, +}; +} // namespace + +EmulatedSystemClock::EmulatedSystemClock( + const std::shared_ptr& base, bool time_elapse_only_sleep) + : SystemClockWrapper(base), + maybe_starting_time_(MaybeCurrentTime(base)), + time_elapse_only_sleep_(time_elapse_only_sleep), + no_slowdown_(time_elapse_only_sleep) { + RegisterOptions("", this, &time_elapse_type_info); + RegisterOptions("", this, &mock_sleep_type_info); +} + +class MemFile { + public: + explicit MemFile(SystemClock* clock, const std::string& fn, + bool _is_lock_file = false) + : clock_(clock), + fn_(fn), + refs_(0), + is_lock_file_(_is_lock_file), + locked_(false), + size_(0), + modified_time_(Now()), + rnd_(Lower32of64(GetSliceNPHash64(fn))), + fsynced_bytes_(0) {} + // No copying allowed. + MemFile(const MemFile&) = delete; + void operator=(const MemFile&) = delete; + + void Ref() { + MutexLock lock(&mutex_); + ++refs_; + } + + bool is_lock_file() const { return is_lock_file_; } + + bool Lock() { + assert(is_lock_file_); + MutexLock lock(&mutex_); + if (locked_) { + return false; + } else { + locked_ = true; + return true; + } + } + + void Unlock() { + assert(is_lock_file_); + MutexLock lock(&mutex_); + locked_ = false; + } + + void Unref() { + bool do_delete = false; + { + MutexLock lock(&mutex_); + --refs_; + assert(refs_ >= 0); + if (refs_ <= 0) { + do_delete = true; + } + } + + if (do_delete) { + delete this; + } + } + + uint64_t Size() const { return size_; } + + void Truncate(size_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + MutexLock lock(&mutex_); + if (size < size_) { + data_.resize(size); + size_ = size; + } + } + + void CorruptBuffer() { + if (fsynced_bytes_ >= size_) { + return; + } + uint64_t buffered_bytes = size_ - fsynced_bytes_; + uint64_t start = + fsynced_bytes_ + rnd_.Uniform(static_cast(buffered_bytes)); + uint64_t end = std::min(start + 512, size_.load()); + MutexLock lock(&mutex_); + for (uint64_t pos = start; pos < end; ++pos) { + data_[static_cast(pos)] = static_cast(rnd_.Uniform(256)); + } + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, IODebugContext* /*dbg*/) const { + { + IOStatus s; + TEST_SYNC_POINT_CALLBACK("MemFile::Read:IOStatus", &s); + if (!s.ok()) { + // with sync point only + *result = Slice(); + return s; + } + } + MutexLock lock(&mutex_); + const uint64_t available = Size() - std::min(Size(), offset); + size_t offset_ = static_cast(offset); + if (n > available) { + n = static_cast(available); + } + if (n == 0) { + *result = Slice(); + return IOStatus::OK(); + } + if (scratch) { + memcpy(scratch, &(data_[offset_]), n); + *result = Slice(scratch, n); + } else { + *result = Slice(&(data_[offset_]), n); + } + return IOStatus::OK(); + } + + IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, IODebugContext* /*dbg*/) { + MutexLock lock(&mutex_); + size_t offset_ = static_cast(offset); + if (offset + data.size() > data_.size()) { + data_.resize(offset_ + data.size()); + } + data_.replace(offset_, data.size(), data.data(), data.size()); + size_ = data_.size(); + modified_time_ = Now(); + return IOStatus::OK(); + } + + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + MutexLock lock(&mutex_); + data_.append(data.data(), data.size()); + size_ = data_.size(); + modified_time_ = Now(); + return IOStatus::OK(); + } + + IOStatus Fsync(const IOOptions& /*options*/, IODebugContext* /*dbg*/) { + 
fsynced_bytes_ = size_.load(); + return IOStatus::OK(); + } + + uint64_t ModifiedTime() const { return modified_time_; } + + private: + uint64_t Now() { + int64_t unix_time = 0; + auto s = clock_->GetCurrentTime(&unix_time); + assert(s.ok()); + return static_cast(unix_time); + } + + // Private since only Unref() should be used to delete it. + ~MemFile() { assert(refs_ == 0); } + + SystemClock* clock_; + const std::string fn_; + mutable port::Mutex mutex_; + int refs_; + bool is_lock_file_; + bool locked_; + + // Data written into this file, all bytes before fsynced_bytes are + // persistent. + std::string data_; + std::atomic size_; + std::atomic modified_time_; + + Random rnd_; + std::atomic fsynced_bytes_; +}; + +namespace { + +class MockSequentialFile : public FSSequentialFile { + public: + explicit MockSequentialFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_reads), + use_mmap_read_(opts.use_mmap_reads), + pos_(0) { + file_->Ref(); + } + + ~MockSequentialFile() override { file_->Unref(); } + + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override { + IOStatus s = file_->Read(pos_, n, options, result, + (use_mmap_read_) ? nullptr : scratch, dbg); + if (s.ok()) { + pos_ += result->size(); + } + return s; + } + + bool use_direct_io() const override { return use_direct_io_; } + IOStatus Skip(uint64_t n) override { + if (pos_ > file_->Size()) { + return IOStatus::IOError("pos_ > file_->Size()"); + } + const uint64_t available = file_->Size() - pos_; + if (n > available) { + n = available; + } + pos_ += static_cast(n); + return IOStatus::OK(); + } + + private: + MemFile* file_; + bool use_direct_io_; + bool use_mmap_read_; + size_t pos_; +}; + +class MockRandomAccessFile : public FSRandomAccessFile { + public: + explicit MockRandomAccessFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_reads), + use_mmap_read_(opts.use_mmap_reads) { + file_->Ref(); + } + + ~MockRandomAccessFile() override { file_->Unref(); } + + bool use_direct_io() const override { return use_direct_io_; } + + IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + if (use_mmap_read_) { + return file_->Read(offset, n, options, result, nullptr, dbg); + } else { + return file_->Read(offset, n, options, result, scratch, dbg); + } + } + + private: + MemFile* file_; + bool use_direct_io_; + bool use_mmap_read_; +}; + +class MockRandomRWFile : public FSRandomRWFile { + public: + explicit MockRandomRWFile(MemFile* file) : file_(file) { file_->Ref(); } + + ~MockRandomRWFile() override { file_->Unref(); } + + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + return file_->Write(offset, data, options, dbg); + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + return file_->Read(offset, n, options, result, scratch, dbg); + } + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } + + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Sync(const IOOptions& 
options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } + + private: + MemFile* file_; +}; + +class MockWritableFile : public FSWritableFile { + public: + MockWritableFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_writes), + rate_limiter_(opts.rate_limiter) { + file_->Ref(); + } + + ~MockWritableFile() override { file_->Unref(); } + + bool use_direct_io() const override { return false && use_direct_io_; } + + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + size_t bytes_written = 0; + while (bytes_written < data.size()) { + auto bytes = RequestToken(data.size() - bytes_written); + IOStatus s = file_->Append(Slice(data.data() + bytes_written, bytes), + options, dbg); + if (!s.ok()) { + return s; + } + bytes_written += bytes; + } + return IOStatus::OK(); + } + + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& data, uint64_t /*offset*/, + const IOOptions& options, + IODebugContext* dbg) override { + assert(use_direct_io_); + return Append(data, options, dbg); + } + + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override { + file_->Truncate(static_cast(size), options, dbg); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } + + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } + + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return file_->Size(); + } + + private: + inline size_t RequestToken(size_t bytes) { + if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) { + bytes = std::min( + bytes, static_cast(rate_limiter_->GetSingleBurstBytes())); + rate_limiter_->Request(bytes, io_priority_); + } + return bytes; + } + + MemFile* file_; + bool use_direct_io_; + RateLimiter* rate_limiter_; +}; + +class MockEnvDirectory : public FSDirectory { + public: + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } +}; + +class MockEnvFileLock : public FileLock { + public: + explicit MockEnvFileLock(const std::string& fname) : fname_(fname) {} + + std::string FileName() const { return fname_; } + + private: + const std::string fname_; +}; + +class TestMemLogger : public Logger { + private: + std::unique_ptr file_; + std::atomic_size_t log_size_; + static const uint64_t flush_every_seconds_ = 5; + std::atomic_uint_fast64_t last_flush_micros_; + SystemClock* clock_; + IOOptions options_; + IODebugContext* dbg_; + std::atomic flush_pending_; + + public: + TestMemLogger(std::unique_ptr f, SystemClock* clock, + const IOOptions& options, IODebugContext* dbg, + const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) + : Logger(log_level), + file_(std::move(f)), + log_size_(0), + last_flush_micros_(0), + clock_(clock), + options_(options), + dbg_(dbg), + flush_pending_(false) {} + ~TestMemLogger() override {} + + void Flush() override { + if (flush_pending_) { + flush_pending_ = false; + } + last_flush_micros_ = clock_->NowMicros(); + } + + using Logger::Logv; + void Logv(const char* format, va_list ap) override 
{ + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + port::TimeVal now_tv; + port::GetTimeOfDay(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + memset(&t, 0, sizeof(t)); + struct tm* ret __attribute__((__unused__)); + ret = port::LocalTimeR(&seconds, &t); + assert(ret); + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast(now_tv.tv_usec)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + const size_t write_size = p - base; + + Status s = file_->Append(Slice(base, write_size), options_, dbg_); + if (s.ok()) { + flush_pending_ = true; + log_size_ += write_size; + } + uint64_t now_micros = + static_cast(now_tv.tv_sec) * 1000000 + now_tv.tv_usec; + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + flush_pending_ = false; + last_flush_micros_ = now_micros; + } + if (base != buffer) { + delete[] base; + } + break; + } + } + size_t GetLogFileSize() const override { return log_size_; } +}; + +static std::unordered_map mock_fs_type_info = { + {"supports_direct_io", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; +} // namespace + +MockFileSystem::MockFileSystem(const std::shared_ptr& clock, + bool supports_direct_io) + : system_clock_(clock), supports_direct_io_(supports_direct_io) { + clock_ = system_clock_.get(); + RegisterOptions("", &supports_direct_io_, &mock_fs_type_info); +} + +MockFileSystem::~MockFileSystem() { + for (auto i = file_map_.begin(); i != file_map_.end(); ++i) { + i->second->Unref(); + } +} + +Status MockFileSystem::PrepareOptions(const ConfigOptions& options) { + Status s = FileSystem::PrepareOptions(options); + if (s.ok() && system_clock_ == SystemClock::Default()) { + system_clock_ = options.env->GetSystemClock(); + clock_ = system_clock_.get(); + } + return s; +} + +IOStatus MockFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) { + *output_path = NormalizeMockPath(db_path); + if (output_path->at(0) != '/') { + return IOStatus::NotSupported("GetAbsolutePath"); + } else { + return IOStatus::OK(); + } +} + +std::string MockFileSystem::NormalizeMockPath(const std::string& path) { + std::string p = NormalizePath(path); + if (p.back() == kFilePathSeparator && p.size() > 1) { + p.pop_back(); + } + return p; +} + +// Partial implementation of the FileSystem interface. 
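TestMemLogger::Logv() above uses the classic two-pass vsnprintf pattern: try a small stack buffer first, and only pay for a heap allocation when the message does not fit. A self-contained sketch of the same pattern (ours, not from the vendored sources); the FileSystem implementation continues right after this aside:

#include <cstdarg>
#include <cstdio>
#include <string>

std::string FormatTwoPass(const char* fmt, ...) {
  char stack_buf[500];  // same first-pass size TestMemLogger uses
  va_list ap;
  va_start(ap, fmt);
  va_list backup;
  va_copy(backup, ap);  // ap is consumed by the first vsnprintf pass
  int needed = vsnprintf(stack_buf, sizeof(stack_buf), fmt, ap);
  va_end(ap);
  if (needed < 0) {  // encoding error
    va_end(backup);
    return std::string();
  }
  if (static_cast<size_t>(needed) < sizeof(stack_buf)) {  // first pass fit
    va_end(backup);
    return std::string(stack_buf, needed);
  }
  // Second pass with an exactly-sized heap buffer (+1 for the terminator).
  std::string out(static_cast<size_t>(needed) + 1, '\0');
  vsnprintf(&out[0], out.size(), fmt, backup);
  va_end(backup);
  out.resize(needed);
  return out;
}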
+IOStatus MockFileSystem::NewSequentialFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSSequentialFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    *result = nullptr;
+    return IOStatus::PathNotFound(fn);
+  }
+  auto* f = file_map_[fn];
+  if (f->is_lock_file()) {
+    return IOStatus::InvalidArgument(fn, "Cannot open a lock file.");
+  } else if (file_opts.use_direct_reads && !supports_direct_io_) {
+    return IOStatus::NotSupported("Direct I/O Not Supported");
+  } else {
+    result->reset(new MockSequentialFile(f, file_opts));
+    return IOStatus::OK();
+  }
+}
+
+IOStatus MockFileSystem::NewRandomAccessFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    *result = nullptr;
+    return IOStatus::PathNotFound(fn);
+  }
+  auto* f = file_map_[fn];
+  if (f->is_lock_file()) {
+    return IOStatus::InvalidArgument(fn, "Cannot open a lock file.");
+  } else if (file_opts.use_direct_reads && !supports_direct_io_) {
+    return IOStatus::NotSupported("Direct I/O Not Supported");
+  } else {
+    result->reset(new MockRandomAccessFile(f, file_opts));
+    return IOStatus::OK();
+  }
+}
+
+IOStatus MockFileSystem::NewRandomRWFile(
+    const std::string& fname, const FileOptions& /*file_opts*/,
+    std::unique_ptr<FSRandomRWFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    *result = nullptr;
+    return IOStatus::PathNotFound(fn);
+  }
+  auto* f = file_map_[fn];
+  if (f->is_lock_file()) {
+    return IOStatus::InvalidArgument(fn, "Cannot open a lock file.");
+  }
+  result->reset(new MockRandomRWFile(f));
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::ReuseWritableFile(
+    const std::string& fname, const std::string& old_fname,
+    const FileOptions& options, std::unique_ptr<FSWritableFile>* result,
+    IODebugContext* dbg) {
+  auto s = RenameFile(old_fname, fname, IOOptions(), dbg);
+  if (!s.ok()) {
+    return s;
+  } else {
+    result->reset();
+    return NewWritableFile(fname, options, result, dbg);
+  }
+}
+
+IOStatus MockFileSystem::NewWritableFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) != file_map_.end()) {
+    DeleteFileInternal(fn);
+  }
+  MemFile* file = new MemFile(clock_, fn, false);
+  file->Ref();
+  file_map_[fn] = file;
+  if (file_opts.use_direct_writes && !supports_direct_io_) {
+    return IOStatus::NotSupported("Direct I/O Not Supported");
+  } else {
+    result->reset(new MockWritableFile(file, file_opts));
+    return IOStatus::OK();
+  }
+}
+
+IOStatus MockFileSystem::ReopenWritableFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  MemFile* file = nullptr;
+  if (file_map_.find(fn) == file_map_.end()) {
+    file = new MemFile(clock_, fn, false);
+    // Only take a reference when we create the file object
+    file->Ref();
+    file_map_[fn] = file;
+  } else {
+    file = file_map_[fn];
+  }
+  if (file_opts.use_direct_writes && !supports_direct_io_) {
+    return IOStatus::NotSupported("Direct I/O Not Supported");
+  } else {
+    result->reset(new MockWritableFile(file, file_opts));
+    return 
IOStatus::OK(); + } +} + +IOStatus MockFileSystem::NewDirectory(const std::string& /*name*/, + const IOOptions& /*io_opts*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { + result->reset(new MockEnvDirectory()); + return IOStatus::OK(); +} + +IOStatus MockFileSystem::FileExists(const std::string& fname, + const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) != file_map_.end()) { + // File exists + return IOStatus::OK(); + } + // Now also check if fn exists as a dir + for (const auto& iter : file_map_) { + const std::string& filename = iter.first; + if (filename.size() >= fn.size() + 1 && filename[fn.size()] == '/' && + Slice(filename).starts_with(Slice(fn))) { + return IOStatus::OK(); + } + } + return IOStatus::NotFound(); +} + +bool MockFileSystem::GetChildrenInternal(const std::string& dir, + std::vector* result) { + auto d = NormalizeMockPath(dir); + bool found_dir = false; + result->clear(); + for (const auto& iter : file_map_) { + const std::string& filename = iter.first; + + if (filename == d) { + found_dir = true; + } else if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' && + Slice(filename).starts_with(Slice(d))) { + found_dir = true; + size_t next_slash = filename.find('/', d.size() + 1); + if (next_slash != std::string::npos) { + result->push_back( + filename.substr(d.size() + 1, next_slash - d.size() - 1)); + } else { + result->push_back(filename.substr(d.size() + 1)); + } + } + } + result->erase(std::unique(result->begin(), result->end()), result->end()); + return found_dir; +} + +IOStatus MockFileSystem::GetChildren(const std::string& dir, + const IOOptions& /*options*/, + std::vector* result, + IODebugContext* /*dbg*/) { + MutexLock lock(&mutex_); + bool found_dir = GetChildrenInternal(dir, result); +#ifndef __clang_analyzer__ + return found_dir ? IOStatus::OK() : IOStatus::NotFound(dir); +#else + return found_dir ? 
IOStatus::OK() : IOStatus::NotFound(); +#endif +} + +void MockFileSystem::DeleteFileInternal(const std::string& fname) { + assert(fname == NormalizeMockPath(fname)); + const auto& pair = file_map_.find(fname); + if (pair != file_map_.end()) { + pair->second->Unref(); + file_map_.erase(fname); + } +} + +IOStatus MockFileSystem::DeleteFile(const std::string& fname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) == file_map_.end()) { + return IOStatus::PathNotFound(fn); + } + + DeleteFileInternal(fn); + return IOStatus::OK(); +} + +IOStatus MockFileSystem::Truncate(const std::string& fname, size_t size, + const IOOptions& options, + IODebugContext* dbg) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return IOStatus::PathNotFound(fn); + } + iter->second->Truncate(size, options, dbg); + return IOStatus::OK(); +} + +IOStatus MockFileSystem::CreateDir(const std::string& dirname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto dn = NormalizeMockPath(dirname); + MutexLock lock(&mutex_); + if (file_map_.find(dn) == file_map_.end()) { + MemFile* file = new MemFile(clock_, dn, false); + file->Ref(); + file_map_[dn] = file; + } else { + return IOStatus::IOError(); + } + return IOStatus::OK(); +} + +IOStatus MockFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + CreateDir(dirname, options, dbg).PermitUncheckedError(); + return IOStatus::OK(); +} + +IOStatus MockFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto dir = NormalizeMockPath(dirname); + MutexLock lock(&mutex_); + if (file_map_.find(dir) == file_map_.end()) { + return IOStatus::PathNotFound(dir); + } else { + std::vector children; + if (GetChildrenInternal(dir, &children)) { + for (const auto& child : children) { + DeleteFileInternal(child); + } + } + DeleteFileInternal(dir); + return IOStatus::OK(); + } +} + +IOStatus MockFileSystem::GetFileSize(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* file_size, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + TEST_SYNC_POINT_CALLBACK("MockFileSystem::GetFileSize:CheckFileType", &fn); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return IOStatus::PathNotFound(fn); + } + + *file_size = iter->second->Size(); + return IOStatus::OK(); +} + +IOStatus MockFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* time, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return IOStatus::PathNotFound(fn); + } + *time = iter->second->ModifiedTime(); + return IOStatus::OK(); +} + +bool MockFileSystem::RenameFileInternal(const std::string& src, + const std::string& dest) { + if (file_map_.find(src) == file_map_.end()) { + return false; + } else { + std::vector children; + if (GetChildrenInternal(src, &children)) { + for (const auto& child : children) { + RenameFileInternal(src + "/" + child, dest + "/" + child); + } + } + DeleteFileInternal(dest); + file_map_[dest] = file_map_[src]; + file_map_.erase(src); + return true; + } +} + +IOStatus MockFileSystem::RenameFile(const std::string& src, + const std::string& dest, + const 
IOOptions& /*options*/,
+                                    IODebugContext* /*dbg*/) {
+  auto s = NormalizeMockPath(src);
+  auto t = NormalizeMockPath(dest);
+  MutexLock lock(&mutex_);
+  bool found = RenameFileInternal(s, t);
+  if (!found) {
+    return IOStatus::PathNotFound(s);
+  } else {
+    return IOStatus::OK();
+  }
+}
+
+IOStatus MockFileSystem::LinkFile(const std::string& src,
+                                  const std::string& dest,
+                                  const IOOptions& /*options*/,
+                                  IODebugContext* /*dbg*/) {
+  auto s = NormalizeMockPath(src);
+  auto t = NormalizeMockPath(dest);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(s) == file_map_.end()) {
+    return IOStatus::PathNotFound(s);
+  }
+
+  DeleteFileInternal(t);
+  file_map_[t] = file_map_[s];
+  file_map_[t]->Ref();  // Otherwise it might get deleted when no one uses s
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::NewLogger(const std::string& fname,
+                                   const IOOptions& io_opts,
+                                   std::shared_ptr<Logger>* result,
+                                   IODebugContext* dbg) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  MemFile* file = nullptr;
+  if (iter == file_map_.end()) {
+    file = new MemFile(clock_, fn, false);
+    file->Ref();
+    file_map_[fn] = file;
+  } else {
+    file = iter->second;
+  }
+  std::unique_ptr<FSWritableFile> f(new MockWritableFile(file, FileOptions()));
+  result->reset(new TestMemLogger(std::move(f), clock_, io_opts, dbg));
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::LockFile(const std::string& fname,
+                                  const IOOptions& /*options*/,
+                                  FileLock** flock, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fn) != file_map_.end()) {
+      if (!file_map_[fn]->is_lock_file()) {
+        return IOStatus::InvalidArgument(fname, "Not a lock file.");
+      }
+      if (!file_map_[fn]->Lock()) {
+        return IOStatus::IOError(fn, "lock is already held.");
+      }
+    } else {
+      auto* file = new MemFile(clock_, fn, true);
+      file->Ref();
+      file->Lock();
+      file_map_[fn] = file;
+    }
+  }
+  *flock = new MockEnvFileLock(fn);
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::UnlockFile(FileLock* flock,
+                                    const IOOptions& /*options*/,
+                                    IODebugContext* /*dbg*/) {
+  std::string fn = static_cast_with_check<MockEnvFileLock>(flock)->FileName();
+  {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fn) != file_map_.end()) {
+      if (!file_map_[fn]->is_lock_file()) {
+        return IOStatus::InvalidArgument(fn, "Not a lock file.");
+      }
+      file_map_[fn]->Unlock();
+    }
+  }
+  delete flock;
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::GetTestDirectory(const IOOptions& /*options*/,
+                                          std::string* path,
+                                          IODebugContext* /*dbg*/) {
+  *path = "/test";
+  return IOStatus::OK();
+}
+
+Status MockFileSystem::CorruptBuffer(const std::string& fname) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  if (iter == file_map_.end()) {
+    return Status::IOError(fn, "File not found");
+  }
+  iter->second->CorruptBuffer();
+  return Status::OK();
+}
+
+MockEnv::MockEnv(Env* env, const std::shared_ptr<FileSystem>& fs,
+                 const std::shared_ptr<SystemClock>& clock)
+    : CompositeEnvWrapper(env, fs, clock) {}
+
+MockEnv* MockEnv::Create(Env* env) {
+  auto clock =
+      std::make_shared<EmulatedSystemClock>(env->GetSystemClock(), true);
+  return MockEnv::Create(env, clock);
+}
+
+MockEnv* MockEnv::Create(Env* env, const std::shared_ptr<SystemClock>& clock) {
+  auto fs = std::make_shared<MockFileSystem>(clock);
+  return new MockEnv(env, fs, clock);
+}
+
+Status MockEnv::CorruptBuffer(const std::string& fname) {
+  auto mock = static_cast_with_check<MockFileSystem>(GetFileSystem().get());
+  return mock->CorruptBuffer(fname);
+}
+
+// This is to maintain the
behavior before switching from InMemoryEnv to MockEnv
+Env* NewMemEnv(Env* base_env) { return MockEnv::Create(base_env); }
+
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/mock_env.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/mock_env.h
new file mode 100644
index 0000000000..406a31f635
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/mock_env.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+class MemFile;
+class MockFileSystem : public FileSystem {
+ public:
+  explicit MockFileSystem(const std::shared_ptr<SystemClock>& clock,
+                          bool supports_direct_io = true);
+  ~MockFileSystem() override;
+
+  static const char* kClassName() { return "MemoryFileSystem"; }
+  const char* Name() const override { return kClassName(); }
+  IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts,
+                             std::unique_ptr<FSSequentialFile>* r,
+                             IODebugContext* dbg) override;
+  IOStatus NewRandomAccessFile(const std::string& f,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* r,
+                               IODebugContext* dbg) override;
+
+  IOStatus NewRandomRWFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override;
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* dbg) override;
+  IOStatus NewWritableFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override;
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& options,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override;
+  IOStatus NewDirectory(const std::string& /*name*/, const IOOptions& io_opts,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override;
+  IOStatus FileExists(const std::string& fname, const IOOptions& /*io_opts*/,
+                      IODebugContext* /*dbg*/) override;
+  IOStatus GetChildren(const std::string& dir, const IOOptions& options,
+                       std::vector<std::string>* result,
+                       IODebugContext* dbg) override;
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override;
+  IOStatus Truncate(const std::string& fname, size_t size,
+                    const IOOptions& options, IODebugContext* dbg) override;
+  IOStatus CreateDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+  IOStatus CreateDirIfMissing(const std::string& dirname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) override;
+  IOStatus DeleteDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
+                       uint64_t* file_size, IODebugContext* dbg) override;
+
+  IOStatus GetFileModificationTime(const std::string& fname,
+                                   const IOOptions& options,
uint64_t* file_mtime,
+                                   IODebugContext* dbg) override;
+  IOStatus RenameFile(const std::string& src, const std::string& target,
+                      const IOOptions& options, IODebugContext* dbg) override;
+  IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override;
+  IOStatus LockFile(const std::string& fname, const IOOptions& options,
+                    FileLock** lock, IODebugContext* dbg) override;
+  IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
+                      IODebugContext* dbg) override;
+  IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+                            IODebugContext* dbg) override;
+  IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts,
+                     std::shared_ptr<Logger>* result,
+                     IODebugContext* dbg) override;
+  // Get full directory name for this db.
+  IOStatus GetAbsolutePath(const std::string& db_path,
+                           const IOOptions& /*options*/,
+                           std::string* output_path,
+                           IODebugContext* /*dbg*/) override;
+  IOStatus IsDirectory(const std::string& /*path*/,
+                       const IOOptions& /*options*/, bool* /*is_dir*/,
+                       IODebugContext* /*dbg*/) override {
+    return IOStatus::NotSupported("IsDirectory");
+  }
+
+  Status CorruptBuffer(const std::string& fname);
+  Status PrepareOptions(const ConfigOptions& options) override;
+
+ private:
+  bool RenameFileInternal(const std::string& src, const std::string& dest);
+  void DeleteFileInternal(const std::string& fname);
+  bool GetChildrenInternal(const std::string& fname,
+                           std::vector<std::string>* results);
+
+  std::string NormalizeMockPath(const std::string& path);
+
+ private:
+  // Map from filenames to MemFile objects, representing a simple file system.
+  port::Mutex mutex_;
+  std::map<std::string, MemFile*> file_map_;  // Protected by mutex_.
+  std::shared_ptr<SystemClock> system_clock_;
+  SystemClock* clock_;
+  bool supports_direct_io_;
+};
+
+class MockEnv : public CompositeEnvWrapper {
+ public:
+  static MockEnv* Create(Env* base);
+  static MockEnv* Create(Env* base, const std::shared_ptr<SystemClock>& clock);
+
+  static const char* kClassName() { return "MockEnv"; }
+  const char* Name() const override { return kClassName(); }
+
+  Status CorruptBuffer(const std::string& fname);
+
+ private:
+  MockEnv(Env* env, const std::shared_ptr<FileSystem>& fs,
+          const std::shared_ptr<SystemClock>& clock);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/unique_id_gen.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/unique_id_gen.cc
new file mode 100644
index 0000000000..a1986fa15d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/unique_id_gen.cc
@@ -0,0 +1,164 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "env/unique_id_gen.h"
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <random>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/version.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+struct GenerateRawUniqueIdOpts {
+  Env* env = Env::Default();
+  bool exclude_port_uuid = false;
+  bool exclude_env_details = false;
+  bool exclude_random_device = false;
+};
+
+// Each of these "tracks" below should be sufficient for generating 128 bits
+// of entropy, after hashing the raw bytes.
The tracks are separable for +// testing purposes, but in production we combine as many tracks as possible +// to ensure quality results even if some environments have degraded +// capabilities or quality in some APIs. +// +// This approach has not been validated for use in cryptography. The goal is +// generating globally unique values with high probability without coordination +// between instances. +// +// Linux performance: EntropyTrackRandomDevice is much faster than +// EntropyTrackEnvDetails, which is much faster than EntropyTrackPortUuid. + +struct EntropyTrackPortUuid { + std::array uuid; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + if (opts.exclude_port_uuid) { + return; + } + std::string s; + port::GenerateRfcUuid(&s); + if (s.size() >= uuid.size()) { + std::copy_n(s.begin(), uuid.size(), uuid.begin()); + } + } +}; + +struct EntropyTrackEnvDetails { + std::array hostname_buf; + int64_t process_id; + uint64_t thread_id; + int64_t unix_time; + uint64_t nano_time; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + if (opts.exclude_env_details) { + return; + } + opts.env->GetHostName(hostname_buf.data(), hostname_buf.size()) + .PermitUncheckedError(); + process_id = port::GetProcessID(); + thread_id = opts.env->GetThreadID(); + opts.env->GetCurrentTime(&unix_time).PermitUncheckedError(); + nano_time = opts.env->NowNanos(); + } +}; + +struct EntropyTrackRandomDevice { + using RandType = std::random_device::result_type; + static constexpr size_t kNumRandVals = + /* generous bits */ 192U / (8U * sizeof(RandType)); + std::array rand_vals; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + if (opts.exclude_random_device) { + return; + } + std::random_device r; + for (auto& val : rand_vals) { + val = r(); + } + } +}; + +struct Entropy { + uint64_t version_identifier; + EntropyTrackRandomDevice et1; + EntropyTrackEnvDetails et2; + EntropyTrackPortUuid et3; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + // If we change the format of what goes into the entropy inputs, it's + // conceivable there could be a physical collision in the hash input + // even though they are logically different. This value should change + // if there's a change to the "schema" here, including byte order. 
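+    // As a concrete sketch (illustrative values only; this vendored tree is
+    // rocksdb-8.3.2, so MAJOR = 8, MINOR = 3, PATCH = 2), the packing below
+    // yields version_identifier = (8 << 32) + (3 << 16) + 2 = 0x800030002.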
+    version_identifier = (uint64_t{ROCKSDB_MAJOR} << 32) +
+                         (uint64_t{ROCKSDB_MINOR} << 16) +
+                         uint64_t{ROCKSDB_PATCH};
+    et1.Populate(opts);
+    et2.Populate(opts);
+    et3.Populate(opts);
+  }
+};
+
+void GenerateRawUniqueIdImpl(uint64_t* a, uint64_t* b,
+                             const GenerateRawUniqueIdOpts& opts) {
+  Entropy e;
+  std::memset(&e, 0, sizeof(e));
+  e.Populate(opts);
+  Hash2x64(reinterpret_cast<const char*>(&e), sizeof(e), a, b);
+}
+
+}  // namespace
+
+void GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid) {
+  GenerateRawUniqueIdOpts opts;
+  opts.exclude_port_uuid = exclude_port_uuid;
+  assert(!opts.exclude_env_details);
+  assert(!opts.exclude_random_device);
+  GenerateRawUniqueIdImpl(a, b, opts);
+}
+
+#ifndef NDEBUG
+void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid,
+                              bool exclude_env_details,
+                              bool exclude_random_device) {
+  GenerateRawUniqueIdOpts opts;
+  opts.exclude_port_uuid = exclude_port_uuid;
+  opts.exclude_env_details = exclude_env_details;
+  opts.exclude_random_device = exclude_random_device;
+  GenerateRawUniqueIdImpl(a, b, opts);
+}
+#endif
+
+void SemiStructuredUniqueIdGen::Reset() {
+  saved_process_id_ = port::GetProcessID();
+  GenerateRawUniqueId(&base_upper_, &base_lower_);
+  counter_ = 0;
+}
+
+void SemiStructuredUniqueIdGen::GenerateNext(uint64_t* upper, uint64_t* lower) {
+  if (port::GetProcessID() == saved_process_id_) {
+    // Safe to increment the atomic for guaranteed uniqueness within this
+    // process lifetime. Xor slightly better than +. See
+    // https://github.com/pdillinger/unique_id
+    *lower = base_lower_ ^ counter_.fetch_add(1);
+    *upper = base_upper_;
+  } else {
+    // There must have been a fork() or something. Rather than attempting to
+    // update in a thread-safe way, simply fall back on GenerateRawUniqueId.
+    GenerateRawUniqueId(upper, lower);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/unique_id_gen.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/unique_id_gen.h
new file mode 100644
index 0000000000..69033f8d3a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/env/unique_id_gen.h
@@ -0,0 +1,85 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// This file is for functions that generate unique identifiers, at least in
+// part, by extracting novel entropy or sources of uniqueness from the
+// execution environment. (By contrast, random.h is for algorithmic
+// pseudorandomness.)
+//
+// These functions could eventually migrate to public APIs, such as in Env.
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <type_traits>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Generates a new 128-bit identifier that is universally unique
+// (with high probability) for each call. The result is split into
+// two 64-bit pieces. This function has NOT been validated for use in
+// cryptography.
+//
+// This is used in generating DB session IDs and by Env::GenerateUniqueId
+// (used for DB IDENTITY) if the platform does not provide a generator of
+// RFC 4122 UUIDs or fails somehow. (Set exclude_port_uuid=true if this
+// function is used as a fallback for GenerateRfcUuid, because there is no
+// need to try it again.)
+void GenerateRawUniqueId(uint64_t* a, uint64_t* b,
+                         bool exclude_port_uuid = false);
+
+#ifndef NDEBUG
+// A version of the above with options for challenge testing
+void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid,
+                              bool exclude_env_details,
+                              bool exclude_random_device);
+#endif
+
+// Generates globally unique ids with lower probability of any collisions
+// vs. each unique id being independently random (GenerateRawUniqueId).
+// We call this "semi-structured" because between different
+// SemiStructuredUniqueIdGen objects, the IDs are separated by random
+// intervals (unstructured), but within a single SemiStructuredUniqueIdGen
+// object, the generated IDs are trivially related (structured). See
+// https://github.com/pdillinger/unique_id for how this improves probability
+// of no collision. In short, if we have n SemiStructuredUniqueIdGen
+// objects each generating m IDs, the first collision is expected at
+// around n = sqrt(2^128 / m), equivalently n * sqrt(m) = 2^64,
+// rather than n * m = 2^64 for fully random IDs.
+class SemiStructuredUniqueIdGen {
+ public:
+  // Initializes with random starting state (from GenerateRawUniqueId)
+  SemiStructuredUniqueIdGen() { Reset(); }
+  // Re-initializes, but not thread-safe
+  void Reset();
+
+  // Assuming no fork(), `lower` is guaranteed unique from one call
+  // to the next (thread safe).
+  void GenerateNext(uint64_t* upper, uint64_t* lower);
+
+  // For generating smaller values. Will cycle through all the possibilities
+  // before repeating.
+  template <typename T>
+  T GenerateNext() {
+    static_assert(sizeof(T) <= sizeof(uint64_t));
+    static_assert(std::is_integral_v<T>);
+    uint64_t ignore, val;
+    GenerateNext(&ignore, &val);
+    return static_cast<T>(val);
+  }
+
+  uint64_t GetBaseUpper() const { return base_upper_; }
+
+ private:
+  uint64_t base_upper_;
+  uint64_t base_lower_;
+  std::atomic<uint64_t> counter_;
+  int64_t saved_process_id_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/delete_scheduler.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/delete_scheduler.cc
new file mode 100644
index 0000000000..8a2d1615d0
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/delete_scheduler.cc
@@ -0,0 +1,409 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
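+
+// Informal sketch of the flow implemented below: DeleteFile() either unlinks
+// the file immediately (when rate limiting is off, or when trash already
+// exceeds max_trash_db_ratio of the DB size) or renames it to "<name>.trash"
+// and queues it; BackgroundEmptyTrash() then deletes queued files, sleeping
+// between deletes so that sustained deletion throughput stays close to
+// rate_bytes_per_sec.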
+ + +#include "file/delete_scheduler.h" + +#include +#include +#include + +#include "file/sst_file_manager_impl.h" +#include "logging/logging.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +DeleteScheduler::DeleteScheduler(SystemClock* clock, FileSystem* fs, + int64_t rate_bytes_per_sec, Logger* info_log, + SstFileManagerImpl* sst_file_manager, + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk) + : clock_(clock), + fs_(fs), + total_trash_size_(0), + rate_bytes_per_sec_(rate_bytes_per_sec), + pending_files_(0), + bytes_max_delete_chunk_(bytes_max_delete_chunk), + closing_(false), + cv_(&mu_), + bg_thread_(nullptr), + info_log_(info_log), + sst_file_manager_(sst_file_manager), + max_trash_db_ratio_(max_trash_db_ratio) { + assert(sst_file_manager != nullptr); + assert(max_trash_db_ratio >= 0); + MaybeCreateBackgroundThread(); +} + +DeleteScheduler::~DeleteScheduler() { + { + InstrumentedMutexLock l(&mu_); + closing_ = true; + cv_.SignalAll(); + } + if (bg_thread_) { + bg_thread_->join(); + } + for (const auto& it : bg_errors_) { + it.second.PermitUncheckedError(); + } +} + +Status DeleteScheduler::DeleteFile(const std::string& file_path, + const std::string& dir_to_sync, + const bool force_bg) { + if (rate_bytes_per_sec_.load() <= 0 || + (!force_bg && + total_trash_size_.load() > + sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) { + // Rate limiting is disabled or trash size makes up more than + // max_trash_db_ratio_ (default 25%) of the total DB size + TEST_SYNC_POINT("DeleteScheduler::DeleteFile"); + Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr); + if (s.ok()) { + s = sst_file_manager_->OnDeleteFile(file_path); + ROCKS_LOG_INFO(info_log_, + "Deleted file %s immediately, rate_bytes_per_sec %" PRIi64 + ", total_trash_size %" PRIu64 " max_trash_db_ratio %lf", + file_path.c_str(), rate_bytes_per_sec_.load(), + total_trash_size_.load(), max_trash_db_ratio_.load()); + InstrumentedMutexLock l(&mu_); + RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY); + } + return s; + } + + // Move file to trash + std::string trash_file; + Status s = MarkAsTrash(file_path, &trash_file); + ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(), + s.ToString().c_str()); + + if (!s.ok()) { + ROCKS_LOG_ERROR(info_log_, "Failed to mark %s as trash -- %s", + file_path.c_str(), s.ToString().c_str()); + s = fs_->DeleteFile(file_path, IOOptions(), nullptr); + if (s.ok()) { + s = sst_file_manager_->OnDeleteFile(file_path); + ROCKS_LOG_INFO(info_log_, "Deleted file %s immediately", + trash_file.c_str()); + InstrumentedMutexLock l(&mu_); + RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY); + } + return s; + } + + // Update the total trash size + uint64_t trash_file_size = 0; + IOStatus io_s = + fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr); + if (io_s.ok()) { + total_trash_size_.fetch_add(trash_file_size); + } + //**TODO: What should we do if we failed to + // get the file size? 
+ + // Add file to delete queue + { + InstrumentedMutexLock l(&mu_); + RecordTick(stats_.get(), FILES_MARKED_TRASH); + queue_.emplace(trash_file, dir_to_sync); + pending_files_++; + if (pending_files_ == 1) { + cv_.SignalAll(); + } + } + return s; +} + +std::map DeleteScheduler::GetBackgroundErrors() { + InstrumentedMutexLock l(&mu_); + return bg_errors_; +} + +const std::string DeleteScheduler::kTrashExtension = ".trash"; +bool DeleteScheduler::IsTrashFile(const std::string& file_path) { + return (file_path.size() >= kTrashExtension.size() && + file_path.rfind(kTrashExtension) == + file_path.size() - kTrashExtension.size()); +} + +Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm, + const std::string& path) { + Status s; + // Check if there are any files marked as trash in this path + std::vector files_in_path; + const auto& fs = env->GetFileSystem(); + IOOptions io_opts; + io_opts.do_not_recurse = true; + s = fs->GetChildren(path, io_opts, &files_in_path, + /*IODebugContext*=*/nullptr); + if (!s.ok()) { + return s; + } + for (const std::string& current_file : files_in_path) { + if (!DeleteScheduler::IsTrashFile(current_file)) { + // not a trash file, skip + continue; + } + + Status file_delete; + std::string trash_file = path + "/" + current_file; + if (sfm) { + // We have an SstFileManager that will schedule the file delete + s = sfm->OnAddFile(trash_file); + file_delete = sfm->ScheduleFileDeletion(trash_file, path); + } else { + // Delete the file immediately + file_delete = env->DeleteFile(trash_file); + } + + if (s.ok() && !file_delete.ok()) { + s = file_delete; + } + } + + return s; +} + +Status DeleteScheduler::MarkAsTrash(const std::string& file_path, + std::string* trash_file) { + // Sanity check of the path + size_t idx = file_path.rfind("/"); + if (idx == std::string::npos || idx == file_path.size() - 1) { + return Status::InvalidArgument("file_path is corrupted"); + } + + if (DeleteScheduler::IsTrashFile(file_path)) { + // This is already a trash file + *trash_file = file_path; + return Status::OK(); + } + + *trash_file = file_path + kTrashExtension; + // TODO(tec) : Implement Env::RenameFileIfNotExist and remove + // file_move_mu mutex. 
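+  // For example (hypothetical path): "/db/000123.sst" is first tried as
+  // "/db/000123.sst.trash"; if that name is already taken, the loop below
+  // retries with "/db/000123.sst0.trash", "/db/000123.sst1.trash", and so on.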
+  int cnt = 0;
+  Status s;
+  InstrumentedMutexLock l(&file_move_mu_);
+  while (true) {
+    s = fs_->FileExists(*trash_file, IOOptions(), nullptr);
+    if (s.IsNotFound()) {
+      // We found a path for our file in trash
+      s = fs_->RenameFile(file_path, *trash_file, IOOptions(), nullptr);
+      break;
+    } else if (s.ok()) {
+      // Name conflict, so generate a new name with a counter suffix
+      *trash_file = file_path + std::to_string(cnt) + kTrashExtension;
+    } else {
+      // Error during FileExists call, we cannot continue
+      break;
+    }
+    cnt++;
+  }
+  if (s.ok()) {
+    s = sst_file_manager_->OnMoveFile(file_path, *trash_file);
+  }
+  return s;
+}
+
+void DeleteScheduler::BackgroundEmptyTrash() {
+  TEST_SYNC_POINT("DeleteScheduler::BackgroundEmptyTrash");
+
+  while (true) {
+    InstrumentedMutexLock l(&mu_);
+    while (queue_.empty() && !closing_) {
+      cv_.Wait();
+    }
+
+    if (closing_) {
+      return;
+    }
+
+    // Delete all files in queue_
+    uint64_t start_time = clock_->NowMicros();
+    uint64_t total_deleted_bytes = 0;
+    int64_t current_delete_rate = rate_bytes_per_sec_.load();
+    while (!queue_.empty() && !closing_) {
+      if (current_delete_rate != rate_bytes_per_sec_.load()) {
+        // User changed the delete rate
+        current_delete_rate = rate_bytes_per_sec_.load();
+        start_time = clock_->NowMicros();
+        total_deleted_bytes = 0;
+        ROCKS_LOG_INFO(info_log_, "rate_bytes_per_sec is changed to %" PRIi64,
+                       current_delete_rate);
+      }
+
+      // Get new file to delete
+      const FileAndDir& fad = queue_.front();
+      std::string path_in_trash = fad.fname;
+
+      // We don't need to hold the lock while deleting the file
+      mu_.Unlock();
+      uint64_t deleted_bytes = 0;
+      bool is_complete = true;
+      // Delete file from trash and update total_penalty value
+      Status s =
+          DeleteTrashFile(path_in_trash, fad.dir, &deleted_bytes, &is_complete);
+      total_deleted_bytes += deleted_bytes;
+      mu_.Lock();
+      if (is_complete) {
+        queue_.pop();
+      }
+
+      if (!s.ok()) {
+        bg_errors_[path_in_trash] = s;
+      }
+
+      // Apply penalty if necessary
+      uint64_t total_penalty;
+      if (current_delete_rate > 0) {
+        // rate limiting is enabled
+        total_penalty =
+            ((total_deleted_bytes * kMicrosInSecond) / current_delete_rate);
+        ROCKS_LOG_INFO(info_log_,
+                       "Rate limiting is enabled with penalty %" PRIu64
+                       " after deleting file %s",
+                       total_penalty, path_in_trash.c_str());
+        while (!closing_ && !cv_.TimedWait(start_time + total_penalty)) {
+        }
+      } else {
+        // rate limiting is disabled
+        total_penalty = 0;
+        ROCKS_LOG_INFO(info_log_,
+                       "Rate limiting is disabled after deleting file %s",
+                       path_in_trash.c_str());
+      }
+      TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
+                               &total_penalty);
+
+      if (is_complete) {
+        pending_files_--;
+      }
+      if (pending_files_ == 0) {
+        // Unblock WaitForEmptyTrash since there are no more files waiting
+        // to be deleted
+        cv_.SignalAll();
+      }
+    }
+  }
+}
+
+Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
+                                        const std::string& dir_to_sync,
+                                        uint64_t* deleted_bytes,
+                                        bool* is_complete) {
+  uint64_t file_size;
+  Status s = fs_->GetFileSize(path_in_trash, IOOptions(), &file_size, nullptr);
+  *is_complete = true;
+  TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
+  if (s.ok()) {
+    bool need_full_delete = true;
+    if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
+      uint64_t num_hard_links = 2;
+      // We don't have to worry about a data race between creating a new link
+      // to the file after the link-count check and the ftruncate, because
+      // the file is now in trash and no new hard link is supposed to be
+      // created to trash
files by RocksDB. + Status my_status = fs_->NumFileLinks(path_in_trash, IOOptions(), + &num_hard_links, nullptr); + if (my_status.ok()) { + if (num_hard_links == 1) { + std::unique_ptr wf; + my_status = fs_->ReopenWritableFile(path_in_trash, FileOptions(), &wf, + nullptr); + if (my_status.ok()) { + my_status = wf->Truncate(file_size - bytes_max_delete_chunk_, + IOOptions(), nullptr); + if (my_status.ok()) { + TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync"); + my_status = wf->Fsync(IOOptions(), nullptr); + } + } + if (my_status.ok()) { + *deleted_bytes = bytes_max_delete_chunk_; + need_full_delete = false; + *is_complete = false; + } else { + ROCKS_LOG_WARN(info_log_, + "Failed to partially delete %s from trash -- %s", + path_in_trash.c_str(), my_status.ToString().c_str()); + } + } else { + ROCKS_LOG_INFO(info_log_, + "Cannot delete %s slowly through ftruncate from trash " + "as it has other links", + path_in_trash.c_str()); + } + } else if (!num_link_error_printed_) { + ROCKS_LOG_INFO( + info_log_, + "Cannot delete files slowly through ftruncate from trash " + "as Env::NumFileLinks() returns error: %s", + my_status.ToString().c_str()); + num_link_error_printed_ = true; + } + } + + if (need_full_delete) { + s = fs_->DeleteFile(path_in_trash, IOOptions(), nullptr); + if (!dir_to_sync.empty()) { + std::unique_ptr dir_obj; + if (s.ok()) { + s = fs_->NewDirectory(dir_to_sync, IOOptions(), &dir_obj, nullptr); + } + if (s.ok()) { + s = dir_obj->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kFileDeleted)); + TEST_SYNC_POINT_CALLBACK( + "DeleteScheduler::DeleteTrashFile::AfterSyncDir", + reinterpret_cast(const_cast(&dir_to_sync))); + } + } + if (s.ok()) { + *deleted_bytes = file_size; + s = sst_file_manager_->OnDeleteFile(path_in_trash); + } + } + } + if (!s.ok()) { + // Error while getting file size or while deleting + ROCKS_LOG_ERROR(info_log_, "Failed to delete %s from trash -- %s", + path_in_trash.c_str(), s.ToString().c_str()); + *deleted_bytes = 0; + } else { + total_trash_size_.fetch_sub(*deleted_bytes); + } + + return s; +} + +void DeleteScheduler::WaitForEmptyTrash() { + InstrumentedMutexLock l(&mu_); + while (pending_files_ > 0 && !closing_) { + cv_.Wait(); + } +} + +void DeleteScheduler::MaybeCreateBackgroundThread() { + if (bg_thread_ == nullptr && rate_bytes_per_sec_.load() > 0) { + bg_thread_.reset( + new port::Thread(&DeleteScheduler::BackgroundEmptyTrash, this)); + ROCKS_LOG_INFO(info_log_, + "Created background thread for deletion scheduler with " + "rate_bytes_per_sec: %" PRIi64, + rate_bytes_per_sec_.load()); + } +} + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/delete_scheduler.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/delete_scheduler.h new file mode 100644 index 0000000000..da3735aed8 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/delete_scheduler.h @@ -0,0 +1,147 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+#pragma once
+
+
+#include <map>
+#include <queue>
+#include <string>
+#include <thread>
+
+#include "monitoring/instrumented_mutex.h"
+#include "port/port.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class FileSystem;
+class Logger;
+class SstFileManagerImpl;
+class SystemClock;
+
+// DeleteScheduler allows the DB to enforce a rate limit on file deletion.
+// Instead of deleting files immediately, files are marked as trash
+// and deleted in a background thread that applies a sleep penalty between
+// deletes if they are happening at a rate faster than rate_bytes_per_sec.
+//
+// Rate limiting can be turned off by setting rate_bytes_per_sec = 0; in this
+// case DeleteScheduler will delete files immediately.
+class DeleteScheduler {
+ public:
+  DeleteScheduler(SystemClock* clock, FileSystem* fs,
+                  int64_t rate_bytes_per_sec, Logger* info_log,
+                  SstFileManagerImpl* sst_file_manager,
+                  double max_trash_db_ratio, uint64_t bytes_max_delete_chunk);
+
+  ~DeleteScheduler();
+
+  // Return delete rate limit in bytes per second
+  int64_t GetRateBytesPerSecond() { return rate_bytes_per_sec_.load(); }
+
+  // Set delete rate limit in bytes per second
+  void SetRateBytesPerSecond(int64_t bytes_per_sec) {
+    rate_bytes_per_sec_.store(bytes_per_sec);
+    MaybeCreateBackgroundThread();
+  }
+
+  // Mark the file as trash and schedule its deletion. If force_bg is
+  // set, it forces the file to always be deleted in the background thread,
+  // except when rate limiting is disabled
+  Status DeleteFile(const std::string& fname, const std::string& dir_to_sync,
+                    const bool force_bg = false);
+
+  // Wait for all files being deleted in the background to finish, or for
+  // the destructor to be called.
+  void WaitForEmptyTrash();
+
+  // Return a map containing errors that happened in BackgroundEmptyTrash
+  // file_path => error status
+  std::map<std::string, Status> GetBackgroundErrors();
+
+  uint64_t GetTotalTrashSize() { return total_trash_size_.load(); }
+
+  // Return trash/DB size ratio where new files will be deleted immediately
+  double GetMaxTrashDBRatio() { return max_trash_db_ratio_.load(); }
+
+  // Update trash/DB size ratio where new files will be deleted immediately
+  void SetMaxTrashDBRatio(double r) {
+    assert(r >= 0);
+    max_trash_db_ratio_.store(r);
+  }
+
+  static const std::string kTrashExtension;
+  static bool IsTrashFile(const std::string& file_path);
+
+  // Check if there are any .trash files in path, and schedule their deletion,
+  // or delete them immediately if sst_file_manager is nullptr
+  static Status CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
+                                 const std::string& path);
+
+  void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) {
+    InstrumentedMutexLock l(&mu_);
+    stats_ = stats;
+  }
+
+ private:
+  Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash);
+
+  Status DeleteTrashFile(const std::string& path_in_trash,
+                         const std::string& dir_to_sync,
+                         uint64_t* deleted_bytes, bool* is_complete);
+
+  void BackgroundEmptyTrash();
+
+  void MaybeCreateBackgroundThread();
+
+  SystemClock* clock_;
+  FileSystem* fs_;
+
+  // total size of trash files
+  std::atomic<uint64_t> total_trash_size_;
+  // Maximum number of bytes that should be deleted per second
+  std::atomic<int64_t> rate_bytes_per_sec_;
+  // Mutex to protect queue_, pending_files_, bg_errors_, closing_, stats_
+  InstrumentedMutex mu_;
+
+  struct FileAndDir {
+    FileAndDir(const std::string& f, const std::string& d) : fname(f), dir(d) {}
+    std::string fname;
+    std::string dir;  // empty will be skipped.
+  };
+
+  // Queue of trash files that need to be deleted
+  std::queue<FileAndDir> queue_;
+  // Number of trash files that are waiting to be deleted
+  int32_t pending_files_;
+  uint64_t bytes_max_delete_chunk_;
+  // Errors that happened in BackgroundEmptyTrash (file_path => error)
+  std::map<std::string, Status> bg_errors_;
+
+  bool num_link_error_printed_ = false;
+  // Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop
+  bool closing_;
+  // Condition variable signaled in these conditions
+  // - pending_files_ value change from 0 => 1
+  // - pending_files_ value change from 1 => 0
+  // - closing_ value is set to true
+  InstrumentedCondVar cv_;
+  // Background thread running BackgroundEmptyTrash
+  std::unique_ptr<port::Thread> bg_thread_;
+  // Mutex to protect threads from file name conflicts
+  InstrumentedMutex file_move_mu_;
+  Logger* info_log_;
+  SstFileManagerImpl* sst_file_manager_;
+  // If the trash size constitutes more than this fraction of the total DB
+  // size we will start deleting new files passed to DeleteScheduler
+  // immediately
+  std::atomic<double> max_trash_db_ratio_;
+  static const uint64_t kMicrosInSecond = 1000 * 1000LL;
+  std::shared_ptr<Statistics> stats_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_prefetch_buffer.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_prefetch_buffer.cc
new file mode 100644
index 0000000000..050947210f
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_prefetch_buffer.cc
@@ -0,0 +1,955 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/file_prefetch_buffer.h"
+
+#include <algorithm>
+#include <cassert>
+
+#include "file/random_access_file_reader.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/rate_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void FilePrefetchBuffer::CalculateOffsetAndLen(size_t alignment,
+                                               uint64_t offset,
+                                               size_t roundup_len,
+                                               uint32_t index, bool refit_tail,
+                                               uint64_t& chunk_len) {
+  uint64_t chunk_offset_in_buffer = 0;
+  bool copy_data_to_new_buffer = false;
+  // Check if requested bytes are in the existing buffer_.
+  // If only a few bytes exist -- reuse them & read only what is really needed.
+  // This is typically the case of incremental reading of data.
+  // If no bytes exist in buffer -- full pread.
+  if (DoesBufferContainData(index) && IsOffsetInBuffer(offset, index)) {
+    // Only a few requested bytes are in the buffer. memmove that chunk of
+    // bytes to the beginning, and memcpy them back into the new buffer if a
+    // new buffer is created.
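+    // A hypothetical walk-through (values chosen purely for illustration):
+    // with alignment = 4096, a buffer at offset_ = 0 currently holding 8192
+    // bytes, and a new request at offset = 6000, chunk_offset_in_buffer =
+    // Rounddown(6000 - 0, 4096) = 4096 and chunk_len = 8192 - 4096 = 4096,
+    // so the last 4096 buffered bytes are carried over.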
+ chunk_offset_in_buffer = Rounddown( + static_cast(offset - bufs_[index].offset_), alignment); + chunk_len = static_cast(bufs_[index].buffer_.CurrentSize()) - + chunk_offset_in_buffer; + assert(chunk_offset_in_buffer % alignment == 0); + assert(chunk_len % alignment == 0); + assert(chunk_offset_in_buffer + chunk_len <= + bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()); + if (chunk_len > 0) { + copy_data_to_new_buffer = true; + } else { + // this reset is not necessary, but just to be safe. + chunk_offset_in_buffer = 0; + } + } + + // Create a new buffer only if current capacity is not sufficient, and memcopy + // bytes from old buffer if needed (i.e., if chunk_len is greater than 0). + if (bufs_[index].buffer_.Capacity() < roundup_len) { + bufs_[index].buffer_.Alignment(alignment); + bufs_[index].buffer_.AllocateNewBuffer( + static_cast(roundup_len), copy_data_to_new_buffer, + chunk_offset_in_buffer, static_cast(chunk_len)); + } else if (chunk_len > 0 && refit_tail) { + // New buffer not needed. But memmove bytes from tail to the beginning since + // chunk_len is greater than 0. + bufs_[index].buffer_.RefitTail(static_cast(chunk_offset_in_buffer), + static_cast(chunk_len)); + } else if (chunk_len > 0) { + // For async prefetching, it doesn't call RefitTail with chunk_len > 0. + // Allocate new buffer if needed because aligned buffer calculate remaining + // buffer as capacity_ - cursize_ which might not be the case in this as we + // are not refitting. + // TODO akanksha: Update the condition when asynchronous prefetching is + // stable. + bufs_[index].buffer_.Alignment(alignment); + bufs_[index].buffer_.AllocateNewBuffer( + static_cast(roundup_len), copy_data_to_new_buffer, + chunk_offset_in_buffer, static_cast(chunk_len)); + } +} + +Status FilePrefetchBuffer::Read(const IOOptions& opts, + RandomAccessFileReader* reader, + Env::IOPriority rate_limiter_priority, + uint64_t read_len, uint64_t chunk_len, + uint64_t rounddown_start, uint32_t index) { + Slice result; + Status s = reader->Read(opts, rounddown_start + chunk_len, read_len, &result, + bufs_[index].buffer_.BufferStart() + chunk_len, + /*aligned_buf=*/nullptr, rate_limiter_priority); +#ifndef NDEBUG + if (result.size() < read_len) { + // Fake an IO error to force db_stress fault injection to ignore + // truncated read errors + IGNORE_STATUS_IF_ERROR(Status::IOError()); + } +#endif + if (!s.ok()) { + return s; + } + + // Update the buffer offset and size. + bufs_[index].offset_ = rounddown_start; + bufs_[index].buffer_.Size(static_cast(chunk_len) + result.size()); + return s; +} + +Status FilePrefetchBuffer::ReadAsync(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t read_len, + uint64_t rounddown_start, uint32_t index) { + TEST_SYNC_POINT("FilePrefetchBuffer::ReadAsync"); + // callback for async read request. 
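+  // The bound callback receives the completed FSReadRequest, plus
+  // &bufs_[index].pos_ as cb_arg so that PrefetchAsyncCallback can recover
+  // which buffer to update.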
+ auto fp = std::bind(&FilePrefetchBuffer::PrefetchAsyncCallback, this, + std::placeholders::_1, std::placeholders::_2); + FSReadRequest req; + Slice result; + req.len = read_len; + req.offset = rounddown_start; + req.result = result; + req.scratch = bufs_[index].buffer_.BufferStart(); + bufs_[index].async_req_len_ = req.len; + + Status s = + reader->ReadAsync(req, opts, fp, &(bufs_[index].pos_), + &(bufs_[index].io_handle_), &(bufs_[index].del_fn_), + /*aligned_buf=*/nullptr); + req.status.PermitUncheckedError(); + if (s.ok()) { + bufs_[index].async_read_in_progress_ = true; + } + return s; +} + +Status FilePrefetchBuffer::Prefetch(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, + Env::IOPriority rate_limiter_priority) { + if (!enable_ || reader == nullptr) { + return Status::OK(); + } + TEST_SYNC_POINT("FilePrefetchBuffer::Prefetch:Start"); + + if (offset + n <= bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) { + // All requested bytes are already in the curr_ buffer. So no need to Read + // again. + return Status::OK(); + } + + size_t alignment = reader->file()->GetRequiredBufferAlignment(); + size_t offset_ = static_cast(offset); + uint64_t rounddown_offset = Rounddown(offset_, alignment); + uint64_t roundup_end = Roundup(offset_ + n, alignment); + uint64_t roundup_len = roundup_end - rounddown_offset; + assert(roundup_len >= alignment); + assert(roundup_len % alignment == 0); + + uint64_t chunk_len = 0; + CalculateOffsetAndLen(alignment, offset, roundup_len, curr_, + true /*refit_tail*/, chunk_len); + size_t read_len = static_cast(roundup_len - chunk_len); + + Status s = Read(opts, reader, rate_limiter_priority, read_len, chunk_len, + rounddown_offset, curr_); + if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && s.ok()) { + RecordInHistogram(stats_, TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, read_len); + } + return s; +} + +// Copy data from src to third buffer. +void FilePrefetchBuffer::CopyDataToBuffer(uint32_t src, uint64_t& offset, + size_t& length) { + if (length == 0) { + return; + } + uint64_t copy_offset = (offset - bufs_[src].offset_); + size_t copy_len = 0; + if (IsDataBlockInBuffer(offset, length, src)) { + // All the bytes are in src. + copy_len = length; + } else { + copy_len = bufs_[src].buffer_.CurrentSize() - copy_offset; + } + + memcpy(bufs_[2].buffer_.BufferStart() + bufs_[2].buffer_.CurrentSize(), + bufs_[src].buffer_.BufferStart() + copy_offset, copy_len); + + bufs_[2].buffer_.Size(bufs_[2].buffer_.CurrentSize() + copy_len); + + // Update offset and length. + offset += copy_len; + length -= copy_len; + + // length > 0 indicates it has consumed all data from the src buffer and it + // still needs to read more other buffer. + if (length > 0) { + bufs_[src].buffer_.Clear(); + } +} + +// Clear the buffers if it contains outdated data. Outdated data can be +// because previous sequential reads were read from the cache instead of these +// buffer. In that case outdated IOs should be aborted. 
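+// For instance (a hypothetical sequence): if curr_ covers [0, 8KB) with an
+// async read still in flight and the next request arrives at offset 64KB,
+// both outstanding requests target stale ranges and are aborted via
+// fs_->AbortIO().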
+void FilePrefetchBuffer::AbortIOIfNeeded(uint64_t offset) { + uint32_t second = curr_ ^ 1; + std::vector handles; + autovector buf_pos; + if (IsBufferOutdatedWithAsyncProgress(offset, curr_)) { + handles.emplace_back(bufs_[curr_].io_handle_); + buf_pos.emplace_back(curr_); + } + if (IsBufferOutdatedWithAsyncProgress(offset, second)) { + handles.emplace_back(bufs_[second].io_handle_); + buf_pos.emplace_back(second); + } + if (!handles.empty()) { + StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS); + Status s = fs_->AbortIO(handles); + assert(s.ok()); + } + + for (auto& pos : buf_pos) { + // Release io_handle. + DestroyAndClearIOHandle(pos); + } + + if (bufs_[second].io_handle_ == nullptr) { + bufs_[second].async_read_in_progress_ = false; + } + + if (bufs_[curr_].io_handle_ == nullptr) { + bufs_[curr_].async_read_in_progress_ = false; + } +} + +void FilePrefetchBuffer::AbortAllIOs() { + uint32_t second = curr_ ^ 1; + std::vector handles; + for (uint32_t i = 0; i < 2; i++) { + if (bufs_[i].async_read_in_progress_ && bufs_[i].io_handle_ != nullptr) { + handles.emplace_back(bufs_[i].io_handle_); + } + } + if (!handles.empty()) { + StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS); + Status s = fs_->AbortIO(handles); + assert(s.ok()); + } + + // Release io_handles. + if (bufs_[curr_].io_handle_ != nullptr && bufs_[curr_].del_fn_ != nullptr) { + DestroyAndClearIOHandle(curr_); + } else { + bufs_[curr_].async_read_in_progress_ = false; + } + + if (bufs_[second].io_handle_ != nullptr && bufs_[second].del_fn_ != nullptr) { + DestroyAndClearIOHandle(second); + } else { + bufs_[second].async_read_in_progress_ = false; + } +} + +// Clear the buffers if it contains outdated data. Outdated data can be +// because previous sequential reads were read from the cache instead of these +// buffer. +void FilePrefetchBuffer::UpdateBuffersIfNeeded(uint64_t offset) { + uint32_t second = curr_ ^ 1; + if (IsBufferOutdated(offset, curr_)) { + bufs_[curr_].buffer_.Clear(); + } + if (IsBufferOutdated(offset, second)) { + bufs_[second].buffer_.Clear(); + } + + { + // In case buffers do not align, reset second buffer. This can happen in + // case readahead_size is set. + if (!bufs_[second].async_read_in_progress_ && + !bufs_[curr_].async_read_in_progress_) { + if (DoesBufferContainData(curr_)) { + if (bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() != + bufs_[second].offset_) { + bufs_[second].buffer_.Clear(); + } + } else { + if (!IsOffsetInBuffer(offset, second)) { + bufs_[second].buffer_.Clear(); + } + } + } + } + + // If data starts from second buffer, make it curr_. Second buffer can be + // either partial filled, full or async read is in progress. + if (bufs_[second].async_read_in_progress_) { + if (IsOffsetInBufferWithAsyncProgress(offset, second)) { + curr_ = curr_ ^ 1; + } + } else { + if (DoesBufferContainData(second) && IsOffsetInBuffer(offset, second)) { + assert(bufs_[curr_].async_read_in_progress_ || + bufs_[curr_].buffer_.CurrentSize() == 0); + curr_ = curr_ ^ 1; + } + } +} + +void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) { + if (bufs_[curr_].async_read_in_progress_ && fs_ != nullptr) { + if (bufs_[curr_].io_handle_ != nullptr) { + // Wait for prefetch data to complete. + // No mutex is needed as async_read_in_progress behaves as mutex and is + // updated by main thread only. 
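+      // Poll(handles, 1) below asks the file system to wait until at least
+      // one (here, the only) outstanding request has completed.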
+ std::vector handles; + handles.emplace_back(bufs_[curr_].io_handle_); + StopWatch sw(clock_, stats_, POLL_WAIT_MICROS); + fs_->Poll(handles, 1).PermitUncheckedError(); + } + + // Reset and Release io_handle after the Poll API as request has been + // completed. + DestroyAndClearIOHandle(curr_); + } + UpdateBuffersIfNeeded(offset); +} + +Status FilePrefetchBuffer::HandleOverlappingData( + const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, + size_t length, size_t readahead_size, + Env::IOPriority /*rate_limiter_priority*/, bool& copy_to_third_buffer, + uint64_t& tmp_offset, size_t& tmp_length) { + Status s; + size_t alignment = reader->file()->GetRequiredBufferAlignment(); + uint32_t second; + + // Check if the first buffer has the required offset and the async read is + // still in progress. This should only happen if a prefetch was initiated + // by Seek, but the next access is at another offset. + if (bufs_[curr_].async_read_in_progress_ && + IsOffsetInBufferWithAsyncProgress(offset, curr_)) { + PollAndUpdateBuffersIfNeeded(offset); + } + second = curr_ ^ 1; + + // If data is overlapping over two buffers, copy the data from curr_ and + // call ReadAsync on curr_. + if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) && + IsOffsetInBuffer(offset, curr_) && + (/*Data extends over curr_ buffer and second buffer either has data or in + process of population=*/ + (offset + length > bufs_[second].offset_) && + (bufs_[second].async_read_in_progress_ || + DoesBufferContainData(second)))) { + // Allocate new buffer to third buffer; + bufs_[2].buffer_.Clear(); + bufs_[2].buffer_.Alignment(alignment); + bufs_[2].buffer_.AllocateNewBuffer(length); + bufs_[2].offset_ = offset; + copy_to_third_buffer = true; + + CopyDataToBuffer(curr_, tmp_offset, tmp_length); + + // Call async prefetching on curr_ since data has been consumed in curr_ + // only if data lies within second buffer. + size_t second_size = bufs_[second].async_read_in_progress_ + ? bufs_[second].async_req_len_ + : bufs_[second].buffer_.CurrentSize(); + if (tmp_offset + tmp_length <= bufs_[second].offset_ + second_size) { + uint64_t rounddown_start = bufs_[second].offset_ + second_size; + uint64_t roundup_end = + Roundup(rounddown_start + readahead_size, alignment); + uint64_t roundup_len = roundup_end - rounddown_start; + uint64_t chunk_len = 0; + CalculateOffsetAndLen(alignment, rounddown_start, roundup_len, curr_, + false, chunk_len); + assert(chunk_len == 0); + assert(roundup_len >= chunk_len); + + bufs_[curr_].offset_ = rounddown_start; + uint64_t read_len = static_cast(roundup_len - chunk_len); + s = ReadAsync(opts, reader, read_len, rounddown_start, curr_); + if (!s.ok()) { + DestroyAndClearIOHandle(curr_); + bufs_[curr_].buffer_.Clear(); + return s; + } + } + curr_ = curr_ ^ 1; + } + return s; +} +// If async_io is enabled in case of sequential reads, PrefetchAsyncInternal is +// called. When buffers are switched, we clear the curr_ buffer as we assume the +// data has been consumed because of sequential reads. +// Data in buffers will always be sequential with curr_ following second and +// not vice versa. +// +// Scenarios for prefetching asynchronously: +// Case1: If both buffers are empty, prefetch n + readahead_size_/2 bytes +// synchronously in curr_ and prefetch readahead_size_/2 async in second +// buffer. +// Case2: If second buffer has partial or full data, make it current and +// prefetch readahead_size_/2 async in second buffer. 
In case of +// partial data, prefetch remaining bytes from size n synchronously to +// fulfill the requested bytes request. +// Case3: If curr_ has partial data, prefetch remaining bytes from size n +// synchronously in curr_ to fulfill the requested bytes request and +// prefetch readahead_size_/2 bytes async in second buffer. +// Case4: (Special case) If data is in both buffers, copy requested data from +// curr_, send async request on curr_, wait for poll to fill second +// buffer (if any), and copy remaining data from second buffer to third +// buffer. +Status FilePrefetchBuffer::PrefetchAsyncInternal( + const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, + size_t length, size_t readahead_size, Env::IOPriority rate_limiter_priority, + bool& copy_to_third_buffer) { + if (!enable_) { + return Status::OK(); + } + + TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsyncInternal:Start"); + + size_t alignment = reader->file()->GetRequiredBufferAlignment(); + Status s; + uint64_t tmp_offset = offset; + size_t tmp_length = length; + + // 1. Abort IO and swap buffers if needed to point curr_ to first buffer with + // data. + if (!explicit_prefetch_submitted_) { + AbortIOIfNeeded(offset); + } + UpdateBuffersIfNeeded(offset); + + // 2. Handle overlapping data over two buffers. If data is overlapping then + // during this call: + // - data from curr_ is copied into third buffer, + // - curr_ is send for async prefetching of further data if second buffer + // contains remaining requested data or in progress for async prefetch, + // - switch buffers and curr_ now points to second buffer to copy remaining + // data. + s = HandleOverlappingData(opts, reader, offset, length, readahead_size, + rate_limiter_priority, copy_to_third_buffer, + tmp_offset, tmp_length); + if (!s.ok()) { + return s; + } + + // 3. Call Poll only if data is needed for the second buffer. + // - Return if whole data is in curr_ and second buffer is in progress or + // already full. + // - If second buffer is empty, it will go for ReadAsync for second buffer. + if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) && + IsDataBlockInBuffer(offset, length, curr_)) { + // Whole data is in curr_. + UpdateBuffersIfNeeded(offset); + if (!IsSecondBuffEligibleForPrefetching()) { + return s; + } + } else { + // After poll request, curr_ might be empty because of IOError in + // callback while reading or may contain required data. + PollAndUpdateBuffersIfNeeded(offset); + } + + if (copy_to_third_buffer) { + offset = tmp_offset; + length = tmp_length; + } + + // 4. After polling and swapping buffers, if all the requested bytes are in + // curr_, it will only go for async prefetching. + // copy_to_third_buffer is a special case so it will be handled separately. + if (!copy_to_third_buffer && DoesBufferContainData(curr_) && + IsDataBlockInBuffer(offset, length, curr_)) { + offset += length; + length = 0; + + // Since async request was submitted directly by calling PrefetchAsync in + // last call, we don't need to prefetch further as this call is to poll + // the data submitted in previous call. + if (explicit_prefetch_submitted_) { + return s; + } + if (!IsSecondBuffEligibleForPrefetching()) { + return s; + } + } + + uint32_t second = curr_ ^ 1; + assert(!bufs_[curr_].async_read_in_progress_); + + // In case because of some IOError curr_ got empty, abort IO for second as + // well. Otherwise data might not align if more data needs to be read in curr_ + // which might overlap with second buffer. 
+ if (!DoesBufferContainData(curr_) && bufs_[second].async_read_in_progress_) { + if (bufs_[second].io_handle_ != nullptr) { + std::vector handles; + handles.emplace_back(bufs_[second].io_handle_); + { + StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS); + Status status = fs_->AbortIO(handles); + assert(status.ok()); + } + } + DestroyAndClearIOHandle(second); + bufs_[second].buffer_.Clear(); + } + + // 5. Data is overlapping i.e. some of the data has been copied to third + // buffer and remaining will be updated below. + if (copy_to_third_buffer && DoesBufferContainData(curr_)) { + CopyDataToBuffer(curr_, offset, length); + + // Length == 0: All the requested data has been copied to third buffer and + // it has already gone for async prefetching. It can return without doing + // anything further. + // Length > 0: More data needs to be consumed so it will continue async + // and sync prefetching and copy the remaining data to third buffer in the + // end. + if (length == 0) { + return s; + } + } + + // 6. Go for ReadAsync and Read (if needed). + size_t prefetch_size = length + readahead_size; + size_t _offset = static_cast(offset); + + // offset and size alignment for curr_ buffer with synchronous prefetching + uint64_t rounddown_start1 = Rounddown(_offset, alignment); + uint64_t roundup_end1 = Roundup(_offset + prefetch_size, alignment); + uint64_t roundup_len1 = roundup_end1 - rounddown_start1; + assert(roundup_len1 >= alignment); + assert(roundup_len1 % alignment == 0); + uint64_t chunk_len1 = 0; + uint64_t read_len1 = 0; + + assert(!bufs_[second].async_read_in_progress_ && + !DoesBufferContainData(second)); + + // For length == 0, skip the synchronous prefetching. read_len1 will be 0. + if (length > 0) { + CalculateOffsetAndLen(alignment, offset, roundup_len1, curr_, + false /*refit_tail*/, chunk_len1); + assert(roundup_len1 >= chunk_len1); + read_len1 = static_cast(roundup_len1 - chunk_len1); + } + { + // offset and size alignment for second buffer for asynchronous + // prefetching + uint64_t rounddown_start2 = roundup_end1; + uint64_t roundup_end2 = + Roundup(rounddown_start2 + readahead_size, alignment); + + // For length == 0, do the asynchronous prefetching in second instead of + // synchronous prefetching in curr_. + if (length == 0) { + rounddown_start2 = + bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize(); + roundup_end2 = Roundup(rounddown_start2 + prefetch_size, alignment); + } + + uint64_t roundup_len2 = roundup_end2 - rounddown_start2; + uint64_t chunk_len2 = 0; + CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second, + false /*refit_tail*/, chunk_len2); + assert(chunk_len2 == 0); + // Update the buffer offset. 
+ bufs_[second].offset_ = rounddown_start2; + assert(roundup_len2 >= chunk_len2); + uint64_t read_len2 = static_cast(roundup_len2 - chunk_len2); + s = ReadAsync(opts, reader, read_len2, rounddown_start2, second); + if (!s.ok()) { + DestroyAndClearIOHandle(second); + bufs_[second].buffer_.Clear(); + return s; + } + } + + if (read_len1 > 0) { + s = Read(opts, reader, rate_limiter_priority, read_len1, chunk_len1, + rounddown_start1, curr_); + if (!s.ok()) { + if (bufs_[second].io_handle_ != nullptr) { + std::vector handles; + handles.emplace_back(bufs_[second].io_handle_); + { + StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS); + Status status = fs_->AbortIO(handles); + assert(status.ok()); + } + } + DestroyAndClearIOHandle(second); + bufs_[second].buffer_.Clear(); + bufs_[curr_].buffer_.Clear(); + return s; + } + } + // Copy remaining requested bytes to third_buffer. + if (copy_to_third_buffer && length > 0) { + CopyDataToBuffer(curr_, offset, length); + } + return s; +} + +bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, + Slice* result, Status* status, + Env::IOPriority rate_limiter_priority, + bool for_compaction /* = false */) { + bool ret = TryReadFromCacheUntracked(opts, reader, offset, n, result, status, + rate_limiter_priority, for_compaction); + if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) { + if (ret) { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT); + } else { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS); + } + } + return ret; +} + +bool FilePrefetchBuffer::TryReadFromCacheUntracked( + const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, + size_t n, Slice* result, Status* status, + Env::IOPriority rate_limiter_priority, bool for_compaction /* = false */) { + if (track_min_offset_ && offset < min_offset_read_) { + min_offset_read_ = static_cast(offset); + } + if (!enable_ || (offset < bufs_[curr_].offset_)) { + return false; + } + + // If the buffer contains only a few of the requested bytes: + // If readahead is enabled: prefetch the remaining bytes + readahead bytes + // and satisfy the request. + // If readahead is not enabled: return false. + TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache", + &readahead_size_); + if (offset + n > bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) { + if (readahead_size_ > 0) { + Status s; + assert(reader != nullptr); + assert(max_readahead_size_ >= readahead_size_); + if (for_compaction) { + s = Prefetch(opts, reader, offset, std::max(n, readahead_size_), + rate_limiter_priority); + } else { + if (implicit_auto_readahead_) { + if (!IsEligibleForPrefetch(offset, n)) { + // Ignore status as Prefetch is not called. 
+ s.PermitUncheckedError(); + return false; + } + } + s = Prefetch(opts, reader, offset, n + readahead_size_, + rate_limiter_priority); + } + if (!s.ok()) { + if (status) { + *status = s; + } +#ifndef NDEBUG + IGNORE_STATUS_IF_ERROR(s); +#endif + return false; + } + readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); + } else { + return false; + } + } + UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/); + + uint64_t offset_in_buffer = offset - bufs_[curr_].offset_; + *result = Slice(bufs_[curr_].buffer_.BufferStart() + offset_in_buffer, n); + return true; +} + +bool FilePrefetchBuffer::TryReadFromCacheAsync( + const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, + size_t n, Slice* result, Status* status, + Env::IOPriority rate_limiter_priority) { + bool ret = TryReadFromCacheAsyncUntracked(opts, reader, offset, n, result, + status, rate_limiter_priority); + if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) { + if (ret) { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT); + } else { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS); + } + } + return ret; +} + +bool FilePrefetchBuffer::TryReadFromCacheAsyncUntracked( + const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, + size_t n, Slice* result, Status* status, + Env::IOPriority rate_limiter_priority) { + if (track_min_offset_ && offset < min_offset_read_) { + min_offset_read_ = static_cast(offset); + } + + if (!enable_) { + return false; + } + + if (explicit_prefetch_submitted_) { + // explicit_prefetch_submitted_ is special case where it expects request + // submitted in PrefetchAsync should match with this request. Otherwise + // buffers will be outdated. + // Random offset called. So abort the IOs. + if (prev_offset_ != offset) { + AbortAllIOs(); + bufs_[curr_].buffer_.Clear(); + bufs_[curr_ ^ 1].buffer_.Clear(); + explicit_prefetch_submitted_ = false; + return false; + } + } + + if (!explicit_prefetch_submitted_ && offset < bufs_[curr_].offset_) { + return false; + } + + bool prefetched = false; + bool copy_to_third_buffer = false; + // If the buffer contains only a few of the requested bytes: + // If readahead is enabled: prefetch the remaining bytes + readahead bytes + // and satisfy the request. + // If readahead is not enabled: return false. + TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache", + &readahead_size_); + + if (explicit_prefetch_submitted_ || + (bufs_[curr_].async_read_in_progress_ || + offset + n > + bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize())) { + if (readahead_size_ > 0) { + Status s; + assert(reader != nullptr); + assert(max_readahead_size_ >= readahead_size_); + + if (implicit_auto_readahead_) { + if (!IsEligibleForPrefetch(offset, n)) { + // Ignore status as Prefetch is not called. + s.PermitUncheckedError(); + return false; + } + } + // Prefetch n + readahead_size_/2 synchronously as remaining + // readahead_size_/2 will be prefetched asynchronously. + s = PrefetchAsyncInternal(opts, reader, offset, n, readahead_size_ / 2, + rate_limiter_priority, copy_to_third_buffer); + explicit_prefetch_submitted_ = false; + if (!s.ok()) { + if (status) { + *status = s; + } +#ifndef NDEBUG + IGNORE_STATUS_IF_ERROR(s); +#endif + return false; + } + prefetched = explicit_prefetch_submitted_ ? 
false : true; + } else { + return false; + } + } + + UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/); + + uint32_t index = curr_; + if (copy_to_third_buffer) { + index = 2; + } + uint64_t offset_in_buffer = offset - bufs_[index].offset_; + *result = Slice(bufs_[index].buffer_.BufferStart() + offset_in_buffer, n); + if (prefetched) { + readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); + } + return true; +} + +void FilePrefetchBuffer::PrefetchAsyncCallback(const FSReadRequest& req, + void* cb_arg) { + uint32_t index = *(static_cast(cb_arg)); +#ifndef NDEBUG + if (req.result.size() < req.len) { + // Fake an IO error to force db_stress fault injection to ignore + // truncated read errors + IGNORE_STATUS_IF_ERROR(Status::IOError()); + } + IGNORE_STATUS_IF_ERROR(req.status); +#endif + + if (req.status.ok()) { + if (req.offset + req.result.size() <= + bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()) { + // All requested bytes are already in the buffer or no data is read + // because of EOF. So no need to update. + return; + } + if (req.offset < bufs_[index].offset_) { + // Next block to be read has changed (Recent read was not a sequential + // read). So ignore this read. + return; + } + size_t current_size = bufs_[index].buffer_.CurrentSize(); + bufs_[index].buffer_.Size(current_size + req.result.size()); + } +} + +Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, + Slice* result) { + assert(reader != nullptr); + if (!enable_) { + return Status::NotSupported(); + } + + TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:Start"); + + num_file_reads_ = 0; + explicit_prefetch_submitted_ = false; + bool is_eligible_for_prefetching = false; + if (readahead_size_ > 0 && + (!implicit_auto_readahead_ || + num_file_reads_ + 1 >= num_file_reads_for_auto_readahead_)) { + is_eligible_for_prefetching = true; + } + + // 1. Cancel any pending async read to make code simpler as buffers can be out + // of sync. + AbortAllIOs(); + + // 2. Clear outdated data. + UpdateBuffersIfNeeded(offset); + uint32_t second = curr_ ^ 1; + // Since PrefetchAsync can be called on non sequential reads. So offset can + // be less than curr_ buffers' offset. In that case also it clears both + // buffers. + if (DoesBufferContainData(curr_) && !IsOffsetInBuffer(offset, curr_)) { + bufs_[curr_].buffer_.Clear(); + bufs_[second].buffer_.Clear(); + } + + UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false); + + bool data_found = false; + + // 3. If curr_ has full data. + if (DoesBufferContainData(curr_) && IsDataBlockInBuffer(offset, n, curr_)) { + uint64_t offset_in_buffer = offset - bufs_[curr_].offset_; + *result = Slice(bufs_[curr_].buffer_.BufferStart() + offset_in_buffer, n); + data_found = true; + // Update num_file_reads_ as TryReadFromCacheAsync won't be called for + // poll and update num_file_reads_ if data is found. + num_file_reads_++; + + // 3.1 If second also has some data or is not eligible for prefetching, + // return. + if (!is_eligible_for_prefetching || DoesBufferContainData(second)) { + return Status::OK(); + } + } else { + // Partial data in curr_. + bufs_[curr_].buffer_.Clear(); + } + bufs_[second].buffer_.Clear(); + + Status s; + size_t alignment = reader->file()->GetRequiredBufferAlignment(); + size_t prefetch_size = is_eligible_for_prefetching ? 
readahead_size_ / 2 : 0; + size_t offset_to_read = static_cast(offset); + uint64_t rounddown_start1 = 0; + uint64_t roundup_end1 = 0; + uint64_t rounddown_start2 = 0; + uint64_t roundup_end2 = 0; + uint64_t chunk_len1 = 0; + uint64_t chunk_len2 = 0; + size_t read_len1 = 0; + size_t read_len2 = 0; + + // - If curr_ is empty. + // - Call async read for full data + prefetch_size on curr_. + // - Call async read for prefetch_size on second if eligible. + // - If curr_ is filled. + // - prefetch_size on second. + // Calculate length and offsets for reading. + if (!DoesBufferContainData(curr_)) { + // Prefetch full data + prefetch_size in curr_. + rounddown_start1 = Rounddown(offset_to_read, alignment); + roundup_end1 = Roundup(offset_to_read + n + prefetch_size, alignment); + uint64_t roundup_len1 = roundup_end1 - rounddown_start1; + assert(roundup_len1 >= alignment); + assert(roundup_len1 % alignment == 0); + + CalculateOffsetAndLen(alignment, rounddown_start1, roundup_len1, curr_, + false, chunk_len1); + assert(chunk_len1 == 0); + assert(roundup_len1 >= chunk_len1); + read_len1 = static_cast(roundup_len1 - chunk_len1); + bufs_[curr_].offset_ = rounddown_start1; + } + + if (is_eligible_for_prefetching) { + if (DoesBufferContainData(curr_)) { + rounddown_start2 = + bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize(); + } else { + rounddown_start2 = roundup_end1; + } + + roundup_end2 = Roundup(rounddown_start2 + prefetch_size, alignment); + uint64_t roundup_len2 = roundup_end2 - rounddown_start2; + + assert(roundup_len2 >= alignment); + CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second, + false, chunk_len2); + assert(chunk_len2 == 0); + assert(roundup_len2 >= chunk_len2); + read_len2 = static_cast(roundup_len2 - chunk_len2); + // Update the buffer offset. + bufs_[second].offset_ = rounddown_start2; + } + + if (read_len1) { + s = ReadAsync(opts, reader, read_len1, rounddown_start1, curr_); + if (!s.ok()) { + DestroyAndClearIOHandle(curr_); + bufs_[curr_].buffer_.Clear(); + return s; + } + explicit_prefetch_submitted_ = true; + prev_len_ = 0; + } + if (read_len2) { + s = ReadAsync(opts, reader, read_len2, rounddown_start2, second); + if (!s.ok()) { + DestroyAndClearIOHandle(second); + bufs_[second].buffer_.Clear(); + return s; + } + readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); + } + return (data_found ? Status::OK() : Status::TryAgain()); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_prefetch_buffer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_prefetch_buffer.h new file mode 100644 index 0000000000..ae84724960 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_prefetch_buffer.h @@ -0,0 +1,471 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
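+// [Editorial note] The readahead size in this component grows geometrically:
+// after each prefetch that is served, readahead_size_ is advanced to
+// min(max_readahead_size_, 2 * readahead_size_). A compilable sketch of that
+// schedule (values are illustrative only):
+//
+//   #include <algorithm>
+//   #include <cstddef>
+//   #include <cstdio>
+//
+//   int main() {
+//     std::size_t readahead = 8 * 1024;             // initial readahead_size
+//     const std::size_t max_readahead = 64 * 1024;  // max_readahead_size
+//     for (int io = 1; io <= 6; ++io) {
+//       std::printf("IO %d: readahead=%zu\n", io, readahead);
+//       readahead = std::min(max_readahead, readahead * 2);
+//     }
+//     return 0;  // prints 8192, 16384, 32768, 65536, 65536, 65536
+//   }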
+
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include "file/readahead_file_info.h"
+#include "monitoring/statistics_impl.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "util/aligned_buffer.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#define DEFAULT_DECREMENT 8 * 1024
+
+struct IOOptions;
+class RandomAccessFileReader;
+
+struct BufferInfo {
+  AlignedBuffer buffer_;
+
+  uint64_t offset_ = 0;
+
+  // The parameters below are used in the async read flow.
+  // Length requested for in ReadAsync.
+  size_t async_req_len_ = 0;
+
+  // async_read_in_progress can be used as a mutex. The callback can update the
+  // buffer and its size, but async_read_in_progress is only set by the main
+  // thread.
+  bool async_read_in_progress_ = false;
+
+  // io_handle is allocated and used by the underlying file system in case of
+  // asynchronous reads.
+  void* io_handle_ = nullptr;
+
+  IOHandleDeleter del_fn_ = nullptr;
+
+  // pos represents the index of this buffer in the vector of BufferInfo.
+  uint32_t pos_ = 0;
+};
+
+enum class FilePrefetchBufferUsage {
+  kTableOpenPrefetchTail,
+  kUnknown,
+};
+
+// FilePrefetchBuffer is a smart buffer to store and read data from a file.
+class FilePrefetchBuffer {
+ public:
+  // Constructor.
+  //
+  // All arguments are optional.
+  // readahead_size : the initial readahead size.
+  // max_readahead_size : the maximum readahead size.
+  //   If max_readahead_size > readahead_size, the readahead size will be
+  //   doubled on every IO until max_readahead_size is hit.
+  //   Typically this is set as a multiple of readahead_size.
+  //   max_readahead_size should be greater than or equal to readahead_size.
+  // enable : controls whether reading from the buffer is enabled.
+  //   If false, TryReadFromCache() always returns false, and we only take
+  //   stats for the minimum offset if track_min_offset = true.
+  // track_min_offset : Track the minimum offset ever read and collect stats on
+  //   it. Used for adaptable readahead of the file footer/metadata.
+  // implicit_auto_readahead : Readahead is enabled implicitly by rocksdb after
+  //   two sequential scans.
+  //
+  // Automatic readahead is enabled for a file if readahead_size
+  // and max_readahead_size are passed in.
+  // A user can construct a FilePrefetchBuffer without any arguments, but use
+  // `Prefetch` to load data into the buffer.
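+  // [Editorial example] A typical construction for implicit auto-readahead,
+  // sketched below; the argument values are illustrative, not
+  // recommendations:
+  //
+  //   FilePrefetchBuffer buf(
+  //       /*readahead_size=*/8 * 1024, /*max_readahead_size=*/64 * 1024,
+  //       /*enable=*/true, /*track_min_offset=*/false,
+  //       /*implicit_auto_readahead=*/true, /*num_file_reads=*/0,
+  //       /*num_file_reads_for_auto_readahead=*/2, fs, clock, stats);
+  //
+  // With these arguments, readahead starts at 8 KiB once two sequential reads
+  // have been seen and doubles per I/O up to 64 KiB; fs, clock, and stats may
+  // be nullptr, in which case pending-I/O aborts and stats recording are
+  // skipped.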
+ FilePrefetchBuffer( + size_t readahead_size = 0, size_t max_readahead_size = 0, + bool enable = true, bool track_min_offset = false, + bool implicit_auto_readahead = false, uint64_t num_file_reads = 0, + uint64_t num_file_reads_for_auto_readahead = 0, FileSystem* fs = nullptr, + SystemClock* clock = nullptr, Statistics* stats = nullptr, + FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown) + : curr_(0), + readahead_size_(readahead_size), + initial_auto_readahead_size_(readahead_size), + max_readahead_size_(max_readahead_size), + min_offset_read_(std::numeric_limits::max()), + enable_(enable), + track_min_offset_(track_min_offset), + implicit_auto_readahead_(implicit_auto_readahead), + prev_offset_(0), + prev_len_(0), + num_file_reads_for_auto_readahead_(num_file_reads_for_auto_readahead), + num_file_reads_(num_file_reads), + explicit_prefetch_submitted_(false), + fs_(fs), + clock_(clock), + stats_(stats), + usage_(usage) { + assert((num_file_reads_ >= num_file_reads_for_auto_readahead_ + 1) || + (num_file_reads_ == 0)); + // If ReadOptions.async_io is enabled, data is asynchronously filled in + // second buffer while curr_ is being consumed. If data is overlapping in + // two buffers, data is copied to third buffer to return continuous buffer. + bufs_.resize(3); + for (uint32_t i = 0; i < 2; i++) { + bufs_[i].pos_ = i; + } + } + + ~FilePrefetchBuffer() { + // Abort any pending async read request before destroying the class object. + if (fs_ != nullptr) { + std::vector handles; + for (uint32_t i = 0; i < 2; i++) { + if (bufs_[i].async_read_in_progress_ && + bufs_[i].io_handle_ != nullptr) { + handles.emplace_back(bufs_[i].io_handle_); + } + } + if (!handles.empty()) { + StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS); + Status s = fs_->AbortIO(handles); + assert(s.ok()); + } + } + + // Prefetch buffer bytes discarded. + uint64_t bytes_discarded = 0; + // Iterated over 2 buffers. + for (int i = 0; i < 2; i++) { + int first = i; + int second = i ^ 1; + + if (DoesBufferContainData(first)) { + // If last block was read completely from first and some bytes in + // first buffer are still unconsumed. + if (prev_offset_ >= bufs_[first].offset_ && + prev_offset_ + prev_len_ < + bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize()) { + bytes_discarded += bufs_[first].buffer_.CurrentSize() - + (prev_offset_ + prev_len_ - bufs_[first].offset_); + } + // If data was in second buffer and some/whole block bytes were read + // from second buffer. + else if (prev_offset_ < bufs_[first].offset_ && + !DoesBufferContainData(second)) { + // If last block read was completely from different buffer, this + // buffer is unconsumed. + if (prev_offset_ + prev_len_ <= bufs_[first].offset_) { + bytes_discarded += bufs_[first].buffer_.CurrentSize(); + } + // If last block read overlaps with this buffer and some data is + // still unconsumed and previous buffer (second) is not cleared. + else if (prev_offset_ + prev_len_ > bufs_[first].offset_ && + bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize() == + bufs_[second].offset_) { + bytes_discarded += bufs_[first].buffer_.CurrentSize() - + (/*bytes read from this buffer=*/prev_len_ - + (bufs_[first].offset_ - prev_offset_)); + } + } + } + } + + for (uint32_t i = 0; i < 2; i++) { + // Release io_handle. + DestroyAndClearIOHandle(i); + } + RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded); + } + + bool Enabled() const { return enable_; } + + // Load data into the buffer from a file. + // reader : the file reader. 
+  // offset : the file offset to start reading from.
+  // n : the number of bytes to read.
+  // rate_limiter_priority : rate limiting priority, or `Env::IO_TOTAL` to
+  // bypass.
+  Status Prefetch(const IOOptions& opts, RandomAccessFileReader* reader,
+                  uint64_t offset, size_t n,
+                  Env::IOPriority rate_limiter_priority);
+
+  // Requests reading the data from a file asynchronously.
+  // If data already exists in the buffer, result will be updated.
+  // reader : the file reader.
+  // offset : the file offset to start reading from.
+  // n : the number of bytes to read.
+  // result : if data already exists in the buffer, result will
+  // be updated with the data.
+  //
+  // If data already exists in the buffer, it will return Status::OK;
+  // otherwise it will send an asynchronous request and return
+  // Status::TryAgain.
+  Status PrefetchAsync(const IOOptions& opts, RandomAccessFileReader* reader,
+                       uint64_t offset, size_t n, Slice* result);
+
+  // Tries returning the data for a file read from this buffer if that data is
+  // in the buffer.
+  // It handles tracking the minimum read offset if track_min_offset = true.
+  // It also performs exponential readahead when readahead_size is set as part
+  // of the constructor.
+  //
+  // opts : the IO options to use.
+  // reader : the file reader.
+  // offset : the file offset.
+  // n : the number of bytes.
+  // result : output buffer to put the data into.
+  // s : output status.
+  // rate_limiter_priority : rate limiting priority, or `Env::IO_TOTAL` to
+  // bypass.
+  // for_compaction : true if cache read is done for compaction read.
+  bool TryReadFromCache(const IOOptions& opts, RandomAccessFileReader* reader,
+                        uint64_t offset, size_t n, Slice* result, Status* s,
+                        Env::IOPriority rate_limiter_priority,
+                        bool for_compaction = false);
+
+  bool TryReadFromCacheAsync(const IOOptions& opts,
+                             RandomAccessFileReader* reader, uint64_t offset,
+                             size_t n, Slice* result, Status* status,
+                             Env::IOPriority rate_limiter_priority);
+
+  // The minimum `offset` ever passed to TryReadFromCache(). This will only be
+  // tracked if track_min_offset = true.
+  size_t min_offset_read() const { return min_offset_read_; }
+
+  size_t GetPrefetchOffset() const { return bufs_[curr_].offset_; }
+
+  // Called in case of implicit auto prefetching.
+  void UpdateReadPattern(const uint64_t& offset, const size_t& len,
+                         bool decrease_readaheadsize) {
+    if (decrease_readaheadsize) {
+      // This block was eligible for prefetch but was found in the cache, so
+      // check whether readahead_size can be decreased by 8KB (the default),
+      // if eligible.
+      DecreaseReadAheadIfEligible(offset, len);
+    }
+    prev_offset_ = offset;
+    prev_len_ = len;
+    explicit_prefetch_submitted_ = false;
+  }
+
+  void GetReadaheadState(ReadaheadFileInfo::ReadaheadInfo* readahead_info) {
+    readahead_info->readahead_size = readahead_size_;
+    readahead_info->num_file_reads = num_file_reads_;
+  }
+
+  void DecreaseReadAheadIfEligible(uint64_t offset, size_t size,
+                                   size_t value = DEFAULT_DECREMENT) {
+    // Decrease the readahead_size if
+    // - it is enabled internally by RocksDB (implicit_auto_readahead_) and,
+    // - readahead_size is greater than 0 and,
+    // - this block would have called the prefetch API if not found in the
+    //   cache, for which the conditions are:
+    //   - few/no bytes are in the buffer and,
+    //   - the block is sequential with the previous read and,
+    //   - num_file_reads_ + 1 (including this read) >
+    //     num_file_reads_for_auto_readahead_
+    size_t curr_size = bufs_[curr_].async_read_in_progress_
+                           ? bufs_[curr_].async_req_len_
+                           : bufs_[curr_].buffer_.CurrentSize();
+    if (implicit_auto_readahead_ && readahead_size_ > 0) {
+      if ((offset + size > bufs_[curr_].offset_ + curr_size) &&
+          IsBlockSequential(offset) &&
+          (num_file_reads_ + 1 > num_file_reads_for_auto_readahead_)) {
+        readahead_size_ =
+            std::max(initial_auto_readahead_size_,
+                     (readahead_size_ >= value ? readahead_size_ - value : 0));
+      }
+    }
+  }
+
+  // Callback function passed to the underlying FS in case of asynchronous
+  // reads.
+  void PrefetchAsyncCallback(const FSReadRequest& req, void* cb_arg);
+
+ private:
+  // Calculates the rounded-off offset and length to be prefetched based on
+  // alignment and the data present in buffer_. It also allocates a new buffer
+  // or refits the tail if required.
+  void CalculateOffsetAndLen(size_t alignment, uint64_t offset,
+                             size_t roundup_len, uint32_t index,
+                             bool refit_tail, uint64_t& chunk_len);
+
+  void AbortIOIfNeeded(uint64_t offset);
+
+  void AbortAllIOs();
+
+  void UpdateBuffersIfNeeded(uint64_t offset);
+
+  // Calls the Poll API if there is any pending asynchronous request. It then
+  // checks if the data is in any buffer, clears the outdated data, and swaps
+  // the buffers if required.
+  void PollAndUpdateBuffersIfNeeded(uint64_t offset);
+
+  Status PrefetchAsyncInternal(const IOOptions& opts,
+                               RandomAccessFileReader* reader, uint64_t offset,
+                               size_t length, size_t readahead_size,
+                               Env::IOPriority rate_limiter_priority,
+                               bool& copy_to_third_buffer);
+
+  Status Read(const IOOptions& opts, RandomAccessFileReader* reader,
+              Env::IOPriority rate_limiter_priority, uint64_t read_len,
+              uint64_t chunk_len, uint64_t rounddown_start, uint32_t index);
+
+  Status ReadAsync(const IOOptions& opts, RandomAccessFileReader* reader,
+                   uint64_t read_len, uint64_t rounddown_start, uint32_t index);
+
+  // Copy the data from src to the third buffer.
+  void CopyDataToBuffer(uint32_t src, uint64_t& offset, size_t& length);
+
+  bool IsBlockSequential(const size_t& offset) {
+    return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset));
+  }
+
+  // Called in case of implicit auto prefetching.
+  void ResetValues() {
+    num_file_reads_ = 1;
+    readahead_size_ = initial_auto_readahead_size_;
+  }
+
+  // Called in case of implicit auto prefetching.
+  bool IsEligibleForPrefetch(uint64_t offset, size_t n) {
+    // Prefetch only if this read is sequential; otherwise reset
+    // readahead_size_ to its initial value.
+    if (!IsBlockSequential(offset)) {
+      UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
+      ResetValues();
+      return false;
+    }
+    num_file_reads_++;
+
+    // Since the async request was submitted directly by the last call to
+    // PrefetchAsync, skip the num_file_reads_ check: this call only polls for
+    // the data submitted in the previous call.
+    if (explicit_prefetch_submitted_) {
+      return true;
+    }
+    if (num_file_reads_ <= num_file_reads_for_auto_readahead_) {
+      UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
+      return false;
+    }
+    return true;
+  }
+
+  // Helper functions.
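+  // [Editorial note] Worked example of the IsEligibleForPrefetch() gate
+  // above, assuming num_file_reads_for_auto_readahead_ == 2 and starting
+  // from num_file_reads_ == 0:
+  //
+  //   Read 1: offset=0,    n=100 -> sequential (prev_len_ == 0),
+  //                                 num_file_reads_ = 1 -> not eligible
+  //   Read 2: offset=100,  n=100 -> prev_offset_ + prev_len_ == offset,
+  //                                 num_file_reads_ = 2 -> not eligible
+  //   Read 3: offset=200,  n=100 -> sequential,
+  //                                 num_file_reads_ = 3 > 2 -> eligible
+  //   Read 4: offset=5000, n=100 -> non-sequential -> ResetValues(),
+  //                                 num_file_reads_ = 1 -> not eligible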
+ bool IsDataBlockInBuffer(uint64_t offset, size_t length, uint32_t index) { + return (offset >= bufs_[index].offset_ && + offset + length <= + bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()); + } + bool IsOffsetInBuffer(uint64_t offset, uint32_t index) { + return (offset >= bufs_[index].offset_ && + offset < bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()); + } + bool DoesBufferContainData(uint32_t index) { + return bufs_[index].buffer_.CurrentSize() > 0; + } + bool IsBufferOutdated(uint64_t offset, uint32_t index) { + return ( + !bufs_[index].async_read_in_progress_ && DoesBufferContainData(index) && + offset >= bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()); + } + bool IsBufferOutdatedWithAsyncProgress(uint64_t offset, uint32_t index) { + return (bufs_[index].async_read_in_progress_ && + bufs_[index].io_handle_ != nullptr && + offset >= bufs_[index].offset_ + bufs_[index].async_req_len_); + } + bool IsOffsetInBufferWithAsyncProgress(uint64_t offset, uint32_t index) { + return (bufs_[index].async_read_in_progress_ && + offset >= bufs_[index].offset_ && + offset < bufs_[index].offset_ + bufs_[index].async_req_len_); + } + + bool IsSecondBuffEligibleForPrefetching() { + uint32_t second = curr_ ^ 1; + if (bufs_[second].async_read_in_progress_) { + return false; + } + assert(!bufs_[curr_].async_read_in_progress_); + + if (DoesBufferContainData(curr_) && DoesBufferContainData(second) && + (bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() == + bufs_[second].offset_)) { + return false; + } + bufs_[second].buffer_.Clear(); + return true; + } + + void DestroyAndClearIOHandle(uint32_t index) { + if (bufs_[index].io_handle_ != nullptr && bufs_[index].del_fn_ != nullptr) { + bufs_[index].del_fn_(bufs_[index].io_handle_); + bufs_[index].io_handle_ = nullptr; + bufs_[index].del_fn_ = nullptr; + } + bufs_[index].async_read_in_progress_ = false; + } + + Status HandleOverlappingData(const IOOptions& opts, + RandomAccessFileReader* reader, uint64_t offset, + size_t length, size_t readahead_size, + Env::IOPriority rate_limiter_priority, + bool& copy_to_third_buffer, uint64_t& tmp_offset, + size_t& tmp_length); + + bool TryReadFromCacheUntracked(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, Slice* result, + Status* s, + Env::IOPriority rate_limiter_priority, + bool for_compaction = false); + + bool TryReadFromCacheAsyncUntracked(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, Slice* result, + Status* status, + Env::IOPriority rate_limiter_priority); + + std::vector bufs_; + // curr_ represents the index for bufs_ indicating which buffer is being + // consumed currently. + uint32_t curr_; + + size_t readahead_size_; + size_t initial_auto_readahead_size_; + // FilePrefetchBuffer object won't be created from Iterator flow if + // max_readahead_size_ = 0. + size_t max_readahead_size_; + + // The minimum `offset` ever passed to TryReadFromCache(). + size_t min_offset_read_; + // if false, TryReadFromCache() always return false, and we only take stats + // for track_min_offset_ if track_min_offset_ = true + bool enable_; + // If true, track minimum `offset` ever passed to TryReadFromCache(), which + // can be fetched from min_offset_read(). + bool track_min_offset_; + + // implicit_auto_readahead is enabled by rocksdb internally after 2 + // sequential IOs. 
+ bool implicit_auto_readahead_; + uint64_t prev_offset_; + size_t prev_len_; + // num_file_reads_ and num_file_reads_for_auto_readahead_ is only used when + // implicit_auto_readahead_ is set. + uint64_t num_file_reads_for_auto_readahead_; + uint64_t num_file_reads_; + + // If explicit_prefetch_submitted_ is set then it indicates RocksDB called + // PrefetchAsync to submit request. It needs to call TryReadFromCacheAsync to + // poll the submitted request without checking if data is sequential and + // num_file_reads_. + bool explicit_prefetch_submitted_; + + FileSystem* fs_; + SystemClock* clock_; + Statistics* stats_; + + FilePrefetchBufferUsage usage_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_util.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_util.cc new file mode 100644 index 0000000000..46faac67cb --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_util.cc @@ -0,0 +1,277 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "file/file_util.h" + +#include +#include + +#include "file/random_access_file_reader.h" +#include "file/sequence_file_reader.h" +#include "file/sst_file_manager_impl.h" +#include "file/writable_file_writer.h" +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +// Utility function to copy a file up to a specified length +IOStatus CopyFile(FileSystem* fs, const std::string& source, + std::unique_ptr& dest_writer, + uint64_t size, bool use_fsync, + const std::shared_ptr& io_tracer, + const Temperature temperature) { + FileOptions soptions; + IOStatus io_s; + std::unique_ptr src_reader; + + { + soptions.temperature = temperature; + std::unique_ptr srcfile; + io_s = fs->NewSequentialFile(source, soptions, &srcfile, nullptr); + if (!io_s.ok()) { + return io_s; + } + + if (size == 0) { + // default argument means copy everything + io_s = fs->GetFileSize(source, IOOptions(), &size, nullptr); + if (!io_s.ok()) { + return io_s; + } + } + src_reader.reset( + new SequentialFileReader(std::move(srcfile), source, io_tracer)); + } + + char buffer[4096]; + Slice slice; + while (size > 0) { + size_t bytes_to_read = std::min(sizeof(buffer), static_cast(size)); + // TODO: rate limit copy file + io_s = status_to_io_status( + src_reader->Read(bytes_to_read, &slice, buffer, + Env::IO_TOTAL /* rate_limiter_priority */)); + if (!io_s.ok()) { + return io_s; + } + if (slice.size() == 0) { + return IOStatus::Corruption("file too small"); + } + io_s = dest_writer->Append(slice); + if (!io_s.ok()) { + return io_s; + } + size -= slice.size(); + } + return dest_writer->Sync(use_fsync); +} + +IOStatus CopyFile(FileSystem* fs, const std::string& source, + const std::string& destination, uint64_t size, bool use_fsync, + const std::shared_ptr& io_tracer, + const Temperature temperature) { + FileOptions options; + IOStatus io_s; + std::unique_ptr dest_writer; + + { + options.temperature = temperature; + std::unique_ptr destfile; + io_s = fs->NewWritableFile(destination, options, &destfile, nullptr); + if (!io_s.ok()) { + return io_s; + } + + dest_writer.reset( + new WritableFileWriter(std::move(destfile), destination, options)); + } + + return CopyFile(fs, source, dest_writer, size, use_fsync, io_tracer, + temperature); +} + +// Utility function to create a file with 
the provided contents +IOStatus CreateFile(FileSystem* fs, const std::string& destination, + const std::string& contents, bool use_fsync) { + const EnvOptions soptions; + IOStatus io_s; + std::unique_ptr dest_writer; + + std::unique_ptr destfile; + io_s = fs->NewWritableFile(destination, soptions, &destfile, nullptr); + if (!io_s.ok()) { + return io_s; + } + dest_writer.reset( + new WritableFileWriter(std::move(destfile), destination, soptions)); + io_s = dest_writer->Append(Slice(contents)); + if (!io_s.ok()) { + return io_s; + } + return dest_writer->Sync(use_fsync); +} + +Status DeleteDBFile(const ImmutableDBOptions* db_options, + const std::string& fname, const std::string& dir_to_sync, + const bool force_bg, const bool force_fg) { + SstFileManagerImpl* sfm = + static_cast(db_options->sst_file_manager.get()); + if (sfm && !force_fg) { + return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg); + } else { + return db_options->env->DeleteFile(fname); + } +} + +// requested_checksum_func_name brings the function name of the checksum +// generator in checksum_factory. Empty string is permitted, in which case the +// name of the generator created by the factory is unchecked. When +// `requested_checksum_func_name` is non-empty, however, the created generator's +// name must match it, otherwise an `InvalidArgument` error is returned. +IOStatus GenerateOneFileChecksum( + FileSystem* fs, const std::string& file_path, + FileChecksumGenFactory* checksum_factory, + const std::string& requested_checksum_func_name, std::string* file_checksum, + std::string* file_checksum_func_name, + size_t verify_checksums_readahead_size, bool /*allow_mmap_reads*/, + std::shared_ptr& io_tracer, RateLimiter* rate_limiter, + Env::IOPriority rate_limiter_priority) { + if (checksum_factory == nullptr) { + return IOStatus::InvalidArgument("Checksum factory is invalid"); + } + assert(file_checksum != nullptr); + assert(file_checksum_func_name != nullptr); + + FileChecksumGenContext gen_context; + gen_context.requested_checksum_func_name = requested_checksum_func_name; + gen_context.file_name = file_path; + std::unique_ptr checksum_generator = + checksum_factory->CreateFileChecksumGenerator(gen_context); + if (checksum_generator == nullptr) { + std::string msg = + "Cannot get the file checksum generator based on the requested " + "checksum function name: " + + requested_checksum_func_name + + " from checksum factory: " + checksum_factory->Name(); + return IOStatus::InvalidArgument(msg); + } else { + // For backward compatibility and use in file ingestion clients where there + // is no stored checksum function name, `requested_checksum_func_name` can + // be empty. If we give the requested checksum function name, we expect it + // is the same name of the checksum generator. 
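+    // [Editorial note] The generator contract used below (Update over
+    // arbitrary chunk boundaries, then Finalize, then GetChecksum) can be
+    // sketched with a toy byte-sum generator; illustrative only, not a real
+    // RocksDB checksum:
+    //
+    //   struct ToySum {
+    //     uint64_t sum = 0;
+    //     const char* Name() const { return "ToySum"; }
+    //     void Update(const char* p, size_t n) {
+    //       while (n--) sum += static_cast<unsigned char>(*p++);
+    //     }
+    //     void Finalize() {}  // nothing to fold for a plain sum
+    //     std::string GetChecksum() const { return std::to_string(sum); }
+    //   };
+    //
+    //   // Feeding "abc" as "ab" + "c" gives the same result as one
+    //   // Update("abc", 3) call: 97 + 98 + 99 == 294.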
+ if (!requested_checksum_func_name.empty() && + checksum_generator->Name() != requested_checksum_func_name) { + std::string msg = "Expected file checksum generator named '" + + requested_checksum_func_name + + "', while the factory created one " + "named '" + + checksum_generator->Name() + "'"; + return IOStatus::InvalidArgument(msg); + } + } + + uint64_t size; + IOStatus io_s; + std::unique_ptr reader; + { + std::unique_ptr r_file; + io_s = fs->NewRandomAccessFile(file_path, FileOptions(), &r_file, nullptr); + if (!io_s.ok()) { + return io_s; + } + io_s = fs->GetFileSize(file_path, IOOptions(), &size, nullptr); + if (!io_s.ok()) { + return io_s; + } + reader.reset(new RandomAccessFileReader( + std::move(r_file), file_path, nullptr /*Env*/, io_tracer, nullptr, + Histograms::HISTOGRAM_ENUM_MAX, nullptr, rate_limiter)); + } + + // Found that 256 KB readahead size provides the best performance, based on + // experiments, for auto readahead. Experiment data is in PR #3282. + size_t default_max_read_ahead_size = 256 * 1024; + size_t readahead_size = (verify_checksums_readahead_size != 0) + ? verify_checksums_readahead_size + : default_max_read_ahead_size; + std::unique_ptr buf; + if (reader->use_direct_io()) { + size_t alignment = reader->file()->GetRequiredBufferAlignment(); + readahead_size = (readahead_size + alignment - 1) & ~(alignment - 1); + } + buf.reset(new char[readahead_size]); + + Slice slice; + uint64_t offset = 0; + IOOptions opts; + while (size > 0) { + size_t bytes_to_read = + static_cast(std::min(uint64_t{readahead_size}, size)); + io_s = reader->Read(opts, offset, bytes_to_read, &slice, buf.get(), nullptr, + rate_limiter_priority); + if (!io_s.ok()) { + return IOStatus::Corruption("file read failed with error: " + + io_s.ToString()); + } + if (slice.size() == 0) { + return IOStatus::Corruption("file too small"); + } + checksum_generator->Update(slice.data(), slice.size()); + size -= slice.size(); + offset += slice.size(); + + TEST_SYNC_POINT("GenerateOneFileChecksum::Chunk:0"); + } + checksum_generator->Finalize(); + *file_checksum = checksum_generator->GetChecksum(); + *file_checksum_func_name = checksum_generator->Name(); + return IOStatus::OK(); +} + +Status DestroyDir(Env* env, const std::string& dir) { + Status s; + if (env->FileExists(dir).IsNotFound()) { + return s; + } + std::vector files_in_dir; + s = env->GetChildren(dir, &files_in_dir); + if (s.ok()) { + for (auto& file_in_dir : files_in_dir) { + std::string path = dir + "/" + file_in_dir; + bool is_dir = false; + s = env->IsDirectory(path, &is_dir); + if (s.ok()) { + if (is_dir) { + s = DestroyDir(env, path); + } else { + s = env->DeleteFile(path); + } + } else if (s.IsNotSupported()) { + s = Status::OK(); + } + if (!s.ok()) { + // IsDirectory, etc. 
might not report NotFound
+        if (s.IsNotFound() || env->FileExists(path).IsNotFound()) {
+          // Allow files to be deleted externally
+          s = Status::OK();
+        } else {
+          break;
+        }
+      }
+    }
+  }
+
+  if (s.ok()) {
+    s = env->DeleteDir(dir);
+    // DeleteDir might or might not report NotFound
+    if (!s.ok() && (s.IsNotFound() || env->FileExists(dir).IsNotFound())) {
+      // Allow the directory to be deleted externally
+      s = Status::OK();
+    }
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_util.h
new file mode 100644
index 0000000000..e279cfba0a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/file_util.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include 
+
+#include "file/filename.h"
+#include "options/db_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/types.h"
+#include "trace_replay/io_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+// use_fsync maps to options.use_fsync, which determines the way that
+// the file is synced after copying.
+extern IOStatus CopyFile(FileSystem* fs, const std::string& source,
+                         std::unique_ptr<WritableFileWriter>& dest_writer,
+                         uint64_t size, bool use_fsync,
+                         const std::shared_ptr<IOTracer>& io_tracer,
+                         const Temperature temperature);
+extern IOStatus CopyFile(FileSystem* fs, const std::string& source,
+                         const std::string& destination, uint64_t size,
+                         bool use_fsync,
+                         const std::shared_ptr<IOTracer>& io_tracer,
+                         const Temperature temperature);
+inline IOStatus CopyFile(const std::shared_ptr<FileSystem>& fs,
+                         const std::string& source,
+                         const std::string& destination, uint64_t size,
+                         bool use_fsync,
+                         const std::shared_ptr<IOTracer>& io_tracer,
+                         const Temperature temperature) {
+  return CopyFile(fs.get(), source, destination, size, use_fsync, io_tracer,
+                  temperature);
+}
+extern IOStatus CreateFile(FileSystem* fs, const std::string& destination,
+                           const std::string& contents, bool use_fsync);
+
+inline IOStatus CreateFile(const std::shared_ptr<FileSystem>& fs,
+                           const std::string& destination,
+                           const std::string& contents, bool use_fsync) {
+  return CreateFile(fs.get(), destination, contents, use_fsync);
+}
+
+extern Status DeleteDBFile(const ImmutableDBOptions* db_options,
+                           const std::string& fname,
+                           const std::string& path_to_sync, const bool force_bg,
+                           const bool force_fg);
+
+extern IOStatus GenerateOneFileChecksum(
+    FileSystem* fs, const std::string& file_path,
+    FileChecksumGenFactory* checksum_factory,
+    const std::string& requested_checksum_func_name, std::string* file_checksum,
+    std::string* file_checksum_func_name,
+    size_t verify_checksums_readahead_size, bool allow_mmap_reads,
+    std::shared_ptr<IOTracer>& io_tracer, RateLimiter* rate_limiter,
+    Env::IOPriority rate_limiter_priority);
+
+inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro,
+                                         SystemClock* clock, IOOptions& opts) {
+  if (ro.deadline.count()) {
+    std::chrono::microseconds now =
+        std::chrono::microseconds(clock->NowMicros());
+    // Ensure there is at least 1us available.
We don't want to pass a value of + // 0 as that means no timeout + if (now >= ro.deadline) { + return IOStatus::TimedOut("Deadline exceeded"); + } + opts.timeout = ro.deadline - now; + } + + if (ro.io_timeout.count() && + (!opts.timeout.count() || ro.io_timeout < opts.timeout)) { + opts.timeout = ro.io_timeout; + } + + opts.rate_limiter_priority = ro.rate_limiter_priority; + opts.io_activity = ro.io_activity; + + return IOStatus::OK(); +} + +// Test method to delete the input directory and all of its contents. +// This method is destructive and is meant for use only in tests!!! +Status DestroyDir(Env* env, const std::string& dir); +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/filename.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/filename.cc new file mode 100644 index 0000000000..1e04c73395 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/filename.cc @@ -0,0 +1,523 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "file/filename.h" + +#include +#include + +#include +#include + +#include "file/writable_file_writer.h" +#include "rocksdb/env.h" +#include "test_util/sync_point.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +const std::string kCurrentFileName = "CURRENT"; +const std::string kOptionsFileNamePrefix = "OPTIONS-"; +const std::string kTempFileNameSuffix = "dbtmp"; + +static const std::string kRocksDbTFileExt = "sst"; +static const std::string kLevelDbTFileExt = "ldb"; +static const std::string kRocksDBBlobFileExt = "blob"; +static const std::string kArchivalDirName = "archive"; + +// Given a path, flatten the path name by replacing all chars not in +// {[0-9,a-z,A-Z,-,_,.]} with _. And append '_LOG\0' at the end. +// Return the number of chars stored in dest not including the trailing '\0'. +static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) { + const char suffix[] = "_LOG"; + + size_t write_idx = 0; + size_t i = 0; + size_t src_len = path.size(); + + while (i < src_len && write_idx < len - sizeof(suffix)) { + if ((path[i] >= 'a' && path[i] <= 'z') || + (path[i] >= '0' && path[i] <= '9') || + (path[i] >= 'A' && path[i] <= 'Z') || path[i] == '-' || + path[i] == '.' 
|| path[i] == '_') { + dest[write_idx++] = path[i]; + } else { + if (i > 0) { + dest[write_idx++] = '_'; + } + } + i++; + } + assert(sizeof(suffix) <= len - write_idx); + // "\0" is automatically added by snprintf + snprintf(dest + write_idx, len - write_idx, suffix); + write_idx += sizeof(suffix) - 1; + return write_idx; +} + +static std::string MakeFileName(uint64_t number, const char* suffix) { + char buf[100]; + snprintf(buf, sizeof(buf), "%06llu.%s", + static_cast(number), suffix); + return buf; +} + +static std::string MakeFileName(const std::string& name, uint64_t number, + const char* suffix) { + return name + "/" + MakeFileName(number, suffix); +} + +std::string LogFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name, number, "log"); +} + +std::string LogFileName(uint64_t number) { + assert(number > 0); + return MakeFileName(number, "log"); +} + +std::string BlobFileName(uint64_t number) { + assert(number > 0); + return MakeFileName(number, kRocksDBBlobFileExt.c_str()); +} + +std::string BlobFileName(const std::string& blobdirname, uint64_t number) { + assert(number > 0); + return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str()); +} + +std::string BlobFileName(const std::string& dbname, const std::string& blob_dir, + uint64_t number) { + assert(number > 0); + return MakeFileName(dbname + "/" + blob_dir, number, + kRocksDBBlobFileExt.c_str()); +} + +std::string ArchivalDirectory(const std::string& dir) { + return dir + "/" + kArchivalDirName; +} +std::string ArchivedLogFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name + "/" + kArchivalDirName, number, "log"); +} + +std::string MakeTableFileName(const std::string& path, uint64_t number) { + return MakeFileName(path, number, kRocksDbTFileExt.c_str()); +} + +std::string MakeTableFileName(uint64_t number) { + return MakeFileName(number, kRocksDbTFileExt.c_str()); +} + +std::string Rocks2LevelTableFileName(const std::string& fullname) { + assert(fullname.size() > kRocksDbTFileExt.size() + 1); + if (fullname.size() <= kRocksDbTFileExt.size() + 1) { + return ""; + } + return fullname.substr(0, fullname.size() - kRocksDbTFileExt.size()) + + kLevelDbTFileExt; +} + +uint64_t TableFileNameToNumber(const std::string& name) { + uint64_t number = 0; + uint64_t base = 1; + int pos = static_cast(name.find_last_of('.')); + while (--pos >= 0 && name[pos] >= '0' && name[pos] <= '9') { + number += (name[pos] - '0') * base; + base *= 10; + } + return number; +} + +std::string TableFileName(const std::vector& db_paths, uint64_t number, + uint32_t path_id) { + assert(number > 0); + std::string path; + if (path_id >= db_paths.size()) { + path = db_paths.back().path; + } else { + path = db_paths[path_id].path; + } + return MakeTableFileName(path, number); +} + +void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf, + size_t out_buf_size) { + if (path_id == 0) { + snprintf(out_buf, out_buf_size, "%" PRIu64, number); + } else { + snprintf(out_buf, out_buf_size, + "%" PRIu64 + "(path " + "%" PRIu32 ")", + number, path_id); + } +} + +std::string DescriptorFileName(uint64_t number) { + assert(number > 0); + char buf[100]; + snprintf(buf, sizeof(buf), "MANIFEST-%06llu", + static_cast(number)); + return buf; +} + +std::string DescriptorFileName(const std::string& dbname, uint64_t number) { + return dbname + "/" + DescriptorFileName(number); +} + +std::string CurrentFileName(const std::string& dbname) { + return dbname + "/" + 
kCurrentFileName; +} + +std::string LockFileName(const std::string& dbname) { return dbname + "/LOCK"; } + +std::string TempFileName(const std::string& dbname, uint64_t number) { + return MakeFileName(dbname, number, kTempFileNameSuffix.c_str()); +} + +InfoLogPrefix::InfoLogPrefix(bool has_log_dir, + const std::string& db_absolute_path) { + if (!has_log_dir) { + const char kInfoLogPrefix[] = "LOG"; + // "\0" is automatically added to the end + snprintf(buf, sizeof(buf), kInfoLogPrefix); + prefix = Slice(buf, sizeof(kInfoLogPrefix) - 1); + } else { + size_t len = + GetInfoLogPrefix(NormalizePath(db_absolute_path), buf, sizeof(buf)); + prefix = Slice(buf, len); + } +} + +std::string InfoLogFileName(const std::string& dbname, + const std::string& db_path, + const std::string& log_dir) { + if (log_dir.empty()) { + return dbname + "/LOG"; + } + + InfoLogPrefix info_log_prefix(true, db_path); + return log_dir + "/" + info_log_prefix.buf; +} + +// Return the name of the old info log file for "dbname". +std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, + const std::string& db_path, + const std::string& log_dir) { + char buf[50]; + snprintf(buf, sizeof(buf), "%llu", static_cast(ts)); + + if (log_dir.empty()) { + return dbname + "/LOG.old." + buf; + } + + InfoLogPrefix info_log_prefix(true, db_path); + return log_dir + "/" + info_log_prefix.buf + ".old." + buf; +} + +std::string OptionsFileName(uint64_t file_num) { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%s%06" PRIu64, + kOptionsFileNamePrefix.c_str(), file_num); + return buffer; +} +std::string OptionsFileName(const std::string& dbname, uint64_t file_num) { + return dbname + "/" + OptionsFileName(file_num); +} + +std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%s%06" PRIu64 ".%s", + kOptionsFileNamePrefix.c_str(), file_num, + kTempFileNameSuffix.c_str()); + return dbname + "/" + buffer; +} + +std::string MetaDatabaseName(const std::string& dbname, uint64_t number) { + char buf[100]; + snprintf(buf, sizeof(buf), "/METADB-%llu", + static_cast(number)); + return dbname + buf; +} + +std::string IdentityFileName(const std::string& dbname) { + return dbname + "/IDENTITY"; +} + +// Owned filenames have the form: +// dbname/IDENTITY +// dbname/CURRENT +// dbname/LOCK +// dbname/ +// dbname/.old.[0-9]+ +// dbname/MANIFEST-[0-9]+ +// dbname/[0-9]+.(log|sst|blob) +// dbname/METADB-[0-9]+ +// dbname/OPTIONS-[0-9]+ +// dbname/OPTIONS-[0-9]+.dbtmp +// Disregards / at the beginning +bool ParseFileName(const std::string& fname, uint64_t* number, FileType* type, + WalFileType* log_type) { + return ParseFileName(fname, number, "", type, log_type); +} + +bool ParseFileName(const std::string& fname, uint64_t* number, + const Slice& info_log_name_prefix, FileType* type, + WalFileType* log_type) { + Slice rest(fname); + if (fname.length() > 1 && fname[0] == '/') { + rest.remove_prefix(1); + } + if (rest == "IDENTITY") { + *number = 0; + *type = kIdentityFile; + } else if (rest == "CURRENT") { + *number = 0; + *type = kCurrentFile; + } else if (rest == "LOCK") { + *number = 0; + *type = kDBLockFile; + } else if (info_log_name_prefix.size() > 0 && + rest.starts_with(info_log_name_prefix)) { + rest.remove_prefix(info_log_name_prefix.size()); + if (rest == "" || rest == ".old") { + *number = 0; + *type = kInfoLogFile; + } else if (rest.starts_with(".old.")) { + uint64_t ts_suffix; + // sizeof also counts the trailing '\0'. 
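+      // [Editorial note] e.g. sizeof(".old.") == 6: five characters plus the
+      // terminating '\0', hence the "- 1" below.
+      static_assert(sizeof(".old.") - 1 == 5, "prefix length without the NUL");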
+ rest.remove_prefix(sizeof(".old.") - 1); + if (!ConsumeDecimalNumber(&rest, &ts_suffix)) { + return false; + } + *number = ts_suffix; + *type = kInfoLogFile; + } + } else if (rest.starts_with("MANIFEST-")) { + rest.remove_prefix(strlen("MANIFEST-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kDescriptorFile; + *number = num; + } else if (rest.starts_with("METADB-")) { + rest.remove_prefix(strlen("METADB-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kMetaDatabase; + *number = num; + } else if (rest.starts_with(kOptionsFileNamePrefix)) { + uint64_t ts_suffix; + bool is_temp_file = false; + rest.remove_prefix(kOptionsFileNamePrefix.size()); + const std::string kTempFileNameSuffixWithDot = + std::string(".") + kTempFileNameSuffix; + if (rest.ends_with(kTempFileNameSuffixWithDot)) { + rest.remove_suffix(kTempFileNameSuffixWithDot.size()); + is_temp_file = true; + } + if (!ConsumeDecimalNumber(&rest, &ts_suffix)) { + return false; + } + *number = ts_suffix; + *type = is_temp_file ? kTempFile : kOptionsFile; + } else { + // Avoid strtoull() to keep filename format independent of the + // current locale + bool archive_dir_found = false; + if (rest.starts_with(kArchivalDirName)) { + if (rest.size() <= kArchivalDirName.size()) { + return false; + } + rest.remove_prefix(kArchivalDirName.size() + + 1); // Add 1 to remove / also + if (log_type) { + *log_type = kArchivedLogFile; + } + archive_dir_found = true; + } + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (rest.size() <= 1 || rest[0] != '.') { + return false; + } + rest.remove_prefix(1); + + Slice suffix = rest; + if (suffix == Slice("log")) { + *type = kWalFile; + if (log_type && !archive_dir_found) { + *log_type = kAliveLogFile; + } + } else if (archive_dir_found) { + return false; // Archive dir can contain only log files + } else if (suffix == Slice(kRocksDbTFileExt) || + suffix == Slice(kLevelDbTFileExt)) { + *type = kTableFile; + } else if (suffix == Slice(kRocksDBBlobFileExt)) { + *type = kBlobFile; + } else if (suffix == Slice(kTempFileNameSuffix)) { + *type = kTempFile; + } else { + return false; + } + *number = num; + } + return true; +} + +IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, + uint64_t descriptor_number, + FSDirectory* dir_contains_current_file) { + // Remove leading "dbname/" and add newline to manifest file name + std::string manifest = DescriptorFileName(dbname, descriptor_number); + Slice contents = manifest; + assert(contents.starts_with(dbname + "/")); + contents.remove_prefix(dbname.size() + 1); + std::string tmp = TempFileName(dbname, descriptor_number); + IOStatus s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true); + TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s); + if (s.ok()) { + TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:0", REDUCE_ODDS2); + s = fs->RenameFile(tmp, CurrentFileName(dbname), IOOptions(), nullptr); + TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:1", REDUCE_ODDS2); + TEST_SYNC_POINT_CALLBACK("SetCurrentFile:AfterRename", &s); + } + if (s.ok()) { + if (dir_contains_current_file != nullptr) { + s = dir_contains_current_file->FsyncWithDirOptions( + IOOptions(), nullptr, DirFsyncOptions(CurrentFileName(dbname))); + } + } else { + fs->DeleteFile(tmp, IOOptions(), nullptr) + .PermitUncheckedError(); // NOTE: PermitUncheckedError is acceptable 
+                              // here as we are already handling an error
+                              // case, and this is just a best-attempt
+                              // effort at some cleanup
+  }
+  return s;
+}
+
+Status SetIdentityFile(Env* env, const std::string& dbname,
+                       const std::string& db_id) {
+  std::string id;
+  if (db_id.empty()) {
+    id = env->GenerateUniqueId();
+  } else {
+    id = db_id;
+  }
+  assert(!id.empty());
+  // Reserve the filename dbname/000000.dbtmp for the temporary identity file
+  std::string tmp = TempFileName(dbname, 0);
+  std::string identify_file_name = IdentityFileName(dbname);
+  Status s = WriteStringToFile(env, id, tmp, true);
+  if (s.ok()) {
+    s = env->RenameFile(tmp, identify_file_name);
+  }
+  std::unique_ptr<FSDirectory> dir_obj;
+  if (s.ok()) {
+    s = env->GetFileSystem()->NewDirectory(dbname, IOOptions(), &dir_obj,
+                                           nullptr);
+  }
+  if (s.ok()) {
+    s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr,
+                                     DirFsyncOptions(identify_file_name));
+  }
+
+  // The default Close() could return "NotSupported" and we bypass it
+  // if it is not implemented. Detailed explanations can be found in
+  // db/db_impl/db_impl.h
+  if (s.ok()) {
+    Status temp_s = dir_obj->Close(IOOptions(), nullptr);
+    if (!temp_s.ok()) {
+      if (temp_s.IsNotSupported()) {
+        temp_s.PermitUncheckedError();
+      } else {
+        s = temp_s;
+      }
+    }
+  }
+  if (!s.ok()) {
+    env->DeleteFile(tmp).PermitUncheckedError();
+  }
+  return s;
+}
+
+IOStatus SyncManifest(const ImmutableDBOptions* db_options,
+                      WritableFileWriter* file) {
+  TEST_KILL_RANDOM_WITH_WEIGHT("SyncManifest:0", REDUCE_ODDS2);
+  StopWatch sw(db_options->clock, db_options->stats, MANIFEST_FILE_SYNC_MICROS);
+  return file->Sync(db_options->use_fsync);
+}
+
+Status GetInfoLogFiles(const std::shared_ptr<FileSystem>& fs,
+                       const std::string& db_log_dir, const std::string& dbname,
+                       std::string* parent_dir,
+                       std::vector<std::string>* info_log_list) {
+  assert(parent_dir != nullptr);
+  assert(info_log_list != nullptr);
+  uint64_t number = 0;
+  FileType type = kWalFile;
+
+  if (!db_log_dir.empty()) {
+    *parent_dir = db_log_dir;
+  } else {
+    *parent_dir = dbname;
+  }
+
+  InfoLogPrefix info_log_prefix(!db_log_dir.empty(), dbname);
+
+  std::vector<std::string> file_names;
+  Status s = fs->GetChildren(*parent_dir, IOOptions(), &file_names, nullptr);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (auto& f : file_names) {
+    if (ParseFileName(f, &number, info_log_prefix.prefix, &type) &&
+        (type == kInfoLogFile)) {
+      info_log_list->push_back(f);
+    }
+  }
+  return Status::OK();
+}
+
+std::string NormalizePath(const std::string& path) {
+  std::string dst;
+
+  if (path.length() > 2 && path[0] == kFilePathSeparator &&
+      path[1] == kFilePathSeparator) {  // Handle UNC names
+    dst.append(2, kFilePathSeparator);
+  }
+
+  for (auto c : path) {
+    if (!dst.empty() && (c == kFilePathSeparator || c == '/') &&
+        (dst.back() == kFilePathSeparator || dst.back() == '/')) {
+      continue;
+    }
+    dst.push_back(c);
+  }
+  return dst;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/filename.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/filename.h
new file mode 100644
index 0000000000..2eb125b6a1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/filename.h
@@ -0,0 +1,188 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// File names used by DB code + +#pragma once +#include + +#include +#include +#include + +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/transaction_log.h" + +namespace ROCKSDB_NAMESPACE { + +class Env; +class Directory; +class SystemClock; +class WritableFileWriter; + +#ifdef OS_WIN +constexpr char kFilePathSeparator = '\\'; +#else +constexpr char kFilePathSeparator = '/'; +#endif + +// Return the name of the log file with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string LogFileName(const std::string& dbname, uint64_t number); + +extern std::string LogFileName(uint64_t number); + +extern std::string BlobFileName(uint64_t number); + +extern std::string BlobFileName(const std::string& bdirname, uint64_t number); + +extern std::string BlobFileName(const std::string& dbname, + const std::string& blob_dir, uint64_t number); + +extern std::string ArchivalDirectory(const std::string& dbname); + +// Return the name of the archived log file with the specified number +// in the db named by "dbname". The result will be prefixed with "dbname". +extern std::string ArchivedLogFileName(const std::string& dbname, uint64_t num); + +extern std::string MakeTableFileName(const std::string& name, uint64_t number); + +extern std::string MakeTableFileName(uint64_t number); + +// Return the name of sstable with LevelDB suffix +// created from RocksDB sstable suffixed name +extern std::string Rocks2LevelTableFileName(const std::string& fullname); + +// the reverse function of MakeTableFileName +// TODO(yhchiang): could merge this function with ParseFileName() +extern uint64_t TableFileNameToNumber(const std::string& name); + +// Return the name of the sstable with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string TableFileName(const std::vector& db_paths, + uint64_t number, uint32_t path_id); + +// Sufficient buffer size for FormatFileNumber. +const size_t kFormatFileNumberBufSize = 38; + +extern void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf, + size_t out_buf_size); + +// Return the name of the descriptor file for the db named by +// "dbname" and the specified incarnation number. The result will be +// prefixed with "dbname". +extern std::string DescriptorFileName(const std::string& dbname, + uint64_t number); + +extern std::string DescriptorFileName(uint64_t number); + +extern const std::string kCurrentFileName; // = "CURRENT" + +// Return the name of the current file. This file contains the name +// of the current manifest file. The result will be prefixed with +// "dbname". +extern std::string CurrentFileName(const std::string& dbname); + +// Return the name of the lock file for the db named by +// "dbname". The result will be prefixed with "dbname". +extern std::string LockFileName(const std::string& dbname); + +// Return the name of a temporary file owned by the db named "dbname". +// The result will be prefixed with "dbname". +extern std::string TempFileName(const std::string& dbname, uint64_t number); + +// A helper structure for prefix of info log names. 
+struct InfoLogPrefix {
+  char buf[260];
+  Slice prefix;
+  // Prefix with DB absolute path encoded
+  explicit InfoLogPrefix(bool has_log_dir, const std::string& db_absolute_path);
+  // Default Prefix
+  explicit InfoLogPrefix();
+};
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname,
+                                   const std::string& db_path = "",
+                                   const std::string& log_dir = "");
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+                                      const std::string& db_path = "",
+                                      const std::string& log_dir = "");
+
+extern const std::string kOptionsFileNamePrefix;  // = "OPTIONS-"
+extern const std::string kTempFileNameSuffix;     // = "dbtmp"
+
+// Return an options file name given the "dbname" and file number.
+// Format: OPTIONS-[number]
+extern std::string OptionsFileName(const std::string& dbname,
+                                   uint64_t file_num);
+extern std::string OptionsFileName(uint64_t file_num);
+
+// Return a temp options file name given the "dbname" and file number.
+// Format: OPTIONS-[number].dbtmp
+extern std::string TempOptionsFileName(const std::string& dbname,
+                                       uint64_t file_num);
+
+// Return the name to use for a metadatabase. The result will be prefixed with
+// "dbname".
+extern std::string MetaDatabaseName(const std::string& dbname, uint64_t number);
+
+// Return the name of the Identity file which stores a unique number for the db
+// that will get regenerated if the db loses all its data and is recreated
+// fresh, either from a backup-image or empty
+extern std::string IdentityFileName(const std::string& dbname);
+
+// If filename is a rocksdb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number. If the
+// filename was successfully parsed, returns true. Else return false.
+// info_log_name_prefix is the path of info logs.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+                          const Slice& info_log_name_prefix, FileType* type,
+                          WalFileType* log_type = nullptr);
+// Same as previous function, but skip info log files.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+                          FileType* type, WalFileType* log_type = nullptr);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number. On its success and when dir_contains_current_file is not
+// nullptr, the function will also fsync the directory containing the CURRENT
+// file.
+extern IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname,
+                               uint64_t descriptor_number,
+                               FSDirectory* dir_contains_current_file);
+
+// Make the IDENTITY file for the db
+extern Status SetIdentityFile(Env* env, const std::string& dbname,
+                              const std::string& db_id = {});
+
+// Sync manifest file `file`.
+extern IOStatus SyncManifest(const ImmutableDBOptions* db_options,
+                             WritableFileWriter* file);
+
+// Return the list of file names of info logs in `file_names`.
+// The list only contains file names. The parent directory name is stored
+// in `parent_dir`.
+// `db_log_dir` should be the one as in options.db_log_dir
+extern Status GetInfoLogFiles(const std::shared_ptr<FileSystem>& fs,
+                              const std::string& db_log_dir,
+                              const std::string& dbname,
+                              std::string* parent_dir,
+                              std::vector<std::string>* file_names);
+
+extern std::string NormalizePath(const std::string& path);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/line_file_reader.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/line_file_reader.cc
new file mode 100644
index 0000000000..50c415dc68
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/line_file_reader.cc
@@ -0,0 +1,73 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "file/line_file_reader.h"
+
+#include <cstring>
+
+#include "monitoring/iostats_context_imp.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+IOStatus LineFileReader::Create(const std::shared_ptr<FileSystem>& fs,
+                                const std::string& fname,
+                                const FileOptions& file_opts,
+                                std::unique_ptr<LineFileReader>* reader,
+                                IODebugContext* dbg,
+                                RateLimiter* rate_limiter) {
+  std::unique_ptr<FSSequentialFile> file;
+  IOStatus io_s = fs->NewSequentialFile(fname, file_opts, &file, dbg);
+  if (io_s.ok()) {
+    reader->reset(new LineFileReader(
+        std::move(file), fname, nullptr,
+        std::vector<std::shared_ptr<EventListener>>{}, rate_limiter));
+  }
+  return io_s;
+}
+
+bool LineFileReader::ReadLine(std::string* out,
+                              Env::IOPriority rate_limiter_priority) {
+  assert(out);
+  if (!io_status_.ok()) {
+    // Status should be checked (or permit unchecked) any time we return false.
+    io_status_.MustCheck();
+    return false;
+  }
+  out->clear();
+  for (;;) {
+    // Look for line delimiter
+    const char* found = static_cast<const char*>(
+        std::memchr(buf_begin_, '\n', buf_end_ - buf_begin_));
+    if (found) {
+      size_t len = found - buf_begin_;
+      out->append(buf_begin_, len);
+      buf_begin_ += len + /*delim*/ 1;
+      ++line_number_;
+      return true;
+    }
+    if (at_eof_) {
+      io_status_.MustCheck();
+      return false;
+    }
+    // else flush and reload buffer
+    out->append(buf_begin_, buf_end_ - buf_begin_);
+    Slice result;
+    io_status_ =
+        sfr_.Read(buf_.size(), &result, buf_.data(), rate_limiter_priority);
+    IOSTATS_ADD(bytes_read, result.size());
+    if (!io_status_.ok()) {
+      io_status_.MustCheck();
+      return false;
+    }
+    if (result.size() != buf_.size()) {
+      // The obscure way of indicating EOF
+      at_eof_ = true;
+    }
+    buf_begin_ = result.data();
+    buf_end_ = result.data() + result.size();
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/line_file_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/line_file_reader.h
new file mode 100644
index 0000000000..cc302d3115
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/line_file_reader.h
@@ -0,0 +1,60 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <array>
+
+#include "file/sequence_file_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A wrapper on top of Env::SequentialFile for reading text lines from a file.
+// Lines are delimited by '\n'. The last line may or may not include a
+// trailing newline. Uses SequentialFileReader internally.
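+//
+// Illustrative usage sketch (an editor's addition, not part of the vendored
+// upstream sources; `fs` and `fname` are placeholder names):
+//
+//   std::unique_ptr<LineFileReader> reader;
+//   IOStatus io_s = LineFileReader::Create(fs, fname, FileOptions(), &reader,
+//                                          /*dbg=*/nullptr,
+//                                          /*rate_limiter=*/nullptr);
+//   std::string line;
+//   while (io_s.ok() && reader->ReadLine(&line, Env::IO_TOTAL)) {
+//     // process `line` (the '\n' delimiter is already stripped)
+//   }
+//   // ReadLine() returning false means either EOF or an I/O error;
+//   // distinguish the two via GetStatus(): an OK status means EOF.
+//   if (io_s.ok()) io_s = reader->GetStatus();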
+class LineFileReader {
+ private:
+  std::array<char, 8192> buf_;
+  SequentialFileReader sfr_;
+  IOStatus io_status_;
+  const char* buf_begin_ = buf_.data();
+  const char* buf_end_ = buf_.data();
+  size_t line_number_ = 0;
+  bool at_eof_ = false;
+
+ public:
+  // See SequentialFileReader constructors
+  template <typename... Args>
+  explicit LineFileReader(Args&&... args)
+      : sfr_(std::forward<Args>(args)...) {}
+
+  static IOStatus Create(const std::shared_ptr<FileSystem>& fs,
+                         const std::string& fname, const FileOptions& file_opts,
+                         std::unique_ptr<LineFileReader>* reader,
+                         IODebugContext* dbg, RateLimiter* rate_limiter);
+
+  LineFileReader(const LineFileReader&) = delete;
+  LineFileReader& operator=(const LineFileReader&) = delete;
+
+  // Reads another line from the file, returning true on success and saving
+  // the line to `out`, without the delimiter, or returning false on failure.
+  // You must check GetStatus() to determine whether the failure was just
+  // end-of-file (OK status) or an I/O error (another status).
+  // The internal rate limiter will be charged at the specified priority.
+  bool ReadLine(std::string* out, Env::IOPriority rate_limiter_priority);
+
+  // Returns the number of the line most recently returned from ReadLine.
+  // Return value is unspecified if ReadLine has returned false due to
+  // I/O error. After ReadLine returns false due to end-of-file, the return
+  // value is the last returned line number, or equivalently the total
+  // number of lines returned.
+  size_t GetLineNumber() const { return line_number_; }
+
+  // Returns any error encountered during read. The error is considered
+  // permanent and no retry or recovery is attempted with the same
+  // LineFileReader.
+  const IOStatus& GetStatus() const { return io_status_; }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/random_access_file_reader.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/random_access_file_reader.cc
new file mode 100644
index 0000000000..308cd3e5bb
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/random_access_file_reader.cc
@@ -0,0 +1,599 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "file/random_access_file_reader.h" + +#include +#include + +#include "file/file_util.h" +#include "monitoring/histogram.h" +#include "monitoring/iostats_context_imp.h" +#include "port/port.h" +#include "table/format.h" +#include "test_util/sync_point.h" +#include "util/random.h" +#include "util/rate_limiter_impl.h" + +namespace ROCKSDB_NAMESPACE { +const std::array + kReadHistograms{{ + FILE_READ_FLUSH_MICROS, + FILE_READ_COMPACTION_MICROS, + FILE_READ_DB_OPEN_MICROS, + }}; +inline void RecordIOStats(Statistics* stats, Temperature file_temperature, + bool is_last_level, size_t size) { + IOSTATS_ADD(bytes_read, size); + // record for last/non-last level + if (is_last_level) { + RecordTick(stats, LAST_LEVEL_READ_BYTES, size); + RecordTick(stats, LAST_LEVEL_READ_COUNT, 1); + } else { + RecordTick(stats, NON_LAST_LEVEL_READ_BYTES, size); + RecordTick(stats, NON_LAST_LEVEL_READ_COUNT, 1); + } + + // record for temperature file + if (file_temperature != Temperature::kUnknown) { + switch (file_temperature) { + case Temperature::kHot: + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, size); + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, 1); + RecordTick(stats, HOT_FILE_READ_BYTES, size); + RecordTick(stats, HOT_FILE_READ_COUNT, 1); + break; + case Temperature::kWarm: + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, size); + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, 1); + RecordTick(stats, WARM_FILE_READ_BYTES, size); + RecordTick(stats, WARM_FILE_READ_COUNT, 1); + break; + case Temperature::kCold: + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, size); + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, 1); + RecordTick(stats, COLD_FILE_READ_BYTES, size); + RecordTick(stats, COLD_FILE_READ_COUNT, 1); + break; + default: + break; + } + } +} + +IOStatus RandomAccessFileReader::Create( + const std::shared_ptr& fs, const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* reader, IODebugContext* dbg) { + std::unique_ptr file; + IOStatus io_s = fs->NewRandomAccessFile(fname, file_opts, &file, dbg); + if (io_s.ok()) { + reader->reset(new RandomAccessFileReader(std::move(file), fname)); + } + return io_s; +} + +IOStatus RandomAccessFileReader::Read( + const IOOptions& opts, uint64_t offset, size_t n, Slice* result, + char* scratch, AlignedBuf* aligned_buf, + Env::IOPriority rate_limiter_priority) const { + (void)aligned_buf; + + TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read", nullptr); + + // To be paranoid: modify scratch a little bit, so in case underlying + // FileSystem doesn't fill the buffer but return success and `scratch` returns + // contains a previous block, returned value will not pass checksum. + if (n > 0 && scratch != nullptr) { + // This byte might not change anything for direct I/O case, but it's OK. + scratch[0]++; + } + + IOStatus io_s; + uint64_t elapsed = 0; + { + StopWatch sw(clock_, stats_, hist_type_, + (opts.io_activity != Env::IOActivity::kUnknown) + ? kReadHistograms[(std::size_t)(opts.io_activity)] + : Histograms::HISTOGRAM_ENUM_MAX, + (stats_ != nullptr) ? 
+                     &elapsed : nullptr, true /*overwrite*/,
+                 true /*delay_enabled*/);
+    auto prev_perf_level = GetPerfLevel();
+    IOSTATS_TIMER_GUARD(read_nanos);
+    if (use_direct_io()) {
+      size_t alignment = file_->GetRequiredBufferAlignment();
+      size_t aligned_offset =
+          TruncateToPageBoundary(alignment, static_cast<size_t>(offset));
+      size_t offset_advance = static_cast<size_t>(offset) - aligned_offset;
+      size_t read_size =
+          Roundup(static_cast<size_t>(offset + n), alignment) - aligned_offset;
+      AlignedBuffer buf;
+      buf.Alignment(alignment);
+      buf.AllocateNewBuffer(read_size);
+      while (buf.CurrentSize() < read_size) {
+        size_t allowed;
+        if (rate_limiter_priority != Env::IO_TOTAL &&
+            rate_limiter_ != nullptr) {
+          allowed = rate_limiter_->RequestToken(
+              buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
+              rate_limiter_priority, stats_, RateLimiter::OpType::kRead);
+        } else {
+          assert(buf.CurrentSize() == 0);
+          allowed = read_size;
+        }
+        Slice tmp;
+
+        FileOperationInfo::StartTimePoint start_ts;
+        uint64_t orig_offset = 0;
+        if (ShouldNotifyListeners()) {
+          start_ts = FileOperationInfo::StartNow();
+          orig_offset = aligned_offset + buf.CurrentSize();
+        }
+
+        {
+          IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
+          // Only user reads are expected to specify a timeout. And user reads
+          // are not subjected to rate_limiter and should go through only
+          // one iteration of this loop, so we don't need to check and adjust
+          // the opts.timeout before calling file_->Read
+          assert(!opts.timeout.count() || allowed == read_size);
+          io_s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts,
+                             &tmp, buf.Destination(), nullptr);
+        }
+        if (ShouldNotifyListeners()) {
+          auto finish_ts = FileOperationInfo::FinishNow();
+          NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
+                                 io_s);
+          if (!io_s.ok()) {
+            NotifyOnIOError(io_s, FileOperationType::kRead, file_name(),
+                            tmp.size(), orig_offset);
+          }
+        }
+
+        buf.Size(buf.CurrentSize() + tmp.size());
+        if (!io_s.ok() || tmp.size() < allowed) {
+          break;
+        }
+      }
+      size_t res_len = 0;
+      if (io_s.ok() && offset_advance < buf.CurrentSize()) {
+        res_len = std::min(buf.CurrentSize() - offset_advance, n);
+        if (aligned_buf == nullptr) {
+          buf.Read(scratch, offset_advance, res_len);
+        } else {
+          scratch = buf.BufferStart() + offset_advance;
+          aligned_buf->reset(buf.Release());
+        }
+      }
+      *result = Slice(scratch, res_len);
+    } else {
+      size_t pos = 0;
+      const char* res_scratch = nullptr;
+      while (pos < n) {
+        size_t allowed;
+        if (rate_limiter_priority != Env::IO_TOTAL &&
+            rate_limiter_ != nullptr) {
+          if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
+            sw.DelayStart();
+          }
+          allowed = rate_limiter_->RequestToken(n - pos, 0 /* alignment */,
+                                                rate_limiter_priority, stats_,
+                                                RateLimiter::OpType::kRead);
+          if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
+            sw.DelayStop();
+          }
+        } else {
+          allowed = n;
+        }
+        Slice tmp_result;
+
+        FileOperationInfo::StartTimePoint start_ts;
+        if (ShouldNotifyListeners()) {
+          start_ts = FileOperationInfo::StartNow();
+        }
+
+        {
+          IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
+          // Only user reads are expected to specify a timeout.
+          // And user reads are not subjected to rate_limiter and should go
+          // through only one iteration of this loop, so we don't need to
+          // check and adjust the opts.timeout before calling file_->Read
+          assert(!opts.timeout.count() || allowed == n);
+          io_s = file_->Read(offset + pos, allowed, opts, &tmp_result,
+                             scratch + pos, nullptr);
+        }
+        if (ShouldNotifyListeners()) {
+          auto finish_ts = FileOperationInfo::FinishNow();
+          NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts,
+                                 finish_ts, io_s);
+
+          if (!io_s.ok()) {
+            NotifyOnIOError(io_s, FileOperationType::kRead, file_name(),
+                            tmp_result.size(), offset + pos);
+          }
+        }
+        if (res_scratch == nullptr) {
+          // we can't simply use `scratch` because reads of mmap'd files return
+          // data in a different buffer.
+          res_scratch = tmp_result.data();
+        } else {
+          // make sure chunks are inserted contiguously into `res_scratch`.
+          assert(tmp_result.data() == res_scratch + pos);
+        }
+        pos += tmp_result.size();
+        if (!io_s.ok() || tmp_result.size() < allowed) {
+          break;
+        }
+      }
+      *result = Slice(res_scratch, io_s.ok() ? pos : 0);
+    }
+    RecordIOStats(stats_, file_temperature_, is_last_level_, result->size());
+    SetPerfLevel(prev_perf_level);
+  }
+  if (stats_ != nullptr && file_read_hist_ != nullptr) {
+    file_read_hist_->Add(elapsed);
+  }
+
+  return io_s;
+}
+
+size_t End(const FSReadRequest& r) {
+  return static_cast<size_t>(r.offset) + r.len;
+}
+
+FSReadRequest Align(const FSReadRequest& r, size_t alignment) {
+  FSReadRequest req;
+  req.offset = static_cast<uint64_t>(
+      TruncateToPageBoundary(alignment, static_cast<size_t>(r.offset)));
+  req.len = Roundup(End(r), alignment) - req.offset;
+  req.scratch = nullptr;
+  return req;
+}
+
+bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) {
+  size_t dest_offset = static_cast<size_t>(dest->offset);
+  size_t src_offset = static_cast<size_t>(src.offset);
+  size_t dest_end = End(*dest);
+  size_t src_end = End(src);
+  if (std::max(dest_offset, src_offset) > std::min(dest_end, src_end)) {
+    return false;
+  }
+  dest->offset = static_cast<uint64_t>(std::min(dest_offset, src_offset));
+  dest->len = std::max(dest_end, src_end) - dest->offset;
+  return true;
+}
+
+IOStatus RandomAccessFileReader::MultiRead(
+    const IOOptions& opts, FSReadRequest* read_reqs, size_t num_reqs,
+    AlignedBuf* aligned_buf, Env::IOPriority rate_limiter_priority) const {
+  (void)aligned_buf;  // suppress warning of unused variable in LITE mode
+  assert(num_reqs > 0);
+
+#ifndef NDEBUG
+  for (size_t i = 0; i < num_reqs - 1; ++i) {
+    assert(read_reqs[i].offset <= read_reqs[i + 1].offset);
+  }
+#endif  // !NDEBUG
+
+  // To be paranoid: modify scratch a little bit, so in case underlying
+  // FileSystem doesn't fill the buffer but returns success while `scratch`
+  // still contains a previous block, the returned value will not pass the
+  // checksum.
+  // This byte might not change anything for direct I/O case, but it's OK.
+  for (size_t i = 0; i < num_reqs; i++) {
+    FSReadRequest& r = read_reqs[i];
+    if (r.len > 0 && r.scratch != nullptr) {
+      r.scratch[0]++;
+    }
+  }
+
+  IOStatus io_s;
+  uint64_t elapsed = 0;
+  {
+    StopWatch sw(clock_, stats_, hist_type_,
+                 (opts.io_activity != Env::IOActivity::kUnknown)
+                     ? kReadHistograms[(std::size_t)(opts.io_activity)]
+                     : Histograms::HISTOGRAM_ENUM_MAX,
+                 (stats_ != nullptr) ?
+                     &elapsed : nullptr, true /*overwrite*/,
+                 true /*delay_enabled*/);
+    auto prev_perf_level = GetPerfLevel();
+    IOSTATS_TIMER_GUARD(read_nanos);
+
+    FSReadRequest* fs_reqs = read_reqs;
+    size_t num_fs_reqs = num_reqs;
+    std::vector<FSReadRequest> aligned_reqs;
+    if (use_direct_io()) {
+      // num_reqs is the max possible size,
+      // this can reduce std::vector's internal resize operations.
+      aligned_reqs.reserve(num_reqs);
+      // Align and merge the read requests.
+      size_t alignment = file_->GetRequiredBufferAlignment();
+      for (size_t i = 0; i < num_reqs; i++) {
+        const auto& r = Align(read_reqs[i], alignment);
+        if (i == 0) {
+          // head
+          aligned_reqs.push_back(r);
+
+        } else if (!TryMerge(&aligned_reqs.back(), r)) {
+          // head + n
+          aligned_reqs.push_back(r);
+
+        } else {
+          // unused
+          r.status.PermitUncheckedError();
+        }
+      }
+      TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::MultiRead:AlignedReqs",
+                               &aligned_reqs);
+
+      // Allocate aligned buffer and let scratch buffers point to it.
+      size_t total_len = 0;
+      for (const auto& r : aligned_reqs) {
+        total_len += r.len;
+      }
+      AlignedBuffer buf;
+      buf.Alignment(alignment);
+      buf.AllocateNewBuffer(total_len);
+      char* scratch = buf.BufferStart();
+      for (auto& r : aligned_reqs) {
+        r.scratch = scratch;
+        scratch += r.len;
+      }
+
+      aligned_buf->reset(buf.Release());
+      fs_reqs = aligned_reqs.data();
+      num_fs_reqs = aligned_reqs.size();
+    }
+
+    FileOperationInfo::StartTimePoint start_ts;
+    if (ShouldNotifyListeners()) {
+      start_ts = FileOperationInfo::StartNow();
+    }
+
+    {
+      IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
+      if (rate_limiter_priority != Env::IO_TOTAL && rate_limiter_ != nullptr) {
+        // TODO: ideally we should call `RateLimiter::RequestToken()` for
+        // allowed bytes to multi-read and then consume those bytes by
+        // satisfying as many requests in `MultiRead()` as possible, instead of
+        // what we do here, which can cause burst when the
+        // `total_multi_read_size` is big.
+        size_t total_multi_read_size = 0;
+        assert(fs_reqs != nullptr);
+        for (size_t i = 0; i < num_fs_reqs; ++i) {
+          FSReadRequest& req = fs_reqs[i];
+          total_multi_read_size += req.len;
+        }
+        size_t remaining_bytes = total_multi_read_size;
+        size_t request_bytes = 0;
+        while (remaining_bytes > 0) {
+          request_bytes = std::min(
+              static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()),
+              remaining_bytes);
+          rate_limiter_->Request(request_bytes, rate_limiter_priority,
+                                 nullptr /* stats */,
+                                 RateLimiter::OpType::kRead);
+          remaining_bytes -= request_bytes;
+        }
+      }
+      io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, nullptr);
+      RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_fs_reqs);
+    }
+
+    if (use_direct_io()) {
+      // Populate results in the unaligned read requests.
+      size_t aligned_i = 0;
+      for (size_t i = 0; i < num_reqs; i++) {
+        auto& r = read_reqs[i];
+        if (static_cast<size_t>(r.offset) > End(aligned_reqs[aligned_i])) {
+          aligned_i++;
+        }
+        const auto& fs_r = fs_reqs[aligned_i];
+        r.status = fs_r.status;
+        if (r.status.ok()) {
+          uint64_t offset = r.offset - fs_r.offset;
+          if (fs_r.result.size() <= offset) {
+            // No byte in the read range is returned.
+            r.result = Slice();
+          } else {
+            size_t len = std::min(
+                r.len, static_cast<size_t>(fs_r.result.size() - offset));
+            r.result = Slice(fs_r.scratch + offset, len);
+          }
+        } else {
+          r.result = Slice();
+        }
+      }
+    }
+
+    for (size_t i = 0; i < num_reqs; ++i) {
+      if (ShouldNotifyListeners()) {
+        auto finish_ts = FileOperationInfo::FinishNow();
+        NotifyOnFileReadFinish(read_reqs[i].offset, read_reqs[i].result.size(),
+                               start_ts, finish_ts, read_reqs[i].status);
+      }
+      if (!read_reqs[i].status.ok()) {
+        NotifyOnIOError(read_reqs[i].status, FileOperationType::kRead,
+                        file_name(), read_reqs[i].result.size(),
+                        read_reqs[i].offset);
+      }
+
+      RecordIOStats(stats_, file_temperature_, is_last_level_,
+                    read_reqs[i].result.size());
+    }
+    SetPerfLevel(prev_perf_level);
+  }
+  if (stats_ != nullptr && file_read_hist_ != nullptr) {
+    file_read_hist_->Add(elapsed);
+  }
+
+  return io_s;
+}
+
+IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro,
+                                                  IOOptions& opts) const {
+  if (clock_ != nullptr) {
+    return PrepareIOFromReadOptions(ro, clock_, opts);
+  } else {
+    return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts);
+  }
+}
+
+IOStatus RandomAccessFileReader::ReadAsync(
+    FSReadRequest& req, const IOOptions& opts,
+    std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+    void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf) {
+  IOStatus s;
+  // Create a callback and populate info.
+  auto read_async_callback =
+      std::bind(&RandomAccessFileReader::ReadAsyncCallback, this,
+                std::placeholders::_1, std::placeholders::_2);
+  ReadAsyncInfo* read_async_info =
+      new ReadAsyncInfo(cb, cb_arg, clock_->NowMicros());
+
+  if (ShouldNotifyListeners()) {
+    read_async_info->fs_start_ts_ = FileOperationInfo::StartNow();
+  }
+
+  size_t alignment = file_->GetRequiredBufferAlignment();
+  bool is_aligned = (req.offset & (alignment - 1)) == 0 &&
+                    (req.len & (alignment - 1)) == 0 &&
+                    (uintptr_t(req.scratch) & (alignment - 1)) == 0;
+  read_async_info->is_aligned_ = is_aligned;
+
+  uint64_t elapsed = 0;
+  if (use_direct_io() && is_aligned == false) {
+    FSReadRequest aligned_req = Align(req, alignment);
+    aligned_req.status.PermitUncheckedError();
+
+    // Allocate aligned buffer.
+    read_async_info->buf_.Alignment(alignment);
+    read_async_info->buf_.AllocateNewBuffer(aligned_req.len);
+
+    // Set the remaining fields in the aligned FSReadRequest.
+    aligned_req.scratch = read_async_info->buf_.BufferStart();
+
+    // Set user provided fields to populate back in callback.
+    read_async_info->user_scratch_ = req.scratch;
+    read_async_info->user_aligned_buf_ = aligned_buf;
+    read_async_info->user_len_ = req.len;
+    read_async_info->user_offset_ = req.offset;
+    read_async_info->user_result_ = req.result;
+
+    assert(read_async_info->buf_.CurrentSize() == 0);
+
+    StopWatch sw(clock_, nullptr /*stats*/,
+                 Histograms::HISTOGRAM_ENUM_MAX /*hist_type*/,
+                 Histograms::HISTOGRAM_ENUM_MAX, &elapsed, true /*overwrite*/,
+                 true /*delay_enabled*/);
+    s = file_->ReadAsync(aligned_req, opts, read_async_callback,
+                         read_async_info, io_handle, del_fn, nullptr /*dbg*/);
+  } else {
+    StopWatch sw(clock_, nullptr /*stats*/,
+                 Histograms::HISTOGRAM_ENUM_MAX /*hist_type*/,
+                 Histograms::HISTOGRAM_ENUM_MAX, &elapsed, true /*overwrite*/,
+                 true /*delay_enabled*/);
+    s = file_->ReadAsync(req, opts, read_async_callback, read_async_info,
+                         io_handle, del_fn, nullptr /*dbg*/);
+  }
+  RecordTick(stats_, READ_ASYNC_MICROS, elapsed);
+
+// Suppress false positive clang analyzer warnings.
+// Memory is not released if file_->ReadAsync returns !s.ok(), because
+// ReadAsyncCallback is never called in that case. If ReadAsyncCallback is
+// called then ReadAsync should always return IOStatus::OK().
+#ifndef __clang_analyzer__
+  if (!s.ok()) {
+    delete read_async_info;
+  }
+#endif  // __clang_analyzer__
+
+  return s;
+}
+
+void RandomAccessFileReader::ReadAsyncCallback(const FSReadRequest& req,
+                                               void* cb_arg) {
+  ReadAsyncInfo* read_async_info = static_cast<ReadAsyncInfo*>(cb_arg);
+  assert(read_async_info);
+  assert(read_async_info->cb_);
+
+  if (use_direct_io() && read_async_info->is_aligned_ == false) {
+    // Create FSReadRequest with user provided fields.
+    FSReadRequest user_req;
+    user_req.scratch = read_async_info->user_scratch_;
+    user_req.offset = read_async_info->user_offset_;
+    user_req.len = read_async_info->user_len_;
+
+    // Update results in user_req.
+    user_req.result = req.result;
+    user_req.status = req.status;
+
+    read_async_info->buf_.Size(read_async_info->buf_.CurrentSize() +
+                               req.result.size());
+
+    size_t offset_advance_len = static_cast<size_t>(
+        /*offset_passed_by_user=*/read_async_info->user_offset_ -
+        /*aligned_offset=*/req.offset);
+
+    size_t res_len = 0;
+    if (req.status.ok() &&
+        offset_advance_len < read_async_info->buf_.CurrentSize()) {
+      res_len =
+          std::min(read_async_info->buf_.CurrentSize() - offset_advance_len,
+                   read_async_info->user_len_);
+      if (read_async_info->user_aligned_buf_ == nullptr) {
+        // Copy the data into user's scratch.
+// Clang analyzer assumes that it will take use_direct_io() == false in
+// ReadAsync and use_direct_io() == true in Callback which cannot be true.
+#ifndef __clang_analyzer__
+        read_async_info->buf_.Read(user_req.scratch, offset_advance_len,
+                                   res_len);
+#endif  // __clang_analyzer__
+      } else {
+        // Set aligned_buf provided by user without additional copy.
+        user_req.scratch =
+            read_async_info->buf_.BufferStart() + offset_advance_len;
+        read_async_info->user_aligned_buf_->reset(
+            read_async_info->buf_.Release());
+      }
+      user_req.result = Slice(user_req.scratch, res_len);
+    } else {
+      // Either req.status is not ok or data was not read.
+      user_req.result = Slice();
+    }
+    read_async_info->cb_(user_req, read_async_info->cb_arg_);
+  } else {
+    read_async_info->cb_(req, read_async_info->cb_arg_);
+  }
+
+  // Update stats and notify listeners.
+  if (stats_ != nullptr && file_read_hist_ != nullptr) {
+    // elapsed doesn't take into account delay and overwrite as StopWatch does
+    // in Read.
+    uint64_t elapsed = clock_->NowMicros() - read_async_info->start_time_;
+    file_read_hist_->Add(elapsed);
+  }
+  if (req.status.ok()) {
+    RecordInHistogram(stats_, ASYNC_READ_BYTES, req.result.size());
+  } else if (!req.status.IsAborted()) {
+    RecordTick(stats_, ASYNC_READ_ERROR_COUNT, 1);
+  }
+  if (ShouldNotifyListeners()) {
+    auto finish_ts = FileOperationInfo::FinishNow();
+    NotifyOnFileReadFinish(req.offset, req.result.size(),
+                           read_async_info->fs_start_ts_, finish_ts,
+                           req.status);
+  }
+  if (!req.status.ok()) {
+    NotifyOnIOError(req.status, FileOperationType::kRead, file_name(),
+                    req.result.size(), req.offset);
+  }
+  RecordIOStats(stats_, file_temperature_, is_last_level_, req.result.size());
+  delete read_async_info;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/random_access_file_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/random_access_file_reader.h
new file mode 100644
index 0000000000..ab4d1e7978
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/random_access_file_reader.h
@@ -0,0 +1,210 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <algorithm>
+#include <sstream>
+#include <string>
+
+#include "env/file_system_tracer.h"
+#include "port/port.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/rate_limiter.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Statistics;
+class HistogramImpl;
+class SystemClock;
+
+using AlignedBuf = std::unique_ptr<char[]>;
+
+// Align the request r according to alignment and return the aligned result.
+FSReadRequest Align(const FSReadRequest& r, size_t alignment);
+
+// Try to merge src to dest if they have overlap.
+//
+// Each request represents an inclusive interval [offset, offset + len].
+// If the intervals have overlap, update offset and len to represent the
+// merged interval, and return true.
+// Otherwise, do nothing and return false.
+bool TryMerge(FSReadRequest* dest, const FSReadRequest& src);
+
+// RandomAccessFileReader is a wrapper on top of FSRandomAccessFile. It is
+// responsible for:
+// - Handling Buffered and Direct reads appropriately.
+// - Rate limiting compaction reads.
+// - Notifying any interested listeners on the completion of a read.
+// - Updating IO stats.
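+//
+// Illustrative usage sketch (an editor's addition, not part of the vendored
+// upstream sources; `fs`, `fname`, and `kBlockSize` are placeholder names):
+//
+//   std::unique_ptr<RandomAccessFileReader> reader;
+//   IOStatus io_s = RandomAccessFileReader::Create(fs, fname, FileOptions(),
+//                                                  &reader, /*dbg=*/nullptr);
+//   char scratch[kBlockSize];
+//   Slice result;
+//   if (io_s.ok()) {
+//     io_s = reader->Read(IOOptions(), /*offset=*/0, /*n=*/kBlockSize,
+//                         &result, scratch, /*aligned_buf=*/nullptr,
+//                         Env::IO_TOTAL /*bypass rate limiter*/);
+//   }
+//   // In buffered mode `result` points into `scratch` (or an mmap buffer);
+//   // in direct-I/O mode, pass an AlignedBuf* to avoid an extra copy.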
+class RandomAccessFileReader {
+ private:
+  void NotifyOnFileReadFinish(
+      uint64_t offset, size_t length,
+      const FileOperationInfo::StartTimePoint& start_ts,
+      const FileOperationInfo::FinishTimePoint& finish_ts,
+      const Status& status) const {
+    FileOperationInfo info(FileOperationType::kRead, file_name_, start_ts,
+                           finish_ts, status, file_temperature_);
+    info.offset = offset;
+    info.length = length;
+
+    for (auto& listener : listeners_) {
+      listener->OnFileReadFinish(info);
+    }
+    info.status.PermitUncheckedError();
+  }
+
+  void NotifyOnIOError(const IOStatus& io_status, FileOperationType operation,
+                       const std::string& file_path, size_t length,
+                       uint64_t offset) const {
+    if (listeners_.empty()) {
+      return;
+    }
+    IOErrorInfo io_error_info(io_status, operation, file_path, length, offset);
+
+    for (auto& listener : listeners_) {
+      listener->OnIOError(io_error_info);
+    }
+    io_status.PermitUncheckedError();
+  }
+
+  bool ShouldNotifyListeners() const { return !listeners_.empty(); }
+
+  FSRandomAccessFilePtr file_;
+  std::string file_name_;
+  SystemClock* clock_;
+  Statistics* stats_;
+  uint32_t hist_type_;
+  HistogramImpl* file_read_hist_;
+  RateLimiter* rate_limiter_;
+  std::vector<std::shared_ptr<EventListener>> listeners_;
+  const Temperature file_temperature_;
+  const bool is_last_level_;
+
+  struct ReadAsyncInfo {
+    ReadAsyncInfo(std::function<void(const FSReadRequest&, void*)> cb,
+                  void* cb_arg, uint64_t start_time)
+        : cb_(cb),
+          cb_arg_(cb_arg),
+          start_time_(start_time),
+          user_scratch_(nullptr),
+          user_aligned_buf_(nullptr),
+          user_offset_(0),
+          user_len_(0),
+          is_aligned_(false) {}
+
+    std::function<void(const FSReadRequest&, void*)> cb_;
+    void* cb_arg_;
+    uint64_t start_time_;
+    FileOperationInfo::StartTimePoint fs_start_ts_;
+    // The fields below store the parameters passed by the caller in case of
+    // direct_io.
+    char* user_scratch_;
+    AlignedBuf* user_aligned_buf_;
+    uint64_t user_offset_;
+    size_t user_len_;
+    Slice user_result_;
+    // Used in case of direct_io
+    AlignedBuffer buf_;
+    bool is_aligned_;
+  };
+
+ public:
+  explicit RandomAccessFileReader(
+      std::unique_ptr<FSRandomAccessFile>&& raf, const std::string& _file_name,
+      SystemClock* clock = nullptr,
+      const std::shared_ptr<IOTracer>& io_tracer = nullptr,
+      Statistics* stats = nullptr,
+      uint32_t hist_type = Histograms::HISTOGRAM_ENUM_MAX,
+      HistogramImpl* file_read_hist = nullptr,
+      RateLimiter* rate_limiter = nullptr,
+      const std::vector<std::shared_ptr<EventListener>>& listeners = {},
+      Temperature file_temperature = Temperature::kUnknown,
+      bool is_last_level = false)
+      : file_(std::move(raf), io_tracer, _file_name),
+        file_name_(std::move(_file_name)),
+        clock_(clock),
+        stats_(stats),
+        hist_type_(hist_type),
+        file_read_hist_(file_read_hist),
+        rate_limiter_(rate_limiter),
+        listeners_(),
+        file_temperature_(file_temperature),
+        is_last_level_(is_last_level) {
+    std::for_each(listeners.begin(), listeners.end(),
+                  [this](const std::shared_ptr<EventListener>& e) {
+                    if (e->ShouldBeNotifiedOnFileIO()) {
+                      listeners_.emplace_back(e);
+                    }
+                  });
+  }
+
+  static IOStatus Create(const std::shared_ptr<FileSystem>& fs,
+                         const std::string& fname, const FileOptions& file_opts,
+                         std::unique_ptr<RandomAccessFileReader>* reader,
+                         IODebugContext* dbg);
+  RandomAccessFileReader(const RandomAccessFileReader&) = delete;
+  RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete;
+
+  // In non-direct IO mode,
+  // 1. if using mmap, result is stored in a buffer other than scratch;
+  // 2. if not using mmap, result is stored in the buffer starting from scratch.
+  //
+  // In direct IO mode, an aligned buffer is allocated internally.
+  // 1. If aligned_buf is null, then results are copied to the buffer
+  //    starting from scratch;
+  // 2. Otherwise, scratch is not used and can be null, the aligned_buf owns
+  //    the internally allocated buffer on return, and the result refers to a
+  //    region in aligned_buf.
+  //
+  // `rate_limiter_priority` is used to charge the internal rate limiter when
+  // enabled. The special value `Env::IO_TOTAL` makes this operation bypass the
+  // rate limiter.
+  IOStatus Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result,
+                char* scratch, AlignedBuf* aligned_buf,
+                Env::IOPriority rate_limiter_priority) const;
+
+  // REQUIRES:
+  // num_reqs > 0, reqs do not overlap, and offsets in reqs are increasing.
+  // In non-direct IO mode, aligned_buf should be null;
+  // In direct IO mode, aligned_buf stores the aligned buffer allocated inside
+  // MultiRead, the result Slices in reqs refer to aligned_buf.
+  //
+  // `rate_limiter_priority` will be used to charge the internal rate limiter.
+  // It is not yet supported so the client must provide the special value
+  // `Env::IO_TOTAL` to bypass the rate limiter.
+  IOStatus MultiRead(const IOOptions& opts, FSReadRequest* reqs,
+                     size_t num_reqs, AlignedBuf* aligned_buf,
+                     Env::IOPriority rate_limiter_priority) const;
+
+  IOStatus Prefetch(uint64_t offset, size_t n,
+                    const Env::IOPriority rate_limiter_priority) const {
+    IOOptions opts;
+    opts.rate_limiter_priority = rate_limiter_priority;
+    return file_->Prefetch(offset, n, opts, nullptr);
+  }
+
+  FSRandomAccessFile* file() { return file_.get(); }
+
+  const std::string& file_name() const { return file_name_; }
+
+  bool use_direct_io() const { return file_->use_direct_io(); }
+
+  IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts) const;
+
+  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+                     std::function<void(const FSReadRequest&, void*)> cb,
+                     void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+                     AlignedBuf* aligned_buf);
+
+  void ReadAsyncCallback(const FSReadRequest& req, void* cb_arg);
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/read_write_util.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/read_write_util.cc
new file mode 100644
index 0000000000..3617a35e33
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/read_write_util.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "file/read_write_util.h" + +#include + +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +IOStatus NewWritableFile(FileSystem* fs, const std::string& fname, + std::unique_ptr* result, + const FileOptions& options) { + TEST_SYNC_POINT_CALLBACK("NewWritableFile::FileOptions.temperature", + const_cast(&options.temperature)); + IOStatus s = fs->NewWritableFile(fname, options, result, nullptr); + TEST_KILL_RANDOM_WITH_WEIGHT("NewWritableFile:0", REDUCE_ODDS2); + return s; +} + +#ifndef NDEBUG +bool IsFileSectorAligned(const size_t off, size_t sector_size) { + return off % sector_size == 0; +} +#endif // NDEBUG +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/read_write_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/read_write_util.h new file mode 100644 index 0000000000..9f034b705f --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/read_write_util.h @@ -0,0 +1,31 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include "file/sequence_file_reader.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { +// Returns a WritableFile. +// +// env : the Env. +// fname : the file name. +// result : output arg. A WritableFile based on `fname` returned. +// options : the Env Options. +extern IOStatus NewWritableFile(FileSystem* fs, const std::string& fname, + std::unique_ptr* result, + const FileOptions& options); + +#ifndef NDEBUG +bool IsFileSectorAligned(const size_t off, size_t sector_size); +#endif // NDEBUG +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/readahead_file_info.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/readahead_file_info.h new file mode 100644 index 0000000000..f0208bf2dd --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/readahead_file_info.h @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// struct ReadaheadFileInfo contains readahead information that is passed from +// one file to another file per level during iterations. This information helps +// iterators to carry forward the internal automatic prefetching readahead value +// to next file during sequential reads instead of starting from the scratch. + +struct ReadaheadFileInfo { + struct ReadaheadInfo { + size_t readahead_size = 0; + int64_t num_file_reads = 0; + }; + + // Used by Data block iterators to update readahead info. + ReadaheadInfo data_block_readahead_info; + + // Used by Index block iterators to update readahead info. 
+  ReadaheadInfo index_block_readahead_info;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/readahead_raf.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/readahead_raf.cc
new file mode 100644
index 0000000000..dd09822e3e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/readahead_raf.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/readahead_raf.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "file/read_write_util.h"
+#include "rocksdb/file_system.h"
+#include "util/aligned_buffer.h"
+#include "util/rate_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class ReadaheadRandomAccessFile : public FSRandomAccessFile {
+ public:
+  ReadaheadRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& file,
+                            size_t readahead_size)
+      : file_(std::move(file)),
+        alignment_(file_->GetRequiredBufferAlignment()),
+        readahead_size_(Roundup(readahead_size, alignment_)),
+        buffer_(),
+        buffer_offset_(0) {
+    buffer_.Alignment(alignment_);
+    buffer_.AllocateNewBuffer(readahead_size_);
+  }
+
+  ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete;
+
+  ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) =
+      delete;
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    // Read-ahead only makes sense if we have some slack left after reading
+    if (n + alignment_ >= readahead_size_) {
+      return file_->Read(offset, n, options, result, scratch, dbg);
+    }
+
+    std::unique_lock<std::mutex> lk(lock_);
+
+    size_t cached_len = 0;
+    // Check if there is a cache hit, meaning that [offset, offset + n) is
+    // either completely or partially in the buffer. If it's completely cached,
+    // including the end-of-file case when offset + n is greater than EOF, then
+    // return.
+    if (TryReadFromCache(offset, n, &cached_len, scratch) &&
+        (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+      // We read exactly what we needed, or we hit end of file - return.
+      *result = Slice(scratch, cached_len);
+      return IOStatus::OK();
+    }
+    size_t advanced_offset = static_cast<size_t>(offset + cached_len);
+    // In the case of a cache hit, advanced_offset is already aligned, meaning
+    // that chunk_offset equals advanced_offset
+    size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset);
+
+    IOStatus s = ReadIntoBuffer(chunk_offset, readahead_size_, options, dbg);
+    if (s.ok()) {
+      // The data we need is now in cache, so we can safely read it
+      size_t remaining_len;
+      TryReadFromCache(advanced_offset, n - cached_len, &remaining_len,
+                       scratch + cached_len);
+      *result = Slice(scratch, cached_len + remaining_len);
+    }
+    return s;
+  }
+
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    if (n < readahead_size_) {
+      // Don't allow smaller prefetches than the configured `readahead_size_`.
+      // `Read()` assumes a smaller prefetch buffer indicates EOF was reached.
+      return IOStatus::OK();
+    }
+
+    std::unique_lock<std::mutex> lk(lock_);
+
+    size_t offset_ = static_cast<size_t>(offset);
+    size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_);
+    if (prefetch_offset == buffer_offset_) {
+      return IOStatus::OK();
+    }
+    return ReadIntoBuffer(prefetch_offset,
+                          Roundup(offset_ + n, alignment_) - prefetch_offset,
+                          options, dbg);
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return file_->GetUniqueId(id, max_size);
+  }
+
+  void Hint(AccessPattern pattern) override { file_->Hint(pattern); }
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    std::unique_lock<std::mutex> lk(lock_);
+    buffer_.Clear();
+    return file_->InvalidateCache(offset, length);
+  }
+
+  bool use_direct_io() const override { return file_->use_direct_io(); }
+
+ private:
+  // Tries to read n bytes from buffer_ starting at offset. If anything was
+  // read from the cache, sets cached_len to the number of bytes actually
+  // read, copies that many bytes to scratch, and returns true.
+  // If nothing was read, sets cached_len to 0 and returns false.
+  bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len,
+                        char* scratch) const {
+    if (offset < buffer_offset_ ||
+        offset >= buffer_offset_ + buffer_.CurrentSize()) {
+      *cached_len = 0;
+      return false;
+    }
+    uint64_t offset_in_buffer = offset - buffer_offset_;
+    *cached_len = std::min(
+        buffer_.CurrentSize() - static_cast<size_t>(offset_in_buffer), n);
+    memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len);
+    return true;
+  }
+
+  // Reads into buffer_ the next n bytes from file_ starting at offset.
+  // Can actually read less if EOF was reached.
+  // Returns the status of the read operation on the file.
+  IOStatus ReadIntoBuffer(uint64_t offset, size_t n, const IOOptions& options,
+                          IODebugContext* dbg) const {
+    if (n > buffer_.Capacity()) {
+      n = buffer_.Capacity();
+    }
+    assert(IsFileSectorAligned(offset, alignment_));
+    assert(IsFileSectorAligned(n, alignment_));
+    Slice result;
+    IOStatus s =
+        file_->Read(offset, n, options, &result, buffer_.BufferStart(), dbg);
+    if (s.ok()) {
+      buffer_offset_ = offset;
+      buffer_.Size(result.size());
+      assert(result.size() == 0 || buffer_.BufferStart() == result.data());
+    }
+    return s;
+  }
+
+  const std::unique_ptr<FSRandomAccessFile> file_;
+  const size_t alignment_;
+  const size_t readahead_size_;
+
+  mutable std::mutex lock_;
+  // The buffer storing the prefetched data
+  mutable AlignedBuffer buffer_;
+  // The offset in file_, corresponding to data stored in buffer_
+  mutable uint64_t buffer_offset_;
+};
+}  // namespace
+
+std::unique_ptr<FSRandomAccessFile> NewReadaheadRandomAccessFile(
+    std::unique_ptr<FSRandomAccessFile>&& file, size_t readahead_size) {
+  std::unique_ptr<FSRandomAccessFile> result(
+      new ReadaheadRandomAccessFile(std::move(file), readahead_size));
+  return result;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/readahead_raf.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/readahead_raf.h
new file mode 100644
index 0000000000..dfaf2b4fa9
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/readahead_raf.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class FSRandomAccessFile;
+// This file provides the following main abstractions:
+// SequentialFileReader : wrapper over Env::SequentialFile
+// RandomAccessFileReader : wrapper over Env::RandomAccessFile
+// WritableFileWriter : wrapper over Env::WritableFile
+// In addition, it also exposes the NewReadaheadRandomAccessFile,
+// NewWritableFile, and ReadOneLine primitives.
+
+// NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to
+// always prefetch additional data with every read. This is mainly used in
+// Compaction Table Readers.
+std::unique_ptr<FSRandomAccessFile> NewReadaheadRandomAccessFile(
+    std::unique_ptr<FSRandomAccessFile>&& file, size_t readahead_size);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sequence_file_reader.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sequence_file_reader.cc
new file mode 100644
index 0000000000..a753c1d098
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sequence_file_reader.cc
@@ -0,0 +1,320 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/sequence_file_reader.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "file/read_write_util.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+#include "util/random.h"
+#include "util/rate_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+IOStatus SequentialFileReader::Create(
+    const std::shared_ptr<FileSystem>& fs, const std::string& fname,
+    const FileOptions& file_opts, std::unique_ptr<SequentialFileReader>* reader,
+    IODebugContext* dbg, RateLimiter* rate_limiter) {
+  std::unique_ptr<FSSequentialFile> file;
+  IOStatus io_s = fs->NewSequentialFile(fname, file_opts, &file, dbg);
+  if (io_s.ok()) {
+    reader->reset(new SequentialFileReader(std::move(file), fname, nullptr, {},
+                                           rate_limiter));
+  }
+  return io_s;
+}
+
+IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch,
+                                    Env::IOPriority rate_limiter_priority) {
+  IOStatus io_s;
+  if (use_direct_io()) {
+    //
+    //    |-offset_advance-|---bytes returned--|
+    //    |----------------------buf size-------------------------|
+    //    |                |                   |                   |
+    //  aligned          offset            offset + n    Roundup(offset + n,
+    //  offset                                                 alignment)
+    //
+    size_t offset = offset_.fetch_add(n);
+    size_t alignment = file_->GetRequiredBufferAlignment();
+    size_t aligned_offset = TruncateToPageBoundary(alignment, offset);
+    size_t offset_advance = offset - aligned_offset;
+    size_t size = Roundup(offset + n, alignment) - aligned_offset;
+    size_t r = 0;
+    AlignedBuffer buf;
+    buf.Alignment(alignment);
+    buf.AllocateNewBuffer(size);
+
+    while (buf.CurrentSize() < size) {
+      size_t allowed;
+      if (rate_limiter_priority != Env::IO_TOTAL && rate_limiter_ != nullptr) {
+        allowed = rate_limiter_->RequestToken(
+            buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
+            rate_limiter_priority, nullptr /* stats */,
+            RateLimiter::OpType::kRead);
+      } else {
+        assert(buf.CurrentSize() == 0);
+        allowed = size;
+      }
+
+      Slice tmp;
+      uint64_t orig_offset = 0;
+      FileOperationInfo::StartTimePoint start_ts;
+      if (ShouldNotifyListeners()) {
+        orig_offset = aligned_offset + buf.CurrentSize();
+        start_ts = FileOperationInfo::StartNow();
+      }
+      io_s = file_->PositionedRead(aligned_offset + buf.CurrentSize(), allowed,
+                                   IOOptions(), &tmp, buf.Destination(),
+                                   nullptr /* dbg */);
+      if (ShouldNotifyListeners()) {
+        auto finish_ts = FileOperationInfo::FinishNow();
+        NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
+                               io_s);
+      }
+      buf.Size(buf.CurrentSize() + tmp.size());
+      if (!io_s.ok() || tmp.size() < allowed) {
+        break;
+      }
+    }
+
+    if (io_s.ok() && offset_advance < buf.CurrentSize()) {
+      r = buf.Read(scratch, offset_advance,
+                   std::min(buf.CurrentSize() - offset_advance, n));
+    }
+    *result = Slice(scratch, r);
+  } else {
+    // To be paranoid, modify scratch a little bit, so in case underlying
+    // FileSystem doesn't fill the buffer but returns success while `scratch`
+    // still contains a previous block, the returned value will not pass the
+    // checksum.
+    // It's hard to find a useful byte for the direct I/O case, so we skip it.
+    if (n > 0 && scratch != nullptr) {
+      scratch[0]++;
+    }
+
+    size_t read = 0;
+    while (read < n) {
+      size_t allowed;
+      if (rate_limiter_priority != Env::IO_TOTAL && rate_limiter_ != nullptr) {
+        allowed = rate_limiter_->RequestToken(
+            n - read, 0 /* alignment */, rate_limiter_priority,
+            nullptr /* stats */, RateLimiter::OpType::kRead);
+      } else {
+        allowed = n;
+      }
+      FileOperationInfo::StartTimePoint start_ts;
+      if (ShouldNotifyListeners()) {
+        start_ts = FileOperationInfo::StartNow();
+      }
+      Slice tmp;
+      io_s = file_->Read(allowed, IOOptions(), &tmp, scratch + read,
+                         nullptr /* dbg */);
+      if (ShouldNotifyListeners()) {
+        auto finish_ts = FileOperationInfo::FinishNow();
+        size_t offset = offset_.fetch_add(tmp.size());
+        NotifyOnFileReadFinish(offset, tmp.size(), start_ts, finish_ts, io_s);
+      }
+      read += tmp.size();
+      if (!io_s.ok() || tmp.size() < allowed) {
+        break;
+      }
+    }
+    *result = Slice(scratch, read);
+  }
+  IOSTATS_ADD(bytes_read, result->size());
+  return io_s;
+}
+
+IOStatus SequentialFileReader::Skip(uint64_t n) {
+  if (use_direct_io()) {
+    offset_ += static_cast<size_t>(n);
+    return IOStatus::OK();
+  }
+  return file_->Skip(n);
+}
+
+namespace {
+// This class wraps a SequentialFile, exposing the same API, with the
+// difference of being able to prefetch up to readahead_size bytes and then
+// serve them from memory, avoiding the entire round-trip if, for example,
+// the data for the file is actually remote.
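+//
+// Illustrative walk-through (an editor's addition, not part of the vendored
+// upstream sources), assuming readahead_size = 8192 and alignment = 4096:
+//
+//   Read(100, ...)  -> buffer miss: one file_->Read of 8192 bytes fills
+//                      buffer_, and the first 100 bytes are returned.
+//   Read(100, ...)  -> served entirely from buffer_, no file I/O.
+//   Read(8192, ...) -> n + alignment >= readahead_size, so the request
+//                      bypasses the buffer and goes straight to file_.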
+class ReadaheadSequentialFile : public FSSequentialFile {
+ public:
+  ReadaheadSequentialFile(std::unique_ptr<FSSequentialFile>&& file,
+                          size_t readahead_size)
+      : file_(std::move(file)),
+        alignment_(file_->GetRequiredBufferAlignment()),
+        readahead_size_(Roundup(readahead_size, alignment_)),
+        buffer_(),
+        buffer_offset_(0),
+        read_offset_(0) {
+    buffer_.Alignment(alignment_);
+    buffer_.AllocateNewBuffer(readahead_size_);
+  }
+
+  ReadaheadSequentialFile(const ReadaheadSequentialFile&) = delete;
+
+  ReadaheadSequentialFile& operator=(const ReadaheadSequentialFile&) = delete;
+
+  IOStatus Read(size_t n, const IOOptions& opts, Slice* result, char* scratch,
+                IODebugContext* dbg) override {
+    std::unique_lock<std::mutex> lk(lock_);
+
+    size_t cached_len = 0;
+    // Check if there is a cache hit, meaning that [offset, offset + n) is
+    // either completely or partially in the buffer. If it's completely cached,
+    // including the end-of-file case when offset + n is greater than EOF, then
+    // return.
+    if (TryReadFromCache(n, &cached_len, scratch) &&
+        (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+      // We read exactly what we needed, or we hit end of file - return.
+      *result = Slice(scratch, cached_len);
+      return IOStatus::OK();
+    }
+    n -= cached_len;
+
+    IOStatus s;
+    // Read-ahead only makes sense if we have some slack left after reading
+    if (n + alignment_ >= readahead_size_) {
+      s = file_->Read(n, opts, result, scratch + cached_len, dbg);
+      if (s.ok()) {
+        read_offset_ += result->size();
+        *result = Slice(scratch, cached_len + result->size());
+      }
+      buffer_.Clear();
+      return s;
+    }
+
+    s = ReadIntoBuffer(readahead_size_, opts, dbg);
+    if (s.ok()) {
+      // The data we need is now in cache, so we can safely read it
+      size_t remaining_len;
+      TryReadFromCache(n, &remaining_len, scratch + cached_len);
+      *result = Slice(scratch, cached_len + remaining_len);
+    }
+    return s;
+  }
+
+  IOStatus Skip(uint64_t n) override {
+    std::unique_lock<std::mutex> lk(lock_);
+    IOStatus s = IOStatus::OK();
+    // First check if we need to skip already cached data
+    if (buffer_.CurrentSize() > 0) {
+      // Do we need to skip beyond cached data?
+      if (read_offset_ + n >= buffer_offset_ + buffer_.CurrentSize()) {
+        // Yes. Skip whatever is in memory and adjust offset accordingly
+        n -= buffer_offset_ + buffer_.CurrentSize() - read_offset_;
+        read_offset_ = buffer_offset_ + buffer_.CurrentSize();
+      } else {
+        // No. The entire section to be skipped is in cache.
+        read_offset_ += n;
+        n = 0;
+      }
+    }
+    if (n > 0) {
+      // We still need to skip more, so call the file API for skipping
+      s = file_->Skip(n);
+      if (s.ok()) {
+        read_offset_ += n;
+      }
+      buffer_.Clear();
+    }
+    return s;
+  }
+
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& opts,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override {
+    return file_->PositionedRead(offset, n, opts, result, scratch, dbg);
+  }
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    std::unique_lock<std::mutex> lk(lock_);
+    buffer_.Clear();
+    return file_->InvalidateCache(offset, length);
+  }
+
+  bool use_direct_io() const override { return file_->use_direct_io(); }
+
+ private:
+  // Tries to read n bytes from buffer_. If anything was read from the cache,
+  // sets cached_len to the number of bytes actually read, copies that many
+  // bytes to scratch, and returns true.
+  // If nothing was read, sets cached_len to 0 and returns false.
+ bool TryReadFromCache(size_t n, size_t* cached_len, char* scratch) { + if (read_offset_ < buffer_offset_ || + read_offset_ >= buffer_offset_ + buffer_.CurrentSize()) { + *cached_len = 0; + return false; + } + uint64_t offset_in_buffer = read_offset_ - buffer_offset_; + *cached_len = std::min( + buffer_.CurrentSize() - static_cast<size_t>(offset_in_buffer), n); + memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len); + read_offset_ += *cached_len; + return true; + } + + // Reads into buffer_ the next n bytes from file_. + // Can actually read less if EOF was reached. + // Returns the status of the read operation on the file. + IOStatus ReadIntoBuffer(size_t n, const IOOptions& opts, + IODebugContext* dbg) { + if (n > buffer_.Capacity()) { + n = buffer_.Capacity(); + } + assert(IsFileSectorAligned(n, alignment_)); + Slice result; + IOStatus s = file_->Read(n, opts, &result, buffer_.BufferStart(), dbg); + if (s.ok()) { + buffer_offset_ = read_offset_; + buffer_.Size(result.size()); + assert(result.size() == 0 || buffer_.BufferStart() == result.data()); + } + return s; + } + + const std::unique_ptr<FSSequentialFile> file_; + const size_t alignment_; + const size_t readahead_size_; + + std::mutex lock_; + // The buffer storing the prefetched data + AlignedBuffer buffer_; + // The offset in file_, corresponding to data stored in buffer_ + uint64_t buffer_offset_; + // The offset up to which data was read from file_. In fact, it can be larger + // than the actual file size, since the file_->Skip(n) call doesn't return the + // actual number of bytes that were skipped, which can be less than n. + // This is not a problem since read_offset_ is monotonically increasing and + // its only use is to figure out if next piece of data should be read from + // buffer_ or file_ directly. + uint64_t read_offset_; +}; +} // namespace + +std::unique_ptr<FSSequentialFile> +SequentialFileReader::NewReadaheadSequentialFile( + std::unique_ptr<FSSequentialFile>&& file, size_t readahead_size) { + if (file->GetRequiredBufferAlignment() >= readahead_size) { + // Short-circuit and return the original file if readahead_size is + // too small and hence doesn't make sense to be used for prefetching. + return std::move(file); + } + std::unique_ptr<FSSequentialFile> result( + new ReadaheadSequentialFile(std::move(file), readahead_size)); + return result; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sequence_file_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sequence_file_reader.h new file mode 100644 index 0000000000..14350e8de4 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sequence_file_reader.h @@ -0,0 +1,119 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <atomic> +#include <string> + +#include "env/file_system_tracer.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// SequentialFileReader is a wrapper on top of Env::SequentialFile.
It handles +// Buffered (i.e., when page cache is enabled) and Direct (with O_DIRECT / page +// cache disabled) reads appropriately, and also updates the IO stats. +class SequentialFileReader { + private: + void NotifyOnFileReadFinish( + uint64_t offset, size_t length, + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const Status& status) const { + FileOperationInfo info(FileOperationType::kRead, file_name_, start_ts, + finish_ts, status); + info.offset = offset; + info.length = length; + + for (auto& listener : listeners_) { + listener->OnFileReadFinish(info); + } + info.status.PermitUncheckedError(); + } + + void AddFileIOListeners( + const std::vector<std::shared_ptr<EventListener>>& listeners) { + std::for_each(listeners.begin(), listeners.end(), + [this](const std::shared_ptr<EventListener>& e) { + if (e->ShouldBeNotifiedOnFileIO()) { + listeners_.emplace_back(e); + } + }); + } + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + + std::string file_name_; + FSSequentialFilePtr file_; + std::atomic<size_t> offset_{0}; // read offset + std::vector<std::shared_ptr<EventListener>> listeners_{}; + RateLimiter* rate_limiter_; + + public: + explicit SequentialFileReader( + std::unique_ptr<FSSequentialFile>&& _file, const std::string& _file_name, + const std::shared_ptr<IOTracer>& io_tracer = nullptr, + const std::vector<std::shared_ptr<EventListener>>& listeners = {}, + RateLimiter* rate_limiter = + nullptr) // TODO: migrate call sites to provide rate limiter + : file_name_(_file_name), + file_(std::move(_file), io_tracer, _file_name), + listeners_(), + rate_limiter_(rate_limiter) { + AddFileIOListeners(listeners); + } + + explicit SequentialFileReader( + std::unique_ptr<FSSequentialFile>&& _file, const std::string& _file_name, + size_t _readahead_size, + const std::shared_ptr<IOTracer>& io_tracer = nullptr, + const std::vector<std::shared_ptr<EventListener>>& listeners = {}, + RateLimiter* rate_limiter = + nullptr) // TODO: migrate call sites to provide rate limiter + : file_name_(_file_name), + file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size), + io_tracer, _file_name), + listeners_(), + rate_limiter_(rate_limiter) { + AddFileIOListeners(listeners); + } + static IOStatus Create(const std::shared_ptr<FileSystem>& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr<SequentialFileReader>* reader, + IODebugContext* dbg, RateLimiter* rate_limiter); + + SequentialFileReader(const SequentialFileReader&) = delete; + SequentialFileReader& operator=(const SequentialFileReader&) = delete; + + // `rate_limiter_priority` is used to charge the internal rate limiter when + // enabled. The special value `Env::IO_TOTAL` makes this operation bypass the + // rate limiter. The amount charged to the internal rate limiter is n, even + // when less than n bytes are actually read (e.g. at end of file). To avoid + // overcharging the rate limiter, the caller can use file size to cap n to + // read until end of file. + IOStatus Read(size_t n, Slice* result, char* scratch, + Env::IOPriority rate_limiter_priority); + + IOStatus Skip(uint64_t n); + + FSSequentialFile* file() { return file_.get(); } + + std::string file_name() { return file_name_; } + + bool use_direct_io() const { return file_->use_direct_io(); } + + private: + // NewReadaheadSequentialFile provides a wrapper over SequentialFile to + // always prefetch additional data with every read.
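+ // If readahead_size is no larger than the file's required buffer + // alignment, NewReadaheadSequentialFile returns the file unwrapped (see + // the definition in sequence_file_reader.cc above). Illustrative call, + // with a hypothetical 64 KiB readahead: + // auto wrapped = NewReadaheadSequentialFile(std::move(raw_file), 64 << 10);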
+ static std::unique_ptr<FSSequentialFile> NewReadaheadSequentialFile( + std::unique_ptr<FSSequentialFile>&& file, size_t readahead_size); +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sst_file_manager_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sst_file_manager_impl.cc new file mode 100644 index 0000000000..459ea36cdb --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sst_file_manager_impl.cc @@ -0,0 +1,507 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "file/sst_file_manager_impl.h" + +#include <cinttypes> +#include <vector> + +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/sst_file_manager.h" +#include "test_util/sync_point.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +SstFileManagerImpl::SstFileManagerImpl( + const std::shared_ptr<SystemClock>& clock, + const std::shared_ptr<FileSystem>& fs, + const std::shared_ptr<Logger>& logger, int64_t rate_bytes_per_sec, + double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) + : clock_(clock), + fs_(fs), + logger_(logger), + total_files_size_(0), + compaction_buffer_size_(0), + cur_compactions_reserved_size_(0), + max_allowed_space_(0), + delete_scheduler_(clock_.get(), fs_.get(), rate_bytes_per_sec, + logger.get(), this, max_trash_db_ratio, + bytes_max_delete_chunk), + cv_(&mu_), + closing_(false), + bg_thread_(nullptr), + reserved_disk_buffer_(0), + free_space_trigger_(0), + cur_instance_(nullptr) {} + +SstFileManagerImpl::~SstFileManagerImpl() { + Close(); + bg_err_.PermitUncheckedError(); +} + +void SstFileManagerImpl::Close() { + { + MutexLock l(&mu_); + if (closing_) { + return; + } + closing_ = true; + cv_.SignalAll(); + } + if (bg_thread_) { + bg_thread_->join(); + } +} + +Status SstFileManagerImpl::OnAddFile(const std::string& file_path) { + uint64_t file_size; + Status s = fs_->GetFileSize(file_path, IOOptions(), &file_size, nullptr); + if (s.ok()) { + MutexLock l(&mu_); + OnAddFileImpl(file_path, file_size); + } + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile", + const_cast<std::string*>(&file_path)); + return s; +} + +Status SstFileManagerImpl::OnAddFile(const std::string& file_path, + uint64_t file_size) { + MutexLock l(&mu_); + OnAddFileImpl(file_path, file_size); + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile", + const_cast<std::string*>(&file_path)); + return Status::OK(); +} + +Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) { + { + MutexLock l(&mu_); + OnDeleteFileImpl(file_path); + } + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnDeleteFile", + const_cast<std::string*>(&file_path)); + return Status::OK(); +} + +void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) { + MutexLock l(&mu_); + uint64_t size_added_by_compaction = 0; + for (size_t i = 0; i < c->num_input_levels(); i++) { + for (size_t j = 0; j < c->num_input_files(i); j++) { + FileMetaData* filemeta = c->input(i, j); + size_added_by_compaction += filemeta->fd.GetFileSize(); + } + } + cur_compactions_reserved_size_ -= size_added_by_compaction; +} + +Status SstFileManagerImpl::OnMoveFile(const std::string& old_path, + const std::string& new_path, + uint64_t* file_size) { + { + MutexLock l(&mu_); + if (file_size != nullptr) { + *file_size = tracked_files_[old_path]; + } + OnAddFileImpl(new_path,
tracked_files_[old_path]); + OnDeleteFileImpl(old_path); + } + TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile"); + return Status::OK(); +} + +void SstFileManagerImpl::SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) { + MutexLock l(&mu_); + max_allowed_space_ = max_allowed_space; +} + +void SstFileManagerImpl::SetCompactionBufferSize( + uint64_t compaction_buffer_size) { + MutexLock l(&mu_); + compaction_buffer_size_ = compaction_buffer_size; +} + +bool SstFileManagerImpl::IsMaxAllowedSpaceReached() { + MutexLock l(&mu_); + if (max_allowed_space_ <= 0) { + return false; + } + return total_files_size_ >= max_allowed_space_; +} + +bool SstFileManagerImpl::IsMaxAllowedSpaceReachedIncludingCompactions() { + MutexLock l(&mu_); + if (max_allowed_space_ <= 0) { + return false; + } + return total_files_size_ + cur_compactions_reserved_size_ >= + max_allowed_space_; +} + +bool SstFileManagerImpl::EnoughRoomForCompaction( + ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs, + const Status& bg_error) { + MutexLock l(&mu_); + uint64_t size_added_by_compaction = 0; + // First check if we even have the space to do the compaction + for (size_t i = 0; i < inputs.size(); i++) { + for (size_t j = 0; j < inputs[i].size(); j++) { + FileMetaData* filemeta = inputs[i][j]; + size_added_by_compaction += filemeta->fd.GetFileSize(); + } + } + + // Update cur_compactions_reserved_size_ so concurrent compactions + // don't max out space + size_t needed_headroom = cur_compactions_reserved_size_ + + size_added_by_compaction + compaction_buffer_size_; + if (max_allowed_space_ != 0 && + (needed_headroom + total_files_size_ > max_allowed_space_)) { + return false; + } + + // Implement more aggressive checks only if this DB instance has already + // seen a NoSpace() error. This is in order to contain a single potentially + // misbehaving DB instance and prevent it from slowing down compactions of + // other DB instances + if (bg_error.IsNoSpace() && CheckFreeSpace()) { + auto fn = + TableFileName(cfd->ioptions()->cf_paths, inputs[0][0]->fd.GetNumber(), + inputs[0][0]->fd.GetPathId()); + uint64_t free_space = 0; + Status s = fs_->GetFreeSpace(fn, IOOptions(), &free_space, nullptr); + s.PermitUncheckedError(); // TODO: Check the status + // needed_headroom is based on current size reserved by compactions, + // minus any files created by running compactions as they would count + // against the reserved size. If user didn't specify any compaction + // buffer, add reserved_disk_buffer_ that's calculated by default so the + // compaction doesn't end up leaving nothing for logs and flush SSTs + if (compaction_buffer_size_ == 0) { + needed_headroom += reserved_disk_buffer_; + } + if (free_space < needed_headroom + size_added_by_compaction) { + // We hit the condition of not enough disk space + ROCKS_LOG_ERROR(logger_, + "free space [%" PRIu64 + " bytes] is less than " + "needed headroom [%" ROCKSDB_PRIszt " bytes]\n", + free_space, needed_headroom); + return false; + } + } + + cur_compactions_reserved_size_ += size_added_by_compaction; + // Take a snapshot of cur_compactions_reserved_size_ for when we encounter + // a NoSpace error.
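+ // (Illustrative note: free_space_trigger_ is compared against + // GetFreeSpace() in ClearError() below; for example, if compactions + // currently reserve 2 GB, a soft NoSpace error is only cleared once at + // least 2 GB of free disk space is available again.)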
+ free_space_trigger_ = cur_compactions_reserved_size_; + return true; +} + +uint64_t SstFileManagerImpl::GetCompactionsReservedSize() { + MutexLock l(&mu_); + return cur_compactions_reserved_size_; +} + +uint64_t SstFileManagerImpl::GetTotalSize() { + MutexLock l(&mu_); + return total_files_size_; +} + +std::unordered_map<std::string, uint64_t> +SstFileManagerImpl::GetTrackedFiles() { + MutexLock l(&mu_); + return tracked_files_; +} + +int64_t SstFileManagerImpl::GetDeleteRateBytesPerSecond() { + return delete_scheduler_.GetRateBytesPerSecond(); +} + +void SstFileManagerImpl::SetDeleteRateBytesPerSecond(int64_t delete_rate) { + return delete_scheduler_.SetRateBytesPerSecond(delete_rate); +} + +double SstFileManagerImpl::GetMaxTrashDBRatio() { + return delete_scheduler_.GetMaxTrashDBRatio(); +} + +void SstFileManagerImpl::SetMaxTrashDBRatio(double r) { + return delete_scheduler_.SetMaxTrashDBRatio(r); +} + +uint64_t SstFileManagerImpl::GetTotalTrashSize() { + return delete_scheduler_.GetTotalTrashSize(); +} + +void SstFileManagerImpl::ReserveDiskBuffer(uint64_t size, + const std::string& path) { + MutexLock l(&mu_); + + reserved_disk_buffer_ += size; + if (path_.empty()) { + path_ = path; + } +} + +void SstFileManagerImpl::ClearError() { + while (true) { + MutexLock l(&mu_); + + if (error_handler_list_.empty() || closing_) { + return; + } + + uint64_t free_space = 0; + Status s = fs_->GetFreeSpace(path_, IOOptions(), &free_space, nullptr); + free_space = max_allowed_space_ > 0 + ? std::min(max_allowed_space_, free_space) + : free_space; + if (s.ok()) { + // In case of multi-DB instances, some of them may have experienced a + // soft error and some a hard error. In the SstFileManagerImpl, a hard + // error will basically override previously reported soft errors. Once + // we clear the hard error, we don't keep track of previous errors for + // now + if (bg_err_.severity() == Status::Severity::kHardError) { + if (free_space < reserved_disk_buffer_) { + ROCKS_LOG_ERROR(logger_, + "free space [%" PRIu64 + " bytes] is less than " + "required disk buffer [%" PRIu64 " bytes]\n", + free_space, reserved_disk_buffer_); + ROCKS_LOG_ERROR(logger_, "Cannot clear hard error\n"); + s = Status::NoSpace(); + } + } else if (bg_err_.severity() == Status::Severity::kSoftError) { + if (free_space < free_space_trigger_) { + ROCKS_LOG_WARN(logger_, + "free space [%" PRIu64 + " bytes] is less than " + "free space for compaction trigger [%" PRIu64 + " bytes]\n", + free_space, free_space_trigger_); + ROCKS_LOG_WARN(logger_, "Cannot clear soft error\n"); + s = Status::NoSpace(); + } + } + } + + // Someone could have called CancelErrorRecovery() and the list could have + // become empty, so check again here + if (s.ok()) { + assert(!error_handler_list_.empty()); + auto error_handler = error_handler_list_.front(); + // Since we will release the mutex, set cur_instance_ to signal to the + // shutdown thread, if it calls CancelErrorRecovery() in the meantime, + // to indicate that this DB instance is busy.
The DB instance is + // guaranteed to not be deleted before RecoverFromBGError() returns, + // since the ErrorHandler::recovery_in_prog_ flag would be true + cur_instance_ = error_handler; + mu_.Unlock(); + s = error_handler->RecoverFromBGError(); + TEST_SYNC_POINT("SstFileManagerImpl::ErrorCleared"); + mu_.Lock(); + // The DB instance might have been deleted while we were + // waiting for the mutex, so check cur_instance_ to make sure it's + // still non-null + if (cur_instance_) { + // Check for error again, since the instance may have recovered but + // immediately got another error. If that's the case, and the new + // error is also a NoSpace() non-fatal error, leave the instance in + // the list + Status err = cur_instance_->GetBGError(); + if (s.ok() && err.subcode() == IOStatus::SubCode::kNoSpace && + err.severity() < Status::Severity::kFatalError) { + s = err; + } + cur_instance_ = nullptr; + } + + if (s.ok() || s.IsShutdownInProgress() || + (!s.ok() && s.severity() >= Status::Severity::kFatalError)) { + // If shutdown is in progress, abandon this handler instance + // and continue with the others + error_handler_list_.pop_front(); + } + } + + if (!error_handler_list_.empty()) { + // If there are more instances to be recovered, reschedule after 5 + // seconds + int64_t wait_until = clock_->NowMicros() + 5000000; + cv_.TimedWait(wait_until); + } + + // Check again for error_handler_list_ empty, as a DB instance shutdown + // could have removed it from the queue while we were in timed wait + if (error_handler_list_.empty()) { + ROCKS_LOG_INFO(logger_, "Clearing error\n"); + bg_err_ = Status::OK(); + return; + } + } +} + +void SstFileManagerImpl::StartErrorRecovery(ErrorHandler* handler, + Status bg_error) { + MutexLock l(&mu_); + if (bg_error.severity() == Status::Severity::kSoftError) { + if (bg_err_.ok()) { + // Setting bg_err_ basically means we're in degraded mode + // Assume that all pending compactions will fail similarly. The trigger + // for clearing this condition is set to current compaction reserved + // size, so we stop checking disk space available in + // EnoughRoomForCompaction once this much free space is available + bg_err_ = bg_error; + } + } else if (bg_error.severity() == Status::Severity::kHardError) { + bg_err_ = bg_error; + } else { + assert(false); + } + + // If this is the first instance of this error, kick off a thread to poll + // and recover from this condition + if (error_handler_list_.empty()) { + error_handler_list_.push_back(handler); + // Release lock before calling join. It's ok to do so because + // error_handler_list_ is now non-empty, so no other invocation of this + // function will execute this piece of code + mu_.Unlock(); + if (bg_thread_) { + bg_thread_->join(); + } + // Start a new thread. The previous one would have exited.
+ bg_thread_.reset(new port::Thread(&SstFileManagerImpl::ClearError, this)); + mu_.Lock(); + } else { + // Check if this DB instance is already in the list + for (auto iter = error_handler_list_.begin(); + iter != error_handler_list_.end(); ++iter) { + if ((*iter) == handler) { + return; + } + } + error_handler_list_.push_back(handler); + } +} + +bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) { + MutexLock l(&mu_); + + if (cur_instance_ == handler) { + // This instance is currently busy attempting to recover + // Nullify it so the recovery thread doesn't attempt to access it again + cur_instance_ = nullptr; + return false; + } + + for (auto iter = error_handler_list_.begin(); + iter != error_handler_list_.end(); ++iter) { + if ((*iter) == handler) { + error_handler_list_.erase(iter); + return true; + } + } + return false; +} + +Status SstFileManagerImpl::ScheduleFileDeletion(const std::string& file_path, + const std::string& path_to_sync, + const bool force_bg) { + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::ScheduleFileDeletion", + const_cast<std::string*>(&file_path)); + return delete_scheduler_.DeleteFile(file_path, path_to_sync, force_bg); +} + +void SstFileManagerImpl::WaitForEmptyTrash() { + delete_scheduler_.WaitForEmptyTrash(); +} + +void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path, + uint64_t file_size) { + auto tracked_file = tracked_files_.find(file_path); + if (tracked_file != tracked_files_.end()) { + // File was added before, we will just update the size + total_files_size_ -= tracked_file->second; + total_files_size_ += file_size; + cur_compactions_reserved_size_ -= file_size; + } else { + total_files_size_ += file_size; + } + tracked_files_[file_path] = file_size; +} + +void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) { + auto tracked_file = tracked_files_.find(file_path); + if (tracked_file == tracked_files_.end()) { + // File is not tracked + return; + } + + total_files_size_ -= tracked_file->second; + tracked_files_.erase(tracked_file); +} + +SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<Logger> info_log, + std::string trash_dir, + int64_t rate_bytes_per_sec, + bool delete_existing_trash, Status* status, + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk) { + const auto& fs = env->GetFileSystem(); + return NewSstFileManager(env, fs, info_log, trash_dir, rate_bytes_per_sec, + delete_existing_trash, status, max_trash_db_ratio, + bytes_max_delete_chunk); +} + +SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<FileSystem> fs, + std::shared_ptr<Logger> info_log, + const std::string& trash_dir, + int64_t rate_bytes_per_sec, + bool delete_existing_trash, Status* status, + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk) { + const auto& clock = env->GetSystemClock(); + SstFileManagerImpl* res = + new SstFileManagerImpl(clock, fs, info_log, rate_bytes_per_sec, + max_trash_db_ratio, bytes_max_delete_chunk); + + // trash_dir is deprecated and not needed anymore, but if the user passed it + // we will still remove files in it.
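+ // Illustrative call, with hypothetical arguments: since trash_dir is + // deprecated, a typical invocation passes an empty string, e.g. + // NewSstFileManager(env, fs, info_log, "" /* trash_dir */, + // 64 << 20 /* 64 MB/s delete rate */, + // true /* delete_existing_trash */, &status, + // 0.25 /* max_trash_db_ratio */, 0);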
+ Status s = Status::OK(); + if (delete_existing_trash && trash_dir != "") { + std::vector<std::string> files_in_trash; + s = fs->GetChildren(trash_dir, IOOptions(), &files_in_trash, nullptr); + if (s.ok()) { + for (const std::string& trash_file : files_in_trash) { + std::string path_in_trash = trash_dir + "/" + trash_file; + res->OnAddFile(path_in_trash); + Status file_delete = + res->ScheduleFileDeletion(path_in_trash, trash_dir); + if (s.ok() && !file_delete.ok()) { + s = file_delete; + } + } + } + } + + if (status) { + *status = s; + } else { + // No one passed us a Status, so they must not care about the error... + s.PermitUncheckedError(); + } + + return res; +} + + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sst_file_manager_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sst_file_manager_impl.h new file mode 100644 index 0000000000..e85376e5ce --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/sst_file_manager_impl.h @@ -0,0 +1,193 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + + +#include <string> + +#include "db/compaction/compaction.h" +#include "file/delete_scheduler.h" +#include "port/port.h" +#include "rocksdb/sst_file_manager.h" + +namespace ROCKSDB_NAMESPACE { +class ErrorHandler; +class FileSystem; +class SystemClock; +class Logger; + +// SstFileManager is used to track SST and blob files in the DB and control +// their deletion rate. All SstFileManager public functions are thread-safe. +class SstFileManagerImpl : public SstFileManager { + public: + explicit SstFileManagerImpl(const std::shared_ptr<SystemClock>& clock, + const std::shared_ptr<FileSystem>& fs, + const std::shared_ptr<Logger>& logger, + int64_t rate_bytes_per_sec, + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk); + + ~SstFileManagerImpl(); + + // DB will call OnAddFile whenever a new sst/blob file is added. + Status OnAddFile(const std::string& file_path); + + // Overload where size of the file is provided by the caller rather than + // queried from the filesystem. This is an optimization. + Status OnAddFile(const std::string& file_path, uint64_t file_size); + + // DB will call OnDeleteFile whenever a sst/blob file is deleted. + Status OnDeleteFile(const std::string& file_path); + + // DB will call OnMoveFile whenever a sst/blob file is moved to a new path. + Status OnMoveFile(const std::string& old_path, const std::string& new_path, + uint64_t* file_size = nullptr); + + // Update the maximum allowed space that should be used by RocksDB, if + // the total size of the SST and blob files exceeds max_allowed_space, writes + // to RocksDB will fail. + // + // Setting max_allowed_space to 0 will disable this feature, maximum allowed + // space will be infinite (Default value). + // + // thread-safe. + void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) override; + + void SetCompactionBufferSize(uint64_t compaction_buffer_size) override; + + // Return true if the total size of SST and blob files exceeded the maximum + // allowed space usage. + // + // thread-safe. + bool IsMaxAllowedSpaceReached() override; + + bool IsMaxAllowedSpaceReachedIncludingCompactions() override; + + // Returns true if there is enough (approximate) space for the specified + // compaction.
Space is approximate because this function conservatively + // estimates how much space is currently being used by compactions (i.e. + // if a compaction has started, this function bumps the used space by + // the full compaction size). + bool EnoughRoomForCompaction(ColumnFamilyData* cfd, + const std::vector<CompactionInputFiles>& inputs, + const Status& bg_error); + + // Bookkeeping so total_file_sizes_ goes back to normal after compaction + // finishes + void OnCompactionCompletion(Compaction* c); + + uint64_t GetCompactionsReservedSize(); + + // Return the total size of all tracked files. + uint64_t GetTotalSize() override; + + // Return a map containing all tracked files and their corresponding sizes. + std::unordered_map<std::string, uint64_t> GetTrackedFiles() override; + + // Return delete rate limit in bytes per second. + virtual int64_t GetDeleteRateBytesPerSecond() override; + + // Update the delete rate limit in bytes per second. + virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) override; + + // Return trash/DB size ratio where new files will be deleted immediately + virtual double GetMaxTrashDBRatio() override; + + // Update trash/DB size ratio where new files will be deleted immediately + virtual void SetMaxTrashDBRatio(double ratio) override; + + // Return the total size of trash files + uint64_t GetTotalTrashSize() override; + + // Called by each DB instance using this sst file manager to reserve + // disk buffer space for recovery from out of space errors + void ReserveDiskBuffer(uint64_t buffer, const std::string& path); + + // Set a flag upon encountering disk full. May enqueue the ErrorHandler + // instance for background polling and recovery + void StartErrorRecovery(ErrorHandler* db, Status bg_error); + + // Remove the given Errorhandler instance from the recovery queue. It's + // not guaranteed + bool CancelErrorRecovery(ErrorHandler* db); + + // Mark file as trash and schedule its deletion. If force_bg is set, it + // forces the file to be deleted in the background regardless of DB size, + // except when rate limited delete is disabled + virtual Status ScheduleFileDeletion(const std::string& file_path, + const std::string& dir_to_sync, + const bool force_bg = false); + + // Wait for all files being deleted in the background to finish or for + // destructor to be called. + virtual void WaitForEmptyTrash(); + + DeleteScheduler* delete_scheduler() { return &delete_scheduler_; } + + // Stop the error recovery background thread.
This should be called only + // once in the object's lifetime, and before the destructor + void Close(); + + void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) override { + stats_ = stats; + delete_scheduler_.SetStatisticsPtr(stats); + } + + private: + // REQUIRES: mutex locked + void OnAddFileImpl(const std::string& file_path, uint64_t file_size); + // REQUIRES: mutex locked + void OnDeleteFileImpl(const std::string& file_path); + + void ClearError(); + bool CheckFreeSpace() { + return bg_err_.severity() == Status::Severity::kSoftError; + } + + std::shared_ptr<SystemClock> clock_; + std::shared_ptr<FileSystem> fs_; + std::shared_ptr<Logger> logger_; + // Mutex to protect tracked_files_, total_files_size_ + port::Mutex mu_; + // The summation of the sizes of all files in tracked_files_ map + uint64_t total_files_size_; + // Compactions should only execute if they can leave at least + // this amount of buffer space for logs and flushes + uint64_t compaction_buffer_size_; + // Estimated size of the current ongoing compactions + uint64_t cur_compactions_reserved_size_; + // A map containing all tracked files and their sizes + // file_path => file_size + std::unordered_map<std::string, uint64_t> tracked_files_; + // The maximum allowed space (in bytes) for sst and blob files. + uint64_t max_allowed_space_; + // DeleteScheduler used to throttle file deletion. + DeleteScheduler delete_scheduler_; + port::CondVar cv_; + // Flag to force error recovery thread to exit + bool closing_; + // Background error recovery thread + std::unique_ptr<port::Thread> bg_thread_; + // A path in the filesystem corresponding to this SFM. This is used for + // calling Env::GetFreeSpace. Posix requires a path in the filesystem + std::string path_; + // Save the current background error + Status bg_err_; + // Amount of free disk headroom before allowing recovery from hard errors + uint64_t reserved_disk_buffer_; + // For soft errors, amount of free disk space before we can allow + // compactions to run full throttle. If disk space is below this trigger, + // compactions will be gated by free disk space > input size + uint64_t free_space_trigger_; + // List of database error handler instances tracked by this SstFileManager. + std::list<ErrorHandler*> error_handler_list_; + // Pointer to ErrorHandler instance that is currently processing recovery + ErrorHandler* cur_instance_; + std::shared_ptr<Statistics> stats_; +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/writable_file_writer.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/writable_file_writer.cc new file mode 100644 index 0000000000..908878a5fa --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/writable_file_writer.cc @@ -0,0 +1,989 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "file/writable_file_writer.h" + +#include <algorithm> +#include <mutex> + +#include "db/version_edit.h" +#include "monitoring/histogram.h" +#include "monitoring/iostats_context_imp.h" +#include "port/port.h" +#include "rocksdb/io_status.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "util/crc32c.h" +#include "util/random.h" +#include "util/rate_limiter_impl.h" + +namespace ROCKSDB_NAMESPACE { +IOStatus WritableFileWriter::Create(const std::shared_ptr<FileSystem>& fs, + const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<WritableFileWriter>* writer, + IODebugContext* dbg) { + if (file_opts.use_direct_writes && + 0 == file_opts.writable_file_max_buffer_size) { + return IOStatus::InvalidArgument( + "Direct write requires writable_file_max_buffer_size > 0"); + } + std::unique_ptr<FSWritableFile> file; + IOStatus io_s = fs->NewWritableFile(fname, file_opts, &file, dbg); + if (io_s.ok()) { + writer->reset(new WritableFileWriter(std::move(file), fname, file_opts)); + } + return io_s; +} + +IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, + Env::IOPriority op_rate_limiter_priority) { + if (seen_error()) { + return AssertFalseAndGetStatusForPrevError(); + } + + const char* src = data.data(); + size_t left = data.size(); + IOStatus s; + pending_sync_ = true; + + TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Append:0", REDUCE_ODDS2); + + // Calculate the checksum of appended data + UpdateFileChecksum(data); + + { + IOOptions io_options; + io_options.rate_limiter_priority = + WritableFileWriter::DecideRateLimiterPriority( + writable_file_->GetIOPriority(), op_rate_limiter_priority); + IOSTATS_TIMER_GUARD(prepare_write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite"); + writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left, + io_options, nullptr); + } + + // See whether we need to enlarge the buffer to avoid the flush + if (buf_.Capacity() - buf_.CurrentSize() < left) { + for (size_t cap = buf_.Capacity(); + cap < max_buffer_size_; // There is still room to increase + cap *= 2) { + // See whether the next available size is large enough. + // Buffer will never be increased to more than max_buffer_size_. + size_t desired_capacity = std::min(cap * 2, max_buffer_size_); + if (desired_capacity - buf_.CurrentSize() >= left || + (use_direct_io() && desired_capacity == max_buffer_size_)) { + buf_.AllocateNewBuffer(desired_capacity, true); + break; + } + } + } + + // Flush only when buffered I/O + if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) { + if (buf_.CurrentSize() > 0) { + s = Flush(op_rate_limiter_priority); + if (!s.ok()) { + set_seen_error(); + return s; + } + } + assert(buf_.CurrentSize() == 0); + } + + if (perform_data_verification_ && buffered_data_with_checksum_ && + crc32c_checksum != 0) { + // Since we want to use the checksum of the input data, we cannot break it + // into several pieces. We will only write them in the buffer when buffer + // size is enough. Otherwise, we will directly write it down.
+ if (use_direct_io() || (buf_.Capacity() - buf_.CurrentSize()) >= left) { + if ((buf_.Capacity() - buf_.CurrentSize()) >= left) { + size_t appended = buf_.Append(src, left); + if (appended != left) { + s = IOStatus::Corruption("Write buffer append failure"); + } + buffered_data_crc32c_checksum_ = crc32c::Crc32cCombine( + buffered_data_crc32c_checksum_, crc32c_checksum, appended); + } else { + while (left > 0) { + size_t appended = buf_.Append(src, left); + buffered_data_crc32c_checksum_ = + crc32c::Extend(buffered_data_crc32c_checksum_, src, appended); + left -= appended; + src += appended; + + if (left > 0) { + s = Flush(op_rate_limiter_priority); + if (!s.ok()) { + break; + } + } + } + } + } else { + assert(buf_.CurrentSize() == 0); + buffered_data_crc32c_checksum_ = crc32c_checksum; + s = WriteBufferedWithChecksum(src, left, op_rate_limiter_priority); + } + } else { + // In this case, either we do not need to do the data verification or + // the caller does not provide the checksum of the data + // (crc32c_checksum = 0). + // + // With direct I/O on we never write directly to disk; otherwise we + // simply use the buffer for its original purpose of accumulating many + // small chunks + if (use_direct_io() || (buf_.Capacity() >= left)) { + while (left > 0) { + size_t appended = buf_.Append(src, left); + if (perform_data_verification_ && buffered_data_with_checksum_) { + buffered_data_crc32c_checksum_ = + crc32c::Extend(buffered_data_crc32c_checksum_, src, appended); + } + left -= appended; + src += appended; + + if (left > 0) { + s = Flush(op_rate_limiter_priority); + if (!s.ok()) { + break; + } + } + } + } else { + // Writing directly to file bypassing the buffer + assert(buf_.CurrentSize() == 0); + if (perform_data_verification_ && buffered_data_with_checksum_) { + buffered_data_crc32c_checksum_ = crc32c::Value(src, left); + s = WriteBufferedWithChecksum(src, left, op_rate_limiter_priority); + } else { + s = WriteBuffered(src, left, op_rate_limiter_priority); + } + } + } + + TEST_KILL_RANDOM("WritableFileWriter::Append:1"); + if (s.ok()) { + uint64_t cur_size = filesize_.load(std::memory_order_acquire); + filesize_.store(cur_size + data.size(), std::memory_order_release); + } else { + set_seen_error(); + } + return s; +} + +IOStatus WritableFileWriter::Pad(const size_t pad_bytes, + Env::IOPriority op_rate_limiter_priority) { + if (seen_error()) { + return AssertFalseAndGetStatusForPrevError(); + } + assert(pad_bytes < kDefaultPageSize); + size_t left = pad_bytes; + size_t cap = buf_.Capacity() - buf_.CurrentSize(); + size_t pad_start = buf_.CurrentSize(); + + // Assume pad_bytes is small compared to buf_ capacity. So we always + // use buf_ rather than writing directly to the file, as Append() does in + // certain cases.
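+ // (Illustrative numbers: with pad_bytes = 512 and only 200 bytes of free + // buffer capacity, the loop below pads 200 bytes, flushes the buffer, and + // then pads the remaining 312 bytes into the now-empty buffer.)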
+ while (left) { + size_t append_bytes = std::min(cap, left); + buf_.PadWith(append_bytes, 0); + left -= append_bytes; + if (left > 0) { + IOStatus s = Flush(op_rate_limiter_priority); + if (!s.ok()) { + set_seen_error(); + return s; + } + } + cap = buf_.Capacity() - buf_.CurrentSize(); + } + pending_sync_ = true; + uint64_t cur_size = filesize_.load(std::memory_order_acquire); + filesize_.store(cur_size + pad_bytes, std::memory_order_release); + if (perform_data_verification_) { + buffered_data_crc32c_checksum_ = + crc32c::Extend(buffered_data_crc32c_checksum_, + buf_.BufferStart() + pad_start, pad_bytes); + } + return IOStatus::OK(); +} + +IOStatus WritableFileWriter::Close() { + if (seen_error()) { + IOStatus interim; + if (writable_file_.get() != nullptr) { + interim = writable_file_->Close(IOOptions(), nullptr); + writable_file_.reset(); + } + if (interim.ok()) { + return IOStatus::IOError( + "File is closed but data not flushed as writer has previous error."); + } else { + return interim; + } + } + + // Do not quit immediately on failure; the file MUST be closed. + + // It is possible to close the file twice now, as we MUST close it + // in the destructor; simply flushing is not enough. On Windows, + // pre-allocation does not fill with zeros, and with unbuffered access + // we also need to set the end of data. + if (writable_file_.get() == nullptr) { + return IOStatus::OK(); + } + + IOStatus s; + s = Flush(); // flush cache to OS + + IOStatus interim; + IOOptions io_options; + io_options.rate_limiter_priority = writable_file_->GetIOPriority(); + // In direct I/O mode we write whole pages so + // we need to let the file know where data ends. + if (use_direct_io()) { + { + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } + uint64_t filesz = filesize_.load(std::memory_order_acquire); + interim = writable_file_->Truncate(filesz, io_options, nullptr); + if (ShouldNotifyListeners()) { + auto finish_ts = FileOperationInfo::FinishNow(); + NotifyOnFileTruncateFinish(start_ts, finish_ts, s); + if (!interim.ok()) { + NotifyOnIOError(interim, FileOperationType::kTruncate, file_name(), + filesz); + } + } + } + if (interim.ok()) { + { + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } + interim = writable_file_->Fsync(io_options, nullptr); + if (ShouldNotifyListeners()) { + auto finish_ts = FileOperationInfo::FinishNow(); + NotifyOnFileSyncFinish(start_ts, finish_ts, s, + FileOperationType::kFsync); + if (!interim.ok()) { + NotifyOnIOError(interim, FileOperationType::kFsync, file_name()); + } + } + } + } + if (!interim.ok() && s.ok()) { + s = interim; + } + } + + TEST_KILL_RANDOM("WritableFileWriter::Close:0"); + { + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } + interim = writable_file_->Close(io_options, nullptr); + if (ShouldNotifyListeners()) { + auto finish_ts = FileOperationInfo::FinishNow(); + NotifyOnFileCloseFinish(start_ts, finish_ts, s); + if (!interim.ok()) { + NotifyOnIOError(interim, FileOperationType::kClose, file_name()); + } + } + } + if (!interim.ok() && s.ok()) { + s = interim; + } + + writable_file_.reset(); + TEST_KILL_RANDOM("WritableFileWriter::Close:1"); + + if (s.ok()) { + if (checksum_generator_ != nullptr && !checksum_finalized_) { + checksum_generator_->Finalize(); + checksum_finalized_ = true; + } + } else { + set_seen_error(); + } + + return s; +} + +// write out the
cached data to the OS cache or storage if direct I/O is +enabled +IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) { + if (seen_error()) { + return AssertFalseAndGetStatusForPrevError(); + } + + IOStatus s; + TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Flush:0", REDUCE_ODDS2); + + if (buf_.CurrentSize() > 0) { + if (use_direct_io()) { + if (pending_sync_) { + if (perform_data_verification_ && buffered_data_with_checksum_) { + s = WriteDirectWithChecksum(op_rate_limiter_priority); + } else { + s = WriteDirect(op_rate_limiter_priority); + } + } + } else { + if (perform_data_verification_ && buffered_data_with_checksum_) { + s = WriteBufferedWithChecksum(buf_.BufferStart(), buf_.CurrentSize(), + op_rate_limiter_priority); + } else { + s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize(), + op_rate_limiter_priority); + } + } + if (!s.ok()) { + set_seen_error(); + return s; + } + } + + { + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } + IOOptions io_options; + io_options.rate_limiter_priority = + WritableFileWriter::DecideRateLimiterPriority( + writable_file_->GetIOPriority(), op_rate_limiter_priority); + s = writable_file_->Flush(io_options, nullptr); + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileFlushFinish(start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kFlush, file_name()); + } + } + } + + if (!s.ok()) { + set_seen_error(); + return s; + } + + // sync OS cache to disk for every bytes_per_sync_ + // TODO: give log file and sst file different options (log + // files could be potentially cached in OS for their whole + // life time, thus we might not want to flush at all). + + // We try to avoid sync to the last 1MB of data. For two reasons: + // (1) avoid rewrite the same page that is modified later. + // (2) for older versions of the OS, write can block while writing out + // the page. + // XFS does neighbor page flushing outside of the specified ranges. We + // need to make sure sync range is far from the write offset. + if (!use_direct_io() && bytes_per_sync_) { + const uint64_t kBytesNotSyncRange = + 1024 * 1024; // recent 1MB is not synced. + const uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB.
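+ // (Worked example with illustrative numbers: if bytes_per_sync_ is 1 MB, + // the current file size is 5,000,000 bytes, and last_sync_size_ is 0, + // then offset_sync_to = 5,000,000 - 1,048,576 = 3,951,424, rounded down + // to the 4 KB multiple 3,948,544; since that exceeds last_sync_size_ by + // more than bytes_per_sync_, RangeSync(0, 3948544) is issued and + // last_sync_size_ becomes 3,948,544.)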
+ uint64_t cur_size = filesize_.load(std::memory_order_acquire); + if (cur_size > kBytesNotSyncRange) { + uint64_t offset_sync_to = cur_size - kBytesNotSyncRange; + offset_sync_to -= offset_sync_to % kBytesAlignWhenSync; + assert(offset_sync_to >= last_sync_size_); + if (offset_sync_to > 0 && + offset_sync_to - last_sync_size_ >= bytes_per_sync_) { + s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_); + if (!s.ok()) { + set_seen_error(); + } + last_sync_size_ = offset_sync_to; + } + } + } + + return s; +} + +std::string WritableFileWriter::GetFileChecksum() { + if (checksum_generator_ != nullptr) { + assert(checksum_finalized_); + return checksum_generator_->GetChecksum(); + } else { + return kUnknownFileChecksum; + } +} + +const char* WritableFileWriter::GetFileChecksumFuncName() const { + if (checksum_generator_ != nullptr) { + return checksum_generator_->Name(); + } else { + return kUnknownFileChecksumFuncName; + } +} + +IOStatus WritableFileWriter::Sync(bool use_fsync) { + if (seen_error()) { + return AssertFalseAndGetStatusForPrevError(); + } + + IOStatus s = Flush(); + if (!s.ok()) { + set_seen_error(); + return s; + } + TEST_KILL_RANDOM("WritableFileWriter::Sync:0"); + if (!use_direct_io() && pending_sync_) { + s = SyncInternal(use_fsync); + if (!s.ok()) { + set_seen_error(); + return s; + } + } + TEST_KILL_RANDOM("WritableFileWriter::Sync:1"); + pending_sync_ = false; + return IOStatus::OK(); +} + +IOStatus WritableFileWriter::SyncWithoutFlush(bool use_fsync) { + if (seen_error()) { + return AssertFalseAndGetStatusForPrevError(); + } + if (!writable_file_->IsSyncThreadSafe()) { + return IOStatus::NotSupported( + "Can't WritableFileWriter::SyncWithoutFlush() because " + "WritableFile::IsSyncThreadSafe() is false"); + } + TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1"); + IOStatus s = SyncInternal(use_fsync); + TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2"); + if (!s.ok()) { +#ifndef NDEBUG + sync_without_flush_called_ = true; +#endif // NDEBUG + set_seen_error(); + } + return s; +} + +IOStatus WritableFileWriter::SyncInternal(bool use_fsync) { + // Caller is supposed to check seen_error_ + IOStatus s; + IOSTATS_TIMER_GUARD(fsync_nanos); + TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0"); + auto prev_perf_level = GetPerfLevel(); + + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); + + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } + + IOOptions io_options; + io_options.rate_limiter_priority = writable_file_->GetIOPriority(); + if (use_fsync) { + s = writable_file_->Fsync(io_options, nullptr); + } else { + s = writable_file_->Sync(io_options, nullptr); + } + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileSyncFinish( + start_ts, finish_ts, s, + use_fsync ? FileOperationType::kFsync : FileOperationType::kSync); + if (!s.ok()) { + NotifyOnIOError( + s, (use_fsync ? FileOperationType::kFsync : FileOperationType::kSync), + file_name()); + } + } + SetPerfLevel(prev_perf_level); + + // The caller will be responsible to call set_seen_error() if s is not OK. 
+ return s; +} + +IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { + if (seen_error()) { + return AssertFalseAndGetStatusForPrevError(); + } + + IOSTATS_TIMER_GUARD(range_sync_nanos); + TEST_SYNC_POINT("WritableFileWriter::RangeSync:0"); + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } + IOOptions io_options; + io_options.rate_limiter_priority = writable_file_->GetIOPriority(); + IOStatus s = writable_file_->RangeSync(offset, nbytes, io_options, nullptr); + if (!s.ok()) { + set_seen_error(); + } + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileRangeSyncFinish(offset, nbytes, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kRangeSync, file_name(), nbytes, + offset); + } + } + return s; +} + +// This method writes to disk the specified data and makes use of the rate +// limiter if available +IOStatus WritableFileWriter::WriteBuffered( + const char* data, size_t size, Env::IOPriority op_rate_limiter_priority) { + if (seen_error()) { + return AssertFalseAndGetStatusForPrevError(); + } + + IOStatus s; + assert(!use_direct_io()); + const char* src = data; + size_t left = size; + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; + Env::IOPriority rate_limiter_priority_used = + WritableFileWriter::DecideRateLimiterPriority( + writable_file_->GetIOPriority(), op_rate_limiter_priority); + IOOptions io_options; + io_options.rate_limiter_priority = rate_limiter_priority_used; + + while (left > 0) { + size_t allowed = left; + if (rate_limiter_ != nullptr && + rate_limiter_priority_used != Env::IO_TOTAL) { + allowed = rate_limiter_->RequestToken(left, 0 /* alignment */, + rate_limiter_priority_used, stats_, + RateLimiter::OpType::kWrite); + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + + FileOperationInfo::StartTimePoint start_ts; + uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr); + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + old_size = next_write_offset_; + } + { + auto prev_perf_level = GetPerfLevel(); + + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); + if (perform_data_verification_) { + Crc32cHandoffChecksumCalculation(src, allowed, checksum_buf); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->Append(Slice(src, allowed), io_options, v_info, + nullptr); + } else { + s = writable_file_->Append(Slice(src, allowed), io_options, nullptr); + } + if (!s.ok()) { + // If writable_file_->Append() failed, then the data may or may not + // exist in the underlying memory buffer, OS page cache, remote file + // system's buffer, etc. If WritableFileWriter keeps the data in + // buf_, then a future Close() or write retry may send the data to + // the underlying file again. If the data does exist in the + // underlying buffer and gets written to the file eventually despite + // returning error, the file may end up with two duplicate pieces of + // data. Therefore, clear the buf_ at the WritableFileWriter layer + // and let caller determine error handling. 
+ buf_.Size(0); + buffered_data_crc32c_checksum_ = 0; + } + SetPerfLevel(prev_perf_level); + } + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kAppend, file_name(), allowed, + old_size); + } + } + if (!s.ok()) { + set_seen_error(); + return s; + } + } + + IOSTATS_ADD(bytes_written, allowed); + TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0"); + + left -= allowed; + src += allowed; + uint64_t cur_size = flushed_size_.load(std::memory_order_acquire); + flushed_size_.store(cur_size + allowed, std::memory_order_release); + } + buf_.Size(0); + buffered_data_crc32c_checksum_ = 0; + if (!s.ok()) { + set_seen_error(); + } + return s; +} + +IOStatus WritableFileWriter::WriteBufferedWithChecksum( + const char* data, size_t size, Env::IOPriority op_rate_limiter_priority) { + if (seen_error()) { + return AssertFalseAndGetStatusForPrevError(); + } + + IOStatus s; + assert(!use_direct_io()); + assert(perform_data_verification_ && buffered_data_with_checksum_); + const char* src = data; + size_t left = size; + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; + Env::IOPriority rate_limiter_priority_used = + WritableFileWriter::DecideRateLimiterPriority( + writable_file_->GetIOPriority(), op_rate_limiter_priority); + IOOptions io_options; + io_options.rate_limiter_priority = rate_limiter_priority_used; + // Check how much is allowed. Here, we loop until the rate limiter allows to + // write the entire buffer. + // TODO: need to be improved since it sort of defeats the purpose of the rate + // limiter + size_t data_size = left; + if (rate_limiter_ != nullptr && rate_limiter_priority_used != Env::IO_TOTAL) { + while (data_size > 0) { + size_t tmp_size; + tmp_size = rate_limiter_->RequestToken(data_size, buf_.Alignment(), + rate_limiter_priority_used, stats_, + RateLimiter::OpType::kWrite); + data_size -= tmp_size; + } + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + + FileOperationInfo::StartTimePoint start_ts; + uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr); + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + old_size = next_write_offset_; + } + { + auto prev_perf_level = GetPerfLevel(); + + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); + + EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->Append(Slice(src, left), io_options, v_info, nullptr); + SetPerfLevel(prev_perf_level); + } + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileWriteFinish(old_size, left, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kAppend, file_name(), left, + old_size); + } + } + if (!s.ok()) { + // If writable_file_->Append() failed, then the data may or may not + // exist in the underlying memory buffer, OS page cache, remote file + // system's buffer, etc. If WritableFileWriter keeps the data in + // buf_, then a future Close() or write retry may send the data to + // the underlying file again. If the data does exist in the + // underlying buffer and gets written to the file eventually despite + // returning error, the file may end up with two duplicate pieces of + // data. 
Therefore, clear the buf_ at the WritableFileWriter layer + // and let caller determine error handling. + buf_.Size(0); + buffered_data_crc32c_checksum_ = 0; + set_seen_error(); + return s; + } + } + + IOSTATS_ADD(bytes_written, left); + TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0"); + + // Buffer write is successful, reset the buffer current size to 0 and reset + // the corresponding checksum value + buf_.Size(0); + buffered_data_crc32c_checksum_ = 0; + uint64_t cur_size = flushed_size_.load(std::memory_order_acquire); + flushed_size_.store(cur_size + left, std::memory_order_release); + if (!s.ok()) { + set_seen_error(); + } + return s; +} + +void WritableFileWriter::UpdateFileChecksum(const Slice& data) { + if (checksum_generator_ != nullptr) { + checksum_generator_->Update(data.data(), data.size()); + } +} + +// Currently, crc32c checksum is used to calculate the checksum value of the +// content in the input buffer for handoff. In the future, the checksum might be +// calculated from the existing crc32c checksums of the WAL and Manifest +// records, or even SST file blocks. +// TODO: effectively use the existing checksum of the data being written to +// generate the crc32c checksum instead of a raw calculation. +void WritableFileWriter::Crc32cHandoffChecksumCalculation(const char* data, + size_t size, + char* buf) { + uint32_t v_crc32c = crc32c::Extend(0, data, size); + EncodeFixed32(buf, v_crc32c); +} + +// This flushes the accumulated data in the buffer. We pad data with zeros if +// necessary to a whole page. +// However, during automatic flushes padding would not be necessary. +// We always use RateLimiter if available. We move (Refit) any buffer bytes +// that are left over beyond the whole number of pages to be written again on +// the next flush, because we can only write at aligned +// offsets. +IOStatus WritableFileWriter::WriteDirect( + Env::IOPriority op_rate_limiter_priority) { + if (seen_error()) { + assert(false); + + return IOStatus::IOError("Writer has previous error."); + } + + assert(use_direct_io()); + IOStatus s; + const size_t alignment = buf_.Alignment(); + assert((next_write_offset_ % alignment) == 0); + + // Calculate whole page final file advance if all writes succeed + const size_t file_advance = + TruncateToPageBoundary(alignment, buf_.CurrentSize()); + + // Calculate the leftover tail, we write it here padded with zeros BUT we + // will write it again in the future either on Close() OR when the current + // whole page fills out.
+ const size_t leftover_tail = buf_.CurrentSize() - file_advance; + + // Round up and pad + buf_.PadToAlignmentWith(0); + + const char* src = buf_.BufferStart(); + uint64_t write_offset = next_write_offset_; + size_t left = buf_.CurrentSize(); + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; + Env::IOPriority rate_limiter_priority_used = + WritableFileWriter::DecideRateLimiterPriority( + writable_file_->GetIOPriority(), op_rate_limiter_priority); + IOOptions io_options; + io_options.rate_limiter_priority = rate_limiter_priority_used; + + while (left > 0) { + // Check how much is allowed + size_t size = left; + if (rate_limiter_ != nullptr && + rate_limiter_priority_used != Env::IO_TOTAL) { + size = rate_limiter_->RequestToken(left, buf_.Alignment(), + rate_limiter_priority_used, stats_, + RateLimiter::OpType::kWrite); + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } + // direct writes must be positional + if (perform_data_verification_) { + Crc32cHandoffChecksumCalculation(src, size, checksum_buf); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->PositionedAppend(Slice(src, size), write_offset, + io_options, v_info, nullptr); + } else { + s = writable_file_->PositionedAppend(Slice(src, size), write_offset, + io_options, nullptr); + } + + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kPositionedAppend, file_name(), + size, write_offset); + } + } + if (!s.ok()) { + buf_.Size(file_advance + leftover_tail); + set_seen_error(); + return s; + } + } + + IOSTATS_ADD(bytes_written, size); + left -= size; + src += size; + write_offset += size; + uint64_t cur_size = flushed_size_.load(std::memory_order_acquire); + flushed_size_.store(cur_size + size, std::memory_order_release); + assert((next_write_offset_ % alignment) == 0); + } + + if (s.ok()) { + // Move the tail to the beginning of the buffer + // This never happens during normal Append but rather during + // explicit call to Flush()/Sync() or Close() + buf_.RefitTail(file_advance, leftover_tail); + // This is where we start writing next time which may or may not be + // the actual file size on disk. They match if the buffer size + // is a multiple of whole pages; otherwise filesize_ is leftover_tail + // behind + next_write_offset_ += file_advance; + } else { + set_seen_error(); + } + return s; +} + +IOStatus WritableFileWriter::WriteDirectWithChecksum( + Env::IOPriority op_rate_limiter_priority) { + if (seen_error()) { + return AssertFalseAndGetStatusForPrevError(); + } + + assert(use_direct_io()); + assert(perform_data_verification_ && buffered_data_with_checksum_); + IOStatus s; + const size_t alignment = buf_.Alignment(); + assert((next_write_offset_ % alignment) == 0); + + // Calculate whole page final file advance if all writes succeed + const size_t file_advance = + TruncateToPageBoundary(alignment, buf_.CurrentSize()); + + // Calculate the leftover tail, we write it here padded with zeros BUT we + // will write it again in the future either on Close() OR when the current + // whole page fills out. + const size_t leftover_tail = buf_.CurrentSize() - file_advance; + + // Round up, pad, and combine the checksum.
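+ // (Illustrative note: crc32c::Crc32cCombine(crc_a, crc_b, len_b) yields + // the crc32c of the concatenation A||B given crc32c(A), crc32c(B), and + // the length of B; below it is used to append the checksum of the zero + // padding to the checksum of the buffered data, so the combined value + // matches the padded buffer that gets written out.)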
+ size_t last_cur_size = buf_.CurrentSize(); + buf_.PadToAlignmentWith(0); + size_t padded_size = buf_.CurrentSize() - last_cur_size; + const char* padded_start = buf_.BufferStart() + last_cur_size; + uint32_t padded_checksum = crc32c::Value(padded_start, padded_size); + buffered_data_crc32c_checksum_ = crc32c::Crc32cCombine( + buffered_data_crc32c_checksum_, padded_checksum, padded_size); + + const char* src = buf_.BufferStart(); + uint64_t write_offset = next_write_offset_; + size_t left = buf_.CurrentSize(); + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; + + Env::IOPriority rate_limiter_priority_used = + WritableFileWriter::DecideRateLimiterPriority( + writable_file_->GetIOPriority(), op_rate_limiter_priority); + IOOptions io_options; + io_options.rate_limiter_priority = rate_limiter_priority_used; + // Check how much is allowed. Here, we loop until the rate limiter allows to + // write the entire buffer. + // TODO: need to be improved since it sort of defeats the purpose of the rate + // limiter + size_t data_size = left; + if (rate_limiter_ != nullptr && rate_limiter_priority_used != Env::IO_TOTAL) { + while (data_size > 0) { + size_t size; + size = rate_limiter_->RequestToken(data_size, buf_.Alignment(), + rate_limiter_priority_used, stats_, + RateLimiter::OpType::kWrite); + data_size -= size; + } + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } + // direct writes must be positional + EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->PositionedAppend(Slice(src, left), write_offset, + io_options, v_info, nullptr); + + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileWriteFinish(write_offset, left, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kPositionedAppend, file_name(), + left, write_offset); + } + } + if (!s.ok()) { + // In this case, we do not change buffered_data_crc32c_checksum_ because + // it still aligns with the data in the buffer. + buf_.Size(file_advance + leftover_tail); + buffered_data_crc32c_checksum_ = + crc32c::Value(buf_.BufferStart(), buf_.CurrentSize()); + set_seen_error(); + return s; + } + } + + IOSTATS_ADD(bytes_written, left); + assert((next_write_offset_ % alignment) == 0); + uint64_t cur_size = flushed_size_.load(std::memory_order_acquire); + flushed_size_.store(cur_size + left, std::memory_order_release); + + if (s.ok()) { + // Move the tail to the beginning of the buffer + // This never happens during normal Append but rather during + // explicit call to Flush()/Sync() or Close(). Also the buffer checksum will + // recalculated accordingly. + buf_.RefitTail(file_advance, leftover_tail); + // Adjust the checksum value to align with the data in the buffer + buffered_data_crc32c_checksum_ = + crc32c::Value(buf_.BufferStart(), buf_.CurrentSize()); + // This is where we start writing next time which may or not be + // the actual file size on disk. 
They match if the buffer size
+    // is a multiple of whole pages, otherwise filesize_ is leftover_tail
+    // behind
+    next_write_offset_ += file_advance;
+  } else {
+    set_seen_error();
+  }
+  return s;
+}
+Env::IOPriority WritableFileWriter::DecideRateLimiterPriority(
+    Env::IOPriority writable_file_io_priority,
+    Env::IOPriority op_rate_limiter_priority) {
+  if (writable_file_io_priority == Env::IO_TOTAL &&
+      op_rate_limiter_priority == Env::IO_TOTAL) {
+    return Env::IO_TOTAL;
+  } else if (writable_file_io_priority == Env::IO_TOTAL) {
+    return op_rate_limiter_priority;
+  } else if (op_rate_limiter_priority == Env::IO_TOTAL) {
+    return writable_file_io_priority;
+  } else {
+    return op_rate_limiter_priority;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/writable_file_writer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/writable_file_writer.h
new file mode 100644
index 0000000000..aac0f59491
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/file/writable_file_writer.h
@@ -0,0 +1,320 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <string>
+
+#include "db/version_edit.h"
+#include "env/file_system_tracer.h"
+#include "port/port.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/rate_limiter.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Statistics;
+class SystemClock;
+
+// WritableFileWriter is a wrapper on top of Env::WritableFile. It provides
+// facilities to:
+// - Handle Buffered and Direct writes.
+// - Rate limit writes.
+// - Flush and Sync the data to the underlying filesystem.
+// - Notify any interested listeners on the completion of a write.
+// - Update IO stats.
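// A minimal usage sketch (illustrative only, not part of the patch; assumes
// a std::shared_ptr<FileSystem> `fs` and FileOptions `opts` are in scope):
//
//   std::unique_ptr<WritableFileWriter> writer;
//   IOStatus s = WritableFileWriter::Create(fs, "example.log", opts, &writer,
//                                           /*dbg=*/nullptr);
//   if (s.ok()) s = writer->Append(Slice("hello"));      // buffered in buf_
//   if (s.ok()) s = writer->Flush();                     // push buffer to FS
//   if (s.ok()) s = writer->Sync(/*use_fsync=*/false);   // make it durable
//   if (s.ok()) s = writer->Close();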
+class WritableFileWriter { + private: + void NotifyOnFileWriteFinish( + uint64_t offset, size_t length, + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kWrite, file_name_, start_ts, + finish_ts, io_status, temperature_); + info.offset = offset; + info.length = length; + + for (auto& listener : listeners_) { + listener->OnFileWriteFinish(info); + } + info.status.PermitUncheckedError(); + } + void NotifyOnFileFlushFinish( + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kFlush, file_name_, start_ts, + finish_ts, io_status, temperature_); + + for (auto& listener : listeners_) { + listener->OnFileFlushFinish(info); + } + info.status.PermitUncheckedError(); + } + void NotifyOnFileSyncFinish( + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status, + FileOperationType type = FileOperationType::kSync) { + FileOperationInfo info(type, file_name_, start_ts, finish_ts, io_status, + temperature_); + + for (auto& listener : listeners_) { + listener->OnFileSyncFinish(info); + } + info.status.PermitUncheckedError(); + } + void NotifyOnFileRangeSyncFinish( + uint64_t offset, size_t length, + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kRangeSync, file_name_, start_ts, + finish_ts, io_status, temperature_); + info.offset = offset; + info.length = length; + + for (auto& listener : listeners_) { + listener->OnFileRangeSyncFinish(info); + } + info.status.PermitUncheckedError(); + } + void NotifyOnFileTruncateFinish( + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kTruncate, file_name_, start_ts, + finish_ts, io_status, temperature_); + + for (auto& listener : listeners_) { + listener->OnFileTruncateFinish(info); + } + info.status.PermitUncheckedError(); + } + void NotifyOnFileCloseFinish( + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kClose, file_name_, start_ts, + finish_ts, io_status, temperature_); + + for (auto& listener : listeners_) { + listener->OnFileCloseFinish(info); + } + info.status.PermitUncheckedError(); + } + + void NotifyOnIOError(const IOStatus& io_status, FileOperationType operation, + const std::string& file_path, size_t length = 0, + uint64_t offset = 0) { + if (listeners_.empty()) { + return; + } + IOErrorInfo io_error_info(io_status, operation, file_path, length, offset); + for (auto& listener : listeners_) { + listener->OnIOError(io_error_info); + } + io_error_info.io_status.PermitUncheckedError(); + } + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + void UpdateFileChecksum(const Slice& data); + void Crc32cHandoffChecksumCalculation(const char* data, size_t size, + char* buf); + + std::string file_name_; + FSWritableFilePtr writable_file_; + SystemClock* clock_; + AlignedBuffer buf_; + size_t max_buffer_size_; + // Actually written data size can be used for truncate + // not counting padding data + std::atomic 
<uint64_t> filesize_;
+  std::atomic<uint64_t> flushed_size_;
+  // This is necessary when we use unbuffered access
+  // and writes must happen on aligned offsets
+  // so we need to go back and write that page again
+  uint64_t next_write_offset_;
+  bool pending_sync_;
+  std::atomic<bool> seen_error_;
+#ifndef NDEBUG
+  // SyncWithoutFlush() is the function that is allowed to be called
+  // concurrently with other functions. One of the concurrent calls
+  // could set seen_error_, and the other one would hit an assertion
+  // in debug mode.
+  std::atomic<bool> sync_without_flush_called_ = false;
+#endif  // NDEBUG
+  uint64_t last_sync_size_;
+  uint64_t bytes_per_sync_;
+  RateLimiter* rate_limiter_;
+  Statistics* stats_;
+  std::vector<std::shared_ptr<EventListener>> listeners_;
+  std::unique_ptr<FileChecksumGenerator> checksum_generator_;
+  bool checksum_finalized_;
+  bool perform_data_verification_;
+  uint32_t buffered_data_crc32c_checksum_;
+  bool buffered_data_with_checksum_;
+  Temperature temperature_;
+
+ public:
+  WritableFileWriter(
+      std::unique_ptr<FSWritableFile>&& file, const std::string& _file_name,
+      const FileOptions& options, SystemClock* clock = nullptr,
+      const std::shared_ptr<IOTracer>& io_tracer = nullptr,
+      Statistics* stats = nullptr,
+      const std::vector<std::shared_ptr<EventListener>>& listeners = {},
+      FileChecksumGenFactory* file_checksum_gen_factory = nullptr,
+      bool perform_data_verification = false,
+      bool buffered_data_with_checksum = false)
+      : file_name_(_file_name),
+        writable_file_(std::move(file), io_tracer, _file_name),
+        clock_(clock),
+        buf_(),
+        max_buffer_size_(options.writable_file_max_buffer_size),
+        filesize_(0),
+        flushed_size_(0),
+        next_write_offset_(0),
+        pending_sync_(false),
+        seen_error_(false),
+        last_sync_size_(0),
+        bytes_per_sync_(options.bytes_per_sync),
+        rate_limiter_(options.rate_limiter),
+        stats_(stats),
+        listeners_(),
+        checksum_generator_(nullptr),
+        checksum_finalized_(false),
+        perform_data_verification_(perform_data_verification),
+        buffered_data_crc32c_checksum_(0),
+        buffered_data_with_checksum_(buffered_data_with_checksum) {
+    temperature_ = options.temperature;
+    assert(!use_direct_io() || max_buffer_size_ > 0);
+    TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0",
+                             reinterpret_cast<void*>(max_buffer_size_));
+    buf_.Alignment(writable_file_->GetRequiredBufferAlignment());
+    buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_));
+    std::for_each(listeners.begin(), listeners.end(),
+                  [this](const std::shared_ptr<EventListener>& e) {
+                    if (e->ShouldBeNotifiedOnFileIO()) {
+                      listeners_.emplace_back(e);
+                    }
+                  });
+    if (file_checksum_gen_factory != nullptr) {
+      FileChecksumGenContext checksum_gen_context;
+      checksum_gen_context.file_name = _file_name;
+      checksum_generator_ =
+          file_checksum_gen_factory->CreateFileChecksumGenerator(
+              checksum_gen_context);
+    }
+  }
+
+  static IOStatus Create(const std::shared_ptr<FileSystem>& fs,
+                         const std::string& fname, const FileOptions& file_opts,
+                         std::unique_ptr<WritableFileWriter>* writer,
+                         IODebugContext* dbg);
+  WritableFileWriter(const WritableFileWriter&) = delete;
+
+  WritableFileWriter& operator=(const WritableFileWriter&) = delete;
+
+  ~WritableFileWriter() {
+    auto s = Close();
+    s.PermitUncheckedError();
+  }
+
+  std::string file_name() const { return file_name_; }
+
+  // When this Append API is called, if the crc32c_checksum is not provided, we
+  // will calculate the checksum internally.
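  // For example (illustrative sketch, not part of the patch; `writer` and a
  // Slice `data` assumed in scope, and the precomputed-checksum form assumes
  // the writer was constructed with buffered_data_with_checksum enabled):
  //
  //   writer->Append(data);  // checksum computed internally
  //   writer->Append(data, crc32c::Value(data.data(), data.size()));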
+ IOStatus Append(const Slice& data, uint32_t crc32c_checksum = 0, + Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL); + + IOStatus Pad(const size_t pad_bytes, + Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL); + + IOStatus Flush(Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL); + + IOStatus Close(); + + IOStatus Sync(bool use_fsync); + + // Sync only the data that was already Flush()ed. Safe to call concurrently + // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(), + // returns NotSupported status. + IOStatus SyncWithoutFlush(bool use_fsync); + + uint64_t GetFileSize() const { + return filesize_.load(std::memory_order_acquire); + } + + // Returns the size of data flushed to the underlying `FSWritableFile`. + // Expected to match `writable_file()->GetFileSize()`. + // The return value can serve as a lower-bound for the amount of data synced + // by a future call to `SyncWithoutFlush()`. + uint64_t GetFlushedSize() const { + return flushed_size_.load(std::memory_order_acquire); + } + + IOStatus InvalidateCache(size_t offset, size_t length) { + return writable_file_->InvalidateCache(offset, length); + } + + FSWritableFile* writable_file() const { return writable_file_.get(); } + + bool use_direct_io() { return writable_file_->use_direct_io(); } + + bool BufferIsEmpty() { return buf_.CurrentSize() == 0; } + + void TEST_SetFileChecksumGenerator( + FileChecksumGenerator* checksum_generator) { + checksum_generator_.reset(checksum_generator); + } + + std::string GetFileChecksum(); + + const char* GetFileChecksumFuncName() const; + + bool seen_error() const { + return seen_error_.load(std::memory_order_relaxed); + } + // For options of relaxed consistency, users might hope to continue + // operating on the file after an error happens. + void reset_seen_error() { + seen_error_.store(false, std::memory_order_relaxed); + } + void set_seen_error() { seen_error_.store(true, std::memory_order_relaxed); } + + IOStatus AssertFalseAndGetStatusForPrevError() { + // This should only happen if SyncWithoutFlush() was called. + assert(sync_without_flush_called_); + return IOStatus::IOError("Writer has previous error."); + } + + private: + // Decide the Rate Limiter priority. + static Env::IOPriority DecideRateLimiterPriority( + Env::IOPriority writable_file_io_priority, + Env::IOPriority op_rate_limiter_priority); + + // Used when os buffering is OFF and we are writing + // DMA such as in Direct I/O mode + IOStatus WriteDirect(Env::IOPriority op_rate_limiter_priority); + IOStatus WriteDirectWithChecksum(Env::IOPriority op_rate_limiter_priority); + // Normal write. + IOStatus WriteBuffered(const char* data, size_t size, + Env::IOPriority op_rate_limiter_priority); + IOStatus WriteBufferedWithChecksum(const char* data, size_t size, + Env::IOPriority op_rate_limiter_priority); + IOStatus RangeSync(uint64_t offset, uint64_t nbytes); + IOStatus SyncInternal(bool use_fsync); +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/fuzz/util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/fuzz/util.h new file mode 100644 index 0000000000..97011823a3 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/fuzz/util.h @@ -0,0 +1,29 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+#pragma once
+
+#define CHECK_OK(expression)                       \
+  do {                                             \
+    auto status = (expression);                    \
+    if (!status.ok()) {                            \
+      std::cerr << status.ToString() << std::endl; \
+      abort();                                     \
+    }                                              \
+  } while (0)
+
+#define CHECK_EQ(a, b)                                                      \
+  if (a != b) {                                                             \
+    std::cerr << "(" << #a << "=" << a << ") != (" << #b << "=" << b << ")" \
+              << std::endl;                                                 \
+    abort();                                                                \
+  }
+
+#define CHECK_TRUE(cond)                                      \
+  if (!(cond)) {                                              \
+    std::cerr << "\"" << #cond << "\" is false" << std::endl; \
+    abort();                                                  \
+  }
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/advanced_cache.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/advanced_cache.h
new file mode 100644
index 0000000000..997a41499d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/advanced_cache.h
@@ -0,0 +1,623 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// APIs for customizing read caches in RocksDB.
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/memory_allocator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Logger;
+class SecondaryCacheResultHandle;
+class Statistics;
+
+// A Cache maps keys to objects resident in memory, tracks reference counts
+// on those key-object entries, and is able to remove unreferenced entries
+// whenever it wants. All operations are fully thread safe except as noted.
+// Inserted entries have a specified "charge" which is some quantity in
+// unspecified units, typically bytes of memory used. A Cache will typically
+// have a finite capacity in units of charge, and evict entries as needed
+// to stay at or below that capacity.
+//
+// NOTE: This API is for expert use only and is intended more for customizing
+// cache behavior than for calling into outside of RocksDB. It is subject to
+// change as RocksDB evolves, especially the RocksDB block cache. Overriding
+// CacheWrapper is the preferred way of customizing some operations on an
+// existing implementation.
+//
+// INTERNAL: See typed_cache.h for convenient wrappers on top of this API.
+// New virtual functions must also be added to CacheWrapper below.
+class Cache {
+ public:  // types hidden from API client
+  // Opaque handle to an entry stored in the cache.
+  struct Handle {};
+
+ public:  // types hidden from Cache implementation
+  // Pointer to cached object of unspecified type. (This type alias is
+  // provided for clarity, not really for type checking.)
+  using ObjectPtr = void*;
+
+  // Opaque object providing context (settings, etc.) to create objects
+  // for primary cache from saved (serialized) secondary cache entries.
+  struct CreateContext {};
+
+ public:  // type defs
+  // Depending on implementation, cache entries with higher priority levels
+  // could be less likely to get evicted than entries with lower priority
+  // levels. The "high" priority level applies to certain SST metablocks
+  // (e.g. index and filter blocks) if the option
+  // cache_index_and_filter_blocks_with_high_priority is set. The "low"
+  // priority level is used for other kinds of SST blocks (most importantly,
+  // data blocks), as well as the above metablocks in case
+  // cache_index_and_filter_blocks_with_high_priority is not set. The
+  // "bottom" priority level is for BlobDB's blob values.
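  // For example (illustrative, not part of the patch): with
  // cache_index_and_filter_blocks_with_high_priority set, index and filter
  // blocks are inserted with Priority::HIGH, data blocks with Priority::LOW,
  // and BlobDB blob values with Priority::BOTTOM.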
+ enum class Priority { HIGH, LOW, BOTTOM }; + + // A set of callbacks to allow objects in the primary block cache to be + // be persisted in a secondary cache. The purpose of the secondary cache + // is to support other ways of caching the object, such as persistent or + // compressed data, that may require the object to be parsed and transformed + // in some way. Since the primary cache holds C++ objects and the secondary + // cache may only hold flat data that doesn't need relocation, these + // callbacks need to be provided by the user of the block + // cache to do the conversion. + // The CacheItemHelper is passed to Insert() and Lookup(). It has pointers + // to callback functions for size, saving and deletion of the + // object. The callbacks are defined in C-style in order to make them + // stateless and not add to the cache metadata size. + // Saving multiple std::function objects will take up 32 bytes per + // function, even if its not bound to an object and does no capture. + // + // All the callbacks are C-style function pointers in order to simplify + // lifecycle management. Objects in the cache can outlive the parent DB, + // so anything required for these operations should be contained in the + // object itself. + // + // The SizeCallback takes a pointer to the object and returns the size + // of the persistable data. It can be used by the secondary cache to allocate + // memory if needed. + // + // RocksDB callbacks are NOT exception-safe. A callback completing with an + // exception can lead to undefined behavior in RocksDB, including data loss, + // unreported corruption, deadlocks, and more. + using SizeCallback = size_t (*)(ObjectPtr obj); + + // The SaveToCallback takes an object pointer and saves the persistable + // data into a buffer. The secondary cache may decide to not store it in a + // contiguous buffer, in which case this callback will be called multiple + // times with increasing offset + using SaveToCallback = Status (*)(ObjectPtr from_obj, size_t from_offset, + size_t length, char* out_buf); + + // A function pointer type for destruction of a cache object. This will + // typically call the destructor for the appropriate type of the object. + // The Cache is responsible for copying and reclaiming space for the key, + // but objects are managed in part using this callback. Generally a DeleterFn + // can be nullptr if the ObjectPtr does not need destruction (e.g. nullptr or + // pointer into static data). + using DeleterFn = void (*)(ObjectPtr obj, MemoryAllocator* allocator); + + // The CreateCallback is takes in a buffer from the NVM cache and constructs + // an object using it. The callback doesn't have ownership of the buffer and + // should copy the contents into its own buffer. The CreateContext* is + // provided by Lookup and may be used to follow DB- or CF-specific settings. + // In case of some error, non-OK is returned and the caller should ignore + // any result in out_obj. (The implementation must clean up after itself.) + using CreateCallback = Status (*)(const Slice& data, CreateContext* context, + MemoryAllocator* allocator, + ObjectPtr* out_obj, size_t* out_charge); + + // A struct with pointers to helper functions for spilling items from the + // cache into the secondary cache. May be extended in the future. An + // instance of this struct is expected to outlive the cache. + struct CacheItemHelper { + // Function for deleting an object on its removal from the Cache. 
+    // nullptr is only for entries that require no destruction, such as
+    // "placeholder" cache entries with nullptr object.
+    DeleterFn del_cb;  // (<- Most performance critical)
+    // Next three are used for persisting values as described above.
+    // If any is nullptr, then all three should be nullptr and persisting the
+    // entry to/from secondary cache is not supported.
+    SizeCallback size_cb;
+    SaveToCallback saveto_cb;
+    CreateCallback create_cb;
+    // Classification of the entry for monitoring purposes in block cache.
+    CacheEntryRole role;
+    // Another CacheItemHelper (or this one) without secondary cache support.
+    // This is provided so that items promoted from secondary cache into
+    // primary cache without removal from the secondary cache can be prevented
+    // from attempting re-insertion into secondary cache (for efficiency).
+    const CacheItemHelper* without_secondary_compat;
+
+    CacheItemHelper() : CacheItemHelper(CacheEntryRole::kMisc) {}
+
+    // For helpers without SecondaryCache support
+    explicit CacheItemHelper(CacheEntryRole _role, DeleterFn _del_cb = nullptr)
+        : CacheItemHelper(_role, _del_cb, nullptr, nullptr, nullptr, this) {}
+
+    // For helpers with SecondaryCache support
+    explicit CacheItemHelper(CacheEntryRole _role, DeleterFn _del_cb,
+                             SizeCallback _size_cb, SaveToCallback _saveto_cb,
+                             CreateCallback _create_cb,
+                             const CacheItemHelper* _without_secondary_compat)
+        : del_cb(_del_cb),
+          size_cb(_size_cb),
+          saveto_cb(_saveto_cb),
+          create_cb(_create_cb),
+          role(_role),
+          without_secondary_compat(_without_secondary_compat) {
+      // Either all three secondary cache callbacks are non-nullptr or
+      // all three are nullptr
+      assert((size_cb != nullptr) == (saveto_cb != nullptr));
+      assert((size_cb != nullptr) == (create_cb != nullptr));
+      // without_secondary_compat points to equivalent but without
+      // secondary support
+      assert(role == without_secondary_compat->role);
+      assert(del_cb == without_secondary_compat->del_cb);
+      assert(!without_secondary_compat->IsSecondaryCacheCompatible());
+    }
+    inline bool IsSecondaryCacheCompatible() const {
+      return size_cb != nullptr;
+    }
+  };
+
+ public:  // ctor/dtor/create
+  Cache(std::shared_ptr<MemoryAllocator> allocator = nullptr)
+      : memory_allocator_(std::move(allocator)) {}
+  // No copying allowed
+  Cache(const Cache&) = delete;
+  Cache& operator=(const Cache&) = delete;
+
+  // Destroys all remaining entries by calling the associated "deleter"
+  virtual ~Cache() {}
+
+  // Creates a new Cache based on the input value string and returns the
+  // result. Currently, this method can be used to create LRUCaches only.
+  // @param config_options
+  // @param value The value might be:
+  //   - an old-style cache ("1M") -- equivalent to NewLRUCache(1024*1024)
+  //   - Name-value option pairs -- "capacity=1M; num_shard_bits=4".
+  //     For the LRUCache, the values are defined in LRUCacheOptions.
+  // @param result The new Cache object
+  // @return OK if the cache was successfully created
+  // @return NotFound if an invalid name was specified in the value
+  // @return InvalidArgument if the options were not valid
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value,
+                                 std::shared_ptr<Cache>* result);
+
+ public:  // functions
+  // The type of the Cache
+  virtual const char* Name() const = 0;
+
+  // The Insert and Lookup APIs below are intended to allow cached objects
+  // to be demoted/promoted between the primary block cache and a secondary
+  // cache.
The secondary cache could be a non-volatile cache, and will + // likely store the object in a different representation. They rely on a + // per object CacheItemHelper to do the conversions. + // The secondary cache may persist across process and system restarts, + // and may even be moved between hosts. Therefore, the cache key must + // be repeatable across restarts/reboots, and globally unique if + // multiple DBs share the same cache and the set of DBs can change + // over time. + + // Insert a mapping from key->object into the cache and assign it + // the specified charge against the total cache capacity. If + // strict_capacity_limit is true and cache reaches its full capacity, + // return Status::MemoryLimit. `obj` must be non-nullptr if compatible + // with secondary cache (helper->size_cb != nullptr), because Value() == + // nullptr is reserved for indicating some secondary cache failure cases. + // On success, returns OK and takes ownership of `obj`, eventually deleting + // it with helper->del_cb. On non-OK return, the caller maintains ownership + // of `obj` so will often need to delete it in such cases. + // + // The helper argument is saved by the cache and will be used when the + // inserted object is evicted or considered for promotion to the secondary + // cache. Promotion to secondary cache is only enabled if helper->size_cb + // != nullptr. The helper must outlive the cache. Callers may use + // &kNoopCacheItemHelper as a trivial helper (no deleter for the object, + // no secondary cache). `helper` must not be nullptr (efficiency). + // + // If `handle` is not nullptr and return status is OK, `handle` is set + // to a Handle* for the entry. The caller must call this->Release(handle) + // when the returned entry is no longer needed. If `handle` is nullptr, it is + // as if Release is called immediately after Insert. + // + // Regardless of whether the item was inserted into the cache, + // it will attempt to insert it into the secondary cache if one is + // configured, and the helper supports it. + // The cache implementation must support a secondary cache, otherwise + // the item is only inserted into the primary cache. It may + // defer the insertion to the secondary cache as it sees fit. + // + // When the inserted entry is no longer needed, it will be destroyed using + // helper->del_cb (if non-nullptr). + virtual Status Insert(const Slice& key, ObjectPtr obj, + const CacheItemHelper* helper, size_t charge, + Handle** handle = nullptr, + Priority priority = Priority::LOW) = 0; + + // Similar to Insert, but used for creating cache entries that cannot + // be found with Lookup, such as for memory charging purposes. The + // key is needed for cache sharding purposes. + // * If allow_uncharged==true or strict_capacity_limit=false, the operation + // always succeeds and returns a valid Handle. + // * If strict_capacity_limit=true and the requested charge cannot be freed + // up in the cache, then + // * If allow_uncharged==true, it's created anyway (GetCharge() == 0). + // * If allow_uncharged==false, returns nullptr to indicate failure. + virtual Handle* CreateStandalone(const Slice& key, ObjectPtr obj, + const CacheItemHelper* helper, size_t charge, + bool allow_uncharged) = 0; + + // Lookup the key, returning nullptr if not found. If found, returns + // a handle to the mapping that must eventually be passed to Release(). 
+  //
+  // If a non-nullptr helper argument is provided with a non-nullptr
+  // create_cb, and a secondary cache is configured, then the secondary
+  // cache is also queried if lookup in the primary cache fails. If found
+  // in the secondary cache, the provided create_cb and create_context are
+  // used to promote the entry to an object in the primary cache.
+  // In that case, the helper may be saved and used later when the object
+  // is evicted, so as usual, the pointed-to helper must outlive the cache.
+  virtual Handle* Lookup(const Slice& key,
+                         const CacheItemHelper* helper = nullptr,
+                         CreateContext* create_context = nullptr,
+                         Priority priority = Priority::LOW,
+                         Statistics* stats = nullptr) = 0;
+
+  // Convenience wrapper when secondary cache not supported
+  inline Handle* BasicLookup(const Slice& key, Statistics* stats) {
+    return Lookup(key, nullptr, nullptr, Priority::LOW, stats);
+  }
+
+  // Increments the reference count for the handle if it refers to an entry in
+  // the cache. Returns true if refcount was incremented; otherwise, returns
+  // false.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual bool Ref(Handle* handle) = 0;
+
+  /**
+   * Release a mapping returned by a previous Lookup(). A released entry might
+   * still remain in cache in case it is later looked up by others. If
+   * erase_if_last_ref is set then it also erases it from the cache if there
+   * is no other reference to it. Erasing it should call the deleter function
+   * that was provided when the entry was inserted.
+   *
+   * Returns true if the entry was also erased.
+   */
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual bool Release(Handle* handle, bool erase_if_last_ref = false) = 0;
+
+  // Return the object associated with a handle returned by a successful
+  // Lookup(). For historical reasons, this is also known as the "value"
+  // associated with the key.
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual ObjectPtr Value(Handle* handle) = 0;
+
+  // If the cache contains the entry for the key, erase it. Note that the
+  // underlying entry will be kept around until all existing handles
+  // to it have been released.
+  virtual void Erase(const Slice& key) = 0;
+  // Return a new numeric id. May be used by multiple clients who are
+  // sharding the same cache to partition the key space. Typically the
+  // client will allocate a new id at startup and prepend the id to
+  // its cache keys.
+  virtual uint64_t NewId() = 0;
+
+  // Sets the maximum configured capacity of the cache. When the new
+  // capacity is less than the old capacity and the existing usage is
+  // greater than the new capacity, the implementation will do its best job to
+  // purge the released entries from the cache in order to lower the usage.
+  virtual void SetCapacity(size_t capacity) = 0;
+
+  // Set whether to return error on insertion when cache reaches its full
+  // capacity.
+  virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0;
+
+  // Get the flag whether to return error on insertion when cache reaches its
+  // full capacity.
+  virtual bool HasStrictCapacityLimit() const = 0;
+
+  // Returns the maximum configured capacity of the cache
+  virtual size_t GetCapacity() const = 0;
+
+  // Returns the memory size for the entries residing in the cache.
+ virtual size_t GetUsage() const = 0; + + // Returns the number of entries currently tracked in the table. SIZE_MAX + // means "not supported." This is used for inspecting the load factor, along + // with GetTableAddressCount(). + virtual size_t GetOccupancyCount() const { return SIZE_MAX; } + + // Returns the number of ways the hash function is divided for addressing + // entries. Zero means "not supported." This is used for inspecting the load + // factor, along with GetOccupancyCount(). + virtual size_t GetTableAddressCount() const { return 0; } + + // Returns the memory size for a specific entry in the cache. + virtual size_t GetUsage(Handle* handle) const = 0; + + // Returns the memory size for the entries in use by the system + virtual size_t GetPinnedUsage() const = 0; + + // Returns the charge for the specific entry in the cache. + virtual size_t GetCharge(Handle* handle) const = 0; + + // Returns the helper for the specified entry. + virtual const CacheItemHelper* GetCacheItemHelper(Handle* handle) const = 0; + + // Call this on shutdown if you want to speed it up. Cache will disown + // any underlying data and will not free it on delete. This call will leak + // memory - call this only if you're shutting down the process. + // Any attempts of using cache after this call will fail terribly. + // Always delete the DB object before calling this method! + virtual void DisownData() { + // default implementation is noop + } + + struct ApplyToAllEntriesOptions { + // If the Cache uses locks, setting `average_entries_per_lock` to + // a higher value suggests iterating over more entries each time a lock + // is acquired, likely reducing the time for ApplyToAllEntries but + // increasing latency for concurrent users of the Cache. Setting + // `average_entries_per_lock` to a smaller value could be helpful if + // callback is relatively expensive, such as using large data structures. + size_t average_entries_per_lock = 256; + }; + + // Apply a callback to all entries in the cache. The Cache must ensure + // thread safety but does not guarantee that a consistent snapshot of all + // entries is iterated over if other threads are operating on the Cache + // also. + virtual void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) = 0; + + // Remove all entries. + // Prerequisite: no entry is referenced. + virtual void EraseUnRefEntries() = 0; + + virtual std::string GetPrintableOptions() const { return ""; } + + // Check for any warnings or errors in the operation of the cache and + // report them to the logger. This is intended only to be called + // periodically so does not need to be very efficient. (Obscure calling + // conventions for Logger inherited from env.h) + virtual void ReportProblems( + const std::shared_ptr& /*info_log*/) const {} + + MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } + + // See ShardedCacheOptions::hash_seed + virtual uint32_t GetHashSeed() const { return 0; } + + // EXPERIMENTAL + // The following APIs are experimental and might change in the future. + + // Release a mapping returned by a previous Lookup(). The "useful" + // parameter specifies whether the data was actually used or not, + // which may be used by the cache implementation to decide whether + // to consider it as a hit for retention purposes. As noted elsewhere, + // "pending" handles require Wait()/WaitAll() before Release(). 
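  // For example (illustrative, not part of the patch): a looked-up block that
  // turned out not to be needed can be released as not useful, so the cache
  // need not count the lookup as a hit for retention purposes:
  //
  //   cache->Release(handle, /*useful=*/false, /*erase_if_last_ref=*/false);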
+ virtual bool Release(Handle* handle, bool /*useful*/, + bool erase_if_last_ref) { + return Release(handle, erase_if_last_ref); + } + + // A temporary handle structure for managing async lookups, which callers + // of AsyncLookup() can allocate on the call stack for efficiency. + // An AsyncLookupHandle should not be used concurrently across threads. + struct AsyncLookupHandle { + // Inputs, populated by caller: + // NOTE: at least in case of stacked secondary caches, the underlying + // key buffer must last until handle is completely waited on. + Slice key; + const CacheItemHelper* helper = nullptr; + CreateContext* create_context = nullptr; + Priority priority = Priority::LOW; + Statistics* stats = nullptr; + + AsyncLookupHandle() {} + AsyncLookupHandle(const Slice& _key, const CacheItemHelper* _helper, + CreateContext* _create_context, + Priority _priority = Priority::LOW, + Statistics* _stats = nullptr) + : key(_key), + helper(_helper), + create_context(_create_context), + priority(_priority), + stats(_stats) {} + + // AsyncLookupHandle should only be destroyed when no longer pending + ~AsyncLookupHandle() { assert(!IsPending()); } + + // No copies or moves (StartAsyncLookup may save a pointer to this) + AsyncLookupHandle(const AsyncLookupHandle&) = delete; + AsyncLookupHandle operator=(const AsyncLookupHandle&) = delete; + AsyncLookupHandle(AsyncLookupHandle&&) = delete; + AsyncLookupHandle operator=(AsyncLookupHandle&&) = delete; + + // Determines if the handle returned by Lookup() can give a value without + // blocking, though Wait()/WaitAll() might be required to publish it to + // Value(). See secondary cache compatible Lookup() above for details. + // This call is not thread safe on "pending" handles. + // WART/TODO with stacked secondaries: might indicate ready when one + // result is ready (a miss) but the next lookup will block. + bool IsReady(); + + // Returns true if Wait/WaitAll is required before calling Result(). + bool IsPending(); + + // Returns a Lookup()-like result if this AsyncHandle is not pending. + // (Undefined behavior on a pending AsyncHandle.) Like Lookup(), the + // caller is responsible for eventually Release()ing a non-nullptr + // Handle* result. + Handle* Result(); + + // Implementation details, for RocksDB internal use only + Handle* result_handle = nullptr; + SecondaryCacheResultHandle* pending_handle = nullptr; + SecondaryCache* pending_cache = nullptr; + bool found_dummy_entry = false; + bool kept_in_sec_cache = false; + }; + + // Starts a potentially asynchronous Lookup(), based on the populated + // "input" fields of the async_handle. The caller is responsible for + // keeping the AsyncLookupHandle and the key it references alive through + // WaitAll(), and the AsyncLookupHandle alive through + // AsyncLookupHandle::Result(). WaitAll() can only be skipped if + // AsyncLookupHandle::IsPending() is already false after StartAsyncLookup. + // Calling AsyncLookupHandle::Result() is essentially required so that + // Release() can be called on non-nullptr Handle result. Wait() is a + // concise version of WaitAll()+Result() on a single handle. After an + // AsyncLookupHandle has completed this cycle, its input fields can be + // updated and re-used for another StartAsyncLookup. + // + // Handle is thread-safe while AsyncLookupHandle is not thread-safe. + // + // Default implementation is appropriate for Caches without + // true asynchronous support: defers to synchronous Lookup(). 
+ // (AsyncLookupHandles will only get into the "pending" state with + // SecondaryCache configured.) + virtual void StartAsyncLookup(AsyncLookupHandle& async_handle); + + // A convenient wrapper around WaitAll() and AsyncLookupHandle::Result() + // for a single async handle. See StartAsyncLookup(). + Handle* Wait(AsyncLookupHandle& async_handle); + + // Wait for an array of async handles to get results, so that none are left + // in the "pending" state. Not thread safe. See StartAsyncLookup(). + // Default implementation is appropriate for Caches without true + // asynchronous support: asserts that all handles are not pending (or not + // expected to be handled by this cache, in case of wrapped/stacked + // WaitAlls()). + virtual void WaitAll(AsyncLookupHandle* /*async_handles*/, size_t /*count*/); + + // For a function called on cache entries about to be evicted. The function + // returns `true` if it has taken ownership of the Value (object), or + // `false` if the cache should destroy it as usual. Regardless, Ref() and + // Release() cannot be called on this Handle that is poised for eviction. + using EvictionCallback = std::function; + // Sets an eviction callback for this Cache. Not thread safe and only + // supports being set once, so should only be used during initialization + // or destruction, guaranteed before or after any thread-shared operations. + void SetEvictionCallback(EvictionCallback&& fn); + + protected: + std::shared_ptr memory_allocator_; + EvictionCallback eviction_callback_; +}; + +// A wrapper around Cache that can easily be extended with instrumentation, +// etc. +class CacheWrapper : public Cache { + public: + explicit CacheWrapper(std::shared_ptr target) + : target_(std::move(target)) {} + + // Only function that derived class must provide + // const char* Name() const override { ... 
} + + Status Insert(const Slice& key, ObjectPtr value, + const CacheItemHelper* helper, size_t charge, + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + return target_->Insert(key, value, helper, charge, handle, priority); + } + + Handle* CreateStandalone(const Slice& key, ObjectPtr obj, + const CacheItemHelper* helper, size_t charge, + bool allow_uncharged) override { + return target_->CreateStandalone(key, obj, helper, charge, allow_uncharged); + } + + Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + CreateContext* create_context, + Priority priority = Priority::LOW, + Statistics* stats = nullptr) override { + return target_->Lookup(key, helper, create_context, priority, stats); + } + + bool Ref(Handle* handle) override { return target_->Ref(handle); } + + using Cache::Release; + bool Release(Handle* handle, bool erase_if_last_ref = false) override { + return target_->Release(handle, erase_if_last_ref); + } + + ObjectPtr Value(Handle* handle) override { return target_->Value(handle); } + + void Erase(const Slice& key) override { target_->Erase(key); } + uint64_t NewId() override { return target_->NewId(); } + + void SetCapacity(size_t capacity) override { target_->SetCapacity(capacity); } + + void SetStrictCapacityLimit(bool strict_capacity_limit) override { + target_->SetStrictCapacityLimit(strict_capacity_limit); + } + + bool HasStrictCapacityLimit() const override { + return target_->HasStrictCapacityLimit(); + } + + size_t GetCapacity() const override { return target_->GetCapacity(); } + + size_t GetUsage() const override { return target_->GetUsage(); } + + size_t GetUsage(Handle* handle) const override { + return target_->GetUsage(handle); + } + + size_t GetPinnedUsage() const override { return target_->GetPinnedUsage(); } + + size_t GetCharge(Handle* handle) const override { + return target_->GetCharge(handle); + } + + const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override { + return target_->GetCacheItemHelper(handle); + } + + void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) override { + target_->ApplyToAllEntries(callback, opts); + } + + void EraseUnRefEntries() override { target_->EraseUnRefEntries(); } + + void StartAsyncLookup(AsyncLookupHandle& async_handle) override { + target_->StartAsyncLookup(async_handle); + } + + void WaitAll(AsyncLookupHandle* async_handles, size_t count) override { + target_->WaitAll(async_handles, count); + } + + protected: + std::shared_ptr target_; +}; + +// Useful for cache entries requiring no clean-up, such as for cache +// reservations +extern const Cache::CacheItemHelper kNoopCacheItemHelper; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/advanced_options.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/advanced_options.h new file mode 100644 index 0000000000..817a720282 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/advanced_options.h @@ -0,0 +1,1177 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#pragma once + +#include + +#include "rocksdb/cache.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/universal_compaction.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class SliceTransform; +class TablePropertiesCollectorFactory; +class TableFactory; +struct Options; + +enum CompactionStyle : char { + // level based compaction style + kCompactionStyleLevel = 0x0, + // Universal compaction style + kCompactionStyleUniversal = 0x1, + // FIFO compaction style + kCompactionStyleFIFO = 0x2, + // Disable background compaction. Compaction jobs are submitted + // via CompactFiles(). + kCompactionStyleNone = 0x3, +}; + +// In Level-based compaction, it Determines which file from a level to be +// picked to merge to the next level. We suggest people try +// kMinOverlappingRatio first when you tune your database. +enum CompactionPri : char { + // Slightly prioritize larger files by size compensated by #deletes + kByCompensatedSize = 0x0, + // First compact files whose data's latest update time is oldest. + // Try this if you only update some hot keys in small ranges. + kOldestLargestSeqFirst = 0x1, + // First compact files whose range hasn't been compacted to the next level + // for the longest. If your updates are random across the key space, + // write amplification is slightly better with this option. + kOldestSmallestSeqFirst = 0x2, + // First compact files whose ratio between overlapping size in next level + // and its size is the smallest. It in many cases can optimize write + // amplification. + kMinOverlappingRatio = 0x3, + // Keeps a cursor(s) of the successor of the file (key range) was/were + // compacted before, and always picks the next files (key range) in that + // level. The file picking process will cycle through all the files in a + // round-robin manner. + kRoundRobin = 0x4, +}; + +// Compression options for different compression algorithms like Zlib +struct CompressionOptions { + // ==> BEGIN options that can be set by deprecated configuration syntax, <== + // ==> e.g. compression_opts=5:6:7:8:9:10:true:11:false <== + // ==> Please use compression_opts={level=6;strategy=7;} form instead. <== + + // RocksDB's generic default compression level. Internally it'll be translated + // to the default compression level specific to the library being used (see + // comment above `ColumnFamilyOptions::compression`). + // + // The default value is the max 16-bit int as it'll be written out in OPTIONS + // file, which should be portable. + static constexpr int kDefaultCompressionLevel = 32767; + + // zlib only: windowBits parameter. See https://www.zlib.net/manual.html + int window_bits = -14; + + // Compression "level" applicable to zstd, zlib, LZ4. Except for + // kDefaultCompressionLevel (see above), the meaning of each value depends + // on the compression algorithm. + int level = kDefaultCompressionLevel; + + // zlib only: strategy parameter. See https://www.zlib.net/manual.html + int strategy = 0; + + // Maximum size of dictionaries used to prime the compression library. + // Enabling dictionary can improve compression ratios when there are + // repetitions across data blocks. + // + // The dictionary is created by sampling the SST file data. If + // `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's + // dictionary generator (see comments for option `use_zstd_dict_trainer` for + // detail on dictionary generator). 
If `zstd_max_train_bytes` is zero, the + // random samples are used directly as the dictionary. + // + // When compression dictionary is disabled, we compress and write each block + // before buffering data for the next one. When compression dictionary is + // enabled, we buffer SST file data in-memory so we can sample it, as data + // can only be compressed and written after the dictionary has been finalized. + // + // The amount of data buffered can be limited by `max_dict_buffer_bytes`. This + // buffered memory is charged to the block cache when there is a block cache. + // If block cache insertion fails with `Status::MemoryLimit` (i.e., it is + // full), we finalize the dictionary with whatever data we have and then stop + // buffering. + uint32_t max_dict_bytes = 0; + + // Maximum size of training data passed to zstd's dictionary trainer. Using + // zstd's dictionary trainer can achieve even better compression ratio + // improvements than using `max_dict_bytes` alone. + // + // The training data will be used to generate a dictionary of max_dict_bytes. + uint32_t zstd_max_train_bytes = 0; + + // Number of threads for parallel compression. + // Parallel compression is enabled only if threads > 1. + // THE FEATURE IS STILL EXPERIMENTAL + // + // This option is valid only when BlockBasedTable is used. + // + // When parallel compression is enabled, SST size file sizes might be + // more inflated compared to the target size, because more data of unknown + // compressed size is in flight when compression is parallelized. To be + // reasonably accurate, this inflation is also estimated by using historical + // compression ratio and current bytes inflight. + uint32_t parallel_threads = 1; + + // When the compression options are set by the user, it will be set to "true". + // For bottommost_compression_opts, to enable it, user must set enabled=true. + // Otherwise, bottommost compression will use compression_opts as default + // compression options. + // + // For compression_opts, if compression_opts.enabled=false, it is still + // used as compression options for compression process. + bool enabled = false; + + // Limit on data buffering when gathering samples to build a dictionary. Zero + // means no limit. When dictionary is disabled (`max_dict_bytes == 0`), + // enabling this limit (`max_dict_buffer_bytes != 0`) has no effect. + // + // In compaction, the buffering is limited to the target file size (see + // `target_file_size_base` and `target_file_size_multiplier`) even if this + // setting permits more buffering. Since we cannot determine where the file + // should be cut until data blocks are compressed with dictionary, buffering + // more than the target file size could lead to selecting samples that belong + // to a later output SST. + // + // Limiting too strictly may harm dictionary effectiveness since it forces + // RocksDB to pick samples from the initial portion of the output SST, which + // may not be representative of the whole file. Configuring this limit below + // `zstd_max_train_bytes` (when enabled) can restrict how many samples we can + // pass to the dictionary trainer. Configuring it below `max_dict_bytes` can + // restrict the size of the final dictionary. + uint64_t max_dict_buffer_bytes = 0; + + // Use zstd trainer to generate dictionaries. When this option is set to true, + // zstd_max_train_bytes of training data sampled from max_dict_buffer_bytes + // buffered data will be passed to zstd dictionary trainer to generate a + // dictionary of size max_dict_bytes. 
+ // + // When this option is false, zstd's API ZDICT_finalizeDictionary() will be + // called to generate dictionaries. zstd_max_train_bytes of training sampled + // data will be passed to this API. Using this API should save CPU time on + // dictionary training, but the compression ratio may not be as good as using + // a dictionary trainer. + bool use_zstd_dict_trainer = true; + + // ===> END options that can be set by deprecated configuration syntax <=== + // ===> Use compression_opts={level=6;strategy=7;} form for below opts <=== + + // Essentially specifies a minimum acceptable compression ratio. A block is + // stored uncompressed if the compressed block does not achieve this ratio, + // because the downstream cost of decompression is not considered worth such + // a small savings (if any). + // However, the ratio is specified in a way that is efficient for checking. + // An integer from 1 to 1024 indicates the maximum allowable compressed bytes + // per 1KB of input, so the minimum acceptable ratio is 1024.0 / this value. + // For example, for a minimum ratio of 1.5:1, set to 683. See SetMinRatio(). + // Default: abandon use of compression for a specific block or entry if + // compressed by less than 12.5% (minimum ratio of 1.143:1). + int max_compressed_bytes_per_kb = 1024 * 7 / 8; + + // A convenience function for setting max_compressed_bytes_per_kb based on a + // minimum acceptable compression ratio (uncompressed size over compressed + // size). + void SetMinRatio(double min_ratio) { + max_compressed_bytes_per_kb = static_cast(1024.0 / min_ratio + 0.5); + } +}; + +// Temperature of a file. Used to pass to FileSystem for a different +// placement and/or coding. +// Reserve some numbers in the middle, in case we need to insert new tier +// there. +enum class Temperature : uint8_t { + kUnknown = 0, + kHot = 0x04, + kWarm = 0x08, + kCold = 0x0C, + kLastTemperature, +}; + +struct FileTemperatureAge { + Temperature temperature = Temperature::kUnknown; + uint64_t age = 0; +}; + +struct CompactionOptionsFIFO { + // once the total sum of table files reaches this, we will delete the oldest + // table file + // Default: 1GB + uint64_t max_table_files_size; + + // If true, try to do compaction to compact smaller files into larger ones. + // Minimum files to compact follows options.level0_file_num_compaction_trigger + // and compaction won't trigger if average compact bytes per del file is + // larger than options.write_buffer_size. This is to protect large files + // from being compacted again. + // Default: false; + bool allow_compaction = false; + + // DEPRECATED + // When not 0, if the data in the file is older than this threshold, RocksDB + // will soon move the file to warm temperature. + uint64_t age_for_warm = 0; + + // EXPERIMENTAL + // Age (in seconds) threshold for different file temperatures. + // When not empty, each element specifies an age threshold `age` and a + // temperature such that if all the data in a file is older than `age`, + // RocksDB will compact the file to the specified `temperature`. + // + // Note: + // - Flushed files will always have temperature kUnknown. + // - Compaction output files will have temperature kUnknown by default, so + // only temperatures other than kUnknown needs to be specified. + // - The elements should be in increasing order with respect to `age` field. 
+ // + // Dynamically changeable through SetOptions() API, e.g., + // SetOptions("compaction_options_fifo", + // "{file_temperature_age_thresholds={ + // {age=10;temperature=kWarm}:{age=20;temperature=kCold}}}") + // In this example, all files that are at least 20 seconds old will be + // compacted and output files will have temperature kCold. All files that are + // at least 10 seconds old but younger than 20 seconds will be compacted to + // files with temperature kWarm. + // + // Default: empty + std::vector file_temperature_age_thresholds{}; + + CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {} + CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction) + : max_table_files_size(_max_table_files_size), + allow_compaction(_allow_compaction) {} +}; + +// The control option of how the cache tiers will be used. Currently rocksdb +// support block cache (volatile tier), secondary cache (non-volatile tier). +// In the future, we may add more caching layers. +enum class CacheTier : uint8_t { + kVolatileTier = 0, + kNonVolatileBlockTier = 0x01, +}; + +enum UpdateStatus { // Return status For inplace update callback + UPDATE_FAILED = 0, // Nothing to update + UPDATED_INPLACE = 1, // Value updated inplace + UPDATED = 2, // No inplace update. Merged value set +}; + +enum class PrepopulateBlobCache : uint8_t { + kDisable = 0x0, // Disable prepopulate blob cache + kFlushOnly = 0x1, // Prepopulate blobs during flush only +}; + +struct AdvancedColumnFamilyOptions { + // The maximum number of write buffers that are built up in memory. + // The default and the minimum number is 2, so that when 1 write buffer + // is being flushed to storage, new writes can continue to the other + // write buffer. + // If max_write_buffer_number > 3, writing will be slowed down to + // options.delayed_write_rate if we are writing to the last write buffer + // allowed. + // + // Default: 2 + // + // Dynamically changeable through SetOptions() API + int max_write_buffer_number = 2; + + // The minimum number of write buffers that will be merged together + // before writing to storage. If set to 1, then + // all write buffers are flushed to L0 as individual files and this increases + // read amplification because a get request has to check in all of these + // files. Also, an in-memory merge may result in writing lesser + // data to storage if there are duplicate records in each of these + // individual write buffers. + // If atomic flush is enabled (options.atomic_flush == true), then this + // option will be sanitized to 1. + // Default: 1 + int min_write_buffer_number_to_merge = 1; + + // DEPRECATED + // The total maximum number of write buffers to maintain in memory including + // copies of buffers that have already been flushed. Unlike + // max_write_buffer_number, this parameter does not affect flushing. + // This parameter is being replaced by max_write_buffer_size_to_maintain. + // If both parameters are set to non-zero values, this parameter will be + // ignored. + int max_write_buffer_number_to_maintain = 0; + + // The target number of write history bytes to hold in memory. Write history + // comprises the latest write buffers (memtables). To reach the target, write + // buffers that were most recently flushed to SST files may be retained in + // memory. + // + // This controls the target amount of write history that will be available + // in memory for conflict checking when Transactions are used. 
+
+// The control option of how the cache tiers will be used. Currently rocksdb
+// supports the block cache (volatile tier) and the secondary cache
+// (non-volatile tier). In the future, we may add more caching layers.
+enum class CacheTier : uint8_t {
+ kVolatileTier = 0,
+ kNonVolatileBlockTier = 0x01,
+};
+
+enum UpdateStatus { // Return status for inplace update callback
+ UPDATE_FAILED = 0, // Nothing to update
+ UPDATED_INPLACE = 1, // Value updated inplace
+ UPDATED = 2, // No inplace update. Merged value set
+};
+
+enum class PrepopulateBlobCache : uint8_t {
+ kDisable = 0x0, // Disable prepopulate blob cache
+ kFlushOnly = 0x1, // Prepopulate blobs during flush only
+};
+
+struct AdvancedColumnFamilyOptions {
+ // The maximum number of write buffers that are built up in memory.
+ // The default and the minimum number is 2, so that when 1 write buffer
+ // is being flushed to storage, new writes can continue to the other
+ // write buffer.
+ // If max_write_buffer_number > 3, writing will be slowed down to
+ // options.delayed_write_rate if we are writing to the last write buffer
+ // allowed.
+ //
+ // Default: 2
+ //
+ // Dynamically changeable through SetOptions() API
+ int max_write_buffer_number = 2;
+
+ // The minimum number of write buffers that will be merged together
+ // before writing to storage. If set to 1, then
+ // all write buffers are flushed to L0 as individual files and this increases
+ // read amplification because a get request has to check in all of these
+ // files. Also, an in-memory merge may result in writing less
+ // data to storage if there are duplicate records in each of these
+ // individual write buffers.
+ // If atomic flush is enabled (options.atomic_flush == true), then this
+ // option will be sanitized to 1.
+ // Default: 1
+ int min_write_buffer_number_to_merge = 1;
+
+ // DEPRECATED
+ // The total maximum number of write buffers to maintain in memory including
+ // copies of buffers that have already been flushed. Unlike
+ // max_write_buffer_number, this parameter does not affect flushing.
+ // This parameter is being replaced by max_write_buffer_size_to_maintain.
+ // If both parameters are set to non-zero values, this parameter will be
+ // ignored.
+ int max_write_buffer_number_to_maintain = 0;
+
+ // The target number of write history bytes to hold in memory. Write history
+ // comprises the latest write buffers (memtables). To reach the target, write
+ // buffers that were most recently flushed to SST files may be retained in
+ // memory.
+ //
+ // This controls the target amount of write history that will be available
+ // in memory for conflict checking when Transactions are used.
+ //
+ // This target may be undershot when the CF first opens and has not recovered
+ // or received enough writes to reach the target. After reaching the target
+ // once, it is guaranteed to never undershoot again. That guarantee is
+ // implemented by retaining flushed write buffers in-memory until the oldest
+ // one can be trimmed without dropping below the target.
+ //
+ // Examples with `max_write_buffer_size_to_maintain` set to 32MB:
+ //
+ // - One mutable memtable of 64MB, one unflushed immutable memtable of 64MB,
+ // and zero flushed immutable memtables. Nothing trimmable exists.
+ // - One mutable memtable of 16MB, zero unflushed immutable memtables, and
+ // one flushed immutable memtable of 64MB. Trimming is disallowed because
+ // dropping the earliest (only) flushed immutable memtable would result in
+ // write history of 16MB < 32MB.
+ // - One mutable memtable of 24MB, one unflushed immutable memtable of 16MB,
+ // and one flushed immutable memtable of 16MB. The earliest (only) flushed
+ // immutable memtable is trimmed because without it we still have
+ // 16MB + 24MB = 40MB > 32MB of write history.
+ //
+ // When using an OptimisticTransactionDB:
+ // If this value is too low, some transactions may fail at commit time due
+ // to not being able to determine whether there were any write conflicts.
+ //
+ // When using a TransactionDB:
+ // If Transaction::SetSnapshot is used, TransactionDB will read either
+ // in-memory write buffers or SST files to do write-conflict checking.
+ // Increasing this value can reduce the number of reads to SST files
+ // done for conflict detection.
+ //
+ // Setting this value to 0 will cause write buffers to be freed immediately
+ // after they are flushed. If this value is set to -1,
+ // 'max_write_buffer_number * write_buffer_size' will be used.
+ //
+ // Default:
+ // If using a TransactionDB/OptimisticTransactionDB, the default value will
+ // be set to the value of 'max_write_buffer_number * write_buffer_size'
+ // if it is not explicitly set by the user. Otherwise, the default is 0.
+ int64_t max_write_buffer_size_to_maintain = 0;
+
+ // Allows thread-safe inplace updates. If this is true, there is no way to
+ // achieve point-in-time consistency using snapshot or iterator (assuming
+ // concurrent updates). Hence iterator and multi-get will return results
+ // which are not consistent as of any point-in-time.
+ // Backward iteration on memtables will not work either.
+ // If the inplace_callback function is not set,
+ // Put(key, new_value) will update inplace the existing_value iff
+ // * key exists in current memtable
+ // * new sizeof(new_value) <= sizeof(existing_value)
+ // * existing_value for that key is a put i.e. kTypeValue
+ // If the inplace_callback function is set, check the doc for
+ // inplace_callback.
+ // Default: false.
+ bool inplace_update_support = false;
+
+ // Number of locks used for inplace update.
+ // Default: 10000, if inplace_update_support = true, else 0.
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t inplace_update_num_locks = 10000;
+
+ // [experimental]
+ // Used to activate or deactivate the Mempurge feature (memtable garbage
+ // collection), which is deactivated by default. At every flush, the total
+ // useful payload (total entries minus garbage entries) is estimated as a
+ // ratio [useful payload bytes]/[size of a memtable (in bytes)]. This ratio
+ // is then compared to this `threshold` value:
+ // - if ratio < threshold: the flush is replaced by a mempurge operation
+ // - else: a regular flush operation takes place.
+ // Threshold values:
+ // 0.0: mempurge deactivated (default).
+ // 1.0: recommended threshold value.
+ // >1.0: aggressive mempurge.
+ // 0 < threshold < 1.0: mempurge triggered only for very low useful payload
+ // ratios.
+ // [experimental]
+ double experimental_mempurge_threshold = 0.0;
+
+ // existing_value - pointer to previous value (from both memtable and sst).
+ // nullptr if key doesn't exist
+ // existing_value_size - pointer to size of existing_value.
+ // nullptr if key doesn't exist
+ // delta_value - Delta value to be merged with the existing_value.
+ // Stored in transaction logs.
+ // merged_value - Set when delta is applied on the previous value.
+ //
+ // Applicable only when inplace_update_support is true,
+ // this callback function is called at the time of updating the memtable
+ // as part of a Put operation, let's say Put(key, delta_value). It allows the
+ // 'delta_value' specified as part of the Put operation to be merged with
+ // an 'existing_value' of the key in the database.
+ //
+ // If the merged value is smaller in size than the 'existing_value',
+ // then this function can update the 'existing_value' buffer inplace and
+ // the corresponding 'existing_value'_size pointer, if it wishes to.
+ // The callback should return UpdateStatus::UPDATED_INPLACE.
+ // (In this case, the snapshot-semantics of the rocksdb
+ // Iterator is not atomic anymore).
+ //
+ // If the merged value is larger in size than the 'existing_value' or the
+ // application does not wish to modify the 'existing_value' buffer inplace,
+ // then the merged value should be returned via *merged_value. It is set by
+ // merging the 'existing_value' and the Put 'delta_value'. The callback should
+ // return UpdateStatus::UPDATED in this case. This merged value will be added
+ // to the memtable.
+ //
+ // If merging fails or the application does not wish to take any action,
+ // then the callback should return UpdateStatus::UPDATE_FAILED.
+ //
+ // Please remember that the original call from the application is Put(key,
+ // delta_value). So the transaction log (if enabled) will still contain (key,
+ // delta_value). The 'merged_value' is not stored in the transaction log.
+ // Hence the inplace_callback function should be consistent across db reopens.
+ //
+ // RocksDB callbacks are NOT exception-safe. A callback completing with an
+ // exception can lead to undefined behavior in RocksDB, including data loss,
+ // unreported corruption, deadlocks, and more.
+ //
+ // Default: nullptr
+ UpdateStatus (*inplace_callback)(char* existing_value,
+ uint32_t* existing_value_size,
+ Slice delta_value,
+ std::string* merged_value) = nullptr;
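+
+ // Example (illustrative sketch of a conforming callback): treat values as
+ // fixed-width 8-byte counters and add the delta in place, per the size
+ // rules documented above.
+ //
+ //   UpdateStatus AddCounter(char* existing_value,
+ //                           uint32_t* existing_value_size,
+ //                           Slice delta_value, std::string* merged_value) {
+ //     if (existing_value != nullptr && *existing_value_size == 8 &&
+ //         delta_value.size() == 8) {
+ //       uint64_t current, delta;
+ //       memcpy(&current, existing_value, 8);
+ //       memcpy(&delta, delta_value.data(), 8);
+ //       current += delta;
+ //       memcpy(existing_value, &current, 8);  // same size: update in place
+ //       return UpdateStatus::UPDATED_INPLACE;
+ //     }
+ //     return UpdateStatus::UPDATE_FAILED;
+ //   }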
+
+ // Should really be called `memtable_bloom_size_ratio`. Enables a dynamic
+ // Bloom filter in memtable to optimize many queries that must go beyond
+ // the memtable. The size in bytes of the filter is
+ // write_buffer_size * memtable_prefix_bloom_size_ratio.
+ // * If prefix_extractor is set, the filter includes prefixes.
+ // * If memtable_whole_key_filtering, the filter includes whole keys.
+ // * If both, the filter includes both.
+ // * If neither, the feature is disabled.
+ //
+ // If this value is larger than 0.25, it is sanitized to 0.25.
+ //
+ // Default: 0 (disabled)
+ //
+ // Dynamically changeable through SetOptions() API
+ double memtable_prefix_bloom_size_ratio = 0.0;
+
+ // Enable whole key bloom filter in memtable. Note this will only take effect
+ // if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering
+ // can potentially reduce CPU usage for point lookups.
+ //
+ // Default: false (disabled)
+ //
+ // Dynamically changeable through SetOptions() API
+ bool memtable_whole_key_filtering = false;
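+
+ // Example (illustrative sketch): enabling the memtable Bloom filter
+ // together with a prefix extractor. With a 64MB write buffer and a 0.1
+ // ratio, the filter costs about 6.4MB. NewFixedPrefixTransform() is the
+ // stock extractor from rocksdb/slice_transform.h.
+ //
+ //   Options options;
+ //   options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ //   options.memtable_prefix_bloom_size_ratio = 0.1;
+ //   options.memtable_whole_key_filtering = true;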
+
+ // Page size for huge page for the arena used by the memtable. If <= 0, it
+ // won't allocate from huge page but from malloc.
+ // Users are responsible for reserving huge pages for it to be allocated.
+ // For example:
+ // sysctl -w vm.nr_hugepages=20
+ // See linux doc Documentation/vm/hugetlbpage.txt
+ // If there aren't enough free huge pages available, it will fall back to
+ // malloc.
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t memtable_huge_page_size = 0;
+
+ // If non-nullptr, memtable will use the specified function to extract
+ // prefixes for keys, and for each prefix maintain a hint of insert location
+ // to reduce CPU usage for inserting keys with the prefix. Keys out of
+ // domain of the prefix extractor will be inserted without using hints.
+ //
+ // Currently only the default skiplist based memtable implements the feature.
+ // All other memtable implementations will ignore the option. It incurs ~250
+ // additional bytes of memory overhead to store a hint for each prefix.
+ // Also concurrent writes (when allow_concurrent_memtable_write is true) will
+ // ignore the option.
+ //
+ // The option is best suited for workloads where keys are likely to be
+ // inserted at a location close to the last inserted key with the same
+ // prefix. One example could be inserting keys of the form
+ // (prefix + timestamp), where keys of the same prefix always come in with
+ // time order. Another example would be updating the same key over and over
+ // again, in which case the prefix can be the key itself.
+ //
+ // Default: nullptr (disabled)
+ std::shared_ptr<const SliceTransform>
+ memtable_insert_with_hint_prefix_extractor = nullptr;
+
+ // Control locality of bloom filter probes to improve CPU cache hit rate.
+ // This option now only applies to plaintable prefix bloom. This
+ // optimization is turned off when set to 0, and on for any positive number.
+ // Default: 0
+ uint32_t bloom_locality = 0;
+
+ // Size of one block in arena memory allocation.
+ // If <= 0, a proper value is automatically calculated (usually 1/8 of
+ // write_buffer_size, rounded up to a multiple of 4KB, or 1MB, whichever is
+ // smaller).
+ //
+ // There are two additional restrictions on the specified size:
+ // (1) size should be in the range of [4096, 2 << 30] and
+ // (2) it should be a multiple of the CPU word size (which helps with memory
+ // alignment).
+ //
+ // We'll automatically check and adjust the size number to make sure it
+ // conforms to the restrictions.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t arena_block_size = 0;
+
+ // Different levels can have different compression policies. There
+ // are cases where most lower levels would like to use quick compression
+ // algorithms while the higher levels (which have more data) use
+ // compression algorithms that have better compression but could
+ // be slower. This array, if non-empty, should have an entry for
+ // each level of the database; these override the value specified in
+ // the previous field 'compression'.
+ //
+ // NOTICE if level_compaction_dynamic_level_bytes=true,
+ // compression_per_level[0] still determines L0, but other elements
+ // of the array are based on the base level (the level L0 files are merged
+ // to), and may not match the level users see from the info log for metadata.
+ // If L0 files are merged to level-n, then, for i>0, compression_per_level[i]
+ // determines the compression type for level n+i-1.
+ // For example, if we have 5 levels, and we determine to merge L0
+ // data to L4 (which means L1..L3 will be empty), then the new files that go
+ // to L4 use compression type compression_per_level[1].
+ // If L0 is instead merged to L2, data that goes to L2 will be compressed
+ // according to compression_per_level[1], L3 using compression_per_level[2]
+ // and L4 using compression_per_level[3]. The compression for each level can
+ // change as data grows.
+ //
+ // NOTE: if the vector size is smaller than the level number, the undefined
+ // lower levels use the last option in the vector. For example, for a 3 level
+ // LSM tree the following settings are the same:
+ // {kNoCompression, kSnappyCompression}
+ // {kNoCompression, kSnappyCompression, kSnappyCompression}
+ //
+ // Dynamically changeable through SetOptions() API
+ std::vector<CompressionType> compression_per_level;
+
+ // Number of levels for this database
+ int num_levels = 7;
+
+ // Soft limit on number of level-0 files. We start slowing down writes at this
+ // point. A value < 0 means that no writing slow down will be triggered by
+ // the number of files in level-0.
+ //
+ // Default: 20
+ //
+ // Dynamically changeable through SetOptions() API
+ int level0_slowdown_writes_trigger = 20;
+
+ // Maximum number of level-0 files. We stop writes at this point.
+ //
+ // Default: 36
+ //
+ // Dynamically changeable through SetOptions() API
+ int level0_stop_writes_trigger = 36;
+
+ // Target file size for compaction.
+ // target_file_size_base is per-file size for level-1.
+ // Target file size for level L can be calculated by
+ // target_file_size_base * (target_file_size_multiplier ^ (L-1))
+ // For example, if target_file_size_base is 2MB and
+ // target_file_size_multiplier is 10, then each file on level-1 will
+ // be 2MB, each file on level-2 will be 20MB,
+ // and each file on level-3 will be 200MB.
+ //
+ // Default: 64MB.
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t target_file_size_base = 64 * 1048576;
+
+ // By default target_file_size_multiplier is 1, which means
+ // by default files in different levels will have similar size.
+ //
+ // Dynamically changeable through SetOptions() API
+ int target_file_size_multiplier = 1;
+
+ // If true, RocksDB will pick the target size of each level dynamically.
+ // We will pick a base level b >= 1. L0 will be directly merged into level b,
+ // instead of always into level 1. Levels 1 to b-1 need to be empty.
+ // We try to pick b and its target size so that
+ // 1. target size is in the range of
+ // (max_bytes_for_level_base / max_bytes_for_level_multiplier,
+ // max_bytes_for_level_base]
+ // 2. target size of the last level (level num_levels-1) equals the extra
+ // size of the level.
+ // At the same time max_bytes_for_level_multiplier and
+ // max_bytes_for_level_multiplier_additional are still satisfied.
+ // (When L0 is too large, we make some adjustment. See below.)
+ //
+ // With this option on, from an empty DB, we make the last level the base
+ // level, which means merging L0 data into the last level, until it exceeds
+ // max_bytes_for_level_base. Then we make the second last level the base
+ // level, to start merging L0 data into the second last level, with its
+ // target size being 1/max_bytes_for_level_multiplier of the last level's
+ // extra size. As the data accumulates further, we move the base level to
+ // the third last level, and so on.
+ //
+ // For example, assume max_bytes_for_level_multiplier=10, num_levels=6,
+ // and max_bytes_for_level_base=10MB.
+ // Target sizes of levels 1 to 5 start at:
+ // [- - - - 10MB]
+ // with the base level being level 5. Target sizes of levels 1 to 4 are not
+ // applicable because they will not be used.
+ // Until the size of level 5 grows to more than 10MB, say 11MB, we make
+ // level 4 the base target and now the targets look like:
+ // [- - - 1.1MB 11MB]
+ // While data is accumulated, size targets are tuned based on actual data
+ // of level 5. When level 5 has 50MB of data, the targets are:
+ // [- - - 5MB 50MB]
+ // Until level 5's actual size is more than 100MB, say 101MB. Now if we keep
+ // level 4 as the base level, its target size needs to be 10.1MB, which
+ // doesn't satisfy the target size range. So now we make level 3 the base
+ // level and the target sizes of the levels look like:
+ // [- - 1.01MB 10.1MB 101MB]
+ // In the same way, while level 5 further grows, all levels' targets grow,
+ // like
+ // [- - 5MB 50MB 500MB]
+ // Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the
+ // base level and make the levels' target sizes like this:
+ // [- 1.001MB 10.01MB 100.1MB 1001MB]
+ // and so on...
+ //
+ // By doing this, we give max_bytes_for_level_multiplier priority over
+ // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
+ // useful to limit worst-case space amplification.
+ //
+ //
+ // If the compaction from L0 lags behind, a special mode will be turned
+ // on to prioritize write amplification over max_bytes_for_level_multiplier
+ // or max_bytes_for_level_base. L0 compaction is considered lagging based on
+ // the number of L0 files and the total L0 size. If the number of L0 files
+ // is at least double level0_file_num_compaction_trigger, or the total size
+ // is at least max_bytes_for_level_base, this mode is on. The target of L1
+ // grows to the actual data size in L0, and the target for each level is
+ // then determined so that each level has the same level multiplier.
+ //
+ // For example, when the L0 size is 100MB, the size of the last level is
+ // 1600MB, max_bytes_for_level_base = 80MB, and
+ // max_bytes_for_level_multiplier = 10.
+ // Since the L0 size is larger than max_bytes_for_level_base, this is the L0
+ // compaction backlogged mode. So the L1 size is determined to be 100MB.
+ // Based on max_bytes_for_level_multiplier = 10, at least 3 non-0 levels will
+ // be needed. The level multiplier will be calculated to be 4 and the three
+ // levels' targets will be [100MB, 400MB, 1600MB].
+ //
+ // In this mode, the number of levels will be no more than in the normal
+ // mode, and the level multiplier will be lower. The write amplification
+ // will likely be reduced.
+ //
+ //
+ // max_bytes_for_level_multiplier_additional is ignored with this flag on.
+ //
+ // To make the migration easier, when turning this feature on, files in the
+ // LSM will be trivially moved down to fill the LSM starting from the
+ // bottommost level during DB open. For example, if the LSM looks like:
+ // L0: f0, f1
+ // L1: f2, f3
+ // L2: f4
+ // L3:
+ // L4: f5
+ // and the DB is opened with num_levels = 7 with this feature turned on, the
+ // new LSM after DB open looks like the following:
+ // L0: f0, f1, (and possibly data flushed from WAL)
+ // L4: f2, f3
+ // L5: f4
+ // L6: f5
+ //
+ // If `allow_ingest_behind=true` or `preclude_last_level_data_seconds > 0`,
+ // then the last level is reserved, and we will start filling the LSM from
+ // the second last level (L5 in the above example).
+ //
+ // Note that there may be excessive levels (where the target level size is 0
+ // when computed based on this feature) in the LSM after a user migrates to
+ // turn this feature on. This is especially likely when a user migrates from
+ // leveled compaction with a smaller multiplier or from universal compaction.
+ // RocksDB will gradually drain these unnecessary levels by compacting files
+ // down the LSM.
+ //
+ // Default: false
+ bool level_compaction_dynamic_level_bytes = false;
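+
+ // Example (illustrative): with the defaults num_levels=7,
+ // max_bytes_for_level_base=256MB and max_bytes_for_level_multiplier=10,
+ // a DB holding about 50GB settles on targets of roughly 50GB (L6),
+ // 5GB (L5), 500MB (L4) and 50MB (L3), with L3 as the base level receiving
+ // L0 files and L1-L2 staying empty, per the target-size rules above.
+ //
+ //   Options options;
+ //   options.level_compaction_dynamic_level_bytes = true;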
+
+ // Allows RocksDB to generate files that are not exactly the target_file_size
+ // only for the non-bottommost files, which can reduce the write
+ // amplification from compaction. The file size could be from 0 to
+ // 2x target_file_size. Once enabled, non-bottommost compaction will try to
+ // cut the files aligned with the next level's file boundaries (grandparent
+ // level).
+ //
+ // Default: true
+ bool level_compaction_dynamic_file_size = true;
+
+ // Default: 10.
+ //
+ // Dynamically changeable through SetOptions() API
+ double max_bytes_for_level_multiplier = 10;
+
+ // Different max-size multipliers for different levels.
+ // These are multiplied by max_bytes_for_level_multiplier to arrive
+ // at the max-size of each level.
+ //
+ // Default: 1
+ //
+ // Dynamically changeable through SetOptions() API
+ std::vector<int> max_bytes_for_level_multiplier_additional =
+ std::vector<int>(num_levels, 1);
+
+ // We try to limit the number of bytes in one compaction to be lower than
+ // this threshold. But it's not guaranteed.
+ // Value 0 will be sanitized.
+ //
+ // Default: target_file_size_base * 25
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t max_compaction_bytes = 0;
+
+ // When setting up compaction input files, we ignore the
+ // `max_compaction_bytes` limit when pulling in input files that are entirely
+ // within the output key range.
+ //
+ // Default: true
+ //
+ // Dynamically changeable through SetOptions() API
+ // We could remove this knob and always ignore the limit once it is proven
+ // safe.
+ bool ignore_max_compaction_bytes_for_input = true;
+
+ // All writes will be slowed down to at least delayed_write_rate if the
+ // estimated bytes needing compaction exceed this threshold.
+ //
+ // Default: 64GB
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull;
+
+ // All writes are stopped if the estimated bytes needing compaction exceed
+ // this threshold.
+ //
+ // Default: 256GB
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull;
+
+ // The compaction style. Default: kCompactionStyleLevel
+ CompactionStyle compaction_style = kCompactionStyleLevel;
+
+ // If compaction_style = kCompactionStyleLevel, this determines, for each
+ // level, which files are prioritized to be picked to compact.
+ // Default: kMinOverlappingRatio
+ CompactionPri compaction_pri = kMinOverlappingRatio;
+
+ // The options needed to support Universal Style compactions
+ //
+ // Dynamically changeable through SetOptions() API
+ // Dynamic change example:
+ // SetOptions("compaction_options_universal", "{size_ratio=2;}")
+ CompactionOptionsUniversal compaction_options_universal;
+
+ // The options for FIFO compaction style
+ //
+ // Dynamically changeable through SetOptions() API
+ // Dynamic change example:
+ // SetOptions("compaction_options_fifo", "{max_table_files_size=100;}")
+ CompactionOptionsFIFO compaction_options_fifo;
+
+ // An iterator's Next() sequentially skips over keys with the same
+ // user-key unless this option is set. This number specifies the number
+ // of keys (with the same userkey) that will be sequentially
+ // skipped before a reseek is issued.
+ //
+ // Default: 8
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t max_sequential_skip_in_iterations = 8;
+
+ // This is a factory that provides MemTableRep objects.
+ // Default: a factory that provides a skip-list-based implementation of
+ // MemTableRep.
+ std::shared_ptr<MemTableRepFactory> memtable_factory =
+ std::shared_ptr<MemTableRepFactory>(new SkipListFactory);
+
+ // Block-based table related options are moved to BlockBasedTableOptions.
+ // Related options that were originally here but now moved include:
+ // no_block_cache
+ // block_cache
+ // block_cache_compressed (removed)
+ // block_size
+ // block_size_deviation
+ // block_restart_interval
+ // filter_policy
+ // whole_key_filtering
+ // If you'd like to customize some of these options, you will need to
+ // use NewBlockBasedTableFactory() to construct a new table factory.
+
+ // This option allows users to collect their own interested statistics of
+ // the tables.
+ // Default: empty vector -- no user-defined statistics collection will be
+ // performed.
+ using TablePropertiesCollectorFactories =
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>;
+ TablePropertiesCollectorFactories table_properties_collector_factories;
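+
+ // Example (illustrative sketch of a user-defined collector; the class and
+ // property names are made up): count deletion entries per SST file and
+ // publish the count as a user property.
+ //
+ //   class DeleteCountCollector : public TablePropertiesCollector {
+ //    public:
+ //     Status AddUserKey(const Slice& /*key*/, const Slice& /*value*/,
+ //                       EntryType type, SequenceNumber /*seq*/,
+ //                       uint64_t /*file_size*/) override {
+ //       if (type == kEntryDelete) ++delete_count_;
+ //       return Status::OK();
+ //     }
+ //     Status Finish(UserCollectedProperties* props) override {
+ //       props->emplace("example.delete-count",
+ //                      std::to_string(delete_count_));
+ //       return Status::OK();
+ //     }
+ //     UserCollectedProperties GetReadableProperties() const override {
+ //       return {};
+ //     }
+ //     const char* Name() const override { return "DeleteCountCollector"; }
+ //
+ //    private:
+ //     uint64_t delete_count_ = 0;
+ //   };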
+
+ // Maximum number of successive merge operations on a key in the memtable.
+ //
+ // When a merge operation is added to the memtable and the maximum number of
+ // successive merges is reached, the value of the key will be calculated and
+ // inserted into the memtable instead of the merge operation. This will
+ // ensure that there are never more than max_successive_merges merge
+ // operations in the memtable.
+ //
+ // Default: 0 (disabled)
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t max_successive_merges = 0;
+
+ // This flag specifies that the implementation should optimize the filters
+ // mainly for cases where keys are found rather than also optimize for keys
+ // missed. This would be used in cases where the application knows that
+ // there are very few misses or the performance in the case of misses is not
+ // important.
+ //
+ // For now, this flag allows us to not store filters for the last level, i.e.
+ // the largest level which contains data of the LSM store. For keys which
+ // are hits, the filters in this level are not useful because we will search
+ // for the data anyway. NOTE: the filters in other levels are still useful
+ // even for key hits because they tell us whether to look in that level or go
+ // to the higher level.
+ //
+ // Default: false
+ bool optimize_filters_for_hits = false;
+
+ // During flush or compaction, check whether keys inserted to output files
+ // are in order.
+ //
+ // Default: true
+ //
+ // Dynamically changeable through SetOptions() API
+ bool check_flush_compaction_key_order = true;
+
+ // After writing every SST file, reopen it and read all the keys.
+ // Checks the hash of all of the keys and values written versus the
+ // keys in the file and signals a corruption if they do not match.
+ //
+ // Default: false
+ //
+ // Dynamically changeable through SetOptions() API
+ bool paranoid_file_checks = false;
+
+ // In debug mode, RocksDB runs consistency checks on the LSM every time the
+ // LSM changes (Flush, Compaction, AddFile). When this option is true, these
+ // checks are also enabled in release mode. These checks were historically
+ // disabled in release mode, but are now enabled by default for proactive
+ // corruption detection. The CPU overhead is negligible for normal mixed
+ // operations but can slow down saturated writing. See
+ // Options::DisableExtraChecks().
+ // Default: true
+ bool force_consistency_checks = true;
+
+ // Measure IO stats in compactions and flushes, if true.
+ //
+ // Default: false
+ //
+ // Dynamically changeable through SetOptions() API
+ bool report_bg_io_stats = false;
+
+ // Files containing updates older than TTL will go through the compaction
+ // process. This usually happens in a cascading way so that those entries
+ // will be compacted to the bottommost level/file.
+ // The feature is used to remove stale entries that have been deleted or
+ // updated from the file system.
+ // Pre-req: This needs max_open_files to be set to -1.
+ // In Level: Non-bottom-level files older than TTL will go through the
+ // compaction process.
+ // In FIFO: Files older than TTL will be deleted.
+ // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60
+ // In FIFO, this option will have the same meaning as
+ // periodic_compaction_seconds. Whichever is stricter will be used.
+ // 0 means disabled.
+ // UINT64_MAX - 1 (0xfffffffffffffffe) is a special flag to allow RocksDB to
+ // pick the default.
+ //
+ // Default: 30 days for leveled compaction + block based table. Disabled
+ // otherwise.
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t ttl = 0xfffffffffffffffe;
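+
+ // Example (illustrative): because this option is dynamically changeable,
+ // the TTL of a live column family can be tightened to, e.g., 7 days:
+ //
+ //   db->SetOptions(cf_handle, {{"ttl", "604800"}});  // 7 * 24 * 60 * 60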
+
+ // Files older than this value will be picked up for compaction, and
+ // re-written to the same level as they were before.
+ // One main use of the feature is to make sure a file goes through compaction
+ // filters periodically. Users can also use the feature to clear up SST
+ // files using an old format.
+ //
+ // A file's age is computed by looking at the file_creation_time or
+ // creation_time table properties, in order, if they have valid non-zero
+ // values; if not, the age is based on the file's last modified time (given
+ // by the underlying Env).
+ //
+ // Supported in all compaction styles.
+ // In Universal compaction, rocksdb will try to do a full compaction when
+ // possible, see more in UniversalCompactionBuilder::PickPeriodicCompaction().
+ // In FIFO compaction, this option has the same meaning as TTL and whichever
+ // is stricter will be used.
+ // Pre-req: max_open_files == -1.
+ // unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60
+ //
+ // Values:
+ // 0: Turn off Periodic compactions.
+ // UINT64_MAX - 1 (i.e. 0xfffffffffffffffe): Let RocksDB control this feature
+ // as needed. For now, RocksDB will change this value to 30 days
+ // (i.e. 30 * 24 * 60 * 60) so that every file goes through the compaction
+ // process at least once every 30 days if not compacted sooner.
+ // In FIFO compaction, since the option has the same meaning as ttl,
+ // when this value is left at the default, and ttl is left at 0, 30 days
+ // will be used. Otherwise, min(ttl, periodic_compaction_seconds) will be
+ // used.
+ //
+ // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune)
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t periodic_compaction_seconds = 0xfffffffffffffffe;
+
+ // If this option is set, then 1 in N blocks are compressed
+ // using a fast (lz4) and a slow (zstd) compression algorithm.
+ // The compressibility is reported as stats and the stored
+ // data is left uncompressed (unless compression is also requested).
+ uint64_t sample_for_compression = 0;
+
+ // EXPERIMENTAL
+ // The feature is still in development and is incomplete.
+ // If this option is set, when creating the last level files, pass this
+ // temperature to the FileSystem used. This should be a no-op for the default
+ // FileSystem; users need to plug in their own FileSystem to take advantage
+ // of it.
+ //
+ // Note: the feature is changed from `bottommost_temperature` to
+ // `last_level_temperature`, which now only applies to the last level files.
+ // The option name `bottommost_temperature` is kept only for migration; the
+ // behavior is the same as `last_level_temperature`. Please stop using
+ // `bottommost_temperature`; it will be removed in the next release.
+ //
+ // Dynamically changeable through the SetOptions() API
+ Temperature bottommost_temperature = Temperature::kUnknown;
+ Temperature last_level_temperature = Temperature::kUnknown;
+
+ // EXPERIMENTAL
+ // The feature is still in development and is incomplete.
+ // If this option is set, when the data insert time is within this time
+ // range, it will be precluded from the last level.
+ // 0 means no key will be precluded from the last level.
+ //
+ // Note: when enabled, universal size amplification (controlled by option
+ // `compaction_options_universal.max_size_amplification_percent`) calculation
+ // will exclude the last level. As the feature is designed for tiered storage
+ // and a typical setting is that the last level is the cold tier, which is
+ // likely not size constrained, the size amp is going to be only for
+ // non-last levels.
+ //
+ // Default: 0 (disable the feature)
+ //
+ // Not dynamically changeable; changing it requires a DB restart.
+ uint64_t preclude_last_level_data_seconds = 0;
+
+ // EXPERIMENTAL
+ // If this option is set, it will preserve the internal time information about
+ // the data until it's older than the specified time here.
+ // Internally the time information is a map between sequence number and time,
+ // the same as for `preclude_last_level_data_seconds`. But it won't
+ // preclude the data from the last level and the data in the last level won't
+ // have the sequence number zeroed out.
+ // Internally, rocksdb samples the sequence number to time pair and stores
+ // that in the SST property "rocksdb.seqno.time.map". The information is
+ // currently only used for tiered storage compaction (option
+ // `preclude_last_level_data_seconds`).
+ //
+ // Note: if both `preclude_last_level_data_seconds` and this option are set,
+ // the max time of the 2 options will be preserved, and compaction still
+ // precludes the data based on `preclude_last_level_data_seconds`.
+ // The higher the preserve time is, the lower the sampling frequency will be
+ // (which means less accuracy of the time estimation).
+ //
+ // Default: 0 (disable the feature)
+ //
+ // Not dynamically changeable; changing it requires a DB restart.
+ uint64_t preserve_internal_time_seconds = 0; + + // When set, large values (blobs) are written to separate blob files, and + // only pointers to them are stored in SST files. This can reduce write + // amplification for large-value use cases at the cost of introducing a level + // of indirection for reads. See also the options min_blob_size, + // blob_file_size, blob_compression_type, enable_blob_garbage_collection, + // blob_garbage_collection_age_cutoff, + // blob_garbage_collection_force_threshold, and blob_compaction_readahead_size + // below. + // + // Default: false + // + // Dynamically changeable through the SetOptions() API + bool enable_blob_files = false; + + // The size of the smallest value to be stored separately in a blob file. + // Values which have an uncompressed size smaller than this threshold are + // stored alongside the keys in SST files in the usual fashion. A value of + // zero for this option means that all values are stored in blob files. Note + // that enable_blob_files has to be set in order for this option to have any + // effect. + // + // Default: 0 + // + // Dynamically changeable through the SetOptions() API + uint64_t min_blob_size = 0; + + // The size limit for blob files. When writing blob files, a new file is + // opened once this limit is reached. Note that enable_blob_files has to be + // set in order for this option to have any effect. + // + // Default: 256 MB + // + // Dynamically changeable through the SetOptions() API + uint64_t blob_file_size = 1ULL << 28; + + // The compression algorithm to use for large values stored in blob files. + // Note that enable_blob_files has to be set in order for this option to have + // any effect. + // + // Default: no compression + // + // Dynamically changeable through the SetOptions() API + CompressionType blob_compression_type = kNoCompression; + + // Enables garbage collection of blobs. Blob GC is performed as part of + // compaction. Valid blobs residing in blob files older than a cutoff get + // relocated to new files as they are encountered during compaction, which + // makes it possible to clean up blob files once they contain nothing but + // obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff and + // blob_garbage_collection_force_threshold below. + // + // Default: false + // + // Dynamically changeable through the SetOptions() API + bool enable_blob_garbage_collection = false; + + // The cutoff in terms of blob file age for garbage collection. Blobs in + // the oldest N blob files will be relocated when encountered during + // compaction, where N = garbage_collection_cutoff * number_of_blob_files. + // Note that enable_blob_garbage_collection has to be set in order for this + // option to have any effect. + // + // Default: 0.25 + // + // Dynamically changeable through the SetOptions() API + double blob_garbage_collection_age_cutoff = 0.25; + + // If the ratio of garbage in the oldest blob files exceeds this threshold, + // targeted compactions are scheduled in order to force garbage collecting + // the blob files in question, assuming they are all eligible based on the + // value of blob_garbage_collection_age_cutoff above. This option is + // currently only supported with leveled compactions. + // Note that enable_blob_garbage_collection has to be set in order for this + // option to have any effect. 
+ //
+ // Default: 1.0
+ //
+ // Dynamically changeable through the SetOptions() API
+ double blob_garbage_collection_force_threshold = 1.0;
+
+ // Compaction readahead for blob files.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through the SetOptions() API
+ uint64_t blob_compaction_readahead_size = 0;
+
+ // Enable blob files starting from a certain LSM tree level.
+ //
+ // For certain use cases that have a mix of short-lived and long-lived values,
+ // it might make sense to support extracting large values only during
+ // compactions whose output level is greater than or equal to a specified LSM
+ // tree level (e.g. compactions into L1/L2/... or above). This could reduce
+ // the space amplification caused by large values that are turned into garbage
+ // shortly after being written at the price of some write amplification
+ // incurred by long-lived values whose extraction to blob files is delayed.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through the SetOptions() API
+ int blob_file_starting_level = 0;
+
+ // The Cache object to use for blobs. Using a dedicated object for blobs and
+ // using the same object for the block and blob caches are both supported. In
+ // the latter case, note that blobs are less valuable from a caching
+ // perspective than SST blocks, and some cache implementations have
+ // configuration options that can be used to prioritize items accordingly (see
+ // Cache::Priority and LRUCacheOptions::{high,low}_pri_pool_ratio).
+ //
+ // Default: nullptr (disabled)
+ std::shared_ptr<Cache> blob_cache = nullptr;
+
+ // Enable/disable prepopulating the blob cache. When set to kFlushOnly, BlobDB
+ // will insert newly written blobs into the blob cache during flush. This can
+ // improve performance when reading back these blobs would otherwise be
+ // expensive (e.g. when using direct I/O or remote storage), or when the
+ // workload has a high temporal locality.
+ //
+ // Default: disabled
+ //
+ // Dynamically changeable through the SetOptions() API
+ PrepopulateBlobCache prepopulate_blob_cache = PrepopulateBlobCache::kDisable;
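+
+ // Example (illustrative sketch): separating values of at least 1KB into
+ // compressed blob files with garbage collection enabled, using the options
+ // declared above.
+ //
+ //   ColumnFamilyOptions cf_options;
+ //   cf_options.enable_blob_files = true;
+ //   cf_options.min_blob_size = 1024;
+ //   cf_options.blob_compression_type = kLZ4Compression;
+ //   cf_options.enable_blob_garbage_collection = true;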
+
+ // Enable memtable per key-value checksum protection.
+ //
+ // Each entry in the memtable will be suffixed by a per key-value checksum.
+ // This option determines the size of such checksums.
+ //
+ // It is suggested to turn on write batch per key-value
+ // checksum protection together with this option, so that the checksum
+ // computation is done outside of writer threads (the memtable kv checksum
+ // can be computed from the write batch checksum). See
+ // WriteOptions::protection_bytes_per_key for more detail.
+ //
+ // Default: 0 (no protection)
+ // Supported values: 0, 1, 2, 4, 8.
+ uint32_t memtable_protection_bytes_per_key = 0;
+
+ // UNDER CONSTRUCTION -- DO NOT USE
+ // When the user-defined timestamp feature is enabled, this flag controls
+ // whether the user-defined timestamps will be persisted.
+ //
+ // When it's false, the user-defined timestamps will be removed from the user
+ // keys when data is flushed from memtables to SST files. Other places where
+ // user keys can be persisted, like WAL and blob files, go through a similar
+ // process. Users should call `DB::IncreaseFullHistoryTsLow` to set a cutoff
+ // timestamp. RocksDB refrains, with best effort, from flushing a memtable
+ // with data still above the cutoff timestamp. When users try to read below
+ // the cutoff timestamp, an error will be returned.
+ //
+ // Default: true (user-defined timestamps are persisted)
+ // Not dynamically changeable; changing it requires a DB restart and
+ // only compatible changes are allowed.
+ bool persist_user_defined_timestamps = true;
+
+ // Enable/disable per key-value checksum protection for in memory blocks.
+ //
+ // Checksum is constructed when a block is loaded into memory and verification
+ // is done for each key read from the block. This is useful for detecting
+ // in-memory data corruption. Note that this feature has a non-trivial
+ // negative impact on read performance. Different values of the
+ // option have similar performance impact, but different memory cost and
+ // corruption detection probability (e.g. 1 byte gives a 255/256 chance of
+ // detecting a corruption).
+ //
+ // Default: 0 (no protection)
+ // Supported values: 0, 1, 2, 4, 8.
+ uint8_t block_protection_bytes_per_key = 0;
+
+ // Create ColumnFamilyOptions with default values for all fields
+ AdvancedColumnFamilyOptions();
+ // Create ColumnFamilyOptions from Options
+ explicit AdvancedColumnFamilyOptions(const Options& options);
+
+ // ---------------- OPTIONS NOT SUPPORTED ANYMORE ----------------
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/block_cache_trace_writer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/block_cache_trace_writer.h
new file mode 100644
index 0000000000..18d28685b1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/block_cache_trace_writer.h
@@ -0,0 +1,149 @@
+// Copyright (c) 2022, Meta Platforms, Inc. and affiliates. All rights
+// reserved. This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/options.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table_reader_caller.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/trace_record.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A record for block cache lookups/inserts. This is passed by the table
+// reader to the BlockCacheTraceWriter for every block cache op.
+struct BlockCacheTraceRecord {
+ // Required fields for all accesses.
+ uint64_t access_timestamp = 0;
+
+ // Info related to the block being looked up or inserted
+ //
+ // 1. The cache key for the block
+ std::string block_key;
+
+ // 2. The type of block
+ TraceType block_type = TraceType::kTraceMax;
+
+ // 3. Size of the block
+ uint64_t block_size = 0;
+
+ // Info about the SST file the block is in
+ //
+ // 1. Column family ID
+ uint64_t cf_id = 0;
+
+ // 2. Column family name
+ std::string cf_name;
+
+ // 3. LSM level of the file
+ uint32_t level = 0;
+
+ // 4. SST file number
+ uint64_t sst_fd_number = 0;
+
+ // Info about the calling context
+ //
+ // 1. The higher level request triggering the block cache request
+ TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller;
+
+ // 2. Cache lookup hit/miss. Not relevant for inserts
+ bool is_cache_hit = false;
+
+ // 3. Whether this request is a lookup
+ bool no_insert = false;
+
+ // Get/MultiGet specific info
+ //
+ // 1. A unique ID for Get/MultiGet
+ uint64_t get_id = kReservedGetId;
+
+ // 2. Whether the Get/MultiGet is from a user-specified snapshot
+ bool get_from_user_specified_snapshot = false;
+
+ // 3. The target user key in the block
+ std::string referenced_key;
+
+ // Required fields for data block and user Get/Multi-Get only.
+ //
+ // 1. Size of the useful data in the block
+ uint64_t referenced_data_size = 0;
+
+ // 2. Only for MultiGet, number of keys from the batch found in the block
+ uint64_t num_keys_in_block = 0;
+
+ // 3. Whether the key was found in the block or not (false positive)
+ bool referenced_key_exist_in_block = false;
+
+ static const uint64_t kReservedGetId;
+
+ BlockCacheTraceRecord() {}
+
+ BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key,
+ TraceType _block_type, uint64_t _block_size,
+ uint64_t _cf_id, std::string _cf_name, uint32_t _level,
+ uint64_t _sst_fd_number, TableReaderCaller _caller,
+ bool _is_cache_hit, bool _no_insert, uint64_t _get_id,
+ bool _get_from_user_specified_snapshot = false,
+ std::string _referenced_key = "",
+ uint64_t _referenced_data_size = 0,
+ uint64_t _num_keys_in_block = 0,
+ bool _referenced_key_exist_in_block = false)
+ : access_timestamp(_access_timestamp),
+ block_key(_block_key),
+ block_type(_block_type),
+ block_size(_block_size),
+ cf_id(_cf_id),
+ cf_name(_cf_name),
+ level(_level),
+ sst_fd_number(_sst_fd_number),
+ caller(_caller),
+ is_cache_hit(_is_cache_hit),
+ no_insert(_no_insert),
+ get_id(_get_id),
+ get_from_user_specified_snapshot(_get_from_user_specified_snapshot),
+ referenced_key(_referenced_key),
+ referenced_data_size(_referenced_data_size),
+ num_keys_in_block(_num_keys_in_block),
+ referenced_key_exist_in_block(_referenced_key_exist_in_block) {}
+};
+
+// Options for tracing block cache accesses
+struct BlockCacheTraceOptions {
+ // Specify trace sampling option, i.e. capture one per how many requests.
+ // Default to 1 (capture every request).
+ uint64_t sampling_frequency = 1;
+};
+
+// Options for the built-in implementation of BlockCacheTraceWriter
+struct BlockCacheTraceWriterOptions {
+ uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
+};
+
+// BlockCacheTraceWriter is an abstract class that captures all RocksDB block
+// cache accesses. Every RocksDB operation is passed to WriteBlockAccess()
+// with a BlockCacheTraceRecord.
+class BlockCacheTraceWriter {
+ public:
+ virtual ~BlockCacheTraceWriter() {}
+
+ // Pass Slice references to avoid copy.
+ virtual Status WriteBlockAccess(const BlockCacheTraceRecord& record,
+ const Slice& block_key, const Slice& cf_name,
+ const Slice& referenced_key) = 0;
+
+ // Write a trace header at the beginning, typically on initiating a trace,
+ // with some metadata like a magic number and RocksDB version.
+ virtual Status WriteHeader() = 0;
+};
+
+// Allocate an instance of the built-in BlockCacheTraceWriter implementation,
+// that traces all block cache accesses to a user-provided TraceWriter. Each
+// access is traced to a file with a timestamp and type, followed by the
+// payload.
+std::unique_ptr<BlockCacheTraceWriter> NewBlockCacheTraceWriter(
+ SystemClock* clock, const BlockCacheTraceWriterOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer);
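+
+// Example (illustrative sketch): wiring the built-in writer to a file-backed
+// TraceWriter. NewFileTraceWriter() is declared in trace_reader_writer.h and
+// DB::StartBlockCacheTrace() consumes the resulting BlockCacheTraceWriter.
+//
+//   std::unique_ptr<TraceWriter> file_writer;
+//   Status s = NewFileTraceWriter(env, EnvOptions(), trace_path,
+//                                 &file_writer);
+//   if (s.ok()) {
+//     std::unique_ptr<BlockCacheTraceWriter> writer =
+//         NewBlockCacheTraceWriter(env->GetSystemClock().get(),
+//                                  BlockCacheTraceWriterOptions(),
+//                                  std::move(file_writer));
+//     s = db->StartBlockCacheTrace(BlockCacheTraceOptions(),
+//                                  std::move(writer));
+//   }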
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/c.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/c.h
new file mode 100644
index 0000000000..6e0ad08c81
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/c.h
@@ -0,0 +1,2864 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+ Use of this source code is governed by a BSD-style license that can be
+ found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+ C bindings for rocksdb. May be useful as a stable ABI that can be
+ used by programs that keep rocksdb in a shared library, or for
+ a JNI api.
+
+ Does not support:
+ . getters for the option types
+ . custom comparators that implement key shortening
+ . capturing post-write-snapshot
+ . custom iter, db, env, cache implementations using just the C bindings
+
+ Some conventions:
+
+ (1) We expose just opaque struct pointers and functions to clients.
+ This allows us to change internal representations without having to
+ recompile clients.
+
+ (2) For simplicity, there is no equivalent to the Slice type. Instead,
+ the caller has to pass the pointer and length as separate
+ arguments.
+
+ (3) Errors are represented by a null-terminated c string. NULL
+ means no error. All operations that can raise an error are passed
+ a "char** errptr" as the last argument. One of the following must
+ be true on entry:
+ *errptr == NULL
+ *errptr points to a malloc()ed null-terminated error message
+ On success, a leveldb routine leaves *errptr unchanged.
+ On failure, leveldb frees the old value of *errptr and
+ sets *errptr to a malloc()ed error message.
+
+ (4) Bools have the type unsigned char (0 == false; rest == true)
+
+ (5) All of the pointer arguments must be non-NULL.
+*/
+
+#pragma once
+
+#ifdef _WIN32
+#ifdef ROCKSDB_DLL
+#ifdef ROCKSDB_LIBRARY_EXPORTS
+#define ROCKSDB_LIBRARY_API __declspec(dllexport)
+#else
+#define ROCKSDB_LIBRARY_API __declspec(dllimport)
+#endif
+#else
+#define ROCKSDB_LIBRARY_API
+#endif
+#else
+#define ROCKSDB_LIBRARY_API
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Exported types */
+
+typedef struct rocksdb_t rocksdb_t;
+typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t;
+typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t;
+typedef struct rocksdb_backup_engine_options_t rocksdb_backup_engine_options_t;
+typedef struct rocksdb_restore_options_t rocksdb_restore_options_t;
+typedef struct rocksdb_memory_allocator_t rocksdb_memory_allocator_t;
+typedef struct rocksdb_lru_cache_options_t rocksdb_lru_cache_options_t;
+typedef struct rocksdb_hyper_clock_cache_options_t
+ rocksdb_hyper_clock_cache_options_t;
+typedef struct rocksdb_cache_t rocksdb_cache_t;
+typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t;
+typedef struct rocksdb_compactionfiltercontext_t
+ rocksdb_compactionfiltercontext_t;
+typedef struct rocksdb_compactionfilterfactory_t
+ rocksdb_compactionfilterfactory_t;
+typedef struct rocksdb_comparator_t rocksdb_comparator_t;
+typedef struct rocksdb_dbpath_t rocksdb_dbpath_t;
+typedef struct rocksdb_env_t rocksdb_env_t;
+typedef struct rocksdb_fifo_compaction_options_t
+ rocksdb_fifo_compaction_options_t;
+typedef struct rocksdb_filelock_t rocksdb_filelock_t;
+typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t;
+typedef struct rocksdb_flushoptions_t rocksdb_flushoptions_t;
+typedef struct rocksdb_iterator_t rocksdb_iterator_t;
+typedef struct rocksdb_logger_t rocksdb_logger_t;
+typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t;
+typedef 
struct rocksdb_options_t rocksdb_options_t; +typedef struct rocksdb_compactoptions_t rocksdb_compactoptions_t; +typedef struct rocksdb_block_based_table_options_t + rocksdb_block_based_table_options_t; +typedef struct rocksdb_cuckoo_table_options_t rocksdb_cuckoo_table_options_t; +typedef struct rocksdb_randomfile_t rocksdb_randomfile_t; +typedef struct rocksdb_readoptions_t rocksdb_readoptions_t; +typedef struct rocksdb_seqfile_t rocksdb_seqfile_t; +typedef struct rocksdb_slicetransform_t rocksdb_slicetransform_t; +typedef struct rocksdb_snapshot_t rocksdb_snapshot_t; +typedef struct rocksdb_writablefile_t rocksdb_writablefile_t; +typedef struct rocksdb_writebatch_t rocksdb_writebatch_t; +typedef struct rocksdb_writebatch_wi_t rocksdb_writebatch_wi_t; +typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t; +typedef struct rocksdb_universal_compaction_options_t + rocksdb_universal_compaction_options_t; +typedef struct rocksdb_livefiles_t rocksdb_livefiles_t; +typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t; +typedef struct rocksdb_column_family_metadata_t + rocksdb_column_family_metadata_t; +typedef struct rocksdb_level_metadata_t rocksdb_level_metadata_t; +typedef struct rocksdb_sst_file_metadata_t rocksdb_sst_file_metadata_t; +typedef struct rocksdb_envoptions_t rocksdb_envoptions_t; +typedef struct rocksdb_ingestexternalfileoptions_t + rocksdb_ingestexternalfileoptions_t; +typedef struct rocksdb_sstfilewriter_t rocksdb_sstfilewriter_t; +typedef struct rocksdb_ratelimiter_t rocksdb_ratelimiter_t; +typedef struct rocksdb_perfcontext_t rocksdb_perfcontext_t; +typedef struct rocksdb_pinnableslice_t rocksdb_pinnableslice_t; +typedef struct rocksdb_transactiondb_options_t rocksdb_transactiondb_options_t; +typedef struct rocksdb_transactiondb_t rocksdb_transactiondb_t; +typedef struct rocksdb_transaction_options_t rocksdb_transaction_options_t; +typedef struct rocksdb_optimistictransactiondb_t + rocksdb_optimistictransactiondb_t; +typedef struct rocksdb_optimistictransaction_options_t + rocksdb_optimistictransaction_options_t; +typedef struct rocksdb_transaction_t rocksdb_transaction_t; +typedef struct rocksdb_checkpoint_t rocksdb_checkpoint_t; +typedef struct rocksdb_wal_iterator_t rocksdb_wal_iterator_t; +typedef struct rocksdb_wal_readoptions_t rocksdb_wal_readoptions_t; +typedef struct rocksdb_memory_consumers_t rocksdb_memory_consumers_t; +typedef struct rocksdb_memory_usage_t rocksdb_memory_usage_t; + +/* DB operations */ + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open( + const rocksdb_options_t* options, const char* name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_with_ttl( + const rocksdb_options_t* options, const char* name, int ttl, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only( + const rocksdb_options_t* options, const char* name, + unsigned char error_if_wal_file_exists, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( + const rocksdb_options_t* options, const char* path, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* +rocksdb_backup_engine_open_opts(const rocksdb_backup_engine_options_t* options, + rocksdb_env_t* env, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup( + rocksdb_backup_engine_t* be, 
rocksdb_t* db, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup_flush( + rocksdb_backup_engine_t* be, rocksdb_t* db, + unsigned char flush_before_backup, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_purge_old_backups( + rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_restore_options_t* +rocksdb_restore_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy( + rocksdb_restore_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files( + rocksdb_restore_options_t* opt, int v); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_verify_backup( + rocksdb_backup_engine_t* be, uint32_t backup_id, char** errptr); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_restore_db_from_latest_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_restore_db_from_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, const uint32_t backup_id, + char** errptr); + +extern ROCKSDB_LIBRARY_API const rocksdb_backup_engine_info_t* +rocksdb_backup_engine_get_backup_info(rocksdb_backup_engine_t* be); + +extern ROCKSDB_LIBRARY_API int rocksdb_backup_engine_info_count( + const rocksdb_backup_engine_info_t* info); + +extern ROCKSDB_LIBRARY_API int64_t rocksdb_backup_engine_info_timestamp( + const rocksdb_backup_engine_info_t* info, int index); + +extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_backup_id( + const rocksdb_backup_engine_info_t* info, int index); + +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_backup_engine_info_size( + const rocksdb_backup_engine_info_t* info, int index); + +extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_number_files( + const rocksdb_backup_engine_info_t* info, int index); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_info_destroy( + const rocksdb_backup_engine_info_t* info); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_close( + rocksdb_backup_engine_t* be); + +extern ROCKSDB_LIBRARY_API void rocksdb_put_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, const char* ts, size_t tslen, const char* val, size_t vallen, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_put_cf_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* ts, size_t tslen, const char* val, size_t vallen, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, const char* ts, size_t tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_cf_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* ts, size_t tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_singledelete( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_cf( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* 
column_family, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, const char* ts, size_t tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_cf_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* ts, size_t tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_increase_full_history_ts_low( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* ts_low, size_t ts_lowlen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_get_full_history_ts_low( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + size_t* ts_lowlen, char** errptr); + +/* BackupEngineOptions */ + +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_options_t* +rocksdb_backup_engine_options_create(const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_backup_dir( + rocksdb_backup_engine_options_t* options, const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_env( + rocksdb_backup_engine_options_t* options, rocksdb_env_t* env); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_share_table_files( + rocksdb_backup_engine_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backup_engine_options_get_share_table_files( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_sync( + rocksdb_backup_engine_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_backup_engine_options_get_sync( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_destroy_old_data( + rocksdb_backup_engine_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backup_engine_options_get_destroy_old_data( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_backup_log_files( + rocksdb_backup_engine_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backup_engine_options_get_backup_log_files( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_backup_rate_limit( + rocksdb_backup_engine_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backup_engine_options_get_backup_rate_limit( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_restore_rate_limit( + rocksdb_backup_engine_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backup_engine_options_get_restore_rate_limit( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_max_background_operations( + rocksdb_backup_engine_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backup_engine_options_get_max_background_operations( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_callback_trigger_interval_size( + rocksdb_backup_engine_options_t* options, uint64_t size); + +extern ROCKSDB_LIBRARY_API uint64_t 
+rocksdb_backup_engine_options_get_callback_trigger_interval_size( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_max_valid_backups_to_open( + rocksdb_backup_engine_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backup_engine_options_get_max_valid_backups_to_open( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_share_files_with_checksum_naming( + rocksdb_backup_engine_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backup_engine_options_get_share_files_with_checksum_naming( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_destroy( + rocksdb_backup_engine_options_t*); + +/* Checkpoint */ + +extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* +rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_create( + rocksdb_checkpoint_t* checkpoint, const char* checkpoint_dir, + uint64_t log_size_for_flush, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_object_destroy( + rocksdb_checkpoint_t* checkpoint); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_and_trim_history( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char* trim_ts, + size_t trim_tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families_with_ttl( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, const int* ttls, + char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* +rocksdb_open_for_read_only_column_families( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, + unsigned char error_if_wal_file_exists, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary_column_families( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API char** rocksdb_list_column_families( + const rocksdb_options_t* options, const char* name, size_t* lencf, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_list_column_families_destroy( + char** list, size_t len); + +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* +rocksdb_create_column_family(rocksdb_t* db, + const rocksdb_options_t* column_family_options, + const char* column_family_name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* 
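The column-family calls above work together: rocksdb_open_column_families must be handed the complete list of existing families (discoverable with rocksdb_list_column_families), while rocksdb_create_column_family adds one to an already-open database. A short sketch with a placeholder path and family names, reusing a single options object for every family:

    #include <rocksdb/c.h>
    #include <stdlib.h>

    /* Open a database with its "default" and "meta" column families, write
       into "meta", then add a third family at runtime. */
    static void column_family_demo(void) {
      char* err = NULL;
      rocksdb_options_t* opts = rocksdb_options_create();
      rocksdb_options_set_create_if_missing(opts, 1);
      rocksdb_options_set_create_missing_column_families(opts, 1);

      const char* names[2] = {"default", "meta"};
      const rocksdb_options_t* cf_opts[2] = {opts, opts};
      rocksdb_column_family_handle_t* handles[2];
      rocksdb_t* db = rocksdb_open_column_families(opts, "/tmp/cf-db", 2, names,
                                                   cf_opts, handles, &err);
      if (err) { free(err); rocksdb_options_destroy(opts); return; }

      rocksdb_writeoptions_t* wo = rocksdb_writeoptions_create();
      rocksdb_put_cf(db, wo, handles[1], "k", 1, "v", 1, &err);
      if (err) { free(err); err = NULL; }
      rocksdb_writeoptions_destroy(wo);

      /* A family can also be added while the database is open. */
      rocksdb_column_family_handle_t* extra =
          rocksdb_create_column_family(db, opts, "extra", &err);
      if (err) { free(err); }
      if (extra) rocksdb_column_family_handle_destroy(extra);

      rocksdb_column_family_handle_destroy(handles[0]);
      rocksdb_column_family_handle_destroy(handles[1]);
      rocksdb_options_destroy(opts);
      rocksdb_close(db);
    }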
+rocksdb_create_column_family_with_ttl(
+    rocksdb_t* db, const rocksdb_options_t* column_family_options,
+    const char* column_family_name, int ttl, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_drop_column_family(
+    rocksdb_t* db, rocksdb_column_family_handle_t* handle, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_column_family_handle_destroy(
+    rocksdb_column_family_handle_t*);
+
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_column_family_handle_get_id(rocksdb_column_family_handle_t* handle);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_column_family_handle_get_name(
+    rocksdb_column_family_handle_t* handle, size_t* name_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_close(rocksdb_t* db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_range_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* start_key,
+    size_t start_key_len, const char* end_key, size_t end_key_len,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_merge(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_merge_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_write(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_t* batch, char** errptr);
+
+/* Returns NULL if not found. A malloc()ed array otherwise.
+   Stores the length of the array in *vallen. */
+extern ROCKSDB_LIBRARY_API char* rocksdb_get(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, size_t* vallen, char** ts, size_t* tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_cf_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** ts, size_t* tslen, char** errptr);
+
+// if values_list[i] == NULL and errs[i] == NULL,
+// then we got status.IsNotFound(), which we will not return.
+// all errors except status.ok() and status.IsNotFound() are returned.
+//
+// errs, values_list and values_list_sizes must be num_keys in length,
+// allocated by the caller.
+// errs is a list of strings as opposed to the conventional one error,
+// where errs[i] is the status for retrieval of keys_list[i].
+// each non-NULL errs entry is a malloc()ed, null terminated string.
+// each non-NULL values_list entry is a malloc()ed array, with
+// the length for each stored in values_list_sizes[i].
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys,
+    const char* const* keys_list, const size_t* keys_list_sizes,
+    char** values_list, size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys,
+    const char* const* keys_list, const size_t* keys_list_sizes,
+    char** values_list, size_t* values_list_sizes, char** timestamp_list,
+    size_t* timestamp_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** timestamps_list,
+    size_t* timestamps_list_sizes, char** errs);
+
+// The MultiGet API improves performance by batching operations
+// in the read path for greater efficiency. Currently, only the block based
+// table format with full filters is supported. Other table formats such
+// as plain table, block based table with block based filters and
+// partitioned indexes will still work, but will not get any performance
+// benefits.
+//
+// Note that all the keys passed to this API are restricted to a single
+// column family.
+//
+// Parameters -
+// db - the RocksDB instance.
+// options - ReadOptions
+// column_family - ColumnFamilyHandle* that the keys belong to. All the keys
+//                 passed to the API are restricted to a single column family
+// num_keys - Number of keys to lookup
+// keys_list - Pointer to C style array of keys with num_keys elements
+// keys_list_sizes - Pointer to C style array of the size of corresponding key
+//   in key_list with num_keys elements.
+// values - Pointer to C style array of PinnableSlices with num_keys elements
+// statuses - Pointer to C style array of Status with num_keys elements
+// sorted_input - If true, it means the input keys are already sorted by key
+//                order, so the MultiGet() API doesn't have to sort them
+//                again. If false, the keys will be copied and sorted
+//                internally by the API - the input array will not be
+//                modified
+extern ROCKSDB_LIBRARY_API void rocksdb_batched_multi_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, size_t num_keys,
+    const char* const* keys_list, const size_t* keys_list_sizes,
+    rocksdb_pinnableslice_t** values, char** errs, const bool sorted_input);
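A small driver for rocksdb_multi_get that follows the ownership contract spelled out above: a NULL value together with a NULL error means the key was not found, and every non-NULL value or error string is malloc()ed and owned by the caller. The keys here are placeholders.

    #include <rocksdb/c.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Look up three keys in one call and classify each result. */
    static void lookup3(rocksdb_t* db, const rocksdb_readoptions_t* ro) {
      const char* keys[3] = {"alpha", "beta", "gamma"};
      const size_t key_sizes[3] = {5, 4, 5};
      char* vals[3];
      size_t val_sizes[3];
      char* errs[3];

      rocksdb_multi_get(db, ro, 3, keys, key_sizes, vals, val_sizes, errs);
      for (int i = 0; i < 3; i++) {
        if (errs[i]) {
          fprintf(stderr, "%s: %s\n", keys[i], errs[i]);
          free(errs[i]); /* per-key error string, caller frees */
        } else if (vals[i] == NULL) {
          printf("%s: not found\n", keys[i]); /* NULL value + NULL error */
        } else {
          printf("%s: %.*s\n", keys[i], (int)val_sizes[i], vals[i]);
          free(vals[i]); /* malloc()ed value, caller frees */
        }
      }
    }

+// The value is only allocated (using malloc) and returned if it is found and
+// value_found isn't NULL. In that case the user is responsible for freeing it.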
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t key_len, char** value, size_t* val_len, const char* timestamp, + size_t timestamp_len, unsigned char* value_found); + +// The value is only allocated (using malloc) and returned if it is found and +// value_found isn't NULL. In that case the user is responsible for freeing it. +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t key_len, char** value, size_t* val_len, const char* timestamp, + size_t timestamp_len, unsigned char* value_found); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator( + rocksdb_t* db, const rocksdb_readoptions_t* options); + +extern ROCKSDB_LIBRARY_API rocksdb_wal_iterator_t* rocksdb_get_updates_since( + rocksdb_t* db, uint64_t seq_number, + const rocksdb_wal_readoptions_t* options, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API void rocksdb_create_iterators( + rocksdb_t* db, rocksdb_readoptions_t* opts, + rocksdb_column_family_handle_t** column_families, + rocksdb_iterator_t** iterators, size_t size, char** errptr); + +extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* rocksdb_create_snapshot( + rocksdb_t* db); + +extern ROCKSDB_LIBRARY_API void rocksdb_release_snapshot( + rocksdb_t* db, const rocksdb_snapshot_t* snapshot); + +/* Returns NULL if property name is unknown. + Else returns a pointer to a malloc()-ed null-terminated value. 
*/ +extern ROCKSDB_LIBRARY_API char* rocksdb_property_value(rocksdb_t* db, + const char* propname); +/* returns 0 on success, -1 otherwise */ +extern ROCKSDB_LIBRARY_API int rocksdb_property_int(rocksdb_t* db, + const char* propname, + uint64_t* out_val); + +/* returns 0 on success, -1 otherwise */ +extern ROCKSDB_LIBRARY_API int rocksdb_property_int_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* propname, uint64_t* out_val); + +extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* propname); + +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes( + rocksdb_t* db, int num_ranges, const char* const* range_start_key, + const size_t* range_start_key_len, const char* const* range_limit_key, + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + int num_ranges, const char* const* range_start_key, + const size_t* range_start_key_len, const char* const* range_limit_key, + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range(rocksdb_t* db, + const char* start_key, + size_t start_key_len, + const char* limit_key, + size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_suggest_compact_range( + rocksdb_t* db, const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_suggest_compact_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_opt( + rocksdb_t* db, rocksdb_compactoptions_t* opt, const char* start_key, + size_t start_key_len, const char* limit_key, size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf_opt( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + rocksdb_compactoptions_t* opt, const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_file(rocksdb_t* db, + const char* name); + +extern ROCKSDB_LIBRARY_API const rocksdb_livefiles_t* rocksdb_livefiles( + rocksdb_t* db); + +extern ROCKSDB_LIBRARY_API void rocksdb_flush( + rocksdb_t* db, const rocksdb_flushoptions_t* options, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_flush_cf( + rocksdb_t* db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t* column_family, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_flush_cfs( + rocksdb_t* db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t** column_family, int num_column_families, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_flush_wal(rocksdb_t* db, + unsigned char sync, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_disable_file_deletions(rocksdb_t* db, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_enable_file_deletions( + rocksdb_t* db, unsigned char force, char** errptr); + +/* Management operations */ + +extern 
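The property getters above come in string and integer flavours. A sketch using two standard property names, rocksdb.stats and rocksdb.estimate-num-keys; unknown names yield NULL for the string form and -1 for the integer form:

    #include <rocksdb/c.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Query one string-valued and one integer-valued property. */
    static void show_properties(rocksdb_t* db) {
      char* stats = rocksdb_property_value(db, "rocksdb.stats");
      if (stats) {
        printf("%s\n", stats);
        free(stats); /* malloc()ed, caller frees */
      }
      uint64_t n = 0;
      if (rocksdb_property_int(db, "rocksdb.estimate-num-keys", &n) == 0) {
        printf("~%llu keys\n", (unsigned long long)n);
      }
    }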
ROCKSDB_LIBRARY_API void rocksdb_destroy_db( + const rocksdb_options_t* options, const char* name, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_repair_db( + const rocksdb_options_t* options, const char* name, char** errptr); + +/* Iterator */ + +extern ROCKSDB_LIBRARY_API void rocksdb_iter_destroy(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_iter_valid( + const rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_first(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_last(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek(rocksdb_iterator_t*, + const char* k, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_for_prev(rocksdb_iterator_t*, + const char* k, + size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_next(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_prev(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_key( + const rocksdb_iterator_t*, size_t* klen); +extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_value( + const rocksdb_iterator_t*, size_t* vlen); +extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_timestamp( + const rocksdb_iterator_t*, size_t* tslen); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_get_error( + const rocksdb_iterator_t*, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_next( + rocksdb_wal_iterator_t* iter); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_wal_iter_valid( + const rocksdb_wal_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_status( + const rocksdb_wal_iterator_t* iter, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_wal_iter_get_batch( + const rocksdb_wal_iterator_t* iter, uint64_t* seq); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_get_latest_sequence_number(rocksdb_t* db); +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_destroy( + const rocksdb_wal_iterator_t* iter); + +/* Write batch */ + +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create( + void); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create_from( + const char* rep, size_t size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_destroy( + rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_clear(rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_count(rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put(rocksdb_writebatch_t*, + const char* key, + size_t klen, + const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_cf( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_cf_with_ts( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* ts, size_t tslen, const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv( + rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + 
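The iterator calls above compose into the usual scan loop. Two details worth showing in code: rocksdb_iter_key and rocksdb_iter_value return borrowed slices that stay valid only until the next iterator operation (so they are never free()d), and rocksdb_iter_get_error is what separates normal exhaustion from a failed read.

    #include <rocksdb/c.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Full forward scan of the default column family. */
    static void scan_all(rocksdb_t* db, const rocksdb_readoptions_t* ro) {
      rocksdb_iterator_t* it = rocksdb_create_iterator(db, ro);
      for (rocksdb_iter_seek_to_first(it); rocksdb_iter_valid(it);
           rocksdb_iter_next(it)) {
        size_t klen, vlen;
        const char* k = rocksdb_iter_key(it, &klen);   /* borrowed slice */
        const char* v = rocksdb_iter_value(it, &vlen); /* borrowed slice */
        printf("%.*s => %.*s\n", (int)klen, k, (int)vlen, v);
      }
      char* err = NULL;
      rocksdb_iter_get_error(it, &err); /* end-of-data vs. real failure */
      if (err) { fprintf(stderr, "scan: %s\n", err); free(err); }
      rocksdb_iter_destroy(it);
    }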
const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge(rocksdb_writebatch_t*, + const char* key, + size_t klen, + const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge_cf( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev( + rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete(rocksdb_writebatch_t*, + const char* key, + size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete( + rocksdb_writebatch_t* b, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf_with_ts( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* ts, size_t tslen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete_cf_with_ts( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* ts, size_t tslen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev( + rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range( + rocksdb_writebatch_t* b, const char* start_key, size_t start_key_len, + const char* end_key, size_t end_key_len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* end_key, + size_t end_key_len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev( + rocksdb_writebatch_t* b, int num_keys, const char* const* start_keys_list, + const size_t* start_keys_list_sizes, const char* const* end_keys_list, + const size_t* end_keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* start_keys_list, + const size_t* start_keys_list_sizes, const char* const* end_keys_list, + const size_t* end_keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_log_data( + rocksdb_writebatch_t*, const char* blob, size_t len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate( + rocksdb_writebatch_t*, void* state, + void (*put)(void*, const char* k, 
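A write batch collects several of the mutations above and applies them atomically in a single rocksdb_write call. A minimal sketch with placeholder keys; the half-open range delete covers every key with the tmp/ prefix:

    #include <rocksdb/c.h>
    #include <stdlib.h>

    /* Two puts, a point delete, and a range delete, committed as one unit. */
    static void apply_batch(rocksdb_t* db, const rocksdb_writeoptions_t* wo) {
      rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
      rocksdb_writebatch_put(wb, "k1", 2, "v1", 2);
      rocksdb_writebatch_put(wb, "k2", 2, "v2", 2);
      rocksdb_writebatch_delete(wb, "old", 3);
      /* [ "tmp/", "tmp0" ) : '0' is the byte after '/', so this spans the
         whole "tmp/" prefix. */
      rocksdb_writebatch_delete_range(wb, "tmp/", 4, "tmp0", 4);

      char* err = NULL;
      rocksdb_write(db, wo, wb, &err); /* all-or-nothing */
      if (err) free(err);
      rocksdb_writebatch_destroy(wb);
    }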
size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen)); +extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_data( + rocksdb_writebatch_t*, size_t* size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_set_save_point( + rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_rollback_to_save_point( + rocksdb_writebatch_t*, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_pop_save_point( + rocksdb_writebatch_t*, char** errptr); + +/* Write batch with index */ + +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* +rocksdb_writebatch_wi_create(size_t reserved_bytes, + unsigned char overwrite_keys); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* +rocksdb_writebatch_wi_create_from(const char* rep, size_t size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_destroy( + rocksdb_writebatch_wi_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_clear( + rocksdb_writebatch_wi_t*); +extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_wi_count( + rocksdb_writebatch_wi_t* b); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put( + rocksdb_writebatch_wi_t*, const char* key, size_t klen, const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv( + rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge( + rocksdb_writebatch_wi_t*, const char* key, size_t klen, const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev( + rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete( + rocksdb_writebatch_wi_t*, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete( + rocksdb_writebatch_wi_t*, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void 
rocksdb_writebatch_wi_deletev( + rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes); +// DO NOT USE - rocksdb_writebatch_wi_delete_range is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range( + rocksdb_writebatch_wi_t* b, const char* start_key, size_t start_key_len, + const char* end_key, size_t end_key_len); +// DO NOT USE - rocksdb_writebatch_wi_delete_range_cf is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* end_key, + size_t end_key_len); +// DO NOT USE - rocksdb_writebatch_wi_delete_rangev is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev( + rocksdb_writebatch_wi_t* b, int num_keys, + const char* const* start_keys_list, const size_t* start_keys_list_sizes, + const char* const* end_keys_list, const size_t* end_keys_list_sizes); +// DO NOT USE - rocksdb_writebatch_wi_delete_rangev_cf is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* start_keys_list, + const size_t* start_keys_list_sizes, const char* const* end_keys_list, + const size_t* end_keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_log_data( + rocksdb_writebatch_wi_t*, const char* blob, size_t len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_iterate( + rocksdb_writebatch_wi_t* b, void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen)); +extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_wi_data( + rocksdb_writebatch_wi_t* b, size_t* size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_set_save_point( + rocksdb_writebatch_wi_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_rollback_to_save_point( + rocksdb_writebatch_wi_t*, char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch( + rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options, + const char* key, size_t keylen, size_t* vallen, char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_cf( + rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db( + rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, + const rocksdb_readoptions_t* options, const char* key, size_t keylen, + size_t* vallen, char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db_cf( + rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_write_writebatch_wi( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_writebatch_wi_t* wbwi, char** errptr); +extern ROCKSDB_LIBRARY_API 
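What the indexed write batch adds over a plain one is read-your-own-writes: rocksdb_writebatch_wi_get_from_batch_and_db consults the uncommitted batch first and falls back to the database. A sketch:

    #include <rocksdb/c.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Stage an update, observe it before commit, then commit it. */
    static void read_your_writes(rocksdb_t* db, const rocksdb_readoptions_t* ro,
                                 const rocksdb_writeoptions_t* wo) {
      /* reserved_bytes = 0, overwrite_keys = 1 */
      rocksdb_writebatch_wi_t* wi = rocksdb_writebatch_wi_create(0, 1);
      rocksdb_writebatch_wi_put(wi, "color", 5, "blue", 4);

      char* err = NULL;
      size_t len;
      char* val = rocksdb_writebatch_wi_get_from_batch_and_db(
          wi, db, ro, "color", 5, &len, &err);
      if (val) { printf("pending: %.*s\n", (int)len, val); free(val); }
      if (err) { free(err); err = NULL; }

      rocksdb_write_writebatch_wi(db, wo, wi, &err); /* commit the batch */
      if (err) free(err);
      rocksdb_writebatch_wi_destroy(wi);
    }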
rocksdb_iterator_t* +rocksdb_writebatch_wi_create_iterator_with_base( + rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator); +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_writebatch_wi_create_iterator_with_base_cf( + rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator, + rocksdb_column_family_handle_t* cf); + +/* Options utils */ + +// Load the latest rocksdb options from the specified db_path. +// +// On success, num_column_families will be updated with a non-zero +// number indicating the number of column families. +// The returned db_options, column_family_names, and column_family_options +// should be released via rocksdb_load_latest_options_destroy(). +// +// On error, a non-null errptr that includes the error message will be +// returned. db_options, column_family_names, and column_family_options +// will be set to NULL. +extern ROCKSDB_LIBRARY_API void rocksdb_load_latest_options( + const char* db_path, rocksdb_env_t* env, bool ignore_unknown_options, + rocksdb_cache_t* cache, rocksdb_options_t** db_options, + size_t* num_column_families, char*** column_family_names, + rocksdb_options_t*** column_family_options, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_load_latest_options_destroy( + rocksdb_options_t* db_options, char** list_column_family_names, + rocksdb_options_t** list_column_family_options, size_t len); + +/* Block based table options */ + +extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t* +rocksdb_block_based_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_destroy( + rocksdb_block_based_table_options_t* options); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_checksum( + rocksdb_block_based_table_options_t*, char); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_size( + rocksdb_block_based_table_options_t* options, size_t block_size); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_block_size_deviation( + rocksdb_block_based_table_options_t* options, int block_size_deviation); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_block_restart_interval( + rocksdb_block_based_table_options_t* options, int block_restart_interval); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_index_block_restart_interval( + rocksdb_block_based_table_options_t* options, + int index_block_restart_interval); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_metadata_block_size( + rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_partition_filters( + rocksdb_block_based_table_options_t* options, + unsigned char partition_filters); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_optimize_filters_for_memory( + rocksdb_block_based_table_options_t* options, + unsigned char optimize_filters_for_memory); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_use_delta_encoding( + rocksdb_block_based_table_options_t* options, + unsigned char use_delta_encoding); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_filter_policy( + rocksdb_block_based_table_options_t* options, + rocksdb_filterpolicy_t* filter_policy); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_no_block_cache( + rocksdb_block_based_table_options_t* options, unsigned char no_block_cache); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_cache( + 
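A sketch of the load/destroy pairing documented above, assuming rocksdb_create_default_env, rocksdb_cache_create_lru, and their destroy counterparts as declared elsewhere in this header:

    #include <rocksdb/c.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Recover the options an existing database was last opened with and
       report how many column families it has. */
    static void inspect_db(const char* db_path) {
      rocksdb_env_t* env = rocksdb_create_default_env();
      rocksdb_cache_t* cache = rocksdb_cache_create_lru(1 << 20);
      char* err = NULL;
      rocksdb_options_t* db_opts = NULL;
      size_t num_cfs = 0;
      char** cf_names = NULL;
      rocksdb_options_t** cf_opts = NULL;

      rocksdb_load_latest_options(db_path, env, false, cache, &db_opts,
                                  &num_cfs, &cf_names, &cf_opts, &err);
      if (err) {
        fprintf(stderr, "%s\n", err); /* outputs are NULL on error */
        free(err);
      } else {
        printf("%zu column families\n", num_cfs);
        rocksdb_load_latest_options_destroy(db_opts, cf_names, cf_opts,
                                            num_cfs);
      }
      rocksdb_cache_destroy(cache);
      rocksdb_env_destroy(env);
    }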
rocksdb_block_based_table_options_t* options, rocksdb_cache_t* block_cache); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_whole_key_filtering( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_format_version( + rocksdb_block_based_table_options_t*, int); +enum { + rocksdb_block_based_table_index_type_binary_search = 0, + rocksdb_block_based_table_index_type_hash_search = 1, + rocksdb_block_based_table_index_type_two_level_index_search = 2, +}; +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_index_type( + rocksdb_block_based_table_options_t*, int); // uses one of the above enums +enum { + rocksdb_block_based_table_data_block_index_type_binary_search = 0, + rocksdb_block_based_table_data_block_index_type_binary_search_and_hash = 1, +}; +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_data_block_index_type( + rocksdb_block_based_table_options_t*, int); // uses one of the above enums +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_data_block_hash_ratio( + rocksdb_block_based_table_options_t* options, double v); +// rocksdb_block_based_options_set_hash_index_allow_collision() +// is removed since BlockBasedTableOptions.hash_index_allow_collision() +// is removed +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_cache_index_and_filter_blocks( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_pin_top_level_index_and_filter( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory( + rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options); + +/* Cuckoo table options */ + +extern ROCKSDB_LIBRARY_API rocksdb_cuckoo_table_options_t* +rocksdb_cuckoo_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_destroy( + rocksdb_cuckoo_table_options_t* options); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_hash_ratio( + rocksdb_cuckoo_table_options_t* options, double v); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_max_search_depth( + rocksdb_cuckoo_table_options_t* options, uint32_t v); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_cuckoo_block_size( + rocksdb_cuckoo_table_options_t* options, uint32_t v); +extern ROCKSDB_LIBRARY_API void +rocksdb_cuckoo_options_set_identity_as_first_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_use_module_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_cuckoo_table_factory( + rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options); + +/* Options */ +extern ROCKSDB_LIBRARY_API void rocksdb_set_options(rocksdb_t* db, int count, + const char* const keys[], + const char* const values[], + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_set_options_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, + const char* const keys[], const char* const values[], char** 
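The block-based table knobs above are usually configured as a bundle and then installed with rocksdb_options_set_block_based_table_factory. The block size, cache capacity, and 10.0 bits-per-key Bloom figure below are illustrative only; note that recent releases take bits per key as a double, while older ones took an int.

    #include <rocksdb/c.h>

    /* Build options with a tuned block-based table: bigger blocks, a Bloom
       filter, and a shared LRU block cache that also holds index and filter
       blocks. */
    static rocksdb_options_t* tuned_options(void) {
      rocksdb_block_based_table_options_t* bbto =
          rocksdb_block_based_options_create();
      rocksdb_block_based_options_set_block_size(bbto, 16 * 1024);
      rocksdb_block_based_options_set_filter_policy(
          bbto, rocksdb_filterpolicy_create_bloom(10.0));
      rocksdb_cache_t* cache = rocksdb_cache_create_lru(64 << 20);
      rocksdb_block_based_options_set_block_cache(bbto, cache);
      rocksdb_block_based_options_set_cache_index_and_filter_blocks(bbto, 1);

      rocksdb_options_t* opts = rocksdb_options_create();
      rocksdb_options_set_block_based_table_factory(opts, bbto);

      /* The factory copies what it needs; the setup objects can go. */
      rocksdb_block_based_options_destroy(bbto);
      rocksdb_cache_destroy(cache);
      return opts;
    }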
errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create_copy( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_increase_parallelism( + rocksdb_options_t* opt, int total_threads); +extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_for_point_lookup( + rocksdb_options_t* opt, uint64_t block_cache_size_mb); +extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_level_style_compaction( + rocksdb_options_t* opt, uint64_t memtable_memory_budget); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_optimize_universal_style_compaction( + rocksdb_options_t* opt, uint64_t memtable_memory_budget); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_ingest_behind( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_allow_ingest_behind(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter( + rocksdb_options_t*, rocksdb_compactionfilter_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter_factory( + rocksdb_options_t*, rocksdb_compactionfilterfactory_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_compaction_readahead_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_compaction_readahead_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_comparator( + rocksdb_options_t*, rocksdb_comparator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_merge_operator( + rocksdb_options_t*, rocksdb_mergeoperator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_uint64add_merge_operator( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_per_level( + rocksdb_options_t* opt, const int* level_values, size_t num_levels); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_create_if_missing( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_create_if_missing( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_create_missing_column_families(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_create_missing_column_families(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_error_if_exists( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_error_if_exists( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_paranoid_checks( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_paranoid_checks( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths( + rocksdb_options_t*, const rocksdb_dbpath_t** path_values, size_t num_paths); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_env(rocksdb_options_t*, + rocksdb_env_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log(rocksdb_options_t*, + rocksdb_logger_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log_level( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_info_log_level( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t 
+rocksdb_options_get_write_buffer_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_write_buffer_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_db_write_buffer_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_open_files( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_open_files( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_file_opening_threads( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_file_opening_threads( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_total_wal_size( + rocksdb_options_t* opt, uint64_t n); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options( + rocksdb_options_t*, int, int, int, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_zstd_max_train_bytes(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_use_zstd_dict_trainer( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_compression_options_use_zstd_dict_trainer( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_parallel_threads(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_parallel_threads( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options(rocksdb_options_t*, int, int, + int, int, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes( + rocksdb_options_t*, int, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_use_zstd_dict_trainer( + rocksdb_options_t*, unsigned char, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_bottommost_compression_options_use_zstd_dict_trainer( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor( + rocksdb_options_t*, rocksdb_slicetransform_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_num_levels( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_num_levels( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_level0_file_num_compaction_trigger(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_level0_file_num_compaction_trigger(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void 
rocksdb_options_set_level0_stop_writes_trigger( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_level0_stop_writes_trigger( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_base( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_target_file_size_base(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_target_file_size_multiplier( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_level_compaction_dynamic_level_bytes(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double +rocksdb_options_get_max_bytes_for_level_multiplier(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_bytes_for_level_multiplier_additional( + rocksdb_options_t*, int* level_values, size_t num_levels); +extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, + unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_skip_stats_update_on_db_open(rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt); + +/* Blob Options Settings */ +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_files( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_blob_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_min_blob_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_blob_file_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_compression_type( + rocksdb_options_t* opt, int val); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_compression_type( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_gc( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_gc( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_age_cutoff( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_age_cutoff( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_force_threshold( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double 
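The blob settings above divert large values into separate blob files instead of the LSM tree. A sketch with illustrative thresholds, not recommendations:

    #include <rocksdb/c.h>

    /* Store values of 4 KiB and up in ZSTD-compressed blob files, with
       garbage collection of the oldest quarter of those files. */
    static void enable_blobs(rocksdb_options_t* opts) {
      rocksdb_options_set_enable_blob_files(opts, 1);
      rocksdb_options_set_min_blob_size(opts, 4096);       /* >= 4 KiB */
      rocksdb_options_set_blob_file_size(opts, 256 << 20); /* 256 MiB files */
      rocksdb_options_set_blob_compression_type(opts, rocksdb_zstd_compression);
      rocksdb_options_set_enable_blob_gc(opts, 1);
      rocksdb_options_set_blob_gc_age_cutoff(opts, 0.25);  /* oldest 25% */
    }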
rocksdb_options_get_blob_gc_force_threshold( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt, + uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_blob_compaction_readahead_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_starting_level( + rocksdb_options_t* opt, int val); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_file_starting_level( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_cache( + rocksdb_options_t* opt, rocksdb_cache_t* blob_cache); + +enum { + rocksdb_prepopulate_blob_disable = 0, + rocksdb_prepopulate_blob_flush_only = 1 +}; + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prepopulate_blob_cache( + rocksdb_options_t* opt, int val); + +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_prepopulate_blob_cache( + rocksdb_options_t* opt); + +/* returns a pointer to a malloc()-ed, null terminated string */ +extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_write_buffer_number( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_min_write_buffer_number_to_merge(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_max_write_buffer_number_to_maintain(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*, + int64_t); +extern ROCKSDB_LIBRARY_API int64_t +rocksdb_options_get_max_write_buffer_size_to_maintain(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_enable_pipelined_write(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_unordered_write( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_unordered_write( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions( + rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_options_get_max_subcompactions(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_jobs( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_compactions( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_compactions( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_flushes( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_flushes( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_log_file_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_log_file_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll( 
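rocksdb_options_statistics_get_string above pairs with rocksdb_options_enable_statistics: counters only accumulate if statistics were enabled on the options before the database was opened. A sketch:

    #include <rocksdb/c.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Dump accumulated counters, assuming rocksdb_options_enable_statistics()
       was called on `opts` before rocksdb_open(). */
    static void dump_statistics(rocksdb_options_t* opts) {
      char* s = rocksdb_options_statistics_get_string(opts);
      if (s) {
        puts(s);
        free(s); /* malloc()ed, null terminated, caller frees */
      }
    }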
+ rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_keep_log_file_num(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_recycle_log_file_num(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt, + size_t v); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt, + size_t v); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_manifest_file_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_manifest_file_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_table_cache_numshardbits( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_table_cache_numshardbits( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_arena_block_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_arena_block_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_fsync( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_use_fsync( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_log_dir( + rocksdb_options_t*, const char*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_dir(rocksdb_options_t*, + const char*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_ttl_seconds( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_size_limit_MB( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manifest_preallocation_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_reads( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_writes( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_reads( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_direct_reads( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_use_direct_io_for_flush_and_compaction(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_use_direct_io_for_flush_and_compaction(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void 
rocksdb_options_set_is_fd_close_on_exec( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_dump_period_sec( + rocksdb_options_t*, unsigned int); +extern ROCKSDB_LIBRARY_API unsigned int +rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_persist_period_sec( + rocksdb_options_t*, unsigned int); +extern ROCKSDB_LIBRARY_API unsigned int +rocksdb_options_get_stats_persist_period_sec(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_advise_random_on_open( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_advise_random_on_open(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_access_hint_on_compaction_start(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_adaptive_mutex( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_adaptive_mutex( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_bytes_per_sync(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_bytes_per_sync( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_writable_file_max_buffer_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_allow_concurrent_memtable_write(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_enable_write_thread_adaptive_yield(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_enable_write_thread_adaptive_yield(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_sequential_skip_in_iterations(rocksdb_options_t*, + uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_sequential_skip_in_iterations(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_disable_auto_compactions( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_disable_auto_compactions(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_optimize_filters_for_hits( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_optimize_filters_for_hits(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_delete_obsolete_files_period_micros(rocksdb_options_t*, + uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_delete_obsolete_files_period_micros(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_prepare_for_bulk_load( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_vector_rep( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void 
+rocksdb_options_set_memtable_prefix_bloom_size_ratio(rocksdb_options_t*, + double); +extern ROCKSDB_LIBRARY_API double +rocksdb_options_get_memtable_prefix_bloom_size_ratio(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_compaction_bytes( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_compaction_bytes(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_skip_list_rep( + rocksdb_options_t*, size_t, int32_t, int32_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_link_list_rep( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_plain_table_factory( + rocksdb_options_t*, uint32_t, int, double, size_t); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_level_to_compress( + rocksdb_options_t* opt, int level); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_huge_page_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t*); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_successive_merges( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_successive_merges(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bloom_locality( + rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_options_get_bloom_locality(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_support( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_inplace_update_support(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_num_locks( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_report_bg_io_stats( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_report_bg_io_stats( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_avoid_unnecessary_blocking_io(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_avoid_unnecessary_blocking_io(rocksdb_options_t*); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_experimental_mempurge_threshold(rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double +rocksdb_options_get_experimental_mempurge_threshold(rocksdb_options_t*); + +enum { + rocksdb_tolerate_corrupted_tail_records_recovery = 0, + rocksdb_absolute_consistency_recovery = 1, + rocksdb_point_in_time_recovery = 2, + rocksdb_skip_any_corrupted_records_recovery = 3 +}; +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_recovery_mode( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_recovery_mode( + rocksdb_options_t*); + +enum { + rocksdb_no_compression = 0, + rocksdb_snappy_compression = 1, + rocksdb_zlib_compression = 2, + rocksdb_bz2_compression = 3, + rocksdb_lz4_compression = 4, + rocksdb_lz4hc_compression = 5, + rocksdb_xpress_compression = 6, + rocksdb_zstd_compression = 7 +}; +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compression( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bottommost_compression( + rocksdb_options_t*, int); +extern 
ROCKSDB_LIBRARY_API int rocksdb_options_get_bottommost_compression( + rocksdb_options_t*); + +enum { + rocksdb_level_compaction = 0, + rocksdb_universal_compaction = 1, + rocksdb_fifo_compaction = 2 +}; +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_style( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compaction_style( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_universal_compaction_options( + rocksdb_options_t*, rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_fifo_compaction_options( + rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_ratelimiter( + rocksdb_options_t* opt, rocksdb_ratelimiter_t* limiter); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_atomic_flush( + rocksdb_options_t* opt, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_atomic_flush( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache( + rocksdb_options_t* opt, rocksdb_cache_t* cache); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t*, size_t window_size, size_t num_dels_trigger); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush( + rocksdb_options_t* opt, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_manual_wal_flush( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_compression( + rocksdb_options_t* opt, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_compression( + rocksdb_options_t* opt); + +/* RateLimiter */ +extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( + int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness); +extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy( + rocksdb_ratelimiter_t*); + +/* PerfContext */ +enum { + rocksdb_uninitialized = 0, + rocksdb_disable = 1, + rocksdb_enable_count = 2, + rocksdb_enable_time_except_for_mutex = 3, + rocksdb_enable_time = 4, + rocksdb_out_of_bounds = 5 +}; + +enum { + rocksdb_user_key_comparison_count = 0, + rocksdb_block_cache_hit_count, + rocksdb_block_read_count, + rocksdb_block_read_byte, + rocksdb_block_read_time, + rocksdb_block_checksum_time, + rocksdb_block_decompress_time, + rocksdb_get_read_bytes, + rocksdb_multiget_read_bytes, + rocksdb_iter_read_bytes, + rocksdb_internal_key_skipped_count, + rocksdb_internal_delete_skipped_count, + rocksdb_internal_recent_skipped_count, + rocksdb_internal_merge_count, + rocksdb_get_snapshot_time, + rocksdb_get_from_memtable_time, + rocksdb_get_from_memtable_count, + rocksdb_get_post_process_time, + rocksdb_get_from_output_files_time, + rocksdb_seek_on_memtable_time, + rocksdb_seek_on_memtable_count, + rocksdb_next_on_memtable_count, + rocksdb_prev_on_memtable_count, + rocksdb_seek_child_seek_time, + rocksdb_seek_child_seek_count, + rocksdb_seek_min_heap_time, + rocksdb_seek_max_heap_time, + rocksdb_seek_internal_seek_time, + rocksdb_find_next_user_entry_time, + rocksdb_write_wal_time, + rocksdb_write_memtable_time, + rocksdb_write_delay_time, + rocksdb_write_pre_and_post_process_time, + rocksdb_db_mutex_lock_nanos, + rocksdb_db_condition_wait_nanos, + rocksdb_merge_operator_time_nanos, + rocksdb_read_index_block_nanos, + rocksdb_read_filter_block_nanos, + rocksdb_new_table_block_iter_nanos, + rocksdb_new_table_iterator_nanos, + 
rocksdb_block_seek_nanos, + rocksdb_find_table_nanos, + rocksdb_bloom_memtable_hit_count, + rocksdb_bloom_memtable_miss_count, + rocksdb_bloom_sst_hit_count, + rocksdb_bloom_sst_miss_count, + rocksdb_key_lock_wait_time, + rocksdb_key_lock_wait_count, + rocksdb_env_new_sequential_file_nanos, + rocksdb_env_new_random_access_file_nanos, + rocksdb_env_new_writable_file_nanos, + rocksdb_env_reuse_writable_file_nanos, + rocksdb_env_new_random_rw_file_nanos, + rocksdb_env_new_directory_nanos, + rocksdb_env_file_exists_nanos, + rocksdb_env_get_children_nanos, + rocksdb_env_get_children_file_attributes_nanos, + rocksdb_env_delete_file_nanos, + rocksdb_env_create_dir_nanos, + rocksdb_env_create_dir_if_missing_nanos, + rocksdb_env_delete_dir_nanos, + rocksdb_env_get_file_size_nanos, + rocksdb_env_get_file_modification_time_nanos, + rocksdb_env_rename_file_nanos, + rocksdb_env_link_file_nanos, + rocksdb_env_lock_file_nanos, + rocksdb_env_unlock_file_nanos, + rocksdb_env_new_logger_nanos, + rocksdb_number_async_seek, + rocksdb_blob_cache_hit_count, + rocksdb_blob_read_count, + rocksdb_blob_read_byte, + rocksdb_blob_read_time, + rocksdb_blob_checksum_time, + rocksdb_blob_decompress_time, + rocksdb_internal_range_del_reseek_count, + rocksdb_block_read_cpu_time, + rocksdb_total_metric_count = 79 +}; + +extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int); +extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset( + rocksdb_perfcontext_t* context); +extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report( + rocksdb_perfcontext_t* context, unsigned char exclude_zero_counters); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context, int metric); +extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_destroy( + rocksdb_perfcontext_t* context); + +/* Compaction Filter */ + +extern ROCKSDB_LIBRARY_API rocksdb_compactionfilter_t* +rocksdb_compactionfilter_create( + void* state, void (*destructor)(void*), + unsigned char (*filter)(void*, int level, const char* key, + size_t key_length, const char* existing_value, + size_t value_length, char** new_value, + size_t* new_value_length, + unsigned char* value_changed), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_set_ignore_snapshots( + rocksdb_compactionfilter_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_destroy( + rocksdb_compactionfilter_t*); + +/* Compaction Filter Context */ + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactionfiltercontext_is_full_compaction( + rocksdb_compactionfiltercontext_t* context); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactionfiltercontext_is_manual_compaction( + rocksdb_compactionfiltercontext_t* context); + +/* Compaction Filter Factory */ + +extern ROCKSDB_LIBRARY_API rocksdb_compactionfilterfactory_t* +rocksdb_compactionfilterfactory_create( + void* state, void (*destructor)(void*), + rocksdb_compactionfilter_t* (*create_compaction_filter)( + void*, rocksdb_compactionfiltercontext_t* context), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilterfactory_destroy( + rocksdb_compactionfilterfactory_t*); + +/* Comparator */ + +extern ROCKSDB_LIBRARY_API rocksdb_comparator_t* rocksdb_comparator_create( + void* state, void (*destructor)(void*), + int (*compare)(void*, const char* a, size_t alen, const char* b, + size_t blen), + const char* (*name)(void*)); +extern 
ROCKSDB_LIBRARY_API void rocksdb_comparator_destroy( + rocksdb_comparator_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_comparator_t* +rocksdb_comparator_with_ts_create( + void* state, void (*destructor)(void*), + int (*compare)(void*, const char* a, size_t alen, const char* b, + size_t blen), + int (*compare_ts)(void*, const char* a_ts, size_t a_tslen, const char* b_ts, + size_t b_tslen), + int (*compare_without_ts)(void*, const char* a, size_t alen, + unsigned char a_has_ts, const char* b, + size_t blen, unsigned char b_has_ts), + const char* (*name)(void*), size_t timestamp_size); + +/* Filter policy */ + +extern ROCKSDB_LIBRARY_API void rocksdb_filterpolicy_destroy( + rocksdb_filterpolicy_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_bloom(double bits_per_key); +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_bloom_full(double bits_per_key); +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_ribbon(double bloom_equivalent_bits_per_key); +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_ribbon_hybrid(double bloom_equivalent_bits_per_key, + int bloom_before_level); + +/* Merge Operator */ + +extern ROCKSDB_LIBRARY_API rocksdb_mergeoperator_t* +rocksdb_mergeoperator_create( + void* state, void (*destructor)(void*), + char* (*full_merge)(void*, const char* key, size_t key_length, + const char* existing_value, + size_t existing_value_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + char* (*partial_merge)(void*, const char* key, size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + void (*delete_value)(void*, const char* value, size_t value_length), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_mergeoperator_destroy( + rocksdb_mergeoperator_t*); + +/* Read options */ + +extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_destroy( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_verify_checksums( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_verify_checksums(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_fill_cache( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_fill_cache( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_snapshot( + rocksdb_readoptions_t*, const rocksdb_snapshot_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_upper_bound( + rocksdb_readoptions_t*, const char* key, size_t keylen); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_lower_bound( + rocksdb_readoptions_t*, const char* key, size_t keylen); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_read_tier( + rocksdb_readoptions_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_readoptions_get_read_tier( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_tailing( + rocksdb_readoptions_t*); +// The functionality that this option controlled has been 
removed. +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_managed( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size( + rocksdb_readoptions_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_prefix_same_as_start( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_prefix_same_as_start(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_pin_data( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_pin_data( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_total_order_seek( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_total_order_seek(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_readoptions_set_max_skippable_internal_keys(rocksdb_readoptions_t*, + uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_max_skippable_internal_keys(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_readoptions_set_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_ignore_range_deletions( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_ignore_range_deletions(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_deadline( + rocksdb_readoptions_t*, uint64_t microseconds); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_deadline(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_io_timeout( + rocksdb_readoptions_t*, uint64_t microseconds); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_async_io( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_async_io( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_timestamp( + rocksdb_readoptions_t*, const char* ts, size_t tslen); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iter_start_ts( + rocksdb_readoptions_t*, const char* ts, size_t tslen); + +/* Write options */ + +extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* rocksdb_writeoptions_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_destroy( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_sync( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_sync( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_disable_WAL( + rocksdb_writeoptions_t* opt, int disable); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_disable_WAL( + rocksdb_writeoptions_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_writeoptions_set_ignore_missing_column_families(rocksdb_writeoptions_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_writeoptions_get_ignore_missing_column_families( + rocksdb_writeoptions_t*); +extern 
ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_no_slowdown( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_no_slowdown( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_low_pri( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_low_pri( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_writeoptions_set_memtable_insert_hint_per_batch(rocksdb_writeoptions_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_writeoptions_get_memtable_insert_hint_per_batch( + rocksdb_writeoptions_t*); + +/* Compact range options */ + +extern ROCKSDB_LIBRARY_API rocksdb_compactoptions_t* +rocksdb_compactoptions_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_destroy( + rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_compactoptions_set_exclusive_manual_compaction( + rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_exclusive_manual_compaction( + rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_compactoptions_set_bottommost_level_compaction( + rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_bottommost_level_compaction( + rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_change_level( + rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_change_level(rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_target_level( + rocksdb_compactoptions_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_compactoptions_get_target_level( + rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_full_history_ts_low( + rocksdb_compactoptions_t*, char* ts, size_t tslen); + +/* Flush options */ + +extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* rocksdb_flushoptions_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_destroy( + rocksdb_flushoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait( + rocksdb_flushoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_flushoptions_get_wait( + rocksdb_flushoptions_t*); + +/* Memory allocator */ + +extern ROCKSDB_LIBRARY_API rocksdb_memory_allocator_t* +rocksdb_jemalloc_nodump_allocator_create(char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_allocator_destroy( + rocksdb_memory_allocator_t*); + +/* Cache */ + +extern ROCKSDB_LIBRARY_API rocksdb_lru_cache_options_t* +rocksdb_lru_cache_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_destroy( + rocksdb_lru_cache_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_capacity( + rocksdb_lru_cache_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_num_shard_bits( + rocksdb_lru_cache_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_memory_allocator( + rocksdb_lru_cache_options_t*, rocksdb_memory_allocator_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru( + size_t capacity); +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* +rocksdb_cache_create_lru_with_strict_capacity_limit(size_t capacity); +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru_opts( + const 
rocksdb_lru_cache_options_t*); + +extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_disown_data( + rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_set_capacity( + rocksdb_cache_t* cache, size_t capacity); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_capacity(const rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_usage(const rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_pinned_usage(const rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_table_address_count(const rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_occupancy_count(const rocksdb_cache_t* cache); + +/* HyperClockCache */ + +extern ROCKSDB_LIBRARY_API rocksdb_hyper_clock_cache_options_t* +rocksdb_hyper_clock_cache_options_create(size_t capacity, + size_t estimated_entry_charge); +extern ROCKSDB_LIBRARY_API void rocksdb_hyper_clock_cache_options_destroy( + rocksdb_hyper_clock_cache_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_hyper_clock_cache_options_set_capacity( + rocksdb_hyper_clock_cache_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void +rocksdb_hyper_clock_cache_options_set_estimated_entry_charge( + rocksdb_hyper_clock_cache_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void +rocksdb_hyper_clock_cache_options_set_num_shard_bits( + rocksdb_hyper_clock_cache_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_hyper_clock_cache_options_set_memory_allocator( + rocksdb_hyper_clock_cache_options_t*, rocksdb_memory_allocator_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_hyper_clock( + size_t capacity, size_t estimated_entry_charge); +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* +rocksdb_cache_create_hyper_clock_opts( + const rocksdb_hyper_clock_cache_options_t*); + +/* DBPath */ + +extern ROCKSDB_LIBRARY_API rocksdb_dbpath_t* rocksdb_dbpath_create( + const char* path, uint64_t target_size); +extern ROCKSDB_LIBRARY_API void rocksdb_dbpath_destroy(rocksdb_dbpath_t*); + +/* Env */ + +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(void); +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(void); +extern ROCKSDB_LIBRARY_API void rocksdb_env_set_background_threads( + rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_background_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_high_priority_background_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_set_low_priority_background_threads( + rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_low_priority_background_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int +rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_cpu_priority( + rocksdb_env_t* env); +extern 
ROCKSDB_LIBRARY_API void +rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env); + +extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_envoptions_destroy( + rocksdb_envoptions_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_create_dir_if_missing( + rocksdb_env_t* env, const char* path, char** errptr); + +/* SstFile */ + +extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t* +rocksdb_sstfilewriter_create(const rocksdb_envoptions_t* env, + const rocksdb_options_t* io_options); +extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t* +rocksdb_sstfilewriter_create_with_comparator( + const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options, + const rocksdb_comparator_t* comparator); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_open( + rocksdb_sstfilewriter_t* writer, const char* name, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_add( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* val, size_t vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_put( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* val, size_t vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_put_with_ts( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* ts, size_t tslen, const char* val, size_t vallen, + char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_merge( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* val, size_t vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete_with_ts( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* ts, size_t tslen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete_range( + rocksdb_sstfilewriter_t* writer, const char* begin_key, size_t begin_keylen, + const char* end_key, size_t end_keylen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_finish( + rocksdb_sstfilewriter_t* writer, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_file_size( + rocksdb_sstfilewriter_t* writer, uint64_t* file_size); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_destroy( + rocksdb_sstfilewriter_t* writer); +extern ROCKSDB_LIBRARY_API rocksdb_ingestexternalfileoptions_t* +rocksdb_ingestexternalfileoptions_create(void); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_move_files( + rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_snapshot_consistency( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char snapshot_consistency); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_allow_global_seqno( + rocksdb_ingestexternalfileoptions_t* opt, unsigned char allow_global_seqno); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_allow_blocking_flush( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char allow_blocking_flush); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_ingest_behind( + rocksdb_ingestexternalfileoptions_t* opt, 
unsigned char ingest_behind); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_fail_if_not_bottommost_level( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char fail_if_not_bottommost_level); + +extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_destroy( + rocksdb_ingestexternalfileoptions_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file( + rocksdb_t* db, const char* const* file_list, const size_t list_len, + const rocksdb_ingestexternalfileoptions_t* opt, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, + const char* const* file_list, const size_t list_len, + const rocksdb_ingestexternalfileoptions_t* opt, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_try_catch_up_with_primary( + rocksdb_t* db, char** errptr); + +/* SliceTransform */ + +extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* +rocksdb_slicetransform_create( + void* state, void (*destructor)(void*), + char* (*transform)(void*, const char* key, size_t length, + size_t* dst_length), + unsigned char (*in_domain)(void*, const char* key, size_t length), + unsigned char (*in_range)(void*, const char* key, size_t length), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* + rocksdb_slicetransform_create_fixed_prefix(size_t); +extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* +rocksdb_slicetransform_create_noop(void); +extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy( + rocksdb_slicetransform_t*); + +/* Universal Compaction options */ + +enum { + rocksdb_similar_size_compaction_stop_style = 0, + rocksdb_total_size_compaction_stop_style = 1 +}; + +extern ROCKSDB_LIBRARY_API rocksdb_universal_compaction_options_t* +rocksdb_universal_compaction_options_create(void); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_size_ratio( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_min_merge_width( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_merge_width( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_compression_size_percent( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_stop_style( + rocksdb_universal_compaction_options_t*); 
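[Usage sketch, illustration only — the following example is not part of the vendored header. The declarations above follow the C API's uniform conventions: opaque *_t handles created and destroyed in pairs, and a trailing char** errptr that receives a malloc()ed message (released with rocksdb_free) when a call fails. This is a minimal sketch of wiring the universal-compaction options into a database; the path "/tmp/example-db" is a hypothetical placeholder.]

#include <stdio.h>
#include "rocksdb/c.h"

int main(void) {
  char* err = NULL;  /* receives a malloc()ed error message on failure */
  int failed = 0;

  /* Opaque handles are created and destroyed in pairs. */
  rocksdb_options_t* opts = rocksdb_options_create();
  rocksdb_options_set_create_if_missing(opts, 1);
  rocksdb_options_set_compaction_style(opts, rocksdb_universal_compaction);

  /* Attach universal-compaction tuning via its dedicated options handle. */
  rocksdb_universal_compaction_options_t* uopts =
      rocksdb_universal_compaction_options_create();
  rocksdb_universal_compaction_options_set_size_ratio(uopts, 1);
  rocksdb_universal_compaction_options_set_stop_style(
      uopts, rocksdb_total_size_compaction_stop_style);
  rocksdb_options_set_universal_compaction_options(opts, uopts);

  /* "/tmp/example-db" is a placeholder path for this sketch. */
  rocksdb_t* db = rocksdb_open(opts, "/tmp/example-db", &err);
  if (err != NULL) {
    fprintf(stderr, "open failed: %s\n", err);
    rocksdb_free(err);  /* error strings are freed with rocksdb_free */
    failed = 1;
  } else {
    rocksdb_close(db);
  }

  /* The setter copies the values, so the options handles can be freed now. */
  rocksdb_universal_compaction_options_destroy(uopts);
  rocksdb_options_destroy(opts);
  return failed;
}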
+extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy( + rocksdb_universal_compaction_options_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t* +rocksdb_fifo_compaction_options_create(void); +extern ROCKSDB_LIBRARY_API void +rocksdb_fifo_compaction_options_set_allow_compaction( + rocksdb_fifo_compaction_options_t* fifo_opts, unsigned char allow_compaction); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_fifo_compaction_options_get_allow_compaction( + rocksdb_fifo_compaction_options_t* fifo_opts); +extern ROCKSDB_LIBRARY_API void +rocksdb_fifo_compaction_options_set_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_fifo_compaction_options_get_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts); +extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy( + rocksdb_fifo_compaction_options_t* fifo_opts); + +extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_count( + const rocksdb_livefiles_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_column_family_name( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_name( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_level( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_livefiles_size(const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey( + const rocksdb_livefiles_t*, int index, size_t* size); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey( + const rocksdb_livefiles_t*, int index, size_t* size); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_livefiles_entries(const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_livefiles_deletions(const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy( + const rocksdb_livefiles_t*); + +/* Utility Helpers */ + +extern ROCKSDB_LIBRARY_API void rocksdb_get_options_from_string( + const rocksdb_options_t* base_options, const char* opts_str, + rocksdb_options_t* new_options, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range( + rocksdb_t* db, const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr); + +/* MetaData */ + +extern ROCKSDB_LIBRARY_API rocksdb_column_family_metadata_t* +rocksdb_get_column_family_metadata(rocksdb_t* db); + +/** + * Returns the rocksdb_column_family_metadata_t of the specified + * column family. + * + * Note that the caller is responsible to release the returned memory + * using rocksdb_column_family_metadata_destroy. 
+ */ +extern ROCKSDB_LIBRARY_API rocksdb_column_family_metadata_t* +rocksdb_get_column_family_metadata_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API void rocksdb_column_family_metadata_destroy( + rocksdb_column_family_metadata_t* cf_meta); + +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_column_family_metadata_get_size( + rocksdb_column_family_metadata_t* cf_meta); + +extern ROCKSDB_LIBRARY_API size_t rocksdb_column_family_metadata_get_file_count( + rocksdb_column_family_metadata_t* cf_meta); + +extern ROCKSDB_LIBRARY_API char* rocksdb_column_family_metadata_get_name( + rocksdb_column_family_metadata_t* cf_meta); + +extern ROCKSDB_LIBRARY_API size_t +rocksdb_column_family_metadata_get_level_count( + rocksdb_column_family_metadata_t* cf_meta); + +/** + * Returns the rocksdb_level_metadata_t of the ith level from the specified + * column family metadata. + * + * If the specified i is greater than or equal to the number of levels + * in the specified column family, then NULL will be returned. + * + * Note that the caller is responsible to release the returned memory + * using rocksdb_level_metadata_destroy before releasing its parent + * rocksdb_column_family_metadata_t. + */ +extern ROCKSDB_LIBRARY_API rocksdb_level_metadata_t* +rocksdb_column_family_metadata_get_level_metadata( + rocksdb_column_family_metadata_t* cf_meta, size_t i); + +/** + * Releases the specified rocksdb_level_metadata_t. + * + * Note that the specified rocksdb_level_metadata_t must be released + * before the release of its parent rocksdb_column_family_metadata_t. + */ +extern ROCKSDB_LIBRARY_API void rocksdb_level_metadata_destroy( + rocksdb_level_metadata_t* level_meta); + +extern ROCKSDB_LIBRARY_API int rocksdb_level_metadata_get_level( + rocksdb_level_metadata_t* level_meta); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_level_metadata_get_size(rocksdb_level_metadata_t* level_meta); + +extern ROCKSDB_LIBRARY_API size_t +rocksdb_level_metadata_get_file_count(rocksdb_level_metadata_t* level_meta); + +/** + * Returns the sst_file_metadata_t of the ith file from the specified level + * metadata. + * + * If the specified i is greater than or equal to the number of files + * in the specified level, then NULL will be returned. + * + * Note that the caller is responsible to release the returned memory + * using rocksdb_sst_file_metadata_destroy before releasing its + * parent rocksdb_level_metadata_t. + */ +extern ROCKSDB_LIBRARY_API rocksdb_sst_file_metadata_t* +rocksdb_level_metadata_get_sst_file_metadata( + rocksdb_level_metadata_t* level_meta, size_t i); + +/** + * Releases the specified rocksdb_sst_file_metadata_t. + * + * Note that the specified rocksdb_sst_file_metadata_t must be released + * before the release of its parent rocksdb_level_metadata_t. + */ +extern ROCKSDB_LIBRARY_API void rocksdb_sst_file_metadata_destroy( + rocksdb_sst_file_metadata_t* file_meta); + +extern ROCKSDB_LIBRARY_API char* +rocksdb_sst_file_metadata_get_relative_filename( + rocksdb_sst_file_metadata_t* file_meta); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_sst_file_metadata_get_size(rocksdb_sst_file_metadata_t* file_meta); + +/** + * Returns the smallest key of the specified sst file. + * The caller is responsible for releasing the returned memory. + * + * @param file_meta the metadata of an SST file to obtain its smallest key. + * @param len the out value which will contain the length of the returned key + * after the function call. 
+ */ +extern ROCKSDB_LIBRARY_API char* rocksdb_sst_file_metadata_get_smallestkey( + rocksdb_sst_file_metadata_t* file_meta, size_t* len); + +/** + * Returns the largest key of the specified sst file. + * The caller is responsible for releasing the returned memory. + * + * @param file_meta the metadata of an SST file to obtain its largest key. + * @param len the out value which will contain the length of the returned key + * after the function call. + */ +extern ROCKSDB_LIBRARY_API char* rocksdb_sst_file_metadata_get_largestkey( + rocksdb_sst_file_metadata_t* file_meta, size_t* len); + +/* Transactions */ + +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* +rocksdb_transactiondb_create_column_family( + rocksdb_transactiondb_t* txn_db, + const rocksdb_options_t* column_family_options, + const char* column_family_name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open( + const rocksdb_options_t* options, + const rocksdb_transactiondb_options_t* txn_db_options, const char* name, + char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* +rocksdb_transactiondb_open_column_families( + const rocksdb_options_t* options, + const rocksdb_transactiondb_options_t* txn_db_options, const char* name, + int num_column_families, const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* +rocksdb_transactiondb_create_snapshot(rocksdb_transactiondb_t* txn_db); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_release_snapshot( + rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_property_value( + rocksdb_transactiondb_t* db, const char* propname); + +extern ROCKSDB_LIBRARY_API int rocksdb_transactiondb_property_int( + rocksdb_transactiondb_t* db, const char* propname, uint64_t* out_val); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* rocksdb_transaction_begin( + rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* write_options, + const rocksdb_transaction_options_t* txn_options, + rocksdb_transaction_t* old_txn); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_t** +rocksdb_transactiondb_get_prepared_transactions(rocksdb_transactiondb_t* txn_db, + size_t* cnt); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_name( + rocksdb_transaction_t* txn, const char* name, size_t name_len, + char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_name( + rocksdb_transaction_t* txn, size_t* name_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_prepare( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_commit( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_savepoint( + rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback_to_savepoint( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_destroy( + rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* +rocksdb_transaction_get_writebatch_wi(rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rebuild_from_writebatch( +
rocksdb_transaction_t* txn, rocksdb_writebatch_t* writebatch, + char** errptr); + +// This rocksdb_writebatch_wi_t should be freed with rocksdb_free +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rebuild_from_writebatch_wi( + rocksdb_transaction_t* txn, rocksdb_writebatch_wi_t* wi, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_commit_timestamp( + rocksdb_transaction_t* txn, uint64_t commit_timestamp); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transaction_set_read_timestamp_for_validation( + rocksdb_transaction_t* txn, uint64_t read_timestamp); + +// This snapshot should be freed using rocksdb_free +extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* +rocksdb_transaction_get_snapshot(rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transaction_get_pinned(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transaction_get_pinned_cf(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, unsigned char exclusive, + char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transaction_get_pinned_for_update(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, + unsigned char exclusive, + char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, unsigned char exclusive, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transaction_get_pinned_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + unsigned char exclusive, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get_for_update( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** 
values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transactiondb_get_pinned(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transactiondb_get_pinned_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_multi_get( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_multi_get_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put( + rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val, + size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* val, size_t vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_write( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge( + rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val, + size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* 
options, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete( + rocksdb_transaction_t* txn, const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transaction_create_iterator(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transaction_create_iterator_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transactiondb_create_iterator(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transactiondb_create_iterator_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_close( + rocksdb_transactiondb_t* txn_db); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush( + rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t* column_family, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_cfs( + rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t** column_families, int num_column_families, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_wal( + rocksdb_transactiondb_t* txn_db, unsigned char sync, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* +rocksdb_transactiondb_checkpoint_object_create(rocksdb_transactiondb_t* txn_db, + char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t* +rocksdb_optimistictransactiondb_open(const rocksdb_options_t* options, + const char* name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t* +rocksdb_optimistictransactiondb_open_column_families( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* +rocksdb_optimistictransactiondb_get_base_db( + rocksdb_optimistictransactiondb_t* otxn_db); + +extern ROCKSDB_LIBRARY_API 
void rocksdb_optimistictransactiondb_close_base_db( + rocksdb_t* base_db); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* +rocksdb_optimistictransaction_begin( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* write_options, + const rocksdb_optimistictransaction_options_t* otxn_options, + rocksdb_transaction_t* old_txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_write( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close( + rocksdb_optimistictransactiondb_t* otxn_db); + +extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* +rocksdb_optimistictransactiondb_checkpoint_object_create( + rocksdb_optimistictransactiondb_t* otxn_db, char** errptr); + +/* Transaction Options */ + +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_options_t* +rocksdb_transactiondb_options_create(void); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_destroy( + rocksdb_transactiondb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_max_num_locks( + rocksdb_transactiondb_options_t* opt, int64_t max_num_locks); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_num_stripes( + rocksdb_transactiondb_options_t* opt, size_t num_stripes); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transactiondb_options_set_transaction_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transactiondb_options_set_default_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_options_t* +rocksdb_transaction_options_create(void); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_destroy( + rocksdb_transaction_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_set_snapshot( + rocksdb_transaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_deadlock_detect( + rocksdb_transaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_lock_timeout( + rocksdb_transaction_options_t* opt, int64_t lock_timeout); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_expiration( + rocksdb_transaction_options_t* opt, int64_t expiration); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transaction_options_set_deadlock_detect_depth( + rocksdb_transaction_options_t* opt, int64_t depth); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transaction_options_set_max_write_batch_size( + rocksdb_transaction_options_t* opt, size_t size); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_skip_prepare( + rocksdb_transaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransaction_options_t* +rocksdb_optimistictransaction_options_create(void); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransaction_options_destroy( + rocksdb_optimistictransaction_options_t* opt); + +extern ROCKSDB_LIBRARY_API void +rocksdb_optimistictransaction_options_set_set_snapshot( + rocksdb_optimistictransaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API char* rocksdb_optimistictransactiondb_property_value( + rocksdb_optimistictransactiondb_t* db, const char* propname); + +extern ROCKSDB_LIBRARY_API int 
rocksdb_optimistictransactiondb_property_int( + rocksdb_optimistictransactiondb_t* db, const char* propname, + uint64_t* out_val); + +// referring to convention (3), this should be used by client +// to free memory that was malloc()ed +extern ROCKSDB_LIBRARY_API void rocksdb_free(void* ptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t keylen, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_pinnableslice_destroy( + rocksdb_pinnableslice_t* v); +extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnableslice_value( + const rocksdb_pinnableslice_t* t, size_t* vlen); + +extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t* +rocksdb_memory_consumers_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db( + rocksdb_memory_consumers_t* consumers, rocksdb_t* db); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_cache( + rocksdb_memory_consumers_t* consumers, rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_destroy( + rocksdb_memory_consumers_t* consumers); +extern ROCKSDB_LIBRARY_API rocksdb_memory_usage_t* +rocksdb_approximate_memory_usage_create(rocksdb_memory_consumers_t* consumers, + char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_memory_usage_destroy( + rocksdb_memory_usage_t* usage); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_total( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_unflushed( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_readers_total( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_cache_total( + rocksdb_memory_usage_t* memory_usage); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_dump_malloc_stats( + rocksdb_options_t*, unsigned char); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t*, + unsigned char); + +extern ROCKSDB_LIBRARY_API void rocksdb_cancel_all_background_work( + rocksdb_t* db, unsigned char wait); + +extern ROCKSDB_LIBRARY_API void rocksdb_disable_manual_compaction( + rocksdb_t* db); + +extern ROCKSDB_LIBRARY_API void rocksdb_enable_manual_compaction(rocksdb_t* db); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/cache.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/cache.h new file mode 100644 index 0000000000..532e6e4afd --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/cache.h @@ -0,0 +1,442 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// Various APIs for configuring, creating, and monitoring read caches. + +#pragma once + +#include <cstdint> +#include <memory> +#include <string> + +#include "rocksdb/compression_type.h" +#include "rocksdb/data_structure.h" +#include "rocksdb/memory_allocator.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; // defined in advanced_cache.h +struct ConfigOptions; +class SecondaryCache; + +// These definitions begin source compatibility for a future change in which +// a specific class for block cache is split away from general caches, so that +// the block cache API can continue to become more specialized and +// customizable, including in ways incompatible with a general cache. For +// example, HyperClockCache is not usable as a general cache because it expects +// only fixed-size block cache keys, but this limitation is not yet reflected +// in the API function signatures. +// * Phase 1 (done) - Make both BlockCache and GeneralCache aliases for Cache, +// and make a factory function for general caches. Encourage users of row_cache +// (not common) to switch to the factory function for general caches. +// * Phase 2 - Split off GeneralCache as its own class, removing secondary +// cache support features and more from the API to simplify it. Between Phase 1 +// and Phase 2 users of row_cache will need to update their code. Any time +// after Phase 2, the block cache API can become more specialized in ways +// incompatible with general caches. +// * Phase 3 - Move existing RocksDB uses of Cache to BlockCache, and deprecate +// (but not yet remove) Cache as an alias for BlockCache. +using BlockCache = Cache; +using GeneralCache = Cache; + +// Classifications of block cache entries. +// +// Developer notes: Adding a new enum to this class requires corresponding +// updates to `kCacheEntryRoleToCamelString` and +// `kCacheEntryRoleToHyphenString`. Do not add to this enum after `kMisc` since +// `kNumCacheEntryRoles` assumes `kMisc` comes last. +enum class CacheEntryRole { + // Block-based table data block + kDataBlock, + // Block-based table filter block (full or partitioned) + kFilterBlock, + // Block-based table metadata block for partitioned filter + kFilterMetaBlock, + // OBSOLETE / DEPRECATED: old/removed block-based filter + kDeprecatedFilterBlock, + // Block-based table index block + kIndexBlock, + // Other kinds of block-based table block + kOtherBlock, + // WriteBufferManager's charge to account for its memtable usage + kWriteBuffer, + // Compression dictionary building buffer's charge to account for + // its memory usage + kCompressionDictionaryBuildingBuffer, + // Filter's charge to account for + // (new) bloom and ribbon filter construction's memory usage + kFilterConstruction, + // BlockBasedTableReader's charge to account for its memory usage + kBlockBasedTableReader, + // FileMetadata's charge to account for its memory usage + kFileMetadata, + // Blob value (when using the same cache as block cache and blob cache) + kBlobValue, + // Blob cache's charge to account for its memory usage (when using a + // separate block cache and blob cache) + kBlobCache, + // Default bucket, for miscellaneous cache entries. Do not use for + // entries that could potentially add up to large usage. + kMisc, +}; +constexpr uint32_t kNumCacheEntryRoles = + static_cast<uint32_t>(CacheEntryRole::kMisc) + 1; + +// Obtain a hyphen-separated, lowercase name of a `CacheEntryRole`.
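+// (Illustrative of the hyphen convention, not an exhaustive list: kDataBlock +// would map to "data-block" and kFilterBlock to "filter-block".)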
+const std::string& GetCacheEntryRoleName(CacheEntryRole); + +// A fast bit set for CacheEntryRoles +using CacheEntryRoleSet = SmallEnumSet<CacheEntryRole, CacheEntryRole::kMisc>; + +// For use with `GetMapProperty()` for property +// `DB::Properties::kBlockCacheEntryStats`. On success, the map will +// be populated with all keys that can be obtained from these functions. +struct BlockCacheEntryStatsMapKeys { + static const std::string& CacheId(); + static const std::string& CacheCapacityBytes(); + static const std::string& LastCollectionDurationSeconds(); + static const std::string& LastCollectionAgeSeconds(); + + static std::string EntryCount(CacheEntryRole); + static std::string UsedBytes(CacheEntryRole); + static std::string UsedPercent(CacheEntryRole); +}; + +extern const bool kDefaultToAdaptiveMutex; + +enum CacheMetadataChargePolicy { + // Only the `charge` of each entry inserted into a Cache counts against + // the `capacity` + kDontChargeCacheMetadata, + // In addition to the `charge`, the approximate space overheads in the + // Cache (in bytes) also count against `capacity`. These space overheads + // are for supporting fast Lookup and managing the lifetime of entries. + kFullChargeCacheMetadata +}; +const CacheMetadataChargePolicy kDefaultCacheMetadataChargePolicy = + kFullChargeCacheMetadata; + +// Options shared between various cache implementations that +// divide the key space into shards using hashing. +struct ShardedCacheOptions { + // Capacity of the cache, in the same units as the `charge` of each entry. + // This is typically measured in bytes, but can be a different unit if using + // kDontChargeCacheMetadata. + size_t capacity = 0; + + // Cache is sharded into 2^num_shard_bits shards, by hash of key. + // If < 0, a good default is chosen based on the capacity and the + // implementation. (Mutex-based implementations are much more reliant + // on many shards for parallel scalability.) + int num_shard_bits = -1; + + // If strict_capacity_limit is set, Insert() will fail if there is not + // enough capacity for the new entry along with all the existing referenced + // (pinned) cache entries. (Unreferenced cache entries are evicted as + // needed, sometimes immediately.) If strict_capacity_limit == false + // (default), Insert() never fails. + bool strict_capacity_limit = false; + + // If non-nullptr, RocksDB will use this allocator instead of system + // allocator when allocating memory for cache blocks. + // + // Caveat: when the cache is used as block cache, the memory allocator is + // ignored when dealing with compression libraries that allocate memory + // internally (currently only XPRESS). + std::shared_ptr<MemoryAllocator> memory_allocator; + + // See CacheMetadataChargePolicy + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy; + + // A SecondaryCache instance to use as the non-volatile tier. For a +  // GeneralCache this option must be kept as default empty. + std::shared_ptr<SecondaryCache> secondary_cache; + + // See hash_seed comments below + static constexpr int32_t kQuasiRandomHashSeed = -1; + static constexpr int32_t kHostHashSeed = -2; + + // EXPERT OPTION: Specifies how a hash seed should be determined for the + // cache, or specifies a specific seed (only recommended for diagnostics or + // testing). + // + // Background: it could be dangerous to have different cache instances + // access the same SST files with the same hash seed, as correlated unlucky + // hashing across hosts or restarts could cause a widespread issue, rather + // than an isolated one.
For example, with smaller block caches, it is + // possible for large full Bloom filters in a set of SST files to be randomly + // clustered into one cache shard, causing mutex contention or a thrashing + // condition as there's little or no space left for other entries assigned to + // the shard. If a set of SST files is broadcast and used on many hosts, we + // should ensure all have an independent chance of balanced shards. + // + // Values >= 0 will be treated as fixed hash seeds. Values < 0 are reserved + // for methods of dynamically choosing a seed, currently: + // * kQuasiRandomHashSeed - Each cache created chooses a seed mostly randomly, + // except that within a process, no seed is repeated until all have been + // issued. + // * kHostHashSeed - The seed is determined based on hashing the host name. + // Although this is arguably slightly worse for production reliability, it + // solves the essential problem of cross-host correlation while ensuring + // repeatable behavior on a host, for diagnostic purposes. + int32_t hash_seed = kHostHashSeed; + + ShardedCacheOptions() {} + ShardedCacheOptions( + size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, + std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy) + : capacity(_capacity), + num_shard_bits(_num_shard_bits), + strict_capacity_limit(_strict_capacity_limit), + memory_allocator(std::move(_memory_allocator)), + metadata_charge_policy(_metadata_charge_policy) {} +}; + +// LRUCache - A cache using LRU eviction to stay at or below a set capacity. +// The cache is sharded to 2^num_shard_bits shards, by hash of the key. +// The total capacity is divided and evenly assigned to each shard, and each +// shard has its own LRU list for evictions. Each shard also has a mutex for +// exclusive access during operations; even read operations need exclusive +// access in order to update the LRU list. Mutex contention is usually low +// with enough shards. +struct LRUCacheOptions : public ShardedCacheOptions { + // Ratio of cache reserved for high-priority and low-priority entries, + // respectively. (See Cache::Priority below for more information on the + // levels.) Valid values are between 0 and 1 (inclusive), and the sum of the + // two values cannot exceed 1. + // + // If high_pri_pool_ratio is greater than zero, a dedicated high-priority LRU + // list is maintained by the cache. A ratio of 0.5 means non-high-priority + // entries will use midpoint insertion. Similarly, if low_pri_pool_ratio is + // greater than zero, a dedicated low-priority LRU list is maintained. + // There is also a bottom-priority LRU list, which is always enabled and not + // explicitly configurable. Entries are spilled over to the next available + // lower-priority pool if a certain pool's capacity is exceeded. + // + // Entries with cache hits are inserted into the highest priority LRU list + // available regardless of the entry's priority. Entries without hits + // are inserted into the highest priority LRU list available whose priority + // does not exceed the entry's priority. (For example, high-priority items + // with no hits are placed in the high-priority pool if available; + // otherwise, they are placed in the low-priority pool if available; + // otherwise, they are placed in the bottom-priority pool.) This results + // in lower-priority entries without hits getting evicted from the cache + // sooner.
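+ // + // As a minimal illustration (the values here are hypothetical, not tuning + // advice), a cache with both pools enabled could be configured as: + // + // LRUCacheOptions opts; + // opts.capacity = 512 << 20; // 512 MiB total, in bytes + // opts.high_pri_pool_ratio = 0.5; // dedicated high-priority LRU list + // opts.low_pri_pool_ratio = 0.2; // dedicated low-priority LRU list + // std::shared_ptr<Cache> cache = opts.MakeSharedCache(); + //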
+ double high_pri_pool_ratio = 0.5; + double low_pri_pool_ratio = 0.0; + + // Whether to use adaptive mutexes for cache shards. Note that adaptive + // mutexes need to be supported by the platform in order for this to have any + // effect. The default value is true if RocksDB is compiled with + // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise. + bool use_adaptive_mutex = kDefaultToAdaptiveMutex; + + LRUCacheOptions() {} + LRUCacheOptions(size_t _capacity, int _num_shard_bits, + bool _strict_capacity_limit, double _high_pri_pool_ratio, + std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr, + bool _use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy, + double _low_pri_pool_ratio = 0.0) + : ShardedCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit, + std::move(_memory_allocator), + _metadata_charge_policy), + high_pri_pool_ratio(_high_pri_pool_ratio), + low_pri_pool_ratio(_low_pri_pool_ratio), + use_adaptive_mutex(_use_adaptive_mutex) {} + + // Construct an instance of LRUCache using these options + std::shared_ptr<Cache> MakeSharedCache() const; + + // Construct an instance of LRUCache for use as a general cache (e.g. for + // row_cache). Some options are not relevant to general caches. + std::shared_ptr<GeneralCache> MakeSharedGeneralCache() const; +}; + +// DEPRECATED wrapper function +inline std::shared_ptr<Cache> NewLRUCache( + size_t capacity, int num_shard_bits = -1, + bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, + std::shared_ptr<MemoryAllocator> memory_allocator = nullptr, + bool use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy, + double low_pri_pool_ratio = 0.0) { + return LRUCacheOptions(capacity, num_shard_bits, strict_capacity_limit, + high_pri_pool_ratio, memory_allocator, + use_adaptive_mutex, metadata_charge_policy, + low_pri_pool_ratio) + .MakeSharedCache(); +} + +// DEPRECATED wrapper function +inline std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) { + return cache_opts.MakeSharedCache(); +} + +// EXPERIMENTAL +// Options structure for configuring a SecondaryCache instance with in-memory +// compression. The implementation uses LRUCache so inherits its options, +// except LRUCacheOptions.secondary_cache is not used and should not be set. +struct CompressedSecondaryCacheOptions : LRUCacheOptions { + // The compression method (if any) that is used to compress data. + CompressionType compression_type = CompressionType::kLZ4Compression; + + // compress_format_version can have two values: + // compress_format_version == 1 -- decompressed size is not included in the + // block header. + // compress_format_version == 2 -- decompressed size is included in the block + // header in varint32 format. + uint32_t compress_format_version = 2; + + // Enable the custom split and merge feature, which splits the compressed + // value into chunks so that they may better fit jemalloc bins. + bool enable_custom_split_merge = false; + + // Kinds of entries that should not be compressed, but can be stored. + // (Filter blocks are essentially non-compressible but others usually are.)
+ CacheEntryRoleSet do_not_compress_roles = {CacheEntryRole::kFilterBlock}; + + CompressedSecondaryCacheOptions() {} + CompressedSecondaryCacheOptions( + size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, + double _high_pri_pool_ratio, double _low_pri_pool_ratio = 0.0, + std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr, + bool _use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy, + CompressionType _compression_type = CompressionType::kLZ4Compression, + uint32_t _compress_format_version = 2, + bool _enable_custom_split_merge = false, + const CacheEntryRoleSet& _do_not_compress_roles = + {CacheEntryRole::kFilterBlock}) + : LRUCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit, + _high_pri_pool_ratio, std::move(_memory_allocator), + _use_adaptive_mutex, _metadata_charge_policy, + _low_pri_pool_ratio), + compression_type(_compression_type), + compress_format_version(_compress_format_version), + enable_custom_split_merge(_enable_custom_split_merge), + do_not_compress_roles(_do_not_compress_roles) {} + + // Construct an instance of CompressedSecondaryCache using these options + std::shared_ptr<SecondaryCache> MakeSharedSecondaryCache() const; + + // Avoid confusion with LRUCache + std::shared_ptr<Cache> MakeSharedCache() const = delete; +}; + +// DEPRECATED wrapper function +inline std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache( + size_t capacity, int num_shard_bits = -1, + bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, + double low_pri_pool_ratio = 0.0, + std::shared_ptr<MemoryAllocator> memory_allocator = nullptr, + bool use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy, + CompressionType compression_type = CompressionType::kLZ4Compression, + uint32_t compress_format_version = 2, + bool enable_custom_split_merge = false, + const CacheEntryRoleSet& _do_not_compress_roles = { + CacheEntryRole::kFilterBlock}) { + return CompressedSecondaryCacheOptions( + capacity, num_shard_bits, strict_capacity_limit, + high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator, + use_adaptive_mutex, metadata_charge_policy, compression_type, + compress_format_version, enable_custom_split_merge, + _do_not_compress_roles) + .MakeSharedSecondaryCache(); +} + +// DEPRECATED wrapper function +inline std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache( + const CompressedSecondaryCacheOptions& opts) { + return opts.MakeSharedSecondaryCache(); +} + +// HyperClockCache - A lock-free Cache alternative for RocksDB block cache +// that offers much improved CPU efficiency vs. LRUCache under high parallel +// load or high contention, with some caveats: +// * Not a general Cache implementation: can only be used for +// BlockBasedTableOptions::block_cache, which RocksDB uses in a way that is +// compatible with HyperClockCache. +// * Requires an extra tuning parameter: see estimated_entry_charge below. +// Similarly, substantially changing the capacity with SetCapacity could +// harm efficiency. +// * Cache priorities are less aggressively enforced, which could cause +// cache dilution from long range scans (unless they use fill_cache=false). +// * Can be worse for small caches, because if almost all of a cache shard is +// pinned (more likely with non-partitioned filters), then CLOCK eviction +// becomes very CPU intensive. +// +// See internal cache/clock_cache.h for full description.
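+// +// A brief usage sketch (the charge value below is hypothetical; derive a +// real one as described for estimated_entry_charge below, and note that +// BlockBasedTableOptions is declared in rocksdb/table.h): +// +// HyperClockCacheOptions hcc_opts(1 << 30 /* capacity */, +// 4096 /* estimated_entry_charge */); +// BlockBasedTableOptions table_opts; +// table_opts.block_cache = hcc_opts.MakeSharedCache();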
+struct HyperClockCacheOptions : public ShardedCacheOptions { + // The estimated average `charge` associated with cache entries. This is a + // critical configuration parameter for good performance from the hyper + // cache, because having a table size that is fixed at creation time greatly + // reduces the required synchronization between threads. + // * If the estimate is substantially too low (e.g. less than half the true + // average) then metadata space overhead will be substantially higher (e.g. + // 200 bytes per entry rather than 100). With kFullChargeCacheMetadata, this + // can slightly reduce cache hit rates, and slightly reduce access times due + // to the larger working memory size. + // * If the estimate is substantially too high (e.g. 25% higher than the true + // average) then there might not be sufficient slots in the hash table for + // both efficient operation and capacity utilization (hit rate). The hyper + // cache will evict entries to prevent load factors that could dramatically + // affect lookup times, instead letting the hit rate suffer by not utilizing + // the full capacity. + // + // A reasonable choice is the larger of block_size and metadata_block_size. + // When WriteBufferManager (and similar) charge memory usage to the block + // cache, this can lead to the same effect as the estimate being too low, +  // which is better than the opposite. Therefore, the general recommendation +  // is to assume that other memory charged to block cache could be negligible, +  // and ignore it in making the estimate. + // + // The best parameter choice based on a cache in use is given by + // GetUsage() / GetOccupancyCount(), ignoring metadata overheads such as + // with kDontChargeCacheMetadata. More precisely, with +  // kFullChargeCacheMetadata it is (GetUsage() - 64 * GetTableAddressCount()) / + // GetOccupancyCount(). However, when the average value size might vary + // (e.g. balance between metadata and data blocks in cache), it is better + // to estimate toward the lower side than the higher side. + size_t estimated_entry_charge; + + HyperClockCacheOptions( + size_t _capacity, size_t _estimated_entry_charge, + int _num_shard_bits = -1, bool _strict_capacity_limit = false, + std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy) + : ShardedCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit, + std::move(_memory_allocator), + _metadata_charge_policy), + estimated_entry_charge(_estimated_entry_charge) {} + + // Construct an instance of HyperClockCache using these options + std::shared_ptr<Cache> MakeSharedCache() const; +}; + +// DEPRECATED - The old Clock Cache implementation had an unresolved bug and +// has been removed. The new HyperClockCache requires an additional +// configuration parameter that is not provided by this API. This function +// simply returns a new LRUCache for functional compatibility.
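+// A typical migration (a sketch, with a caller-chosen charge estimate) is to +// replace NewClockCache(capacity) with +// HyperClockCacheOptions(capacity, estimated_entry_charge).MakeSharedCache(), +// or to call NewLRUCache(capacity) to make the fallback behavior explicit.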
+extern std::shared_ptr<Cache> NewClockCache( + size_t capacity, int num_shard_bits = -1, + bool strict_capacity_limit = false, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/cache_bench_tool.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/cache_bench_tool.h new file mode 100644 index 0000000000..413ce15937 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/cache_bench_tool.h @@ -0,0 +1,14 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +int cache_bench_tool(int argc, char** argv); +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/cleanable.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/cleanable.h new file mode 100644 index 0000000000..afc7366732 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/cleanable.h @@ -0,0 +1,128 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Cleanable { + public: + Cleanable(); + // No copy constructor and copy assignment allowed. + Cleanable(Cleanable&) = delete; + Cleanable& operator=(Cleanable&) = delete; + + // Executes all the registered cleanups + ~Cleanable(); + + // Move constructor and move assignment are allowed. + Cleanable(Cleanable&&) noexcept; + Cleanable& operator=(Cleanable&&) noexcept; + + // Clients are allowed to register function/arg1/arg2 triples that + // will be invoked when this Cleanable is destroyed. + // + // Note that unlike all of the preceding methods, this method is + // not abstract and therefore clients should not override it. + using CleanupFunction = void (*)(void* arg1, void* arg2); + + // Add another Cleanup to the list + void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + + // Move the cleanups owned by this Cleanable to another Cleanable, adding to + // any existing cleanups it has + void DelegateCleanupsTo(Cleanable* other); + + // DoCleanup and also resets the pointers for reuse + inline void Reset() { + DoCleanup(); + cleanup_.function = nullptr; + cleanup_.next = nullptr; + } + + inline bool HasCleanups() { return cleanup_.function != nullptr; } + + protected: + struct Cleanup { + CleanupFunction function; + void* arg1; + void* arg2; + Cleanup* next; + }; + Cleanup cleanup_; + // It also becomes the owner of c + void RegisterCleanup(Cleanup* c); + + private: + // Performs all the cleanups. It does not reset the pointers. Making it + // private prevents misuse.
+ inline void DoCleanup() { + if (cleanup_.function != nullptr) { + (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2); + for (Cleanup* c = cleanup_.next; c != nullptr;) { + (*c->function)(c->arg1, c->arg2); + Cleanup* next = c->next; + delete c; + c = next; + } + } + } +}; + +// A copyable, reference-counted pointer to a simple Cleanable that only +// performs registered cleanups after all copies are destroyed. This is like +// shared_ptr<Cleanable> but works more efficiently when wrapping the pointer +// in an outer Cleanable (see RegisterCopyWith() and MoveAsCleanupTo()). +// WARNING: if you create a reference cycle, for example: +// SharedCleanablePtr scp; +// scp.Allocate(); +// scp.RegisterCopyWith(&*scp); +// It will prevent cleanups from ever happening! +class SharedCleanablePtr { + public: + // Empty/null pointer + SharedCleanablePtr() {} + // Copy and move constructors and assignment + SharedCleanablePtr(const SharedCleanablePtr& from); + SharedCleanablePtr(SharedCleanablePtr&& from) noexcept; + SharedCleanablePtr& operator=(const SharedCleanablePtr& from); + SharedCleanablePtr& operator=(SharedCleanablePtr&& from) noexcept; + // Destructor (decrement refcount if non-null) + ~SharedCleanablePtr(); + // Create a new simple Cleanable and assign this pointer to it. + // (Reset()s first if necessary.) + void Allocate(); + // Reset to empty/null (decrement refcount if previously non-null) + void Reset(); + // Dereference to pointed-to Cleanable + Cleanable& operator*(); + Cleanable* operator->(); + // Get as raw pointer to Cleanable + Cleanable* get(); + + // Creates a (virtual) copy of this SharedCleanablePtr and registers its + // destruction with target, so that the cleanups registered with the + // Cleanable pointed to by this can only happen after the cleanups in the + // target Cleanable are run. + // No-op if this is empty (nullptr). + void RegisterCopyWith(Cleanable* target); + + // Moves (virtually) this shared pointer to a new cleanup in the target. + // This is essentially a move semantics version of RegisterCopyWith(), for + // performance optimization. No-op if this is empty (nullptr). + void MoveAsCleanupTo(Cleanable* target); + + private: + struct Impl; + Impl* ptr_ = nullptr; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/compaction_filter.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/compaction_filter.h new file mode 100644 index 0000000000..b1b511613b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/compaction_filter.h @@ -0,0 +1,363 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2013 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <cassert> +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/types.h" +#include "rocksdb/wide_columns.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class SliceTransform; + +// CompactionFilter allows an application to modify/delete a key-value during +// table file creation.
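+// +// For illustration only, a hypothetical filter that removes entries with +// empty values could subclass the legacy Filter() API documented below: +// +// class DropEmptyValues : public CompactionFilter { +// public: +// const char* Name() const override { return "DropEmptyValues"; } +// bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value, +// std::string* /*new_value*/, +// bool* /*value_changed*/) const override { +// return value.empty(); // true => remove this key-value +// } +// };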
+// +// Some general notes: +// +// * RocksDB snapshots do not guarantee to preserve the state of the DB in the +// presence of CompactionFilter. Data seen from a snapshot might disappear after +// a table file created with a `CompactionFilter` is installed. If you use +// snapshots, think twice about whether you want to use `CompactionFilter` and +// whether you are using it in a safe way. +// +// * If multithreaded compaction is being used *and* a single CompactionFilter +// instance was supplied via Options::compaction_filter, CompactionFilter +// methods may be called from different threads concurrently. The application +// must ensure that such calls are thread-safe. If the CompactionFilter was +// created by a factory, then it will only ever be used by a single thread that +// is doing the table file creation, and this call does not need to be +// thread-safe. However, multiple filters may be in existence and operating +// concurrently. +// +// * The key passed to the filtering methods includes the timestamp if +// user-defined timestamps are enabled. +// +// * Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class CompactionFilter : public Customizable { + public: + // Value type of the key-value passed to the compaction filter's FilterV2/V3 + // methods. + enum ValueType { + // Plain key-value + kValue, + // Merge operand + kMergeOperand, + // Used internally by the old stacked BlobDB implementation; this value type + // is never passed to application code. Note that when using the new + // integrated BlobDB, values stored separately as blobs are retrieved and + // presented to FilterV2/V3 with the type kValue above. + kBlobIndex, + // Wide-column entity + kWideColumnEntity, + }; + + // Potential decisions that can be returned by the compaction filter's + // FilterV2/V3 and FilterBlobByKey methods. See decision-specific caveats and + // constraints below. + enum class Decision { + // Keep the current key-value as-is. + kKeep, + + // Remove the current key-value. Note that the semantics of removal are + // dependent on the value type. If the current key-value is a plain + // key-value or a wide-column entity, it is converted to a tombstone + // (Delete), resulting in the deletion of any earlier versions of the key. + // If it is a merge operand, it is simply dropped. Note: if you are using + // a TransactionDB, it is not recommended to filter out merge operands. + // If a Merge operation is filtered out, TransactionDB may not realize there + // is a write conflict and may allow a Transaction that should have failed + // to Commit. Instead, it is better to implement any Merge filtering inside + // the MergeOperator. + kRemove, + + // Change the value of the current key-value. If the current key-value is a + // plain key-value or a merge operand, its value is updated but its value + // type remains the same. If the current key-value is a wide-column entity, + // it is converted to a plain key-value with the new value specified. + kChangeValue, + + // Remove all key-values with key in [key, *skip_until). This range of keys + // will be skipped in a way that potentially avoids some IO operations + // compared to removing the keys one by one. 
Note that removal in this case + // means dropping the key-value regardless of value type; in other words, in + // contrast with kRemove, plain values and entities are not converted to + // tombstones. + // + // *skip_until <= key is treated the same as Decision::kKeep (since the + // range [key, *skip_until) is empty). + // + // Caveats: + // * The keys are skipped even if there are snapshots containing them, + // i.e. values removed by kRemoveAndSkipUntil can disappear from a + // snapshot - beware if you're using TransactionDB or DB::GetSnapshot(). + // * If value for a key was overwritten or merged into (multiple Put()s + // or Merge()s), and `CompactionFilter` skips this key with + // kRemoveAndSkipUntil, it's possible that it will remove only + // the new value, exposing the old value that was supposed to be + // overwritten. + // * Doesn't work with PlainTableFactory in prefix mode. + // * If you use kRemoveAndSkipUntil for table files created by compaction, + // consider also reducing compaction_readahead_size option. + kRemoveAndSkipUntil, + + // Used internally by the old stacked BlobDB implementation. Returning this + // decision from application code is not supported. + kChangeBlobIndex, + + // Used internally by the old stacked BlobDB implementation. Returning this + // decision from application code is not supported. + kIOError, + + // Remove the current key-value by converting it to a SingleDelete-type + // tombstone. Only supported for plain-key values and wide-column entities; + // not supported for merge operands. All the caveats related to + // SingleDeletes apply. + kPurge, + + // Change the current key-value to the wide-column entity specified. If the + // current key-value is already a wide-column entity, only its columns are + // updated; if it is a plain key-value, it is converted to a wide-column + // entity with the specified columns. Not supported for merge operands. + // Only applicable to FilterV3. + kChangeWideColumnEntity, + + // When using the integrated BlobDB implementation, it may be possible for + // applications to make a filtering decision for a given blob based on + // the key only without actually reading the blob value, which saves some + // I/O; see the FilterBlobByKey method below. Returning kUndetermined from + // FilterBlobByKey signals that making a decision solely based on the + // key is not possible; in this case, RocksDB reads the blob value and + // passes the key-value to the regular filtering method. Only applicable to + // FilterBlobByKey; returning this value from FilterV2/V3 is not supported. + kUndetermined, + }; + + // Used internally by the old stacked BlobDB implementation. + enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError }; + + // Context information for a table file creation. + struct Context { + // Whether this table file is created as part of a compaction including all + // table files. + bool is_full_compaction; + // Whether this table file is created as part of a compaction requested by + // the client. + bool is_manual_compaction; + // The column family that will contain the created table file. + uint32_t column_family_id; + // Reason this table file is being created. 
+ TableFileCreationReason reason; + }; + + virtual ~CompactionFilter() {} + static const char* Type() { return "CompactionFilter"; } + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& name, + const CompactionFilter** result); + + // The table file creation process invokes this method before adding a kv to + // the table file. A return value of false indicates that the kv should be + // preserved in the new table file and a return value of true indicates + // that this key-value should be removed (that is, converted to a tombstone). + // The application can inspect the existing value of the key and make a +  // decision based on it. + // + // Key-Values that are results of a merge operation during table file +  // creation are not passed into this function. Currently, when you have a mix +  // of Put()s and Merge()s on the same key, we only guarantee to process the +  // merge operands through the `CompactionFilter`s. Put()s might be processed, +  // or might not. + // + // When the value is to be preserved, the application has the option + // to modify the existing_value and pass it back through new_value. + // value_changed needs to be set to true in this case. + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*existing_value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const { + return false; + } + + // The table file creation process invokes this method on every merge operand. + // If this method returns true, the merge operand will be ignored and not + // written out in the new table file. + // + // Note: If you are using a TransactionDB, it is not recommended to implement + // FilterMergeOperand(). If a Merge operation is filtered out, TransactionDB + // may not realize there is a write conflict and may allow a Transaction to + // Commit that should have failed. Instead, it is better to implement any + // Merge filtering inside the MergeOperator. + virtual bool FilterMergeOperand(int /*level*/, const Slice& /*key*/, + const Slice& /*operand*/) const { + return false; + } + + // A unified API for plain values and merge operands that may + // return a variety of decisions (see Decision above). The `value_type` + // parameter indicates the type of the key-value and the `existing_value` + // contains the current value or merge operand. The `new_value` output + // parameter can be used to set the updated value or merge operand when the + // kChangeValue decision is made by the filter. See the description of + // kRemoveAndSkipUntil above for the semantics of the `skip_until` output + // parameter, and see Decision above for more information on the semantics of + // the potential return values. + // + // The default implementation uses Filter() and FilterMergeOperand(). + // If you're overriding this method, no need to override the other two. + virtual Decision FilterV2(int level, const Slice& key, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const { + switch (value_type) { + case ValueType::kValue: { + bool value_changed = false; + bool rv = Filter(level, key, existing_value, new_value, &value_changed); + if (rv) { + return Decision::kRemove; + } + return value_changed ? Decision::kChangeValue : Decision::kKeep; + } + + case ValueType::kMergeOperand: { + bool rv = FilterMergeOperand(level, key, existing_value); + return rv ? Decision::kRemove : Decision::kKeep;
+ } + + case ValueType::kBlobIndex: + return Decision::kKeep; + + default: + assert(false); + return Decision::kKeep; + } + } + + // Wide column aware unified API. Called for plain values, merge operands, and + // wide-column entities; the `value_type` parameter indicates the type of the + // key-value. When the key-value is a plain value or a merge operand, the + // `existing_value` parameter contains the existing value and the + // `existing_columns` parameter is invalid (nullptr). When the key-value is a + // wide-column entity, the `existing_columns` parameter contains the wide + // columns of the existing entity and the `existing_value` parameter is + // invalid (nullptr). The `new_value` output parameter can be used to set the + // updated value or merge operand when the kChangeValue decision is made by + // the filter. The `new_columns` output parameter can be used to specify + // the pairs of column names and column values when the + // kChangeWideColumnEntity decision is returned. See the description of + // kRemoveAndSkipUntil above for the semantics of the `skip_until` output + // parameter, and see Decision above for more information on the semantics of + // the potential return values. + // + // For compatibility, the default implementation keeps all wide-column + // entities, and falls back to FilterV2 for plain values and merge operands. + // If you override this method, there is no need to override FilterV2 (or + // Filter/FilterMergeOperand). + virtual Decision FilterV3( + int level, const Slice& key, ValueType value_type, + const Slice* existing_value, const WideColumns* existing_columns, + std::string* new_value, + std::vector<std::pair<std::string, std::string>>* /* new_columns */, + std::string* skip_until) const { +#ifdef NDEBUG + (void)existing_columns; +#endif + + assert(!existing_value || !existing_columns); + assert(value_type == ValueType::kWideColumnEntity || existing_value); + assert(value_type != ValueType::kWideColumnEntity || existing_columns); + + if (value_type == ValueType::kWideColumnEntity) { + return Decision::kKeep; + } + + return FilterV2(level, key, value_type, *existing_value, new_value, + skip_until); + } + + // Internal (BlobDB) use only. Do not override in application code. + virtual BlobDecision PrepareBlobOutput(const Slice& /* key */, + const Slice& /* existing_value */, + std::string* /* new_value */) const { + return BlobDecision::kKeep; + } + + // This function is deprecated. Snapshots will always be ignored for + // `CompactionFilter`s, because we realized that not ignoring snapshots + // doesn't provide the guarantee we initially thought it would provide. + // Repeatable reads will not be guaranteed anyway. If you override the + // function and return false, we will fail the table file creation. + virtual bool IgnoreSnapshots() const { return true; } + + // Returns a name that identifies this `CompactionFilter`. + // The name will be printed to LOG file on start up for diagnosis. + const char* Name() const override = 0; + + // Internal (BlobDB) use only. Do not override in application code. + virtual bool IsStackedBlobDbInternalCompactionFilter() const { return false; } + + // In the case of BlobDB, it may be possible to reach a decision with only + // the key without reading the actual value, saving some I/O operations. + // Keys where the value is stored separately in a blob file will be + // passed to this method.
If the method returns a supported decision other + // than kUndetermined, it will be considered final and performed without + // reading the existing value. Returning kUndetermined will cause FilterV3() + // to be called to make a decision as usual. The output parameters + // `new_value` and `skip_until` are applicable to the decisions kChangeValue + // and kRemoveAndSkipUntil respectively, and have the same semantics as + // the corresponding parameters of FilterV2/V3. + virtual Decision FilterBlobByKey(int /*level*/, const Slice& /*key*/, + std::string* /*new_value*/, + std::string* /*skip_until*/) const { + return Decision::kUndetermined; + } +}; + +// Each thread of work involving creating table files will create a new +// `CompactionFilter` according to `ShouldFilterTableFileCreation()`. This +// allows the application to know about the different ongoing threads of work +// and makes it unnecessary for `CompactionFilter` to provide thread-safety. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class CompactionFilterFactory : public Customizable { + public: + virtual ~CompactionFilterFactory() {} + static const char* Type() { return "CompactionFilterFactory"; } + static Status CreateFromString( + const ConfigOptions& config_options, const std::string& name, + std::shared_ptr<CompactionFilterFactory>* result); + + // Returns whether a thread creating table files for the specified `reason` + // should invoke `CreateCompactionFilter()` and pass KVs through the returned + // filter. + virtual bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const { + // For backward compatibility, default implementation only applies + // `CompactionFilter` to files generated by compaction. + return reason == TableFileCreationReason::kCompaction; + } + + virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& context) = 0; + + // Returns a name that identifies this `CompactionFilter` factory. + virtual const char* Name() const override = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/compaction_job_stats.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/compaction_job_stats.h new file mode 100644 index 0000000000..5ff8eccc8b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/compaction_job_stats.h @@ -0,0 +1,109 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include <stddef.h> +#include <stdint.h> + +#include <string> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +struct CompactionJobStats { + CompactionJobStats() { Reset(); } + void Reset(); + // Aggregate the CompactionJobStats from another instance with this one + void Add(const CompactionJobStats& stats); + + // the elapsed time of this compaction in microseconds. + uint64_t elapsed_micros; + + // the elapsed CPU time of this compaction in microseconds. + uint64_t cpu_micros; + + // the number of compaction input records.
+ uint64_t num_input_records; + // the number of blobs read from blob files + uint64_t num_blobs_read; + // the number of compaction input files (table files) + size_t num_input_files; + // the number of compaction input files at the output level (table files) + size_t num_input_files_at_output_level; + + // the number of compaction output records. + uint64_t num_output_records; + // the number of compaction output files (table files) + size_t num_output_files; + // the number of compaction output files (blob files) + size_t num_output_files_blob; + + // true if the compaction is a full compaction (all live SST files input) + bool is_full_compaction; + // true if the compaction is a manual compaction + bool is_manual_compaction; + + // the total size of table files in the compaction input + uint64_t total_input_bytes; + // the total size of blobs read from blob files + uint64_t total_blob_bytes_read; + // the total size of table files in the compaction output + uint64_t total_output_bytes; + // the total size of blob files in the compaction output + uint64_t total_output_bytes_blob; + + // number of records being replaced by newer record associated with same key. + // this could be a new value or a deletion entry for that key so this field + // sums up all updated and deleted keys + uint64_t num_records_replaced; + + // the sum of the uncompressed input keys in bytes. + uint64_t total_input_raw_key_bytes; + // the sum of the uncompressed input values in bytes. + uint64_t total_input_raw_value_bytes; + + // the number of deletion entries before compaction. Deletion entries + // can disappear after compaction because they expired + uint64_t num_input_deletion_records; + // number of deletion records that were found obsolete and discarded + // because it is not possible to delete any more keys with this entry + // (i.e. all possible deletions resulting from it have been completed) + uint64_t num_expired_deletion_records; + + // number of corrupt keys (ParseInternalKey returned false when applied to + // the key) encountered and written out. + uint64_t num_corrupt_keys; + + // Following counters are only populated if + // options.report_bg_io_stats = true; + + // Time spent on file's Append() call. + uint64_t file_write_nanos; + + // Time spent on sync file range. + uint64_t file_range_sync_nanos; + + // Time spent on file fsync. + uint64_t file_fsync_nanos; + + // Time spent on preparing file write (fallocate, etc) + uint64_t file_prepare_write_nanos; + + // 0-terminated strings storing the first 8 bytes of the smallest and + // largest key in the output. + static const size_t kMaxPrefixLength = 8; + + std::string smallest_output_key_prefix; + std::string largest_output_key_prefix; + + // number of single-deletes which do not meet a put + uint64_t num_single_del_fallthru; + + // number of single-deletes which meet something other than a put + uint64_t num_single_del_mismatch; + + // TODO: Add output_to_penultimate_level output information +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/comparator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/comparator.h new file mode 100644 index 0000000000..ad1e71a119 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/comparator.h @@ -0,0 +1,164 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <string> + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +// The general interface for comparing two Slices is defined for both +// Comparator and some internal data structures. +class CompareInterface { + public: + virtual ~CompareInterface() {} + + // Three-way comparison. Returns value: + // < 0 iff "a" < "b", + // == 0 iff "a" == "b", + // > 0 iff "a" > "b" + // Note that Compare(a, b) also compares timestamp if timestamp size is + // non-zero. For the same user key with different timestamps, larger (newer) + // timestamp comes first. + virtual int Compare(const Slice& a, const Slice& b) const = 0; +}; + +// A Comparator object provides a total order across slices that are +// used as keys in an sstable or a database. A Comparator implementation +// must be thread-safe since rocksdb may invoke its methods concurrently +// from multiple threads. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class Comparator : public Customizable, public CompareInterface { + public: + Comparator() : timestamp_size_(0) {} + + Comparator(size_t ts_sz) : timestamp_size_(ts_sz) {} + + Comparator(const Comparator& orig) : timestamp_size_(orig.timestamp_size_) {} + + Comparator& operator=(const Comparator& rhs) { + if (this != &rhs) { + timestamp_size_ = rhs.timestamp_size_; + } + return *this; + } + + ~Comparator() override {} + + static Status CreateFromString(const ConfigOptions& opts, + const std::string& id, + const Comparator** comp); + static const char* Type() { return "Comparator"; } + + // The name of the comparator. Used to check for comparator + // mismatches (i.e., a DB created with one comparator is + // accessed using a different comparator). + // + // The client of this package should switch to a new name whenever + // the comparator implementation changes in a way that will cause + // the relative ordering of any two keys to change. + // + // Names starting with "rocksdb." are reserved and should not be used + // by any clients of this package. + const char* Name() const override = 0; + + // Compares two slices for equality. The following invariant should always + // hold (and is the default implementation): + // Equal(a, b) iff Compare(a, b) == 0 + // Overwrite only if equality comparisons can be done more efficiently than + // three-way comparisons. + virtual bool Equal(const Slice& a, const Slice& b) const { + return Compare(a, b) == 0; + } + + // Advanced functions: these are used to reduce the space requirements + // for internal data structures like index blocks. + + // If *start < limit, changes *start to a short string in [start,limit). + // Simple comparator implementations may return with *start unchanged, + // i.e., an implementation of this method that does nothing is correct.
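+ // For example, with the builtin bytewise comparator, *start == "helloworld" + // and limit == "hellozoo" may shorten *start to "hellox" (an illustrative + // outcome; leaving *start unchanged would be equally correct).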
+ virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const = 0; + + // Changes *key to a short string >= *key. + // Simple comparator implementations may return with *key unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortSuccessor(std::string* key) const = 0; + + // given two keys, determine if t is the successor of s + // BUG: only return true if no other keys starting with `t` are ordered + // before `t`. Otherwise, the auto_prefix_mode can omit entries within + // iterator bounds that have the same prefix as the upper bound but a +  // different prefix from the seek key. + virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/, + const Slice& /*t*/) const { + return false; + } + + // return true if two keys with different byte sequences can be regarded + // as equal by this comparator. + // The major use case is to determine if DataBlockHashIndex is compatible + // with the customized comparator. + virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; } + + // If it is a wrapped comparator, may return the root one. + // Returns itself if it is not wrapped. + virtual const Comparator* GetRootComparator() const { return this; } + + inline size_t timestamp_size() const { return timestamp_size_; } + + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { + return CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true); + } + + // For two events e1 and e2 whose timestamps are t1 and t2 respectively, + // Returns value: + // < 0 iff t1 < t2 + // == 0 iff t1 == t2 + // > 0 iff t1 > t2 + // Note that an all-zero byte array will be the smallest (oldest) timestamp + // of the same length, and a byte array with all bits 1 will be the largest. + // In the future, we can extend Comparator so that subclasses can specify + // both largest and smallest timestamps. + virtual int CompareTimestamp(const Slice& /*ts1*/, + const Slice& /*ts2*/) const { + return 0; + } + + virtual int CompareWithoutTimestamp(const Slice& a, bool /*a_has_ts*/, + const Slice& b, bool /*b_has_ts*/) const { + return Compare(a, b); + } + + virtual bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const { + return 0 == + CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true); + } + + private: + size_t timestamp_size_; +}; + +// Return a builtin comparator that uses lexicographic byte-wise +// ordering. The result remains the property of this module and +// must not be deleted. +extern const Comparator* BytewiseComparator(); + +// Return a builtin comparator that uses reverse lexicographic byte-wise +// ordering. +extern const Comparator* ReverseBytewiseComparator(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/compression_type.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/compression_type.h new file mode 100644 index 0000000000..bfeb00bdef --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/compression_type.h @@ -0,0 +1,40 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// DB contents are stored in a set of blocks, each of which holds a +// sequence of key,value pairs.
Each block may be compressed before +// being stored in a file. The following enum describes which +// compression method (if any) is used to compress a block. + +enum CompressionType : unsigned char { + // NOTE: do not change the values of existing entries, as these are + // part of the persistent format on disk. + kNoCompression = 0x0, + kSnappyCompression = 0x1, + kZlibCompression = 0x2, + kBZip2Compression = 0x3, + kLZ4Compression = 0x4, + kLZ4HCCompression = 0x5, + kXpressCompression = 0x6, + kZSTD = 0x7, + + // Only use kZSTDNotFinalCompression if you have to use a ZSTD lib older than + // 0.8.0 or consider a possibility of downgrading the service or copying + // the database files to another service running with an older version of + // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will + // eventually remove the option from the public API. + kZSTDNotFinalCompression = 0x40, + + // kDisableCompressionOption is used to disable some compression options. + kDisableCompressionOption = 0xff, +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/concurrent_task_limiter.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/concurrent_task_limiter.h new file mode 100644 index 0000000000..9ad741f98d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/concurrent_task_limiter.h @@ -0,0 +1,51 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <stdint.h> + +#include <string> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// This is NOT an extensible interface but a public interface for result of +// NewConcurrentTaskLimiter. Any derived classes must be RocksDB internal. +class ConcurrentTaskLimiter { + public: + virtual ~ConcurrentTaskLimiter() {} + + // Returns a name that identifies this concurrent task limiter. + virtual const std::string& GetName() const = 0; + + // Set max concurrent tasks. + // limit = 0 means no new task allowed. + // limit < 0 means no limitation. + virtual void SetMaxOutstandingTask(int32_t limit) = 0; + + // Reset to unlimited max concurrent task. + virtual void ResetMaxOutstandingTask() = 0; + + // Returns current outstanding task count. + virtual int32_t GetOutstandingTask() const = 0; +}; + +// Create a ConcurrentTaskLimiter that can be shared with multiple CFs +// across RocksDB instances to control concurrent tasks. +// +// @param name: Name of the limiter. +// @param limit: max concurrent tasks. +// limit = 0 means no new task allowed. +// limit < 0 means no limitation. +extern ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name, + int32_t limit); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/configurable.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/configurable.h new file mode 100644 index 0000000000..a200d7e86c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/configurable.h @@ -0,0 +1,390 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+class ObjectRegistry;
+class OptionTypeInfo;
+struct ColumnFamilyOptions;
+struct ConfigOptions;
+struct DBOptions;
+
+// Configurable is a base class used by the rocksdb that describes a
+// standard way of configuring objects. A Configurable object can:
+//   -> Populate itself given:
+//        - One or more "name/value" pair strings
+//        - A string representing the set of name=value properties
+//        - A map of name/value properties.
+//   -> Convert itself into its string representation
+//   -> Dump itself to a Logger
+//   -> Compare itself to another Configurable object to see if the two
+//      objects have equivalent options settings
+//
+// If a derived class calls RegisterOptions to register (by name) how its
+// options objects are to be processed, this functionality can typically be
+// handled by this class without additional overrides. Otherwise, the derived
+// class will need to implement the methods for handling the corresponding
+// functionality.
+class Configurable {
+ protected:
+  friend class ConfigurableHelper;
+  struct RegisteredOptions {
+    // The name of the options being registered
+    std::string name;
+    // Pointer to the object being registered
+    void* opt_ptr;
+    // The map of options being registered
+    const std::unordered_map<std::string, OptionTypeInfo>* type_map;
+  };
+
+ public:
+  virtual ~Configurable() {}
+
+  // Returns the raw pointer of the named options that is used by this
+  // object, or nullptr if this function is not supported.
+  // Since the return value is a raw pointer, the object owns the
+  // pointer and the caller should not delete the pointer.
+  //
+  // Note that changing the underlying options while the object
+  // is currently used by any open DB is undefined behavior.
+  // Developers should use DB::SetOption() instead to dynamically change
+  // options while the DB is open.
+  template <typename T>
+  const T* GetOptions() const {
+    return GetOptions<T>(T::kName());
+  }
+  template <typename T>
+  T* GetOptions() {
+    return GetOptions<T>(T::kName());
+  }
+  template <typename T>
+  const T* GetOptions(const std::string& name) const {
+    return reinterpret_cast<const T*>(GetOptionsPtr(name));
+  }
+  template <typename T>
+  T* GetOptions(const std::string& name) {
+    return reinterpret_cast<T*>(const_cast<void*>(GetOptionsPtr(name)));
+  }
+
+  // Configures the options for this class based on the input parameters.
+  // On successful completion, the object is updated with the settings from
+  // the opt_map.
+  // If this method fails, an attempt is made to revert the object to original
+  // state. Note that the revert may not be the original state but may be an
+  // equivalent. For example, if the object contains an option that is a
+  // shared_ptr, the shared_ptr may not be the original one but a copy (e.g.
+  // not the Cache object that was passed in, but a Cache object of the same
+  // size).
+  //
+  // The acceptable values of the name/value pairs are documented with the
+  // specific class/instance.
+  //
+  // @param config_options Controls how the arguments are processed.
+  // @param opt_map Name/value pairs of the options to update
+  // @param unused If specified, this value will return the name/value
+  //      pairs from opt_map that were NotFound for this object.
+  // @return OK If all values in the map were successfully updated
+  //      If invoke_prepare_options is true, OK also implies
+  //      PrepareOptions ran successfully.
+  // @return NotFound If any of the names in the opt_map were not valid
+  //      for this object. If unused is specified, it will contain the
+  //      collection of NotFound names.
+  // @return NotSupported If any of the names are valid but the object does
+  //      not know how to convert the value. This can happen if, for example,
+  //      there is some nested Configurable that cannot be created.
+  // @return InvalidArgument If any of the values cannot be successfully
+  //      parsed. This can also be returned if PrepareOptions encounters an
+  //      error.
+  // @see ConfigOptions for a description of the controls.
+  Status ConfigureFromMap(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, std::string>& opt_map);
+  Status ConfigureFromMap(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, std::string>& opt_map,
+      std::unordered_map<std::string, std::string>* unused);
+
+  // Updates the named option to the input value, returning OK if successful.
+  // Note that ConfigureOption does not cause PrepareOptions to be invoked.
+  // @param config_options Controls how the name/value is processed.
+  // @param name The name of the option to update
+  // @param value The value to set for the named option
+  // @return OK If the named field was successfully updated to value.
+  // @return NotFound If the name is not valid for this object.
+  // @return NotSupported If the name is valid but the object does
+  //      not know how to convert the value. This can happen if, for example,
+  //      there is some nested Configurable that cannot be created.
+  // @return InvalidArgument If the value cannot be successfully parsed.
+  Status ConfigureOption(const ConfigOptions& config_options,
+                         const std::string& name, const std::string& value);
+
+  // Configures the options for this class based on the input parameters.
+  // On successful completion, the object is updated with the settings from
+  // the opt_map. If this method fails, an attempt is made to revert the
+  // object to original state. Note that the revert may not be the original
+  // state but may be an equivalent.
+  // @see ConfigureFromMap for more details
+  // @param config_options Controls how the arguments are processed.
+  // @param opt_str string containing the values to update.
+  // @param unused If specified, this value will return the name/value
+  //      pairs from opt_map that were NotFound for this object.
+  // @return OK If all specified values were successfully updated
+  //      If invoke_prepare_options is true, OK also implies
+  //      PrepareOptions ran successfully.
+  // @return NotFound If any of the names were not valid for this object.
+  //      If unused is specified, it will contain the collection of NotFound
+  //      names.
+  // @return NotSupported If any of the names are valid but the object does
+  //      not know how to convert the value. This can happen if, for example,
+  //      there is some nested Configurable that cannot be created.
+  // @return InvalidArgument If any of the values cannot be successfully
+  //      parsed. This can also be returned if PrepareOptions encounters an
+  //      error.
+  Status ConfigureFromString(const ConfigOptions& config_options,
+                             const std::string& opts);
+
+  // Fills in result with the serialized options for this object.
+  // This is the inverse of ConfigureFromString.
+  // @param config_options Controls how serialization happens.
+  // @param result The string representation of this object.
+  // @return OK If the options for this object were successfully serialized.
+  // @return InvalidArgument If one or more of the options could not be
+  //      serialized.
+  Status GetOptionString(const ConfigOptions& config_options,
+                         std::string* result) const;
+  // Returns the serialized options for this object.
+  // This method is similar to GetOptionString with no errors.
+  // @param config_options Controls how serialization happens.
+  // @param prefix A string to prepend to every option.
+  // @return The serialized representation of the options for this object
+  std::string ToString(const ConfigOptions& config_options) const {
+    return ToString(config_options, "");
+  }
+  std::string ToString(const ConfigOptions& config_options,
+                       const std::string& prefix) const;
+
+  // Returns the list of option names associated with this configurable
+  // @param config_options Controls how the names are returned
+  // @param result The set of option names for this object. Note that
+  //      options that are deprecated or aliases are not returned.
+  // @return OK on success.
+  Status GetOptionNames(const ConfigOptions& config_options,
+                        std::unordered_set<std::string>* result) const;
+
+  // Returns the value of the option associated with the input name
+  // This method is the functional inverse of ConfigureOption
+  // @param config_options Controls how the value is returned
+  // @param name The name of the option to return a value for.
+  // @param value The returned value associated with the named option.
+  // @return OK If the named field was successfully updated to value.
+  // @return NotFound If the name is not valid for this object.
+  // @return InvalidArgument If the name is valid for this object but
+  //      its value cannot be serialized.
+  virtual Status GetOption(const ConfigOptions& config_options,
+                           const std::string& name, std::string* value) const;
+
+  // Checks to see if this Configurable is equivalent to other.
+  // This method assumes that the two objects are of the same class.
+  // @param config_options Controls how the options are compared.
+  // @param other The other object to compare to.
+  // @param mismatch If the objects do not match, this parameter contains
+  //      the name of the option that triggered the match failure.
+  // @return True if the objects match, false otherwise.
+  virtual bool AreEquivalent(const ConfigOptions& config_options,
+                             const Configurable* other,
+                             std::string* name) const;
+
+  // Returns a pretty-printed, human-readable version of the options.
+  // This method is typically used to dump the options to a log file.
+  // Classes should override this method.
+  virtual std::string GetPrintableOptions() const { return ""; }
+
+  // Validates that the settings are valid/consistent and performs any object
+  // initialization required by this object. This method may be called as part
+  // of Configure (if invoke_prepare_options is set), or may be invoked
+  // separately.
+  //
+  // Once an object has been prepared, non-mutable options can no longer be
+  // updated.
+  //
+  // Classes must override this method to provide any implementation-specific
+  // initialization, such as opening log files or setting up cache parameters.
+  // Implementations should be idempotent (e.g. don't re-open the log file or
+  // reconfigure the cache), as there is the potential this method can be
+  // called more than once.
+  //
+  // By default, this method will also prepare all nested (Inner and
+  // OptionType::kConfigurable) objects.
+  //
+  // @param config_options Controls how the object is prepared. Also contains
+  //      a Logger and Env that can be used to initialize this object.
+  // @return OK If the object was successfully initialized.
+  // @return InvalidArgument If this object could not be successfully
+  //      initialized.
+  virtual Status PrepareOptions(const ConfigOptions& config_options);
+
+  // Checks to see if the settings are valid for this object.
+  // This method checks to see if the input DBOptions and ColumnFamilyOptions
+  // are valid for the settings of this object. For example, an Env might not
+  // support certain mmap modes or a TableFactory might require certain
+  // settings.
+  //
+  // By default, this method will also validate all nested (Inner and
+  // OptionType::kConfigurable) objects.
+  //
+  // @param db_opts The DBOptions to validate
+  // @param cf_opts The ColumnFamilyOptions to validate
+  // @return OK if the options are valid
+  // @return InvalidArgument If the arguments are not valid for the options
+  //      of the current object.
+  virtual Status ValidateOptions(const DBOptions& db_opts,
+                                 const ColumnFamilyOptions& cf_opts) const;
+
+  // Splits the input opt_value into the ID field and the remaining options.
+  // The input opt_value can be in the form of "name" or "name=value
+  // [;name=value]". The first form uses the "name" as an id with no options.
+  // The latter form converts the input into a map of name=value pairs and
+  // sets "id" to the "id" value from the map.
+  // @param opt_value The value to split into id and options
+  // @param id The id field from the opt_value
+  // @param options The remaining name/value pairs from the opt_value
+  // @param default_id If specified and there is no id field in the map, this
+  //      value is returned as the ID
+  // @return OK if the value was converted to a map successfully and an ID was
+  //      found.
+  // @return InvalidArgument if the value could not be converted to a map or
+  //      there is no id property in the map.
+  static Status GetOptionsMap(
+      const std::string& opt_value, const std::string& default_id,
+      std::string* id, std::unordered_map<std::string, std::string>* options);
+
+ protected:
+  // Returns the raw pointer for the associated named option.
+  // The name is typically the name of an option registered via the
+  // RegisterOptions method.
+  // Classes may override this method to provide further specialization (such
+  // as returning a sub-option).
+  //
+  // The default implementation looks at the registered options. If the
+  // input name matches that of a registered option, the pointer registered
+  // with that name is returned.
+  // e.g., RegisterOptions("X", &my_ptr, ...); GetOptionsPtr("X") returns
+  // "my_ptr"
+  virtual const void* GetOptionsPtr(const std::string& name) const;
+
+  // Method for allowing options to be configured outside of the normal
+  // registered options framework. Classes may override this method if they
+  // wish to support non-standard options implementations (such as configuring
+  // themselves from constant or simple ":"-separated strings).
+  //
+  // The default implementation does nothing and returns OK
+  virtual Status ParseStringOptions(const ConfigOptions& config_options,
+                                    const std::string& opts_str);
+
+  // Internal method to configure an object from a map of name-value options.
+  // This method uses the input config_options to drive the configuration of
+  // the options in opt_map.
+  // Any option name that cannot be found from the input set will be
+  // returned in "unused".
+  //
+  // Classes may override this method to extend the functionality if required.
+  // @param config_options Controls how the options are configured and errors
+  //      handled.
+  // @param opts_map The set of options to configure
+  // @param unused Any options from opt_map that were not configured.
+  // @returns a Status based on the rules outlined in ConfigureFromMap
+  virtual Status ConfigureOptions(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, std::string>& opts_map,
+      std::unordered_map<std::string, std::string>* unused);
+
+  // Method that configures the specific opt_name from opt_value.
+  // By default, this method calls opt_info.ParseOption with the
+  // input parameters.
+  // Classes may override this method to extend the functionality, or
+  // change the returned Status.
+  virtual Status ParseOption(const ConfigOptions& config_options,
+                             const OptionTypeInfo& opt_info,
+                             const std::string& opt_name,
+                             const std::string& opt_value, void* opt_ptr);
+
+  // Internal method to see if the single option name/info matches for this
+  // and that object. Classes may override this method to change its behavior.
+  // @param config_options Controls how the options are being matched
+  // @param opt_info The OptionTypeInfo registered for this option name
+  //      that controls what field is matched (offset) and how (type).
+  // @param name The name associated with this opt_info.
+  // @param this_ptr The base pointer to compare to. This is the object
+  //      registered for this OptionTypeInfo.
+  // @param that_ptr The other pointer to compare to. This is the object
+  //      registered for this OptionTypeInfo.
+  // @param bad_name If the match fails, the name of the option that failed to
+  //      match.
+  virtual bool OptionsAreEqual(const ConfigOptions& config_options,
+                               const OptionTypeInfo& opt_info,
+                               const std::string& name,
+                               const void* const this_ptr,
+                               const void* const that_ptr,
+                               std::string* bad_name) const;
+
+  // Internal method to serialize options (ToString)
+  // Classes may override this method to change its behavior.
+  virtual std::string SerializeOptions(const ConfigOptions& config_options,
+                                       const std::string& header) const;
+
+  // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt)
+  virtual std::string GetOptionName(const std::string& long_name) const;
+
+  // Registers the input name with the options and associated map.
+  // When classes register their options in this manner, most of the
+  // functionality (excluding unknown options and validate/prepare) is
+  // implemented by the base class.
+  //
+  // This method should be called in the class constructor to register the
+  // option set for this object. For example, to register the options
+  // associated with the BlockBasedTableFactory, the constructor calls this
+  // method passing in:
+  // - the name of the options ("BlockBasedTableOptions");
+  // - the options object (the BlockBasedTableOptions object for this object);
+  // - the options type map for the BlockBasedTableOptions.
+  // This registration allows the Configurable class to process the option
+  // values associated with the BlockBasedTableOptions without further code in
+  // the derived class. A sketch of such a registration call follows below.
+  //
+  // @param name The name of this set of options (@see GetOptionsPtr)
+  // @param opt_ptr Pointer to the options to associate with this name
+  // @param opt_map Options map that controls how this option is configured.
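+  // [Illustrative example, not part of the upstream header] A sketch of how
+  // a derived class might register its options in its constructor;
+  // "MyFactory", "my_options_", and "my_options_type_map" are hypothetical
+  // names, not defined anywhere in RocksDB:
+  //
+  //   MyFactory::MyFactory() {
+  //     RegisterOptions("MyFactoryOptions", &my_options_,
+  //                     &my_options_type_map);
+  //   }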
+  template <typename T>
+  void RegisterOptions(
+      T* opt_ptr,
+      const std::unordered_map<std::string, OptionTypeInfo>* opt_map) {
+    RegisterOptions(T::kName(), opt_ptr, opt_map);
+  }
+  void RegisterOptions(
+      const std::string& name, void* opt_ptr,
+      const std::unordered_map<std::string, OptionTypeInfo>* opt_map);
+
+  // Returns true if there are registered options for this Configurable object
+  inline bool HasRegisteredOptions() const { return !options_.empty(); }
+
+ private:
+  // Contains the collection of options (name, opt_ptr, opt_map) associated
+  // with this object. This collection is typically set in the constructor of
+  // the Configurable option via RegisterOptions.
+  std::vector<RegisteredOptions> options_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/convenience.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/convenience.h
new file mode 100644
index 0000000000..7ce676df0c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/convenience.h
@@ -0,0 +1,466 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Env;
+class Logger;
+class ObjectRegistry;
+
+struct ColumnFamilyOptions;
+struct DBOptions;
+struct Options;
+
+// ConfigOptions containing the parameters/controls for
+// comparing objects and converting to/from strings.
+// These settings control how the methods
+// treat errors (e.g. ignore_unknown_objects), the format
+// of the serialization (e.g. delimiter), and how to compare
+// options (sanity_level).
+struct ConfigOptions {
+  // Constructs a new ConfigOptions with a new object registry.
+  // This method should only be used when a DBOptions is not available,
+  // else registry settings may be lost
+  ConfigOptions();
+
+  // Constructs a new ConfigOptions using the settings from
+  // the input DBOptions. Currently constructs a new object registry.
+  explicit ConfigOptions(const DBOptions&);
+
+  // This enum defines the RocksDB options sanity level.
+  enum SanityLevel : unsigned char {
+    kSanityLevelNone = 0x01,  // Performs no sanity check at all.
+    // Performs minimum check to ensure the RocksDB instance can be
+    // opened without corrupting / mis-interpreting the data.
+    kSanityLevelLooselyCompatible = 0x02,
+    // Perform exact match sanity check.
+    kSanityLevelExactMatch = 0xFF,
+  };
+
+  enum Depth {
+    kDepthDefault,   // Traverse nested options that are not flagged as
+                     // "shallow"
+    kDepthShallow,   // Do not traverse into any nested options
+    kDepthDetailed,  // Traverse nested options, overriding the options
+                     // shallow setting
+  };
+
+  // When true, any unused options will be ignored and OK will be returned
+  bool ignore_unknown_options = false;
+
+  // When true, any unsupported options will be ignored and OK will be
+  // returned
+  bool ignore_unsupported_options = true;
+
+  // If the strings are escaped (old-style?)
+  bool input_strings_escaped = true;
+
+  // Whether or not to invoke PrepareOptions after configure is called.
+  bool invoke_prepare_options = true;
+
+  // Options can be marked as Mutable (OptionTypeInfo::IsMutable()) or not.
+  // When "mutable_options_only=false", all options are evaluated.
+  // When "mutable_options_only=true", any option not marked as Mutable is
+  // either ignored (in the case of string/equals methods) or results in an
+  // error (in the case of Configure).
+  bool mutable_options_only = false;
+
+  // The separator between options when converting to a string
+  std::string delimiter = ";";
+
+  // Controls how to traverse options during print/match stages
+  Depth depth = Depth::kDepthDefault;
+
+  // Controls how options are serialized
+  // Controls how pedantic the comparison must be for equivalency
+  SanityLevel sanity_level = SanityLevel::kSanityLevelExactMatch;
+  // `file_readahead_size` is used for readahead for the option file.
+  size_t file_readahead_size = 512 * 1024;
+
+  // The environment to use for this option
+  Env* env = Env::Default();
+
+  // The object registry to use for these options
+  std::shared_ptr<ObjectRegistry> registry;
+
+  bool IsShallow() const { return depth == Depth::kDepthShallow; }
+  bool IsDetailed() const { return depth == Depth::kDepthDetailed; }
+
+  bool IsCheckDisabled() const {
+    return sanity_level == SanityLevel::kSanityLevelNone;
+  }
+
+  bool IsCheckEnabled(SanityLevel level) const {
+    return (level > SanityLevel::kSanityLevelNone && level <= sanity_level);
+  }
+};
+
+// The following set of functions provide a way to construct RocksDB Options
+// from a string or a string-to-string map. Here is the general rule of
+// setting option values from strings by type. Some RocksDB types are also
+// supported in these APIs. Please refer to the comment of the function itself
+// to find more information about how to config those RocksDB types.
+//
+// * Strings:
+//   Strings will be used as values directly without any truncating or
+//   trimming.
+//
+// * Booleans:
+//   - "true" or "1" => true
+//   - "false" or "0" => false.
+//   [Example]:
+//   - {"optimize_filters_for_hits", "1"} in GetColumnFamilyOptionsFromMap, or
+//   - "optimize_filters_for_hits=true" in GetColumnFamilyOptionsFromString.
+//
+// * Integers:
+//   Integers are converted directly from string, in addition to the following
+//   units that we support:
+//   - 'k' or 'K' => 2^10
+//   - 'm' or 'M' => 2^20
+//   - 'g' or 'G' => 2^30
+//   - 't' or 'T' => 2^40  // only for unsigned int with sufficient bits.
+//   [Example]:
+//   - {"arena_block_size", "19G"} in GetColumnFamilyOptionsFromMap, or
+//   - "arena_block_size=19G" in GetColumnFamilyOptionsFromString.
+//
+// * Doubles / Floating Points:
+//   Doubles / Floating Points are converted directly from string. Note that
+//   currently we do not support units.
+//   [Example]:
+//   - {"memtable_prefix_bloom_size_ratio", "0.1"} in
+//     GetColumnFamilyOptionsFromMap, or
+//   - "memtable_prefix_bloom_size_ratio=0.1" in
+//     GetColumnFamilyOptionsFromString.
+// * Array / Vectors:
+//   An array is specified by a list of values, where ':' is used as
+//   the delimiter to separate each value.
+//   [Example]:
+//   - {"compression_per_level", "kNoCompression:kSnappyCompression"}
+//     in GetColumnFamilyOptionsFromMap, or
+//   - "compression_per_level=kNoCompression:kSnappyCompression" in
+//     GetColumnFamilyOptionsFromString.
+// * Enums:
+//   The valid values of each enum are identical to the names of its constants.
+//   [Example]:
+//   - CompressionType: valid values are "kNoCompression",
+//     "kSnappyCompression", "kZlibCompression", "kBZip2Compression", ...
+//   - CompactionStyle: valid values are "kCompactionStyleLevel",
+//     "kCompactionStyleUniversal", "kCompactionStyleFIFO", and
+//     "kCompactionStyleNone".
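+// [Illustrative example, not part of the upstream header] Putting the value
+// formats above together with GetColumnFamilyOptionsFromString (declared
+// later in this file); a sketch:
+//
+//   ConfigOptions config_options;
+//   ColumnFamilyOptions base_opts;
+//   ColumnFamilyOptions new_opts;
+//   Status s = GetColumnFamilyOptionsFromString(
+//       config_options, base_opts,
+//       "write_buffer_size=64M;compression=kZSTD;"
+//       "compression_per_level=kNoCompression:kSnappyCompression",
+//       &new_opts);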
+//
+
+// Take a ConfigOptions `config_options` and a ColumnFamilyOptions
+// "base_options" as the default option in addition to a map "opts_map" of
+// option name to option value to construct the new ColumnFamilyOptions
+// "new_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in ColumnFamilyOptions:
+//
+// * table_factory:
+//   table_factory can be configured using our custom nested-option syntax.
+//
+//   {option_a=value_a; option_b=value_b; option_c=value_c; ... }
+//
+//   A nested option is enclosed by two curly braces, within which there are
+//   multiple option assignments. Each assignment is of the form
+//   "variable_name=value;".
+//
+//   Currently we support the following types of TableFactory:
+//   - BlockBasedTableFactory:
+//     Use name "block_based_table_factory" to initialize table_factory with
+//     BlockBasedTableFactory. Its BlockBasedTableFactoryOptions can be
+//     configured using the nested-option syntax.
+//     [Example]:
+//     * {"block_based_table_factory", "{block_cache=1M;block_size=4k;}"}
+//       is equivalent to assigning table_factory with a BlockBasedTableFactory
+//       that has 1M LRU block-cache with block size equal to 4k:
+//         ColumnFamilyOptions cf_opt;
+//         BlockBasedTableOptions blk_opt;
+//         blk_opt.block_cache = NewLRUCache(1 * 1024 * 1024);
+//         blk_opt.block_size = 4 * 1024;
+//         cf_opt.table_factory.reset(NewBlockBasedTableFactory(blk_opt));
+//   - PlainTableFactory:
+//     Use name "plain_table_factory" to initialize table_factory with
+//     PlainTableFactory. Its PlainTableFactoryOptions can be configured using
+//     the nested-option syntax.
+//     [Example]:
+//     * {"plain_table_factory", "{user_key_len=66;bloom_bits_per_key=20;}"}
+//
+// * memtable_factory:
+//   Use "memtable" to config memtable_factory. Here are the supported
+//   memtable factories:
+//   - SkipList:
+//     Pass "skip_list:<lookahead>" to config memtable to use SkipList,
+//     or simply "skip_list" to use the default SkipList.
+//     [Example]:
+//     * {"memtable", "skip_list:5"} is equivalent to setting
+//       memtable to SkipListFactory(5).
+//   - PrefixHash:
+//     Pass "prefix_hash:<hash_bucket_count>" to config memtable
+//     to use PrefixHash, or simply "prefix_hash" to use the default
+//     PrefixHash.
+//     [Example]:
+//     * {"memtable", "prefix_hash:1000"} is equivalent to setting
+//       memtable to NewHashSkipListRepFactory(hash_bucket_count).
+//   - HashLinkedList:
+//     Pass "hash_linkedlist:<hash_bucket_count>" to config memtable
+//     to use HashLinkedList, or simply "hash_linkedlist" to use the default
+//     HashLinkedList.
+//     [Example]:
+//     * {"memtable", "hash_linkedlist:1000"} is equivalent to
+//       setting memtable to NewHashLinkListRepFactory(1000).
+//   - VectorRepFactory:
+//     Pass "vector:<count>" to config memtable to use VectorRepFactory,
+//     or simply "vector" to use the default Vector memtable.
+//     [Example]:
+//     * {"memtable", "vector:1024"} is equivalent to setting memtable
+//       to VectorRepFactory(1024).
+//
+// * compression_opts:
+//   Use "compression_opts" to config compression_opts. The value format
+//   is of the form "<window_bits>:<level>:<strategy>:<max_dict_bytes>".
+//   [Example]:
+//   * {"compression_opts", "4:5:6:7"} is equivalent to setting:
+//       ColumnFamilyOptions cf_opt;
+//       cf_opt.compression_opts.window_bits = 4;
+//       cf_opt.compression_opts.level = 5;
+//       cf_opt.compression_opts.strategy = 6;
+//       cf_opt.compression_opts.max_dict_bytes = 7;
+//
+// @param config_options controls how the map is processed.
+// @param base_options the default options of the output "new_options".
+// @param opts_map an option name to value map for specifying how
+//     "new_options" should be set.
+// @param new_options the resulting options based on "base_options" with the
+//     change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+//     prefixed by '\' in the values of the opts_map will be further converted
+//     back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+//     instead of resulting in an unknown-option error.
+// @return Status::OK() on success. Otherwise, a non-ok status indicating
+//     error will be returned, and "new_options" will be set to "base_options".
+// @return Status::NotFound means one (or more) of the option names in
+//     the opts_map is not valid for this option
+// @return Status::NotSupported means we do not know how to parse one of the
+//     values for this option
+// @return Status::InvalidArgument means one of the option values is not
+//     valid for this option.
+Status GetColumnFamilyOptionsFromMap(
+    const ConfigOptions& config_options,
+    const ColumnFamilyOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    ColumnFamilyOptions* new_options);
+
+// Take a ConfigOptions `config_options` and a DBOptions "base_options" as
+// the default option in addition to a map "opts_map" of option name to
+// option value to construct the new DBOptions "new_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in DBOptions:
+//
+// * rate_limiter_bytes_per_sec:
+//   RateLimiter can be configured directly by specifying its bytes_per_sec.
+//   [Example]:
+//   - Passing {"rate_limiter_bytes_per_sec", "1024"} is equivalent to
+//     passing NewGenericRateLimiter(1024) to rate_limiter_bytes_per_sec.
+//
+// @param config_options controls how the map is processed.
+// @param base_options the default options of the output "new_options".
+// @param opts_map an option name to value map for specifying how
+//     "new_options" should be set.
+// @param new_options the resulting options based on "base_options" with the
+//     change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+//     prefixed by '\' in the values of the opts_map will be further converted
+//     back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+//     instead of resulting in an unknown-option error.
+// @return Status::OK() on success. Otherwise, a non-ok status indicating
+//     error will be returned, and "new_options" will be set to "base_options".
+// @return Status::NotFound means one (or more) of the option names in
+//     the opts_map is not valid for this option
+// @return Status::NotSupported means we do not know how to parse one of the
+//     values for this option
+// @return Status::InvalidArgument means one of the option values is not
+//     valid for this option.
+Status GetDBOptionsFromMap(
+    const ConfigOptions& cfg_options, const DBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    DBOptions* new_options);
+
+// Take a ConfigOptions `config_options` and a BlockBasedTableOptions
+// "table_options" as the default option in addition to a map "opts_map" of
+// option name to option value to construct the new BlockBasedTableOptions
+// "new_table_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in BlockBasedTableOptions:
+//
+// * filter_policy:
+//   We currently only support the following FilterPolicy in the convenience
+//   functions:
+//   - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]"
+//     to specify BloomFilter. The above string is equivalent to calling
+//     NewBloomFilterPolicy(bits_per_key, use_block_based_builder).
+//     [Example]:
+//     - Pass {"filter_policy", "bloomfilter:4:true"} in
+//       GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits
+//       per key and use_block_based_builder enabled.
+//
+// * block_cache / block_cache_compressed:
+//   We currently only support LRU cache in the GetOptions API. The LRU
+//   cache can be set by directly specifying its size.
+//   [Example]:
+//   - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is
+//     equivalent to setting block_cache using NewLRUCache(1024 * 1024).
+//
+// @param config_options controls how the map is processed.
+// @param table_options the default options of the output "new_table_options".
+// @param opts_map an option name to value map for specifying how
+//     "new_table_options" should be set.
+// @param new_table_options the resulting options based on "table_options"
+//     with the change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+//     prefixed by '\' in the values of the opts_map will be further converted
+//     back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+//     instead of resulting in an unknown-option error.
+// @return Status::OK() on success. Otherwise, a non-ok status indicating
+//     error will be returned, and "new_table_options" will be set to
+//     "table_options".
+Status GetBlockBasedTableOptionsFromMap(
+    const ConfigOptions& config_options,
+    const BlockBasedTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    BlockBasedTableOptions* new_table_options);
+
+// Take a ConfigOptions `config_options` and a default PlainTableOptions
+// "table_options" as the default option in addition to a map "opts_map" of
+// option name to option value to construct the new PlainTableOptions
+// "new_table_options".
+//
+// @param config_options controls how the map is processed.
+// @param table_options the default options of the output "new_table_options".
+// @param opts_map an option name to value map for specifying how
+//     "new_table_options" should be set.
+// @param new_table_options the resulting options based on "table_options"
+//     with the change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+//     prefixed by '\' in the values of the opts_map will be further converted
+//     back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+//     instead of resulting in an unknown-option error.
+// @return Status::OK() on success. Otherwise, a non-ok status indicating
+//     error will be returned, and "new_table_options" will be set to
+//     "table_options".
+Status GetPlainTableOptionsFromMap(
+    const ConfigOptions& config_options, const PlainTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    PlainTableOptions* new_table_options);
+
+// Take a ConfigOptions `config_options`, a string representation of option
+// names and values, apply them into the base_options, and return the new
+// options as a result. The string has the following format:
+//   "write_buffer_size=1024;max_write_buffer_number=2"
+// Nested options config is also possible. For example, you can define
+// BlockBasedTableOptions as part of the string for block-based table factory:
+//   "write_buffer_size=1024;block_based_table_factory={block_size=4k};"
+//   "max_write_buffer_number=2"
+//
+Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options,
+                                        const ColumnFamilyOptions& base_options,
+                                        const std::string& opts_str,
+                                        ColumnFamilyOptions* new_options);
+
+Status GetDBOptionsFromString(const ConfigOptions& config_options,
+                              const DBOptions& base_options,
+                              const std::string& opts_str,
+                              DBOptions* new_options);
+
+Status GetStringFromDBOptions(const ConfigOptions& config_options,
+                              const DBOptions& db_options,
+                              std::string* opts_str);
+
+Status GetStringFromDBOptions(std::string* opts_str,
+                              const DBOptions& db_options,
+                              const std::string& delimiter = "; ");
+
+Status GetStringFromColumnFamilyOptions(const ConfigOptions& config_options,
+                                        const ColumnFamilyOptions& cf_options,
+                                        std::string* opts_str);
+Status GetStringFromColumnFamilyOptions(std::string* opts_str,
+                                        const ColumnFamilyOptions& cf_options,
+                                        const std::string& delimiter = "; ");
+Status GetStringFromCompressionType(std::string* compression_str,
+                                    CompressionType compression_type);
+
+std::vector<CompressionType> GetSupportedCompressions();
+
+Status GetBlockBasedTableOptionsFromString(
+    const ConfigOptions& config_options,
+    const BlockBasedTableOptions& table_options, const std::string& opts_str,
+    BlockBasedTableOptions* new_table_options);
+
+Status GetPlainTableOptionsFromString(const ConfigOptions& config_options,
+                                      const PlainTableOptions& table_options,
+                                      const std::string& opts_str,
+                                      PlainTableOptions* new_table_options);
+
+Status GetMemTableRepFactoryFromString(
+    const std::string& opts_str,
+    std::unique_ptr<MemTableRepFactory>* new_mem_factory);
+
+Status GetOptionsFromString(const Options& base_options,
+                            const std::string& opts_str, Options* new_options);
+Status GetOptionsFromString(const ConfigOptions& config_options,
+                            const Options& base_options,
+                            const std::string& opts_str, Options* new_options);
+
+Status StringToMap(const std::string& opts_str,
+                   std::unordered_map<std::string, std::string>* opts_map);
+
+// Request stopping background work, if wait is true wait until it's done
+void CancelAllBackgroundWork(DB* db, bool wait = false);
+
+// Delete files which are entirely in the given range
+// Could leave some keys in the range which are in files which are not
+// entirely in the range. Also leaves L0 files regardless of whether they're
+// in the range.
+// Snapshots before the delete might not see the data in the given range.
+Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
+                          const Slice* begin, const Slice* end,
+                          bool include_end = true);
+
+// Delete files in multiple ranges at once.
+// Deleting files in a lot of ranges one at a time can be slow; use this API
+// for better performance in that case, as sketched below.
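+// [Illustrative example, not part of the upstream header] Deleting files in
+// two key ranges with one call; "db" and the keys are hypothetical:
+//
+//   Slice a("a"), b("b"), c("c"), d("d");
+//   RangePtr ranges[2] = {RangePtr(&a, &b), RangePtr(&c, &d)};
+//   Status s = DeleteFilesInRanges(db, db->DefaultColumnFamily(), ranges, 2);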
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+                           const RangePtr* ranges, size_t n,
+                           bool include_end = true);
+
+// Verify the checksum of file
+Status VerifySstFileChecksum(const Options& options,
+                             const EnvOptions& env_options,
+                             const std::string& file_path);
+
+// Verify the checksum of file
+Status VerifySstFileChecksum(const Options& options,
+                             const EnvOptions& env_options,
+                             const ReadOptions& read_options,
+                             const std::string& file_path,
+                             const SequenceNumber& largest_seqno = 0);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/customizable.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/customizable.h
new file mode 100644
index 0000000000..076aca6590
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/customizable.h
@@ -0,0 +1,229 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/configurable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+/**
+ * Customizable is a base class used by the rocksdb that describes a
+ * standard way of configuring and creating objects. Customizable objects
+ * are configurable objects that can be created from an ObjectRegistry.
+ *
+ * Customizable classes are used when there are multiple potential
+ * implementations of a class for use by RocksDB (e.g. Table, Cache,
+ * MergeOperator, etc). The abstract base class is expected to define a method
+ * declaring its type and a factory method for creating one of these, such as:
+ *   static const char *Type() { return "Table"; }
+ *   static Status CreateFromString(const ConfigOptions& options,
+ *                                  const std::string& id,
+ *                                  std::shared_ptr<TableFactory>* result);
+ * The "Type" string is expected to be unique (no two base classes are the
+ * same type). This factory is expected, based on the options and id, to
+ * create and return the appropriate derived type of the customizable class
+ * (e.g. BlockBasedTableFactory, PlainTableFactory, etc). For extension
+ * developers, helper classes and methods are provided for writing this
+ * factory.
+ *
+ * Instances of a Customizable class need to define:
+ * - A "static const char *kClassName()" method. This method defines the name
+ *   of the class instance (e.g. BlockBasedTable, LRUCache) and is used by the
+ *   CheckedCast method.
+ * - The Name() of the object. This name is used when creating and saving
+ *   instances of this class. Typically this name will be the same as
+ *   kClassName().
+ *
+ * Additionally, Customizable classes should register any options used to
+ * configure themselves with the Configurable subsystem.
+ *
+ * When a Customizable is being created, the "name" property specifies
+ * the name of the instance being created.
+ * For custom objects, their configuration and name can be specified by:
+ *   [prop]={name=X;option 1 = value1[; option2=value2...]}
+ *
+ *   [prop].name=X
+ *   [prop].option1 = value1
+ *
+ *   [prop].name=X
+ *   X.option1 = value1
+ */
+class Customizable : public Configurable {
+ public:
+  ~Customizable() override {}
+
+  // Returns the name of this class of Customizable
+  virtual const char* Name() const = 0;
+
+  // Returns an identifier for this Customizable.
+  // This could be its name or something more complex (like its URL/pattern).
+  // Used for pretty printing.
+  virtual std::string GetId() const {
+    std::string id = Name();
+    return id;
+  }
+
+  // This is typically determined by if the input name matches the
+  // name of this object.
+  // This method is typically used in conjunction with CheckedCast to find the
+  // derived class instance from its base. For example, if you have an Env
+  // and want the "Default" env, you would call IsInstanceOf("Default") to get
+  // the default implementation. This method should be used when you need a
+  // specific derivative or implementation of a class.
+  //
+  // Intermediary caches (such as SharedCache) may wish to override this method
+  // to check for the intermediary name (SharedCache). Classes with multiple
+  // potential names (e.g. "PosixEnv", "DefaultEnv") may also wish to override
+  // this method.
+  //
+  // Note that IsInstanceOf only uses the "is-a" relationship and not "has-a".
+  // Wrapped classes that have an Inner "has-a" should not be returned.
+  //
+  // @param name The name of the instance to find.
+  // Returns true if the class is an instance of the input name.
+  virtual bool IsInstanceOf(const std::string& name) const {
+    if (name.empty()) {
+      return false;
+    } else if (name == Name()) {
+      return true;
+    } else {
+      const char* nickname = NickName();
+      if (nickname != nullptr && name == nickname) {
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
+  const void* GetOptionsPtr(const std::string& name) const override {
+    const void* ptr = Configurable::GetOptionsPtr(name);
+    if (ptr != nullptr) {
+      return ptr;
+    } else {
+      const auto inner = Inner();
+      if (inner != nullptr) {
+        return inner->GetOptionsPtr(name);
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
+  // Returns the named instance of the Customizable as a T*, or nullptr if not
+  // found. This method uses IsInstanceOf/Inner to find the appropriate class
+  // instance and then casts it to the expected return type.
+  template <typename T>
+  const T* CheckedCast() const {
+    if (IsInstanceOf(T::kClassName())) {
+      return static_cast<const T*>(this);
+    } else {
+      const auto inner = Inner();
+      if (inner != nullptr) {
+        return inner->CheckedCast<T>();
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
+  template <typename T>
+  T* CheckedCast() {
+    if (IsInstanceOf(T::kClassName())) {
+      return static_cast<T*>(this);
+    } else {
+      auto inner = const_cast<Customizable*>(Inner());
+      if (inner != nullptr) {
+        return inner->CheckedCast<T>();
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
+  // Checks to see if this Customizable is equivalent to other.
+  // This method assumes that the two objects are of the same class.
+  // @param config_options Controls how the options are compared.
+  // @param other The other object to compare to.
+  // @param mismatch If the objects do not match, this parameter contains
+  //      the name of the option that triggered the match failure.
+  // @return True if the objects match, false otherwise.
+  // @see Configurable::AreEquivalent for more details
+  bool AreEquivalent(const ConfigOptions& config_options,
+                     const Configurable* other,
+                     std::string* mismatch) const override;
+
+  // Gets the value of the option associated with the input name
+  // @see Configurable::GetOption for more details
+  Status GetOption(const ConfigOptions& config_options, const std::string& name,
+                   std::string* value) const override;
+
+  // Helper method for parsing the opt_value into the corresponding
+  // options for use in potentially creating a new Customizable object (this
+  // method is primarily a support method for LoadSharedObject et al for new
+  // Customizable objects). The opt_value may be either name-value pairs
+  // separated by ";" (a=b; c=d), or a simple name (a). In order to create a
+  // new Customizable, the ID is determined by:
+  // - If the value is a simple name (e.g. "BlockBasedTable"), the id is this
+  //   name;
+  // - Otherwise, if there is a "id=value", the id is set to "value"
+  // - Otherwise, if the input customizable is not null, custom->GetId is used
+  // - Otherwise, an error is returned.
+  //
+  // If the opt_value is name-value pairs, these pairs will be returned in
+  // options (without the id pair). If the ID being returned matches the ID of
+  // the input custom object, then the options from the input object will also
+  // be added to the returned options.
+  //
+  // This method returns non-OK if the ID could not be found, or if the
+  // opt_value could not be parsed into name-value pairs.
+  static Status GetOptionsMap(
+      const ConfigOptions& config_options, const Customizable* custom,
+      const std::string& opt_value, std::string* id,
+      std::unordered_map<std::string, std::string>* options);
+
+  // Helper method to configure a new object with the supplied options.
+  // If the object is not null and invoke_prepare_options=true, the object
+  // will be configured and prepared.
+  // Returns success if the object is properly configured and (optionally)
+  // prepared.
+  // Returns InvalidArgument if the object is nullptr and there are options in
+  // the map.
+  // Otherwise, returns the result of the ConfigureFromMap or PrepareOptions.
+  static Status ConfigureNewObject(
+      const ConfigOptions& config_options, Customizable* object,
+      const std::unordered_map<std::string, std::string>& options);
+
+  // Returns the inner class when a Customizable implements a has-a (wrapped)
+  // relationship. Derived classes that implement a has-a must override this
+  // method in order to get CheckedCast to function properly.
+  virtual const Customizable* Inner() const { return nullptr; }
+
+ protected:
+  // Generates an ID specific for this instance of the customizable.
+  // The unique ID is of the form <name>:<addr>#pid, where:
+  // - name is the Name() of this object;
+  // - addr is the memory address of this object;
+  // - pid is the process ID for this process.
+  // Note that if obj1 and obj2 have the same unique IDs, they must be the
+  // same. However, if an object is deleted and recreated, it may have the
+  // same unique ID as a predecessor.
+  //
+  // This method is useful for objects (especially ManagedObjects) that
+  // wish to generate an ID that is specific for this instance and wish to
+  // override the GetId() method.
+  std::string GenerateIndividualId() const;
+
+  // Some classes have both a class name (e.g. PutOperator) and a nickname
+  // (e.g. put). Classes can override this method to return a
+  // nickname. Nicknames can be used by IsInstanceOf and object creation.
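+  // [Illustrative example, not part of the upstream header] A hypothetical
+  // operator class named "PutOperator" could expose its nickname as:
+  //
+  //   const char* NickName() const override { return "put"; }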
+  virtual const char* NickName() const { return ""; }
+
+  // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt)
+  std::string GetOptionName(const std::string& long_name) const override;
+  std::string SerializeOptions(const ConfigOptions& options,
+                               const std::string& prefix) const override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/data_structure.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/data_structure.h
new file mode 100644
index 0000000000..ffab82c514
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/data_structure.h
@@ -0,0 +1,186 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <assert.h>
+
+#include <cstdint>
+#include <initializer_list>
+#include <type_traits>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace detail {
+int CountTrailingZeroBitsForSmallEnumSet(uint64_t);
+}  // namespace detail
+
+// Represents a set of values of some enum type with a small number of
+// possible enumerators. For now, it supports enums where no enumerator
+// exceeds 63 when converted to int.
+template <typename ENUM_TYPE, ENUM_TYPE MAX_ENUMERATOR>
+class SmallEnumSet {
+ private:
+  using StateT = uint64_t;
+  static constexpr int kStateBits = sizeof(StateT) * 8;
+  static constexpr int kMaxMax = kStateBits - 1;
+  static constexpr int kMaxValue = static_cast<int>(MAX_ENUMERATOR);
+  static_assert(kMaxValue >= 0);
+  static_assert(kMaxValue <= kMaxMax);
+
+ public:
+  // construct / create
+  SmallEnumSet() : state_(0) {}
+
+  template <class... TRest>
+  /*implicit*/ constexpr SmallEnumSet(const ENUM_TYPE e, TRest... rest) {
+    *this = SmallEnumSet(rest...).With(e);
+  }
+
+  // Return the set that includes all valid values, assuming the enum
+  // is "dense" (includes all values converting to 0 through kMaxValue)
+  static constexpr SmallEnumSet All() {
+    StateT tmp = StateT{1} << kMaxValue;
+    return SmallEnumSet(RawStateMarker(), tmp | (tmp - 1));
+  }
+
+  // equality
+  bool operator==(const SmallEnumSet& that) const {
+    return this->state_ == that.state_;
+  }
+  bool operator!=(const SmallEnumSet& that) const { return !(*this == that); }
+
+  // query
+
+  // Return true if the input enum is contained in the "Set".
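+  // [Illustrative example, not part of the upstream header] Membership test
+  // on a small set of CompressionType values (from compression_type.h);
+  // kZSTD (0x7) is used as MAX_ENUMERATOR to stay within the 63-value limit:
+  //
+  //   SmallEnumSet<CompressionType, kZSTD> set(kSnappyCompression, kZSTD);
+  //   assert(set.Contains(kZSTD));
+  //   assert(!set.Contains(kLZ4Compression));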
+  bool Contains(const ENUM_TYPE e) const {
+    int value = static_cast<int>(e);
+    assert(value >= 0 && value <= kMaxValue);
+    StateT tmp = 1;
+    return state_ & (tmp << value);
+  }
+
+  bool empty() const { return state_ == 0; }
+
+  // iterator
+  class const_iterator {
+   public:
+    // copy
+    const_iterator(const const_iterator& that) = default;
+    const_iterator& operator=(const const_iterator& that) = default;
+
+    // move
+    const_iterator(const_iterator&& that) noexcept = default;
+    const_iterator& operator=(const_iterator&& that) noexcept = default;
+
+    // equality
+    bool operator==(const const_iterator& that) const {
+      assert(set_ == that.set_);
+      return this->pos_ == that.pos_;
+    }
+
+    bool operator!=(const const_iterator& that) const {
+      return !(*this == that);
+    }
+
+    // ++iterator
+    const_iterator& operator++() {
+      if (pos_ < kMaxValue) {
+        pos_ = set_->SkipUnset(pos_ + 1);
+      } else {
+        pos_ = kStateBits;
+      }
+      return *this;
+    }
+
+    // iterator++
+    const_iterator operator++(int) {
+      auto old = *this;
+      ++*this;
+      return old;
+    }
+
+    ENUM_TYPE operator*() const {
+      assert(pos_ <= kMaxValue);
+      return static_cast<ENUM_TYPE>(pos_);
+    }
+
+   private:
+    friend class SmallEnumSet;
+    const_iterator(const SmallEnumSet* set, int pos) : set_(set), pos_(pos) {}
+    const SmallEnumSet* set_;
+    int pos_;
+  };
+
+  const_iterator begin() const { return const_iterator(this, SkipUnset(0)); }
+
+  const_iterator end() const { return const_iterator(this, kStateBits); }
+
+  // mutable ops
+
+  // Modifies the set (if needed) to include the given value. Returns true
+  // iff the set was modified.
+  bool Add(const ENUM_TYPE e) {
+    int value = static_cast<int>(e);
+    assert(value >= 0 && value <= kMaxValue);
+    StateT old_state = state_;
+    state_ |= (StateT{1} << value);
+    return old_state != state_;
+  }
+
+  // Modifies the set (if needed) not to include the given value. Returns true
+  // iff the set was modified.
+  bool Remove(const ENUM_TYPE e) {
+    int value = static_cast<int>(e);
+    assert(value >= 0 && value <= kMaxValue);
+    StateT old_state = state_;
+    state_ &= ~(StateT{1} << value);
+    return old_state != state_;
+  }
+
+  // applicative ops
+
+  // Return a new set based on this one with the additional value(s) inserted
+  constexpr SmallEnumSet With(const ENUM_TYPE e) const {
+    int value = static_cast<int>(e);
+    assert(value >= 0 && value <= kMaxValue);
+    return SmallEnumSet(RawStateMarker(), state_ | (StateT{1} << value));
+  }
+  template <class... TRest>
+  constexpr SmallEnumSet With(const ENUM_TYPE e1, const ENUM_TYPE e2,
+                              TRest... rest) const {
+    return With(e1).With(e2, rest...);
+  }
+
+  // Return a new set based on this one excluding the given value(s)
+  constexpr SmallEnumSet Without(const ENUM_TYPE e) const {
+    int value = static_cast<int>(e);
+    assert(value >= 0 && value <= kMaxValue);
+    return SmallEnumSet(RawStateMarker(), state_ & ~(StateT{1} << value));
+  }
+  template <class... TRest>
+  constexpr SmallEnumSet Without(const ENUM_TYPE e1, const ENUM_TYPE e2,
+                                 TRest... rest) const {
+    return Without(e1).Without(e2, rest...);
+  }
+
+ private:
+  int SkipUnset(int pos) const {
+    StateT tmp = state_ >> pos;
+    if (tmp == 0) {
+      return kStateBits;
+    } else {
+      return pos + detail::CountTrailingZeroBitsForSmallEnumSet(tmp);
+    }
+  }
+  struct RawStateMarker {};
+  explicit SmallEnumSet(RawStateMarker, StateT state) : state_(state) {}
+
+  StateT state_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db.h
new file mode 100644
index 0000000000..a42f5b8b6d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db.h
@@ -0,0 +1,1980 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/block_cache_trace_writer.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/options.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+#include "rocksdb/version.h"
+#include "rocksdb/wide_columns.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
+#elif _WIN32
+#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ColumnFamilyOptions;
+struct CompactionOptions;
+struct CompactRangeOptions;
+struct DBOptions;
+struct ExternalSstFileInfo;
+struct FlushOptions;
+struct Options;
+struct ReadOptions;
+struct TableProperties;
+struct WriteOptions;
+class Env;
+class EventListener;
+class FileSystem;
+class Replayer;
+class StatsHistoryIterator;
+class TraceReader;
+class TraceWriter;
+class WriteBatch;
+
+extern const std::string kDefaultColumnFamilyName;
+extern const std::string kPersistentStatsColumnFamilyName;
+struct ColumnFamilyDescriptor {
+  std::string name;
+  ColumnFamilyOptions options;
+  ColumnFamilyDescriptor()
+      : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
+  ColumnFamilyDescriptor(const std::string& _name,
+                         const ColumnFamilyOptions& _options)
+      : name(_name), options(_options) {}
+};
+
+class ColumnFamilyHandle {
+ public:
+  virtual ~ColumnFamilyHandle() {}
+  // Returns the name of the column family associated with the current handle.
+  virtual const std::string& GetName() const = 0;
+  // Returns the ID of the column family associated with the current handle.
+  virtual uint32_t GetID() const = 0;
+  // Fills "*desc" with the up-to-date descriptor of the column family
+  // associated with this handle. Since it fills "*desc" with the up-to-date
+  // information, this call might internally lock and release DB mutex to
+  // access the up-to-date CF options. In addition, all the pointer-typed
+  // options cannot be referenced any longer than the original options exist.
+  //
+  // Note that this function is not supported in RocksDBLite.
+  virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) = 0;
+  // Returns the comparator of the column family associated with the
+  // current handle.
+  virtual const Comparator* GetComparator() const = 0;
+};
+
+static const int kMajorVersion = __ROCKSDB_MAJOR__;
+static const int kMinorVersion = __ROCKSDB_MINOR__;
+
+// A range of keys
+struct Range {
+  Slice start;
+  Slice limit;
+
+  Range() {}
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
+};
+
+struct RangePtr {
+  const Slice* start;
+  const Slice* limit;
+
+  RangePtr() : start(nullptr), limit(nullptr) {}
+  RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {}
+};
+
+// It is valid that files_checksums and files_checksum_func_names are both
+// empty (no checksum information is provided for ingestion). Otherwise,
+// their sizes should be the same as external_files. The file order should
+// be the same in three vectors and guaranteed by the caller.
+// Note that, we assume the temperatures of this batch of files to be
+// ingested are the same.
+struct IngestExternalFileArg {
+  ColumnFamilyHandle* column_family = nullptr;
+  std::vector<std::string> external_files;
+  IngestExternalFileOptions options;
+  std::vector<std::string> files_checksums;
+  std::vector<std::string> files_checksum_func_names;
+  Temperature file_temperature = Temperature::kUnknown;
+};
+
+struct GetMergeOperandsOptions {
+  int expected_max_number_of_operands = 0;
+};
+
+// A collection of table properties objects, where
+//  key: is the table's file name.
+//  value: the table properties object of the given table.
+using TablePropertiesCollection =
+    std::unordered_map<std::string, std::shared_ptr<const TableProperties>>;
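+
+// [Editor's note] Illustrative sketch, an editorial addition rather than
+// part of the upstream header. One plausible way to fill in
+// IngestExternalFileArg for a batch of SST files; "cf_handle" and the file
+// paths are invented, and move_files is assumed to be a field of
+// IngestExternalFileOptions as in upstream RocksDB:
+//
+//   IngestExternalFileArg arg;
+//   arg.column_family = cf_handle;
+//   arg.external_files = {"/tmp/batch/000.sst", "/tmp/batch/001.sst"};
+//   arg.options.move_files = true;  // move instead of copying the files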
+
+// A DB is a persistent, versioned ordered map from keys to values.
+// A DB is safe for concurrent access from multiple threads without
+// any external synchronization.
+// DB is an abstract base class with one primary implementation (DBImpl)
+// and a number of wrapper implementations.
+class DB {
+ public:
+  // Open the database with the specified "name" for reads and writes.
+  // Stores a pointer to a heap-allocated database in *dbptr and returns
+  // OK on success.
+  // Stores nullptr in *dbptr and returns a non-OK status on error, including
+  // if the DB is already open (read-write) by another DB object. (This
+  // guarantee depends on options.env->LockFile(), which might not provide
+  // this guarantee in a custom Env implementation.)
+  //
+  // Caller must delete *dbptr when it is no longer needed.
+  static Status Open(const Options& options, const std::string& name,
+                     DB** dbptr);
+
+  // Open DB with column families.
+  // db_options specify database specific options
+  // column_families is the vector of all column families in the database,
+  // containing column family name and options. You need to open ALL column
+  // families in the database. To get the list of column families, you can use
+  // ListColumnFamilies().
+  //
+  // The default column family name is 'default' and it's stored
+  // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName.
+  // If everything is OK, handles will on return be the same size
+  // as column_families --- handles[i] will be a handle that you
+  // will use to operate on column family column_family[i].
+  // Before deleting the DB, you have to close all column families by calling
+  // DestroyColumnFamilyHandle() with all the handles.
+  static Status Open(const DBOptions& db_options, const std::string& name,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
+
+  // OpenForReadOnly() creates a Read-only instance that supports reads alone.
+  //
+  // All DB interfaces that modify data, like put/delete, will return error.
+  // Automatic Flush and Compactions are disabled and any manual calls
+  // to Flush/Compaction will return error.
+  //
+  // While a given DB can be simultaneously opened via OpenForReadOnly
+  // by any number of readers, if a DB is simultaneously opened by Open
+  // and OpenForReadOnly, the read-only instance has undefined behavior
+  // (though can often succeed if quickly closed) and the read-write
+  // instance is unaffected. See also OpenAsSecondary.
+
+  // Open the database for read only.
+  //
+  static Status OpenForReadOnly(const Options& options, const std::string& name,
+                                DB** dbptr,
+                                bool error_if_wal_file_exists = false);
+
+  // Open the database for read only with column families.
+  //
+  // When opening DB with read only, you can specify only a subset of column
+  // families in the database that should be opened. However, you always need
+  // to specify default column family. The default column family name is
+  // 'default' and it's stored in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName
+  //
+  static Status OpenForReadOnly(
+      const DBOptions& db_options, const std::string& name,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+      bool error_if_wal_file_exists = false);
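+
+  // [Editor's note] Illustrative sketch, an editorial addition rather than
+  // part of the upstream header. One plausible open/close sequence for a DB
+  // with one extra column family; "kDBPath" and "new_cf" are invented:
+  //
+  //   std::vector<ColumnFamilyDescriptor> cfs = {
+  //       {kDefaultColumnFamilyName, ColumnFamilyOptions()},
+  //       {"new_cf", ColumnFamilyOptions()}};
+  //   std::vector<ColumnFamilyHandle*> handles;
+  //   DB* db = nullptr;
+  //   Status s = DB::Open(DBOptions(), kDBPath, cfs, &handles, &db);
+  //   // ... use db ...
+  //   for (ColumnFamilyHandle* h : handles) db->DestroyColumnFamilyHandle(h);
+  //   delete db;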
+
+  // OpenAsSecondary() creates a secondary instance that supports read-only
+  // operations and supports dynamic catch up with the primary (through a
+  // call to TryCatchUpWithPrimary()).
+  //
+  // All DB interfaces that modify data, like put/delete, will return error.
+  // Automatic Flush and Compactions are disabled and any manual calls
+  // to Flush/Compaction will return error.
+  //
+  // Multiple secondary instances can co-exist at the same time.
+  //
+
+  // Open DB as secondary instance
+  //
+  // The options argument specifies the options to open the secondary instance.
+  // Options.max_open_files should be set to -1.
+  // The name argument specifies the name of the primary db that you have used
+  // to open the primary instance.
+  // The secondary_path argument points to a directory where the secondary
+  // instance stores its info log.
+  // The dbptr is an out-arg corresponding to the opened secondary instance.
+  // The pointer points to a heap-allocated database, and the caller should
+  // delete it after use.
+  //
+  // Return OK on success, non-OK on failures.
+  static Status OpenAsSecondary(const Options& options, const std::string& name,
+                                const std::string& secondary_path, DB** dbptr);
+
+  // Open DB as secondary instance with specified column families
+  //
+  // When opening DB in secondary mode, you can specify only a subset of column
+  // families in the database that should be opened. However, you always need
+  // to specify default column family. The default column family name is
+  // 'default' and it's stored in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName
+  //
+  // Column families created by the primary after the secondary instance starts
+  // are currently ignored by the secondary instance. Column families opened
+  // by secondary and dropped by the primary will be dropped by secondary as
+  // well (on next invocation of TryCatchUpWithPrimary()). However the user
+  // of the secondary instance can still access the data of such dropped column
+  // family as long as they do not destroy the corresponding column family
+  // handle.
+  //
+  // The options argument specifies the options to open the secondary instance.
+  // Options.max_open_files should be set to -1.
+  // The name argument specifies the name of the primary db that you have used
+  // to open the primary instance.
+  // The secondary_path argument points to a directory where the secondary
+  // instance stores its info log.
+  // The column_families argument specifies a list of column families to open.
+  // If default column family is not specified or if any specified column
+  // families does not exist, the function returns non-OK status.
+  // The handles is an out-arg corresponding to the opened database column
+  // family handles.
+  // The dbptr is an out-arg corresponding to the opened secondary instance.
+  // The pointer points to a heap-allocated database, and the caller should
+  // delete it after use. Before deleting the dbptr, the user should also
+  // delete the pointers stored in handles vector.
+  //
+  // Return OK on success, non-OK on failures.
+  static Status OpenAsSecondary(
+      const DBOptions& db_options, const std::string& name,
+      const std::string& secondary_path,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
+
+  // Open DB and run the compaction.
+  // It's a read-only operation, the result won't be installed to the DB, it
+  // will be output to the `output_directory`. The API should only be used with
+  // `options.CompactionService` to run compaction triggered by
+  // `CompactionService`.
+  static Status OpenAndCompact(
+      const std::string& name, const std::string& output_directory,
+      const std::string& input, std::string* output,
+      const CompactionServiceOptionsOverride& override_options);
+
+  static Status OpenAndCompact(
+      const OpenAndCompactOptions& options, const std::string& name,
+      const std::string& output_directory, const std::string& input,
+      std::string* output,
+      const CompactionServiceOptionsOverride& override_options);
+
+  // Experimental and subject to change
+  // Open DB and trim data newer than specified timestamp.
+  // The trim_ts specifies the user-defined timestamp trim bound.
+  // This API should only be used at timestamp enabled column families recovery.
+  // If some input column families do not support timestamp, nothing will
+  // happen to them. The data with timestamp > trim_ts
+  // will be removed after this API returns successfully.
+  static Status OpenAndTrimHistory(
+      const DBOptions& db_options, const std::string& dbname,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+      std::string trim_ts);
+
+  // Manually, synchronously attempt to resume DB writes after a write failure
+  // to the underlying filesystem. See
+  // https://github.com/facebook/rocksdb/wiki/Background-Error-Handling
+  //
+  // Returns OK if writes are successfully resumed, or there was no
+  // outstanding error to recover from. Returns underlying write error if
+  // it is not recoverable.
+  //
+  // WART: Does not mix well with auto-resume. Will return Busy if an
+  // auto-resume is in progress, without waiting for it to complete.
+  // See DBOptions::max_bgerror_resume_count and
+  // EventListener::OnErrorRecoveryBegin
+  virtual Status Resume() { return Status::NotSupported(); }
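+
+  // [Editor's note] Illustrative sketch, an editorial addition rather than
+  // part of the upstream header. A plausible secondary-instance flow;
+  // "kPrimaryPath" and "kSecondaryPath" are invented, and
+  // TryCatchUpWithPrimary() is declared elsewhere in this class:
+  //
+  //   Options options;
+  //   options.max_open_files = -1;  // required for secondary instances
+  //   DB* secondary = nullptr;
+  //   Status s = DB::OpenAsSecondary(options, kPrimaryPath, kSecondaryPath,
+  //                                  &secondary);
+  //   if (s.ok()) s = secondary->TryCatchUpWithPrimary();  // pull new state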
+
+  // Close the DB by releasing resources, closing files etc. This should be
+  // called before calling the destructor so that the caller can get back a
+  // status in case there are any errors. This will not fsync the WAL files.
+  // If syncing is required, the caller must first call SyncWAL(), or Write()
+  // using an empty write batch with WriteOptions.sync=true.
+  // Regardless of the return status, the DB must be freed.
+  // If the return status is Aborted(), closing fails because there is
+  // unreleased snapshot in the system. In this case, users can release
+  // the unreleased snapshots and try again and expect it to succeed. For
+  // other status, re-calling Close() will be no-op and return the original
+  // close status. If the return status is NotSupported(), then the DB
+  // implementation does cleanup in the destructor
+  virtual Status Close() { return Status::NotSupported(); }
+
+  // ListColumnFamilies will open the DB specified by argument name
+  // and return the list of all column families in that DB
+  // through column_families argument. The ordering of
+  // column families in column_families is unspecified.
+  static Status ListColumnFamilies(const DBOptions& db_options,
+                                   const std::string& name,
+                                   std::vector<std::string>* column_families);
+
+  // Abstract class ctor
+  DB() {}
+  // No copying allowed
+  DB(const DB&) = delete;
+  void operator=(const DB&) = delete;
+
+  virtual ~DB();
+
+  // Create a column_family and return the handle of column family
+  // through the argument handle.
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle);
+
+  // Bulk create column families with the same column family options.
+  // Return the handles of the column families through the argument handles.
+  // In case of error, the request may succeed partially, and handles will
+  // contain column family handles that it managed to create, and have size
+  // equal to the number of created column families.
+  virtual Status CreateColumnFamilies(
+      const ColumnFamilyOptions& options,
+      const std::vector<std::string>& column_family_names,
+      std::vector<ColumnFamilyHandle*>* handles);
+
+  // Bulk create column families.
+  // Return the handles of the column families through the argument handles.
+  // In case of error, the request may succeed partially, and handles will
+  // contain column family handles that it managed to create, and have size
+  // equal to the number of created column families.
+  virtual Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles);
+
+  // Drop a column family specified by column_family handle. This call
+  // only records a drop record in the manifest and prevents the column
+  // family from flushing and compacting.
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
+
+  // Bulk drop column families. This call only records drop records in the
+  // manifest and prevents the column families from flushing and compacting.
+  // In case of error, the request may succeed partially. User may call
+  // ListColumnFamilies to check the result.
+  virtual Status DropColumnFamilies(
+      const std::vector<ColumnFamilyHandle*>& column_families);
+
+  // Release and deallocate a column family handle. A column family is only
+  // removed once it is dropped (DropColumnFamily) and all handles have been
+  // destroyed (DestroyColumnFamilyHandle). Use this method to destroy
+  // column family handles (except for DefaultColumnFamily()!) before closing
+  // a DB.
+  virtual Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family);
+
+  // Set the database entry for "key" to "value".
+  // If "key" already exists, it will be overwritten.
+  // Returns OK on success, and a non-OK status on error.
+  // Note: consider setting options.sync = true.
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) = 0;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& ts, const Slice& value) = 0;
+  virtual Status Put(const WriteOptions& options, const Slice& key,
+                     const Slice& value) {
+    return Put(options, DefaultColumnFamily(), key, value);
+  }
+  virtual Status Put(const WriteOptions& options, const Slice& key,
+                     const Slice& ts, const Slice& value) {
+    return Put(options, DefaultColumnFamily(), key, ts, value);
+  }
+
+  // Set the database entry for "key" in the column family specified by
+  // "column_family" to the wide-column entity defined by "columns". If the key
+  // already exists in the column family, it will be overwritten.
+  //
+  // Returns OK on success, and a non-OK status on error.
+  virtual Status PutEntity(const WriteOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           const WideColumns& columns);
+
+  // Remove the database entry (if any) for "key". Returns OK on
+  // success, and a non-OK status on error. It is not an error if "key"
+  // did not exist in the database.
+  // Note: consider setting options.sync = true.
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) = 0;
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family, const Slice& key,
+                        const Slice& ts) = 0;
+  virtual Status Delete(const WriteOptions& options, const Slice& key) {
+    return Delete(options, DefaultColumnFamily(), key);
+  }
+  virtual Status Delete(const WriteOptions& options, const Slice& key,
+                        const Slice& ts) {
+    return Delete(options, DefaultColumnFamily(), key, ts);
+  }
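+
+  // [Editor's note] Illustrative sketch, an editorial addition rather than
+  // part of the upstream header. Basic single-key writes against the
+  // default column family, assuming an opened "db":
+  //
+  //   WriteOptions wopts;
+  //   wopts.sync = true;  // as the comments above suggest considering
+  //   Status s = db->Put(wopts, "key1", "value1");
+  //   if (s.ok()) s = db->Delete(wopts, "key1");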
+
+  // Remove the database entry for "key". Requires that the key exists
+  // and was not overwritten. Returns OK on success, and a non-OK status
+  // on error. It is not an error if "key" did not exist in the database.
+  //
+  // If a key is overwritten (by calling Put() multiple times), then the result
+  // of calling SingleDelete() on this key is undefined. SingleDelete() only
+  // behaves correctly if there has been only one Put() for this key since the
+  // previous call to SingleDelete() for this key.
+  //
+  // This feature is currently an experimental performance optimization
+  // for a very specific workload. It is up to the caller to ensure that
+  // SingleDelete is only used for a key that is not deleted using Delete() or
+  // written using Merge(). Mixing SingleDelete operations with Deletes and
+  // Merges can result in undefined behavior.
+  //
+  // Note: consider setting options.sync = true.
+  virtual Status SingleDelete(const WriteOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key) = 0;
+  virtual Status SingleDelete(const WriteOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key, const Slice& ts) = 0;
+  virtual Status SingleDelete(const WriteOptions& options, const Slice& key) {
+    return SingleDelete(options, DefaultColumnFamily(), key);
+  }
+  virtual Status SingleDelete(const WriteOptions& options, const Slice& key,
+                              const Slice& ts) {
+    return SingleDelete(options, DefaultColumnFamily(), key, ts);
+  }
+
+  // Removes the database entries in the range ["begin_key", "end_key"), i.e.,
+  // including "begin_key" and excluding "end_key". Returns OK on success, and
+  // a non-OK status on error. It is not an error if the database does not
+  // contain any existing data in the range ["begin_key", "end_key").
+  //
+  // If "end_key" comes before "start_key" according to the user's comparator,
+  // a `Status::InvalidArgument` is returned.
+  //
+  // This feature is now usable in production, with the following caveats:
+  // 1) Accumulating too many range tombstones in the memtable will degrade read
+  // performance; this can be avoided by manually flushing occasionally.
+  // 2) Limiting the maximum number of open files in the presence of range
+  // tombstones can degrade read performance. To avoid this problem, set
+  // max_open_files to -1 whenever possible.
+  virtual Status DeleteRange(const WriteOptions& options,
+                             ColumnFamilyHandle* column_family,
+                             const Slice& begin_key, const Slice& end_key);
+  virtual Status DeleteRange(const WriteOptions& options,
+                             ColumnFamilyHandle* column_family,
+                             const Slice& begin_key, const Slice& end_key,
+                             const Slice& ts);
+
+  // Merge the database entry for "key" with "value". Returns OK on success,
+  // and a non-OK status on error. The semantics of this operation is
+  // determined by the user provided merge_operator when opening DB.
+  // Note: consider setting options.sync = true.
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) = 0;
+  virtual Status Merge(const WriteOptions& options, const Slice& key,
+                       const Slice& value) {
+    return Merge(options, DefaultColumnFamily(), key, value);
+  }
+  virtual Status Merge(const WriteOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/,
+                       const Slice& /*key*/, const Slice& /*ts*/,
+                       const Slice& /*value*/);
+
+  // Apply the specified updates to the database.
+  // If `updates` contains no update, WAL will still be synced if
+  // options.sync=true.
+  // Returns OK on success, non-OK on failure.
+  // Note: consider setting options.sync = true.
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
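+
+  // [Editor's note] Illustrative sketch, an editorial addition rather than
+  // part of the upstream header. Applying several updates atomically
+  // through a WriteBatch, assuming an opened "db" and a handle "cf":
+  //
+  //   WriteBatch batch;
+  //   batch.Put(cf, "key1", "value1");
+  //   batch.Delete(cf, "obsolete_key");
+  //   Status s = db->Write(WriteOptions(), &batch);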
+
+  // If the column family specified by "column_family" contains an entry for
+  // "key", return the corresponding value in "*value". If the entry is a plain
+  // key-value, return the value as-is; if it is a wide-column entity, return
+  // the value of its default anonymous column (see kDefaultWideColumnName) if
+  // any, or an empty value otherwise.
+  //
+  // If timestamp is enabled and a non-null timestamp pointer is passed in,
+  // timestamp is returned.
+  //
+  // Returns OK on success. Returns NotFound and an empty value in "*value" if
+  // there is no entry for "key". Returns some other non-OK status on error.
+  virtual inline Status Get(const ReadOptions& options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            std::string* value) {
+    assert(value != nullptr);
+    PinnableSlice pinnable_val(value);
+    assert(!pinnable_val.IsPinned());
+    auto s = Get(options, column_family, key, &pinnable_val);
+    if (s.ok() && pinnable_val.IsPinned()) {
+      value->assign(pinnable_val.data(), pinnable_val.size());
+    }  // else value is already assigned
+    return s;
+  }
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) = 0;
+  virtual Status Get(const ReadOptions& options, const Slice& key,
+                     std::string* value) {
+    return Get(options, DefaultColumnFamily(), key, value);
+  }
+
+  // Get() methods that return timestamp. Derived DB classes don't need to worry
+  // about this group of methods if they don't care about timestamp feature.
+  virtual inline Status Get(const ReadOptions& options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            std::string* value, std::string* timestamp) {
+    assert(value != nullptr);
+    PinnableSlice pinnable_val(value);
+    assert(!pinnable_val.IsPinned());
+    auto s = Get(options, column_family, key, &pinnable_val, timestamp);
+    if (s.ok() && pinnable_val.IsPinned()) {
+      value->assign(pinnable_val.data(), pinnable_val.size());
+    }  // else value is already assigned
+    return s;
+  }
+  virtual Status Get(const ReadOptions& /*options*/,
+                     ColumnFamilyHandle* /*column_family*/,
+                     const Slice& /*key*/, PinnableSlice* /*value*/,
+                     std::string* /*timestamp*/) {
+    return Status::NotSupported(
+        "Get() that returns timestamp is not implemented.");
+  }
+  virtual Status Get(const ReadOptions& options, const Slice& key,
+                     std::string* value, std::string* timestamp) {
+    return Get(options, DefaultColumnFamily(), key, value, timestamp);
+  }
+
+  // If the column family specified by "column_family" contains an entry for
+  // "key", return it as a wide-column entity in "*columns". If the entry is a
+  // wide-column entity, return it as-is; if it is a plain key-value, return it
+  // as an entity with a single anonymous column (see kDefaultWideColumnName)
+  // which contains the value.
+  //
+  // Returns OK on success. Returns NotFound and an empty wide-column entity in
+  // "*columns" if there is no entry for "key". Returns some other non-OK status
+  // on error.
+  virtual Status GetEntity(const ReadOptions& /* options */,
+                           ColumnFamilyHandle* /* column_family */,
+                           const Slice& /* key */,
+                           PinnableWideColumns* /* columns */) {
+    return Status::NotSupported("GetEntity not supported");
+  }
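+
+  // [Editor's note] Illustrative sketch, an editorial addition rather than
+  // part of the upstream header. Reading a key through the PinnableSlice
+  // overload, which can avoid a copy when the value is pinned; assumes an
+  // opened "db":
+  //
+  //   PinnableSlice pinned;
+  //   Status s = db->Get(ReadOptions(), db->DefaultColumnFamily(), "key1",
+  //                      &pinned);
+  //   if (s.ok()) { /* use pinned.data(), pinned.size() */ }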
+
+  // Populates the `merge_operands` array with all the merge operands in the DB
+  // for `key`. The `merge_operands` array will be populated in the order of
+  // insertion. The number of entries populated in `merge_operands` will be
+  // assigned to `*number_of_operands`.
+  //
+  // If the number of merge operands in DB for `key` is greater than
+  // `merge_operands_options.expected_max_number_of_operands`,
+  // `merge_operands` is not populated and the return value is
+  // `Status::Incomplete`. In that case, `*number_of_operands` will be assigned
+  // the number of merge operands found in the DB for `key`.
+  //
+  // `merge_operands`- Points to an array of at-least
+  //             merge_operands_options.expected_max_number_of_operands and the
+  //             caller is responsible for allocating it.
+  //
+  // The caller should delete or `Reset()` the `merge_operands` entries when
+  // they are no longer needed. All `merge_operands` entries must be destroyed
+  // or `Reset()` before this DB is closed or destroyed.
+  virtual Status GetMergeOperands(
+      const ReadOptions& options, ColumnFamilyHandle* column_family,
+      const Slice& key, PinnableSlice* merge_operands,
+      GetMergeOperandsOptions* get_merge_operands_options,
+      int* number_of_operands) = 0;
+
+  // Consistent Get of many keys across column families without the need
+  // for an explicit snapshot. NOTE: the implementation of this MultiGet API
+  // does not have the performance benefits of the void-returning MultiGet
+  // functions.
+  //
+  // If keys[i] does not exist in the database, then the i'th returned
+  // status will be one for which Status::IsNotFound() is true, and
+  // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
+  // the i'th returned status will have Status::ok() true, and (*values)[i]
+  // will store the value associated with keys[i].
+  //
+  // (*values) will always be resized to be the same size as (keys).
+  // Similarly, the number of returned statuses will be the number of keys.
+  // Note: keys will not be "de-duplicated". Duplicate keys will return
+  // duplicate values in order.
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values) {
+    return MultiGet(
+        options,
+        std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()),
+        keys, values);
+  }
+
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& /*options*/,
+      const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+      const std::vector<Slice>& keys, std::vector<std::string>* /*values*/,
+      std::vector<std::string>* /*timestamps*/) {
+    return std::vector<Status>(
+        keys.size(), Status::NotSupported(
+                         "MultiGet() returning timestamps not implemented."));
+  }
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values,
+                                       std::vector<std::string>* timestamps) {
+    return MultiGet(
+        options,
+        std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()),
+        keys, values, timestamps);
+  }
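+
+  // [Editor's note] Illustrative sketch, an editorial addition rather than
+  // part of the upstream header. The vector-based MultiGet against the
+  // default column family, assuming an opened "db":
+  //
+  //   std::vector<Slice> keys = {"k1", "k2"};
+  //   std::vector<std::string> values;
+  //   std::vector<Status> statuses =
+  //       db->MultiGet(ReadOptions(), keys, &values);
+  //   // statuses[i].IsNotFound() marks missing keys; values[i] holds hits.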
+
+  // Overloaded MultiGet API that improves performance by batching operations
+  // in the read path for greater efficiency. Currently, only the block based
+  // table format with full filters are supported. Other table formats such
+  // as plain table, block based table with block based filters and
+  // partitioned indexes will still work, but will not get any performance
+  // benefits.
+  // Parameters -
+  // options - ReadOptions
+  // column_family - ColumnFamilyHandle* that the keys belong to. All the keys
+  //                 passed to the API are restricted to a single column family
+  // num_keys - Number of keys to lookup
+  // keys - Pointer to C style array of key Slices with num_keys elements
+  // values - Pointer to C style array of PinnableSlices with num_keys elements
+  // statuses - Pointer to C style array of Status with num_keys elements
+  // sorted_input - If true, it means the input keys are already sorted by key
+  //                order, so the MultiGet() API doesn't have to sort them
+  //                again. If false, the keys will be copied and sorted
+  //                internally by the API - the input array will not be
+  //                modified
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool /*sorted_input*/ = false) {
+    std::vector<ColumnFamilyHandle*> cf;
+    std::vector<Slice> user_keys;
+    std::vector<Status> status;
+    std::vector<std::string> vals;
+
+    for (size_t i = 0; i < num_keys; ++i) {
+      cf.emplace_back(column_family);
+      user_keys.emplace_back(keys[i]);
+    }
+    status = MultiGet(options, cf, user_keys, &vals);
+    std::copy(status.begin(), status.end(), statuses);
+    for (auto& value : vals) {
+      values->PinSelf(value);
+      values++;
+    }
+  }
+
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, std::string* timestamps,
+                        Status* statuses, const bool /*sorted_input*/ = false) {
+    std::vector<ColumnFamilyHandle*> cf;
+    std::vector<Slice> user_keys;
+    std::vector<Status> status;
+    std::vector<std::string> vals;
+    std::vector<std::string> tss;
+
+    for (size_t i = 0; i < num_keys; ++i) {
+      cf.emplace_back(column_family);
+      user_keys.emplace_back(keys[i]);
+    }
+    status = MultiGet(options, cf, user_keys, &vals, &tss);
+    std::copy(status.begin(), status.end(), statuses);
+    std::copy(tss.begin(), tss.end(), timestamps);
+    for (auto& value : vals) {
+      values->PinSelf(value);
+      values++;
+    }
+  }
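+
+  // [Editor's note] Illustrative sketch, an editorial addition rather than
+  // part of the upstream header. The batched, C-style-array MultiGet for a
+  // single column family, assuming an opened "db" and a handle "cf":
+  //
+  //   constexpr size_t kNum = 2;
+  //   Slice keys[kNum] = {"k1", "k2"};
+  //   PinnableSlice values[kNum];
+  //   Status statuses[kNum];
+  //   db->MultiGet(ReadOptions(), cf, kNum, keys, values, statuses);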
+
+  // Overloaded MultiGet API that improves performance by batching operations
+  // in the read path for greater efficiency. Currently, only the block based
+  // table format with full filters are supported. Other table formats such
+  // as plain table, block based table with block based filters and
+  // partitioned indexes will still work, but will not get any performance
+  // benefits.
+  // Parameters -
+  // options - ReadOptions
+  // column_family - ColumnFamilyHandle* that the keys belong to. All the keys
+  //                 passed to the API are restricted to a single column family
+  // num_keys - Number of keys to lookup
+  // keys - Pointer to C style array of key Slices with num_keys elements
+  // values - Pointer to C style array of PinnableSlices with num_keys elements
+  // statuses - Pointer to C style array of Status with num_keys elements
+  // sorted_input - If true, it means the input keys are already sorted by key
+  //                order, so the MultiGet() API doesn't have to sort them
+  //                again. If false, the keys will be copied and sorted
+  //                internally by the API - the input array will not be
+  //                modified
+  virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+                        ColumnFamilyHandle** column_families, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool /*sorted_input*/ = false) {
+    std::vector<ColumnFamilyHandle*> cf;
+    std::vector<Slice> user_keys;
+    std::vector<Status> status;
+    std::vector<std::string> vals;
+
+    for (size_t i = 0; i < num_keys; ++i) {
+      cf.emplace_back(column_families[i]);
+      user_keys.emplace_back(keys[i]);
+    }
+    status = MultiGet(options, cf, user_keys, &vals);
+    std::copy(status.begin(), status.end(), statuses);
+    for (auto& value : vals) {
+      values->PinSelf(value);
+      values++;
+    }
+  }
+  virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+                        ColumnFamilyHandle** column_families, const Slice* keys,
+                        PinnableSlice* values, std::string* timestamps,
+                        Status* statuses, const bool /*sorted_input*/ = false) {
+    std::vector<ColumnFamilyHandle*> cf;
+    std::vector<Slice> user_keys;
+    std::vector<Status> status;
+    std::vector<std::string> vals;
+    std::vector<std::string> tss;
+
+    for (size_t i = 0; i < num_keys; ++i) {
+      cf.emplace_back(column_families[i]);
+      user_keys.emplace_back(keys[i]);
+    }
+    status = MultiGet(options, cf, user_keys, &vals, &tss);
+    std::copy(status.begin(), status.end(), statuses);
+    std::copy(tss.begin(), tss.end(), timestamps);
+    for (auto& value : vals) {
+      values->PinSelf(value);
+      values++;
+    }
+  }
+
+  // Batched MultiGet-like API that returns wide-column entities from a single
+  // column family. For any given "key[i]" in "keys" (where 0 <= "i" <
+  // "num_keys"), if the column family specified by "column_family" contains an
+  // entry, it is returned as a wide-column entity in "results[i]". If the
+  // entry is a wide-column entity, it is returned as-is; if it is a plain
+  // key-value, it is returned as an entity with a single anonymous column (see
+  // kDefaultWideColumnName) which contains the value.
+  //
+  // "statuses[i]" is set to OK if "keys[i]" is successfully retrieved. It is
+  // set to NotFound and an empty wide-column entity is returned in "results[i]"
+  // if there is no entry for "keys[i]". Finally, "statuses[i]" is set to some
+  // other non-OK status on error.
+  //
+  // If "keys" are sorted according to the column family's comparator, the
+  // "sorted_input" flag can be set for a small performance improvement.
+  //
+  // Note that it is the caller's responsibility to ensure that "keys",
+  // "results", and "statuses" point to "num_keys" number of contiguous objects
+  // (Slices, PinnableWideColumns, and Statuses respectively).
+  virtual void MultiGetEntity(const ReadOptions& /* options */,
+                              ColumnFamilyHandle* /* column_family */,
+                              size_t num_keys, const Slice* /* keys */,
+                              PinnableWideColumns* /* results */,
+                              Status* statuses,
+                              bool /* sorted_input */ = false) {
+    for (size_t i = 0; i < num_keys; ++i) {
+      statuses[i] = Status::NotSupported("MultiGetEntity not supported");
+    }
+  }
+ // + // "statuses[i]" is set to OK if "keys[i]" is successfully retrieved. It is + // set to NotFound and an empty wide-column entity is returned in "results[i]" + // if there is no entry for "keys[i]". Finally, "statuses[i]" is set to some + // other non-OK status on error. + // + // If "keys" are sorted by column family id and within each column family, + // according to the column family's comparator, the "sorted_input" flag can be + // set for a small performance improvement. + // + // Note that it is the caller's responsibility to ensure that + // "column_families", "keys", "results", and "statuses" point to "num_keys" + // number of contiguous objects (ColumnFamilyHandle pointers, Slices, + // PinnableWideColumns, and Statuses respectively). + virtual void MultiGetEntity(const ReadOptions& /* options */, size_t num_keys, + ColumnFamilyHandle** /* column_families */, + const Slice* /* keys */, + PinnableWideColumns* /* results */, + Status* statuses, + bool /* sorted_input */ = false) { + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = Status::NotSupported("MultiGetEntity not supported"); + } + } + + // If the key definitely does not exist in the database, then this method + // returns false, else true. If the caller wants to obtain value when the key + // is found in memory, a bool for 'value_found' must be passed. 'value_found' + // will be true on return if value has been set properly. + // This check is potentially lighter-weight than invoking DB::Get(). One way + // to make this lighter weight is to avoid doing any IOs. + // Default implementation here returns true and sets 'value_found' to false + virtual bool KeyMayExist(const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, std::string* /*value*/, + std::string* /*timestamp*/, + bool* value_found = nullptr) { + if (value_found != nullptr) { + *value_found = false; + } + return true; + } + + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found = nullptr) { + return KeyMayExist(options, column_family, key, value, + /*timestamp=*/nullptr, value_found); + } + + virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, + std::string* value, bool* value_found = nullptr) { + return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found); + } + + virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, + std::string* value, std::string* timestamp, + bool* value_found = nullptr) { + return KeyMayExist(options, DefaultColumnFamily(), key, value, timestamp, + value_found); + } + + // Return a heap-allocated iterator over the contents of the database. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // + // Caller should delete the iterator when it is no longer needed. + // The returned iterator should be deleted before this db is deleted. + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) = 0; + virtual Iterator* NewIterator(const ReadOptions& options) { + return NewIterator(options, DefaultColumnFamily()); + } + // Returns iterators from a consistent database state across multiple + // column families. 
+  // Returns iterators from a consistent database state across multiple
+  // column families. Iterators are heap allocated and need to be deleted
+  // before the db is deleted
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) = 0;
+
+  // Return a handle to the current DB state. Iterators created with
+  // this handle will all observe a stable snapshot of the current DB
+  // state. The caller must call ReleaseSnapshot(result) when the
+  // snapshot is no longer needed.
+  //
+  // nullptr will be returned if the DB fails to take a snapshot or does
+  // not support snapshot (eg: inplace_update_support enabled).
+  virtual const Snapshot* GetSnapshot() = 0;
+
+  // Release a previously acquired snapshot. The caller must not
+  // use "snapshot" after this call.
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
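+
+  // [Editor's note] Illustrative sketch, an editorial addition rather than
+  // part of the upstream header. Reading at a fixed snapshot, assuming an
+  // opened "db":
+  //
+  //   const Snapshot* snap = db->GetSnapshot();
+  //   ReadOptions ropts;
+  //   ropts.snapshot = snap;
+  //   std::string value;
+  //   Status s = db->Get(ropts, "key1", &value);  // sees the state at snap
+  //   db->ReleaseSnapshot(snap);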
+
+  // Contains all valid property arguments for GetProperty() or
+  // GetMapProperty(). Each is a "string" property for retrieval with
+  // GetProperty() unless noted as a "map" property, for GetMapProperty().
+  //
+  // NOTE: Property names cannot end in numbers since those are interpreted as
+  // arguments, e.g., see kNumFilesAtLevelPrefix.
+  struct Properties {
+    //  "rocksdb.num-files-at-level<N>" - returns string containing the number
+    //      of files at level <N>, where <N> is an ASCII representation of a
+    //      level number (e.g., "0").
+    static const std::string kNumFilesAtLevelPrefix;
+
+    //  "rocksdb.compression-ratio-at-level<N>" - returns string containing the
+    //      compression ratio of data at level <N>, where <N> is an ASCII
+    //      representation of a level number (e.g., "0"). Here, compression
+    //      ratio is defined as uncompressed data size / compressed file size.
+    //      Returns "-1.0" if no open files at level <N>.
+    static const std::string kCompressionRatioAtLevelPrefix;
+
+    //  "rocksdb.stats" - returns a multi-line string containing the data
+    //      described by kCFStats followed by the data described by kDBStats.
+    static const std::string kStats;
+
+    //  "rocksdb.sstables" - returns a multi-line string summarizing current
+    //      SST files.
+    static const std::string kSSTables;
+
+    //  "rocksdb.cfstats" - Raw data from "rocksdb.cfstats-no-file-histogram"
+    //      and "rocksdb.cf-file-histogram" as a "map" property.
+    static const std::string kCFStats;
+
+    //  "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with
+    //      general column family stats per-level over db's lifetime ("L<n>"),
+    //      aggregated over db's lifetime ("Sum"), and aggregated over the
+    //      interval since the last retrieval ("Int").
+    static const std::string kCFStatsNoFileHistogram;
+
+    //  "rocksdb.cf-file-histogram" - print out how many file reads to every
+    //      level, as well as the histogram of latency of single requests.
+    static const std::string kCFFileHistogram;
+
+    //  "rocksdb.cf-write-stall-stats" - returns a multi-line string or
+    //      map with statistics on CF-scope write stalls for a given CF
+    //      See `WriteStallStatsMapKeys` for structured representation of keys
+    //      available in the map form.
+    static const std::string kCFWriteStallStats;
+
+    //  "rocksdb.db-write-stall-stats" - returns a multi-line string or
+    //      map with statistics on DB-scope write stalls
+    //      See `WriteStallStatsMapKeys` for structured representation of keys
+    //      available in the map form.
+    static const std::string kDBWriteStallStats;
+
+    //  "rocksdb.dbstats" - As a string property, returns a multi-line string
+    //      with general database stats, both cumulative (over the db's
+    //      lifetime) and interval (since the last retrieval of kDBStats).
+    //      As a map property, returns cumulative stats only and does not
+    //      update the baseline for the interval stats.
+    static const std::string kDBStats;
+
+    //  "rocksdb.levelstats" - returns multi-line string containing the number
+    //      of files per level and total size of each level (MB).
+    static const std::string kLevelStats;
+
+    //  "rocksdb.block-cache-entry-stats" - returns a multi-line string or
+    //      map with statistics on block cache usage. See
+    //      `BlockCacheEntryStatsMapKeys` for structured representation of keys
+    //      available in the map form.
+    static const std::string kBlockCacheEntryStats;
+
+    //  "rocksdb.fast-block-cache-entry-stats" - same as above, but returns
+    //      stale values more frequently to reduce overhead and latency.
+    static const std::string kFastBlockCacheEntryStats;
+
+    //  "rocksdb.num-immutable-mem-table" - returns number of immutable
+    //      memtables that have not yet been flushed.
+    static const std::string kNumImmutableMemTable;
+
+    //  "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable
+    //      memtables that have already been flushed.
+    static const std::string kNumImmutableMemTableFlushed;
+
+    //  "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is
+    //      pending; otherwise, returns 0.
+    static const std::string kMemTableFlushPending;
+
+    //  "rocksdb.num-running-flushes" - returns the number of currently running
+    //      flushes.
+    static const std::string kNumRunningFlushes;
+
+    //  "rocksdb.compaction-pending" - returns 1 if at least one compaction is
+    //      pending; otherwise, returns 0.
+    static const std::string kCompactionPending;
+
+    //  "rocksdb.num-running-compactions" - returns the number of currently
+    //      running compactions.
+    static const std::string kNumRunningCompactions;
+
+    //  "rocksdb.background-errors" - returns accumulated number of background
+    //      errors.
+    static const std::string kBackgroundErrors;
+
+    //  "rocksdb.cur-size-active-mem-table" - returns approximate size of active
+    //      memtable (bytes).
+    static const std::string kCurSizeActiveMemTable;
+
+    //  "rocksdb.cur-size-all-mem-tables" - returns approximate size of active
+    //      and unflushed immutable memtables (bytes).
+    static const std::string kCurSizeAllMemTables;
+
+    //  "rocksdb.size-all-mem-tables" - returns approximate size of active,
+    //      unflushed immutable, and pinned immutable memtables (bytes).
+    static const std::string kSizeAllMemTables;
+
+    //  "rocksdb.num-entries-active-mem-table" - returns total number of entries
+    //      in the active memtable.
+    static const std::string kNumEntriesActiveMemTable;
+
+    //  "rocksdb.num-entries-imm-mem-tables" - returns total number of entries
+    //      in the unflushed immutable memtables.
+    static const std::string kNumEntriesImmMemTables;
+
+    //  "rocksdb.num-deletes-active-mem-table" - returns total number of delete
+    //      entries in the active memtable.
+    static const std::string kNumDeletesActiveMemTable;
+
+    //  "rocksdb.num-deletes-imm-mem-tables" - returns total number of delete
+    //      entries in the unflushed immutable memtables.
+    static const std::string kNumDeletesImmMemTables;
+
+    //  "rocksdb.estimate-num-keys" - returns estimated number of total keys in
+    //      the active and unflushed immutable memtables and storage.
+    static const std::string kEstimateNumKeys;
+
+    //  "rocksdb.estimate-table-readers-mem" - returns estimated memory used for
+    //      reading SST tables, excluding memory used in block cache (e.g.,
+    //      filter and index blocks).
+    static const std::string kEstimateTableReadersMem;
+
+    //  "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete
+    //      files is enabled; otherwise, returns a non-zero number.
+    //      This name may be misleading because true(non-zero) means disable,
+    //      but we keep the name for backward compatibility.
+    static const std::string kIsFileDeletionsEnabled;
+
+    //  "rocksdb.num-snapshots" - returns number of unreleased snapshots of the
+    //      database.
+    static const std::string kNumSnapshots;
+
+    //  "rocksdb.oldest-snapshot-time" - returns number representing unix
+    //      timestamp of oldest unreleased snapshot.
+    static const std::string kOldestSnapshotTime;
+
+    //  "rocksdb.oldest-snapshot-sequence" - returns number representing
+    //      sequence number of oldest unreleased snapshot.
+    static const std::string kOldestSnapshotSequence;
+
+    //  "rocksdb.num-live-versions" - returns number of live versions. `Version`
+    //      is an internal data structure. See version_set.h for details. More
+    //      live versions often mean more SST files are held from being deleted,
+    //      by iterators or unfinished compactions.
+    static const std::string kNumLiveVersions;
+
+    //  "rocksdb.current-super-version-number" - returns number of current LSM
+    //      version. It is a uint64_t integer number, incremented after there is
+    //      any change to the LSM tree. The number is not preserved after
+    //      restarting the DB. After DB restart, it will start from 0 again.
+    static const std::string kCurrentSuperVersionNumber;
+
+    //  "rocksdb.estimate-live-data-size" - returns an estimate of the amount of
+    //      live data in bytes. For BlobDB, it also includes the exact value of
+    //      live bytes in the blob files of the version.
+    static const std::string kEstimateLiveDataSize;
+
+    //  "rocksdb.min-log-number-to-keep" - return the minimum log number of the
+    //      log files that should be kept.
+    static const std::string kMinLogNumberToKeep;
+
+    //  "rocksdb.min-obsolete-sst-number-to-keep" - return the minimum file
+    //      number for an obsolete SST to be kept. The max value of `uint64_t`
+    //      will be returned if all obsolete files can be deleted.
+    static const std::string kMinObsoleteSstNumberToKeep;
+
+    //  "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST
+    //      files.
+    //      WARNING: may slow down online queries if there are too many files.
+    static const std::string kTotalSstFilesSize;
+
+    //  "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST
+    //      files belong to the latest LSM tree.
+    static const std::string kLiveSstFilesSize;
+
+    //  "rocksdb.live_sst_files_size_at_temperature" - returns total size
+    //      (bytes) of SST files at all certain file temperature
+    static const std::string kLiveSstFilesSizeAtTemperature;
+
+    //  "rocksdb.base-level" - returns number of level to which L0 data will be
+    //      compacted.
+    static const std::string kBaseLevel;
+
+    //  "rocksdb.estimate-pending-compaction-bytes" - returns estimated total
+    //      number of bytes compaction needs to rewrite to get all levels down
+    //      to under target size. Not valid for other compactions than
+    //      level-based.
+    static const std::string kEstimatePendingCompactionBytes;
+
+    //  "rocksdb.aggregated-table-properties" - returns a string or map
+    //      representation of the aggregated table properties of the target
+    //      column family. Only properties that make sense for aggregation
+    //      are included.
+    static const std::string kAggregatedTableProperties;
+
+    //  "rocksdb.aggregated-table-properties-at-level<N>", same as the previous
+    //      one but only returns the aggregated table properties of the
+    //      specified level "N" at the target column family.
+    static const std::string kAggregatedTablePropertiesAtLevel;
+
+    //  "rocksdb.actual-delayed-write-rate" - returns the current actual delayed
+    //      write rate. 0 means no delay.
+    static const std::string kActualDelayedWriteRate;
+
+    //  "rocksdb.is-write-stopped" - Return 1 if write has been stopped.
+    static const std::string kIsWriteStopped;
+
+    //  "rocksdb.estimate-oldest-key-time" - returns an estimation of
+    //      oldest key timestamp in the DB. Currently only available for
+    //      FIFO compaction with
+    //      compaction_options_fifo.allow_compaction = false.
+    static const std::string kEstimateOldestKeyTime;
+
+    //  "rocksdb.block-cache-capacity" - returns block cache capacity.
+    static const std::string kBlockCacheCapacity;
+
+    //  "rocksdb.block-cache-usage" - returns the memory size for the entries
+    //      residing in block cache.
+    static const std::string kBlockCacheUsage;
+
+    //  "rocksdb.block-cache-pinned-usage" - returns the memory size for the
+    //      entries being pinned.
+    static const std::string kBlockCachePinnedUsage;
+
+    //  "rocksdb.options-statistics" - returns multi-line string
+    //      of options.statistics
+    static const std::string kOptionsStatistics;
+
+    //  "rocksdb.num-blob-files" - returns number of blob files in the current
+    //      version.
+    static const std::string kNumBlobFiles;
+
+    //  "rocksdb.blob-stats" - return the total number and size of all blob
+    //      files, and total amount of garbage (bytes) in the blob files in
+    //      the current version.
+    static const std::string kBlobStats;
+
+    //  "rocksdb.total-blob-file-size" - returns the total size of all blob
+    //      files over all versions.
+    static const std::string kTotalBlobFileSize;
+
+    //  "rocksdb.live-blob-file-size" - returns the total size of all blob
+    //      files in the current version.
+    static const std::string kLiveBlobFileSize;
+
+    //  "rocksdb.live-blob-file-garbage-size" - returns the total amount of
+    //      garbage in the blob files in the current version.
+    static const std::string kLiveBlobFileGarbageSize;
+
+    //  "rocksdb.blob-cache-capacity" - returns blob cache capacity.
+    static const std::string kBlobCacheCapacity;
+
+    //  "rocksdb.blob-cache-usage" - returns the memory size for the entries
+    //      residing in blob cache.
+    static const std::string kBlobCacheUsage;
+
+    //  "rocksdb.blob-cache-pinned-usage" - returns the memory size for the
+    //      entries being pinned in blob cache.
+    static const std::string kBlobCachePinnedUsage;
+  };
+
+  // DB implementations export properties about their state via this method.
+  // If "property" is a valid "string" property understood by this DB
+  // implementation (see Properties struct above for valid options), fills
+  // "*value" with its current value and returns true. Otherwise, returns
+  // false.
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value) = 0;
+  virtual bool GetProperty(const Slice& property, std::string* value) {
+    return GetProperty(DefaultColumnFamily(), property, value);
+  }
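+
+  // [Editor's note] Illustrative sketch, an editorial addition rather than
+  // part of the upstream header. Querying a string property and an integer
+  // property (GetIntProperty is declared just below), assuming an opened
+  // "db":
+  //
+  //   std::string stats;
+  //   if (db->GetProperty(DB::Properties::kStats, &stats)) { /* log it */ }
+  //   uint64_t num_keys = 0;
+  //   db->GetIntProperty("rocksdb.estimate-num-keys", &num_keys);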
+
+  // Like GetProperty but for valid "map" properties. (Some properties can be
+  // accessed as either "string" properties or "map" properties.)
+  virtual bool GetMapProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property,
+                              std::map<std::string, std::string>* value) = 0;
+  virtual bool GetMapProperty(const Slice& property,
+                              std::map<std::string, std::string>* value) {
+    return GetMapProperty(DefaultColumnFamily(), property, value);
+  }
+
+  // Similar to GetProperty(), but only works for a subset of properties whose
+  // return value is an integer. Return the value by integer. Supported
+  // properties:
+  //  "rocksdb.num-immutable-mem-table"
+  //  "rocksdb.mem-table-flush-pending"
+  //  "rocksdb.compaction-pending"
+  //  "rocksdb.background-errors"
+  //  "rocksdb.cur-size-active-mem-table"
+  //  "rocksdb.cur-size-all-mem-tables"
+  //  "rocksdb.size-all-mem-tables"
+  //  "rocksdb.num-entries-active-mem-table"
+  //  "rocksdb.num-entries-imm-mem-tables"
+  //  "rocksdb.num-deletes-active-mem-table"
+  //  "rocksdb.num-deletes-imm-mem-tables"
+  //  "rocksdb.estimate-num-keys"
+  //  "rocksdb.estimate-table-readers-mem"
+  //  "rocksdb.is-file-deletions-enabled"
+  //  "rocksdb.num-snapshots"
+  //  "rocksdb.oldest-snapshot-time"
+  //  "rocksdb.num-live-versions"
+  //  "rocksdb.current-super-version-number"
+  //  "rocksdb.estimate-live-data-size"
+  //  "rocksdb.min-log-number-to-keep"
+  //  "rocksdb.min-obsolete-sst-number-to-keep"
+  //  "rocksdb.total-sst-files-size"
+  //  "rocksdb.live-sst-files-size"
+  //  "rocksdb.base-level"
+  //  "rocksdb.estimate-pending-compaction-bytes"
+  //  "rocksdb.num-running-compactions"
+  //  "rocksdb.num-running-flushes"
+  //  "rocksdb.actual-delayed-write-rate"
+  //  "rocksdb.is-write-stopped"
+  //  "rocksdb.estimate-oldest-key-time"
+  //  "rocksdb.block-cache-capacity"
+  //  "rocksdb.block-cache-usage"
+  //  "rocksdb.block-cache-pinned-usage"
+  //
+  //  Properties dedicated for BlobDB:
+  //  "rocksdb.num-blob-files"
+  //  "rocksdb.total-blob-file-size"
+  //  "rocksdb.live-blob-file-size"
+  //  "rocksdb.blob-cache-capacity"
+  //  "rocksdb.blob-cache-usage"
+  //  "rocksdb.blob-cache-pinned-usage"
+  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property, uint64_t* value) = 0;
+  virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
+    return GetIntProperty(DefaultColumnFamily(), property, value);
+  }
+
+  // Reset internal stats for DB and all column families.
+  // Note this doesn't reset options.statistics as it is not owned by
+  // DB.
+  virtual Status ResetStats() {
+    return Status::NotSupported("Not implemented");
+  }
+
+  // Same as GetIntProperty(), but this one returns the aggregated int
+  // property from all column families.
+  virtual bool GetAggregatedIntProperty(const Slice& property,
+                                        uint64_t* value) = 0;
+
+  // Flags for DB::GetSizeApproximation that specify whether memtable
+  // stats should be included, or file stats approximation or both
+  enum class SizeApproximationFlags : uint8_t {
+    NONE = 0,
+    INCLUDE_MEMTABLES = 1 << 0,
+    INCLUDE_FILES = 1 << 1
+  };
+
+  // For each i in [0,n-1], store in "sizes[i]", the approximate
+  // file system space used by keys in "[range[i].start .. range[i].limit)"
+  // in a single column family.
+  //
+  // Note that the returned sizes measure file system space usage, so
+  // if the user data compresses by a factor of ten, the returned
+  // sizes will be one-tenth the size of the corresponding user data size.
+  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Range* ranges, int n,
+                                     uint64_t* sizes) = 0;
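+
+  // [Editor's note] Illustrative sketch, an editorial addition rather than
+  // part of the upstream header. Estimating on-disk space for one key range
+  // of the default column family, assuming an opened "db":
+  //
+  //   Range r("a", "z");
+  //   uint64_t size = 0;
+  //   Status s = db->GetApproximateSizes(SizeApproximationOptions(),
+  //                                      db->DefaultColumnFamily(), &r, 1,
+  //                                      &size);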
+
+  // Simpler versions of the GetApproximateSizes() method above.
+  // The include_flags argument must be of type DB::SizeApproximationFlags
+  // and can not be NONE.
+  virtual Status GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                     const Range* ranges, int n,
+                                     uint64_t* sizes,
+                                     SizeApproximationFlags include_flags =
+                                         SizeApproximationFlags::INCLUDE_FILES);
+
+  virtual Status GetApproximateSizes(
+      const Range* ranges, int n, uint64_t* sizes,
+      SizeApproximationFlags include_flags =
+          SizeApproximationFlags::INCLUDE_FILES) {
+    return GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes,
+                               include_flags);
+  }
+
+  // The method is similar to GetApproximateSizes, except it
+  // returns approximate number of records in memtables.
+  virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+                                           const Range& range,
+                                           uint64_t* const count,
+                                           uint64_t* const size) = 0;
+  virtual void GetApproximateMemTableStats(const Range& range,
+                                           uint64_t* const count,
+                                           uint64_t* const size) {
+    GetApproximateMemTableStats(DefaultColumnFamily(), range, count, size);
+  }
+
+  // Compact the underlying storage for the key range [*begin,*end].
+  // The actual compaction interval might be superset of [*begin, *end].
+  // In particular, deleted and overwritten versions are discarded,
+  // and the data is rearranged to reduce the cost of operations
+  // needed to access the data. This operation should typically only
+  // be invoked by users who understand the underlying implementation.
+  // This call blocks until the operation completes successfully, fails,
+  // or is aborted (Status::Incomplete). See DisableManualCompaction.
+  //
+  // begin==nullptr is treated as a key before all keys in the database.
+  // end==nullptr is treated as a key after all keys in the database.
+  // Therefore the following call will compact the entire database:
+  //    db->CompactRange(options, nullptr, nullptr);
+  // Note that after the entire database is compacted, all data are pushed
+  // down to the last level containing any data. If the total data size after
+  // compaction is reduced, that level might not be appropriate for hosting all
+  // the files. In this case, client could set options.change_level to true, to
+  // move the files back to the minimum level capable of holding the data set
+  // or a given level (specified by non-negative options.target_level).
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end) = 0;
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              const Slice* begin, const Slice* end) {
+    return CompactRange(options, DefaultColumnFamily(), begin, end);
+  }
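+
+  // [Editor's note] Illustrative sketch, an editorial addition rather than
+  // part of the upstream header. Compacting the whole key space of the
+  // default column family, assuming an opened "db":
+  //
+  //   CompactRangeOptions cro;
+  //   cro.change_level = true;  // allow moving data back to a lower level
+  //   Status s = db->CompactRange(cro, nullptr, nullptr);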
+
+  // Dynamically change column family options or table factory options in a
+  // running DB, for the specified column family. Only options internally
+  // marked as "mutable" can be changed. Options not listed in `opts_map` will
+  // keep their current values. See GetColumnFamilyOptionsFromMap() in
+  // convenience.h for the details of `opts_map`. Not supported in LITE mode.
+  //
+  // USABILITY NOTE: SetOptions is intended only for expert users, and does
+  // not apply the same sanitization to options as the standard DB::Open code
+  // path does. Use with caution.
+  //
+  // RELIABILITY & PERFORMANCE NOTE: SetOptions is not fully stress-tested for
+  // reliability, and this is a slow call because a new OPTIONS file is
+  // serialized and persisted for each call. Use only infrequently.
+  //
+  // EXAMPLES:
+  //  s = db->SetOptions(cfh, {{"ttl", "36000"}});
+  //  s = db->SetOptions(cfh, {{"block_based_table_factory",
+  //                            "{prepopulate_block_cache=kDisable;}"}});
+  virtual Status SetOptions(
+      ColumnFamilyHandle* /*column_family*/,
+      const std::unordered_map<std::string, std::string>& /*opts_map*/) {
+    return Status::NotSupported("Not implemented");
+  }
+  // Shortcut for SetOptions on the default column family handle.
+  virtual Status SetOptions(
+      const std::unordered_map<std::string, std::string>& new_options) {
+    return SetOptions(DefaultColumnFamily(), new_options);
+  }
+
+  // Like SetOptions but for DBOptions, including the same caveats for
+  // usability, reliability, and performance. See GetDBOptionsFromMap() (and
+  // GetColumnFamilyOptionsFromMap()) in convenience.h for details on
+  // `opts_map`. Not supported in LITE mode.
+  //
+  // EXAMPLES:
+  //  s = db->SetDBOptions({{"max_subcompactions", "2"}});
+  //  s = db->SetDBOptions({{"stats_dump_period_sec", "0"},
+  //                        {"stats_persist_period_sec", "0"}});
+  virtual Status SetDBOptions(
+      const std::unordered_map<std::string, std::string>& new_options) = 0;
+
+  // CompactFiles() inputs a list of files specified by file numbers and
+  // compacts them to the specified level. A small difference compared to
+  // CompactRange() is that CompactFiles() performs the compaction job
+  // using the CURRENT thread, so is not considered a "background" job.
+  //
+  // @see GetDataBaseMetaData
+  // @see GetColumnFamilyMetaData
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names, const int output_level,
+      const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) = 0;
+
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      const std::vector<std::string>& input_file_names, const int output_level,
+      const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) {
+    return CompactFiles(compact_options, DefaultColumnFamily(),
+                        input_file_names, output_level, output_path_id,
+                        output_file_names, compaction_job_info);
+  }
+
+  // This function will wait until all currently running background processes
+  // finish. After it returns, no background process will be run until
+  // ContinueBackgroundWork is called, once for each preceding OK-returning
+  // call to PauseBackgroundWork.
+  virtual Status PauseBackgroundWork() = 0;
+  virtual Status ContinueBackgroundWork() = 0;
+
+  // This function will enable automatic compactions for the given column
+  // families if they were previously disabled. The function will first set the
+  // disable_auto_compactions option for each column family to 'false', after
+  // which it will schedule a flush/compaction.
+  //
+  // NOTE: Setting disable_auto_compactions to 'false' through SetOptions() API
+  // does NOT schedule a flush/compaction afterwards, and only changes the
+  // parameter itself within the column family option.
+  //
+  virtual Status EnableAutoCompaction(
+      const std::vector<ColumnFamilyHandle*>& column_family_handles) = 0;
+
+  // After this function call, CompactRange() or CompactFiles() will not
+  // run compactions and fail. Calling this function will tell outstanding
+  // manual compactions to abort and will wait for them to finish or abort
+  // before returning.
+  virtual void DisableManualCompaction() = 0;
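+
+  // [Editor's note] Illustrative sketch, an editorial addition rather than
+  // part of the upstream header. Pairing PauseBackgroundWork() with
+  // ContinueBackgroundWork(), assuming an opened "db":
+  //
+  //   Status s = db->PauseBackgroundWork();
+  //   if (s.ok()) {
+  //     // ... do work that must not race with flushes/compactions ...
+  //     s = db->ContinueBackgroundWork();  // one call per successful pause
+  //   }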
+  // Re-enable CompactRange() and CompactFiles() that are disabled by
+  // DisableManualCompaction(). This function must be called as many times
+  // as DisableManualCompaction() has been called in order to re-enable
+  // manual compactions, and must not be called more times than
+  // DisableManualCompaction() has been called.
+  virtual void EnableManualCompaction() = 0;
+
+  // Number of levels used for this DB.
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
+  virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap.
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
+  virtual int MaxMemCompactionLevel() {
+    return MaxMemCompactionLevel(DefaultColumnFamily());
+  }
+
+  // Number of files in level-0 that would stop writes.
+  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
+  virtual int Level0StopWriteTrigger() {
+    return Level0StopWriteTrigger(DefaultColumnFamily());
+  }
+
+  // Get DB name -- the exact same name that was provided as an argument to
+  // DB::Open()
+  virtual const std::string& GetName() const = 0;
+
+  // Get Env object from the DB
+  virtual Env* GetEnv() const = 0;
+
+  // A shortcut for GetEnv()->GetFileSystem().get(), possibly cached for
+  // efficiency.
+  virtual FileSystem* GetFileSystem() const;
+
+  // Get DB Options that we use. During the process of opening the
+  // column family, the options provided when calling DB::Open() or
+  // DB::CreateColumnFamily() will have been "sanitized" and transformed
+  // in an implementation-defined manner.
+  virtual Options GetOptions(ColumnFamilyHandle* column_family) const = 0;
+  virtual Options GetOptions() const {
+    return GetOptions(DefaultColumnFamily());
+  }
+
+  virtual DBOptions GetDBOptions() const = 0;
+
+  // Flush all memtable data.
+  // Flush a single column family, even when atomic flush is enabled. To flush
+  // multiple column families, use Flush(options, column_families).
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family) = 0;
+  virtual Status Flush(const FlushOptions& options) {
+    return Flush(options, DefaultColumnFamily());
+  }
+  // Flushes memtables of multiple column families.
+  // If atomic flush is not enabled, Flush(options, column_families) is
+  // equivalent to calling Flush(options, column_family) multiple times.
+  // If atomic flush is enabled, Flush(options, column_families) will flush all
+  // column families specified in 'column_families' up to the latest sequence
+  // number at the time when flush is requested.
+  // Note that RocksDB 5.15 and earlier may not be able to open later versions
+  // with atomic flush enabled.
+  virtual Status Flush(
+      const FlushOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families) = 0;
+
+  // When using the manual_wal_flush option, flushes RocksDB internal buffers
+  // of WAL data to the file, so that the data can survive process crash or be
+  // included in a Checkpoint or Backup. Without manual_wal_flush, there is no
+  // such internal buffer. If sync is true, it calls SyncWAL() afterwards.
+  virtual Status FlushWAL(bool /*sync*/) {
+    return Status::NotSupported("FlushWAL not implemented");
+  }
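+
+  // EXAMPLE (illustrative sketch; assumes an open DB* named `db`, column
+  // family handles `cf1` and `cf2`, and atomic_flush enabled in DBOptions):
+  // flushing two column families up to a single sequence number:
+  //
+  //   FlushOptions fo;
+  //   fo.wait = true;
+  //   Status s = db->Flush(fo, {cf1, cf2});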
+
+  // Ensure all WAL writes have been synced to storage, so that (assuming OS
+  // and hardware support) data will survive power loss. This function does
+  // not imply FlushWAL, so `FlushWAL(true)` is recommended if using
+  // manual_wal_flush=true. Currently only works if allow_mmap_writes = false
+  // in Options.
+  //
+  // Note that Write() followed by SyncWAL() is not exactly the same as Write()
+  // with sync=true: in the latter case the changes won't be visible until the
+  // sync is done.
+  virtual Status SyncWAL() = 0;
+
+  // Freezes the logical state of the DB (by stopping writes), and if WAL is
+  // enabled, ensures that state has been flushed to DB files (as in
+  // FlushWAL()). This can be used for taking a Checkpoint at a known DB
+  // state, though the user must use options to ensure no DB flush is invoked
+  // in this frozen state. Other operations allowed on a "read only" DB should
+  // work while frozen. Each LockWAL() call that returns OK must eventually be
+  // followed by a corresponding call to UnlockWAL(). Where supported, non-OK
+  // status is generally only possible with some kind of corruption or I/O
+  // error.
+  virtual Status LockWAL() {
+    return Status::NotSupported("LockWAL not implemented");
+  }
+
+  // Unfreeze the DB state from a successful LockWAL().
+  // The write stop on the database will be cleared when UnlockWAL() has been
+  // called for each successful LockWAL().
+  virtual Status UnlockWAL() {
+    return Status::NotSupported("UnlockWAL not implemented");
+  }
+
+  // The sequence number of the most recent transaction.
+  virtual SequenceNumber GetLatestSequenceNumber() const = 0;
+
+  // Prevent file deletions. Compactions will continue to occur,
+  // but no obsolete files will be deleted. Calling this multiple
+  // times has the same effect as calling it once.
+  virtual Status DisableFileDeletions() = 0;
+
+  // Increase the full_history_ts_low of the column family. The new ts_low
+  // value should be newer than the current full_history_ts_low value.
+  // If another thread updates full_history_ts_low concurrently to a higher
+  // timestamp than the requested ts_low, a try again error will be returned.
+  virtual Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                          std::string ts_low) = 0;
+
+  // Get the current full_history_ts_low value.
+  virtual Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                     std::string* ts_low) = 0;
+
+  // Allow compactions to delete obsolete files.
+  // If force == true, the call to EnableFileDeletions() will guarantee that
+  // file deletions are enabled after the call, even if DisableFileDeletions()
+  // was called multiple times before.
+  // If force == false, EnableFileDeletions will only enable file deletion
+  // after it's been called at least as many times as DisableFileDeletions(),
+  // enabling the two methods to be called by two threads concurrently without
+  // synchronization -- i.e., file deletions will be enabled only after both
+  // threads call EnableFileDeletions()
+  virtual Status EnableFileDeletions(bool force = true) = 0;
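+
+  // EXAMPLE (illustrative sketch; assumes an open DB* named `db`): bracketing
+  // a manual live copy so obsolete files are not deleted while being copied:
+  //
+  //   Status s = db->DisableFileDeletions();
+  //   if (s.ok()) {
+  //     // ... copy live files out of the DB directory ...
+  //     s = db->EnableFileDeletions(/*force=*/false);
+  //   }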
+
+  // Retrieves the creation time of the oldest file in the DB.
+  // This API only works if max_open_files = -1; if it is not, the
+  // Status returned is Status::NotSupported().
+  // The file creation time is set using the env provided to the DB.
+  // If the DB was created from a very old release, then it's possible that
+  // the SST files might not have a file_creation_time property, and even after
+  // moving to a newer release it's possible that some files never got
+  // compacted and may not have a file_creation_time property. In both cases,
+  // file_creation_time is considered 0, which means this API will return
+  // creation_time = 0, as there wouldn't be a timestamp lower than 0.
+  virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0;
+
+  // Note: this API is not yet consistent with WritePrepared transactions.
+  //
+  // Sets iter to an iterator that is positioned at a write-batch whose
+  // sequence number range [start_seq, end_seq] covers seq_number. If no such
+  // write-batch exists, then iter is positioned at the next write-batch whose
+  // start_seq > seq_number.
+  //
+  // Returns Status::OK if the iterator is valid.
+  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
+  // use this API, else the WAL files will get
+  // cleared aggressively and the iterator might keep getting invalidated
+  // before an update is read.
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options =
+          TransactionLogIterator::ReadOptions()) = 0;
+
+// Windows API macro interference
+#undef DeleteFile
+  // WARNING: This API is planned for removal in RocksDB 7.0 since it does not
+  // operate at the proper level of abstraction for a key-value store, and its
+  // contract/restrictions are poorly documented. For example, it returns
+  // non-OK `Status` for non-bottommost files and files undergoing compaction.
+  // Since we do not plan to maintain it, the contract will likely remain
+  // underspecified until its removal. Any user is encouraged to read the
+  // implementation carefully and migrate away from it when possible.
+  //
+  // Delete the file name from the db directory and update the internal state
+  // to reflect that. Supports deletion of sst and log files only. 'name' must
+  // be path relative to the db directory. eg. 000001.sst, /archive/000003.log
+  virtual Status DeleteFile(std::string name) = 0;
+
+  // Obtains a list of all live table (SST) files and how they fit into the
+  // LSM-trees, such as column family, level, key range, etc.
+  // This builds a de-normalized form of GetAllColumnFamilyMetaData().
+  // For information about all files in a DB, use GetLiveFilesStorageInfo().
+  virtual void GetLiveFilesMetaData(
+      std::vector<LiveFileMetaData>* /*metadata*/) {}
+
+  // Return a list of all table (SST) and blob files checksum info.
+  // Note: This function might be of limited use because it cannot be
+  // synchronized with other "live files" APIs. GetLiveFilesStorageInfo()
+  // is recommended instead.
+  virtual Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) = 0;
+
+  // Get information about all live files that make up a DB, for making
+  // live copies (Checkpoint, backups, etc.) or other storage-related purposes.
+  // If creating a live copy, use DisableFileDeletions() before and
+  // EnableFileDeletions() after to prevent deletions.
+  // For LSM-tree metadata, use Get*MetaData() functions instead.
+  virtual Status GetLiveFilesStorageInfo(
+      const LiveFilesStorageInfoOptions& opts,
+      std::vector<LiveFileStorageInfo>* files) = 0;
+
+  // Obtains the LSM-tree meta data of the specified column family of the DB,
+  // including metadata for each live table (SST) file in that column family.
+  virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
+                                       ColumnFamilyMetaData* /*metadata*/) {}
+
+  // Get the metadata of the default column family.
+  void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) {
+    GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
+  }
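+
+  // EXAMPLE (illustrative sketch; assumes an open DB* named `db`): inspecting
+  // the live SST files of the default column family, level by level:
+  //
+  //   ColumnFamilyMetaData meta;
+  //   db->GetColumnFamilyMetaData(&meta);
+  //   for (const auto& level : meta.levels) {
+  //     // level.files describes each live table file on this level
+  //   }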
+
+  // Obtains the LSM-tree meta data of all column families of the DB, including
+  // metadata for each live table (SST) file and each blob file in the DB.
+  virtual void GetAllColumnFamilyMetaData(
+      std::vector<ColumnFamilyMetaData>* /*metadata*/) {}
+
+  // Retrieve the list of all files in the database except WAL files. The files
+  // are relative to the dbname (or db_paths/cf_paths), not absolute paths.
+  // (Not recommended with db_paths/cf_paths because that information is not
+  // returned.) Despite being relative paths, the file names begin with "/".
+  // The valid size of the manifest file is returned in manifest_file_size.
+  // The manifest file is an ever growing file, but only the portion specified
+  // by manifest_file_size is valid for this snapshot. Setting flush_memtable
+  // to true does Flush before recording the live files (unless the DB is
+  // read-only). Setting flush_memtable to false is useful when we don't want
+  // to wait for a flush, which may have to wait for a compaction to complete,
+  // taking an indeterminate amount of time.
+  //
+  // NOTE: Although GetLiveFiles() followed by GetSortedWalFiles() can generate
+  // a lossless backup, GetLiveFilesStorageInfo() is strongly recommended
+  // instead, because it ensures a single consistent view of all files is
+  // captured in one call.
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) = 0;
+
+  // Retrieve the sorted list of all wal files with earliest file first
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
+
+  // Retrieve information about the current wal file
+  //
+  // Note that the log might have rolled after this call in which case
+  // the current_log_file would not point to the current log file.
+  //
+  // Additionally, for the sake of optimization current_log_file->StartSequence
+  // would always be set to 0
+  virtual Status GetCurrentWalFile(
+      std::unique_ptr<LogFile>* current_log_file) = 0;
+
+  // IngestExternalFile() will load a list of external SST files (1) into the
+  // DB. Two primary modes are supported:
+  // - Duplicate keys in the new files will overwrite existing keys (default)
+  // - Duplicate keys will be skipped (set ingest_behind=true)
+  // In the first mode we will try to find the lowest possible level that
+  // the file can fit in, and ingest the file into this level (2). A file that
+  // has a key range that overlaps with the memtable key range will require us
+  // to Flush the memtable first before ingesting the file.
+  // In the second mode we will always ingest in the bottommost level (see
+  // docs to IngestExternalFileOptions::ingest_behind).
+  //
+  // (1) External SST files can be created using SstFileWriter
+  // (2) We will try to ingest the files to the lowest possible level
+  //     even if the file compression doesn't match the level compression
+  // (3) If IngestExternalFileOptions->ingest_behind is set to true,
+  //     we always ingest at the bottommost level, which should be reserved
+  //     for this purpose (see DBOptions::allow_ingest_behind flag).
+  // (4) If IngestExternalFileOptions->fail_if_not_bottommost_level is set to
+  //     true, then this method can return Status::TryAgain() indicating that
+  //     the files cannot be ingested to the bottommost level, and it is the
+  //     user's responsibility to clear the bottommost level in the overlapping
+  //     range before re-attempting the ingestion.
+  virtual Status IngestExternalFile(
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& external_files,
+      const IngestExternalFileOptions& options) = 0;
+
+  virtual Status IngestExternalFile(
+      const std::vector<std::string>& external_files,
+      const IngestExternalFileOptions& options) {
+    return IngestExternalFile(DefaultColumnFamily(), external_files, options);
+  }
+
+  // IngestExternalFiles() will ingest files for multiple column families, and
+  // record the result atomically to the MANIFEST.
+  // If this function returns OK, all column families' ingestion must succeed.
+  // If this function returns non-OK, or the process crashes, then none of the
+  // files will be ingested into the database after recovery.
+  // Note that it is possible for the application to observe a mixed state
+  // during the execution of this function. If the user performs range scans
+  // over the column families with iterators, an iterator on one column family
+  // may return ingested data, while an iterator on another column family
+  // returns old data. Users can use snapshots for a consistent view of data.
+  // If your db ingests multiple SST files using this API, i.e. args.size()
+  // > 1, then RocksDB 5.15 and earlier will not be able to open it.
+  //
+  // REQUIRES: each arg corresponds to a different column family: namely, for
+  // 0 <= i < j < len(args), args[i].column_family != args[j].column_family.
+  virtual Status IngestExternalFiles(
+      const std::vector<IngestExternalFileArg>& args) = 0;
+
+  // CreateColumnFamilyWithImport() will create a new column family with
+  // column_family_name and import external SST files specified in `metadata`
+  // into this column family.
+  // (1) External SST files can be created using SstFileWriter.
+  // (2) External SST files can be exported from a particular column family in
+  //     an existing DB using Checkpoint::ExportColumnFamily. `metadata` should
+  //     be the output from Checkpoint::ExportColumnFamily.
+  // Option in import_options specifies whether the external files are copied
+  // or moved (default is copy). When option specifies copy, managing files at
+  // external_file_path is caller's responsibility. When option specifies a
+  // move, the call makes a best effort to delete the specified files at
+  // external_file_path on successful return, logging any failure to delete
+  // rather than returning in Status. Files are not modified on any error
+  // return, and a best effort is made to remove any newly-created files.
+  // On error return, the column family handle returned will be nullptr.
+  // ColumnFamily will be present on successful return and will not be present
+  // on error return. ColumnFamily may be present on any crash during this
+  // call.
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      const ImportColumnFamilyOptions& import_options,
+      const ExportImportFilesMetaData& metadata,
+      ColumnFamilyHandle** handle) = 0;
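+
+  // EXAMPLE (illustrative sketch; assumes an open DB* named `db` and its
+  // Options `options`; SstFileWriter comes from sst_file_writer.h): creating
+  // one external SST file and ingesting it via the APIs above:
+  //
+  //   SstFileWriter writer(EnvOptions(), options);
+  //   Status s = writer.Open("/tmp/file1.sst");
+  //   if (s.ok()) s = writer.Put("key1", "value1");
+  //   if (s.ok()) s = writer.Finish();
+  //   if (s.ok()) s = db->IngestExternalFile({"/tmp/file1.sst"},
+  //                                          IngestExternalFileOptions());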
+
+  // EXPERIMENTAL
+  // ClipColumnFamily() will clip the entries in the CF according to the range
+  // [begin_key, end_key).
+  // Returns OK on success, and a non-OK status on error.
+  // Any entries outside this range will be completely deleted (including
+  // tombstones).
+  // The main difference between ClipColumnFamily(begin, end) and
+  // DeleteRange(begin, end) is that the former physically deletes all keys
+  // outside the range, but is more heavyweight than the latter.
+  // This feature is mainly used to ensure that there is no overlapping key
+  // when calling CreateColumnFamilyWithImport() to import multiple CFs.
+  // Note that concurrent updates cannot be performed during Clip.
+  virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family,
+                                  const Slice& begin_key,
+                                  const Slice& end_key) = 0;
+
+  // Verify the checksums of files in db. Currently the whole-file checksum of
+  // table files is checked.
+  virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) {
+    return Status::NotSupported("File verification not supported");
+  }
+
+  // Verify the block checksums of files in db. The block checksums of table
+  // files are checked.
+  virtual Status VerifyChecksum(const ReadOptions& read_options) = 0;
+
+  virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
+
+  // Returns the unique ID which is read from the IDENTITY file during the
+  // opening of the database, by setting it in the identity variable.
+  // Returns Status::OK if the identity could be set properly.
+  virtual Status GetDbIdentity(std::string& identity) const = 0;
+
+  // Return a unique identifier for each DB object that is opened.
+  // This DB session ID should be unique among all open DB instances on all
+  // hosts, and should be unique among re-openings of the same or other DBs.
+  // (Two open DBs can have the same identity from the other function,
+  // GetDbIdentity, when one is physically copied from the other.)
+  virtual Status GetDbSessionId(std::string& session_id) const = 0;
+
+  // Returns the default column family handle
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
+
+  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                          TablePropertiesCollection* props) = 0;
+  virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+    return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
+  }
+  virtual Status GetPropertiesOfTablesInRange(
+      ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+      TablePropertiesCollection* props) = 0;
+
+  virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/,
+                                     const Slice* /*begin*/,
+                                     const Slice* /*end*/) {
+    return Status::NotSupported("SuggestCompactRange() is not implemented.");
+  }
+
+  virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/,
+                           int /*target_level*/) {
+    return Status::NotSupported("PromoteL0() is not implemented.");
+  }
+
+  // Trace DB operations. Use EndTrace() to stop tracing.
+  virtual Status StartTrace(const TraceOptions& /*options*/,
+                            std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+    return Status::NotSupported("StartTrace() is not implemented.");
+  }
+
+  virtual Status EndTrace() {
+    return Status::NotSupported("EndTrace() is not implemented.");
+  }
+
+  // IO Tracing operations. Use EndIOTrace() to stop tracing.
+  virtual Status StartIOTrace(const TraceOptions& /*options*/,
+                              std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+    return Status::NotSupported("StartIOTrace() is not implemented.");
+  }
+
+  virtual Status EndIOTrace() {
+    return Status::NotSupported("EndIOTrace() is not implemented.");
+  }
+
+  // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing.
+  virtual Status StartBlockCacheTrace(
+      const TraceOptions& /*trace_options*/,
+      std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+    return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
+  }
+
+  virtual Status StartBlockCacheTrace(
+      const BlockCacheTraceOptions& /*options*/,
+      std::unique_ptr<BlockCacheTraceWriter>&& /*trace_writer*/) {
+    return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
+  }
+
+  virtual Status EndBlockCacheTrace() {
+    return Status::NotSupported("EndBlockCacheTrace() is not implemented.");
+  }
+
+  // Create a default trace replayer.
+  virtual Status NewDefaultReplayer(
+      const std::vector<ColumnFamilyHandle*>& /*handles*/,
+      std::unique_ptr<TraceReader>&& /*reader*/,
+      std::unique_ptr<Replayer>* /*replayer*/) {
+    return Status::NotSupported("NewDefaultReplayer() is not implemented.");
+  }
+
+  // Needed for StackableDB
+  virtual DB* GetRootDB() { return this; }
+
+  // Given a window [start_time, end_time), set up a StatsHistoryIterator
+  // to access stats history. Note the start_time and end_time are epoch
+  // time measured in seconds, and end_time is an exclusive bound.
+  virtual Status GetStatsHistory(
+      uint64_t /*start_time*/, uint64_t /*end_time*/,
+      std::unique_ptr<StatsHistoryIterator>* /*stats_iterator*/) {
+    return Status::NotSupported("GetStatsHistory() is not implemented.");
+  }
+
+  // Make the secondary instance catch up with the primary by tailing and
+  // replaying the MANIFEST and WAL of the primary.
+  // Column families created by the primary after the secondary instance starts
+  // will be ignored unless the secondary instance closes and restarts with the
+  // newly created column families.
+  // Column families that exist before the secondary instance starts and are
+  // dropped by the primary afterwards will be marked as dropped. However, as
+  // long as the secondary instance does not delete the corresponding column
+  // family handles, the data of the column family is still accessible to the
+  // secondary.
+  virtual Status TryCatchUpWithPrimary() {
+    return Status::NotSupported("Supported only by secondary instance");
+  }
+};
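+
+// EXAMPLE (illustrative sketch; assumes DB::OpenAsSecondary, declared earlier
+// in this header, with placeholder paths `kDbPath` and `kSecondaryPath` and an
+// application-defined flag `keep_following`): a secondary instance repeatedly
+// catching up with the primary:
+//
+//   DB* secondary = nullptr;
+//   Status s = DB::OpenAsSecondary(Options(), kDbPath, kSecondaryPath,
+//                                  &secondary);
+//   while (s.ok() && keep_following) {
+//     s = secondary->TryCatchUpWithPrimary();
+//   }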
+
+struct WriteStallStatsMapKeys {
+  static const std::string& TotalStops();
+  static const std::string& TotalDelays();
+
+  static const std::string& CFL0FileCountLimitDelaysWithOngoingCompaction();
+  static const std::string& CFL0FileCountLimitStopsWithOngoingCompaction();
+
+  // REQUIRES:
+  // `cause` isn't any of these: `WriteStallCause::kNone`,
+  // `WriteStallCause::kCFScopeWriteStallCauseEnumMax`,
+  // `WriteStallCause::kDBScopeWriteStallCauseEnumMax`
+  //
+  // REQUIRES:
+  // `condition` isn't any of these: `WriteStallCondition::kNormal`
+  static std::string CauseConditionCount(WriteStallCause cause,
+                                         WriteStallCondition condition);
+};
+
+// Overloaded operators for enum class SizeApproximationFlags.
+inline DB::SizeApproximationFlags operator&(DB::SizeApproximationFlags lhs,
+                                            DB::SizeApproximationFlags rhs) {
+  return static_cast<DB::SizeApproximationFlags>(static_cast<uint8_t>(lhs) &
+                                                 static_cast<uint8_t>(rhs));
+}
+inline DB::SizeApproximationFlags operator|(DB::SizeApproximationFlags lhs,
+                                            DB::SizeApproximationFlags rhs) {
+  return static_cast<DB::SizeApproximationFlags>(static_cast<uint8_t>(lhs) |
+                                                 static_cast<uint8_t>(rhs));
+}
+
+inline Status DB::GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                      const Range* ranges, int n,
+                                      uint64_t* sizes,
+                                      SizeApproximationFlags include_flags) {
+  SizeApproximationOptions options;
+  options.include_memtables =
+      ((include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) !=
+       SizeApproximationFlags::NONE);
+  options.include_files =
+      ((include_flags & SizeApproximationFlags::INCLUDE_FILES) !=
+       SizeApproximationFlags::NONE);
+  return GetApproximateSizes(options, column_family, ranges, n, sizes);
+}
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options,
+                 const std::vector<ColumnFamilyDescriptor>& column_families =
+                     std::vector<ColumnFamilyDescriptor>());
+
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+//
+// With this API, we will warn and skip data associated with column families
+// not specified in column_families.
+//
+// @param column_families Descriptors for known column families
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+                const std::vector<ColumnFamilyDescriptor>& column_families);
+
+// @param unknown_cf_opts Options for column families encountered during the
+//                        repair that were not specified in column_families.
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+                const std::vector<ColumnFamilyDescriptor>& column_families,
+                const ColumnFamilyOptions& unknown_cf_opts);
+
+// @param options These options will be used for the database and for ALL
+//                column families encountered during the repair.
+Status RepairDB(const std::string& dbname, const Options& options);
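+
+// EXAMPLE (illustrative sketch; `dbname` and `options` are those from the
+// failed DB::Open call): repairing a database and then reopening it:
+//
+//   Status s = RepairDB(dbname, options);
+//   if (s.ok()) {
+//     DB* db = nullptr;
+//     s = DB::Open(options, dbname, &db);
+//   }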
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db_bench_tool.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db_bench_tool.h
new file mode 100644
index 0000000000..17f4e6bde7
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db_bench_tool.h
@@ -0,0 +1,11 @@
+// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+int db_bench_tool(int argc, char** argv);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db_dump_tool.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db_dump_tool.h
new file mode 100644
index 0000000000..2c97bad755
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db_dump_tool.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct DumpOptions {
+  // Database that will be dumped
+  std::string db_path;
+  // File location that will contain dump output
+  std::string dump_location;
+  // Don't include db information header in the dump
+  bool anonymous = false;
+};
+
+class DbDumpTool {
+ public:
+  bool Run(const DumpOptions& dump_options,
+           ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options());
+};
+
+struct UndumpOptions {
+  // Database that we will load the dumped file into
+  std::string db_path;
+  // File location of the dumped file that will be loaded
+  std::string dump_location;
+  // Compact the db after loading the dumped file
+  bool compact_db = false;
+};
+
+class DbUndumpTool {
+ public:
+  bool Run(const UndumpOptions& undump_options,
+           ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options());
+};
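+
+// EXAMPLE (illustrative sketch; `kDbPath`, `kDumpPath`, and `kNewDbPath` are
+// placeholder paths): dumping a database to a file and loading it back:
+//
+//   DumpOptions dump_opts;
+//   dump_opts.db_path = kDbPath;
+//   dump_opts.dump_location = kDumpPath;
+//   bool ok = DbDumpTool().Run(dump_opts);
+//
+//   UndumpOptions undump_opts;
+//   undump_opts.db_path = kNewDbPath;
+//   undump_opts.dump_location = kDumpPath;
+//   ok = ok && DbUndumpTool().Run(undump_opts);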
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db_stress_tool.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db_stress_tool.h
new file mode 100644
index 0000000000..7d3d42c9d6
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/db_stress_tool.h
@@ -0,0 +1,11 @@
+// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+int db_stress_tool(int argc, char** argv);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/env.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/env.h
new file mode 100644
index 0000000000..e3d4146416
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/env.h
@@ -0,0 +1,1882 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc. Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine-grained control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <cstdarg>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/functor_wrapper.h"
+#include "rocksdb/port_defs.h"
+#include "rocksdb/status.h"
+#include "rocksdb/thread_status.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#undef GetCurrentTime
+#undef LoadLibrary
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \
+  __attribute__((__format__(__printf__, format_param, dots_param)))
+#else
+#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DynamicLibrary;
+class FileLock;
+class Logger;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+struct DataVerificationInfo;
+class WritableFile;
+class RandomRWFile;
+class MemoryMappedFileBuffer;
+class Directory;
+struct DBOptions;
+struct ImmutableDBOptions;
+struct MutableDBOptions;
+class RateLimiter;
+class ThreadStatusUpdater;
+struct ThreadStatus;
+class FileSystem;
+class SystemClock;
+struct ConfigOptions;
+
+const size_t kDefaultPageSize = 4 * 1024;
+
+// Options while opening a file to read/write
+struct EnvOptions {
+  // Construct with default Options
+  EnvOptions();
+
+  // Construct from Options
+  explicit EnvOptions(const DBOptions& options);
+
+  // If true, then use mmap to read data.
+  // Not recommended for 32-bit OS.
+  bool use_mmap_reads = false;
+
+  // If true, then use mmap to write data
+  bool use_mmap_writes = true;
+
+  // If true, then use O_DIRECT for reading data
+  bool use_direct_reads = false;
+
+  // If true, then use O_DIRECT for writing data
+  bool use_direct_writes = false;
+
+  // If false, fallocate() calls are bypassed
+  bool allow_fallocate = true;
+
+  // If true, set the FD_CLOEXEC on open fd.
+  bool set_fd_cloexec = true;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, in the background. Issue one request for every bytes_per_sync
+  // written. 0 turns it off.
+  // Default: 0
+  uint64_t bytes_per_sync = 0;
+
+  // When true, guarantees the file has at most `bytes_per_sync` bytes submitted
+  // for writeback at any given time.
+  //
+  //  - If `sync_file_range` is supported it achieves this by waiting for any
+  //    prior `sync_file_range`s to finish before proceeding. In this way,
+  //    processing (compression, etc.) can proceed uninhibited in the gap
+  //    between `sync_file_range`s, and we block only when I/O falls behind.
+  //  - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
+  //    always blocks, thus preventing the interleaving of I/O and processing.
+  //
+  // Note: Enabling this option does not provide any additional persistence
+  // guarantees, as it may use `sync_file_range`, which does not write out
+  // metadata.
+  //
+  // Default: false
+  bool strict_bytes_per_sync = false;
+
+  // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
+  // means that file size won't change as part of preallocation.
+  // If false, preallocation will also change the file size. This option will
+  // improve the performance in workloads where you sync the data on every
+  // write.
+  // By default, we set it to true for MANIFEST writes and false for
+  // WAL writes
+  bool fallocate_with_keep_size = true;
+
+  // See DBOptions doc
+  size_t compaction_readahead_size = 0;
+
+  // See DBOptions doc
+  size_t random_access_max_buffer_size = 0;
+
+  // See DBOptions doc
+  size_t writable_file_max_buffer_size = 1024 * 1024;
+
+  // If not nullptr, write rate limiting is enabled for flush and compaction
+  RateLimiter* rate_limiter = nullptr;
+};
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Env : public Customizable {
+ public:
+  static const char* kDefaultName() { return "DefaultEnv"; }
+  struct FileAttributes {
+    // File name
+    std::string name;
+
+    // Size of file in bytes
+    uint64_t size_bytes;
+  };
+
+  Env();
+  // Construct an Env with a separate FileSystem and/or SystemClock
+  // implementation
+  explicit Env(const std::shared_ptr<FileSystem>& fs);
+  Env(const std::shared_ptr<FileSystem>& fs,
+      const std::shared_ptr<SystemClock>& clock);
+  // No copying allowed
+  Env(const Env&) = delete;
+  void operator=(const Env&) = delete;
+
+  ~Env() override;
+
+  static const char* Type() { return "Environment"; }
+
+  // Deprecated. Will be removed in a major release. Derived classes
+  // should implement this method.
+  const char* Name() const override { return ""; }
+
+  // Loads the environment specified by the input value into the result
+  // @see Customizable for a more detailed description of the parameters and
+  // return codes
+  //
+  // @param config_options Controls how the environment is loaded.
+  // @param value the name and associated properties for the environment.
+  // @param result On success, the environment that was loaded.
+  // @param guard If specified and the loaded environment is not static,
+  //      this value will contain the loaded environment (guard.get() ==
+  //      result).
+  // @return OK If the environment was successfully loaded (and optionally
+  //      prepared)
+  // @return not-OK if the load failed.
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value, Env** result);
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value, Env** result,
+                                 std::shared_ptr<Env>* guard);
+
+  // Loads the environment specified by the env and fs uri.
+  // If both are specified, an error is returned.
+  // Otherwise, the environment is created by loading (via CreateFromString)
+  // the appropriate env/fs from the corresponding values.
+  static Status CreateFromUri(const ConfigOptions& options,
+                              const std::string& env_uri,
+                              const std::string& fs_uri, Env** result,
+                              std::shared_ptr<Env>* guard);
+
+  // Return a default environment suitable for the current operating
+  // system. Sophisticated users may wish to provide their own Env
+  // implementation instead of relying on this default environment.
+  //
+  // The result of Default() belongs to rocksdb and must never be deleted.
+  static Env* Default();
+
+  // See FileSystem::RegisterDbPaths.
+  virtual Status RegisterDbPaths(const std::vector<std::string>& /*paths*/) {
+    return Status::OK();
+  }
+  // See FileSystem::UnregisterDbPaths.
+  virtual Status UnregisterDbPaths(const std::vector<std::string>& /*paths*/) {
+    return Status::OK();
+  }
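+
+  // EXAMPLE (illustrative sketch; "custom-env://..." is a placeholder URI):
+  // loading an Env by name while keeping ownership in a guard:
+  //
+  //   Env* env = Env::Default();
+  //   std::shared_ptr<Env> env_guard;
+  //   Status s = Env::CreateFromString(ConfigOptions(), "custom-env://...",
+  //                                    &env, &env_guard);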
+
+  // Create a brand new sequentially-readable file with the specified name.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // On failure stores nullptr in *result and returns non-OK. If the file does
+  // not exist, returns a non-OK status.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   std::unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options) = 0;
+
+  // Create a brand new random access read-only file with the
+  // specified name. On success, stores a pointer to the new file in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK. If the file does not exist, returns a non-OK
+  // status.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     std::unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options) = 0;
+  // These values match Linux definition
+  // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
+  enum WriteLifeTimeHint {
+    WLTH_NOT_SET = 0,  // No hint information set
+    WLTH_NONE,         // No hints about write life time
+    WLTH_SHORT,        // Data written has a short life time
+    WLTH_MEDIUM,       // Data written has a medium life time
+    WLTH_LONG,         // Data written has a long life time
+    WLTH_EXTREME,      // Data written has an extremely long life time
+  };
+
+  // Create an object that writes to a new file with the specified
+  // name. Deletes any existing file with the same name and creates a
+  // new file. On success, stores a pointer to the new file in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 std::unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Create an object that writes to a file with the specified name.
+  // `WritableFile::Append()`s will append after any existing content. If the
+  // file does not already exist, creates it.
+  //
+  // On success, stores a pointer to the file in *result and returns OK. On
+  // failure stores nullptr in *result and returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status ReopenWritableFile(const std::string& /*fname*/,
+                                    std::unique_ptr<WritableFile>* /*result*/,
+                                    const EnvOptions& /*options*/) {
+    return Status::NotSupported("Env::ReopenWritableFile() not supported.");
+  }
+
+  // Reuse an existing file by renaming it and opening it as writable.
+  virtual Status ReuseWritableFile(const std::string& fname,
+                                   const std::string& old_fname,
+                                   std::unique_ptr<WritableFile>* result,
+                                   const EnvOptions& options);
+
+  // Open `fname` for random read and write; if the file doesn't exist it
+  // will be created. On success, stores a pointer to the new file in
+  // *result and returns OK. On failure returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewRandomRWFile(const std::string& /*fname*/,
+                                 std::unique_ptr<RandomRWFile>* /*result*/,
+                                 const EnvOptions& /*options*/) {
+    return Status::NotSupported("RandomRWFile is not implemented in this Env");
+  }
+
+  // Opens `fname` as a memory-mapped file for read and write (in-place updates
+  // only, i.e., no appends). On success, stores a raw buffer covering the
+  // whole file in `*result`. The file must exist prior to this call.
+  virtual Status NewMemoryMappedFileBuffer(
+      const std::string& /*fname*/,
+      std::unique_ptr<MemoryMappedFileBuffer>* /*result*/) {
+    return Status::NotSupported(
+        "MemoryMappedFileBuffer is not implemented in this Env");
+  }
+
+  // Create an object that represents a directory. Will fail if the directory
+  // doesn't exist. If the directory exists, it will open the directory
+  // and create a new Directory object.
+  //
+  // On success, stores a pointer to the new Directory in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK.
+  virtual Status NewDirectory(const std::string& name,
+                              std::unique_ptr<Directory>* result) = 0;
+
+  // Returns OK if the named file exists.
+  //         NotFound if the named file does not exist,
+  //                  the calling process does not have permission to determine
+  //                  whether this file exists, or if the path is invalid.
+  //         IOError if an IO Error was encountered
+  virtual Status FileExists(const std::string& fname) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir", and shall never include the
+  // names `.` or `..`.
+  // Original contents of *result are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not
+  //                  have permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) = 0;
+
+  // Store in *result the attributes of the children of the specified
+  // directory. In case the implementation lists the directory prior to
+  // iterating the files and files are concurrently deleted, the deleted files
+  // will be omitted from result.
+  // The name attributes are relative to "dir", and shall never include the
+  // names `.` or `..`.
+  // Original contents of *result are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not
+  //                  have permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  virtual Status GetChildrenFileAttributes(const std::string& dir,
+                                           std::vector<FileAttributes>* result);
+
+  // Delete the named file.
+  virtual Status DeleteFile(const std::string& fname) = 0;
+
+  // Truncate the named file to the specified size.
+  virtual Status Truncate(const std::string& /*fname*/, size_t /*size*/) {
+    return Status::NotSupported("Truncate is not supported for this Env");
+  }
+
+  // Create the specified directory. Returns an error if the directory exists.
+  virtual Status CreateDir(const std::string& dirname) = 0;
+
+  // Creates the directory if missing. Returns OK if it exists, or if it was
+  // successfully created.
+  virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
+
+  // Delete the specified directory.
+  // Many implementations of this function will only delete a directory if it
+  // is empty.
+  virtual Status DeleteDir(const std::string& dirname) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
+
+  // Store the last modification time of fname in *file_mtime.
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) = 0;
+  // Rename file src to target.
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) = 0;
+
+  // Hard link file src to target.
+  virtual Status LinkFile(const std::string& /*src*/,
+                          const std::string& /*target*/) {
+    return Status::NotSupported("LinkFile is not supported for this Env");
+  }
+
+  virtual Status NumFileLinks(const std::string& /*fname*/,
+                              uint64_t* /*count*/) {
+    return Status::NotSupported(
+        "Getting number of file links is not supported for this Env");
+  }
+
+  virtual Status AreFilesSame(const std::string& /*first*/,
+                              const std::string& /*second*/, bool* /*res*/) {
+    return Status::NotSupported("AreFilesSame is not supported for this Env");
+  }
+
+  // Lock the specified file. Used to prevent concurrent access to
+  // the same db by multiple processes. On failure, stores nullptr in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK. The caller should call
+  // UnlockFile(*lock) to release the lock. If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure. I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  virtual Status UnlockFile(FileLock* lock) = 0;
+
+  // Opens `lib_name` as a dynamic library.
+  // If 'search_path' is specified, breaks the path into its components
+  // based on the appropriate platform separator (";" or ":") and looks for the
+  // library in those directories. If 'search_path' is not specified, uses the
+  // default library path search mechanism (such as LD_LIBRARY_PATH). On
+  // success, stores a dynamic library in `*result`.
+  virtual Status LoadLibrary(const std::string& /*lib_name*/,
+                             const std::string& /*search_path */,
+                             std::shared_ptr<DynamicLibrary>* /*result*/) {
+    return Status::NotSupported("LoadLibrary is not implemented in this Env");
+  }
+
+  // Priority for scheduling job in thread pool
+  enum Priority { BOTTOM, LOW, HIGH, USER, TOTAL };
+
+  static std::string PriorityToString(Priority priority);
+
+  // Priority for requesting bytes in rate limiter scheduler
+  enum IOPriority {
+    IO_LOW = 0,
+    IO_MID = 1,
+    IO_HIGH = 2,
+    IO_USER = 3,
+    IO_TOTAL = 4
+  };
+
+  // EXPERIMENTAL
+  enum class IOActivity : uint8_t {
+    kFlush = 0,
+    kCompaction = 1,
+    kDBOpen = 2,
+    kUnknown,  // Keep last for easy array of non-unknowns
+  };
+
+  // Arrange to run "(*function)(arg)" once in a background thread, in
+  // the thread pool specified by pri. By default, jobs go to the 'LOW'
+  // priority thread pool.
+
+  // "function" may run in an unspecified thread. Multiple functions
+  // added to the same Env may run concurrently in different threads.
+  // I.e., the caller may not assume that background work items are
+  // serialized.
+  // When the UnSchedule function is called, the unschedFunction
+  // registered at the time of Schedule is invoked with arg as a parameter.
+  virtual void Schedule(void (*function)(void* arg), void* arg,
+                        Priority pri = LOW, void* tag = nullptr,
+                        void (*unschedFunction)(void* arg) = nullptr) = 0;
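+
+  // EXAMPLE (illustrative sketch; `DoWork` and `CleanupWork` are hypothetical
+  // callbacks and `state` is caller-owned and outlives the job): scheduling a
+  // tagged background job and later unscheduling it (see UnSchedule() below):
+  //
+  //   env->Schedule(&DoWork, state, Env::Priority::LOW, /*tag=*/state,
+  //                 &CleanupWork);
+  //   // ...
+  //   int removed = env->UnSchedule(/*arg=*/state, Env::Priority::LOW);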
+
+  // Arrange to remove jobs for the given arg from the queue_ if they are not
+  // already scheduled. The caller is expected to have an exclusive lock on
+  // arg.
+  virtual int UnSchedule(void* /*arg*/, Priority /*pri*/) { return 0; }
+
+  // Start a new thread, invoking "function(arg)" within the new thread.
+  // When "function(arg)" returns, the thread will be destroyed.
+  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+  // Start a new thread, invoking "function(args...)" within the new thread.
+  // When "function(args...)" returns, the thread will be destroyed.
+  template <typename FunctionT, typename... Args>
+  void StartThreadTyped(FunctionT function, Args&&... args) {
+    using FWType = FunctorWrapper<Args...>;
+    StartThread(
+        [](void* arg) {
+          auto* functor = static_cast<FWType*>(arg);
+          functor->invoke();
+          delete functor;
+        },
+        new FWType(std::function<void(Args...)>(function),
+                   std::forward<Args>(args)...));
+  }
+
+  // Wait for all threads started by StartThread to terminate.
+  virtual void WaitForJoin() {}
+
+  // Reserve available background threads in the specified thread pool.
+  virtual int ReserveThreads(int /*threads_to_be_reserved*/, Priority /*pri*/) {
+    return 0;
+  }
+
+  // Release a specific number of reserved threads from the specified thread
+  // pool
+  virtual int ReleaseThreads(int /*threads_to_be_released*/, Priority /*pri*/) {
+    return 0;
+  }
+
+  // Get thread pool queue length for specific thread pool.
+  virtual unsigned int GetThreadPoolQueueLen(Priority /*pri*/ = LOW) const {
+    return 0;
+  }
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or may not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual Status GetTestDirectory(std::string* path) = 0;
+
+  // Creates and returns a default logger (an instance of EnvLogger) for
+  // storing informational messages. Derived classes can override to provide a
+  // custom logger.
+  virtual Status NewLogger(const std::string& fname,
+                           std::shared_ptr<Logger>* result);
+
+  // Returns the number of micro-seconds since some fixed point in time.
+  // It is often used as system time such as in GenericRateLimiter
+  // and other places so a port needs to return system time in order to work.
+  virtual uint64_t NowMicros() = 0;
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros.
+  // In platform-specific implementations, NowNanos() should return time points
+  // that are MONOTONIC.
+  virtual uint64_t NowNanos() { return NowMicros() * 1000; }
+
+  // 0 indicates not supported.
+  virtual uint64_t NowCPUNanos() { return 0; }
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  virtual void SleepForMicroseconds(int micros) = 0;
+
+  // Get the current host name as a null terminated string iff the string
+  // length is < len. The hostname should otherwise be truncated to len.
+  virtual Status GetHostName(char* name, uint64_t len) = 0;
+
+  // Get the current hostname from the given env as a std::string in result.
+  // The result may be truncated if the hostname is too long.
+  virtual Status GetHostNameString(std::string* result);
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  // Only overwrites *unix_time on success.
+  virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+  // Get full directory name for this db.
+  virtual Status GetAbsolutePath(const std::string& db_path,
+                                 std::string* output_path) = 0;
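+
+  // EXAMPLE (illustrative sketch; assumes `env` is an Env*): resolving a
+  // relative DB path and reading the current wall-clock time:
+  //
+  //   std::string abs_path;
+  //   Status s = env->GetAbsolutePath("testdb", &abs_path);
+  //   int64_t now = 0;
+  //   if (s.ok()) s = env->GetCurrentTime(&now);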
+
+  // The number of background worker threads of a specific thread pool
+  // for this environment. 'LOW' is the default pool.
+  // default number: 1
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
+  virtual int GetBackgroundThreads(Priority pri = LOW) = 0;
+
+  virtual Status SetAllowNonOwnerAccess(bool /*allow_non_owner_access*/) {
+    return Status::NotSupported("Env::SetAllowNonOwnerAccess() not supported.");
+  }
+
+  // Enlarge number of background worker threads of a specific thread pool
+  // for this environment if it is smaller than specified. 'LOW' is the default
+  // pool.
+  virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) = 0;
+
+  // Lower IO priority for threads from the specified pool.
+  virtual void LowerThreadPoolIOPriority(Priority /*pool*/ = LOW) {}
+
+  // Lower CPU priority for threads from the specified pool.
+  virtual Status LowerThreadPoolCPUPriority(Priority /*pool*/,
+                                            CpuPriority /*pri*/) {
+    return Status::NotSupported(
+        "Env::LowerThreadPoolCPUPriority(Priority, CpuPriority) not supported");
+  }
+
+  // Lower CPU priority for threads from the specified pool.
+  virtual void LowerThreadPoolCPUPriority(Priority /*pool*/ = LOW) {}
+
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  virtual std::string TimeToString(uint64_t time) = 0;
+
+  // Generates a human-readable unique ID that can be used to identify a DB.
+  // In built-in implementations, this is an RFC-4122 UUID string, but might
+  // not be in all implementations. Overriding is not recommended.
+  // NOTE: this has not been validated for use in cryptography
+  virtual std::string GenerateUniqueId();
+
+  // OptimizeForLogRead will create a new EnvOptions object that is a copy of
+  // the EnvOptions in the parameters, but is optimized for reading log files.
+  virtual EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const;
+
+  // OptimizeForManifestRead will create a new EnvOptions object that is a copy
+  // of the EnvOptions in the parameters, but is optimized for reading manifest
+  // files.
+  virtual EnvOptions OptimizeForManifestRead(
+      const EnvOptions& env_options) const;
+
+  // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
+  // the EnvOptions in the parameters, but is optimized for writing log files.
+  // Default implementation returns the copy of the same object.
+  virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+                                         const DBOptions& db_options) const;
+  // OptimizeForManifestWrite will create a new EnvOptions object that is a
+  // copy of the EnvOptions in the parameters, but is optimized for writing
+  // manifest files. Default implementation returns the copy of the same
+  // object.
+  virtual EnvOptions OptimizeForManifestWrite(
+      const EnvOptions& env_options) const;
+
+  // OptimizeForCompactionTableWrite will create a new EnvOptions object that
+  // is a copy of the EnvOptions in the parameters, but is optimized for
+  // writing table files.
+  virtual EnvOptions OptimizeForCompactionTableWrite(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& immutable_ops) const;
+
+  // OptimizeForCompactionTableRead will create a new EnvOptions object that
+  // is a copy of the EnvOptions in the parameters, but is optimized for
+  // reading table files.
+  virtual EnvOptions OptimizeForCompactionTableRead(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& db_options) const;
+
+  // OptimizeForBlobFileRead will create a new EnvOptions object that
+  // is a copy of the EnvOptions in the parameters, but is optimized for
+  // reading blob files.
+  virtual EnvOptions OptimizeForBlobFileRead(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& db_options) const;
+
+  // Returns the status of all threads that belong to the current Env.
+  virtual Status GetThreadList(std::vector<ThreadStatus>* /*thread_list*/) {
+    return Status::NotSupported("Env::GetThreadList() not supported.");
+  }
+
+  // Returns the pointer to ThreadStatusUpdater. This function will be
+  // used in RocksDB internally to update thread status and supports
+  // GetThreadList().
+  virtual ThreadStatusUpdater* GetThreadStatusUpdater() const {
+    return thread_status_updater_;
+  }
+
+  // Returns the ID of the current thread.
+  virtual uint64_t GetThreadID() const;
+
+// This seems to clash with a macro on Windows, so #undef it here
+#undef GetFreeSpace
+
+  // Get the amount of free disk space
+  virtual Status GetFreeSpace(const std::string& /*path*/,
+                              uint64_t* /*diskfree*/) {
+    return Status::NotSupported("Env::GetFreeSpace() not supported.");
+  }
+
+  // Check whether the specified path is a directory
+  virtual Status IsDirectory(const std::string& /*path*/, bool* /*is_dir*/) {
+    return Status::NotSupported("Env::IsDirectory() not supported.");
+  }
+
+  virtual void SanitizeEnvOptions(EnvOptions* /*env_opts*/) const {}
+
+  // Get the FileSystem implementation this Env was constructed with. It
+  // could be a fully implemented one, or a wrapper class around the Env
+  const std::shared_ptr<FileSystem>& GetFileSystem() const;
+
+  // Get the SystemClock implementation this Env was constructed with. It
+  // could be a fully implemented one, or a wrapper class around the Env
+  const std::shared_ptr<SystemClock>& GetSystemClock() const;
+
+  // If you're adding methods here, remember to add them to EnvWrapper too.
+
+ protected:
+  // The pointer to an internal structure that will update the
+  // status of each thread.
+  ThreadStatusUpdater* thread_status_updater_;
+
+  // Pointer to the underlying FileSystem implementation
+  std::shared_ptr<FileSystem> file_system_;
+
+  // Pointer to the underlying SystemClock implementation
+  std::shared_ptr<SystemClock> system_clock_;
+
+ private:
+  static const size_t kMaxHostNameLen = 256;
+};
+
+// The factory function to construct a ThreadStatusUpdater. Any Env
+// that supports the GetThreadList() feature should call this function in its
+// constructor to initialize thread_status_updater_.
+ThreadStatusUpdater* CreateThreadStatusUpdater();
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+  SequentialFile() {}
+  virtual ~SequentialFile();
+
+  // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+  // written by this routine. Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower than reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Skip(uint64_t n) = 0;
+
+  // Indicates to the upper layers whether the current SequentialFile
+  // implementation uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return Status::NotSupported(
+        "SequentialFile::InvalidateCache not supported.");
+  }
+
+  // Positioned Read for direct I/O
+  // If Direct I/O enabled, offset, n, and scratch should be properly aligned
+  virtual Status PositionedRead(uint64_t /*offset*/, size_t /*n*/,
+                                Slice* /*result*/, char* /*scratch*/) {
+    return Status::NotSupported(
+        "SequentialFile::PositionedRead() not supported.");
+  }
+
+  // If you're adding methods here, remember to add them to
+  // SequentialFileWrapper too.
+};
+
+// A read IO request structure for use in MultiRead
+struct ReadRequest {
+  // File offset in bytes
+  uint64_t offset;
+
+  // Length to read in bytes. `result` only returns fewer bytes if end of file
+  // is hit (or `status` is not OK).
+  size_t len;
+
+  // A buffer that MultiRead() can optionally place data in. It can
+  // ignore this and allocate its own buffer
+  char* scratch;
+
+  // Output parameter set by MultiRead() to point to the data buffer, and
+  // the number of valid bytes
+  Slice result;
+
+  // Status of read
+  Status status;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+  RandomAccessFile() {}
+  virtual ~RandomAccessFile();
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read). May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used. If an error was encountered, returns a non-OK
+  // status.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // Safe for concurrent use by multiple threads.
+  // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+
+  // Readahead the file starting from offset by n bytes for caching.
+  virtual Status Prefetch(uint64_t /*offset*/, size_t /*n*/) {
+    return Status::OK();
+  }
+
+  // Read a bunch of blocks as described by reqs. The blocks can
+  // optionally be read in parallel. This is a synchronous call, i.e. it
+  // should return after all reads have completed. The reads will be
+  // non-overlapping. If the function's return Status is not ok, the status
+  // of individual requests will be ignored and the return status will be
+  // assumed for all read requests.
+  // The function's return status is only meant for any errors that occur
+  // before even processing specific read requests.
+  virtual Status MultiRead(ReadRequest* reqs, size_t num_reqs) {
+    assert(reqs != nullptr);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      ReadRequest& req = reqs[i];
+      req.status = Read(req.offset, req.len, &req.result, req.scratch);
+    }
+    return Status::OK();
+  }
+
+  // Tries to get a unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created this function returns the length of the ID and places it
+  // in "id"; otherwise, this function returns 0, in which case "id"
+  // may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to each other by adding arbitrary bytes to one of
+  // them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;  // Default implementation to prevent issues with backwards
+               // compatibility.
+  }
+
+  enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+
+  virtual void Hint(AccessPattern /*pattern*/) {}
+
+  // Indicates to the upper layers whether the current RandomAccessFile
+  // implementation uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return Status::NotSupported(
+        "RandomAccessFile::InvalidateCache not supported.");
+  }
+
+  // If you're adding methods here, remember to add them to
+  // RandomAccessFileWrapper too.
+};
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+  WritableFile()
+      : last_preallocated_block_(0),
+        preallocation_block_size_(0),
+        io_priority_(Env::IO_TOTAL),
+        write_hint_(Env::WLTH_NOT_SET),
+        strict_bytes_per_sync_(false) {}
+
+  explicit WritableFile(const EnvOptions& options)
+      : last_preallocated_block_(0),
+        preallocation_block_size_(0),
+        io_priority_(Env::IO_TOTAL),
+        write_hint_(Env::WLTH_NOT_SET),
+        strict_bytes_per_sync_(options.strict_bytes_per_sync) {}
+  // No copying allowed
+  WritableFile(const WritableFile&) = delete;
+  void operator=(const WritableFile&) = delete;
+
+  virtual ~WritableFile();
+
+  // Append data to the end of the file
+  // Note: A WritableFile object must support either Append or
+  // PositionedAppend, so the users cannot mix the two.
+  virtual Status Append(const Slice& data) = 0;
+
+  // Append data with verification information.
+  // Note that this API change is experimental and it might be changed in
+  // the future. Currently, RocksDB only generates crc32c based checksum for
+  // the file writes when the checksum handoff option is set.
+  // Expected behavior: if currently ChecksumType::kCRC32C is not supported by
+  // WritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. does not perform checksum verification).
+  virtual Status Append(const Slice& data,
+                        const DataVerificationInfo& /* verification_info */) {
+    return Append(data);
+  }
+
+  // PositionedAppend data to the specified offset. The new EOF after append
+  // must be larger than the previous EOF. This is to be used when writes are
+  // not backed by OS buffers and hence have to always start from the start of
+  // the sector. The implementation thus needs to also rewrite the last
+  // partial sector.
+  // Note: PositionedAppend does not guarantee moving the file offset after the
+  // write. A WritableFile object must support either Append or
+  // PositionedAppend, so the users cannot mix the two.
+  //
+  // PositionedAppend() can only happen on the page/sector boundaries. For that
+  // reason, if the last write was an incomplete sector we still need to rewind
+  // back to the nearest sector/page and rewrite the portion of it with
+  // whatever we need to add. We need to keep track of where we stopped
+  // writing.
+  //
+  // PositionedAppend() can only write whole sectors. For that reason we have
+  // to pad with zeros for the last write and trim the file when closing
+  // according to the position we kept in the previous step.
+  //
+  // PositionedAppend() requires an aligned buffer to be passed in. The
+  // alignment required is queried via GetRequiredBufferAlignment()
+  virtual Status PositionedAppend(const Slice& /* data */,
+                                  uint64_t /* offset */) {
+    return Status::NotSupported(
+        "WritableFile::PositionedAppend() not supported.");
+  }
+
+  // PositionedAppend data with verification information.
+  // Note that this API change is experimental and it might be changed in
+  // the future. Currently, RocksDB only generates crc32c based checksum for
+  // the file writes when the checksum handoff option is set.
+  // Expected behavior: if currently ChecksumType::kCRC32C is not supported by
+  // WritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. does not perform checksum verification).
+  virtual Status PositionedAppend(
+      const Slice& /* data */, uint64_t /* offset */,
+      const DataVerificationInfo& /* verification_info */) {
+    return Status::NotSupported("PositionedAppend");
+  }
+
+  // Truncate is necessary to trim the file to the correct size
+  // before closing. It is not always possible to keep track of the file
+  // size due to whole-page writes. The behavior is undefined if called
+  // with other writes to follow.
+  virtual Status Truncate(uint64_t /*size*/) { return Status::OK(); }
+  virtual Status Close() = 0;
+  virtual Status Flush() = 0;
+  virtual Status Sync() = 0;  // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() { return Sync(); }
+
+  // true if Sync() and Fsync() are safe to call concurrently with Append()
+  // and Flush().
+  virtual bool IsSyncThreadSafe() const { return false; }
+
+  // Indicates to the upper layers whether the current WritableFile
+  // implementation uses direct IO.
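+  //
+  // Alignment sketch (illustrative, not upstream documentation): when
+  // use_direct_io() returns true, callers are expected to size and align
+  // their buffers to GetRequiredBufferAlignment(), e.g.:
+  //
+  //   const size_t align = file->GetRequiredBufferAlignment();
+  //   const size_t len = ((n + align - 1) / align) * align;  // round up
+  //   char* buf = static_cast<char*>(std::aligned_alloc(align, len));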
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  /*
+   * If rate limiting is enabled, change the file-granularity priority used in
+   * rate-limiting writes.
+   *
+   * In the presence of finer-granularity priority such as
+   * `WriteOptions::rate_limiter_priority`, this file-granularity priority may
+   * be overridden by a non-Env::IO_TOTAL finer-granularity priority and used
+   * as a fallback for Env::IO_TOTAL finer-granularity priority.
+   *
+   * If rate limiting is not enabled, this call has no effect.
+   */
+  virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; }
+
+  virtual Env::IOPriority GetIOPriority() { return io_priority_; }
+
+  virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
+    write_hint_ = hint;
+  }
+
+  virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; }
+  /*
+   * Get the size of valid data in the file.
+   */
+  virtual uint64_t GetFileSize() { return 0; }
+
+  /*
+   * Get and set the default pre-allocation block size for writes to
+   * this file. If non-zero, then Allocate will be used to extend the
+   * underlying storage of a file (generally via fallocate) if the Env
+   * instance supports it.
+   */
+  virtual void SetPreallocationBlockSize(size_t size) {
+    preallocation_block_size_ = size;
+  }
+
+  virtual void GetPreallocationStatus(size_t* block_size,
+                                      size_t* last_allocated_block) {
+    *last_allocated_block = last_preallocated_block_;
+    *block_size = preallocation_block_size_;
+  }
+
+  // For documentation, refer to RandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;  // Default implementation to prevent issues with backwards
+               // compatibility.
+  }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return Status::NotSupported("WritableFile::InvalidateCache not supported.");
+  }
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  virtual Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) {
+    if (strict_bytes_per_sync_) {
+      return Sync();
+    }
+    return Status::OK();
+  }
+
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs. This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  virtual void PrepareWrite(size_t offset, size_t len) {
+    if (preallocation_block_size_ == 0) {
+      return;
+    }
+    // If this write would cross one or more preallocation blocks,
+    // determine what the last preallocation block necessary to
+    // cover this write would be and Allocate to that point.
+    const auto block_size = preallocation_block_size_;
+    size_t new_last_preallocated_block =
+        (offset + len + block_size - 1) / block_size;
+    if (new_last_preallocated_block > last_preallocated_block_) {
+      size_t num_spanned_blocks =
+          new_last_preallocated_block - last_preallocated_block_;
+      // TODO: Don't ignore errors from allocate
+      Allocate(block_size * last_preallocated_block_,
+               block_size * num_spanned_blocks)
+          .PermitUncheckedError();
+      last_preallocated_block_ = new_last_preallocated_block;
+    }
+  }
+
+  // Pre-allocates space for a file.
+  virtual Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) {
+    return Status::OK();
+  }
+
+  // If you're adding methods here, remember to add them to
+  // WritableFileWrapper too.
+
+ protected:
+  size_t preallocation_block_size() { return preallocation_block_size_; }
+
+ private:
+  size_t last_preallocated_block_;
+  size_t preallocation_block_size_;
+
+ protected:
+  Env::IOPriority io_priority_;
+  Env::WriteLifeTimeHint write_hint_;
+  const bool strict_bytes_per_sync_;
+};
+
+// A file abstraction for random reading and writing.
+class RandomRWFile {
+ public:
+  RandomRWFile() {}
+  // No copying allowed
+  RandomRWFile(const RandomRWFile&) = delete;
+  RandomRWFile& operator=(const RandomRWFile&) = delete;
+
+  virtual ~RandomRWFile() {}
+
+  // Indicates if the class makes use of direct I/O
+  // If false you must pass aligned buffer to Write()
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Write bytes in `data` at offset `offset`. Returns Status::OK() on success.
+  // Pass an aligned buffer when use_direct_io() returns true.
+  virtual Status Write(uint64_t offset, const Slice& data) = 0;
+
+  // Read up to `n` bytes starting from offset `offset` and store them in
+  // result; the provided `scratch` size should be at least `n`.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // Returns Status::OK() on success.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+
+  virtual Status Flush() = 0;
+
+  virtual Status Sync() = 0;
+
+  virtual Status Fsync() { return Sync(); }
+
+  virtual Status Close() = 0;
+
+  // If you're adding methods here, remember to add them to
+  // RandomRWFileWrapper too.
+};
+
+// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer.
+// Subclasses should release the mapping upon destruction.
+class MemoryMappedFileBuffer {
+ public:
+  MemoryMappedFileBuffer(void* _base, size_t _length)
+      : base_(_base), length_(_length) {}
+
+  virtual ~MemoryMappedFileBuffer() = 0;
+
+  // We do not want to unmap this twice. We could make this class movable if
+  // desired; for now, copying and assignment are disallowed.
+  MemoryMappedFileBuffer(const MemoryMappedFileBuffer&) = delete;
+  MemoryMappedFileBuffer& operator=(const MemoryMappedFileBuffer&) = delete;
+
+  void* GetBase() const { return base_; }
+  size_t GetLen() const { return length_; }
+
+ protected:
+  void* base_;
+  const size_t length_;
+};
+
+// Directory object represents collection of files and implements
+// filesystem operations that can be executed on directories.
+class Directory {
+ public:
+  virtual ~Directory() {}
+  // Fsync directory. Can be called concurrently from multiple threads.
+  virtual Status Fsync() = 0;
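+  //
+  // Typical durability pattern (an illustrative sketch, not upstream
+  // documentation): after creating or renaming a file, Fsync() its parent
+  // directory so that the directory entry itself is persisted:
+  //
+  //   std::unique_ptr<Directory> dir;
+  //   Status s = env->NewDirectory("/path/to/db", &dir);
+  //   if (s.ok()) s = dir->Fsync();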
+  // Close directory.
+  virtual Status Close() { return Status::NotSupported("Close"); }
+
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;
+  }
+
+  // If you're adding methods here, remember to add them to
+  // DirectoryWrapper too.
+};
+
+enum InfoLogLevel : unsigned char {
+  DEBUG_LEVEL = 0,
+  INFO_LEVEL,
+  WARN_LEVEL,
+  ERROR_LEVEL,
+  FATAL_LEVEL,
+  HEADER_LEVEL,
+  NUM_INFO_LOG_LEVELS,
+};
+
+// An interface for writing log messages.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Logger {
+ public:
+  size_t kDoNotSupportGetLogFileSize = (std::numeric_limits<size_t>::max)();
+
+  explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
+      : closed_(false), log_level_(log_level) {}
+  // No copying allowed
+  Logger(const Logger&) = delete;
+  void operator=(const Logger&) = delete;
+
+  virtual ~Logger();
+
+  // Close the log file. Must be called before destructor. If the return
+  // status is NotSupported(), it means the implementation does cleanup in
+  // the destructor
+  virtual Status Close();
+
+  // Write a header to the log file with the specified format
+  // It is recommended that you log all header information at the start of the
+  // application. But it is not enforced.
+  virtual void LogHeader(const char* format, va_list ap) {
+    // Default implementation does a simple INFO level log write.
+    // Please override as per the logger class requirement.
+    Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+  }
+
+  // Write an entry to the log file with the specified format.
+  //
+  // Users who override the `Logv()` overload taking `InfoLogLevel` do not need
+  // to implement this, unless they explicitly invoke it in
+  // `Logv(InfoLogLevel, ...)`.
+  virtual void Logv(const char* /* format */, va_list /* ap */) {
+    assert(false);
+  }
+
+  // Write an entry to the log file with the specified log level
+  // and format. Any log with level under the internal log level
+  // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
+  // printed.
+  virtual void Logv(const InfoLogLevel log_level, const char* format,
+                    va_list ap);
+
+  virtual size_t GetLogFileSize() const { return kDoNotSupportGetLogFileSize; }
+  // Flush to the OS buffers
+  virtual void Flush() {}
+  virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; }
+  virtual void SetInfoLogLevel(const InfoLogLevel log_level) {
+    log_level_ = log_level;
+  }
+
+  // If you're adding methods here, remember to add them to LoggerWrapper too.
+
+ protected:
+  virtual Status CloseImpl();
+  bool closed_;
+
+ private:
+  InfoLogLevel log_level_;
+};
+
+// Identifies a locked file. Except in custom Env/Filesystem implementations,
+// the lifetime of a FileLock object should be managed only by LockFile() and
+// UnlockFile().
+class FileLock {
+ public:
+  FileLock() {}
+  virtual ~FileLock();
+
+ private:
+  // No copying allowed
+  FileLock(const FileLock&) = delete;
+  void operator=(const FileLock&) = delete;
+};
+
+class DynamicLibrary {
+ public:
+  virtual ~DynamicLibrary() {}
+
+  // Returns the name of the dynamic library.
+  virtual const char* Name() const = 0;
+
+  // Loads the symbol for sym_name from the library and updates the input
+  // function. Returns the loaded symbol.
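+  //
+  // Illustrative usage sketch (the library and symbol names below are
+  // hypothetical):
+  //
+  //   std::shared_ptr<DynamicLibrary> lib;
+  //   Status s = env->LoadLibrary("my_plugin", "" /* default search path */,
+  //                               &lib);
+  //   std::function<int(int)> fn;
+  //   if (s.ok()) s = lib->LoadFunction("my_symbol", &fn);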
+  template <typename T>
+  Status LoadFunction(const std::string& sym_name, std::function<T>* function) {
+    assert(nullptr != function);
+    void* ptr = nullptr;
+    Status s = LoadSymbol(sym_name, &ptr);
+    *function = reinterpret_cast<T*>(ptr);
+    return s;
+  }
+  // Loads and returns the symbol for sym_name from the library.
+  virtual Status LoadSymbol(const std::string& sym_name, void** func) = 0;
+};
+
+extern void LogFlush(const std::shared_ptr<Logger>& info_log);
+
+extern void Log(const InfoLogLevel log_level,
+                const std::shared_ptr<Logger>& info_log, const char* format,
+                ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4);
+
+// a set of log functions with different log levels.
+extern void Header(const std::shared_ptr<Logger>& info_log, const char* format,
+                   ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Debug(const std::shared_ptr<Logger>& info_log, const char* format,
+                  ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Info(const std::shared_ptr<Logger>& info_log, const char* format,
+                 ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Warn(const std::shared_ptr<Logger>& info_log, const char* format,
+                 ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Error(const std::shared_ptr<Logger>& info_log, const char* format,
+                  ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Fatal(const std::shared_ptr<Logger>& info_log, const char* format,
+                  ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// Log the specified data to *info_log if info_log is non-nullptr.
+// The default info log level is InfoLogLevel::INFO_LEVEL.
+extern void Log(const std::shared_ptr<Logger>& info_log, const char* format,
+                ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+extern void LogFlush(Logger* info_log);
+
+extern void Log(const InfoLogLevel log_level, Logger* info_log,
+                const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4);
+
+// The default info log level is InfoLogLevel::INFO_LEVEL.
+extern void Log(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// a set of log functions with different log levels.
+extern void Header(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Debug(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Info(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Warn(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Error(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Fatal(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// A utility routine: write "data" to the named file.
+extern Status WriteStringToFile(Env* env, const Slice& data,
+                                const std::string& fname,
+                                bool should_sync = false);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+                               std::string* data);
+
+// Below are helpers for wrapping most of the classes in this file.
+// They forward all calls to another instance of the class.
+// Useful when wrapping the default implementations.
+// Typical usage is to inherit your wrapper from *Wrapper, e.g.:
+//
+// class MySequentialFileWrapper : public
+// ROCKSDB_NAMESPACE::SequentialFileWrapper {
+//  public:
+//   MySequentialFileWrapper(ROCKSDB_NAMESPACE::SequentialFile* target):
+//     ROCKSDB_NAMESPACE::SequentialFileWrapper(target) {}
+//   Status Read(size_t n, Slice* result, char* scratch) override {
+//     cout << "Doing a read of size " << n << "!" << endl;
+//     return ROCKSDB_NAMESPACE::SequentialFileWrapper::Read(n, result,
+//                                                           scratch);
+//   }
+//   // All other methods are forwarded to target_ automatically.
+// };
+//
+// This is often more convenient than inheriting the class directly because
+// (a) Don't have to override and forward all methods - the Wrapper will
+//     forward everything you're not explicitly overriding.
+// (b) Don't need to update the wrapper when more methods are added to the
+//     rocksdb class. Unless you actually want to override the behavior.
+//     (And unless rocksdb people forgot to update the *Wrapper class.)
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+  // The Target struct allows an Env to be stored as a raw (Env*) or
+  // std::shared_ptr. By using this struct, the wrapping/calling
+  // class does not need to worry about the ownership/lifetime of the
+  // wrapped target env. If the guard is set, then the Env will point
+  // to the guard.get().
+  struct Target {
+    Env* env;                    // The raw Env
+    std::shared_ptr<Env> guard;  // The guarded Env
+
+    // Creates a Target without assuming ownership of the target Env
+    explicit Target(Env* t) : env(t) {}
+
+    // Creates a Target from the guarded env, assuming ownership
+    explicit Target(std::unique_ptr<Env>&& t) : guard(t.release()) {
+      env = guard.get();
+    }
+
+    // Creates a Target from the guarded env, assuming ownership
+    explicit Target(const std::shared_ptr<Env>& t) : guard(t) {
+      env = guard.get();
+    }
+
+    // Makes sure the raw Env is not nullptr
+    void Prepare() {
+      if (guard.get() != nullptr) {
+        env = guard.get();
+      } else if (env == nullptr) {
+        env = Env::Default();
+      }
+    }
+  };
+
+  // Initialize an EnvWrapper that delegates all calls to *t
+  explicit EnvWrapper(Env* t);
+  explicit EnvWrapper(std::unique_ptr<Env>&& t);
+  explicit EnvWrapper(const std::shared_ptr<Env>& t);
+  ~EnvWrapper() override;
+
+  // Return the target to which this Env forwards all calls
+  Env* target() const { return target_.env; }
+
+  // Deprecated. Will be removed in a major release. Derived classes
+  // should implement this method.
+  const char* Name() const override { return target_.env->Name(); }
+
+  // The following text is boilerplate that forwards all methods to target()
+  Status RegisterDbPaths(const std::vector<std::string>& paths) override {
+    return target_.env->RegisterDbPaths(paths);
+  }
+
+  Status UnregisterDbPaths(const std::vector<std::string>& paths) override {
+    return target_.env->UnregisterDbPaths(paths);
+  }
+
+  Status NewSequentialFile(const std::string& f,
+                           std::unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) override {
+    return target_.env->NewSequentialFile(f, r, options);
+  }
+  Status NewRandomAccessFile(const std::string& f,
+                             std::unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& options) override {
+    return target_.env->NewRandomAccessFile(f, r, options);
+  }
+  Status NewWritableFile(const std::string& f,
+                         std::unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) override {
+    return target_.env->NewWritableFile(f, r, options);
+  }
+  Status ReopenWritableFile(const std::string& fname,
+                            std::unique_ptr<WritableFile>* result,
+                            const EnvOptions& options) override {
+    return target_.env->ReopenWritableFile(fname, result, options);
+  }
+  Status ReuseWritableFile(const std::string& fname,
+                           const std::string& old_fname,
+                           std::unique_ptr<WritableFile>* r,
+                           const EnvOptions& options) override {
+    return target_.env->ReuseWritableFile(fname, old_fname, r, options);
+  }
+  Status NewRandomRWFile(const std::string& fname,
+                         std::unique_ptr<RandomRWFile>* result,
+                         const EnvOptions& options) override {
+    return target_.env->NewRandomRWFile(fname, result, options);
+  }
+  Status NewMemoryMappedFileBuffer(
+      const std::string& fname,
+      std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+    return target_.env->NewMemoryMappedFileBuffer(fname, result);
+  }
+  Status NewDirectory(const std::string& name,
+                      std::unique_ptr<Directory>* result) override {
+    return target_.env->NewDirectory(name, result);
+  }
+  Status FileExists(const std::string& f) override {
+    return target_.env->FileExists(f);
+  }
+  Status GetChildren(const std::string& dir,
+                     std::vector<std::string>* r) override {
+    return target_.env->GetChildren(dir, r);
+  }
+  Status GetChildrenFileAttributes(
+      const std::string& dir, std::vector<FileAttributes>* result) override {
+    return target_.env->GetChildrenFileAttributes(dir, result);
+  }
+  Status DeleteFile(const std::string& f) override {
+    return target_.env->DeleteFile(f);
+  }
+  Status Truncate(const std::string& fname, size_t size) override {
+    return target_.env->Truncate(fname, size);
+  }
+  Status CreateDir(const std::string& d) override {
+    return target_.env->CreateDir(d);
+  }
+  Status CreateDirIfMissing(const std::string& d) override {
+    return target_.env->CreateDirIfMissing(d);
+  }
+  Status DeleteDir(const std::string& d) override {
+    return target_.env->DeleteDir(d);
+  }
+  Status GetFileSize(const std::string& f, uint64_t* s) override {
+    return target_.env->GetFileSize(f, s);
+  }
+
+  Status GetFileModificationTime(const std::string& fname,
+                                 uint64_t* file_mtime) override {
+    return target_.env->GetFileModificationTime(fname, file_mtime);
+  }
+
+  Status RenameFile(const std::string& s, const std::string& t) override {
+    return target_.env->RenameFile(s, t);
+  }
+
+  Status LinkFile(const std::string& s, const std::string& t) override {
+    return target_.env->LinkFile(s, t);
+  }
+
+  Status NumFileLinks(const std::string& fname, uint64_t* count) override {
+    return target_.env->NumFileLinks(fname, count);
+  }
+
+  Status AreFilesSame(const std::string& first, const std::string& second,
+                      bool* res) override {
+    return target_.env->AreFilesSame(first, second, res);
+  }
+
+  Status LockFile(const std::string& f, FileLock** l) override {
+    return target_.env->LockFile(f, l);
+  }
+
+  Status UnlockFile(FileLock* l) override { return target_.env->UnlockFile(l); }
+
+  Status IsDirectory(const std::string& path, bool* is_dir) override {
+    return target_.env->IsDirectory(path, is_dir);
+  }
+
+  Status LoadLibrary(const std::string& lib_name,
+                     const std::string& search_path,
+                     std::shared_ptr<DynamicLibrary>* result) override {
+    return target_.env->LoadLibrary(lib_name, search_path, result);
+  }
+
+  void Schedule(void (*f)(void* arg), void* a, Priority pri,
+                void* tag = nullptr, void (*u)(void* arg) = nullptr) override {
+    return target_.env->Schedule(f, a, pri, tag, u);
+  }
+
+  int UnSchedule(void* tag, Priority pri) override {
+    return target_.env->UnSchedule(tag, pri);
+  }
+
+  void StartThread(void (*f)(void*), void* a) override {
+    return target_.env->StartThread(f, a);
+  }
+  void WaitForJoin() override { return target_.env->WaitForJoin(); }
+  unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override {
+    return target_.env->GetThreadPoolQueueLen(pri);
+  }
+
+  int ReserveThreads(int threads_to_be_reserved, Priority pri) override {
+    return target_.env->ReserveThreads(threads_to_be_reserved, pri);
+  }
+
+  int ReleaseThreads(int threads_to_be_released, Priority pri) override {
+    return target_.env->ReleaseThreads(threads_to_be_released, pri);
+  }
+
+  Status GetTestDirectory(std::string* path) override {
+    return target_.env->GetTestDirectory(path);
+  }
+  Status NewLogger(const std::string& fname,
+                   std::shared_ptr<Logger>* result) override {
+    return target_.env->NewLogger(fname, result);
+  }
+  uint64_t NowMicros() override { return target_.env->NowMicros(); }
+  uint64_t NowNanos() override { return target_.env->NowNanos(); }
+  uint64_t NowCPUNanos() override { return target_.env->NowCPUNanos(); }
+
+  void SleepForMicroseconds(int micros) override {
+    target_.env->SleepForMicroseconds(micros);
+  }
+  Status GetHostName(char* name, uint64_t len) override {
+    return target_.env->GetHostName(name, len);
+  }
+  Status GetCurrentTime(int64_t* unix_time) override {
+    return target_.env->GetCurrentTime(unix_time);
+  }
+  Status GetAbsolutePath(const std::string& db_path,
+                         std::string* output_path) override {
+    return target_.env->GetAbsolutePath(db_path, output_path);
+  }
+  void SetBackgroundThreads(int num, Priority pri) override {
+    return target_.env->SetBackgroundThreads(num, pri);
+  }
+  int GetBackgroundThreads(Priority pri) override {
+    return target_.env->GetBackgroundThreads(pri);
+  }
+
+  Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override {
+    return target_.env->SetAllowNonOwnerAccess(allow_non_owner_access);
+  }
+
+  void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
+    return target_.env->IncBackgroundThreadsIfNeeded(num, pri);
+  }
+
+  void LowerThreadPoolIOPriority(Priority pool) override {
+    target_.env->LowerThreadPoolIOPriority(pool);
+  }
+
+  void LowerThreadPoolCPUPriority(Priority pool) override {
+    target_.env->LowerThreadPoolCPUPriority(pool);
+  }
+
+  Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override {
+    return target_.env->LowerThreadPoolCPUPriority(pool, pri);
+  }
+
+  std::string TimeToString(uint64_t time) override {
+    return target_.env->TimeToString(time);
+  }
+
+  Status GetThreadList(std::vector<ThreadStatus>* thread_list) override {
+    return target_.env->GetThreadList(thread_list);
+  }
+
+  ThreadStatusUpdater* GetThreadStatusUpdater() const override {
+    return target_.env->GetThreadStatusUpdater();
+  }
+
+  uint64_t GetThreadID() const override { return target_.env->GetThreadID(); }
+
+  std::string GenerateUniqueId() override {
+    return target_.env->GenerateUniqueId();
+  }
+
+  EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override {
+    return target_.env->OptimizeForLogRead(env_options);
+  }
+  EnvOptions OptimizeForManifestRead(
+      const EnvOptions& env_options) const override {
+    return target_.env->OptimizeForManifestRead(env_options);
+  }
+  EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+                                 const DBOptions& db_options) const override {
+    return target_.env->OptimizeForLogWrite(env_options, db_options);
+  }
+  EnvOptions OptimizeForManifestWrite(
+      const EnvOptions& env_options) const override {
+    return target_.env->OptimizeForManifestWrite(env_options);
+  }
+  EnvOptions OptimizeForCompactionTableWrite(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& immutable_ops) const override {
+    return target_.env->OptimizeForCompactionTableWrite(env_options,
+                                                        immutable_ops);
+  }
+  EnvOptions OptimizeForCompactionTableRead(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& db_options) const override {
+    return target_.env->OptimizeForCompactionTableRead(env_options,
+                                                       db_options);
+  }
+  EnvOptions OptimizeForBlobFileRead(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& db_options) const override {
+    return target_.env->OptimizeForBlobFileRead(env_options, db_options);
+  }
+  Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override {
+    return target_.env->GetFreeSpace(path, diskfree);
+  }
+  void SanitizeEnvOptions(EnvOptions* env_opts) const override {
+    target_.env->SanitizeEnvOptions(env_opts);
+  }
+  Status PrepareOptions(const ConfigOptions& options) override;
+  std::string SerializeOptions(const ConfigOptions& config_options,
+                               const std::string& header) const override;
+
+ private:
+  Target target_;
+};
+
+class SequentialFileWrapper : public SequentialFile {
+ public:
+  explicit SequentialFileWrapper(SequentialFile* target) : target_(target) {}
+
+  Status Read(size_t n, Slice* result, char* scratch) override {
+    return target_->Read(n, result, scratch);
+  }
+  Status Skip(uint64_t n) override { return target_->Skip(n); }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  Status InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+  Status PositionedRead(uint64_t offset, size_t n, Slice* result,
+                        char* scratch) override {
+    return target_->PositionedRead(offset, n, result, scratch);
+  }
+
+ private:
+  SequentialFile* target_;
+};
+
+class RandomAccessFileWrapper : public RandomAccessFile {
+ public:
+  explicit RandomAccessFileWrapper(RandomAccessFile* target)
+      : target_(target) {}
+
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* scratch) const override {
+    return target_->Read(offset, n, result, scratch);
+  }
+  Status MultiRead(ReadRequest* reqs, size_t num_reqs) override {
+    return target_->MultiRead(reqs, num_reqs);
+  }
+  Status Prefetch(uint64_t offset, size_t n) override {
+    return target_->Prefetch(offset, n);
+  }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+  void Hint(AccessPattern pattern) override { target_->Hint(pattern); }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  Status InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+
+ private:
+  RandomAccessFile* target_;
+};
+
+class WritableFileWrapper : public WritableFile {
+ public:
+  explicit WritableFileWrapper(WritableFile* t) : target_(t) {}
+
+  Status Append(const Slice& data) override { return target_->Append(data); }
+  Status Append(const Slice& data,
+                const DataVerificationInfo& verification_info) override {
+    return target_->Append(data, verification_info);
+  }
+  Status PositionedAppend(const Slice& data, uint64_t offset) override {
+    return target_->PositionedAppend(data, offset);
+  }
+  Status PositionedAppend(
+      const Slice& data, uint64_t offset,
+      const DataVerificationInfo& verification_info) override {
+    return target_->PositionedAppend(data, offset, verification_info);
+  }
+  Status Truncate(uint64_t size) override { return target_->Truncate(size); }
+  Status Close() override { return target_->Close(); }
+  Status Flush() override { return target_->Flush(); }
+  Status Sync() override { return target_->Sync(); }
+  Status Fsync() override { return target_->Fsync(); }
+  bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); }
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+
+  void SetIOPriority(Env::IOPriority pri) override {
+    target_->SetIOPriority(pri);
+  }
+
+  Env::IOPriority GetIOPriority() override { return target_->GetIOPriority(); }
+
+  void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override {
+    target_->SetWriteLifeTimeHint(hint);
+  }
+
+  Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
+    return target_->GetWriteLifeTimeHint();
+  }
+
+  uint64_t GetFileSize() override { return target_->GetFileSize(); }
+
+  void SetPreallocationBlockSize(size_t size) override {
+    target_->SetPreallocationBlockSize(size);
+  }
+
+  void GetPreallocationStatus(size_t* block_size,
+                              size_t* last_allocated_block) override {
+    target_->GetPreallocationStatus(block_size, last_allocated_block);
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+  Status InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+
+  Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+    return target_->RangeSync(offset, nbytes);
+  }
+
+  void PrepareWrite(size_t offset, size_t len) override {
+    target_->PrepareWrite(offset, len);
+  }
+
+  Status Allocate(uint64_t offset, uint64_t len) override {
+    return target_->Allocate(offset, len);
+  }
+
+ private:
+  WritableFile* target_;
+};
+
+class RandomRWFileWrapper : public RandomRWFile {
+ public:
+  explicit RandomRWFileWrapper(RandomRWFile* target) : target_(target) {}
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  Status Write(uint64_t offset, const Slice& data) override {
+    return target_->Write(offset, data);
+  }
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* scratch) const override {
+    return target_->Read(offset, n, result, scratch);
+  }
+  Status Flush() override { return target_->Flush(); }
+  Status Sync() override { return target_->Sync(); }
+  Status Fsync() override { return target_->Fsync(); }
+  Status Close() override { return target_->Close(); }
+
+ private:
+  RandomRWFile* target_;
+};
+
+class DirectoryWrapper : public Directory {
+ public:
+  explicit DirectoryWrapper(Directory* target) : target_(target) {}
+
+  Status Fsync() override { return target_->Fsync(); }
+  Status Close() override { return target_->Close(); }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+ private:
+  Directory* target_;
+};
+
+class LoggerWrapper : public Logger {
+ public:
+  explicit LoggerWrapper(Logger* target) : target_(target) {}
+
+  Status Close() override { return target_->Close(); }
+  void LogHeader(const char* format, va_list ap) override {
+    return target_->LogHeader(format, ap);
+  }
+  void Logv(const char* format, va_list ap) override {
+    return target_->Logv(format, ap);
+  }
+  void Logv(const InfoLogLevel log_level, const char* format,
+            va_list ap) override {
+    return target_->Logv(log_level, format, ap);
+  }
+  size_t GetLogFileSize() const override { return target_->GetLogFileSize(); }
+  void Flush() override { return target_->Flush(); }
+  InfoLogLevel GetInfoLogLevel() const override {
+    return target_->GetInfoLogLevel();
+  }
+  void SetInfoLogLevel(const InfoLogLevel log_level) override {
+    return target_->SetInfoLogLevel(log_level);
+  }
+
+ private:
+  Logger* target_;
+};
+
+// Returns a new environment that stores its data in memory and delegates
+// all non-file-storage tasks to base_env. The caller must delete the result
+// when it is no longer needed.
+// *base_env must remain live while the result is in use.
+Env* NewMemEnv(Env* base_env);
+
+// Returns a new environment that measures function call times for filesystem
+// operations, reporting results to variables in PerfContext.
+// This is a factory method for TimedEnv defined in utilities/env_timed.cc.
+Env* NewTimedEnv(Env* base_env);
+
+// Returns an instance of logger that can be used for storing informational
+// messages.
+// This is a factory method for EnvLogger declared in logging/env_logging.h
+Status NewEnvLogger(const std::string& fname, Env* env,
+                    std::shared_ptr<Logger>* result);
+
+// Creates a new Env based on Env::Default() but modified to use the specified
+// FileSystem.
+std::unique_ptr<Env> NewCompositeEnv(const std::shared_ptr<FileSystem>& fs);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/env_encryption.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/env_encryption.h
new file mode 100644
index 0000000000..3ddb1a7558
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/env_encryption.h
@@ -0,0 +1,463 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EncryptionProvider;
+
+struct ConfigOptions;
+
+// Returns an Env that encrypts data when stored on disk and decrypts data when
+// read from disk.
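+//
+// Illustrative wiring sketch (not upstream documentation), using the
+// test-only ROT13 cipher declared below; substitute a production-grade
+// BlockCipher outside of tests:
+//
+//   std::shared_ptr<BlockCipher> cipher = BlockCipher::NewROT13Cipher(32);
+//   std::shared_ptr<EncryptionProvider> provider =
+//       EncryptionProvider::NewCTRProvider(cipher);
+//   Env* encrypted_env = NewEncryptedEnv(Env::Default(), provider);
+//   // Pass encrypted_env via DBOptions::env when opening the DB.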
+Env* NewEncryptedEnv(Env* base_env,
+                     const std::shared_ptr<EncryptionProvider>& provider);
+std::shared_ptr<FileSystem> NewEncryptedFS(
+    const std::shared_ptr<FileSystem>& base_fs,
+    const std::shared_ptr<EncryptionProvider>& provider);
+
+// BlockAccessCipherStream is the base class for any cipher stream that
+// supports random access at block level (without requiring data from other
+// blocks). E.g. CTR (Counter operation mode) supports this requirement.
+class BlockAccessCipherStream {
+ public:
+  virtual ~BlockAccessCipherStream(){};
+
+  // BlockSize returns the size of each block supported by this cipher stream.
+  virtual size_t BlockSize() = 0;
+
+  // Encrypt one or more (partial) blocks of data at the file offset.
+  // Length of data is given in dataSize.
+  virtual Status Encrypt(uint64_t fileOffset, char* data, size_t dataSize);
+
+  // Decrypt one or more (partial) blocks of data at the file offset.
+  // Length of data is given in dataSize.
+  virtual Status Decrypt(uint64_t fileOffset, char* data, size_t dataSize);
+
+ protected:
+  // Allocate scratch space which is passed to EncryptBlock/DecryptBlock.
+  virtual void AllocateScratch(std::string&) = 0;
+
+  // Encrypt a block of data at the given block index.
+  // Length of data is equal to BlockSize();
+  virtual Status EncryptBlock(uint64_t blockIndex, char* data,
+                              char* scratch) = 0;
+
+  // Decrypt a block of data at the given block index.
+  // Length of data is equal to BlockSize();
+  virtual Status DecryptBlock(uint64_t blockIndex, char* data,
+                              char* scratch) = 0;
+};
+
+// BlockCipher
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class BlockCipher : public Customizable {
+ public:
+  virtual ~BlockCipher(){};
+
+  // Creates a new BlockCipher from the input config_options and value.
+  // The value describes the type of cipher (and potentially optional
+  // configuration parameters) used to create this cipher.
+  // For example, if the value is "ROT13", a ROT13BlockCipher is created.
+  //
+  // @param config_options Options to control how this cipher is created
+  //                       and initialized.
+  // @param value The value might be:
+  //   - ROT13         Create a ROT13 Cipher
+  //   - ROT13:nn      Create a ROT13 Cipher with block size of nn
+  // @param result The new cipher object
+  // @return OK if the cipher was successfully created
+  // @return NotFound if an invalid name was specified in the value
+  // @return InvalidArgument if the options were not valid
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value,
+                                 std::shared_ptr<BlockCipher>* result);
+
+  static const char* Type() { return "BlockCipher"; }
+  // Short-cut method to create a ROT13 BlockCipher.
+  // This cipher is only suitable for test purposes and should not be used in
+  // production!!!
+  static std::shared_ptr<BlockCipher> NewROT13Cipher(size_t block_size);
+
+  // BlockSize returns the size of each block supported by this cipher stream.
+  virtual size_t BlockSize() = 0;
+
+  // Encrypt a block of data.
+  // Length of data is equal to BlockSize().
+  virtual Status Encrypt(char* data) = 0;
+
+  // Decrypt a block of data.
+  // Length of data is equal to BlockSize().
+  virtual Status Decrypt(char* data) = 0;
+};
+
+// The encryption provider is used to create a cipher stream for a specific
+// file. The returned cipher stream will be used for actual
+// encryption/decryption actions.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class EncryptionProvider : public Customizable {
+ public:
+  virtual ~EncryptionProvider(){};
+
+  // Creates a new EncryptionProvider from the input config_options and value.
+  // The value describes the type of provider (and potentially optional
+  // configuration parameters) used to create this provider.
+  // For example, if the value is "CTR", a CTREncryptionProvider will be
+  // created. If the value ends with "://test" (e.g. "CTR://test"), the
+  // provider will be initialized in "TEST" mode prior to being returned.
+  //
+  // @param config_options Options to control how this provider is created
+  //                       and initialized.
+  // @param value The value might be:
+  //   - CTR          Create a CTR provider
+  //   - CTR://test   Create a CTR provider and initialize it for tests.
+  // @param result The new provider object
+  // @return OK if the provider was successfully created
+  // @return NotFound if an invalid name was specified in the value
+  // @return InvalidArgument if the options were not valid
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value,
+                                 std::shared_ptr<EncryptionProvider>* result);
+
+  static const char* Type() { return "EncryptionProvider"; }
+
+  // Short-cut method to create a CTR-provider
+  static std::shared_ptr<EncryptionProvider> NewCTRProvider(
+      const std::shared_ptr<BlockCipher>& cipher);
+
+  // GetPrefixLength returns the length of the prefix that is added to every
+  // file and used for storing encryption options. For optimal performance, the
+  // prefix length should be a multiple of the page size.
+  virtual size_t GetPrefixLength() const = 0;
+
+  // CreateNewPrefix initializes an allocated block of prefix memory
+  // for a new file.
+  virtual Status CreateNewPrefix(const std::string& fname, char* prefix,
+                                 size_t prefixLength) const = 0;
+
+  // Method to add a new cipher key for use by the EncryptionProvider.
+  // @param descriptor Descriptor for this key.
+  // @param cipher The cryptographic key to use
+  // @param len The length of the cipher key
+  // @param for_write If true, this cipher should be used for writing files.
+  //                  If false, this cipher should only be used for reading
+  //                  files
+  // @return OK if the cipher was successfully added to the provider, non-OK
+  //         otherwise
+  virtual Status AddCipher(const std::string& descriptor, const char* cipher,
+                           size_t len, bool for_write) = 0;
+
+  // CreateCipherStream creates a block access cipher stream for a file, given
+  // its name and options.
+  virtual Status CreateCipherStream(
+      const std::string& fname, const EnvOptions& options, Slice& prefix,
+      std::unique_ptr<BlockAccessCipherStream>* result) = 0;
+
+  // Returns a string representing an encryption marker prefix for this
+  // provider. If a marker is provided, this marker can be used to tell whether
+  // or not a file is encrypted by this provider. The marker will also be part
+  // of any encryption prefix for this provider.
+  virtual std::string GetMarker() const { return ""; }
+};
+
+class EncryptedSequentialFile : public FSSequentialFile {
+ protected:
+  std::unique_ptr<FSSequentialFile> file_;
+  std::unique_ptr<BlockAccessCipherStream> stream_;
+  uint64_t offset_;
+  size_t prefixLength_;
+
+ public:
+  // Default ctor. The given underlying sequential file is assumed to be at
+  // offset == prefixLength.
+  EncryptedSequentialFile(std::unique_ptr<FSSequentialFile>&& f,
+                          std::unique_ptr<BlockAccessCipherStream>&& s,
+                          size_t prefixLength)
+      : file_(std::move(f)),
+        stream_(std::move(s)),
+        offset_(prefixLength),
+        prefixLength_(prefixLength) {}
+
+  // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+  // written by this routine. Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // REQUIRES: External synchronization
+  IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                char* scratch, IODebugContext* dbg) override;
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower than reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  IOStatus Skip(uint64_t n) override;
+
+  // Indicates to the upper layers whether the current SequentialFile
+  // implementation uses direct IO.
+  bool use_direct_io() const override;
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  size_t GetRequiredBufferAlignment() const override;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  // Positioned Read for direct I/O
+  // If Direct I/O enabled, offset, n, and scratch should be properly aligned
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class EncryptedRandomAccessFile : public FSRandomAccessFile {
+ protected:
+  std::unique_ptr<FSRandomAccessFile> file_;
+  std::unique_ptr<BlockAccessCipherStream> stream_;
+  size_t prefixLength_;
+
+ public:
+  EncryptedRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& f,
+                            std::unique_ptr<BlockAccessCipherStream>&& s,
+                            size_t prefixLength)
+      : file_(std::move(f)),
+        stream_(std::move(s)),
+        prefixLength_(prefixLength) {}
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read). May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used. If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  // Readahead the file starting from offset by n bytes for caching.
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  // Tries to get a unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes.
+  // If such an ID can be created this function returns the length of the ID
+  // and places it in "id"; otherwise, this function returns 0, in which case
+  // "id" may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to each other by adding arbitrary bytes to one of
+  // them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  size_t GetUniqueId(char* id, size_t max_size) const override;
+
+  void Hint(AccessPattern pattern) override;
+
+  // Indicates to the upper layers whether the current RandomAccessFile
+  // implementation uses direct IO.
+  bool use_direct_io() const override;
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  size_t GetRequiredBufferAlignment() const override;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+};
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class EncryptedWritableFile : public FSWritableFile {
+ protected:
+  std::unique_ptr<FSWritableFile> file_;
+  std::unique_ptr<BlockAccessCipherStream> stream_;
+  size_t prefixLength_;
+
+ public:
+  // Default ctor. Prefix is assumed to be written already.
+  EncryptedWritableFile(std::unique_ptr<FSWritableFile>&& f,
+                        std::unique_ptr<BlockAccessCipherStream>&& s,
+                        size_t prefixLength)
+      : file_(std::move(f)),
+        stream_(std::move(s)),
+        prefixLength_(prefixLength) {}
+
+  using FSWritableFile::Append;
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override;
+
+  using FSWritableFile::PositionedAppend;
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override;
+
+  // true if Sync() and Fsync() are safe to call concurrently with Append()
+  // and Flush().
+  bool IsSyncThreadSafe() const override;
+
+  // Indicates to the upper layers whether the current WritableFile
+  // implementation uses direct IO.
+  bool use_direct_io() const override;
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  size_t GetRequiredBufferAlignment() const override;
+
+  /*
+   * Get the size of valid data in the file.
+   */
+  uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override;
+
+  // Truncate is necessary to trim the file to the correct size
+  // before closing. It is not always possible to keep track of the file
+  // size due to whole-page writes. The behavior is undefined if called
+  // with other writes to follow.
+  IOStatus Truncate(uint64_t size, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs. This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  void SetPreallocationBlockSize(size_t size) override;
+
+  void GetPreallocationStatus(size_t* block_size,
+                              size_t* last_allocated_block) override;
+
+  // Pre-allocates space for a file.
+  IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+};
+
+// A file abstraction for random reading and writing.
+class EncryptedRandomRWFile : public FSRandomRWFile {
+ protected:
+  std::unique_ptr<FSRandomRWFile> file_;
+  std::unique_ptr<BlockAccessCipherStream> stream_;
+  size_t prefixLength_;
+
+ public:
+  EncryptedRandomRWFile(std::unique_ptr<FSRandomRWFile>&& f,
+                        std::unique_ptr<BlockAccessCipherStream>&& s,
+                        size_t prefixLength)
+      : file_(std::move(f)),
+        stream_(std::move(s)),
+        prefixLength_(prefixLength) {}
+
+  // Indicates if the class makes use of direct I/O
+  // If false you must pass an aligned buffer to Write()
+  bool use_direct_io() const override;
+
+  // Use the returned alignment value to allocate an
+  // aligned buffer for Direct I/O
+  size_t GetRequiredBufferAlignment() const override;
+
+  // Write bytes in `data` at offset `offset`. Returns Status::OK() on success.
+  // Pass an aligned buffer when use_direct_io() returns true.
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+                 IODebugContext* dbg) override;
+
+  // Read up to `n` bytes starting from offset `offset` and store them in
+  // result; the provided `scratch` size should be at least `n`.
+  // Returns Status::OK() on success.
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+};
+
+class EncryptedFileSystem : public FileSystemWrapper {
+ public:
+  explicit EncryptedFileSystem(const std::shared_ptr<FileSystem>& base)
+      : FileSystemWrapper(base) {}
+  // Method to add a new cipher key for use by the EncryptionProvider.
+  // @param description Descriptor for this key.
+  // @param cipher The cryptographic key to use
+  // @param len The length of the cipher key
+  // @param for_write If true, this cipher should be used for writing files.
+  //                  If false, this cipher should only be used for reading
+  //                  files
+  // @return OK if the cipher was successfully added to the provider, non-OK
+  //         otherwise
+  virtual Status AddCipher(const std::string& descriptor, const char* cipher,
+                           size_t len, bool for_write) = 0;
+  static const char* kClassName() { return "EncryptedFileSystem"; }
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == kClassName()) {
+      return true;
+    } else {
+      return FileSystemWrapper::IsInstanceOf(name);
+    }
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/experimental.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/experimental.h
new file mode 100644
index 0000000000..b59395255c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/experimental.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace experimental {
+
+// Supported only for Leveled compaction
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+                           const Slice* begin, const Slice* end);
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end);
+
+// Move all L0 files to target_level skipping compaction.
+// This operation succeeds only if the files in L0 have disjoint ranges; this
+// is guaranteed to happen, for instance, if keys are inserted in sorted
+// order. Furthermore, all levels between 1 and target_level must be empty.
+// If any of the above conditions is violated, InvalidArgument will be
+// returned.
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family,
+                 int target_level = 1);
+
+struct UpdateManifestForFilesStateOptions {
+  // When true, read current file temperatures from FileSystem and update in
+  // DB manifest when a temperature other than Unknown is reported and
+  // inconsistent with manifest.
+  bool update_temperatures = true;
+
+  // TODO: new_checksums: to update files to latest file checksum algorithm
+};
+
+// Utility for updating manifest of DB directory (not open) for current state
+// of files on filesystem. See UpdateManifestForFilesStateOptions.
+//
+// To minimize interference with ongoing DB operations, only the following
+// guarantee is provided, assuming no IO error encountered:
+// * Only files live in DB at start AND end of call to
+//   UpdateManifestForFilesState() are guaranteed to be updated (as needed) in
+//   manifest.
+//   * For example, new files after start of call to
+//     UpdateManifestForFilesState() might not be updated, but that is not
+//     typically required to achieve goal of manifest consistency/completeness
+//     (because current DB configuration would ensure new files get the desired
+//     consistent metadata).
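+//
+// A minimal illustrative call (the DB path and the column-family list below
+// are placeholders, not part of this header):
+//
+//   DBOptions db_opts;
+//   std::vector<ColumnFamilyDescriptor> cfs = {
+//       {kDefaultColumnFamilyName, ColumnFamilyOptions()}};
+//   Status s = experimental::UpdateManifestForFilesState(
+//       db_opts, "/path/to/db", cfs);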
+Status UpdateManifestForFilesState(
+    const DBOptions& db_opts, const std::string& db_name,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    const UpdateManifestForFilesStateOptions& opts = {});
+
+}  // namespace experimental
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/file_checksum.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/file_checksum.h
new file mode 100644
index 0000000000..758bae4acf
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/file_checksum.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The unknown file checksum.
+constexpr char kUnknownFileChecksum[] = "";
+// The unknown sst file checksum function name.
+constexpr char kUnknownFileChecksumFuncName[] = "Unknown";
+// The standard DB file checksum function name.
+// This is the name of the checksum function returned by
+// GetFileChecksumGenCrc32cFactory();
+constexpr char kStandardDbFileChecksumFuncName[] = "FileChecksumCrc32c";
+
+struct FileChecksumGenContext {
+  std::string file_name;
+  // The name of the requested checksum generator.
+  // Checksum factories may use or ignore requested_checksum_func_name,
+  // and checksum factories written before this field was available are still
+  // compatible.
+  std::string requested_checksum_func_name;
+};
+
+// FileChecksumGenerator is the class that generates the checksum value
+// for each file when the file is written to the file system.
+// Implementations may assume that
+// * Finalize is called at most once during the life of the object
+// * All calls to Update come before Finalize
+// * All calls to GetChecksum come after Finalize
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileChecksumGenerator {
+ public:
+  virtual ~FileChecksumGenerator() {}
+
+  // Update the current result after processing the data. For different
+  // checksum functions, the temporal results may be stored and used in Update
+  // to include the new data.
+  virtual void Update(const char* data, size_t n) = 0;
+
+  // Generate the final results if no further new data will be updated.
+  virtual void Finalize() = 0;
+
+  // Get the checksum. The result should not be the empty string and may
+  // include arbitrary bytes, including non-printable characters.
+  virtual std::string GetChecksum() const = 0;
+
+  // Returns a name that identifies the current file checksum function.
+  virtual const char* Name() const = 0;
+};
+
+// Create the FileChecksumGenerator object for each SST file.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
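+//
+// For illustration, computing a checksum with the builtin CRC32C factory
+// might look like this (the file name and buffer contents are placeholders):
+//
+//   FileChecksumGenContext ctx;
+//   ctx.file_name = "000001.sst";
+//   std::unique_ptr<FileChecksumGenerator> gen =
+//       GetFileChecksumGenCrc32cFactory()->CreateFileChecksumGenerator(ctx);
+//   gen->Update("some file contents", 18);
+//   gen->Finalize();
+//   std::string checksum = gen->GetChecksum();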
+class FileChecksumGenFactory : public Customizable {
+ public:
+  ~FileChecksumGenFactory() override {}
+  static const char* Type() { return "FileChecksumGenFactory"; }
+  static Status CreateFromString(
+      const ConfigOptions& options, const std::string& value,
+      std::shared_ptr<FileChecksumGenFactory>* result);
+
+  // Create a new FileChecksumGenerator.
+  virtual std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+      const FileChecksumGenContext& context) = 0;
+
+  // Return the name of this FileChecksumGenFactory.
+  const char* Name() const override = 0;
+};
+
+// FileChecksumList stores the checksum information of a list of files (e.g.,
+// SST files). The FileChecksumList can be used to store the checksum
+// information of all SST files obtained from the MANIFEST, which are
+// the checksum information of all valid SST files of a DB instance. It can
+// also be used to store the checksum information of a list of SST files to
+// be ingested.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileChecksumList {
+ public:
+  virtual ~FileChecksumList() {}
+
+  // Clear the previously stored file checksum information.
+  virtual void reset() = 0;
+
+  // Get the number of checksums in the checksum list
+  virtual size_t size() const = 0;
+
+  // Return all the file checksum information being stored in an unordered_map.
+  // File_number is the key, the first part of the value is the checksum value,
+  // and the second part of the value is the checksum function name.
+  virtual Status GetAllFileChecksums(
+      std::vector<uint64_t>* file_numbers, std::vector<std::string>* checksums,
+      std::vector<std::string>* checksum_func_names) = 0;
+
+  // Given the file_number, it searches if the file checksum information is
+  // stored.
+  virtual Status SearchOneFileChecksum(uint64_t file_number,
+                                       std::string* checksum,
+                                       std::string* checksum_func_name) = 0;
+
+  // Insert the checksum information of one file to the FileChecksumList.
+  virtual Status InsertOneFileChecksum(
+      uint64_t file_number, const std::string& checksum,
+      const std::string& checksum_func_name) = 0;
+
+  // Remove the checksum information of one SST file.
+  virtual Status RemoveOneFileChecksum(uint64_t file_number) = 0;
+};
+
+// Create a new file checksum list.
+extern FileChecksumList* NewFileChecksumList();
+
+// Return a shared_ptr of the builtin Crc32c based file checksum generator
+// factory object, which can be shared to create the Crc32c based checksum
+// generator object.
+// Note: this implementation is compatible with many other crc32c checksum
+// implementations and uses big-endian encoding of the result, unlike most
+// other crc32c checksums in RocksDB, which alter the result with
+// crc32c::Mask and use little-endian encoding.
+extern std::shared_ptr<FileChecksumGenFactory>
+GetFileChecksumGenCrc32cFactory();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/file_system.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/file_system.h
new file mode 100644
index 0000000000..ae59ef8009
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/file_system.h
@@ -0,0 +1,1849 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// A FileSystem is an interface used by the rocksdb implementation to access
+// storage functionality like the filesystem etc. Callers
+// may wish to provide a custom FileSystem object when opening a database to
+// get fine-grained control; e.g., to rate limit file system operations.
+//
+// All FileSystem implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+//
+// WARNING: Since this is a new interface, it is expected that there will be
+// some changes as storage systems are ported over.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <chrono>
+#include <cstdarg>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "rocksdb/thread_status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FileLock;
+class FSDirectory;
+class FSRandomAccessFile;
+class FSRandomRWFile;
+class FSSequentialFile;
+class FSWritableFile;
+class Logger;
+class Slice;
+struct ImmutableDBOptions;
+struct MutableDBOptions;
+class RateLimiter;
+struct ConfigOptions;
+
+using AccessPattern = RandomAccessFile::AccessPattern;
+using FileAttributes = Env::FileAttributes;
+
+// DEPRECATED
+// Priority of an IO request. This is a hint and does not guarantee any
+// particular QoS.
+// IO_LOW - Typically background reads/writes such as compaction/flush
+// IO_HIGH - Typically user reads/synchronous WAL writes
+enum class IOPriority : uint8_t {
+  kIOLow,
+  kIOHigh,
+  kIOTotal,
+};
+
+// Type of the data being read/written. It can be passed down as a flag
+// for the FileSystem implementation to optionally handle different types in
+// different ways
+enum class IOType : uint8_t {
+  kData,
+  kFilter,
+  kIndex,
+  kMetadata,
+  kWAL,
+  kManifest,
+  kLog,
+  kUnknown,
+  kInvalid,
+};
+
+// Per-request options that can be passed down to the FileSystem
+// implementation. These are hints and are not necessarily guaranteed to be
+// honored. More hints can be added here in the future to indicate things like
+// storage media (HDD/SSD) to be used, replication level etc.
+struct IOOptions {
+  // Timeout for the operation in microseconds
+  std::chrono::microseconds timeout;
+
+  // DEPRECATED
+  // Priority - high or low
+  IOPriority prio;
+
+  // Priority used to charge rate limiter configured in file system level (if
+  // any)
+  // Limitation: right now RocksDB internals do not consider this
+  // rate_limiter_priority
+  Env::IOPriority rate_limiter_priority;
+
+  // Type of data being read/written
+  IOType type;
+
+  // EXPERIMENTAL
+  // An option map that's opaque to RocksDB. It can be used to implement a
+  // custom contract between a FileSystem user and the provider. This is only
+  // useful in cases where a RocksDB user directly uses the FileSystem or file
+  // object for their own purposes, and wants to pass extra options to APIs
+  // such as NewRandomAccessFile and NewWritableFile.
+  std::unordered_map<std::string, std::string> property_bag;
+
+  // Force directory fsync, some file systems like btrfs may skip directory
+  // fsync, set this to force the fsync
+  bool force_dir_fsync;
+
+  // Can be used by underlying file systems to skip recursing through sub
+  // directories and list only files in GetChildren API.
+  bool do_not_recurse;
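+
+  // For illustration, a caller issuing a low-priority metadata read with a
+  // 10ms budget might configure (the values here are arbitrary examples):
+  //
+  //   IOOptions io_opts;
+  //   io_opts.timeout = std::chrono::milliseconds(10);
+  //   io_opts.rate_limiter_priority = Env::IO_LOW;
+  //   io_opts.type = IOType::kMetadata;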
+
+  Env::IOActivity io_activity = Env::IOActivity::kUnknown;
+
+  IOOptions() : IOOptions(false) {}
+
+  explicit IOOptions(bool force_dir_fsync_)
+      : timeout(std::chrono::microseconds::zero()),
+        prio(IOPriority::kIOLow),
+        rate_limiter_priority(Env::IO_TOTAL),
+        type(IOType::kUnknown),
+        force_dir_fsync(force_dir_fsync_),
+        do_not_recurse(false) {}
+};
+
+struct DirFsyncOptions {
+  enum FsyncReason : uint8_t {
+    kNewFileSynced,
+    kFileRenamed,
+    kDirRenamed,
+    kFileDeleted,
+    kDefault,
+  } reason;
+
+  std::string renamed_new_name;  // for kFileRenamed
+  // add other options for other FsyncReason
+
+  DirFsyncOptions();
+
+  explicit DirFsyncOptions(std::string file_renamed_new_name);
+
+  explicit DirFsyncOptions(FsyncReason fsync_reason);
+};
+
+// File scope options that control how a file is opened/created and accessed
+// while it is open. We may add more options here in the future such as
+// redundancy level, media to use etc.
+struct FileOptions : EnvOptions {
+  // Embedded IOOptions to control the parameters for any IOs that need
+  // to be issued for the file open/creation
+  IOOptions io_options;
+
+  // EXPERIMENTAL
+  // The feature is in development and is subject to change.
+  // When creating a new file, set the temperature of the file so that
+  // underlying file systems can put it with appropriate storage media and/or
+  // coding.
+  Temperature temperature = Temperature::kUnknown;
+
+  // The checksum type that is used to calculate the checksum value for
+  // handoff during file writes.
+  ChecksumType handoff_checksum_type;
+
+  FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {}
+
+  FileOptions(const DBOptions& opts)
+      : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {}
+
+  FileOptions(const EnvOptions& opts)
+      : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {}
+
+  FileOptions(const FileOptions& opts)
+      : EnvOptions(opts),
+        io_options(opts.io_options),
+        temperature(opts.temperature),
+        handoff_checksum_type(opts.handoff_checksum_type) {}
+
+  FileOptions& operator=(const FileOptions&) = default;
+};
+
+// A structure to pass back some debugging information from the FileSystem
+// implementation to RocksDB in case of an IO error
+struct IODebugContext {
+  // file_path to be filled in by RocksDB in case of an error
+  std::string file_path;
+
+  // A map of counter names to values - set by the FileSystem implementation
+  std::map<std::string, uint64_t> counters;
+
+  // To be set by the FileSystem implementation
+  std::string msg;
+
+  // To be set by the underlying FileSystem implementation.
+  std::string request_id;
+
+  // In order to log required information in IO tracing for different
+  // operations, each bit in trace_data stores which corresponding info from
+  // IODebugContext will be added in the trace. For example, if trace_data = 1,
+  // the bit at position 0 is set, so TraceData::kRequestID (request_id) will
+  // be logged in the trace record.
+  //
+  enum TraceData : char {
+    // The value of each enum represents the bitwise position for
+    // that information in trace_data which will be used by IOTracer for
+    // tracing. Make sure to add them sequentially.
+    kRequestID = 0,
+  };
+  uint64_t trace_data = 0;
+
+  IODebugContext() {}
+
+  void AddCounter(std::string& name, uint64_t value) {
+    counters.emplace(name, value);
+  }
+
+  // Called by the underlying file system to set request_id and log request_id
+  // in IOTracing.
+  void SetRequestId(const std::string& _request_id) {
+    request_id = _request_id;
+    trace_data |= (1 << TraceData::kRequestID);
+  }
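+
+  // For illustration, a FileSystem implementation could record a request id
+  // and a counter before reporting an error (the names here are made up):
+  //
+  //   dbg->SetRequestId("req-42");
+  //   std::string name = "retries";
+  //   dbg->AddCounter(name, 3);
+  //   return IOStatus::IOError(dbg->ToString());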
+
+  std::string ToString() {
+    std::ostringstream ss;
+    ss << file_path << ", ";
+    for (auto counter : counters) {
+      ss << counter.first << " = " << counter.second << ",";
+    }
+    ss << msg;
+    return ss.str();
+  }
+};
+
+// A function pointer type for custom destruction of the void pointer passed to
+// the ReadAsync API. RocksDB/the caller is responsible for deleting the void
+// pointer allocated by the FS in the ReadAsync API.
+using IOHandleDeleter = std::function<void(void*)>;
+
+// The FileSystem, FSSequentialFile, FSRandomAccessFile, FSWritableFile,
+// FSRandomRWFile, and FSDirectory classes define the interface between
+// RocksDB and storage systems, such as Posix filesystems,
+// remote filesystems etc.
+// The interface allows for fine grained control of individual IO operations,
+// such as setting a timeout, prioritization, hints on data placement,
+// different handling based on type of IO etc.
+// This is accomplished by passing an instance of IOOptions to every
+// API call that can potentially perform IO. Additionally, each such API is
+// passed a pointer to a IODebugContext structure that can be used by the
+// storage system to include troubleshooting information. The return values
+// of the APIs are of type IOStatus, which can indicate an error code/sub-code,
+// as well as metadata about the error such as its scope and whether it's
+// retryable.
+// NewCompositeEnv can be used to create an Env with a custom FileSystem for
+// DBOptions::env.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileSystem : public Customizable {
+ public:
+  FileSystem();
+
+  // No copying allowed
+  FileSystem(const FileSystem&) = delete;
+
+  virtual ~FileSystem();
+
+  static const char* Type() { return "FileSystem"; }
+  static const char* kDefaultName() { return "DefaultFileSystem"; }
+
+  // Loads the FileSystem specified by the input value into the result
+  // @see Customizable for a more detailed description of the parameters and
+  // return codes
+  // @param config_options Controls how the FileSystem is loaded
+  // @param value The name and optional properties describing the file system
+  //       to load.
+  // @param result On success, returns the loaded FileSystem
+  // @return OK if the FileSystem was successfully loaded.
+  // @return not-OK if the load failed.
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& value,
+                                 std::shared_ptr<FileSystem>* result);
+
+  // Return a default FileSystem suitable for the current operating
+  // system.
+  static std::shared_ptr<FileSystem> Default();
+
+  // Handles the event when a new DB or a new ColumnFamily starts using the
+  // specified data paths.
+  //
+  // The data paths might be shared by different DBs or ColumnFamilies,
+  // so RegisterDbPaths might be called with the same data paths.
+  // For example, when CreateColumnFamily is called multiple times with the
+  // same data path, RegisterDbPaths will also be called with the same data
+  // path.
+  //
+  // If the return status is ok, then the paths must be correspondingly
+  // called in UnregisterDbPaths;
+  // otherwise this method should have no side effect, and UnregisterDbPaths
+  // does not need to be called for the paths.
+  //
+  // Different implementations may take different actions.
+  // By default, it's a no-op and returns Status::OK.
+  virtual Status RegisterDbPaths(const std::vector<std::string>& /*paths*/) {
+    return Status::OK();
+  }
+  // Handles the event a DB or a ColumnFamily stops using the specified data
+  // paths.
+  //
+  // It should be called corresponding to each successful RegisterDbPaths.
+  //
+  // Different implementations may take different actions.
+  // By default, it's a no-op and returns Status::OK.
+  virtual Status UnregisterDbPaths(const std::vector<std::string>& /*paths*/) {
+    return Status::OK();
+  }
+
+  // Create a brand new sequentially-readable file with the specified name.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // On failure stores nullptr in *result and returns non-OK. If the file does
+  // not exist, returns a non-OK status.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual IOStatus NewSequentialFile(const std::string& fname,
+                                     const FileOptions& file_opts,
+                                     std::unique_ptr<FSSequentialFile>* result,
+                                     IODebugContext* dbg) = 0;
+
+  // Create a brand new random access read-only file with the
+  // specified name. On success, stores a pointer to the new file in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK. If the file does not exist, returns a non-OK
+  // status.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  virtual IOStatus NewRandomAccessFile(
+      const std::string& fname, const FileOptions& file_opts,
+      std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) = 0;
+  // These values match Linux definition
+  // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
+  enum WriteLifeTimeHint {
+    kWLTHNotSet = 0,  // No hint information set
+    kWLTHNone,        // No hints about write life time
+    kWLTHShort,       // Data written has a short life time
+    kWLTHMedium,      // Data written has a medium life time
+    kWLTHLong,        // Data written has a long life time
+    kWLTHExtreme,     // Data written has an extremely long life time
+  };
+
+  // Create an object that writes to a new file with the specified
+  // name. Deletes any existing file with the same name and creates a
+  // new file. On success, stores a pointer to the new file in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual IOStatus NewWritableFile(const std::string& fname,
+                                   const FileOptions& file_opts,
+                                   std::unique_ptr<FSWritableFile>* result,
+                                   IODebugContext* dbg) = 0;
+
+  // Create an object that writes to a file with the specified name.
+  // `FSWritableFile::Append()`s will append after any existing content. If the
+  // file does not already exist, creates it.
+  //
+  // On success, stores a pointer to the file in *result and returns OK. On
+  // failure stores nullptr in *result and returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual IOStatus ReopenWritableFile(
+      const std::string& /*fname*/, const FileOptions& /*options*/,
+      std::unique_ptr<FSWritableFile>* /*result*/, IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("ReopenWritableFile");
+  }
+
+  // Reuse an existing file by renaming it and opening it as writable.
+  virtual IOStatus ReuseWritableFile(const std::string& fname,
+                                     const std::string& old_fname,
+                                     const FileOptions& file_opts,
+                                     std::unique_ptr<FSWritableFile>* result,
+                                     IODebugContext* dbg);
+
+  // Open `fname` for random read and write; if the file doesn't exist it
+  // will be created.
+  // On success, stores a pointer to the new file in
+  // *result and returns OK. On failure returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual IOStatus NewRandomRWFile(const std::string& /*fname*/,
+                                   const FileOptions& /*options*/,
+                                   std::unique_ptr<FSRandomRWFile>* /*result*/,
+                                   IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "RandomRWFile is not implemented in this FileSystem");
+  }
+
+  // Opens `fname` as a memory-mapped file for read and write (in-place updates
+  // only, i.e., no appends). On success, stores a raw buffer covering the
+  // whole file in `*result`. The file must exist prior to this call.
+  virtual IOStatus NewMemoryMappedFileBuffer(
+      const std::string& /*fname*/,
+      std::unique_ptr<MemoryMappedFileBuffer>* /*result*/) {
+    return IOStatus::NotSupported(
+        "MemoryMappedFileBuffer is not implemented in this FileSystem");
+  }
+
+  // Create an object that represents a directory. Will fail if the directory
+  // doesn't exist. If the directory exists, it will open the directory
+  // and create a new Directory object.
+  //
+  // On success, stores a pointer to the new Directory in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK.
+  virtual IOStatus NewDirectory(const std::string& name,
+                                const IOOptions& io_opts,
+                                std::unique_ptr<FSDirectory>* result,
+                                IODebugContext* dbg) = 0;
+
+  // Returns OK if the named file exists.
+  //         NotFound if the named file does not exist,
+  //                  the calling process does not have permission to determine
+  //                  whether this file exists, or if the path is invalid.
+  //         IOError if an IO Error was encountered
+  virtual IOStatus FileExists(const std::string& fname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir".
+  // Original contents of *result are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not
+  //                  have permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  virtual IOStatus GetChildren(const std::string& dir, const IOOptions& options,
+                               std::vector<std::string>* result,
+                               IODebugContext* dbg) = 0;
+
+  // Store in *result the attributes of the children of the specified
+  // directory.
+  // In case the implementation lists the directory prior to iterating the
+  // files and files are concurrently deleted, the deleted files will be
+  // omitted from result.
+  // The name attributes are relative to "dir".
+  // Original contents of *result are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not
+  //                  have permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  virtual IOStatus GetChildrenFileAttributes(
+      const std::string& dir, const IOOptions& options,
+      std::vector<FileAttributes>* result, IODebugContext* dbg) {
+    assert(result != nullptr);
+    std::vector<std::string> child_fnames;
+    IOStatus s = GetChildren(dir, options, &child_fnames, dbg);
+    if (!s.ok()) {
+      return s;
+    }
+    result->resize(child_fnames.size());
+    size_t result_size = 0;
+    for (size_t i = 0; i < child_fnames.size(); ++i) {
+      const std::string path = dir + "/" + child_fnames[i];
+      if (!(s = GetFileSize(path, options, &(*result)[result_size].size_bytes,
+                            dbg))
+               .ok()) {
+        if (FileExists(path, options, dbg).IsNotFound()) {
+          // The file may have been deleted since we listed the directory
+          continue;
+        }
+        return s;
+      }
+      (*result)[result_size].name = std::move(child_fnames[i]);
+      result_size++;
+    }
+    result->resize(result_size);
+    return IOStatus::OK();
+  }
+
+// This seems to clash with a macro on Windows, so #undef it here
+#ifdef DeleteFile
+#undef DeleteFile
+#endif
+  // Delete the named file.
+  virtual IOStatus DeleteFile(const std::string& fname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) = 0;
+
+  // Truncate the named file to the specified size.
+  virtual IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/,
+                            const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "Truncate is not supported for this FileSystem");
+  }
+
+  // Create the specified directory. Returns an error if the directory exists.
+  virtual IOStatus CreateDir(const std::string& dirname,
+                             const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // Creates the directory if missing. Returns OK if it exists, or if it was
+  // successfully created.
+  virtual IOStatus CreateDirIfMissing(const std::string& dirname,
+                                      const IOOptions& options,
+                                      IODebugContext* dbg) = 0;
+
+  // Delete the specified directory.
+  virtual IOStatus DeleteDir(const std::string& dirname,
+                             const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual IOStatus GetFileSize(const std::string& fname,
+                               const IOOptions& options, uint64_t* file_size,
+                               IODebugContext* dbg) = 0;
+
+  // Store the last modification time of fname in *file_mtime.
+  virtual IOStatus GetFileModificationTime(const std::string& fname,
+                                           const IOOptions& options,
+                                           uint64_t* file_mtime,
+                                           IODebugContext* dbg) = 0;
+  // Rename file src to target.
+  virtual IOStatus RenameFile(const std::string& src, const std::string& target,
+                              const IOOptions& options,
+                              IODebugContext* dbg) = 0;
+
+  // Hard link file src to target.
+  virtual IOStatus LinkFile(const std::string& /*src*/,
+                            const std::string& /*target*/,
+                            const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "LinkFile is not supported for this FileSystem");
+  }
+
+  virtual IOStatus NumFileLinks(const std::string& /*fname*/,
+                                const IOOptions& /*options*/,
+                                uint64_t* /*count*/, IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "Getting number of file links is not supported for this FileSystem");
+  }
+
+  virtual IOStatus AreFilesSame(const std::string& /*first*/,
+                                const std::string& /*second*/,
+                                const IOOptions& /*options*/, bool* /*res*/,
+                                IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "AreFilesSame is not supported for this FileSystem");
+  }
+
+  // Lock the specified file. Used to prevent concurrent access to
+  // the same db by multiple processes. On failure, stores nullptr in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK. The caller should call
+  // UnlockFile(*lock) to release the lock. If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure. I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  virtual IOStatus LockFile(const std::string& fname, const IOOptions& options,
+                            FileLock** lock, IODebugContext* dbg) = 0;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  virtual IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
+                              IODebugContext* dbg) = 0;
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or may not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+                                    IODebugContext* dbg) = 0;
+
+  // Create and return a default logger (an instance of EnvLogger) for storing
+  // informational messages. Derived classes can override to provide a custom
+  // logger.
+  virtual IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts,
+                             std::shared_ptr<Logger>* result,
+                             IODebugContext* dbg);
+
+  // Get full directory name for this db.
+  virtual IOStatus GetAbsolutePath(const std::string& db_path,
+                                   const IOOptions& options,
+                                   std::string* output_path,
+                                   IODebugContext* dbg) = 0;
+
+  // Sanitize the FileOptions. Typically called by a FileOptions/EnvOptions
+  // copy constructor
+  virtual void SanitizeFileOptions(FileOptions* /*opts*/) const {}
+
+  // OptimizeForLogRead will create a new FileOptions object that is a copy of
+  // the FileOptions in the parameters, but is optimized for reading log files.
+  virtual FileOptions OptimizeForLogRead(const FileOptions& file_options) const;
+
+  // OptimizeForManifestRead will create a new FileOptions object that is a
+  // copy of the FileOptions in the parameters, but is optimized for reading
+  // manifest files.
+  virtual FileOptions OptimizeForManifestRead(
+      const FileOptions& file_options) const;
+
+  // OptimizeForLogWrite will create a new FileOptions object that is a copy of
+  // the FileOptions in the parameters, but is optimized for writing log files.
+  // Default implementation returns the copy of the same object.
+  virtual FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+                                          const DBOptions& db_options) const;
+
+  // OptimizeForManifestWrite will create a new FileOptions object that is a
+  // copy of the FileOptions in the parameters, but is optimized for writing
+  // manifest files. Default implementation returns the copy of the same
+  // object.
+  virtual FileOptions OptimizeForManifestWrite(
+      const FileOptions& file_options) const;
+
+  // OptimizeForCompactionTableWrite will create a new FileOptions object that
+  // is a copy of the FileOptions in the parameters, but is optimized for
+  // writing table files.
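+  //
+  // For illustration, an override might return a copy with direct writes
+  // enabled (a sketch only; whether this is appropriate depends on the
+  // storage backend):
+  //
+  //   FileOptions opts = file_options;
+  //   opts.use_direct_writes = true;
+  //   return opts;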
+  virtual FileOptions OptimizeForCompactionTableWrite(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& immutable_ops) const;
+
+  // OptimizeForCompactionTableRead will create a new FileOptions object that
+  // is a copy of the FileOptions in the parameters, but is optimized for
+  // reading table files.
+  virtual FileOptions OptimizeForCompactionTableRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const;
+
+  // OptimizeForBlobFileRead will create a new FileOptions object that
+  // is a copy of the FileOptions in the parameters, but is optimized for
+  // reading blob files.
+  virtual FileOptions OptimizeForBlobFileRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const;
+
+// This seems to clash with a macro on Windows, so #undef it here
+#ifdef GetFreeSpace
+#undef GetFreeSpace
+#endif
+
+  // Get the amount of free disk space
+  virtual IOStatus GetFreeSpace(const std::string& /*path*/,
+                                const IOOptions& /*options*/,
+                                uint64_t* /*diskfree*/,
+                                IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("GetFreeSpace");
+  }
+
+  virtual IOStatus IsDirectory(const std::string& /*path*/,
+                               const IOOptions& options, bool* is_dir,
+                               IODebugContext* /*dbg*/) = 0;
+
+  // EXPERIMENTAL
+  // Poll for completion of read IO requests. The Poll() method should call the
+  // callback functions to indicate completion of read requests.
+  // The underlying FS is required to support the Poll API. The Poll
+  // implementation should ensure that the callback gets called at IO
+  // completion, and return only after the callback has been called.
+  // If Poll returns partial results for any reads, it is the caller's
+  // responsibility to call Read or ReadAsync in order to get the remaining
+  // bytes.
+  //
+  // Default implementation is to return IOStatus::OK.
+  virtual IOStatus Poll(std::vector<void*>& /*io_handles*/,
+                        size_t /*min_completions*/) {
+    return IOStatus::OK();
+  }
+
+  // EXPERIMENTAL
+  // Abort the read IO requests submitted asynchronously. The underlying FS is
+  // required to support the AbortIO API. The AbortIO implementation should
+  // ensure that all the read requests related to io_handles are aborted and
+  // that the callback is not called for these io_handles.
+  //
+  // Default implementation is to return IOStatus::OK.
+  virtual IOStatus AbortIO(std::vector<void*>& /*io_handles*/) {
+    return IOStatus::OK();
+  }
+
+  // Indicates to upper layers whether the FileSystem supports/uses async IO
+  // or not
+  virtual bool use_async_io() { return true; }
+
+  // If you're adding methods here, remember to add them to EnvWrapper too.
+
+ private:
+  void operator=(const FileSystem&);
+};
+
+// A file abstraction for reading sequentially through a file
+class FSSequentialFile {
+ public:
+  FSSequentialFile() {}
+
+  virtual ~FSSequentialFile() {}
+
+  // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+  // written by this routine. Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // REQUIRES: External synchronization
+  virtual IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                        char* scratch, IODebugContext* dbg) = 0;
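+
+  // For illustration, a caller might drain a file like this (the buffer size
+  // and error handling are arbitrary choices, not part of this contract):
+  //
+  //   char scratch[4096];
+  //   Slice chunk;
+  //   do {
+  //     IOStatus s = file->Read(sizeof(scratch), IOOptions(), &chunk,
+  //                             scratch, nullptr);
+  //     if (!s.ok()) break;  // handle the error
+  //     // consume chunk.data(), chunk.size()
+  //   } while (chunk.size() == sizeof(scratch));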
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower than reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  virtual IOStatus Skip(uint64_t n) = 0;
+
+  // Indicates to the upper layers whether the current SequentialFile
+  // implementation uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate an
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return IOStatus::NotSupported("InvalidateCache not supported.");
+  }
+
+  // Positioned Read for direct I/O
+  // If Direct I/O is enabled, offset, n, and scratch should be properly
+  // aligned
+  virtual IOStatus PositionedRead(uint64_t /*offset*/, size_t /*n*/,
+                                  const IOOptions& /*options*/,
+                                  Slice* /*result*/, char* /*scratch*/,
+                                  IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("PositionedRead");
+  }
+
+  // EXPERIMENTAL
+  // When available, returns the actual temperature for the file. This is
+  // useful in case some outside process moves a file from one tier to another,
+  // though the temperature is generally expected not to change while a file is
+  // open.
+  virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
+
+  // If you're adding methods here, remember to add them to
+  // SequentialFileWrapper too.
+};
+
+// A read IO request structure for use in MultiRead and asynchronous Read APIs.
+struct FSReadRequest {
+  // Input parameter that represents the file offset in bytes.
+  uint64_t offset;
+
+  // Input parameter that represents the length to read in bytes. `result` only
+  // returns fewer bytes if end of file is hit (or `status` is not OK).
+  size_t len;
+
+  // A buffer that MultiRead() can optionally place data in. It can
+  // ignore this and allocate its own buffer.
+  // The lifecycle of scratch will be until IO is completed.
+  //
+  // In case of asynchronous reads, it's an output parameter and it will be
+  // maintained until the callback has been called. Scratch is allocated by
+  // RocksDB and will be passed to the underlying FileSystem.
+  char* scratch;
+
+  // Output parameter set by MultiRead() to point to the data buffer, and
+  // the number of valid bytes
+  //
+  // In case of asynchronous reads, this output parameter is set by Async Read
+  // APIs to point to the data buffer, and
+  // the number of valid bytes.
+  // Slice result should point to scratch, i.e. the data should
+  // always be read into scratch.
+  Slice result;
+
+  // Output parameter set by underlying FileSystem that represents status of
+  // read request.
+  IOStatus status;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class FSRandomAccessFile {
+ public:
+  FSRandomAccessFile() {}
+
+  virtual ~FSRandomAccessFile() {}
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read). May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used. If an error was encountered, returns a non-OK
+  // status.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // Safe for concurrent use by multiple threads.
+  // If Direct I/O is enabled, offset, n, and scratch should be aligned
+  // properly.
+  virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                        Slice* result, char* scratch,
+                        IODebugContext* dbg) const = 0;
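+
+  // For illustration, a positioned read of 4 KiB at offset 8192 might look
+  // like this (the sizes and offset are arbitrary examples):
+  //
+  //   char scratch[4096];
+  //   Slice result;
+  //   IOStatus s = file->Read(8192, sizeof(scratch), IOOptions(), &result,
+  //                           scratch, nullptr);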
Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // + // Safe for concurrent use by multiple threads. + // If Direct I/O enabled, offset, n, and scratch should be aligned properly. + virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const = 0; + + // Readahead the file starting from offset by n bytes for caching. + // If it's not implemented (default: `NotSupported`), RocksDB will create + // internal prefetch buffer to improve read performance. + virtual IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("Prefetch"); + } + + // Read a bunch of blocks as described by reqs. The blocks can + // optionally be read in parallel. This is a synchronous call, i.e it + // should return after all reads have completed. The reads will be + // non-overlapping but can be in any order. If the function return Status + // is not ok, status of individual requests will be ignored and return + // status will be assumed for all read requests. The function return status + // is only meant for errors that occur before processing individual read + // requests. + virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) { + assert(reqs != nullptr); + for (size_t i = 0; i < num_reqs; ++i) { + FSReadRequest& req = reqs[i]; + req.status = + Read(req.offset, req.len, options, &req.result, req.scratch, dbg); + } + return IOStatus::OK(); + } + + // Tries to get an unique ID for this file that will be the same each time + // the file is opened (and will stay the same while the file is open). + // Furthermore, it tries to make this ID at most "max_size" bytes. If such an + // ID can be created this function returns the length of the ID and places it + // in "id"; otherwise, this function returns 0, in which case "id" + // may not have been modified. + // + // This function guarantees, for IDs from a given environment, two unique ids + // cannot be made equal to each other by adding arbitrary bytes to one of + // them. That is, no unique ID is the prefix of another. + // + // This function guarantees that the returned ID will not be interpretable as + // a single varint. + // + // Note: these IDs are only valid for the duration of the process. + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; // Default implementation to prevent issues with backwards + // compatibility. + }; + + enum AccessPattern { kNormal, kRandom, kSequential, kWillNeed, kWontNeed }; + + virtual void Hint(AccessPattern /*pattern*/) {} + + // Indicates the upper layers if the current RandomAccessFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. 
+  virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return IOStatus::NotSupported("InvalidateCache not supported.");
+  }
+
+  // EXPERIMENTAL
+  // This API reads the requested data in FSReadRequest asynchronously. This is
+  // an asynchronous call, i.e. it should return after submitting the request.
+  //
+  // When the read request is completed, the callback function specified in cb
+  // should be called with arguments cb_arg and the result populated in
+  // FSReadRequest, with the result and status fields updated by the
+  // FileSystem. cb_arg should be used by the callback to track the original
+  // request submitted.
+  //
+  // This API should also populate io_handle, which should be used by the
+  // underlying FileSystem to store the context in order to distinguish the
+  // read requests at their side and provide the custom deletion function in
+  // del_fn. RocksDB guarantees that the del_fn for io_handle will be called
+  // after receiving the callback. Furthermore, RocksDB guarantees that if it
+  // calls the Poll API for this io_handle, del_fn will be called after the
+  // Poll returns. RocksDB is responsible for managing the lifetime of
+  // io_handle.
+  //
+  // req contains the request offset and size passed as input parameters of the
+  // read request, and the result and status fields are output parameters set
+  // by the underlying FileSystem. The data should always be read into the
+  // scratch field.
+  //
+  // Default implementation is to read the data synchronously.
+  virtual IOStatus ReadAsync(
+      FSReadRequest& req, const IOOptions& opts,
+      std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+      void** /*io_handle*/, IOHandleDeleter* /*del_fn*/, IODebugContext* dbg) {
+    req.status =
+        Read(req.offset, req.len, opts, &(req.result), req.scratch, dbg);
+    cb(req, cb_arg);
+    return IOStatus::OK();
+  }
+
+  // EXPERIMENTAL
+  // When available, returns the actual temperature for the file. This is
+  // useful in case some outside process moves a file from one tier to another,
+  // though the temperature is generally expected not to change while a file is
+  // open.
+  virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
+
+  // If you're adding methods here, remember to add them to
+  // RandomAccessFileWrapper too.
+};
+
+// A data structure that brings the data verification information, which is
+// used together with data being written to a file.
+struct DataVerificationInfo {
+  // checksum of the data being written.
+  Slice checksum;
+};
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class FSWritableFile {
+ public:
+  FSWritableFile()
+      : last_preallocated_block_(0),
+        preallocation_block_size_(0),
+        io_priority_(Env::IO_TOTAL),
+        write_hint_(Env::WLTH_NOT_SET),
+        strict_bytes_per_sync_(false) {}
+
+  explicit FSWritableFile(const FileOptions& options)
+      : last_preallocated_block_(0),
+        preallocation_block_size_(0),
+        io_priority_(Env::IO_TOTAL),
+        write_hint_(Env::WLTH_NOT_SET),
+        strict_bytes_per_sync_(options.strict_bytes_per_sync) {}
+
+  virtual ~FSWritableFile() {}
+
+  // Append data to the end of the file
+  // Note: A WritableFile object must support either Append or
+  // PositionedAppend, so the users cannot mix the two.
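+  //
+  // For illustration, a caller typically appends and then flushes (options
+  // and error handling elided):
+  //
+  //   IOStatus s = wf->Append(Slice("payload"), IOOptions(), nullptr);
+  //   if (s.ok()) s = wf->Flush(IOOptions(), nullptr);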
+  virtual IOStatus Append(const Slice& data, const IOOptions& options,
+                          IODebugContext* dbg) = 0;
+
+  // Append data with verification information.
+  // Note that this API change is experimental and it might be changed in
+  // the future. Currently, RocksDB only generates a crc32c based checksum for
+  // the file writes when the checksum handoff option is set.
+  // Expected behavior: if the handoff_checksum_type in FileOptions (currently,
+  // ChecksumType::kCRC32C is set as default) is not supported by this
+  // FSWritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. does not perform checksum verification).
+  virtual IOStatus Append(const Slice& data, const IOOptions& options,
+                          const DataVerificationInfo& /* verification_info */,
+                          IODebugContext* dbg) {
+    return Append(data, options, dbg);
+  }
+
+  // PositionedAppend data to the specified offset. The new EOF after append
+  // must be larger than the previous EOF. This is to be used when writes are
+  // not backed by OS buffers and hence have to always start from the start of
+  // the sector. The implementation thus needs to also rewrite the last
+  // partial sector.
+  // Note: PositionedAppend does not guarantee moving the file offset after the
+  // write. A WritableFile object must support either Append or
+  // PositionedAppend, so the users cannot mix the two.
+  //
+  // PositionedAppend() can only happen on the page/sector boundaries. For that
+  // reason, if the last write was an incomplete sector we still need to rewind
+  // back to the nearest sector/page and rewrite the portion of it with
+  // whatever we need to add. We need to keep where we stop writing.
+  //
+  // PositionedAppend() can only write whole sectors. For that reason we have
+  // to pad with zeros for the last write and trim the file when closing
+  // according to the position we keep in the previous step.
+  //
+  // PositionedAppend() requires an aligned buffer to be passed in. The
+  // alignment required is queried via GetRequiredBufferAlignment()
+  virtual IOStatus PositionedAppend(const Slice& /* data */,
+                                    uint64_t /* offset */,
+                                    const IOOptions& /*options*/,
+                                    IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("PositionedAppend");
+  }
+
+  // PositionedAppend data with verification information.
+  // Note that this API change is experimental and it might be changed in
+  // the future. Currently, RocksDB only generates a crc32c based checksum for
+  // the file writes when the checksum handoff option is set.
+  // Expected behavior: if the handoff_checksum_type in FileOptions (currently,
+  // ChecksumType::kCRC32C is set as default) is not supported by this
+  // FSWritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. does not perform checksum verification).
+  virtual IOStatus PositionedAppend(
+      const Slice& /* data */, uint64_t /* offset */,
+      const IOOptions& /*options*/,
+      const DataVerificationInfo& /* verification_info */,
+      IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("PositionedAppend");
+  }
+
+  // Truncate is necessary to trim the file to the correct size
+  // before closing. It is not always possible to keep track of the file
+  // size due to whole pages writes. The behavior is undefined if called
+  // with other writes to follow.
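+  //
+  // For illustration, a direct-IO writer that padded its final sector might
+  // finish up as follows (a sketch; `logical_size` is whatever size the
+  // caller tracked, not something this header provides):
+  //
+  //   wf->Truncate(logical_size, IOOptions(), nullptr);
+  //   wf->Close(IOOptions(), nullptr);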
+  virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::OK();
+  }
+  virtual IOStatus Close(const IOOptions& /*options*/,
+                         IODebugContext* /*dbg*/) = 0;
+
+  virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0;
+  virtual IOStatus Sync(const IOOptions& options,
+                        IODebugContext* dbg) = 0;  // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) {
+    return Sync(options, dbg);
+  }
+
+  // true if Sync() and Fsync() are safe to call concurrently with Append()
+  // and Flush().
+  virtual bool IsSyncThreadSafe() const { return false; }
+
+  // Indicates to the upper layers whether the current WritableFile
+  // implementation uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate an
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
+    write_hint_ = hint;
+  }
+
+  /*
+   * If rate limiting is enabled, change the file-granularity priority used in
+   * rate-limiting writes.
+   *
+   * In the presence of finer-granularity priority such as
+   * `WriteOptions::rate_limiter_priority`, this file-granularity priority may
+   * be overridden by a non-Env::IO_TOTAL finer-granularity priority and used
+   * as a fallback for Env::IO_TOTAL finer-granularity priority.
+   *
+   * If rate limiting is not enabled, this call has no effect.
+   */
+  virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; }
+
+  virtual Env::IOPriority GetIOPriority() { return io_priority_; }
+
+  virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; }
+  /*
+   * Get the size of valid data in the file.
+   */
+  virtual uint64_t GetFileSize(const IOOptions& /*options*/,
+                               IODebugContext* /*dbg*/) {
+    return 0;
+  }
+
+  /*
+   * Get and set the default pre-allocation block size for writes to
+   * this file. If non-zero, then Allocate will be used to extend the
+   * underlying storage of a file (generally via fallocate) if the Env
+   * instance supports it.
+   */
+  virtual void SetPreallocationBlockSize(size_t size) {
+    preallocation_block_size_ = size;
+  }
+
+  virtual void GetPreallocationStatus(size_t* block_size,
+                                      size_t* last_allocated_block) {
+    *last_allocated_block = last_preallocated_block_;
+    *block_size = preallocation_block_size_;
+  }
+
+  // For documentation, refer to RandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;  // Default implementation to prevent issues with backwards
+               // compatibility.
+  }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return IOStatus::NotSupported("InvalidateCache not supported.");
+  }
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  virtual IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/,
+                             const IOOptions& options, IODebugContext* dbg) {
+    if (strict_bytes_per_sync_) {
+      return Sync(options, dbg);
+    }
+    return IOStatus::OK();
+  }
+
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs. This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  virtual void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+                            IODebugContext* dbg) {
+    if (preallocation_block_size_ == 0) {
+      return;
+    }
+    // If this write would cross one or more preallocation blocks,
+    // determine what the last preallocation block necessary to
+    // cover this write would be and Allocate to that point.
+    const auto block_size = preallocation_block_size_;
+    size_t new_last_preallocated_block =
+        (offset + len + block_size - 1) / block_size;
+    if (new_last_preallocated_block > last_preallocated_block_) {
+      size_t num_spanned_blocks =
+          new_last_preallocated_block - last_preallocated_block_;
+      Allocate(block_size * last_preallocated_block_,
+               block_size * num_spanned_blocks, options, dbg)
+          .PermitUncheckedError();
+      last_preallocated_block_ = new_last_preallocated_block;
+    }
+  }
+
+  // Pre-allocates space for a file.
+  virtual IOStatus Allocate(uint64_t /*offset*/, uint64_t /*len*/,
+                            const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::OK();
+  }
+
+  // If you're adding methods here, remember to add them to
+  // WritableFileWrapper too.
+
+ protected:
+  size_t preallocation_block_size() { return preallocation_block_size_; }
+
+ private:
+  size_t last_preallocated_block_;
+  size_t preallocation_block_size_;
+  // No copying allowed
+  FSWritableFile(const FSWritableFile&);
+  void operator=(const FSWritableFile&);
+
+ protected:
+  Env::IOPriority io_priority_;
+  Env::WriteLifeTimeHint write_hint_;
+  const bool strict_bytes_per_sync_;
+};
+
+// A file abstraction for random reading and writing.
+class FSRandomRWFile {
+ public:
+  FSRandomRWFile() {}
+
+  virtual ~FSRandomRWFile() {}
+
+  // Indicates if the class makes use of direct I/O.
+  // If false, you must pass an aligned buffer to Write().
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate an
+  // aligned buffer for Direct I/O.
+  virtual size_t GetRequiredBufferAlignment() const {
+    return kDefaultPageSize;
+  }
+
+  // Write bytes in `data` at offset `offset`. Returns Status::OK() on success.
+  // Pass an aligned buffer when use_direct_io() returns true.
+  virtual IOStatus Write(uint64_t offset, const Slice& data,
+                         const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // Read up to `n` bytes starting from offset `offset` and store them in
+  // result; the provided `scratch` size should be at least `n`.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // Returns Status::OK() on success.
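+  //
+  // Illustrative use (a sketch; `file` is any open FSRandomRWFile and
+  // error handling is elided):
+  //
+  //   char scratch[4096];
+  //   Slice result;
+  //   IOStatus s = file->Read(0, sizeof(scratch), IOOptions(),
+  //                           &result, scratch, nullptr);
+  //   if (s.ok() && result.size() < sizeof(scratch)) {
+  //     // end of file reached
+  //   }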
+  virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                        Slice* result, char* scratch,
+                        IODebugContext* dbg) const = 0;
+
+  virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0;
+
+  virtual IOStatus Sync(const IOOptions& options, IODebugContext* dbg) = 0;
+
+  virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) {
+    return Sync(options, dbg);
+  }
+
+  virtual IOStatus Close(const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // EXPERIMENTAL
+  // When available, returns the actual temperature for the file. This is
+  // useful in case some outside process moves a file from one tier to another,
+  // though the temperature is generally expected not to change while a file is
+  // open.
+  virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
+
+  // If you're adding methods here, remember to add them to
+  // RandomRWFileWrapper too.
+
+  // No copying allowed
+  FSRandomRWFile(const FSRandomRWFile&) = delete;
+  FSRandomRWFile& operator=(const FSRandomRWFile&) = delete;
+};
+
+// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer.
+// Subclasses should release the mapping upon destruction.
+class FSMemoryMappedFileBuffer {
+ public:
+  FSMemoryMappedFileBuffer(void* _base, size_t _length)
+      : base_(_base), length_(_length) {}
+
+  virtual ~FSMemoryMappedFileBuffer() = 0;
+
+  // We do not want to unmap this twice, so no copying is allowed. We could
+  // make this class movable if desired.
+  FSMemoryMappedFileBuffer(const FSMemoryMappedFileBuffer&) = delete;
+  FSMemoryMappedFileBuffer& operator=(const FSMemoryMappedFileBuffer&) = delete;
+
+  void* GetBase() const { return base_; }
+  size_t GetLen() const { return length_; }
+
+ protected:
+  void* base_;
+  const size_t length_;
+};
+
+// Directory object represents a collection of files and implements
+// filesystem operations that can be executed on directories.
+class FSDirectory {
+ public:
+  virtual ~FSDirectory() {}
+  // Fsync directory. Can be called concurrently from multiple threads.
+  virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // FsyncWithDirOptions after renaming a file. Depending on the filesystem,
+  // it may fsync the directory or just the renamed file (e.g. btrfs). By
+  // default, it just calls directory fsync.
+  virtual IOStatus FsyncWithDirOptions(
+      const IOOptions& options, IODebugContext* dbg,
+      const DirFsyncOptions& /*dir_fsync_options*/) {
+    return Fsync(options, dbg);
+  }
+
+  // Close directory
+  virtual IOStatus Close(const IOOptions& /*options*/,
+                         IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("Close");
+  }
+
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;
+  }
+
+  // If you're adding methods here, remember to add them to
+  // DirectoryWrapper too.
+};
+
+// Below are helpers for wrapping most of the classes in this file.
+// They forward all calls to another instance of the class.
+// Useful when wrapping the default implementations.
+// Typical usage is to inherit your wrapper from *Wrapper, e.g.:
+//
+// class MySequentialFileWrapper : public
+//     ROCKSDB_NAMESPACE::FSSequentialFileWrapper {
+//  public:
+//   MySequentialFileWrapper(ROCKSDB_NAMESPACE::FSSequentialFile* target):
+//       ROCKSDB_NAMESPACE::FSSequentialFileWrapper(target) {}
+//   Status Read(size_t n, FileSystem::IOOptions& options, Slice* result,
+//               char* scratch, FileSystem::IODebugContext* dbg) override {
+//     cout << "Doing a read of size " << n << "!"
+//          << endl;
+//     return ROCKSDB_NAMESPACE::FSSequentialFileWrapper::Read(n, options,
+//                                                             result,
+//                                                             scratch, dbg);
+//   }
+//   // All other methods are forwarded to target_ automatically.
+// };
+//
+// This is often more convenient than inheriting the class directly because
+// (a) Don't have to override and forward all methods - the Wrapper will
+//     forward everything you're not explicitly overriding.
+// (b) Don't need to update the wrapper when more methods are added to the
+//     rocksdb class. Unless you actually want to override the behavior.
+//     (And unless rocksdb people forgot to update the *Wrapper class.)
+
+// An implementation of FileSystem that forwards all calls to another
+// FileSystem. May be useful to clients who wish to override just part of the
+// functionality of another FileSystem.
+class FileSystemWrapper : public FileSystem {
+ public:
+  // Initialize a FileSystemWrapper that delegates all calls to *t
+  explicit FileSystemWrapper(const std::shared_ptr<FileSystem>& t);
+  ~FileSystemWrapper() override {}
+
+  // Return the target to which this FileSystem forwards all calls
+  FileSystem* target() const { return target_.get(); }
+
+  // The following text is boilerplate that forwards all methods to target()
+  IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts,
+                             std::unique_ptr<FSSequentialFile>* r,
+                             IODebugContext* dbg) override {
+    return target_->NewSequentialFile(f, file_opts, r, dbg);
+  }
+  IOStatus NewRandomAccessFile(const std::string& f,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* r,
+                               IODebugContext* dbg) override {
+    return target_->NewRandomAccessFile(f, file_opts, r, dbg);
+  }
+  IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* r,
+                           IODebugContext* dbg) override {
+    return target_->NewWritableFile(f, file_opts, r, dbg);
+  }
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& file_opts,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override {
+    return target_->ReopenWritableFile(fname, file_opts, result, dbg);
+  }
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSWritableFile>* r,
+                             IODebugContext* dbg) override {
+    return target_->ReuseWritableFile(fname, old_fname, file_opts, r, dbg);
+  }
+  IOStatus NewRandomRWFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override {
+    return target_->NewRandomRWFile(fname, file_opts, result, dbg);
+  }
+  IOStatus NewMemoryMappedFileBuffer(
+      const std::string& fname,
+      std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+    return target_->NewMemoryMappedFileBuffer(fname, result);
+  }
+  IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override {
+    return target_->NewDirectory(name, io_opts, result, dbg);
+  }
+  IOStatus FileExists(const std::string& f, const IOOptions& io_opts,
+                      IODebugContext* dbg) override {
+    return target_->FileExists(f, io_opts, dbg);
+  }
+  IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+                       std::vector<std::string>* r,
+                       IODebugContext* dbg) override {
+    return target_->GetChildren(dir, io_opts, r, dbg);
+  }
+  IOStatus GetChildrenFileAttributes(const std::string& dir,
+                                     const IOOptions& options,
+                                     std::vector<FileAttributes>* result,
+                                     IODebugContext* dbg) override {
+    return target_->GetChildrenFileAttributes(dir, options, result, dbg);
+  }
+  IOStatus DeleteFile(const std::string& f, const IOOptions& options,
+                      IODebugContext* dbg) override {
+    return target_->DeleteFile(f,
+                               options, dbg);
+  }
+  IOStatus Truncate(const std::string& fname, size_t size,
+                    const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Truncate(fname, size, options, dbg);
+  }
+  IOStatus CreateDir(const std::string& d, const IOOptions& options,
+                     IODebugContext* dbg) override {
+    return target_->CreateDir(d, options, dbg);
+  }
+  IOStatus CreateDirIfMissing(const std::string& d, const IOOptions& options,
+                              IODebugContext* dbg) override {
+    return target_->CreateDirIfMissing(d, options, dbg);
+  }
+  IOStatus DeleteDir(const std::string& d, const IOOptions& options,
+                     IODebugContext* dbg) override {
+    return target_->DeleteDir(d, options, dbg);
+  }
+  IOStatus GetFileSize(const std::string& f, const IOOptions& options,
+                       uint64_t* s, IODebugContext* dbg) override {
+    return target_->GetFileSize(f, options, s, dbg);
+  }
+
+  IOStatus GetFileModificationTime(const std::string& fname,
+                                   const IOOptions& options,
+                                   uint64_t* file_mtime,
+                                   IODebugContext* dbg) override {
+    return target_->GetFileModificationTime(fname, options, file_mtime, dbg);
+  }
+
+  IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options,
+                           std::string* output_path,
+                           IODebugContext* dbg) override {
+    return target_->GetAbsolutePath(db_path, options, output_path, dbg);
+  }
+
+  IOStatus RenameFile(const std::string& s, const std::string& t,
+                      const IOOptions& options, IODebugContext* dbg) override {
+    return target_->RenameFile(s, t, options, dbg);
+  }
+
+  IOStatus LinkFile(const std::string& s, const std::string& t,
+                    const IOOptions& options, IODebugContext* dbg) override {
+    return target_->LinkFile(s, t, options, dbg);
+  }
+
+  IOStatus NumFileLinks(const std::string& fname, const IOOptions& options,
+                        uint64_t* count, IODebugContext* dbg) override {
+    return target_->NumFileLinks(fname, options, count, dbg);
+  }
+
+  IOStatus AreFilesSame(const std::string& first, const std::string& second,
+                        const IOOptions& options, bool* res,
+                        IODebugContext* dbg) override {
+    return target_->AreFilesSame(first, second, options, res, dbg);
+  }
+
+  IOStatus LockFile(const std::string& f, const IOOptions& options,
+                    FileLock** l, IODebugContext* dbg) override {
+    return target_->LockFile(f, options, l, dbg);
+  }
+
+  IOStatus UnlockFile(FileLock* l, const IOOptions& options,
+                      IODebugContext* dbg) override {
+    return target_->UnlockFile(l, options, dbg);
+  }
+
+  IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+                            IODebugContext* dbg) override {
+    return target_->GetTestDirectory(options, path, dbg);
+  }
+  IOStatus NewLogger(const std::string& fname, const IOOptions& options,
+                     std::shared_ptr<Logger>* result,
+                     IODebugContext* dbg) override {
+    return target_->NewLogger(fname, options, result, dbg);
+  }
+
+  void SanitizeFileOptions(FileOptions* opts) const override {
+    target_->SanitizeFileOptions(opts);
+  }
+
+  FileOptions OptimizeForLogRead(
+      const FileOptions& file_options) const override {
+    return target_->OptimizeForLogRead(file_options);
+  }
+  FileOptions OptimizeForManifestRead(
+      const FileOptions& file_options) const override {
+    return target_->OptimizeForManifestRead(file_options);
+  }
+  FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+                                  const DBOptions& db_options) const override {
+    return target_->OptimizeForLogWrite(file_options, db_options);
+  }
+  FileOptions OptimizeForManifestWrite(
+      const FileOptions& file_options) const override {
+    return target_->OptimizeForManifestWrite(file_options);
+  }
+  FileOptions OptimizeForCompactionTableWrite(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& immutable_ops) const override {
+    return target_->OptimizeForCompactionTableWrite(file_options,
+                                                    immutable_ops);
+  }
+  FileOptions OptimizeForCompactionTableRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const override {
+    return target_->OptimizeForCompactionTableRead(file_options, db_options);
+  }
+  FileOptions OptimizeForBlobFileRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const override {
+    return target_->OptimizeForBlobFileRead(file_options, db_options);
+  }
+  IOStatus GetFreeSpace(const std::string& path, const IOOptions& options,
+                        uint64_t* diskfree, IODebugContext* dbg) override {
+    return target_->GetFreeSpace(path, options, diskfree, dbg);
+  }
+  IOStatus IsDirectory(const std::string& path, const IOOptions& options,
+                       bool* is_dir, IODebugContext* dbg) override {
+    return target_->IsDirectory(path, options, is_dir, dbg);
+  }
+
+  const Customizable* Inner() const override { return target_.get(); }
+  Status PrepareOptions(const ConfigOptions& options) override;
+  std::string SerializeOptions(const ConfigOptions& config_options,
+                               const std::string& header) const override;
+
+  virtual IOStatus Poll(std::vector<void*>& io_handles,
+                        size_t min_completions) override {
+    return target_->Poll(io_handles, min_completions);
+  }
+
+  virtual IOStatus AbortIO(std::vector<void*>& io_handles) override {
+    return target_->AbortIO(io_handles);
+  }
+
+  virtual bool use_async_io() override { return target_->use_async_io(); }
+
+ protected:
+  std::shared_ptr<FileSystem> target_;
+};
+
+class FSSequentialFileWrapper : public FSSequentialFile {
+ public:
+  // Creates a FileWrapper around the input File object, without
+  // taking ownership of the object
+  explicit FSSequentialFileWrapper(FSSequentialFile* t) : target_(t) {}
+
+  FSSequentialFile* target() const { return target_; }
+
+  IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                char* scratch, IODebugContext* dbg) override {
+    return target_->Read(n, options, result, scratch, dbg);
+  }
+  IOStatus Skip(uint64_t n) override { return target_->Skip(n); }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override {
+    return target_->PositionedRead(offset, n, options, result, scratch, dbg);
+  }
+  Temperature GetTemperature() const override {
+    return target_->GetTemperature();
+  }
+
+ private:
+  FSSequentialFile* target_;
+};
+
+class FSSequentialFileOwnerWrapper : public FSSequentialFileWrapper {
+ public:
+  // Creates a FileWrapper around the input File object and takes
+  // ownership of the object
+  explicit FSSequentialFileOwnerWrapper(std::unique_ptr<FSSequentialFile>&& t)
+      : FSSequentialFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+  std::unique_ptr<FSSequentialFile> guard_;
+};
+
+class FSRandomAccessFileWrapper : public FSRandomAccessFile {
+ public:
+  // Creates a FileWrapper around the input File object, without
+  // taking ownership of the object
+  explicit FSRandomAccessFileWrapper(FSRandomAccessFile* t) : target_(t) {}
+
+  FSRandomAccessFile* target() const { return target_; }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    return target_->Read(offset, n, options, result, scratch, dbg);
+  }
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override {
+    return target_->MultiRead(reqs, num_reqs, options, dbg);
+  }
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    return target_->Prefetch(offset, n, options, dbg);
+  }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+  void Hint(AccessPattern pattern) override { target_->Hint(pattern); }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+                     std::function<void(const FSReadRequest&, void*)> cb,
+                     void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+                     IODebugContext* dbg) override {
+    return target()->ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, dbg);
+  }
+  Temperature GetTemperature() const override {
+    return target_->GetTemperature();
+  }
+
+ private:
+  std::unique_ptr<FSRandomAccessFile> guard_;
+  FSRandomAccessFile* target_;
+};
+
+class FSRandomAccessFileOwnerWrapper : public FSRandomAccessFileWrapper {
+ public:
+  // Creates a FileWrapper around the input File object and takes
+  // ownership of the object
+  explicit FSRandomAccessFileOwnerWrapper(
+      std::unique_ptr<FSRandomAccessFile>&& t)
+      : FSRandomAccessFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+  std::unique_ptr<FSRandomAccessFile> guard_;
+};
+
+class FSWritableFileWrapper : public FSWritableFile {
+ public:
+  // Creates a FileWrapper around the input File object, without
+  // taking ownership of the object
+  explicit FSWritableFileWrapper(FSWritableFile* t) : target_(t) {}
+
+  FSWritableFile* target() const { return target_; }
+
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override {
+    return target_->Append(data, options, dbg);
+  }
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  const DataVerificationInfo& verification_info,
+                  IODebugContext* dbg) override {
+    return target_->Append(data, options, verification_info, dbg);
+  }
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override {
+    return target_->PositionedAppend(data, offset, options, dbg);
+  }
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            const DataVerificationInfo& verification_info,
+                            IODebugContext* dbg) override {
+    return target_->PositionedAppend(data, offset, options, verification_info,
+                                     dbg);
+  }
+  IOStatus Truncate(uint64_t size, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    return target_->Truncate(size, options, dbg);
+  }
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Close(options, dbg);
+  }
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Flush(options, dbg);
+  }
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Sync(options, dbg);
+  }
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Fsync(options, dbg);
+  }
+  bool IsSyncThreadSafe() const override {
+    return target_->IsSyncThreadSafe();
+  }
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+
+  void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override {
+    target_->SetWriteLifeTimeHint(hint);
+  }
+
+  Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
+    return target_->GetWriteLifeTimeHint();
+  }
+
+  uint64_t GetFileSize(const IOOptions& options,
+                       IODebugContext* dbg) override {
+    return target_->GetFileSize(options, dbg);
+  }
+
+  void SetPreallocationBlockSize(size_t size) override {
+    target_->SetPreallocationBlockSize(size);
+  }
+
+  void GetPreallocationStatus(size_t* block_size,
+                              size_t* last_allocated_block) override {
+    target_->GetPreallocationStatus(block_size, last_allocated_block);
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+
+  IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+                     IODebugContext* dbg) override {
+    return target_->RangeSync(offset, nbytes, options, dbg);
+  }
+
+  void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    target_->PrepareWrite(offset, len, options, dbg);
+  }
+
+  IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    return target_->Allocate(offset, len, options, dbg);
+  }
+
+ private:
+  FSWritableFile* target_;
+};
+
+class FSWritableFileOwnerWrapper : public FSWritableFileWrapper {
+ public:
+  // Creates a FileWrapper around the input File object and takes
+  // ownership of the object
+  explicit FSWritableFileOwnerWrapper(std::unique_ptr<FSWritableFile>&& t)
+      : FSWritableFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+  std::unique_ptr<FSWritableFile> guard_;
+};
+
+class FSRandomRWFileWrapper : public FSRandomRWFile {
+ public:
+  // Creates a FileWrapper around the input File object, without
+  // taking ownership of the object
+  explicit FSRandomRWFileWrapper(FSRandomRWFile* t) : target_(t) {}
+
+  FSRandomRWFile* target() const { return target_; }
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+                 IODebugContext* dbg) override {
+    return target_->Write(offset, data, options, dbg);
+  }
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    return target_->Read(offset, n, options, result, scratch, dbg);
+  }
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Flush(options, dbg);
+  }
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Sync(options, dbg);
+  }
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Fsync(options, dbg);
+  }
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Close(options, dbg);
+  }
+  Temperature GetTemperature() const override {
+    return target_->GetTemperature();
+  }
+
+ private:
+  FSRandomRWFile* target_;
+};
+
+class FSRandomRWFileOwnerWrapper : public FSRandomRWFileWrapper {
+ public:
+  // Creates a FileWrapper around the input File object and takes
+  // ownership of the object
+  explicit FSRandomRWFileOwnerWrapper(std::unique_ptr<FSRandomRWFile>&& t)
+      : FSRandomRWFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+  std::unique_ptr<FSRandomRWFile> guard_;
+};
+
+class FSDirectoryWrapper : public FSDirectory {
+ public:
+  // Creates a FileWrapper around the input File object and takes
+  // ownership of the object
+  explicit FSDirectoryWrapper(std::unique_ptr<FSDirectory>&& t)
+      : guard_(std::move(t)) {
+    target_ = guard_.get();
+  }
+
+  // Creates a FileWrapper around the input File object, without
+  // taking ownership of the object
+  explicit FSDirectoryWrapper(FSDirectory* t) : target_(t) {}
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Fsync(options, dbg);
+  }
+
+  IOStatus FsyncWithDirOptions(
+      const IOOptions& options, IODebugContext* dbg,
+      const DirFsyncOptions& dir_fsync_options) override {
+    return target_->FsyncWithDirOptions(options, dbg, dir_fsync_options);
+  }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Close(options, dbg);
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+ private:
+  std::unique_ptr<FSDirectory> guard_;
+  FSDirectory* target_;
+};
+
+// A utility routine: write "data" to the named file.
+extern IOStatus WriteStringToFile(FileSystem* fs, const Slice& data,
+                                  const std::string& fname,
+                                  bool should_sync = false);
+
+// A utility routine: read contents of named file into *data
+extern IOStatus ReadFileToString(FileSystem* fs, const std::string& fname,
+                                 std::string* data);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/filter_policy.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/filter_policy.h
new file mode 100644
index 0000000000..954d15b4a1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/filter_policy.h
@@ -0,0 +1,206 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A database can be configured with a custom FilterPolicy object.
+// This object is responsible for creating a small filter from a set
+// of keys. These filters are stored in rocksdb and are consulted
+// automatically by rocksdb to decide whether or not to read some
+// information from disk. In many cases, a filter can cut down the
+// number of disk seeks from a handful to a single disk seek per
+// DB::Get() call.
+//
+// Most people will want to use the builtin bloom filter support (see
+// NewBloomFilterPolicy() below).
+
+#pragma once
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+struct BlockBasedTableOptions;
+struct ConfigOptions;
+
+// As of RocksDB 7.0, the details of these classes are internal
+class FilterBitsBuilder;
+class FilterBitsReader;
+
+// Contextual information passed to BloomFilterPolicy at filter building time.
+// Used in overriding FilterPolicy::GetBuilderWithContext(). References other
+// structs because this is expected to be a temporary, stack-allocated object.
+struct FilterBuildingContext {
+  // This constructor is for internal use only and subject to change.
+  FilterBuildingContext(const BlockBasedTableOptions& table_options);
+
+  // Options for the table being built
+  const BlockBasedTableOptions& table_options;
+
+  // BEGIN from (DB|ColumnFamily)Options in effect at table creation time
+  CompactionStyle compaction_style = kCompactionStyleLevel;
+
+  // Number of LSM levels, or -1 if unknown
+  int num_levels = -1;
+
+  // An optional logger for reporting errors, warnings, etc.
+  Logger* info_log = nullptr;
+  // END from (DB|ColumnFamily)Options
+
+  // Name of the column family for the table (or empty string if unknown)
+  // TODO: consider changing to Slice
+  std::string column_family_name;
+
+  // The table level at time of constructing the SST file, or -1 if unknown
+  // or N/A as in SstFileWriter. (The table file could later be used at a
+  // different level.)
+  int level_at_creation = -1;
+
+  // True if known to be going into bottommost sorted run for applicable
+  // key range (which might not even be last level with data). False
+  // otherwise.
+  bool is_bottommost = false;
+
+  // Reason for creating the file with the filter
+  TableFileCreationReason reason = TableFileCreationReason::kMisc;
+};
+
+// Determines what kind of filter (if any) to generate in SST files, and under
+// which conditions. API users can create custom filter policies that
+// defer to other built-in policies (see NewBloomFilterPolicy and
+// NewRibbonFilterPolicy) based on the context provided to
+// GetBuilderWithContext.
+class FilterPolicy : public Customizable {
+ public:
+  virtual ~FilterPolicy();
+  static const char* Type() { return "FilterPolicy"; }
+
+  // The name used for identifying whether a filter on disk is readable
+  // by this FilterPolicy. If this FilterPolicy is part of a family that
+  // can read each other's filters, such as the built-in BloomFilterPolicy and
+  // RibbonFilterPolicy, the CompatibilityName is a shared family name,
+  // while kinds of filters in the family can have distinct Customizable
+  // Names. This function is pure virtual so that wrappers around built-in
+  // policies are prompted to defer to CompatibilityName() of the wrapped
+  // policy, which is important for compatibility.
+  //
+  // For custom filter policies that are not part of a read-compatible
+  // family (rare), implementations may return Name().
+  virtual const char* CompatibilityName() const = 0;
+
+  // Creates a new FilterPolicy based on the input value string and returns
+  // the result. The value might be an ID, an ID with properties, or an
+  // old-style policy string. The value describes the FilterPolicy being
+  // created.
+  // For BloomFilters, value may be a ":"-delimited value of the form:
+  //   "bloomfilter:[bits_per_key]",
+  // e.g. "bloomfilter:4"
+  // The above string is equivalent to calling NewBloomFilterPolicy(4).
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value,
+                                 std::shared_ptr<FilterPolicy>* result);
+
+  // Return a new FilterBitsBuilder for constructing full or partitioned
+  // filter blocks, or return nullptr to indicate "no filter". Custom
+  // implementations should defer to a built-in FilterPolicy to get a
+  // new FilterBitsBuilder, but the FilterBuildingContext can be used
+  // to decide which built-in FilterPolicy to defer to.
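+  //
+  // For example (an illustrative sketch; `bloom_` and `ribbon_` are
+  // hypothetical members holding built-in policies), a custom policy
+  // might choose Ribbon for bottommost data and Bloom elsewhere:
+  //
+  //   FilterBitsBuilder* GetBuilderWithContext(
+  //       const FilterBuildingContext& ctx) const override {
+  //     const FilterPolicy* p =
+  //         ctx.is_bottommost ? ribbon_.get() : bloom_.get();
+  //     return p->GetBuilderWithContext(ctx);
+  //   }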
+  virtual FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext&) const = 0;
+
+  // Return a new FilterBitsReader for full or partitioned filter blocks.
+  // Caller retains ownership of any buffer pointed to by the input Slice.
+  // Custom implementations should defer to GetFilterBitsReader on any
+  // built-in FilterPolicy, which can read filters generated by any other
+  // built-in FilterPolicy.
+  virtual FilterBitsReader* GetFilterBitsReader(
+      const Slice& /*contents*/) const = 0;
+};
+
+// Return a new filter policy that uses a bloom filter with approximately
+// the specified number of bits per key. See
+// https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter
+//
+// bits_per_key: average bits allocated per key in bloom filter. A good
+// choice is 9.9, which yields a filter with ~ 1% false positive rate.
+// When format_version < 5, the value will be rounded to the nearest
+// integer. Recommend using no more than three decimal digits after the
+// decimal point, as in 6.667.
+//
+// To avoid configurations that are unlikely to produce good filtering
+// value for the CPU overhead, bits_per_key < 0.5 is rounded down to 0.0,
+// which means "generate no filter", and 0.5 <= bits_per_key < 1.0 is
+// rounded up to 1.0, for a 62% FP rate.
+//
+// The caller is responsible for eventually deleting the result, though
+// this is typically handled automatically with BlockBasedTableOptions:
+//   table_options.filter_policy.reset(NewBloomFilterPolicy(...));
+//
+// As of RocksDB 7.0, the use_block_based_builder parameter is ignored.
+// (The old, inefficient block-based filter is no longer accessible in
+// the public API.)
+//
+// Note: if you are using a custom comparator that ignores some parts
+// of the keys being compared, you must not use NewBloomFilterPolicy()
+// and must provide your own FilterPolicy that also ignores the
+// corresponding parts of the keys. For example, if the comparator
+// ignores trailing spaces, it would be incorrect to use a
+// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
+// trailing spaces in keys.
+extern const FilterPolicy* NewBloomFilterPolicy(
+    double bits_per_key, bool IGNORED_use_block_based_builder = false);
+
+// A new Bloom alternative that saves about 30% space compared to
+// Bloom filters, with similar query times but roughly 3-4x CPU time
+// and 3x temporary space usage during construction. For example, if
+// you pass in 10 for bloom_equivalent_bits_per_key, you'll get the same
+// ~0.95% FP rate as the Bloom filter, but using only about 7 bits per key.
+//
+// The space savings of Ribbon filters make sense for lower (higher
+// numbered; larger; longer-lived) levels of the LSM, whereas the speed of
+// Bloom filters makes sense for the highest levels of the LSM. Setting
+// bloom_before_level allows for this design with Level and Universal
+// compaction styles. For example, bloom_before_level=1 means that Bloom
+// filters will be used in level 0, including flushes, and Ribbon
+// filters elsewhere, including FIFO compaction and external SST files.
+// For this option, memtable flushes are considered level -1 (so that
+// flushes can be distinguished from intra-L0 compaction).
+// bloom_before_level=0 (default) -> Generate Bloom filters only for
+// flushes under Level and Universal compaction styles.
+// bloom_before_level=-1 -> Always generate Ribbon filters (except in
+// some extreme or exceptional cases).
+//
+// Ribbon filters are compatible with RocksDB >= 6.15.0.
+// Earlier versions reading the data will behave as if no filter was used
+// (degraded performance until compaction rebuilds filters). All
+// built-in FilterPolicies (Bloom or Ribbon) are able to read other
+// kinds of built-in filters.
+//
+// Note: the current Ribbon filter schema uses some extra resources
+// when constructing very large filters. For example, for 100 million
+// keys in a single filter (one SST file without partitioned filters),
+// 3GB of temporary, untracked memory is used, vs. 1GB for Bloom.
+// However, the savings in filter space from just ~60 open SST files
+// makes up for the additional temporary memory use.
+//
+// Also consider using optimize_filters_for_memory to save filter
+// memory.
+extern const FilterPolicy* NewRibbonFilterPolicy(
+    double bloom_equivalent_bits_per_key, int bloom_before_level = 0);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/flush_block_policy.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/flush_block_policy.h
new file mode 100644
index 0000000000..7a5dd957e4
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/flush_block_policy.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class BlockBuilder;
+struct ConfigOptions;
+struct Options;
+
+// FlushBlockPolicy provides a configurable way to determine when to flush a
+// block in the block based tables.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FlushBlockPolicy {
+ public:
+  // Keep track of the key/value sequences and return the boolean value to
+  // determine if the table builder should flush the current data block.
+  virtual bool Update(const Slice& key, const Slice& value) = 0;
+
+  virtual ~FlushBlockPolicy() {}
+};
+
+class FlushBlockPolicyFactory : public Customizable {
+ public:
+  static const char* Type() { return "FlushBlockPolicyFactory"; }
+
+  // Creates a FlushBlockPolicyFactory based on the input value.
+  // By default, this method can create EveryKey or BySize PolicyFactory,
+  // which take no config_options.
+  static Status CreateFromString(
+      const ConfigOptions& config_options, const std::string& value,
+      std::shared_ptr<FlushBlockPolicyFactory>* result);
+
+  // Return a new block flush policy that flushes data blocks by data size.
+  // FlushBlockPolicy may need to access the metadata of the data block
+  // builder to determine when to flush the blocks.
+  //
+  // Callers must delete the result after any database that is using the
+  // result has been closed.
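+  //
+  // As an illustrative sketch, a custom policy that flushes after every
+  // `n` keys (ignoring block size entirely) could look like:
+  //
+  //   class EveryNKeysPolicy : public FlushBlockPolicy {
+  //    public:
+  //     explicit EveryNKeysPolicy(uint64_t n) : n_(n) {}
+  //     bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+  //       return ++count_ % n_ == 0;
+  //     }
+  //
+  //    private:
+  //     const uint64_t n_;
+  //     uint64_t count_ = 0;
+  //   };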
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& table_options,
+      const BlockBuilder& data_block_builder) const = 0;
+
+  virtual ~FlushBlockPolicyFactory() {}
+};
+
+class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  FlushBlockBySizePolicyFactory();
+
+  static const char* kClassName() { return "FlushBlockBySizePolicyFactory"; }
+  const char* Name() const override { return kClassName(); }
+
+  FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& table_options,
+      const BlockBuilder& data_block_builder) const override;
+
+  static FlushBlockPolicy* NewFlushBlockPolicy(
+      const uint64_t size, const int deviation,
+      const BlockBuilder& data_block_builder);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/functor_wrapper.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/functor_wrapper.h
new file mode 100644
index 0000000000..17b021bf73
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/functor_wrapper.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <utility>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace detail {
+template <size_t... Is>
+struct IndexSequence {};
+
+template <size_t N, size_t... Next>
+struct IndexSequenceHelper
+    : public IndexSequenceHelper<N - 1U, N - 1U, Next...> {};
+
+template <size_t... Next>
+struct IndexSequenceHelper<0U, Next...> {
+  using type = IndexSequence<Next...>;
+};
+
+template <size_t N>
+using make_index_sequence = typename IndexSequenceHelper<N>::type;
+
+template <typename Function, typename Tuple, size_t... I>
+void call(Function f, Tuple t, IndexSequence<I...>) {
+  f(std::get<I>(t)...);
+}
+
+template <typename Function, typename Tuple>
+void call(Function f, Tuple t) {
+  static constexpr auto size = std::tuple_size<Tuple>::value;
+  call(f, t, make_index_sequence<size>{});
+}
+}  // namespace detail
+
+template <typename... Args>
+class FunctorWrapper {
+ public:
+  explicit FunctorWrapper(std::function<void(Args...)> functor, Args &&...args)
+      : functor_(std::move(functor)), args_(std::forward<Args>(args)...) {}
+
+  void invoke() { detail::call(functor_, args_); }
+
+ private:
+  std::function<void(Args...)> functor_;
+  std::tuple<Args...> args_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/io_status.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/io_status.h
new file mode 100644
index 0000000000..0bf5e939a6
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/io_status.h
@@ -0,0 +1,244 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// An IOStatus encapsulates the result of an operation. It may indicate
+// success, or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on an IOStatus without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same IOStatus must use
+// external synchronization.
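+//
+// For example (a sketch), a FileSystem implementation backed by remote
+// storage might tag a transient failure as retryable so that upper
+// layers can retry the operation:
+//
+//   IOStatus s = IOStatus::IOError("connection reset");
+//   s.SetRetryable(true);
+//   s.SetScope(IOStatus::kIOErrorScopeFile);
+//   return s;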
+
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/slice.h"
+#ifdef OS_WIN
+#include <string.h>
+#endif
+#include <cstring>
+
+#include "status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IOStatus : public Status {
+ public:
+  using Code = Status::Code;
+  using SubCode = Status::SubCode;
+
+  enum IOErrorScope : unsigned char {
+    kIOErrorScopeFileSystem,
+    kIOErrorScopeFile,
+    kIOErrorScopeRange,
+    kIOErrorScopeMax,
+  };
+
+  // Create a success status.
+  IOStatus() : IOStatus(kOk, kNone) {}
+  ~IOStatus() {}
+
+  // Copy the specified status.
+  IOStatus(const IOStatus& s);
+  IOStatus& operator=(const IOStatus& s);
+  IOStatus(IOStatus&& s) noexcept;
+  IOStatus& operator=(IOStatus&& s) noexcept;
+  bool operator==(const IOStatus& rhs) const;
+  bool operator!=(const IOStatus& rhs) const;
+
+  void SetRetryable(bool retryable) { retryable_ = retryable; }
+  void SetDataLoss(bool data_loss) { data_loss_ = data_loss; }
+  void SetScope(IOErrorScope scope) {
+    scope_ = static_cast<unsigned char>(scope);
+  }
+
+  bool GetRetryable() const { return retryable_; }
+  bool GetDataLoss() const { return data_loss_; }
+  IOErrorScope GetScope() const { return static_cast<IOErrorScope>(scope_); }
+
+  // Return a success status.
+  static IOStatus OK() { return IOStatus(); }
+
+  static IOStatus NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kNotSupported, msg, msg2);
+  }
+  static IOStatus NotSupported(SubCode msg = kNone) {
+    return IOStatus(kNotSupported, msg);
+  }
+
+  // Return error status of an appropriate type.
+  static IOStatus NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kNotFound, msg, msg2);
+  }
+  // Fast path for not found without malloc.
+  static IOStatus NotFound(SubCode msg = kNone) {
+    return IOStatus(kNotFound, msg);
+  }
+
+  static IOStatus Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kCorruption, msg, msg2);
+  }
+  static IOStatus Corruption(SubCode msg = kNone) {
+    return IOStatus(kCorruption, msg);
+  }
+
+  static IOStatus InvalidArgument(const Slice& msg,
+                                  const Slice& msg2 = Slice()) {
+    return IOStatus(kInvalidArgument, msg, msg2);
+  }
+  static IOStatus InvalidArgument(SubCode msg = kNone) {
+    return IOStatus(kInvalidArgument, msg);
+  }
+
+  static IOStatus IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kIOError, msg, msg2);
+  }
+  static IOStatus IOError(SubCode msg = kNone) {
+    return IOStatus(kIOError, msg);
+  }
+
+  static IOStatus Busy(SubCode msg = kNone) { return IOStatus(kBusy, msg); }
+  static IOStatus Busy(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kBusy, msg, msg2);
+  }
+
+  static IOStatus TimedOut(SubCode msg = kNone) {
+    return IOStatus(kTimedOut, msg);
+  }
+  static IOStatus TimedOut(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kTimedOut, msg, msg2);
+  }
+
+  static IOStatus NoSpace() { return IOStatus(kIOError, kNoSpace); }
+  static IOStatus NoSpace(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kIOError, kNoSpace, msg, msg2);
+  }
+
+  static IOStatus PathNotFound() { return IOStatus(kIOError, kPathNotFound); }
+  static IOStatus PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kIOError, kPathNotFound, msg, msg2);
+  }
+
+  static IOStatus IOFenced() { return IOStatus(kIOError, kIOFenced); }
+  static IOStatus IOFenced(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kIOError, kIOFenced, msg, msg2);
+  }
+
+  static IOStatus Aborted(SubCode msg = kNone) {
+    return IOStatus(kAborted, msg);
+  }
+  static IOStatus Aborted(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kAborted, msg, msg2);
+  }
+
+  // Return a string representation of this status suitable for printing.
+  // Returns the string "OK" for success.
+  // std::string ToString() const;
+
+ private:
+  friend IOStatus status_to_io_status(Status&&);
+
+  explicit IOStatus(Code _code, SubCode _subcode = kNone)
+      : Status(_code, _subcode, false, false, kIOErrorScopeFileSystem) {}
+
+  IOStatus(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2);
+  IOStatus(Code _code, const Slice& msg, const Slice& msg2)
+      : IOStatus(_code, kNone, msg, msg2) {}
+};
+
+inline IOStatus::IOStatus(Code _code, SubCode _subcode, const Slice& msg,
+                          const Slice& msg2)
+    : Status(_code, _subcode, false, false, kIOErrorScopeFileSystem) {
+  assert(code_ != kOk);
+  assert(subcode_ != kMaxSubCode);
+  const size_t len1 = msg.size();
+  const size_t len2 = msg2.size();
+  const size_t size = len1 + (len2 ? (2 + len2) : 0);
+  char* const result = new char[size + 1];  // +1 for null terminator
+  memcpy(result, msg.data(), len1);
+  if (len2) {
+    result[len1] = ':';
+    result[len1 + 1] = ' ';
+    memcpy(result + len1 + 2, msg2.data(), len2);
+  }
+  result[size] = '\0';  // null terminator for C style string
+  state_.reset(result);
+}
+
+inline IOStatus::IOStatus(const IOStatus& s) : Status(s.code_, s.subcode_) {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+  s.checked_ = true;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  retryable_ = s.retryable_;
+  data_loss_ = s.data_loss_;
+  scope_ = s.scope_;
+  state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+}
+inline IOStatus& IOStatus::operator=(const IOStatus& s) {
+  // The following condition catches both aliasing (when this == &s),
+  // and the common case where both s and *this are ok.
+  if (this != &s) {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+    s.checked_ = true;
+    checked_ = false;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+    code_ = s.code_;
+    subcode_ = s.subcode_;
+    retryable_ = s.retryable_;
+    data_loss_ = s.data_loss_;
+    scope_ = s.scope_;
+    state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+  }
+  return *this;
+}
+
+inline IOStatus::IOStatus(IOStatus&& s) noexcept : IOStatus() {
+  *this = std::move(s);
+}
+
+inline IOStatus& IOStatus::operator=(IOStatus&& s) noexcept {
+  if (this != &s) {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+    s.checked_ = true;
+    checked_ = false;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+    code_ = std::move(s.code_);
+    s.code_ = kOk;
+    subcode_ = std::move(s.subcode_);
+    s.subcode_ = kNone;
+    retryable_ = s.retryable_;
+    data_loss_ = s.data_loss_;
+    scope_ = s.scope_;
+    s.scope_ = kIOErrorScopeFileSystem;
+    state_ = std::move(s.state_);
+  }
+  return *this;
+}
+
+inline bool IOStatus::operator==(const IOStatus& rhs) const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+  checked_ = true;
+  rhs.checked_ = true;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  return (code_ == rhs.code_);
+}
+
+inline bool IOStatus::operator!=(const IOStatus& rhs) const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+  checked_ = true;
+  rhs.checked_ = true;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  return !(*this == rhs);
+}
+
+inline IOStatus status_to_io_status(Status&& status) {
+  IOStatus io_s;
+  Status& s = io_s;
+  s = std::move(status);
+  return io_s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/iostats_context.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/iostats_context.h
new file mode 100644
index 0000000000..559d44c57a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/iostats_context.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <stdint.h>
+
+#include <string>
+
+#include "rocksdb/perf_level.h"
+
+// A thread local context for gathering io-stats efficiently and transparently.
+// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
+
+namespace ROCKSDB_NAMESPACE {
+
+// EXPERIMENTAL: the IO statistics for tiered storage. It matches with each
+// item in the Temperature class.
+struct FileIOByTemperature {
+  // the number of bytes read from a Temperature::kHot file
+  uint64_t hot_file_bytes_read;
+  // the number of bytes read from a Temperature::kWarm file
+  uint64_t warm_file_bytes_read;
+  // the number of bytes read from a Temperature::kCold file
+  uint64_t cold_file_bytes_read;
+  // total number of reads from a Temperature::kHot file
+  uint64_t hot_file_read_count;
+  // total number of reads from a Temperature::kWarm file
+  uint64_t warm_file_read_count;
+  // total number of reads from a Temperature::kCold file
+  uint64_t cold_file_read_count;
+  // reset all the statistics to 0.
+  void Reset() {
+    hot_file_bytes_read = 0;
+    warm_file_bytes_read = 0;
+    cold_file_bytes_read = 0;
+    hot_file_read_count = 0;
+    warm_file_read_count = 0;
+    cold_file_read_count = 0;
+  }
+};
+
+struct IOStatsContext {
+  // reset all io-stats counters to zero
+  void Reset();
+
+  std::string ToString(bool exclude_zero_counters = false) const;
+
+  // the thread pool id
+  uint64_t thread_pool_id;
+
+  // number of bytes that have been written.
+  uint64_t bytes_written;
+  // number of bytes that have been read.
+  uint64_t bytes_read;
+
+  // time spent in open() and fopen().
+  uint64_t open_nanos;
+  // time spent in fallocate().
+  uint64_t allocate_nanos;
+  // time spent in write() and pwrite().
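+  // For example (an illustrative sketch), once timing is enabled these
+  // counters can be sampled around a batch of writes:
+  //
+  //   SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+  //   get_iostats_context()->Reset();
+  //   // ... perform file writes ...
+  //   uint64_t spent = get_iostats_context()->write_nanos;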
+  uint64_t write_nanos;
+  // time spent in read() and pread()
+  uint64_t read_nanos;
+  // time spent in sync_file_range().
+  uint64_t range_sync_nanos;
+  // time spent in fsync
+  uint64_t fsync_nanos;
+  // time spent in preparing write (fallocate etc).
+  uint64_t prepare_write_nanos;
+  // time spent in Logger::Logv().
+  uint64_t logger_nanos;
+  // CPU time spent in write() and pwrite()
+  uint64_t cpu_write_nanos;
+  // CPU time spent in read() and pread()
+  uint64_t cpu_read_nanos;
+
+  FileIOByTemperature file_io_stats_by_temperature;
+
+  // Whether iostats tracking follows PerfLevel is not consistent: the timer
+  // counters follow it, but BackupEngine relies on the counter metrics to
+  // always be there. Here we create a backdoor option to disable some
+  // counters, so that some existing stats are not polluted by file
+  // operations, such as logging, by turning this off.
+  bool disable_iostats = false;
+};
+
+// If RocksDB is compiled with -DNIOSTATS_CONTEXT, then a pointer to a global,
+// non-thread-local IOStatsContext object will be returned. Attempts to update
+// this object will be ignored, and reading from it will also be a no-op.
+// Otherwise, a pointer to a thread-local IOStatsContext object will be
+// returned.
+//
+// This function never returns nullptr.
+IOStatsContext* get_iostats_context();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/iterator.h
new file mode 100644
index 0000000000..9d4c9f73a1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/iterator.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface. Multiple implementations
+// are provided by this library. In particular, iterators are provided
+// to access the contents of a Table or a DB.
+//
+// Multiple threads can invoke const methods on an Iterator without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Iterator must use
+// external synchronization.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/cleanable.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/wide_columns.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Iterator : public Cleanable {
+ public:
+  Iterator() {}
+  // No copying allowed
+  Iterator(const Iterator&) = delete;
+  void operator=(const Iterator&) = delete;
+
+  virtual ~Iterator() {}
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid. This method returns true iff the iterator is valid.
+  // Always returns false if !status().ok().
+  virtual bool Valid() const = 0;
+
+  // Position at the first key in the source. The iterator is Valid()
+  // after this call iff the source is not empty.
+  virtual void SeekToFirst() = 0;
+
+  // Position at the last key in the source. The iterator is
+  // Valid() after this call iff the source is not empty.
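+  //
+  // A typical forward scan (an illustrative sketch; `it` is any Iterator
+  // and `Process` is a hypothetical consumer) combines the positioning
+  // calls with Valid()/Next():
+  //
+  //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+  //     Process(it->key(), it->value());
+  //   }
+  //   if (!it->status().ok()) { /* handle scan error */ }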
+  virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that is at or past target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  // All Seek*() methods clear any error status() that the iterator had prior
+  // to the call; after the seek, status() indicates only the error (if any)
+  // that happened during the seek, not any past errors.
+  // Target does not contain timestamp.
+  virtual void Seek(const Slice& target) = 0;
+
+  // Position at the last key in the source that is at or before target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or before target.
+  // Target does not contain timestamp.
+  virtual void SeekForPrev(const Slice& target) = 0;
+
+  // Moves to the next entry in the source. After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Moves to the previous entry in the source. After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in the
+  // source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry. The underlying storage for
+  // the returned slice is valid only until the next modification of the
+  // iterator (i.e. the next SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev
+  // operation).
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return the value for the current entry. If the entry is a plain key-value,
+  // return the value as-is; if it is a wide-column entity, return the value of
+  // the default anonymous column (see kDefaultWideColumnName) if any, or an
+  // empty value otherwise. The underlying storage for the returned slice is
+  // valid only until the next modification of the iterator (i.e. the next
+  // SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev operation).
+  // REQUIRES: Valid()
+  virtual Slice value() const = 0;
+
+  // Return the wide columns for the current entry. If the entry is a
+  // wide-column entity, return it as-is; if it is a plain key-value, return it
+  // as an entity with a single anonymous column (see kDefaultWideColumnName)
+  // which contains the value. The underlying storage for the returned
+  // structure is valid only until the next modification of the iterator (i.e.
+  // the next SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev operation).
+  // REQUIRES: Valid()
+  virtual const WideColumns& columns() const {
+    assert(false);
+    return kNoWideColumns;
+  }
+
+  // If an error has occurred, return it. Else return an ok status.
+  // If non-blocking IO is requested and this operation cannot be
+  // satisfied without doing some IO, then this returns Status::Incomplete().
+  virtual Status status() const = 0;
+
+  // If supported, renew the iterator to represent the latest state. The
+  // iterator will be invalidated after the call. Not supported if
+  // ReadOptions.snapshot is given when creating the iterator.
+  virtual Status Refresh() {
+    return Status::NotSupported("Refresh() is not supported");
+  }
+
+  // Property "rocksdb.iterator.is-key-pinned":
+  //   If returning "1", this means that the Slice returned by key() is valid
+  //   as long as the iterator is not deleted.
+  //   It is guaranteed to always return "1" if
+  //     - Iterator created with ReadOptions::pin_data = true
+  //     - DB tables were created with
+  //       BlockBasedTableOptions::use_delta_encoding = false.
+ // Property "rocksdb.iterator.super-version-number": + // LSM version used by the iterator. The same format as DB Property + // kCurrentSuperVersionNumber. See its comment for more information. + // Property "rocksdb.iterator.internal-key": + // Get the user-key portion of the internal key at which the iteration + // stopped. + virtual Status GetProperty(std::string prop_name, std::string* prop); + + virtual Slice timestamp() const { + assert(false); + return Slice(); + } +}; + +// Return an empty iterator (yields nothing). +extern Iterator* NewEmptyIterator(); + +// Return an empty iterator with the specified status. +extern Iterator* NewErrorIterator(const Status& status); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/ldb_tool.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/ldb_tool.h new file mode 100644 index 0000000000..b8f2e222fa --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/ldb_tool.h @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +// An interface for converting a slice to a readable string +class SliceFormatter { + public: + virtual ~SliceFormatter() {} + virtual std::string Format(const Slice& s) const = 0; +}; + +// Options for customizing ldb tool (beyond the DB Options) +struct LDBOptions { + // Create LDBOptions with default values for all fields + LDBOptions(); + + // Key formatter that converts a slice to a readable string. + // Default: Slice::ToString() + std::shared_ptr key_formatter; + + std::string print_help_header = "ldb - RocksDB Tool"; +}; + +class LDBTool { + public: + void Run( + int argc, char** argv, Options db_options = Options(), + const LDBOptions& ldb_options = LDBOptions(), + const std::vector* column_families = nullptr); +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/listener.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/listener.h new file mode 100644 index 0000000000..87bc678693 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/listener.h @@ -0,0 +1,840 @@ +// Copyright (c) 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/listener.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/listener.h
new file mode 100644
index 0000000000..87bc678693
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/listener.h
@@ -0,0 +1,840 @@
+// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#pragma once
+
+#include <chrono>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+using TablePropertiesCollection =
+    std::unordered_map<std::string, std::shared_ptr<const TableProperties>>;
+
+class DB;
+class ColumnFamilyHandle;
+class Status;
+struct CompactionJobStats;
+
+struct FileCreationBriefInfo {
+  FileCreationBriefInfo() = default;
+  FileCreationBriefInfo(const std::string& _db_name,
+                        const std::string& _cf_name,
+                        const std::string& _file_path, int _job_id)
+      : db_name(_db_name),
+        cf_name(_cf_name),
+        file_path(_file_path),
+        job_id(_job_id) {}
+  // the name of the database where the file was created.
+  std::string db_name;
+  // the name of the column family where the file was created.
+  std::string cf_name;
+  // the path to the created file.
+  std::string file_path;
+  // the id of the job (which could be flush or compaction) that
+  // created the file.
+  int job_id = 0;
+};
+
+struct TableFileCreationBriefInfo : public FileCreationBriefInfo {
+  // reason of creating the table.
+  TableFileCreationReason reason;
+};
+
+struct TableFileCreationInfo : public TableFileCreationBriefInfo {
+  TableFileCreationInfo() = default;
+  explicit TableFileCreationInfo(TableProperties&& prop)
+      : table_properties(prop) {}
+  // the size of the file.
+  uint64_t file_size;
+  // Detailed properties of the created file.
+  TableProperties table_properties;
+  // The status indicating whether the creation was successful or not.
+  Status status;
+  // The checksum of the table file being created
+  std::string file_checksum;
+  // The checksum function name of the checksum generator used for this table
+  // file
+  std::string file_checksum_func_name;
+};
+
+struct BlobFileCreationBriefInfo : public FileCreationBriefInfo {
+  BlobFileCreationBriefInfo(const std::string& _db_name,
+                            const std::string& _cf_name,
+                            const std::string& _file_path, int _job_id,
+                            BlobFileCreationReason _reason)
+      : FileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id),
+        reason(_reason) {}
+  // reason of creating the blob file.
+  BlobFileCreationReason reason;
+};
+
+struct BlobFileCreationInfo : public BlobFileCreationBriefInfo {
+  BlobFileCreationInfo(const std::string& _db_name, const std::string& _cf_name,
+                       const std::string& _file_path, int _job_id,
+                       BlobFileCreationReason _reason,
+                       uint64_t _total_blob_count, uint64_t _total_blob_bytes,
+                       Status _status, const std::string& _file_checksum,
+                       const std::string& _file_checksum_func_name)
+      : BlobFileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id,
+                                  _reason),
+        total_blob_count(_total_blob_count),
+        total_blob_bytes(_total_blob_bytes),
+        status(_status),
+        file_checksum(_file_checksum),
+        file_checksum_func_name(_file_checksum_func_name) {}
+
+  // the number of blobs in the file.
+  uint64_t total_blob_count;
+  // the total bytes in the file.
+  uint64_t total_blob_bytes;
+  // The status indicating whether the creation was successful or not.
+  Status status;
+  // The checksum of the blob file being created.
+  std::string file_checksum;
+  // The checksum function name of the checksum generator used for this blob
+  // file.
+  std::string file_checksum_func_name;
+};
+
+enum class CompactionReason : int {
+  kUnknown = 0,
+  // [Level] number of L0 files > level0_file_num_compaction_trigger
+  kLevelL0FilesNum,
+  // [Level] total size of level > MaxBytesForLevel()
+  kLevelMaxLevelSize,
+  // [Universal] Compacting for size amplification
+  kUniversalSizeAmplification,
+  // [Universal] Compacting for size ratio
+  kUniversalSizeRatio,
+  // [Universal] number of sorted runs > level0_file_num_compaction_trigger
+  kUniversalSortedRunNum,
+  // [FIFO] total size > max_table_files_size
+  kFIFOMaxSize,
+  // [FIFO] reduce number of files.
+  kFIFOReduceNumFiles,
+  // [FIFO] files with creation time < (current_time - interval)
+  kFIFOTtl,
+  // Manual compaction
+  kManualCompaction,
+  // DB::SuggestCompactRange() marked files for compaction
+  kFilesMarkedForCompaction,
+  // [Level] Automatic compaction within bottommost level to cleanup duplicate
+  // versions of same user key, usually due to a released snapshot.
+  kBottommostFiles,
+  // Compaction based on TTL
+  kTtl,
+  // According to the comments in flush_job.cc, RocksDB treats flush as
+  // a level 0 compaction in internal stats.
+  kFlush,
+  // [InternalOnly] External sst file ingestion treated as a compaction
+  // with placeholder input level L0 as file ingestion
+  // technically does not have an input level like other compactions.
+  // Used only for internal stats and conflict checking with other compactions
+  kExternalSstIngestion,
+  // Compaction due to SST file being too old
+  kPeriodicCompaction,
+  // Compaction in order to move files to temperature
+  kChangeTemperature,
+  // Compaction scheduled to force garbage collection of blob files
+  kForcedBlobGC,
+  // A special TTL compaction for RoundRobin policy, which is basically the
+  // same as kLevelMaxLevelSize, but the goal is to compact TTLed files.
+  kRoundRobinTtl,
+  // [InternalOnly] DBImpl::ReFitLevel treated as a compaction,
+  // Used only for internal conflict checking with other compactions
+  kRefitLevel,
+  // total number of compaction reasons; new reasons must be added above this.
+  kNumOfReasons,
+};
+
+enum class FlushReason : int {
+  kOthers = 0x00,
+  kGetLiveFiles = 0x01,
+  kShutDown = 0x02,
+  kExternalFileIngestion = 0x03,
+  kManualCompaction = 0x04,
+  kWriteBufferManager = 0x05,
+  kWriteBufferFull = 0x06,
+  kTest = 0x07,
+  kDeleteFiles = 0x08,
+  kAutoCompaction = 0x09,
+  kManualFlush = 0x0a,
+  kErrorRecovery = 0xb,
+  // When the flush reason is set to kErrorRecoveryRetryFlush, SwitchMemtable
+  // will not be called to avoid creating many small immutable memtables.
+  kErrorRecoveryRetryFlush = 0xc,
+  kWalFull = 0xd,
+};
+
+// TODO: In the future, BackgroundErrorReason will only be used to indicate
+// why the BG Error is happening (e.g., flush, compaction). We may introduce
+// other data structures to indicate other essential information such as
+// the file type (e.g., Manifest, SST) and special context.
+enum class BackgroundErrorReason {
+  kFlush,
+  kCompaction,
+  kWriteCallback,
+  kMemTable,
+  kManifestWrite,
+  kFlushNoWAL,
+  kManifestWriteNoWAL,
+};
+
+struct WriteStallInfo {
+  // the name of the column family
+  std::string cf_name;
+  // state of the write controller
+  struct {
+    WriteStallCondition cur;
+    WriteStallCondition prev;
+  } condition;
+};
+
+struct FileDeletionInfo {
+  FileDeletionInfo() = default;
+
+  FileDeletionInfo(const std::string& _db_name, const std::string& _file_path,
+                   int _job_id, Status _status)
+      : db_name(_db_name),
+        file_path(_file_path),
+        job_id(_job_id),
+        status(_status) {}
+  // The name of the database where the file was deleted.
+  std::string db_name;
+  // The path to the deleted file.
+  std::string file_path;
+  // The id of the job which deleted the file.
+  int job_id = 0;
+  // The status indicating whether the deletion was successful or not.
+  Status status;
+};
+
+struct TableFileDeletionInfo : public FileDeletionInfo {};
+
+struct BlobFileDeletionInfo : public FileDeletionInfo {
+  BlobFileDeletionInfo(const std::string& _db_name,
+                       const std::string& _file_path, int _job_id,
+                       Status _status)
+      : FileDeletionInfo(_db_name, _file_path, _job_id, _status) {}
+};
+
+enum class FileOperationType {
+  kRead,
+  kWrite,
+  kTruncate,
+  kClose,
+  kFlush,
+  kSync,
+  kFsync,
+  kRangeSync,
+  kAppend,
+  kPositionedAppend,
+  kOpen
+};
+
+struct FileOperationInfo {
+  using Duration = std::chrono::nanoseconds;
+  using SteadyTimePoint =
+      std::chrono::time_point<std::chrono::steady_clock,
+                              std::chrono::nanoseconds>;
+  using SystemTimePoint =
+      std::chrono::time_point<std::chrono::system_clock,
+                              std::chrono::nanoseconds>;
+  using StartTimePoint = std::pair<SystemTimePoint, SteadyTimePoint>;
+  using FinishTimePoint = SteadyTimePoint;
+
+  FileOperationType type;
+  const std::string& path;
+  // RocksDB tries to provide file temperature information, but it is not
+  // guaranteed.
+  Temperature temperature;
+  uint64_t offset;
+  size_t length;
+  const Duration duration;
+  const SystemTimePoint& start_ts;
+  Status status;
+
+  FileOperationInfo(const FileOperationType _type, const std::string& _path,
+                    const StartTimePoint& _start_ts,
+                    const FinishTimePoint& _finish_ts, const Status& _status,
+                    const Temperature _temperature = Temperature::kUnknown)
+      : type(_type),
+        path(_path),
+        temperature(_temperature),
+        duration(std::chrono::duration_cast<Duration>(
+            _finish_ts - _start_ts.second)),
+        start_ts(_start_ts.first),
+        status(_status) {}
+  static StartTimePoint StartNow() {
+    return std::make_pair(
+        std::chrono::system_clock::now(), std::chrono::steady_clock::now());
+  }
+  static FinishTimePoint FinishNow() {
+    return std::chrono::steady_clock::now();
+  }
+};
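+
+// A minimal timing sketch (illustrative only, not part of the upstream
+// header); do_write() and the path value are hypothetical placeholders:
+//
+//   std::string path = "/tmp/db/000001.sst";
+//   auto start = FileOperationInfo::StartNow();
+//   Status s = do_write();
+//   auto finish = FileOperationInfo::FinishNow();
+//   FileOperationInfo info(FileOperationType::kWrite, path, start, finish, s);
+//   // info.duration now holds the elapsed steady-clock nanoseconds.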
+struct BlobFileInfo {
+  BlobFileInfo(const std::string& _blob_file_path,
+               const uint64_t _blob_file_number)
+      : blob_file_path(_blob_file_path), blob_file_number(_blob_file_number) {}
+
+  std::string blob_file_path;
+  uint64_t blob_file_number;
+};
+
+struct BlobFileAdditionInfo : public BlobFileInfo {
+  BlobFileAdditionInfo(const std::string& _blob_file_path,
+                       const uint64_t _blob_file_number,
+                       const uint64_t _total_blob_count,
+                       const uint64_t _total_blob_bytes)
+      : BlobFileInfo(_blob_file_path, _blob_file_number),
+        total_blob_count(_total_blob_count),
+        total_blob_bytes(_total_blob_bytes) {}
+  uint64_t total_blob_count;
+  uint64_t total_blob_bytes;
+};
+
+struct BlobFileGarbageInfo : public BlobFileInfo {
+  BlobFileGarbageInfo(const std::string& _blob_file_path,
+                      const uint64_t _blob_file_number,
+                      const uint64_t _garbage_blob_count,
+                      const uint64_t _garbage_blob_bytes)
+      : BlobFileInfo(_blob_file_path, _blob_file_number),
+        garbage_blob_count(_garbage_blob_count),
+        garbage_blob_bytes(_garbage_blob_bytes) {}
+  uint64_t garbage_blob_count;
+  uint64_t garbage_blob_bytes;
+};
+
+struct FlushJobInfo {
+  // the id of the column family
+  uint32_t cf_id;
+  // the name of the column family
+  std::string cf_name;
+  // the path to the newly created file
+  std::string file_path;
+  // the file number of the newly created file
+  uint64_t file_number;
+  // the oldest blob file referenced by the newly created file
+  uint64_t oldest_blob_file_number;
+  // the id of the thread that completed this flush job.
+  uint64_t thread_id;
+  // the job id, which is unique in the same thread.
+  int job_id;
+  // If true, then rocksdb is currently slowing down all writes to prevent
+  // creating too many Level 0 files, as compaction seems unable to keep up
+  // with the write request speed. This indicates that there are too many
+  // files in Level 0.
+  bool triggered_writes_slowdown;
+  // If true, then rocksdb is currently blocking any writes to prevent
+  // creating more L0 files. This indicates that there are too many
+  // files in level 0. Compactions should try to compact L0 files down
+  // to lower levels as soon as possible.
+  bool triggered_writes_stop;
+  // The smallest sequence number in the newly created file
+  SequenceNumber smallest_seqno;
+  // The largest sequence number in the newly created file
+  SequenceNumber largest_seqno;
+  // Table properties of the table being flushed
+  TableProperties table_properties;
+
+  FlushReason flush_reason;
+
+  // Compression algorithm used for blob output files
+  CompressionType blob_compression_type;
+
+  // Information about blob files created during flush in Integrated BlobDB.
+  std::vector<BlobFileAdditionInfo> blob_file_addition_infos;
+};
+
+struct CompactionFileInfo {
+  // The level of the file.
+  int level;
+
+  // The file number of the file.
+  uint64_t file_number;
+
+  // The file number of the oldest blob file this SST file references.
+  uint64_t oldest_blob_file_number;
+};
+
+struct SubcompactionJobInfo {
+  ~SubcompactionJobInfo() { status.PermitUncheckedError(); }
+  // the id of the column family where the compaction happened.
+  uint32_t cf_id;
+  // the name of the column family where the compaction happened.
+  std::string cf_name;
+  // the status indicating whether the compaction was successful or not.
+  Status status;
+  // the id of the thread that completed this compaction job.
+  uint64_t thread_id;
+  // the job id, which is unique in the same thread.
+  int job_id;
+
+  // sub-compaction job id, which is only unique within the same compaction, so
+  // use both 'job_id' and 'subcompaction_job_id' to identify a subcompaction
+  // within an instance.
+  // For a non-subcompaction job, it's set to -1.
+  int subcompaction_job_id;
+  // the smallest input level of the compaction.
+  int base_input_level;
+  // the output level of the compaction.
+  int output_level;
+
+  // Reason to run the compaction
+  CompactionReason compaction_reason;
+
+  // Compression algorithm used for output files
+  CompressionType compression;
+
+  // Statistics and other additional details on the compaction
+  CompactionJobStats stats;
+
+  // Compression algorithm used for blob output files.
+  CompressionType blob_compression_type;
+};
+
+struct CompactionJobInfo {
+  ~CompactionJobInfo() { status.PermitUncheckedError(); }
+  // the id of the column family where the compaction happened.
+  uint32_t cf_id;
+  // the name of the column family where the compaction happened.
+  std::string cf_name;
+  // the status indicating whether the compaction was successful or not.
+  Status status;
+  // the id of the thread that completed this compaction job.
+  uint64_t thread_id;
+  // the job id, which is unique in the same thread.
+  int job_id;
+
+  // the smallest input level of the compaction.
+  int base_input_level;
+  // the output level of the compaction.
+  int output_level;
+
+  // The following variables contain information about compaction inputs
+  // and outputs. A file may appear in both the input and output lists
+  // if it was simply moved to a different level. The order of elements
+  // is the same across input_files and input_file_infos; similarly, it is
+  // the same across output_files and output_file_infos.
+
+  // The names of the compaction input files.
+  std::vector<std::string> input_files;
+
+  // Additional information about the compaction input files.
+  std::vector<CompactionFileInfo> input_file_infos;
+
+  // The names of the compaction output files.
+  std::vector<std::string> output_files;
+
+  // Additional information about the compaction output files.
+  std::vector<CompactionFileInfo> output_file_infos;
+
+  // Table properties for input and output tables.
+  // The map is keyed by values from input_files and output_files.
+  TablePropertiesCollection table_properties;
+
+  // Reason to run the compaction
+  CompactionReason compaction_reason;
+
+  // Compression algorithm used for output files
+  CompressionType compression;
+
+  // Statistics and other additional details on the compaction
+  CompactionJobStats stats;
+
+  // Compression algorithm used for blob output files.
+  CompressionType blob_compression_type;
+
+  // Information about blob files created during compaction in Integrated
+  // BlobDB.
+  std::vector<BlobFileAdditionInfo> blob_file_addition_infos;
+
+  // Information about blob files deleted during compaction in Integrated
+  // BlobDB.
+  std::vector<BlobFileGarbageInfo> blob_file_garbage_infos;
+};
+struct MemTableInfo {
+  // the name of the column family to which the memtable belongs
+  std::string cf_name;
+  // Sequence number of the first element that was inserted
+  // into the memtable.
+  SequenceNumber first_seqno;
+  // Sequence number that is guaranteed to be smaller than or equal
+  // to the sequence number of any key that could be inserted into this
+  // memtable. It can then be assumed that any write with a larger (or equal)
+  // sequence number will be present in this memtable or a later memtable.
+  SequenceNumber earliest_seqno;
+  // Total number of entries in memtable
+  uint64_t num_entries;
+  // Total number of deletes in memtable
+  uint64_t num_deletes;
+};
+
+struct ExternalFileIngestionInfo {
+  // the name of the column family
+  std::string cf_name;
+  // Path of the file outside the DB
+  std::string external_file_path;
+  // Path of the file inside the DB
+  std::string internal_file_path;
+  // The global sequence number assigned to keys in this file
+  SequenceNumber global_seqno;
+  // Table properties of the table being flushed
+  TableProperties table_properties;
+};
+
+// Result of auto background error recovery
+struct BackgroundErrorRecoveryInfo {
+  // The original error that triggered the recovery
+  Status old_bg_error;
+
+  // The final bg_error after all recovery attempts. Status::OK() means
+  // the recovery was successful and the database is fully operational.
+  Status new_bg_error;
+};
+
+struct IOErrorInfo {
+  IOErrorInfo(const IOStatus& _io_status, FileOperationType _operation,
+              const std::string& _file_path, size_t _length, uint64_t _offset)
+      : io_status(_io_status),
+        operation(_operation),
+        file_path(_file_path),
+        length(_length),
+        offset(_offset) {}
+
+  IOStatus io_status;
+  FileOperationType operation;
+  std::string file_path;
+  size_t length;
+  uint64_t offset;
+};
+// EventListener class contains a set of callback functions that will
+// be called when specific RocksDB events happen, such as a flush. It can
+// be used as a building block for developing custom features such as
+// stats collectors or external compaction algorithms.
+//
+// IMPORTANT
+// Because compaction is needed to resolve a "writes stopped" condition,
+// calling or waiting for any blocking DB write function (no_slowdown=false)
+// from a compaction-related listener callback can hang RocksDB. For DB
+// writes from a callback we recommend a WriteBatch and no_slowdown=true,
+// because the WriteBatch can accumulate writes for later in case DB::Write
+// returns Status::Incomplete. Similarly, calling CompactRange or similar
+// could hang by waiting for a background worker that is occupied until the
+// callback returns.
+//
+// Otherwise, callback functions should not run for an extended period of
+// time before the function returns, because this will slow RocksDB.
+//
+// [Threading] All EventListener callbacks will be called using the
+// actual thread that is involved in that specific event. For example, it
+// is the RocksDB background flush thread doing the actual flush that
+// calls EventListener::OnFlushCompleted().
+//
+// [Locking] All EventListener callbacks are designed to be called without
+// the current thread holding any DB mutex. This is to prevent potential
+// deadlock and performance issues when using EventListener callbacks
+// in a complex way.
+//
+// [Exceptions] Exceptions MUST NOT propagate out of overridden functions into
+// RocksDB, because RocksDB is not exception-safe. This could cause undefined
+// behavior including data loss, unreported corruption, deadlocks, and more.
+class EventListener : public Customizable {
+ public:
+  static const char* Type() { return "EventListener"; }
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& id,
+                                 std::shared_ptr<EventListener>* result);
+  const char* Name() const override {
+    // Since EventListeners did not have a name previously, we will assume
+    // an empty name. Instances should override this method.
+    return "";
+  }
+  // A callback function to RocksDB which will be called whenever a
+  // registered RocksDB flushes a file. The default implementation is
+  // no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it does not run for an extended period of time before returning.
+  // Otherwise, RocksDB may be blocked.
+  virtual void OnFlushCompleted(DB* /*db*/,
+                                const FlushJobInfo& /*flush_job_info*/) {}
+
+  // A callback function to RocksDB which will be called before a
+  // RocksDB starts to flush memtables. The default implementation is
+  // no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it does not run for an extended period of time before returning.
+  // Otherwise, RocksDB may be blocked.
+  virtual void OnFlushBegin(DB* /*db*/,
+                            const FlushJobInfo& /*flush_job_info*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // an SST file is deleted. Different from OnCompactionCompleted and
+  // OnFlushCompleted, this callback is designed for external logging
+  // services and thus only provides string parameters instead
+  // of a pointer to the DB. Applications that build logic based
+  // on file creations and deletions are suggested to implement
+  // OnFlushCompleted and OnCompactionCompleted.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from the
+  // returned value.
+  virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {}
+
+  // A callback function to RocksDB which will be called before a
+  // RocksDB starts to compact. The default implementation is
+  // no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it does not run for an extended period of time before returning.
+  // Otherwise, RocksDB may be blocked.
+  virtual void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // a registered RocksDB compacts a file. The default implementation
+  // is a no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it does not run for an extended period of time before returning.
+  // Otherwise, RocksDB may be blocked.
+  //
+  // @param db a pointer to the rocksdb instance which just compacted
+  //   a file.
+  // @param ci a reference to a CompactionJobInfo struct. 'ci' is released
+  //   after this function is returned, and must be copied if it is needed
+  //   outside of this function.
+  virtual void OnCompactionCompleted(DB* /*db*/,
+                                     const CompactionJobInfo& /*ci*/) {}
+
+  // A callback function to RocksDB which will be called before a
+  // sub-compaction begins. If a compaction is split into 2 sub-compactions,
+  // it will trigger one `OnCompactionBegin()` first, then two
+  // `OnSubcompactionBegin()`.
+  // If a compaction is not split, it will still trigger one
+  // `OnSubcompactionBegin()`, as internally compaction is always handled by
+  // sub-compactions. The default implementation is a no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it does not run for an extended period of time before returning.
+  // Otherwise, RocksDB may be blocked.
+  //
+  // @param si a reference to a SubcompactionJobInfo struct; it contains a
+  //   `subcompaction_job_id` which is only unique within the specified
+  //   compaction (which can be identified by `job_id`). 'si' is released
+  //   after this function returns, and must be copied if it's needed outside
+  //   this function.
+  // Note: `table_properties` is not set for sub-compactions; the information
+  //   can be obtained from `OnCompactionBegin()`.
+  virtual void OnSubcompactionBegin(const SubcompactionJobInfo& /*si*/) {}
+
+  // A callback function to RocksDB which will be called whenever a
+  // sub-compaction completes. The same as `OnSubcompactionBegin()`: if a
+  // compaction is split into 2 sub-compactions, it will be triggered twice. If
+  // a compaction is not split, it will still be triggered once.
+  // The default implementation is a no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it does not run for an extended period of time before returning.
+  // Otherwise, RocksDB may be blocked.
+  //
+  // @param si a reference to a SubcompactionJobInfo struct; it contains a
+  //   `subcompaction_job_id` which is only unique within the specified
+  //   compaction (which can be identified by `job_id`). 'si' is released
+  //   after this function returns, and must be copied if it's needed outside
+  //   this function.
+  // Note: `table_properties` is not set for sub-compactions; the information
+  //   can be obtained from `OnCompactionCompleted()`.
+  virtual void OnSubcompactionCompleted(const SubcompactionJobInfo& /*si*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // an SST file is created. Different from OnCompactionCompleted and
+  // OnFlushCompleted, this callback is designed for external logging
+  // services and thus only provides string parameters instead
+  // of a pointer to the DB. Applications that build logic based
+  // on file creations and deletions are suggested to implement
+  // OnFlushCompleted and OnCompactionCompleted.
+  //
+  // Historically it was only called if the file was successfully created.
+  // Now it will also be called in the failure case. Users can check
+  // info.status to see if it succeeded or not.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from these
+  // returned values.
+  virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called before
+  // an SST file is created. It will be followed by OnTableFileCreated after
+  // the creation finishes.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from these
+  // returned values.
+  virtual void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called before
+  // a memtable is made immutable.
+  //
+  // Note that this function must be implemented in a way such that
+  // it does not run for an extended period of time before returning.
+  // Otherwise, RocksDB may be blocked.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from these
+  // returned values.
+  virtual void OnMemTableSealed(const MemTableInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called before
+  // a column family handle is deleted.
+  //
+  // Note that this function must be implemented in a way such that
+  // it does not run for an extended period of time before returning.
+  // Otherwise, RocksDB may be blocked.
+  // @param handle is a pointer to the column family handle to be deleted
+  //   which will become a dangling pointer after the deletion.
+  virtual void OnColumnFamilyHandleDeletionStarted(
+      ColumnFamilyHandle* /*handle*/) {}
+
+  // A callback function for RocksDB which will be called after an external
+  // file is ingested using IngestExternalFile.
+  //
+  // Note that this function will run on the same thread as
+  // IngestExternalFile(); if this function is blocked, IngestExternalFile()
+  // will be blocked from finishing.
+  virtual void OnExternalFileIngested(
+      DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called before setting the
+  // background error status to a non-OK value. The new background error status
+  // is provided in `bg_error` and can be modified by the callback. E.g., a
+  // callback can suppress errors by resetting it to Status::OK(), thus
+  // preventing the database from entering read-only mode. We do not provide
+  // any guarantee as to when failed flushes/compactions will be rescheduled
+  // if the user suppresses an error.
+  //
+  // Note that this function can run on the same threads as flush, compaction,
+  // and user writes. So, it is extremely important not to perform heavy
+  // computations or blocking calls in this function.
+  virtual void OnBackgroundError(BackgroundErrorReason /* reason */,
+                                 Status* /* bg_error */) {}
+
+  // A callback function for RocksDB which will be called whenever a change
+  // of superversion triggers a change of the stall conditions.
+  //
+  // Note that this function must be implemented in a way such that
+  // it does not run for an extended period of time before returning.
+  // Otherwise, RocksDB may be blocked.
+  virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called whenever a file read
+  // operation finishes.
+  virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file write
+  // operation finishes.
+  virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file flush
+  // operation finishes.
+  virtual void OnFileFlushFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file sync
+  // operation finishes.
+  virtual void OnFileSyncFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file
+  // rangeSync operation finishes.
+  virtual void OnFileRangeSyncFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file
+  // truncate operation finishes.
+  virtual void OnFileTruncateFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file close
+  // operation finishes.
+  virtual void OnFileCloseFinish(const FileOperationInfo& /* info */) {}
+
+  // If true, the OnFile*Finish functions will be called. If
+  // false, then they won't be called.
+  virtual bool ShouldBeNotifiedOnFileIO() { return false; }
+
+  // A callback function for RocksDB which will be called just before
+  // starting the automatic recovery process for recoverable background
+  // errors, such as NoSpace(). The callback can suppress the automatic
+  // recovery by setting *auto_recovery to false. The database will then
+  // have to be transitioned out of read-only mode by calling DB::Resume().
+  virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */,
+                                    Status /* bg_error */,
+                                    bool* /* auto_recovery */) {}
+
+  // DEPRECATED
+  // A callback function for RocksDB which will be called once the database
+  // is recovered from read-only mode after an error. When this is called, it
+  // means normal writes to the database can be issued and the user can
+  // initiate any further recovery actions needed.
+  virtual void OnErrorRecoveryCompleted(Status old_bg_error) {
+    old_bg_error.PermitUncheckedError();
+  }
+
+  // A callback function for RocksDB which will be called once the recovery
+  // attempt from a background retryable error is completed. The recovery
+  // may have been successful or not. In either case, the callback is called
+  // with the old and new error. If info.new_bg_error is Status::OK(), that
+  // means the recovery succeeded.
+  virtual void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& /*info*/) {
+  }
+
+  // A callback function for RocksDB which will be called before
+  // a blob file is created. It will be followed by OnBlobFileCreated after
+  // the creation finishes.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from these
+  // returned values.
+  virtual void OnBlobFileCreationStarted(
+      const BlobFileCreationBriefInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // a blob file is created.
+  // It will be called whether the file is successfully created or not. Users
+  // can check info.status to see if it succeeded or not.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from these
+  // returned values.
+  virtual void OnBlobFileCreated(const BlobFileCreationInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // a blob file is deleted.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from these
+  // returned values.
+  virtual void OnBlobFileDeleted(const BlobFileDeletionInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called whenever an IO error
+  // happens. ShouldBeNotifiedOnFileIO should be set to true to get a callback.
+  virtual void OnIOError(const IOErrorInfo& /*info*/) {}
+
+  ~EventListener() override {}
+};
+
+} // namespace ROCKSDB_NAMESPACE
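+
+// A minimal listener sketch (illustrative only, not part of the upstream
+// header): counting completed flushes. The listener is registered through
+// DBOptions::listeners (declared in rocksdb/options.h):
+//
+//   class FlushCounter : public EventListener {
+//    public:
+//     const char* Name() const override { return "FlushCounter"; }
+//     void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+//       // info is only valid during the callback; copy anything (e.g.
+//       // info.file_path) that must outlive it.
+//       flushes_.fetch_add(1);
+//     }
+//    private:
+//     std::atomic<uint64_t> flushes_{0};
+//   };
+//
+//   Options options;
+//   options.listeners.emplace_back(std::make_shared<FlushCounter>());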
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/memory_allocator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/memory_allocator.h
new file mode 100644
index 0000000000..d126abfe6d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/memory_allocator.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// MemoryAllocator is an interface that a client can implement to supply custom
+// memory allocation and deallocation methods. See rocksdb/cache.h for more
+// information.
+// All methods should be thread-safe.
+class MemoryAllocator : public Customizable {
+ public:
+  static const char* Type() { return "MemoryAllocator"; }
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& value,
+                                 std::shared_ptr<MemoryAllocator>* result);
+
+  // Allocate a block of at least size bytes. Has to be thread-safe.
+  virtual void* Allocate(size_t size) = 0;
+
+  // Deallocate a previously allocated block. Has to be thread-safe.
+  virtual void Deallocate(void* p) = 0;
+
+  // Returns the memory size of the block allocated at p. The default
+  // implementation that just returns the original allocation_size is fine.
+  virtual size_t UsableSize(void* /*p*/, size_t allocation_size) const {
+    // default implementation just returns the allocation size
+    return allocation_size;
+  }
+
+  std::string GetId() const override { return GenerateIndividualId(); }
+};
+
+struct JemallocAllocatorOptions {
+  static const char* kName() { return "JemallocAllocatorOptions"; }
+  // Jemalloc tcache caches allocations by size class. For each size class,
+  // it caches between 20 (for large size classes) and 200 (for small size
+  // classes) allocations. To reduce tcache memory usage in case the allocator
+  // is accessed by a large number of threads, we can control whether to cache
+  // an allocation by its size.
+  bool limit_tcache_size = false;
+
+  // Lower bound of allocation size to use tcache, if limit_tcache_size=true.
+  // When used with block cache, it is recommended to set it to block_size/4.
+  size_t tcache_size_lower_bound = 1024;
+
+  // Upper bound of allocation size to use tcache, if limit_tcache_size=true.
+  // When used with block cache, it is recommended to set it to block_size.
+  size_t tcache_size_upper_bound = 16 * 1024;
+
+  // Number of arenas across which we spread allocation requests. Increasing
+  // this setting can mitigate arena mutex contention. The value must be
+  // positive.
+  size_t num_arenas = 1;
+};
+
+// Generate a memory allocator which allocates through Jemalloc and utilizes
+// MADV_DONTDUMP through madvise to exclude cache items from core dumps.
+// Applications can use the allocator with block cache to exclude block cache
+// usage from core dumps.
+//
+// Implementation details:
+// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all
+// allocations of the JemallocNodumpAllocator are through the same arena.
+// The memory allocator hooks memory allocation of the arena, and calls
+// madvise() with the MADV_DONTDUMP flag to exclude the piece of memory from
+// core dumps. A side benefit of using a single arena is a reduction of
+// jemalloc metadata for some workloads.
+//
+// To mitigate mutex contention for using one single arena (see also
+// `JemallocAllocatorOptions::num_arenas` above), jemalloc tcache
+// (thread-local cache) is enabled to cache unused allocations for future use.
+// The tcache normally incurs 0.5M extra memory usage per thread. The usage
+// can be reduced by limiting the allocation sizes to cache.
+extern Status NewJemallocNodumpAllocator(
+    JemallocAllocatorOptions& options,
+    std::shared_ptr<MemoryAllocator>* memory_allocator);
+
+} // namespace ROCKSDB_NAMESPACE
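+
+// A minimal implementation sketch (illustrative only, not part of the
+// upstream header): a pass-through allocator over malloc/free. A custom
+// allocator like this can be plugged into the block cache via
+// LRUCacheOptions::memory_allocator (declared in rocksdb/cache.h):
+//
+//   class MallocAllocator : public MemoryAllocator {
+//    public:
+//     const char* Name() const override { return "MallocAllocator"; }
+//     void* Allocate(size_t size) override { return malloc(size); }
+//     void Deallocate(void* p) override { free(p); }
+//   };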
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/memtablerep.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/memtablerep.h
new file mode 100644
index 0000000000..be0f6cd1f1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/memtablerep.h
@@ -0,0 +1,421 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file contains the interface that must be implemented by any collection
+// to be used as the backing store for a MemTable. Such a collection must
+// satisfy the following properties:
+//  (1) It does not store duplicate items.
+//  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
+//      equality.
+//  (3) It can be accessed concurrently by multiple readers and must support
+//      concurrent reads. However, it needn't support multiple concurrent
+//      writes.
+//  (4) Items are never deleted.
+// The liberal use of assertions is encouraged to enforce (1).
+//
+// The factory will be passed a MemTableAllocator object when a new MemTableRep
+// is requested.
+//
+// Users can implement their own memtable representations. We include three
+// types built in:
+//  - SkipListRep: This is the default; it is backed by a skip list.
+//  - HashSkipListRep: The memtable rep that is best used for keys that are
+//    structured like "prefix:suffix" where iteration within a prefix is
+//    common and iteration across different prefixes is rare. It is backed by
+//    a hash map where each bucket is a skip list.
+//  - VectorRep: This is backed by an unordered std::vector. On iteration, the
+//    vector is sorted. It is intelligent about sorting; once MarkReadOnly()
+//    has been called, the vector will only be sorted once. It is optimized for
+//    random-write-heavy workloads.
+//
+// The last two implementations are designed for situations in which
+// iteration over the entire collection is rare since doing so requires all the
+// keys to be copied into a sorted data structure.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <stdexcept>
+#include <unordered_set>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class Allocator;
+class LookupKey;
+class SliceTransform;
+class Logger;
+struct DBOptions;
+
+using KeyHandle = void*;
+
+extern Slice GetLengthPrefixedSlice(const char* data);
+
+class MemTableRep {
+ public:
+  // KeyComparator provides a means to compare keys, which are internal keys
+  // concatenated with values.
+  class KeyComparator {
+   public:
+    using DecodedType = ROCKSDB_NAMESPACE::Slice;
+
+    virtual DecodedType decode_key(const char* key) const {
+      // The format of key is frozen and can be treated as a part of the API
+      // contract. Refer to MemTable::Add for details.
+      return GetLengthPrefixedSlice(key);
+    }
+
+    // Compare a and b. Return a negative value if a is less than b, 0 if they
+    // are equal, and a positive value if a is greater than b
+    virtual int operator()(const char* prefix_len_key1,
+                           const char* prefix_len_key2) const = 0;
+
+    virtual int operator()(const char* prefix_len_key,
+                           const Slice& key) const = 0;
+
+    virtual ~KeyComparator() {}
+  };
+
+  explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {}
+
+  // Allocate a buf of len size for storing key. The idea is that a
+  // specific memtable representation knows its underlying data structure
+  // better. By allowing it to allocate memory, it can possibly put
+  // correlated stuff in consecutive memory areas to make processor
+  // prefetching more efficient.
+  virtual KeyHandle Allocate(const size_t len, char** buf);
+
+  // Insert key into the collection. (The caller will pack key and value into a
+  // single buffer and pass that in as the parameter to Insert).
+  // REQUIRES: nothing that compares equal to key is currently in the
+  // collection, and no concurrent modifications to the table in progress
+  virtual void Insert(KeyHandle handle) = 0;
+
+  // Same as ::Insert
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the key already exists.
+  virtual bool InsertKey(KeyHandle handle) {
+    Insert(handle);
+    return true;
+  }
+
+  // Same as Insert(), but additionally pass a hint to the insert location for
+  // the key. If hint points to nullptr, a new hint will be populated;
+  // otherwise the hint will be updated to reflect the last insert location.
+  //
+  // Currently only the skip-list based memtable implements the interface.
+  // Other implementations will fall back to Insert() by default.
+  virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) {
+    // Ignore the hint by default.
+    Insert(handle);
+  }
+
+  // Same as ::InsertWithHint
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the key already exists.
+  virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) {
+    InsertWithHint(handle, hint);
+    return true;
+  }
+
+  // Same as ::InsertWithHint, but allows concurrent writes
+  //
+  // If hint points to nullptr, a new hint will be allocated on the heap;
+  // otherwise the hint will be updated to reflect the last insert location.
+  // The hint is owned by the caller and it is the caller's responsibility to
+  // delete the hint later.
+  //
+  // Currently only the skip-list based memtable implements the interface.
+  // Other implementations will fall back to InsertConcurrently() by default.
+  virtual void InsertWithHintConcurrently(KeyHandle handle, void** /*hint*/) {
+    // Ignore the hint by default.
+    InsertConcurrently(handle);
+  }
+
+  // Same as ::InsertWithHintConcurrently
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the key already exists.
+  virtual bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) {
+    InsertWithHintConcurrently(handle, hint);
+    return true;
+  }
+
+  // Like Insert(handle), but may be called concurrently with other calls
+  // to InsertConcurrently for other handles.
+  //
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the key already exists.
+  virtual void InsertConcurrently(KeyHandle handle);
+
+  // Same as ::InsertConcurrently
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the key already exists.
+  virtual bool InsertKeyConcurrently(KeyHandle handle) {
+    InsertConcurrently(handle);
+    return true;
+  }
+
+  // Returns true iff an entry that compares equal to key is in the collection.
+  virtual bool Contains(const char* key) const = 0;
+
+  // Notify this table rep that it will no longer be added to. By default,
+  // does nothing. After MarkReadOnly() is called, this table rep will
+  // not be written to (i.e., no more calls to Allocate(), Insert(),
+  // or any writes done directly to entries accessed through the iterator.)
+  virtual void MarkReadOnly() {}
+
+  // Notify this table rep that it has been flushed to stable storage.
+  // By default, does nothing.
+  //
+  // Invariant: MarkReadOnly() is called, before MarkFlushed().
+  // Note that this method, if overridden, should not run for an extended
+  // period of time. Otherwise, RocksDB may be blocked.
+  virtual void MarkFlushed() {}
+
+  // Look up key from the mem table. Starting from the first key in the mem
+  // table whose user_key matches the one given in k, call the function
+  // callback_func(), with callback_args directly forwarded as the first
+  // parameter and the mem table key as the second parameter. If the return
+  // value is false, terminate; otherwise go on to the next key.
+  //
+  // It's safe for Get() to terminate after having visited all the potential
+  // keys for k.user_key(), or not.
+  //
+  // Default:
+  // Dynamically construct an iterator, seek, and call the callback function.
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg, const char* entry));
+
+  virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
+                                         const Slice& /*end_key*/) {
+    return 0;
+  }
+
+  // Returns a vector of unique random memtable entries of approximate
+  // size 'target_sample_size' (this size is not strictly enforced).
+  virtual void UniqueRandomSample(const uint64_t num_entries,
+                                  const uint64_t target_sample_size,
+                                  std::unordered_set<const char*>* entries) {
+    (void)num_entries;
+    (void)target_sample_size;
+    (void)entries;
+    assert(false);
+  }
+
+  // Report an approximation of how much memory has been used other than
+  // memory that was allocated through the allocator. Safe to call from any
+  // thread.
+  virtual size_t ApproximateMemoryUsage() = 0;
+
+  virtual ~MemTableRep() {}
+
+  // Iteration over the contents of a skip collection
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified collection.
+    // The returned iterator is not valid.
+    // explicit Iterator(const MemTableRep* collection);
+    virtual ~Iterator() {}
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const = 0;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const = 0;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() = 0;
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() = 0;
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
+
+    // Retreat to the first entry with a key <= target
+    virtual void SeekForPrev(const Slice& internal_key,
+                             const char* memtable_key) = 0;
+
+    virtual void RandomSeek() {}
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() = 0;
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() = 0;
+  };
+
+  // Return an iterator over the keys in this representation.
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
+
+  // Return an iterator that has a special Seek semantics. The result of
+  // a Seek might only include keys with the same prefix as the target key.
+  // arena: If not null, the arena is used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
+    return GetIterator(arena);
+  }
+
+  // Return true if the current MemTableRep supports merge operator.
+  // Default: true
+  virtual bool IsMergeOperatorSupported() const { return true; }
+
+  // Return true if the current MemTableRep supports snapshots.
+  // Default: true
+  virtual bool IsSnapshotSupported() const { return true; }
+
+ protected:
+  // When *key is an internal key concatenated with the value, returns the
+  // user key.
+  virtual Slice UserKey(const char* key) const;
+
+  Allocator* allocator_;
+};
+
+// This is the base class for all factories that are used by RocksDB to create
+// new MemTableRep objects
+class MemTableRepFactory : public Customizable {
+ public:
+  ~MemTableRepFactory() override {}
+
+  static const char* Type() { return "MemTableRepFactory"; }
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::unique_ptr<MemTableRepFactory>* factory);
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::shared_ptr<MemTableRepFactory>* factory);
+
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Allocator*, const SliceTransform*,
+                                         Logger* logger) = 0;
+  virtual MemTableRep* CreateMemTableRep(
+      const MemTableRep::KeyComparator& key_cmp, Allocator* allocator,
+      const SliceTransform* slice_transform, Logger* logger,
+      uint32_t /* column_family_id */) {
+    return CreateMemTableRep(key_cmp, allocator, slice_transform, logger);
+  }
+
+  const char* Name() const override = 0;
+
+  // Return true if the current MemTableRep supports concurrent inserts
+  // Default: false
+  virtual bool IsInsertConcurrentlySupported() const { return false; }
+
+  // Return true if the current MemTableRep supports detecting duplicate keys
+  // at insertion time. If true, then MemTableRep::Insert* returns
+  // false when the key already exists.
+  // Default: false
+  virtual bool CanHandleDuplicatedKey() const { return false; }
+};
+// This uses a skip list to store keys. It is the default.
+//
+// Parameters:
+//   lookahead: If non-zero, each iterator's seek operation will start the
+//     search from the previously visited record (doing at most 'lookahead'
+//     steps). This is an optimization for the access pattern including many
+//     seeks with consecutive keys.
+class SkipListFactory : public MemTableRepFactory {
+ public:
+  explicit SkipListFactory(size_t lookahead = 0);
+
+  // Methods for Configurable/Customizable class overrides
+  static const char* kClassName() { return "SkipListFactory"; }
+  static const char* kNickName() { return "skip_list"; }
+  virtual const char* Name() const override { return kClassName(); }
+  virtual const char* NickName() const override { return kNickName(); }
+  std::string GetId() const override;
+
+  // Methods for MemTableRepFactory class overrides
+  using MemTableRepFactory::CreateMemTableRep;
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Allocator*, const SliceTransform*,
+                                         Logger* logger) override;
+
+  bool IsInsertConcurrentlySupported() const override { return true; }
+
+  bool CanHandleDuplicatedKey() const override { return true; }
+
+ private:
+  size_t lookahead_;
+};
+
+// This creates MemTableReps that are backed by an std::vector. On iteration,
+// the vector is sorted. This is useful for workloads where iteration is very
+// rare and writes are generally not issued after reads begin.
+//
+// Parameters:
+//   count: Passed to the constructor of the underlying std::vector of each
+//     VectorRep. On initialization, the underlying array will have at least
+//     count bytes reserved for usage.
+class VectorRepFactory : public MemTableRepFactory {
+  size_t count_;
+
+ public:
+  explicit VectorRepFactory(size_t count = 0);
+
+  // Methods for Configurable/Customizable class overrides
+  static const char* kClassName() { return "VectorRepFactory"; }
+  static const char* kNickName() { return "vector"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kNickName(); }
+
+  // Methods for MemTableRepFactory class overrides
+  using MemTableRepFactory::CreateMemTableRep;
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Allocator*, const SliceTransform*,
+                                         Logger* logger) override;
+};
+
+// This factory creates memtables with a fixed array of buckets, each
+// pointing to a skiplist (null if the bucket is empty).
+// @bucket_count: number of fixed array buckets
+// @skiplist_height: the max height of the skiplist
+// @skiplist_branching_factor: probabilistic size ratio between adjacent
+//   link lists in the skiplist
+extern MemTableRepFactory* NewHashSkipListRepFactory(
+    size_t bucket_count = 1000000, int32_t skiplist_height = 4,
+    int32_t skiplist_branching_factor = 4);
+
+// This factory creates memtables based on a hash table:
+// it contains a fixed array of buckets, each pointing to either a linked list
+// or a skip list if the number of entries inside the bucket exceeds
+// threshold_use_skiplist.
+// @bucket_count: number of fixed array buckets
+// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
+//   Otherwise from huge page TLB. The user needs to reserve
+//   huge pages for it to be allocated, like:
+//     sysctl -w vm.nr_hugepages=20
+//   See linux doc Documentation/vm/hugetlbpage.txt
+// @bucket_entries_logging_threshold: if the number of entries in one bucket
+//   exceeds this number, log about it.
+// @if_log_bucket_dist_when_flash: if true, log the distribution of the number
+//   of entries when flushing.
+// @threshold_use_skiplist: a bucket switches to a skip list if the number of
+//   entries exceeds this parameter.
+extern MemTableRepFactory* NewHashLinkListRepFactory(
+    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
+    int bucket_entries_logging_threshold = 4096,
+    bool if_log_bucket_dist_when_flash = true,
+    uint32_t threshold_use_skiplist = 256);
+
+} // namespace ROCKSDB_NAMESPACE
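+
+// A minimal configuration sketch (illustrative only, not part of the
+// upstream header): selecting a memtable representation through Options;
+// NewFixedPrefixTransform() is declared in rocksdb/slice_transform.h:
+//
+//   Options options;
+//   // Default skip list, with a seek lookahead of 8 steps:
+//   options.memtable_factory.reset(new SkipListFactory(8));
+//   // Or a hash-skiplist memtable keyed by a fixed 4-byte prefix:
+//   options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+//   options.memtable_factory.reset(NewHashSkipListRepFactory());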
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/merge_operator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/merge_operator.h
new file mode 100644
index 0000000000..077130475d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/merge_operator.h
@@ -0,0 +1,286 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Logger;
+
+// The Merge Operator
+//
+// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
+// the client knows. It could be numeric addition, list append, string
+// concatenation, editing a data structure, ..., anything.
+// The library, on the other hand, is concerned with the exercise of this
+// interface, at the right time (during get, iteration, compaction...).
+//
+// To use merge, the client needs to provide an object implementing one of
+// the following interfaces:
+//  a) AssociativeMergeOperator - for most simple semantics (always take
+//     two values, and merge them into one value, which is then put back
+//     into rocksdb); numeric addition and string concatenation are examples;
+//
+//  b) MergeOperator - the generic class for all the more abstract / complex
+//     operations; one method (FullMergeV2) to merge a Put/Delete value with a
+//     merge operand; and another method (PartialMerge) that merges multiple
+//     operands together. This is especially useful if your key values have
+//     complex structures but you would still like to support client-specific
+//     incremental updates.
+//
+// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
+// more powerful.
+//
+// Refer to the rocksdb-merge wiki for more details and example
+// implementations.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class MergeOperator : public Customizable {
+ public:
+  virtual ~MergeOperator() {}
+  static const char* Type() { return "MergeOperator"; }
+  static Status CreateFromString(const ConfigOptions& opts,
+                                 const std::string& id,
+                                 std::shared_ptr<MergeOperator>* result);
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:          (IN) The key that's associated with this merge operation.
+  //               Client could multiplex the merge operator based on it
+  //               if the key space is partitioned and different subspaces
+  //               refer to different types of data which have different
+  //               merge operation semantics
+  // existing:     (IN) null indicates that the key does not exist before this
+  //               op
+  // operand_list: (IN) the sequence of merge operations to apply, front()
+  //               first.
+  // new_value:    (OUT) Client is responsible for filling the merge result
+  //               here.
+  //                 The string that new_value is pointing to will be empty.
+  // logger:   (IN) Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. This will be treated as an error by the library.
+  //
+  // Also make use of the *logger for error messages.
+  virtual bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/,
+                         const std::deque<std::string>& /*operand_list*/,
+                         std::string* /*new_value*/, Logger* /*logger*/) const {
+    // deprecated, please use FullMergeV2()
+    assert(false);
+    return false;
+  }
+
+  struct MergeOperationInput {
+    // If user-defined timestamp is enabled, `_key` includes timestamp.
+    explicit MergeOperationInput(const Slice& _key,
+                                 const Slice* _existing_value,
+                                 const std::vector<Slice>& _operand_list,
+                                 Logger* _logger)
+        : key(_key),
+          existing_value(_existing_value),
+          operand_list(_operand_list),
+          logger(_logger) {}
+
+    // The key associated with the merge operation.
+    const Slice& key;
+    // The existing value of the current key, nullptr means that the
+    // value doesn't exist.
+    const Slice* existing_value;
+    // A list of operands to apply.
+    const std::vector<Slice>& operand_list;
+    // Logger could be used by client to log any errors that happen during
+    // the merge operation.
+    Logger* logger;
+  };
+
+  enum class OpFailureScope {
+    kDefault,
+    kTryMerge,
+    kMustMerge,
+    kOpFailureScopeMax,
+  };
+
+  struct MergeOperationOutput {
+    explicit MergeOperationOutput(std::string& _new_value,
+                                  Slice& _existing_operand)
+        : new_value(_new_value), existing_operand(_existing_operand) {}
+
+    // Client is responsible for filling the merge result here.
+    std::string& new_value;
+    // If the merge result is one of the existing operands (or existing_value),
+    // client can set this field to the operand (or existing_value) instead of
+    // using new_value.
+    Slice& existing_operand;
+    // Indicates the blast radius of the failure. It is only meaningful to
+    // provide a failure scope when returning `false` from the API populating
+    // the `MergeOperationOutput`. Currently RocksDB operations handle these
+    // values as follows:
+    //
+    // - `OpFailureScope::kDefault`: fallback to default
+    //   (`OpFailureScope::kTryMerge`)
+    // - `OpFailureScope::kTryMerge`: operations that try to merge that key
+    //   will fail. This includes flush and compaction, which puts the DB in
+    //   read-only mode.
+    // - `OpFailureScope::kMustMerge`: operations that must merge that key will
+    //   fail (e.g., `Get()`, `MultiGet()`, iteration). Flushes/compactions can
+    //   still proceed by copying the original input operands to the output.
+    OpFailureScope op_failure_scope = OpFailureScope::kDefault;
+  };
+
+  // This function applies a stack of merge operands in chronological order
+  // on top of an existing value. There are two ways in which this method is
+  // being used:
+  // a) During Get() operation, it is used to calculate the final value of a
+  //    key
+  // b) During compaction, in order to collapse some operands with the base
+  //    value.
+  //
+  // Note: The name of the method is somewhat misleading, as both in the cases
+  // of Get() or compaction it may be called on a subset of operands:
+  // K:    0    +1    +2    +7    +4    +5    2    +1    +2
+  //                                  ^
+  //                                  |
+  //                              snapshot
+  // In the example above, Get(K) operation will call FullMerge with a base
+  // value of 2 and operands [+1, +2].
+  // Compaction process might decide to collapse the beginning of the history
+  // up to the snapshot by performing full Merge with base value of 0 and
+  // operands [+1, +2, +7, +4].
+  virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+                           MergeOperationOutput* merge_out) const;
+
+  // This function performs merge(left_op, right_op)
+  // when both the operands are themselves merge operation types
+  // that you would have passed to a DB::Merge() call in the same order
+  // (i.e.: DB::Merge(key, left_op), followed by DB::Merge(key, right_op)).
+  //
+  // PartialMerge should combine them into a single merge operation that is
+  // saved into *new_value, and then it should return true.
+  // *new_value should be constructed such that a call to
+  // DB::Merge(key, *new_value) would yield the same result as a call
+  // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
+  //
+  // The string that new_value is pointing to will be empty.
+  //
+  // The default implementation of PartialMergeMulti will use this function
+  // as a helper, for backward compatibility. Any successor class of
+  // MergeOperator should either implement PartialMerge or PartialMergeMulti,
+  // although implementing PartialMergeMulti is suggested as it is in general
+  // more effective to merge multiple operands at a time instead of two
+  // operands at a time.
+  //
+  // If it is impossible or infeasible to combine the two operations,
+  // leave new_value unchanged and return false. The library will
+  // internally keep track of the operations, and apply them in the
+  // correct order once a base-value (a Put/Delete/End-of-Database) is seen.
+  //
+  // TODO: Presently there is no way to differentiate between error/corruption
+  // and simply "return false". For now, the client should simply return
+  // false in any case it cannot perform partial-merge, regardless of reason.
+  // If there is corruption in the data, handle it in the FullMergeV2()
+  // function and return false there. The default implementation of
+  // PartialMerge will always return false.
+  virtual bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/,
+                            const Slice& /*right_operand*/,
+                            std::string* /*new_value*/,
+                            Logger* /*logger*/) const {
+    return false;
+  }
+
+  // This function performs merge when all the operands are themselves merge
+  // operation types that you would have passed to a DB::Merge() call in the
+  // same order (front() first)
+  // (i.e. DB::Merge(key, operand_list[0]), followed by
+  //  DB::Merge(key, operand_list[1]), ...)
+  //
+  // PartialMergeMulti should combine them into a single merge operation that
+  // is saved into *new_value, and then it should return true. *new_value
+  // should be constructed such that a call to DB::Merge(key, *new_value)
+  // would yield the same result as sequential individual calls to
+  // DB::Merge(key, operand) for each operand in operand_list from front() to
+  // back().
+  //
+  // The string that new_value is pointing to will be empty.
+  //
+  // The PartialMergeMulti function will be called when there are at least two
+  // operands.
+  //
+  // In the default implementation, PartialMergeMulti will invoke PartialMerge
+  // multiple times, where each time it only merges two operands. Developers
+  // should either implement PartialMergeMulti, or implement PartialMerge
+  // which is served as the helper function of the default PartialMergeMulti.
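+  //
+  // Illustrative sketch (not part of the upstream header): a minimal,
+  // hypothetical counter operator whose operands are decimal strings.
+  // Collapsing ["1", "2", "7"] into the single operand "10" preserves the
+  // semantics, since applying "10" is equivalent to applying the three
+  // operands in order:
+  //
+  //   bool PartialMergeMulti(const Slice& /*key*/,
+  //                          const std::deque<Slice>& operand_list,
+  //                          std::string* new_value,
+  //                          Logger* /*logger*/) const override {
+  //     uint64_t sum = 0;
+  //     for (const Slice& op : operand_list) {
+  //       sum += std::stoull(op.ToString());  // assumes well-formed operands
+  //     }
+  //     *new_value = std::to_string(sum);
+  //     return true;
+  //   }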
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value, Logger* logger) const;
+
+  // The name of the MergeOperator. Used to check for MergeOperator
+  // mismatches (i.e., a DB created with one MergeOperator is
+  // accessed using a different MergeOperator)
+  // TODO: the name is currently not stored persistently and thus
+  //       no checking is enforced. Client is responsible for providing
+  //       consistent MergeOperator between DB opens.
+  virtual const char* Name() const override = 0;
+
+  // Determines whether the PartialMerge can be called with just a single
+  // merge operand.
+  // Override and return true for allowing a single operand. PartialMerge
+  // and PartialMergeMulti should be overridden and implemented
+  // correctly to properly handle a single operand.
+  virtual bool AllowSingleOperand() const { return false; }
+
+  // Allows to control when to invoke a full merge during Get.
+  // This could be used to limit the number of merge operands that are looked
+  // at during a point lookup, thereby helping in limiting the number of
+  // levels to read from.
+  // Doesn't help with iterators.
+  //
+  // Note: the merge operands are passed to this function in the reversed order
+  // relative to how they were merged (passed to FullMerge or FullMergeV2)
+  // for performance reasons, see also:
+  // https://github.com/facebook/rocksdb/issues/3865
+  virtual bool ShouldMerge(const std::vector<Slice>& /*operands*/) const {
+    return false;
+  }
+};
+
+// The simpler, associative merge operator.
+class AssociativeMergeOperator : public MergeOperator {
+ public:
+  ~AssociativeMergeOperator() override {}
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:           (IN) The key that's associated with this merge operation.
+  // existing_value:(IN) null indicates the key does not exist before this op
+  // value:         (IN) the value to update/merge the existing_value with
+  // new_value:    (OUT) Client is responsible for filling the merge result
+  //                     here. The string that new_value is pointing to will
+  //                     be empty.
+  // logger:        (IN) Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. The client should assume that this will be treated
+  // as an error by the library.
+  virtual bool Merge(const Slice& key, const Slice* existing_value,
+                     const Slice& value, std::string* new_value,
+                     Logger* logger) const = 0;
+
+ private:
+  // Default implementations of the MergeOperator functions
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override;
+
+  bool PartialMerge(const Slice& key, const Slice& left_operand,
+                    const Slice& right_operand, std::string* new_value,
+                    Logger* logger) const override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/metadata.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/metadata.h
new file mode 100644
index 0000000000..4ab3842dda
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/metadata.h
@@ -0,0 +1,258 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Basic identifiers and metadata for a file in a DB. This only includes
+// information considered relevant for taking backups, checkpoints, or other
+// services relating to DB file storage.
+// This is only appropriate for immutable files, such as SST files or all
+// files in a backup. See also LiveFileStorageInfo.
+struct FileStorageInfo {
+  // The name of the file within its directory (e.g. "123456.sst")
+  std::string relative_filename;
+  // The directory containing the file, without a trailing '/'. This could be
+  // a DB path, wal_dir, etc.
+  std::string directory;
+
+  // The id of the file within a single DB. Set to 0 if the file does not have
+  // a number (e.g. CURRENT)
+  uint64_t file_number = 0;
+  // The type of the file as part of a DB.
+  FileType file_type = kTempFile;
+
+  // File size in bytes. See also `trim_to_size`.
+  uint64_t size = 0;
+
+  // This feature is experimental and subject to change.
+  Temperature temperature = Temperature::kUnknown;
+
+  // The checksum of a SST file, the value is decided by the file content and
+  // the checksum algorithm used for this SST file. The checksum function is
+  // identified by the file_checksum_func_name. If the checksum function is
+  // not specified, file_checksum is "0" by default.
+  std::string file_checksum;
+
+  // The name of the checksum function used to generate the file checksum
+  // value. If file checksum is not enabled (e.g., sst_file_checksum_func is
+  // null), file_checksum_func_name is UnknownFileChecksumFuncName, which is
+  // "Unknown".
+  std::string file_checksum_func_name;
+};
+
+// Adds to FileStorageInfo the ability to capture the state of files that
+// might change in a running DB.
+struct LiveFileStorageInfo : public FileStorageInfo {
+  // If non-empty, this string represents the "saved" contents of the file
+  // for the current context. (This field is used for checkpointing CURRENT
+  // file.) In that case, size == replacement_contents.size() and file on disk
+  // should be ignored. If empty string, the file on disk should still have
+  // "saved" contents. (See trim_to_size.)
+  std::string replacement_contents;
+
+  // If true, the file on disk is allowed to be larger than `size` but only
+  // the first `size` bytes should be used for the current context. If false,
+  // the file is corrupt if size on disk does not equal `size`.
+  bool trim_to_size = false;
+};
+
+// The metadata that describes an SST file. (Does not need to extend
+// LiveFileStorageInfo because SST files are always immutable.)
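+// (Illustrative, not part of the upstream header: instances of these
+// metadata structs are normally obtained from a live DB rather than
+// constructed by hand, e.g.
+//   std::vector<LiveFileMetaData> live_files;
+//   db->GetLiveFilesMetaData(&live_files);
+// using the DB-level accessors declared in db.h.)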
+struct SstFileMetaData : public FileStorageInfo {
+  SstFileMetaData() { file_type = kTableFile; }
+
+  SstFileMetaData(const std::string& _file_name, uint64_t _file_number,
+                  const std::string& _directory, uint64_t _size,
+                  SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno,
+                  const std::string& _smallestkey,
+                  const std::string& _largestkey, uint64_t _num_reads_sampled,
+                  bool _being_compacted, Temperature _temperature,
+                  uint64_t _oldest_blob_file_number,
+                  uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+                  uint64_t _epoch_number, std::string& _file_checksum,
+                  std::string& _file_checksum_func_name)
+      : smallest_seqno(_smallest_seqno),
+        largest_seqno(_largest_seqno),
+        smallestkey(_smallestkey),
+        largestkey(_largestkey),
+        num_reads_sampled(_num_reads_sampled),
+        being_compacted(_being_compacted),
+        num_entries(0),
+        num_deletions(0),
+        oldest_blob_file_number(_oldest_blob_file_number),
+        oldest_ancester_time(_oldest_ancester_time),
+        file_creation_time(_file_creation_time),
+        epoch_number(_epoch_number) {
+    if (!_file_name.empty()) {
+      if (_file_name[0] == '/') {
+        relative_filename = _file_name.substr(1);
+        name = _file_name;  // Deprecated field
+      } else {
+        relative_filename = _file_name;
+        name = std::string("/") + _file_name;  // Deprecated field
+      }
+      assert(relative_filename.size() + 1 == name.size());
+      assert(relative_filename[0] != '/');
+      assert(name[0] == '/');
+    }
+    directory = _directory;
+    db_path = _directory;  // Deprecated field
+    file_number = _file_number;
+    file_type = kTableFile;
+    size = _size;
+    temperature = _temperature;
+    file_checksum = _file_checksum;
+    file_checksum_func_name = _file_checksum_func_name;
+  }
+
+  SequenceNumber smallest_seqno = 0;  // Smallest sequence number in file.
+  SequenceNumber largest_seqno = 0;   // Largest sequence number in file.
+  std::string smallestkey;            // Smallest user defined key in the file.
+  std::string largestkey;             // Largest user defined key in the file.
+  uint64_t num_reads_sampled = 0;     // How many times the file is read.
+  bool being_compacted =
+      false;  // true if the file is currently being compacted.
+
+  uint64_t num_entries = 0;
+  uint64_t num_deletions = 0;
+
+  uint64_t oldest_blob_file_number = 0;  // The id of the oldest blob file
+                                         // referenced by the file.
+  // An SST file may be generated by compactions whose input files may
+  // in turn be generated by earlier compactions. The creation time of the
+  // oldest SST file that is the compaction ancestor of this file.
+  // The timestamp is provided by SystemClock::GetCurrentTime().
+  // 0 if the information is not available.
+  //
+  // Note: for TTL blob files, it contains the start of the expiration range.
+  uint64_t oldest_ancester_time = 0;
+  // Timestamp when the SST file is created, provided by
+  // SystemClock::GetCurrentTime(). 0 if the information is not available.
+  uint64_t file_creation_time = 0;
+  // The order of a file being flushed or ingested/imported.
+  // Compaction output file will be assigned with the minimum `epoch_number`
+  // among input files'.
+  // For L0, larger `epoch_number` indicates newer L0 file.
+  // 0 if the information is not available.
+  uint64_t epoch_number = 0;
+
+  // These bounds define the effective key range for range tombstones
+  // in this file.
+  // Currently only used by CreateColumnFamilyWithImport().
+  std::string smallest{};  // Smallest internal key served by table
+  std::string largest{};   // Largest internal key served by table
+
+  // DEPRECATED: The name of the file within its directory with a
+  // leading slash (e.g. "/123456.sst"). Use relative_filename from base struct
+  // instead.
+  std::string name;
+
+  // DEPRECATED: replaced by `directory` in base struct
+  std::string db_path;
+};
+
+// The full set of metadata associated with each SST file.
+struct LiveFileMetaData : SstFileMetaData {
+  std::string column_family_name;  // Name of the column family
+  int level;                       // Level at which this file resides.
+  LiveFileMetaData() : column_family_name(), level(0) {}
+};
+
+// The MetaData that describes a Blob file
+struct BlobMetaData {
+  BlobMetaData()
+      : blob_file_number(0),
+        blob_file_size(0),
+        total_blob_count(0),
+        total_blob_bytes(0),
+        garbage_blob_count(0),
+        garbage_blob_bytes(0) {}
+
+  BlobMetaData(uint64_t _file_number, const std::string& _file_name,
+               const std::string& _file_path, uint64_t _file_size,
+               uint64_t _total_blob_count, uint64_t _total_blob_bytes,
+               uint64_t _garbage_blob_count, uint64_t _garbage_blob_bytes,
+               const std::string& _file_checksum,
+               const std::string& _file_checksum_func_name)
+      : blob_file_number(_file_number),
+        blob_file_name(_file_name),
+        blob_file_path(_file_path),
+        blob_file_size(_file_size),
+        total_blob_count(_total_blob_count),
+        total_blob_bytes(_total_blob_bytes),
+        garbage_blob_count(_garbage_blob_count),
+        garbage_blob_bytes(_garbage_blob_bytes),
+        checksum_method(_file_checksum),
+        checksum_value(_file_checksum_func_name) {}
+  uint64_t blob_file_number;
+  std::string blob_file_name;
+  std::string blob_file_path;
+  uint64_t blob_file_size;
+  uint64_t total_blob_count;
+  uint64_t total_blob_bytes;
+  uint64_t garbage_blob_count;
+  uint64_t garbage_blob_bytes;
+  std::string checksum_method;
+  std::string checksum_value;
+};
+
+// The metadata that describes a level.
+struct LevelMetaData {
+  LevelMetaData(int _level, uint64_t _size,
+                const std::vector<SstFileMetaData>&& _files)
+      : level(_level), size(_size), files(_files) {}
+
+  // The level which this meta data describes.
+  const int level;
+  // The size of this level in bytes, which is equal to the sum of
+  // the file size of its "files".
+  const uint64_t size;
+  // The metadata of all sst files in this level.
+  const std::vector<SstFileMetaData> files;
+};
+
+// The metadata that describes a column family.
+struct ColumnFamilyMetaData {
+  ColumnFamilyMetaData() : size(0), file_count(0), name("") {}
+  ColumnFamilyMetaData(const std::string& _name, uint64_t _size,
+                       const std::vector<LevelMetaData>&& _levels)
+      : size(_size), name(_name), levels(_levels) {}
+
+  // The size of this column family in bytes, which is equal to the sum of
+  // the file size of its "levels".
+  uint64_t size;
+  // The number of files in this column family.
+  size_t file_count;
+  // The name of the column family.
+  std::string name;
+  // The metadata of all levels in this column family.
+  std::vector<LevelMetaData> levels;
+
+  // The total size of all blob files
+  uint64_t blob_file_size = 0;
+  // The number of blob files in this column family.
+  size_t blob_file_count = 0;
+  // The metadata of the blobs in this column family.
+  std::vector<BlobMetaData> blob_files;
+};
+
+// Metadata returned as output from ExportColumnFamily() and used as input to
+// CreateColumnFamiliesWithImport().
+struct ExportImportFilesMetaData {
+  std::string db_comparator_name;       // Used to safety check at import.
+  std::vector<LiveFileMetaData> files;  // Vector of file metadata.
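+
+  // Illustrative sketch (not part of the upstream header) of the round trip
+  // this struct supports, assuming the Checkpoint utility and the
+  // single-column-family import API declared elsewhere in the public headers:
+  //
+  //   ExportImportFilesMetaData* metadata = nullptr;
+  //   checkpoint->ExportColumnFamily(cf_handle, export_dir, &metadata);
+  //   ImportColumnFamilyOptions import_opts;
+  //   ColumnFamilyHandle* imported = nullptr;
+  //   db->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "imported_cf",
+  //                                    import_opts, *metadata, &imported);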
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/options.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/options.h
new file mode 100644
index 0000000000..e9a708aa38
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/options.h
@@ -0,0 +1,2083 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/data_structure.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/sst_partitioner.h"
+#include "rocksdb/types.h"
+#include "rocksdb/universal_compaction.h"
+#include "rocksdb/version.h"
+#include "rocksdb/write_buffer_manager.h"
+
+#ifdef max
+#undef max
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+class CompactionFilter;
+class CompactionFilterFactory;
+class Comparator;
+class ConcurrentTaskLimiter;
+class Env;
+enum InfoLogLevel : unsigned char;
+class SstFileManager;
+class FilterPolicy;
+class Logger;
+class MergeOperator;
+class Snapshot;
+class MemTableRepFactory;
+class RateLimiter;
+class Slice;
+class Statistics;
+class InternalKeyComparator;
+class WalFilter;
+class FileSystem;
+
+struct Options;
+struct DbPath;
+
+using FileTypeSet = SmallEnumSet<FileType, FileType::kBlobFile>;
+
+struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
+  // The function recovers options to a previous version. Only 4.6 or later
+  // versions are supported.
+  // NOT MAINTAINED: This function has not been and is not maintained.
+  // DEPRECATED: This function might be removed in a future release.
+  // In general, defaults are changed to suit broad interests. Opting
+  // out of a change on upgrade should be deliberate and considered.
+  ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4,
+                                   int rocksdb_minor_version = 6);
+
+  // Some functions that make it easier to optimize RocksDB
+  // Use this if your DB is very small (like under 1GB) and you don't want to
+  // spend lots of memory for memtables.
+  // An optional cache object is passed in to be used as the block cache
+  ColumnFamilyOptions* OptimizeForSmallDb(
+      std::shared_ptr<Cache>* cache = nullptr);
+
+  // Use this if you don't need to keep the data sorted, i.e. you'll never use
+  // an iterator, only Put() and Get() API calls
+  //
+  ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb);
+
+  // Default values for some parameters in ColumnFamilyOptions are not
+  // optimized for heavy workloads and big datasets, which means you might
+  // observe write stalls under some conditions.
+  // As a starting point for tuning RocksDB options, use the following two
+  // functions:
+  // * OptimizeLevelStyleCompaction -- optimizes level style compaction
+  // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
+  // Universal style compaction is focused on reducing Write Amplification
+  // Factor for big data sets, but increases Space Amplification. You can learn
+  // more about the different styles here:
+  // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
+  // Make sure to also call IncreaseParallelism(), which will provide the
+  // biggest performance gains.
+  // Note: we might use more memory than memtable_memory_budget during high
+  // write rate period
+  ColumnFamilyOptions* OptimizeLevelStyleCompaction(
+      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+  ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
+      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+
+  // -------------------
+  // Parameters that affect behavior
+
+  // Comparator used to define the order of keys in the table.
+  // Default: a comparator that uses lexicographic byte-wise ordering
+  //
+  // REQUIRES: The client must ensure that the comparator supplied
+  // here has the same name and orders keys *exactly* the same as the
+  // comparator provided to previous open calls on the same DB.
+  const Comparator* comparator = BytewiseComparator();
+
+  // REQUIRES: The client must provide a merge operator if Merge operation
+  // needs to be accessed. Calling Merge on a DB without a merge operator
+  // would result in Status::NotSupported. The client must ensure that the
+  // merge operator supplied here has the same name and *exactly* the same
+  // semantics as the merge operator provided to previous open calls on
+  // the same DB. The only exception is reserved for upgrade, where a DB
+  // previously without a merge operator is introduced to Merge operation
+  // for the first time. It's necessary to specify a merge operator when
+  // opening the DB in this case.
+  // Default: nullptr
+  std::shared_ptr<MergeOperator> merge_operator = nullptr;
+
+  // A single CompactionFilter instance to call into during compaction.
+  // Allows an application to modify/delete a key-value during background
+  // compaction.
+  //
+  // If the client requires a new `CompactionFilter` to be used for different
+  // compaction runs and/or requires a `CompactionFilter` for table file
+  // creations outside of compaction, it can specify compaction_filter_factory
+  // instead of this option. The client should specify only one of the two.
+  // compaction_filter takes precedence over compaction_filter_factory if
+  // client specifies both.
+  //
+  // If multithreaded compaction is being used, the supplied CompactionFilter
+  // instance may be used from different threads concurrently and so should be
+  // thread-safe.
+  //
+  // Default: nullptr
+  const CompactionFilter* compaction_filter = nullptr;
+
+  // This is a factory that provides `CompactionFilter` objects which allow
+  // an application to modify/delete a key-value during table file creation.
+  //
+  // Unlike the `compaction_filter` option, which is used when compaction
+  // creates a table file, this factory allows using a `CompactionFilter` when
+  // a table file is created for various reasons. The factory can decide what
+  // `TableFileCreationReason`s use a `CompactionFilter`. For compatibility, by
+  // default the decision is to use a `CompactionFilter` for
+  // `TableFileCreationReason::kCompaction` only.
+  //
+  // Each thread of work involving creating table files will create a new
+  // `CompactionFilter` when it will be used according to the above
+  // `TableFileCreationReason`-based decision. This allows the application to
+  // know about the different ongoing threads of work and makes it unnecessary
+  // for `CompactionFilter` to provide thread-safety.
+  //
+  // Default: nullptr
+  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
+
+  // -------------------
+  // Parameters that affect performance
+
+  // Amount of data to build up in memory (backed by an unsorted log
+  // on disk) before converting to a sorted on-disk file.
+  //
+  // Larger values increase performance, especially during bulk loads.
+  // Up to max_write_buffer_number write buffers may be held in memory
+  // at the same time,
+  // so you may wish to adjust this parameter to control memory usage.
+  // Also, a larger write buffer will result in a longer recovery time
+  // the next time the database is opened.
+  //
+  // Note that write_buffer_size is enforced per column family.
+  // See db_write_buffer_size for sharing memory across column families.
+  //
+  // Default: 64MB
+  //
+  // Dynamically changeable through SetOptions() API
+  size_t write_buffer_size = 64 << 20;
+
+  // Compress blocks using the specified compression algorithm.
+  //
+  // Default: kSnappyCompression, if it's supported. If snappy is not linked
+  // with the library, the default is kNoCompression.
+  //
+  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+  //    ~200-500MB/s compression
+  //    ~400-800MB/s decompression
+  //
+  // Note that these speeds are significantly faster than most
+  // persistent storage speeds, and therefore it is typically never
+  // worth switching to kNoCompression. Even if the input data is
+  // incompressible, the kSnappyCompression implementation will
+  // efficiently detect that and will switch to uncompressed mode.
+  //
+  // If you do not set `compression_opts.level`, or set it to
+  // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick
+  // the default corresponding to `compression` as follows:
+  //
+  // - kZSTD: 3
+  // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1)
+  // - kLZ4HCCompression: 0
+  // - For all others, we do not specify a compression level
+  //
+  // Dynamically changeable through SetOptions() API
+  CompressionType compression;
+
+  // Compression algorithm that will be used for the bottommost level that
+  // contain files. The behavior for num_levels = 1 is not well defined.
+  // Right now, with num_levels = 1, all compaction outputs will use
+  // bottommost_compression and all flush outputs still use
+  // options.compression, but the behavior is subject to change.
+  //
+  // Default: kDisableCompressionOption (Disabled)
+  CompressionType bottommost_compression = kDisableCompressionOption;
+
+  // different options for compression algorithms used by
+  // bottommost_compression if it is enabled. To enable it, please see the
+  // definition of CompressionOptions. Behavior for num_levels = 1 is the same
+  // as options.bottommost_compression.
+  CompressionOptions bottommost_compression_opts;
+
+  // different options for compression algorithms
+  CompressionOptions compression_opts;
+
+  // Number of files to trigger level-0 compaction. A value < 0 means that
+  // level-0 compaction will not be triggered by number of files at all.
+  //
+  // Default: 4
+  //
+  // Dynamically changeable through SetOptions() API
+  int level0_file_num_compaction_trigger = 4;
+
+  // If non-nullptr, use the specified function to put keys in contiguous
+  // groups called "prefixes". These prefixes are used to place one
+  // representative entry for the group into the Bloom filter
+  // rather than an entry for each key (see whole_key_filtering).
+  // Under certain conditions, this enables optimizing some range queries
+  // (Iterators) in addition to some point lookups (Get/MultiGet).
+  //
+  // Together `prefix_extractor` and `comparator` must satisfy one essential
+  // property for valid prefix filtering of range queries:
+  //
+  //   If Compare(k1, k2) <= 0 and Compare(k2, k3) <= 0 and
+  //   InDomain(k1) and InDomain(k3) and prefix(k1) == prefix(k3),
+  //   Then InDomain(k2) and prefix(k2) == prefix(k1)
+  //
+  // In other words, all keys with the same prefix must be in a contiguous
+  // group by comparator order, and cannot be interrupted by keys with no
+  // prefix ("out of domain"). (This makes it valid to conclude that no
+  // entries within some bounds are present if the upper and lower bounds
+  // have a common prefix and no entries with that same prefix are present.)
+  //
+  // Some other properties are recommended but not strictly required. Under
+  // most sensible comparators, the following will need to hold true to
+  // satisfy the essential property above:
+  // * "Prefix is a prefix": key.starts_with(prefix(key))
+  // * "Prefixes preserve ordering": If Compare(k1, k2) <= 0, then
+  //   Compare(prefix(k1), prefix(k2)) <= 0
+  //
+  // The next two properties ensure that seeking to a prefix allows
+  // enumerating all entries with that prefix:
+  // * "Prefix starts the group": Compare(prefix(key), key) <= 0
+  // * "Prefix idempotent": prefix(prefix(key)) == prefix(key)
+  //
+  // Default: nullptr
+  std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
+
+  // Control maximum total data size for a level.
+  // max_bytes_for_level_base is the max total for level-1.
+  // Maximum number of bytes for level L can be calculated as
+  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
+  // For example, if max_bytes_for_level_base is 200MB, and if
+  // max_bytes_for_level_multiplier is 10, total data size for level-1
+  // will be 200MB, total file size for level-2 will be 2GB,
+  // and total file size for level-3 will be 20GB.
+  //
+  // Default: 256MB.
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t max_bytes_for_level_base = 256 * 1048576;
+
+  // Deprecated.
+  uint64_t snap_refresh_nanos = 0;
+
+  // Disable automatic compactions. Manual compactions can still
+  // be issued on this column family
+  //
+  // Dynamically changeable through SetOptions() API
+  bool disable_auto_compactions = false;
+
+  // This is a factory that provides TableFactory objects.
+  // Default: a block-based table factory that provides a default
+  // implementation of TableBuilder and TableReader with default
+  // BlockBasedTableOptions.
+  std::shared_ptr<TableFactory> table_factory;
+
+  // A list of paths where SST files for this column family
+  // can be put into, with its target size. Similar to db_paths,
+  // newer data is placed into paths specified earlier in the
+  // vector while older data gradually moves to paths specified
+  // later in the vector.
+  // Note that, if a path is supplied to multiple column
+  // families, it would have files and total size from all
+  // the column families combined.
+  // User should provision for the total size (from all the column families)
+  // in such cases.
+  //
+  // If left empty, db_paths will be used.
+  // Default: empty
+  std::vector<DbPath> cf_paths;
+
+  // Compaction concurrent thread limiter for the column family.
+  // If non-nullptr, use given concurrent thread limiter to control
+  // the max outstanding compaction tasks. Limiter can be shared with
+  // multiple column families across db instances.
+  //
+  // Default: nullptr
+  std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter = nullptr;
+
+  // If non-nullptr, use the specified factory for a function to determine the
+  // partitioning of sst files. This helps compaction to split the files
+  // on interesting boundaries (key prefixes) to make propagation of sst
+  // files less write amplifying (covering the whole key space).
+  // THE FEATURE IS STILL EXPERIMENTAL
+  //
+  // Default: nullptr
+  std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory = nullptr;
+
+  // Create ColumnFamilyOptions with default values for all fields
+  ColumnFamilyOptions();
+  // Create ColumnFamilyOptions from Options
+  explicit ColumnFamilyOptions(const Options& options);
+
+  void Dump(Logger* log) const;
+};
+
+enum class WALRecoveryMode : char {
+  // Original levelDB recovery
+  //
+  // We tolerate the last record in any log to be incomplete due to a crash
+  // while writing it. Zeroed bytes from preallocation are also tolerated in
+  // the trailing data of any log.
+  //
+  // Use case: Applications for which updates, once applied, must not be
+  // rolled back even after a crash-recovery. In this recovery mode, RocksDB
+  // guarantees this as long as `WritableFile::Append()` writes are durable.
+  // In case the user needs the guarantee in more situations (e.g., when
+  // `WritableFile::Append()` writes to page cache, but the user desires this
+  // guarantee in face of power-loss crash-recovery), RocksDB offers various
+  // mechanisms to additionally invoke `WritableFile::Sync()` in order to
+  // strengthen the guarantee.
+  //
+  // This differs from `kPointInTimeRecovery` in that, in case a corruption is
+  // detected during recovery, this mode will refuse to open the DB. Whereas,
+  // `kPointInTimeRecovery` will stop recovery just before the corruption since
+  // that is a valid point-in-time to which to recover.
+  kTolerateCorruptedTailRecords = 0x00,
+  // Recover from clean shutdown
+  // We don't expect to find any corruption in the WAL
+  // Use case : This is ideal for unit tests and rare applications that
+  // can require high consistency guarantee
+  kAbsoluteConsistency = 0x01,
+  // Recover to point-in-time consistency (default)
+  // We stop the WAL playback on discovering WAL inconsistency
+  // Use case : Ideal for systems that have disk controller cache like
+  // hard disk, SSD without super capacitor that store related data
+  kPointInTimeRecovery = 0x02,
+  // Recovery after a disaster
+  // We ignore any corruption in the WAL and try to salvage as much data as
+  // possible
+  // Use case : Ideal for last ditch effort to recover data or systems that
+  // operate with low grade unrelated data
+  kSkipAnyCorruptedRecords = 0x03,
+};
+
+struct DbPath {
+  std::string path;
+  uint64_t target_size;  // Target size of total files under the path, in
+                         // bytes.
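+
+  // Illustrative values (not part of the upstream header), for a typical
+  // entry in the cf_paths option above or the db_paths option below:
+  //   DbPath("/flash_path", 10ULL << 30)  // ~10GB budget on fast storage
+  //   DbPath("/hard_drive", 2ULL << 40)   // ~2TB budget on slow storage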
+
+  DbPath() : target_size(0) {}
+  DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
+};
+
+extern const char* kHostnameForDbHostId;
+
+enum class CompactionServiceJobStatus : char {
+  kSuccess,
+  kFailure,
+  kUseLocal,
+};
+
+struct CompactionServiceJobInfo {
+  std::string db_name;
+  std::string db_id;
+  std::string db_session_id;
+  uint64_t job_id;  // job_id is only unique within the current DB and
+                    // session, restart DB will reset the job_id. `db_id` and
+                    // `db_session_id` could help you build unique id across
+                    // different DBs and sessions.
+
+  Env::Priority priority;
+
+  CompactionServiceJobInfo(std::string db_name_, std::string db_id_,
+                           std::string db_session_id_, uint64_t job_id_,
+                           Env::Priority priority_)
+      : db_name(std::move(db_name_)),
+        db_id(std::move(db_id_)),
+        db_session_id(std::move(db_session_id_)),
+        job_id(job_id_),
+        priority(priority_) {}
+};
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class CompactionService : public Customizable {
+ public:
+  static const char* Type() { return "CompactionService"; }
+
+  // Returns the name of this compaction service.
+  const char* Name() const override = 0;
+
+  // Start the remote compaction with `compaction_service_input`, which can be
+  // passed to `DB::OpenAndCompact()` on the remote side. `info` provides the
+  // information the user might want to know, which includes `job_id`.
+  virtual CompactionServiceJobStatus StartV2(
+      const CompactionServiceJobInfo& /*info*/,
+      const std::string& /*compaction_service_input*/) {
+    return CompactionServiceJobStatus::kUseLocal;
+  }
+
+  // Wait for remote compaction to finish.
+  virtual CompactionServiceJobStatus WaitForCompleteV2(
+      const CompactionServiceJobInfo& /*info*/,
+      std::string* /*compaction_service_result*/) {
+    return CompactionServiceJobStatus::kUseLocal;
+  }
+
+  ~CompactionService() override = default;
+};
+
+struct DBOptions {
+  // The function recovers options to the option as in version 4.6.
+  // NOT MAINTAINED: This function has not been and is not maintained.
+  // DEPRECATED: This function might be removed in a future release.
+  // In general, defaults are changed to suit broad interests. Opting
+  // out of a change on upgrade should be deliberate and considered.
+  DBOptions* OldDefaults(int rocksdb_major_version = 4,
+                         int rocksdb_minor_version = 6);
+
+  // Some functions that make it easier to optimize RocksDB
+
+  // Use this if your DB is very small (like under 1GB) and you don't want to
+  // spend lots of memory for memtables.
+  // An optional cache object is passed in for the memory of the
+  // memtable to cost to
+  DBOptions* OptimizeForSmallDb(std::shared_ptr<Cache>* cache = nullptr);
+
+  // By default, RocksDB uses only one background thread for flush and
+  // compaction. Calling this function will set it up such that total of
+  // `total_threads` is used. Good value for `total_threads` is the number of
+  // cores. You almost definitely want to call this function if your system is
+  // bottlenecked by RocksDB.
+  DBOptions* IncreaseParallelism(int total_threads = 16);
+
+  // If true, the database will be created if it is missing.
+  // Default: false
+  bool create_if_missing = false;
+
+  // If true, missing column families will be automatically created.
+  // Default: false
+  bool create_missing_column_families = false;
+
+  // If true, an error is raised if the database already exists.
+  // Default: false
+  bool error_if_exists = false;
+
+  // If true, RocksDB will aggressively check consistency of the data.
+  // Also, if any of the writes to the database fails (Put, Delete, Merge,
+  // Write), the database will switch to read-only mode and fail all other
+  // Write operations.
+  // In most cases you want this to be set to true.
+  // Default: true
+  bool paranoid_checks = true;
+
+  // If true, during memtable flush, RocksDB will validate total entries
+  // read in flush, and compare with counter inserted into it.
+  // The option is here to turn the feature off in case this new validation
+  // feature has a bug.
+  // Default: true
+  bool flush_verify_memtable_count = true;
+
+  // If true, the log numbers and sizes of the synced WALs are tracked
+  // in MANIFEST. During DB recovery, if a synced WAL is missing
+  // from disk, or the WAL's size does not match the recorded size in
+  // MANIFEST, an error will be reported and the recovery will be aborted.
+  //
+  // This is one additional protection against WAL corruption besides the
+  // per-WAL-entry checksum.
+  //
+  // Note that this option does not work with secondary instance.
+  // Currently, only syncing closed WALs are tracked. Calling `DB::SyncWAL()`,
+  // etc. or writing with `WriteOptions::sync=true` to sync the live WAL is not
+  // tracked for performance/efficiency reasons.
+  //
+  // Default: false
+  bool track_and_verify_wals_in_manifest = false;
+
+  // If true, verifies the SST unique id between MANIFEST and actual file
+  // each time an SST file is opened. This check ensures an SST file is not
+  // overwritten or misplaced. A corruption error will be reported if mismatch
+  // detected, but only when MANIFEST tracks the unique id, which starts from
+  // RocksDB version 7.3. Although the tracked internal unique id is related
+  // to the one returned by GetUniqueIdFromTableProperties, that is subject to
+  // change.
+  // NOTE: verification is currently only done on SST files using block-based
+  // table format.
+  //
+  // Setting to false should only be needed in case of unexpected problems.
+  //
+  // Although an early version of this option opened all SST files for
+  // verification on DB::Open, that is no longer guaranteed. However, as
+  // documented in an above option, if max_open_files is -1, DB will open all
+  // files on DB::Open().
+  //
+  // Default: true
+  bool verify_sst_unique_id_in_manifest = true;
+
+  // Use the specified object to interact with the environment,
+  // e.g. to read/write files, schedule background work, etc. In the near
+  // future, support for doing storage operations such as read/write files
+  // through env will be deprecated in favor of file_system (see below)
+  // Default: Env::Default()
+  Env* env = Env::Default();
+
+  // Limits internal file read/write bandwidth:
+  //
+  // - Flush requests write bandwidth at `Env::IOPriority::IO_HIGH`
+  // - Compaction requests read and write bandwidth at
+  //   `Env::IOPriority::IO_LOW`
+  // - Reads associated with a `ReadOptions` can be charged at
+  //   `ReadOptions::rate_limiter_priority` (see that option's API doc for
+  //   usage and limitations).
+  // - Writes associated with a `WriteOptions` can be charged at
+  //   `WriteOptions::rate_limiter_priority` (see that option's API doc for
+  //   usage and limitations).
+  //
+  // Rate limiting is disabled if nullptr.
+  // If rate limiter is enabled, bytes_per_sync is set to 1MB by default.
+  //
+  // Default: nullptr
+  std::shared_ptr<RateLimiter> rate_limiter = nullptr;
+
+  // Use to track SST files and control their file deletion rate.
+  //
+  // Features:
+  //  - Throttle the deletion rate of the SST files.
+  //  - Keep track the total size of all SST files.
+  //  - Set a maximum allowed space limit for SST files that when reached
+  //    the DB won't do any further flushes or compactions and will set the
+  //    background error.
+  //  - Can be shared between multiple dbs.
+  // Limitations:
+  //  - Only track and throttle deletes of SST files in
+  //    first db_path (db_name if db_paths is empty).
+  //
+  // Default: nullptr
+  std::shared_ptr<SstFileManager> sst_file_manager = nullptr;
+
+  // Any internal progress/error information generated by the db will
+  // be written to info_log if it is non-nullptr, or to a file stored
+  // in the same directory as the DB contents if info_log is nullptr.
+  // Default: nullptr
+  std::shared_ptr<Logger> info_log = nullptr;
+
+#ifdef NDEBUG
+  InfoLogLevel info_log_level = INFO_LEVEL;
+#else
+  InfoLogLevel info_log_level = DEBUG_LEVEL;
+#endif  // NDEBUG
+
+  // Number of open files that can be used by the DB. You may need to
+  // increase this if your database has a large working set. Value -1 means
+  // files opened are always kept open. You can estimate number of files based
+  // on target_file_size_base and target_file_size_multiplier for level-based
+  // compaction. For universal-style compaction, you can usually set it to -1.
+  //
+  // A high value or -1 for this option can cause high memory usage.
+  // See BlockBasedTableOptions::cache_usage_options to constrain
+  // memory usage in case of block based table format.
+  //
+  // Default: -1
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  int max_open_files = -1;
+
+  // If max_open_files is -1, DB will open all files on DB::Open(). You can
+  // use this option to increase the number of threads used to open the files.
+  // Default: 16
+  int max_file_opening_threads = 16;
+
+  // Once write-ahead logs exceed this size, we will start forcing the flush of
+  // column families whose memtables are backed by the oldest live WAL file
+  // (i.e. the ones that are causing all the space amplification). If set to 0
+  // (default), we will dynamically choose the WAL size limit to be
+  // [sum of all write_buffer_size * max_write_buffer_number] * 4
+  //
+  // For example, with 15 column families, each with
+  //   write_buffer_size = 128 MB
+  //   max_write_buffer_number = 6
+  // max_total_wal_size will be calculated to be [15 * 128MB * 6] * 4 = 45GB
+  //
+  // The RocksDB wiki has some discussion about how the WAL interacts
+  // with memtables and flushing of column families.
+  // https://github.com/facebook/rocksdb/wiki/Column-Families
+  //
+  // This option takes effect only when there are more than one column
+  // family as otherwise the wal size is dictated by the write_buffer_size.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  uint64_t max_total_wal_size = 0;
+
+  // If non-null, then we should collect metrics about database operations
+  std::shared_ptr<Statistics> statistics = nullptr;
+
+  // By default, writes to stable storage use fdatasync (on platforms
+  // where this function is available). If this option is true,
+  // fsync is used instead.
+  //
+  // fsync and fdatasync are equally safe for our purposes and fdatasync is
+  // faster, so it is rarely necessary to set this option.
+  // It is provided as a workaround for kernel/filesystem bugs, such as one
+  // that affected fdatasync with ext4 in kernel versions prior to 3.7.
+  bool use_fsync = false;
+
+  // A list of paths where SST files can be put into, with its target size.
+  // Newer data is placed into paths specified earlier in the vector while
+  // older data gradually moves to paths specified later in the vector.
+  //
+  // For example, you have a flash device with 10GB allocated for the DB,
+  // as well as a hard drive of 2TB, you should config it to be:
+  //   [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
+  //
+  // The system will try to guarantee data under each path is close to but
+  // not larger than the target size. But current and future file sizes used
+  // by determining where to place a file are based on best-effort estimation,
+  // which means there is a chance that the actual size under the directory
+  // is slightly more than target size under some workloads. User should give
+  // some buffer room for those cases.
+  //
+  // If none of the paths has sufficient room to place a file, the file will
+  // be placed in the last path anyway, regardless of the target size.
+  //
+  // Placing newer data to earlier paths is also best-efforts. User should
+  // expect user files to be placed in higher levels in some extreme cases.
+  //
+  // If left empty, only one path will be used, which is db_name passed when
+  // opening the DB.
+  // Default: empty
+  std::vector<DbPath> db_paths;
+
+  // This specifies the info LOG dir.
+  // If it is empty, the log files will be in the same dir as data.
+  // If it is non empty, the log files will be in the specified dir,
+  // and the db data dir's absolute path will be used as the log file
+  // name's prefix.
+  std::string db_log_dir = "";
+
+  // This specifies the absolute dir path for write-ahead logs (WAL).
+  // If it is empty, the log files will be in the same dir as data,
+  //   dbname is used as the data dir by default
+  // If it is non empty, the log files will be kept in the specified dir.
+  // When destroying the db,
+  //   all log files in wal_dir and the dir itself is deleted
+  std::string wal_dir = "";
+
+  // The periodicity when obsolete files get deleted. The default
+  // value is 6 hours. The files that get out of scope by compaction
+  // process will still get automatically deleted on every compaction,
+  // regardless of this setting
+  //
+  // Default: 6 hours
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000;
+
+  // Maximum number of concurrent background jobs (compactions and flushes).
+  //
+  // Default: 2
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  int max_background_jobs = 2;
+
+  // DEPRECATED: RocksDB automatically decides this based on the
+  // value of max_background_jobs. For backwards compatibility we will set
+  // `max_background_jobs = max_background_compactions +
+  // max_background_flushes` in the case where user sets at least one of
+  // `max_background_compactions` or `max_background_flushes` (we replace -1
+  // by 1 in case one option is unset).
+  //
+  // Maximum number of concurrent background compaction jobs, submitted to
+  // the default LOW priority thread pool.
+  //
+  // If you're increasing this, also consider increasing number of threads in
+  // LOW priority thread pool. For more information, see
+  // Env::SetBackgroundThreads
+  //
+  // Default: -1
+  //
+  // Dynamically changeable through SetDBOptions() API.
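+  //
+  // Illustrative sketch (not part of the upstream header): per the
+  // deprecation note above, new code would typically size background work
+  // through the higher-level knobs instead, e.g.
+  //   DBOptions opts;
+  //   opts.IncreaseParallelism(16);  // sizes the LOW/HIGH thread pools
+  //   opts.max_background_jobs = 8;  // RocksDB splits flushes/compactions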
+  int max_background_compactions = -1;
+
+  // This value represents the maximum number of threads that will
+  // concurrently perform a compaction job by breaking it into multiple,
+  // smaller ones that are run simultaneously.
+  // Default: 1 (i.e. no subcompactions)
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  uint32_t max_subcompactions = 1;
+
+  // DEPRECATED: RocksDB automatically decides this based on the
+  // value of max_background_jobs. For backwards compatibility we will set
+  // `max_background_jobs = max_background_compactions +
+  // max_background_flushes` in the case where user sets at least one of
+  // `max_background_compactions` or `max_background_flushes`.
+  //
+  // Maximum number of concurrent background memtable flush jobs, submitted by
+  // default to the HIGH priority thread pool. If the HIGH priority thread
+  // pool is configured to have zero threads, flush jobs will share the LOW
+  // priority thread pool with compaction jobs.
+  //
+  // It is important to use both thread pools when the same Env is shared by
+  // multiple db instances. Without a separate pool, long running compaction
+  // jobs could potentially block memtable flush jobs of other db instances,
+  // leading to unnecessary Put stalls.
+  //
+  // If you're increasing this, also consider increasing number of threads in
+  // HIGH priority thread pool. For more information, see
+  // Env::SetBackgroundThreads
+  // Default: -1
+  int max_background_flushes = -1;
+
+  // Specify the maximal size of the info log file. If the log file
+  // is larger than `max_log_file_size`, a new info log file will
+  // be created.
+  // If max_log_file_size == 0, all logs will be written to one
+  // log file.
+  size_t max_log_file_size = 0;
+
+  // Time for the info log file to roll (in seconds).
+  // If specified with non-zero value, log file will be rolled
+  // if it has been active longer than `log_file_time_to_roll`.
+  // Default: 0 (disabled)
+  size_t log_file_time_to_roll = 0;
+
+  // Maximal info log files to be kept.
+  // Default: 1000
+  size_t keep_log_file_num = 1000;
+
+  // Recycle log files.
+  // If non-zero, we will reuse previously written log files for new
+  // logs, overwriting the old data. The value indicates how many
+  // such files we will keep around at any point in time for later
+  // use. This is more efficient because the blocks are already
+  // allocated and fdatasync does not need to update the inode after
+  // each write.
+  // Default: 0
+  size_t recycle_log_file_num = 0;
+
+  // The manifest file is rolled over on reaching this limit.
+  // The older manifest file will be deleted.
+  // The default value is 1GB so that the manifest file can grow, but not
+  // reach the limit of storage capacity.
+  uint64_t max_manifest_file_size = 1024 * 1024 * 1024;
+
+  // Number of shards used for table cache.
+  int table_cache_numshardbits = 6;
+
+  // The following two fields affect how archived logs will be deleted.
+  // 1. If both set to 0, logs will be deleted asap and will not get into
+  //    the archive.
+  // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+  //    WAL files will be checked every 10 min and if total size is greater
+  //    than WAL_size_limit_MB, they will be deleted starting with the
+  //    earliest until size_limit is met. All empty files will be deleted.
+  // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+  //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+  //    are older than WAL_ttl_seconds will be deleted.
+  // 4. If both are not 0, WAL files will be checked every 10 min and both
+  //    checks will be performed with ttl being first.
+  uint64_t WAL_ttl_seconds = 0;
+  uint64_t WAL_size_limit_MB = 0;
+
+  // Number of bytes to preallocate (via fallocate) the manifest
+  // files. Default is 4mb, which is reasonable to reduce random IO
+  // as well as prevent overallocation for mounts that preallocate
+  // large amounts of data (such as xfs's allocsize option).
+  size_t manifest_preallocation_size = 4 * 1024 * 1024;
+
+  // Allow the OS to mmap file for reading sst tables.
+  // Not recommended for 32-bit OS.
+  // When the option is set to true and compression is disabled, the blocks
+  // will not be copied and will be read directly from the mmap-ed memory
+  // area, and the block will not be inserted into the block cache. However,
+  // checksums will still be checked if ReadOptions.verify_checksums is set
+  // to be true. It means a checksum check every time a block is read, more
+  // than the setup where the option is set to false and the block cache is
+  // used. The common use of the options is to run RocksDB on ramfs, where
+  // checksum verification is usually not needed.
+  // Default: false
+  bool allow_mmap_reads = false;
+
+  // Allow the OS to mmap file for writing.
+  // DB::SyncWAL() only works if this is set to false.
+  // Default: false
+  bool allow_mmap_writes = false;
+
+  // Enable direct I/O mode for read/write
+  // they may or may not improve performance depending on the use case
+  //
+  // Files will be opened in "direct I/O" mode
+  // which means that data r/w from the disk will not be cached or
+  // buffered. The hardware buffer of the devices may however still
+  // be used. Memory mapped files are not impacted by these parameters.
+
+  // Use O_DIRECT for user and compaction reads.
+  // Default: false
+  bool use_direct_reads = false;
+
+  // Use O_DIRECT for writes in background flush and compactions.
+  // Default: false
+  bool use_direct_io_for_flush_and_compaction = false;
+
+  // If false, fallocate() calls are bypassed, which disables file
+  // preallocation. The file space preallocation is used to increase the file
+  // write/append performance. By default, RocksDB preallocates space for WAL,
+  // SST, Manifest files, the extra space is truncated when the file is
+  // written. Warning: if you're using btrfs, we would recommend setting
+  // `allow_fallocate=false` to disable preallocation. As on btrfs, the extra
+  // allocated space cannot be freed, which could be significant if you have
+  // lots of files. More details about this limitation:
+  // https://github.com/btrfs/btrfs-dev-docs/blob/471c5699336e043114d4bca02adcd57d9dab9c44/data-extent-reference-counts.md
+  bool allow_fallocate = true;
+
+  // Disable child process inherit open files. Default: true
+  bool is_fd_close_on_exec = true;
+
+  // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+  //
+  // Default: 600 (10 min)
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  unsigned int stats_dump_period_sec = 600;
+
+  // if not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec
+  // Default: 600
+  unsigned int stats_persist_period_sec = 600;
+
+  // If true, automatically persist stats to a hidden column family (column
+  // family name: ___rocksdb_stats_history___) every
+  // stats_persist_period_sec seconds; otherwise, write to an in-memory
+  // struct. User can query through `GetStatsHistory` API.
+ // If true, automatically persist stats to a hidden column family (column
+ // family name: ___rocksdb_stats_history___) every
+ // stats_persist_period_sec seconds; otherwise, write to an in-memory
+ // struct. User can query through `GetStatsHistory` API.
+ // If user attempts to create a column family with the same name on a DB
+ // which has previously set persist_stats_to_disk to true, the column family
+ // creation will fail, but the hidden column family will survive, as well as
+ // the previously persisted statistics.
+ // When persisting stats to disk, the stat name will be limited to 100 bytes.
+ // Default: false
+ bool persist_stats_to_disk = false;
+
+ // if not zero, periodically take stats snapshots and store in memory; the
+ // memory size for stats snapshots is capped at stats_history_buffer_size
+ // Default: 1MB
+ size_t stats_history_buffer_size = 1024 * 1024;
+
+ // If set to true, will hint the underlying file system that the file
+ // access pattern is random when an sst file is opened.
+ // Default: true
+ bool advise_random_on_open = true;
+
+ // Amount of data to build up in memtables across all column
+ // families before writing to disk.
+ //
+ // This is distinct from write_buffer_size, which enforces a limit
+ // for a single memtable.
+ //
+ // This feature is disabled by default. Specify a non-zero value
+ // to enable it.
+ //
+ // Default: 0 (disabled)
+ size_t db_write_buffer_size = 0;
+
+ // The memory usage of memtables will be reported to this object. The same
+ // object can be passed into multiple DBs and it will track the sum of sizes
+ // of all the DBs. If the total size of all live memtables of all the DBs
+ // exceeds a limit, a flush will be triggered in the next DB to which the
+ // next write is issued, as long as there is one or more column family not
+ // already flushing.
+ //
+ // If the object is only passed to one DB, the behavior is the same as
+ // db_write_buffer_size. When write_buffer_manager is set, the value set will
+ // override db_write_buffer_size.
+ //
+ // This feature is disabled by default. Specify a non-zero value
+ // to enable it.
+ //
+ // Default: null
+ std::shared_ptr<WriteBufferManager> write_buffer_manager = nullptr;
+
+ // Specify the file access pattern once a compaction is started.
+ // It will be applied to all input files of a compaction.
+ // Default: NORMAL
+ enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED };
+ AccessHint access_hint_on_compaction_start = NORMAL;
+
+ // If non-zero, we perform bigger reads when doing compaction. If you're
+ // running RocksDB on spinning disks, you should set this to at least 2MB.
+ // That way RocksDB's compaction is doing sequential instead of random reads.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ size_t compaction_readahead_size = 0;
+
+ // This is a maximum buffer size that is used by WinMmapReadableFile in
+ // unbuffered disk I/O mode. We need to maintain an aligned buffer for
+ // reads. We allow the buffer to grow until the specified value and then
+ // for bigger requests allocate one shot buffers. In unbuffered mode we
+ // always bypass read-ahead buffer at ReadaheadRandomAccessFile
+ // When read-ahead is required we then make use of compaction_readahead_size
+ // value and always try to read ahead. With read-ahead we always
+ // pre-allocate buffer to the size instead of growing it up to a limit.
+ //
+ // This option is currently honored only on Windows
+ //
+ // Default: 1 MB
+ //
+ // Special value: 0 - means do not maintain per instance buffer. Allocate
+ // per request buffer and avoid locking.
+ size_t random_access_max_buffer_size = 1024 * 1024;
+
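+ // Illustrative sketch (sizes hypothetical; WriteBufferManager is declared
+ // in rocksdb/write_buffer_manager.h): capping total memtable memory across
+ // two databases by sharing one manager:
+ //
+ //   auto wbm =
+ //       std::make_shared<rocksdb::WriteBufferManager>(512 * 1024 * 1024);
+ //   options_db1.write_buffer_manager = wbm;  // same object in both DBs
+ //   options_db2.write_buffer_manager = wbm;
+ //
+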
+ // This is the maximum buffer size that is used by WritableFileWriter.
+ // With direct IO, we need to maintain an aligned buffer for writes.
+ // We allow the buffer to grow until its size hits the limit in buffered
+ // IO and fix the buffer size when using direct IO to ensure alignment of
+ // write requests if the logical sector size is unusual
+ //
+ // Default: 1024 * 1024 (1 MB)
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ size_t writable_file_max_buffer_size = 1024 * 1024;
+
+ // Use adaptive mutex, which spins in the user space before resorting
+ // to the kernel. This could reduce context switches when the mutex is not
+ // heavily contended. However, if the mutex is hot, we could end up
+ // wasting spin time.
+ // Default: false
+ bool use_adaptive_mutex = false;
+
+ // Create DBOptions with default values for all fields
+ DBOptions();
+ // Create DBOptions from Options
+ explicit DBOptions(const Options& options);
+
+ void Dump(Logger* log) const;
+
+ // Allows OS to incrementally sync files to disk while they are being
+ // written, asynchronously, in the background. This operation can be used
+ // to smooth out write I/Os over time. Users shouldn't rely on it for
+ // persistence guarantees.
+ // Issue one request for every bytes_per_sync written. 0 turns it off.
+ //
+ // You may consider using rate_limiter to regulate write rate to device.
+ // When rate limiter is enabled, it automatically enables bytes_per_sync
+ // to 1MB.
+ //
+ // This option applies to table files
+ //
+ // Default: 0, turned off
+ //
+ // Note: DOES NOT apply to WAL files. See wal_bytes_per_sync instead
+ // Dynamically changeable through SetDBOptions() API.
+ uint64_t bytes_per_sync = 0;
+
+ // Same as bytes_per_sync, but applies to WAL files
+ //
+ // Default: 0, turned off
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ uint64_t wal_bytes_per_sync = 0;
+
+ // When true, guarantees WAL files have at most `wal_bytes_per_sync`
+ // bytes submitted for writeback at any given time, and SST files have at most
+ // `bytes_per_sync` bytes pending writeback at any given time. This can be
+ // used to handle cases where processing speed exceeds I/O speed during file
+ // generation, which can lead to a huge sync when the file is finished, even
+ // with `bytes_per_sync` / `wal_bytes_per_sync` properly configured.
+ //
+ // - If `sync_file_range` is supported it achieves this by waiting for any
+ //   prior `sync_file_range`s to finish before proceeding. In this way,
+ //   processing (compression, etc.) can proceed uninhibited in the gap
+ //   between `sync_file_range`s, and we block only when I/O falls behind.
+ // - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
+ //   always blocks, thus preventing the interleaving of I/O and processing.
+ //
+ // Note: Enabling this option does not provide any additional persistence
+ // guarantees, as it may use `sync_file_range`, which does not write out
+ // metadata.
+ //
+ // Default: false
+ bool strict_bytes_per_sync = false;
+
+ // A vector of EventListeners whose callback functions will be called
+ // when specific RocksDB events happen.
+ std::vector<std::shared_ptr<EventListener>> listeners;
+
+ // If true, then the status of the threads involved in this DB will
+ // be tracked and available via GetThreadList() API.
+ //
+ // Default: false
+ bool enable_thread_tracking = false;
+
+ // The limited write rate to DB if soft_pending_compaction_bytes_limit or
+ // level0_slowdown_writes_trigger is triggered, or we are writing to the
+ // last mem table allowed and we allow more than 3 mem tables. It is
+ // calculated using the size of user write requests before compression.
+ // RocksDB may decide to slow down more if the compaction still
+ // gets behind further.
+ // If the value is 0, we will infer a value from the `rate_limiter` value
+ // if it is not empty, or 16MB if `rate_limiter` is empty. Note that
+ // if users change the rate in `rate_limiter` after DB is opened,
+ // `delayed_write_rate` won't be adjusted.
+ //
+ // Unit: byte per second.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ uint64_t delayed_write_rate = 0;
+
+ // By default, a single write thread queue is maintained. The thread that
+ // reaches the head of the queue becomes the write batch group leader and is
+ // responsible for writing to the WAL and memtable for the batch group.
+ //
+ // If enable_pipelined_write is true, a separate write thread queue is
+ // maintained for WAL write and memtable write. A write thread first enters
+ // the WAL writer queue and then the memtable writer queue. Pending threads
+ // on the WAL writer queue thus only have to wait for previous writers to
+ // finish their WAL writing but not the memtable writing. Enabling the
+ // feature may improve write throughput and reduce latency of the prepare
+ // phase of two-phase commit.
+ //
+ // Default: false
+ bool enable_pipelined_write = false;
+
+ // Setting unordered_write to true trades the immutability guarantee of
+ // snapshots for higher write throughput. This violates the
+ // repeatability one expects from ::Get from a snapshot, as well as
+ // ::MultiGet and Iterator's consistent-point-in-time view property.
+ // If the application cannot tolerate the relaxed guarantees, it can implement
+ // its own mechanisms to work around that and yet benefit from the higher
+ // throughput. Using TransactionDB with WRITE_PREPARED write policy and
+ // two_write_queues=true is one way to achieve immutable snapshots despite
+ // unordered_write.
+ //
+ // By default, i.e., when it is false, rocksdb does not advance the sequence
+ // number for new snapshots unless all the writes with lower sequence numbers
+ // are already finished. This provides the immutability that we expect from
+ // snapshots. Moreover, since Iterator and MultiGet internally depend on
+ // snapshots, the snapshot immutability results in Iterator and MultiGet
+ // offering a consistent-point-in-time view. If set to true, although
+ // Read-Your-Own-Write property is still provided, the snapshot immutability
+ // property is relaxed: the writes issued after the snapshot is obtained (with
+ // larger sequence numbers) will still not be visible to the reads from that
+ // snapshot; however, there still might be pending writes (with lower sequence
+ // numbers) that will change the state visible to the snapshot after they
+ // land in the memtable.
+ //
+ // Default: false
+ bool unordered_write = false;
+
+ // If true, allow multi-writers to update mem tables in parallel.
+ // Only some memtable_factory-s support concurrent writes; currently it
+ // is implemented only for SkipListFactory. Concurrent memtable writes
+ // are not compatible with inplace_update_support or filter_deletes.
+ // It is strongly recommended to set enable_write_thread_adaptive_yield
+ // if you are going to use this feature.
+ //
+ // Default: true
+ bool allow_concurrent_memtable_write = true;
+
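+ // Illustrative sketch (flags hypothetical, not upstream guidance): a
+ // write-heavy workload might combine the write-path options above as:
+ //
+ //   rocksdb::DBOptions db_opts;
+ //   db_opts.enable_pipelined_write = true;
+ //   db_opts.allow_concurrent_memtable_write = true;  // SkipList memtables
+ //   db_opts.enable_write_thread_adaptive_yield = true;
+ //
+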
+ // If true, threads synchronizing with the write batch group leader will
+ // wait for up to write_thread_max_yield_usec before blocking on a mutex.
+ // This can substantially improve throughput for concurrent workloads,
+ // regardless of whether allow_concurrent_memtable_write is enabled.
+ //
+ // Default: true
+ bool enable_write_thread_adaptive_yield = true;
+
+ // The maximum limit of number of bytes that are written in a single batch
+ // of WAL or memtable write. It is enforced only when the leader's write
+ // size is larger than 1/8 of this limit.
+ //
+ // Default: 1 MB
+ uint64_t max_write_batch_group_size_bytes = 1 << 20;
+
+ // The maximum number of microseconds that a write operation will use
+ // a yielding spin loop to coordinate with other write threads before
+ // blocking on a mutex. (Assuming write_thread_slow_yield_usec is
+ // set properly) increasing this value is likely to increase RocksDB
+ // throughput at the expense of increased CPU usage.
+ //
+ // Default: 100
+ uint64_t write_thread_max_yield_usec = 100;
+
+ // The latency in microseconds after which a std::this_thread::yield
+ // call (sched_yield on Linux) is considered to be a signal that
+ // other processes or threads would like to use the current core.
+ // Increasing this makes writer threads more likely to take CPU
+ // by spinning, which will show up as an increase in the number of
+ // involuntary context switches.
+ //
+ // Default: 3
+ uint64_t write_thread_slow_yield_usec = 3;
+
+ // If true, then DB::Open() will not update the statistics used to optimize
+ // compaction decisions by loading table properties from many files.
+ // Skipping the update (i.e. setting this to true) improves DB::Open time,
+ // especially on disk-backed storage.
+ //
+ // Default: false
+ bool skip_stats_update_on_db_open = false;
+
+ // If true, then DB::Open() will not fetch and check sizes of all sst files.
+ // This may significantly speed up startup if there are many sst files,
+ // especially when using non-default Env with expensive GetFileSize().
+ // We'll still check that all required sst files exist.
+ // If paranoid_checks is false, this option is ignored, and sst files are
+ // not checked at all.
+ //
+ // Default: false
+ bool skip_checking_sst_file_sizes_on_db_open = false;
+
+ // Recovery mode to control the consistency while replaying WAL
+ // Default: kPointInTimeRecovery
+ WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+ // if set to false then recovery will fail when a prepared
+ // transaction is encountered in the WAL
+ bool allow_2pc = false;
+
+ // A global cache for table-level rows.
+ // Default: nullptr (disabled)
+ std::shared_ptr<Cache> row_cache = nullptr;
+
+ // A filter object supplied to be invoked while processing write-ahead-logs
+ // (WALs) during recovery. The filter provides a way to inspect log
+ // records, ignoring a particular record or skipping replay.
+ // The filter is invoked at startup and is currently invoked from a
+ // single thread.
+ WalFilter* wal_filter = nullptr;
+
+ // If true, then DB::Open / CreateColumnFamily / DropColumnFamily /
+ // SetOptions will fail if the options file is not properly persisted.
+ //
+ // DEFAULT: false
+ bool fail_if_options_file_error = false;
+
+ // If true, then print malloc stats together with rocksdb.stats
+ // when printing to LOG.
+ // DEFAULT: false
+ bool dump_malloc_stats = false;
+
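+ // Illustrative sketch (cache capacity hypothetical): pairing a stricter
+ // WAL recovery mode with a row cache built via NewLRUCache from
+ // rocksdb/cache.h:
+ //
+ //   rocksdb::DBOptions db_opts;
+ //   db_opts.wal_recovery_mode =
+ //       rocksdb::WALRecoveryMode::kAbsoluteConsistency;
+ //   db_opts.row_cache = rocksdb::NewLRUCache(64 * 1024 * 1024);
+ //
+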
+ // By default RocksDB replays WAL logs and flushes them on DB open, which may
+ // create very small SST files. If this option is enabled, RocksDB will try
+ // to avoid (but not guarantee not to) flushing during recovery. Also,
+ // existing WAL logs will be kept, so that if a crash happened before the
+ // flush, we still have logs to recover from.
+ //
+ // DEFAULT: false
+ bool avoid_flush_during_recovery = false;
+
+ // By default RocksDB will flush all memtables on DB close if there is
+ // unpersisted data (i.e. with WAL disabled). The flush can be skipped to
+ // speed up DB close. Unpersisted data WILL BE LOST.
+ //
+ // DEFAULT: false
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ bool avoid_flush_during_shutdown = false;
+
+ // Set this option to true during creation of database if you want
+ // to be able to ingest behind (call IngestExternalFile() skipping keys
+ // that already exist, rather than overwriting matching keys).
+ // Setting this option to true will affect 2 things:
+ // 1) Disable some internal optimizations around SST file compression
+ // 2) Reserve bottom-most level for ingested files only.
+ // Note that only universal compaction supports reserving the last level
+ // for file ingestion only.
+ // `num_levels` should be >= 3 if this option is turned on.
+ //
+ // DEFAULT: false
+ // Immutable.
+ bool allow_ingest_behind = false;
+
+ // If enabled it uses two queues for writes, one for the ones with
+ // disable_memtable and one for the ones that also write to memtable. This
+ // allows the memtable writes not to lag behind other writes. It can be used
+ // to optimize MySQL 2PC in which only the commits, which are serial, write to
+ // memtable.
+ bool two_write_queues = false;
+
+ // If true, the WAL is not flushed automatically after each write. Instead
+ // it relies on manual invocation of FlushWAL to write the WAL buffer to its
+ // file.
+ bool manual_wal_flush = false;
+
+ // This feature is WORK IN PROGRESS
+ // If enabled WAL records will be compressed before they are written.
+ // Only zstd is supported. Compressed WAL records will be read in supported
+ // versions regardless of the wal_compression settings.
+ CompressionType wal_compression = kNoCompression;
+
+ // If true, RocksDB supports flushing multiple column families and committing
+ // their results atomically to MANIFEST. Note that it is not
+ // necessary to set atomic_flush to true if WAL is always enabled since WAL
+ // allows the database to be restored to the last persistent state in WAL.
+ // This option is useful when there are column families with writes NOT
+ // protected by WAL.
+ // For manual flush, application has to specify which column families to
+ // flush atomically in DB::Flush.
+ // For auto-triggered flush, RocksDB atomically flushes ALL column families.
+ //
+ // Currently, any WAL-enabled writes after atomic flush may be replayed
+ // independently if the process crashes later and tries to recover.
+ bool atomic_flush = false;
+
+ // If true, working threads may avoid doing unnecessary and long-latency
+ // operations (such as deleting obsolete files directly or deleting memtables)
+ // and will instead schedule a background job to do it.
+ // Use it if you're latency-sensitive.
+ // If set to true, takes precedence over
+ // ReadOptions::background_purge_on_iterator_cleanup.
+ bool avoid_unnecessary_blocking_io = false;
+
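+ // Illustrative sketch (error handling elided): with manual_wal_flush
+ // enabled, an application batches writes and flushes the WAL explicitly
+ // via DB::FlushWAL:
+ //
+ //   db_opts.manual_wal_flush = true;
+ //   // ... open the DB, perform a burst of writes ...
+ //   rocksdb::Status s = db->FlushWAL(/*sync=*/true);  // write and fsync WAL
+ //
+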
+ // Historically the DB ID has always been stored in the Identity file in the
+ // DB folder. If this flag is true, the DB ID is written to the Manifest file
+ // in addition to the Identity file. By doing this 2 problems are solved:
+ // 1. We don't checksum the Identity file, whereas the Manifest file is
+ //    checksummed.
+ // 2. Since the source of truth for the DB is the Manifest file, the DB ID
+ //    will sit with the source of truth. Previously the Identity file could
+ //    be copied independent of the Manifest and that can result in a wrong
+ //    DB ID.
+ // We recommend setting this flag to true.
+ // Default: false
+ bool write_dbid_to_manifest = false;
+
+ // The number of bytes to prefetch when reading the log. This is mostly useful
+ // for reading a remotely located log, as it can reduce the number of
+ // round-trips. If 0, then the prefetching is disabled.
+ //
+ // Default: 0
+ size_t log_readahead_size = 0;
+
+ // If user does NOT provide the checksum generator factory, the file checksum
+ // will NOT be used. A new file checksum generator object will be created
+ // when a SST file is created. Therefore, each created FileChecksumGenerator
+ // will only be used from a single thread and so does not need to be
+ // thread-safe.
+ //
+ // Default: nullptr
+ std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
+
+ // By default, RocksDB will attempt to detect any data losses or corruptions
+ // in DB files and return an error to the user, either at DB::Open time or
+ // later during DB operation. The exception to this policy is the WAL file,
+ // whose recovery is controlled by the wal_recovery_mode option.
+ //
+ // Best-efforts recovery (this option set to true) signals a preference for
+ // opening the DB to any point-in-time valid state for each column family,
+ // including the empty/new state, versus the default of returning non-WAL
+ // data losses to the user as errors. In terms of RocksDB user data, this
+ // is like applying WALRecoveryMode::kPointInTimeRecovery to each column
+ // family rather than just the WAL.
+ //
+ // Best-efforts recovery (BER) is specifically designed to recover a DB with
+ // files that are missing or truncated to some smaller size, such as the
+ // result of an incomplete DB "physical" (FileSystem) copy. BER can also
+ // detect when an SST file has been replaced with a different one of the
+ // same size (assuming SST unique IDs are tracked in DB manifest).
+ // BER is not yet designed to produce a usable DB from other corruptions to
+ // DB files (which should generally be detectable by DB::VerifyChecksum()),
+ // and BER does not yet attempt to recover any WAL files.
+ //
+ // For example, if an SST or blob file referenced by the MANIFEST is missing,
+ // BER might be able to find a set of files corresponding to an old "point in
+ // time" version of the column family, possibly from an older MANIFEST
+ // file. Some other kinds of DB files (e.g. CURRENT, LOCK, IDENTITY) are
+ // either ignored or replaced with BER, or quietly fixed regardless of BER
+ // setting. BER does require at least one valid MANIFEST to recover to a
+ // non-trivial DB state, unlike `ldb repair`.
+ //
+ // Currently, best_efforts_recovery=true is not compatible with atomic flush.
+ //
+ // Default: false
+ bool best_efforts_recovery = false;
+
+ // It defines how many times DB::Resume() is called by a separate thread when
+ // a background retryable IO Error happens. When a background retryable IO
+ // Error happens, SetBGError is called to deal with the error. If the error
+ // can be auto-recovered (e.g., retryable IO Error during Flush or WAL write),
+ // then db resume is called in background to recover from the error. If this
+ // value is 0 or negative, DB::Resume() will not be called automatically.
+ //
+ // Default: INT_MAX
+ int max_bgerror_resume_count = INT_MAX;
+
+ // If max_bgerror_resume_count is >= 2, db resume is called multiple times.
+ // This option decides how long to wait to retry the next resume if the
+ // previous resume fails and the conditions for retrying the resume are
+ // satisfied.
+ //
+ // Default: 1000000 (microseconds).
+ uint64_t bgerror_resume_retry_interval = 1000000;
+
+ // Allows users to opt in to error messages containing corrupted
+ // keys/values. Corrupt keys and values will be logged in the
+ // messages/logs/status, giving users useful information regarding the
+ // affected data. By default this is set to false, to prevent user data
+ // from being exposed in logs, messages, etc.
+ //
+ // Default: false
+ bool allow_data_in_errors = false;
+
+ // A string identifying the machine hosting the DB. This
+ // will be written as a property in every SST file written by the DB (or
+ // by offline writers such as SstFileWriter and RepairDB). It can be useful
+ // for troubleshooting in memory corruption caused by a failing host when
+ // writing a file, by tracing back to the writing host. These corruptions
+ // may not be caught by the checksum since they happen before checksumming.
+ // If left as default, the table writer will substitute it with the actual
+ // hostname when writing the SST file. If set to an empty string, the
+ // property will not be written to the SST file.
+ //
+ // Default: hostname
+ std::string db_host_id = kHostnameForDbHostId;
+
+ // Use this if your DB wants to enable checksum handoff for writes of
+ // specific file types. Make sure that the file system you use supports
+ // crc32c checksum verification.
+ // Currently supported file types: kWALFile, kTableFile, kDescriptorFile.
+ // NOTE: currently RocksDB only generates a crc32c based checksum for the
+ // handoff. If the storage layer has different checksum support, the user
+ // should leave this set empty. Otherwise, it may cause unexpected
+ // write failures.
+ FileTypeSet checksum_handoff_file_types;
+
+ // EXPERIMENTAL
+ // CompactionService is a feature that allows the user to run compactions on
+ // a different host or process, which offloads the background load from the
+ // primary host.
+ // It's an experimental feature; the interface will be changed without
+ // backward/forward compatibility support for now. Some known issues are still
+ // under development.
+ std::shared_ptr<CompactionService> compaction_service = nullptr;
+
+ // Indicates the lowest cache tier we want to
+ // use for a certain DB. Currently we support volatile_tier and
+ // non_volatile_tier. They are layered. By setting it to kVolatileTier, only
+ // the block cache (the currently implemented volatile_tier) is used. So
+ // cache entries will not spill to the secondary cache (the currently
+ // implemented non_volatile_tier), and block cache lookup misses will not
+ // look up in the secondary cache. When kNonVolatileBlockTier is used, we use
+ // both the block cache and the secondary cache.
+ //
+ // Default: kNonVolatileBlockTier
+ CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier;
+
+ // If set to false, when compaction or flush sees a SingleDelete followed by
+ // a Delete for the same user key, the compaction job will not fail.
+ // Otherwise, the compaction job will fail.
+ // This is a temporary option to help existing use cases migrate, and
+ // will be removed in a future release.
+ // Warning: do not set to false unless you are trying to migrate existing
+ // data in which the contract of single delete
+ // (https://github.com/facebook/rocksdb/wiki/Single-Delete) is not enforced,
+ // and thus has Delete mixed with SingleDelete for the same user key.
+ // Violation of the contract leads to undefined behavior with a high
+ // possibility of data inconsistency, e.g. deleted old data becoming
+ // visible again, etc.
+ bool enforce_single_del_contracts = true;
+};
+
+// Options to control the behavior of a database (passed to DB::Open)
+struct Options : public DBOptions, public ColumnFamilyOptions {
+ // Create an Options object with default values for all fields.
+ Options() : DBOptions(), ColumnFamilyOptions() {}
+
+ Options(const DBOptions& db_options,
+         const ColumnFamilyOptions& column_family_options)
+     : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
+
+ // Change to some default settings from an older version.
+ // NOT MAINTAINED: This function has not been and is not maintained.
+ // DEPRECATED: This function might be removed in a future release.
+ // In general, defaults are changed to suit broad interests. Opting
+ // out of a change on upgrade should be deliberate and considered.
+ Options* OldDefaults(int rocksdb_major_version = 4,
+                      int rocksdb_minor_version = 6);
+
+ void Dump(Logger* log) const;
+
+ void DumpCFOptions(Logger* log) const;
+
+ // Some functions that make it easier to optimize RocksDB
+
+ // Set appropriate parameters for bulk loading.
+ // The reason that this is a function that returns "this" instead of a
+ // constructor is to enable chaining of multiple similar calls in the future.
+ //
+ // All data will be in level 0 without any automatic compaction.
+ // It's recommended to manually call CompactRange(NULL, NULL) before reading
+ // from the database, because otherwise the read can be very slow.
+ Options* PrepareForBulkLoad();
+
+ // Use this if your DB is very small (like under 1GB) and you don't want to
+ // spend lots of memory for memtables.
+ Options* OptimizeForSmallDb();
+
+ // Disable some checks that should not be necessary in the absence of
+ // software logic errors or CPU+memory hardware errors. This can improve
+ // write speeds but is only recommended for temporary use. Does not
+ // change protection against corrupt storage (e.g. verify_checksums).
+ Options* DisableExtraChecks();
+};
+
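+// Illustrative usage (a sketch; the path is hypothetical): the helpers above
+// return "this", so they can be chained before opening the DB:
+//
+//   rocksdb::Options opts;
+//   opts.PrepareForBulkLoad()->OptimizeForSmallDb();
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(opts, "/tmp/example-db", &db);
+//
+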
+// An application can issue a read request (via Get/Iterators) and specify
+// if that read should process data that ALREADY resides on a specified cache
+// level. For example, if an application specifies kBlockCacheTier then the
+// Get call will process data that is already processed in the memtable or
+// the block cache. It will not page in data from the OS cache or data that
+// resides in storage.
+enum ReadTier {
+ kReadAllTier = 0x0,     // data in memtable, block cache, OS cache or storage
+ kBlockCacheTier = 0x1,  // data in memtable or block cache
+ kPersistedTier = 0x2,   // persisted data. When WAL is disabled, this option
+                         // will skip data in memtable.
+                         // Note that this ReadTier currently only supports
+                         // Get and MultiGet and does not support iterators.
+ kMemtableTier = 0x3     // data in memtable. used for memtable-only iterators.
+};
+
+// Options that control read operations
+struct ReadOptions {
+ // *** BEGIN options relevant to point lookups as well as scans ***
+
+ // If "snapshot" is non-nullptr, read as of the supplied snapshot
+ // (which must belong to the DB that is being read and which must
+ // not have been released). If "snapshot" is nullptr, use an implicit
+ // snapshot of the state at the beginning of this read operation.
+ const Snapshot* snapshot = nullptr;
+
+ // Timestamp of operation. Read should return the latest data visible to the
+ // specified timestamp. All timestamps of the same database must be of the
+ // same length and format. The user is responsible for providing a customized
+ // compare function via Comparator to order tuples.
+ // For iterator, iter_start_ts is the lower bound (older) and timestamp
+ // serves as the upper bound. Versions of the same record that fall in
+ // the timestamp range will be returned. If iter_start_ts is nullptr,
+ // only the most recent version visible to timestamp is returned.
+ // The user-specified timestamp feature is still under active development,
+ // and the API is subject to change.
+ const Slice* timestamp = nullptr;
+ const Slice* iter_start_ts = nullptr;
+
+ // Deadline for completing an API call (Get/MultiGet/Seek/Next for now)
+ // in microseconds.
+ // It should be set to microseconds since epoch, i.e., gettimeofday or
+ // equivalent plus allowed duration in microseconds. The best way is to use
+ // env->NowMicros() + some timeout.
+ // This is best-effort. The call may exceed the deadline if there is IO
+ // involved and the file system doesn't support deadlines, or due to
+ // checking for the deadline periodically rather than for every key when
+ // processing a batch.
+ std::chrono::microseconds deadline = std::chrono::microseconds::zero();
+
+ // A timeout in microseconds to be passed to the underlying FileSystem for
+ // reads. As opposed to deadline, this determines the timeout for each
+ // individual file read request. If a MultiGet/Get/Seek/Next etc call
+ // results in multiple reads, each read can last up to io_timeout us.
+ std::chrono::microseconds io_timeout = std::chrono::microseconds::zero();
+
+ // Specify if this read request should process data that ALREADY
+ // resides on a particular cache. If the required data is not
+ // found at the specified cache, then Status::Incomplete is returned.
+ ReadTier read_tier = kReadAllTier;
+
+ // For file reads associated with this option, charge the internal rate
+ // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
+ // special value `Env::IO_TOTAL` disables charging the rate limiter.
+ //
+ // The rate limiting is bypassed, no matter this option's value, for file
+ // reads on plain tables (these can exist when
+ // `ColumnFamilyOptions::table_factory` is a `PlainTableFactory`) and cuckoo
+ // tables (these can exist when `ColumnFamilyOptions::table_factory` is a
+ // `CuckooTableFactory`).
+ //
+ // The bytes charged to rate limiter may not exactly match the file read bytes
+ // since there are some seemingly insignificant reads, like for file
+ // headers/footers, that we currently do not charge to rate limiter.
+ Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
+
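+ // Illustrative usage (a sketch; error handling elided): pinning a
+ // consistent view across several reads with a snapshot:
+ //
+ //   const rocksdb::Snapshot* snap = db->GetSnapshot();
+ //   rocksdb::ReadOptions ro;
+ //   ro.snapshot = snap;
+ //   std::string v1, v2;
+ //   db->Get(ro, "key1", &v1);
+ //   db->Get(ro, "key2", &v2);  // same point-in-time view as v1
+ //   db->ReleaseSnapshot(snap);
+ //
+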
+ // It limits the maximum cumulative value size of the keys in a batch while
+ // reading through MultiGet. Once the cumulative value size exceeds this
+ // soft limit, all the remaining keys are returned with status Aborted.
+ uint64_t value_size_soft_limit = std::numeric_limits<uint64_t>::max();
+
+ // If true, all data read from underlying storage will be
+ // verified against corresponding checksums.
+ bool verify_checksums = true;
+
+ // Should the "data block"/"index block" read for this iteration be placed in
+ // block cache?
+ // Callers may wish to set this field to false for bulk scans.
+ // This would help not to change the eviction order of existing items in the
+ // block cache.
+ bool fill_cache = true;
+
+ // If true, range tombstone handling will be skipped in key lookup paths.
+ // For DB instances that don't use DeleteRange() calls, this setting can
+ // be used to optimize the read performance.
+ // Note that, if this assumption (of no previous DeleteRange() calls) is
+ // broken, stale keys could be served in read paths.
+ bool ignore_range_deletions = false;
+
+ // Experimental
+ //
+ // If async_io is enabled, RocksDB will prefetch some data asynchronously.
+ // RocksDB applies it when reads are sequential and its internal automatic
+ // prefetching is in effect.
+ bool async_io = false;
+
+ // Experimental
+ //
+ // If async_io is set, then this flag controls whether we read SST files
+ // in multiple levels asynchronously. Enabling this flag can help reduce
+ // MultiGet latency by maximizing the number of SST files read in
+ // parallel if the keys in the MultiGet batch are in different levels. It
+ // comes at the expense of slightly higher CPU overhead.
+ bool optimize_multiget_for_io = true;
+
+ // *** END options relevant to point lookups (as well as scans) ***
+ // *** BEGIN options only relevant to iterators or scans ***
+
+ // RocksDB does auto-readahead for iterators on noticing more than two reads
+ // for a table file. The readahead starts at 8KB and doubles on every
+ // additional read up to 256KB.
+ // This option can help if most of the range scans are large, and if it is
+ // determined that a larger readahead than that enabled by auto-readahead is
+ // needed.
+ // Using a large readahead size (> 2MB) can typically improve the performance
+ // of forward iteration on spinning disks.
+ size_t readahead_size = 0;
+
+ // A threshold for the number of keys that can be skipped before failing an
+ // iterator seek as incomplete. The default value of 0 should be used to
+ // never fail a request as incomplete, even on skipping too many keys.
+ uint64_t max_skippable_internal_keys = 0;
+
+ // `iterate_lower_bound` defines the smallest key at which the backward
+ // iterator can return an entry. Once the bound is passed, Valid() will be
+ // false. `iterate_lower_bound` is inclusive, i.e., the bound value is a
+ // valid entry.
+ //
+ // If prefix_extractor is not null, the Seek target and `iterate_lower_bound`
+ // need to have the same prefix. This is because ordering is not guaranteed
+ // outside of the prefix domain.
+ //
+ // In case of user_defined timestamp, if enabled, iterate_lower_bound should
+ // point to a key without the timestamp part.
+ const Slice* iterate_lower_bound = nullptr;
+
+ // "iterate_upper_bound" defines the extent up to which the forward iterator
+ // can return entries. Once the bound is reached, Valid() will be false.
+ // "iterate_upper_bound" is exclusive, i.e., the bound value is
+ // not a valid entry. If prefix_extractor is not null:
+ // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used
+ //    to infer whether prefix iterating (e.g. applying prefix bloom filter)
+ //    can be used within RocksDB. This is done by comparing
+ //    iterate_upper_bound with the seek key.
+ // 2. If options.auto_prefix_mode = false, iterate_upper_bound only takes
+ //    effect if it shares the same prefix as the seek key. If
+ //    iterate_upper_bound is outside the prefix of the seek key, then keys
+ //    returned outside the prefix range will be undefined, just as if
+ //    iterate_upper_bound = null.
+ // If iterate_upper_bound is not null, SeekToLast() will position the iterator
+ // at the first key smaller than iterate_upper_bound.
+ //
+ // In case of user_defined timestamp, if enabled, iterate_upper_bound should
+ // point to a key without the timestamp part.
+ const Slice* iterate_upper_bound = nullptr;
+
+ // Specify to create a tailing iterator -- a special iterator that has a
+ // view of the complete database (i.e. it can also be used to read newly
+ // added data) and is optimized for sequential reads. It will return records
+ // that were inserted into the database after the creation of the iterator.
+ bool tailing = false;
+
+ // This option is not used anymore. It was to turn on a functionality that
+ // has been removed. DEPRECATED
+ bool managed = false;
+
+ // Enable a total order seek regardless of index format (e.g. hash index)
+ // used in the table. Some table formats (e.g. plain table) may not support
+ // this option.
+ // If true when calling Get(), we also skip prefix bloom when reading from
+ // block based table, which only affects Get() performance.
+ bool total_order_seek = false;
+
+ // When true, by default use total_order_seek = true, and RocksDB can
+ // selectively enable prefix seek mode if it won't generate a different
+ // result from total_order_seek, based on the seek key and the iterator
+ // upper bound.
+ // BUG: Using Comparator::IsSameLengthImmediateSuccessor and
+ // SliceTransform::FullLengthEnabled to enable prefix mode in cases where
+ // the prefix of the upper bound differs from the prefix of the seek key
+ // has a flaw.
+ // If present in the DB, "short keys" (shorter than "full length" prefix)
+ // can be omitted from auto_prefix_mode iteration when they would be present
+ // in total_order_seek iteration, regardless of whether the short keys are
+ // "in domain" of the prefix extractor. This is not an issue if no short
+ // keys are added to the DB or are not expected to be returned by such
+ // iterators. (We are also assuming the new condition on
+ // IsSameLengthImmediateSuccessor is satisfied; see its BUG section).
+ // A bug example is in DBTest2::AutoPrefixMode1, search for "BUG".
+ bool auto_prefix_mode = false;
+
+ // Enforce that the iterator only iterates over the same prefix as the seek.
+ // This option is effective only for prefix seeks, i.e. prefix_extractor is
+ // non-null for the column family and total_order_seek is false. Unlike
+ // iterate_upper_bound, prefix_same_as_start only works within a prefix
+ // but in both directions.
+ bool prefix_same_as_start = false;
+
+ // Keep the blocks loaded by the iterator pinned in memory as long as the
+ // iterator is not deleted. If used when reading from tables created with
+ // BlockBasedTableOptions::use_delta_encoding = false,
+ // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to
+ // return 1.
+ bool pin_data = false;
+
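+ // Illustrative usage (a sketch; key names hypothetical): bounding a forward
+ // scan so the iterator stops before "user2":
+ //
+ //   rocksdb::ReadOptions ro;
+ //   rocksdb::Slice upper("user2");
+ //   ro.iterate_upper_bound = &upper;  // must outlive the iterator
+ //   std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
+ //   for (it->Seek("user1"); it->Valid(); it->Next()) { /* ... */ }
+ //
+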
+ // For iterators, RocksDB does auto-readahead on noticing more than two
+ // sequential reads for a table file if user doesn't provide readahead_size.
+ // The readahead starts at 8KB and doubles on every additional read up to
+ // max_auto_readahead_size only when reads are sequential. However at each
+ // level, if the iterator moves over to the next file, readahead_size starts
+ // again from 8KB.
+ //
+ // By enabling this option, RocksDB will do some enhancements for
+ // prefetching the data.
+ bool adaptive_readahead = false;
+
+ // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
+ // schedule a background job in the flush job queue and delete obsolete files
+ // in background.
+ bool background_purge_on_iterator_cleanup = false;
+
+ // A callback to determine whether relevant keys for this scan exist in a
+ // given table based on the table's properties. The callback is passed the
+ // properties of each table during iteration. If the callback returns false,
+ // the table will not be scanned. This option only affects Iterators and has
+ // no impact on point lookups.
+ // Default: empty (every table will be scanned)
+ std::function<bool(const TableProperties&)> table_filter;
+
+ // *** END options only relevant to iterators or scans ***
+
+ // ** For RocksDB internal use only **
+ Env::IOActivity io_activity = Env::IOActivity::kUnknown;
+
+ ReadOptions() {}
+ ReadOptions(bool _verify_checksums, bool _fill_cache);
+ explicit ReadOptions(Env::IOActivity _io_activity);
+};
+
+// Options that control write operations
+struct WriteOptions {
+ // If true, the write will be flushed from the operating system
+ // buffer cache (by calling WritableFile::Sync()) before the write
+ // is considered complete. If this flag is true, writes will be
+ // slower.
+ //
+ // If this flag is false, and the machine crashes, some recent
+ // writes may be lost. Note that if it is just the process that
+ // crashes (i.e., the machine does not reboot), no writes will be
+ // lost even if sync==false.
+ //
+ // In other words, a DB write with sync==false has similar
+ // crash semantics as the "write()" system call. A DB write
+ // with sync==true has similar crash semantics to a "write()"
+ // system call followed by "fdatasync()".
+ //
+ // Default: false
+ bool sync;
+
+ // If true, writes will not first go to the write ahead log,
+ // and the write may get lost after a crash. The backup engine
+ // relies on write-ahead logs to back up the memtable, so if
+ // you disable write-ahead logs, you must create backups with
+ // flush_before_backup=true to avoid losing unflushed memtable data.
+ // Default: false
+ bool disableWAL;
+
+ // If true and if user is trying to write to column families that don't exist
+ // (they were dropped), ignore the write (don't return an error). If there
+ // are multiple writes in a WriteBatch, other writes will succeed.
+ // Default: false
+ bool ignore_missing_column_families;
+
+ // If true and we need to wait or sleep for the write request, fails
+ // immediately with Status::Incomplete().
+ // Default: false
+ bool no_slowdown;
+
+ // If true, this write request is of lower priority if compaction is
+ // behind. In this case, if no_slowdown = true, the request will be canceled
+ // immediately with Status::Incomplete() returned. Otherwise, it will be
+ // slowed down. The slowdown value is determined by RocksDB to guarantee
+ // it introduces minimum impact to high priority writes.
+ //
+ // Default: false
+ bool low_pri;
+
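+ // Illustrative usage (a sketch; keys and values hypothetical): a durable
+ // write that syncs the WAL, and a fire-and-forget write that skips the WAL
+ // entirely (unflushed data is lost on a crash):
+ //
+ //   rocksdb::WriteOptions durable;
+ //   durable.sync = true;
+ //   db->Put(durable, "balance", "100");
+ //
+ //   rocksdb::WriteOptions fast;
+ //   fast.disableWAL = true;
+ //   db->Put(fast, "metrics", "42");
+ //
+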
+ // If true, this writebatch will maintain the last insert positions of each
+ // memtable as hints in concurrent write. It can improve write performance
+ // in concurrent writes if keys in one writebatch are sequential. In
+ // non-concurrent writes (when concurrent_memtable_writes is false) this
+ // option will be ignored.
+ //
+ // Default: false
+ bool memtable_insert_hint_per_batch;
+
+ // For writes associated with this option, charge the internal rate
+ // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
+ // special value `Env::IO_TOTAL` disables charging the rate limiter.
+ //
+ // Currently the support covers automatic WAL flushes, which happen during
+ // live updates (`Put()`, `Write()`, `Delete()`, etc.)
+ // when `WriteOptions::disableWAL == false`
+ // and `DBOptions::manual_wal_flush == false`.
+ //
+ // Only `Env::IO_USER` and `Env::IO_TOTAL` are allowed
+ // due to implementation constraints.
+ //
+ // Default: `Env::IO_TOTAL`
+ Env::IOPriority rate_limiter_priority;
+
+ // `protection_bytes_per_key` is the number of bytes used to store
+ // protection information for each key entry. Currently supported values are
+ // zero (disabled) and eight.
+ //
+ // Default: zero (disabled).
+ size_t protection_bytes_per_key;
+
+ WriteOptions()
+     : sync(false),
+       disableWAL(false),
+       ignore_missing_column_families(false),
+       no_slowdown(false),
+       low_pri(false),
+       memtable_insert_hint_per_batch(false),
+       rate_limiter_priority(Env::IO_TOTAL),
+       protection_bytes_per_key(0) {}
+};
+
+// Options that control flush operations
+struct FlushOptions {
+ // If true, the flush will wait until the flush is done.
+ // Default: true
+ bool wait;
+ // If true, the flush will proceed immediately even if it means writes will
+ // stall for the duration of the flush; if false the operation will wait
+ // until it's possible to do the flush without causing a stall, or until the
+ // required flush is performed by someone else (foreground call or background
+ // thread).
+ // Default: false
+ bool allow_write_stall;
+ FlushOptions() : wait(true), allow_write_stall(false) {}
+};
+
+// Create a Logger from provided DBOptions
+extern Status CreateLoggerFromOptions(const std::string& dbname,
+                                      const DBOptions& options,
+                                      std::shared_ptr<Logger>* logger);
+
+// CompactionOptions are used in CompactFiles() call.
+struct CompactionOptions {
+ // Compaction output compression type
+ // Default: snappy
+ // If set to `kDisableCompressionOption`, RocksDB will choose compression type
+ // according to the `ColumnFamilyOptions`, taking into account the output
+ // level if `compression_per_level` is specified.
+ CompressionType compression;
+ // Compaction will create files of size `output_file_size_limit`.
+ // Default: MAX, which means that compaction will create a single file
+ uint64_t output_file_size_limit;
+ // If > 0, it will replace the option in the DBOptions for this compaction.
+ uint32_t max_subcompactions;
+
+ CompactionOptions()
+     : compression(kSnappyCompression),
+       output_file_size_limit(std::numeric_limits<uint64_t>::max()),
+       max_subcompactions(0) {}
+};
+
+// For level based compaction, we can configure if we want to skip/force
+// bottommost level compaction.
+enum class BottommostLevelCompaction {
+ // Skip bottommost level compaction
+ kSkip,
+ // Only compact bottommost level if there is a compaction filter
+ // This is the default option
+ kIfHaveCompactionFilter,
+ // Always compact bottommost level
+ kForce,
+ // Always compact bottommost level but in bottommost level avoid
+ // double-compacting files created in the same compaction
+ kForceOptimized,
+};
+
+// For manual compaction, we can configure if we want to skip/force garbage
+// collection of blob files.
+enum class BlobGarbageCollectionPolicy {
+ // Force blob file garbage collection.
+ kForce,
+ // Skip blob file garbage collection.
+ kDisable,
+ // Inherit blob file garbage collection policy from ColumnFamilyOptions.
+ kUseDefault,
+};
+
+// CompactRangeOptions is used by CompactRange() call.
+struct CompactRangeOptions {
+ // If true, no other compaction will run at the same time as this
+ // manual compaction.
+ //
+ // Default: false
+ bool exclusive_manual_compaction = false;
+
+ // If true, compacted files will be moved to the minimum level capable
+ // of holding the data, or to a given level (specified by a non-negative
+ // target_level).
+ bool change_level = false;
+ // If change_level is true and target_level has a non-negative value,
+ // compacted files will be moved to target_level.
+ int target_level = -1;
+ // Compaction outputs will be placed in options.db_paths[target_path_id].
+ // Behavior is undefined if target_path_id is out of range.
+ uint32_t target_path_id = 0;
+ // By default level based compaction will only compact the bottommost level
+ // if there is a compaction filter
+ BottommostLevelCompaction bottommost_level_compaction =
+     BottommostLevelCompaction::kIfHaveCompactionFilter;
+ // If true, will execute immediately even if doing so would cause the DB to
+ // enter write stall mode. Otherwise, it'll sleep until load is low enough.
+ bool allow_write_stall = false;
+ // If > 0, it will replace the option in the DBOptions for this compaction.
+ uint32_t max_subcompactions = 0;
+ // Set the user-defined timestamp low bound; data with a timestamp older
+ // than the low bound may be GCed by compaction. Default: nullptr
+ const Slice* full_history_ts_low = nullptr;
+
+ // Allows cancellation of an in-progress manual compaction.
+ //
+ // Cancellation can be delayed waiting on automatic compactions when used
+ // together with `exclusive_manual_compaction == true`.
+ std::atomic<bool>* canceled = nullptr;
+ // NOTE: Calling DisableManualCompaction() overwrites the user-provided
+ // canceled variable in CompactRangeOptions.
+ // Typically, when CompactRange is being called in one thread (t1) with
+ // canceled = false, and DisableManualCompaction is being called in the
+ // other thread (t2), manual compaction is disabled normally, even if the
+ // compaction iterator may still scan a few items before *canceled is
+ // set to true
+
+ // If set to kForce, RocksDB will override enable_blob_file_garbage_collection
+ // to true; if set to kDisable, RocksDB will override it to false, and
+ // kUseDefault leaves the setting in effect. This enables customers to both
+ // force-enable and force-disable GC when calling CompactRange.
+ BlobGarbageCollectionPolicy blob_garbage_collection_policy =
+     BlobGarbageCollectionPolicy::kUseDefault;
+
+ // If set to < 0 or > 1, RocksDB leaves blob_garbage_collection_age_cutoff
+ // from ColumnFamilyOptions in effect. Otherwise, it will override the
+ // user-provided setting. This enables customers to selectively override the
+ // age cutoff.
+ double blob_garbage_collection_age_cutoff = -1;
+};
+
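+// Illustrative usage (a sketch; the target level is hypothetical): a manual
+// full-range compaction that also pushes data down to a given level:
+//
+//   rocksdb::CompactRangeOptions cro;
+//   cro.change_level = true;
+//   cro.target_level = 2;
+//   db->CompactRange(cro, nullptr, nullptr);  // nullptr bounds = whole range
+//
+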
+// IngestExternalFileOptions is used by IngestExternalFile()
+struct IngestExternalFileOptions {
+ // Can be set to true to move the files instead of copying them.
+ bool move_files = false;
+ // If set to true, ingestion falls back to copy when move fails.
+ bool failed_move_fall_back_to_copy = true;
+ // If set to false, keys of an ingested file could appear in existing
+ // snapshots that were created before the file was ingested.
+ bool snapshot_consistency = true;
+ // If set to false, IngestExternalFile() will fail if the file key range
+ // overlaps with existing keys or tombstones or output of ongoing compaction
+ // during file ingestion in the DB (the conditions under which a global_seqno
+ // must be assigned to the ingested file).
+ bool allow_global_seqno = true;
+ // If set to false and the file key range overlaps with the memtable key range
+ // (memtable flush required), IngestExternalFile will fail.
+ bool allow_blocking_flush = true;
+ // Set to true if you would like duplicate keys in the file being ingested
+ // to be skipped rather than overwriting existing data under that key.
+ // Use case: back-fill of some historical data in the database without
+ // over-writing existing newer version of data.
+ // This option could only be used if the DB has been running
+ // with allow_ingest_behind=true since the dawn of time.
+ // All files will be ingested at the bottommost level with seqno=0.
+ bool ingest_behind = false;
+ // DEPRECATED - Set to true if you would like to write global_seqno to
+ // the external SST file on ingestion for backward compatibility before
+ // RocksDB 5.16.0. Such old versions of RocksDB expect any global_seqno to
+ // be written to the SST file rather than recorded in the DB manifest.
+ // This functionality was deprecated because (a) random writes might be
+ // costly or unsupported on some FileSystems, and (b) the file checksum
+ // changes with such a write.
+ bool write_global_seqno = false;
+ // Set to true if you would like to verify the checksums of each block of the
+ // external SST file before ingestion.
+ // Warning: setting this to true causes slowdown in file ingestion because
+ // the external SST file has to be read.
+ bool verify_checksums_before_ingest = false;
+ // When verify_checksums_before_ingest = true, RocksDB uses the default
+ // readahead setting to scan the file while verifying checksums before
+ // ingestion.
+ // Users can override the default value using this option.
+ // Using a large readahead size (> 2MB) can typically improve the performance
+ // of forward iteration on spinning disks.
+ size_t verify_checksums_readahead_size = 0;
+ // Set to TRUE if user wants to verify the sst file checksum of ingested
+ // files. The DB checksum function will generate the checksum of each
+ // ingested file (if file_checksum_gen_factory is set) and compare the
+ // checksum function name and checksum with the ingested checksum information.
+ //
+ // If this option is set to TRUE: 1) if the DB does not enable checksums
+ // (file_checksum_gen_factory == nullptr), the ingested checksum information
+ // will be ignored; 2) if the DB enables the checksum function, we calculate
+ // the sst file checksum after the file is moved or copied and compare the
+ // checksum and checksum name. If the checksum or checksum function name does
+ // not match, ingestion will fail. If the verification is successful, the
+ // checksum and checksum function name will be stored in the Manifest.
+ // If this option is set to FALSE: 1) if the DB does not enable checksums,
+ // the ingested checksum information will be ignored; 2) if the DB enables
+ // the checksum, we only verify the ingested checksum function name and we
+ // trust the ingested checksum. If the checksum function name matches, we
+ // store the checksum in the Manifest. The DB does not calculate the checksum
+ // during ingestion. However, if no checksum information is provided with the
+ // ingested files, the DB will generate the checksum and store it in the
+ // Manifest.
+ bool verify_file_checksum = true;
+ // Set to TRUE if user wants the file to be ingested to the bottommost level.
+ // An error of Status::TryAgain() will be returned if a file cannot fit in
+ // the bottommost level when calling
+ // DB::IngestExternalFile()/DB::IngestExternalFiles(). The user should clear
+ // the bottommost level in the overlapping range before re-attempting.
+ //
+ // ingest_behind takes precedence over fail_if_not_bottommost_level.
+ bool fail_if_not_bottommost_level = false;
+};
+
+enum TraceFilterType : uint64_t {
+ // Trace all the operations
+ kTraceFilterNone = 0x0,
+ // Do not trace the get operations
+ kTraceFilterGet = 0x1 << 0,
+ // Do not trace the write operations
+ kTraceFilterWrite = 0x1 << 1,
+ // Do not trace the `Iterator::Seek()` operations
+ kTraceFilterIteratorSeek = 0x1 << 2,
+ // Do not trace the `Iterator::SeekForPrev()` operations
+ kTraceFilterIteratorSeekForPrev = 0x1 << 3,
+ // Do not trace the `MultiGet()` operations
+ kTraceFilterMultiGet = 0x1 << 4,
+};
+
+// TraceOptions is used for StartTrace
+struct TraceOptions {
+ // To prevent the trace file from growing larger than the available storage
+ // space, the user can set the max trace file size in bytes. Default is 64GB.
+ uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
+ // Specify trace sampling option, i.e. capture one per how many requests.
+ // Default to 1 (capture every request).
+ uint64_t sampling_frequency = 1;
+ // Note: The filtering happens before sampling.
+ uint64_t filter = kTraceFilterNone;
+ // When true, the order of write records in the trace will match the order of
+ // the corresponding write records in the WAL and applied to the DB. There may
+ // be a performance penalty associated with preserving this ordering.
+ //
+ // Default: false. This means write records in the trace may be in an order
+ // different from the WAL's order.
+ bool preserve_write_order = false;
+};
+
+// ImportColumnFamilyOptions is used by ImportColumnFamily()
+struct ImportColumnFamilyOptions {
+ // Can be set to true to move the files instead of copying them.
+ bool move_files = false;
+};
+
+// Options used with DB::GetApproximateSizes()
+struct SizeApproximationOptions {
+ // Defines whether the returned size should include the recently written
+ // data in the memtables. If set to false, include_files must be true.
+ bool include_memtables = false;
+ // Defines whether the returned size should include data serialized to disk.
+ // If set to false, include_memtables must be true.
+ bool include_files = true;
+ // When approximating the total size of files used to store a key range
+ // using DB::GetApproximateSizes, allow approximation with an error margin of
+ // up to total_files_size * files_size_error_margin. This allows taking some
+ // shortcuts in file size approximation, resulting in better performance,
+ // while guaranteeing the resulting error is within a reasonable margin.
+ // E.g., if the value is 0.1, then the error margin of the returned file size
+ // approximation will be within 10%.
+ // If the value is non-positive, a more precise yet more CPU-intensive
+ // estimation is performed.
+ double files_size_error_margin = -1.0;
+};
+
+struct CompactionServiceOptionsOverride {
+ // Currently pointer-valued configurations are not passed to the compaction
+ // service, so the user needs to set them here. This will be removed once
+ // pointer configuration passing is supported.
+ Env* env = Env::Default();
+ std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
+
+ const Comparator* comparator = BytewiseComparator();
+ std::shared_ptr<MergeOperator> merge_operator = nullptr;
+ const CompactionFilter* compaction_filter = nullptr;
+ std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
+ std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
+ std::shared_ptr<TableFactory> table_factory;
+ std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory = nullptr;
+
+ // Only a subset of events is triggered in the remote compaction worker,
+ // e.g.: `OnTableFileCreated`, `OnTableFileCreationStarted`,
+ // `ShouldBeNotifiedOnFileIO`, `OnSubcompactionBegin`,
+ // `OnSubcompactionCompleted`, etc. Note that `OnCompactionBegin` and
+ // `OnCompactionCompleted` won't be triggered; they will be triggered on the
+ // primary DB side.
+ std::vector<std::shared_ptr<EventListener>> listeners;
+
+ // statistics is used to collect DB operation metrics. The metrics won't be
+ // returned to the CompactionService primary host; to collect them, the user
+ // needs to set it here.
+ std::shared_ptr<Statistics> statistics = nullptr;
+
+ // Only compaction-generated SST files use these user-defined table
+ // properties collectors.
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+     table_properties_collector_factories;
+};
+
+struct OpenAndCompactOptions {
+ // Allows cancellation of an in-progress compaction.
+ std::atomic<bool>* canceled = nullptr;
+};
+
+struct LiveFilesStorageInfoOptions {
+ // Whether to populate FileStorageInfo::file_checksum* or leave blank
+ bool include_checksum_info = false;
+ // Flushes memtables if total size in bytes of live WAL files is >= this
+ // number (and DB is not read-only).
+ // Default: always force a flush without checking sizes.
+ uint64_t wal_size_for_flush = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/perf_context.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/perf_context.h
new file mode 100644
index 0000000000..46266abbaa
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/perf_context.h
@@ -0,0 +1,314 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <map>
+#include <string>
+
+#include "rocksdb/perf_level.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/*
+ * NOTE:
+ * Please do not reorder the fields in this structure. If you plan to do that
+ * or add/remove fields to this structure, builds would fail. The way to fix
+ * the builds would be to add the appropriate fields to the
+ * DEF_PERF_CONTEXT_LEVEL_METRICS() macro in the perf_context.cc file.
+ */
+
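+// Illustrative usage (a sketch; counter names as declared below): collecting
+// per-query counters with the perf context, using SetPerfLevel from
+// rocksdb/perf_level.h and get_perf_context() from this header:
+//
+//   rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
+//   rocksdb::get_perf_context()->Reset();
+//   // ... issue some reads ...
+//   uint64_t hits = rocksdb::get_perf_context()->block_cache_hit_count;
+//
+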
+ uint64_t bloom_filter_full_true_positive = 0; + + // total number of user key returned (only include keys that are found, does + // not include keys that are deleted or merged without a final put + uint64_t user_key_return_count = 0; + + // total nanos spent on reading data from SST files + uint64_t get_from_table_nanos = 0; + + uint64_t block_cache_hit_count = 0; // total number of block cache hits + uint64_t block_cache_miss_count = 0; // total number of block cache misses +}; + +// A thread local context for gathering performance counter efficiently +// and transparently. +// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats. + +// Break down performance counters by level and store per-level perf context in +// PerfContextByLevel +struct PerfContextByLevel : public PerfContextByLevelBase { + void Reset(); // reset all performance counters to zero +}; + +/* + * NOTE: + * Please do not reorder the fields in this structure. If you plan to do that or + * add/remove fields to this structure, builds would fail. The way to fix the + * builds would be to add the appropriate fields to the + * DEF_PERF_CONTEXT_METRICS() macro in the perf_context.cc file. + */ + +struct PerfContextBase { + uint64_t user_key_comparison_count; // total number of user key comparisons + uint64_t block_cache_hit_count; // total number of block cache hits + uint64_t block_read_count; // total number of block reads (with IO) + uint64_t block_read_byte; // total number of bytes from block reads + uint64_t block_read_time; // total nanos spent on block reads + // total cpu time in nanos spent on block reads + uint64_t block_read_cpu_time; + uint64_t block_cache_index_hit_count; // total number of index block hits + // total number of standalone handles lookup from secondary cache + uint64_t block_cache_standalone_handle_count; + // total number of real handles lookup from secondary cache that are inserted + // into primary cache + uint64_t block_cache_real_handle_count; + uint64_t index_block_read_count; // total number of index block reads + uint64_t block_cache_filter_hit_count; // total number of filter block hits + uint64_t filter_block_read_count; // total number of filter block reads + uint64_t compression_dict_block_read_count; // total number of compression + // dictionary block reads + + uint64_t secondary_cache_hit_count; // total number of secondary cache hits + // total number of real handles inserted into secondary cache + uint64_t compressed_sec_cache_insert_real_count; + // total number of dummy handles inserted into secondary cache + uint64_t compressed_sec_cache_insert_dummy_count; + // bytes for vals before compression in secondary cache + uint64_t compressed_sec_cache_uncompressed_bytes; + // bytes for vals after compression in secondary cache + uint64_t compressed_sec_cache_compressed_bytes; + + uint64_t block_checksum_time; // total nanos spent on block checksum + uint64_t block_decompress_time; // total nanos spent on block decompression + + uint64_t get_read_bytes; // bytes for vals returned by Get + uint64_t multiget_read_bytes; // bytes for vals returned by MultiGet + uint64_t iter_read_bytes; // bytes for keys/vals decoded by iterator + + uint64_t blob_cache_hit_count; // total number of blob cache hits + uint64_t blob_read_count; // total number of blob reads (with IO) + uint64_t blob_read_byte; // total number of bytes from blob reads + uint64_t blob_read_time; // total nanos spent on blob reads + uint64_t blob_checksum_time; // total nanos spent on blob checksum + uint64_t 
blob_decompress_time; // total nanos spent on blob decompression + + // total number of internal keys skipped over during iteration. + // There are several reasons for it: + // 1. when calling Next(), the iterator is in the position of the previous + // key, so that we'll need to skip it. It means this counter will always + // be incremented in Next(). + // 2. when calling Next(), we need to skip internal entries for the previous + // keys that are overwritten. + // 3. when calling Next(), Seek() or SeekToFirst(), after previous key + // before calling Next(), the seek key in Seek() or the beginning for + // SeekToFirst(), there may be one or more deleted keys before the next + // valid key that the operation should place the iterator to. We need + // to skip both of the tombstone and updates hidden by the tombstones. The + // tombstones are not included in this counter, while previous updates + // hidden by the tombstones will be included here. + // 4. symmetric cases for Prev() and SeekToLast() + // internal_recent_skipped_count is not included in this counter. + // + uint64_t internal_key_skipped_count; + // Total number of deletes and single deletes skipped over during iteration + // When calling Next(), Seek() or SeekToFirst(), after previous position + // before calling Next(), the seek key in Seek() or the beginning for + // SeekToFirst(), there may be one or more deleted keys before the next valid + // key. Every deleted key is counted once. We don't recount here if there are + // still older updates invalidated by the tombstones. + // + uint64_t internal_delete_skipped_count; + // How many times iterators skipped over internal keys that are more recent + // than the snapshot that iterator is using. + // + uint64_t internal_recent_skipped_count; + // How many merge operands were fed into the merge operator by iterators. + // Note: base values are not included in the count. + // + uint64_t internal_merge_count; + // How many merge operands were fed into the merge operator by point lookups. + // Note: base values are not included in the count. + // + uint64_t internal_merge_point_lookup_count; + // Number of times we reseeked inside a merging iterator, specifically to skip + // after or before a range of keys covered by a range deletion in a newer LSM + // component. 
+ uint64_t internal_range_del_reseek_count; + + uint64_t get_snapshot_time; // total nanos spent on getting snapshot + uint64_t get_from_memtable_time; // total nanos spent on querying memtables + uint64_t get_from_memtable_count; // number of mem tables queried + // total nanos spent after Get() finds a key + uint64_t get_post_process_time; + uint64_t get_from_output_files_time; // total nanos reading from output files + // total nanos spent on seeking memtable + uint64_t seek_on_memtable_time; + // number of seeks issued on memtable + // (including SeekForPrev but not SeekToFirst and SeekToLast) + uint64_t seek_on_memtable_count; + // number of Next()s issued on memtable + uint64_t next_on_memtable_count; + // number of Prev()s issued on memtable + uint64_t prev_on_memtable_count; + // total nanos spent on seeking child iters + uint64_t seek_child_seek_time; + // number of seek issued in child iterators + uint64_t seek_child_seek_count; + uint64_t seek_min_heap_time; // total nanos spent on the merge min heap + uint64_t seek_max_heap_time; // total nanos spent on the merge max heap + // total nanos spent on seeking the internal entries + uint64_t seek_internal_seek_time; + // total nanos spent on iterating internal entries to find the next user entry + uint64_t find_next_user_entry_time; + + // This group of stats provide a breakdown of time spent by Write(). + // May be inaccurate when 2PC, two_write_queues or enable_pipelined_write + // are enabled. + // + // total nanos spent on writing to WAL + uint64_t write_wal_time; + // total nanos spent on writing to mem tables + uint64_t write_memtable_time; + // total nanos spent on delaying or throttling write + uint64_t write_delay_time; + // total nanos spent on switching memtable/wal and scheduling + // flushes/compactions. + uint64_t write_scheduling_flushes_compactions_time; + // total nanos spent on writing a record, excluding the above four things + uint64_t write_pre_and_post_process_time; + + // time spent waiting for other threads of the batch group + uint64_t write_thread_wait_nanos; + + // time spent on acquiring DB mutex. + uint64_t db_mutex_lock_nanos; + // Time spent on waiting with a condition variable created with DB mutex. + uint64_t db_condition_wait_nanos; + // Time spent on merge operator. + uint64_t merge_operator_time_nanos; + + // Time spent on reading index block from block cache or SST file + uint64_t read_index_block_nanos; + // Time spent on reading filter block from block cache or SST file + uint64_t read_filter_block_nanos; + // Time spent on creating data block iterator + uint64_t new_table_block_iter_nanos; + // Time spent on creating a iterator of an SST file. + uint64_t new_table_iterator_nanos; + // Time spent on seeking a key in data/index blocks + uint64_t block_seek_nanos; + // Time spent on finding or creating a table reader + uint64_t find_table_nanos; + // total number of mem table bloom hits + uint64_t bloom_memtable_hit_count; + // total number of mem table bloom misses + uint64_t bloom_memtable_miss_count; + // total number of SST bloom hits + uint64_t bloom_sst_hit_count; + // total number of SST bloom misses + uint64_t bloom_sst_miss_count; + + // Time spent waiting on key locks in transaction lock manager. + uint64_t key_lock_wait_time; + // number of times acquiring a lock was blocked by another transaction. + uint64_t key_lock_wait_count; + + // Total time spent in Env filesystem operations. These are only populated + // when TimedEnv is used. 
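+ // (A minimal sketch of enabling these, assuming a NewTimedEnv(Env*)
+ // wrapper factory is available in this build of RocksDB:
+ //
+ //   Env* timed_env = NewTimedEnv(Env::Default());
+ //   options.env = timed_env;
+ // )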
+ uint64_t env_new_sequential_file_nanos; + uint64_t env_new_random_access_file_nanos; + uint64_t env_new_writable_file_nanos; + uint64_t env_reuse_writable_file_nanos; + uint64_t env_new_random_rw_file_nanos; + uint64_t env_new_directory_nanos; + uint64_t env_file_exists_nanos; + uint64_t env_get_children_nanos; + uint64_t env_get_children_file_attributes_nanos; + uint64_t env_delete_file_nanos; + uint64_t env_create_dir_nanos; + uint64_t env_create_dir_if_missing_nanos; + uint64_t env_delete_dir_nanos; + uint64_t env_get_file_size_nanos; + uint64_t env_get_file_modification_time_nanos; + uint64_t env_rename_file_nanos; + uint64_t env_link_file_nanos; + uint64_t env_lock_file_nanos; + uint64_t env_unlock_file_nanos; + uint64_t env_new_logger_nanos; + + uint64_t get_cpu_nanos; + uint64_t iter_next_cpu_nanos; + uint64_t iter_prev_cpu_nanos; + uint64_t iter_seek_cpu_nanos; + + // EXPERIMENTAL + // Total number of db iterator's Next(), Prev(), Seek-related APIs being + // called + uint64_t iter_next_count; + uint64_t iter_prev_count; + uint64_t iter_seek_count; + + // Time spent in encrypting data. Populated when EncryptedEnv is used. + uint64_t encrypt_data_nanos; + // Time spent in decrypting data. Populated when EncryptedEnv is used. + uint64_t decrypt_data_nanos; + + uint64_t number_async_seek; +}; + +struct PerfContext : public PerfContextBase { + ~PerfContext(); + + PerfContext() {} + + PerfContext(const PerfContext&); + PerfContext& operator=(const PerfContext&); + PerfContext(PerfContext&&) noexcept; + + void Reset(); // reset all performance counters to zero + + std::string ToString(bool exclude_zero_counters = false) const; + + // enable per level perf context and allocate storage for PerfContextByLevel + void EnablePerLevelPerfContext(); + + // temporarily disable per level perf context by setting the flag to false + void DisablePerLevelPerfContext(); + + // free the space for PerfContextByLevel, also disable per level perf context + void ClearPerLevelPerfContext(); + + std::map* level_to_perf_context = nullptr; + bool per_level_perf_context_enabled = false; + + void copyMetrics(const PerfContext* other) noexcept; +}; + +// If RocksDB is compiled with -DNPERF_CONTEXT, then a pointer to a global, +// non-thread-local PerfContext object will be returned. Attempts to update +// this object will be ignored, and reading from it will also be no-op. +// Otherwise, +// a) if thread-local is supported on the platform, then a pointer to +// a thread-local PerfContext object will be returned. +// b) if thread-local is NOT supported, then compilation will fail. +// +// This function never returns nullptr. +PerfContext* get_perf_context(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/perf_level.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/perf_level.h new file mode 100644 index 0000000000..e7dded0e32 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/perf_level.h @@ -0,0 +1,36 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// How much perf stats to collect. Affects perf_context and iostats_context. 
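+//
+// Typical usage sketch (get_perf_context() is declared in perf_context.h):
+//
+//   SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+//   get_perf_context()->Reset();
+//   // ... issue some reads against the DB ...
+//   std::string report = get_perf_context()->ToString();
+//   SetPerfLevel(PerfLevel::kDisable);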
+enum PerfLevel : unsigned char {
+ kUninitialized = 0, // unknown setting
+ kDisable = 1, // disable perf stats
+ kEnableCount = 2, // enable only count stats
+ kEnableTimeExceptForMutex = 3, // Other than count stats, also enable time
+ // stats except for mutexes
+ // Other than time, also measure CPU time counters. Still don't measure
+ // time (neither wall time nor CPU time) for mutexes.
+ kEnableTimeAndCPUTimeExceptForMutex = 4,
+ kEnableTime = 5, // enable count and time stats
+ kOutOfBounds = 6 // N.B. Must always be the last value!
+};
+
+// set the perf stats level for current thread
+void SetPerfLevel(PerfLevel level);
+
+// get current perf stats level for current thread
+PerfLevel GetPerfLevel();
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/persistent_cache.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/persistent_cache.h
new file mode 100644
index 0000000000..f14f019993
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/persistent_cache.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PersistentCache
+//
+// Persistent cache interface for caching IO pages on a persistent medium. The
+// cache interface is specifically designed for persistent read cache.
+class PersistentCache {
+ public:
+ using StatsType = std::vector<std::map<std::string, double>>;
+
+ virtual ~PersistentCache() {}
+
+ // Insert to page cache
+ //
+ // page_key Identifier to identify a page uniquely across restarts
+ // data Page data to copy (caller retains ownership)
+ // size Size of the page
+ virtual Status Insert(const Slice& key, const char* data,
+ const size_t size) = 0;
+
+ // Lookup page cache by page identifier
+ //
+ // page_key Page identifier
+ // buf Buffer where the data should be copied
+ // size Size of the page
+ virtual Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
+ size_t* size) = 0;
+
+ // True if the cache is configured to store serialized blocks, which are
+ // potentially compressed and include a trailer (when SST format calls for
+ // one). False if the cache stores uncompressed blocks (no trailer).
+ virtual bool IsCompressed() = 0;
+
+ // Return stats as map of {string, double} per-tier
+ //
+ // Persistent cache can be initialized as a tier of caches. The stats are per
+ // tier, top-down
+ virtual StatsType Stats() = 0;
+
+ virtual std::string GetPrintableOptions() const = 0;
+
+ // Return a new numeric id. May be used by multiple clients who are
+ // sharding the same persistent cache to partition the key space. Typically
+ // the client will allocate a new id at startup and prepend the id to its
+ // cache keys.
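+ // For example (sketch):
+ //
+ //   uint64_t id = cache->NewId();          // once per client, at startup
+ //   std::string key = std::to_string(id);  // any stable encoding works
+ //   key.append(user_key.data(), user_key.size());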
+ virtual uint64_t NewId() = 0;
+};
+
+// Factory method to create a new persistent cache
+Status NewPersistentCache(Env* const env, const std::string& path,
+ const uint64_t size,
+ const std::shared_ptr<Logger>& log,
+ const bool optimized_for_nvm,
+ std::shared_ptr<PersistentCache>* cache);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/port_defs.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/port_defs.h
new file mode 100644
index 0000000000..9771aacb92
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/port_defs.h
@@ -0,0 +1,22 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file includes the common definitions used in the port/,
+// the public API (this directory), and other directories
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class CpuPriority {
+ kIdle = 0,
+ kLow = 1,
+ kNormal = 2,
+ kHigh = 3,
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/rate_limiter.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/rate_limiter.h
new file mode 100644
index 0000000000..9cad6edf4a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/rate_limiter.h
@@ -0,0 +1,159 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class RateLimiter {
+ public:
+ enum class OpType {
+ kRead,
+ kWrite,
+ };
+
+ enum class Mode {
+ kReadsOnly,
+ kWritesOnly,
+ kAllIo,
+ };
+
+ // For API compatibility, default to rate-limiting writes only.
+ explicit RateLimiter(Mode mode = Mode::kWritesOnly) : mode_(mode) {}
+
+ virtual ~RateLimiter() {}
+
+ // This API allows the user to dynamically change the rate limiter's bytes
+ // per second.
+ // REQUIRED: bytes_per_second > 0
+ virtual void SetBytesPerSecond(int64_t bytes_per_second) = 0;
+
+ // Deprecated. New RateLimiter derived classes should override
+ // Request(const int64_t, const Env::IOPriority, Statistics*) or
+ // Request(const int64_t, const Env::IOPriority, Statistics*, OpType)
+ // instead.
+ //
+ // Request a token for bytes. If this request cannot be satisfied, the call
+ // is blocked. Caller is responsible to make sure
+ // bytes <= GetSingleBurstBytes()
+ // and bytes >= 0.
+ virtual void Request(const int64_t /*bytes*/, const Env::IOPriority /*pri*/) {
+ assert(false);
+ }
+
+ // Request a token for bytes and potentially update statistics. If this
+ // request cannot be satisfied, the call is blocked.
Caller is responsible to + // make sure bytes <= GetSingleBurstBytes() + // and bytes >= 0. + virtual void Request(const int64_t bytes, const Env::IOPriority pri, + Statistics* /* stats */) { + // For API compatibility, default implementation calls the older API in + // which statistics are unsupported. + Request(bytes, pri); + } + + // Requests token to read or write bytes and potentially updates statistics. + // + // If this request can not be satisfied, the call is blocked. Caller is + // responsible to make sure bytes <= GetSingleBurstBytes() + // and bytes >= 0. + virtual void Request(const int64_t bytes, const Env::IOPriority pri, + Statistics* stats, OpType op_type) { + if (IsRateLimited(op_type)) { + Request(bytes, pri, stats); + } + } + + // Requests token to read or write bytes and potentially updates statistics. + // Takes into account GetSingleBurstBytes() and alignment (e.g., in case of + // direct I/O) to allocate an appropriate number of bytes, which may be less + // than the number of bytes requested. + virtual size_t RequestToken(size_t bytes, size_t alignment, + Env::IOPriority io_priority, Statistics* stats, + RateLimiter::OpType op_type); + + // Max bytes can be granted in a single burst + virtual int64_t GetSingleBurstBytes() const = 0; + + // Total bytes that go through rate limiter + virtual int64_t GetTotalBytesThrough( + const Env::IOPriority pri = Env::IO_TOTAL) const = 0; + + // Total # of requests that go through rate limiter + virtual int64_t GetTotalRequests( + const Env::IOPriority pri = Env::IO_TOTAL) const = 0; + + // Total # of requests that are pending for bytes in rate limiter + // For convenience, this function is supported by the RateLimiter returned + // by NewGenericRateLimiter but is not required by RocksDB. + // + // REQUIRED: total_pending_request != nullptr + virtual Status GetTotalPendingRequests( + int64_t* total_pending_requests, + const Env::IOPriority pri = Env::IO_TOTAL) const { + assert(total_pending_requests != nullptr); + (void)total_pending_requests; + (void)pri; + return Status::NotSupported(); + } + + virtual int64_t GetBytesPerSecond() const = 0; + + virtual bool IsRateLimited(OpType op_type) { + if ((mode_ == RateLimiter::Mode::kWritesOnly && + op_type == RateLimiter::OpType::kRead) || + (mode_ == RateLimiter::Mode::kReadsOnly && + op_type == RateLimiter::OpType::kWrite)) { + return false; + } + return true; + } + + protected: + Mode GetMode() { return mode_; } + + private: + const Mode mode_; +}; + +// Create a RateLimiter object, which can be shared among RocksDB instances to +// control write rate of flush and compaction. +// @rate_bytes_per_sec: this is the only parameter you want to set most of the +// time. It controls the total write rate of compaction and flush in bytes per +// second. Currently, RocksDB does not enforce rate limit for anything other +// than flush and compaction, e.g. write to WAL. +// @refill_period_us: this controls how often tokens are refilled. For example, +// when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to +// 100ms, then 1MB is refilled every 100ms internally. Larger value can lead to +// burstier writes while smaller value introduces more CPU overhead. +// The default should work for most cases. +// @fairness: RateLimiter accepts high-pri requests and low-pri requests. +// A low-pri request is usually blocked in favor of hi-pri request. Currently, +// RocksDB assigns low-pri to request from compaction and high-pri to request +// from flush. 
Low-pri requests can get blocked if flush requests come in
+// continuously. To avoid starvation, this fairness parameter grants low-pri
+// requests permission with a 1/fairness chance even when high-pri requests
+// exist. Leaving it at the default of 10 should work for most cases.
+// @mode: Mode indicates which types of operations count against the limit.
+// @auto_tuned: Enables dynamic adjustment of rate limit within the range
+// `[rate_bytes_per_sec / 20, rate_bytes_per_sec]`, according to
+// the recent demand for background I/O.
+extern RateLimiter* NewGenericRateLimiter(
+ int64_t rate_bytes_per_sec, int64_t refill_period_us = 100 * 1000,
+ int32_t fairness = 10,
+ RateLimiter::Mode mode = RateLimiter::Mode::kWritesOnly,
+ bool auto_tuned = false);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/rocksdb_namespace.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/rocksdb_namespace.h
new file mode 100644
index 0000000000..a339ec2aa5
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/rocksdb_namespace.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+// For testing purposes
+#if ROCKSDB_NAMESPACE == 42
+#undef ROCKSDB_NAMESPACE
+#endif
+
+// Normal logic
+#ifndef ROCKSDB_NAMESPACE
+#define ROCKSDB_NAMESPACE rocksdb
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/secondary_cache.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/secondary_cache.h
new file mode 100644
index 0000000000..2aad50280c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/secondary_cache.h
@@ -0,0 +1,139 @@
+// Copyright (c) 2021, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/advanced_cache.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A handle for lookup result. Immediately after SecondaryCache::Lookup() with
+// wait=false (and depending on the implementation), the handle could be in any
+// of the below states. It must not be destroyed while in the pending state.
+// * Pending state (IsReady() == false): result is not ready. Value() and Size()
+// must not be called.
+// * Ready + not found state (IsReady() == true, Value() == nullptr): the lookup
+// has completed, finding no match. Or an error occurred that prevented
+// normal completion of the Lookup.
+// * Ready + found state (IsReady() == true, Value() != nullptr): the lookup
+// has completed, finding an entry that has been loaded into an object that is
+// now owned by the caller.
+//
+// Wait() or SecondaryCache::WaitAll() may be skipped if IsReady() happens to
+// return true, but (depending on the implementation) IsReady() might never
+// return true without Wait() or SecondaryCache::WaitAll(). After the handle
+// is known ready, calling Value() is required to avoid a memory leak in case
+// of a cache hit.
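+// For example (sketch; `handle` was returned by SecondaryCache::Lookup()
+// with wait=false):
+//
+//   if (!handle->IsReady()) {
+//     handle->Wait();
+//   }
+//   Cache::ObjectPtr obj = handle->Value();
+//   if (obj != nullptr) { /* hit; the caller now owns the object */ }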
+class SecondaryCacheResultHandle { + public: + virtual ~SecondaryCacheResultHandle() = default; + + // Returns whether the handle is ready or not + virtual bool IsReady() = 0; + + // Block until handle becomes ready + virtual void Wait() = 0; + + // Return the cache entry object (also known as value). If nullptr, it means + // the lookup was unsuccessful. + virtual Cache::ObjectPtr Value() = 0; + + // Return the out_charge from the helper->create_cb used to construct the + // object. + // WART: potentially confusing name + virtual size_t Size() = 0; +}; + +// SecondaryCache +// +// Cache interface for caching blocks on a secondary tier (which can include +// non-volatile media, or alternate forms of caching such as compressed data) +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class SecondaryCache : public Customizable { + public: + ~SecondaryCache() override = default; + + static const char* Type() { return "SecondaryCache"; } + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* result); + + // Suggest inserting an entry into this cache. The caller retains ownership + // of `obj` (also called the "value"), so is only used directly by the + // SecondaryCache during Insert(). When the cache chooses to perform the + // suggested insertion, it uses the size_cb and saveto_cb provided by + // `helper` to extract the persistable data (typically an uncompressed block) + // and writes it to this cache tier. OK may be returned even if the insertion + // is not made. + virtual Status Insert(const Slice& key, Cache::ObjectPtr obj, + const Cache::CacheItemHelper* helper) = 0; + + // Insert a value from its saved/persistable data (typically uncompressed + // block), as if generated by SaveToCallback/SizeCallback. This can be used + // in "warming up" the cache from some auxiliary source, and like Insert() + // may or may not write it to cache depending on the admission control + // policy, even if the return status is success. + // + // The default implementation only assumes the entry helper's create_cb is + // called at Lookup() time and not Insert() time, so should work for all + // foreseeable implementations. + virtual Status InsertSaved(const Slice& key, const Slice& saved); + + // Lookup the data for the given key in this cache. The create_cb + // will be used to create the object. The handle returned may not be + // ready yet, unless wait=true, in which case Lookup() will block until + // the handle is ready. + // + // advise_erase is a hint from the primary cache indicating that the handle + // will be cached there, so the secondary cache is advised to drop it from + // the cache as an optimization. To use this feature, SupportForceErase() + // needs to return true. + // This hint can also be safely ignored. + // + // kept_in_sec_cache is to indicate whether the entry will be kept in the + // secondary cache after the Lookup (rather than erased because of Lookup) + virtual std::unique_ptr Lookup( + const Slice& key, const Cache::CacheItemHelper* helper, + Cache::CreateContext* create_context, bool wait, bool advise_erase, + bool& kept_in_sec_cache) = 0; + + // Indicate whether a handle can be erased in this secondary cache. 
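+ // (When this returns true, a caller may pass advise_erase=true to Lookup()
+ // to hint that this tier can drop the entry; see Lookup() above.)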
+ [[nodiscard]] virtual bool SupportForceErase() const = 0; + + // At the discretion of the implementation, erase the data associated + // with key. + virtual void Erase(const Slice& key) = 0; + + // Wait for a collection of handles to become ready. + virtual void WaitAll(std::vector handles) = 0; + + // Set the maximum configured capacity of the cache. + // When the new capacity is less than the old capacity and the existing usage + // is greater than new capacity, the implementation will do its best job to + // purge the released entries from the cache in order to lower the usage. + // + // The derived class can make this function no-op and return NotSupported(). + virtual Status SetCapacity(size_t /* capacity */) { + return Status::NotSupported(); + } + + // The derived class can make this function no-op and return NotSupported(). + virtual Status GetCapacity(size_t& /* capacity */) { + return Status::NotSupported(); + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/slice.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/slice.h new file mode 100644 index 0000000000..0d7eb59499 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/slice.h @@ -0,0 +1,264 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Slice is a simple structure containing a pointer into some external +// storage and a size. The user of a Slice must ensure that the slice +// is not used after the corresponding external storage has been +// deallocated. +// +// Multiple threads can invoke const methods on a Slice without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Slice must use +// external synchronization. + +#pragma once + +#include +#include +#include +#include +#include +#include // RocksDB now requires C++17 support + +#include "rocksdb/cleanable.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice { + public: + // Create an empty slice. + Slice() : data_(""), size_(0) {} + + // Create a slice that refers to d[0,n-1]. + Slice(const char* d, size_t n) : data_(d), size_(n) {} + + // Create a slice that refers to the contents of "s" + /* implicit */ + Slice(const std::string& s) : data_(s.data()), size_(s.size()) {} + + // Create a slice that refers to the same contents as "sv" + /* implicit */ + Slice(const std::string_view& sv) : data_(sv.data()), size_(sv.size()) {} + + // Create a slice that refers to s[0,strlen(s)-1] + /* implicit */ + Slice(const char* s) : data_(s) { size_ = (s == nullptr) ? 0 : strlen(s); } + + // Create a single slice from SliceParts using buf as storage. + // buf must exist as long as the returned Slice exists. 
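+ // For example (sketch; SliceParts is defined later in this header):
+ //
+ //   Slice pieces[2] = {Slice("key_"), Slice("suffix")};
+ //   SliceParts parts(pieces, 2);
+ //   std::string storage;
+ //   Slice whole(parts, &storage);  // refers to "key_suffix" held in storage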
+ Slice(const struct SliceParts& parts, std::string* buf); + + // Return a pointer to the beginning of the referenced data + const char* data() const { return data_; } + + // Return the length (in bytes) of the referenced data + size_t size() const { return size_; } + + // Return true iff the length of the referenced data is zero + bool empty() const { return size_ == 0; } + + // Return the ith byte in the referenced data. + // REQUIRES: n < size() + char operator[](size_t n) const { + assert(n < size()); + return data_[n]; + } + + // Change this slice to refer to an empty array + void clear() { + data_ = ""; + size_ = 0; + } + + // Drop the first "n" bytes from this slice. + void remove_prefix(size_t n) { + assert(n <= size()); + data_ += n; + size_ -= n; + } + + void remove_suffix(size_t n) { + assert(n <= size()); + size_ -= n; + } + + // Return a string that contains the copy of the referenced data. + // when hex is true, returns a string of twice the length hex encoded (0-9A-F) + std::string ToString(bool hex = false) const; + + // Return a string_view that references the same data as this slice. + std::string_view ToStringView() const { + return std::string_view(data_, size_); + } + + // Decodes the current slice interpreted as an hexadecimal string into result, + // if successful returns true, if this isn't a valid hex string + // (e.g not coming from Slice::ToString(true)) DecodeHex returns false. + // This slice is expected to have an even number of 0-9A-F characters + // also accepts lowercase (a-f) + bool DecodeHex(std::string* result) const; + + // Three-way comparison. Returns value: + // < 0 iff "*this" < "b", + // == 0 iff "*this" == "b", + // > 0 iff "*this" > "b" + int compare(const Slice& b) const; + + // Return true iff "x" is a prefix of "*this" + bool starts_with(const Slice& x) const { + return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0)); + } + + bool ends_with(const Slice& x) const { + return ((size_ >= x.size_) && + (memcmp(data_ + size_ - x.size_, x.data_, x.size_) == 0)); + } + + // Compare two slices and returns the first byte where they differ + size_t difference_offset(const Slice& b) const; + + // private: make these public for rocksdbjni access + const char* data_; + size_t size_; + + // Intentionally copyable +}; + +/** + * A Slice that can be pinned with some cleanup tasks, which will be run upon + * ::Reset() or object destruction, whichever is invoked first. This can be used + * to avoid memcpy by having the PinnableSlice object referring to the data + * that is locked in the memory and release them after the data is consumed. + */ +class PinnableSlice : public Slice, public Cleanable { + public: + PinnableSlice() { buf_ = &self_space_; } + explicit PinnableSlice(std::string* buf) { buf_ = buf; } + + PinnableSlice(PinnableSlice&& other); + PinnableSlice& operator=(PinnableSlice&& other); + + // No copy constructor and copy assignment allowed. 
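+ // Typical use with point lookups (sketch; `db`, `cf`, and `key` assumed):
+ //
+ //   PinnableSlice value;
+ //   Status s = db->Get(ReadOptions(), cf, key, &value);
+ //   // consume value.data() / value.size(), then:
+ //   value.Reset();  // runs the registered cleanups and releases the pin
+ //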
+ PinnableSlice(PinnableSlice&) = delete; + PinnableSlice& operator=(PinnableSlice&) = delete; + + inline void PinSlice(const Slice& s, CleanupFunction f, void* arg1, + void* arg2) { + assert(!pinned_); + pinned_ = true; + data_ = s.data(); + size_ = s.size(); + RegisterCleanup(f, arg1, arg2); + assert(pinned_); + } + + inline void PinSlice(const Slice& s, Cleanable* cleanable) { + assert(!pinned_); + pinned_ = true; + data_ = s.data(); + size_ = s.size(); + if (cleanable != nullptr) { + cleanable->DelegateCleanupsTo(this); + } + assert(pinned_); + } + + inline void PinSelf(const Slice& slice) { + assert(!pinned_); + buf_->assign(slice.data(), slice.size()); + data_ = buf_->data(); + size_ = buf_->size(); + assert(!pinned_); + } + + inline void PinSelf() { + assert(!pinned_); + data_ = buf_->data(); + size_ = buf_->size(); + assert(!pinned_); + } + + void remove_suffix(size_t n) { + assert(n <= size()); + if (pinned_) { + size_ -= n; + } else { + buf_->erase(size() - n, n); + PinSelf(); + } + } + + void remove_prefix(size_t n) { + assert(n <= size()); + if (pinned_) { + data_ += n; + size_ -= n; + } else { + buf_->erase(0, n); + PinSelf(); + } + } + + void Reset() { + Cleanable::Reset(); + pinned_ = false; + size_ = 0; + } + + inline std::string* GetSelf() { return buf_; } + + inline bool IsPinned() const { return pinned_; } + + private: + friend class PinnableSlice4Test; + std::string self_space_; + std::string* buf_; + bool pinned_ = false; +}; + +// A set of Slices that are virtually concatenated together. 'parts' points +// to an array of Slices. The number of elements in the array is 'num_parts'. +struct SliceParts { + SliceParts(const Slice* _parts, int _num_parts) + : parts(_parts), num_parts(_num_parts) {} + SliceParts() : parts(nullptr), num_parts(0) {} + + const Slice* parts; + int num_parts; +}; + +inline bool operator==(const Slice& x, const Slice& y) { + return ((x.size() == y.size()) && + (memcmp(x.data(), y.data(), x.size()) == 0)); +} + +inline bool operator!=(const Slice& x, const Slice& y) { return !(x == y); } + +inline int Slice::compare(const Slice& b) const { + assert(data_ != nullptr && b.data_ != nullptr); + const size_t min_len = (size_ < b.size_) ? size_ : b.size_; + int r = memcmp(data_, b.data_, min_len); + if (r == 0) { + if (size_ < b.size_) + r = -1; + else if (size_ > b.size_) + r = +1; + } + return r; +} + +inline size_t Slice::difference_offset(const Slice& b) const { + size_t off = 0; + const size_t len = (size_ < b.size_) ? size_ : b.size_; + for (; off < len; off++) { + if (data_[off] != b.data_[off]) break; + } + return off; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/slice_transform.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/slice_transform.h new file mode 100644 index 0000000000..8909b9c539 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/slice_transform.h @@ -0,0 +1,135 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// Class for specifying user-defined functions which perform a +// transformation on a slice. It is not required that every slice +// belong to the domain and/or range of a function. Subclasses should +// define InDomain and InRange to determine which slices are in either +// of these sets respectively. + +#pragma once + +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +struct ConfigOptions; + +// A SliceTransform is a generic pluggable way of transforming one string +// to another. Its primary use-case is in configuring RocksDB prefix Bloom +// filters, by setting prefix_extractor in ColumnFamilyOptions. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class SliceTransform : public Customizable { + public: + virtual ~SliceTransform(){}; + + // Return the name of this transformation. + virtual const char* Name() const override = 0; + static const char* Type() { return "SliceTransform"; } + + // Creates and configures a new SliceTransform from the input options and id. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* result); + + // Returns a string representation of this SliceTransform, representing the ID + // and any additional properties. + std::string AsString() const; + + // Extract a prefix from a specified key, partial key, iterator upper bound, + // etc. This is normally used for building and checking prefix Bloom filters + // but should accept any string for which InDomain() returns true. + // See ColumnFamilyOptions::prefix_extractor for specific properties that + // must be satisfied by prefix extractors. + virtual Slice Transform(const Slice& key) const = 0; + + // Determine whether the specified key is compatible with the logic + // specified in the Transform method. Keys for which InDomain returns + // false will not be added to or queried against prefix Bloom filters. + // + // For example, if the Transform method returns a fixed length + // prefix of size 4, then an invocation to InDomain("abc") returns + // false because the specified key length(3) is shorter than the + // prefix size of 4. + // + // Wiki documentation here: + // https://github.com/facebook/rocksdb/wiki/Prefix-Seek + // + virtual bool InDomain(const Slice& key) const = 0; + + // DEPRECATED: This is currently not used and remains here for backward + // compatibility. + virtual bool InRange(const Slice& /*dst*/) const { return false; } + + // Returns information on maximum prefix length, if there is one. + // If Transform(x).size() == n for some keys and otherwise < n, + // should return true and set *len = n. Returning false is safe but + // currently disables some auto_prefix_mode filtering. + // Specifically, if the iterate_upper_bound is the immediate successor (see + // Comparator::IsSameLengthImmediateSuccessor) of the seek key's prefix, + // we require this function return true and iterate_upper_bound.size() == n + // to recognize and optimize the prefix seek. + // Otherwise (including FullLengthEnabled returns false, or prefix length is + // less than maximum), Seek with auto_prefix_mode is only optimized if the + // iterate_upper_bound and seek key have the same prefix. 
+ // BUG: Despite all these conditions and even with the extra condition on
+ // IsSameLengthImmediateSuccessor (see its "BUG" section), it is not
+ // sufficient to ensure auto_prefix_mode returns all entries that
+ // total_order_seek would return. See auto_prefix_mode "BUG" section.
+ virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; }
+
+ // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix.
+ //
+ // This function is not used by RocksDB itself, but by users. If users pass
+ // Options by string to RocksDB, they might not know what prefix extractor
+ // they are using. This function helps users determine:
+ // if they want to iterate over all keys with the prefix `prefix`, whether it
+ // is safe to use the prefix bloom filter and seek to the key `prefix`.
+ // If this function returns true, it means a user can Seek() to a prefix
+ // using the bloom filter. Otherwise, the user needs to skip the bloom filter
+ // by setting ReadOptions.total_order_seek = true.
+ //
+ // Here is an example: Suppose we implement a slice transform that returns
+ // the first part of the string up to and including the first ",":
+ // 1. SameResultWhenAppended("abc,") should return true. If applying prefix
+ // bloom filter using it, all slices matching "abc,.*" will be extracted
+ // to "abc,", so any SST file or memtable containing any of those keys
+ // will not be filtered out.
+ // 2. SameResultWhenAppended("abc") should return false. A user will not be
+ // guaranteed to see all the keys matching "abc.*" if a user prefix
+ // seeks to "abc" against a DB with the same setting. If one SST file
+ // only contains "abcd,e", the file can be filtered out and the key will
+ // be invisible, because the prefix according to the configured extractor
+ // is "abcd,".
+ //
+ // i.e., an implementation always returning false is safe.
+ virtual bool SameResultWhenAppended(const Slice& /*prefix*/) const {
+ return false;
+ }
+};
+
+// The prefix is the first `prefix_len` bytes of the key, and keys shorter
+// than `prefix_len` are not InDomain.
+extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
+
+// The prefix is the first min(length(key),`cap_len`) bytes of the key, and
+// all keys are InDomain.
+extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len);
+
+// Prefix is equal to key. All keys are InDomain.
+extern const SliceTransform* NewNoopTransform();
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/snapshot.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/snapshot.h
new file mode 100644
index 0000000000..1ea56e71e0
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/snapshot.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+
+// Abstract handle to a particular state of a DB.
+// A Snapshot is an immutable object and can therefore be safely
+// accessed from multiple threads without any external synchronization.
+//
+// To Create a Snapshot, call DB::GetSnapshot().
+// To Destroy a Snapshot, call DB::ReleaseSnapshot(snapshot).
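+//
+// For example (sketch; `db` is an open DB*):
+//
+//   const Snapshot* snap = db->GetSnapshot();
+//   ReadOptions read_opts;
+//   read_opts.snapshot = snap;
+//   // reads with read_opts now see the DB state as of the snapshot
+//   db->ReleaseSnapshot(snap);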
+class Snapshot { + public: + virtual SequenceNumber GetSequenceNumber() const = 0; + + // Returns unix time i.e. the number of seconds since the Epoch, 1970-01-01 + // 00:00:00 (UTC). + virtual int64_t GetUnixTime() const = 0; + + virtual uint64_t GetTimestamp() const = 0; + + protected: + virtual ~Snapshot(); +}; + +// Simple RAII wrapper class for Snapshot. +// Constructing this object will create a snapshot. Destructing will +// release the snapshot. +class ManagedSnapshot { + public: + explicit ManagedSnapshot(DB* db); + + // Instead of creating a snapshot, take ownership of the input snapshot. + ManagedSnapshot(DB* db, const Snapshot* _snapshot); + + ~ManagedSnapshot(); + + const Snapshot* snapshot(); + + private: + DB* db_; + const Snapshot* snapshot_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_dump_tool.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_dump_tool.h new file mode 100644 index 0000000000..0b81833f7d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_dump_tool.h @@ -0,0 +1,17 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +class SSTDumpTool { + public: + int Run(int argc, char const* const* argv, Options options = Options()); +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_file_manager.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_file_manager.h new file mode 100644 index 0000000000..6132921512 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_file_manager.h @@ -0,0 +1,136 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/file_system.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Env; +class Logger; + +// SstFileManager is used to track SST and blob files in the DB and control +// their deletion rate. All SstFileManager public functions are thread-safe. +// SstFileManager is NOT an extensible interface but a public interface for +// result of NewSstFileManager. Any derived classes must be RocksDB internal. +class SstFileManager { + public: + virtual ~SstFileManager() {} + + // Update the maximum allowed space that should be used by RocksDB, if + // the total size of the SST and blob files exceeds max_allowed_space, writes + // to RocksDB will fail. + // + // Setting max_allowed_space to 0 will disable this feature; maximum allowed + // space will be infinite (Default value). + // + // thread-safe. + virtual void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) = 0; + + // Set the amount of buffer room each compaction should be able to leave. + // In other words, at its maximum disk space consumption, the compaction + // should still leave compaction_buffer_size available on the disk so that + // other background functions may continue, such as logging and flushing. 
+ virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0;
+
+ // Return true if the total size of SST and blob files exceeded the maximum
+ // allowed space usage.
+ //
+ // thread-safe.
+ virtual bool IsMaxAllowedSpaceReached() = 0;
+
+ // Returns true if the total size of SST and blob files as well as the
+ // estimated size of ongoing compactions exceeds the maximum allowed space
+ // usage.
+ virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0;
+
+ // Return the total size of all tracked files.
+ // thread-safe
+ virtual uint64_t GetTotalSize() = 0;
+
+ // Return a map containing all tracked files and their corresponding sizes.
+ // thread-safe
+ virtual std::unordered_map<std::string, uint64_t> GetTrackedFiles() = 0;
+
+ // Return delete rate limit in bytes per second.
+ // thread-safe
+ virtual int64_t GetDeleteRateBytesPerSecond() = 0;
+
+ // Update the delete rate limit in bytes per second.
+ // zero means disable delete rate limiting and delete files immediately
+ // thread-safe
+ virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) = 0;
+
+ // Return trash/DB size ratio where new files will be deleted immediately
+ // thread-safe
+ virtual double GetMaxTrashDBRatio() = 0;
+
+ // Update trash/DB size ratio where new files will be deleted immediately
+ // thread-safe
+ virtual void SetMaxTrashDBRatio(double ratio) = 0;
+
+ // Return the total size of trash files
+ // thread-safe
+ virtual uint64_t GetTotalTrashSize() = 0;
+
+ // Set the statistics ptr to dump the stat information
+ virtual void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) = 0;
+};
+
+// Create a new SstFileManager that can be shared among multiple RocksDB
+// instances to track SST and blob files and control their deletion rate.
+// Even though SstFileManager doesn't track WAL files, it still controls
+// their deletion rate.
+//
+// @param env: Pointer to Env object, please see "rocksdb/env.h".
+// @param fs: Pointer to FileSystem object (see "rocksdb/file_system.h").
+// @param info_log: If not nullptr, info_log will be used to log errors.
+//
+// == Deletion rate limiting specific arguments ==
+// @param trash_dir: Deprecated, this argument has no effect
+// @param rate_bytes_per_sec: How many bytes should be deleted per second. If
+// this value is set to 1024 (1 KB/sec) and we deleted a file of size 4 KB
+// in 1 second, we will wait for another 3 seconds before we delete other
+// files. Set to 0 to disable deletion rate limiting.
+// This option also affects the delete rate of WAL files in the DB.
+// @param delete_existing_trash: Deprecated, this argument has no effect, but
+// if the user provides trash_dir we will schedule deletes for files in that
+// dir
+// @param status: If not nullptr, status will contain any errors that happened
+// during creating the missing trash_dir or deleting existing files in trash.
+// @param max_trash_db_ratio: If the trash size constitutes more than this
+// fraction of the total DB size we will start deleting new files passed to
+// DeleteScheduler immediately
+// @param bytes_max_delete_chunk: if a file to delete is larger than delete
+// chunk, ftruncate the file by this size each time, rather than dropping the
+// whole file. 0 means to always delete the whole file. If the file has more
+// than one linked name, the file will be deleted as a whole. Either way,
+// `rate_bytes_per_sec` will be respected. NOTE that with this option,
+// files already renamed as trash may be partial, so users should not
+// directly recover them without checking.
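+//
+// For example (sketch):
+//
+//   std::shared_ptr<SstFileManager> sfm(NewSstFileManager(Env::Default()));
+//   sfm->SetDeleteRateBytesPerSecond(1024 * 1024);  // throttle to 1MB/sec
+//   Options options;
+//   options.sst_file_manager = sfm;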
+extern SstFileManager* NewSstFileManager( + Env* env, std::shared_ptr fs, + std::shared_ptr info_log = nullptr, + const std::string& trash_dir = "", int64_t rate_bytes_per_sec = 0, + bool delete_existing_trash = true, Status* status = nullptr, + double max_trash_db_ratio = 0.25, + uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024); + +// Same as above, but takes a pointer to a legacy Env object, instead of +// Env and FileSystem objects +extern SstFileManager* NewSstFileManager( + Env* env, std::shared_ptr info_log = nullptr, + std::string trash_dir = "", int64_t rate_bytes_per_sec = 0, + bool delete_existing_trash = true, Status* status = nullptr, + double max_trash_db_ratio = 0.25, + uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_file_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_file_reader.h new file mode 100644 index 0000000000..026ae66d03 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_file_reader.h @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + + +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { + +// SstFileReader is used to read sst files that are generated by DB or +// SstFileWriter. +class SstFileReader { + public: + SstFileReader(const Options& options); + + ~SstFileReader(); + + // Prepares to read from the file located at "file_path". + Status Open(const std::string& file_path); + + // Returns a new iterator over the table contents. + // Most read options provide the same control as we read from DB. + // If "snapshot" is nullptr, the iterator returns only the latest keys. + Iterator* NewIterator(const ReadOptions& options); + + std::shared_ptr GetTableProperties() const; + + // Verifies whether there is corruption in this table. + Status VerifyChecksum(const ReadOptions& /*read_options*/); + + Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } + + private: + struct Rep; + std::unique_ptr rep_; +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_file_writer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_file_writer.h new file mode 100644 index 0000000000..fb2806865f --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_file_writer.h @@ -0,0 +1,189 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
+#elif _WIN32
+#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+
+// ExternalSstFileInfo includes information about sst files created
+// using SstFileWriter.
+struct ExternalSstFileInfo {
+ ExternalSstFileInfo()
+ : file_path(""),
+ smallest_key(""),
+ largest_key(""),
+ smallest_range_del_key(""),
+ largest_range_del_key(""),
+ file_checksum(""),
+ file_checksum_func_name(""),
+ sequence_number(0),
+ file_size(0),
+ num_entries(0),
+ num_range_del_entries(0),
+ version(0) {}
+
+ ExternalSstFileInfo(const std::string& _file_path,
+ const std::string& _smallest_key,
+ const std::string& _largest_key,
+ SequenceNumber _sequence_number, uint64_t _file_size,
+ int32_t _num_entries, int32_t _version)
+ : file_path(_file_path),
+ smallest_key(_smallest_key),
+ largest_key(_largest_key),
+ smallest_range_del_key(""),
+ largest_range_del_key(""),
+ file_checksum(""),
+ file_checksum_func_name(""),
+ sequence_number(_sequence_number),
+ file_size(_file_size),
+ num_entries(_num_entries),
+ num_range_del_entries(0),
+ version(_version) {}
+
+ std::string file_path; // external sst file path
+ std::string smallest_key; // smallest user key in file
+ std::string largest_key; // largest user key in file
+ std::string
+ smallest_range_del_key; // smallest range deletion user key in file
+ std::string largest_range_del_key; // largest range deletion user key in file
+ std::string file_checksum; // sst file checksum
+ std::string file_checksum_func_name; // The name of file checksum function
+ SequenceNumber sequence_number; // sequence number of all keys in file
+ uint64_t file_size; // file size in bytes
+ uint64_t num_entries; // number of entries in file
+ uint64_t num_range_del_entries; // number of range deletion entries in file
+ int32_t version; // file version
+};
+
+// SstFileWriter is used to create sst files that can be added to the database
+// later. All keys in files generated by SstFileWriter will have sequence
+// number = 0.
+class SstFileWriter {
+ public:
+ // User can pass `column_family` to specify that the generated file will
+ // be ingested into this column_family, note that passing nullptr means that
+ // the column_family is unknown.
+ // If invalidate_page_cache is set to true, SstFileWriter will give the OS a
+ // hint that this file's pages are not needed every time we write 1MB to the
+ // file. To use the rate limiter, an io_priority smaller than IO_TOTAL can be
+ // passed.
+ // The `skip_filters` option is DEPRECATED and could be removed in the
+ // future. Use `BlockBasedTableOptions::filter_policy` to control filter
+ // generation.
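+ //
+ // Basic usage (sketch; keys must be added in sorted order):
+ //
+ //   Options options;
+ //   SstFileWriter writer(EnvOptions(), options);
+ //   Status s = writer.Open("/tmp/example.sst");
+ //   if (s.ok()) s = writer.Put("k1", "v1");
+ //   if (s.ok()) s = writer.Put("k2", "v2");
+ //   if (s.ok()) s = writer.Finish();
+ //   // the file can then be ingested with DB::IngestExternalFile()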
+ SstFileWriter(const EnvOptions& env_options, const Options& options, + ColumnFamilyHandle* column_family = nullptr, + bool invalidate_page_cache = true, + Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL, + bool skip_filters = false) + : SstFileWriter(env_options, options, options.comparator, column_family, + invalidate_page_cache, io_priority, skip_filters) {} + + // Deprecated API + SstFileWriter(const EnvOptions& env_options, const Options& options, + const Comparator* user_comparator, + ColumnFamilyHandle* column_family = nullptr, + bool invalidate_page_cache = true, + Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL, + bool skip_filters = false); + + ~SstFileWriter(); + + // Prepare SstFileWriter to write into file located at "file_path". + Status Open(const std::string& file_path); + + // Add a Put key with value to currently opened file (deprecated) + // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) + // key according to the comparator. + // REQUIRES: comparator is *not* timestamp-aware. + ROCKSDB_DEPRECATED_FUNC Status Add(const Slice& user_key, const Slice& value); + + // Add a Put key with value to currently opened file + // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) + // key according to the comparator. + // REQUIRES: comparator is *not* timestamp-aware. + Status Put(const Slice& user_key, const Slice& value); + + // Add a Put (key with timestamp, value) to the currently opened file + // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) + // key according to the comparator. + // REQUIRES: timestamp's size is equal to what is expected by the comparator. + Status Put(const Slice& user_key, const Slice& timestamp, const Slice& value); + + // Add a Merge key with value to currently opened file + // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) + // key according to the comparator. + // REQUIRES: comparator is *not* timestamp-aware. + Status Merge(const Slice& user_key, const Slice& value); + + // Add a deletion key to currently opened file + // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) + // key according to the comparator. + // REQUIRES: comparator is *not* timestamp-aware. + Status Delete(const Slice& user_key); + + // Add a deletion key with timestamp to the currently opened file + // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) + // key according to the comparator. + // REQUIRES: timestamp's size is equal to what is expected by the comparator. + Status Delete(const Slice& user_key, const Slice& timestamp); + + // Add a range deletion tombstone to currently opened file. Such a range + // deletion tombstone does NOT delete point (Put/Merge/Delete) keys in the + // same file. + // + // Range deletion tombstones may be added in any order, both with respect to + // each other and with respect to the point (Put/Merge/Delete) keys in the + // same file. + // + // REQUIRES: The comparator orders `begin_key` at or before `end_key` + // REQUIRES: comparator is *not* timestamp-aware. + Status DeleteRange(const Slice& begin_key, const Slice& end_key); + + // Add a range deletion tombstone to currently opened file. Such a range + // deletion tombstone does NOT delete point (Put/Merge/Delete) keys in the + // same file. + // + // Range deletion tombstones may be added in any order, both with respect to + // each other and with respect to the point (Put/Merge/Delete) keys in the + // same file. 
+  //
+  // REQUIRES: begin_key and end_key are user keys without timestamp.
+  // REQUIRES: The comparator orders `begin_key` at or before `end_key`
+  // REQUIRES: timestamp's size is equal to what is expected by the comparator.
+  Status DeleteRange(const Slice& begin_key, const Slice& end_key,
+                     const Slice& timestamp);
+
+  // Finalize writing to sst file and close file.
+  //
+  // An optional ExternalSstFileInfo pointer can be passed to the function
+  // which will be populated with information about the created sst file.
+  Status Finish(ExternalSstFileInfo* file_info = nullptr);
+
+  // Return the current file size.
+  uint64_t FileSize();
+
+ private:
+  void InvalidatePageCache(bool closing);
+  struct Rep;
+  std::unique_ptr<Rep> rep_;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
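For orientation, a sketch of the typical SstFileWriter workflow (illustration only, not part of the vendored sources): keys must be added in comparator order, and the finished file is handed to the DB via `DB::IngestExternalFile`. The `BulkLoad` helper name and the file path are hypothetical.

    #include "rocksdb/db.h"
    #include "rocksdb/sst_file_writer.h"

    using namespace ROCKSDB_NAMESPACE;

    Status BulkLoad(DB* db) {  // hypothetical helper
      Options options;
      SstFileWriter writer(EnvOptions(), options);
      Status s = writer.Open("/tmp/bulk.sst");  // hypothetical path
      if (!s.ok()) return s;
      // Keys must arrive in increasing comparator order.
      for (char c = 'a'; c <= 'z' && s.ok(); ++c) {
        s = writer.Put(std::string(1, c), "value");
      }
      if (s.ok()) s = writer.Finish();
      // Ingest the finished file; all of its keys get sequence number 0.
      if (s.ok()) {
        s = db->IngestExternalFile({"/tmp/bulk.sst"},
                                   IngestExternalFileOptions());
      }
      return s;
    }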
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_partitioner.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_partitioner.h
new file mode 100644
index 0000000000..3af8e94929
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/sst_partitioner.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+enum PartitionerResult : char {
+  // Partitioner does not require to create new file
+  kNotRequired = 0x0,
+  // Partitioner is requesting forcefully to create new file
+  kRequired = 0x1
+  // Additional constants can be added
+};
+
+struct PartitionerRequest {
+  PartitionerRequest(const Slice& prev_user_key_,
+                     const Slice& current_user_key_,
+                     uint64_t current_output_file_size_)
+      : prev_user_key(&prev_user_key_),
+        current_user_key(&current_user_key_),
+        current_output_file_size(current_output_file_size_) {}
+  const Slice* prev_user_key;
+  const Slice* current_user_key;
+  uint64_t current_output_file_size;
+};
+
+/*
+ * A SstPartitioner is a generic pluggable way of defining the partition
+ * of SST files. Compaction job will split the SST files on partition boundary
+ * to lower the write amplification during SST file promote to higher level.
+ */
+class SstPartitioner {
+ public:
+  virtual ~SstPartitioner() {}
+
+  // Return the name of this partitioner.
+  virtual const char* Name() const = 0;
+
+  // It is called for all keys in compaction. When partitioner want to create
+  // new SST file it needs to return true. It means compaction job will finish
+  // current SST file where last key is "prev_user_key" parameter and start new
+  // SST file where first key is "current_user_key". Returns decision if
+  // partition boundary was detected and compaction should create new file.
+  virtual PartitionerResult ShouldPartition(
+      const PartitionerRequest& request) = 0;
+
+  // Called with smallest and largest keys in SST file when compaction try to do
+  // trivial move. Returns true is partitioner allows to do trivial move.
+  virtual bool CanDoTrivialMove(const Slice& smallest_user_key,
+                                const Slice& largest_user_key) = 0;
+
+  // Context information of a compaction run
+  struct Context {
+    // Does this compaction run include all data files
+    bool is_full_compaction;
+    // Is this compaction requested by the client (true),
+    // or is it occurring as an automatic compaction process
+    bool is_manual_compaction;
+    // Output level for this compaction
+    int output_level;
+    // Smallest key for compaction
+    Slice smallest_user_key;
+    // Largest key for compaction
+    Slice largest_user_key;
+  };
+};
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class SstPartitionerFactory : public Customizable {
+ public:
+  ~SstPartitionerFactory() override {}
+  static const char* Type() { return "SstPartitionerFactory"; }
+  static Status CreateFromString(
+      const ConfigOptions& options, const std::string& value,
+      std::shared_ptr<SstPartitionerFactory>* result);
+
+  virtual std::unique_ptr<SstPartitioner> CreatePartitioner(
+      const SstPartitioner::Context& context) const = 0;
+
+  // Returns a name that identifies this partitioner factory.
+  const char* Name() const override = 0;
+};
+
+/*
+ * Fixed key prefix partitioner. It splits the output SST files when prefix
+ * defined by size changes.
+ */
+class SstPartitionerFixedPrefix : public SstPartitioner {
+ public:
+  explicit SstPartitionerFixedPrefix(size_t len) : len_(len) {}
+
+  virtual ~SstPartitionerFixedPrefix() override {}
+
+  const char* Name() const override { return "SstPartitionerFixedPrefix"; }
+
+  PartitionerResult ShouldPartition(const PartitionerRequest& request) override;
+
+  bool CanDoTrivialMove(const Slice& smallest_user_key,
+                        const Slice& largest_user_key) override;
+
+ private:
+  size_t len_;
+};
+
+/*
+ * Factory for fixed prefix partitioner.
+ */
+class SstPartitionerFixedPrefixFactory : public SstPartitionerFactory {
+ public:
+  explicit SstPartitionerFixedPrefixFactory(size_t len);
+
+  ~SstPartitionerFixedPrefixFactory() override {}
+
+  static const char* kClassName() { return "SstPartitionerFixedPrefixFactory"; }
+  const char* Name() const override { return kClassName(); }
+
+  std::unique_ptr<SstPartitioner> CreatePartitioner(
+      const SstPartitioner::Context& /* context */) const override;
+
+ private:
+  size_t len_;
+};
+
+extern std::shared_ptr<SstPartitionerFactory>
+NewSstPartitionerFixedPrefixFactory(size_t prefix_len);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/statistics.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/statistics.h
new file mode 100644
index 0000000000..cfeaa5a53e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/statistics.h
@@ -0,0 +1,757 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/**
+ * Keep adding tickers here. Note that the C++ enum values, unlike the values in
+ * the Java bindings, are not guaranteed to be stable; also, the C++ and Java
+ * values for any given ticker are not guaranteed to match.
+ * 1.
Add the new ticker before TICKER_ENUM_MAX. + * 2. Add a readable string in TickersNameMap below for the newly added ticker. + * 3. Add a corresponding enum value to TickerType.java in the Java API. + * 4. Add the enum conversions from/to Java/C++ to portal.h's toJavaTickerType + * and toCppTickers. + */ +enum Tickers : uint32_t { + // total block cache misses + // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS + + // BLOCK_CACHE_FILTER_MISS + + // BLOCK_CACHE_DATA_MISS; + BLOCK_CACHE_MISS = 0, + // total block cache hit + // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT + + // BLOCK_CACHE_FILTER_HIT + + // BLOCK_CACHE_DATA_HIT; + BLOCK_CACHE_HIT, + // # of blocks added to block cache. + BLOCK_CACHE_ADD, + // # of failures when adding blocks to block cache. + BLOCK_CACHE_ADD_FAILURES, + // # of times cache miss when accessing index block from block cache. + BLOCK_CACHE_INDEX_MISS, + // # of times cache hit when accessing index block from block cache. + BLOCK_CACHE_INDEX_HIT, + // # of index blocks added to block cache. + BLOCK_CACHE_INDEX_ADD, + // # of bytes of index blocks inserted into cache + BLOCK_CACHE_INDEX_BYTES_INSERT, + // # of times cache miss when accessing filter block from block cache. + BLOCK_CACHE_FILTER_MISS, + // # of times cache hit when accessing filter block from block cache. + BLOCK_CACHE_FILTER_HIT, + // # of filter blocks added to block cache. + BLOCK_CACHE_FILTER_ADD, + // # of bytes of bloom filter blocks inserted into cache + BLOCK_CACHE_FILTER_BYTES_INSERT, + // # of times cache miss when accessing data block from block cache. + BLOCK_CACHE_DATA_MISS, + // # of times cache hit when accessing data block from block cache. + BLOCK_CACHE_DATA_HIT, + // # of data blocks added to block cache. + BLOCK_CACHE_DATA_ADD, + // # of bytes of data blocks inserted into cache + BLOCK_CACHE_DATA_BYTES_INSERT, + // # of bytes read from cache. + BLOCK_CACHE_BYTES_READ, + // # of bytes written into cache. + BLOCK_CACHE_BYTES_WRITE, + + // # of times bloom filter has avoided file reads, i.e., negatives. + BLOOM_FILTER_USEFUL, + // # of times bloom FullFilter has not avoided the reads. + BLOOM_FILTER_FULL_POSITIVE, + // # of times bloom FullFilter has not avoided the reads and data actually + // exist. + BLOOM_FILTER_FULL_TRUE_POSITIVE, + + // # persistent cache hit + PERSISTENT_CACHE_HIT, + // # persistent cache miss + PERSISTENT_CACHE_MISS, + + // # total simulation block cache hits + SIM_BLOCK_CACHE_HIT, + // # total simulation block cache misses + SIM_BLOCK_CACHE_MISS, + + // # of memtable hits. + MEMTABLE_HIT, + // # of memtable misses. + MEMTABLE_MISS, + + // # of Get() queries served by L0 + GET_HIT_L0, + // # of Get() queries served by L1 + GET_HIT_L1, + // # of Get() queries served by L2 and up + GET_HIT_L2_AND_UP, + + /** + * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction + * There are 4 reasons currently. + */ + COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value. + // Also includes keys dropped for range del. + COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete. + COMPACTION_KEY_DROP_RANGE_DEL, // key was covered by a range tombstone. + COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key. + COMPACTION_RANGE_DEL_DROP_OBSOLETE, // all keys in range were deleted. + // Deletions obsoleted before bottom level due to file gap optimization. 
+ COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, + // If a compaction was canceled in sfm to prevent ENOSPC + COMPACTION_CANCELLED, + + // Number of keys written to the database via the Put and Write call's + NUMBER_KEYS_WRITTEN, + // Number of Keys read, + NUMBER_KEYS_READ, + // Number keys updated, if inplace update is enabled + NUMBER_KEYS_UPDATED, + // The number of uncompressed bytes issued by DB::Put(), DB::Delete(), + // DB::Merge(), and DB::Write(). + BYTES_WRITTEN, + // The number of uncompressed bytes read from DB::Get(). It could be + // either from memtables, cache, or table files. + // For the number of logical bytes read from DB::MultiGet(), + // please use NUMBER_MULTIGET_BYTES_READ. + BYTES_READ, + // The number of calls to seek/next/prev + NUMBER_DB_SEEK, + NUMBER_DB_NEXT, + NUMBER_DB_PREV, + // The number of calls to seek/next/prev that returned data + NUMBER_DB_SEEK_FOUND, + NUMBER_DB_NEXT_FOUND, + NUMBER_DB_PREV_FOUND, + // The number of uncompressed bytes read from an iterator. + // Includes size of key and value. + ITER_BYTES_READ, + NO_FILE_OPENS, + NO_FILE_ERRORS, + // Writer has to wait for compaction or flush to finish. + STALL_MICROS, + // The wait time for db mutex. + // Disabled by default. To enable it set stats level to kAll + DB_MUTEX_WAIT_MICROS, + + // Number of MultiGet calls, keys read, and bytes read + NUMBER_MULTIGET_CALLS, + NUMBER_MULTIGET_KEYS_READ, + NUMBER_MULTIGET_BYTES_READ, + + NUMBER_MERGE_FAILURES, + + // Prefix filter stats when used for point lookups (Get / MultiGet). + // (For prefix filter stats on iterators, see *_LEVEL_SEEK_*.) + // Checked: filter was queried + BLOOM_FILTER_PREFIX_CHECKED, + // Useful: filter returned false so prevented accessing data+index blocks + BLOOM_FILTER_PREFIX_USEFUL, + // True positive: found a key matching the point query. When another key + // with the same prefix matches, it is considered a false positive by + // these statistics even though the filter returned a true positive. + BLOOM_FILTER_PREFIX_TRUE_POSITIVE, + + // Number of times we had to reseek inside an iteration to skip + // over large number of keys with same userkey. + NUMBER_OF_RESEEKS_IN_ITERATION, + + // Record the number of calls to GetUpdatesSince. Useful to keep track of + // transaction log iterator refreshes + GET_UPDATES_SINCE_CALLS, + WAL_FILE_SYNCED, // Number of times WAL sync is done + WAL_FILE_BYTES, // Number of bytes written to WAL + + // Writes can be processed by requesting thread or by the thread at the + // head of the writers queue. + WRITE_DONE_BY_SELF, + WRITE_DONE_BY_OTHER, // Equivalent to writes done for others + WRITE_WITH_WAL, // Number of Write calls that request WAL + COMPACT_READ_BYTES, // Bytes read during compaction + COMPACT_WRITE_BYTES, // Bytes written during compaction + FLUSH_WRITE_BYTES, // Bytes written during flush + + // Compaction read and write statistics broken down by CompactionReason + COMPACT_READ_BYTES_MARKED, + COMPACT_READ_BYTES_PERIODIC, + COMPACT_READ_BYTES_TTL, + COMPACT_WRITE_BYTES_MARKED, + COMPACT_WRITE_BYTES_PERIODIC, + COMPACT_WRITE_BYTES_TTL, + + // Number of table's properties loaded directly from file, without creating + // table reader object. 
+ NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, + NUMBER_SUPERVERSION_ACQUIRES, + NUMBER_SUPERVERSION_RELEASES, + NUMBER_SUPERVERSION_CLEANUPS, + + // # of compressions/decompressions executed + NUMBER_BLOCK_COMPRESSED, + NUMBER_BLOCK_DECOMPRESSED, + + // DEPRECATED / unused (see NUMBER_BLOCK_COMPRESSION_*) + NUMBER_BLOCK_NOT_COMPRESSED, + MERGE_OPERATION_TOTAL_TIME, + FILTER_OPERATION_TOTAL_TIME, + + // Row cache. + ROW_CACHE_HIT, + ROW_CACHE_MISS, + + // Read amplification statistics. + // Read amplification can be calculated using this formula + // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES) + // + // REQUIRES: ReadOptions::read_amp_bytes_per_bit to be enabled + READ_AMP_ESTIMATE_USEFUL_BYTES, // Estimate of total bytes actually used. + READ_AMP_TOTAL_READ_BYTES, // Total size of loaded data blocks. + + // Number of refill intervals where rate limiter's bytes are fully consumed. + NUMBER_RATE_LIMITER_DRAINS, + + // Number of internal keys skipped by Iterator + NUMBER_ITER_SKIP, + + // BlobDB specific stats + // # of Put/PutTTL/PutUntil to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_NUM_PUT, + // # of Write to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_NUM_WRITE, + // # of Get to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_NUM_GET, + // # of MultiGet to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_NUM_MULTIGET, + // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator. Only + // applicable to legacy BlobDB. + BLOB_DB_NUM_SEEK, + // # of Next to BlobDB iterator. Only applicable to legacy BlobDB. + BLOB_DB_NUM_NEXT, + // # of Prev to BlobDB iterator. Only applicable to legacy BlobDB. + BLOB_DB_NUM_PREV, + // # of keys written to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_NUM_KEYS_WRITTEN, + // # of keys read from BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_NUM_KEYS_READ, + // # of bytes (key + value) written to BlobDB. Only applicable to legacy + // BlobDB. + BLOB_DB_BYTES_WRITTEN, + // # of bytes (keys + value) read from BlobDB. Only applicable to legacy + // BlobDB. + BLOB_DB_BYTES_READ, + // # of keys written by BlobDB as non-TTL inlined value. Only applicable to + // legacy BlobDB. + BLOB_DB_WRITE_INLINED, + // # of keys written by BlobDB as TTL inlined value. Only applicable to legacy + // BlobDB. + BLOB_DB_WRITE_INLINED_TTL, + // # of keys written by BlobDB as non-TTL blob value. Only applicable to + // legacy BlobDB. + BLOB_DB_WRITE_BLOB, + // # of keys written by BlobDB as TTL blob value. Only applicable to legacy + // BlobDB. + BLOB_DB_WRITE_BLOB_TTL, + // # of bytes written to blob file. + BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + // # of bytes read from blob file. + BLOB_DB_BLOB_FILE_BYTES_READ, + // # of times a blob files being synced. + BLOB_DB_BLOB_FILE_SYNCED, + // # of blob index evicted from base DB by BlobDB compaction filter because + // of expiration. Only applicable to legacy BlobDB. + BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, + // size of blob index evicted from base DB by BlobDB compaction filter + // because of expiration. Only applicable to legacy BlobDB. + BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, + // # of blob index evicted from base DB by BlobDB compaction filter because + // of corresponding file deleted. Only applicable to legacy BlobDB. + BLOB_DB_BLOB_INDEX_EVICTED_COUNT, + // size of blob index evicted from base DB by BlobDB compaction filter + // because of corresponding file deleted. Only applicable to legacy BlobDB. 
+ BLOB_DB_BLOB_INDEX_EVICTED_SIZE, + // # of blob files that were obsoleted by garbage collection. Only applicable + // to legacy BlobDB. + BLOB_DB_GC_NUM_FILES, + // # of blob files generated by garbage collection. Only applicable to legacy + // BlobDB. + BLOB_DB_GC_NUM_NEW_FILES, + // # of BlobDB garbage collection failures. Only applicable to legacy BlobDB. + BLOB_DB_GC_FAILURES, + // # of keys relocated to new blob file by garbage collection. + BLOB_DB_GC_NUM_KEYS_RELOCATED, + // # of bytes relocated to new blob file by garbage collection. + BLOB_DB_GC_BYTES_RELOCATED, + // # of blob files evicted because of BlobDB is full. Only applicable to + // legacy BlobDB. + BLOB_DB_FIFO_NUM_FILES_EVICTED, + // # of keys in the blob files evicted because of BlobDB is full. Only + // applicable to legacy BlobDB. + BLOB_DB_FIFO_NUM_KEYS_EVICTED, + // # of bytes in the blob files evicted because of BlobDB is full. Only + // applicable to legacy BlobDB. + BLOB_DB_FIFO_BYTES_EVICTED, + + // These counters indicate a performance issue in WritePrepared transactions. + // We should not seem them ticking them much. + // # of times prepare_mutex_ is acquired in the fast path. + TXN_PREPARE_MUTEX_OVERHEAD, + // # of times old_commit_map_mutex_ is acquired in the fast path. + TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD, + // # of times we checked a batch for duplicate keys. + TXN_DUPLICATE_KEY_OVERHEAD, + // # of times snapshot_mutex_ is acquired in the fast path. + TXN_SNAPSHOT_MUTEX_OVERHEAD, + // # of times ::Get returned TryAgain due to expired snapshot seq + TXN_GET_TRY_AGAIN, + + // Number of keys actually found in MultiGet calls (vs number requested by + // caller) + // NUMBER_MULTIGET_KEYS_READ gives the number requested by caller + NUMBER_MULTIGET_KEYS_FOUND, + + NO_ITERATOR_CREATED, // number of iterators created + NO_ITERATOR_DELETED, // number of iterators deleted + + BLOCK_CACHE_COMPRESSION_DICT_MISS, + BLOCK_CACHE_COMPRESSION_DICT_HIT, + BLOCK_CACHE_COMPRESSION_DICT_ADD, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + + // # of blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_ADD_REDUNDANT <= BLOCK_CACHE_ADD + BLOCK_CACHE_ADD_REDUNDANT, + // # of index blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_INDEX_ADD_REDUNDANT <= BLOCK_CACHE_INDEX_ADD + BLOCK_CACHE_INDEX_ADD_REDUNDANT, + // # of filter blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_FILTER_ADD_REDUNDANT <= BLOCK_CACHE_FILTER_ADD + BLOCK_CACHE_FILTER_ADD_REDUNDANT, + // # of data blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_DATA_ADD_REDUNDANT <= BLOCK_CACHE_DATA_ADD + BLOCK_CACHE_DATA_ADD_REDUNDANT, + // # of dict blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT + // <= BLOCK_CACHE_COMPRESSION_DICT_ADD + BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, + + // # of files marked as trash by sst file manager and will be deleted + // later by background thread. + FILES_MARKED_TRASH, + // # of files deleted immediately by sst file manger through delete scheduler. 
+ FILES_DELETED_IMMEDIATELY, + + // The counters for error handler, not that, bg_io_error is the subset of + // bg_error and bg_retryable_io_error is the subset of bg_io_error + ERROR_HANDLER_BG_ERROR_COUNT, + ERROR_HANDLER_BG_IO_ERROR_COUNT, + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, + ERROR_HANDLER_AUTORESUME_COUNT, + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT, + + // Statistics for memtable garbage collection: + // Raw bytes of data (payload) present on memtable at flush time. + MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + // Outdated bytes of data present on memtable at flush time. + MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + + // Secondary cache statistics + SECONDARY_CACHE_HITS, + + // Bytes read by `VerifyChecksum()` and `VerifyFileChecksums()` APIs. + VERIFY_CHECKSUM_READ_BYTES, + + // Bytes read/written while creating backups + BACKUP_READ_BYTES, + BACKUP_WRITE_BYTES, + + // Remote compaction read/write statistics + REMOTE_COMPACT_READ_BYTES, + REMOTE_COMPACT_WRITE_BYTES, + + // Tiered storage related statistics + HOT_FILE_READ_BYTES, + WARM_FILE_READ_BYTES, + COLD_FILE_READ_BYTES, + HOT_FILE_READ_COUNT, + WARM_FILE_READ_COUNT, + COLD_FILE_READ_COUNT, + + // Last level and non-last level read statistics + LAST_LEVEL_READ_BYTES, + LAST_LEVEL_READ_COUNT, + NON_LAST_LEVEL_READ_BYTES, + NON_LAST_LEVEL_READ_COUNT, + + // Statistics on iterator Seek() (and variants) for each sorted run. I.e. a + // single user Seek() can result in many sorted run Seek()s. + // The stats are split between last level and non-last level. + // Filtered: a filter such as prefix Bloom filter indicate the Seek() would + // not find anything relevant, so avoided a likely access to data+index + // blocks. + LAST_LEVEL_SEEK_FILTERED, + // Filter match: a filter such as prefix Bloom filter was queried but did + // not filter out the seek. + LAST_LEVEL_SEEK_FILTER_MATCH, + // At least one data block was accessed for a Seek() (or variant) on a + // sorted run. + LAST_LEVEL_SEEK_DATA, + // At least one value() was accessed for the seek (suggesting it was useful), + // and no filter such as prefix Bloom was queried. + LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER, + // At least one value() was accessed for the seek (suggesting it was useful), + // after querying a filter such as prefix Bloom. + LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH, + // The same set of stats, but for non-last level seeks. + NON_LAST_LEVEL_SEEK_FILTERED, + NON_LAST_LEVEL_SEEK_FILTER_MATCH, + NON_LAST_LEVEL_SEEK_DATA, + NON_LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER, + NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH, + + // Number of block checksum verifications + BLOCK_CHECKSUM_COMPUTE_COUNT, + // Number of times RocksDB detected a corruption while verifying a block + // checksum. RocksDB does not remember corruptions that happened during user + // reads so the same block corruption may be detected multiple times. + BLOCK_CHECKSUM_MISMATCH_COUNT, + + MULTIGET_COROUTINE_COUNT, + + // Integrated BlobDB specific stats + // # of times cache miss when accessing blob from blob cache. + BLOB_DB_CACHE_MISS, + // # of times cache hit when accessing blob from blob cache. + BLOB_DB_CACHE_HIT, + // # of data blocks added to blob cache. + BLOB_DB_CACHE_ADD, + // # of failures when adding blobs to blob cache. + BLOB_DB_CACHE_ADD_FAILURES, + // # of bytes read from blob cache. + BLOB_DB_CACHE_BYTES_READ, + // # of bytes written into blob cache. 
+  BLOB_DB_CACHE_BYTES_WRITE,
+
+  // Time spent in the ReadAsync file system call
+  READ_ASYNC_MICROS,
+  // Number of errors returned to the async read callback
+  ASYNC_READ_ERROR_COUNT,
+
+  // Fine grained secondary cache stats
+  SECONDARY_CACHE_FILTER_HITS,
+  SECONDARY_CACHE_INDEX_HITS,
+  SECONDARY_CACHE_DATA_HITS,
+
+  // Number of lookup into the prefetched tail (see
+  // `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`)
+  // that can't find its data for table open
+  TABLE_OPEN_PREFETCH_TAIL_MISS,
+  // Number of lookup into the prefetched tail (see
+  // `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`)
+  // that finds its data for table open
+  TABLE_OPEN_PREFETCH_TAIL_HIT,
+
+  // Statistics on the filtering by user-defined timestamps
+  // # of times timestamps are checked on accessing the table
+  TIMESTAMP_FILTER_TABLE_CHECKED,
+  // # of times timestamps can successfully help skip the table access
+  TIMESTAMP_FILTER_TABLE_FILTERED,
+
+  // Number of input bytes (uncompressed) to compression for SST blocks that
+  // are stored compressed.
+  BYTES_COMPRESSED_FROM,
+  // Number of output bytes (compressed) from compression for SST blocks that
+  // are stored compressed.
+  BYTES_COMPRESSED_TO,
+  // Number of uncompressed bytes for SST blocks that are stored uncompressed
+  // because compression type is kNoCompression, or some error case caused
+  // compression not to run or produce an output. Index blocks are only counted
+  // if enable_index_compression is true.
+  BYTES_COMPRESSION_BYPASSED,
+  // Number of input bytes (uncompressed) to compression for SST blocks that
+  // are stored uncompressed because the compression result was rejected,
+  // either because the ratio was not acceptable (see
+  // CompressionOptions::max_compressed_bytes_per_kb) or found invalid by the
+  // `verify_compression` option.
+  BYTES_COMPRESSION_REJECTED,
+
+  // Like BYTES_COMPRESSION_BYPASSED but counting number of blocks
+  NUMBER_BLOCK_COMPRESSION_BYPASSED,
+  // Like BYTES_COMPRESSION_REJECTED but counting number of blocks
+  NUMBER_BLOCK_COMPRESSION_REJECTED,
+
+  // Number of input bytes (compressed) to decompression in reading compressed
+  // SST blocks from storage.
+  BYTES_DECOMPRESSED_FROM,
+  // Number of output bytes (uncompressed) from decompression in reading
+  // compressed SST blocks from storage.
+  BYTES_DECOMPRESSED_TO,
+
+  TICKER_ENUM_MAX
+};
+
+// The order of items listed in Tickers should be the same as
+// the order listed in TickersNameMap
+extern const std::vector<std::pair<Tickers, std::string>> TickersNameMap;
+
+/**
+ * Keep adding histograms here. Note that the C++ enum values, unlike the values
+ * in the Java bindings, are not guaranteed to be stable; also, the C++ and Java
+ * values for any given histogram are not guaranteed to match.
+ * 1. Add the new histogram before HISTOGRAM_ENUM_MAX.
+ * 2. Add a readable string in HistogramsNameMap below for the newly added
+ *    histogram.
+ * 3. Add a corresponding enum value to HistogramType.java in the Java API.
+ * 4. Add the enum conversions from/to Java/C++ to portal.h's
+ *    toJavaHistogramsType and toCppHistograms.
+ */ +enum Histograms : uint32_t { + DB_GET = 0, + DB_WRITE, + COMPACTION_TIME, + COMPACTION_CPU_TIME, + SUBCOMPACTION_SETUP_TIME, + TABLE_SYNC_MICROS, + COMPACTION_OUTFILE_SYNC_MICROS, + WAL_FILE_SYNC_MICROS, + MANIFEST_FILE_SYNC_MICROS, + // TIME SPENT IN IO DURING TABLE OPEN + TABLE_OPEN_IO_MICROS, + DB_MULTIGET, + READ_BLOCK_COMPACTION_MICROS, + READ_BLOCK_GET_MICROS, + WRITE_RAW_BLOCK_MICROS, + NUM_FILES_IN_SINGLE_COMPACTION, + DB_SEEK, + WRITE_STALL, + // Time spent in reading block-based or plain SST table + SST_READ_MICROS, + // Time spent in reading SST table (currently only block-based table) or blob + // file corresponding to `Env::IOActivity` + FILE_READ_FLUSH_MICROS, + FILE_READ_COMPACTION_MICROS, + FILE_READ_DB_OPEN_MICROS, + + // The number of subcompactions actually scheduled during a compaction + NUM_SUBCOMPACTIONS_SCHEDULED, + // Value size distribution in each operation + BYTES_PER_READ, + BYTES_PER_WRITE, + BYTES_PER_MULTIGET, + + BYTES_COMPRESSED, // DEPRECATED / unused (see BYTES_COMPRESSED_{FROM,TO}) + BYTES_DECOMPRESSED, // DEPRECATED / unused (see BYTES_DECOMPRESSED_{FROM,TO}) + COMPRESSION_TIMES_NANOS, + DECOMPRESSION_TIMES_NANOS, + // Number of merge operands passed to the merge operator in user read + // requests. + READ_NUM_MERGE_OPERANDS, + + // BlobDB specific stats + // Size of keys written to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_KEY_SIZE, + // Size of values written to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_VALUE_SIZE, + // BlobDB Put/PutWithTTL/PutUntil/Write latency. Only applicable to legacy + // BlobDB. + BLOB_DB_WRITE_MICROS, + // BlobDB Get latency. Only applicable to legacy BlobDB. + BLOB_DB_GET_MICROS, + // BlobDB MultiGet latency. Only applicable to legacy BlobDB. + BLOB_DB_MULTIGET_MICROS, + // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency. Only applicable to + // legacy BlobDB. + BLOB_DB_SEEK_MICROS, + // BlobDB Next latency. Only applicable to legacy BlobDB. + BLOB_DB_NEXT_MICROS, + // BlobDB Prev latency. Only applicable to legacy BlobDB. + BLOB_DB_PREV_MICROS, + // Blob file write latency. + BLOB_DB_BLOB_FILE_WRITE_MICROS, + // Blob file read latency. + BLOB_DB_BLOB_FILE_READ_MICROS, + // Blob file sync latency. + BLOB_DB_BLOB_FILE_SYNC_MICROS, + // BlobDB compression time. + BLOB_DB_COMPRESSION_MICROS, + // BlobDB decompression time. + BLOB_DB_DECOMPRESSION_MICROS, + // Time spent flushing memtable to disk + FLUSH_TIME, + SST_BATCH_SIZE, + + // MultiGet stats logged per level + // Num of index and filter blocks read from file system per level. + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + // Num of sst files read from file system per level. + NUM_SST_READ_PER_LEVEL, + + // Error handler statistics + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + + // Stats related to asynchronous read requests. + ASYNC_READ_BYTES, + POLL_WAIT_MICROS, + + // Number of prefetched bytes discarded by RocksDB. 
+  PREFETCHED_BYTES_DISCARDED,
+
+  // Number of IOs issued in parallel in a MultiGet batch
+  MULTIGET_IO_BATCH_SIZE,
+
+  // Number of levels requiring IO for MultiGet
+  NUM_LEVEL_READ_PER_MULTIGET,
+
+  // Wait time for aborting async read in FilePrefetchBuffer destructor
+  ASYNC_PREFETCH_ABORT_MICROS,
+
+  // Number of bytes read for RocksDB's prefetching contents (as opposed to file
+  // system's prefetch) from the end of SST table during block based table open
+  TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
+
+  HISTOGRAM_ENUM_MAX
+};
+
+extern const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap;
+
+struct HistogramData {
+  double median;
+  double percentile95;
+  double percentile99;
+  double average;
+  double standard_deviation;
+  // zero-initialize new members since old Statistics::histogramData()
+  // implementations won't write them.
+  double max = 0.0;
+  uint64_t count = 0;
+  uint64_t sum = 0;
+  double min = 0.0;
+};
+
+// StatsLevel can be used to reduce statistics overhead by skipping certain
+// types of stats in the stats collection process.
+// Usage:
+//   options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
enum StatsLevel : uint8_t {
+  // Disable all metrics
+  kDisableAll,
+  // Disable tickers
+  kExceptTickers = kDisableAll,
+  // Disable timer stats, and skip histogram stats
+  kExceptHistogramOrTimers,
+  // Skip timer stats
+  kExceptTimers,
+  // Collect all stats except time inside mutex lock AND time spent on
+  // compression.
+  kExceptDetailedTimers,
+  // Collect all stats except the counters requiring to get time inside the
+  // mutex lock.
+  kExceptTimeForMutex,
+  // Collect all stats, including measuring duration of mutex operations.
+  // If getting time is expensive on the platform to run, it can
+  // reduce scalability to more threads, especially for writes.
+  kAll,
+};
+
+// Analyze the performance of a db by providing cumulative stats over time.
+// Usage:
+//  Options options;
+//  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+//  Status s = DB::Open(options, kDBPath, &db);
+//  ...
+//  options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+//  HistogramData hist;
+//  options.statistics->histogramData(FLUSH_TIME, &hist);
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Statistics : public Customizable {
+ public:
+  ~Statistics() override {}
+  static const char* Type() { return "Statistics"; }
+  static Status CreateFromString(const ConfigOptions& opts,
+                                 const std::string& value,
+                                 std::shared_ptr<Statistics>* result);
+  // Default name of empty, for backwards compatibility. Derived classes should
+  // override this method.
+  // This default implementation will likely be removed in a future release
+  const char* Name() const override { return ""; }
+  virtual uint64_t getTickerCount(uint32_t tickerType) const = 0;
+  virtual void histogramData(uint32_t type,
+                             HistogramData* const data) const = 0;
+  virtual std::string getHistogramString(uint32_t /*type*/) const { return ""; }
+  virtual void recordTick(uint32_t tickerType, uint64_t count = 1) = 0;
+  virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0;
+  virtual uint64_t getAndResetTickerCount(uint32_t tickerType) = 0;
+  virtual void reportTimeToHistogram(uint32_t histogramType, uint64_t time) {
+    if (get_stats_level() <= StatsLevel::kExceptTimers) {
+      return;
+    }
+    recordInHistogram(histogramType, time);
+  }
+  // The function is here only for backward compatibility reason.
+  // Users implementing their own Statistics class should override
+  // recordInHistogram() instead and leave measureTime() as it is.
+  virtual void measureTime(uint32_t /*histogramType*/, uint64_t /*time*/) {
+    // This is not supposed to be called.
+    assert(false);
+  }
+  virtual void recordInHistogram(uint32_t histogramType, uint64_t time) {
+    // measureTime() is the old and inaccurate function name.
+    // To keep backward compatible. If users implement their own
+    // statistics, which overrides measureTime() but doesn't override
+    // this function. We forward to measureTime().
+    measureTime(histogramType, time);
+  }
+
+  // Resets all ticker and histogram stats
+  virtual Status Reset() { return Status::NotSupported("Not implemented"); }
+
+  using Customizable::ToString;
+  // String representation of the statistic object. Must be thread-safe.
+  virtual std::string ToString() const {
+    // Do nothing by default
+    return std::string("ToString(): not implemented");
+  }
+
+  virtual bool getTickerMap(std::map<std::string, uint64_t>*) const {
+    // Do nothing by default
+    return false;
+  }
+
+  // Override this function to disable particular histogram collection
+  virtual bool HistEnabledForType(uint32_t type) const {
+    return type < HISTOGRAM_ENUM_MAX;
+  }
+  void set_stats_level(StatsLevel sl) {
+    stats_level_.store(sl, std::memory_order_relaxed);
+  }
+  StatsLevel get_stats_level() const {
+    return stats_level_.load(std::memory_order_relaxed);
+  }
+
+ private:
+  std::atomic<StatsLevel> stats_level_{kExceptDetailedTimers};
+};
+
+// Create a concrete DBStatistics object
+std::shared_ptr<Statistics> CreateDBStatistics();
+
+}  // namespace ROCKSDB_NAMESPACE
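A short sketch of wiring the Statistics interface above into a DB and reading back one ticker and one histogram (illustration only, not part of the vendored sources); the DB path and `ReportStats` helper are hypothetical.

    #include <cassert>
    #include <cstdio>
    #include "rocksdb/db.h"
    #include "rocksdb/statistics.h"

    using namespace ROCKSDB_NAMESPACE;

    void ReportStats() {  // hypothetical helper
      Options options;
      options.create_if_missing = true;
      options.statistics = CreateDBStatistics();
      options.statistics->set_stats_level(StatsLevel::kExceptDetailedTimers);
      DB* db = nullptr;
      Status s = DB::Open(options, "/tmp/stats_demo", &db);  // hypothetical path
      assert(s.ok());
      // ... issue reads and writes here ...
      uint64_t hits = options.statistics->getTickerCount(BLOCK_CACHE_HIT);
      HistogramData get_latency;
      options.statistics->histogramData(DB_GET, &get_latency);
      printf("cache hits: %llu, p99 Get latency: %f us\n",
             (unsigned long long)hits, get_latency.percentile99);
      delete db;
    }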
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/stats_history.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/stats_history.h
new file mode 100644
index 0000000000..57e469295b
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/stats_history.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+
+// StatsHistoryIterator is the main interface for users to programmatically
+// access statistics snapshots that was automatically stored by RocksDB.
+// Depending on options, the stats can be in memory or on disk.
+// The stats snapshots are indexed by time that they were recorded, and each
+// stats snapshot contains individual stat name and value at the time of
+// recording.
+// Example:
+//   std::unique_ptr<StatsHistoryIterator> stats_iter;
+//   Status s = db->GetStatsHistory(0 /* start_time */,
+//                                  env->NowMicros() /* end_time*/,
+//                                  &stats_iter);
+//   if (s.ok()) {
+//     for (; stats_iter->Valid(); stats_iter->Next()) {
+//       uint64_t stats_time = stats_iter->GetStatsTime();
+//       const std::map<std::string, uint64_t>& stats_map =
+//           stats_iter->GetStatsMap();
+//       process(stats_time, stats_map);
+//     }
+//   }
+class StatsHistoryIterator {
+ public:
+  StatsHistoryIterator() {}
+  virtual ~StatsHistoryIterator() {}
+
+  virtual bool Valid() const = 0;
+
+  // Moves to the next stats history record. After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Return the time stamp (in seconds) when stats history is recorded.
+  // REQUIRES: Valid()
+  virtual uint64_t GetStatsTime() const = 0;
+
+  // DEPRECATED (was never used)
+  virtual int GetFormatVersion() const { return -1; }
+
+  // Return the current stats history as an std::map which specifies the
+  // mapping from stats name to stats value . The underlying storage
+  // for the returned map is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual const std::map<std::string, uint64_t>& GetStatsMap() const = 0;
+
+  // If an error has occurred, return it. Else return an ok status.
+  virtual Status status() const = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/status.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/status.h
new file mode 100644
index 0000000000..447c3b9fef
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/status.h
@@ -0,0 +1,574 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation. It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#pragma once
+
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+#include <memory>
+#include <string>
+
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+#include "port/stack_trace.h"
+#endif
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Status {
+ public:
+  // Create a success status.
+ Status() + : code_(kOk), + subcode_(kNone), + sev_(kNoError), + retryable_(false), + data_loss_(false), + scope_(0), + state_(nullptr) {} + ~Status() { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + if (!checked_) { + fprintf(stderr, "Failed to check Status %p\n", this); + port::PrintStack(); + std::abort(); + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } + + // Copy the specified status. + Status(const Status& s); + Status& operator=(const Status& s); + Status(Status&& s) noexcept; + Status& operator=(Status&& s) noexcept; + bool operator==(const Status& rhs) const; + bool operator!=(const Status& rhs) const; + + // In case of intentionally swallowing an error, user must explicitly call + // this function. That way we are easily able to search the code to find where + // error swallowing occurs. + inline void PermitUncheckedError() const { MarkChecked(); } + + inline void MustCheck() const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } + + enum Code : unsigned char { + kOk = 0, + kNotFound = 1, + kCorruption = 2, + kNotSupported = 3, + kInvalidArgument = 4, + kIOError = 5, + kMergeInProgress = 6, + kIncomplete = 7, + kShutdownInProgress = 8, + kTimedOut = 9, + kAborted = 10, + kBusy = 11, + kExpired = 12, + kTryAgain = 13, + kCompactionTooLarge = 14, + kColumnFamilyDropped = 15, + kMaxCode + }; + + Code code() const { + MarkChecked(); + return code_; + } + + enum SubCode : unsigned char { + kNone = 0, + kMutexTimeout = 1, + kLockTimeout = 2, + kLockLimit = 3, + kNoSpace = 4, + kDeadlock = 5, + kStaleFile = 6, + kMemoryLimit = 7, + kSpaceLimit = 8, + kPathNotFound = 9, + KMergeOperandsInsufficientCapacity = 10, + kManualCompactionPaused = 11, + kOverwritten = 12, + kTxnNotPrepared = 13, + kIOFenced = 14, + kMergeOperatorFailed = 15, + kMaxSubCode + }; + + SubCode subcode() const { + MarkChecked(); + return subcode_; + } + + enum Severity : unsigned char { + kNoError = 0, + kSoftError = 1, + kHardError = 2, + kFatalError = 3, + kUnrecoverableError = 4, + kMaxSeverity + }; + + Status(const Status& s, Severity sev); + + Status(Code _code, SubCode _subcode, Severity _sev, const Slice& msg) + : Status(_code, _subcode, msg, "", _sev) {} + + static Status CopyAppendMessage(const Status& s, const Slice& delim, + const Slice& msg); + + Severity severity() const { + MarkChecked(); + return sev_; + } + + // Returns a C style string indicating the message of the Status + const char* getState() const { + MarkChecked(); + return state_.get(); + } + + // Return a success status. + static Status OK() { return Status(); } + + // Successful, though an existing something was overwritten + // Note: using variants of OK status for program logic is discouraged, + // but it can be useful for communicating statistical information without + // changing public APIs. + static Status OkOverwritten() { return Status(kOk, kOverwritten); } + + // Return error status of an appropriate type. 
+ static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotFound, msg, msg2); + } + + // Fast path for not found without malloc; + static Status NotFound(SubCode msg = kNone) { return Status(kNotFound, msg); } + + static Status NotFound(SubCode sc, const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kNotFound, sc, msg, msg2); + } + + static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kCorruption, msg, msg2); + } + static Status Corruption(SubCode msg = kNone) { + return Status(kCorruption, msg); + } + + static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotSupported, msg, msg2); + } + static Status NotSupported(SubCode msg = kNone) { + return Status(kNotSupported, msg); + } + + static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, msg, msg2); + } + static Status InvalidArgument(SubCode msg = kNone) { + return Status(kInvalidArgument, msg); + } + + static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, msg, msg2); + } + static Status IOError(SubCode msg = kNone) { return Status(kIOError, msg); } + + static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kMergeInProgress, msg, msg2); + } + static Status MergeInProgress(SubCode msg = kNone) { + return Status(kMergeInProgress, msg); + } + + static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIncomplete, msg, msg2); + } + static Status Incomplete(SubCode msg = kNone) { + return Status(kIncomplete, msg); + } + + static Status ShutdownInProgress(SubCode msg = kNone) { + return Status(kShutdownInProgress, msg); + } + static Status ShutdownInProgress(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kShutdownInProgress, msg, msg2); + } + static Status Aborted(SubCode msg = kNone) { return Status(kAborted, msg); } + static Status Aborted(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kAborted, msg, msg2); + } + + static Status Busy(SubCode msg = kNone) { return Status(kBusy, msg); } + static Status Busy(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kBusy, msg, msg2); + } + + static Status TimedOut(SubCode msg = kNone) { return Status(kTimedOut, msg); } + static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kTimedOut, msg, msg2); + } + + static Status Expired(SubCode msg = kNone) { return Status(kExpired, msg); } + static Status Expired(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kExpired, msg, msg2); + } + + static Status TryAgain(SubCode msg = kNone) { return Status(kTryAgain, msg); } + static Status TryAgain(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kTryAgain, msg, msg2); + } + + static Status CompactionTooLarge(SubCode msg = kNone) { + return Status(kCompactionTooLarge, msg); + } + static Status CompactionTooLarge(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kCompactionTooLarge, msg, msg2); + } + + static Status ColumnFamilyDropped(SubCode msg = kNone) { + return Status(kColumnFamilyDropped, msg); + } + + static Status ColumnFamilyDropped(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kColumnFamilyDropped, msg, msg2); + } + + static Status NoSpace() { return Status(kIOError, kNoSpace); } + static Status NoSpace(const Slice& msg, const Slice& msg2 = Slice()) { + return 
Status(kIOError, kNoSpace, msg, msg2); + } + + static Status MemoryLimit() { return Status(kAborted, kMemoryLimit); } + static Status MemoryLimit(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kAborted, kMemoryLimit, msg, msg2); + } + + static Status SpaceLimit() { return Status(kIOError, kSpaceLimit); } + static Status SpaceLimit(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, kSpaceLimit, msg, msg2); + } + + static Status PathNotFound() { return Status(kIOError, kPathNotFound); } + static Status PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, kPathNotFound, msg, msg2); + } + + static Status TxnNotPrepared() { + return Status(kInvalidArgument, kTxnNotPrepared); + } + static Status TxnNotPrepared(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, kTxnNotPrepared, msg, msg2); + } + + // Returns true iff the status indicates success. + bool ok() const { + MarkChecked(); + return code() == kOk; + } + + // Returns true iff the status indicates success *with* something + // overwritten + bool IsOkOverwritten() const { + MarkChecked(); + return code() == kOk && subcode() == kOverwritten; + } + + // Returns true iff the status indicates a NotFound error. + bool IsNotFound() const { + MarkChecked(); + return code() == kNotFound; + } + + // Returns true iff the status indicates a Corruption error. + bool IsCorruption() const { + MarkChecked(); + return code() == kCorruption; + } + + // Returns true iff the status indicates a NotSupported error. + bool IsNotSupported() const { + MarkChecked(); + return code() == kNotSupported; + } + + // Returns true iff the status indicates an InvalidArgument error. + bool IsInvalidArgument() const { + MarkChecked(); + return code() == kInvalidArgument; + } + + // Returns true iff the status indicates an IOError. + bool IsIOError() const { + MarkChecked(); + return code() == kIOError; + } + + // Returns true iff the status indicates an MergeInProgress. + bool IsMergeInProgress() const { + MarkChecked(); + return code() == kMergeInProgress; + } + + // Returns true iff the status indicates Incomplete + bool IsIncomplete() const { + MarkChecked(); + return code() == kIncomplete; + } + + // Returns true iff the status indicates Shutdown In progress + bool IsShutdownInProgress() const { + MarkChecked(); + return code() == kShutdownInProgress; + } + + bool IsTimedOut() const { + MarkChecked(); + return code() == kTimedOut; + } + + bool IsAborted() const { + MarkChecked(); + return code() == kAborted; + } + + bool IsLockLimit() const { + MarkChecked(); + return code() == kAborted && subcode() == kLockLimit; + } + + // Returns true iff the status indicates that a resource is Busy and + // temporarily could not be acquired. + bool IsBusy() const { + MarkChecked(); + return code() == kBusy; + } + + bool IsDeadlock() const { + MarkChecked(); + return code() == kBusy && subcode() == kDeadlock; + } + + // Returns true iff the status indicated that the operation has Expired. + bool IsExpired() const { + MarkChecked(); + return code() == kExpired; + } + + // Returns true iff the status indicates a TryAgain error. + // This usually means that the operation failed, but may succeed if + // re-attempted. 
+ bool IsTryAgain() const { + MarkChecked(); + return code() == kTryAgain; + } + + // Returns true iff the status indicates the proposed compaction is too large + bool IsCompactionTooLarge() const { + MarkChecked(); + return code() == kCompactionTooLarge; + } + + // Returns true iff the status indicates Column Family Dropped + bool IsColumnFamilyDropped() const { + MarkChecked(); + return code() == kColumnFamilyDropped; + } + + // Returns true iff the status indicates a NoSpace error + // This is caused by an I/O error returning the specific "out of space" + // error condition. Stricto sensu, an NoSpace error is an I/O error + // with a specific subcode, enabling users to take the appropriate action + // if needed + bool IsNoSpace() const { + MarkChecked(); + return (code() == kIOError) && (subcode() == kNoSpace); + } + + // Returns true iff the status indicates a memory limit error. There may be + // cases where we limit the memory used in certain operations (eg. the size + // of a write batch) in order to avoid out of memory exceptions. + bool IsMemoryLimit() const { + MarkChecked(); + return (code() == kAborted) && (subcode() == kMemoryLimit); + } + + // Returns true iff the status indicates a PathNotFound error + // This is caused by an I/O error returning the specific "no such file or + // directory" error condition. A PathNotFound error is an I/O error with + // a specific subcode, enabling users to take appropriate action if necessary + bool IsPathNotFound() const { + MarkChecked(); + return (code() == kIOError || code() == kNotFound) && + (subcode() == kPathNotFound); + } + + // Returns true iff the status indicates manual compaction paused. This + // is caused by a call to PauseManualCompaction + bool IsManualCompactionPaused() const { + MarkChecked(); + return (code() == kIncomplete) && (subcode() == kManualCompactionPaused); + } + + // Returns true iff the status indicates a TxnNotPrepared error. + bool IsTxnNotPrepared() const { + MarkChecked(); + return (code() == kInvalidArgument) && (subcode() == kTxnNotPrepared); + } + + // Returns true iff the status indicates a IOFenced error. + bool IsIOFenced() const { + MarkChecked(); + return (code() == kIOError) && (subcode() == kIOFenced); + } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + std::string ToString() const; + + protected: + Code code_; + SubCode subcode_; + Severity sev_; + bool retryable_; + bool data_loss_; + unsigned char scope_; + // A nullptr state_ (which is at least the case for OK) means the extra + // message is empty. 
+  std::unique_ptr<const char[]> state_;
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+  mutable bool checked_ = false;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+
+  explicit Status(Code _code, SubCode _subcode = kNone)
+      : code_(_code),
+        subcode_(_subcode),
+        sev_(kNoError),
+        retryable_(false),
+        data_loss_(false),
+        scope_(0) {}
+
+  explicit Status(Code _code, SubCode _subcode, bool retryable, bool data_loss,
+                  unsigned char scope)
+      : code_(_code),
+        subcode_(_subcode),
+        sev_(kNoError),
+        retryable_(retryable),
+        data_loss_(data_loss),
+        scope_(scope) {}
+
+  Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2,
+         Severity sev = kNoError);
+  Status(Code _code, const Slice& msg, const Slice& msg2)
+      : Status(_code, kNone, msg, msg2) {}
+
+  static std::unique_ptr<const char[]> CopyState(const char* s);
+
+  inline void MarkChecked() const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+    checked_ = true;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  }
+};
+
+inline Status::Status(const Status& s)
+    : code_(s.code_),
+      subcode_(s.subcode_),
+      sev_(s.sev_),
+      retryable_(s.retryable_),
+      data_loss_(s.data_loss_),
+      scope_(s.scope_) {
+  s.MarkChecked();
+  state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+}
+inline Status::Status(const Status& s, Severity sev)
+    : code_(s.code_),
+      subcode_(s.subcode_),
+      sev_(sev),
+      retryable_(s.retryable_),
+      data_loss_(s.data_loss_),
+      scope_(s.scope_) {
+  s.MarkChecked();
+  state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+}
+inline Status& Status::operator=(const Status& s) {
+  if (this != &s) {
+    s.MarkChecked();
+    MustCheck();
+    code_ = s.code_;
+    subcode_ = s.subcode_;
+    sev_ = s.sev_;
+    retryable_ = s.retryable_;
+    data_loss_ = s.data_loss_;
+    scope_ = s.scope_;
+    state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+  }
+  return *this;
+}
+
+inline Status::Status(Status&& s) noexcept : Status() {
+  s.MarkChecked();
+  *this = std::move(s);
+}
+
+inline Status& Status::operator=(Status&& s) noexcept {
+  if (this != &s) {
+    s.MarkChecked();
+    MustCheck();
+    code_ = std::move(s.code_);
+    s.code_ = kOk;
+    subcode_ = std::move(s.subcode_);
+    s.subcode_ = kNone;
+    sev_ = std::move(s.sev_);
+    s.sev_ = kNoError;
+    retryable_ = std::move(s.retryable_);
+    s.retryable_ = false;
+    data_loss_ = std::move(s.data_loss_);
+    s.data_loss_ = false;
+    scope_ = std::move(s.scope_);
+    s.scope_ = 0;
+    state_ = std::move(s.state_);
+  }
+  return *this;
+}
+
+inline bool Status::operator==(const Status& rhs) const {
+  MarkChecked();
+  rhs.MarkChecked();
+  return (code_ == rhs.code_);
+}
+
+inline bool Status::operator!=(const Status& rhs) const {
+  MarkChecked();
+  rhs.MarkChecked();
+  return !(*this == rhs);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
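Given the Status API above, a small sketch of the intended error-handling discipline (illustration only, not part of the vendored sources), including the explicit opt-out that ROCKSDB_ASSERT_STATUS_CHECKED builds require; `DoWork` and `Caller` are hypothetical.

    #include <cstdio>
    #include "rocksdb/status.h"

    using namespace ROCKSDB_NAMESPACE;

    Status DoWork();  // hypothetical operation returning a Status

    void Caller() {
      Status s = DoWork();
      if (s.IsTryAgain()) {
        // Transient failure: the operation may succeed if re-attempted.
        s = DoWork();
      }
      if (!s.ok()) {
        // ToString() renders the code, subcode, and any attached message.
        fprintf(stderr, "rocksdb: %s\n", s.ToString().c_str());
      }
      // Intentionally swallowing an error must be explicit; under
      // ROCKSDB_ASSERT_STATUS_CHECKED an unchecked Status aborts in
      // its destructor.
      DoWork().PermitUncheckedError();
    }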
+
+#pragma once
+#include <stdint.h>
+
+#include <memory>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef GetCurrentTime
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+struct ConfigOptions;
+
+// A SystemClock is an interface used by the rocksdb implementation to access
+// operating system time-related functionality.
+class SystemClock : public Customizable {
+ public:
+  ~SystemClock() override {}
+
+  static const char* Type() { return "SystemClock"; }
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& value,
+                                 std::shared_ptr<SystemClock>* result);
+  // The name of this system clock
+  virtual const char* Name() const override = 0;
+
+  // The name/nickname for the Default SystemClock. This name can be used
+  // to determine if the clock is the default one.
+  static const char* kDefaultName() { return "DefaultClock"; }
+
+  // Return a default SystemClock suitable for the current operating
+  // system.
+  static const std::shared_ptr<SystemClock>& Default();
+
+  // Returns the number of micro-seconds since some fixed point in time.
+  // It is often used as system time such as in GenericRateLimiter
+  // and other places so a port needs to return system time in order to work.
+  virtual uint64_t NowMicros() = 0;
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros.
+  // In platform-specific implementations, NowNanos() should return time points
+  // that are MONOTONIC.
+  virtual uint64_t NowNanos() { return NowMicros() * 1000; }
+
+  // Returns the number of micro-seconds of CPU time used by the current
+  // thread. 0 indicates not supported.
+  virtual uint64_t CPUMicros() { return 0; }
+
+  // Returns the number of nano-seconds of CPU time used by the current
+  // thread. Default implementation simply relies on CPUMicros.
+  // 0 indicates not supported.
+  virtual uint64_t CPUNanos() { return CPUMicros() * 1000; }
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  virtual void SleepForMicroseconds(int micros) = 0;
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  // Only overwrites *unix_time on success.
+  virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  virtual std::string TimeToString(uint64_t time) = 0;
+};
+
+// Wrapper class for a SystemClock. Redirects all methods (except Name)
+// of the SystemClock interface to the target/wrapped class.
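+//
+// For example (an illustrative sketch; `OffsetClock` is hypothetical and not
+// part of this header): a wrapper that shifts the wrapped clock forward by a
+// fixed amount could be written as
+//
+//   class OffsetClock : public SystemClockWrapper {
+//    public:
+//     OffsetClock(const std::shared_ptr<SystemClock>& base,
+//                 uint64_t offset_us)
+//         : SystemClockWrapper(base), offset_us_(offset_us) {}
+//     const char* Name() const override { return "OffsetClock"; }
+//     uint64_t NowMicros() override {
+//       // Delegate to the wrapped clock, then apply the fixed shift.
+//       return SystemClockWrapper::NowMicros() + offset_us_;
+//     }
+//
+//    private:
+//     uint64_t offset_us_;  // fixed shift applied to NowMicros()
+//   };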
+class SystemClockWrapper : public SystemClock {
+ public:
+  explicit SystemClockWrapper(const std::shared_ptr<SystemClock>& t);
+
+  uint64_t NowMicros() override { return target_->NowMicros(); }
+
+  uint64_t NowNanos() override { return target_->NowNanos(); }
+
+  uint64_t CPUMicros() override { return target_->CPUMicros(); }
+
+  uint64_t CPUNanos() override { return target_->CPUNanos(); }
+
+  virtual void SleepForMicroseconds(int micros) override {
+    return target_->SleepForMicroseconds(micros);
+  }
+
+  Status GetCurrentTime(int64_t* unix_time) override {
+    return target_->GetCurrentTime(unix_time);
+  }
+
+  std::string TimeToString(uint64_t time) override {
+    return target_->TimeToString(time);
+  }
+
+  Status PrepareOptions(const ConfigOptions& options) override;
+  std::string SerializeOptions(const ConfigOptions& config_options,
+                               const std::string& header) const override;
+  const Customizable* Inner() const override { return target_.get(); }
+
+ protected:
+  std::shared_ptr<SystemClock> target_;
+};
+
+}  // end namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/table.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/table.h
new file mode 100644
index 0000000000..6e8f60577a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/table.h
@@ -0,0 +1,927 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Currently we support two types of tables: plain table and block-based table.
+// 1. Block-based table: this is the default table type that we inherited from
+//    LevelDB, which was designed for storing data in hard disk or flash
+//    device.
+// 2. Plain table: it is one of RocksDB's SST file formats optimized
+//    for low query latency on pure-memory or really low-latency media.
+//
+// A tutorial of rocksdb table formats is available here:
+//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
+//
+// Example code is also available
+//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// -- Block-based Table
+class Cache;
+class FilterPolicy;
+class FlushBlockPolicyFactory;
+class PersistentCache;
+class RandomAccessFile;
+struct TableReaderOptions;
+struct TableBuilderOptions;
+class TableBuilder;
+class TableFactory;
+class TableReader;
+class WritableFileWriter;
+struct ConfigOptions;
+struct EnvOptions;
+
+// Types of checksums to use for checking integrity of logical blocks within
+// files. All checksums currently use 32 bits of checking power (1 in 4B
+// chance of failing to detect random corruption).
+enum ChecksumType : char {
+  kNoChecksum = 0x0,
+  kCRC32c = 0x1,
+  kxxHash = 0x2,
+  kxxHash64 = 0x3,
+  kXXH3 = 0x4,  // Supported since RocksDB 6.27
+};
+
+// `PinningTier` is used to specify which tier of block-based tables should
+// be affected by a block cache pinning setting (see
+// `MetadataCacheOptions` below).
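+//
+// For example (an illustrative configuration sketch; `table_options` and
+// `options` are assumed local variables, not part of this header):
+//
+//   BlockBasedTableOptions table_options;
+//   table_options.cache_index_and_filter_blocks = true;
+//   // Keep all index/filter partitions of all files pinned in block cache.
+//   table_options.metadata_cache_options.partition_pinning =
+//       PinningTier::kAll;
+//   Options options;
+//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));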
+enum class PinningTier {
+  // For compatibility, this value specifies to fallback to the behavior
+  // indicated by the deprecated options,
+  // `pin_l0_filter_and_index_blocks_in_cache` and
+  // `pin_top_level_index_and_filter`.
+  kFallback,
+
+  // This tier contains no block-based tables.
+  kNone,
+
+  // This tier contains block-based tables that may have originated from a
+  // memtable flush. In particular, it includes tables from L0 that are smaller
+  // than 1.5 times the current `write_buffer_size`. Note these criteria imply
+  // it can include intra-L0 compaction outputs and ingested files, as long as
+  // they are not abnormally large compared to flushed files in L0.
+  kFlushedAndSimilar,
+
+  // This tier contains all block-based tables.
+  kAll,
+};
+
+// `MetadataCacheOptions` contains members indicating the desired caching
+// behavior for the different categories of metadata blocks.
+struct MetadataCacheOptions {
+  // The tier of block-based tables whose top-level index into metadata
+  // partitions will be pinned. Currently indexes and filters may be
+  // partitioned.
+  //
+  // Note `cache_index_and_filter_blocks` must be true for this option to have
+  // any effect. Otherwise any top-level index into metadata partitions would
+  // be held in table reader memory, outside the block cache.
+  PinningTier top_level_index_pinning = PinningTier::kFallback;
+
+  // The tier of block-based tables whose metadata partitions will be pinned.
+  // Currently indexes and filters may be partitioned.
+  PinningTier partition_pinning = PinningTier::kFallback;
+
+  // The tier of block-based tables whose unpartitioned metadata blocks will be
+  // pinned.
+  //
+  // Note `cache_index_and_filter_blocks` must be true for this option to have
+  // any effect. Otherwise the unpartitioned meta-blocks would be held in table
+  // reader memory, outside the block cache.
+  PinningTier unpartitioned_pinning = PinningTier::kFallback;
+};
+
+struct CacheEntryRoleOptions {
+  enum class Decision {
+    kEnabled,
+    kDisabled,
+    kFallback,
+  };
+  Decision charged = Decision::kFallback;
+  bool operator==(const CacheEntryRoleOptions& other) const {
+    return charged == other.charged;
+  }
+};
+
+struct CacheUsageOptions {
+  CacheEntryRoleOptions options;
+  std::map<CacheEntryRole, CacheEntryRoleOptions> options_overrides;
+};
+
+// For advanced user only
+struct BlockBasedTableOptions {
+  static const char* kName() { return "BlockTableOptions"; };
+  // @flush_block_policy_factory creates the instances of flush block policy,
+  // which provides a configurable way to determine when to flush a block in
+  // the block based tables. If not set, table builder will use the default
+  // block flush policy, which cuts blocks by block size (please refer to
+  // `FlushBlockBySizePolicy`).
+  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
+
+  // TODO(kailiu) Temporarily disable this feature by making the default value
+  // to be false.
+  //
+  // TODO(ajkr) we need to update names of variables controlling meta-block
+  // caching as they should now apply to range tombstone and compression
+  // dictionary meta-blocks, in addition to index and filter meta-blocks.
+  //
+  // Whether to put index/filter blocks in the block cache. When false,
+  // each "table reader" object will pre-load index/filter blocks during
+  // table initialization. Index and filter partition blocks always use
+  // block cache regardless of this option.
+  bool cache_index_and_filter_blocks = false;
+
+  // If cache_index_and_filter_blocks is enabled, cache index and filter
+  // blocks with high priority.
If set to true, depending on implementation of + // block cache, index, filter, and other metadata blocks may be less likely + // to be evicted than data blocks. + bool cache_index_and_filter_blocks_with_high_priority = true; + + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating each of the following variables that + // has the default value, `PinningTier::kFallback`: + // + // - `MetadataCacheOptions::partition_pinning` + // - `MetadataCacheOptions::unpartitioned_pinning` + // + // The updated value is chosen as follows: + // + // - `pin_l0_filter_and_index_blocks_in_cache == false` -> + // `PinningTier::kNone` + // - `pin_l0_filter_and_index_blocks_in_cache == true` -> + // `PinningTier::kFlushedAndSimilar` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. + // + // if cache_index_and_filter_blocks is true and the below is true, then + // filter and index blocks are stored in the cache, but a reference is + // held in the "table reader" object so the blocks are pinned and only + // evicted from cache when the table reader is freed. + bool pin_l0_filter_and_index_blocks_in_cache = false; + + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating + // `MetadataCacheOptions::top_level_index_pinning` when it has the + // default value, `PinningTier::kFallback`. + // + // The updated value is chosen as follows: + // + // - `pin_top_level_index_and_filter == false` -> + // `PinningTier::kNone` + // - `pin_top_level_index_and_filter == true` -> + // `PinningTier::kAll` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. + // + // If cache_index_and_filter_blocks is true and the below is true, then + // the top-level index of partitioned filter and index blocks are stored in + // the cache, but a reference is held in the "table reader" object so the + // blocks are pinned and only evicted from cache when the table reader is + // freed. This is not limited to l0 in LSM tree. + bool pin_top_level_index_and_filter = true; + + // The desired block cache pinning behavior for the different categories of + // metadata blocks. While pinning can reduce block cache contention, users + // must take care not to pin excessive amounts of data, which risks + // overflowing block cache. + MetadataCacheOptions metadata_cache_options; + + // The index type that will be used for this table. + enum IndexType : char { + // A space efficient index block that is optimized for + // binary-search-based index. + kBinarySearch = 0x00, + + // The hash index, if enabled, will do the hash lookup when + // `Options.prefix_extractor` is provided. + kHashSearch = 0x01, + + // A two-level index implementation. Both levels are binary search indexes. + // Second level index blocks ("partitions") use block cache even when + // cache_index_and_filter_blocks=false. + kTwoLevelIndexSearch = 0x02, + + // Like kBinarySearch, but index also contains first key of each block. + // This allows iterators to defer reading the block until it's actually + // needed. May significantly reduce read amplification of short range scans. + // Without it, iterator seek usually reads one block from each level-0 file + // and from each level, which may be expensive. 
+    // Works best in combination with:
+    //  - IndexShorteningMode::kNoShortening,
+    //  - custom FlushBlockPolicy to cut blocks at some meaningful boundaries,
+    //    e.g. when prefix changes.
+    // Makes the index significantly bigger (2x or more), especially when keys
+    // are long.
+    kBinarySearchWithFirstKey = 0x03,
+  };
+
+  IndexType index_type = kBinarySearch;
+
+  // The index type that will be used for the data block.
+  enum DataBlockIndexType : char {
+    kDataBlockBinarySearch = 0,   // traditional block type
+    kDataBlockBinaryAndHash = 1,  // additional hash index
+  };
+
+  DataBlockIndexType data_block_index_type = kDataBlockBinarySearch;
+
+  // #entries/#buckets. It is valid only when data_block_hash_index_type is
+  // kDataBlockBinaryAndHash.
+  double data_block_hash_table_util_ratio = 0.75;
+
+  // Option hash_index_allow_collision is now deleted.
+  // It will behave as if hash_index_allow_collision=true.
+
+  // Use the specified checksum type. Newly created table files will be
+  // protected with this checksum type. Old table files will still be readable,
+  // even though they have different checksum type.
+  ChecksumType checksum = kXXH3;
+
+  // Disable block cache. If this is set to true,
+  // then no block cache should be used, and the block_cache should
+  // point to a nullptr object.
+  bool no_block_cache = false;
+
+  // If non-NULL use the specified cache for blocks.
+  // If NULL, rocksdb will automatically create and use a 32MB internal cache.
+  std::shared_ptr<Cache> block_cache = nullptr;
+
+  // If non-NULL use the specified cache for pages read from device.
+  // If NULL, no page cache is used.
+  std::shared_ptr<PersistentCache> persistent_cache = nullptr;
+
+  // Approximate size of user data packed per block. Note that the
+  // block size specified here corresponds to uncompressed data. The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled. This parameter can be changed dynamically.
+  uint64_t block_size = 4 * 1024;
+
+  // This is used to close a block before it reaches the configured
+  // 'block_size'. If the percentage of free space in the current block is less
+  // than this specified number and adding a new record to the block will
+  // exceed the configured block size, then this block will be closed and the
+  // new record will be written to the next block.
+  int block_size_deviation = 10;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically. Most clients should
+  // leave this parameter alone. The minimum value allowed is 1. Any smaller
+  // value will be silently overwritten with 1.
+  int block_restart_interval = 16;
+
+  // Same as block_restart_interval but used for the index block.
+  int index_block_restart_interval = 1;
+
+  // Block size for partitioned metadata. Currently applied to indexes when
+  // kTwoLevelIndexSearch is used and to filters when partition_filters is
+  // used. Note: Since in the current implementation the filters and index
+  // partitions are aligned, an index/filter block is created when either index
+  // or filter block size reaches the specified limit.
+  // Note: this limit is currently applied to only index blocks; a filter
+  // partition is cut right after an index block is cut
+  // TODO(myabandeh): remove the note above when filter partitions are cut
+  // separately
+  uint64_t metadata_block_size = 4096;
+
+  // `cache_usage_options` allows users to specify the default
+  // options (`cache_usage_options.options`) and the overriding
+  // options (`cache_usage_options.options_overrides`)
+  // for different `CacheEntryRole` under various features related to cache
+  // usage.
+  //
+  // For a certain `CacheEntryRole role` and a certain feature `f` of
+  // `CacheEntryRoleOptions`:
+  // 1. If `options_overrides` has an entry for `role` and
+  //    `options_overrides[role].f != kFallback`, we use
+  //    `options_overrides[role].f`
+  // 2. Otherwise, if `options[role].f != kFallback`, we use `options[role].f`
+  // 3. Otherwise, we follow the compatible existing behavior for `f` (see
+  //    each feature's comment for more)
+  //
+  // `cache_usage_options` currently supports specifying options for the
+  // following features:
+  //
+  // 1. Memory charging to block cache (`CacheEntryRoleOptions::charged`)
+  // Memory charging is a feature of accounting memory usage of a specific area
+  // (represented by `CacheEntryRole`) toward usage in block cache (if
+  // available), by updating a dynamic charge to the block cache loosely based
+  // on the actual memory usage of that area.
+  //
+  // (a) CacheEntryRole::kCompressionDictionaryBuildingBuffer
+  // (i) If kEnabled:
+  // Charge memory usage of the buffered data used as training samples for
+  // dictionary compression.
+  // If such memory usage exceeds the available space left in the block cache
+  // at some point (i.e., causing a cache full under
+  // `LRUCacheOptions::strict_capacity_limit` = true), the data will then be
+  // unbuffered.
+  // (ii) If kDisabled:
+  // Does not charge the memory usage mentioned above.
+  // (iii) Compatible existing behavior:
+  // Same as kEnabled.
+  //
+  // (b) CacheEntryRole::kFilterConstruction
+  // (i) If kEnabled:
+  // Charge memory usage of Bloom Filter
+  // (format_version >= 5) and Ribbon Filter construction.
+  // If additional temporary memory of Ribbon Filter exceeds the available
+  // space left in the block cache at some point (i.e., causing a cache full
+  // under `LRUCacheOptions::strict_capacity_limit` = true),
+  // construction will fall back to Bloom Filter.
+  // (ii) If kDisabled:
+  // Does not charge the memory usage mentioned above.
+  // (iii) Compatible existing behavior:
+  // Same as kDisabled.
+  //
+  // (c) CacheEntryRole::kBlockBasedTableReader
+  // (i) If kEnabled:
+  // Charge memory usage of table properties +
+  // index block/filter block/uncompression dictionary (when stored in table
+  // reader i.e., BlockBasedTableOptions::cache_index_and_filter_blocks ==
+  // false) + some internal data structures during table reader creation.
+  // If such a table reader exceeds
+  // the available space left in the block cache at some point (i.e., causing
+  // a cache full under `LRUCacheOptions::strict_capacity_limit` = true),
+  // creation will fail with Status::MemoryLimit().
+  // (ii) If kDisabled:
+  // Does not charge the memory usage mentioned above.
+  // (iii) Compatible existing behavior:
+  // Same as kDisabled.
+  //
+  // (d) CacheEntryRole::kFileMetadata
+  // (i) If kEnabled:
+  // Charge memory usage of file metadata. RocksDB holds one file metadata
+  // structure in-memory per on-disk table file.
+  // If such file metadata's
+  // memory exceeds the available space left in the block cache at some point
+  // (i.e., causing a cache full under `LRUCacheOptions::strict_capacity_limit`
+  // = true), creation will fail with Status::MemoryLimit().
+  // (ii) If kDisabled:
+  // Does not charge the memory usage mentioned above.
+  // (iii) Compatible existing behavior:
+  // Same as kDisabled.
+  //
+  // (e) Other CacheEntryRole
+  // Not supported.
+  // `Status::kNotSupported` will be returned if
+  // `CacheEntryRoleOptions::charged` is set to {`kEnabled`, `kDisabled`}.
+  //
+  //
+  // 2. More to come ...
+  //
+  CacheUsageOptions cache_usage_options;
+
+  // Note: currently this option requires kTwoLevelIndexSearch to be set as
+  // well.
+  // TODO(myabandeh): remove the note above once the limitation is lifted
+  // Use partitioned full filters for each SST file. This option is
+  // incompatible with block-based filters. Filter partition blocks use
+  // block cache even when cache_index_and_filter_blocks=false.
+  bool partition_filters = false;
+
+  // Option to generate Bloom/Ribbon filters that minimize memory
+  // internal fragmentation.
+  //
+  // When false, malloc_usable_size is not available, or format_version < 5,
+  // filters are generated without regard to internal fragmentation when
+  // loaded into memory (historical behavior). When true (and
+  // malloc_usable_size is available and format_version >= 5), then
+  // filters are generated to "round up" and "round down" their sizes to
+  // minimize internal fragmentation when loaded into memory, assuming the
+  // reading DB has the same memory allocation characteristics as the
+  // generating DB. This option does not break forward or backward
+  // compatibility.
+  //
+  // While individual filters will vary in bits/key and false positive rate
+  // when setting is true, the implementation attempts to maintain a weighted
+  // average FP rate for filters consistent with this option set to false.
+  //
+  // With Jemalloc for example, this setting is expected to save about 10% of
+  // the memory footprint and block cache charge of filters, while increasing
+  // disk usage of filters by about 1-2% due to encoding efficiency losses
+  // with variance in bits/key.
+  //
+  // NOTE: Because some memory counted by block cache might be unmapped pages
+  // within internal fragmentation, this option can increase observed RSS
+  // memory usage. With cache_index_and_filter_blocks=true, this option makes
+  // the block cache better at using the space it is allowed. (These issues
+  // should not arise with partitioned filters.)
+  //
+  // NOTE: Do not set to true if you do not trust malloc_usable_size. With
+  // this option, RocksDB might access an allocated memory object beyond its
+  // original size if malloc_usable_size says it is safe to do so. While this
+  // can be considered bad practice, it should not produce undefined behavior
+  // unless malloc_usable_size is buggy or broken.
+  bool optimize_filters_for_memory = false;
+
+  // Use delta encoding to compress keys in blocks.
+  // ReadOptions::pin_data requires this option to be disabled.
+  //
+  // Default: true
+  bool use_delta_encoding = true;
+
+  // If non-nullptr, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
+  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+
+  // If true, place whole keys in the filter (not just prefixes).
+  // This must generally be true for gets to be efficient.
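+  // For example (an illustrative sketch; `t` and `options` are assumed local
+  // variables): combining a Bloom filter with prefix-only filtering disables
+  // whole-key probes, so point lookups can only be served by the filter when
+  // the sought key's prefix is absent:
+  //
+  //   BlockBasedTableOptions t;
+  //   t.filter_policy.reset(NewBloomFilterPolicy(10));
+  //   t.whole_key_filtering = false;  // filter stores prefixes only
+  //   options.prefix_extractor.reset(NewFixedPrefixTransform(4));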
+ bool whole_key_filtering = true; + + // If true, detect corruption during Bloom Filter (format_version >= 5) + // and Ribbon Filter construction. + // + // This is an extra check that is only + // useful in detecting software bugs or CPU+memory malfunction. + // Turning on this feature increases filter construction time by 30%. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{detect_filter_construct_corruption=true;}"}}); + // + // TODO: optimize this performance + bool detect_filter_construct_corruption = false; + + // Verify that decompressing the compressed block gives back the input. This + // is a verification mode that we use to detect bugs in compression + // algorithms. + bool verify_compression = false; + + // If used, For every data block we load into memory, we will create a bitmap + // of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap + // will be used to figure out the percentage we actually read of the blocks. + // + // When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and + // Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the + // read amplification using this formula + // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES) + // + // value => memory usage (percentage of loaded blocks memory) + // 1 => 12.50 % + // 2 => 06.25 % + // 4 => 03.12 % + // 8 => 01.56 % + // 16 => 00.78 % + // + // Note: This number must be a power of 2, if not it will be sanitized + // to be the next lowest power of 2, for example a value of 7 will be + // treated as 4, a value of 19 will be treated as 16. + // + // Default: 0 (disabled) + uint32_t read_amp_bytes_per_bit = 0; + + // We currently have these versions: + // 0 -- This version can be read by really old RocksDB's. Doesn't support + // changing checksum type (default is CRC32). + // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default + // checksum, like xxHash. It is written by RocksDB when + // BlockBasedTableOptions::checksum is something other than kCRC32c. (version + // 0 is silently upconverted) + // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we + // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you + // don't plan to run RocksDB before version 3.10, you should probably use + // this. + // 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we + // encode the keys in index blocks. If you don't plan to run RocksDB before + // version 5.15, you should probably use this. + // This option only affects newly written tables. When reading existing + // tables, the information about version is read from the footer. + // 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we + // encode the values in index blocks. If you don't plan to run RocksDB before + // version 5.16 and you are using index_block_restart_interval > 1, you should + // probably use this as it would reduce the index size. + // This option only affects newly written tables. When reading existing + // tables, the information about version is read from the footer. + // 5 -- Can be read by RocksDB's versions since 6.6.0. Full and partitioned + // filters use a generally faster and more accurate Bloom filter + // implementation, with a different schema. + uint32_t format_version = 5; + + // Store index blocks on disk in compressed format. 
Changing this option to + // false will avoid the overhead of decompression if index blocks are evicted + // and read back + bool enable_index_compression = true; + + // Align data blocks on lesser of page size and block size + bool block_align = false; + + // This enum allows trading off increased index size for improved iterator + // seek performance in some situations, particularly when block cache is + // disabled (ReadOptions::fill_cache = false) and direct IO is + // enabled (DBOptions::use_direct_reads = true). + // The default mode is the best tradeoff for most use cases. + // This option only affects newly written tables. + // + // The index contains a key separating each pair of consecutive blocks. + // Let A be the highest key in one block, B the lowest key in the next block, + // and I the index entry separating these two blocks: + // [ ... A] I [B ...] + // I is allowed to be anywhere in [A, B). + // If an iterator is seeked to a key in (A, I], we'll unnecessarily read the + // first block, then immediately fall through to the second block. + // However, if I=A, this can't happen, and we'll read only the second block. + // In kNoShortening mode, we use I=A. In other modes, we use the shortest + // key in [A, B), which usually significantly reduces index size. + // + // There's a similar story for the last index entry, which is an upper bound + // of the highest key in the file. If it's shortened and therefore + // overestimated, iterator is likely to unnecessarily read the last data block + // from each file on each seek. + enum class IndexShorteningMode : char { + // Use full keys. + kNoShortening, + // Shorten index keys between blocks, but use full key for the last index + // key, which is the upper bound of the whole file. + kShortenSeparators, + // Shorten both keys between blocks and key after last block. + kShortenSeparatorsAndSuccessor, + }; + + IndexShorteningMode index_shortening = + IndexShorteningMode::kShortenSeparators; + + // RocksDB does auto-readahead for iterators on noticing more than two reads + // for a table file if user doesn't provide readahead_size. The readahead + // starts at BlockBasedTableOptions.initial_auto_readahead_size (default: 8KB) + // and doubles on every additional read upto max_auto_readahead_size and + // max_auto_readahead_size can be configured. + // + // Special Value: 0 - If max_auto_readahead_size is set 0 then it will disable + // the implicit auto prefetching. + // If max_auto_readahead_size provided is less + // than initial_auto_readahead_size, then RocksDB will sanitize the + // initial_auto_readahead_size and set it to max_auto_readahead_size. + // + // Value should be provided along with KB i.e. 256 * 1024 as it will prefetch + // the blocks. + // + // Found that 256 KB readahead size provides the best performance, based on + // experiments, for auto readahead. Experiment data is in PR #3282. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{max_auto_readahead_size=0;}"}})); + // + // Changing the value dynamically will only affect files opened after the + // change. + // + // Default: 256 KB (256 * 1024). + size_t max_auto_readahead_size = 256 * 1024; + + // If enabled, prepopulate warm/hot blocks (data, uncompressed dict, index and + // filter blocks) which are already in memory into block cache at the time of + // flush. On a flush, the block that is in memory (in memtables) get flushed + // to the device. 
If using Direct IO, additional IO is incurred to read this
+  // data back into memory again, which is avoided by enabling this option.
+  // This further helps if the workload exhibits high temporal locality, where
+  // most of the reads go to recently written data. This also helps in case of
+  // a Distributed FileSystem.
+  //
+  // This parameter can be changed dynamically by
+  //     DB::SetOptions({{"block_based_table_factory",
+  //                      "{prepopulate_block_cache=kFlushOnly;}"}}));
+  enum class PrepopulateBlockCache : char {
+    // Disable prepopulate block cache.
+    kDisable,
+    // Prepopulate blocks during flush only.
+    kFlushOnly,
+  };
+
+  PrepopulateBlockCache prepopulate_block_cache =
+      PrepopulateBlockCache::kDisable;
+
+  // RocksDB does auto-readahead for iterators on noticing more than two reads
+  // for a table file if user doesn't provide readahead_size. The readahead
+  // size starts at initial_auto_readahead_size and doubles on every additional
+  // read up to BlockBasedTableOptions.max_auto_readahead_size.
+  // max_auto_readahead_size can also be configured.
+  //
+  // Scenarios:
+  // - If initial_auto_readahead_size is set to 0, it will disable the implicit
+  //   auto prefetching irrespective of max_auto_readahead_size.
+  // - If max_auto_readahead_size is set to 0, it will disable the internal
+  //   prefetching irrespective of initial_auto_readahead_size.
+  // - If initial_auto_readahead_size > max_auto_readahead_size, then RocksDB
+  //   will sanitize the value of initial_auto_readahead_size to
+  //   max_auto_readahead_size and readahead_size will be
+  //   max_auto_readahead_size.
+  //
+  // The value should be provided in bytes, e.g. 8 * 1024, as it is used to
+  // prefetch the blocks.
+  //
+  // This parameter can be changed dynamically by
+  //     DB::SetOptions({{"block_based_table_factory",
+  //                      "{initial_auto_readahead_size=0;}"}}));
+  //
+  // Changing the value dynamically will only affect files opened after the
+  // change.
+  //
+  // Default: 8 KB (8 * 1024).
+  size_t initial_auto_readahead_size = 8 * 1024;
+
+  // RocksDB does auto-readahead for iterators on noticing more than two reads
+  // for a table file if user doesn't provide readahead_size and reads are
+  // sequential.
+  // num_file_reads_for_auto_readahead indicates after how many
+  // sequential reads internal auto prefetching should start.
+  //
+  // For example, if the value is 2, then after reading 2 sequential data
+  // blocks, prefetching will start on the third data block.
+  // If set to 0, prefetching starts from the first read.
+  //
+  // This parameter can be changed dynamically by
+  //     DB::SetOptions({{"block_based_table_factory",
+  //                      "{num_file_reads_for_auto_readahead=0;}"}}));
+  //
+  // Changing the value dynamically will only affect files opened after the
+  // change.
+  //
+  // Default: 2
+  uint64_t num_file_reads_for_auto_readahead = 2;
+};
+
+// Table Properties that are specific to block-based table properties.
+struct BlockBasedTablePropertyNames {
+  // value of this property is a fixed int32 number.
+  static const std::string kIndexType;
+  // value is "1" for true and "0" for false.
+  static const std::string kWholeKeyFiltering;
+  // value is "1" for true and "0" for false.
+  static const std::string kPrefixFiltering;
+};
+
+// Create default block based table factory.
+extern TableFactory* NewBlockBasedTableFactory(
+    const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
+
+enum EncodingType : char {
+  // Always write full keys without any special encoding.
+  kPlain,
+  // Find opportunity to write the same prefix once for multiple rows.
+  // In some cases, when a key follows a previous key with the same prefix,
+  // instead of writing out the full key, it just writes out the size of the
+  // shared prefix, as well as other bytes, to save some bytes.
+  //
+  // When using this option, the user is required to use the same prefix
+  // extractor to make sure the same prefix will be extracted from the same
+  // key. The Name() value of the prefix extractor will be stored in the file.
+  // When reopening the file, the name of the options.prefix_extractor given
+  // will be bitwise compared to the prefix extractors stored in the file. An
+  // error will be returned if the two don't match.
+  kPrefix,
+};
+
+// Table Properties that are specific to plain table properties.
+struct PlainTablePropertyNames {
+  static const std::string kEncodingType;
+  static const std::string kBloomVersion;
+  static const std::string kNumBloomBlocks;
+};
+
+const uint32_t kPlainTableVariableLength = 0;
+
+struct PlainTableOptions {
+  static const char* kName() { return "PlainTableOptions"; };
+  // @user_key_len: plain table has optimization for fixed-size keys, which can
+  //                be specified via user_key_len. Alternatively, you can pass
+  //                `kPlainTableVariableLength` if your keys have variable
+  //                lengths.
+  uint32_t user_key_len = kPlainTableVariableLength;
+
+  // @bloom_bits_per_key: the number of bits used for the bloom filter per
+  //                      prefix. You may disable it by passing a zero.
+  int bloom_bits_per_key = 10;
+
+  // @hash_table_ratio: the desired utilization of the hash table used for
+  //                    prefix hashing.
+  //                    hash_table_ratio = number of prefixes / #buckets in the
+  //                    hash table
+  double hash_table_ratio = 0.75;
+
+  // @index_sparseness: inside each prefix, an index record is built for every
+  //                    index_sparseness keys, for binary search inside each
+  //                    hash bucket.
+  //                    For encoding type kPrefix, the value will be used when
+  //                    writing to determine an interval to rewrite the full
+  //                    key. It will also be used as a suggestion and satisfied
+  //                    when possible.
+  size_t index_sparseness = 16;
+
+  // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
+  //                      Otherwise from huge page TLB. The user needs to
+  //                      reserve huge pages for it to be allocated, like:
+  //                          sysctl -w vm.nr_hugepages=20
+  //                      See linux doc Documentation/vm/hugetlbpage.txt
+  size_t huge_page_tlb_size = 0;
+
+  // @encoding_type: how to encode the keys. See enum EncodingType above for
+  //                 the choices. The value will determine how to encode keys
+  //                 when writing to a new SST file. This value will be stored
+  //                 inside the SST file which will be used when reading from
+  //                 the file, which makes it possible for users to choose
+  //                 different encoding type when reopening a DB. Files with
+  //                 different encoding types can co-exist in the same DB and
+  //                 can be read.
+  EncodingType encoding_type = kPlain;
+
+  // @full_scan_mode: mode for reading the whole file one record at a time
+  //                  without using the index.
+  bool full_scan_mode = false;
+
+  // @store_index_in_file: compute plain table index and bloom filter during
+  //                       file building and store it in file. When reading
+  //                       file, index will be mapped instead of recomputation.
+  bool store_index_in_file = false;
+};
+
+// -- Plain Table with prefix-only seek
+// For this factory, you need to set Options.prefix_extractor properly to make
+// it work. Look-up starts with a prefix hash lookup for the key prefix.
+// Inside the hash bucket found, a binary search is executed for hash
+// conflicts. Finally, a linear search is used.
+
+extern TableFactory* NewPlainTableFactory(
+    const PlainTableOptions& options = PlainTableOptions());
+
+struct CuckooTablePropertyNames {
+  // The key that is used to fill empty buckets.
+  static const std::string kEmptyKey;
+  // Fixed length of value.
+  static const std::string kValueLength;
+  // Number of hash functions used in Cuckoo Hash.
+  static const std::string kNumHashFunc;
+  // It denotes the number of buckets in a Cuckoo Block. Given a key and a
+  // particular hash function, a Cuckoo Block is a set of consecutive buckets,
+  // where starting bucket id is given by the hash function on the key. In case
+  // of a collision during inserting the key, the builder tries to insert the
+  // key in other locations of the cuckoo block before using the next hash
+  // function. This reduces cache miss during read operation in case of
+  // collision.
+  static const std::string kCuckooBlockSize;
+  // Size of the hash table. Use this number to compute the modulo of hash
+  // function. The actual number of buckets will be kMaxHashTableSize +
+  // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to
+  // accommodate the Cuckoo Block from end of hash table, due to cache friendly
+  // implementation.
+  static const std::string kHashTableSize;
+  // Denotes whether the keys sorted in the file are Internal Keys (if false)
+  // or User Keys only (if true).
+  static const std::string kIsLastLevel;
+  // Indicate if using identity function for the first hash function.
+  static const std::string kIdentityAsFirstHash;
+  // Indicate whether modulo or bitwise AND is used to calculate the hash value
+  static const std::string kUseModuleHash;
+  // Fixed user key length
+  static const std::string kUserKeyLength;
+};
+
+struct CuckooTableOptions {
+  static const char* kName() { return "CuckooTableOptions"; };
+
+  // Determines the utilization of hash tables. Smaller values
+  // result in larger hash tables with fewer collisions.
+  double hash_table_ratio = 0.9;
+  // A property used by builder to determine the depth to go to
+  // to search for a path to displace elements in case of
+  // collision. See Builder.MakeSpaceForKey method. Higher
+  // values result in more efficient hash tables with fewer
+  // lookups but take more time to build.
+  uint32_t max_search_depth = 100;
+  // In case of collision while inserting, the builder
+  // attempts to insert in the next cuckoo_block_size
+  // locations before skipping over to the next Cuckoo hash
+  // function. This makes lookups more cache friendly in case
+  // of collisions.
+  uint32_t cuckoo_block_size = 5;
+  // If this option is enabled, user key is treated as uint64_t and its value
+  // is used as hash value directly. This option changes builder's behavior.
+  // Readers ignore this option and behave according to what is specified in
+  // the table property.
+  bool identity_as_first_hash = false;
+  // If this option is set to true, modulo is used during hash calculation.
+  // This often yields better space efficiency at the cost of performance.
+  // If this option is set to false, # of entries in table is constrained to be
+  // power of two, and bitwise AND is used to calculate the hash, which is
+  // faster in general.
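+  //
+  // As an illustrative sketch of the difference (not the exact internal
+  // code): with use_module_hash=true the bucket is computed roughly as
+  //     bucket = hash % num_buckets;
+  // while with use_module_hash=false the table size is constrained to a
+  // power of two and the bucket is
+  //     bucket = hash & (num_buckets - 1);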
+  bool use_module_hash = true;
+};
+
+// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo
+// Hashing
+extern TableFactory* NewCuckooTableFactory(
+    const CuckooTableOptions& table_options = CuckooTableOptions());
+
+class RandomAccessFileReader;
+
+// A base class for table factories.
+class TableFactory : public Customizable {
+ public:
+  virtual ~TableFactory() override {}
+
+  static const char* kBlockCacheOpts() { return "BlockCache"; };
+  static const char* kBlockBasedTableName() { return "BlockBasedTable"; };
+  static const char* kPlainTableName() { return "PlainTable"; }
+  static const char* kCuckooTableName() { return "CuckooTable"; };
+
+  // Creates and configures a new TableFactory from the input options and id.
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::shared_ptr<TableFactory>* factory);
+
+  static const char* Type() { return "TableFactory"; }
+
+  // Returns a table reader object that can fetch data from the file specified
+  // in parameter file. It's the caller's responsibility to make sure
+  // file is in the correct format.
+  //
+  // NewTableReader() is called in three places:
+  // (1) TableCache::FindTable() calls the function when table cache miss
+  // and cache the table object returned.
+  // (2) SstFileDumper (for SST Dump) opens the table and dumps the table
+  // contents using the iterator of the table.
+  // (3) DBImpl::IngestExternalFile() calls this function to read the contents
+  // of the sst file it's attempting to add
+  //
+  // table_reader_options is a TableReaderOptions which contain all the
+  //    needed parameters and configuration to open the table.
+  // file is a file handler to handle the file for the table.
+  // file_size is the physical file size of the file.
+  // table_reader is the output table reader.
+  virtual Status NewTableReader(
+      const TableReaderOptions& table_reader_options,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table_reader,
+      bool prefetch_index_and_filter_in_cache = true) const {
+    ReadOptions ro;
+    return NewTableReader(ro, table_reader_options, std::move(file), file_size,
+                          table_reader, prefetch_index_and_filter_in_cache);
+  }
+
+  // Overload of the above function that allows the caller to pass in a
+  // ReadOptions
+  virtual Status NewTableReader(
+      const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table_reader,
+      bool prefetch_index_and_filter_in_cache) const = 0;
+
+  // Return a table builder to write to a file for this table type.
+  //
+  // It is called in several places:
+  // (1) When flushing memtable to a level-0 output file, it creates a table
+  //     builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
+  // (2) During compaction, it gets the builder for writing compaction output
+  //     files in DBImpl::OpenCompactionOutputFile().
+  // (3) When recovering from transaction logs, it creates a table builder to
+  //     write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
+  //     by calling BuildTable())
+  // (4) When running Repairer, it creates a table builder to convert logs to
+  //     SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
+  //
+  // Multiple configured options can be accessed from there, including but not
+  // limited to compression options. file is a handle of a writable file.
+  // It is the caller's responsibility to keep the file open and close the file
+  // after closing the table builder. compression_type is the compression type
+  // to use in this table.
+  virtual TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFileWriter* file) const = 0;
+
+  // Return is delete range supported
+  virtual bool IsDeleteRangeSupported() const { return false; }
+};
+
+// Create a special table factory that can open either of the supported
+// table formats, based on setting inside the SST files. It should be used to
+// convert a DB from one table format to another.
+// @table_factory_to_write: the table factory used when writing to new files.
+// @block_based_table_factory: block based table factory to use. If NULL, use
+//                             a default one.
+// @plain_table_factory: plain table factory to use. If NULL, use a default
+//                       one.
+// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default
+//                        one.
+extern TableFactory* NewAdaptiveTableFactory(
+    std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
+    std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
+    std::shared_ptr<TableFactory> plain_table_factory = nullptr,
+    std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/table_properties.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/table_properties.h
new file mode 100644
index 0000000000..ab259f930b
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/table_properties.h
@@ -0,0 +1,332 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
+
+#include <stdint.h>
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// -- Table Properties
+// Other than basic table properties, each table may also have the user
+// collected properties.
+// The value of the user-collected properties are encoded as raw bytes --
+// users have to interpret these values by themselves.
+// Note: To do prefix seek/scan in `UserCollectedProperties`, you can do
+// something similar to:
+//
+// UserCollectedProperties props = ...;
+// for (auto pos = props.lower_bound(prefix);
+//      pos != props.end() && pos->first.compare(0, prefix.size(), prefix) == 0;
+//      ++pos) {
+//   ...
+// }
+using UserCollectedProperties = std::map<std::string, std::string>;
+
+// table properties' human-readable names in the property block.
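+//
+// For example (an illustrative sketch; error handling elided), the table
+// properties of all live SST files, including these named properties, can be
+// inspected through the DB handle:
+//
+//   TablePropertiesCollection props;  // file path -> properties
+//   Status s = db->GetPropertiesOfAllTables(&props);
+//   for (const auto& p : props) {
+//     // p.second is a std::shared_ptr<const TableProperties>
+//     uint64_t n = p.second->num_entries;
+//   }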
+struct TablePropertiesNames {
+  static const std::string kDbId;
+  static const std::string kDbSessionId;
+  static const std::string kDbHostId;
+  static const std::string kOriginalFileNumber;
+  static const std::string kDataSize;
+  static const std::string kIndexSize;
+  static const std::string kIndexPartitions;
+  static const std::string kTopLevelIndexSize;
+  static const std::string kIndexKeyIsUserKey;
+  static const std::string kIndexValueIsDeltaEncoded;
+  static const std::string kFilterSize;
+  static const std::string kRawKeySize;
+  static const std::string kRawValueSize;
+  static const std::string kNumDataBlocks;
+  static const std::string kNumEntries;
+  static const std::string kNumFilterEntries;
+  static const std::string kDeletedKeys;
+  static const std::string kMergeOperands;
+  static const std::string kNumRangeDeletions;
+  static const std::string kFormatVersion;
+  static const std::string kFixedKeyLen;
+  static const std::string kFilterPolicy;
+  static const std::string kColumnFamilyName;
+  static const std::string kColumnFamilyId;
+  static const std::string kComparator;
+  static const std::string kMergeOperator;
+  static const std::string kPrefixExtractorName;
+  static const std::string kPropertyCollectors;
+  static const std::string kCompression;
+  static const std::string kCompressionOptions;
+  static const std::string kCreationTime;
+  static const std::string kOldestKeyTime;
+  static const std::string kFileCreationTime;
+  static const std::string kSlowCompressionEstimatedDataSize;
+  static const std::string kFastCompressionEstimatedDataSize;
+  static const std::string kSequenceNumberTimeMapping;
+  static const std::string kTailStartOffset;
+};
+
+// `TablePropertiesCollector` provides the mechanism for users to collect
+// their own properties that they are interested in. This class is essentially
+// a collection of callback functions that will be invoked during table
+// building. It is constructed with TablePropertiesCollectorFactory. The
+// methods don't need to be thread-safe, as we will create exactly one
+// TablePropertiesCollector object per table and then call it sequentially.
+//
+// Statuses from these callbacks are currently logged when not OK, but
+// otherwise ignored by RocksDB.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class TablePropertiesCollector {
+ public:
+  virtual ~TablePropertiesCollector() {}
+
+  // DEPRECATED: user-defined collectors should implement AddUserKey(), though
+  // this old function still works for backward-compatibility reasons.
+  // Add() will be called when a new key/value pair is inserted into the table.
+  // @params key the user key that is inserted into the table.
+  // @params value the value that is inserted into the table.
+  virtual Status Add(const Slice& /*key*/, const Slice& /*value*/) {
+    return Status::InvalidArgument(
+        "TablePropertiesCollector::Add() deprecated.");
+  }
+
+  // AddUserKey() will be called when a new key/value pair is inserted into the
+  // table.
+  // @params key the user key that is inserted into the table.
+  // @params value the value that is inserted into the table.
+  virtual Status AddUserKey(const Slice& key, const Slice& value,
+                            EntryType /*type*/, SequenceNumber /*seq*/,
+                            uint64_t /*file_size*/) {
+    // For backwards-compatibility.
+    return Add(key, value);
+  }
+
+  // Called after each new block is cut
+  virtual void BlockAdd(uint64_t /* block_uncomp_bytes */,
+                        uint64_t /* block_compressed_bytes_fast */,
+                        uint64_t /* block_compressed_bytes_slow */) {
+    // Nothing to do here. Subclasses can override.
+    return;
+  }
+
+  // Finish() will be called when a table has already been built and is ready
+  // for writing the properties block.
+  // @params properties User will add their collected statistics to
+  // `properties`.
+  virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+  // Return the human-readable properties, where the key is property name and
+  // the value is the human-readable form of value.
+  virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+  // The name of the properties collector can be used for debugging purposes.
+  virtual const char* Name() const = 0;
+
+  // EXPERIMENTAL Return whether the output file should be further compacted
+  virtual bool NeedCompact() const { return false; }
+};
+
+// Constructs TablePropertiesCollector. Internals create a new
+// TablePropertiesCollector for each new table
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class TablePropertiesCollectorFactory : public Customizable {
+ public:
+  struct Context {
+    uint32_t column_family_id;
+    // The level at creating the SST file (i.e., table), of which the
+    // properties are being collected.
+    int level_at_creation = kUnknownLevelAtCreation;
+    static const uint32_t kUnknownColumnFamily;
+    static const int kUnknownLevelAtCreation = -1;
+  };
+
+  ~TablePropertiesCollectorFactory() override {}
+  static const char* Type() { return "TablePropertiesCollectorFactory"; }
+  static Status CreateFromString(
+      const ConfigOptions& options, const std::string& value,
+      std::shared_ptr<TablePropertiesCollectorFactory>* result);
+
+  // has to be thread-safe
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context context) = 0;
+
+  // The name of the properties collector can be used for debugging purposes.
+  const char* Name() const override = 0;
+
+  // Can be overridden by sub-classes to return the Name, followed by
+  // configuration info that will be logged to the info log when the
+  // DB is opened
+  virtual std::string ToString() const { return Name(); }
+};
+
+// TableProperties contains a bunch of read-only properties of its associated
+// table.
+struct TableProperties {
+ public:
+  // the file number at creation time, or 0 for unknown. When known,
+  // combining with db_session_id must uniquely identify an SST file.
+  uint64_t orig_file_number = 0;
+  // the total size of all data blocks.
+  uint64_t data_size = 0;
+  // the size of index block.
+  uint64_t index_size = 0;
+  // Total number of index partitions if kTwoLevelIndexSearch is used
+  uint64_t index_partitions = 0;
+  // Size of the top-level index if kTwoLevelIndexSearch is used
+  uint64_t top_level_index_size = 0;
+  // Whether the index key is user key. Otherwise it includes 8 bytes of
+  // sequence number added by internal key format.
+  uint64_t index_key_is_user_key = 0;
+  // Whether delta encoding is used to encode the index values.
+  uint64_t index_value_is_delta_encoded = 0;
+  // the size of filter block.
+ uint64_t filter_size = 0; + // total raw (uncompressed, undelineated) key size + uint64_t raw_key_size = 0; + // total raw (uncompressed, undelineated) value size + uint64_t raw_value_size = 0; + // the number of blocks in this table + uint64_t num_data_blocks = 0; + // the number of entries in this table + uint64_t num_entries = 0; + // the number of unique entries (keys or prefixes) added to filters + uint64_t num_filter_entries = 0; + // the number of deletions in the table + uint64_t num_deletions = 0; + // the number of merge operands in the table + uint64_t num_merge_operands = 0; + // the number of range deletions in this table + uint64_t num_range_deletions = 0; + // format version, reserved for backward compatibility + uint64_t format_version = 0; + // If 0, key is variable length. Otherwise number of bytes for each key. + uint64_t fixed_key_len = 0; + // ID of column family for this SST file, corresponding to the CF identified + // by column_family_name. + uint64_t column_family_id = ROCKSDB_NAMESPACE:: + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; + // Timestamp of the latest key. 0 means unknown. + // TODO(sagar0): Should be changed to latest_key_time ... but don't know the + // full implications of backward compatibility. Hence retaining for now. + uint64_t creation_time = 0; + + // Timestamp of the earliest key. 0 means unknown. + uint64_t oldest_key_time = 0; + // Actual SST file creation time. 0 means unknown. + uint64_t file_creation_time = 0; + // Estimated size of data blocks if compressed using a relatively slower + // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`). + // 0 means unknown. + uint64_t slow_compression_estimated_data_size = 0; + // Estimated size of data blocks if compressed using a relatively faster + // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`). + // 0 means unknown. + uint64_t fast_compression_estimated_data_size = 0; + // Offset of the value of the property "external sst file global seqno" in the + // file if the property exists. + // 0 means not exists. + uint64_t external_sst_file_global_seqno_offset = 0; + + // Offset where the "tail" part of SST file starts + // "Tail" refers to all blocks after data blocks till the end of the SST file + uint64_t tail_start_offset = 0; + + // DB identity + // db_id is an identifier generated the first time the DB is created + // If DB identity is unset or unassigned, `db_id` will be an empty string. + std::string db_id; + + // DB session identity + // db_session_id is an identifier that gets reset every time the DB is opened + // If DB session identity is unset or unassigned, `db_session_id` will be an + // empty string. + std::string db_session_id; + + // Location of the machine hosting the DB instance + // db_host_id identifies the location of the host in some form + // (hostname by default, but can also be any string of the user's choosing). + // It can potentially change whenever the DB is opened + std::string db_host_id; + + // Name of the column family with which this SST file is associated. + // If column family is unknown, `column_family_name` will be an empty string. + std::string column_family_name; + + // The name of the filter policy used in this table. + // If no filter policy is used, `filter_policy_name` will be an empty string. + std::string filter_policy_name; + + // The name of the comparator used in this table. + std::string comparator_name; + + // The name of the merge operator used in this table. 
+ // If no merge operator is used, `merge_operator_name` will be "nullptr". + std::string merge_operator_name; + + // The name of the prefix extractor used in this table + // If no prefix extractor is used, `prefix_extractor_name` will be "nullptr". + std::string prefix_extractor_name; + + // The names of the property collectors factories used in this table + // separated by commas + // {collector_name[1]},{collector_name[2]},{collector_name[3]} .. + std::string property_collectors_names; + + // The compression algo used to compress the SST files. + std::string compression_name; + + // Compression options used to compress the SST files. + std::string compression_options; + + // Sequence number to time mapping, delta encoded. + std::string seqno_to_time_mapping; + + // user collected properties + UserCollectedProperties user_collected_properties; + UserCollectedProperties readable_properties; + + // convert this object to a human readable form + // @prop_delim: delimiter for each property. + std::string ToString(const std::string& prop_delim = "; ", + const std::string& kv_delim = "=") const; + + // Aggregate the numerical member variables of the specified + // TableProperties. + void Add(const TableProperties& tp); + + // Subset of properties that make sense when added together + // between tables. Keys match field names in this class instead + // of using full property names. + std::map GetAggregatablePropertiesAsMap() const; + + // Return the approximated memory usage of this TableProperties object, + // including memory used by the string properties and UserCollectedProperties + std::size_t ApproximateMemoryUsage() const; +}; + +// Extra properties +// Below is a list of non-basic properties that are collected by database +// itself. Especially some properties regarding to the internal keys (which +// is unknown to `table`). +// +// DEPRECATED: these properties now belong as TableProperties members. Please +// use TableProperties::num_deletions and TableProperties::num_merge_operands, +// respectively. +extern uint64_t GetDeletedKeys(const UserCollectedProperties& props); +extern uint64_t GetMergeOperands(const UserCollectedProperties& props, + bool* property_present); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/table_reader_caller.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/table_reader_caller.h new file mode 100644 index 0000000000..10ec08130f --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/table_reader_caller.h @@ -0,0 +1,41 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +// A list of callers for a table reader. It is used to trace the caller that +// accesses on a block. This is only used for block cache tracing and analysis. +// A user may use kUncategorized if the caller is not interesting for analysis +// or the table reader is called in the test environment, e.g., unit test, table +// reader benchmark, etc. 
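+// As an illustrative sketch (not part of the upstream header), analysis code
+// might bucket block cache lookups by caller roughly like this:
+//
+//   bool IsUserInitiated(TableReaderCaller c) {
+//     return c == kUserGet || c == kUserMultiGet || c == kUserIterator ||
+//            c == kUserApproximateSize || c == kUserVerifyChecksum;
+//   }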
+enum TableReaderCaller : char { + kUserGet = 1, + kUserMultiGet = 2, + kUserIterator = 3, + kUserApproximateSize = 4, + kUserVerifyChecksum = 5, + kSSTDumpTool = 6, + kExternalSSTIngestion = 7, + kRepair = 8, + kPrefetch = 9, + kCompaction = 10, + // A compaction job may refill the block cache with blocks in the new SST + // files if paranoid_file_checks is true. + kCompactionRefill = 11, + // After building a table, it may load all its blocks into the block cache if + // paranoid_file_checks is true. + kFlush = 12, + // sst_file_reader. + kSSTFileReader = 13, + // A list of callers that are either not interesting for analysis or are + // calling from a test environment, e.g., unit test, benchmark, etc. + kUncategorized = 14, + // All callers should be added before kMaxBlockCacheLookupCaller. + kMaxBlockCacheLookupCaller +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/thread_status.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/thread_status.h new file mode 100644 index 0000000000..beecdfd258 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/thread_status.h @@ -0,0 +1,190 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file defines the structures for exposing run-time status of any +// rocksdb-related thread. Such run-time status can be obtained via +// GetThreadList() API. +// +// Note that all thread-status features are still under-development, and +// thus APIs and class definitions might subject to change at this point. +// Will remove this comment once the APIs have been finalized. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +#if !defined(NROCKSDB_THREAD_STATUS) +#define ROCKSDB_USING_THREAD_STATUS +#endif + +namespace ROCKSDB_NAMESPACE { + +// TODO(yhchiang): remove this function once c++14 is available +// as std::max will be able to cover this. +// Current MS compiler does not support constexpr +template +struct constexpr_max { + static const int result = (A > B) ? A : B; +}; + +// A structure that describes the current status of a thread. +// The status of active threads can be fetched using +// ROCKSDB_NAMESPACE::GetThreadList(). +struct ThreadStatus { + // The type of a thread. + enum ThreadType : int { + HIGH_PRIORITY = 0, // RocksDB BG thread in high-pri thread pool + LOW_PRIORITY, // RocksDB BG thread in low-pri thread pool + USER, // User thread (Non-RocksDB BG thread) + BOTTOM_PRIORITY, // RocksDB BG thread in bottom-pri thread pool + NUM_THREAD_TYPES + }; + + // The type used to refer to a thread operation. + // A thread operation describes high-level action of a thread. + // Examples include compaction and flush. 
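+  // As an illustrative sketch (not part of this header), a monitor could
+  // poll thread statuses via Env::GetThreadList() and count the background
+  // threads currently flushing or compacting:
+  //
+  //   std::vector<ThreadStatus> statuses;
+  //   db->GetEnv()->GetThreadList(&statuses);
+  //   int busy = 0;
+  //   for (const auto& ts : statuses) {
+  //     if (ts.operation_type == ThreadStatus::OP_FLUSH ||
+  //         ts.operation_type == ThreadStatus::OP_COMPACTION) {
+  //       ++busy;
+  //     }
+  //   }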
+ enum OperationType : int { + OP_UNKNOWN = 0, + OP_COMPACTION, + OP_FLUSH, + OP_DBOPEN, + NUM_OP_TYPES + }; + + enum OperationStage : int { + STAGE_UNKNOWN = 0, + STAGE_FLUSH_RUN, + STAGE_FLUSH_WRITE_L0, + STAGE_COMPACTION_PREPARE, + STAGE_COMPACTION_RUN, + STAGE_COMPACTION_PROCESS_KV, + STAGE_COMPACTION_INSTALL, + STAGE_COMPACTION_SYNC_FILE, + STAGE_PICK_MEMTABLES_TO_FLUSH, + STAGE_MEMTABLE_ROLLBACK, + STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS, + NUM_OP_STAGES + }; + + enum CompactionPropertyType : int { + COMPACTION_JOB_ID = 0, + COMPACTION_INPUT_OUTPUT_LEVEL, + COMPACTION_PROP_FLAGS, + COMPACTION_TOTAL_INPUT_BYTES, + COMPACTION_BYTES_READ, + COMPACTION_BYTES_WRITTEN, + NUM_COMPACTION_PROPERTIES + }; + + enum FlushPropertyType : int { + FLUSH_JOB_ID = 0, + FLUSH_BYTES_MEMTABLES, + FLUSH_BYTES_WRITTEN, + NUM_FLUSH_PROPERTIES + }; + + // The maximum number of properties of an operation. + // This number should be set to the biggest NUM_XXX_PROPERTIES. + static const int kNumOperationProperties = + constexpr_max::result; + + // The type used to refer to a thread state. + // A state describes lower-level action of a thread + // such as reading / writing a file or waiting for a mutex. + enum StateType : int { + STATE_UNKNOWN = 0, + STATE_MUTEX_WAIT = 1, + NUM_STATE_TYPES + }; + + ThreadStatus(const uint64_t _id, const ThreadType _thread_type, + const std::string& _db_name, const std::string& _cf_name, + const OperationType _operation_type, + const uint64_t _op_elapsed_micros, + const OperationStage _operation_stage, + const uint64_t _op_props[], const StateType _state_type) + : thread_id(_id), + thread_type(_thread_type), + db_name(_db_name), + cf_name(_cf_name), + operation_type(_operation_type), + op_elapsed_micros(_op_elapsed_micros), + operation_stage(_operation_stage), + state_type(_state_type) { + for (int i = 0; i < kNumOperationProperties; ++i) { + op_properties[i] = _op_props[i]; + } + } + + // An unique ID for the thread. + const uint64_t thread_id; + + // The type of the thread, it could be HIGH_PRIORITY, + // LOW_PRIORITY, and USER + const ThreadType thread_type; + + // The name of the DB instance where the thread is currently + // involved with. It would be set to empty string if the thread + // does not involve in any DB operation. + const std::string db_name; + + // The name of the column family where the thread is currently + // It would be set to empty string if the thread does not involve + // in any column family. + const std::string cf_name; + + // The operation (high-level action) that the current thread is involved. + const OperationType operation_type; + + // The elapsed time of the current thread operation in microseconds. + const uint64_t op_elapsed_micros; + + // An integer showing the current stage where the thread is involved + // in the current operation. + const OperationStage operation_stage; + + // A list of properties that describe some details about the current + // operation. Same field in op_properties[] might have different + // meanings for different operations. + uint64_t op_properties[kNumOperationProperties]; + + // The state (lower-level action) that the current thread is involved. + const StateType state_type; + + // The followings are a set of utility functions for interpreting + // the information of ThreadStatus + + static std::string GetThreadTypeName(ThreadType thread_type); + + // Obtain the name of an operation given its type. 
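+  // (For example, GetOperationName(ThreadStatus::OP_COMPACTION) yields a
+  // printable name along the lines of "Compaction"; the exact strings are
+  // defined by the implementation.)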
+ static const std::string& GetOperationName(OperationType op_type); + + static const std::string MicrosToString(uint64_t op_elapsed_time); + + // Obtain a human-readable string describing the specified operation stage. + static const std::string& GetOperationStageName(OperationStage stage); + + // Obtain the name of the "i"th operation property of the + // specified operation. + static const std::string& GetOperationPropertyName(OperationType op_type, + int i); + + // Translate the "i"th property of the specified operation given + // a property value. + static std::map InterpretOperationProperties( + OperationType op_type, const uint64_t* op_properties); + + // Obtain the name of a state given its type. + static const std::string& GetStateName(StateType state_type); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/threadpool.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/threadpool.h new file mode 100644 index 0000000000..f1cc557524 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/threadpool.h @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +/* + * ThreadPool is a component that will spawn N background threads that will + * be used to execute scheduled work, The number of background threads could + * be modified by calling SetBackgroundThreads(). + * */ +class ThreadPool { + public: + virtual ~ThreadPool() {} + + // Wait for all threads to finish. + // Discard those threads that did not start + // executing + virtual void JoinAllThreads() = 0; + + // Set the number of background threads that will be executing the + // scheduled jobs. + virtual void SetBackgroundThreads(int num) = 0; + virtual int GetBackgroundThreads() = 0; + + // Get the number of jobs scheduled in the ThreadPool queue. + virtual unsigned int GetQueueLen() const = 0; + + // Waits for all jobs to complete those + // that already started running and those that did not + // start yet. This ensures that everything that was thrown + // on the TP runs even though + // we may not have specified enough threads for the amount + // of jobs + virtual void WaitForJobsAndJoinAllThreads() = 0; + + // Submit a fire and forget jobs + // This allows to submit the same job multiple times + virtual void SubmitJob(const std::function&) = 0; + // This moves the function in for efficiency + virtual void SubmitJob(std::function&&) = 0; + + // Reserve available background threads. This function does not ensure + // so many threads can be reserved, instead it will return the number of + // threads that can be reserved against the desired one. In other words, + // the number of available threads could be less than the input. 
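+  // For example (illustrative): with 4 threads total of which 3 are already
+  // reserved, ReserveThreads(2) would reserve the single remaining thread
+  // and return 1.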
+ virtual int ReserveThreads(int /*threads_to_be_reserved*/) { return 0; } + + // Release a specific number of reserved threads + virtual int ReleaseThreads(int /*threads_to_be_released*/) { return 0; } +}; + +// NewThreadPool() is a function that could be used to create a ThreadPool +// with `num_threads` background threads. +extern ThreadPool* NewThreadPool(int num_threads); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/trace_reader_writer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/trace_reader_writer.h new file mode 100644 index 0000000000..335e091dc8 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/trace_reader_writer.h @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +// Allow custom implementations of TraceWriter and TraceReader. +// By default, RocksDB provides a way to capture the traces to a file using the +// factory NewFileTraceWriter(). But users could also choose to export traces to +// any other system by providing custom implementations of TraceWriter and +// TraceReader. + +// TraceWriter allows exporting RocksDB traces to any system, one operation at +// a time. +class TraceWriter { + public: + virtual ~TraceWriter() = default; + + virtual Status Write(const Slice& data) = 0; + virtual Status Close() = 0; + virtual uint64_t GetFileSize() = 0; +}; + +// TraceReader allows reading RocksDB traces from any system, one operation at +// a time. A RocksDB Replayer could depend on this to replay operations. +class TraceReader { + public: + virtual ~TraceReader() = default; + + virtual Status Read(std::string* data) = 0; + virtual Status Close() = 0; + + // Seek back to the trace header. Replayer can call this method to restart + // replaying. Note this method may fail if the reader is already closed. + virtual Status Reset() = 0; +}; + +// Factory methods to write/read traces to/from a file. +// The implementations may not be thread-safe. +Status NewFileTraceWriter(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr* trace_writer); +Status NewFileTraceReader(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr* trace_reader); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/trace_record.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/trace_record.h new file mode 100644 index 0000000000..c00f5cafbe --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/trace_record.h @@ -0,0 +1,248 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyHandle; +class DB; + +// Supported trace record types. 
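+// As an illustrative sketch (not part of this header), query-level traces
+// made of these record types are typically captured with DB::StartTrace()
+// and a TraceWriter; the file path below is an example:
+//
+//   std::unique_ptr<TraceWriter> writer;
+//   NewFileTraceWriter(db->GetEnv(), EnvOptions(), "/tmp/query_trace",
+//                      &writer);
+//   db->StartTrace(TraceOptions(), std::move(writer));
+//   // ... issue the reads/writes to be traced ...
+//   db->EndTrace();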
+enum TraceType : char { + kTraceNone = 0, + kTraceBegin = 1, + kTraceEnd = 2, + // Query level tracing related trace types. + kTraceWrite = 3, + kTraceGet = 4, + kTraceIteratorSeek = 5, + kTraceIteratorSeekForPrev = 6, + // Block cache tracing related trace types. + kBlockTraceIndexBlock = 7, + // TODO: split out kinds of filter blocks? + kBlockTraceFilterBlock = 8, + kBlockTraceDataBlock = 9, + kBlockTraceUncompressionDictBlock = 10, + kBlockTraceRangeDeletionBlock = 11, + // IO tracing related trace type. + kIOTracer = 12, + // Query level tracing related trace type. + kTraceMultiGet = 13, + // All trace types should be added before kTraceMax + kTraceMax, +}; + +class GetQueryTraceRecord; +class IteratorSeekQueryTraceRecord; +class MultiGetQueryTraceRecord; +class TraceRecordResult; +class WriteQueryTraceRecord; + +// Base class for all types of trace records. +class TraceRecord { + public: + explicit TraceRecord(uint64_t timestamp); + + virtual ~TraceRecord() = default; + + // Type of the trace record. + virtual TraceType GetTraceType() const = 0; + + // Timestamp (in microseconds) of this trace. + virtual uint64_t GetTimestamp() const; + + class Handler { + public: + virtual ~Handler() = default; + + virtual Status Handle(const WriteQueryTraceRecord& record, + std::unique_ptr* result) = 0; + + virtual Status Handle(const GetQueryTraceRecord& record, + std::unique_ptr* result) = 0; + + virtual Status Handle(const IteratorSeekQueryTraceRecord& record, + std::unique_ptr* result) = 0; + + virtual Status Handle(const MultiGetQueryTraceRecord& record, + std::unique_ptr* result) = 0; + }; + + // Accept the handler and report the corresponding result in `result`. + virtual Status Accept(Handler* handler, + std::unique_ptr* result) = 0; + + // Create a handler for the exeution of TraceRecord. + static Handler* NewExecutionHandler( + DB* db, const std::vector& handles); + + private: + uint64_t timestamp_; +}; + +// Base class for all query types of trace records. +class QueryTraceRecord : public TraceRecord { + public: + explicit QueryTraceRecord(uint64_t timestamp); +}; + +// Trace record for DB::Write() operation. +class WriteQueryTraceRecord : public QueryTraceRecord { + public: + WriteQueryTraceRecord(PinnableSlice&& write_batch_rep, uint64_t timestamp); + + WriteQueryTraceRecord(const std::string& write_batch_rep, uint64_t timestamp); + + virtual ~WriteQueryTraceRecord() override; + + TraceType GetTraceType() const override { return kTraceWrite; } + + // rep string for the WriteBatch. + virtual Slice GetWriteBatchRep() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + PinnableSlice rep_; +}; + +// Trace record for DB::Get() operation +class GetQueryTraceRecord : public QueryTraceRecord { + public: + GetQueryTraceRecord(uint32_t column_family_id, PinnableSlice&& key, + uint64_t timestamp); + + GetQueryTraceRecord(uint32_t column_family_id, const std::string& key, + uint64_t timestamp); + + virtual ~GetQueryTraceRecord() override; + + TraceType GetTraceType() const override { return kTraceGet; } + + // Column family ID. + virtual uint32_t GetColumnFamilyID() const; + + // Key to get. + virtual Slice GetKey() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + uint32_t cf_id_; + PinnableSlice key_; +}; + +// Base class for all Iterator related operations. 
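+// As an illustrative sketch (not part of this header), a custom
+// TraceRecord::Handler might replay a seek record against a live DB,
+// assuming hypothetical db_ and handles_ members that map column family IDs
+// to handles (dispatch on GetSeekType() is omitted for brevity):
+//
+//   Status Handle(const IteratorSeekQueryTraceRecord& record,
+//                 std::unique_ptr<TraceRecordResult>* /*result*/) override {
+//     std::unique_ptr<Iterator> it(db_->NewIterator(
+//         ReadOptions(), handles_.at(record.GetColumnFamilyID())));
+//     it->Seek(record.GetKey());
+//     return it->status();
+//   }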
+class IteratorQueryTraceRecord : public QueryTraceRecord { + public: + explicit IteratorQueryTraceRecord(uint64_t timestamp); + + IteratorQueryTraceRecord(PinnableSlice&& lower_bound, + PinnableSlice&& upper_bound, uint64_t timestamp); + + IteratorQueryTraceRecord(const std::string& lower_bound, + const std::string& upper_bound, uint64_t timestamp); + + virtual ~IteratorQueryTraceRecord() override; + + // Get the iterator's lower/upper bound. They may be used in ReadOptions to + // create an Iterator instance. + virtual Slice GetLowerBound() const; + virtual Slice GetUpperBound() const; + + private: + PinnableSlice lower_; + PinnableSlice upper_; +}; + +// Trace record for Iterator::Seek() and Iterator::SeekForPrev() operation. +class IteratorSeekQueryTraceRecord : public IteratorQueryTraceRecord { + public: + // Currently we only support Seek() and SeekForPrev(). + enum SeekType { + kSeek = kTraceIteratorSeek, + kSeekForPrev = kTraceIteratorSeekForPrev + }; + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + PinnableSlice&& key, uint64_t timestamp); + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + const std::string& key, uint64_t timestamp); + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + PinnableSlice&& key, PinnableSlice&& lower_bound, + PinnableSlice&& upper_bound, uint64_t timestamp); + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + const std::string& key, + const std::string& lower_bound, + const std::string& upper_bound, + uint64_t timestamp); + + virtual ~IteratorSeekQueryTraceRecord() override; + + // Trace type matches the seek type. + TraceType GetTraceType() const override; + + // Type of seek, Seek or SeekForPrev. + virtual SeekType GetSeekType() const; + + // Column family ID. + virtual uint32_t GetColumnFamilyID() const; + + // Key to seek to. + virtual Slice GetKey() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + SeekType type_; + uint32_t cf_id_; + PinnableSlice key_; +}; + +// Trace record for DB::MultiGet() operation. +class MultiGetQueryTraceRecord : public QueryTraceRecord { + public: + MultiGetQueryTraceRecord(std::vector column_family_ids, + std::vector&& keys, + uint64_t timestamp); + + MultiGetQueryTraceRecord(std::vector column_family_ids, + const std::vector& keys, + uint64_t timestamp); + + virtual ~MultiGetQueryTraceRecord() override; + + TraceType GetTraceType() const override { return kTraceMultiGet; } + + // Column familiy IDs. + virtual std::vector GetColumnFamilyIDs() const; + + // Keys to get. + virtual std::vector GetKeys() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + std::vector cf_ids_; + std::vector keys_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/trace_record_result.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/trace_record_result.h new file mode 100644 index 0000000000..0cd0004a6a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/trace_record_result.h @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/trace_record.h" + +namespace ROCKSDB_NAMESPACE { + +class IteratorTraceExecutionResult; +class MultiValuesTraceExecutionResult; +class SingleValueTraceExecutionResult; +class StatusOnlyTraceExecutionResult; + +// Base class for the results of all types of trace records. +// Theses classes can be used to report the execution result of +// TraceRecord::Handler::Handle() or TraceRecord::Accept(). +class TraceRecordResult { + public: + explicit TraceRecordResult(TraceType trace_type); + + virtual ~TraceRecordResult() = default; + + // Trace type of the corresponding TraceRecord. + virtual TraceType GetTraceType() const; + + class Handler { + public: + virtual ~Handler() = default; + + virtual Status Handle(const StatusOnlyTraceExecutionResult& result) = 0; + + virtual Status Handle(const SingleValueTraceExecutionResult& result) = 0; + + virtual Status Handle(const MultiValuesTraceExecutionResult& result) = 0; + + virtual Status Handle(const IteratorTraceExecutionResult& result) = 0; + }; + + // Accept the handler. + virtual Status Accept(Handler* handler) = 0; + + private: + TraceType trace_type_; +}; + +// Base class for the results from the trace record execution handler (created +// by TraceRecord::NewExecutionHandler()). +// +// The actual execution status or returned values may be hidden from +// TraceRecord::Handler::Handle and TraceRecord::Accept. For example, a +// GetQueryTraceRecord's execution calls DB::Get() internally. DB::Get() may +// return Status::NotFound() but TraceRecord::Handler::Handle() or +// TraceRecord::Accept() will still return Status::OK(). The actual status from +// DB::Get() and the returned value string may be saved in a +// SingleValueTraceExecutionResult. +class TraceExecutionResult : public TraceRecordResult { + public: + TraceExecutionResult(uint64_t start_timestamp, uint64_t end_timestamp, + TraceType trace_type); + + // Execution start/end timestamps and request latency in microseconds. + virtual uint64_t GetStartTimestamp() const; + virtual uint64_t GetEndTimestamp() const; + inline uint64_t GetLatency() const { + return GetEndTimestamp() - GetStartTimestamp(); + } + + private: + uint64_t ts_start_; + uint64_t ts_end_; +}; + +// Result for operations that only return a single Status. +// Example operation: DB::Write() +class StatusOnlyTraceExecutionResult : public TraceExecutionResult { + public: + StatusOnlyTraceExecutionResult(Status status, uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~StatusOnlyTraceExecutionResult() override = default; + + // Return value of DB::Write(), etc. + virtual const Status& GetStatus() const; + + virtual Status Accept(Handler* handler) override; + + private: + Status status_; +}; + +// Result for operations that return a Status and a value. +// Example operation: DB::Get() +class SingleValueTraceExecutionResult : public TraceExecutionResult { + public: + SingleValueTraceExecutionResult(Status status, const std::string& value, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + SingleValueTraceExecutionResult(Status status, std::string&& value, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~SingleValueTraceExecutionResult() override; + + // Return status of DB::Get(). + virtual const Status& GetStatus() const; + + // Value for the searched key. 
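+  // (As an illustrative sketch, not part of this header, a
+  // TraceRecordResult::Handler might consume this result roughly as:
+  //
+  //   Status Handle(const SingleValueTraceExecutionResult& result) override {
+  //     if (result.GetStatus().ok()) {
+  //       Consume(result.GetValue());  // Consume() is a hypothetical sink
+  //     }
+  //     return Status::OK();
+  //   }
+  // )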
+ virtual const std::string& GetValue() const; + + virtual Status Accept(Handler* handler) override; + + private: + Status status_; + std::string value_; +}; + +// Result for operations that return multiple Status(es) and values as vectors. +// Example operation: DB::MultiGet() +class MultiValuesTraceExecutionResult : public TraceExecutionResult { + public: + MultiValuesTraceExecutionResult(std::vector multi_status, + std::vector values, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~MultiValuesTraceExecutionResult() override; + + // Returned Status(es) of DB::MultiGet(). + virtual const std::vector& GetMultiStatus() const; + + // Returned values for the searched keys. + virtual const std::vector& GetValues() const; + + virtual Status Accept(Handler* handler) override; + + private: + std::vector multi_status_; + std::vector values_; +}; + +// Result for Iterator operations. +// Example operations: Iterator::Seek(), Iterator::SeekForPrev() +class IteratorTraceExecutionResult : public TraceExecutionResult { + public: + IteratorTraceExecutionResult(bool valid, Status status, PinnableSlice&& key, + PinnableSlice&& value, uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + IteratorTraceExecutionResult(bool valid, Status status, + const std::string& key, const std::string& value, + uint64_t start_timestamp, uint64_t end_timestamp, + TraceType trace_type); + + virtual ~IteratorTraceExecutionResult() override; + + // Return if the Iterator is valid. + virtual bool GetValid() const; + + // Return the status of the Iterator. + virtual const Status& GetStatus() const; + + // Key of the current iterating entry, empty if GetValid() is false. + virtual Slice GetKey() const; + + // Value of the current iterating entry, empty if GetValid() is false. + virtual Slice GetValue() const; + + virtual Status Accept(Handler* handler) override; + + private: + bool valid_; + Status status_; + PinnableSlice key_; + PinnableSlice value_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/transaction_log.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/transaction_log.h new file mode 100644 index 0000000000..e13ad8f80a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/transaction_log.h @@ -0,0 +1,122 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" + +namespace ROCKSDB_NAMESPACE { + +class LogFile; +using VectorLogPtr = std::vector>; + +enum WalFileType { + /* Indicates that WAL file is in archive directory. WAL files are moved from + * the main db directory to archive directory once they are not live and stay + * there until cleaned up. Files are cleaned depending on archive size + * (Options::WAL_size_limit_MB) and time since last cleaning + * (Options::WAL_ttl_seconds). + */ + kArchivedLogFile = 0, + + /* Indicates that WAL file is live and resides in the main db directory */ + kAliveLogFile = 1 +}; + +class LogFile { + public: + LogFile() {} + virtual ~LogFile() {} + + // Returns log file's pathname relative to the main db dir + // Eg. 
For a live-log-file = /000003.log + // For an archived-log-file = /archive/000003.log + virtual std::string PathName() const = 0; + + // Primary identifier for log file. + // This is directly proportional to creation time of the log file + virtual uint64_t LogNumber() const = 0; + + // Log file can be either alive or archived + virtual WalFileType Type() const = 0; + + // Starting sequence number of writebatch written in this log file + virtual SequenceNumber StartSequence() const = 0; + + // Size of log file on disk in Bytes + virtual uint64_t SizeFileBytes() const = 0; +}; + +struct BatchResult { + SequenceNumber sequence = 0; + std::unique_ptr writeBatchPtr; + + // Add empty __ctor and __dtor for the rule of five + // However, preserve the original semantics and prohibit copying + // as the std::unique_ptr member does not copy. + BatchResult() {} + + ~BatchResult() {} + + BatchResult(const BatchResult&) = delete; + + BatchResult& operator=(const BatchResult&) = delete; + + BatchResult(BatchResult&& bResult) + : sequence(std::move(bResult.sequence)), + writeBatchPtr(std::move(bResult.writeBatchPtr)) {} + + BatchResult& operator=(BatchResult&& bResult) { + sequence = std::move(bResult.sequence); + writeBatchPtr = std::move(bResult.writeBatchPtr); + return *this; + } +}; + +// A TransactionLogIterator is used to iterate over the transactions in a db. +// One run of the iterator is continuous, i.e. the iterator will stop at the +// beginning of any gap in sequences +class TransactionLogIterator { + public: + TransactionLogIterator() {} + virtual ~TransactionLogIterator() {} + + // An iterator is either positioned at a WriteBatch or not valid. + // This method returns true if the iterator is valid. + // Can read data from a valid iterator. + virtual bool Valid() = 0; + + // Moves the iterator to the next WriteBatch. + // REQUIRES: Valid() to be true. + virtual void Next() = 0; + + // Returns ok if the iterator is valid. + // Returns the Error when something has gone wrong. + virtual Status status() = 0; + + // If valid return's the current write_batch and the sequence number of the + // earliest transaction contained in the batch. + // ONLY use if Valid() is true and status() is OK. + virtual BatchResult GetBatch() = 0; + + // The read options for TransactionLogIterator. + struct ReadOptions { + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + // Default: true + bool verify_checksums_; + + ReadOptions() : verify_checksums_(true) {} + + explicit ReadOptions(bool verify_checksums) + : verify_checksums_(verify_checksums) {} + }; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/types.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/types.h new file mode 100644 index 0000000000..3f8ce97959 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/types.h @@ -0,0 +1,94 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +// Define all public custom types here. + +using ColumnFamilyId = uint32_t; + +// Represents a sequence number in a WAL file. 
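+// As an illustrative sketch (not part of this header), sequence numbers are
+// what DB::GetUpdatesSince() takes when tailing the transaction log:
+//
+//   std::unique_ptr<TransactionLogIterator> iter;
+//   if (db->GetUpdatesSince(seq, &iter).ok()) {
+//     while (iter->Valid()) {
+//       BatchResult batch = iter->GetBatch();
+//       // batch.sequence is the earliest sequence number in the batch
+//       iter->Next();
+//     }
+//   }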
+using SequenceNumber = uint64_t; + +const SequenceNumber kMinUnCommittedSeq = 1; // 0 is always committed + +enum class TableFileCreationReason { + kFlush, + kCompaction, + kRecovery, + kMisc, +}; + +enum class BlobFileCreationReason { + kFlush, + kCompaction, + kRecovery, +}; + +// The types of files RocksDB uses in a DB directory. (Available for +// advanced options.) +enum FileType { + kWalFile, + kDBLockFile, + kTableFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one + kMetaDatabase, + kIdentityFile, + kOptionsFile, + kBlobFile +}; + +// User-oriented representation of internal key types. +// Ordering of this enum entries should not change. +enum EntryType { + kEntryPut, + kEntryDelete, + kEntrySingleDelete, + kEntryMerge, + kEntryRangeDeletion, + kEntryBlobIndex, + kEntryDeleteWithTimestamp, + kEntryWideColumnEntity, + kEntryOther, +}; + +enum class WriteStallCause { + // Beginning of CF-scope write stall causes + // + // Always keep `kMemtableLimit` as the first stat in this section + kMemtableLimit, + kL0FileCountLimit, + kPendingCompactionBytes, + kCFScopeWriteStallCauseEnumMax, + // End of CF-scope write stall causes + + // Beginning of DB-scope write stall causes + // + // Always keep `kWriteBufferManagerLimit` as the first stat in this section + kWriteBufferManagerLimit, + kDBScopeWriteStallCauseEnumMax, + // End of DB-scope write stall causes + + // Always add new WriteStallCause before `kNone` + kNone, +}; + +enum class WriteStallCondition { + kDelayed, + kStopped, + // Always add new WriteStallCondition before `kNormal` + kNormal, +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/unique_id.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/unique_id.h new file mode 100644 index 0000000000..eb0c778266 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/unique_id.h @@ -0,0 +1,55 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { + +// Computes a stable, universally unique 128-bit (16 binary char) identifier +// for an SST file from TableProperties. This is supported for table (SST) +// files created with RocksDB 6.24 and later. NotSupported will be returned +// for other cases. The first 16 bytes (128 bits) is of sufficient quality +// for almost all applications, and shorter prefixes are usable as a +// hash of the full unique id. +// +// Note: .c_str() is not compatible with binary char strings, so using +// .c_str() on the result will often result in information loss and very +// poor uniqueness probability. +// +// More detail: the value is *guaranteed* unique for SST files +// generated in the same process (even different DBs, RocksDB >= 6.26), +// and first 128 bits are guaranteed not "all zeros" (RocksDB >= 6.26) +// so that the "all zeros" value can be used reliably for a null ID. +// These IDs are more than sufficient for SST uniqueness within each of +// many DBs or hosts. For an extreme example assuming random IDs, consider +// 10^9 hosts each with 10^9 live SST files being replaced at 10^6/second. 
+// Such a service would need to run for 10 million years to see an ID
+// collision among live SST files on any host.
+//
+// And assuming one generates many SST files in the lifetime of each process,
+// the probability of ID collisions is much "better than random"; see
+// https://github.com/pdillinger/unique_id
+Status GetUniqueIdFromTableProperties(const TableProperties &props,
+                                      std::string *out_id);
+
+// Computes a 192-bit (24 binary char) stable, universally unique ID
+// with an extra 64 bits of uniqueness compared to the standard ID. It is only
+// appropriate to use this ID instead of the 128-bit ID if ID collisions
+// between files among any hosts in a vast fleet are a problem, such as a
+// shared global namespace for SST file backups. Under this criterion, the
+// extreme example above would expect a global file ID collision every 4 days
+// with 128-bit IDs (using some worst-case assumptions about process
+// lifetime). It's 10^17 years with 192-bit IDs.
+Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props,
+                                              std::string *out_id);
+
+// Converts a binary string (unique id) to hexadecimal, with each 64 bits
+// separated by '-', e.g. 6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B
+// Also works on unique id prefix.
+std::string UniqueIdToHumanString(const std::string &id);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/universal_compaction.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/universal_compaction.h
new file mode 100644
index 0000000000..0b0a85e1c8
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/universal_compaction.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+#include <climits>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Algorithm used to make a compaction request stop picking new files
+// into a single compaction run
+//
+enum CompactionStopStyle {
+  kCompactionStopStyleSimilarSize,  // pick files of similar size
+  kCompactionStopStyleTotalSize  // total size of picked files > next file
+};
+
+class CompactionOptionsUniversal {
+ public:
+  // Percentage flexibility while comparing file size. If the candidate file(s)
+  // size is 1% smaller than the next file's size, then include next file into
+  // this candidate set.
+  // Default: 1
+  unsigned int size_ratio;
+
+  // The minimum number of files in a single compaction run. Default: 2
+  unsigned int min_merge_width;
+
+  // The maximum number of files in a single compaction run. Default: UINT_MAX
+  unsigned int max_merge_width;
+
+  // The size amplification is defined as the amount (in percentage) of
+  // additional storage needed to store a single byte of data in the database.
+  // For example, a size amplification of 2% means that a database that
+  // contains 100 bytes of user-data may occupy up to 102 bytes of
+  // physical storage. By this definition, a fully compacted database has
+  // a size amplification of 0%. RocksDB uses the following heuristic
+  // to calculate size amplification: it assumes that all files excluding
+  // the earliest file contribute to the size amplification.
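+  // For example (illustrative): if the earliest file holds 100 bytes of user
+  // data and all newer files together hold 50 bytes, the estimated size
+  // amplification is 50%.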
+ // Default: 200, which means that a 100 byte database could require up to + // 300 bytes of storage. + unsigned int max_size_amplification_percent; + + // If this option is set to be -1 (the default value), all the output files + // will follow compression type specified. + // + // If this option is not negative, we will try to make sure compressed + // size is just above this value. In normal cases, at least this percentage + // of data will be compressed. + // When we are compacting to a new file, here is the criteria whether + // it needs to be compressed: assuming here are the list of files sorted + // by generation time: + // A1...An B1...Bm C1...Ct + // where A1 is the newest and Ct is the oldest, and we are going to compact + // B1...Bm, we calculate the total size of all the files as total_size, as + // well as the total size of C1...Ct as total_C, the compaction output file + // will be compressed iff + // total_C / total_size < this percentage + // Default: -1 + int compression_size_percent; + + // The algorithm used to stop picking files into a single compaction run + // Default: kCompactionStopStyleTotalSize + CompactionStopStyle stop_style; + + // Option to optimize the universal multi level compaction by enabling + // trivial move for non overlapping files. + // Default: false + bool allow_trivial_move; + + // EXPERIMENTAL + // If true, try to limit compaction size under max_compaction_bytes. + // This might cause higher write amplification, but can prevent some + // problem caused by large compactions. + // Default: false + bool incremental; + + // Default set of parameters + CompactionOptionsUniversal() + : size_ratio(1), + min_merge_width(2), + max_merge_width(UINT_MAX), + max_size_amplification_percent(200), + compression_size_percent(-1), + stop_style(kCompactionStopStyleTotalSize), + allow_trivial_move(false), + incremental(false) {} +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/agg_merge.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/agg_merge.h new file mode 100644 index 0000000000..4e21082db6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/agg_merge.h @@ -0,0 +1,138 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { +// The feature is still in development so the encoding format is subject +// to change. +// +// Aggregation Merge Operator is a merge operator that allows users to +// aggregate merge operands of different keys with different registered +// aggregation functions. The aggregation can also change for the same +// key if the functions store the data in the same format. +// The target application highly overlaps with merge operator in general +// but we try to provide a better interface so that users are more likely +// to use pre-implemented plug-in functions and connect with existing +// third-party aggregation functions (such as those from SQL engines). +// In this case, the need for users to write customized C++ plug-in code +// is reduced. 
+// If the idea proves to be useful, we might consider moving it to be
+// a core functionality of RocksDB, and reduce the support of merge
+// operators.
+//
+// Users can implement aggregation functions by implementing the abstract
+// class Aggregator, and register it using AddAggregator().
+// The merge operator can be retrieved from GetAggMergeOperator() and
+// it is a singleton.
+//
+// Users can push values to be updated with a merge operand encoded with
+// registered function name and payload using EncodeAggFuncAndPayload(),
+// and the merge operator will invoke the aggregation function.
+// An example:
+//
+//    // Assume class ExampleSumAggregator is implemented to do simple sum.
+//    AddAggregator("sum", std::make_unique<ExampleSumAggregator>());
+//    std::shared_ptr<MergeOperator> mp_guard = CreateAggMergeOperator();
+//    options.merge_operator = mp_guard.get();
+//    ...... // Creating DB
+//
+//
+//    std::string encoded_value;
+//    s = EncodeAggFuncAndPayload(kUnnamedFuncName, "200", encoded_value);
+//    assert(s.ok());
+//    db->Put(WriteOptions(), "foo", encoded_value);
+//    s = EncodeAggFuncAndPayload("sum", "200", encoded_value);
+//    assert(s.ok());
+//    db->Merge(WriteOptions(), "foo", encoded_value);
+//    s = EncodeAggFuncAndPayload("sum", "200", encoded_value);
+//    assert(s.ok());
+//    db->Merge(WriteOptions(), "foo", encoded_value);
+//
+//    std::string value;
+//    Status s = db->Get(ReadOptions(), "foo", &value);
+//    assert(s.ok());
+//    Slice func, aggregated_value;
+//    assert(ExtractAggFuncAndValue(value, func, aggregated_value));
+//    assert(func == "sum");
+//    assert(aggregated_value == "600");
+//
+//
+// DB::Put() can also be used to add a payload in the same way as Merge().
+//
+// kUnnamedFuncName can be used as a placeholder function name. This will
+// be aggregated with merge operands inserted later based on the function
+// name given there.
+//
+// If the aggregation function is not registered, or there is an error
+// returned by the aggregation function, the result will be encoded with a
+// fake aggregation function kErrorFuncName, with each merge operand encoded
+// into a list that can be extracted using ExtractList();
+//
+// If users add a merge operand using a different aggregation function from
+// the previous one, the merge operands for the previous one are aggregated
+// and the payload part of the result is treated as the first payload of
+// the items for the new aggregation function. For example, users can
+// Merge("plus, 1"), Merge("plus, 2"), Merge("minus, 3") and the aggregation
+// result would be "minus, 0".
+//
+
+// A class used to aggregate data per key/value. The plug-in function is
+// implemented and registered using AddAggregator(). And then use it
+// with the merge operator created using CreateAggMergeOperator().
+class Aggregator {
+ public:
+  virtual ~Aggregator() {}
+  // The input list is in reverse insertion order, with values[0] to be
+  // the one inserted last and values.back() to be the one inserted first.
+  // The oldest one might be from Get().
+  // Return whether aggregation succeeded. False for aggregation error.
+  virtual bool Aggregate(const std::vector<Slice>& values,
+                         std::string& result) const = 0;
+
+  // True if a partial aggregation should be invoked. Some aggregators
+  // might opt to skip partial aggregation if possible.
+  virtual bool DoPartialAggregate() const { return true; }
+};
+
+// The function adds an aggregation plugin by function name. It is used
+// by all the aggregation operators created using CreateAggMergeOperator().
+// It's currently not thread safe to run concurrently with the aggregation +// merge operator. It is recommended that all the aggregation function +// is added before calling CreateAggMergeOperator(). +Status AddAggregator(const std::string& function_name, + std::unique_ptr&& agg); + +// Get the singleton instance of merge operator for aggregation. +// Always the same one is returned with a shared_ptr is hold as a +// static variable by the function. +// This is done so because options.merge_operator is shared_ptr. +std::shared_ptr GetAggMergeOperator(); + +// Encode aggregation function and payload that can be consumed by aggregation +// merge operator. +Status EncodeAggFuncAndPayload(const Slice& function_name, const Slice& payload, + std::string& output); +// Helper function to extract aggregation function name and payload. +// Return false if it fails to decode. +bool ExtractAggFuncAndValue(const Slice& op, Slice& func, Slice& value); + +// Extract encoded list. This can be used to extract error merge operands when +// the returned function name is kErrorFuncName. +bool ExtractList(const Slice& encoded_list, std::vector& decoded_list); + +// Special function name that allows it to be merged to subsequent type. +extern const std::string kUnnamedFuncName; + +// Special error function name reserved for merging or aggregation error. +extern const std::string kErrorFuncName; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/backup_engine.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/backup_engine.h new file mode 100644 index 0000000000..204d12d6fe --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/backup_engine.h @@ -0,0 +1,689 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/io_status.h" +#include "rocksdb/metadata.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class BackupEngineReadOnlyBase; +class BackupEngine; + +// The default DB file checksum function name. +constexpr char kDbFileChecksumFuncName[] = "FileChecksumCrc32c"; +// The default BackupEngine file checksum function name. +constexpr char kBackupFileChecksumFuncName[] = "crc32c"; + +struct BackupEngineOptions { + // Where to keep the backup files. Has to be different than dbname_ + // Best to set this to dbname_ + "/backups" + // Required + std::string backup_dir; + + // Backup Env object. It will be used for backup file I/O. If it's + // nullptr, backups will be written out using DBs Env. If it's + // non-nullptr, backup's I/O will be performed using this object. + // Default: nullptr + Env* backup_env; + + // share_table_files supports table and blob files. + // + // If share_table_files == true, the backup directory will share table and + // blob files among backups, to save space among backups of the same DB and to + // enable incremental backups by only copying new files. 
+ // If share_table_files == false, each backup will be on its own and will not + // share any data with other backups. + // + // default: true + bool share_table_files; + + // Backup info and error messages will be written to info_log + // if non-nullptr. + // Default: nullptr + Logger* info_log; + + // If sync == true, we can guarantee you'll get consistent backup and + // restore even on a machine crash/reboot. Backup and restore processes are + // slower with sync enabled. If sync == false, we can only guarantee that + // other previously synced backups and restores are not modified while + // creating a new one. + // Default: true + bool sync; + + // If true, it will delete whatever backups there are already + // Default: false + bool destroy_old_data; + + // If false, we won't backup log files. This option can be useful for backing + // up in-memory databases where log file are persisted, but table files are in + // memory. + // Default: true + bool backup_log_files; + + // Max bytes that can be transferred in a second during backup. + // If 0, go as fast as you can + // This limit only applies to writes. To also limit reads, + // a rate limiter able to also limit reads (e.g, its mode = kAllIo) + // have to be passed in through the option "backup_rate_limiter" + // Default: 0 + uint64_t backup_rate_limit; + + // Backup rate limiter. Used to control transfer speed for backup. If this is + // not null, backup_rate_limit is ignored. + // Default: nullptr + std::shared_ptr backup_rate_limiter{nullptr}; + + // Max bytes that can be transferred in a second during restore. + // If 0, go as fast as you can + // This limit only applies to writes. To also limit reads, + // a rate limiter able to also limit reads (e.g, its mode = kAllIo) + // have to be passed in through the option "restore_rate_limiter" + // Default: 0 + uint64_t restore_rate_limit; + + // Restore rate limiter. Used to control transfer speed during restore. If + // this is not null, restore_rate_limit is ignored. + // Default: nullptr + std::shared_ptr restore_rate_limiter{nullptr}; + + // share_files_with_checksum supports table and blob files. + // + // Only used if share_table_files is set to true. Setting to false is + // DEPRECATED and potentially dangerous because in that case BackupEngine + // can lose data if backing up databases with distinct or divergent + // history, for example if restoring from a backup other than the latest, + // writing to the DB, and creating another backup. Setting to true (default) + // prevents these issues by ensuring that different table files (SSTs) and + // blob files with the same number are treated as distinct. See + // share_files_with_checksum_naming and ShareFilesNaming. + // + // Default: true + bool share_files_with_checksum; + + // Up to this many background threads will copy files for CreateNewBackup() + // and RestoreDBFromBackup() + // Default: 1 + int max_background_operations; + + // During backup user can get callback every time next + // callback_trigger_interval_size bytes being copied. + // Default: 4194304 + uint64_t callback_trigger_interval_size; + + // For BackupEngineReadOnly, Open() will open at most this many of the + // latest non-corrupted backups. + // + // Note: this setting is ignored (behaves like INT_MAX) for any kind of + // writable BackupEngine because it would inhibit accounting for shared + // files for proper backup deletion, including purging any incompletely + // created backups on creation of a new backup. 
+ // + // Default: INT_MAX + int max_valid_backups_to_open; + + // ShareFilesNaming describes possible naming schemes for backup + // table and blob file names when they are stored in the + // shared_checksum directory (i.e., both share_table_files and + // share_files_with_checksum are true). + enum ShareFilesNaming : uint32_t { + // Backup blob filenames are __.blob and + // backup SST filenames are __.sst + // where is an unsigned decimal integer. This is the + // original/legacy naming scheme for share_files_with_checksum, + // with two problems: + // * At massive scale, collisions on this triple with different file + // contents is plausible. + // * Determining the name to use requires computing the checksum, + // so generally requires reading the whole file even if the file + // is already backed up. + // + // ** ONLY RECOMMENDED FOR PRESERVING OLD BEHAVIOR ** + kLegacyCrc32cAndFileSize = 1U, + + // Backup SST filenames are _s.sst. This + // pair of values should be very strongly unique for a given SST file + // and easily determined before computing a checksum. The 's' indicates + // the value is a DB session id, not a checksum. + // + // Exceptions: + // * For blob files, kLegacyCrc32cAndFileSize is used as currently + // db_session_id is not supported by the blob file format. + // * For old SST files without a DB session id, kLegacyCrc32cAndFileSize + // will be used instead, matching the names assigned by RocksDB versions + // not supporting the newer naming scheme. + // * See also flags below. + kUseDbSessionId = 2U, + + kMaskNoNamingFlags = 0xffffU, + + // If not already part of the naming scheme, insert + // _ + // before .sst and .blob in the name. In case of user code actually parsing + // the last _ before the .sst and .blob as the file size, this + // preserves that feature of kLegacyCrc32cAndFileSize. In other words, this + // option makes official that unofficial feature of the backup metadata. + // + // We do not consider SST and blob file sizes to have sufficient entropy to + // contribute significantly to naming uniqueness. + kFlagIncludeFileSize = 1U << 31, + + kMaskNamingFlags = ~kMaskNoNamingFlags, + }; + + // Naming option for share_files_with_checksum table and blob files. See + // ShareFilesNaming for details. + // + // Modifying this option cannot introduce a downgrade compatibility issue + // because RocksDB can read, restore, and delete backups using different file + // names, and it's OK for a backup directory to use a mixture of table and + // blob files naming schemes. + // + // However, modifying this option and saving more backups to the same + // directory can lead to the same file getting saved again to that + // directory, under the new shared name in addition to the old shared + // name. + // + // Default: kUseDbSessionId | kFlagIncludeFileSize + // + // Note: This option comes into effect only if both share_files_with_checksum + // and share_table_files are true. + ShareFilesNaming share_files_with_checksum_naming; + + // Major schema version to use when writing backup meta files + // 1 (default) - compatible with very old versions of RocksDB. + // 2 - can be read by RocksDB versions >= 6.19.0. 
+  //   * (Experimental) saving and restoring file temperature metadata
+  int schema_version = 1;
+
+  // (Experimental - subject to change or removal) When taking a backup and
+  // saving file temperature info (minimum schema_version is 2), there are
+  // two potential sources of truth for the placement of files into temperature
+  // tiers: (a) the current file temperature reported by the FileSystem or
+  // (b) the expected file temperature recorded in the DB manifest. When this
+  // option is false (default), (b) overrides (a) if both are not UNKNOWN.
+  // When true, (a) overrides (b) if both are not UNKNOWN. Regardless of this
+  // setting, a known temperature overrides UNKNOWN.
+  bool current_temperatures_override_manifest = false;
+
+  void Dump(Logger* logger) const;
+
+  explicit BackupEngineOptions(
+      const std::string& _backup_dir, Env* _backup_env = nullptr,
+      bool _share_table_files = true, Logger* _info_log = nullptr,
+      bool _sync = true, bool _destroy_old_data = false,
+      bool _backup_log_files = true, uint64_t _backup_rate_limit = 0,
+      uint64_t _restore_rate_limit = 0, int _max_background_operations = 1,
+      uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024,
+      int _max_valid_backups_to_open = INT_MAX,
+      ShareFilesNaming _share_files_with_checksum_naming =
+          static_cast<ShareFilesNaming>(kUseDbSessionId | kFlagIncludeFileSize))
+      : backup_dir(_backup_dir),
+        backup_env(_backup_env),
+        share_table_files(_share_table_files),
+        info_log(_info_log),
+        sync(_sync),
+        destroy_old_data(_destroy_old_data),
+        backup_log_files(_backup_log_files),
+        backup_rate_limit(_backup_rate_limit),
+        restore_rate_limit(_restore_rate_limit),
+        share_files_with_checksum(true),
+        max_background_operations(_max_background_operations),
+        callback_trigger_interval_size(_callback_trigger_interval_size),
+        max_valid_backups_to_open(_max_valid_backups_to_open),
+        share_files_with_checksum_naming(_share_files_with_checksum_naming) {
+    assert(share_table_files || !share_files_with_checksum);
+    assert((share_files_with_checksum_naming & kMaskNoNamingFlags) != 0);
+  }
+};
+
+inline BackupEngineOptions::ShareFilesNaming operator&(
+    BackupEngineOptions::ShareFilesNaming lhs,
+    BackupEngineOptions::ShareFilesNaming rhs) {
+  uint32_t l = static_cast<uint32_t>(lhs);
+  uint32_t r = static_cast<uint32_t>(rhs);
+  assert(r == BackupEngineOptions::kMaskNoNamingFlags ||
+         (r & BackupEngineOptions::kMaskNoNamingFlags) == 0);
+  return static_cast<BackupEngineOptions::ShareFilesNaming>(l & r);
+}
+
+inline BackupEngineOptions::ShareFilesNaming operator|(
+    BackupEngineOptions::ShareFilesNaming lhs,
+    BackupEngineOptions::ShareFilesNaming rhs) {
+  uint32_t l = static_cast<uint32_t>(lhs);
+  uint32_t r = static_cast<uint32_t>(rhs);
+  assert((r & BackupEngineOptions::kMaskNoNamingFlags) == 0);
+  return static_cast<BackupEngineOptions::ShareFilesNaming>(l | r);
+}
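+
+// Example (an illustrative sketch, not part of the upstream header): opening a
+// BackupEngine with mostly-default options and backing up an already-open DB.
+// The backup directory "/tmp/rocksdb_backups" and the variable `db` (a
+// rocksdb::DB*) are assumptions for illustration only.
+//
+//   rocksdb::BackupEngineOptions opts("/tmp/rocksdb_backups");
+//   rocksdb::BackupEngine* backup_engine = nullptr;
+//   rocksdb::IOStatus s = rocksdb::BackupEngine::Open(
+//       opts, rocksdb::Env::Default(), &backup_engine);
+//   if (s.ok()) {
+//     s = backup_engine->CreateNewBackup(rocksdb::CreateBackupOptions(), db);
+//   }
+//   delete backup_engine;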
+// Identifying information about a backup shared file that is (or might be)
+// excluded from a backup using exclude_files_callback.
+struct BackupExcludedFileInfo {
+  explicit BackupExcludedFileInfo(const std::string& _relative_file)
+      : relative_file(_relative_file) {}
+
+  // File name and path relative to the backup dir.
+  std::string relative_file;
+};
+
+// An auxiliary structure for exclude_files_callback
+struct MaybeExcludeBackupFile {
+  explicit MaybeExcludeBackupFile(BackupExcludedFileInfo&& _info)
+      : info(std::move(_info)) {}
+
+  // Identifying information about a backup shared file that could be excluded
+  const BackupExcludedFileInfo info;
+
+  // API user sets to true if the file should be excluded from this backup
+  bool exclude_decision = false;
+};
+
+struct CreateBackupOptions {
+  // Flush will always trigger if 2PC is enabled.
+  // If write-ahead logs are disabled, set flush_before_backup=true to
+  // avoid losing unflushed key/value pairs from the memtable.
+  bool flush_before_backup = false;
+
+  // Callback for reporting progress, based on callback_trigger_interval_size.
+  //
+  // An exception thrown from the callback will result in Status::Aborted from
+  // the operation.
+  std::function<void()> progress_callback = {};
+
+  // A callback that allows the API user to select files for exclusion, such
+  // as if the files are known to exist in an alternate backup directory.
+  // Only "shared" files can be excluded from backups. This is an advanced
+  // feature because the BackupEngine user is trusted to keep track of files
+  // such that the DB can be restored.
+  //
+  // Input to the callback is a [begin,end) range of sharable files that are
+  // live in the DB being backed up, and the callback implementation sets
+  // exclude_decision=true for files to exclude. A callback offers maximum
+  // flexibility, e.g. for when remote files are unavailable at backup time
+  // but their existence has been recorded somewhere. In case of an empty or
+  // no-op callback, all files are included in the backup.
+  //
+  // To restore the DB, RestoreOptions::alternate_dirs must be used to provide
+  // the excluded files.
+  //
+  // An exception thrown from the callback will result in Status::Aborted from
+  // the operation.
+  std::function<void(MaybeExcludeBackupFile* files_begin,
+                     MaybeExcludeBackupFile* files_end)>
+      exclude_files_callback = {};
+
+  // If false, background_thread_cpu_priority is ignored.
+  // Otherwise, the CPU priority can be decreased; if you try to increase the
+  // priority, the priority will not change.
+  // The initial priority of the threads is CpuPriority::kNormal,
+  // so you can decrease to priorities lower than kNormal.
+  bool decrease_background_thread_cpu_priority = false;
+  CpuPriority background_thread_cpu_priority = CpuPriority::kNormal;
+};
+
+struct RestoreOptions {
+  // If true, restore won't overwrite the existing log files in wal_dir. It
+  // will also move all log files from the archive directory to wal_dir. Use
+  // this option in combination with BackupEngineOptions::backup_log_files =
+  // false for persisting in-memory databases.
+  // Default: false
+  bool keep_log_files;
+
+  // For backups that were created using exclude_files_callback, this
+  // option enables restoring those backups by providing BackupEngines on
+  // directories known to contain the required files.
+  std::forward_list<BackupEngineReadOnlyBase*> alternate_dirs;
+
+  explicit RestoreOptions(bool _keep_log_files = false)
+      : keep_log_files(_keep_log_files) {}
+};
+
+using BackupID = uint32_t;
+
+using BackupFileInfo = FileStorageInfo;
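+
+// Example (an illustrative sketch, not part of the upstream header): restoring
+// the latest backup into the hypothetical directory "/tmp/restored_db", used
+// here for both db_dir and wal_dir. `backup_engine` is assumed to be an open
+// BackupEngine*.
+//
+//   rocksdb::RestoreOptions restore_opts;
+//   rocksdb::IOStatus s = backup_engine->RestoreDBFromLatestBackup(
+//       restore_opts, "/tmp/restored_db", "/tmp/restored_db");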
+struct BackupInfo {
+  BackupID backup_id = 0U;
+
+  // Creation time, according to GetCurrentTime
+  int64_t timestamp = 0;
+
+  // Total size in bytes (based on file payloads, not including filesystem
+  // overheads or backup meta file)
+  uint64_t size = 0U;
+
+  // Number of backed up files, some of which might be shared with other
+  // backups. Does not include backup meta file.
+  uint32_t number_files = 0U;
+
+  // Backup API user metadata
+  std::string app_metadata;
+
+  // Backup file details, if requested with include_file_details=true.
+  // Does not include excluded_files.
+  std::vector<BackupFileInfo> file_details;
+
+  // Identifying information about shared files that were excluded from the
+  // created backup. See exclude_files_callback and alternate_dirs.
+  // This information is only provided if include_file_details=true.
+  std::vector<BackupExcludedFileInfo> excluded_files;
+
+  // DB "name" (a directory in the backup_env) for opening this backup as a
+  // read-only DB. This should also be used as the DBOptions::wal_dir, such
+  // as by default setting wal_dir="". See also env_for_open.
+  // This field is only set if include_file_details=true
+  std::string name_for_open;
+
+  // An Env(+FileSystem) for opening this backup as a read-only DB, with
+  // DB::OpenForReadOnly or similar. This field is only set if
+  // include_file_details=true. (The FileSystem in this Env takes care
+  // of making shared backup files openable from the `name_for_open` DB
+  // directory.) See also name_for_open.
+  //
+  // This Env might or might not be shared with other backups. To work
+  // around DBOptions::env being a raw pointer, this is a shared_ptr so
+  // that keeping either this BackupInfo, the BackupEngine, or a copy of
+  // this shared_ptr alive is sufficient to keep the Env alive for use by
+  // a read-only DB.
+  std::shared_ptr<Env> env_for_open;
+
+  BackupInfo() {}
+
+  explicit BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size,
+                      uint32_t _number_files, const std::string& _app_metadata)
+      : backup_id(_backup_id),
+        timestamp(_timestamp),
+        size(_size),
+        number_files(_number_files),
+        app_metadata(_app_metadata) {}
+};
+
+class BackupStatistics {
+ public:
+  BackupStatistics() {
+    number_success_backup = 0;
+    number_fail_backup = 0;
+  }
+
+  explicit BackupStatistics(uint32_t _number_success_backup,
+                            uint32_t _number_fail_backup)
+      : number_success_backup(_number_success_backup),
+        number_fail_backup(_number_fail_backup) {}
+
+  ~BackupStatistics() {}
+
+  void IncrementNumberSuccessBackup();
+  void IncrementNumberFailBackup();
+
+  uint32_t GetNumberSuccessBackup() const;
+  uint32_t GetNumberFailBackup() const;
+
+  std::string ToString() const;
+
+ private:
+  uint32_t number_success_backup;
+  uint32_t number_fail_backup;
+};
+
+// Read-only functions of a BackupEngine. (Restore writes to another directory,
+// not the backup directory.) See BackupEngine comments for details on
+// safe concurrent operations.
+class BackupEngineReadOnlyBase {
+ public:
+  virtual ~BackupEngineReadOnlyBase() {}
+
+  // Returns info about the latest good backup in backup_info, or NotFound if
+  // no good backup exists.
+  // Setting include_file_details=true provides information about each
+  // backed-up file in BackupInfo::file_details and more.
+  virtual Status GetLatestBackupInfo(
+      BackupInfo* backup_info, bool include_file_details = false) const = 0;
+
+  // Returns info about a specific backup in backup_info, or NotFound
+  // or Corruption status if the requested backup id does not exist or is
+  // known corrupt.
+  // Setting include_file_details=true provides information about each
+  // backed-up file in BackupInfo::file_details and more.
+  virtual Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info,
+                               bool include_file_details = false) const = 0;
+
+  // Returns info about non-corrupt backups in backup_infos.
+  // Setting include_file_details=true provides information about each
+  // backed-up file in BackupInfo::file_details and more.
+  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_infos,
+                             bool include_file_details = false) const = 0;
+
+  // Returns info about corrupt backups in corrupt_backups.
+  // WARNING: Any write to the BackupEngine could trigger automatic
+  // GarbageCollect(), which could delete files that would be needed to
+  // manually recover a corrupt backup or to preserve an unrecognized (e.g.
+  // incompatible future version) backup.
+  virtual void GetCorruptedBackups(
+      std::vector<BackupID>* corrupt_backup_ids) const = 0;
+
+  // Restore to specified db_dir and wal_dir from backup_id.
+  virtual IOStatus RestoreDBFromBackup(const RestoreOptions& options,
+                                       BackupID backup_id,
+                                       const std::string& db_dir,
+                                       const std::string& wal_dir) const = 0;
+
+  // keep for backward compatibility.
+  virtual IOStatus RestoreDBFromBackup(
+      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& options = RestoreOptions()) const {
+    return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir);
+  }
+
+  // Like RestoreDBFromBackup but restores from latest non-corrupt backup_id
+  virtual IOStatus RestoreDBFromLatestBackup(
+      const RestoreOptions& options, const std::string& db_dir,
+      const std::string& wal_dir) const = 0;
+
+  // keep for backward compatibility.
+  virtual IOStatus RestoreDBFromLatestBackup(
+      const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& options = RestoreOptions()) const {
+    return RestoreDBFromLatestBackup(options, db_dir, wal_dir);
+  }
+
+  // If verify_with_checksum is true, this function
+  // inspects the current checksums and file sizes of backup files to see if
+  // they match our expectation.
+  //
+  // If verify_with_checksum is false, this function
+  // checks that each file exists and that the size of the file matches our
+  // expectation. It does not check file checksums.
+  //
+  // If this BackupEngine created the backup, it compares the files' current
+  // sizes (and current checksums) against the number of bytes written to
+  // them (and the checksums calculated) during creation.
+  // Otherwise, it compares the files' current sizes (and checksums) against
+  // their sizes (and checksums) when the BackupEngine was opened.
+  //
+  // Returns Status::OK() if all checks are good
+  virtual IOStatus VerifyBackup(BackupID backup_id,
+                                bool verify_with_checksum = false) const = 0;
+
+  // Internal use only
+  virtual BackupEngine* AsBackupEngine() = 0;
+};
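+
+// Example (an illustrative sketch, not part of the upstream header): listing
+// all non-corrupt backups through a read-only engine. The backup directory
+// path is an assumption.
+//
+//   rocksdb::BackupEngineReadOnly* ro_engine = nullptr;
+//   rocksdb::IOStatus s = rocksdb::BackupEngineReadOnly::Open(
+//       rocksdb::BackupEngineOptions("/tmp/rocksdb_backups"),
+//       rocksdb::Env::Default(), &ro_engine);
+//   if (s.ok()) {
+//     std::vector<rocksdb::BackupInfo> infos;
+//     ro_engine->GetBackupInfo(&infos);
+//   }
+//   delete ro_engine;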
+// Append-only functions of a BackupEngine. See BackupEngine comment for
+// details on the distinction between Append and Write operations and safe
+// concurrent operations.
+class BackupEngineAppendOnlyBase {
+ public:
+  virtual ~BackupEngineAppendOnlyBase() {}
+
+  // Same as CreateNewBackup, but stores extra application metadata.
+  virtual IOStatus CreateNewBackupWithMetadata(
+      const CreateBackupOptions& options, DB* db,
+      const std::string& app_metadata, BackupID* new_backup_id = nullptr) = 0;
+
+  // keep here for backward compatibility.
+  virtual IOStatus CreateNewBackupWithMetadata(
+      DB* db, const std::string& app_metadata, bool flush_before_backup = false,
+      std::function<void()> progress_callback = []() {}) {
+    CreateBackupOptions options;
+    options.flush_before_backup = flush_before_backup;
+    options.progress_callback = progress_callback;
+    return CreateNewBackupWithMetadata(options, db, app_metadata);
+  }
+
+  // Captures the state of the database by creating a new (latest) backup.
+  // On success (OK status), the BackupID of the new backup is saved to
+  // *new_backup_id when not nullptr.
+  // NOTE: db_paths and cf_paths are not supported for creating backups,
+  // and NotSupported will be returned when the DB (without WALs) uses more
+  // than one directory.
+  virtual IOStatus CreateNewBackup(const CreateBackupOptions& options, DB* db,
+                                   BackupID* new_backup_id = nullptr) {
+    return CreateNewBackupWithMetadata(options, db, "", new_backup_id);
+  }
+
+  // keep here for backward compatibility.
+  virtual IOStatus CreateNewBackup(
+      DB* db, bool flush_before_backup = false,
+      std::function<void()> progress_callback = []() {}) {
+    CreateBackupOptions options;
+    options.flush_before_backup = flush_before_backup;
+    options.progress_callback = progress_callback;
+    return CreateNewBackup(options, db);
+  }
+
+  // Call this from another thread if you want to stop the backup
+  // that is currently happening. It will return immediately and will
+  // not wait for the backup to stop.
+  // The backup will stop ASAP and the call to CreateNewBackup will
+  // return Status::Incomplete(). It will not clean up after itself, but
+  // the state will remain consistent. The state will be cleaned up the
+  // next time you call CreateNewBackup or GarbageCollect.
+  virtual void StopBackup() = 0;
+
+  // Will delete any files left over from incomplete creation or deletion of
+  // a backup. This is not normally needed as those operations also clean up
+  // after prior incomplete calls to the same kind of operation (create or
+  // delete). This does not delete corrupt backups but can delete files that
+  // would be needed to manually recover a corrupt backup or to preserve an
+  // unrecognized (e.g. incompatible future version) backup.
+  // NOTE: This is not designed to delete arbitrary files added to the backup
+  // directory outside of BackupEngine, and clean-up is always subject to
+  // permissions on and availability of the underlying filesystem.
+  // NOTE2: For concurrency and interference purposes (see BackupEngine
+  // comment), GarbageCollect (GC) is like other Append operations, even
+  // though it seems different. Although GC can delete physical data, it does
+  // not delete any logical data read by Read operations. GC can interfere
+  // with Append or Write operations in another BackupEngine on the same
+  // backup_dir, because temporary files will be treated as obsolete and
+  // deleted.
+  virtual IOStatus GarbageCollect() = 0;
+};
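+
+// Example (an illustrative sketch, not part of the upstream header): creating
+// a backup that flushes the memtable first and reports progress. `db` and
+// `backup_engine` are assumed to be open handles.
+//
+//   rocksdb::CreateBackupOptions create_opts;
+//   create_opts.flush_before_backup = true;
+//   create_opts.progress_callback = [] { /* e.g. tick a progress meter */ };
+//   rocksdb::BackupID new_id = 0;
+//   rocksdb::IOStatus s =
+//       backup_engine->CreateNewBackup(create_opts, db, &new_id);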
+// A backup engine for organizing and managing backups.
+// This class is not user-extensible.
+//
+// This class declaration adds "Write" operations in addition to the
+// operations from BackupEngineAppendOnlyBase and BackupEngineReadOnlyBase.
+//
+// # Concurrency between threads on the same BackupEngine* object
+//
+// As of version 6.20, BackupEngine* operations are generally thread-safe,
+// using a read-write lock, though single-thread operation is still
+// recommended to avoid TOCTOU bugs. Specifically, particular kinds of
+// concurrent operations behave like this:
+//
+// op1\op2| Read   | Append | Write
+// -------|--------|--------|--------
+// Read   | conc   | block  | block
+// Append | block  | block  | block
+// Write  | block  | block  | block
+//
+// conc = operations safely proceed concurrently
+// block = one of the operations safely blocks until the other completes.
+//         There is generally no guarantee as to which completes first.
+//
+// StopBackup is the only operation that affects an ongoing operation.
+//
+// # Interleaving operations between BackupEngine* objects open on the
+// same backup_dir
+//
+// It is recommended only to have one BackupEngine* object open for a given
+// backup_dir, but it is possible to mix / interleave some operations
+// (regardless of whether they are concurrent) with these caveats:
+//
+// op1\op2| Open   | Read   | Append | Write
+// -------|--------|--------|--------|--------
+// Open   | conc   | conc   | atomic | unspec
+// Read   | conc   | conc   | old    | unspec
+// Append | atomic | old    | unspec | unspec
+// Write  | unspec | unspec | unspec | unspec
+//
+// Special case: Open with destroy_old_data=true is really a Write
+//
+// conc   = operations safely proceed, concurrently when applicable
+// atomic = operations are effectively atomic; if a concurrent Append
+//          operation has not completed at some key point during Open, the
+//          opened BackupEngine* will never see the result of the Append op.
+// old    = Read operations do not include any state changes from other
+//          BackupEngine* objects; they return the state at their Open time.
+// unspec = Behavior is unspecified, including possibly trashing the
+//          backup_dir, but is "memory safe" (no C++ undefined behavior)
+//
+class BackupEngine : public BackupEngineReadOnlyBase,
+                     public BackupEngineAppendOnlyBase {
+ public:
+  virtual ~BackupEngine() {}
+
+  // BackupEngineOptions have to be the same as the ones used in previous
+  // BackupEngines for the same backup directory.
+  static IOStatus Open(const BackupEngineOptions& options, Env* db_env,
+                       BackupEngine** backup_engine_ptr);
+
+  // keep for backward compatibility.
+  static IOStatus Open(Env* db_env, const BackupEngineOptions& options,
+                       BackupEngine** backup_engine_ptr) {
+    return BackupEngine::Open(options, db_env, backup_engine_ptr);
+  }
+
+  // Deletes old backups, keeping the latest num_backups_to_keep alive.
+  // See also DeleteBackup.
+  virtual IOStatus PurgeOldBackups(uint32_t num_backups_to_keep) = 0;
+
+  // Deletes a specific backup. If this operation (or PurgeOldBackups)
+  // is not completed due to crash, power failure, etc. the state
+  // will be cleaned up the next time you call DeleteBackup,
+  // PurgeOldBackups, or GarbageCollect.
+  virtual IOStatus DeleteBackup(BackupID backup_id) = 0;
+};
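+
+// Example (an illustrative sketch, not part of the upstream header): retaining
+// only the five most recent backups on an open BackupEngine*.
+//
+//   rocksdb::IOStatus s = backup_engine->PurgeOldBackups(5);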
+// A variant of BackupEngine that only allows "Read" operations. See
+// BackupEngine comment for details. This class is not user-extensible.
+class BackupEngineReadOnly : public BackupEngineReadOnlyBase {
+ public:
+  virtual ~BackupEngineReadOnly() {}
+
+  static IOStatus Open(const BackupEngineOptions& options, Env* db_env,
+                       BackupEngineReadOnly** backup_engine_ptr);
+
+  // keep for backward compatibility.
+  static IOStatus Open(Env* db_env, const BackupEngineOptions& options,
+                       BackupEngineReadOnly** backup_engine_ptr) {
+    return BackupEngineReadOnly::Open(options, db_env, backup_engine_ptr);
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/cache_dump_load.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/cache_dump_load.h
new file mode 100644
index 0000000000..8b91bb7e15
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/cache_dump_load.h
@@ -0,0 +1,140 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <set>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/secondary_cache.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The classes and functions in this header file are used for dumping out the
+// blocks in a block cache, storing or transferring the blocks to another
+// destination host, and loading these blocks into the secondary cache at the
+// destination host.
+// NOTE that: The classes, functions, and data structures are EXPERIMENTAL!
+// They may be changed in the future when the development continues.
+
+// The major and minor version number of the data format to be
+// stored/transferred via CacheDumpWriter and read out via CacheDumpReader
+static const int kCacheDumpMajorVersion = 0;
+static const int kCacheDumpMinorVersion = 1;
+
+// NOTE that: this class is EXPERIMENTAL! May be changed in the future!
+// This is an abstract class to write or transfer the data that is created by
+// CacheDumper. We pack one block with its block type, dump time, block key in
+// the block cache, block len, block crc32c checksum and the block itself as a
+// unit, and it is stored via WritePacket. Before we call WritePacket, we must
+// call WriteMetadata once, which stores the sequence number, block unit
+// checksum, and block unit size.
+// We provide a file-based CacheDumpWriter that stores the metadata and its
+// packets sequentially in a file as the default implementation. Users can
+// implement their own CacheDumpWriter to store/transfer the data. For example,
+// a user can create a subclass that transfers the metadata and packets on the
+// fly.
+class CacheDumpWriter {
+ public:
+  virtual ~CacheDumpWriter() = default;
+
+  // Called ONCE before the calls to WritePacket
+  virtual IOStatus WriteMetadata(const Slice& metadata) = 0;
+  virtual IOStatus WritePacket(const Slice& data) = 0;
+  virtual IOStatus Close() = 0;
+};
+
+// NOTE that: this class is EXPERIMENTAL! May be changed in the future!
+// This is an abstract class to read or receive the data that is stored
+// or transferred by CacheDumpWriter. Note that ReadMetadata must be called
+// once before calling ReadPacket.
+class CacheDumpReader {
+ public:
+  virtual ~CacheDumpReader() = default;
+  // Called ONCE before the calls to ReadPacket
+  virtual IOStatus ReadMetadata(std::string* metadata) = 0;
+  // Sets data to empty string on EOF
+  virtual IOStatus ReadPacket(std::string* data) = 0;
+  // (Close not needed)
+};
+
+// CacheDumpOptions is the option for CacheDumper and CacheDumpedLoader. Any
+// dump or load process related control variables can be added here.
+struct CacheDumpOptions {
+  SystemClock* clock;
+};
+
+// NOTE that: this class is EXPERIMENTAL! May be changed in the future!
+// This is the class to dump out the blocks in the block cache and store or
+// transfer them via a CacheDumpWriter. In order to dump out the blocks
+// belonging to a certain DB or a list of DBs (a block cache can be shared by
+// many DBs), the user needs to call SetDumpFilter to specify a list of DBs to
+// filter out the blocks that do not belong to those DBs.
+// A typical use case is migrating a DB instance from host A to host B: we need
+// to reopen the DB at host B after all the files are copied to host B. At this
+// moment, the block cache at host B does not have any blocks from this
+// migrated DB, so the read performance can be low due to cache warm up. By
+// using CacheDumper before we shut down the DB at host A and using
+// CacheDumpedLoader at host B before we reopen the DB, we can warm up the
+// cache ahead of time. This class can be used in other use cases as well.
+class CacheDumper {
+ public:
+  virtual ~CacheDumper() = default;
+  // Only dump the blocks in the block cache that belong to the DBs in this list
+  virtual Status SetDumpFilter(std::vector<DB*> db_list) {
+    (void)db_list;
+    return Status::NotSupported("SetDumpFilter is not supported");
+  }
+  // The main function to dump out all the blocks that satisfy the filter
+  // condition from block cache to a certain CacheDumpWriter in one shot. This
+  // process may take some time.
+  virtual IOStatus DumpCacheEntriesToWriter() {
+    return IOStatus::NotSupported("DumpCacheEntriesToWriter is not supported");
+  }
+};
+
+// NOTE that: this class is EXPERIMENTAL! May be changed in the future!
+// This is the class to load the dumped blocks into the destination cache. For
+// now, we only load the blocks into the SecondaryCache. In the future, we may
+// plan to support loading into the block cache.
+class CacheDumpedLoader {
+ public:
+  virtual ~CacheDumpedLoader() = default;
+  virtual IOStatus RestoreCacheEntriesToSecondaryCache() {
+    return IOStatus::NotSupported(
+        "RestoreCacheEntriesToSecondaryCache is not supported");
+  }
+};
+
+// Get the writer which stores all the metadata and data sequentially to a file
+IOStatus NewToFileCacheDumpWriter(const std::shared_ptr<FileSystem>& fs,
+                                  const FileOptions& file_opts,
+                                  const std::string& file_name,
+                                  std::unique_ptr<CacheDumpWriter>* writer);
+
+// Get the reader which reads out the metadata and data sequentially from a file
+IOStatus NewFromFileCacheDumpReader(const std::shared_ptr<FileSystem>& fs,
+                                    const FileOptions& file_opts,
+                                    const std::string& file_name,
+                                    std::unique_ptr<CacheDumpReader>* reader);
+
+// Get the default cache dumper
+Status NewDefaultCacheDumper(const CacheDumpOptions& dump_options,
+                             const std::shared_ptr<Cache>& cache,
+                             std::unique_ptr<CacheDumpWriter>&& writer,
+                             std::unique_ptr<CacheDumper>* cache_dumper);
+
+// Get the default cache dump loader
+Status NewDefaultCacheDumpedLoader(
+    const CacheDumpOptions& dump_options,
+    const BlockBasedTableOptions& toptions,
+    const std::shared_ptr<SecondaryCache>& secondary_cache,
+    std::unique_ptr<CacheDumpReader>&& reader,
+    std::unique_ptr<CacheDumpedLoader>* cache_dump_loader);
+
+}  // namespace ROCKSDB_NAMESPACE
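+
+// Example (an illustrative sketch, not part of the upstream header): dumping
+// the contents of a block cache to a file. `cache` (a std::shared_ptr<Cache>
+// shared with the DB being dumped) and the file path are assumptions.
+//
+//   rocksdb::CacheDumpOptions dump_opts;
+//   dump_opts.clock = rocksdb::SystemClock::Default().get();
+//   std::unique_ptr<rocksdb::CacheDumpWriter> writer;
+//   rocksdb::IOStatus ios = rocksdb::NewToFileCacheDumpWriter(
+//       rocksdb::FileSystem::Default(), rocksdb::FileOptions(),
+//       "/tmp/block_cache.dump", &writer);
+//   std::unique_ptr<rocksdb::CacheDumper> dumper;
+//   rocksdb::Status s = rocksdb::NewDefaultCacheDumper(
+//       dump_opts, cache, std::move(writer), &dumper);
+//   if (s.ok()) ios = dumper->DumpCacheEntriesToWriter();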
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/checkpoint.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/checkpoint.h
new file mode 100644
index 0000000000..6509f38d96
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/checkpoint.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// A checkpoint is an openable snapshot of a database at a point in time.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+class ColumnFamilyHandle;
+struct LiveFileMetaData;
+struct ExportImportFilesMetaData;
+
+class Checkpoint {
+ public:
+  // Creates a Checkpoint object to be used for creating openable snapshots
+  static Status Create(DB* db, Checkpoint** checkpoint_ptr);
+
+  // Builds an openable snapshot of RocksDB. checkpoint_dir should contain an
+  // absolute path. The specified directory should not exist, since it will be
+  // created by the API.
+  // When a checkpoint is created,
+  // (1) SST and blob files are hard linked if the output directory is on the
+  // same filesystem as the database, and copied otherwise.
+  // (2) other required files (like MANIFEST) are always copied.
+  // log_size_for_flush: if the total log file size is equal to or larger than
+  // this value, then a flush is triggered for all the column families. The
+  // default value is 0, which means flush is always triggered. If you move
+  // away from the default, the checkpoint may not contain up-to-date data
+  // if WAL writing is not always enabled.
+  // A flush will always be triggered if 2PC is enabled.
+  // sequence_number_ptr: if it is not nullptr, the value it points to will be
+  // set to a sequence number guaranteed to be part of the DB, not necessarily
+  // the latest. The default value of this parameter is nullptr.
+  // NOTE: db_paths and cf_paths are not supported for creating checkpoints
+  // and NotSupported will be returned when the DB (without WALs) uses more
+  // than one directory.
+  virtual Status CreateCheckpoint(const std::string& checkpoint_dir,
+                                  uint64_t log_size_for_flush = 0,
+                                  uint64_t* sequence_number_ptr = nullptr);
+
+  // Exports all live SST files of a specified Column Family onto export_dir,
+  // returning SST files information in metadata.
+  // - SST files will be created as hard links when the directory specified
+  //   is in the same partition as the db directory, copied otherwise.
+  // - export_dir should not already exist and will be created by this API.
+  // - Always triggers a flush.
+  virtual Status ExportColumnFamily(ColumnFamilyHandle* handle,
+                                    const std::string& export_dir,
+                                    ExportImportFilesMetaData** metadata);
+
+  virtual ~Checkpoint() {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
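+
+// Example (an illustrative sketch, not part of the upstream header): creating
+// a checkpoint of an open DB `db` in an assumed, not-yet-existing directory.
+//
+//   rocksdb::Checkpoint* checkpoint = nullptr;
+//   rocksdb::Status s = rocksdb::Checkpoint::Create(db, &checkpoint);
+//   if (s.ok()) {
+//     s = checkpoint->CreateCheckpoint("/tmp/db_checkpoint");
+//   }
+//   delete checkpoint;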
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/convenience.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/convenience.h
new file mode 100644
index 0000000000..f61afd69ef
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/convenience.h
@@ -0,0 +1,10 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+// This file was moved to rocksdb/convenience.h
+
+#include "rocksdb/convenience.h"
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/customizable_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/customizable_util.h
new file mode 100644
index 0000000000..adf2540540
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/customizable_util.h
@@ -0,0 +1,322 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// The methods in this file are used to instantiate new Customizable
+// instances of objects. These methods are most typically used by
+// the "CreateFromString" method of a customizable class.
+// If not developing a new Type of customizable class, you probably
+// do not need the methods in this file.
+//
+// See https://github.com/facebook/rocksdb/wiki/RocksDB-Configurable-Objects
+// for more information on how to develop and use customizable objects
+
+#pragma once
+#include <memory>
+#include <unordered_map>
+
+#include "options/configurable_helper.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/object_registry.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Creates a new shared customizable instance object based on the
+// input parameters using the object registry.
+//
+// The id parameter specifies the instance class of the object to create.
+// The opt_map parameter specifies the configuration of the new instance.
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance types
+// are ignored. If invoke_prepare_options=true, the resulting instance will be
+// initialized (via PrepareOptions).
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param id The identifier of the new object being created. This string
+//   will be used by the object registry to locate the appropriate object to
+//   create.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+//   created object
+// @param result The newly created and configured instance.
+template <typename T>
+static Status NewSharedObject(
+    const ConfigOptions& config_options, const std::string& id,
+    const std::unordered_map<std::string, std::string>& opt_map,
+    std::shared_ptr<T>* result) {
+  if (!id.empty()) {
+    Status status;
+    status = config_options.registry->NewSharedObject(id, result);
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      status = Status::OK();
+    } else if (status.ok()) {
+      status = Customizable::ConfigureNewObject(config_options, result->get(),
+                                                opt_map);
+    }
+    return status;
+  } else if (opt_map.empty()) {
+    // There was no ID and no map (everything empty), so reset/clear the result
+    result->reset();
+    return Status::OK();
+  } else {
+    return Status::NotSupported("Cannot reset object ");
+  }
+}
+
+// Creates a new managed customizable instance object based on the
+// input parameters using the object registry. Unlike "shared" objects,
+// managed objects are limited to a single instance per ID.
+//
+// The id parameter specifies the instance class of the object to create.
+// If an object with this id exists in the registry, the existing object
+// will be returned. If the object does not exist, a new one will be created.
+//
+// The opt_map parameter specifies the configuration of the new instance.
+// If the object already exists, the existing object is returned "as is" and
+// this parameter is ignored.
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance types
+// are ignored. If invoke_prepare_options=true, the resulting instance will be
+// initialized (via PrepareOptions).
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param id The identifier of the object. This string
+//   will be used by the object registry to locate the appropriate object to
+//   create or return.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+//   created object
+// @param result The managed instance.
+template <typename T>
+static Status NewManagedObject(
+    const ConfigOptions& config_options, const std::string& id,
+    const std::unordered_map<std::string, std::string>& opt_map,
+    std::shared_ptr<T>* result) {
+  Status status;
+  if (!id.empty()) {
+    status = config_options.registry->GetOrCreateManagedObject(
+        id, result, [config_options, opt_map](T* object) {
+          return object->ConfigureFromMap(config_options, opt_map);
+        });
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      return Status::OK();
+    }
+  } else {
+    status = Status::NotSupported("Cannot reset object ");
+  }
+  return status;
+}
+
+// Creates a new shared Customizable object based on the input parameters.
+// This method parses the input value to determine the type of instance to
+// create. If there is an existing instance (in result) and it has the same ID
+// as the object being created, the existing configuration is stored and used
+// as the default for the new object.
+//
+// The value parameter specifies the instance class of the object to create.
+// If it is a simple string (e.g. BlockBasedTable), then the instance will be
+// created using the default settings. If the value is a set of name-value
+// pairs, then the "id" value is used to determine the instance to create and
+// the remaining parameters are used to configure the object. If name-value
+// pairs are specified, there should be an "id=value" pairing or an error may
+// result.
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance types
+// are ignored. If invoke_prepare_options=true, the resulting instance will be
+// initialized (via PrepareOptions).
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param value Either the simple name of the instance to create, or a set of
+//   name-value pairs to create and initialize the object
+// @param result The newly created instance.
+template <typename T>
+static Status LoadSharedObject(const ConfigOptions& config_options,
+                               const std::string& value,
+                               std::shared_ptr<T>* result) {
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+
+  Status status = Customizable::GetOptionsMap(config_options, result->get(),
+                                              value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else {
+    return NewSharedObject(config_options, id, opt_map, result);
+  }
+}
+
+// Creates a new shared Customizable object based on the input parameters.
+//
+// The value parameter specifies the instance class of the object to create.
+// If it is a simple string (e.g. BlockBasedTable), then the instance will be
+// created using the default settings. If the value is a set of name-value
+// pairs, then the "id" value is used to determine the instance to create and
+// the remaining parameters are used to configure the object. If name-value
+// pairs are specified, there should be an "id=value" pairing or an error may
+// result.
+//
+// The "id" field from the value (either the whole field or "id=XX") is used
+// to determine the type/id of the object to return. For a given id, the same
+// instance of the object will be returned from this method (as opposed to
+// LoadSharedObject, which would create different objects for the same id).
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance types
+// are ignored. If invoke_prepare_options=true, the resulting instance will be
+// initialized (via PrepareOptions).
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param value Either the simple name of the instance to create, or a set of
+//   name-value pairs to create and initialize the object
+// @param result The newly created instance.
+template <typename T>
+static Status LoadManagedObject(const ConfigOptions& config_options,
+                                const std::string& value,
+                                std::shared_ptr<T>* result) {
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, nullptr, value,
+                                              &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (value.empty()) {  // No Id and no options. Clear the object
+    *result = nullptr;
+    return Status::OK();
+  } else {
+    return NewManagedObject(config_options, id, opt_map, result);
+  }
+}
+
+// Creates a new unique pointer customizable instance object based on the
+// input parameters using the object registry.
+// @see NewSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param id The identifier of the new object being created. This string
+//   will be used by the object registry to locate the appropriate object to
+//   create.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+//   created object
+// @param result The newly created and configured instance.
+template <typename T>
+static Status NewUniqueObject(
+    const ConfigOptions& config_options, const std::string& id,
+    const std::unordered_map<std::string, std::string>& opt_map,
+    std::unique_ptr<T>* result) {
+  if (!id.empty()) {
+    Status status;
+    status = config_options.registry->NewUniqueObject(id, result);
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      status = Status::OK();
+    } else if (status.ok()) {
+      status = Customizable::ConfigureNewObject(config_options, result->get(),
+                                                opt_map);
+    }
+    return status;
+  } else if (opt_map.empty()) {
+    // There was no ID and no map (everything empty), so reset/clear the result
+    result->reset();
+    return Status::OK();
+  } else {
+    return Status::NotSupported("Cannot reset object ");
+  }
+}
+
+// Creates a new unique customizable instance object based on the input
+// parameters.
+// @see LoadSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param value Either the simple name of the instance to create, or a set of
+//   name-value pairs to create and initialize the object
+// @param result The newly created instance.
+template <typename T>
+static Status LoadUniqueObject(const ConfigOptions& config_options,
+                               const std::string& value,
+                               std::unique_ptr<T>* result) {
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, result->get(),
+                                              value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else {
+    return NewUniqueObject(config_options, id, opt_map, result);
+  }
+}
+
+// Creates a new static (raw pointer) customizable instance object based on the
+// input parameters using the object registry.
+// @see NewSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param id The identifier of the new object being created. This string
+//   will be used by the object registry to locate the appropriate object to
+//   create.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+//   created object
+// @param result The newly created and configured instance.
+template <typename T>
+static Status NewStaticObject(
+    const ConfigOptions& config_options, const std::string& id,
+    const std::unordered_map<std::string, std::string>& opt_map, T** result) {
+  if (!id.empty()) {
+    Status status;
+    status = config_options.registry->NewStaticObject(id, result);
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      status = Status::OK();
+    } else if (status.ok()) {
+      status =
+          Customizable::ConfigureNewObject(config_options, *result, opt_map);
+    }
+    return status;
+  } else if (opt_map.empty()) {
+    // There was no ID and no map (everything empty), so reset/clear the result
+    *result = nullptr;
+    return Status::OK();
+  } else {
+    return Status::NotSupported("Cannot reset object ");
+  }
+}
+
+// Creates a new static (raw pointer) customizable instance object based on the
+// input parameters.
+// @see LoadSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param value Either the simple name of the instance to create, or a set of
+//   name-value pairs to create and initialize the object
+// @param result The newly created instance.
+template <typename T>
+static Status LoadStaticObject(const ConfigOptions& config_options,
+                               const std::string& value, T** result) {
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, *result, value,
+                                              &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else {
+    return NewStaticObject(config_options, id, opt_map, result);
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
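+
+// Example (an illustrative sketch, not part of the upstream header): a
+// CreateFromString helper for a hypothetical Customizable subclass `MyFilter`,
+// built on LoadSharedObject as described above.
+//
+//   Status MyFilter::CreateFromString(const ConfigOptions& config_options,
+//                                     const std::string& value,
+//                                     std::shared_ptr<MyFilter>* result) {
+//     return LoadSharedObject<MyFilter>(config_options, value, result);
+//   }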
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/db_ttl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/db_ttl.h
new file mode 100644
index 0000000000..12f5cbac0f
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/db_ttl.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Database with TTL support.
+//
+// USE-CASES:
+// This API should be used to open the db when key-values inserted are
+// meant to be removed from the db in a non-strict 'ttl' amount of time.
+// Therefore, this guarantees that key-values inserted will remain in the
+// db for >= ttl amount of time and the db will make efforts to remove the
+// key-values as soon as possible after ttl seconds of their insertion.
+//
+// BEHAVIOUR:
+// TTL is accepted in seconds
+// (int32_t)Timestamp(creation) is suffixed to values in Put internally
+// Expired TTL values are deleted in compaction only:(Timestamp+ttl<time_now)
+// Get/Iterator may return expired entries(compaction not run on them yet)
+// Different TTL may be used during different Opens
+// Example: Open1 at t=0 with ttl=4 and insert k1,v1: k1,v1 gets deleted after
+//          t>=5
+// read_only=true opens in the usual read-only mode. Compactions will not be
+// triggered (neither manual nor automatic), so no expired entries are removed
+//
+// CONSTRAINTS:
+// Not specifying/passing or non-positive TTL behaves like TTL = infinity
+//
+// !!!WARNING!!!:
+// Calling DB::Open directly to re-open a db created by this API will get
+// corrupt values (timestamp suffixed) and no ttl effect will be there
+// during the second Open, so use this API consistently to open the db
+// Be careful when passing ttl with a small positive value because the
+// whole database may be deleted in a small amount of time
+
+class DBWithTTL : public StackableDB {
+ public:
+  virtual Status CreateColumnFamilyWithTtl(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      ColumnFamilyHandle** handle, int ttl) = 0;
+
+  static Status Open(const Options& options, const std::string& dbname,
+                     DBWithTTL** dbptr, int32_t ttl = 0,
+                     bool read_only = false);
+
+  static Status Open(const DBOptions& db_options, const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     DBWithTTL** dbptr, const std::vector<int32_t>& ttls,
+                     bool read_only = false);
+
+  virtual void SetTtl(int32_t ttl) = 0;
+
+  virtual void SetTtl(ColumnFamilyHandle* h, int32_t ttl) = 0;
+
+ protected:
+  explicit DBWithTTL(DB* db) : StackableDB(db) {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
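+
+// Example (an illustrative sketch, not part of the upstream header): opening a
+// TTL DB whose entries expire roughly one day after insertion. The path is an
+// assumption.
+//
+//   rocksdb::DBWithTTL* db = nullptr;
+//   rocksdb::Options options;
+//   options.create_if_missing = true;
+//   rocksdb::Status s =
+//       rocksdb::DBWithTTL::Open(options, "/tmp/ttl_db", &db, 86400);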
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/debug.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/debug.h
new file mode 100644
index 0000000000..e1fc76e3e0
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/debug.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Data associated with a particular version of a key. A database may
+// internally store multiple versions of the same user key due to snapshots,
+// compaction not happening yet, etc.
+struct KeyVersion {
+  KeyVersion() : user_key(""), value(""), sequence(0), type(0) {}
+
+  KeyVersion(const std::string& _user_key, const std::string& _value,
+             SequenceNumber _sequence, int _type)
+      : user_key(_user_key), value(_value), sequence(_sequence), type(_type) {}
+
+  std::string user_key;
+  std::string value;
+  SequenceNumber sequence;
+  int type;
+  std::string GetTypeName() const;
+};
+
+// Returns a listing of all versions of keys in the provided user key range:
+// inclusive-inclusive, i.e., [`begin_key`, `end_key`]. Scanning stops early
+// once `max_num_ikeys` versions have been collected. Since all the returned
+// keys are copied to memory, the memory usage may be huge if the range covers
+// too many keys; `max_num_ikeys` can be used to cap the memory usage.
+// The result is inserted into the provided vector, `key_versions`.
+Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
+                         size_t max_num_ikeys,
+                         std::vector<KeyVersion>* key_versions);
+
+Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
+                         Slice end_key, size_t max_num_ikeys,
+                         std::vector<KeyVersion>* key_versions);
+
+}  // namespace ROCKSDB_NAMESPACE
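+
+// Example (an illustrative sketch, not part of the upstream header): listing
+// at most 100 internal key versions from an open DB `db`. Empty slices are
+// assumed here to denote an unbounded range.
+//
+//   std::vector<rocksdb::KeyVersion> versions;
+//   rocksdb::Status s = rocksdb::GetAllKeyVersions(
+//       db, rocksdb::Slice(), rocksdb::Slice(), 100, &versions);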
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/env_mirror.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/env_mirror.h
new file mode 100644
index 0000000000..2a12612870
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/env_mirror.h
@@ -0,0 +1,179 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2015, Red Hat, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// EnvMirror is an Env implementation that mirrors all file-related
+// operations to two backing Envs (provided at construction time).
+// Writes are mirrored. For read operations, we do the read from both
+// backends and assert that the results match.
+//
+// This is useful when implementing a new Env and ensuring that the
+// semantics and behavior are correct (in that they match that of an
+// existing, stable Env, like the default POSIX one).
+
+#pragma once
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SequentialFileMirror;
+class RandomAccessFileMirror;
+class WritableFileMirror;
+
+class EnvMirror : public EnvWrapper {
+  Env *a_, *b_;
+  bool free_a_, free_b_;
+
+ public:
+  EnvMirror(Env* a, Env* b, bool free_a = false, bool free_b = false)
+      : EnvWrapper(a), a_(a), b_(b), free_a_(free_a), free_b_(free_b) {}
+  ~EnvMirror() {
+    if (free_a_) delete a_;
+    if (free_b_) delete b_;
+  }
+
+  Status NewSequentialFile(const std::string& f,
+                           std::unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) override;
+  Status NewRandomAccessFile(const std::string& f,
+                             std::unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& options) override;
+  Status NewWritableFile(const std::string& f,
+                         std::unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) override;
+  Status ReuseWritableFile(const std::string& fname,
+                           const std::string& old_fname,
+                           std::unique_ptr<WritableFile>* r,
+                           const EnvOptions& options) override;
+  virtual Status NewDirectory(const std::string& name,
+                              std::unique_ptr<Directory>* result) override {
+    std::unique_ptr<Directory> br;
+    Status as = a_->NewDirectory(name, result);
+    Status bs = b_->NewDirectory(name, &br);
+    assert(as == bs);
+    return as;
+  }
+  Status FileExists(const std::string& f) override {
+    Status as = a_->FileExists(f);
+    Status bs = b_->FileExists(f);
+    assert(as == bs);
+    return as;
+  }
+#if defined(_MSC_VER)
+#pragma warning(push)
+// logical operation on address of string constant
+#pragma warning(disable : 4130)
+#endif
+  Status GetChildren(const std::string& dir,
+                     std::vector<std::string>* r) override {
+    std::vector<std::string> ar, br;
+    Status as = a_->GetChildren(dir, &ar);
+    Status bs = b_->GetChildren(dir, &br);
+    assert(as == bs);
+    std::sort(ar.begin(), ar.end());
+    std::sort(br.begin(), br.end());
+    if (!as.ok() || ar != br) {
+      assert(0 == "getchildren results don't match");
+    }
+    *r = ar;
+    return as;
+  }
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+  Status DeleteFile(const std::string& f) override {
+    Status as = a_->DeleteFile(f);
+    Status bs = b_->DeleteFile(f);
+    assert(as == bs);
+    return as;
+  }
+  Status CreateDir(const std::string& d) override {
+    Status as = a_->CreateDir(d);
+    Status bs = b_->CreateDir(d);
+    assert(as == bs);
+    return as;
+  }
+  Status CreateDirIfMissing(const std::string& d) override {
+    Status as = a_->CreateDirIfMissing(d);
+    Status bs = b_->CreateDirIfMissing(d);
+    assert(as == bs);
+    return as;
+  }
+  Status DeleteDir(const std::string& d) override {
+    Status as = a_->DeleteDir(d);
+    Status bs = b_->DeleteDir(d);
+    assert(as == bs);
+    return as;
+  }
+  Status GetFileSize(const std::string& f, uint64_t* s) override {
+    uint64_t asize, bsize;
+    Status as = a_->GetFileSize(f, &asize);
+    Status bs = b_->GetFileSize(f, &bsize);
+    assert(as == bs);
+    assert(!as.ok() || asize == bsize);
+    *s = asize;
+    return as;
+  }
+
+  Status GetFileModificationTime(const std::string& fname,
+                                 uint64_t* file_mtime) override {
+    uint64_t amtime, bmtime;
+    Status as = a_->GetFileModificationTime(fname, &amtime);
+    Status bs = b_->GetFileModificationTime(fname, &bmtime);
+    assert(as == bs);
+    assert(!as.ok() || amtime - bmtime < 10000 || bmtime - amtime < 10000);
+    *file_mtime = amtime;
+    return as;
+  }
+
+  Status RenameFile(const std::string& s, const std::string& t) override {
+    Status as = a_->RenameFile(s, t);
+    Status bs = b_->RenameFile(s, t);
+    assert(as == bs);
+    return as;
+  }
+
+  Status LinkFile(const std::string& s, const std::string& t) override {
+    Status as = a_->LinkFile(s, t);
+    Status bs = b_->LinkFile(s, t);
+    assert(as == bs);
+    return as;
+  }
+
+  class FileLockMirror : public FileLock {
+   public:
+    FileLock *a_, *b_;
+    FileLockMirror(FileLock* a, FileLock* b) : a_(a), b_(b) {}
+  };
+
+  Status LockFile(const std::string& f, FileLock** l) override {
+    FileLock *al, *bl;
+    Status as = a_->LockFile(f, &al);
+    Status bs = b_->LockFile(f, &bl);
+    assert(as == bs);
+    if (as.ok()) *l = new FileLockMirror(al, bl);
+    return as;
+  }
+
+  Status UnlockFile(FileLock* l) override {
+    FileLockMirror* ml = static_cast<FileLockMirror*>(l);
+    Status as = a_->UnlockFile(ml->a_);
+    Status bs = b_->UnlockFile(ml->b_);
+    assert(as == bs);
+    delete ml;
+    return as;
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
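+
+// Example (an illustrative sketch, not part of the upstream header): checking
+// a new Env implementation against the default POSIX Env. `my_env` is an
+// assumed Env* under test.
+//
+//   rocksdb::EnvMirror mirror_env(rocksdb::Env::Default(), my_env);
+//   rocksdb::Options options;
+//   options.env = &mirror_env;
+//   // Open a DB with these options; any divergence between the two backing
+//   // Envs trips an assert.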
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/info_log_finder.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/info_log_finder.h
new file mode 100644
index 0000000000..824f8a3dfe
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/info_log_finder.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This function can be used to list the info logs,
+// given the db pointer.
+Status GetInfoLogList(DB* db, std::vector<std::string>* info_log_list);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/ldb_cmd.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/ldb_cmd.h
new file mode 100644
index 0000000000..af5ee4ba98
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/ldb_cmd.h
@@ -0,0 +1,316 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/ldb_tool.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/ldb_cmd_execute_result.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LDBCommand {
+ public:
+  // Command-line arguments
+  static const std::string ARG_ENV_URI;
+  static const std::string ARG_FS_URI;
+  static const std::string ARG_DB;
+  static const std::string ARG_PATH;
+  static const std::string ARG_SECONDARY_PATH;
+  static const std::string ARG_HEX;
+  static const std::string ARG_KEY_HEX;
+  static const std::string ARG_VALUE_HEX;
+  static const std::string ARG_CF_NAME;
+  static const std::string ARG_TTL;
+  static const std::string ARG_TTL_START;
+  static const std::string ARG_TTL_END;
+  static const std::string ARG_TIMESTAMP;
+  static const std::string ARG_TRY_LOAD_OPTIONS;
+  static const std::string ARG_IGNORE_UNKNOWN_OPTIONS;
+  static const std::string ARG_FROM;
+  static const std::string ARG_TO;
+  static const std::string ARG_MAX_KEYS;
+  static const std::string ARG_BLOOM_BITS;
+  static const std::string ARG_FIX_PREFIX_LEN;
+  static const std::string ARG_COMPRESSION_TYPE;
+  static const std::string ARG_COMPRESSION_MAX_DICT_BYTES;
+  static const std::string ARG_BLOCK_SIZE;
+  static const std::string ARG_AUTO_COMPACTION;
+  static const std::string ARG_DB_WRITE_BUFFER_SIZE;
+  static const std::string ARG_WRITE_BUFFER_SIZE;
+  static const std::string ARG_FILE_SIZE;
+  static const std::string ARG_CREATE_IF_MISSING;
+  static const std::string ARG_NO_VALUE;
+  static const std::string ARG_DISABLE_CONSISTENCY_CHECKS;
+  static const std::string ARG_ENABLE_BLOB_FILES;
+  static const std::string ARG_MIN_BLOB_SIZE;
+  static const std::string ARG_BLOB_FILE_SIZE;
+  static const std::string ARG_BLOB_COMPRESSION_TYPE;
+  static const std::string ARG_ENABLE_BLOB_GARBAGE_COLLECTION;
+  static const std::string ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF;
+  static const std::string ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD;
+  static const std::string ARG_BLOB_COMPACTION_READAHEAD_SIZE;
+  static const std::string ARG_BLOB_FILE_STARTING_LEVEL;
+  static const std::string ARG_PREPOPULATE_BLOB_CACHE;
+  static const std::string ARG_DECODE_BLOB_INDEX;
+  static const std::string ARG_DUMP_UNCOMPRESSED_BLOBS;
+
+  struct ParsedParams {
+    std::string cmd;
+    std::vector<std::string> cmd_params;
+    std::map<std::string, std::string> option_map;
+    std::vector<std::string> flags;
+  };
+
+  static LDBCommand* SelectCommand(const ParsedParams& parsed_params);
+
+  static LDBCommand* InitFromCmdLineArgs(
+      const std::vector<std::string>& args, const Options& options,
+      const LDBOptions& ldb_options,
+      const std::vector<ColumnFamilyDescriptor>* column_families,
+      const std::function<LDBCommand*(const ParsedParams&)>& selector =
+          SelectCommand);
selector = + SelectCommand); + + static LDBCommand* InitFromCmdLineArgs( + int argc, char const* const* argv, const Options& options, + const LDBOptions& ldb_options, + const std::vector* column_families); + + bool ValidateCmdLineOptions(); + + virtual void PrepareOptions(); + + virtual void OverrideBaseOptions(); + + virtual void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts); + + virtual void SetDBOptions(Options options) { options_ = options; } + + virtual void SetColumnFamilies( + const std::vector* column_families) { + if (column_families != nullptr) { + column_families_ = *column_families; + } else { + column_families_.clear(); + } + } + + void SetLDBOptions(const LDBOptions& ldb_options) { + ldb_options_ = ldb_options; + } + + const std::map& TEST_GetOptionMap() { + return option_map_; + } + + const std::vector& TEST_GetFlags() { return flags_; } + + virtual bool NoDBOpen() { return false; } + + virtual ~LDBCommand() { CloseDB(); } + + /* Run the command, and return the execute result. */ + void Run(); + + virtual void DoCommand() = 0; + + LDBCommandExecuteResult GetExecuteState() { return exec_state_; } + + void ClearPreviousRunState() { exec_state_.Reset(); } + + // Consider using Slice::DecodeHex directly instead if you don't need the + // 0x prefix + static std::string HexToString(const std::string& str); + + // Consider using Slice::ToString(true) directly instead if + // you don't need the 0x prefix + static std::string StringToHex(const std::string& str); + + static const char* DELIM; + + protected: + LDBCommandExecuteResult exec_state_; + std::string env_uri_; + std::string fs_uri_; + std::string db_path_; + // If empty, open DB as primary. If non-empty, open the DB as secondary + // with this secondary path. When running against a database opened by + // another process, ldb wll leave the source directory completely intact. + std::string secondary_path_; + std::string column_family_name_; + DB* db_; + DBWithTTL* db_ttl_; + std::map cf_handles_; + + /** + * true implies that this command can work if the db is opened in read-only + * mode. + */ + bool is_read_only_; + + /** If true, the key is input/output as hex in get/put/scan/delete etc. */ + bool is_key_hex_; + + /** If true, the value is input/output as hex in get/put/scan/delete etc. */ + bool is_value_hex_; + + /** If true, the value is treated as timestamp suffixed */ + bool is_db_ttl_; + + // If true, the kvs are output with their insert/modify timestamp in a ttl db + bool timestamp_; + + // If true, try to construct options from DB's option files. + bool try_load_options_; + + // The value passed to options.force_consistency_checks. + bool force_consistency_checks_; + + bool enable_blob_files_; + + bool enable_blob_garbage_collection_; + + bool create_if_missing_; + + /** + * Map of options passed on the command-line. + */ + const std::map option_map_; + + /** + * Flags passed on the command-line. 
+ */ + const std::vector flags_; + + /** List of command-line options valid for this command */ + const std::vector valid_cmd_line_options_; + + /** Shared pointer to underlying environment if applicable **/ + std::shared_ptr env_guard_; + + bool ParseKeyValue(const std::string& line, std::string* key, + std::string* value, bool is_key_hex, bool is_value_hex); + + LDBCommand(const std::map& options, + const std::vector& flags, bool is_read_only, + const std::vector& valid_cmd_line_options); + + void OpenDB(); + + void CloseDB(); + + ColumnFamilyHandle* GetCfHandle(); + + static std::string PrintKeyValue(const std::string& key, + const std::string& value, bool is_key_hex, + bool is_value_hex); + + static std::string PrintKeyValue(const std::string& key, + const std::string& value, bool is_hex); + + /** + * Return true if the specified flag is present in the specified flags vector + */ + static bool IsFlagPresent(const std::vector& flags, + const std::string& flag) { + return (std::find(flags.begin(), flags.end(), flag) != flags.end()); + } + + static std::string HelpRangeCmdArgs(); + + /** + * A helper function that returns a list of command line options + * used by this command. It includes the common options and the ones + * passed in. + */ + static std::vector BuildCmdLineOptions( + std::vector options); + + bool ParseIntOption(const std::map& options, + const std::string& option, int& value, + LDBCommandExecuteResult& exec_state); + + bool ParseDoubleOption(const std::map& options, + const std::string& option, double& value, + LDBCommandExecuteResult& exec_state); + + bool ParseStringOption(const std::map& options, + const std::string& option, std::string* value); + + bool ParseCompressionTypeOption( + const std::map& options, + const std::string& option, CompressionType& value, + LDBCommandExecuteResult& exec_state); + + /** + * Returns the value of the specified option as a boolean. + * default_val is used if the option is not found in options. + * Throws an exception if the value of the option is not + * "true" or "false" (case insensitive). + */ + bool ParseBooleanOption(const std::map& options, + const std::string& option, bool default_val); + + Options options_; + std::vector column_families_; + ConfigOptions config_options_; + LDBOptions ldb_options_; + + private: + /** + * Interpret command line options and flags to determine if the key + * should be input/output in hex. + */ + bool IsKeyHex(const std::map& options, + const std::vector& flags); + + /** + * Interpret command line options and flags to determine if the value + * should be input/output in hex. + */ + bool IsValueHex(const std::map& options, + const std::vector& flags); + + bool IsTryLoadOptions(const std::map& options, + const std::vector& flags); + + /** + * Converts val to a boolean. + * val must be either true or false (case insensitive). + * Otherwise an exception is thrown. + */ + bool StringToBool(std::string val); +}; + +class LDBCommandRunner { + public: + static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name, + bool to_stderr = true); + + // Returns the status code to return. 0 is no error. 
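[Note: RunCommand, declared just below, is the whole programmatic entry point to the ldb tooling. A minimal sketch of a custom main() that drives it; the wiring here is illustrative, not upstream code:

    #include "rocksdb/utilities/ldb_cmd.h"

    int main(int argc, char** argv) {
      ROCKSDB_NAMESPACE::Options options;         // base options for the command
      ROCKSDB_NAMESPACE::LDBOptions ldb_options;  // tool-level settings
      // Selects and runs the LDBCommand named by argv; returns 0 on success.
      return ROCKSDB_NAMESPACE::LDBCommandRunner::RunCommand(
          argc, argv, options, ldb_options, /*column_families=*/nullptr);
    }

The stock ldb binary does essentially this; passing nullptr for column_families opens only the default column family.]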
+  static int RunCommand(
+      int argc, char const* const* argv, Options options,
+      const LDBOptions& ldb_options,
+      const std::vector<ColumnFamilyDescriptor>* column_families);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/ldb_cmd_execute_result.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/ldb_cmd_execute_result.h
new file mode 100644
index 0000000000..57bac33468
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/ldb_cmd_execute_result.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#ifdef FAILED
+#undef FAILED
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class LDBCommandExecuteResult {
+ public:
+  enum State {
+    EXEC_NOT_STARTED = 0,
+    EXEC_SUCCEED = 1,
+    EXEC_FAILED = 2,
+  };
+
+  LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), message_("") {}
+
+  LDBCommandExecuteResult(State state, std::string& msg)
+      : state_(state), message_(msg) {}
+
+  std::string ToString() {
+    std::string ret;
+    switch (state_) {
+      case EXEC_SUCCEED:
+        break;
+      case EXEC_FAILED:
+        ret.append("Failed: ");
+        break;
+      case EXEC_NOT_STARTED:
+        ret.append("Not started: ");
+    }
+    if (!message_.empty()) {
+      ret.append(message_);
+    }
+    return ret;
+  }
+
+  void Reset() {
+    state_ = EXEC_NOT_STARTED;
+    message_ = "";
+  }
+
+  bool IsSucceed() { return state_ == EXEC_SUCCEED; }
+
+  bool IsNotStarted() { return state_ == EXEC_NOT_STARTED; }
+
+  bool IsFailed() { return state_ == EXEC_FAILED; }
+
+  static LDBCommandExecuteResult Succeed(std::string msg) {
+    return LDBCommandExecuteResult(EXEC_SUCCEED, msg);
+  }
+
+  static LDBCommandExecuteResult Failed(std::string msg) {
+    return LDBCommandExecuteResult(EXEC_FAILED, msg);
+  }
+
+ private:
+  State state_;
+  std::string message_;
+
+  bool operator==(const LDBCommandExecuteResult&);
+  bool operator!=(const LDBCommandExecuteResult&);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/leveldb_options.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/leveldb_options.h
new file mode 100644
index 0000000000..7e4a6faa4e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/leveldb_options.h
@@ -0,0 +1,145 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stddef.h>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+class Comparator;
+class Env;
+class FilterPolicy;
+class Logger;
+struct Options;
+class Snapshot;
+
+// Options to control the behavior of a database (passed to
+// DB::Open).
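[Note: the LevelDBOptions struct described below mirrors LevelDB's Options field for field; the point of this header is ConvertOptions(), declared at its end. A hedged usage sketch, with arbitrary field values:

    #include "rocksdb/utilities/leveldb_options.h"

    ROCKSDB_NAMESPACE::Options MigratedOptions() {
      ROCKSDB_NAMESPACE::LevelDBOptions lopts;
      lopts.create_if_missing = true;
      lopts.write_buffer_size = 8 << 20;  // bytes; the LevelDB default is 4MB
      // Produces a RocksDB Options object with roughly equivalent behavior.
      return ROCKSDB_NAMESPACE::ConvertOptions(lopts);
    }
]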
A LevelDBOptions object can be initialized as though +// it were a LevelDB Options object, and then it can be converted into +// a RocksDB Options object. +struct LevelDBOptions { + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator; + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists; + + // If true, the implementation will do aggressive checking of the + // data it is processing and will stop early if it detects any + // errors. This may have unforeseen ramifications: for example, a + // corruption of one DB entry may cause a large number of entries to + // become unreadable or for the entire DB to become unopenable. + // Default: false + bool paranoid_checks; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. + // Default: Env::Default() + Env* env; + + // Any internal progress/error information generated by the db will + // be written to info_log if it is non-NULL, or to a file stored + // in the same directory as the DB contents if info_log is NULL. + // Default: NULL + Logger* info_log; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. + // + // Larger values increase performance, especially during bulk loads. + // Up to two write buffers may be held in memory at the same time, + // so you may wish to adjust this parameter to control memory usage. + // Also, a larger write buffer will result in a longer recovery time + // the next time the database is opened. + // + // Default: 4MB + size_t write_buffer_size; + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set (budget + // one open file per 2MB of working set). + // + // Default: 1000 + int max_open_files; + + // Control over blocks (user data is stored in a set of blocks, and + // a block is the unit of reading from disk). + + // If non-NULL, use the specified cache for blocks. + // If NULL, leveldb will automatically create and use an 8MB internal cache. + // Default: NULL + Cache* block_cache; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + // + // Default: 4K + size_t block_size; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. + // + // Default: 16 + int block_restart_interval; + + // Compress blocks using the specified compression algorithm. This + // parameter can be changed dynamically. + // + // Default: kSnappyCompression, which gives lightweight but fast + // compression. 
+ // + // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: + // ~200-500MB/s compression + // ~400-800MB/s decompression + // Note that these speeds are significantly faster than most + // persistent storage speeds, and therefore it is typically never + // worth switching to kNoCompression. Even if the input data is + // incompressible, the kSnappyCompression implementation will + // efficiently detect that and will switch to uncompressed mode. + CompressionType compression; + + // If non-NULL, use the specified filter policy to reduce disk reads. + // Many applications will benefit from passing the result of + // NewBloomFilterPolicy() here. + // + // Default: NULL + const FilterPolicy* filter_policy; + + // Create a LevelDBOptions object with default values for all fields. + LevelDBOptions(); +}; + +// Converts a LevelDBOptions object into a RocksDB Options object. +Options ConvertOptions(const LevelDBOptions& leveldb_options); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/lua/rocks_lua_custom_library.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/lua/rocks_lua_custom_library.h new file mode 100644 index 0000000000..f617da02be --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/lua/rocks_lua_custom_library.h @@ -0,0 +1,43 @@ +// Copyright (c) 2016, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifdef LUA + +// lua headers +extern "C" { +#include +#include +#include +} + +namespace ROCKSDB_NAMESPACE { +namespace lua { +// A class that used to define custom C Library that is callable +// from Lua script +class RocksLuaCustomLibrary { + public: + virtual ~RocksLuaCustomLibrary() {} + // The name of the C library. This name will also be used as the table + // (namespace) in Lua that contains the C library. + virtual const char* Name() const = 0; + + // Returns a "static const struct luaL_Reg[]", which includes a list of + // C functions. Note that the last entry of this static array must be + // {nullptr, nullptr} as required by Lua. + // + // More details about how to implement Lua C libraries can be found + // in the official Lua document http://www.lua.org/pil/26.2.html + virtual const struct luaL_Reg* Lib() const = 0; + + // A function that will be called right after the library has been created + // and pushed on the top of the lua_State. This custom setup function + // allows developers to put additional table or constant values inside + // the same table / namespace. + virtual void CustomSetup(lua_State* /*L*/) const {} +}; +} // namespace lua +} // namespace ROCKSDB_NAMESPACE +#endif // LUA diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/lua/rocks_lua_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/lua/rocks_lua_util.h new file mode 100644 index 0000000000..3427b65ef6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/lua/rocks_lua_util.h @@ -0,0 +1,55 @@ +// Copyright (c) 2016, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
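[Note: before moving on, a sketch of implementing the RocksLuaCustomLibrary interface above. It assumes the build defines LUA and links the Lua headers; the class and function names are hypothetical:

    extern "C" {
    #include <lua.h>
    #include <lauxlib.h>
    }

    namespace {
    int Hello(lua_State* L) {
      lua_pushstring(L, "hello from C");
      return 1;  // number of results pushed onto the Lua stack
    }
    }  // namespace

    class MyLuaLib : public ROCKSDB_NAMESPACE::lua::RocksLuaCustomLibrary {
     public:
      const char* Name() const override { return "mylib"; }
      const luaL_Reg* Lib() const override {
        // The last entry must be {nullptr, nullptr}, as required by Lua.
        static const luaL_Reg kLib[] = {{"hello", Hello}, {nullptr, nullptr}};
        return kLib;
      }
    };

A Lua script run through LuaStateWrapper with this library installed could then call mylib.hello().]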
+ +#pragma once +// lua headers +extern "C" { +#include +#include +#include +} + +#ifdef LUA +#include +#include + +#include "rocksdb/utilities/lua/rocks_lua_custom_library.h" + +namespace ROCKSDB_NAMESPACE { +namespace lua { +class LuaStateWrapper { + public: + explicit LuaStateWrapper(const std::string& lua_script) { + lua_state_ = luaL_newstate(); + Init(lua_script, {}); + } + LuaStateWrapper( + const std::string& lua_script, + const std::vector>& libraries) { + lua_state_ = luaL_newstate(); + Init(lua_script, libraries); + } + lua_State* GetLuaState() const { return lua_state_; } + ~LuaStateWrapper() { lua_close(lua_state_); } + + private: + void Init( + const std::string& lua_script, + const std::vector>& libraries) { + if (lua_state_) { + luaL_openlibs(lua_state_); + for (const auto& library : libraries) { + luaL_openlib(lua_state_, library->Name(), library->Lib(), 0); + library->CustomSetup(lua_state_); + } + luaL_dostring(lua_state_, lua_script.c_str()); + } + } + + lua_State* lua_state_; +}; +} // namespace lua +} // namespace ROCKSDB_NAMESPACE +#endif // LUA diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/memory_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/memory_util.h new file mode 100644 index 0000000000..b141b4ef0e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/memory_util.h @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/db.h" + +namespace ROCKSDB_NAMESPACE { + +// Returns the current memory usage of the specified DB instances. +class MemoryUtil { + public: + enum UsageType : int { + // Memory usage of all the mem-tables. + kMemTableTotal = 0, + // Memory usage of those un-flushed mem-tables. + kMemTableUnFlushed = 1, + // Memory usage of all the table readers. + kTableReadersTotal = 2, + // Memory usage by Cache. + kCacheTotal = 3, + kNumUsageTypes = 4 + }; + + // Returns the approximate memory usage of different types in the input + // list of DBs and Cache set. For instance, in the output map + // usage_by_type, usage_by_type[kMemTableTotal] will store the memory + // usage of all the mem-tables from all the input rocksdb instances. + // + // Note that for memory usage inside Cache class, we will + // only report the usage of the input "cache_set" without + // including those Cache usage inside the input list "dbs" + // of DBs. + static Status GetApproximateMemoryUsageByType( + const std::vector& dbs, + const std::unordered_set cache_set, + std::map* usage_by_type); +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/object_registry.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/object_registry.h new file mode 100644 index 0000000000..613ef1cd9b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/object_registry.h @@ -0,0 +1,583 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class Customizable; +class Logger; +class ObjectLibrary; + +// Returns a new T when called with a string. Populates the std::unique_ptr +// argument if granting ownership to caller. +template +using FactoryFunc = + std::function*, std::string*)>; + +// The signature of the function for loading factories +// into an object library. This method is expected to register +// factory functions in the supplied ObjectLibrary. +// The ObjectLibrary is the library in which the factories will be loaded. +// The std::string is the argument passed to the loader function. +// The RegistrarFunc should return the number of objects loaded into this +// library +using RegistrarFunc = std::function; + +template +using ConfigureFunc = std::function; + +class ObjectLibrary { + private: + // Base class for an Entry in the Registry. + class Entry { + public: + virtual ~Entry() {} + virtual bool Matches(const std::string& target) const = 0; + virtual const char* Name() const = 0; + }; + + public: + // Class for matching target strings to a pattern. + // Entries consist of a name that starts the pattern and attributes + // The following attributes can be added to the entry: + // -Suffix: Comparable to name(suffix) + // -Separator: Comparable to name(separator).+ or name(separator).* + // -Number: Comparable to name(separator).[0-9]+ + // -AltName: Comparable to (name|alt) + // -Optional: Comparable to name(separator)? + // Multiple separators can be combined and cause multiple matches. + // For example, Pattern("A").AnotherName("B").AddSeparator("@").AddNumber("#") + // is roughly equivalent to "(A|B)@.+#.+" + // + // Note that though this class does provide some regex-style matching, + // it is not a full regex parser and has some key differences: + // - Separators are matched left-most. For example, an entry + // Name("Hello").AddSeparator(" ").AddSuffix("!") would match + // "Hello world!", but not "Hello world!!" + // - No backtracking is necessary, enabling reliably efficient matching + class PatternEntry : public Entry { + private: + enum Quantifier { + kMatchZeroOrMore, // [suffix].* + kMatchAtLeastOne, // [suffix].+ + kMatchExact, // [suffix] + kMatchInteger, // [suffix][0-9]+ + kMatchDecimal, // [suffix][0-9]+[.][0-9]+ + }; + + public: + // Short-cut for creating an entry that matches to a + // Customizable::IndividualId + static PatternEntry AsIndividualId(const std::string& name) { + PatternEntry entry(name, true); + entry.AddSeparator("@"); + entry.AddSeparator("#"); + return entry; + } + + // Creates a new PatternEntry for "name". 
If optional is true, + // Matches will also return true if name==target + explicit PatternEntry(const std::string& name, bool optional = true) + : name_(name), optional_(optional), slength_(0) { + nlength_ = name_.size(); + } + + // Adds a suffix (exact match of separator with no trailing characters) to + // the separator + PatternEntry& AddSuffix(const std::string& suffix) { + separators_.emplace_back(suffix, kMatchExact); + slength_ += suffix.size(); + return *this; + } + + // Adds a separator (exact match of separator with trailing characters) to + // the entry + // If at_least_one is true, the separator must be followed by at least + // one character (e.g. separator.+). + // If at_least_one is false, the separator may be followed by zero or + // more characters (e.g. separator.*). + PatternEntry& AddSeparator(const std::string& separator, + bool at_least_one = true) { + slength_ += separator.size(); + if (at_least_one) { + separators_.emplace_back(separator, kMatchAtLeastOne); + ++slength_; + } else { + separators_.emplace_back(separator, kMatchZeroOrMore); + } + return *this; + } + + // Adds a separator (exact match of separator with trailing numbers) to the + // entry + PatternEntry& AddNumber(const std::string& separator, bool is_int = true) { + separators_.emplace_back(separator, + (is_int) ? kMatchInteger : kMatchDecimal); + slength_ += separator.size() + 1; + return *this; + } + + // Sets another name that this entry will match, similar to (name|alt) + PatternEntry& AnotherName(const std::string& alt) { + names_.emplace_back(alt); + return *this; + } + + // Sets whether the separators are required -- similar to name(separator)? + // If optional is true, then name(separator)? would match + // If optional is false, then the separators must also match + PatternEntry& SetOptional(bool optional) { + optional_ = optional; + return *this; + } + + // Checks to see if the target matches this entry + bool Matches(const std::string& target) const override; + const char* Name() const override { return name_.c_str(); } + + private: + size_t MatchSeparatorAt(size_t start, Quantifier mode, + const std::string& target, size_t tlen, + const std::string& pattern) const; + + bool MatchesTarget(const std::string& name, size_t nlen, + const std::string& target, size_t ylen) const; + std::string name_; // The base name for this entry + size_t nlength_; // The length of name_ + std::vector names_; // Alternative names for this entry + bool optional_; // Whether matching of separators is required + size_t slength_; // The minimum required length to match the separators + std::vector> + separators_; // What to match + }; // End class Entry + + private: + // An Entry containing a FactoryFunc for creating new Objects + template + class FactoryEntry : public Entry { + public: + FactoryEntry(Entry* e, FactoryFunc f) + : entry_(e), factory_(std::move(f)) {} + bool Matches(const std::string& target) const override { + return entry_->Matches(target); + } + const char* Name() const override { return entry_->Name(); } + + // Creates a new T object. 
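[Note: to make the PatternEntry rules above concrete, a hedged sketch; the identifier names are invented:

    ROCKSDB_NAMESPACE::ObjectLibrary::PatternEntry MyFilterPattern() {
      // Matches "myfilter" (optional == true) and "myfilter:10",
      // but not "myfilter:" or "myfilter:1.5" (an integer is required).
      return ROCKSDB_NAMESPACE::ObjectLibrary::PatternEntry("myfilter", true)
          .AddNumber(":", /*is_int=*/true);
    }
]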
+ T* NewFactoryObject(const std::string& target, std::unique_ptr* guard, + std::string* msg) const { + return factory_(target, guard, msg); + } + const FactoryFunc& GetFactory() const { return factory_; } + + private: + std::unique_ptr entry_; // What to match for this entry + FactoryFunc factory_; + }; // End class FactoryEntry + public: + explicit ObjectLibrary(const std::string& id) { id_ = id; } + + const std::string& GetID() const { return id_; } + + // Finds the factory function for the input target. + // @see PatternEntry for the matching rules to target + // @return If matched, the FactoryFunc for this target, else nullptr + template + FactoryFunc FindFactory(const std::string& target) const { + std::unique_lock lock(mu_); + auto factories = factories_.find(T::Type()); + if (factories != factories_.end()) { + for (const auto& e : factories->second) { + if (e->Matches(target)) { + const auto* fe = + static_cast*>(e.get()); + return fe->GetFactory(); + } + } + } + return nullptr; + } + + // Returns the total number of factories registered for this library. + // This method returns the sum of all factories registered for all types. + // @param num_types returns how many unique types are registered. + size_t GetFactoryCount(size_t* num_types) const; + + // Returns the number of factories registered for this library + // for the input type. + // @param num_types returns how many unique types are registered. + size_t GetFactoryCount(const std::string& type) const; + + // Returns the registered factory names for the input type + // names is updated to include the names for the type + void GetFactoryNames(const std::string& type, + std::vector* names) const; + + void GetFactoryTypes(std::unordered_set* types) const; + + void Dump(Logger* logger) const; + + // Registers the factory with the library for the name. + // If name==target, the factory may be used to create a new object. + template + const FactoryFunc& AddFactory(const std::string& name, + const FactoryFunc& func) { + std::unique_ptr entry( + new FactoryEntry(new PatternEntry(name), func)); + AddFactoryEntry(T::Type(), std::move(entry)); + return func; + } + + // Registers the factory with the library for the entry. + // If the entry matches the target, the factory may be used to create a new + // object. + // @see PatternEntry for the matching rules. + // NOTE: This function replaces the old ObjectLibrary::Register() + template + const FactoryFunc& AddFactory(const PatternEntry& entry, + const FactoryFunc& func) { + std::unique_ptr factory( + new FactoryEntry(new PatternEntry(entry), func)); + AddFactoryEntry(T::Type(), std::move(factory)); + return func; + } + + // Invokes the registrar function with the supplied arg for this library. + int Register(const RegistrarFunc& registrar, const std::string& arg) { + return registrar(*this, arg); + } + + // Returns the default ObjectLibrary + static std::shared_ptr& Default(); + + private: + void AddFactoryEntry(const char* type, std::unique_ptr&& entry) { + std::unique_lock lock(mu_); + auto& factories = factories_[type]; + factories.emplace_back(std::move(entry)); + } + + // Protects the entry map + mutable std::mutex mu_; + // ** FactoryFunctions for this loader, organized by type + std::unordered_map>> + factories_; + + // The name for this library + std::string id_; +}; + +// The ObjectRegistry is used to register objects that can be created by a +// name/pattern at run-time where the specific implementation of the object may +// not be known in advance. 
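[Note: before the registry itself, a sketch of registering a factory with the default ObjectLibrary via the AddFactory API above. MyTableFactory is a stand-in for a user-defined subclass; TableFactory (from rocksdb/table.h) is used because it defines T::Type():

    #include <memory>
    #include <string>

    #include "rocksdb/table.h"
    #include "rocksdb/utilities/object_registry.h"

    static void RegisterMyTableFactory() {
      ROCKSDB_NAMESPACE::ObjectLibrary::Default()->AddFactory<
          ROCKSDB_NAMESPACE::TableFactory>(
          "MyTableFactory",
          [](const std::string& /*uri*/,
             std::unique_ptr<ROCKSDB_NAMESPACE::TableFactory>* guard,
             std::string* /*errmsg*/) {
            guard->reset(new MyTableFactory());  // grant ownership via guard
            return guard->get();
          });
    }
]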
+class ObjectRegistry { + public: + static std::shared_ptr NewInstance(); + static std::shared_ptr NewInstance( + const std::shared_ptr& parent); + static std::shared_ptr Default(); + explicit ObjectRegistry(const std::shared_ptr& parent) + : parent_(parent) {} + explicit ObjectRegistry(const std::shared_ptr& library); + + std::shared_ptr AddLibrary(const std::string& id) { + auto library = std::make_shared(id); + AddLibrary(library); + return library; + } + + void AddLibrary(const std::shared_ptr& library) { + std::unique_lock lock(library_mutex_); + libraries_.push_back(library); + } + + void AddLibrary(const std::string& id, const RegistrarFunc& registrar, + const std::string& arg) { + auto library = AddLibrary(id); + library->Register(registrar, arg); + } + + // Finds the factory for target and instantiates a new T. + // Returns NotSupported if no factory is found + // Returns InvalidArgument if a factory is found but the factory failed. + template + Status NewObject(const std::string& target, T** object, + std::unique_ptr* guard) { + assert(guard != nullptr); + guard->reset(); + auto factory = FindFactory(target); + if (factory != nullptr) { + std::string errmsg; + *object = factory(target, guard, &errmsg); + if (*object != nullptr) { + return Status::OK(); + } else if (errmsg.empty()) { + return Status::InvalidArgument( + std::string("Could not load ") + T::Type(), target); + } else { + return Status::InvalidArgument(errmsg, target); + } + } else { + return Status::NotSupported(std::string("Could not load ") + T::Type(), + target); + } + } + // Creates a new unique T using the input factory functions. + // Returns OK if a new unique T was successfully created + // Returns NotSupported if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a unique ptr) + template + Status NewUniqueObject(const std::string& target, + std::unique_ptr* result) { + T* ptr = nullptr; + std::unique_ptr guard; + Status s = NewObject(target, &ptr, &guard); + if (!s.ok()) { + return s; + } else if (guard) { + result->reset(guard.release()); + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a unique ") + + T::Type() + " from unguarded one ", + target); + } + } + + // Creates a new shared T using the input factory functions. + // Returns OK if a new shared T was successfully created + // Returns NotSupported if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a shared ptr) + template + Status NewSharedObject(const std::string& target, + std::shared_ptr* result) { + std::unique_ptr guard; + T* ptr = nullptr; + Status s = NewObject(target, &ptr, &guard); + if (!s.ok()) { + return s; + } else if (guard) { + result->reset(guard.release()); + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a shared ") + + T::Type() + " from unguarded one ", + target); + } + } + + // Creates a new static T using the input factory functions. 
+ // Returns OK if a new static T was successfully created + // Returns NotSupported if the type/target could not be created + // Returns InvalidArgument if the factory return a guarded object + // (meaning it is managed by a unique ptr) + template + Status NewStaticObject(const std::string& target, T** result) { + std::unique_ptr guard; + T* ptr = nullptr; + Status s = NewObject(target, &ptr, &guard); + if (!s.ok()) { + return s; + } else if (guard.get()) { + return Status::InvalidArgument(std::string("Cannot make a static ") + + T::Type() + " from a guarded one ", + target); + } else { + *result = ptr; + return Status::OK(); + } + } + + // Sets the object for the given id/type to be the input object + // If the registry does not contain this id/type, the object is added and OK + // is returned. If the registry contains a different object, an error is + // returned. If the registry contains the input object, OK is returned. + template + Status SetManagedObject(const std::shared_ptr& object) { + assert(object != nullptr); + return SetManagedObject(object->GetId(), object); + } + + template + Status SetManagedObject(const std::string& id, + const std::shared_ptr& object) { + const auto c = std::static_pointer_cast(object); + return SetManagedObject(T::Type(), id, c); + } + + // Returns the object for the given id, if one exists. + // If the object is not found in the registry, a nullptr is returned + template + std::shared_ptr GetManagedObject(const std::string& id) const { + auto c = GetManagedObject(T::Type(), id); + return std::static_pointer_cast(c); + } + + // Returns the set of managed objects found in the registry matching + // the input type and ID. + // If the input id is not empty, then only objects of that class + // (IsInstanceOf(id)) will be returned (for example, only return LRUCache + // objects) If the input id is empty, then all objects of that type (all Cache + // objects) + template + Status ListManagedObjects(const std::string& id, + std::vector>* results) const { + std::vector> customizables; + results->clear(); + Status s = ListManagedObjects(T::Type(), id, &customizables); + if (s.ok()) { + for (const auto& c : customizables) { + results->push_back(std::static_pointer_cast(c)); + } + } + return s; + } + + template + Status ListManagedObjects(std::vector>* results) const { + return ListManagedObjects("", results); + } + + // Creates a new ManagedObject in the registry for the id if one does not + // currently exist. If an object with that ID already exists, the current + // object is returned. + // + // The ID is the identifier of the object to be returned/created and returned + // in result + // If a new object is created (using the object factories), the cfunc + // parameter will be invoked to configure the new object. 
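[Note: a usage sketch for the factory-based creation methods above. Whether the builtin factory names are pre-registered depends on build wiring, so treat the id string as illustrative:

    #include <memory>

    #include "rocksdb/table.h"
    #include "rocksdb/utilities/object_registry.h"

    ROCKSDB_NAMESPACE::Status MakeTableFactory(
        std::shared_ptr<ROCKSDB_NAMESPACE::TableFactory>* out) {
      auto registry = ROCKSDB_NAMESPACE::ObjectRegistry::NewInstance();
      // NotSupported if nothing matches; InvalidArgument if a factory fails.
      return registry->NewSharedObject<ROCKSDB_NAMESPACE::TableFactory>(
          "BlockBasedTable", out);
    }
]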
+ template + Status GetOrCreateManagedObject(const std::string& id, + std::shared_ptr* result, + const ConfigureFunc& cfunc = nullptr) { + if (parent_ != nullptr) { + auto object = parent_->GetManagedObject(T::Type(), id); + if (object != nullptr) { + *result = std::static_pointer_cast(object); + return Status::OK(); + } + } + { + std::unique_lock lock(objects_mutex_); + auto key = ToManagedObjectKey(T::Type(), id); + auto iter = managed_objects_.find(key); + if (iter != managed_objects_.end()) { + auto object = iter->second.lock(); + if (object != nullptr) { + *result = std::static_pointer_cast(object); + return Status::OK(); + } + } + std::shared_ptr object; + Status s = NewSharedObject(id, &object); + if (s.ok() && cfunc != nullptr) { + s = cfunc(object.get()); + } + if (s.ok()) { + auto c = std::static_pointer_cast(object); + if (id != c->Name()) { + // If the ID is not the base name of the class, add the new + // object under the input ID + managed_objects_[key] = c; + } + if (id != c->GetId() && c->GetId() != c->Name()) { + // If the input and current ID do not match, and the + // current ID is not the base bame, add the new object under + // its new ID + key = ToManagedObjectKey(T::Type(), c->GetId()); + managed_objects_[key] = c; + } + *result = object; + } + return s; + } + } + + // Returns the number of factories registered for this library + // for the input type. + // @param num_types returns how many unique types are registered. + size_t GetFactoryCount(const std::string& type) const; + + // Returns the names of registered factories for the input type. + // names is updated to include the names for the type + void GetFactoryNames(const std::string& type, + std::vector* names) const; + + void GetFactoryTypes(std::unordered_set* types) const; + + // Dump the contents of the registry to the logger + void Dump(Logger* logger) const; + + // Invokes the input function to retrieve the properties for this plugin. + int RegisterPlugin(const std::string& name, const RegistrarFunc& func); + + private: + static std::string ToManagedObjectKey(const std::string& type, + const std::string& id) { + return type + "://" + id; + } + + // Returns the Customizable managed object associated with the key (Type/ID). + // If not found, nullptr is returned. + std::shared_ptr GetManagedObject(const std::string& type, + const std::string& id) const; + Status ListManagedObjects( + const std::string& type, const std::string& pattern, + std::vector>* results) const; + // Sets the managed object associated with the key (Type/ID) to c. + // If the named managed object does not exist, the object is added and OK is + // returned If the object exists and is the same as c, OK is returned + // Otherwise, an error status is returned. + Status SetManagedObject(const std::string& type, const std::string& id, + const std::shared_ptr& c); + + // Searches (from back to front) the libraries looking for the + // factory that matches this name. + // Returns the factory if it is found, and nullptr otherwise + template + const FactoryFunc FindFactory(const std::string& name) const { + { + std::unique_lock lock(library_mutex_); + for (auto iter = libraries_.crbegin(); iter != libraries_.crend(); + ++iter) { + const auto factory = iter->get()->FindFactory(name); + if (factory != nullptr) { + return factory; + } + } + } + if (parent_ == nullptr) { + return nullptr; + } else { + return parent_->FindFactory(name); + } + } + + // The set of libraries to search for factories for this registry. 
+ // The libraries are searched in reverse order (back to front) when + // searching for entries. + std::vector> libraries_; + std::vector plugins_; + static std::unordered_map builtins_; + std::map> managed_objects_; + std::shared_ptr parent_; + mutable std::mutex objects_mutex_; // Mutex for managed objects + mutable std::mutex library_mutex_; // Mutex for managed libraries +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/optimistic_transaction_db.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/optimistic_transaction_db.h new file mode 100644 index 0000000000..261883782f --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/optimistic_transaction_db.h @@ -0,0 +1,98 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/stackable_db.h" + +namespace ROCKSDB_NAMESPACE { + +class Transaction; + +// Database with Transaction support. +// +// See optimistic_transaction.h and examples/transaction_example.cc + +// Options to use when starting an Optimistic Transaction +struct OptimisticTransactionOptions { + // Setting set_snapshot=true is the same as calling SetSnapshot(). + bool set_snapshot = false; + + // Should be set if the DB has a non-default comparator. + // See comment in WriteBatchWithIndex constructor. + const Comparator* cmp = BytewiseComparator(); +}; + +enum class OccValidationPolicy { + // Validate serially at commit stage, AFTER entering the write-group. + // Isolation validation is processed single-threaded(since in the + // write-group). + // May suffer from high mutex contention, as per this link: + // https://github.com/facebook/rocksdb/issues/4402 + kValidateSerial = 0, + // Validate parallelly before commit stage, BEFORE entering the write-group to + // reduce mutex contention. Each txn acquires locks for its write-set + // records in some well-defined order. + kValidateParallel = 1 +}; + +struct OptimisticTransactionDBOptions { + OccValidationPolicy validate_policy = OccValidationPolicy::kValidateParallel; + + // works only if validate_policy == OccValidationPolicy::kValidateParallel + uint32_t occ_lock_buckets = (1 << 20); +}; + +// Range deletions (including those in `WriteBatch`es passed to `Write()`) are +// incompatible with `OptimisticTransactionDB` and will return a non-OK `Status` +class OptimisticTransactionDB : public StackableDB { + public: + // Open an OptimisticTransactionDB similar to DB::Open(). + static Status Open(const Options& options, const std::string& dbname, + OptimisticTransactionDB** dbptr); + + static Status Open(const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + OptimisticTransactionDB** dbptr); + + static Status Open(const DBOptions& db_options, + const OptimisticTransactionDBOptions& occ_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + OptimisticTransactionDB** dbptr); + + virtual ~OptimisticTransactionDB() {} + + // Starts a new Transaction. + // + // Caller is responsible for deleting the returned transaction when no + // longer needed. 
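[Note: a usage sketch in the spirit of the examples/transaction_example.cc reference above; the path and keys are invented:

    #include "rocksdb/utilities/optimistic_transaction_db.h"
    #include "rocksdb/utilities/transaction.h"

    ROCKSDB_NAMESPACE::Status PutOnce() {
      ROCKSDB_NAMESPACE::Options options;
      options.create_if_missing = true;
      ROCKSDB_NAMESPACE::OptimisticTransactionDB* txn_db = nullptr;
      ROCKSDB_NAMESPACE::Status s =
          ROCKSDB_NAMESPACE::OptimisticTransactionDB::Open(options, "/tmp/occ_db",
                                                           &txn_db);
      if (!s.ok()) return s;
      ROCKSDB_NAMESPACE::Transaction* txn =
          txn_db->BeginTransaction(ROCKSDB_NAMESPACE::WriteOptions());
      txn->Put("key", "value");
      s = txn->Commit();  // OCC: conflict validation happens at commit time
      delete txn;         // the caller owns the returned transaction
      delete txn_db;
      return s;
    }
]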
+ // + // If old_txn is not null, BeginTransaction will reuse this Transaction + // handle instead of allocating a new one. This is an optimization to avoid + // extra allocations when repeatedly creating transactions. + virtual Transaction* BeginTransaction( + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options = + OptimisticTransactionOptions(), + Transaction* old_txn = nullptr) = 0; + + OptimisticTransactionDB(const OptimisticTransactionDB&) = delete; + void operator=(const OptimisticTransactionDB&) = delete; + + protected: + // To Create an OptimisticTransactionDB, call Open() + explicit OptimisticTransactionDB(DB* db) : StackableDB(db) {} +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/option_change_migration.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/option_change_migration.h new file mode 100644 index 0000000000..a73324a9e3 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/option_change_migration.h @@ -0,0 +1,24 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +// Try to migrate DB created with old_opts to be use new_opts. +// Multiple column families is not supported. +// It is best-effort. No guarantee to succeed. +// A full compaction may be executed. +// If the target options use FIFO compaction, the FIFO condition might be +// sacrificed: for data migrated, data inserted later might be dropped +// earlier. This is to gurantee FIFO compaction won't drop all the +// migrated data to fit max_table_files_size. +Status OptionChangeMigration(std::string dbname, const Options& old_opts, + const Options& new_opts); +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/options_type.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/options_type.h new file mode 100644 index 0000000000..cd340ed596 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/options_type.h @@ -0,0 +1,1221 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// The OptionTypeInfo and related classes provide a framework for +// configuring and validating RocksDB classes via the Options framework. +// This file is part of the public API to allow developers who wish to +// write their own extensions and plugins to take use the Options +// framework in their custom implementations. +// +// See https://github.com/facebook/rocksdb/wiki/RocksDB-Configurable-Objects +// for more information on how to develop and use custom extensions + +#pragma once + +#include +#include +#include + +#include "rocksdb/convenience.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class OptionTypeInfo; +struct ColumnFamilyOptions; +struct DBOptions; + +// The underlying "class/type" of the option. 
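[Note: looking back at option_change_migration.h just above, a hedged call sketch; the path and the particular option change are invented:

    #include "rocksdb/utilities/option_change_migration.h"

    ROCKSDB_NAMESPACE::Status Migrate(const ROCKSDB_NAMESPACE::Options& old_opts) {
      ROCKSDB_NAMESPACE::Options new_opts = old_opts;
      new_opts.compaction_style = ROCKSDB_NAMESPACE::kCompactionStyleUniversal;
      // Best effort; may run a full compaction. Single column family only.
      return ROCKSDB_NAMESPACE::OptionChangeMigration("/path/to/db", old_opts,
                                                      new_opts);
    }
]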
+// This enum is used to determine how the option should +// be converted to/from strings and compared. +enum class OptionType { + kBoolean, + kInt, + kInt32T, + kInt64T, + kUInt, + kUInt8T, + kUInt32T, + kUInt64T, + kSizeT, + kString, + kDouble, + kCompactionStyle, + kCompactionPri, + kCompressionType, + kCompactionStopStyle, + kChecksumType, + kEncodingType, + kEnv, + kEnum, + kStruct, + kVector, + kConfigurable, + kCustomizable, + kEncodedString, + kTemperature, + kArray, + kUnknown, +}; + +enum class OptionVerificationType { + kNormal, + kByName, // The option is pointer typed so we can only verify + // based on it's name. + kByNameAllowNull, // Same as kByName, but it also allows the case + // where one of them is a nullptr. + kByNameAllowFromNull, // Same as kByName, but it also allows the case + // where the old option is nullptr. + kDeprecated, // The option is no longer used in rocksdb. The RocksDB + // OptionsParser will still accept this option if it + // happen to exists in some Options file. However, + // the parser will not include it in serialization + // and verification processes. + kAlias, // This option represents is a name/shortcut for + // another option and should not be written or verified + // independently +}; + +// A set of modifier flags used to alter how an option is evaluated or +// processed. These flags can be combined together (e.g. kMutable | kShared). +// The kCompare flags can be used to control if/when options are compared. +// If kCompareNever is set, two related options would never be compared (always +// equal) If kCompareExact is set, the options will only be compared if the +// sanity mode +// is exact +// kMutable means the option can be changed after it is prepared +// kShared means the option is contained in a std::shared_ptr +// kUnique means the option is contained in a std::uniqued_ptr +// kRawPointer means the option is a raw pointer value. +// kAllowNull means that an option is allowed to be null for verification +// purposes. +// kDontSerialize means this option should not be serialized and included in +// the string representation. +// kDontPrepare means do not call PrepareOptions for this pointer value. +enum class OptionTypeFlags : uint32_t { + kNone = 0x00, // No flags + kCompareDefault = 0x0, + kCompareNever = ConfigOptions::kSanityLevelNone, + kCompareLoose = ConfigOptions::kSanityLevelLooselyCompatible, + kCompareExact = ConfigOptions::kSanityLevelExactMatch, + + kMutable = 0x0100, // Option is mutable + kRawPointer = 0x0200, // The option is stored as a raw pointer + kShared = 0x0400, // The option is stored as a shared_ptr + kUnique = 0x0800, // The option is stored as a unique_ptr + kAllowNull = 0x1000, // The option can be null + kDontSerialize = 0x2000, // Don't serialize the option + kDontPrepare = 0x4000, // Don't prepare or sanitize this option + kStringNameOnly = 0x8000, // The option serializes to a name only +}; + +inline OptionTypeFlags operator|(const OptionTypeFlags& a, + const OptionTypeFlags& b) { + return static_cast(static_cast(a) | + static_cast(b)); +} + +inline OptionTypeFlags operator&(const OptionTypeFlags& a, + const OptionTypeFlags& b) { + return static_cast(static_cast(a) & + static_cast(b)); +} + +// Converts an string into its enumerated value. +// @param type_map Mapping between strings and enum values +// @param type The string representation of the enum +// @param value Returns the enum value represented by the string +// @return true if the string was found in the enum map, false otherwise. 
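[Note: a sketch of the two enum helpers declared below; the Color enum and its map are hypothetical:

    #include <string>
    #include <unordered_map>

    #include "rocksdb/utilities/options_type.h"

    enum class Color { kRed, kBlue };
    static const std::unordered_map<std::string, Color> color_map = {
        {"red", Color::kRed}, {"blue", Color::kBlue}};

    void EnumRoundTrip() {
      Color c;
      bool ok = ROCKSDB_NAMESPACE::ParseEnum<Color>(color_map, "red", &c);
      // ok == true, c == Color::kRed
      std::string str;
      ok = ROCKSDB_NAMESPACE::SerializeEnum<Color>(color_map, Color::kBlue, &str);
      // ok == true, str == "blue"
      (void)ok;
    }
]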
+template +bool ParseEnum(const std::unordered_map& type_map, + const std::string& type, T* value) { + auto iter = type_map.find(type); + if (iter != type_map.end()) { + *value = iter->second; + return true; + } + return false; +} + +// Converts an enum into its string representation. +// @param type_map Mapping between strings and enum values +// @param type The enum +// @param value Returned as the string representation of the enum +// @return true if the enum was found in the enum map, false otherwise. +template +bool SerializeEnum(const std::unordered_map& type_map, + const T& type, std::string* value) { + for (const auto& pair : type_map) { + if (pair.second == type) { + *value = pair.first; + return true; + } + } + return false; +} + +template +Status ParseArray(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::array* result); + +template +Status SerializeArray(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::array& vec, + std::string* value); + +template +bool ArraysAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, const std::string& name, + const std::array& array1, + const std::array& array2, std::string* mismatch); + +template +Status ParseVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::vector* result); + +template +Status SerializeVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::vector& vec, + std::string* value); +template +bool VectorsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, const std::string& name, + const std::vector& vec1, const std::vector& vec2, + std::string* mismatch); + +// Function for converting a option string value into its underlying +// representation in "addr" +// On success, Status::OK is returned and addr is set to the parsed form +// On failure, a non-OK status is returned +// @param opts The ConfigOptions controlling how the value is parsed +// @param name The name of the options being parsed +// @param value The string representation of the option +// @param addr Pointer to the object +using ParseFunc = std::function; + +// Function for converting an option "addr" into its string representation. +// On success, Status::OK is returned and value is the serialized form. +// On failure, a non-OK status is returned +// @param opts The ConfigOptions controlling how the values are serialized +// @param name The name of the options being serialized +// @param addr Pointer to the value being serialized +// @param value The result of the serialization. +using SerializeFunc = std::function; + +// Function for comparing two option values +// If they are not equal, updates "mismatch" with the name of the bad option +// @param opts The ConfigOptions controlling how the values are compared +// @param name The name of the options being compared +// @param addr1 The first address to compare +// @param addr2 The address to compare to +// @param mismatch If the values are not equal, the name of the option that +// first differs +using EqualsFunc = std::function; + +// Function for preparing/initializing an option. +using PrepareFunc = + std::function; + +// Function for validating an option. 
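[Note: as an example of the function typedefs above, a custom ParseFunc for an int-valued option might look like this sketch; error handling is elided and the target field is illustrative:

    #include <string>

    #include "rocksdb/utilities/options_type.h"

    // addr points at the option field inside the enclosing options struct.
    static const ROCKSDB_NAMESPACE::ParseFunc parse_int =
        [](const ROCKSDB_NAMESPACE::ConfigOptions& /*opts*/,
           const std::string& /*name*/, const std::string& value, void* addr) {
          *static_cast<int*>(addr) = std::stoi(value);  // error handling elided
          return ROCKSDB_NAMESPACE::Status::OK();
        };
]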
+using ValidateFunc = std::function; + +// A struct for storing constant option information such as option name, +// option type, and offset. +class OptionTypeInfo { + public: + // A simple "normal", non-mutable Type "type" at offset + OptionTypeInfo(int offset, OptionType type) + : offset_(offset), + parse_func_(nullptr), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(OptionVerificationType::kNormal), + flags_(OptionTypeFlags::kNone) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags) + : offset_(offset), + parse_func_(nullptr), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(verification), + flags_(flags) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags, + const ParseFunc& parse_func) + : offset_(offset), + parse_func_(parse_func), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(verification), + flags_(flags) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags, + const ParseFunc& parse_func, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) + : offset_(offset), + parse_func_(parse_func), + serialize_func_(serialize_func), + equals_func_(equals_func), + type_(type), + verification_(verification), + flags_(flags) {} + + // Creates an OptionTypeInfo for an enum type. Enums use an additional + // map to convert the enums to/from their string representation. + // To create an OptionTypeInfo that is an Enum, one should: + // - Create a static map of string values to the corresponding enum value + // - Call this method passing the static map in as a parameter. + // Note that it is not necessary to add a new OptionType or make any + // other changes -- the returned object handles parsing, serialization, and + // comparisons. + // + // @param offset The offset in the option object for this enum + // @param map The string to enum mapping for this enum + template + static OptionTypeInfo Enum( + int offset, const std::unordered_map* const map, + OptionTypeFlags flags = OptionTypeFlags::kNone) { + OptionTypeInfo info(offset, OptionType::kEnum, + OptionVerificationType::kNormal, flags); + info.SetParseFunc( + // Uses the map argument to convert the input string into + // its corresponding enum value. If value is found in the map, + // addr is updated to the corresponding map entry. + // @return OK if the value is found in the map + // @return InvalidArgument if the value is not found in the map + [map](const ConfigOptions&, const std::string& name, + const std::string& value, void* addr) { + if (map == nullptr) { + return Status::NotSupported("No enum mapping ", name); + } else if (ParseEnum(*map, value, static_cast(addr))) { + return Status::OK(); + } else { + return Status::InvalidArgument("No mapping for enum ", name); + } + }); + info.SetSerializeFunc( + // Uses the map argument to convert the input enum into + // its corresponding string value. If enum value is found in the map, + // value is updated to the corresponding string value in the map. 
+ // @return OK if the enum is found in the map + // @return InvalidArgument if the enum is not found in the map + [map](const ConfigOptions&, const std::string& name, const void* addr, + std::string* value) { + if (map == nullptr) { + return Status::NotSupported("No enum mapping ", name); + } else if (SerializeEnum(*map, (*static_cast(addr)), + value)) { + return Status::OK(); + } else { + return Status::InvalidArgument("No mapping for enum ", name); + } + }); + info.SetEqualsFunc( + // Casts addr1 and addr2 to the enum type and returns true if + // they are equal, false otherwise. + [](const ConfigOptions&, const std::string&, const void* addr1, + const void* addr2, std::string*) { + return (*static_cast(addr1) == + *static_cast(addr2)); + }); + return info; + } // End OptionTypeInfo::Enum + + // Creates an OptionTypeInfo for a Struct type. Structs have a + // map of string-OptionTypeInfo associated with them that describes how + // to process the object for parsing, serializing, and matching. + // Structs also have a struct_name, which is the name of the object + // as registered in the parent map. + // When processing a struct, the option name can be specified as: + // - Meaning to process the entire struct. + // - Meaning to process the single field + // - Process the single fields + // The CompactionOptionsFIFO, CompactionOptionsUniversal, and LRUCacheOptions + // are all examples of Struct options. + // + // To create an OptionTypeInfo that is a Struct, one should: + // - Create a static map of string-OptionTypeInfo corresponding to the + // properties of the object that can be set via the options. + // - Call this method passing the name and map in as parameters. + // Note that it is not necessary to add a new OptionType or make any + // other changes -- the returned object handles parsing, serialization, and + // comparisons. 
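[Note: a registration sketch for OptionTypeInfo::Enum above, reusing the hypothetical Color enum and color_map from the earlier note; the struct and map names are invented:

    #include <cstddef>  // offsetof

    struct MyOpts {
      Color color = Color::kRed;
    };

    static std::unordered_map<std::string, ROCKSDB_NAMESPACE::OptionTypeInfo>
        my_opts_type_info = {
            {"color", ROCKSDB_NAMESPACE::OptionTypeInfo::Enum<Color>(
                          offsetof(MyOpts, color), &color_map)}};

This is the same shape RocksDB uses internally for its own enum-typed options: a static string-to-OptionTypeInfo map keyed by option name.]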
+  static OptionTypeInfo Struct(
+      const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* struct_map,
+      int offset, OptionVerificationType verification, OptionTypeFlags flags,
+      const ParseFunc& parse_func) {
+    OptionTypeInfo info(
+        Struct(struct_name, struct_map, offset, verification, flags));
+    return info.SetParseFunc(parse_func);
+  }
+
+  template <typename T, size_t kSize>
+  static OptionTypeInfo Array(int _offset, OptionVerificationType _verification,
+                              OptionTypeFlags _flags,
+                              const OptionTypeInfo& elem_info,
+                              char separator = ':') {
+    OptionTypeInfo info(_offset, OptionType::kArray, _verification, _flags);
+    info.SetParseFunc([elem_info, separator](
+                          const ConfigOptions& opts, const std::string& name,
+                          const std::string& value, void* addr) {
+      auto result = static_cast<std::array<T, kSize>*>(addr);
+      return ParseArray<T, kSize>(opts, elem_info, separator, name, value,
+                                  result);
+    });
+    info.SetSerializeFunc([elem_info, separator](const ConfigOptions& opts,
+                                                 const std::string& name,
+                                                 const void* addr,
+                                                 std::string* value) {
+      const auto& array = *(static_cast<const std::array<T, kSize>*>(addr));
+      return SerializeArray<T, kSize>(opts, elem_info, separator, name, array,
+                                      value);
+    });
+    info.SetEqualsFunc([elem_info](const ConfigOptions& opts,
+                                   const std::string& name, const void* addr1,
+                                   const void* addr2, std::string* mismatch) {
+      const auto& array1 = *(static_cast<const std::array<T, kSize>*>(addr1));
+      const auto& array2 = *(static_cast<const std::array<T, kSize>*>(addr2));
+      return ArraysAreEqual<T, kSize>(opts, elem_info, name, array1, array2,
+                                      mismatch);
+    });
+    return info;
+  }
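+
+  // A minimal usage sketch (illustrative only; MyOptions and dims are
+  // hypothetical): a fixed-size array of two ints parsed from a string such
+  // as "4:8", with the elements described by a simple int OptionTypeInfo.
+  //
+  //   struct MyOptions { std::array<int, 2> dims = {0, 0}; };
+  //   static const OptionTypeInfo dims_info = OptionTypeInfo::Array<int, 2>(
+  //       offsetof(MyOptions, dims), OptionVerificationType::kNormal,
+  //       OptionTypeFlags::kNone, OptionTypeInfo(0, OptionType::kInt));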
+
+  template <typename T>
+  static OptionTypeInfo Vector(int _offset,
+                               OptionVerificationType _verification,
+                               OptionTypeFlags _flags,
+                               const OptionTypeInfo& elem_info,
+                               char separator = ':') {
+    OptionTypeInfo info(_offset, OptionType::kVector, _verification, _flags);
+    info.SetParseFunc([elem_info, separator](
+                          const ConfigOptions& opts, const std::string& name,
+                          const std::string& value, void* addr) {
+      auto result = static_cast<std::vector<T>*>(addr);
+      return ParseVector<T>(opts, elem_info, separator, name, value, result);
+    });
+    info.SetSerializeFunc([elem_info, separator](const ConfigOptions& opts,
+                                                 const std::string& name,
+                                                 const void* addr,
+                                                 std::string* value) {
+      const auto& vec = *(static_cast<const std::vector<T>*>(addr));
+      return SerializeVector<T>(opts, elem_info, separator, name, vec, value);
+    });
+    info.SetEqualsFunc([elem_info](const ConfigOptions& opts,
+                                   const std::string& name, const void* addr1,
+                                   const void* addr2, std::string* mismatch) {
+      const auto& vec1 = *(static_cast<const std::vector<T>*>(addr1));
+      const auto& vec2 = *(static_cast<const std::vector<T>*>(addr2));
+      return VectorsAreEqual<T>(opts, elem_info, name, vec1, vec2, mismatch);
+    });
+    return info;
+  }
+
+  // Create a new std::shared_ptr<Customizable> OptionTypeInfo
+  // This function will call the T::CreateFromString method to create a new
+  // std::shared_ptr<T> object.
+  //
+  // @param offset The offset for the Customizable from the base pointer
+  // @param ovt How to verify this option
+  // @param flags Extra flags specifying the behavior of this option
+  // @param serialize_func Optional function for serializing this option
+  // @param equals_func Optional function for comparing this option
+  template <typename T>
+  static OptionTypeInfo AsCustomSharedPtr(int offset,
+                                          OptionVerificationType ovt,
+                                          OptionTypeFlags flags) {
+    OptionTypeInfo info(offset, OptionType::kCustomizable, ovt,
+                        flags | OptionTypeFlags::kShared);
+    return info.SetParseFunc([](const ConfigOptions& opts,
+                                const std::string& name,
+                                const std::string& value, void* addr) {
+      auto* shared = static_cast<std::shared_ptr<T>*>(addr);
+      if (name == kIdPropName() && value.empty()) {
+        shared->reset();
+        return Status::OK();
+      } else {
+        return T::CreateFromString(opts, value, shared);
+      }
+    });
+  }
+
+  template <typename T>
+  static OptionTypeInfo AsCustomSharedPtr(int offset,
+                                          OptionVerificationType ovt,
+                                          OptionTypeFlags flags,
+                                          const SerializeFunc& serialize_func,
+                                          const EqualsFunc& equals_func) {
+    OptionTypeInfo info(AsCustomSharedPtr<T>(offset, ovt, flags));
+    info.SetSerializeFunc(serialize_func);
+    info.SetEqualsFunc(equals_func);
+    return info;
+  }
+
+  // Create a new std::unique_ptr<Customizable> OptionTypeInfo
+  // This function will call the T::CreateFromString method to create a new
+  // std::unique_ptr<T> object.
+  //
+  // @param offset The offset for the Customizable from the base pointer
+  // @param ovt How to verify this option
+  // @param flags Extra flags specifying the behavior of this option
+  // @param serialize_func Optional function for serializing this option
+  // @param equals_func Optional function for comparing this option
+  template <typename T>
+  static OptionTypeInfo AsCustomUniquePtr(int offset,
+                                          OptionVerificationType ovt,
+                                          OptionTypeFlags flags) {
+    OptionTypeInfo info(offset, OptionType::kCustomizable, ovt,
+                        flags | OptionTypeFlags::kUnique);
+    return info.SetParseFunc([](const ConfigOptions& opts,
+                                const std::string& name,
+                                const std::string& value, void* addr) {
+      auto* unique = static_cast<std::unique_ptr<T>*>(addr);
+      if (name == kIdPropName() && value.empty()) {
+        unique->reset();
+        return Status::OK();
+      } else {
+        return T::CreateFromString(opts, value, unique);
+      }
+    });
+  }
+
+  template <typename T>
+  static OptionTypeInfo AsCustomUniquePtr(int offset,
+                                          OptionVerificationType ovt,
+                                          OptionTypeFlags flags,
+                                          const SerializeFunc& serialize_func,
+                                          const EqualsFunc& equals_func) {
+    OptionTypeInfo info(AsCustomUniquePtr<T>(offset, ovt, flags));
+    info.SetSerializeFunc(serialize_func);
+    info.SetEqualsFunc(equals_func);
+    return info;
+  }
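+
+  // A minimal usage sketch (illustrative only; MyOptions and the field name
+  // are hypothetical): a shared_ptr to a Customizable subclass, created from
+  // its string id via CreateFromString.
+  //
+  //   struct MyOptions { std::shared_ptr<MemTableRepFactory> factory; };
+  //   static const OptionTypeInfo factory_info =
+  //       OptionTypeInfo::AsCustomSharedPtr<MemTableRepFactory>(
+  //           offsetof(MyOptions, factory), OptionVerificationType::kByName,
+  //           OptionTypeFlags::kNone);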
+
+  // Create a new Customizable* OptionTypeInfo
+  // This function will call the T::CreateFromString method to create a new
+  // T object.
+  //
+  // @param offset The offset for the Customizable from the base pointer
+  // @param ovt How to verify this option
+  // @param flags Extra flags specifying the behavior of this option
+  // @param serialize_func Optional function for serializing this option
+  // @param equals_func Optional function for comparing this option
+  template <typename T>
+  static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt,
+                                       OptionTypeFlags flags) {
+    OptionTypeInfo info(offset, OptionType::kCustomizable, ovt,
+                        flags | OptionTypeFlags::kRawPointer);
+    return info.SetParseFunc([](const ConfigOptions& opts,
+                                const std::string& name,
+                                const std::string& value, void* addr) {
+      auto** pointer = static_cast<T**>(addr);
+      if (name == kIdPropName() && value.empty()) {
+        *pointer = nullptr;
+        return Status::OK();
+      } else {
+        return T::CreateFromString(opts, value, pointer);
+      }
+    });
+  }
+
+  template <typename T>
+  static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt,
+                                       OptionTypeFlags flags,
+                                       const SerializeFunc& serialize_func,
+                                       const EqualsFunc& equals_func) {
+    OptionTypeInfo info(AsCustomRawPtr<T>(offset, ovt, flags));
+    info.SetSerializeFunc(serialize_func);
+    info.SetEqualsFunc(equals_func);
+    return info;
+  }
+
+  OptionTypeInfo& SetParseFunc(const ParseFunc& f) {
+    parse_func_ = f;
+    return *this;
+  }
+
+  OptionTypeInfo& SetSerializeFunc(const SerializeFunc& f) {
+    serialize_func_ = f;
+    return *this;
+  }
+
+  OptionTypeInfo& SetEqualsFunc(const EqualsFunc& f) {
+    equals_func_ = f;
+    return *this;
+  }
+
+  OptionTypeInfo& SetPrepareFunc(const PrepareFunc& f) {
+    prepare_func_ = f;
+    return *this;
+  }
+
+  OptionTypeInfo& SetValidateFunc(const ValidateFunc& f) {
+    validate_func_ = f;
+    return *this;
+  }
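+
+  // A minimal usage sketch (illustrative only): overriding the parse function
+  // of an existing OptionTypeInfo, e.g. to accept "unlimited" as an alias for
+  // -1 on an int option of a hypothetical MyOptions struct.
+  //
+  //   static const OptionTypeInfo limit_info =
+  //       OptionTypeInfo(offsetof(MyOptions, limit), OptionType::kInt)
+  //           .SetParseFunc([](const ConfigOptions&, const std::string&,
+  //                            const std::string& value, void* addr) {
+  //             *static_cast<int*>(addr) =
+  //                 (value == "unlimited") ? -1 : std::stoi(value);
+  //             return Status::OK();
+  //           });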
+
+  bool IsEnabled(OptionTypeFlags otf) const { return (flags_ & otf) == otf; }
+
+  bool IsEditable(const ConfigOptions& opts) const {
+    if (opts.mutable_options_only) {
+      return IsMutable();
+    } else {
+      return true;
+    }
+  }
+  bool IsMutable() const { return IsEnabled(OptionTypeFlags::kMutable); }
+
+  bool IsDeprecated() const {
+    return IsEnabled(OptionVerificationType::kDeprecated);
+  }
+
+  // Returns true if the option is marked as an Alias.
+  // Aliases are valid options that are parsed but are not converted to strings
+  // or compared.
+  bool IsAlias() const { return IsEnabled(OptionVerificationType::kAlias); }
+
+  bool IsEnabled(OptionVerificationType ovf) const {
+    return verification_ == ovf;
+  }
+
+  // Returns the sanity level for comparing the option.
+  // If the options should not be compared, returns None.
+  // If the option has a compare flag, returns it.
+  // Otherwise, returns "exact".
+  ConfigOptions::SanityLevel GetSanityLevel() const {
+    if (IsDeprecated() || IsAlias()) {
+      return ConfigOptions::SanityLevel::kSanityLevelNone;
+    } else {
+      auto match = (flags_ & OptionTypeFlags::kCompareExact);
+      if (match == OptionTypeFlags::kCompareDefault) {
+        return ConfigOptions::SanityLevel::kSanityLevelExactMatch;
+      } else {
+        return (ConfigOptions::SanityLevel)match;
+      }
+    }
+  }
+
+  // Returns true if the option should be serialized.
+  // Options should be serialized if they are not deprecated, aliases,
+  // or marked as "Don't Serialize".
+  bool ShouldSerialize() const {
+    if (IsDeprecated() || IsAlias()) {
+      return false;
+    } else if (IsEnabled(OptionTypeFlags::kDontSerialize)) {
+      return false;
+    } else {
+      return true;
+    }
+  }
+
+  bool ShouldPrepare() const {
+    if (IsDeprecated() || IsAlias()) {
+      return false;
+    } else if (IsEnabled(OptionTypeFlags::kDontPrepare)) {
+      return false;
+    } else {
+      return (prepare_func_ != nullptr || IsConfigurable());
+    }
+  }
+
+  bool ShouldValidate() const {
+    if (IsDeprecated() || IsAlias()) {
+      return false;
+    } else {
+      return (validate_func_ != nullptr || IsConfigurable());
+    }
+  }
+
+  // Returns true if the option is allowed to be null.
+  // Options can be null if the verification type allows null
+  // or if the flags specify allow null.
+  bool CanBeNull() const {
+    return (IsEnabled(OptionTypeFlags::kAllowNull) ||
+            IsEnabled(OptionVerificationType::kByNameAllowNull) ||
+            IsEnabled(OptionVerificationType::kByNameAllowFromNull));
+  }
+
+  bool IsSharedPtr() const { return IsEnabled(OptionTypeFlags::kShared); }
+
+  bool IsUniquePtr() const { return IsEnabled(OptionTypeFlags::kUnique); }
+
+  bool IsRawPtr() const { return IsEnabled(OptionTypeFlags::kRawPointer); }
+
+  bool IsByName() const {
+    return (verification_ == OptionVerificationType::kByName ||
+            verification_ == OptionVerificationType::kByNameAllowNull ||
+            verification_ == OptionVerificationType::kByNameAllowFromNull);
+  }
+
+  bool IsStruct() const { return (type_ == OptionType::kStruct); }
+
+  bool IsConfigurable() const {
+    return (type_ == OptionType::kConfigurable ||
+            type_ == OptionType::kCustomizable);
+  }
+
+  bool IsCustomizable() const { return (type_ == OptionType::kCustomizable); }
+
+  inline const void* GetOffset(const void* base) const {
+    return static_cast<const char*>(base) + offset_;
+  }
+
+  inline void* GetOffset(void* base) const {
+    return static_cast<char*>(base) + offset_;
+  }
+
+  template <typename T>
+  const T* GetOffsetAs(const void* base) const {
+    const void* addr = GetOffset(base);
+    return static_cast<const T*>(addr);
+  }
+
+  template <typename T>
+  T* GetOffsetAs(void* base) const {
+    void* addr = GetOffset(base);
+    return static_cast<T*>(addr);
+  }
+
+  // Returns the underlying pointer for the type at base_addr
+  // The value returned is the underlying "raw" pointer, offset from base.
+  template <typename T>
+  const T* AsRawPointer(const void* const base_addr) const {
+    if (base_addr == nullptr) {
+      return nullptr;
+    }
+    if (IsUniquePtr()) {
+      const auto ptr = GetOffsetAs<std::unique_ptr<T>>(base_addr);
+      return ptr->get();
+    } else if (IsSharedPtr()) {
+      const auto ptr = GetOffsetAs<std::shared_ptr<T>>(base_addr);
+      return ptr->get();
+    } else if (IsRawPtr()) {
+      const T* const* ptr = GetOffsetAs<const T*>(base_addr);
+      return *ptr;
+    } else {
+      return GetOffsetAs<T>(base_addr);
+    }
+  }
+
+  // Returns the underlying pointer for the type at base_addr
+  // The value returned is the underlying "raw" pointer, offset from base.
+  template <typename T>
+  T* AsRawPointer(void* base_addr) const {
+    if (base_addr == nullptr) {
+      return nullptr;
+    }
+    if (IsUniquePtr()) {
+      auto ptr = GetOffsetAs<std::unique_ptr<T>>(base_addr);
+      return ptr->get();
+    } else if (IsSharedPtr()) {
+      auto ptr = GetOffsetAs<std::shared_ptr<T>>(base_addr);
+      return ptr->get();
+    } else if (IsRawPtr()) {
+      auto ptr = GetOffsetAs<T*>(base_addr);
+      return *ptr;
+    } else {
+      return GetOffsetAs<T>(base_addr);
+    }
+  }
+
+  // Parses the option in "opt_value" according to the rules of this class
+  // and updates the value at "opt_ptr".
+  // On success, Status::OK() is returned.  On failure:
+  //   NotFound means the opt_name is not valid for this option
+  //   NotSupported means we do not know how to parse the value for this option
+  //   InvalidArgument means the opt_value is not valid for this option.
+  Status Parse(const ConfigOptions& config_options, const std::string& opt_name,
+               const std::string& opt_value, void* const opt_ptr) const;
+
+  // Serializes the option in "opt_addr" according to the rules of this class
+  // into the value at "opt_value".
+  Status Serialize(const ConfigOptions& config_options,
+                   const std::string& opt_name, const void* const opt_ptr,
+                   std::string* opt_value) const;
+
+  // Compares the "addr1" and "addr2" values according to the rules of this
+  // class and returns true if they match.  On a failed match, mismatch is the
+  // name of the option that failed to match.
+  bool AreEqual(const ConfigOptions& config_options,
+                const std::string& opt_name, const void* const addr1,
+                const void* const addr2, std::string* mismatch) const;
+
+  // Used to override the match rules for "ByName" options.
+  bool AreEqualByName(const ConfigOptions& config_options,
+                      const std::string& opt_name, const void* const this_ptr,
+                      const void* const that_ptr) const;
+  bool AreEqualByName(const ConfigOptions& config_options,
+                      const std::string& opt_name, const void* const this_ptr,
+                      const std::string& that_value) const;
+
+  Status Prepare(const ConfigOptions& config_options, const std::string& name,
+                 void* opt_ptr) const;
+  Status Validate(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts,
+                  const std::string& name, const void* opt_ptr) const;
+
+  // Parses the input opts_map according to the type_map for the opt_addr.
+  // For each name-value pair in opts_map, find the corresponding name in
+  // type_map.  If the name is found:
+  // - set the corresponding value in opt_addr, returning the status on
+  //   failure;
+  // If the name is not found:
+  // - If unused is specified, add the name-value to unused and continue
+  // - If ignore_unknown_options is false, return NotFound
+  // Returns OK if all options were either:
+  // - Successfully set
+  // - options were not found and ignore_unknown_options=true
+  // - options were not found and unused was specified
+  // Note that this method is much less sophisticated than the comparable
+  // Configurable::Configure methods.  For example, on error, there is no
+  // attempt to return opt_addr to the initial state.  Additionally, there
+  // is no effort to initialize (Configurable::PrepareOptions) the object
+  // on success.  This method should typically only be used for simpler,
+  // standalone structures and not those that contain shared and embedded
+  // objects.
+  static Status ParseType(
+      const ConfigOptions& config_options, const std::string& opts_str,
+      const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+      void* opt_addr,
+      std::unordered_map<std::string, std::string>* unused = nullptr);
+  static Status ParseType(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, std::string>& opts_map,
+      const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+      void* opt_addr,
+      std::unordered_map<std::string, std::string>* unused = nullptr);
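+
+  // A minimal usage sketch (illustrative only; Window and window_map are the
+  // hypothetical struct and field map from the Struct example above):
+  //
+  //   Window w;
+  //   ConfigOptions cfg;
+  //   Status s = OptionTypeInfo::ParseType(cfg, "count=10;size=4096",
+  //                                        window_map, &w);
+  //   // On success, w.count == 10 and w.size == 4096.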
+
+  // Parses the input value according to the map for the struct at opt_addr.
+  // struct_name is the name of the struct option as registered.
+  // opt_name is the name of the option being evaluated.  This may
+  // be the whole struct or a sub-element of it, based on struct_name and
+  // opt_name.
+  static Status ParseStruct(
+      const ConfigOptions& config_options, const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* map,
+      const std::string& opt_name, const std::string& value, void* opt_addr);
+
+  // Serializes the values from opt_addr using the rules in type_map.
+  // Returns the serialized form in result.
+  // Returns OK on success or non-OK if some option could not be serialized.
+  static Status SerializeType(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+      const void* opt_addr, std::string* value);
+
+  // Serializes the input addr according to the map for the struct to value.
+  // struct_name is the name of the struct option as registered.
+  // opt_name is the name of the option being evaluated.  This may
+  // be the whole struct or a sub-element of it.
+  static Status SerializeStruct(
+      const ConfigOptions& config_options, const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* map,
+      const std::string& opt_name, const void* opt_addr, std::string* value);
+
+  // Compares the values in this_addr and that_addr using the rules in
+  // type_map.  If the values are equal, returns true.
+  // If the values are not equal, returns false and sets mismatch to the name
+  // of the first value that did not match.
+  static bool TypesAreEqual(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, OptionTypeInfo>& map,
+      const void* this_addr, const void* that_addr, std::string* mismatch);
+
+  // Compares the input offsets according to the map for the struct and returns
+  // true if they are equivalent, false otherwise.
+  // struct_name is the name of the struct option as registered.
+  // opt_name is the name of the option being evaluated.  This may
+  // be the whole struct or a sub-element of it.
+  static bool StructsAreEqual(
+      const ConfigOptions& config_options, const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* map,
+      const std::string& opt_name, const void* this_offset,
+      const void* that_offset, std::string* mismatch);
+
+  // Finds the entry for the opt_name in the opt_map, returning
+  // nullptr if not found.
+  // If found, elem_name will be the name of the option to find.
+  // This may be opt_name, or a substring of opt_name.
+  // For "simple" options, opt_name will be equal to elem_name.  Given the
+  // opt_name "opt", elem_name will equal "opt".
+  // For "embedded" options (like structs), elem_name may be opt_name
+  // or a field within the opt_name.  For example, given the struct "struct",
+  // and opt_name of "struct.field", elem_name will be "field".
+  static const OptionTypeInfo* Find(
+      const std::string& opt_name,
+      const std::unordered_map<std::string, OptionTypeInfo>& opt_map,
+      std::string* elem_name);
"{a={b=c;}" ) -- missing closing brace + // @return InvalidArgument if an expected delimiter is not found + // e.g. "{a=b}c=d;" -- missing delimiter before "c" + static Status NextToken(const std::string& opts, char delimiter, size_t start, + size_t* end, std::string* token); + + constexpr static const char* kIdPropName() { return "id"; } + constexpr static const char* kIdPropSuffix() { return ".id"; } + + private: + int offset_; + + // The optional function to convert a string to its representation + ParseFunc parse_func_; + + // The optional function to convert a value to its string representation + SerializeFunc serialize_func_; + + // The optional function to match two option values + EqualsFunc equals_func_; + + PrepareFunc prepare_func_; + ValidateFunc validate_func_; + OptionType type_; + OptionVerificationType verification_; + OptionTypeFlags flags_; +}; + +// Parses the input value into elements of the result array, which has fixed +// array size. For example, if the value=1:2:3 and elem_info parses integers, +// the result array will be {1,2,3}. Array size is defined in the OptionTypeInfo +// the input value has to match with that. +// @param config_options Controls how the option value is parsed. +// @param elem_info Controls how individual tokens in value are parsed +// @param separator Character separating tokens in values (':' in the above +// example) +// @param name The name associated with this array option +// @param value The input string to parse into tokens +// @param result Returns the results of parsing value into its elements. +// @return OK if the value was successfully parse +// @return InvalidArgument if the value is improperly formed or element number +// doesn't match array size defined in OptionTypeInfo +// or if the token could not be parsed +// @return NotFound If the tokenized value contains unknown options for +// its type +template +Status ParseArray(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::array* result) { + Status status; + + ConfigOptions copy = config_options; + copy.ignore_unsupported_options = false; + size_t i = 0, start = 0, end = 0; + for (; status.ok() && i < kSize && start < value.size() && + end != std::string::npos; + i++, start = end + 1) { + std::string token; + status = OptionTypeInfo::NextToken(value, separator, start, &end, &token); + if (status.ok()) { + status = elem_info.Parse(copy, name, token, &((*result)[i])); + if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + // If we were ignoring unsupported options and this one should be + // ignored, ignore it by setting the status to OK + status = Status::OK(); + } + } + } + if (!status.ok()) { + return status; + } + // make sure the element number matches the array size + if (i < kSize) { + return Status::InvalidArgument( + "Serialized value has less elements than array size", name); + } + if (start < value.size() && end != std::string::npos) { + return Status::InvalidArgument( + "Serialized value has more elements than array size", name); + } + return status; +} + +// Serializes the fixed size input array into its output value. Elements are +// separated by the separator character. This element will convert all of the +// elements in array into their serialized form, using elem_info to perform the +// serialization. +// For example, if the array contains the integers 1,2,3 and elem_info +// serializes the output would be 1:2:3 for separator ":". 
+
+// Serializes the fixed-size input array into its output value.  Elements are
+// separated by the separator character.  This function will convert all of
+// the elements in array into their serialized form, using elem_info to
+// perform the serialization.
+// For example, if the array contains the integers 1,2,3 and elem_info
+// serializes integers, the output would be 1:2:3 for separator ":".
+// @param config_options Controls how the option value is serialized.
+// @param elem_info Controls how individual tokens in value are serialized
+// @param separator Character separating tokens in value (':' in the above
+//   example)
+// @param name The name associated with this array option
+// @param array The input array to serialize
+// @param value The output string of serialized options
+// @return OK if the value was successfully serialized
+// @return InvalidArgument if the value is improperly formed or if a token
+//   could not be serialized
+// @return NotFound If the tokenized value contains unknown options for
+//   its type
+template <typename T, size_t kSize>
+Status SerializeArray(const ConfigOptions& config_options,
+                      const OptionTypeInfo& elem_info, char separator,
+                      const std::string& name,
+                      const std::array<T, kSize>& array, std::string* value) {
+  std::string result;
+  ConfigOptions embedded = config_options;
+  embedded.delimiter = ";";
+  int printed = 0;
+  for (const auto& elem : array) {
+    std::string elem_str;
+    Status s = elem_info.Serialize(embedded, name, &elem, &elem_str);
+    if (!s.ok()) {
+      return s;
+    } else if (!elem_str.empty()) {
+      if (printed++ > 0) {
+        result += separator;
+      }
+      // If the element contains embedded separators, put it inside of brackets
+      if (elem_str.find(separator) != std::string::npos) {
+        result += "{" + elem_str + "}";
+      } else {
+        result += elem_str;
+      }
+    }
+  }
+  if (result.find("=") != std::string::npos) {
+    *value = "{" + result + "}";
+  } else if (printed > 1 && result.at(0) == '{') {
+    *value = "{" + result + "}";
+  } else {
+    *value = result;
+  }
+  return Status::OK();
+}
+
+// Compares the input arrays array1 and array2 for equality.
+// Elements of the arrays are compared one by one using elem_info to perform
+// the comparison.
+//
+// @param config_options Controls how the arrays are compared.
+// @param elem_info Controls how individual elements in the arrays are compared
+// @param name The name associated with this array option
+// @param array1,array2 The arrays to compare.
+// @param mismatch If the arrays are not equivalent, mismatch will point to
+//   the first element of the comparison that did not match.
+// @return true If array1 and array2 are "equal", false otherwise
+template <typename T, size_t kSize>
+bool ArraysAreEqual(const ConfigOptions& config_options,
+                    const OptionTypeInfo& elem_info, const std::string& name,
+                    const std::array<T, kSize>& array1,
+                    const std::array<T, kSize>& array2, std::string* mismatch) {
+  assert(array1.size() == kSize);
+  assert(array2.size() == kSize);
+  for (size_t i = 0; i < kSize; ++i) {
+    if (!elem_info.AreEqual(config_options, name, &array1[i], &array2[i],
+                            mismatch)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Parses the input value into elements of the result vector.  This method
+// will break the input value into the individual tokens (based on the
+// separator), where each of those tokens will be parsed based on the rules of
+// elem_info.  The result vector will be populated with elements based on the
+// input tokens.  For example, if the value=1:2:3:4:5 and elem_info parses
+// integers, the result vector will contain the integers 1,2,3,4,5.
+// @param config_options Controls how the option value is parsed.
+// @param elem_info Controls how individual tokens in value are parsed
+// @param separator Character separating tokens in values (':' in the above
+//   example)
+// @param name The name associated with this vector option
+// @param value The input string to parse into tokens
+// @param result Returns the results of parsing value into its elements.
+// @return OK if the value was successfully parsed
+// @return InvalidArgument if the value is improperly formed or if a token
+//   could not be parsed
+// @return NotFound If the tokenized value contains unknown options for
+//   its type
+template <typename T>
+Status ParseVector(const ConfigOptions& config_options,
+                   const OptionTypeInfo& elem_info, char separator,
+                   const std::string& name, const std::string& value,
+                   std::vector<T>* result) {
+  result->clear();
+  Status status;
+
+  // Turn off ignore_unsupported_options so we can tell if the returned
+  // object is valid or not.
+  ConfigOptions copy = config_options;
+  copy.ignore_unsupported_options = false;
+  for (size_t start = 0, end = 0;
+       status.ok() && start < value.size() && end != std::string::npos;
+       start = end + 1) {
+    std::string token;
+    status = OptionTypeInfo::NextToken(value, separator, start, &end, &token);
+    if (status.ok()) {
+      T elem;
+      status = elem_info.Parse(copy, name, token, &elem);
+      if (status.ok()) {
+        result->emplace_back(elem);
+      } else if (config_options.ignore_unsupported_options &&
+                 status.IsNotSupported()) {
+        // If we were ignoring unsupported options and this one should be
+        // ignored, ignore it by setting the status to OK
+        status = Status::OK();
+      }
+    }
+  }
+  return status;
+}
+
+// Serializes the input vector into its output value.  Elements are
+// separated by the separator character.  This function will convert all of
+// the elements in vec into their serialized form, using elem_info to perform
+// the serialization.
+// For example, if vec contains the integers 1,2,3,4,5 and elem_info
+// serializes integers, the output would be 1:2:3:4:5 for separator ":".
+// @param config_options Controls how the option value is serialized.
+// @param elem_info Controls how individual tokens in value are serialized
+// @param separator Character separating tokens in value (':' in the above
+//   example)
+// @param name The name associated with this vector option
+// @param vec The input vector to serialize
+// @param value The output string of serialized options
+// @return OK if the value was successfully serialized
+// @return InvalidArgument if the value is improperly formed or if a token
+//   could not be serialized
+// @return NotFound If the tokenized value contains unknown options for
+//   its type
+template <typename T>
+Status SerializeVector(const ConfigOptions& config_options,
+                       const OptionTypeInfo& elem_info, char separator,
+                       const std::string& name, const std::vector<T>& vec,
+                       std::string* value) {
+  std::string result;
+  ConfigOptions embedded = config_options;
+  embedded.delimiter = ";";
+  int printed = 0;
+  for (const auto& elem : vec) {
+    std::string elem_str;
+    Status s = elem_info.Serialize(embedded, name, &elem, &elem_str);
+    if (!s.ok()) {
+      return s;
+    } else if (!elem_str.empty()) {
+      if (printed++ > 0) {
+        result += separator;
+      }
+      // If the element contains embedded separators, put it inside of brackets
+      if (elem_str.find(separator) != std::string::npos) {
+        result += "{" + elem_str + "}";
+      } else {
+        result += elem_str;
+      }
+    }
+  }
+  if (result.find("=") != std::string::npos) {
+    *value = "{" + result + "}";
+  } else if (printed > 1 && result.at(0) == '{') {
+    *value = "{" + result + "}";
+  } else {
+    *value = result;
+  }
+  return Status::OK();
+}
+
+// Compares the input vectors vec1 and vec2 for equality.
+// If the vectors are the same size, elements of the vectors are compared one
+// by one using elem_info to perform the comparison.
+//
+// @param config_options Controls how the vectors are compared.
+// @param elem_info Controls how individual elements in the vectors are
+//   compared
+// @param name The name associated with this vector option
+// @param vec1,vec2 The vectors to compare.
+// @param mismatch If the vectors are not equivalent, mismatch will point to
+//   the first element of the comparison that did not match.
+// @return true If vec1 and vec2 are "equal", false otherwise
+template <typename T>
+bool VectorsAreEqual(const ConfigOptions& config_options,
+                     const OptionTypeInfo& elem_info, const std::string& name,
+                     const std::vector<T>& vec1, const std::vector<T>& vec2,
+                     std::string* mismatch) {
+  if (vec1.size() != vec2.size()) {
+    *mismatch = name;
+    return false;
+  } else {
+    for (size_t i = 0; i < vec1.size(); ++i) {
+      if (!elem_info.AreEqual(
+              config_options, name, reinterpret_cast<const char*>(&vec1[i]),
+              reinterpret_cast<const char*>(&vec2[i]), mismatch)) {
+        return false;
+      }
+    }
+    return true;
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/options_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/options_util.h
new file mode 100644
index 0000000000..8d9488f5f8
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/options_util.h
@@ -0,0 +1,105 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// This file contains utility functions for RocksDB Options.
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct ConfigOptions;
+// Constructs the DBOptions and ColumnFamilyDescriptors by loading the
+// latest RocksDB options file stored in the specified rocksdb database.
+//
+// Note that all the pointer options (except table_factory, which will
+// be described in more detail below) will be initialized with the default
+// values.  Developers can further initialize them after this function call.
+// Below is an example list of pointer options which will be initialized:
+//
+// * env
+// * memtable_factory
+// * compaction_filter_factory
+// * prefix_extractor
+// * comparator
+// * merge_operator
+// * compaction_filter
+//
+// User can also choose to load customized comparator, env, and/or
+// merge_operator through the object registry:
+// * comparator needs to be registered through Registrar<const Comparator*>
+// * env needs to be registered through Registrar<Env*>
+// * merge operator needs to be registered through
+//   Registrar<std::shared_ptr<MergeOperator>>.
+//
+// For table_factory, this function further supports deserializing
+// BlockBasedTableFactory and its BlockBasedTableOptions except the
+// pointer options of BlockBasedTableOptions (flush_block_policy_factory,
+// block_cache, and block_cache_compressed), which will be initialized with
+// default values.  Developers can further specify these three options by
+// casting the return value of TableFactory::GetOptions() to
+// BlockBasedTableOptions and making necessary changes.
+//
+// config_options contains a set of options that controls the processing
+// of the options.
+//
+// config_options.ignore_unknown_options can be set to true if you want to
+// ignore options that are from a newer version of the db, essentially for
+// forward compatibility.
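+//
+// A minimal usage sketch (illustrative only; "/tmp/db" is a hypothetical
+// path):
+//
+//   ConfigOptions config;
+//   DBOptions db_opts;
+//   std::vector<ColumnFamilyDescriptor> cf_descs;
+//   Status s = LoadLatestOptions(config, "/tmp/db", &db_opts, &cf_descs);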
+//
+// examples/options_file_example.cc demonstrates how to use this function
+// to open a RocksDB instance.
+//
+// @return the function returns an OK status when it went successfully.  If
+//    the specified "dbpath" does not contain any option file, then a
+//    Status::NotFound will be returned.  A return value other than
+//    Status::OK or Status::NotFound indicates there is some error related
+//    to the options file itself.
+//
+// @see LoadOptionsFromFile
+Status LoadLatestOptions(const ConfigOptions& config_options,
+                         const std::string& dbpath, DBOptions* db_options,
+                         std::vector<ColumnFamilyDescriptor>* cf_descs,
+                         std::shared_ptr<Cache>* cache = {});
+
+// Similar to LoadLatestOptions, this function constructs the DBOptions
+// and ColumnFamilyDescriptors based on the specified RocksDB Options file.
+//
+// @see LoadLatestOptions
+Status LoadOptionsFromFile(const ConfigOptions& config_options,
+                           const std::string& options_file_name,
+                           DBOptions* db_options,
+                           std::vector<ColumnFamilyDescriptor>* cf_descs,
+                           std::shared_ptr<Cache>* cache = {});
+
+// Returns the latest options file name under the specified db path.
+Status GetLatestOptionsFileName(const std::string& dbpath, Env* env,
+                                std::string* options_file_name);
+
+// Returns Status::OK if the input DBOptions and ColumnFamilyDescriptors
+// are compatible with the latest options stored in the specified DB path.
+//
+// If the return status is non-ok, it means the specified RocksDB instance
+// might not be correctly opened with the input set of options.  Currently,
+// changing one of the following options will fail the compatibility check:
+//
+// * comparator
+// * prefix_extractor
+// * table_factory
+// * merge_operator
+// * persist_user_defined_timestamps
+Status CheckOptionsCompatibility(
+    const ConfigOptions& config_options, const std::string& dbpath,
+    const DBOptions& db_options,
+    const std::vector<ColumnFamilyDescriptor>& cf_descs);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/replayer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/replayer.h
new file mode 100644
index 0000000000..fc5319989b
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/replayer.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <functional>
+#include <memory>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TraceRecord;
+class TraceRecordResult;
+
+struct ReplayOptions {
+  // Number of threads used for replaying.  If 0 or 1, replay using
+  // a single thread.
+  uint32_t num_threads;
+
+  // Enables fast forwarding a replay by increasing/reducing the delay between
+  // the ingested traces.
+  // If > 0.0 and < 1.0, slow down the replay by this amount.
+  // If 1.0, replay the operations at the same rate as in the trace stream.
+  // If > 1.0, speed up the replay by this amount.
+  double fast_forward;
+
+  ReplayOptions() : num_threads(1), fast_forward(1.0) {}
+
+  ReplayOptions(uint32_t num_of_threads, double fast_forward_ratio)
+      : num_threads(num_of_threads), fast_forward(fast_forward_ratio) {}
+};
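+
+// A minimal usage sketch (illustrative only): replay with four threads at
+// twice the recorded speed.
+//
+//   ReplayOptions replay_opts(4 /*num_of_threads*/, 2.0 /*fast_forward*/);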
+
+// Replayer helps to replay the captured RocksDB query-level operations.
+// The Replayer can either be created from the DB::NewReplayer method, or be
+// instantiated via db_bench by using the "replay" benchmark.
+class Replayer {
+ public:
+  virtual ~Replayer() = default;
+
+  // Make some preparation before replaying the trace.  This will also reset
+  // the replayer in order to restart replaying.
+  virtual Status Prepare() = 0;
+
+  // Return the timestamp when the trace recording was started.
+  virtual uint64_t GetHeaderTimestamp() const = 0;
+
+  // Atomically read one trace into a TraceRecord (excluding the header and
+  // footer traces).
+  // Return Status::OK() on success;
+  //        Status::Incomplete() if Prepare() was not called or there is no
+  //        more available trace;
+  //        Status::NotSupported() if the read trace type is not supported.
+  virtual Status Next(std::unique_ptr<TraceRecord>* record) = 0;
+
+  // Execute one TraceRecord.
+  // Return Status::OK() if the execution was successful.  Get/MultiGet traces
+  //        will still return Status::OK() even if they got Status::NotFound()
+  //        from DB::Get() or DB::MultiGet();
+  //        Status::Incomplete() if Prepare() was not called or there is no
+  //        more available trace;
+  //        Status::NotSupported() if the operation is not supported;
+  //        Otherwise, return the corresponding error status.
+  //
+  // The actual operation execution status and result(s) will be saved in
+  // result.  For example, a GetQueryTraceRecord will have its DB::Get() status
+  // and the returned value saved in a SingleValueTraceExecutionResult.
+  virtual Status Execute(const std::unique_ptr<TraceRecord>& record,
+                         std::unique_ptr<TraceRecordResult>* result) = 0;
+
+  // Replay all the traces from the provided trace stream, taking the delay
+  // between the traces into consideration.
+  //
+  // result_callback reports the status of executing a trace record, and the
+  // actual operation execution result (See the description for Execute()).
+  virtual Status Replay(
+      const ReplayOptions& options,
+      const std::function<void(Status, std::unique_ptr<TraceRecordResult>&&)>&
+          result_callback) = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/sim_cache.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/sim_cache.h
new file mode 100644
index 0000000000..6c52453e7e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/sim_cache.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/advanced_cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SimCache;
+
+// For instrumentation purposes, use NewSimCache instead of the NewLRUCache
+// API.  NewSimCache is a wrapper function returning a SimCache instance that
+// provides, in addition to the Cache interface, an interface (the SimCache
+// class) to predict the block cache hit rate without actually allocating the
+// memory.  It can help users tune their current block cache size, and
+// determine how efficiently they are using the memory.
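+//
+// A minimal usage sketch (illustrative only): simulate a 1GB cache on top of
+// a real 256MB cache.
+//
+//   std::shared_ptr<Cache> real = NewLRUCache(256 << 20);
+//   std::shared_ptr<SimCache> sim =
+//       NewSimCache(real, 1 << 30 /*sim_capacity*/, 6 /*num_shard_bits*/);
+//   // Use sim as the block cache; later inspect the counters via
+//   // sim->get_hit_counter() and sim->get_miss_counter().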
+//
+// Since GetSimCapacity() returns the capacity for simulation, it differs from
+// actual memory usage, which can be estimated as:
+// sim_capacity * entry_size / (entry_size + block_size),
+// where 76 <= entry_size <= 104, and
+// BlockBasedTableOptions.block_size = 4096 by default but is configurable.
+// Therefore, the actual memory overhead of SimCache is generally less than
+// sim_capacity * 2%.
+extern std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> cache,
+                                             size_t sim_capacity,
+                                             int num_shard_bits);
+
+extern std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<SimCache> sim_cache,
+                                             std::shared_ptr<Cache> cache,
+                                             int num_shard_bits);
+
+// An abstract base class (public interface) to the SimCache implementation
+class SimCache : public CacheWrapper {
+ public:
+  using CacheWrapper::CacheWrapper;
+
+  // Returns the maximum configured capacity of the simcache for simulation
+  virtual size_t GetSimCapacity() const = 0;
+
+  // simcache doesn't provide an internal handle reference to the user, so
+  // PinnedUsage is always 0 and the behavior is not exactly consistent with
+  // the real cache.
+  // Returns the memory size for the entries residing in the simcache.
+  virtual size_t GetSimUsage() const = 0;
+
+  // Sets the maximum configured capacity of the simcache.  When the new
+  // capacity is less than the old capacity and the existing usage is
+  // greater than the new capacity, the implementation will purge old entries
+  // to fit the new capacity.
+  virtual void SetSimCapacity(size_t capacity) = 0;
+
+  // Returns the miss count of the simcache
+  virtual uint64_t get_miss_counter() const = 0;
+  // Returns the hit count of the simcache
+  virtual uint64_t get_hit_counter() const = 0;
+  // Resets the lookup and hit counters
+  virtual void reset_counter() = 0;
+  // String representation of the statistics of the simcache
+  virtual std::string ToString() const = 0;
+
+  // Start storing logs of the cache activity (Add/Lookup) into a file located
+  // at activity_log_file.  max_logging_size can be used to stop logging to
+  // the file automatically after reaching a specific size in bytes; a value
+  // of 0 disables this feature.
+  virtual Status StartActivityLogging(const std::string& activity_log_file,
+                                      Env* env,
+                                      uint64_t max_logging_size = 0) = 0;
+
+  // Stop cache activity logging if any
+  virtual void StopActivityLogging() = 0;
+
+  // Status of cache logging happening in the background
+  virtual Status GetActivityLoggingStatus() = 0;
+
+ private:
+  SimCache(const SimCache&);
+  SimCache& operator=(const SimCache&);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/stackable_db.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/stackable_db.h
new file mode 100644
index 0000000000..1a87a11366
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/stackable_db.h
@@ -0,0 +1,587 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+
+#include "rocksdb/db.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// This class contains APIs to stack rocksdb wrappers, e.g.
+// stacking TTL over the base db.
+class StackableDB : public DB {
+ public:
+  // StackableDB takes sole ownership of the underlying db.
+  explicit StackableDB(DB* db) : db_(db) {}
+
+  // StackableDB takes shared ownership of the underlying db.
+  explicit StackableDB(std::shared_ptr<DB> db)
+      : db_(db.get()), shared_db_ptr_(db) {}
+
+  ~StackableDB() {
+    if (shared_db_ptr_ == nullptr) {
+      delete db_;
+    } else {
+      assert(shared_db_ptr_.get() == db_);
+    }
+    db_ = nullptr;
+  }
+
+  virtual Status Close() override { return db_->Close(); }
+
+  virtual DB* GetBaseDB() { return db_; }
+
+  virtual DB* GetRootDB() override { return db_->GetRootDB(); }
+
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle) override {
+    return db_->CreateColumnFamily(options, column_family_name, handle);
+  }
+
+  virtual Status CreateColumnFamilies(
+      const ColumnFamilyOptions& options,
+      const std::vector<std::string>& column_family_names,
+      std::vector<ColumnFamilyHandle*>* handles) override {
+    return db_->CreateColumnFamilies(options, column_family_names, handles);
+  }
+
+  virtual Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles) override {
+    return db_->CreateColumnFamilies(column_families, handles);
+  }
+
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override {
+    return db_->DropColumnFamily(column_family);
+  }
+
+  virtual Status DropColumnFamilies(
+      const std::vector<ColumnFamilyHandle*>& column_families) override {
+    return db_->DropColumnFamilies(column_families);
+  }
+
+  virtual Status DestroyColumnFamilyHandle(
+      ColumnFamilyHandle* column_family) override {
+    return db_->DestroyColumnFamilyHandle(column_family);
+  }
+
+  using DB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& val) override {
+    return db_->Put(options, column_family, key, val);
+  }
+  Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, const Slice& ts, const Slice& val) override {
+    return db_->Put(options, column_family, key, ts, val);
+  }
+
+  using DB::PutEntity;
+  Status PutEntity(const WriteOptions& options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   const WideColumns& columns) override {
+    return db_->PutEntity(options, column_family, key, columns);
+  }
+
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override {
+    return db_->Get(options, column_family, key, value);
+  }
+
+  using DB::GetEntity;
+  Status GetEntity(const ReadOptions& options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   PinnableWideColumns* columns) override {
+    return db_->GetEntity(options, column_family, key, columns);
+  }
+
+  using DB::GetMergeOperands;
+  virtual Status GetMergeOperands(
+      const ReadOptions& options, ColumnFamilyHandle* column_family,
+      const Slice& key, PinnableSlice* slice,
+      GetMergeOperandsOptions* get_merge_operands_options,
+      int* number_of_operands) override {
+    return db_->GetMergeOperands(options, column_family, key, slice,
+                                 get_merge_operands_options,
+                                 number_of_operands);
+  }
+
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override {
+    return db_->MultiGet(options, column_family, keys, values);
+  }
+
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const
size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override { + return db_->MultiGet(options, column_family, num_keys, keys, values, + statuses, sorted_input); + } + + using DB::MultiGetEntity; + + void MultiGetEntity(const ReadOptions& options, + ColumnFamilyHandle* column_family, size_t num_keys, + const Slice* keys, PinnableWideColumns* results, + Status* statuses, bool sorted_input) override { + db_->MultiGetEntity(options, column_family, num_keys, keys, results, + statuses, sorted_input); + } + + void MultiGetEntity(const ReadOptions& options, size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableWideColumns* results, Status* statuses, + bool sorted_input) override { + db_->MultiGetEntity(options, num_keys, column_families, keys, results, + statuses, sorted_input); + } + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector& external_files, + const IngestExternalFileOptions& options) override { + return db_->IngestExternalFile(column_family, external_files, options); + } + + using DB::IngestExternalFiles; + virtual Status IngestExternalFiles( + const std::vector& args) override { + return db_->IngestExternalFiles(args); + } + + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) override { + return db_->CreateColumnFamilyWithImport(options, column_family_name, + import_options, metadata, handle); + } + + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) override { + return db_->ClipColumnFamily(column_family, begin_key, end_key); + } + + using DB::VerifyFileChecksums; + Status VerifyFileChecksums(const ReadOptions& read_opts) override { + return db_->VerifyFileChecksums(read_opts); + } + + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } + + virtual Status VerifyChecksum(const ReadOptions& options) override { + return db_->VerifyChecksum(options); + } + + using DB::KeyMayExist; + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, + bool* value_found = nullptr) override { + return db_->KeyMayExist(options, column_family, key, value, value_found); + } + + using DB::Delete; + virtual Status Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override { + return db_->Delete(wopts, column_family, key); + } + Status Delete(const WriteOptions& wopts, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts) override { + return db_->Delete(wopts, column_family, key, ts); + } + + using DB::SingleDelete; + virtual Status SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override { + return db_->SingleDelete(wopts, column_family, key); + } + Status SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override { + return db_->SingleDelete(wopts, column_family, key, ts); + } + + using DB::DeleteRange; + Status DeleteRange(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, const Slice& start_key, + const Slice& end_key) override { 
+ return db_->DeleteRange(wopts, column_family, start_key, end_key); + } + + using DB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override { + return db_->Merge(options, column_family, key, value); + } + Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& value) override { + return db_->Merge(options, column_family, key, ts, value); + } + + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override { + return db_->Write(opts, updates); + } + + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) override { + return db_->NewIterator(opts, column_family); + } + + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override { + return db_->NewIterators(options, column_families, iterators); + } + + virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); } + + virtual void ReleaseSnapshot(const Snapshot* snapshot) override { + return db_->ReleaseSnapshot(snapshot); + } + + using DB::GetMapProperty; + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) override { + return db_->GetProperty(column_family, property, value); + } + virtual bool GetMapProperty( + ColumnFamilyHandle* column_family, const Slice& property, + std::map* value) override { + return db_->GetMapProperty(column_family, property, value); + } + + using DB::GetIntProperty; + virtual bool GetIntProperty(ColumnFamilyHandle* column_family, + const Slice& property, uint64_t* value) override { + return db_->GetIntProperty(column_family, property, value); + } + + using DB::GetAggregatedIntProperty; + virtual bool GetAggregatedIntProperty(const Slice& property, + uint64_t* value) override { + return db_->GetAggregatedIntProperty(property, value); + } + + using DB::GetApproximateSizes; + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* r, int n, + uint64_t* sizes) override { + return db_->GetApproximateSizes(options, column_family, r, n, sizes); + } + + using DB::GetApproximateMemTableStats; + virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, + const Range& range, + uint64_t* const count, + uint64_t* const size) override { + return db_->GetApproximateMemTableStats(column_family, range, count, size); + } + + using DB::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) override { + return db_->CompactRange(options, column_family, begin, end); + } + + using DB::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) override { + return db_->CompactFiles(compact_options, column_family, input_file_names, + output_level, output_path_id, output_file_names, + compaction_job_info); + } + + virtual Status PauseBackgroundWork() override { + return db_->PauseBackgroundWork(); + } + virtual Status ContinueBackgroundWork() override { + return 
db_->ContinueBackgroundWork(); + } + + virtual Status EnableAutoCompaction( + const std::vector& column_family_handles) override { + return db_->EnableAutoCompaction(column_family_handles); + } + + virtual void EnableManualCompaction() override { + return db_->EnableManualCompaction(); + } + virtual void DisableManualCompaction() override { + return db_->DisableManualCompaction(); + } + + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family) override { + return db_->NumberLevels(column_family); + } + + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel( + ColumnFamilyHandle* column_family) override { + return db_->MaxMemCompactionLevel(column_family); + } + + using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger( + ColumnFamilyHandle* column_family) override { + return db_->Level0StopWriteTrigger(column_family); + } + + virtual const std::string& GetName() const override { return db_->GetName(); } + + virtual Env* GetEnv() const override { return db_->GetEnv(); } + + virtual FileSystem* GetFileSystem() const override { + return db_->GetFileSystem(); + } + + using DB::GetOptions; + virtual Options GetOptions(ColumnFamilyHandle* column_family) const override { + return db_->GetOptions(column_family); + } + + using DB::GetDBOptions; + virtual DBOptions GetDBOptions() const override { + return db_->GetDBOptions(); + } + + using DB::Flush; + virtual Status Flush(const FlushOptions& fopts, + ColumnFamilyHandle* column_family) override { + return db_->Flush(fopts, column_family); + } + virtual Status Flush( + const FlushOptions& fopts, + const std::vector& column_families) override { + return db_->Flush(fopts, column_families); + } + + virtual Status SyncWAL() override { return db_->SyncWAL(); } + + virtual Status FlushWAL(bool sync) override { return db_->FlushWAL(sync); } + + virtual Status LockWAL() override { return db_->LockWAL(); } + + virtual Status UnlockWAL() override { return db_->UnlockWAL(); } + + + virtual Status DisableFileDeletions() override { + return db_->DisableFileDeletions(); + } + + virtual Status EnableFileDeletions(bool force) override { + return db_->EnableFileDeletions(force); + } + + virtual void GetLiveFilesMetaData( + std::vector* metadata) override { + db_->GetLiveFilesMetaData(metadata); + } + + virtual Status GetLiveFilesChecksumInfo( + FileChecksumList* checksum_list) override { + return db_->GetLiveFilesChecksumInfo(checksum_list); + } + + virtual Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector* files) override { + return db_->GetLiveFilesStorageInfo(opts, files); + } + + virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* cf_meta) override { + db_->GetColumnFamilyMetaData(column_family, cf_meta); + } + + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) override { + return db_->StartBlockCacheTrace(trace_options, std::move(trace_writer)); + } + + Status StartBlockCacheTrace( + const BlockCacheTraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartBlockCacheTrace(options, std::move(trace_writer)); + } + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); } + + using DB::StartIOTrace; + Status StartIOTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartIOTrace(options, std::move(trace_writer)); + } + + 
using DB::EndIOTrace; + Status EndIOTrace() override { return db_->EndIOTrace(); } + + using DB::StartTrace; + Status StartTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartTrace(options, std::move(trace_writer)); + } + + using DB::EndTrace; + Status EndTrace() override { return db_->EndTrace(); } + + using DB::NewDefaultReplayer; + Status NewDefaultReplayer(const std::vector& handles, + std::unique_ptr&& reader, + std::unique_ptr* replayer) override { + return db_->NewDefaultReplayer(handles, std::move(reader), replayer); + } + + + virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, + bool flush_memtable = true) override { + return db_->GetLiveFiles(vec, mfs, flush_memtable); + } + + virtual SequenceNumber GetLatestSequenceNumber() const override { + return db_->GetLatestSequenceNumber(); + } + + Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string ts_low) override { + return db_->IncreaseFullHistoryTsLow(column_family, ts_low); + } + + Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) override { + return db_->GetFullHistoryTsLow(column_family, ts_low); + } + + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + return db_->GetSortedWalFiles(files); + } + + virtual Status GetCurrentWalFile( + std::unique_ptr* current_log_file) override { + return db_->GetCurrentWalFile(current_log_file); + } + + virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override { + return db_->GetCreationTimeOfOldestFile(creation_time); + } + + // WARNING: This API is planned for removal in RocksDB 7.0 since it does not + // operate at the proper level of abstraction for a key-value store, and its + // contract/restrictions are poorly documented. For example, it returns non-OK + // `Status` for non-bottommost files and files undergoing compaction. Since we + // do not plan to maintain it, the contract will likely remain underspecified + // until its removal. Any user is encouraged to read the implementation + // carefully and migrate away from it when possible. 
+  virtual Status DeleteFile(std::string name) override {
+    return db_->DeleteFile(name);
+  }
+
+  virtual Status GetDbIdentity(std::string& identity) const override {
+    return db_->GetDbIdentity(identity);
+  }
+
+  virtual Status GetDbSessionId(std::string& session_id) const override {
+    return db_->GetDbSessionId(session_id);
+  }
+
+  using DB::SetOptions;
+  virtual Status SetOptions(ColumnFamilyHandle* column_family_handle,
+                            const std::unordered_map<std::string, std::string>&
+                                new_options) override {
+    return db_->SetOptions(column_family_handle, new_options);
+  }
+
+  virtual Status SetDBOptions(
+      const std::unordered_map<std::string, std::string>& new_options)
+      override {
+    return db_->SetDBOptions(new_options);
+  }
+
+  using DB::ResetStats;
+  virtual Status ResetStats() override { return db_->ResetStats(); }
+
+  using DB::GetPropertiesOfAllTables;
+  virtual Status GetPropertiesOfAllTables(
+      ColumnFamilyHandle* column_family,
+      TablePropertiesCollection* props) override {
+    return db_->GetPropertiesOfAllTables(column_family, props);
+  }
+
+  using DB::GetPropertiesOfTablesInRange;
+  virtual Status GetPropertiesOfTablesInRange(
+      ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+      TablePropertiesCollection* props) override {
+    return db_->GetPropertiesOfTablesInRange(column_family, range, n, props);
+  }
+
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options) override {
+    return db_->GetUpdatesSince(seq_number, iter, read_options);
+  }
+
+  virtual Status SuggestCompactRange(ColumnFamilyHandle* column_family,
+                                     const Slice* begin,
+                                     const Slice* end) override {
+    return db_->SuggestCompactRange(column_family, begin, end);
+  }
+
+  virtual Status PromoteL0(ColumnFamilyHandle* column_family,
+                           int target_level) override {
+    return db_->PromoteL0(column_family, target_level);
+  }
+
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
+    return db_->DefaultColumnFamily();
+  }
+
+  Status TryCatchUpWithPrimary() override {
+    return db_->TryCatchUpWithPrimary();
+  }
+
+ protected:
+  DB* db_;
+  std::shared_ptr<DB> shared_db_ptr_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/table_properties_collectors.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/table_properties_collectors.h
new file mode 100644
index 0000000000..f9d8d5dcdd
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/table_properties_collectors.h
@@ -0,0 +1,88 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A factory of a table property collector that marks an SST
+// file as need-compaction when it observes at least "D" deletion
+// entries in any "N" consecutive entries, or when the ratio of tombstone
+// entries in the whole file is >= the specified deletion ratio.
+class CompactOnDeletionCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  // A factory of a table property collector that marks an SST
+  // file as need-compaction when it observes at least "D" deletion
+  // entries in any "N" consecutive entries, or when the ratio of tombstone
+  // entries is >= deletion_ratio.
+  //
+  // @param sliding_window_size "N"
+  // @param deletion_trigger "D"
+  // @param deletion_ratio, if <= 0 or > 1, disables triggering compaction
+  //     based on deletion ratio.
+  CompactOnDeletionCollectorFactory(size_t sliding_window_size,
+                                    size_t deletion_trigger,
+                                    double deletion_ratio);
+
+  ~CompactOnDeletionCollectorFactory() {}
+
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context context) override;
+
+  // Change the value of sliding_window_size "N"
+  // Setting it to 0 disables the delete triggered compaction
+  void SetWindowSize(size_t sliding_window_size) {
+    sliding_window_size_.store(sliding_window_size);
+  }
+  size_t GetWindowSize() const { return sliding_window_size_.load(); }
+
+  // Change the value of deletion_trigger "D"
+  void SetDeletionTrigger(size_t deletion_trigger) {
+    deletion_trigger_.store(deletion_trigger);
+  }
+
+  size_t GetDeletionTrigger() const { return deletion_trigger_.load(); }
+  // Change deletion ratio.
+  // @param deletion_ratio, if <= 0 or > 1, disables triggering compaction
+  //     based on deletion ratio.
+  void SetDeletionRatio(double deletion_ratio) {
+    deletion_ratio_.store(deletion_ratio);
+  }
+
+  double GetDeletionRatio() const { return deletion_ratio_.load(); }
+  static const char* kClassName() { return "CompactOnDeletionCollector"; }
+  const char* Name() const override { return kClassName(); }
+
+  std::string ToString() const override;
+
+ private:
+  std::atomic<size_t> sliding_window_size_;
+  std::atomic<size_t> deletion_trigger_;
+  std::atomic<double> deletion_ratio_;
+};
+
+// Creates a factory of a table property collector that marks an SST
+// file as need-compaction when it observes at least "D" deletion
+// entries in any "N" consecutive entries, or when the ratio of tombstone
+// entries is >= deletion_ratio.
+//
+// @param sliding_window_size "N". Note that this number will be
+//     rounded up to the smallest multiple of 128 that is no less
+//     than the specified size.
+// @param deletion_trigger "D". Note that even when "N" is changed,
+//     the specified number for "D" will not be changed.
+// @param deletion_ratio, if <= 0 or > 1, disables triggering compaction
+//     based on deletion ratio. Disabled by default.
+extern std::shared_ptr<CompactOnDeletionCollectorFactory>
+NewCompactOnDeletionCollectorFactory(size_t sliding_window_size,
+                                     size_t deletion_trigger,
+                                     double deletion_ratio = 0);
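+
+// A minimal usage sketch (illustrative, not part of the upstream header).
+// It assumes an `options` object of type rocksdb::Options; the factory is
+// installed so that tombstone-heavy SST files get marked for compaction:
+//
+//   options.table_properties_collector_factories.emplace_back(
+//       NewCompactOnDeletionCollectorFactory(/*sliding_window_size=*/128,
+//                                            /*deletion_trigger=*/50,
+//                                            /*deletion_ratio=*/0.4));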
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/transaction.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/transaction.h
new file mode 100644
index 0000000000..947fcec55a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/transaction.h
@@ -0,0 +1,683 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <limits>
+#include <memory>
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Iterator;
+class TransactionDB;
+class WriteBatchWithIndex;
+
+using TransactionName = std::string;
+
+using TransactionID = uint64_t;
+
+using TxnTimestamp = uint64_t;
+
+constexpr TxnTimestamp kMaxTxnTimestamp =
+    std::numeric_limits<TxnTimestamp>::max();
+
+/*
+  class Endpoint allows one to define prefix ranges.
+
+  Prefix ranges are introduced below.
+
+  == Basic Ranges ==
+  Let's start with basic ranges. The Key Comparator defines an ordering of
+  rowkeys. Then, one can specify finite closed ranges by just providing the
+  rowkeys of their endpoints:
+
+    lower_endpoint <= X <= upper_endpoint
+
+  However, our goal is to provide a richer set of endpoints. Read on.
+
+  == Lexicographic ordering ==
+  A lexicographic (or dictionary) ordering satisfies this criterion: if there
+  are two keys of the form
+    key_a = {prefix_a, suffix_a}
+    key_b = {prefix_b, suffix_b}
+  and
+    prefix_a < prefix_b
+  then
+    key_a < key_b.
+
+  == Prefix ranges ==
+  With lexicographic ordering, one may want to define ranges of the form
+
+     "prefix is $PREFIX"
+
+  which translates to a range of the form
+
+    {$PREFIX, -infinity} < X < {$PREFIX, +infinity}
+
+  where -infinity will compare less than any possible suffix, and +infinity
+  will compare as greater than any possible suffix.
+
+  class Endpoint allows one to define these kinds of ranges.
+
+  == Notes ==
+  BytewiseComparator and ReverseBytewiseComparator produce lexicographic
+  ordering.
+
+  The row comparison function is able to compare key prefixes. If the data
+  domain includes keys A and B, then the comparison function is able to compare
+  equal-length prefixes:
+
+    min_len = min(byte_length(A), byte_length(B));
+    cmp(Slice(A, min_len), Slice(B, min_len));  // this call is valid
+
+  == Other options ==
+  As far as MyRocks is concerned, the alternative to prefix ranges would be to
+  support both open (non-inclusive) and closed (inclusive) range endpoints.
+*/
+
+class Endpoint {
+ public:
+  Slice slice;
+
+  /*
+    true  : the key has a "+infinity" suffix, a suffix that would compare as
+            greater than any other suffix
+    false : otherwise
+  */
+  bool inf_suffix;
+
+  explicit Endpoint(const Slice& slice_arg, bool inf_suffix_arg = false)
+      : slice(slice_arg), inf_suffix(inf_suffix_arg) {}
+
+  explicit Endpoint(const char* s, bool inf_suffix_arg = false)
+      : slice(s), inf_suffix(inf_suffix_arg) {}
+
+  Endpoint(const char* s, size_t size, bool inf_suffix_arg = false)
+      : slice(s, size), inf_suffix(inf_suffix_arg) {}
+
+  Endpoint() : inf_suffix(false) {}
+};
+
+// Provides notification to the caller of SetSnapshotOnNextOperation when
+// the actual snapshot gets created
+class TransactionNotifier {
+ public:
+  virtual ~TransactionNotifier() {}
+
+  // Implement this method to receive notification when a snapshot is
+  // requested via SetSnapshotOnNextOperation.
+  // Do not take exclusive ownership of `newSnapshot` because it is shared with
+  // the underlying transaction.
+  virtual void SnapshotCreated(const Snapshot* newSnapshot) = 0;
+};
+
+// Provides BEGIN/COMMIT/ROLLBACK transactions.
+//
+// To use transactions, you must first create either an OptimisticTransactionDB
+// or a TransactionDB.  See examples/[optimistic_]transaction_example.cc for
+// more information.
+//
+// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction().
+//
+// It is up to the caller to synchronize access to this object.
+//
+// See examples/transaction_example.cc for some simple examples.
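+//
+// A minimal sketch (illustrative only; `txn_db` is an assumed TransactionDB*
+// obtained from TransactionDB::Open()):
+//
+//   Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+//   Status s = txn->Put("key", "value");
+//   if (s.ok()) s = txn->Commit();  // atomically applies all batched writes
+//   if (!s.ok()) txn->Rollback();   // e.g. Status::Busy() on write conflict
+//   delete txn;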
+// +// TODO(agiardullo): Not yet implemented +// -PerfContext statistics +// -Support for using Transactions with DBWithTTL +class Transaction { + public: + // No copying allowed + Transaction(const Transaction&) = delete; + void operator=(const Transaction&) = delete; + + virtual ~Transaction() {} + + // If a transaction has a snapshot set, the transaction will ensure that + // any keys successfully written(or fetched via GetForUpdate()) have not + // been modified outside of this transaction since the time the snapshot was + // set. + // If a snapshot has not been set, the transaction guarantees that keys have + // not been modified since the time each key was first written (or fetched via + // GetForUpdate()). + // + // Using SetSnapshot() will provide stricter isolation guarantees at the + // expense of potentially more transaction failures due to conflicts with + // other writes. + // + // Calling SetSnapshot() has no effect on keys written before this function + // has been called. + // + // SetSnapshot() may be called multiple times if you would like to change + // the snapshot used for different operations in this transaction. + // + // Calling SetSnapshot will not affect the version of Data returned by Get() + // methods. See Transaction::Get() for more details. + virtual void SetSnapshot() = 0; + + // Similar to SetSnapshot(), but will not change the current snapshot + // until Put/Merge/Delete/GetForUpdate/MultigetForUpdate is called. + // By calling this function, the transaction will essentially call + // SetSnapshot() for you right before performing the next write/GetForUpdate. + // + // Calling SetSnapshotOnNextOperation() will not affect what snapshot is + // returned by GetSnapshot() until the next write/GetForUpdate is executed. + // + // When the snapshot is created the notifier's SnapshotCreated method will + // be called so that the caller can get access to the snapshot. + // + // This is an optimization to reduce the likelihood of conflicts that + // could occur in between the time SetSnapshot() is called and the first + // write/GetForUpdate operation. Eg, this prevents the following + // race-condition: + // + // txn1->SetSnapshot(); + // txn2->Put("A", ...); + // txn2->Commit(); + // txn1->GetForUpdate(opts, "A", ...); // FAIL! + // + // WriteCommittedTxn only: a new snapshot will be taken upon next operation, + // and next operation can be a Commit. + // TODO(yanqin) remove the "write-committed only" limitation. + virtual void SetSnapshotOnNextOperation( + std::shared_ptr notifier = nullptr) = 0; + + // Returns the Snapshot created by the last call to SetSnapshot(). + // + // REQUIRED: The returned Snapshot is only valid up until the next time + // SetSnapshot()/SetSnapshotOnNextSavePoint() is called, ClearSnapshot() + // is called, or the Transaction is deleted. + virtual const Snapshot* GetSnapshot() const = 0; + + // Returns the Snapshot created by the last call to SetSnapshot(). + // The returned snapshot can outlive the transaction. + virtual std::shared_ptr GetTimestampedSnapshot() const = 0; + + // Clears the current snapshot (i.e. no snapshot will be 'set') + // + // This removes any snapshot that currently exists or is set to be created + // on the next update operation (SetSnapshotOnNextOperation). + // + // Calling ClearSnapshot() has no effect on keys written before this function + // has been called. 
+  //
+  // If a reference to a snapshot was retrieved via GetSnapshot(), it will no
+  // longer be valid and should be discarded after a call to ClearSnapshot().
+  virtual void ClearSnapshot() = 0;
+
+  // Prepare the current transaction for 2PC
+  virtual Status Prepare() = 0;
+
+  // Write all batched keys to the db atomically.
+  //
+  // Returns OK on success.
+  //
+  // May return any error status that could be returned by DB::Write().
+  //
+  // If this transaction was created by an OptimisticTransactionDB(),
+  // Status::Busy() may be returned if the transaction could not guarantee
+  // that there are no write conflicts. Status::TryAgain() may be returned
+  // if the memtable history size is not large enough
+  // (See max_write_buffer_size_to_maintain).
+  //
+  // If this transaction was created by a TransactionDB(), Status::Expired()
+  // may be returned if this transaction has lived for longer than
+  // TransactionOptions.expiration. Status::TxnNotPrepared() may be returned if
+  // TransactionOptions.skip_prepare is false and Prepare is not called on this
+  // transaction before Commit.
+  virtual Status Commit() = 0;
+
+  // In addition to Commit(), also creates a snapshot of the db after all
+  // writes by this txn are visible to other readers.
+  // Caller is responsible for ensuring that
+  //   snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
+  // in which snapshot1 and snapshot2 are created by this API.
+  //
+  // Currently only supported by WriteCommittedTxn. Calling this method on
+  // other types of transactions will return a non-ok Status resulting from
+  // Commit() or a `NotSupported` error.
+  // This method returns OK if and only if the transaction successfully
+  // commits. It is possible that the transaction commits successfully but
+  // fails to create a timestamped snapshot. Therefore, the caller should
+  // check that the snapshot is created.
+  // notifier will be notified upon next snapshot creation. Nullable.
+  // ret non-null output argument storing a shared_ptr to the newly created
+  // snapshot.
+  Status CommitAndTryCreateSnapshot(
+      std::shared_ptr<TransactionNotifier> notifier =
+          std::shared_ptr<TransactionNotifier>(),
+      TxnTimestamp ts = kMaxTxnTimestamp,
+      std::shared_ptr<const Snapshot>* snapshot = nullptr);
+
+  // Discard all batched writes in this transaction.
+  virtual Status Rollback() = 0;
+
+  // Records the state of the transaction for future calls to
+  // RollbackToSavePoint(). May be called multiple times to set multiple save
+  // points.
+  virtual void SetSavePoint() = 0;
+
+  // Undo all operations in this transaction (Put, Merge, Delete, PutLogData)
+  // since the most recent call to SetSavePoint() and removes the most recent
+  // SetSavePoint().
+  // If there is no previous call to SetSavePoint(), returns Status::NotFound()
+  virtual Status RollbackToSavePoint() = 0;
+
+  // Pop the most recent save point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  virtual Status PopSavePoint() = 0;
+
+  // This function is similar to DB::Get() except it will also read pending
+  // changes in this transaction. Currently, this function will return
+  // Status::MergeInProgress if the most recent write to the queried key in
+  // this batch is a Merge.
+  //
+  // If read_options.snapshot is not set, the current version of the key will
+  // be read. Calling SetSnapshot() does not affect the version of the data
+  // returned.
+ // + // Note that setting read_options.snapshot will affect what is read from the + // DB but will NOT change which keys are read from this transaction (the keys + // in this transaction do not yet belong to any snapshot and will be fetched + // regardless). + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) = 0; + + // An overload of the above method that receives a PinnableSlice + // For backward compatibility a default implementation is provided + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + auto s = Get(options, column_family, key, pinnable_val->GetSelf()); + pinnable_val->PinSelf(); + return s; + } + + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) = 0; + virtual Status Get(const ReadOptions& options, const Slice& key, + PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + auto s = Get(options, key, pinnable_val->GetSelf()); + pinnable_val->PinSelf(); + return s; + } + + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) = 0; + + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) = 0; + + // Batched version of MultiGet - see DBImpl::MultiGet(). Sub-classes are + // expected to override this with an implementation that calls + // DBImpl::MultiGet() + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool /*sorted_input*/ = false) { + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = Get(options, column_family, keys[i], &values[i]); + } + } + + // Read this key and ensure that this transaction will only + // be able to be committed if this key is not written outside this + // transaction after it has first been read (or after the snapshot if a + // snapshot is set in this transaction and do_validate is true). If + // do_validate is false, ReadOptions::snapshot is expected to be nullptr so + // that GetForUpdate returns the latest committed value. The transaction + // behavior is the same regardless of whether the key exists or not. + // + // Note: Currently, this function will return Status::MergeInProgress + // if the most recent write to the queried key in this batch is a Merge. + // + // The values returned by this function are similar to Transaction::Get(). + // If value==nullptr, then this function will not read any data, but will + // still ensure that this key cannot be written to by outside of this + // transaction. + // + // If this transaction was created by an OptimisticTransaction, GetForUpdate() + // could cause commit() to fail. Otherwise, it could return any error + // that could be returned by DB::Get(). + // + // If this transaction was created by a TransactionDB, it can return + // Status::OK() on success, + // Status::Busy() if there is a write conflict, + // Status::TimedOut() if a lock could not be acquired, + // Status::TryAgain() if the memtable history size is not large enough + // (See max_write_buffer_size_to_maintain) + // Status::MergeInProgress() if merge operations cannot be resolved. + // or other errors if this key could not be read. 
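+  //
+  // A minimal sketch (illustrative; `txn` is an assumed Transaction*):
+  //
+  //   std::string value;
+  //   Status s = txn->GetForUpdate(ReadOptions(), key, &value);
+  //   // On success the key is locked (TransactionDB) or registered for
+  //   // conflict checking (OptimisticTransactionDB) until the transaction
+  //   // commits, rolls back, or UndoGetForUpdate() is called.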
+  virtual Status GetForUpdate(const ReadOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key, std::string* value,
+                              bool exclusive = true,
+                              const bool do_validate = true) = 0;
+
+  // An overload of the above method that receives a PinnableSlice
+  // For backward compatibility a default implementation is provided
+  virtual Status GetForUpdate(const ReadOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key, PinnableSlice* pinnable_val,
+                              bool exclusive = true,
+                              const bool do_validate = true) {
+    if (pinnable_val == nullptr) {
+      std::string* null_str = nullptr;
+      return GetForUpdate(options, column_family, key, null_str, exclusive,
+                          do_validate);
+    } else {
+      auto s = GetForUpdate(options, column_family, key,
+                            pinnable_val->GetSelf(), exclusive, do_validate);
+      pinnable_val->PinSelf();
+      return s;
+    }
+  }
+
+  // Get a range lock on [start_endpoint; end_endpoint].
+  virtual Status GetRangeLock(ColumnFamilyHandle*, const Endpoint&,
+                              const Endpoint&) {
+    return Status::NotSupported();
+  }
+
+  virtual Status GetForUpdate(const ReadOptions& options, const Slice& key,
+                              std::string* value, bool exclusive = true,
+                              const bool do_validate = true) = 0;
+
+  virtual std::vector<Status> MultiGetForUpdate(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+
+  virtual std::vector<Status> MultiGetForUpdate(
+      const ReadOptions& options, const std::vector<Slice>& keys,
+      std::vector<std::string>* values) = 0;
+
+  // Returns an iterator that will iterate on all keys in the default
+  // column family including both keys in the DB and uncommitted keys in this
+  // transaction.
+  //
+  // Setting read_options.snapshot will affect what is read from the
+  // DB but will NOT change which keys are read from this transaction (the keys
+  // in this transaction do not yet belong to any snapshot and will be fetched
+  // regardless).
+  //
+  // Caller is responsible for deleting the returned Iterator.
+  //
+  // The returned iterator is only valid until Commit(), Rollback(), or
+  // RollbackToSavePoint() is called.
+  virtual Iterator* GetIterator(const ReadOptions& read_options) = 0;
+
+  virtual Iterator* GetIterator(const ReadOptions& read_options,
+                                ColumnFamilyHandle* column_family) = 0;
+
+  // Put, Merge, Delete, and SingleDelete behave similarly to the corresponding
+  // functions in WriteBatch, but will also do conflict checking on the
+  // keys being written.
+  //
+  // assume_tracked=true expects the key to be already tracked. More
+  // specifically, it means the key was previously tracked in the same
+  // savepoint, with the same exclusive flag, and at a lower sequence number.
+  // If valid then it skips ValidateSnapshot. Returns error otherwise.
+  //
+  // If this Transaction was created on an OptimisticTransactionDB, these
+  // functions should always return Status::OK().
+  //
+  // If this Transaction was created on a TransactionDB, the status returned
+  // can be:
+  //   Status::OK() on success,
+  //   Status::Busy() if there is a write conflict,
+  //   Status::TimedOut() if a lock could not be acquired,
+  //   Status::TryAgain() if the memtable history size is not large enough
+  //     (See max_write_buffer_size_to_maintain)
+  //   or other errors on unexpected failures.
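+  //
+  // A minimal sketch (illustrative) of handling a write conflict:
+  //
+  //   Status s = txn->Put("key", "new_value");
+  //   if (s.IsBusy()) {
+  //     // Another transaction holds a conflicting lock: give up this
+  //     // transaction's work and retry from the top.
+  //     txn->Rollback();
+  //   }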
+ virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) = 0; + virtual Status Put(const Slice& key, const Slice& value) = 0; + virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value, + const bool assume_tracked = false) = 0; + virtual Status Put(const SliceParts& key, const SliceParts& value) = 0; + + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, + const bool assume_tracked = false) = 0; + virtual Status Merge(const Slice& key, const Slice& value) = 0; + + virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) = 0; + virtual Status Delete(const Slice& key) = 0; + virtual Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked = false) = 0; + virtual Status Delete(const SliceParts& key) = 0; + + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked = false) = 0; + virtual Status SingleDelete(const Slice& key) = 0; + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked = false) = 0; + virtual Status SingleDelete(const SliceParts& key) = 0; + + // PutUntracked() will write a Put to the batch of operations to be committed + // in this transaction. This write will only happen if this transaction + // gets committed successfully. But unlike Transaction::Put(), + // no conflict checking will be done for this key. + // + // If this Transaction was created on a PessimisticTransactionDB, this + // function will still acquire locks necessary to make sure this write doesn't + // cause conflicts in other transactions and may return Status::Busy(). + virtual Status PutUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) = 0; + virtual Status PutUntracked(const Slice& key, const Slice& value) = 0; + virtual Status PutUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) = 0; + virtual Status PutUntracked(const SliceParts& key, + const SliceParts& value) = 0; + + virtual Status MergeUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) = 0; + virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0; + + virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + virtual Status DeleteUntracked(const Slice& key) = 0; + virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) = 0; + virtual Status DeleteUntracked(const SliceParts& key) = 0; + virtual Status SingleDeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + virtual Status SingleDeleteUntracked(const Slice& key) = 0; + + // Similar to WriteBatch::PutLogData + virtual void PutLogData(const Slice& blob) = 0; + + // By default, all Put/Merge/Delete operations will be indexed in the + // transaction so that Get/GetForUpdate/GetIterator can search for these + // keys. + // + // If the caller does not want to fetch the keys about to be written, + // they may want to avoid indexing as a performance optimization. + // Calling DisableIndexing() will turn off indexing for all future + // Put/Merge/Delete operations until EnableIndexing() is called. 
+ // + // If a key is Put/Merge/Deleted after DisableIndexing is called and then + // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is + // undefined. + virtual void DisableIndexing() = 0; + virtual void EnableIndexing() = 0; + + // Returns the number of distinct Keys being tracked by this transaction. + // If this transaction was created by a TransactionDB, this is the number of + // keys that are currently locked by this transaction. + // If this transaction was created by an OptimisticTransactionDB, this is the + // number of keys that need to be checked for conflicts at commit time. + virtual uint64_t GetNumKeys() const = 0; + + // Returns the number of Puts/Deletes/Merges that have been applied to this + // transaction so far. + virtual uint64_t GetNumPuts() const = 0; + virtual uint64_t GetNumDeletes() const = 0; + virtual uint64_t GetNumMerges() const = 0; + + // Returns the elapsed time in milliseconds since this Transaction began. + virtual uint64_t GetElapsedTime() const = 0; + + // Fetch the underlying write batch that contains all pending changes to be + // committed. + // + // Note: You should not write or delete anything from the batch directly and + // should only use the functions in the Transaction class to + // write to this transaction. + virtual WriteBatchWithIndex* GetWriteBatch() = 0; + + // Change the value of TransactionOptions.lock_timeout (in milliseconds) for + // this transaction. + // Has no effect on OptimisticTransactions. + virtual void SetLockTimeout(int64_t timeout) = 0; + + // Return the WriteOptions that will be used during Commit() + virtual WriteOptions* GetWriteOptions() = 0; + + // Reset the WriteOptions that will be used during Commit(). + virtual void SetWriteOptions(const WriteOptions& write_options) = 0; + + // If this key was previously fetched in this transaction using + // GetForUpdate/MultigetForUpdate(), calling UndoGetForUpdate will tell + // the transaction that it no longer needs to do any conflict checking + // for this key. + // + // If a key has been fetched N times via GetForUpdate/MultigetForUpdate(), + // then UndoGetForUpdate will only have an effect if it is also called N + // times. If this key has been written to in this transaction, + // UndoGetForUpdate() will have no effect. + // + // If SetSavePoint() has been called after the GetForUpdate(), + // UndoGetForUpdate() will not have any effect. + // + // If this Transaction was created by an OptimisticTransactionDB, + // calling UndoGetForUpdate can affect whether this key is conflict checked + // at commit time. + // If this Transaction was created by a TransactionDB, + // calling UndoGetForUpdate may release any held locks for this key. + virtual void UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual void UndoGetForUpdate(const Slice& key) = 0; + + virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) = 0; + + // Note: data in the commit-time-write-batch bypasses concurrency control, + // thus should be used with great caution. + // For write-prepared/write-unprepared transactions, + // GetCommitTimeWriteBatch() can be used only if the transaction is started + // with + // `TransactionOptions::use_only_the_last_commit_time_batch_for_recovery` set + // to true. Otherwise, it is possible that two uncommitted versions of the + // same key exist in the database due to the current implementation (see the + // explanation in WritePreparedTxn::CommitInternal). 
+ // During bottommost compaction, RocksDB may + // set the sequence numbers of both to zero once becoming committed, causing + // output SST file to have two identical internal keys. + virtual WriteBatch* GetCommitTimeWriteBatch() = 0; + + virtual void SetLogNumber(uint64_t log) { log_number_ = log; } + + virtual uint64_t GetLogNumber() const { return log_number_; } + + virtual Status SetName(const TransactionName& name) = 0; + + virtual TransactionName GetName() const { return name_; } + + virtual TransactionID GetID() const { return 0; } + + virtual bool IsDeadlockDetect() const { return false; } + + virtual std::vector GetWaitingTxns( + uint32_t* /*column_family_id*/, std::string* /*key*/) const { + assert(false); + return std::vector(); + } + + enum TransactionState { + STARTED = 0, + AWAITING_PREPARE = 1, + PREPARED = 2, + AWAITING_COMMIT = 3, + COMMITTED = 4, + COMMITED = COMMITTED, // old misspelled name + AWAITING_ROLLBACK = 5, + ROLLEDBACK = 6, + LOCKS_STOLEN = 7, + }; + + TransactionState GetState() const { return txn_state_; } + void SetState(TransactionState state) { txn_state_ = state; } + + // NOTE: Experimental feature + // The globally unique id with which the transaction is identified. This id + // might or might not be set depending on the implementation. Similarly the + // implementation decides the point in lifetime of a transaction at which it + // assigns the id. Although currently it is the case, the id is not guaranteed + // to remain the same across restarts. + uint64_t GetId() { return id_; } + + virtual Status SetReadTimestampForValidation(TxnTimestamp /*ts*/) { + return Status::NotSupported("timestamp not supported"); + } + + virtual Status SetCommitTimestamp(TxnTimestamp /*ts*/) { + return Status::NotSupported("timestamp not supported"); + } + + virtual TxnTimestamp GetCommitTimestamp() const { return kMaxTxnTimestamp; } + + protected: + explicit Transaction(const TransactionDB* /*db*/) {} + Transaction() : log_number_(0), txn_state_(STARTED) {} + + // the log in which the prepared section for this txn resides + // (for two phase commit) + uint64_t log_number_; + TransactionName name_; + + // Execution status of the transaction. + std::atomic txn_state_; + + uint64_t id_ = 0; + virtual void SetId(uint64_t id) { + assert(id_ == 0); + id_ = id; + } + + virtual uint64_t GetLastLogNumber() const { return log_number_; } + + private: + friend class PessimisticTransactionDB; + friend class WriteUnpreparedTxnDB; + friend class TransactionTest_TwoPhaseLogRollingTest_Test; + friend class TransactionTest_TwoPhaseLogRollingTest2_Test; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/transaction_db.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/transaction_db.h new file mode 100644 index 0000000000..3c4b63068e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/transaction_db.h @@ -0,0 +1,506 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/stackable_db.h" +#include "rocksdb/utilities/transaction.h" + +// Database with Transaction support. 
+// +// See transaction.h and examples/transaction_example.cc + +namespace ROCKSDB_NAMESPACE { + +class TransactionDBMutexFactory; + +enum TxnDBWritePolicy { + WRITE_COMMITTED = 0, // write only the committed data + WRITE_PREPARED, // write data after the prepare phase of 2pc + WRITE_UNPREPARED // write data before the prepare phase of 2pc +}; + +constexpr uint32_t kInitialMaxDeadlocks = 5; + +class LockManager; +struct RangeLockInfo; + +// A lock manager handle +// The workflow is as follows: +// * Use a factory method (like NewRangeLockManager()) to create a lock +// manager and get its handle. +// * A Handle for a particular kind of lock manager will have extra +// methods and parameters to control the lock manager +// * Pass the handle to RocksDB in TransactionDBOptions::lock_mgr_handle. It +// will be used to perform locking. +class LockManagerHandle { + public: + // PessimisticTransactionDB will call this to get the Lock Manager it's going + // to use. + virtual LockManager* getLockManager() = 0; + + virtual ~LockManagerHandle() {} +}; + +// Same as class Endpoint, but use std::string to manage the buffer allocation +struct EndpointWithString { + std::string slice; + bool inf_suffix; +}; + +struct RangeDeadlockInfo { + TransactionID m_txn_id; + uint32_t m_cf_id; + bool m_exclusive; + + EndpointWithString m_start; + EndpointWithString m_end; +}; + +struct RangeDeadlockPath { + std::vector path; + bool limit_exceeded; + int64_t deadlock_time; + + explicit RangeDeadlockPath(std::vector path_entry, + const int64_t& dl_time) + : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} + + // empty path, limit exceeded constructor and default constructor + explicit RangeDeadlockPath(const int64_t& dl_time = 0, bool limit = false) + : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} + + bool empty() { return path.empty() && !limit_exceeded; } +}; + +// A handle to control RangeLockManager (Range-based lock manager) from outside +// RocksDB +class RangeLockManagerHandle : public LockManagerHandle { + public: + // Set total amount of lock memory to use. + // + // @return 0 Ok + // @return EDOM Failed to set because currently using more memory than + // specified + virtual int SetMaxLockMemory(size_t max_lock_memory) = 0; + virtual size_t GetMaxLockMemory() = 0; + + using RangeLockStatus = + std::unordered_multimap; + + // Lock Escalation barrier check function. + // It is called for a couple of endpoints A and B, such that A < B. + // If escalation_barrier_check_func(A, B)==true, then there's a lock + // escalation barrier between A and B, and lock escalation is not allowed + // to bridge the gap between A and B. + // + // The function may be called from any thread that acquires or releases + // locks. It should not throw exceptions. There is currently no way to return + // an error. + using EscalationBarrierFunc = + std::function; + + // Set the user-provided barrier check function + virtual void SetEscalationBarrierFunc(EscalationBarrierFunc func) = 0; + + virtual RangeLockStatus GetRangeLockStatusData() = 0; + + class Counters { + public: + // Number of times lock escalation was triggered (for all column families) + uint64_t escalation_count; + + // Number of times lock acquisition had to wait for a conflicting lock + // to be released. This counts both successful waits (where the desired + // lock was acquired) and waits that timed out or got other error. 
+ uint64_t lock_wait_count; + + // How much memory is currently used for locks (total for all column + // families) + uint64_t current_lock_memory; + }; + + // Get the current counter values + virtual Counters GetStatus() = 0; + + // Functions for range-based Deadlock reporting. + virtual std::vector GetRangeDeadlockInfoBuffer() = 0; + virtual void SetRangeDeadlockInfoBufferSize(uint32_t target_size) = 0; + + virtual ~RangeLockManagerHandle() {} +}; + +// A factory function to create a Range Lock Manager. The created object should +// be: +// 1. Passed in TransactionDBOptions::lock_mgr_handle to open the database in +// range-locking mode +// 2. Used to control the lock manager when the DB is already open. +RangeLockManagerHandle* NewRangeLockManager( + std::shared_ptr mutex_factory); + +struct TransactionDBOptions { + // Specifies the maximum number of keys that can be locked at the same time + // per column family. + // If the number of locked keys is greater than max_num_locks, transaction + // writes (or GetForUpdate) will return an error. + // If this value is not positive, no limit will be enforced. + int64_t max_num_locks = -1; + + // Stores the number of latest deadlocks to track + uint32_t max_num_deadlocks = kInitialMaxDeadlocks; + + // Increasing this value will increase the concurrency by dividing the lock + // table (per column family) into more sub-tables, each with their own + // separate mutex. + size_t num_stripes = 16; + + // If positive, specifies the default wait timeout in milliseconds when + // a transaction attempts to lock a key if not specified by + // TransactionOptions::lock_timeout. + // + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, there is no timeout. Not using a timeout is not recommended + // as it can lead to deadlocks. Currently, there is no deadlock-detection to + // recover from a deadlock. + int64_t transaction_lock_timeout = 1000; // 1 second + + // If positive, specifies the wait timeout in milliseconds when writing a key + // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write() + // directly). + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, there is no timeout and will block indefinitely when acquiring + // a lock. + // + // Not using a timeout can lead to deadlocks. Currently, there + // is no deadlock-detection to recover from a deadlock. While DB writes + // cannot deadlock with other DB writes, they can deadlock with a transaction. + // A negative timeout should only be used if all transactions have a small + // expiration set. + int64_t default_lock_timeout = 1000; // 1 second + + // If set, the TransactionDB will use this implementation of a mutex and + // condition variable for all transaction locking instead of the default + // mutex/condvar implementation. + std::shared_ptr custom_mutex_factory; + + // The policy for when to write the data into the DB. The default policy is to + // write only the committed data (WRITE_COMMITTED). The data could be written + // before the commit phase. The DB then needs to provide the mechanisms to + // tell apart committed from uncommitted data. + TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED; + + // TODO(myabandeh): remove this option + // Note: this is a temporary option as a hot fix in rollback of writeprepared + // txns in myrocks. MyRocks uses merge operands for autoinc column id without + // however obtaining locks. 
This breaks the assumption behind the rollback
+  // logic in myrocks. This hack of simply not rolling back merge operands
+  // works for the special way that myrocks uses these operands.
+  bool rollback_merge_operands = false;
+
+  // nullptr means use default lock manager.
+  // Other value means the user provides a custom lock manager.
+  std::shared_ptr<LockManagerHandle> lock_mgr_handle;
+
+  // If true, the TransactionDB implementation might skip concurrency control
+  // unless it is overridden by TransactionOptions or
+  // TransactionDBWriteOptimizations. This can be used in conjunction with
+  // DBOptions::unordered_write when the TransactionDB is used solely for write
+  // ordering rather than concurrency control.
+  bool skip_concurrency_control = false;
+
+  // This option is only valid for write unprepared. If a write batch exceeds
+  // this threshold, then the transaction will implicitly flush the currently
+  // pending writes into the database. A value of 0 or less means no limit.
+  int64_t default_write_batch_flush_threshold = 0;
+
+  // This option is valid only for write-prepared/write-unprepared. A
+  // transaction will rely on this callback to determine if a key should be
+  // rolled back with Delete or SingleDelete when necessary. If the callback
+  // returns true, then SingleDelete should be used. If the callback is not
+  // callable or the callback returns false, then a Delete is used.
+  // The application should ensure thread-safety of this callback.
+  // The callback should not throw because RocksDB is not exception-safe.
+  // The callback may be removed if we allow mixing Delete and SingleDelete in
+  // the future.
+  std::function<bool(TransactionDB*, ColumnFamilyHandle*, const Slice&)>
+      rollback_deletion_type_callback;
+
+ private:
+  // 128 entries
+  // Should the default value change, please also update wp_snapshot_cache_bits
+  // in db_stress_gflags.cc
+  size_t wp_snapshot_cache_bits = static_cast<size_t>(7);
+  // 8m entry, 64MB size
+  // Should the default value change, please also update wp_commit_cache_bits
+  // in db_stress_gflags.cc
+  size_t wp_commit_cache_bits = static_cast<size_t>(23);
+
+  // For testing, whether transaction name should be auto-generated or not.
+  // This is useful for write unprepared which requires named transactions.
+  bool autogenerate_name = false;
+
+  friend class WritePreparedTxnDB;
+  friend class WriteUnpreparedTxn;
+  friend class WritePreparedTransactionTestBase;
+  friend class TransactionTestBase;
+  friend class MySQLStyleTransactionTest;
+  friend class StressTest;
+};
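+
+// A minimal sketch (illustrative; field values are arbitrary) of tuning lock
+// behavior before opening the database:
+//
+//   TransactionDBOptions txn_db_options;
+//   txn_db_options.transaction_lock_timeout = 2000;  // wait up to 2 seconds
+//   txn_db_options.max_num_locks = 1 << 20;          // cap locked keys per CF
+//   TransactionDB* txn_db = nullptr;
+//   Status s =
+//       TransactionDB::Open(options, txn_db_options, "/tmp/db", &txn_db);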
Ie, cmp->Compare(a,b) should only + // return 0 if + // a.compare(b) returns 0. + + // If positive, specifies the wait timeout in milliseconds when + // a transaction attempts to lock a key. + // + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, TransactionDBOptions::transaction_lock_timeout will be used. + int64_t lock_timeout = -1; + + // Expiration duration in milliseconds. If non-negative, transactions that + // last longer than this many milliseconds will fail to commit. If not set, + // a forgotten transaction that is never committed, rolled back, or deleted + // will never relinquish any locks it holds. This could prevent keys from + // being written by other writers. + int64_t expiration = -1; + + // The number of traversals to make during deadlock detection. + int64_t deadlock_detect_depth = 50; + + // The maximum number of bytes used for the write batch. 0 means no limit. + size_t max_write_batch_size = 0; + + // Skip Concurrency Control. This could be as an optimization if the + // application knows that the transaction would not have any conflict with + // concurrent transactions. It could also be used during recovery if (i) + // application guarantees no conflict between prepared transactions in the WAL + // (ii) application guarantees that recovered transactions will be rolled + // back/commit before new transactions start. + // Default: false + bool skip_concurrency_control = false; + + // In pessimistic transaction, if this is true, then you can skip Prepare + // before Commit, otherwise, you must Prepare before Commit. + bool skip_prepare = true; + + // See TransactionDBOptions::default_write_batch_flush_threshold for + // description. If a negative value is specified, then the default value from + // TransactionDBOptions is used. + int64_t write_batch_flush_threshold = -1; +}; + +// The per-write optimizations that do not involve transactions. TransactionDB +// implementation might or might not make use of the specified optimizations. +struct TransactionDBWriteOptimizations { + // If it is true it means that the application guarantees that the + // key-set in the write batch do not conflict with any concurrent transaction + // and hence the concurrency control mechanism could be skipped for this + // write. + bool skip_concurrency_control = false; + // If true, the application guarantees that there is no duplicate in the write batch and any employed mechanism to handle + // duplicate keys could be skipped. 
+
+// The per-write optimizations that do not involve transactions. A
+// TransactionDB implementation might or might not make use of the specified
+// optimizations.
+struct TransactionDBWriteOptimizations {
+  // If true, the application guarantees that the key-set in the write batch
+  // does not conflict with any concurrent transaction, and hence the
+  // concurrency control mechanism can be skipped for this write.
+  bool skip_concurrency_control = false;
+
+  // If true, the application guarantees that there are no duplicate keys in
+  // the write batch, and any mechanism employed to handle duplicate keys can
+  // be skipped.
+  bool skip_duplicate_key_check = false;
+};
+
+struct KeyLockInfo {
+  std::string key;
+  std::vector<TransactionID> ids;
+  bool exclusive;
+};
+
+struct RangeLockInfo {
+  EndpointWithString start;
+  EndpointWithString end;
+  std::vector<TransactionID> ids;
+  bool exclusive;
+};
+
+struct DeadlockInfo {
+  TransactionID m_txn_id;
+  uint32_t m_cf_id;
+  bool m_exclusive;
+  std::string m_waiting_key;
+};
+
+struct DeadlockPath {
+  std::vector<DeadlockInfo> path;
+  bool limit_exceeded;
+  int64_t deadlock_time;
+
+  explicit DeadlockPath(std::vector<DeadlockInfo> path_entry,
+                        const int64_t& dl_time)
+      : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
+
+  // empty path, limit exceeded constructor and default constructor
+  explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false)
+      : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
+
+  bool empty() { return path.empty() && !limit_exceeded; }
+};
+
+class TransactionDB : public StackableDB {
+ public:
+  // Optimized version of ::Write that receives more optimization requests,
+  // such as skip_concurrency_control.
+  using StackableDB::Write;
+  virtual Status Write(const WriteOptions& opts,
+                       const TransactionDBWriteOptimizations&,
+                       WriteBatch* updates) {
+    // The default implementation ignores TransactionDBWriteOptimizations and
+    // falls back to the un-optimized version of ::Write
+    return Write(opts, updates);
+  }
+  // Transactional `DeleteRange()` is not yet supported.
+  // However, users who know their deleted range does not conflict with
+  // anything can still use it via the `Write()` API. In all cases, the
+  // `Write()` overload specifying `TransactionDBWriteOptimizations` must be
+  // used and `skip_concurrency_control` must be set. When using either
+  // WRITE_PREPARED or WRITE_UNPREPARED, `skip_duplicate_key_check` must
+  // additionally be set.
+  using StackableDB::DeleteRange;
+  virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*,
+                             const Slice&, const Slice&) override {
+    return Status::NotSupported();
+  }
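+
+  // A minimal sketch (illustrative) of issuing a DeleteRange through the
+  // Write() overload, as described above:
+  //
+  //   WriteBatch batch;
+  //   batch.DeleteRange(cf_handle, begin_key, end_key);
+  //   TransactionDBWriteOptimizations write_opts;
+  //   write_opts.skip_concurrency_control = true;
+  //   write_opts.skip_duplicate_key_check = true;  // WRITE_PREPARED/UNPREPARED
+  //   Status s = txn_db->Write(WriteOptions(), write_opts, &batch);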
+  // Open a TransactionDB similar to DB::Open().
+  // Internally calls PrepareWrap() and WrapDB().
+  // If the return status is not ok, then dbptr is set to nullptr.
+  static Status Open(const Options& options,
+                     const TransactionDBOptions& txn_db_options,
+                     const std::string& dbname, TransactionDB** dbptr);
+
+  static Status Open(const DBOptions& db_options,
+                     const TransactionDBOptions& txn_db_options,
+                     const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     TransactionDB** dbptr);
+  // Note: PrepareWrap() may change parameters, make copies before the
+  // invocation if needed.
+  static void PrepareWrap(DBOptions* db_options,
+                          std::vector<ColumnFamilyDescriptor>* column_families,
+                          std::vector<size_t>* compaction_enabled_cf_indices);
+  // If the return status is not ok, then dbptr will be set to nullptr. The
+  // input db parameter might or might not be deleted as a result of the
+  // failure. If it is properly deleted it will be set to nullptr. If the
+  // return status is ok, the ownership of db is transferred to dbptr.
+  static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options,
+                       const std::vector<size_t>& compaction_enabled_cf_indices,
+                       const std::vector<ColumnFamilyHandle*>& handles,
+                       TransactionDB** dbptr);
+  // If the return status is not ok, then dbptr will be set to nullptr. The
+  // input db parameter might or might not be deleted as a result of the
+  // failure. If it is properly deleted it will be set to nullptr. If the
+  // return status is ok, the ownership of db is transferred to dbptr.
+  static Status WrapStackableDB(
+      StackableDB* db, const TransactionDBOptions& txn_db_options,
+      const std::vector<size_t>& compaction_enabled_cf_indices,
+      const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr);
+  // Since the destructor in StackableDB is virtual, this destructor is virtual
+  // too. The root db will be deleted by the base's destructor.
+  ~TransactionDB() override {}
+
+  // Starts a new Transaction.
+  //
+  // Caller is responsible for deleting the returned transaction when no
+  // longer needed.
+  //
+  // If old_txn is not null, BeginTransaction will reuse this Transaction
+  // handle instead of allocating a new one. This is an optimization to avoid
+  // extra allocations when repeatedly creating transactions.
+  virtual Transaction* BeginTransaction(
+      const WriteOptions& write_options,
+      const TransactionOptions& txn_options = TransactionOptions(),
+      Transaction* old_txn = nullptr) = 0;
+
+  virtual Transaction* GetTransactionByName(const TransactionName& name) = 0;
+  virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0;
+
+  // Returns the set of all locks held.
+  //
+  // The mapping is column family id -> KeyLockInfo
+  virtual std::unordered_multimap<uint32_t, KeyLockInfo>
+  GetLockStatusData() = 0;
+
+  virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
+  virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0;
+
+  // Create a snapshot and assign ts to it. Return the snapshot to the caller.
+  // The snapshot-timestamp mapping is also tracked by the database.
+  // Caller must ensure there are no active writes when this API is called.
+  virtual std::pair<Status, std::shared_ptr<const Snapshot>>
+  CreateTimestampedSnapshot(TxnTimestamp ts) = 0;
+
+  // Return the latest timestamped snapshot if present.
+  std::shared_ptr<const Snapshot> GetLatestTimestampedSnapshot() const {
+    return GetTimestampedSnapshot(kMaxTxnTimestamp);
+  }
+  // Return the snapshot corresponding to the given timestamp. If ts is
+  // kMaxTxnTimestamp, then we return the latest timestamped snapshot if
+  // present. Otherwise, we return the snapshot whose timestamp is equal to
+  // `ts`. If no such snapshot exists, then we return null.
+  virtual std::shared_ptr<const Snapshot> GetTimestampedSnapshot(
+      TxnTimestamp ts) const = 0;
+  // Release timestamped snapshots whose timestamps are less than or equal to
+  // ts.
+  virtual void ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts) = 0;
+
+  // Get all timestamped snapshots, which will be stored in
+  // timestamped_snapshots.
+  Status GetAllTimestampedSnapshots(
+      std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots)
+      const {
+    return GetTimestampedSnapshots(/*ts_lb=*/0, /*ts_ub=*/kMaxTxnTimestamp,
+                                   timestamped_snapshots);
+  }
+
+  // Get all timestamped snapshots whose timestamps fall within [ts_lb, ts_ub).
+  // timestamped_snapshots will be cleared and contain the returned snapshots.
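+  //
+  // A minimal sketch (illustrative):
+  //
+  //   std::vector<std::shared_ptr<const Snapshot>> snapshots;
+  //   Status s = txn_db->GetTimestampedSnapshots(/*ts_lb=*/100,
+  //                                              /*ts_ub=*/200, snapshots);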
+  virtual Status GetTimestampedSnapshots(
+      TxnTimestamp ts_lb, TxnTimestamp ts_ub,
+      std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots)
+      const = 0;
+
+ protected:
+  // To create a TransactionDB, call Open().
+  // The ownership of db is transferred to the base StackableDB.
+  explicit TransactionDB(DB* db) : StackableDB(db) {}
+  // No copying allowed
+  TransactionDB(const TransactionDB&) = delete;
+  void operator=(const TransactionDB&) = delete;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/transaction_db_mutex.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/transaction_db_mutex.h
new file mode 100644
index 0000000000..4ef566dad9
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/transaction_db_mutex.h
@@ -0,0 +1,89 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The TransactionDBMutex and TransactionDBCondVar APIs allow applications to
+// implement custom mutexes and condition variables to be used by a
+// TransactionDB when locking keys.
+//
+// To open a TransactionDB with a custom TransactionDBMutexFactory, set
+// TransactionDBOptions.custom_mutex_factory.
+class TransactionDBMutex {
+ public:
+  virtual ~TransactionDBMutex() {}
+
+  // Attempt to acquire lock. Return OK on success, or other Status on failure.
+  // If returned status is OK, TransactionDB will eventually call UnLock().
+  virtual Status Lock() = 0;
+
+  // Attempt to acquire lock. If timeout is non-negative, the operation may
+  // fail after this many microseconds.
+  // Returns OK on success,
+  //         TimedOut if timed out,
+  //         or other Status on failure.
+  // If returned status is OK, TransactionDB will eventually call UnLock().
+  virtual Status TryLockFor(int64_t timeout_time) = 0;
+
+  // Unlock Mutex that was successfully locked by Lock() or TryLockUntil()
+  virtual void UnLock() = 0;
+};
+
+class TransactionDBCondVar {
+ public:
+  virtual ~TransactionDBCondVar() {}
+
+  // Block current thread until condition variable is notified by a call to
+  // Notify() or NotifyAll(). Wait() will be called with mutex locked.
+  // Returns OK if notified.
+  // Returns non-OK if TransactionDB should stop waiting and fail the
+  // operation.
+  // May return OK spuriously even if not notified.
+  virtual Status Wait(std::shared_ptr<TransactionDBMutex> mutex) = 0;
+
+  // Block current thread until condition variable is notified by a call to
+  // Notify() or NotifyAll(), or if the timeout is reached.
+  // Wait() will be called with mutex locked.
+  //
+  // If timeout is non-negative, the operation should fail after this many
+  // microseconds.
+  // If implementing a custom version of this class, the implementation may
+  // choose to ignore the timeout.
+  //
+  // Returns OK if notified.
+  // Returns TimedOut if timeout is reached.
+  // Returns other status if TransactionDB should otherwise stop waiting and
+  // fail the operation.
+  // May return OK spuriously even if not notified.
+  virtual Status WaitFor(std::shared_ptr<TransactionDBMutex> mutex,
+                         int64_t timeout_time) = 0;
+
+  // If any threads are waiting on *this, unblock at least one of the
+  // waiting threads.
+  virtual void Notify() = 0;
+
+  // Unblocks all threads waiting on *this.
+  virtual void NotifyAll() = 0;
+};
+
+// Factory class that can allocate mutexes and condition variables.
+class TransactionDBMutexFactory {
+ public:
+  // Create a TransactionDBMutex object.
+  virtual std::shared_ptr<TransactionDBMutex> AllocateMutex() = 0;
+
+  // Create a TransactionDBCondVar object.
+  virtual std::shared_ptr<TransactionDBCondVar> AllocateCondVar() = 0;
+
+  virtual ~TransactionDBMutexFactory() {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/write_batch_with_index.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/write_batch_with_index.h
new file mode 100644
index 0000000000..d5867567ba
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/utilities/write_batch_with_index.h
@@ -0,0 +1,307 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/write_batch_base.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyHandle;
+class Comparator;
+class DB;
+class ReadCallback;
+struct ReadOptions;
+struct DBOptions;
+
+enum WriteType {
+  kPutRecord,
+  kMergeRecord,
+  kDeleteRecord,
+  kSingleDeleteRecord,
+  kDeleteRangeRecord,
+  kLogDataRecord,
+  kXIDRecord,
+  kUnknownRecord,
+};
+
+// An entry for Put, Merge, Delete, or SingleDelete for write batches.
+// Used in WBWIIterator.
+struct WriteEntry {
+  WriteType type = kUnknownRecord;
+  Slice key;
+  Slice value;
+};
+
+// Iterator of one column family out of a WriteBatchWithIndex.
+class WBWIIterator {
+ public:
+  virtual ~WBWIIterator() {}
+
+  virtual bool Valid() const = 0;
+
+  virtual void SeekToFirst() = 0;
+
+  virtual void SeekToLast() = 0;
+
+  virtual void Seek(const Slice& key) = 0;
+
+  virtual void SeekForPrev(const Slice& key) = 0;
+
+  virtual void Next() = 0;
+
+  virtual void Prev() = 0;
+
+  // the returned WriteEntry is only valid until the next mutation of the
+  // WriteBatchWithIndex
+  virtual WriteEntry Entry() const = 0;
+
+  virtual Status status() const = 0;
+};
+
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+// In Put(), Merge(), Delete(), or SingleDelete(), the same function of the
+// wrapped WriteBatch will be called. At the same time, indexes will be built.
+// By calling GetWriteBatch(), a user will get the WriteBatch for the data
+// they inserted, which can be used for DB::Write().
+// A user can call NewIterator() to create an iterator.
+class WriteBatchWithIndex : public WriteBatchBase {
+ public:
+  // backup_index_comparator: the backup comparator used to compare keys
+  // within the same column family. If the column family is not given in the
+  // interface, or we can't find a column family for the column family handle
+  // passed in, backup_index_comparator will be used for the column family.
+ // reserved_bytes: reserved bytes in underlying WriteBatch + // max_bytes: maximum size of underlying WriteBatch in bytes + // overwrite_key: if true, overwrite the key in the index when inserting + // the same key as previously, so iterator will never + // show two entries with the same key. + explicit WriteBatchWithIndex( + const Comparator* backup_index_comparator = BytewiseComparator(), + size_t reserved_bytes = 0, bool overwrite_key = false, + size_t max_bytes = 0, size_t protection_bytes_per_key = 0); + + ~WriteBatchWithIndex() override; + WriteBatchWithIndex(WriteBatchWithIndex&&); + WriteBatchWithIndex& operator=(WriteBatchWithIndex&&); + + using WriteBatchBase::Put; + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + + Status Put(const Slice& key, const Slice& value) override; + + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts, const Slice& value) override; + + Status PutEntity(ColumnFamilyHandle* column_family, const Slice& /* key */, + const WideColumns& /* columns */) override { + if (!column_family) { + return Status::InvalidArgument( + "Cannot call this method without a column family handle"); + } + + return Status::NotSupported( + "PutEntity not supported by WriteBatchWithIndex"); + } + + using WriteBatchBase::Merge; + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + + Status Merge(const Slice& key, const Slice& value) override; + Status Merge(ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*ts*/, const Slice& /*value*/) override { + return Status::NotSupported( + "Merge does not support user-defined timestamp"); + } + + using WriteBatchBase::Delete; + Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override; + Status Delete(const Slice& key) override; + Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override; + + using WriteBatchBase::SingleDelete; + Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status SingleDelete(const Slice& key) override; + Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override; + + using WriteBatchBase::DeleteRange; + Status DeleteRange(ColumnFamilyHandle* /* column_family */, + const Slice& /* begin_key */, + const Slice& /* end_key */) override { + return Status::NotSupported( + "DeleteRange unsupported in WriteBatchWithIndex"); + } + Status DeleteRange(const Slice& /* begin_key */, + const Slice& /* end_key */) override { + return Status::NotSupported( + "DeleteRange unsupported in WriteBatchWithIndex"); + } + Status DeleteRange(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin_key*/, const Slice& /*end_key*/, + const Slice& /*ts*/) override { + return Status::NotSupported( + "DeleteRange unsupported in WriteBatchWithIndex"); + } + + using WriteBatchBase::PutLogData; + Status PutLogData(const Slice& blob) override; + + using WriteBatchBase::Clear; + void Clear() override; + + using WriteBatchBase::GetWriteBatch; + WriteBatch* GetWriteBatch() override; + + // Create an iterator of a column family. User can call iterator.Seek() to + // search to the next entry of or after a key. Keys will be iterated in the + // order given by index_comparator. For multiple updates on the same key, + // each update will be returned as a separate entry, in the order of update + // time. + // + // The returned iterator should be deleted by the caller. 
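+  //
+  // A minimal usage sketch (hypothetical application code, not part of this
+  // header; `wbwi` is a WriteBatchWithIndex and `cf` a ColumnFamilyHandle*
+  // owned by the caller):
+  //
+  //   std::unique_ptr<WBWIIterator> it(wbwi.NewIterator(cf));
+  //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+  //     WriteEntry entry = it->Entry();  // valid until the next mutation
+  //     // entry.type / entry.key / entry.value describe the buffered update
+  //   }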
+ WBWIIterator* NewIterator(ColumnFamilyHandle* column_family); + // Create an iterator of the default column family. + WBWIIterator* NewIterator(); + + // Will create a new Iterator that will use WBWIIterator as a delta and + // base_iterator as base. + // + // This function is only supported if the WriteBatchWithIndex was + // constructed with overwrite_key=true. + // + // The returned iterator should be deleted by the caller. + // The base_iterator is now 'owned' by the returned iterator. Deleting the + // returned iterator will also delete the base_iterator. + // + // Updating write batch with the current key of the iterator is not safe. + // We strongly recommend users not to do it. It will invalidate the current + // key() and value() of the iterator. This invalidation happens even before + // the write batch update finishes. The state may recover after Next() is + // called. + Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family, + Iterator* base_iterator, + const ReadOptions* opts = nullptr); + // default column family + Iterator* NewIteratorWithBase(Iterator* base_iterator); + + // Similar to DB::Get() but will only read the key from this batch. + // If the batch does not have enough data to resolve Merge operations, + // MergeInProgress status may be returned. + Status GetFromBatch(ColumnFamilyHandle* column_family, + const DBOptions& options, const Slice& key, + std::string* value); + + // Similar to previous function but does not require a column_family. + // Note: An InvalidArgument status will be returned if there are any Merge + // operators for this key. Use previous method instead. + Status GetFromBatch(const DBOptions& options, const Slice& key, + std::string* value) { + return GetFromBatch(nullptr, options, key, value); + } + + // Similar to DB::Get() but will also read writes from this batch. + // + // This function will query both this batch and the DB and then merge + // the results using the DB's merge operator (if the batch contains any + // merge requests). + // + // Setting read_options.snapshot will affect what is read from the DB + // but will NOT change which keys are read from the batch (the keys in + // this batch do not yet belong to any snapshot and will be fetched + // regardless). + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + const Slice& key, std::string* value); + + // An overload of the above method that receives a PinnableSlice + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + const Slice& key, PinnableSlice* value); + + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value); + + // An overload of the above method that receives a PinnableSlice + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value); + + void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + bool sorted_input); + + // Records the state of the batch for future calls to RollbackToSavePoint(). + // May be called multiple times to set multiple save points. + void SetSavePoint() override; + + // Remove all entries in this batch (Put, Merge, Delete, SingleDelete, + // PutLogData) since the most recent call to SetSavePoint() and removes the + // most recent save point. 
+  // If there is no previous call to SetSavePoint(), behaves the same as
+  // Clear().
+  //
+  // Calling RollbackToSavePoint invalidates any open iterators on this batch.
+  //
+  // Returns Status::OK() on success,
+  //         Status::NotFound() if no previous call to SetSavePoint(),
+  //         or other Status on corruption.
+  Status RollbackToSavePoint() override;
+
+  // Pop the most recent save point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  Status PopSavePoint() override;
+
+  void SetMaxBytes(size_t max_bytes) override;
+  size_t GetDataSize() const;
+
+ private:
+  friend class PessimisticTransactionDB;
+  friend class WritePreparedTxn;
+  friend class WriteUnpreparedTxn;
+  friend class WriteBatchWithIndex_SubBatchCnt_Test;
+  friend class WriteBatchWithIndexInternal;
+  // Returns the number of sub-batches inside the write batch. A sub-batch
+  // starts right before inserting a key that is a duplicate of a key in the
+  // last sub-batch.
+  size_t SubBatchCnt();
+
+  Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           PinnableSlice* value, ReadCallback* callback);
+  void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                              ColumnFamilyHandle* column_family,
+                              const size_t num_keys, const Slice* keys,
+                              PinnableSlice* values, Status* statuses,
+                              bool sorted_input, ReadCallback* callback);
+  struct Rep;
+  std::unique_ptr<Rep> rep;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/version.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/version.h
new file mode 100644
index 0000000000..5cfa477f6a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/version.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+// NOTE: in 'main' development branch, this should be the *next*
+// minor or major version number planned for release.
+#define ROCKSDB_MAJOR 8
+#define ROCKSDB_MINOR 3
+#define ROCKSDB_PATCH 2
+
+// Do not use these. We made the mistake of declaring macros starting with
+// double underscore. Now we have to live with our choice. We'll deprecate
+// these at some point.
+#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR
+#define __ROCKSDB_MINOR__ ROCKSDB_MINOR
+#define __ROCKSDB_PATCH__ ROCKSDB_PATCH
+
+namespace ROCKSDB_NAMESPACE {
+// Returns a set of properties indicating how/when/where this version of
+// RocksDB was created.
+const std::unordered_map<std::string, std::string>& GetRocksBuildProperties();
+
+// Returns the current version of RocksDB as a string (e.g. "6.16.0").
+// If with_patch is true, the patch is included (6.16.x).
+// Otherwise, only the major and minor version is included (6.16).
+std::string GetRocksVersionAsString(bool with_patch = true);
+
+// Gets the set of build properties (@see GetRocksBuildProperties) into a
+// string. Properties are returned one-per-line, with the first line being:
+// "<program> from RocksDB <version>".
+// If verbose is true, the full set of properties is
+// printed. If verbose is false, only the version information (@see
+// GetRocksVersionAsString) is printed.
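+// For example (hypothetical output; the exact properties depend on the
+// build), GetRocksBuildInfoAsString("chainweb-node") would start with a line
+// like: "chainweb-node from RocksDB 8.3.2".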
+std::string GetRocksBuildInfoAsString(const std::string& program,
+                                      bool verbose = false);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/wal_filter.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/wal_filter.h
new file mode 100644
index 0000000000..3e66c39e46
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/wal_filter.h
@@ -0,0 +1,111 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteBatch;
+struct ConfigOptions;
+
+// WalFilter allows an application to inspect write-ahead-log (WAL)
+// records or modify their processing on recovery.
+// Please see the details below.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class WalFilter : public Customizable {
+ public:
+  static const char* Type() { return "WalFilter"; }
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& value, WalFilter** result);
+  enum class WalProcessingOption {
+    // Continue processing as usual
+    kContinueProcessing = 0,
+    // Ignore the current record but continue processing of log(s)
+    kIgnoreCurrentRecord = 1,
+    // Stop replay of logs and discard logs
+    // Logs won't be replayed on subsequent recovery
+    kStopReplay = 2,
+    // Corrupted record detected by filter
+    kCorruptedRecord = 3,
+    // Marker for enum count
+    kWalProcessingOptionMax = 4
+  };
+
+  virtual ~WalFilter() {}
+
+  // Provide ColumnFamily->LogNumber map to filter
+  // so that filter can determine whether a log number applies to a given
+  // column family (i.e. that log hasn't been flushed to SST already for the
+  // column family).
+  // We also pass in a name->id map, as only the name is known during
+  // recovery (handles are opened post-recovery), while write batch
+  // callbacks happen in terms of column family id.
+  //
+  // @params cf_lognumber_map column_family_id to lognumber map
+  // @params cf_name_id_map   column_family_name to column_family_id map
+
+  virtual void ColumnFamilyLogNumberMap(
+      const std::map<uint32_t, uint64_t>& /*cf_lognumber_map*/,
+      const std::map<std::string, uint32_t>& /*cf_name_id_map*/) {}
+
+  // LogRecordFound is invoked for each log record encountered in all the logs
+  // during replay on recovery. This method can be used to:
+  //   * inspect the record (using the batch parameter)
+  //   * ignore the current record
+  //     (by returning WalProcessingOption::kIgnoreCurrentRecord)
+  //   * report a corrupted record
+  //     (by returning WalProcessingOption::kCorruptedRecord)
+  //   * stop log replay
+  //     (by returning kStopReplay) - please note that this implies
+  //     discarding the logs from current record onwards.
+  //
+  // @params log_number      log_number of the current log.
+  //                         Filter might use this to determine if the log
+  //                         record is applicable to a certain column family.
+  // @params log_file_name   log file name - only for informational purposes
+  // @params batch           batch encountered in the log during recovery
+  // @params new_batch       new_batch to populate if filter wants to change
+  //                         the batch (for example to filter some records out,
+  //                         or alter some records).
+  //                         Please note that the new batch MUST NOT contain
+  //                         more records than the original, else recovery
+  //                         would fail.
+  // @params batch_changed   Whether batch was changed by the filter.
+  //                         It must be set to true if new_batch was populated,
+  //                         else new_batch has no effect.
+  // @returns                Processing option for the current record.
+  //                         Please see WalProcessingOption enum above for
+  //                         details.
+  virtual WalProcessingOption LogRecordFound(
+      unsigned long long /*log_number*/, const std::string& /*log_file_name*/,
+      const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) {
+    // Default implementation falls back to older function for compatibility
+    return LogRecord(batch, new_batch, batch_changed);
+  }
+
+  // Please see the comments for LogRecordFound above. This function is for
+  // compatibility only and contains a subset of parameters.
+  // New code should use the function above.
+  virtual WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+                                        WriteBatch* /*new_batch*/,
+                                        bool* /*batch_changed*/) const {
+    return WalProcessingOption::kContinueProcessing;
+  }
+
+  // Returns a name that identifies this WAL filter.
+  // The name will be printed to LOG file on start up for diagnosis.
+  virtual const char* Name() const override = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/wide_columns.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/wide_columns.h
new file mode 100644
index 0000000000..5af3f51de8
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/wide_columns.h
@@ -0,0 +1,210 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <ostream>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Class representing a wide column, which is defined as a pair of column name
+// and column value.
+class WideColumn {
+ public:
+  WideColumn() = default;
+
+  // Initializes a WideColumn object by forwarding the name and value
+  // arguments to the corresponding member Slices. This makes it possible to
+  // construct a WideColumn using combinations of const char*, const
+  // std::string&, const Slice& etc., for example:
+  //
+  // constexpr char foo[] = "foo";
+  // const std::string bar("bar");
+  // WideColumn column(foo, bar);
+  template <typename N, typename V>
+  WideColumn(N&& name, V&& value)
+      : name_(std::forward<N>(name)), value_(std::forward<V>(value)) {}
+
+  // Initializes a WideColumn object by forwarding the elements of
+  // name_tuple and value_tuple to the constructors of the corresponding member
+  // Slices. This makes it possible to initialize the Slices using the Slice
+  // constructors that take more than one argument, for example:
+  //
+  // constexpr char foo_name[] = "foo_name";
+  // constexpr char bar_value[] = "bar_value";
+  // WideColumn column(std::piecewise_construct,
+  //                   std::forward_as_tuple(foo_name, 3),
+  //                   std::forward_as_tuple(bar_value, 3));
+  template <typename NTuple, typename VTuple>
+  WideColumn(std::piecewise_construct_t, NTuple&& name_tuple,
+             VTuple&& value_tuple)
+      : name_(std::make_from_tuple<Slice>(std::forward<NTuple>(name_tuple))),
+        value_(std::make_from_tuple<Slice>(std::forward<VTuple>(value_tuple))) {
+  }
+
+  const Slice& name() const { return name_; }
+  const Slice& value() const { return value_; }
+
+  Slice& name() { return name_; }
+  Slice& value() { return value_; }
+
+ private:
+  Slice name_;
+  Slice value_;
+};
+
+// Note: column names and values are compared bytewise.
+inline bool operator==(const WideColumn& lhs, const WideColumn& rhs) {
+  return lhs.name() == rhs.name() && lhs.value() == rhs.value();
+}
+
+inline bool operator!=(const WideColumn& lhs, const WideColumn& rhs) {
+  return !(lhs == rhs);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const WideColumn& column) {
+  const bool hex =
+      (os.flags() & std::ios_base::basefield) == std::ios_base::hex;
+  os << column.name().ToString(hex) << ':' << column.value().ToString(hex);
+
+  return os;
+}
+
+// A collection of wide columns.
+using WideColumns = std::vector<WideColumn>;
+
+// The anonymous default wide column (an empty Slice).
+extern const Slice kDefaultWideColumnName;
+
+// An empty set of wide columns.
+extern const WideColumns kNoWideColumns;
+
+// A self-contained collection of wide columns. Used for the results of
+// wide-column queries.
+class PinnableWideColumns {
+ public:
+  const WideColumns& columns() const { return columns_; }
+  size_t serialized_size() const { return value_.size(); }
+
+  void SetPlainValue(const Slice& value);
+  void SetPlainValue(const Slice& value, Cleanable* cleanable);
+  void SetPlainValue(PinnableSlice&& value);
+  void SetPlainValue(std::string&& value);
+
+  Status SetWideColumnValue(const Slice& value);
+  Status SetWideColumnValue(const Slice& value, Cleanable* cleanable);
+  Status SetWideColumnValue(PinnableSlice&& value);
+  Status SetWideColumnValue(std::string&& value);
+
+  void Reset();
+
+ private:
+  void CopyValue(const Slice& value);
+  void PinOrCopyValue(const Slice& value, Cleanable* cleanable);
+  void MoveValue(PinnableSlice&& value);
+  void MoveValue(std::string&& value);
+
+  void CreateIndexForPlainValue();
+  Status CreateIndexForWideColumns();
+
+  PinnableSlice value_;
+  WideColumns columns_;
+};
+
+inline void PinnableWideColumns::CopyValue(const Slice& value) {
+  value_.PinSelf(value);
+}
+
+inline void PinnableWideColumns::PinOrCopyValue(const Slice& value,
+                                                Cleanable* cleanable) {
+  if (!cleanable) {
+    CopyValue(value);
+    return;
+  }
+
+  value_.PinSlice(value, cleanable);
+}
+
+inline void PinnableWideColumns::MoveValue(PinnableSlice&& value) {
+  value_ = std::move(value);
+}
+
+inline void PinnableWideColumns::MoveValue(std::string&& value) {
+  std::string* const buf = value_.GetSelf();
+  assert(buf);
+
+  *buf = std::move(value);
+  value_.PinSelf();
+}
+
+inline void PinnableWideColumns::CreateIndexForPlainValue() {
+  columns_ = WideColumns{{kDefaultWideColumnName, value_}};
+}
+
+inline void PinnableWideColumns::SetPlainValue(const Slice& value) {
+  CopyValue(value);
+  CreateIndexForPlainValue();
+}
+
+inline void PinnableWideColumns::SetPlainValue(const Slice& value,
+                                               Cleanable* cleanable) {
PinOrCopyValue(value, cleanable); + CreateIndexForPlainValue(); +} + +inline void PinnableWideColumns::SetPlainValue(PinnableSlice&& value) { + MoveValue(std::move(value)); + CreateIndexForPlainValue(); +} + +inline void PinnableWideColumns::SetPlainValue(std::string&& value) { + MoveValue(std::move(value)); + CreateIndexForPlainValue(); +} + +inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value) { + CopyValue(value); + return CreateIndexForWideColumns(); +} + +inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value, + Cleanable* cleanable) { + PinOrCopyValue(value, cleanable); + return CreateIndexForWideColumns(); +} + +inline Status PinnableWideColumns::SetWideColumnValue(PinnableSlice&& value) { + MoveValue(std::move(value)); + return CreateIndexForWideColumns(); +} + +inline Status PinnableWideColumns::SetWideColumnValue(std::string&& value) { + MoveValue(std::move(value)); + return CreateIndexForWideColumns(); +} + +inline void PinnableWideColumns::Reset() { + value_.Reset(); + columns_.clear(); +} + +inline bool operator==(const PinnableWideColumns& lhs, + const PinnableWideColumns& rhs) { + return lhs.columns() == rhs.columns(); +} + +inline bool operator!=(const PinnableWideColumns& lhs, + const PinnableWideColumns& rhs) { + return !(lhs == rhs); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/write_batch.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/write_batch.h new file mode 100644 index 0000000000..61ba5a7391 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/write_batch.h @@ -0,0 +1,494 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch holds a collection of updates to apply atomically to a DB. +// +// The updates are applied in the order in which they are added +// to the WriteBatch. For example, the value of "key" will be "v3" +// after the following batch is written: +// +// batch.Put("key", "v1"); +// batch.Delete("key"); +// batch.Put("key", "v2"); +// batch.Put("key", "v3"); +// +// Multiple threads can invoke const methods on a WriteBatch without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same WriteBatch must use +// external synchronization. 
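+//
+// A minimal end-to-end sketch (hypothetical application code, not part of
+// this header; assumes a DB* named `db` opened elsewhere):
+//
+//   WriteBatch batch;
+//   batch.Put("key", "v1");
+//   batch.Delete("other_key");
+//   Status s = db->Write(WriteOptions(), &batch);  // applied atomically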
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch_base.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class ColumnFamilyHandle;
+struct SavePoints;
+struct SliceParts;
+
+struct SavePoint {
+  size_t size;  // size of rep_
+  int count;    // count of elements in rep_
+  uint32_t content_flags;
+
+  SavePoint() : size(0), count(0), content_flags(0) {}
+
+  SavePoint(size_t _size, int _count, uint32_t _flags)
+      : size(_size), count(_count), content_flags(_flags) {}
+
+  void clear() {
+    size = 0;
+    count = 0;
+    content_flags = 0;
+  }
+
+  bool is_cleared() const { return (size | count | content_flags) == 0; }
+};
+
+class WriteBatch : public WriteBatchBase {
+ public:
+  explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0)
+      : WriteBatch(reserved_bytes, max_bytes, 0, 0) {}
+
+  // `protection_bytes_per_key` is the number of bytes used to store
+  // protection information for each key entry. Currently supported values are
+  // zero (disabled) and eight.
+  explicit WriteBatch(size_t reserved_bytes, size_t max_bytes,
+                      size_t protection_bytes_per_key, size_t default_cf_ts_sz);
+  ~WriteBatch() override;
+
+  using WriteBatchBase::Put;
+  // Store the mapping "key->value" in the database.
+  // The following Put(..., const Slice& key, ...) API can also be used when
+  // user-defined timestamp is enabled as long as `key` points to a contiguous
+  // buffer with timestamp appended after user key. The caller is responsible
+  // for setting up the memory buffer pointed to by `key`.
+  Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& value) override;
+  Status Put(const Slice& key, const Slice& value) override {
+    return Put(nullptr, key, value);
+  }
+  Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& ts, const Slice& value) override;
+
+  // Variant of Put() that gathers output like writev(2). The key and value
+  // that will be written to the database are concatenations of arrays of
+  // slices.
+  // The following Put(..., const SliceParts& key, ...) API can be used when
+  // user-defined timestamp is enabled as long as the timestamp is the last
+  // Slice in `key`, a SliceParts (array of Slices). The caller is responsible
+  // for setting up the `key` SliceParts object.
+  Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+             const SliceParts& value) override;
+  Status Put(const SliceParts& key, const SliceParts& value) override {
+    return Put(nullptr, key, value);
+  }
+
+  // Store the mapping "key->{column1:value1, column2:value2, ...}" in the
+  // column family specified by "column_family".
+  using WriteBatchBase::PutEntity;
+  Status PutEntity(ColumnFamilyHandle* column_family, const Slice& key,
+                   const WideColumns& columns) override;
+
+  using WriteBatchBase::Delete;
+  // If the database contains a mapping for "key", erase it. Else do nothing.
+  // The following Delete(..., const Slice& key) can be used when user-defined
+  // timestamp is enabled as long as `key` points to a contiguous buffer with
+  // timestamp appended after user key. The caller is responsible for setting
+  // up the memory buffer pointed to by `key`.
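+  //
+  // For example (an illustrative sketch, assuming the column family is
+  // configured with 8-byte timestamps): the caller lays out
+  // |user key bytes|8 timestamp bytes| in one contiguous buffer and passes a
+  // Slice spanning the whole buffer as `key`.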
+ Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override; + Status Delete(const Slice& key) override { return Delete(nullptr, key); } + Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override; + + // variant that takes SliceParts + // These two variants of Delete(..., const SliceParts& key) can be used when + // user-defined timestamp is enabled as long as the timestamp is the last + // Slice in `key`, a SliceParts (array of Slices). The caller is responsible + // for setting up the `key` SliceParts object. + Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status Delete(const SliceParts& key) override { return Delete(nullptr, key); } + + using WriteBatchBase::SingleDelete; + // WriteBatch implementation of DB::SingleDelete(). See db.h. + Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status SingleDelete(const Slice& key) override { + return SingleDelete(nullptr, key); + } + Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override; + + // variant that takes SliceParts + Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status SingleDelete(const SliceParts& key) override { + return SingleDelete(nullptr, key); + } + + using WriteBatchBase::DeleteRange; + // WriteBatch implementation of DB::DeleteRange(). See db.h. + Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key, + const Slice& end_key) override; + Status DeleteRange(const Slice& begin_key, const Slice& end_key) override { + return DeleteRange(nullptr, begin_key, end_key); + } + // begin_key and end_key should be user keys without timestamp. + Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key, + const Slice& end_key, const Slice& ts) override; + + // variant that takes SliceParts + Status DeleteRange(ColumnFamilyHandle* column_family, + const SliceParts& begin_key, + const SliceParts& end_key) override; + Status DeleteRange(const SliceParts& begin_key, + const SliceParts& end_key) override { + return DeleteRange(nullptr, begin_key, end_key); + } + + using WriteBatchBase::Merge; + // Merge "value" with the existing value of "key" in the database. + // "key->merge(existing, value)" + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status Merge(const Slice& key, const Slice& value) override { + return Merge(nullptr, key, value); + } + Status Merge(ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*ts*/, const Slice& /*value*/) override; + + // variant that takes SliceParts + Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status Merge(const SliceParts& key, const SliceParts& value) override { + return Merge(nullptr, key, value); + } + + using WriteBatchBase::PutLogData; + // Append a blob of arbitrary size to the records in this batch. The blob will + // be stored in the transaction log but not in any other file. In particular, + // it will not be persisted to the SST files. When iterating over this + // WriteBatch, WriteBatch::Handler::LogData will be called with the contents + // of the blob as it is encountered. Blobs, puts, deletes, and merges will be + // encountered in the same order in which they were inserted. 
The blob will
+  // NOT consume sequence number(s) and will NOT increase the count of the
+  // batch.
+  //
+  // Example application: add timestamps to the transaction log for use in
+  // replication.
+  Status PutLogData(const Slice& blob) override;
+
+  using WriteBatchBase::Clear;
+  // Clear all updates buffered in this batch.
+  void Clear() override;
+
+  // Records the state of the batch for future calls to RollbackToSavePoint().
+  // May be called multiple times to set multiple save points.
+  void SetSavePoint() override;
+
+  // Remove all entries in this batch (Put, Merge, Delete, PutLogData) since
+  // the most recent call to SetSavePoint() and removes the most recent save
+  // point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  Status RollbackToSavePoint() override;
+
+  // Pop the most recent save point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  Status PopSavePoint() override;
+
+  // Support for iterating over the contents of a batch.
+  // Objects of subclasses of Handler will be used by WriteBatch::Iterate().
+  class Handler {
+   public:
+    virtual ~Handler();
+    // All handler functions in this class provide default implementations so
+    // we won't break existing clients of Handler on a source code level when
+    // adding a new member function.
+
+    // The default implementation will just call Put without column family for
+    // backwards compatibility. If the column family is not default,
+    // the function is a no-op.
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                         const Slice& value) {
+      if (column_family_id == 0) {
+        // Put() historically doesn't return status. We didn't want to be
+        // backwards incompatible so we didn't change the return status
+        // (this is a public API). We do an ordinary Put and return Status::OK()
+        Put(key, value);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and PutCF not implemented");
+    }
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual void Put(const Slice& /*key*/, const Slice& /*value*/) {}
+
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status PutEntityCF(uint32_t /* column_family_id */,
+                               const Slice& /* key */,
+                               const Slice& /* entity */) {
+      return Status::NotSupported("PutEntityCF not implemented");
+    }
+
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+      if (column_family_id == 0) {
+        Delete(key);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and DeleteCF not implemented");
+    }
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual void Delete(const Slice& /*key*/) {}
+
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) {
+      if (column_family_id == 0) {
+        SingleDelete(key);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and SingleDeleteCF not implemented");
+    }
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+ virtual void SingleDelete(const Slice& /*key*/) {} + + // If user-defined timestamp is enabled, then `begin_key` and `end_key` + // both include timestamp. + virtual Status DeleteRangeCF(uint32_t /*column_family_id*/, + const Slice& /*begin_key*/, + const Slice& /*end_key*/) { + return Status::InvalidArgument("DeleteRangeCF not implemented"); + } + + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + Merge(key, value); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and MergeCF not implemented"); + } + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual void Merge(const Slice& /*key*/, const Slice& /*value*/) {} + + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual Status PutBlobIndexCF(uint32_t /*column_family_id*/, + const Slice& /*key*/, + const Slice& /*value*/) { + return Status::InvalidArgument("PutBlobIndexCF not implemented"); + } + + // The default implementation of LogData does nothing. + virtual void LogData(const Slice& blob); + + virtual Status MarkBeginPrepare(bool = false) { + return Status::InvalidArgument("MarkBeginPrepare() handler not defined."); + } + + virtual Status MarkEndPrepare(const Slice& /*xid*/) { + return Status::InvalidArgument("MarkEndPrepare() handler not defined."); + } + + virtual Status MarkNoop(bool /*empty_batch*/) { + return Status::InvalidArgument("MarkNoop() handler not defined."); + } + + virtual Status MarkRollback(const Slice& /*xid*/) { + return Status::InvalidArgument( + "MarkRollbackPrepare() handler not defined."); + } + + virtual Status MarkCommit(const Slice& /*xid*/) { + return Status::InvalidArgument("MarkCommit() handler not defined."); + } + + virtual Status MarkCommitWithTimestamp(const Slice& /*xid*/, + const Slice& /*commit_ts*/) { + return Status::InvalidArgument( + "MarkCommitWithTimestamp() handler not defined."); + } + + // Continue is called by WriteBatch::Iterate. If it returns false, + // iteration is halted. Otherwise, it continues iterating. The default + // implementation always returns true. + virtual bool Continue(); + + protected: + friend class WriteBatchInternal; + enum class OptionState { + kUnknown, + kDisabled, + kEnabled, + }; + virtual OptionState WriteAfterCommit() const { + return OptionState::kUnknown; + } + virtual OptionState WriteBeforePrepare() const { + return OptionState::kUnknown; + } + }; + Status Iterate(Handler* handler) const; + + // Retrieve the serialized version of this batch. + const std::string& Data() const { return rep_; } + + // Retrieve data size of the batch. 
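+  // (Equivalent to Data().size(), i.e. the size of the serialized
+  // representation.)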
+  size_t GetDataSize() const { return rep_.size(); }
+
+  // Returns the number of updates in the batch
+  uint32_t Count() const;
+
+  // Returns true if PutCF will be called during Iterate
+  bool HasPut() const;
+
+  // Returns true if PutEntityCF will be called during Iterate
+  bool HasPutEntity() const;
+
+  // Returns true if DeleteCF will be called during Iterate
+  bool HasDelete() const;
+
+  // Returns true if SingleDeleteCF will be called during Iterate
+  bool HasSingleDelete() const;
+
+  // Returns true if DeleteRangeCF will be called during Iterate
+  bool HasDeleteRange() const;
+
+  // Returns true if MergeCF will be called during Iterate
+  bool HasMerge() const;
+
+  // Returns true if MarkBeginPrepare will be called during Iterate
+  bool HasBeginPrepare() const;
+
+  // Returns true if MarkEndPrepare will be called during Iterate
+  bool HasEndPrepare() const;
+
+  // Returns true if MarkCommit will be called during Iterate
+  bool HasCommit() const;
+
+  // Returns true if MarkRollback will be called during Iterate
+  bool HasRollback() const;
+
+  // Experimental.
+  //
+  // Update timestamps of existing entries in the write batch if
+  // applicable. If a key is intended for a column family that disables
+  // timestamp, then this API won't set the timestamp for this key.
+  // This requires that all keys in the write batch that enable timestamps
+  // (possibly from multiple column families) have timestamps of the same
+  // format.
+  //
+  // ts_sz_func: callable object to obtain the timestamp sizes of column
+  // families. If ts_sz_func() accesses data structures, then the caller of
+  // this API must guarantee thread-safety. Like other parts of RocksDB, this
+  // API is not exception-safe. Therefore, ts_sz_func() must not throw.
+  //
+  // in: cf, the column family id.
+  // ret: timestamp size of the given column family. Returning
+  // std::numeric_limits<size_t>::max() indicates "don't know or column
+  // family info not found", and will cause UpdateTimestamps() to fail.
+  // size_t ts_sz_func(uint32_t cf);
+  Status UpdateTimestamps(const Slice& ts,
+                          std::function<size_t(uint32_t)> ts_sz_func);
+
+  // Verify the per-key-value checksums of this write batch.
+  // Corruption status will be returned if the verification fails.
+  // If this write batch does not have per-key-value checksum,
+  // OK status will be returned.
+  Status VerifyChecksum() const;
+
+  using WriteBatchBase::GetWriteBatch;
+  WriteBatch* GetWriteBatch() override { return this; }
+
+  // Constructor with a serialized string object
+  explicit WriteBatch(const std::string& rep);
+  explicit WriteBatch(std::string&& rep);
+
+  WriteBatch(const WriteBatch& src);
+  WriteBatch(WriteBatch&& src) noexcept;
+  WriteBatch& operator=(const WriteBatch& src);
+  WriteBatch& operator=(WriteBatch&& src);
+
+  // Marks this point in the WriteBatch as the last record to
+  // be inserted into the WAL, provided the WAL is enabled.
+  void MarkWalTerminationPoint();
+  const SavePoint& GetWalTerminationPoint() const { return wal_term_point_; }
+
+  void SetMaxBytes(size_t max_bytes) override { max_bytes_ = max_bytes; }
+
+  struct ProtectionInfo;
+  size_t GetProtectionBytesPerKey() const;
+
+ private:
+  friend class WriteBatchInternal;
+  friend class LocalSavePoint;
+  // TODO(myabandeh): this is needed for a hack to collapse the write batch and
+  // remove duplicate keys. Remove it when the hack is replaced with a proper
+  // solution.
+  friend class WriteBatchWithIndex;
+  std::unique_ptr<SavePoints> save_points_;
+
+  // When sending a WriteBatch through WriteImpl we might want to
+  // specify that only the first x records of the batch be written to
+  // the WAL.
+  SavePoint wal_term_point_;
+
+  // Is the content of the batch the application's latest state that is meant
+  // only to be used for recovery? Refer to
+  // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery for
+  // more details.
+  bool is_latest_persistent_state_ = false;
+
+  // False if all keys are from column families that disable user-defined
+  // timestamp OR UpdateTimestamps() has been called at least once.
+  // This flag will be set to true if any of the above Put(), Delete(),
+  // SingleDelete(), etc. APIs are called at least once.
+  // Calling Put(ts), Delete(ts), SingleDelete(ts), etc. will not set this flag
+  // to true because the assumption is that these APIs have already set the
+  // timestamps to desired values.
+  bool needs_in_place_update_ts_ = false;
+
+  // True if the write batch contains at least one key from a column family
+  // that enables user-defined timestamp.
+  bool has_key_with_ts_ = false;
+
+  // For HasXYZ. Mutable to allow lazy computation of results
+  mutable std::atomic<uint32_t> content_flags_;
+
+  // Performs deferred computation of content_flags if necessary
+  uint32_t ComputeContentFlags() const;
+
+  // Maximum size of rep_.
+  size_t max_bytes_;
+
+  std::unique_ptr<ProtectionInfo> prot_info_;
+
+  size_t default_cf_ts_sz_ = 0;
+
+ protected:
+  std::string rep_;  // See comment in write_batch.cc for the format of rep_
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/write_batch_base.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/write_batch_base.h
new file mode 100644
index 0000000000..f6f39ef0be
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/write_batch_base.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cstddef>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/wide_columns.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Status;
+class ColumnFamilyHandle;
+class WriteBatch;
+struct SliceParts;
+
+// Abstract base class that defines the basic interface for a write batch.
+// See WriteBatch for a basic implementation and WriteBatchWithIndex for an
+// indexed implementation.
+class WriteBatchBase {
+ public:
+  virtual ~WriteBatchBase() {}
+
+  // Store the mapping "key->value" in the database.
+  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) = 0;
+  virtual Status Put(const Slice& key, const Slice& value) = 0;
+  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& ts, const Slice& value) = 0;
+
+  // Variant of Put() that gathers output like writev(2). The key and value
+  // that will be written to the database are concatenations of arrays of
+  // slices.
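+  //
+  // A small sketch (hypothetical caller code, not part of this header): a key
+  // split across two fragments can be passed without concatenating it first:
+  //
+  //   Slice key_frags[2] = {Slice("user_"), Slice("key")};
+  //   Slice val_frag("value");
+  //   batch.Put(SliceParts(key_frags, 2), SliceParts(&val_frag, 1));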
+ virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value); + virtual Status Put(const SliceParts& key, const SliceParts& value); + + // Store the mapping "key->{column1:value1, column2:value2, ...}" in the + // column family specified by "column_family". + virtual Status PutEntity(ColumnFamilyHandle* column_family, const Slice& key, + const WideColumns& columns) = 0; + + // Merge "value" with the existing value of "key" in the database. + // "key->merge(existing, value)" + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Merge(const Slice& key, const Slice& value) = 0; + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts, const Slice& value) = 0; + + // variant that takes SliceParts + virtual Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value); + virtual Status Merge(const SliceParts& key, const SliceParts& value); + + // If the database contains a mapping for "key", erase it. Else do nothing. + virtual Status Delete(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status Delete(const Slice& key) = 0; + virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) = 0; + + // variant that takes SliceParts + virtual Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key); + virtual Status Delete(const SliceParts& key); + + // If the database contains a mapping for "key", erase it. Expects that the + // key was not overwritten. Else do nothing. + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status SingleDelete(const Slice& key) = 0; + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts) = 0; + + // variant that takes SliceParts + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key); + virtual Status SingleDelete(const SliceParts& key); + + // If the database contains mappings in the range ["begin_key", "end_key"), + // erase them. Else do nothing. + virtual Status DeleteRange(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) = 0; + virtual Status DeleteRange(const Slice& begin_key, const Slice& end_key) = 0; + virtual Status DeleteRange(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key, + const Slice& ts) = 0; + + // variant that takes SliceParts + virtual Status DeleteRange(ColumnFamilyHandle* column_family, + const SliceParts& begin_key, + const SliceParts& end_key); + virtual Status DeleteRange(const SliceParts& begin_key, + const SliceParts& end_key); + + // Append a blob of arbitrary size to the records in this batch. The blob will + // be stored in the transaction log but not in any other file. In particular, + // it will not be persisted to the SST files. When iterating over this + // WriteBatch, WriteBatch::Handler::LogData will be called with the contents + // of the blob as it is encountered. Blobs, puts, deletes, and merges will be + // encountered in the same order in which they were inserted. The blob will + // NOT consume sequence number(s) and will NOT increase the count of the batch + // + // Example application: add timestamps to the transaction log for use in + // replication. + virtual Status PutLogData(const Slice& blob) = 0; + + // Clear all updates buffered in this batch. 
+  virtual void Clear() = 0;
+
+  // Convert this batch into a WriteBatch. This is an abstracted way of
+  // converting any WriteBatchBase (e.g. WriteBatchWithIndex) into a basic
+  // WriteBatch.
+  virtual WriteBatch* GetWriteBatch() = 0;
+
+  // Records the state of the batch for future calls to RollbackToSavePoint().
+  // May be called multiple times to set multiple save points.
+  virtual void SetSavePoint() = 0;
+
+  // Remove all entries in this batch (Put, Merge, Delete, PutLogData) since
+  // the most recent call to SetSavePoint() and removes the most recent save
+  // point.
+  // If there is no previous call to SetSavePoint(), behaves the same as
+  // Clear().
+  virtual Status RollbackToSavePoint() = 0;
+
+  // Pop the most recent save point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  virtual Status PopSavePoint() = 0;
+
+  // Sets the maximum size of the write batch in bytes. 0 means no limit.
+  virtual void SetMaxBytes(size_t max_bytes) = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/write_buffer_manager.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/write_buffer_manager.h
new file mode 100644
index 0000000000..61e75c8888
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/include/rocksdb/write_buffer_manager.h
@@ -0,0 +1,183 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBufferManager is for managing memory allocation for one or more
+// MemTables.
+
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <list>
+#include <memory>
+#include <mutex>
+
+#include "rocksdb/cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+class CacheReservationManager;
+
+// Interface to block and signal DB instances, intended for RocksDB
+// internal use only. Each DB instance contains ptr to StallInterface.
+class StallInterface {
+ public:
+  virtual ~StallInterface() {}
+
+  virtual void Block() = 0;
+
+  virtual void Signal() = 0;
+};
+
+class WriteBufferManager final {
+ public:
+  // Parameters:
+  // _buffer_size: _buffer_size = 0 indicates no limit. Memory won't be capped.
+  // memory_usage() won't be valid and ShouldFlush() will always return true.
+  //
+  // cache_: if `cache` is provided, we'll put dummy entries in the cache and
+  // cost the memory allocated to the cache. It can be used even if
+  // _buffer_size = 0.
+  //
+  // allow_stall: if set to true, it will enable stalling of writes when
+  // memory_usage() exceeds buffer_size. It will wait for flush to complete and
+  // memory usage to drop down.
+  explicit WriteBufferManager(size_t _buffer_size,
+                              std::shared_ptr<Cache> cache = {},
+                              bool allow_stall = false);
+  // No copying allowed
+  WriteBufferManager(const WriteBufferManager&) = delete;
+  WriteBufferManager& operator=(const WriteBufferManager&) = delete;
+
+  ~WriteBufferManager();
+
+  // Returns true if buffer_limit is passed to limit the total memory usage and
+  // is greater than 0.
+  bool enabled() const { return buffer_size() > 0; }
+
+  // Returns true if pointer to cache is passed.
+  bool cost_to_cache() const { return cache_res_mgr_ != nullptr; }
+
+  // Returns the total memory used by memtables.
+  // Only valid if enabled()
+  size_t memory_usage() const {
+    return memory_used_.load(std::memory_order_relaxed);
+  }
+
+  // Returns the total memory used by active memtables.
+  size_t mutable_memtable_memory_usage() const {
+    return memory_active_.load(std::memory_order_relaxed);
+  }
+
+  size_t dummy_entries_in_cache_usage() const;
+
+  // Returns the buffer_size.
+  size_t buffer_size() const {
+    return buffer_size_.load(std::memory_order_relaxed);
+  }
+
+  // REQUIRED: `new_size` > 0
+  void SetBufferSize(size_t new_size) {
+    assert(new_size > 0);
+    buffer_size_.store(new_size, std::memory_order_relaxed);
+    mutable_limit_.store(new_size * 7 / 8, std::memory_order_relaxed);
+    // Check if stall is active and can be ended.
+    MaybeEndWriteStall();
+  }
+
+  void SetAllowStall(bool new_allow_stall) {
+    allow_stall_.store(new_allow_stall, std::memory_order_relaxed);
+    MaybeEndWriteStall();
+  }
+
+  // Below functions should be called by RocksDB internally.
+
+  // Should only be called from write thread
+  bool ShouldFlush() const {
+    if (enabled()) {
+      if (mutable_memtable_memory_usage() >
+          mutable_limit_.load(std::memory_order_relaxed)) {
+        return true;
+      }
+      size_t local_size = buffer_size();
+      if (memory_usage() >= local_size &&
+          mutable_memtable_memory_usage() >= local_size / 2) {
+        // If the memory exceeds the buffer size, we trigger more aggressive
+        // flush. But if already more than half memory is being flushed,
+        // triggering more flush may not help. We will hold it instead.
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Returns true if total memory usage exceeded buffer_size.
+  // We stall the writes until memory_usage drops below buffer_size. When the
+  // function returns true, all writer threads (including one checking this
+  // condition) across all DBs will be stalled. Stall is allowed only if the
+  // user passes allow_stall = true during WriteBufferManager instance
+  // creation.
+  //
+  // Should only be called by RocksDB internally.
+  bool ShouldStall() const {
+    if (!allow_stall_.load(std::memory_order_relaxed) || !enabled()) {
+      return false;
+    }
+
+    return IsStallActive() || IsStallThresholdExceeded();
+  }
+
+  // Returns true if stall is active.
+  bool IsStallActive() const {
+    return stall_active_.load(std::memory_order_relaxed);
+  }
+
+  // Returns true if stalling condition is met.
+  bool IsStallThresholdExceeded() const {
+    return memory_usage() >= buffer_size_;
+  }
+
+  void ReserveMem(size_t mem);
+
+  // We are in the process of freeing `mem` bytes, so it is not considered
+  // when checking the soft limit.
+  void ScheduleFreeMem(size_t mem);
+
+  void FreeMem(size_t mem);
+
+  // Add the DB instance to the queue and block the DB.
+  // Should only be called by RocksDB internally.
+  void BeginWriteStall(StallInterface* wbm_stall);
+
+  // If stall conditions have resolved, remove DB instances from queue and
+  // signal them to continue.
+  void MaybeEndWriteStall();
+
+  void RemoveDBFromQueue(StallInterface* wbm_stall);
+
+ private:
+  std::atomic<size_t> buffer_size_;
+  std::atomic<size_t> mutable_limit_;
+  std::atomic<size_t> memory_used_;
+  // Memory that hasn't been scheduled to free.
+  std::atomic<size_t> memory_active_;
+  std::shared_ptr<CacheReservationManager> cache_res_mgr_;
+  // Protects cache_res_mgr_
+  std::mutex cache_res_mgr_mu_;
+
+  std::list<StallInterface*> queue_;
+  // Protects the queue_ and stall_active_.
+  std::mutex mu_;
+  std::atomic<bool> allow_stall_;
+  // Value should only be changed by BeginWriteStall() and MaybeEndWriteStall()
+  // while holding mu_, but it can be read without a lock.
+  std::atomic<bool> stall_active_;
+
+  void ReserveMemWithCache(size_t mem);
+  void FreeMemWithCache(size_t mem);
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/compaction_filter_factory_jnicallback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/compaction_filter_factory_jnicallback.h
new file mode 100644
index 0000000000..2f26f8dbe5
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/compaction_filter_factory_jnicallback.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::CompactionFilterFactory.
+
+#ifndef JAVA_ROCKSJNI_COMPACTION_FILTER_FACTORY_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_COMPACTION_FILTER_FACTORY_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include <memory>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactionFilterFactoryJniCallback : public JniCallback,
+                                           public CompactionFilterFactory {
+ public:
+  CompactionFilterFactoryJniCallback(JNIEnv* env,
+                                     jobject jcompaction_filter_factory);
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context);
+  virtual const char* Name() const;
+
+ private:
+  std::unique_ptr<const char[]> m_name;
+  jmethodID m_jcreate_compaction_filter_methodid;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_COMPACTION_FILTER_FACTORY_JNICALLBACK_H_
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/comparatorjnicallback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/comparatorjnicallback.h
new file mode 100644
index 0000000000..034c0d5d7d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/comparatorjnicallback.h
@@ -0,0 +1,137 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Comparator
+
+#ifndef JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_
+#define JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_
+
+#include <jni.h>
+
+#include <memory>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksjni/jnicallback.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum ReusedSynchronisationType {
+  /**
+   * Standard mutex.
+   */
+  MUTEX,
+
+  /**
+   * Use adaptive mutex, which spins in the user space before resorting
+   * to kernel. This could reduce context switch when the mutex is not
+   * heavily contended. However, if the mutex is hot, we could end up
+   * wasting spin time.
+   */
+  ADAPTIVE_MUTEX,
+
+  /**
+   * There is a reused buffer per-thread.
+   */
+  THREAD_LOCAL
+};
+
+struct ComparatorJniCallbackOptions {
+  // Set the synchronisation type used to guard the reused buffers.
+  // Only used if max_reused_buffer_size > 0.
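+  // Editorial note (not vendored code): a hypothetical configuration sketch,
+  // with illustrative values only:
+  //
+  //   ComparatorJniCallbackOptions opts;
+  //   opts.reused_synchronisation_type = THREAD_LOCAL;  // no shared locks
+  //   opts.direct_buffer = true;   // pass Slices without on-heap copies
+  //   opts.max_reused_buffer_size = 4096;
+  //
+  // THREAD_LOCAL trades one buffer per thread for zero lock contention; the
+  // mutex variants share the five cached buffers described below.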
+  ReusedSynchronisationType reused_synchronisation_type = ADAPTIVE_MUTEX;
+
+  // Indicates if a direct byte buffer (i.e. outside of the normal
+  // garbage-collected heap) is used for the callbacks to Java,
+  // as opposed to a non-direct byte buffer which is a wrapper around
+  // an on-heap byte[].
+  bool direct_buffer = true;
+
+  // Maximum size of a buffer (in bytes) that will be reused.
+  // Comparators will use 5 of these buffers,
+  // so the retained memory size will be 5 * max_reused_buffer_size.
+  // When a buffer is needed for transferring data to a callback,
+  // if it requires less than max_reused_buffer_size, then an
+  // existing buffer will be reused, else a new buffer will be
+  // allocated just for that callback. -1 to disable.
+  int32_t max_reused_buffer_size = 64;
+};
+
+/**
+ * This class acts as a bridge between C++
+ * and Java. The methods in this class will be
+ * called back from the RocksDB storage engine (C++);
+ * we then call back to the appropriate Java method,
+ * which enables Comparators to be implemented in Java.
+ *
+ * The design of this Comparator caches the Java Slice
+ * objects that are used in the compare and findShortestSeparator
+ * method callbacks. Instead of creating new objects for each callback
+ * of those functions, reusing them via setHandle is a lot
+ * faster; unfortunately this means that we have to
+ * introduce independent locking in regions of each of those methods
+ * via the mutexes mtx_compare and mtx_findShortestSeparator respectively.
+ */
+class ComparatorJniCallback : public JniCallback, public Comparator {
+ public:
+  ComparatorJniCallback(JNIEnv* env, jobject jcomparator,
+                        const ComparatorJniCallbackOptions* options);
+  ~ComparatorJniCallback();
+  virtual const char* Name() const;
+  virtual int Compare(const Slice& a, const Slice& b) const;
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const;
+  virtual void FindShortSuccessor(std::string* key) const;
+  const std::unique_ptr<const ComparatorJniCallbackOptions> m_options;
+
+ private:
+  struct ThreadLocalBuf {
+    ThreadLocalBuf(JavaVM* _jvm, bool _direct_buffer, jobject _jbuf)
+        : jvm(_jvm), direct_buffer(_direct_buffer), jbuf(_jbuf) {}
+    JavaVM* jvm;
+    bool direct_buffer;
+    jobject jbuf;
+  };
+  inline void MaybeLockForReuse(const std::unique_ptr<port::Mutex>& mutex,
+                                const bool cond) const;
+  inline void MaybeUnlockForReuse(const std::unique_ptr<port::Mutex>& mutex,
+                                  const bool cond) const;
+  jobject GetBuffer(JNIEnv* env, const Slice& src, bool reuse_buffer,
+                    ThreadLocalPtr* tl_buf, jobject jreuse_buffer) const;
+  jobject ReuseBuffer(JNIEnv* env, const Slice& src,
+                      jobject jreuse_buffer) const;
+  jobject NewBuffer(JNIEnv* env, const Slice& src) const;
+  void DeleteBuffer(JNIEnv* env, jobject jbuffer) const;
+  // used for synchronisation in compare method
+  std::unique_ptr<port::Mutex> mtx_compare;
+  // used for synchronisation in findShortestSeparator method
+  std::unique_ptr<port::Mutex> mtx_shortest;
+  // used for synchronisation in findShortSuccessor method
+  std::unique_ptr<port::Mutex> mtx_short;
+  std::unique_ptr<const char[]> m_name;
+  jclass m_abstract_comparator_jni_bridge_clazz;  // TODO(AR) could we make
+                                                  // this static somehow?
+  jclass m_jbytebuffer_clazz;  // TODO(AR) we could cache this globally for the
+                               // entire VM if we switch more APIs to use
+                               // ByteBuffer
+  jmethodID m_jcompare_mid;   // TODO(AR) could we make this static somehow?
+  jmethodID m_jshortest_mid;  // TODO(AR) could we make this static somehow?
+  jmethodID m_jshort_mid;     // TODO(AR) could we make this static somehow?
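+  // Editorial note (not vendored code): jmethodIDs remain valid for as long
+  // as the defining class stays loaded, so caching them as the TODOs above
+  // suggest is safe JNI practice. The jobject buffers below, by contrast,
+  // must be created as global references in the constructor; plain local
+  // references would be invalidated when the creating native frame returns.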
+  jobject m_jcompare_buf_a;
+  jobject m_jcompare_buf_b;
+  jobject m_jshortest_buf_start;
+  jobject m_jshortest_buf_limit;
+  jobject m_jshort_buf_key;
+  ThreadLocalPtr* m_tl_buf_a;
+  ThreadLocalPtr* m_tl_buf_b;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/cplusplus_to_java_convert.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/cplusplus_to_java_convert.h
new file mode 100644
index 0000000000..0eea6fa2c4
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/cplusplus_to_java_convert.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+/*
+ * This macro is used for 32 bit OS. In a 32 bit OS, the result is a
+ negative number if we use reinterpret_cast<jlong>(pointer).
+ * For example, with jlong ptr = reinterpret_cast<jlong>(pointer), ptr is a
+ negative number in a 32 bit OS.
+ * If we check ptr using ptr > 0, it fails. For example, the following code is
+ not correct.
+ * if (jblock_cache_handle > 0) {
+     std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *pCache =
+         reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *>(
+             jblock_cache_handle);
+     options.block_cache = *pCache;
+   }
+ * But the result is a positive number if we do
+ reinterpret_cast<size_t>(pointer) first and then cast it to jlong. size_t is 4
+ bytes long in a 32 bit OS and 8 bytes long in a 64 bit OS, and
+ static_cast<jlong>(reinterpret_cast<size_t>(_pointer)) also works in a 64
+ bit OS.
+ *
+ * We don't need an opposite cast because converting from jlong back to a C++
+ pointer works in both 32 bit and 64 bit OS.
+ * For example, the following code works in both 32 bit and 64 bit OS.
+ jblock_cache_handle is jlong.
+ * std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *pCache =
+       reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *>(
+           jblock_cache_handle);
+*/
+
+#define GET_CPLUSPLUS_POINTER(_pointer) \
+  static_cast<jlong>(reinterpret_cast<size_t>(_pointer))
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/event_listener_jnicallback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/event_listener_jnicallback.h
new file mode 100644
index 0000000000..f4a235a232
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/event_listener_jnicallback.h
@@ -0,0 +1,122 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::EventListener.
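+//
+// Editorial note (not vendored code): the callback classes in this directory
+// all follow the same bridge pattern. In sketch form, with the conversion
+// helper name being hypothetical:
+//
+//   void EventListenerJniCallback::OnFlushBegin(DB* db,
+//                                               const FlushJobInfo& info) {
+//     jboolean attached_thread;
+//     JNIEnv* env = getJniEnv(&attached_thread);  // inherited, JniCallback
+//     jobject jinfo = toJavaFlushJobInfo(env, &info);  // hypothetical helper
+//     env->CallVoidMethod(m_jcallback_obj, m_on_flush_begin_proxy_mid, jinfo);
+//     env->DeleteLocalRef(jinfo);
+//     releaseJniEnv(attached_thread);  // detach only if we attached above
+//   }
+//
+// The real implementations live in the corresponding .cc files.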
+
+#ifndef JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include <memory>
+#include <set>
+
+#include "rocksdb/listener.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum EnabledEventCallback {
+  ON_FLUSH_COMPLETED = 0x0,
+  ON_FLUSH_BEGIN = 0x1,
+  ON_TABLE_FILE_DELETED = 0x2,
+  ON_COMPACTION_BEGIN = 0x3,
+  ON_COMPACTION_COMPLETED = 0x4,
+  ON_TABLE_FILE_CREATED = 0x5,
+  ON_TABLE_FILE_CREATION_STARTED = 0x6,
+  ON_MEMTABLE_SEALED = 0x7,
+  ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED = 0x8,
+  ON_EXTERNAL_FILE_INGESTED = 0x9,
+  ON_BACKGROUND_ERROR = 0xA,
+  ON_STALL_CONDITIONS_CHANGED = 0xB,
+  ON_FILE_READ_FINISH = 0xC,
+  ON_FILE_WRITE_FINISH = 0xD,
+  ON_FILE_FLUSH_FINISH = 0xE,
+  ON_FILE_SYNC_FINISH = 0xF,
+  ON_FILE_RANGE_SYNC_FINISH = 0x10,
+  ON_FILE_TRUNCATE_FINISH = 0x11,
+  ON_FILE_CLOSE_FINISH = 0x12,
+  SHOULD_BE_NOTIFIED_ON_FILE_IO = 0x13,
+  ON_ERROR_RECOVERY_BEGIN = 0x14,
+  ON_ERROR_RECOVERY_COMPLETED = 0x15,
+
+  NUM_ENABLED_EVENT_CALLBACK = 0x16,
+};
+
+class EventListenerJniCallback : public JniCallback, public EventListener {
+ public:
+  EventListenerJniCallback(
+      JNIEnv* env, jobject jevent_listener,
+      const std::set<EnabledEventCallback>& enabled_event_callbacks);
+  virtual ~EventListenerJniCallback();
+  virtual void OnFlushCompleted(DB* db, const FlushJobInfo& flush_job_info);
+  virtual void OnFlushBegin(DB* db, const FlushJobInfo& flush_job_info);
+  virtual void OnTableFileDeleted(const TableFileDeletionInfo& info);
+  virtual void OnCompactionBegin(DB* db, const CompactionJobInfo& ci);
+  virtual void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci);
+  virtual void OnTableFileCreated(const TableFileCreationInfo& info);
+  virtual void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& info);
+  virtual void OnMemTableSealed(const MemTableInfo& info);
+  virtual void OnColumnFamilyHandleDeletionStarted(ColumnFamilyHandle* handle);
+  virtual void OnExternalFileIngested(DB* db,
+                                      const ExternalFileIngestionInfo& info);
+  virtual void OnBackgroundError(BackgroundErrorReason reason,
+                                 Status* bg_error);
+  virtual void OnStallConditionsChanged(const WriteStallInfo& info);
+  virtual void OnFileReadFinish(const FileOperationInfo& info);
+  virtual void OnFileWriteFinish(const FileOperationInfo& info);
+  virtual void OnFileFlushFinish(const FileOperationInfo& info);
+  virtual void OnFileSyncFinish(const FileOperationInfo& info);
+  virtual void OnFileRangeSyncFinish(const FileOperationInfo& info);
+  virtual void OnFileTruncateFinish(const FileOperationInfo& info);
+  virtual void OnFileCloseFinish(const FileOperationInfo& info);
+  virtual bool ShouldBeNotifiedOnFileIO();
+  virtual void OnErrorRecoveryBegin(BackgroundErrorReason reason,
+                                    Status bg_error, bool* auto_recovery);
+  virtual void OnErrorRecoveryCompleted(Status old_bg_error);
+
+ private:
+  inline void InitCallbackMethodId(jmethodID& mid, EnabledEventCallback eec,
+                                   JNIEnv* env,
+                                   jmethodID (*get_id)(JNIEnv* env));
+  template <class T>
+  inline jobject SetupCallbackInvocation(
+      JNIEnv*& env, jboolean& attached_thread, const T& cpp_obj,
+      jobject (*convert)(JNIEnv* env, const T* cpp_obj));
+  inline void CleanupCallbackInvocation(JNIEnv* env, jboolean attached_thread,
+                                        std::initializer_list<jobject> refs);
+  inline void OnFileOperation(const jmethodID& mid,
+                              const FileOperationInfo& info);
+
+  const std::set<EnabledEventCallback> m_enabled_event_callbacks;
+  jmethodID m_on_flush_completed_proxy_mid;
+  jmethodID m_on_flush_begin_proxy_mid;
+  jmethodID m_on_table_file_deleted_mid;
+  jmethodID m_on_compaction_begin_proxy_mid;
+  jmethodID m_on_compaction_completed_proxy_mid;
+  jmethodID m_on_table_file_created_mid;
+  jmethodID m_on_table_file_creation_started_mid;
+  jmethodID m_on_mem_table_sealed_mid;
+  jmethodID m_on_column_family_handle_deletion_started_mid;
+  jmethodID m_on_external_file_ingested_proxy_mid;
+  jmethodID m_on_background_error_proxy_mid;
+  jmethodID m_on_stall_conditions_changed_mid;
+  jmethodID m_on_file_read_finish_mid;
+  jmethodID m_on_file_write_finish_mid;
+  jmethodID m_on_file_flush_finish_mid;
+  jmethodID m_on_file_sync_finish_mid;
+  jmethodID m_on_file_range_sync_finish_mid;
+  jmethodID m_on_file_truncate_finish_mid;
+  jmethodID m_on_file_close_finish_mid;
+  jmethodID m_should_be_notified_on_file_io;
+  jmethodID m_on_error_recovery_begin_proxy_mid;
+  jmethodID m_on_error_recovery_completed_mid;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/jnicallback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/jnicallback.h
new file mode 100644
index 0000000000..a03a041282
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/jnicallback.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// JNI Callbacks from C++ to sub-classes of org.rocksdb.RocksCallbackObject
+
+#ifndef JAVA_ROCKSJNI_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class JniCallback {
+ public:
+  JniCallback(JNIEnv* env, jobject jcallback_obj);
+  virtual ~JniCallback();
+
+  const jobject& GetJavaObject() const { return m_jcallback_obj; }
+
+ protected:
+  JavaVM* m_jvm;
+  jobject m_jcallback_obj;
+  JNIEnv* getJniEnv(jboolean* attached) const;
+  void releaseJniEnv(jboolean& attached) const;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_JNICALLBACK_H_
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/loggerjnicallback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/loggerjnicallback.h
new file mode 100644
index 0000000000..57774988c5
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/loggerjnicallback.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Logger
+
+#ifndef JAVA_ROCKSJNI_LOGGERJNICALLBACK_H_
+#define JAVA_ROCKSJNI_LOGGERJNICALLBACK_H_
+
+#include <jni.h>
+
+#include <memory>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LoggerJniCallback : public JniCallback, public Logger {
+ public:
+  LoggerJniCallback(JNIEnv* env, jobject jLogger);
+  ~LoggerJniCallback();
+
+  using Logger::GetInfoLogLevel;
+  using Logger::SetInfoLogLevel;
+  // Write an entry to the log file with the specified format.
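+  // Editorial note (not vendored code): the two-argument overload below
+  // logs at the default INFO level; the three-argument overload filters
+  // against the internal log level, formats into a heap buffer via
+  // format_str() (declared below), and then invokes the cached
+  // m_jLogMethodId on the Java Logger with the matching level object,
+  // e.g. m_jinfo_level.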
+  virtual void Logv(const char* format, va_list ap);
+  // Write an entry to the log file with the specified log level
+  // and format. Any log with level under the internal log level
+  // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
+  // printed.
+  virtual void Logv(const InfoLogLevel log_level, const char* format,
+                    va_list ap);
+
+ private:
+  jmethodID m_jLogMethodId;
+  jobject m_jdebug_level;
+  jobject m_jinfo_level;
+  jobject m_jwarn_level;
+  jobject m_jerror_level;
+  jobject m_jfatal_level;
+  jobject m_jheader_level;
+  std::unique_ptr<char[]> format_str(const char* format, va_list ap) const;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_LOGGERJNICALLBACK_H_
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/portal.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/portal.h
new file mode 100644
index 0000000000..aa0cc61317
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/portal.h
@@ -0,0 +1,8706 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// This file is designed for caching those frequently used IDs and provides
+// an efficient portal (i.e., a set of static functions) to access Java code
+// from C++.
+
+#ifndef JAVA_ROCKSJNI_PORTAL_H_
+#define JAVA_ROCKSJNI_PORTAL_H_
+
+#include <jni.h>
+
+#include <algorithm>
+#include <cstring>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/backup_engine.h"
+#include "rocksdb/utilities/memory_util.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksjni/compaction_filter_factory_jnicallback.h"
+#include "rocksjni/comparatorjnicallback.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/event_listener_jnicallback.h"
+#include "rocksjni/loggerjnicallback.h"
+#include "rocksjni/table_filter_jnicallback.h"
+#include "rocksjni/trace_writer_jnicallback.h"
+#include "rocksjni/transaction_notifier_jnicallback.h"
+#include "rocksjni/wal_filter_jnicallback.h"
+#include "rocksjni/writebatchhandlerjnicallback.h"
+
+// Remove macro on Windows
+#ifdef DELETE
+#undef DELETE
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class JavaClass {
+ public:
+  /**
+   * Gets and initializes a Java Class
+   *
+   * @param env A pointer to the Java environment
+   * @param jclazz_name The fully qualified JNI name of the Java Class
+   *     e.g. "java/lang/String"
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env, const char* jclazz_name) {
+    jclass jclazz = env->FindClass(jclazz_name);
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+};
+
+// Native class template
+template <class PTR, class DERIVED>
+class RocksDBNativeClass : public JavaClass {};
+
+// Native class template for sub-classes of RocksMutableObject
+template <class PTR, class DERIVED>
+class NativeRocksMutableObject : public RocksDBNativeClass<PTR, DERIVED> {
+ public:
+  /**
+   * Gets the Java Method ID for the
+   * RocksMutableObject#setNativeHandle(long, boolean) method
+   *
+   * @param env A pointer to the Java environment
+   * @return The Java Method ID or nullptr if the RocksMutableObject class
+   *     cannot be accessed, or if one of the NoSuchMethodError,
+   *     ExceptionInInitializerError or OutOfMemoryError exceptions is thrown
+   */
+  static jmethodID getSetNativeHandleMethod(JNIEnv* env) {
+    static jclass jclazz = DERIVED::getJClass(env);
+    if (jclazz == nullptr) {
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "setNativeHandle", "(JZ)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Sets the C++ object pointer handle in the Java object
+   *
+   * @param env A pointer to the Java environment
+   * @param jobj The Java object on which to set the pointer handle
+   * @param ptr The C++ object pointer
+   * @param java_owns_handle JNI_TRUE if ownership of the C++ object is
+   *     managed by the Java object
+   *
+   * @return true if a Java exception is pending, false otherwise
+   */
+  static bool setHandle(JNIEnv* env, jobject jobj, PTR ptr,
+                        jboolean java_owns_handle) {
+    assert(jobj != nullptr);
+    static jmethodID mid = getSetNativeHandleMethod(env);
+    if (mid == nullptr) {
+      return true;  // signal exception
+    }
+
+    env->CallVoidMethod(jobj, mid, GET_CPLUSPLUS_POINTER(ptr),
+                        java_owns_handle);
+    if (env->ExceptionCheck()) {
+      return true;  // signal exception
+    }
+
+    return false;
+  }
+};
+
+// Java Exception template
+template <class DERIVED>
+class JavaException : public JavaClass {
+ public:
+  /**
+   * Create and throw a java exception with the provided message
+   *
+   * @param env A pointer to the Java environment
+   * @param msg The message for the exception
+   *
+   * @return true if an exception was thrown, false otherwise
+   */
+  static bool ThrowNew(JNIEnv* env, const std::string& msg) {
+    jclass jclazz = DERIVED::getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      std::cerr << "JavaException::ThrowNew - Error: unexpected exception!"
+                << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    const jint rs = env->ThrowNew(jclazz, msg.c_str());
+    if (rs != JNI_OK) {
+      // exception could not be thrown
+      std::cerr << "JavaException::ThrowNew - Fatal: could not throw exception!"
+                << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    return true;
+  }
+};
+
+// The portal class for java.lang.IllegalArgumentException
+class IllegalArgumentExceptionJni
+    : public JavaException<IllegalArgumentExceptionJni> {
+ public:
+  /**
+   * Get the Java Class java.lang.IllegalArgumentException
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaException::getJClass(env, "java/lang/IllegalArgumentException");
+  }
+
+  /**
+   * Create and throw a Java IllegalArgumentException with the provided status
+   *
+   * If s.ok() == true, then this function will not throw any exception.
+   *
+   * @param env A pointer to the Java environment
+   * @param s The status for the exception
+   *
+   * @return true if an exception was thrown, false otherwise
+   */
+  static bool ThrowNew(JNIEnv* env, const Status& s) {
+    assert(!s.ok());
+    if (s.ok()) {
+      return false;
+    }
+
+    // get the IllegalArgumentException class
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      std::cerr << "IllegalArgumentExceptionJni::ThrowNew/class - Error: "
+                   "unexpected exception!"
+                << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    return JavaException::ThrowNew(env, s.ToString());
+  }
+};
+
+// The portal class for org.rocksdb.Status.Code
+class CodeJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Status.Code
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/Status$Code");
+  }
+
+  /**
+   * Get the Java Method: Status.Code#getValue
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getValueMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "getValue", "()B");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.Status.SubCode
+class SubCodeJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Status.SubCode
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/Status$SubCode");
+  }
+
+  /**
+   * Get the Java Method: Status.SubCode#getValue
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getValueMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "getValue", "()B");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  static ROCKSDB_NAMESPACE::Status::SubCode toCppSubCode(
+      const jbyte jsub_code) {
+    switch (jsub_code) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kNone;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kMutexTimeout;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kLockTimeout;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kLockLimit;
+      case 0x4:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kNoSpace;
+      case 0x5:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kDeadlock;
+      case 0x6:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kStaleFile;
+      case 0x7:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kMemoryLimit;
+
+      case 0x7F:
+      default:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kNone;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.Status
+class StatusJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::Status*, StatusJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Status
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/Status");
+  }
+
+  /**
+   * Get the Java Method: Status#getCode
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getCodeMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "getCode", "()Lorg/rocksdb/Status$Code;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Status#getSubCode
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getSubCodeMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "getSubCode",
+                                            "()Lorg/rocksdb/Status$SubCode;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Status#getState
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getStateMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "getState", "()Ljava/lang/String;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Create a new Java org.rocksdb.Status object with the same properties as
+   * the provided C++ ROCKSDB_NAMESPACE::Status object
+   *
+   * @param env A pointer to the Java environment
+   * @param status The ROCKSDB_NAMESPACE::Status object
+   *
+   * @return A reference to a Java org.rocksdb.Status object, or nullptr
+   *     if an exception occurs
+   */
+  static jobject construct(JNIEnv* env, const Status& status) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid =
+        env->GetMethodID(jclazz, "<init>", "(BBLjava/lang/String;)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    // convert the Status state for Java
+    jstring jstate = nullptr;
+    if (status.getState() != nullptr) {
+      const char* const state = status.getState();
+      jstate = env->NewStringUTF(state);
+      if (env->ExceptionCheck()) {
+        if (jstate != nullptr) {
+          env->DeleteLocalRef(jstate);
+        }
+        return nullptr;
+      }
+    }
+
+    jobject jstatus =
+        env->NewObject(jclazz, mid, toJavaStatusCode(status.code()),
+                       toJavaStatusSubCode(status.subcode()), jstate);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      if (jstate != nullptr) {
+        env->DeleteLocalRef(jstate);
+      }
+      return nullptr;
+    }
+
+    if (jstate != nullptr) {
+      env->DeleteLocalRef(jstate);
+    }
+
+    return jstatus;
+  }
+
+  static jobject construct(JNIEnv* env, const Status* status) {
+    return construct(env, *status);
+  }
+
+  // Returns the equivalent org.rocksdb.Status.Code for the provided
+  // C++ ROCKSDB_NAMESPACE::Status::Code enum
+  static jbyte toJavaStatusCode(const ROCKSDB_NAMESPACE::Status::Code& code) {
+    switch (code) {
+      case ROCKSDB_NAMESPACE::Status::Code::kOk:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::Status::Code::kNotFound:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::Status::Code::kCorruption:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::Status::Code::kNotSupported:
+        return 0x3;
+      case ROCKSDB_NAMESPACE::Status::Code::kInvalidArgument:
+        return 0x4;
+      case ROCKSDB_NAMESPACE::Status::Code::kIOError:
+        return 0x5;
+      case ROCKSDB_NAMESPACE::Status::Code::kMergeInProgress:
+        return 0x6;
+      case ROCKSDB_NAMESPACE::Status::Code::kIncomplete:
+        return 0x7;
+      case ROCKSDB_NAMESPACE::Status::Code::kShutdownInProgress:
+        return 0x8;
+      case ROCKSDB_NAMESPACE::Status::Code::kTimedOut:
+        return 0x9;
+      case ROCKSDB_NAMESPACE::Status::Code::kAborted:
+        return 0xA;
+      case ROCKSDB_NAMESPACE::Status::Code::kBusy:
+        return 0xB;
+      case ROCKSDB_NAMESPACE::Status::Code::kExpired:
+        return 0xC;
+      case ROCKSDB_NAMESPACE::Status::Code::kTryAgain:
+        return 0xD;
+      case ROCKSDB_NAMESPACE::Status::Code::kColumnFamilyDropped:
+        return 0xE;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent org.rocksdb.Status.SubCode for the provided
+  // C++ ROCKSDB_NAMESPACE::Status::SubCode enum
+  static jbyte toJavaStatusSubCode(
+      const ROCKSDB_NAMESPACE::Status::SubCode& subCode) {
+    switch (subCode) {
+      case ROCKSDB_NAMESPACE::Status::SubCode::kNone:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kMutexTimeout:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kLockTimeout:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kLockLimit:
+        return 0x3;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kNoSpace:
+        return 0x4;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kDeadlock:
+        return 0x5;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kStaleFile:
+        return 0x6;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kMemoryLimit:
+        return 0x7;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  static std::unique_ptr<ROCKSDB_NAMESPACE::Status> toCppStatus(
+      const jbyte jcode_value, const jbyte jsub_code_value) {
+    std::unique_ptr<ROCKSDB_NAMESPACE::Status> status;
+    switch (jcode_value) {
+      case 0x0:
+        // Ok
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::OK()));
+        break;
+      case 0x1:
+        // NotFound
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::NotFound(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0x2:
+        // Corruption
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Corruption(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0x3:
+        // NotSupported
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(
+                ROCKSDB_NAMESPACE::Status::NotSupported(
+                    ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(
+                        jsub_code_value))));
+        break;
+      case 0x4:
+        // InvalidArgument
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(
+                ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                    ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(
+                        jsub_code_value))));
+        break;
+      case 0x5:
+        // IOError
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::IOError(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0x6:
+        // MergeInProgress
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(
+                ROCKSDB_NAMESPACE::Status::MergeInProgress(
+                    ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(
+                        jsub_code_value))));
+        break;
+      case 0x7:
+        // Incomplete
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Incomplete(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0x8:
+        // ShutdownInProgress
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(
+                ROCKSDB_NAMESPACE::Status::ShutdownInProgress(
+                    ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(
+                        jsub_code_value))));
+        break;
+      case 0x9:
+        // TimedOut
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::TimedOut(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0xA:
+        // Aborted
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Aborted(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0xB:
+        // Busy
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Busy(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0xC:
+        // Expired
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Expired(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0xD:
+        // TryAgain
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::TryAgain(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0xE:
+        // ColumnFamilyDropped
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(
+                ROCKSDB_NAMESPACE::Status::ColumnFamilyDropped(
+                    ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(
+                        jsub_code_value))));
+        break;
+      case 0x7F:
+      default:
+        return nullptr;
+    }
+    return status;
+  }
+
+  // Returns the equivalent ROCKSDB_NAMESPACE::Status for the Java
+  // org.rocksdb.Status
+  static std::unique_ptr<ROCKSDB_NAMESPACE::Status> toCppStatus(
+      JNIEnv* env, const jobject jstatus) {
+    jmethodID mid_code = getCodeMethod(env);
+    if (mid_code == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+    jobject jcode = env->CallObjectMethod(jstatus, mid_code);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      return nullptr;
+    }
+
+    jmethodID mid_code_value = ROCKSDB_NAMESPACE::CodeJni::getValueMethod(env);
+    if (mid_code_value == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+    jbyte jcode_value = env->CallByteMethod(jcode, mid_code_value);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      if (jcode != nullptr) {
+        env->DeleteLocalRef(jcode);
+      }
+      return nullptr;
+    }
+
+    jmethodID mid_subCode = getSubCodeMethod(env);
+    if (mid_subCode == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+    jobject jsubCode = env->CallObjectMethod(jstatus, mid_subCode);
+    if (env->ExceptionCheck()) {
+      // exception occurred
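+      // Editorial note (not vendored code): the early-return cleanup below
+      // illustrates a general JNI rule — local references such as jcode and
+      // jsubCode are only reclaimed automatically when the native frame
+      // returns to Java, so every exit path must DeleteLocalRef anything it
+      // has accumulated, or a long-running native call can exhaust the
+      // local-reference table.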
+      if (jcode != nullptr) {
+        env->DeleteLocalRef(jcode);
+      }
+      return nullptr;
+    }
+
+    jbyte jsub_code_value = 0x0;  // None
+    if (jsubCode != nullptr) {
+      jmethodID mid_subCode_value =
+          ROCKSDB_NAMESPACE::SubCodeJni::getValueMethod(env);
+      if (mid_subCode_value == nullptr) {
+        // exception occurred
+        return nullptr;
+      }
+      jsub_code_value = env->CallByteMethod(jsubCode, mid_subCode_value);
+      if (env->ExceptionCheck()) {
+        // exception occurred
+        if (jcode != nullptr) {
+          env->DeleteLocalRef(jcode);
+        }
+        return nullptr;
+      }
+    }
+
+    jmethodID mid_state = getStateMethod(env);
+    if (mid_state == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+    jobject jstate = env->CallObjectMethod(jstatus, mid_state);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      if (jsubCode != nullptr) {
+        env->DeleteLocalRef(jsubCode);
+      }
+      if (jcode != nullptr) {
+        env->DeleteLocalRef(jcode);
+      }
+      return nullptr;
+    }
+
+    std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+        toCppStatus(jcode_value, jsub_code_value);
+
+    // delete all local refs
+    if (jstate != nullptr) {
+      env->DeleteLocalRef(jstate);
+    }
+    if (jsubCode != nullptr) {
+      env->DeleteLocalRef(jsubCode);
+    }
+    if (jcode != nullptr) {
+      env->DeleteLocalRef(jcode);
+    }
+
+    return status;
+  }
+};
+
+// The portal class for org.rocksdb.RocksDBException
+class RocksDBExceptionJni : public JavaException<RocksDBExceptionJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.RocksDBException
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaException::getJClass(env, "org/rocksdb/RocksDBException");
+  }
+
+  /**
+   * Create and throw a Java RocksDBException with the provided message
+   *
+   * @param env A pointer to the Java environment
+   * @param msg The message for the exception
+   *
+   * @return true if an exception was thrown, false otherwise
+   */
+  static bool ThrowNew(JNIEnv* env, const std::string& msg) {
+    return JavaException::ThrowNew(env, msg);
+  }
+
+  /**
+   * Create and throw a Java RocksDBException with the provided status
+   *
+   * If s->ok() == true, then this function will not throw any exception.
+   *
+   * @param env A pointer to the Java environment
+   * @param s The status for the exception
+   *
+   * @return true if an exception was thrown, false otherwise
+   */
+  static bool ThrowNew(JNIEnv* env,
+                       std::unique_ptr<ROCKSDB_NAMESPACE::Status>& s) {
+    return ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, *(s.get()));
+  }
+
+  /**
+   * Create and throw a Java RocksDBException with the provided status
+   *
+   * If s.ok() == true, then this function will not throw any exception.
+   *
+   * @param env A pointer to the Java environment
+   * @param s The status for the exception
+   *
+   * @return true if an exception was thrown, false otherwise
+   */
+  static bool ThrowNew(JNIEnv* env, const Status& s) {
+    if (s.ok()) {
+      return false;
+    }
+
+    // get the RocksDBException class
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      std::cerr << "RocksDBExceptionJni::ThrowNew/class - Error: unexpected "
+                   "exception!"
+                << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    // get the constructor of org.rocksdb.RocksDBException
+    jmethodID mid =
+        env->GetMethodID(jclazz, "<init>", "(Lorg/rocksdb/Status;)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      std::cerr
+          << "RocksDBExceptionJni::ThrowNew/cstr - Error: unexpected exception!"
+          << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    // get the Java status object
+    jobject jstatus = StatusJni::construct(env, s);
+    if (jstatus == nullptr) {
+      // exception occurred
+      std::cerr << "RocksDBExceptionJni::ThrowNew/StatusJni - Error: "
+                   "unexpected exception!"
+                << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    // construct the RocksDBException
+    jthrowable rocksdb_exception =
+        reinterpret_cast<jthrowable>(env->NewObject(jclazz, mid, jstatus));
+    if (env->ExceptionCheck()) {
+      if (jstatus != nullptr) {
+        env->DeleteLocalRef(jstatus);
+      }
+      if (rocksdb_exception != nullptr) {
+        env->DeleteLocalRef(rocksdb_exception);
+      }
+      std::cerr << "RocksDBExceptionJni::ThrowNew/NewObject - Error: "
+                   "unexpected exception!"
+                << std::endl;
+      return true;
+    }
+
+    // throw the RocksDBException
+    const jint rs = env->Throw(rocksdb_exception);
+    if (rs != JNI_OK) {
+      // exception could not be thrown
+      std::cerr
+          << "RocksDBExceptionJni::ThrowNew - Fatal: could not throw exception!"
+          << std::endl;
+      if (jstatus != nullptr) {
+        env->DeleteLocalRef(jstatus);
+      }
+      if (rocksdb_exception != nullptr) {
+        env->DeleteLocalRef(rocksdb_exception);
+      }
+      return env->ExceptionCheck();
+    }
+
+    if (jstatus != nullptr) {
+      env->DeleteLocalRef(jstatus);
+    }
+    if (rocksdb_exception != nullptr) {
+      env->DeleteLocalRef(rocksdb_exception);
+    }
+
+    return true;
+  }
+
+  /**
+   * Create and throw a Java RocksDBException with the provided message
+   * and status
+   *
+   * If s.ok() == true, then this function will not throw any exception.
+   *
+   * @param env A pointer to the Java environment
+   * @param msg The message for the exception
+   * @param s The status for the exception
+   *
+   * @return true if an exception was thrown, false otherwise
+   */
+  static bool ThrowNew(JNIEnv* env, const std::string& msg, const Status& s) {
+    assert(!s.ok());
+    if (s.ok()) {
+      return false;
+    }
+
+    // get the RocksDBException class
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      std::cerr << "RocksDBExceptionJni::ThrowNew/class - Error: unexpected "
+                   "exception!"
+                << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    // get the constructor of org.rocksdb.RocksDBException
+    jmethodID mid = env->GetMethodID(
+        jclazz, "<init>", "(Ljava/lang/String;Lorg/rocksdb/Status;)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      std::cerr
+          << "RocksDBExceptionJni::ThrowNew/cstr - Error: unexpected exception!"
+          << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    jstring jmsg = env->NewStringUTF(msg.c_str());
+    if (jmsg == nullptr) {
+      // exception thrown: OutOfMemoryError
+      std::cerr
+          << "RocksDBExceptionJni::ThrowNew/msg - Error: unexpected exception!"
+          << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    // get the Java status object
+    jobject jstatus = StatusJni::construct(env, s);
+    if (jstatus == nullptr) {
+      // exception occurred
+      std::cerr << "RocksDBExceptionJni::ThrowNew/StatusJni - Error: "
+                   "unexpected exception!"
+                << std::endl;
+      if (jmsg != nullptr) {
+        env->DeleteLocalRef(jmsg);
+      }
+      return env->ExceptionCheck();
+    }
+
+    // construct the RocksDBException
+    jthrowable rocksdb_exception = reinterpret_cast<jthrowable>(
+        env->NewObject(jclazz, mid, jmsg, jstatus));
+    if (env->ExceptionCheck()) {
+      if (jstatus != nullptr) {
+        env->DeleteLocalRef(jstatus);
+      }
+      if (jmsg != nullptr) {
+        env->DeleteLocalRef(jmsg);
+      }
+      if (rocksdb_exception != nullptr) {
+        env->DeleteLocalRef(rocksdb_exception);
+      }
+      std::cerr << "RocksDBExceptionJni::ThrowNew/NewObject - Error: "
+                   "unexpected exception!"
+                << std::endl;
+      return true;
+    }
+
+    // throw the RocksDBException
+    const jint rs = env->Throw(rocksdb_exception);
+    if (rs != JNI_OK) {
+      // exception could not be thrown
+      std::cerr
+          << "RocksDBExceptionJni::ThrowNew - Fatal: could not throw exception!"
+          << std::endl;
+      if (jstatus != nullptr) {
+        env->DeleteLocalRef(jstatus);
+      }
+      if (jmsg != nullptr) {
+        env->DeleteLocalRef(jmsg);
+      }
+      if (rocksdb_exception != nullptr) {
+        env->DeleteLocalRef(rocksdb_exception);
+      }
+      return env->ExceptionCheck();
+    }
+
+    if (jstatus != nullptr) {
+      env->DeleteLocalRef(jstatus);
+    }
+    if (jmsg != nullptr) {
+      env->DeleteLocalRef(jmsg);
+    }
+    if (rocksdb_exception != nullptr) {
+      env->DeleteLocalRef(rocksdb_exception);
+    }
+
+    return true;
+  }
+
+  /**
+   * Get the Java Method: RocksDBException#getStatus
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getStatusMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "getStatus", "()Lorg/rocksdb/Status;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  static std::unique_ptr<ROCKSDB_NAMESPACE::Status> toCppStatus(
+      JNIEnv* env, jthrowable jrocksdb_exception) {
+    if (!env->IsInstanceOf(jrocksdb_exception, getJClass(env))) {
+      // not an instance of RocksDBException
+      return nullptr;
+    }
+
+    // get the java status object
+    jmethodID mid = getStatusMethod(env);
+    if (mid == nullptr) {
+      // exception occurred accessing class or method
+      return nullptr;
+    }
+
+    jobject jstatus = env->CallObjectMethod(jrocksdb_exception, mid);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      return nullptr;
+    }
+
+    if (jstatus == nullptr) {
+      return nullptr;  // no status available
+    }
+
+    return ROCKSDB_NAMESPACE::StatusJni::toCppStatus(env, jstatus);
+  }
+};
+
+// The portal class for java.util.List
+class ListJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class java.util.List
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getListClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/util/List");
+  }
+
+  /**
+   * Get the Java Class java.util.ArrayList
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getArrayListClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/util/ArrayList");
+  }
+
+  /**
+   * Get the Java Class java.util.Iterator
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getIteratorClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/util/Iterator");
+  }
+
+  /**
+   * Get the Java Method: List#iterator
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getIteratorMethod(JNIEnv* env) {
+    jclass jlist_clazz = getListClass(env);
+    if (jlist_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jlist_clazz, "iterator", "()Ljava/util/Iterator;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Iterator#hasNext
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getHasNextMethod(JNIEnv* env) {
+    jclass jiterator_clazz = getIteratorClass(env);
+    if (jiterator_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jiterator_clazz, "hasNext", "()Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Iterator#next
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getNextMethod(JNIEnv* env) {
+    jclass jiterator_clazz = getIteratorClass(env);
+    if (jiterator_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jiterator_clazz, "next", "()Ljava/lang/Object;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: ArrayList constructor
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getArrayListConstructorMethodId(JNIEnv* env) {
+    jclass jarray_list_clazz = getArrayListClass(env);
+    if (jarray_list_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+    static jmethodID mid =
+        env->GetMethodID(jarray_list_clazz, "<init>", "(I)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: List#add
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getListAddMethodId(JNIEnv* env) {
+    jclass jlist_clazz = getListClass(env);
+    if (jlist_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jlist_clazz, "add", "(Ljava/lang/Object;)Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for java.lang.Byte
+class ByteJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class java.lang.Byte
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/lang/Byte");
+  }
+
+  /**
+   * Get the Java Class byte[]
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getArrayJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "[B");
+  }
+
+  /**
+   * Creates a new 2-dimensional Java Byte Array byte[][]
+   *
+   * @param env A pointer to the Java environment
+   * @param len The size of the first dimension
+   *
+   * @return A reference to the Java byte[][] or nullptr if an exception occurs
+   */
+  static jobjectArray new2dByteArray(JNIEnv* env, const jsize len) {
+    jclass clazz = getArrayJClass(env);
+    if (clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    return env->NewObjectArray(len, clazz, nullptr);
+  }
+
+  /**
+   * Get the Java Method: Byte#byteValue
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getByteValueMethod(JNIEnv* env) {
+    jclass clazz = getJClass(env);
+    if (clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(clazz, "byteValue", "()B");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Calls the Java Method: Byte#valueOf, returning a constructed Byte jobject
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A constructed Byte object or nullptr if the class or method id
+   *     could not be retrieved, or an exception occurred
+   */
+  static jobject valueOf(JNIEnv* env, jbyte jprimitive_byte) {
+    jclass clazz = getJClass(env);
+    if (clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetStaticMethodID(clazz, "valueOf", "(B)Ljava/lang/Byte;");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    const jobject jbyte_obj =
+        env->CallStaticObjectMethod(clazz, mid, jprimitive_byte);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jbyte_obj;
+  }
+};
+
+// The portal class for java.nio.ByteBuffer
+class ByteBufferJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class java.nio.ByteBuffer
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/nio/ByteBuffer");
+  }
+
+  /**
+   * Get the Java Method: ByteBuffer#allocate
+   *
+   * @param env A pointer to the Java environment
+   * @param jbytebuffer_clazz if you have a reference to a ByteBuffer class, or
+   *     nullptr
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getAllocateMethodId(JNIEnv* env,
+                                       jclass jbytebuffer_clazz = nullptr) {
+    const jclass jclazz =
+        jbytebuffer_clazz == nullptr ? getJClass(env) : jbytebuffer_clazz;
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetStaticMethodID(jclazz, "allocate", "(I)Ljava/nio/ByteBuffer;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: ByteBuffer#array
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getArrayMethodId(JNIEnv* env,
+                                    jclass jbytebuffer_clazz = nullptr) {
+    const jclass jclazz =
+        jbytebuffer_clazz == nullptr ? getJClass(env) : jbytebuffer_clazz;
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "array", "()[B");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  static jobject construct(JNIEnv* env, const bool direct,
+                           const size_t capacity,
+                           jclass jbytebuffer_clazz = nullptr) {
+    return constructWith(env, direct, nullptr, capacity, jbytebuffer_clazz);
+  }
+
+  static jobject constructWith(JNIEnv* env, const bool direct, const char* buf,
+                               const size_t capacity,
+                               jclass jbytebuffer_clazz = nullptr) {
+    if (direct) {
+      bool allocated = false;
+      if (buf == nullptr) {
+        buf = new char[capacity];
+        allocated = true;
+      }
+      jobject jbuf = env->NewDirectByteBuffer(const_cast<char*>(buf),
+                                              static_cast<jlong>(capacity));
+      if (jbuf == nullptr) {
+        // exception occurred
+        if (allocated) {
+          delete[] static_cast<const char*>(buf);
+        }
+        return nullptr;
+      }
+      return jbuf;
+    } else {
+      const jclass jclazz =
+          jbytebuffer_clazz == nullptr ? getJClass(env) : jbytebuffer_clazz;
+      if (jclazz == nullptr) {
+        // exception occurred accessing class
+        return nullptr;
+      }
+      const jmethodID jmid_allocate =
+          getAllocateMethodId(env, jbytebuffer_clazz);
+      if (jmid_allocate == nullptr) {
+        // exception occurred accessing class, or NoSuchMethodException or
+        // OutOfMemoryError
+        return nullptr;
+      }
+      const jobject jbuf = env->CallStaticObjectMethod(
+          jclazz, jmid_allocate, static_cast<jint>(capacity));
+      if (env->ExceptionCheck()) {
+        // exception occurred
+        return nullptr;
+      }
+
+      // set buffer data?
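+      // Editorial note (not vendored code): for a non-direct ByteBuffer the
+      // source bytes must be copied into the buffer's backing byte[]. Between
+      // GetPrimitiveArrayCritical and ReleasePrimitiveArrayCritical the code
+      // may not make other JNI calls or block, which is why the critical
+      // region below is kept to a single memcpy.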
+ if (buf != nullptr) { + jbyteArray jarray = array(env, jbuf, jbytebuffer_clazz); + if (jarray == nullptr) { + // exception occurred + env->DeleteLocalRef(jbuf); + return nullptr; + } + + jboolean is_copy = JNI_FALSE; + jbyte* ja = reinterpret_cast( + env->GetPrimitiveArrayCritical(jarray, &is_copy)); + if (ja == nullptr) { + // exception occurred + env->DeleteLocalRef(jarray); + env->DeleteLocalRef(jbuf); + return nullptr; + } + + memcpy(ja, const_cast(buf), capacity); + + env->ReleasePrimitiveArrayCritical(jarray, ja, 0); + + env->DeleteLocalRef(jarray); + } + + return jbuf; + } + } + + static jbyteArray array(JNIEnv* env, const jobject& jbyte_buffer, + jclass jbytebuffer_clazz = nullptr) { + const jmethodID mid = getArrayMethodId(env, jbytebuffer_clazz); + if (mid == nullptr) { + // exception occurred accessing class, or NoSuchMethodException or + // OutOfMemoryError + return nullptr; + } + const jobject jarray = env->CallObjectMethod(jbyte_buffer, mid); + if (env->ExceptionCheck()) { + // exception occurred + return nullptr; + } + return static_cast(jarray); + } +}; + +// The portal class for java.lang.Integer +class IntegerJni : public JavaClass { + public: + /** + * Get the Java Class java.lang.Integer + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "java/lang/Integer"); + } + + static jobject valueOf(JNIEnv* env, jint jprimitive_int) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = + env->GetStaticMethodID(jclazz, "valueOf", "(I)Ljava/lang/Integer;"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + const jobject jinteger_obj = + env->CallStaticObjectMethod(jclazz, mid, jprimitive_int); + if (env->ExceptionCheck()) { + // exception occurred + return nullptr; + } + + return jinteger_obj; + } +}; + +// The portal class for java.lang.Long +class LongJni : public JavaClass { + public: + /** + * Get the Java Class java.lang.Long + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "java/lang/Long"); + } + + static jobject valueOf(JNIEnv* env, jlong jprimitive_long) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = + env->GetStaticMethodID(jclazz, "valueOf", "(J)Ljava/lang/Long;"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + const jobject jlong_obj = + env->CallStaticObjectMethod(jclazz, mid, jprimitive_long); + if (env->ExceptionCheck()) { + // exception occurred + return nullptr; + } + + return jlong_obj; + } +}; + +// The portal class for java.lang.StringBuilder +class StringBuilderJni : public JavaClass { + public: + /** + * Get the Java Class java.lang.StringBuilder + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, 
ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "java/lang/StringBuilder"); + } + + /** + * Get the Java Method: StringBuilder#append + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getListAddMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID( + jclazz, "append", "(Ljava/lang/String;)Ljava/lang/StringBuilder;"); + assert(mid != nullptr); + return mid; + } + + /** + * Appends a C-style string to a StringBuilder + * + * @param env A pointer to the Java environment + * @param jstring_builder Reference to a java.lang.StringBuilder + * @param c_str A C-style string to append to the StringBuilder + * + * @return A reference to the updated StringBuilder, or a nullptr if + * an exception occurs + */ + static jobject append(JNIEnv* env, jobject jstring_builder, + const char* c_str) { + jmethodID mid = getListAddMethodId(env); + if (mid == nullptr) { + // exception occurred accessing class or method + return nullptr; + } + + jstring new_value_str = env->NewStringUTF(c_str); + if (new_value_str == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + jobject jresult_string_builder = + env->CallObjectMethod(jstring_builder, mid, new_value_str); + if (env->ExceptionCheck()) { + // exception occurred + env->DeleteLocalRef(new_value_str); + return nullptr; + } + + return jresult_string_builder; + } +}; + +// various utility functions for working with RocksDB and JNI +class JniUtil { + public: + /** + * Detect if jlong overflows size_t + * + * @param jvalue the jlong value + * + * @return + */ + inline static Status check_if_jlong_fits_size_t(const jlong& jvalue) { + Status s = Status::OK(); + if (static_cast(jvalue) > std::numeric_limits::max()) { + s = Status::InvalidArgument(Slice("jlong overflows 32 bit value.")); + } + return s; + } + + /** + * Obtains a reference to the JNIEnv from + * the JVM + * + * If the current thread is not attached to the JavaVM + * then it will be attached so as to retrieve the JNIEnv + * + * If a thread is attached, it must later be manually + * released by calling JavaVM::DetachCurrentThread. 
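+ *
+ * A minimal calling pattern (illustrative only; `jvm` is assumed to be a
+ * valid JavaVM*):
+ *
+ *   jboolean attached = JNI_FALSE;
+ *   JNIEnv* env = JniUtil::getJniEnv(jvm, &attached);
+ *   if (env != nullptr) {
+ *     // ... use env ...
+ *     JniUtil::releaseJniEnv(jvm, attached);
+ *   }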
+ * This can be handled by always matching calls to this
+ * function with calls to {@link JniUtil::releaseJniEnv(JavaVM*, jboolean)}
+ *
+ * @param jvm (IN) A pointer to the JavaVM instance
+ * @param attached (OUT) A pointer to a boolean which
+ *     will be set to JNI_TRUE if we had to attach the thread
+ *
+ * @return A pointer to the JNIEnv or nullptr if a fatal error
+ *     occurs and the JNIEnv cannot be retrieved
+ */
+  static JNIEnv* getJniEnv(JavaVM* jvm, jboolean* attached) {
+    assert(jvm != nullptr);
+
+    JNIEnv* env;
+    const jint env_rs =
+        jvm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6);
+
+    if (env_rs == JNI_OK) {
+      // current thread is already attached, return the JNIEnv
+      *attached = JNI_FALSE;
+      return env;
+    } else if (env_rs == JNI_EDETACHED) {
+      // current thread is not attached, attempt to attach
+      const jint rs_attach =
+          jvm->AttachCurrentThread(reinterpret_cast<void**>(&env), NULL);
+      if (rs_attach == JNI_OK) {
+        *attached = JNI_TRUE;
+        return env;
+      } else {
+        // error, could not attach the thread
+        std::cerr << "JniUtil::getJniEnv - Fatal: could not attach current "
+                     "thread to JVM!"
+                  << std::endl;
+        return nullptr;
+      }
+    } else if (env_rs == JNI_EVERSION) {
+      // error, JDK does not support JNI_VERSION_1_6+
+      std::cerr
+          << "JniUtil::getJniEnv - Fatal: JDK does not support JNI_VERSION_1_6"
+          << std::endl;
+      return nullptr;
+    } else {
+      std::cerr << "JniUtil::getJniEnv - Fatal: Unknown error: env_rs="
+                << env_rs << std::endl;
+      return nullptr;
+    }
+  }
+
+  /**
+   * Counterpart to {@link JniUtil::getJniEnv(JavaVM*, jboolean*)}
+   *
+   * Detaches the current thread from the JVM if it was previously
+   * attached
+   *
+   * @param jvm (IN) A pointer to the JavaVM instance
+   * @param attached (IN) JNI_TRUE if we previously had to attach the thread
+   *     to the JavaVM to get the JNIEnv
+   */
+  static void releaseJniEnv(JavaVM* jvm, jboolean& attached) {
+    assert(jvm != nullptr);
+    if (attached == JNI_TRUE) {
+      const jint rs_detach = jvm->DetachCurrentThread();
+      assert(rs_detach == JNI_OK);
+      if (rs_detach != JNI_OK) {
+        std::cerr << "JniUtil::releaseJniEnv - Warn: Unable to detach current "
+                     "thread from JVM!"
+ << std::endl; + } + } + } + + /** + * Copies a Java String[] to a C++ std::vector + * + * @param env (IN) A pointer to the java environment + * @param jss (IN) The Java String array to copy + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError or ArrayIndexOutOfBoundsException + * exception occurs + * + * @return A std::vector containing copies of the Java strings + */ + static std::vector copyStrings(JNIEnv* env, jobjectArray jss, + jboolean* has_exception) { + return ROCKSDB_NAMESPACE::JniUtil::copyStrings( + env, jss, env->GetArrayLength(jss), has_exception); + } + + /** + * Copies a Java String[] to a C++ std::vector + * + * @param env (IN) A pointer to the java environment + * @param jss (IN) The Java String array to copy + * @param jss_len (IN) The length of the Java String array to copy + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError or ArrayIndexOutOfBoundsException + * exception occurs + * + * @return A std::vector containing copies of the Java strings + */ + static std::vector copyStrings(JNIEnv* env, jobjectArray jss, + const jsize jss_len, + jboolean* has_exception) { + std::vector strs; + strs.reserve(jss_len); + for (jsize i = 0; i < jss_len; i++) { + jobject js = env->GetObjectArrayElement(jss, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + *has_exception = JNI_TRUE; + return strs; + } + + jstring jstr = static_cast(js); + const char* str = env->GetStringUTFChars(jstr, nullptr); + if (str == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(js); + *has_exception = JNI_TRUE; + return strs; + } + + strs.push_back(std::string(str)); + + env->ReleaseStringUTFChars(jstr, str); + env->DeleteLocalRef(js); + } + + *has_exception = JNI_FALSE; + return strs; + } + + /** + * Copies a jstring to a C-style null-terminated byte string + * and releases the original jstring + * + * The jstring is copied as UTF-8 + * + * If an exception occurs, then JNIEnv::ExceptionCheck() + * will have been called + * + * @param env (IN) A pointer to the java environment + * @param js (IN) The java string to copy + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError exception occurs + * + * @return A pointer to the copied string, or a + * nullptr if has_exception == JNI_TRUE + */ + static std::unique_ptr copyString(JNIEnv* env, jstring js, + jboolean* has_exception) { + const char* utf = env->GetStringUTFChars(js, nullptr); + if (utf == nullptr) { + // exception thrown: OutOfMemoryError + env->ExceptionCheck(); + *has_exception = JNI_TRUE; + return nullptr; + } else if (env->ExceptionCheck()) { + // exception thrown + env->ReleaseStringUTFChars(js, utf); + *has_exception = JNI_TRUE; + return nullptr; + } + + const jsize utf_len = env->GetStringUTFLength(js); + std::unique_ptr str( + new char[utf_len + + 1]); // Note: + 1 is needed for the c_str null terminator + std::strcpy(str.get(), utf); + env->ReleaseStringUTFChars(js, utf); + *has_exception = JNI_FALSE; + return str; + } + + /** + * Copies a jstring to a std::string + * and releases the original jstring + * + * If an exception occurs, then JNIEnv::ExceptionCheck() + * will have been called + * + * @param env (IN) A pointer to the java environment + * @param js (IN) The java string to copy + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError exception occurs + * + * @return A std:string copy of the jstring, or an + * empty std::string if has_exception == JNI_TRUE + */ 
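+  // Illustrative use of copyStdString (hypothetical caller; `js` is any
+  // jstring received from Java):
+  //
+  //   jboolean has_exception = JNI_FALSE;
+  //   std::string s = JniUtil::copyStdString(env, js, &has_exception);
+  //   if (has_exception == JNI_TRUE) {
+  //     return;  // a Java exception is pending; make no further JNI calls
+  //   }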
+ static std::string copyStdString(JNIEnv* env, jstring js, + jboolean* has_exception) { + const char* utf = env->GetStringUTFChars(js, nullptr); + if (utf == nullptr) { + // exception thrown: OutOfMemoryError + env->ExceptionCheck(); + *has_exception = JNI_TRUE; + return std::string(); + } else if (env->ExceptionCheck()) { + // exception thrown + env->ReleaseStringUTFChars(js, utf); + *has_exception = JNI_TRUE; + return std::string(); + } + + std::string name(utf); + env->ReleaseStringUTFChars(js, utf); + *has_exception = JNI_FALSE; + return name; + } + + /** + * Copies bytes from a std::string to a jByteArray + * + * @param env A pointer to the java environment + * @param bytes The bytes to copy + * + * @return the Java byte[], or nullptr if an exception occurs + * + * @throws RocksDBException thrown + * if memory size to copy exceeds general java specific array size + * limitation. + */ + static jbyteArray copyBytes(JNIEnv* env, std::string bytes) { + return createJavaByteArrayWithSizeCheck(env, bytes.c_str(), bytes.size()); + } + + /** + * Given a Java byte[][] which is an array of java.lang.Strings + * where each String is a byte[], the passed function `string_fn` + * will be called on each String, the result is the collected by + * calling the passed function `collector_fn` + * + * @param env (IN) A pointer to the java environment + * @param jbyte_strings (IN) A Java array of Strings expressed as bytes + * @param string_fn (IN) A transform function to call for each String + * @param collector_fn (IN) A collector which is called for the result + * of each `string_fn` + * @param has_exception (OUT) will be set to JNI_TRUE + * if an ArrayIndexOutOfBoundsException or OutOfMemoryError + * exception occurs + */ + template + static void byteStrings(JNIEnv* env, jobjectArray jbyte_strings, + std::function string_fn, + std::function collector_fn, + jboolean* has_exception) { + const jsize jlen = env->GetArrayLength(jbyte_strings); + + for (jsize i = 0; i < jlen; i++) { + jobject jbyte_string_obj = env->GetObjectArrayElement(jbyte_strings, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + *has_exception = JNI_TRUE; // signal error + return; + } + + jbyteArray jbyte_string_ary = + reinterpret_cast(jbyte_string_obj); + T result = byteString(env, jbyte_string_ary, string_fn, has_exception); + + env->DeleteLocalRef(jbyte_string_obj); + + if (*has_exception == JNI_TRUE) { + // exception thrown: OutOfMemoryError + return; + } + + collector_fn(i, result); + } + + *has_exception = JNI_FALSE; + } + + /** + * Given a Java String which is expressed as a Java Byte Array byte[], + * the passed function `string_fn` will be called on the String + * and the result returned + * + * @param env (IN) A pointer to the java environment + * @param jbyte_string_ary (IN) A Java String expressed in bytes + * @param string_fn (IN) A transform function to call on the String + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError exception occurs + */ + template + static T byteString(JNIEnv* env, jbyteArray jbyte_string_ary, + std::function string_fn, + jboolean* has_exception) { + const jsize jbyte_string_len = env->GetArrayLength(jbyte_string_ary); + return byteString(env, jbyte_string_ary, jbyte_string_len, string_fn, + has_exception); + } + + /** + * Given a Java String which is expressed as a Java Byte Array byte[], + * the passed function `string_fn` will be called on the String + * and the result returned + * + * @param env (IN) A pointer to the 
java environment + * @param jbyte_string_ary (IN) A Java String expressed in bytes + * @param jbyte_string_len (IN) The length of the Java String + * expressed in bytes + * @param string_fn (IN) A transform function to call on the String + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError exception occurs + */ + template + static T byteString(JNIEnv* env, jbyteArray jbyte_string_ary, + const jsize jbyte_string_len, + std::function string_fn, + jboolean* has_exception) { + jbyte* jbyte_string = env->GetByteArrayElements(jbyte_string_ary, nullptr); + if (jbyte_string == nullptr) { + // exception thrown: OutOfMemoryError + *has_exception = JNI_TRUE; + return nullptr; // signal error + } + + T result = + string_fn(reinterpret_cast(jbyte_string), jbyte_string_len); + + env->ReleaseByteArrayElements(jbyte_string_ary, jbyte_string, JNI_ABORT); + + *has_exception = JNI_FALSE; + return result; + } + + /** + * Converts a std::vector to a Java byte[][] where each Java String + * is expressed as a Java Byte Array byte[]. + * + * @param env A pointer to the java environment + * @param strings A vector of Strings + * + * @return A Java array of Strings expressed as bytes, + * or nullptr if an exception is thrown + */ + static jobjectArray stringsBytes(JNIEnv* env, + std::vector strings) { + jclass jcls_ba = ByteJni::getArrayJClass(env); + if (jcls_ba == nullptr) { + // exception occurred + return nullptr; + } + + const jsize len = static_cast(strings.size()); + + jobjectArray jbyte_strings = env->NewObjectArray(len, jcls_ba, nullptr); + if (jbyte_strings == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + for (jsize i = 0; i < len; i++) { + std::string* str = &strings[i]; + const jsize str_len = static_cast(str->size()); + + jbyteArray jbyte_string_ary = env->NewByteArray(str_len); + if (jbyte_string_ary == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jbyte_strings); + return nullptr; + } + + env->SetByteArrayRegion( + jbyte_string_ary, 0, str_len, + const_cast(reinterpret_cast(str->c_str()))); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jbyte_string_ary); + env->DeleteLocalRef(jbyte_strings); + return nullptr; + } + + env->SetObjectArrayElement(jbyte_strings, i, jbyte_string_ary); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + // or ArrayStoreException + env->DeleteLocalRef(jbyte_string_ary); + env->DeleteLocalRef(jbyte_strings); + return nullptr; + } + + env->DeleteLocalRef(jbyte_string_ary); + } + + return jbyte_strings; + } + + /** + * Converts a std::vector to a Java String[]. 
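+ *
+ * For example (illustrative only), a std::vector<std::string> holding
+ * {"a", "b"} becomes a Java String[] {"a", "b"}; on failure nullptr is
+ * returned with a Java exception pending.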
+ * + * @param env A pointer to the java environment + * @param strings A vector of Strings + * + * @return A Java array of Strings, + * or nullptr if an exception is thrown + */ + static jobjectArray toJavaStrings(JNIEnv* env, + const std::vector* strings) { + jclass jcls_str = env->FindClass("java/lang/String"); + if (jcls_str == nullptr) { + // exception occurred + return nullptr; + } + + const jsize len = static_cast(strings->size()); + + jobjectArray jstrings = env->NewObjectArray(len, jcls_str, nullptr); + if (jstrings == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + for (jsize i = 0; i < len; i++) { + const std::string* str = &((*strings)[i]); + jstring js = ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, str); + if (js == nullptr) { + env->DeleteLocalRef(jstrings); + return nullptr; + } + + env->SetObjectArrayElement(jstrings, i, js); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + // or ArrayStoreException + env->DeleteLocalRef(js); + env->DeleteLocalRef(jstrings); + return nullptr; + } + } + + return jstrings; + } + + /** + * Creates a Java UTF String from a C++ std::string + * + * @param env A pointer to the java environment + * @param string the C++ std::string + * @param treat_empty_as_null true if empty strings should be treated as null + * + * @return the Java UTF string, or nullptr if the provided string + * is null (or empty and treat_empty_as_null is set), or if an + * exception occurs allocating the Java String. + */ + static jstring toJavaString(JNIEnv* env, const std::string* string, + const bool treat_empty_as_null = false) { + if (string == nullptr) { + return nullptr; + } + + if (treat_empty_as_null && string->empty()) { + return nullptr; + } + + return env->NewStringUTF(string->c_str()); + } + + /** + * Copies bytes to a new jByteArray with the check of java array size + * limitation. + * + * @param bytes pointer to memory to copy to a new jByteArray + * @param size number of bytes to copy + * + * @return the Java byte[], or nullptr if an exception occurs + * + * @throws RocksDBException thrown + * if memory size to copy exceeds general java array size limitation to + * avoid overflow. 
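+ *
+ * For example (illustrative only), asking to copy a 3 GiB value exceeds
+ * the 2^31-byte guard below, so a RocksDBException is raised instead of
+ * attempting an oversized jbyteArray allocation.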
+ */ + static jbyteArray createJavaByteArrayWithSizeCheck(JNIEnv* env, + const char* bytes, + const size_t size) { + // Limitation for java array size is vm specific + // In general it cannot exceed Integer.MAX_VALUE (2^31 - 1) + // Current HotSpot VM limitation for array size is Integer.MAX_VALUE - 5 + // (2^31 - 1 - 5) It means that the next call to env->NewByteArray can still + // end with OutOfMemoryError("Requested array size exceeds VM limit") coming + // from VM + static const size_t MAX_JARRAY_SIZE = (static_cast(1)) << 31; + if (size > MAX_JARRAY_SIZE) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, "Requested array size exceeds VM limit"); + return nullptr; + } + + const jsize jlen = static_cast(size); + jbyteArray jbytes = env->NewByteArray(jlen); + if (jbytes == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + env->SetByteArrayRegion( + jbytes, 0, jlen, + const_cast(reinterpret_cast(bytes))); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jbytes); + return nullptr; + } + + return jbytes; + } + + /** + * Copies bytes from a ROCKSDB_NAMESPACE::Slice to a jByteArray + * + * @param env A pointer to the java environment + * @param bytes The bytes to copy + * + * @return the Java byte[] or nullptr if an exception occurs + * + * @throws RocksDBException thrown + * if memory size to copy exceeds general java specific array size + * limitation. + */ + static jbyteArray copyBytes(JNIEnv* env, const Slice& bytes) { + return createJavaByteArrayWithSizeCheck(env, bytes.data(), bytes.size()); + } + + /* + * Helper for operations on a key and value + * for example WriteBatch->Put + * + * TODO(AR) could be used for RocksDB->Put etc. + */ + static std::unique_ptr kv_op( + std::function + op, + JNIEnv* env, jobject /*jobj*/, jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if (env->ExceptionCheck()) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + jbyte* value = env->GetByteArrayElements(jvalue, nullptr); + if (env->ExceptionCheck()) { + // exception thrown: OutOfMemoryError + if (key != nullptr) { + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + } + return nullptr; + } + + ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), jkey_len); + ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast(value), + jvalue_len); + + auto status = op(key_slice, value_slice); + + if (value != nullptr) { + env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); + } + if (key != nullptr) { + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + } + + return std::unique_ptr( + new ROCKSDB_NAMESPACE::Status(status)); + } + + /* + * Helper for operations on a key + * for example WriteBatch->Delete + * + * TODO(AR) could be used for RocksDB->Delete etc. 
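+   *
+   * A sketch of the intended call shape (illustrative only; `wb` stands in
+   * for a ROCKSDB_NAMESPACE::WriteBatch* owned by the caller):
+   *
+   *   auto status = JniUtil::k_op(
+   *       [&wb](ROCKSDB_NAMESPACE::Slice key) { return wb->Delete(key); },
+   *       env, jobj, jkey, jkey_len);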
+   */
+  static std::unique_ptr<ROCKSDB_NAMESPACE::Status> k_op(
+      std::function<ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Slice)> op,
+      JNIEnv* env, jobject /*jobj*/, jbyteArray jkey, jint jkey_len) {
+    jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+    if (env->ExceptionCheck()) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+    auto status = op(key_slice);
+
+    if (key != nullptr) {
+      env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    }
+
+    return std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+        new ROCKSDB_NAMESPACE::Status(status));
+  }
+
+  /*
+   * Helper for operations on a key which is a region of an array
+   * Used to extract the common code from seek/seekForPrev.
+   * Possible that it can be generalised from that.
+   *
+   * We use GetByteArrayRegion to copy the key region of the whole array into
+   * a char[]. We suspect this is not much slower than GetByteArrayElements,
+   * which probably copies anyway.
+   */
+  static void k_op_region(std::function<void(ROCKSDB_NAMESPACE::Slice&)> op,
+                          JNIEnv* env, jbyteArray jkey, jint jkey_off,
+                          jint jkey_len) {
+    const std::unique_ptr<char[]> key(new char[jkey_len]);
+    if (key == nullptr) {
+      jclass oom_class = env->FindClass("java/lang/OutOfMemoryError");
+      env->ThrowNew(oom_class,
+                    "Memory allocation failed in RocksDB JNI function");
+      return;
+    }
+    env->GetByteArrayRegion(jkey, jkey_off, jkey_len,
+                            reinterpret_cast<jbyte*>(key.get()));
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      return;
+    }
+
+    ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key.get()),
+                                       jkey_len);
+    op(key_slice);
+  }
+
+  /*
+   * Helper for operations on a value
+   * for example WriteBatchWithIndex->GetFromBatch
+   */
+  static jbyteArray v_op(std::function<ROCKSDB_NAMESPACE::Status(
+                             ROCKSDB_NAMESPACE::Slice, std::string*)>
+                             op,
+                         JNIEnv* env, jbyteArray jkey, jint jkey_len) {
+    jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+    if (env->ExceptionCheck()) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+    std::string value;
+    ROCKSDB_NAMESPACE::Status s = op(key_slice, &value);
+
+    if (key != nullptr) {
+      env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    }
+
+    if (s.IsNotFound()) {
+      return nullptr;
+    }
+
+    if (s.ok()) {
+      jbyteArray jret_value =
+          env->NewByteArray(static_cast<jsize>(value.size()));
+      if (jret_value == nullptr) {
+        // exception thrown: OutOfMemoryError
+        return nullptr;
+      }
+
+      env->SetByteArrayRegion(
+          jret_value, 0, static_cast<jsize>(value.size()),
+          const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value.c_str())));
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException
+        if (jret_value != nullptr) {
+          env->DeleteLocalRef(jret_value);
+        }
+        return nullptr;
+      }
+
+      return jret_value;
+    }
+
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+
+  /**
+   * Creates a vector of C++ pointers from
+   * a Java array of C++ pointer addresses.
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param jptrs (IN) A Java array of C++ pointer addresses
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an ArrayIndexOutOfBoundsException or OutOfMemoryError
+   *     exception occurs.
+   *
+   * @return A vector of C++ pointers.
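+   *
+   * For example (illustrative only; `jhandles` is a hypothetical jlongArray
+   * of ColumnFamilyHandle addresses received from Java):
+   *
+   *   jboolean has_exception = JNI_FALSE;
+   *   std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> handles =
+   *       JniUtil::fromJPointers<ROCKSDB_NAMESPACE::ColumnFamilyHandle>(
+   *           env, jhandles, &has_exception);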
+ */ + template + static std::vector fromJPointers(JNIEnv* env, jlongArray jptrs, + jboolean* has_exception) { + const jsize jptrs_len = env->GetArrayLength(jptrs); + std::vector ptrs; + jlong* jptr = env->GetLongArrayElements(jptrs, nullptr); + if (jptr == nullptr) { + // exception thrown: OutOfMemoryError + *has_exception = JNI_TRUE; + return ptrs; + } + ptrs.reserve(jptrs_len); + for (jsize i = 0; i < jptrs_len; i++) { + ptrs.push_back(reinterpret_cast(jptr[i])); + } + env->ReleaseLongArrayElements(jptrs, jptr, JNI_ABORT); + return ptrs; + } + + /** + * Creates a Java array of C++ pointer addresses + * from a vector of C++ pointers. + * + * @param env (IN) A pointer to the java environment + * @param pointers (IN) A vector of C++ pointers + * @param has_exception (OUT) will be set to JNI_TRUE + * if an ArrayIndexOutOfBoundsException or OutOfMemoryError + * exception occurs + * + * @return Java array of C++ pointer addresses. + */ + template + static jlongArray toJPointers(JNIEnv* env, const std::vector& pointers, + jboolean* has_exception) { + const jsize len = static_cast(pointers.size()); + std::unique_ptr results(new jlong[len]); + std::transform( + pointers.begin(), pointers.end(), results.get(), + [](T* pointer) -> jlong { return GET_CPLUSPLUS_POINTER(pointer); }); + + jlongArray jpointers = env->NewLongArray(len); + if (jpointers == nullptr) { + // exception thrown: OutOfMemoryError + *has_exception = JNI_TRUE; + return nullptr; + } + + env->SetLongArrayRegion(jpointers, 0, len, results.get()); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + *has_exception = JNI_TRUE; + env->DeleteLocalRef(jpointers); + return nullptr; + } + + *has_exception = JNI_FALSE; + + return jpointers; + } + + /* + * Helper for operations on a key and value + * for example WriteBatch->Put + * + * TODO(AR) could be extended to cover returning ROCKSDB_NAMESPACE::Status + * from `op` and used for RocksDB->Put etc. + */ + static void kv_op_direct( + std::function + op, + JNIEnv* env, jobject jkey, jint jkey_off, jint jkey_len, jobject jval, + jint jval_off, jint jval_len) { + char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); + if (key == nullptr || + env->GetDirectBufferCapacity(jkey) < (jkey_off + jkey_len)) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, + "Invalid key argument"); + return; + } + + char* value = reinterpret_cast(env->GetDirectBufferAddress(jval)); + if (value == nullptr || + env->GetDirectBufferCapacity(jval) < (jval_off + jval_len)) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, "Invalid value argument"); + return; + } + + key += jkey_off; + value += jval_off; + + ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len); + ROCKSDB_NAMESPACE::Slice value_slice(value, jval_len); + + op(key_slice, value_slice); + } + + /* + * Helper for operations on a key and value + * for example WriteBatch->Delete + * + * TODO(AR) could be extended to cover returning ROCKSDB_NAMESPACE::Status + * from `op` and used for RocksDB->Delete etc. 
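+   *
+   * A sketch of the intended call shape (illustrative only; `batch` stands
+   * in for a ROCKSDB_NAMESPACE::WriteBatch* owned by the caller; the key is
+   * read in place from the direct ByteBuffer, so no copy is made):
+   *
+   *   JniUtil::k_op_direct(
+   *       [&batch](ROCKSDB_NAMESPACE::Slice& key) { batch->Delete(key); },
+   *       env, jkey, jkey_off, jkey_len);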
+ */ + static void k_op_direct(std::function op, + JNIEnv* env, jobject jkey, jint jkey_off, + jint jkey_len) { + char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); + if (key == nullptr || + env->GetDirectBufferCapacity(jkey) < (jkey_off + jkey_len)) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, + "Invalid key argument"); + return; + } + + key += jkey_off; + + ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len); + + return op(key_slice); + } + + template + static jint copyToDirect(JNIEnv* env, T& source, jobject jtarget, + jint jtarget_off, jint jtarget_len) { + char* target = + reinterpret_cast(env->GetDirectBufferAddress(jtarget)); + if (target == nullptr || + env->GetDirectBufferCapacity(jtarget) < (jtarget_off + jtarget_len)) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, "Invalid target argument"); + return 0; + } + + target += jtarget_off; + + const jint cvalue_len = static_cast(source.size()); + const jint length = std::min(jtarget_len, cvalue_len); + + memcpy(target, source.data(), length); + + return cvalue_len; + } +}; + +class MapJni : public JavaClass { + public: + /** + * Get the Java Class java.util.Map + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "java/util/Map"); + } + + /** + * Get the Java Method: Map#put + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getMapPutMethodId(JNIEnv* env) { + jclass jlist_clazz = getJClass(env); + if (jlist_clazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID( + jlist_clazz, "put", + "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;"); + assert(mid != nullptr); + return mid; + } +}; + +class HashMapJni : public JavaClass { + public: + /** + * Get the Java Class java.util.HashMap + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "java/util/HashMap"); + } + + /** + * Create a new Java java.util.HashMap object. 
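+   *
+   * (The default initial_capacity of 16 matches java.util.HashMap's own
+   * default; the fromCppMap overloads below size the map from the source
+   * std::map up front to avoid rehashing.)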
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to a Java java.util.HashMap object, or
+   *     nullptr if an exception occurs
+   */
+  static jobject construct(JNIEnv* env, const uint32_t initial_capacity = 16) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(jclazz, "<init>", "(I)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jobject jhash_map =
+        env->NewObject(jclazz, mid, static_cast<jint>(initial_capacity));
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jhash_map;
+  }
+
+  /**
+   * A function which maps a std::pair<K, V> to a std::pair<JK, JV>
+   *
+   * @return Either a pointer to a std::pair<jobject, jobject>, or nullptr
+   *     if an error occurs during the mapping
+   */
+  template <typename K, typename V, typename JK, typename JV>
+  using FnMapKV =
+      std::function<std::unique_ptr<std::pair<JK, JV>>(const std::pair<K, V>&)>;
+
+  // template <class I, typename std::enable_if<std::is_same<typename
+  // std::iterator_traits<I>::value_type, std::pair<const Slice,
+  // std::string>>::value, int32_t>::type = 0> static void putAll(JNIEnv*
+  // env, const jobject jhash_map, I iterator, const FnMapKV<const Slice,
+  // std::string> &fn_map_kv) {
+  /**
+   * Returns true if it succeeds, false if an error occurs
+   */
+  template <class iterator_type, typename K, typename V>
+  static bool putAll(JNIEnv* env, const jobject jhash_map,
+                     iterator_type iterator, iterator_type end,
+                     const FnMapKV<K, V, jobject, jobject>& fn_map_kv) {
+    const jmethodID jmid_put =
+        ROCKSDB_NAMESPACE::MapJni::getMapPutMethodId(env);
+    if (jmid_put == nullptr) {
+      return false;
+    }
+
+    for (auto it = iterator; it != end; ++it) {
+      const std::unique_ptr<std::pair<jobject, jobject>> result =
+          fn_map_kv(*it);
+      if (result == nullptr) {
+        // an error occurred during fn_map_kv
+        return false;
+      }
+      env->CallObjectMethod(jhash_map, jmid_put, result->first, result->second);
+      if (env->ExceptionCheck()) {
+        // exception occurred
+        env->DeleteLocalRef(result->second);
+        env->DeleteLocalRef(result->first);
+        return false;
+      }
+
+      // release local references
+      env->DeleteLocalRef(result->second);
+      env->DeleteLocalRef(result->first);
+    }
+
+    return true;
+  }
+
+  /**
+   * Creates a java.util.Map<String, String> from a
+   * std::map<std::string, std::string>
+   *
+   * @param env A pointer to the Java environment
+   * @param map the Cpp map
+   *
+   * @return a reference to the Java java.util.Map object, or nullptr if an
+   *     exception occurred
+   */
+  static jobject fromCppMap(JNIEnv* env,
+                            const std::map<std::string, std::string>* map) {
+    if (map == nullptr) {
+      return nullptr;
+    }
+
+    jobject jhash_map = construct(env, static_cast<uint32_t>(map->size()));
+    if (jhash_map == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+
+    const ROCKSDB_NAMESPACE::HashMapJni::FnMapKV<
+        const std::string, const std::string, jobject, jobject>
+        fn_map_kv =
+            [env](const std::pair<const std::string, const std::string>& kv) {
+              jstring jkey = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+                  env, &(kv.first), false);
+              if (env->ExceptionCheck()) {
+                // an error occurred
+                return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+              }
+
+              jstring jvalue = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+                  env, &(kv.second), true);
+              if (env->ExceptionCheck()) {
+                // an error occurred
+                env->DeleteLocalRef(jkey);
+                return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+              }
+
+              return std::unique_ptr<std::pair<jobject, jobject>>(
+                  new std::pair<jobject, jobject>(
+                      static_cast<jobject>(jkey), static_cast<jobject>(jvalue)));
+            };
+
+    if (!putAll(env, jhash_map, map->begin(), map->end(), fn_map_kv)) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jhash_map;
+  }
+
+  /**
+   * Creates a java.util.Map<String, Integer> from a
+   * std::map<std::string, uint32_t>
+   *
+   * @param env A pointer to the Java environment
+   * @param map the Cpp map
+   *
+   * @return a reference to the Java java.util.Map object, or nullptr if an
+   *     exception occurred
+   */
+  static jobject fromCppMap(JNIEnv* env,
+                            const std::map<std::string, uint32_t>* map) {
+    if (map == nullptr) {
+      return nullptr;
+    }
+
+    jobject jhash_map = construct(env, static_cast<uint32_t>(map->size()));
+    if (jhash_map == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+
+    const ROCKSDB_NAMESPACE::HashMapJni::FnMapKV<
+        const std::string, const uint32_t, jobject, jobject>
+        fn_map_kv =
+            [env](const std::pair<const std::string, const uint32_t>& kv) {
+              jstring jkey = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+                  env, &(kv.first), false);
+              if (env->ExceptionCheck()) {
+                // an error occurred
+                return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+              }
+
+              jobject jvalue = ROCKSDB_NAMESPACE::IntegerJni::valueOf(
+                  env, static_cast<jint>(kv.second));
+              if (env->ExceptionCheck()) {
+                // an error occurred
+                env->DeleteLocalRef(jkey);
+                return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+              }
+
+              return std::unique_ptr<std::pair<jobject, jobject>>(
+                  new std::pair<jobject, jobject>(static_cast<jobject>(jkey),
+                                                  jvalue));
+            };
+
+    if (!putAll(env, jhash_map, map->begin(), map->end(), fn_map_kv)) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jhash_map;
+  }
+
+  /**
+   * Creates a java.util.Map<String, Long> from a
+   * std::map<std::string, uint64_t>
+   *
+   * @param env A pointer to the Java environment
+   * @param map the Cpp map
+   *
+   * @return a reference to the Java java.util.Map object, or nullptr if an
+   *     exception occurred
+   */
+  static jobject fromCppMap(JNIEnv* env,
+                            const std::map<std::string, uint64_t>* map) {
+    if (map == nullptr) {
+      return nullptr;
+    }
+
+    jobject jhash_map = construct(env, static_cast<uint32_t>(map->size()));
+    if (jhash_map == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+
+    const ROCKSDB_NAMESPACE::HashMapJni::FnMapKV<
+        const std::string, const uint64_t, jobject, jobject>
+        fn_map_kv =
+            [env](const std::pair<const std::string, const uint64_t>& kv) {
+              jstring jkey = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+                  env, &(kv.first), false);
+              if (env->ExceptionCheck()) {
+                // an error occurred
+                return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+              }
+
+              jobject jvalue = ROCKSDB_NAMESPACE::LongJni::valueOf(
+                  env, static_cast<jlong>(kv.second));
+              if (env->ExceptionCheck()) {
+                // an error occurred
+                env->DeleteLocalRef(jkey);
+                return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+              }
+
+              return std::unique_ptr<std::pair<jobject, jobject>>(
+                  new std::pair<jobject, jobject>(static_cast<jobject>(jkey),
+                                                  jvalue));
+            };
+
+    if (!putAll(env, jhash_map, map->begin(), map->end(), fn_map_kv)) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jhash_map;
+  }
+
+  /**
+   * Creates a java.util.Map<Integer, Long> from a
+   * std::map<uint32_t, uint64_t>
+   *
+   * @param env A pointer to the Java environment
+   * @param map the Cpp map
+   *
+   * @return a reference to the Java java.util.Map object, or nullptr if an
+   *     exception occurred
+   */
+  static jobject fromCppMap(JNIEnv* env,
+                            const std::map<uint32_t, uint64_t>* map) {
+    if (map == nullptr) {
+      return nullptr;
+    }
+
+    jobject jhash_map = construct(env, static_cast<uint32_t>(map->size()));
+    if (jhash_map == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+
+    const ROCKSDB_NAMESPACE::HashMapJni::FnMapKV<const uint32_t,
+                                                 const uint64_t, jobject,
+                                                 jobject>
+        fn_map_kv = [env](const std::pair<const uint32_t, const uint64_t>& kv) {
+          jobject jkey = ROCKSDB_NAMESPACE::IntegerJni::valueOf(
+              env, static_cast<jint>(kv.first));
+          if (env->ExceptionCheck()) {
+            // an error occurred
+            return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+          }
+
+          jobject jvalue = ROCKSDB_NAMESPACE::LongJni::valueOf(
+              env, static_cast<jlong>(kv.second));
+          if (env->ExceptionCheck()) {
+            // an error occurred
+            env->DeleteLocalRef(jkey);
+            return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+          }
+
+          return std::unique_ptr<std::pair<jobject, jobject>>(
+              new std::pair<jobject, jobject>(jkey, jvalue));
+        };
+
+    if (!putAll(env, jhash_map, map->begin(), map->end(), fn_map_kv)) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jhash_map;
+  }
+};
+
+// The portal class
for org.rocksdb.RocksDB +class RocksDBJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.RocksDB + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/RocksDB"); + } +}; + +// The portal class for org.rocksdb.Options +class OptionsJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.Options + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/Options"); + } +}; + +// The portal class for org.rocksdb.DBOptions +class DBOptionsJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.DBOptions + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/DBOptions"); + } +}; + +// The portal class for org.rocksdb.ColumnFamilyOptions +class ColumnFamilyOptionsJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.ColumnFamilyOptions + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/ColumnFamilyOptions"); + } + + /** + * Create a new Java org.rocksdb.ColumnFamilyOptions object with the same + * properties as the provided C++ ROCKSDB_NAMESPACE::ColumnFamilyOptions + * object + * + * @param env A pointer to the Java environment + * @param cfoptions A pointer to ROCKSDB_NAMESPACE::ColumnFamilyOptions object + * + * @return A reference to a Java org.rocksdb.ColumnFamilyOptions object, or + * nullptr if an an exception occurs + */ + static jobject construct(JNIEnv* env, const ColumnFamilyOptions* cfoptions) { + auto* cfo = new ROCKSDB_NAMESPACE::ColumnFamilyOptions(*cfoptions); + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID(jclazz, "", "(J)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jobject jcfd = env->NewObject(jclazz, mid, GET_CPLUSPLUS_POINTER(cfo)); + if (env->ExceptionCheck()) { + return nullptr; + } + + return jcfd; + } +}; + +// The portal class for org.rocksdb.WriteOptions +class WriteOptionsJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.WriteOptions + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or 
ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/WriteOptions"); + } +}; + +// The portal class for org.rocksdb.ReadOptions +class ReadOptionsJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.ReadOptions + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/ReadOptions"); + } +}; + +// The portal class for org.rocksdb.WriteBatch +class WriteBatchJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.WriteBatch + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/WriteBatch"); + } + + /** + * Create a new Java org.rocksdb.WriteBatch object + * + * @param env A pointer to the Java environment + * @param wb A pointer to ROCKSDB_NAMESPACE::WriteBatch object + * + * @return A reference to a Java org.rocksdb.WriteBatch object, or + * nullptr if an an exception occurs + */ + static jobject construct(JNIEnv* env, const WriteBatch* wb) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID(jclazz, "", "(J)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jobject jwb = env->NewObject(jclazz, mid, GET_CPLUSPLUS_POINTER(wb)); + if (env->ExceptionCheck()) { + return nullptr; + } + + return jwb; + } +}; + +// The portal class for org.rocksdb.WriteBatch.Handler +class WriteBatchHandlerJni + : public RocksDBNativeClass< + const ROCKSDB_NAMESPACE::WriteBatchHandlerJniCallback*, + WriteBatchHandlerJni> { + public: + /** + * Get the Java Class org.rocksdb.WriteBatch.Handler + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/WriteBatch$Handler"); + } + + /** + * Get the Java Method: WriteBatch.Handler#put + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getPutCfMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "put", "(I[B[B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#put + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getPutMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred 
accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "put", "([B[B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#merge + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getMergeCfMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "merge", "(I[B[B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#merge + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getMergeMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "merge", "([B[B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#delete + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getDeleteCfMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "delete", "(I[B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#delete + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getDeleteMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "delete", "([B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#singleDelete + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getSingleDeleteCfMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "singleDelete", "(I[B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#singleDelete + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getSingleDeleteMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "singleDelete", "([B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#deleteRange + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getDeleteRangeCfMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + 
if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "deleteRange", "(I[B[B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#deleteRange + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getDeleteRangeMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "deleteRange", "([B[B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#logData + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getLogDataMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "logData", "([B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#putBlobIndex + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getPutBlobIndexCfMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "putBlobIndex", "(I[B[B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#markBeginPrepare + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getMarkBeginPrepareMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "markBeginPrepare", "()V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#markEndPrepare + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getMarkEndPrepareMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "markEndPrepare", "([B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#markNoop + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getMarkNoopMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "markNoop", "(Z)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#markRollback + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + 
+   * be retrieved
+   */
+  static jmethodID getMarkRollbackMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "markRollback", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#markCommit
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMarkCommitMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "markCommit", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#markCommitWithTimestamp
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMarkCommitWithTimestampMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "markCommitWithTimestamp", "([B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#shouldContinue
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getContinueMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "shouldContinue", "()Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+class WriteBatchSavePointJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.WriteBatch.SavePoint
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/WriteBatch$SavePoint");
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.SavePoint constructor
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getConstructorMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "<init>", "(JJJ)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Create a new Java org.rocksdb.WriteBatch.SavePoint object
+   *
+   * @param env A pointer to the Java environment
+   * @param save_point A reference to a
+   *     ROCKSDB_NAMESPACE::WriteBatch::SavePoint object
+   *
+   * @return A reference to a Java org.rocksdb.WriteBatch.SavePoint object, or
+   *     nullptr if an exception occurs
+   */
+  static jobject construct(JNIEnv* env, const SavePoint& save_point) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = getConstructorMethodId(env);
+    if (mid == nullptr) {
+      // exception
thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jobject jsave_point = + env->NewObject(jclazz, mid, static_cast(save_point.size), + static_cast(save_point.count), + static_cast(save_point.content_flags)); + if (env->ExceptionCheck()) { + return nullptr; + } + + return jsave_point; + } +}; + +// The portal class for org.rocksdb.WriteBatchWithIndex +class WriteBatchWithIndexJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.WriteBatchWithIndex + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/WriteBatchWithIndex"); + } +}; + +// The portal class for org.rocksdb.HistogramData +class HistogramDataJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.HistogramData + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/HistogramData"); + } + + /** + * Get the Java Method: HistogramData constructor + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getConstructorMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "", "(DDDDDDJJD)V"); + assert(mid != nullptr); + return mid; + } +}; + +// The portal class for org.rocksdb.BackupEngineOptions +class BackupEngineOptionsJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.BackupEngineOptions + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/BackupEngineOptions"); + } +}; + +// The portal class for org.rocksdb.BackupEngine +class BackupEngineJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.BackupEngine + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/BackupEngine"); + } +}; + +// The portal class for org.rocksdb.RocksIterator +class IteratorJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.RocksIterator + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return 
RocksDBNativeClass::getJClass(env, "org/rocksdb/RocksIterator"); + } +}; + +// The portal class for org.rocksdb.Filter +class FilterJni + : public RocksDBNativeClass< + std::shared_ptr*, FilterJni> { + public: + /** + * Get the Java Class org.rocksdb.Filter + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/Filter"); + } +}; + +// The portal class for org.rocksdb.ColumnFamilyHandle +class ColumnFamilyHandleJni + : public RocksDBNativeClass { + public: + static jobject fromCppColumnFamilyHandle( + JNIEnv* env, const ROCKSDB_NAMESPACE::ColumnFamilyHandle* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + return env->NewObject(jclazz, ctor, GET_CPLUSPLUS_POINTER(info)); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(J)V"); + } + + /** + * Get the Java Class org.rocksdb.ColumnFamilyHandle + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/ColumnFamilyHandle"); + } +}; + +// The portal class for org.rocksdb.FlushOptions +class FlushOptionsJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.FlushOptions + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/FlushOptions"); + } +}; + +// The portal class for org.rocksdb.ComparatorOptions +class ComparatorOptionsJni + : public RocksDBNativeClass< + ROCKSDB_NAMESPACE::ComparatorJniCallbackOptions*, + ComparatorOptionsJni> { + public: + /** + * Get the Java Class org.rocksdb.ComparatorOptions + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/ComparatorOptions"); + } +}; + +// The portal class for org.rocksdb.AbstractCompactionFilterFactory +class AbstractCompactionFilterFactoryJni + : public RocksDBNativeClass< + const ROCKSDB_NAMESPACE::CompactionFilterFactoryJniCallback*, + AbstractCompactionFilterFactoryJni> { + public: + /** + * Get the Java Class org.rocksdb.AbstractCompactionFilterFactory + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass( + env, 
"org/rocksdb/AbstractCompactionFilterFactory"); + } + + /** + * Get the Java Method: AbstractCompactionFilterFactory#name + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getNameMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "name", "()Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractCompactionFilterFactory#createCompactionFilter + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getCreateCompactionFilterMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "createCompactionFilter", "(ZZ)J"); + assert(mid != nullptr); + return mid; + } +}; + +// The portal class for org.rocksdb.AbstractTransactionNotifier +class AbstractTransactionNotifierJni + : public RocksDBNativeClass< + const ROCKSDB_NAMESPACE::TransactionNotifierJniCallback*, + AbstractTransactionNotifierJni> { + public: + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass( + env, "org/rocksdb/AbstractTransactionNotifier"); + } + + // Get the java method `snapshotCreated` + // of org.rocksdb.AbstractTransactionNotifier. + static jmethodID getSnapshotCreatedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "snapshotCreated", "(J)V"); + assert(mid != nullptr); + return mid; + } +}; + +// The portal class for org.rocksdb.AbstractComparatorJniBridge +class AbstractComparatorJniBridge : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.AbstractComparatorJniBridge + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/AbstractComparatorJniBridge"); + } + + /** + * Get the Java Method: Comparator#compareInternal + * + * @param env A pointer to the Java environment + * @param jclazz the AbstractComparatorJniBridge class + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getCompareInternalMethodId(JNIEnv* env, jclass jclazz) { + static jmethodID mid = + env->GetStaticMethodID(jclazz, "compareInternal", + "(Lorg/rocksdb/AbstractComparator;Ljava/nio/" + "ByteBuffer;ILjava/nio/ByteBuffer;I)I"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: Comparator#findShortestSeparatorInternal + * + * @param env A pointer to the Java environment + * @param jclazz the AbstractComparatorJniBridge class + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getFindShortestSeparatorInternalMethodId(JNIEnv* env, + jclass jclazz) { + static jmethodID mid = + env->GetStaticMethodID(jclazz, "findShortestSeparatorInternal", + 
"(Lorg/rocksdb/AbstractComparator;Ljava/nio/" + "ByteBuffer;ILjava/nio/ByteBuffer;I)I"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: Comparator#findShortSuccessorInternal + * + * @param env A pointer to the Java environment + * @param jclazz the AbstractComparatorJniBridge class + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getFindShortSuccessorInternalMethodId(JNIEnv* env, + jclass jclazz) { + static jmethodID mid = env->GetStaticMethodID( + jclazz, "findShortSuccessorInternal", + "(Lorg/rocksdb/AbstractComparator;Ljava/nio/ByteBuffer;I)I"); + assert(mid != nullptr); + return mid; + } +}; + +// The portal class for org.rocksdb.AbstractComparator +class AbstractComparatorJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.AbstractComparator + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/AbstractComparator"); + } + + /** + * Get the Java Method: Comparator#name + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getNameMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "name", "()Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } +}; + +// The portal class for org.rocksdb.AbstractSlice +class AbstractSliceJni + : public NativeRocksMutableObject { + public: + /** + * Get the Java Class org.rocksdb.AbstractSlice + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/AbstractSlice"); + } +}; + +// The portal class for org.rocksdb.Slice +class SliceJni + : public NativeRocksMutableObject { + public: + /** + * Get the Java Class org.rocksdb.Slice + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/Slice"); + } + + /** + * Constructs a Slice object + * + * @param env A pointer to the Java environment + * + * @return A reference to a Java Slice object, or a nullptr if an + * exception occurs + */ + static jobject construct0(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "", "()V"); + if (mid == nullptr) { + // exception occurred accessing method + return nullptr; + } + + jobject jslice = env->NewObject(jclazz, mid); + if (env->ExceptionCheck()) { + return nullptr; + } + + return jslice; + } +}; + +// The portal class for org.rocksdb.DirectSlice 
+// The portal class for org.rocksdb.DirectSlice
+class DirectSliceJni
+    : public NativeRocksMutableObject<const ROCKSDB_NAMESPACE::Slice*,
+                                      DirectSliceJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.DirectSlice
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/DirectSlice");
+  }
+
+  /**
+   * Constructs a DirectSlice object
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to a Java DirectSlice object, or a nullptr if an
+   *     exception occurs
+   */
+  static jobject construct0(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "<init>", "()V");
+    if (mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    jobject jdirect_slice = env->NewObject(jclazz, mid);
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jdirect_slice;
+  }
+};
+
+// The portal class for org.rocksdb.BackupInfo
+class BackupInfoJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.BackupInfo
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/BackupInfo");
+  }
+
+  /**
+   * Constructs a BackupInfo object
+   *
+   * @param env A pointer to the Java environment
+   * @param backup_id id of the backup
+   * @param timestamp timestamp of the backup
+   * @param size size of the backup
+   * @param number_files number of files related to the backup
+   * @param app_metadata application specific metadata
+   *
+   * @return A reference to a Java BackupInfo object, or a nullptr if an
+   *     exception occurs
+   */
+  static jobject construct0(JNIEnv* env, uint32_t backup_id, int64_t timestamp,
+                            uint64_t size, uint32_t number_files,
+                            const std::string& app_metadata) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "<init>", "(IJJILjava/lang/String;)V");
+    if (mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    jstring japp_metadata = nullptr;
+    // a std::string cannot be compared against nullptr; only convert
+    // non-empty metadata to a Java string
+    if (!app_metadata.empty()) {
+      japp_metadata = env->NewStringUTF(app_metadata.c_str());
+      if (japp_metadata == nullptr) {
+        // exception occurred creating java string
+        return nullptr;
+      }
+    }
+
+    jobject jbackup_info = env->NewObject(jclazz, mid, backup_id, timestamp,
+                                          size, number_files, japp_metadata);
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(japp_metadata);
+      return nullptr;
+    }
+
+    return jbackup_info;
+  }
+};
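+
+// A minimal sketch (hypothetical, not part of the original sources) of
+// converting a single C++ BackupInfo with construct0(); BackupInfoListJni
+// below generalizes this over a whole std::vector:
+//
+//   jobject jinfo = BackupInfoJni::construct0(
+//       env, info.backup_id, info.timestamp, info.size, info.number_files,
+//       info.app_metadata);
+//   if (jinfo == nullptr) {
+//     // a Java exception (e.g. OutOfMemoryError) is already pending
+//   }
+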
+class BackupInfoListJni {
+ public:
+  /**
+   * Converts a C++ std::vector<ROCKSDB_NAMESPACE::BackupInfo> object to
+   * a Java ArrayList<org.rocksdb.BackupInfo> object
+   *
+   * @param env A pointer to the Java environment
+   * @param backup_infos A vector of BackupInfo
+   *
+   * @return Either a reference to a Java ArrayList object, or a nullptr
+   *     if an exception occurs
+   */
+  static jobject getBackupInfo(
+      JNIEnv* env, std::vector<ROCKSDB_NAMESPACE::BackupInfo> backup_infos) {
+    jclass jarray_list_clazz =
+        ROCKSDB_NAMESPACE::ListJni::getArrayListClass(env);
+    if (jarray_list_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID cstr_mid =
+        ROCKSDB_NAMESPACE::ListJni::getArrayListConstructorMethodId(env);
+    if (cstr_mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    jmethodID add_mid = ROCKSDB_NAMESPACE::ListJni::getListAddMethodId(env);
+    if (add_mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    // create java list; ArrayList(int) expects a jint capacity
+    jobject jbackup_info_handle_list =
+        env->NewObject(jarray_list_clazz, cstr_mid,
+                       static_cast<jint>(backup_infos.size()));
+    if (env->ExceptionCheck()) {
+      // exception occurred constructing object
+      return nullptr;
+    }
+
+    // insert in java list
+    auto end = backup_infos.end();
+    for (auto it = backup_infos.begin(); it != end; ++it) {
+      auto backup_info = *it;
+
+      jobject obj = ROCKSDB_NAMESPACE::BackupInfoJni::construct0(
+          env, backup_info.backup_id, backup_info.timestamp, backup_info.size,
+          backup_info.number_files, backup_info.app_metadata);
+      if (env->ExceptionCheck()) {
+        // exception occurred constructing object
+        if (obj != nullptr) {
+          env->DeleteLocalRef(obj);
+        }
+        if (jbackup_info_handle_list != nullptr) {
+          env->DeleteLocalRef(jbackup_info_handle_list);
+        }
+        return nullptr;
+      }
+
+      jboolean rs =
+          env->CallBooleanMethod(jbackup_info_handle_list, add_mid, obj);
+      if (env->ExceptionCheck() || rs == JNI_FALSE) {
+        // exception occurred calling method, or could not add
+        if (obj != nullptr) {
+          env->DeleteLocalRef(obj);
+        }
+        if (jbackup_info_handle_list != nullptr) {
+          env->DeleteLocalRef(jbackup_info_handle_list);
+        }
+        return nullptr;
+      }
+    }
+
+    return jbackup_info_handle_list;
+  }
+};
+
+// The portal class for org.rocksdb.WBWIRocksIterator
+class WBWIRocksIteratorJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.WBWIRocksIterator
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/WBWIRocksIterator");
+  }
+
+  /**
+   * Get the Java Field: WBWIRocksIterator#entry
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Field ID or nullptr if the class or field id could not
+   *     be retrieved
+   */
+  static jfieldID getWriteEntryField(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jfieldID fid = env->GetFieldID(
+        jclazz, "entry", "Lorg/rocksdb/WBWIRocksIterator$WriteEntry;");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  /**
+   * Gets the value of the WBWIRocksIterator#entry
+   *
+   * @param env A pointer to the Java environment
+   * @param jwbwi_rocks_iterator A reference to a WBWIIterator
+   *
+   * @return A reference to a Java WBWIRocksIterator.WriteEntry object, or
+   *     a nullptr if an exception occurs
+   */
+  static jobject getWriteEntry(JNIEnv* env, jobject jwbwi_rocks_iterator) {
+    assert(jwbwi_rocks_iterator != nullptr);
+
+    jfieldID jwrite_entry_field = getWriteEntryField(env);
+    if (jwrite_entry_field == nullptr) {
+      // exception occurred accessing the field
+      return nullptr;
+    }
+
+    jobject jwe =
+        env->GetObjectField(jwbwi_rocks_iterator, jwrite_entry_field);
+    assert(jwe != nullptr);
+    return jwe;
+  }
+};
+
+// The portal class for org.rocksdb.WBWIRocksIterator.WriteType
+class WriteTypeJni : public JavaClass {
+ public:
+  /**
+   *
Get the PUT enum field value of WBWIRocksIterator.WriteType + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject PUT(JNIEnv* env) { return getEnum(env, "PUT"); } + + /** + * Get the MERGE enum field value of WBWIRocksIterator.WriteType + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject MERGE(JNIEnv* env) { return getEnum(env, "MERGE"); } + + /** + * Get the DELETE enum field value of WBWIRocksIterator.WriteType + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject DELETE(JNIEnv* env) { return getEnum(env, "DELETE"); } + + /** + * Get the LOG enum field value of WBWIRocksIterator.WriteType + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject LOG(JNIEnv* env) { return getEnum(env, "LOG"); } + + // Returns the equivalent org.rocksdb.WBWIRocksIterator.WriteType for the + // provided C++ ROCKSDB_NAMESPACE::WriteType enum + static jbyte toJavaWriteType(const ROCKSDB_NAMESPACE::WriteType& writeType) { + switch (writeType) { + case ROCKSDB_NAMESPACE::WriteType::kPutRecord: + return 0x0; + case ROCKSDB_NAMESPACE::WriteType::kMergeRecord: + return 0x1; + case ROCKSDB_NAMESPACE::WriteType::kDeleteRecord: + return 0x2; + case ROCKSDB_NAMESPACE::WriteType::kSingleDeleteRecord: + return 0x3; + case ROCKSDB_NAMESPACE::WriteType::kDeleteRangeRecord: + return 0x4; + case ROCKSDB_NAMESPACE::WriteType::kLogDataRecord: + return 0x5; + case ROCKSDB_NAMESPACE::WriteType::kXIDRecord: + return 0x6; + default: + return 0x7F; // undefined + } + } + + private: + /** + * Get the Java Class org.rocksdb.WBWIRocksIterator.WriteType + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/WBWIRocksIterator$WriteType"); + } + + /** + * Get an enum field of org.rocksdb.WBWIRocksIterator.WriteType + * + * @param env A pointer to the Java environment + * @param name The name of the enum field + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject getEnum(JNIEnv* env, const char name[]) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jfieldID jfid = env->GetStaticFieldID( + jclazz, name, "Lorg/rocksdb/WBWIRocksIterator$WriteType;"); + if (env->ExceptionCheck()) { + // exception occurred while getting field + return nullptr; + } else if (jfid == nullptr) { + return nullptr; + } + + jobject jwrite_type = env->GetStaticObjectField(jclazz, jfid); + assert(jwrite_type != nullptr); + return jwrite_type; + } +}; + +// The portal class for org.rocksdb.WBWIRocksIterator.WriteEntry +class WriteEntryJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.WBWIRocksIterator.WriteEntry + * + * @param env A pointer to the Java 
environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, + "org/rocksdb/WBWIRocksIterator$WriteEntry"); + } +}; + +// The portal class for org.rocksdb.InfoLogLevel +class InfoLogLevelJni : public JavaClass { + public: + /** + * Get the DEBUG_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject DEBUG_LEVEL(JNIEnv* env) { + return getEnum(env, "DEBUG_LEVEL"); + } + + /** + * Get the INFO_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject INFO_LEVEL(JNIEnv* env) { return getEnum(env, "INFO_LEVEL"); } + + /** + * Get the WARN_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject WARN_LEVEL(JNIEnv* env) { return getEnum(env, "WARN_LEVEL"); } + + /** + * Get the ERROR_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject ERROR_LEVEL(JNIEnv* env) { + return getEnum(env, "ERROR_LEVEL"); + } + + /** + * Get the FATAL_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject FATAL_LEVEL(JNIEnv* env) { + return getEnum(env, "FATAL_LEVEL"); + } + + /** + * Get the HEADER_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject HEADER_LEVEL(JNIEnv* env) { + return getEnum(env, "HEADER_LEVEL"); + } + + private: + /** + * Get the Java Class org.rocksdb.InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/InfoLogLevel"); + } + + /** + * Get an enum field of org.rocksdb.InfoLogLevel + * + * @param env A pointer to the Java environment + * @param name The name of the enum field + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject getEnum(JNIEnv* env, const char name[]) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jfieldID jfid = + env->GetStaticFieldID(jclazz, name, "Lorg/rocksdb/InfoLogLevel;"); + if (env->ExceptionCheck()) { + // exception occurred while getting field + return nullptr; + } else if (jfid == nullptr) { + return nullptr; + } + + jobject jinfo_log_level = 
env->GetStaticObjectField(jclazz, jfid);
+    assert(jinfo_log_level != nullptr);
+    return jinfo_log_level;
+  }
+};
+
+// The portal class for org.rocksdb.Logger
+class LoggerJni
+    : public RocksDBNativeClass<
+          std::shared_ptr<ROCKSDB_NAMESPACE::LoggerJniCallback>*, LoggerJni> {
+ public:
+  /**
+   * Get the Java Class org/rocksdb/Logger
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/Logger");
+  }
+
+  /**
+   * Get the Java Method: Logger#log
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getLogMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "log", "(Lorg/rocksdb/InfoLogLevel;Ljava/lang/String;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.TransactionLogIterator.BatchResult
+class BatchResultJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.TransactionLogIterator.BatchResult
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(
+        env, "org/rocksdb/TransactionLogIterator$BatchResult");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.TransactionLogIterator.BatchResult object
+   * with the same properties as the provided C++
+   * ROCKSDB_NAMESPACE::BatchResult object
+   *
+   * @param env A pointer to the Java environment
+   * @param batch_result The ROCKSDB_NAMESPACE::BatchResult object
+   *
+   * @return A reference to a Java
+   *     org.rocksdb.TransactionLogIterator.BatchResult object,
+   *     or nullptr if an exception occurs
+   */
+  static jobject construct(JNIEnv* env,
+                           ROCKSDB_NAMESPACE::BatchResult& batch_result) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(jclazz, "<init>", "(JJ)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jobject jbatch_result = env->NewObject(jclazz, mid, batch_result.sequence,
+                                           batch_result.writeBatchPtr.get());
+    if (jbatch_result == nullptr) {
+      // exception thrown: InstantiationException or OutOfMemoryError
+      return nullptr;
+    }
+
+    batch_result.writeBatchPtr.release();
+    return jbatch_result;
+  }
+};
+
+// The portal class for org.rocksdb.BottommostLevelCompaction
+class BottommostLevelCompactionJni {
+ public:
+  // Returns the equivalent org.rocksdb.BottommostLevelCompaction for the
+  // provided C++ ROCKSDB_NAMESPACE::BottommostLevelCompaction enum
+  static jint toJavaBottommostLevelCompaction(
+      const ROCKSDB_NAMESPACE::BottommostLevelCompaction&
+          bottommost_level_compaction) {
+    switch (bottommost_level_compaction) {
+      case ROCKSDB_NAMESPACE::BottommostLevelCompaction::kSkip:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::BottommostLevelCompaction::
+          kIfHaveCompactionFilter:
+        return 0x1;
+      case
ROCKSDB_NAMESPACE::BottommostLevelCompaction::kForce: + return 0x2; + case ROCKSDB_NAMESPACE::BottommostLevelCompaction::kForceOptimized: + return 0x3; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::BottommostLevelCompaction + // enum for the provided Java org.rocksdb.BottommostLevelCompaction + static ROCKSDB_NAMESPACE::BottommostLevelCompaction + toCppBottommostLevelCompaction(jint bottommost_level_compaction) { + switch (bottommost_level_compaction) { + case 0x0: + return ROCKSDB_NAMESPACE::BottommostLevelCompaction::kSkip; + case 0x1: + return ROCKSDB_NAMESPACE::BottommostLevelCompaction:: + kIfHaveCompactionFilter; + case 0x2: + return ROCKSDB_NAMESPACE::BottommostLevelCompaction::kForce; + case 0x3: + return ROCKSDB_NAMESPACE::BottommostLevelCompaction::kForceOptimized; + default: + // undefined/default + return ROCKSDB_NAMESPACE::BottommostLevelCompaction:: + kIfHaveCompactionFilter; + } + } +}; + +// The portal class for org.rocksdb.CompactionStopStyle +class CompactionStopStyleJni { + public: + // Returns the equivalent org.rocksdb.CompactionStopStyle for the provided + // C++ ROCKSDB_NAMESPACE::CompactionStopStyle enum + static jbyte toJavaCompactionStopStyle( + const ROCKSDB_NAMESPACE::CompactionStopStyle& compaction_stop_style) { + switch (compaction_stop_style) { + case ROCKSDB_NAMESPACE::CompactionStopStyle:: + kCompactionStopStyleSimilarSize: + return 0x0; + case ROCKSDB_NAMESPACE::CompactionStopStyle:: + kCompactionStopStyleTotalSize: + return 0x1; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::CompactionStopStyle enum for + // the provided Java org.rocksdb.CompactionStopStyle + static ROCKSDB_NAMESPACE::CompactionStopStyle toCppCompactionStopStyle( + jbyte jcompaction_stop_style) { + switch (jcompaction_stop_style) { + case 0x0: + return ROCKSDB_NAMESPACE::CompactionStopStyle:: + kCompactionStopStyleSimilarSize; + case 0x1: + return ROCKSDB_NAMESPACE::CompactionStopStyle:: + kCompactionStopStyleTotalSize; + default: + // undefined/default + return ROCKSDB_NAMESPACE::CompactionStopStyle:: + kCompactionStopStyleSimilarSize; + } + } +}; + +// The portal class for org.rocksdb.CompressionType +class CompressionTypeJni { + public: + // Returns the equivalent org.rocksdb.CompressionType for the provided + // C++ ROCKSDB_NAMESPACE::CompressionType enum + static jbyte toJavaCompressionType( + const ROCKSDB_NAMESPACE::CompressionType& compression_type) { + switch (compression_type) { + case ROCKSDB_NAMESPACE::CompressionType::kNoCompression: + return 0x0; + case ROCKSDB_NAMESPACE::CompressionType::kSnappyCompression: + return 0x1; + case ROCKSDB_NAMESPACE::CompressionType::kZlibCompression: + return 0x2; + case ROCKSDB_NAMESPACE::CompressionType::kBZip2Compression: + return 0x3; + case ROCKSDB_NAMESPACE::CompressionType::kLZ4Compression: + return 0x4; + case ROCKSDB_NAMESPACE::CompressionType::kLZ4HCCompression: + return 0x5; + case ROCKSDB_NAMESPACE::CompressionType::kXpressCompression: + return 0x6; + case ROCKSDB_NAMESPACE::CompressionType::kZSTD: + return 0x7; + case ROCKSDB_NAMESPACE::CompressionType::kDisableCompressionOption: + default: + return 0x7F; + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::CompressionType enum for the + // provided Java org.rocksdb.CompressionType + static ROCKSDB_NAMESPACE::CompressionType toCppCompressionType( + jbyte jcompression_type) { + switch (jcompression_type) { + case 0x0: + return 
ROCKSDB_NAMESPACE::CompressionType::kNoCompression; + case 0x1: + return ROCKSDB_NAMESPACE::CompressionType::kSnappyCompression; + case 0x2: + return ROCKSDB_NAMESPACE::CompressionType::kZlibCompression; + case 0x3: + return ROCKSDB_NAMESPACE::CompressionType::kBZip2Compression; + case 0x4: + return ROCKSDB_NAMESPACE::CompressionType::kLZ4Compression; + case 0x5: + return ROCKSDB_NAMESPACE::CompressionType::kLZ4HCCompression; + case 0x6: + return ROCKSDB_NAMESPACE::CompressionType::kXpressCompression; + case 0x7: + return ROCKSDB_NAMESPACE::CompressionType::kZSTD; + case 0x7F: + default: + return ROCKSDB_NAMESPACE::CompressionType::kDisableCompressionOption; + } + } +}; + +// The portal class for org.rocksdb.CompactionPriority +class CompactionPriorityJni { + public: + // Returns the equivalent org.rocksdb.CompactionPriority for the provided + // C++ ROCKSDB_NAMESPACE::CompactionPri enum + static jbyte toJavaCompactionPriority( + const ROCKSDB_NAMESPACE::CompactionPri& compaction_priority) { + switch (compaction_priority) { + case ROCKSDB_NAMESPACE::CompactionPri::kByCompensatedSize: + return 0x0; + case ROCKSDB_NAMESPACE::CompactionPri::kOldestLargestSeqFirst: + return 0x1; + case ROCKSDB_NAMESPACE::CompactionPri::kOldestSmallestSeqFirst: + return 0x2; + case ROCKSDB_NAMESPACE::CompactionPri::kMinOverlappingRatio: + return 0x3; + case ROCKSDB_NAMESPACE::CompactionPri::kRoundRobin: + return 0x4; + default: + return 0x0; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::CompactionPri enum for the + // provided Java org.rocksdb.CompactionPriority + static ROCKSDB_NAMESPACE::CompactionPri toCppCompactionPriority( + jbyte jcompaction_priority) { + switch (jcompaction_priority) { + case 0x0: + return ROCKSDB_NAMESPACE::CompactionPri::kByCompensatedSize; + case 0x1: + return ROCKSDB_NAMESPACE::CompactionPri::kOldestLargestSeqFirst; + case 0x2: + return ROCKSDB_NAMESPACE::CompactionPri::kOldestSmallestSeqFirst; + case 0x3: + return ROCKSDB_NAMESPACE::CompactionPri::kMinOverlappingRatio; + case 0x4: + return ROCKSDB_NAMESPACE::CompactionPri::kRoundRobin; + default: + // undefined/default + return ROCKSDB_NAMESPACE::CompactionPri::kByCompensatedSize; + } + } +}; + +// The portal class for org.rocksdb.AccessHint +class AccessHintJni { + public: + // Returns the equivalent org.rocksdb.AccessHint for the provided + // C++ ROCKSDB_NAMESPACE::DBOptions::AccessHint enum + static jbyte toJavaAccessHint( + const ROCKSDB_NAMESPACE::DBOptions::AccessHint& access_hint) { + switch (access_hint) { + case ROCKSDB_NAMESPACE::DBOptions::AccessHint::NONE: + return 0x0; + case ROCKSDB_NAMESPACE::DBOptions::AccessHint::NORMAL: + return 0x1; + case ROCKSDB_NAMESPACE::DBOptions::AccessHint::SEQUENTIAL: + return 0x2; + case ROCKSDB_NAMESPACE::DBOptions::AccessHint::WILLNEED: + return 0x3; + default: + // undefined/default + return 0x1; + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::DBOptions::AccessHint enum + // for the provided Java org.rocksdb.AccessHint + static ROCKSDB_NAMESPACE::DBOptions::AccessHint toCppAccessHint( + jbyte jaccess_hint) { + switch (jaccess_hint) { + case 0x0: + return ROCKSDB_NAMESPACE::DBOptions::AccessHint::NONE; + case 0x1: + return ROCKSDB_NAMESPACE::DBOptions::AccessHint::NORMAL; + case 0x2: + return ROCKSDB_NAMESPACE::DBOptions::AccessHint::SEQUENTIAL; + case 0x3: + return ROCKSDB_NAMESPACE::DBOptions::AccessHint::WILLNEED; + default: + // undefined/default + return ROCKSDB_NAMESPACE::DBOptions::AccessHint::NORMAL; + } + } +}; + +// The portal class 
for org.rocksdb.WALRecoveryMode +class WALRecoveryModeJni { + public: + // Returns the equivalent org.rocksdb.WALRecoveryMode for the provided + // C++ ROCKSDB_NAMESPACE::WALRecoveryMode enum + static jbyte toJavaWALRecoveryMode( + const ROCKSDB_NAMESPACE::WALRecoveryMode& wal_recovery_mode) { + switch (wal_recovery_mode) { + case ROCKSDB_NAMESPACE::WALRecoveryMode::kTolerateCorruptedTailRecords: + return 0x0; + case ROCKSDB_NAMESPACE::WALRecoveryMode::kAbsoluteConsistency: + return 0x1; + case ROCKSDB_NAMESPACE::WALRecoveryMode::kPointInTimeRecovery: + return 0x2; + case ROCKSDB_NAMESPACE::WALRecoveryMode::kSkipAnyCorruptedRecords: + return 0x3; + default: + // undefined/default + return 0x2; + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::WALRecoveryMode enum for the + // provided Java org.rocksdb.WALRecoveryMode + static ROCKSDB_NAMESPACE::WALRecoveryMode toCppWALRecoveryMode( + jbyte jwal_recovery_mode) { + switch (jwal_recovery_mode) { + case 0x0: + return ROCKSDB_NAMESPACE::WALRecoveryMode:: + kTolerateCorruptedTailRecords; + case 0x1: + return ROCKSDB_NAMESPACE::WALRecoveryMode::kAbsoluteConsistency; + case 0x2: + return ROCKSDB_NAMESPACE::WALRecoveryMode::kPointInTimeRecovery; + case 0x3: + return ROCKSDB_NAMESPACE::WALRecoveryMode::kSkipAnyCorruptedRecords; + default: + // undefined/default + return ROCKSDB_NAMESPACE::WALRecoveryMode::kPointInTimeRecovery; + } + } +}; + +// The portal class for org.rocksdb.TickerType +class TickerTypeJni { + public: + // Returns the equivalent org.rocksdb.TickerType for the provided + // C++ ROCKSDB_NAMESPACE::Tickers enum + static jbyte toJavaTickerType(const ROCKSDB_NAMESPACE::Tickers& tickers) { + switch (tickers) { + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_MISS: + return 0x0; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_HIT: + return 0x1; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_ADD: + return 0x2; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_ADD_FAILURES: + return 0x3; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_MISS: + return 0x4; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_HIT: + return 0x5; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_ADD: + return 0x6; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_BYTES_INSERT: + return 0x7; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_MISS: + return 0x9; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_HIT: + return 0xA; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_ADD: + return 0xB; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_BYTES_INSERT: + return 0xC; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_MISS: + return 0xE; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_HIT: + return 0xF; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_ADD: + return 0x10; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_BYTES_INSERT: + return 0x11; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_BYTES_READ: + return 0x12; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_BYTES_WRITE: + return 0x13; + case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_USEFUL: + return 0x14; + case ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_HIT: + return 0x15; + case ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_MISS: + return 0x16; + case ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_HIT: + return 0x17; + case ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_MISS: + return 0x18; + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_HIT: + return 0x19; + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_MISS: + return 0x1A; + case 
ROCKSDB_NAMESPACE::Tickers::GET_HIT_L0: + return 0x1B; + case ROCKSDB_NAMESPACE::Tickers::GET_HIT_L1: + return 0x1C; + case ROCKSDB_NAMESPACE::Tickers::GET_HIT_L2_AND_UP: + return 0x1D; + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_NEWER_ENTRY: + return 0x1E; + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_OBSOLETE: + return 0x1F; + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_RANGE_DEL: + return 0x20; + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_USER: + return 0x21; + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_RANGE_DEL_DROP_OBSOLETE: + return 0x22; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_WRITTEN: + return 0x23; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_READ: + return 0x24; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_UPDATED: + return 0x25; + case ROCKSDB_NAMESPACE::Tickers::BYTES_WRITTEN: + return 0x26; + case ROCKSDB_NAMESPACE::Tickers::BYTES_READ: + return 0x27; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK: + return 0x28; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT: + return 0x29; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV: + return 0x2A; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK_FOUND: + return 0x2B; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT_FOUND: + return 0x2C; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV_FOUND: + return 0x2D; + case ROCKSDB_NAMESPACE::Tickers::ITER_BYTES_READ: + return 0x2E; + case ROCKSDB_NAMESPACE::Tickers::NO_FILE_OPENS: + return 0x30; + case ROCKSDB_NAMESPACE::Tickers::NO_FILE_ERRORS: + return 0x31; + case ROCKSDB_NAMESPACE::Tickers::STALL_MICROS: + return 0x35; + case ROCKSDB_NAMESPACE::Tickers::DB_MUTEX_WAIT_MICROS: + return 0x36; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_CALLS: + return 0x39; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_READ: + return 0x3A; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_BYTES_READ: + return 0x3B; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_MERGE_FAILURES: + return 0x3D; + case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_CHECKED: + return 0x3E; + case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_USEFUL: + return 0x3F; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_OF_RESEEKS_IN_ITERATION: + return 0x40; + case ROCKSDB_NAMESPACE::Tickers::GET_UPDATES_SINCE_CALLS: + return 0x41; + case ROCKSDB_NAMESPACE::Tickers::WAL_FILE_SYNCED: + return 0x46; + case ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES: + return 0x47; + case ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_SELF: + return 0x48; + case ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_OTHER: + return 0x49; + case ROCKSDB_NAMESPACE::Tickers::WRITE_WITH_WAL: + return 0x4B; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES: + return 0x4C; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES: + return 0x4D; + case ROCKSDB_NAMESPACE::Tickers::FLUSH_WRITE_BYTES: + return 0x4E; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DIRECT_LOAD_TABLE_PROPERTIES: + return 0x4F; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_ACQUIRES: + return 0x50; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_RELEASES: + return 0x51; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_CLEANUPS: + return 0x52; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_COMPRESSED: + return 0x53; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_DECOMPRESSED: + return 0x54; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_NOT_COMPRESSED: + return 0x55; + case ROCKSDB_NAMESPACE::Tickers::MERGE_OPERATION_TOTAL_TIME: + return 0x56; + case ROCKSDB_NAMESPACE::Tickers::FILTER_OPERATION_TOTAL_TIME: + return 
0x57; + case ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_HIT: + return 0x58; + case ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_MISS: + return 0x59; + case ROCKSDB_NAMESPACE::Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES: + return 0x5A; + case ROCKSDB_NAMESPACE::Tickers::READ_AMP_TOTAL_READ_BYTES: + return 0x5B; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_RATE_LIMITER_DRAINS: + return 0x5C; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_ITER_SKIP: + return 0x5D; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_FOUND: + return 0x5E; + case ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_CREATED: + // -0x01 so we can skip over the already taken 0x5F (TICKER_ENUM_MAX). + return -0x01; + case ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_DELETED: + return 0x60; + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE: + return 0x61; + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_CANCELLED: + return 0x62; + case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_POSITIVE: + return 0x63; + case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_TRUE_POSITIVE: + return 0x64; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PUT: + return 0x65; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_WRITE: + return 0x66; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_GET: + return 0x67; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_MULTIGET: + return 0x68; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_SEEK: + return 0x69; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_NEXT: + return 0x6A; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PREV: + return 0x6B; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_WRITTEN: + return 0x6C; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_READ: + return 0x6D; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_WRITTEN: + return 0x6E; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ: + return 0x6F; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED: + return 0x70; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL: + return 0x71; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB: + return 0x72; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB_TTL: + return 0x73; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_WRITTEN: + return 0x74; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_READ: + return 0x75; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_SYNCED: + return 0x76; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_COUNT: + return 0x77; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_SIZE: + return 0x78; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_COUNT: + return 0x79; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_SIZE: + return 0x7A; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_FILES: + return 0x7B; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_NEW_FILES: + return 0x7C; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_FAILURES: + return 0x7D; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_RELOCATED: + return -0x02; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_RELOCATED: + return -0x05; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_FILES_EVICTED: + return -0x06; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_KEYS_EVICTED: + return -0x07; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_BYTES_EVICTED: + return -0x08; + case ROCKSDB_NAMESPACE::Tickers::TXN_PREPARE_MUTEX_OVERHEAD: + return -0x09; + case ROCKSDB_NAMESPACE::Tickers::TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD: + return -0x0A; + case ROCKSDB_NAMESPACE::Tickers::TXN_DUPLICATE_KEY_OVERHEAD: + 
return -0x0B;
+      case ROCKSDB_NAMESPACE::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD:
+        return -0x0C;
+      case ROCKSDB_NAMESPACE::Tickers::TXN_GET_TRY_AGAIN:
+        return -0x0D;
+      case ROCKSDB_NAMESPACE::Tickers::FILES_MARKED_TRASH:
+        return -0x0E;
+      case ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_IMMEDIATELY:
+        return -0x0F;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_MARKED:
+        return -0x10;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_PERIODIC:
+        return -0x11;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_TTL:
+        return -0x12;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_MARKED:
+        return -0x13;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC:
+        return -0x14;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL:
+        return -0x15;
+      case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT:
+        return -0x16;
+      case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT:
+        return -0x17;
+      case ROCKSDB_NAMESPACE::Tickers::
+          ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT:
+        return -0x18;
+      case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT:
+        return -0x19;
+      case ROCKSDB_NAMESPACE::Tickers::
+          ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT:
+        return -0x1A;
+      case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT:
+        return -0x1B;
+      case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH:
+        return -0x1C;
+      case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH:
+        return -0x1D;
+      case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_HITS:
+        return -0x1E;
+      case ROCKSDB_NAMESPACE::Tickers::VERIFY_CHECKSUM_READ_BYTES:
+        return -0x1F;
+      case ROCKSDB_NAMESPACE::Tickers::BACKUP_READ_BYTES:
+        return -0x20;
+      case ROCKSDB_NAMESPACE::Tickers::BACKUP_WRITE_BYTES:
+        return -0x21;
+      case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES:
+        return -0x22;
+      case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES:
+        return -0x23;
+      case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES:
+        return -0x24;
+      case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES:
+        return -0x25;
+      case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES:
+        return -0x26;
+      case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT:
+        return -0x27;
+      case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT:
+        return -0x28;
+      case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT:
+        return -0x29;
+      case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_BYTES:
+        return -0x2A;
+      case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_COUNT:
+        return -0x2B;
+      case ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_BYTES:
+        return -0x2C;
+      case ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_COUNT:
+        return -0x2D;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_COMPUTE_COUNT:
+        return -0x2E;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_MISS:
+        return -0x2F;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_HIT:
+        return -0x30;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD:
+        return -0x31;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD_FAILURES:
+        return -0x32;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_READ:
+        return -0x33;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_WRITE:
+        return -0x34;
+      case ROCKSDB_NAMESPACE::Tickers::READ_ASYNC_MICROS:
+        return -0x35;
+      case ROCKSDB_NAMESPACE::Tickers::ASYNC_READ_ERROR_COUNT:
+        return -0x36;
+      case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_FILTER_HITS:
+        return -0x37;
+      case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_INDEX_HITS:
+        return -0x38;
+      case
ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS: + return -0x39; + case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS: + return -0x3A; + case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT: + return -0x3B; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_MISMATCH_COUNT: + return -0x3C; + case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: + // 0x5F was the max value in the initial copy of tickers to Java. + // Since these values are exposed directly to Java clients, we keep + // the value the same forever. + // + // TODO: This particular case seems confusing and unnecessary to pin the + // value since it's meant to be the number of tickers, not an actual + // ticker value. But we aren't yet in a position to fix it since the + // number of tickers doesn't fit in the Java representation (jbyte). + return 0x5F; + default: + // undefined/default + return 0x0; + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::Tickers enum for the + // provided Java org.rocksdb.TickerType + static ROCKSDB_NAMESPACE::Tickers toCppTickers(jbyte jticker_type) { + switch (jticker_type) { + case 0x0: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_MISS; + case 0x1: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_HIT; + case 0x2: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_ADD; + case 0x3: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_ADD_FAILURES; + case 0x4: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_MISS; + case 0x5: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_HIT; + case 0x6: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_ADD; + case 0x7: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_BYTES_INSERT; + case 0x9: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_MISS; + case 0xA: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_HIT; + case 0xB: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_ADD; + case 0xC: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_BYTES_INSERT; + case 0xE: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_MISS; + case 0xF: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_HIT; + case 0x10: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_ADD; + case 0x11: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_BYTES_INSERT; + case 0x12: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_BYTES_READ; + case 0x13: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_BYTES_WRITE; + case 0x14: + return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_USEFUL; + case 0x15: + return ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_HIT; + case 0x16: + return ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_MISS; + case 0x17: + return ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_HIT; + case 0x18: + return ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_MISS; + case 0x19: + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_HIT; + case 0x1A: + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_MISS; + case 0x1B: + return ROCKSDB_NAMESPACE::Tickers::GET_HIT_L0; + case 0x1C: + return ROCKSDB_NAMESPACE::Tickers::GET_HIT_L1; + case 0x1D: + return ROCKSDB_NAMESPACE::Tickers::GET_HIT_L2_AND_UP; + case 0x1E: + return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_NEWER_ENTRY; + case 0x1F: + return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_OBSOLETE; + case 0x20: + return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_RANGE_DEL; + case 0x21: + return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_USER; + case 0x22: + return ROCKSDB_NAMESPACE::Tickers::COMPACTION_RANGE_DEL_DROP_OBSOLETE; + case 0x23: + return 
ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_WRITTEN; + case 0x24: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_READ; + case 0x25: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_UPDATED; + case 0x26: + return ROCKSDB_NAMESPACE::Tickers::BYTES_WRITTEN; + case 0x27: + return ROCKSDB_NAMESPACE::Tickers::BYTES_READ; + case 0x28: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK; + case 0x29: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT; + case 0x2A: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV; + case 0x2B: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK_FOUND; + case 0x2C: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT_FOUND; + case 0x2D: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV_FOUND; + case 0x2E: + return ROCKSDB_NAMESPACE::Tickers::ITER_BYTES_READ; + case 0x30: + return ROCKSDB_NAMESPACE::Tickers::NO_FILE_OPENS; + case 0x31: + return ROCKSDB_NAMESPACE::Tickers::NO_FILE_ERRORS; + case 0x35: + return ROCKSDB_NAMESPACE::Tickers::STALL_MICROS; + case 0x36: + return ROCKSDB_NAMESPACE::Tickers::DB_MUTEX_WAIT_MICROS; + case 0x39: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_CALLS; + case 0x3A: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_READ; + case 0x3B: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_BYTES_READ; + case 0x3D: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_MERGE_FAILURES; + case 0x3E: + return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_CHECKED; + case 0x3F: + return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_USEFUL; + case 0x40: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_OF_RESEEKS_IN_ITERATION; + case 0x41: + return ROCKSDB_NAMESPACE::Tickers::GET_UPDATES_SINCE_CALLS; + case 0x46: + return ROCKSDB_NAMESPACE::Tickers::WAL_FILE_SYNCED; + case 0x47: + return ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES; + case 0x48: + return ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_SELF; + case 0x49: + return ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_OTHER; + case 0x4B: + return ROCKSDB_NAMESPACE::Tickers::WRITE_WITH_WAL; + case 0x4C: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES; + case 0x4D: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES; + case 0x4E: + return ROCKSDB_NAMESPACE::Tickers::FLUSH_WRITE_BYTES; + case 0x4F: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DIRECT_LOAD_TABLE_PROPERTIES; + case 0x50: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_ACQUIRES; + case 0x51: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_RELEASES; + case 0x52: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_CLEANUPS; + case 0x53: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_COMPRESSED; + case 0x54: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_DECOMPRESSED; + case 0x55: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_NOT_COMPRESSED; + case 0x56: + return ROCKSDB_NAMESPACE::Tickers::MERGE_OPERATION_TOTAL_TIME; + case 0x57: + return ROCKSDB_NAMESPACE::Tickers::FILTER_OPERATION_TOTAL_TIME; + case 0x58: + return ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_HIT; + case 0x59: + return ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_MISS; + case 0x5A: + return ROCKSDB_NAMESPACE::Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES; + case 0x5B: + return ROCKSDB_NAMESPACE::Tickers::READ_AMP_TOTAL_READ_BYTES; + case 0x5C: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_RATE_LIMITER_DRAINS; + case 0x5D: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_ITER_SKIP; + case 0x5E: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_FOUND; + case -0x01: + // -0x01 so we can skip over the already taken 0x5F (TICKER_ENUM_MAX). 
+ return ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_CREATED; + case 0x60: + return ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_DELETED; + case 0x61: + return ROCKSDB_NAMESPACE::Tickers:: + COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE; + case 0x62: + return ROCKSDB_NAMESPACE::Tickers::COMPACTION_CANCELLED; + case 0x63: + return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_POSITIVE; + case 0x64: + return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_TRUE_POSITIVE; + case 0x65: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PUT; + case 0x66: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_WRITE; + case 0x67: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_GET; + case 0x68: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_MULTIGET; + case 0x69: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_SEEK; + case 0x6A: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_NEXT; + case 0x6B: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PREV; + case 0x6C: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_WRITTEN; + case 0x6D: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_READ; + case 0x6E: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_WRITTEN; + case 0x6F: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ; + case 0x70: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED; + case 0x71: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL; + case 0x72: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB; + case 0x73: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB_TTL; + case 0x74: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_WRITTEN; + case 0x75: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_READ; + case 0x76: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_SYNCED; + case 0x77: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_COUNT; + case 0x78: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_SIZE; + case 0x79: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_COUNT; + case 0x7A: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_SIZE; + case 0x7B: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_FILES; + case 0x7C: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_NEW_FILES; + case 0x7D: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_FAILURES; + case -0x02: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_RELOCATED; + case -0x05: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_RELOCATED; + case -0x06: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_FILES_EVICTED; + case -0x07: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_KEYS_EVICTED; + case -0x08: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_BYTES_EVICTED; + case -0x09: + return ROCKSDB_NAMESPACE::Tickers::TXN_PREPARE_MUTEX_OVERHEAD; + case -0x0A: + return ROCKSDB_NAMESPACE::Tickers::TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD; + case -0x0B: + return ROCKSDB_NAMESPACE::Tickers::TXN_DUPLICATE_KEY_OVERHEAD; + case -0x0C: + return ROCKSDB_NAMESPACE::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD; + case -0x0D: + return ROCKSDB_NAMESPACE::Tickers::TXN_GET_TRY_AGAIN; + case -0x0E: + return ROCKSDB_NAMESPACE::Tickers::FILES_MARKED_TRASH; + case -0x0F: + return ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_IMMEDIATELY; + case -0x10: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_MARKED; + case -0x11: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_PERIODIC; + case -0x12: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_TTL; + case -0x13: + return 
ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_MARKED; + case -0x14: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC; + case -0x15: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL; + case -0x16: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT; + case -0x17: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT; + case -0x18: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT; + case -0x19: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT; + case -0x1A: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT; + case -0x1B: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT; + case -0x1C: + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH; + case -0x1D: + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + case -0x1E: + return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_HITS; + case -0x1F: + return ROCKSDB_NAMESPACE::Tickers::VERIFY_CHECKSUM_READ_BYTES; + case -0x20: + return ROCKSDB_NAMESPACE::Tickers::BACKUP_READ_BYTES; + case -0x21: + return ROCKSDB_NAMESPACE::Tickers::BACKUP_WRITE_BYTES; + case -0x22: + return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES; + case -0x23: + return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES; + case -0x24: + return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES; + case -0x25: + return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES; + case -0x26: + return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES; + case -0x27: + return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT; + case -0x28: + return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT; + case -0x29: + return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT; + case -0x2A: + return ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_BYTES; + case -0x2B: + return ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_COUNT; + case -0x2C: + return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_BYTES; + case -0x2D: + return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_COUNT; + case -0x2E: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_COMPUTE_COUNT; + case -0x2F: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_MISS; + case -0x30: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_HIT; + case -0x31: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD; + case -0x32: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD_FAILURES; + case -0x33: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_READ; + case -0x34: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_WRITE; + case -0x35: + return ROCKSDB_NAMESPACE::Tickers::READ_ASYNC_MICROS; + case -0x36: + return ROCKSDB_NAMESPACE::Tickers::ASYNC_READ_ERROR_COUNT; + case -0x37: + return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_FILTER_HITS; + case -0x38: + return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_INDEX_HITS; + case -0x39: + return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS; + case -0x3A: + return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS; + case -0x3B: + return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT; + case -0x3C: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_MISMATCH_COUNT; + case 0x5F: + // 0x5F was the max value in the initial copy of tickers to Java. + // Since these values are exposed directly to Java clients, we keep + // the value the same forever. 
+ // + // TODO: This particular case seems confusing and unnecessary to pin the + // value since it's meant to be the number of tickers, not an actual + // ticker value. But we aren't yet in a position to fix it since the + // number of tickers doesn't fit in the Java representation (jbyte). + return ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX; + + default: + // undefined/default + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_MISS; + } + } +}; + +// The portal class for org.rocksdb.HistogramType +class HistogramTypeJni { + public: + // Returns the equivalent org.rocksdb.HistogramType for the provided + // C++ ROCKSDB_NAMESPACE::Histograms enum + static jbyte toJavaHistogramsType( + const ROCKSDB_NAMESPACE::Histograms& histograms) { + switch (histograms) { + case ROCKSDB_NAMESPACE::Histograms::DB_GET: + return 0x0; + case ROCKSDB_NAMESPACE::Histograms::DB_WRITE: + return 0x1; + case ROCKSDB_NAMESPACE::Histograms::COMPACTION_TIME: + return 0x2; + case ROCKSDB_NAMESPACE::Histograms::SUBCOMPACTION_SETUP_TIME: + return 0x3; + case ROCKSDB_NAMESPACE::Histograms::TABLE_SYNC_MICROS: + return 0x4; + case ROCKSDB_NAMESPACE::Histograms::COMPACTION_OUTFILE_SYNC_MICROS: + return 0x5; + case ROCKSDB_NAMESPACE::Histograms::WAL_FILE_SYNC_MICROS: + return 0x6; + case ROCKSDB_NAMESPACE::Histograms::MANIFEST_FILE_SYNC_MICROS: + return 0x7; + case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_IO_MICROS: + return 0x8; + case ROCKSDB_NAMESPACE::Histograms::DB_MULTIGET: + return 0x9; + case ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_COMPACTION_MICROS: + return 0xA; + case ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_GET_MICROS: + return 0xB; + case ROCKSDB_NAMESPACE::Histograms::WRITE_RAW_BLOCK_MICROS: + return 0xC; + case ROCKSDB_NAMESPACE::Histograms::NUM_FILES_IN_SINGLE_COMPACTION: + return 0x12; + case ROCKSDB_NAMESPACE::Histograms::DB_SEEK: + return 0x13; + case ROCKSDB_NAMESPACE::Histograms::WRITE_STALL: + return 0x14; + case ROCKSDB_NAMESPACE::Histograms::SST_READ_MICROS: + return 0x15; + case ROCKSDB_NAMESPACE::Histograms::NUM_SUBCOMPACTIONS_SCHEDULED: + return 0x16; + case ROCKSDB_NAMESPACE::Histograms::BYTES_PER_READ: + return 0x17; + case ROCKSDB_NAMESPACE::Histograms::BYTES_PER_WRITE: + return 0x18; + case ROCKSDB_NAMESPACE::Histograms::BYTES_PER_MULTIGET: + return 0x19; + case ROCKSDB_NAMESPACE::Histograms::BYTES_COMPRESSED: + return 0x1A; + case ROCKSDB_NAMESPACE::Histograms::BYTES_DECOMPRESSED: + return 0x1B; + case ROCKSDB_NAMESPACE::Histograms::COMPRESSION_TIMES_NANOS: + return 0x1C; + case ROCKSDB_NAMESPACE::Histograms::DECOMPRESSION_TIMES_NANOS: + return 0x1D; + case ROCKSDB_NAMESPACE::Histograms::READ_NUM_MERGE_OPERANDS: + return 0x1E; + // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor + // version compatibility. 
+      case ROCKSDB_NAMESPACE::Histograms::FLUSH_TIME:
+        return 0x20;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_KEY_SIZE:
+        return 0x21;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_VALUE_SIZE:
+        return 0x22;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_WRITE_MICROS:
+        return 0x23;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_GET_MICROS:
+        return 0x24;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_MULTIGET_MICROS:
+        return 0x25;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_SEEK_MICROS:
+        return 0x26;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_NEXT_MICROS:
+        return 0x27;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_PREV_MICROS:
+        return 0x28;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS:
+        return 0x29;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_READ_MICROS:
+        return 0x2A;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_SYNC_MICROS:
+        return 0x2B;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_COMPRESSION_MICROS:
+        return 0x2D;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS:
+        return 0x2E;
+      case ROCKSDB_NAMESPACE::Histograms::
+          NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL:
+        return 0x2F;
+      case ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL:
+        return 0x31;
+      case ROCKSDB_NAMESPACE::Histograms::ERROR_HANDLER_AUTORESUME_RETRY_COUNT:
+        return 0x32;
+      case ROCKSDB_NAMESPACE::Histograms::ASYNC_READ_BYTES:
+        return 0x33;
+      case ROCKSDB_NAMESPACE::Histograms::POLL_WAIT_MICROS:
+        return 0x34;
+      case ROCKSDB_NAMESPACE::Histograms::PREFETCHED_BYTES_DISCARDED:
+        return 0x35;
+      case ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE:
+        return 0x36;
+      case ROCKSDB_NAMESPACE::Histograms::NUM_LEVEL_READ_PER_MULTIGET:
+        return 0x37;
+      case ROCKSDB_NAMESPACE::Histograms::ASYNC_PREFETCH_ABORT_MICROS:
+        return 0x38;
+      case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES:
+        return 0x39;
+      case ROCKSDB_NAMESPACE::Histograms::FILE_READ_FLUSH_MICROS:
+        return 0x3A;
+      case ROCKSDB_NAMESPACE::Histograms::FILE_READ_COMPACTION_MICROS:
+        return 0x3B;
+      case ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_OPEN_MICROS:
+        return 0x3C;
+      case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX:
+        // 0x1F for backwards compatibility on current minor version.
+ return 0x1F; + + default: + // undefined/default + return 0x0; + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::Histograms enum for the + // provided Java org.rocksdb.HistogramsType + static ROCKSDB_NAMESPACE::Histograms toCppHistograms(jbyte jhistograms_type) { + switch (jhistograms_type) { + case 0x0: + return ROCKSDB_NAMESPACE::Histograms::DB_GET; + case 0x1: + return ROCKSDB_NAMESPACE::Histograms::DB_WRITE; + case 0x2: + return ROCKSDB_NAMESPACE::Histograms::COMPACTION_TIME; + case 0x3: + return ROCKSDB_NAMESPACE::Histograms::SUBCOMPACTION_SETUP_TIME; + case 0x4: + return ROCKSDB_NAMESPACE::Histograms::TABLE_SYNC_MICROS; + case 0x5: + return ROCKSDB_NAMESPACE::Histograms::COMPACTION_OUTFILE_SYNC_MICROS; + case 0x6: + return ROCKSDB_NAMESPACE::Histograms::WAL_FILE_SYNC_MICROS; + case 0x7: + return ROCKSDB_NAMESPACE::Histograms::MANIFEST_FILE_SYNC_MICROS; + case 0x8: + return ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_IO_MICROS; + case 0x9: + return ROCKSDB_NAMESPACE::Histograms::DB_MULTIGET; + case 0xA: + return ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_COMPACTION_MICROS; + case 0xB: + return ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_GET_MICROS; + case 0xC: + return ROCKSDB_NAMESPACE::Histograms::WRITE_RAW_BLOCK_MICROS; + case 0x12: + return ROCKSDB_NAMESPACE::Histograms::NUM_FILES_IN_SINGLE_COMPACTION; + case 0x13: + return ROCKSDB_NAMESPACE::Histograms::DB_SEEK; + case 0x14: + return ROCKSDB_NAMESPACE::Histograms::WRITE_STALL; + case 0x15: + return ROCKSDB_NAMESPACE::Histograms::SST_READ_MICROS; + case 0x16: + return ROCKSDB_NAMESPACE::Histograms::NUM_SUBCOMPACTIONS_SCHEDULED; + case 0x17: + return ROCKSDB_NAMESPACE::Histograms::BYTES_PER_READ; + case 0x18: + return ROCKSDB_NAMESPACE::Histograms::BYTES_PER_WRITE; + case 0x19: + return ROCKSDB_NAMESPACE::Histograms::BYTES_PER_MULTIGET; + case 0x1A: + return ROCKSDB_NAMESPACE::Histograms::BYTES_COMPRESSED; + case 0x1B: + return ROCKSDB_NAMESPACE::Histograms::BYTES_DECOMPRESSED; + case 0x1C: + return ROCKSDB_NAMESPACE::Histograms::COMPRESSION_TIMES_NANOS; + case 0x1D: + return ROCKSDB_NAMESPACE::Histograms::DECOMPRESSION_TIMES_NANOS; + case 0x1E: + return ROCKSDB_NAMESPACE::Histograms::READ_NUM_MERGE_OPERANDS; + // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor + // version compatibility. 
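+      // (Editorial note, illustrative: the reserved 0x1F slot means a round
+      // trip through these two functions skips it entirely, e.g.
+      //   toCppHistograms(0x20) == FLUSH_TIME, and
+      //   toJavaHistogramsType(FLUSH_TIME) == 0x20, while
+      //   toCppHistograms(0x1F) == HISTOGRAM_ENUM_MAX.)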
+ case 0x20: + return ROCKSDB_NAMESPACE::Histograms::FLUSH_TIME; + case 0x21: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_KEY_SIZE; + case 0x22: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_VALUE_SIZE; + case 0x23: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_WRITE_MICROS; + case 0x24: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_GET_MICROS; + case 0x25: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_MULTIGET_MICROS; + case 0x26: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_SEEK_MICROS; + case 0x27: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_NEXT_MICROS; + case 0x28: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_PREV_MICROS; + case 0x29: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS; + case 0x2A: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_READ_MICROS; + case 0x2B: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_SYNC_MICROS; + case 0x2D: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_COMPRESSION_MICROS; + case 0x2E: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS; + case 0x2F: + return ROCKSDB_NAMESPACE::Histograms:: + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL; + case 0x31: + return ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL; + case 0x32: + return ROCKSDB_NAMESPACE::Histograms:: + ERROR_HANDLER_AUTORESUME_RETRY_COUNT; + case 0x33: + return ROCKSDB_NAMESPACE::Histograms::ASYNC_READ_BYTES; + case 0x34: + return ROCKSDB_NAMESPACE::Histograms::POLL_WAIT_MICROS; + case 0x35: + return ROCKSDB_NAMESPACE::Histograms::PREFETCHED_BYTES_DISCARDED; + case 0x36: + return ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE; + case 0x37: + return ROCKSDB_NAMESPACE::Histograms::NUM_LEVEL_READ_PER_MULTIGET; + case 0x38: + return ROCKSDB_NAMESPACE::Histograms::ASYNC_PREFETCH_ABORT_MICROS; + case 0x39: + return ROCKSDB_NAMESPACE::Histograms:: + TABLE_OPEN_PREFETCH_TAIL_READ_BYTES; + case 0x3A: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_FLUSH_MICROS; + case 0x3B: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_COMPACTION_MICROS; + case 0x3C: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_OPEN_MICROS; + case 0x1F: + // 0x1F for backwards compatibility on current minor version. 
+ return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; + + default: + // undefined/default + return ROCKSDB_NAMESPACE::Histograms::DB_GET; + } + } +}; + +// The portal class for org.rocksdb.StatsLevel +class StatsLevelJni { + public: + // Returns the equivalent org.rocksdb.StatsLevel for the provided + // C++ ROCKSDB_NAMESPACE::StatsLevel enum + static jbyte toJavaStatsLevel( + const ROCKSDB_NAMESPACE::StatsLevel& stats_level) { + switch (stats_level) { + case ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers: + return 0x0; + case ROCKSDB_NAMESPACE::StatsLevel::kExceptTimeForMutex: + return 0x1; + case ROCKSDB_NAMESPACE::StatsLevel::kAll: + return 0x2; + + default: + // undefined/default + return 0x0; + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::StatsLevel enum for the + // provided Java org.rocksdb.StatsLevel + static ROCKSDB_NAMESPACE::StatsLevel toCppStatsLevel(jbyte jstats_level) { + switch (jstats_level) { + case 0x0: + return ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers; + case 0x1: + return ROCKSDB_NAMESPACE::StatsLevel::kExceptTimeForMutex; + case 0x2: + return ROCKSDB_NAMESPACE::StatsLevel::kAll; + + default: + // undefined/default + return ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers; + } + } +}; + +// The portal class for org.rocksdb.RateLimiterMode +class RateLimiterModeJni { + public: + // Returns the equivalent org.rocksdb.RateLimiterMode for the provided + // C++ ROCKSDB_NAMESPACE::RateLimiter::Mode enum + static jbyte toJavaRateLimiterMode( + const ROCKSDB_NAMESPACE::RateLimiter::Mode& rate_limiter_mode) { + switch (rate_limiter_mode) { + case ROCKSDB_NAMESPACE::RateLimiter::Mode::kReadsOnly: + return 0x0; + case ROCKSDB_NAMESPACE::RateLimiter::Mode::kWritesOnly: + return 0x1; + case ROCKSDB_NAMESPACE::RateLimiter::Mode::kAllIo: + return 0x2; + + default: + // undefined/default + return 0x1; + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::RateLimiter::Mode enum for + // the provided Java org.rocksdb.RateLimiterMode + static ROCKSDB_NAMESPACE::RateLimiter::Mode toCppRateLimiterMode( + jbyte jrate_limiter_mode) { + switch (jrate_limiter_mode) { + case 0x0: + return ROCKSDB_NAMESPACE::RateLimiter::Mode::kReadsOnly; + case 0x1: + return ROCKSDB_NAMESPACE::RateLimiter::Mode::kWritesOnly; + case 0x2: + return ROCKSDB_NAMESPACE::RateLimiter::Mode::kAllIo; + + default: + // undefined/default + return ROCKSDB_NAMESPACE::RateLimiter::Mode::kWritesOnly; + } + } +}; + +// The portal class for org.rocksdb.MemoryUsageType +class MemoryUsageTypeJni { + public: + // Returns the equivalent org.rocksdb.MemoryUsageType for the provided + // C++ ROCKSDB_NAMESPACE::MemoryUtil::UsageType enum + static jbyte toJavaMemoryUsageType( + const ROCKSDB_NAMESPACE::MemoryUtil::UsageType& usage_type) { + switch (usage_type) { + case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableTotal: + return 0x0; + case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableUnFlushed: + return 0x1; + case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kTableReadersTotal: + return 0x2; + case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kCacheTotal: + return 0x3; + default: + // undefined: use kNumUsageTypes + return 0x4; + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::MemoryUtil::UsageType enum + // for the provided Java org.rocksdb.MemoryUsageType + static ROCKSDB_NAMESPACE::MemoryUtil::UsageType toCppMemoryUsageType( + jbyte usage_type) { + switch (usage_type) { + case 0x0: + return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableTotal; + case 0x1: + return 
ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableUnFlushed;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kTableReadersTotal;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kCacheTotal;
+      default:
+        // undefined/default: use kNumUsageTypes
+        return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kNumUsageTypes;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.Transaction
+class TransactionJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Transaction
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/Transaction");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.Transaction.WaitingTransactions object
+   *
+   * @param env A pointer to the Java environment
+   * @param jtransaction A Java org.rocksdb.Transaction object
+   * @param column_family_id The id of the column family
+   * @param key The key
+   * @param transaction_ids The transaction ids
+   *
+   * @return A reference to a Java
+   *     org.rocksdb.Transaction.WaitingTransactions object,
+   *     or nullptr if an exception occurs
+   */
+  static jobject newWaitingTransactions(
+      JNIEnv* env, jobject jtransaction, const uint32_t column_family_id,
+      const std::string& key,
+      const std::vector<ROCKSDB_NAMESPACE::TransactionID>& transaction_ids) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(
+        jclazz, "newWaitingTransactions",
+        "(JLjava/lang/String;[J)Lorg/rocksdb/Transaction$WaitingTransactions;");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jstring jkey = env->NewStringUTF(key.c_str());
+    if (jkey == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    const size_t len = transaction_ids.size();
+    jlongArray jtransaction_ids = env->NewLongArray(static_cast<jsize>(len));
+    if (jtransaction_ids == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jkey);
+      return nullptr;
+    }
+
+    jboolean is_copy;
+    jlong* body = env->GetLongArrayElements(jtransaction_ids, &is_copy);
+    if (body == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jkey);
+      env->DeleteLocalRef(jtransaction_ids);
+      return nullptr;
+    }
+    for (size_t i = 0; i < len; ++i) {
+      body[i] = static_cast<jlong>(transaction_ids[i]);
+    }
+    env->ReleaseLongArrayElements(jtransaction_ids, body,
+                                  is_copy == JNI_TRUE ? 0 : JNI_ABORT);
+
+    jobject jwaiting_transactions = env->CallObjectMethod(
+        jtransaction, mid, static_cast<jlong>(column_family_id), jkey,
+        jtransaction_ids);
+    if (env->ExceptionCheck()) {
+      // exception thrown: InstantiationException or OutOfMemoryError
+      env->DeleteLocalRef(jkey);
+      env->DeleteLocalRef(jtransaction_ids);
+      return nullptr;
+    }
+
+    return jwaiting_transactions;
+  }
+};
+
+// The portal class for org.rocksdb.TransactionDB
+class TransactionDBJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.TransactionDB
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TransactionDB");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.TransactionDB.DeadlockInfo object
+   *
+   * @param env A pointer to the Java environment
+   * @param jtransaction_db A Java org.rocksdb.TransactionDB object
+   * @param transaction_id The transaction id
+   * @param column_family_id The id of the column family
+   * @param waiting_key The key being waited upon
+   * @param exclusive Whether the lock is held exclusively
+   *
+   * @return A reference to a Java
+   *     org.rocksdb.TransactionDB.DeadlockInfo object,
+   *     or nullptr if an exception occurs
+   */
+  static jobject newDeadlockInfo(
+      JNIEnv* env, jobject jtransaction_db,
+      const ROCKSDB_NAMESPACE::TransactionID transaction_id,
+      const uint32_t column_family_id, const std::string& waiting_key,
+      const bool exclusive) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(
+        jclazz, "newDeadlockInfo",
+        "(JJLjava/lang/String;Z)Lorg/rocksdb/TransactionDB$DeadlockInfo;");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jstring jwaiting_key = env->NewStringUTF(waiting_key.c_str());
+    if (jwaiting_key == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    // resolve the column family id to a ColumnFamilyHandle
+    jobject jdeadlock_info = env->CallObjectMethod(
+        jtransaction_db, mid, transaction_id,
+        static_cast<jlong>(column_family_id), jwaiting_key, exclusive);
+    if (env->ExceptionCheck()) {
+      // exception thrown: InstantiationException or OutOfMemoryError
+      env->DeleteLocalRef(jwaiting_key);
+      return nullptr;
+    }
+
+    return jdeadlock_info;
+  }
+};
+
+// The portal class for org.rocksdb.TxnDBWritePolicy
+class TxnDBWritePolicyJni {
+ public:
+  // Returns the equivalent org.rocksdb.TxnDBWritePolicy for the provided
+  // C++ ROCKSDB_NAMESPACE::TxnDBWritePolicy enum
+  static jbyte toJavaTxnDBWritePolicy(
+      const ROCKSDB_NAMESPACE::TxnDBWritePolicy& txndb_write_policy) {
+    switch (txndb_write_policy) {
+      case ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_COMMITTED:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_PREPARED:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_UNPREPARED:
+        return 0x2;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::TxnDBWritePolicy enum for
+  // the provided Java org.rocksdb.TxnDBWritePolicy
+  static ROCKSDB_NAMESPACE::TxnDBWritePolicy toCppTxnDBWritePolicy(
+      jbyte jtxndb_write_policy) {
+    switch (jtxndb_write_policy) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_COMMITTED;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_PREPARED;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_UNPREPARED;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_COMMITTED;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.TransactionDB.KeyLockInfo
+class KeyLockInfoJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.TransactionDB.KeyLockInfo
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TransactionDB$KeyLockInfo");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.TransactionDB.KeyLockInfo object
+   * with the same properties as the provided C++
+   * ROCKSDB_NAMESPACE::KeyLockInfo object
+   *
+   * @param env A pointer to the Java environment
+   * @param key_lock_info The ROCKSDB_NAMESPACE::KeyLockInfo object
+   *
+   * @return A reference to a Java
+   *     org.rocksdb.TransactionDB.KeyLockInfo object,
+   *     or nullptr if an exception occurs
+   */
+  static jobject construct(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::KeyLockInfo& key_lock_info) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid =
+        env->GetMethodID(jclazz, "<init>", "(Ljava/lang/String;[JZ)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jstring jkey = env->NewStringUTF(key_lock_info.key.c_str());
+    if (jkey == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    const jsize jtransaction_ids_len =
+        static_cast<jsize>(key_lock_info.ids.size());
+    jlongArray jtransactions_ids = env->NewLongArray(jtransaction_ids_len);
+    if (jtransactions_ids == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jkey);
+      return nullptr;
+    }
+
+    const jobject jkey_lock_info = env->NewObject(
+        jclazz, mid, jkey, jtransactions_ids, key_lock_info.exclusive);
+    if (jkey_lock_info == nullptr) {
+      // exception thrown: InstantiationException or OutOfMemoryError
+      env->DeleteLocalRef(jtransactions_ids);
+      env->DeleteLocalRef(jkey);
+      return nullptr;
+    }
+
+    return jkey_lock_info;
+  }
+};
+
+// The portal class for org.rocksdb.TransactionDB.DeadlockInfo
+class DeadlockInfoJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.TransactionDB.DeadlockInfo
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TransactionDB$DeadlockInfo");
+  }
+};
+
+// The portal class for org.rocksdb.TransactionDB.DeadlockPath
+class DeadlockPathJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.TransactionDB.DeadlockPath
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TransactionDB$DeadlockPath");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.TransactionDB.DeadlockPath object
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to a Java
+   *     org.rocksdb.TransactionDB.DeadlockPath object,
+   *     or nullptr if an exception occurs
+   */
+  static jobject construct(JNIEnv* env, const jobjectArray jdeadlock_infos,
+                           const bool limit_exceeded) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(jclazz, "<init>", "([LDeadlockInfo;Z)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    const jobject jdeadlock_path =
+        env->NewObject(jclazz, mid, jdeadlock_infos, limit_exceeded);
+    if (jdeadlock_path == nullptr) {
+      // exception thrown: InstantiationException or OutOfMemoryError
+      return nullptr;
+    }
+
+    return jdeadlock_path;
+  }
+};
+
+class AbstractTableFilterJni
+    : public RocksDBNativeClass<
+          const ROCKSDB_NAMESPACE::TableFilterJniCallback*,
+          AbstractTableFilterJni> {
+ public:
+  /**
+   * Get the Java Method: TableFilter#filter(TableProperties)
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getFilterMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "filter", "(Lorg/rocksdb/TableProperties;)Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+ private:
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TableFilter");
+  }
+};
+
+class TablePropertiesJni : public JavaClass {
+ public:
+  /**
+   * Create a new Java org.rocksdb.TableProperties object.
+   *
+   * @param env A pointer to the Java environment
+   * @param table_properties A Cpp table properties object
+   *
+   * @return A reference to a Java org.rocksdb.TableProperties object, or
+   *     nullptr if an exception occurs
+   */
+  static jobject fromCppTableProperties(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::TableProperties& table_properties) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(
+        jclazz, "<init>",
+        "(JJJJJJJJJJJJJJJJJJJJJJ[BLjava/lang/String;Ljava/lang/String;Ljava/"
+        "lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/"
+        "String;Ljava/util/Map;Ljava/util/Map;)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jbyteArray jcolumn_family_name = ROCKSDB_NAMESPACE::JniUtil::copyBytes(
+        env, table_properties.column_family_name);
+    if (jcolumn_family_name == nullptr) {
+      // exception occurred creating java byte array
+      return nullptr;
+    }
+
+    jstring jfilter_policy_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &table_properties.filter_policy_name, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      return nullptr;
+    }
+
+    jstring jcomparator_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &table_properties.comparator_name, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      return nullptr;
+    }
+
+    jstring jmerge_operator_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &table_properties.merge_operator_name, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      env->DeleteLocalRef(jcomparator_name);
+      return nullptr;
+    }
+
+    jstring jprefix_extractor_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &table_properties.prefix_extractor_name, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      env->DeleteLocalRef(jcomparator_name);
+      env->DeleteLocalRef(jmerge_operator_name);
+      return nullptr;
+    }
+
+    jstring jproperty_collectors_names =
+        ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+            env, &table_properties.property_collectors_names, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      env->DeleteLocalRef(jcomparator_name);
+      env->DeleteLocalRef(jmerge_operator_name);
+      env->DeleteLocalRef(jprefix_extractor_name);
+      return nullptr;
+    }
+
+    jstring jcompression_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &table_properties.compression_name, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      env->DeleteLocalRef(jcomparator_name);
+      env->DeleteLocalRef(jmerge_operator_name);
+      env->DeleteLocalRef(jprefix_extractor_name);
+      env->DeleteLocalRef(jproperty_collectors_names);
+      return nullptr;
+    }
+
+    // Map<std::string, std::string>
+    jobject juser_collected_properties =
+        ROCKSDB_NAMESPACE::HashMapJni::fromCppMap(
+            env, &table_properties.user_collected_properties);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java map
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      env->DeleteLocalRef(jcomparator_name);
+      env->DeleteLocalRef(jmerge_operator_name);
+      env->DeleteLocalRef(jprefix_extractor_name);
+      env->DeleteLocalRef(jproperty_collectors_names);
+      env->DeleteLocalRef(jcompression_name);
+      return nullptr;
+    }
+
+    // Map<std::string, std::string>
+    jobject jreadable_properties = ROCKSDB_NAMESPACE::HashMapJni::fromCppMap(
+        env, &table_properties.readable_properties);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java map
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      env->DeleteLocalRef(jcomparator_name);
+      env->DeleteLocalRef(jmerge_operator_name);
+      env->DeleteLocalRef(jprefix_extractor_name);
+      env->DeleteLocalRef(jproperty_collectors_names);
+      env->DeleteLocalRef(jcompression_name);
+      env->DeleteLocalRef(juser_collected_properties);
+      return nullptr;
+    }
+
+    jobject jtable_properties = env->NewObject(
+        jclazz, mid, static_cast<jlong>(table_properties.data_size),
+        static_cast<jlong>(table_properties.index_size),
+        static_cast<jlong>(table_properties.index_partitions),
+        static_cast<jlong>(table_properties.top_level_index_size),
+        static_cast<jlong>(table_properties.index_key_is_user_key),
+        static_cast<jlong>(table_properties.index_value_is_delta_encoded),
+        static_cast<jlong>(table_properties.filter_size),
+        static_cast<jlong>(table_properties.raw_key_size),
+        static_cast<jlong>(table_properties.raw_value_size),
+        static_cast<jlong>(table_properties.num_data_blocks),
+        static_cast<jlong>(table_properties.num_entries),
+        static_cast<jlong>(table_properties.num_deletions),
+        static_cast<jlong>(table_properties.num_merge_operands),
+        static_cast<jlong>(table_properties.num_range_deletions),
+        static_cast<jlong>(table_properties.format_version),
+        static_cast<jlong>(table_properties.fixed_key_len),
+        static_cast<jlong>(table_properties.column_family_id),
+        static_cast<jlong>(table_properties.creation_time),
+        static_cast<jlong>(table_properties.oldest_key_time),
+        static_cast<jlong>(
+            table_properties.slow_compression_estimated_data_size),
+        static_cast<jlong>(
+            table_properties.fast_compression_estimated_data_size),
+        static_cast<jlong>(
+            table_properties.external_sst_file_global_seqno_offset),
+        jcolumn_family_name, jfilter_policy_name, jcomparator_name,
+        jmerge_operator_name, jprefix_extractor_name,
+        jproperty_collectors_names, jcompression_name,
+        juser_collected_properties, jreadable_properties);
+
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jtable_properties;
+  }
+
+ private:
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TableProperties");
+  }
+};
+
+class ColumnFamilyDescriptorJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.ColumnFamilyDescriptor
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/ColumnFamilyDescriptor");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.ColumnFamilyDescriptor object with the same
+   * properties as the provided C++ ROCKSDB_NAMESPACE::ColumnFamilyDescriptor
+   * object
+   *
+   * @param env A pointer to the Java environment
+   * @param cfd A pointer to ROCKSDB_NAMESPACE::ColumnFamilyDescriptor object
+   *
+   * @return A reference to a Java org.rocksdb.ColumnFamilyDescriptor object,
+   *     or nullptr if an exception occurs
+   */
+  static jobject construct(JNIEnv* env, ColumnFamilyDescriptor* cfd) {
+    jbyteArray jcf_name = JniUtil::copyBytes(env, cfd->name);
+    jobject cfopts = ColumnFamilyOptionsJni::construct(env, &(cfd->options));
+
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(jclazz, "<init>",
+                                     "([BLorg/rocksdb/ColumnFamilyOptions;)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      env->DeleteLocalRef(jcf_name);
+      return nullptr;
+    }
+
+    jobject jcfd = env->NewObject(jclazz, mid, jcf_name, cfopts);
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jcf_name);
+      return nullptr;
+    }
+
+    return jcfd;
+  }
+
+  /**
+   * Get the Java Method: ColumnFamilyDescriptor#columnFamilyName
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getColumnFamilyNameMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "columnFamilyName", "()[B");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: ColumnFamilyDescriptor#columnFamilyOptions
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getColumnFamilyOptionsMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "columnFamilyOptions", "()Lorg/rocksdb/ColumnFamilyOptions;");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.IndexType
+class IndexTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.IndexType for the provided
+  // C++ ROCKSDB_NAMESPACE::IndexType enum
+  static jbyte toJavaIndexType(
+      const ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType& index_type) {
+    switch (index_type) {
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::kBinarySearch:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::kHashSearch:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+          kTwoLevelIndexSearch:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+          kBinarySearchWithFirstKey:
+        return 0x3;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::IndexType enum for the
+  // provided Java org.rocksdb.IndexType
+  static ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType toCppIndexType(
+      jbyte jindex_type) {
+    switch (jindex_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+            kBinarySearch;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+            kHashSearch;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+            kTwoLevelIndexSearch;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+            kBinarySearchWithFirstKey;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+            kBinarySearch;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.DataBlockIndexType
+class DataBlockIndexTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.DataBlockIndexType for the provided
+  // C++
ROCKSDB_NAMESPACE::DataBlockIndexType enum + static jbyte toJavaDataBlockIndexType( + const ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType& + index_type) { + switch (index_type) { + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinarySearch: + return 0x0; + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinaryAndHash: + return 0x1; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::DataBlockIndexType enum for + // the provided Java org.rocksdb.DataBlockIndexType + static ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType + toCppDataBlockIndexType(jbyte jindex_type) { + switch (jindex_type) { + case 0x0: + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinarySearch; + case 0x1: + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinaryAndHash; + default: + // undefined/default + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinarySearch; + } + } +}; + +// The portal class for org.rocksdb.ChecksumType +class ChecksumTypeJni { + public: + // Returns the equivalent org.rocksdb.ChecksumType for the provided + // C++ ROCKSDB_NAMESPACE::ChecksumType enum + static jbyte toJavaChecksumType( + const ROCKSDB_NAMESPACE::ChecksumType& checksum_type) { + switch (checksum_type) { + case ROCKSDB_NAMESPACE::ChecksumType::kNoChecksum: + return 0x0; + case ROCKSDB_NAMESPACE::ChecksumType::kCRC32c: + return 0x1; + case ROCKSDB_NAMESPACE::ChecksumType::kxxHash: + return 0x2; + case ROCKSDB_NAMESPACE::ChecksumType::kxxHash64: + return 0x3; + case ROCKSDB_NAMESPACE::ChecksumType::kXXH3: + return 0x4; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::ChecksumType enum for the + // provided Java org.rocksdb.ChecksumType + static ROCKSDB_NAMESPACE::ChecksumType toCppChecksumType( + jbyte jchecksum_type) { + switch (jchecksum_type) { + case 0x0: + return ROCKSDB_NAMESPACE::ChecksumType::kNoChecksum; + case 0x1: + return ROCKSDB_NAMESPACE::ChecksumType::kCRC32c; + case 0x2: + return ROCKSDB_NAMESPACE::ChecksumType::kxxHash; + case 0x3: + return ROCKSDB_NAMESPACE::ChecksumType::kxxHash64; + case 0x4: + return ROCKSDB_NAMESPACE::ChecksumType::kXXH3; + default: + // undefined/default + return ROCKSDB_NAMESPACE::ChecksumType::kCRC32c; + } + } +}; + +// The portal class for org.rocksdb.IndexShorteningMode +class IndexShorteningModeJni { + public: + // Returns the equivalent org.rocksdb.IndexShorteningMode for the provided + // C++ ROCKSDB_NAMESPACE::IndexShorteningMode enum + static jbyte toJavaIndexShorteningMode( + const ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode& + index_shortening_mode) { + switch (index_shortening_mode) { + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kNoShortening: + return 0x0; + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparators: + return 0x1; + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor: + return 0x2; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::IndexShorteningMode enum for + // the provided Java org.rocksdb.IndexShorteningMode + static ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode + toCppIndexShorteningMode(jbyte jindex_shortening_mode) { + switch (jindex_shortening_mode) { + case 0x0: 
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode::
+            kNoShortening;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode::
+            kShortenSeparators;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode::
+            kShortenSeparatorsAndSuccessor;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode::
+            kShortenSeparators;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.Priority
+class PriorityJni {
+ public:
+  // Returns the equivalent org.rocksdb.Priority for the provided
+  // C++ ROCKSDB_NAMESPACE::Env::Priority enum
+  static jbyte toJavaPriority(
+      const ROCKSDB_NAMESPACE::Env::Priority& priority) {
+    switch (priority) {
+      case ROCKSDB_NAMESPACE::Env::Priority::BOTTOM:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::Env::Priority::LOW:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::Env::Priority::HIGH:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::Env::Priority::TOTAL:
+        return 0x3;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::Env::Priority enum for the
+  // provided Java org.rocksdb.Priority
+  static ROCKSDB_NAMESPACE::Env::Priority toCppPriority(jbyte jpriority) {
+    switch (jpriority) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::Env::Priority::BOTTOM;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::Env::Priority::LOW;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::Env::Priority::HIGH;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::Env::Priority::TOTAL;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::Env::Priority::LOW;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.ThreadType
+class ThreadTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.ThreadType for the provided
+  // C++ ROCKSDB_NAMESPACE::ThreadStatus::ThreadType enum
+  static jbyte toJavaThreadType(
+      const ROCKSDB_NAMESPACE::ThreadStatus::ThreadType& thread_type) {
+    switch (thread_type) {
+      case ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::HIGH_PRIORITY:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::LOW_PRIORITY:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::USER:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::BOTTOM_PRIORITY:
+        return 0x3;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::ThreadStatus::ThreadType
+  // enum for the provided Java org.rocksdb.ThreadType
+  static ROCKSDB_NAMESPACE::ThreadStatus::ThreadType toCppThreadType(
+      jbyte jthread_type) {
+    switch (jthread_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::HIGH_PRIORITY;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::LOW_PRIORITY;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::USER;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::BOTTOM_PRIORITY;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::LOW_PRIORITY;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.OperationType
+class OperationTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.OperationType for the provided
+  // C++ ROCKSDB_NAMESPACE::ThreadStatus::OperationType enum
+  static jbyte toJavaOperationType(
+      const ROCKSDB_NAMESPACE::ThreadStatus::OperationType& operation_type) {
+    switch (operation_type) {
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_UNKNOWN:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_COMPACTION:
+        return 0x1;
+      case
ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_FLUSH: + return 0x2; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_DBOPEN: + return 0x3; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::ThreadStatus::OperationType + // enum for the provided Java org.rocksdb.OperationType + static ROCKSDB_NAMESPACE::ThreadStatus::OperationType toCppOperationType( + jbyte joperation_type) { + switch (joperation_type) { + case 0x0: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_UNKNOWN; + case 0x1: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_COMPACTION; + case 0x2: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_FLUSH; + case 0x3: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_DBOPEN; + default: + // undefined/default + return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_UNKNOWN; + } + } +}; + +// The portal class for org.rocksdb.OperationStage +class OperationStageJni { + public: + // Returns the equivalent org.rocksdb.OperationStage for the provided + // C++ ROCKSDB_NAMESPACE::ThreadStatus::OperationStage enum + static jbyte toJavaOperationStage( + const ROCKSDB_NAMESPACE::ThreadStatus::OperationStage& operation_stage) { + switch (operation_stage) { + case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::STAGE_UNKNOWN: + return 0x0; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::STAGE_FLUSH_RUN: + return 0x1; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_FLUSH_WRITE_L0: + return 0x2; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_COMPACTION_PREPARE: + return 0x3; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_COMPACTION_RUN: + return 0x4; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_COMPACTION_PROCESS_KV: + return 0x5; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_COMPACTION_INSTALL: + return 0x6; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_COMPACTION_SYNC_FILE: + return 0x7; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_PICK_MEMTABLES_TO_FLUSH: + return 0x8; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_MEMTABLE_ROLLBACK: + return 0x9; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS: + return 0xA; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::ThreadStatus::OperationStage + // enum for the provided Java org.rocksdb.OperationStage + static ROCKSDB_NAMESPACE::ThreadStatus::OperationStage toCppOperationStage( + jbyte joperation_stage) { + switch (joperation_stage) { + case 0x0: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::STAGE_UNKNOWN; + case 0x1: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::STAGE_FLUSH_RUN; + case 0x2: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_FLUSH_WRITE_L0; + case 0x3: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_COMPACTION_PREPARE; + case 0x4: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_COMPACTION_RUN; + case 0x5: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_COMPACTION_PROCESS_KV; + case 0x6: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_COMPACTION_INSTALL; + case 0x7: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_COMPACTION_SYNC_FILE; + case 0x8: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage:: + STAGE_PICK_MEMTABLES_TO_FLUSH; + case 0x9: 
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+            STAGE_MEMTABLE_ROLLBACK;
+      case 0xA:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+            STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::STAGE_UNKNOWN;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.StateType
+class StateTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.StateType for the provided
+  // C++ ROCKSDB_NAMESPACE::ThreadStatus::StateType enum
+  static jbyte toJavaStateType(
+      const ROCKSDB_NAMESPACE::ThreadStatus::StateType& state_type) {
+    switch (state_type) {
+      case ROCKSDB_NAMESPACE::ThreadStatus::StateType::STATE_UNKNOWN:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::ThreadStatus::StateType::STATE_MUTEX_WAIT:
+        return 0x1;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::ThreadStatus::StateType enum
+  // for the provided Java org.rocksdb.StateType
+  static ROCKSDB_NAMESPACE::ThreadStatus::StateType toCppStateType(
+      jbyte jstate_type) {
+    switch (jstate_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::ThreadStatus::StateType::STATE_UNKNOWN;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::ThreadStatus::StateType::STATE_MUTEX_WAIT;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::ThreadStatus::StateType::STATE_UNKNOWN;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.ThreadStatus
+class ThreadStatusJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.ThreadStatus
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/ThreadStatus");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.ThreadStatus object with the same
+   * properties as the provided C++ ROCKSDB_NAMESPACE::ThreadStatus object
+   *
+   * @param env A pointer to the Java environment
+   * @param thread_status A pointer to ROCKSDB_NAMESPACE::ThreadStatus object
+   *
+   * @return A reference to a Java org.rocksdb.ThreadStatus object, or
+   *     nullptr if an exception occurs
+   */
+  static jobject construct(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::ThreadStatus* thread_status) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(
+        jclazz, "<init>", "(JBLjava/lang/String;Ljava/lang/String;BJB[JB)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jstring jdb_name =
+        JniUtil::toJavaString(env, &(thread_status->db_name), true);
+    if (env->ExceptionCheck()) {
+      // an error occurred
+      return nullptr;
+    }
+
+    jstring jcf_name =
+        JniUtil::toJavaString(env, &(thread_status->cf_name), true);
+    if (env->ExceptionCheck()) {
+      // an error occurred
+      env->DeleteLocalRef(jdb_name);
+      return nullptr;
+    }
+
+    // long[]
+    const jsize len = static_cast<jsize>(
+        ROCKSDB_NAMESPACE::ThreadStatus::kNumOperationProperties);
+    jlongArray joperation_properties = env->NewLongArray(len);
+    if (joperation_properties == nullptr) {
+      // an exception occurred
+      env->DeleteLocalRef(jdb_name);
+      env->DeleteLocalRef(jcf_name);
+      return nullptr;
+    }
+    jboolean is_copy;
+    jlong* body = env->GetLongArrayElements(joperation_properties, &is_copy);
+    if (body == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jdb_name);
+      env->DeleteLocalRef(jcf_name);
+      env->DeleteLocalRef(joperation_properties);
+      return nullptr;
+    }
+    for (jsize i = 0; i < len; ++i) {
+      body[i] = static_cast<jlong>(thread_status->op_properties[i]);
+    }
+    env->ReleaseLongArrayElements(joperation_properties, body,
+                                  is_copy == JNI_TRUE ? 0 : JNI_ABORT);
+
+    jobject jcfd = env->NewObject(
+        jclazz, mid, static_cast<jlong>(thread_status->thread_id),
+        ThreadTypeJni::toJavaThreadType(thread_status->thread_type), jdb_name,
+        jcf_name,
+        OperationTypeJni::toJavaOperationType(thread_status->operation_type),
+        static_cast<jlong>(thread_status->op_elapsed_micros),
+        OperationStageJni::toJavaOperationStage(thread_status->operation_stage),
+        joperation_properties,
+        StateTypeJni::toJavaStateType(thread_status->state_type));
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      env->DeleteLocalRef(jdb_name);
+      env->DeleteLocalRef(jcf_name);
+      env->DeleteLocalRef(joperation_properties);
+      return nullptr;
+    }
+
+    // cleanup
+    env->DeleteLocalRef(jdb_name);
+    env->DeleteLocalRef(jcf_name);
+    env->DeleteLocalRef(joperation_properties);
+
+    return jcfd;
+  }
+};
+
+// The portal class for org.rocksdb.CompactionStyle
+class CompactionStyleJni {
+ public:
+  // Returns the equivalent org.rocksdb.CompactionStyle for the provided
+  // C++ ROCKSDB_NAMESPACE::CompactionStyle enum
+  static jbyte toJavaCompactionStyle(
+      const ROCKSDB_NAMESPACE::CompactionStyle& compaction_style) {
+    switch (compaction_style) {
+      case ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleLevel:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleUniversal:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleFIFO:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleNone:
+        return 0x3;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::CompactionStyle enum for the
+  // provided Java org.rocksdb.CompactionStyle
+  static ROCKSDB_NAMESPACE::CompactionStyle toCppCompactionStyle(
+      jbyte jcompaction_style) {
+    switch (jcompaction_style) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleLevel;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleUniversal;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleFIFO;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleNone;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleLevel;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.CompactionReason
+class CompactionReasonJni {
+ public:
+  // Returns the equivalent org.rocksdb.CompactionReason for the provided
+  // C++ ROCKSDB_NAMESPACE::CompactionReason enum
+  static jbyte toJavaCompactionReason(
+      const ROCKSDB_NAMESPACE::CompactionReason& compaction_reason) {
+    switch (compaction_reason) {
+      case ROCKSDB_NAMESPACE::CompactionReason::kUnknown:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::CompactionReason::kLevelL0FilesNum:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::CompactionReason::kLevelMaxLevelSize:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::CompactionReason::kUniversalSizeAmplification:
+        return 0x3;
+      case ROCKSDB_NAMESPACE::CompactionReason::kUniversalSizeRatio:
+        return 0x4;
+      case ROCKSDB_NAMESPACE::CompactionReason::kUniversalSortedRunNum:
+        return 0x5;
+      case ROCKSDB_NAMESPACE::CompactionReason::kFIFOMaxSize:
+        return 0x6;
+      case ROCKSDB_NAMESPACE::CompactionReason::kFIFOReduceNumFiles:
+        return 0x7;
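+      // (Editorial note: from here on the Java byte values are not assigned
+      // in listing order -- kFilesMarkedForCompaction maps to 0x10 while the
+      // reasons listed after it use 0x0A..0x0F. The values mirror the layout
+      // of org.rocksdb.CompactionReason, so, as with the tickers above, they
+      // are presumably pinned for compatibility and should not be
+      // renumbered.)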
case ROCKSDB_NAMESPACE::CompactionReason::kFIFOTtl: + return 0x8; + case ROCKSDB_NAMESPACE::CompactionReason::kManualCompaction: + return 0x9; + case ROCKSDB_NAMESPACE::CompactionReason::kFilesMarkedForCompaction: + return 0x10; + case ROCKSDB_NAMESPACE::CompactionReason::kBottommostFiles: + return 0x0A; + case ROCKSDB_NAMESPACE::CompactionReason::kTtl: + return 0x0B; + case ROCKSDB_NAMESPACE::CompactionReason::kFlush: + return 0x0C; + case ROCKSDB_NAMESPACE::CompactionReason::kExternalSstIngestion: + return 0x0D; + case ROCKSDB_NAMESPACE::CompactionReason::kPeriodicCompaction: + return 0x0E; + case ROCKSDB_NAMESPACE::CompactionReason::kChangeTemperature: + return 0x0F; + case ROCKSDB_NAMESPACE::CompactionReason::kForcedBlobGC: + return 0x11; + case ROCKSDB_NAMESPACE::CompactionReason::kRoundRobinTtl: + return 0x12; + case ROCKSDB_NAMESPACE::CompactionReason::kRefitLevel: + return 0x13; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::CompactionReason enum for the + // provided Java org.rocksdb.CompactionReason + static ROCKSDB_NAMESPACE::CompactionReason toCppCompactionReason( + jbyte jcompaction_reason) { + switch (jcompaction_reason) { + case 0x0: + return ROCKSDB_NAMESPACE::CompactionReason::kUnknown; + case 0x1: + return ROCKSDB_NAMESPACE::CompactionReason::kLevelL0FilesNum; + case 0x2: + return ROCKSDB_NAMESPACE::CompactionReason::kLevelMaxLevelSize; + case 0x3: + return ROCKSDB_NAMESPACE::CompactionReason::kUniversalSizeAmplification; + case 0x4: + return ROCKSDB_NAMESPACE::CompactionReason::kUniversalSizeRatio; + case 0x5: + return ROCKSDB_NAMESPACE::CompactionReason::kUniversalSortedRunNum; + case 0x6: + return ROCKSDB_NAMESPACE::CompactionReason::kFIFOMaxSize; + case 0x7: + return ROCKSDB_NAMESPACE::CompactionReason::kFIFOReduceNumFiles; + case 0x8: + return ROCKSDB_NAMESPACE::CompactionReason::kFIFOTtl; + case 0x9: + return ROCKSDB_NAMESPACE::CompactionReason::kManualCompaction; + case 0x10: + return ROCKSDB_NAMESPACE::CompactionReason::kFilesMarkedForCompaction; + case 0x0A: + return ROCKSDB_NAMESPACE::CompactionReason::kBottommostFiles; + case 0x0B: + return ROCKSDB_NAMESPACE::CompactionReason::kTtl; + case 0x0C: + return ROCKSDB_NAMESPACE::CompactionReason::kFlush; + case 0x0D: + return ROCKSDB_NAMESPACE::CompactionReason::kExternalSstIngestion; + case 0x0E: + return ROCKSDB_NAMESPACE::CompactionReason::kPeriodicCompaction; + case 0x0F: + return ROCKSDB_NAMESPACE::CompactionReason::kChangeTemperature; + case 0x11: + return ROCKSDB_NAMESPACE::CompactionReason::kForcedBlobGC; + case 0x12: + return ROCKSDB_NAMESPACE::CompactionReason::kRoundRobinTtl; + case 0x13: + return ROCKSDB_NAMESPACE::CompactionReason::kRefitLevel; + default: + // undefined/default + return ROCKSDB_NAMESPACE::CompactionReason::kUnknown; + } + } +}; + +// The portal class for org.rocksdb.WalFileType +class WalFileTypeJni { + public: + // Returns the equivalent org.rocksdb.WalFileType for the provided + // C++ ROCKSDB_NAMESPACE::WalFileType enum + static jbyte toJavaWalFileType( + const ROCKSDB_NAMESPACE::WalFileType& wal_file_type) { + switch (wal_file_type) { + case ROCKSDB_NAMESPACE::WalFileType::kArchivedLogFile: + return 0x0; + case ROCKSDB_NAMESPACE::WalFileType::kAliveLogFile: + return 0x1; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::WalFileType enum for the + // provided Java org.rocksdb.WalFileType + static ROCKSDB_NAMESPACE::WalFileType toCppWalFileType(jbyte jwal_file_type) { + switch 
(jwal_file_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::WalFileType::kArchivedLogFile;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::WalFileType::kAliveLogFile;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::WalFileType::kAliveLogFile;
+    }
+  }
+};
+
+class LogFileJni : public JavaClass {
+ public:
+  /**
+   * Create a new Java org.rocksdb.LogFile object.
+   *
+   * @param env A pointer to the Java environment
+   * @param log_file A Cpp log file object
+   *
+   * @return A reference to a Java org.rocksdb.LogFile object, or
+   *     nullptr if an exception occurs
+   */
+  static jobject fromCppLogFile(JNIEnv* env,
+                                ROCKSDB_NAMESPACE::LogFile* log_file) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid =
+        env->GetMethodID(jclazz, "<init>", "(Ljava/lang/String;JBJJ)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    std::string path_name = log_file->PathName();
+    jstring jpath_name =
+        ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &path_name, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      return nullptr;
+    }
+
+    jobject jlog_file = env->NewObject(
+        jclazz, mid, jpath_name, static_cast<jlong>(log_file->LogNumber()),
+        ROCKSDB_NAMESPACE::WalFileTypeJni::toJavaWalFileType(log_file->Type()),
+        static_cast<jlong>(log_file->StartSequence()),
+        static_cast<jlong>(log_file->SizeFileBytes()));
+
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jpath_name);
+      return nullptr;
+    }
+
+    // cleanup
+    env->DeleteLocalRef(jpath_name);
+
+    return jlog_file;
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/LogFile");
+  }
+};
+
+class LiveFileMetaDataJni : public JavaClass {
+ public:
+  /**
+   * Create a new Java org.rocksdb.LiveFileMetaData object.
+ * + * @param env A pointer to the Java environment + * @param live_file_meta_data A Cpp live file meta data object + * + * @return A reference to a Java org.rocksdb.LiveFileMetaData object, or + * nullptr if an an exception occurs + */ + static jobject fromCppLiveFileMetaData( + JNIEnv* env, ROCKSDB_NAMESPACE::LiveFileMetaData* live_file_meta_data) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID( + jclazz, "", + "([BILjava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jbyteArray jcolumn_family_name = ROCKSDB_NAMESPACE::JniUtil::copyBytes( + env, live_file_meta_data->column_family_name); + if (jcolumn_family_name == nullptr) { + // exception occurred creating java byte array + return nullptr; + } + + jstring jfile_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString( + env, &live_file_meta_data->name, true); + if (env->ExceptionCheck()) { + // exception occurred creating java string + env->DeleteLocalRef(jcolumn_family_name); + return nullptr; + } + + jstring jpath = ROCKSDB_NAMESPACE::JniUtil::toJavaString( + env, &live_file_meta_data->db_path, true); + if (env->ExceptionCheck()) { + // exception occurred creating java string + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfile_name); + return nullptr; + } + + jbyteArray jsmallest_key = ROCKSDB_NAMESPACE::JniUtil::copyBytes( + env, live_file_meta_data->smallestkey); + if (jsmallest_key == nullptr) { + // exception occurred creating java byte array + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + return nullptr; + } + + jbyteArray jlargest_key = ROCKSDB_NAMESPACE::JniUtil::copyBytes( + env, live_file_meta_data->largestkey); + if (jlargest_key == nullptr) { + // exception occurred creating java byte array + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + return nullptr; + } + + jobject jlive_file_meta_data = env->NewObject( + jclazz, mid, jcolumn_family_name, + static_cast(live_file_meta_data->level), jfile_name, jpath, + static_cast(live_file_meta_data->size), + static_cast(live_file_meta_data->smallest_seqno), + static_cast(live_file_meta_data->largest_seqno), jsmallest_key, + jlargest_key, + static_cast(live_file_meta_data->num_reads_sampled), + static_cast(live_file_meta_data->being_compacted), + static_cast(live_file_meta_data->num_entries), + static_cast(live_file_meta_data->num_deletions)); + + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + env->DeleteLocalRef(jlargest_key); + return nullptr; + } + + // cleanup + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + env->DeleteLocalRef(jlargest_key); + + return jlive_file_meta_data; + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/LiveFileMetaData"); + } +}; + +class SstFileMetaDataJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.SstFileMetaData object. 
+ * + * @param env A pointer to the Java environment + * @param sst_file_meta_data A Cpp sst file meta data object + * + * @return A reference to a Java org.rocksdb.SstFileMetaData object, or + * nullptr if an an exception occurs + */ + static jobject fromCppSstFileMetaData( + JNIEnv* env, + const ROCKSDB_NAMESPACE::SstFileMetaData* sst_file_meta_data) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID( + jclazz, "", "(Ljava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jstring jfile_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString( + env, &sst_file_meta_data->name, true); + if (jfile_name == nullptr) { + // exception occurred creating java byte array + return nullptr; + } + + jstring jpath = ROCKSDB_NAMESPACE::JniUtil::toJavaString( + env, &sst_file_meta_data->db_path, true); + if (jpath == nullptr) { + // exception occurred creating java byte array + env->DeleteLocalRef(jfile_name); + return nullptr; + } + + jbyteArray jsmallest_key = ROCKSDB_NAMESPACE::JniUtil::copyBytes( + env, sst_file_meta_data->smallestkey); + if (jsmallest_key == nullptr) { + // exception occurred creating java byte array + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + return nullptr; + } + + jbyteArray jlargest_key = ROCKSDB_NAMESPACE::JniUtil::copyBytes( + env, sst_file_meta_data->largestkey); + if (jlargest_key == nullptr) { + // exception occurred creating java byte array + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + return nullptr; + } + + jobject jsst_file_meta_data = env->NewObject( + jclazz, mid, jfile_name, jpath, + static_cast(sst_file_meta_data->size), + static_cast(sst_file_meta_data->smallest_seqno), + static_cast(sst_file_meta_data->largest_seqno), jsmallest_key, + jlargest_key, static_cast(sst_file_meta_data->num_reads_sampled), + static_cast(sst_file_meta_data->being_compacted), + static_cast(sst_file_meta_data->num_entries), + static_cast(sst_file_meta_data->num_deletions)); + + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + env->DeleteLocalRef(jlargest_key); + return nullptr; + } + + // cleanup + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + env->DeleteLocalRef(jlargest_key); + + return jsst_file_meta_data; + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/SstFileMetaData"); + } +}; + +class LevelMetaDataJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.LevelMetaData object. 
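+   *
+   * Editorial note (not upstream): fromCppLevelMetaData keeps each element's
+   * local reference alive after SetObjectArrayElement until the enclosing
+   * JNI call returns. For levels with very many files, a sketch that
+   * releases each reference eagerly avoids pressure on the JNI local
+   * reference table:
+   * <pre>
+   *   env->SetObjectArrayElement(jfiles, i++, jfile);
+   *   env->DeleteLocalRef(jfile);  // the array now holds the only needed ref
+   * </pre>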
+ * + * @param env A pointer to the Java environment + * @param level_meta_data A Cpp level meta data object + * + * @return A reference to a Java org.rocksdb.LevelMetaData object, or + * nullptr if an an exception occurs + */ + static jobject fromCppLevelMetaData( + JNIEnv* env, const ROCKSDB_NAMESPACE::LevelMetaData* level_meta_data) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID(jclazz, "", + "(IJ[Lorg/rocksdb/SstFileMetaData;)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + const jsize jlen = static_cast(level_meta_data->files.size()); + jobjectArray jfiles = + env->NewObjectArray(jlen, SstFileMetaDataJni::getJClass(env), nullptr); + if (jfiles == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + jsize i = 0; + for (auto it = level_meta_data->files.begin(); + it != level_meta_data->files.end(); ++it) { + jobject jfile = SstFileMetaDataJni::fromCppSstFileMetaData(env, &(*it)); + if (jfile == nullptr) { + // exception occurred + env->DeleteLocalRef(jfiles); + return nullptr; + } + env->SetObjectArrayElement(jfiles, i++, jfile); + } + + jobject jlevel_meta_data = + env->NewObject(jclazz, mid, static_cast(level_meta_data->level), + static_cast(level_meta_data->size), jfiles); + + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jfiles); + return nullptr; + } + + // cleanup + env->DeleteLocalRef(jfiles); + + return jlevel_meta_data; + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/LevelMetaData"); + } +}; + +class ColumnFamilyMetaDataJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.ColumnFamilyMetaData object. 
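+   *
+   * Editorial illustration (not upstream), assuming a previously unwrapped
+   * DB pointer `db`: callers typically fill the C++ struct first and then
+   * convert it in one step:
+   * <pre>
+   *   ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta;
+   *   db->GetColumnFamilyMetaData(&cf_meta);
+   *   jobject jcf_meta =
+   *       ColumnFamilyMetaDataJni::fromCppColumnFamilyMetaData(env, &cf_meta);
+   * </pre>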
+ * + * @param env A pointer to the Java environment + * @param column_famly_meta_data A Cpp live file meta data object + * + * @return A reference to a Java org.rocksdb.ColumnFamilyMetaData object, or + * nullptr if an an exception occurs + */ + static jobject fromCppColumnFamilyMetaData( + JNIEnv* env, + const ROCKSDB_NAMESPACE::ColumnFamilyMetaData* column_famly_meta_data) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID(jclazz, "", + "(JJ[B[Lorg/rocksdb/LevelMetaData;)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jbyteArray jname = ROCKSDB_NAMESPACE::JniUtil::copyBytes( + env, column_famly_meta_data->name); + if (jname == nullptr) { + // exception occurred creating java byte array + return nullptr; + } + + const jsize jlen = + static_cast(column_famly_meta_data->levels.size()); + jobjectArray jlevels = + env->NewObjectArray(jlen, LevelMetaDataJni::getJClass(env), nullptr); + if (jlevels == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jname); + return nullptr; + } + + jsize i = 0; + for (auto it = column_famly_meta_data->levels.begin(); + it != column_famly_meta_data->levels.end(); ++it) { + jobject jlevel = LevelMetaDataJni::fromCppLevelMetaData(env, &(*it)); + if (jlevel == nullptr) { + // exception occurred + env->DeleteLocalRef(jname); + env->DeleteLocalRef(jlevels); + return nullptr; + } + env->SetObjectArrayElement(jlevels, i++, jlevel); + } + + jobject jcolumn_family_meta_data = env->NewObject( + jclazz, mid, static_cast(column_famly_meta_data->size), + static_cast(column_famly_meta_data->file_count), jname, jlevels); + + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jname); + env->DeleteLocalRef(jlevels); + return nullptr; + } + + // cleanup + env->DeleteLocalRef(jname); + env->DeleteLocalRef(jlevels); + + return jcolumn_family_meta_data; + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/ColumnFamilyMetaData"); + } +}; + +// The portal class for org.rocksdb.AbstractTraceWriter +class AbstractTraceWriterJni + : public RocksDBNativeClass< + const ROCKSDB_NAMESPACE::TraceWriterJniCallback*, + AbstractTraceWriterJni> { + public: + /** + * Get the Java Class org.rocksdb.AbstractTraceWriter + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractTraceWriter"); + } + + /** + * Get the Java Method: AbstractTraceWriter#write + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getWriteProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "writeProxy", "(J)S"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractTraceWriter#closeWriter + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID 
getCloseWriterProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "closeWriterProxy", "()S"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractTraceWriter#getFileSize + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getGetFileSizeMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "getFileSize", "()J"); + assert(mid != nullptr); + return mid; + } +}; + +// The portal class for org.rocksdb.AbstractWalFilter +class AbstractWalFilterJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.AbstractWalFilter + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/AbstractWalFilter"); + } + + /** + * Get the Java Method: AbstractWalFilter#columnFamilyLogNumberMap + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getColumnFamilyLogNumberMapMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "columnFamilyLogNumberMap", + "(Ljava/util/Map;Ljava/util/Map;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractTraceWriter#logRecordFoundProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getLogRecordFoundProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "logRecordFoundProxy", + "(JLjava/lang/String;JJ)S"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractTraceWriter#name + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getNameMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "name", "()Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } +}; + +// The portal class for org.rocksdb.WalProcessingOption +class WalProcessingOptionJni { + public: + // Returns the equivalent org.rocksdb.WalProcessingOption for the provided + // C++ ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption enum + static jbyte toJavaWalProcessingOption( + const ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption& + wal_processing_option) { + switch (wal_processing_option) { + case ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption:: + kContinueProcessing: + 
return 0x0; + case ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption:: + kIgnoreCurrentRecord: + return 0x1; + case ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption::kStopReplay: + return 0x2; + case ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption::kCorruptedRecord: + return 0x3; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ + // ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption enum for the provided + // Java org.rocksdb.WalProcessingOption + static ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption + toCppWalProcessingOption(jbyte jwal_processing_option) { + switch (jwal_processing_option) { + case 0x0: + return ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption:: + kContinueProcessing; + case 0x1: + return ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption:: + kIgnoreCurrentRecord; + case 0x2: + return ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption::kStopReplay; + case 0x3: + return ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption:: + kCorruptedRecord; + default: + // undefined/default + return ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption:: + kCorruptedRecord; + } + } +}; + +// The portal class for org.rocksdb.ReusedSynchronisationType +class ReusedSynchronisationTypeJni { + public: + // Returns the equivalent org.rocksdb.ReusedSynchronisationType for the + // provided C++ ROCKSDB_NAMESPACE::ReusedSynchronisationType enum + static jbyte toJavaReusedSynchronisationType( + const ROCKSDB_NAMESPACE::ReusedSynchronisationType& + reused_synchronisation_type) { + switch (reused_synchronisation_type) { + case ROCKSDB_NAMESPACE::ReusedSynchronisationType::MUTEX: + return 0x0; + case ROCKSDB_NAMESPACE::ReusedSynchronisationType::ADAPTIVE_MUTEX: + return 0x1; + case ROCKSDB_NAMESPACE::ReusedSynchronisationType::THREAD_LOCAL: + return 0x2; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::ReusedSynchronisationType + // enum for the provided Java org.rocksdb.ReusedSynchronisationType + static ROCKSDB_NAMESPACE::ReusedSynchronisationType + toCppReusedSynchronisationType(jbyte reused_synchronisation_type) { + switch (reused_synchronisation_type) { + case 0x0: + return ROCKSDB_NAMESPACE::ReusedSynchronisationType::MUTEX; + case 0x1: + return ROCKSDB_NAMESPACE::ReusedSynchronisationType::ADAPTIVE_MUTEX; + case 0x2: + return ROCKSDB_NAMESPACE::ReusedSynchronisationType::THREAD_LOCAL; + default: + // undefined/default + return ROCKSDB_NAMESPACE::ReusedSynchronisationType::ADAPTIVE_MUTEX; + } + } +}; +// The portal class for org.rocksdb.SanityLevel +class SanityLevelJni { + public: + // Returns the equivalent org.rocksdb.SanityLevel for the provided + // C++ ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel enum + static jbyte toJavaSanityLevel( + const ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel& sanity_level) { + switch (sanity_level) { + case ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel::kSanityLevelNone: + return 0x0; + case ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel:: + kSanityLevelLooselyCompatible: + return 0x1; + case ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel:: + kSanityLevelExactMatch: + return -0x01; + default: + return -0x01; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel + // enum for the provided Java org.rocksdb.SanityLevel + static ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel toCppSanityLevel( + jbyte sanity_level) { + switch (sanity_level) { + case 0x0: + return ROCKSDB_NAMESPACE::ConfigOptions::kSanityLevelNone; + case 0x1: + return 
ROCKSDB_NAMESPACE::ConfigOptions::kSanityLevelLooselyCompatible; + default: + // undefined/default + return ROCKSDB_NAMESPACE::ConfigOptions::kSanityLevelExactMatch; + } + } +}; + +// The portal class for org.rocksdb.PrepopulateBlobCache +class PrepopulateBlobCacheJni { + public: + // Returns the equivalent org.rocksdb.PrepopulateBlobCache for the provided + // C++ ROCKSDB_NAMESPACE::PrepopulateBlobCache enum + static jbyte toJavaPrepopulateBlobCache( + ROCKSDB_NAMESPACE::PrepopulateBlobCache prepopulate_blob_cache) { + switch (prepopulate_blob_cache) { + case ROCKSDB_NAMESPACE::PrepopulateBlobCache::kDisable: + return 0x0; + case ROCKSDB_NAMESPACE::PrepopulateBlobCache::kFlushOnly: + return 0x1; + default: + return 0x7f; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::PrepopulateBlobCache enum for + // the provided Java org.rocksdb.PrepopulateBlobCache + static ROCKSDB_NAMESPACE::PrepopulateBlobCache toCppPrepopulateBlobCache( + jbyte jprepopulate_blob_cache) { + switch (jprepopulate_blob_cache) { + case 0x0: + return ROCKSDB_NAMESPACE::PrepopulateBlobCache::kDisable; + case 0x1: + return ROCKSDB_NAMESPACE::PrepopulateBlobCache::kFlushOnly; + case 0x7F: + default: + // undefined/default + return ROCKSDB_NAMESPACE::PrepopulateBlobCache::kDisable; + } + } +}; + +// The portal class for org.rocksdb.AbstractListener.EnabledEventCallback +class EnabledEventCallbackJni { + public: + // Returns the set of equivalent C++ + // ROCKSDB_NAMESPACE::EnabledEventCallbackJni::EnabledEventCallback enums for + // the provided Java jenabled_event_callback_values + static std::set toCppEnabledEventCallbacks( + jlong jenabled_event_callback_values) { + std::set enabled_event_callbacks; + for (size_t i = 0; i < EnabledEventCallback::NUM_ENABLED_EVENT_CALLBACK; + ++i) { + if (((1ULL << i) & jenabled_event_callback_values) > 0) { + enabled_event_callbacks.emplace(static_cast(i)); + } + } + return enabled_event_callbacks; + } +}; + +// The portal class for org.rocksdb.AbstractEventListener +class AbstractEventListenerJni + : public RocksDBNativeClass< + const ROCKSDB_NAMESPACE::EventListenerJniCallback*, + AbstractEventListenerJni> { + public: + /** + * Get the Java Class org.rocksdb.AbstractEventListener + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractEventListener"); + } + + /** + * Get the Java Method: AbstractEventListener#onFlushCompletedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFlushCompletedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onFlushCompletedProxy", + "(JLorg/rocksdb/FlushJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFlushBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFlushBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onFlushBeginProxy", + "(JLorg/rocksdb/FlushJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the 
Java Method: AbstractEventListener#onTableFileDeleted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileDeletedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onTableFileDeleted", "(Lorg/rocksdb/TableFileDeletionInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onCompactionBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnCompactionBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onCompactionBeginProxy", + "(JLorg/rocksdb/CompactionJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onCompactionCompletedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnCompactionCompletedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onCompactionCompletedProxy", + "(JLorg/rocksdb/CompactionJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileCreated + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileCreatedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onTableFileCreated", "(Lorg/rocksdb/TableFileCreationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileCreationStarted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileCreationStartedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onTableFileCreationStarted", + "(Lorg/rocksdb/TableFileCreationBriefInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onMemTableSealed + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnMemTableSealedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onMemTableSealed", + "(Lorg/rocksdb/MemTableInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: + * AbstractEventListener#onColumnFamilyHandleDeletionStarted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnColumnFamilyHandleDeletionStartedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onColumnFamilyHandleDeletionStarted", + "(Lorg/rocksdb/ColumnFamilyHandle;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onExternalFileIngestedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnExternalFileIngestedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); 
+ assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onExternalFileIngestedProxy", + "(JLorg/rocksdb/ExternalFileIngestionInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onBackgroundError + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnBackgroundErrorProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onBackgroundErrorProxy", + "(BLorg/rocksdb/Status;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onStallConditionsChanged + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnStallConditionsChangedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onStallConditionsChanged", + "(Lorg/rocksdb/WriteStallInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileReadFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileReadFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileReadFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileWriteFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileWriteFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileWriteFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileFlushFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileFlushFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileFlushFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileSyncFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileSyncFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileSyncFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileRangeSyncFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileRangeSyncFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileRangeSyncFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileTruncateFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static 
jmethodID getOnFileTruncateFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileTruncateFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileCloseFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileCloseFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileCloseFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#shouldBeNotifiedOnFileIO + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getShouldBeNotifiedOnFileIOMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "shouldBeNotifiedOnFileIO", "()Z"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onErrorRecoveryBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnErrorRecoveryBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onErrorRecoveryBeginProxy", + "(BLorg/rocksdb/Status;)Z"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onErrorRecoveryCompleted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnErrorRecoveryCompletedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onErrorRecoveryCompleted", + "(Lorg/rocksdb/Status;)V"); + assert(mid != nullptr); + return mid; + } +}; + +class FlushJobInfoJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.FlushJobInfo object. 
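+   *
+   * Editorial illustration (not upstream): the event-listener bridge is
+   * expected to convert and dispatch roughly as follows, where
+   * m_jcallback_obj (the JniCallback's global reference to the Java
+   * listener) and `db` are assumed to be in scope:
+   * <pre>
+   *   jobject jflush_job_info =
+   *       FlushJobInfoJni::fromCppFlushJobInfo(env, &flush_job_info);
+   *   env->CallVoidMethod(
+   *       m_jcallback_obj,
+   *       AbstractEventListenerJni::getOnFlushCompletedProxyMethodId(env),
+   *       GET_CPLUSPLUS_POINTER(db), jflush_job_info);
+   * </pre>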
+   *
+   * @param env A pointer to the Java environment
+   * @param flush_job_info A Cpp flush job info object
+   *
+   * @return A reference to a Java org.rocksdb.FlushJobInfo object, or
+   *     nullptr if an exception occurs
+   */
+  static jobject fromCppFlushJobInfo(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::FlushJobInfo* flush_job_info) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+    static jmethodID ctor = getConstructorMethodId(env, jclazz);
+    assert(ctor != nullptr);
+    jstring jcf_name = JniUtil::toJavaString(env, &flush_job_info->cf_name);
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+    jstring jfile_path =
+        JniUtil::toJavaString(env, &flush_job_info->file_path);
+    if (env->ExceptionCheck()) {
+      // release the earlier, still-valid local reference
+      env->DeleteLocalRef(jcf_name);
+      return nullptr;
+    }
+    jobject jtable_properties = TablePropertiesJni::fromCppTableProperties(
+        env, flush_job_info->table_properties);
+    if (jtable_properties == nullptr) {
+      env->DeleteLocalRef(jcf_name);
+      env->DeleteLocalRef(jfile_path);
+      return nullptr;
+    }
+    return env->NewObject(
+        jclazz, ctor, static_cast<jlong>(flush_job_info->cf_id), jcf_name,
+        jfile_path, static_cast<jlong>(flush_job_info->thread_id),
+        static_cast<jint>(flush_job_info->job_id),
+        static_cast<jboolean>(flush_job_info->triggered_writes_slowdown),
+        static_cast<jboolean>(flush_job_info->triggered_writes_stop),
+        static_cast<jlong>(flush_job_info->smallest_seqno),
+        static_cast<jlong>(flush_job_info->largest_seqno), jtable_properties,
+        static_cast<jbyte>(flush_job_info->flush_reason));
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/FlushJobInfo");
+  }
+
+  static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) {
+    return env->GetMethodID(clazz, "<init>",
+                            "(JLjava/lang/String;Ljava/lang/String;JIZZJJLorg/"
+                            "rocksdb/TableProperties;B)V");
+  }
+};
+
+class TableFileDeletionInfoJni : public JavaClass {
+ public:
+  /**
+   * Create a new Java org.rocksdb.TableFileDeletionInfo object.
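+   *
+   * Editorial illustration (not upstream): paired with
+   * AbstractEventListenerJni#getOnTableFileDeletedMethodId, the conversion
+   * and dispatch look roughly like the following, with m_jcallback_obj
+   * assumed to be the JniCallback's global reference:
+   * <pre>
+   *   jobject jinfo =
+   *       TableFileDeletionInfoJni::fromCppTableFileDeletionInfo(env, &info);
+   *   env->CallVoidMethod(m_jcallback_obj,
+   *       AbstractEventListenerJni::getOnTableFileDeletedMethodId(env),
+   *       jinfo);
+   * </pre>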
+ * + * @param env A pointer to the Java environment + * @param file_del_info A Cpp table file deletion info object + * + * @return A reference to a Java org.rocksdb.TableFileDeletionInfo object, or + * nullptr if an an exception occurs + */ + static jobject fromCppTableFileDeletionInfo( + JNIEnv* env, + const ROCKSDB_NAMESPACE::TableFileDeletionInfo* file_del_info) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &file_del_info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jobject jstatus = StatusJni::construct(env, file_del_info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + return env->NewObject(jclazz, ctor, jdb_name, + JniUtil::toJavaString(env, &file_del_info->file_path), + static_cast(file_del_info->job_id), jstatus); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileDeletionInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(Ljava/lang/String;Ljava/lang/String;ILorg/rocksdb/Status;)V"); + } +}; + +class CompactionJobInfoJni : public JavaClass { + public: + static jobject fromCppCompactionJobInfo( + JNIEnv* env, + const ROCKSDB_NAMESPACE::CompactionJobInfo* compaction_job_info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + return env->NewObject(jclazz, ctor, + GET_CPLUSPLUS_POINTER(compaction_job_info)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/CompactionJobInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(J)V"); + } +}; + +class TableFileCreationInfoJni : public JavaClass { + public: + static jobject fromCppTableFileCreationInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::TableFileCreationInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jobject jtable_properties = + TablePropertiesJni::fromCppTableProperties(env, info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jobject jstatus = StatusJni::construct(env, info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jtable_properties); + return nullptr; + } + return env->NewObject(jclazz, ctor, static_cast(info->file_size), + jtable_properties, jstatus, jdb_name, jcf_name, + jfile_path, static_cast(info->job_id), + static_cast(info->reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileCreationInfo"); 
+ } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(JLorg/rocksdb/TableProperties;Lorg/rocksdb/Status;Ljava/lang/" + "String;Ljava/lang/String;Ljava/lang/String;IB)V"); + } +}; + +class TableFileCreationBriefInfoJni : public JavaClass { + public: + static jobject fromCppTableFileCreationBriefInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::TableFileCreationBriefInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + return env->NewObject(jclazz, ctor, jdb_name, jcf_name, jfile_path, + static_cast(info->job_id), + static_cast(info->reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileCreationBriefInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;IB)V"); + } +}; + +class MemTableInfoJni : public JavaClass { + public: + static jobject fromCppMemTableInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::MemTableInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + return env->NewObject(jclazz, ctor, jcf_name, + static_cast(info->first_seqno), + static_cast(info->earliest_seqno), + static_cast(info->num_entries), + static_cast(info->num_deletes)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/MemTableInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(Ljava/lang/String;JJJJ)V"); + } +}; + +class ExternalFileIngestionInfoJni : public JavaClass { + public: + static jobject fromCppExternalFileIngestionInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::ExternalFileIngestionInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jexternal_file_path = + JniUtil::toJavaString(env, &info->external_file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jstring jinternal_file_path = + JniUtil::toJavaString(env, &info->internal_file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jexternal_file_path); + return nullptr; + } + jobject jtable_properties = + TablePropertiesJni::fromCppTableProperties(env, info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jexternal_file_path); + env->DeleteLocalRef(jinternal_file_path); + return nullptr; + } + return env->NewObject( + 
jclazz, ctor, jcf_name, jexternal_file_path, jinternal_file_path, + static_cast(info->global_seqno), jtable_properties); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/ExternalFileIngestionInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/" + "String;JLorg/rocksdb/TableProperties;)V"); + } +}; + +class WriteStallInfoJni : public JavaClass { + public: + static jobject fromCppWriteStallInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::WriteStallInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + return env->NewObject(jclazz, ctor, jcf_name, + static_cast(info->condition.cur), + static_cast(info->condition.prev)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/WriteStallInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(Ljava/lang/String;BB)V"); + } +}; + +class FileOperationInfoJni : public JavaClass { + public: + static jobject fromCppFileOperationInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::FileOperationInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jpath = JniUtil::toJavaString(env, &info->path); + if (env->ExceptionCheck()) { + return nullptr; + } + jobject jstatus = StatusJni::construct(env, info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jpath); + return nullptr; + } + return env->NewObject( + jclazz, ctor, jpath, static_cast(info->offset), + static_cast(info->length), + static_cast(info->start_ts.time_since_epoch().count()), + static_cast(info->duration.count()), jstatus); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/FileOperationInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(Ljava/lang/String;JJJJLorg/rocksdb/Status;)V"); + } +}; +} // namespace ROCKSDB_NAMESPACE +#endif // JAVA_ROCKSJNI_PORTAL_H_ diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/statisticsjni.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/statisticsjni.h new file mode 100644 index 0000000000..3262b296cf --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/statisticsjni.h @@ -0,0 +1,34 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Statistics
+
+#ifndef JAVA_ROCKSJNI_STATISTICSJNI_H_
+#define JAVA_ROCKSJNI_STATISTICSJNI_H_
+
+#include <jni.h>
+#include <memory>
+#include <set>
+
+#include "monitoring/statistics_impl.h"
+#include "rocksdb/statistics.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class StatisticsJni : public StatisticsImpl {
+ public:
+  StatisticsJni(std::shared_ptr<Statistics> stats);
+  StatisticsJni(std::shared_ptr<Statistics> stats,
+                const std::set<uint32_t> ignore_histograms);
+  virtual bool HistEnabledForType(uint32_t type) const override;
+
+ private:
+  const std::set<uint32_t> m_ignore_histograms;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_STATISTICSJNI_H_
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/table_filter_jnicallback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/table_filter_jnicallback.h
new file mode 100644
index 0000000000..0ef404ca22
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/table_filter_jnicallback.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::TableFilter.
+
+#ifndef JAVA_ROCKSJNI_TABLE_FILTER_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_TABLE_FILTER_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include <functional>
+#include <memory>
+
+#include "rocksdb/table_properties.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TableFilterJniCallback : public JniCallback {
+ public:
+  TableFilterJniCallback(JNIEnv* env, jobject jtable_filter);
+  std::function<bool(const ROCKSDB_NAMESPACE::TableProperties&)>
+      GetTableFilterFunction();
+
+ private:
+  jmethodID m_jfilter_methodid;
+  std::function<bool(const ROCKSDB_NAMESPACE::TableProperties&)>
+      m_table_filter_function;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_TABLE_FILTER_JNICALLBACK_H_
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/trace_writer_jnicallback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/trace_writer_jnicallback.h
new file mode 100644
index 0000000000..c82a3a72cf
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/trace_writer_jnicallback.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::TraceWriter.
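+//
+// Editorial sketch (not part of the upstream file), assuming `env`,
+// `jtrace_writer`, `db`, and `trace_options` are in scope: a TraceWriter
+// implemented in Java is typically installed through DB::StartTrace, e.g.
+//
+//   std::unique_ptr<ROCKSDB_NAMESPACE::TraceWriter> trace_writer(
+//       new ROCKSDB_NAMESPACE::TraceWriterJniCallback(env, jtrace_writer));
+//   ROCKSDB_NAMESPACE::Status s =
+//       db->StartTrace(trace_options, std::move(trace_writer));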
+
+#ifndef JAVA_ROCKSJNI_TRACE_WRITER_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_TRACE_WRITER_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include <memory>
+
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TraceWriterJniCallback : public JniCallback, public TraceWriter {
+ public:
+  TraceWriterJniCallback(JNIEnv* env, jobject jtrace_writer);
+  virtual Status Write(const Slice& data);
+  virtual Status Close();
+  virtual uint64_t GetFileSize();
+
+ private:
+  jmethodID m_jwrite_proxy_methodid;
+  jmethodID m_jclose_writer_proxy_methodid;
+  jmethodID m_jget_file_size_methodid;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_TRACE_WRITER_JNICALLBACK_H_
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/transaction_notifier_jnicallback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/transaction_notifier_jnicallback.h
new file mode 100644
index 0000000000..089a5ee4a5
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/transaction_notifier_jnicallback.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::TransactionNotifier.
+
+#ifndef JAVA_ROCKSJNI_TRANSACTION_NOTIFIER_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_TRANSACTION_NOTIFIER_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include "rocksdb/utilities/transaction.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/**
+ * This class acts as a bridge between C++
+ * and Java. The methods in this class will be
+ * called back from the RocksDB TransactionDB or OptimisticTransactionDB
+ * (C++); we then call back to the appropriate Java method, which enables
+ * TransactionNotifier to be implemented in Java.
+ *
+ * Unlike RocksJava's Comparator JNI Callback, we do not attempt
+ * to reduce Java object allocations by caching the Snapshot object
+ * presented to the callback. This could be revisited in the future
+ * if performance is lacking.
+ */
+class TransactionNotifierJniCallback : public JniCallback,
+                                       public TransactionNotifier {
+ public:
+  TransactionNotifierJniCallback(JNIEnv* env, jobject jtransaction_notifier);
+  virtual void SnapshotCreated(const Snapshot* newSnapshot);
+
+ private:
+  jmethodID m_jsnapshot_created_methodID;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_TRANSACTION_NOTIFIER_JNICALLBACK_H_
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/wal_filter_jnicallback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/wal_filter_jnicallback.h
new file mode 100644
index 0000000000..5cdc659786
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/wal_filter_jnicallback.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::WalFilter.
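+//
+// Editorial sketch (not upstream), assuming `env`, `jwal_filter`, and a
+// `db_options` being prepared for DB::Open: a Java WAL filter is attached
+// through DBOptions::wal_filter before the database is opened, e.g.
+//
+//   auto* wal_filter =
+//       new ROCKSDB_NAMESPACE::WalFilterJniCallback(env, jwal_filter);
+//   db_options.wal_filter = wal_filter;  // consulted during WAL replay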
+
+#ifndef JAVA_ROCKSJNI_WAL_FILTER_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_WAL_FILTER_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "rocksdb/wal_filter.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WalFilterJniCallback : public JniCallback, public WalFilter {
+ public:
+  WalFilterJniCallback(JNIEnv* env, jobject jwal_filter);
+  virtual void ColumnFamilyLogNumberMap(
+      const std::map<uint32_t, uint64_t>& cf_lognumber_map,
+      const std::map<std::string, uint32_t>& cf_name_id_map);
+  virtual WalFilter::WalProcessingOption LogRecordFound(
+      unsigned long long log_number, const std::string& log_file_name,
+      const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed);
+  virtual const char* Name() const;
+
+ private:
+  std::unique_ptr<const char[]> m_name;
+  jmethodID m_column_family_log_number_map_mid;
+  jmethodID m_log_record_found_proxy_mid;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_WAL_FILTER_JNICALLBACK_H_
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/writebatchhandlerjnicallback.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/writebatchhandlerjnicallback.h
new file mode 100644
index 0000000000..9629797ca7
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/java/rocksjni/writebatchhandlerjnicallback.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::WriteBatch::Handler.
+
+#ifndef JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_
+#define JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_
+
+#include <jni.h>
+
+#include <functional>
+#include <memory>
+
+#include "rocksdb/write_batch.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+/**
+ * This class acts as a bridge between C++
+ * and Java. The methods in this class will be
+ * called back from the RocksDB storage engine (C++)
+ * which calls the appropriate Java method.
+ * This enables Write Batch Handlers to be implemented in Java.
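+ *
+ * Editorial illustration (not upstream), assuming `env`,
+ * `jwrite_batch_handler`, and a `write_batch` pointer are in scope: the
+ * handler is driven by WriteBatch::Iterate, e.g.
+ * <pre>
+ *   WriteBatchHandlerJniCallback handler(env, jwrite_batch_handler);
+ *   ROCKSDB_NAMESPACE::Status s = write_batch->Iterate(&handler);
+ * </pre>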
+ */ +class WriteBatchHandlerJniCallback : public JniCallback, + public WriteBatch::Handler { + public: + WriteBatchHandlerJniCallback(JNIEnv* env, jobject jWriteBackHandler); + Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value); + void Put(const Slice& key, const Slice& value); + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value); + void Merge(const Slice& key, const Slice& value); + Status DeleteCF(uint32_t column_family_id, const Slice& key); + void Delete(const Slice& key); + Status SingleDeleteCF(uint32_t column_family_id, const Slice& key); + void SingleDelete(const Slice& key); + Status DeleteRangeCF(uint32_t column_family_id, const Slice& beginKey, + const Slice& endKey); + void DeleteRange(const Slice& beginKey, const Slice& endKey); + void LogData(const Slice& blob); + Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key, + const Slice& value); + Status MarkBeginPrepare(bool); + Status MarkEndPrepare(const Slice& xid); + Status MarkNoop(bool empty_batch); + Status MarkRollback(const Slice& xid); + Status MarkCommit(const Slice& xid); + Status MarkCommitWithTimestamp(const Slice& xid, const Slice& commit_ts); + bool Continue(); + + private: + JNIEnv* m_env; + jmethodID m_jPutCfMethodId; + jmethodID m_jPutMethodId; + jmethodID m_jMergeCfMethodId; + jmethodID m_jMergeMethodId; + jmethodID m_jDeleteCfMethodId; + jmethodID m_jDeleteMethodId; + jmethodID m_jSingleDeleteCfMethodId; + jmethodID m_jSingleDeleteMethodId; + jmethodID m_jDeleteRangeCfMethodId; + jmethodID m_jDeleteRangeMethodId; + jmethodID m_jLogDataMethodId; + jmethodID m_jPutBlobIndexCfMethodId; + jmethodID m_jMarkBeginPrepareMethodId; + jmethodID m_jMarkEndPrepareMethodId; + jmethodID m_jMarkNoopMethodId; + jmethodID m_jMarkRollbackMethodId; + jmethodID m_jMarkCommitMethodId; + jmethodID m_jMarkCommitWithTimestampMethodId; + jmethodID m_jContinueMethodId; + /** + * @return A pointer to a ROCKSDB_NAMESPACE::Status or nullptr if an + * unexpected exception occurred + */ + std::unique_ptr kv_op( + const Slice& key, const Slice& value, + std::function kvFn); + /** + * @return A pointer to a ROCKSDB_NAMESPACE::Status or nullptr if an + * unexpected exception occurred + */ + std::unique_ptr k_op( + const Slice& key, std::function kFn); +}; +} // namespace ROCKSDB_NAMESPACE + +#endif // JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_ diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/auto_roll_logger.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/auto_roll_logger.cc new file mode 100644 index 0000000000..9e9ad45aee --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/auto_roll_logger.cc @@ -0,0 +1,368 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
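+//
+// Editorial sketch (not upstream): the rolling behavior implemented below is
+// driven entirely by DBOptions; for example, a logger that rolls at 10 MB
+// and keeps the five most recent files (the path is illustrative):
+//
+//   ROCKSDB_NAMESPACE::DBOptions options;
+//   options.max_log_file_size = 10 * 1024 * 1024;
+//   options.keep_log_file_num = 5;
+//   std::shared_ptr<ROCKSDB_NAMESPACE::Logger> logger;
+//   ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::CreateLoggerFromOptions(
+//       "/path/to/db", options, &logger);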
+// +#include "logging/auto_roll_logger.h" + +#include + +#include "file/filename.h" +#include "logging/logging.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +// -- AutoRollLogger + +AutoRollLogger::AutoRollLogger(const std::shared_ptr& fs, + const std::shared_ptr& clock, + const std::string& dbname, + const std::string& db_log_dir, + size_t log_max_size, + size_t log_file_time_to_roll, + size_t keep_log_file_num, + const InfoLogLevel log_level) + : Logger(log_level), + dbname_(dbname), + db_log_dir_(db_log_dir), + fs_(fs), + clock_(clock), + status_(Status::OK()), + kMaxLogFileSize(log_max_size), + kLogFileTimeToRoll(log_file_time_to_roll), + kKeepLogFileNum(keep_log_file_num), + cached_now(static_cast(clock_->NowMicros() * 1e-6)), + ctime_(cached_now), + cached_now_access_count(0), + call_NowMicros_every_N_records_(100), + mutex_() { + Status s = fs->GetAbsolutePath(dbname, io_options_, &db_absolute_path_, + &io_context_); + if (s.IsNotSupported()) { + db_absolute_path_ = dbname; + } else { + status_ = s; + } + log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); + if (fs_->FileExists(log_fname_, io_options_, &io_context_).ok()) { + RollLogFile(); + } + GetExistingFiles(); + s = ResetLogger(); + if (s.ok() && status_.ok()) { + status_ = TrimOldLogFiles(); + } +} + +Status AutoRollLogger::ResetLogger() { + TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger"); + status_ = fs_->NewLogger(log_fname_, io_options_, &logger_, &io_context_); + TEST_SYNC_POINT("AutoRollLogger::ResetLogger:AfterNewLogger"); + + if (!status_.ok()) { + return status_; + } + assert(logger_); + logger_->SetInfoLogLevel(Logger::GetInfoLogLevel()); + + if (logger_->GetLogFileSize() == Logger::kDoNotSupportGetLogFileSize) { + status_ = Status::NotSupported( + "The underlying logger doesn't support GetLogFileSize()"); + } + if (status_.ok()) { + cached_now = static_cast(clock_->NowMicros() * 1e-6); + ctime_ = cached_now; + cached_now_access_count = 0; + } + + return status_; +} + +void AutoRollLogger::RollLogFile() { + // This function is called when log is rotating. Two rotations + // can happen quickly (NowMicro returns same value). To not overwrite + // previous log file we increment by one micro second and try again. + uint64_t now = clock_->NowMicros(); + std::string old_fname; + do { + old_fname = + OldInfoLogFileName(dbname_, now, db_absolute_path_, db_log_dir_); + now++; + } while (fs_->FileExists(old_fname, io_options_, &io_context_).ok()); + // Wait for logger_ reference count to turn to 1 as it might be pinned by + // Flush. Pinned Logger can't be closed till Flush is completed on that + // Logger. + while (logger_.use_count() > 1) { + } + // Close the existing logger first to release the existing handle + // before renaming the file using the file system. If this call + // fails there is nothing much we can do and we will continue with the + // rename and hence ignoring the result status. + if (logger_) { + logger_->Close().PermitUncheckedError(); + } + Status s = fs_->RenameFile(log_fname_, old_fname, io_options_, &io_context_); + if (!s.ok()) { + // What should we do on error? + } + old_log_files_.push(old_fname); +} + +void AutoRollLogger::GetExistingFiles() { + { + // Empty the queue to avoid duplicated entries in the queue. 
+    std::queue<std::string> empty;
+    std::swap(old_log_files_, empty);
+  }
+
+  std::string parent_dir;
+  std::vector<std::string> info_log_files;
+  Status s =
+      GetInfoLogFiles(fs_, db_log_dir_, dbname_, &parent_dir, &info_log_files);
+  if (status_.ok()) {
+    status_ = s;
+  }
+  // We need to sort the files before enqueuing them so that when we
+  // delete a file from the front, it is the oldest file.
+  std::sort(info_log_files.begin(), info_log_files.end());
+
+  for (const std::string& f : info_log_files) {
+    old_log_files_.push(parent_dir + "/" + f);
+  }
+}
+
+Status AutoRollLogger::TrimOldLogFiles() {
+  // Here we directly list info files and delete them through FileSystem.
+  // The deletion isn't going through DB, so there are shortcomings:
+  // 1. the deletion is not rate limited by SstFileManager
+  // 2. there is a chance that an I/O will be issued here
+  // Since it's going to be complicated to pass DB object down to
+  // here, we take a simple approach to keep the code easier to
+  // maintain.
+
+  // old_log_files_.empty() is helpful for the corner case that
+  // kKeepLogFileNum == 0. We can instead check kKeepLogFileNum != 0 but
+  // it's essentially the same thing, and checking empty before accessing
+  // the queue feels safer.
+  while (!old_log_files_.empty() && old_log_files_.size() >= kKeepLogFileNum) {
+    Status s =
+        fs_->DeleteFile(old_log_files_.front(), io_options_, &io_context_);
+    // Remove the file from the tracking anyway. It's possible that
+    // DB cleaned up the old log file, or people cleaned it up manually.
+    old_log_files_.pop();
+    // To make the file really go away, we should sync parent directory.
+    // Since there isn't any consistency issue involved here, skipping
+    // this part to avoid one I/O here.
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
+std::string AutoRollLogger::ValistToString(const char* format,
+                                           va_list args) const {
+  // Any log messages longer than 1024 will get truncated.
+  // The user is responsible for chopping longer messages into multi-line logs.
+  static const int MAXBUFFERSIZE = 1024;
+  char buffer[MAXBUFFERSIZE];
+
+  int count = vsnprintf(buffer, MAXBUFFERSIZE, format, args);
+  (void)count;
+  assert(count >= 0);
+
+  return buffer;
+}
+
+void AutoRollLogger::LogInternal(const char* format, ...) {
+  mutex_.AssertHeld();
+
+  if (!logger_) {
+    return;
+  }
+
+  va_list args;
+  va_start(args, format);
+  logger_->Logv(format, args);
+  va_end(args);
+}
+
+void AutoRollLogger::Logv(const char* format, va_list ap) {
+  assert(GetStatus().ok());
+  if (!logger_) {
+    return;
+  }
+
+  std::shared_ptr<Logger> logger;
+  {
+    MutexLock l(&mutex_);
+    if ((kLogFileTimeToRoll > 0 && LogExpired()) ||
+        (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) {
+      RollLogFile();
+      Status s = ResetLogger();
+      Status s2 = TrimOldLogFiles();
+
+      if (!s.ok()) {
+        // can't really log the error if creating a new LOG file failed
+        return;
+      }
+
+      WriteHeaderInfo();
+
+      if (!s2.ok()) {
+        ROCKS_LOG_WARN(logger_.get(), "Failed to trim old info log file: %s",
+                       s2.ToString().c_str());
+      }
+    }
+
+    // pin down the current logger_ instance before releasing the mutex.
+    logger = logger_;
+  }
+
+  // Another thread could have put a new Logger instance into logger_ by now.
+  // However, since logger is still hanging on to the previous instance
+  // (reference count is not zero), we don't have to worry about it being
+  // deleted while we are accessing it.
+ // Note that logv itself is not mutex protected to allow maximum concurrency, + // as thread safety should have been handled by the underlying logger. + logger->Logv(format, ap); +} + +void AutoRollLogger::WriteHeaderInfo() { + mutex_.AssertHeld(); + for (auto& header : headers_) { + LogInternal("%s", header.c_str()); + } +} + +void AutoRollLogger::LogHeader(const char* format, va_list args) { + if (!logger_) { + return; + } + + // header message are to be retained in memory. Since we cannot make any + // assumptions about the data contained in va_list, we will retain them as + // strings + va_list tmp; + va_copy(tmp, args); + std::string data = ValistToString(format, tmp); + va_end(tmp); + + MutexLock l(&mutex_); + headers_.push_back(data); + + // Log the original message to the current log + logger_->Logv(format, args); +} + +bool AutoRollLogger::LogExpired() { + if (cached_now_access_count >= call_NowMicros_every_N_records_) { + cached_now = static_cast(clock_->NowMicros() * 1e-6); + cached_now_access_count = 0; + } + + ++cached_now_access_count; + return cached_now >= ctime_ + kLogFileTimeToRoll; +} + +Status CreateLoggerFromOptions(const std::string& dbname, + const DBOptions& options, + std::shared_ptr* logger) { + if (options.info_log) { + *logger = options.info_log; + return Status::OK(); + } + + Env* env = options.env; + std::string db_absolute_path; + Status s = env->GetAbsolutePath(dbname, &db_absolute_path); + TEST_SYNC_POINT_CALLBACK("rocksdb::CreateLoggerFromOptions:AfterGetPath", &s); + if (!s.ok()) { + return s; + } + std::string fname = + InfoLogFileName(dbname, db_absolute_path, options.db_log_dir); + + const auto& clock = env->GetSystemClock(); + // In case it does not exist. + s = env->CreateDirIfMissing(dbname); + if (!s.ok()) { + if (options.db_log_dir.empty()) { + return s; + } else { + // Ignore the error returned during creation of dbname because dbname and + // db_log_dir can be on different filesystems in which case dbname will + // not exist and error should be ignored. db_log_dir creation will handle + // the error in case there is any error in the creation of dbname on same + // filesystem. + s = Status::OK(); + } + } + assert(s.ok()); + + if (!options.db_log_dir.empty()) { + s = env->CreateDirIfMissing(options.db_log_dir); + if (!s.ok()) { + return s; + } + } + // Currently we only support roll by time-to-roll and log size + if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { + AutoRollLogger* result = new AutoRollLogger( + env->GetFileSystem(), clock, dbname, options.db_log_dir, + options.max_log_file_size, options.log_file_time_to_roll, + options.keep_log_file_num, options.info_log_level); + s = result->GetStatus(); + if (!s.ok()) { + delete result; + } else { + logger->reset(result); + } + return s; + } + // Open a log file in the same directory as the db + s = env->FileExists(fname); + if (s.ok()) { + s = env->RenameFile( + fname, OldInfoLogFileName(dbname, clock->NowMicros(), db_absolute_path, + options.db_log_dir)); + + // The operation sequence of "FileExists -> Rename" is not atomic. It's + // possible that FileExists returns OK but file gets deleted before Rename. + // This can cause Rename to return IOError with subcode PathNotFound. + // Although it may be a rare case and applications should be discouraged + // to not concurrently modifying the contents of the directories accessed + // by the database instance, it is still helpful if we can perform some + // simple handling of this case. 
Therefore, we do the following: + // 1. if Rename() returns IOError with PathNotFound subcode, then we check + // whether the source file, i.e. LOG, exists. + // 2. if LOG exists, it means Rename() failed due to something else. Then + // we report error. + // 3. if LOG does not exist, it means it may have been removed/renamed by + // someone else. Since it does not exist, we can reset Status to OK so + // that this caller can try creating a new LOG file. If this succeeds, + // we should still allow it. + if (s.IsPathNotFound()) { + s = env->FileExists(fname); + if (s.IsNotFound()) { + s = Status::OK(); + } + } + } else if (s.IsNotFound()) { + // "LOG" is not required to exist since this could be a new DB. + s = Status::OK(); + } + if (s.ok()) { + s = env->NewLogger(fname, logger); + } + if (s.ok() && logger->get() != nullptr) { + (*logger)->SetInfoLogLevel(options.info_log_level); + } + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/auto_roll_logger.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/auto_roll_logger.h new file mode 100644 index 0000000000..dca9996fea --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/auto_roll_logger.h @@ -0,0 +1,166 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. + +#pragma once +#include +#include +#include + +#include "file/filename.h" +#include "port/port.h" +#include "port/util_logger.h" +#include "test_util/sync_point.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { +class FileSystem; +class SystemClock; + +// Rolls the log file by size and/or time +class AutoRollLogger : public Logger { + public: + AutoRollLogger(const std::shared_ptr& fs, + const std::shared_ptr& clock, + const std::string& dbname, const std::string& db_log_dir, + size_t log_max_size, size_t log_file_time_to_roll, + size_t keep_log_file_num, + const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL); + + using Logger::Logv; + void Logv(const char* format, va_list ap) override; + + // Write a header entry to the log. All header information will be written + // again every time the log rolls over. + virtual void LogHeader(const char* format, va_list ap) override; + + // check if the logger has encountered any problem. + Status GetStatus() { return status_; } + + size_t GetLogFileSize() const override { + if (!logger_) { + return 0; + } + + std::shared_ptr logger; + { + MutexLock l(&mutex_); + // pin down the current logger_ instance before releasing the mutex. + logger = logger_; + } + return logger->GetLogFileSize(); + } + + void Flush() override { + std::shared_ptr logger; + { + MutexLock l(&mutex_); + // pin down the current logger_ instance before releasing the mutex. 
+      logger = logger_;
+    }
+    TEST_SYNC_POINT("AutoRollLogger::Flush:PinnedLogger");
+    if (logger) {
+      logger->Flush();
+    }
+  }
+
+  virtual ~AutoRollLogger() {
+    if (logger_ && !closed_) {
+      logger_->Close().PermitUncheckedError();
+    }
+    status_.PermitUncheckedError();
+  }
+
+  using Logger::GetInfoLogLevel;
+  InfoLogLevel GetInfoLogLevel() const override {
+    MutexLock l(&mutex_);
+    if (!logger_) {
+      return Logger::GetInfoLogLevel();
+    }
+    return logger_->GetInfoLogLevel();
+  }
+
+  using Logger::SetInfoLogLevel;
+  void SetInfoLogLevel(const InfoLogLevel log_level) override {
+    MutexLock lock(&mutex_);
+    Logger::SetInfoLogLevel(log_level);
+    if (logger_) {
+      logger_->SetInfoLogLevel(log_level);
+    }
+  }
+
+  void SetCallNowMicrosEveryNRecords(uint64_t call_NowMicros_every_N_records) {
+    call_NowMicros_every_N_records_ = call_NowMicros_every_N_records;
+  }
+
+  // Expose the log file path for testing purposes
+  std::string TEST_log_fname() const { return log_fname_; }
+
+  uint64_t TEST_ctime() const { return ctime_; }
+
+  Logger* TEST_inner_logger() const { return logger_.get(); }
+
+ protected:
+  // Implementation of Close()
+  virtual Status CloseImpl() override {
+    if (logger_) {
+      return logger_->Close();
+    } else {
+      return Status::OK();
+    }
+  }
+
+ private:
+  bool LogExpired();
+  Status ResetLogger();
+  void RollLogFile();
+  // Read all names of old log files into old_log_files_
+  // If there is any error, put the error code in status_
+  void GetExistingFiles();
+  // Delete old log files if their number exceeds the limit.
+  Status TrimOldLogFiles();
+  // Log message to logger without rolling
+  void LogInternal(const char* format, ...);
+  // Serialize the va_list to a string
+  std::string ValistToString(const char* format, va_list args) const;
+  // Write the logs marked as headers to the new log file
+  void WriteHeaderInfo();
+  std::string log_fname_;  // Current active info log's file name.
+  std::string dbname_;
+  std::string db_log_dir_;
+  std::string db_absolute_path_;
+  std::shared_ptr<FileSystem> fs_;
+  std::shared_ptr<SystemClock> clock_;
+  std::shared_ptr<Logger> logger_;
+  // current status of the logger
+  Status status_;
+  const size_t kMaxLogFileSize;
+  const size_t kLogFileTimeToRoll;
+  const size_t kKeepLogFileNum;
+  // header information
+  std::list<std::string> headers_;
+  // List of all existing info log files. Used for enforcing number of
+  // info log files.
+  // Full path is stored here. It consumes significantly more memory
+  // than only storing file name. Can optimize if it causes a problem.
+  std::queue<std::string> old_log_files_;
+  // to avoid frequent clock->NowMicros() calls, we cache the current time
+  uint64_t cached_now;
+  uint64_t ctime_;
+  uint64_t cached_now_access_count;
+  uint64_t call_NowMicros_every_N_records_;
+  IOOptions io_options_;
+  IODebugContext io_context_;
+  mutable port::Mutex mutex_;
+};
+
+// Facade to create logger automatically
+Status CreateLoggerFromOptions(const std::string& dbname,
+                               const DBOptions& options,
+                               std::shared_ptr<Logger>* logger);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/env_logger.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/env_logger.h
new file mode 100644
index 0000000000..fc9b245504
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/env_logger.h
@@ -0,0 +1,195 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that uses custom Env object for logging. + +#pragma once + +#include + +#include +#include + +#include "file/writable_file_writer.h" +#include "monitoring/iostats_context_imp.h" +#include "port/sys_time.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/perf_level.h" +#include "rocksdb/slice.h" +#include "test_util/sync_point.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +class EnvLogger : public Logger { + public: + EnvLogger(std::unique_ptr&& writable_file, + const std::string& fname, const EnvOptions& options, Env* env, + InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) + : Logger(log_level), + env_(env), + clock_(env_->GetSystemClock().get()), + file_(std::move(writable_file), fname, options, clock_), + last_flush_micros_(0), + flush_pending_(false) {} + + ~EnvLogger() { + if (!closed_) { + closed_ = true; + CloseHelper().PermitUncheckedError(); + } + } + + private: + // A guard to prepare file operations, such as mutex and skip + // I/O context. + class FileOpGuard { + public: + explicit FileOpGuard(EnvLogger& logger) + : logger_(logger), prev_perf_level_(GetPerfLevel()) { + // Preserve iostats not to pollute writes from user writes. We might + // need a better solution than this. + SetPerfLevel(PerfLevel::kDisable); + IOSTATS_SET_DISABLE(true); + logger.mutex_.Lock(); + } + ~FileOpGuard() { + logger_.mutex_.Unlock(); + IOSTATS_SET_DISABLE(false); + SetPerfLevel(prev_perf_level_); + } + + private: + EnvLogger& logger_; + PerfLevel prev_perf_level_; + }; + + void FlushLocked() { + mutex_.AssertHeld(); + if (flush_pending_) { + flush_pending_ = false; + file_.Flush().PermitUncheckedError(); + file_.reset_seen_error(); + } + last_flush_micros_ = clock_->NowMicros(); + } + + void Flush() override { + TEST_SYNC_POINT("EnvLogger::Flush:Begin1"); + TEST_SYNC_POINT("EnvLogger::Flush:Begin2"); + + FileOpGuard guard(*this); + FlushLocked(); + } + + Status CloseImpl() override { return CloseHelper(); } + + Status CloseHelper() { + FileOpGuard guard(*this); + const auto close_status = file_.Close(); + + if (close_status.ok()) { + return close_status; + } + return Status::IOError("Close of log file failed with error:" + + (close_status.getState() + ? std::string(close_status.getState()) + : std::string())); + } + + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + IOSTATS_TIMER_GUARD(logger_nanos); + + const uint64_t thread_id = env_->GetThreadID(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. 
+ char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 65536; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + port::TimeVal now_tv; + port::GetTimeOfDay(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + port::LocalTimeR(&seconds, &t); + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llu ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + { + FileOpGuard guard(*this); + // We will ignore any error returned by Append(). + file_.Append(Slice(base, p - base)).PermitUncheckedError(); + file_.reset_seen_error(); + flush_pending_ = true; + const uint64_t now_micros = clock_->NowMicros(); + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + FlushLocked(); + } + } + if (base != buffer) { + delete[] base; + } + break; + } + } + + size_t GetLogFileSize() const override { + MutexLock l(&mutex_); + return file_.GetFileSize(); + } + + private: + Env* env_; + SystemClock* clock_; + WritableFileWriter file_; + mutable port::Mutex mutex_; // Mutex to protect the shared variables below. + const static uint64_t flush_every_seconds_ = 5; + std::atomic_uint_fast64_t last_flush_micros_; + std::atomic flush_pending_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/event_logger.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/event_logger.cc new file mode 100644 index 0000000000..cb9eca6871 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/event_logger.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "logging/event_logger.h" + +#include +#include +#include +#include + +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +EventLoggerStream::EventLoggerStream(Logger* logger) + : logger_(logger), + log_buffer_(nullptr), + max_log_size_(0), + json_writer_(nullptr) {} + +EventLoggerStream::EventLoggerStream(LogBuffer* log_buffer, + const size_t max_log_size) + : logger_(nullptr), + log_buffer_(log_buffer), + max_log_size_(max_log_size), + json_writer_(nullptr) {} + +EventLoggerStream::~EventLoggerStream() { + if (json_writer_) { + json_writer_->EndObject(); +#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT + printf("%s\n", json_writer_->Get().c_str()); +#else + if (logger_) { + EventLogger::Log(logger_, *json_writer_); + } else if (log_buffer_) { + assert(max_log_size_); + EventLogger::LogToBuffer(log_buffer_, *json_writer_, max_log_size_); + } +#endif + delete json_writer_; + } +} + +void EventLogger::Log(const JSONWriter& jwriter) { Log(logger_, jwriter); } + +void EventLogger::Log(Logger* logger, const JSONWriter& jwriter) { +#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT + printf("%s\n", jwriter.Get().c_str()); +#else + ROCKSDB_NAMESPACE::Log(logger, "%s %s", Prefix(), jwriter.Get().c_str()); +#endif +} + +void EventLogger::LogToBuffer(LogBuffer* log_buffer, const JSONWriter& jwriter, + const size_t max_log_size) { +#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT + printf("%s\n", jwriter.Get().c_str()); +#else + assert(log_buffer); + ROCKSDB_NAMESPACE::LogToBuffer(log_buffer, max_log_size, "%s %s", Prefix(), + jwriter.Get().c_str()); +#endif +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/event_logger.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/event_logger.h new file mode 100644 index 0000000000..9ce982f50e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/event_logger.h @@ -0,0 +1,202 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
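+//
+// Illustrative usage sketch for the classes below (`info_log` is a
+// placeholder Logger*; the values are made up to match the sample output
+// documented further down):
+//
+//   EventLogger event_logger(info_log);
+//   event_logger.Log() << "event" << "table_file_creation"
+//                      << "file_number" << 12
+//                      << "file_size" << 1909699;
+//
+// Keys and values are streamed alternately; the stream emits one JSON
+// object per log line when it goes out of scope.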
+ +#pragma once + +#include +#include +#include +#include + +#include "logging/log_buffer.h" +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +class JSONWriter { + public: + JSONWriter() : state_(kExpectKey), first_element_(true), in_array_(false) { + stream_ << "{"; + } + + void AddKey(const std::string& key) { + assert(state_ == kExpectKey); + if (!first_element_) { + stream_ << ", "; + } + stream_ << "\"" << key << "\": "; + state_ = kExpectValue; + first_element_ = false; + } + + void AddValue(const char* value) { + assert(state_ == kExpectValue || state_ == kInArray); + if (state_ == kInArray && !first_element_) { + stream_ << ", "; + } + stream_ << "\"" << value << "\""; + if (state_ != kInArray) { + state_ = kExpectKey; + } + first_element_ = false; + } + + template + void AddValue(const T& value) { + assert(state_ == kExpectValue || state_ == kInArray); + if (state_ == kInArray && !first_element_) { + stream_ << ", "; + } + stream_ << value; + if (state_ != kInArray) { + state_ = kExpectKey; + } + first_element_ = false; + } + + void StartArray() { + assert(state_ == kExpectValue); + state_ = kInArray; + in_array_ = true; + stream_ << "["; + first_element_ = true; + } + + void EndArray() { + assert(state_ == kInArray); + state_ = kExpectKey; + in_array_ = false; + stream_ << "]"; + first_element_ = false; + } + + void StartObject() { + assert(state_ == kExpectValue); + state_ = kExpectKey; + stream_ << "{"; + first_element_ = true; + } + + void EndObject() { + assert(state_ == kExpectKey); + stream_ << "}"; + first_element_ = false; + } + + void StartArrayedObject() { + assert(state_ == kInArray && in_array_); + state_ = kExpectValue; + if (!first_element_) { + stream_ << ", "; + } + StartObject(); + } + + void EndArrayedObject() { + assert(in_array_); + EndObject(); + state_ = kInArray; + } + + std::string Get() const { return stream_.str(); } + + JSONWriter& operator<<(const char* val) { + if (state_ == kExpectKey) { + AddKey(val); + } else { + AddValue(val); + } + return *this; + } + + JSONWriter& operator<<(const std::string& val) { + return *this << val.c_str(); + } + + template + JSONWriter& operator<<(const T& val) { + assert(state_ != kExpectKey); + AddValue(val); + return *this; + } + + private: + enum JSONWriterState { + kExpectKey, + kExpectValue, + kInArray, + kInArrayedObject, + }; + JSONWriterState state_; + bool first_element_; + bool in_array_; + std::ostringstream stream_; +}; + +class EventLoggerStream { + public: + template + EventLoggerStream& operator<<(const T& val) { + MakeStream(); + *json_writer_ << val; + return *this; + } + + void StartArray() { json_writer_->StartArray(); } + void EndArray() { json_writer_->EndArray(); } + void StartObject() { json_writer_->StartObject(); } + void EndObject() { json_writer_->EndObject(); } + + ~EventLoggerStream(); + + private: + void MakeStream() { + if (!json_writer_) { + json_writer_ = new JSONWriter(); + *this << "time_micros" + << std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + } + } + friend class EventLogger; + explicit EventLoggerStream(Logger* logger); + explicit EventLoggerStream(LogBuffer* log_buffer, const size_t max_log_size); + // exactly one is non-nullptr + Logger* const logger_; + LogBuffer* const log_buffer_; + const size_t max_log_size_; // used only for log_buffer_ + // ownership + JSONWriter* json_writer_; +}; + +// here is an example of the output that will show up in the LOG: +// 2015/01/15-14:13:25.788019 1105ef000 EVENT_LOG_v1 
{"time_micros": +// 1421360005788015, "event": "table_file_creation", "file_number": 12, +// "file_size": 1909699} +class EventLogger { + public: + static const char* Prefix() { return "EVENT_LOG_v1"; } + + explicit EventLogger(Logger* logger) : logger_(logger) {} + EventLoggerStream Log() { return EventLoggerStream(logger_); } + EventLoggerStream LogToBuffer(LogBuffer* log_buffer) { + return EventLoggerStream(log_buffer, LogBuffer::kDefaultMaxLogSize); + } + EventLoggerStream LogToBuffer(LogBuffer* log_buffer, + const size_t max_log_size) { + return EventLoggerStream(log_buffer, max_log_size); + } + void Log(const JSONWriter& jwriter); + static void Log(Logger* logger, const JSONWriter& jwriter); + static void LogToBuffer( + LogBuffer* log_buffer, const JSONWriter& jwriter, + const size_t max_log_size = LogBuffer::kDefaultMaxLogSize); + + private: + Logger* logger_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/log_buffer.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/log_buffer.cc new file mode 100644 index 0000000000..2763e617f4 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/log_buffer.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "logging/log_buffer.h" + +#include "port/port.h" +#include "port/sys_time.h" + +namespace ROCKSDB_NAMESPACE { + +LogBuffer::LogBuffer(const InfoLogLevel log_level, Logger* info_log) + : log_level_(log_level), info_log_(info_log) {} + +void LogBuffer::AddLogToBuffer(size_t max_log_size, const char* format, + va_list ap) { + if (!info_log_ || log_level_ < info_log_->GetInfoLogLevel()) { + // Skip the level because of its level. + return; + } + + char* alloc_mem = arena_.AllocateAligned(max_log_size); + BufferedLog* buffered_log = new (alloc_mem) BufferedLog(); + char* p = buffered_log->message; + char* limit = alloc_mem + max_log_size - 1; + + // store the time + port::GetTimeOfDay(&(buffered_log->now_tv), nullptr); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + auto n = vsnprintf(p, limit - p, format, backup_ap); +#ifndef OS_WIN + // MS reports -1 when the buffer is too short + assert(n >= 0); +#endif + if (n > 0) { + p += n; + } else { + p = limit; + } + va_end(backup_ap); + } + + if (p > limit) { + p = limit; + } + + // Add '\0' to the end + *p = '\0'; + + logs_.push_back(buffered_log); +} + +void LogBuffer::FlushBufferToLog() { + for (BufferedLog* log : logs_) { + const time_t seconds = log->now_tv.tv_sec; + struct tm t; + if (port::LocalTimeR(&seconds, &t) != nullptr) { + Log(log_level_, info_log_, + "(Original Log Time %04d/%02d/%02d-%02d:%02d:%02d.%06d) %s", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, + t.tm_sec, static_cast(log->now_tv.tv_usec), log->message); + } + } + logs_.clear(); +} + +void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size, const char* format, + ...) { + if (log_buffer != nullptr) { + va_list ap; + va_start(ap, format); + log_buffer->AddLogToBuffer(max_log_size, format, ap); + va_end(ap); + } +} + +void LogToBuffer(LogBuffer* log_buffer, const char* format, ...) 
{
+  if (log_buffer != nullptr) {
+    va_list ap;
+    va_start(ap, format);
+    log_buffer->AddLogToBuffer(LogBuffer::kDefaultMaxLogSize, format, ap);
+    va_end(ap);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/log_buffer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/log_buffer.h
new file mode 100644
index 0000000000..92d38d10d1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/log_buffer.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <ctime>
+
+#include "memory/arena.h"
+#include "port/sys_time.h"
+#include "rocksdb/env.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Logger;
+
+// A class to buffer info log entries and flush them in the end.
+class LogBuffer {
+ public:
+  // log_level: the log level for all the logs
+  // info_log: logger to write the logs to
+  LogBuffer(const InfoLogLevel log_level, Logger* info_log);
+
+  // Add a log entry to the buffer. max_log_size indicates the maximum
+  // log size, including some metadata.
+  void AddLogToBuffer(size_t max_log_size, const char* format, va_list ap);
+
+  bool IsEmpty() const { return logs_.empty(); }
+
+  // Flush all buffered logs to the info log.
+  void FlushBufferToLog();
+  static const size_t kDefaultMaxLogSize = 512;
+
+ private:
+  // One log entry with its timestamp
+  struct BufferedLog {
+    port::TimeVal now_tv;  // Timestamp of the log
+    char message[1];       // Beginning of log message
+  };
+
+  const InfoLogLevel log_level_;
+  Logger* info_log_;
+  Arena arena_;
+  autovector<BufferedLog*> logs_;
+};
+
+// Add log to the LogBuffer for a delayed info logging. It can be used when
+// we want to add some logs inside a mutex.
+// max_log_size indicates the maximum log size, including some metadata.
+extern void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size,
+                        const char* format, ...);
+// Same as previous function, but with default max log size.
+extern void LogToBuffer(LogBuffer* log_buffer, const char* format, ...);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/logging.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/logging.h
new file mode 100644
index 0000000000..0fa882a786
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/logging/logging.h
@@ -0,0 +1,62 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Must not be included from any .h files to avoid polluting the namespace
+// with macros.
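+//
+// Illustrative usage sketch (assuming a Logger* named `info_log` and a
+// LogBuffer named `log_buffer`; both are placeholders):
+//
+//   ROCKS_LOG_INFO(info_log, "compaction finished, %d files", 4);
+//   ROCKS_LOG_BUFFER(&log_buffer, "deferred message %d", 42);
+//
+// Each macro prepends "[file:line]" to the format string via
+// ROCKS_LOG_PREPEND_FILE_LINE defined below.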
+
+#pragma once
+
+// Helper macros that include information about file name and line number
+#define ROCKS_LOG_STRINGIFY(x) #x
+#define ROCKS_LOG_TOSTRING(x) ROCKS_LOG_STRINGIFY(x)
+#define ROCKS_LOG_PREPEND_FILE_LINE(FMT) \
+  ("[%s:" ROCKS_LOG_TOSTRING(__LINE__) "] " FMT)
+
+inline const char* RocksLogShorterFileName(const char* file) {
+  // 18 is the length of "logging/logging.h".
+  // If the name of this file changes, please change this number, too.
+  return file + (sizeof(__FILE__) > 18 ? sizeof(__FILE__) - 18 : 0);
+}
+
+// Don't include file/line info in HEADER level
+#define ROCKS_LOG_HEADER(LGR, FMT, ...) \
+  ROCKSDB_NAMESPACE::Log(InfoLogLevel::HEADER_LEVEL, LGR, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_AT_LEVEL(LGR, LVL, FMT, ...)                           \
+  ROCKSDB_NAMESPACE::Log((LVL), (LGR), ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+                         RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_DEBUG(LGR, FMT, ...) \
+  ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::DEBUG_LEVEL, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_INFO(LGR, FMT, ...) \
+  ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::INFO_LEVEL, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_WARN(LGR, FMT, ...) \
+  ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::WARN_LEVEL, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_ERROR(LGR, FMT, ...) \
+  ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::ERROR_LEVEL, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_FATAL(LGR, FMT, ...) \
+  ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::FATAL_LEVEL, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_BUFFER(LOG_BUF, FMT, ...)                                 \
+  ROCKSDB_NAMESPACE::LogToBuffer(LOG_BUF, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+                                 RocksLogShorterFileName(__FILE__),         \
+                                 ##__VA_ARGS__)
+
+#define ROCKS_LOG_BUFFER_MAX_SZ(LOG_BUF, MAX_LOG_SIZE, FMT, ...) \
+  ROCKSDB_NAMESPACE::LogToBuffer(                                \
+      LOG_BUF, MAX_LOG_SIZE, ROCKS_LOG_PREPEND_FILE_LINE(FMT),   \
+      RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_DETAILS(LGR, FMT, ...) \
+  ;  // due to overhead by default skip such lines
+// ROCKS_LOG_DEBUG(LGR, FMT, ##__VA_ARGS__)
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/allocator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/allocator.h
new file mode 100644
index 0000000000..0d7cd60a99
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/allocator.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Abstract interface for allocating memory in blocks. This memory is freed
+// when the allocator object is destroyed. See the Arena class for more info.
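+//
+// Illustrative sketch: consumers take an Allocator& and stay agnostic to
+// the concrete arena behind it (`alloc` is a placeholder reference):
+//
+//   char* raw = alloc.Allocate(64);
+//   char* aligned = alloc.AllocateAligned(sizeof(std::max_align_t));
+//
+// There is no per-allocation free; everything is reclaimed when the
+// owning allocator is destroyed.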
+ +#pragma once +#include +#include + +#include "rocksdb/write_buffer_manager.h" + +namespace ROCKSDB_NAMESPACE { + +class Logger; + +class Allocator { + public: + virtual ~Allocator() {} + + virtual char* Allocate(size_t bytes) = 0; + virtual char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) = 0; + + virtual size_t BlockSize() const = 0; +}; + +class AllocTracker { + public: + explicit AllocTracker(WriteBufferManager* write_buffer_manager); + // No copying allowed + AllocTracker(const AllocTracker&) = delete; + void operator=(const AllocTracker&) = delete; + + ~AllocTracker(); + void Allocate(size_t bytes); + // Call when we're finished allocating memory so we can free it from + // the write buffer's limit. + void DoneAllocating(); + + void FreeMem(); + + bool is_freed() const { return write_buffer_manager_ == nullptr || freed_; } + + private: + WriteBufferManager* write_buffer_manager_; + std::atomic bytes_allocated_; + bool done_allocating_; + bool freed_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/arena.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/arena.cc new file mode 100644 index 0000000000..0a920203dc --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/arena.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
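+//
+// A minimal usage sketch for the Arena defined here (illustrative only):
+//
+//   Arena arena;                                 // starts on the inline block
+//   char* buf = arena.Allocate(128);             // unaligned, from block tail
+//   char* aligned = arena.AllocateAligned(256);  // aligned, from block head
+//   // all memory is released when `arena` goes out of scope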
+ +#include "memory/arena.h" + +#include + +#include "logging/logging.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "test_util/sync_point.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +size_t Arena::OptimizeBlockSize(size_t block_size) { + // Make sure block_size is in optimal range + block_size = std::max(Arena::kMinBlockSize, block_size); + block_size = std::min(Arena::kMaxBlockSize, block_size); + + // make sure block_size is the multiple of kAlignUnit + if (block_size % kAlignUnit != 0) { + block_size = (1 + block_size / kAlignUnit) * kAlignUnit; + } + + return block_size; +} + +Arena::Arena(size_t block_size, AllocTracker* tracker, size_t huge_page_size) + : kBlockSize(OptimizeBlockSize(block_size)), tracker_(tracker) { + assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && + kBlockSize % kAlignUnit == 0); + TEST_SYNC_POINT_CALLBACK("Arena::Arena:0", const_cast(&kBlockSize)); + alloc_bytes_remaining_ = sizeof(inline_block_); + blocks_memory_ += alloc_bytes_remaining_; + aligned_alloc_ptr_ = inline_block_; + unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_; + if (MemMapping::kHugePageSupported) { + hugetlb_size_ = huge_page_size; + if (hugetlb_size_ && kBlockSize > hugetlb_size_) { + hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_; + } + } + if (tracker_ != nullptr) { + tracker_->Allocate(kInlineSize); + } +} + +Arena::~Arena() { + if (tracker_ != nullptr) { + assert(tracker_->is_freed()); + tracker_->FreeMem(); + } +} + +char* Arena::AllocateFallback(size_t bytes, bool aligned) { + if (bytes > kBlockSize / 4) { + ++irregular_block_num; + // Object is more than a quarter of our block size. Allocate it separately + // to avoid wasting too much space in leftover bytes. + return AllocateNewBlock(bytes); + } + + // We waste the remaining space in the current block. + size_t size = 0; + char* block_head = nullptr; + if (MemMapping::kHugePageSupported && hugetlb_size_ > 0) { + size = hugetlb_size_; + block_head = AllocateFromHugePage(size); + } + if (!block_head) { + size = kBlockSize; + block_head = AllocateNewBlock(size); + } + alloc_bytes_remaining_ = size - bytes; + + if (aligned) { + aligned_alloc_ptr_ = block_head + bytes; + unaligned_alloc_ptr_ = block_head + size; + return block_head; + } else { + aligned_alloc_ptr_ = block_head; + unaligned_alloc_ptr_ = block_head + size - bytes; + return unaligned_alloc_ptr_; + } +} + +char* Arena::AllocateFromHugePage(size_t bytes) { + MemMapping mm = MemMapping::AllocateHuge(bytes); + auto addr = static_cast(mm.Get()); + if (addr) { + huge_blocks_.push_back(std::move(mm)); + blocks_memory_ += bytes; + if (tracker_ != nullptr) { + tracker_->Allocate(bytes); + } + } + return addr; +} + +char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size, + Logger* logger) { + if (MemMapping::kHugePageSupported && hugetlb_size_ > 0 && + huge_page_size > 0 && bytes > 0) { + // Allocate from a huge page TLB table. + size_t reserved_size = + ((bytes - 1U) / huge_page_size + 1U) * huge_page_size; + assert(reserved_size >= bytes); + + char* addr = AllocateFromHugePage(reserved_size); + if (addr == nullptr) { + ROCKS_LOG_WARN(logger, + "AllocateAligned fail to allocate huge TLB pages: %s", + errnoStr(errno).c_str()); + // fail back to malloc + } else { + return addr; + } + } + + size_t current_mod = + reinterpret_cast(aligned_alloc_ptr_) & (kAlignUnit - 1); + size_t slop = (current_mod == 0 ? 
0 : kAlignUnit - current_mod); + size_t needed = bytes + slop; + char* result; + if (needed <= alloc_bytes_remaining_) { + result = aligned_alloc_ptr_ + slop; + aligned_alloc_ptr_ += needed; + alloc_bytes_remaining_ -= needed; + } else { + // AllocateFallback always returns aligned memory + result = AllocateFallback(bytes, true /* aligned */); + } + assert((reinterpret_cast(result) & (kAlignUnit - 1)) == 0); + return result; +} + +char* Arena::AllocateNewBlock(size_t block_bytes) { + // NOTE: std::make_unique zero-initializes the block so is not appropriate + // here + char* block = new char[block_bytes]; + blocks_.push_back(std::unique_ptr(block)); + + size_t allocated_size; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + allocated_size = malloc_usable_size(block); +#ifndef NDEBUG + // It's hard to predict what malloc_usable_size() returns. + // A callback can allow users to change the costed size. + std::pair pair(&allocated_size, &block_bytes); + TEST_SYNC_POINT_CALLBACK("Arena::AllocateNewBlock:0", &pair); +#endif // NDEBUG +#else + allocated_size = block_bytes; +#endif // ROCKSDB_MALLOC_USABLE_SIZE + blocks_memory_ += allocated_size; + if (tracker_ != nullptr) { + tracker_->Allocate(allocated_size); + } + return block; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/arena.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/arena.h new file mode 100644 index 0000000000..39399aa71b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/arena.h @@ -0,0 +1,135 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// Arena is an implementation of Allocator class. For a request of small size, +// it allocates a block with pre-defined block size. For a request of big +// size, it uses malloc to directly get the requested size. + +#pragma once + +#include +#include + +#include "memory/allocator.h" +#include "port/mmap.h" +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +class Arena : public Allocator { + public: + // No copying allowed + Arena(const Arena&) = delete; + void operator=(const Arena&) = delete; + + static constexpr size_t kInlineSize = 2048; + static constexpr size_t kMinBlockSize = 4096; + static constexpr size_t kMaxBlockSize = 2u << 30; + + static constexpr unsigned kAlignUnit = alignof(std::max_align_t); + static_assert((kAlignUnit & (kAlignUnit - 1)) == 0, + "Pointer size should be power of 2"); + + // huge_page_size: if 0, don't use huge page TLB. If > 0 (should set to the + // supported hugepage size of the system), block allocation will try huge + // page TLB first. If allocation fails, will fall back to normal case. + explicit Arena(size_t block_size = kMinBlockSize, + AllocTracker* tracker = nullptr, size_t huge_page_size = 0); + ~Arena(); + + char* Allocate(size_t bytes) override; + + // huge_page_size: if >0, will try to allocate from huage page TLB. + // The argument will be the size of the page size for huge page TLB. Bytes + // will be rounded up to multiple of the page size to allocate through mmap + // anonymous option with huge page on. 
The extra space allocated will be + // wasted. If allocation fails, will fall back to normal case. To enable it, + // need to reserve huge pages for it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt for details. + // huge page allocation can fail. In this case it will fail back to + // normal cases. The messages will be logged to logger. So when calling with + // huge_page_tlb_size > 0, we highly recommend a logger is passed in. + // Otherwise, the error message will be printed out to stderr directly. + char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) override; + + // Returns an estimate of the total memory usage of data allocated + // by the arena (exclude the space allocated but not yet used for future + // allocations). + size_t ApproximateMemoryUsage() const { + return blocks_memory_ + blocks_.size() * sizeof(char*) - + alloc_bytes_remaining_; + } + + size_t MemoryAllocatedBytes() const { return blocks_memory_; } + + size_t AllocatedAndUnused() const { return alloc_bytes_remaining_; } + + // If an allocation is too big, we'll allocate an irregular block with the + // same size of that allocation. + size_t IrregularBlockNum() const { return irregular_block_num; } + + size_t BlockSize() const override { return kBlockSize; } + + bool IsInInlineBlock() const { + return blocks_.empty() && huge_blocks_.empty(); + } + + // check and adjust the block_size so that the return value is + // 1. in the range of [kMinBlockSize, kMaxBlockSize]. + // 2. the multiple of align unit. + static size_t OptimizeBlockSize(size_t block_size); + + private: + alignas(std::max_align_t) char inline_block_[kInlineSize]; + // Number of bytes allocated in one block + const size_t kBlockSize; + // Allocated memory blocks + std::deque> blocks_; + // Huge page allocations + std::deque huge_blocks_; + size_t irregular_block_num = 0; + + // Stats for current active block. + // For each block, we allocate aligned memory chucks from one end and + // allocate unaligned memory chucks from the other end. Otherwise the + // memory waste for alignment will be higher if we allocate both types of + // memory from one direction. + char* unaligned_alloc_ptr_ = nullptr; + char* aligned_alloc_ptr_ = nullptr; + // How many bytes left in currently active block? + size_t alloc_bytes_remaining_ = 0; + + size_t hugetlb_size_ = 0; + + char* AllocateFromHugePage(size_t bytes); + char* AllocateFallback(size_t bytes, bool aligned); + char* AllocateNewBlock(size_t block_bytes); + + // Bytes of memory in blocks allocated so far + size_t blocks_memory_ = 0; + // Non-owned + AllocTracker* tracker_; +}; + +inline char* Arena::Allocate(size_t bytes) { + // The semantics of what to return are a bit messy if we allow + // 0-byte allocations, so we disallow them here (we don't need + // them for our internal use). + assert(bytes > 0); + if (bytes <= alloc_bytes_remaining_) { + unaligned_alloc_ptr_ -= bytes; + alloc_bytes_remaining_ -= bytes; + return unaligned_alloc_ptr_; + } + return AllocateFallback(bytes, false /* unaligned */); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/concurrent_arena.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/concurrent_arena.cc new file mode 100644 index 0000000000..1619bd93b0 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/concurrent_arena.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
+// All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "memory/concurrent_arena.h"
+
+#include <thread>
+
+#include "port/port.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+thread_local size_t ConcurrentArena::tls_cpuid = 0;
+
+namespace {
+// If the shard block size is too large, in the worst case, every core
+// allocates a block without populating it. If the shard block size is
+// 1MB, 64 cores will quickly allocate 64MB, and may quickly trigger a
+// flush. Cap the size instead.
+const size_t kMaxShardBlockSize = size_t{128 * 1024};
+}  // namespace
+
+ConcurrentArena::ConcurrentArena(size_t block_size, AllocTracker* tracker,
+                                 size_t huge_page_size)
+    : shard_block_size_(std::min(kMaxShardBlockSize, block_size / 8)),
+      shards_(),
+      arena_(block_size, tracker, huge_page_size) {
+  Fixup();
+}
+
+ConcurrentArena::Shard* ConcurrentArena::Repick() {
+  auto shard_and_index = shards_.AccessElementAndIndex();
+  // even if we are cpu 0, use a non-zero tls_cpuid so we can tell we
+  // have repicked
+  tls_cpuid = shard_and_index.second | shards_.Size();
+  return shard_and_index.first;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/concurrent_arena.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/concurrent_arena.h
new file mode 100644
index 0000000000..f14507d302
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/concurrent_arena.h
@@ -0,0 +1,215 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <memory>
+#include <utility>
+
+#include "memory/allocator.h"
+#include "memory/arena.h"
+#include "port/lang.h"
+#include "port/likely.h"
+#include "util/core_local.h"
+#include "util/mutexlock.h"
+#include "util/thread_local.h"
+
+// Only generate field unused warning for padding array, or build under
+// GCC 4.8.1 will fail.
+#ifdef __clang__
+#define ROCKSDB_FIELD_UNUSED __attribute__((__unused__))
+#else
+#define ROCKSDB_FIELD_UNUSED
+#endif  // __clang__
+
+namespace ROCKSDB_NAMESPACE {
+
+class Logger;
+
+// ConcurrentArena wraps an Arena. It makes it thread safe using a fast
+// inlined spinlock, and adds small per-core allocation caches to avoid
+// contention for small allocations. To avoid any memory waste from the
+// per-core shards, they are kept small, they are lazily instantiated
+// only if ConcurrentArena actually notices concurrent use, and they
+// adjust their size so that there is no fragmentation waste when the
+// shard blocks are allocated from the underlying main arena.
+class ConcurrentArena : public Allocator {
+ public:
+  // block_size and huge_page_size are the same as for Arena (and are
+  // in fact just passed to the constructor of arena_.
The core-local + // shards compute their shard_block_size as a fraction of block_size + // that varies according to the hardware concurrency level. + explicit ConcurrentArena(size_t block_size = Arena::kMinBlockSize, + AllocTracker* tracker = nullptr, + size_t huge_page_size = 0); + + char* Allocate(size_t bytes) override { + return AllocateImpl(bytes, false /*force_arena*/, + [this, bytes]() { return arena_.Allocate(bytes); }); + } + + char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) override { + size_t rounded_up = ((bytes - 1) | (sizeof(void*) - 1)) + 1; + assert(rounded_up >= bytes && rounded_up < bytes + sizeof(void*) && + (rounded_up % sizeof(void*)) == 0); + + return AllocateImpl(rounded_up, huge_page_size != 0 /*force_arena*/, + [this, rounded_up, huge_page_size, logger]() { + return arena_.AllocateAligned(rounded_up, + huge_page_size, logger); + }); + } + + size_t ApproximateMemoryUsage() const { + std::unique_lock lock(arena_mutex_, std::defer_lock); + lock.lock(); + return arena_.ApproximateMemoryUsage() - ShardAllocatedAndUnused(); + } + + size_t MemoryAllocatedBytes() const { + return memory_allocated_bytes_.load(std::memory_order_relaxed); + } + + size_t AllocatedAndUnused() const { + return arena_allocated_and_unused_.load(std::memory_order_relaxed) + + ShardAllocatedAndUnused(); + } + + size_t IrregularBlockNum() const { + return irregular_block_num_.load(std::memory_order_relaxed); + } + + size_t BlockSize() const override { return arena_.BlockSize(); } + + private: + struct Shard { + char padding[40] ROCKSDB_FIELD_UNUSED; + mutable SpinMutex mutex; + char* free_begin_; + std::atomic allocated_and_unused_; + + Shard() : free_begin_(nullptr), allocated_and_unused_(0) {} + }; + + static thread_local size_t tls_cpuid; + + char padding0[56] ROCKSDB_FIELD_UNUSED; + + size_t shard_block_size_; + + CoreLocalArray shards_; + + Arena arena_; + mutable SpinMutex arena_mutex_; + std::atomic arena_allocated_and_unused_; + std::atomic memory_allocated_bytes_; + std::atomic irregular_block_num_; + + char padding1[56] ROCKSDB_FIELD_UNUSED; + + Shard* Repick(); + + size_t ShardAllocatedAndUnused() const { + size_t total = 0; + for (size_t i = 0; i < shards_.Size(); ++i) { + total += shards_.AccessAtCore(i)->allocated_and_unused_.load( + std::memory_order_relaxed); + } + return total; + } + + template + char* AllocateImpl(size_t bytes, bool force_arena, const Func& func) { + size_t cpu; + + // Go directly to the arena if the allocation is too large, or if + // we've never needed to Repick() and the arena mutex is available + // with no waiting. This keeps the fragmentation penalty of + // concurrency zero unless it might actually confer an advantage. 
+    std::unique_lock<SpinMutex> arena_lock(arena_mutex_, std::defer_lock);
+    if (bytes > shard_block_size_ / 4 || force_arena ||
+        ((cpu = tls_cpuid) == 0 &&
+         !shards_.AccessAtCore(0)->allocated_and_unused_.load(
+             std::memory_order_relaxed) &&
+         arena_lock.try_lock())) {
+      if (!arena_lock.owns_lock()) {
+        arena_lock.lock();
+      }
+      auto rv = func();
+      Fixup();
+      return rv;
+    }
+
+    // pick a shard from which to allocate
+    Shard* s = shards_.AccessAtCore(cpu & (shards_.Size() - 1));
+    if (!s->mutex.try_lock()) {
+      s = Repick();
+      s->mutex.lock();
+    }
+    std::unique_lock<SpinMutex> lock(s->mutex, std::adopt_lock);
+
+    size_t avail = s->allocated_and_unused_.load(std::memory_order_relaxed);
+    if (avail < bytes) {
+      // reload
+      std::lock_guard<SpinMutex> reload_lock(arena_mutex_);
+
+      // If the arena's current block is within a factor of 2 of the right
+      // size, we adjust our request to avoid arena waste.
+      auto exact = arena_allocated_and_unused_.load(std::memory_order_relaxed);
+      assert(exact == arena_.AllocatedAndUnused());
+
+      if (exact >= bytes && arena_.IsInInlineBlock()) {
+        // If we haven't exhausted arena's inline block yet, allocate from
+        // arena directly. This ensures that we'll do the first few small
+        // allocations without allocating any blocks.
+        // In particular this prevents empty memtables from using a
+        // disproportionately large amount of memory: a memtable allocates on
+        // the order of 1 KB of memory when created; we wouldn't want to
+        // allocate a full arena block (typically a few megabytes) for that,
+        // especially if there are thousands of empty memtables.
+        auto rv = func();
+        Fixup();
+        return rv;
+      }
+
+      avail = exact >= shard_block_size_ / 2 && exact < shard_block_size_ * 2
+                  ? exact
+                  : shard_block_size_;
+      s->free_begin_ = arena_.AllocateAligned(avail);
+      Fixup();
+    }
+    s->allocated_and_unused_.store(avail - bytes, std::memory_order_relaxed);
+
+    char* rv;
+    if ((bytes % sizeof(void*)) == 0) {
+      // aligned allocation from the beginning
+      rv = s->free_begin_;
+      s->free_begin_ += bytes;
+    } else {
+      // unaligned from the end
+      rv = s->free_begin_ + avail - bytes;
+    }
+    return rv;
+  }
+
+  void Fixup() {
+    arena_allocated_and_unused_.store(arena_.AllocatedAndUnused(),
+                                      std::memory_order_relaxed);
+    memory_allocated_bytes_.store(arena_.MemoryAllocatedBytes(),
+                                  std::memory_order_relaxed);
+    irregular_block_num_.store(arena_.IrregularBlockNum(),
+                               std::memory_order_relaxed);
+  }
+
+  ConcurrentArena(const ConcurrentArena&) = delete;
+  ConcurrentArena& operator=(const ConcurrentArena&) = delete;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/jemalloc_nodump_allocator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/jemalloc_nodump_allocator.cc
new file mode 100644
index 0000000000..d05248224d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/jemalloc_nodump_allocator.cc
@@ -0,0 +1,303 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
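+//
+// Illustrative usage of the NewJemallocNodumpAllocator() factory defined
+// below (the block-cache wiring is an assumption about the caller, not
+// something this file does):
+//
+//   JemallocAllocatorOptions jopts;
+//   std::shared_ptr<MemoryAllocator> allocator;
+//   Status s = NewJemallocNodumpAllocator(jopts, &allocator);
+//   // on success, `allocator` can back a block cache so that cached
+//   // blocks are excluded from core dumps, e.g. via
+//   // LRUCacheOptions::memory_allocator.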
+ +#include "memory/jemalloc_nodump_allocator.h" + +#include +#include + +#include "port/likely.h" +#include "port/port.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/fastrange.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +std::atomic JemallocNodumpAllocator::original_alloc_{nullptr}; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + +static std::unordered_map jemalloc_type_info = { + {"limit_tcache_size", + {offsetof(struct JemallocAllocatorOptions, limit_tcache_size), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"tcache_size_lower_bound", + {offsetof(struct JemallocAllocatorOptions, tcache_size_lower_bound), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"tcache_size_upper_bound", + {offsetof(struct JemallocAllocatorOptions, tcache_size_upper_bound), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_arenas", + {offsetof(struct JemallocAllocatorOptions, num_arenas), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +}; +bool JemallocNodumpAllocator::IsSupported(std::string* why) { +#ifndef ROCKSDB_JEMALLOC + *why = "Not compiled with ROCKSDB_JEMALLOC"; + return false; +#else + static const std::string unsupported = + "JemallocNodumpAllocator only available with jemalloc version >= 5 " + "and MADV_DONTDUMP is available."; + if (!HasJemalloc()) { + *why = unsupported; + return false; + } +#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + *why = unsupported; + return false; +#else + return true; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +#endif // ROCKSDB_MALLOC +} + +JemallocNodumpAllocator::JemallocNodumpAllocator( + JemallocAllocatorOptions& options) + : options_(options) +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + , + tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) { +#else // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +{ +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + RegisterOptions(&options_, &jemalloc_type_info); +} + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +JemallocNodumpAllocator::~JemallocNodumpAllocator() { + // Destroy tcache before destroying arena. + autovector tcache_list; + tcache_.Scrape(&tcache_list, nullptr); + for (void* tcache_index : tcache_list) { + DestroyThreadSpecificCache(tcache_index); + } + for (auto arena_index : arena_indexes_) { + // Destroy arena. Silently ignore error. + Status s = DestroyArena(arena_index); + assert(s.ok()); + s.PermitUncheckedError(); + } +} + +size_t JemallocNodumpAllocator::UsableSize(void* p, + size_t /*allocation_size*/) const { + return malloc_usable_size(static_cast(p)); +} + +void* JemallocNodumpAllocator::Allocate(size_t size) { + int tcache_flag = GetThreadSpecificCache(size); + uint32_t arena_index = GetArenaIndex(); + return mallocx(size, MALLOCX_ARENA(arena_index) | tcache_flag); +} + +void JemallocNodumpAllocator::Deallocate(void* p) { + // Obtain tcache. + size_t size = 0; + if (options_.limit_tcache_size) { + size = malloc_usable_size(p); + } + int tcache_flag = GetThreadSpecificCache(size); + // No need to pass arena index to dallocx(). Jemalloc will find arena index + // from its own metadata. 
+  dallocx(p, tcache_flag);
+}
+
+uint32_t JemallocNodumpAllocator::GetArenaIndex() const {
+  if (arena_indexes_.size() == 1) {
+    return arena_indexes_[0];
+  }
+
+  static std::atomic<uint32_t> next_seed = 0;
+  // Core-local may work in place of `thread_local` as we should be able to
+  // tolerate occasional stale reads in thread migration cases. However we need
+  // to make Random thread-safe and prevent cacheline bouncing. Whether this is
+  // worthwhile is still an open question.
+  thread_local Random tl_random(next_seed.fetch_add(1));
+  return arena_indexes_[FastRange32(tl_random.Next(), arena_indexes_.size())];
+}
+
+Status JemallocNodumpAllocator::InitializeArenas() {
+  assert(!init_);
+  init_ = true;
+
+  for (size_t i = 0; i < options_.num_arenas; i++) {
+    // Create arena.
+    unsigned arena_index;
+    size_t arena_index_size = sizeof(arena_index);
+    int ret =
+        mallctl("arenas.create", &arena_index, &arena_index_size, nullptr, 0);
+    if (ret != 0) {
+      return Status::Incomplete(
+          "Failed to create jemalloc arena, error code: " +
+          std::to_string(ret));
+    }
+    arena_indexes_.push_back(arena_index);
+
+    // Read existing hooks.
+    std::string key =
+        "arena." + std::to_string(arena_indexes_[i]) + ".extent_hooks";
+    extent_hooks_t* hooks;
+    size_t hooks_size = sizeof(hooks);
+    ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0);
+    if (ret != 0) {
+      return Status::Incomplete("Failed to read existing hooks, error code: " +
+                                std::to_string(ret));
+    }
+
+    // Store existing alloc.
+    extent_alloc_t* original_alloc = hooks->alloc;
+    extent_alloc_t* expected = nullptr;
+    bool success =
+        JemallocNodumpAllocator::original_alloc_.compare_exchange_strong(
+            expected, original_alloc);
+    if (!success && original_alloc != expected) {
+      // This could happen if jemalloc creates new arenas with different
+      // initial values in their `alloc` function pointers. See
+      // `original_alloc_` API doc for more details.
+      return Status::Incomplete("Original alloc conflict.");
+    }
+
+    // Set the custom hook.
+    per_arena_hooks_.emplace_back();
+    per_arena_hooks_.back().reset(new extent_hooks_t(*hooks));
+    per_arena_hooks_.back()->alloc = &JemallocNodumpAllocator::Alloc;
+    extent_hooks_t* hooks_ptr = per_arena_hooks_.back().get();
+    ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr));
+    if (ret != 0) {
+      return Status::Incomplete("Failed to set custom hook, error code: " +
+                                std::to_string(ret));
+    }
+  }
+  return Status::OK();
+}
+
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+Status JemallocNodumpAllocator::PrepareOptions(
+    const ConfigOptions& config_options) {
+  std::string message;
+
+  if (!IsSupported(&message)) {
+    return Status::NotSupported(message);
+  } else if (options_.limit_tcache_size &&
+             options_.tcache_size_lower_bound >=
+                 options_.tcache_size_upper_bound) {
+    return Status::InvalidArgument(
+        "tcache_size_lower_bound is larger than or equal to "
+        "tcache_size_upper_bound.");
+  } else if (options_.num_arenas < 1) {
+    return Status::InvalidArgument("num_arenas must be a positive integer");
+  } else if (IsMutable()) {
+    Status s = MemoryAllocator::PrepareOptions(config_options);
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+    if (s.ok()) {
+      s = InitializeArenas();
+    }
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+    return s;
+  } else {
+    // Already prepared
+    return Status::OK();
+  }
+}
+
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) {
+  // We always enable tcache. The only corner case is when there are a ton of
+  // threads accessing with low frequency, in which case it could consume a
+  // lot of memory (it may reach # threads * ~1MB) without bringing much
+  // benefit.
+  if (options_.limit_tcache_size && (size <= options_.tcache_size_lower_bound ||
+                                     size > options_.tcache_size_upper_bound)) {
+    return MALLOCX_TCACHE_NONE;
+  }
+  unsigned* tcache_index = reinterpret_cast<unsigned*>(tcache_.Get());
+  if (UNLIKELY(tcache_index == nullptr)) {
+    // Instantiate tcache.
+    tcache_index = new unsigned(0);
+    size_t tcache_index_size = sizeof(unsigned);
+    int ret =
+        mallctl("tcache.create", tcache_index, &tcache_index_size, nullptr, 0);
+    if (ret != 0) {
+      // No good way to expose the error. Silently disable tcache.
+      delete tcache_index;
+      return MALLOCX_TCACHE_NONE;
+    }
+    tcache_.Reset(static_cast<void*>(tcache_index));
+  }
+  return MALLOCX_TCACHE(*tcache_index);
+}
+void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr,
+                                     size_t size, size_t alignment, bool* zero,
+                                     bool* commit, unsigned arena_ind) {
+  extent_alloc_t* original_alloc =
+      original_alloc_.load(std::memory_order_relaxed);
+  assert(original_alloc != nullptr);
+  void* result = original_alloc(extent, new_addr, size, alignment, zero, commit,
+                                arena_ind);
+  if (result != nullptr) {
+    int ret = madvise(result, size, MADV_DONTDUMP);
+    if (ret != 0) {
+      fprintf(
+          stderr,
+          "JemallocNodumpAllocator failed to set MADV_DONTDUMP, error code: %d",
+          ret);
+      assert(false);
+    }
+  }
+  return result;
+}
+
+Status JemallocNodumpAllocator::DestroyArena(uint32_t arena_index) {
+  assert(arena_index != 0);
+  std::string key = "arena." + std::to_string(arena_index) + ".destroy";
+  int ret = mallctl(key.c_str(), nullptr, 0, nullptr, 0);
+  if (ret != 0) {
+    return Status::Incomplete("Failed to destroy jemalloc arena, error code: " +
+                              std::to_string(ret));
+  }
+  return Status::OK();
+}
+
+void JemallocNodumpAllocator::DestroyThreadSpecificCache(void* ptr) {
+  assert(ptr != nullptr);
+  unsigned* tcache_index = static_cast<unsigned*>(ptr);
+  size_t tcache_index_size = sizeof(unsigned);
+  int ret __attribute__((__unused__)) =
+      mallctl("tcache.destroy", nullptr, 0, tcache_index, tcache_index_size);
+  // Silently ignore error.
+  assert(ret == 0);
+  delete tcache_index;
+}
+
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+Status NewJemallocNodumpAllocator(
+    JemallocAllocatorOptions& options,
+    std::shared_ptr<MemoryAllocator>* memory_allocator) {
+  if (memory_allocator == nullptr) {
+    return Status::InvalidArgument("memory_allocator must be non-null.");
+  }
+#ifndef ROCKSDB_JEMALLOC
+  (void)options;
+  return Status::NotSupported("Not compiled with JEMALLOC");
+#else
+  std::unique_ptr<MemoryAllocator> allocator(
+      new JemallocNodumpAllocator(options));
+  Status s = allocator->PrepareOptions(ConfigOptions());
+  if (s.ok()) {
+    memory_allocator->reset(allocator.release());
+  }
+  return s;
+#endif
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/jemalloc_nodump_allocator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/jemalloc_nodump_allocator.h
new file mode 100644
index 0000000000..2bdbaeb328
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/jemalloc_nodump_allocator.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
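For context, the allocator declared in this header is normally consumed through NewJemallocNodumpAllocator and plugged into a cache. A minimal wiring sketch, assuming jemalloc support at build time; MakeNodumpBlockCache is an illustrative helper, not part of this patch.

#include <memory>
#include <string>

#include "rocksdb/cache.h"
#include "rocksdb/memory_allocator.h"

// Illustrative helper: route block cache allocations through the no-dump
// allocator so cached blocks are excluded from core dumps when supported.
std::shared_ptr<rocksdb::Cache> MakeNodumpBlockCache(size_t capacity) {
  rocksdb::JemallocAllocatorOptions jopts;
  std::shared_ptr<rocksdb::MemoryAllocator> allocator;
  rocksdb::Status s = rocksdb::NewJemallocNodumpAllocator(jopts, &allocator);

  rocksdb::LRUCacheOptions copts;
  copts.capacity = capacity;
  if (s.ok()) {
    // On unsupported builds s is NotSupported, and the cache silently falls
    // back to the default allocator.
    copts.memory_allocator = allocator;
  }
  return rocksdb::NewLRUCache(copts);
}

Since IsSupported() is checked inside PrepareOptions, the sketch only has to branch on the returned Status.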
+
+#pragma once
+
+#include <atomic>
+#include <vector>
+
+#include "port/jemalloc_helper.h"
+#include "port/port.h"
+#include "rocksdb/memory_allocator.h"
+#include "util/thread_local.h"
+#include "utilities/memory_allocators.h"
+
+#if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX)
+
+#include <sys/mman.h>
+
+#if (JEMALLOC_VERSION_MAJOR >= 5) && defined(MADV_DONTDUMP)
+#define ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+#endif  // (JEMALLOC_VERSION_MAJOR >= 5) && MADV_DONTDUMP
+#endif  // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX
+
+namespace ROCKSDB_NAMESPACE {
+
+// Allocation requests are randomly sharded across
+// `JemallocAllocatorOptions::num_arenas` arenas to reduce contention on per-
+// arena mutexes.
+class JemallocNodumpAllocator : public BaseMemoryAllocator {
+ public:
+  explicit JemallocNodumpAllocator(JemallocAllocatorOptions& options);
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  ~JemallocNodumpAllocator();
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+  static const char* kClassName() { return "JemallocNodumpAllocator"; }
+  const char* Name() const override { return kClassName(); }
+  static bool IsSupported() {
+    std::string unused;
+    return IsSupported(&unused);
+  }
+  static bool IsSupported(std::string* why);
+  bool IsMutable() const { return !init_; }
+
+  Status PrepareOptions(const ConfigOptions& config_options) override;
+
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  void* Allocate(size_t size) override;
+  void Deallocate(void* p) override;
+  size_t UsableSize(void* p, size_t allocation_size) const override;
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+ private:
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  Status InitializeArenas();
+
+  uint32_t GetArenaIndex() const;
+
+  // Custom alloc hook to replace jemalloc default alloc.
+  static void* Alloc(extent_hooks_t* extent, void* new_addr, size_t size,
+                     size_t alignment, bool* zero, bool* commit,
+                     unsigned arena_ind);
+
+  // Destroy arena on destruction of the allocator, or on failure.
+  static Status DestroyArena(uint32_t arena_index);
+
+  // Destroy tcache on destruction of the allocator, or thread exit.
+  static void DestroyThreadSpecificCache(void* ptr);
+
+  // Get or create tcache. Return flag suitable to use with `mallocx`:
+  // either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc).
+  int GetThreadSpecificCache(size_t size);
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  JemallocAllocatorOptions options_;
+
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  // A function pointer to jemalloc default alloc. Use atomic to make sure
+  // NewJemallocNodumpAllocator is thread-safe.
+  //
+  // Hack: original_alloc_ needs to be static for Alloc() to access it.
+  // Alloc() needs to be static to be passed to jemalloc as a function pointer.
+  // We can use a single process-wide value as long as we assume that any newly
+  // created arena has the same original value in its `alloc` function pointer.
+  static std::atomic<extent_alloc_t*> original_alloc_;
+
+  // Custom hooks have to outlive the corresponding arena.
+  std::vector<std::unique_ptr<extent_hooks_t>> per_arena_hooks_;
+
+  // Holds the thread-local tcache index.
+ ThreadLocalPtr tcache_; + + std::vector arena_indexes_; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + + bool init_ = false; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memkind_kmem_allocator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memkind_kmem_allocator.cc new file mode 100644 index 0000000000..635c2210e7 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memkind_kmem_allocator.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2019 Intel Corporation +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef MEMKIND +#include +#endif // MEMKIND + +#include "memory/memkind_kmem_allocator.h" + +namespace ROCKSDB_NAMESPACE { +Status MemkindKmemAllocator::PrepareOptions(const ConfigOptions& options) { + std::string message; + if (!IsSupported(&message)) { + return Status::NotSupported(message); + } else { + return MemoryAllocator::PrepareOptions(options); + } +} + +#ifdef MEMKIND +void* MemkindKmemAllocator::Allocate(size_t size) { + void* p = memkind_malloc(MEMKIND_DAX_KMEM, size); + if (p == NULL) { + throw std::bad_alloc(); + } + return p; +} + +void MemkindKmemAllocator::Deallocate(void* p) { + memkind_free(MEMKIND_DAX_KMEM, p); +} + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +size_t MemkindKmemAllocator::UsableSize(void* p, + size_t /*allocation_size*/) const { + return memkind_malloc_usable_size(MEMKIND_DAX_KMEM, p); +} +#endif // ROCKSDB_MALLOC_USABLE_SIZE +#endif // MEMKIND + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memkind_kmem_allocator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memkind_kmem_allocator.h new file mode 100644 index 0000000000..7176f17e33 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memkind_kmem_allocator.h @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2019 Intel Corporation +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
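The memkind-based allocator declared below is typically used the same way as the jemalloc one: construct it, let PrepareOptions verify support, and hand it to a cache. A sketch under the assumption that libmemkind is linked in; MakePmemBlockCache is an illustrative name, not part of this patch.

#include <memory>
#include <string>

#include "memory/memkind_kmem_allocator.h"
#include "rocksdb/cache.h"

// Illustrative helper: place the block cache on DAX KMEM NUMA nodes (e.g.
// persistent memory exposed by the kernel) when memkind is compiled in.
std::shared_ptr<rocksdb::Cache> MakePmemBlockCache(size_t capacity) {
  rocksdb::LRUCacheOptions opts;
  opts.capacity = capacity;
  std::string why;
  if (rocksdb::MemkindKmemAllocator::IsSupported(&why)) {
    opts.memory_allocator = std::make_shared<rocksdb::MemkindKmemAllocator>();
  }
  return rocksdb::NewLRUCache(opts);
}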
+ +#pragma once + +#include "rocksdb/memory_allocator.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { + +class MemkindKmemAllocator : public BaseMemoryAllocator { + public: + static const char* kClassName() { return "MemkindKmemAllocator"; } + const char* Name() const override { return kClassName(); } + static bool IsSupported() { + std::string unused; + return IsSupported(&unused); + } + + static bool IsSupported(std::string* msg) { +#ifdef MEMKIND + (void)msg; + return true; +#else + *msg = "Not compiled with MemKind"; + return false; +#endif + } + Status PrepareOptions(const ConfigOptions& options) override; + +#ifdef MEMKIND + void* Allocate(size_t size) override; + void Deallocate(void* p) override; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + size_t UsableSize(void* p, size_t /*allocation_size*/) const override; +#endif +#endif // MEMKIND +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memory_allocator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memory_allocator.cc new file mode 100644 index 0000000000..d0de26b94d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memory_allocator.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/memory_allocator.h" + +#include "memory/jemalloc_nodump_allocator.h" +#include "memory/memkind_kmem_allocator.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +static std::unordered_map ma_wrapper_type_info = { + {"target", OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, +}; + +static int RegisterBuiltinAllocators(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + DefaultMemoryAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new DefaultMemoryAllocator()); + return guard->get(); + }); + library.AddFactory( + CountedMemoryAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new CountedMemoryAllocator( + std::make_shared())); + return guard->get(); + }); + library.AddFactory( + JemallocNodumpAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* errmsg) { + if (JemallocNodumpAllocator::IsSupported(errmsg)) { + JemallocAllocatorOptions options; + guard->reset(new JemallocNodumpAllocator(options)); + } + return guard->get(); + }); + library.AddFactory( + MemkindKmemAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* errmsg) { + if (MemkindKmemAllocator::IsSupported(errmsg)) { + guard->reset(new MemkindKmemAllocator()); + } + return guard->get(); + }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +} // namespace + +MemoryAllocatorWrapper::MemoryAllocatorWrapper( + const std::shared_ptr& t) + : target_(t) { + RegisterOptions("", &target_, &ma_wrapper_type_info); +} + +Status MemoryAllocator::CreateFromString( + const ConfigOptions& options, const std::string& value, + 
std::shared_ptr<MemoryAllocator>* result) {
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterBuiltinAllocators(*(ObjectLibrary::Default().get()), "");
+  });
+  ConfigOptions copy = options;
+  copy.invoke_prepare_options = true;
+  return LoadManagedObject<MemoryAllocator>(copy, value, result);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memory_allocator_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memory_allocator_impl.h
new file mode 100644
index 0000000000..68aa35beb8
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memory_allocator_impl.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <algorithm>
+
+#include "rocksdb/memory_allocator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct CustomDeleter {
+  CustomDeleter(MemoryAllocator* a = nullptr) : allocator(a) {}
+
+  void operator()(char* ptr) const {
+    if (allocator) {
+      allocator->Deallocate(reinterpret_cast<void*>(ptr));
+    } else {
+      delete[] ptr;
+    }
+  }
+
+  MemoryAllocator* allocator;
+};
+
+using CacheAllocationPtr = std::unique_ptr<char[], CustomDeleter>;
+
+inline CacheAllocationPtr AllocateBlock(size_t size,
+                                        MemoryAllocator* allocator) {
+  if (allocator) {
+    auto block = reinterpret_cast<char*>(allocator->Allocate(size));
+    return CacheAllocationPtr(block, allocator);
+  }
+  return CacheAllocationPtr(new char[size]);
+}
+
+inline CacheAllocationPtr AllocateAndCopyBlock(const Slice& data,
+                                               MemoryAllocator* allocator) {
+  CacheAllocationPtr cap = AllocateBlock(data.size(), allocator);
+  std::copy_n(data.data(), data.size(), cap.get());
+  return cap;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memory_usage.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memory_usage.h
new file mode 100644
index 0000000000..76b9bd1305
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memory/memory_usage.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstddef>
+#include <unordered_map>
+#ifdef USE_FOLLY
+#include <folly/container/F14Map.h>
+#endif
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Helper methods to estimate memory usage by std containers.
+
+template <class Key, class Value, class Hash>
+size_t ApproximateMemoryUsage(
+    const std::unordered_map<Key, Value, Hash>& umap) {
+  using Map = std::unordered_map<Key, Value, Hash>;
+  return sizeof(umap) +
+         // Size of all items plus a next pointer for each item.
+         (sizeof(typename Map::value_type) + sizeof(void*)) * umap.size() +
+         // Size of hash buckets.
+         umap.bucket_count() * sizeof(void*);
+}
+
+#ifdef USE_FOLLY
+template <class Key, class Value>
+size_t ApproximateMemoryUsage(const folly::F14FastMap<Key, Value>& umap) {
+  return sizeof(umap) + umap.getAllocatedMemorySize();
+}
+#endif
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/alloc_tracker.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/alloc_tracker.cc
new file mode 100644
index 0000000000..4c6d354319
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/alloc_tracker.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cassert>
+
+#include "memory/allocator.h"
+#include "memory/arena.h"
+#include "rocksdb/write_buffer_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+AllocTracker::AllocTracker(WriteBufferManager* write_buffer_manager)
+    : write_buffer_manager_(write_buffer_manager),
+      bytes_allocated_(0),
+      done_allocating_(false),
+      freed_(false) {}
+
+AllocTracker::~AllocTracker() { FreeMem(); }
+
+void AllocTracker::Allocate(size_t bytes) {
+  assert(write_buffer_manager_ != nullptr);
+  if (write_buffer_manager_->enabled() ||
+      write_buffer_manager_->cost_to_cache()) {
+    bytes_allocated_.fetch_add(bytes, std::memory_order_relaxed);
+    write_buffer_manager_->ReserveMem(bytes);
+  }
+}
+
+void AllocTracker::DoneAllocating() {
+  if (write_buffer_manager_ != nullptr && !done_allocating_) {
+    if (write_buffer_manager_->enabled() ||
+        write_buffer_manager_->cost_to_cache()) {
+      write_buffer_manager_->ScheduleFreeMem(
+          bytes_allocated_.load(std::memory_order_relaxed));
+    } else {
+      assert(bytes_allocated_.load(std::memory_order_relaxed) == 0);
+    }
+    done_allocating_ = true;
+  }
+}
+
+void AllocTracker::FreeMem() {
+  if (!done_allocating_) {
+    DoneAllocating();
+  }
+  if (write_buffer_manager_ != nullptr && !freed_) {
+    if (write_buffer_manager_->enabled() ||
+        write_buffer_manager_->cost_to_cache()) {
+      write_buffer_manager_->FreeMem(
+          bytes_allocated_.load(std::memory_order_relaxed));
+    } else {
+      assert(bytes_allocated_.load(std::memory_order_relaxed) == 0);
+    }
+    freed_ = true;
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/hash_linklist_rep.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/hash_linklist_rep.cc
new file mode 100644
index 0000000000..9e60f9be37
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/hash_linklist_rep.cc
@@ -0,0 +1,924 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+
+#include <algorithm>
+#include <atomic>
+
+#include "db/memtable.h"
+#include "memory/arena.h"
+#include "memtable/skiplist.h"
+#include "monitoring/histogram.h"
+#include "port/port.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+using Key = const char*;
+using MemtableSkipList = SkipList<const char*, const MemTableRep::KeyComparator&>;
+using Pointer = std::atomic<void*>;
+
+// A data structure used as the header of a linked list of a hash bucket.
+struct BucketHeader {
+  Pointer next;
+  std::atomic<uint32_t> num_entries;
+
+  explicit BucketHeader(void* n, uint32_t count)
+      : next(n), num_entries(count) {}
+
+  bool IsSkipListBucket() {
+    return next.load(std::memory_order_relaxed) == this;
+  }
+
+  uint32_t GetNumEntries() const {
+    return num_entries.load(std::memory_order_relaxed);
+  }
+
+  // REQUIRES: called from single-threaded Insert()
+  void IncNumEntries() {
+    // Only one thread can write at a time, so there is no need for an atomic
+    // increment. Update it with a relaxed load and store.
+    num_entries.store(GetNumEntries() + 1, std::memory_order_relaxed);
+  }
+};
+
+// A data structure used as the header of a skip list of a hash bucket.
+struct SkipListBucketHeader {
+  BucketHeader Counting_header;
+  MemtableSkipList skip_list;
+
+  explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp,
+                                Allocator* allocator, uint32_t count)
+      : Counting_header(this,  // Pointing to itself to indicate header type.
+                        count),
+        skip_list(cmp, allocator) {}
+};
+
+struct Node {
+  // Accessors/mutators for links. Wrapped in methods so we can
+  // add the appropriate barriers as necessary.
+  Node* Next() {
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return next_.load(std::memory_order_acquire);
+  }
+  void SetNext(Node* x) {
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    next_.store(x, std::memory_order_release);
+  }
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next() { return next_.load(std::memory_order_relaxed); }
+
+  void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); }
+
+  // Needed for the placement new below, which is fine
+  Node() {}
+
+ private:
+  std::atomic<Node*> next_;
+
+  // Prohibit copying due to the below
+  Node(const Node&) = delete;
+  Node& operator=(const Node&) = delete;
+
+ public:
+  char key[1];
+};
+
+// Memory structure of the mem table:
+// It is a hash table, where each bucket points to one entry, a linked list,
+// or a skip list. In order to track the total number of records in a bucket,
+// to determine whether it should switch to a skip list, a header is added
+// just to indicate the number of entries in the bucket.
+//
+//
+//          +-----> NULL    Case 1. Empty bucket
+//          |
+//          |
+//          | +---> +-------+
+//          |       | Next  +--> NULL
+//          |       +-------+
+//  +-----+ |       |       |  Case 2. One Entry in bucket.
+//  |     +-+       |  Data |          next pointer points to
+//  +-----+         |       |          NULL. All other cases
+//  |     |         |       |          next pointer is not NULL.
+//  +-----+         +-------+
+//  |     +---+
+//  +-----+   +->  +-------+  +>  +-------+  +-> +-------+
+//  |     |   |    | Next  +--+   | Next  +--+   | Next  +-->NULL
+//  +-----+   |    +-------+      +-------+      +-------+
+//  |     +---+    | Count |      |       |      |       |
+//  +-----+        +-------+      |  Data |      |  Data |
+//  |     |                       |       |      |       |
+//  +-----+  Case 3.              |       |      |       |
+//  |     |  A header             +-------+      +-------+
+//  +-----+  points to
+//  |     |  a linked list. Count indicates total number
+//  +-----+  of rows in this bucket.
+//  |     |
+//  +-----+   +-> +-------+ <--+
+//  |     |   |   | Next  +----+
+//  +-----+   |   +-------+        Case 4. A header points to a skip
+//  |     +---+   | Count |                list and next pointer points to
+//  +-----+       +-------+                itself, to distinguish case 3 or 4.
+//  |     |       |  Skip +-->             Count is still kept, to indicate the
+//  +-----+       |  List  |   Data        total number of entries in the
+//  |     |       |        +-->            bucket, for debugging purposes.
+//  +-----+       |        |
+//  |     |       +-------+
+//  +-----+
+//
+// We don't have a data race when changing cases because:
+// (1) When changing from case 2->3, we create a new bucket header, put the
+//     single node there first without changing the original node, and do a
+//     release store when changing the bucket pointer. In that case, a reader
+//     who sees a stale value of the bucket pointer will still read this node,
+//     while a reader who sees the new value sees a fully initialized header,
+//     because of the release store.
+// (2) When changing from case 3->4, a new header is created whose skip list
+//     points to the data, before doing a release store to change the bucket
+//     pointer. The old header and nodes are never changed, so any reader that
+//     sees any of those existing pointers is guaranteed to be able to iterate
+//     to the end of the linked list.
+// (3) The header's next pointer in case 3 might change, but it is never equal
+//     to the header itself, so no matter whether a reader sees a stale or a
+//     newer value, it can correctly distinguish case 3 from case 4.
+//
+// The reason we use case 2 is to keep the format efficient when bucket
+// utilization is relatively low. If we used case 3 for single-entry buckets,
+// we would waste 12 bytes for every entry, which can be a significant loss of
+// memory utilization.
+class HashLinkListRep : public MemTableRep {
+ public:
+  HashLinkListRep(const MemTableRep::KeyComparator& compare,
+                  Allocator* allocator, const SliceTransform* transform,
+                  size_t bucket_size, uint32_t threshold_use_skiplist,
+                  size_t huge_page_tlb_size, Logger* logger,
+                  int bucket_entries_logging_threshold,
+                  bool if_log_bucket_dist_when_flash);
+
+  KeyHandle Allocate(const size_t len, char** buf) override;
+
+  void Insert(KeyHandle handle) override;
+
+  bool Contains(const char* key) const override;
+
+  size_t ApproximateMemoryUsage() override;
+
+  void Get(const LookupKey& k, void* callback_args,
+           bool (*callback_func)(void* arg, const char* entry)) override;
+
+  ~HashLinkListRep() override;
+
+  MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override;
+
+  MemTableRep::Iterator* GetDynamicPrefixIterator(
+      Arena* arena = nullptr) override;
+
+ private:
+  friend class DynamicIterator;
+
+  size_t bucket_size_;
+
+  // Maps slices (which are transformed user keys) to buckets of keys sharing
+  // the same transform.
+  Pointer* buckets_;
+
+  const uint32_t threshold_use_skiplist_;
+
+  // The user-supplied transform whose domain is the user keys.
+  const SliceTransform* transform_;
+
+  const MemTableRep::KeyComparator& compare_;
+
+  Logger* logger_;
+  int bucket_entries_logging_threshold_;
+  bool if_log_bucket_dist_when_flash_;
+
+  bool LinkListContains(Node* head, const Slice& key) const;
+
+  bool IsEmptyBucket(Pointer& bucket_pointer) const {
+    return bucket_pointer.load(std::memory_order_acquire) == nullptr;
+  }
+
+  // Precondition: GetLinkListFirstNode() must have been called first and
+  // returned null, so this must be a skip list bucket
+  SkipListBucketHeader* GetSkipListBucketHeader(Pointer& bucket_pointer) const;
+
+  // Returning nullptr indicates it is a skip list bucket.
+ Node* GetLinkListFirstNode(Pointer& bucket_pointer) const; + + Slice GetPrefix(const Slice& internal_key) const { + return transform_->Transform(ExtractUserKey(internal_key)); + } + + size_t GetHash(const Slice& slice) const { + return GetSliceRangedNPHash(slice, bucket_size_); + } + + Pointer& GetBucket(size_t i) const { return buckets_[i]; } + + Pointer& GetBucket(const Slice& slice) const { + return GetBucket(GetHash(slice)); + } + + bool Equal(const Slice& a, const Key& b) const { + return (compare_(b, a) == 0); + } + + bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } + + bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, internal_key) < 0); + } + + bool KeyIsAfterNode(const Key& key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, key) < 0); + } + + bool KeyIsAfterOrAtNode(const Slice& internal_key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, internal_key) <= 0); + } + + bool KeyIsAfterOrAtNode(const Key& key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, key) <= 0); + } + + Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const; + Node* FindLessOrEqualInBucket(Node* head, const Slice& key) const; + + class FullListIterator : public MemTableRep::Iterator { + public: + explicit FullListIterator(MemtableSkipList* list, Allocator* allocator) + : iter_(list), full_list_(list), allocator_(allocator) {} + + ~FullListIterator() override {} + + // Returns true iff the iterator is positioned at a valid node. + bool Valid() const override { return iter_.Valid(); } + + // Returns the key at the current position. + // REQUIRES: Valid() + const char* key() const override { + assert(Valid()); + return iter_.key(); + } + + // Advances to the next position. + // REQUIRES: Valid() + void Next() override { + assert(Valid()); + iter_.Next(); + } + + // Advances to the previous position. + // REQUIRES: Valid() + void Prev() override { + assert(Valid()); + iter_.Prev(); + } + + // Advance to the first entry with a key >= target + void Seek(const Slice& internal_key, const char* memtable_key) override { + const char* encoded_key = (memtable_key != nullptr) + ? memtable_key + : EncodeKey(&tmp_, internal_key); + iter_.Seek(encoded_key); + } + + // Retreat to the last entry with a key <= target + void SeekForPrev(const Slice& internal_key, + const char* memtable_key) override { + const char* encoded_key = (memtable_key != nullptr) + ? memtable_key + : EncodeKey(&tmp_, internal_key); + iter_.SeekForPrev(encoded_key); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + void SeekToFirst() override { iter_.SeekToFirst(); } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + void SeekToLast() override { iter_.SeekToLast(); } + + private: + MemtableSkipList::Iterator iter_; + // To destruct with the iterator. 
+ std::unique_ptr full_list_; + std::unique_ptr allocator_; + std::string tmp_; // For passing to EncodeKey + }; + + class LinkListIterator : public MemTableRep::Iterator { + public: + explicit LinkListIterator(const HashLinkListRep* const hash_link_list_rep, + Node* head) + : hash_link_list_rep_(hash_link_list_rep), + head_(head), + node_(nullptr) {} + + ~LinkListIterator() override {} + + // Returns true iff the iterator is positioned at a valid node. + bool Valid() const override { return node_ != nullptr; } + + // Returns the key at the current position. + // REQUIRES: Valid() + const char* key() const override { + assert(Valid()); + return node_->key; + } + + // Advances to the next position. + // REQUIRES: Valid() + void Next() override { + assert(Valid()); + node_ = node_->Next(); + } + + // Advances to the previous position. + // REQUIRES: Valid() + void Prev() override { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Advance to the first entry with a key >= target + void Seek(const Slice& internal_key, + const char* /*memtable_key*/) override { + node_ = + hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, internal_key); + } + + // Retreat to the last entry with a key <= target + void SeekForPrev(const Slice& /*internal_key*/, + const char* /*memtable_key*/) override { + // Since we do not support Prev() + // We simply do not support SeekForPrev + Reset(nullptr); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + void SeekToFirst() override { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + void SeekToLast() override { + // Prefix iterator does not support total order. 
+ // We simply set the iterator to invalid state + Reset(nullptr); + } + + protected: + void Reset(Node* head) { + head_ = head; + node_ = nullptr; + } + + private: + friend class HashLinkListRep; + const HashLinkListRep* const hash_link_list_rep_; + Node* head_; + Node* node_; + + virtual void SeekToHead() { node_ = head_; } + }; + + class DynamicIterator : public HashLinkListRep::LinkListIterator { + public: + explicit DynamicIterator(HashLinkListRep& memtable_rep) + : HashLinkListRep::LinkListIterator(&memtable_rep, nullptr), + memtable_rep_(memtable_rep) {} + + // Advance to the first entry with a key >= target + void Seek(const Slice& k, const char* memtable_key) override { + auto transformed = memtable_rep_.GetPrefix(k); + Pointer& bucket = memtable_rep_.GetBucket(transformed); + + if (memtable_rep_.IsEmptyBucket(bucket)) { + skip_list_iter_.reset(); + Reset(nullptr); + } else { + Node* first_linked_list_node = + memtable_rep_.GetLinkListFirstNode(bucket); + if (first_linked_list_node != nullptr) { + // The bucket is organized as a linked list + skip_list_iter_.reset(); + Reset(first_linked_list_node); + HashLinkListRep::LinkListIterator::Seek(k, memtable_key); + + } else { + SkipListBucketHeader* skip_list_header = + memtable_rep_.GetSkipListBucketHeader(bucket); + assert(skip_list_header != nullptr); + // The bucket is organized as a skip list + if (!skip_list_iter_) { + skip_list_iter_.reset( + new MemtableSkipList::Iterator(&skip_list_header->skip_list)); + } else { + skip_list_iter_->SetList(&skip_list_header->skip_list); + } + if (memtable_key != nullptr) { + skip_list_iter_->Seek(memtable_key); + } else { + IterKey encoded_key; + encoded_key.EncodeLengthPrefixedKey(k); + skip_list_iter_->Seek(encoded_key.GetUserKey().data()); + } + } + } + } + + bool Valid() const override { + if (skip_list_iter_) { + return skip_list_iter_->Valid(); + } + return HashLinkListRep::LinkListIterator::Valid(); + } + + const char* key() const override { + if (skip_list_iter_) { + return skip_list_iter_->key(); + } + return HashLinkListRep::LinkListIterator::key(); + } + + void Next() override { + if (skip_list_iter_) { + skip_list_iter_->Next(); + } else { + HashLinkListRep::LinkListIterator::Next(); + } + } + + private: + // the underlying memtable + const HashLinkListRep& memtable_rep_; + std::unique_ptr skip_list_iter_; + }; + + class EmptyIterator : public MemTableRep::Iterator { + // This is used when there wasn't a bucket. It is cheaper than + // instantiating an empty bucket over which to iterate. + public: + EmptyIterator() {} + bool Valid() const override { return false; } + const char* key() const override { + assert(false); + return nullptr; + } + void Next() override {} + void Prev() override {} + void Seek(const Slice& /*user_key*/, + const char* /*memtable_key*/) override {} + void SeekForPrev(const Slice& /*user_key*/, + const char* /*memtable_key*/) override {} + void SeekToFirst() override {} + void SeekToLast() override {} + + private: + }; +}; + +HashLinkListRep::HashLinkListRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* transform, size_t bucket_size, + uint32_t threshold_use_skiplist, size_t huge_page_tlb_size, Logger* logger, + int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash) + : MemTableRep(allocator), + bucket_size_(bucket_size), + // Threshold to use skip list doesn't make sense if less than 3, so we + // force it to be minimum of 3 to simplify implementation. 
+ threshold_use_skiplist_(std::max(threshold_use_skiplist, 3U)), + transform_(transform), + compare_(compare), + logger_(logger), + bucket_entries_logging_threshold_(bucket_entries_logging_threshold), + if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) { + char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size, + huge_page_tlb_size, logger); + + buckets_ = new (mem) Pointer[bucket_size]; + + for (size_t i = 0; i < bucket_size_; ++i) { + buckets_[i].store(nullptr, std::memory_order_relaxed); + } +} + +HashLinkListRep::~HashLinkListRep() {} + +KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) { + char* mem = allocator_->AllocateAligned(sizeof(Node) + len); + Node* x = new (mem) Node(); + *buf = x->key; + return static_cast(x); +} + +SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader( + Pointer& bucket_pointer) const { + Pointer* first_next_pointer = + static_cast(bucket_pointer.load(std::memory_order_acquire)); + assert(first_next_pointer != nullptr); + assert(first_next_pointer->load(std::memory_order_relaxed) != nullptr); + + // Counting header + BucketHeader* header = reinterpret_cast(first_next_pointer); + assert(header->IsSkipListBucket()); + assert(header->GetNumEntries() > threshold_use_skiplist_); + auto* skip_list_bucket_header = + reinterpret_cast(header); + assert(skip_list_bucket_header->Counting_header.next.load( + std::memory_order_relaxed) == header); + return skip_list_bucket_header; +} + +Node* HashLinkListRep::GetLinkListFirstNode(Pointer& bucket_pointer) const { + Pointer* first_next_pointer = + static_cast(bucket_pointer.load(std::memory_order_acquire)); + assert(first_next_pointer != nullptr); + if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { + // Single entry bucket + return reinterpret_cast(first_next_pointer); + } + + // It is possible that after we fetch first_next_pointer it is modified + // and the next is not null anymore. In this case, the bucket should have been + // modified to a counting header, so we should reload the first_next_pointer + // to make sure we see the update. + first_next_pointer = + static_cast(bucket_pointer.load(std::memory_order_acquire)); + // Counting header + BucketHeader* header = reinterpret_cast(first_next_pointer); + if (!header->IsSkipListBucket()) { + assert(header->GetNumEntries() <= threshold_use_skiplist_); + return reinterpret_cast( + header->next.load(std::memory_order_acquire)); + } + assert(header->GetNumEntries() > threshold_use_skiplist_); + return nullptr; +} + +void HashLinkListRep::Insert(KeyHandle handle) { + Node* x = static_cast(handle); + assert(!Contains(x->key)); + Slice internal_key = GetLengthPrefixedSlice(x->key); + auto transformed = GetPrefix(internal_key); + auto& bucket = buckets_[GetHash(transformed)]; + Pointer* first_next_pointer = + static_cast(bucket.load(std::memory_order_relaxed)); + + if (first_next_pointer == nullptr) { + // Case 1. empty bucket + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. + x->NoBarrier_SetNext(nullptr); + bucket.store(x, std::memory_order_release); + return; + } + + BucketHeader* header = nullptr; + if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { + // Case 2. only one entry in the bucket + // Need to convert to a Counting bucket and turn to case 4. + Node* first = reinterpret_cast(first_next_pointer); + // Need to add a bucket header. 
+    // We have to first convert it to a bucket with a header before inserting
+    // the new node. Otherwise, we might need to change the next pointer of
+    // first. In that case, a reader might see that the next pointer is NULL
+    // and wrongly think the node is a bucket header.
+    auto* mem = allocator_->AllocateAligned(sizeof(BucketHeader));
+    header = new (mem) BucketHeader(first, 1);
+    bucket.store(header, std::memory_order_release);
+  } else {
+    header = reinterpret_cast<BucketHeader*>(first_next_pointer);
+    if (header->IsSkipListBucket()) {
+      // Case 4. Bucket is already a skip list
+      assert(header->GetNumEntries() > threshold_use_skiplist_);
+      auto* skip_list_bucket_header =
+          reinterpret_cast<SkipListBucketHeader*>(header);
+      // Only one thread can execute Insert() at a time. No need for an
+      // atomic increment.
+      skip_list_bucket_header->Counting_header.IncNumEntries();
+      skip_list_bucket_header->skip_list.Insert(x->key);
+      return;
+    }
+  }
+
+  if (bucket_entries_logging_threshold_ > 0 &&
+      header->GetNumEntries() ==
+          static_cast<uint32_t>(bucket_entries_logging_threshold_)) {
+    Info(logger_,
+         "HashLinkedList bucket %" ROCKSDB_PRIszt
+         " has more than %d "
+         "entries. Key to insert: %s",
+         GetHash(transformed), header->GetNumEntries(),
+         GetLengthPrefixedSlice(x->key).ToString(true).c_str());
+  }
+
+  if (header->GetNumEntries() == threshold_use_skiplist_) {
+    // Case 3. The number of entries reaches the threshold, so we need to
+    // convert to a skip list.
+    LinkListIterator bucket_iter(
+        this, reinterpret_cast<Node*>(
+                  first_next_pointer->load(std::memory_order_relaxed)));
+    auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader));
+    SkipListBucketHeader* new_skip_list_header = new (mem)
+        SkipListBucketHeader(compare_, allocator_, header->GetNumEntries() + 1);
+    auto& skip_list = new_skip_list_header->skip_list;
+
+    // Add all current entries to the skip list
+    for (bucket_iter.SeekToHead(); bucket_iter.Valid(); bucket_iter.Next()) {
+      skip_list.Insert(bucket_iter.key());
+    }
+
+    // insert the new entry
+    skip_list.Insert(x->key);
+    // Set the bucket
+    bucket.store(new_skip_list_header, std::memory_order_release);
+  } else {
+    // Case 5. Need to insert into the sorted linked list without changing
+    // the header.
+    Node* first =
+        reinterpret_cast<Node*>(header->next.load(std::memory_order_relaxed));
+    assert(first != nullptr);
+    // Advance the counter, unless the bucket needs to be converted to a skip
+    // list. In that case, we need to make sure the previous count never
+    // exceeds threshold_use_skiplist_, to prevent readers from casting the
+    // bucket to the wrong format.
+    header->IncNumEntries();
+
+    Node* cur = first;
+    Node* prev = nullptr;
+    while (true) {
+      if (cur == nullptr) {
+        break;
+      }
+      Node* next = cur->Next();
+      // Make sure the lists are sorted.
+      // If cur points to the head or next is nullptr, this is trivially
+      // satisfied.
+      assert((cur == first) || (next == nullptr) ||
+             KeyIsAfterNode(next->key, cur));
+      if (KeyIsAfterNode(internal_key, cur)) {
+        // Keep searching in this list
+        prev = cur;
+        cur = next;
+      } else {
+        break;
+      }
+    }
+
+    // Our data structure does not allow duplicate insertion
+    assert(cur == nullptr || !Equal(x->key, cur->key));
+
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+ x->NoBarrier_SetNext(cur); + + if (prev) { + prev->SetNext(x); + } else { + header->next.store(static_cast(x), std::memory_order_release); + } + } +} + +bool HashLinkListRep::Contains(const char* key) const { + Slice internal_key = GetLengthPrefixedSlice(key); + + auto transformed = GetPrefix(internal_key); + Pointer& bucket = GetBucket(transformed); + if (IsEmptyBucket(bucket)) { + return false; + } + + Node* linked_list_node = GetLinkListFirstNode(bucket); + if (linked_list_node != nullptr) { + return LinkListContains(linked_list_node, internal_key); + } + + SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket); + if (skip_list_header != nullptr) { + return skip_list_header->skip_list.Contains(key); + } + return false; +} + +size_t HashLinkListRep::ApproximateMemoryUsage() { + // Memory is always allocated from the allocator. + return 0; +} + +void HashLinkListRep::Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) { + auto transformed = transform_->Transform(k.user_key()); + Pointer& bucket = GetBucket(transformed); + + if (IsEmptyBucket(bucket)) { + return; + } + + auto* link_list_head = GetLinkListFirstNode(bucket); + if (link_list_head != nullptr) { + LinkListIterator iter(this, link_list_head); + for (iter.Seek(k.internal_key(), nullptr); + iter.Valid() && callback_func(callback_args, iter.key()); + iter.Next()) { + } + } else { + auto* skip_list_header = GetSkipListBucketHeader(bucket); + if (skip_list_header != nullptr) { + // Is a skip list + MemtableSkipList::Iterator iter(&skip_list_header->skip_list); + for (iter.Seek(k.memtable_key().data()); + iter.Valid() && callback_func(callback_args, iter.key()); + iter.Next()) { + } + } + } +} + +MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) { + // allocate a new arena of similar size to the one currently in use + Arena* new_arena = new Arena(allocator_->BlockSize()); + auto list = new MemtableSkipList(compare_, new_arena); + HistogramImpl keys_per_bucket_hist; + + for (size_t i = 0; i < bucket_size_; ++i) { + int count = 0; + Pointer& bucket = GetBucket(i); + if (!IsEmptyBucket(bucket)) { + auto* link_list_head = GetLinkListFirstNode(bucket); + if (link_list_head != nullptr) { + LinkListIterator itr(this, link_list_head); + for (itr.SeekToHead(); itr.Valid(); itr.Next()) { + list->Insert(itr.key()); + count++; + } + } else { + auto* skip_list_header = GetSkipListBucketHeader(bucket); + assert(skip_list_header != nullptr); + // Is a skip list + MemtableSkipList::Iterator itr(&skip_list_header->skip_list); + for (itr.SeekToFirst(); itr.Valid(); itr.Next()) { + list->Insert(itr.key()); + count++; + } + } + } + if (if_log_bucket_dist_when_flash_) { + keys_per_bucket_hist.Add(count); + } + } + if (if_log_bucket_dist_when_flash_ && logger_ != nullptr) { + Info(logger_, "hashLinkedList Entry distribution among buckets: %s", + keys_per_bucket_hist.ToString().c_str()); + } + + if (alloc_arena == nullptr) { + return new FullListIterator(list, new_arena); + } else { + auto mem = alloc_arena->AllocateAligned(sizeof(FullListIterator)); + return new (mem) FullListIterator(list, new_arena); + } +} + +MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator( + Arena* alloc_arena) { + if (alloc_arena == nullptr) { + return new DynamicIterator(*this); + } else { + auto mem = alloc_arena->AllocateAligned(sizeof(DynamicIterator)); + return new (mem) DynamicIterator(*this); + } +} + +bool HashLinkListRep::LinkListContains(Node* head, + const Slice& 
user_key) const { + Node* x = FindGreaterOrEqualInBucket(head, user_key); + return (x != nullptr && Equal(user_key, x->key)); +} + +Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, + const Slice& key) const { + Node* x = head; + while (true) { + if (x == nullptr) { + return x; + } + Node* next = x->Next(); + // Make sure the lists are sorted. + // If x points to head_ or next points nullptr, it is trivially satisfied. + assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x)); + if (KeyIsAfterNode(key, x)) { + // Keep searching in this list + x = next; + } else { + break; + } + } + return x; +} + +struct HashLinkListRepOptions { + static const char* kName() { return "HashLinkListRepFactoryOptions"; } + size_t bucket_count; + uint32_t threshold_use_skiplist; + size_t huge_page_tlb_size; + int bucket_entries_logging_threshold; + bool if_log_bucket_dist_when_flash; +}; + +static std::unordered_map hash_linklist_info = { + {"bucket_count", + {offsetof(struct HashLinkListRepOptions, bucket_count), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"threshold", + {offsetof(struct HashLinkListRepOptions, threshold_use_skiplist), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"huge_page_size", + {offsetof(struct HashLinkListRepOptions, huge_page_tlb_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"logging_threshold", + {offsetof(struct HashLinkListRepOptions, bucket_entries_logging_threshold), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"log_when_flash", + {offsetof(struct HashLinkListRepOptions, if_log_bucket_dist_when_flash), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +class HashLinkListRepFactory : public MemTableRepFactory { + public: + explicit HashLinkListRepFactory(size_t bucket_count, + uint32_t threshold_use_skiplist, + size_t huge_page_tlb_size, + int bucket_entries_logging_threshold, + bool if_log_bucket_dist_when_flash) { + options_.bucket_count = bucket_count; + options_.threshold_use_skiplist = threshold_use_skiplist; + options_.huge_page_tlb_size = huge_page_tlb_size; + options_.bucket_entries_logging_threshold = + bucket_entries_logging_threshold; + options_.if_log_bucket_dist_when_flash = if_log_bucket_dist_when_flash; + RegisterOptions(&options_, &hash_linklist_info); + } + + using MemTableRepFactory::CreateMemTableRep; + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* transform, Logger* logger) override; + + static const char* kClassName() { return "HashLinkListRepFactory"; } + static const char* kNickName() { return "hash_linkedlist"; } + virtual const char* Name() const override { return kClassName(); } + virtual const char* NickName() const override { return kNickName(); } + + private: + HashLinkListRepOptions options_; +}; + +} // namespace + +MemTableRep* HashLinkListRepFactory::CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* transform, Logger* logger) { + return new HashLinkListRep( + compare, allocator, transform, options_.bucket_count, + options_.threshold_use_skiplist, options_.huge_page_tlb_size, logger, + options_.bucket_entries_logging_threshold, + options_.if_log_bucket_dist_when_flash); +} + +MemTableRepFactory* NewHashLinkListRepFactory( + size_t bucket_count, size_t huge_page_tlb_size, + 
int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash,
+    uint32_t threshold_use_skiplist) {
+  return new HashLinkListRepFactory(
+      bucket_count, threshold_use_skiplist, huge_page_tlb_size,
+      bucket_entries_logging_threshold, if_log_bucket_dist_when_flash);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/hash_skiplist_rep.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/hash_skiplist_rep.cc
new file mode 100644
index 0000000000..15ff4f0719
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/hash_skiplist_rep.cc
@@ -0,0 +1,391 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include <atomic>
+
+#include "db/memtable.h"
+#include "memory/arena.h"
+#include "memtable/skiplist.h"
+#include "port/port.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/murmurhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+class HashSkipListRep : public MemTableRep {
+ public:
+  HashSkipListRep(const MemTableRep::KeyComparator& compare,
+                  Allocator* allocator, const SliceTransform* transform,
+                  size_t bucket_size, int32_t skiplist_height,
+                  int32_t skiplist_branching_factor);
+
+  void Insert(KeyHandle handle) override;
+
+  bool Contains(const char* key) const override;
+
+  size_t ApproximateMemoryUsage() override;
+
+  void Get(const LookupKey& k, void* callback_args,
+           bool (*callback_func)(void* arg, const char* entry)) override;
+
+  ~HashSkipListRep() override;
+
+  MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override;
+
+  MemTableRep::Iterator* GetDynamicPrefixIterator(
+      Arena* arena = nullptr) override;
+
+ private:
+  friend class DynamicIterator;
+  using Bucket = SkipList<const char*, const MemTableRep::KeyComparator&>;
+
+  size_t bucket_size_;
+
+  const int32_t skiplist_height_;
+  const int32_t skiplist_branching_factor_;
+
+  // Maps slices (which are transformed user keys) to buckets of keys sharing
+  // the same transform.
+  std::atomic<Bucket*>* buckets_;
+
+  // The user-supplied transform whose domain is the user keys.
+  const SliceTransform* transform_;
+
+  const MemTableRep::KeyComparator& compare_;
+  // immutable after construction
+  Allocator* const allocator_;
+
+  inline size_t GetHash(const Slice& slice) const {
+    return MurmurHash(slice.data(), static_cast<int>(slice.size()), 0) %
+           bucket_size_;
+  }
+  inline Bucket* GetBucket(size_t i) const {
+    return buckets_[i].load(std::memory_order_acquire);
+  }
+  inline Bucket* GetBucket(const Slice& slice) const {
+    return GetBucket(GetHash(slice));
+  }
+  // Get a bucket from buckets_. If the bucket hasn't been initialized yet,
+  // initialize it before returning.
+  Bucket* GetInitializedBucket(const Slice& transformed);
+
+  class Iterator : public MemTableRep::Iterator {
+   public:
+    explicit Iterator(Bucket* list, bool own_list = true,
+                      Arena* arena = nullptr)
+        : list_(list), iter_(list), own_list_(own_list), arena_(arena) {}
+
+    ~Iterator() override {
+      // if we own the list, we should also delete it
+      if (own_list_) {
+        assert(list_ != nullptr);
+        delete list_;
+      }
+    }
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const override { return list_ != nullptr && iter_.Valid(); }
+
+    // Returns the key at the current position.
+ // REQUIRES: Valid() + const char* key() const override { + assert(Valid()); + return iter_.key(); + } + + // Advances to the next position. + // REQUIRES: Valid() + void Next() override { + assert(Valid()); + iter_.Next(); + } + + // Advances to the previous position. + // REQUIRES: Valid() + void Prev() override { + assert(Valid()); + iter_.Prev(); + } + + // Advance to the first entry with a key >= target + void Seek(const Slice& internal_key, const char* memtable_key) override { + if (list_ != nullptr) { + const char* encoded_key = (memtable_key != nullptr) + ? memtable_key + : EncodeKey(&tmp_, internal_key); + iter_.Seek(encoded_key); + } + } + + // Retreat to the last entry with a key <= target + void SeekForPrev(const Slice& /*internal_key*/, + const char* /*memtable_key*/) override { + // not supported + assert(false); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + void SeekToFirst() override { + if (list_ != nullptr) { + iter_.SeekToFirst(); + } + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + void SeekToLast() override { + if (list_ != nullptr) { + iter_.SeekToLast(); + } + } + + protected: + void Reset(Bucket* list) { + if (own_list_) { + assert(list_ != nullptr); + delete list_; + } + list_ = list; + iter_.SetList(list); + own_list_ = false; + } + + private: + // if list_ is nullptr, we should NEVER call any methods on iter_ + // if list_ is nullptr, this Iterator is not Valid() + Bucket* list_; + Bucket::Iterator iter_; + // here we track if we own list_. If we own it, we are also + // responsible for it's cleaning. This is a poor man's std::shared_ptr + bool own_list_; + std::unique_ptr arena_; + std::string tmp_; // For passing to EncodeKey + }; + + class DynamicIterator : public HashSkipListRep::Iterator { + public: + explicit DynamicIterator(const HashSkipListRep& memtable_rep) + : HashSkipListRep::Iterator(nullptr, false), + memtable_rep_(memtable_rep) {} + + // Advance to the first entry with a key >= target + void Seek(const Slice& k, const char* memtable_key) override { + auto transformed = memtable_rep_.transform_->Transform(ExtractUserKey(k)); + Reset(memtable_rep_.GetBucket(transformed)); + HashSkipListRep::Iterator::Seek(k, memtable_key); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + void SeekToFirst() override { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + void SeekToLast() override { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + private: + // the underlying memtable + const HashSkipListRep& memtable_rep_; + }; + + class EmptyIterator : public MemTableRep::Iterator { + // This is used when there wasn't a bucket. It is cheaper than + // instantiating an empty bucket over which to iterate. 
+   public:
+    EmptyIterator() {}
+    bool Valid() const override { return false; }
+    const char* key() const override {
+      assert(false);
+      return nullptr;
+    }
+    void Next() override {}
+    void Prev() override {}
+    void Seek(const Slice& /*internal_key*/,
+              const char* /*memtable_key*/) override {}
+    void SeekForPrev(const Slice& /*internal_key*/,
+                     const char* /*memtable_key*/) override {}
+    void SeekToFirst() override {}
+    void SeekToLast() override {}
+
+   private:
+  };
+};
+
+HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare,
+                                 Allocator* allocator,
+                                 const SliceTransform* transform,
+                                 size_t bucket_size, int32_t skiplist_height,
+                                 int32_t skiplist_branching_factor)
+    : MemTableRep(allocator),
+      bucket_size_(bucket_size),
+      skiplist_height_(skiplist_height),
+      skiplist_branching_factor_(skiplist_branching_factor),
+      transform_(transform),
+      compare_(compare),
+      allocator_(allocator) {
+  auto mem =
+      allocator->AllocateAligned(sizeof(std::atomic<Bucket*>) * bucket_size);
+  buckets_ = new (mem) std::atomic<Bucket*>[bucket_size];
+
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    buckets_[i].store(nullptr, std::memory_order_relaxed);
+  }
+}
+
+HashSkipListRep::~HashSkipListRep() {}
+
+HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket(
+    const Slice& transformed) {
+  size_t hash = GetHash(transformed);
+  auto bucket = GetBucket(hash);
+  if (bucket == nullptr) {
+    auto addr = allocator_->AllocateAligned(sizeof(Bucket));
+    bucket = new (addr) Bucket(compare_, allocator_, skiplist_height_,
+                               skiplist_branching_factor_);
+    buckets_[hash].store(bucket, std::memory_order_release);
+  }
+  return bucket;
+}
+
+void HashSkipListRep::Insert(KeyHandle handle) {
+  auto* key = static_cast<char*>(handle);
+  assert(!Contains(key));
+  auto transformed = transform_->Transform(UserKey(key));
+  auto bucket = GetInitializedBucket(transformed);
+  bucket->Insert(key);
+}
+
+bool HashSkipListRep::Contains(const char* key) const {
+  auto transformed = transform_->Transform(UserKey(key));
+  auto bucket = GetBucket(transformed);
+  if (bucket == nullptr) {
+    return false;
+  }
+  return bucket->Contains(key);
+}
+
+size_t HashSkipListRep::ApproximateMemoryUsage() { return 0; }
+
+void HashSkipListRep::Get(const LookupKey& k, void* callback_args,
+                          bool (*callback_func)(void* arg,
+                                                const char* entry)) {
+  auto transformed = transform_->Transform(k.user_key());
+  auto bucket = GetBucket(transformed);
+  if (bucket != nullptr) {
+    Bucket::Iterator iter(bucket);
+    for (iter.Seek(k.memtable_key().data());
+         iter.Valid() && callback_func(callback_args, iter.key());
+         iter.Next()) {
+    }
+  }
+}
+
+MemTableRep::Iterator* HashSkipListRep::GetIterator(Arena* arena) {
+  // allocate a new arena of similar size to the one currently in use
+  Arena* new_arena = new Arena(allocator_->BlockSize());
+  auto list = new Bucket(compare_, new_arena);
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    auto bucket = GetBucket(i);
+    if (bucket != nullptr) {
+      Bucket::Iterator itr(bucket);
+      for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
+        list->Insert(itr.key());
+      }
+    }
+  }
+  if (arena == nullptr) {
+    return new Iterator(list, true, new_arena);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(Iterator));
+    return new (mem) Iterator(list, true, new_arena);
+  }
+}
+
+MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator(
+    Arena* arena) {
+  if (arena == nullptr) {
+    return new DynamicIterator(*this);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(DynamicIterator));
+    return new (mem) DynamicIterator(*this);
+  }
+}
+
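+// Rough usage sketch (not part of this file; the prefix length and the
+// factory arguments below are example values, not requirements): the factory
+// defined next is normally selected through Options together with a prefix
+// extractor, which supplies the per-bucket transform:
+//
+//   rocksdb::Options options;
+//   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
+//   options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory(
+//       1000000 /* bucket_count */, 4 /* skiplist_height */,
+//       4 /* skiplist_branching_factor */));
+//
+// Without a prefix extractor the buckets cannot be keyed, so one must be
+// configured whenever this memtable representation is used.
+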
+struct HashSkipListRepOptions {
+  static const char* kName() { return "HashSkipListRepFactoryOptions"; }
+  size_t bucket_count;
+  int32_t skiplist_height;
+  int32_t skiplist_branching_factor;
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> hash_skiplist_info = {
+    {"bucket_count",
+     {offsetof(struct HashSkipListRepOptions, bucket_count),
+      OptionType::kSizeT, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"skiplist_height",
+     {offsetof(struct HashSkipListRepOptions, skiplist_height),
+      OptionType::kInt32T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"branching_factor",
+     {offsetof(struct HashSkipListRepOptions, skiplist_branching_factor),
+      OptionType::kInt32T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+};
+
+class HashSkipListRepFactory : public MemTableRepFactory {
+ public:
+  explicit HashSkipListRepFactory(size_t bucket_count, int32_t skiplist_height,
+                                  int32_t skiplist_branching_factor) {
+    options_.bucket_count = bucket_count;
+    options_.skiplist_height = skiplist_height;
+    options_.skiplist_branching_factor = skiplist_branching_factor;
+    RegisterOptions(&options_, &hash_skiplist_info);
+  }
+
+  using MemTableRepFactory::CreateMemTableRep;
+  virtual MemTableRep* CreateMemTableRep(
+      const MemTableRep::KeyComparator& compare, Allocator* allocator,
+      const SliceTransform* transform, Logger* logger) override;
+
+  static const char* kClassName() { return "HashSkipListRepFactory"; }
+  static const char* kNickName() { return "prefix_hash"; }
+
+  virtual const char* Name() const override { return kClassName(); }
+  virtual const char* NickName() const override { return kNickName(); }
+
+ private:
+  HashSkipListRepOptions options_;
+};
+
+}  // namespace
+
+MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
+    const MemTableRep::KeyComparator& compare, Allocator* allocator,
+    const SliceTransform* transform, Logger* /*logger*/) {
+  return new HashSkipListRep(compare, allocator, transform,
+                             options_.bucket_count, options_.skiplist_height,
+                             options_.skiplist_branching_factor);
+}
+
+MemTableRepFactory* NewHashSkipListRepFactory(
+    size_t bucket_count, int32_t skiplist_height,
+    int32_t skiplist_branching_factor) {
+  return new HashSkipListRepFactory(bucket_count, skiplist_height,
+                                    skiplist_branching_factor);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/inlineskiplist.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/inlineskiplist.h
new file mode 100644
index 0000000000..abb3c3ddb7
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/inlineskiplist.h
@@ -0,0 +1,1051 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved. Use of
+// this source code is governed by a BSD-style license that can be found
+// in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// InlineSkipList is derived from SkipList (skiplist.h), but it optimizes
+// the memory layout by requiring that the key storage be allocated through
+// the skip list instance. For the common case of SkipList<const char*,
+// Cmp> this saves 1 pointer per skip list node and gives better cache
+// locality, at the expense of wasted padding from using AllocateAligned
+// instead of Allocate for the keys. The unused padding will be from
+// 0 to sizeof(void*)-1 bytes, and the space savings are sizeof(void*)
+// bytes, so despite the padding the space used is always less than
+// SkipList<const char*, ..>.
+//
+// Thread safety
+// -------------
+//
+// Writes via Insert require external synchronization, most likely a mutex.
+// InsertConcurrently can be safely called concurrently with reads and
+// with other concurrent inserts. Reads require a guarantee that the
+// InlineSkipList will not be destroyed while the read is in progress.
+// Apart from that, reads progress without any internal locking or
+// synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the InlineSkipList is
+// destroyed. This is trivially guaranteed by the code since we never
+// delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the InlineSkipList.
+// Only Insert() modifies the list, and it is careful to initialize a
+// node and use release-stores to publish the nodes in one or more lists.
+//
+// ... prev vs. next pointer ordering ...
+//
+
+#pragma once
+#include <assert.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <atomic>
+#include <type_traits>
+
+#include "memory/allocator.h"
+#include "port/likely.h"
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <class Comparator>
+class InlineSkipList {
+ private:
+  struct Node;
+  struct Splice;
+
+ public:
+  using DecodedKey =
+      typename std::remove_reference<Comparator>::type::DecodedType;
+
+  static const uint16_t kMaxPossibleHeight = 32;
+
+  // Create a new InlineSkipList object that will use "cmp" for comparing
+  // keys, and will allocate memory using "*allocator". Objects allocated
+  // in the allocator must remain allocated for the lifetime of the
+  // skiplist object.
+  explicit InlineSkipList(Comparator cmp, Allocator* allocator,
+                          int32_t max_height = 12,
+                          int32_t branching_factor = 4);
+  // No copying allowed
+  InlineSkipList(const InlineSkipList&) = delete;
+  InlineSkipList& operator=(const InlineSkipList&) = delete;
+
+  // Allocates a key and a skip-list node, returning a pointer to the key
+  // portion of the node. This method is thread-safe if the allocator
+  // is thread-safe.
+  char* AllocateKey(size_t key_size);
+
+  // Allocate a splice using allocator.
+  Splice* AllocateSplice();
+
+  // Allocate a splice on heap.
+  Splice* AllocateSpliceOnHeap();
+
+  // Inserts a key allocated by AllocateKey, after the actual key value
+  // has been filled in.
+  //
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  // REQUIRES: no concurrent calls to any of inserts.
+  bool Insert(const char* key);
+
+  // Inserts a key allocated by AllocateKey with a hint of last insert
+  // position in the skip-list. If hint points to nullptr, a new hint will be
+  // populated, which can be used in subsequent calls.
+  //
+  // It can be used to optimize the workload where there are multiple groups
+  // of keys, and each key is likely to insert to a location close to the last
+  // inserted key in the same group. One example is sequential inserts.
+  //
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  // REQUIRES: no concurrent calls to any of inserts.
+  bool InsertWithHint(const char* key, void** hint);
+
+  // Like InsertConcurrently, but with a hint
+  //
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+ // REQUIRES: no concurrent calls that use same hint + bool InsertWithHintConcurrently(const char* key, void** hint); + + // Like Insert, but external synchronization is not required. + bool InsertConcurrently(const char* key); + + // Inserts a node into the skip list. key must have been allocated by + // AllocateKey and then filled in by the caller. If UseCAS is true, + // then external synchronization is not required, otherwise this method + // may not be called concurrently with any other insertions. + // + // Regardless of whether UseCAS is true, the splice must be owned + // exclusively by the current thread. If allow_partial_splice_fix is + // true, then the cost of insertion is amortized O(log D), where D is + // the distance from the splice to the inserted key (measured as the + // number of intervening nodes). Note that this bound is very good for + // sequential insertions! If allow_partial_splice_fix is false then + // the existing splice will be ignored unless the current key is being + // inserted immediately after the splice. allow_partial_splice_fix == + // false has worse running time for the non-sequential case O(log N), + // but a better constant factor. + template + bool Insert(const char* key, Splice* splice, bool allow_partial_splice_fix); + + // Returns true iff an entry that compares equal to key is in the list. + bool Contains(const char* key) const; + + // Return estimated number of entries smaller than `key`. + uint64_t EstimateCount(const char* key) const; + + // Validate correctness of the skip-list. + void TEST_Validate() const; + + // Iteration over the contents of a skip list + class Iterator { + public: + // Initialize an iterator over the specified list. + // The returned iterator is not valid. + explicit Iterator(const InlineSkipList* list); + + // Change the underlying skiplist used for this iterator + // This enables us not changing the iterator without deallocating + // an old one and then allocating a new one + void SetList(const InlineSkipList* list); + + // Returns true iff the iterator is positioned at a valid node. + bool Valid() const; + + // Returns the key at the current position. + // REQUIRES: Valid() + const char* key() const; + + // Advances to the next position. + // REQUIRES: Valid() + void Next(); + + // Advances to the previous position. + // REQUIRES: Valid() + void Prev(); + + // Advance to the first entry with a key >= target + void Seek(const char* target); + + // Retreat to the last entry with a key <= target + void SeekForPrev(const char* target); + + // Advance to a random entry in the list. + void RandomSeek(); + + // Position at the first entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToFirst(); + + // Position at the last entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToLast(); + + private: + const InlineSkipList* list_; + Node* node_; + // Intentionally copyable + }; + + private: + const uint16_t kMaxHeight_; + const uint16_t kBranching_; + const uint32_t kScaledInverseBranching_; + + Allocator* const allocator_; // Allocator used for allocations of nodes + // Immutable after construction + Comparator const compare_; + Node* const head_; + + // Modified only by Insert(). Read racily by readers, but stale + // values are ok. + std::atomic max_height_; // Height of the entire list + + // seq_splice_ is a Splice used for insertions in the non-concurrent + // case. 
It caches the prev and next found during the most recent + // non-concurrent insertion. + Splice* seq_splice_; + + inline int GetMaxHeight() const { + return max_height_.load(std::memory_order_relaxed); + } + + int RandomHeight(); + + Node* AllocateNode(size_t key_size, int height); + + bool Equal(const char* a, const char* b) const { + return (compare_(a, b) == 0); + } + + bool LessThan(const char* a, const char* b) const { + return (compare_(a, b) < 0); + } + + // Return true if key is greater than the data stored in "n". Null n + // is considered infinite. n should not be head_. + bool KeyIsAfterNode(const char* key, Node* n) const; + bool KeyIsAfterNode(const DecodedKey& key, Node* n) const; + + // Returns the earliest node with a key >= key. + // Return nullptr if there is no such node. + Node* FindGreaterOrEqual(const char* key) const; + + // Return the latest node with a key < key. + // Return head_ if there is no such node. + // Fills prev[level] with pointer to previous node at "level" for every + // level in [0..max_height_-1], if prev is non-null. + Node* FindLessThan(const char* key, Node** prev = nullptr) const; + + // Return the latest node with a key < key on bottom_level. Start searching + // from root node on the level below top_level. + // Fills prev[level] with pointer to previous node at "level" for every + // level in [bottom_level..top_level-1], if prev is non-null. + Node* FindLessThan(const char* key, Node** prev, Node* root, int top_level, + int bottom_level) const; + + // Return the last node in the list. + // Return head_ if list is empty. + Node* FindLast() const; + + // Returns a random entry. + Node* FindRandomEntry() const; + + // Traverses a single level of the list, setting *out_prev to the last + // node before the key and *out_next to the first node after. Assumes + // that the key is not present in the skip list. On entry, before should + // point to a node that is before the key, and after should point to + // a node that is after the key. after should be nullptr if a good after + // node isn't conveniently available. + template + void FindSpliceForLevel(const DecodedKey& key, Node* before, Node* after, + int level, Node** out_prev, Node** out_next); + + // Recomputes Splice levels from highest_level (inclusive) down to + // lowest_level (inclusive). + void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice, + int recompute_level); +}; + +// Implementation details follow + +template +struct InlineSkipList::Splice { + // The invariant of a Splice is that prev_[i+1].key <= prev_[i].key < + // next_[i].key <= next_[i+1].key for all i. That means that if a + // key is bracketed by prev_[i] and next_[i] then it is bracketed by + // all higher levels. It is _not_ required that prev_[i]->Next(i) == + // next_[i] (it probably did at some point in the past, but intervening + // or concurrent operations might have inserted nodes in between). + int height_ = 0; + Node** prev_; + Node** next_; +}; + +// The Node data type is more of a pointer into custom-managed memory than +// a traditional C++ struct. The key is stored in the bytes immediately +// after the struct, and the next_ pointers for nodes with height > 1 are +// stored immediately _before_ the struct. This avoids the need to include +// any pointer or sizing data, which reduces per-node memory overheads. +template +struct InlineSkipList::Node { + // Stores the height of the node in the memory location normally used for + // next_[0]. This is used for passing data from AllocateKey to Insert. 
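+  // Layout sketch, for a node of height h (see AllocateNode further below):
+  //
+  //   [ next_[-(h-1)] ... next_[-1] | next_[0]  <- Node starts | key bytes ]
+  //
+  // so (&next_[0] - n) addresses the level-n link and the key begins at
+  // &next_[1].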
+  void StashHeight(const int height) {
+    assert(sizeof(int) <= sizeof(next_[0]));
+    memcpy(static_cast<void*>(&next_[0]), &height, sizeof(int));
+  }
+
+  // Retrieves the value passed to StashHeight. Undefined after a call
+  // to SetNext or NoBarrier_SetNext.
+  int UnstashHeight() const {
+    int rv;
+    memcpy(&rv, &next_[0], sizeof(int));
+    return rv;
+  }
+
+  const char* Key() const { return reinterpret_cast<const char*>(&next_[1]); }
+
+  // Accessors/mutators for links. Wrapped in methods so we can add
+  // the appropriate barriers as necessary, and perform the necessary
+  // addressing trickery for storing links below the Node in memory.
+  Node* Next(int n) {
+    assert(n >= 0);
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return ((&next_[0] - n)->load(std::memory_order_acquire));
+  }
+
+  void SetNext(int n, Node* x) {
+    assert(n >= 0);
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    (&next_[0] - n)->store(x, std::memory_order_release);
+  }
+
+  bool CASNext(int n, Node* expected, Node* x) {
+    assert(n >= 0);
+    return (&next_[0] - n)->compare_exchange_strong(expected, x);
+  }
+
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next(int n) {
+    assert(n >= 0);
+    return (&next_[0] - n)->load(std::memory_order_relaxed);
+  }
+
+  void NoBarrier_SetNext(int n, Node* x) {
+    assert(n >= 0);
+    (&next_[0] - n)->store(x, std::memory_order_relaxed);
+  }
+
+  // Insert node after prev on specific level.
+  void InsertAfter(Node* prev, int level) {
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "this" in prev.
+    NoBarrier_SetNext(level, prev->NoBarrier_Next(level));
+    prev->SetNext(level, this);
+  }
+
+ private:
+  // next_[0] is the lowest level link (level 0). Higher levels are
+  // stored _earlier_, so level 1 is at next_[-1].
+  std::atomic<Node*> next_[1];
+};
+
+template <class Comparator>
+inline InlineSkipList<Comparator>::Iterator::Iterator(
+    const InlineSkipList* list) {
+  SetList(list);
+}
+
+template <class Comparator>
+inline void InlineSkipList<Comparator>::Iterator::SetList(
+    const InlineSkipList* list) {
+  list_ = list;
+  node_ = nullptr;
+}
+
+template <class Comparator>
+inline bool InlineSkipList<Comparator>::Iterator::Valid() const {
+  return node_ != nullptr;
+}
+
+template <class Comparator>
+inline const char* InlineSkipList<Comparator>::Iterator::key() const {
+  assert(Valid());
+  return node_->Key();
+}
+
+template <class Comparator>
+inline void InlineSkipList<Comparator>::Iterator::Next() {
+  assert(Valid());
+  node_ = node_->Next(0);
+}
+
+template <class Comparator>
+inline void InlineSkipList<Comparator>::Iterator::Prev() {
+  // Instead of using explicit "prev" links, we just search for the
+  // last node that falls before key.
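+  // As a consequence, Prev() costs O(log n) here rather than the O(1) of a
+  // doubly linked iterator; that is the price of not storing back links in
+  // the nodes.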
+ assert(Valid()); + node_ = list_->FindLessThan(node_->Key()); + if (node_ == list_->head_) { + node_ = nullptr; + } +} + +template +inline void InlineSkipList::Iterator::Seek(const char* target) { + node_ = list_->FindGreaterOrEqual(target); +} + +template +inline void InlineSkipList::Iterator::SeekForPrev( + const char* target) { + Seek(target); + if (!Valid()) { + SeekToLast(); + } + while (Valid() && list_->LessThan(target, key())) { + Prev(); + } +} + +template +inline void InlineSkipList::Iterator::RandomSeek() { + node_ = list_->FindRandomEntry(); +} + +template +inline void InlineSkipList::Iterator::SeekToFirst() { + node_ = list_->head_->Next(0); +} + +template +inline void InlineSkipList::Iterator::SeekToLast() { + node_ = list_->FindLast(); + if (node_ == list_->head_) { + node_ = nullptr; + } +} + +template +int InlineSkipList::RandomHeight() { + auto rnd = Random::GetTLSInstance(); + + // Increase height with probability 1 in kBranching + int height = 1; + while (height < kMaxHeight_ && height < kMaxPossibleHeight && + rnd->Next() < kScaledInverseBranching_) { + height++; + } + assert(height > 0); + assert(height <= kMaxHeight_); + assert(height <= kMaxPossibleHeight); + return height; +} + +template +bool InlineSkipList::KeyIsAfterNode(const char* key, + Node* n) const { + // nullptr n is considered infinite + assert(n != head_); + return (n != nullptr) && (compare_(n->Key(), key) < 0); +} + +template +bool InlineSkipList::KeyIsAfterNode(const DecodedKey& key, + Node* n) const { + // nullptr n is considered infinite + assert(n != head_); + return (n != nullptr) && (compare_(n->Key(), key) < 0); +} + +template +typename InlineSkipList::Node* +InlineSkipList::FindGreaterOrEqual(const char* key) const { + // Note: It looks like we could reduce duplication by implementing + // this function as FindLessThan(key)->Next(0), but we wouldn't be able + // to exit early on equality and the result wouldn't even be correct. + // A concurrent insert might occur after FindLessThan(key) but before + // we get a chance to call Next(0). + Node* x = head_; + int level = GetMaxHeight() - 1; + Node* last_bigger = nullptr; + const DecodedKey key_decoded = compare_.decode_key(key); + while (true) { + Node* next = x->Next(level); + if (next != nullptr) { + PREFETCH(next->Next(level), 0, 1); + } + // Make sure the lists are sorted + assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); + // Make sure we haven't overshot during our search + assert(x == head_ || KeyIsAfterNode(key_decoded, x)); + int cmp = (next == nullptr || next == last_bigger) + ? 
1 + : compare_(next->Key(), key_decoded); + if (cmp == 0 || (cmp > 0 && level == 0)) { + return next; + } else if (cmp < 0) { + // Keep searching in this list + x = next; + } else { + // Switch to next list, reuse compare_() result + last_bigger = next; + level--; + } + } +} + +template +typename InlineSkipList::Node* +InlineSkipList::FindLessThan(const char* key, Node** prev) const { + return FindLessThan(key, prev, head_, GetMaxHeight(), 0); +} + +template +typename InlineSkipList::Node* +InlineSkipList::FindLessThan(const char* key, Node** prev, + Node* root, int top_level, + int bottom_level) const { + assert(top_level > bottom_level); + int level = top_level - 1; + Node* x = root; + // KeyIsAfter(key, last_not_after) is definitely false + Node* last_not_after = nullptr; + const DecodedKey key_decoded = compare_.decode_key(key); + while (true) { + assert(x != nullptr); + Node* next = x->Next(level); + if (next != nullptr) { + PREFETCH(next->Next(level), 0, 1); + } + assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); + assert(x == head_ || KeyIsAfterNode(key_decoded, x)); + if (next != last_not_after && KeyIsAfterNode(key_decoded, next)) { + // Keep searching in this list + assert(next != nullptr); + x = next; + } else { + if (prev != nullptr) { + prev[level] = x; + } + if (level == bottom_level) { + return x; + } else { + // Switch to next list, reuse KeyIsAfterNode() result + last_not_after = next; + level--; + } + } + } +} + +template +typename InlineSkipList::Node* +InlineSkipList::FindLast() const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + Node* next = x->Next(level); + if (next == nullptr) { + if (level == 0) { + return x; + } else { + // Switch to next list + level--; + } + } else { + x = next; + } + } +} + +template +typename InlineSkipList::Node* +InlineSkipList::FindRandomEntry() const { + // TODO(bjlemaire): consider adding PREFETCH calls. + Node *x = head_, *scan_node = nullptr, *limit_node = nullptr; + + // We start at the max level. + // FOr each level, we look at all the nodes at the level, and + // we randomly pick one of them. Then decrement the level + // and reiterate the process. + // eg: assume GetMaxHeight()=5, and there are #100 elements (nodes). + // level 4 nodes: lvl_nodes={#1, #15, #67, #84}. Randomly pick #15. + // We will consider all the nodes between #15 (inclusive) and #67 + // (exclusive). #67 is called 'limit_node' here. + // level 3 nodes: lvl_nodes={#15, #21, #45, #51}. Randomly choose + // #51. #67 remains 'limit_node'. + // [...] + // level 0 nodes: lvl_nodes={#56,#57,#58,#59}. Randomly pick $57. + // Return Node #57. + std::vector lvl_nodes; + Random* rnd = Random::GetTLSInstance(); + int level = GetMaxHeight() - 1; + + while (level >= 0) { + lvl_nodes.clear(); + scan_node = x; + while (scan_node != limit_node) { + lvl_nodes.push_back(scan_node); + scan_node = scan_node->Next(level); + } + uint32_t rnd_idx = rnd->Next() % lvl_nodes.size(); + x = lvl_nodes[rnd_idx]; + if (rnd_idx + 1 < lvl_nodes.size()) { + limit_node = lvl_nodes[rnd_idx + 1]; + } + level--; + } + // There is a special case where x could still be the head_ + // (note that the head_ contains no key). + return x == head_ && head_ != nullptr ? 
head_->Next(0) : x; +} + +template +uint64_t InlineSkipList::EstimateCount(const char* key) const { + uint64_t count = 0; + + Node* x = head_; + int level = GetMaxHeight() - 1; + const DecodedKey key_decoded = compare_.decode_key(key); + while (true) { + assert(x == head_ || compare_(x->Key(), key_decoded) < 0); + Node* next = x->Next(level); + if (next != nullptr) { + PREFETCH(next->Next(level), 0, 1); + } + if (next == nullptr || compare_(next->Key(), key_decoded) >= 0) { + if (level == 0) { + return count; + } else { + // Switch to next list + count *= kBranching_; + level--; + } + } else { + x = next; + count++; + } + } +} + +template +InlineSkipList::InlineSkipList(const Comparator cmp, + Allocator* allocator, + int32_t max_height, + int32_t branching_factor) + : kMaxHeight_(static_cast(max_height)), + kBranching_(static_cast(branching_factor)), + kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_), + allocator_(allocator), + compare_(cmp), + head_(AllocateNode(0, max_height)), + max_height_(1), + seq_splice_(AllocateSplice()) { + assert(max_height > 0 && kMaxHeight_ == static_cast(max_height)); + assert(branching_factor > 1 && + kBranching_ == static_cast(branching_factor)); + assert(kScaledInverseBranching_ > 0); + + for (int i = 0; i < kMaxHeight_; ++i) { + head_->SetNext(i, nullptr); + } +} + +template +char* InlineSkipList::AllocateKey(size_t key_size) { + return const_cast(AllocateNode(key_size, RandomHeight())->Key()); +} + +template +typename InlineSkipList::Node* +InlineSkipList::AllocateNode(size_t key_size, int height) { + auto prefix = sizeof(std::atomic) * (height - 1); + + // prefix is space for the height - 1 pointers that we store before + // the Node instance (next_[-(height - 1) .. -1]). Node starts at + // raw + prefix, and holds the bottom-mode (level 0) skip list pointer + // next_[0]. key_size is the bytes for the key, which comes just after + // the Node. + char* raw = allocator_->AllocateAligned(prefix + sizeof(Node) + key_size); + Node* x = reinterpret_cast(raw + prefix); + + // Once we've linked the node into the skip list we don't actually need + // to know its height, because we can implicitly use the fact that we + // traversed into a node at level h to known that h is a valid level + // for that node. We need to convey the height to the Insert step, + // however, so that it can perform the proper links. Since we're not + // using the pointers at the moment, StashHeight temporarily borrow + // storage from next_[0] for that purpose. 
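+  // For example (a sketch assuming 8-byte pointers): height == 3 gives
+  // prefix == 2 * sizeof(std::atomic<Node*>) == 16, so raw points at
+  // next_[-2] and the Node object itself (holding next_[0]) starts 16 bytes
+  // into the allocation, with the key immediately after it.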
+ x->StashHeight(height); + return x; +} + +template +typename InlineSkipList::Splice* +InlineSkipList::AllocateSplice() { + // size of prev_ and next_ + size_t array_size = sizeof(Node*) * (kMaxHeight_ + 1); + char* raw = allocator_->AllocateAligned(sizeof(Splice) + array_size * 2); + Splice* splice = reinterpret_cast(raw); + splice->height_ = 0; + splice->prev_ = reinterpret_cast(raw + sizeof(Splice)); + splice->next_ = reinterpret_cast(raw + sizeof(Splice) + array_size); + return splice; +} + +template +typename InlineSkipList::Splice* +InlineSkipList::AllocateSpliceOnHeap() { + size_t array_size = sizeof(Node*) * (kMaxHeight_ + 1); + char* raw = new char[sizeof(Splice) + array_size * 2]; + Splice* splice = reinterpret_cast(raw); + splice->height_ = 0; + splice->prev_ = reinterpret_cast(raw + sizeof(Splice)); + splice->next_ = reinterpret_cast(raw + sizeof(Splice) + array_size); + return splice; +} + +template +bool InlineSkipList::Insert(const char* key) { + return Insert(key, seq_splice_, false); +} + +template +bool InlineSkipList::InsertConcurrently(const char* key) { + Node* prev[kMaxPossibleHeight]; + Node* next[kMaxPossibleHeight]; + Splice splice; + splice.prev_ = prev; + splice.next_ = next; + return Insert(key, &splice, false); +} + +template +bool InlineSkipList::InsertWithHint(const char* key, void** hint) { + assert(hint != nullptr); + Splice* splice = reinterpret_cast(*hint); + if (splice == nullptr) { + splice = AllocateSplice(); + *hint = reinterpret_cast(splice); + } + return Insert(key, splice, true); +} + +template +bool InlineSkipList::InsertWithHintConcurrently(const char* key, + void** hint) { + assert(hint != nullptr); + Splice* splice = reinterpret_cast(*hint); + if (splice == nullptr) { + splice = AllocateSpliceOnHeap(); + *hint = reinterpret_cast(splice); + } + return Insert(key, splice, true); +} + +template +template +void InlineSkipList::FindSpliceForLevel(const DecodedKey& key, + Node* before, Node* after, + int level, Node** out_prev, + Node** out_next) { + while (true) { + Node* next = before->Next(level); + if (next != nullptr) { + PREFETCH(next->Next(level), 0, 1); + } + if (prefetch_before == true) { + if (next != nullptr && level > 0) { + PREFETCH(next->Next(level - 1), 0, 1); + } + } + assert(before == head_ || next == nullptr || + KeyIsAfterNode(next->Key(), before)); + assert(before == head_ || KeyIsAfterNode(key, before)); + if (next == after || !KeyIsAfterNode(key, next)) { + // found it + *out_prev = before; + *out_next = next; + return; + } + before = next; + } +} + +template +void InlineSkipList::RecomputeSpliceLevels(const DecodedKey& key, + Splice* splice, + int recompute_level) { + assert(recompute_level > 0); + assert(recompute_level <= splice->height_); + for (int i = recompute_level - 1; i >= 0; --i) { + FindSpliceForLevel(key, splice->prev_[i + 1], splice->next_[i + 1], i, + &splice->prev_[i], &splice->next_[i]); + } +} + +template +template +bool InlineSkipList::Insert(const char* key, Splice* splice, + bool allow_partial_splice_fix) { + Node* x = reinterpret_cast(const_cast(key)) - 1; + const DecodedKey key_decoded = compare_.decode_key(key); + int height = x->UnstashHeight(); + assert(height >= 1 && height <= kMaxHeight_); + + int max_height = max_height_.load(std::memory_order_relaxed); + while (height > max_height) { + if (max_height_.compare_exchange_weak(max_height, height)) { + // successfully updated it + max_height = height; + break; + } + // else retry, possibly exiting the loop because somebody else + // increased it + } + 
assert(max_height <= kMaxPossibleHeight); + + int recompute_height = 0; + if (splice->height_ < max_height) { + // Either splice has never been used or max_height has grown since + // last use. We could potentially fix it in the latter case, but + // that is tricky. + splice->prev_[max_height] = head_; + splice->next_[max_height] = nullptr; + splice->height_ = max_height; + recompute_height = max_height; + } else { + // Splice is a valid proper-height splice that brackets some + // key, but does it bracket this one? We need to validate it and + // recompute a portion of the splice (levels 0..recompute_height-1) + // that is a superset of all levels that don't bracket the new key. + // Several choices are reasonable, because we have to balance the work + // saved against the extra comparisons required to validate the Splice. + // + // One strategy is just to recompute all of orig_splice_height if the + // bottom level isn't bracketing. This pessimistically assumes that + // we will either get a perfect Splice hit (increasing sequential + // inserts) or have no locality. + // + // Another strategy is to walk up the Splice's levels until we find + // a level that brackets the key. This strategy lets the Splice + // hint help for other cases: it turns insertion from O(log N) into + // O(log D), where D is the number of nodes in between the key that + // produced the Splice and the current insert (insertion is aided + // whether the new key is before or after the splice). If you have + // a way of using a prefix of the key to map directly to the closest + // Splice out of O(sqrt(N)) Splices and we make it so that splices + // can also be used as hints during read, then we end up with Oshman's + // and Shavit's SkipTrie, which has O(log log N) lookup and insertion + // (compare to O(log N) for skip list). + // + // We control the pessimistic strategy with allow_partial_splice_fix. + // A good strategy is probably to be pessimistic for seq_splice_, + // optimistic if the caller actually went to the work of providing + // a Splice. + while (recompute_height < max_height) { + if (splice->prev_[recompute_height]->Next(recompute_height) != + splice->next_[recompute_height]) { + // splice isn't tight at this level, there must have been some inserts + // to this + // location that didn't update the splice. We might only be a little + // stale, but if + // the splice is very stale it would be O(N) to fix it. We haven't used + // up any of + // our budget of comparisons, so always move up even if we are + // pessimistic about + // our chances of success. + ++recompute_height; + } else if (splice->prev_[recompute_height] != head_ && + !KeyIsAfterNode(key_decoded, + splice->prev_[recompute_height])) { + // key is from before splice + if (allow_partial_splice_fix) { + // skip all levels with the same node without more comparisons + Node* bad = splice->prev_[recompute_height]; + while (splice->prev_[recompute_height] == bad) { + ++recompute_height; + } + } else { + // we're pessimistic, recompute everything + recompute_height = max_height; + } + } else if (KeyIsAfterNode(key_decoded, splice->next_[recompute_height])) { + // key is from after splice + if (allow_partial_splice_fix) { + Node* bad = splice->next_[recompute_height]; + while (splice->next_[recompute_height] == bad) { + ++recompute_height; + } + } else { + recompute_height = max_height; + } + } else { + // this level brackets the key, we won! 
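+        // (By the Splice invariant above, if this level brackets the key
+        // then every higher level does too, so only the levels below it
+        // need to be recomputed.)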
+ break; + } + } + } + assert(recompute_height <= max_height); + if (recompute_height > 0) { + RecomputeSpliceLevels(key_decoded, splice, recompute_height); + } + + bool splice_is_valid = true; + if (UseCAS) { + for (int i = 0; i < height; ++i) { + while (true) { + // Checking for duplicate keys on the level 0 is sufficient + if (UNLIKELY(i == 0 && splice->next_[i] != nullptr && + compare_(x->Key(), splice->next_[i]->Key()) >= 0)) { + // duplicate key + return false; + } + if (UNLIKELY(i == 0 && splice->prev_[i] != head_ && + compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) { + // duplicate key + return false; + } + assert(splice->next_[i] == nullptr || + compare_(x->Key(), splice->next_[i]->Key()) < 0); + assert(splice->prev_[i] == head_ || + compare_(splice->prev_[i]->Key(), x->Key()) < 0); + x->NoBarrier_SetNext(i, splice->next_[i]); + if (splice->prev_[i]->CASNext(i, splice->next_[i], x)) { + // success + break; + } + // CAS failed, we need to recompute prev and next. It is unlikely + // to be helpful to try to use a different level as we redo the + // search, because it should be unlikely that lots of nodes have + // been inserted between prev[i] and next[i]. No point in using + // next[i] as the after hint, because we know it is stale. + FindSpliceForLevel(key_decoded, splice->prev_[i], nullptr, i, + &splice->prev_[i], &splice->next_[i]); + + // Since we've narrowed the bracket for level i, we might have + // violated the Splice constraint between i and i-1. Make sure + // we recompute the whole thing next time. + if (i > 0) { + splice_is_valid = false; + } + } + } + } else { + for (int i = 0; i < height; ++i) { + if (i >= recompute_height && + splice->prev_[i]->Next(i) != splice->next_[i]) { + FindSpliceForLevel(key_decoded, splice->prev_[i], nullptr, i, + &splice->prev_[i], &splice->next_[i]); + } + // Checking for duplicate keys on the level 0 is sufficient + if (UNLIKELY(i == 0 && splice->next_[i] != nullptr && + compare_(x->Key(), splice->next_[i]->Key()) >= 0)) { + // duplicate key + return false; + } + if (UNLIKELY(i == 0 && splice->prev_[i] != head_ && + compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) { + // duplicate key + return false; + } + assert(splice->next_[i] == nullptr || + compare_(x->Key(), splice->next_[i]->Key()) < 0); + assert(splice->prev_[i] == head_ || + compare_(splice->prev_[i]->Key(), x->Key()) < 0); + assert(splice->prev_[i]->Next(i) == splice->next_[i]); + x->NoBarrier_SetNext(i, splice->next_[i]); + splice->prev_[i]->SetNext(i, x); + } + } + if (splice_is_valid) { + for (int i = 0; i < height; ++i) { + splice->prev_[i] = x; + } + assert(splice->prev_[splice->height_] == head_); + assert(splice->next_[splice->height_] == nullptr); + for (int i = 0; i < splice->height_; ++i) { + assert(splice->next_[i] == nullptr || + compare_(key, splice->next_[i]->Key()) < 0); + assert(splice->prev_[i] == head_ || + compare_(splice->prev_[i]->Key(), key) <= 0); + assert(splice->prev_[i + 1] == splice->prev_[i] || + splice->prev_[i + 1] == head_ || + compare_(splice->prev_[i + 1]->Key(), splice->prev_[i]->Key()) < + 0); + assert(splice->next_[i + 1] == splice->next_[i] || + splice->next_[i + 1] == nullptr || + compare_(splice->next_[i]->Key(), splice->next_[i + 1]->Key()) < + 0); + } + } else { + splice->height_ = 0; + } + return true; +} + +template +bool InlineSkipList::Contains(const char* key) const { + Node* x = FindGreaterOrEqual(key); + if (x != nullptr && Equal(key, x->Key())) { + return true; + } else { + return false; + } +} + +template +void 
InlineSkipList::TEST_Validate() const { + // Interate over all levels at the same time, and verify nodes appear in + // the right order, and nodes appear in upper level also appear in lower + // levels. + Node* nodes[kMaxPossibleHeight]; + int max_height = GetMaxHeight(); + assert(max_height > 0); + for (int i = 0; i < max_height; i++) { + nodes[i] = head_; + } + while (nodes[0] != nullptr) { + Node* l0_next = nodes[0]->Next(0); + if (l0_next == nullptr) { + break; + } + assert(nodes[0] == head_ || compare_(nodes[0]->Key(), l0_next->Key()) < 0); + nodes[0] = l0_next; + + int i = 1; + while (i < max_height) { + Node* next = nodes[i]->Next(i); + if (next == nullptr) { + break; + } + auto cmp = compare_(nodes[0]->Key(), next->Key()); + assert(cmp <= 0); + if (cmp == 0) { + assert(next == nodes[0]); + nodes[i] = next; + } else { + break; + } + i++; + } + } + for (int i = 1; i < max_height; i++) { + assert(nodes[i] != nullptr && nodes[i]->Next(i) == nullptr); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/skiplist.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/skiplist.h new file mode 100644 index 0000000000..e3cecd30c1 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/skiplist.h @@ -0,0 +1,498 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread safety +// ------------- +// +// Writes require external synchronization, most likely a mutex. +// Reads require a guarantee that the SkipList will not be destroyed +// while the read is in progress. Apart from that, reads progress +// without any internal locking or synchronization. +// +// Invariants: +// +// (1) Allocated nodes are never deleted until the SkipList is +// destroyed. This is trivially guaranteed by the code since we +// never delete any skip list nodes. +// +// (2) The contents of a Node except for the next/prev pointers are +// immutable after the Node has been linked into the SkipList. +// Only Insert() modifies the list, and it is careful to initialize +// a node and use release-stores to publish the nodes in one or +// more lists. +// +// ... prev vs. next pointer ordering ... +// + +#pragma once +#include +#include + +#include + +#include "memory/allocator.h" +#include "port/port.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +template +class SkipList { + private: + struct Node; + + public: + // Create a new SkipList object that will use "cmp" for comparing keys, + // and will allocate memory using "*allocator". Objects allocated in the + // allocator must remain allocated for the lifetime of the skiplist object. + explicit SkipList(Comparator cmp, Allocator* allocator, + int32_t max_height = 12, int32_t branching_factor = 4); + // No copying allowed + SkipList(const SkipList&) = delete; + void operator=(const SkipList&) = delete; + + // Insert key into the list. + // REQUIRES: nothing that compares equal to key is currently in the list. + void Insert(const Key& key); + + // Returns true iff an entry that compares equal to key is in the list. 
+ bool Contains(const Key& key) const; + + // Return estimated number of entries smaller than `key`. + uint64_t EstimateCount(const Key& key) const; + + // Iteration over the contents of a skip list + class Iterator { + public: + // Initialize an iterator over the specified list. + // The returned iterator is not valid. + explicit Iterator(const SkipList* list); + + // Change the underlying skiplist used for this iterator + // This enables us not changing the iterator without deallocating + // an old one and then allocating a new one + void SetList(const SkipList* list); + + // Returns true iff the iterator is positioned at a valid node. + bool Valid() const; + + // Returns the key at the current position. + // REQUIRES: Valid() + const Key& key() const; + + // Advances to the next position. + // REQUIRES: Valid() + void Next(); + + // Advances to the previous position. + // REQUIRES: Valid() + void Prev(); + + // Advance to the first entry with a key >= target + void Seek(const Key& target); + + // Retreat to the last entry with a key <= target + void SeekForPrev(const Key& target); + + // Position at the first entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToFirst(); + + // Position at the last entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToLast(); + + private: + const SkipList* list_; + Node* node_; + // Intentionally copyable + }; + + private: + const uint16_t kMaxHeight_; + const uint16_t kBranching_; + const uint32_t kScaledInverseBranching_; + + // Immutable after construction + Comparator const compare_; + Allocator* const allocator_; // Allocator used for allocations of nodes + + Node* const head_; + + // Modified only by Insert(). Read racily by readers, but stale + // values are ok. + std::atomic max_height_; // Height of the entire list + + // Used for optimizing sequential insert patterns. Tricky. prev_[i] for + // i up to max_height_ is the predecessor of prev_[0] and prev_height_ + // is the height of prev_[0]. prev_[0] can only be equal to head before + // insertion, in which case max_height_ and prev_height_ are 1. + Node** prev_; + int32_t prev_height_; + + inline int GetMaxHeight() const { + return max_height_.load(std::memory_order_relaxed); + } + + Node* NewNode(const Key& key, int height); + int RandomHeight(); + bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } + bool LessThan(const Key& a, const Key& b) const { + return (compare_(a, b) < 0); + } + + // Return true if key is greater than the data stored in "n" + bool KeyIsAfterNode(const Key& key, Node* n) const; + + // Returns the earliest node with a key >= key. + // Return nullptr if there is no such node. + Node* FindGreaterOrEqual(const Key& key) const; + + // Return the latest node with a key < key. + // Return head_ if there is no such node. + // Fills prev[level] with pointer to previous node at "level" for every + // level in [0..max_height_-1], if prev is non-null. + Node* FindLessThan(const Key& key, Node** prev = nullptr) const; + + // Return the last node in the list. + // Return head_ if list is empty. + Node* FindLast() const; +}; + +// Implementation details follow +template +struct SkipList::Node { + explicit Node(const Key& k) : key(k) {} + + Key const key; + + // Accessors/mutators for links. Wrapped in methods so we can + // add the appropriate barriers as necessary. 
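+  // The release/acquire pair is what makes lock-free readers safe: the
+  // writer fully initializes a node and only then publishes it with a
+  // release store; a reader that observes the pointer via an acquire load
+  // is guaranteed to see the initialized node. Roughly:
+  //
+  //   writer: Node* n = NewNode(key, h); prev->SetNext(0, n);  // release
+  //   reader: Node* n = prev->Next(0); use(n->key);            // acquire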
+ Node* Next(int n) { + assert(n >= 0); + // Use an 'acquire load' so that we observe a fully initialized + // version of the returned Node. + return (next_[n].load(std::memory_order_acquire)); + } + void SetNext(int n, Node* x) { + assert(n >= 0); + // Use a 'release store' so that anybody who reads through this + // pointer observes a fully initialized version of the inserted node. + next_[n].store(x, std::memory_order_release); + } + + // No-barrier variants that can be safely used in a few locations. + Node* NoBarrier_Next(int n) { + assert(n >= 0); + return next_[n].load(std::memory_order_relaxed); + } + void NoBarrier_SetNext(int n, Node* x) { + assert(n >= 0); + next_[n].store(x, std::memory_order_relaxed); + } + + private: + // Array of length equal to the node height. next_[0] is lowest level link. + std::atomic next_[1]; +}; + +template +typename SkipList::Node* SkipList::NewNode( + const Key& key, int height) { + char* mem = allocator_->AllocateAligned( + sizeof(Node) + sizeof(std::atomic) * (height - 1)); + return new (mem) Node(key); +} + +template +inline SkipList::Iterator::Iterator(const SkipList* list) { + SetList(list); +} + +template +inline void SkipList::Iterator::SetList(const SkipList* list) { + list_ = list; + node_ = nullptr; +} + +template +inline bool SkipList::Iterator::Valid() const { + return node_ != nullptr; +} + +template +inline const Key& SkipList::Iterator::key() const { + assert(Valid()); + return node_->key; +} + +template +inline void SkipList::Iterator::Next() { + assert(Valid()); + node_ = node_->Next(0); +} + +template +inline void SkipList::Iterator::Prev() { + // Instead of using explicit "prev" links, we just search for the + // last node that falls before key. + assert(Valid()); + node_ = list_->FindLessThan(node_->key); + if (node_ == list_->head_) { + node_ = nullptr; + } +} + +template +inline void SkipList::Iterator::Seek(const Key& target) { + node_ = list_->FindGreaterOrEqual(target); +} + +template +inline void SkipList::Iterator::SeekForPrev( + const Key& target) { + Seek(target); + if (!Valid()) { + SeekToLast(); + } + while (Valid() && list_->LessThan(target, key())) { + Prev(); + } +} + +template +inline void SkipList::Iterator::SeekToFirst() { + node_ = list_->head_->Next(0); +} + +template +inline void SkipList::Iterator::SeekToLast() { + node_ = list_->FindLast(); + if (node_ == list_->head_) { + node_ = nullptr; + } +} + +template +int SkipList::RandomHeight() { + auto rnd = Random::GetTLSInstance(); + + // Increase height with probability 1 in kBranching + int height = 1; + while (height < kMaxHeight_ && rnd->Next() < kScaledInverseBranching_) { + height++; + } + assert(height > 0); + assert(height <= kMaxHeight_); + return height; +} + +template +bool SkipList::KeyIsAfterNode(const Key& key, Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, key) < 0); +} + +template +typename SkipList::Node* +SkipList::FindGreaterOrEqual(const Key& key) const { + // Note: It looks like we could reduce duplication by implementing + // this function as FindLessThan(key)->Next(0), but we wouldn't be able + // to exit early on equality and the result wouldn't even be correct. + // A concurrent insert might occur after FindLessThan(key) but before + // we get a chance to call Next(0). 
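+  // The loop below follows the classic skip-list search pattern: at each
+  // level, move right while the next node's key is less than the target,
+  // otherwise drop one level; at level 0 the first node not less than the
+  // target is the answer (or nullptr if none exists).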
+ Node* x = head_; + int level = GetMaxHeight() - 1; + Node* last_bigger = nullptr; + while (true) { + assert(x != nullptr); + Node* next = x->Next(level); + // Make sure the lists are sorted + assert(x == head_ || next == nullptr || KeyIsAfterNode(next->key, x)); + // Make sure we haven't overshot during our search + assert(x == head_ || KeyIsAfterNode(key, x)); + int cmp = + (next == nullptr || next == last_bigger) ? 1 : compare_(next->key, key); + if (cmp == 0 || (cmp > 0 && level == 0)) { + return next; + } else if (cmp < 0) { + // Keep searching in this list + x = next; + } else { + // Switch to next list, reuse compare_() result + last_bigger = next; + level--; + } + } +} + +template +typename SkipList::Node* +SkipList::FindLessThan(const Key& key, Node** prev) const { + Node* x = head_; + int level = GetMaxHeight() - 1; + // KeyIsAfter(key, last_not_after) is definitely false + Node* last_not_after = nullptr; + while (true) { + assert(x != nullptr); + Node* next = x->Next(level); + assert(x == head_ || next == nullptr || KeyIsAfterNode(next->key, x)); + assert(x == head_ || KeyIsAfterNode(key, x)); + if (next != last_not_after && KeyIsAfterNode(key, next)) { + // Keep searching in this list + x = next; + } else { + if (prev != nullptr) { + prev[level] = x; + } + if (level == 0) { + return x; + } else { + // Switch to next list, reuse KeyIUsAfterNode() result + last_not_after = next; + level--; + } + } + } +} + +template +typename SkipList::Node* SkipList::FindLast() + const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + Node* next = x->Next(level); + if (next == nullptr) { + if (level == 0) { + return x; + } else { + // Switch to next list + level--; + } + } else { + x = next; + } + } +} + +template +uint64_t SkipList::EstimateCount(const Key& key) const { + uint64_t count = 0; + + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + assert(x == head_ || compare_(x->key, key) < 0); + Node* next = x->Next(level); + if (next == nullptr || compare_(next->key, key) >= 0) { + if (level == 0) { + return count; + } else { + // Switch to next list + count *= kBranching_; + level--; + } + } else { + x = next; + count++; + } + } +} + +template +SkipList::SkipList(const Comparator cmp, Allocator* allocator, + int32_t max_height, + int32_t branching_factor) + : kMaxHeight_(static_cast(max_height)), + kBranching_(static_cast(branching_factor)), + kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_), + compare_(cmp), + allocator_(allocator), + head_(NewNode(0 /* any key will do */, max_height)), + max_height_(1), + prev_height_(1) { + assert(max_height > 0 && kMaxHeight_ == static_cast(max_height)); + assert(branching_factor > 0 && + kBranching_ == static_cast(branching_factor)); + assert(kScaledInverseBranching_ > 0); + // Allocate the prev_ Node* array, directly from the passed-in allocator. + // prev_ does not need to be freed, as its life cycle is tied up with + // the allocator as a whole. 
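+  // (prev_ is the cache that powers the sequential-insert fast path in
+  // Insert() below.)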
+ prev_ = reinterpret_cast( + allocator_->AllocateAligned(sizeof(Node*) * kMaxHeight_)); + for (int i = 0; i < kMaxHeight_; i++) { + head_->SetNext(i, nullptr); + prev_[i] = head_; + } +} + +template +void SkipList::Insert(const Key& key) { + // fast path for sequential insertion + if (!KeyIsAfterNode(key, prev_[0]->NoBarrier_Next(0)) && + (prev_[0] == head_ || KeyIsAfterNode(key, prev_[0]))) { + assert(prev_[0] != head_ || (prev_height_ == 1 && GetMaxHeight() == 1)); + + // Outside of this method prev_[1..max_height_] is the predecessor + // of prev_[0], and prev_height_ refers to prev_[0]. Inside Insert + // prev_[0..max_height - 1] is the predecessor of key. Switch from + // the external state to the internal + for (int i = 1; i < prev_height_; i++) { + prev_[i] = prev_[0]; + } + } else { + // TODO(opt): we could use a NoBarrier predecessor search as an + // optimization for architectures where memory_order_acquire needs + // a synchronization instruction. Doesn't matter on x86 + FindLessThan(key, prev_); + } + + // Our data structure does not allow duplicate insertion + assert(prev_[0]->Next(0) == nullptr || !Equal(key, prev_[0]->Next(0)->key)); + + int height = RandomHeight(); + if (height > GetMaxHeight()) { + for (int i = GetMaxHeight(); i < height; i++) { + prev_[i] = head_; + } + // fprintf(stderr, "Change height from %d to %d\n", max_height_, height); + + // It is ok to mutate max_height_ without any synchronization + // with concurrent readers. A concurrent reader that observes + // the new value of max_height_ will see either the old value of + // new level pointers from head_ (nullptr), or a new value set in + // the loop below. In the former case the reader will + // immediately drop to the next level since nullptr sorts after all + // keys. In the latter case the reader will use the new node. + max_height_.store(height, std::memory_order_relaxed); + } + + Node* x = NewNode(key, height); + for (int i = 0; i < height; i++) { + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. + x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i)); + prev_[i]->SetNext(i, x); + } + prev_[0] = x; + prev_height_ = height; +} + +template +bool SkipList::Contains(const Key& key) const { + Node* x = FindGreaterOrEqual(key); + if (x != nullptr && Equal(key, x->key)) { + return true; + } else { + return false; + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/skiplistrep.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/skiplistrep.cc new file mode 100644 index 0000000000..c3b4c785d3 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/skiplistrep.cc @@ -0,0 +1,368 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
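+//
+// SkipListRep (defined below) is the default memtable representation: a thin
+// MemTableRep adapter over InlineSkipList, plus an optional "lookahead"
+// iterator that scans a few nodes forward from the previously visited
+// position before falling back to a full O(log n) Seek.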
+// +#include + +#include "db/memtable.h" +#include "memory/arena.h" +#include "memtable/inlineskiplist.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +class SkipListRep : public MemTableRep { + InlineSkipList skip_list_; + const MemTableRep::KeyComparator& cmp_; + const SliceTransform* transform_; + const size_t lookahead_; + + friend class LookaheadIterator; + + public: + explicit SkipListRep(const MemTableRep::KeyComparator& compare, + Allocator* allocator, const SliceTransform* transform, + const size_t lookahead) + : MemTableRep(allocator), + skip_list_(compare, allocator), + cmp_(compare), + transform_(transform), + lookahead_(lookahead) {} + + KeyHandle Allocate(const size_t len, char** buf) override { + *buf = skip_list_.AllocateKey(len); + return static_cast(*buf); + } + + // Insert key into the list. + // REQUIRES: nothing that compares equal to key is currently in the list. + void Insert(KeyHandle handle) override { + skip_list_.Insert(static_cast(handle)); + } + + bool InsertKey(KeyHandle handle) override { + return skip_list_.Insert(static_cast(handle)); + } + + void InsertWithHint(KeyHandle handle, void** hint) override { + skip_list_.InsertWithHint(static_cast(handle), hint); + } + + bool InsertKeyWithHint(KeyHandle handle, void** hint) override { + return skip_list_.InsertWithHint(static_cast(handle), hint); + } + + void InsertWithHintConcurrently(KeyHandle handle, void** hint) override { + skip_list_.InsertWithHintConcurrently(static_cast(handle), hint); + } + + bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) override { + return skip_list_.InsertWithHintConcurrently(static_cast(handle), + hint); + } + + void InsertConcurrently(KeyHandle handle) override { + skip_list_.InsertConcurrently(static_cast(handle)); + } + + bool InsertKeyConcurrently(KeyHandle handle) override { + return skip_list_.InsertConcurrently(static_cast(handle)); + } + + // Returns true iff an entry that compares equal to key is in the list. + bool Contains(const char* key) const override { + return skip_list_.Contains(key); + } + + size_t ApproximateMemoryUsage() override { + // All memory is allocated through allocator; nothing to report here + return 0; + } + + void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) override { + SkipListRep::Iterator iter(&skip_list_); + Slice dummy_slice; + for (iter.Seek(dummy_slice, k.memtable_key().data()); + iter.Valid() && callback_func(callback_args, iter.key()); + iter.Next()) { + } + } + + uint64_t ApproximateNumEntries(const Slice& start_ikey, + const Slice& end_ikey) override { + std::string tmp; + uint64_t start_count = + skip_list_.EstimateCount(EncodeKey(&tmp, start_ikey)); + uint64_t end_count = skip_list_.EstimateCount(EncodeKey(&tmp, end_ikey)); + return (end_count >= start_count) ? (end_count - start_count) : 0; + } + + void UniqueRandomSample(const uint64_t num_entries, + const uint64_t target_sample_size, + std::unordered_set* entries) override { + entries->clear(); + // Avoid divide-by-0. + assert(target_sample_size > 0); + assert(num_entries > 0); + // NOTE: the size of entries is not enforced to be exactly + // target_sample_size at the end of this function, it might be slightly + // greater or smaller. 
+    SkipListRep::Iterator iter(&skip_list_);
+    // There are two methods to create the subset of samples (size m)
+    // from the table containing N elements:
+    //  1-Iterate linearly through the N memtable entries. For each entry i,
+    //    add it to the sample set with a probability
+    //    (target_sample_size - entries.size() ) / (N-i).
+    //
+    //  2-Pick m random elements without repetition.
+    // We pick Option 2 when m<sqrt(N) and
+    // Option 1 when m > sqrt(N).
+    if (target_sample_size >
+        static_cast<uint64_t>(std::sqrt(1.0 * num_entries))) {
+      Random* rnd = Random::GetTLSInstance();
+      iter.SeekToFirst();
+      uint64_t counter = 0, num_samples_left = target_sample_size;
+      for (; iter.Valid() && (num_samples_left > 0); iter.Next(), counter++) {
+        // Add entry to sample set with probability
+        // num_samples_left/(num_entries - counter).
+        if (rnd->Next() % (num_entries - counter) < num_samples_left) {
+          entries->insert(iter.key());
+          num_samples_left--;
+        }
+      }
+    } else {
+      // Option 2: pick m random elements with no duplicates.
+      // If Option 2 is picked, then target_sample_size<sqrt(N)
+      // Using a set spares the need to check for duplicates.
+      for (uint64_t i = 0; i < target_sample_size; i++) {
+        // We give it 5 attempts to find a non-duplicate
+        // With 5 attempts, the chances of returning `entries` set
+        // of size target_sample_size is:
+        // PROD_{i=1}^{target_sample_size-1} [1-(i/N)^5]
+        // which is monotonically increasing with N in the worse case
+        // of target_sample_size=sqrt(N), and is always >99.9% for N>4.
+        // At worst, for the final pick , when m=sqrt(N) there is
+        // a probability of p= 1/sqrt(N) chances to find a duplicate.
+        for (uint64_t j = 0; j < 5; j++) {
+          iter.RandomSeek();
+          // unordered_set::insert returns pair<iterator, bool>.
+          // The second element is true if an insert successfully happened.
+          // If element is already in the set, this bool will be false, and
+          // true otherwise.
+          if ((entries->insert(iter.key())).second) {
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  ~SkipListRep() override {}
+
+  // Iteration over the contents of a skip list
+  class Iterator : public MemTableRep::Iterator {
+    InlineSkipList<const MemTableRep::KeyComparator&>::Iterator iter_;
+
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(
+        const InlineSkipList<const MemTableRep::KeyComparator&>* list)
+        : iter_(list) {}
+
+    ~Iterator() override {}
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const override { return iter_.Valid(); }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const char* key() const override { return iter_.key(); }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next() override { iter_.Next(); }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev() override { iter_.Prev(); }
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Slice& user_key, const char* memtable_key) override {
+      if (memtable_key != nullptr) {
+        iter_.Seek(memtable_key);
+      } else {
+        iter_.Seek(EncodeKey(&tmp_, user_key));
+      }
+    }
+
+    // Retreat to the last entry with a key <= target
+    void SeekForPrev(const Slice& user_key,
+                     const char* memtable_key) override {
+      if (memtable_key != nullptr) {
+        iter_.SeekForPrev(memtable_key);
+      } else {
+        iter_.SeekForPrev(EncodeKey(&tmp_, user_key));
+      }
+    }
+
+    void RandomSeek() override { iter_.RandomSeek(); }
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToFirst() override { iter_.SeekToFirst(); }
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToLast() override { iter_.SeekToLast(); }
+
+   protected:
+    std::string tmp_;  // For passing to EncodeKey
+  };
+
+  // Iterator over the contents of a skip list which also keeps track of the
+  // previously visited node.
In Seek(), it examines a few nodes after it + // first, falling back to O(log n) search from the head of the list only if + // the target key hasn't been found. + class LookaheadIterator : public MemTableRep::Iterator { + public: + explicit LookaheadIterator(const SkipListRep& rep) + : rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {} + + ~LookaheadIterator() override {} + + bool Valid() const override { return iter_.Valid(); } + + const char* key() const override { + assert(Valid()); + return iter_.key(); + } + + void Next() override { + assert(Valid()); + + bool advance_prev = true; + if (prev_.Valid()) { + auto k1 = rep_.UserKey(prev_.key()); + auto k2 = rep_.UserKey(iter_.key()); + + if (k1.compare(k2) == 0) { + // same user key, don't move prev_ + advance_prev = false; + } else if (rep_.transform_) { + // only advance prev_ if it has the same prefix as iter_ + auto t1 = rep_.transform_->Transform(k1); + auto t2 = rep_.transform_->Transform(k2); + advance_prev = t1.compare(t2) == 0; + } + } + + if (advance_prev) { + prev_ = iter_; + } + iter_.Next(); + } + + void Prev() override { + assert(Valid()); + iter_.Prev(); + prev_ = iter_; + } + + void Seek(const Slice& internal_key, const char* memtable_key) override { + const char* encoded_key = (memtable_key != nullptr) + ? memtable_key + : EncodeKey(&tmp_, internal_key); + + if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) { + // prev_.key() is smaller or equal to our target key; do a quick + // linear search (at most lookahead_ steps) starting from prev_ + iter_ = prev_; + + size_t cur = 0; + while (cur++ <= rep_.lookahead_ && iter_.Valid()) { + if (rep_.cmp_(encoded_key, iter_.key()) <= 0) { + return; + } + Next(); + } + } + + iter_.Seek(encoded_key); + prev_ = iter_; + } + + void SeekForPrev(const Slice& internal_key, + const char* memtable_key) override { + const char* encoded_key = (memtable_key != nullptr) + ? memtable_key + : EncodeKey(&tmp_, internal_key); + iter_.SeekForPrev(encoded_key); + prev_ = iter_; + } + + void SeekToFirst() override { + iter_.SeekToFirst(); + prev_ = iter_; + } + + void SeekToLast() override { + iter_.SeekToLast(); + prev_ = iter_; + } + + protected: + std::string tmp_; // For passing to EncodeKey + + private: + const SkipListRep& rep_; + InlineSkipList::Iterator iter_; + InlineSkipList::Iterator prev_; + }; + + MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { + if (lookahead_ > 0) { + void* mem = + arena ? arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator)) + : + operator new(sizeof(SkipListRep::LookaheadIterator)); + return new (mem) SkipListRep::LookaheadIterator(*this); + } else { + void* mem = arena ? 
arena->AllocateAligned(sizeof(SkipListRep::Iterator)) + : + operator new(sizeof(SkipListRep::Iterator)); + return new (mem) SkipListRep::Iterator(&skip_list_); + } + } +}; +} // namespace + +static std::unordered_map skiplist_factory_info = { + {"lookahead", + {0, OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kDontSerialize /*Since it is part of the ID*/}}, +}; + +SkipListFactory::SkipListFactory(size_t lookahead) : lookahead_(lookahead) { + RegisterOptions("SkipListFactoryOptions", &lookahead_, + &skiplist_factory_info); +} + +std::string SkipListFactory::GetId() const { + std::string id = Name(); + if (lookahead_ > 0) { + id.append(":").append(std::to_string(lookahead_)); + } + return id; +} + +MemTableRep* SkipListFactory::CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* transform, Logger* /*logger*/) { + return new SkipListRep(compare, allocator, transform, lookahead_); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/stl_wrappers.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/stl_wrappers.h new file mode 100644 index 0000000000..783a8088d0 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/stl_wrappers.h @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/slice.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +namespace stl_wrappers { + +class Base { + protected: + const MemTableRep::KeyComparator& compare_; + explicit Base(const MemTableRep::KeyComparator& compare) + : compare_(compare) {} +}; + +struct Compare : private Base { + explicit Compare(const MemTableRep::KeyComparator& compare) : Base(compare) {} + inline bool operator()(const char* a, const char* b) const { + return compare_(a, b) < 0; + } +}; + +} // namespace stl_wrappers +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/vectorrep.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/vectorrep.cc new file mode 100644 index 0000000000..e42ae4439c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/vectorrep.cc @@ -0,0 +1,307 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include +#include +#include +#include +#include + +#include "db/memtable.h" +#include "memory/arena.h" +#include "memtable/stl_wrappers.h" +#include "port/port.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/utilities/options_type.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { +namespace { + +class VectorRep : public MemTableRep { + public: + VectorRep(const KeyComparator& compare, Allocator* allocator, size_t count); + + // Insert key into the collection. (The caller will pack key and value into a + // single buffer and pass that in as the parameter to Insert) + // REQUIRES: nothing that compares equal to key is currently in the + // collection. 
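The GetIterator implementation above either placement-constructs the iterator in arena-owned memory or heap-allocates it. Before the VectorRep declarations continue below, here is a minimal sketch of that allocation pattern, with MiniArena as an assumed stand-in for rocksdb::Arena; all names here are invented for illustration.

#include <cstddef>
#include <new>
#include <utility>
#include <vector>

// Minimal bump-style allocator standing in for rocksdb::Arena.
class MiniArena {
 public:
  char* AllocateAligned(size_t n) {
    blocks_.emplace_back(new char[n]);
    return blocks_.back();
  }
  ~MiniArena() {
    for (char* b : blocks_) delete[] b;
  }

 private:
  std::vector<char*> blocks_;
};

template <typename Iter, typename... Args>
Iter* MakeIterator(MiniArena* arena, Args&&... args) {
  // Arena-backed objects are constructed with placement new; their
  // memory is reclaimed when the arena dies, so callers must not call
  // delete on them (this mirrors the RocksDB convention above).
  void* mem = arena ? arena->AllocateAligned(sizeof(Iter))
                    : ::operator new(sizeof(Iter));
  return new (mem) Iter(std::forward<Args>(args)...);
}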
+ void Insert(KeyHandle handle) override; + + // Returns true iff an entry that compares equal to key is in the collection. + bool Contains(const char* key) const override; + + void MarkReadOnly() override; + + size_t ApproximateMemoryUsage() override; + + void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) override; + + ~VectorRep() override {} + + class Iterator : public MemTableRep::Iterator { + class VectorRep* vrep_; + std::shared_ptr> bucket_; + std::vector::const_iterator mutable cit_; + const KeyComparator& compare_; + std::string tmp_; // For passing to EncodeKey + bool mutable sorted_; + void DoSort() const; + + public: + explicit Iterator(class VectorRep* vrep, + std::shared_ptr> bucket, + const KeyComparator& compare); + + // Initialize an iterator over the specified collection. + // The returned iterator is not valid. + // explicit Iterator(const MemTableRep* collection); + ~Iterator() override{}; + + // Returns true iff the iterator is positioned at a valid node. + bool Valid() const override; + + // Returns the key at the current position. + // REQUIRES: Valid() + const char* key() const override; + + // Advances to the next position. + // REQUIRES: Valid() + void Next() override; + + // Advances to the previous position. + // REQUIRES: Valid() + void Prev() override; + + // Advance to the first entry with a key >= target + void Seek(const Slice& user_key, const char* memtable_key) override; + + // Advance to the first entry with a key <= target + void SeekForPrev(const Slice& user_key, const char* memtable_key) override; + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + void SeekToFirst() override; + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + void SeekToLast() override; + }; + + // Return an iterator over the keys in this representation. + MemTableRep::Iterator* GetIterator(Arena* arena) override; + + private: + friend class Iterator; + using Bucket = std::vector; + std::shared_ptr bucket_; + mutable port::RWMutex rwlock_; + bool immutable_; + bool sorted_; + const KeyComparator& compare_; +}; + +void VectorRep::Insert(KeyHandle handle) { + auto* key = static_cast(handle); + WriteLock l(&rwlock_); + assert(!immutable_); + bucket_->push_back(key); +} + +// Returns true iff an entry that compares equal to key is in the collection. 
+bool VectorRep::Contains(const char* key) const { + ReadLock l(&rwlock_); + return std::find(bucket_->begin(), bucket_->end(), key) != bucket_->end(); +} + +void VectorRep::MarkReadOnly() { + WriteLock l(&rwlock_); + immutable_ = true; +} + +size_t VectorRep::ApproximateMemoryUsage() { + return sizeof(bucket_) + sizeof(*bucket_) + + bucket_->size() * + sizeof( + std::remove_reference::type::value_type); +} + +VectorRep::VectorRep(const KeyComparator& compare, Allocator* allocator, + size_t count) + : MemTableRep(allocator), + bucket_(new Bucket()), + immutable_(false), + sorted_(false), + compare_(compare) { + bucket_.get()->reserve(count); +} + +VectorRep::Iterator::Iterator(class VectorRep* vrep, + std::shared_ptr> bucket, + const KeyComparator& compare) + : vrep_(vrep), + bucket_(bucket), + cit_(bucket_->end()), + compare_(compare), + sorted_(false) {} + +void VectorRep::Iterator::DoSort() const { + // vrep is non-null means that we are working on an immutable memtable + if (!sorted_ && vrep_ != nullptr) { + WriteLock l(&vrep_->rwlock_); + if (!vrep_->sorted_) { + std::sort(bucket_->begin(), bucket_->end(), + stl_wrappers::Compare(compare_)); + cit_ = bucket_->begin(); + vrep_->sorted_ = true; + } + sorted_ = true; + } + if (!sorted_) { + std::sort(bucket_->begin(), bucket_->end(), + stl_wrappers::Compare(compare_)); + cit_ = bucket_->begin(); + sorted_ = true; + } + assert(sorted_); + assert(vrep_ == nullptr || vrep_->sorted_); +} + +// Returns true iff the iterator is positioned at a valid node. +bool VectorRep::Iterator::Valid() const { + DoSort(); + return cit_ != bucket_->end(); +} + +// Returns the key at the current position. +// REQUIRES: Valid() +const char* VectorRep::Iterator::key() const { + assert(sorted_); + return *cit_; +} + +// Advances to the next position. +// REQUIRES: Valid() +void VectorRep::Iterator::Next() { + assert(sorted_); + if (cit_ == bucket_->end()) { + return; + } + ++cit_; +} + +// Advances to the previous position. +// REQUIRES: Valid() +void VectorRep::Iterator::Prev() { + assert(sorted_); + if (cit_ == bucket_->begin()) { + // If you try to go back from the first element, the iterator should be + // invalidated. So we set it to past-the-end. This means that you can + // treat the container circularly. + cit_ = bucket_->end(); + } else { + --cit_; + } +} + +// Advance to the first entry with a key >= target +void VectorRep::Iterator::Seek(const Slice& user_key, + const char* memtable_key) { + DoSort(); + // Do binary search to find first value not less than the target + const char* encoded_key = + (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, user_key); + cit_ = std::equal_range(bucket_->begin(), bucket_->end(), encoded_key, + [this](const char* a, const char* b) { + return compare_(a, b) < 0; + }) + .first; +} + +// Advance to the first entry with a key <= target +void VectorRep::Iterator::SeekForPrev(const Slice& /*user_key*/, + const char* /*memtable_key*/) { + assert(false); +} + +// Position at the first entry in collection. +// Final state of iterator is Valid() iff collection is not empty. +void VectorRep::Iterator::SeekToFirst() { + DoSort(); + cit_ = bucket_->begin(); +} + +// Position at the last entry in collection. +// Final state of iterator is Valid() iff collection is not empty. 
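DoSort() and Seek() above implement sort-on-first-read: appends stay O(1), the vector is sorted once when reads begin, and lookups then binary-search it (std::equal_range's lower bound). A reduced, illustrative sketch of the same scheme on a plain std::vector follows; VectorRep::Iterator::SeekToLast resumes after it.

#include <algorithm>
#include <string>
#include <vector>

class LazySortedVector {
 public:
  void Add(std::string key) {
    keys_.push_back(std::move(key));
    sorted_ = false;
  }

  // Returns the first element >= target, or nullptr. Sorting happens
  // lazily, so the sort cost is paid once per read phase.
  const std::string* Seek(const std::string& target) {
    if (!sorted_) {
      std::sort(keys_.begin(), keys_.end());
      sorted_ = true;
    }
    auto it = std::lower_bound(keys_.begin(), keys_.end(), target);
    return it == keys_.end() ? nullptr : &*it;
  }

 private:
  std::vector<std::string> keys_;
  bool sorted_ = false;
};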
+void VectorRep::Iterator::SeekToLast() { + DoSort(); + cit_ = bucket_->end(); + if (bucket_->size() != 0) { + --cit_; + } +} + +void VectorRep::Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) { + rwlock_.ReadLock(); + VectorRep* vector_rep; + std::shared_ptr bucket; + if (immutable_) { + vector_rep = this; + } else { + vector_rep = nullptr; + bucket.reset(new Bucket(*bucket_)); // make a copy + } + VectorRep::Iterator iter(vector_rep, immutable_ ? bucket_ : bucket, compare_); + rwlock_.ReadUnlock(); + + for (iter.Seek(k.user_key(), k.memtable_key().data()); + iter.Valid() && callback_func(callback_args, iter.key()); iter.Next()) { + } +} + +MemTableRep::Iterator* VectorRep::GetIterator(Arena* arena) { + char* mem = nullptr; + if (arena != nullptr) { + mem = arena->AllocateAligned(sizeof(Iterator)); + } + ReadLock l(&rwlock_); + // Do not sort here. The sorting would be done the first time + // a Seek is performed on the iterator. + if (immutable_) { + if (arena == nullptr) { + return new Iterator(this, bucket_, compare_); + } else { + return new (mem) Iterator(this, bucket_, compare_); + } + } else { + std::shared_ptr tmp; + tmp.reset(new Bucket(*bucket_)); // make a copy + if (arena == nullptr) { + return new Iterator(nullptr, tmp, compare_); + } else { + return new (mem) Iterator(nullptr, tmp, compare_); + } + } +} +} // namespace + +static std::unordered_map vector_rep_table_info = { + {"count", + {0, OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +VectorRepFactory::VectorRepFactory(size_t count) : count_(count) { + RegisterOptions("VectorRepFactoryOptions", &count_, &vector_rep_table_info); +} + +MemTableRep* VectorRepFactory::CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform*, Logger* /*logger*/) { + return new VectorRep(compare, allocator, count_); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/write_buffer_manager.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/write_buffer_manager.cc new file mode 100644 index 0000000000..ce1789c20d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/memtable/write_buffer_manager.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
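VectorRep::Get above takes a short read lock, copies the bucket while the memtable is still mutable, and shares the original vector once it has been marked immutable; iteration then proceeds without the lock. A self-contained sketch of that copy-on-read snapshot trick, assuming nothing beyond the standard library:

#include <memory>
#include <mutex>
#include <vector>

class SnapshotVector {
 public:
  void Add(int v) {
    std::lock_guard<std::mutex> l(mu_);
    data_->push_back(v);
  }
  void Freeze() {
    std::lock_guard<std::mutex> l(mu_);
    frozen_ = true;
  }

  // Readers get either the shared immutable vector or a private copy,
  // and in both cases iterate without holding the lock.
  std::shared_ptr<const std::vector<int>> Snapshot() {
    std::lock_guard<std::mutex> l(mu_);
    if (frozen_) return data_;                           // share, no copy
    return std::make_shared<std::vector<int>>(*data_);  // copy
  }

 private:
  std::mutex mu_;
  std::shared_ptr<std::vector<int>> data_ =
      std::make_shared<std::vector<int>>();
  bool frozen_ = false;
};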
+ +#include "rocksdb/write_buffer_manager.h" + +#include + +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" +#include "db/db_impl/db_impl.h" +#include "rocksdb/status.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +WriteBufferManager::WriteBufferManager(size_t _buffer_size, + std::shared_ptr cache, + bool allow_stall) + : buffer_size_(_buffer_size), + mutable_limit_(buffer_size_ * 7 / 8), + memory_used_(0), + memory_active_(0), + cache_res_mgr_(nullptr), + allow_stall_(allow_stall), + stall_active_(false) { + if (cache) { + // Memtable's memory usage tends to fluctuate frequently + // therefore we set delayed_decrease = true to save some dummy entry + // insertion on memory increase right after memory decrease + cache_res_mgr_ = std::make_shared< + CacheReservationManagerImpl>( + cache, true /* delayed_decrease */); + } +} + +WriteBufferManager::~WriteBufferManager() { +#ifndef NDEBUG + std::unique_lock lock(mu_); + assert(queue_.empty()); +#endif +} + +std::size_t WriteBufferManager::dummy_entries_in_cache_usage() const { + if (cache_res_mgr_ != nullptr) { + return cache_res_mgr_->GetTotalReservedCacheSize(); + } else { + return 0; + } +} + +void WriteBufferManager::ReserveMem(size_t mem) { + if (cache_res_mgr_ != nullptr) { + ReserveMemWithCache(mem); + } else if (enabled()) { + memory_used_.fetch_add(mem, std::memory_order_relaxed); + } + if (enabled()) { + memory_active_.fetch_add(mem, std::memory_order_relaxed); + } +} + +// Should only be called from write thread +void WriteBufferManager::ReserveMemWithCache(size_t mem) { + assert(cache_res_mgr_ != nullptr); + // Use a mutex to protect various data structures. Can be optimized to a + // lock-free solution if it ends up with a performance bottleneck. + std::lock_guard lock(cache_res_mgr_mu_); + + size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) + mem; + memory_used_.store(new_mem_used, std::memory_order_relaxed); + Status s = cache_res_mgr_->UpdateCacheReservation(new_mem_used); + + // We absorb the error since WriteBufferManager is not able to handle + // this failure properly. Ideallly we should prevent this allocation + // from happening if this cache charging fails. + // [TODO] We'll need to improve it in the future and figure out what to do on + // error + s.PermitUncheckedError(); +} + +void WriteBufferManager::ScheduleFreeMem(size_t mem) { + if (enabled()) { + memory_active_.fetch_sub(mem, std::memory_order_relaxed); + } +} + +void WriteBufferManager::FreeMem(size_t mem) { + if (cache_res_mgr_ != nullptr) { + FreeMemWithCache(mem); + } else if (enabled()) { + memory_used_.fetch_sub(mem, std::memory_order_relaxed); + } + // Check if stall is active and can be ended. + MaybeEndWriteStall(); +} + +void WriteBufferManager::FreeMemWithCache(size_t mem) { + assert(cache_res_mgr_ != nullptr); + // Use a mutex to protect various data structures. Can be optimized to a + // lock-free solution if it ends up with a performance bottleneck. + std::lock_guard lock(cache_res_mgr_mu_); + size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) - mem; + memory_used_.store(new_mem_used, std::memory_order_relaxed); + Status s = cache_res_mgr_->UpdateCacheReservation(new_mem_used); + + // We absorb the error since WriteBufferManager is not able to handle + // this failure properly. 
+ // [TODO] We'll need to improve it in the future and figure out what to do on + // error + s.PermitUncheckedError(); +} + +void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { + assert(wbm_stall != nullptr); + + // Allocate outside of the lock. + std::list new_node = {wbm_stall}; + + { + std::unique_lock lock(mu_); + // Verify if the stall conditions are stil active. + if (ShouldStall()) { + stall_active_.store(true, std::memory_order_relaxed); + queue_.splice(queue_.end(), std::move(new_node)); + } + } + + // If the node was not consumed, the stall has ended already and we can signal + // the caller. + if (!new_node.empty()) { + new_node.front()->Signal(); + } +} + +// Called when memory is freed in FreeMem or the buffer size has changed. +void WriteBufferManager::MaybeEndWriteStall() { + // Stall conditions have not been resolved. + if (allow_stall_.load(std::memory_order_relaxed) && + IsStallThresholdExceeded()) { + return; + } + + // Perform all deallocations outside of the lock. + std::list cleanup; + + std::unique_lock lock(mu_); + if (!stall_active_.load(std::memory_order_relaxed)) { + return; // Nothing to do. + } + + // Unblock new writers. + stall_active_.store(false, std::memory_order_relaxed); + + // Unblock the writers in the queue. + for (StallInterface* wbm_stall : queue_) { + wbm_stall->Signal(); + } + cleanup = std::move(queue_); +} + +void WriteBufferManager::RemoveDBFromQueue(StallInterface* wbm_stall) { + assert(wbm_stall != nullptr); + + // Deallocate the removed nodes outside of the lock. + std::list cleanup; + + if (enabled() && allow_stall_.load(std::memory_order_relaxed)) { + std::unique_lock lock(mu_); + for (auto it = queue_.begin(); it != queue_.end();) { + auto next = std::next(it); + if (*it == wbm_stall) { + cleanup.splice(cleanup.end(), queue_, std::move(it)); + } + it = next; + } + } + wbm_stall->Signal(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/file_read_sample.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/file_read_sample.h new file mode 100644 index 0000000000..82a933e0a1 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/file_read_sample.h @@ -0,0 +1,23 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#include "db/version_edit.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { +static const uint32_t kFileReadSampleRate = 1024; +extern bool should_sample_file_read(); +extern void sample_file_read_inc(FileMetaData*); + +inline bool should_sample_file_read() { + return (Random::GetTLSInstance()->Next() % kFileReadSampleRate == 307); +} + +inline void sample_file_read_inc(FileMetaData* meta) { + meta->stats.num_reads_sampled.fetch_add(kFileReadSampleRate, + std::memory_order_relaxed); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram.cc new file mode 100644 index 0000000000..61bc6c1409 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram.cc @@ -0,0 +1,270 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
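The write buffer manager vendored above tracks memtable memory across column families and databases, optionally charging it against a block cache and stalling writers past the limit. A hedged usage sketch of the typical wiring (sizes are illustrative; the constructor shape matches the one defined above, and options.write_buffer_manager is assumed to be the standard DBOptions field); the vendored histogram.cc resumes below.

#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/write_buffer_manager.h"

void ConfigureWriteBufferManager(rocksdb::Options& options) {
  // Share one cache between block cache and memtable charging.
  auto cache = rocksdb::NewLRUCache(1 << 30 /* 1 GiB */);
  options.write_buffer_manager =
      std::make_shared<rocksdb::WriteBufferManager>(
          256 << 20 /* 256 MiB across all memtables */, cache,
          true /* allow_stall: block writers instead of failing */);
}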
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "monitoring/histogram.h" + +#include + +#include +#include +#include +#include + +#include "port/port.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + +HistogramBucketMapper::HistogramBucketMapper() { + // If you change this, you also need to change + // size of array buckets_ in HistogramImpl + bucketValues_ = {1, 2}; + double bucket_val = static_cast(bucketValues_.back()); + while ((bucket_val = 1.5 * bucket_val) <= + static_cast(std::numeric_limits::max())) { + bucketValues_.push_back(static_cast(bucket_val)); + // Extracts two most significant digits to make histogram buckets more + // human-readable. E.g., 172 becomes 170. + uint64_t pow_of_ten = 1; + while (bucketValues_.back() / 10 > 10) { + bucketValues_.back() /= 10; + pow_of_ten *= 10; + } + bucketValues_.back() *= pow_of_ten; + } + maxBucketValue_ = bucketValues_.back(); + minBucketValue_ = bucketValues_.front(); +} + +size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { + auto beg = bucketValues_.begin(); + auto end = bucketValues_.end(); + if (value >= maxBucketValue_) + return end - beg - 1; // bucketValues_.size() - 1 + else + return std::lower_bound(beg, end, value) - beg; +} + +namespace { +const HistogramBucketMapper bucketMapper; +} + +HistogramStat::HistogramStat() : num_buckets_(bucketMapper.BucketCount()) { + assert(num_buckets_ == sizeof(buckets_) / sizeof(*buckets_)); + Clear(); +} + +void HistogramStat::Clear() { + min_.store(bucketMapper.LastValue(), std::memory_order_relaxed); + max_.store(0, std::memory_order_relaxed); + num_.store(0, std::memory_order_relaxed); + sum_.store(0, std::memory_order_relaxed); + sum_squares_.store(0, std::memory_order_relaxed); + for (unsigned int b = 0; b < num_buckets_; b++) { + buckets_[b].store(0, std::memory_order_relaxed); + } +}; + +bool HistogramStat::Empty() const { return num() == 0; } + +void HistogramStat::Add(uint64_t value) { + // This function is designed to be lock free, as it's in the critical path + // of any operation. Each individual value is atomic and the order of updates + // by concurrent threads is tolerable. 
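HistogramBucketMapper above grows bucket limits by roughly 1.5x and rounds each to two significant digits so the printed ranges stay readable; IndexForValue then maps a value to its bucket with std::lower_bound. A self-contained sketch of the same scheme (illustrative only; HistogramStat::Add resumes right after this aside):

#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<uint64_t> MakeBuckets(uint64_t max_value) {
  std::vector<uint64_t> b = {1, 2};
  double v = 2;
  while ((v = 1.5 * v) <= static_cast<double>(max_value)) {
    uint64_t r = static_cast<uint64_t>(v);
    uint64_t pow10 = 1;
    while (r / 10 > 10) {  // keep only the two leading digits
      r /= 10;
      pow10 *= 10;
    }
    b.push_back(r * pow10);
  }
  return b;  // e.g. ..., 110, 170, 250, 380, 580, 870, 1300, ...
}

size_t BucketFor(const std::vector<uint64_t>& b, uint64_t value) {
  if (value >= b.back()) return b.size() - 1;
  return std::lower_bound(b.begin(), b.end(), value) - b.begin();
}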
+ const size_t index = bucketMapper.IndexForValue(value); + assert(index < num_buckets_); + buckets_[index].store(buckets_[index].load(std::memory_order_relaxed) + 1, + std::memory_order_relaxed); + + uint64_t old_min = min(); + if (value < old_min) { + min_.store(value, std::memory_order_relaxed); + } + + uint64_t old_max = max(); + if (value > old_max) { + max_.store(value, std::memory_order_relaxed); + } + + num_.store(num_.load(std::memory_order_relaxed) + 1, + std::memory_order_relaxed); + sum_.store(sum_.load(std::memory_order_relaxed) + value, + std::memory_order_relaxed); + sum_squares_.store( + sum_squares_.load(std::memory_order_relaxed) + value * value, + std::memory_order_relaxed); +} + +void HistogramStat::Merge(const HistogramStat& other) { + // This function needs to be performned with the outer lock acquired + // However, atomic operation on every member is still need, since Add() + // requires no lock and value update can still happen concurrently + uint64_t old_min = min(); + uint64_t other_min = other.min(); + while (other_min < old_min && + !min_.compare_exchange_weak(old_min, other_min)) { + } + + uint64_t old_max = max(); + uint64_t other_max = other.max(); + while (other_max > old_max && + !max_.compare_exchange_weak(old_max, other_max)) { + } + + num_.fetch_add(other.num(), std::memory_order_relaxed); + sum_.fetch_add(other.sum(), std::memory_order_relaxed); + sum_squares_.fetch_add(other.sum_squares(), std::memory_order_relaxed); + for (unsigned int b = 0; b < num_buckets_; b++) { + buckets_[b].fetch_add(other.bucket_at(b), std::memory_order_relaxed); + } +} + +double HistogramStat::Median() const { return Percentile(50.0); } + +double HistogramStat::Percentile(double p) const { + double threshold = num() * (p / 100.0); + uint64_t cumulative_sum = 0; + for (unsigned int b = 0; b < num_buckets_; b++) { + uint64_t bucket_value = bucket_at(b); + cumulative_sum += bucket_value; + if (cumulative_sum >= threshold) { + // Scale linearly within this bucket + uint64_t left_point = (b == 0) ? 
0 : bucketMapper.BucketLimit(b - 1); + uint64_t right_point = bucketMapper.BucketLimit(b); + uint64_t left_sum = cumulative_sum - bucket_value; + uint64_t right_sum = cumulative_sum; + double pos = 0; + uint64_t right_left_diff = right_sum - left_sum; + if (right_left_diff != 0) { + pos = (threshold - left_sum) / right_left_diff; + } + double r = left_point + (right_point - left_point) * pos; + uint64_t cur_min = min(); + uint64_t cur_max = max(); + if (r < cur_min) r = static_cast(cur_min); + if (r > cur_max) r = static_cast(cur_max); + return r; + } + } + return static_cast(max()); +} + +double HistogramStat::Average() const { + uint64_t cur_num = num(); + uint64_t cur_sum = sum(); + if (cur_num == 0) return 0; + return static_cast(cur_sum) / static_cast(cur_num); +} + +double HistogramStat::StandardDeviation() const { + double cur_num = + static_cast(num()); // Use double to avoid integer overflow + double cur_sum = static_cast(sum()); + double cur_sum_squares = static_cast(sum_squares()); + if (cur_num == 0.0) { + return 0.0; + } + double variance = + (cur_sum_squares * cur_num - cur_sum * cur_sum) / (cur_num * cur_num); + return std::sqrt(std::max(variance, 0.0)); +} + +std::string HistogramStat::ToString() const { + uint64_t cur_num = num(); + std::string r; + char buf[1650]; + snprintf(buf, sizeof(buf), "Count: %" PRIu64 " Average: %.4f StdDev: %.2f\n", + cur_num, Average(), StandardDeviation()); + r.append(buf); + snprintf(buf, sizeof(buf), + "Min: %" PRIu64 " Median: %.4f Max: %" PRIu64 "\n", + (cur_num == 0 ? 0 : min()), Median(), (cur_num == 0 ? 0 : max())); + r.append(buf); + snprintf(buf, sizeof(buf), + "Percentiles: " + "P50: %.2f P75: %.2f P99: %.2f P99.9: %.2f P99.99: %.2f\n", + Percentile(50), Percentile(75), Percentile(99), Percentile(99.9), + Percentile(99.99)); + r.append(buf); + r.append("------------------------------------------------------\n"); + if (cur_num == 0) return r; // all buckets are empty + const double mult = 100.0 / cur_num; + uint64_t cumulative_sum = 0; + for (unsigned int b = 0; b < num_buckets_; b++) { + uint64_t bucket_value = bucket_at(b); + if (bucket_value <= 0.0) continue; + cumulative_sum += bucket_value; + snprintf(buf, sizeof(buf), + "%c %7" PRIu64 ", %7" PRIu64 " ] %8" PRIu64 " %7.3f%% %7.3f%% ", + (b == 0) ? '[' : '(', + (b == 0) ? 0 : bucketMapper.BucketLimit(b - 1), // left + bucketMapper.BucketLimit(b), // right + bucket_value, // count + (mult * bucket_value), // percentage + (mult * cumulative_sum)); // cumulative percentage + r.append(buf); + + // Add hash marks based on percentage; 20 marks for 100%. 
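Percentile() above finds the bucket whose cumulative count crosses the target and interpolates linearly between that bucket's lower and upper limits, then clamps to the observed min and max. A standalone sketch of the interpolation step, with invented names (the ToString loop resumes below):

#include <cstdint>
#include <vector>

double PercentileFromBuckets(const std::vector<uint64_t>& limits,
                             const std::vector<uint64_t>& counts, double p) {
  uint64_t total = 0;
  for (uint64_t c : counts) total += c;
  const double threshold = total * (p / 100.0);
  uint64_t cum = 0;
  for (size_t b = 0; b < counts.size(); b++) {
    cum += counts[b];
    if (cum >= threshold && counts[b] > 0) {
      const double left = (b == 0) ? 0.0 : static_cast<double>(limits[b - 1]);
      const double right = static_cast<double>(limits[b]);
      // Position of the threshold within this bucket, in [0, 1].
      const double pos =
          (threshold - static_cast<double>(cum - counts[b])) / counts[b];
      return left + (right - left) * pos;  // interpolate inside the bucket
    }
  }
  return limits.empty() ? 0.0 : static_cast<double>(limits.back());
}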
+ size_t marks = static_cast(mult * bucket_value / 5 + 0.5); + r.append(marks, '#'); + r.push_back('\n'); + } + return r; +} + +void HistogramStat::Data(HistogramData* const data) const { + assert(data); + data->median = Median(); + data->percentile95 = Percentile(95); + data->percentile99 = Percentile(99); + data->max = static_cast(max()); + data->average = Average(); + data->standard_deviation = StandardDeviation(); + data->count = num(); + data->sum = sum(); + data->min = static_cast(min()); +} + +void HistogramImpl::Clear() { + std::lock_guard lock(mutex_); + stats_.Clear(); +} + +bool HistogramImpl::Empty() const { return stats_.Empty(); } + +void HistogramImpl::Add(uint64_t value) { stats_.Add(value); } + +void HistogramImpl::Merge(const Histogram& other) { + if (strcmp(Name(), other.Name()) == 0) { + Merge(*static_cast_with_check(&other)); + } +} + +void HistogramImpl::Merge(const HistogramImpl& other) { + std::lock_guard lock(mutex_); + stats_.Merge(other.stats_); +} + +double HistogramImpl::Median() const { return stats_.Median(); } + +double HistogramImpl::Percentile(double p) const { + return stats_.Percentile(p); +} + +double HistogramImpl::Average() const { return stats_.Average(); } + +double HistogramImpl::StandardDeviation() const { + return stats_.StandardDeviation(); +} + +std::string HistogramImpl::ToString() const { return stats_.ToString(); } + +void HistogramImpl::Data(HistogramData* const data) const { stats_.Data(data); } + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram.h new file mode 100644 index 0000000000..15fee2b4f8 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram.h @@ -0,0 +1,143 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include +#include + +#include "rocksdb/statistics.h" + +namespace ROCKSDB_NAMESPACE { + +class HistogramBucketMapper { + public: + HistogramBucketMapper(); + + // converts a value to the bucket index. + size_t IndexForValue(uint64_t value) const; + // number of buckets required. 
+ + size_t BucketCount() const { return bucketValues_.size(); } + + uint64_t LastValue() const { return maxBucketValue_; } + + uint64_t FirstValue() const { return minBucketValue_; } + + uint64_t BucketLimit(const size_t bucketNumber) const { + assert(bucketNumber < BucketCount()); + return bucketValues_[bucketNumber]; + } + + private: + std::vector bucketValues_; + uint64_t maxBucketValue_; + uint64_t minBucketValue_; +}; + +struct HistogramStat { + HistogramStat(); + ~HistogramStat() {} + + HistogramStat(const HistogramStat&) = delete; + HistogramStat& operator=(const HistogramStat&) = delete; + + void Clear(); + bool Empty() const; + void Add(uint64_t value); + void Merge(const HistogramStat& other); + + inline uint64_t min() const { return min_.load(std::memory_order_relaxed); } + inline uint64_t max() const { return max_.load(std::memory_order_relaxed); } + inline uint64_t num() const { return num_.load(std::memory_order_relaxed); } + inline uint64_t sum() const { return sum_.load(std::memory_order_relaxed); } + inline uint64_t sum_squares() const { + return sum_squares_.load(std::memory_order_relaxed); + } + inline uint64_t bucket_at(size_t b) const { + return buckets_[b].load(std::memory_order_relaxed); + } + + double Median() const; + double Percentile(double p) const; + double Average() const; + double StandardDeviation() const; + void Data(HistogramData* const data) const; + std::string ToString() const; + + // To be able to use HistogramStat as thread local variable, it + // cannot have dynamic allocated member. That's why we're + // using manually values from BucketMapper + std::atomic_uint_fast64_t min_; + std::atomic_uint_fast64_t max_; + std::atomic_uint_fast64_t num_; + std::atomic_uint_fast64_t sum_; + std::atomic_uint_fast64_t sum_squares_; + std::atomic_uint_fast64_t buckets_[109]; // 109==BucketMapper::BucketCount() + const uint64_t num_buckets_; +}; + +class Histogram { + public: + Histogram() {} + virtual ~Histogram(){}; + + virtual void Clear() = 0; + virtual bool Empty() const = 0; + virtual void Add(uint64_t value) = 0; + virtual void Merge(const Histogram&) = 0; + + virtual std::string ToString() const = 0; + virtual const char* Name() const = 0; + virtual uint64_t min() const = 0; + virtual uint64_t max() const = 0; + virtual uint64_t num() const = 0; + virtual double Median() const = 0; + virtual double Percentile(double p) const = 0; + virtual double Average() const = 0; + virtual double StandardDeviation() const = 0; + virtual void Data(HistogramData* const data) const = 0; +}; + +class HistogramImpl : public Histogram { + public: + HistogramImpl() { Clear(); } + + HistogramImpl(const HistogramImpl&) = delete; + HistogramImpl& operator=(const HistogramImpl&) = delete; + + virtual void Clear() override; + virtual bool Empty() const override; + virtual void Add(uint64_t value) override; + virtual void Merge(const Histogram& other) override; + void Merge(const HistogramImpl& other); + + virtual std::string ToString() const override; + virtual const char* Name() const override { return "HistogramImpl"; } + virtual uint64_t min() const override { return stats_.min(); } + virtual uint64_t max() const override { return stats_.max(); } + virtual uint64_t num() const override { return stats_.num(); } + virtual double Median() const override; + virtual double Percentile(double p) const override; + virtual double Average() const override; + virtual double StandardDeviation() const override; + virtual void Data(HistogramData* const data) const override; + + virtual 
~HistogramImpl() {} + + inline HistogramStat& TEST_GetStats() { return stats_; } + + private: + HistogramStat stats_; + std::mutex mutex_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram_windowing.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram_windowing.cc new file mode 100644 index 0000000000..c41ae8a03d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram_windowing.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "monitoring/histogram_windowing.h" + +#include + +#include "monitoring/histogram.h" +#include "rocksdb/system_clock.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + +HistogramWindowingImpl::HistogramWindowingImpl() { + clock_ = SystemClock::Default(); + window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); + Clear(); +} + +HistogramWindowingImpl::HistogramWindowingImpl(uint64_t num_windows, + uint64_t micros_per_window, + uint64_t min_num_per_window) + : num_windows_(num_windows), + micros_per_window_(micros_per_window), + min_num_per_window_(min_num_per_window) { + clock_ = SystemClock::Default(); + window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); + Clear(); +} + +HistogramWindowingImpl::~HistogramWindowingImpl() {} + +void HistogramWindowingImpl::Clear() { + std::lock_guard lock(mutex_); + + stats_.Clear(); + for (size_t i = 0; i < num_windows_; i++) { + window_stats_[i].Clear(); + } + current_window_.store(0, std::memory_order_relaxed); + last_swap_time_.store(clock_->NowMicros(), std::memory_order_relaxed); +} + +bool HistogramWindowingImpl::Empty() const { return stats_.Empty(); } + +// This function is designed to be lock free, as it's in the critical path +// of any operation. +// Each individual value is atomic, it is just that some samples can go +// in the older bucket which is tolerable. 
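With histogram.h in place, a short usage sketch of the internal HistogramImpl API declared above; this is illustrative code against the vendored headers, not part of the patch (HistogramWindowingImpl::Add resumes below).

#include <cstdint>
#include <cstdio>

#include "monitoring/histogram.h"  // the header vendored above

void HistogramDemo() {
  ROCKSDB_NAMESPACE::HistogramImpl h;
  for (uint64_t v = 1; v <= 1000; v++) {
    h.Add(v);  // lock-free per-value update into atomic buckets
  }
  std::printf("p50=%.1f p99=%.1f avg=%.1f\n", h.Median(), h.Percentile(99),
              h.Average());
}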
+void HistogramWindowingImpl::Add(uint64_t value) { + TimerTick(); + + // Parent (global) member update + stats_.Add(value); + + // Current window update + window_stats_[static_cast(current_window())].Add(value); +} + +void HistogramWindowingImpl::Merge(const Histogram& other) { + if (strcmp(Name(), other.Name()) == 0) { + Merge(*static_cast_with_check(&other)); + } +} + +void HistogramWindowingImpl::Merge(const HistogramWindowingImpl& other) { + std::lock_guard lock(mutex_); + stats_.Merge(other.stats_); + + if (stats_.num_buckets_ != other.stats_.num_buckets_ || + micros_per_window_ != other.micros_per_window_) { + return; + } + + uint64_t cur_window = current_window(); + uint64_t other_cur_window = other.current_window(); + // going backwards for alignment + for (unsigned int i = 0; i < std::min(num_windows_, other.num_windows_); + i++) { + uint64_t window_index = (cur_window + num_windows_ - i) % num_windows_; + uint64_t other_window_index = + (other_cur_window + other.num_windows_ - i) % other.num_windows_; + size_t windex = static_cast(window_index); + size_t other_windex = static_cast(other_window_index); + + window_stats_[windex].Merge(other.window_stats_[other_windex]); + } +} + +std::string HistogramWindowingImpl::ToString() const { + return stats_.ToString(); +} + +double HistogramWindowingImpl::Median() const { return Percentile(50.0); } + +double HistogramWindowingImpl::Percentile(double p) const { + // Retry 3 times in total + for (int retry = 0; retry < 3; retry++) { + uint64_t start_num = stats_.num(); + double result = stats_.Percentile(p); + // Detect if swap buckets or Clear() was called during calculation + if (stats_.num() >= start_num) { + return result; + } + } + return 0.0; +} + +double HistogramWindowingImpl::Average() const { return stats_.Average(); } + +double HistogramWindowingImpl::StandardDeviation() const { + return stats_.StandardDeviation(); +} + +void HistogramWindowingImpl::Data(HistogramData* const data) const { + stats_.Data(data); +} + +void HistogramWindowingImpl::TimerTick() { + uint64_t curr_time = clock_->NowMicros(); + size_t curr_window_ = static_cast(current_window()); + if (curr_time - last_swap_time() > micros_per_window_ && + window_stats_[curr_window_].num() >= min_num_per_window_) { + SwapHistoryBucket(); + } +} + +void HistogramWindowingImpl::SwapHistoryBucket() { + // Threads executing Add() would be competing for this mutex, the first one + // who got the metex would take care of the bucket swap, other threads + // can skip this. + // If mutex is held by Merge() or Clear(), next Add() will take care of the + // swap, if needed. + if (mutex_.try_lock()) { + last_swap_time_.store(clock_->NowMicros(), std::memory_order_relaxed); + + uint64_t curr_window = current_window(); + uint64_t next_window = + (curr_window == num_windows_ - 1) ? 
0 : curr_window + 1; + + // subtract next buckets from totals and swap to next buckets + HistogramStat& stats_to_drop = + window_stats_[static_cast(next_window)]; + + if (!stats_to_drop.Empty()) { + for (size_t b = 0; b < stats_.num_buckets_; b++) { + stats_.buckets_[b].fetch_sub(stats_to_drop.bucket_at(b), + std::memory_order_relaxed); + } + + if (stats_.min() == stats_to_drop.min()) { + uint64_t new_min = std::numeric_limits::max(); + for (unsigned int i = 0; i < num_windows_; i++) { + if (i != next_window) { + uint64_t m = window_stats_[i].min(); + if (m < new_min) new_min = m; + } + } + stats_.min_.store(new_min, std::memory_order_relaxed); + } + + if (stats_.max() == stats_to_drop.max()) { + uint64_t new_max = 0; + for (unsigned int i = 0; i < num_windows_; i++) { + if (i != next_window) { + uint64_t m = window_stats_[i].max(); + if (m > new_max) new_max = m; + } + } + stats_.max_.store(new_max, std::memory_order_relaxed); + } + + stats_.num_.fetch_sub(stats_to_drop.num(), std::memory_order_relaxed); + stats_.sum_.fetch_sub(stats_to_drop.sum(), std::memory_order_relaxed); + stats_.sum_squares_.fetch_sub(stats_to_drop.sum_squares(), + std::memory_order_relaxed); + + stats_to_drop.Clear(); + } + + // advance to next window bucket + current_window_.store(next_window, std::memory_order_relaxed); + + mutex_.unlock(); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram_windowing.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram_windowing.h new file mode 100644 index 0000000000..9a862671f4 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/histogram_windowing.h @@ -0,0 +1,84 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
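SwapHistoryBucket() above retires the oldest window by subtracting its contribution from the running totals before its slot is reused, so the aggregate always reflects only the last N windows. The same idea reduced to a windowed counter, as an illustrative standalone sketch:

#include <cstdint>
#include <vector>

class WindowedCounter {
 public:
  explicit WindowedCounter(size_t num_windows) : windows_(num_windows, 0) {}

  void Add(uint64_t v) {
    windows_[cur_] += v;
    total_ += v;
  }

  // Advance to the next window, dropping the expiring slot's
  // contribution from the running total before reusing it.
  void AdvanceWindow() {
    cur_ = (cur_ + 1) % windows_.size();
    total_ -= windows_[cur_];
    windows_[cur_] = 0;
  }

  uint64_t total() const { return total_; }

 private:
  std::vector<uint64_t> windows_;
  size_t cur_ = 0;
  uint64_t total_ = 0;
};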
+ +#pragma once + +#include "monitoring/histogram.h" + +namespace ROCKSDB_NAMESPACE { +class SystemClock; + +class HistogramWindowingImpl : public Histogram { + public: + HistogramWindowingImpl(); + HistogramWindowingImpl(uint64_t num_windows, uint64_t micros_per_window, + uint64_t min_num_per_window); + + HistogramWindowingImpl(const HistogramWindowingImpl&) = delete; + HistogramWindowingImpl& operator=(const HistogramWindowingImpl&) = delete; + + ~HistogramWindowingImpl(); + + virtual void Clear() override; + virtual bool Empty() const override; + virtual void Add(uint64_t value) override; + virtual void Merge(const Histogram& other) override; + void Merge(const HistogramWindowingImpl& other); + + virtual std::string ToString() const override; + virtual const char* Name() const override { return "HistogramWindowingImpl"; } + virtual uint64_t min() const override { return stats_.min(); } + virtual uint64_t max() const override { return stats_.max(); } + virtual uint64_t num() const override { return stats_.num(); } + virtual double Median() const override; + virtual double Percentile(double p) const override; + virtual double Average() const override; + virtual double StandardDeviation() const override; + virtual void Data(HistogramData* const data) const override; + +#ifndef NDEBUG + void TEST_UpdateClock(const std::shared_ptr& clock) { + clock_ = clock; + } +#endif // NDEBUG + + private: + void TimerTick(); + void SwapHistoryBucket(); + inline uint64_t current_window() const { + return current_window_.load(std::memory_order_relaxed); + } + inline uint64_t last_swap_time() const { + return last_swap_time_.load(std::memory_order_relaxed); + } + + std::shared_ptr clock_; + std::mutex mutex_; + + // Aggregated stats over windows_stats_, all the computation is done + // upon aggregated values + HistogramStat stats_; + + // This is a circular array representing the latest N time-windows. + // Each entry stores a time-window of data. Expiration is done + // on window-based. + std::unique_ptr window_stats_; + + std::atomic_uint_fast64_t current_window_; + std::atomic_uint_fast64_t last_swap_time_; + + // Following parameters are configuable + uint64_t num_windows_ = 5; + uint64_t micros_per_window_ = 60000000; + // By default, don't care about the number of values in current window + // when decide whether to swap windows or not. + uint64_t min_num_per_window_ = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/in_memory_stats_history.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/in_memory_stats_history.cc new file mode 100644 index 0000000000..568d8ec134 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/in_memory_stats_history.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
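A small construction sketch for the windowed histogram declared in the header below; it assumes only the three-argument constructor shown there (the defaults are five 60-second windows with no minimum count per window).

#include <memory>

#include "monitoring/histogram_windowing.h"

std::unique_ptr<ROCKSDB_NAMESPACE::HistogramWindowingImpl>
MakeMinuteHistogram() {
  // Six 10-second windows approximate a one-minute sliding view.
  return std::make_unique<ROCKSDB_NAMESPACE::HistogramWindowingImpl>(
      /*num_windows=*/6, /*micros_per_window=*/10 * 1000 * 1000,
      /*min_num_per_window=*/0);
}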
+ +#include "monitoring/in_memory_stats_history.h" + +#include "db/db_impl/db_impl.h" + +namespace ROCKSDB_NAMESPACE { + +InMemoryStatsHistoryIterator::~InMemoryStatsHistoryIterator() {} + +bool InMemoryStatsHistoryIterator::Valid() const { return valid_; } + +Status InMemoryStatsHistoryIterator::status() const { return status_; } + +// Because of garbage collection, the next stats snapshot may or may not be +// right after the current one. When reading from DBImpl::stats_history_, this +// call will be protected by DB Mutex so it will not return partial or +// corrupted results. +void InMemoryStatsHistoryIterator::Next() { + // increment start_time by 1 to avoid infinite loop + AdvanceIteratorByTime(GetStatsTime() + 1, end_time_); +} + +uint64_t InMemoryStatsHistoryIterator::GetStatsTime() const { return time_; } + +const std::map& +InMemoryStatsHistoryIterator::GetStatsMap() const { + return stats_map_; +} + +// advance the iterator to the next time between [start_time, end_time) +// if success, update time_ and stats_map_ with new_time and stats_map +void InMemoryStatsHistoryIterator::AdvanceIteratorByTime(uint64_t start_time, + uint64_t end_time) { + // try to find next entry in stats_history_ map + if (db_impl_ != nullptr) { + valid_ = + db_impl_->FindStatsByTime(start_time, end_time, &time_, &stats_map_); + } else { + valid_ = false; + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/in_memory_stats_history.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/in_memory_stats_history.h new file mode 100644 index 0000000000..3be864fe26 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/in_memory_stats_history.h @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/stats_history.h" + +namespace ROCKSDB_NAMESPACE { + +// InMemoryStatsHistoryIterator can be used to access stats history that was +// stored by an in-memory two level std::map(DBImpl::stats_history_). It keeps +// a copy of the stats snapshot (in stats_map_) that is currently being pointed +// to, which allows the iterator to access the stats snapshot even when +// the background garbage collecting thread purges it from the source of truth +// (`DBImpl::stats_history_`). In that case, the iterator will continue to be +// valid until a call to `Next()` returns no result and invalidates it. In +// some extreme cases, the iterator may also return fragmented segments of +// stats snapshots due to long gaps between `Next()` calls and interleaved +// garbage collection. 
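A consumption sketch for stats history iterators such as the in-memory one declared below. DB::GetStatsHistory is assumed to have the shape shown here; timestamps are seconds since epoch, and Next() may invalidate the iterator as documented above.

#include <cstdio>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/stats_history.h"

void DumpStatsHistory(rocksdb::DB* db, uint64_t start, uint64_t end) {
  std::unique_ptr<rocksdb::StatsHistoryIterator> it;
  rocksdb::Status s = db->GetStatsHistory(start, end, &it);
  if (!s.ok()) return;
  for (; it->Valid(); it->Next()) {  // Next() may invalidate the iterator
    for (const auto& kv : it->GetStatsMap()) {
      std::printf("%llu %s = %llu\n",
                  (unsigned long long)it->GetStatsTime(), kv.first.c_str(),
                  (unsigned long long)kv.second);
    }
  }
}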
+class InMemoryStatsHistoryIterator final : public StatsHistoryIterator { + public: + // Setup InMemoryStatsHistoryIterator to return stats snapshots between + // seconds timestamps [start_time, end_time) + InMemoryStatsHistoryIterator(uint64_t start_time, uint64_t end_time, + DBImpl* db_impl) + : start_time_(start_time), + end_time_(end_time), + valid_(true), + db_impl_(db_impl) { + AdvanceIteratorByTime(start_time_, end_time_); + } + // no copying allowed + InMemoryStatsHistoryIterator(const InMemoryStatsHistoryIterator&) = delete; + void operator=(const InMemoryStatsHistoryIterator&) = delete; + InMemoryStatsHistoryIterator(InMemoryStatsHistoryIterator&&) = delete; + InMemoryStatsHistoryIterator& operator=(InMemoryStatsHistoryIterator&&) = + delete; + + ~InMemoryStatsHistoryIterator() override; + bool Valid() const override; + Status status() const override; + + // Move to the next stats snapshot currently available + // This function may invalidate the iterator + // REQUIRES: Valid() + void Next() override; + + // REQUIRES: Valid() + uint64_t GetStatsTime() const override; + + // This function is idempotent + // REQUIRES: Valid() + const std::map& GetStatsMap() const override; + + private: + // advance the iterator to the next stats history record with timestamp + // between [start_time, end_time) + void AdvanceIteratorByTime(uint64_t start_time, uint64_t end_time); + + uint64_t time_; + uint64_t start_time_; + uint64_t end_time_; + std::map stats_map_; + Status status_; + bool valid_; + DBImpl* db_impl_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/instrumented_mutex.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/instrumented_mutex.cc new file mode 100644 index 0000000000..699495a348 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/instrumented_mutex.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "monitoring/instrumented_mutex.h" + +#include "monitoring/perf_context_imp.h" +#include "monitoring/thread_status_util.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +#ifndef NPERF_CONTEXT +Statistics* stats_for_report(SystemClock* clock, Statistics* stats) { + if (clock != nullptr && stats != nullptr && + stats->get_stats_level() > kExceptTimeForMutex) { + return stats; + } else { + return nullptr; + } +} +#endif // NPERF_CONTEXT +} // namespace + +void InstrumentedMutex::Lock() { + PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( + db_mutex_lock_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, + stats_for_report(clock_, stats_), stats_code_); + LockInternal(); +} + +void InstrumentedMutex::LockInternal() { +#ifndef NDEBUG + ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT); +#endif +#ifdef COERCE_CONTEXT_SWITCH + if (stats_code_ == DB_MUTEX_WAIT_MICROS) { + thread_local Random rnd(301); + if (rnd.OneIn(2)) { + if (bg_cv_) { + bg_cv_->SignalAll(); + } + sched_yield(); + } else { + uint32_t sleep_us = rnd.Uniform(11) * 1000; + if (bg_cv_) { + bg_cv_->SignalAll(); + } + SystemClock::Default()->SleepForMicroseconds(sleep_us); + } + } +#endif + mutex_.Lock(); +} + +void InstrumentedCondVar::Wait() { + PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( + db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, + stats_for_report(clock_, stats_), stats_code_); + WaitInternal(); +} + +void InstrumentedCondVar::WaitInternal() { +#ifndef NDEBUG + ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT); +#endif + cond_.Wait(); +} + +bool InstrumentedCondVar::TimedWait(uint64_t abs_time_us) { + PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( + db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, + stats_for_report(clock_, stats_), stats_code_); + return TimedWaitInternal(abs_time_us); +} + +bool InstrumentedCondVar::TimedWaitInternal(uint64_t abs_time_us) { +#ifndef NDEBUG + ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT); +#endif + + TEST_SYNC_POINT_CALLBACK("InstrumentedCondVar::TimedWaitInternal", + &abs_time_us); + + return cond_.TimedWait(abs_time_us); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/instrumented_mutex.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/instrumented_mutex.h new file mode 100644 index 0000000000..b97d2502e4 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/instrumented_mutex.h @@ -0,0 +1,126 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "monitoring/statistics_impl.h" +#include "port/port.h" +#include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/thread_status.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { +class InstrumentedCondVar; + +// A wrapper class for port::Mutex that provides additional layer +// for collecting stats and instrumentation. 
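InstrumentedMutex, declared below, wraps port::Mutex so that time spent blocked in Lock() can be charged to perf context and statistics. The core idea reduced to a standard-library sketch with a plain atomic counter (RocksDB routes the measurement through PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD instead):

#include <atomic>
#include <chrono>
#include <cstdint>
#include <mutex>

class TimedMutex {
 public:
  void Lock() {
    auto t0 = std::chrono::steady_clock::now();
    mu_.lock();  // only the blocking time is measured
    auto waited = std::chrono::steady_clock::now() - t0;
    wait_nanos_ +=
        std::chrono::duration_cast<std::chrono::nanoseconds>(waited).count();
  }
  void Unlock() { mu_.unlock(); }
  uint64_t wait_nanos() const { return wait_nanos_.load(); }

 private:
  std::mutex mu_;
  std::atomic<uint64_t> wait_nanos_{0};
};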
+class InstrumentedMutex { + public: + explicit InstrumentedMutex(bool adaptive = false) + : mutex_(adaptive), stats_(nullptr), clock_(nullptr), stats_code_(0) {} + + explicit InstrumentedMutex(SystemClock* clock, bool adaptive = false) + : mutex_(adaptive), stats_(nullptr), clock_(clock), stats_code_(0) {} + + InstrumentedMutex(Statistics* stats, SystemClock* clock, int stats_code, + bool adaptive = false) + : mutex_(adaptive), + stats_(stats), + clock_(clock), + stats_code_(stats_code) {} + +#ifdef COERCE_CONTEXT_SWITCH + InstrumentedMutex(Statistics* stats, SystemClock* clock, int stats_code, + InstrumentedCondVar* bg_cv, bool adaptive = false) + : mutex_(adaptive), + stats_(stats), + clock_(clock), + stats_code_(stats_code), + bg_cv_(bg_cv) {} +#endif + + void Lock(); + + void Unlock() { mutex_.Unlock(); } + + void AssertHeld() { mutex_.AssertHeld(); } + + private: + void LockInternal(); + friend class InstrumentedCondVar; + port::Mutex mutex_; + Statistics* stats_; + SystemClock* clock_; + int stats_code_; +#ifdef COERCE_CONTEXT_SWITCH + InstrumentedCondVar* bg_cv_ = nullptr; +#endif +}; + +class ALIGN_AS(CACHE_LINE_SIZE) CacheAlignedInstrumentedMutex + : public InstrumentedMutex { + using InstrumentedMutex::InstrumentedMutex; +}; +static_assert(alignof(CacheAlignedInstrumentedMutex) != CACHE_LINE_SIZE || + sizeof(CacheAlignedInstrumentedMutex) % CACHE_LINE_SIZE == 0); + +// RAII wrapper for InstrumentedMutex +class InstrumentedMutexLock { + public: + explicit InstrumentedMutexLock(InstrumentedMutex* mutex) : mutex_(mutex) { + mutex_->Lock(); + } + + ~InstrumentedMutexLock() { mutex_->Unlock(); } + + private: + InstrumentedMutex* const mutex_; + InstrumentedMutexLock(const InstrumentedMutexLock&) = delete; + void operator=(const InstrumentedMutexLock&) = delete; +}; + +// RAII wrapper for temporary releasing InstrumentedMutex inside +// InstrumentedMutexLock +class InstrumentedMutexUnlock { + public: + explicit InstrumentedMutexUnlock(InstrumentedMutex* mutex) : mutex_(mutex) { + mutex_->Unlock(); + } + + ~InstrumentedMutexUnlock() { mutex_->Lock(); } + + private: + InstrumentedMutex* const mutex_; + InstrumentedMutexUnlock(const InstrumentedMutexUnlock&) = delete; + void operator=(const InstrumentedMutexUnlock&) = delete; +}; + +class InstrumentedCondVar { + public: + explicit InstrumentedCondVar(InstrumentedMutex* instrumented_mutex) + : cond_(&(instrumented_mutex->mutex_)), + stats_(instrumented_mutex->stats_), + clock_(instrumented_mutex->clock_), + stats_code_(instrumented_mutex->stats_code_) {} + + void Wait(); + + bool TimedWait(uint64_t abs_time_us); + + void Signal() { cond_.Signal(); } + + void SignalAll() { cond_.SignalAll(); } + + private: + void WaitInternal(); + bool TimedWaitInternal(uint64_t abs_time_us); + port::CondVar cond_; + Statistics* stats_; + SystemClock* clock_; + int stats_code_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/iostats_context.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/iostats_context.cc new file mode 100644 index 0000000000..04e98914da --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/iostats_context.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
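[Reviewer note, not part of the vendored sources] The instrumented_mutex.h wrappers above are used like plain RAII guards; a minimal sketch, assuming the DB_MUTEX_WAIT_MICROS ticker from the public statistics header:

    using namespace ROCKSDB_NAMESPACE;

    void WithInstrumentedLock(Statistics* stats, SystemClock* clock) {
      InstrumentedMutex mu(stats, clock, DB_MUTEX_WAIT_MICROS);
      InstrumentedCondVar cv(&mu);
      {
        InstrumentedMutexLock l(&mu);      // Lock() records wait time when stats are enabled
        // ... critical section ...
        {
          InstrumentedMutexUnlock u(&mu);  // inverted guard: releases now, re-locks on scope exit
          // ... expensive work without the lock ...
        }
        cv.SignalAll();
      }                                    // unlocked here
    }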
+
+#include <sstream>
+
+#include "monitoring/iostats_context_imp.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef NIOSTATS_CONTEXT
+// Should not be used because the counters are not thread-safe.
+// Put here just to make get_iostats_context() simple without ifdef.
+static IOStatsContext iostats_context;
+#else
+thread_local IOStatsContext iostats_context;
+#endif
+
+IOStatsContext* get_iostats_context() { return &iostats_context; }
+
+void IOStatsContext::Reset() {
+#ifndef NIOSTATS_CONTEXT
+  thread_pool_id = Env::Priority::TOTAL;
+  bytes_read = 0;
+  bytes_written = 0;
+  open_nanos = 0;
+  allocate_nanos = 0;
+  write_nanos = 0;
+  read_nanos = 0;
+  range_sync_nanos = 0;
+  prepare_write_nanos = 0;
+  fsync_nanos = 0;
+  logger_nanos = 0;
+  cpu_write_nanos = 0;
+  cpu_read_nanos = 0;
+  file_io_stats_by_temperature.Reset();
+#endif  //! NIOSTATS_CONTEXT
+}
+
+#define IOSTATS_CONTEXT_OUTPUT(counter)         \
+  if (!exclude_zero_counters || counter > 0) {  \
+    ss << #counter << " = " << counter << ", "; \
+  }
+
+std::string IOStatsContext::ToString(bool exclude_zero_counters) const {
+#ifdef NIOSTATS_CONTEXT
+  (void)exclude_zero_counters;
+  return "";
+#else
+  std::ostringstream ss;
+  IOSTATS_CONTEXT_OUTPUT(thread_pool_id);
+  IOSTATS_CONTEXT_OUTPUT(bytes_read);
+  IOSTATS_CONTEXT_OUTPUT(bytes_written);
+  IOSTATS_CONTEXT_OUTPUT(open_nanos);
+  IOSTATS_CONTEXT_OUTPUT(allocate_nanos);
+  IOSTATS_CONTEXT_OUTPUT(write_nanos);
+  IOSTATS_CONTEXT_OUTPUT(read_nanos);
+  IOSTATS_CONTEXT_OUTPUT(range_sync_nanos);
+  IOSTATS_CONTEXT_OUTPUT(fsync_nanos);
+  IOSTATS_CONTEXT_OUTPUT(prepare_write_nanos);
+  IOSTATS_CONTEXT_OUTPUT(logger_nanos);
+  IOSTATS_CONTEXT_OUTPUT(cpu_write_nanos);
+  IOSTATS_CONTEXT_OUTPUT(cpu_read_nanos);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_bytes_read);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_bytes_read);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_bytes_read);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_read_count);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_read_count);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_read_count);
+  std::string str = ss.str();
+  str.erase(str.find_last_not_of(", ") + 1);
+  return str;
+#endif  //! NIOSTATS_CONTEXT
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/iostats_context_imp.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/iostats_context_imp.h
new file mode 100644
index 0000000000..a0b4292dfd
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/iostats_context_imp.h
@@ -0,0 +1,62 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
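As above, iostats_context is one thread-local struct per thread. A hedged sketch of how a caller inspects it (get_iostats_context() and ToString() are the public API from rocksdb/iostats_context.h):

    #include <iostream>
    #include "rocksdb/iostats_context.h"

    void DumpThreadIOStats() {
      using namespace ROCKSDB_NAMESPACE;
      IOStatsContext* ctx = get_iostats_context();
      ctx->Reset();  // zero this thread's counters
      // ... perform some reads/writes on this thread ...
      std::cout << ctx->ToString(/*exclude_zero_counters=*/true) << std::endl;
    }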
+//
+#pragma once
+#include "monitoring/perf_step_timer.h"
+#include "rocksdb/iostats_context.h"
+
+#if !defined(NIOSTATS_CONTEXT)
+namespace ROCKSDB_NAMESPACE {
+extern thread_local IOStatsContext iostats_context;
+}  // namespace ROCKSDB_NAMESPACE
+
+// increment a specific counter by the specified value
+#define IOSTATS_ADD(metric, value)        \
+  if (!iostats_context.disable_iostats) { \
+    iostats_context.metric += value;      \
+  }
+
+// reset a specific counter to zero
+#define IOSTATS_RESET(metric) (iostats_context.metric = 0)
+
+// reset all counters to zero
+#define IOSTATS_RESET_ALL() (iostats_context.Reset())
+
+#define IOSTATS_SET_THREAD_POOL_ID(value) \
+  (iostats_context.thread_pool_id = value)
+
+#define IOSTATS_THREAD_POOL_ID() (iostats_context.thread_pool_id)
+
+#define IOSTATS(metric) (iostats_context.metric)
+
+// Declare and set start time of the timer
+#define IOSTATS_TIMER_GUARD(metric)                                      \
+  PerfStepTimer iostats_step_timer_##metric(&(iostats_context.metric)); \
+  iostats_step_timer_##metric.Start();
+
+// Declare and set start time of the timer
+#define IOSTATS_CPU_TIMER_GUARD(metric, clock)          \
+  PerfStepTimer iostats_step_timer_##metric(            \
+      &(iostats_context.metric), clock, true,           \
+      PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);  \
+  iostats_step_timer_##metric.Start();
+
+#define IOSTATS_SET_DISABLE(disable) (iostats_context.disable_iostats = disable)
+
+#else  // !NIOSTATS_CONTEXT
+
+#define IOSTATS_ADD(metric, value)
+#define IOSTATS_ADD_IF_POSITIVE(metric, value)
+#define IOSTATS_RESET(metric)
+#define IOSTATS_RESET_ALL()
+#define IOSTATS_SET_THREAD_POOL_ID(value)
+#define IOSTATS_THREAD_POOL_ID()
+#define IOSTATS(metric) 0
+#define IOSTATS_SET_DISABLE(disable)
+
+#define IOSTATS_TIMER_GUARD(metric)
+#define IOSTATS_CPU_TIMER_GUARD(metric, clock) static_cast<void>(clock)
+
+#endif  // !NIOSTATS_CONTEXT
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_context.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_context.cc
new file mode 100644
index 0000000000..6839e90598
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_context.cc
@@ -0,0 +1,313 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include <sstream>
+
+#include "monitoring/perf_context_imp.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/*
+ * Please add new metrics to this macro and appropriate fields will be copied,
+ * and/or emitted when converted to string.
+ * When people need to add new metrics please add the metric to the macro below
+ * and enclose the name of the specific metric within defCmd().
+ * The position of the field will be dictated by the
+ * order in which the macros are enumerated and the offsets of the fields will
+ * be matched against ''PerfContextByLevelBase'' declared in perf_context.h.
+ */ +// clang-format off +#define DEF_PERF_CONTEXT_LEVEL_METRICS(defCmd) \ + defCmd(bloom_filter_useful) \ + defCmd(bloom_filter_full_positive) \ + defCmd(bloom_filter_full_true_positive) \ + defCmd(user_key_return_count) \ + defCmd(get_from_table_nanos) \ + defCmd(block_cache_hit_count) \ + defCmd(block_cache_miss_count) +// clang-format on + +// Break down performance counters by level and store per-level perf context in +// PerfContextByLevel +struct PerfContextByLevelInt { +#define EMIT_FIELDS(x) uint64_t x = 0; + DEF_PERF_CONTEXT_LEVEL_METRICS(EMIT_FIELDS) +#undef EMIT_FIELDS +}; + +/* + * Please add new metrics to this macro and appropriate fields will be copied, + * and/or emitted when converted to string. + * When people need to add new metrics please enclose the name of the specific + * metric within defCmd(). The position of the field will be dictated by the + * order in which the macros are enumerated and the offsets of the fields will + * be matched against ''PerfContextBase'' declared in perf_context.h. + */ + +// clang-format off +#define DEF_PERF_CONTEXT_METRICS(defCmd) \ + defCmd(user_key_comparison_count) \ + defCmd(block_cache_hit_count) \ + defCmd(block_read_count) \ + defCmd(block_read_byte) \ + defCmd(block_read_time) \ + defCmd(block_read_cpu_time) \ + defCmd(block_cache_index_hit_count) \ + defCmd(block_cache_standalone_handle_count) \ + defCmd(block_cache_real_handle_count) \ + defCmd(index_block_read_count) \ + defCmd(block_cache_filter_hit_count) \ + defCmd(filter_block_read_count) \ + defCmd(compression_dict_block_read_count) \ + defCmd(secondary_cache_hit_count) \ + defCmd(compressed_sec_cache_insert_real_count) \ + defCmd(compressed_sec_cache_insert_dummy_count) \ + defCmd(compressed_sec_cache_uncompressed_bytes) \ + defCmd(compressed_sec_cache_compressed_bytes) \ + defCmd(block_checksum_time) \ + defCmd(block_decompress_time) \ + defCmd(get_read_bytes) \ + defCmd(multiget_read_bytes) \ + defCmd(iter_read_bytes) \ + defCmd(blob_cache_hit_count) \ + defCmd(blob_read_count) \ + defCmd(blob_read_byte) \ + defCmd(blob_read_time) \ + defCmd(blob_checksum_time) \ + defCmd(blob_decompress_time) \ + defCmd(internal_key_skipped_count) \ + defCmd(internal_delete_skipped_count) \ + defCmd(internal_recent_skipped_count) \ + defCmd(internal_merge_count) \ + defCmd(internal_merge_point_lookup_count) \ + defCmd(internal_range_del_reseek_count) \ + defCmd(get_snapshot_time) \ + defCmd(get_from_memtable_time) \ + defCmd(get_from_memtable_count) \ + defCmd(get_post_process_time) \ + defCmd(get_from_output_files_time) \ + defCmd(seek_on_memtable_time) \ + defCmd(seek_on_memtable_count) \ + defCmd(next_on_memtable_count) \ + defCmd(prev_on_memtable_count) \ + defCmd(seek_child_seek_time) \ + defCmd(seek_child_seek_count) \ + defCmd(seek_min_heap_time) \ + defCmd(seek_max_heap_time) \ + defCmd(seek_internal_seek_time) \ + defCmd(find_next_user_entry_time) \ + defCmd(write_wal_time) \ + defCmd(write_memtable_time) \ + defCmd(write_delay_time) \ + defCmd(write_scheduling_flushes_compactions_time)\ + defCmd(write_pre_and_post_process_time) \ + defCmd(write_thread_wait_nanos) \ + defCmd(db_mutex_lock_nanos) \ + defCmd(db_condition_wait_nanos) \ + defCmd(merge_operator_time_nanos) \ + defCmd(read_index_block_nanos) \ + defCmd(read_filter_block_nanos) \ + defCmd(new_table_block_iter_nanos) \ + defCmd(new_table_iterator_nanos) \ + defCmd(block_seek_nanos) \ + defCmd(find_table_nanos) \ + defCmd(bloom_memtable_hit_count) \ + defCmd(bloom_memtable_miss_count) \ + 
defCmd(bloom_sst_hit_count) \ + defCmd(bloom_sst_miss_count) \ + defCmd(key_lock_wait_time) \ + defCmd(key_lock_wait_count) \ + defCmd(env_new_sequential_file_nanos) \ + defCmd(env_new_random_access_file_nanos) \ + defCmd(env_new_writable_file_nanos) \ + defCmd(env_reuse_writable_file_nanos) \ + defCmd(env_new_random_rw_file_nanos) \ + defCmd(env_new_directory_nanos) \ + defCmd(env_file_exists_nanos) \ + defCmd(env_get_children_nanos) \ + defCmd(env_get_children_file_attributes_nanos) \ + defCmd(env_delete_file_nanos) \ + defCmd(env_create_dir_nanos) \ + defCmd(env_create_dir_if_missing_nanos) \ + defCmd(env_delete_dir_nanos) \ + defCmd(env_get_file_size_nanos) \ + defCmd(env_get_file_modification_time_nanos) \ + defCmd(env_rename_file_nanos) \ + defCmd(env_link_file_nanos) \ + defCmd(env_lock_file_nanos) \ + defCmd(env_unlock_file_nanos) \ + defCmd(env_new_logger_nanos) \ + defCmd(get_cpu_nanos) \ + defCmd(iter_next_cpu_nanos) \ + defCmd(iter_prev_cpu_nanos) \ + defCmd(iter_seek_cpu_nanos) \ + defCmd(iter_next_count) \ + defCmd(iter_prev_count) \ + defCmd(iter_seek_count) \ + defCmd(encrypt_data_nanos) \ + defCmd(decrypt_data_nanos) \ + defCmd(number_async_seek) +// clang-format on + +struct PerfContextInt { +#define EMIT_FIELDS(x) uint64_t x; + DEF_PERF_CONTEXT_METRICS(EMIT_FIELDS) +#undef EMIT_FIELDS +}; + +#if defined(NPERF_CONTEXT) +// Should not be used because the counters are not thread-safe. +// Put here just to make get_perf_context() simple without ifdef. +PerfContext perf_context; +#else +thread_local PerfContext perf_context; +#endif + +PerfContext* get_perf_context() { + static_assert(sizeof(PerfContextBase) == sizeof(PerfContextInt)); + static_assert(sizeof(PerfContextByLevelBase) == + sizeof(PerfContextByLevelInt)); + /* + * Validate that we have the same fields and offsets between the external user + * facing + * ''PerfContextBase'' and ''PerfContextByLevelBase' structures with the + * internal structures that we generate from the DEF_* macros above. This way + * if people add metrics to the user-facing header file, they will have a + * build failure and need to add similar fields to the macros in this file. + * These are compile-time validations and don't impose any run-time penalties. 
+ */
+#define EMIT_OFFSET_ASSERTION(x) \
+  static_assert(offsetof(PerfContextBase, x) == offsetof(PerfContextInt, x));
+  DEF_PERF_CONTEXT_METRICS(EMIT_OFFSET_ASSERTION)
+#undef EMIT_OFFSET_ASSERTION
+#define EMIT_OFFSET_ASSERTION(x)                       \
+  static_assert(offsetof(PerfContextByLevelBase, x) == \
+                offsetof(PerfContextByLevelInt, x));
+  DEF_PERF_CONTEXT_LEVEL_METRICS(EMIT_OFFSET_ASSERTION)
+#undef EMIT_OFFSET_ASSERTION
+  return &perf_context;
+}
+
+PerfContext::~PerfContext() {
+#if !defined(NPERF_CONTEXT) && !defined(OS_SOLARIS)
+  ClearPerLevelPerfContext();
+#endif
+}
+
+PerfContext::PerfContext(const PerfContext& other) {
+#ifdef NPERF_CONTEXT
+  (void)other;
+#else
+  copyMetrics(&other);
+#endif
+}
+
+PerfContext::PerfContext(PerfContext&& other) noexcept {
+#ifdef NPERF_CONTEXT
+  (void)other;
+#else
+  copyMetrics(&other);
+#endif
+}
+
+PerfContext& PerfContext::operator=(const PerfContext& other) {
+#ifdef NPERF_CONTEXT
+  (void)other;
+#else
+  copyMetrics(&other);
+#endif
+  return *this;
+}
+
+void PerfContext::copyMetrics(const PerfContext* other) noexcept {
+#ifdef NPERF_CONTEXT
+  (void)other;
+#else
+#define EMIT_COPY_FIELDS(x) x = other->x;
+  DEF_PERF_CONTEXT_METRICS(EMIT_COPY_FIELDS)
+#undef EMIT_COPY_FIELDS
+  if (per_level_perf_context_enabled && level_to_perf_context != nullptr) {
+    ClearPerLevelPerfContext();
+  }
+  if (other->level_to_perf_context != nullptr) {
+    level_to_perf_context = new std::map<uint32_t, PerfContextByLevel>();
+    *level_to_perf_context = *other->level_to_perf_context;
+  }
+  per_level_perf_context_enabled = other->per_level_perf_context_enabled;
+#endif
+}
+
+void PerfContext::Reset() {
+#ifndef NPERF_CONTEXT
+#define EMIT_FIELDS(x) x = 0;
+  DEF_PERF_CONTEXT_METRICS(EMIT_FIELDS)
+#undef EMIT_FIELDS
+  if (per_level_perf_context_enabled && level_to_perf_context) {
+    for (auto& kv : *level_to_perf_context) {
+      kv.second.Reset();
+    }
+  }
+#endif
+}
+
+void PerfContextByLevel::Reset() {
+#ifndef NPERF_CONTEXT
+#define EMIT_FIELDS(x) x = 0;
+  DEF_PERF_CONTEXT_LEVEL_METRICS(EMIT_FIELDS)
+#undef EMIT_FIELDS
+#endif
+}
+
+std::string PerfContext::ToString(bool exclude_zero_counters) const {
+#ifdef NPERF_CONTEXT
+  (void)exclude_zero_counters;
+  return "";
+#else
+  std::ostringstream ss;
+#define PERF_CONTEXT_OUTPUT(counter)             \
+  if (!exclude_zero_counters || (counter > 0)) { \
+    ss << #counter << " = " << counter << ", ";  \
+  }
+  DEF_PERF_CONTEXT_METRICS(PERF_CONTEXT_OUTPUT)
+#undef PERF_CONTEXT_OUTPUT
+  if (per_level_perf_context_enabled && level_to_perf_context) {
+#define PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(counter)        \
+  ss << #counter << " = ";                                       \
+  for (auto& kv : *level_to_perf_context) {                      \
+    if (!exclude_zero_counters || (kv.second.counter > 0)) {     \
+      ss << kv.second.counter << "@level" << kv.first << ", ";   \
+    }                                                            \
+  }
+  DEF_PERF_CONTEXT_LEVEL_METRICS(PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER)
+#undef PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER
+  }
+  std::string str = ss.str();
+  str.erase(str.find_last_not_of(", ") + 1);
+  return str;
+#endif
+}
+
+void PerfContext::EnablePerLevelPerfContext() {
+  if (level_to_perf_context == nullptr) {
+    level_to_perf_context = new std::map<uint32_t, PerfContextByLevel>();
+  }
+  per_level_perf_context_enabled = true;
+}
+
+void PerfContext::DisablePerLevelPerfContext() {
+  per_level_perf_context_enabled = false;
+}
+
+void PerfContext::ClearPerLevelPerfContext() {
+  if (level_to_perf_context != nullptr) {
+    level_to_perf_context->clear();
+    delete level_to_perf_context;
+    level_to_perf_context = nullptr;
+  }
+  per_level_perf_context_enabled = false;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
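To make the thread-local machinery above concrete, a usage sketch (SetPerfLevel and get_perf_context are the public entry points from rocksdb/perf_context.h and rocksdb/perf_level.h):

    #include <iostream>
    #include "rocksdb/perf_context.h"
    #include "rocksdb/perf_level.h"

    void ProfileReadsOnThisThread() {
      using namespace ROCKSDB_NAMESPACE;
      SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
      get_perf_context()->Reset();
      get_perf_context()->EnablePerLevelPerfContext();  // also collect per-LSM-level counters
      // ... issue Get()/Seek() calls ...
      std::cout << get_perf_context()->ToString(/*exclude_zero_counters=*/true) << "\n";
      SetPerfLevel(PerfLevel::kDisable);
    }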
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_context_imp.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_context_imp.h new file mode 100644 index 0000000000..5b66ff2ff9 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_context_imp.h @@ -0,0 +1,96 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#include "monitoring/perf_step_timer.h" +#include "rocksdb/perf_context.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { +#if defined(NPERF_CONTEXT) +extern PerfContext perf_context; +#else +#if defined(OS_SOLARIS) +extern thread_local PerfContext perf_context_; +#define perf_context (*get_perf_context()) +#else +extern thread_local PerfContext perf_context; +#endif +#endif + +#if defined(NPERF_CONTEXT) + +#define PERF_TIMER_STOP(metric) +#define PERF_TIMER_START(metric) +#define PERF_TIMER_GUARD(metric) +#define PERF_TIMER_GUARD_WITH_CLOCK(metric, clock) +#define PERF_CPU_TIMER_GUARD(metric, clock) +#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ + ticker_type) +#define PERF_TIMER_MEASURE(metric) +#define PERF_COUNTER_ADD(metric, value) +#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) + +#else + +// Stop the timer and update the metric +#define PERF_TIMER_STOP(metric) perf_step_timer_##metric.Stop(); + +#define PERF_TIMER_START(metric) perf_step_timer_##metric.Start(); + +// Declare and set start time of the timer +#define PERF_TIMER_GUARD(metric) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric)); \ + perf_step_timer_##metric.Start(); + +// Declare and set start time of the timer +#define PERF_TIMER_GUARD_WITH_CLOCK(metric, clock) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), clock); \ + perf_step_timer_##metric.Start(); + +// Declare and set start time of the timer +#define PERF_CPU_TIMER_GUARD(metric, clock) \ + PerfStepTimer perf_step_timer_##metric( \ + &(perf_context.metric), clock, true, \ + PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ + perf_step_timer_##metric.Start(); + +#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ + ticker_type) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), nullptr, \ + false, PerfLevel::kEnableTime, stats, \ + ticker_type); \ + if (condition) { \ + perf_step_timer_##metric.Start(); \ + } + +// Update metric with time elapsed since last START. start time is reset +// to current timestamp. 
+#define PERF_TIMER_MEASURE(metric) perf_step_timer_##metric.Measure();
+
+// Increase metric value
+#define PERF_COUNTER_ADD(metric, value)        \
+  if (perf_level >= PerfLevel::kEnableCount) { \
+    perf_context.metric += value;              \
+  }
+
+// Increase metric value
+#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level)               \
+  if (perf_level >= PerfLevel::kEnableCount &&                        \
+      perf_context.per_level_perf_context_enabled &&                  \
+      perf_context.level_to_perf_context) {                           \
+    if ((*(perf_context.level_to_perf_context)).find(level) !=        \
+        (*(perf_context.level_to_perf_context)).end()) {              \
+      (*(perf_context.level_to_perf_context))[level].metric += value; \
+    } else {                                                          \
+      PerfContextByLevel empty_context;                               \
+      (*(perf_context.level_to_perf_context))[level] = empty_context; \
+      (*(perf_context.level_to_perf_context))[level].metric += value; \
+    }                                                                 \
+  }
+
+#endif
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_level.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_level.cc
new file mode 100644
index 0000000000..e3507624b6
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_level.cc
@@ -0,0 +1,23 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include <assert.h>
+
+#include "monitoring/perf_level_imp.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+thread_local PerfLevel perf_level = kEnableCount;
+
+void SetPerfLevel(PerfLevel level) {
+  assert(level > kUninitialized);
+  assert(level < kOutOfBounds);
+  perf_level = level;
+}
+
+PerfLevel GetPerfLevel() { return perf_level; }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_level_imp.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_level_imp.h
new file mode 100644
index 0000000000..28bd185cd1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_level_imp.h
@@ -0,0 +1,14 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include "port/port.h"
+#include "rocksdb/perf_level.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern thread_local PerfLevel perf_level;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_step_timer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_step_timer.h
new file mode 100644
index 0000000000..f6c45d773c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/perf_step_timer.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
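The PERF_* macros above are how call sites inside the RocksDB sources touch the context, and they compile away entirely under NPERF_CONTEXT. A sketch of typical use inside a function body (the metric names are fields from the DEF_PERF_CONTEXT_METRICS list):

    void LookupSketch() {
      PERF_TIMER_GUARD(get_from_memtable_time);    // scoped PerfStepTimer, stops in destructor
      PERF_COUNTER_ADD(user_key_comparison_count, 1);
      // ... do work ...
      PERF_TIMER_MEASURE(get_from_memtable_time);  // flush elapsed time, restart the window
      // ... more work, timed until scope exit ...
    }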
+//
+#pragma once
+#include "monitoring/perf_level_imp.h"
+#include "monitoring/statistics_impl.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PerfStepTimer {
+ public:
+  explicit PerfStepTimer(
+      uint64_t* metric, SystemClock* clock = nullptr, bool use_cpu_time = false,
+      PerfLevel enable_level = PerfLevel::kEnableTimeExceptForMutex,
+      Statistics* statistics = nullptr, uint32_t ticker_type = 0)
+      : perf_counter_enabled_(perf_level >= enable_level),
+        use_cpu_time_(use_cpu_time),
+        ticker_type_(ticker_type),
+        clock_((perf_counter_enabled_ || statistics != nullptr)
+                   ? (clock ? clock : SystemClock::Default().get())
+                   : nullptr),
+        start_(0),
+        metric_(metric),
+        statistics_(statistics) {}
+
+  ~PerfStepTimer() { Stop(); }
+
+  void Start() {
+    if (perf_counter_enabled_ || statistics_ != nullptr) {
+      start_ = time_now();
+    }
+  }
+
+  void Measure() {
+    if (start_) {
+      uint64_t now = time_now();
+      *metric_ += now - start_;
+      start_ = now;
+    }
+  }
+
+  void Stop() {
+    if (start_) {
+      uint64_t duration = time_now() - start_;
+      if (perf_counter_enabled_) {
+        *metric_ += duration;
+      }
+
+      if (statistics_ != nullptr) {
+        RecordTick(statistics_, ticker_type_, duration);
+      }
+      start_ = 0;
+    }
+  }
+
+ private:
+  uint64_t time_now() {
+    if (!use_cpu_time_) {
+      return clock_->NowNanos();
+    } else {
+      return clock_->CPUNanos();
+    }
+  }
+
+  const bool perf_counter_enabled_;
+  const bool use_cpu_time_;
+  uint32_t ticker_type_;
+  SystemClock* const clock_;
+  uint64_t start_;
+  uint64_t* metric_;
+  Statistics* statistics_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/persistent_stats_history.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/persistent_stats_history.cc
new file mode 100644
index 0000000000..f4c022148c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/persistent_stats_history.cc
@@ -0,0 +1,170 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "monitoring/persistent_stats_history.h"
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/db_impl/db_impl.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+// 10 digit seconds timestamp => [Sep 9, 2001 ~ Nov 20, 2286]
+const int kNowSecondsStringLength = 10;
+const std::string kFormatVersionKeyString =
+    "__persistent_stats_format_version__";
+const std::string kCompatibleVersionKeyString =
+    "__persistent_stats_compatible_version__";
+// Every release maintains two version numbers for persistent stats: Current
+// format version and compatible format version. Current format version
+// designates what type of encoding will be used when writing to stats CF;
+// compatible format version designates the minimum format version that
+// can decode the stats CF encoded using the current format version.
+const uint64_t kStatsCFCurrentFormatVersion = 1;
+const uint64_t kStatsCFCompatibleFormatVersion = 1;
+
+Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type,
+                                          uint64_t* version_number) {
+  if (type >= StatsVersionKeyType::kKeyTypeMax) {
+    return Status::InvalidArgument("Invalid stats version key type provided");
+  }
+  std::string key;
+  if (type == StatsVersionKeyType::kFormatVersion) {
+    key = kFormatVersionKeyString;
+  } else if (type == StatsVersionKeyType::kCompatibleVersion) {
+    key = kCompatibleVersionKeyString;
+  }
+  ReadOptions options;
+  options.verify_checksums = true;
+  std::string result;
+  Status s = db->Get(options, db->PersistentStatsColumnFamily(), key, &result);
+  if (!s.ok() || result.empty()) {
+    return Status::NotFound("Persistent stats version key " + key +
+                            " not found.");
+  }
+
+  // read version_number but do nothing in current version
+  *version_number = ParseUint64(result);
+  return Status::OK();
+}
+
+int EncodePersistentStatsKey(uint64_t now_seconds, const std::string& key,
+                             int size, char* buf) {
+  char timestamp[kNowSecondsStringLength + 1];
+  // make time stamp string equal in length to allow sorting by time
+  snprintf(timestamp, sizeof(timestamp), "%010d",
+           static_cast<int>(now_seconds));
+  timestamp[kNowSecondsStringLength] = '\0';
+  return snprintf(buf, size, "%s#%s", timestamp, key.c_str());
+}
+
+void OptimizeForPersistentStats(ColumnFamilyOptions* cfo) {
+  cfo->write_buffer_size = 2 << 20;
+  cfo->target_file_size_base = 2 * 1048576;
+  cfo->max_bytes_for_level_base = 10 * 1048576;
+  cfo->soft_pending_compaction_bytes_limit = 256 * 1048576;
+  cfo->hard_pending_compaction_bytes_limit = 1073741824ul;
+  cfo->compression = kNoCompression;
+}
+
+PersistentStatsHistoryIterator::~PersistentStatsHistoryIterator() {}
+
+bool PersistentStatsHistoryIterator::Valid() const { return valid_; }
+
+Status PersistentStatsHistoryIterator::status() const { return status_; }
+
+void PersistentStatsHistoryIterator::Next() {
+  // increment start_time by 1 to avoid infinite loop
+  AdvanceIteratorByTime(GetStatsTime() + 1, end_time_);
+}
+
+uint64_t PersistentStatsHistoryIterator::GetStatsTime() const { return time_; }
+
+const std::map<std::string, uint64_t>&
+PersistentStatsHistoryIterator::GetStatsMap() const {
+  return stats_map_;
+}
+
+std::pair<uint64_t, std::string> parseKey(const Slice& key,
+                                          uint64_t start_time) {
+  std::pair<uint64_t, std::string> result;
+  std::string key_str = key.ToString();
+  std::string::size_type pos = key_str.find("#");
+  // TODO(Zhongyi): add counters to track parse failures?
+  if (pos == std::string::npos) {
+    result.first = std::numeric_limits<uint64_t>::max();
+    result.second.clear();
+  } else {
+    uint64_t parsed_time = ParseUint64(key_str.substr(0, pos));
+    // skip entries with timestamp smaller than start_time
+    if (parsed_time < start_time) {
+      result.first = std::numeric_limits<uint64_t>::max();
+      result.second = "";
+    } else {
+      result.first = parsed_time;
+      std::string key_resize = key_str.substr(pos + 1);
+      result.second = key_resize;
+    }
+  }
+  return result;
+}
+
+// advance the iterator to the next time between [start_time, end_time)
+// if success, update time_ and stats_map_ with new_time and stats_map
+void PersistentStatsHistoryIterator::AdvanceIteratorByTime(uint64_t start_time,
+                                                           uint64_t end_time) {
+  // try to find next entry in stats_history_ map
+  if (db_impl_ != nullptr) {
+    ReadOptions ro;
+    Iterator* iter =
+        db_impl_->NewIterator(ro, db_impl_->PersistentStatsColumnFamily());
+
+    char timestamp[kNowSecondsStringLength + 1];
+    snprintf(timestamp, sizeof(timestamp), "%010d",
+             static_cast<int>(std::max(time_, start_time)));
+    timestamp[kNowSecondsStringLength] = '\0';
+
+    iter->Seek(timestamp);
+    // no more entries with timestamp >= start_time is found or version key
+    // is found to be incompatible
+    if (!iter->Valid()) {
+      valid_ = false;
+      delete iter;
+      return;
+    }
+    time_ = parseKey(iter->key(), start_time).first;
+    valid_ = true;
+    // check parsed time and invalid if it exceeds end_time
+    if (time_ > end_time) {
+      valid_ = false;
+      delete iter;
+      return;
+    }
+    // find all entries with timestamp equal to time_
+    std::map<std::string, uint64_t> new_stats_map;
+    std::pair<uint64_t, std::string> kv;
+    for (; iter->Valid(); iter->Next()) {
+      kv = parseKey(iter->key(), start_time);
+      if (kv.first != time_) {
+        break;
+      }
+      if (kv.second.compare(kFormatVersionKeyString) == 0) {
+        continue;
+      }
+      new_stats_map[kv.second] = ParseUint64(iter->value().ToString());
+    }
+    stats_map_.swap(new_stats_map);
+    delete iter;
+  } else {
+    valid_ = false;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/persistent_stats_history.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/persistent_stats_history.h
new file mode 100644
index 0000000000..7c711fe4ef
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/persistent_stats_history.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
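A small worked example of the encoding implemented in the .cc above (values are illustrative): EncodePersistentStatsKey zero-pads the seconds timestamp to 10 digits so lexicographic key order equals time order, and parseKey splits on the '#' separator:

    char buf[100];
    ROCKSDB_NAMESPACE::EncodePersistentStatsKey(
        1600000000, "rocksdb.block.cache.miss", sizeof(buf), buf);
    // buf now holds "1600000000#rocksdb.block.cache.miss";
    // parseKey(Slice(buf), /*start_time=*/0) yields
    // {1600000000, "rocksdb.block.cache.miss"}.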
+
+#pragma once
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/stats_history.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const std::string kFormatVersionKeyString;
+extern const std::string kCompatibleVersionKeyString;
+extern const uint64_t kStatsCFCurrentFormatVersion;
+extern const uint64_t kStatsCFCompatibleFormatVersion;
+
+enum StatsVersionKeyType : uint32_t {
+  kFormatVersion = 1,
+  kCompatibleVersion = 2,
+  kKeyTypeMax = 3
+};
+
+// Read the version number from persistent stats cf depending on type provided
+// stores the version number in `*version_number`
+// returns Status::OK() on success, or other status code on failure
+Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type,
+                                          uint64_t* version_number);
+
+// Encode timestamp and stats key into buf
+// Format: timestamp(10 digit) + '#' + key
+// Total length of encoded key will be capped at 100 bytes
+int EncodePersistentStatsKey(uint64_t timestamp, const std::string& key,
+                             int size, char* buf);
+
+void OptimizeForPersistentStats(ColumnFamilyOptions* cfo);
+
+class PersistentStatsHistoryIterator final : public StatsHistoryIterator {
+ public:
+  PersistentStatsHistoryIterator(uint64_t start_time, uint64_t end_time,
+                                 DBImpl* db_impl)
+      : time_(0),
+        start_time_(start_time),
+        end_time_(end_time),
+        valid_(true),
+        db_impl_(db_impl) {
+    AdvanceIteratorByTime(start_time_, end_time_);
+  }
+  ~PersistentStatsHistoryIterator() override;
+  bool Valid() const override;
+  Status status() const override;
+
+  void Next() override;
+  uint64_t GetStatsTime() const override;
+
+  const std::map<std::string, uint64_t>& GetStatsMap() const override;
+
+ private:
+  // advance the iterator to the next stats history record with timestamp
+  // between [start_time, end_time)
+  void AdvanceIteratorByTime(uint64_t start_time, uint64_t end_time);
+
+  // No copying allowed
+  PersistentStatsHistoryIterator(const PersistentStatsHistoryIterator&) =
+      delete;
+  void operator=(const PersistentStatsHistoryIterator&) = delete;
+  PersistentStatsHistoryIterator(PersistentStatsHistoryIterator&&) = delete;
+  PersistentStatsHistoryIterator& operator=(PersistentStatsHistoryIterator&&) =
+      delete;
+
+  uint64_t time_;
+  uint64_t start_time_;
+  uint64_t end_time_;
+  std::map<std::string, uint64_t> stats_map_;
+  Status status_;
+  bool valid_;
+  DBImpl* db_impl_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/statistics.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/statistics.cc
new file mode 100644
index 0000000000..0e9c2250f2
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/statistics.cc
@@ -0,0 +1,534 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
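Both this iterator and the in-memory one earlier are consumed through DB::GetStatsHistory; a hedged sketch (error handling elided):

    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/stats_history.h"

    void WalkStatsHistory(ROCKSDB_NAMESPACE::DB* db, uint64_t start, uint64_t end) {
      std::unique_ptr<ROCKSDB_NAMESPACE::StatsHistoryIterator> it;
      db->GetStatsHistory(start, end, &it);  // [start, end) in unix seconds
      for (; it != nullptr && it->Valid(); it->Next()) {
        uint64_t t = it->GetStatsTime();
        for (const auto& kv : it->GetStatsMap()) {
          // kv.first: stat name, kv.second: its value at time t
          (void)t;
        }
      }
    }

Which implementation is returned depends on the persist_stats_to_disk DBOption.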
+//
+#include "rocksdb/statistics.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+
+#include "monitoring/statistics_impl.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The order of items listed in Tickers should be the same as
+// the order listed in TickersNameMap
+const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
+    {BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"},
+    {BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"},
+    {BLOCK_CACHE_ADD, "rocksdb.block.cache.add"},
+    {BLOCK_CACHE_ADD_FAILURES, "rocksdb.block.cache.add.failures"},
+    {BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"},
+    {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"},
+    {BLOCK_CACHE_INDEX_ADD, "rocksdb.block.cache.index.add"},
+    {BLOCK_CACHE_INDEX_BYTES_INSERT, "rocksdb.block.cache.index.bytes.insert"},
+    {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"},
+    {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
+    {BLOCK_CACHE_FILTER_ADD, "rocksdb.block.cache.filter.add"},
+    {BLOCK_CACHE_FILTER_BYTES_INSERT,
+     "rocksdb.block.cache.filter.bytes.insert"},
+    {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
+    {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
+    {BLOCK_CACHE_DATA_ADD, "rocksdb.block.cache.data.add"},
+    {BLOCK_CACHE_DATA_BYTES_INSERT, "rocksdb.block.cache.data.bytes.insert"},
+    {BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"},
+    {BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"},
+    {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
+    {BLOOM_FILTER_FULL_POSITIVE, "rocksdb.bloom.filter.full.positive"},
+    {BLOOM_FILTER_FULL_TRUE_POSITIVE,
+     "rocksdb.bloom.filter.full.true.positive"},
+    {PERSISTENT_CACHE_HIT, "rocksdb.persistent.cache.hit"},
+    {PERSISTENT_CACHE_MISS, "rocksdb.persistent.cache.miss"},
+    {SIM_BLOCK_CACHE_HIT, "rocksdb.sim.block.cache.hit"},
+    {SIM_BLOCK_CACHE_MISS, "rocksdb.sim.block.cache.miss"},
+    {MEMTABLE_HIT, "rocksdb.memtable.hit"},
+    {MEMTABLE_MISS, "rocksdb.memtable.miss"},
+    {GET_HIT_L0, "rocksdb.l0.hit"},
+    {GET_HIT_L1, "rocksdb.l1.hit"},
+    {GET_HIT_L2_AND_UP, "rocksdb.l2andup.hit"},
+    {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"},
+    {COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"},
+    {COMPACTION_KEY_DROP_RANGE_DEL, "rocksdb.compaction.key.drop.range_del"},
+    {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"},
+    {COMPACTION_RANGE_DEL_DROP_OBSOLETE,
+     "rocksdb.compaction.range_del.drop.obsolete"},
+    {COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+     "rocksdb.compaction.optimized.del.drop.obsolete"},
+    {COMPACTION_CANCELLED, "rocksdb.compaction.cancelled"},
+    {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"},
+    {NUMBER_KEYS_READ, "rocksdb.number.keys.read"},
+    {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
+    {BYTES_WRITTEN, "rocksdb.bytes.written"},
+    {BYTES_READ, "rocksdb.bytes.read"},
+    {NUMBER_DB_SEEK, "rocksdb.number.db.seek"},
+    {NUMBER_DB_NEXT, "rocksdb.number.db.next"},
+    {NUMBER_DB_PREV, "rocksdb.number.db.prev"},
+    {NUMBER_DB_SEEK_FOUND, "rocksdb.number.db.seek.found"},
+    {NUMBER_DB_NEXT_FOUND, "rocksdb.number.db.next.found"},
+    {NUMBER_DB_PREV_FOUND, "rocksdb.number.db.prev.found"},
+    {ITER_BYTES_READ, "rocksdb.db.iter.bytes.read"},
+    {NO_FILE_OPENS, "rocksdb.no.file.opens"},
+    {NO_FILE_ERRORS, "rocksdb.no.file.errors"},
+    {STALL_MICROS, "rocksdb.stall.micros"},
+    {DB_MUTEX_WAIT_MICROS, "rocksdb.db.mutex.wait.micros"},
+
{NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"}, + {NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"}, + {NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"}, + {NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"}, + {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"}, + {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"}, + {BLOOM_FILTER_PREFIX_TRUE_POSITIVE, + "rocksdb.bloom.filter.prefix.true.positive"}, + {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"}, + {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"}, + {WAL_FILE_SYNCED, "rocksdb.wal.synced"}, + {WAL_FILE_BYTES, "rocksdb.wal.bytes"}, + {WRITE_DONE_BY_SELF, "rocksdb.write.self"}, + {WRITE_DONE_BY_OTHER, "rocksdb.write.other"}, + {WRITE_WITH_WAL, "rocksdb.write.wal"}, + {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"}, + {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"}, + {FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"}, + {COMPACT_READ_BYTES_MARKED, "rocksdb.compact.read.marked.bytes"}, + {COMPACT_READ_BYTES_PERIODIC, "rocksdb.compact.read.periodic.bytes"}, + {COMPACT_READ_BYTES_TTL, "rocksdb.compact.read.ttl.bytes"}, + {COMPACT_WRITE_BYTES_MARKED, "rocksdb.compact.write.marked.bytes"}, + {COMPACT_WRITE_BYTES_PERIODIC, "rocksdb.compact.write.periodic.bytes"}, + {COMPACT_WRITE_BYTES_TTL, "rocksdb.compact.write.ttl.bytes"}, + {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, + "rocksdb.number.direct.load.table.properties"}, + {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"}, + {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"}, + {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"}, + {NUMBER_BLOCK_COMPRESSED, "rocksdb.number.block.compressed"}, + {NUMBER_BLOCK_DECOMPRESSED, "rocksdb.number.block.decompressed"}, + {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"}, + {MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"}, + {FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"}, + {ROW_CACHE_HIT, "rocksdb.row.cache.hit"}, + {ROW_CACHE_MISS, "rocksdb.row.cache.miss"}, + {READ_AMP_ESTIMATE_USEFUL_BYTES, "rocksdb.read.amp.estimate.useful.bytes"}, + {READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"}, + {NUMBER_RATE_LIMITER_DRAINS, "rocksdb.number.rate_limiter.drains"}, + {NUMBER_ITER_SKIP, "rocksdb.number.iter.skip"}, + {BLOB_DB_NUM_PUT, "rocksdb.blobdb.num.put"}, + {BLOB_DB_NUM_WRITE, "rocksdb.blobdb.num.write"}, + {BLOB_DB_NUM_GET, "rocksdb.blobdb.num.get"}, + {BLOB_DB_NUM_MULTIGET, "rocksdb.blobdb.num.multiget"}, + {BLOB_DB_NUM_SEEK, "rocksdb.blobdb.num.seek"}, + {BLOB_DB_NUM_NEXT, "rocksdb.blobdb.num.next"}, + {BLOB_DB_NUM_PREV, "rocksdb.blobdb.num.prev"}, + {BLOB_DB_NUM_KEYS_WRITTEN, "rocksdb.blobdb.num.keys.written"}, + {BLOB_DB_NUM_KEYS_READ, "rocksdb.blobdb.num.keys.read"}, + {BLOB_DB_BYTES_WRITTEN, "rocksdb.blobdb.bytes.written"}, + {BLOB_DB_BYTES_READ, "rocksdb.blobdb.bytes.read"}, + {BLOB_DB_WRITE_INLINED, "rocksdb.blobdb.write.inlined"}, + {BLOB_DB_WRITE_INLINED_TTL, "rocksdb.blobdb.write.inlined.ttl"}, + {BLOB_DB_WRITE_BLOB, "rocksdb.blobdb.write.blob"}, + {BLOB_DB_WRITE_BLOB_TTL, "rocksdb.blobdb.write.blob.ttl"}, + {BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"}, + {BLOB_DB_BLOB_FILE_BYTES_READ, "rocksdb.blobdb.blob.file.bytes.read"}, + {BLOB_DB_BLOB_FILE_SYNCED, "rocksdb.blobdb.blob.file.synced"}, + {BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, + "rocksdb.blobdb.blob.index.expired.count"}, + 
{BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, "rocksdb.blobdb.blob.index.expired.size"}, + {BLOB_DB_BLOB_INDEX_EVICTED_COUNT, + "rocksdb.blobdb.blob.index.evicted.count"}, + {BLOB_DB_BLOB_INDEX_EVICTED_SIZE, "rocksdb.blobdb.blob.index.evicted.size"}, + {BLOB_DB_GC_NUM_FILES, "rocksdb.blobdb.gc.num.files"}, + {BLOB_DB_GC_NUM_NEW_FILES, "rocksdb.blobdb.gc.num.new.files"}, + {BLOB_DB_GC_FAILURES, "rocksdb.blobdb.gc.failures"}, + {BLOB_DB_GC_NUM_KEYS_RELOCATED, "rocksdb.blobdb.gc.num.keys.relocated"}, + {BLOB_DB_GC_BYTES_RELOCATED, "rocksdb.blobdb.gc.bytes.relocated"}, + {BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"}, + {BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"}, + {BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"}, + {TXN_PREPARE_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.prepare"}, + {TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD, + "rocksdb.txn.overhead.mutex.old.commit.map"}, + {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"}, + {TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"}, + {TXN_GET_TRY_AGAIN, "rocksdb.txn.get.tryagain"}, + {NUMBER_MULTIGET_KEYS_FOUND, "rocksdb.number.multiget.keys.found"}, + {NO_ITERATOR_CREATED, "rocksdb.num.iterator.created"}, + {NO_ITERATOR_DELETED, "rocksdb.num.iterator.deleted"}, + {BLOCK_CACHE_COMPRESSION_DICT_MISS, + "rocksdb.block.cache.compression.dict.miss"}, + {BLOCK_CACHE_COMPRESSION_DICT_HIT, + "rocksdb.block.cache.compression.dict.hit"}, + {BLOCK_CACHE_COMPRESSION_DICT_ADD, + "rocksdb.block.cache.compression.dict.add"}, + {BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + "rocksdb.block.cache.compression.dict.bytes.insert"}, + {BLOCK_CACHE_ADD_REDUNDANT, "rocksdb.block.cache.add.redundant"}, + {BLOCK_CACHE_INDEX_ADD_REDUNDANT, + "rocksdb.block.cache.index.add.redundant"}, + {BLOCK_CACHE_FILTER_ADD_REDUNDANT, + "rocksdb.block.cache.filter.add.redundant"}, + {BLOCK_CACHE_DATA_ADD_REDUNDANT, "rocksdb.block.cache.data.add.redundant"}, + {BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, + "rocksdb.block.cache.compression.dict.add.redundant"}, + {FILES_MARKED_TRASH, "rocksdb.files.marked.trash"}, + {FILES_DELETED_IMMEDIATELY, "rocksdb.files.deleted.immediately"}, + {ERROR_HANDLER_BG_ERROR_COUNT, "rocksdb.error.handler.bg.errro.count"}, + {ERROR_HANDLER_BG_IO_ERROR_COUNT, + "rocksdb.error.handler.bg.io.errro.count"}, + {ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, + "rocksdb.error.handler.bg.retryable.io.errro.count"}, + {ERROR_HANDLER_AUTORESUME_COUNT, "rocksdb.error.handler.autoresume.count"}, + {ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, + "rocksdb.error.handler.autoresume.retry.total.count"}, + {ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT, + "rocksdb.error.handler.autoresume.success.count"}, + {MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + "rocksdb.memtable.payload.bytes.at.flush"}, + {MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + "rocksdb.memtable.garbage.bytes.at.flush"}, + {SECONDARY_CACHE_HITS, "rocksdb.secondary.cache.hits"}, + {VERIFY_CHECKSUM_READ_BYTES, "rocksdb.verify_checksum.read.bytes"}, + {BACKUP_READ_BYTES, "rocksdb.backup.read.bytes"}, + {BACKUP_WRITE_BYTES, "rocksdb.backup.write.bytes"}, + {REMOTE_COMPACT_READ_BYTES, "rocksdb.remote.compact.read.bytes"}, + {REMOTE_COMPACT_WRITE_BYTES, "rocksdb.remote.compact.write.bytes"}, + {HOT_FILE_READ_BYTES, "rocksdb.hot.file.read.bytes"}, + {WARM_FILE_READ_BYTES, "rocksdb.warm.file.read.bytes"}, + {COLD_FILE_READ_BYTES, "rocksdb.cold.file.read.bytes"}, + {HOT_FILE_READ_COUNT, "rocksdb.hot.file.read.count"}, + {WARM_FILE_READ_COUNT, 
"rocksdb.warm.file.read.count"}, + {COLD_FILE_READ_COUNT, "rocksdb.cold.file.read.count"}, + {LAST_LEVEL_READ_BYTES, "rocksdb.last.level.read.bytes"}, + {LAST_LEVEL_READ_COUNT, "rocksdb.last.level.read.count"}, + {NON_LAST_LEVEL_READ_BYTES, "rocksdb.non.last.level.read.bytes"}, + {NON_LAST_LEVEL_READ_COUNT, "rocksdb.non.last.level.read.count"}, + {LAST_LEVEL_SEEK_FILTERED, "rocksdb.last.level.seek.filtered"}, + {LAST_LEVEL_SEEK_FILTER_MATCH, "rocksdb.last.level.seek.filter.match"}, + {LAST_LEVEL_SEEK_DATA, "rocksdb.last.level.seek.data"}, + {LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER, + "rocksdb.last.level.seek.data.useful.no.filter"}, + {LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH, + "rocksdb.last.level.seek.data.useful.filter.match"}, + {NON_LAST_LEVEL_SEEK_FILTERED, "rocksdb.non.last.level.seek.filtered"}, + {NON_LAST_LEVEL_SEEK_FILTER_MATCH, + "rocksdb.non.last.level.seek.filter.match"}, + {NON_LAST_LEVEL_SEEK_DATA, "rocksdb.non.last.level.seek.data"}, + {NON_LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER, + "rocksdb.non.last.level.seek.data.useful.no.filter"}, + {NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH, + "rocksdb.non.last.level.seek.data.useful.filter.match"}, + {BLOCK_CHECKSUM_COMPUTE_COUNT, "rocksdb.block.checksum.compute.count"}, + {BLOCK_CHECKSUM_MISMATCH_COUNT, "rocksdb.block.checksum.mismatch.count"}, + {MULTIGET_COROUTINE_COUNT, "rocksdb.multiget.coroutine.count"}, + {BLOB_DB_CACHE_MISS, "rocksdb.blobdb.cache.miss"}, + {BLOB_DB_CACHE_HIT, "rocksdb.blobdb.cache.hit"}, + {BLOB_DB_CACHE_ADD, "rocksdb.blobdb.cache.add"}, + {BLOB_DB_CACHE_ADD_FAILURES, "rocksdb.blobdb.cache.add.failures"}, + {BLOB_DB_CACHE_BYTES_READ, "rocksdb.blobdb.cache.bytes.read"}, + {BLOB_DB_CACHE_BYTES_WRITE, "rocksdb.blobdb.cache.bytes.write"}, + {READ_ASYNC_MICROS, "rocksdb.read.async.micros"}, + {ASYNC_READ_ERROR_COUNT, "rocksdb.async.read.error.count"}, + {SECONDARY_CACHE_FILTER_HITS, "rocksdb.secondary.cache.filter.hits"}, + {SECONDARY_CACHE_INDEX_HITS, "rocksdb.secondary.cache.index.hits"}, + {SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"}, + {TABLE_OPEN_PREFETCH_TAIL_MISS, "rocksdb.table.open.prefetch.tail.miss"}, + {TABLE_OPEN_PREFETCH_TAIL_HIT, "rocksdb.table.open.prefetch.tail.hit"}, + {TIMESTAMP_FILTER_TABLE_CHECKED, "rocksdb.timestamp.filter.table.checked"}, + {TIMESTAMP_FILTER_TABLE_FILTERED, + "rocksdb.timestamp.filter.table.filtered"}, + {BYTES_COMPRESSED_FROM, "rocksdb.bytes.compressed.from"}, + {BYTES_COMPRESSED_TO, "rocksdb.bytes.compressed.to"}, + {BYTES_COMPRESSION_BYPASSED, "rocksdb.bytes.compression_bypassed"}, + {BYTES_COMPRESSION_REJECTED, "rocksdb.bytes.compression.rejected"}, + {NUMBER_BLOCK_COMPRESSION_BYPASSED, + "rocksdb.number.block_compression_bypassed"}, + {NUMBER_BLOCK_COMPRESSION_REJECTED, + "rocksdb.number.block_compression_rejected"}, + {BYTES_DECOMPRESSED_FROM, "rocksdb.bytes.decompressed.from"}, + {BYTES_DECOMPRESSED_TO, "rocksdb.bytes.decompressed.to"}, +}; + +const std::vector> HistogramsNameMap = { + {DB_GET, "rocksdb.db.get.micros"}, + {DB_WRITE, "rocksdb.db.write.micros"}, + {COMPACTION_TIME, "rocksdb.compaction.times.micros"}, + {COMPACTION_CPU_TIME, "rocksdb.compaction.times.cpu_micros"}, + {SUBCOMPACTION_SETUP_TIME, "rocksdb.subcompaction.setup.times.micros"}, + {TABLE_SYNC_MICROS, "rocksdb.table.sync.micros"}, + {COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros"}, + {WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros"}, + {MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros"}, + {TABLE_OPEN_IO_MICROS, 
"rocksdb.table.open.io.micros"}, + {DB_MULTIGET, "rocksdb.db.multiget.micros"}, + {READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros"}, + {READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros"}, + {WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros"}, + {NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction"}, + {DB_SEEK, "rocksdb.db.seek.micros"}, + {WRITE_STALL, "rocksdb.db.write.stall"}, + {SST_READ_MICROS, "rocksdb.sst.read.micros"}, + {FILE_READ_FLUSH_MICROS, "rocksdb.file.read.flush.micros"}, + {FILE_READ_COMPACTION_MICROS, "rocksdb.file.read.compaction.micros"}, + {FILE_READ_DB_OPEN_MICROS, "rocksdb.file.read.db.open.micros"}, + {NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"}, + {BYTES_PER_READ, "rocksdb.bytes.per.read"}, + {BYTES_PER_WRITE, "rocksdb.bytes.per.write"}, + {BYTES_PER_MULTIGET, "rocksdb.bytes.per.multiget"}, + {BYTES_COMPRESSED, "rocksdb.bytes.compressed"}, + {BYTES_DECOMPRESSED, "rocksdb.bytes.decompressed"}, + {COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"}, + {DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"}, + {READ_NUM_MERGE_OPERANDS, "rocksdb.read.num.merge_operands"}, + {BLOB_DB_KEY_SIZE, "rocksdb.blobdb.key.size"}, + {BLOB_DB_VALUE_SIZE, "rocksdb.blobdb.value.size"}, + {BLOB_DB_WRITE_MICROS, "rocksdb.blobdb.write.micros"}, + {BLOB_DB_GET_MICROS, "rocksdb.blobdb.get.micros"}, + {BLOB_DB_MULTIGET_MICROS, "rocksdb.blobdb.multiget.micros"}, + {BLOB_DB_SEEK_MICROS, "rocksdb.blobdb.seek.micros"}, + {BLOB_DB_NEXT_MICROS, "rocksdb.blobdb.next.micros"}, + {BLOB_DB_PREV_MICROS, "rocksdb.blobdb.prev.micros"}, + {BLOB_DB_BLOB_FILE_WRITE_MICROS, "rocksdb.blobdb.blob.file.write.micros"}, + {BLOB_DB_BLOB_FILE_READ_MICROS, "rocksdb.blobdb.blob.file.read.micros"}, + {BLOB_DB_BLOB_FILE_SYNC_MICROS, "rocksdb.blobdb.blob.file.sync.micros"}, + {BLOB_DB_COMPRESSION_MICROS, "rocksdb.blobdb.compression.micros"}, + {BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"}, + {FLUSH_TIME, "rocksdb.db.flush.micros"}, + {SST_BATCH_SIZE, "rocksdb.sst.batch.size"}, + {NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + "rocksdb.num.index.and.filter.blocks.read.per.level"}, + {NUM_SST_READ_PER_LEVEL, "rocksdb.num.sst.read.per.level"}, + {ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + "rocksdb.error.handler.autoresume.retry.count"}, + {ASYNC_READ_BYTES, "rocksdb.async.read.bytes"}, + {POLL_WAIT_MICROS, "rocksdb.poll.wait.micros"}, + {PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"}, + {MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"}, + {NUM_LEVEL_READ_PER_MULTIGET, "rocksdb.num.level.read.per.multiget"}, + {ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"}, + {TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + "rocksdb.table.open.prefetch.tail.read.bytes"}, +}; + +std::shared_ptr CreateDBStatistics() { + return std::make_shared(nullptr); +} + +static int RegisterBuiltinStatistics(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + StatisticsImpl::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new StatisticsImpl(nullptr)); + return guard->get(); + }); + return 1; +} + +Status Statistics::CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* result) { + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinStatistics(*(ObjectLibrary::Default().get()), ""); + }); + Status s; + if (id == "" || id == 
StatisticsImpl::kClassName()) { + result->reset(new StatisticsImpl(nullptr)); + } else if (id == kNullptrString) { + result->reset(); + } else { + s = LoadSharedObject(config_options, id, result); + } + return s; +} + +static std::unordered_map stats_type_info = { + {"inner", OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kCompareNever)}, +}; + +StatisticsImpl::StatisticsImpl(std::shared_ptr stats) + : stats_(std::move(stats)) { + RegisterOptions("StatisticsOptions", &stats_, &stats_type_info); +} + +StatisticsImpl::~StatisticsImpl() {} + +uint64_t StatisticsImpl::getTickerCount(uint32_t tickerType) const { + MutexLock lock(&aggregate_lock_); + return getTickerCountLocked(tickerType); +} + +uint64_t StatisticsImpl::getTickerCountLocked(uint32_t tickerType) const { + assert(tickerType < TICKER_ENUM_MAX); + uint64_t res = 0; + for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { + res += per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType]; + } + return res; +} + +void StatisticsImpl::histogramData(uint32_t histogramType, + HistogramData* const data) const { + MutexLock lock(&aggregate_lock_); + getHistogramImplLocked(histogramType)->Data(data); +} + +std::unique_ptr StatisticsImpl::getHistogramImplLocked( + uint32_t histogramType) const { + assert(histogramType < HISTOGRAM_ENUM_MAX); + std::unique_ptr res_hist(new HistogramImpl()); + for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { + res_hist->Merge( + per_core_stats_.AccessAtCore(core_idx)->histograms_[histogramType]); + } + return res_hist; +} + +std::string StatisticsImpl::getHistogramString(uint32_t histogramType) const { + MutexLock lock(&aggregate_lock_); + return getHistogramImplLocked(histogramType)->ToString(); +} + +void StatisticsImpl::setTickerCount(uint32_t tickerType, uint64_t count) { + { + MutexLock lock(&aggregate_lock_); + setTickerCountLocked(tickerType, count); + } + if (stats_ && tickerType < TICKER_ENUM_MAX) { + stats_->setTickerCount(tickerType, count); + } +} + +void StatisticsImpl::setTickerCountLocked(uint32_t tickerType, uint64_t count) { + assert(tickerType < TICKER_ENUM_MAX); + for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { + if (core_idx == 0) { + per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType] = count; + } else { + per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType] = 0; + } + } +} + +uint64_t StatisticsImpl::getAndResetTickerCount(uint32_t tickerType) { + uint64_t sum = 0; + { + MutexLock lock(&aggregate_lock_); + assert(tickerType < TICKER_ENUM_MAX); + for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { + sum += + per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType].exchange( + 0, std::memory_order_relaxed); + } + } + if (stats_ && tickerType < TICKER_ENUM_MAX) { + stats_->setTickerCount(tickerType, 0); + } + return sum; +} + +void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) { + if (get_stats_level() <= StatsLevel::kExceptTickers) { + return; + } + if (tickerType < TICKER_ENUM_MAX) { + per_core_stats_.Access()->tickers_[tickerType].fetch_add( + count, std::memory_order_relaxed); + if (stats_) { + stats_->recordTick(tickerType, count); + } + } else { + assert(false); + } +} + +void StatisticsImpl::recordInHistogram(uint32_t histogramType, uint64_t value) { + assert(histogramType < HISTOGRAM_ENUM_MAX); + if (get_stats_level() <= StatsLevel::kExceptHistogramOrTimers) { + return; + } + 
per_core_stats_.Access()->histograms_[histogramType].Add(value); + if (stats_ && histogramType < HISTOGRAM_ENUM_MAX) { + stats_->recordInHistogram(histogramType, value); + } +} + +Status StatisticsImpl::Reset() { + MutexLock lock(&aggregate_lock_); + for (uint32_t i = 0; i < TICKER_ENUM_MAX; ++i) { + setTickerCountLocked(i, 0); + } + for (uint32_t i = 0; i < HISTOGRAM_ENUM_MAX; ++i) { + for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { + per_core_stats_.AccessAtCore(core_idx)->histograms_[i].Clear(); + } + } + return Status::OK(); +} + +namespace { + +// a buffer size used for temp string buffers +const int kTmpStrBufferSize = 200; + +} // namespace + +std::string StatisticsImpl::ToString() const { + MutexLock lock(&aggregate_lock_); + std::string res; + res.reserve(20000); + for (const auto& t : TickersNameMap) { + assert(t.first < TICKER_ENUM_MAX); + char buffer[kTmpStrBufferSize]; + snprintf(buffer, kTmpStrBufferSize, "%s COUNT : %" PRIu64 "\n", + t.second.c_str(), getTickerCountLocked(t.first)); + res.append(buffer); + } + for (const auto& h : HistogramsNameMap) { + assert(h.first < HISTOGRAM_ENUM_MAX); + char buffer[kTmpStrBufferSize]; + HistogramData hData; + getHistogramImplLocked(h.first)->Data(&hData); + // don't handle failures - buffer should always be big enough and arguments + // should be provided correctly + int ret = + snprintf(buffer, kTmpStrBufferSize, + "%s P50 : %f P95 : %f P99 : %f P100 : %f COUNT : %" PRIu64 + " SUM : %" PRIu64 "\n", + h.second.c_str(), hData.median, hData.percentile95, + hData.percentile99, hData.max, hData.count, hData.sum); + if (ret < 0 || ret >= kTmpStrBufferSize) { + assert(false); + continue; + } + res.append(buffer); + } + res.shrink_to_fit(); + return res; +} + +bool StatisticsImpl::getTickerMap( + std::map* stats_map) const { + assert(stats_map); + if (!stats_map) return false; + stats_map->clear(); + MutexLock lock(&aggregate_lock_); + for (const auto& t : TickersNameMap) { + assert(t.first < TICKER_ENUM_MAX); + (*stats_map)[t.second.c_str()] = getTickerCountLocked(t.first); + } + return true; +} + +bool StatisticsImpl::HistEnabledForType(uint32_t type) const { + return type < HISTOGRAM_ENUM_MAX; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/statistics_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/statistics_impl.h new file mode 100644 index 0000000000..e0dc29d284 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/statistics_impl.h @@ -0,0 +1,144 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
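The statistics_impl.cc code above follows a per-core sharding pattern: recordTick() bumps a core-local atomic with relaxed ordering, while reads such as getTickerCountLocked() take aggregate_lock_ and sum every core's shard. A minimal standalone sketch of the same idea, independent of RocksDB's CoreLocalArray (ShardedCounter, kShards, and the thread-id hash are illustrative assumptions, not part of this patch):

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <mutex>
#include <thread>

constexpr size_t kShards = 8;  // stands in for the core count CoreLocalArray detects

struct ShardedCounter {
  std::array<std::atomic<uint64_t>, kShards> shards{};
  std::mutex aggregate_lock;  // serializes cross-shard operations, like Reset()

  void Add(uint64_t n) {
    // Hash the calling thread onto a shard; CoreLocalArray uses the CPU id instead.
    size_t idx =
        std::hash<std::thread::id>{}(std::this_thread::get_id()) % kShards;
    shards[idx].fetch_add(n, std::memory_order_relaxed);
  }

  uint64_t Total() {
    std::lock_guard<std::mutex> lock(aggregate_lock);
    uint64_t sum = 0;
    for (auto& s : shards) sum += s.load(std::memory_order_relaxed);
    return sum;
  }
};

The point of the design is that the hot write path never takes a lock or touches shared cache lines; only the comparatively rare read/aggregate path pays for coherence.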
+// +#pragma once +#include <atomic> +#include <map> +#include <string> +#include <vector> + +#include "monitoring/histogram.h" +#include "port/likely.h" +#include "port/port.h" +#include "rocksdb/statistics.h" +#include "util/core_local.h" +#include "util/mutexlock.h" + +#ifdef __clang__ +#define ROCKSDB_FIELD_UNUSED __attribute__((__unused__)) +#else +#define ROCKSDB_FIELD_UNUSED +#endif // __clang__ + +#ifndef STRINGIFY +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) +#endif + +namespace ROCKSDB_NAMESPACE { + +enum TickersInternal : uint32_t { + INTERNAL_TICKER_ENUM_START = TICKER_ENUM_MAX, + INTERNAL_TICKER_ENUM_MAX +}; + +enum HistogramsInternal : uint32_t { + INTERNAL_HISTOGRAM_START = HISTOGRAM_ENUM_MAX, + INTERNAL_HISTOGRAM_ENUM_MAX +}; + +class StatisticsImpl : public Statistics { + public: + StatisticsImpl(std::shared_ptr<Statistics> stats); + virtual ~StatisticsImpl(); + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "BasicStatistics"; } + + virtual uint64_t getTickerCount(uint32_t ticker_type) const override; + virtual void histogramData(uint32_t histogram_type, + HistogramData* const data) const override; + std::string getHistogramString(uint32_t histogram_type) const override; + + virtual void setTickerCount(uint32_t ticker_type, uint64_t count) override; + virtual uint64_t getAndResetTickerCount(uint32_t ticker_type) override; + virtual void recordTick(uint32_t ticker_type, uint64_t count) override; + // The function is implemented for now for backward compatibility reasons. + // In case a user explicitly calls it, for example, they may have a wrapped + // Statistics object, passing the call to recordTick() into here, nothing + // will break. + void measureTime(uint32_t histogramType, uint64_t time) override { + recordInHistogram(histogramType, time); + } + virtual void recordInHistogram(uint32_t histogram_type, + uint64_t value) override; + + virtual Status Reset() override; + virtual std::string ToString() const override; + virtual bool getTickerMap(std::map<std::string, uint64_t>*) const override; + virtual bool HistEnabledForType(uint32_t type) const override; + + const Customizable* Inner() const override { return stats_.get(); } + + private: + // If non-nullptr, forwards updates to the object pointed to by `stats_`. + std::shared_ptr<Statistics> stats_; + // Synchronizes anything that operates across other cores' local data, + // such that operations like Reset() can be performed atomically. + mutable port::Mutex aggregate_lock_; + + // The ticker/histogram data are stored in this structure, which we will store + // per-core. It is cache-aligned, so tickers/histograms belonging to different + // cores can never share the same cache line.
+ // + // Alignment attributes expand to nothing depending on the platform + struct ALIGN_AS(CACHE_LINE_SIZE) StatisticsData { + std::atomic_uint_fast64_t tickers_[INTERNAL_TICKER_ENUM_MAX] = {{0}}; + HistogramImpl histograms_[INTERNAL_HISTOGRAM_ENUM_MAX]; +#ifndef HAVE_ALIGNED_NEW + char + padding[(CACHE_LINE_SIZE - + (INTERNAL_TICKER_ENUM_MAX * sizeof(std::atomic_uint_fast64_t) + + INTERNAL_HISTOGRAM_ENUM_MAX * sizeof(HistogramImpl)) % + CACHE_LINE_SIZE)] ROCKSDB_FIELD_UNUSED; +#endif + void* operator new(size_t s) { return port::cacheline_aligned_alloc(s); } + void* operator new[](size_t s) { return port::cacheline_aligned_alloc(s); } + void operator delete(void* p) { port::cacheline_aligned_free(p); } + void operator delete[](void* p) { port::cacheline_aligned_free(p); } + }; + +#ifndef TEST_CACHE_LINE_SIZE + static_assert(sizeof(StatisticsData) % CACHE_LINE_SIZE == 0, + "Expected " TOSTRING(CACHE_LINE_SIZE) "-byte aligned"); +#endif + + CoreLocalArray per_core_stats_; + + uint64_t getTickerCountLocked(uint32_t ticker_type) const; + std::unique_ptr getHistogramImplLocked( + uint32_t histogram_type) const; + void setTickerCountLocked(uint32_t ticker_type, uint64_t count); +}; + +// Utility functions +inline void RecordInHistogram(Statistics* statistics, uint32_t histogram_type, + uint64_t value) { + if (statistics) { + statistics->recordInHistogram(histogram_type, value); + } +} + +inline void RecordTimeToHistogram(Statistics* statistics, + uint32_t histogram_type, uint64_t value) { + if (statistics) { + statistics->reportTimeToHistogram(histogram_type, value); + } +} + +inline void RecordTick(Statistics* statistics, uint32_t ticker_type, + uint64_t count = 1) { + if (statistics) { + statistics->recordTick(ticker_type, count); + } +} + +inline void SetTickerCount(Statistics* statistics, uint32_t ticker_type, + uint64_t count) { + if (statistics) { + statistics->setTickerCount(ticker_type, count); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_impl.cc new file mode 100644 index 0000000000..9619dfd81e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_impl.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
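The StatisticsData struct above is the other half of that design: ALIGN_AS(CACHE_LINE_SIZE) plus explicit padding guarantees that two cores' shards never share a cache line, so relaxed increments never cause false sharing. The same guard in plain standard C++ (kCacheLine = 64 is an assumption here; RocksDB derives CACHE_LINE_SIZE per platform):

#include <atomic>
#include <cstddef>
#include <cstdint>

constexpr std::size_t kCacheLine = 64;  // assumed line size; see also
                                        // std::hardware_destructive_interference_size

struct alignas(kCacheLine) PaddedCounter {
  std::atomic<uint64_t> value{0};
  // alignas on the struct rounds sizeof(PaddedCounter) up to a multiple of
  // kCacheLine, so adjacent elements of `PaddedCounter counters[N];` always
  // land on distinct cache lines.
};

static_assert(sizeof(PaddedCounter) % kCacheLine == 0,
              "each counter must occupy whole cache lines");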
+// + +#include + +#include "rocksdb/env.h" +#include "rocksdb/thread_status.h" +#include "util/string_util.h" +#include "util/thread_operation.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef ROCKSDB_USING_THREAD_STATUS +std::string ThreadStatus::GetThreadTypeName( + ThreadStatus::ThreadType thread_type) { + switch (thread_type) { + case ThreadStatus::ThreadType::HIGH_PRIORITY: + return "High Pri"; + case ThreadStatus::ThreadType::LOW_PRIORITY: + return "Low Pri"; + case ThreadStatus::ThreadType::USER: + return "User"; + case ThreadStatus::ThreadType::BOTTOM_PRIORITY: + return "Bottom Pri"; + case ThreadStatus::ThreadType::NUM_THREAD_TYPES: + assert(false); + } + return "Unknown"; +} + +const std::string& ThreadStatus::GetOperationName( + ThreadStatus::OperationType op_type) { + if (op_type < 0 || op_type >= NUM_OP_TYPES) { + return global_operation_table[OP_UNKNOWN].name; + } + return global_operation_table[op_type].name; +} + +const std::string& ThreadStatus::GetOperationStageName( + ThreadStatus::OperationStage stage) { + if (stage < 0 || stage >= NUM_OP_STAGES) { + return global_op_stage_table[STAGE_UNKNOWN].name; + } + return global_op_stage_table[stage].name; +} + +const std::string& ThreadStatus::GetStateName( + ThreadStatus::StateType state_type) { + if (state_type < 0 || state_type >= NUM_STATE_TYPES) { + return global_state_table[STATE_UNKNOWN].name; + } + return global_state_table[state_type].name; +} + +const std::string ThreadStatus::MicrosToString(uint64_t micros) { + if (micros == 0) { + return ""; + } + const int kBufferLen = 100; + char buffer[kBufferLen]; + AppendHumanMicros(micros, buffer, kBufferLen, false); + return std::string(buffer); +} + +const std::string& ThreadStatus::GetOperationPropertyName( + ThreadStatus::OperationType op_type, int i) { + static const std::string empty_str = ""; + switch (op_type) { + case ThreadStatus::OP_COMPACTION: + if (i >= NUM_COMPACTION_PROPERTIES) { + return empty_str; + } + return compaction_operation_properties[i].name; + case ThreadStatus::OP_FLUSH: + if (i >= NUM_FLUSH_PROPERTIES) { + return empty_str; + } + return flush_operation_properties[i].name; + default: + return empty_str; + } +} + +std::map ThreadStatus::InterpretOperationProperties( + ThreadStatus::OperationType op_type, const uint64_t* op_properties) { + int num_properties; + switch (op_type) { + case OP_COMPACTION: + num_properties = NUM_COMPACTION_PROPERTIES; + break; + case OP_FLUSH: + num_properties = NUM_FLUSH_PROPERTIES; + break; + default: + num_properties = 0; + } + + std::map property_map; + for (int i = 0; i < num_properties; ++i) { + if (op_type == OP_COMPACTION && i == COMPACTION_INPUT_OUTPUT_LEVEL) { + property_map.insert({"BaseInputLevel", op_properties[i] >> 32}); + property_map.insert( + {"OutputLevel", op_properties[i] % (uint64_t(1) << 32U)}); + } else if (op_type == OP_COMPACTION && i == COMPACTION_PROP_FLAGS) { + property_map.insert({"IsManual", ((op_properties[i] & 2) >> 1)}); + property_map.insert({"IsDeletion", ((op_properties[i] & 4) >> 2)}); + property_map.insert({"IsTrivialMove", ((op_properties[i] & 8) >> 3)}); + } else { + property_map.insert( + {GetOperationPropertyName(op_type, i), op_properties[i]}); + } + } + return property_map; +} + +#else + +std::string ThreadStatus::GetThreadTypeName( + ThreadStatus::ThreadType /*thread_type*/) { + static std::string dummy_str = ""; + return dummy_str; +} + +const std::string& ThreadStatus::GetOperationName( + ThreadStatus::OperationType /*op_type*/) { + static std::string dummy_str = ""; + return 
dummy_str; +} + +const std::string& ThreadStatus::GetOperationStageName( + ThreadStatus::OperationStage /*stage*/) { + static std::string dummy_str = ""; + return dummy_str; +} + +const std::string& ThreadStatus::GetStateName( + ThreadStatus::StateType /*state_type*/) { + static std::string dummy_str = ""; + return dummy_str; +} + +const std::string ThreadStatus::MicrosToString(uint64_t /*op_elapsed_time*/) { + static std::string dummy_str = ""; + return dummy_str; +} + +const std::string& ThreadStatus::GetOperationPropertyName( + ThreadStatus::OperationType /*op_type*/, int /*i*/) { + static std::string dummy_str = ""; + return dummy_str; +} + +std::map ThreadStatus::InterpretOperationProperties( + ThreadStatus::OperationType /*op_type*/, + const uint64_t* /*op_properties*/) { + return std::map(); +} + +#endif // ROCKSDB_USING_THREAD_STATUS +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater.cc new file mode 100644 index 0000000000..37fcef62b0 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater.cc @@ -0,0 +1,328 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "monitoring/thread_status_updater.h" + +#include + +#include "port/likely.h" +#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef ROCKSDB_USING_THREAD_STATUS + +thread_local ThreadStatusData* ThreadStatusUpdater::thread_status_data_ = + nullptr; + +void ThreadStatusUpdater::RegisterThread(ThreadStatus::ThreadType ttype, + uint64_t thread_id) { + if (UNLIKELY(thread_status_data_ == nullptr)) { + thread_status_data_ = new ThreadStatusData(); + thread_status_data_->thread_type = ttype; + thread_status_data_->thread_id = thread_id; + std::lock_guard lck(thread_list_mutex_); + thread_data_set_.insert(thread_status_data_); + } + + ClearThreadOperationProperties(); +} + +void ThreadStatusUpdater::UnregisterThread() { + if (thread_status_data_ != nullptr) { + std::lock_guard lck(thread_list_mutex_); + thread_data_set_.erase(thread_status_data_); + delete thread_status_data_; + thread_status_data_ = nullptr; + } +} + +void ThreadStatusUpdater::ResetThreadStatus() { + ClearThreadState(); + ClearThreadOperation(); + SetColumnFamilyInfoKey(nullptr); +} + +void ThreadStatusUpdater::SetEnableTracking(bool enable_tracking) { + auto* data = Get(); + if (data == nullptr) { + return; + } + data->enable_tracking.store(enable_tracking, std::memory_order_relaxed); +} + +void ThreadStatusUpdater::SetColumnFamilyInfoKey(const void* cf_key) { + auto* data = Get(); + if (data == nullptr) { + return; + } + data->cf_key.store(const_cast(cf_key), std::memory_order_relaxed); +} + +const void* ThreadStatusUpdater::GetColumnFamilyInfoKey() { + auto* data = GetLocalThreadStatus(); + if (data == nullptr) { + return nullptr; + } + return data->cf_key.load(std::memory_order_relaxed); +} + +void ThreadStatusUpdater::SetThreadOperation( + const ThreadStatus::OperationType type) { + auto* data = GetLocalThreadStatus(); + if (data == nullptr) { + return; + } + // NOTE: Our practice here is to set all the thread operation properties + // and stage before we set thread 
operation, and thread operation + // will be set in std::memory_order_release. This is to ensure + // whenever a thread operation is not OP_UNKNOWN, we will always + // have a consistent information on its properties. + data->operation_type.store(type, std::memory_order_release); + if (type == ThreadStatus::OP_UNKNOWN) { + data->operation_stage.store(ThreadStatus::STAGE_UNKNOWN, + std::memory_order_relaxed); + ClearThreadOperationProperties(); + } +} + +ThreadStatus::OperationType ThreadStatusUpdater::GetThreadOperation() { + ThreadStatusData* data = GetLocalThreadStatus(); + if (data == nullptr) { + return ThreadStatus::OperationType::OP_UNKNOWN; + } + return data->operation_type.load(std::memory_order_relaxed); +} + +void ThreadStatusUpdater::SetThreadOperationProperty(int i, uint64_t value) { + auto* data = GetLocalThreadStatus(); + if (data == nullptr) { + return; + } + data->op_properties[i].store(value, std::memory_order_relaxed); +} + +void ThreadStatusUpdater::IncreaseThreadOperationProperty(int i, + uint64_t delta) { + auto* data = GetLocalThreadStatus(); + if (data == nullptr) { + return; + } + data->op_properties[i].fetch_add(delta, std::memory_order_relaxed); +} + +void ThreadStatusUpdater::SetOperationStartTime(const uint64_t start_time) { + auto* data = GetLocalThreadStatus(); + if (data == nullptr) { + return; + } + data->op_start_time.store(start_time, std::memory_order_relaxed); +} + +void ThreadStatusUpdater::ClearThreadOperation() { + auto* data = GetLocalThreadStatus(); + if (data == nullptr) { + return; + } + data->operation_stage.store(ThreadStatus::STAGE_UNKNOWN, + std::memory_order_relaxed); + data->operation_type.store(ThreadStatus::OP_UNKNOWN, + std::memory_order_relaxed); + ClearThreadOperationProperties(); +} + +void ThreadStatusUpdater::ClearThreadOperationProperties() { + auto* data = GetLocalThreadStatus(); + if (data == nullptr) { + return; + } + for (int i = 0; i < ThreadStatus::kNumOperationProperties; ++i) { + data->op_properties[i].store(0, std::memory_order_relaxed); + } +} + +ThreadStatus::OperationStage ThreadStatusUpdater::SetThreadOperationStage( + ThreadStatus::OperationStage stage) { + auto* data = GetLocalThreadStatus(); + if (data == nullptr) { + return ThreadStatus::STAGE_UNKNOWN; + } + return data->operation_stage.exchange(stage, std::memory_order_relaxed); +} + +void ThreadStatusUpdater::SetThreadState(const ThreadStatus::StateType type) { + auto* data = GetLocalThreadStatus(); + if (data == nullptr) { + return; + } + data->state_type.store(type, std::memory_order_relaxed); +} + +void ThreadStatusUpdater::ClearThreadState() { + auto* data = GetLocalThreadStatus(); + if (data == nullptr) { + return; + } + data->state_type.store(ThreadStatus::STATE_UNKNOWN, + std::memory_order_relaxed); +} + +Status ThreadStatusUpdater::GetThreadList( + std::vector* thread_list) { + thread_list->clear(); + std::vector> valid_list; + uint64_t now_micros = SystemClock::Default()->NowMicros(); + + std::lock_guard lck(thread_list_mutex_); + for (auto* thread_data : thread_data_set_) { + assert(thread_data); + auto thread_id = thread_data->thread_id.load(std::memory_order_relaxed); + auto thread_type = thread_data->thread_type.load(std::memory_order_relaxed); + // Since any change to cf_info_map requires thread_list_mutex, + // which is currently held by GetThreadList(), here we can safely + // use "memory_order_relaxed" to load the cf_key. 
+ auto cf_key = thread_data->cf_key.load(std::memory_order_relaxed); + + ThreadStatus::OperationType op_type = ThreadStatus::OP_UNKNOWN; + ThreadStatus::OperationStage op_stage = ThreadStatus::STAGE_UNKNOWN; + ThreadStatus::StateType state_type = ThreadStatus::STATE_UNKNOWN; + uint64_t op_elapsed_micros = 0; + uint64_t op_props[ThreadStatus::kNumOperationProperties] = {0}; + + auto iter = cf_info_map_.find(cf_key); + if (iter != cf_info_map_.end()) { + op_type = thread_data->operation_type.load(std::memory_order_acquire); + // display lower-level info only when higher-level info is available. + if (op_type != ThreadStatus::OP_UNKNOWN) { + op_elapsed_micros = now_micros - thread_data->op_start_time.load( + std::memory_order_relaxed); + op_stage = thread_data->operation_stage.load(std::memory_order_relaxed); + state_type = thread_data->state_type.load(std::memory_order_relaxed); + for (int i = 0; i < ThreadStatus::kNumOperationProperties; ++i) { + op_props[i] = + thread_data->op_properties[i].load(std::memory_order_relaxed); + } + } + } + + thread_list->emplace_back( + thread_id, thread_type, + iter != cf_info_map_.end() ? iter->second.db_name : "", + iter != cf_info_map_.end() ? iter->second.cf_name : "", op_type, + op_elapsed_micros, op_stage, op_props, state_type); + } + + return Status::OK(); +} + +ThreadStatusData* ThreadStatusUpdater::GetLocalThreadStatus() { + if (thread_status_data_ == nullptr) { + return nullptr; + } + if (!thread_status_data_->enable_tracking.load(std::memory_order_relaxed)) { + return nullptr; + } + return thread_status_data_; +} + +void ThreadStatusUpdater::NewColumnFamilyInfo(const void* db_key, + const std::string& db_name, + const void* cf_key, + const std::string& cf_name) { + // Acquiring same lock as GetThreadList() to guarantee + // a consistent view of global column family table (cf_info_map). + std::lock_guard lck(thread_list_mutex_); + + cf_info_map_.emplace(std::piecewise_construct, std::make_tuple(cf_key), + std::make_tuple(db_key, db_name, cf_name)); + db_key_map_[db_key].insert(cf_key); +} + +void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* cf_key) { + // Acquiring same lock as GetThreadList() to guarantee + // a consistent view of global column family table (cf_info_map). + std::lock_guard lck(thread_list_mutex_); + + auto cf_pair = cf_info_map_.find(cf_key); + if (cf_pair != cf_info_map_.end()) { + // Remove its entry from db_key_map_ by the following steps: + // 1. Obtain the entry in db_key_map_ whose set contains cf_key + // 2. Remove it from the set. + ConstantColumnFamilyInfo& cf_info = cf_pair->second; + auto db_pair = db_key_map_.find(cf_info.db_key); + assert(db_pair != db_key_map_.end()); + size_t result __attribute__((__unused__)); + result = db_pair->second.erase(cf_key); + assert(result); + cf_info_map_.erase(cf_pair); + } +} + +void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) { + // Acquiring same lock as GetThreadList() to guarantee + // a consistent view of global column family table (cf_info_map). + std::lock_guard lck(thread_list_mutex_); + auto db_pair = db_key_map_.find(db_key); + if (UNLIKELY(db_pair == db_key_map_.end())) { + // In some occasional cases such as DB::Open fails, we won't + // register ColumnFamilyInfo for a db. 
+ return; + } + + for (auto cf_key : db_pair->second) { + auto cf_pair = cf_info_map_.find(cf_key); + if (cf_pair != cf_info_map_.end()) { + cf_info_map_.erase(cf_pair); + } + } + db_key_map_.erase(db_key); +} + +#else + +void ThreadStatusUpdater::RegisterThread(ThreadStatus::ThreadType /*ttype*/, + uint64_t /*thread_id*/) {} + +void ThreadStatusUpdater::UnregisterThread() {} + +void ThreadStatusUpdater::ResetThreadStatus() {} + +void ThreadStatusUpdater::SetColumnFamilyInfoKey(const void* /*cf_key*/) {} + +void ThreadStatusUpdater::SetThreadOperation( + const ThreadStatus::OperationType /*type*/) {} + +void ThreadStatusUpdater::ClearThreadOperation() {} + +void ThreadStatusUpdater::SetThreadState( + const ThreadStatus::StateType /*type*/) {} + +void ThreadStatusUpdater::ClearThreadState() {} + +Status ThreadStatusUpdater::GetThreadList( + std::vector<ThreadStatus>* /*thread_list*/) { + return Status::NotSupported( + "GetThreadList is not supported in the current running environment."); +} + +void ThreadStatusUpdater::NewColumnFamilyInfo(const void* /*db_key*/, + const std::string& /*db_name*/, + const void* /*cf_key*/, + const std::string& /*cf_name*/) {} + +void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* /*cf_key*/) {} + +void ThreadStatusUpdater::EraseDatabaseInfo(const void* /*db_key*/) {} + +void ThreadStatusUpdater::SetThreadOperationProperty(int /*i*/, + uint64_t /*value*/) {} + +void ThreadStatusUpdater::IncreaseThreadOperationProperty(int /*i*/, + uint64_t /*delta*/) {} + +#endif // ROCKSDB_USING_THREAD_STATUS +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater.h new file mode 100644 index 0000000000..696063cb46 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater.h @@ -0,0 +1,226 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// The implementation of ThreadStatus. +// +// Note that we make get and set access to ThreadStatusData lockless. +// As a result, ThreadStatusData as a whole is not atomic. However, +// we guarantee consistent ThreadStatusData all the time whenever a +// user calls GetThreadList(). This consistency guarantee is achieved +// by having the following constraints in the internal implementation +// of set and get order: +// +// 1. When resetting any information in ThreadStatusData, always start by +// clearing up the lower-level information first. +// 2. When setting any information in ThreadStatusData, always start by +// setting the higher-level information. +// 3. When returning ThreadStatusData to the user, fields are fetched from +// higher-level to lower-level. In addition, when there is a nullptr +// in one field, all fields at a lower level than that field +// should be ignored. +// +// The high to low level information would be: +// thread_id > thread_type > db > cf > operation > state +// +// This means a user might not always get full information, but whatever +// is returned by GetThreadList() is guaranteed to be consistent.
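The ordering contract spelled out in the comment above is the classic release/acquire publication pattern: lower-level fields are stored first with relaxed ordering, the higher-level field is stored last with memory_order_release, and a reader that observes it via memory_order_acquire is guaranteed to see all the earlier stores. A self-contained sketch of the pattern (the two globals and the value 42 are illustrative only, not from this patch):

#include <atomic>
#include <cstdint>

std::atomic<uint64_t> op_property{0};
std::atomic<int> op_type{0};  // 0 plays the role of OP_UNKNOWN

void Writer() {
  op_property.store(42, std::memory_order_relaxed);  // lower-level info first
  op_type.store(1, std::memory_order_release);       // publish higher-level info
}

uint64_t Reader() {
  // If the published type is visible, this acquire load synchronizes-with the
  // release store above, so the relaxed property load reads the matching value.
  if (op_type.load(std::memory_order_acquire) != 0) {
    return op_property.load(std::memory_order_relaxed);
  }
  return 0;  // higher level unknown: ignore everything below it
}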
+#pragma once +#include <atomic> +#include <list> +#include <memory> +#include <mutex> +#include <string> +#include <unordered_map> +#include <unordered_set> +#include <vector> + +#include "port/port.h" +#include "rocksdb/status.h" +#include "rocksdb/thread_status.h" +#include "util/thread_operation.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyHandle; + +// The structure that keeps constant information about a column family. +struct ConstantColumnFamilyInfo { +#ifdef ROCKSDB_USING_THREAD_STATUS + public: + ConstantColumnFamilyInfo(const void* _db_key, const std::string& _db_name, + const std::string& _cf_name) + : db_key(_db_key), db_name(_db_name), cf_name(_cf_name) {} + const void* db_key; + const std::string db_name; + const std::string cf_name; +#endif // ROCKSDB_USING_THREAD_STATUS +}; + +// The internal data structure that is used to reflect the current +// status of a thread using a set of atomic pointers. +struct ThreadStatusData { +#ifdef ROCKSDB_USING_THREAD_STATUS + explicit ThreadStatusData() { + enable_tracking.store(false); + thread_id.store(0); + thread_type.store(ThreadStatus::USER); + cf_key.store(nullptr); + operation_type.store(ThreadStatus::OP_UNKNOWN); + op_start_time.store(0); + state_type.store(ThreadStatus::STATE_UNKNOWN); + } + + // A flag to indicate whether the thread tracking is enabled + // in the current thread. + // If set to false, then SetThreadOperation and SetThreadState + // will be no-op. + std::atomic<bool> enable_tracking; + + std::atomic<uint64_t> thread_id; + std::atomic<ThreadStatus::ThreadType> thread_type; + std::atomic<void*> cf_key; + std::atomic<ThreadStatus::OperationType> operation_type; + std::atomic<uint64_t> op_start_time; + std::atomic<ThreadStatus::OperationStage> operation_stage; + std::atomic<uint64_t> op_properties[ThreadStatus::kNumOperationProperties]; + std::atomic<ThreadStatus::StateType> state_type; +#endif // ROCKSDB_USING_THREAD_STATUS +}; + +// The class that stores and updates the status of the current thread +// using a thread-local ThreadStatusData. +// +// In most cases, you should use ThreadStatusUtil to update +// the status of the current thread instead of using ThreadStatusUpdater +// directly. +// +// @see ThreadStatusUtil +class ThreadStatusUpdater { + public: + ThreadStatusUpdater() {} + + // Releases all ThreadStatusData of all active threads. + virtual ~ThreadStatusUpdater() {} + + // Unregister the current thread. + void UnregisterThread(); + + // Reset the status of the current thread. This includes resetting + // ColumnFamilyInfoKey, ThreadOperation, and ThreadState. + void ResetThreadStatus(); + + // Set the id of the current thread. + void SetThreadID(uint64_t thread_id); + + // Register the current thread for tracking. + void RegisterThread(ThreadStatus::ThreadType ttype, uint64_t thread_id); + + void SetEnableTracking(bool enable_tracking); + + // Update the column-family info of the current thread by setting + // its thread-local pointer of ThreadStatusData to the correct entry. + void SetColumnFamilyInfoKey(const void* cf_key); + + // Returns the column family info key. + const void* GetColumnFamilyInfoKey(); + + // Update the thread operation of the current thread. + void SetThreadOperation(const ThreadStatus::OperationType type); + + // Return the thread operation of the current thread. + ThreadStatus::OperationType GetThreadOperation(); + + // The start time of the current thread operation. It is in the format + // of micro-seconds since some fixed point in time. + void SetOperationStartTime(const uint64_t start_time); + + // Set the "i"th property of the current operation.
+ // + // NOTE: Our practice here is to set all the thread operation properties + // and stage before we set thread operation, and thread operation + // will be set in std::memory_order_release. This is to ensure + // whenever a thread operation is not OP_UNKNOWN, we will always + // have consistent information on its properties. + void SetThreadOperationProperty(int i, uint64_t value); + + // Increase the "i"th property of the current operation with + // the specified delta. + void IncreaseThreadOperationProperty(int i, uint64_t delta); + + // Update the thread operation stage of the current thread. + ThreadStatus::OperationStage SetThreadOperationStage( + const ThreadStatus::OperationStage stage); + + // Clear thread operation of the current thread. + void ClearThreadOperation(); + + // Reset all thread-operation-properties to 0. + void ClearThreadOperationProperties(); + + // Update the thread state of the current thread. + void SetThreadState(const ThreadStatus::StateType type); + + // Clear the thread state of the current thread. + void ClearThreadState(); + + // Obtain the status of all active registered threads. + Status GetThreadList(std::vector<ThreadStatus>* thread_list); + + // Create an entry in the global ColumnFamilyInfo table for the + // specified column family. This function should be called only + // when the current thread does not hold db_mutex. + void NewColumnFamilyInfo(const void* db_key, const std::string& db_name, + const void* cf_key, const std::string& cf_name); + + // Erase all ConstantColumnFamilyInfo that is associated with the + // specified db instance. This function should be called only when + // the current thread does not hold db_mutex. + void EraseDatabaseInfo(const void* db_key); + + // Erase the ConstantColumnFamilyInfo that is associated with the + // specified ColumnFamilyData. This function should be called only + // when the current thread does not hold db_mutex. + void EraseColumnFamilyInfo(const void* cf_key); + + // Verifies whether the input ColumnFamilyHandles match + // the information stored in the current cf_info_map. + void TEST_VerifyColumnFamilyInfoMap( + const std::vector<ColumnFamilyHandle*>& handles, bool check_exist); + + protected: +#ifdef ROCKSDB_USING_THREAD_STATUS + // The thread-local variable for storing thread status. + static thread_local ThreadStatusData* thread_status_data_; + + // Returns the pointer to the thread status data only when the + // thread status data is non-null and has enable_tracking == true. + ThreadStatusData* GetLocalThreadStatus(); + + // Directly returns the pointer to thread_status_data_ without + // checking whether enable_tracking is true or not. + ThreadStatusData* Get() { return thread_status_data_; } + + // The mutex that protects cf_info_map and db_key_map. + std::mutex thread_list_mutex_; + + // The current status data of all active threads. + std::unordered_set<ThreadStatusData*> thread_data_set_; + + // A global map that keeps the column family information. It is stored + // globally instead of inside DB to avoid the situation where DB is + // closing while the GetThreadList function has already gotten the pointer + // to its ConstantColumnFamilyInfo. + std::unordered_map<const void*, ConstantColumnFamilyInfo> cf_info_map_; + + // A db_key to cf_key map that allows erasing elements in cf_info_map + // associated to the same db_key faster.
+ std::unordered_map<const void*, std::unordered_set<const void*>> db_key_map_; + +#else + static ThreadStatusData* thread_status_data_; +#endif // ROCKSDB_USING_THREAD_STATUS +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater_debug.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater_debug.cc new file mode 100644 index 0000000000..464c23bbaa --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_updater_debug.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <mutex> + +#include "db/column_family.h" +#include "monitoring/thread_status_updater.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef NDEBUG +#ifdef ROCKSDB_USING_THREAD_STATUS +void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap( + const std::vector<ColumnFamilyHandle*>& handles, bool check_exist) { + std::unique_lock<std::mutex> lock(thread_list_mutex_); + if (check_exist) { + assert(cf_info_map_.size() == handles.size()); + } + for (auto* handle : handles) { + auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(handle)->cfd(); + auto iter __attribute__((__unused__)) = cf_info_map_.find(cfd); + if (check_exist) { + assert(iter != cf_info_map_.end()); + assert(iter->second.cf_name == cfd->GetName()); + } else { + assert(iter == cf_info_map_.end()); + } + } +} + +#else + +void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap( + const std::vector<ColumnFamilyHandle*>& /*handles*/, bool /*check_exist*/) { +} + +#endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NDEBUG + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util.cc new file mode 100644 index 0000000000..9b66dc28e8 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util.cc @@ -0,0 +1,208 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
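EraseDatabaseInfo() above stays cheap because db_key_map_ acts as a reverse index: each db key maps to the set of its cf keys, so tearing down one database touches only its own entries instead of scanning all of cf_info_map_. The two-map pattern in isolation (the map names and std::string payload here are illustrative, not from this patch):

#include <string>
#include <unordered_map>
#include <unordered_set>

std::unordered_map<const void*, std::string> cf_info;  // cf_key -> info
std::unordered_map<const void*, std::unordered_set<const void*>>
    by_db;  // db_key -> its cf_keys

void EraseDatabase(const void* db_key) {
  auto it = by_db.find(db_key);
  if (it == by_db.end()) return;  // e.g. DB::Open failed before registration
  for (const void* cf_key : it->second) {
    cf_info.erase(cf_key);  // O(#cfs of this db), not O(all cfs)
  }
  by_db.erase(it);
}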
+ +#include "monitoring/thread_status_util.h" + +#include "monitoring/thread_status_updater.h" +#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef ROCKSDB_USING_THREAD_STATUS +thread_local ThreadStatusUpdater* + ThreadStatusUtil::thread_updater_local_cache_ = nullptr; +thread_local bool ThreadStatusUtil::thread_updater_initialized_ = false; + +void ThreadStatusUtil::RegisterThread(const Env* env, + ThreadStatus::ThreadType thread_type) { + if (!MaybeInitThreadLocalUpdater(env)) { + return; + } + assert(thread_updater_local_cache_); + thread_updater_local_cache_->RegisterThread(thread_type, env->GetThreadID()); +} + +void ThreadStatusUtil::UnregisterThread() { + thread_updater_initialized_ = false; + if (thread_updater_local_cache_ != nullptr) { + thread_updater_local_cache_->UnregisterThread(); + thread_updater_local_cache_ = nullptr; + } +} + +void ThreadStatusUtil::SetEnableTracking(bool enable_tracking) { + if (thread_updater_local_cache_ == nullptr) { + return; + } + thread_updater_local_cache_->SetEnableTracking(enable_tracking); +} + +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) { + if (thread_updater_local_cache_ == nullptr) { + return; + } + assert(cfd); + thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd); +} + +void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { + if (thread_updater_local_cache_ == nullptr) { + return; + } + + if (op != ThreadStatus::OP_UNKNOWN) { + uint64_t current_time = SystemClock::Default()->NowMicros(); + thread_updater_local_cache_->SetOperationStartTime(current_time); + } else { + // TDOO(yhchiang): we could report the time when we set operation to + // OP_UNKNOWN once the whole instrumentation has been done. + thread_updater_local_cache_->SetOperationStartTime(0); + } + thread_updater_local_cache_->SetThreadOperation(op); +} + +ThreadStatus::OperationType ThreadStatusUtil::GetThreadOperation() { + if (thread_updater_local_cache_ == nullptr) { + return ThreadStatus::OperationType::OP_UNKNOWN; + } + return thread_updater_local_cache_->GetThreadOperation(); +} + +ThreadStatus::OperationStage ThreadStatusUtil::SetThreadOperationStage( + ThreadStatus::OperationStage stage) { + if (thread_updater_local_cache_ == nullptr) { + // thread_updater_local_cache_ must be set in SetColumnFamily + // or other ThreadStatusUtil functions. + return ThreadStatus::STAGE_UNKNOWN; + } + + return thread_updater_local_cache_->SetThreadOperationStage(stage); +} + +void ThreadStatusUtil::SetThreadOperationProperty(int code, uint64_t value) { + if (thread_updater_local_cache_ == nullptr) { + // thread_updater_local_cache_ must be set in SetColumnFamily + // or other ThreadStatusUtil functions. + return; + } + + thread_updater_local_cache_->SetThreadOperationProperty(code, value); +} + +void ThreadStatusUtil::IncreaseThreadOperationProperty(int code, + uint64_t delta) { + if (thread_updater_local_cache_ == nullptr) { + // thread_updater_local_cache_ must be set in SetColumnFamily + // or other ThreadStatusUtil functions. + return; + } + + thread_updater_local_cache_->IncreaseThreadOperationProperty(code, delta); +} + +void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType state) { + if (thread_updater_local_cache_ == nullptr) { + // thread_updater_local_cache_ must be set in SetColumnFamily + // or other ThreadStatusUtil functions. 
+ return; + } + + thread_updater_local_cache_->SetThreadState(state); +} + +void ThreadStatusUtil::ResetThreadStatus() { + if (thread_updater_local_cache_ == nullptr) { + return; + } + thread_updater_local_cache_->ResetThreadStatus(); +} + +void ThreadStatusUtil::NewColumnFamilyInfo(const DB* db, + const ColumnFamilyData* cfd, + const std::string& cf_name, + const Env* env) { + if (!MaybeInitThreadLocalUpdater(env)) { + return; + } + assert(thread_updater_local_cache_); + if (thread_updater_local_cache_) { + thread_updater_local_cache_->NewColumnFamilyInfo(db, db->GetName(), cfd, + cf_name); + } +} + +void ThreadStatusUtil::EraseColumnFamilyInfo(const ColumnFamilyData* cfd) { + if (thread_updater_local_cache_ == nullptr) { + return; + } + thread_updater_local_cache_->EraseColumnFamilyInfo(cfd); +} + +void ThreadStatusUtil::EraseDatabaseInfo(const DB* db) { + ThreadStatusUpdater* thread_updater = db->GetEnv()->GetThreadStatusUpdater(); + if (thread_updater == nullptr) { + return; + } + thread_updater->EraseDatabaseInfo(db); +} + +bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* env) { + if (!thread_updater_initialized_ && env != nullptr) { + thread_updater_initialized_ = true; + thread_updater_local_cache_ = env->GetThreadStatusUpdater(); + } + return (thread_updater_local_cache_ != nullptr); +} + +AutoThreadOperationStageUpdater::AutoThreadOperationStageUpdater( + ThreadStatus::OperationStage stage) { + prev_stage_ = ThreadStatusUtil::SetThreadOperationStage(stage); +} + +AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() { + ThreadStatusUtil::SetThreadOperationStage(prev_stage_); +} + +#else + +ThreadStatusUpdater* ThreadStatusUtil::thread_updater_local_cache_ = nullptr; +bool ThreadStatusUtil::thread_updater_initialized_ = false; + +bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* /*env*/) { + return false; +} + +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* /*cfd*/) {} + +void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType /*op*/) {} + +void ThreadStatusUtil::SetThreadOperationProperty(int /*code*/, + uint64_t /*value*/) {} + +void ThreadStatusUtil::IncreaseThreadOperationProperty(int /*code*/, + uint64_t /*delta*/) {} + +void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType /*state*/) {} + +void ThreadStatusUtil::NewColumnFamilyInfo(const DB* /*db*/, + const ColumnFamilyData* /*cfd*/, + const std::string& /*cf_name*/, + const Env* env) {} + +void ThreadStatusUtil::EraseColumnFamilyInfo(const ColumnFamilyData* /*cfd*/) {} + +void ThreadStatusUtil::EraseDatabaseInfo(const DB* /*db*/) {} + +void ThreadStatusUtil::ResetThreadStatus() {} + +AutoThreadOperationStageUpdater::AutoThreadOperationStageUpdater( + ThreadStatus::OperationStage /*stage*/) {} + +AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() {} + +#endif // ROCKSDB_USING_THREAD_STATUS + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util.h new file mode 100644 index 0000000000..df148a0395 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util.h @@ -0,0 +1,139 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
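AutoThreadOperationStageUpdater, defined at the end of thread_status_util.cc above, is a scope guard: the constructor swaps in a new stage and the destructor restores the previous one, so every early return inside a flush or compaction step still unwinds the stage correctly. Its general shape, reduced to a sketch (ScopedStage and kStagePickMemtables are illustrative names, not from this patch):

#include <utility>

// A generic swap-in/swap-out scope guard in the spirit of
// AutoThreadOperationStageUpdater; Stage is any copyable state type.
template <typename Stage>
class ScopedStage {
 public:
  ScopedStage(Stage& slot, Stage next)
      : slot_(slot), prev_(std::exchange(slot, next)) {}
  ~ScopedStage() { slot_ = prev_; }  // restore on every exit path
 private:
  Stage& slot_;
  Stage prev_;
};

// Usage: { ScopedStage<int> s(current_stage, kStagePickMemtables); ... }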
+ +#pragma once + +#include <string> + +#include "monitoring/thread_status_updater.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/thread_status.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyData; + +// The static utility class for updating thread-local status. +// +// The thread-local status is updated via the thread-local cached +// pointer thread_updater_local_cache_. During each function call, +// when ThreadStatusUtil finds thread_updater_local_cache_ is +// left uninitialized (determined by thread_updater_initialized_), +// it will try to initialize it using the return value of +// Env::GetThreadStatusUpdater(). When thread_updater_local_cache_ +// is initialized with a non-null pointer, each function call will +// then update the status of the current thread. Otherwise, +// all function calls to ThreadStatusUtil will be no-op. +class ThreadStatusUtil { + public: + // Register the current thread for tracking. + static void RegisterThread(const Env* env, + ThreadStatus::ThreadType thread_type); + + // Unregister the current thread. + static void UnregisterThread(); + + // Create an entry in the global ColumnFamilyInfo table for the + // specified column family. This function should be called only + // when the current thread does not hold db_mutex. + static void NewColumnFamilyInfo(const DB* db, const ColumnFamilyData* cfd, + const std::string& cf_name, const Env* env); + + // Erase the ConstantColumnFamilyInfo that is associated with the + // specified ColumnFamilyData. This function should be called only + // when the current thread does not hold db_mutex. + static void EraseColumnFamilyInfo(const ColumnFamilyData* cfd); + + // Erase all ConstantColumnFamilyInfo that is associated with the + // specified db instance. This function should be called only when + // the current thread does not hold db_mutex. + static void EraseDatabaseInfo(const DB* db); + + static void SetEnableTracking(bool enable_tracking); + + // Update the thread status to indicate the current thread is doing + // something related to the specified column family. + // + // REQUIRES: cfd != nullptr + static void SetColumnFamily(const ColumnFamilyData* cfd); + + static void SetThreadOperation(ThreadStatus::OperationType type); + + static ThreadStatus::OperationType GetThreadOperation(); + + static ThreadStatus::OperationStage SetThreadOperationStage( + ThreadStatus::OperationStage stage); + + static void SetThreadOperationProperty(int code, uint64_t value); + + static void IncreaseThreadOperationProperty(int code, uint64_t delta); + + static void SetThreadState(ThreadStatus::StateType type); + + static void ResetThreadStatus(); + +#ifndef NDEBUG + static void TEST_SetStateDelay(const ThreadStatus::StateType state, + int micro); + static void TEST_StateDelay(const ThreadStatus::StateType state); + + static Env::IOActivity TEST_GetExpectedIOActivity( + ThreadStatus::OperationType thread_op); +#endif + + protected: + // Initialize the thread-local ThreadStatusUpdater when it finds + // the cached value is nullptr. Returns true if it has cached + // a non-null pointer. + static bool MaybeInitThreadLocalUpdater(const Env* env); + +#ifdef ROCKSDB_USING_THREAD_STATUS + // A boolean flag indicating whether thread_updater_local_cache_ + // is initialized. It is set to true when an Env uses any + // ThreadStatusUtil functions using the current thread other + // than UnregisterThread(). It will be set to false when + // UnregisterThread() is called.
+ // + // When this variable is set to true, thread_updater_local_cache_ + // will not be updated until this variable is again set to false + // in UnregisterThread(). + static thread_local bool thread_updater_initialized_; + + // The thread-local cached ThreadStatusUpdater that caches the + // thread_status_updater_ of the first Env that uses any ThreadStatusUtil + // function other than UnregisterThread(). This variable will + // be cleared when UnregisterThread() is called. + // + // When this variable is set to a non-null pointer, then the status + // of the current thread will be updated when a function of + // ThreadStatusUtil is called. Otherwise, all functions of + // ThreadStatusUtil will be no-op. + // + // When thread_updater_initialized_ is set to true, this variable + // will not be updated until this thread_updater_initialized_ is + // again set to false in UnregisterThread(). + static thread_local ThreadStatusUpdater* thread_updater_local_cache_; +#else + static bool thread_updater_initialized_; + static ThreadStatusUpdater* thread_updater_local_cache_; +#endif +}; + +// A helper class for updating thread state. It will set the +// thread state according to the input parameter in its constructor +// and set the thread state to the previous state in its destructor. +class AutoThreadOperationStageUpdater { + public: + explicit AutoThreadOperationStageUpdater(ThreadStatus::OperationStage stage); + ~AutoThreadOperationStageUpdater(); + +#ifdef ROCKSDB_USING_THREAD_STATUS + private: + ThreadStatus::OperationStage prev_stage_; +#endif +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util_debug.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util_debug.cc new file mode 100644 index 0000000000..6e4fe8a9f6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/monitoring/thread_status_util_debug.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "monitoring/thread_status_updater.h" +#include "monitoring/thread_status_util.h" +#include "rocksdb/system_clock.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef NDEBUG +// the delay for debugging purpose. 
+static std::atomic<int> states_delay[ThreadStatus::NUM_STATE_TYPES]; + +void ThreadStatusUtil::TEST_SetStateDelay(const ThreadStatus::StateType state, + int micro) { + states_delay[state].store(micro, std::memory_order_relaxed); +} + +void ThreadStatusUtil::TEST_StateDelay(const ThreadStatus::StateType state) { + auto delay = states_delay[state].load(std::memory_order_relaxed); + if (delay > 0) { + SystemClock::Default()->SleepForMicroseconds(delay); + } +} + +Env::IOActivity ThreadStatusUtil::TEST_GetExpectedIOActivity( + ThreadStatus::OperationType thread_op) { + switch (thread_op) { + case ThreadStatus::OperationType::OP_FLUSH: + return Env::IOActivity::kFlush; + case ThreadStatus::OperationType::OP_COMPACTION: + return Env::IOActivity::kCompaction; + case ThreadStatus::OperationType::OP_DBOPEN: + return Env::IOActivity::kDBOpen; + default: + return Env::IOActivity::kUnknown; + } +} + +#endif // !NDEBUG + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/cf_options.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/cf_options.cc new file mode 100644 index 0000000000..ad1e669df5 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/cf_options.cc @@ -0,0 +1,1196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "options/cf_options.h" + +#include <cassert> +#include <cinttypes> +#include <cstring> +#include <limits> + +#include "logging/logging.h" +#include "options/configurable_helper.h" +#include "options/db_options.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "port/port.h" +#include "rocksdb/advanced_cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/concurrent_task_limiter.h" +#include "rocksdb/configurable.h" +#include "rocksdb/convenience.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/cast_util.h" + +// NOTE: in this file, many option flags that were deprecated +// and removed from the rest of the code have to be kept here +// and marked as kDeprecated in order to be able to read old +// OPTIONS files.
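The first definition below, ParseCompressionOptions, handles the legacy colon-delimited CompressionOptions string: three mandatory fields ("window_bits:level:strategy") followed by optional trailing fields appended over the years (max_dict_bytes, zstd_max_train_bytes, and so on). The core technique is just std::getline on ':'; a reduced sketch (SplitColon and the sample strings are illustrative, not from this patch):

#include <sstream>
#include <string>
#include <vector>

// Split "a:b:c:..." into tokens, as ParseCompressionOptions does below.
// "-14:32767:0" would set window_bits/level/strategy; a fourth token, as in
// "-14:32767:0:16384", would additionally set max_dict_bytes.
std::vector<std::string> SplitColon(const std::string& value) {
  std::vector<std::string> fields;
  std::istringstream in(value);
  std::string field;
  while (std::getline(in, field, ':')) {
    fields.push_back(field);
  }
  return fields;  // a caller would reject fewer than three fields
}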
+ +namespace ROCKSDB_NAMESPACE { + +static Status ParseCompressionOptions(const std::string& value, + const std::string& name, + CompressionOptions& compression_opts) { + const char kDelimiter = ':'; + std::istringstream field_stream(value); + std::string field; + + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument("unable to parse the specified CF option " + + name); + } + compression_opts.window_bits = ParseInt(field); + + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument("unable to parse the specified CF option " + + name); + } + compression_opts.level = ParseInt(field); + + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument("unable to parse the specified CF option " + + name); + } + compression_opts.strategy = ParseInt(field); + + // max_dict_bytes is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.max_dict_bytes = ParseInt(field); + } + + // zstd_max_train_bytes is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.zstd_max_train_bytes = ParseInt(field); + } + + // parallel_threads is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + // Since parallel_threads comes before enabled but was added optionally + // later, we need to check if this is the final token (meaning it is the + // enabled bit), or if there are more tokens (meaning this one is + // parallel_threads). 
+ if (!field_stream.eof()) { + compression_opts.parallel_threads = ParseInt(field); + } else { + // parallel_threads is not serialized with this format, but enabled is + compression_opts.enabled = ParseBoolean("", field); + } + } + + // enabled is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.enabled = ParseBoolean("", field); + } + + // max_dict_buffer_bytes is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.max_dict_buffer_bytes = ParseUint64(field); + } + + // use_zstd_dict_trainer is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.use_zstd_dict_trainer = ParseBoolean("", field); + } + + if (!field_stream.eof()) { + return Status::InvalidArgument("unable to parse the specified CF option " + + name); + } + return Status::OK(); +} + +const std::string kOptNameBMCompOpts = "bottommost_compression_opts"; +const std::string kOptNameCompOpts = "compression_opts"; + +// OptionTypeInfo map for CompressionOptions +static std::unordered_map + compression_options_type_info = { + {"window_bits", + {offsetof(struct CompressionOptions, window_bits), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"level", + {offsetof(struct CompressionOptions, level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"strategy", + {offsetof(struct CompressionOptions, strategy), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"max_compressed_bytes_per_kb", + {offsetof(struct CompressionOptions, max_compressed_bytes_per_kb), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_dict_bytes", + {offsetof(struct CompressionOptions, max_dict_bytes), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"zstd_max_train_bytes", + {offsetof(struct CompressionOptions, zstd_max_train_bytes), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"parallel_threads", + {offsetof(struct CompressionOptions, parallel_threads), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"enabled", + {offsetof(struct CompressionOptions, enabled), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"max_dict_buffer_bytes", + {offsetof(struct CompressionOptions, max_dict_buffer_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"use_zstd_dict_trainer", + {offsetof(struct CompressionOptions, use_zstd_dict_trainer), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +}; + +static std::unordered_map + file_temperature_age_type_info = { + {"temperature", + {offsetof(struct FileTemperatureAge, temperature), + OptionType::kTemperature, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"age", + {offsetof(struct FileTemperatureAge, age), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, 
+}; + +static std::unordered_map + fifo_compaction_options_type_info = { + {"max_table_files_size", + {offsetof(struct CompactionOptionsFIFO, max_table_files_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"age_for_warm", + {offsetof(struct CompactionOptionsFIFO, age_for_warm), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"ttl", + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"allow_compaction", + {offsetof(struct CompactionOptionsFIFO, allow_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"file_temperature_age_thresholds", + OptionTypeInfo::Vector( + offsetof(struct CompactionOptionsFIFO, + file_temperature_age_thresholds), + OptionVerificationType::kNormal, OptionTypeFlags::kMutable, + OptionTypeInfo::Struct("file_temperature_age_thresholds", + &file_temperature_age_type_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kMutable))}}; + +static std::unordered_map + universal_compaction_options_type_info = { + {"size_ratio", + {offsetof(class CompactionOptionsUniversal, size_ratio), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"min_merge_width", + {offsetof(class CompactionOptionsUniversal, min_merge_width), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_merge_width", + {offsetof(class CompactionOptionsUniversal, max_merge_width), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_size_amplification_percent", + {offsetof(class CompactionOptionsUniversal, + max_size_amplification_percent), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"compression_size_percent", + {offsetof(class CompactionOptionsUniversal, compression_size_percent), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"stop_style", + {offsetof(class CompactionOptionsUniversal, stop_style), + OptionType::kCompactionStopStyle, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"incremental", + {offsetof(class CompactionOptionsUniversal, incremental), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"allow_trivial_move", + {offsetof(class CompactionOptionsUniversal, allow_trivial_move), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}}; + +static std::unordered_map + cf_mutable_options_type_info = { + {"report_bg_io_stats", + {offsetof(struct MutableCFOptions, report_bg_io_stats), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"disable_auto_compactions", + {offsetof(struct MutableCFOptions, disable_auto_compactions), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"filter_deletes", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"check_flush_compaction_key_order", + {offsetof(struct MutableCFOptions, check_flush_compaction_key_order), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"paranoid_file_checks", + {offsetof(struct MutableCFOptions, paranoid_file_checks), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"verify_checksums_in_compaction", + {0, OptionType::kBoolean, 
OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"soft_pending_compaction_bytes_limit", + {offsetof(struct MutableCFOptions, + soft_pending_compaction_bytes_limit), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"hard_pending_compaction_bytes_limit", + {offsetof(struct MutableCFOptions, + hard_pending_compaction_bytes_limit), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"hard_rate_limit", + {0, OptionType::kDouble, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"soft_rate_limit", + {0, OptionType::kDouble, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"max_compaction_bytes", + {offsetof(struct MutableCFOptions, max_compaction_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"ignore_max_compaction_bytes_for_input", + {offsetof(struct MutableCFOptions, + ignore_max_compaction_bytes_for_input), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"expanded_compaction_factor", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"level0_file_num_compaction_trigger", + {offsetof(struct MutableCFOptions, level0_file_num_compaction_trigger), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"level0_slowdown_writes_trigger", + {offsetof(struct MutableCFOptions, level0_slowdown_writes_trigger), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"level0_stop_writes_trigger", + {offsetof(struct MutableCFOptions, level0_stop_writes_trigger), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_grandparent_overlap_factor", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"max_write_buffer_number", + {offsetof(struct MutableCFOptions, max_write_buffer_number), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"source_compaction_factor", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"target_file_size_multiplier", + {offsetof(struct MutableCFOptions, target_file_size_multiplier), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"arena_block_size", + {offsetof(struct MutableCFOptions, arena_block_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"inplace_update_num_locks", + {offsetof(struct MutableCFOptions, inplace_update_num_locks), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_successive_merges", + {offsetof(struct MutableCFOptions, max_successive_merges), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_huge_page_size", + {offsetof(struct MutableCFOptions, memtable_huge_page_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_prefix_bloom_huge_page_tlb_size", + {0, OptionType::kSizeT, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"write_buffer_size", + {offsetof(struct MutableCFOptions, write_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_prefix_bloom_bits", + {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + 
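+        // The kDeprecated entries in this map (offset 0, no target field)
+        // exist so that option names from older releases still parse cleanly;
+        // the supplied value is simply discarded. Sketch (illustrative):
+        //   db->SetOptions(cfh, {{"memtable_prefix_bloom_bits", "10000"}});
+        //   // -> Status::OK(), but no state changes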
{"memtable_prefix_bloom_size_ratio", + {offsetof(struct MutableCFOptions, memtable_prefix_bloom_size_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_prefix_bloom_probes", + {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"memtable_whole_key_filtering", + {offsetof(struct MutableCFOptions, memtable_whole_key_filtering), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"min_partial_merge_operands", + {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"max_bytes_for_level_base", + {offsetof(struct MutableCFOptions, max_bytes_for_level_base), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"snap_refresh_nanos", + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"max_bytes_for_level_multiplier", + {offsetof(struct MutableCFOptions, max_bytes_for_level_multiplier), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_bytes_for_level_multiplier_additional", + OptionTypeInfo::Vector( + offsetof(struct MutableCFOptions, + max_bytes_for_level_multiplier_additional), + OptionVerificationType::kNormal, OptionTypeFlags::kMutable, + {0, OptionType::kInt})}, + {"max_sequential_skip_in_iterations", + {offsetof(struct MutableCFOptions, max_sequential_skip_in_iterations), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"target_file_size_base", + {offsetof(struct MutableCFOptions, target_file_size_base), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"compression", + {offsetof(struct MutableCFOptions, compression), + OptionType::kCompressionType, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"prefix_extractor", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct MutableCFOptions, prefix_extractor), + OptionVerificationType::kByNameAllowNull, + (OptionTypeFlags::kMutable | OptionTypeFlags::kAllowNull))}, + {"compaction_options_fifo", + OptionTypeInfo::Struct( + "compaction_options_fifo", &fifo_compaction_options_type_info, + offsetof(struct MutableCFOptions, compaction_options_fifo), + OptionVerificationType::kNormal, OptionTypeFlags::kMutable) + .SetParseFunc([](const ConfigOptions& opts, + const std::string& name, const std::string& value, + void* addr) { + // This is to handle backward compatibility, where + // compaction_options_fifo could be assigned a single scalar + // value, say, like "23", which would be assigned to + // max_table_files_size. + if (name == "compaction_options_fifo" && + value.find("=") == std::string::npos) { + // Old format. Parse just a single uint64_t value. 
+ auto options = static_cast(addr); + options->max_table_files_size = ParseUint64(value); + return Status::OK(); + } else { + return OptionTypeInfo::ParseStruct( + opts, "compaction_options_fifo", + &fifo_compaction_options_type_info, name, value, addr); + } + })}, + {"compaction_options_universal", + OptionTypeInfo::Struct( + "compaction_options_universal", + &universal_compaction_options_type_info, + offsetof(struct MutableCFOptions, compaction_options_universal), + OptionVerificationType::kNormal, OptionTypeFlags::kMutable)}, + {"ttl", + {offsetof(struct MutableCFOptions, ttl), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"periodic_compaction_seconds", + {offsetof(struct MutableCFOptions, periodic_compaction_seconds), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"bottommost_temperature", + {0, OptionType::kTemperature, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"last_level_temperature", + {offsetof(struct MutableCFOptions, last_level_temperature), + OptionType::kTemperature, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"enable_blob_files", + {offsetof(struct MutableCFOptions, enable_blob_files), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"min_blob_size", + {offsetof(struct MutableCFOptions, min_blob_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_file_size", + {offsetof(struct MutableCFOptions, blob_file_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_compression_type", + {offsetof(struct MutableCFOptions, blob_compression_type), + OptionType::kCompressionType, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"enable_blob_garbage_collection", + {offsetof(struct MutableCFOptions, enable_blob_garbage_collection), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_garbage_collection_age_cutoff", + {offsetof(struct MutableCFOptions, blob_garbage_collection_age_cutoff), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_garbage_collection_force_threshold", + {offsetof(struct MutableCFOptions, + blob_garbage_collection_force_threshold), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_compaction_readahead_size", + {offsetof(struct MutableCFOptions, blob_compaction_readahead_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_file_starting_level", + {offsetof(struct MutableCFOptions, blob_file_starting_level), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"prepopulate_blob_cache", + OptionTypeInfo::Enum( + offsetof(struct MutableCFOptions, prepopulate_blob_cache), + &prepopulate_blob_cache_string_map, OptionTypeFlags::kMutable)}, + {"sample_for_compression", + {offsetof(struct MutableCFOptions, sample_for_compression), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"bottommost_compression", + {offsetof(struct MutableCFOptions, bottommost_compression), + OptionType::kCompressionType, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"compression_per_level", + OptionTypeInfo::Vector( + offsetof(struct MutableCFOptions, compression_per_level), + OptionVerificationType::kNormal, 
OptionTypeFlags::kMutable, + {0, OptionType::kCompressionType})}, + {"experimental_mempurge_threshold", + {offsetof(struct MutableCFOptions, experimental_mempurge_threshold), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_protection_bytes_per_key", + {offsetof(struct MutableCFOptions, memtable_protection_bytes_per_key), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"block_protection_bytes_per_key", + {offsetof(struct MutableCFOptions, block_protection_bytes_per_key), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {kOptNameCompOpts, + OptionTypeInfo::Struct( + kOptNameCompOpts, &compression_options_type_info, + offsetof(struct MutableCFOptions, compression_opts), + OptionVerificationType::kNormal, + (OptionTypeFlags::kMutable | OptionTypeFlags::kCompareNever), + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + // This is to handle backward compatibility, where + // compression_options was a ":" separated list. + if (name == kOptNameCompOpts && + value.find("=") == std::string::npos) { + auto* compression = static_cast(addr); + return ParseCompressionOptions(value, name, *compression); + } else { + return OptionTypeInfo::ParseStruct( + opts, kOptNameCompOpts, &compression_options_type_info, + name, value, addr); + } + })}, + {kOptNameBMCompOpts, + OptionTypeInfo::Struct( + kOptNameBMCompOpts, &compression_options_type_info, + offsetof(struct MutableCFOptions, bottommost_compression_opts), + OptionVerificationType::kNormal, + (OptionTypeFlags::kMutable | OptionTypeFlags::kCompareNever), + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + // This is to handle backward compatibility, where + // compression_options was a ":" separated list. 
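+              // A sketch of the legacy format: a value like "4:5:6:20" is
+              // consumed positionally (window_bits, level, strategy,
+              // max_dict_bytes, then the optional trailing fields); see
+              // ParseCompressionOptions above for the authoritative order.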
+              if (name == kOptNameBMCompOpts &&
+                  value.find("=") == std::string::npos) {
+                auto* compression = static_cast<CompressionOptions*>(addr);
+                return ParseCompressionOptions(value, name, *compression);
+              } else {
+                return OptionTypeInfo::ParseStruct(
+                    opts, kOptNameBMCompOpts, &compression_options_type_info,
+                    name, value, addr);
+              }
+            })},
+    // End special case properties
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    cf_immutable_options_type_info = {
+        /* not yet supported
+        CompressionOptions compression_opts;
+        TablePropertiesCollectorFactories table_properties_collector_factories;
+        using TablePropertiesCollectorFactories =
+            std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>;
+        UpdateStatus (*inplace_callback)(char* existing_value,
+                                         uint32_t* existing_value_size,
+                                         Slice delta_value,
+                                         std::string* merged_value);
+        std::vector<DbPath> cf_paths;
+         */
+        {"compaction_measure_io_stats",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"purge_redundant_kvs_while_flush",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"inplace_update_support",
+         {offsetof(struct ImmutableCFOptions, inplace_update_support),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"level_compaction_dynamic_level_bytes",
+         {offsetof(struct ImmutableCFOptions,
+                   level_compaction_dynamic_level_bytes),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"level_compaction_dynamic_file_size",
+         {offsetof(struct ImmutableCFOptions,
+                   level_compaction_dynamic_file_size),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"optimize_filters_for_hits",
+         {offsetof(struct ImmutableCFOptions, optimize_filters_for_hits),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"force_consistency_checks",
+         {offsetof(struct ImmutableCFOptions, force_consistency_checks),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"preclude_last_level_data_seconds",
+         {offsetof(struct ImmutableCFOptions,
+                   preclude_last_level_data_seconds),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"preserve_internal_time_seconds",
+         {offsetof(struct ImmutableCFOptions, preserve_internal_time_seconds),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        // Need to keep this around to be able to read old OPTIONS files.
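+        // (An OPTIONS file written by an old release may still contain a line
+        // such as "max_mem_compaction_level=2"; the kDeprecated entry below
+        // accepts and silently drops it instead of failing the load.)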
+ {"max_mem_compaction_level", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"max_write_buffer_number_to_maintain", + {offsetof(struct ImmutableCFOptions, + max_write_buffer_number_to_maintain), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, 0}}, + {"max_write_buffer_size_to_maintain", + {offsetof(struct ImmutableCFOptions, + max_write_buffer_size_to_maintain), + OptionType::kInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"min_write_buffer_number_to_merge", + {offsetof(struct ImmutableCFOptions, min_write_buffer_number_to_merge), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, 0}}, + {"num_levels", + {offsetof(struct ImmutableCFOptions, num_levels), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"bloom_locality", + {offsetof(struct ImmutableCFOptions, bloom_locality), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"rate_limit_delay_max_milliseconds", + {0, OptionType::kUInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"comparator", + OptionTypeInfo::AsCustomRawPtr( + offsetof(struct ImmutableCFOptions, user_comparator), + OptionVerificationType::kByName, OptionTypeFlags::kCompareLoose) + .SetSerializeFunc( + // Serializes a Comparator + [](const ConfigOptions& opts, const std::string&, + const void* addr, std::string* value) { + // it's a const pointer of const Comparator* + const auto* ptr = + static_cast(addr); + // Since the user-specified comparator will be wrapped by + // InternalKeyComparator, we should persist the + // user-specified one instead of InternalKeyComparator. + if (*ptr == nullptr) { + *value = kNullptrString; + } else if (opts.mutable_options_only) { + *value = ""; + } else { + const Comparator* root_comp = (*ptr)->GetRootComparator(); + if (root_comp == nullptr) { + root_comp = (*ptr); + } + *value = root_comp->ToString(opts); + } + return Status::OK(); + })}, + {"memtable_insert_with_hint_prefix_extractor", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct ImmutableCFOptions, + memtable_insert_with_hint_prefix_extractor), + OptionVerificationType::kByNameAllowNull, OptionTypeFlags::kNone)}, + {"memtable_factory", + {offsetof(struct ImmutableCFOptions, memtable_factory), + OptionType::kCustomizable, OptionVerificationType::kByName, + OptionTypeFlags::kShared, + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + std::unique_ptr factory; + auto* shared = + static_cast*>(addr); + Status s = + MemTableRepFactory::CreateFromString(opts, value, shared); + return s; + }}}, + {"memtable", + {offsetof(struct ImmutableCFOptions, memtable_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, + OptionTypeFlags::kShared, + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + std::unique_ptr factory; + auto* shared = + static_cast*>(addr); + Status s = + MemTableRepFactory::CreateFromString(opts, value, shared); + return s; + }}}, + {"table_factory", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct ImmutableCFOptions, table_factory), + OptionVerificationType::kByName, + (OptionTypeFlags::kCompareLoose | + OptionTypeFlags::kStringNameOnly | + OptionTypeFlags::kDontPrepare))}, + {"block_based_table_factory", + {offsetof(struct ImmutableCFOptions, table_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, + 
OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose, + // Parses the input value and creates a BlockBasedTableFactory + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + BlockBasedTableOptions* old_opts = nullptr; + auto table_factory = + static_cast*>(addr); + if (table_factory->get() != nullptr) { + old_opts = + table_factory->get()->GetOptions(); + } + if (name == "block_based_table_factory") { + std::unique_ptr new_factory; + if (old_opts != nullptr) { + new_factory.reset(NewBlockBasedTableFactory(*old_opts)); + } else { + new_factory.reset(NewBlockBasedTableFactory()); + } + Status s = new_factory->ConfigureFromString(opts, value); + if (s.ok()) { + table_factory->reset(new_factory.release()); + } + return s; + } else if (old_opts != nullptr) { + return table_factory->get()->ConfigureOption(opts, name, value); + } else { + return Status::NotFound("Mismatched table option: ", name); + } + }}}, + {"plain_table_factory", + {offsetof(struct ImmutableCFOptions, table_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, + OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose, + // Parses the input value and creates a PlainTableFactory + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + PlainTableOptions* old_opts = nullptr; + auto table_factory = + static_cast*>(addr); + if (table_factory->get() != nullptr) { + old_opts = table_factory->get()->GetOptions(); + } + if (name == "plain_table_factory") { + std::unique_ptr new_factory; + if (old_opts != nullptr) { + new_factory.reset(NewPlainTableFactory(*old_opts)); + } else { + new_factory.reset(NewPlainTableFactory()); + } + Status s = new_factory->ConfigureFromString(opts, value); + if (s.ok()) { + table_factory->reset(new_factory.release()); + } + return s; + } else if (old_opts != nullptr) { + return table_factory->get()->ConfigureOption(opts, name, value); + } else { + return Status::NotFound("Mismatched table option: ", name); + } + }}}, + {"table_properties_collectors", + OptionTypeInfo::Vector< + std::shared_ptr>( + offsetof(struct ImmutableCFOptions, + table_properties_collector_factories), + OptionVerificationType::kByName, OptionTypeFlags::kNone, + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone))}, + {"compaction_filter", + OptionTypeInfo::AsCustomRawPtr( + offsetof(struct ImmutableCFOptions, compaction_filter), + OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}, + {"compaction_filter_factory", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct ImmutableCFOptions, compaction_filter_factory), + OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}, + {"merge_operator", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct ImmutableCFOptions, merge_operator), + OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kCompareLoose | OptionTypeFlags::kAllowNull)}, + {"compaction_style", + {offsetof(struct ImmutableCFOptions, compaction_style), + OptionType::kCompactionStyle, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"compaction_pri", + {offsetof(struct ImmutableCFOptions, compaction_pri), + OptionType::kCompactionPri, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"sst_partitioner_factory", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct ImmutableCFOptions, sst_partitioner_factory), + OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}, + {"blob_cache", + {offsetof(struct 
ImmutableCFOptions, blob_cache), OptionType::kUnknown, + OptionVerificationType::kNormal, + (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), + // Parses the input value as a Cache + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* cache = static_cast*>(addr); + return Cache::CreateFromString(opts, value, cache); + }}}, + {"persist_user_defined_timestamps", + {offsetof(struct ImmutableCFOptions, persist_user_defined_timestamps), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareLoose}}, +}; + +const std::string OptionsHelper::kCFOptionsName = "ColumnFamilyOptions"; + +class ConfigurableMutableCFOptions : public Configurable { + public: + explicit ConfigurableMutableCFOptions(const MutableCFOptions& mcf) { + mutable_ = mcf; + RegisterOptions(&mutable_, &cf_mutable_options_type_info); + } + + protected: + MutableCFOptions mutable_; +}; + +class ConfigurableCFOptions : public ConfigurableMutableCFOptions { + public: + ConfigurableCFOptions(const ColumnFamilyOptions& opts, + const std::unordered_map* map) + : ConfigurableMutableCFOptions(MutableCFOptions(opts)), + immutable_(opts), + cf_options_(opts), + opt_map_(map) { + RegisterOptions(&immutable_, &cf_immutable_options_type_info); + } + + protected: + Status ConfigureOptions( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused) override { + Status s = Configurable::ConfigureOptions(config_options, opts_map, unused); + if (s.ok()) { + UpdateColumnFamilyOptions(mutable_, &cf_options_); + UpdateColumnFamilyOptions(immutable_, &cf_options_); + s = PrepareOptions(config_options); + } + return s; + } + + virtual const void* GetOptionsPtr(const std::string& name) const override { + if (name == OptionsHelper::kCFOptionsName) { + return &cf_options_; + } else { + return ConfigurableMutableCFOptions::GetOptionsPtr(name); + } + } + + bool OptionsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, const void* const this_ptr, + const void* const that_ptr, + std::string* mismatch) const override { + bool equals = opt_info.AreEqual(config_options, opt_name, this_ptr, + that_ptr, mismatch); + if (!equals && opt_info.IsByName()) { + if (opt_map_ == nullptr) { + equals = true; + } else { + const auto& iter = opt_map_->find(opt_name); + if (iter == opt_map_->end()) { + equals = true; + } else { + equals = opt_info.AreEqualByName(config_options, opt_name, this_ptr, + iter->second); + } + } + if (equals) { // False alarm, clear mismatch + *mismatch = ""; + } + } + if (equals && opt_info.IsConfigurable() && opt_map_ != nullptr) { + const auto* this_config = opt_info.AsRawPointer(this_ptr); + if (this_config == nullptr) { + const auto& iter = opt_map_->find(opt_name); + // If the name exists in the map and is not empty/null, + // then the this_config should be set. 
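+        // Hypothetical example: if the OPTIONS file recorded
+        //   merge_operator=stringappend
+        // but this_config is still nullptr after loading, the two objects
+        // cannot be considered equal, so opt_name is reported as the
+        // mismatch.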
+ if (iter != opt_map_->end() && !iter->second.empty() && + iter->second != kNullptrString) { + *mismatch = opt_name; + equals = false; + } + } + } + return equals; + } + + private: + ImmutableCFOptions immutable_; + ColumnFamilyOptions cf_options_; + const std::unordered_map* opt_map_; +}; + +std::unique_ptr CFOptionsAsConfigurable( + const MutableCFOptions& opts) { + std::unique_ptr ptr(new ConfigurableMutableCFOptions(opts)); + return ptr; +} +std::unique_ptr CFOptionsAsConfigurable( + const ColumnFamilyOptions& opts, + const std::unordered_map* opt_map) { + std::unique_ptr ptr(new ConfigurableCFOptions(opts, opt_map)); + return ptr; +} + +ImmutableCFOptions::ImmutableCFOptions() : ImmutableCFOptions(Options()) {} + +ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) + : compaction_style(cf_options.compaction_style), + compaction_pri(cf_options.compaction_pri), + user_comparator(cf_options.comparator), + internal_comparator(InternalKeyComparator(cf_options.comparator)), + merge_operator(cf_options.merge_operator), + compaction_filter(cf_options.compaction_filter), + compaction_filter_factory(cf_options.compaction_filter_factory), + min_write_buffer_number_to_merge( + cf_options.min_write_buffer_number_to_merge), + max_write_buffer_number_to_maintain( + cf_options.max_write_buffer_number_to_maintain), + max_write_buffer_size_to_maintain( + cf_options.max_write_buffer_size_to_maintain), + inplace_update_support(cf_options.inplace_update_support), + inplace_callback(cf_options.inplace_callback), + memtable_factory(cf_options.memtable_factory), + table_factory(cf_options.table_factory), + table_properties_collector_factories( + cf_options.table_properties_collector_factories), + bloom_locality(cf_options.bloom_locality), + level_compaction_dynamic_level_bytes( + cf_options.level_compaction_dynamic_level_bytes), + level_compaction_dynamic_file_size( + cf_options.level_compaction_dynamic_file_size), + num_levels(cf_options.num_levels), + optimize_filters_for_hits(cf_options.optimize_filters_for_hits), + force_consistency_checks(cf_options.force_consistency_checks), + preclude_last_level_data_seconds( + cf_options.preclude_last_level_data_seconds), + preserve_internal_time_seconds(cf_options.preserve_internal_time_seconds), + memtable_insert_with_hint_prefix_extractor( + cf_options.memtable_insert_with_hint_prefix_extractor), + cf_paths(cf_options.cf_paths), + compaction_thread_limiter(cf_options.compaction_thread_limiter), + sst_partitioner_factory(cf_options.sst_partitioner_factory), + blob_cache(cf_options.blob_cache), + persist_user_defined_timestamps( + cf_options.persist_user_defined_timestamps) {} + +ImmutableOptions::ImmutableOptions() : ImmutableOptions(Options()) {} + +ImmutableOptions::ImmutableOptions(const Options& options) + : ImmutableOptions(options, options) {} + +ImmutableOptions::ImmutableOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const DBOptions& db_options, + const ImmutableCFOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const ImmutableDBOptions& db_options, + const ColumnFamilyOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const ImmutableDBOptions& db_options, + const ImmutableCFOptions& cf_options) + : ImmutableDBOptions(db_options), 
ImmutableCFOptions(cf_options) {}
+
+// Multiply two operands. If they overflow, return op1.
+uint64_t MultiplyCheckOverflow(uint64_t op1, double op2) {
+  if (op1 == 0 || op2 <= 0) {
+    return 0;
+  }
+  if (std::numeric_limits<uint64_t>::max() / op1 < op2) {
+    return op1;
+  }
+  return static_cast<uint64_t>(op1 * op2);
+}
+
+// When level_compaction_dynamic_level_bytes is true and leveled compaction
+// is used, the base level is not always L1, so the precomputed max_file_size
+// can no longer be used. Recompute file_size_for_level from the base level.
+uint64_t MaxFileSizeForLevel(const MutableCFOptions& cf_options,
+    int level, CompactionStyle compaction_style, int base_level,
+    bool level_compaction_dynamic_level_bytes) {
+  if (!level_compaction_dynamic_level_bytes || level < base_level ||
+      compaction_style != kCompactionStyleLevel) {
+    assert(level >= 0);
+    assert(level < (int)cf_options.max_file_size.size());
+    return cf_options.max_file_size[level];
+  } else {
+    assert(level >= 0 && base_level >= 0);
+    assert(level - base_level < (int)cf_options.max_file_size.size());
+    return cf_options.max_file_size[level - base_level];
+  }
+}
+
+size_t MaxFileSizeForL0MetaPin(const MutableCFOptions& cf_options) {
+  // We do not want to pin meta-blocks that almost certainly came from intra-L0
+  // or a former larger `write_buffer_size` value to avoid surprising users with
+  // pinned memory usage. We use a factor of 1.5 to account for overhead
+  // introduced during flush in most cases.
+  if (std::numeric_limits<size_t>::max() / 3 <
+      cf_options.write_buffer_size / 2) {
+    return std::numeric_limits<size_t>::max();
+  }
+  return cf_options.write_buffer_size / 2 * 3;
+}
+
+void MutableCFOptions::RefreshDerivedOptions(int num_levels,
+                                             CompactionStyle compaction_style) {
+  max_file_size.resize(num_levels);
+  for (int i = 0; i < num_levels; ++i) {
+    if (i == 0 && compaction_style == kCompactionStyleUniversal) {
+      max_file_size[i] = ULLONG_MAX;
+    } else if (i > 1) {
+      max_file_size[i] = MultiplyCheckOverflow(max_file_size[i - 1],
+                                               target_file_size_multiplier);
+    } else {
+      max_file_size[i] = target_file_size_base;
+    }
+  }
+}
+
+void MutableCFOptions::Dump(Logger* log) const {
+  // Memtable related options
+  ROCKS_LOG_INFO(log,
+                 "                        write_buffer_size: %" ROCKSDB_PRIszt,
+                 write_buffer_size);
+  ROCKS_LOG_INFO(log, "                  max_write_buffer_number: %d",
+                 max_write_buffer_number);
+  ROCKS_LOG_INFO(log,
+                 "                         arena_block_size: %" ROCKSDB_PRIszt,
+                 arena_block_size);
+  ROCKS_LOG_INFO(log, "              memtable_prefix_bloom_ratio: %f",
+                 memtable_prefix_bloom_size_ratio);
+  ROCKS_LOG_INFO(log, "             memtable_whole_key_filtering: %d",
+                 memtable_whole_key_filtering);
+  ROCKS_LOG_INFO(log,
+                 "                  memtable_huge_page_size: %" ROCKSDB_PRIszt,
+                 memtable_huge_page_size);
+  ROCKS_LOG_INFO(log,
+                 "                    max_successive_merges: %" ROCKSDB_PRIszt,
+                 max_successive_merges);
+  ROCKS_LOG_INFO(log,
+                 "                 inplace_update_num_locks: %" ROCKSDB_PRIszt,
+                 inplace_update_num_locks);
+  ROCKS_LOG_INFO(log, "                         prefix_extractor: %s",
+                 prefix_extractor == nullptr
+                     ?
"nullptr" + : prefix_extractor->GetId().c_str()); + ROCKS_LOG_INFO(log, " disable_auto_compactions: %d", + disable_auto_compactions); + ROCKS_LOG_INFO(log, " soft_pending_compaction_bytes_limit: %" PRIu64, + soft_pending_compaction_bytes_limit); + ROCKS_LOG_INFO(log, " hard_pending_compaction_bytes_limit: %" PRIu64, + hard_pending_compaction_bytes_limit); + ROCKS_LOG_INFO(log, " level0_file_num_compaction_trigger: %d", + level0_file_num_compaction_trigger); + ROCKS_LOG_INFO(log, " level0_slowdown_writes_trigger: %d", + level0_slowdown_writes_trigger); + ROCKS_LOG_INFO(log, " level0_stop_writes_trigger: %d", + level0_stop_writes_trigger); + ROCKS_LOG_INFO(log, " max_compaction_bytes: %" PRIu64, + max_compaction_bytes); + ROCKS_LOG_INFO(log, " ignore_max_compaction_bytes_for_input: %s", + ignore_max_compaction_bytes_for_input ? "true" : "false"); + ROCKS_LOG_INFO(log, " target_file_size_base: %" PRIu64, + target_file_size_base); + ROCKS_LOG_INFO(log, " target_file_size_multiplier: %d", + target_file_size_multiplier); + ROCKS_LOG_INFO(log, " max_bytes_for_level_base: %" PRIu64, + max_bytes_for_level_base); + ROCKS_LOG_INFO(log, " max_bytes_for_level_multiplier: %f", + max_bytes_for_level_multiplier); + ROCKS_LOG_INFO(log, " ttl: %" PRIu64, + ttl); + ROCKS_LOG_INFO(log, " periodic_compaction_seconds: %" PRIu64, + periodic_compaction_seconds); + std::string result; + char buf[10]; + for (const auto m : max_bytes_for_level_multiplier_additional) { + snprintf(buf, sizeof(buf), "%d, ", m); + result += buf; + } + if (result.size() >= 2) { + result.resize(result.size() - 2); + } else { + result = ""; + } + + ROCKS_LOG_INFO(log, "max_bytes_for_level_multiplier_additional: %s", + result.c_str()); + ROCKS_LOG_INFO(log, " max_sequential_skip_in_iterations: %" PRIu64, + max_sequential_skip_in_iterations); + ROCKS_LOG_INFO(log, " check_flush_compaction_key_order: %d", + check_flush_compaction_key_order); + ROCKS_LOG_INFO(log, " paranoid_file_checks: %d", + paranoid_file_checks); + ROCKS_LOG_INFO(log, " report_bg_io_stats: %d", + report_bg_io_stats); + ROCKS_LOG_INFO(log, " compression: %d", + static_cast(compression)); + ROCKS_LOG_INFO(log, + " experimental_mempurge_threshold: %f", + experimental_mempurge_threshold); + + // Universal Compaction Options + ROCKS_LOG_INFO(log, "compaction_options_universal.size_ratio : %d", + compaction_options_universal.size_ratio); + ROCKS_LOG_INFO(log, "compaction_options_universal.min_merge_width : %d", + compaction_options_universal.min_merge_width); + ROCKS_LOG_INFO(log, "compaction_options_universal.max_merge_width : %d", + compaction_options_universal.max_merge_width); + ROCKS_LOG_INFO( + log, "compaction_options_universal.max_size_amplification_percent : %d", + compaction_options_universal.max_size_amplification_percent); + ROCKS_LOG_INFO(log, + "compaction_options_universal.compression_size_percent : %d", + compaction_options_universal.compression_size_percent); + ROCKS_LOG_INFO(log, "compaction_options_universal.stop_style : %d", + compaction_options_universal.stop_style); + ROCKS_LOG_INFO( + log, "compaction_options_universal.allow_trivial_move : %d", + static_cast(compaction_options_universal.allow_trivial_move)); + ROCKS_LOG_INFO(log, "compaction_options_universal.incremental : %d", + static_cast(compaction_options_universal.incremental)); + + // FIFO Compaction Options + ROCKS_LOG_INFO(log, "compaction_options_fifo.max_table_files_size : %" PRIu64, + compaction_options_fifo.max_table_files_size); + ROCKS_LOG_INFO(log, "compaction_options_fifo.allow_compaction 
: %d", + compaction_options_fifo.allow_compaction); + + // Blob file related options + ROCKS_LOG_INFO(log, " enable_blob_files: %s", + enable_blob_files ? "true" : "false"); + ROCKS_LOG_INFO(log, " min_blob_size: %" PRIu64, + min_blob_size); + ROCKS_LOG_INFO(log, " blob_file_size: %" PRIu64, + blob_file_size); + ROCKS_LOG_INFO(log, " blob_compression_type: %s", + CompressionTypeToString(blob_compression_type).c_str()); + ROCKS_LOG_INFO(log, " enable_blob_garbage_collection: %s", + enable_blob_garbage_collection ? "true" : "false"); + ROCKS_LOG_INFO(log, " blob_garbage_collection_age_cutoff: %f", + blob_garbage_collection_age_cutoff); + ROCKS_LOG_INFO(log, " blob_garbage_collection_force_threshold: %f", + blob_garbage_collection_force_threshold); + ROCKS_LOG_INFO(log, " blob_compaction_readahead_size: %" PRIu64, + blob_compaction_readahead_size); + ROCKS_LOG_INFO(log, " blob_file_starting_level: %d", + blob_file_starting_level); + ROCKS_LOG_INFO(log, " prepopulate_blob_cache: %s", + prepopulate_blob_cache == PrepopulateBlobCache::kFlushOnly + ? "flush only" + : "disable"); + ROCKS_LOG_INFO(log, " last_level_temperature: %d", + static_cast(last_level_temperature)); +} + +MutableCFOptions::MutableCFOptions(const Options& options) + : MutableCFOptions(ColumnFamilyOptions(options)) {} + +Status GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + Logger* /*info_log*/, MutableCFOptions* new_options) { + assert(new_options); + *new_options = base_options; + ConfigOptions config_options; + Status s = OptionTypeInfo::ParseType( + config_options, options_map, cf_mutable_options_type_info, new_options); + if (!s.ok()) { + *new_options = base_options; + } + return s; +} + +Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, + const MutableCFOptions& mutable_opts, + std::string* opt_string) { + assert(opt_string); + opt_string->clear(); + return OptionTypeInfo::SerializeType( + config_options, cf_mutable_options_type_info, &mutable_opts, opt_string); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/cf_options.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/cf_options.h new file mode 100644 index 0000000000..37ef54c0cb --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/cf_options.h @@ -0,0 +1,347 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "db/dbformat.h" +#include "options/db_options.h" +#include "rocksdb/options.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +// ImmutableCFOptions is a data struct used by RocksDB internal. It contains a +// subset of Options that should not be changed during the entire lifetime +// of DB. Raw pointers defined in this struct do not have ownership to the data +// they point to. Options contains std::shared_ptr to these data. 
+struct ImmutableCFOptions {
+ public:
+  static const char* kName() { return "ImmutableCFOptions"; }
+  explicit ImmutableCFOptions();
+  explicit ImmutableCFOptions(const ColumnFamilyOptions& cf_options);
+
+  CompactionStyle compaction_style;
+
+  CompactionPri compaction_pri;
+
+  const Comparator* user_comparator;
+  InternalKeyComparator internal_comparator;  // Only in Immutable
+
+  std::shared_ptr<MergeOperator> merge_operator;
+
+  const CompactionFilter* compaction_filter;
+
+  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;
+
+  int min_write_buffer_number_to_merge;
+
+  int max_write_buffer_number_to_maintain;
+
+  int64_t max_write_buffer_size_to_maintain;
+
+  bool inplace_update_support;
+
+  UpdateStatus (*inplace_callback)(char* existing_value,
+                                   uint32_t* existing_value_size,
+                                   Slice delta_value,
+                                   std::string* merged_value);
+
+  std::shared_ptr<MemTableRepFactory> memtable_factory;
+
+  std::shared_ptr<TableFactory> table_factory;
+
+  Options::TablePropertiesCollectorFactories
+      table_properties_collector_factories;
+
+  // This options is required by PlainTableReader. May need to move it
+  // to PlainTableOptions just like bloom_bits_per_key
+  uint32_t bloom_locality;
+
+  bool level_compaction_dynamic_level_bytes;
+
+  bool level_compaction_dynamic_file_size;
+
+  int num_levels;
+
+  bool optimize_filters_for_hits;
+
+  bool force_consistency_checks;
+
+  uint64_t preclude_last_level_data_seconds;
+
+  uint64_t preserve_internal_time_seconds;
+
+  std::shared_ptr<const SliceTransform>
+      memtable_insert_with_hint_prefix_extractor;
+
+  std::vector<DbPath> cf_paths;
+
+  std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter;
+
+  std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory;
+
+  std::shared_ptr<Cache> blob_cache;
+
+  bool persist_user_defined_timestamps;
+};
+
+struct ImmutableOptions : public ImmutableDBOptions, public ImmutableCFOptions {
+  explicit ImmutableOptions();
+  explicit ImmutableOptions(const Options& options);
+
+  ImmutableOptions(const DBOptions& db_options,
+                   const ColumnFamilyOptions& cf_options);
+
+  ImmutableOptions(const ImmutableDBOptions& db_options,
+                   const ImmutableCFOptions& cf_options);
+
+  ImmutableOptions(const DBOptions& db_options,
+                   const ImmutableCFOptions& cf_options);
+
+  ImmutableOptions(const ImmutableDBOptions& db_options,
+                   const ColumnFamilyOptions& cf_options);
+};
+
+struct MutableCFOptions {
+  static const char* kName() { return "MutableCFOptions"; }
+  explicit MutableCFOptions(const ColumnFamilyOptions& options)
+      : write_buffer_size(options.write_buffer_size),
+        max_write_buffer_number(options.max_write_buffer_number),
+        arena_block_size(options.arena_block_size),
+        memtable_prefix_bloom_size_ratio(
+            options.memtable_prefix_bloom_size_ratio),
+        memtable_whole_key_filtering(options.memtable_whole_key_filtering),
+        memtable_huge_page_size(options.memtable_huge_page_size),
+        max_successive_merges(options.max_successive_merges),
+        inplace_update_num_locks(options.inplace_update_num_locks),
+        prefix_extractor(options.prefix_extractor),
+        experimental_mempurge_threshold(
+            options.experimental_mempurge_threshold),
+        disable_auto_compactions(options.disable_auto_compactions),
+        soft_pending_compaction_bytes_limit(
+            options.soft_pending_compaction_bytes_limit),
+        hard_pending_compaction_bytes_limit(
+            options.hard_pending_compaction_bytes_limit),
+        level0_file_num_compaction_trigger(
+            options.level0_file_num_compaction_trigger),
+        level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
+        level0_stop_writes_trigger(options.level0_stop_writes_trigger),
+        max_compaction_bytes(options.max_compaction_bytes),
+        ignore_max_compaction_bytes_for_input(
options.ignore_max_compaction_bytes_for_input), + target_file_size_base(options.target_file_size_base), + target_file_size_multiplier(options.target_file_size_multiplier), + max_bytes_for_level_base(options.max_bytes_for_level_base), + max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), + ttl(options.ttl), + periodic_compaction_seconds(options.periodic_compaction_seconds), + max_bytes_for_level_multiplier_additional( + options.max_bytes_for_level_multiplier_additional), + compaction_options_fifo(options.compaction_options_fifo), + compaction_options_universal(options.compaction_options_universal), + enable_blob_files(options.enable_blob_files), + min_blob_size(options.min_blob_size), + blob_file_size(options.blob_file_size), + blob_compression_type(options.blob_compression_type), + enable_blob_garbage_collection(options.enable_blob_garbage_collection), + blob_garbage_collection_age_cutoff( + options.blob_garbage_collection_age_cutoff), + blob_garbage_collection_force_threshold( + options.blob_garbage_collection_force_threshold), + blob_compaction_readahead_size(options.blob_compaction_readahead_size), + blob_file_starting_level(options.blob_file_starting_level), + prepopulate_blob_cache(options.prepopulate_blob_cache), + max_sequential_skip_in_iterations( + options.max_sequential_skip_in_iterations), + check_flush_compaction_key_order( + options.check_flush_compaction_key_order), + paranoid_file_checks(options.paranoid_file_checks), + report_bg_io_stats(options.report_bg_io_stats), + compression(options.compression), + bottommost_compression(options.bottommost_compression), + compression_opts(options.compression_opts), + bottommost_compression_opts(options.bottommost_compression_opts), + last_level_temperature(options.last_level_temperature == + Temperature::kUnknown + ? options.bottommost_temperature + : options.last_level_temperature), + memtable_protection_bytes_per_key( + options.memtable_protection_bytes_per_key), + block_protection_bytes_per_key(options.block_protection_bytes_per_key), + sample_for_compression( + options.sample_for_compression), // TODO: is 0 fine here? 
+        compression_per_level(options.compression_per_level) {
+    RefreshDerivedOptions(options.num_levels, options.compaction_style);
+  }
+
+  MutableCFOptions()
+      : write_buffer_size(0),
+        max_write_buffer_number(0),
+        arena_block_size(0),
+        memtable_prefix_bloom_size_ratio(0),
+        memtable_whole_key_filtering(false),
+        memtable_huge_page_size(0),
+        max_successive_merges(0),
+        inplace_update_num_locks(0),
+        prefix_extractor(nullptr),
+        experimental_mempurge_threshold(0.0),
+        disable_auto_compactions(false),
+        soft_pending_compaction_bytes_limit(0),
+        hard_pending_compaction_bytes_limit(0),
+        level0_file_num_compaction_trigger(0),
+        level0_slowdown_writes_trigger(0),
+        level0_stop_writes_trigger(0),
+        max_compaction_bytes(0),
+        ignore_max_compaction_bytes_for_input(true),
+        target_file_size_base(0),
+        target_file_size_multiplier(0),
+        max_bytes_for_level_base(0),
+        max_bytes_for_level_multiplier(0),
+        ttl(0),
+        periodic_compaction_seconds(0),
+        compaction_options_fifo(),
+        enable_blob_files(false),
+        min_blob_size(0),
+        blob_file_size(0),
+        blob_compression_type(kNoCompression),
+        enable_blob_garbage_collection(false),
+        blob_garbage_collection_age_cutoff(0.0),
+        blob_garbage_collection_force_threshold(0.0),
+        blob_compaction_readahead_size(0),
+        blob_file_starting_level(0),
+        prepopulate_blob_cache(PrepopulateBlobCache::kDisable),
+        max_sequential_skip_in_iterations(0),
+        check_flush_compaction_key_order(true),
+        paranoid_file_checks(false),
+        report_bg_io_stats(false),
+        compression(Snappy_Supported() ? kSnappyCompression : kNoCompression),
+        bottommost_compression(kDisableCompressionOption),
+        last_level_temperature(Temperature::kUnknown),
+        memtable_protection_bytes_per_key(0),
+        block_protection_bytes_per_key(0),
+        sample_for_compression(0) {}
+
+  explicit MutableCFOptions(const Options& options);
+
+  // Must be called after any change to MutableCFOptions
+  void RefreshDerivedOptions(int num_levels, CompactionStyle compaction_style);
+
+  void RefreshDerivedOptions(const ImmutableCFOptions& ioptions) {
+    RefreshDerivedOptions(ioptions.num_levels, ioptions.compaction_style);
+  }
+
+  int MaxBytesMultiplerAdditional(int level) const {
+    if (level >=
+        static_cast<int>(max_bytes_for_level_multiplier_additional.size())) {
+      return 1;
+    }
+    return max_bytes_for_level_multiplier_additional[level];
+  }
+
+  void Dump(Logger* log) const;
+
+  // Memtable related options
+  size_t write_buffer_size;
+  int max_write_buffer_number;
+  size_t arena_block_size;
+  double memtable_prefix_bloom_size_ratio;
+  bool memtable_whole_key_filtering;
+  size_t memtable_huge_page_size;
+  size_t max_successive_merges;
+  size_t inplace_update_num_locks;
+  std::shared_ptr<const SliceTransform> prefix_extractor;
+  // [experimental]
+  // Used to activate or deactivate the mempurge feature (memtable garbage
+  // collection), which is deactivated by default. At every flush, the total
+  // useful payload (total entries minus garbage entries) is estimated as a
+  // ratio [useful payload bytes]/[size of a memtable (in bytes)]. This ratio
+  // is then compared to this `threshold` value:
+  //   - if ratio < threshold: the memtable is mempurged (garbage collected
+  //     in place) instead of being flushed;
+  //   - otherwise: a regular flush takes place.
+  // A threshold value > 1.0 : aggressive mempurge.
+  // 0 < threshold < 1.0: mempurge triggered only for very low useful payload
+  // ratios.
+ // [experimental] + double experimental_mempurge_threshold; + + // Compaction related options + bool disable_auto_compactions; + uint64_t soft_pending_compaction_bytes_limit; + uint64_t hard_pending_compaction_bytes_limit; + int level0_file_num_compaction_trigger; + int level0_slowdown_writes_trigger; + int level0_stop_writes_trigger; + uint64_t max_compaction_bytes; + bool ignore_max_compaction_bytes_for_input; + uint64_t target_file_size_base; + int target_file_size_multiplier; + uint64_t max_bytes_for_level_base; + double max_bytes_for_level_multiplier; + uint64_t ttl; + uint64_t periodic_compaction_seconds; + std::vector max_bytes_for_level_multiplier_additional; + CompactionOptionsFIFO compaction_options_fifo; + CompactionOptionsUniversal compaction_options_universal; + + // Blob file related options + bool enable_blob_files; + uint64_t min_blob_size; + uint64_t blob_file_size; + CompressionType blob_compression_type; + bool enable_blob_garbage_collection; + double blob_garbage_collection_age_cutoff; + double blob_garbage_collection_force_threshold; + uint64_t blob_compaction_readahead_size; + int blob_file_starting_level; + PrepopulateBlobCache prepopulate_blob_cache; + + // Misc options + uint64_t max_sequential_skip_in_iterations; + bool check_flush_compaction_key_order; + bool paranoid_file_checks; + bool report_bg_io_stats; + CompressionType compression; + CompressionType bottommost_compression; + CompressionOptions compression_opts; + CompressionOptions bottommost_compression_opts; + Temperature last_level_temperature; + uint32_t memtable_protection_bytes_per_key; + uint8_t block_protection_bytes_per_key; + + uint64_t sample_for_compression; + std::vector compression_per_level; + + // Derived options + // Per-level target file size. + std::vector max_file_size; +}; + +uint64_t MultiplyCheckOverflow(uint64_t op1, double op2); + +// Get the max file size in a given level. +uint64_t MaxFileSizeForLevel(const MutableCFOptions& cf_options, + int level, CompactionStyle compaction_style, int base_level = 1, + bool level_compaction_dynamic_level_bytes = false); + +// Get the max size of an L0 file for which we will pin its meta-blocks when +// `pin_l0_filter_and_index_blocks_in_cache` is set. +size_t MaxFileSizeForL0MetaPin(const MutableCFOptions& cf_options); + +Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, + const MutableCFOptions& mutable_opts, + std::string* opt_string); + +Status GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + Logger* info_log, MutableCFOptions* new_options); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/configurable.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/configurable.cc new file mode 100644 index 0000000000..5491336e0a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/configurable.cc @@ -0,0 +1,712 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
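+
+// Overview (editorial sketch): Configurable is the base class behind
+// RocksDB's string-based options handling. An object registers one or more
+// OptionTypeInfo maps via RegisterOptions(), and the methods in this file use
+// those maps to parse, serialize, and compare its options. Assuming a
+// Configurable* `cfg` with registered options, a typical round trip is:
+//   ConfigOptions opts;
+//   Status s = cfg->ConfigureFromString(opts, "write_buffer_size=64k");
+//   std::string serialized;
+//   if (s.ok()) s = cfg->GetOptionString(opts, &serialized);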
+ +#include "rocksdb/configurable.h" + +#include "logging/logging.h" +#include "options/configurable_helper.h" +#include "options/options_helper.h" +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +void Configurable::RegisterOptions( + const std::string& name, void* opt_ptr, + const std::unordered_map* type_map) { + RegisteredOptions opts; + opts.name = name; + opts.type_map = type_map; + opts.opt_ptr = opt_ptr; + options_.emplace_back(opts); +} + +//************************************************************************* +// +// Methods for Initializing and Validating Configurable Objects +// +//************************************************************************* + +Status Configurable::PrepareOptions(const ConfigOptions& opts) { + // We ignore the invoke_prepare_options here intentionally, + // as if you are here, you must have called PrepareOptions explicitly. + Status status = Status::OK(); + for (auto opt_iter : options_) { + if (opt_iter.type_map != nullptr) { + for (auto map_iter : *(opt_iter.type_map)) { + auto& opt_info = map_iter.second; + if (opt_info.ShouldPrepare()) { + status = opt_info.Prepare(opts, map_iter.first, opt_iter.opt_ptr); + if (!status.ok()) { + return status; + } + } + } + } + } + return status; +} + +Status Configurable::ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const { + Status status; + for (auto opt_iter : options_) { + if (opt_iter.type_map != nullptr) { + for (auto map_iter : *(opt_iter.type_map)) { + auto& opt_info = map_iter.second; + if (opt_info.ShouldValidate()) { + status = opt_info.Validate(db_opts, cf_opts, map_iter.first, + opt_iter.opt_ptr); + if (!status.ok()) { + return status; + } + } + } + } + } + return status; +} + +/*********************************************************************************/ +/* */ +/* Methods for Retrieving Options from Configurables */ +/* */ +/*********************************************************************************/ + +const void* Configurable::GetOptionsPtr(const std::string& name) const { + for (auto o : options_) { + if (o.name == name) { + return o.opt_ptr; + } + } + return nullptr; +} + +std::string Configurable::GetOptionName(const std::string& opt_name) const { + return opt_name; +} + +const OptionTypeInfo* ConfigurableHelper::FindOption( + const std::vector& options, + const std::string& short_name, std::string* opt_name, void** opt_ptr) { + for (auto iter : options) { + if (iter.type_map != nullptr) { + const auto opt_info = + OptionTypeInfo::Find(short_name, *(iter.type_map), opt_name); + if (opt_info != nullptr) { + *opt_ptr = iter.opt_ptr; + return opt_info; + } + } + } + return nullptr; +} + +//************************************************************************* +// +// Methods for Configuring Options from Strings/Name-Value Pairs/Maps +// +//************************************************************************* + +Status Configurable::ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opts_map) { + Status s = ConfigureFromMap(config_options, opts_map, nullptr); + return s; +} + +Status Configurable::ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused) { + return ConfigureOptions(config_options, opts_map, unused); +} + +Status 
Configurable::ConfigureOptions( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused) { + std::string curr_opts; + Status s; + if (!opts_map.empty()) { + // There are options in the map. + // Save the current configuration in curr_opts and then configure the + // options, but do not prepare them now. We will do all the prepare when + // the configuration is complete. + ConfigOptions copy = config_options; + copy.invoke_prepare_options = false; + if (!config_options.ignore_unknown_options) { + // If we are not ignoring unused, get the defaults in case we need to + // reset + copy.depth = ConfigOptions::kDepthDetailed; + copy.delimiter = "; "; + GetOptionString(copy, &curr_opts).PermitUncheckedError(); + } + + s = ConfigurableHelper::ConfigureOptions(copy, *this, opts_map, unused); + } + if (config_options.invoke_prepare_options && s.ok()) { + s = PrepareOptions(config_options); + } + if (!s.ok() && !curr_opts.empty()) { + ConfigOptions reset = config_options; + reset.ignore_unknown_options = true; + reset.invoke_prepare_options = true; + reset.ignore_unsupported_options = true; + // There are some options to reset from this current error + ConfigureFromString(reset, curr_opts).PermitUncheckedError(); + } + return s; +} + +Status Configurable::ParseStringOptions(const ConfigOptions& /*config_options*/, + const std::string& /*opts_str*/) { + return Status::OK(); +} + +Status Configurable::ConfigureFromString(const ConfigOptions& config_options, + const std::string& opts_str) { + Status s; + if (!opts_str.empty()) { + if (opts_str.find(';') != std::string::npos || + opts_str.find('=') != std::string::npos) { + std::unordered_map opt_map; + s = StringToMap(opts_str, &opt_map); + if (s.ok()) { + s = ConfigureFromMap(config_options, opt_map, nullptr); + } + } else { + s = ParseStringOptions(config_options, opts_str); + if (s.ok() && config_options.invoke_prepare_options) { + s = PrepareOptions(config_options); + } + } + } else if (config_options.invoke_prepare_options) { + s = PrepareOptions(config_options); + } else { + s = Status::OK(); + } + return s; +} + +/** + * Sets the value of the named property to the input value, returning OK on + * succcess. + */ +Status Configurable::ConfigureOption(const ConfigOptions& config_options, + const std::string& name, + const std::string& value) { + return ConfigurableHelper::ConfigureSingleOption(config_options, *this, name, + value); +} + +/** + * Looks for the named option amongst the options for this type and sets + * the value for it to be the input value. + * If the name was found, found_option will be set to true and the resulting + * status should be returned. + */ + +Status Configurable::ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& opt_value, void* opt_ptr) { + if (opt_info.IsMutable()) { + if (config_options.mutable_options_only) { + // This option is mutable. 
Treat all of its children as mutable as well + ConfigOptions copy = config_options; + copy.mutable_options_only = false; + return opt_info.Parse(copy, opt_name, opt_value, opt_ptr); + } else { + return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr); + } + } else if (config_options.mutable_options_only) { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } else { + return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr); + } +} + + +Status ConfigurableHelper::ConfigureOptions( + const ConfigOptions& config_options, Configurable& configurable, + const std::unordered_map& opts_map, + std::unordered_map* unused) { + std::unordered_map remaining = opts_map; + Status s = Status::OK(); + if (!opts_map.empty()) { + for (const auto& iter : configurable.options_) { + if (iter.type_map != nullptr) { + s = ConfigureSomeOptions(config_options, configurable, *(iter.type_map), + &remaining, iter.opt_ptr); + if (remaining.empty()) { // Are there more options left? + break; + } else if (!s.ok()) { + break; + } + } + } + } + if (unused != nullptr && !remaining.empty()) { + unused->insert(remaining.begin(), remaining.end()); + } + if (config_options.ignore_unknown_options) { + s = Status::OK(); + } else if (s.ok() && unused == nullptr && !remaining.empty()) { + s = Status::NotFound("Could not find option: ", remaining.begin()->first); + } + return s; +} + +/** + * Updates the object with the named-value property values, returning OK on + * succcess. Any properties that were found are removed from the options list; + * upon return only options that were not found in this opt_map remain. + + * Returns: + * - OK if ignore_unknown_options is set + * - InvalidArgument, if any option was invalid + * - NotSupported, if any option is unsupported and ignore_unsupported_options + is OFF + * - OK, if no option was invalid or not supported (or ignored) + */ +Status ConfigurableHelper::ConfigureSomeOptions( + const ConfigOptions& config_options, Configurable& configurable, + const std::unordered_map& type_map, + std::unordered_map* options, void* opt_ptr) { + Status result = Status::OK(); // The last non-OK result (if any) + Status notsup = Status::OK(); // The last NotSupported result (if any) + std::string elem_name; + int found = 1; + std::unordered_set unsupported; + // While there are unused properties and we processed at least one, + // go through the remaining unused properties and attempt to configure them. + while (found > 0 && !options->empty()) { + found = 0; + notsup = Status::OK(); + for (auto it = options->begin(); it != options->end();) { + const std::string& opt_name = configurable.GetOptionName(it->first); + const std::string& opt_value = it->second; + const auto opt_info = + OptionTypeInfo::Find(opt_name, type_map, &elem_name); + if (opt_info == nullptr) { // Did not find the option. 
+
+/**
+ * Updates the object with the named-value property values, returning OK on
+ * success.  Any properties that were found are removed from the options list;
+ * upon return only options that were not found in this opt_map remain.
+ *
+ * Returns:
+ *   - OK if ignore_unknown_options is set
+ *   - InvalidArgument, if any option was invalid
+ *   - NotSupported, if any option is unsupported and
+ *     ignore_unsupported_options is OFF
+ *   - OK, if no option was invalid or not supported (or ignored)
+ */
+Status ConfigurableHelper::ConfigureSomeOptions(
+    const ConfigOptions& config_options, Configurable& configurable,
+    const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+    std::unordered_map<std::string, std::string>* options, void* opt_ptr) {
+  Status result = Status::OK();  // The last non-OK result (if any)
+  Status notsup = Status::OK();  // The last NotSupported result (if any)
+  std::string elem_name;
+  int found = 1;
+  std::unordered_set<std::string> unsupported;
+  // While there are unused properties and we processed at least one,
+  // go through the remaining unused properties and attempt to configure them.
+  while (found > 0 && !options->empty()) {
+    found = 0;
+    notsup = Status::OK();
+    for (auto it = options->begin(); it != options->end();) {
+      const std::string& opt_name = configurable.GetOptionName(it->first);
+      const std::string& opt_value = it->second;
+      const auto opt_info =
+          OptionTypeInfo::Find(opt_name, type_map, &elem_name);
+      if (opt_info == nullptr) {  // Did not find the option.  Skip it
+        ++it;
+      } else {
+        Status s = ConfigureOption(config_options, configurable, *opt_info,
+                                   opt_name, elem_name, opt_value, opt_ptr);
+        if (s.IsNotFound()) {
+          ++it;
+        } else if (s.IsNotSupported()) {
+          notsup = s;
+          unsupported.insert(it->first);
+          ++it;  // Skip it for now
+        } else {
+          found++;
+          it = options->erase(it);
+          if (!s.ok()) {
+            result = s;
+          }
+        }
+      }
+    }  // End for all remaining options
+  }    // End while found one or options remain
+
+  // Now that we have been through the list, remove any unsupported
+  for (auto u : unsupported) {
+    auto it = options->find(u);
+    if (it != options->end()) {
+      options->erase(it);
+    }
+  }
+  if (config_options.ignore_unknown_options) {
+    if (!result.ok()) result.PermitUncheckedError();
+    if (!notsup.ok()) notsup.PermitUncheckedError();
+    return Status::OK();
+  } else if (!result.ok()) {
+    if (!notsup.ok()) notsup.PermitUncheckedError();
+    return result;
+  } else if (config_options.ignore_unsupported_options) {
+    if (!notsup.ok()) notsup.PermitUncheckedError();
+    return Status::OK();
+  } else {
+    return notsup;
+  }
+}
+
+Status ConfigurableHelper::ConfigureSingleOption(
+    const ConfigOptions& config_options, Configurable& configurable,
+    const std::string& name, const std::string& value) {
+  const std::string& opt_name = configurable.GetOptionName(name);
+  std::string elem_name;
+  void* opt_ptr = nullptr;
+  const auto opt_info =
+      FindOption(configurable.options_, opt_name, &elem_name, &opt_ptr);
+  if (opt_info == nullptr) {
+    return Status::NotFound("Could not find option: ", name);
+  } else {
+    return ConfigureOption(config_options, configurable, *opt_info, opt_name,
+                           elem_name, value, opt_ptr);
+  }
+}
+
+Status ConfigurableHelper::ConfigureCustomizableOption(
+    const ConfigOptions& config_options, Configurable& configurable,
+    const OptionTypeInfo& opt_info, const std::string& opt_name,
+    const std::string& name, const std::string& value, void* opt_ptr) {
+  Customizable* custom = opt_info.AsRawPointer<Customizable>(opt_ptr);
+  ConfigOptions copy = config_options;
+  if (opt_info.IsMutable()) {
+    // This option is mutable. Pass that property on to any subsequent calls
+    copy.mutable_options_only = false;
+  }
+
+  if (opt_info.IsMutable() || !config_options.mutable_options_only) {
+    // Either the option is mutable, or we are processing all of the options
+    if (opt_name == name || name == OptionTypeInfo::kIdPropName() ||
+        EndsWith(opt_name, OptionTypeInfo::kIdPropSuffix())) {
+      return configurable.ParseOption(copy, opt_info, name, value, opt_ptr);
+    } else if (value.empty()) {
+      return Status::OK();
+    } else if (custom == nullptr || !StartsWith(name, custom->GetId() + ".")) {
+      return configurable.ParseOption(copy, opt_info, name, value, opt_ptr);
+    } else if (value.find("=") != std::string::npos) {
+      return custom->ConfigureFromString(copy, value);
+    } else {
+      return custom->ConfigureOption(copy, name, value);
+    }
+  } else {
+    // We are processing immutable options, which means that we cannot change
+    // the Customizable object itself, but could change its mutable properties.
+    // Check to make sure that nothing is trying to change the Customizable
+    if (custom == nullptr) {
+      // We do not have a Customizable to configure.  This is OK if the
+      // value is empty (nothing being configured) but an error otherwise
+      if (value.empty()) {
+        return Status::OK();
+      } else {
+        return Status::InvalidArgument("Option not changeable: " + opt_name);
+      }
+    } else if (EndsWith(opt_name, OptionTypeInfo::kIdPropSuffix()) ||
+               name == OptionTypeInfo::kIdPropName()) {
+      // We have a property of the form "id=value" or "table.id=value"
+      // This is OK if the ID/value matches the current customizable object
+      if (custom->GetId() == value) {
+        return Status::OK();
+      } else {
+        return Status::InvalidArgument("Option not changeable: " + opt_name);
+      }
+    } else if (opt_name == name) {
+      // The properties are of one of the forms:
+      //    name = { id = id; prop1 = value1; ... }
+      //    name = { prop1=value1; prop2=value2; ... }
+      //    name = ID
+      // Convert the value to a map and extract the ID
+      // If the ID does not match that of the current customizable, return an
+      // error.  Otherwise, update the current customizable via the properties
+      // map
+      std::unordered_map<std::string, std::string> props;
+      std::string id;
+      Status s =
+          Configurable::GetOptionsMap(value, custom->GetId(), &id, &props);
+      if (!s.ok()) {
+        return s;
+      } else if (custom->GetId() != id) {
+        return Status::InvalidArgument("Option not changeable: " + opt_name);
+      } else if (props.empty()) {
+        return Status::OK();
+      } else {
+        return custom->ConfigureFromMap(copy, props);
+      }
+    } else {
+      // Attempting to configure one of the properties of the customizable
+      // Let it through
+      return custom->ConfigureOption(copy, name, value);
+    }
+  }
+}
+
+Status ConfigurableHelper::ConfigureOption(
+    const ConfigOptions& config_options, Configurable& configurable,
+    const OptionTypeInfo& opt_info, const std::string& opt_name,
+    const std::string& name, const std::string& value, void* opt_ptr) {
+  if (opt_info.IsCustomizable()) {
+    return ConfigureCustomizableOption(config_options, configurable, opt_info,
+                                       opt_name, name, value, opt_ptr);
+  } else if (opt_name == name) {
+    return configurable.ParseOption(config_options, opt_info, opt_name, value,
+                                    opt_ptr);
+  } else if (opt_info.IsStruct() || opt_info.IsConfigurable()) {
+    return configurable.ParseOption(config_options, opt_info, name, value,
+                                    opt_ptr);
+  } else {
+    return Status::NotFound("Could not find option: ", name);
+  }
+}
+
+//*******************************************************************************
+//
+//       Methods for Converting Options into strings
+//
+//*******************************************************************************
+
+Status Configurable::GetOptionString(const ConfigOptions& config_options,
+                                     std::string* result) const {
+  assert(result);
+  result->clear();
+  return ConfigurableHelper::SerializeOptions(config_options, *this, "",
+                                              result);
+}
+
+std::string Configurable::ToString(const ConfigOptions& config_options,
+                                   const std::string& prefix) const {
+  std::string result = SerializeOptions(config_options, prefix);
+  if (result.empty() || result.find('=') == std::string::npos) {
+    return result;
+  } else {
+    return "{" + result + "}";
+  }
+}
+
+std::string Configurable::SerializeOptions(const ConfigOptions& config_options,
+                                           const std::string& header) const {
+  std::string result;
+  Status s = ConfigurableHelper::SerializeOptions(config_options, *this,
+                                                  header, &result);
+  assert(s.ok());
+  return result;
+}
+
+Status Configurable::GetOption(const ConfigOptions& config_options,
+                               const std::string& name,
+                               std::string* value) const {
+  return ConfigurableHelper::GetOption(config_options, *this,
+                                       GetOptionName(name), value);
+}
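+
+// Illustrative sketch (hypothetical object and option names): retrieving one
+// option and the full serialized form.  Any Configurable subclass with
+// registered options behaves the same way.
+//
+//   ConfigOptions config_options;
+//   std::string value;
+//   Status s = cfg.GetOption(config_options, "window", &value);
+//   // value is only the serialized value, e.g. "100", not "window=100"
+//   std::string all;
+//   s = cfg.GetOptionString(config_options, &all);
+//   // all might yield e.g. "window=100; verify=true; "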
+
+Status ConfigurableHelper::GetOption(const ConfigOptions& config_options,
+                                     const Configurable& configurable,
+                                     const std::string& short_name,
+                                     std::string* value) {
+  // Look for option directly
+  assert(value);
+  value->clear();
+
+  std::string opt_name;
+  void* opt_ptr = nullptr;
+  const auto opt_info =
+      FindOption(configurable.options_, short_name, &opt_name, &opt_ptr);
+  if (opt_info != nullptr) {
+    ConfigOptions embedded = config_options;
+    embedded.delimiter = ";";
+    if (short_name == opt_name) {
+      return opt_info->Serialize(embedded, opt_name, opt_ptr, value);
+    } else if (opt_info->IsStruct()) {
+      return opt_info->Serialize(embedded, opt_name, opt_ptr, value);
+    } else if (opt_info->IsConfigurable()) {
+      auto const* config = opt_info->AsRawPointer<Configurable>(opt_ptr);
+      if (config != nullptr) {
+        return config->GetOption(embedded, opt_name, value);
+      }
+    }
+  }
+  return Status::NotFound("Cannot find option: ", short_name);
+}
+
+Status ConfigurableHelper::SerializeOptions(const ConfigOptions& config_options,
+                                            const Configurable& configurable,
+                                            const std::string& prefix,
+                                            std::string* result) {
+  assert(result);
+  for (auto const& opt_iter : configurable.options_) {
+    if (opt_iter.type_map != nullptr) {
+      for (const auto& map_iter : *(opt_iter.type_map)) {
+        const auto& opt_name = map_iter.first;
+        const auto& opt_info = map_iter.second;
+        if (opt_info.ShouldSerialize()) {
+          std::string value;
+          Status s;
+          if (!config_options.mutable_options_only) {
+            s = opt_info.Serialize(config_options, prefix + opt_name,
+                                   opt_iter.opt_ptr, &value);
+          } else if (opt_info.IsMutable()) {
+            ConfigOptions copy = config_options;
+            copy.mutable_options_only = false;
+            s = opt_info.Serialize(copy, prefix + opt_name, opt_iter.opt_ptr,
+                                   &value);
+          } else if (opt_info.IsConfigurable()) {
+            // If it is a Configurable and we are either printing all of the
+            // details or not printing only the name, this option should be
+            // included in the list
+            if (config_options.IsDetailed() ||
+                !opt_info.IsEnabled(OptionTypeFlags::kStringNameOnly)) {
+              s = opt_info.Serialize(config_options, prefix + opt_name,
+                                     opt_iter.opt_ptr, &value);
+            }
+          }
+          if (!s.ok()) {
+            return s;
+          } else if (!value.empty()) {
+            // <prefix><opt_name>=<value><delimiter>
+            result->append(prefix + opt_name + "=" + value +
+                           config_options.delimiter);
+          }
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
+//********************************************************************************
+//
+// Methods for listing the options from Configurables
+//
+//********************************************************************************
+Status Configurable::GetOptionNames(
+    const ConfigOptions& config_options,
+    std::unordered_set<std::string>* result) const {
+  assert(result);
+  return ConfigurableHelper::ListOptions(config_options, *this, "", result);
+}
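+
+// Illustrative sketch (hypothetical object): enumerating the registered
+// option names.  Mutable-only listing mirrors the serialization logic above.
+//
+//   ConfigOptions config_options;
+//   std::unordered_set<std::string> names;
+//   Status s = cfg.GetOptionNames(config_options, &names);
+//   // names now holds e.g. {"window", "verify"}; with
+//   // config_options.mutable_options_only == true, only options registered
+//   // with OptionTypeFlags::kMutable are returned.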
+
+Status ConfigurableHelper::ListOptions(
+    const ConfigOptions& config_options, const Configurable& configurable,
+    const std::string& prefix, std::unordered_set<std::string>* result) {
+  Status status;
+  for (auto const& opt_iter : configurable.options_) {
+    if (opt_iter.type_map != nullptr) {
+      for (const auto& map_iter : *(opt_iter.type_map)) {
+        const auto& opt_name = map_iter.first;
+        const auto& opt_info = map_iter.second;
+        // If the option is no longer used in rocksdb and marked as deprecated,
+        // we skip it in the serialization.
+        if (!opt_info.IsDeprecated() && !opt_info.IsAlias()) {
+          if (!config_options.mutable_options_only) {
+            result->emplace(prefix + opt_name);
+          } else if (opt_info.IsMutable()) {
+            result->emplace(prefix + opt_name);
+          }
+        }
+      }
+    }
+  }
+  return status;
+}
+
+//*******************************************************************************
+//
+// Methods for Comparing Configurables
+//
+//*******************************************************************************
+
+bool Configurable::AreEquivalent(const ConfigOptions& config_options,
+                                 const Configurable* other,
+                                 std::string* name) const {
+  assert(name);
+  name->clear();
+  if (this == other || config_options.IsCheckDisabled()) {
+    return true;
+  } else if (other != nullptr) {
+    return ConfigurableHelper::AreEquivalent(config_options, *this, *other,
+                                             name);
+  } else {
+    return false;
+  }
+}
+
+bool Configurable::OptionsAreEqual(const ConfigOptions& config_options,
+                                   const OptionTypeInfo& opt_info,
+                                   const std::string& opt_name,
+                                   const void* const this_ptr,
+                                   const void* const that_ptr,
+                                   std::string* mismatch) const {
+  if (opt_info.AreEqual(config_options, opt_name, this_ptr, that_ptr,
+                        mismatch)) {
+    return true;
+  } else if (opt_info.AreEqualByName(config_options, opt_name, this_ptr,
+                                     that_ptr)) {
+    *mismatch = "";
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool ConfigurableHelper::AreEquivalent(const ConfigOptions& config_options,
+                                       const Configurable& this_one,
+                                       const Configurable& that_one,
+                                       std::string* mismatch) {
+  assert(mismatch != nullptr);
+  for (auto const& o : this_one.options_) {
+    const auto this_offset = this_one.GetOptionsPtr(o.name);
+    const auto that_offset = that_one.GetOptionsPtr(o.name);
+    if (this_offset != that_offset) {
+      if (this_offset == nullptr || that_offset == nullptr) {
+        return false;
+      } else if (o.type_map != nullptr) {
+        for (const auto& map_iter : *(o.type_map)) {
+          const auto& opt_info = map_iter.second;
+          if (config_options.IsCheckEnabled(opt_info.GetSanityLevel())) {
+            if (!config_options.mutable_options_only) {
+              if (!this_one.OptionsAreEqual(config_options, opt_info,
+                                            map_iter.first, this_offset,
+                                            that_offset, mismatch)) {
+                return false;
+              }
+            } else if (opt_info.IsMutable()) {
+              ConfigOptions copy = config_options;
+              copy.mutable_options_only = false;
+              if (!this_one.OptionsAreEqual(copy, opt_info, map_iter.first,
+                                            this_offset, that_offset,
+                                            mismatch)) {
+                return false;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
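+
+// Illustrative sketch (hypothetical objects): structural comparison of two
+// configured objects of the same class.  On a mismatch, the out-parameter
+// names the offending option.
+//
+//   ConfigOptions config_options;
+//   std::string mismatch;
+//   if (!cfg_a.AreEquivalent(config_options, &cfg_b, &mismatch)) {
+//     // mismatch holds the first option whose values differed, e.g. "window"
+//   }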
+
+Status Configurable::GetOptionsMap(
+    const std::string& value, const std::string& default_id, std::string* id,
+    std::unordered_map<std::string, std::string>* props) {
+  assert(id);
+  assert(props);
+  Status status;
+  if (value.empty() || value == kNullptrString) {
+    *id = default_id;
+  } else if (value.find('=') == std::string::npos) {
+    *id = value;
+  } else {
+    status = StringToMap(value, props);
+    if (!status.ok()) {       // There was an error creating the map.
+      *id = value;            // Treat the value as id
+      props->clear();         // Clear the properties
+      status = Status::OK();  // and ignore the error
+    } else {
+      auto iter = props->find(OptionTypeInfo::kIdPropName());
+      if (iter != props->end()) {
+        *id = iter->second;
+        props->erase(iter);
+        if (*id == kNullptrString) {
+          id->clear();
+        }
+      } else if (!default_id.empty()) {
+        *id = default_id;
+      } else {           // No id property and no default
+        *id = value;     // Treat the value as id
+        props->clear();  // Clear the properties
+      }
+    }
+  }
+  return status;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/configurable_helper.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/configurable_helper.h
new file mode 100644
index 0000000000..5d409f82a4
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/configurable_helper.h
@@ -0,0 +1,185 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/configurable.h"
+#include "rocksdb/convenience.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Helper class defining static methods for supporting the Configurable
+// class.  The purpose of this class is to keep the Configurable class
+// as tight as possible and provide methods for doing the actual work
+// of configuring the objects.
+class ConfigurableHelper {
+ public:
+  // Configures the input Configurable object based on the parameters.
+  // On successful completion, the Configurable is updated with the settings
+  // from the opt_map.
+  //
+  // The acceptable values of the name/value pairs are documented with the
+  // specific class/instance.
+  //
+  // @param config_options Controls how the arguments are processed.
+  // @param opt_map Name/value pairs of the options to update
+  // @param unused If specified, this value will return the name/value
+  //      pairs from opt_map that were NotFound for this object.
+  // @return OK If all values in the map were successfully updated
+  // @return NotFound If any of the names in the opt_map were not valid
+  //      for this object.  If unused is specified, it will contain the
+  //      collection of NotFound entries
+  // @return NotSupported If any of the names are valid but the object does
+  //       not know how to convert the value.  This can happen if, for example,
+  //       there is some nested Configurable that cannot be created.
+  // @return InvalidArgument If any of the values cannot be successfully
+  //       parsed.  This can also be returned if PrepareOptions encounters an
+  //       error.
+  static Status ConfigureOptions(
+      const ConfigOptions& config_options, Configurable& configurable,
+      const std::unordered_map<std::string, std::string>& options,
+      std::unordered_map<std::string, std::string>* unused);
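+
+  // Illustrative sketch (option names are assumptions): map-based
+  // configuration with collection of unrecognized entries.
+  //
+  //   std::unordered_map<std::string, std::string> opts = {
+  //       {"window", "100"}, {"no_such_option", "x"}};
+  //   std::unordered_map<std::string, std::string> unused;
+  //   Status s = ConfigurableHelper::ConfigureOptions(config_options, cfg,
+  //                                                   opts, &unused);
+  //   // s is OK; unused == {{"no_such_option", "x"}}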
+
+  // Internal method to configure a set of options for this object.
+  // Classes may override this method to change its behavior.
+  // @param config_options Controls how the options are being configured
+  // @param type_name The name that was registered for this set of options
+  // @param type_map The map of options for this name
+  // @param opt_ptr Pointer to the object being configured for this option set.
+  // @param options The option name/values being updated.  On return, any
+  //    option that was found is removed from the list.
+  // @return OK If all of the options were successfully updated.
+  // @return InvalidArgument If an option was found but the value could not
+  //       be updated.
+  // @return NotFound If an option name was not found in type_map
+  // @return NotSupported If the option was found but no rule for converting
+  //       the value could be found.
+  static Status ConfigureSomeOptions(
+      const ConfigOptions& config_options, Configurable& configurable,
+      const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+      std::unordered_map<std::string, std::string>* options, void* opt_ptr);
+
+  // Configures a single option in the input Configurable.
+  // This method will look through the set of option names for this
+  // Configurable searching for one with the input name.  If such an option
+  // is found, it will be configured via the input value.
+  //
+  // @param config_options Controls how the option is being configured
+  // @param configurable The object to configure
+  // @param name For options with sub-options (like Structs or Configurables),
+  //      this value may be the name of the sub-field of the option being
+  //      updated.  For example, if the option is
+  //      "compaction_options_fifo.allow_compaction", then field name would be
+  //      "allow_compaction".  For most options, field_name and opt_name will
+  //      be equivalent.
+  // @param value The new value for this option.
+  // @return See ConfigureOptions for the possible return values
+  static Status ConfigureSingleOption(const ConfigOptions& config_options,
+                                      Configurable& configurable,
+                                      const std::string& name,
+                                      const std::string& value);
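+
+  // Illustrative sketch (reusing the dotted-name example documented above):
+  // configuring a nested sub-field of a struct option.
+  //
+  //   Status s = ConfigurableHelper::ConfigureSingleOption(
+  //       config_options, cfg,
+  //       "compaction_options_fifo.allow_compaction", "true");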
+
+  // Configures the option referenced by opt_info for this configurable
+  // This method configures the option based on opt_info for the input
+  // configurable.
+  // @param config_options Controls how the option is being configured
+  // @param configurable The object to configure
+  // @param opt_name The full option name
+  // @param name For options with sub-options (like Structs or Configurables),
+  //      this value may be the name of the sub-field of the option being
+  //      updated.  For example, if the option is
+  //      "compaction_options_fifo.allow_compaction", then field name would be
+  //      "allow_compaction".  For most options, field_name and opt_name will
+  //      be equivalent.
+  // @param value The new value for this option.
+  // @return See ConfigureOptions for the possible return values
+  static Status ConfigureOption(const ConfigOptions& config_options,
+                                Configurable& configurable,
+                                const OptionTypeInfo& opt_info,
+                                const std::string& opt_name,
+                                const std::string& name,
+                                const std::string& value, void* opt_ptr);
+
+  // Returns the value of the option associated with the input name
+  // This method is the functional inverse of ConfigureOption
+  // @param config_options Controls how the value is returned
+  // @param configurable The object from which to get the option.
+  // @param name The name of the option to return a value for.
+  // @param value The returned value associated with the named option.
+  //              Note that value will be only the serialized version
+  //              of the option and not "name=value"
+  // @return OK If the named field was successfully updated to value.
+  // @return NotFound If the name is not valid for this object.
+  // @return InvalidArgument If the name is valid for this object but
+  //      its value cannot be serialized.
+  static Status GetOption(const ConfigOptions& config_options,
+                          const Configurable& configurable,
+                          const std::string& name, std::string* value);
+
+  // Serializes the input Configurable into the output result.
+  // This is the inverse of ConfigureOptions
+  // @param config_options Controls how serialization happens.
+  // @param configurable The object to serialize
+  // @param prefix A prefix to add to each option as it is serialized.
+  // @param result The string representation of the configurable.
+  // @return OK If the options for this object were successfully serialized.
+  // @return InvalidArgument If one or more of the options could not be
+  //       serialized.
+  static Status SerializeOptions(const ConfigOptions& config_options,
+                                 const Configurable& configurable,
+                                 const std::string& prefix,
+                                 std::string* result);
+
+  // Internal method to list the option names for this object.
+  // Classes may override this method to change its behavior.
+  // @see ListOptions for more details
+  static Status ListOptions(const ConfigOptions& config_options,
+                            const Configurable& configurable,
+                            const std::string& prefix,
+                            std::unordered_set<std::string>* result);
+
+  // Checks to see if the two configurables are equivalent to one another.
+  // This method assumes that the two objects are of the same class.
+  // @param config_options Controls how the options are compared.
+  // @param this_one The object to compare to.
+  // @param that_one The other object being compared.
+  // @param mismatch If the objects do not match, this parameter contains
+  //      the name of the option that triggered the match failure.
+  // @return True if the objects match, false otherwise.
+  static bool AreEquivalent(const ConfigOptions& config_options,
+                            const Configurable& this_one,
+                            const Configurable& that_one,
+                            std::string* mismatch);
+
+ private:
+  // Looks for the option specified by name in the RegisteredOptions.
+  // This method traverses the types in the input options vector.  If an entry
+  // matching name is found, that entry, opt_name, and pointer are returned.
+  // @param options  The vector of options to search through
+  // @param name     The name of the option to search for in the OptionType map
+  // @param opt_name If the name was found, this value is set to the option
+  //                 name associated with the input name/type.
+  // @param opt_ptr  If the name was found, this value is set to the option
+  //                 pointer in the RegisteredOptions vector associated with
+  //                 this entry
+  // @return         A pointer to the OptionTypeInfo from the options if found,
+  //                 nullptr if the name was not found in the input options
+  static const OptionTypeInfo* FindOption(
+      const std::vector<Configurable::RegisteredOptions>& options,
+      const std::string& name, std::string* opt_name, void** opt_ptr);
+
+  static Status ConfigureCustomizableOption(
+      const ConfigOptions& config_options, Configurable& configurable,
+      const OptionTypeInfo& opt_info, const std::string& opt_name,
+      const std::string& name, const std::string& value, void* opt_ptr);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/configurable_test.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/configurable_test.h
new file mode 100644
index 0000000000..3d6fe84108
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/configurable_test.h
@@ -0,0 +1,116 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "options/configurable_helper.h"
+#include "rocksdb/configurable.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct ColumnFamilyOptions;
+struct DBOptions;
+
+namespace test {
+enum TestEnum { kTestA, kTestB };
+
+static const std::unordered_map<std::string, TestEnum> test_enum_map = {
+    {"A", TestEnum::kTestA},
+    {"B", TestEnum::kTestB},
+};
+
+struct TestOptions {
+  int i = 0;
+  bool b = false;
+  bool d = true;
+  TestEnum e = TestEnum::kTestA;
+  std::string s = "";
+  std::string u = "";
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> simple_option_info = {
+    {"int",
+     {offsetof(struct TestOptions, i), OptionType::kInt,
+      OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+    {"bool",
+     {offsetof(struct TestOptions, b), OptionType::kBoolean,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"string",
+     {offsetof(struct TestOptions, s), OptionType::kString,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> enum_option_info = {
+    {"enum",
+     OptionTypeInfo::Enum<TestEnum>(offsetof(struct TestOptions, e),
+                                    &test_enum_map)}};
+
+static std::unordered_map<std::string, OptionTypeInfo> unique_option_info = {
+    {"unique",
+     {0, OptionType::kConfigurable, OptionVerificationType::kNormal,
+      (OptionTypeFlags::kUnique | OptionTypeFlags::kMutable)}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> shared_option_info = {
+    {"shared",
+     {0, OptionType::kConfigurable, OptionVerificationType::kNormal,
+      (OptionTypeFlags::kShared)}},
+};
+static std::unordered_map<std::string, OptionTypeInfo> pointer_option_info = {
+    {"pointer",
+     {0, OptionType::kConfigurable, OptionVerificationType::kNormal,
+      OptionTypeFlags::kRawPointer}},
+};
+
+enum TestConfigMode {
+  kEmptyMode = 0x0,            // Don't register anything
+  kMutableMode = 0x01,         // Configuration is mutable
+  kSimpleMode = 0x02,          // Use the simple options
+  kEnumMode = 0x04,            // Use the enum options
+  kDefaultMode = kSimpleMode,  // Use no inner nested configurations
+  kSharedMode = 0x10,          // Use shared configuration
+  kUniqueMode = 0x20,          // Use unique configuration
+  kRawPtrMode = 0x40,          // Use pointer configuration
+  kNestedMode = (kSharedMode | kUniqueMode | kRawPtrMode),
+  kAllOptMode = (kNestedMode | kEnumMode | kSimpleMode),
+};
+
+template <typename T>
+class TestConfigurable : public Configurable {
+ protected:
+  std::string name_;
+  std::string prefix_;
+  TestOptions options_;
+
+ public:
+  std::unique_ptr<T> unique_;
+  std::shared_ptr<T> shared_;
+  T* pointer_;
+
+  TestConfigurable(const std::string& name, int mode,
+                   const std::unordered_map<std::string, OptionTypeInfo>* map =
+                       &simple_option_info)
+      : name_(name), pointer_(nullptr) {
+    prefix_ = "test." + name + ".";
+    if ((mode & TestConfigMode::kSimpleMode) != 0) {
+      RegisterOptions(name_, &options_, map);
+    }
+    if ((mode & TestConfigMode::kEnumMode) != 0) {
+      RegisterOptions(name_ + "Enum", &options_, &enum_option_info);
+    }
+  }
+
+  ~TestConfigurable() override { delete pointer_; }
+};
+
+}  // namespace test
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/customizable.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/customizable.cc
new file mode 100644
index 0000000000..2f154d84c5
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/customizable.cc
@@ -0,0 +1,133 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/customizable.h"
+
+#include <sstream>
+
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::string Customizable::GetOptionName(const std::string& long_name) const {
+  const std::string& name = Name();
+  size_t name_len = name.size();
+  if (long_name.size() > name_len + 1 &&
+      long_name.compare(0, name_len, name) == 0 &&
+      long_name.at(name_len) == '.') {
+    return long_name.substr(name_len + 1);
+  } else {
+    return Configurable::GetOptionName(long_name);
+  }
+}
+
+std::string Customizable::GenerateIndividualId() const {
+  std::ostringstream ostr;
+  ostr << Name() << "@" << static_cast<const void*>(this) << "#"
+       << port::GetProcessID();
+  return ostr.str();
+}
+
+Status Customizable::GetOption(const ConfigOptions& config_options,
+                               const std::string& opt_name,
+                               std::string* value) const {
+  if (opt_name == OptionTypeInfo::kIdPropName()) {
+    *value = GetId();
+    return Status::OK();
+  } else {
+    return Configurable::GetOption(config_options, opt_name, value);
+  }
+}
+
+std::string Customizable::SerializeOptions(const ConfigOptions& config_options,
+                                           const std::string& prefix) const {
+  std::string result;
+  std::string parent;
+  std::string id = GetId();
+  if (!config_options.IsShallow() && !id.empty()) {
+    parent = Configurable::SerializeOptions(config_options, "");
+  }
+  if (parent.empty()) {
+    result = id;
+  } else {
+    result.append(prefix);
+    result.append(OptionTypeInfo::kIdPropName());
+    result.append("=");
+    result.append(id);
+    result.append(config_options.delimiter);
+    result.append(parent);
+  }
+  return result;
+}
+
+bool Customizable::AreEquivalent(const ConfigOptions& config_options,
+                                 const Configurable* other,
+                                 std::string* mismatch) const {
+  if (config_options.sanity_level > ConfigOptions::kSanityLevelNone &&
+      this != other) {
+    const Customizable* custom = reinterpret_cast<const Customizable*>(other);
+    if (custom == nullptr) {  // Cast failed
+      return false;
+    } else if (GetId() != custom->GetId()) {
+      *mismatch = OptionTypeInfo::kIdPropName();
+      return false;
+    } else if (config_options.sanity_level >
+               ConfigOptions::kSanityLevelLooselyCompatible) {
+      bool matches =
+          Configurable::AreEquivalent(config_options, other, mismatch);
+      return matches;
+    }
+  }
+  return true;
+}
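+
+// Illustrative sketch (hypothetical values): the serialized form produced by
+// Customizable::SerializeOptions above, and consumed by GetOptionsMap below.
+// A customizable with id "MyListener" and one registered option might print:
+//
+//   "id=MyListener;timeout=60;"   (detailed form: id plus parent options)
+//   "MyListener"                  (shallow form: id only)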
+
+Status Customizable::GetOptionsMap(
+    const ConfigOptions& config_options, const Customizable* customizable,
+    const std::string& value, std::string* id,
+    std::unordered_map<std::string, std::string>* props) {
+  Status status;
+  if (value.empty() || value == kNullptrString) {
+    *id = "";
+    props->clear();
+  } else if (customizable != nullptr) {
+    status =
+        Configurable::GetOptionsMap(value, customizable->GetId(), id, props);
+    if (status.ok() && customizable->IsInstanceOf(*id)) {
+      // The new ID and the old ID match, so the objects are the same type.
+      // Try to get the existing options, ignoring any errors
+      ConfigOptions embedded = config_options;
+      embedded.delimiter = ";";
+      std::string curr_opts;
+      if (customizable->GetOptionString(embedded, &curr_opts).ok()) {
+        std::unordered_map<std::string, std::string> curr_props;
+        if (StringToMap(curr_opts, &curr_props).ok()) {
+          props->insert(curr_props.begin(), curr_props.end());
+        }
+      }
+    }
+  } else {
+    status = Configurable::GetOptionsMap(value, "", id, props);
+  }
+  return status;
+}
+
+Status Customizable::ConfigureNewObject(
+    const ConfigOptions& config_options, Customizable* object,
+    const std::unordered_map<std::string, std::string>& opt_map) {
+  Status status;
+  if (object != nullptr) {
+    status = object->ConfigureFromMap(config_options, opt_map);
+  } else if (!opt_map.empty()) {
+    status = Status::InvalidArgument("Cannot configure null object");
+  }
+  return status;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/db_options.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/db_options.cc
new file mode 100644
index 0000000000..d81e72833c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/db_options.cc
@@ -0,0 +1,1079 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "options/db_options.h"
+
+#include <cinttypes>
+
+#include "logging/logging.h"
+#include "options/configurable_helper.h"
+#include "options/options_helper.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "rocksdb/advanced_cache.h"
+#include "rocksdb/configurable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/sst_file_manager.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/options_type.h"
+#include "rocksdb/wal_filter.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+static std::unordered_map<std::string, WALRecoveryMode>
+    wal_recovery_mode_string_map = {
+        {"kTolerateCorruptedTailRecords",
+         WALRecoveryMode::kTolerateCorruptedTailRecords},
+        {"kAbsoluteConsistency", WALRecoveryMode::kAbsoluteConsistency},
+        {"kPointInTimeRecovery", WALRecoveryMode::kPointInTimeRecovery},
+        {"kSkipAnyCorruptedRecords",
+         WALRecoveryMode::kSkipAnyCorruptedRecords}};
+
+static std::unordered_map<std::string, DBOptions::AccessHint>
+    access_hint_string_map = {{"NONE", DBOptions::AccessHint::NONE},
+                              {"NORMAL", DBOptions::AccessHint::NORMAL},
+                              {"SEQUENTIAL", DBOptions::AccessHint::SEQUENTIAL},
+                              {"WILLNEED", DBOptions::AccessHint::WILLNEED}};
+
+static std::unordered_map<std::string, CacheTier> cache_tier_string_map = {
+    {"kVolatileTier", CacheTier::kVolatileTier},
+    {"kNonVolatileBlockTier", CacheTier::kNonVolatileBlockTier}};
+
+static std::unordered_map<std::string, InfoLogLevel>
+    info_log_level_string_map = {{"DEBUG_LEVEL", InfoLogLevel::DEBUG_LEVEL},
+                                 {"INFO_LEVEL", InfoLogLevel::INFO_LEVEL},
+                                 {"WARN_LEVEL", InfoLogLevel::WARN_LEVEL},
+                                 {"ERROR_LEVEL", InfoLogLevel::ERROR_LEVEL},
+                                 {"FATAL_LEVEL", InfoLogLevel::FATAL_LEVEL},
+                                 {"HEADER_LEVEL", InfoLogLevel::HEADER_LEVEL}};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    db_mutable_options_type_info = {
+        {"allow_os_buffer",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"base_background_compactions",
+         {0, OptionType::kInt, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"max_background_jobs",
+         
{offsetof(struct MutableDBOptions, max_background_jobs), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_background_compactions", + {offsetof(struct MutableDBOptions, max_background_compactions), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_subcompactions", + {offsetof(struct MutableDBOptions, max_subcompactions), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"avoid_flush_during_shutdown", + {offsetof(struct MutableDBOptions, avoid_flush_during_shutdown), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"writable_file_max_buffer_size", + {offsetof(struct MutableDBOptions, writable_file_max_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"delayed_write_rate", + {offsetof(struct MutableDBOptions, delayed_write_rate), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_total_wal_size", + {offsetof(struct MutableDBOptions, max_total_wal_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"delete_obsolete_files_period_micros", + {offsetof(struct MutableDBOptions, + delete_obsolete_files_period_micros), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"stats_dump_period_sec", + {offsetof(struct MutableDBOptions, stats_dump_period_sec), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"stats_persist_period_sec", + {offsetof(struct MutableDBOptions, stats_persist_period_sec), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"stats_history_buffer_size", + {offsetof(struct MutableDBOptions, stats_history_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_open_files", + {offsetof(struct MutableDBOptions, max_open_files), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"bytes_per_sync", + {offsetof(struct MutableDBOptions, bytes_per_sync), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"wal_bytes_per_sync", + {offsetof(struct MutableDBOptions, wal_bytes_per_sync), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"strict_bytes_per_sync", + {offsetof(struct MutableDBOptions, strict_bytes_per_sync), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"compaction_readahead_size", + {offsetof(struct MutableDBOptions, compaction_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_background_flushes", + {offsetof(struct MutableDBOptions, max_background_flushes), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +}; + +static std::unordered_map + db_immutable_options_type_info = { + /* + // not yet supported + std::shared_ptr row_cache; + std::shared_ptr delete_scheduler; + std::shared_ptr info_log; + std::shared_ptr rate_limiter; + std::shared_ptr statistics; + std::vector db_paths; + FileTypeSet checksum_handoff_file_types; + */ + {"advise_random_on_open", + {offsetof(struct ImmutableDBOptions, advise_random_on_open), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_mmap_reads", + {offsetof(struct ImmutableDBOptions, 
allow_mmap_reads), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_fallocate", + {offsetof(struct ImmutableDBOptions, allow_fallocate), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_mmap_writes", + {offsetof(struct ImmutableDBOptions, allow_mmap_writes), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_direct_reads", + {offsetof(struct ImmutableDBOptions, use_direct_reads), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_direct_writes", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"use_direct_io_for_flush_and_compaction", + {offsetof(struct ImmutableDBOptions, + use_direct_io_for_flush_and_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_2pc", + {offsetof(struct ImmutableDBOptions, allow_2pc), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"wal_filter", + OptionTypeInfo::AsCustomRawPtr( + offsetof(struct ImmutableDBOptions, wal_filter), + OptionVerificationType::kByName, + (OptionTypeFlags::kAllowNull | OptionTypeFlags::kCompareNever))}, + {"create_if_missing", + {offsetof(struct ImmutableDBOptions, create_if_missing), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"create_missing_column_families", + {offsetof(struct ImmutableDBOptions, create_missing_column_families), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"disableDataSync", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"disable_data_sync", // for compatibility + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"enable_thread_tracking", + {offsetof(struct ImmutableDBOptions, enable_thread_tracking), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"error_if_exists", + {offsetof(struct ImmutableDBOptions, error_if_exists), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"experimental_allow_mempurge", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"experimental_mempurge_policy", + {0, OptionType::kString, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"experimental_mempurge_threshold", + {0, OptionType::kDouble, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"is_fd_close_on_exec", + {offsetof(struct ImmutableDBOptions, is_fd_close_on_exec), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"paranoid_checks", + {offsetof(struct ImmutableDBOptions, paranoid_checks), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"flush_verify_memtable_count", + {offsetof(struct ImmutableDBOptions, flush_verify_memtable_count), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"track_and_verify_wals_in_manifest", + {offsetof(struct ImmutableDBOptions, + track_and_verify_wals_in_manifest), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"verify_sst_unique_id_in_manifest", + {offsetof(struct ImmutableDBOptions, verify_sst_unique_id_in_manifest), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + 
{"skip_log_error_on_recovery", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"skip_stats_update_on_db_open", + {offsetof(struct ImmutableDBOptions, skip_stats_update_on_db_open), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"skip_checking_sst_file_sizes_on_db_open", + {offsetof(struct ImmutableDBOptions, + skip_checking_sst_file_sizes_on_db_open), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"new_table_reader_for_compaction_inputs", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"random_access_max_buffer_size", + {offsetof(struct ImmutableDBOptions, random_access_max_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_adaptive_mutex", + {offsetof(struct ImmutableDBOptions, use_adaptive_mutex), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_fsync", + {offsetof(struct ImmutableDBOptions, use_fsync), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"max_file_opening_threads", + {offsetof(struct ImmutableDBOptions, max_file_opening_threads), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"table_cache_numshardbits", + {offsetof(struct ImmutableDBOptions, table_cache_numshardbits), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"db_write_buffer_size", + {offsetof(struct ImmutableDBOptions, db_write_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"keep_log_file_num", + {offsetof(struct ImmutableDBOptions, keep_log_file_num), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"recycle_log_file_num", + {offsetof(struct ImmutableDBOptions, recycle_log_file_num), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"log_file_time_to_roll", + {offsetof(struct ImmutableDBOptions, log_file_time_to_roll), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"manifest_preallocation_size", + {offsetof(struct ImmutableDBOptions, manifest_preallocation_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_log_file_size", + {offsetof(struct ImmutableDBOptions, max_log_file_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"db_log_dir", + {offsetof(struct ImmutableDBOptions, db_log_dir), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"wal_dir", + {offsetof(struct ImmutableDBOptions, wal_dir), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"WAL_size_limit_MB", + {offsetof(struct ImmutableDBOptions, WAL_size_limit_MB), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"WAL_ttl_seconds", + {offsetof(struct ImmutableDBOptions, WAL_ttl_seconds), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_manifest_file_size", + {offsetof(struct ImmutableDBOptions, max_manifest_file_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"persist_stats_to_disk", + {offsetof(struct ImmutableDBOptions, persist_stats_to_disk), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + 
{"fail_if_options_file_error", + {offsetof(struct ImmutableDBOptions, fail_if_options_file_error), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"enable_pipelined_write", + {offsetof(struct ImmutableDBOptions, enable_pipelined_write), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"unordered_write", + {offsetof(struct ImmutableDBOptions, unordered_write), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_concurrent_memtable_write", + {offsetof(struct ImmutableDBOptions, allow_concurrent_memtable_write), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"wal_recovery_mode", + OptionTypeInfo::Enum( + offsetof(struct ImmutableDBOptions, wal_recovery_mode), + &wal_recovery_mode_string_map)}, + {"enable_write_thread_adaptive_yield", + {offsetof(struct ImmutableDBOptions, + enable_write_thread_adaptive_yield), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"write_thread_slow_yield_usec", + {offsetof(struct ImmutableDBOptions, write_thread_slow_yield_usec), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_write_batch_group_size_bytes", + {offsetof(struct ImmutableDBOptions, max_write_batch_group_size_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"write_thread_max_yield_usec", + {offsetof(struct ImmutableDBOptions, write_thread_max_yield_usec), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"access_hint_on_compaction_start", + OptionTypeInfo::Enum( + offsetof(struct ImmutableDBOptions, + access_hint_on_compaction_start), + &access_hint_string_map)}, + {"info_log_level", + OptionTypeInfo::Enum( + offsetof(struct ImmutableDBOptions, info_log_level), + &info_log_level_string_map)}, + {"dump_malloc_stats", + {offsetof(struct ImmutableDBOptions, dump_malloc_stats), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"avoid_flush_during_recovery", + {offsetof(struct ImmutableDBOptions, avoid_flush_during_recovery), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_ingest_behind", + {offsetof(struct ImmutableDBOptions, allow_ingest_behind), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"preserve_deletes", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"concurrent_prepare", // Deprecated by two_write_queues + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"two_write_queues", + {offsetof(struct ImmutableDBOptions, two_write_queues), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"manual_wal_flush", + {offsetof(struct ImmutableDBOptions, manual_wal_flush), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"wal_compression", + {offsetof(struct ImmutableDBOptions, wal_compression), + OptionType::kCompressionType, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"seq_per_batch", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"atomic_flush", + {offsetof(struct ImmutableDBOptions, atomic_flush), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"avoid_unnecessary_blocking_io", + 
{offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"write_dbid_to_manifest", + {offsetof(struct ImmutableDBOptions, write_dbid_to_manifest), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"log_readahead_size", + {offsetof(struct ImmutableDBOptions, log_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"best_efforts_recovery", + {offsetof(struct ImmutableDBOptions, best_efforts_recovery), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_bgerror_resume_count", + {offsetof(struct ImmutableDBOptions, max_bgerror_resume_count), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bgerror_resume_retry_interval", + {offsetof(struct ImmutableDBOptions, bgerror_resume_retry_interval), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"db_host_id", + {offsetof(struct ImmutableDBOptions, db_host_id), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever}}, + // Temporarily deprecated due to race conditions (examples in PR 10375). + {"rate_limiter", + {offsetof(struct ImmutableDBOptions, rate_limiter), + OptionType::kUnknown, OptionVerificationType::kDeprecated, + OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever}}, + // The following properties were handled as special cases in ParseOption + // This means that the properties could be read from the options file + // but never written to the file or compared to each other. + {"rate_limiter_bytes_per_sec", + {offsetof(struct ImmutableDBOptions, rate_limiter), + OptionType::kUnknown, OptionVerificationType::kNormal, + (OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever), + // Parse the input value as a RateLimiter + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + auto limiter = static_cast*>(addr); + limiter->reset(NewGenericRateLimiter( + static_cast(ParseUint64(value)))); + return Status::OK(); + }}}, + {"env", //**TODO: Should this be kCustomizable? 
+ OptionTypeInfo( + offsetof(struct ImmutableDBOptions, env), OptionType::kUnknown, + OptionVerificationType::kNormal, + (OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever)) + .SetParseFunc([](const ConfigOptions& opts, + const std::string& /*name*/, + const std::string& value, void* addr) { + // Parse the input value as an Env + auto old_env = static_cast(addr); // Get the old value + Env* new_env = *old_env; // Set new to old + Status s = Env::CreateFromString(opts, value, + &new_env); // Update new value + if (s.ok()) { // It worked + *old_env = new_env; // Update the old one + } + return s; + }) + .SetPrepareFunc([](const ConfigOptions& opts, + const std::string& /*name*/, void* addr) { + auto env = static_cast(addr); + return (*env)->PrepareOptions(opts); + }) + .SetValidateFunc([](const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts, + const std::string& /*name*/, + const void* addr) { + const auto env = static_cast(addr); + return (*env)->ValidateOptions(db_opts, cf_opts); + })}, + {"allow_data_in_errors", + {offsetof(struct ImmutableDBOptions, allow_data_in_errors), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_checksum_gen_factory", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct ImmutableDBOptions, file_checksum_gen_factory), + OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kAllowNull)}, + {"statistics", + OptionTypeInfo::AsCustomSharedPtr( + // Statistics should not be compared and can be null + // Statistics are maked "don't serialize" until they can be shared + // between DBs + offsetof(struct ImmutableDBOptions, statistics), + OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize | + OptionTypeFlags::kAllowNull)}, + // Allow EventListeners that have a non-empty Name() to be read/written + // as options Each listener will either be + // - A simple name (e.g. "MyEventListener") + // - A name with properties (e.g. 
"{id=MyListener1; timeout=60}" + // Multiple listeners will be separated by a ":": + // - "MyListener0;{id=MyListener1; timeout=60} + {"listeners", + {offsetof(struct ImmutableDBOptions, listeners), OptionType::kVector, + OptionVerificationType::kByNameAllowNull, + OptionTypeFlags::kCompareNever, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + ConfigOptions embedded = opts; + embedded.ignore_unsupported_options = true; + std::vector> listeners; + Status s; + for (size_t start = 0, end = 0; + s.ok() && start < value.size() && end != std::string::npos; + start = end + 1) { + std::string token; + s = OptionTypeInfo::NextToken(value, ':', start, &end, &token); + if (s.ok() && !token.empty()) { + std::shared_ptr listener; + s = EventListener::CreateFromString(embedded, token, &listener); + if (s.ok() && listener != nullptr) { + listeners.push_back(listener); + } + } + } + if (s.ok()) { // It worked + *(static_cast>*>( + addr)) = listeners; + } + return s; + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto listeners = + static_cast>*>( + addr); + ConfigOptions embedded = opts; + embedded.delimiter = ";"; + int printed = 0; + for (const auto& listener : *listeners) { + auto id = listener->GetId(); + if (!id.empty()) { + std::string elem_str = listener->ToString(embedded, ""); + if (printed++ == 0) { + value->append("{"); + } else { + value->append(":"); + } + value->append(elem_str); + } + } + if (printed > 0) { + value->append("}"); + } + return Status::OK(); + }, + nullptr}}, + {"lowest_used_cache_tier", + OptionTypeInfo::Enum( + offsetof(struct ImmutableDBOptions, lowest_used_cache_tier), + &cache_tier_string_map, OptionTypeFlags::kNone)}, + {"enforce_single_del_contracts", + {offsetof(struct ImmutableDBOptions, enforce_single_del_contracts), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +const std::string OptionsHelper::kDBOptionsName = "DBOptions"; + +class MutableDBConfigurable : public Configurable { + public: + explicit MutableDBConfigurable( + const MutableDBOptions& mdb, + const std::unordered_map* map = nullptr) + : mutable_(mdb), opt_map_(map) { + RegisterOptions(&mutable_, &db_mutable_options_type_info); + } + + bool OptionsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, const void* const this_ptr, + const void* const that_ptr, + std::string* mismatch) const override { + bool equals = opt_info.AreEqual(config_options, opt_name, this_ptr, + that_ptr, mismatch); + if (!equals && opt_info.IsByName()) { + if (opt_map_ == nullptr) { + equals = true; + } else { + const auto& iter = opt_map_->find(opt_name); + if (iter == opt_map_->end()) { + equals = true; + } else { + equals = opt_info.AreEqualByName(config_options, opt_name, this_ptr, + iter->second); + } + } + if (equals) { // False alarm, clear mismatch + *mismatch = ""; + } + } + if (equals && opt_info.IsConfigurable() && opt_map_ != nullptr) { + const auto* this_config = opt_info.AsRawPointer(this_ptr); + if (this_config == nullptr) { + const auto& iter = opt_map_->find(opt_name); + // If the name exists in the map and is not empty/null, + // then the this_config should be set. 
+ if (iter != opt_map_->end() && !iter->second.empty() && + iter->second != kNullptrString) { + *mismatch = opt_name; + equals = false; + } + } + } + return equals; + } + + protected: + MutableDBOptions mutable_; + const std::unordered_map* opt_map_; +}; + +class DBOptionsConfigurable : public MutableDBConfigurable { + public: + explicit DBOptionsConfigurable( + const DBOptions& opts, + const std::unordered_map* map = nullptr) + : MutableDBConfigurable(MutableDBOptions(opts), map), db_options_(opts) { + // The ImmutableDBOptions currently requires the env to be non-null. Make + // sure it is + if (opts.env != nullptr) { + immutable_ = ImmutableDBOptions(opts); + } else { + DBOptions copy = opts; + copy.env = Env::Default(); + immutable_ = ImmutableDBOptions(copy); + } + RegisterOptions(&immutable_, &db_immutable_options_type_info); + } + + protected: + Status ConfigureOptions( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused) override { + Status s = Configurable::ConfigureOptions(config_options, opts_map, unused); + if (s.ok()) { + db_options_ = BuildDBOptions(immutable_, mutable_); + s = PrepareOptions(config_options); + } + return s; + } + + const void* GetOptionsPtr(const std::string& name) const override { + if (name == OptionsHelper::kDBOptionsName) { + return &db_options_; + } else { + return MutableDBConfigurable::GetOptionsPtr(name); + } + } + + private: + ImmutableDBOptions immutable_; + DBOptions db_options_; +}; + +std::unique_ptr DBOptionsAsConfigurable( + const MutableDBOptions& opts) { + std::unique_ptr ptr(new MutableDBConfigurable(opts)); + return ptr; +} +std::unique_ptr DBOptionsAsConfigurable( + const DBOptions& opts, + const std::unordered_map* opt_map) { + std::unique_ptr ptr(new DBOptionsConfigurable(opts, opt_map)); + return ptr; +} + +ImmutableDBOptions::ImmutableDBOptions() : ImmutableDBOptions(Options()) {} + +ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) + : create_if_missing(options.create_if_missing), + create_missing_column_families(options.create_missing_column_families), + error_if_exists(options.error_if_exists), + paranoid_checks(options.paranoid_checks), + flush_verify_memtable_count(options.flush_verify_memtable_count), + track_and_verify_wals_in_manifest( + options.track_and_verify_wals_in_manifest), + verify_sst_unique_id_in_manifest( + options.verify_sst_unique_id_in_manifest), + env(options.env), + rate_limiter(options.rate_limiter), + sst_file_manager(options.sst_file_manager), + info_log(options.info_log), + info_log_level(options.info_log_level), + max_file_opening_threads(options.max_file_opening_threads), + statistics(options.statistics), + use_fsync(options.use_fsync), + db_paths(options.db_paths), + db_log_dir(options.db_log_dir), + wal_dir(options.wal_dir), + max_log_file_size(options.max_log_file_size), + log_file_time_to_roll(options.log_file_time_to_roll), + keep_log_file_num(options.keep_log_file_num), + recycle_log_file_num(options.recycle_log_file_num), + max_manifest_file_size(options.max_manifest_file_size), + table_cache_numshardbits(options.table_cache_numshardbits), + WAL_ttl_seconds(options.WAL_ttl_seconds), + WAL_size_limit_MB(options.WAL_size_limit_MB), + max_write_batch_group_size_bytes( + options.max_write_batch_group_size_bytes), + manifest_preallocation_size(options.manifest_preallocation_size), + allow_mmap_reads(options.allow_mmap_reads), + allow_mmap_writes(options.allow_mmap_writes), + use_direct_reads(options.use_direct_reads), + 
use_direct_io_for_flush_and_compaction( + options.use_direct_io_for_flush_and_compaction), + allow_fallocate(options.allow_fallocate), + is_fd_close_on_exec(options.is_fd_close_on_exec), + advise_random_on_open(options.advise_random_on_open), + db_write_buffer_size(options.db_write_buffer_size), + write_buffer_manager(options.write_buffer_manager), + access_hint_on_compaction_start(options.access_hint_on_compaction_start), + random_access_max_buffer_size(options.random_access_max_buffer_size), + use_adaptive_mutex(options.use_adaptive_mutex), + listeners(options.listeners), + enable_thread_tracking(options.enable_thread_tracking), + enable_pipelined_write(options.enable_pipelined_write), + unordered_write(options.unordered_write), + allow_concurrent_memtable_write(options.allow_concurrent_memtable_write), + enable_write_thread_adaptive_yield( + options.enable_write_thread_adaptive_yield), + write_thread_max_yield_usec(options.write_thread_max_yield_usec), + write_thread_slow_yield_usec(options.write_thread_slow_yield_usec), + skip_stats_update_on_db_open(options.skip_stats_update_on_db_open), + skip_checking_sst_file_sizes_on_db_open( + options.skip_checking_sst_file_sizes_on_db_open), + wal_recovery_mode(options.wal_recovery_mode), + allow_2pc(options.allow_2pc), + row_cache(options.row_cache), + wal_filter(options.wal_filter), + fail_if_options_file_error(options.fail_if_options_file_error), + dump_malloc_stats(options.dump_malloc_stats), + avoid_flush_during_recovery(options.avoid_flush_during_recovery), + allow_ingest_behind(options.allow_ingest_behind), + two_write_queues(options.two_write_queues), + manual_wal_flush(options.manual_wal_flush), + wal_compression(options.wal_compression), + atomic_flush(options.atomic_flush), + avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io), + persist_stats_to_disk(options.persist_stats_to_disk), + write_dbid_to_manifest(options.write_dbid_to_manifest), + log_readahead_size(options.log_readahead_size), + file_checksum_gen_factory(options.file_checksum_gen_factory), + best_efforts_recovery(options.best_efforts_recovery), + max_bgerror_resume_count(options.max_bgerror_resume_count), + bgerror_resume_retry_interval(options.bgerror_resume_retry_interval), + allow_data_in_errors(options.allow_data_in_errors), + db_host_id(options.db_host_id), + checksum_handoff_file_types(options.checksum_handoff_file_types), + lowest_used_cache_tier(options.lowest_used_cache_tier), + compaction_service(options.compaction_service), + enforce_single_del_contracts(options.enforce_single_del_contracts) { + fs = env->GetFileSystem(); + clock = env->GetSystemClock().get(); + logger = info_log.get(); + stats = statistics.get(); +} + +void ImmutableDBOptions::Dump(Logger* log) const { + ROCKS_LOG_HEADER(log, " Options.error_if_exists: %d", + error_if_exists); + ROCKS_LOG_HEADER(log, " Options.create_if_missing: %d", + create_if_missing); + ROCKS_LOG_HEADER(log, " Options.paranoid_checks: %d", + paranoid_checks); + ROCKS_LOG_HEADER(log, " Options.flush_verify_memtable_count: %d", + flush_verify_memtable_count); + ROCKS_LOG_HEADER(log, + " " + "Options.track_and_verify_wals_in_manifest: %d", + track_and_verify_wals_in_manifest); + ROCKS_LOG_HEADER(log, " Options.verify_sst_unique_id_in_manifest: %d", + verify_sst_unique_id_in_manifest); + ROCKS_LOG_HEADER(log, " Options.env: %p", + env); + ROCKS_LOG_HEADER(log, " Options.fs: %s", + fs->Name()); + ROCKS_LOG_HEADER(log, " Options.info_log: %p", + info_log.get()); + ROCKS_LOG_HEADER(log, " 
Options.max_file_opening_threads: %d", + max_file_opening_threads); + ROCKS_LOG_HEADER(log, " Options.statistics: %p", + stats); + ROCKS_LOG_HEADER(log, " Options.use_fsync: %d", + use_fsync); + ROCKS_LOG_HEADER( + log, " Options.max_log_file_size: %" ROCKSDB_PRIszt, + max_log_file_size); + ROCKS_LOG_HEADER(log, + " Options.max_manifest_file_size: %" PRIu64, + max_manifest_file_size); + ROCKS_LOG_HEADER( + log, " Options.log_file_time_to_roll: %" ROCKSDB_PRIszt, + log_file_time_to_roll); + ROCKS_LOG_HEADER( + log, " Options.keep_log_file_num: %" ROCKSDB_PRIszt, + keep_log_file_num); + ROCKS_LOG_HEADER( + log, " Options.recycle_log_file_num: %" ROCKSDB_PRIszt, + recycle_log_file_num); + ROCKS_LOG_HEADER(log, " Options.allow_fallocate: %d", + allow_fallocate); + ROCKS_LOG_HEADER(log, " Options.allow_mmap_reads: %d", + allow_mmap_reads); + ROCKS_LOG_HEADER(log, " Options.allow_mmap_writes: %d", + allow_mmap_writes); + ROCKS_LOG_HEADER(log, " Options.use_direct_reads: %d", + use_direct_reads); + ROCKS_LOG_HEADER(log, + " " + "Options.use_direct_io_for_flush_and_compaction: %d", + use_direct_io_for_flush_and_compaction); + ROCKS_LOG_HEADER(log, " Options.create_missing_column_families: %d", + create_missing_column_families); + ROCKS_LOG_HEADER(log, " Options.db_log_dir: %s", + db_log_dir.c_str()); + ROCKS_LOG_HEADER(log, " Options.wal_dir: %s", + wal_dir.c_str()); + ROCKS_LOG_HEADER(log, " Options.table_cache_numshardbits: %d", + table_cache_numshardbits); + ROCKS_LOG_HEADER(log, + " Options.WAL_ttl_seconds: %" PRIu64, + WAL_ttl_seconds); + ROCKS_LOG_HEADER(log, + " Options.WAL_size_limit_MB: %" PRIu64, + WAL_size_limit_MB); + ROCKS_LOG_HEADER(log, + " " + "Options.max_write_batch_group_size_bytes: %" PRIu64, + max_write_batch_group_size_bytes); + ROCKS_LOG_HEADER( + log, " Options.manifest_preallocation_size: %" ROCKSDB_PRIszt, + manifest_preallocation_size); + ROCKS_LOG_HEADER(log, " Options.is_fd_close_on_exec: %d", + is_fd_close_on_exec); + ROCKS_LOG_HEADER(log, " Options.advise_random_on_open: %d", + advise_random_on_open); + ROCKS_LOG_HEADER( + log, " Options.db_write_buffer_size: %" ROCKSDB_PRIszt, + db_write_buffer_size); + ROCKS_LOG_HEADER(log, " Options.write_buffer_manager: %p", + write_buffer_manager.get()); + ROCKS_LOG_HEADER(log, " Options.access_hint_on_compaction_start: %d", + static_cast(access_hint_on_compaction_start)); + ROCKS_LOG_HEADER( + log, " Options.random_access_max_buffer_size: %" ROCKSDB_PRIszt, + random_access_max_buffer_size); + ROCKS_LOG_HEADER(log, " Options.use_adaptive_mutex: %d", + use_adaptive_mutex); + ROCKS_LOG_HEADER(log, " Options.rate_limiter: %p", + rate_limiter.get()); + Header( + log, " Options.sst_file_manager.rate_bytes_per_sec: %" PRIi64, + sst_file_manager ? 
sst_file_manager->GetDeleteRateBytesPerSecond() : 0); + ROCKS_LOG_HEADER(log, " Options.wal_recovery_mode: %d", + static_cast(wal_recovery_mode)); + ROCKS_LOG_HEADER(log, " Options.enable_thread_tracking: %d", + enable_thread_tracking); + ROCKS_LOG_HEADER(log, " Options.enable_pipelined_write: %d", + enable_pipelined_write); + ROCKS_LOG_HEADER(log, " Options.unordered_write: %d", + unordered_write); + ROCKS_LOG_HEADER(log, " Options.allow_concurrent_memtable_write: %d", + allow_concurrent_memtable_write); + ROCKS_LOG_HEADER(log, " Options.enable_write_thread_adaptive_yield: %d", + enable_write_thread_adaptive_yield); + ROCKS_LOG_HEADER(log, + " Options.write_thread_max_yield_usec: %" PRIu64, + write_thread_max_yield_usec); + ROCKS_LOG_HEADER(log, + " Options.write_thread_slow_yield_usec: %" PRIu64, + write_thread_slow_yield_usec); + if (row_cache) { + ROCKS_LOG_HEADER( + log, + " Options.row_cache: %" ROCKSDB_PRIszt, + row_cache->GetCapacity()); + } else { + ROCKS_LOG_HEADER(log, + " Options.row_cache: None"); + } + ROCKS_LOG_HEADER(log, " Options.wal_filter: %s", + wal_filter ? wal_filter->Name() : "None"); + + ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_recovery: %d", + avoid_flush_during_recovery); + ROCKS_LOG_HEADER(log, " Options.allow_ingest_behind: %d", + allow_ingest_behind); + ROCKS_LOG_HEADER(log, " Options.two_write_queues: %d", + two_write_queues); + ROCKS_LOG_HEADER(log, " Options.manual_wal_flush: %d", + manual_wal_flush); + ROCKS_LOG_HEADER(log, " Options.wal_compression: %d", + wal_compression); + ROCKS_LOG_HEADER(log, " Options.atomic_flush: %d", atomic_flush); + ROCKS_LOG_HEADER(log, + " Options.avoid_unnecessary_blocking_io: %d", + avoid_unnecessary_blocking_io); + ROCKS_LOG_HEADER(log, " Options.persist_stats_to_disk: %u", + persist_stats_to_disk); + ROCKS_LOG_HEADER(log, " Options.write_dbid_to_manifest: %d", + write_dbid_to_manifest); + ROCKS_LOG_HEADER( + log, " Options.log_readahead_size: %" ROCKSDB_PRIszt, + log_readahead_size); + ROCKS_LOG_HEADER(log, " Options.file_checksum_gen_factory: %s", + file_checksum_gen_factory ? file_checksum_gen_factory->Name() + : kUnknownFileChecksumFuncName); + ROCKS_LOG_HEADER(log, " Options.best_efforts_recovery: %d", + static_cast(best_efforts_recovery)); + ROCKS_LOG_HEADER(log, " Options.max_bgerror_resume_count: %d", + max_bgerror_resume_count); + ROCKS_LOG_HEADER(log, + " Options.bgerror_resume_retry_interval: %" PRIu64, + bgerror_resume_retry_interval); + ROCKS_LOG_HEADER(log, " Options.allow_data_in_errors: %d", + allow_data_in_errors); + ROCKS_LOG_HEADER(log, " Options.db_host_id: %s", + db_host_id.c_str()); + ROCKS_LOG_HEADER(log, " Options.enforce_single_del_contracts: %s", + enforce_single_del_contracts ? 
"true" : "false"); +} + +bool ImmutableDBOptions::IsWalDirSameAsDBPath() const { + assert(!db_paths.empty()); + return IsWalDirSameAsDBPath(db_paths[0].path); +} + +bool ImmutableDBOptions::IsWalDirSameAsDBPath( + const std::string& db_path) const { + bool same = wal_dir.empty(); + if (!same) { + Status s = env->AreFilesSame(wal_dir, db_path, &same); + if (s.IsNotSupported()) { + same = wal_dir == db_path; + } + } + return same; +} + +const std::string& ImmutableDBOptions::GetWalDir() const { + if (wal_dir.empty()) { + assert(!db_paths.empty()); + return db_paths[0].path; + } else { + return wal_dir; + } +} + +const std::string& ImmutableDBOptions::GetWalDir( + const std::string& path) const { + if (wal_dir.empty()) { + return path; + } else { + return wal_dir; + } +} + +MutableDBOptions::MutableDBOptions() + : max_background_jobs(2), + max_background_compactions(-1), + max_subcompactions(0), + avoid_flush_during_shutdown(false), + writable_file_max_buffer_size(1024 * 1024), + delayed_write_rate(2 * 1024U * 1024U), + max_total_wal_size(0), + delete_obsolete_files_period_micros(6ULL * 60 * 60 * 1000000), + stats_dump_period_sec(600), + stats_persist_period_sec(600), + stats_history_buffer_size(1024 * 1024), + max_open_files(-1), + bytes_per_sync(0), + wal_bytes_per_sync(0), + strict_bytes_per_sync(false), + compaction_readahead_size(0), + max_background_flushes(-1) {} + +MutableDBOptions::MutableDBOptions(const DBOptions& options) + : max_background_jobs(options.max_background_jobs), + max_background_compactions(options.max_background_compactions), + max_subcompactions(options.max_subcompactions), + avoid_flush_during_shutdown(options.avoid_flush_during_shutdown), + writable_file_max_buffer_size(options.writable_file_max_buffer_size), + delayed_write_rate(options.delayed_write_rate), + max_total_wal_size(options.max_total_wal_size), + delete_obsolete_files_period_micros( + options.delete_obsolete_files_period_micros), + stats_dump_period_sec(options.stats_dump_period_sec), + stats_persist_period_sec(options.stats_persist_period_sec), + stats_history_buffer_size(options.stats_history_buffer_size), + max_open_files(options.max_open_files), + bytes_per_sync(options.bytes_per_sync), + wal_bytes_per_sync(options.wal_bytes_per_sync), + strict_bytes_per_sync(options.strict_bytes_per_sync), + compaction_readahead_size(options.compaction_readahead_size), + max_background_flushes(options.max_background_flushes) {} + +void MutableDBOptions::Dump(Logger* log) const { + ROCKS_LOG_HEADER(log, " Options.max_background_jobs: %d", + max_background_jobs); + ROCKS_LOG_HEADER(log, " Options.max_background_compactions: %d", + max_background_compactions); + ROCKS_LOG_HEADER(log, " Options.max_subcompactions: %" PRIu32, + max_subcompactions); + ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_shutdown: %d", + avoid_flush_during_shutdown); + ROCKS_LOG_HEADER( + log, " Options.writable_file_max_buffer_size: %" ROCKSDB_PRIszt, + writable_file_max_buffer_size); + ROCKS_LOG_HEADER(log, " Options.delayed_write_rate : %" PRIu64, + delayed_write_rate); + ROCKS_LOG_HEADER(log, " Options.max_total_wal_size: %" PRIu64, + max_total_wal_size); + ROCKS_LOG_HEADER( + log, " Options.delete_obsolete_files_period_micros: %" PRIu64, + delete_obsolete_files_period_micros); + ROCKS_LOG_HEADER(log, " Options.stats_dump_period_sec: %u", + stats_dump_period_sec); + ROCKS_LOG_HEADER(log, " Options.stats_persist_period_sec: %d", + stats_persist_period_sec); + ROCKS_LOG_HEADER( + log, + " Options.stats_history_buffer_size: %" 
ROCKSDB_PRIszt,
+      stats_history_buffer_size);
+  ROCKS_LOG_HEADER(log, " Options.max_open_files: %d",
+                   max_open_files);
+  ROCKS_LOG_HEADER(log,
+                   " Options.bytes_per_sync: %" PRIu64,
+                   bytes_per_sync);
+  ROCKS_LOG_HEADER(log,
+                   " Options.wal_bytes_per_sync: %" PRIu64,
+                   wal_bytes_per_sync);
+  ROCKS_LOG_HEADER(log,
+                   " Options.strict_bytes_per_sync: %d",
+                   strict_bytes_per_sync);
+  ROCKS_LOG_HEADER(log,
+                   " Options.compaction_readahead_size: %" ROCKSDB_PRIszt,
+                   compaction_readahead_size);
+  ROCKS_LOG_HEADER(log, " Options.max_background_flushes: %d",
+                   max_background_flushes);
+}
+
+Status GetMutableDBOptionsFromStrings(
+    const MutableDBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& options_map,
+    MutableDBOptions* new_options) {
+  assert(new_options);
+  *new_options = base_options;
+  ConfigOptions config_options;
+  Status s = OptionTypeInfo::ParseType(
+      config_options, options_map, db_mutable_options_type_info, new_options);
+  if (!s.ok()) {
+    *new_options = base_options;
+  }
+  return s;
+}
+
+bool MutableDBOptionsAreEqual(const MutableDBOptions& this_options,
+                              const MutableDBOptions& that_options) {
+  ConfigOptions config_options;
+  std::string mismatch;
+  return OptionTypeInfo::StructsAreEqual(
+      config_options, "MutableDBOptions", &db_mutable_options_type_info,
+      "MutableDBOptions", &this_options, &that_options, &mismatch);
+}
+
+Status GetStringFromMutableDBOptions(const ConfigOptions& config_options,
+                                     const MutableDBOptions& mutable_opts,
+                                     std::string* opt_string) {
+  return OptionTypeInfo::SerializeType(
+      config_options, db_mutable_options_type_info, &mutable_opts, opt_string);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/db_options.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/db_options.h
new file mode 100644
index 0000000000..2a9d98b250
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/db_options.h
@@ -0,0 +1,152 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+
+struct ImmutableDBOptions {
+  static const char* kName() { return "ImmutableDBOptions"; }
+  ImmutableDBOptions();
+  explicit ImmutableDBOptions(const DBOptions& options);
+
+  void Dump(Logger* log) const;
+
+  bool create_if_missing;
+  bool create_missing_column_families;
+  bool error_if_exists;
+  bool paranoid_checks;
+  bool flush_verify_memtable_count;
+  bool track_and_verify_wals_in_manifest;
+  bool verify_sst_unique_id_in_manifest;
+  Env* env;
+  std::shared_ptr<RateLimiter> rate_limiter;
+  std::shared_ptr<SstFileManager> sst_file_manager;
+  std::shared_ptr<Logger> info_log;
+  InfoLogLevel info_log_level;
+  int max_file_opening_threads;
+  std::shared_ptr<Statistics> statistics;
+  bool use_fsync;
+  std::vector<DbPath> db_paths;
+  std::string db_log_dir;
+  // The wal_dir option from the file. To determine the
+  // directory in use, the GetWalDir or IsWalDirSameAsDBPath
+  // methods should be used instead of accessing this variable directly.
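+  // When wal_dir is empty, GetWalDir() falls back to db_paths[0].path,
+  // i.e. the WAL lives in the same directory as the data files.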
+  std::string wal_dir;
+  size_t max_log_file_size;
+  size_t log_file_time_to_roll;
+  size_t keep_log_file_num;
+  size_t recycle_log_file_num;
+  uint64_t max_manifest_file_size;
+  int table_cache_numshardbits;
+  uint64_t WAL_ttl_seconds;
+  uint64_t WAL_size_limit_MB;
+  uint64_t max_write_batch_group_size_bytes;
+  size_t manifest_preallocation_size;
+  bool allow_mmap_reads;
+  bool allow_mmap_writes;
+  bool use_direct_reads;
+  bool use_direct_io_for_flush_and_compaction;
+  bool allow_fallocate;
+  bool is_fd_close_on_exec;
+  bool advise_random_on_open;
+  size_t db_write_buffer_size;
+  std::shared_ptr<WriteBufferManager> write_buffer_manager;
+  DBOptions::AccessHint access_hint_on_compaction_start;
+  size_t random_access_max_buffer_size;
+  bool use_adaptive_mutex;
+  std::vector<std::shared_ptr<EventListener>> listeners;
+  bool enable_thread_tracking;
+  bool enable_pipelined_write;
+  bool unordered_write;
+  bool allow_concurrent_memtable_write;
+  bool enable_write_thread_adaptive_yield;
+  uint64_t write_thread_max_yield_usec;
+  uint64_t write_thread_slow_yield_usec;
+  bool skip_stats_update_on_db_open;
+  bool skip_checking_sst_file_sizes_on_db_open;
+  WALRecoveryMode wal_recovery_mode;
+  bool allow_2pc;
+  std::shared_ptr<Cache> row_cache;
+  WalFilter* wal_filter;
+  bool fail_if_options_file_error;
+  bool dump_malloc_stats;
+  bool avoid_flush_during_recovery;
+  bool allow_ingest_behind;
+  bool two_write_queues;
+  bool manual_wal_flush;
+  CompressionType wal_compression;
+  bool atomic_flush;
+  bool avoid_unnecessary_blocking_io;
+  bool persist_stats_to_disk;
+  bool write_dbid_to_manifest;
+  size_t log_readahead_size;
+  std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory;
+  bool best_efforts_recovery;
+  int max_bgerror_resume_count;
+  uint64_t bgerror_resume_retry_interval;
+  bool allow_data_in_errors;
+  std::string db_host_id;
+  FileTypeSet checksum_handoff_file_types;
+  CacheTier lowest_used_cache_tier;
+  // Convenience/Helper objects that are not part of the base DBOptions
+  std::shared_ptr<FileSystem> fs;
+  SystemClock* clock;
+  Statistics* stats;
+  Logger* logger;
+  std::shared_ptr<CompactionService> compaction_service;
+  bool enforce_single_del_contracts;
+
+  bool IsWalDirSameAsDBPath() const;
+  bool IsWalDirSameAsDBPath(const std::string& path) const;
+  const std::string& GetWalDir() const;
+  const std::string& GetWalDir(const std::string& path) const;
+};
+
+struct MutableDBOptions {
+  static const char* kName() { return "MutableDBOptions"; }
+  MutableDBOptions();
+  explicit MutableDBOptions(const DBOptions& options);
+
+  void Dump(Logger* log) const;
+
+  int max_background_jobs;
+  int max_background_compactions;
+  uint32_t max_subcompactions;
+  bool avoid_flush_during_shutdown;
+  size_t writable_file_max_buffer_size;
+  uint64_t delayed_write_rate;
+  uint64_t max_total_wal_size;
+  uint64_t delete_obsolete_files_period_micros;
+  unsigned int stats_dump_period_sec;
+  unsigned int stats_persist_period_sec;
+  size_t stats_history_buffer_size;
+  int max_open_files;
+  uint64_t bytes_per_sync;
+  uint64_t wal_bytes_per_sync;
+  bool strict_bytes_per_sync;
+  size_t compaction_readahead_size;
+  int max_background_flushes;
+};
+
+Status GetStringFromMutableDBOptions(const ConfigOptions& config_options,
+                                     const MutableDBOptions& mutable_opts,
+                                     std::string* opt_string);
+
+Status GetMutableDBOptionsFromStrings(
+    const MutableDBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& options_map,
+    MutableDBOptions* new_options);
+
+bool MutableDBOptionsAreEqual(const MutableDBOptions& this_options,
+                              const MutableDBOptions& that_options);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options.cc
new file mode 100644
index 0000000000..1caebdfb2f
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options.cc
@@ -0,0 +1,692 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/options.h"
+
+#include <cinttypes>
+#include <limits>
+
+#include "logging/logging.h"
+#include "monitoring/statistics_impl.h"
+#include "options/db_options.h"
+#include "options/options_helper.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/sst_file_manager.h"
+#include "rocksdb/sst_partitioner.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/wal_filter.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions() {
+  assert(memtable_factory.get() != nullptr);
+}
+
+AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
+    : max_write_buffer_number(options.max_write_buffer_number),
+      min_write_buffer_number_to_merge(
+          options.min_write_buffer_number_to_merge),
+      max_write_buffer_number_to_maintain(
+          options.max_write_buffer_number_to_maintain),
+      max_write_buffer_size_to_maintain(
+          options.max_write_buffer_size_to_maintain),
+      inplace_update_support(options.inplace_update_support),
+      inplace_update_num_locks(options.inplace_update_num_locks),
+      experimental_mempurge_threshold(options.experimental_mempurge_threshold),
+      inplace_callback(options.inplace_callback),
+      memtable_prefix_bloom_size_ratio(
+          options.memtable_prefix_bloom_size_ratio),
+      memtable_whole_key_filtering(options.memtable_whole_key_filtering),
+      memtable_huge_page_size(options.memtable_huge_page_size),
+      memtable_insert_with_hint_prefix_extractor(
+          options.memtable_insert_with_hint_prefix_extractor),
+      bloom_locality(options.bloom_locality),
+      arena_block_size(options.arena_block_size),
+      compression_per_level(options.compression_per_level),
+      num_levels(options.num_levels),
+      level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
+      level0_stop_writes_trigger(options.level0_stop_writes_trigger),
+      target_file_size_base(options.target_file_size_base),
+      target_file_size_multiplier(options.target_file_size_multiplier),
+      level_compaction_dynamic_level_bytes(
+          options.level_compaction_dynamic_level_bytes),
+      max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
+      max_bytes_for_level_multiplier_additional(
+          options.max_bytes_for_level_multiplier_additional),
+      max_compaction_bytes(options.max_compaction_bytes),
+      ignore_max_compaction_bytes_for_input(
+          options.ignore_max_compaction_bytes_for_input),
+      soft_pending_compaction_bytes_limit(
+          options.soft_pending_compaction_bytes_limit),
+      hard_pending_compaction_bytes_limit(
+          options.hard_pending_compaction_bytes_limit),
+      compaction_style(options.compaction_style),
+      compaction_pri(options.compaction_pri),
+      compaction_options_universal(options.compaction_options_universal),
+      compaction_options_fifo(options.compaction_options_fifo),
+      max_sequential_skip_in_iterations(
+          options.max_sequential_skip_in_iterations),
+      memtable_factory(options.memtable_factory),
+      table_properties_collector_factories(
+          options.table_properties_collector_factories),
+      max_successive_merges(options.max_successive_merges),
+      optimize_filters_for_hits(options.optimize_filters_for_hits),
+      paranoid_file_checks(options.paranoid_file_checks),
+      force_consistency_checks(options.force_consistency_checks),
+      report_bg_io_stats(options.report_bg_io_stats),
+      ttl(options.ttl),
+      periodic_compaction_seconds(options.periodic_compaction_seconds),
+      sample_for_compression(options.sample_for_compression),
+      preclude_last_level_data_seconds(
+          options.preclude_last_level_data_seconds),
+      preserve_internal_time_seconds(options.preserve_internal_time_seconds),
+      enable_blob_files(options.enable_blob_files),
+      min_blob_size(options.min_blob_size),
+      blob_file_size(options.blob_file_size),
+      blob_compression_type(options.blob_compression_type),
+      enable_blob_garbage_collection(options.enable_blob_garbage_collection),
+      blob_garbage_collection_age_cutoff(
+          options.blob_garbage_collection_age_cutoff),
+      blob_garbage_collection_force_threshold(
+          options.blob_garbage_collection_force_threshold),
+      blob_compaction_readahead_size(options.blob_compaction_readahead_size),
+      blob_file_starting_level(options.blob_file_starting_level),
+      blob_cache(options.blob_cache),
+      prepopulate_blob_cache(options.prepopulate_blob_cache),
+      persist_user_defined_timestamps(options.persist_user_defined_timestamps) {
+  assert(memtable_factory.get() != nullptr);
+  if (max_bytes_for_level_multiplier_additional.size() <
+      static_cast<unsigned int>(num_levels)) {
+    max_bytes_for_level_multiplier_additional.resize(num_levels, 1);
+  }
+}
+
+ColumnFamilyOptions::ColumnFamilyOptions()
+    : compression(Snappy_Supported() ? kSnappyCompression : kNoCompression),
+      table_factory(
+          std::shared_ptr<TableFactory>(new BlockBasedTableFactory())) {}
+
+ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
+    : ColumnFamilyOptions(*static_cast<const ColumnFamilyOptions*>(&options)) {}
+
+DBOptions::DBOptions() {}
+DBOptions::DBOptions(const Options& options)
+    : DBOptions(*static_cast<const DBOptions*>(&options)) {}
+
+void DBOptions::Dump(Logger* log) const {
+  ImmutableDBOptions(*this).Dump(log);
+  MutableDBOptions(*this).Dump(log);
+}  // DBOptions::Dump
+
+void ColumnFamilyOptions::Dump(Logger* log) const {
+  ROCKS_LOG_HEADER(log, " Options.comparator: %s",
+                   comparator->Name());
+  if (comparator->timestamp_size() > 0) {
+    ROCKS_LOG_HEADER(
+        log, " Options.persist_user_defined_timestamps: %s",
+        persist_user_defined_timestamps ? "true" : "false");
+  }
+  ROCKS_LOG_HEADER(log, " Options.merge_operator: %s",
+                   merge_operator ? merge_operator->Name() : "None");
+  ROCKS_LOG_HEADER(log, " Options.compaction_filter: %s",
+                   compaction_filter ? compaction_filter->Name() : "None");
+  ROCKS_LOG_HEADER(
+      log, " Options.compaction_filter_factory: %s",
+      compaction_filter_factory ? compaction_filter_factory->Name() : "None");
+  ROCKS_LOG_HEADER(
+      log, " Options.sst_partitioner_factory: %s",
+      sst_partitioner_factory ?
sst_partitioner_factory->Name() : "None"); + ROCKS_LOG_HEADER(log, " Options.memtable_factory: %s", + memtable_factory->Name()); + ROCKS_LOG_HEADER(log, " Options.table_factory: %s", + table_factory->Name()); + ROCKS_LOG_HEADER(log, " table_factory options: %s", + table_factory->GetPrintableOptions().c_str()); + ROCKS_LOG_HEADER(log, " Options.write_buffer_size: %" ROCKSDB_PRIszt, + write_buffer_size); + ROCKS_LOG_HEADER(log, " Options.max_write_buffer_number: %d", + max_write_buffer_number); + if (!compression_per_level.empty()) { + for (unsigned int i = 0; i < compression_per_level.size(); i++) { + ROCKS_LOG_HEADER( + log, " Options.compression[%d]: %s", i, + CompressionTypeToString(compression_per_level[i]).c_str()); + } + } else { + ROCKS_LOG_HEADER(log, " Options.compression: %s", + CompressionTypeToString(compression).c_str()); + } + ROCKS_LOG_HEADER( + log, " Options.bottommost_compression: %s", + bottommost_compression == kDisableCompressionOption + ? "Disabled" + : CompressionTypeToString(bottommost_compression).c_str()); + ROCKS_LOG_HEADER( + log, " Options.prefix_extractor: %s", + prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name()); + ROCKS_LOG_HEADER(log, + " Options.memtable_insert_with_hint_prefix_extractor: %s", + memtable_insert_with_hint_prefix_extractor == nullptr + ? "nullptr" + : memtable_insert_with_hint_prefix_extractor->Name()); + ROCKS_LOG_HEADER(log, " Options.num_levels: %d", num_levels); + ROCKS_LOG_HEADER(log, " Options.min_write_buffer_number_to_merge: %d", + min_write_buffer_number_to_merge); + ROCKS_LOG_HEADER(log, " Options.max_write_buffer_number_to_maintain: %d", + max_write_buffer_number_to_maintain); + ROCKS_LOG_HEADER(log, + " Options.max_write_buffer_size_to_maintain: %" PRIu64, + max_write_buffer_size_to_maintain); + ROCKS_LOG_HEADER( + log, " Options.bottommost_compression_opts.window_bits: %d", + bottommost_compression_opts.window_bits); + ROCKS_LOG_HEADER( + log, " Options.bottommost_compression_opts.level: %d", + bottommost_compression_opts.level); + ROCKS_LOG_HEADER( + log, " Options.bottommost_compression_opts.strategy: %d", + bottommost_compression_opts.strategy); + ROCKS_LOG_HEADER( + log, + " Options.bottommost_compression_opts.max_dict_bytes: " + "%" PRIu32, + bottommost_compression_opts.max_dict_bytes); + ROCKS_LOG_HEADER( + log, + " Options.bottommost_compression_opts.zstd_max_train_bytes: " + "%" PRIu32, + bottommost_compression_opts.zstd_max_train_bytes); + ROCKS_LOG_HEADER( + log, + " Options.bottommost_compression_opts.parallel_threads: " + "%" PRIu32, + bottommost_compression_opts.parallel_threads); + ROCKS_LOG_HEADER( + log, " Options.bottommost_compression_opts.enabled: %s", + bottommost_compression_opts.enabled ? "true" : "false"); + ROCKS_LOG_HEADER( + log, + " Options.bottommost_compression_opts.max_dict_buffer_bytes: " + "%" PRIu64, + bottommost_compression_opts.max_dict_buffer_bytes); + ROCKS_LOG_HEADER( + log, + " Options.bottommost_compression_opts.use_zstd_dict_trainer: %s", + bottommost_compression_opts.use_zstd_dict_trainer ? 
"true" : "false"); + ROCKS_LOG_HEADER(log, " Options.compression_opts.window_bits: %d", + compression_opts.window_bits); + ROCKS_LOG_HEADER(log, " Options.compression_opts.level: %d", + compression_opts.level); + ROCKS_LOG_HEADER(log, " Options.compression_opts.strategy: %d", + compression_opts.strategy); + ROCKS_LOG_HEADER( + log, + " Options.compression_opts.max_dict_bytes: %" PRIu32, + compression_opts.max_dict_bytes); + ROCKS_LOG_HEADER(log, + " Options.compression_opts.zstd_max_train_bytes: " + "%" PRIu32, + compression_opts.zstd_max_train_bytes); + ROCKS_LOG_HEADER( + log, " Options.compression_opts.use_zstd_dict_trainer: %s", + compression_opts.use_zstd_dict_trainer ? "true" : "false"); + ROCKS_LOG_HEADER(log, + " Options.compression_opts.parallel_threads: " + "%" PRIu32, + compression_opts.parallel_threads); + ROCKS_LOG_HEADER(log, + " Options.compression_opts.enabled: %s", + compression_opts.enabled ? "true" : "false"); + ROCKS_LOG_HEADER(log, + " Options.compression_opts.max_dict_buffer_bytes: " + "%" PRIu64, + compression_opts.max_dict_buffer_bytes); + ROCKS_LOG_HEADER(log, " Options.level0_file_num_compaction_trigger: %d", + level0_file_num_compaction_trigger); + ROCKS_LOG_HEADER(log, " Options.level0_slowdown_writes_trigger: %d", + level0_slowdown_writes_trigger); + ROCKS_LOG_HEADER(log, " Options.level0_stop_writes_trigger: %d", + level0_stop_writes_trigger); + ROCKS_LOG_HEADER( + log, " Options.target_file_size_base: %" PRIu64, + target_file_size_base); + ROCKS_LOG_HEADER(log, " Options.target_file_size_multiplier: %d", + target_file_size_multiplier); + ROCKS_LOG_HEADER( + log, " Options.max_bytes_for_level_base: %" PRIu64, + max_bytes_for_level_base); + ROCKS_LOG_HEADER(log, "Options.level_compaction_dynamic_level_bytes: %d", + level_compaction_dynamic_level_bytes); + ROCKS_LOG_HEADER(log, " Options.max_bytes_for_level_multiplier: %f", + max_bytes_for_level_multiplier); + for (size_t i = 0; i < max_bytes_for_level_multiplier_additional.size(); + i++) { + ROCKS_LOG_HEADER( + log, "Options.max_bytes_for_level_multiplier_addtl[%" ROCKSDB_PRIszt + "]: %d", + i, max_bytes_for_level_multiplier_additional[i]); + } + ROCKS_LOG_HEADER( + log, " Options.max_sequential_skip_in_iterations: %" PRIu64, + max_sequential_skip_in_iterations); + ROCKS_LOG_HEADER( + log, " Options.max_compaction_bytes: %" PRIu64, + max_compaction_bytes); + ROCKS_LOG_HEADER(log, " Options.ignore_max_compaction_bytes_for_input: %s", + ignore_max_compaction_bytes_for_input ? 
"true" : "false"); + ROCKS_LOG_HEADER( + log, + " Options.arena_block_size: %" ROCKSDB_PRIszt, + arena_block_size); + ROCKS_LOG_HEADER(log, + " Options.soft_pending_compaction_bytes_limit: %" PRIu64, + soft_pending_compaction_bytes_limit); + ROCKS_LOG_HEADER(log, + " Options.hard_pending_compaction_bytes_limit: %" PRIu64, + hard_pending_compaction_bytes_limit); + ROCKS_LOG_HEADER(log, " Options.disable_auto_compactions: %d", + disable_auto_compactions); + + const auto& it_compaction_style = + compaction_style_to_string.find(compaction_style); + std::string str_compaction_style; + if (it_compaction_style == compaction_style_to_string.end()) { + assert(false); + str_compaction_style = "unknown_" + std::to_string(compaction_style); + } else { + str_compaction_style = it_compaction_style->second; + } + ROCKS_LOG_HEADER(log, + " Options.compaction_style: %s", + str_compaction_style.c_str()); + + const auto& it_compaction_pri = + compaction_pri_to_string.find(compaction_pri); + std::string str_compaction_pri; + if (it_compaction_pri == compaction_pri_to_string.end()) { + assert(false); + str_compaction_pri = "unknown_" + std::to_string(compaction_pri); + } else { + str_compaction_pri = it_compaction_pri->second; + } + ROCKS_LOG_HEADER(log, + " Options.compaction_pri: %s", + str_compaction_pri.c_str()); + ROCKS_LOG_HEADER(log, + "Options.compaction_options_universal.size_ratio: %u", + compaction_options_universal.size_ratio); + ROCKS_LOG_HEADER(log, + "Options.compaction_options_universal.min_merge_width: %u", + compaction_options_universal.min_merge_width); + ROCKS_LOG_HEADER(log, + "Options.compaction_options_universal.max_merge_width: %u", + compaction_options_universal.max_merge_width); + ROCKS_LOG_HEADER( + log, + "Options.compaction_options_universal." 
+ "max_size_amplification_percent: %u", + compaction_options_universal.max_size_amplification_percent); + ROCKS_LOG_HEADER( + log, + "Options.compaction_options_universal.compression_size_percent: %d", + compaction_options_universal.compression_size_percent); + const auto& it_compaction_stop_style = compaction_stop_style_to_string.find( + compaction_options_universal.stop_style); + std::string str_compaction_stop_style; + if (it_compaction_stop_style == compaction_stop_style_to_string.end()) { + assert(false); + str_compaction_stop_style = + "unknown_" + std::to_string(compaction_options_universal.stop_style); + } else { + str_compaction_stop_style = it_compaction_stop_style->second; + } + ROCKS_LOG_HEADER(log, + "Options.compaction_options_universal.stop_style: %s", + str_compaction_stop_style.c_str()); + ROCKS_LOG_HEADER( + log, "Options.compaction_options_fifo.max_table_files_size: %" PRIu64, + compaction_options_fifo.max_table_files_size); + ROCKS_LOG_HEADER(log, + "Options.compaction_options_fifo.allow_compaction: %d", + compaction_options_fifo.allow_compaction); + std::ostringstream collector_info; + for (const auto& collector_factory : table_properties_collector_factories) { + collector_info << collector_factory->ToString() << ';'; + } + ROCKS_LOG_HEADER( + log, " Options.table_properties_collectors: %s", + collector_info.str().c_str()); + ROCKS_LOG_HEADER(log, + " Options.inplace_update_support: %d", + inplace_update_support); + ROCKS_LOG_HEADER( + log, + " Options.inplace_update_num_locks: %" ROCKSDB_PRIszt, + inplace_update_num_locks); + // TODO: easier config for bloom (maybe based on avg key/value size) + ROCKS_LOG_HEADER( + log, " Options.memtable_prefix_bloom_size_ratio: %f", + memtable_prefix_bloom_size_ratio); + ROCKS_LOG_HEADER(log, + " Options.memtable_whole_key_filtering: %d", + memtable_whole_key_filtering); + + ROCKS_LOG_HEADER(log, " Options.memtable_huge_page_size: %" ROCKSDB_PRIszt, + memtable_huge_page_size); + ROCKS_LOG_HEADER(log, + " Options.bloom_locality: %d", + bloom_locality); + + ROCKS_LOG_HEADER( + log, + " Options.max_successive_merges: %" ROCKSDB_PRIszt, + max_successive_merges); + ROCKS_LOG_HEADER(log, + " Options.optimize_filters_for_hits: %d", + optimize_filters_for_hits); + ROCKS_LOG_HEADER(log, " Options.paranoid_file_checks: %d", + paranoid_file_checks); + ROCKS_LOG_HEADER(log, " Options.force_consistency_checks: %d", + force_consistency_checks); + ROCKS_LOG_HEADER(log, " Options.report_bg_io_stats: %d", + report_bg_io_stats); + ROCKS_LOG_HEADER(log, " Options.ttl: %" PRIu64, + ttl); + ROCKS_LOG_HEADER(log, + " Options.periodic_compaction_seconds: %" PRIu64, + periodic_compaction_seconds); + ROCKS_LOG_HEADER(log, " Options.preclude_last_level_data_seconds: %" PRIu64, + preclude_last_level_data_seconds); + ROCKS_LOG_HEADER(log, " Options.preserve_internal_time_seconds: %" PRIu64, + preserve_internal_time_seconds); + ROCKS_LOG_HEADER(log, " Options.enable_blob_files: %s", + enable_blob_files ? "true" : "false"); + ROCKS_LOG_HEADER( + log, " Options.min_blob_size: %" PRIu64, + min_blob_size); + ROCKS_LOG_HEADER( + log, " Options.blob_file_size: %" PRIu64, + blob_file_size); + ROCKS_LOG_HEADER(log, " Options.blob_compression_type: %s", + CompressionTypeToString(blob_compression_type).c_str()); + ROCKS_LOG_HEADER(log, " Options.enable_blob_garbage_collection: %s", + enable_blob_garbage_collection ? 
"true" : "false"); + ROCKS_LOG_HEADER(log, " Options.blob_garbage_collection_age_cutoff: %f", + blob_garbage_collection_age_cutoff); + ROCKS_LOG_HEADER(log, "Options.blob_garbage_collection_force_threshold: %f", + blob_garbage_collection_force_threshold); + ROCKS_LOG_HEADER( + log, " Options.blob_compaction_readahead_size: %" PRIu64, + blob_compaction_readahead_size); + ROCKS_LOG_HEADER(log, " Options.blob_file_starting_level: %d", + blob_file_starting_level); + if (blob_cache) { + ROCKS_LOG_HEADER(log, " Options.blob_cache: %s", + blob_cache->Name()); + ROCKS_LOG_HEADER(log, " blob_cache options: %s", + blob_cache->GetPrintableOptions().c_str()); + ROCKS_LOG_HEADER( + log, " blob_cache prepopulated: %s", + prepopulate_blob_cache == PrepopulateBlobCache::kFlushOnly + ? "flush only" + : "disabled"); + } + ROCKS_LOG_HEADER(log, "Options.experimental_mempurge_threshold: %f", + experimental_mempurge_threshold); +} // ColumnFamilyOptions::Dump + +void Options::Dump(Logger* log) const { + DBOptions::Dump(log); + ColumnFamilyOptions::Dump(log); +} // Options::Dump + +void Options::DumpCFOptions(Logger* log) const { + ColumnFamilyOptions::Dump(log); +} // Options::DumpCFOptions + +// +// The goal of this method is to create a configuration that +// allows an application to write all files into L0 and +// then do a single compaction to output all files into L1. +Options* +Options::PrepareForBulkLoad() +{ + // never slowdown ingest. + level0_file_num_compaction_trigger = (1<<30); + level0_slowdown_writes_trigger = (1<<30); + level0_stop_writes_trigger = (1<<30); + soft_pending_compaction_bytes_limit = 0; + hard_pending_compaction_bytes_limit = 0; + + // no auto compactions please. The application should issue a + // manual compaction after all data is loaded into L0. + disable_auto_compactions = true; + // A manual compaction run should pick all files in L0 in + // a single compaction run. + max_compaction_bytes = (static_cast(1) << 60); + + // It is better to have only 2 levels, otherwise a manual + // compaction would compact at every possible level, thereby + // increasing the total time needed for compactions. + num_levels = 2; + + // Need to allow more write buffers to allow more parallism + // of flushes. + max_write_buffer_number = 6; + min_write_buffer_number_to_merge = 1; + + // When compaction is disabled, more parallel flush threads can + // help with write throughput. + max_background_flushes = 4; + + // Prevent a memtable flush to automatically promote files + // to L1. This is helpful so that all files that are + // input to the manual compaction are all at L0. + max_background_compactions = 2; + + // The compaction would create large files in L1. + target_file_size_base = 256 * 1024 * 1024; + return this; +} + +Options* Options::OptimizeForSmallDb() { + // 16MB block cache + std::shared_ptr cache = NewLRUCache(16 << 20); + + ColumnFamilyOptions::OptimizeForSmallDb(&cache); + DBOptions::OptimizeForSmallDb(&cache); + return this; +} + +Options* Options::DisableExtraChecks() { + // See https://github.com/facebook/rocksdb/issues/9354 + force_consistency_checks = false; + // Considered but no clear performance impact seen: + // * check_flush_compaction_key_order + // * paranoid_checks + // * flush_verify_memtable_count + // By current API contract, not including + // * verify_checksums + // because checking storage data integrity is a more standard practice. 
+  return this;
+}
+
+Options* Options::OldDefaults(int rocksdb_major_version,
+                              int rocksdb_minor_version) {
+  ColumnFamilyOptions::OldDefaults(rocksdb_major_version,
+                                   rocksdb_minor_version);
+  DBOptions::OldDefaults(rocksdb_major_version, rocksdb_minor_version);
+  return this;
+}
+
+DBOptions* DBOptions::OldDefaults(int rocksdb_major_version,
+                                  int rocksdb_minor_version) {
+  if (rocksdb_major_version < 4 ||
+      (rocksdb_major_version == 4 && rocksdb_minor_version < 7)) {
+    max_file_opening_threads = 1;
+    table_cache_numshardbits = 4;
+  }
+  if (rocksdb_major_version < 5 ||
+      (rocksdb_major_version == 5 && rocksdb_minor_version < 2)) {
+    delayed_write_rate = 2 * 1024U * 1024U;
+  } else if (rocksdb_major_version < 5 ||
+             (rocksdb_major_version == 5 && rocksdb_minor_version < 6)) {
+    delayed_write_rate = 16 * 1024U * 1024U;
+  }
+  max_open_files = 5000;
+  wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+  return this;
+}
+
+ColumnFamilyOptions* ColumnFamilyOptions::OldDefaults(
+    int rocksdb_major_version, int rocksdb_minor_version) {
+  if (rocksdb_major_version < 5 ||
+      (rocksdb_major_version == 5 && rocksdb_minor_version <= 18)) {
+    compaction_pri = CompactionPri::kByCompensatedSize;
+  }
+  if (rocksdb_major_version < 4 ||
+      (rocksdb_major_version == 4 && rocksdb_minor_version < 7)) {
+    write_buffer_size = 4 << 20;
+    target_file_size_base = 2 * 1048576;
+    max_bytes_for_level_base = 10 * 1048576;
+    soft_pending_compaction_bytes_limit = 0;
+    hard_pending_compaction_bytes_limit = 0;
+  }
+  if (rocksdb_major_version < 5) {
+    level0_stop_writes_trigger = 24;
+  } else if (rocksdb_major_version == 5 && rocksdb_minor_version < 2) {
+    level0_stop_writes_trigger = 30;
+  }
+
+  return this;
+}
+
+// Optimization functions
+DBOptions* DBOptions::OptimizeForSmallDb(std::shared_ptr<Cache>* cache) {
+  max_file_opening_threads = 1;
+  max_open_files = 5000;
+
+  // Cost memtable to block cache too.
+  std::shared_ptr<WriteBufferManager> wbm =
+      std::make_shared<WriteBufferManager>(
+          0, (cache != nullptr) ? *cache : std::shared_ptr<Cache>());
+  write_buffer_manager = wbm;
+
+  return this;
+}
+
+ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb(
+    std::shared_ptr<Cache>* cache) {
+  write_buffer_size = 2 << 20;
+  target_file_size_base = 2 * 1048576;
+  max_bytes_for_level_base = 10 * 1048576;
+  soft_pending_compaction_bytes_limit = 256 * 1048576;
+  hard_pending_compaction_bytes_limit = 1073741824ul;
+
+  BlockBasedTableOptions table_options;
+  table_options.block_cache =
+      (cache != nullptr) ?
*cache : std::shared_ptr<Cache>();
+  table_options.cache_index_and_filter_blocks = true;
+  // Two level iterator to avoid LRU cache imbalance
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  table_factory.reset(new BlockBasedTableFactory(table_options));
+
+  return this;
+}
+
+ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup(
+    uint64_t block_cache_size_mb) {
+  BlockBasedTableOptions block_based_options;
+  block_based_options.data_block_index_type =
+      BlockBasedTableOptions::kDataBlockBinaryAndHash;
+  block_based_options.data_block_hash_table_util_ratio = 0.75;
+  block_based_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  block_based_options.block_cache =
+      NewLRUCache(static_cast<size_t>(block_cache_size_mb * 1024 * 1024));
+  table_factory.reset(new BlockBasedTableFactory(block_based_options));
+  memtable_prefix_bloom_size_ratio = 0.02;
+  memtable_whole_key_filtering = true;
+  return this;
+}
+
+ColumnFamilyOptions* ColumnFamilyOptions::OptimizeLevelStyleCompaction(
+    uint64_t memtable_memory_budget) {
+  write_buffer_size = static_cast<size_t>(memtable_memory_budget / 4);
+  // merge two memtables when flushing to L0
+  min_write_buffer_number_to_merge = 2;
+  // this means we'll use 50% extra memory in the worst case, but will reduce
+  // write stalls.
+  max_write_buffer_number = 6;
+  // start flushing L0->L1 as soon as possible. each file on level0 is
+  // (memtable_memory_budget / 2). This will flush level 0 when it's bigger
+  // than memtable_memory_budget.
+  level0_file_num_compaction_trigger = 2;
+  // doesn't really matter much, but we don't want to create too many files
+  target_file_size_base = memtable_memory_budget / 8;
+  // make Level1 size equal to Level0 size, so that L0->L1 compactions are fast
+  max_bytes_for_level_base = memtable_memory_budget;
+
+  // level style compaction
+  compaction_style = kCompactionStyleLevel;
+
+  // only compress levels >= 2
+  compression_per_level.resize(num_levels);
+  for (int i = 0; i < num_levels; ++i) {
+    if (i < 2) {
+      compression_per_level[i] = kNoCompression;
+    } else {
+      compression_per_level[i] =
+          LZ4_Supported()
+              ? kLZ4Compression
+              : (Snappy_Supported() ? kSnappyCompression : kNoCompression);
+    }
+  }
+  return this;
+}
+
+ColumnFamilyOptions* ColumnFamilyOptions::OptimizeUniversalStyleCompaction(
+    uint64_t memtable_memory_budget) {
+  write_buffer_size = static_cast<size_t>(memtable_memory_budget / 4);
+  // merge two memtables when flushing to L0
+  min_write_buffer_number_to_merge = 2;
+  // this means we'll use 50% extra memory in the worst case, but will reduce
+  // write stalls.
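+  // (Same memtable sizing rationale as in OptimizeLevelStyleCompaction
+  // above.)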
+  max_write_buffer_number = 6;
+  // universal style compaction
+  compaction_style = kCompactionStyleUniversal;
+  compaction_options_universal.compression_size_percent = 80;
+  return this;
+}
+
+DBOptions* DBOptions::IncreaseParallelism(int total_threads) {
+  max_background_jobs = total_threads;
+  env->SetBackgroundThreads(total_threads, Env::LOW);
+  env->SetBackgroundThreads(1, Env::HIGH);
+  return this;
+}
+
+ReadOptions::ReadOptions(bool _verify_checksums, bool _fill_cache)
+    : verify_checksums(_verify_checksums), fill_cache(_fill_cache) {}
+
+ReadOptions::ReadOptions(Env::IOActivity _io_activity)
+    : io_activity(_io_activity) {}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_helper.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_helper.cc
new file mode 100644
index 0000000000..abe5053d22
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_helper.cc
@@ -0,0 +1,1424 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include "options/options_helper.h"
+
+#include <cassert>
+#include <cctype>
+#include <cstdlib>
+#include <set>
+#include <unordered_set>
+#include <vector>
+
+#include "options/cf_options.h"
+#include "options/db_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+ConfigOptions::ConfigOptions()
+    : registry(ObjectRegistry::NewInstance())
+{
+  env = Env::Default();
+}
+
+ConfigOptions::ConfigOptions(const DBOptions& db_opts) : env(db_opts.env) {
+  registry = ObjectRegistry::NewInstance();
+}
+
+Status ValidateOptions(const DBOptions& db_opts,
+                       const ColumnFamilyOptions& cf_opts) {
+  Status s;
+  auto db_cfg = DBOptionsAsConfigurable(db_opts);
+  auto cf_cfg = CFOptionsAsConfigurable(cf_opts);
+  s = db_cfg->ValidateOptions(db_opts, cf_opts);
+  if (s.ok()) s = cf_cfg->ValidateOptions(db_opts, cf_opts);
+  return s;
+}
+
+DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
+                         const MutableDBOptions& mutable_db_options) {
+  DBOptions options;
+
+  options.create_if_missing = immutable_db_options.create_if_missing;
+  options.create_missing_column_families =
+      immutable_db_options.create_missing_column_families;
+  options.error_if_exists = immutable_db_options.error_if_exists;
+  options.paranoid_checks = immutable_db_options.paranoid_checks;
+  options.flush_verify_memtable_count =
+      immutable_db_options.flush_verify_memtable_count;
+  options.track_and_verify_wals_in_manifest =
+      immutable_db_options.track_and_verify_wals_in_manifest;
+  options.verify_sst_unique_id_in_manifest =
+      immutable_db_options.verify_sst_unique_id_in_manifest;
+  options.env = immutable_db_options.env;
+  options.rate_limiter = immutable_db_options.rate_limiter;
+  options.sst_file_manager = immutable_db_options.sst_file_manager;
+  options.info_log = immutable_db_options.info_log;
+  options.info_log_level = immutable_db_options.info_log_level;
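+  // Mutable fields below come from mutable_db_options; everything else is
+  // copied back from immutable_db_options.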
+ options.max_open_files = mutable_db_options.max_open_files; + options.max_file_opening_threads = + immutable_db_options.max_file_opening_threads; + options.max_total_wal_size = mutable_db_options.max_total_wal_size; + options.statistics = immutable_db_options.statistics; + options.use_fsync = immutable_db_options.use_fsync; + options.db_paths = immutable_db_options.db_paths; + options.db_log_dir = immutable_db_options.db_log_dir; + options.wal_dir = immutable_db_options.wal_dir; + options.delete_obsolete_files_period_micros = + mutable_db_options.delete_obsolete_files_period_micros; + options.max_background_jobs = mutable_db_options.max_background_jobs; + options.max_background_compactions = + mutable_db_options.max_background_compactions; + options.bytes_per_sync = mutable_db_options.bytes_per_sync; + options.wal_bytes_per_sync = mutable_db_options.wal_bytes_per_sync; + options.strict_bytes_per_sync = mutable_db_options.strict_bytes_per_sync; + options.max_subcompactions = mutable_db_options.max_subcompactions; + options.max_background_flushes = mutable_db_options.max_background_flushes; + options.max_log_file_size = immutable_db_options.max_log_file_size; + options.log_file_time_to_roll = immutable_db_options.log_file_time_to_roll; + options.keep_log_file_num = immutable_db_options.keep_log_file_num; + options.recycle_log_file_num = immutable_db_options.recycle_log_file_num; + options.max_manifest_file_size = immutable_db_options.max_manifest_file_size; + options.table_cache_numshardbits = + immutable_db_options.table_cache_numshardbits; + options.WAL_ttl_seconds = immutable_db_options.WAL_ttl_seconds; + options.WAL_size_limit_MB = immutable_db_options.WAL_size_limit_MB; + options.manifest_preallocation_size = + immutable_db_options.manifest_preallocation_size; + options.allow_mmap_reads = immutable_db_options.allow_mmap_reads; + options.allow_mmap_writes = immutable_db_options.allow_mmap_writes; + options.use_direct_reads = immutable_db_options.use_direct_reads; + options.use_direct_io_for_flush_and_compaction = + immutable_db_options.use_direct_io_for_flush_and_compaction; + options.allow_fallocate = immutable_db_options.allow_fallocate; + options.is_fd_close_on_exec = immutable_db_options.is_fd_close_on_exec; + options.stats_dump_period_sec = mutable_db_options.stats_dump_period_sec; + options.stats_persist_period_sec = + mutable_db_options.stats_persist_period_sec; + options.persist_stats_to_disk = immutable_db_options.persist_stats_to_disk; + options.stats_history_buffer_size = + mutable_db_options.stats_history_buffer_size; + options.advise_random_on_open = immutable_db_options.advise_random_on_open; + options.db_write_buffer_size = immutable_db_options.db_write_buffer_size; + options.write_buffer_manager = immutable_db_options.write_buffer_manager; + options.access_hint_on_compaction_start = + immutable_db_options.access_hint_on_compaction_start; + options.compaction_readahead_size = + mutable_db_options.compaction_readahead_size; + options.random_access_max_buffer_size = + immutable_db_options.random_access_max_buffer_size; + options.writable_file_max_buffer_size = + mutable_db_options.writable_file_max_buffer_size; + options.use_adaptive_mutex = immutable_db_options.use_adaptive_mutex; + options.listeners = immutable_db_options.listeners; + options.enable_thread_tracking = immutable_db_options.enable_thread_tracking; + options.delayed_write_rate = mutable_db_options.delayed_write_rate; + options.enable_pipelined_write = immutable_db_options.enable_pipelined_write; + 
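+  // unordered_write is documented as incompatible with
+  // enable_pipelined_write; both flags are carried over verbatim here.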
options.unordered_write = immutable_db_options.unordered_write; + options.allow_concurrent_memtable_write = + immutable_db_options.allow_concurrent_memtable_write; + options.enable_write_thread_adaptive_yield = + immutable_db_options.enable_write_thread_adaptive_yield; + options.max_write_batch_group_size_bytes = + immutable_db_options.max_write_batch_group_size_bytes; + options.write_thread_max_yield_usec = + immutable_db_options.write_thread_max_yield_usec; + options.write_thread_slow_yield_usec = + immutable_db_options.write_thread_slow_yield_usec; + options.skip_stats_update_on_db_open = + immutable_db_options.skip_stats_update_on_db_open; + options.skip_checking_sst_file_sizes_on_db_open = + immutable_db_options.skip_checking_sst_file_sizes_on_db_open; + options.wal_recovery_mode = immutable_db_options.wal_recovery_mode; + options.allow_2pc = immutable_db_options.allow_2pc; + options.row_cache = immutable_db_options.row_cache; + options.wal_filter = immutable_db_options.wal_filter; + options.fail_if_options_file_error = + immutable_db_options.fail_if_options_file_error; + options.dump_malloc_stats = immutable_db_options.dump_malloc_stats; + options.avoid_flush_during_recovery = + immutable_db_options.avoid_flush_during_recovery; + options.avoid_flush_during_shutdown = + mutable_db_options.avoid_flush_during_shutdown; + options.allow_ingest_behind = immutable_db_options.allow_ingest_behind; + options.two_write_queues = immutable_db_options.two_write_queues; + options.manual_wal_flush = immutable_db_options.manual_wal_flush; + options.wal_compression = immutable_db_options.wal_compression; + options.atomic_flush = immutable_db_options.atomic_flush; + options.avoid_unnecessary_blocking_io = + immutable_db_options.avoid_unnecessary_blocking_io; + options.log_readahead_size = immutable_db_options.log_readahead_size; + options.file_checksum_gen_factory = + immutable_db_options.file_checksum_gen_factory; + options.best_efforts_recovery = immutable_db_options.best_efforts_recovery; + options.max_bgerror_resume_count = + immutable_db_options.max_bgerror_resume_count; + options.bgerror_resume_retry_interval = + immutable_db_options.bgerror_resume_retry_interval; + options.db_host_id = immutable_db_options.db_host_id; + options.allow_data_in_errors = immutable_db_options.allow_data_in_errors; + options.checksum_handoff_file_types = + immutable_db_options.checksum_handoff_file_types; + options.lowest_used_cache_tier = immutable_db_options.lowest_used_cache_tier; + options.enforce_single_del_contracts = + immutable_db_options.enforce_single_del_contracts; + return options; +} + +ColumnFamilyOptions BuildColumnFamilyOptions( + const ColumnFamilyOptions& options, + const MutableCFOptions& mutable_cf_options) { + ColumnFamilyOptions cf_opts(options); + UpdateColumnFamilyOptions(mutable_cf_options, &cf_opts); + // TODO(yhchiang): find some way to handle the following derived options + // * max_file_size + return cf_opts; +} + +void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, + ColumnFamilyOptions* cf_opts) { + // Memtable related options + cf_opts->write_buffer_size = moptions.write_buffer_size; + cf_opts->max_write_buffer_number = moptions.max_write_buffer_number; + cf_opts->arena_block_size = moptions.arena_block_size; + cf_opts->memtable_prefix_bloom_size_ratio = + moptions.memtable_prefix_bloom_size_ratio; + cf_opts->memtable_whole_key_filtering = moptions.memtable_whole_key_filtering; + cf_opts->memtable_huge_page_size = moptions.memtable_huge_page_size; + 
cf_opts->max_successive_merges = moptions.max_successive_merges; + cf_opts->inplace_update_num_locks = moptions.inplace_update_num_locks; + cf_opts->prefix_extractor = moptions.prefix_extractor; + cf_opts->experimental_mempurge_threshold = + moptions.experimental_mempurge_threshold; + cf_opts->memtable_protection_bytes_per_key = + moptions.memtable_protection_bytes_per_key; + cf_opts->block_protection_bytes_per_key = + moptions.block_protection_bytes_per_key; + + // Compaction related options + cf_opts->disable_auto_compactions = moptions.disable_auto_compactions; + cf_opts->soft_pending_compaction_bytes_limit = + moptions.soft_pending_compaction_bytes_limit; + cf_opts->hard_pending_compaction_bytes_limit = + moptions.hard_pending_compaction_bytes_limit; + cf_opts->level0_file_num_compaction_trigger = + moptions.level0_file_num_compaction_trigger; + cf_opts->level0_slowdown_writes_trigger = + moptions.level0_slowdown_writes_trigger; + cf_opts->level0_stop_writes_trigger = moptions.level0_stop_writes_trigger; + cf_opts->max_compaction_bytes = moptions.max_compaction_bytes; + cf_opts->ignore_max_compaction_bytes_for_input = + moptions.ignore_max_compaction_bytes_for_input; + cf_opts->target_file_size_base = moptions.target_file_size_base; + cf_opts->target_file_size_multiplier = moptions.target_file_size_multiplier; + cf_opts->max_bytes_for_level_base = moptions.max_bytes_for_level_base; + cf_opts->max_bytes_for_level_multiplier = + moptions.max_bytes_for_level_multiplier; + cf_opts->ttl = moptions.ttl; + cf_opts->periodic_compaction_seconds = moptions.periodic_compaction_seconds; + + cf_opts->max_bytes_for_level_multiplier_additional.clear(); + for (auto value : moptions.max_bytes_for_level_multiplier_additional) { + cf_opts->max_bytes_for_level_multiplier_additional.emplace_back(value); + } + + cf_opts->compaction_options_fifo = moptions.compaction_options_fifo; + cf_opts->compaction_options_universal = moptions.compaction_options_universal; + + // Blob file related options + cf_opts->enable_blob_files = moptions.enable_blob_files; + cf_opts->min_blob_size = moptions.min_blob_size; + cf_opts->blob_file_size = moptions.blob_file_size; + cf_opts->blob_compression_type = moptions.blob_compression_type; + cf_opts->enable_blob_garbage_collection = + moptions.enable_blob_garbage_collection; + cf_opts->blob_garbage_collection_age_cutoff = + moptions.blob_garbage_collection_age_cutoff; + cf_opts->blob_garbage_collection_force_threshold = + moptions.blob_garbage_collection_force_threshold; + cf_opts->blob_compaction_readahead_size = + moptions.blob_compaction_readahead_size; + cf_opts->blob_file_starting_level = moptions.blob_file_starting_level; + cf_opts->prepopulate_blob_cache = moptions.prepopulate_blob_cache; + + // Misc options + cf_opts->max_sequential_skip_in_iterations = + moptions.max_sequential_skip_in_iterations; + cf_opts->check_flush_compaction_key_order = + moptions.check_flush_compaction_key_order; + cf_opts->paranoid_file_checks = moptions.paranoid_file_checks; + cf_opts->report_bg_io_stats = moptions.report_bg_io_stats; + cf_opts->compression = moptions.compression; + cf_opts->compression_opts = moptions.compression_opts; + cf_opts->bottommost_compression = moptions.bottommost_compression; + cf_opts->bottommost_compression_opts = moptions.bottommost_compression_opts; + cf_opts->sample_for_compression = moptions.sample_for_compression; + cf_opts->compression_per_level = moptions.compression_per_level; + cf_opts->last_level_temperature = moptions.last_level_temperature; + 
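+  // bottommost_temperature is the deprecated spelling of
+  // last_level_temperature, so the two are kept in sync.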
cf_opts->bottommost_temperature = moptions.last_level_temperature;
+}
+
+void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
+                               ColumnFamilyOptions* cf_opts) {
+  cf_opts->compaction_style = ioptions.compaction_style;
+  cf_opts->compaction_pri = ioptions.compaction_pri;
+  cf_opts->comparator = ioptions.user_comparator;
+  cf_opts->merge_operator = ioptions.merge_operator;
+  cf_opts->compaction_filter = ioptions.compaction_filter;
+  cf_opts->compaction_filter_factory = ioptions.compaction_filter_factory;
+  cf_opts->min_write_buffer_number_to_merge =
+      ioptions.min_write_buffer_number_to_merge;
+  cf_opts->max_write_buffer_number_to_maintain =
+      ioptions.max_write_buffer_number_to_maintain;
+  cf_opts->max_write_buffer_size_to_maintain =
+      ioptions.max_write_buffer_size_to_maintain;
+  cf_opts->inplace_update_support = ioptions.inplace_update_support;
+  cf_opts->inplace_callback = ioptions.inplace_callback;
+  cf_opts->memtable_factory = ioptions.memtable_factory;
+  cf_opts->table_factory = ioptions.table_factory;
+  cf_opts->table_properties_collector_factories =
+      ioptions.table_properties_collector_factories;
+  cf_opts->bloom_locality = ioptions.bloom_locality;
+  cf_opts->level_compaction_dynamic_level_bytes =
+      ioptions.level_compaction_dynamic_level_bytes;
+  cf_opts->level_compaction_dynamic_file_size =
+      ioptions.level_compaction_dynamic_file_size;
+  cf_opts->num_levels = ioptions.num_levels;
+  cf_opts->optimize_filters_for_hits = ioptions.optimize_filters_for_hits;
+  cf_opts->force_consistency_checks = ioptions.force_consistency_checks;
+  cf_opts->memtable_insert_with_hint_prefix_extractor =
+      ioptions.memtable_insert_with_hint_prefix_extractor;
+  cf_opts->cf_paths = ioptions.cf_paths;
+  cf_opts->compaction_thread_limiter = ioptions.compaction_thread_limiter;
+  cf_opts->sst_partitioner_factory = ioptions.sst_partitioner_factory;
+  cf_opts->blob_cache = ioptions.blob_cache;
+  cf_opts->preclude_last_level_data_seconds =
+      ioptions.preclude_last_level_data_seconds;
+  cf_opts->preserve_internal_time_seconds =
+      ioptions.preserve_internal_time_seconds;
+  cf_opts->persist_user_defined_timestamps =
+      ioptions.persist_user_defined_timestamps;
+
+  // TODO(yhchiang): find some way to handle the following derived options
+  // * max_file_size
+}
+
+std::map<CompactionStyle, std::string>
+    OptionsHelper::compaction_style_to_string = {
+        {kCompactionStyleLevel, "kCompactionStyleLevel"},
+        {kCompactionStyleUniversal, "kCompactionStyleUniversal"},
+        {kCompactionStyleFIFO, "kCompactionStyleFIFO"},
+        {kCompactionStyleNone, "kCompactionStyleNone"}};
+
+std::map<CompactionPri, std::string> OptionsHelper::compaction_pri_to_string = {
+    {kByCompensatedSize, "kByCompensatedSize"},
+    {kOldestLargestSeqFirst, "kOldestLargestSeqFirst"},
+    {kOldestSmallestSeqFirst, "kOldestSmallestSeqFirst"},
+    {kMinOverlappingRatio, "kMinOverlappingRatio"},
+    {kRoundRobin, "kRoundRobin"}};
+
+std::map<CompactionStopStyle, std::string>
+    OptionsHelper::compaction_stop_style_to_string = {
+        {kCompactionStopStyleSimilarSize, "kCompactionStopStyleSimilarSize"},
+        {kCompactionStopStyleTotalSize, "kCompactionStopStyleTotalSize"}};
+
+std::map<Temperature, std::string> OptionsHelper::temperature_to_string = {
+    {Temperature::kUnknown, "kUnknown"},
+    {Temperature::kHot, "kHot"},
+    {Temperature::kWarm, "kWarm"},
+    {Temperature::kCold, "kCold"}};
+
+std::unordered_map<std::string, ChecksumType>
+    OptionsHelper::checksum_type_string_map = {{"kNoChecksum", kNoChecksum},
+                                               {"kCRC32c", kCRC32c},
+                                               {"kxxHash", kxxHash},
+                                               {"kxxHash64", kxxHash64},
+                                               {"kXXH3", kXXH3}};
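+// The string maps above and below back the generic ParseEnum<T> /
+// SerializeEnum<T> helpers used throughout this file. A minimal sketch of the
+// round trip they enable (illustrative only, not part of the vendored file):
+//
+//   std::string name;
+//   SerializeEnum<CompactionStyle>(compaction_style_string_map,
+//                                  kCompactionStyleLevel, &name);
+//   CompactionStyle style;
+//   ParseEnum<CompactionStyle>(compaction_style_string_map, name, &style);
+//   assert(style == kCompactionStyleLevel);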
{"kSnappyCompression", kSnappyCompression}, + {"kZlibCompression", kZlibCompression}, + {"kBZip2Compression", kBZip2Compression}, + {"kLZ4Compression", kLZ4Compression}, + {"kLZ4HCCompression", kLZ4HCCompression}, + {"kXpressCompression", kXpressCompression}, + {"kZSTD", kZSTD}, + {"kZSTDNotFinalCompression", kZSTDNotFinalCompression}, + {"kDisableCompressionOption", kDisableCompressionOption}}; + +std::vector GetSupportedCompressions() { + // std::set internally to deduplicate potential name aliases + std::set supported_compressions; + for (const auto& comp_to_name : OptionsHelper::compression_type_string_map) { + CompressionType t = comp_to_name.second; + if (t != kDisableCompressionOption && CompressionTypeSupported(t)) { + supported_compressions.insert(t); + } + } + return std::vector(supported_compressions.begin(), + supported_compressions.end()); +} + +std::vector GetSupportedDictCompressions() { + std::set dict_compression_types; + for (const auto& comp_to_name : OptionsHelper::compression_type_string_map) { + CompressionType t = comp_to_name.second; + if (t != kDisableCompressionOption && DictCompressionTypeSupported(t)) { + dict_compression_types.insert(t); + } + } + return std::vector(dict_compression_types.begin(), + dict_compression_types.end()); +} + +std::vector GetSupportedChecksums() { + std::set checksum_types; + for (const auto& e : OptionsHelper::checksum_type_string_map) { + checksum_types.insert(e.second); + } + return std::vector(checksum_types.begin(), + checksum_types.end()); +} + +static bool ParseOptionHelper(void* opt_address, const OptionType& opt_type, + const std::string& value) { + switch (opt_type) { + case OptionType::kBoolean: + *static_cast(opt_address) = ParseBoolean("", value); + break; + case OptionType::kInt: + *static_cast(opt_address) = ParseInt(value); + break; + case OptionType::kInt32T: + *static_cast(opt_address) = ParseInt32(value); + break; + case OptionType::kInt64T: + PutUnaligned(static_cast(opt_address), ParseInt64(value)); + break; + case OptionType::kUInt: + *static_cast(opt_address) = ParseUint32(value); + break; + case OptionType::kUInt8T: + *static_cast(opt_address) = ParseUint8(value); + break; + case OptionType::kUInt32T: + *static_cast(opt_address) = ParseUint32(value); + break; + case OptionType::kUInt64T: + PutUnaligned(static_cast(opt_address), ParseUint64(value)); + break; + case OptionType::kSizeT: + PutUnaligned(static_cast(opt_address), ParseSizeT(value)); + break; + case OptionType::kString: + *static_cast(opt_address) = value; + break; + case OptionType::kDouble: + *static_cast(opt_address) = ParseDouble(value); + break; + case OptionType::kCompactionStyle: + return ParseEnum( + compaction_style_string_map, value, + static_cast(opt_address)); + case OptionType::kCompactionPri: + return ParseEnum(compaction_pri_string_map, value, + static_cast(opt_address)); + case OptionType::kCompressionType: + return ParseEnum( + compression_type_string_map, value, + static_cast(opt_address)); + case OptionType::kChecksumType: + return ParseEnum(checksum_type_string_map, value, + static_cast(opt_address)); + case OptionType::kEncodingType: + return ParseEnum(encoding_type_string_map, value, + static_cast(opt_address)); + case OptionType::kCompactionStopStyle: + return ParseEnum( + compaction_stop_style_string_map, value, + static_cast(opt_address)); + case OptionType::kEncodedString: { + std::string* output_addr = static_cast(opt_address); + (Slice(value)).DecodeHex(output_addr); + break; + } + case OptionType::kTemperature: { + return 
+
+bool SerializeSingleOptionHelper(const void* opt_address,
+                                 const OptionType opt_type,
+                                 std::string* value) {
+  assert(value);
+  switch (opt_type) {
+    case OptionType::kBoolean:
+      *value = *(static_cast<const bool*>(opt_address)) ? "true" : "false";
+      break;
+    case OptionType::kInt:
+      *value = std::to_string(*(static_cast<const int*>(opt_address)));
+      break;
+    case OptionType::kInt32T:
+      *value = std::to_string(*(static_cast<const int32_t*>(opt_address)));
+      break;
+    case OptionType::kInt64T: {
+      int64_t v;
+      GetUnaligned(static_cast<const int64_t*>(opt_address), &v);
+      *value = std::to_string(v);
+    } break;
+    case OptionType::kUInt:
+      *value =
+          std::to_string(*(static_cast<const unsigned int*>(opt_address)));
+      break;
+    case OptionType::kUInt8T:
+      *value = std::to_string(*(static_cast<const uint8_t*>(opt_address)));
+      break;
+    case OptionType::kUInt32T:
+      *value = std::to_string(*(static_cast<const uint32_t*>(opt_address)));
+      break;
+    case OptionType::kUInt64T: {
+      uint64_t v;
+      GetUnaligned(static_cast<const uint64_t*>(opt_address), &v);
+      *value = std::to_string(v);
+    } break;
+    case OptionType::kSizeT: {
+      size_t v;
+      GetUnaligned(static_cast<const size_t*>(opt_address), &v);
+      *value = std::to_string(v);
+    } break;
+    case OptionType::kDouble:
+      *value = std::to_string(*(static_cast<const double*>(opt_address)));
+      break;
+    case OptionType::kString:
+      *value =
+          EscapeOptionString(*(static_cast<const std::string*>(opt_address)));
+      break;
+    case OptionType::kCompactionStyle:
+      return SerializeEnum<CompactionStyle>(
+          compaction_style_string_map,
+          *(static_cast<const CompactionStyle*>(opt_address)), value);
+    case OptionType::kCompactionPri:
+      return SerializeEnum<CompactionPri>(
+          compaction_pri_string_map,
+          *(static_cast<const CompactionPri*>(opt_address)), value);
+    case OptionType::kCompressionType:
+      return SerializeEnum<CompressionType>(
+          compression_type_string_map,
+          *(static_cast<const CompressionType*>(opt_address)), value);
+      break;
+    case OptionType::kChecksumType:
+      return SerializeEnum<ChecksumType>(
+          checksum_type_string_map,
+          *static_cast<const ChecksumType*>(opt_address), value);
+    case OptionType::kEncodingType:
+      return SerializeEnum<EncodingType>(
+          encoding_type_string_map,
+          *static_cast<const EncodingType*>(opt_address), value);
+    case OptionType::kCompactionStopStyle:
+      return SerializeEnum<CompactionStopStyle>(
+          compaction_stop_style_string_map,
+          *static_cast<const CompactionStopStyle*>(opt_address), value);
+    case OptionType::kEncodedString: {
+      const auto* ptr = static_cast<const std::string*>(opt_address);
+      *value = (Slice(*ptr)).ToString(true);
+      break;
+    }
+    case OptionType::kTemperature: {
+      return SerializeEnum<Temperature>(
+          temperature_string_map,
+          *static_cast<const Temperature*>(opt_address), value);
+    }
+    default:
+      return false;
+  }
+  return true;
+}
+
+template <typename T>
+Status ConfigureFromMap(
+    const ConfigOptions& config_options,
+    const std::unordered_map<std::string, std::string>& opt_map,
+    const std::string& option_name, Configurable* config, T* new_opts) {
+  Status s = config->ConfigureFromMap(config_options, opt_map);
+  if (s.ok()) {
+    *new_opts = *(config->GetOptions<T>(option_name));
+  }
+  return s;
+}
+
+
+Status StringToMap(const std::string& opts_str,
+                   std::unordered_map<std::string, std::string>* opts_map) {
+  assert(opts_map);
+  // Example:
+  //   opts_str = "write_buffer_size=1024;max_write_buffer_number=2;"
+  //              "nested_opt={opt1=1;opt2=2};max_bytes_for_level_base=100"
+  size_t pos = 0;
+  std::string opts = trim(opts_str);
+  // If the input string starts and ends with "{...}", strip off the brackets
+  while (opts.size() > 2 && opts[0] == '{' && opts[opts.size() - 1] == '}') {
+    opts = trim(opts.substr(1, opts.size() - 2));
+  }
+
+  while (pos < opts.size()) {
+    size_t eq_pos = opts.find_first_of("={};", pos);
+    if (eq_pos == std::string::npos) {
+      return Status::InvalidArgument("Mismatched key value pair, '=' expected");
+    } else if (opts[eq_pos] != '=') {
+      return Status::InvalidArgument("Unexpected char in key");
+    }
+
+    std::string key = trim(opts.substr(pos, eq_pos - pos));
+    if (key.empty()) {
+      return Status::InvalidArgument("Empty key found");
+    }
+
+    std::string value;
+    Status s = OptionTypeInfo::NextToken(opts, ';', eq_pos + 1, &pos, &value);
+    if (!s.ok()) {
+      return s;
+    } else {
+      (*opts_map)[key] = value;
+      if (pos == std::string::npos) {
+        break;
+      } else {
+        pos++;
+      }
+    }
+  }
+
+  return Status::OK();
+}
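+// A usage sketch for StringToMap (illustrative only). Nested braces are kept
+// as a single value so the caller can recurse with another StringToMap call:
+//
+//   std::unordered_map<std::string, std::string> m;
+//   Status s = StringToMap(
+//       "write_buffer_size=1024;nested_opt={opt1=1;opt2=2}", &m);
+//   // s.ok(); m["write_buffer_size"] == "1024"
+//   // m["nested_opt"] == "opt1=1;opt2=2"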
'=' expected"); + } else if (opts[eq_pos] != '=') { + return Status::InvalidArgument("Unexpected char in key"); + } + + std::string key = trim(opts.substr(pos, eq_pos - pos)); + if (key.empty()) { + return Status::InvalidArgument("Empty key found"); + } + + std::string value; + Status s = OptionTypeInfo::NextToken(opts, ';', eq_pos + 1, &pos, &value); + if (!s.ok()) { + return s; + } else { + (*opts_map)[key] = value; + if (pos == std::string::npos) { + break; + } else { + pos++; + } + } + } + + return Status::OK(); +} + + +Status GetStringFromDBOptions(std::string* opt_string, + const DBOptions& db_options, + const std::string& delimiter) { + ConfigOptions config_options(db_options); + config_options.delimiter = delimiter; + return GetStringFromDBOptions(config_options, db_options, opt_string); +} + +Status GetStringFromDBOptions(const ConfigOptions& config_options, + const DBOptions& db_options, + std::string* opt_string) { + assert(opt_string); + opt_string->clear(); + auto config = DBOptionsAsConfigurable(db_options); + return config->GetOptionString(config_options, opt_string); +} + + +Status GetStringFromColumnFamilyOptions(std::string* opt_string, + const ColumnFamilyOptions& cf_options, + const std::string& delimiter) { + ConfigOptions config_options; + config_options.delimiter = delimiter; + return GetStringFromColumnFamilyOptions(config_options, cf_options, + opt_string); +} + +Status GetStringFromColumnFamilyOptions(const ConfigOptions& config_options, + const ColumnFamilyOptions& cf_options, + std::string* opt_string) { + const auto config = CFOptionsAsConfigurable(cf_options); + return config->GetOptionString(config_options, opt_string); +} + +Status GetStringFromCompressionType(std::string* compression_str, + CompressionType compression_type) { + bool ok = SerializeEnum(compression_type_string_map, + compression_type, compression_str); + if (ok) { + return Status::OK(); + } else { + return Status::InvalidArgument("Invalid compression types"); + } +} + +Status GetColumnFamilyOptionsFromMap( + const ConfigOptions& config_options, + const ColumnFamilyOptions& base_options, + const std::unordered_map& opts_map, + ColumnFamilyOptions* new_options) { + assert(new_options); + + *new_options = base_options; + + const auto config = CFOptionsAsConfigurable(base_options); + Status s = ConfigureFromMap( + config_options, opts_map, OptionsHelper::kCFOptionsName, config.get(), + new_options); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); + } +} + +Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options, + const ColumnFamilyOptions& base_options, + const std::string& opts_str, + ColumnFamilyOptions* new_options) { + std::unordered_map opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + *new_options = base_options; + return s; + } + return GetColumnFamilyOptionsFromMap(config_options, base_options, opts_map, + new_options); +} + +Status GetDBOptionsFromMap( + const ConfigOptions& config_options, const DBOptions& base_options, + const std::unordered_map& opts_map, + DBOptions* new_options) { + assert(new_options); + *new_options = base_options; + auto config = DBOptionsAsConfigurable(base_options); + Status s = ConfigureFromMap(config_options, opts_map, + OptionsHelper::kDBOptionsName, + config.get(), new_options); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || 
+
+Status GetDBOptionsFromMap(
+    const ConfigOptions& config_options, const DBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    DBOptions* new_options) {
+  assert(new_options);
+  *new_options = base_options;
+  auto config = DBOptionsAsConfigurable(base_options);
+  Status s = ConfigureFromMap<DBOptions>(config_options, opts_map,
+                                         OptionsHelper::kDBOptionsName,
+                                         config.get(), new_options);
+  // Translate any errors (NotFound, NotSupported) to InvalidArgument
+  if (s.ok() || s.IsInvalidArgument()) {
+    return s;
+  } else {
+    return Status::InvalidArgument(s.getState());
+  }
+}
+
+Status GetDBOptionsFromString(const ConfigOptions& config_options,
+                              const DBOptions& base_options,
+                              const std::string& opts_str,
+                              DBOptions* new_options) {
+  std::unordered_map<std::string, std::string> opts_map;
+  Status s = StringToMap(opts_str, &opts_map);
+  if (!s.ok()) {
+    *new_options = base_options;
+    return s;
+  }
+  return GetDBOptionsFromMap(config_options, base_options, opts_map,
+                             new_options);
+}
+
+Status GetOptionsFromString(const Options& base_options,
+                            const std::string& opts_str, Options* new_options) {
+  ConfigOptions config_options(base_options);
+  config_options.input_strings_escaped = false;
+  config_options.ignore_unknown_options = false;
+
+  return GetOptionsFromString(config_options, base_options, opts_str,
+                              new_options);
+}
+
+Status GetOptionsFromString(const ConfigOptions& config_options,
+                            const Options& base_options,
+                            const std::string& opts_str, Options* new_options) {
+  ColumnFamilyOptions new_cf_options;
+  std::unordered_map<std::string, std::string> unused_opts;
+  std::unordered_map<std::string, std::string> opts_map;
+
+  assert(new_options);
+  *new_options = base_options;
+  Status s = StringToMap(opts_str, &opts_map);
+  if (!s.ok()) {
+    return s;
+  }
+  auto config = DBOptionsAsConfigurable(base_options);
+  s = config->ConfigureFromMap(config_options, opts_map, &unused_opts);
+
+  if (s.ok()) {
+    DBOptions* new_db_options =
+        config->GetOptions<DBOptions>(OptionsHelper::kDBOptionsName);
+    if (!unused_opts.empty()) {
+      s = GetColumnFamilyOptionsFromMap(config_options, base_options,
+                                        unused_opts, &new_cf_options);
+      if (s.ok()) {
+        *new_options = Options(*new_db_options, new_cf_options);
+      }
+    } else {
+      *new_options = Options(*new_db_options, base_options);
+    }
+  }
+  // Translate any errors (NotFound, NotSupported) to InvalidArgument
+  if (s.ok() || s.IsInvalidArgument()) {
+    return s;
+  } else {
+    return Status::InvalidArgument(s.getState());
+  }
+}
+
+std::unordered_map<std::string, EncodingType>
+    OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain},
+                                               {"kPrefix", kPrefix}};
+
+std::unordered_map<std::string, CompactionStyle>
+    OptionsHelper::compaction_style_string_map = {
+        {"kCompactionStyleLevel", kCompactionStyleLevel},
+        {"kCompactionStyleUniversal", kCompactionStyleUniversal},
+        {"kCompactionStyleFIFO", kCompactionStyleFIFO},
+        {"kCompactionStyleNone", kCompactionStyleNone}};
+
+std::unordered_map<std::string, CompactionPri>
+    OptionsHelper::compaction_pri_string_map = {
+        {"kByCompensatedSize", kByCompensatedSize},
+        {"kOldestLargestSeqFirst", kOldestLargestSeqFirst},
+        {"kOldestSmallestSeqFirst", kOldestSmallestSeqFirst},
+        {"kMinOverlappingRatio", kMinOverlappingRatio},
+        {"kRoundRobin", kRoundRobin}};
+
+std::unordered_map<std::string, CompactionStopStyle>
+    OptionsHelper::compaction_stop_style_string_map = {
+        {"kCompactionStopStyleSimilarSize", kCompactionStopStyleSimilarSize},
+        {"kCompactionStopStyleTotalSize", kCompactionStopStyleTotalSize}};
+
+std::unordered_map<std::string, Temperature>
+    OptionsHelper::temperature_string_map = {
+        {"kUnknown", Temperature::kUnknown},
+        {"kHot", Temperature::kHot},
+        {"kWarm", Temperature::kWarm},
+        {"kCold", Temperature::kCold}};
+
+std::unordered_map<std::string, PrepopulateBlobCache>
+    OptionsHelper::prepopulate_blob_cache_string_map = {
+        {"kDisable", PrepopulateBlobCache::kDisable},
+        {"kFlushOnly", PrepopulateBlobCache::kFlushOnly}};
+
+Status OptionTypeInfo::NextToken(const std::string& opts, char delimiter,
+                                 size_t pos, size_t* end, std::string* token) {
+  while (pos < opts.size() && isspace(opts[pos])) {
+    ++pos;
+  }
+  // Empty value at the end
+  if (pos >= opts.size()) {
+    *token = "";
+    *end = std::string::npos;
+    return Status::OK();
+
} else if (opts[pos] == '{') { + int count = 1; + size_t brace_pos = pos + 1; + while (brace_pos < opts.size()) { + if (opts[brace_pos] == '{') { + ++count; + } else if (opts[brace_pos] == '}') { + --count; + if (count == 0) { + break; + } + } + ++brace_pos; + } + // found the matching closing brace + if (count == 0) { + *token = trim(opts.substr(pos + 1, brace_pos - pos - 1)); + // skip all whitespace and move to the next delimiter + // brace_pos points to the next position after the matching '}' + pos = brace_pos + 1; + while (pos < opts.size() && isspace(opts[pos])) { + ++pos; + } + if (pos < opts.size() && opts[pos] != delimiter) { + return Status::InvalidArgument("Unexpected chars after nested options"); + } + *end = pos; + } else { + return Status::InvalidArgument( + "Mismatched curly braces for nested options"); + } + } else { + *end = opts.find(delimiter, pos); + if (*end == std::string::npos) { + // It either ends with a trailing semi-colon or the last key-value pair + *token = trim(opts.substr(pos)); + } else { + *token = trim(opts.substr(pos, *end - pos)); + } + } + return Status::OK(); +} + +Status OptionTypeInfo::Parse(const ConfigOptions& config_options, + const std::string& opt_name, + const std::string& value, void* opt_ptr) const { + if (IsDeprecated()) { + return Status::OK(); + } + try { + const std::string& opt_value = config_options.input_strings_escaped + ? UnescapeOptionString(value) + : value; + + if (opt_ptr == nullptr) { + return Status::NotFound("Could not find option", opt_name); + } else if (parse_func_ != nullptr) { + ConfigOptions copy = config_options; + copy.invoke_prepare_options = false; + void* opt_addr = GetOffset(opt_ptr); + return parse_func_(copy, opt_name, opt_value, opt_addr); + } else if (ParseOptionHelper(GetOffset(opt_ptr), type_, opt_value)) { + return Status::OK(); + } else if (IsConfigurable()) { + // The option is . 
+ Configurable* config = AsRawPointer(opt_ptr); + if (opt_value.empty()) { + return Status::OK(); + } else if (config == nullptr) { + return Status::NotFound("Could not find configurable: ", opt_name); + } else { + ConfigOptions copy = config_options; + copy.ignore_unknown_options = false; + copy.invoke_prepare_options = false; + if (opt_value.find("=") != std::string::npos) { + return config->ConfigureFromString(copy, opt_value); + } else { + return config->ConfigureOption(copy, opt_name, opt_value); + } + } + } else if (IsByName()) { + return Status::NotSupported("Deserializing the option " + opt_name + + " is not supported"); + } else { + return Status::InvalidArgument("Error parsing:", opt_name); + } + } catch (std::exception& e) { + return Status::InvalidArgument("Error parsing " + opt_name + ":" + + std::string(e.what())); + } +} + +Status OptionTypeInfo::ParseType( + const ConfigOptions& config_options, const std::string& opts_str, + const std::unordered_map& type_map, + void* opt_addr, std::unordered_map* unused) { + std::unordered_map opts_map; + Status status = StringToMap(opts_str, &opts_map); + if (!status.ok()) { + return status; + } else { + return ParseType(config_options, opts_map, type_map, opt_addr, unused); + } +} + +Status OptionTypeInfo::ParseType( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + const std::unordered_map& type_map, + void* opt_addr, std::unordered_map* unused) { + for (const auto& opts_iter : opts_map) { + std::string opt_name; + const auto* opt_info = Find(opts_iter.first, type_map, &opt_name); + if (opt_info != nullptr) { + Status status = + opt_info->Parse(config_options, opt_name, opts_iter.second, opt_addr); + if (!status.ok()) { + return status; + } + } else if (unused != nullptr) { + (*unused)[opts_iter.first] = opts_iter.second; + } else if (!config_options.ignore_unknown_options) { + return Status::NotFound("Unrecognized option", opts_iter.first); + } + } + return Status::OK(); +} + +Status OptionTypeInfo::ParseStruct( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* struct_map, + const std::string& opt_name, const std::string& opt_value, void* opt_addr) { + assert(struct_map); + Status status; + if (opt_name == struct_name || EndsWith(opt_name, "." + struct_name)) { + // This option represents the entire struct + std::unordered_map unused; + status = + ParseType(config_options, opt_value, *struct_map, opt_addr, &unused); + if (status.ok() && !unused.empty()) { + status = Status::InvalidArgument( + "Unrecognized option", struct_name + "." + unused.begin()->first); + } + } else if (StartsWith(opt_name, struct_name + ".")) { + // This option represents a nested field in the struct (e.g, struct.field) + std::string elem_name; + const auto opt_info = + Find(opt_name.substr(struct_name.size() + 1), *struct_map, &elem_name); + if (opt_info != nullptr) { + status = opt_info->Parse(config_options, elem_name, opt_value, opt_addr); + } else { + status = Status::InvalidArgument("Unrecognized option", opt_name); + } + } else { + // This option represents a field in the struct (e.g. field) + std::string elem_name; + const auto opt_info = Find(opt_name, *struct_map, &elem_name); + if (opt_info != nullptr) { + status = opt_info->Parse(config_options, elem_name, opt_value, opt_addr); + } else { + status = Status::InvalidArgument("Unrecognized option", + struct_name + "." 
+ opt_name); + } + } + return status; +} + +Status OptionTypeInfo::Serialize(const ConfigOptions& config_options, + const std::string& opt_name, + const void* const opt_ptr, + std::string* opt_value) const { + // If the option is no longer used in rocksdb and marked as deprecated, + // we skip it in the serialization. + if (opt_ptr == nullptr || IsDeprecated()) { + return Status::OK(); + } else if (IsEnabled(OptionTypeFlags::kDontSerialize)) { + return Status::NotSupported("Cannot serialize option: ", opt_name); + } else if (serialize_func_ != nullptr) { + const void* opt_addr = GetOffset(opt_ptr); + return serialize_func_(config_options, opt_name, opt_addr, opt_value); + } else if (IsCustomizable()) { + const Customizable* custom = AsRawPointer(opt_ptr); + opt_value->clear(); + if (custom == nullptr) { + // We do not have a custom object to serialize. + // If the option is not mutable and we are doing only mutable options, + // we return an empty string (which will cause the option not to be + // printed). Otherwise, we return the "nullptr" string, which will result + // in "option=nullptr" being printed. + if (IsMutable() || !config_options.mutable_options_only) { + *opt_value = kNullptrString; + } else { + *opt_value = ""; + } + } else if (IsEnabled(OptionTypeFlags::kStringNameOnly) && + !config_options.IsDetailed()) { + if (!config_options.mutable_options_only || IsMutable()) { + *opt_value = custom->GetId(); + } + } else { + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + // If this option is mutable, everything inside it should be considered + // mutable + if (IsMutable()) { + embedded.mutable_options_only = false; + } + std::string value = custom->ToString(embedded); + if (!embedded.mutable_options_only || + value.find("=") != std::string::npos) { + *opt_value = value; + } else { + *opt_value = ""; + } + } + return Status::OK(); + } else if (IsConfigurable()) { + const Configurable* config = AsRawPointer(opt_ptr); + if (config != nullptr) { + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + *opt_value = config->ToString(embedded); + } + return Status::OK(); + } else if (config_options.mutable_options_only && !IsMutable()) { + return Status::OK(); + } else if (SerializeSingleOptionHelper(GetOffset(opt_ptr), type_, + opt_value)) { + return Status::OK(); + } else { + return Status::InvalidArgument("Cannot serialize option: ", opt_name); + } +} + +Status OptionTypeInfo::SerializeType( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* opt_addr, std::string* result) { + Status status; + for (const auto& iter : type_map) { + std::string single; + const auto& opt_info = iter.second; + if (opt_info.ShouldSerialize()) { + status = + opt_info.Serialize(config_options, iter.first, opt_addr, &single); + if (!status.ok()) { + return status; + } else { + result->append(iter.first + "=" + single + config_options.delimiter); + } + } + } + return status; +} + +Status OptionTypeInfo::SerializeStruct( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* struct_map, + const std::string& opt_name, const void* opt_addr, std::string* value) { + assert(struct_map); + Status status; + if (EndsWith(opt_name, struct_name)) { + // We are going to write the struct as "{ prop1=value1; prop2=value2;}. + // Set the delimiter to ";" so that the everything will be on one line. 
+    ConfigOptions embedded = config_options;
+    embedded.delimiter = ";";
+
+    // This option represents the entire struct
+    std::string result;
+    status = SerializeType(embedded, *struct_map, opt_addr, &result);
+    if (!status.ok()) {
+      return status;
+    } else {
+      *value = "{" + result + "}";
+    }
+  } else if (StartsWith(opt_name, struct_name + ".")) {
+    // This option represents a nested field in the struct (e.g, struct.field)
+    std::string elem_name;
+    const auto opt_info =
+        Find(opt_name.substr(struct_name.size() + 1), *struct_map, &elem_name);
+    if (opt_info != nullptr) {
+      status = opt_info->Serialize(config_options, elem_name, opt_addr, value);
+    } else {
+      status = Status::InvalidArgument("Unrecognized option", opt_name);
+    }
+  } else {
+    // This option represents a field in the struct (e.g. field)
+    std::string elem_name;
+    const auto opt_info = Find(opt_name, *struct_map, &elem_name);
+    if (opt_info == nullptr) {
+      status = Status::InvalidArgument("Unrecognized option", opt_name);
+    } else if (opt_info->ShouldSerialize()) {
+      status = opt_info->Serialize(config_options, opt_name + "." + elem_name,
+                                   opt_addr, value);
+    }
+  }
+  return status;
+}
+
+template <typename T>
+bool IsOptionEqual(const void* offset1, const void* offset2) {
+  return (*static_cast<const T*>(offset1) == *static_cast<const T*>(offset2));
+}
+
+static bool AreEqualDoubles(const double a, const double b) {
+  return (fabs(a - b) < 0.00001);
+}
+
+static bool AreOptionsEqual(OptionType type, const void* this_offset,
+                            const void* that_offset) {
+  switch (type) {
+    case OptionType::kBoolean:
+      return IsOptionEqual<bool>(this_offset, that_offset);
+    case OptionType::kInt:
+      return IsOptionEqual<int>(this_offset, that_offset);
+    case OptionType::kUInt:
+      return IsOptionEqual<unsigned int>(this_offset, that_offset);
+    case OptionType::kInt32T:
+      return IsOptionEqual<int32_t>(this_offset, that_offset);
+    case OptionType::kInt64T: {
+      int64_t v1, v2;
+      GetUnaligned(static_cast<const int64_t*>(this_offset), &v1);
+      GetUnaligned(static_cast<const int64_t*>(that_offset), &v2);
+      return (v1 == v2);
+    }
+    case OptionType::kUInt8T:
+      return IsOptionEqual<uint8_t>(this_offset, that_offset);
+    case OptionType::kUInt32T:
+      return IsOptionEqual<uint32_t>(this_offset, that_offset);
+    case OptionType::kUInt64T: {
+      uint64_t v1, v2;
+      GetUnaligned(static_cast<const uint64_t*>(this_offset), &v1);
+      GetUnaligned(static_cast<const uint64_t*>(that_offset), &v2);
+      return (v1 == v2);
+    }
+    case OptionType::kSizeT: {
+      size_t v1, v2;
+      GetUnaligned(static_cast<const size_t*>(this_offset), &v1);
+      GetUnaligned(static_cast<const size_t*>(that_offset), &v2);
+      return (v1 == v2);
+    }
+    case OptionType::kString:
+      return IsOptionEqual<std::string>(this_offset, that_offset);
+    case OptionType::kDouble:
+      return AreEqualDoubles(*static_cast<const double*>(this_offset),
+                             *static_cast<const double*>(that_offset));
+    case OptionType::kCompactionStyle:
+      return IsOptionEqual<CompactionStyle>(this_offset, that_offset);
+    case OptionType::kCompactionStopStyle:
+      return IsOptionEqual<CompactionStopStyle>(this_offset, that_offset);
+    case OptionType::kCompactionPri:
+      return IsOptionEqual<CompactionPri>(this_offset, that_offset);
+    case OptionType::kCompressionType:
+      return IsOptionEqual<CompressionType>(this_offset, that_offset);
+    case OptionType::kChecksumType:
+      return IsOptionEqual<ChecksumType>(this_offset, that_offset);
+    case OptionType::kEncodingType:
+      return IsOptionEqual<EncodingType>(this_offset, that_offset);
+    case OptionType::kEncodedString:
+      return IsOptionEqual<std::string>(this_offset, that_offset);
+    case OptionType::kTemperature:
+      return IsOptionEqual<Temperature>(this_offset, that_offset);
+    default:
+      return false;
+  }  // End switch
+}
+
+bool OptionTypeInfo::AreEqual(const ConfigOptions& config_options,
+                              const std::string& opt_name,
+                              const void* const this_ptr,
+                              const
void* const that_ptr, + std::string* mismatch) const { + auto level = GetSanityLevel(); + if (!config_options.IsCheckEnabled(level)) { + return true; // If the sanity level is not being checked, skip it + } + if (this_ptr == nullptr || that_ptr == nullptr) { + if (this_ptr == that_ptr) { + return true; + } + } else if (equals_func_ != nullptr) { + const void* this_addr = GetOffset(this_ptr); + const void* that_addr = GetOffset(that_ptr); + if (equals_func_(config_options, opt_name, this_addr, that_addr, + mismatch)) { + return true; + } + } else { + const void* this_addr = GetOffset(this_ptr); + const void* that_addr = GetOffset(that_ptr); + if (AreOptionsEqual(type_, this_addr, that_addr)) { + return true; + } else if (IsConfigurable()) { + const auto* this_config = AsRawPointer(this_ptr); + const auto* that_config = AsRawPointer(that_ptr); + if (this_config == that_config) { + return true; + } else if (this_config != nullptr && that_config != nullptr) { + std::string bad_name; + bool matches; + if (level < config_options.sanity_level) { + ConfigOptions copy = config_options; + copy.sanity_level = level; + matches = this_config->AreEquivalent(copy, that_config, &bad_name); + } else { + matches = this_config->AreEquivalent(config_options, that_config, + &bad_name); + } + if (!matches) { + *mismatch = opt_name + "." + bad_name; + } + return matches; + } + } + } + if (mismatch->empty()) { + *mismatch = opt_name; + } + return false; +} + +bool OptionTypeInfo::TypesAreEqual( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* this_addr, const void* that_addr, std::string* mismatch) { + for (const auto& iter : type_map) { + const auto& opt_info = iter.second; + if (!opt_info.AreEqual(config_options, iter.first, this_addr, that_addr, + mismatch)) { + return false; + } + } + return true; +} + +bool OptionTypeInfo::StructsAreEqual( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* struct_map, + const std::string& opt_name, const void* this_addr, const void* that_addr, + std::string* mismatch) { + assert(struct_map); + bool matches = true; + std::string result; + if (EndsWith(opt_name, struct_name)) { + // This option represents the entire struct + matches = TypesAreEqual(config_options, *struct_map, this_addr, that_addr, + &result); + if (!matches) { + *mismatch = struct_name + "." + result; + return false; + } + } else if (StartsWith(opt_name, struct_name + ".")) { + // This option represents a nested field in the struct (e.g, struct.field) + std::string elem_name; + const auto opt_info = + Find(opt_name.substr(struct_name.size() + 1), *struct_map, &elem_name); + assert(opt_info); + if (opt_info == nullptr) { + *mismatch = opt_name; + matches = false; + } else if (!opt_info->AreEqual(config_options, elem_name, this_addr, + that_addr, &result)) { + matches = false; + *mismatch = struct_name + "." + result; + } + } else { + // This option represents a field in the struct (e.g. field) + std::string elem_name; + const auto opt_info = Find(opt_name, *struct_map, &elem_name); + assert(opt_info); + if (opt_info == nullptr) { + *mismatch = struct_name + "." + opt_name; + matches = false; + } else if (!opt_info->AreEqual(config_options, elem_name, this_addr, + that_addr, &result)) { + matches = false; + *mismatch = struct_name + "." 
+ result; + } + } + return matches; +} + +bool MatchesOptionsTypeFromMap( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* const this_ptr, const void* const that_ptr, + std::string* mismatch) { + for (auto& pair : type_map) { + // We skip checking deprecated variables as they might + // contain random values since they might not be initialized + if (config_options.IsCheckEnabled(pair.second.GetSanityLevel())) { + if (!pair.second.AreEqual(config_options, pair.first, this_ptr, that_ptr, + mismatch) && + !pair.second.AreEqualByName(config_options, pair.first, this_ptr, + that_ptr)) { + return false; + } + } + } + return true; +} + +bool OptionTypeInfo::AreEqualByName(const ConfigOptions& config_options, + const std::string& opt_name, + const void* const this_ptr, + const void* const that_ptr) const { + if (IsByName()) { + std::string that_value; + if (Serialize(config_options, opt_name, that_ptr, &that_value).ok()) { + return AreEqualByName(config_options, opt_name, this_ptr, that_value); + } + } + return false; +} + +bool OptionTypeInfo::AreEqualByName(const ConfigOptions& config_options, + const std::string& opt_name, + const void* const opt_ptr, + const std::string& that_value) const { + std::string this_value; + if (!IsByName()) { + return false; + } else if (!Serialize(config_options, opt_name, opt_ptr, &this_value).ok()) { + return false; + } else if (IsEnabled(OptionVerificationType::kByNameAllowFromNull)) { + if (that_value == kNullptrString) { + return true; + } + } else if (IsEnabled(OptionVerificationType::kByNameAllowNull)) { + if (that_value == kNullptrString) { + return true; + } + } + return (this_value == that_value); +} + +Status OptionTypeInfo::Prepare(const ConfigOptions& config_options, + const std::string& name, void* opt_ptr) const { + if (ShouldPrepare()) { + if (prepare_func_ != nullptr) { + void* opt_addr = GetOffset(opt_ptr); + return prepare_func_(config_options, name, opt_addr); + } else if (IsConfigurable()) { + Configurable* config = AsRawPointer(opt_ptr); + if (config != nullptr) { + return config->PrepareOptions(config_options); + } else if (!CanBeNull()) { + return Status::NotFound("Missing configurable object", name); + } + } + } + return Status::OK(); +} + +Status OptionTypeInfo::Validate(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts, + const std::string& name, + const void* opt_ptr) const { + if (ShouldValidate()) { + if (validate_func_ != nullptr) { + const void* opt_addr = GetOffset(opt_ptr); + return validate_func_(db_opts, cf_opts, name, opt_addr); + } else if (IsConfigurable()) { + const Configurable* config = AsRawPointer(opt_ptr); + if (config != nullptr) { + return config->ValidateOptions(db_opts, cf_opts); + } else if (!CanBeNull()) { + return Status::NotFound("Missing configurable object", name); + } + } + } + return Status::OK(); +} + +const OptionTypeInfo* OptionTypeInfo::Find( + const std::string& opt_name, + const std::unordered_map& opt_map, + std::string* elem_name) { + const auto iter = opt_map.find(opt_name); // Look up the value in the map + if (iter != opt_map.end()) { // Found the option in the map + *elem_name = opt_name; // Return the name + return &(iter->second); // Return the contents of the iterator + } else { + auto idx = opt_name.find("."); // Look for a separator + if (idx > 0 && idx != std::string::npos) { // We found a separator + auto siter = + opt_map.find(opt_name.substr(0, idx)); // Look for the short name + if (siter != opt_map.end()) { // We found the short 
name
+        if (siter->second.IsStruct() ||      // If the object is a struct
+            siter->second.IsConfigurable()) {  // or a Configurable
+          *elem_name = opt_name.substr(idx + 1);  // Return the rest
+          return &(siter->second);  // Return the contents of the iterator
+        }
+      }
+    }
+  }
+  return nullptr;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_helper.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_helper.h
new file mode 100644
index 0000000000..76e312a63c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_helper.h
@@ -0,0 +1,116 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct ColumnFamilyOptions;
+struct ConfigOptions;
+struct DBOptions;
+struct ImmutableCFOptions;
+struct ImmutableDBOptions;
+struct MutableDBOptions;
+struct MutableCFOptions;
+struct Options;
+
+std::vector<CompressionType> GetSupportedCompressions();
+
+std::vector<CompressionType> GetSupportedDictCompressions();
+
+std::vector<ChecksumType> GetSupportedChecksums();
+
+inline bool IsSupportedChecksumType(ChecksumType type) {
+  // Avoid annoying compiler warning-as-error (-Werror=type-limits)
+  auto min = kNoChecksum;
+  auto max = kXXH3;
+  return type >= min && type <= max;
+}
+
+// Checks that the combination of DBOptions and ColumnFamilyOptions are valid
+Status ValidateOptions(const DBOptions& db_opts,
+                       const ColumnFamilyOptions& cf_opts);
+
+DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
+                         const MutableDBOptions& mutable_db_options);
+
+ColumnFamilyOptions BuildColumnFamilyOptions(
+    const ColumnFamilyOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options);
+
+void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
+                               ColumnFamilyOptions* cf_opts);
+void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
+                               ColumnFamilyOptions* cf_opts);
+
+std::unique_ptr<Configurable> DBOptionsAsConfigurable(
+    const MutableDBOptions& opts);
+std::unique_ptr<Configurable> DBOptionsAsConfigurable(
+    const DBOptions& opts,
+    const std::unordered_map<std::string, std::string>* opt_map = nullptr);
+std::unique_ptr<Configurable> CFOptionsAsConfigurable(
+    const MutableCFOptions& opts);
+std::unique_ptr<Configurable> CFOptionsAsConfigurable(
+    const ColumnFamilyOptions& opts,
+    const std::unordered_map<std::string, std::string>* opt_map = nullptr);
+
+extern Status StringToMap(
+    const std::string& opts_str,
+    std::unordered_map<std::string, std::string>* opts_map);
+
+struct OptionsHelper {
+  static const std::string kCFOptionsName /*= "ColumnFamilyOptions"*/;
+  static const std::string kDBOptionsName /*= "DBOptions" */;
+  static std::map<CompactionStyle, std::string> compaction_style_to_string;
+  static std::map<CompactionPri, std::string> compaction_pri_to_string;
+  static std::map<CompactionStopStyle, std::string>
+      compaction_stop_style_to_string;
+  static std::map<Temperature, std::string> temperature_to_string;
+  static std::unordered_map<std::string, ChecksumType>
+      checksum_type_string_map;
+  static std::unordered_map<std::string, CompressionType>
+      compression_type_string_map;
+  static std::unordered_map<std::string, PrepopulateBlobCache>
+      prepopulate_blob_cache_string_map;
+  static std::unordered_map<std::string, CompactionStopStyle>
+      compaction_stop_style_string_map;
+  static std::unordered_map<std::string, EncodingType>
+      encoding_type_string_map;
+  static std::unordered_map<std::string, CompactionStyle>
+      compaction_style_string_map;
+  static std::unordered_map<std::string, CompactionPri>
+      compaction_pri_string_map;
+  static std::unordered_map<std::string, Temperature> temperature_string_map;
+};
+
+// Some aliasing
+static auto& compaction_style_to_string =
+    OptionsHelper::compaction_style_to_string;
+static auto& compaction_pri_to_string = OptionsHelper::compaction_pri_to_string;
+static auto& compaction_stop_style_to_string =
+    OptionsHelper::compaction_stop_style_to_string;
+static auto& temperature_to_string = OptionsHelper::temperature_to_string;
+static auto& checksum_type_string_map = OptionsHelper::checksum_type_string_map;
+static auto& compaction_stop_style_string_map =
+    OptionsHelper::compaction_stop_style_string_map;
+static auto& compression_type_string_map =
+    OptionsHelper::compression_type_string_map;
+static auto& encoding_type_string_map = OptionsHelper::encoding_type_string_map;
+static auto& compaction_style_string_map =
+    OptionsHelper::compaction_style_string_map;
+static auto& compaction_pri_string_map =
+    OptionsHelper::compaction_pri_string_map;
+static auto& temperature_string_map = OptionsHelper::temperature_string_map;
+static auto& prepopulate_blob_cache_string_map =
+    OptionsHelper::prepopulate_blob_cache_string_map;
+
+}  // namespace ROCKSDB_NAMESPACE
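Taken together, the declarations above expose a small string-based facade over the options structs. A minimal sketch of a caller (illustrative only; GetStringFromCompressionType is declared in rocksdb/convenience.h upstream, and error handling is elided):

  #include <cstdio>
  #include "options/options_helper.h"

  using namespace ROCKSDB_NAMESPACE;

  // Print the name of every compression type compiled into this build.
  void ListSupportedCompressions() {
    for (CompressionType t : GetSupportedCompressions()) {
      std::string name;
      if (GetStringFromCompressionType(&name, t).ok()) {
        std::printf("%s\n", name.c_str());
      }
    }
  }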
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_parser.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_parser.cc
new file mode 100644
index 0000000000..a8c855d6e2
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_parser.cc
@@ -0,0 +1,736 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include "options/options_parser.h"
+
+#include <cmath>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "file/line_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "options/db_options.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/options_type.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const std::string option_file_header =
+    "# This is a RocksDB option file.\n"
+    "#\n"
+    "# For detailed file format spec, please refer to the example file\n"
+    "# in examples/rocksdb_option_file_example.ini\n"
+    "#\n"
+    "\n";
+
+Status PersistRocksDBOptions(const DBOptions& db_opt,
+                             const std::vector<std::string>& cf_names,
+                             const std::vector<ColumnFamilyOptions>& cf_opts,
+                             const std::string& file_name, FileSystem* fs) {
+  ConfigOptions
+      config_options;  // Use default for escaped(true) and check (exact)
+  config_options.delimiter = "\n  ";
+  // Do not invoke PrepareOptions when we are doing validation.
+  config_options.invoke_prepare_options = false;
+  // If a readahead size was set in the input options, use it
+  if (db_opt.log_readahead_size > 0) {
+    config_options.file_readahead_size = db_opt.log_readahead_size;
+  }
+  return PersistRocksDBOptions(config_options, db_opt, cf_names, cf_opts,
+                               file_name, fs);
+}
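+// A usage sketch for the convenience overload above (illustrative only; the
+// target path is an assumption):
+//
+//   DBOptions db_opt;
+//   ColumnFamilyOptions cf_opt;
+//   Status s = PersistRocksDBOptions(
+//       db_opt, {kDefaultColumnFamilyName}, {cf_opt}, "/tmp/OPTIONS-example",
+//       Env::Default()->GetFileSystem().get());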
+
+Status PersistRocksDBOptions(const ConfigOptions& config_options_in,
+                             const DBOptions& db_opt,
+                             const std::vector<std::string>& cf_names,
+                             const std::vector<ColumnFamilyOptions>& cf_opts,
+                             const std::string& file_name, FileSystem* fs) {
+  ConfigOptions config_options = config_options_in;
+  config_options.delimiter = "\n  ";  // Override the default to nl
+
+  TEST_SYNC_POINT("PersistRocksDBOptions:start");
+  if (cf_names.size() != cf_opts.size()) {
+    return Status::InvalidArgument(
+        "cf_names.size() and cf_opts.size() must be the same");
+  }
+  std::unique_ptr<FSWritableFile> wf;
+
+  Status s = fs->NewWritableFile(file_name, FileOptions(), &wf, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  std::unique_ptr<WritableFileWriter> writable;
+  writable.reset(new WritableFileWriter(std::move(wf), file_name, EnvOptions(),
+                                        nullptr /* statistics */));
+  TEST_SYNC_POINT("PersistRocksDBOptions:create");
+
+  std::string options_file_content;
+
+  s = writable->Append(
+      option_file_header + "[" + opt_section_titles[kOptionSectionVersion] +
+      "]\n"
+      "  rocksdb_version=" +
+      std::to_string(ROCKSDB_MAJOR) + "." + std::to_string(ROCKSDB_MINOR) +
+      "." + std::to_string(ROCKSDB_PATCH) + "\n");
+  if (s.ok()) {
+    s = writable->Append(
+        "  options_file_version=" + std::to_string(ROCKSDB_OPTION_FILE_MAJOR) +
+        "." + std::to_string(ROCKSDB_OPTION_FILE_MINOR) + "\n");
+  }
+  if (s.ok()) {
+    s = writable->Append("\n[" + opt_section_titles[kOptionSectionDBOptions] +
+                         "]\n  ");
+  }
+
+  if (s.ok()) {
+    s = GetStringFromDBOptions(config_options, db_opt, &options_file_content);
+  }
+  if (s.ok()) {
+    s = writable->Append(options_file_content + "\n");
+  }
+
+  for (size_t i = 0; s.ok() && i < cf_opts.size(); ++i) {
+    // CFOptions section
+    s = writable->Append("\n[" + opt_section_titles[kOptionSectionCFOptions] +
+                         " \"" + EscapeOptionString(cf_names[i]) + "\"]\n  ");
+    if (s.ok()) {
+      s = GetStringFromColumnFamilyOptions(config_options, cf_opts[i],
+                                           &options_file_content);
+    }
+    if (s.ok()) {
+      s = writable->Append(options_file_content + "\n");
+    }
+    // TableOptions section
+    auto* tf = cf_opts[i].table_factory.get();
+    if (tf != nullptr) {
+      if (s.ok()) {
+        s = writable->Append(
+            "[" + opt_section_titles[kOptionSectionTableOptions] + tf->Name() +
+            " \"" + EscapeOptionString(cf_names[i]) + "\"]\n  ");
+      }
+      if (s.ok()) {
+        options_file_content.clear();
+        s = tf->GetOptionString(config_options, &options_file_content);
+      }
+      if (s.ok()) {
+        s = writable->Append(options_file_content + "\n");
+      }
+    }
+  }
+  if (s.ok()) {
+    s = writable->Sync(true /* use_fsync */);
+  }
+  if (s.ok()) {
+    s = writable->Close();
+  }
+  TEST_SYNC_POINT("PersistRocksDBOptions:written");
+  if (s.ok()) {
+    return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile(
+        config_options, db_opt, cf_names, cf_opts, file_name, fs);
+  }
+  return s;
+}
+
+RocksDBOptionsParser::RocksDBOptionsParser() { Reset(); }
+
+void RocksDBOptionsParser::Reset() {
+  db_opt_ = DBOptions();
+  db_opt_map_.clear();
+  cf_names_.clear();
+  cf_opts_.clear();
+  cf_opt_maps_.clear();
+  has_version_section_ = false;
+  has_db_options_ = false;
+  has_default_cf_options_ = false;
+  for (int i = 0; i < 3; ++i) {
+    db_version[i] = 0;
+    opt_file_version[i] = 0;
+  }
+}
+
+bool RocksDBOptionsParser::IsSection(const std::string& line) {
+  if (line.size() < 2) {
+    return false;
+  }
+  if (line[0] != '[' || line[line.size() - 1] != ']') {
+    return false;
+  }
+  return true;
+}
+
+Status RocksDBOptionsParser::ParseSection(OptionSection* section,
+                                          std::string* title,
+                                          std::string* argument,
+                                          const std::string& line,
+                                          const int line_num) {
+  *section = kOptionSectionUnknown;
+  // A section is of the form [<SectionName> "<SectionArg>"], where
+  // "<SectionArg>" is optional.
+  size_t arg_start_pos = line.find("\"");
+  size_t arg_end_pos = line.rfind("\"");
+  // The following if-then check tries to identify whether the input
+  // section has the optional section argument.
+  if (arg_start_pos != std::string::npos && arg_start_pos != arg_end_pos) {
+    *title = TrimAndRemoveComment(line.substr(1, arg_start_pos - 1), true);
+    *argument = UnescapeOptionString(
+        line.substr(arg_start_pos + 1, arg_end_pos - arg_start_pos - 1));
+  } else {
+    *title = TrimAndRemoveComment(line.substr(1, line.size() - 2), true);
+    *argument = "";
+  }
+  for (int i = 0; i < kOptionSectionUnknown; ++i) {
+    if (title->find(opt_section_titles[i]) == 0) {
+      if (i == kOptionSectionVersion || i == kOptionSectionDBOptions ||
+          i == kOptionSectionCFOptions) {
+        if (title->size() == opt_section_titles[i].size()) {
+          // if true, then it indicates an exact match
+          *section = static_cast<OptionSection>(i);
+          return CheckSection(*section, *argument, line_num);
+        }
+      } else if (i == kOptionSectionTableOptions) {
+        // This type of section has a suffix at the end of the
+        // section title
+        if (title->size() > opt_section_titles[i].size()) {
+          *section = static_cast<OptionSection>(i);
+          return CheckSection(*section, *argument, line_num);
+        }
+      }
+    }
+  }
+  return Status::InvalidArgument(std::string("Unknown section ") + line);
+}
+
+Status RocksDBOptionsParser::InvalidArgument(const int line_num,
+                                             const std::string& message) {
+  return Status::InvalidArgument(
+      "[RocksDBOptionsParser Error] ",
+      message + " (at line " + std::to_string(line_num) + ")");
+}
+
+Status RocksDBOptionsParser::ParseStatement(std::string* name,
+                                            std::string* value,
+                                            const std::string& line,
+                                            const int line_num) {
+  size_t eq_pos = line.find("=");
+  if (eq_pos == std::string::npos) {
+    return InvalidArgument(line_num, "A valid statement must have a '='.");
+  }
+
+  *name = TrimAndRemoveComment(line.substr(0, eq_pos), true);
+  *value =
+      TrimAndRemoveComment(line.substr(eq_pos + 1, line.size() - eq_pos - 1));
+  if (name->empty()) {
+    return InvalidArgument(line_num,
+                           "A valid statement must have a variable name.");
+  }
+  return Status::OK();
+}
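+// The two line forms ParseSection/ParseStatement accept (illustrative only):
+//
+//   [CFOptions "default"]        <- section title plus optional quoted argument
+//   write_buffer_size=4194304    <- statement; '#' starts a comment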
+
+Status RocksDBOptionsParser::Parse(const std::string& file_name, FileSystem* fs,
+                                   bool ignore_unknown_options,
+                                   size_t file_readahead_size) {
+  ConfigOptions
+      config_options;  // Use default for escaped(true) and check (exact)
+  config_options.ignore_unknown_options = ignore_unknown_options;
+  if (file_readahead_size > 0) {
+    config_options.file_readahead_size = file_readahead_size;
+  }
+  return Parse(config_options, file_name, fs);
+}
+
+Status RocksDBOptionsParser::Parse(const ConfigOptions& config_options_in,
+                                   const std::string& file_name,
+                                   FileSystem* fs) {
+  Reset();
+  ConfigOptions config_options = config_options_in;
+
+  std::unique_ptr<FSSequentialFile> seq_file;
+  Status s = fs->NewSequentialFile(file_name, FileOptions(), &seq_file,
+                                   nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  LineFileReader lf_reader(std::move(seq_file), file_name,
+                           config_options.file_readahead_size);
+
+  OptionSection section = kOptionSectionUnknown;
+  std::string title;
+  std::string argument;
+  std::unordered_map<std::string, std::string> opt_map;
+  std::string line;
+  // we only support single-line statements
+  while (lf_reader.ReadLine(&line,
+                            Env::IO_TOTAL /* rate_limiter_priority */)) {
+    int line_num = static_cast<int>(lf_reader.GetLineNumber());
+    line = TrimAndRemoveComment(line);
+    if (line.empty()) {
+      continue;
+    }
+    if (IsSection(line)) {
+      s = EndSection(config_options, section, title, argument, opt_map);
+      opt_map.clear();
+      if (!s.ok()) {
+        return s;
+      }
+
+      // If the option file is not generated by a higher minor version,
+      // there shouldn't be any unknown option.
+      if (config_options.ignore_unknown_options &&
+          section == kOptionSectionVersion) {
+        if (db_version[0] < ROCKSDB_MAJOR || (db_version[0] == ROCKSDB_MAJOR &&
+                                              db_version[1] <= ROCKSDB_MINOR)) {
+          config_options.ignore_unknown_options = false;
+        }
+      }
+
+      s = ParseSection(&section, &title, &argument, line, line_num);
+      if (!s.ok()) {
+        return s;
+      }
+    } else {
+      std::string name;
+      std::string value;
+      s = ParseStatement(&name, &value, line, line_num);
+      if (!s.ok()) {
+        return s;
+      }
+      opt_map.insert({name, value});
+    }
+  }
+  s = lf_reader.GetStatus();
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = EndSection(config_options, section, title, argument, opt_map);
+  opt_map.clear();
+  if (!s.ok()) {
+    return s;
+  }
+  return ValidityCheck();
+}
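+// The section ordering that CheckSection/ValidityCheck below enforce, as a
+// minimal option-file skeleton (illustrative only; the values are examples):
+//
+//   [Version]
+//     rocksdb_version=8.3.2
+//     options_file_version=1.1
+//   [DBOptions]
+//     max_open_files=-1
+//   [CFOptions "default"]
+//     compression=kZSTD
+//   [TableOptions/BlockBasedTable "default"]
+//     block_size=4096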
Column Family Name:") + + section_arg); + } + } else if (section == kOptionSectionVersion) { + if (has_version_section_) { + return InvalidArgument( + line_num, + "More than one Version section found in the option config file."); + } + has_version_section_ = true; + } + return Status::OK(); +} + +Status RocksDBOptionsParser::ParseVersionNumber(const std::string& ver_name, + const std::string& ver_string, + const int max_count, + int* version) { + int version_index = 0; + int current_number = 0; + int current_digit_count = 0; + bool has_dot = false; + for (int i = 0; i < max_count; ++i) { + version[i] = 0; + } + constexpr int kBufferSize = 200; + char buffer[kBufferSize]; + for (size_t i = 0; i < ver_string.size(); ++i) { + if (ver_string[i] == '.') { + if (version_index >= max_count - 1) { + snprintf(buffer, sizeof(buffer) - 1, + "A valid %s can only contains at most %d dots.", + ver_name.c_str(), max_count - 1); + return Status::InvalidArgument(buffer); + } + if (current_digit_count == 0) { + snprintf(buffer, sizeof(buffer) - 1, + "A valid %s must have at least one digit before each dot.", + ver_name.c_str()); + return Status::InvalidArgument(buffer); + } + version[version_index++] = current_number; + current_number = 0; + current_digit_count = 0; + has_dot = true; + } else if (isdigit(ver_string[i])) { + current_number = current_number * 10 + (ver_string[i] - '0'); + current_digit_count++; + } else { + snprintf(buffer, sizeof(buffer) - 1, + "A valid %s can only contains dots and numbers.", + ver_name.c_str()); + return Status::InvalidArgument(buffer); + } + } + version[version_index] = current_number; + if (has_dot && current_digit_count == 0) { + snprintf(buffer, sizeof(buffer) - 1, + "A valid %s must have at least one digit after each dot.", + ver_name.c_str()); + return Status::InvalidArgument(buffer); + } + return Status::OK(); +} + +Status RocksDBOptionsParser::EndSection( + const ConfigOptions& config_options, const OptionSection section, + const std::string& section_title, const std::string& section_arg, + const std::unordered_map& opt_map) { + Status s; + if (section == kOptionSectionDBOptions) { + s = GetDBOptionsFromMap(config_options, DBOptions(), opt_map, &db_opt_); + if (!s.ok()) { + return s; + } + db_opt_map_ = opt_map; + } else if (section == kOptionSectionCFOptions) { + // This condition should be ensured earlier in ParseSection + // so we make an assertion here. + assert(GetCFOptions(section_arg) == nullptr); + cf_names_.emplace_back(section_arg); + cf_opts_.emplace_back(); + s = GetColumnFamilyOptionsFromMap(config_options, ColumnFamilyOptions(), + opt_map, &cf_opts_.back()); + if (!s.ok()) { + return s; + } + // keep the parsed string. 
+ cf_opt_maps_.emplace_back(opt_map); + } else if (section == kOptionSectionTableOptions) { + assert(GetCFOptions(section_arg) != nullptr); + auto* cf_opt = GetCFOptionsImpl(section_arg); + if (cf_opt == nullptr) { + return Status::InvalidArgument( + "The specified column family must be defined before the " + "TableOptions section:", + section_arg); + } + // Ignore error as table factory deserialization is optional + cf_opt->table_factory.reset(); + s = TableFactory::CreateFromString( + config_options, + section_title.substr( + opt_section_titles[kOptionSectionTableOptions].size()), + &(cf_opt->table_factory)); + if (s.ok() && cf_opt->table_factory != nullptr) { + s = cf_opt->table_factory->ConfigureFromMap(config_options, opt_map); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); + } + } else { + // Return OK for not supported table factories as TableFactory + // Deserialization is optional. + cf_opt->table_factory.reset(); + return Status::OK(); + } + } else if (section == kOptionSectionVersion) { + for (const auto& pair : opt_map) { + if (pair.first == "rocksdb_version") { + s = ParseVersionNumber(pair.first, pair.second, 3, db_version); + if (!s.ok()) { + return s; + } + } else if (pair.first == "options_file_version") { + s = ParseVersionNumber(pair.first, pair.second, 2, opt_file_version); + if (!s.ok()) { + return s; + } + if (opt_file_version[0] < 1) { + return Status::InvalidArgument( + "A valid options_file_version must be at least 1."); + } + } + } + } + return s; +} + +Status RocksDBOptionsParser::ValidityCheck() { + if (!has_db_options_) { + return Status::Corruption( + "A RocksDB Option file must have a single DBOptions section"); + } + if (!has_default_cf_options_) { + return Status::Corruption( + "A RocksDB Option file must have a single CFOptions:default section"); + } + + return Status::OK(); +} + +std::string RocksDBOptionsParser::TrimAndRemoveComment(const std::string& line, + bool trim_only) { + size_t start = 0; + size_t end = line.size(); + + // we only support "#" style comment + if (!trim_only) { + size_t search_pos = 0; + while (search_pos < line.size()) { + size_t comment_pos = line.find('#', search_pos); + if (comment_pos == std::string::npos) { + break; + } + if (comment_pos == 0 || line[comment_pos - 1] != '\\') { + end = comment_pos; + break; + } + search_pos = comment_pos + 1; + } + } + + while (start < end && isspace(line[start]) != 0) { + ++start; + } + + // start < end implies end > 0. 
+ while (start < end && isspace(line[end - 1]) != 0) { + --end; + } + + if (start < end) { + return line.substr(start, end - start); + } + + return ""; +} + +Status RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( + const ConfigOptions& config_options_in, const DBOptions& db_opt, + const std::vector& cf_names, + const std::vector& cf_opts, + const std::string& file_name, FileSystem* fs) { + RocksDBOptionsParser parser; + ConfigOptions config_options = config_options_in; + config_options.invoke_prepare_options = + false; // No need to do a prepare for verify + if (config_options.sanity_level < ConfigOptions::kSanityLevelExactMatch) { + // If we are not doing an exact comparison, we should ignore + // unsupported options, as they may cause the Parse to fail + // (if the ObjectRegistry is not initialized) + config_options.ignore_unsupported_options = true; + } + Status s = parser.Parse(config_options, file_name, fs); + if (!s.ok()) { + return s; + } + + // Verify DBOptions + s = VerifyDBOptions(config_options, db_opt, *parser.db_opt(), + parser.db_opt_map()); + if (!s.ok()) { + return s; + } + + // Verify ColumnFamily Name + if (cf_names.size() != parser.cf_names()->size()) { + if (config_options.sanity_level >= + ConfigOptions::kSanityLevelLooselyCompatible) { + return Status::InvalidArgument( + "[RocksDBOptionParser Error] The persisted options does not have " + "the same number of column family names as the db instance."); + } else if (cf_opts.size() > parser.cf_opts()->size()) { + return Status::InvalidArgument( + "[RocksDBOptionsParser Error]", + "The persisted options file has less number of column family " + "names than that of the specified one."); + } + } + for (size_t i = 0; i < cf_names.size(); ++i) { + if (cf_names[i] != parser.cf_names()->at(i)) { + return Status::InvalidArgument( + "[RocksDBOptionParser Error] The persisted options and the db" + "instance does not have the same name for column family ", + std::to_string(i)); + } + } + + // Verify Column Family Options + if (cf_opts.size() != parser.cf_opts()->size()) { + if (config_options.sanity_level >= + ConfigOptions::kSanityLevelLooselyCompatible) { + return Status::InvalidArgument( + "[RocksDBOptionsParser Error]", + "The persisted options does not have the same number of " + "column families as the db instance."); + } else if (cf_opts.size() > parser.cf_opts()->size()) { + return Status::InvalidArgument( + "[RocksDBOptionsParser Error]", + "The persisted options file has less number of column families " + "than that of the specified number."); + } + } + for (size_t i = 0; i < cf_opts.size(); ++i) { + s = VerifyCFOptions(config_options, cf_opts[i], parser.cf_opts()->at(i), + &(parser.cf_opt_maps()->at(i))); + if (!s.ok()) { + return s; + } + s = VerifyTableFactory(config_options, cf_opts[i].table_factory.get(), + parser.cf_opts()->at(i).table_factory.get()); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); +} + +Status RocksDBOptionsParser::VerifyDBOptions( + const ConfigOptions& config_options, const DBOptions& base_opt, + const DBOptions& file_opt, + const std::unordered_map* opt_map) { + auto base_config = DBOptionsAsConfigurable(base_opt, opt_map); + auto file_config = DBOptionsAsConfigurable(file_opt, opt_map); + std::string mismatch; + if (!base_config->AreEquivalent(config_options, file_config.get(), + &mismatch)) { + const size_t kBufferSize = 2048; + char buffer[kBufferSize]; + std::string base_value; + std::string file_value; + int offset = snprintf(buffer, sizeof(buffer), + 
"[RocksDBOptionsParser]: " + "failed the verification on DBOptions::%s -- ", + mismatch.c_str()); + Status s = base_config->GetOption(config_options, mismatch, &base_value); + if (s.ok()) { + s = file_config->GetOption(config_options, mismatch, &file_value); + } + assert(offset >= 0); + assert(static_cast(offset) < sizeof(buffer)); + if (s.ok()) { + snprintf(buffer + offset, sizeof(buffer) - static_cast(offset), + "-- The specified one is %s while the persisted one is %s.\n", + base_value.c_str(), file_value.c_str()); + } else { + snprintf(buffer + offset, sizeof(buffer) - static_cast(offset), + "-- Unable to re-serialize an option: %s.\n", + s.ToString().c_str()); + } + return Status::InvalidArgument(Slice(buffer, strlen(buffer))); + } + return Status::OK(); +} + +Status RocksDBOptionsParser::VerifyCFOptions( + const ConfigOptions& config_options, const ColumnFamilyOptions& base_opt, + const ColumnFamilyOptions& file_opt, + const std::unordered_map* opt_map) { + auto base_config = CFOptionsAsConfigurable(base_opt, opt_map); + auto file_config = CFOptionsAsConfigurable(file_opt, opt_map); + std::string mismatch; + if (!base_config->AreEquivalent(config_options, file_config.get(), + &mismatch)) { + std::string base_value; + std::string file_value; + // The options do not match + const size_t kBufferSize = 2048; + char buffer[kBufferSize]; + Status s = base_config->GetOption(config_options, mismatch, &base_value); + if (s.ok()) { + s = file_config->GetOption(config_options, mismatch, &file_value); + // In file_opt, certain options like MergeOperator may be nullptr due to + // factor methods not available. So we use opt_map to get + // option value to use in the error message below. + if (s.ok() && file_value == kNullptrString && opt_map) { + auto const& opt_val_str = (opt_map->find(mismatch)); + if (opt_val_str != opt_map->end()) { + file_value = opt_val_str->second; + } + } + } + int offset = snprintf(buffer, sizeof(buffer), + "[RocksDBOptionsParser]: " + "failed the verification on ColumnFamilyOptions::%s", + mismatch.c_str()); + assert(offset >= 0); + assert(static_cast(offset) < sizeof(buffer)); + if (s.ok()) { + snprintf(buffer + offset, sizeof(buffer) - static_cast(offset), + "--- The specified one is %s while the persisted one is %s.\n", + base_value.c_str(), file_value.c_str()); + } else { + snprintf(buffer + offset, sizeof(buffer) - static_cast(offset), + "--- Unable to re-serialize an option: %s.\n", + s.ToString().c_str()); + } + return Status::InvalidArgument(Slice(buffer, sizeof(buffer))); + } // For each option + return Status::OK(); +} + +Status RocksDBOptionsParser::VerifyTableFactory( + const ConfigOptions& config_options, const TableFactory* base_tf, + const TableFactory* file_tf) { + std::string mismatch; + if (base_tf && file_tf) { + if (config_options.sanity_level > ConfigOptions::kSanityLevelNone && + std::string(base_tf->Name()) != std::string(file_tf->Name())) { + return Status::Corruption( + "[RocksDBOptionsParser]: " + "failed the verification on TableFactory->Name()"); + } else if (!base_tf->AreEquivalent(config_options, file_tf, &mismatch)) { + return Status::Corruption(std::string("[RocksDBOptionsParser]:" + "failed the verification on ") + + base_tf->Name() + "::", + mismatch); + } + } else { + // TODO(yhchiang): further support sanity check here + } + return Status::OK(); +} +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_parser.h 
b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_parser.h new file mode 100644 index 0000000000..4268051f34 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/options/options_parser.h @@ -0,0 +1,149 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +struct ConfigOptions; +class OptionTypeInfo; +class TableFactory; + +#define ROCKSDB_OPTION_FILE_MAJOR 1 +#define ROCKSDB_OPTION_FILE_MINOR 1 + +enum OptionSection : char { + kOptionSectionVersion = 0, + kOptionSectionDBOptions, + kOptionSectionCFOptions, + kOptionSectionTableOptions, + kOptionSectionUnknown +}; + +static const std::string opt_section_titles[] = { + "Version", "DBOptions", "CFOptions", "TableOptions/", "Unknown"}; + +Status PersistRocksDBOptions(const DBOptions& db_opt, + const std::vector& cf_names, + const std::vector& cf_opts, + const std::string& file_name, FileSystem* fs); +Status PersistRocksDBOptions(const ConfigOptions& config_options, + const DBOptions& db_opt, + const std::vector& cf_names, + const std::vector& cf_opts, + const std::string& file_name, FileSystem* fs); + +class RocksDBOptionsParser { + public: + explicit RocksDBOptionsParser(); + ~RocksDBOptionsParser() {} + void Reset(); + + // `file_readahead_size` is used for readahead for the option file. + // If 0 is given, a default value will be used. + Status Parse(const std::string& file_name, FileSystem* fs, + bool ignore_unknown_options, size_t file_readahead_size); + + Status Parse(const ConfigOptions& config_options, + const std::string& file_name, FileSystem* fs); + + static std::string TrimAndRemoveComment(const std::string& line, + const bool trim_only = false); + + const DBOptions* db_opt() const { return &db_opt_; } + const std::unordered_map* db_opt_map() const { + return &db_opt_map_; + } + const std::vector* cf_opts() const { return &cf_opts_; } + const std::vector* cf_names() const { return &cf_names_; } + const std::vector>* cf_opt_maps() + const { + return &cf_opt_maps_; + } + + const ColumnFamilyOptions* GetCFOptions(const std::string& name) { + return GetCFOptionsImpl(name); + } + size_t NumColumnFamilies() { return cf_opts_.size(); } + static Status VerifyRocksDBOptionsFromFile( + const ConfigOptions& config_options, const DBOptions& db_opt, + const std::vector& cf_names, + const std::vector& cf_opts, + const std::string& file_name, FileSystem* fs); + static Status VerifyDBOptions( + const ConfigOptions& config_options, const DBOptions& base_opt, + const DBOptions& new_opt, + const std::unordered_map* new_opt_map = + nullptr); + + static Status VerifyCFOptions( + const ConfigOptions& config_options, const ColumnFamilyOptions& base_opt, + const ColumnFamilyOptions& new_opt, + const std::unordered_map* new_opt_map = + nullptr); + + static Status VerifyTableFactory(const ConfigOptions& config_options, + const TableFactory* base_tf, + const TableFactory* file_tf); + + static Status ExtraParserCheck(const RocksDBOptionsParser& input_parser); + + static Status ParseStatement(std::string* name, std::string* value, + const std::string& line, const int line_num); + + protected: + bool IsSection(const std::string& line); + Status ParseSection(OptionSection* section, 
std::string* title, + std::string* argument, const std::string& line, + const int line_num); + + Status CheckSection(const OptionSection section, + const std::string& section_arg, const int line_num); + + Status EndSection( + const ConfigOptions& config_options, const OptionSection section, + const std::string& title, const std::string& section_arg, + const std::unordered_map& opt_map); + + Status ValidityCheck(); + + static Status InvalidArgument(const int line_num, const std::string& message); + + Status ParseVersionNumber(const std::string& ver_name, + const std::string& ver_string, const int max_count, + int* version); + + ColumnFamilyOptions* GetCFOptionsImpl(const std::string& name) { + assert(cf_names_.size() == cf_opts_.size()); + for (size_t i = 0; i < cf_names_.size(); ++i) { + if (cf_names_[i] == name) { + return &cf_opts_[i]; + } + } + return nullptr; + } + + private: + DBOptions db_opt_; + std::unordered_map db_opt_map_; + std::vector cf_names_; + std::vector cf_opts_; + std::vector> cf_opt_maps_; + bool has_version_section_; + bool has_db_options_; + bool has_default_cf_options_; + int db_version[3]; + int opt_file_version[3]; +}; + + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/jemalloc_helper.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/jemalloc_helper.h new file mode 100644 index 0000000000..f085f6226d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/jemalloc_helper.h @@ -0,0 +1,107 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#if defined(__clang__) && defined(__GLIBC__) +// glibc's `posix_memalign()` declaration specifies `throw()` while clang's +// declaration does not. There is a hack in clang to make its re-declaration +// compatible with glibc's if they are declared consecutively. That hack breaks +// if yet another `posix_memalign()` declaration comes between glibc's and +// clang's declarations. Include "mm_malloc.h" here ensures glibc's and clang's +// declarations both come before "jemalloc.h"'s `posix_memalign()` declaration. +// +// This problem could also be avoided if "jemalloc.h"'s `posix_memalign()` +// declaration did not specify `throw()` when built with clang. +#include +#endif + +#ifdef ROCKSDB_JEMALLOC +#ifdef __FreeBSD__ +#include +#define JEMALLOC_USABLE_SIZE_CONST const +#else +#define JEMALLOC_MANGLE +#include +#endif + +#ifndef JEMALLOC_CXX_THROW +#define JEMALLOC_CXX_THROW +#endif + +#if defined(OS_WIN) && defined(_MSC_VER) + +// MSVC does not have weak symbol support. As long as ROCKSDB_JEMALLOC is +// defined, Jemalloc memory allocator is used. +static inline bool HasJemalloc() { return true; } + +#else + +// definitions for compatibility with older versions of jemalloc +#if !defined(JEMALLOC_ALLOCATOR) +#define JEMALLOC_ALLOCATOR +#endif +#if !defined(JEMALLOC_RESTRICT_RETURN) +#define JEMALLOC_RESTRICT_RETURN +#endif +#if !defined(JEMALLOC_NOTHROW) +#define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +#endif +#if !defined(JEMALLOC_ALLOC_SIZE) +#ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +#define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +#else +#define JEMALLOC_ALLOC_SIZE(s) +#endif +#endif + +// Declare non-standard jemalloc APIs as weak symbols. 
We can null-check these +// symbols to detect whether jemalloc is linked with the binary. +extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * +mallocx(size_t, int) JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) + __attribute__((__weak__)); +extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * +rallocx(void *, size_t, int) JEMALLOC_ALLOC_SIZE(2) __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW xallocx(void *, size_t, size_t, int) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW sallocx(const void *, int) + JEMALLOC_ATTR(pure) __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW dallocx(void *, int) __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW sdallocx(void *, size_t, int) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW nallocx(size_t, int) JEMALLOC_ATTR(pure) + __attribute__((__weak__)); +extern "C" int JEMALLOC_NOTHROW mallctl(const char *, void *, size_t *, void *, + size_t) __attribute__((__weak__)); +extern "C" int JEMALLOC_NOTHROW mallctlnametomib(const char *, size_t *, + size_t *) + __attribute__((__weak__)); +extern "C" int JEMALLOC_NOTHROW mallctlbymib(const size_t *, size_t, void *, + size_t *, void *, size_t) + __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW +malloc_stats_print(void (*)(void *, const char *), void *, const char *) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW +malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *) JEMALLOC_CXX_THROW + __attribute__((__weak__)); + +// Check if Jemalloc is linked with the binary. Note the main program might be +// using a different memory allocator even this method return true. +// It is loosely based on folly::usingJEMalloc(), minus the check that actually +// allocate memory and see if it is through jemalloc, to handle the dlopen() +// case: +// https://github.com/facebook/folly/blob/76cf8b5841fb33137cfbf8b224f0226437c855bc/folly/memory/Malloc.h#L147 +static inline bool HasJemalloc() { + return mallocx != nullptr && rallocx != nullptr && xallocx != nullptr && + sallocx != nullptr && dallocx != nullptr && sdallocx != nullptr && + nallocx != nullptr && mallctl != nullptr && + mallctlnametomib != nullptr && mallctlbymib != nullptr && + malloc_stats_print != nullptr && malloc_usable_size != nullptr; +} + +#endif + +#endif // ROCKSDB_JEMALLOC diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/lang.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/lang.h new file mode 100644 index 0000000000..a4201ca3b2 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/lang.h @@ -0,0 +1,97 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
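// [editor's note] A minimal sketch of the weak-symbol detection pattern used
// by jemalloc_helper.h above, assuming a GCC/Clang ELF toolchain; the symbol
// optional_feature_init and the helper are hypothetical, not vendored code.
extern "C" void optional_feature_init(void) __attribute__((__weak__));

static inline bool HasOptionalFeature() {
  // A weak declaration resolves to a null address unless some linked object
  // actually defines the symbol, so a null check doubles as link detection.
  return optional_feature_init != nullptr;
}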
+ +#pragma once + +#ifndef FALLTHROUGH_INTENDED +#if defined(__clang__) +#define FALLTHROUGH_INTENDED [[clang::fallthrough]] +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define FALLTHROUGH_INTENDED [[gnu::fallthrough]] +#else +#define FALLTHROUGH_INTENDED \ + do { \ + } while (0) +#endif +#endif + +#define DECLARE_DEFAULT_MOVES(Name) \ + Name(Name&&) noexcept = default; \ + Name& operator=(Name&&) = default + +// ASAN (Address sanitizer) + +#if defined(__clang__) +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define MUST_FREE_HEAP_ALLOCATIONS 1 +#endif // __has_feature(address_sanitizer) +#endif // defined(__has_feature) +#else // __clang__ +#ifdef __SANITIZE_ADDRESS__ +#define MUST_FREE_HEAP_ALLOCATIONS 1 +#endif // __SANITIZE_ADDRESS__ +#endif // __clang__ + +#ifdef ROCKSDB_VALGRIND_RUN +#define MUST_FREE_HEAP_ALLOCATIONS 1 +#endif // ROCKSDB_VALGRIND_RUN + +// Coding guidelines say to avoid static objects with non-trivial destructors, +// because it's easy to cause trouble (UB) in static destruction. This +// macro makes it easier to define static objects that are normally never +// destructed, except are destructed when running under ASAN. This should +// avoid unexpected, unnecessary destruction behavior in production. +// Note that constructor arguments can be provided as in +// STATIC_AVOID_DESTRUCTION(Foo, foo)(arg1, arg2); +#ifdef MUST_FREE_HEAP_ALLOCATIONS +#define STATIC_AVOID_DESTRUCTION(Type, name) static Type name +constexpr bool kMustFreeHeapAllocations = true; +#else +#define STATIC_AVOID_DESTRUCTION(Type, name) static Type& name = *new Type +constexpr bool kMustFreeHeapAllocations = false; +#endif + +// TSAN (Thread sanitizer) + +// For simplicity, standardize on the GCC define +#if defined(__clang__) +#if defined(__has_feature) && __has_feature(thread_sanitizer) +#define __SANITIZE_THREAD__ 1 +#endif // __has_feature(thread_sanitizer) +#endif // __clang__ + +#ifdef __SANITIZE_THREAD__ +#define TSAN_SUPPRESSION __attribute__((no_sanitize("thread"))) +#else +#define TSAN_SUPPRESSION +#endif // TSAN_SUPPRESSION + +// Compile-time CPU feature testing compatibility +// +// A way to be extra sure these defines have been included. +#define ASSERT_FEATURE_COMPAT_HEADER() /* empty */ + +// MSVC doesn't support the same defines that gcc and clang provide +// but does some like __AVX__. Here we can infer some features from others. +#ifdef __AVX__ +#define __SSE4_2__ 1 +#define __PCLMUL__ 1 +#endif // __AVX__ + +// A way to disable PCLMUL +#ifdef NO_PCLMUL +#undef __PCLMUL__ +#endif + +// popcnt is generally implied by SSE4.2 +#if defined(__SSE4_2__) +#define __POPCNT__ 1 +#endif + +// A way to disable POPCNT +#ifdef NO_POPCNT +#undef __POPCNT__ +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/likely.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/likely.h new file mode 100644 index 0000000000..0bd90d7015 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/likely.h @@ -0,0 +1,18 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
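// [editor's note] Usage sketch for two of the lang.h helpers above; an
// editorial illustration only. CategoryOf/SharedName are hypothetical and
// <string> is assumed to be included.
static int CategoryOf(int code) {
  switch (code) {
    case 0:
      return 1;
    case 1:
      FALLTHROUGH_INTENDED;  // deliberate fallthrough, keeps compilers quiet
    case 2:
      return 2;
    default:
      return 0;
  }
}

static std::string& SharedName() {
  // Heap-allocate-and-leak in production so static destruction order can
  // never bite; ASAN/valgrind builds destruct normally to stay leak-clean.
  STATIC_AVOID_DESTRUCTION(std::string, name);
  return name;
}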
+ +#pragma once + +#if defined(__GNUC__) && __GNUC__ >= 4 +#define LIKELY(x) (__builtin_expect((x), 1)) +#define UNLIKELY(x) (__builtin_expect((x), 0)) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/malloc.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/malloc.h new file mode 100644 index 0000000000..f973263e2a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/malloc.h @@ -0,0 +1,17 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include +#else +#include +#endif // OS_FREEBSD +#endif // ROCKSDB_MALLOC_USABLE_SIZE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/mmap.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/mmap.cc new file mode 100644 index 0000000000..36e8f32617 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/mmap.cc @@ -0,0 +1,98 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "port/mmap.h" + +#include +#include +#include +#include +#include + +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +MemMapping::~MemMapping() { +#ifdef OS_WIN + if (addr_ != nullptr) { + (void)::UnmapViewOfFile(addr_); + } + if (page_file_handle_ != NULL) { + (void)::CloseHandle(page_file_handle_); + } +#else // OS_WIN -> !OS_WIN + if (addr_ != nullptr) { + auto status = munmap(addr_, length_); + assert(status == 0); + if (status != 0) { + // TODO: handle error? 
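// [editor's note] Typical use of the LIKELY/UNLIKELY hints from port/likely.h
// above (editorial sketch): wrap branch conditions the caller expects to be
// rare (or common) so the compiler lays out the hot path first, e.g.
//   if (UNLIKELY(ptr == nullptr)) { return HandleRareError(); }
//   if (LIKELY(cache_hit)) { return cached_value; }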
+ } + } +#endif // OS_WIN +} + +MemMapping::MemMapping(MemMapping&& other) noexcept { + *this = std::move(other); +} + +MemMapping& MemMapping::operator=(MemMapping&& other) noexcept { + if (&other == this) { + return *this; + } + this->~MemMapping(); + std::memcpy(this, &other, sizeof(*this)); + new (&other) MemMapping(); + return *this; +} + +MemMapping MemMapping::AllocateAnonymous(size_t length, bool huge) { + MemMapping mm; + mm.length_ = length; + assert(mm.addr_ == nullptr); + if (length == 0) { + // OK to leave addr as nullptr + return mm; + } + int huge_flag = 0; +#ifdef OS_WIN + if (huge) { +#ifdef FILE_MAP_LARGE_PAGES + huge_flag = FILE_MAP_LARGE_PAGES; +#endif // FILE_MAP_LARGE_PAGES + } + mm.page_file_handle_ = ::CreateFileMapping( + INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE | SEC_COMMIT, + Upper32of64(length), Lower32of64(length), nullptr); + if (mm.page_file_handle_ == NULL) { + // Failure + return mm; + } + mm.addr_ = ::MapViewOfFile(mm.page_file_handle_, FILE_MAP_WRITE | huge_flag, + 0, 0, length); +#else // OS_WIN -> !OS_WIN + if (huge) { +#ifdef MAP_HUGETLB + huge_flag = MAP_HUGETLB; +#endif // MAP_HUGE_TLB + } + mm.addr_ = mmap(nullptr, length, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | huge_flag, -1, 0); + if (mm.addr_ == MAP_FAILED) { + mm.addr_ = nullptr; + } +#endif // OS_WIN + return mm; +} + +MemMapping MemMapping::AllocateHuge(size_t length) { + return AllocateAnonymous(length, /*huge*/ true); +} + +MemMapping MemMapping::AllocateLazyZeroed(size_t length) { + return AllocateAnonymous(length, /*huge*/ false); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/mmap.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/mmap.h new file mode 100644 index 0000000000..7342a13f96 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/mmap.h @@ -0,0 +1,70 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifdef OS_WIN +#include "port/win/port_win.h" +// ^^^ For proper/safe inclusion of windows.h. Must come first. +#include +#else +#include +#endif // OS_WIN + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// An RAII wrapper for mmaped memory +class MemMapping { + public: + static constexpr bool kHugePageSupported = +#if defined(MAP_HUGETLB) || defined(FILE_MAP_LARGE_PAGES) + true; +#else + false; +#endif + + // Allocate memory requesting to be backed by huge pages + static MemMapping AllocateHuge(size_t length); + + // Allocate memory that is only lazily mapped to resident memory and + // guaranteed to be zero-initialized. Note that some platforms like + // Linux allow memory over-commit, where only the used portion of memory + // matters, while other platforms require enough swap space (page file) to + // back the full mapping. 
+ static MemMapping AllocateLazyZeroed(size_t length); + + // No copies + MemMapping(const MemMapping&) = delete; + MemMapping& operator=(const MemMapping&) = delete; + // Move + MemMapping(MemMapping&&) noexcept; + MemMapping& operator=(MemMapping&&) noexcept; + + // Releases the mapping + ~MemMapping(); + + inline void* Get() const { return addr_; } + inline size_t Length() const { return length_; } + + private: + MemMapping() {} + + // The mapped memory, or nullptr on failure / not supported + void* addr_ = nullptr; + // The known usable number of bytes starting at that address + size_t length_ = 0; + +#ifdef OS_WIN + HANDLE page_file_handle_ = NULL; +#endif // OS_WIN + + static MemMapping AllocateAnonymous(size_t length, bool huge); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port.h new file mode 100644 index 0000000000..13aa56d47b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port.h @@ -0,0 +1,21 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +// Include the appropriate platform specific file below. If you are +// porting to a new platform, see "port_example.h" for documentation +// of what the new port_.h file must provide. +#if defined(ROCKSDB_PLATFORM_POSIX) +#include "port/port_posix.h" +#elif defined(OS_WIN) +#include "port/win/port_win.h" +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_dirent.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_dirent.h new file mode 100644 index 0000000000..2b23e2f076 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_dirent.h @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. 
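// [editor's note] Usage sketch for the MemMapping RAII wrapper declared in
// port/mmap.h above (editorial illustration; AllocateTable is hypothetical):
static ROCKSDB_NAMESPACE::MemMapping AllocateTable(size_t bytes) {
  // Prefer huge pages where the platform supports them; fall back to a
  // normal lazily-zeroed anonymous mapping when the huge allocation fails.
  if (ROCKSDB_NAMESPACE::MemMapping::kHugePageSupported) {
    auto mm = ROCKSDB_NAMESPACE::MemMapping::AllocateHuge(bytes);
    if (mm.Get() != nullptr) {
      return mm;  // moved out; ~MemMapping eventually releases the mapping
    }
  }
  return ROCKSDB_NAMESPACE::MemMapping::AllocateLazyZeroed(bytes);
}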
+ +#pragma once + +#ifdef ROCKSDB_PLATFORM_POSIX +#include +#include +#elif defined(OS_WIN) + +namespace ROCKSDB_NAMESPACE { +namespace port { + +struct dirent { + char d_name[_MAX_PATH]; /* filename */ +}; + +struct DIR; + +DIR* opendir(const char* name); + +dirent* readdir(DIR* dirp); + +int closedir(DIR* dirp); + +} // namespace port + +using port::closedir; +using port::DIR; +using port::dirent; +using port::opendir; +using port::readdir; + +} // namespace ROCKSDB_NAMESPACE + +#endif // OS_WIN diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_example.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_example.h new file mode 100644 index 0000000000..794149a690 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_example.h @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// This file contains the specification, but not the implementations, +// of the types/operations/etc. that should be defined by a platform +// specific port_.h file. Use this file as a reference for +// how to port this package to a new platform. + +#pragma once + +namespace ROCKSDB_NAMESPACE { +namespace port { + +// TODO(jorlow): Many of these belong more in the environment class rather than +// here. We should try moving them and see if it affects perf. + +// The following boolean constant must be true on a little-endian machine +// and false otherwise. +static const bool kLittleEndian = true /* or some other expression */; + +// ------------------ Threading ------------------- + +// A Mutex represents an exclusive lock. +class Mutex { + public: + Mutex(); + ~Mutex(); + + // Lock the mutex. Waits until other lockers have exited. + // Will deadlock if the mutex is already locked by this thread. + void Lock(); + + // Unlock the mutex. + // REQUIRES: This mutex was locked by this thread. + void Unlock(); + + // Optionally crash if this thread does not hold this mutex. + // The implementation must be fast, especially if NDEBUG is + // defined. The implementation is allowed to skip all checks. + void AssertHeld(); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + + // Atomically release *mu and block on this condition variable until + // either a call to SignalAll(), or a call to Signal() that picks + // this thread to wakeup. + // REQUIRES: this thread holds *mu + void Wait(); + + // If there are some threads waiting, wake up at least one of them. + void Signal(); + + // Wake up all waiting threads. + void SignallAll(); +}; + +// Thread-safe initialization. +// Used as follows: +// static port::OnceType init_control = LEVELDB_ONCE_INIT; +// static void Initializer() { ... do something ...; } +// ... +// port::InitOnce(&init_control, &Initializer); +using OnceType = intptr_t; +#define LEVELDB_ONCE_INIT 0 +extern void InitOnce(port::OnceType*, void (*initializer)()); + +// ------------------ Compression ------------------- + +// Store the snappy compression of "input[0,input_length-1]" in *output. +// Returns false if snappy is not supported by this port. 
+extern bool Snappy_Compress(const char* input, size_t input_length, + std::string* output); + +// If input[0,input_length-1] looks like a valid snappy compressed +// buffer, store the size of the uncompressed data in *result and +// return true. Else return false. +extern bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result); + +// Attempt to snappy uncompress input[0,input_length-1] into *output. +// Returns true if successful, false if the input is invalid lightweight +// compressed data. +// +// REQUIRES: at least the first "n" bytes of output[] must be writable +// where "n" is the result of a successful call to +// Snappy_GetUncompressedLength. +extern bool Snappy_Uncompress(const char* input_data, size_t input_length, + char* output); + +} // namespace port +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_posix.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_posix.cc new file mode 100644 index 0000000000..3872293b81 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_posix.cc @@ -0,0 +1,300 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#if !defined(OS_WIN) + +#include "port/port_posix.h" + +#include +#if defined(__i386__) || defined(__x86_64__) +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// We want to give users the opportunity to default all the mutexes to adaptive +// if not specified otherwise. This enables a quick way to conduct various +// performance-related experiments. +// +// NB! Support for adaptive mutexes is turned on by defining +// ROCKSDB_PTHREAD_ADAPTIVE_MUTEX during the compilation. If you use the +// RocksDB build environment then this happens automatically; otherwise it's up
// to the consumer to define the identifier.
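// [editor's note] Round-trip sketch for the Snappy_* interface specified in
// port_example.h above (editorial; a real port supplies the definitions):
//   std::string compressed;
//   if (port::Snappy_Compress(data, len, &compressed)) {
//     size_t ulen = 0;
//     if (port::Snappy_GetUncompressedLength(compressed.data(),
//                                            compressed.size(), &ulen)) {
//       std::string out(ulen, '\0');  // REQUIRES: ulen writable bytes
//       port::Snappy_Uncompress(compressed.data(), compressed.size(),
//                               &out[0]);
//     }
//   }  // a false return from Snappy_Compress means snappy is unsupported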
+#ifdef ROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX +extern const bool kDefaultToAdaptiveMutex = true; +#else +extern const bool kDefaultToAdaptiveMutex = false; +#endif + +namespace port { + +static int PthreadCall(const char* label, int result) { + if (result != 0 && result != ETIMEDOUT && result != EBUSY) { + fprintf(stderr, "pthread %s: %s\n", label, errnoStr(result).c_str()); + abort(); + } + return result; +} + +Mutex::Mutex(bool adaptive) { + (void)adaptive; +#ifdef ROCKSDB_PTHREAD_ADAPTIVE_MUTEX + if (!adaptive) { + PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr)); + } else { + pthread_mutexattr_t mutex_attr; + PthreadCall("init mutex attr", pthread_mutexattr_init(&mutex_attr)); + PthreadCall("set mutex attr", pthread_mutexattr_settype( + &mutex_attr, PTHREAD_MUTEX_ADAPTIVE_NP)); + PthreadCall("init mutex", pthread_mutex_init(&mu_, &mutex_attr)); + PthreadCall("destroy mutex attr", pthread_mutexattr_destroy(&mutex_attr)); + } +#else + PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr)); +#endif // ROCKSDB_PTHREAD_ADAPTIVE_MUTEX +} + +Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } + +void Mutex::Lock() { + PthreadCall("lock", pthread_mutex_lock(&mu_)); +#ifndef NDEBUG + locked_ = true; +#endif +} + +void Mutex::Unlock() { +#ifndef NDEBUG + locked_ = false; +#endif + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +bool Mutex::TryLock() { + bool ret = PthreadCall("trylock", pthread_mutex_trylock(&mu_)) == 0; +#ifndef NDEBUG + if (ret) { + locked_ = true; + } +#endif + return ret; +} + +void Mutex::AssertHeld() { +#ifndef NDEBUG + assert(locked_); +#endif +} + +CondVar::CondVar(Mutex* mu) : mu_(mu) { + PthreadCall("init cv", pthread_cond_init(&cv_, nullptr)); +} + +CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } + +void CondVar::Wait() { +#ifndef NDEBUG + mu_->locked_ = false; +#endif + PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); +#ifndef NDEBUG + mu_->locked_ = true; +#endif +} + +bool CondVar::TimedWait(uint64_t abs_time_us) { + struct timespec ts; + ts.tv_sec = static_cast(abs_time_us / 1000000); + ts.tv_nsec = static_cast((abs_time_us % 1000000) * 1000); + +#ifndef NDEBUG + mu_->locked_ = false; +#endif + int err = pthread_cond_timedwait(&cv_, &mu_->mu_, &ts); +#ifndef NDEBUG + mu_->locked_ = true; +#endif + if (err == ETIMEDOUT) { + return true; + } + if (err != 0) { + PthreadCall("timedwait", err); + } + return false; +} + +void CondVar::Signal() { PthreadCall("signal", pthread_cond_signal(&cv_)); } + +void CondVar::SignalAll() { + PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); +} + +RWMutex::RWMutex() { + PthreadCall("init mutex", pthread_rwlock_init(&mu_, nullptr)); +} + +RWMutex::~RWMutex() { + PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_)); +} + +void RWMutex::ReadLock() { + PthreadCall("read lock", pthread_rwlock_rdlock(&mu_)); +} + +void RWMutex::WriteLock() { + PthreadCall("write lock", pthread_rwlock_wrlock(&mu_)); +} + +void RWMutex::ReadUnlock() { + PthreadCall("read unlock", pthread_rwlock_unlock(&mu_)); +} + +void RWMutex::WriteUnlock() { + PthreadCall("write unlock", pthread_rwlock_unlock(&mu_)); +} + +int PhysicalCoreID() { +#if defined(ROCKSDB_SCHED_GETCPU_PRESENT) && defined(__x86_64__) && \ + (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 22)) + // sched_getcpu uses VDSO getcpu() syscall since 2.22. I believe Linux offers + // VDSO support only on x86_64. This is the fastest/preferred method if + // available. 
+ int cpuno = sched_getcpu(); + if (cpuno < 0) { + return -1; + } + return cpuno; +#elif defined(__x86_64__) || defined(__i386__) + // clang/gcc both provide cpuid.h, which defines __get_cpuid(), for x86_64 and + // i386. + unsigned eax, ebx = 0, ecx, edx; + if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { + return -1; + } + return ebx >> 24; +#else + // give up, the caller can generate a random number or something. + return -1; +#endif +} + +void InitOnce(OnceType* once, void (*initializer)()) { + PthreadCall("once", pthread_once(once, initializer)); +} + +void Crash(const std::string& srcfile, int srcline) { + fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline); + fflush(stdout); + kill(getpid(), SIGTERM); +} + +int GetMaxOpenFiles() { +#if defined(RLIMIT_NOFILE) + struct rlimit no_files_limit; + if (getrlimit(RLIMIT_NOFILE, &no_files_limit) != 0) { + return -1; + } + // protect against overflow + if (static_cast(no_files_limit.rlim_cur) >= + static_cast(std::numeric_limits::max())) { + return std::numeric_limits::max(); + } + return static_cast(no_files_limit.rlim_cur); +#endif + return -1; +} + +void* cacheline_aligned_alloc(size_t size) { +#if __GNUC__ < 5 && defined(__SANITIZE_ADDRESS__) + return malloc(size); +#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || defined(__APPLE__)) + void* m; + errno = posix_memalign(&m, CACHE_LINE_SIZE, size); + return errno ? nullptr : m; +#else + return malloc(size); +#endif +} + +void cacheline_aligned_free(void* memblock) { free(memblock); } + +static size_t GetPageSize() { +#if defined(OS_LINUX) || defined(_SC_PAGESIZE) + long v = sysconf(_SC_PAGESIZE); + if (v >= 1024) { + return static_cast(v); + } +#endif + // Default assume 4KB + return 4U * 1024U; +} + +const size_t kPageSize = GetPageSize(); + +void SetCpuPriority(ThreadId id, CpuPriority priority) { +#ifdef OS_LINUX + sched_param param; + param.sched_priority = 0; + switch (priority) { + case CpuPriority::kHigh: + sched_setscheduler(id, SCHED_OTHER, ¶m); + setpriority(PRIO_PROCESS, id, -20); + break; + case CpuPriority::kNormal: + sched_setscheduler(id, SCHED_OTHER, ¶m); + setpriority(PRIO_PROCESS, id, 0); + break; + case CpuPriority::kLow: + sched_setscheduler(id, SCHED_OTHER, ¶m); + setpriority(PRIO_PROCESS, id, 19); + break; + case CpuPriority::kIdle: + sched_setscheduler(id, SCHED_IDLE, ¶m); + break; + default: + assert(false); + } +#else + (void)id; + (void)priority; +#endif +} + +int64_t GetProcessID() { return getpid(); } + +bool GenerateRfcUuid(std::string* output) { + output->clear(); + std::ifstream f("/proc/sys/kernel/random/uuid"); + std::getline(f, /*&*/ *output); + if (output->size() == 36) { + return true; + } else { + output->clear(); + return false; + } +} + +} // namespace port +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_posix.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_posix.h new file mode 100644 index 0000000000..cdb256a6d6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/port_posix.h @@ -0,0 +1,243 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#pragma once + +#include + +#include "rocksdb/port_defs.h" +#include "rocksdb/rocksdb_namespace.h" + +// size_t printf formatting named in the manner of C99 standard formatting +// strings such as PRIu64 +// in fact, we could use that one +#define ROCKSDB_PRIszt "zu" + +#define __declspec(S) + +#undef PLATFORM_IS_LITTLE_ENDIAN +#if defined(OS_MACOSX) +#include +#if defined(__DARWIN_LITTLE_ENDIAN) && defined(__DARWIN_BYTE_ORDER) +#define PLATFORM_IS_LITTLE_ENDIAN \ + (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN) +#endif +#elif defined(OS_SOLARIS) +#include +#ifdef _LITTLE_ENDIAN +#define PLATFORM_IS_LITTLE_ENDIAN true +#else +#define PLATFORM_IS_LITTLE_ENDIAN false +#endif +#include +#elif defined(OS_AIX) +#include +#include +#define PLATFORM_IS_LITTLE_ENDIAN (BYTE_ORDER == LITTLE_ENDIAN) +#include +#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || \ + defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID) +#include +#include +#define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN) +#else +#include +#endif +#include +#include +#include + +#include +#include + +#ifndef PLATFORM_IS_LITTLE_ENDIAN +#define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) +#endif + +#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) || \ + defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) || \ + defined(OS_ANDROID) || defined(CYGWIN) || defined(OS_AIX) +// Use fread/fwrite/fflush on platforms without _unlocked variants +#define fread_unlocked fread +#define fwrite_unlocked fwrite +#define fflush_unlocked fflush +#endif + +#if defined(OS_MACOSX) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || \ + defined(OS_DRAGONFLYBSD) +// Use fsync() on platforms without fdatasync() +#define fdatasync fsync +#endif + +#if defined(OS_ANDROID) && __ANDROID_API__ < 9 +// fdatasync() was only introduced in API level 9 on Android. Use fsync() +// when targeting older platforms. +#define fdatasync fsync +#endif + +namespace ROCKSDB_NAMESPACE { + +extern const bool kDefaultToAdaptiveMutex; + +namespace port { +constexpr bool kLittleEndian = PLATFORM_IS_LITTLE_ENDIAN; +#undef PLATFORM_IS_LITTLE_ENDIAN + +class CondVar; + +class Mutex { + public: + static const char* kName() { return "pthread_mutex_t"; } + + explicit Mutex(bool adaptive = kDefaultToAdaptiveMutex); + // No copying + Mutex(const Mutex&) = delete; + void operator=(const Mutex&) = delete; + + ~Mutex(); + + void Lock(); + void Unlock(); + + bool TryLock(); + + // this will assert if the mutex is not locked + // it does NOT verify that mutex is held by a calling thread + void AssertHeld(); + + // Also implement std Lockable + inline void lock() { Lock(); } + inline void unlock() { Unlock(); } + inline bool try_lock() { return TryLock(); } + + private: + friend class CondVar; + pthread_mutex_t mu_; +#ifndef NDEBUG + bool locked_ = false; +#endif +}; + +class RWMutex { + public: + RWMutex(); + // No copying allowed + RWMutex(const RWMutex&) = delete; + void operator=(const RWMutex&) = delete; + + ~RWMutex(); + + void ReadLock(); + void WriteLock(); + void ReadUnlock(); + void WriteUnlock(); + void AssertHeld() {} + + private: + pthread_rwlock_t mu_; // the underlying platform mutex +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + void Wait(); + // Timed condition wait. Returns true if timeout occurred. 
+ bool TimedWait(uint64_t abs_time_us); + void Signal(); + void SignalAll(); + + private: + pthread_cond_t cv_; + Mutex* mu_; +}; + +using Thread = std::thread; + +static inline void AsmVolatilePause() { +#if defined(__i386__) || defined(__x86_64__) + asm volatile("pause"); +#elif defined(__aarch64__) + asm volatile("isb"); +#elif defined(__powerpc64__) + asm volatile("or 27,27,27"); +#elif defined(__loongarch64) + asm volatile("dbar 0"); +#endif + // it's okay for other platforms to be no-ops +} + +// Returns -1 if not available on this platform +extern int PhysicalCoreID(); + +using OnceType = pthread_once_t; +#define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT +extern void InitOnce(OnceType* once, void (*initializer)()); + +#ifndef CACHE_LINE_SIZE +// To test behavior with non-native cache line size, e.g. for +// Bloom filters, set TEST_CACHE_LINE_SIZE to the desired test size. +// This disables ALIGN_AS to keep it from failing compilation. +#ifdef TEST_CACHE_LINE_SIZE +#define CACHE_LINE_SIZE TEST_CACHE_LINE_SIZE +#define ALIGN_AS(n) /*empty*/ +#else +#if defined(__s390__) +#if defined(__GNUC__) && __GNUC__ < 7 +#define CACHE_LINE_SIZE 64U +#else +#define CACHE_LINE_SIZE 256U +#endif +#elif defined(__powerpc__) || defined(__aarch64__) +#define CACHE_LINE_SIZE 128U +#else +#define CACHE_LINE_SIZE 64U +#endif +#define ALIGN_AS(n) alignas(n) +#endif +#endif + +static_assert((CACHE_LINE_SIZE & (CACHE_LINE_SIZE - 1)) == 0, + "Cache line size must be a power of 2 number of bytes"); + +extern void* cacheline_aligned_alloc(size_t size); + +extern void cacheline_aligned_free(void* memblock); + +#if defined(__aarch64__) +// __builtin_prefetch(..., 1) turns into a prefetch into prfm pldl3keep. On +// arm64 we want this as close to the core as possible to turn it into a +// L1 prefetech unless locality == 0 in which case it will be turned into a +// non-temporal prefetch +#define PREFETCH(addr, rw, locality) \ + __builtin_prefetch(addr, rw, locality >= 1 ? 3 : locality) +#else +#define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality) +#endif + +extern void Crash(const std::string& srcfile, int srcline); + +extern int GetMaxOpenFiles(); + +extern const size_t kPageSize; + +using ThreadId = pid_t; + +extern void SetCpuPriority(ThreadId id, CpuPriority priority); + +int64_t GetProcessID(); + +// Uses platform APIs to generate a 36-character RFC-4122 UUID. Returns +// true on success or false on failure. +bool GenerateRfcUuid(std::string* output); + +} // namespace port +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/stack_trace.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/stack_trace.cc new file mode 100644 index 0000000000..9e9d8b8b54 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/stack_trace.cc @@ -0,0 +1,336 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
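// [editor's note] Usage sketch for port::CondVar::TimedWait declared above:
// the argument is an *absolute* deadline in microseconds and a true return
// means the wait timed out. (Editorial; NowMicros and done are hypothetical.)
//   mu.Lock();
//   uint64_t deadline_us = NowMicros() + 100 * 1000;  // now + 100ms
//   bool timed_out = false;
//   while (!done && !timed_out) {
//     timed_out = cv.TimedWait(deadline_us);  // true on ETIMEDOUT
//   }
//   mu.Unlock();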
+// +#include "port/stack_trace.h" + +#if !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || defined(CYGWIN) || \ + defined(OS_SOLARIS) || defined(OS_WIN) + +// noop + +namespace ROCKSDB_NAMESPACE { +namespace port { +void InstallStackTraceHandler() {} +void PrintStack(int /*first_frames_to_skip*/) {} +void PrintAndFreeStack(void* /*callstack*/, int /*num_frames*/) {} +void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) { + return nullptr; +} +} // namespace port +} // namespace ROCKSDB_NAMESPACE + +#else + +#include +#include +#include +#include +#include +#include +#include + +#ifdef OS_FREEBSD +#include +#endif // OS_FREEBSD +#ifdef OS_LINUX +#include +#include +#include +#if __GLIBC__ < 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ < 30) +#include +#define gettid() syscall(SYS_gettid) +#endif // GLIBC version +#endif // OS_LINUX + +#include "port/lang.h" + +namespace ROCKSDB_NAMESPACE { +namespace port { + +namespace { + +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) +const char* GetExecutableName() { + static char name[1024]; + +#if !defined(OS_FREEBSD) + char link[1024]; + snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); + auto read = readlink(link, name, sizeof(name) - 1); + if (-1 == read) { + return nullptr; + } else { + name[read] = 0; + return name; + } +#else + int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1}; + size_t namesz = sizeof(name); + + auto ret = sysctl(mib, 4, name, &namesz, nullptr, 0); + if (-1 == ret) { + return nullptr; + } else { + return name; + } +#endif +} + +void PrintStackTraceLine(const char* symbol, void* frame) { + static const char* executable = GetExecutableName(); + if (symbol) { + fprintf(stderr, "%s ", symbol); + } + if (executable) { + // out source to addr2line, for the address translation + const int kLineMax = 256; + char cmd[kLineMax]; + snprintf(cmd, kLineMax, "addr2line %p -e %s -f -C 2>&1", frame, executable); + auto f = popen(cmd, "r"); + if (f) { + char line[kLineMax]; + while (fgets(line, sizeof(line), f)) { + line[strlen(line) - 1] = 0; // remove newline + fprintf(stderr, "%s\t", line); + } + pclose(f); + } + } else { + fprintf(stderr, " %p", frame); + } + + fprintf(stderr, "\n"); +} +#elif defined(OS_MACOSX) + +void PrintStackTraceLine(const char* symbol, void* frame) { + static int pid = getpid(); + // out source to atos, for the address translation + const int kLineMax = 256; + char cmd[kLineMax]; + snprintf(cmd, kLineMax, "xcrun atos %p -p %d 2>&1", frame, pid); + auto f = popen(cmd, "r"); + if (f) { + char line[kLineMax]; + while (fgets(line, sizeof(line), f)) { + line[strlen(line) - 1] = 0; // remove newline + fprintf(stderr, "%s\t", line); + } + pclose(f); + } else if (symbol) { + fprintf(stderr, "%s ", symbol); + } + + fprintf(stderr, "\n"); +} + +#endif + +const char* GetLldbScriptSelectThread(long long tid) { + // NOTE: called from a signal handler, so no heap allocation + static char script[80]; + snprintf(script, sizeof(script), + "script -l python -- lldb.process.SetSelectedThreadByID(%lld)", tid); + return script; +} + +} // namespace + +void PrintStack(void* frames[], int num_frames) { + auto symbols = backtrace_symbols(frames, num_frames); + + for (int i = 0; i < num_frames; ++i) { + fprintf(stderr, "#%-2d ", i); + PrintStackTraceLine((symbols != nullptr) ? 
symbols[i] : nullptr, frames[i]); + } + free(symbols); +} + +void PrintStack(int first_frames_to_skip) { + // Default to getting stack traces with GDB, at least on Linux where we + // know how to attach to a particular thread. + // + // * Address space layout randomization (ASLR) interferes with getting good + // stack information from backtrace+addr2line. This is more likely to show + // up with LIB_MODE=shared builds (when kernel.randomize_va_space >= 1) + // but can also show up with LIB_MODE=static builds ((when + // kernel.randomize_va_space == 2). + // * It doesn't appear easy to detect when ASLR is in use. + // * With DEBUG_LEVEL < 2, backtrace() can skip frames that are not skipped + // in GDB. + // + // LLDB also available as an option + bool lldb_stack_trace = getenv("ROCKSDB_LLDB_STACK") != nullptr; +#if defined(OS_LINUX) + // Default true, override with ROCKSDB_BACKTRACE_STACK=1 + bool gdb_stack_trace = + !lldb_stack_trace && getenv("ROCKSDB_BACKTRACE_STACK") == nullptr; +#else + // Default false, override with ROCKSDB_GDB_STACK=1 + bool gdb_stack_trace = getenv("ROCKSDB_GDB_STACK") != nullptr; +#endif + // Also support invoking interactive debugger on stack trace, with this + // envvar set to non-empty + char* debug_env = getenv("ROCKSDB_DEBUG"); + bool debug = debug_env != nullptr && strlen(debug_env) > 0; + + if (lldb_stack_trace || gdb_stack_trace || debug) { + // Allow ouside debugger to attach, even with Yama security restrictions +#ifdef PR_SET_PTRACER_ANY + (void)prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); +#endif + // Try to invoke GDB, either for stack trace or debugging. + long long attach_pid = getpid(); + // NOTE: we're in a signal handler, so no heap allocation + char attach_pid_str[20]; + snprintf(attach_pid_str, sizeof(attach_pid_str), "%lld", attach_pid); + + // `gdb -p PID` seems to always attach to main thread, but `gdb -p TID` + // seems to be able to attach to a particular thread in a process, which + // makes sense as the main thread TID == PID of the process. + // But I haven't found that gdb capability documented anywhere, so leave + // a back door to attach to main thread. 
+ long long gdb_attach_id = attach_pid; + // Save current thread id before fork + long long attach_tid = 0; +#ifdef OS_LINUX + attach_tid = gettid(); + if (getenv("ROCKSDB_DEBUG_USE_PID") == nullptr) { + gdb_attach_id = attach_tid; + } +#endif + + char gdb_attach_id_str[20]; + snprintf(gdb_attach_id_str, sizeof(gdb_attach_id_str), "%lld", + gdb_attach_id); + + pid_t child_pid = fork(); + if (child_pid == 0) { + // child process + if (debug) { + if (strcmp(debug_env, "lldb") == 0) { + fprintf(stderr, "Invoking LLDB for debugging (ROCKSDB_DEBUG=%s)...\n", + debug_env); + execlp(/*cmd in PATH*/ "lldb", /*arg0*/ "lldb", "-p", attach_pid_str, + /*"-Q",*/ "-o", GetLldbScriptSelectThread(attach_tid), + (char*)nullptr); + return; + } else { + fprintf(stderr, "Invoking GDB for debugging (ROCKSDB_DEBUG=%s)...\n", + debug_env); + execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-p", gdb_attach_id_str, + (char*)nullptr); + return; + } + } else { + // Redirect child stdout to original stderr + dup2(2, 1); + // No child stdin (don't use pager) + close(0); + if (lldb_stack_trace) { + fprintf(stderr, "Invoking LLDB for stack trace...\n"); + + // Skip top ~8 frames here in PrintStack + auto bt_in_lldb = + "script -l python -- for f in lldb.thread.frames[8:]: print(f)"; + execlp(/*cmd in PATH*/ "lldb", /*arg0*/ "lldb", "-p", attach_pid_str, + "-b", "-Q", "-o", GetLldbScriptSelectThread(attach_tid), "-o", + bt_in_lldb, (char*)nullptr); + } else { + // gdb_stack_trace + fprintf(stderr, "Invoking GDB for stack trace...\n"); + + // Skip top ~4 frames here in PrintStack + // See https://stackoverflow.com/q/40991943/454544 + auto bt_in_gdb = + "frame apply level 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 " + "21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 " + "42 43 44 -q frame"; + // -n : Loading config files can apparently cause failures with the + // other options here. + // -batch : non-interactive; suppress banners as much as possible + execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-n", "-batch", "-p", + gdb_attach_id_str, "-ex", bt_in_gdb, (char*)nullptr); + } + return; + } + } else { + // parent process; wait for child + int wstatus; + waitpid(child_pid, &wstatus, 0); + if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == EXIT_SUCCESS) { + // Good + return; + } + } + fprintf(stderr, "GDB failed; falling back on backtrace+addr2line...\n"); + } + + const int kMaxFrames = 100; + void* frames[kMaxFrames]; + + auto num_frames = backtrace(frames, kMaxFrames); + PrintStack(&frames[first_frames_to_skip], num_frames - first_frames_to_skip); +} + +void PrintAndFreeStack(void* callstack, int num_frames) { + PrintStack(static_cast(callstack), num_frames); + free(callstack); +} + +void* SaveStack(int* num_frames, int first_frames_to_skip) { + const int kMaxFrames = 100; + void* frames[kMaxFrames]; + + auto count = backtrace(frames, kMaxFrames); + *num_frames = count - first_frames_to_skip; + void* callstack = malloc(sizeof(void*) * *num_frames); + memcpy(callstack, &frames[first_frames_to_skip], sizeof(void*) * *num_frames); + return callstack; +} + +static void StackTraceHandler(int sig) { + // reset to default handler + signal(sig, SIG_DFL); + fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); + // skip the top three signal handler related frames + PrintStack(3); + + // Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of + // a signal" have failed, so just warn the user about them. 
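// [editor's note] The debugger-based path above reduces to this pattern
// (editorial sketch; pid_str is hypothetical): fork, exec gdb against the
// parent to print a backtrace, and have the parent wait for the child:
//   pid_t child = fork();
//   if (child == 0) {
//     execlp("gdb", "gdb", "-n", "-batch", "-p", pid_str,
//            "-ex", "bt", (char*)nullptr);  // only returns on exec failure
//     _exit(127);
//   } else {
//     int wstatus;
//     waitpid(child, &wstatus, 0);
//   }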
+#ifdef __SANITIZE_THREAD__
+  fprintf(stderr,
+          "==> NOTE: any above warnings about \"signal-unsafe call\" are\n"
+          "==> ignorable, as they are expected when generating a stack\n"
+          "==> trace because of a signal under TSAN. Consider why the\n"
+          "==> signal was generated to begin with, and the stack trace\n"
+          "==> in the TSAN warning can be useful for that. (The stack\n"
+          "==> trace printed by the signal handler is likely obscured\n"
+          "==> by TSAN output.)\n");
+#endif
+
+  // re-signal to default handler (so we still get core dump if needed...)
+  raise(sig);
+}
+
+void InstallStackTraceHandler() {
+  // just use the plain old signal as it's simple and sufficient
+  // for this use case
+  signal(SIGILL, StackTraceHandler);
+  signal(SIGSEGV, StackTraceHandler);
+  signal(SIGBUS, StackTraceHandler);
+  signal(SIGABRT, StackTraceHandler);
+  // Allow outside debugger to attach, even with Yama security restrictions.
+  // This is needed even outside of PrintStack() so that external mechanisms
+  // can dump stacks if they suspect that a test has hung.
+#ifdef PR_SET_PTRACER_ANY
+  (void)prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
+#endif
+}
+
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/stack_trace.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/stack_trace.h
new file mode 100644
index 0000000000..5b3bf93205
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/stack_trace.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// Install a signal handler to print callstack on the following signals:
+// SIGILL SIGSEGV SIGBUS SIGABRT
+// And also (Linux only for now) overrides security settings to allow outside
+// processes to attach to this one as a debugger. ONLY USE FOR NON-SECURITY
+// CRITICAL PROCESSES such as unit tests or benchmarking tools.
+// Currently supports only some POSIX implementations. No-op otherwise.
+void InstallStackTraceHandler();
+
+// Prints stack, skipping the first first_frames_to_skip frames
+void PrintStack(int first_frames_to_skip = 0);
+
+// Prints the given callstack
+void PrintAndFreeStack(void* callstack, int num_frames);
+
+// Save the current callstack
+void* SaveStack(int* num_frames, int first_frames_to_skip = 0);
+
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/sys_time.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/sys_time.h
new file mode 100644
index 0000000000..f2137526b1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/sys_time.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This file is a portable substitute for sys/time.h which does not exist on
+// Windows
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#if defined(OS_WIN) && (defined(_MSC_VER) || defined(__MINGW32__))
+
+#include <time.h>
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace port {
+
+struct TimeVal {
+  long tv_sec;
+  long tv_usec;
+};
+
+void GetTimeOfDay(TimeVal* tv, struct timezone* tz);
+
+inline struct tm* LocalTimeR(const time_t* timep, struct tm* result) {
+  errno_t ret = localtime_s(result, timep);
+  return (ret == 0) ? result : NULL;
+}
+
+}  // namespace port
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#else
+#include <sys/time.h>
+#include <time.h>
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace port {
+
+using TimeVal = struct timeval;
+
+inline void GetTimeOfDay(TimeVal* tv, struct timezone* tz) {
+  gettimeofday(tv, tz);
+}
+
+inline struct tm* LocalTimeR(const time_t* timep, struct tm* result) {
+  return localtime_r(timep, result);
+}
+
+}  // namespace port
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/util_logger.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/util_logger.h
new file mode 100644
index 0000000000..ce7e3a9416
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/util_logger.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+// Include the appropriate platform specific file below. If you are
+// porting to a new platform, see "port_example.h" for documentation
+// of what the new port_<platform>.h file must provide.
+
+#if defined(OS_WIN)
+#include "port/win/win_logger.h"
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_default.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_default.cc
new file mode 100644
index 0000000000..48853f26e4
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_default.cc
@@ -0,0 +1,45 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include <mutex>
+
+#include "port/win/env_win.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/compression_context_cache.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// We choose not to destroy the env because joining the threads from the
+// system loader, which destroys the statics (same as from DLLMain), creates
+// a system loader dead-lock. In this manner any remaining threads are
+// terminated OK.
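+// A minimal sketch of the pattern used below (illustrative only):
+//   static std::once_flag once;
+//   static Env* env = nullptr;
+//   std::call_once(once, [] { env = new port::WinEnv(); });
+// The singleton is intentionally never deleted, per the note above.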
+namespace {
+std::once_flag winenv_once_flag;
+Env* envptr;
+};  // namespace
+}  // namespace port
+
+Env* Env::Default() {
+  ThreadLocalPtr::InitSingletons();
+  CompressionContextCache::InitSingleton();
+  INIT_SYNC_POINT_SINGLETONS();
+  std::call_once(port::winenv_once_flag,
+                 []() { port::envptr = new port::WinEnv(); });
+  return port::envptr;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_win.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_win.cc
new file mode 100644
index 0000000000..2262eb59c4
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_win.cc
@@ -0,0 +1,1437 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include "port/win/env_win.h"
+
+#include <direct.h>  // _rmdir, _mkdir, _getcwd
+#include <errno.h>
+#include <io.h>  // _access
+#include <rpc.h>  // for uuid generation
+#include <shlwapi.h>
+#include <stdio.h>
+#include <string.h>
+#include <windows.h>
+#include <winioctl.h>
+
+#include <algorithm>
+#include <ctime>
+#include <thread>
+
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "port/lang.h"
+#include "port/port.h"
+#include "port/port_dirent.h"
+#include "port/win/io_win.h"
+#include "port/win/win_logger.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "strsafe.h"
+#include "util/string_util.h"
+
+// Undefine the functions windows might use (again)...
+#undef GetCurrentTime
+#undef DeleteFile
+#undef LoadLibrary
+
+namespace ROCKSDB_NAMESPACE {
+
+ThreadStatusUpdater* CreateThreadStatusUpdater() {
+  return new ThreadStatusUpdater();
+}
+
+namespace {
+
+// Sector size used when physical sector size cannot be obtained from device.
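+// (512 bytes is the traditional logical sector size; devices with larger
+// physical sectors report it via IOCTL_STORAGE_QUERY_PROPERTY in
+// GetSectorSize() below.)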
+static const size_t kSectorSize = 512;
+
+// RAII helpers for HANDLEs
+const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); };
+using UniqueCloseHandlePtr = std::unique_ptr<void, decltype(CloseHandleFunc)>;
+
+const auto FindCloseFunc = [](HANDLE h) { ::FindClose(h); };
+using UniqueFindClosePtr = std::unique_ptr<void, decltype(FindCloseFunc)>;
+
+void WinthreadCall(const char* label, std::error_code result) {
+  if (0 != result.value()) {
+    fprintf(stderr, "Winthread %s: %s\n", label,
+            errnoStr(result.value()).c_str());
+    abort();
+  }
+}
+
+}  // namespace
+
+namespace port {
+WinClock::WinClock()
+    : perf_counter_frequency_(0),
+      nano_seconds_per_period_(0),
+      GetSystemTimePreciseAsFileTime_(NULL) {
+  {
+    LARGE_INTEGER qpf;
+    BOOL ret __attribute__((__unused__));
+    ret = QueryPerformanceFrequency(&qpf);
+    assert(ret == TRUE);
+    perf_counter_frequency_ = qpf.QuadPart;
+
+    if (std::nano::den % perf_counter_frequency_ == 0) {
+      nano_seconds_per_period_ = std::nano::den / perf_counter_frequency_;
+    }
+  }
+
+  HMODULE module = GetModuleHandle("kernel32.dll");
+  if (module != NULL) {
+    GetSystemTimePreciseAsFileTime_ = (FnGetSystemTimePreciseAsFileTime)(
+        void*)GetProcAddress(module, "GetSystemTimePreciseAsFileTime");
+  }
+}
+
+void WinClock::SleepForMicroseconds(int micros) {
+  std::this_thread::sleep_for(std::chrono::microseconds(micros));
+}
+
+std::string WinClock::TimeToString(uint64_t secondsSince1970) {
+  std::string result;
+
+  const time_t seconds = secondsSince1970;
+  const int maxsize = 64;
+
+  struct tm t;
+  errno_t ret = localtime_s(&t, &seconds);
+
+  if (ret) {
+    result = std::to_string(seconds);
+  } else {
+    result.resize(maxsize);
+    char* p = &result[0];
+
+    int len =
+        snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900,
+                 t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
+    assert(len > 0);
+
+    result.resize(len);
+  }
+
+  return result;
+}
+
+uint64_t WinClock::NowMicros() {
+  if (GetSystemTimePreciseAsFileTime_ != NULL) {
+    // All std::chrono clocks on windows proved to return
+    // values that may repeat, which is not good enough for some uses.
+    const int64_t c_UnixEpochStartTicks = 116444736000000000LL;
+    const int64_t c_FtToMicroSec = 10;
+
+    // This interface needs to return system time and not
+    // just any microseconds because it is often used as an argument
+    // to TimedWait() on condition variable
+    FILETIME ftSystemTime;
+    GetSystemTimePreciseAsFileTime_(&ftSystemTime);
+
+    LARGE_INTEGER li;
+    li.LowPart = ftSystemTime.dwLowDateTime;
+    li.HighPart = ftSystemTime.dwHighDateTime;
+    // Subtract unix epoch start
+    li.QuadPart -= c_UnixEpochStartTicks;
+    // Convert to microsecs
+    li.QuadPart /= c_FtToMicroSec;
+    return li.QuadPart;
+  }
+  return std::chrono::duration_cast<std::chrono::microseconds>(
+             std::chrono::system_clock::now().time_since_epoch())
+      .count();
+}
+
+uint64_t WinClock::NowNanos() {
+  if (nano_seconds_per_period_ != 0) {
+    // All std::chrono clocks on windows have the same resolution, which is
+    // only good enough for microseconds but not nanoseconds.
+    // On Windows 8 and Windows 2012 Server
+    // GetSystemTimePreciseAsFileTime(&current_time) can be used
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li);
+    // Convert performance counter to nanoseconds by precomputed ratio.
+    // Directly multiplying nano::den by li.QuadPart causes overflow.
+    // Only do this when nano::den is divisible by perf_counter_frequency_,
+    // which most likely is the case in reality. If it's not, fall back to
+    // high_resolution_clock, which may be less precise under old compilers.
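+    // Worked example (illustrative): with a 10 MHz performance counter,
+    // nano_seconds_per_period_ == 1000000000 / 10000000 == 100, so a raw
+    // counter reading of 42 becomes 4200 ns below.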
+    li.QuadPart *= nano_seconds_per_period_;
+    return li.QuadPart;
+  }
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(
+             std::chrono::high_resolution_clock::now().time_since_epoch())
+      .count();
+}
+
+Status WinClock::GetCurrentTime(int64_t* unix_time) {
+  time_t time = std::time(nullptr);
+  if (time == (time_t)(-1)) {
+    return Status::NotSupported("Failed to get time");
+  }
+
+  *unix_time = time;
+  return Status::OK();
+}
+
+WinFileSystem::WinFileSystem(const std::shared_ptr<SystemClock>& clock)
+    : clock_(clock), page_size_(4 * 1024), allocation_granularity_(page_size_) {
+  SYSTEM_INFO sinfo;
+  GetSystemInfo(&sinfo);
+
+  page_size_ = sinfo.dwPageSize;
+  allocation_granularity_ = sinfo.dwAllocationGranularity;
+}
+
+const std::shared_ptr<WinFileSystem>& WinFileSystem::Default() {
+  STATIC_AVOID_DESTRUCTION(std::shared_ptr<WinFileSystem>, fs)
+  (std::make_shared<WinFileSystem>(WinClock::Default()));
+  return fs;
+}
+
+WinEnvIO::WinEnvIO(Env* hosted_env) : hosted_env_(hosted_env) {}
+
+WinEnvIO::~WinEnvIO() {}
+
+IOStatus WinFileSystem::DeleteFile(const std::string& fname,
+                                   const IOOptions& /*options*/,
+                                   IODebugContext* /*dbg*/) {
+  IOStatus result;
+
+  BOOL ret = RX_DeleteFile(RX_FN(fname).c_str());
+
+  if (!ret) {
+    auto lastError = GetLastError();
+    result = IOErrorFromWindowsError("Failed to delete: " + fname, lastError);
+  }
+
+  return result;
+}
+
+IOStatus WinFileSystem::Truncate(const std::string& fname, size_t size,
+                                 const IOOptions& /*options*/,
+                                 IODebugContext* /*dbg*/) {
+  IOStatus s;
+  int result = ROCKSDB_NAMESPACE::port::Truncate(fname, size);
+  if (result != 0) {
+    s = IOError("Failed to truncate: " + fname, errno);
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::NewSequentialFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSSequentialFile>* result, IODebugContext* /*dbg*/) {
+  IOStatus s;
+
+  result->reset();
+
+  // Corruption test needs to rename and delete files of these kind
+  // while they are still open with another handle. For that reason we
+  // allow share_write and delete (allows rename).
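+  // (For example, without FILE_SHARE_DELETE below, a concurrent RenameFile()
+  // of the same path would fail with ERROR_SHARING_VIOLATION while this
+  // handle is open.)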
+  HANDLE hFile = INVALID_HANDLE_VALUE;
+
+  DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
+
+  if (options.use_direct_reads && !options.use_mmap_reads) {
+    fileFlags |= FILE_FLAG_NO_BUFFERING;
+  }
+
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile = RX_CreateFile(
+        RX_FN(fname).c_str(), GENERIC_READ,
+        FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL,
+        OPEN_EXISTING,  // Original fopen mode is "rb"
+        fileFlags, NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("Failed to open NewSequentialFile: " + fname,
+                                lastError);
+  } else {
+    result->reset(new WinSequentialFile(fname, hFile, options));
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::NewRandomAccessFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
+  result->reset();
+  IOStatus s;
+
+  // Open the file for read-only random access.
+  // Random access is to disable read-ahead, as the system reads too much data.
+  DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
+
+  if (options.use_direct_reads && !options.use_mmap_reads) {
+    fileFlags |= FILE_FLAG_NO_BUFFERING;
+  } else {
+    fileFlags |= FILE_FLAG_RANDOM_ACCESS;
+  }
+
+  // Shared access is necessary for corruption test to pass;
+  // almost all tests would work with a possible exception of fault_injection.
+  HANDLE hFile = 0;
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile =
+        RX_CreateFile(RX_FN(fname).c_str(), GENERIC_READ,
+                      FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                      NULL, OPEN_EXISTING, fileFlags, NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError(
+        "NewRandomAccessFile failed to Create/Open: " + fname, lastError);
+  }
+
+  UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc);
+
+  // CAUTION! This will map the entire file into the process address space.
+  // Not recommended for 32-bit platforms.
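+  // (Illustrative arithmetic: a 3 GB SST file cannot be mapped into the
+  // 2-4 GB user address space of a 32-bit process, so the mapping below
+  // would fail even though the file is otherwise readable.)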
+  if (options.use_mmap_reads) {
+    uint64_t fileSize;
+
+    s = GetFileSize(fname, IOOptions(), &fileSize, dbg);
+
+    if (s.ok()) {
+      // Will not map empty files
+      if (fileSize == 0) {
+        return IOError("NewRandomAccessFile failed to map empty file: " + fname,
+                       EINVAL);
+      }
+
+      HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READONLY,
+                                         0,  // At its present length
+                                         0,
+                                         NULL);  // Mapping name
+
+      if (!hMap) {
+        auto lastError = GetLastError();
+        return IOErrorFromWindowsError(
+            "Failed to create file mapping for NewRandomAccessFile: " + fname,
+            lastError);
+      }
+
+      UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc);
+
+      const void* mapped_region =
+          MapViewOfFileEx(hMap, FILE_MAP_READ,
+                          0,  // High DWORD of access start
+                          0,  // Low DWORD
+                          static_cast<SIZE_T>(fileSize),
+                          NULL);  // Let the OS choose the mapping
+
+      if (!mapped_region) {
+        auto lastError = GetLastError();
+        return IOErrorFromWindowsError(
+            "Failed to MapViewOfFile for NewRandomAccessFile: " + fname,
+            lastError);
+      }
+
+      result->reset(new WinMmapReadableFile(fname, hFile, hMap, mapped_region,
+                                            static_cast<size_t>(fileSize)));
+
+      mapGuard.release();
+      fileGuard.release();
+    }
+  } else {
+    result->reset(new WinRandomAccessFile(fname, hFile, page_size_, options));
+    fileGuard.release();
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::OpenWritableFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSWritableFile>* result, bool reopen) {
+  const size_t c_BufferCapacity = 64 * 1024;
+
+  EnvOptions local_options(options);
+
+  result->reset();
+  IOStatus s;
+
+  DWORD fileFlags = FILE_ATTRIBUTE_NORMAL;
+
+  if (local_options.use_direct_writes && !local_options.use_mmap_writes) {
+    fileFlags = FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
+  }
+
+  // Desired access. We want write-only access here, but if we want to
+  // memory-map the file then there is no write-only mode, so we have to
+  // create it read/write. However, MapViewOfFile specifies write-only.
+  DWORD desired_access = GENERIC_WRITE;
+  DWORD shared_mode = FILE_SHARE_READ;
+
+  if (local_options.use_mmap_writes) {
+    desired_access |= GENERIC_READ;
+  } else {
+    // Adding this solely for tests to pass (fault_injection_test,
+    // wal_manager_test).
+    shared_mode |= (FILE_SHARE_WRITE | FILE_SHARE_DELETE);
+  }
+
+  // This will always truncate the file
+  DWORD creation_disposition = CREATE_ALWAYS;
+  if (reopen) {
+    creation_disposition = OPEN_ALWAYS;
+  }
+
+  HANDLE hFile = 0;
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile = RX_CreateFile(
+        RX_FN(fname).c_str(),
+        desired_access,  // Access desired
+        shared_mode,
+        NULL,  // Security attributes
+        // Posix env says (reopen) ?
(O_CREAT | O_APPEND) : O_CREAT | O_TRUNC
+        creation_disposition,
+        fileFlags,  // Flags
+        NULL);      // Template File
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError(
+        "Failed to create a NewWritableFile: " + fname, lastError);
+  }
+
+  // We will start writing at the end, appending
+  if (reopen) {
+    LARGE_INTEGER zero_move;
+    zero_move.QuadPart = 0;
+    BOOL ret = SetFilePointerEx(hFile, zero_move, NULL, FILE_END);
+    if (!ret) {
+      auto lastError = GetLastError();
+      return IOErrorFromWindowsError(
+          "Failed to create a ReopenWritableFile move to the end: " + fname,
+          lastError);
+    }
+  }
+
+  if (options.use_mmap_writes) {
+    // We usually do not use memory mapping on SSD and thus we pass memory
+    // page_size
+    result->reset(new WinMmapFile(fname, hFile, page_size_,
+                                  allocation_granularity_, local_options));
+  } else {
+    // Here we want the buffer allocation to be aligned by the SSD page size
+    // and to be a multiple of it
+    result->reset(new WinWritableFile(fname, hFile, GetPageSize(),
+                                      c_BufferCapacity, local_options));
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::NewWritableFile(const std::string& fname,
+                                        const FileOptions& options,
+                                        std::unique_ptr<FSWritableFile>* result,
+                                        IODebugContext* /*dbg*/) {
+  return OpenWritableFile(fname, options, result, false);
+}
+
+IOStatus WinFileSystem::ReopenWritableFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* /*dbg*/) {
+  return OpenWritableFile(fname, options, result, true);
+}
+
+IOStatus WinFileSystem::NewRandomRWFile(const std::string& fname,
+                                        const FileOptions& options,
+                                        std::unique_ptr<FSRandomRWFile>* result,
+                                        IODebugContext* /*dbg*/) {
+  IOStatus s;
+
+  // Open the file for read-write random access.
+  // Random access is to disable read-ahead, as the system reads too much data.
+  DWORD desired_access = GENERIC_READ | GENERIC_WRITE;
+  DWORD shared_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
+  DWORD creation_disposition = OPEN_EXISTING;  // Fail if file does not exist
+  DWORD file_flags = FILE_FLAG_RANDOM_ACCESS;
+
+  if (options.use_direct_reads && options.use_direct_writes) {
+    file_flags |= FILE_FLAG_NO_BUFFERING;
+  }
+
+  // Shared access is necessary for corruption test to pass;
+  // almost all tests would work with a possible exception of fault_injection.
+  HANDLE hFile = 0;
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile = RX_CreateFile(RX_FN(fname).c_str(), desired_access, shared_mode,
+                          NULL,  // Security attributes
+                          creation_disposition, file_flags, NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError(
+        "NewRandomRWFile failed to Create/Open: " + fname, lastError);
+  }
+
+  UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc);
+  result->reset(new WinRandomRWFile(fname, hFile, GetPageSize(), options));
+  fileGuard.release();
+
+  return s;
+}
+
+IOStatus WinFileSystem::NewMemoryMappedFileBuffer(
+    const std::string& fname, std::unique_ptr<MemoryMappedFileBuffer>* result) {
+  IOStatus s;
+  result->reset();
+
+  DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
+
+  HANDLE hFile = INVALID_HANDLE_VALUE;
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile = RX_CreateFile(
+        RX_FN(fname).c_str(), GENERIC_READ | GENERIC_WRITE,
+        FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL,
+        OPEN_EXISTING,  // Open only if it exists
+        fileFlags, NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError(
+        "Failed to open NewMemoryMappedFileBuffer: " + fname, lastError);
+    return s;
+  }
+  UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc);
+
+  uint64_t fileSize = 0;
+  s = GetFileSize(fname, IOOptions(), &fileSize, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  // Will not map empty files
+  if (fileSize == 0) {
+    return IOStatus::NotSupported(
+        "NewMemoryMappedFileBuffer can not map zero length files: " + fname);
+  }
+
+  // size_t is 32-bit with 32-bit builds
+  if (fileSize > std::numeric_limits<size_t>::max()) {
+    return IOStatus::NotSupported(
+        "The specified file size does not fit into 32-bit memory addressing: " +
+        fname);
+  }
+
+  HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READWRITE,
+                                     0,  // Whole file at its present length
+                                     0,
+                                     NULL);  // Mapping name
+
+  if (!hMap) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError(
+        "Failed to create file mapping for: " + fname, lastError);
+  }
+  UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc);
+
+  void* base = MapViewOfFileEx(hMap, FILE_MAP_WRITE,
+                               0,  // High DWORD of access start
+                               0,  // Low DWORD
+                               static_cast<SIZE_T>(fileSize),
+                               NULL);  // Let the OS choose the mapping
+
+  if (!base) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError(
+        "Failed to MapViewOfFile for NewMemoryMappedFileBuffer: " + fname,
+        lastError);
+  }
+
+  result->reset(new WinMemoryMappedBuffer(hFile, hMap, base,
+                                          static_cast<size_t>(fileSize)));
+
+  mapGuard.release();
+  fileGuard.release();
+
+  return s;
+}
+
+IOStatus WinFileSystem::NewDirectory(const std::string& name,
+                                     const IOOptions& /*options*/,
+                                     std::unique_ptr<FSDirectory>* result,
+                                     IODebugContext* /*dbg*/) {
+  IOStatus s;
+  // Must be nullptr on failure
+  result->reset();
+
+  if (!DirExists(name)) {
+    s = IOErrorFromWindowsError("open folder: " + name, ERROR_DIRECTORY);
+    return s;
+  }
+
+  HANDLE handle = INVALID_HANDLE_VALUE;
+  // 0 - for access means read metadata
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    handle = RX_CreateFile(
+        RX_FN(name).c_str(), 0,
+        FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
+        OPEN_EXISTING,
+        FILE_FLAG_BACKUP_SEMANTICS,  // make opening folders possible
+        NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == handle) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("open folder: " + name, lastError);
+    return s;
+  }
+
+  result->reset(new WinDirectory(name, handle));
+
+  return s;
+}
+
+IOStatus WinFileSystem::FileExists(const std::string& fname,
+                                   const IOOptions& /*opts*/,
+                                   IODebugContext* /*dbg*/) {
+  IOStatus s;
+  // TODO: This does not follow symbolic links at this point,
+  // which is consistent with the _access() impl on windows,
+  // but could be added.
+  WIN32_FILE_ATTRIBUTE_DATA attrs;
+  if (FALSE == RX_GetFileAttributesEx(RX_FN(fname).c_str(),
+                                      GetFileExInfoStandard, &attrs)) {
+    auto lastError = GetLastError();
+    switch (lastError) {
+      case ERROR_ACCESS_DENIED:
+      case ERROR_NOT_FOUND:
+      case ERROR_FILE_NOT_FOUND:
+      case ERROR_PATH_NOT_FOUND:
+        s = IOStatus::NotFound();
+        break;
+      default:
+        s = IOErrorFromWindowsError("Unexpected error for: " + fname,
+                                    lastError);
+        break;
+    }
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::GetChildren(const std::string& dir,
+                                    const IOOptions& /*opts*/,
+                                    std::vector<std::string>* result,
+                                    IODebugContext* /*dbg*/) {
+  IOStatus status;
+  result->clear();
+
+  RX_WIN32_FIND_DATA data;
+  memset(&data, 0, sizeof(data));
+  std::string pattern(dir);
+  pattern.append("\\").append("*");
+
+  HANDLE handle =
+      RX_FindFirstFileEx(RX_FN(pattern).c_str(),
+                         // Do not want alternative name
+                         FindExInfoBasic, &data, FindExSearchNameMatch,
+                         NULL,  // lpSearchFilter
+                         0);
+
+  if (handle == INVALID_HANDLE_VALUE) {
+    auto lastError = GetLastError();
+    switch (lastError) {
+      case ERROR_NOT_FOUND:
+      case ERROR_ACCESS_DENIED:
+      case ERROR_FILE_NOT_FOUND:
+      case ERROR_PATH_NOT_FOUND:
+        status = IOStatus::NotFound();
+        break;
+      default:
+        status = IOErrorFromWindowsError("Failed to GetChildren for: " + dir,
+                                         lastError);
+    }
+    return status;
+  }
+
+  UniqueFindClosePtr fc(handle, FindCloseFunc);
+
+  // For safety
+  data.cFileName[MAX_PATH - 1] = 0;
+
+  while (true) {
+    // Filter out '.' and '..' directory entries,
+    // which appear only on some platforms.
+    const bool ignore =
+        ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0) &&
+        (RX_FNCMP(data.cFileName, ".") == 0 ||
+         RX_FNCMP(data.cFileName, "..") == 0);
+    if (!ignore) {
+      auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName));
+      result->push_back(FN_TO_RX(x));
+    }
+
+    BOOL ret = RX_FindNextFile(handle, &data);
+    // If the function fails the return value is zero
+    // and non-zero otherwise. Not TRUE or FALSE.
+    if (ret == FALSE) {
+      // Posix does not care why we stopped
+      break;
+    }
+    data.cFileName[MAX_PATH - 1] = 0;
+  }
+  return status;
+}
+
+IOStatus WinFileSystem::CreateDir(const std::string& name,
+                                  const IOOptions& /*opts*/,
+                                  IODebugContext* /*dbg*/) {
+  IOStatus result;
+  BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL);
+  if (!ret) {
+    auto lastError = GetLastError();
+    result = IOErrorFromWindowsError("Failed to create a directory: " + name,
+                                     lastError);
+  }
+
+  return result;
+}
+
+IOStatus WinFileSystem::CreateDirIfMissing(const std::string& name,
+                                           const IOOptions& /*opts*/,
+                                           IODebugContext* /*dbg*/) {
+  IOStatus result;
+
+  if (DirExists(name)) {
+    return result;
+  }
+
+  BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL);
+  if (!ret) {
+    auto lastError = GetLastError();
+    if (lastError != ERROR_ALREADY_EXISTS) {
+      result = IOErrorFromWindowsError("Failed to create a directory: " + name,
+                                       lastError);
+    } else {
+      result = IOStatus::IOError(name + ": exists but is not a directory");
+    }
+  }
+  return result;
+}
+
+IOStatus WinFileSystem::DeleteDir(const std::string& name,
+                                  const IOOptions& /*options*/,
+                                  IODebugContext* /*dbg*/) {
+  IOStatus result;
+  BOOL ret = RX_RemoveDirectory(RX_FN(name).c_str());
+  if (!ret) {
+    auto lastError = GetLastError();
+    result =
+        IOErrorFromWindowsError("Failed to remove dir: " + name, lastError);
+  }
+  return result;
+}
+
+IOStatus WinFileSystem::GetFileSize(const std::string& fname,
+                                    const IOOptions& /*opts*/, uint64_t* size,
+                                    IODebugContext* /*dbg*/) {
+  IOStatus s;
+
+  WIN32_FILE_ATTRIBUTE_DATA attrs;
+  if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard,
+                             &attrs)) {
+    ULARGE_INTEGER file_size;
+    file_size.HighPart = attrs.nFileSizeHigh;
+    file_size.LowPart = attrs.nFileSizeLow;
+    *size = file_size.QuadPart;
+  } else {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("Can not get size for: " + fname, lastError);
+  }
+  return s;
+}
+
+uint64_t WinFileSystem::FileTimeToUnixTime(const FILETIME& ftTime) {
+  const uint64_t c_FileTimePerSecond = 10000000U;
+  // UNIX epoch starts on 1970-01-01T00:00:00Z
+  // Windows FILETIME starts on 1601-01-01T00:00:00Z
+  // Therefore, we need to subtract the below number of seconds from
+  // the seconds that we obtain from FILETIME, with an obvious loss of
+  // precision.
+  const uint64_t c_SecondBeforeUnixEpoch = 11644473600U;
+
+  ULARGE_INTEGER li;
+  li.HighPart = ftTime.dwHighDateTime;
+  li.LowPart = ftTime.dwLowDateTime;
+
+  uint64_t result =
+      (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch;
+  return result;
+}
+
+IOStatus WinFileSystem::GetFileModificationTime(const std::string& fname,
+                                                const IOOptions& /*opts*/,
+                                                uint64_t* file_mtime,
+                                                IODebugContext* /*dbg*/) {
+  IOStatus s;
+
+  WIN32_FILE_ATTRIBUTE_DATA attrs;
+  if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard,
+                             &attrs)) {
+    *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime);
+  } else {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError(
+        "Can not get file modification time for: " + fname, lastError);
+    *file_mtime = 0;
+  }
+
+  return s;
+}
+
+IOStatus WinFileSystem::RenameFile(const std::string& src,
+                                   const std::string& target,
+                                   const IOOptions& /*opts*/,
+                                   IODebugContext* /*dbg*/) {
+  IOStatus result;
+
+  // rename() is not capable of replacing the existing file as on Linux
+  // so use OS API directly
+  if (!RX_MoveFileEx(RX_FN(src).c_str(), RX_FN(target).c_str(),
+                     MOVEFILE_REPLACE_EXISTING)) {
+    DWORD lastError = GetLastError();
+
+    std::string text("Failed to rename: ");
+    text.append(src).append(" to: ").append(target);
+
+    result = IOErrorFromWindowsError(text, lastError);
+  }
+
+  return result;
+}
+
+IOStatus WinFileSystem::LinkFile(const std::string& src,
+                                 const std::string& target,
+                                 const IOOptions& /*opts*/,
+                                 IODebugContext* /*dbg*/) {
+  IOStatus result;
+
+  if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) {
+    DWORD lastError = GetLastError();
+    if (lastError == ERROR_NOT_SAME_DEVICE) {
+      return IOStatus::NotSupported("No cross FS links allowed");
+    }
+
+    std::string text("Failed to link: ");
+    text.append(src).append(" to: ").append(target);
+
+    result = IOErrorFromWindowsError(text, lastError);
+  }
+
+  return result;
+}
+
+IOStatus WinFileSystem::NumFileLinks(const std::string& fname,
+                                     const IOOptions& /*opts*/, uint64_t* count,
+                                     IODebugContext* /*dbg*/) {
+  IOStatus s;
+  HANDLE handle =
+      RX_CreateFile(RX_FN(fname).c_str(), 0,
+                    FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
+                    NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
+
+  if (INVALID_HANDLE_VALUE == handle) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("NumFileLinks: " + fname, lastError);
+    return s;
+  }
+  UniqueCloseHandlePtr handle_guard(handle, CloseHandleFunc);
+  FILE_STANDARD_INFO standard_info;
+  if (0 != GetFileInformationByHandleEx(handle, FileStandardInfo,
+                                        &standard_info,
+                                        sizeof(standard_info))) {
+    *count = standard_info.NumberOfLinks;
+  } else {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("GetFileInformationByHandleEx: " + fname,
+                                lastError);
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::AreFilesSame(const std::string& first,
+                                     const std::string& second,
+                                     const IOOptions& /*opts*/, bool* res,
+                                     IODebugContext* /*dbg*/) {
+// For MinGW builds
+#if (_WIN32_WINNT == _WIN32_WINNT_VISTA)
+  IOStatus s = IOStatus::NotSupported();
+#else
+  assert(res != nullptr);
+  IOStatus s;
+  if (res == nullptr) {
+    s = IOStatus::InvalidArgument("res");
+    return s;
+  }
+
+  // 0 - for access means read metadata
+  HANDLE file_1 = RX_CreateFile(
+      RX_FN(first).c_str(), 0,
+      FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
+      OPEN_EXISTING,
+      FILE_FLAG_BACKUP_SEMANTICS,  // make opening folders possible
+      NULL);
+
+  if (INVALID_HANDLE_VALUE == file_1) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("open file: " + first, lastError);
+    return s;
+  }
+  UniqueCloseHandlePtr g_1(file_1, CloseHandleFunc);
+
+  HANDLE file_2 = RX_CreateFile(
+      RX_FN(second).c_str(), 0,
+      FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
+      OPEN_EXISTING,
+      FILE_FLAG_BACKUP_SEMANTICS,  // make opening folders possible
+      NULL);
+
+  if (INVALID_HANDLE_VALUE == file_2) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("open file: " + second, lastError);
+    return s;
+  }
+  UniqueCloseHandlePtr g_2(file_2, CloseHandleFunc);
+
+  FILE_ID_INFO FileInfo_1;
+  BOOL result = GetFileInformationByHandleEx(file_1, FileIdInfo, &FileInfo_1,
+                                             sizeof(FileInfo_1));
+
+  if (!result) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("stat file: " + first, lastError);
+    return s;
+  }
+
+  FILE_ID_INFO FileInfo_2;
+  result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2,
+                                        sizeof(FileInfo_2));
+
+  if (!result) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("stat file: " + second, lastError);
+    return s;
+  }
+
+  if (FileInfo_1.VolumeSerialNumber == FileInfo_2.VolumeSerialNumber) {
+    *res =
+        (0 == memcmp(FileInfo_1.FileId.Identifier, FileInfo_2.FileId.Identifier,
+                     sizeof(FileInfo_1.FileId.Identifier)));
+  } else {
+    *res = false;
+  }
+#endif
+  return s;
+}
+
+IOStatus WinFileSystem::LockFile(const std::string& lockFname,
+                                 const IOOptions& /*opts*/, FileLock** lock,
+                                 IODebugContext* /*dbg*/) {
+  assert(lock != nullptr);
+
+  *lock = NULL;
+  IOStatus result;
+
+  // No-sharing, this is a LOCK file
+  const DWORD ExclusiveAccessON = 0;
+
+  // Obtain exclusive access to the LOCK file.
+  // Previously, instead of the NORMAL attr we set DELETE on close and that
+  // worked well, except with the fault_injection test that insists on
+  // deleting it.
+  HANDLE hFile = 0;
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile = RX_CreateFile(RX_FN(lockFname).c_str(),
+                          (GENERIC_READ | GENERIC_WRITE), ExclusiveAccessON,
+                          NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    result = IOErrorFromWindowsError("Failed to create lock file: " + lockFname,
+                                     lastError);
+  } else {
+    *lock = new WinFileLock(hFile);
+  }
+
+  return result;
+}
+
+IOStatus WinFileSystem::UnlockFile(FileLock* lock, const IOOptions& /*opts*/,
+                                   IODebugContext* /*dbg*/) {
+  IOStatus result;
+
+  assert(lock != nullptr);
+
+  delete lock;
+
+  return result;
+}
+
+IOStatus WinFileSystem::GetTestDirectory(const IOOptions& opts,
+                                         std::string* result,
+                                         IODebugContext* dbg) {
+  std::string output;
+
+  const char* env = getenv("TEST_TMPDIR");
+  if (env && env[0] != '\0') {
+    output = env;
+  } else {
+    env = getenv("TMP");
+
+    if (env && env[0] != '\0') {
+      output = env;
+    } else {
+      output = "c:\\tmp";
+    }
+  }
+  CreateDir(output, opts, dbg);
+
+  output.append("\\testrocksdb-");
+  output.append(std::to_string(GetCurrentProcessId()));
+
+  CreateDir(output, opts, dbg);
+
+  output.swap(*result);
+
+  return IOStatus::OK();
+}
+
+IOStatus WinFileSystem::NewLogger(const std::string& fname,
+                                  const IOOptions& /*opts*/,
+                                  std::shared_ptr<Logger>* result,
+                                  IODebugContext* /*dbg*/) {
+  IOStatus s;
+
+  result->reset();
+
+  HANDLE hFile = 0;
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile = RX_CreateFile(
+        RX_FN(fname).c_str(), GENERIC_WRITE,
+        FILE_SHARE_READ | FILE_SHARE_DELETE,  // In RocksDB log files are
+                                              // renamed and deleted before
+                                              // they are closed. This enables
+                                              // doing so.
+        NULL,
+        CREATE_ALWAYS,  // Original fopen mode is "w"
+        FILE_ATTRIBUTE_NORMAL, NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("Failed to open LogFile: " + fname, lastError);
+  } else {
+    {
+      // With log files we want to set the true creation time as of now,
+      // because the system for some reason caches the attributes of the
+      // previous file that has just been renamed from this name, so
+      // auto_roll_logger_test fails.
+      FILETIME ft;
+      GetSystemTimeAsFileTime(&ft);
+      // Set creation, last access and last write time to the same value
+      SetFileTime(hFile, &ft, &ft, &ft);
+    }
+    result->reset(new WinLogger(&WinEnvThreads::gettid, clock_.get(), hFile));
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::IsDirectory(const std::string& path,
+                                    const IOOptions& /*opts*/, bool* is_dir,
+                                    IODebugContext* /*dbg*/) {
+  BOOL ret = RX_PathIsDirectory(RX_FN(path).c_str());
+  if (is_dir) {
+    *is_dir = ret ? true : false;
+  }
+  return IOStatus::OK();
+}
+
+Status WinEnvIO::GetHostName(char* name, uint64_t len) {
+  Status s;
+  DWORD nSize = static_cast<DWORD>(
+      std::min<uint64_t>(len, std::numeric_limits<DWORD>::max()));
+
+  if (!::GetComputerNameA(name, &nSize)) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("GetHostName", lastError);
+  } else {
+    name[nSize] = 0;
+  }
+
+  return s;
+}
+
+IOStatus WinFileSystem::GetAbsolutePath(const std::string& db_path,
+                                        const IOOptions& /*options*/,
+                                        std::string* output_path,
+                                        IODebugContext* dbg) {
+  // Check if we already have an absolute path.
+  // For test compatibility we will consider a starting slash as an
+  // absolute path.
+  if ((!db_path.empty() && (db_path[0] == '\\' || db_path[0] == '/')) ||
+      !RX_PathIsRelative(RX_FN(db_path).c_str())) {
+    *output_path = db_path;
+    return IOStatus::OK();
+  }
+
+  RX_FILESTRING result;
+  result.resize(MAX_PATH);
+
+  // Hopefully nothing changes the current directory while we do this;
+  // however _getcwd also suffers from the same limitation.
+  DWORD len = RX_GetCurrentDirectory(MAX_PATH, &result[0]);
+  if (len == 0) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError("Failed to get current working directory",
+                                   lastError);
+  }
+
+  result.resize(len);
+  std::string res = FN_TO_RX(result);
+
+  res.swap(*output_path);
+  return IOStatus::OK();
+}
+
+IOStatus WinFileSystem::GetFreeSpace(const std::string& path,
+                                     const IOOptions& /*options*/,
+                                     uint64_t* diskfree,
+                                     IODebugContext* /*dbg*/) {
+  assert(diskfree != nullptr);
+  ULARGE_INTEGER freeBytes;
+  BOOL f = RX_GetDiskFreeSpaceEx(RX_FN(path).c_str(), &freeBytes, NULL, NULL);
+  if (f) {
+    *diskfree = freeBytes.QuadPart;
+    return IOStatus::OK();
+  } else {
+    DWORD lastError = GetLastError();
+    return IOErrorFromWindowsError("Failed to get free space: " + path,
+                                   lastError);
+  }
+}
+
+FileOptions WinFileSystem::OptimizeForLogWrite(
+    const FileOptions& file_options, const DBOptions& db_options) const {
+  FileOptions optimized(file_options);
+  // These two are the same as the default optimizations
+  optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
+  optimized.writable_file_max_buffer_size =
+      db_options.writable_file_max_buffer_size;
+
+  // This adversely affects p999 latencies on Windows
+  optimized.use_mmap_writes = false;
+  // Direct writes will produce a huge perf impact on
+  // Windows. Pre-allocate space for WAL.
+  optimized.use_direct_writes = false;
+  return optimized;
+}
+
+FileOptions WinFileSystem::OptimizeForManifestWrite(
+    const FileOptions& options) const {
+  FileOptions optimized(options);
+  optimized.use_mmap_writes = false;
+  optimized.use_direct_reads = false;
+  return optimized;
+}
+
+FileOptions WinFileSystem::OptimizeForManifestRead(
+    const FileOptions& file_options) const {
+  FileOptions optimized(file_options);
+  optimized.use_mmap_writes = false;
+  optimized.use_direct_reads = false;
+  return optimized;
+}
+
+// Returns true iff the named directory exists and is a directory.
+bool WinFileSystem::DirExists(const std::string& dname) {
+  WIN32_FILE_ATTRIBUTE_DATA attrs;
+  if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), GetFileExInfoStandard,
+                             &attrs)) {
+    return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY);
+  }
+  return false;
+}
+
+size_t WinFileSystem::GetSectorSize(const std::string& fname) {
+  size_t sector_size = kSectorSize;
+
+  // obtain device handle
+  char devicename[7] = "\\\\.\\";
+  int erresult = 0;
+  if (RX_PathIsRelative(RX_FN(fname).c_str())) {
+    RX_FILESTRING rx_current_dir;
+    rx_current_dir.resize(MAX_PATH);
+    DWORD len = RX_GetCurrentDirectory(MAX_PATH, &rx_current_dir[0]);
+    if (len == 0) {
+      return sector_size;
+    }
+    rx_current_dir.resize(len);
+    std::string current_dir = FN_TO_RX(rx_current_dir);
+    erresult =
+        strncat_s(devicename, sizeof(devicename), current_dir.c_str(), 2);
+  } else {
+    erresult = strncat_s(devicename, sizeof(devicename), fname.c_str(), 2);
+  }
+
+  if (erresult) {
+    assert(false);
+    return sector_size;
+  }
+
+  HANDLE hDevice = CreateFile(devicename, 0, 0, nullptr, OPEN_EXISTING,
+                              FILE_ATTRIBUTE_NORMAL, nullptr);
+
+  if (hDevice == INVALID_HANDLE_VALUE) {
+    return sector_size;
+  }
+
+  STORAGE_PROPERTY_QUERY spropertyquery;
+  spropertyquery.PropertyId = StorageAccessAlignmentProperty;
+  spropertyquery.QueryType = PropertyStandardQuery;
+
+  BYTE output_buffer[sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR)];
+  DWORD output_bytes = 0;
+
+  BOOL ret = DeviceIoControl(
+      hDevice, IOCTL_STORAGE_QUERY_PROPERTY, &spropertyquery,
+      sizeof(spropertyquery), output_buffer,
+      sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), &output_bytes, nullptr);
+
+  if (ret) {
+    sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR*)output_buffer)
+                      ->BytesPerLogicalSector;
+  } else {
+    // Many devices do not support StorageAccessAlignmentProperty. Any failure
+    // here and we fall back to logical alignment.
+
+    DISK_GEOMETRY_EX geometry = {0};
+    ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, nullptr, 0,
+                          &geometry, sizeof(geometry), &output_bytes, nullptr);
+    if (ret) {
+      sector_size = geometry.Geometry.BytesPerSector;
+    }
+  }
+
+  if (hDevice != INVALID_HANDLE_VALUE) {
+    CloseHandle(hDevice);
+  }
+
+  return sector_size;
+}
+
+////////////////////////////////////////////////////////////////////////
+// WinEnvThreads
+
+WinEnvThreads::WinEnvThreads(Env* hosted_env)
+    : hosted_env_(hosted_env), thread_pools_(Env::Priority::TOTAL) {
+  for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
+    thread_pools_[pool_id].SetThreadPriority(
+        static_cast<Env::Priority>(pool_id));
+    // This allows later initializing the thread-local-env of each thread.
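+    // (Env::Priority enumerates the pools: BOTTOM, LOW, HIGH, USER, with
+    // TOTAL as the count.)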
+    thread_pools_[pool_id].SetHostEnv(hosted_env);
+  }
+}
+
+WinEnvThreads::~WinEnvThreads() {
+  WaitForJoin();
+
+  for (auto& thpool : thread_pools_) {
+    thpool.JoinAllThreads();
+  }
+}
+
+void WinEnvThreads::Schedule(void (*function)(void*), void* arg,
+                             Env::Priority pri, void* tag,
+                             void (*unschedFunction)(void* arg)) {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  thread_pools_[pri].Schedule(function, arg, tag, unschedFunction);
+}
+
+int WinEnvThreads::UnSchedule(void* arg, Env::Priority pri) {
+  return thread_pools_[pri].UnSchedule(arg);
+}
+
+namespace {
+
+struct StartThreadState {
+  void (*user_function)(void*);
+  void* arg;
+};
+
+void* StartThreadWrapper(void* arg) {
+  std::unique_ptr<StartThreadState> state(
+      reinterpret_cast<StartThreadState*>(arg));
+  state->user_function(state->arg);
+  return nullptr;
+}
+
+}  // namespace
+
+void WinEnvThreads::StartThread(void (*function)(void* arg), void* arg) {
+  std::unique_ptr<StartThreadState> state(new StartThreadState);
+  state->user_function = function;
+  state->arg = arg;
+  try {
+    Thread th(&StartThreadWrapper, state.get());
+    state.release();
+
+    std::lock_guard<std::mutex> lg(mu_);
+    threads_to_join_.push_back(std::move(th));
+
+  } catch (const std::system_error& ex) {
+    WinthreadCall("start thread", ex.code());
+  }
+}
+
+void WinEnvThreads::WaitForJoin() {
+  for (auto& th : threads_to_join_) {
+    th.join();
+  }
+  threads_to_join_.clear();
+}
+
+unsigned int WinEnvThreads::GetThreadPoolQueueLen(Env::Priority pri) const {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  return thread_pools_[pri].GetQueueLen();
+}
+
+int WinEnvThreads::ReserveThreads(int threads_to_reserved, Env::Priority pri) {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  return thread_pools_[pri].ReserveThreads(threads_to_reserved);
+}
+
+int WinEnvThreads::ReleaseThreads(int threads_to_released, Env::Priority pri) {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  return thread_pools_[pri].ReleaseThreads(threads_to_released);
+}
+
+uint64_t WinEnvThreads::gettid() {
+  uint64_t thread_id = GetCurrentThreadId();
+  return thread_id;
+}
+
+uint64_t WinEnvThreads::GetThreadID() const { return gettid(); }
+
+void WinEnvThreads::SetBackgroundThreads(int num, Env::Priority pri) {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  thread_pools_[pri].SetBackgroundThreads(num);
+}
+
+int WinEnvThreads::GetBackgroundThreads(Env::Priority pri) {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  return thread_pools_[pri].GetBackgroundThreads();
+}
+
+void WinEnvThreads::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  thread_pools_[pri].IncBackgroundThreadsIfNeeded(num);
+}
+
+/////////////////////////////////////////////////////////////////////////
+// WinEnv
+
+WinEnv::WinEnv()
+    : CompositeEnv(WinFileSystem::Default(), WinClock::Default()),
+      winenv_io_(this),
+      winenv_threads_(this) {
+  // Protected member of the base class
+  thread_status_updater_ = CreateThreadStatusUpdater();
+}
+
+WinEnv::~WinEnv() {
+  // All threads must be joined before the deletion of
+  // thread_status_updater_.
+  delete thread_status_updater_;
+}
+
+Status WinEnv::GetThreadList(std::vector<ThreadStatus>* thread_list) {
+  assert(thread_status_updater_);
+  return thread_status_updater_->GetThreadList(thread_list);
+}
+
+Status WinEnv::GetHostName(char* name, uint64_t len) {
+  return winenv_io_.GetHostName(name, len);
+}
+
+void WinEnv::Schedule(void (*function)(void*), void* arg, Env::Priority pri,
+                      void* tag, void (*unschedFunction)(void* arg)) {
+  return winenv_threads_.Schedule(function, arg, pri, tag, unschedFunction);
+}
+
+int WinEnv::UnSchedule(void* arg, Env::Priority pri) {
+  return winenv_threads_.UnSchedule(arg, pri);
+}
+
+void WinEnv::StartThread(void (*function)(void* arg), void* arg) {
+  return winenv_threads_.StartThread(function, arg);
+}
+
+void WinEnv::WaitForJoin() { return winenv_threads_.WaitForJoin(); }
+
+unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const {
+  return winenv_threads_.GetThreadPoolQueueLen(pri);
+}
+
+int WinEnv::ReserveThreads(int threads_to_reserved, Env::Priority pri) {
+  return winenv_threads_.ReserveThreads(threads_to_reserved, pri);
+}
+
+int WinEnv::ReleaseThreads(int threads_to_released, Env::Priority pri) {
+  return winenv_threads_.ReleaseThreads(threads_to_released, pri);
+}
+
+uint64_t WinEnv::GetThreadID() const { return winenv_threads_.GetThreadID(); }
+
+// Allow increasing the number of worker threads.
+void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) {
+  return winenv_threads_.SetBackgroundThreads(num, pri);
+}
+
+int WinEnv::GetBackgroundThreads(Env::Priority pri) {
+  return winenv_threads_.GetBackgroundThreads(pri);
+}
+
+void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) {
+  return winenv_threads_.IncBackgroundThreadsIfNeeded(num, pri);
+}
+
+}  // namespace port
+
+std::shared_ptr<FileSystem> FileSystem::Default() {
+  return port::WinFileSystem::Default();
+}
+
+const std::shared_ptr<SystemClock>& SystemClock::Default() {
+  STATIC_AVOID_DESTRUCTION(std::shared_ptr<SystemClock>, clock)
+  (std::make_shared<port::WinClock>());
+  return clock;
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_win.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_win.h
new file mode 100644
index 0000000000..cf04ec2fec
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/env_win.h
@@ -0,0 +1,305 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc. Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine-grained control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
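+//
+// Illustrative use from application code (hypothetical snippet, not part of
+// this header):
+//   rocksdb::Options options;
+//   options.env = rocksdb::Env::Default();  // a WinEnv on Windows builds
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(options, "test.db", &db);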
+
+#pragma once
+
+#include <stdint.h>
+#include <windows.h>
+
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "util/threadpool_imp.h"
+
+#undef GetCurrentTime
+#undef DeleteFile
+#undef LoadLibrary
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// Currently not designed for inheritance but rather a replacement
+class WinEnvThreads {
+ public:
+  explicit WinEnvThreads(Env* hosted_env);
+
+  ~WinEnvThreads();
+
+  WinEnvThreads(const WinEnvThreads&) = delete;
+  WinEnvThreads& operator=(const WinEnvThreads&) = delete;
+
+  void Schedule(void (*function)(void*), void* arg, Env::Priority pri,
+                void* tag, void (*unschedFunction)(void* arg));
+
+  int UnSchedule(void* arg, Env::Priority pri);
+
+  void StartThread(void (*function)(void* arg), void* arg);
+
+  void WaitForJoin();
+
+  unsigned int GetThreadPoolQueueLen(Env::Priority pri) const;
+
+  int ReserveThreads(int threads_to_be_reserved, Env::Priority pri);
+
+  int ReleaseThreads(int threads_to_be_released, Env::Priority pri);
+
+  static uint64_t gettid();
+
+  uint64_t GetThreadID() const;
+
+  // Allow increasing the number of worker threads.
+  void SetBackgroundThreads(int num, Env::Priority pri);
+  int GetBackgroundThreads(Env::Priority pri);
+
+  void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri);
+
+ private:
+  Env* hosted_env_;
+  mutable std::mutex mu_;
+  std::vector<ThreadPoolImpl> thread_pools_;
+  std::vector<Thread> threads_to_join_;
+};
+
+class WinClock : public SystemClock {
+ public:
+  WinClock();
+  virtual ~WinClock() {}
+
+  static const char* kClassName() { return "WindowsClock"; }
+  const char* Name() const override { return kDefaultName(); }
+  const char* NickName() const override { return kClassName(); }
+
+  uint64_t NowMicros() override;
+
+  uint64_t NowNanos() override;
+
+  // 0 indicates not supported
+  uint64_t CPUMicros() override { return 0; }
+  void SleepForMicroseconds(int micros) override;
+
+  Status GetCurrentTime(int64_t* unix_time) override;
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  std::string TimeToString(uint64_t time) override;
+
+  uint64_t GetPerfCounterFrequency() const { return perf_counter_frequency_; }
+
+ private:
+  using FnGetSystemTimePreciseAsFileTime = VOID(WINAPI*)(LPFILETIME);
+
+  uint64_t perf_counter_frequency_;
+  uint64_t nano_seconds_per_period_;
+  FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_;
+};
+
+class WinFileSystem : public FileSystem {
+ public:
+  static const std::shared_ptr<WinFileSystem>& Default();
+  WinFileSystem(const std::shared_ptr<SystemClock>& clock);
+  ~WinFileSystem() {}
+  static const char* kClassName() { return "WinFS"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kDefaultName(); }
+
+  static size_t GetSectorSize(const std::string& fname);
+  size_t GetPageSize() const { return page_size_; }
+  size_t GetAllocationGranularity() const { return allocation_granularity_; }
+
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override;
+
+  // Truncate the named file to the specified size.
+  IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override;
+  IOStatus NewSequentialFile(const std::string& fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSSequentialFile>* result,
+                             IODebugContext* dbg) override;
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& options,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* /*dbg*/) override;
+  IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* r,
+                           IODebugContext* dbg) override;
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& options,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override;
+
+  IOStatus NewRandomRWFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override;
+  IOStatus NewMemoryMappedFileBuffer(
+      const std::string& fname,
+      std::unique_ptr<MemoryMappedFileBuffer>* result) override;
+
+  IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override;
+  IOStatus FileExists(const std::string& f, const IOOptions& io_opts,
+                      IODebugContext* dbg) override;
+  IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+                       std::vector<std::string>* r,
+                       IODebugContext* dbg) override;
+  IOStatus CreateDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  // Creates directory if missing. Return Ok if it exists, or successful in
+  // Creating.
+  IOStatus CreateDirIfMissing(const std::string& dirname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) override;
+
+  // Delete the specified directory.
+  IOStatus DeleteDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+  // Store the size of fname in *file_size.
+  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
+                       uint64_t* file_size, IODebugContext* dbg) override;
+  // Store the last modification time of fname in *file_mtime.
+  IOStatus GetFileModificationTime(const std::string& fname,
+                                   const IOOptions& options,
+                                   uint64_t* file_mtime,
+                                   IODebugContext* dbg) override;
+  // Rename file src to target.
+  IOStatus RenameFile(const std::string& src, const std::string& target,
+                      const IOOptions& options, IODebugContext* dbg) override;
+
+  // Hard Link file src to target.
+  IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override;
+  IOStatus NumFileLinks(const std::string& /*fname*/,
+                        const IOOptions& /*options*/, uint64_t* /*count*/,
+                        IODebugContext* /*dbg*/) override;
+  IOStatus AreFilesSame(const std::string& /*first*/,
+                        const std::string& /*second*/,
+                        const IOOptions& /*options*/, bool* /*res*/,
+                        IODebugContext* /*dbg*/) override;
+  IOStatus LockFile(const std::string& fname, const IOOptions& options,
+                    FileLock** lock, IODebugContext* dbg) override;
+  IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
+                      IODebugContext* dbg) override;
+  IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+                            IODebugContext* dbg) override;
+
+  // Create and returns a default logger (an instance of EnvLogger) for storing
+  // informational messages. Derived classes can override to provide custom
+  // logger.
+  IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts,
+                     std::shared_ptr<Logger>* result,
+                     IODebugContext* dbg) override;
+  // Get full directory name for this db.
+  IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options,
+                           std::string* output_path,
+                           IODebugContext* dbg) override;
+  IOStatus IsDirectory(const std::string& /*path*/, const IOOptions& options,
+                       bool* is_dir, IODebugContext* /*dgb*/) override;
+  // This seems to clash with a macro on Windows, so #undef it here
+#undef GetFreeSpace
+  IOStatus GetFreeSpace(const std::string& /*path*/,
+                        const IOOptions& /*options*/, uint64_t* /*diskfree*/,
+                        IODebugContext* /*dbg*/) override;
+  FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+                                  const DBOptions& db_options) const override;
+  FileOptions OptimizeForManifestRead(
+      const FileOptions& file_options) const override;
+  FileOptions OptimizeForManifestWrite(
+      const FileOptions& file_options) const override;
+  bool use_async_io() override { return false; }
+
+ protected:
+  static uint64_t FileTimeToUnixTime(const FILETIME& ftTime);
+  // Returns true iff the named directory exists and is a directory.
+  virtual bool DirExists(const std::string& dname);
+  // Helper for NewWritableFile and ReopenWritableFile
+  virtual IOStatus OpenWritableFile(const std::string& fname,
+                                    const FileOptions& options,
+                                    std::unique_ptr<FSWritableFile>* result,
+                                    bool reopen);
+
+ private:
+  std::shared_ptr<SystemClock> clock_;
+  size_t page_size_;
+  size_t allocation_granularity_;
+};
+
+// Designed for inheritance so it can be re-used,
+// but with certain parts replaced.
+class WinEnvIO {
+ public:
+  explicit WinEnvIO(Env* hosted_env);
+
+  virtual ~WinEnvIO();
+
+  virtual Status GetHostName(char* name, uint64_t len);
+
+ private:
+  Env* hosted_env_;
+};
+
+class WinEnv : public CompositeEnv {
+ public:
+  WinEnv();
+
+  ~WinEnv();
+  static const char* kClassName() { return "WinEnv"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kDefaultName(); }
+
+  Status GetHostName(char* name, uint64_t len) override;
+
+  Status GetThreadList(std::vector<ThreadStatus>* thread_list) override;
+
+  void Schedule(void (*function)(void*), void* arg, Env::Priority pri,
+                void* tag, void (*unschedFunction)(void* arg)) override;
+
+  int UnSchedule(void* arg, Env::Priority pri) override;
+
+  void StartThread(void (*function)(void* arg), void* arg) override;
+
+  void WaitForJoin() override;
+
+  unsigned int GetThreadPoolQueueLen(Env::Priority pri) const override;
+
+  int ReserveThreads(int threads_to_be_reserved, Env::Priority pri) override;
+
+  int ReleaseThreads(int threads_to_be_released, Env::Priority pri) override;
+
+  uint64_t GetThreadID() const override;
+
+  // Allow increasing the number of worker threads.
+  void SetBackgroundThreads(int num, Env::Priority pri) override;
+  int GetBackgroundThreads(Env::Priority pri) override;
+
+  void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) override;
+
+ private:
+  WinEnvIO winenv_io_;
+  WinEnvThreads winenv_threads_;
+};
+
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/io_win.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/io_win.cc
new file mode 100644
index 0000000000..4fa735518f
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/io_win.cc
@@ -0,0 +1,1101 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include "port/win/io_win.h"
+
+#include "env_win.h"
+#include "monitoring/iostats_context_imp.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+/*
+ * DirectIOHelper
+ */
+namespace {
+
+const size_t kSectorSize = 512;
+
+inline bool IsPowerOfTwo(const size_t alignment) {
+  return ((alignment) & (alignment - 1)) == 0;
+}
+
+inline bool IsAligned(size_t alignment, const void* ptr) {
+  return ((uintptr_t(ptr)) & (alignment - 1)) == 0;
+}
+}  // namespace
+
+std::string GetWindowsErrSz(DWORD err) {
+  std::string Err;
+  LPSTR lpMsgBuf = nullptr;
+  FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
+                     FORMAT_MESSAGE_IGNORE_INSERTS,
+                 NULL, err,
+                 0,  // Default language
+                 reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
+
+  if (lpMsgBuf) {
+    Err = lpMsgBuf;
+    LocalFree(lpMsgBuf);
+  }
+  return Err;
+}
+
+// We preserve the original name of this interface to denote the original idea
+// behind it.
+// All reads happen at a specified offset, and the POSIX pwrite interface does
+// not change the position of the file pointer: judging from the man page and
+// errno, it executes the lseek atomically and returns the file pointer to
+// where it was. WriteFile() does not have this capability, so for both pread
+// and pwrite below the pointer is advanced to the next position, which is
+// fine for writes because they are (should be) sequential.
+// Because all the reads/writes happen at the specified offset, the caller in
+// theory should not rely on the current file offset.
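+//
+// A minimal usage sketch (illustrative only, not part of the vendored
+// source): given a `WinFileData* file` wrapping an open, writable HANDLE,
+// the helpers below are called as
+//
+//   size_t written = 0;
+//   IOStatus s = pwrite(file, Slice("abc", 3), /*offset=*/0, written);
+//   // On success written == 3. The Windows file pointer may have moved,
+//   // so callers must not depend on the current file position afterwards.
+//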
+IOStatus pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written) { + IOStatus s; + bytes_written = 0; + + size_t num_bytes = data.size(); + if (num_bytes > std::numeric_limits::max()) { + // May happen in 64-bit builds where size_t is 64-bits but + // long is still 32-bit, but that's the API here at the moment + return IOStatus::InvalidArgument( + "num_bytes is too large for a single write: " + file_data->GetName()); + } + + OVERLAPPED overlapped = {0}; + ULARGE_INTEGER offsetUnion; + offsetUnion.QuadPart = offset; + + overlapped.Offset = offsetUnion.LowPart; + overlapped.OffsetHigh = offsetUnion.HighPart; + + DWORD bytesWritten = 0; + + if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), + static_cast(num_bytes), &bytesWritten, + &overlapped)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(), + lastError); + } else { + bytes_written = bytesWritten; + } + + return s; +} + +// See comments for pwrite above +IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read) { + IOStatus s; + bytes_read = 0; + + if (num_bytes > std::numeric_limits::max()) { + return IOStatus::InvalidArgument( + "num_bytes is too large for a single read: " + file_data->GetName()); + } + + OVERLAPPED overlapped = {0}; + ULARGE_INTEGER offsetUnion; + offsetUnion.QuadPart = offset; + + overlapped.Offset = offsetUnion.LowPart; + overlapped.OffsetHigh = offsetUnion.HighPart; + + DWORD bytesRead = 0; + + if (FALSE == ReadFile(file_data->GetFileHandle(), src, + static_cast(num_bytes), &bytesRead, + &overlapped)) { + auto lastError = GetLastError(); + // EOF is OK with zero bytes read + if (lastError != ERROR_HANDLE_EOF) { + s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(), + lastError); + } + } else { + bytes_read = bytesRead; + } + + return s; +} + +// SetFileInformationByHandle() is capable of fast pre-allocates. +// However, this does not change the file end position unless the file is +// truncated and the pre-allocated space is not considered filled with zeros. +IOStatus fallocate(const std::string& filename, HANDLE hFile, + uint64_t to_size) { + IOStatus status; + + FILE_ALLOCATION_INFO alloc_info; + alloc_info.AllocationSize.QuadPart = to_size; + + if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info, + sizeof(FILE_ALLOCATION_INFO))) { + auto lastError = GetLastError(); + status = IOErrorFromWindowsError( + "Failed to pre-allocate space: " + filename, lastError); + } + + return status; +} + +IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) { + IOStatus status; + + FILE_END_OF_FILE_INFO end_of_file; + end_of_file.EndOfFile.QuadPart = toSize; + + if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file, + sizeof(FILE_END_OF_FILE_INFO))) { + auto lastError = GetLastError(); + status = IOErrorFromWindowsError("Failed to Set end of file: " + filename, + lastError); + } + + return status; +} + +size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/, + size_t /*max_size*/) { + // Returning 0 is safe as it causes the table reader to generate a unique ID. + // This is suboptimal for performance as it prevents multiple table readers + // for the same file from sharing cached blocks. For example, if users have + // a low value for `max_open_files`, there can be many table readers opened + // for the same file. 
+  //
+  // TODO: this is a temporary solution as it is safe but not optimal for
+  // performance. For more details see discussion in
+  // https://github.com/facebook/rocksdb/pull/5844.
+  return 0;
+}
+
+WinFileData::WinFileData(const std::string& filename, HANDLE hFile,
+                         bool direct_io)
+    : filename_(filename),
+      hFile_(hFile),
+      use_direct_io_(direct_io),
+      sector_size_(WinFileSystem::GetSectorSize(filename)) {}
+
+bool WinFileData::IsSectorAligned(const size_t off) const {
+  return (off & (sector_size_ - 1)) == 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// WinMmapReadableFile
+
+WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
+                                         HANDLE hFile, HANDLE hMap,
+                                         const void* mapped_region,
+                                         size_t length)
+    : WinFileData(fileName, hFile, false /* use_direct_io */),
+      hMap_(hMap),
+      mapped_region_(mapped_region),
+      length_(length) {}
+
+WinMmapReadableFile::~WinMmapReadableFile() {
+  BOOL ret __attribute__((__unused__));
+  ret = ::UnmapViewOfFile(mapped_region_);
+  assert(ret);
+
+  ret = ::CloseHandle(hMap_);
+  assert(ret);
+}
+
+IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n,
+                                   const IOOptions& /*options*/, Slice* result,
+                                   char* scratch,
+                                   IODebugContext* /*dbg*/) const {
+  IOStatus s;
+
+  if (offset > length_) {
+    *result = Slice();
+    return IOError(filename_, EINVAL);
+  } else if (offset + n > length_) {
+    n = length_ - static_cast<size_t>(offset);
+  }
+  *result = Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
+  return s;
+}
+
+IOStatus WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
+  return IOStatus::OK();
+}
+
+size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
+  return GetUniqueIdFromFile(hFile_, id, max_size);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// WinMmapFile
+
+// Can only truncate or reserve to a sector-size-aligned boundary if
+// used on files that are opened with unbuffered I/O
+IOStatus WinMmapFile::TruncateFile(uint64_t toSize) {
+  return ftruncate(filename_, hFile_, toSize);
+}
+
+IOStatus WinMmapFile::UnmapCurrentRegion() {
+  IOStatus status;
+
+  if (mapped_begin_ != nullptr) {
+    if (!::UnmapViewOfFile(mapped_begin_)) {
+      status = IOErrorFromWindowsError(
+          "Failed to unmap file view: " + filename_, GetLastError());
+    }
+
+    // Move on to the next portion of the file
+    file_offset_ += view_size_;
+
+    // UnmapViewOfFile automatically sends the data to disk, but not the
+    // metadata, which is good and provides some equivalent of fdatasync()
+    // on Linux; therefore, we do not need a separate flag for metadata.
+    mapped_begin_ = nullptr;
+    mapped_end_ = nullptr;
+    dst_ = nullptr;
+
+    last_sync_ = nullptr;
+    pending_sync_ = false;
+  }
+
+  return status;
+}
+
+IOStatus WinMmapFile::MapNewRegion(const IOOptions& options,
+                                   IODebugContext* dbg) {
+  IOStatus status;
+
+  assert(mapped_begin_ == nullptr);
+
+  size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_;
+
+  if (minDiskSize > reserved_size_) {
+    status = Allocate(file_offset_, view_size_, options, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+  }
+
+  // Need to remap
+  if (hMap_ == NULL || reserved_size_ > mapping_size_) {
+    if (hMap_ != NULL) {
+      // Unmap the previous one
+      BOOL ret __attribute__((__unused__));
+      ret = ::CloseHandle(hMap_);
+      assert(ret);
+      hMap_ = NULL;
+    }
+
+    ULARGE_INTEGER mappingSize;
+    mappingSize.QuadPart = reserved_size_;
+
+    hMap_ = CreateFileMappingA(
+        hFile_,
+        NULL,            // Security attributes
+        PAGE_READWRITE,  // There
is not a write only mode for mapping + mappingSize.HighPart, // Enable mapping the whole file but the actual + // amount mapped is determined by MapViewOfFile + mappingSize.LowPart, + NULL); // Mapping name + + if (NULL == hMap_) { + return IOErrorFromWindowsError( + "WindowsMmapFile failed to create file mapping for: " + filename_, + GetLastError()); + } + + mapping_size_ = reserved_size_; + } + + ULARGE_INTEGER offset; + offset.QuadPart = file_offset_; + + // View must begin at the granularity aligned offset + mapped_begin_ = reinterpret_cast( + MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, + view_size_, NULL)); + + if (!mapped_begin_) { + status = IOErrorFromWindowsError( + "WindowsMmapFile failed to map file view: " + filename_, + GetLastError()); + } else { + mapped_end_ = mapped_begin_ + view_size_; + dst_ = mapped_begin_; + last_sync_ = mapped_begin_; + pending_sync_ = false; + } + return status; +} + +IOStatus WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(filename_, hFile_, spaceToReserve); +} + +WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, + size_t page_size, size_t allocation_granularity, + const FileOptions& options) + : WinFileData(fname, hFile, false), + FSWritableFile(options), + hMap_(NULL), + page_size_(page_size), + allocation_granularity_(allocation_granularity), + reserved_size_(0), + mapping_size_(0), + view_size_(0), + mapped_begin_(nullptr), + mapped_end_(nullptr), + dst_(nullptr), + last_sync_(nullptr), + file_offset_(0), + pending_sync_(false) { + // Allocation granularity must be obtained from GetSystemInfo() and must be + // a power of two. + assert(allocation_granularity > 0); + assert((allocation_granularity & (allocation_granularity - 1)) == 0); + + assert(page_size > 0); + assert((page_size & (page_size - 1)) == 0); + + // Only for memory mapped writes + assert(options.use_mmap_writes); + + // View size must be both the multiple of allocation_granularity AND the + // page size and the granularity is usually a multiple of a page size. 
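+  // For example (illustrative numbers, not taken from this changeset): with
+  // the common Windows allocation granularity of 64 KiB, Roundup(32 KiB,
+  // 64 KiB) yields view_size_ == 64 KiB, so each mapped view covers exactly
+  // one allocation granule.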
+ const size_t viewSize = + 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode + view_size_ = Roundup(viewSize, allocation_granularity_); +} + +WinMmapFile::~WinMmapFile() { + if (hFile_) { + this->Close(IOOptions(), nullptr); + } +} + +IOStatus WinMmapFile::Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) { + const char* src = data.data(); + size_t left = data.size(); + + while (left > 0) { + assert(mapped_begin_ <= dst_); + size_t avail = mapped_end_ - dst_; + + if (avail == 0) { + IOStatus s = UnmapCurrentRegion(); + if (s.ok()) { + s = MapNewRegion(options, dbg); + } + + if (!s.ok()) { + return s; + } + } else { + size_t n = std::min(left, avail); + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + pending_sync_ = true; + } + } + + // Now make sure that the last partial page is padded with zeros if needed + size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_); + if (bytesToPad > 0) { + memset(dst_, 0, bytesToPad); + } + + return IOStatus::OK(); +} + +// Means Close() will properly take care of truncate +// and it does not need any additional information +IOStatus WinMmapFile::Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +IOStatus WinMmapFile::Close(const IOOptions& options, IODebugContext* dbg) { + IOStatus s; + + assert(NULL != hFile_); + + // We truncate to the precise size so no + // uninitialized data at the end. SetEndOfFile + // which we use does not write zeros and it is good. + uint64_t targetSize = GetFileSize(options, dbg); + + if (mapped_begin_ != nullptr) { + // Sync before unmapping to make sure everything + // is on disk and there is not a lazy writing + // so we are deterministic with the tests + Sync(options, dbg); + s = UnmapCurrentRegion(); + } + + if (NULL != hMap_) { + BOOL ret = ::CloseHandle(hMap_); + if (!ret && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to Close mapping for file: " + filename_, lastError); + } + + hMap_ = NULL; + } + + if (hFile_ != NULL) { + TruncateFile(targetSize); + + BOOL ret = ::CloseHandle(hFile_); + hFile_ = NULL; + + if (!ret && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to close file map handle: " + filename_, lastError); + } + } + + return s; +} + +IOStatus WinMmapFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +// Flush only data +IOStatus WinMmapFile::Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; + + // Some writes occurred since last sync + if (dst_ > last_sync_) { + assert(mapped_begin_); + assert(dst_); + assert(dst_ > mapped_begin_); + assert(dst_ < mapped_end_); + + size_t page_begin = + TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); + size_t page_end = + TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); + + // Flush only the amount of that is a multiple of pages + if (!::FlushViewOfFile(mapped_begin_ + page_begin, + (page_end - page_begin) + page_size_)) { + s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_, + GetLastError()); + } else { + last_sync_ = dst_; + } + } + + return s; +} + +/** + * Flush data as well as metadata to stable storage. 
+ */ +IOStatus WinMmapFile::Fsync(const IOOptions& options, IODebugContext* dbg) { + IOStatus s = Sync(options, dbg); + + // Flush metadata + if (s.ok() && pending_sync_) { + if (!::FlushFileBuffers(hFile_)) { + s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_, + GetLastError()); + } + pending_sync_ = false; + } + + return s; +} + +/** + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ +uint64_t WinMmapFile::GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + size_t used = dst_ - mapped_begin_; + return file_offset_ + used; +} + +IOStatus WinMmapFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); +} + +IOStatus WinMmapFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus status; + TEST_KILL_RANDOM("WinMmapFile::Allocate"); + + // Make sure that we reserve an aligned amount of space + // since the reservation block size is driven outside so we want + // to check if we are ok with reservation here + size_t spaceToReserve = + Roundup(static_cast(offset + len), view_size_); + // Nothing to do + if (spaceToReserve <= reserved_size_) { + return status; + } + + IOSTATS_TIMER_GUARD(allocate_nanos); + status = PreallocateInternal(spaceToReserve); + if (status.ok()) { + reserved_size_ = spaceToReserve; + } + return status; +} + +size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(hFile_, id, max_size); +} + +////////////////////////////////////////////////////////////////////////////////// +// WinSequentialFile + +WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f, + const FileOptions& options) + : WinFileData(fname, f, options.use_direct_reads) {} + +WinSequentialFile::~WinSequentialFile() { + assert(hFile_ != INVALID_HANDLE_VALUE); +} + +IOStatus WinSequentialFile::Read(size_t n, const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { + IOStatus s; + size_t r = 0; + + assert(result != nullptr); + if (WinFileData::use_direct_io()) { + return IOStatus::NotSupported("Read() does not support direct_io"); + } + + // Windows ReadFile API accepts a DWORD. + // While it is possible to read in a loop if n is too big + // it is an unlikely case. 
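+  // (A DWORD is 32 bits, so a single ReadFile call is capped at 4 GiB - 1
+  // bytes; larger requests are rejected below rather than split up.)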
+ if (n > std::numeric_limits::max()) { + return IOStatus::InvalidArgument("n is too big for a single ReadFile: " + + filename_); + } + + DWORD bytesToRead = + static_cast(n); // cast is safe due to the check above + DWORD bytesRead = 0; + BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL); + if (ret != FALSE) { + r = bytesRead; + } else { + auto lastError = GetLastError(); + if (lastError != ERROR_HANDLE_EOF) { + s = IOErrorFromWindowsError("ReadFile failed: " + filename_, lastError); + } + } + + *result = Slice(scratch, r); + return s; +} + +IOStatus WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const { + return pread(this, src, numBytes, offset, bytes_read); +} + +IOStatus WinSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { + if (!WinFileData::use_direct_io()) { + return IOStatus::NotSupported("This function is only used for direct_io"); + } + + assert(IsSectorAligned(static_cast(offset))); + assert(IsSectorAligned(static_cast(n))); + + size_t bytes_read = 0; // out param + IOStatus s = PositionedReadInternal(scratch, static_cast(n), offset, + bytes_read); + *result = Slice(scratch, bytes_read); + return s; +} + +IOStatus WinSequentialFile::Skip(uint64_t n) { + // Can't handle more than signed max as SetFilePointerEx accepts a signed + // 64-bit integer. As such it is a highly unlikley case to have n so large. + if (n > static_cast(std::numeric_limits::max())) { + return IOStatus::InvalidArgument( + "n is too large for a single SetFilePointerEx() call" + filename_); + } + + LARGE_INTEGER li; + li.QuadPart = static_cast(n); // cast is safe due to the check + // above + BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT); + if (ret == FALSE) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_, + lastError); + } + return IOStatus::OK(); +} + +IOStatus WinSequentialFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); +} + +////////////////////////////////////////////////////////////////////////////////////////////////// +/// WinRandomAccessBase + +inline IOStatus WinRandomAccessImpl::PositionedReadInternal( + char* src, size_t numBytes, uint64_t offset, size_t& bytes_read) const { + return pread(file_base_, src, numBytes, offset, bytes_read); +} + +inline WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base, + size_t alignment, + const FileOptions& options) + : file_base_(file_base), + alignment_(std::max(alignment, file_base->GetSectorSize())) { + assert(!options.use_mmap_reads); +} + +inline IOStatus WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, + Slice* result, + char* scratch) const { + // Check buffer alignment + if (file_base_->use_direct_io()) { + assert(file_base_->IsSectorAligned(static_cast(offset))); + assert(IsAligned(alignment_, scratch)); + } + + if (n == 0) { + *result = Slice(scratch, 0); + return IOStatus::OK(); + } + + size_t bytes_read = 0; + IOStatus s = PositionedReadInternal(scratch, n, offset, bytes_read); + *result = Slice(scratch, bytes_read); + return s; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/// WinRandomAccessFile + +WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile, + size_t alignment, + const FileOptions& options) + : WinFileData(fname, hFile, options.use_direct_reads), + 
WinRandomAccessImpl(this, alignment, options) {} + +WinRandomAccessFile::~WinRandomAccessFile() {} + +IOStatus WinRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, + IODebugContext* /*dbg*/) const { + return ReadImpl(offset, n, result, scratch); +} + +IOStatus WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); +} + +size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(GetFileHandle(), id, max_size); +} + +size_t WinRandomAccessFile::GetRequiredBufferAlignment() const { + return GetAlignment(); +} + +///////////////////////////////////////////////////////////////////////////// +// WinWritableImpl +// + +inline IOStatus WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), + spaceToReserve); +} + +inline WinWritableImpl::WinWritableImpl(WinFileData* file_data, + size_t alignment) + : file_data_(file_data), + alignment_(std::max(alignment, file_data->GetSectorSize())), + next_write_offset_(0), + reservedsize_(0) { + // Query current position in case ReopenWritableFile is called + // This position is only important for buffered writes + // for unbuffered writes we explicitely specify the position. + LARGE_INTEGER zero_move; + zero_move.QuadPart = 0; // Do not move + LARGE_INTEGER pos; + pos.QuadPart = 0; + BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos, + FILE_CURRENT); + // Querying no supped to fail + if (ret != 0) { + next_write_offset_ = pos.QuadPart; + } else { + assert(false); + } +} + +inline IOStatus WinWritableImpl::AppendImpl(const Slice& data) { + IOStatus s; + + if (data.size() > std::numeric_limits::max()) { + return IOStatus::InvalidArgument("data is too long for a single write" + + file_data_->GetName()); + } + + size_t bytes_written = 0; // out param + + if (file_data_->use_direct_io()) { + // With no offset specified we are appending + // to the end of the file + assert(file_data_->IsSectorAligned(next_write_offset_)); + assert(file_data_->IsSectorAligned(data.size())); + assert(IsAligned(static_cast(GetAlignment()), data.data())); + s = pwrite(file_data_, data, next_write_offset_, bytes_written); + } else { + DWORD bytesWritten = 0; + if (!WriteFile(file_data_->GetFileHandle(), data.data(), + static_cast(data.size()), &bytesWritten, NULL)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to WriteFile: " + file_data_->GetName(), lastError); + } else { + bytes_written = bytesWritten; + } + } + + if (s.ok()) { + if (bytes_written == data.size()) { + // This matters for direct_io cases where + // we rely on the fact that next_write_offset_ + // is sector aligned + next_write_offset_ += bytes_written; + } else { + s = IOStatus::IOError("Failed to write all bytes: " + + file_data_->GetName()); + } + } + + return s; +} + +inline IOStatus WinWritableImpl::PositionedAppendImpl(const Slice& data, + uint64_t offset) { + if (file_data_->use_direct_io()) { + assert(file_data_->IsSectorAligned(static_cast(offset))); + assert(file_data_->IsSectorAligned(data.size())); + assert(IsAligned(static_cast(GetAlignment()), data.data())); + } + + size_t bytes_written = 0; + IOStatus s = pwrite(file_data_, data, offset, bytes_written); + + if (s.ok()) { + if (bytes_written == data.size()) { + // For sequential write this would be simple + // size extension by data.size() + uint64_t write_end = offset + 
bytes_written; + if (write_end >= next_write_offset_) { + next_write_offset_ = write_end; + } + } else { + s = IOStatus::IOError("Failed to write all of the requested data: " + + file_data_->GetName()); + } + } + return s; +} + +inline IOStatus WinWritableImpl::TruncateImpl(uint64_t size) { + // It is tempting to check for the size for sector alignment + // but truncation may come at the end and there is not a requirement + // for this to be sector aligned so long as we do not attempt to write + // after that. The interface docs state that the behavior is undefined + // in that case. + IOStatus s = + ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size); + + if (s.ok()) { + next_write_offset_ = size; + } + return s; +} + +inline IOStatus WinWritableImpl::CloseImpl() { + IOStatus s; + + auto hFile = file_data_->GetFileHandle(); + assert(INVALID_HANDLE_VALUE != hFile); + + if (!::FlushFileBuffers(hFile)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "FlushFileBuffers failed at Close() for: " + file_data_->GetName(), + lastError); + } + + if (!file_data_->CloseFile() && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "CloseHandle failed for: " + file_data_->GetName(), lastError); + } + return s; +} + +inline IOStatus WinWritableImpl::SyncImpl(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; + if (!::FlushFileBuffers(file_data_->GetFileHandle())) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), + lastError); + } + return s; +} + +inline IOStatus WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { + IOStatus status; + TEST_KILL_RANDOM("WinWritableFile::Allocate"); + + // Make sure that we reserve an aligned amount of space + // since the reservation block size is driven outside so we want + // to check if we are ok with reservation here + size_t spaceToReserve = Roundup(static_cast(offset + len), + static_cast(alignment_)); + // Nothing to do + if (spaceToReserve <= reservedsize_) { + return status; + } + + IOSTATS_TIMER_GUARD(allocate_nanos); + status = PreallocateInternal(spaceToReserve); + if (status.ok()) { + reservedsize_ = spaceToReserve; + } + return status; +} + +//////////////////////////////////////////////////////////////////////////////// +/// WinWritableFile + +WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile, + size_t alignment, size_t /* capacity */, + const FileOptions& options) + : WinFileData(fname, hFile, options.use_direct_writes), + WinWritableImpl(this, alignment), + FSWritableFile(options) { + assert(!options.use_mmap_writes); +} + +WinWritableFile::~WinWritableFile() {} + +// Indicates if the class makes use of direct I/O +bool WinWritableFile::use_direct_io() const { + return WinFileData::use_direct_io(); +} + +size_t WinWritableFile::GetRequiredBufferAlignment() const { + return static_cast(GetAlignment()); +} + +IOStatus WinWritableFile::Append(const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return AppendImpl(data); +} + +IOStatus WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return PositionedAppendImpl(data, offset); +} + +// Need to implement this so the file is truncated correctly +// when buffered and unbuffered mode +IOStatus WinWritableFile::Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return 
TruncateImpl(size); +} + +IOStatus WinWritableFile::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return CloseImpl(); +} + +// write out the cached data to the OS cache +// This is now taken care of the WritableFileWriter +IOStatus WinWritableFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +IOStatus WinWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); +} + +IOStatus WinWritableFile::Fsync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); +} + +bool WinWritableFile::IsSyncThreadSafe() const { return true; } + +uint64_t WinWritableFile::GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return GetFileNextWriteOffset(); +} + +IOStatus WinWritableFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return AllocateImpl(offset, len); +} + +size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(GetFileHandle(), id, max_size); +} + +///////////////////////////////////////////////////////////////////////// +/// WinRandomRWFile + +WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile, + size_t alignment, const FileOptions& options) + : WinFileData(fname, hFile, + options.use_direct_reads && options.use_direct_writes), + WinRandomAccessImpl(this, alignment, options), + WinWritableImpl(this, alignment) {} + +bool WinRandomRWFile::use_direct_io() const { + return WinFileData::use_direct_io(); +} + +size_t WinRandomRWFile::GetRequiredBufferAlignment() const { + assert(WinRandomAccessImpl::GetAlignment() == + WinWritableImpl::GetAlignment()); + return static_cast(WinRandomAccessImpl::GetAlignment()); +} + +IOStatus WinRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return PositionedAppendImpl(data, offset); +} + +IOStatus WinRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) const { + return ReadImpl(offset, n, result, scratch); +} + +IOStatus WinRandomRWFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +IOStatus WinRandomRWFile::Sync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); +} + +IOStatus WinRandomRWFile::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return CloseImpl(); +} + +////////////////////////////////////////////////////////////////////////// +/// WinMemoryMappedBufer +WinMemoryMappedBuffer::~WinMemoryMappedBuffer() { + BOOL ret +#if defined(_MSC_VER) + = FALSE; +#else + __attribute__((__unused__)); +#endif + if (base_ != nullptr) { + ret = ::UnmapViewOfFile(base_); + assert(ret); + base_ = nullptr; + } + if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) { + ret = ::CloseHandle(map_handle_); + assert(ret); + map_handle_ = NULL; + } + if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) { + ret = ::CloseHandle(file_handle_); + assert(ret); + file_handle_ = NULL; + } +} + +////////////////////////////////////////////////////////////////////////// +/// WinDirectory + +IOStatus WinDirectory::Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +IOStatus WinDirectory::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s = IOStatus::OK(); + BOOL ret 
__attribute__((__unused__));
+  if (handle_ != INVALID_HANDLE_VALUE) {
+    ret = ::CloseHandle(handle_);
+    if (!ret) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError("Directory close failed for: " + GetName(),
+                                  lastError);
+    }
+    handle_ = NULL;
+  }
+  return s;
+}
+
+size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
+  return GetUniqueIdFromFile(handle_, id, max_size);
+}
+//////////////////////////////////////////////////////////////////////////
+/// WinFileLock
+
+WinFileLock::~WinFileLock() {
+  BOOL ret __attribute__((__unused__));
+  ret = ::CloseHandle(hFile_);
+  assert(ret);
+}
+
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/io_win.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/io_win.h
new file mode 100644
index 0000000000..a4fee8346c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/io_win.h
@@ -0,0 +1,508 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <stdint.h>
+#include <windows.h>
+
+#include <mutex>
+#include <string>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/status.h"
+#include "util/aligned_buffer.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+std::string GetWindowsErrSz(DWORD err);
+
+inline IOStatus IOErrorFromWindowsError(const std::string& context,
+                                        DWORD err) {
+  return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL))
+             ? IOStatus::NoSpace(context, GetWindowsErrSz(err))
+         : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND))
+             ? IOStatus::PathNotFound(context, GetWindowsErrSz(err))
+             : IOStatus::IOError(context, GetWindowsErrSz(err));
+}
+
+inline IOStatus IOErrorFromLastWindowsError(const std::string& context) {
+  return IOErrorFromWindowsError(context, GetLastError());
+}
+
+inline IOStatus IOError(const std::string& context, int err_number) {
+  return (err_number == ENOSPC)
+             ? IOStatus::NoSpace(context, errnoStr(err_number).c_str())
+         : (err_number == ENOENT)
+             ? IOStatus::PathNotFound(context, errnoStr(err_number).c_str())
+             : IOStatus::IOError(context, errnoStr(err_number).c_str());
+}
+
+class WinFileData;
+
+IOStatus pwrite(const WinFileData* file_data, const Slice& data,
+                uint64_t offset, size_t& bytes_written);
+
+IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes,
+               uint64_t offset, size_t& bytes_read);
+
+IOStatus fallocate(const std::string& filename, HANDLE hFile,
+                   uint64_t to_size);
+
+IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize);
+
+size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size);
+
+class WinFileData {
+ protected:
+  const std::string filename_;
+  HANDLE hFile_;
+  // If true, the I/O issued is direct I/O, in which case the buffer
+  // needs to be aligned (there is no guarantee that the buffer
+  // passed in is aligned).
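+  // (On Windows, direct I/O corresponds to FILE_FLAG_NO_BUFFERING, which
+  // requires offsets, transfer sizes, and buffer addresses to be multiples
+  // of the volume sector size; see IsSectorAligned() below.)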
+ const bool use_direct_io_; + const size_t sector_size_; + + public: + // We want this class be usable both for inheritance (prive + // or protected) and for containment so __ctor and __dtor public + WinFileData(const std::string& filename, HANDLE hFile, bool direct_io); + + virtual ~WinFileData() { this->CloseFile(); } + + bool CloseFile() { + bool result = true; + + if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) { + result = ::CloseHandle(hFile_); + assert(result); + hFile_ = NULL; + } + return result; + } + + const std::string& GetName() const { return filename_; } + + HANDLE GetFileHandle() const { return hFile_; } + + bool use_direct_io() const { return use_direct_io_; } + + size_t GetSectorSize() const { return sector_size_; } + + bool IsSectorAligned(const size_t off) const; + + WinFileData(const WinFileData&) = delete; + WinFileData& operator=(const WinFileData&) = delete; +}; + +class WinSequentialFile : protected WinFileData, public FSSequentialFile { + // Override for behavior change when creating a custom env + virtual IOStatus PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const; + + public: + WinSequentialFile(const std::string& fname, HANDLE f, + const FileOptions& options); + + ~WinSequentialFile(); + + WinSequentialFile(const WinSequentialFile&) = delete; + WinSequentialFile& operator=(const WinSequentialFile&) = delete; + + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; + + IOStatus Skip(uint64_t n) override; + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + virtual bool use_direct_io() const override { + return WinFileData::use_direct_io(); + } +}; + +// mmap() based random-access +class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile { + HANDLE hMap_; + + const void* mapped_region_; + const size_t length_; + + public: + // mapped_region_[0,length-1] contains the mmapped contents of the file. + WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap, + const void* mapped_region, size_t length); + + ~WinMmapReadableFile(); + + WinMmapReadableFile(const WinMmapReadableFile&) = delete; + WinMmapReadableFile& operator=(const WinMmapReadableFile&) = delete; + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + virtual IOStatus InvalidateCache(size_t offset, size_t length) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +// We preallocate and use memcpy to append new +// data to the file. This is safe since we either properly close the +// file before reading from it, or for log files, the reading code +// knows enough to skip zero suffixes. +class WinMmapFile : private WinFileData, public FSWritableFile { + private: + HANDLE hMap_; + + const size_t page_size_; // We flush the mapping view in page_size + // increments. 
We may decide if this is a memory + // page size or SSD page size + const size_t + allocation_granularity_; // View must start at such a granularity + + size_t reserved_size_; // Preallocated size + + size_t mapping_size_; // The max size of the mapping object + // we want to guess the final file size to minimize the remapping + size_t view_size_; // How much memory to map into a view at a time + + char* mapped_begin_; // Must begin at the file offset that is aligned with + // allocation_granularity_ + char* mapped_end_; + char* dst_; // Where to write next (in range [mapped_begin_,mapped_end_]) + char* last_sync_; // Where have we synced up to + + uint64_t file_offset_; // Offset of mapped_begin_ in file + + // Do we have unsynced writes? + bool pending_sync_; + + // Can only truncate or reserve to a sector size aligned if + // used on files that are opened with Unbuffered I/O + IOStatus TruncateFile(uint64_t toSize); + + IOStatus UnmapCurrentRegion(); + + IOStatus MapNewRegion(const IOOptions& options, IODebugContext* dbg); + + virtual IOStatus PreallocateInternal(uint64_t spaceToReserve); + + public: + WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, + size_t allocation_granularity, const FileOptions& options); + + ~WinMmapFile(); + + WinMmapFile(const WinMmapFile&) = delete; + WinMmapFile& operator=(const WinMmapFile&) = delete; + + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } + + // Means Close() will properly take care of truncate + // and it does not need any additional information + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + // Flush only data + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + /** + * Flush data as well as metadata to stable storage. + */ + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + + /** + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. 
+ */ + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinRandomAccessImpl { + protected: + WinFileData* file_base_; + size_t alignment_; + + // Override for behavior change when creating a custom env + virtual IOStatus PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const; + + WinRandomAccessImpl(WinFileData* file_base, size_t alignment, + const FileOptions& options); + + virtual ~WinRandomAccessImpl() {} + + IOStatus ReadImpl(uint64_t offset, size_t n, Slice* result, + char* scratch) const; + + size_t GetAlignment() const { return alignment_; } + + public: + WinRandomAccessImpl(const WinRandomAccessImpl&) = delete; + WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete; +}; + +// pread() based random-access +class WinRandomAccessFile + : private WinFileData, + protected WinRandomAccessImpl, // Want to be able to override + // PositionedReadInternal + public FSRandomAccessFile { + public: + WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, + const FileOptions& options); + + ~WinRandomAccessFile(); + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; + + virtual bool use_direct_io() const override { + return WinFileData::use_direct_io(); + } + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + virtual size_t GetRequiredBufferAlignment() const override; +}; + +// This is a sequential write class. It has been mimicked (as others) after +// the original Posix class. We add support for unbuffered I/O on windows as +// well +// we utilize the original buffer as an alignment buffer to write directly to +// file with no buffering. +// No buffering requires that the provided buffer is aligned to the physical +// sector size (SSD page size) and +// that all SetFilePointer() operations to occur with such an alignment. +// We thus always write in sector/page size increments to the drive and leave +// the tail for the next write OR for Close() at which point we pad with zeros. +// No padding is required for +// buffered access. 
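+//
+// Illustrative example (numbers assumed, not taken from this changeset): on a
+// volume with 512-byte sectors, an unbuffered writer holding 700 bytes of
+// payload writes one full 512-byte sector now and carries the remaining 188
+// bytes forward; at Close() the tail is padded with zeros to a full sector
+// before the final write.
+//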
+class WinWritableImpl { + protected: + WinFileData* file_data_; + const uint64_t alignment_; + uint64_t + next_write_offset_; // Needed because Windows does not support O_APPEND + uint64_t reservedsize_; // how far we have reserved space + + virtual IOStatus PreallocateInternal(uint64_t spaceToReserve); + + WinWritableImpl(WinFileData* file_data, size_t alignment); + + ~WinWritableImpl() {} + + uint64_t GetAlignment() const { return alignment_; } + + IOStatus AppendImpl(const Slice& data); + + // Requires that the data is aligned as specified by + // GetRequiredBufferAlignment() + IOStatus PositionedAppendImpl(const Slice& data, uint64_t offset); + + IOStatus TruncateImpl(uint64_t size); + + IOStatus CloseImpl(); + + IOStatus SyncImpl(const IOOptions& options, IODebugContext* dbg); + + uint64_t GetFileNextWriteOffset() { + // Double accounting now here with WritableFileWriter + // and this size will be wrong when unbuffered access is used + // but tests implement their own writable files and do not use + // WritableFileWrapper + // so we need to squeeze a square peg through + // a round hole here. + return next_write_offset_; + } + + IOStatus AllocateImpl(uint64_t offset, uint64_t len); + + public: + WinWritableImpl(const WinWritableImpl&) = delete; + WinWritableImpl& operator=(const WinWritableImpl&) = delete; +}; + +class WinWritableFile : private WinFileData, + protected WinWritableImpl, + public FSWritableFile { + public: + WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, + size_t capacity, const FileOptions& options); + + ~WinWritableFile(); + + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } + + // Requires that the data is aligned as specified by + // GetRequiredBufferAlignment() + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return PositionedAppend(data, offset, opts, dbg); + } + + // Need to implement this so the file is truncated correctly + // when buffered and unbuffered mode + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + // write out the cached data to the OS cache + // This is now taken care of the WritableFileWriter + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + + virtual bool IsSyncThreadSafe() const override; + + // Indicates if the class makes use of direct I/O + // Use PositionedAppend + virtual bool use_direct_io() const override; + + virtual size_t GetRequiredBufferAlignment() const override; + + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinRandomRWFile : private WinFileData, + protected WinRandomAccessImpl, + protected 
WinWritableImpl, + public FSRandomRWFile { + public: + WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment, + const FileOptions& options); + + ~WinRandomRWFile() {} + + // Indicates if the class makes use of direct I/O + // If false you must pass aligned buffer to Write() + virtual bool use_direct_io() const override; + + // Use the returned alignment value to allocate aligned + // buffer for Write() when use_direct_io() returns true + virtual size_t GetRequiredBufferAlignment() const override; + + // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + + // Read up to `n` bytes starting from offset `offset` and store them in + // result, provided `scratch` size should be at least `n`. + // Returns Status::OK() on success. + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return Sync(options, dbg); + } + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; +}; + +class WinMemoryMappedBuffer : public MemoryMappedFileBuffer { + private: + HANDLE file_handle_; + HANDLE map_handle_; + + public: + WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base, + size_t size) + : MemoryMappedFileBuffer(base, size), + file_handle_(file_handle), + map_handle_(map_handle) {} + ~WinMemoryMappedBuffer() override; +}; + +class WinDirectory : public FSDirectory { + const std::string filename_; + HANDLE handle_; + + public: + explicit WinDirectory(const std::string& filename, HANDLE h) noexcept + : filename_(filename), handle_(h) { + assert(handle_ != INVALID_HANDLE_VALUE); + } + ~WinDirectory() { + if (handle_ != NULL) { + IOStatus s = WinDirectory::Close(IOOptions(), nullptr); + s.PermitUncheckedError(); + } + } + const std::string& GetName() const { return filename_; } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinFileLock : public FileLock { + public: + explicit WinFileLock(HANDLE hFile) : hFile_(hFile) { + assert(hFile != NULL); + assert(hFile != INVALID_HANDLE_VALUE); + } + + ~WinFileLock(); + + private: + HANDLE hFile_; +}; +} // namespace port +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/port_win.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/port_win.cc new file mode 100644 index 0000000000..37e8f655ce --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/port_win.cc @@ -0,0 +1,303 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#if defined(OS_WIN)
+
+#include "port/win/port_win.h"
+
+#include <assert.h>
+#include <io.h>
+#include <rpc.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <chrono>
+#include <cstdlib>
+#include <exception>
+#include <memory>
+
+#include "port/port_dirent.h"
+#include "port/sys_time.h"
+
+#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES
+// utf8 <-> utf16
+#include <codecvt>
+#include <locale>
+#include <string>
+#endif
+
+#include "logging/logging.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const bool kDefaultToAdaptiveMutex = false;
+
+namespace port {
+
+#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES
+std::string utf16_to_utf8(const std::wstring& utf16) {
+  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> convert;
+  return convert.to_bytes(utf16);
+}
+
+std::wstring utf8_to_utf16(const std::string& utf8) {
+  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+  return converter.from_bytes(utf8);
+}
+#endif
+
+void GetTimeOfDay(TimeVal* tv, struct timezone* /* tz */) {
+  std::chrono::microseconds usNow(
+      std::chrono::duration_cast<std::chrono::microseconds>(
+          std::chrono::system_clock::now().time_since_epoch()));
+
+  std::chrono::seconds secNow(
+      std::chrono::duration_cast<std::chrono::seconds>(usNow));
+
+  tv->tv_sec = static_cast<long>(secNow.count());
+  tv->tv_usec = static_cast<long>(
+      usNow.count() -
+      std::chrono::duration_cast<std::chrono::microseconds>(secNow).count());
+}
+
+Mutex::~Mutex() {}
+
+CondVar::~CondVar() {}
+
+void CondVar::Wait() {
+  // Caller must ensure that mutex is held prior to calling this method
+  std::unique_lock<std::mutex> lk(mu_->getLock(), std::adopt_lock);
+#ifndef NDEBUG
+  mu_->locked_ = false;
+#endif
+  cv_.wait(lk);
+#ifndef NDEBUG
+  mu_->locked_ = true;
+#endif
+  // Release ownership of the lock as we don't want it to be unlocked when
+  // it goes out of scope (as we adopted the lock and didn't lock it ourselves)
+  lk.release();
+}
+
+bool CondVar::TimedWait(uint64_t abs_time_us) {
+  // MSVC++ library implements wait_until in terms of wait_for so
+  // we need to convert absolute wait into relative wait.
+  std::chrono::microseconds usAbsTime(abs_time_us);
+
+  std::chrono::microseconds usNow(
+      std::chrono::duration_cast<std::chrono::microseconds>(
+          std::chrono::system_clock::now().time_since_epoch()));
+  std::chrono::microseconds relTimeUs = (usAbsTime > usNow)
+                                            ?
(usAbsTime - usNow) + : std::chrono::microseconds::zero(); + + // Caller must ensure that mutex is held prior to calling this method + std::unique_lock lk(mu_->getLock(), std::adopt_lock); + + // Work around https://github.com/microsoft/STL/issues/369 +#if defined(_MSC_VER) && \ + (!defined(_MSVC_STL_UPDATE) || _MSVC_STL_UPDATE < 202008L) + if (relTimeUs == std::chrono::microseconds::zero()) { + lk.unlock(); + lk.lock(); + } +#endif +#ifndef NDEBUG + mu_->locked_ = false; +#endif + std::cv_status cvStatus = cv_.wait_for(lk, relTimeUs); +#ifndef NDEBUG + mu_->locked_ = true; +#endif + // Release ownership of the lock as we don't want it to be unlocked when + // it goes out of scope (as we adopted the lock and didn't lock it ourselves) + lk.release(); + + if (cvStatus == std::cv_status::timeout) { + return true; + } + + return false; +} + +void CondVar::Signal() { cv_.notify_one(); } + +void CondVar::SignalAll() { cv_.notify_all(); } + +int PhysicalCoreID() { return GetCurrentProcessorNumber(); } + +void InitOnce(OnceType* once, void (*initializer)()) { + std::call_once(once->flag_, initializer); +} + +// Private structure, exposed only by pointer +struct DIR { + HANDLE handle_; + bool firstread_; + RX_WIN32_FIND_DATA data_; + dirent entry_; + + DIR() : handle_(INVALID_HANDLE_VALUE), firstread_(true) {} + + DIR(const DIR&) = delete; + DIR& operator=(const DIR&) = delete; + + ~DIR() { + if (INVALID_HANDLE_VALUE != handle_) { + ::FindClose(handle_); + } + } +}; + +DIR* opendir(const char* name) { + if (!name || *name == 0) { + errno = ENOENT; + return nullptr; + } + + std::string pattern(name); + pattern.append("\\").append("*"); + + std::unique_ptr dir(new DIR); + + dir->handle_ = + RX_FindFirstFileEx(RX_FN(pattern).c_str(), + FindExInfoBasic, // Do not want alternative name + &dir->data_, FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); + + if (dir->handle_ == INVALID_HANDLE_VALUE) { + return nullptr; + } + + RX_FILESTRING x(dir->data_.cFileName, RX_FNLEN(dir->data_.cFileName)); + strcpy_s(dir->entry_.d_name, sizeof(dir->entry_.d_name), FN_TO_RX(x).c_str()); + + return dir.release(); +} + +struct dirent* readdir(DIR* dirp) { + if (!dirp || dirp->handle_ == INVALID_HANDLE_VALUE) { + errno = EBADF; + return nullptr; + } + + if (dirp->firstread_) { + dirp->firstread_ = false; + return &dirp->entry_; + } + + auto ret = RX_FindNextFile(dirp->handle_, &dirp->data_); + + if (ret == 0) { + return nullptr; + } + + RX_FILESTRING x(dirp->data_.cFileName, RX_FNLEN(dirp->data_.cFileName)); + strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name), + FN_TO_RX(x).c_str()); + + return &dirp->entry_; +} + +int closedir(DIR* dirp) { + delete dirp; + return 0; +} + +int truncate(const char* path, int64_t length) { + if (path == nullptr) { + errno = EFAULT; + return -1; + } + return ROCKSDB_NAMESPACE::port::Truncate(path, length); +} + +int Truncate(std::string path, int64_t len) { + if (len < 0) { + errno = EINVAL; + return -1; + } + + HANDLE hFile = + RX_CreateFile(RX_FN(path).c_str(), GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, // Security attrs + OPEN_EXISTING, // Truncate existing file only + FILE_ATTRIBUTE_NORMAL, NULL); + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + if (lastError == ERROR_FILE_NOT_FOUND) { + errno = ENOENT; + } else if (lastError == ERROR_ACCESS_DENIED) { + errno = EACCES; + } else { + errno = EIO; + } + return -1; + } + + int result = 0; + FILE_END_OF_FILE_INFO end_of_file; + 
end_of_file.EndOfFile.QuadPart = len; + + if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file, + sizeof(FILE_END_OF_FILE_INFO))) { + errno = EIO; + result = -1; + } + + CloseHandle(hFile); + return result; +} + +void Crash(const std::string& srcfile, int srcline) { + fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline); + fflush(stdout); + abort(); +} + +int GetMaxOpenFiles() { return -1; } + +// Assume 4KB page size +const size_t kPageSize = 4U * 1024U; + +void SetCpuPriority(ThreadId id, CpuPriority priority) { + // Not supported + (void)id; + (void)priority; +} + +int64_t GetProcessID() { return GetCurrentProcessId(); } + +bool GenerateRfcUuid(std::string* output) { + UUID uuid; + UuidCreateSequential(&uuid); + + RPC_CSTR rpc_str; + auto status = UuidToStringA(&uuid, &rpc_str); + if (status != RPC_S_OK) { + return false; + } + + // rpc_str is nul-terminated + *output = reinterpret_cast(rpc_str); + + status = RpcStringFreeA(&rpc_str); + assert(status == RPC_S_OK); + + return true; +} + +} // namespace port +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/port_win.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/port_win.h new file mode 100644 index 0000000000..4d9883b63a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/port_win.h @@ -0,0 +1,379 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#pragma once + +// Always want minimum headers +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif + +#include +//^^ should be included first before other system lib +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "port/win/win_thread.h" +#include "rocksdb/port_defs.h" + +#undef min +#undef max +#undef DeleteFile +#undef GetCurrentTime + +#ifndef strcasecmp +#define strcasecmp _stricmp +#endif + +#undef GetCurrentTime +#undef DeleteFile + +#ifndef _SSIZE_T_DEFINED +using ssize_t = SSIZE_T; +#endif + +// size_t printf formatting named in the manner of C99 standard formatting +// strings such as PRIu64 +// in fact, we could use that one +#ifndef ROCKSDB_PRIszt +#define ROCKSDB_PRIszt "Iu" +#endif + +#ifdef _MSC_VER +#define __attribute__(A) + +#endif + +namespace ROCKSDB_NAMESPACE { + +#define PREFETCH(addr, rw, locality) + +extern const bool kDefaultToAdaptiveMutex; + +namespace port { + +// "Windows is designed to run on little-endian computer architectures." 
+// https://docs.microsoft.com/en-us/windows/win32/sysinfo/registry-value-types +constexpr bool kLittleEndian = true; +#undef PLATFORM_IS_LITTLE_ENDIAN + +class CondVar; + +class Mutex { + public: + static const char* kName() { return "std::mutex"; } + + explicit Mutex(bool IGNORED_adaptive = kDefaultToAdaptiveMutex) +#ifndef NDEBUG + : locked_(false) +#endif + { + (void)IGNORED_adaptive; + } + + ~Mutex(); + + void Lock() { + mutex_.lock(); +#ifndef NDEBUG + locked_ = true; +#endif + } + + void Unlock() { +#ifndef NDEBUG + locked_ = false; +#endif + mutex_.unlock(); + } + + bool TryLock() { + bool ret = mutex_.try_lock(); +#ifndef NDEBUG + if (ret) { + locked_ = true; + } +#endif + return ret; + } + + // this will assert if the mutex is not locked + // it does NOT verify that mutex is held by a calling thread + void AssertHeld() { +#ifndef NDEBUG + assert(locked_); +#endif + } + + // Also implement std Lockable + inline void lock() { Lock(); } + inline void unlock() { Unlock(); } + inline bool try_lock() { return TryLock(); } + + // Mutex is move only with lock ownership transfer + Mutex(const Mutex&) = delete; + void operator=(const Mutex&) = delete; + + private: + friend class CondVar; + + std::mutex& getLock() { return mutex_; } + + std::mutex mutex_; +#ifndef NDEBUG + bool locked_; +#endif +}; + +class RWMutex { + public: + RWMutex() { InitializeSRWLock(&srwLock_); } + // No copying allowed + RWMutex(const RWMutex&) = delete; + void operator=(const RWMutex&) = delete; + + void ReadLock() { AcquireSRWLockShared(&srwLock_); } + + void WriteLock() { AcquireSRWLockExclusive(&srwLock_); } + + void ReadUnlock() { ReleaseSRWLockShared(&srwLock_); } + + void WriteUnlock() { ReleaseSRWLockExclusive(&srwLock_); } + + // Empty as in POSIX + void AssertHeld() {} + + private: + SRWLOCK srwLock_; +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu) : mu_(mu) {} + + ~CondVar(); + void Wait(); + bool TimedWait(uint64_t expiration_time); + void Signal(); + void SignalAll(); + + // Condition var is not copy/move constructible + CondVar(const CondVar&) = delete; + CondVar& operator=(const CondVar&) = delete; + + CondVar(CondVar&&) = delete; + CondVar& operator=(CondVar&&) = delete; + + private: + std::condition_variable cv_; + Mutex* mu_; +}; + +#ifdef _POSIX_THREADS +using Thread = std::thread; +#else +// Wrapper around the platform efficient +// or otherwise preferrable implementation +using Thread = WindowsThread; +#endif + +// OnceInit type helps emulate +// Posix semantics with initialization +// adopted in the project +struct OnceType { + struct Init {}; + + OnceType() {} + OnceType(const Init&) {} + OnceType(const OnceType&) = delete; + OnceType& operator=(const OnceType&) = delete; + + std::once_flag flag_; +}; + +#define LEVELDB_ONCE_INIT port::OnceType::Init() +extern void InitOnce(OnceType* once, void (*initializer)()); + +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 64U +#endif + +#ifdef ROCKSDB_JEMALLOC +// Separate inlines so they can be replaced if needed +void* jemalloc_aligned_alloc(size_t size, size_t alignment) noexcept; +void jemalloc_aligned_free(void* p) noexcept; +#endif + +inline void* cacheline_aligned_alloc(size_t size) { +#ifdef ROCKSDB_JEMALLOC + return jemalloc_aligned_alloc(size, CACHE_LINE_SIZE); +#else + return _aligned_malloc(size, CACHE_LINE_SIZE); +#endif +} + +inline void cacheline_aligned_free(void* memblock) { +#ifdef ROCKSDB_JEMALLOC + jemalloc_aligned_free(memblock); +#else + _aligned_free(memblock); +#endif +} + +extern const size_t kPageSize; + +// 
Part of C++11
+#define ALIGN_AS(n) alignas(n)
+
+static inline void AsmVolatilePause() {
+#if defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM)
+  YieldProcessor();
+#endif
+  // it would be nice to get "wfe" on ARM here
+}
+
+extern int PhysicalCoreID();
+
+// For Thread Local Storage abstraction
+using pthread_key_t = DWORD;
+
+inline int pthread_key_create(pthread_key_t* key, void (*destructor)(void*)) {
+  // Not used
+  (void)destructor;
+
+  pthread_key_t k = TlsAlloc();
+  if (TLS_OUT_OF_INDEXES == k) {
+    return ENOMEM;
+  }
+
+  *key = k;
+  return 0;
+}
+
+inline int pthread_key_delete(pthread_key_t key) {
+  if (!TlsFree(key)) {
+    return EINVAL;
+  }
+  return 0;
+}
+
+inline int pthread_setspecific(pthread_key_t key, const void* value) {
+  if (!TlsSetValue(key, const_cast<void*>(value))) {
+    return ENOMEM;
+  }
+  return 0;
+}
+
+inline void* pthread_getspecific(pthread_key_t key) {
+  void* result = TlsGetValue(key);
+  if (!result) {
+    if (GetLastError() != ERROR_SUCCESS) {
+      errno = EINVAL;
+    } else {
+      errno = NOERROR;
+    }
+  }
+  return result;
+}
+
+// UNIX equivalents, although the errno numbers will be off;
+// implemented using the C runtime. Note: this does not
+// fill the space with zeros in case the file is extended.
+int truncate(const char* path, int64_t length);
+int Truncate(std::string path, int64_t length);
+void Crash(const std::string& srcfile, int srcline);
+extern int GetMaxOpenFiles();
+std::string utf16_to_utf8(const std::wstring& utf16);
+std::wstring utf8_to_utf16(const std::string& utf8);
+
+using ThreadId = int;
+
+extern void SetCpuPriority(ThreadId id, CpuPriority priority);
+
+int64_t GetProcessID();
+
+// Uses platform APIs to generate a 36-character RFC-4122 UUID. Returns
+// true on success or false on failure.
+bool GenerateRfcUuid(std::string* output);
+
+}  // namespace port
+
+#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES
+
+#define RX_FILESTRING std::wstring
+#define RX_FN(a) ROCKSDB_NAMESPACE::port::utf8_to_utf16(a)
+#define FN_TO_RX(a) ROCKSDB_NAMESPACE::port::utf16_to_utf8(a)
+#define RX_FNCMP(a, b) ::wcscmp(a, RX_FN(b).c_str())
+#define RX_FNLEN(a) ::wcslen(a)
+
+#define RX_DeleteFile DeleteFileW
+#define RX_CreateFile CreateFileW
+#define RX_CreateFileMapping CreateFileMappingW
+#define RX_GetFileAttributesEx GetFileAttributesExW
+#define RX_FindFirstFileEx FindFirstFileExW
+#define RX_FindNextFile FindNextFileW
+#define RX_WIN32_FIND_DATA WIN32_FIND_DATAW
+#define RX_CreateDirectory CreateDirectoryW
+#define RX_RemoveDirectory RemoveDirectoryW
+#define RX_GetFileAttributesEx GetFileAttributesExW
+#define RX_MoveFileEx MoveFileExW
+#define RX_CreateHardLink CreateHardLinkW
+#define RX_PathIsRelative PathIsRelativeW
+#define RX_GetCurrentDirectory GetCurrentDirectoryW
+#define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExW
+#define RX_PathIsDirectory PathIsDirectoryW
+
+#else
+
+#define RX_FILESTRING std::string
+#define RX_FN(a) a
+#define FN_TO_RX(a) a
+#define RX_FNCMP(a, b) strcmp(a, b)
+#define RX_FNLEN(a) strlen(a)
+
+#define RX_DeleteFile DeleteFileA
+#define RX_CreateFile CreateFileA
+#define RX_CreateFileMapping CreateFileMappingA
+#define RX_GetFileAttributesEx GetFileAttributesExA
+#define RX_FindFirstFileEx FindFirstFileExA
+#define RX_CreateDirectory CreateDirectoryA
+#define RX_FindNextFile FindNextFileA
+#define RX_WIN32_FIND_DATA WIN32_FIND_DATAA
+#define RX_CreateDirectory CreateDirectoryA
+#define RX_RemoveDirectory RemoveDirectoryA
+#define RX_GetFileAttributesEx GetFileAttributesExA
+#define RX_MoveFileEx MoveFileExA
+#define RX_CreateHardLink 
CreateHardLinkA +#define RX_PathIsRelative PathIsRelativeA +#define RX_GetCurrentDirectory GetCurrentDirectoryA +#define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExA +#define RX_PathIsDirectory PathIsDirectoryA + +#endif + +using port::pthread_getspecific; +using port::pthread_key_create; +using port::pthread_key_delete; +using port::pthread_key_t; +using port::pthread_setspecific; +using port::truncate; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_logger.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_logger.cc new file mode 100644 index 0000000000..072ea419a1 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_logger.cc @@ -0,0 +1,192 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. + +#if defined(OS_WIN) + +#include "port/win/win_logger.h" + +#include +#include +#include + +#include +#include + +#include "monitoring/iostats_context_imp.h" +#include "port/sys_time.h" +#include "port/win/env_win.h" +#include "port/win/io_win.h" +#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" + +namespace ROCKSDB_NAMESPACE { + +namespace port { + +WinLogger::WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file, + const InfoLogLevel log_level) + : Logger(log_level), + file_(file), + gettid_(gettid), + log_size_(0), + last_flush_micros_(0), + clock_(clock), + flush_pending_(false) { + assert(file_ != NULL); + assert(file_ != INVALID_HANDLE_VALUE); +} + +void WinLogger::DebugWriter(const char* str, int len) { + assert(file_ != INVALID_HANDLE_VALUE); + DWORD bytesWritten = 0; + BOOL ret = WriteFile(file_, str, len, &bytesWritten, NULL); + if (ret == FALSE) { + std::string errSz = GetWindowsErrSz(GetLastError()); + fprintf(stderr, "%s", errSz.c_str()); + } +} + +WinLogger::~WinLogger() { CloseInternal().PermitUncheckedError(); } + +Status WinLogger::CloseImpl() { return CloseInternal(); } + +Status WinLogger::CloseInternal() { + Status s; + if (INVALID_HANDLE_VALUE != file_) { + BOOL ret = FlushFileBuffers(file_); + if (ret == 0) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", lastError); + } + ret = CloseHandle(file_); + // On error the return value is zero + if (ret == 0 && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", lastError); + } + file_ = INVALID_HANDLE_VALUE; + closed_ = true; + } + return s; +} + +void WinLogger::Flush() { + assert(file_ != INVALID_HANDLE_VALUE); + if (flush_pending_) { + flush_pending_ = false; + // With Windows API writes go to OS buffers directly so no fflush needed + // unlike with C runtime API. We don't flush all the way to disk + // for perf reasons. 
+ } + + last_flush_micros_ = clock_->NowMicros(); +} + +void WinLogger::Logv(const char* format, va_list ap) { + IOSTATS_TIMER_GUARD(logger_nanos); + assert(file_ != INVALID_HANDLE_VALUE); + + const uint64_t thread_id = (*gettid_)(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + std::unique_ptr largeBuffer; + for (int iter = 0; iter < 2; ++iter) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + largeBuffer.reset(new char[bufsize]); + base = largeBuffer.get(); + } + + char* p = base; + char* limit = base + bufsize; + + port::TimeVal now_tv; + port::GetTimeOfDay(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_s(&t, &seconds); + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + int done = vsnprintf(p, limit - p, format, backup_ap); + if (done > 0) { + p += done; + } else { + continue; + } + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + const size_t write_size = p - base; + + DWORD bytesWritten = 0; + BOOL ret = WriteFile(file_, base, static_cast(write_size), + &bytesWritten, NULL); + if (ret == FALSE) { + std::string errSz = GetWindowsErrSz(GetLastError()); + fprintf(stderr, "%s", errSz.c_str()); + } + + flush_pending_ = true; + assert((bytesWritten == write_size) || (ret == FALSE)); + if (bytesWritten > 0) { + log_size_ += write_size; + } + + uint64_t now_micros = + static_cast(now_tv.tv_sec) * 1000000 + now_tv.tv_usec; + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + flush_pending_ = false; + // With Windows API writes go to OS buffers directly so no fflush needed + // unlike with C runtime API. We don't flush all the way to disk + // for perf reasons. + last_flush_micros_ = now_micros; + } + break; + } +} + +size_t WinLogger::GetLogFileSize() const { return log_size_; } + +} // namespace port + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_logger.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_logger.h new file mode 100644 index 0000000000..1ca4610e9e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_logger.h @@ -0,0 +1,64 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. 
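Editor's note: WinLogger::Logv above uses a two-pass formatting strategy: it tries a small stack buffer first and only falls back to one large heap allocation (a fixed 30000 bytes) when the message does not fit. The self-contained sketch below illustrates the same pattern outside RocksDB; the names are illustrative only, and unlike the vendored code it sizes the retry buffer exactly from vsnprintf's return value.

#include <cstdarg>
#include <cstdio>
#include <memory>
#include <string>

// Format printf-style arguments, preferring a small stack buffer and
// falling back to a single heap allocation only when necessary.
static std::string FormatTwoPass(const char* format, ...) {
  char stack_buf[500];
  va_list ap;
  va_start(ap, format);
  va_list backup_ap;
  va_copy(backup_ap, ap);  // vsnprintf consumes ap; keep a copy for pass two
  int needed = vsnprintf(stack_buf, sizeof(stack_buf), format, ap);
  std::string result;
  if (needed >= 0 && static_cast<size_t>(needed) < sizeof(stack_buf)) {
    result.assign(stack_buf, static_cast<size_t>(needed));
  } else if (needed >= 0) {
    // Pass two: vsnprintf reported the exact length required.
    std::unique_ptr<char[]> heap_buf(new char[needed + 1]);
    vsnprintf(heap_buf.get(), static_cast<size_t>(needed) + 1, format,
              backup_ap);
    result.assign(heap_buf.get(), static_cast<size_t>(needed));
  }
  va_end(backup_ap);
  va_end(ap);
  return result;
}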
+ +#pragma once + +#include +#include + +#include +#include + +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { +class SystemClock; + +namespace port { +class WinLogger : public ROCKSDB_NAMESPACE::Logger { + public: + WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file, + const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL); + + virtual ~WinLogger(); + + WinLogger(const WinLogger&) = delete; + + WinLogger& operator=(const WinLogger&) = delete; + + void Flush() override; + + using ROCKSDB_NAMESPACE::Logger::Logv; + void Logv(const char* format, va_list ap) override; + + size_t GetLogFileSize() const override; + + void DebugWriter(const char* str, int len); + + protected: + Status CloseImpl() override; + + private: + HANDLE file_; + uint64_t (*gettid_)(); // Return the thread id for the current thread + std::atomic_size_t log_size_; + std::atomic_uint_fast64_t last_flush_micros_; + SystemClock* clock_; + bool flush_pending_; + + Status CloseInternal(); + + const static uint64_t flush_every_seconds_ = 5; +}; +} // namespace port + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_thread.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_thread.cc new file mode 100644 index 0000000000..3c82e736ec --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_thread.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#if defined(OS_WIN) +// Most Mingw builds support std::thread only when using posix threads. +// In that case, some of these functions will be unavailable. +// Note that we're using either WindowsThread or std::thread, depending on +// which one is available. +#ifndef _POSIX_THREADS + +#include "port/win/win_thread.h" + +#include +#include // __beginthreadex +#include + +#include +#include +#include + +namespace ROCKSDB_NAMESPACE { +namespace port { + +struct WindowsThread::Data { + std::function func_; + uintptr_t handle_; + + Data(std::function&& func) : func_(std::move(func)), handle_(0) {} + + Data(const Data&) = delete; + Data& operator=(const Data&) = delete; + + static unsigned int __stdcall ThreadProc(void* arg); +}; + +void WindowsThread::Init(std::function&& func) { + data_ = std::make_shared(std::move(func)); + // We create another instance of std::shared_ptr to get an additional ref + // since we may detach and destroy this instance before the threadproc + // may start to run. 
We choose to allocate this additional ref on the heap + // so we do not need to synchronize and allow this thread to proceed + std::unique_ptr> th_data( + new std::shared_ptr(data_)); + + data_->handle_ = _beginthreadex(NULL, + 0, // stack size + &Data::ThreadProc, th_data.get(), + 0, // init flag + &th_id_); + + if (data_->handle_ == 0) { + throw std::system_error( + std::make_error_code(std::errc::resource_unavailable_try_again), + "Unable to create a thread"); + } + th_data.release(); +} + +WindowsThread::WindowsThread() : data_(nullptr), th_id_(0) {} + +WindowsThread::~WindowsThread() { + // Must be joined or detached + // before destruction. + // This is the same as std::thread + if (data_) { + if (joinable()) { + assert(false); + std::terminate(); + } + data_.reset(); + } +} + +WindowsThread::WindowsThread(WindowsThread&& o) noexcept : WindowsThread() { + *this = std::move(o); +} + +WindowsThread& WindowsThread::operator=(WindowsThread&& o) noexcept { + if (joinable()) { + assert(false); + std::terminate(); + } + + data_ = std::move(o.data_); + + // Per spec both instances will have the same id + th_id_ = o.th_id_; + + return *this; +} + +bool WindowsThread::joinable() const { return (data_ && data_->handle_ != 0); } + +WindowsThread::native_handle_type WindowsThread::native_handle() const { + return reinterpret_cast(data_->handle_); +} + +unsigned WindowsThread::hardware_concurrency() { + return std::thread::hardware_concurrency(); +} + +void WindowsThread::join() { + if (!joinable()) { + assert(false); + throw std::system_error(std::make_error_code(std::errc::invalid_argument), + "Thread is no longer joinable"); + } + + if (GetThreadId(GetCurrentThread()) == th_id_) { + assert(false); + throw std::system_error( + std::make_error_code(std::errc::resource_deadlock_would_occur), + "Can not join itself"); + } + + auto ret = + WaitForSingleObject(reinterpret_cast(data_->handle_), INFINITE); + if (ret != WAIT_OBJECT_0) { + auto lastError = GetLastError(); + assert(false); + throw std::system_error(static_cast(lastError), std::system_category(), + "WaitForSingleObjectFailed: thread join"); + } + + BOOL rc +#if defined(_MSC_VER) + = FALSE; +#else + __attribute__((__unused__)); +#endif + rc = CloseHandle(reinterpret_cast(data_->handle_)); + assert(rc != 0); + data_->handle_ = 0; +} + +bool WindowsThread::detach() { + if (!joinable()) { + assert(false); + throw std::system_error(std::make_error_code(std::errc::invalid_argument), + "Thread is no longer available"); + } + + BOOL ret = CloseHandle(reinterpret_cast(data_->handle_)); + data_->handle_ = 0; + + return (ret != 0); +} + +void WindowsThread::swap(WindowsThread& o) { + data_.swap(o.data_); + std::swap(th_id_, o.th_id_); +} + +unsigned int __stdcall WindowsThread::Data::ThreadProc(void* arg) { + auto ptr = reinterpret_cast*>(arg); + std::unique_ptr> data(ptr); + (*data)->func_(); + return 0; +} +} // namespace port +} // namespace ROCKSDB_NAMESPACE + +#endif // !_POSIX_THREADS +#endif // OS_WIN diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_thread.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_thread.h new file mode 100644 index 0000000000..916033b77c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_thread.h @@ -0,0 +1,117 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef _POSIX_THREADS
+
+#include <functional>
+#include <memory>
+#include <type_traits>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// This class is a replacement for std::thread.
+// Two reasons we do not like std::thread:
+// - it dynamically allocates its internals, which are automatically
+//   freed when the thread terminates and not on the destruction of the
+//   object. This makes it difficult to control the source of memory
+//   allocation.
+// - this implements Pimpl so we can easily replace the guts of the
+//   object in our private version if necessary.
+class WindowsThread {
+  struct Data;
+
+  std::shared_ptr<Data> data_;
+  unsigned int th_id_;
+
+  void Init(std::function<void()>&&);
+
+ public:
+  using native_handle_type = void*;
+
+  // Construct with no thread
+  WindowsThread();
+
+  // Template constructor
+  //
+  // This templated constructor accomplishes several things
+  //
+  // - Allows the class as a whole to not be a template
+  //
+  // - take "universal" references to support both _lvalues and _rvalues
+  //
+  // - because this constructor is a catchall case in many respects it
+  //   may prevent us from using both the default __ctor and the move
+  //   __ctor. Also it may circumvent copy __ctor deletion. To work around
+  //   this we make sure this one has at least one argument and eliminate
+  //   it from the overload selection when WindowsThread is the first
+  //   argument.
+  //
+  // - construct with Fx(Ax...) with a variable number of types/arguments.
+  //
+  // - Gathers together the callable object with its arguments and
+  //   constructs a single callable entity
+  //
+  // - Makes use of std::function to convert it to a specification-template
+  //   dependent type that both checks the signature conformance to ensure
+  //   that all of the necessary arguments are provided and allows a pimpl
+  //   implementation.
+  template <class Fn, class... Args,
+            class = typename std::enable_if<!std::is_same<
+                typename std::decay<Fn>::type, WindowsThread>::value>::type>
+  explicit WindowsThread(Fn&& fx, Args&&... 
ax) : WindowsThread() { + // Use binder to create a single callable entity + auto binder = std::bind(std::forward(fx), std::forward(ax)...); + // Use std::function to take advantage of the type erasure + // so we can still hide implementation within pimpl + // This also makes sure that the binder signature is compliant + std::function target = binder; + + Init(std::move(target)); + } + + ~WindowsThread(); + + WindowsThread(const WindowsThread&) = delete; + + WindowsThread& operator=(const WindowsThread&) = delete; + + WindowsThread(WindowsThread&&) noexcept; + + WindowsThread& operator=(WindowsThread&&) noexcept; + + bool joinable() const; + + unsigned int get_id() const { return th_id_; } + + native_handle_type native_handle() const; + + static unsigned hardware_concurrency(); + + void join(); + + bool detach(); + + void swap(WindowsThread&); +}; +} // namespace port +} // namespace ROCKSDB_NAMESPACE + +namespace std { +inline void swap(ROCKSDB_NAMESPACE::port::WindowsThread& th1, + ROCKSDB_NAMESPACE::port::WindowsThread& th2) { + th1.swap(th2); +} +} // namespace std + +#endif // !_POSIX_THREADS diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/xpress_win.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/xpress_win.h new file mode 100644 index 0000000000..187adffa65 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/xpress_win.h @@ -0,0 +1,26 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +namespace port { +namespace xpress { + +bool Compress(const char* input, size_t length, std::string* output); + +char* Decompress(const char* input_data, size_t input_length, + size_t* uncompressed_size); +} // namespace xpress +} // namespace port +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/xpress.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/xpress.h new file mode 100644 index 0000000000..457025f666 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/xpress.h @@ -0,0 +1,17 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
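Editor's note: the WindowsThread template constructor above (win_thread.h) relies on a small type-erasure idiom: std::bind packs the callable and its arguments into a single nullary callable, and std::function<void()> then erases its concrete type so that the pimpl side never needs to see it. A minimal sketch of the same idiom, with illustrative names that are not part of the vendored sources:

#include <functional>
#include <iostream>
#include <utility>

// Stores any callable plus its arguments as one type-erased
// std::function<void()> that can be invoked later.
class DeferredCall {
 public:
  template <class Fn, class... Args>
  explicit DeferredCall(Fn&& fn, Args&&... args)
      : target_(
            std::bind(std::forward<Fn>(fn), std::forward<Args>(args)...)) {}

  void Run() { target_(); }

 private:
  std::function<void()> target_;
};

int main() {
  DeferredCall call([](int x) { std::cout << x << "\n"; }, 42);
  call.Run();  // prints 42
  return 0;
}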
+
+#pragma once
+
+// Xpress on Windows is implemented using the Win API
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#error "Xpress compression not implemented"
+#elif defined(OS_WIN)
+#include "port/win/xpress_win.h"
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/adaptive/adaptive_table_factory.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/adaptive/adaptive_table_factory.cc
new file mode 100644
index 0000000000..5a573ca992
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/adaptive/adaptive_table_factory.cc
@@ -0,0 +1,125 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/adaptive/adaptive_table_factory.h"
+
+#include "port/port.h"
+#include "table/format.h"
+#include "table/table_builder.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+AdaptiveTableFactory::AdaptiveTableFactory(
+    std::shared_ptr<TableFactory> table_factory_to_write,
+    std::shared_ptr<TableFactory> block_based_table_factory,
+    std::shared_ptr<TableFactory> plain_table_factory,
+    std::shared_ptr<TableFactory> cuckoo_table_factory)
+    : table_factory_to_write_(table_factory_to_write),
+      block_based_table_factory_(block_based_table_factory),
+      plain_table_factory_(plain_table_factory),
+      cuckoo_table_factory_(cuckoo_table_factory) {
+  if (!plain_table_factory_) {
+    plain_table_factory_.reset(NewPlainTableFactory());
+  }
+  if (!block_based_table_factory_) {
+    block_based_table_factory_.reset(NewBlockBasedTableFactory());
+  }
+  if (!cuckoo_table_factory_) {
+    cuckoo_table_factory_.reset(NewCuckooTableFactory());
+  }
+  if (!table_factory_to_write_) {
+    table_factory_to_write_ = block_based_table_factory_;
+  }
+}
+
+extern const uint64_t kPlainTableMagicNumber;
+extern const uint64_t kLegacyPlainTableMagicNumber;
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kCuckooTableMagicNumber;
+
+Status AdaptiveTableFactory::NewTableReader(
+    const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    std::unique_ptr<TableReader>* table,
+    bool prefetch_index_and_filter_in_cache) const {
+  Footer footer;
+  IOOptions opts;
+  auto s =
+      ReadFooterFromFile(opts, file.get(), *table_reader_options.ioptions.fs,
+                         nullptr /* prefetch_buffer */, file_size, &footer);
+  if (!s.ok()) {
+    return s;
+  }
+  if (footer.table_magic_number() == kPlainTableMagicNumber ||
+      footer.table_magic_number() == kLegacyPlainTableMagicNumber) {
+    return plain_table_factory_->NewTableReader(
+        table_reader_options, std::move(file), file_size, table);
+  } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber ||
+             footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) {
+    return block_based_table_factory_->NewTableReader(
+        ro, table_reader_options, std::move(file), file_size, table,
+        prefetch_index_and_filter_in_cache);
+  } else if (footer.table_magic_number() == kCuckooTableMagicNumber) {
+    return cuckoo_table_factory_->NewTableReader(
+        table_reader_options, std::move(file), file_size, table);
+  } else {
+    return Status::NotSupported("Unidentified table format");
+  }
+}
+
+TableBuilder* AdaptiveTableFactory::NewTableBuilder(
+    const TableBuilderOptions& table_builder_options,
+    WritableFileWriter* file) const {
+  return 
table_factory_to_write_->NewTableBuilder(table_builder_options, file); +} + +std::string AdaptiveTableFactory::GetPrintableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + if (table_factory_to_write_) { + snprintf(buffer, kBufferSize, " write factory (%s) options:\n%s\n", + (table_factory_to_write_->Name() ? table_factory_to_write_->Name() + : ""), + table_factory_to_write_->GetPrintableOptions().c_str()); + ret.append(buffer); + } + if (plain_table_factory_) { + snprintf(buffer, kBufferSize, " %s options:\n%s\n", + plain_table_factory_->Name() ? plain_table_factory_->Name() : "", + plain_table_factory_->GetPrintableOptions().c_str()); + ret.append(buffer); + } + if (block_based_table_factory_) { + snprintf( + buffer, kBufferSize, " %s options:\n%s\n", + (block_based_table_factory_->Name() ? block_based_table_factory_->Name() + : ""), + block_based_table_factory_->GetPrintableOptions().c_str()); + ret.append(buffer); + } + if (cuckoo_table_factory_) { + snprintf(buffer, kBufferSize, " %s options:\n%s\n", + cuckoo_table_factory_->Name() ? cuckoo_table_factory_->Name() : "", + cuckoo_table_factory_->GetPrintableOptions().c_str()); + ret.append(buffer); + } + return ret; +} + +extern TableFactory* NewAdaptiveTableFactory( + std::shared_ptr table_factory_to_write, + std::shared_ptr block_based_table_factory, + std::shared_ptr plain_table_factory, + std::shared_ptr cuckoo_table_factory) { + return new AdaptiveTableFactory(table_factory_to_write, + block_based_table_factory, + plain_table_factory, cuckoo_table_factory); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/adaptive/adaptive_table_factory.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/adaptive/adaptive_table_factory.h new file mode 100644 index 0000000000..55c8bca1f4 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/adaptive/adaptive_table_factory.h @@ -0,0 +1,56 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
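Editor's note: a short usage sketch for the factory defined above. It assumes the upstream declaration of NewAdaptiveTableFactory in include/rocksdb/table.h, where every argument defaults to nullptr so that the constructor falls back to the default plain, block-based, and cuckoo factories; treat that signature, and the database path used here, as assumptions rather than guarantees of this vendored tree.

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

int main() {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;
  // Writes go through the write factory (block-based by default), while
  // reads dispatch on each SST file's footer magic number, so a database
  // containing a mix of table formats remains readable.
  options.table_factory.reset(ROCKSDB_NAMESPACE::NewAdaptiveTableFactory());

  ROCKSDB_NAMESPACE::DB* db = nullptr;
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::DB::Open(options, "/tmp/adaptive_demo", &db);
  assert(s.ok());
  delete db;
  return 0;
}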
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct EnvOptions;
+
+class Status;
+class RandomAccessFile;
+class WritableFile;
+class Table;
+class TableBuilder;
+
+class AdaptiveTableFactory : public TableFactory {
+ public:
+  ~AdaptiveTableFactory() {}
+
+  explicit AdaptiveTableFactory(
+      std::shared_ptr<TableFactory> table_factory_to_write,
+      std::shared_ptr<TableFactory> block_based_table_factory,
+      std::shared_ptr<TableFactory> plain_table_factory,
+      std::shared_ptr<TableFactory> cuckoo_table_factory);
+
+  const char* Name() const override { return "AdaptiveTableFactory"; }
+
+  using TableFactory::NewTableReader;
+  Status NewTableReader(
+      const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table,
+      bool prefetch_index_and_filter_in_cache = true) const override;
+
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFileWriter* file) const override;
+
+  std::string GetPrintableOptions() const override;
+
+ private:
+  std::shared_ptr<TableFactory> table_factory_to_write_;
+  std::shared_ptr<TableFactory> block_based_table_factory_;
+  std::shared_ptr<TableFactory> plain_table_factory_;
+  std::shared_ptr<TableFactory> cuckoo_table_factory_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/binary_search_index_reader.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/binary_search_index_reader.cc
new file mode 100644
index 0000000000..2cf9a55318
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/binary_search_index_reader.cc
@@ -0,0 +1,73 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
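Editor's note: the reader implemented below delegates the actual lookup to Block's restart-point binary search. The core idea, choosing the last index entry whose first key is not greater than the target, can be sketched independently with std::upper_bound; none of the names below exist in the vendored code.

#include <algorithm>
#include <string>
#include <vector>

// Given the first key of every data block in sorted order, return the
// index of the only block that could contain `target`: the last block
// whose first key is <= target.
static size_t FindBlockIndex(const std::vector<std::string>& first_keys,
                             const std::string& target) {
  // upper_bound finds the first block whose first key is > target; the
  // candidate block is the one just before it.
  auto it = std::upper_bound(first_keys.begin(), first_keys.end(), target);
  if (it == first_keys.begin()) {
    return 0;  // target sorts before every block; a seek lands in block 0
  }
  return static_cast<size_t>(it - first_keys.begin()) - 1;
}
// e.g. with first_keys = {"b", "f", "k"}: target "g" -> 1, target "a" -> 0.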
+#include "table/block_based/binary_search_index_reader.h" + +namespace ROCKSDB_NAMESPACE { +Status BinarySearchIndexReader::Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ro, use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset( + new BinarySearchIndexReader(table, std::move(index_block))); + + return Status::OK(); +} + +InternalIteratorBase* BinarySearchIndexReader::NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + const BlockBasedTable::Rep* rep = table()->get_rep(); + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = GetOrReadIndexBlock(no_io, get_context, lookup_context, + &index_block, read_options); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, true, + index_has_first_key(), index_key_includes_seq(), index_value_is_full()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/binary_search_index_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/binary_search_index_reader.h new file mode 100644 index 0000000000..d4a611ecca --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/binary_search_index_reader.h @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/index_reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Index that allows binary search lookup for the first key of each block. +// This class can be viewed as a thin wrapper for `Block` class which already +// supports binary search. +class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read index from the file and create an intance for + // `BinarySearchIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. 
+ static Status Create(const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader); + + InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + BinarySearchIndexReader(const BlockBasedTable* t, + CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block.cc new file mode 100644 index 0000000000..136275b6c8 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block.cc @@ -0,0 +1,1291 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Decodes the blocks generated by block_builder.cc. + +#include "table/block_based/block.h" + +#include +#include +#include +#include + +#include "monitoring/perf_context_imp.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/comparator.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_footer.h" +#include "table/format.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +// Helper routine: decode the next block entry starting at "p", +// storing the number of shared key bytes, non_shared key bytes, +// and the length of the value in "*shared", "*non_shared", and +// "*value_length", respectively. Will not dereference past "limit". +// +// If any errors are detected, returns nullptr. Otherwise, returns a +// pointer to the key delta (just past the three decoded values). +struct DecodeEntry { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. 
+    assert(limit - p >= 3);
+    *shared = reinterpret_cast<const unsigned char*>(p)[0];
+    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+    if ((*shared | *non_shared | *value_length) < 128) {
+      // Fast path: all three values are encoded in one byte each
+      p += 3;
+    } else {
+      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
+        return nullptr;
+      }
+    }
+
+    // Using an assert in place of "return null" since we should not pay the
+    // cost of checking for corruption on every single key decoding
+    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
+    return p;
+  }
+};
+
+// Helper routine: similar to DecodeEntry but does not have assertions.
+// Instead, returns nullptr so that caller can detect and report failure.
+struct CheckAndDecodeEntry {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared,
+                                uint32_t* value_length) {
+    // We need 2 bytes for shared and non_shared size. We also need one more
+    // byte either for value size or the actual value in case of value delta
+    // encoding.
+    if (limit - p < 3) {
+      return nullptr;
+    }
+    *shared = reinterpret_cast<const unsigned char*>(p)[0];
+    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+    if ((*shared | *non_shared | *value_length) < 128) {
+      // Fast path: all three values are encoded in one byte each
+      p += 3;
+    } else {
+      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
+        return nullptr;
+      }
+    }
+
+    if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
+      return nullptr;
+    }
+    return p;
+  }
+};
+
+struct DecodeKey {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared) {
+    uint32_t value_length;
+    return DecodeEntry()(p, limit, shared, non_shared, &value_length);
+  }
+};
+
+// In format_version 4, which is used by index blocks, the value size is not
+// encoded before the entry, as the value is known to be the handle with the
+// known size.
+struct DecodeKeyV4 {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared) {
+    // We need 2 bytes for shared and non_shared size. We also need one more
+    // byte either for value size or the actual value in case of value delta
+    // encoding.
+    if (limit - p < 3) return nullptr;
+    *shared = reinterpret_cast<const unsigned char*>(p)[0];
+    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+    if ((*shared | *non_shared) < 128) {
+      // Fast path: both values are encoded in one byte each
+      p += 2;
+    } else {
+      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+    }
+    return p;
+  }
+};
+
+struct DecodeEntryV4 {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared,
+                                uint32_t* value_length) {
+    assert(value_length);
+
+    *value_length = 0;
+    return DecodeKeyV4()(p, limit, shared, non_shared);
+  }
+};
+
+void DataBlockIter::NextImpl() {
+#ifndef NDEBUG
+  if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) return;
+#endif
+  bool is_shared = false;
+  ParseNextDataKey(&is_shared);
+  ++cur_entry_idx_;
+}
+
+void MetaBlockIter::NextImpl() {
+  bool is_shared = false;
+  ParseNextKey<CheckAndDecodeEntry>(&is_shared);
+  ++cur_entry_idx_;
+}
+
+void IndexBlockIter::NextImpl() {
+  ParseNextIndexKey();
+  ++cur_entry_idx_;
+}
+
+void IndexBlockIter::PrevImpl() {
+  assert(Valid());
+  // Scan backwards to a restart point before current_
+  const uint32_t original = current_;
+  while (GetRestartPoint(restart_index_) >= original) {
+    if (restart_index_ == 0) {
+      // No more entries
+      current_ = restarts_;
+      restart_index_ = num_restarts_;
+      return;
+    }
+    restart_index_--;
+  }
+  SeekToRestartPoint(restart_index_);
+  // Loop until end of current entry hits the start of original entry
+  while (ParseNextIndexKey() && NextEntryOffset() < original) {
+  }
+  --cur_entry_idx_;
+}
+
+void MetaBlockIter::PrevImpl() {
+  assert(Valid());
+  // Scan backwards to a restart point before current_
+  const uint32_t original = current_;
+  while (GetRestartPoint(restart_index_) >= original) {
+    if (restart_index_ == 0) {
+      // No more entries
+      current_ = restarts_;
+      restart_index_ = num_restarts_;
+      return;
+    }
+    restart_index_--;
+  }
+  SeekToRestartPoint(restart_index_);
+  bool is_shared = false;
+  // Loop until end of current entry hits the start of original entry
+  while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
+         NextEntryOffset() < original) {
+  }
+  --cur_entry_idx_;
+}
+
+// Similar to IndexBlockIter::PrevImpl but also caches the prev entries
+void DataBlockIter::PrevImpl() {
+  assert(Valid());
+
+  assert(prev_entries_idx_ == -1 ||
+         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
+  --cur_entry_idx_;
+  // Check if we can use cached prev_entries_
+  if (prev_entries_idx_ > 0 &&
+      prev_entries_[prev_entries_idx_].offset == current_) {
+    // Read cached CachedPrevEntry
+    prev_entries_idx_--;
+    const CachedPrevEntry& current_prev_entry =
+        prev_entries_[prev_entries_idx_];
+
+    const char* key_ptr = nullptr;
+    bool raw_key_cached;
+    if (current_prev_entry.key_ptr != nullptr) {
+      // The key is not delta encoded and stored in the data block
+      key_ptr = current_prev_entry.key_ptr;
+      raw_key_cached = false;
+    } else {
+      // The key is delta encoded and stored in prev_entries_keys_buff_
+      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
+      raw_key_cached = true;
+    }
+    const Slice current_key(key_ptr, current_prev_entry.key_size);
+
+    current_ = current_prev_entry.offset;
+    // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience,
+    // not necessity. It is convenient since this class treats keys as pinned
+    // when `raw_key_` points to an outside buffer. 
So we cannot allow + // `raw_key_` point into Prev cache as it is a transient outside buffer + // (i.e., keys in it are not actually pinned). + raw_key_.SetKey(current_key, raw_key_cached /* copy */); + value_ = current_prev_entry.value; + + return; + } + + // Clear prev entries cache + prev_entries_idx_ = -1; + prev_entries_.clear(); + prev_entries_keys_buff_.clear(); + + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + + SeekToRestartPoint(restart_index_); + + do { + bool is_shared = false; + if (!ParseNextDataKey(&is_shared)) { + break; + } + Slice current_key = raw_key_.GetKey(); + + if (raw_key_.IsKeyPinned()) { + // The key is not delta encoded + prev_entries_.emplace_back(current_, current_key.data(), 0, + current_key.size(), value()); + } else { + // The key is delta encoded, cache decoded key in buffer + size_t new_key_offset = prev_entries_keys_buff_.size(); + prev_entries_keys_buff_.append(current_key.data(), current_key.size()); + + prev_entries_.emplace_back(current_, nullptr, new_key_offset, + current_key.size(), value()); + } + // Loop until end of current entry hits the start of original entry + } while (NextEntryOffset() < original); + prev_entries_idx_ = static_cast(prev_entries_.size()) - 1; +} + +void DataBlockIter::SeekImpl(const Slice& target) { + Slice seek_key = target; + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); + + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); +} + +void MetaBlockIter::SeekImpl(const Slice& target) { + Slice seek_key = target; + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); + + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); +} + +// Optimized Seek for point lookup for an internal key `target` +// target = "seek_user_key @ type | seqno". +// +// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// kTypeBlobIndex, kTypeWideColumnEntity or kTypeMerge, this function behaves +// identically to Seek(). +// +// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// kTypeBlobIndex, kTypeWideColumnEntity, or kTypeMerge: +// +// If the return value is FALSE, iter location is undefined, and it means: +// 1) there is no key in this block falling into the range: +// ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"], +// inclusive; AND +// 2) the last key of this block has a greater user_key from seek_user_key +// +// If the return value is TRUE, iter location has two possibilities: +// 1) If iter is valid, it is set to a location as if set by SeekImpl(target). +// In this case, it points to the first key with a larger user_key or a +// matching user_key with a seqno no greater than the seeking seqno. +// 2) If the iter is invalid, it means that either all the user_key is less +// than the seek_user_key, or the block ends with a matching user_key but +// with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno +// but larger type). 
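// Editor's note: a concrete illustration of the contract above, added for
// exposition only (keys are written user_key@seqno; internal keys order by
// ascending user_key, then descending seqno). Suppose this block holds:
//
//   app@120  app@100  axy@90
//
// - target app@110 -> returns TRUE, iter valid on app@100: the first "app"
//   entry with a seqno no greater than 110.
// - target axy@80 -> returns TRUE, iter invalid: axy@90 sorts before the
//   target (seqno 90 > 80), the scan runs off the end of the block, and a
//   matching key may still exist in the next block.
// - target amw@50 -> returns FALSE: the linear scan stops at the first key
//   greater than the target, whose user_key ("app" or "axy") differs from
//   "amw", so the key can exist neither here nor in any later block.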
+bool DataBlockIter::SeekForGetImpl(const Slice& target) { + Slice target_user_key = ExtractUserKey(target); + uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t); + uint8_t entry = + data_block_hash_index_->Lookup(data_, map_offset, target_user_key); + + if (entry == kCollision) { + // HashSeek not effective, falling back + SeekImpl(target); + return true; + } + + if (entry == kNoEntry) { + // Even if we cannot find the user_key in this block, the result may + // exist in the next block. Consider this example: + // + // Block N: [aab@100, ... , app@120] + // boundary key: axy@50 (we make minimal assumption about a boundary key) + // Block N+1: [axy@10, ... ] + // + // If seek_key = axy@60, the search will start from Block N. + // Even if the user_key is not found in the hash map, the caller still + // have to continue searching the next block. + // + // In this case, we pretend the key is in the last restart interval. + // The while-loop below will search the last restart interval for the + // key. It will stop at the first key that is larger than the seek_key, + // or to the end of the block if no one is larger. + entry = static_cast(num_restarts_ - 1); + } + + uint32_t restart_index = entry; + + // check if the key is in the restart_interval + assert(restart_index < num_restarts_); + SeekToRestartPoint(restart_index); + current_ = GetRestartPoint(restart_index); + cur_entry_idx_ = + static_cast(restart_index * block_restart_interval_) - 1; + + uint32_t limit = restarts_; + if (restart_index + 1 < num_restarts_) { + limit = GetRestartPoint(restart_index + 1); + } + while (current_ < limit) { + ++cur_entry_idx_; + bool shared; + // Here we only linear seek the target key inside the restart interval. + // If a key does not exist inside a restart interval, we avoid + // further searching the block content across restart interval boundary. + // + // TODO(fwu): check the left and right boundary of the restart interval + // to avoid linear seek a target key that is out of range. + if (!ParseNextDataKey(&shared) || CompareCurrentKey(target) >= 0) { + // we stop at the first potential matching user key. + break; + } + // If the loop exits due to CompareCurrentKey(target) >= 0, then current key + // exists, and its checksum verification will be done in UpdateKey() called + // in SeekForGet(). + // TODO(cbi): If this loop exits with current_ == restart_, per key-value + // checksum will not be verified in UpdateKey() since Valid() + // will return false. + } + + if (current_ == restarts_) { + // Search reaches to the end of the block. There are three possibilities: + // 1) there is only one user_key match in the block (otherwise collision). + // the matching user_key resides in the last restart interval, and it + // is the last key of the restart interval and of the block as well. + // ParseNextKey() skipped it as its [ type | seqno ] is smaller. + // + // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry, + // AND all existing user_keys in the restart interval are smaller than + // seek_user_key. + // + // 3) The seek_key is a false positive and happens to be hashed to the + // last restart interval, AND all existing user_keys in the restart + // interval are smaller than seek_user_key. + // + // The result may exist in the next block each case, so we return true. + return true; + } + + if (icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), + target_user_key) != 0) { + // the key is not in this block and cannot be at the next block either. 
+ return false; + } + + // Here we are conservative and only support a limited set of cases + ValueType value_type = ExtractValueType(raw_key_.GetInternalKey()); + if (value_type != ValueType::kTypeValue && + value_type != ValueType::kTypeDeletion && + value_type != ValueType::kTypeMerge && + value_type != ValueType::kTypeSingleDeletion && + value_type != ValueType::kTypeBlobIndex && + value_type != ValueType::kTypeWideColumnEntity) { + SeekImpl(target); + } + + // Result found, and the iter is correctly set. + return true; +} + +void IndexBlockIter::SeekImpl(const Slice& target) { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) return; +#endif + TEST_SYNC_POINT("IndexBlockIter::Seek:0"); + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; + } + Slice seek_key = target; + if (raw_key_.IsUserKey()) { + seek_key = ExtractUserKey(target); + } + status_ = Status::OK(); + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = false; + if (prefix_index_) { + bool prefix_may_exist = true; + ok = PrefixSeek(target, &index, &prefix_may_exist); + if (!prefix_may_exist) { + // This is to let the caller to distinguish between non-existing prefix, + // and when key is larger than the last key, which both set Valid() to + // false. + current_ = restarts_; + status_ = Status::NotFound(); + } + // restart interval must be one when hash search is enabled so the binary + // search simply lands at the right place. + skip_linear_scan = true; + } else if (value_delta_encoded_) { + ok = BinarySeek(seek_key, &index, &skip_linear_scan); + } else { + ok = BinarySeek(seek_key, &index, &skip_linear_scan); + } + + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); +} + +void DataBlockIter::SeekForPrevImpl(const Slice& target) { + PERF_TIMER_GUARD(block_seek_nanos); + Slice seek_key = target; + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); + + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); + + if (!Valid()) { + if (status_.ok()) { + SeekToLastImpl(); + } + } else { + while (Valid() && CompareCurrentKey(seek_key) > 0) { + PrevImpl(); + } + } +} + +void MetaBlockIter::SeekForPrevImpl(const Slice& target) { + PERF_TIMER_GUARD(block_seek_nanos); + Slice seek_key = target; + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); + + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); + + if (!Valid()) { + if (status_.ok()) { + SeekToLastImpl(); + } + } else { + while (Valid() && CompareCurrentKey(seek_key) > 0) { + PrevImpl(); + } + } +} + +void DataBlockIter::SeekToFirstImpl() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(0); + bool is_shared = false; + ParseNextDataKey(&is_shared); + cur_entry_idx_ = 0; +} + +void MetaBlockIter::SeekToFirstImpl() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(0); + bool is_shared = false; + ParseNextKey(&is_shared); + cur_entry_idx_ = 0; +} + +void IndexBlockIter::SeekToFirstImpl() { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) return; +#endif + if (data_ == nullptr) { // Not init yet + return; + } + status_ = Status::OK(); + SeekToRestartPoint(0); + ParseNextIndexKey(); + 
cur_entry_idx_ = 0;
+}
+
+void DataBlockIter::SeekToLastImpl() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  SeekToRestartPoint(num_restarts_ - 1);
+  bool is_shared = false;
+  cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
+  while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) {
+    // Keep skipping
+    ++cur_entry_idx_;
+  }
+}
+
+void MetaBlockIter::SeekToLastImpl() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  SeekToRestartPoint(num_restarts_ - 1);
+  bool is_shared = false;
+  assert(num_restarts_ >= 1);
+  cur_entry_idx_ =
+      static_cast<int32_t>((num_restarts_ - 1) * block_restart_interval_);
+  while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
+         NextEntryOffset() < restarts_) {
+    // Will probably never reach here since restart_interval is always 1
+    ++cur_entry_idx_;
+  }
+}
+
+void IndexBlockIter::SeekToLastImpl() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  status_ = Status::OK();
+  SeekToRestartPoint(num_restarts_ - 1);
+  cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
+  while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
+    ++cur_entry_idx_;
+  }
+}
+
+template <class TValue>
+template <typename DecodeEntryFunc>
+bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
+  current_ = NextEntryOffset();
+  const char* p = data_ + current_;
+  const char* limit = data_ + restarts_;  // Restarts come right after data
+
+  if (p >= limit) {
+    // No more entries to return. Mark as invalid.
+    current_ = restarts_;
+    restart_index_ = num_restarts_;
+    return false;
+  }
+  // Decode next entry
+  uint32_t shared, non_shared, value_length;
+  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
+  if (p == nullptr || raw_key_.Size() < shared) {
+    CorruptionError();
+    return false;
+  } else {
+    if (shared == 0) {
+      *is_shared = false;
+      // If this key doesn't share any bytes with prev key then we don't need
+      // to decode it and can use its address in the block directly.
+      raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
+    } else {
+      // This key share `shared` bytes with prev key, we need to decode it
+      *is_shared = true;
+      raw_key_.TrimAppend(shared, p, non_shared);
+    }
+    value_ = Slice(p + non_shared, value_length);
+    if (shared == 0) {
+      while (restart_index_ + 1 < num_restarts_ &&
+             GetRestartPoint(restart_index_ + 1) < current_) {
+        ++restart_index_;
+      }
+    }
+    // else we are in the middle of a restart interval and the restart_index_
+    // thus has not changed
+    return true;
+  }
+}
+
+bool DataBlockIter::ParseNextDataKey(bool* is_shared) {
+  if (ParseNextKey<DecodeEntry>(is_shared)) {
+#ifndef NDEBUG
+    if (global_seqno_ != kDisableGlobalSequenceNumber) {
+      // If we are reading a file with a global sequence number we should
+      // expect that all encoded sequence numbers are zeros and any value
+      // type is kTypeValue, kTypeMerge, kTypeDeletion,
+      // kTypeDeletionWithTimestamp, or kTypeRangeDeletion.
+      uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey());
+      SequenceNumber seqno;
+      ValueType value_type;
+      UnPackSequenceAndType(packed, &seqno, &value_type);
+      assert(value_type == ValueType::kTypeValue ||
+             value_type == ValueType::kTypeMerge ||
+             value_type == ValueType::kTypeDeletion ||
+             value_type == ValueType::kTypeDeletionWithTimestamp ||
+             value_type == ValueType::kTypeRangeDeletion);
+      assert(seqno == 0);
+    }
+#endif  // NDEBUG
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool IndexBlockIter::ParseNextIndexKey() {
+  bool is_shared = false;
+  bool ok = (value_delta_encoded_) ? ParseNextKey<DecodeEntryV4>(&is_shared)
+                                   : ParseNextKey<DecodeEntry>(&is_shared);
+  if (ok) {
+    if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
+      DecodeCurrentValue(is_shared);
+    }
+  }
+  return ok;
+}
+
+// The format:
+// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// ...
+// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// where, k is key, v is value, and its encoding is in parentheses.
+// The format of each key is (shared_size, non_shared_size, shared, non_shared)
+// The format of each value, i.e., block handle, is (offset, size) whenever the
+// is_shared is false, which includes the first entry in each restart point.
+// Otherwise, the format is delta-size = the size of the current block - the
+// size of the last block.
+void IndexBlockIter::DecodeCurrentValue(bool is_shared) {
+  Slice v(value_.data(), data_ + restarts_ - value_.data());
+  // Delta encoding is used if `shared` != 0.
+  Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
+      &v, have_first_key_,
+      (value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr);
+  assert(decode_s.ok());
+  value_ = Slice(value_.data(), v.data() - value_.data());
+
+  if (global_seqno_state_ != nullptr) {
+    // Overwrite sequence number the same way as in DataBlockIter.
+
+    IterKey& first_internal_key = global_seqno_state_->first_internal_key;
+    first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
+                                      /* copy */ true);
+
+    assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);
+
+    ValueType value_type = ExtractValueType(first_internal_key.GetKey());
+    assert(value_type == ValueType::kTypeValue ||
+           value_type == ValueType::kTypeMerge ||
+           value_type == ValueType::kTypeDeletion ||
+           value_type == ValueType::kTypeRangeDeletion);
+
+    first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
+                                         value_type);
+    decoded_value_.first_internal_key = first_internal_key.GetKey();
+  }
+}
+
+template <class TValue>
+void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
+                                               uint32_t index,
+                                               bool skip_linear_scan) {
+  // SeekToRestartPoint() only does the lookup in the restart block. We need
+  // to follow it up with NextImpl() to position the iterator at the restart
+  // key.
+  SeekToRestartPoint(index);
+  cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
+  NextImpl();
+
+  if (!skip_linear_scan) {
+    // Linear search (within restart block) for first key >= target
+    uint32_t max_offset;
+    if (index + 1 < num_restarts_) {
+      // We are in a non-last restart interval. Since `BinarySeek()` guarantees
+      // the next restart key is strictly greater than `target`, we can
+      // terminate upon reaching it without any additional key comparison.
+      max_offset = GetRestartPoint(index + 1);
+    } else {
+      // We are in the last restart interval. The while-loop will terminate by
+      // `Valid()` returning false upon advancing past the block's last key.
+      max_offset = std::numeric_limits<uint32_t>::max();
+    }
+    while (true) {
+      NextImpl();
+      if (!Valid()) {
+        // TODO(cbi): per key-value checksum will not be verified in UpdateKey()
+        // since Valid() will return false.
+        break;
+      }
+      if (current_ == max_offset) {
+        assert(CompareCurrentKey(target) > 0);
+        break;
+      } else if (CompareCurrentKey(target) >= 0) {
+        break;
+      }
+    }
+  }
+}
+
+// Binary searches in restart array to find the starting restart point for the
+// linear scan, and stores it in `*index`. Assumes restart array does not
+// contain duplicate keys. It is guaranteed that the restart key at `*index + 1`
+// is strictly greater than `target` or does not exist (this can be used to
+// elide a comparison when linear scan reaches all the way to the next restart
+// key). Furthermore, `*skip_linear_scan` is set to indicate whether the
+// `*index`th restart key is the final result so that key does not need to be
+// compared again later.
+template <class TValue>
+template <typename DecodeKeyFunc>
+bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
+                                   bool* skip_linear_scan) {
+  if (restarts_ == 0) {
+    // SST files dedicated to range tombstones are written with index blocks
+    // that have no keys while also having `num_restarts_ == 1`. This would
+    // cause a problem for `BinarySeek()` as it'd try to access the first key
+    // which does not exist. We identify such blocks by the offset at which
+    // their restarts are stored, and return false to prevent any attempted
+    // key accesses.
+    return false;
+  }
+
+  *skip_linear_scan = false;
+  // Loop invariants:
+  // - Restart key at index `left` is less than or equal to the target key. The
+  //   sentinel index `-1` is considered to have a key that is less than all
+  //   keys.
+  // - Any restart keys after index `right` are strictly greater than the
+  //   target key.
+  int64_t left = -1, right = num_restarts_ - 1;
+  while (left != right) {
+    // The `mid` is computed by rounding up so it lands in (`left`, `right`].
+    int64_t mid = left + (right - left + 1) / 2;
+    uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
+    uint32_t shared, non_shared;
+    const char* key_ptr = DecodeKeyFunc()(
+        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
+    if (key_ptr == nullptr || (shared != 0)) {
+      CorruptionError();
+      return false;
+    }
+    Slice mid_key(key_ptr, non_shared);
+    raw_key_.SetKey(mid_key, false /* copy */);
+    int cmp = CompareCurrentKey(target);
+    if (cmp < 0) {
+      // Key at "mid" is smaller than "target". Therefore all
+      // blocks before "mid" are uninteresting.
+      left = mid;
+    } else if (cmp > 0) {
+      // Key at "mid" is >= "target". Therefore all blocks at or
+      // after "mid" are uninteresting.
+      right = mid - 1;
+    } else {
+      *skip_linear_scan = true;
+      left = right = mid;
+    }
+  }
+
+  if (left == -1) {
+    // All keys in the block were strictly greater than `target`. So the very
+    // first key in the block is the final seek result.
+    *skip_linear_scan = true;
+    *index = 0;
+  } else {
+    *index = static_cast<uint32_t>(left);
+  }
+  return true;
+}
+
+// Compare target key and the block key of the block of `block_index`.
+// Return -1 if error.
+int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
+  uint32_t region_offset = GetRestartPoint(block_index);
+  uint32_t shared, non_shared;
+  const char* key_ptr =
+      value_delta_encoded_
+          ? 
DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared) + : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return 1; // Return target is smaller + } + Slice block_key(key_ptr, non_shared); + raw_key_.SetKey(block_key, false /* copy */); + return CompareCurrentKey(target); +} + +// Binary search in block_ids to find the first block +// with a key >= target +bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, + uint32_t* block_ids, uint32_t left, + uint32_t right, uint32_t* index, + bool* prefix_may_exist) { + assert(left <= right); + assert(index); + assert(prefix_may_exist); + *prefix_may_exist = true; + uint32_t left_bound = left; + + while (left <= right) { + uint32_t mid = (right + left) / 2; + + int cmp = CompareBlockKey(block_ids[mid], target); + if (!status_.ok()) { + return false; + } + if (cmp < 0) { + // Key at "target" is larger than "mid". Therefore all + // blocks before or at "mid" are uninteresting. + left = mid + 1; + } else { + // Key at "target" is <= "mid". Therefore all blocks + // after "mid" are uninteresting. + // If there is only one block left, we found it. + if (left == right) break; + right = mid; + } + } + + if (left == right) { + // In one of the two following cases: + // (1) left is the first one of block_ids + // (2) there is a gap of blocks between block of `left` and `left-1`. + // we can further distinguish the case of key in the block or key not + // existing, by comparing the target key and the key of the previous + // block to the left of the block found. + if (block_ids[left] > 0 && + (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) && + CompareBlockKey(block_ids[left] - 1, target) > 0) { + current_ = restarts_; + *prefix_may_exist = false; + return false; + } + + *index = block_ids[left]; + return true; + } else { + assert(left > right); + + // If the next block key is larger than seek key, it is possible that + // no key shares the prefix with `target`, or all keys with the same + // prefix as `target` are smaller than prefix. In the latter case, + // we are mandated to set the position the same as the total order. + // In the latter case, either: + // (1) `target` falls into the range of the next block. In this case, + // we can place the iterator to the next block, or + // (2) `target` is larger than all block keys. In this case we can + // keep the iterator invalidate without setting `prefix_may_exist` + // to false. + // We might sometimes end up with setting the total order position + // while there is no key sharing the prefix as `target`, but it + // still follows the contract. + uint32_t right_index = block_ids[right]; + assert(right_index + 1 <= num_restarts_); + if (right_index + 1 < num_restarts_) { + if (CompareBlockKey(right_index + 1, target) >= 0) { + *index = right_index + 1; + return true; + } else { + // We have to set the flag here because we are not positioning + // the iterator to the total order position. 
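+          // (Callers treat prefix_may_exist == false as a definitive
+          // "no key with this prefix exists", so it is set only when the
+          // comparisons above have established that.)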
+ *prefix_may_exist = false; + } + } + + // Mark iterator invalid + current_ = restarts_; + return false; + } +} + +bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index, + bool* prefix_may_exist) { + assert(index); + assert(prefix_may_exist); + assert(prefix_index_); + *prefix_may_exist = true; + Slice seek_key = target; + if (raw_key_.IsUserKey()) { + seek_key = ExtractUserKey(target); + } + uint32_t* block_ids = nullptr; + uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); + + if (num_blocks == 0) { + current_ = restarts_; + *prefix_may_exist = false; + return false; + } else { + assert(block_ids); + return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index, + prefix_may_exist); + } +} + +uint32_t Block::NumRestarts() const { + assert(size_ >= 2 * sizeof(uint32_t)); + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // In BlockBuilder, we have ensured a block with HashIndex is less than + // kMaxBlockSizeSupportedByHashIndex (64KiB). + // + // Therefore, if we encounter a block with a size > 64KiB, the block + // cannot have HashIndex. So the footer will directly interpreted as + // num_restarts. + // + // Such check is for backward compatibility. We can ensure legacy block + // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted + // correctly as no HashIndex even if the MSB of num_restarts is set. + return num_restarts; + } + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return num_restarts; +} + +BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const { + assert(size_ >= 2 * sizeof(uint32_t)); + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // The check is for the same reason as that in NumRestarts() + return BlockBasedTableOptions::kDataBlockBinarySearch; + } + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return index_type; +} + +Block::~Block() { + // This sync point can be re-enabled if RocksDB can control the + // initialization order of any/all static options created by the user. + // TEST_SYNC_POINT("Block::~Block"); + delete[] kv_checksum_; +} + +Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, + Statistics* statistics) + : contents_(std::move(contents)), + data_(contents_.data.data()), + size_(contents_.data.size()), + restart_offset_(0), + num_restarts_(0) { + TEST_SYNC_POINT("Block::Block:0"); + if (size_ < sizeof(uint32_t)) { + size_ = 0; // Error marker + } else { + // Should only decode restart points for uncompressed blocks + num_restarts_ = NumRestarts(); + switch (IndexType()) { + case BlockBasedTableOptions::kDataBlockBinarySearch: + restart_offset_ = static_cast(size_) - + (1 + num_restarts_) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. 
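+          // (restart_offset_ and size_ are unsigned, so the subtraction
+          // above wraps to a huge value; this range check catches it.)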
+ size_ = 0; + } + break; + case BlockBasedTableOptions::kDataBlockBinaryAndHash: + if (size_ < sizeof(uint32_t) /* block footer */ + + sizeof(uint16_t) /* NUM_BUCK */) { + size_ = 0; + break; + } + + uint16_t map_offset; + data_block_hash_index_.Initialize( + contents.data.data(), + static_cast(contents.data.size() - + sizeof(uint32_t)), /*chop off + NUM_RESTARTS*/ + &map_offset); + + restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); + + if (restart_offset_ > map_offset) { + // map_offset is too small for NumRestarts() and + // therefore restart_offset_ wrapped around. + size_ = 0; + break; + } + break; + default: + size_ = 0; // Error marker + } + } + if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) { + read_amp_bitmap_.reset(new BlockReadAmpBitmap( + restart_offset_, read_amp_bytes_per_bit, statistics)); + } +} + +void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key, + const Comparator* raw_ucmp) { + protection_bytes_per_key_ = 0; + if (protection_bytes_per_key > 0 && num_restarts_ > 0) { + // NewDataIterator() is called with protection_bytes_per_key_ = 0. + // This is intended since checksum is not constructed yet. + // + // We do not know global_seqno yet, so checksum computation and + // verification all assume global_seqno = 0. + std::unique_ptr iter{NewDataIterator( + raw_ucmp, kDisableGlobalSequenceNumber, nullptr /* iter */, + nullptr /* stats */, true /* block_contents_pinned */)}; + if (iter->status().ok()) { + block_restart_interval_ = iter->GetRestartInterval(); + } + uint32_t num_keys = 0; + if (iter->status().ok()) { + num_keys = iter->NumberOfKeys(block_restart_interval_); + } + if (iter->status().ok()) { + checksum_size_ = num_keys * protection_bytes_per_key; + kv_checksum_ = new char[(size_t)checksum_size_]; + size_t i = 0; + iter->SeekToFirst(); + while (iter->Valid()) { + GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, + iter->key(), iter->value()); + iter->Next(); + i += protection_bytes_per_key; + } + assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); + } + if (!iter->status().ok()) { + size_ = 0; // Error marker + return; + } + protection_bytes_per_key_ = protection_bytes_per_key; + } +} + +void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key, + const Comparator* raw_ucmp, + bool value_is_full, + bool index_has_first_key) { + protection_bytes_per_key_ = 0; + if (num_restarts_ > 0 && protection_bytes_per_key > 0) { + // Note that `global_seqno` and `key_includes_seq` are hardcoded here. They + // do not impact how the index block is parsed. During checksum + // construction/verification, we use the entire key buffer from + // raw_key_.GetKey() returned by iter->key() as the `key` part of key-value + // checksum, and the content of this buffer do not change for different + // values of `global_seqno` or `key_includes_seq`. 
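+    // The full scan below emits one checksum slot of
+    // protection_bytes_per_key bytes per index entry, in iteration order.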
+ std::unique_ptr iter{NewIndexIterator( + raw_ucmp, kDisableGlobalSequenceNumber /* global_seqno */, nullptr, + nullptr /* Statistics */, true /* total_order_seek */, + index_has_first_key /* have_first_key */, false /* key_includes_seq */, + value_is_full, true /* block_contents_pinned */, + nullptr /* prefix_index */)}; + if (iter->status().ok()) { + block_restart_interval_ = iter->GetRestartInterval(); + } + uint32_t num_keys = 0; + if (iter->status().ok()) { + num_keys = iter->NumberOfKeys(block_restart_interval_); + } + if (iter->status().ok()) { + checksum_size_ = num_keys * protection_bytes_per_key; + kv_checksum_ = new char[(size_t)checksum_size_]; + iter->SeekToFirst(); + size_t i = 0; + while (iter->Valid()) { + GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, + iter->key(), iter->raw_value()); + iter->Next(); + i += protection_bytes_per_key; + } + assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); + } + if (!iter->status().ok()) { + size_ = 0; // Error marker + return; + } + protection_bytes_per_key_ = protection_bytes_per_key; + } +} + +void Block::InitializeMetaIndexBlockProtectionInfo( + uint8_t protection_bytes_per_key) { + protection_bytes_per_key_ = 0; + if (num_restarts_ > 0 && protection_bytes_per_key > 0) { + std::unique_ptr iter{ + NewMetaIterator(true /* block_contents_pinned */)}; + if (iter->status().ok()) { + block_restart_interval_ = iter->GetRestartInterval(); + } + uint32_t num_keys = 0; + if (iter->status().ok()) { + num_keys = iter->NumberOfKeys(block_restart_interval_); + } + if (iter->status().ok()) { + checksum_size_ = num_keys * protection_bytes_per_key; + kv_checksum_ = new char[(size_t)checksum_size_]; + iter->SeekToFirst(); + size_t i = 0; + while (iter->Valid()) { + GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, + iter->key(), iter->value()); + iter->Next(); + i += protection_bytes_per_key; + } + assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); + } + if (!iter->status().ok()) { + size_ = 0; // Error marker + return; + } + protection_bytes_per_key_ = protection_bytes_per_key; + } +} + +MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) { + MetaBlockIter* iter = new MetaBlockIter(); + if (size_ < 2 * sizeof(uint32_t)) { + iter->Invalidate(Status::Corruption("bad block contents")); + return iter; + } else if (num_restarts_ == 0) { + // Empty block. + iter->Invalidate(Status::OK()); + } else { + iter->Initialize(data_, restart_offset_, num_restarts_, + block_contents_pinned, protection_bytes_per_key_, + kv_checksum_, block_restart_interval_); + } + return iter; +} + +DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp, + SequenceNumber global_seqno, + DataBlockIter* iter, Statistics* stats, + bool block_contents_pinned) { + DataBlockIter* ret_iter; + if (iter != nullptr) { + ret_iter = iter; + } else { + ret_iter = new DataBlockIter; + } + if (size_ < 2 * sizeof(uint32_t)) { + ret_iter->Invalidate(Status::Corruption("bad block contents")); + return ret_iter; + } + if (num_restarts_ == 0) { + // Empty block. + ret_iter->Invalidate(Status::OK()); + return ret_iter; + } else { + ret_iter->Initialize( + raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, + read_amp_bitmap_.get(), block_contents_pinned, + data_block_hash_index_.Valid() ? 
&data_block_hash_index_ : nullptr,
+        protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
+    if (read_amp_bitmap_) {
+      if (read_amp_bitmap_->GetStatistics() != stats) {
+        // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
+        read_amp_bitmap_->SetStatistics(stats);
+      }
+    }
+  }
+
+  return ret_iter;
+}
+
+IndexBlockIter* Block::NewIndexIterator(
+    const Comparator* raw_ucmp, SequenceNumber global_seqno,
+    IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
+    bool have_first_key, bool key_includes_seq, bool value_is_full,
+    bool block_contents_pinned, BlockPrefixIndex* prefix_index) {
+  IndexBlockIter* ret_iter;
+  if (iter != nullptr) {
+    ret_iter = iter;
+  } else {
+    ret_iter = new IndexBlockIter;
+  }
+  if (size_ < 2 * sizeof(uint32_t)) {
+    ret_iter->Invalidate(Status::Corruption("bad block contents"));
+    return ret_iter;
+  }
+  if (num_restarts_ == 0) {
+    // Empty block.
+    ret_iter->Invalidate(Status::OK());
+    return ret_iter;
+  } else {
+    BlockPrefixIndex* prefix_index_ptr =
+        total_order_seek ? nullptr : prefix_index;
+    ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_,
+                         global_seqno, prefix_index_ptr, have_first_key,
+                         key_includes_seq, value_is_full, block_contents_pinned,
+                         protection_bytes_per_key_, kv_checksum_,
+                         block_restart_interval_);
+  }
+
+  return ret_iter;
+}
+
+size_t Block::ApproximateMemoryUsage() const {
+  size_t usage = usable_size();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size((void*)this);
+#else
+  usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+  if (read_amp_bitmap_) {
+    usage += read_amp_bitmap_->ApproximateMemoryUsage();
+  }
+  usage += checksum_size_;
+  return usage;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block.h
new file mode 100644
index 0000000000..68b6906fac
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block.h
@@ -0,0 +1,921 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "db/kv_checksum.h"
+#include "db/pinned_iterators_manager.h"
+#include "port/malloc.h"
+#include "rocksdb/advanced_cache.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_prefix_index.h"
+#include "table/block_based/data_block_hash_index.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct BlockContents;
+class Comparator;
+template <class TValue>
+class BlockIter;
+class DataBlockIter;
+class IndexBlockIter;
+class MetaBlockIter;
+class BlockPrefixIndex;
+
+// BlockReadAmpBitmap is a bitmap that maps the ROCKSDB_NAMESPACE::Block data
+// bytes to a bitmap with ratio bytes_per_bit.
Whenever we access a range of +// bytes in the Block we update the bitmap and increment +// READ_AMP_ESTIMATE_USEFUL_BYTES. +class BlockReadAmpBitmap { + public: + explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit, + Statistics* statistics) + : bitmap_(nullptr), + bytes_per_bit_pow_(0), + statistics_(statistics), + rnd_(Random::GetTLSInstance()->Uniform( + static_cast(bytes_per_bit))) { + TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_); + assert(block_size > 0 && bytes_per_bit > 0); + + // convert bytes_per_bit to be a power of 2 + while (bytes_per_bit >>= 1) { + bytes_per_bit_pow_++; + } + + // num_bits_needed = ceil(block_size / bytes_per_bit) + size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1; + assert(num_bits_needed > 0); + + // bitmap_size = ceil(num_bits_needed / kBitsPerEntry) + size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1; + + // Create bitmap and set all the bits to 0 + bitmap_ = new std::atomic[bitmap_size](); + + RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size); + } + + ~BlockReadAmpBitmap() { delete[] bitmap_; } + + void Mark(uint32_t start_offset, uint32_t end_offset) { + assert(end_offset >= start_offset); + // Index of first bit in mask + uint32_t start_bit = + (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >> + bytes_per_bit_pow_; + // Index of last bit in mask + 1 + uint32_t exclusive_end_bit = + (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_; + if (start_bit >= exclusive_end_bit) { + return; + } + assert(exclusive_end_bit > 0); + + if (GetAndSet(start_bit) == 0) { + uint32_t new_useful_bytes = (exclusive_end_bit - start_bit) + << bytes_per_bit_pow_; + RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES, + new_useful_bytes); + } + } + + Statistics* GetStatistics() { + return statistics_.load(std::memory_order_relaxed); + } + + void SetStatistics(Statistics* stats) { statistics_.store(stats); } + + uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; } + + size_t ApproximateMemoryUsage() const { +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + return malloc_usable_size((void*)this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return sizeof(*this); + } + + private: + // Get the current value of bit at `bit_idx` and set it to 1 + inline bool GetAndSet(uint32_t bit_idx) { + const uint32_t byte_idx = bit_idx / kBitsPerEntry; + const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry); + + return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) & + bit_mask; + } + + const uint32_t kBytesPersEntry = sizeof(uint32_t); // 4 bytes + const uint32_t kBitsPerEntry = kBytesPersEntry * 8; // 32 bits + + // Bitmap used to record the bytes that we read, use atomic to protect + // against multiple threads updating the same bit + std::atomic* bitmap_; + // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use power of 2 to optimize + // muliplication and division + uint8_t bytes_per_bit_pow_; + // Pointer to DB Statistics object, Since this bitmap may outlive the DB + // this pointer maybe invalid, but the DB will update it to a valid pointer + // by using SetStatistics() before calling Mark() + std::atomic statistics_; + uint32_t rnd_; +}; + +// class Block is the uncompressed and "parsed" form for blocks containing +// key-value pairs. (See BlockContents comments for more on terminology.) 
+// This includes the in-memory representation of data blocks, index blocks +// (including partitions), range deletion blocks, properties blocks, metaindex +// blocks, as well as the top level of the partitioned filter structure (which +// is actually an index of the filter partitions). It is NOT suitable for +// compressed blocks in general, filter blocks/partitions, or compression +// dictionaries. +// +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details of the format and the various block types. +// +// TODO: Rename to ParsedKvBlock? +class Block { + public: + // Initialize the block with the specified contents. + explicit Block(BlockContents&& contents, size_t read_amp_bytes_per_bit = 0, + Statistics* statistics = nullptr); + // No copying allowed + Block(const Block&) = delete; + void operator=(const Block&) = delete; + + ~Block(); + + size_t size() const { return size_; } + const char* data() const { return data_; } + // The additional memory space taken by the block data. + size_t usable_size() const { return contents_.usable_size(); } + uint32_t NumRestarts() const; + bool own_bytes() const { return contents_.own_bytes(); } + + BlockBasedTableOptions::DataBlockIndexType IndexType() const; + + // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key + // comparator. + // + // If iter is null, return new Iterator + // If iter is not null, update this one and return it as Iterator* + // + // Updates read_amp_bitmap_ if it is not nullptr. + // + // If `block_contents_pinned` is true, the caller will guarantee that when + // the cleanup functions are transferred from the iterator to other + // classes, e.g. PinnableSlice, the pointer to the bytes will still be + // valid. Either the iterator holds cache handle or ownership of some resource + // and release them in a release function, or caller is sure that the data + // will not go away (for example, it's from mmapped file which will not be + // closed). + // + // NOTE: for the hash based lookup, if a key prefix doesn't match any key, + // the iterator will simply be set as "invalid", rather than returning + // the key that is just pass the target key. + DataBlockIter* NewDataIterator(const Comparator* raw_ucmp, + SequenceNumber global_seqno, + DataBlockIter* iter = nullptr, + Statistics* stats = nullptr, + bool block_contents_pinned = false); + + // Returns an MetaBlockIter for iterating over blocks containing metadata + // (like Properties blocks). Unlike data blocks, the keys for these blocks + // do not contain sequence numbers, do not use a user-define comparator, and + // do not track read amplification/statistics. Additionally, MetaBlocks will + // not assert if the block is formatted improperly. + // + // If `block_contents_pinned` is true, the caller will guarantee that when + // the cleanup functions are transferred from the iterator to other + // classes, e.g. PinnableSlice, the pointer to the bytes will still be + // valid. Either the iterator holds cache handle or ownership of some resource + // and release them in a release function, or caller is sure that the data + // will not go away (for example, it's from mmapped file which will not be + // closed). + MetaBlockIter* NewMetaIterator(bool block_contents_pinned = false); + + // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key + // comparator. + // + // key_includes_seq, default true, means that the keys are in internal key + // format. 
+ // value_is_full, default true, means that no delta encoding is + // applied to values. + // + // If `prefix_index` is not nullptr this block will do hash lookup for the key + // prefix. If total_order_seek is true, prefix_index_ is ignored. + // + // `have_first_key` controls whether IndexValue will contain + // first_internal_key. It affects data serialization format, so the same value + // have_first_key must be used when writing and reading index. + // It is determined by IndexType property of the table. + IndexBlockIter* NewIndexIterator(const Comparator* raw_ucmp, + SequenceNumber global_seqno, + IndexBlockIter* iter, Statistics* stats, + bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, + bool block_contents_pinned = false, + BlockPrefixIndex* prefix_index = nullptr); + + // Report an approximation of how much memory has been used. + size_t ApproximateMemoryUsage() const; + + // For TypedCacheInterface + const Slice& ContentSlice() const { return contents_.data; } + + // Initializes per key-value checksum protection. + // After this method is called, each DataBlockIterator returned + // by NewDataIterator will verify per key-value checksum for any key it read. + void InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key, + const Comparator* raw_ucmp); + + // Initializes per key-value checksum protection. + // After this method is called, each IndexBlockIterator returned + // by NewIndexIterator will verify per key-value checksum for any key it read. + // value_is_full and index_has_first_key are needed to be able to parse + // the index block content and construct checksums. + void InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key, + const Comparator* raw_ucmp, + bool value_is_full, + bool index_has_first_key); + + // Initializes per key-value checksum protection. + // After this method is called, each MetaBlockIter returned + // by NewMetaIterator will verify per key-value checksum for any key it read. + void InitializeMetaIndexBlockProtectionInfo(uint8_t protection_bytes_per_key); + + static void GenerateKVChecksum(char* checksum_ptr, uint8_t checksum_len, + const Slice& key, const Slice& value) { + ProtectionInfo64().ProtectKV(key, value).Encode(checksum_len, checksum_ptr); + } + + const char* TEST_GetKVChecksum() const { return kv_checksum_; } + + private: + BlockContents contents_; + const char* data_; // contents_.data.data() + size_t size_; // contents_.data.size() + uint32_t restart_offset_; // Offset in data_ of restart array + uint32_t num_restarts_; + std::unique_ptr read_amp_bitmap_; + char* kv_checksum_{nullptr}; + uint32_t checksum_size_{0}; + // Used by block iterators to calculate current key index within a block + uint32_t block_restart_interval_{0}; + uint8_t protection_bytes_per_key_{0}; + DataBlockHashIndex data_block_hash_index_; +}; + +// A `BlockIter` iterates over the entries in a `Block`'s data buffer. The +// format of this data buffer is an uncompressed, sorted sequence of key-value +// pairs (see `Block` API for more details). +// +// Notably, the keys may either be in internal key format or user key format. +// Subclasses are responsible for configuring the key format. +// +// `BlockIter` intends to provide final overrides for all of +// `InternalIteratorBase` functions that can move the iterator. It does +// this to guarantee `UpdateKey()` is called exactly once after each key +// movement potentially visible to users. 
In this step, the key is prepared +// (e.g., serialized if global seqno is in effect) so it can be returned +// immediately when the user asks for it via calling `key() const`. +// +// For its subclasses, it provides protected variants of the above-mentioned +// final-overridden methods. They are named with the "Impl" suffix, e.g., +// `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These +// "Impl" functions are responsible for positioning `raw_key_` but not +// invoking `UpdateKey()`. +// +// Per key-value checksum is enabled if relevant states are passed in during +// `InitializeBase()`. The checksum verification is done in each call to +// UpdateKey() for the current key. Each subclass is responsible for keeping +// track of cur_entry_idx_, the index of the current key within the block. +// BlockIter uses this index to get the corresponding checksum for current key. +// Additional checksum verification may be done in subclasses if they read keys +// other than the key being processed in UpdateKey(). +template +class BlockIter : public InternalIteratorBase { + public: + // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do + // nothing. Calls cleanup functions. + virtual void Invalidate(const Status& s) { + // Assert that the BlockIter is never deleted while Pinning is Enabled. + assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled()); + + data_ = nullptr; + current_ = restarts_; + status_ = s; + + // Call cleanup callbacks. + Cleanable::Reset(); + } + + bool Valid() const override { + // When status_ is not ok, iter should be invalid. + assert(status_.ok() || current_ >= restarts_); + return current_ < restarts_; + } + + virtual void SeekToFirst() override final { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return; +#endif + SeekToFirstImpl(); + UpdateKey(); + } + + virtual void SeekToLast() override final { + SeekToLastImpl(); + UpdateKey(); + } + + virtual void Seek(const Slice& target) override final { + SeekImpl(target); + UpdateKey(); + } + + virtual void SeekForPrev(const Slice& target) override final { + SeekForPrevImpl(target); + UpdateKey(); + } + + virtual void Next() override final { + NextImpl(); + UpdateKey(); + } + + virtual bool NextAndGetResult(IterateResult* result) override final { + // This does not need to call `UpdateKey()` as the parent class only has + // access to the `UpdateKey()`-invoking functions. + return InternalIteratorBase::NextAndGetResult(result); + } + + virtual void Prev() override final { + PrevImpl(); + UpdateKey(); + } + + Status status() const override { return status_; } + + Slice key() const override { + assert(Valid()); + return key_; + } + +#ifndef NDEBUG + ~BlockIter() override { + // Assert that the BlockIter is never deleted while Pinning is Enabled. 
+ assert(!pinned_iters_mgr_ || + (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + status_.PermitUncheckedError(); + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + + PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; + + bool TEST_Corrupt_Callback(const std::string& sync_point) { + bool corrupt = false; + TEST_SYNC_POINT_CALLBACK(sync_point, static_cast(&corrupt)); + + if (corrupt) { + CorruptionError(); + } + return corrupt; + } +#endif + + bool IsKeyPinned() const override { + return block_contents_pinned_ && key_pinned_; + } + + bool IsValuePinned() const override { return block_contents_pinned_; } + + size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; } + + uint32_t ValueOffset() const { + return static_cast(value_.data() - data_); + } + + void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; } + + Cache::Handle* cache_handle() { return cache_handle_; } + + protected: + std::unique_ptr icmp_; + const char* data_; // underlying block contents + uint32_t num_restarts_; // Number of uint32_t entries in restart array + + // Index of restart block in which current_ or current_-1 falls + uint32_t restart_index_; + uint32_t restarts_; // Offset of restart array (list of fixed32) + // current_ is offset in data_ of current entry. >= restarts_ if !Valid + uint32_t current_; + // Raw key from block. + IterKey raw_key_; + // Buffer for key data when global seqno assignment is enabled. + IterKey key_buf_; + Slice value_; + Status status_; + // Key to be exposed to users. + Slice key_; + SequenceNumber global_seqno_; + + // Per key-value checksum related states + const char* kv_checksum_; + int32_t cur_entry_idx_; + uint32_t block_restart_interval_; + uint8_t protection_bytes_per_key_; + + bool key_pinned_; + // Whether the block data is guaranteed to outlive this iterator, and + // as long as the cleanup functions are transferred to another class, + // e.g. PinnableSlice, the pointer to the bytes will still be valid. + bool block_contents_pinned_; + + virtual void SeekToFirstImpl() = 0; + virtual void SeekToLastImpl() = 0; + virtual void SeekImpl(const Slice& target) = 0; + virtual void SeekForPrevImpl(const Slice& target) = 0; + virtual void NextImpl() = 0; + virtual void PrevImpl() = 0; + + // Returns the restart interval of this block. + // Returns 0 if num_restarts_ <= 1 or if the BlockIter is not initialized. + virtual uint32_t GetRestartInterval() { + if (num_restarts_ <= 1 || data_ == nullptr) { + return 0; + } + SeekToFirstImpl(); + uint32_t end_index = GetRestartPoint(1); + uint32_t count = 1; + while (NextEntryOffset() < end_index && status_.ok()) { + assert(Valid()); + NextImpl(); + ++count; + } + return count; + } + + // Returns the number of keys in this block. + virtual uint32_t NumberOfKeys(uint32_t block_restart_interval) { + if (num_restarts_ == 0 || data_ == nullptr) { + return 0; + } + uint32_t count = (num_restarts_ - 1) * block_restart_interval; + // Add number of keys from the last restart interval + SeekToRestartPoint(num_restarts_ - 1); + while (NextEntryOffset() < restarts_ && status_.ok()) { + NextImpl(); + ++count; + } + return count; + } + + // Stores whether the current key has a shared bytes with prev key in + // *is_shared. + // Sets raw_key_, value_ to the current parsed key and value. + // Sets restart_index_ to point to the restart interval that contains + // the current key. 
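+  // (DecodeEntryFunc is a decoder functor type: different instantiations
+  // parse plain entries vs. delta-encoded index entries.)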
+ template + inline bool ParseNextKey(bool* is_shared); + + // protection_bytes_per_key, kv_checksum, and block_restart_interval + // are needed only for per kv checksum verification. + void InitializeBase(const Comparator* raw_ucmp, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, bool block_contents_pinned, + uint8_t protection_bytes_per_key, const char* kv_checksum, + uint32_t block_restart_interval) { + assert(data_ == nullptr); // Ensure it is called only once + assert(num_restarts > 0); // Ensure the param is valid + + icmp_ = std::make_unique(raw_ucmp); + data_ = data; + restarts_ = restarts; + num_restarts_ = num_restarts; + current_ = restarts_; + restart_index_ = num_restarts_; + global_seqno_ = global_seqno; + block_contents_pinned_ = block_contents_pinned; + cache_handle_ = nullptr; + cur_entry_idx_ = -1; + protection_bytes_per_key_ = protection_bytes_per_key; + kv_checksum_ = kv_checksum; + block_restart_interval_ = block_restart_interval; + // Checksum related states are either all 0/nullptr or all non-zero. + // One exception is when num_restarts == 0, block_restart_interval can be 0 + // since we are not able to compute it. + assert((protection_bytes_per_key == 0 && kv_checksum == nullptr) || + (protection_bytes_per_key > 0 && kv_checksum != nullptr && + (block_restart_interval > 0 || num_restarts == 1))); + } + + void CorruptionError(const std::string& error_msg = "bad entry in block") { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption(error_msg); + raw_key_.Clear(); + value_.clear(); + } + + void PerKVChecksumCorruptionError() { + std::string error_msg{ + "Corrupted block entry: per key-value checksum verification " + "failed."}; + error_msg.append(" Offset: " + std::to_string(current_) + "."); + error_msg.append(" Entry index: " + std::to_string(cur_entry_idx_) + "."); + CorruptionError(error_msg); + } + + // Must be called every time a key is found that needs to be returned to user, + // and may be called when no key is found (as a no-op). Updates `key_`, + // `key_buf_`, and `key_pinned_` with info about the found key. + // Per key-value checksum verification is done if available for the key to be + // returned. Iterator is invalidated with corruption status if checksum + // verification fails. + void UpdateKey() { + key_buf_.Clear(); + if (!Valid()) { + return; + } + if (raw_key_.IsUserKey()) { + assert(global_seqno_ == kDisableGlobalSequenceNumber); + key_ = raw_key_.GetUserKey(); + key_pinned_ = raw_key_.IsKeyPinned(); + } else if (global_seqno_ == kDisableGlobalSequenceNumber) { + key_ = raw_key_.GetInternalKey(); + key_pinned_ = raw_key_.IsKeyPinned(); + } else { + key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_, + ExtractValueType(raw_key_.GetInternalKey())); + key_ = key_buf_.GetInternalKey(); + key_pinned_ = false; + } + TEST_SYNC_POINT_CALLBACK("BlockIter::UpdateKey::value", + (void*)value_.data()); + TEST_SYNC_POINT_CALLBACK("Block::VerifyChecksum::checksum_len", + &protection_bytes_per_key_); + if (protection_bytes_per_key_ > 0) { + if (!ProtectionInfo64() + .ProtectKV(raw_key_.GetKey(), value_) + .Verify( + protection_bytes_per_key_, + kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_)) { + PerKVChecksumCorruptionError(); + } + } + } + + // Returns the result of `Comparator::Compare()`, where the appropriate + // comparator is used for the block contents, the LHS argument is the current + // key with global seqno applied, and the RHS argument is `other`. 
+ int CompareCurrentKey(const Slice& other) { + if (raw_key_.IsUserKey()) { + assert(global_seqno_ == kDisableGlobalSequenceNumber); + return icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), other); + } else if (global_seqno_ == kDisableGlobalSequenceNumber) { + return icmp_->Compare(raw_key_.GetInternalKey(), other); + } + return icmp_->Compare(raw_key_.GetInternalKey(), global_seqno_, other, + kDisableGlobalSequenceNumber); + } + + private: + // Store the cache handle, if the block is cached. We need this since the + // only other place the handle is stored is as an argument to the Cleanable + // function callback, which is hard to retrieve. When multiple value + // PinnableSlices reference the block, they need the cache handle in order + // to bump up the ref count + Cache::Handle* cache_handle_; + + public: + // Return the offset in data_ just past the end of the current entry. + inline uint32_t NextEntryOffset() const { + // NOTE: We don't support blocks bigger than 2GB + return static_cast((value_.data() + value_.size()) - data_); + } + + uint32_t GetRestartPoint(uint32_t index) const { + assert(index < num_restarts_); + return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); + } + + void SeekToRestartPoint(uint32_t index) { + raw_key_.Clear(); + restart_index_ = index; + // current_ will be fixed by ParseNextKey(); + + // ParseNextKey() starts at the end of value_, so set value_ accordingly + uint32_t offset = GetRestartPoint(index); + value_ = Slice(data_ + offset, 0); + } + + protected: + template + inline bool BinarySeek(const Slice& target, uint32_t* index, + bool* is_index_key_result); + + // Find the first key in restart interval `index` that is >= `target`. + // If there is no such key, iterator is positioned at the first key in + // restart interval `index + 1`. + // If is_index_key_result is true, it positions the iterator at the first key + // in this restart interval. + // Per key-value checksum verification is done for all keys scanned + // up to but not including the last key (the key that current_ points to + // when this function returns). This key's checksum is verified in + // UpdateKey(). + void FindKeyAfterBinarySeek(const Slice& target, uint32_t index, + bool is_index_key_result); +}; + +class DataBlockIter final : public BlockIter { + public: + DataBlockIter() + : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} + void Initialize(const Comparator* raw_ucmp, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, + BlockReadAmpBitmap* read_amp_bitmap, + bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index, + uint8_t protection_bytes_per_key, const char* kv_checksum, + uint32_t block_restart_interval) { + InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno, + block_contents_pinned, protection_bytes_per_key, kv_checksum, + block_restart_interval); + raw_key_.SetIsUserKey(false); + read_amp_bitmap_ = read_amp_bitmap; + last_bitmap_offset_ = current_ + 1; + data_block_hash_index_ = data_block_hash_index; + } + + Slice value() const override { + assert(Valid()); + if (read_amp_bitmap_ && current_ < restarts_ && + current_ != last_bitmap_offset_) { + read_amp_bitmap_->Mark(current_ /* current entry offset */, + NextEntryOffset() - 1); + last_bitmap_offset_ = current_; + } + return value_; + } + + // Returns if `target` may exist. 
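+  // Illustrative usage (hypothetical caller, not upstream code):
+  //
+  //   DataBlockIter* it = block->NewDataIterator(ucmp, seqno);
+  //   if (it->SeekForGet(key)) {   // key may exist; iterator is positioned
+  //     if (it->Valid()) { /* inspect it->key() / it->value() */ }
+  //   } else {
+  //     // hash index proved `key` is absent from this data block
+  //   }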
+ inline bool SeekForGet(const Slice& target) { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("DataBlockIter::SeekForGet")) return true; +#endif + if (!data_block_hash_index_) { + SeekImpl(target); + UpdateKey(); + return true; + } + bool res = SeekForGetImpl(target); + UpdateKey(); + return res; + } + + void Invalidate(const Status& s) override { + BlockIter::Invalidate(s); + // Clear prev entries cache. + prev_entries_keys_buff_.clear(); + prev_entries_.clear(); + prev_entries_idx_ = -1; + } + + protected: + friend Block; + inline bool ParseNextDataKey(bool* is_shared); + void SeekToFirstImpl() override; + void SeekToLastImpl() override; + void SeekImpl(const Slice& target) override; + void SeekForPrevImpl(const Slice& target) override; + void NextImpl() override; + void PrevImpl() override; + + private: + // read-amp bitmap + BlockReadAmpBitmap* read_amp_bitmap_; + // last `current_` value we report to read-amp bitmp + mutable uint32_t last_bitmap_offset_; + struct CachedPrevEntry { + explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr, + size_t _key_offset, size_t _key_size, Slice _value) + : offset(_offset), + key_ptr(_key_ptr), + key_offset(_key_offset), + key_size(_key_size), + value(_value) {} + + // offset of entry in block + uint32_t offset; + // Pointer to key data in block (nullptr if key is delta-encoded) + const char* key_ptr; + // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr) + size_t key_offset; + // size of key + size_t key_size; + // value slice pointing to data in block + Slice value; + }; + std::string prev_entries_keys_buff_; + std::vector prev_entries_; + int32_t prev_entries_idx_ = -1; + + DataBlockHashIndex* data_block_hash_index_; + + bool SeekForGetImpl(const Slice& target); +}; + +// Iterator over MetaBlocks. MetaBlocks are similar to Data Blocks and +// are used to store Properties associated with table. +// Meta blocks always store user keys (no sequence number) and always +// use the BytewiseComparator. Additionally, MetaBlock accesses are +// not recorded in the Statistics or for Read-Amplification. +class MetaBlockIter final : public BlockIter { + public: + MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); } + void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts, + bool block_contents_pinned, uint8_t protection_bytes_per_key, + const char* kv_checksum, uint32_t block_restart_interval) { + // Initializes the iterator with a BytewiseComparator and + // the raw key being a user key. + InitializeBase(BytewiseComparator(), data, restarts, num_restarts, + kDisableGlobalSequenceNumber, block_contents_pinned, + protection_bytes_per_key, kv_checksum, + block_restart_interval); + raw_key_.SetIsUserKey(true); + } + + Slice value() const override { + assert(Valid()); + return value_; + } + + protected: + friend Block; + void SeekToFirstImpl() override; + void SeekToLastImpl() override; + void SeekImpl(const Slice& target) override; + void SeekForPrevImpl(const Slice& target) override; + void NextImpl() override; + void PrevImpl() override; + // Meta index block's restart interval is always 1. See + // MetaIndexBuilder::MetaIndexBuilder() for hard-coded restart interval. 
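+  // With a restart interval of 1, every key is its own restart point, so
+  // NumberOfKeys() below can simply return num_restarts_.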
+ uint32_t GetRestartInterval() override { return 1; } + uint32_t NumberOfKeys(uint32_t) override { return num_restarts_; } +}; + +class IndexBlockIter final : public BlockIter { + public: + IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} + + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. + void Initialize(const Comparator* raw_ucmp, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, + bool have_first_key, bool key_includes_seq, + bool value_is_full, bool block_contents_pinned, + uint8_t protection_bytes_per_key, const char* kv_checksum, + uint32_t block_restart_interval) { + InitializeBase(raw_ucmp, data, restarts, num_restarts, + kDisableGlobalSequenceNumber, block_contents_pinned, + protection_bytes_per_key, kv_checksum, + block_restart_interval); + raw_key_.SetIsUserKey(!key_includes_seq); + prefix_index_ = prefix_index; + value_delta_encoded_ = !value_is_full; + have_first_key_ = have_first_key; + if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) { + global_seqno_state_.reset(new GlobalSeqnoState(global_seqno)); + } else { + global_seqno_state_.reset(); + } + } + + Slice user_key() const override { + assert(Valid()); + return raw_key_.GetUserKey(); + } + + IndexValue value() const override { + assert(Valid()); + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { + return decoded_value_; + } else { + IndexValue entry; + Slice v = value_; + Status decode_s __attribute__((__unused__)) = + entry.DecodeFrom(&v, have_first_key_, nullptr); + assert(decode_s.ok()); + return entry; + } + } + + Slice raw_value() const { + assert(Valid()); + return value_; + } + + bool IsValuePinned() const override { + return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); + } + + protected: + friend Block; + // IndexBlockIter follows a different contract for prefix iterator + // from data iterators. + // If prefix of the seek key `target` exists in the file, it must + // return the same result as total order seek. + // If the prefix of `target` doesn't exist in the file, it can either + // return the result of total order seek, or set both of Valid() = false + // and status() = NotFound(). + void SeekImpl(const Slice& target) override; + + void SeekForPrevImpl(const Slice&) override { + assert(false); + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::InvalidArgument( + "RocksDB internal error: should never call SeekForPrev() on index " + "blocks"); + raw_key_.Clear(); + value_.clear(); + } + + void PrevImpl() override; + void NextImpl() override; + void SeekToFirstImpl() override; + void SeekToLastImpl() override; + + private: + bool value_delta_encoded_; + bool have_first_key_; // value includes first_internal_key + BlockPrefixIndex* prefix_index_; + // Whether the value is delta encoded. In that case the value is assumed to be + // BlockHandle. The first value in each restart interval is the full encoded + // BlockHandle; the restart of encoded size part of the BlockHandle. The + // offset of delta encoded BlockHandles is computed by adding the size of + // previous delta encoded values in the same restart interval to the offset of + // the first value in that restart interval. 
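+  // Illustrative reading of the scheme above: within a restart interval,
+  // only the first BlockHandle is stored in full; each later entry stores a
+  // size delta, and its offset is reconstructed from the first entry's
+  // offset plus the sizes already decoded in that interval.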
+ IndexValue decoded_value_; + + // When sequence number overwriting is enabled, this struct contains the seqno + // to overwrite with, and current first_internal_key with overwritten seqno. + // This is rarely used, so we put it behind a pointer and only allocate when + // needed. + struct GlobalSeqnoState { + // First internal key according to current index entry, but with sequence + // number overwritten to global_seqno. + IterKey first_internal_key; + SequenceNumber global_seqno; + + explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {} + }; + + std::unique_ptr global_seqno_state_; + + // Set *prefix_may_exist to false if no key possibly share the same prefix + // as `target`. If not set, the result position should be the same as total + // order Seek. + bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist); + // Set *prefix_may_exist to false if no key can possibly share the same + // prefix as `target`. If not set, the result position should be the same + // as total order seek. + bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, + uint32_t left, uint32_t right, uint32_t* index, + bool* prefix_may_exist); + inline int CompareBlockKey(uint32_t block_index, const Slice& target); + + inline bool ParseNextIndexKey(); + + // When value_delta_encoded_ is enabled it decodes the value which is assumed + // to be BlockHandle and put it to decoded_value_ + inline void DecodeCurrentValue(bool is_shared); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_builder.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_builder.cc new file mode 100644 index 0000000000..ee8865a50d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_builder.cc @@ -0,0 +1,2027 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include "table/block_based/block_based_table_builder.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <atomic>
+#include <list>
+#include <map>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "block_cache.h"
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_helpers.h"
+#include "cache/cache_key.h"
+#include "cache/cache_reservation_manager.h"
+#include "db/dbformat.h"
+#include "index_builder.h"
+#include "logging/logging.h"
+#include "memory/memory_allocator_impl.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/table.h"
+#include "rocksdb/types.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/full_filter_block.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/table_builder.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/work_queue.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const std::string kHashIndexPrefixesBlock;
+extern const std::string kHashIndexPrefixesMetadataBlock;
+
+// Without anonymous namespace here, we fail the warning -Wmissing-prototypes
+namespace {
+
+constexpr size_t kBlockTrailerSize = BlockBasedTable::kBlockTrailerSize;
+
+// Create a filter block builder based on its type.
+FilterBlockBuilder* CreateFilterBlockBuilder(
+    const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt,
+    const FilterBuildingContext& context,
+    const bool use_delta_encoding_for_index_values,
+    PartitionedIndexBuilder* const p_index_builder) {
+  const BlockBasedTableOptions& table_opt = context.table_options;
+  assert(table_opt.filter_policy);  // precondition
+
+  FilterBitsBuilder* filter_bits_builder =
+      BloomFilterPolicy::GetBuilderFromContext(context);
+  if (filter_bits_builder == nullptr) {
+    return nullptr;
+  } else {
+    if (table_opt.partition_filters) {
+      assert(p_index_builder != nullptr);
+      // Since it takes time from the filter builder's partition-cut request
+      // until the index builder actually cuts the partition (at the end of a
+      // data block, potentially with many keys), we take the lower bound as
+      // the partition size.
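+      // Worked example (illustrative values): with metadata_block_size = 4096
+      // and block_size_deviation = 10, the lower bound is
+      // ceil(4096 * 90 / 100) = 3687; the "+ 99) / 100" below implements
+      // that ceiling in integer arithmetic.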
+      assert(table_opt.block_size_deviation <= 100);
+      auto partition_size =
+          static_cast<uint32_t>(((table_opt.metadata_block_size *
+                                  (100 - table_opt.block_size_deviation)) +
+                                 99) /
+                                100);
+      partition_size = std::max(partition_size, static_cast<uint32_t>(1));
+      return new PartitionedFilterBlockBuilder(
+          mopt.prefix_extractor.get(), table_opt.whole_key_filtering,
+          filter_bits_builder, table_opt.index_block_restart_interval,
+          use_delta_encoding_for_index_values, p_index_builder, partition_size);
+    } else {
+      return new FullFilterBlockBuilder(mopt.prefix_extractor.get(),
+                                        table_opt.whole_key_filtering,
+                                        filter_bits_builder);
+    }
+  }
+}
+
+bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size,
+                          int max_compressed_bytes_per_kb) {
+  // For efficiency, avoid floating point and division
+  return compressed_size <=
+         (static_cast<uint64_t>(max_compressed_bytes_per_kb) * uncomp_size) >>
+             10;
+}
+
+}  // namespace
+
+// format_version is the block format as defined in include/rocksdb/table.h
+Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
+                    CompressionType* type, uint32_t format_version,
+                    bool allow_sample, std::string* compressed_output,
+                    std::string* sampled_output_fast,
+                    std::string* sampled_output_slow) {
+  assert(type);
+  assert(compressed_output);
+  assert(compressed_output->empty());
+
+  // If requested, we sample one in every N blocks with both a
+  // fast and a slow compression algorithm and report the stats.
+  // Users can use these stats to decide whether it is worthwhile
+  // to enable compression, and they also get a hint about which
+  // compression algorithm will be beneficial.
+  if (allow_sample && info.SampleForCompression() &&
+      Random::GetTLSInstance()->OneIn(
+          static_cast<int>(info.SampleForCompression()))) {
+    // Sampling with a fast compression algorithm
+    if (sampled_output_fast && (LZ4_Supported() || Snappy_Supported())) {
+      CompressionType c =
+          LZ4_Supported() ? kLZ4Compression : kSnappyCompression;
+      CompressionContext context(c);
+      CompressionOptions options;
+      CompressionInfo info_tmp(options, context,
+                               CompressionDict::GetEmptyDict(), c,
+                               info.SampleForCompression());
+
+      CompressData(uncompressed_data, info_tmp,
+                   GetCompressFormatForVersion(format_version),
+                   sampled_output_fast);
+    }
+
+    // Sampling with a slow but high-compression algorithm
+    if (sampled_output_slow && (ZSTD_Supported() || Zlib_Supported())) {
+      CompressionType c = ZSTD_Supported() ?
kZSTD : kZlibCompression; + CompressionContext context(c); + CompressionOptions options; + CompressionInfo info_tmp(options, context, + CompressionDict::GetEmptyDict(), c, + info.SampleForCompression()); + + CompressData(uncompressed_data, info_tmp, + GetCompressFormatForVersion(format_version), + sampled_output_slow); + } + } + + int max_compressed_bytes_per_kb = info.options().max_compressed_bytes_per_kb; + if (info.type() == kNoCompression || max_compressed_bytes_per_kb <= 0) { + *type = kNoCompression; + return uncompressed_data; + } + + // Actually compress the data; if the compression method is not supported, + // or the compression fails etc., just fall back to uncompressed + if (!CompressData(uncompressed_data, info, + GetCompressFormatForVersion(format_version), + compressed_output)) { + *type = kNoCompression; + return uncompressed_data; + } + + // Check the compression ratio; if it's not good enough, just fall back to + // uncompressed + if (!GoodCompressionRatio(compressed_output->size(), uncompressed_data.size(), + max_compressed_bytes_per_kb)) { + *type = kNoCompression; + return uncompressed_data; + } + + *type = info.type(); + return *compressed_output; +} + +// kBlockBasedTableMagicNumber was picked by running +// echo rocksdb.table.block_based | sha1sum +// and taking the leading 64 bits. +// Please note that kBlockBasedTableMagicNumber may also be accessed by other +// .cc files +// for that reason we declare it extern in the header but to get the space +// allocated +// it must be not extern in one place. +const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull; +// We also support reading and writing legacy block based table format (for +// backwards compatibility) +const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull; + +// A collector that collects properties of interest to block-based table. +// For now this class looks heavy-weight since we only write one additional +// property. +// But in the foreseeable future, we will add more and more properties that are +// specific to block-based table. +class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector + : public IntTblPropCollector { + public: + explicit BlockBasedTablePropertiesCollector( + BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering, + bool prefix_filtering) + : index_type_(index_type), + whole_key_filtering_(whole_key_filtering), + prefix_filtering_(prefix_filtering) {} + + Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/, + uint64_t /*file_size*/) override { + // Intentionally left blank. Have no interest in collecting stats for + // individual key/value pairs. + return Status::OK(); + } + + virtual void BlockAdd(uint64_t /* block_uncomp_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { + // Intentionally left blank. No interest in collecting stats for + // blocks. + return; + } + + Status Finish(UserCollectedProperties* properties) override { + std::string val; + PutFixed32(&val, static_cast(index_type_)); + properties->insert({BlockBasedTablePropertyNames::kIndexType, val}); + properties->insert({BlockBasedTablePropertyNames::kWholeKeyFiltering, + whole_key_filtering_ ? kPropTrue : kPropFalse}); + properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering, + prefix_filtering_ ? kPropTrue : kPropFalse}); + return Status::OK(); + } + + // The name of the properties collector can be used for debugging purpose. 
+ const char* Name() const override { + return "BlockBasedTablePropertiesCollector"; + } + + UserCollectedProperties GetReadableProperties() const override { + // Intentionally left blank. + return UserCollectedProperties(); + } + + private: + BlockBasedTableOptions::IndexType index_type_; + bool whole_key_filtering_; + bool prefix_filtering_; +}; + +struct BlockBasedTableBuilder::Rep { + const ImmutableOptions ioptions; + const MutableCFOptions moptions; + const BlockBasedTableOptions table_options; + const InternalKeyComparator& internal_comparator; + WritableFileWriter* file; + std::atomic offset; + size_t alignment; + BlockBuilder data_block; + // Buffers uncompressed data blocks to replay later. Needed when + // compression dictionary is enabled so we can finalize the dictionary before + // compressing any data blocks. + std::vector data_block_buffers; + BlockBuilder range_del_block; + + InternalKeySliceTransform internal_prefix_transform; + std::unique_ptr index_builder; + PartitionedIndexBuilder* p_index_builder_ = nullptr; + + std::string last_key; + const Slice* first_key_in_next_block = nullptr; + CompressionType compression_type; + uint64_t sample_for_compression; + std::atomic compressible_input_data_bytes; + std::atomic uncompressible_input_data_bytes; + std::atomic sampled_input_data_bytes; + std::atomic sampled_output_slow_data_bytes; + std::atomic sampled_output_fast_data_bytes; + CompressionOptions compression_opts; + std::unique_ptr compression_dict; + std::vector> compression_ctxs; + std::vector> verify_ctxs; + std::unique_ptr verify_dict; + + size_t data_begin_offset = 0; + + TableProperties props; + + // States of the builder. + // + // - `kBuffered`: This is the initial state where zero or more data blocks are + // accumulated uncompressed in-memory. From this state, call + // `EnterUnbuffered()` to finalize the compression dictionary if enabled, + // compress/write out any buffered blocks, and proceed to the `kUnbuffered` + // state. + // + // - `kUnbuffered`: This is the state when compression dictionary is finalized + // either because it wasn't enabled in the first place or it's been created + // from sampling previously buffered data. In this state, blocks are simply + // compressed/written out as they fill up. From this state, call `Finish()` + // to complete the file (write meta-blocks, etc.), or `Abandon()` to delete + // the partially created file. + // + // - `kClosed`: This indicates either `Finish()` or `Abandon()` has been + // called, so the table builder is no longer usable. We must be in this + // state by the time the destructor runs. + enum class State { + kBuffered, + kUnbuffered, + kClosed, + }; + State state; + // `kBuffered` state is allowed only as long as the buffering of uncompressed + // data blocks (see `data_block_buffers`) does not exceed `buffer_limit`. + uint64_t buffer_limit; + std::shared_ptr + compression_dict_buffer_cache_res_mgr; + const bool use_delta_encoding_for_index_values; + std::unique_ptr filter_builder; + OffsetableCacheKey base_cache_key; + const TableFileCreationReason reason; + + BlockHandle pending_handle; // Handle to add to index block + + std::string compressed_output; + std::unique_ptr flush_block_policy; + + std::vector> table_properties_collectors; + + std::unique_ptr pc_rep; + BlockCreateContext create_context; + + // The size of the "tail" part of a SST file. "Tail" refers to + // all blocks after data blocks till the end of the SST file. 
+ uint64_t tail_size; + + uint64_t get_offset() { return offset.load(std::memory_order_relaxed); } + void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); } + + bool IsParallelCompressionEnabled() const { + return compression_opts.parallel_threads > 1; + } + + Status GetStatus() { + // We need to make modifications of status visible when status_ok is set + // to false, and this is ensured by status_mutex, so no special memory + // order for status_ok is required. + if (status_ok.load(std::memory_order_relaxed)) { + return Status::OK(); + } else { + return CopyStatus(); + } + } + + Status CopyStatus() { + std::lock_guard lock(status_mutex); + return status; + } + + IOStatus GetIOStatus() { + // We need to make modifications of io_status visible when status_ok is set + // to false, and this is ensured by io_status_mutex, so no special memory + // order for io_status_ok is required. + if (io_status_ok.load(std::memory_order_relaxed)) { + return IOStatus::OK(); + } else { + return CopyIOStatus(); + } + } + + IOStatus CopyIOStatus() { + std::lock_guard lock(io_status_mutex); + return io_status; + } + + // Never erase an existing status that is not OK. + void SetStatus(Status s) { + if (!s.ok() && status_ok.load(std::memory_order_relaxed)) { + // Locking is an overkill for non compression_opts.parallel_threads + // case but since it's unlikely that s is not OK, we take this cost + // to be simplicity. + std::lock_guard lock(status_mutex); + status = s; + status_ok.store(false, std::memory_order_relaxed); + } + } + + // Never erase an existing I/O status that is not OK. + // Calling this will also SetStatus(ios) + void SetIOStatus(IOStatus ios) { + if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) { + // Locking is an overkill for non compression_opts.parallel_threads + // case but since it's unlikely that s is not OK, we take this cost + // to be simplicity. + std::lock_guard lock(io_status_mutex); + io_status = ios; + io_status_ok.store(false, std::memory_order_relaxed); + } + SetStatus(ios); + } + + Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo, + WritableFileWriter* f) + : ioptions(tbo.ioptions), + moptions(tbo.moptions), + table_options(table_opt), + internal_comparator(tbo.internal_comparator), + file(f), + offset(0), + alignment(table_options.block_align + ? std::min(static_cast(table_options.block_size), + kDefaultPageSize) + : 0), + data_block(table_options.block_restart_interval, + table_options.use_delta_encoding, + false /* use_value_delta_encoding */, + tbo.internal_comparator.user_comparator() + ->CanKeysWithDifferentByteContentsBeEqual() + ? BlockBasedTableOptions::kDataBlockBinarySearch + : table_options.data_block_index_type, + table_options.data_block_hash_table_util_ratio), + range_del_block(1 /* block_restart_interval */), + internal_prefix_transform(tbo.moptions.prefix_extractor.get()), + compression_type(tbo.compression_type), + sample_for_compression(tbo.moptions.sample_for_compression), + compressible_input_data_bytes(0), + uncompressible_input_data_bytes(0), + sampled_input_data_bytes(0), + sampled_output_slow_data_bytes(0), + sampled_output_fast_data_bytes(0), + compression_opts(tbo.compression_opts), + compression_dict(), + compression_ctxs(tbo.compression_opts.parallel_threads), + verify_ctxs(tbo.compression_opts.parallel_threads), + verify_dict(), + state((tbo.compression_opts.max_dict_bytes > 0) ? 
State::kBuffered + : State::kUnbuffered), + use_delta_encoding_for_index_values(table_opt.format_version >= 4 && + !table_opt.block_align), + reason(tbo.reason), + flush_block_policy( + table_options.flush_block_policy_factory->NewFlushBlockPolicy( + table_options, data_block)), + create_context(&table_options, ioptions.stats, + compression_type == kZSTD || + compression_type == kZSTDNotFinalCompression, + tbo.moptions.block_protection_bytes_per_key, + tbo.internal_comparator.user_comparator(), + !use_delta_encoding_for_index_values, + table_opt.index_type == + BlockBasedTableOptions::kBinarySearchWithFirstKey), + tail_size(0), + status_ok(true), + io_status_ok(true) { + if (tbo.target_file_size == 0) { + buffer_limit = compression_opts.max_dict_buffer_bytes; + } else if (compression_opts.max_dict_buffer_bytes == 0) { + buffer_limit = tbo.target_file_size; + } else { + buffer_limit = std::min(tbo.target_file_size, + compression_opts.max_dict_buffer_bytes); + } + + const auto compress_dict_build_buffer_charged = + table_options.cache_usage_options.options_overrides + .at(CacheEntryRole::kCompressionDictionaryBuildingBuffer) + .charged; + if (table_options.block_cache && + (compress_dict_build_buffer_charged == + CacheEntryRoleOptions::Decision::kEnabled || + compress_dict_build_buffer_charged == + CacheEntryRoleOptions::Decision::kFallback)) { + compression_dict_buffer_cache_res_mgr = + std::make_shared>( + table_options.block_cache); + } else { + compression_dict_buffer_cache_res_mgr = nullptr; + } + + for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { + compression_ctxs[i].reset(new CompressionContext(compression_type)); + } + if (table_options.index_type == + BlockBasedTableOptions::kTwoLevelIndexSearch) { + p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( + &internal_comparator, use_delta_encoding_for_index_values, + table_options); + index_builder.reset(p_index_builder_); + } else { + index_builder.reset(IndexBuilder::CreateIndexBuilder( + table_options.index_type, &internal_comparator, + &this->internal_prefix_transform, use_delta_encoding_for_index_values, + table_options)); + } + if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) { + // Apply optimize_filters_for_hits setting here when applicable by + // skipping filter generation + filter_builder.reset(); + } else if (tbo.skip_filters) { + // For SstFileWriter skip_filters + filter_builder.reset(); + } else if (!table_options.filter_policy) { + // Null filter_policy -> no filter + filter_builder.reset(); + } else { + FilterBuildingContext filter_context(table_options); + + filter_context.info_log = ioptions.logger; + filter_context.column_family_name = tbo.column_family_name; + filter_context.reason = reason; + + // Only populate other fields if known to be in LSM rather than + // generating external SST file + if (reason != TableFileCreationReason::kMisc) { + filter_context.compaction_style = ioptions.compaction_style; + filter_context.num_levels = ioptions.num_levels; + filter_context.level_at_creation = tbo.level_at_creation; + filter_context.is_bottommost = tbo.is_bottommost; + assert(filter_context.level_at_creation < filter_context.num_levels); + } + + filter_builder.reset(CreateFilterBlockBuilder( + ioptions, moptions, filter_context, + use_delta_encoding_for_index_values, p_index_builder_)); + } + + assert(tbo.int_tbl_prop_collector_factories); + for (auto& factory : *tbo.int_tbl_prop_collector_factories) { + assert(factory); + + table_properties_collectors.emplace_back( + 
factory->CreateIntTblPropCollector(tbo.column_family_id, + tbo.level_at_creation)); + } + table_properties_collectors.emplace_back( + new BlockBasedTablePropertiesCollector( + table_options.index_type, table_options.whole_key_filtering, + moptions.prefix_extractor != nullptr)); + const Comparator* ucmp = tbo.internal_comparator.user_comparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0) { + table_properties_collectors.emplace_back( + new TimestampTablePropertiesCollector(ucmp)); + } + if (table_options.verify_compression) { + for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { + verify_ctxs[i].reset(new UncompressionContext(compression_type)); + } + } + + // These are only needed for populating table properties + props.column_family_id = tbo.column_family_id; + props.column_family_name = tbo.column_family_name; + props.oldest_key_time = tbo.oldest_key_time; + props.file_creation_time = tbo.file_creation_time; + props.orig_file_number = tbo.cur_file_num; + props.db_id = tbo.db_id; + props.db_session_id = tbo.db_session_id; + props.db_host_id = ioptions.db_host_id; + if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) { + ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set"); + } + } + + Rep(const Rep&) = delete; + Rep& operator=(const Rep&) = delete; + + private: + // Synchronize status & io_status accesses across threads from main thread, + // compression thread and write thread in parallel compression. + std::mutex status_mutex; + std::atomic status_ok; + Status status; + std::mutex io_status_mutex; + std::atomic io_status_ok; + IOStatus io_status; +}; + +struct BlockBasedTableBuilder::ParallelCompressionRep { + // Keys is a wrapper of vector of strings avoiding + // releasing string memories during vector clear() + // in order to save memory allocation overhead + class Keys { + public: + Keys() : keys_(kKeysInitSize), size_(0) {} + void PushBack(const Slice& key) { + if (size_ == keys_.size()) { + keys_.emplace_back(key.data(), key.size()); + } else { + keys_[size_].assign(key.data(), key.size()); + } + size_++; + } + void SwapAssign(std::vector& keys) { + size_ = keys.size(); + std::swap(keys_, keys); + } + void Clear() { size_ = 0; } + size_t Size() { return size_; } + std::string& Back() { return keys_[size_ - 1]; } + std::string& operator[](size_t idx) { + assert(idx < size_); + return keys_[idx]; + } + + private: + const size_t kKeysInitSize = 32; + std::vector keys_; + size_t size_; + }; + std::unique_ptr curr_block_keys; + + class BlockRepSlot; + + // BlockRep instances are fetched from and recycled to + // block_rep_pool during parallel compression. + struct BlockRep { + Slice contents; + Slice compressed_contents; + std::unique_ptr data; + std::unique_ptr compressed_data; + CompressionType compression_type; + std::unique_ptr first_key_in_next_block; + std::unique_ptr keys; + std::unique_ptr slot; + Status status; + }; + // Use a vector of BlockRep as a buffer for a determined number + // of BlockRep structures. All data referenced by pointers in + // BlockRep will be freed when this vector is destructed. + using BlockRepBuffer = std::vector; + BlockRepBuffer block_rep_buf; + // Use a thread-safe queue for concurrent access from block + // building thread and writer thread. + using BlockRepPool = WorkQueue; + BlockRepPool block_rep_pool; + + // Use BlockRepSlot to keep block order in write thread. 
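+  // The write thread pops slots from write_queue in emission order and
+  // blocks in Take() until the owning compression thread has Fill()ed the
+  // slot, so blocks are written in exactly the order they were emitted even
+  // when they finish compressing out of order.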
+  // slot_ will pass references to BlockRep.
+  class BlockRepSlot {
+   public:
+    BlockRepSlot() : slot_(1) {}
+    template <typename T>
+    void Fill(T&& rep) {
+      slot_.push(std::forward<T>(rep));
+    }
+    void Take(BlockRep*& rep) { slot_.pop(rep); }
+
+   private:
+    // slot_ will pass references to BlockRep in block_rep_buf,
+    // and those references are always valid before the destruction of
+    // block_rep_buf.
+    WorkQueue<BlockRep*> slot_;
+  };
+
+  // The compression queue will pass references to BlockRep in block_rep_buf,
+  // and those references are always valid before the destruction of
+  // block_rep_buf.
+  using CompressQueue = WorkQueue<BlockRep*>;
+  CompressQueue compress_queue;
+  std::vector<port::Thread> compress_thread_pool;
+
+  // The write queue will pass references to BlockRep::slot in block_rep_buf,
+  // and those references are always valid before the corresponding
+  // BlockRep::slot is destructed, which is before the destruction of
+  // block_rep_buf.
+  using WriteQueue = WorkQueue<BlockRepSlot*>;
+  WriteQueue write_queue;
+  std::unique_ptr<port::Thread> write_thread;
+
+  // Estimate output file size when parallel compression is enabled. This is
+  // necessary because compression & flush are no longer synchronized,
+  // and BlockBasedTableBuilder::FileSize() is no longer accurate.
+  // memory_order_relaxed suffices because accurate statistics are not
+  // required.
+  class FileSizeEstimator {
+   public:
+    explicit FileSizeEstimator()
+        : uncomp_bytes_compressed(0),
+          uncomp_bytes_curr_block(0),
+          uncomp_bytes_curr_block_set(false),
+          uncomp_bytes_inflight(0),
+          blocks_inflight(0),
+          curr_compression_ratio(0),
+          estimated_file_size(0) {}
+
+    // Estimate the file size when a block is about to be emitted to a
+    // compression thread.
+    void EmitBlock(uint64_t uncomp_block_size, uint64_t curr_file_size) {
+      uint64_t new_uncomp_bytes_inflight =
+          uncomp_bytes_inflight.fetch_add(uncomp_block_size,
+                                          std::memory_order_relaxed) +
+          uncomp_block_size;
+
+      uint64_t new_blocks_inflight =
+          blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1;
+
+      estimated_file_size.store(
+          curr_file_size +
+              static_cast<uint64_t>(
+                  static_cast<double>(new_uncomp_bytes_inflight) *
+                  curr_compression_ratio.load(std::memory_order_relaxed)) +
+              new_blocks_inflight * kBlockTrailerSize,
+          std::memory_order_relaxed);
+    }
+
+    // Estimate the file size when a block has just been reaped from a
+    // compression thread.
+    void ReapBlock(uint64_t compressed_block_size, uint64_t curr_file_size) {
+      assert(uncomp_bytes_curr_block_set);
+
+      uint64_t new_uncomp_bytes_compressed =
+          uncomp_bytes_compressed + uncomp_bytes_curr_block;
+      assert(new_uncomp_bytes_compressed > 0);
+
+      curr_compression_ratio.store(
+          (curr_compression_ratio.load(std::memory_order_relaxed) *
+               uncomp_bytes_compressed +
+           compressed_block_size) /
+              static_cast<double>(new_uncomp_bytes_compressed),
+          std::memory_order_relaxed);
+      uncomp_bytes_compressed = new_uncomp_bytes_compressed;
+
+      uint64_t new_uncomp_bytes_inflight =
+          uncomp_bytes_inflight.fetch_sub(uncomp_bytes_curr_block,
+                                          std::memory_order_relaxed) -
+          uncomp_bytes_curr_block;
+
+      uint64_t new_blocks_inflight =
+          blocks_inflight.fetch_sub(1, std::memory_order_relaxed) - 1;
+
+      estimated_file_size.store(
+          curr_file_size +
+              static_cast<uint64_t>(
+                  static_cast<double>(new_uncomp_bytes_inflight) *
+                  curr_compression_ratio.load(std::memory_order_relaxed)) +
+              new_blocks_inflight * kBlockTrailerSize,
+          std::memory_order_relaxed);
+
+      uncomp_bytes_curr_block_set = false;
+    }
+
+    void SetEstimatedFileSize(uint64_t size) {
+      estimated_file_size.store(size, std::memory_order_relaxed);
+    }
+
+    uint64_t GetEstimatedFileSize() {
+      return
estimated_file_size.load(std::memory_order_relaxed); + } + + void SetCurrBlockUncompSize(uint64_t size) { + uncomp_bytes_curr_block = size; + uncomp_bytes_curr_block_set = true; + } + + private: + // Input bytes compressed so far. + uint64_t uncomp_bytes_compressed; + // Size of current block being appended. + uint64_t uncomp_bytes_curr_block; + // Whether uncomp_bytes_curr_block has been set for next + // ReapBlock call. + bool uncomp_bytes_curr_block_set; + // Input bytes under compression and not appended yet. + std::atomic uncomp_bytes_inflight; + // Number of blocks under compression and not appended yet. + std::atomic blocks_inflight; + // Current compression ratio, maintained by BGWorkWriteMaybeCompressedBlock. + std::atomic curr_compression_ratio; + // Estimated SST file size. + std::atomic estimated_file_size; + }; + FileSizeEstimator file_size_estimator; + + // Facilities used for waiting first block completion. Need to Wait for + // the completion of first block compression and flush to get a non-zero + // compression ratio. + std::atomic first_block_processed; + std::condition_variable first_block_cond; + std::mutex first_block_mutex; + + explicit ParallelCompressionRep(uint32_t parallel_threads) + : curr_block_keys(new Keys()), + block_rep_buf(parallel_threads), + block_rep_pool(parallel_threads), + compress_queue(parallel_threads), + write_queue(parallel_threads), + first_block_processed(false) { + for (uint32_t i = 0; i < parallel_threads; i++) { + block_rep_buf[i].contents = Slice(); + block_rep_buf[i].compressed_contents = Slice(); + block_rep_buf[i].data.reset(new std::string()); + block_rep_buf[i].compressed_data.reset(new std::string()); + block_rep_buf[i].compression_type = CompressionType(); + block_rep_buf[i].first_key_in_next_block.reset(new std::string()); + block_rep_buf[i].keys.reset(new Keys()); + block_rep_buf[i].slot.reset(new BlockRepSlot()); + block_rep_buf[i].status = Status::OK(); + block_rep_pool.push(&block_rep_buf[i]); + } + } + + ~ParallelCompressionRep() { block_rep_pool.finish(); } + + // Make a block prepared to be emitted to compression thread + // Used in non-buffered mode + BlockRep* PrepareBlock(CompressionType compression_type, + const Slice* first_key_in_next_block, + BlockBuilder* data_block) { + BlockRep* block_rep = + PrepareBlockInternal(compression_type, first_key_in_next_block); + assert(block_rep != nullptr); + data_block->SwapAndReset(*(block_rep->data)); + block_rep->contents = *(block_rep->data); + std::swap(block_rep->keys, curr_block_keys); + curr_block_keys->Clear(); + return block_rep; + } + + // Used in EnterUnbuffered + BlockRep* PrepareBlock(CompressionType compression_type, + const Slice* first_key_in_next_block, + std::string* data_block, + std::vector* keys) { + BlockRep* block_rep = + PrepareBlockInternal(compression_type, first_key_in_next_block); + assert(block_rep != nullptr); + std::swap(*(block_rep->data), *data_block); + block_rep->contents = *(block_rep->data); + block_rep->keys->SwapAssign(*keys); + return block_rep; + } + + // Emit a block to compression thread + void EmitBlock(BlockRep* block_rep) { + assert(block_rep != nullptr); + assert(block_rep->status.ok()); + if (!write_queue.push(block_rep->slot.get())) { + return; + } + if (!compress_queue.push(block_rep)) { + return; + } + + if (!first_block_processed.load(std::memory_order_relaxed)) { + std::unique_lock lock(first_block_mutex); + first_block_cond.wait(lock, [this] { + return first_block_processed.load(std::memory_order_relaxed); + }); + } + } + + // 
Reap a block from compression thread + void ReapBlock(BlockRep* block_rep) { + assert(block_rep != nullptr); + block_rep->compressed_data->clear(); + block_rep_pool.push(block_rep); + + if (!first_block_processed.load(std::memory_order_relaxed)) { + std::lock_guard lock(first_block_mutex); + first_block_processed.store(true, std::memory_order_relaxed); + first_block_cond.notify_one(); + } + } + + private: + BlockRep* PrepareBlockInternal(CompressionType compression_type, + const Slice* first_key_in_next_block) { + BlockRep* block_rep = nullptr; + block_rep_pool.pop(block_rep); + assert(block_rep != nullptr); + + assert(block_rep->data); + + block_rep->compression_type = compression_type; + + if (first_key_in_next_block == nullptr) { + block_rep->first_key_in_next_block.reset(nullptr); + } else { + block_rep->first_key_in_next_block->assign( + first_key_in_next_block->data(), first_key_in_next_block->size()); + } + + return block_rep; + } +}; + +BlockBasedTableBuilder::BlockBasedTableBuilder( + const BlockBasedTableOptions& table_options, const TableBuilderOptions& tbo, + WritableFileWriter* file) { + BlockBasedTableOptions sanitized_table_options(table_options); + if (sanitized_table_options.format_version == 0 && + sanitized_table_options.checksum != kCRC32c) { + ROCKS_LOG_WARN( + tbo.ioptions.logger, + "Silently converting format_version to 1 because checksum is " + "non-default"); + // silently convert format_version to 1 to keep consistent with current + // behavior + sanitized_table_options.format_version = 1; + } + + rep_ = new Rep(sanitized_table_options, tbo, file); + + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey", + const_cast(&rep_->props)); + + BlockBasedTable::SetupBaseCacheKey(&rep_->props, tbo.db_session_id, + tbo.cur_file_num, &rep_->base_cache_key); + + if (rep_->IsParallelCompressionEnabled()) { + StartParallelCompression(); + } +} + +BlockBasedTableBuilder::~BlockBasedTableBuilder() { + // Catch errors where caller forgot to call Finish() + assert(rep_->state == Rep::State::kClosed); + delete rep_; +} + +void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { + Rep* r = rep_; + assert(rep_->state != Rep::State::kClosed); + if (!ok()) return; + ValueType value_type = ExtractValueType(key); + if (IsValueType(value_type)) { +#ifndef NDEBUG + if (r->props.num_entries > r->props.num_range_deletions) { + assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); + } +#endif // !NDEBUG + + auto should_flush = r->flush_block_policy->Update(key, value); + if (should_flush) { + assert(!r->data_block.empty()); + r->first_key_in_next_block = &key; + Flush(); + if (r->state == Rep::State::kBuffered) { + bool exceeds_buffer_limit = + (r->buffer_limit != 0 && r->data_begin_offset > r->buffer_limit); + bool exceeds_global_block_cache_limit = false; + + // Increase cache charging for the last buffered data block + // only if the block is not going to be unbuffered immediately + // and there exists a cache reservation manager + if (!exceeds_buffer_limit && + r->compression_dict_buffer_cache_res_mgr != nullptr) { + Status s = + r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation( + r->data_begin_offset); + exceeds_global_block_cache_limit = s.IsMemoryLimit(); + } + + if (exceeds_buffer_limit || exceeds_global_block_cache_limit) { + EnterUnbuffered(); + } + } + + // Add item to index block. + // We do not emit the index entry for a block until we have seen the + // first key for the next data block. 
This allows us to use shorter + // keys in the index block. For example, consider a block boundary + // between the keys "the quick brown fox" and "the who". We can use + // "the r" as the key for the index block entry since it is >= all + // entries in the first block and < all entries in subsequent + // blocks. + if (ok() && r->state == Rep::State::kUnbuffered) { + if (r->IsParallelCompressionEnabled()) { + r->pc_rep->curr_block_keys->Clear(); + } else { + r->index_builder->AddIndexEntry(&r->last_key, &key, + r->pending_handle); + } + } + } + + // Note: PartitionedFilterBlockBuilder requires key being added to filter + // builder after being added to index builder. + if (r->state == Rep::State::kUnbuffered) { + if (r->IsParallelCompressionEnabled()) { + r->pc_rep->curr_block_keys->PushBack(key); + } else { + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + } + } + + r->data_block.AddWithLastKey(key, value, r->last_key); + r->last_key.assign(key.data(), key.size()); + if (r->state == Rep::State::kBuffered) { + // Buffered keys will be replayed from data_block_buffers during + // `Finish()` once compression dictionary has been finalized. + } else { + if (!r->IsParallelCompressionEnabled()) { + r->index_builder->OnKeyAdded(key); + } + } + // TODO offset passed in is not accurate for parallel compression case + NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), + r->table_properties_collectors, + r->ioptions.logger); + + } else if (value_type == kTypeRangeDeletion) { + r->range_del_block.Add(key, value); + // TODO offset passed in is not accurate for parallel compression case + NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), + r->table_properties_collectors, + r->ioptions.logger); + } else { + assert(false); + } + + r->props.num_entries++; + r->props.raw_key_size += key.size(); + r->props.raw_value_size += value.size(); + if (value_type == kTypeDeletion || value_type == kTypeSingleDeletion || + value_type == kTypeDeletionWithTimestamp) { + r->props.num_deletions++; + } else if (value_type == kTypeRangeDeletion) { + r->props.num_deletions++; + r->props.num_range_deletions++; + } else if (value_type == kTypeMerge) { + r->props.num_merge_operands++; + } +} + +void BlockBasedTableBuilder::Flush() { + Rep* r = rep_; + assert(rep_->state != Rep::State::kClosed); + if (!ok()) return; + if (r->data_block.empty()) return; + if (r->IsParallelCompressionEnabled() && + r->state == Rep::State::kUnbuffered) { + r->data_block.Finish(); + ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( + r->compression_type, r->first_key_in_next_block, &(r->data_block)); + assert(block_rep != nullptr); + r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), + r->get_offset()); + r->pc_rep->EmitBlock(block_rep); + } else { + WriteBlock(&r->data_block, &r->pending_handle, BlockType::kData); + } +} + +void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, + BlockHandle* handle, + BlockType block_type) { + block->Finish(); + std::string uncompressed_block_data; + uncompressed_block_data.reserve(rep_->table_options.block_size); + block->SwapAndReset(uncompressed_block_data); + if (rep_->state == Rep::State::kBuffered) { + assert(block_type == BlockType::kData); + rep_->data_block_buffers.emplace_back(std::move(uncompressed_block_data)); + rep_->data_begin_offset += rep_->data_block_buffers.back().size(); + return; + } + 
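+  // Unbuffered: hand the finished block to the compress-and-write path
+  // immediately.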
WriteBlock(uncompressed_block_data, handle, block_type); +} + +void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data, + BlockHandle* handle, + BlockType block_type) { + Rep* r = rep_; + assert(r->state == Rep::State::kUnbuffered); + Slice block_contents; + CompressionType type; + Status compress_status; + bool is_data_block = block_type == BlockType::kData; + CompressAndVerifyBlock(uncompressed_block_data, is_data_block, + *(r->compression_ctxs[0]), r->verify_ctxs[0].get(), + &(r->compressed_output), &(block_contents), &type, + &compress_status); + r->SetStatus(compress_status); + if (!ok()) { + return; + } + + WriteMaybeCompressedBlock(block_contents, type, handle, block_type, + &uncompressed_block_data); + r->compressed_output.clear(); + if (is_data_block) { + r->props.data_size = r->get_offset(); + ++r->props.num_data_blocks; + } +} + +void BlockBasedTableBuilder::BGWorkCompression( + const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx) { + ParallelCompressionRep::BlockRep* block_rep = nullptr; + while (rep_->pc_rep->compress_queue.pop(block_rep)) { + assert(block_rep != nullptr); + CompressAndVerifyBlock(block_rep->contents, true, /* is_data_block*/ + compression_ctx, verify_ctx, + block_rep->compressed_data.get(), + &block_rep->compressed_contents, + &(block_rep->compression_type), &block_rep->status); + block_rep->slot->Fill(block_rep); + } +} + +void BlockBasedTableBuilder::CompressAndVerifyBlock( + const Slice& uncompressed_block_data, bool is_data_block, + const CompressionContext& compression_ctx, UncompressionContext* verify_ctx, + std::string* compressed_output, Slice* block_contents, + CompressionType* type, Status* out_status) { + Rep* r = rep_; + bool is_status_ok = ok(); + if (!r->IsParallelCompressionEnabled()) { + assert(is_status_ok); + } + + if (is_status_ok && uncompressed_block_data.size() < kCompressionSizeLimit) { + StopWatchNano timer( + r->ioptions.clock, + ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)); + + if (is_data_block) { + r->compressible_input_data_bytes.fetch_add(uncompressed_block_data.size(), + std::memory_order_relaxed); + } + const CompressionDict* compression_dict; + if (!is_data_block || r->compression_dict == nullptr) { + compression_dict = &CompressionDict::GetEmptyDict(); + } else { + compression_dict = r->compression_dict.get(); + } + assert(compression_dict != nullptr); + CompressionInfo compression_info(r->compression_opts, compression_ctx, + *compression_dict, r->compression_type, + r->sample_for_compression); + + std::string sampled_output_fast; + std::string sampled_output_slow; + *block_contents = CompressBlock( + uncompressed_block_data, compression_info, type, + r->table_options.format_version, is_data_block /* allow_sample */, + compressed_output, &sampled_output_fast, &sampled_output_slow); + + if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) { + // Currently compression sampling is only enabled for data block. 
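+      // The sampled byte counts recorded below feed the slow/fast
+      // compression estimated data size table properties computed later in
+      // WritePropertiesBlock().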
+ assert(is_data_block); + r->sampled_input_data_bytes.fetch_add(uncompressed_block_data.size(), + std::memory_order_relaxed); + r->sampled_output_slow_data_bytes.fetch_add(sampled_output_slow.size(), + std::memory_order_relaxed); + r->sampled_output_fast_data_bytes.fetch_add(sampled_output_fast.size(), + std::memory_order_relaxed); + } + // notify collectors on block add + NotifyCollectTableCollectorsOnBlockAdd( + r->table_properties_collectors, uncompressed_block_data.size(), + sampled_output_fast.size(), sampled_output_slow.size()); + + // Some of the compression algorithms are known to be unreliable. If + // the verify_compression flag is set then try to de-compress the + // compressed data and compare to the input. + if (*type != kNoCompression && r->table_options.verify_compression) { + // Retrieve the uncompressed contents into a new buffer + const UncompressionDict* verify_dict; + if (!is_data_block || r->verify_dict == nullptr) { + verify_dict = &UncompressionDict::GetEmptyDict(); + } else { + verify_dict = r->verify_dict.get(); + } + assert(verify_dict != nullptr); + BlockContents contents; + UncompressionInfo uncompression_info(*verify_ctx, *verify_dict, + r->compression_type); + Status uncompress_status = UncompressBlockData( + uncompression_info, block_contents->data(), block_contents->size(), + &contents, r->table_options.format_version, r->ioptions); + + if (uncompress_status.ok()) { + bool data_match = contents.data.compare(uncompressed_block_data) == 0; + if (!data_match) { + // The result of the compression was invalid. abort. + const char* const msg = + "Decompressed block did not match pre-compression block"; + ROCKS_LOG_ERROR(r->ioptions.logger, "%s", msg); + *out_status = Status::Corruption(msg); + *type = kNoCompression; + } + } else { + // Decompression reported an error. abort. + *out_status = Status::Corruption(std::string("Could not decompress: ") + + uncompress_status.getState()); + *type = kNoCompression; + } + } + if (timer.IsStarted()) { + RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); + } + } else { + // Status is not OK, or block is too big to be compressed. + if (is_data_block) { + r->uncompressible_input_data_bytes.fetch_add( + uncompressed_block_data.size(), std::memory_order_relaxed); + } + *type = kNoCompression; + } + if (is_data_block) { + r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize, + std::memory_order_relaxed); + } + + // Abort compression if the block is too big, or did not pass + // verification. + if (*type == kNoCompression) { + *block_contents = uncompressed_block_data; + bool compression_attempted = !compressed_output->empty(); + RecordTick(r->ioptions.stats, compression_attempted + ? NUMBER_BLOCK_COMPRESSION_REJECTED + : NUMBER_BLOCK_COMPRESSION_BYPASSED); + RecordTick(r->ioptions.stats, + compression_attempted ? 
BYTES_COMPRESSION_REJECTED + : BYTES_COMPRESSION_BYPASSED, + uncompressed_block_data.size()); + } else { + RecordTick(r->ioptions.stats, NUMBER_BLOCK_COMPRESSED); + RecordTick(r->ioptions.stats, BYTES_COMPRESSED_FROM, + uncompressed_block_data.size()); + RecordTick(r->ioptions.stats, BYTES_COMPRESSED_TO, + compressed_output->size()); + } +} + +void BlockBasedTableBuilder::WriteMaybeCompressedBlock( + const Slice& block_contents, CompressionType comp_type, BlockHandle* handle, + BlockType block_type, const Slice* uncompressed_block_data) { + // File format contains a sequence of blocks where each block has: + // block_data: uint8[n] + // compression_type: uint8 + // checksum: uint32 + Rep* r = rep_; + bool is_data_block = block_type == BlockType::kData; + // Old, misleading name of this function: WriteRawBlock + StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS); + handle->set_offset(r->get_offset()); + handle->set_size(block_contents.size()); + assert(status().ok()); + assert(io_status().ok()); + if (uncompressed_block_data == nullptr) { + uncompressed_block_data = &block_contents; + assert(comp_type == kNoCompression); + } + + { + IOStatus io_s = r->file->Append(block_contents); + if (!io_s.ok()) { + r->SetIOStatus(io_s); + return; + } + } + + std::array trailer; + trailer[0] = comp_type; + uint32_t checksum = ComputeBuiltinChecksumWithLastByte( + r->table_options.checksum, block_contents.data(), block_contents.size(), + /*last_byte*/ comp_type); + + if (block_type == BlockType::kFilter) { + Status s = r->filter_builder->MaybePostVerifyFilter(block_contents); + if (!s.ok()) { + r->SetStatus(s); + return; + } + } + + EncodeFixed32(trailer.data() + 1, checksum); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WriteMaybeCompressedBlock:TamperWithChecksum", + trailer.data()); + { + IOStatus io_s = r->file->Append(Slice(trailer.data(), trailer.size())); + if (!io_s.ok()) { + r->SetIOStatus(io_s); + return; + } + } + + { + bool warm_cache; + switch (r->table_options.prepopulate_block_cache) { + case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly: + warm_cache = (r->reason == TableFileCreationReason::kFlush); + break; + case BlockBasedTableOptions::PrepopulateBlockCache::kDisable: + warm_cache = false; + break; + default: + // missing case + assert(false); + warm_cache = false; + } + if (warm_cache) { + Status s = InsertBlockInCacheHelper(*uncompressed_block_data, handle, + block_type); + if (!s.ok()) { + r->SetStatus(s); + return; + } + } + } + + r->set_offset(r->get_offset() + block_contents.size() + kBlockTrailerSize); + if (r->table_options.block_align && is_data_block) { + size_t pad_bytes = + (r->alignment - + ((block_contents.size() + kBlockTrailerSize) & (r->alignment - 1))) & + (r->alignment - 1); + IOStatus io_s = r->file->Pad(pad_bytes); + if (io_s.ok()) { + r->set_offset(r->get_offset() + pad_bytes); + } else { + r->SetIOStatus(io_s); + return; + } + } + + if (r->IsParallelCompressionEnabled()) { + if (is_data_block) { + r->pc_rep->file_size_estimator.ReapBlock(block_contents.size(), + r->get_offset()); + } else { + r->pc_rep->file_size_estimator.SetEstimatedFileSize(r->get_offset()); + } + } +} + +void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() { + Rep* r = rep_; + ParallelCompressionRep::BlockRepSlot* slot = nullptr; + ParallelCompressionRep::BlockRep* block_rep = nullptr; + while (r->pc_rep->write_queue.pop(slot)) { + assert(slot != nullptr); + slot->Take(block_rep); + assert(block_rep != nullptr); + if 
(!block_rep->status.ok()) { + r->SetStatus(block_rep->status); + // Reap block so that blocked Flush() can finish + // if there is one, and Flush() will notice !ok() next time. + block_rep->status = Status::OK(); + r->pc_rep->ReapBlock(block_rep); + continue; + } + + for (size_t i = 0; i < block_rep->keys->Size(); i++) { + auto& key = (*block_rep->keys)[i]; + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + r->index_builder->OnKeyAdded(key); + } + + r->pc_rep->file_size_estimator.SetCurrBlockUncompSize( + block_rep->data->size()); + WriteMaybeCompressedBlock(block_rep->compressed_contents, + block_rep->compression_type, &r->pending_handle, + BlockType::kData, &block_rep->contents); + if (!ok()) { + break; + } + + r->props.data_size = r->get_offset(); + ++r->props.num_data_blocks; + + if (block_rep->first_key_in_next_block == nullptr) { + r->index_builder->AddIndexEntry(&(block_rep->keys->Back()), nullptr, + r->pending_handle); + } else { + Slice first_key_in_next_block = + Slice(*block_rep->first_key_in_next_block); + r->index_builder->AddIndexEntry(&(block_rep->keys->Back()), + &first_key_in_next_block, + r->pending_handle); + } + + r->pc_rep->ReapBlock(block_rep); + } +} + +void BlockBasedTableBuilder::StartParallelCompression() { + rep_->pc_rep.reset( + new ParallelCompressionRep(rep_->compression_opts.parallel_threads)); + rep_->pc_rep->compress_thread_pool.reserve( + rep_->compression_opts.parallel_threads); + for (uint32_t i = 0; i < rep_->compression_opts.parallel_threads; i++) { + rep_->pc_rep->compress_thread_pool.emplace_back([this, i] { + BGWorkCompression(*(rep_->compression_ctxs[i]), + rep_->verify_ctxs[i].get()); + }); + } + rep_->pc_rep->write_thread.reset( + new port::Thread([this] { BGWorkWriteMaybeCompressedBlock(); })); +} + +void BlockBasedTableBuilder::StopParallelCompression() { + rep_->pc_rep->compress_queue.finish(); + for (auto& thread : rep_->pc_rep->compress_thread_pool) { + thread.join(); + } + rep_->pc_rep->write_queue.finish(); + rep_->pc_rep->write_thread->join(); +} + +Status BlockBasedTableBuilder::status() const { return rep_->GetStatus(); } + +IOStatus BlockBasedTableBuilder::io_status() const { + return rep_->GetIOStatus(); +} + +Status BlockBasedTableBuilder::InsertBlockInCacheHelper( + const Slice& block_contents, const BlockHandle* handle, + BlockType block_type) { + + Cache* block_cache = rep_->table_options.block_cache.get(); + Status s; + auto helper = + GetCacheItemHelper(block_type, rep_->ioptions.lowest_used_cache_tier); + if (block_cache && helper && helper->create_cb) { + CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle); + size_t charge; + s = WarmInCache(block_cache, key.AsSlice(), block_contents, + &rep_->create_context, helper, Cache::Priority::LOW, + &charge); + + if (s.ok()) { + BlockBasedTable::UpdateCacheInsertionMetrics( + block_type, nullptr /*get_context*/, charge, s.IsOkOverwritten(), + rep_->ioptions.stats); + } else { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_ADD_FAILURES); + } + } + return s; +} + +void BlockBasedTableBuilder::WriteFilterBlock( + MetaIndexBuilder* meta_index_builder) { + if (rep_->filter_builder == nullptr || rep_->filter_builder->IsEmpty()) { + // No filter block needed + return; + } + BlockHandle filter_block_handle; + bool is_partitioned_filter = rep_->table_options.partition_filters; + if (ok()) { + rep_->props.num_filter_entries += + 
rep_->filter_builder->EstimateEntriesAdded();
+    Status s = Status::Incomplete();
+    while (ok() && s.IsIncomplete()) {
+      // filter_data is used to store the transferred filter data payload
+      // from FilterBlockBuilder and to deallocate the payload by going out
+      // of scope. Otherwise, the payload would unnecessarily remain alive
+      // until the BlockBasedTableBuilder is deallocated.
+      //
+      // See FilterBlockBuilder::Finish() for more on the differences in the
+      // transferred filter data payload among FilterBlockBuilder subtypes.
+      std::unique_ptr<const char[]> filter_data;
+      Slice filter_content =
+          rep_->filter_builder->Finish(filter_block_handle, &s, &filter_data);
+
+      assert(s.ok() || s.IsIncomplete() || s.IsCorruption());
+      if (s.IsCorruption()) {
+        rep_->SetStatus(s);
+        break;
+      }
+
+      rep_->props.filter_size += filter_content.size();
+
+      BlockType btype = is_partitioned_filter && /* last */ s.ok()
+                            ? BlockType::kFilterPartitionIndex
+                            : BlockType::kFilter;
+      WriteMaybeCompressedBlock(filter_content, kNoCompression,
+                                &filter_block_handle, btype);
+    }
+    rep_->filter_builder->ResetFilterBitsBuilder();
+  }
+  if (ok()) {
+    // Add a mapping from "<filter_block_prefix>.Name" to the location of
+    // the filter data.
+    std::string key;
+    key = is_partitioned_filter ? BlockBasedTable::kPartitionedFilterBlockPrefix
+                                : BlockBasedTable::kFullFilterBlockPrefix;
+    key.append(rep_->table_options.filter_policy->CompatibilityName());
+    meta_index_builder->Add(key, filter_block_handle);
+  }
+}
+
+void BlockBasedTableBuilder::WriteIndexBlock(
+    MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle) {
+  if (!ok()) {
+    return;
+  }
+  IndexBuilder::IndexBlocks index_blocks;
+  auto index_builder_status = rep_->index_builder->Finish(&index_blocks);
+  if (index_builder_status.IsIncomplete()) {
+    // If we have more than one index partition then meta_blocks are not
+    // supported for the index. Currently meta_blocks are used only by
+    // HashIndexBuilder, which is not multi-partition.
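+    // An Incomplete status from Finish() means "more index partitions
+    // remain": the loop further below keeps calling Finish(), writing one
+    // partition per call, until the top-level index block is returned with
+    // an OK status.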
+ assert(index_blocks.meta_blocks.empty()); + } else if (ok() && !index_builder_status.ok()) { + rep_->SetStatus(index_builder_status); + } + if (ok()) { + for (const auto& item : index_blocks.meta_blocks) { + BlockHandle block_handle; + WriteBlock(item.second, &block_handle, BlockType::kIndex); + if (!ok()) { + break; + } + meta_index_builder->Add(item.first, block_handle); + } + } + if (ok()) { + if (rep_->table_options.enable_index_compression) { + WriteBlock(index_blocks.index_block_contents, index_block_handle, + BlockType::kIndex); + } else { + WriteMaybeCompressedBlock(index_blocks.index_block_contents, + kNoCompression, index_block_handle, + BlockType::kIndex); + } + } + // If there are more index partitions, finish them and write them out + if (index_builder_status.IsIncomplete()) { + bool index_building_finished = false; + while (ok() && !index_building_finished) { + Status s = + rep_->index_builder->Finish(&index_blocks, *index_block_handle); + if (s.ok()) { + index_building_finished = true; + } else if (s.IsIncomplete()) { + // More partitioned index after this one + assert(!index_building_finished); + } else { + // Error + rep_->SetStatus(s); + return; + } + + if (rep_->table_options.enable_index_compression) { + WriteBlock(index_blocks.index_block_contents, index_block_handle, + BlockType::kIndex); + } else { + WriteMaybeCompressedBlock(index_blocks.index_block_contents, + kNoCompression, index_block_handle, + BlockType::kIndex); + } + // The last index_block_handle will be for the partition index block + } + } +} + +void BlockBasedTableBuilder::WritePropertiesBlock( + MetaIndexBuilder* meta_index_builder) { + BlockHandle properties_block_handle; + if (ok()) { + PropertyBlockBuilder property_block_builder; + rep_->props.filter_policy_name = + rep_->table_options.filter_policy != nullptr + ? rep_->table_options.filter_policy->Name() + : ""; + rep_->props.index_size = + rep_->index_builder->IndexSize() + kBlockTrailerSize; + rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr + ? rep_->ioptions.user_comparator->Name() + : "nullptr"; + rep_->props.merge_operator_name = + rep_->ioptions.merge_operator != nullptr + ? rep_->ioptions.merge_operator->Name() + : "nullptr"; + rep_->props.compression_name = + CompressionTypeToString(rep_->compression_type); + rep_->props.compression_options = + CompressionOptionsToString(rep_->compression_opts); + rep_->props.prefix_extractor_name = + rep_->moptions.prefix_extractor != nullptr + ? 
rep_->moptions.prefix_extractor->AsString() + : "nullptr"; + std::string property_collectors_names = "["; + for (size_t i = 0; + i < rep_->ioptions.table_properties_collector_factories.size(); ++i) { + if (i != 0) { + property_collectors_names += ","; + } + property_collectors_names += + rep_->ioptions.table_properties_collector_factories[i]->Name(); + } + property_collectors_names += "]"; + rep_->props.property_collectors_names = property_collectors_names; + if (rep_->table_options.index_type == + BlockBasedTableOptions::kTwoLevelIndexSearch) { + assert(rep_->p_index_builder_ != nullptr); + rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions(); + rep_->props.top_level_index_size = + rep_->p_index_builder_->TopLevelIndexSize(rep_->offset); + } + rep_->props.index_key_is_user_key = + !rep_->index_builder->seperator_is_key_plus_seq(); + rep_->props.index_value_is_delta_encoded = + rep_->use_delta_encoding_for_index_values; + if (rep_->sampled_input_data_bytes > 0) { + rep_->props.slow_compression_estimated_data_size = static_cast( + static_cast(rep_->sampled_output_slow_data_bytes) / + rep_->sampled_input_data_bytes * + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes + 0.5); + rep_->props.fast_compression_estimated_data_size = static_cast( + static_cast(rep_->sampled_output_fast_data_bytes) / + rep_->sampled_input_data_bytes * + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes + 0.5); + } else if (rep_->sample_for_compression > 0) { + // We tried to sample but none were found. Assume worst-case (compression + // ratio 1.0) so data is complete and aggregatable. + rep_->props.slow_compression_estimated_data_size = + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes; + rep_->props.fast_compression_estimated_data_size = + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes; + } + + // Add basic properties + property_block_builder.AddTableProperty(rep_->props); + + // Add use collected properties + NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors, + rep_->ioptions.logger, + &property_block_builder); + + Slice block_data = property_block_builder.Finish(); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:BlockData", &block_data); + WriteMaybeCompressedBlock(block_data, kNoCompression, + &properties_block_handle, BlockType::kProperties); + } + if (ok()) { +#ifndef NDEBUG + { + uint64_t props_block_offset = properties_block_handle.offset(); + uint64_t props_block_size = properties_block_handle.size(); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset", + &props_block_offset); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize", + &props_block_size); + } +#endif // !NDEBUG + + const std::string* properties_block_meta = &kPropertiesBlockName; + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:Meta", + &properties_block_meta); + meta_index_builder->Add(*properties_block_meta, properties_block_handle); + } +} + +void BlockBasedTableBuilder::WriteCompressionDictBlock( + MetaIndexBuilder* meta_index_builder) { + if (rep_->compression_dict != nullptr && + rep_->compression_dict->GetRawDict().size()) { + BlockHandle compression_dict_block_handle; + if (ok()) { + WriteMaybeCompressedBlock(rep_->compression_dict->GetRawDict(), + kNoCompression, &compression_dict_block_handle, + BlockType::kCompressionDictionary); +#ifndef 
NDEBUG + Slice compression_dict = rep_->compression_dict->GetRawDict(); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict", + &compression_dict); +#endif // NDEBUG + } + if (ok()) { + meta_index_builder->Add(kCompressionDictBlockName, + compression_dict_block_handle); + } + } +} + +void BlockBasedTableBuilder::WriteRangeDelBlock( + MetaIndexBuilder* meta_index_builder) { + if (ok() && !rep_->range_del_block.empty()) { + BlockHandle range_del_block_handle; + WriteMaybeCompressedBlock(rep_->range_del_block.Finish(), kNoCompression, + &range_del_block_handle, + BlockType::kRangeDeletion); + meta_index_builder->Add(kRangeDelBlockName, range_del_block_handle); + } +} + +void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle, + BlockHandle& index_block_handle) { + Rep* r = rep_; + // this is guaranteed by BlockBasedTableBuilder's constructor + assert(r->table_options.checksum == kCRC32c || + r->table_options.format_version != 0); + assert(ok()); + + FooterBuilder footer; + footer.Build(kBlockBasedTableMagicNumber, r->table_options.format_version, + r->get_offset(), r->table_options.checksum, + metaindex_block_handle, index_block_handle); + IOStatus ios = r->file->Append(footer.GetSlice()); + if (ios.ok()) { + r->set_offset(r->get_offset() + footer.GetSlice().size()); + } else { + r->SetIOStatus(ios); + } +} + +void BlockBasedTableBuilder::EnterUnbuffered() { + Rep* r = rep_; + assert(r->state == Rep::State::kBuffered); + r->state = Rep::State::kUnbuffered; + const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0 + ? r->compression_opts.zstd_max_train_bytes + : r->compression_opts.max_dict_bytes; + const size_t kNumBlocksBuffered = r->data_block_buffers.size(); + if (kNumBlocksBuffered == 0) { + // The below code is neither safe nor necessary for handling zero data + // blocks. + return; + } + + // Abstract algebra teaches us that a finite cyclic group (such as the + // additive group of integers modulo N) can be generated by a number that is + // coprime with N. Since N is variable (number of buffered data blocks), we + // must then pick a prime number in order to guarantee coprimeness with any N. + // + // One downside of this approach is the spread will be poor when + // `kPrimeGeneratorRemainder` is close to zero or close to + // `kNumBlocksBuffered`. + // + // Picked a random number between one and one trillion and then chose the + // next prime number greater than or equal to it. + const uint64_t kPrimeGenerator = 545055921143ull; + // Can avoid repeated division by just adding the remainder repeatedly. + const size_t kPrimeGeneratorRemainder = static_cast( + kPrimeGenerator % static_cast(kNumBlocksBuffered)); + const size_t kInitSampleIdx = kNumBlocksBuffered / 2; + + std::string compression_dict_samples; + std::vector compression_dict_sample_lens; + size_t buffer_idx = kInitSampleIdx; + for (size_t i = 0; + i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes; + ++i) { + size_t copy_len = std::min(kSampleBytes - compression_dict_samples.size(), + r->data_block_buffers[buffer_idx].size()); + compression_dict_samples.append(r->data_block_buffers[buffer_idx], 0, + copy_len); + compression_dict_sample_lens.emplace_back(copy_len); + + buffer_idx += kPrimeGeneratorRemainder; + if (buffer_idx >= kNumBlocksBuffered) { + buffer_idx -= kNumBlocksBuffered; + } + } + + // final data block flushed, now we can generate dictionary from the samples. 
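+  // (With kNumBlocksBuffered = 10 the stride above is 545055921143 % 10 = 3,
+  // so starting from kInitSampleIdx = 5 the loop visits buffers
+  // 5, 8, 1, 4, 7, 0, 3, 6, 9, 2 -- each one at most once, stopping early
+  // once kSampleBytes has been collected.)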
+ // OK if compression_dict_samples is empty, we'll just get empty dictionary. + std::string dict; + if (r->compression_opts.zstd_max_train_bytes > 0) { + if (r->compression_opts.use_zstd_dict_trainer) { + dict = ZSTD_TrainDictionary(compression_dict_samples, + compression_dict_sample_lens, + r->compression_opts.max_dict_bytes); + } else { + dict = ZSTD_FinalizeDictionary( + compression_dict_samples, compression_dict_sample_lens, + r->compression_opts.max_dict_bytes, r->compression_opts.level); + } + } else { + dict = std::move(compression_dict_samples); + } + r->compression_dict.reset(new CompressionDict(dict, r->compression_type, + r->compression_opts.level)); + r->verify_dict.reset(new UncompressionDict( + dict, r->compression_type == kZSTD || + r->compression_type == kZSTDNotFinalCompression)); + + auto get_iterator_for_block = [&r](size_t i) { + auto& data_block = r->data_block_buffers[i]; + assert(!data_block.empty()); + + Block reader{BlockContents{data_block}}; + DataBlockIter* iter = reader.NewDataIterator( + r->internal_comparator.user_comparator(), kDisableGlobalSequenceNumber); + + iter->SeekToFirst(); + assert(iter->Valid()); + return std::unique_ptr(iter); + }; + + std::unique_ptr iter = nullptr, next_block_iter = nullptr; + + for (size_t i = 0; ok() && i < r->data_block_buffers.size(); ++i) { + if (iter == nullptr) { + iter = get_iterator_for_block(i); + assert(iter != nullptr); + }; + + if (i + 1 < r->data_block_buffers.size()) { + next_block_iter = get_iterator_for_block(i + 1); + } + + auto& data_block = r->data_block_buffers[i]; + if (r->IsParallelCompressionEnabled()) { + Slice first_key_in_next_block; + const Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + if (i + 1 < r->data_block_buffers.size()) { + assert(next_block_iter != nullptr); + first_key_in_next_block = next_block_iter->key(); + } else { + first_key_in_next_block_ptr = r->first_key_in_next_block; + } + + std::vector keys; + for (; iter->Valid(); iter->Next()) { + keys.emplace_back(iter->key().ToString()); + } + + ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( + r->compression_type, first_key_in_next_block_ptr, &data_block, &keys); + + assert(block_rep != nullptr); + r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), + r->get_offset()); + r->pc_rep->EmitBlock(block_rep); + } else { + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + r->index_builder->OnKeyAdded(key); + } + WriteBlock(Slice(data_block), &r->pending_handle, BlockType::kData); + if (ok() && i + 1 < r->data_block_buffers.size()) { + assert(next_block_iter != nullptr); + Slice first_key_in_next_block = next_block_iter->key(); + + Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + + iter->SeekToLast(); + std::string last_key = iter->key().ToString(); + r->index_builder->AddIndexEntry(&last_key, first_key_in_next_block_ptr, + r->pending_handle); + } + } + std::swap(iter, next_block_iter); + } + r->data_block_buffers.clear(); + r->data_begin_offset = 0; + // Release all reserved cache for data block buffers + if (r->compression_dict_buffer_cache_res_mgr != nullptr) { + Status s = r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation( + r->data_begin_offset); + s.PermitUncheckedError(); + } +} + +Status BlockBasedTableBuilder::Finish() { + Rep* r = rep_; + 
assert(r->state != Rep::State::kClosed); + bool empty_data_block = r->data_block.empty(); + r->first_key_in_next_block = nullptr; + Flush(); + if (r->state == Rep::State::kBuffered) { + EnterUnbuffered(); + } + if (r->IsParallelCompressionEnabled()) { + StopParallelCompression(); +#ifndef NDEBUG + for (const auto& br : r->pc_rep->block_rep_buf) { + assert(br.status.ok()); + } +#endif // !NDEBUG + } else { + // To make sure properties block is able to keep the accurate size of index + // block, we will finish writing all index entries first. + if (ok() && !empty_data_block) { + r->index_builder->AddIndexEntry( + &r->last_key, nullptr /* no next data block */, r->pending_handle); + } + } + + r->props.tail_start_offset = r->offset; + + // Write meta blocks, metaindex block and footer in the following order. + // 1. [meta block: filter] + // 2. [meta block: index] + // 3. [meta block: compression dictionary] + // 4. [meta block: range deletion tombstone] + // 5. [meta block: properties] + // 6. [metaindex block] + // 7. Footer + BlockHandle metaindex_block_handle, index_block_handle; + MetaIndexBuilder meta_index_builder; + WriteFilterBlock(&meta_index_builder); + WriteIndexBlock(&meta_index_builder, &index_block_handle); + WriteCompressionDictBlock(&meta_index_builder); + WriteRangeDelBlock(&meta_index_builder); + WritePropertiesBlock(&meta_index_builder); + if (ok()) { + // flush the meta index block + WriteMaybeCompressedBlock(meta_index_builder.Finish(), kNoCompression, + &metaindex_block_handle, BlockType::kMetaIndex); + } + if (ok()) { + WriteFooter(metaindex_block_handle, index_block_handle); + } + r->state = Rep::State::kClosed; + r->SetStatus(r->CopyIOStatus()); + Status ret_status = r->CopyStatus(); + assert(!ret_status.ok() || io_status().ok()); + r->tail_size = r->offset - r->props.tail_start_offset; + return ret_status; +} + +void BlockBasedTableBuilder::Abandon() { + assert(rep_->state != Rep::State::kClosed); + if (rep_->IsParallelCompressionEnabled()) { + StopParallelCompression(); + } + rep_->state = Rep::State::kClosed; + rep_->CopyStatus().PermitUncheckedError(); + rep_->CopyIOStatus().PermitUncheckedError(); +} + +uint64_t BlockBasedTableBuilder::NumEntries() const { + return rep_->props.num_entries; +} + +bool BlockBasedTableBuilder::IsEmpty() const { + return rep_->props.num_entries == 0 && rep_->props.num_range_deletions == 0; +} + +uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } + +uint64_t BlockBasedTableBuilder::EstimatedFileSize() const { + if (rep_->IsParallelCompressionEnabled()) { + // Use compression ratio so far and inflight uncompressed bytes to estimate + // final SST size. 
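Reduced to one line, such an estimate can assume the bytes still in flight compress at the ratio observed so far. A hedged sketch only (EstimateSize is a hypothetical helper; the real logic lives in ParallelCompressionRep's file size estimator):

#include <cstdint>

// written: bytes already emitted to the SST; inflight_raw: uncompressed
// bytes still queued in the compression pipeline; ratio: compressed/raw
// observed so far. Illustrative, not the vendored implementation.
uint64_t EstimateSize(uint64_t written, uint64_t inflight_raw, double ratio) {
  return written + static_cast<uint64_t>(inflight_raw * ratio);
}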
+ return rep_->pc_rep->file_size_estimator.GetEstimatedFileSize(); + } else { + return FileSize(); + } +} + +uint64_t BlockBasedTableBuilder::GetTailSize() const { return rep_->tail_size; } + +bool BlockBasedTableBuilder::NeedCompact() const { + for (const auto& collector : rep_->table_properties_collectors) { + if (collector->NeedCompact()) { + return true; + } + } + return false; +} + +TableProperties BlockBasedTableBuilder::GetTableProperties() const { + TableProperties ret = rep_->props; + for (const auto& collector : rep_->table_properties_collectors) { + for (const auto& prop : collector->GetReadableProperties()) { + ret.readable_properties.insert(prop); + } + collector->Finish(&ret.user_collected_properties).PermitUncheckedError(); + } + return ret; +} + +std::string BlockBasedTableBuilder::GetFileChecksum() const { + if (rep_->file != nullptr) { + return rep_->file->GetFileChecksum(); + } else { + return kUnknownFileChecksum; + } +} + +const char* BlockBasedTableBuilder::GetFileChecksumFuncName() const { + if (rep_->file != nullptr) { + return rep_->file->GetFileChecksumFuncName(); + } else { + return kUnknownFileChecksumFuncName; + } +} +void BlockBasedTableBuilder::SetSeqnoTimeTableProperties( + const std::string& encoded_seqno_to_time_mapping, + uint64_t oldest_ancestor_time) { + rep_->props.seqno_to_time_mapping = encoded_seqno_to_time_mapping; + rep_->props.creation_time = oldest_ancestor_time; +} + +const std::string BlockBasedTable::kObsoleteFilterBlockPrefix = "filter."; +const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter."; +const std::string BlockBasedTable::kPartitionedFilterBlockPrefix = + "partitionedfilter."; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_builder.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_builder.h new file mode 100644 index 0000000000..3949474c58 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_builder.h @@ -0,0 +1,209 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include +#include +#include +#include +#include + +#include "db/version_edit.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "table/meta_blocks.h" +#include "table/table_builder.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +struct BlockBasedTableOptions; + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kLegacyBlockBasedTableMagicNumber; + +class BlockBasedTableBuilder : public TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). 
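The same build-then-Finish contract surfaces through RocksDB's public SstFileWriter, which drives a table builder internally. A small usage sketch, assuming the public SstFileWriter API and an illustrative helper name:

#include <rocksdb/options.h>
#include <rocksdb/sst_file_writer.h>
#include <string>

rocksdb::Status WriteTinySst(const std::string& path) {
  rocksdb::Options options;
  rocksdb::SstFileWriter writer(rocksdb::EnvOptions(), options);
  rocksdb::Status s = writer.Open(path);
  if (!s.ok()) return s;
  // Keys must be added in increasing order, mirroring the REQUIRES
  // clause on the builder's Add() below.
  s = writer.Put("key1", "value1");
  if (s.ok()) s = writer.Put("key2", "value2");
  // Finish() flushes the index, filter, and properties blocks plus the
  // footer, like BlockBasedTableBuilder::Finish() above.
  if (s.ok()) s = writer.Finish();
  return s;
}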
+ BlockBasedTableBuilder(const BlockBasedTableOptions& table_options, + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file); + + // No copying allowed + BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; + BlockBasedTableBuilder& operator=(const BlockBasedTableBuilder&) = delete; + + // REQUIRES: Either Finish() or Abandon() has been called. + ~BlockBasedTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: Unless key has type kTypeRangeDeletion, key is after any + // previously added non-kTypeRangeDeletion key according to + // comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Return non-ok iff some error happens during IO. + IOStatus io_status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + bool IsEmpty() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + // Estimated size of the file generated so far. This is used when + // FileSize() cannot estimate final SST size, e.g. parallel compression + // is enabled. + uint64_t EstimatedFileSize() const override; + + // Get the size of the "tail" part of a SST file. "Tail" refers to + // all blocks after data blocks till the end of the SST file. + uint64_t GetTailSize() const override; + + bool NeedCompact() const override; + + // Get table properties + TableProperties GetTableProperties() const override; + + // Get file checksum + std::string GetFileChecksum() const override; + + // Get file checksum function name + const char* GetFileChecksumFuncName() const override; + + void SetSeqnoTimeTableProperties( + const std::string& encoded_seqno_to_time_mapping, + uint64_t oldest_ancestor_time) override; + + private: + bool ok() const { return status().ok(); } + + // Transition state from buffered to unbuffered. See `Rep::State` API comment + // for details of the states. + // REQUIRES: `rep_->state == kBuffered` + void EnterUnbuffered(); + + // Call block's Finish() method and then + // - in buffered mode, buffer the uncompressed block contents. + // - in unbuffered mode, write the compressed block contents to file. + void WriteBlock(BlockBuilder* block, BlockHandle* handle, + BlockType blocktype); + + // Compress and write block content to the file. + void WriteBlock(const Slice& block_contents, BlockHandle* handle, + BlockType block_type); + // Directly write data to the file. 
+ void WriteMaybeCompressedBlock( + const Slice& block_contents, CompressionType, BlockHandle* handle, + BlockType block_type, const Slice* uncompressed_block_data = nullptr); + + void SetupCacheKeyPrefix(const TableBuilderOptions& tbo); + + template + Status InsertBlockInCache(const Slice& block_contents, + const BlockHandle* handle, BlockType block_type); + + Status InsertBlockInCacheHelper(const Slice& block_contents, + const BlockHandle* handle, + BlockType block_type); + + Status InsertBlockInCompressedCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle); + + void WriteFilterBlock(MetaIndexBuilder* meta_index_builder); + void WriteIndexBlock(MetaIndexBuilder* meta_index_builder, + BlockHandle* index_block_handle); + void WritePropertiesBlock(MetaIndexBuilder* meta_index_builder); + void WriteCompressionDictBlock(MetaIndexBuilder* meta_index_builder); + void WriteRangeDelBlock(MetaIndexBuilder* meta_index_builder); + void WriteFooter(BlockHandle& metaindex_block_handle, + BlockHandle& index_block_handle); + + struct Rep; + class BlockBasedTablePropertiesCollectorFactory; + class BlockBasedTablePropertiesCollector; + Rep* rep_; + + struct ParallelCompressionRep; + + // Advanced operation: flush any buffered key/value pairs to file. + // Can be used to ensure that two adjacent entries never live in + // the same data block. Most clients should not need to use this method. + // REQUIRES: Finish(), Abandon() have not been called + void Flush(); + + // Some compression libraries fail when the uncompressed size is bigger than + // int. If uncompressed size is bigger than kCompressionSizeLimit, don't + // compress it + const uint64_t kCompressionSizeLimit = std::numeric_limits::max(); + + // Get blocks from mem-table walking thread, compress them and + // pass them to the write thread. Used in parallel compression mode only + void BGWorkCompression(const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx); + + // Given uncompressed block content, try to compress it and return result and + // compression type + void CompressAndVerifyBlock(const Slice& uncompressed_block_data, + bool is_data_block, + const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx, + std::string* compressed_output, + Slice* result_block_contents, + CompressionType* result_compression_type, + Status* out_status); + + // Get compressed blocks from BGWorkCompression and write them into SST + void BGWorkWriteMaybeCompressedBlock(); + + // Initialize parallel compression context and + // start BGWorkCompression and BGWorkWriteMaybeCompressedBlock threads + void StartParallelCompression(); + + // Stop BGWorkCompression and BGWorkWriteMaybeCompressedBlock threads + void StopParallelCompression(); +}; + +Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info, + CompressionType* type, uint32_t format_version, + bool do_sample, std::string* compressed_output, + std::string* sampled_output_fast, + std::string* sampled_output_slow); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_factory.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_factory.cc new file mode 100644 index 0000000000..653c222d5e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_factory.cc @@ -0,0 +1,962 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/block_based/block_based_table_factory.h" + +#include + +#include +#include +#include + +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" +#include "logging/logging.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/options_type.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/format.h" +#include "util/mutexlock.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +void TailPrefetchStats::RecordEffectiveSize(size_t len) { + MutexLock l(&mutex_); + if (num_records_ < kNumTracked) { + num_records_++; + } + records_[next_++] = len; + if (next_ == kNumTracked) { + next_ = 0; + } +} + +size_t TailPrefetchStats::GetSuggestedPrefetchSize() { + std::vector<size_t> sorted; + { + MutexLock l(&mutex_); + + if (num_records_ == 0) { + return 0; + } + sorted.assign(records_, records_ + num_records_); + } + + // Of the historic sizes, we find the maximum one that satisfies the condition + // that if prefetching all, less than 1/8 will be wasted. + std::sort(sorted.begin(), sorted.end()); + + // Assuming we have 5 data points, and after sorting it looks like this: + // + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // +---+ | | | | + // | | | | | | + // +---+ | | | | | | + // | | | | | | | | + // +---+ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // and we use each of the values as a candidate, and estimate how much we + // wasted, compared to read. For example, when we use the 3rd record + // as the candidate. This area is what we read: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ***+ *** *** *** *** ** + // * | | | | | | + // +---+ | | | | | * + // * | | | | | | | | + // +---+ | | | | | | | * + // * | | | | X | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // *** *** ***-*** ***--*** ***--*** +**** + // which is (size of the record) X (number of records). + // + // While wasted is this area: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ****---+ | | | | + // * * | | | | | + // * *-*** *** | | | | | + // * * | | | | | | | + // *--** *** | | | | | | | + // | | | | | X | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // Which can be calculated iteratively.
+ // The difference between wasted using the 4th and 3rd record will + // be the following area: + // +---+ + // +--+ +-+ ++ +-+ +-+ +---+ | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // | xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // +-+ +-+ +-+ ++ +---+ +--+ | | | + // | | | | | | | + // +---+ ++ | | | | | | + // | | | | | | X | | | + // +---+ ++ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // which will be the size difference between the 4th and 3rd record, + // times 3, which is the number of records before the 4th. + // Here we assume that all data within the prefetch range will be useful. In + // reality, it may not be the case when a partial block is inside the range, + // or there is data in the middle that is not read. We ignore those cases + // for simplicity. + assert(!sorted.empty()); + size_t prev_size = sorted[0]; + size_t max_qualified_size = sorted[0]; + size_t wasted = 0; + for (size_t i = 1; i < sorted.size(); i++) { + size_t read = sorted[i] * sorted.size(); + wasted += (sorted[i] - prev_size) * i; + if (wasted <= read / 8) { + max_qualified_size = sorted[i]; + } + prev_size = sorted[i]; + } + const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB + return std::min(kMaxPrefetchSize, max_qualified_size); +} + + +const std::string kOptNameMetadataCacheOpts = "metadata_cache_options"; + +static std::unordered_map<std::string, PinningTier> + pinning_tier_type_string_map = { + {"kFallback", PinningTier::kFallback}, + {"kNone", PinningTier::kNone}, + {"kFlushedAndSimilar", PinningTier::kFlushedAndSimilar}, + {"kAll", PinningTier::kAll}}; + +static std::unordered_map<std::string, BlockBasedTableOptions::IndexType> + block_base_table_index_type_string_map = { + {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, + {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch}, + {"kTwoLevelIndexSearch", + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}, + {"kBinarySearchWithFirstKey", + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}}; + +static std::unordered_map<std::string, BlockBasedTableOptions::DataBlockIndexType> + block_base_table_data_block_index_type_string_map = { + {"kDataBlockBinarySearch", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch}, + {"kDataBlockBinaryAndHash", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}}; + +static std::unordered_map<std::string, BlockBasedTableOptions::IndexShorteningMode> + block_base_table_index_shortening_mode_string_map = { + {"kNoShortening", + BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, + {"kShortenSeparators", + BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, + {"kShortenSeparatorsAndSuccessor", + BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor}}; + +static std::unordered_map<std::string, OptionTypeInfo> + metadata_cache_options_type_info = { + {"top_level_index_pinning", + OptionTypeInfo::Enum<PinningTier>( + offsetof(struct MetadataCacheOptions, top_level_index_pinning), + &pinning_tier_type_string_map)}, + {"partition_pinning", + OptionTypeInfo::Enum<PinningTier>( + offsetof(struct MetadataCacheOptions, partition_pinning), + &pinning_tier_type_string_map)}, + {"unpartitioned_pinning", + OptionTypeInfo::Enum<PinningTier>( + offsetof(struct MetadataCacheOptions, unpartitioned_pinning), + &pinning_tier_type_string_map)}}; + +static std::unordered_map<std::string, BlockBasedTableOptions::PrepopulateBlockCache> + block_base_table_prepopulate_block_cache_string_map = { + {"kDisable", BlockBasedTableOptions::PrepopulateBlockCache::kDisable}, + {"kFlushOnly",
BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly}}; + + +static std::unordered_map + block_based_table_type_info = { + /* currently not supported + std::shared_ptr block_cache = nullptr; + CacheUsageOptions cache_usage_options; + */ + {"flush_block_policy_factory", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct BlockBasedTableOptions, + flush_block_policy_factory), + OptionVerificationType::kByName, OptionTypeFlags::kCompareNever)}, + {"cache_index_and_filter_blocks", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cache_index_and_filter_blocks_with_high_priority", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks_with_high_priority), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"pin_l0_filter_and_index_blocks_in_cache", + {offsetof(struct BlockBasedTableOptions, + pin_l0_filter_and_index_blocks_in_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"index_type", OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, index_type), + &block_base_table_index_type_string_map)}, + {"hash_index_allow_collision", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"data_block_index_type", + OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, data_block_index_type), + &block_base_table_data_block_index_type_string_map)}, + {"index_shortening", + OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, index_shortening), + &block_base_table_index_shortening_mode_string_map)}, + {"data_block_hash_table_util_ratio", + {offsetof(struct BlockBasedTableOptions, + data_block_hash_table_util_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"checksum", + {offsetof(struct BlockBasedTableOptions, checksum), + OptionType::kChecksumType, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"no_block_cache", + {offsetof(struct BlockBasedTableOptions, no_block_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"block_size", + {offsetof(struct BlockBasedTableOptions, block_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"block_size_deviation", + {offsetof(struct BlockBasedTableOptions, block_size_deviation), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"block_restart_interval", + {offsetof(struct BlockBasedTableOptions, block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"index_block_restart_interval", + {offsetof(struct BlockBasedTableOptions, index_block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"index_per_partition", + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"metadata_block_size", + {offsetof(struct BlockBasedTableOptions, metadata_block_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"partition_filters", + {offsetof(struct BlockBasedTableOptions, partition_filters), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"optimize_filters_for_memory", + {offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory), + OptionType::kBoolean, 
OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"filter_policy", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct BlockBasedTableOptions, filter_policy), + OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kNone)}, + {"whole_key_filtering", + {offsetof(struct BlockBasedTableOptions, whole_key_filtering), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"detect_filter_construct_corruption", + {offsetof(struct BlockBasedTableOptions, + detect_filter_construct_corruption), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"reserve_table_builder_memory", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"reserve_table_reader_memory", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"skip_table_builder_flush", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"format_version", + {offsetof(struct BlockBasedTableOptions, format_version), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"verify_compression", + {offsetof(struct BlockBasedTableOptions, verify_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"read_amp_bytes_per_bit", + {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + // A workaround to fix a bug in 6.10, 6.11, 6.12, 6.13 + // and 6.14. The bug will write out 8 bytes to OPTIONS file from the + // starting address of BlockBasedTableOptions.read_amp_bytes_per_bit + // which is actually a uint32. Consequently, the value of + // read_amp_bytes_per_bit written in the OPTIONS file is wrong. + // From 6.15, RocksDB will try to parse the read_amp_bytes_per_bit + // from OPTIONS file as a uint32. To be able to load OPTIONS file + // generated by affected releases before the fix, we need to + // manually parse read_amp_bytes_per_bit with this special hack. 
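Distilled, the workaround in the parse function that follows is just "read the value as 64 bits, keep the low 32". A standalone sketch of the same idea (hypothetical helper name, illustrative only):

#include <cstdint>
#include <string>

// Affected releases wrote 8 bytes for this 4-byte field, so the textual
// value may not fit in 32 bits; parse wide and truncate, as the option's
// parse function does with ParseUint64 plus a cast.
uint32_t ParseLegacyReadAmpBytesPerBit(const std::string& value) {
  const uint64_t wide = std::stoull(value);
  return static_cast<uint32_t>(wide);
}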
+ uint64_t read_amp_bytes_per_bit = ParseUint64(value); + *(static_cast(addr)) = + static_cast(read_amp_bytes_per_bit); + return Status::OK(); + }}}, + {"enable_index_compression", + {offsetof(struct BlockBasedTableOptions, enable_index_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"block_align", + {offsetof(struct BlockBasedTableOptions, block_align), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"pin_top_level_index_and_filter", + {offsetof(struct BlockBasedTableOptions, + pin_top_level_index_and_filter), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {kOptNameMetadataCacheOpts, + OptionTypeInfo::Struct( + kOptNameMetadataCacheOpts, &metadata_cache_options_type_info, + offsetof(struct BlockBasedTableOptions, metadata_cache_options), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + {"block_cache", + {offsetof(struct BlockBasedTableOptions, block_cache), + OptionType::kUnknown, OptionVerificationType::kNormal, + (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), + // Parses the input value as a Cache + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* cache = static_cast*>(addr); + return Cache::CreateFromString(opts, value, cache); + }}}, + {"block_cache_compressed", + {0, OptionType::kUnknown, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"max_auto_readahead_size", + {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"prepopulate_block_cache", + OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, prepopulate_block_cache), + &block_base_table_prepopulate_block_cache_string_map, + OptionTypeFlags::kMutable)}, + {"initial_auto_readahead_size", + {offsetof(struct BlockBasedTableOptions, initial_auto_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"num_file_reads_for_auto_readahead", + {offsetof(struct BlockBasedTableOptions, + num_file_reads_for_auto_readahead), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + +}; + +// TODO(myabandeh): We should return an error instead of silently changing the +// options +BlockBasedTableFactory::BlockBasedTableFactory( + const BlockBasedTableOptions& _table_options) + : table_options_(_table_options) { + InitializeOptions(); + RegisterOptions(&table_options_, &block_based_table_type_info); + + const auto table_reader_charged = + table_options_.cache_usage_options.options_overrides + .at(CacheEntryRole::kBlockBasedTableReader) + .charged; + if (table_options_.block_cache && + table_reader_charged == CacheEntryRoleOptions::Decision::kEnabled) { + table_reader_cache_res_mgr_.reset(new ConcurrentCacheReservationManager( + std::make_shared>( + table_options_.block_cache))); + } +} + +void BlockBasedTableFactory::InitializeOptions() { + if (table_options_.flush_block_policy_factory == nullptr) { + table_options_.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory()); + } + if (table_options_.no_block_cache) { + table_options_.block_cache.reset(); + } else if (table_options_.block_cache == nullptr) { + LRUCacheOptions co; + // 32MB, the recommended minimum size for 64 shards, to reduce contention + co.capacity = 32 << 20; + table_options_.block_cache = NewLRUCache(co); + } + if 
(table_options_.block_size_deviation < 0 || + table_options_.block_size_deviation > 100) { + table_options_.block_size_deviation = 0; + } + if (table_options_.block_restart_interval < 1) { + table_options_.block_restart_interval = 1; + } + if (table_options_.index_block_restart_interval < 1) { + table_options_.index_block_restart_interval = 1; + } + if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && + table_options_.index_block_restart_interval != 1) { + // Currently kHashSearch is incompatible with + // index_block_restart_interval > 1 + table_options_.index_block_restart_interval = 1; + } + if (table_options_.partition_filters && + table_options_.index_type != + BlockBasedTableOptions::kTwoLevelIndexSearch) { + // We do not support partitioned filters without partitioning indexes + table_options_.partition_filters = false; + } + auto& options_overrides = + table_options_.cache_usage_options.options_overrides; + const auto options = table_options_.cache_usage_options.options; + for (std::uint32_t i = 0; i < kNumCacheEntryRoles; ++i) { + CacheEntryRole role = static_cast(i); + auto options_overrides_iter = options_overrides.find(role); + if (options_overrides_iter == options_overrides.end()) { + options_overrides.insert({role, options}); + } else if (options_overrides_iter->second.charged == + CacheEntryRoleOptions::Decision::kFallback) { + options_overrides_iter->second.charged = options.charged; + } + } +} + +Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) { + InitializeOptions(); + return TableFactory::PrepareOptions(opts); +} + +namespace { +// Different cache kinds use the same keys for physically different values, so +// they must not share an underlying key space with each other. +Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { + int cache_count = (bbto.block_cache != nullptr) + + (bbto.persistent_cache != nullptr); + if (cache_count <= 1) { + // Nothing to share / overlap + return Status::OK(); + } + + // More complex test of shared key space, in case the instances are wrappers + // for some shared underlying cache. + static Cache::CacheItemHelper kHelper{CacheEntryRole::kMisc}; + CacheKey sentinel_key = CacheKey::CreateUniqueForProcessLifetime(); + struct SentinelValue { + explicit SentinelValue(char _c) : c(_c) {} + char c; + }; + static SentinelValue kRegularBlockCacheMarker{'b'}; + static char kPersistentCacheMarker{'p'}; + if (bbto.block_cache) { + bbto.block_cache + ->Insert(sentinel_key.AsSlice(), &kRegularBlockCacheMarker, &kHelper, 1) + .PermitUncheckedError(); + } + if (bbto.persistent_cache) { + // Note: persistent cache copies the data, not keeping the pointer + bbto.persistent_cache + ->Insert(sentinel_key.AsSlice(), &kPersistentCacheMarker, 1) + .PermitUncheckedError(); + } + // If we get something different from what we inserted, that indicates + // dangerously overlapping key spaces. 
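The probe that follows boils down to: write a distinct marker into each cache under one process-unique key, read both back, and treat a crossed marker as proof of a shared key space. A toy analogue with plain maps standing in for caches (illustrative only, not the RocksDB cache API):

#include <map>
#include <string>

// Returns true when the two "caches" alias the same storage: the second
// insert then overwrites the first, and cache1 hands back cache2's marker.
bool SharesKeySpace(std::map<std::string, char>& cache1,
                    std::map<std::string, char>& cache2) {
  const std::string sentinel = "process-unique-sentinel-key";
  cache1[sentinel] = '1';
  cache2[sentinel] = '2';
  return cache1[sentinel] == '2';
}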
+ if (bbto.block_cache) { + auto handle = bbto.block_cache->Lookup(sentinel_key.AsSlice()); + if (handle) { + auto v = static_cast(bbto.block_cache->Value(handle)); + char c = v->c; + bbto.block_cache->Release(handle); + if (c == kPersistentCacheMarker) { + return Status::InvalidArgument( + "block_cache and persistent_cache share the same key space, " + "which is not supported"); + } else if (v != &kRegularBlockCacheMarker) { + return Status::Corruption("Unexpected mutation to block_cache"); + } + } + } + + if (bbto.persistent_cache) { + std::unique_ptr data; + size_t size = 0; + bbto.persistent_cache->Lookup(sentinel_key.AsSlice(), &data, &size) + .PermitUncheckedError(); + if (data && size > 0) { + if (data[0] == kRegularBlockCacheMarker.c) { + return Status::InvalidArgument( + "persistent_cache and block_cache share the same key space, " + "which is not supported"); + } else if (data[0] != kPersistentCacheMarker) { + return Status::Corruption("Unexpected mutation to persistent_cache"); + } + } + } + return Status::OK(); +} + +} // namespace + +Status BlockBasedTableFactory::NewTableReader( + const ReadOptions& ro, const TableReaderOptions& table_reader_options, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, + bool prefetch_index_and_filter_in_cache) const { + return BlockBasedTable::Open( + ro, table_reader_options.ioptions, table_reader_options.env_options, + table_options_, table_reader_options.internal_comparator, std::move(file), + file_size, table_reader_options.block_protection_bytes_per_key, + table_reader, table_reader_options.tail_size, table_reader_cache_res_mgr_, + table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, + table_reader_options.skip_filters, table_reader_options.level, + table_reader_options.immortal, table_reader_options.largest_seqno, + table_reader_options.force_direct_prefetch, &tail_prefetch_stats_, + table_reader_options.block_cache_tracer, + table_reader_options.max_file_size_for_l0_meta_pin, + table_reader_options.cur_db_session_id, table_reader_options.cur_file_num, + table_reader_options.unique_id); +} + +TableBuilder* BlockBasedTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const { + return new BlockBasedTableBuilder(table_options_, table_builder_options, + file); +} + +Status BlockBasedTableFactory::ValidateOptions( + const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { + if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && + cf_opts.prefix_extractor == nullptr) { + return Status::InvalidArgument( + "Hash index is specified for block-based " + "table, but prefix_extractor is not given"); + } + if (table_options_.cache_index_and_filter_blocks && + table_options_.no_block_cache) { + return Status::InvalidArgument( + "Enable cache_index_and_filter_blocks, " + ", but block cache is disabled"); + } + if (table_options_.pin_l0_filter_and_index_blocks_in_cache && + table_options_.no_block_cache) { + return Status::InvalidArgument( + "Enable pin_l0_filter_and_index_blocks_in_cache, " + ", but block cache is disabled"); + } + if (!IsSupportedFormatVersion(table_options_.format_version)) { + return Status::InvalidArgument( + "Unsupported BlockBasedTable format_version. 
Please check " + "include/rocksdb/table.h for more info"); + } + if (table_options_.block_align && (cf_opts.compression != kNoCompression)) { + return Status::InvalidArgument( + "Enable block_align, but compression " + "enabled"); + } + if (table_options_.block_align && + (table_options_.block_size & (table_options_.block_size - 1))) { + return Status::InvalidArgument( + "Block alignment requested but block size is not a power of 2"); + } + if (table_options_.block_size > std::numeric_limits::max()) { + return Status::InvalidArgument( + "block size exceeds maximum number (4GiB) allowed"); + } + if (table_options_.data_block_index_type == + BlockBasedTableOptions::kDataBlockBinaryAndHash && + table_options_.data_block_hash_table_util_ratio <= 0) { + return Status::InvalidArgument( + "data_block_hash_table_util_ratio should be greater than 0 when " + "data_block_index_type is set to kDataBlockBinaryAndHash"); + } + if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) { + // TODO(myabandeh): support it + return Status::InvalidArgument( + "max_successive_merges larger than 0 is currently inconsistent with " + "unordered_write"); + } + const auto& options_overrides = + table_options_.cache_usage_options.options_overrides; + for (auto options_overrides_iter = options_overrides.cbegin(); + options_overrides_iter != options_overrides.cend(); + ++options_overrides_iter) { + const CacheEntryRole role = options_overrides_iter->first; + const CacheEntryRoleOptions options = options_overrides_iter->second; + static const std::set kMemoryChargingSupported = { + CacheEntryRole::kCompressionDictionaryBuildingBuffer, + CacheEntryRole::kFilterConstruction, + CacheEntryRole::kBlockBasedTableReader, CacheEntryRole::kFileMetadata, + CacheEntryRole::kBlobCache}; + if (options.charged != CacheEntryRoleOptions::Decision::kFallback && + kMemoryChargingSupported.count(role) == 0) { + return Status::NotSupported( + "Enable/Disable CacheEntryRoleOptions::charged" + " for CacheEntryRole " + + kCacheEntryRoleToCamelString[static_cast(role)] + + " is not supported"); + } + if (table_options_.no_block_cache && + options.charged == CacheEntryRoleOptions::Decision::kEnabled) { + return Status::InvalidArgument( + "Enable CacheEntryRoleOptions::charged" + " for CacheEntryRole " + + kCacheEntryRoleToCamelString[static_cast(role)] + + " but block cache is disabled"); + } + if (role == CacheEntryRole::kBlobCache && + options.charged == CacheEntryRoleOptions::Decision::kEnabled) { + if (cf_opts.blob_cache == nullptr) { + return Status::InvalidArgument( + "Enable CacheEntryRoleOptions::charged" + " for CacheEntryRole " + + kCacheEntryRoleToCamelString[static_cast(role)] + + " but blob cache is not configured"); + } + if (table_options_.no_block_cache) { + return Status::InvalidArgument( + "Enable CacheEntryRoleOptions::charged" + " for CacheEntryRole " + + kCacheEntryRoleToCamelString[static_cast(role)] + + " but block cache is disabled"); + } + if (table_options_.block_cache == cf_opts.blob_cache) { + return Status::InvalidArgument( + "Enable CacheEntryRoleOptions::charged" + " for CacheEntryRole " + + kCacheEntryRoleToCamelString[static_cast(role)] + + " but blob cache is the same as block cache"); + } + if (cf_opts.blob_cache->GetCapacity() > + table_options_.block_cache->GetCapacity()) { + return Status::InvalidArgument( + "Enable CacheEntryRoleOptions::charged" + " for CacheEntryRole " + + kCacheEntryRoleToCamelString[static_cast(role)] + + " but blob cache capacity is larger than block cache capacity"); + } + } 
+ } + { + Status s = CheckCacheOptionCompatibility(table_options_); + if (!s.ok()) { + return s; + } + } + std::string garbage; + if (!SerializeEnum(checksum_type_string_map, + table_options_.checksum, &garbage)) { + return Status::InvalidArgument( + "Unrecognized ChecksumType for checksum: " + + std::to_string(static_cast(table_options_.checksum))); + } + return TableFactory::ValidateOptions(db_opts, cf_opts); +} + +std::string BlockBasedTableFactory::GetPrintableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " flush_block_policy_factory: %s (%p)\n", + table_options_.flush_block_policy_factory->Name(), + static_cast(table_options_.flush_block_policy_factory.get())); + ret.append(buffer); + snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks: %d\n", + table_options_.cache_index_and_filter_blocks); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " cache_index_and_filter_blocks_with_high_priority: %d\n", + table_options_.cache_index_and_filter_blocks_with_high_priority); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " pin_l0_filter_and_index_blocks_in_cache: %d\n", + table_options_.pin_l0_filter_and_index_blocks_in_cache); + ret.append(buffer); + snprintf(buffer, kBufferSize, " pin_top_level_index_and_filter: %d\n", + table_options_.pin_top_level_index_and_filter); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_type: %d\n", + table_options_.index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_index_type: %d\n", + table_options_.data_block_index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_shortening: %d\n", + static_cast(table_options_.index_shortening)); + ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n", + table_options_.data_block_hash_table_util_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " checksum: %d\n", table_options_.checksum); + ret.append(buffer); + snprintf(buffer, kBufferSize, " no_block_cache: %d\n", + table_options_.no_block_cache); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_cache: %p\n", + static_cast(table_options_.block_cache.get())); + ret.append(buffer); + if (table_options_.block_cache) { + const char* block_cache_name = table_options_.block_cache->Name(); + if (block_cache_name != nullptr) { + snprintf(buffer, kBufferSize, " block_cache_name: %s\n", + block_cache_name); + ret.append(buffer); + } + ret.append(" block_cache_options:\n"); + ret.append(table_options_.block_cache->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " persistent_cache: %p\n", + static_cast(table_options_.persistent_cache.get())); + ret.append(buffer); + if (table_options_.persistent_cache) { + snprintf(buffer, kBufferSize, " persistent_cache_options:\n"); + ret.append(buffer); + ret.append(table_options_.persistent_cache->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " block_size: %" PRIu64 "\n", + table_options_.block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_size_deviation: %d\n", + table_options_.block_size_deviation); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_restart_interval: %d\n", + table_options_.block_restart_interval); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n", + table_options_.index_block_restart_interval); + ret.append(buffer); + snprintf(buffer, kBufferSize, " metadata_block_size: %" 
PRIu64 "\n", + table_options_.metadata_block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " partition_filters: %d\n", + table_options_.partition_filters); + ret.append(buffer); + snprintf(buffer, kBufferSize, " use_delta_encoding: %d\n", + table_options_.use_delta_encoding); + ret.append(buffer); + snprintf(buffer, kBufferSize, " filter_policy: %s\n", + table_options_.filter_policy == nullptr + ? "nullptr" + : table_options_.filter_policy->Name()); + ret.append(buffer); + snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", + table_options_.whole_key_filtering); + ret.append(buffer); + snprintf(buffer, kBufferSize, " verify_compression: %d\n", + table_options_.verify_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " read_amp_bytes_per_bit: %d\n", + table_options_.read_amp_bytes_per_bit); + ret.append(buffer); + snprintf(buffer, kBufferSize, " format_version: %d\n", + table_options_.format_version); + ret.append(buffer); + snprintf(buffer, kBufferSize, " enable_index_compression: %d\n", + table_options_.enable_index_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_align: %d\n", + table_options_.block_align); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " max_auto_readahead_size: %" ROCKSDB_PRIszt "\n", + table_options_.max_auto_readahead_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " prepopulate_block_cache: %d\n", + static_cast(table_options_.prepopulate_block_cache)); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " initial_auto_readahead_size: %" ROCKSDB_PRIszt "\n", + table_options_.initial_auto_readahead_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " num_file_reads_for_auto_readahead: %" PRIu64 "\n", + table_options_.num_file_reads_for_auto_readahead); + ret.append(buffer); + return ret; +} + +const void* BlockBasedTableFactory::GetOptionsPtr( + const std::string& name) const { + if (name == kBlockCacheOpts()) { + if (table_options_.no_block_cache) { + return nullptr; + } else { + return table_options_.block_cache.get(); + } + } else { + return TableFactory::GetOptionsPtr(name); + } +} + +// Take a default BlockBasedTableOptions "table_options" in addition to a +// map "opts_map" of option name to option value to construct the new +// BlockBasedTableOptions "new_table_options". +// +// Below are the instructions of how to config some non-primitive-typed +// options in BlockBasedTableOptions: +// +// * filter_policy: +// We currently only support the following FilterPolicy in the convenience +// functions: +// - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]" +// to specify BloomFilter. The above string is equivalent to calling +// NewBloomFilterPolicy(bits_per_key, use_block_based_builder). +// [Example]: +// - Pass {"filter_policy", "bloomfilter:4:true"} in +// GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits +// per key and use_block_based_builder enabled. +// +// * block_cache / block_cache_compressed: +// We currently only support LRU cache in the GetOptions API. The LRU +// cache can be set by directly specifying its size. +// [Example]: +// - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is +// equivalent to setting block_cache using NewLRUCache(1024 * 1024). +// +// @param table_options the default options of the output "new_table_options". +// @param opts_map an option name to value map for specifying how +// "new_table_options" should be set. 
+// @param new_table_options the resulting options based on "table_options" +// with the change specified in "opts_map". +// @param input_strings_escaped when set to true, each escaped characters +// prefixed by '\' in the values of the opts_map will be further converted +// back to the raw string before assigning to the associated options. +// @param ignore_unknown_options when set to true, unknown options are ignored +// instead of resulting in an unknown-option error. +// @return Status::OK() on success. Otherwise, a non-ok status indicating +// error will be returned, and "new_table_options" will be set to +// "table_options". +Status BlockBasedTableFactory::ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& opt_value, + void* opt_ptr) { + Status status = TableFactory::ParseOption(config_options, opt_info, opt_name, + opt_value, opt_ptr); + if (config_options.input_strings_escaped && !status.ok()) { // Got an error + // !input_strings_escaped indicates the old API, where everything is + // parsable. + if (opt_info.IsByName()) { + status = Status::OK(); + } + } + return status; +} + +Status GetBlockBasedTableOptionsFromString( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options) { + std::unordered_map opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + s = GetBlockBasedTableOptionsFromMap(config_options, table_options, opts_map, + new_table_options); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); + } +} + +Status GetBlockBasedTableOptionsFromMap( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, + const std::unordered_map& opts_map, + BlockBasedTableOptions* new_table_options) { + assert(new_table_options); + BlockBasedTableFactory bbtf(table_options); + Status s = bbtf.ConfigureFromMap(config_options, opts_map); + if (s.ok()) { + *new_table_options = *(bbtf.GetOptions()); + } else { + *new_table_options = table_options; + } + return s; +} + +TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& _table_options) { + return new BlockBasedTableFactory(_table_options); +} + +const std::string BlockBasedTablePropertyNames::kIndexType = + "rocksdb.block.based.table.index.type"; +const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering = + "rocksdb.block.based.table.whole.key.filtering"; +const std::string BlockBasedTablePropertyNames::kPrefixFiltering = + "rocksdb.block.based.table.prefix.filtering"; +const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes"; +const std::string kHashIndexPrefixesMetadataBlock = + "rocksdb.hashindex.metadata"; +const std::string kPropTrue = "1"; +const std::string kPropFalse = "0"; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_factory.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_factory.h new file mode 100644 index 0000000000..1f78769777 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_factory.h @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include +#include + +#include "cache/cache_reservation_manager.h" +#include "port/port.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { +struct ColumnFamilyOptions; +struct ConfigOptions; +struct DBOptions; +struct EnvOptions; + +class BlockBasedTableBuilder; +class RandomAccessFileReader; +class WritableFileWriter; + +// TODO: deprecate this class as it can be replaced with +// `FileMetaData::tail_size` +// +// A class used to track actual bytes written from the tail in the recent SST +// file opens, and provide a suggestion for the following open. +class TailPrefetchStats { + public: + void RecordEffectiveSize(size_t len); + // 0 indicates no information is available. + size_t GetSuggestedPrefetchSize(); + + private: + const static size_t kNumTracked = 32; + size_t records_[kNumTracked]; + port::Mutex mutex_; + size_t next_ = 0; + size_t num_records_ = 0; +}; + +class BlockBasedTableFactory : public TableFactory { + public: + explicit BlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + + ~BlockBasedTableFactory() {} + + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kBlockBasedTableName(); } + + const char* Name() const override { return kBlockBasedTableName(); } + + using TableFactory::NewTableReader; + Status NewTableReader( + const ReadOptions& ro, const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + bool prefetch_index_and_filter_in_cache = true) const override; + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const override; + + // Validates the specified DB Options.
+ Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override; + Status PrepareOptions(const ConfigOptions& opts) override; + + std::string GetPrintableOptions() const override; + + bool IsDeleteRangeSupported() const override { return true; } + + TailPrefetchStats* tail_prefetch_stats() { return &tail_prefetch_stats_; } + + protected: + const void* GetOptionsPtr(const std::string& name) const override; + Status ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, const std::string& opt_value, + void* opt_ptr) override; + void InitializeOptions(); + + private: + BlockBasedTableOptions table_options_; + std::shared_ptr table_reader_cache_res_mgr_; + mutable TailPrefetchStats tail_prefetch_stats_; +}; + +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; +extern const std::string kPropTrue; +extern const std::string kPropFalse; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_iterator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_iterator.cc new file mode 100644 index 0000000000..57744a5877 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_iterator.cc @@ -0,0 +1,500 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/block_based_table_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +void BlockBasedTableIterator::SeekToFirst() { SeekImpl(nullptr, false); } + +void BlockBasedTableIterator::Seek(const Slice& target) { + SeekImpl(&target, true); +} + +void BlockBasedTableIterator::SeekImpl(const Slice* target, + bool async_prefetch) { + bool is_first_pass = true; + if (async_read_in_progress_) { + AsyncInitDataBlock(false); + is_first_pass = false; + } + + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + seek_stat_state_ = kNone; + bool filter_checked = false; + if (target && + !CheckPrefixMayMatch(*target, IterDirection::kForward, &filter_checked)) { + ResetDataIter(); + RecordTick(table_->GetStatistics(), is_last_level_ + ? LAST_LEVEL_SEEK_FILTERED + : NON_LAST_LEVEL_SEEK_FILTERED); + return; + } + if (filter_checked) { + seek_stat_state_ = kFilterUsed; + RecordTick(table_->GetStatistics(), is_last_level_ + ? LAST_LEVEL_SEEK_FILTER_MATCH + : NON_LAST_LEVEL_SEEK_FILTER_MATCH); + } + + bool need_seek_index = true; + if (block_iter_points_to_real_block_ && block_iter_.Valid()) { + // Reseek. + prev_block_offset_ = index_iter_->value().handle.offset(); + + if (target) { + // We can avoid an index seek if: + // 1. The new seek key is larger than the current key + // 2. The new seek key is within the upper bound of the block + // Since we don't necessarily know the internal key for either + // the current key or the upper bound, we check user keys and + // exclude the equality case. Considering internal keys can + // improve for the boundary cases, but it would complicate the + // code. 
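The condition that follows implements exactly this rule; distilled into a standalone predicate (illustrative; the real code compares user keys through user_comparator_ rather than plain strings):

#include <string>

// A reseek may stay in the current data block iff the new target lies
// strictly after the current position and strictly before the block's
// index separator (its upper bound). Equality is excluded because only
// user keys are compared, not full internal keys.
bool CanSkipIndexSeek(const std::string& target_user_key,
                      const std::string& current_user_key,
                      const std::string& block_separator_user_key) {
  return target_user_key > current_user_key &&
         target_user_key < block_separator_user_key;
}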
+      if (user_comparator_.Compare(ExtractUserKey(*target),
+                                   block_iter_.user_key()) > 0 &&
+          user_comparator_.Compare(ExtractUserKey(*target),
+                                   index_iter_->user_key()) < 0) {
+        need_seek_index = false;
+      }
+    }
+  }
+
+  if (need_seek_index) {
+    if (target) {
+      index_iter_->Seek(*target);
+    } else {
+      index_iter_->SeekToFirst();
+    }
+
+    if (!index_iter_->Valid()) {
+      ResetDataIter();
+      return;
+    }
+  }
+
+  IndexValue v = index_iter_->value();
+  const bool same_block = block_iter_points_to_real_block_ &&
+                          v.handle.offset() == prev_block_offset_;
+
+  if (!v.first_internal_key.empty() && !same_block &&
+      (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
+      allow_unprepared_value_) {
+    // Index contains the first key of the block, and it's >= target.
+    // We can defer reading the block.
+    is_at_first_key_from_index_ = true;
+    // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
+    // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
+    // as that will be done later when the data block is actually read.
+    ResetDataIter();
+  } else {
+    // Need to use the data block.
+    if (!same_block) {
+      if (read_options_.async_io && async_prefetch) {
+        if (is_first_pass) {
+          AsyncInitDataBlock(is_first_pass);
+        }
+        if (async_read_in_progress_) {
+          // Status::TryAgain indicates that an asynchronous request to
+          // retrieve the data block has been submitted. So return at this
+          // point; Seek should be called again to retrieve the requested
+          // block and execute the remaining code.
+          return;
+        }
+      } else {
+        InitDataBlock();
+      }
+    } else {
+      // When the user does a reseek, the iterate_upper_bound might have
+      // changed. CheckDataBlockWithinUpperBound() needs to be called
+      // explicitly if the reseek ends up in the same data block.
+      // If the reseek ends up in a different block, InitDataBlock() will do
+      // the iterator upper bound check.
+      CheckDataBlockWithinUpperBound();
+    }
+
+    if (target) {
+      block_iter_.Seek(*target);
+    } else {
+      block_iter_.SeekToFirst();
+    }
+    FindKeyForward();
+  }
+
+  CheckOutOfBound();
+
+  if (target) {
+    assert(!Valid() || icomp_.Compare(*target, key()) <= 0);
+  }
+}
+
+void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
+  is_out_of_bound_ = false;
+  is_at_first_key_from_index_ = false;
+  seek_stat_state_ = kNone;
+  bool filter_checked = false;
+  // For now totally disable prefix seek in auto prefix mode, because we don't
+  // have the logic to guarantee the same result as total order for backward
+  // iteration.
+  if (!CheckPrefixMayMatch(target, IterDirection::kBackward, &filter_checked)) {
+    ResetDataIter();
+    RecordTick(table_->GetStatistics(), is_last_level_
+                                            ? LAST_LEVEL_SEEK_FILTERED
+                                            : NON_LAST_LEVEL_SEEK_FILTERED);
+    return;
+  }
+  if (filter_checked) {
+    seek_stat_state_ = kFilterUsed;
+    RecordTick(table_->GetStatistics(), is_last_level_
+                                            ? LAST_LEVEL_SEEK_FILTER_MATCH
+                                            : NON_LAST_LEVEL_SEEK_FILTER_MATCH);
+  }
+
+  SavePrevIndexValue();
+
+  // Call Seek() rather than SeekForPrev() in the index block, because the
+  // target data block will likely contain the position for `target`, the
+  // same as Seek(), rather than the position before it.
+  // For example, if we have three data blocks, each containing two keys:
+  //   [2, 4]  [6, 8] [10, 12]
+  //  (the keys in the index block would be [4, 8, 12])
+  // and the user calls SeekForPrev(7), we need to go to the second block,
+  // just like if they call Seek(7).
+  // The only case where the block is different is when they seek to a position
+  // in the boundary.
For example, if they SeekForPrev(5), we should go to the + // first block, rather than the second. However, we don't have the information + // to distinguish the two unless we read the second block. In this case, we'll + // end up with reading two blocks. + index_iter_->Seek(target); + + if (!index_iter_->Valid()) { + auto seek_status = index_iter_->status(); + // Check for IO error + if (!seek_status.IsNotFound() && !seek_status.ok()) { + ResetDataIter(); + return; + } + + // With prefix index, Seek() returns NotFound if the prefix doesn't exist + if (seek_status.IsNotFound()) { + // Any key less than the target is fine for prefix seek + ResetDataIter(); + return; + } else { + index_iter_->SeekToLast(); + } + // Check for IO error + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + InitDataBlock(); + + block_iter_.SeekForPrev(target); + + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); + assert(!block_iter_.Valid() || + icomp_.Compare(target, block_iter_.key()) >= 0); +} + +void BlockBasedTableIterator::SeekToLast() { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + seek_stat_state_ = kNone; + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + InitDataBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); +} + +void BlockBasedTableIterator::Next() { + if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { + return; + } + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); + CheckOutOfBound(); +} + +bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->bound_check_result = UpperBoundCheckResult(); + result->value_prepared = !is_at_first_key_from_index_; + } + return is_valid; +} + +void BlockBasedTableIterator::Prev() { + if (is_at_first_key_from_index_) { + is_at_first_key_from_index_ = false; + + index_iter_->Prev(); + if (!index_iter_->Valid()) { + return; + } + + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + } + + FindKeyBackward(); +} + +void BlockBasedTableIterator::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value().handle; + if (!block_iter_points_to_real_block_ || + data_block_handle.offset() != prev_block_offset_ || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetDataIter(); + } + auto* rep = table_->get_rep(); + + bool is_for_compaction = + lookup_context_.caller == TableReaderCaller::kCompaction; + // Prefetch additional data for range scans (iterators). + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is set. 
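+    // (Sketch of the default behavior, as an illustration rather than a
+    // guarantee: with readahead_size == 0, the first two sequential reads do
+    // no readahead; after that the window starts at
+    // initial_auto_readahead_size and roughly doubles per read, capped at
+    // max_auto_readahead_size.)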
+    block_prefetcher_.PrefetchIfNeeded(
+        rep, data_block_handle, read_options_.readahead_size, is_for_compaction,
+        /*no_sequential_checking=*/false, read_options_.rate_limiter_priority);
+    Status s;
+    table_->NewDataBlockIterator<DataBlockIter>(
+        read_options_, data_block_handle, &block_iter_, BlockType::kData,
+        /*get_context=*/nullptr, &lookup_context_,
+        block_prefetcher_.prefetch_buffer(),
+        /*for_compaction=*/is_for_compaction, /*async_read=*/false, s);
+    block_iter_points_to_real_block_ = true;
+    CheckDataBlockWithinUpperBound();
+    if (!is_for_compaction &&
+        (seek_stat_state_ & kDataBlockReadSinceLastSeek) == 0) {
+      RecordTick(table_->GetStatistics(), is_last_level_
+                                              ? LAST_LEVEL_SEEK_DATA
+                                              : NON_LAST_LEVEL_SEEK_DATA);
+      seek_stat_state_ = static_cast<SeekStatState>(
+          seek_stat_state_ | kDataBlockReadSinceLastSeek | kReportOnUseful);
+    }
+  }
+}
+
+void BlockBasedTableIterator::AsyncInitDataBlock(bool is_first_pass) {
+  BlockHandle data_block_handle = index_iter_->value().handle;
+  bool is_for_compaction =
+      lookup_context_.caller == TableReaderCaller::kCompaction;
+  if (is_first_pass) {
+    if (!block_iter_points_to_real_block_ ||
+        data_block_handle.offset() != prev_block_offset_ ||
+        // if the previous attempt to read the block missed cache, try again
+        block_iter_.status().IsIncomplete()) {
+      if (block_iter_points_to_real_block_) {
+        ResetDataIter();
+      }
+      auto* rep = table_->get_rep();
+      // Prefetch additional data for range scans (iterators).
+      // Implicit auto readahead:
+      //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
+      // Explicit user requested readahead:
+      //   Enabled from the very first IO when ReadOptions.readahead_size is
+      //   set.
+      // In case of async_io with implicit readahead, block_prefetcher_ will
+      // always create the prefetch buffer by setting no_sequential_checking
+      // = true.
+      block_prefetcher_.PrefetchIfNeeded(
+          rep, data_block_handle, read_options_.readahead_size,
+          is_for_compaction, /*no_sequential_checking=*/read_options_.async_io,
+          read_options_.rate_limiter_priority);
+
+      Status s;
+      table_->NewDataBlockIterator<DataBlockIter>(
+          read_options_, data_block_handle, &block_iter_, BlockType::kData,
+          /*get_context=*/nullptr, &lookup_context_,
+          block_prefetcher_.prefetch_buffer(),
+          /*for_compaction=*/is_for_compaction, /*async_read=*/true, s);
+
+      if (s.IsTryAgain()) {
+        async_read_in_progress_ = true;
+        return;
+      }
+    }
+  } else {
+    // Second pass will call Poll to get the data block that was
+    // requested asynchronously.
+    Status s;
+    table_->NewDataBlockIterator<DataBlockIter>(
+        read_options_, data_block_handle, &block_iter_, BlockType::kData,
+        /*get_context=*/nullptr, &lookup_context_,
+        block_prefetcher_.prefetch_buffer(),
+        /*for_compaction=*/is_for_compaction, /*async_read=*/false, s);
+  }
+  block_iter_points_to_real_block_ = true;
+  CheckDataBlockWithinUpperBound();
+
+  if (!is_for_compaction &&
+      (seek_stat_state_ & kDataBlockReadSinceLastSeek) == 0) {
+    RecordTick(table_->GetStatistics(), is_last_level_
+                                            ?
LAST_LEVEL_SEEK_DATA + : NON_LAST_LEVEL_SEEK_DATA); + seek_stat_state_ = static_cast( + seek_stat_state_ | kDataBlockReadSinceLastSeek | kReportOnUseful); + } + async_read_in_progress_ = false; +} + +bool BlockBasedTableIterator::MaterializeCurrentBlock() { + assert(is_at_first_key_from_index_); + assert(!block_iter_points_to_real_block_); + assert(index_iter_->Valid()); + + is_at_first_key_from_index_ = false; + InitDataBlock(); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.status().ok()) { + return false; + } + + block_iter_.SeekToFirst(); + + if (!block_iter_.Valid() || + icomp_.Compare(block_iter_.key(), + index_iter_->value().first_internal_key) != 0) { + block_iter_.Invalidate(Status::Corruption( + "first key in index doesn't match first key in block")); + return false; + } + + return true; +} + +void BlockBasedTableIterator::FindKeyForward() { + // This method's code is kept short to make it likely to be inlined. + + assert(!is_out_of_bound_); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + +void BlockBasedTableIterator::FindBlockForward() { + // TODO the while loop inherits from two-level-iterator. We don't know + // whether a block can be empty so it can be replaced by an "if". + do { + if (!block_iter_.status().ok()) { + return; + } + // Whether next data block is out of upper bound, if there is one. + const bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && + block_upper_bound_check_ == BlockUpperBound::kUpperBoundInCurBlock; + assert(!next_block_is_out_of_bound || + user_comparator_.CompareWithoutTimestamp( + *read_options_.iterate_upper_bound, /*a_has_ts=*/false, + index_iter_->user_key(), /*b_has_ts=*/true) <= 0); + ResetDataIter(); + index_iter_->Next(); + if (next_block_is_out_of_bound) { + // The next block is out of bound. No need to read it. + TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); + // We need to make sure this is not the last data block before setting + // is_out_of_bound_, since the index key for the last data block can be + // larger than smallest key of the next file on the same level. + if (index_iter_->Valid()) { + is_out_of_bound_ = true; + } + return; + } + + if (!index_iter_->Valid()) { + return; + } + + IndexValue v = index_iter_->value(); + + if (!v.first_internal_key.empty() && allow_unprepared_value_) { + // Index contains the first key of the block. Defer reading the block. + is_at_first_key_from_index_ = true; + return; + } + + InitDataBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); +} + +void BlockBasedTableIterator::FindKeyBackward() { + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + + ResetDataIter(); + index_iter_->Prev(); + + if (index_iter_->Valid()) { + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + return; + } + } + + // We could have check lower bound here too, but we opt not to do it for + // code simplicity. 
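+  // (Likely rationale: skipping that check only costs some extra backward
+  // iteration, since iterate_lower_bound is still enforced by the DB-level
+  // iterator above this one.)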
+} + +void BlockBasedTableIterator::CheckOutOfBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_upper_bound_check_ != BlockUpperBound::kUpperBoundBeyondCurBlock && + Valid()) { + is_out_of_bound_ = + user_comparator_.CompareWithoutTimestamp( + *read_options_.iterate_upper_bound, /*a_has_ts=*/false, user_key(), + /*b_has_ts=*/true) <= 0; + } +} + +void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + block_upper_bound_check_ = (user_comparator_.CompareWithoutTimestamp( + *read_options_.iterate_upper_bound, + /*a_has_ts=*/false, index_iter_->user_key(), + /*b_has_ts=*/true) > 0) + ? BlockUpperBound::kUpperBoundBeyondCurBlock + : BlockUpperBound::kUpperBoundInCurBlock; + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_iterator.h new file mode 100644 index 0000000000..6ea53f3315 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_iterator.h @@ -0,0 +1,310 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_based_table_reader_impl.h" +#include "table/block_based/block_prefetcher.h" +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Iterates over the contents of BlockBasedTable. +class BlockBasedTableIterator : public InternalIteratorBase { + // compaction_readahead_size: its value will only be used if for_compaction = + // true + // @param read_options Must outlive this iterator. 
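+  //
+  // Rough usage sketch (in practice instances are obtained from
+  // BlockBasedTable::NewIterator rather than constructed directly):
+  //
+  //   iter->Seek(k);                     // first entry with key >= k
+  //   if (iter->Valid() && iter->PrepareValue()) {
+  //     Consume(iter->key(), iter->value());  // Consume() is a placeholder
+  //   }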
+ public: + BlockBasedTableIterator( + const BlockBasedTable* table, const ReadOptions& read_options, + const InternalKeyComparator& icomp, + std::unique_ptr>&& index_iter, + bool check_filter, bool need_upper_bound_check, + const SliceTransform* prefix_extractor, TableReaderCaller caller, + size_t compaction_readahead_size = 0, bool allow_unprepared_value = false) + : index_iter_(std::move(index_iter)), + table_(table), + read_options_(read_options), + icomp_(icomp), + user_comparator_(icomp.user_comparator()), + pinned_iters_mgr_(nullptr), + prefix_extractor_(prefix_extractor), + lookup_context_(caller), + block_prefetcher_( + compaction_readahead_size, + table_->get_rep()->table_options.initial_auto_readahead_size), + allow_unprepared_value_(allow_unprepared_value), + block_iter_points_to_real_block_(false), + check_filter_(check_filter), + need_upper_bound_check_(need_upper_bound_check), + async_read_in_progress_(false), + is_last_level_(table->IsLastLevel()) {} + + ~BlockBasedTableIterator() {} + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; + void Next() final override; + bool NextAndGetResult(IterateResult* result) override; + void Prev() override; + bool Valid() const override { + return !is_out_of_bound_ && + (is_at_first_key_from_index_ || + (block_iter_points_to_real_block_ && block_iter_.Valid())); + } + Slice key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return index_iter_->value().first_internal_key; + } else { + return block_iter_.key(); + } + } + Slice user_key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return ExtractUserKey(index_iter_->value().first_internal_key); + } else { + return block_iter_.user_key(); + } + } + bool PrepareValue() override { + assert(Valid()); + + if (!is_at_first_key_from_index_) { + return true; + } + + return const_cast(this) + ->MaterializeCurrentBlock(); + } + Slice value() const override { + // PrepareValue() must have been called. + assert(!is_at_first_key_from_index_); + assert(Valid()); + + if (seek_stat_state_ & kReportOnUseful) { + bool filter_used = (seek_stat_state_ & kFilterUsed) != 0; + RecordTick( + table_->GetStatistics(), + filter_used + ? (is_last_level_ ? LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH + : NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH) + : (is_last_level_ ? 
LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER + : NON_LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER)); + seek_stat_state_ = kDataBlockReadSinceLastSeek; + } + + return block_iter_.value(); + } + Status status() const override { + // Prefix index set status to NotFound when the prefix does not exist + if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { + return index_iter_->status(); + } else if (block_iter_points_to_real_block_) { + return block_iter_.status(); + } else if (async_read_in_progress_) { + return Status::TryAgain(); + } else { + return Status::OK(); + } + } + + inline IterBoundCheck UpperBoundCheckResult() override { + if (is_out_of_bound_) { + return IterBoundCheck::kOutOfBound; + } else if (block_upper_bound_check_ == + BlockUpperBound::kUpperBoundBeyondCurBlock) { + assert(!is_out_of_bound_); + return IterBoundCheck::kInbound; + } else { + return IterBoundCheck::kUnknown; + } + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + bool IsKeyPinned() const override { + // Our key comes either from block_iter_'s current key + // or index_iter_'s current *value*. + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || + (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); + } + bool IsValuePinned() const override { + assert(!is_at_first_key_from_index_); + assert(Valid()); + + // BlockIter::IsValuePinned() is always true. No need to check + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + block_iter_points_to_real_block_; + } + + void ResetDataIter() { + if (block_iter_points_to_real_block_) { + if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) { + block_iter_.DelegateCleanupsTo(pinned_iters_mgr_); + } + block_iter_.Invalidate(Status::OK()); + block_iter_points_to_real_block_ = false; + } + block_upper_bound_check_ = BlockUpperBound::kUnknown; + } + + void SavePrevIndexValue() { + if (block_iter_points_to_real_block_) { + // Reseek. If they end up with the same data block, we shouldn't re-fetch + // the same data block. + prev_block_offset_ = index_iter_->value().handle.offset(); + } + } + + void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (block_prefetcher_.prefetch_buffer() != nullptr && + read_options_.adaptive_readahead) { + block_prefetcher_.prefetch_buffer()->GetReadaheadState( + &(readahead_file_info->data_block_readahead_info)); + if (index_iter_) { + index_iter_->GetReadaheadState(readahead_file_info); + } + } + } + + void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (read_options_.adaptive_readahead) { + block_prefetcher_.SetReadaheadState( + &(readahead_file_info->data_block_readahead_info)); + if (index_iter_) { + index_iter_->SetReadaheadState(readahead_file_info); + } + } + } + + std::unique_ptr> index_iter_; + + private: + enum class IterDirection { + kForward, + kBackward, + }; + // This enum indicates whether the upper bound falls into current block + // or beyond. + // +-------------+ + // | cur block | <-- (1) + // +-------------+ + // <-- (2) + // --- --- + // <-- (3) + // +-------------+ + // | next block | <-- (4) + // ...... + // + // When the block is smaller than , kUpperBoundInCurBlock + // is the value to use. The examples are (1) or (2) in the graph. It means + // all keys in the next block or beyond will be out of bound. 
Keys within
+  // the current block may or may not be out of bound.
+  // When the block is larger than or equal to <upper bound>,
+  // kUpperBoundBeyondCurBlock is to be used. The examples are (3) and (4)
+  // in the graph. It means that all keys in the current block are within the
+  // upper bound and keys in the next block may or may not be within the upper
+  // bound.
+  // If the boundary key hasn't been checked against the upper bound,
+  // kUnknown can be used.
+  enum class BlockUpperBound : uint8_t {
+    kUpperBoundInCurBlock,
+    kUpperBoundBeyondCurBlock,
+    kUnknown,
+  };
+
+  // State bits for collecting stats on seeks and whether they returned useful
+  // results.
+  enum SeekStatState : uint8_t {
+    kNone = 0,
+    // Most recent seek checked prefix filter (or similar future feature)
+    kFilterUsed = 1 << 0,
+    // Already recorded that a data block was accessed since the last seek.
+    kDataBlockReadSinceLastSeek = 1 << 1,
+    // Have not yet recorded that a value() was accessed.
+    kReportOnUseful = 1 << 2,
+  };
+
+  const BlockBasedTable* table_;
+  const ReadOptions& read_options_;
+  const InternalKeyComparator& icomp_;
+  UserComparatorWrapper user_comparator_;
+  PinnedIteratorsManager* pinned_iters_mgr_;
+  DataBlockIter block_iter_;
+  const SliceTransform* prefix_extractor_;
+  uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
+  BlockCacheLookupContext lookup_context_;
+
+  BlockPrefetcher block_prefetcher_;
+
+  const bool allow_unprepared_value_;
+  // True if block_iter_ is initialized and points to the same block
+  // as index iterator.
+  bool block_iter_points_to_real_block_;
+  // See InternalIteratorBase::IsOutOfBound().
+  bool is_out_of_bound_ = false;
+  // How current data block's boundary key with the next block is compared with
+  // iterate upper bound.
+  BlockUpperBound block_upper_bound_check_ = BlockUpperBound::kUnknown;
+  // True if we're standing at the first key of a block, and we haven't loaded
+  // that block yet. A call to PrepareValue() will trigger loading the block.
+  bool is_at_first_key_from_index_ = false;
+  bool check_filter_;
+  // TODO(Zhongyi): pick a better name
+  bool need_upper_bound_check_;
+
+  bool async_read_in_progress_;
+
+  mutable SeekStatState seek_stat_state_ = SeekStatState::kNone;
+  bool is_last_level_;
+
+  // If `target` is null, seek to first.
+  void SeekImpl(const Slice* target, bool async_prefetch);
+
+  void InitDataBlock();
+  void AsyncInitDataBlock(bool is_first_pass);
+  bool MaterializeCurrentBlock();
+  void FindKeyForward();
+  void FindBlockForward();
+  void FindKeyBackward();
+  void CheckOutOfBound();
+
+  // Check if data block is fully within iterate_upper_bound.
+  //
+  // Note MyRocks may update iterate bounds between seeks. To work around it,
+  // we need to check and update data_block_within_upper_bound_ accordingly.
+  void CheckDataBlockWithinUpperBound();
+
+  bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction,
+                           bool* filter_checked) {
+    if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
+      // Upper bound check isn't sufficient for backward direction to
+      // guarantee the same result as total order, so disable prefix
+      // check.
+      return true;
+    }
+    if (check_filter_ &&
+        !table_->PrefixRangeMayMatch(ikey, read_options_, prefix_extractor_,
+                                     need_upper_bound_check_, &lookup_context_,
+                                     filter_checked)) {
+      // TODO remember the iterator is invalidated because of prefix
+      // match.
This can avoid the upper level file iterator to falsely + // believe the position is the end of the SST file and move to + // the first key of the next file. + ResetDataIter(); + return false; + } + return true; + } +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader.cc new file mode 100644 index 0000000000..a6897ad617 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader.cc @@ -0,0 +1,3043 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/block_based_table_reader.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "block_cache.h" +#include "cache/cache_entry_roles.h" +#include "cache/cache_key.h" +#include "db/compaction/compaction_picker.h" +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "file/file_prefetch_buffer.h" +#include "file/file_util.h" +#include "file/random_access_file_reader.h" +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "parsed_full_filter_block.h" +#include "port/lang.h" +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/convenience.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/trace_record.h" +#include "table/block_based/binary_search_index_reader.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_iterator.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/block_type.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/hash_index_reader.h" +#include "table/block_based/partitioned_filter_block.h" +#include "table/block_based/partitioned_index_reader.h" +#include "table/block_fetcher.h" +#include "table/format.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/multiget_context.h" +#include "table/persistent_cache_helper.h" +#include "table/persistent_cache_options.h" +#include "table/sst_file_writer_collectors.h" +#include "table/two_level_iterator.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { + +CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) { + CacheAllocationPtr heap_buf; + heap_buf = AllocateBlock(buf.size(), allocator); + 
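+  // Copy the source bytes into the freshly allocated heap block; the
+  // returned CacheAllocationPtr owns the copy.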
memcpy(heap_buf.get(), buf.data(), buf.size()); + return heap_buf; +} +} // namespace + +// Explicitly instantiate templates for each "blocklike" type we use (and +// before implicit specialization). +// This makes it possible to keep the template definitions in the .cc file. +#define INSTANTIATE_RETRIEVE_BLOCK(T) \ + template Status BlockBasedTable::RetrieveBlock( \ + FilePrefetchBuffer * prefetch_buffer, const ReadOptions& ro, \ + const BlockHandle& handle, const UncompressionDict& uncompression_dict, \ + CachableEntry* out_parsed_block, GetContext* get_context, \ + BlockCacheLookupContext* lookup_context, bool for_compaction, \ + bool use_cache, bool async_read) const; + +INSTANTIATE_RETRIEVE_BLOCK(ParsedFullFilterBlock); +INSTANTIATE_RETRIEVE_BLOCK(UncompressionDict); +INSTANTIATE_RETRIEVE_BLOCK(Block_kData); +INSTANTIATE_RETRIEVE_BLOCK(Block_kIndex); +INSTANTIATE_RETRIEVE_BLOCK(Block_kFilterPartitionIndex); +INSTANTIATE_RETRIEVE_BLOCK(Block_kRangeDeletion); +INSTANTIATE_RETRIEVE_BLOCK(Block_kMetaIndex); + +} // namespace ROCKSDB_NAMESPACE + +// Generate the regular and coroutine versions of some methods by +// including block_based_table_reader_sync_and_async.h twice +// Macros in the header will expand differently based on whether +// WITH_COROUTINES or WITHOUT_COROUTINES is defined +// clang-format off +#define WITHOUT_COROUTINES +#include "table/block_based/block_based_table_reader_sync_and_async.h" +#undef WITHOUT_COROUTINES +#define WITH_COROUTINES +#include "table/block_based/block_based_table_reader_sync_and_async.h" +#undef WITH_COROUTINES +// clang-format on + +namespace ROCKSDB_NAMESPACE { + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; + +BlockBasedTable::~BlockBasedTable() { delete rep_; } + +namespace { +// Read the block identified by "handle" from "file". +// The only relevant option is options.verify_checksums for now. +// On failure return non-OK. +// On success fill *result and return OK - caller owns *result +// @param uncompression_dict Data for presetting the compression library's +// dictionary. +template +Status ReadAndParseBlockFromFile( + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, + std::unique_ptr* result, const ImmutableOptions& ioptions, + BlockCreateContext& create_context, bool maybe_compressed, + const UncompressionDict& uncompression_dict, + const PersistentCacheOptions& cache_options, + MemoryAllocator* memory_allocator, bool for_compaction, bool async_read) { + assert(result); + + BlockContents contents; + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, options, handle, &contents, ioptions, + /*do_uncompress*/ maybe_compressed, maybe_compressed, + TBlocklike::kBlockType, uncompression_dict, cache_options, + memory_allocator, nullptr, for_compaction); + Status s; + // If prefetch_buffer is not allocated, it will fallback to synchronous + // reading of block contents. + if (async_read && prefetch_buffer != nullptr) { + s = block_fetcher.ReadAsyncBlockContents(); + if (!s.ok()) { + return s; + } + } else { + s = block_fetcher.ReadBlockContents(); + } + if (s.ok()) { + create_context.Create(result, std::move(contents)); + } + return s; +} + +// For hash based index, return false if table_properties->prefix_extractor_name +// and prefix_extractor both exist and match, otherwise true. 
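+// (Example: a table written with extractor "rocksdb.CappedPrefix.8" but
+// opened with "rocksdb.CappedPrefix.4" counts as changed, because the
+// comparison below is on the extractor's serialized name.)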
+inline bool PrefixExtractorChangedHelper( + const TableProperties* table_properties, + const SliceTransform* prefix_extractor) { + // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set. + // Turn off hash index in prefix_extractor is not set; if prefix_extractor + // is set but prefix_extractor_block is not set, also disable hash index + if (prefix_extractor == nullptr || table_properties == nullptr || + table_properties->prefix_extractor_name.empty()) { + return true; + } + + // prefix_extractor and prefix_extractor_block are both non-empty + if (table_properties->prefix_extractor_name != prefix_extractor->AsString()) { + return true; + } else { + return false; + } +} + +template +uint32_t GetBlockNumRestarts(const TBlocklike& block) { + if constexpr (std::is_convertible_v) { + const Block& b = block; + return b.NumRestarts(); + } else { + return 0; + } +} + +} // namespace + +void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.stats; + + PERF_COUNTER_ADD(block_cache_hit_count, 1); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, + static_cast(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_hit; + get_context->get_context_stats_.num_cache_bytes_read += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_HIT); + RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage); + } + + switch (block_type) { + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_HIT); + } + break; + + case BlockType::kCompressionDictionary: + // TODO: introduce perf counter for compression dictionary hit count + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT); + } + break; + + case BlockType::kIndex: + PERF_COUNTER_ADD(block_cache_index_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_HIT); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_HIT); + } + break; + } +} + +void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const { + Statistics* const statistics = rep_->ioptions.stats; + + // TODO: introduce aggregate (not per-level) block cache miss count + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, + static_cast(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_MISS); + } + + // TODO: introduce perf counters for misses per block type + switch (block_type) { + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_MISS); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS); + } + break; + + case BlockType::kIndex: 
+ if (get_context) { + ++get_context->get_context_stats_.num_cache_index_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_MISS); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_MISS); + } + break; + } +} + +void BlockBasedTable::UpdateCacheInsertionMetrics( + BlockType block_type, GetContext* get_context, size_t usage, bool redundant, + Statistics* const statistics) { + // TODO: introduce perf counters for block cache insertions + if (get_context) { + ++get_context->get_context_stats_.num_cache_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_add_redundant; + } + get_context->get_context_stats_.num_cache_bytes_write += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_ADD_REDUNDANT); + } + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); + } + + switch (block_type) { + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_filter_add_redundant; + } + get_context->get_context_stats_.num_cache_filter_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD_REDUNDANT); + } + RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_add; + if (redundant) { + ++get_context->get_context_stats_ + .num_cache_compression_dict_add_redundant; + } + get_context->get_context_stats_ + .num_cache_compression_dict_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT); + } + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + usage); + } + break; + + case BlockType::kIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_index_add_redundant; + } + get_context->get_context_stats_.num_cache_index_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD_REDUNDANT); + } + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_data_add_redundant; + } + get_context->get_context_stats_.num_cache_data_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD_REDUNDANT); + } + RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage); + } + break; + } +} + +namespace { +// Return True if table_properties has `user_prop_name` has a `true` value +// or it doesn't contain this property (for backward compatible). 
+bool IsFeatureSupported(const TableProperties& table_properties,
+                        const std::string& user_prop_name, Logger* info_log) {
+  auto& props = table_properties.user_collected_properties;
+  auto pos = props.find(user_prop_name);
+  // Older versions don't have this value set. Skip this check.
+  if (pos != props.end()) {
+    if (pos->second == kPropFalse) {
+      return false;
+    } else if (pos->second != kPropTrue) {
+      ROCKS_LOG_WARN(info_log, "Property %s has invalid value %s",
+                     user_prop_name.c_str(), pos->second.c_str());
+    }
+  }
+  return true;
+}
+
+// Caller has to ensure seqno is not nullptr.
+Status GetGlobalSequenceNumber(const TableProperties& table_properties,
+                               SequenceNumber largest_seqno,
+                               SequenceNumber* seqno) {
+  const auto& props = table_properties.user_collected_properties;
+  const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
+  const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);
+
+  *seqno = kDisableGlobalSequenceNumber;
+  if (version_pos == props.end()) {
+    if (seqno_pos != props.end()) {
+      std::array<char, 200> msg_buf;
+      // This is not an external sst file, global_seqno is not supported.
+      snprintf(
+          msg_buf.data(), msg_buf.max_size(),
+          "A non-external sst file has a global seqno property with value %s",
+          seqno_pos->second.c_str());
+      return Status::Corruption(msg_buf.data());
+    }
+    return Status::OK();
+  }
+
+  uint32_t version = DecodeFixed32(version_pos->second.c_str());
+  if (version < 2) {
+    if (seqno_pos != props.end() || version != 1) {
+      std::array<char, 200> msg_buf;
+      // This is a v1 external sst file, global_seqno is not supported.
+      snprintf(msg_buf.data(), msg_buf.max_size(),
+               "An external sst file with version %u has a global seqno "
+               "property with value %s",
+               version, seqno_pos->second.c_str());
+      return Status::Corruption(msg_buf.data());
+    }
+    return Status::OK();
+  }
+
+  // Since we have a plan to deprecate global_seqno, we do not return failure
+  // if seqno_pos == props.end(). We rely on version_pos to detect whether the
+  // SST is external.
+  SequenceNumber global_seqno(0);
+  if (seqno_pos != props.end()) {
+    global_seqno = DecodeFixed64(seqno_pos->second.c_str());
+  }
+  // SstTableReader opens the table reader with kMaxSequenceNumber as
+  // largest_seqno to denote that it is unknown.
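+  // (Hence the check below: a concretely known largest_seqno is always
+  // strictly less than kMaxSequenceNumber.)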
+  if (largest_seqno < kMaxSequenceNumber) {
+    if (global_seqno == 0) {
+      global_seqno = largest_seqno;
+    }
+    if (global_seqno != largest_seqno) {
+      std::array<char, 200> msg_buf;
+      snprintf(
+          msg_buf.data(), msg_buf.max_size(),
+          "An external sst file with version %u has a global seqno property "
+          "with value %s, while the largest seqno in the file is %llu",
+          version, seqno_pos->second.c_str(),
+          static_cast<unsigned long long>(largest_seqno));
+      return Status::Corruption(msg_buf.data());
+    }
+  }
+  *seqno = global_seqno;
+
+  if (global_seqno > kMaxSequenceNumber) {
+    std::array<char, 200> msg_buf;
+    snprintf(msg_buf.data(), msg_buf.max_size(),
+             "An external sst file with version %u has a global seqno property "
+             "with value %llu, which is greater than kMaxSequenceNumber",
+             version, static_cast<unsigned long long>(global_seqno));
+    return Status::Corruption(msg_buf.data());
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties,
+                                        const std::string& cur_db_session_id,
+                                        uint64_t cur_file_number,
+                                        OffsetableCacheKey* out_base_cache_key,
+                                        bool* out_is_stable) {
+  // Use a stable cache key if sufficient data is in table properties
+  std::string db_session_id;
+  uint64_t file_num;
+  std::string db_id;
+  if (properties && !properties->db_session_id.empty() &&
+      properties->orig_file_number > 0) {
+    // (Newer SST file case)
+    // We must have both properties to get a stable unique id because
+    // CreateColumnFamilyWithImport or IngestExternalFiles can change the
+    // file numbers on a file.
+    db_session_id = properties->db_session_id;
+    file_num = properties->orig_file_number;
+    // Less critical, populated in an earlier release than the above
+    db_id = properties->db_id;
+    if (out_is_stable) {
+      *out_is_stable = true;
+    }
+  } else {
+    // (Old SST file case)
+    // We use (unique) cache keys based on current identifiers. These are at
+    // least stable across table file close and re-open, but not across
+    // different DBs nor DB close and re-open.
+    db_session_id = cur_db_session_id;
+    file_num = cur_file_number;
+    // Plumbing through the DB ID to here would be annoying, and of limited
+    // value because of the case of VersionSet::Recover opening some table
+    // files and later setting the DB ID. So we just rely on the uniqueness
+    // level provided by the session ID.
+    db_id = "unknown";
+    if (out_is_stable) {
+      *out_is_stable = false;
+    }
+  }
+
+  // Too many tests to update to get these working
+  // assert(file_num > 0);
+  // assert(!db_session_id.empty());
+  // assert(!db_id.empty());
+
+  // Minimum block size is 5 bytes; therefore we can trim off two lower bits
+  // from offsets. See GetCacheKey.
+  *out_base_cache_key = OffsetableCacheKey(db_id, db_session_id, file_num);
+}
+
+CacheKey BlockBasedTable::GetCacheKey(const OffsetableCacheKey& base_cache_key,
+                                      const BlockHandle& handle) {
+  // Minimum block size is 5 bytes; therefore we can trim off two lower bits
+  // from the offset.
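+  // (Worked example: handles at offsets 0, 8, and 64 yield key offsets 0, 2,
+  // and 16. Distinct blocks start at least 5 bytes apart, while two offsets
+  // can only collide after >> 2 if they are within 3 bytes of each other, so
+  // the shifted offset remains unique per block.)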
+ return base_cache_key.WithOffset(handle.offset() >> 2); +} + +Status BlockBasedTable::Open( + const ReadOptions& read_options, const ImmutableOptions& ioptions, + const EnvOptions& env_options, const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr&& file, uint64_t file_size, + uint8_t block_protection_bytes_per_key, + std::unique_ptr* table_reader, uint64_t tail_size, + std::shared_ptr table_reader_cache_res_mgr, + const std::shared_ptr& prefix_extractor, + const bool prefetch_index_and_filter_in_cache, const bool skip_filters, + const int level, const bool immortal_table, + const SequenceNumber largest_seqno, const bool force_direct_prefetch, + TailPrefetchStats* tail_prefetch_stats, + BlockCacheTracer* const block_cache_tracer, + size_t max_file_size_for_l0_meta_pin, const std::string& cur_db_session_id, + uint64_t cur_file_num, UniqueId64x2 expected_unique_id) { + table_reader->reset(); + + Status s; + Footer footer; + std::unique_ptr prefetch_buffer; + + // From read_options, retain deadline, io_timeout, rate_limiter_priority, and + // verify_checksums. In future, we may retain more options. + ReadOptions ro; + ro.deadline = read_options.deadline; + ro.io_timeout = read_options.io_timeout; + ro.rate_limiter_priority = read_options.rate_limiter_priority; + ro.verify_checksums = read_options.verify_checksums; + ro.io_activity = read_options.io_activity; + + // prefetch both index and filters, down to all partitions + const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; + const bool preload_all = !table_options.cache_index_and_filter_blocks; + + if (!ioptions.allow_mmap_reads) { + s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch, + tail_prefetch_stats, prefetch_all, preload_all, + &prefetch_buffer, ioptions.stats, tail_size, + ioptions.logger); + // Return error in prefetch path to users. + if (!s.ok()) { + return s; + } + } else { + // Should not prefetch for mmap mode. + prefetch_buffer.reset(new FilePrefetchBuffer( + 0 /* readahead_size */, 0 /* max_readahead_size */, false /* enable */, + true /* track_min_offset */)); + } + + // Read in the following order: + // 1. Footer + // 2. [metaindex block] + // 3. [meta block: properties] + // 4. [meta block: range deletion tombstone] + // 5. [meta block: compression dictionary] + // 6. [meta block: index] + // 7. [meta block: filter] + IOOptions opts; + s = file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = ReadFooterFromFile(opts, file.get(), *ioptions.fs, + prefetch_buffer.get(), file_size, &footer, + kBlockBasedTableMagicNumber); + } + if (!s.ok()) { + return s; + } + if (!IsSupportedFormatVersion(footer.format_version())) { + return Status::Corruption( + "Unknown Footer version. Maybe this file was created with newer " + "version of RocksDB?"); + } + + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, + internal_comparator, skip_filters, + file_size, level, immortal_table); + rep->file = std::move(file); + rep->footer = footer; + + // For fully portable/stable cache keys, we need to read the properties + // block before setting up cache keys. TODO: consider setting up a bootstrap + // cache key for PersistentCache to use for metaindex and properties blocks. + rep->persistent_cache_options = PersistentCacheOptions(); + + // Meta-blocks are not dictionary compressed. 
Explicitly set the dictionary + // handle to null, otherwise it may be seen as uninitialized during the below + // meta-block reads. + rep->compression_dict_handle = BlockHandle::NullBlockHandle(); + + rep->create_context.protection_bytes_per_key = block_protection_bytes_per_key; + // Read metaindex + std::unique_ptr new_table( + new BlockBasedTable(rep, block_cache_tracer)); + std::unique_ptr metaindex; + std::unique_ptr metaindex_iter; + s = new_table->ReadMetaIndexBlock(ro, prefetch_buffer.get(), &metaindex, + &metaindex_iter); + if (!s.ok()) { + return s; + } + + // Populates table_properties and some fields that depend on it, + // such as index_type. + s = new_table->ReadPropertiesBlock(ro, prefetch_buffer.get(), + metaindex_iter.get(), largest_seqno); + if (!s.ok()) { + return s; + } + + // Populate BlockCreateContext + bool blocks_definitely_zstd_compressed = + rep->table_properties && + (rep->table_properties->compression_name == + CompressionTypeToString(kZSTD) || + rep->table_properties->compression_name == + CompressionTypeToString(kZSTDNotFinalCompression)); + rep->create_context = BlockCreateContext( + &rep->table_options, rep->ioptions.stats, + blocks_definitely_zstd_compressed, block_protection_bytes_per_key, + rep->internal_comparator.user_comparator(), rep->index_value_is_full, + rep->index_has_first_key); + + // Check expected unique id if provided + if (expected_unique_id != kNullUniqueId64x2) { + auto props = rep->table_properties; + if (!props) { + return Status::Corruption("Missing table properties on file " + + std::to_string(cur_file_num) + + " with known unique ID"); + } + UniqueId64x2 actual_unique_id{}; + s = GetSstInternalUniqueId(props->db_id, props->db_session_id, + props->orig_file_number, &actual_unique_id, + /*force*/ true); + assert(s.ok()); // because force=true + if (expected_unique_id != actual_unique_id) { + return Status::Corruption( + "Mismatch in unique ID on table file " + + std::to_string(cur_file_num) + + ". Expected: " + InternalUniqueIdToHumanString(&expected_unique_id) + + " Actual: " + InternalUniqueIdToHumanString(&actual_unique_id)); + } + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::PassedVerifyUniqueId", + &actual_unique_id); + } else { + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::SkippedVerifyUniqueId", + nullptr); + if (ioptions.verify_sst_unique_id_in_manifest && ioptions.logger) { + // A crude but isolated way of reporting unverified files. This should not + // be an ongoing concern so doesn't deserve a place in Statistics IMHO. 
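+      // (Effect of the counter below: log the first unverified file, then
+      // roughly one message per additional ~1000, to keep log noise bounded.)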
+      static std::atomic<size_t> unverified_count{0};
+      auto prev_count =
+          unverified_count.fetch_add(1, std::memory_order_relaxed);
+      if (prev_count == 0) {
+        ROCKS_LOG_WARN(
+            ioptions.logger,
+            "At least one SST file opened without unique ID to verify: %" PRIu64
+            ".sst",
+            cur_file_num);
+      } else if (prev_count % 1000 == 0) {
+        ROCKS_LOG_WARN(
+            ioptions.logger,
+            "Another ~1000 SST files opened without unique ID to verify");
+      }
+    }
+  }
+
+  // Set up the prefix extractor as needed
+  bool force_null_table_prefix_extractor = false;
+  TEST_SYNC_POINT_CALLBACK(
+      "BlockBasedTable::Open::ForceNullTablePrefixExtractor",
+      &force_null_table_prefix_extractor);
+  if (force_null_table_prefix_extractor) {
+    assert(!rep->table_prefix_extractor);
+  } else if (!PrefixExtractorChangedHelper(rep->table_properties.get(),
+                                           prefix_extractor.get())) {
+    // Establish fast path for unchanged prefix_extractor
+    rep->table_prefix_extractor = prefix_extractor;
+  } else {
+    // Current prefix_extractor doesn't match table
+    if (rep->table_properties) {
+      //**TODO: If/When the DBOptions has a registry in it, the ConfigOptions
+      // will need to use it
+      ConfigOptions config_options;
+      Status st = SliceTransform::CreateFromString(
+          config_options, rep->table_properties->prefix_extractor_name,
+          &(rep->table_prefix_extractor));
+      if (!st.ok()) {
+        //**TODO: Should this error be returned or swallowed?
+        ROCKS_LOG_ERROR(rep->ioptions.logger,
+                        "Failed to create prefix extractor[%s]: %s",
+                        rep->table_properties->prefix_extractor_name.c_str(),
+                        st.ToString().c_str());
+      }
+    }
+  }
+
+  // With properties loaded, we can set up portable/stable cache keys
+  SetupBaseCacheKey(rep->table_properties.get(), cur_db_session_id,
+                    cur_file_num, &rep->base_cache_key);
+
+  rep->persistent_cache_options =
+      PersistentCacheOptions(rep->table_options.persistent_cache,
+                             rep->base_cache_key, rep->ioptions.stats);
+
+  s = new_table->ReadRangeDelBlock(ro, prefetch_buffer.get(),
+                                   metaindex_iter.get(), internal_comparator,
+                                   &lookup_context);
+  if (!s.ok()) {
+    return s;
+  }
+  s = new_table->PrefetchIndexAndFilterBlocks(
+      ro, prefetch_buffer.get(), metaindex_iter.get(), new_table.get(),
+      prefetch_all, table_options, level, file_size,
+      max_file_size_for_l0_meta_pin, &lookup_context);
+
+  if (s.ok()) {
+    // Update tail prefetch stats
+    assert(prefetch_buffer.get() != nullptr);
+    if (tail_prefetch_stats != nullptr) {
+      assert(prefetch_buffer->min_offset_read() < file_size);
+      tail_prefetch_stats->RecordEffectiveSize(
+          static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
+    }
+  }
+
+  if (s.ok() && table_reader_cache_res_mgr) {
+    std::size_t mem_usage = new_table->ApproximateMemoryUsage();
+    s = table_reader_cache_res_mgr->MakeCacheReservation(
+        mem_usage, &(rep->table_reader_cache_res_handle));
+    if (s.IsMemoryLimit()) {
+      s = Status::MemoryLimit(
+          "Can't allocate " +
+          kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+              CacheEntryRole::kBlockBasedTableReader)] +
+          " due to memory limit based on "
+          "cache capacity for memory allocation");
+    }
+  }
+
+  if (s.ok()) {
+    *table_reader = std::move(new_table);
+  }
+  return s;
+}
+
+Status BlockBasedTable::PrefetchTail(
+    const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size,
+    bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
+    const bool prefetch_all, const bool preload_all,
+    std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer, Statistics* stats,
+    uint64_t tail_size, Logger* const logger) {
+  assert(tail_size <= file_size);
+
+  size_t tail_prefetch_size = 0;
+  if (tail_size != 0) {
+ tail_prefetch_size = tail_size; + } else { + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes. + tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. + // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. + // That's because we need to issue readahead before we read the + // properties, at which point we don't yet know the index type. + tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + + ROCKS_LOG_WARN(logger, + "Tail prefetch size %zu is calculated based on heuristics", + tail_prefetch_size); + } else { + ROCKS_LOG_WARN( + logger, + "Tail prefetch size %zu is calculated based on TailPrefetchStats", + tail_prefetch_size); + } + } + size_t prefetch_off; + size_t prefetch_len; + if (file_size < tail_prefetch_size) { + prefetch_off = 0; + prefetch_len = static_cast(file_size); + } else { + prefetch_off = static_cast(file_size - tail_prefetch_size); + prefetch_len = tail_prefetch_size; + } + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", + &tail_prefetch_size); + + // Try file system prefetch + if (!file->use_direct_io() && !force_direct_prefetch) { + if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority) + .IsNotSupported()) { + prefetch_buffer->reset(new FilePrefetchBuffer( + 0 /* readahead_size */, 0 /* max_readahead_size */, + false /* enable */, true /* track_min_offset */)); + return Status::OK(); + } + } + + // Use `FilePrefetchBuffer` + prefetch_buffer->reset(new FilePrefetchBuffer( + 0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */, + true /* track_min_offset */, false /* implicit_auto_readahead */, + 0 /* num_file_reads */, 0 /* num_file_reads_for_auto_readahead */, + nullptr /* fs */, nullptr /* clock */, stats, + FilePrefetchBufferUsage::kTableOpenPrefetchTail)); + + IOOptions opts; + Status s = file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = (*prefetch_buffer) + ->Prefetch(opts, file, prefetch_off, prefetch_len, + ro.rate_limiter_priority); + } + return s; +} + +Status BlockBasedTable::ReadPropertiesBlock( + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, const SequenceNumber largest_seqno) { + Status s; + BlockHandle handle; + s = FindOptionalMetaBlock(meta_iter, kPropertiesBlockName, &handle); + + if (!s.ok()) { + ROCKS_LOG_WARN(rep_->ioptions.logger, + "Error when seeking to properties block from file: %s", + s.ToString().c_str()); + } else if (!handle.IsNull()) { + s = meta_iter->status(); + std::unique_ptr table_properties; + if (s.ok()) { + s = ReadTablePropertiesHelper( + ro, handle, rep_->file.get(), prefetch_buffer, rep_->footer, + rep_->ioptions, &table_properties, nullptr /* memory_allocator */); + } + IGNORE_STATUS_IF_ERROR(s); + + if (!s.ok()) { + ROCKS_LOG_WARN(rep_->ioptions.logger, + "Encountered error while reading data from properties " + "block %s", + s.ToString().c_str()); + } else { + assert(table_properties != nullptr); + rep_->table_properties = std::move(table_properties); + rep_->blocks_maybe_compressed = + rep_->table_properties->compression_name != + CompressionTypeToString(kNoCompression); + } + } else { + 
ROCKS_LOG_ERROR(rep_->ioptions.logger, + "Cannot find Properties block from file."); + } + + // Read the table properties, if provided. + if (rep_->table_properties) { + rep_->whole_key_filtering &= + IsFeatureSupported(*(rep_->table_properties), + BlockBasedTablePropertyNames::kWholeKeyFiltering, + rep_->ioptions.logger); + rep_->prefix_filtering &= IsFeatureSupported( + *(rep_->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.logger); + + rep_->index_key_includes_seq = + rep_->table_properties->index_key_is_user_key == 0; + rep_->index_value_is_full = + rep_->table_properties->index_value_is_delta_encoded == 0; + + // Update index_type with the true type. + // If table properties don't contain index type, we assume that the table + // is in very old format and has kBinarySearch index type. + auto& props = rep_->table_properties->user_collected_properties; + auto index_type_pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (index_type_pos != props.end()) { + rep_->index_type = static_cast( + DecodeFixed32(index_type_pos->second.c_str())); + } + auto min_ts_pos = props.find("rocksdb.timestamp_min"); + if (min_ts_pos != props.end()) { + rep_->min_timestamp = Slice(min_ts_pos->second); + } + auto max_ts_pos = props.find("rocksdb.timestamp_max"); + if (max_ts_pos != props.end()) { + rep_->max_timestamp = Slice(max_ts_pos->second); + } + + rep_->index_has_first_key = + rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey; + + s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, + &(rep_->global_seqno)); + if (!s.ok()) { + ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str()); + } + } + return s; +} + +Status BlockBasedTable::ReadRangeDelBlock( + const ReadOptions& read_options, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context) { + Status s; + BlockHandle range_del_handle; + s = FindOptionalMetaBlock(meta_iter, kRangeDelBlockName, &range_del_handle); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep_->ioptions.logger, + "Error when seeking to range delete tombstones block from file: %s", + s.ToString().c_str()); + } else if (!range_del_handle.IsNull()) { + Status tmp_status; + std::unique_ptr iter(NewDataBlockIterator( + read_options, range_del_handle, + /*input_iter=*/nullptr, BlockType::kRangeDeletion, + /*get_context=*/nullptr, lookup_context, prefetch_buffer, + /*for_compaction= */ false, /*async_read= */ false, tmp_status)); + assert(iter != nullptr); + s = iter->status(); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep_->ioptions.logger, + "Encountered error while reading data from range del block %s", + s.ToString().c_str()); + IGNORE_STATUS_IF_ERROR(s); + } else { + rep_->fragmented_range_dels = + std::make_shared(std::move(iter), + internal_comparator); + } + } + return s; +} + +Status BlockBasedTable::PrefetchIndexAndFilterBlocks( + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, + const BlockBasedTableOptions& table_options, const int level, + size_t file_size, size_t max_file_size_for_l0_meta_pin, + BlockCacheLookupContext* lookup_context) { + // Find filter handle and filter type + if (rep_->filter_policy) { + auto name = rep_->filter_policy->CompatibilityName(); + bool builtin_compatible = + strcmp(name, BuiltinFilterPolicy::kCompatibilityName()) == 0; + + for (const auto& [filter_type, prefix] : + 
{std::make_pair(Rep::FilterType::kFullFilter, kFullFilterBlockPrefix), + std::make_pair(Rep::FilterType::kPartitionedFilter, + kPartitionedFilterBlockPrefix), + std::make_pair(Rep::FilterType::kNoFilter, + kObsoleteFilterBlockPrefix)}) { + if (builtin_compatible) { + // This code is only here to deal with a hiccup in early 7.0.x where + // there was an unintentional name change in the SST files metadata. + // It should be OK to remove this in the future (late 2022) and just + // have the 'else' code. + // NOTE: the test:: names below are likely not needed but included + // out of caution + static const std::unordered_set kBuiltinNameAndAliases = { + BuiltinFilterPolicy::kCompatibilityName(), + test::LegacyBloomFilterPolicy::kClassName(), + test::FastLocalBloomFilterPolicy::kClassName(), + test::Standard128RibbonFilterPolicy::kClassName(), + "rocksdb.internal.DeprecatedBlockBasedBloomFilter", + BloomFilterPolicy::kClassName(), + RibbonFilterPolicy::kClassName(), + }; + + // For efficiency, do a prefix seek and see if the first match is + // good. + meta_iter->Seek(prefix); + if (meta_iter->status().ok() && meta_iter->Valid()) { + Slice key = meta_iter->key(); + if (key.starts_with(prefix)) { + key.remove_prefix(prefix.size()); + if (kBuiltinNameAndAliases.find(key.ToString()) != + kBuiltinNameAndAliases.end()) { + Slice v = meta_iter->value(); + Status s = rep_->filter_handle.DecodeFrom(&v); + if (s.ok()) { + rep_->filter_type = filter_type; + if (filter_type == Rep::FilterType::kNoFilter) { + ROCKS_LOG_WARN(rep_->ioptions.logger, + "Detected obsolete filter type in %s. Read " + "performance might suffer until DB is fully " + "re-compacted.", + rep_->file->file_name().c_str()); + } + break; + } + } + } + } + } else { + std::string filter_block_key = prefix + name; + if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle) + .ok()) { + rep_->filter_type = filter_type; + if (filter_type == Rep::FilterType::kNoFilter) { + ROCKS_LOG_WARN( + rep_->ioptions.logger, + "Detected obsolete filter type in %s. Read performance might " + "suffer until DB is fully re-compacted.", + rep_->file->file_name().c_str()); + } + break; + } + } + } + } + // Partition filters cannot be enabled without partition indexes + assert(rep_->filter_type != Rep::FilterType::kPartitionedFilter || + rep_->index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + + // Find compression dictionary handle + Status s = FindOptionalMetaBlock(meta_iter, kCompressionDictBlockName, + &rep_->compression_dict_handle); + if (!s.ok()) { + return s; + } + + BlockBasedTableOptions::IndexType index_type = rep_->index_type; + + const bool use_cache = table_options.cache_index_and_filter_blocks; + + const bool maybe_flushed = + level == 0 && file_size <= max_file_size_for_l0_meta_pin; + std::function is_pinned = + [maybe_flushed, &is_pinned](PinningTier pinning_tier, + PinningTier fallback_pinning_tier) { + // Fallback to fallback would lead to infinite recursion. Disallow it. + assert(fallback_pinning_tier != PinningTier::kFallback); + + switch (pinning_tier) { + case PinningTier::kFallback: + return is_pinned(fallback_pinning_tier, + PinningTier::kNone /* fallback_pinning_tier */); + case PinningTier::kNone: + return false; + case PinningTier::kFlushedAndSimilar: + return maybe_flushed; + case PinningTier::kAll: + return true; + }; + + // In GCC, this is needed to suppress `control reaches end of non-void + // function [-Werror=return-type]`. 
+          assert(false);
+          return false;
+        };
+  const bool pin_top_level_index = is_pinned(
+      table_options.metadata_cache_options.top_level_index_pinning,
+      table_options.pin_top_level_index_and_filter ? PinningTier::kAll
+                                                   : PinningTier::kNone);
+  const bool pin_partition =
+      is_pinned(table_options.metadata_cache_options.partition_pinning,
+                table_options.pin_l0_filter_and_index_blocks_in_cache
+                    ? PinningTier::kFlushedAndSimilar
+                    : PinningTier::kNone);
+  const bool pin_unpartitioned =
+      is_pinned(table_options.metadata_cache_options.unpartitioned_pinning,
+                table_options.pin_l0_filter_and_index_blocks_in_cache
+                    ? PinningTier::kFlushedAndSimilar
+                    : PinningTier::kNone);
+
+  // pin the first level of index
+  const bool pin_index =
+      index_type == BlockBasedTableOptions::kTwoLevelIndexSearch
+          ? pin_top_level_index
+          : pin_unpartitioned;
+  // prefetch the first level of index
+  // WART: this might be redundant (unnecessary cache hit) if !pin_index,
+  // depending on prepopulate_block_cache option
+  const bool prefetch_index = prefetch_all || pin_index;
+
+  std::unique_ptr<IndexReader> index_reader;
+  s = new_table->CreateIndexReader(ro, prefetch_buffer, meta_iter, use_cache,
+                                   prefetch_index, pin_index, lookup_context,
+                                   &index_reader);
+  if (!s.ok()) {
+    return s;
+  }
+
+  rep_->index_reader = std::move(index_reader);
+
+  // The partitions of a partitioned index are always stored in the block
+  // cache. They therefore follow the configuration for pin and prefetch
+  // regardless of the value of cache_index_and_filter_blocks.
+  if (prefetch_all || pin_partition) {
+    s = rep_->index_reader->CacheDependencies(ro, pin_partition,
+                                              prefetch_buffer);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  // pin the first level of filter
+  const bool pin_filter =
+      rep_->filter_type == Rep::FilterType::kPartitionedFilter
+          ? pin_top_level_index
+          : pin_unpartitioned;
+  // prefetch the first level of filter
+  // WART: this might be redundant (unnecessary cache hit) if !pin_filter,
+  // depending on prepopulate_block_cache option
+  const bool prefetch_filter = prefetch_all || pin_filter;
+
+  if (rep_->filter_policy) {
+    auto filter = new_table->CreateFilterBlockReader(
+        ro, prefetch_buffer, use_cache, prefetch_filter, pin_filter,
+        lookup_context);
+
+    if (filter) {
+      // Refer to the comment above about partitioned indexes always being
+      // cached
+      if (prefetch_all || pin_partition) {
+        s = filter->CacheDependencies(ro, pin_partition, prefetch_buffer);
+        if (!s.ok()) {
+          return s;
+        }
+      }
+      rep_->filter = std::move(filter);
+    }
+  }
+
+  if (!rep_->compression_dict_handle.IsNull()) {
+    std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
+    s = UncompressionDictReader::Create(
+        this, ro, prefetch_buffer, use_cache,
+        prefetch_all || pin_unpartitioned, pin_unpartitioned, lookup_context,
+        &uncompression_dict_reader);
+    if (!s.ok()) {
+      return s;
+    }
+
+    rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
+  }
+
+  assert(s.ok());
+  return s;
+}
+
+void BlockBasedTable::SetupForCompaction() {
+  switch (rep_->ioptions.access_hint_on_compaction_start) {
+    case Options::NONE:
+      break;
+    case Options::NORMAL:
+      rep_->file->file()->Hint(FSRandomAccessFile::kNormal);
+      break;
+    case Options::SEQUENTIAL:
+      rep_->file->file()->Hint(FSRandomAccessFile::kSequential);
+      break;
+    case Options::WILLNEED:
+      rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed);
+      break;
+    default:
+      assert(false);
+  }
+}
+
+std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
+    const {
+  return rep_->table_properties;
+}
+
+size_t BlockBasedTable::ApproximateMemoryUsage() const {
+  size_t usage = 0;
+  if (rep_) {
+    usage += rep_->ApproximateMemoryUsage();
+  } else {
+    return usage;
+  }
+  if (rep_->filter) {
+    usage += rep_->filter->ApproximateMemoryUsage();
+  }
+  if (rep_->index_reader) {
+    usage += rep_->index_reader->ApproximateMemoryUsage();
+  }
+  if (rep_->uncompression_dict_reader) {
+    usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
+  }
+  if (rep_->table_properties) {
+    usage += rep_->table_properties->ApproximateMemoryUsage();
+  }
+  return usage;
+}
+
+// Load the meta-index block from the file. On success, return the loaded
+// metaindex block and its iterator.
+Status BlockBasedTable::ReadMetaIndexBlock(
+    const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
+    std::unique_ptr<Block>* metaindex_block,
+    std::unique_ptr<InternalIterator>* iter) {
+  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
+  // it is an empty block.
+  std::unique_ptr<Block> metaindex;
+  Status s = ReadAndParseBlockFromFile(
+      rep_->file.get(), prefetch_buffer, rep_->footer, ro,
+      rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
+      rep_->create_context, true /*maybe_compressed*/,
+      UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
+      GetMemoryAllocator(rep_->table_options), false /* for_compaction */,
+      false /* async_read */);
+
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(rep_->ioptions.logger,
+                    "Encountered error while reading data from metaindex"
+                    " block %s",
+                    s.ToString().c_str());
+    return s;
+  }
+
+  *metaindex_block = std::move(metaindex);
+  // meta block uses bytewise comparator.
+  iter->reset(metaindex_block->get()->NewMetaIterator());
+  return Status::OK();
+}
+
+template <typename TBlocklike>
+Cache::Priority BlockBasedTable::GetCachePriority() const {
+  // Here we treat the legacy name "...index_and_filter_blocks..." to mean all
+  // metadata blocks that might go into block cache, EXCEPT only those needed
+  // for the read path (Get, etc.). TableProperties should not be needed on the
+  // read path (prefix extractor setting is an O(1) size special case that we
+  // are working not to require from TableProperties), so it is not given
+  // high-priority treatment if it should go into BlockCache.
+  if constexpr (TBlocklike::kBlockType == BlockType::kData ||
+                TBlocklike::kBlockType == BlockType::kProperties) {
+    return Cache::Priority::LOW;
+  } else if (rep_->table_options
+                 .cache_index_and_filter_blocks_with_high_priority) {
+    return Cache::Priority::HIGH;
+  } else {
+    return Cache::Priority::LOW;
+  }
+}
+
+template <typename TBlocklike>
+WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::GetDataBlockFromCache(
+    const Slice& cache_key, BlockCacheInterface<TBlocklike> block_cache,
+    CachableEntry<TBlocklike>* out_parsed_block,
+    GetContext* get_context) const {
+  assert(out_parsed_block);
+  assert(out_parsed_block->IsEmpty());
+
+  Status s;
+  Statistics* statistics = rep_->ioptions.statistics.get();
+
+  // Lookup uncompressed cache first
+  if (block_cache) {
+    assert(!cache_key.empty());
+    auto cache_handle = block_cache.LookupFull(
+        cache_key, &rep_->create_context, GetCachePriority<TBlocklike>(),
+        statistics, rep_->ioptions.lowest_used_cache_tier);
+
+    // Avoid updating metrics here if the handle is not complete yet. This
+    // happens with MultiGet and secondary cache. So update the metrics only
+    // if it's a miss, or a hit with the value ready.
+    if (!cache_handle) {
+      UpdateCacheMissMetrics(TBlocklike::kBlockType, get_context);
+    } else {
+      TBlocklike* value = block_cache.Value(cache_handle);
+      if (value) {
+        UpdateCacheHitMetrics(TBlocklike::kBlockType, get_context,
+                              block_cache.get()->GetUsage(cache_handle));
+      }
+      out_parsed_block->SetCachedValue(value, block_cache.get(), cache_handle);
+      return s;
+    }
+  }
+
+  // Not found in the block cache; leave out_parsed_block empty so the caller
+  // falls back to reading the block from the file.
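+  // Editor's sketch (illustrative only; names match the functions defined in
+  // this file): a typical caller pairs this lookup with PutDataBlockToCache
+  // on a miss, roughly:
+  //
+  //   CachableEntry<Block> entry;
+  //   s = GetDataBlockFromCache(key, block_cache, &entry, get_context);
+  //   if (entry.IsEmpty()) {  // miss: read from file, then insert
+  //     s = PutDataBlockToCache(key, block_cache, &entry,
+  //                             std::move(contents), comp_type, dict,
+  //                             allocator, get_context);
+  //   }
+  //
+  // MaybeReadBlockAndLoadToCache below implements this sequence.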
+ assert(out_parsed_block->IsEmpty()); + + return s; +} + +template +WithBlocklikeCheck BlockBasedTable::PutDataBlockToCache( + const Slice& cache_key, BlockCacheInterface block_cache, + CachableEntry* out_parsed_block, BlockContents&& block_contents, + CompressionType block_comp_type, + const UncompressionDict& uncompression_dict, + MemoryAllocator* memory_allocator, GetContext* get_context) const { + const ImmutableOptions& ioptions = rep_->ioptions; + const uint32_t format_version = rep_->table_options.format_version; + assert(out_parsed_block); + assert(out_parsed_block->IsEmpty()); + + Status s; + Statistics* statistics = ioptions.stats; + + std::unique_ptr block_holder; + if (block_comp_type != kNoCompression) { + // Retrieve the uncompressed contents into a new buffer + BlockContents uncompressed_block_contents; + UncompressionContext context(block_comp_type); + UncompressionInfo info(context, uncompression_dict, block_comp_type); + s = UncompressBlockData(info, block_contents.data.data(), + block_contents.data.size(), + &uncompressed_block_contents, format_version, + ioptions, memory_allocator); + if (!s.ok()) { + return s; + } + rep_->create_context.Create(&block_holder, + std::move(uncompressed_block_contents)); + } else { + rep_->create_context.Create(&block_holder, std::move(block_contents)); + } + + // insert into uncompressed block cache + if (block_cache && block_holder->own_bytes()) { + size_t charge = block_holder->ApproximateMemoryUsage(); + BlockCacheTypedHandle* cache_handle = nullptr; + s = block_cache.InsertFull(cache_key, block_holder.get(), charge, + &cache_handle, GetCachePriority(), + rep_->ioptions.lowest_used_cache_tier); + + if (s.ok()) { + assert(cache_handle != nullptr); + out_parsed_block->SetCachedValue(block_holder.release(), + block_cache.get(), cache_handle); + + UpdateCacheInsertionMetrics(TBlocklike::kBlockType, get_context, charge, + s.IsOkOverwritten(), rep_->ioptions.stats); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + } + } else { + out_parsed_block->SetOwnedValue(std::move(block_holder)); + } + + return s; +} + +std::unique_ptr BlockBasedTable::CreateFilterBlockReader( + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, BlockCacheLookupContext* lookup_context) { + auto& rep = rep_; + auto filter_type = rep->filter_type; + if (filter_type == Rep::FilterType::kNoFilter) { + return std::unique_ptr(); + } + + assert(rep->filter_policy); + + switch (filter_type) { + case Rep::FilterType::kPartitionedFilter: + return PartitionedFilterBlockReader::Create( + this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context); + + case Rep::FilterType::kFullFilter: + return FullFilterBlockReader::Create(this, ro, prefetch_buffer, use_cache, + prefetch, pin, lookup_context); + + default: + // filter_type is either kNoFilter (exited the function at the first if), + // or it must be covered in this switch block + assert(false); + return std::unique_ptr(); + } +} + +// disable_prefix_seek should be set to true when prefix_extractor found in SST +// differs from the one in mutable_cf_options and index type is HashBasedIndex +InternalIteratorBase* BlockBasedTable::NewIndexIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* input_iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + assert(rep_ != nullptr); + assert(rep_->index_reader != nullptr); + + // We don't return pinned data from index blocks, so no need + // to set 
`block_contents_pinned`. + return rep_->index_reader->NewIterator(read_options, disable_prefix_seek, + input_iter, get_context, + lookup_context); +} + +// TODO? +template <> +DataBlockIter* BlockBasedTable::InitBlockIterator( + const Rep* rep, Block* block, BlockType block_type, + DataBlockIter* input_iter, bool block_contents_pinned) { + return block->NewDataIterator(rep->internal_comparator.user_comparator(), + rep->get_global_seqno(block_type), input_iter, + rep->ioptions.stats, block_contents_pinned); +} + +// TODO? +template <> +IndexBlockIter* BlockBasedTable::InitBlockIterator( + const Rep* rep, Block* block, BlockType block_type, + IndexBlockIter* input_iter, bool block_contents_pinned) { + return block->NewIndexIterator( + rep->internal_comparator.user_comparator(), + rep->get_global_seqno(block_type), input_iter, rep->ioptions.stats, + /* total_order_seek */ true, rep->index_has_first_key, + rep->index_key_includes_seq, rep->index_value_is_full, + block_contents_pinned); +} + +// If contents is nullptr, this function looks up the block caches for the +// data block referenced by handle, and read the block from disk if necessary. +// If contents is non-null, it skips the cache lookup and disk read, since +// the caller has already read it. In both cases, if ro.fill_cache is true, +// it inserts the block into the block cache. +template +WithBlocklikeCheck +BlockBasedTable::MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + bool for_compaction, CachableEntry* out_parsed_block, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents, bool async_read) const { + assert(out_parsed_block != nullptr); + const bool no_io = (ro.read_tier == kBlockCacheTier); + BlockCacheInterface block_cache{ + rep_->table_options.block_cache.get()}; + + // First, try to get the block from the cache + // + // If either block cache is enabled, we'll try to read from it. + Status s; + CacheKey key_data; + Slice key; + bool is_cache_hit = false; + if (block_cache) { + // create key for block cache + key_data = GetCacheKey(rep_->base_cache_key, handle); + key = key_data.AsSlice(); + + if (!contents) { + s = GetDataBlockFromCache(key, block_cache, out_parsed_block, + get_context); + // Value could still be null at this point, so check the cache handle + // and update the read pattern for prefetching + if (out_parsed_block->GetValue() || out_parsed_block->GetCacheHandle()) { + // TODO(haoyu): Differentiate cache hit on uncompressed block cache and + // compressed block cache. + is_cache_hit = true; + if (prefetch_buffer) { + // Update the block details so that PrefetchBuffer can use the read + // pattern to determine if reads are sequential or not for + // prefetching. It should also take in account blocks read from cache. + prefetch_buffer->UpdateReadPattern( + handle.offset(), BlockSizeWithTrailer(handle), + ro.adaptive_readahead /*decrease_readahead_size*/); + } + } + } + + // Can't find the block from the cache. If I/O is allowed, read from the + // file. 
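+    // Editor's note (illustrative): the gate below means a file read happens
+    // here only when all three hold: the cache lookup missed, I/O is allowed
+    // (read_tier != kBlockCacheTier), and ro.fill_cache is set; otherwise the
+    // entry is left empty and RetrieveBlock falls back to an uncached read.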
+ if (out_parsed_block->GetValue() == nullptr && + out_parsed_block->GetCacheHandle() == nullptr && !no_io && + ro.fill_cache) { + Statistics* statistics = rep_->ioptions.stats; + const bool maybe_compressed = + TBlocklike::kBlockType != BlockType::kFilter && + TBlocklike::kBlockType != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed; + CompressionType contents_comp_type; + // Maybe serialized or uncompressed + BlockContents tmp_contents; + if (!contents) { + Histograms histogram = for_compaction ? READ_BLOCK_COMPACTION_MICROS + : READ_BLOCK_GET_MICROS; + StopWatch sw(rep_->ioptions.clock, statistics, histogram); + BlockFetcher block_fetcher( + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, + &tmp_contents, rep_->ioptions, do_uncompress, maybe_compressed, + TBlocklike::kBlockType, uncompression_dict, + rep_->persistent_cache_options, + GetMemoryAllocator(rep_->table_options), + /*allocator=*/nullptr); + + // If prefetch_buffer is not allocated, it will fallback to synchronous + // reading of block contents. + if (async_read && prefetch_buffer != nullptr) { + s = block_fetcher.ReadAsyncBlockContents(); + if (!s.ok()) { + return s; + } + } else { + s = block_fetcher.ReadBlockContents(); + } + + contents_comp_type = block_fetcher.get_compression_type(); + contents = &tmp_contents; + if (get_context) { + switch (TBlocklike::kBlockType) { + case BlockType::kIndex: + ++get_context->get_context_stats_.num_index_read; + break; + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + ++get_context->get_context_stats_.num_filter_read; + break; + default: + break; + } + } + } else { + contents_comp_type = GetBlockCompressionType(*contents); + } + + if (s.ok()) { + // If filling cache is allowed and a cache is configured, try to put the + // block to the cache. + s = PutDataBlockToCache( + key, block_cache, out_parsed_block, std::move(*contents), + contents_comp_type, uncompression_dict, + GetMemoryAllocator(rep_->table_options), get_context); + } + } + } + + // TODO: optimize so that lookup_context != nullptr implies the others + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && + lookup_context) { + SaveLookupContextOrTraceRecord( + key, is_cache_hit, ro, out_parsed_block->GetValue(), lookup_context); + } + + assert(s.ok() || out_parsed_block->GetValue() == nullptr); + return s; +} + +template +WithBlocklikeCheck +BlockBasedTable::SaveLookupContextOrTraceRecord( + const Slice& block_key, bool is_cache_hit, const ReadOptions& ro, + const TBlocklike* parsed_block_value, + BlockCacheLookupContext* lookup_context) const { + assert(lookup_context); + size_t usage = 0; + uint64_t nkeys = 0; + if (parsed_block_value) { + // Approximate the number of keys in the block using restarts. + int interval = rep_->table_options.block_restart_interval; + nkeys = interval * GetBlockNumRestarts(*parsed_block_value); + // On average, the last restart should be just over half utilized. + // Specifically, 1..N should be N/2 + 0.5. For example, 7 -> 4, 8 -> 4.5. + // Use the get_id to alternate between rounding up vs. down. 
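+    // Worked example (editor's illustration): with
+    // block_restart_interval = 16 and 3 restarts, nkeys starts at 48;
+    // subtracting (16 - rounding) / 2 gives 40 when rounding == 0 and 41
+    // when rounding == 1, so alternating get_ids average to ~40.5 keys.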
+ if (nkeys > 0) { + bool rounding = static_cast(lookup_context->get_id) & 1; + nkeys -= (interval - rounding) / 2; + } + usage = parsed_block_value->ApproximateMemoryUsage(); + } + TraceType trace_block_type = TraceType::kTraceMax; + switch (TBlocklike::kBlockType) { + case BlockType::kData: + trace_block_type = TraceType::kBlockTraceDataBlock; + break; + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + trace_block_type = TraceType::kBlockTraceFilterBlock; + break; + case BlockType::kCompressionDictionary: + trace_block_type = TraceType::kBlockTraceUncompressionDictBlock; + break; + case BlockType::kRangeDeletion: + trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; + break; + case BlockType::kIndex: + trace_block_type = TraceType::kBlockTraceIndexBlock; + break; + default: + // This cannot happen. + assert(false); + break; + } + const bool no_io = ro.read_tier == kBlockCacheTier; + bool no_insert = no_io || !ro.fill_cache; + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock( + trace_block_type, lookup_context->caller)) { + // Make a copy of the block key here since it will be logged later. + lookup_context->FillLookupContext(is_cache_hit, no_insert, trace_block_type, + /*block_size=*/usage, + block_key.ToString(), nkeys); + + // Defer logging the access to Get() and MultiGet() to trace additional + // information, e.g., referenced_key + } else { + // Avoid making copy of block_key if it doesn't need to be saved in + // BlockCacheLookupContext + lookup_context->FillLookupContext(is_cache_hit, no_insert, trace_block_type, + /*block_size=*/usage, + /*block_key=*/{}, nkeys); + + // Fill in default values for irrelevant/unknown fields + FinishTraceRecord(*lookup_context, block_key, + lookup_context->referenced_key, + /*does_referenced_key_exist*/ false, + /*referenced_data_size*/ 0); + } +} + +void BlockBasedTable::FinishTraceRecord( + const BlockCacheLookupContext& lookup_context, const Slice& block_key, + const Slice& referenced_key, bool does_referenced_key_exist, + uint64_t referenced_data_size) const { + // Avoid making copy of referenced_key if it doesn't need to be saved in + // BlockCacheLookupContext + BlockCacheTraceRecord access_record( + rep_->ioptions.clock->NowMicros(), + /*block_key=*/"", lookup_context.block_type, lookup_context.block_size, + rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), rep_->sst_number_for_tracing(), + lookup_context.caller, lookup_context.is_cache_hit, + lookup_context.no_insert, lookup_context.get_id, + lookup_context.get_from_user_specified_snapshot, + /*referenced_key=*/"", referenced_data_size, + lookup_context.num_keys_in_block, does_referenced_key_exist); + // TODO: Should handle status here? 
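+  // Editor's note: the trace write's own status is intentionally swallowed
+  // via PermitUncheckedError() below; a failed trace record should not fail
+  // the read path.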
+ block_cache_tracer_ + ->WriteBlockAccess(access_record, block_key, rep_->cf_name_for_tracing(), + referenced_key) + .PermitUncheckedError(); +} + +template +WithBlocklikeCheck BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* out_parsed_block, GetContext* get_context, + BlockCacheLookupContext* lookup_context, bool for_compaction, + bool use_cache, bool async_read) const { + assert(out_parsed_block); + assert(out_parsed_block->IsEmpty()); + + Status s; + if (use_cache) { + s = MaybeReadBlockAndLoadToCache( + prefetch_buffer, ro, handle, uncompression_dict, for_compaction, + out_parsed_block, get_context, lookup_context, + /*contents=*/nullptr, async_read); + + if (!s.ok()) { + return s; + } + + if (out_parsed_block->GetValue() != nullptr || + out_parsed_block->GetCacheHandle() != nullptr) { + assert(s.ok()); + return s; + } + } + + assert(out_parsed_block->IsEmpty()); + + const bool no_io = ro.read_tier == kBlockCacheTier; + if (no_io) { + return Status::Incomplete("no blocking io"); + } + + const bool maybe_compressed = + TBlocklike::kBlockType != BlockType::kFilter && + TBlocklike::kBlockType != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; + std::unique_ptr block; + + { + Histograms histogram = + for_compaction ? READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS; + StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram); + s = ReadAndParseBlockFromFile( + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, + rep_->ioptions, rep_->create_context, maybe_compressed, + uncompression_dict, rep_->persistent_cache_options, + GetMemoryAllocator(rep_->table_options), for_compaction, async_read); + + if (get_context) { + switch (TBlocklike::kBlockType) { + case BlockType::kIndex: + ++(get_context->get_context_stats_.num_index_read); + break; + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + ++(get_context->get_context_stats_.num_filter_read); + break; + default: + break; + } + } + } + + if (!s.ok()) { + return s; + } + + out_parsed_block->SetOwnedValue(std::move(block)); + + assert(s.ok()); + return s; +} + +BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( + const BlockBasedTable* table, + UnorderedMap>* block_map) + : table_(table), block_map_(block_map) {} + +InternalIteratorBase* +BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( + const BlockHandle& handle) { + // Return a block iterator on the index partition + auto block = block_map_->find(handle.offset()); + // block_map_ must be exhaustive + if (block == block_map_->end()) { + assert(false); + // Signal problem to caller + return nullptr; + } + const Rep* rep = table_->get_rep(); + assert(rep); + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + return block->second.GetValue()->NewIndexIterator( + rep->internal_comparator.user_comparator(), + rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, + rep->index_has_first_key, rep->index_key_includes_seq, + rep->index_value_is_full); +} + +// This will be broken if the user specifies an unusual implementation +// of Options.comparator, or if the user specifies an unusual +// definition of prefixes in BlockBasedTableOptions.filter_policy. 
+// In particular, we require the following three properties: +// +// 1) key.starts_with(prefix(key)) +// 2) Compare(prefix(key), key) <= 0. +// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 +// +// If read_options.read_tier == kBlockCacheTier, this method will do no I/O and +// will return true if the filter block is not in memory and not found in block +// cache. +// +// REQUIRES: this method shouldn't be called while the DB lock is held. +bool BlockBasedTable::PrefixRangeMayMatch( + const Slice& internal_key, const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check, BlockCacheLookupContext* lookup_context, + bool* filter_checked) const { + if (!rep_->filter_policy) { + return true; + } + + const SliceTransform* prefix_extractor; + + if (rep_->table_prefix_extractor == nullptr) { + if (need_upper_bound_check) { + return true; + } + prefix_extractor = options_prefix_extractor; + } else { + prefix_extractor = rep_->table_prefix_extractor.get(); + } + auto ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); + auto user_key_without_ts = + ExtractUserKeyAndStripTimestamp(internal_key, ts_sz); + if (!prefix_extractor->InDomain(user_key_without_ts)) { + return true; + } + + bool may_match = true; + + FilterBlockReader* const filter = rep_->filter.get(); + *filter_checked = false; + if (filter != nullptr) { + const bool no_io = read_options.read_tier == kBlockCacheTier; + + const Slice* const const_ikey_ptr = &internal_key; + may_match = filter->RangeMayExist( + read_options.iterate_upper_bound, user_key_without_ts, prefix_extractor, + rep_->internal_comparator.user_comparator(), const_ikey_ptr, + filter_checked, need_upper_bound_check, no_io, lookup_context, + read_options); + } + + return may_match; +} + +bool BlockBasedTable::PrefixExtractorChanged( + const SliceTransform* prefix_extractor) const { + if (prefix_extractor == nullptr) { + return true; + } else if (prefix_extractor == rep_->table_prefix_extractor.get()) { + return false; + } else { + return PrefixExtractorChangedHelper(rep_->table_properties.get(), + prefix_extractor); + } +} + +Statistics* BlockBasedTable::GetStatistics() const { + return rep_->ioptions.stats; +} +bool BlockBasedTable::IsLastLevel() const { + return rep_->level == rep_->ioptions.num_levels - 1; +} + +InternalIterator* BlockBasedTable::NewIterator( + const ReadOptions& read_options, const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, TableReaderCaller caller, + size_t compaction_readahead_size, bool allow_unprepared_value) { + BlockCacheLookupContext lookup_context{caller}; + bool need_upper_bound_check = + read_options.auto_prefix_mode || PrefixExtractorChanged(prefix_extractor); + std::unique_ptr> index_iter(NewIndexIterator( + read_options, + /*disable_prefix_seek=*/need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context)); + if (arena == nullptr) { + return new BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, std::move(index_iter), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, caller, + compaction_readahead_size, allow_unprepared_value); + } else { + auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); + return new (mem) BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, 
std::move(index_iter), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, caller, + compaction_readahead_size, allow_unprepared_value); + } +} + +FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( + const ReadOptions& read_options) { + if (rep_->fragmented_range_dels == nullptr) { + return nullptr; + } + SequenceNumber snapshot = kMaxSequenceNumber; + if (read_options.snapshot != nullptr) { + snapshot = read_options.snapshot->GetSequenceNumber(); + } + return new FragmentedRangeTombstoneIterator(rep_->fragmented_range_dels, + rep_->internal_comparator, + snapshot, read_options.timestamp); +} + +bool BlockBasedTable::FullFilterKeyMayMatch( + FilterBlockReader* filter, const Slice& internal_key, const bool no_io, + const SliceTransform* prefix_extractor, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) const { + if (filter == nullptr) { + return true; + } + Slice user_key = ExtractUserKey(internal_key); + const Slice* const const_ikey_ptr = &internal_key; + bool may_match = true; + size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); + if (rep_->whole_key_filtering) { + may_match = filter->KeyMayMatch(user_key_without_ts, no_io, const_ikey_ptr, + get_context, lookup_context, read_options); + if (may_match) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); + } else { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); + } + } else if (!PrefixExtractorChanged(prefix_extractor) && + prefix_extractor->InDomain(user_key_without_ts)) { + // FIXME ^^^: there should be no reason for Get() to depend on current + // prefix_extractor at all. It should always use table_prefix_extractor. 
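+    // Editor's illustration (hypothetical extractor): with a fixed-length
+    // prefix extractor of 3 bytes, a user key "foobar" transforms to "foo",
+    // and it is "foo" (not the whole key) that is checked against the filter
+    // below.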
+    may_match = filter->PrefixMayMatch(
+        prefix_extractor->Transform(user_key_without_ts), no_io,
+        const_ikey_ptr, get_context, lookup_context, read_options);
+    RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED);
+    if (may_match) {
+      // Includes prefix stats
+      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
+    } else {
+      RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_USEFUL);
+      // Includes prefix stats
+      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
+    }
+  }
+  return may_match;
+}
+
+void BlockBasedTable::FullFilterKeysMayMatch(
+    FilterBlockReader* filter, MultiGetRange* range, const bool no_io,
+    const SliceTransform* prefix_extractor,
+    BlockCacheLookupContext* lookup_context,
+    const ReadOptions& read_options) const {
+  if (filter == nullptr) {
+    return;
+  }
+  uint64_t before_keys = range->KeysLeft();
+  assert(before_keys > 0);  // Caller should ensure
+  if (rep_->whole_key_filtering) {
+    filter->KeysMayMatch(range, no_io, lookup_context, read_options);
+    uint64_t after_keys = range->KeysLeft();
+    if (after_keys) {
+      RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE, after_keys);
+      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, after_keys,
+                                rep_->level);
+    }
+    uint64_t filtered_keys = before_keys - after_keys;
+    if (filtered_keys) {
+      RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL, filtered_keys);
+      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, filtered_keys,
+                                rep_->level);
+    }
+  } else if (!PrefixExtractorChanged(prefix_extractor)) {
+    // FIXME ^^^: there should be no reason for MultiGet() to depend on current
+    // prefix_extractor at all. It should always use table_prefix_extractor.
+    filter->PrefixesMayMatch(range, prefix_extractor, false, lookup_context,
+                             read_options);
+    RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED, before_keys);
+    uint64_t after_keys = range->KeysLeft();
+    if (after_keys) {
+      // Includes prefix stats
+      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, after_keys,
+                                rep_->level);
+    }
+    uint64_t filtered_keys = before_keys - after_keys;
+    if (filtered_keys) {
+      RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_USEFUL,
+                 filtered_keys);
+      // Includes prefix stats
+      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, filtered_keys,
+                                rep_->level);
+    }
+  }
+}
+
+Status BlockBasedTable::ApproximateKeyAnchors(const ReadOptions& read_options,
+                                              std::vector<Anchor>& anchors) {
+  // We iterate over the whole index block here. A more efficient
+  // implementation would be possible if we pushed this operation into
+  // IndexReader. For example, we could directly sample from restart block
+  // entries in the index block and only read the keys needed. Here we take a
+  // simple solution. Performance is likely not to be a problem. We are
+  // compacting the whole file, so all keys will be read out anyway. An extra
+  // read to the index block might be a small share of the overhead. We can
+  // try to optimize if needed.
+  //
+  // `CacheDependencies()` brings all the blocks into cache using one I/O. That
+  // way the full index scan usually finds the index data it is looking for in
+  // cache rather than doing an I/O for each "dependency" (partition).
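+  // Editor's arithmetic note (illustrative figures): with
+  // num_data_blocks = 1000 and kMaxNumAnchors = 128, the sampling loop below
+  // uses num_blocks_per_anchor = 1000 / 128 = 7 (integer division), emitting
+  // an anchor every 7th block, i.e. about 142 anchors; the nominal maximum
+  // is therefore approximate rather than a hard cap.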
+  Status s = rep_->index_reader->CacheDependencies(
+      read_options, false /* pin */, nullptr /* prefetch_buffer */);
+  if (!s.ok()) {
+    return s;
+  }
+
+  IndexBlockIter iiter_on_stack;
+  auto iiter = NewIndexIterator(
+      read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
+      /*get_context=*/nullptr, /*lookup_context=*/nullptr);
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+  if (iiter != &iiter_on_stack) {
+    iiter_unique_ptr.reset(iiter);
+  }
+
+  // If needed, the threshold could be made more adaptive. For example, it
+  // could be based on size, so that a larger file would be sampled at more
+  // anchors than a smaller file. The size might also need to be passed in by
+  // the caller based on total compaction size.
+  const uint64_t kMaxNumAnchors = uint64_t{128};
+  uint64_t num_blocks = this->GetTableProperties()->num_data_blocks;
+  uint64_t num_blocks_per_anchor = num_blocks / kMaxNumAnchors;
+  if (num_blocks_per_anchor == 0) {
+    num_blocks_per_anchor = 1;
+  }
+
+  uint64_t count = 0;
+  std::string last_key;
+  uint64_t range_size = 0;
+  uint64_t prev_offset = 0;
+  for (iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) {
+    const BlockHandle& bh = iiter->value().handle;
+    range_size += bh.offset() + bh.size() - prev_offset;
+    prev_offset = bh.offset() + bh.size();
+    if (++count % num_blocks_per_anchor == 0) {
+      count = 0;
+      anchors.emplace_back(iiter->user_key(), range_size);
+      range_size = 0;
+    } else {
+      last_key = iiter->user_key().ToString();
+    }
+  }
+  if (count != 0) {
+    anchors.emplace_back(last_key, range_size);
+  }
+  return Status::OK();
+}
+
+bool BlockBasedTable::TimestampMayMatch(
+    const ReadOptions& read_options) const {
+  if (read_options.timestamp != nullptr && !rep_->min_timestamp.empty()) {
+    RecordTick(rep_->ioptions.stats, TIMESTAMP_FILTER_TABLE_CHECKED);
+    auto read_ts = read_options.timestamp;
+    auto comparator = rep_->internal_comparator.user_comparator();
+    if (comparator->CompareTimestamp(*read_ts, rep_->min_timestamp) < 0) {
+      RecordTick(rep_->ioptions.stats, TIMESTAMP_FILTER_TABLE_FILTERED);
+      return false;
+    }
+  }
+  return true;
+}
+
+Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
+                            GetContext* get_context,
+                            const SliceTransform* prefix_extractor,
+                            bool skip_filters) {
+  // Similar to Bloom filter !may_match
+  // If timestamp is beyond the range of the table, skip
+  if (!TimestampMayMatch(read_options)) {
+    return Status::OK();
+  }
+  assert(key.size() >= 8);  // key must be internal key
+  assert(get_context != nullptr);
+  Status s;
+  const bool no_io = read_options.read_tier == kBlockCacheTier;
+
+  FilterBlockReader* const filter =
+      !skip_filters ? rep_->filter.get() : nullptr;
+
+  // First check the full filter.
+  // If the full filter is not useful, then go into each block.
+  uint64_t tracing_get_id = get_context->get_tracing_get_id();
+  BlockCacheLookupContext lookup_context{
+      TableReaderCaller::kUserGet, tracing_get_id,
+      /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
+  if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
+    // Trace the key since it contains both user key and sequence number.
+    lookup_context.referenced_key = key.ToString();
+    lookup_context.get_from_user_specified_snapshot =
+        read_options.snapshot != nullptr;
+  }
+  TEST_SYNC_POINT("BlockBasedTable::Get:BeforeFilterMatch");
+  const bool may_match =
+      FullFilterKeyMayMatch(filter, key, no_io, prefix_extractor, get_context,
+                            &lookup_context, read_options);
+  TEST_SYNC_POINT("BlockBasedTable::Get:AfterFilterMatch");
+  if (may_match) {
+    IndexBlockIter iiter_on_stack;
+    // if prefix_extractor found in block differs from options, disable
+    // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
+    bool need_upper_bound_check = false;
+    if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
+      need_upper_bound_check = PrefixExtractorChanged(prefix_extractor);
+    }
+    auto iiter =
+        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
+                         get_context, &lookup_context);
+    std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+    if (iiter != &iiter_on_stack) {
+      iiter_unique_ptr.reset(iiter);
+    }
+
+    size_t ts_sz =
+        rep_->internal_comparator.user_comparator()->timestamp_size();
+    bool matched = false;  // if such user key matched a key in SST
+    bool done = false;
+    for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
+      IndexValue v = iiter->value();
+
+      if (!v.first_internal_key.empty() && !skip_filters &&
+          UserComparatorWrapper(rep_->internal_comparator.user_comparator())
+                  .CompareWithoutTimestamp(
+                      ExtractUserKey(key),
+                      ExtractUserKey(v.first_internal_key)) < 0) {
+        // The requested key falls between highest key in previous block and
+        // lowest key in current block.
+        break;
+      }
+
+      BlockCacheLookupContext lookup_data_block_context{
+          TableReaderCaller::kUserGet, tracing_get_id,
+          /*get_from_user_specified_snapshot=*/read_options.snapshot !=
+              nullptr};
+      bool does_referenced_key_exist = false;
+      DataBlockIter biter;
+      uint64_t referenced_data_size = 0;
+      Status tmp_status;
+      NewDataBlockIterator<DataBlockIter>(
+          read_options, v.handle, &biter, BlockType::kData, get_context,
+          &lookup_data_block_context, /*prefetch_buffer=*/nullptr,
+          /*for_compaction=*/false, /*async_read=*/false, tmp_status);
+
+      if (no_io && biter.status().IsIncomplete()) {
+        // couldn't get block from block_cache
+        // Update Saver.state to Found because we are only looking for
+        // whether we can guarantee the key is not there when "no_io" is set
+        get_context->MarkKeyMayExist();
+        s = biter.status();
+        break;
+      }
+      if (!biter.status().ok()) {
+        s = biter.status();
+        break;
+      }
+
+      bool may_exist = biter.SeekForGet(key);
+      // If user-specified timestamp is supported, we cannot end the search
+      // just because hash index lookup indicates the key+ts does not exist.
+      if (!may_exist && ts_sz == 0) {
+        // HashSeek cannot find the key in this block, and the iter is not at
+        // the end of the block, i.e. the key cannot be in the following
+        // blocks either. In this case, the seek_key cannot be found, so we
+        // break from the top level for-loop.
+        done = true;
+      } else {
+        // Call the *saver function on each entry/block until it returns false
+        for (; biter.Valid(); biter.Next()) {
+          ParsedInternalKey parsed_key;
+          Status pik_status = ParseInternalKey(
+              biter.key(), &parsed_key, false /* log_err_key */);  // TODO
+          if (!pik_status.ok()) {
+            s = pik_status;
+          }
+
+          if (!get_context->SaveValue(
+                  parsed_key, biter.value(), &matched,
+                  biter.IsValuePinned() ?
&biter : nullptr)) { + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + } + done = true; + break; + } + } + s = biter.status(); + if (!s.ok()) { + break; + } + } + // Write the block cache access record. + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter.key(); + } else { + referenced_key = key; + } + FinishTraceRecord(lookup_data_block_context, + lookup_data_block_context.block_key, referenced_key, + does_referenced_key_exist, referenced_data_size); + } + + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + } + if (matched && filter != nullptr) { + if (rep_->whole_key_filtering) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); + } else { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_TRUE_POSITIVE); + } + // Includes prefix stats + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + + if (s.ok() && !iiter->status().IsNotFound()) { + s = iiter->status(); + } + } + + return s; +} + +Status BlockBasedTable::MultiGetFilter(const ReadOptions& read_options, + const SliceTransform* prefix_extractor, + MultiGetRange* mget_range) { + if (mget_range->empty()) { + // Caller should ensure non-empty (performance bug) + assert(false); + return Status::OK(); // Nothing to do + } + + FilterBlockReader* const filter = rep_->filter.get(); + if (!filter) { + return Status::OK(); + } + + // First check the full filter + // If full filter not useful, Then go into each block + const bool no_io = read_options.read_tier == kBlockCacheTier; + uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; + if (mget_range->begin()->get_context) { + tracing_mget_id = mget_range->begin()->get_context->get_tracing_get_id(); + } + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*_get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + FullFilterKeysMayMatch(filter, mget_range, no_io, prefix_extractor, + &lookup_context, read_options); + + return Status::OK(); +} + +Status BlockBasedTable::Prefetch(const ReadOptions& read_options, + const Slice* const begin, + const Slice* const end) { + auto& comparator = rep_->internal_comparator; + UserComparatorWrapper user_comparator(comparator.user_comparator()); + // pre-condition + if (begin && end && comparator.Compare(*begin, *end) > 0) { + return Status::InvalidArgument(*begin, *end); + } + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + IndexBlockIter iiter_on_stack; + auto iiter = NewIndexIterator(read_options, /*need_upper_bound_check=*/false, + &iiter_on_stack, /*get_context=*/nullptr, + &lookup_context); + std::unique_ptr> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr>(iiter); + } + + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + + // indicates if we are on the last page that need to be pre-fetched + bool prefetching_boundary_page = false; + + for (begin ? 
iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); + iiter->Next()) { + BlockHandle block_handle = iiter->value().handle; + const bool is_user_key = !rep_->index_key_includes_seq; + if (end && + ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || + (is_user_key && + user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { + if (prefetching_boundary_page) { + break; + } + + // The index entry represents the last key in the data block. + // We should load this page into memory as well, but no more + prefetching_boundary_page = true; + } + + // Load the block specified by the block_handle into the block cache + DataBlockIter biter; + Status tmp_status; + NewDataBlockIterator( + read_options, block_handle, &biter, /*type=*/BlockType::kData, + /*get_context=*/nullptr, &lookup_context, + /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, + /*async_read=*/false, tmp_status); + + if (!biter.status().ok()) { + // there was an unexpected error while pre-fetching + return biter.status(); + } + } + + return Status::OK(); +} + +Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options, + TableReaderCaller caller) { + Status s; + // Check Meta blocks + std::unique_ptr metaindex; + std::unique_ptr metaindex_iter; + s = ReadMetaIndexBlock(read_options, nullptr /* prefetch buffer */, + &metaindex, &metaindex_iter); + if (s.ok()) { + s = VerifyChecksumInMetaBlocks(read_options, metaindex_iter.get()); + if (!s.ok()) { + return s; + } + } else { + return s; + } + // Check Data blocks + IndexBlockIter iiter_on_stack; + BlockCacheLookupContext context{caller}; + InternalIteratorBase* iiter = NewIndexIterator( + read_options, /*disable_prefix_seek=*/false, &iiter_on_stack, + /*get_context=*/nullptr, &context); + std::unique_ptr> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr>(iiter); + } + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + s = VerifyChecksumInBlocks(read_options, iiter); + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks( + const ReadOptions& read_options, + InternalIteratorBase* index_iter) { + Status s; + // We are scanning the whole file, so no need to do exponential + // increasing of the buffer size. + size_t readahead_size = (read_options.readahead_size != 0) + ? read_options.readahead_size + : rep_->table_options.max_auto_readahead_size; + // FilePrefetchBuffer doesn't work in mmap mode and readahead is not + // needed there. + FilePrefetchBuffer prefetch_buffer( + readahead_size /* readahead_size */, + readahead_size /* max_readahead_size */, + !rep_->ioptions.allow_mmap_reads /* enable */); + + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); + if (!s.ok()) { + break; + } + BlockHandle handle = index_iter->value().handle; + BlockContents contents; + BlockFetcher block_fetcher( + rep_->file.get(), &prefetch_buffer, rep_->footer, read_options, handle, + &contents, rep_->ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kData, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + // In the case of two level indexes, we would have exited the above loop + // by checking index_iter->Valid(), but Valid() might have returned false + // due to an IO error. 
So check the index_iter status + s = index_iter->status(); + } + return s; +} + +BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName( + const Slice& meta_block_name) { + if (meta_block_name.starts_with(kFullFilterBlockPrefix)) { + return BlockType::kFilter; + } + + if (meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) { + return BlockType::kFilterPartitionIndex; + } + + if (meta_block_name == kPropertiesBlockName) { + return BlockType::kProperties; + } + + if (meta_block_name == kCompressionDictBlockName) { + return BlockType::kCompressionDictionary; + } + + if (meta_block_name == kRangeDelBlockName) { + return BlockType::kRangeDeletion; + } + + if (meta_block_name == kHashIndexPrefixesBlock) { + return BlockType::kHashIndexPrefixes; + } + + if (meta_block_name == kHashIndexPrefixesMetadataBlock) { + return BlockType::kHashIndexMetadata; + } + + if (meta_block_name.starts_with(kObsoleteFilterBlockPrefix)) { + // Obsolete but possible in old files + return BlockType::kInvalid; + } + + assert(false); + return BlockType::kInvalid; +} + +Status BlockBasedTable::VerifyChecksumInMetaBlocks( + const ReadOptions& read_options, InternalIteratorBase* index_iter) { + Status s; + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); + if (!s.ok()) { + break; + } + BlockHandle handle; + Slice input = index_iter->value(); + s = handle.DecodeFrom(&input); + BlockContents contents; + const Slice meta_block_name = index_iter->key(); + if (meta_block_name == kPropertiesBlockName) { + // Unfortunate special handling for properties block checksum w/ + // global seqno + std::unique_ptr table_properties; + s = ReadTablePropertiesHelper(read_options, handle, rep_->file.get(), + nullptr /* prefetch_buffer */, rep_->footer, + rep_->ioptions, &table_properties, + nullptr /* memory_allocator */); + } else { + s = BlockFetcher( + rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, + read_options, handle, &contents, rep_->ioptions, + false /* decompress */, false /*maybe_compressed*/, + GetBlockTypeForMetaBlockByName(meta_block_name), + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options) + .ReadBlockContents(); + } + if (!s.ok()) { + break; + } + } + return s; +} + +bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const { + assert(rep_ != nullptr); + + Cache* const cache = rep_->table_options.block_cache.get(); + if (cache == nullptr) { + return false; + } + + CacheKey key = GetCacheKey(rep_->base_cache_key, handle); + + Cache::Handle* const cache_handle = cache->Lookup(key.AsSlice()); + if (cache_handle == nullptr) { + return false; + } + + cache->Release(cache_handle); + + return true; +} + +bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, + const Slice& key) { + std::unique_ptr> iiter(NewIndexIterator( + options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); + iiter->Seek(key); + assert(iiter->Valid()); + + return TEST_BlockInCache(iiter->value().handle); +} + +// REQUIRES: The following fields of rep_ should have already been populated: +// 1. file +// 2. index_handle, +// 3. options +// 4. internal_comparator +// 5. 
index_type +Status BlockBasedTable::CreateIndexReader( + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + switch (rep_->index_type) { + case BlockBasedTableOptions::kTwoLevelIndexSearch: { + return PartitionIndexReader::Create(this, ro, prefetch_buffer, use_cache, + prefetch, pin, lookup_context, + index_reader); + } + case BlockBasedTableOptions::kBinarySearch: + FALLTHROUGH_INTENDED; + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { + return BinarySearchIndexReader::Create(this, ro, prefetch_buffer, + use_cache, prefetch, pin, + lookup_context, index_reader); + } + case BlockBasedTableOptions::kHashSearch: { + if (!rep_->table_prefix_extractor) { + ROCKS_LOG_WARN(rep_->ioptions.logger, + "Missing prefix extractor for hash index. Fall back to" + " binary search index."); + return BinarySearchIndexReader::Create(this, ro, prefetch_buffer, + use_cache, prefetch, pin, + lookup_context, index_reader); + } else { + return HashIndexReader::Create(this, ro, prefetch_buffer, meta_iter, + use_cache, prefetch, pin, lookup_context, + index_reader); + } + } + default: { + std::string error_message = + "Unrecognized index type: " + std::to_string(rep_->index_type); + return Status::InvalidArgument(error_message.c_str()); + } + } +} + +uint64_t BlockBasedTable::ApproximateDataOffsetOf( + const InternalIteratorBase& index_iter, + uint64_t data_size) const { + assert(index_iter.status().ok()); + if (index_iter.Valid()) { + BlockHandle handle = index_iter.value().handle; + return handle.offset(); + } else { + // The iterator is past the last key in the file. + return data_size; + } +} + +uint64_t BlockBasedTable::GetApproximateDataSize() { + // Should be in table properties unless super old version + if (rep_->table_properties) { + return rep_->table_properties->data_size; + } + // Fall back to rough estimate from footer + return rep_->footer.metaindex_handle().offset(); +} + +uint64_t BlockBasedTable::ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, + TableReaderCaller caller) { + uint64_t data_size = GetApproximateDataSize(); + if (UNLIKELY(data_size == 0)) { + // Hmm. Let's just split in half to avoid skewing one way or another, + // since we don't know whether we're operating on lower bound or + // upper bound. + return rep_->file_size / 2; + } + + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + ReadOptions ro; + ro.total_order_seek = true; + ro.io_activity = read_options.io_activity; + auto index_iter = + NewIndexIterator(ro, /*disable_prefix_seek=*/true, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr> iiter_unique_ptr; + if (index_iter != &iiter_on_stack) { + iiter_unique_ptr.reset(index_iter); + } + + index_iter->Seek(key); + uint64_t offset; + if (index_iter->status().ok()) { + offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Split in half to avoid skewing one way or another, + // since we don't know whether we're operating on lower bound or + // upper bound. + return rep_->file_size / 2; + } + + // Pro-rate file metadata (incl filters) size-proportionally across data + // blocks. 
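+  // Editor's example (illustrative figures): with offset = 25 MB,
+  // data_size = 100 MB and file_size = 120 MB, the estimate below is
+  // (25 / 100) * 120 MB = 30 MB.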
+ double size_ratio = + static_cast(offset) / static_cast(data_size); + return static_cast(size_ratio * + static_cast(rep_->file_size)); +} + +uint64_t BlockBasedTable::ApproximateSize(const ReadOptions& read_options, + const Slice& start, const Slice& end, + TableReaderCaller caller) { + assert(rep_->internal_comparator.Compare(start, end) <= 0); + + uint64_t data_size = GetApproximateDataSize(); + if (UNLIKELY(data_size == 0)) { + // Hmm. Assume whole file is involved, since we have lower and upper + // bound. This likely skews the estimate if we consider that this function + // is typically called with `[start, end]` fully contained in the file's + // key-range. + return rep_->file_size; + } + + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + ReadOptions ro; + ro.total_order_seek = true; + ro.io_activity = read_options.io_activity; + auto index_iter = + NewIndexIterator(ro, /*disable_prefix_seek=*/true, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr> iiter_unique_ptr; + if (index_iter != &iiter_on_stack) { + iiter_unique_ptr.reset(index_iter); + } + + index_iter->Seek(start); + uint64_t start_offset; + if (index_iter->status().ok()) { + start_offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Assume file is involved from the start. This likely skews the estimate + // but is consistent with the above error handling. + start_offset = 0; + } + + index_iter->Seek(end); + uint64_t end_offset; + if (index_iter->status().ok()) { + end_offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Assume file is involved until the end. This likely skews the estimate + // but is consistent with the above error handling. + end_offset = data_size; + } + + assert(end_offset >= start_offset); + // Pro-rate file metadata (incl filters) size-proportionally across data + // blocks. 
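+  // Editor's example (illustrative figures): with start_offset = 10 MB,
+  // end_offset = 60 MB, data_size = 100 MB and file_size = 120 MB, the
+  // estimate below is ((60 - 10) / 100) * 120 MB = 60 MB.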
+ double size_ratio = static_cast(end_offset - start_offset) / + static_cast(data_size); + return static_cast(size_ratio * + static_cast(rep_->file_size)); +} + +bool BlockBasedTable::TEST_FilterBlockInCache() const { + assert(rep_ != nullptr); + return rep_->filter_type != Rep::FilterType::kNoFilter && + TEST_BlockInCache(rep_->filter_handle); +} + +bool BlockBasedTable::TEST_IndexBlockInCache() const { + assert(rep_ != nullptr); + + return TEST_BlockInCache(rep_->footer.index_handle()); +} + +Status BlockBasedTable::GetKVPairsFromDataBlocks( + const ReadOptions& read_options, std::vector* kv_pair_blocks) { + std::unique_ptr> blockhandles_iter( + NewIndexIterator(read_options, /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + + Status s = blockhandles_iter->status(); + if (!s.ok()) { + // Cannot read Index Block + return s; + } + + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + + if (!s.ok()) { + break; + } + + std::unique_ptr datablock_iter; + Status tmp_status; + datablock_iter.reset(NewDataBlockIterator( + read_options, blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*get_context=*/nullptr, /*lookup_context=*/nullptr, + /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, + /*async_read=*/false, tmp_status)); + s = datablock_iter->status(); + + if (!s.ok()) { + // Error reading the block - Skipped + continue; + } + + KVPairBlock kv_pair_block; + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + // Error reading the block - Skipped + break; + } + const Slice& key = datablock_iter->key(); + const Slice& value = datablock_iter->value(); + std::string key_copy = std::string(key.data(), key.size()); + std::string value_copy = std::string(value.data(), value.size()); + + kv_pair_block.push_back( + std::make_pair(std::move(key_copy), std::move(value_copy))); + } + kv_pair_blocks->push_back(std::move(kv_pair_block)); + } + return Status::OK(); +} + +Status BlockBasedTable::DumpTable(WritableFile* out_file) { + WritableFileStringStreamAdapter out_file_wrapper(out_file); + std::ostream out_stream(&out_file_wrapper); + // Output Footer + out_stream << "Footer Details:\n" + "--------------------------------------\n"; + out_stream << " " << rep_->footer.ToString() << "\n"; + + // Output MetaIndex + out_stream << "Metaindex Details:\n" + "--------------------------------------\n"; + std::unique_ptr metaindex; + std::unique_ptr metaindex_iter; + // TODO: plumb Env::IOActivity + const ReadOptions ro; + Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex, + &metaindex_iter); + if (s.ok()) { + for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); + metaindex_iter->Next()) { + s = metaindex_iter->status(); + if (!s.ok()) { + return s; + } + if (metaindex_iter->key() == kPropertiesBlockName) { + out_stream << " Properties block handle: " + << metaindex_iter->value().ToString(true) << "\n"; + } else if (metaindex_iter->key() == kCompressionDictBlockName) { + out_stream << " Compression dictionary block handle: " + << metaindex_iter->value().ToString(true) << "\n"; + } else if (strstr(metaindex_iter->key().ToString().c_str(), + "filter.rocksdb.") != nullptr) { + out_stream << " Filter block handle: " + << metaindex_iter->value().ToString(true) << "\n"; + } else if (metaindex_iter->key() == 
kRangeDelBlockName) { + out_stream << " Range deletion block handle: " + << metaindex_iter->value().ToString(true) << "\n"; + } + } + out_stream << "\n"; + } else { + return s; + } + + // Output TableProperties + const ROCKSDB_NAMESPACE::TableProperties* table_properties; + table_properties = rep_->table_properties.get(); + + if (table_properties != nullptr) { + out_stream << "Table Properties:\n" + "--------------------------------------\n"; + out_stream << " " << table_properties->ToString("\n ", ": ") << "\n"; + } + + if (rep_->filter) { + out_stream << "Filter Details:\n" + "--------------------------------------\n"; + out_stream << " " << rep_->filter->ToString() << "\n"; + } + + // Output Index block + s = DumpIndexBlock(out_stream); + if (!s.ok()) { + return s; + } + + // Output compression dictionary + if (rep_->uncompression_dict_reader) { + CachableEntry uncompression_dict; + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, ro, false /* no_io */, + false, /* verify_checksums */ + nullptr /* get_context */, nullptr /* lookup_context */, + &uncompression_dict); + if (!s.ok()) { + return s; + } + + assert(uncompression_dict.GetValue()); + + const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict(); + out_stream << "Compression Dictionary:\n" + "--------------------------------------\n"; + out_stream << " size (bytes): " << raw_dict.size() << "\n\n"; + out_stream << " HEX " << raw_dict.ToString(true) << "\n\n"; + } + + // Output range deletions block + auto* range_del_iter = NewRangeTombstoneIterator(ro); + if (range_del_iter != nullptr) { + range_del_iter->SeekToFirst(); + if (range_del_iter->Valid()) { + out_stream << "Range deletions:\n" + "--------------------------------------\n"; + for (; range_del_iter->Valid(); range_del_iter->Next()) { + DumpKeyValue(range_del_iter->key(), range_del_iter->value(), + out_stream); + } + out_stream << "\n"; + } + delete range_del_iter; + } + // Output Data blocks + s = DumpDataBlocks(out_stream); + + if (!s.ok()) { + return s; + } + + if (!out_stream.good()) { + return Status::IOError("Failed to write to output file"); + } + return Status::OK(); +} + +Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { + out_stream << "Index Details:\n" + "--------------------------------------\n"; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + std::unique_ptr> blockhandles_iter( + NewIndexIterator(read_options, /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_stream << "Can not read Index Block \n\n"; + return s; + } + + out_stream << " Block key hex dump: Data block handle\n"; + out_stream << " Block key ascii\n\n"; + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + Slice key = blockhandles_iter->key(); + Slice user_key; + InternalKey ikey; + if (!rep_->index_key_includes_seq) { + user_key = key; + } else { + ikey.DecodeFrom(key); + user_key = ikey.user_key(); + } + + out_stream << " HEX " << user_key.ToString(true) << ": " + << blockhandles_iter->value().ToString(true, + rep_->index_has_first_key) + << " offset " << blockhandles_iter->value().handle.offset() + << " size " << blockhandles_iter->value().handle.size() << "\n"; + + std::string str_key = user_key.ToString(); + std::string res_key(""); + char cspace = ' 
'; + for (size_t i = 0; i < str_key.size(); i++) { + res_key.append(&str_key[i], 1); + res_key.append(1, cspace); + } + out_stream << " ASCII " << res_key << "\n"; + out_stream << " ------\n"; + } + out_stream << "\n"; + return Status::OK(); +} + +Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + std::unique_ptr> blockhandles_iter( + NewIndexIterator(read_options, /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_stream << "Can not read Index Block \n\n"; + return s; + } + + uint64_t datablock_size_min = std::numeric_limits::max(); + uint64_t datablock_size_max = 0; + uint64_t datablock_size_sum = 0; + + size_t block_id = 1; + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + block_id++, blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + + BlockHandle bh = blockhandles_iter->value().handle; + uint64_t datablock_size = bh.size(); + datablock_size_min = std::min(datablock_size_min, datablock_size); + datablock_size_max = std::max(datablock_size_max, datablock_size); + datablock_size_sum += datablock_size; + + out_stream << "Data Block # " << block_id << " @ " + << blockhandles_iter->value().handle.ToString(true) << "\n"; + out_stream << "--------------------------------------\n"; + + std::unique_ptr datablock_iter; + Status tmp_status; + datablock_iter.reset(NewDataBlockIterator( + read_options, blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*get_context=*/nullptr, /*lookup_context=*/nullptr, + /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, + /*async_read=*/false, tmp_status)); + s = datablock_iter->status(); + + if (!s.ok()) { + out_stream << "Error reading the block - Skipped \n\n"; + continue; + } + + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + out_stream << "Error reading the block - Skipped \n"; + break; + } + DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_stream); + } + out_stream << "\n"; + } + + uint64_t num_datablocks = block_id - 1; + if (num_datablocks) { + double datablock_size_avg = + static_cast(datablock_size_sum) / num_datablocks; + out_stream << "Data Block Summary:\n"; + out_stream << "--------------------------------------\n"; + out_stream << " # data blocks: " << num_datablocks << "\n"; + out_stream << " min data block size: " << datablock_size_min << "\n"; + out_stream << " max data block size: " << datablock_size_max << "\n"; + out_stream << " avg data block size: " + << std::to_string(datablock_size_avg) << "\n"; + } + + return Status::OK(); +} + +void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, + std::ostream& out_stream) { + InternalKey ikey; + ikey.DecodeFrom(key); + + out_stream << " HEX " << ikey.user_key().ToString(true) << ": " + << value.ToString(true) << "\n"; + + std::string str_key = ikey.user_key().ToString(); + std::string str_value = value.ToString(); + std::string res_key(""), res_value(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + if (str_key[i] == '\0') { + res_key.append("\\0", 2); + } else { + res_key.append(&str_key[i], 1); + } + res_key.append(1, cspace); + } + for (size_t i = 0; i < str_value.size(); i++) { + if (str_value[i] == '\0') { + 
      res_value.append("\\0", 2);
+    } else {
+      res_value.append(&str_value[i], 1);
+    }
+    res_value.append(1, cspace);
+  }
+
+  out_stream << "  ASCII  " << res_key << ": " << res_value << "\n";
+  out_stream << "  ------\n";
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader.h
new file mode 100644
index 0000000000..8cdda06425
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader.h
@@ -0,0 +1,745 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_key.h"
+#include "cache/cache_reservation_manager.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "file/filename.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_cache.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/uncompression_dict_reader.h"
+#include "table/format.h"
+#include "table/persistent_cache_options.h"
+#include "table/table_properties_internal.h"
+#include "table/table_reader.h"
+#include "table/two_level_iterator.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/coro_utils.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+class FilterBlockReader;
+class FullFilterBlockReader;
+class Footer;
+class InternalKeyComparator;
+class Iterator;
+class FSRandomAccessFile;
+class TableCache;
+class TableReader;
+class WritableFile;
+struct BlockBasedTableOptions;
+struct EnvOptions;
+struct ReadOptions;
+class GetContext;
+
+using KVPairBlock = std::vector<std::pair<std::string, std::string>>;
+
+// Reader class for BlockBasedTable format.
+// For the format of BlockBasedTable refer to
+// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
+// This is the default table type. Data is chunked into fixed size blocks and
+// each block in turn stores entries. When storing data, we can compress
+// and/or encode data efficiently within a block, which often results in a
+// much smaller data size compared with the raw data size. As for record
+// retrieval, we'll first locate the block where the target record may reside,
+// then read the block to memory, and finally search for the record within the
+// block. Of course, to avoid frequent reads of the same block, we introduced
+// the block cache to keep the loaded blocks in memory.
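+//
+// As an editorial illustration of the lookup flow described above (a sketch
+// only; `index_iter` and `key` below are hypothetical, not part of this
+// header):
+//
+//   // 1. Index lookup: find the handle of the data block that may hold key.
+//   index_iter->Seek(key);
+//   BlockHandle handle = index_iter->value().handle;
+//   // 2. Read that block, consulting the block cache before the file.
+//   // 3. Binary-search the block's restart points to find the record.
+//
+// The class below implements this flow, with caching, prefetching and filter
+// checks layered on top.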
+class BlockBasedTable : public TableReader {
+ public:
+  static const std::string kObsoleteFilterBlockPrefix;
+  static const std::string kFullFilterBlockPrefix;
+  static const std::string kPartitionedFilterBlockPrefix;
+
+  // 1-byte compression type + 32-bit checksum
+  static constexpr size_t kBlockTrailerSize = 5;
+
+  // Attempt to open the table that is stored in bytes [0..file_size)
+  // of "file", and read the metadata entries necessary to allow
+  // retrieving data from the table.
+  //
+  // If successful, returns ok and sets "*table_reader" to the newly opened
+  // table. The client should delete "*table_reader" when no longer needed.
+  // If there was an error while initializing the table, sets "*table_reader"
+  // to nullptr and returns a non-ok status.
+  //
+  // @param file must remain live while this Table is in use.
+  // @param prefetch_index_and_filter_in_cache can be used to disable
+  //    prefetching of index and filter blocks into block cache at startup
+  // @param skip_filters Disables loading/accessing the filter block. Overrides
+  //    prefetch_index_and_filter_in_cache, so filter will be skipped if both
+  //    are set.
+  // @param force_direct_prefetch if true, always prefetching to RocksDB
+  //    buffer, rather than calling RandomAccessFile::Prefetch().
+  static Status Open(
+      const ReadOptions& ro, const ImmutableOptions& ioptions,
+      const EnvOptions& env_options,
+      const BlockBasedTableOptions& table_options,
+      const InternalKeyComparator& internal_key_comparator,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      uint8_t block_protection_bytes_per_key,
+      std::unique_ptr<TableReader>* table_reader, uint64_t tail_size,
+      std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr =
+          nullptr,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false,
+      int level = -1, const bool immortal_table = false,
+      const SequenceNumber largest_seqno = 0,
+      bool force_direct_prefetch = false,
+      TailPrefetchStats* tail_prefetch_stats = nullptr,
+      BlockCacheTracer* const block_cache_tracer = nullptr,
+      size_t max_file_size_for_l0_meta_pin = 0,
+      const std::string& cur_db_session_id = "", uint64_t cur_file_num = 0,
+      UniqueId64x2 expected_unique_id = {});
+
+  bool PrefixRangeMayMatch(const Slice& internal_key,
+                           const ReadOptions& read_options,
+                           const SliceTransform* options_prefix_extractor,
+                           const bool need_upper_bound_check,
+                           BlockCacheLookupContext* lookup_context,
+                           bool* filter_checked) const;
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  // @param read_options Must outlive the returned iterator.
+  // @param skip_filters Disables loading/accessing the filter block
+  // compaction_readahead_size: its value will only be used if caller =
+  // kCompaction.
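+  //
+  // A minimal usage sketch (editorial; assumes `table` is an already-opened
+  // BlockBasedTable* and `start_ikey` a valid internal key):
+  //
+  //   ReadOptions ro;
+  //   std::unique_ptr<InternalIterator> it(table->NewIterator(
+  //       ro, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+  //       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  //   for (it->Seek(start_ikey); it->Valid(); it->Next()) {
+  //     // it->key()/it->value() yield internal-key encoded entries.
+  //   }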
+  InternalIterator* NewIterator(const ReadOptions&,
+                                const SliceTransform* prefix_extractor,
+                                Arena* arena, bool skip_filters,
+                                TableReaderCaller caller,
+                                size_t compaction_readahead_size = 0,
+                                bool allow_unprepared_value = false) override;
+
+  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+      const ReadOptions& read_options) override;
+
+  // @param skip_filters Disables loading/accessing the filter block
+  Status Get(const ReadOptions& readOptions, const Slice& key,
+             GetContext* get_context, const SliceTransform* prefix_extractor,
+             bool skip_filters = false) override;
+
+  Status MultiGetFilter(const ReadOptions& read_options,
+                        const SliceTransform* prefix_extractor,
+                        MultiGetRange* mget_range) override;
+
+  DECLARE_SYNC_AND_ASYNC_OVERRIDE(void, MultiGet,
+                                  const ReadOptions& readOptions,
+                                  const MultiGetContext::Range* mget_range,
+                                  const SliceTransform* prefix_extractor,
+                                  bool skip_filters = false);
+
+  // Pre-fetch the disk blocks that correspond to the key range specified by
+  // (kbegin, kend). The call will return error status in the event of
+  // IO or iteration error.
+  Status Prefetch(const ReadOptions& read_options, const Slice* begin,
+                  const Slice* end) override;
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file). The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  uint64_t ApproximateOffsetOf(const ReadOptions& read_options,
+                               const Slice& key,
+                               TableReaderCaller caller) override;
+
+  // Given start and end keys, return the approximate data size in the file
+  // between the keys. The returned value is in terms of file bytes, and so
+  // includes effects like compression of the underlying data.
+  // The start key must not be greater than the end key.
+  uint64_t ApproximateSize(const ReadOptions& read_options, const Slice& start,
+                           const Slice& end, TableReaderCaller caller) override;
+
+  Status ApproximateKeyAnchors(const ReadOptions& read_options,
+                               std::vector<Anchor>& anchors) override;
+
+  bool TEST_BlockInCache(const BlockHandle& handle) const;
+
+  // Returns true if the block for the specified key is in cache.
+  // REQUIRES: key is in this table && block cache enabled
+  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
+
+  // Set up the table for Compaction. Might change some parameters with
+  // posix_fadvise
+  void SetupForCompaction() override;
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override;
+
+  size_t ApproximateMemoryUsage() const override;
+
+  // convert SST file to a human readable form
+  Status DumpTable(WritableFile* out_file) override;
+
+  Status VerifyChecksum(const ReadOptions& readOptions,
+                        TableReaderCaller caller) override;
+
+  ~BlockBasedTable();
+
+  bool TEST_FilterBlockInCache() const;
+  bool TEST_IndexBlockInCache() const;
+
+  // IndexReader is the interface that provides the functionality for index
+  // access.
+  class IndexReader {
+   public:
+    virtual ~IndexReader() = default;
+
+    // Create an iterator for index access. If iter is null, then a new object
+    // is created on the heap, and the callee will have the ownership.
+    // If a non-null iter is passed in, it will be used, and the returned
+    // value is either the same as iter or a new on-heap object that wraps the
+    // passed iter.
+    // In the latter case the return value points to a different object than
+    // iter, and the callee has the ownership of the returned object.
+    virtual InternalIteratorBase<IndexValue>* NewIterator(
+        const ReadOptions& read_options, bool disable_prefix_seek,
+        IndexBlockIter* iter, GetContext* get_context,
+        BlockCacheLookupContext* lookup_context) = 0;
+
+    // Report an approximation of how much memory has been used other than
+    // memory that was allocated in block cache.
+    virtual size_t ApproximateMemoryUsage() const = 0;
+    // Cache the dependencies of the index reader (e.g. the partitions
+    // of a partitioned index).
+    virtual Status CacheDependencies(
+        const ReadOptions& /*ro*/, bool /* pin */,
+        FilePrefetchBuffer* /* tail_prefetch_buffer */) {
+      return Status::OK();
+    }
+  };
+
+  class IndexReaderCommon;
+
+  static void SetupBaseCacheKey(const TableProperties* properties,
+                                const std::string& cur_db_session_id,
+                                uint64_t cur_file_number,
+                                OffsetableCacheKey* out_base_cache_key,
+                                bool* out_is_stable = nullptr);
+
+  static CacheKey GetCacheKey(const OffsetableCacheKey& base_cache_key,
+                              const BlockHandle& handle);
+
+  static void UpdateCacheInsertionMetrics(BlockType block_type,
+                                          GetContext* get_context, size_t usage,
+                                          bool redundant,
+                                          Statistics* const statistics);
+
+  Statistics* GetStatistics() const;
+  bool IsLastLevel() const;
+
+  // Get the size to read from storage for a BlockHandle. size_t because we
+  // are about to load into memory.
+  static inline size_t BlockSizeWithTrailer(const BlockHandle& handle) {
+    return static_cast<size_t>(handle.size() + kBlockTrailerSize);
+  }
+
+  // It is the caller's responsibility to make sure that this is called with
+  // block-based table serialized block contents, which contains the
+  // compression byte in the trailer after `block_size`.
+  static inline CompressionType GetBlockCompressionType(const char* block_data,
+                                                        size_t block_size) {
+    return static_cast<CompressionType>(block_data[block_size]);
+  }
+  static inline CompressionType GetBlockCompressionType(
+      const BlockContents& contents) {
+    assert(contents.has_trailer);
+    return GetBlockCompressionType(contents.data.data(), contents.data.size());
+  }
+
+  // Retrieve all key value pairs from data blocks in the table.
+  // The keys retrieved are internal keys.
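+  //
+  // Editorial sketch of consuming the result (hypothetical caller code, not
+  // part of the upstream header):
+  //
+  //   std::vector<KVPairBlock> blocks;
+  //   Status s = table->GetKVPairsFromDataBlocks(ReadOptions(), &blocks);
+  //   if (s.ok()) {
+  //     for (const KVPairBlock& block : blocks) {
+  //       for (const auto& kv : block) {
+  //         // kv.first: internal key (user key + seqno + type);
+  //         // kv.second: raw value bytes.
+  //       }
+  //     }
+  //   }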
+ Status GetKVPairsFromDataBlocks(const ReadOptions& read_options, + std::vector* kv_pair_blocks); + + struct Rep; + + Rep* get_rep() { return rep_; } + const Rep* get_rep() const { return rep_; } + + // input_iter: if it is not null, update this one and return it as Iterator + template + TBlockIter* NewDataBlockIterator(const ReadOptions& ro, + const BlockHandle& block_handle, + TBlockIter* input_iter, BlockType block_type, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + FilePrefetchBuffer* prefetch_buffer, + bool for_compaction, bool async_read, + Status& s) const; + + // input_iter: if it is not null, update this one and return it as Iterator + template + TBlockIter* NewDataBlockIterator(const ReadOptions& ro, + CachableEntry& block, + TBlockIter* input_iter, Status s) const; + + class PartitionedIndexIteratorState; + + template + friend class FilterBlockReaderCommon; + + friend class PartitionIndexReader; + + friend class UncompressionDictReader; + + protected: + Rep* rep_; + explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer) + : rep_(rep), block_cache_tracer_(block_cache_tracer) {} + // No copying allowed + explicit BlockBasedTable(const TableReader&) = delete; + void operator=(const TableReader&) = delete; + + private: + friend class MockedBlockBasedTable; + friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test; + BlockCacheTracer* const block_cache_tracer_; + + void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, + size_t usage) const; + void UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const; + + // Either Block::NewDataIterator() or Block::NewIndexIterator(). + template + static TBlockIter* InitBlockIterator(const Rep* rep, Block* block, + BlockType block_type, + TBlockIter* input_iter, + bool block_contents_pinned); + + // If block cache enabled (compressed or uncompressed), looks for the block + // identified by handle in (1) uncompressed cache, (2) compressed cache, and + // then (3) file. If found, inserts into the cache(s) that were searched + // unsuccessfully (e.g., if found in file, will add to both uncompressed and + // compressed caches if they're enabled). + // + // @param block_entry value is set to the uncompressed block if found. If + // in uncompressed block cache, also sets cache_handle to reference that + // block. + template + WithBlocklikeCheck MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + bool for_compaction, CachableEntry* block_entry, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents, bool async_read) const; + + // Similar to the above, with one crucial difference: it will retrieve the + // block from the file even if there are no caches configured (assuming the + // read options allow I/O). 
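+  //
+  // Editorial sketch of the read-through pattern implemented by the two
+  // methods above (simplified pseudocode, not the actual control flow):
+  //
+  //   if (use_cache) {
+  //     LookupInBlockCache();            // MaybeReadBlockAndLoadToCache
+  //     if (hit) return;
+  //   }
+  //   ReadFromFileAndMaybeDecompress();  // then insert into the cache(s)
+  //                                      // so that later readers hit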
+ template + WithBlocklikeCheck RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, GetContext* get_context, + BlockCacheLookupContext* lookup_context, bool for_compaction, + bool use_cache, bool async_read) const; + + template + WithBlocklikeCheck SaveLookupContextOrTraceRecord( + const Slice& block_key, bool is_cache_hit, const ReadOptions& ro, + const TBlocklike* parsed_block_value, + BlockCacheLookupContext* lookup_context) const; + + void FinishTraceRecord(const BlockCacheLookupContext& lookup_context, + const Slice& block_key, const Slice& referenced_key, + bool does_referenced_key_exist, + uint64_t referenced_data_size) const; + + DECLARE_SYNC_AND_ASYNC_CONST( + void, RetrieveMultipleBlocks, const ReadOptions& options, + const MultiGetRange* batch, + const autovector* handles, + Status* statuses, CachableEntry* results, char* scratch, + const UncompressionDict& uncompression_dict); + + // Get the iterator from the index reader. + // + // If input_iter is not set, return a new Iterator. + // If input_iter is set, try to update it and return it as Iterator. + // However note that in some cases the returned iterator may be different + // from input_iter. In such case the returned iterator should be freed. + // + // Note: ErrorIterator with Status::Incomplete shall be returned if all the + // following conditions are met: + // 1. We enabled table_options.cache_index_and_filter_blocks. + // 2. index is not present in block cache. + // 3. We disallowed any io to be performed, that is, read_options == + // kBlockCacheTier + InternalIteratorBase* NewIndexIterator( + const ReadOptions& read_options, bool need_upper_bound_check, + IndexBlockIter* input_iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; + + template + Cache::Priority GetCachePriority() const; + + // Read block cache from block caches (if set): block_cache. + // On success, Status::OK with be returned and @block will be populated with + // pointer to the block as well as its block handle. + // @param uncompression_dict Data for presetting the compression library's + // dictionary. + template + WithBlocklikeCheck GetDataBlockFromCache( + const Slice& cache_key, BlockCacheInterface block_cache, + CachableEntry* block, GetContext* get_context) const; + + // Put a maybe compressed block to the corresponding block caches. + // This method will perform decompression against block_contents if needed + // and then populate the block caches. + // On success, Status::OK will be returned; also @block will be populated with + // uncompressed block and its cache handle. + // + // Allocated memory managed by block_contents will be transferred to + // PutDataBlockToCache(). After the call, the object will be invalid. + // @param uncompression_dict Data for presetting the compression library's + // dictionary. + template + WithBlocklikeCheck PutDataBlockToCache( + const Slice& cache_key, BlockCacheInterface block_cache, + CachableEntry* cached_block, BlockContents&& block_contents, + CompressionType block_comp_type, + const UncompressionDict& uncompression_dict, + MemoryAllocator* memory_allocator, GetContext* get_context) const; + + // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found + // after a call to Seek(key), until handle_result returns false. + // May not make such a call if filter policy says that key is not present. 
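+  //
+  // Editorial note on the Status::Incomplete contract described above
+  // (hypothetical caller code):
+  //
+  //   ReadOptions ro;
+  //   ro.read_tier = kBlockCacheTier;  // forbid disk I/O
+  //   Status s = table->Get(ro, ikey, &get_context, prefix_extractor);
+  //   if (s.IsIncomplete()) {
+  //     // a required index/filter/data block was not in the block cache
+  //   }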
+ friend class TableCache; + friend class BlockBasedTableBuilder; + + // Create a index reader based on the index type stored in the table. + // Optionally, user can pass a preloaded meta_index_iter for the index that + // need to access extra meta blocks for index construction. This parameter + // helps avoid re-reading meta index block if caller already created one. + Status CreateIndexReader(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* preloaded_meta_index_iter, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader); + + bool FullFilterKeyMayMatch(FilterBlockReader* filter, const Slice& user_key, + const bool no_io, + const SliceTransform* prefix_extractor, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) const; + + void FullFilterKeysMayMatch(FilterBlockReader* filter, MultiGetRange* range, + const bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) const; + + // If force_direct_prefetch is true, always prefetching to RocksDB + // buffer, rather than calling RandomAccessFile::Prefetch(). + static Status PrefetchTail( + const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, + bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, + const bool prefetch_all, const bool preload_all, + std::unique_ptr* prefetch_buffer, Statistics* stats, + uint64_t tail_size, Logger* const logger); + Status ReadMetaIndexBlock(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* metaindex_block, + std::unique_ptr* iter); + Status ReadPropertiesBlock(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const SequenceNumber largest_seqno); + Status ReadRangeDelBlock(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context); + Status PrefetchIndexAndFilterBlocks( + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, BlockBasedTable* new_table, + bool prefetch_all, const BlockBasedTableOptions& table_options, + const int level, size_t file_size, size_t max_file_size_for_l0_meta_pin, + BlockCacheLookupContext* lookup_context); + + static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); + + Status VerifyChecksumInMetaBlocks(const ReadOptions& read_options, + InternalIteratorBase* index_iter); + Status VerifyChecksumInBlocks(const ReadOptions& read_options, + InternalIteratorBase* index_iter); + + // Create the filter from the filter block. + std::unique_ptr CreateFilterBlockReader( + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); + + // Size of all data blocks, maybe approximate + uint64_t GetApproximateDataSize(); + + // Given an iterator return its offset in data block section of file. 
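+  // Editorial example of how this feeds the pro-rating in
+  // ApproximateOffsetOf() (illustrative numbers only): if the index places a
+  // key at data offset 40 MB, the data blocks span 80 MB, and the whole file
+  // is 100 MB, the reported estimate is 40/80 * 100 MB = 50 MB.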
+ uint64_t ApproximateDataOffsetOf( + const InternalIteratorBase& index_iter, + uint64_t data_size) const; + + // Helper functions for DumpTable() + Status DumpIndexBlock(std::ostream& out_stream); + Status DumpDataBlocks(std::ostream& out_stream); + void DumpKeyValue(const Slice& key, const Slice& value, + std::ostream& out_stream); + + // Returns false if prefix_extractor exists and is compatible with that used + // in building the table file, otherwise true. + bool PrefixExtractorChanged(const SliceTransform* prefix_extractor) const; + + bool TimestampMayMatch(const ReadOptions& read_options) const; + + // A cumulative data block file read in MultiGet lower than this size will + // use a stack buffer + static constexpr size_t kMultiGetReadStackBufSize = 8192; + + friend class PartitionedFilterBlockReader; + friend class PartitionedFilterBlockTest; + friend class DBBasicTest_MultiGetIOBufferOverrun_Test; +}; + +// Maintaining state of a two-level iteration on a partitioned index structure. +class BlockBasedTable::PartitionedIndexIteratorState + : public TwoLevelIteratorState { + public: + PartitionedIndexIteratorState( + const BlockBasedTable* table, + UnorderedMap>* block_map); + InternalIteratorBase* NewSecondaryIterator( + const BlockHandle& index_value) override; + + private: + // Don't own table_ + const BlockBasedTable* table_; + UnorderedMap>* block_map_; +}; + +// Stores all the properties associated with a BlockBasedTable. +// These are immutable. +struct BlockBasedTable::Rep { + Rep(const ImmutableOptions& _ioptions, const EnvOptions& _env_options, + const BlockBasedTableOptions& _table_opt, + const InternalKeyComparator& _internal_comparator, bool skip_filters, + uint64_t _file_size, int _level, const bool _immortal_table) + : ioptions(_ioptions), + env_options(_env_options), + table_options(_table_opt), + filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()), + internal_comparator(_internal_comparator), + filter_type(FilterType::kNoFilter), + index_type(BlockBasedTableOptions::IndexType::kBinarySearch), + whole_key_filtering(_table_opt.whole_key_filtering), + prefix_filtering(true), + global_seqno(kDisableGlobalSequenceNumber), + file_size(_file_size), + level(_level), + immortal_table(_immortal_table) {} + ~Rep() { status.PermitUncheckedError(); } + const ImmutableOptions& ioptions; + const EnvOptions& env_options; + const BlockBasedTableOptions table_options; + const FilterPolicy* const filter_policy; + const InternalKeyComparator& internal_comparator; + Status status; + std::unique_ptr file; + OffsetableCacheKey base_cache_key; + PersistentCacheOptions persistent_cache_options; + + // Footer contains the fixed table information + Footer footer; + + std::unique_ptr index_reader; + std::unique_ptr filter; + std::unique_ptr uncompression_dict_reader; + + enum class FilterType { + kNoFilter, + kFullFilter, + kPartitionedFilter, + }; + FilterType filter_type; + BlockHandle filter_handle; + BlockHandle compression_dict_handle; + + std::shared_ptr table_properties; + BlockBasedTableOptions::IndexType index_type; + bool whole_key_filtering; + bool prefix_filtering; + std::shared_ptr table_prefix_extractor; + + std::shared_ptr fragmented_range_dels; + + // FIXME + // If true, data blocks in this file are definitely ZSTD compressed. If false + // they might not be. When false we skip creating a ZSTD digested + // uncompression dictionary. Even if we get a false negative, things should + // still work, just not as quickly. 
+ BlockCreateContext create_context; + + // If global_seqno is used, all Keys in this file will have the same + // seqno with value `global_seqno`. + // + // A value of kDisableGlobalSequenceNumber means that this feature is disabled + // and every key have it's own seqno. + SequenceNumber global_seqno; + + // Size of the table file on disk + uint64_t file_size; + + // the level when the table is opened, could potentially change when trivial + // move is involved + int level; + + // the timestamp range of table + // Points into memory owned by TableProperties. This would need to change if + // TableProperties become subject to cache eviction. + Slice min_timestamp; + Slice max_timestamp; + + // If false, blocks in this file are definitely all uncompressed. Knowing this + // before reading individual blocks enables certain optimizations. + bool blocks_maybe_compressed = true; + + // These describe how index is encoded. + bool index_has_first_key = false; + bool index_key_includes_seq = true; + bool index_value_is_full = true; + + const bool immortal_table; + + std::unique_ptr + table_reader_cache_res_handle = nullptr; + + SequenceNumber get_global_seqno(BlockType block_type) const { + return (block_type == BlockType::kFilterPartitionIndex || + block_type == BlockType::kCompressionDictionary) + ? kDisableGlobalSequenceNumber + : global_seqno; + } + + uint64_t cf_id_for_tracing() const { + return table_properties + ? table_properties->column_family_id + : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context:: + kUnknownColumnFamily; + } + + Slice cf_name_for_tracing() const { + return table_properties ? table_properties->column_family_name + : BlockCacheTraceHelper::kUnknownColumnFamilyName; + } + + uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; } + + uint64_t sst_number_for_tracing() const { + return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX; + } + void CreateFilePrefetchBuffer( + size_t readahead_size, size_t max_readahead_size, + std::unique_ptr* fpb, bool implicit_auto_readahead, + uint64_t num_file_reads, + uint64_t num_file_reads_for_auto_readahead) const { + fpb->reset(new FilePrefetchBuffer( + readahead_size, max_readahead_size, + !ioptions.allow_mmap_reads /* enable */, false /* track_min_offset */, + implicit_auto_readahead, num_file_reads, + num_file_reads_for_auto_readahead, ioptions.fs.get(), ioptions.clock, + ioptions.stats)); + } + + void CreateFilePrefetchBufferIfNotExists( + size_t readahead_size, size_t max_readahead_size, + std::unique_ptr* fpb, bool implicit_auto_readahead, + uint64_t num_file_reads, + uint64_t num_file_reads_for_auto_readahead) const { + if (!(*fpb)) { + CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb, + implicit_auto_readahead, num_file_reads, + num_file_reads_for_auto_readahead); + } + } + + std::size_t ApproximateMemoryUsage() const { + std::size_t usage = 0; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } +}; + +// This is an adapter class for `WritableFile` to be used for `std::ostream`. +// The adapter wraps a `WritableFile`, which can be passed to a `std::ostream` +// constructor for storing streaming data. +// Note: +// * This adapter doesn't provide any buffering, each write is forwarded to +// `WritableFile->Append()` directly. 
+// * For a failed write, the user needs to check the status by `ostream.good()`
+class WritableFileStringStreamAdapter : public std::stringbuf {
+ public:
+  explicit WritableFileStringStreamAdapter(WritableFile* writable_file)
+      : file_(writable_file) {}
+
+  // Override overflow() to handle `sputc()`. There are cases that will not go
+  // through `xsputn()`, e.g. when `std::endl` or an unsigned long long is
+  // written by `os.put()` directly, which calls `sputc()`. By internal
+  // implementation:
+  //    int_type __CLR_OR_THIS_CALL sputc(_Elem _Ch) {  // put a character
+  //        return 0 < _Pnavail() ? _Traits::to_int_type(*_Pninc() = _Ch) :
+  //        overflow(_Traits::to_int_type(_Ch));
+  //    }
+  // As we explicitly disabled buffering (_Pnavail() is always 0), every write,
+  // not captured by xsputn(), becomes an overflow here.
+  int overflow(int ch = EOF) override {
+    if (ch != EOF) {
+      Status s = file_->Append(Slice((char*)&ch, 1));
+      if (s.ok()) {
+        return ch;
+      }
+    }
+    return EOF;
+  }
+
+  std::streamsize xsputn(char const* p, std::streamsize n) override {
+    Status s = file_->Append(Slice(p, n));
+    if (!s.ok()) {
+      return 0;
+    }
+    return n;
+  }
+
+ private:
+  WritableFile* file_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader_impl.h
new file mode 100644
index 0000000000..801b4614f0
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader_impl.h
@@ -0,0 +1,203 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include <type_traits>
+
+#include "block.h"
+#include "block_cache.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/reader_common.h"
+
+// This file contains some member functions of BlockBasedTable that
+// cannot be implemented in block_based_table_reader.cc because
+// they are called by other files (e.g. block_based_iterator.h) and
+// are templates.
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+using IterPlaceholderCacheInterface =
+    PlaceholderCacheInterface<CacheEntryRole::kMisc>;
+
+template <typename TBlockIter>
+struct IterTraits {};
+
+template <>
+struct IterTraits<DataBlockIter> {
+  using IterBlocklike = Block_kData;
+};
+
+template <>
+struct IterTraits<IndexBlockIter> {
+  using IterBlocklike = Block_kIndex;
+};
+
+}  // namespace
+
+// Convert an index iterator value (i.e., an encoded BlockHandle)
+// into an iterator over the contents of the corresponding block.
+// If input_iter is null, a new iterator is heap-allocated.
+// If input_iter is not null, this iterator is updated and returned.
+template <typename TBlockIter>
+TBlockIter* BlockBasedTable::NewDataBlockIterator(
+    const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
+    BlockType block_type, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    FilePrefetchBuffer* prefetch_buffer, bool for_compaction, bool async_read,
+    Status& s) const {
+  using IterBlocklike = typename IterTraits<TBlockIter>::IterBlocklike;
+  PERF_TIMER_GUARD(new_table_block_iter_nanos);
+
+  TBlockIter* iter = input_iter != nullptr ?
input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + CachableEntry block; + if (rep_->uncompression_dict_reader && block_type == BlockType::kData) { + CachableEntry uncompression_dict; + const bool no_io = (ro.read_tier == kBlockCacheTier); + // For async scans, don't use the prefetch buffer since an async prefetch + // might already be under way and this would invalidate it. Also, the + // uncompression dict is typically at the end of the file and would + // most likely break the sequentiality of the access pattern. + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + ro.async_io ? nullptr : prefetch_buffer, ro, no_io, ro.verify_checksums, + get_context, lookup_context, &uncompression_dict); + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + s = RetrieveBlock(prefetch_buffer, ro, handle, dict, + &block.As(), get_context, lookup_context, + for_compaction, + /* use_cache */ true, async_read); + } else { + s = RetrieveBlock( + prefetch_buffer, ro, handle, UncompressionDict::GetEmptyDict(), + &block.As(), get_context, lookup_context, for_compaction, + /* use_cache */ true, async_read); + } + + if (s.IsTryAgain() && async_read) { + return iter; + } + + if (!s.ok()) { + assert(block.IsEmpty()); + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. + const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator(rep_, block.GetValue(), block_type, iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache) { + IterPlaceholderCacheInterface block_cache{ + rep_->table_options.block_cache.get()}; + if (block_cache) { + // insert a dummy record to block cache to track the memory usage + Cache::Handle* cache_handle = nullptr; + CacheKey key = + CacheKey::CreateUniqueForCacheLifetime(block_cache.get()); + s = block_cache.Insert(key.AsSlice(), + block.GetValue()->ApproximateMemoryUsage(), + &cache_handle); + + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache.get(), + cache_handle); + } + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + + return iter; +} + +// Convert an uncompressed data block (i.e CachableEntry) +// into an iterator over the contents of the corresponding block. +// If input_iter is null, new a iterator +// If input_iter is not null, update this iter and return it +template +TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro, + CachableEntry& block, + TBlockIter* input_iter, + Status s) const { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + TBlockIter* iter = input_iter != nullptr ? 
input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. + const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator(rep_, block.GetValue(), BlockType::kData, + iter, block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache) { + IterPlaceholderCacheInterface block_cache{ + rep_->table_options.block_cache.get()}; + if (block_cache) { + // insert a dummy record to block cache to track the memory usage + Cache::Handle* cache_handle = nullptr; + CacheKey key = + CacheKey::CreateUniqueForCacheLifetime(block_cache.get()); + s = block_cache.Insert(key.AsSlice(), + block.GetValue()->ApproximateMemoryUsage(), + &cache_handle); + + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache.get(), + cache_handle); + } + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + return iter; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader_sync_and_async.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader_sync_and_async.h new file mode 100644 index 0000000000..0cb251afa5 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_based_table_reader_sync_and_async.h @@ -0,0 +1,758 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/async_file_reader.h" +#include "util/coro_utils.h" + +#if defined(WITHOUT_COROUTINES) || \ + (defined(USE_COROUTINES) && defined(WITH_COROUTINES)) + +namespace ROCKSDB_NAMESPACE { + +// This function reads multiple data blocks from disk using Env::MultiRead() +// and optionally inserts them into the block cache. It uses the scratch +// buffer provided by the caller, which is contiguous. If scratch is a nullptr +// it allocates a separate buffer for each block. Typically, if the blocks +// need to be uncompressed and there is no compressed block cache, callers +// can allocate a temporary scratch buffer in order to minimize memory +// allocations. +// If options.fill_cache is true, it inserts the blocks into cache. If its +// false and scratch is non-null and the blocks are uncompressed, it copies +// the buffers to heap. In any case, the CachableEntry returned will +// own the data bytes. +// If compression is enabled and also there is no compressed block cache, +// the adjacent blocks are read out in one IO (combined read) +// batch - A MultiGetRange with only those keys with unique data blocks not +// found in cache +// handles - A vector of block handles. 
Some of them may be NULL handles
+// scratch - An optional contiguous buffer to read compressed blocks into
+DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
+(const ReadOptions& options, const MultiGetRange* batch,
+ const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
+ Status* statuses, CachableEntry<Block>* results, char* scratch,
+ const UncompressionDict& uncompression_dict) const {
+  RandomAccessFileReader* file = rep_->file.get();
+  const Footer& footer = rep_->footer;
+  const ImmutableOptions& ioptions = rep_->ioptions;
+  size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit;
+  MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options);
+
+  if (ioptions.allow_mmap_reads) {
+    size_t idx_in_batch = 0;
+    for (auto mget_iter = batch->begin(); mget_iter != batch->end();
+         ++mget_iter, ++idx_in_batch) {
+      const BlockHandle& handle = (*handles)[idx_in_batch];
+      if (handle.IsNull()) {
+        continue;
+      }
+
+      // XXX: use_cache=true means double cache query?
+      statuses[idx_in_batch] =
+          RetrieveBlock(nullptr, options, handle, uncompression_dict,
+                        &results[idx_in_batch].As<Block>(),
+                        mget_iter->get_context, /* lookup_context */ nullptr,
+                        /* for_compaction */ false, /* use_cache */ true,
+                        /* async_read */ false);
+    }
+    assert(idx_in_batch == handles->size());
+    CO_RETURN;
+  }
+
+  // In direct IO mode, blocks share the direct io buffer.
+  // Otherwise, blocks share the scratch buffer.
+  const bool use_shared_buffer = file->use_direct_io() || scratch != nullptr;
+
+  autovector<FSReadRequest, MultiGetContext::MAX_BATCH_SIZE> read_reqs;
+  size_t buf_offset = 0;
+  size_t idx_in_batch = 0;
+
+  uint64_t prev_offset = 0;
+  size_t prev_len = 0;
+  autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_idx_for_block;
+  autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_offset_for_block;
+  for (auto mget_iter = batch->begin(); mget_iter != batch->end();
+       ++mget_iter, ++idx_in_batch) {
+    const BlockHandle& handle = (*handles)[idx_in_batch];
+    if (handle.IsNull()) {
+      continue;
+    }
+
+    size_t prev_end = static_cast<size_t>(prev_offset) + prev_len;
+
+    // If the current block is adjacent to the previous one, and at the same
+    // time compression is enabled and there is no compressed cache, we
+    // combine the two block reads into one.
+    // We don't combine block reads here in direct IO mode, because when doing
+    // direct IO read, the block requests will be realigned and merged when
+    // necessary.
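+    //
+    // Editorial illustration (hypothetical offsets): with handles at
+    // (offset=0, size=4096) and (offset=4101, size=4096) -- 4101 because the
+    // 5-byte block trailer follows the first block -- prev_end equals
+    // handle.offset(), so both blocks are folded into one FSReadRequest of
+    // length 2 * (4096 + 5) = 8202 bytes.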
+ if (use_shared_buffer && !file->use_direct_io() && + prev_end == handle.offset()) { + req_offset_for_block.emplace_back(prev_len); + prev_len += BlockSizeWithTrailer(handle); + } else { + // No compression or current block and previous one is not adjacent: + // Step 1, create a new request for previous blocks + if (prev_len != 0) { + FSReadRequest req; + req.offset = prev_offset; + req.len = prev_len; + if (file->use_direct_io()) { + req.scratch = nullptr; + } else if (use_shared_buffer) { + req.scratch = scratch + buf_offset; + buf_offset += req.len; + } else { + req.scratch = new char[req.len]; + } + read_reqs.emplace_back(req); + } + + // Step 2, remeber the previous block info + prev_offset = handle.offset(); + prev_len = BlockSizeWithTrailer(handle); + req_offset_for_block.emplace_back(0); + } + req_idx_for_block.emplace_back(read_reqs.size()); + + PERF_COUNTER_ADD(block_read_count, 1); + PERF_COUNTER_ADD(block_read_byte, BlockSizeWithTrailer(handle)); + } + // Handle the last block and process the pending last request + if (prev_len != 0) { + FSReadRequest req; + req.offset = prev_offset; + req.len = prev_len; + if (file->use_direct_io()) { + req.scratch = nullptr; + } else if (use_shared_buffer) { + req.scratch = scratch + buf_offset; + } else { + req.scratch = new char[req.len]; + } + read_reqs.emplace_back(req); + } + + AlignedBuf direct_io_buf; + { + IOOptions opts; + IOStatus s = file->PrepareIOOptions(options, opts); + if (s.ok()) { +#if defined(WITH_COROUTINES) + if (file->use_direct_io()) { +#endif // WITH_COROUTINES + s = file->MultiRead(opts, &read_reqs[0], read_reqs.size(), + &direct_io_buf, options.rate_limiter_priority); +#if defined(WITH_COROUTINES) + } else { + co_await batch->context()->reader().MultiReadAsync( + file, opts, &read_reqs[0], read_reqs.size(), &direct_io_buf); + } +#endif // WITH_COROUTINES + } + if (!s.ok()) { + // Discard all the results in this batch if there is any time out + // or overall MultiRead error + for (FSReadRequest& req : read_reqs) { + req.status = s; + } + } + } + + idx_in_batch = 0; + size_t valid_batch_idx = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + + if (handle.IsNull()) { + continue; + } + + assert(valid_batch_idx < req_idx_for_block.size()); + assert(valid_batch_idx < req_offset_for_block.size()); + assert(req_idx_for_block[valid_batch_idx] < read_reqs.size()); + size_t& req_idx = req_idx_for_block[valid_batch_idx]; + size_t& req_offset = req_offset_for_block[valid_batch_idx]; + valid_batch_idx++; + FSReadRequest& req = read_reqs[req_idx]; + Status s = req.status; + if (s.ok()) { + if ((req.result.size() != req.len) || + (req_offset + BlockSizeWithTrailer(handle) > req.result.size())) { + s = Status::Corruption("truncated block read from " + + rep_->file->file_name() + " offset " + + std::to_string(handle.offset()) + ", expected " + + std::to_string(req.len) + " bytes, got " + + std::to_string(req.result.size())); + } + } + + BlockContents serialized_block; + if (s.ok()) { + if (!use_shared_buffer) { + // We allocated a buffer for this block. 
Give ownership of it to + // BlockContents so it can free the memory + assert(req.result.data() == req.scratch); + assert(req.result.size() == BlockSizeWithTrailer(handle)); + assert(req_offset == 0); + serialized_block = + BlockContents(std::unique_ptr(req.scratch), handle.size()); + } else { + // We used the scratch buffer or direct io buffer + // which are shared by the blocks. + // serialized_block does not have the ownership. + serialized_block = + BlockContents(Slice(req.result.data() + req_offset, handle.size())); + } +#ifndef NDEBUG + serialized_block.has_trailer = true; +#endif + + if (options.verify_checksums) { + PERF_TIMER_GUARD(block_checksum_time); + const char* data = req.result.data(); + // Since the scratch might be shared, the offset of the data block in + // the buffer might not be 0. req.result.data() only point to the + // begin address of each read request, we need to add the offset + // in each read request. Checksum is stored in the block trailer, + // beyond the payload size. + s = VerifyBlockChecksum(footer.checksum_type(), data + req_offset, + handle.size(), rep_->file->file_name(), + handle.offset()); + TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s); + } + } else if (!use_shared_buffer) { + // Free the allocated scratch buffer. + delete[] req.scratch; + } + + if (s.ok()) { + // When the blocks share the same underlying buffer (scratch or direct io + // buffer), we may need to manually copy the block into heap if the + // serialized block has to be inserted into a cache. That falls into the + // following cases - + // 1. serialized block is not compressed, it needs to be inserted into + // the uncompressed block cache if there is one + // 2. If the serialized block is compressed, it needs to be inserted + // into the compressed block cache if there is one + // + // In all other cases, the serialized block is either uncompressed into a + // heap buffer or there is no cache at all. + CompressionType compression_type = + GetBlockCompressionType(serialized_block); + if (use_shared_buffer && compression_type == kNoCompression) { + Slice serialized = + Slice(req.result.data() + req_offset, BlockSizeWithTrailer(handle)); + serialized_block = BlockContents( + CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), + serialized), + handle.size()); +#ifndef NDEBUG + serialized_block.has_trailer = true; +#endif + } + } + + if (s.ok()) { + if (options.fill_cache) { + CachableEntry* block_entry = &results[idx_in_batch]; + // MaybeReadBlockAndLoadToCache will insert into the block caches if + // necessary. Since we're passing the serialized block contents, it + // will avoid looking up the block cache + s = MaybeReadBlockAndLoadToCache( + nullptr, options, handle, uncompression_dict, + /*for_compaction=*/false, block_entry, mget_iter->get_context, + /*lookup_context=*/nullptr, &serialized_block, + /*async_read=*/false); + + // block_entry value could be null if no block cache is present, i.e + // BlockBasedTableOptions::no_block_cache is true and no compressed + // block cache is configured. 
+ // the block explicitly
+ if (block_entry->GetValue() != nullptr) {
+ s.PermitUncheckedError();
+ continue;
+ }
+ }
+
+ CompressionType compression_type =
+ GetBlockCompressionType(serialized_block);
+ BlockContents contents;
+ if (compression_type != kNoCompression) {
+ UncompressionContext context(compression_type);
+ UncompressionInfo info(context, uncompression_dict, compression_type);
+ s = UncompressSerializedBlock(
+ info, req.result.data() + req_offset, handle.size(), &contents,
+ footer.format_version(), rep_->ioptions, memory_allocator);
+ } else {
+ // There are two cases here:
+ // 1) caller uses the shared buffer (scratch or direct io buffer);
+ // 2) we use the request buffer.
+ // If scratch buffer or direct io buffer is used, we ensure that
+ // all serialized blocks are copied to the heap as single blocks. If
+ // scratch buffer is not used, we also have no combined read, so the
+ // serialized block can be used directly.
+ contents = std::move(serialized_block);
+ }
+ if (s.ok()) {
+ results[idx_in_batch].SetOwnedValue(std::make_unique<Block_kData>(
+ std::move(contents), read_amp_bytes_per_bit, ioptions.stats));
+ }
+ }
+ statuses[idx_in_batch] = s;
+ }
+}
+
+using MultiGetRange = MultiGetContext::Range;
+DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
+(const ReadOptions& read_options, const MultiGetRange* mget_range,
+ const SliceTransform* prefix_extractor, bool skip_filters) {
+ if (mget_range->empty()) {
+ // Caller should ensure non-empty (performance bug)
+ assert(false);
+ CO_RETURN; // Nothing to do
+ }
+
+ FilterBlockReader* const filter =
+ !skip_filters ? rep_->filter.get() : nullptr;
+ MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
+ mget_range->end());
+
+ // First check the full filter.
+ // If the full filter is not useful, then go into each block.
+ const bool no_io = read_options.read_tier == kBlockCacheTier;
+ uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
+ if (sst_file_range.begin()->get_context) {
+ tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id();
+ }
+ // TODO: need more than one lookup_context here to track individual filter
+ // and index partition hits and misses.
+ BlockCacheLookupContext metadata_lookup_context{
+ TableReaderCaller::kUserMultiGet, tracing_mget_id,
+ /*_get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
+ FullFilterKeysMayMatch(filter, &sst_file_range, no_io, prefix_extractor,
+ &metadata_lookup_context, read_options);
+
+ if (!sst_file_range.empty()) {
+ IndexBlockIter iiter_on_stack;
+ // if prefix_extractor found in block differs from options, disable
+ // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
+ bool need_upper_bound_check = false;
+ if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
+ need_upper_bound_check = PrefixExtractorChanged(prefix_extractor);
+ }
+ auto iiter = NewIndexIterator(
+ read_options, need_upper_bound_check, &iiter_on_stack,
+ sst_file_range.begin()->get_context, &metadata_lookup_context);
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+ if (iiter != &iiter_on_stack) {
+ iiter_unique_ptr.reset(iiter);
+ }
+
+ uint64_t prev_offset = std::numeric_limits<uint64_t>::max();
+ autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE> block_handles;
+ std::array<CachableEntry<Block_kData>, MultiGetContext::MAX_BATCH_SIZE>
+ results;
+ std::array<Status, MultiGetContext::MAX_BATCH_SIZE> statuses;
+ // Empty data_lookup_contexts means "unused," when block cache tracing is
+ // disabled. (Limited options as element type is not default constructible.)
+ std::vector<BlockCacheLookupContext> data_lookup_contexts;
+ MultiGetContext::Mask reused_mask = 0;
+ char stack_buf[kMultiGetReadStackBufSize];
+ std::unique_ptr<char[]> block_buf;
+ if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
+ // Awkward because BlockCacheLookupContext is not CopyAssignable
+ data_lookup_contexts.reserve(MultiGetContext::MAX_BATCH_SIZE);
+ for (size_t i = 0; i < MultiGetContext::MAX_BATCH_SIZE; ++i) {
+ data_lookup_contexts.push_back(metadata_lookup_context);
+ }
+ }
+ {
+ MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(),
+ sst_file_range.end());
+ CachableEntry<UncompressionDict> uncompression_dict;
+ Status uncompression_dict_status;
+ uncompression_dict_status.PermitUncheckedError();
+ bool uncompression_dict_inited = false;
+ size_t total_len = 0;
+
+ // GetContext for any key will do, as the stats will be aggregated
+ // anyway
+ GetContext* get_context = sst_file_range.begin()->get_context;
+
+ {
+ using BCI = BlockCacheInterface<Block_kData>;
+ BCI block_cache{rep_->table_options.block_cache.get()};
+ std::array<BCI::TypedAsyncLookupHandle, MultiGetContext::MAX_BATCH_SIZE>
+ async_handles;
+ std::array<CacheKey, MultiGetContext::MAX_BATCH_SIZE> cache_keys;
+ size_t cache_lookup_count = 0;
+
+ for (auto miter = data_block_range.begin();
+ miter != data_block_range.end(); ++miter) {
+ const Slice& key = miter->ikey;
+ iiter->Seek(miter->ikey);
+
+ IndexValue v;
+ if (iiter->Valid()) {
+ v = iiter->value();
+ }
+ if (!iiter->Valid() ||
+ (!v.first_internal_key.empty() && !skip_filters &&
+ UserComparatorWrapper(
+ rep_->internal_comparator.user_comparator())
+ .CompareWithoutTimestamp(
+ ExtractUserKey(key),
+ ExtractUserKey(v.first_internal_key)) < 0)) {
+ // The requested key falls between highest key in previous block and
+ // lowest key in current block.
+ if (!iiter->status().IsNotFound()) {
+ *(miter->s) = iiter->status();
+ }
+ data_block_range.SkipKey(miter);
+ sst_file_range.SkipKey(miter);
+ continue;
+ }
+
+ if (!uncompression_dict_inited && rep_->uncompression_dict_reader) {
+ uncompression_dict_status =
+ rep_->uncompression_dict_reader
+ ->GetOrReadUncompressionDictionary(
+ nullptr /* prefetch_buffer */, read_options, no_io,
+ read_options.verify_checksums, get_context,
+ &metadata_lookup_context, &uncompression_dict);
+ uncompression_dict_inited = true;
+ }
+
+ if (!uncompression_dict_status.ok()) {
+ assert(!uncompression_dict_status.IsNotFound());
+ *(miter->s) = uncompression_dict_status;
+ data_block_range.SkipKey(miter);
+ sst_file_range.SkipKey(miter);
+ continue;
+ }
+
+ if (v.handle.offset() == prev_offset) {
+ // This key can reuse the previous block (later on).
+ // Mark previous as "reused"
+ reused_mask |= MultiGetContext::Mask{1}
+ << (block_handles.size() - 1);
+ // Use null handle to indicate this one reuses same block as
+ // previous.
+ block_handles.emplace_back(BlockHandle::NullBlockHandle());
+ continue;
+ }
+ prev_offset = v.handle.offset();
+ block_handles.emplace_back(v.handle);
+
+ if (block_cache) {
+ // Lookup the cache for the given data block referenced by an index
+ // iterator value (i.e. BlockHandle). If it exists in the cache,
+ // initialize block to the contents of the data block.
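+ // Illustrative sketch (not part of the upstream file) of the batched
+ // lookup pattern used just below, assuming `block_cache` is a populated
+ // BCI and `n` handles have already had their keys filled in:
+ //
+ //   std::array<BCI::TypedAsyncLookupHandle, kN> handles;
+ //   for (size_t j = 0; j < n; ++j) {
+ //     block_cache.StartAsyncLookupFull(
+ //         handles[j], CacheTier::kNonVolatileBlockTier);  // queue lookup
+ //   }
+ //   block_cache.get()->WaitAll(&handles[0], n);           // resolve all
+ //   for (size_t j = 0; j < n; ++j) {
+ //     BCI::TypedHandle* h = handles[j].Result();          // null on miss
+ //   }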
+
+ // An async version of MaybeReadBlockAndLoadToCache /
+ // GetDataBlockFromCache
+ BCI::TypedAsyncLookupHandle& async_handle =
+ async_handles[cache_lookup_count];
+ cache_keys[cache_lookup_count] =
+ GetCacheKey(rep_->base_cache_key, v.handle);
+ async_handle.key = cache_keys[cache_lookup_count].AsSlice();
+ // NB: StartAsyncLookupFull populates async_handle.helper
+ async_handle.create_context = &rep_->create_context;
+ async_handle.priority = GetCachePriority<Block_kData>();
+ async_handle.stats = rep_->ioptions.statistics.get();
+
+ block_cache.StartAsyncLookupFull(
+ async_handle, rep_->ioptions.lowest_used_cache_tier);
+ ++cache_lookup_count;
+ // TODO: stats?
+ }
+ }
+
+ if (block_cache) {
+ block_cache.get()->WaitAll(&async_handles[0], cache_lookup_count);
+ }
+ size_t lookup_idx = 0;
+ for (size_t i = 0; i < block_handles.size(); ++i) {
+ // If this block was a success or failure or not needed because
+ // the corresponding key is in the same block as a prior key, skip
+ if (block_handles[i] == BlockHandle::NullBlockHandle()) {
+ continue;
+ }
+ if (!block_cache) {
+ total_len += BlockSizeWithTrailer(block_handles[i]);
+ } else {
+ BCI::TypedHandle* h = async_handles[lookup_idx].Result();
+ if (h) {
+ // Cache hit
+ results[i].SetCachedValue(block_cache.Value(h), block_cache.get(),
+ h);
+ // Don't need to fetch
+ block_handles[i] = BlockHandle::NullBlockHandle();
+ UpdateCacheHitMetrics(BlockType::kData, get_context,
+ block_cache.get()->GetUsage(h));
+ } else {
+ // Cache miss
+ total_len += BlockSizeWithTrailer(block_handles[i]);
+ UpdateCacheMissMetrics(BlockType::kData, get_context);
+ }
+ if (!data_lookup_contexts.empty()) {
+ // Populate cache key before it's discarded
+ data_lookup_contexts[i].block_key =
+ async_handles[lookup_idx].key.ToString();
+ }
+ ++lookup_idx;
+ }
+ }
+ assert(lookup_idx == cache_lookup_count);
+ }
+
+ if (total_len) {
+ char* scratch = nullptr;
+ const UncompressionDict& dict = uncompression_dict.GetValue()
+ ? *uncompression_dict.GetValue()
+ : UncompressionDict::GetEmptyDict();
+ assert(uncompression_dict_inited || !rep_->uncompression_dict_reader);
+ assert(uncompression_dict_status.ok());
+ // If using direct IO, then scratch is not used, so keep it nullptr.
+ // If the blocks need to be uncompressed and we don't need the
+ // compressed blocks, then we can use a contiguous block of
+ // memory to read in all the blocks as it will be temporary
+ // storage
+ // 1. If blocks are compressed and compressed block cache is there,
+ // alloc heap bufs
+ // 2. If blocks are uncompressed, alloc heap bufs
+ // 3. If blocks are compressed and no compressed block cache, use
+ // stack buf
+ if (!rep_->file->use_direct_io() &&
+ rep_->blocks_maybe_compressed) {
+ if (total_len <= kMultiGetReadStackBufSize) {
+ scratch = stack_buf;
+ } else {
+ scratch = new char[total_len];
+ block_buf.reset(scratch);
+ }
+ }
+ CO_AWAIT(RetrieveMultipleBlocks)
+ (read_options, &data_block_range, &block_handles, &statuses[0],
+ &results[0], scratch, dict);
+ if (get_context) {
+ ++(get_context->get_context_stats_.num_sst_read);
+ }
+ }
+ }
+
+ DataBlockIter first_biter;
+ DataBlockIter next_biter;
+ size_t idx_in_batch = 0;
+ SharedCleanablePtr shared_cleanable;
+ for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
+ ++miter) {
+ Status s;
+ GetContext* get_context = miter->get_context;
+ const Slice& key = miter->ikey;
+ bool matched = false; // if such user key matched a key in SST
+ bool done = false;
+ bool first_block = true;
+ do {
+ DataBlockIter* biter = nullptr;
+ uint64_t referenced_data_size = 0;
+ Block_kData* parsed_block_value = nullptr;
+ bool reusing_prev_block;
+ bool later_reused;
+ bool does_referenced_key_exist = false;
+ bool handle_present = false;
+ BlockCacheLookupContext* lookup_data_block_context =
+ data_lookup_contexts.empty() ? nullptr
+ : &data_lookup_contexts[idx_in_batch];
+ if (first_block) {
+ handle_present = !block_handles[idx_in_batch].IsNull();
+ parsed_block_value = results[idx_in_batch].GetValue();
+ if (handle_present || parsed_block_value) {
+ first_biter.Invalidate(Status::OK());
+ NewDataBlockIterator<DataBlockIter>(
+ read_options, results[idx_in_batch].As<Block>(), &first_biter,
+ statuses[idx_in_batch]);
+ reusing_prev_block = false;
+ } else {
+ // If handle is null and result is empty, then the status is never
+ // set, which should be the initial value: ok().
+ assert(statuses[idx_in_batch].ok());
+ reusing_prev_block = true;
+ }
+ biter = &first_biter;
+ later_reused =
+ (reused_mask & (MultiGetContext::Mask{1} << idx_in_batch)) != 0;
+ idx_in_batch++;
+ } else {
+ IndexValue v = iiter->value();
+ if (!v.first_internal_key.empty() && !skip_filters &&
+ UserComparatorWrapper(rep_->internal_comparator.user_comparator())
+ .CompareWithoutTimestamp(
+ ExtractUserKey(key),
+ ExtractUserKey(v.first_internal_key)) < 0) {
+ // The requested key falls between highest key in previous block and
+ // lowest key in current block.
+ break;
+ }
+
+ next_biter.Invalidate(Status::OK());
+ Status tmp_s;
+ NewDataBlockIterator<DataBlockIter>(
+ read_options, iiter->value().handle, &next_biter,
+ BlockType::kData, get_context, lookup_data_block_context,
+ /* prefetch_buffer= */ nullptr, /* for_compaction = */ false,
+ /*async_read = */ false, tmp_s);
+ biter = &next_biter;
+ reusing_prev_block = false;
+ later_reused = false;
+ }
+
+ if (read_options.read_tier == kBlockCacheTier &&
+ biter->status().IsIncomplete()) {
+ // couldn't get block from block_cache
+ // Update Saver.state to Found because we are only looking for
+ // whether we can guarantee the key is not there when "no_io" is set
+ get_context->MarkKeyMayExist();
+ break;
+ }
+ if (!biter->status().ok()) {
+ s = biter->status();
+ break;
+ }
+
+ // Reusing blocks complicates pinning/Cleanable, because the cache
+ // entry referenced by biter can only be released once all returned
+ // pinned values are released. This code previously did an extra
+ // block_cache Ref for each reuse, but that unnecessarily increases
+ // block cache contention. Instead we can use a variant of shared_ptr
+ // to release in block cache only once.
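+ //
+ // As a rough sketch, the delegation performed below boils down to
+ // (names as in this function; no new logic):
+ //
+ //   shared_cleanable.Allocate();                    // shared record
+ //   biter->DelegateCleanupsTo(&*shared_cleanable);  // move cleanups in
+ //   shared_cleanable.RegisterCopyWith(biter);       // one shared ref
+ //
+ // Each later reuse registers another reference, and the underlying
+ // cache handle is released exactly once, when the last reference is
+ // cleaned up.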
+ //
+ // Although the biter loop below might SaveValue multiple times for
+ // merges, just one value_pinner suffices, as MultiGet will merge
+ // the operands before returning to the API user.
+ Cleanable* value_pinner;
+ if (biter->IsValuePinned()) {
+ if (reusing_prev_block) {
+ // Note that we don't yet know if the MultiGet results will need
+ // to pin this block, so we might wrap a block for sharing and
+ // still end up with 1 (or 0) pinning ref. Not ideal but OK.
+ //
+ // Here we avoid adding redundant cleanups if we didn't end up
+ // delegating the cleanup from last time around.
+ if (!biter->HasCleanups()) {
+ assert(shared_cleanable.get());
+ if (later_reused) {
+ shared_cleanable.RegisterCopyWith(biter);
+ } else {
+ shared_cleanable.MoveAsCleanupTo(biter);
+ }
+ }
+ } else if (later_reused) {
+ assert(biter->HasCleanups());
+ // Make the existing cleanups on `biter` sharable:
+ shared_cleanable.Allocate();
+ // Move existing `biter` cleanup(s) to `shared_cleanable`
+ biter->DelegateCleanupsTo(&*shared_cleanable);
+ // Reference `shared_cleanable` as new cleanup for `biter`
+ shared_cleanable.RegisterCopyWith(biter);
+ }
+ assert(biter->HasCleanups());
+ value_pinner = biter;
+ } else {
+ value_pinner = nullptr;
+ }
+
+ bool may_exist = biter->SeekForGet(key);
+ if (!may_exist) {
+ // HashSeek cannot find the key in this block and the iter is not at
+ // the end of the block, i.e. the key cannot be in the following
+ // blocks either. In this case, the seek_key cannot be found, so we
+ // break from the top level for-loop.
+ break;
+ }
+
+ // Call the *saver function on each entry/block until it returns false
+ for (; biter->Valid(); biter->Next()) {
+ ParsedInternalKey parsed_key;
+ Status pik_status = ParseInternalKey(
+ biter->key(), &parsed_key, false /* log_err_key */); // TODO
+ if (!pik_status.ok()) {
+ s = pik_status;
+ }
+ if (!get_context->SaveValue(parsed_key, biter->value(), &matched,
+ value_pinner)) {
+ if (get_context->State() == GetContext::GetState::kFound) {
+ does_referenced_key_exist = true;
+ referenced_data_size =
+ biter->key().size() + biter->value().size();
+ }
+ done = true;
+ break;
+ }
+ s = biter->status();
+ }
+ // Write the block cache access.
+ // XXX: There appear to be 'break' statements above that bypass this + // writing of the block cache trace record + if (lookup_data_block_context && !reusing_prev_block && first_block) { + Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter->key(); + } else { + referenced_key = key; + } + + // block_key is self-assigned here (previously assigned from + // cache_keys / async_handles, now out of scope) + SaveLookupContextOrTraceRecord(lookup_data_block_context->block_key, + /*is_cache_hit=*/!handle_present, + read_options, parsed_block_value, + lookup_data_block_context); + FinishTraceRecord( + *lookup_data_block_context, lookup_data_block_context->block_key, + referenced_key, does_referenced_key_exist, referenced_data_size); + } + s = biter->status(); + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + if (first_block) { + iiter->Seek(key); + if (!iiter->Valid()) { + break; + } + } + first_block = false; + iiter->Next(); + } while (iiter->Valid()); + + if (matched && filter != nullptr) { + if (rep_->whole_key_filtering) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); + } else { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_TRUE_POSITIVE); + } + // Includes prefix stats + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + if (s.ok() && !iiter->status().IsNotFound()) { + s = iiter->status(); + } + *(miter->s) = s; + } +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + // Not sure why we need to do it. Should investigate more. + for (auto& st : statuses) { + st.PermitUncheckedError(); + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } +} +} // namespace ROCKSDB_NAMESPACE +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_builder.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_builder.cc new file mode 100644 index 0000000000..92702b17d0 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_builder.cc @@ -0,0 +1,234 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// BlockBuilder generates blocks where keys are prefix-compressed: +// +// When we store a key, we drop the prefix shared with the previous +// string. This helps reduce the space requirement significantly. +// Furthermore, once every K keys, we do not apply the prefix +// compression and store the entire key. We call this a "restart +// point". The tail end of the block stores the offsets of all of the +// restart points, and can be used to do a binary search when looking +// for a particular key. Values are stored as-is (without compression) +// immediately following the corresponding key. +// +// An entry for a particular key-value pair has the form: +// shared_bytes: varint32 +// unshared_bytes: varint32 +// value_length: varint32 +// key_delta: char[unshared_bytes] +// value: char[value_length] +// shared_bytes == 0 for restart points. 
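+//
+// For example (illustrative keys, not part of the upstream format spec),
+// adding "apple" -> "v1" and then "applet" -> "v2" with delta encoding
+// yields:
+//   entry 1 (restart): shared_bytes=0 unshared_bytes=5 value_length=2
+//                      key_delta="apple" value="v1"
+//   entry 2:           shared_bytes=5 unshared_bytes=1 value_length=2
+//                      key_delta="t" value="v2"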
+//
+// The trailer of the block has the form:
+// restarts: uint32[num_restarts]
+// num_restarts: uint32
+// restarts[i] contains the offset within the block of the ith restart point.
+
+#include "table/block_based/block_builder.h"
+
+#include <assert.h>
+
+#include <algorithm>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "table/block_based/data_block_footer.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlockBuilder::BlockBuilder(
+ int block_restart_interval, bool use_delta_encoding,
+ bool use_value_delta_encoding,
+ BlockBasedTableOptions::DataBlockIndexType index_type,
+ double data_block_hash_table_util_ratio)
+ : block_restart_interval_(block_restart_interval),
+ use_delta_encoding_(use_delta_encoding),
+ use_value_delta_encoding_(use_value_delta_encoding),
+ restarts_(1, 0), // First restart point is at offset 0
+ counter_(0),
+ finished_(false) {
+ switch (index_type) {
+ case BlockBasedTableOptions::kDataBlockBinarySearch:
+ break;
+ case BlockBasedTableOptions::kDataBlockBinaryAndHash:
+ data_block_hash_index_builder_.Initialize(
+ data_block_hash_table_util_ratio);
+ break;
+ default:
+ assert(0);
+ }
+ assert(block_restart_interval_ >= 1);
+ estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
+}
+
+void BlockBuilder::Reset() {
+ buffer_.clear();
+ restarts_.resize(1); // First restart point is at offset 0
+ assert(restarts_[0] == 0);
+ estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
+ counter_ = 0;
+ finished_ = false;
+ last_key_.clear();
+ if (data_block_hash_index_builder_.Valid()) {
+ data_block_hash_index_builder_.Reset();
+ }
+#ifndef NDEBUG
+ add_with_last_key_called_ = false;
+#endif
+}
+
+void BlockBuilder::SwapAndReset(std::string& buffer) {
+ std::swap(buffer_, buffer);
+ Reset();
+}
+
+size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key,
+ const Slice& value) const {
+ size_t estimate = CurrentSizeEstimate();
+ // Note: this is an imprecise estimate as it accounts for the whole key size
+ // instead of non-shared key size.
+ estimate += key.size();
+ // In value delta encoding we estimate the value delta size as half the full
+ // value size since only the size field of block handle is encoded.
+ estimate +=
+ !use_value_delta_encoding_ || (counter_ >= block_restart_interval_)
+ ? value.size()
+ : value.size() / 2;
+
+ if (counter_ >= block_restart_interval_) {
+ estimate += sizeof(uint32_t); // a new restart entry.
+ }
+
+ estimate += sizeof(int32_t); // varint for shared prefix length.
+ // Note: this is an imprecise estimate as we will have to encode two sizes,
+ // one for the shared key and one for the non-shared key.
+ estimate += VarintLength(key.size()); // varint for key length.
+ if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) {
+ estimate += VarintLength(value.size()); // varint for value length.
+ }
+
+ return estimate;
+}
+
+Slice BlockBuilder::Finish() {
+ // Append restart array
+ for (size_t i = 0; i < restarts_.size(); i++) {
+ PutFixed32(&buffer_, restarts_[i]);
+ }
+
+ uint32_t num_restarts = static_cast<uint32_t>(restarts_.size());
+ BlockBasedTableOptions::DataBlockIndexType index_type =
+ BlockBasedTableOptions::kDataBlockBinarySearch;
+ if (data_block_hash_index_builder_.Valid() &&
+ CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) {
+ data_block_hash_index_builder_.Finish(buffer_);
+ index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
+ }
+
+ // footer is a packed format of data_block_index_type and num_restarts
+ uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts);
+
+ PutFixed32(&buffer_, block_footer);
+ finished_ = true;
+ return Slice(buffer_);
+}
+
+void BlockBuilder::Add(const Slice& key, const Slice& value,
+ const Slice* const delta_value) {
+ // Ensure no unsafe mixing of Add and AddWithLastKey
+ assert(!add_with_last_key_called_);
+
+ AddWithLastKeyImpl(key, value, last_key_, delta_value, buffer_.size());
+ if (use_delta_encoding_) {
+ // Update state
+ // We used to just copy the changed data, but it appears to be
+ // faster to just copy the whole thing.
+ last_key_.assign(key.data(), key.size());
+ }
+}
+
+void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value,
+ const Slice& last_key_param,
+ const Slice* const delta_value) {
+ // Ensure no unsafe mixing of Add and AddWithLastKey
+ assert(last_key_.empty());
+#ifndef NDEBUG
+ add_with_last_key_called_ = true;
+#endif
+
+ // Here we make sure to use an empty `last_key` on first call after creation
+ // or Reset. This is more convenient for the caller and we can be more
+ // clever inside BlockBuilder. On this hot code path, we want to avoid
+ // conditional jumps like `buffer_.empty() ? ... : ...` so we can use a
+ // fast min operation instead, with an assertion to be sure our logic is
+ // sound.
+ size_t buffer_size = buffer_.size();
+ size_t last_key_size = last_key_param.size();
+ assert(buffer_size == 0 || buffer_size >= last_key_size);
+
+ Slice last_key(last_key_param.data(), std::min(buffer_size, last_key_size));
+
+ AddWithLastKeyImpl(key, value, last_key, delta_value, buffer_size);
+}
+
+inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key,
+ const Slice& value,
+ const Slice& last_key,
+ const Slice* const delta_value,
+ size_t buffer_size) {
+ assert(!finished_);
+ assert(counter_ <= block_restart_interval_);
+ assert(!use_value_delta_encoding_ || delta_value);
+ size_t shared = 0; // number of bytes shared with prev key
+ if (counter_ >= block_restart_interval_) {
+ // Restart compression
+ restarts_.push_back(static_cast<uint32_t>(buffer_size));
+ estimate_ += sizeof(uint32_t);
+ counter_ = 0;
+ } else if (use_delta_encoding_) {
+ // See how much sharing to do with previous string
+ shared = key.difference_offset(last_key);
+ }
+
+ const size_t non_shared = key.size() - shared;
+
+ if (use_value_delta_encoding_) {
+ // Add "<shared><non_shared>" to buffer_
+ PutVarint32Varint32(&buffer_, static_cast<uint32_t>(shared),
+ static_cast<uint32_t>(non_shared));
+ } else {
+ // Add "<shared><non_shared><value_size>" to buffer_
+ PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared),
+ static_cast<uint32_t>(non_shared),
+ static_cast<uint32_t>(value.size()));
+ }
+
+ // Add string delta to buffer_ followed by value
+ buffer_.append(key.data() + shared, non_shared);
+ // Use value delta encoding only when the key has shared bytes. This would
+ // simplify the decoding, where it can figure out which decoding to use
+ // simply by looking at the shared bytes size.
+ if (shared != 0 && use_value_delta_encoding_) {
+ buffer_.append(delta_value->data(), delta_value->size());
+ } else {
+ buffer_.append(value.data(), value.size());
+ }
+
+ if (data_block_hash_index_builder_.Valid()) {
+ data_block_hash_index_builder_.Add(ExtractUserKey(key),
+ restarts_.size() - 1);
+ }
+
+ counter_++;
+ estimate_ += buffer_.size() - buffer_size;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_builder.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_builder.h
new file mode 100644
index 0000000000..7cb94b3118
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_builder.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <vector>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/table.h"
+#include "table/block_based/data_block_hash_index.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBuilder {
+ public:
+ BlockBuilder(const BlockBuilder&) = delete;
+ void operator=(const BlockBuilder&) = delete;
+
+ explicit BlockBuilder(int block_restart_interval,
+ bool use_delta_encoding = true,
+ bool use_value_delta_encoding = false,
+ BlockBasedTableOptions::DataBlockIndexType index_type =
+ BlockBasedTableOptions::kDataBlockBinarySearch,
+ double data_block_hash_table_util_ratio = 0.75);
+
+ // Reset the contents as if the BlockBuilder was just constructed.
+ void Reset();
+
+ // Swap the contents in BlockBuilder with buffer, then reset the BlockBuilder.
+ void SwapAndReset(std::string& buffer);
+
+ // REQUIRES: Finish() has not been called since the last call to Reset().
+ // REQUIRES: Unless a range tombstone block, key is larger than any previously
+ // added key
+ // DO NOT mix with AddWithLastKey() between Resets. For efficiency, use
+ // AddWithLastKey() in contexts where the previously added key is already
+ // known and delta encoding might be used.
+ void Add(const Slice& key, const Slice& value,
+ const Slice* const delta_value = nullptr);
+
+ // A faster version of Add() if the previous key is already known for all
+ // Add()s.
+ // REQUIRES: Finish() has not been called since the last call to Reset().
+ // REQUIRES: Unless a range tombstone block, key is larger than any previously
+ // added key
+ // REQUIRES: if AddWithLastKey has been called since last Reset(), last_key
+ // is the key from most recent AddWithLastKey. (For convenience, last_key
+ // is ignored on first call after creation or Reset().)
+ // DO NOT mix with Add() between Resets.
+ void AddWithLastKey(const Slice& key, const Slice& value,
+ const Slice& last_key,
+ const Slice* const delta_value = nullptr);
+
+ // Finish building the block and return a slice that refers to the
+ // block contents. The returned slice will remain valid for the
+ // lifetime of this builder or until Reset() is called.
+ Slice Finish();
+
+ // Returns an estimate of the current (uncompressed) size of the block
+ // we are building.
+ inline size_t CurrentSizeEstimate() const {
+ return estimate_ + (data_block_hash_index_builder_.Valid()
+ ? data_block_hash_index_builder_.EstimateSize()
+ : 0);
+ }
+
+ // Returns an estimated block size after appending key and value.
+ size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const;
+
+ // Return true iff no entries have been added since the last Reset()
+ bool empty() const { return buffer_.empty(); }
+
+ private:
+ inline void AddWithLastKeyImpl(const Slice& key, const Slice& value,
+ const Slice& last_key,
+ const Slice* const delta_value,
+ size_t buffer_size);
+
+ const int block_restart_interval_;
+ // TODO(myabandeh): put it into a separate IndexBlockBuilder
+ const bool use_delta_encoding_;
+ // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values
+ const bool use_value_delta_encoding_;
+
+ std::string buffer_; // Destination buffer
+ std::vector<uint32_t> restarts_; // Restart points
+ size_t estimate_;
+ int counter_; // Number of entries emitted since restart
+ bool finished_; // Has Finish() been called?
+ std::string last_key_;
+ DataBlockHashIndexBuilder data_block_hash_index_builder_;
+#ifndef NDEBUG
+ bool add_with_last_key_called_ = false;
+#endif
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_cache.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_cache.cc
new file mode 100644
index 0000000000..a252899d24
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_cache.cc
@@ -0,0 +1,106 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/block_cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void BlockCreateContext::Create(std::unique_ptr<Block_kData>* parsed_out,
+ BlockContents&& block) {
+ parsed_out->reset(new Block_kData(
+ std::move(block), table_options->read_amp_bytes_per_bit, statistics));
+ parsed_out->get()->InitializeDataBlockProtectionInfo(protection_bytes_per_key,
+ raw_ucmp);
+}
+void BlockCreateContext::Create(std::unique_ptr<Block_kIndex>* parsed_out,
+ BlockContents&& block) {
+ parsed_out->reset(new Block_kIndex(std::move(block),
+ /*read_amp_bytes_per_bit*/ 0, statistics));
+ parsed_out->get()->InitializeIndexBlockProtectionInfo(
+ protection_bytes_per_key, raw_ucmp, index_value_is_full,
+ index_has_first_key);
+}
+void BlockCreateContext::Create(
+ std::unique_ptr<Block_kFilterPartitionIndex>* parsed_out,
+ BlockContents&& block) {
+ parsed_out->reset(new Block_kFilterPartitionIndex(
+ std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics));
+ parsed_out->get()->InitializeIndexBlockProtectionInfo(
+ protection_bytes_per_key, raw_ucmp, index_value_is_full,
+ index_has_first_key);
+}
+void BlockCreateContext::Create(
+ std::unique_ptr<Block_kRangeDeletion>* parsed_out, BlockContents&& block) {
+ parsed_out->reset(new Block_kRangeDeletion(
+ std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics));
+}
+void BlockCreateContext::Create(std::unique_ptr<Block_kMetaIndex>* parsed_out,
+ BlockContents&& block) {
+ parsed_out->reset(new Block_kMetaIndex(
+ std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics));
+ parsed_out->get()->InitializeMetaIndexBlockProtectionInfo(
+ protection_bytes_per_key);
+}
+
+void BlockCreateContext::Create(
+ std::unique_ptr<ParsedFullFilterBlock>* parsed_out, BlockContents&& block) {
+ parsed_out->reset(new ParsedFullFilterBlock(
+ table_options->filter_policy.get(), std::move(block)));
+}
+
+void BlockCreateContext::Create(std::unique_ptr<UncompressionDict>* parsed_out,
+ BlockContents&& block) {
+ parsed_out->reset(new UncompressionDict(
+ block.data, std::move(block.allocation), using_zstd));
+}
+
+namespace {
+// For getting SecondaryCache-compatible helpers from a BlockType. This is
+// useful for accessing block cache in untyped contexts, such as for generic
+// cache warming in table builder.
+const std::array<const Cache::CacheItemHelper*,
+ static_cast<unsigned>(BlockType::kInvalid) + 1>
+ kCacheItemFullHelperForBlockType{{
+ BlockCacheInterface<Block_kData>::GetFullHelper(),
+ BlockCacheInterface<ParsedFullFilterBlock>::GetFullHelper(),
+ BlockCacheInterface<Block_kFilterPartitionIndex>::GetFullHelper(),
+ nullptr, // kProperties
+ BlockCacheInterface<UncompressionDict>::GetFullHelper(),
+ BlockCacheInterface<Block_kRangeDeletion>::GetFullHelper(),
+ nullptr, // kHashIndexPrefixes
+ nullptr, // kHashIndexMetadata
+ nullptr, // kMetaIndex (not yet stored in block cache)
+ BlockCacheInterface<Block_kIndex>::GetFullHelper(),
+ nullptr, // kInvalid
+ }};
+
+// For getting basic helpers from a BlockType (no SecondaryCache support)
+const std::array<const Cache::CacheItemHelper*,
+ static_cast<unsigned>(BlockType::kInvalid) + 1>
+ kCacheItemBasicHelperForBlockType{{
+ BlockCacheInterface<Block_kData>::GetBasicHelper(),
+ BlockCacheInterface<ParsedFullFilterBlock>::GetBasicHelper(),
+ BlockCacheInterface<Block_kFilterPartitionIndex>::GetBasicHelper(),
+ nullptr, // kProperties
+ BlockCacheInterface<UncompressionDict>::GetBasicHelper(),
+ BlockCacheInterface<Block_kRangeDeletion>::GetBasicHelper(),
+ nullptr, // kHashIndexPrefixes
+ nullptr, // kHashIndexMetadata
+ nullptr, // kMetaIndex (not yet stored in block cache)
+ BlockCacheInterface<Block_kIndex>::GetBasicHelper(),
+ nullptr, // kInvalid
+ }};
+} // namespace
+
+const Cache::CacheItemHelper* GetCacheItemHelper(
+ BlockType block_type, CacheTier lowest_used_cache_tier) {
+ if (lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier) {
+ return kCacheItemFullHelperForBlockType[static_cast<unsigned>(block_type)];
+ } else {
+ return kCacheItemBasicHelperForBlockType[static_cast<unsigned>(block_type)];
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_cache.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_cache.h
new file mode 100644
index 0000000000..00eaface37
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_cache.h
@@ -0,0 +1,140 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// Code supporting block cache (Cache) access for block-based table, based on
+// the convenient APIs in typed_cache.h
+
+#pragma once
+
+#include <cstdint>
+
+#include "cache/typed_cache.h"
+#include "port/lang.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/parsed_full_filter_block.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Metaprogramming wrappers for Block, to give each type a single role when
+// used with FullTypedCacheInterface.
+// (NOTE: previous attempts to create actual derived classes of Block with
+// virtual calls resulted in performance regression)
+
+class Block_kData : public Block {
+ public:
+ using Block::Block;
+
+ static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kDataBlock;
+ static constexpr BlockType kBlockType = BlockType::kData;
+};
+
+class Block_kIndex : public Block {
+ public:
+ using Block::Block;
+
+ static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kIndexBlock;
+ static constexpr BlockType kBlockType = BlockType::kIndex;
+};
+
+class Block_kFilterPartitionIndex : public Block {
+ public:
+ using Block::Block;
+
+ static constexpr CacheEntryRole kCacheEntryRole =
+ CacheEntryRole::kFilterMetaBlock;
+ static constexpr BlockType kBlockType = BlockType::kFilterPartitionIndex;
+};
+
+class Block_kRangeDeletion : public Block {
+ public:
+ using Block::Block;
+
+ static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kOtherBlock;
+ static constexpr BlockType kBlockType = BlockType::kRangeDeletion;
+};
+
+// Useful for creating the Block even though meta index blocks are not
+// yet stored in block cache
+class Block_kMetaIndex : public Block {
+ public:
+ using Block::Block;
+
+ static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kOtherBlock;
+ static constexpr BlockType kBlockType = BlockType::kMetaIndex;
+};
+
+struct BlockCreateContext : public Cache::CreateContext {
+ BlockCreateContext() {}
+ BlockCreateContext(const BlockBasedTableOptions* _table_options,
+ Statistics* _statistics, bool _using_zstd,
+ uint8_t _protection_bytes_per_key,
+ const Comparator* _raw_ucmp,
+ bool _index_value_is_full = false,
+ bool _index_has_first_key = false)
+ : table_options(_table_options),
+ statistics(_statistics),
+ using_zstd(_using_zstd),
+ protection_bytes_per_key(_protection_bytes_per_key),
+ raw_ucmp(_raw_ucmp),
+ index_value_is_full(_index_value_is_full),
+ index_has_first_key(_index_has_first_key) {}
+
+ const BlockBasedTableOptions* table_options = nullptr;
+ Statistics* statistics = nullptr;
+ bool using_zstd = false;
+ uint8_t protection_bytes_per_key = 0;
+ const Comparator* raw_ucmp = nullptr;
+ bool index_value_is_full;
+ bool index_has_first_key;
+
+ // For TypedCacheInterface
+ template <typename TBlocklike>
+ inline void Create(std::unique_ptr<TBlocklike>* parsed_out,
+ size_t* charge_out, const Slice& data,
+ MemoryAllocator* alloc) {
+ Create(parsed_out,
+ BlockContents(AllocateAndCopyBlock(data, alloc), data.size()));
+ *charge_out = parsed_out->get()->ApproximateMemoryUsage();
+ }
+
+ void Create(std::unique_ptr<Block_kData>* parsed_out, BlockContents&& block);
+ void Create(std::unique_ptr<Block_kIndex>* parsed_out, BlockContents&& block);
+ void Create(std::unique_ptr<Block_kFilterPartitionIndex>* parsed_out,
+ BlockContents&& block);
+ void Create(std::unique_ptr<Block_kRangeDeletion>* parsed_out,
+ BlockContents&& block);
+ void Create(std::unique_ptr<Block_kMetaIndex>* parsed_out,
+ BlockContents&& block);
+ void Create(std::unique_ptr<ParsedFullFilterBlock>* parsed_out,
+ BlockContents&& block);
+ void Create(std::unique_ptr<UncompressionDict>* parsed_out,
+ BlockContents&& block);
+};
+
+// Convenient cache interface to use for block_cache, with support for
+// SecondaryCache.
+template <typename TBlocklike>
+using BlockCacheInterface =
+ FullTypedCacheInterface<TBlocklike, BlockCreateContext>;
+
+// Shortcut name for cache handles under BlockCacheInterface
+template <typename TBlocklike>
+using BlockCacheTypedHandle =
+ typename BlockCacheInterface<TBlocklike>::TypedHandle;
+
+// Selects the right helper based on BlockType and CacheTier
+const Cache::CacheItemHelper* GetCacheItemHelper(
+ BlockType block_type,
+ CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier);
+
+// For SFINAE check that a type is "blocklike" with a kCacheEntryRole member.
+// Can get difficult compiler/linker errors without a good check like this.
+template <typename TUse, typename TBlocklike>
+using WithBlocklikeCheck = std::enable_if_t<
+ TBlocklike::kCacheEntryRole == CacheEntryRole::kMisc || true, TUse>;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefetcher.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefetcher.cc
new file mode 100644
index 0000000000..83ec2cb060
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefetcher.cc
@@ -0,0 +1,120 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/block_prefetcher.h"
+
+#include "rocksdb/file_system.h"
+#include "table/block_based/block_based_table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+void BlockPrefetcher::PrefetchIfNeeded(
+ const BlockBasedTable::Rep* rep, const BlockHandle& handle,
+ const size_t readahead_size, bool is_for_compaction,
+ const bool no_sequential_checking,
+ const Env::IOPriority rate_limiter_priority) {
+ // num_file_reads is used by FilePrefetchBuffer only when
+ // implicit_auto_readahead is set.
+ if (is_for_compaction) {
+ rep->CreateFilePrefetchBufferIfNotExists(
+ compaction_readahead_size_, compaction_readahead_size_,
+ &prefetch_buffer_, /*implicit_auto_readahead=*/false,
+ /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0);
+ return;
+ }
+
+ // Explicit user-requested readahead.
+ if (readahead_size > 0) {
+ rep->CreateFilePrefetchBufferIfNotExists(
+ readahead_size, readahead_size, &prefetch_buffer_,
+ /*implicit_auto_readahead=*/false, /*num_file_reads=*/0,
+ /*num_file_reads_for_auto_readahead=*/0);
+ return;
+ }
+
+ // Implicit readahead.
+
+ // If max_auto_readahead_size is set to be 0 by user, no data will be
+ // prefetched.
+ size_t max_auto_readahead_size = rep->table_options.max_auto_readahead_size;
+ if (max_auto_readahead_size == 0 || initial_auto_readahead_size_ == 0) {
+ return;
+ }
+
+ if (initial_auto_readahead_size_ > max_auto_readahead_size) {
+ initial_auto_readahead_size_ = max_auto_readahead_size;
+ }
+
+ // In case of no_sequential_checking, it will skip the num_file_reads_
+ // check and will always create the FilePrefetchBuffer.
+ if (no_sequential_checking) {
+ rep->CreateFilePrefetchBufferIfNotExists(
+ initial_auto_readahead_size_, max_auto_readahead_size,
+ &prefetch_buffer_, /*implicit_auto_readahead=*/true,
+ /*num_file_reads=*/0,
+ rep->table_options.num_file_reads_for_auto_readahead);
+ return;
+ }
+
+ size_t len = BlockBasedTable::BlockSizeWithTrailer(handle);
+ size_t offset = handle.offset();
+
+ // If FS supports prefetching (readahead_limit_ will be non-zero in that
+ // case) and the current block exists in the prefetch buffer then return.
+ if (offset + len <= readahead_limit_) {
+ UpdateReadPattern(offset, len);
+ return;
+ }
+
+ if (!IsBlockSequential(offset)) {
+ UpdateReadPattern(offset, len);
+ ResetValues(rep->table_options.initial_auto_readahead_size);
+ return;
+ }
+ UpdateReadPattern(offset, len);
+
+ // Implicit auto readahead, which will be enabled if the number of reads
+ // reached `table_options.num_file_reads_for_auto_readahead` (default: 2) and
+ // scans are sequential.
+ num_file_reads_++;
+ if (num_file_reads_ <= rep->table_options.num_file_reads_for_auto_readahead) {
+ return;
+ }
+
+ if (rep->file->use_direct_io()) {
+ rep->CreateFilePrefetchBufferIfNotExists(
+ initial_auto_readahead_size_, max_auto_readahead_size,
+ &prefetch_buffer_, /*implicit_auto_readahead=*/true, num_file_reads_,
+ rep->table_options.num_file_reads_for_auto_readahead);
+ return;
+ }
+
+ if (readahead_size_ > max_auto_readahead_size) {
+ readahead_size_ = max_auto_readahead_size;
+ }
+
+ // If prefetch is not supported, fall back to the internal prefetch buffer.
+ // Discarding other return statuses of Prefetch calls intentionally, as
+ // we can fall back to reading from disk if Prefetch fails.
+ Status s = rep->file->Prefetch(
+ handle.offset(),
+ BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_,
+ rate_limiter_priority);
+ if (s.IsNotSupported()) {
+ rep->CreateFilePrefetchBufferIfNotExists(
+ initial_auto_readahead_size_, max_auto_readahead_size,
+ &prefetch_buffer_, /*implicit_auto_readahead=*/true, num_file_reads_,
+ rep->table_options.num_file_reads_for_auto_readahead);
+ return;
+ }
+
+ readahead_limit_ = offset + len + readahead_size_;
+ // Keep exponentially increasing readahead size until
+ // max_auto_readahead_size.
+ readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefetcher.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefetcher.h
new file mode 100644
index 0000000000..518868a301
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefetcher.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "table/block_based/block_based_table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+class BlockPrefetcher {
+ public:
+ explicit BlockPrefetcher(size_t compaction_readahead_size,
+ size_t initial_auto_readahead_size)
+ : compaction_readahead_size_(compaction_readahead_size),
+ readahead_size_(initial_auto_readahead_size),
+ initial_auto_readahead_size_(initial_auto_readahead_size) {}
+
+ void PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
+ const BlockHandle& handle, size_t readahead_size,
+ bool is_for_compaction,
+ const bool no_sequential_checking,
+ Env::IOPriority rate_limiter_priority);
+ FilePrefetchBuffer* prefetch_buffer() { return prefetch_buffer_.get(); }
+
+ void UpdateReadPattern(const uint64_t& offset, const size_t& len) {
+ prev_offset_ = offset;
+ prev_len_ = len;
+ }
+
+ bool IsBlockSequential(const uint64_t& offset) {
+ return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset));
+ }
+
+ void ResetValues(size_t initial_auto_readahead_size) {
+ num_file_reads_ = 1;
+ // initial_auto_readahead_size_ can differ from the value passed to
+ // BlockBasedTableOptions.initial_auto_readahead_size in case of
+ // adaptive_readahead, so fall back readahead_size_ to that value on
+ // reset.
+ initial_auto_readahead_size_ = initial_auto_readahead_size;
+ readahead_size_ = initial_auto_readahead_size_;
+ readahead_limit_ = 0;
+ return;
+ }
+
+ void SetReadaheadState(ReadaheadFileInfo::ReadaheadInfo* readahead_info) {
+ num_file_reads_ = readahead_info->num_file_reads;
+ initial_auto_readahead_size_ = readahead_info->readahead_size;
+ TEST_SYNC_POINT_CALLBACK("BlockPrefetcher::SetReadaheadState",
+ &initial_auto_readahead_size_);
+ }
+
+ private:
+ // Readahead size used in compaction, its value is used only if
+ // lookup_context_.caller = kCompaction.
+ size_t compaction_readahead_size_;
+
+ // readahead_size_ is used if underlying FS supports prefetching.
+ size_t readahead_size_;
+ size_t readahead_limit_ = 0;
+ // initial_auto_readahead_size_ is used if RocksDB uses internal prefetch
+ // buffer.
+ uint64_t initial_auto_readahead_size_;
+ uint64_t num_file_reads_ = 0;
+ uint64_t prev_offset_ = 0;
+ size_t prev_len_ = 0;
+ std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefix_index.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefix_index.cc
new file mode 100644
index 0000000000..c83701d69c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefix_index.cc
@@ -0,0 +1,226 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/block_prefix_index.h"
+
+#include <vector>
+
+#include "memory/arena.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+inline uint32_t Hash(const Slice& s) {
+ return ROCKSDB_NAMESPACE::Hash(s.data(), s.size(), 0);
+}
+
+inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
+ return Hash(prefix) % num_buckets;
+}
+
+// The prefix block index is simply a bucket array, with each entry pointing to
+// the blocks that span the prefixes hashed to this bucket.
+//
+// To reduce memory footprint, if there is only one block per bucket, the
+// entry stores the block id directly. If there is more than one block per
+// bucket, because of hash collision or a single prefix spanning multiple
+// blocks, the entry points to an array of block ids. The block array is an
+// array of uint32_t's. The first uint32_t indicates the total number of
+// blocks, followed by the block ids.
+//
+// To differentiate the two cases, the high order bit of the entry indicates
+// whether it is a 'pointer' into a separate block array.
+// 0x7FFFFFFF is reserved for empty bucket.
+
+const uint32_t kNoneBlock = 0x7FFFFFFF;
+const uint32_t kBlockArrayMask = 0x80000000;
+
+inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; }
+
+inline bool IsBlockId(uint32_t block_id) {
+ return (block_id & kBlockArrayMask) == 0;
+}
+
+inline uint32_t DecodeIndex(uint32_t block_id) {
+ uint32_t index = block_id ^ kBlockArrayMask;
+ assert(index < kBlockArrayMask);
+ return index;
+}
+
+inline uint32_t EncodeIndex(uint32_t index) {
+ assert(index < kBlockArrayMask);
+ return index | kBlockArrayMask;
+}
+
+// temporary storage for prefix information during index building
+struct PrefixRecord {
+ Slice prefix;
+ uint32_t start_block;
+ uint32_t end_block;
+ uint32_t num_blocks;
+ PrefixRecord* next;
+};
+
+class BlockPrefixIndex::Builder {
+ public:
+ void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) {
+ PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
+ arena_.AllocateAligned(sizeof(PrefixRecord)));
+ record->prefix = key_prefix;
+ record->start_block = start_block;
+ record->end_block = start_block + num_blocks - 1;
+ record->num_blocks = num_blocks;
+ prefixes_.push_back(record);
+ }
+
+ BlockPrefixIndex* Finish(const SliceTransform* prefix_extractor) {
+ // For now, use roughly 1:1 prefix to bucket ratio.
+ uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
+
+ // Collect prefix records that hash to the same bucket into a single
+ // linked list.
+ std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
+ std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
+ for (PrefixRecord* current : prefixes_) {
+ uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
+ // merge the prefix block span if the first block of this prefix is
+ // connected to the last block of the previous prefix.
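+ // For example (illustrative numbers, not from this code): if the
+ // previous record in this bucket covers blocks [3, 5] and the current
+ // record covers [6, 8] (distance = 6 - 5 = 1), the previous span is
+ // widened to [3, 8] instead of chaining a second record.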
+ PrefixRecord* prev = prefixes_per_bucket[bucket];
+ if (prev) {
+ assert(current->start_block >= prev->end_block);
+ auto distance = current->start_block - prev->end_block;
+ if (distance <= 1) {
+ prev->end_block = current->end_block;
+ prev->num_blocks = prev->end_block - prev->start_block + 1;
+ num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
+ continue;
+ }
+ }
+ current->next = prev;
+ prefixes_per_bucket[bucket] = current;
+ num_blocks_per_bucket[bucket] += current->num_blocks;
+ }
+
+ // Calculate the block array buffer size
+ uint32_t total_block_array_entries = 0;
+ for (uint32_t i = 0; i < num_buckets; i++) {
+ uint32_t num_blocks = num_blocks_per_bucket[i];
+ if (num_blocks > 1) {
+ total_block_array_entries += (num_blocks + 1);
+ }
+ }
+
+ // Populate the final prefix block index
+ uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
+ uint32_t* buckets = new uint32_t[num_buckets];
+ uint32_t offset = 0;
+ for (uint32_t i = 0; i < num_buckets; i++) {
+ uint32_t num_blocks = num_blocks_per_bucket[i];
+ if (num_blocks == 0) {
+ assert(prefixes_per_bucket[i] == nullptr);
+ buckets[i] = kNoneBlock;
+ } else if (num_blocks == 1) {
+ assert(prefixes_per_bucket[i] != nullptr);
+ assert(prefixes_per_bucket[i]->next == nullptr);
+ buckets[i] = prefixes_per_bucket[i]->start_block;
+ } else {
+ assert(total_block_array_entries > 0);
+ assert(prefixes_per_bucket[i] != nullptr);
+ buckets[i] = EncodeIndex(offset);
+ block_array_buffer[offset] = num_blocks;
+ uint32_t* last_block = &block_array_buffer[offset + num_blocks];
+ auto current = prefixes_per_bucket[i];
+ // populate block ids from largest to smallest
+ while (current != nullptr) {
+ for (uint32_t iter = 0; iter < current->num_blocks; iter++) {
+ *last_block = current->end_block - iter;
+ last_block--;
+ }
+ current = current->next;
+ }
+ assert(last_block == &block_array_buffer[offset]);
+ offset += (num_blocks + 1);
+ }
+ }
+
+ assert(offset == total_block_array_entries);
+
+ return new BlockPrefixIndex(prefix_extractor, num_buckets, buckets,
+ total_block_array_entries, block_array_buffer);
+ }
+
+ private:
+ std::vector<PrefixRecord*> prefixes_;
+ Arena arena_;
+};
+
+Status BlockPrefixIndex::Create(const SliceTransform* prefix_extractor,
+ const Slice& prefixes, const Slice& prefix_meta,
+ BlockPrefixIndex** prefix_index) {
+ uint64_t pos = 0;
+ auto meta_pos = prefix_meta;
+ Status s;
+ Builder builder;
+
+ while (!meta_pos.empty()) {
+ uint32_t prefix_size = 0;
+ uint32_t entry_index = 0;
+ uint32_t num_blocks = 0;
+ if (!GetVarint32(&meta_pos, &prefix_size) ||
+ !GetVarint32(&meta_pos, &entry_index) ||
+ !GetVarint32(&meta_pos, &num_blocks)) {
+ s = Status::Corruption(
+ "Corrupted prefix meta block: unable to read from it.");
+ break;
+ }
+ if (pos + prefix_size > prefixes.size()) {
+ s = Status::Corruption(
+ "Corrupted prefix meta block: size inconsistency.");
+ break;
+ }
+ Slice prefix(prefixes.data() + pos, prefix_size);
+ builder.Add(prefix, entry_index, num_blocks);
+
+ pos += prefix_size;
+ }
+
+ if (s.ok() && pos != prefixes.size()) {
+ s = Status::Corruption("Corrupted prefix meta block");
+ }
+
+ if (s.ok()) {
+ *prefix_index = builder.Finish(prefix_extractor);
+ }
+
+ return s;
+}
+
+uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) {
+ Slice prefix = internal_prefix_extractor_.Transform(key);
+
+ uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
+ uint32_t block_id = buckets_[bucket];
+
+ if (IsNone(block_id)) {
+ return 0;
+ } else if (IsBlockId(block_id)) {
+ *blocks = &buckets_[bucket];
+ return 1;
+ } else {
+ uint32_t index = DecodeIndex(block_id);
+ assert(index < num_block_array_buffer_entries_);
+ *blocks = &block_array_buffer_[index + 1];
+ uint32_t num_blocks = block_array_buffer_[index];
+ assert(num_blocks > 1);
+ assert(index + num_blocks < num_block_array_buffer_entries_);
+ return num_blocks;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefix_index.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefix_index.h
new file mode 100644
index 0000000000..4db8e2c654
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_prefix_index.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <stdint.h>
+
+#include "db/dbformat.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+class Iterator;
+class Slice;
+class SliceTransform;
+
+// Build a hash-based index to speed up the lookup for "index block".
+// BlockPrefixIndex accepts a key and, if found, returns its restart index
+// within that index block.
+class BlockPrefixIndex {
+ public:
+ // Maps a key to a list of data blocks that could potentially contain
+ // the key, based on the prefix.
+ // Returns the total number of relevant blocks, 0 means the key does
+ // not exist.
+ uint32_t GetBlocks(const Slice& key, uint32_t** blocks);
+
+ size_t ApproximateMemoryUsage() const {
+ return sizeof(BlockPrefixIndex) +
+ (num_block_array_buffer_entries_ + num_buckets_) * sizeof(uint32_t);
+ }
+
+ // Create hash index by reading from the metadata blocks.
+ // Note: table reader (caller) is responsible for keeping shared_ptr to
+ // underlying prefix extractor
+ // @params prefixes: a sequence of prefixes.
+ // @params prefix_meta: contains the "metadata" of the prefixes.
+ static Status Create(const SliceTransform* hash_key_extractor,
+ const Slice& prefixes, const Slice& prefix_meta,
+ BlockPrefixIndex** prefix_index);
+
+ ~BlockPrefixIndex() {
+ delete[] buckets_;
+ delete[] block_array_buffer_;
+ }
+
+ private:
+ class Builder;
+ friend Builder;
+
+ BlockPrefixIndex(const SliceTransform* prefix_extractor, uint32_t num_buckets,
+ uint32_t* buckets, uint32_t num_block_array_buffer_entries,
+ uint32_t* block_array_buffer)
+ : internal_prefix_extractor_(prefix_extractor),
+ num_buckets_(num_buckets),
+ num_block_array_buffer_entries_(num_block_array_buffer_entries),
+ buckets_(buckets),
+ block_array_buffer_(block_array_buffer) {}
+
+ InternalKeySliceTransform internal_prefix_extractor_;
+
+ uint32_t num_buckets_;
+ uint32_t num_block_array_buffer_entries_;
+ uint32_t* buckets_;
+ uint32_t* block_array_buffer_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_type.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_type.h
new file mode 100644
index 0000000000..a9d6a1a773
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/block_type.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Represents the types of blocks used in the block based table format. +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details. +// For code sanity, BlockType should imply a specific TBlocklike for +// BlocklikeTraits. +enum class BlockType : uint8_t { + kData, + kFilter, // for second level partitioned filters and full filters + kFilterPartitionIndex, // for top-level index of filter partitions + kProperties, + kCompressionDictionary, + kRangeDeletion, + kHashIndexPrefixes, + kHashIndexMetadata, + kMetaIndex, + kIndex, + // Note: keep kInvalid the last value when adding new enum values. + kInvalid +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/cachable_entry.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/cachable_entry.h new file mode 100644 index 0000000000..3cd1bb807a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/cachable_entry.h @@ -0,0 +1,244 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include + +#include "port/likely.h" +#include "rocksdb/advanced_cache.h" +#include "rocksdb/cleanable.h" + +namespace ROCKSDB_NAMESPACE { + +// CachableEntry is a handle to an object that may or may not be in the block +// cache. It is used in a variety of ways: +// +// 1) It may refer to an object in the block cache. In this case, cache_ and +// cache_handle_ are not nullptr, and the cache handle has to be released when +// the CachableEntry is destroyed (the lifecycle of the cached object, on the +// other hand, is managed by the cache itself). +// 2) It may uniquely own the (non-cached) object it refers to (examples include +// a block read directly from file, or uncompressed blocks when there is a +// compressed block cache but no uncompressed block cache). In such cases, the +// object has to be destroyed when the CachableEntry is destroyed. +// 3) It may point to an object (cached or not) without owning it. In this case, +// no action is needed when the CachableEntry is destroyed. +// 4) Sometimes, management of a cached or owned object (see #1 and #2 above) +// is transferred to some other object. This is used for instance with iterators +// (where cleanup is performed using a chain of cleanup functions, +// see Cleanable). +// +// Because of #1 and #2 above, copying a CachableEntry is not safe (and thus not +// allowed); hence, this is a move-only type, where a move transfers the +// management responsibilities, and leaves the source object in an empty state. 
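+//
+// Usage sketch (illustrative only; Block, block, cache, and handle are
+// placeholder names, not part of this header):
+//
+//   CachableEntry<Block> entry;
+//   entry.SetCachedValue(block, cache, handle);     // case 1: cache-owned
+//   CachableEntry<Block> moved = std::move(entry);  // handle is transferred
+//   assert(entry.IsEmpty());  // moved-from entry is reset to empty
+//   // destroying `moved` releases the cache handle via Cache::Release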
+ +template +class CachableEntry { + public: + CachableEntry() = default; + + CachableEntry(T* value, Cache* cache, Cache::Handle* cache_handle, + bool own_value) + : value_(value), + cache_(cache), + cache_handle_(cache_handle), + own_value_(own_value) { + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + } + + CachableEntry(const CachableEntry&) = delete; + CachableEntry& operator=(const CachableEntry&) = delete; + + CachableEntry(CachableEntry&& rhs) noexcept + : value_(rhs.value_), + cache_(rhs.cache_), + cache_handle_(rhs.cache_handle_), + own_value_(rhs.own_value_) { + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + + rhs.ResetFields(); + } + + CachableEntry& operator=(CachableEntry&& rhs) noexcept { + if (UNLIKELY(this == &rhs)) { + return *this; + } + + ReleaseResource(); + + value_ = rhs.value_; + cache_ = rhs.cache_; + cache_handle_ = rhs.cache_handle_; + own_value_ = rhs.own_value_; + + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + + rhs.ResetFields(); + + return *this; + } + + ~CachableEntry() { ReleaseResource(); } + + bool IsEmpty() const { + return value_ == nullptr && cache_ == nullptr && cache_handle_ == nullptr && + !own_value_; + } + + bool IsCached() const { + assert(!!cache_ == !!cache_handle_); + + return cache_handle_ != nullptr; + } + + T* GetValue() const { return value_; } + Cache* GetCache() const { return cache_; } + Cache::Handle* GetCacheHandle() const { return cache_handle_; } + bool GetOwnValue() const { return own_value_; } + + void Reset() { + ReleaseResource(); + ResetFields(); + } + + void TransferTo(Cleanable* cleanable) { + if (cleanable) { + if (cache_handle_ != nullptr) { + assert(cache_ != nullptr); + cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, cache_handle_); + } else if (own_value_) { + cleanable->RegisterCleanup(&DeleteValue, value_, nullptr); + } + } + + ResetFields(); + } + + void SetOwnedValue(std::unique_ptr&& value) { + assert(value.get() != nullptr); + + if (UNLIKELY(value_ == value.get() && own_value_)) { + assert(cache_ == nullptr && cache_handle_ == nullptr); + return; + } + + Reset(); + + value_ = value.release(); + own_value_ = true; + } + + void SetUnownedValue(T* value) { + assert(value != nullptr); + + if (UNLIKELY(value_ == value && cache_ == nullptr && + cache_handle_ == nullptr && !own_value_)) { + return; + } + + Reset(); + + value_ = value; + assert(!own_value_); + } + + void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) { + assert(cache != nullptr); + assert(cache_handle != nullptr); + + if (UNLIKELY(value_ == value && cache_ == cache && + cache_handle_ == cache_handle && !own_value_)) { + return; + } + + Reset(); + + value_ = value; + cache_ = cache; + cache_handle_ = cache_handle; + assert(!own_value_); + } + + // Since this class is essentially an elaborate pointer, it's sometimes + // useful to be able to upcast or downcast the base type of the pointer, + // especially when interacting with typed_cache.h. 
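+  //
+  // Sketch (hypothetical types): assuming Derived inherits from Base and
+  // both have the same size, a CachableEntry<Derived> can be viewed as a
+  // CachableEntry<Base> in place, without copying or touching the cache
+  // handle:
+  //
+  //   CachableEntry<Derived> entry = /* ... */;
+  //   CachableEntry<Base>& base_view = entry.As<Base>();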
+ template + std::enable_if_t || + std::is_base_of_v), + /* Actual return type */ + CachableEntry&> + As() { + CachableEntry* result_ptr = + reinterpret_cast*>(this); + // Ensure no weirdness in template instantiations + assert(static_cast(&this->value_) == + static_cast(&result_ptr->value_)); + assert(&this->cache_handle_ == &result_ptr->cache_handle_); + // This function depends on no arithmetic involved in the pointer + // conversion, which is not statically checkable. + assert(static_cast(this->value_) == + static_cast(result_ptr->value_)); + return *result_ptr; + } + + private: + void ReleaseResource() noexcept { + if (LIKELY(cache_handle_ != nullptr)) { + assert(cache_ != nullptr); + cache_->Release(cache_handle_); + } else if (own_value_) { + delete value_; + } + } + + void ResetFields() noexcept { + value_ = nullptr; + cache_ = nullptr; + cache_handle_ = nullptr; + own_value_ = false; + } + + static void ReleaseCacheHandle(void* arg1, void* arg2) { + Cache* const cache = static_cast(arg1); + assert(cache); + + Cache::Handle* const cache_handle = static_cast(arg2); + assert(cache_handle); + + cache->Release(cache_handle); + } + + static void DeleteValue(void* arg1, void* /* arg2 */) { + delete static_cast(arg1); + } + + private: + // Have to be your own best friend + template + friend class CachableEntry; + + T* value_ = nullptr; + Cache* cache_ = nullptr; + Cache::Handle* cache_handle_ = nullptr; + bool own_value_ = false; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_footer.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_footer.cc new file mode 100644 index 0000000000..5d5d8ed55e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_footer.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/block_based/data_block_footer.h" + +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +const int kDataBlockIndexTypeBitShift = 31; + +// 0x7FFFFFFF +const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u; + +// 0x7FFFFFFF +const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u; + +uint32_t PackIndexTypeAndNumRestarts( + BlockBasedTableOptions::DataBlockIndexType index_type, + uint32_t num_restarts) { + if (num_restarts > kMaxNumRestarts) { + assert(0); // mute travis "unused" warning + } + + uint32_t block_footer = num_restarts; + if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) { + block_footer |= 1u << kDataBlockIndexTypeBitShift; + } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) { + assert(0); + } + + return block_footer; +} + +void UnPackIndexTypeAndNumRestarts( + uint32_t block_footer, + BlockBasedTableOptions::DataBlockIndexType* index_type, + uint32_t* num_restarts) { + if (index_type) { + if (block_footer & 1u << kDataBlockIndexTypeBitShift) { + *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } else { + *index_type = BlockBasedTableOptions::kDataBlockBinarySearch; + } + } + + if (num_restarts) { + *num_restarts = block_footer & kNumRestartsMask; + assert(*num_restarts <= kMaxNumRestarts); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_footer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_footer.h new file mode 100644 index 0000000000..c1cfd47309 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_footer.h @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +uint32_t PackIndexTypeAndNumRestarts( + BlockBasedTableOptions::DataBlockIndexType index_type, + uint32_t num_restarts); + +void UnPackIndexTypeAndNumRestarts( + uint32_t block_footer, + BlockBasedTableOptions::DataBlockIndexType* index_type, + uint32_t* num_restarts); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_hash_index.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_hash_index.cc new file mode 100644 index 0000000000..c579dcc43d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_hash_index.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#include "table/block_based/data_block_hash_index.h" + +#include +#include + +#include "rocksdb/slice.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +void DataBlockHashIndexBuilder::Add(const Slice& key, + const size_t restart_index) { + assert(Valid()); + if (restart_index > kMaxRestartSupportedByHashIndex) { + valid_ = false; + return; + } + + uint32_t hash_value = GetSliceHash(key); + hash_and_restart_pairs_.emplace_back(hash_value, + static_cast(restart_index)); + estimated_num_buckets_ += bucket_per_key_; +} + +void DataBlockHashIndexBuilder::Finish(std::string& buffer) { + assert(Valid()); + uint16_t num_buckets = static_cast(estimated_num_buckets_); + + if (num_buckets == 0) { + num_buckets = 1; // sanity check + } + + // The build-in hash cannot well distribute strings when into different + // buckets when num_buckets is power of two, resulting in high hash + // collision. + // We made the num_buckets to be odd to avoid this issue. + num_buckets |= 1; + + std::vector buckets(num_buckets, kNoEntry); + // write the restart_index array + for (auto& entry : hash_and_restart_pairs_) { + uint32_t hash_value = entry.first; + uint8_t restart_index = entry.second; + uint16_t buck_idx = static_cast(hash_value % num_buckets); + if (buckets[buck_idx] == kNoEntry) { + buckets[buck_idx] = restart_index; + } else if (buckets[buck_idx] != restart_index) { + // same bucket cannot store two different restart_index, mark collision + buckets[buck_idx] = kCollision; + } + } + + for (uint8_t restart_index : buckets) { + buffer.append( + const_cast(reinterpret_cast(&restart_index)), + sizeof(restart_index)); + } + + // write NUM_BUCK + PutFixed16(&buffer, num_buckets); + + assert(buffer.size() <= kMaxBlockSizeSupportedByHashIndex); +} + +void DataBlockHashIndexBuilder::Reset() { + estimated_num_buckets_ = 0; + valid_ = true; + hash_and_restart_pairs_.clear(); +} + +void DataBlockHashIndex::Initialize(const char* data, uint16_t size, + uint16_t* map_offset) { + assert(size >= sizeof(uint16_t)); // NUM_BUCKETS + num_buckets_ = DecodeFixed16(data + size - sizeof(uint16_t)); + assert(num_buckets_ > 0); + assert(size > num_buckets_ * sizeof(uint8_t)); + *map_offset = static_cast(size - sizeof(uint16_t) - + num_buckets_ * sizeof(uint8_t)); +} + +uint8_t DataBlockHashIndex::Lookup(const char* data, uint32_t map_offset, + const Slice& key) const { + uint32_t hash_value = GetSliceHash(key); + uint16_t idx = static_cast(hash_value % num_buckets_); + const char* bucket_table = data + map_offset; + return static_cast(*(bucket_table + idx * sizeof(uint8_t))); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_hash_index.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_hash_index.h new file mode 100644 index 0000000000..3215221755 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/data_block_hash_index.h @@ -0,0 +1,137 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { +// This is an experimental feature aiming to reduce the CPU utilization of +// point-lookup within a data-block. 
It is only used in data blocks, and not +// in meta-data blocks or per-table index blocks. +// +// It only used to support BlockBasedTable::Get(). +// +// A serialized hash index is appended to the data-block. The new block data +// format is as follows: +// +// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER] +// +// RI: Restart Interval (the same as the default data-block format) +// RI_IDX: Restart Interval index (the same as the default data-block format) +// HASH_IDX: The new data-block hash index feature. +// FOOTER: A 32bit block footer, which is the NUM_RESTARTS with the MSB as +// the flag indicating if this hash index is in use. Note that +// given a data block < 32KB, the MSB is never used. So we can +// borrow the MSB as the hash index flag. Therefore, this format is +// compatible with the legacy data-blocks with num_restarts < 32768, +// as the MSB is 0. +// +// The format of the data-block hash index is as follows: +// +// HASH_IDX: [B B B ... B NUM_BUCK] +// +// B: bucket, an array of restart index. Each buckets is uint8_t. +// NUM_BUCK: Number of buckets, which is the length of the bucket array. +// +// We reserve two special flag: +// kNoEntry=255, +// kCollision=254. +// +// Therefore, the max number of restarts this hash index can supoport is 253. +// +// Buckets are initialized to be kNoEntry. +// +// When storing a key in the hash index, the key is first hashed to a bucket. +// If there the bucket is empty (kNoEntry), the restart index is stored in +// the bucket. If there is already a restart index there, we will update the +// existing restart index to a collision marker (kCollision). If the +// the bucket is already marked as collision, we do not store the restart +// index either. +// +// During query process, a key is first hashed to a bucket. Then we examine if +// the buckets store nothing (kNoEntry) or the bucket had a collision +// (kCollision). If either of those happens, we get the restart index of +// the key and will directly go to the restart interval to search the key. +// +// Note that we only support blocks with #restart_interval < 254. If a block +// has more restart interval than that, hash index will not be create for it. + +const uint8_t kNoEntry = 255; +const uint8_t kCollision = 254; +const uint8_t kMaxRestartSupportedByHashIndex = 253; + +// Because we use uint16_t address, we only support block no more than 64KB +const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16; +const double kDefaultUtilRatio = 0.75; + +class DataBlockHashIndexBuilder { + public: + DataBlockHashIndexBuilder() + : bucket_per_key_(-1 /*uninitialized marker*/), + estimated_num_buckets_(0), + valid_(false) {} + + void Initialize(double util_ratio) { + if (util_ratio <= 0) { + util_ratio = kDefaultUtilRatio; // sanity check + } + bucket_per_key_ = 1 / util_ratio; + valid_ = true; + } + + inline bool Valid() const { return valid_ && bucket_per_key_ > 0; } + void Add(const Slice& key, const size_t restart_index); + void Finish(std::string& buffer); + void Reset(); + inline size_t EstimateSize() const { + uint16_t estimated_num_buckets = + static_cast(estimated_num_buckets_); + + // Maching the num_buckets number in DataBlockHashIndexBuilder::Finish. 
+ estimated_num_buckets |= 1; + + return sizeof(uint16_t) + + static_cast(estimated_num_buckets * sizeof(uint8_t)); + } + + private: + double bucket_per_key_; // is the multiplicative inverse of util_ratio_ + double estimated_num_buckets_; + + // Now the only usage for `valid_` is to mark false when the inserted + // restart_index is larger than supported. In this case HashIndex is not + // appended to the block content. + bool valid_; + + std::vector> hash_and_restart_pairs_; + friend class DataBlockHashIndex_DataBlockHashTestSmall_Test; +}; + +class DataBlockHashIndex { + public: + DataBlockHashIndex() : num_buckets_(0) {} + + void Initialize(const char* data, uint16_t size, uint16_t* map_offset); + + uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const; + + inline bool Valid() { return num_buckets_ != 0; } + + private: + // To make the serialized hash index compact and to save the space overhead, + // here all the data fields persisted in the block are in uint16 format. + // We find that a uint16 is large enough to index every offset of a 64KiB + // block. + // So in other words, DataBlockHashIndex does not support block size equal + // or greater then 64KiB. + uint16_t num_buckets_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_block.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_block.h new file mode 100644 index 0000000000..b14858c020 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_block.h @@ -0,0 +1,183 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A filter block is stored near the end of a Table file. It contains +// filters (e.g., bloom filters) for all data blocks in the table combined +// into a single filter block. + +#pragma once + +#include +#include + +#include +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "table/format.h" +#include "table/multiget_context.h" +#include "trace_replay/block_cache_tracer.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +const uint64_t kNotValid = ULLONG_MAX; +class FilterPolicy; + +class GetContext; +using MultiGetRange = MultiGetContext::Range; + +// A FilterBlockBuilder is used to construct all of the filters for a +// particular Table. It generates a single string which is stored as +// a special block in the Table, or partitioned into smaller filters. 
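+//
+// Illustrative call sequence (hypothetical keys; the concrete builder type
+// is chosen by the table builder), matching the regexp stated below:
+//
+//   FilterBlockBuilder* b = /* ... */;
+//   b->Add("user_key_1");
+//   b->Add("user_key_2");
+//   Slice filter = b->Finish();  // serialized filter block contents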
+// +// The sequence of calls to FilterBlockBuilder must match the regexp: +// Add* Finish +class FilterBlockBuilder { + public: + explicit FilterBlockBuilder() {} + // No copying allowed + FilterBlockBuilder(const FilterBlockBuilder&) = delete; + void operator=(const FilterBlockBuilder&) = delete; + + virtual ~FilterBlockBuilder() {} + + virtual void Add( + const Slice& key_without_ts) = 0; // Add a key to current filter + virtual bool IsEmpty() const = 0; // Empty == none added + // For reporting stats on how many entries the builder considered unique + virtual size_t EstimateEntriesAdded() = 0; + Slice Finish() { // Generate Filter + const BlockHandle empty_handle; + Status dont_care_status; + auto ret = Finish(empty_handle, &dont_care_status); + assert(dont_care_status.ok()); + return ret; + } + // If filter_data is not nullptr, Finish() may transfer ownership of + // underlying filter data to the caller, so that it can be freed as soon as + // possible. BlockBasedFilterBlock will ignore this parameter. + // + virtual Slice Finish( + const BlockHandle& tmp /* only used in PartitionedFilterBlock as + last_partition_block_handle */ + , + Status* status, std::unique_ptr* filter_data = nullptr) = 0; + + // This is called when finishes using the FilterBitsBuilder + // in order to release memory usage and cache charge + // associated with it timely + virtual void ResetFilterBitsBuilder() {} + + // To optionally post-verify the filter returned from + // FilterBlockBuilder::Finish. + // Return Status::OK() if skipped. + virtual Status MaybePostVerifyFilter(const Slice& /* filter_content */) { + return Status::OK(); + } +}; + +// A FilterBlockReader is used to parse filter from SST table. +// KeyMayMatch and PrefixMayMatch would trigger filter checking +// +// BlockBased/Full FilterBlock would be called in the same way. +class FilterBlockReader { + public: + FilterBlockReader() = default; + virtual ~FilterBlockReader() = default; + + FilterBlockReader(const FilterBlockReader&) = delete; + FilterBlockReader& operator=(const FilterBlockReader&) = delete; + + /** + * If no_io is set, then it returns true if it cannot answer the query without + * reading data from disk. This is used in PartitionedFilterBlockReader to + * avoid reading partitions that are not in block cache already + * + * Normally filters are built on only the user keys and the InternalKey is not + * needed for a query. The index in PartitionedFilterBlockReader however is + * built upon InternalKey and must be provided via const_ikey_ptr when running + * queries. 
+ */ + virtual bool KeyMayMatch(const Slice& key, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) = 0; + + virtual void KeysMayMatch(MultiGetRange* range, const bool no_io, + BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) { + for (auto iter = range->begin(); iter != range->end(); ++iter) { + const Slice ukey_without_ts = iter->ukey_without_ts; + const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; + if (!KeyMayMatch(ukey_without_ts, no_io, &ikey, get_context, + lookup_context, read_options)) { + range->SkipKey(iter); + } + } + } + + /** + * no_io and const_ikey_ptr here means the same as in KeyMayMatch + */ + virtual bool PrefixMayMatch(const Slice& prefix, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) = 0; + + virtual void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + const bool no_io, + BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) { + for (auto iter = range->begin(); iter != range->end(); ++iter) { + const Slice ukey_without_ts = iter->ukey_without_ts; + const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; + if (prefix_extractor->InDomain(ukey_without_ts) && + !PrefixMayMatch(prefix_extractor->Transform(ukey_without_ts), no_io, + &ikey, get_context, lookup_context, read_options)) { + range->SkipKey(iter); + } + } + } + + virtual size_t ApproximateMemoryUsage() const = 0; + + // convert this object to a human readable form + virtual std::string ToString() const { + std::string error_msg("Unsupported filter \n"); + return error_msg; + } + + virtual Status CacheDependencies( + const ReadOptions& /*ro*/, bool /*pin*/, + FilePrefetchBuffer* /* tail_prefetch_buffer */) { + return Status::OK(); + } + + virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/, + const Slice& user_key_without_ts, + const SliceTransform* prefix_extractor, + const Comparator* /*comparator*/, + const Slice* const const_ikey_ptr, + bool* filter_checked, bool need_upper_bound_check, + bool no_io, + BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_block_reader_common.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_block_reader_common.cc new file mode 100644 index 0000000000..32f800db79 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_block_reader_common.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
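+//
+// Reading flow in this translation unit, in brief: GetOrReadFilterBlock()
+// first serves from the pre-loaded filter_block_ member when present;
+// otherwise it calls ReadFilterBlock(), which retrieves the block through
+// the table's block cache (restricted to kBlockCacheTier when no_io is set).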
+// + +#include "table/block_based/filter_block_reader_common.h" + +#include "block_cache.h" +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/parsed_full_filter_block.h" + +namespace ROCKSDB_NAMESPACE { + +template +Status FilterBlockReaderCommon::ReadFilterBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) { + PERF_TIMER_GUARD(read_filter_block_nanos); + + assert(table); + assert(filter_block); + assert(filter_block->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + + const Status s = + table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, + UncompressionDict::GetEmptyDict(), filter_block, + get_context, lookup_context, + /* for_compaction */ false, use_cache, + /* async_read */ false); + + return s; +} + +template +const SliceTransform* +FilterBlockReaderCommon::table_prefix_extractor() const { + assert(table_); + + const BlockBasedTable::Rep* const rep = table_->get_rep(); + assert(rep); + + return rep->prefix_filtering ? rep->table_prefix_extractor.get() : nullptr; +} + +template +bool FilterBlockReaderCommon::whole_key_filtering() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->whole_key_filtering; +} + +template +bool FilterBlockReaderCommon::cache_filter_blocks() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->table_options.cache_index_and_filter_blocks; +} + +template +Status FilterBlockReaderCommon::GetOrReadFilterBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block, + const ReadOptions& read_options) const { + assert(filter_block); + + if (!filter_block_.IsEmpty()) { + filter_block->SetUnownedValue(filter_block_.GetValue()); + return Status::OK(); + } + + ReadOptions ro = read_options; + if (no_io) { + ro.read_tier = kBlockCacheTier; + } + + return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, ro, + cache_filter_blocks(), get_context, lookup_context, + filter_block); +} + +template +size_t FilterBlockReaderCommon::ApproximateFilterBlockMemoryUsage() + const { + assert(!filter_block_.GetOwnValue() || filter_block_.GetValue() != nullptr); + return filter_block_.GetOwnValue() + ? 
filter_block_.GetValue()->ApproximateMemoryUsage() + : 0; +} + +template +bool FilterBlockReaderCommon::RangeMayExist( + const Slice* iterate_upper_bound, const Slice& user_key_without_ts, + const SliceTransform* prefix_extractor, const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, bool no_io, + BlockCacheLookupContext* lookup_context, const ReadOptions& read_options) { + if (!prefix_extractor || !prefix_extractor->InDomain(user_key_without_ts)) { + *filter_checked = false; + return true; + } + Slice prefix = prefix_extractor->Transform(user_key_without_ts); + if (need_upper_bound_check && + !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) { + *filter_checked = false; + return true; + } else { + *filter_checked = true; + return PrefixMayMatch(prefix, no_io, const_ikey_ptr, + /* get_context */ nullptr, lookup_context, + read_options); + } +} + +template +bool FilterBlockReaderCommon::IsFilterCompatible( + const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) const { + // Try to reuse the bloom filter in the SST table if prefix_extractor in + // mutable_cf_options has changed. If range [user_key, upper_bound) all + // share the same prefix then we may still be able to use the bloom filter. + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (iterate_upper_bound != nullptr && prefix_extractor) { + if (!prefix_extractor->InDomain(*iterate_upper_bound)) { + return false; + } + Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound); + // first check if user_key and upper_bound all share the same prefix + if (comparator->CompareWithoutTimestamp(prefix, false, upper_bound_xform, + false) != 0) { + // second check if user_key's prefix is the immediate predecessor of + // upper_bound and have the same length. If so, we know for sure all + // keys in the range [user_key, upper_bound) share the same prefix. + // Also need to make sure upper_bound are full length to ensure + // correctness + if (!full_length_enabled_ || + iterate_upper_bound->size() != prefix_extractor_full_length_ || + !comparator->IsSameLengthImmediateSuccessor(prefix, + *iterate_upper_bound)) { + return false; + } + } + return true; + } else { + return false; + } +} + +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. +template class FilterBlockReaderCommon; +template class FilterBlockReaderCommon; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_block_reader_common.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_block_reader_common.h new file mode 100644 index 0000000000..62335b30be --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_block_reader_common.h @@ -0,0 +1,76 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include + +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBasedTable; +class FilePrefetchBuffer; + +// Encapsulates common functionality for the various filter block reader +// implementations. 
Provides access to the filter block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +template +class FilterBlockReaderCommon : public FilterBlockReader { + public: + FilterBlockReaderCommon(const BlockBasedTable* t, + CachableEntry&& filter_block) + : table_(t), filter_block_(std::move(filter_block)) { + assert(table_); + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (prefix_extractor) { + full_length_enabled_ = + prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_); + } + } + + bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, bool no_io, + BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) override; + + protected: + static Status ReadFilterBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block); + + const BlockBasedTable* table() const { return table_; } + const SliceTransform* table_prefix_extractor() const; + bool whole_key_filtering() const; + bool cache_filter_blocks() const; + + Status GetOrReadFilterBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block, + const ReadOptions& read_options) const; + + size_t ApproximateFilterBlockMemoryUsage() const; + + private: + bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) const; + + private: + const BlockBasedTable* table_; + CachableEntry filter_block_; + size_t prefix_extractor_full_length_ = 0; + bool full_length_enabled_ = false; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_policy.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_policy.cc new file mode 100644 index 0000000000..36f3b16d4b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_policy.cc @@ -0,0 +1,1966 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
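+//
+// Layout note for the built-in filters below: each serialized filter ends
+// in a 5-byte metadata trailer (kMetadataLen). For the format_version=5
+// Bloom filter, for example, the trailer bytes are
+//   [-1 marker][0 sub-implementation][num_probes][0][0]
+// as written at the end of FastLocalBloomBitsBuilder::Finish().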
+ +#include "rocksdb/filter_policy.h" + +#include +#include +#include +#include +#include +#include + +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" +#include "logging/logging.h" +#include "port/lang.h" +#include "rocksdb/convenience.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/object_registry.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/block_based/full_filter_block.h" +#include "util/bloom_impl.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/math.h" +#include "util/ribbon_config.h" +#include "util/ribbon_impl.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Metadata trailer size for built-in filters. (This is separate from +// block-based table block trailer.) +// +// Originally this was 1 byte for num_probes and 4 bytes for number of +// cache lines in the Bloom filter, but now the first trailer byte is +// usually an implementation marker and remaining 4 bytes have various +// meanings. +static constexpr uint32_t kMetadataLen = 5; + +Slice FinishAlwaysFalse(std::unique_ptr* /*buf*/) { + // Missing metadata, treated as zero entries + return Slice(nullptr, 0); +} + +Slice FinishAlwaysTrue(std::unique_ptr* /*buf*/) { + return Slice("\0\0\0\0\0\0", 6); +} + +// Base class for filter builders using the XXH3 preview hash, +// also known as Hash64 or GetSliceHash64. +class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { + public: + explicit XXPH3FilterBitsBuilder( + std::atomic* aggregate_rounding_balance, + std::shared_ptr cache_res_mgr, + bool detect_filter_construct_corruption) + : aggregate_rounding_balance_(aggregate_rounding_balance), + cache_res_mgr_(cache_res_mgr), + detect_filter_construct_corruption_( + detect_filter_construct_corruption) {} + + ~XXPH3FilterBitsBuilder() override {} + + virtual void AddKey(const Slice& key) override { + uint64_t hash = GetSliceHash64(key); + // Especially with prefixes, it is common to have repetition, + // though only adjacent repetition, which we want to immediately + // recognize and collapse for estimating true filter space + // requirements. 
+ if (hash_entries_info_.entries.empty() || + hash != hash_entries_info_.entries.back()) { + if (detect_filter_construct_corruption_) { + hash_entries_info_.xor_checksum ^= hash; + } + hash_entries_info_.entries.push_back(hash); + if (cache_res_mgr_ && + // Traditional rounding to whole bucket size + ((hash_entries_info_.entries.size() % + kUint64tHashEntryCacheResBucketSize) == + kUint64tHashEntryCacheResBucketSize / 2)) { + hash_entries_info_.cache_res_bucket_handles.emplace_back(nullptr); + Status s = cache_res_mgr_->MakeCacheReservation( + kUint64tHashEntryCacheResBucketSize * sizeof(hash), + &hash_entries_info_.cache_res_bucket_handles.back()); + s.PermitUncheckedError(); + } + } + } + + virtual size_t EstimateEntriesAdded() override { + return hash_entries_info_.entries.size(); + } + + virtual Status MaybePostVerify(const Slice& filter_content) override; + + protected: + static constexpr uint32_t kMetadataLen = 5; + + // Number of hash entries to accumulate before charging their memory usage to + // the cache when cache charging is available + static const std::size_t kUint64tHashEntryCacheResBucketSize = + CacheReservationManagerImpl< + CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() / + sizeof(uint64_t); + + // For delegating between XXPH3FilterBitsBuilders + void SwapEntriesWith(XXPH3FilterBitsBuilder* other) { + assert(other != nullptr); + hash_entries_info_.Swap(&(other->hash_entries_info_)); + } + + void ResetEntries() { hash_entries_info_.Reset(); } + + virtual size_t RoundDownUsableSpace(size_t available_size) = 0; + + // To choose size using malloc_usable_size, we have to actually allocate. + size_t AllocateMaybeRounding(size_t target_len_with_metadata, + size_t num_entries, + std::unique_ptr* buf) { + // Return value set to a default; overwritten in some cases + size_t rv = target_len_with_metadata; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + if (aggregate_rounding_balance_ != nullptr) { + // Do optimize_filters_for_memory, using malloc_usable_size. + // Approach: try to keep FP rate balance better than or on + // target (negative aggregate_rounding_balance_). We can then select a + // lower bound filter size (within reasonable limits) that gets us as + // close to on target as possible. We request allocation for that filter + // size and use malloc_usable_size to "round up" to the actual + // allocation size. + + // Although it can be considered bad practice to use malloc_usable_size + // to access an object beyond its original size, this approach should be + // quite general: working for all allocators that properly support + // malloc_usable_size. + + // Race condition on balance is OK because it can only cause temporary + // skew in rounding up vs. rounding down, as long as updates are atomic + // and relative. + int64_t balance = aggregate_rounding_balance_->load(); + + double target_fp_rate = + EstimatedFpRate(num_entries, target_len_with_metadata); + double rv_fp_rate = target_fp_rate; + + if (balance < 0) { + // See formula for BloomFilterPolicy::aggregate_rounding_balance_ + double for_balance_fp_rate = + -balance / double{0x100000000} + target_fp_rate; + + // To simplify, we just try a few modified smaller sizes. This also + // caps how much we vary filter size vs. target, to avoid outlier + // behavior from excessive variance. 
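+        // E.g. for a target length around 64 KiB, the candidate fractions
+        // tried below (3/4, 13/16, 7/8, 15/16) correspond to roughly 48,
+        // 52, 56, and 60 KiB before metadata.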
+ size_t target_len = target_len_with_metadata - kMetadataLen; + assert(target_len < target_len_with_metadata); // check underflow + for (uint64_t maybe_len_rough : + {uint64_t{3} * target_len / 4, uint64_t{13} * target_len / 16, + uint64_t{7} * target_len / 8, uint64_t{15} * target_len / 16}) { + size_t maybe_len_with_metadata = + RoundDownUsableSpace(maybe_len_rough + kMetadataLen); + double maybe_fp_rate = + EstimatedFpRate(num_entries, maybe_len_with_metadata); + if (maybe_fp_rate <= for_balance_fp_rate) { + rv = maybe_len_with_metadata; + rv_fp_rate = maybe_fp_rate; + break; + } + } + } + + // Filter blocks are loaded into block cache with their block trailer. + // We need to make sure that's accounted for in choosing a + // fragmentation-friendly size. + const size_t kExtraPadding = BlockBasedTable::kBlockTrailerSize; + size_t requested = rv + kExtraPadding; + + // Allocate and get usable size + buf->reset(new char[requested]); + size_t usable = malloc_usable_size(buf->get()); + + if (usable - usable / 4 > requested) { + // Ratio greater than 4/3 is too much for utilizing, if it's + // not a buggy or mislinked malloc_usable_size implementation. + // Non-linearity of FP rates with bits/key means rapidly + // diminishing returns in overall accuracy for additional + // storage on disk. + // Nothing to do, except assert that the result is accurate about + // the usable size. (Assignment never used.) + assert(((*buf)[usable - 1] = 'x')); + } else if (usable > requested) { + rv = RoundDownUsableSpace(usable - kExtraPadding); + assert(rv <= usable - kExtraPadding); + rv_fp_rate = EstimatedFpRate(num_entries, rv); + } else { + // Too small means bad malloc_usable_size + assert(usable == requested); + } + memset(buf->get(), 0, rv); + + // Update balance + int64_t diff = static_cast((rv_fp_rate - target_fp_rate) * + double{0x100000000}); + *aggregate_rounding_balance_ += diff; + } else { + buf->reset(new char[rv]()); + } +#else + (void)num_entries; + buf->reset(new char[rv]()); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return rv; + } + + // TODO: Ideally we want to verify the hash entry + // as it is added to the filter and eliminate this function + // for speeding up and leaving fewer spaces for undetected memory/CPU + // corruption. For Ribbon Filter, it's bit harder. + // Possible solution: + // pass a custom iterator that tracks the xor checksum as + // it iterates to ResetAndFindSeedToSolve + Status MaybeVerifyHashEntriesChecksum() { + if (!detect_filter_construct_corruption_) { + return Status::OK(); + } + + uint64_t actual_hash_entries_xor_checksum = 0; + for (uint64_t h : hash_entries_info_.entries) { + actual_hash_entries_xor_checksum ^= h; + } + + if (actual_hash_entries_xor_checksum == hash_entries_info_.xor_checksum) { + return Status::OK(); + } else { + // Since these hash entries are corrupted and they will not be used + // anymore, we can reset them and release memory. + ResetEntries(); + return Status::Corruption("Filter's hash entries checksum mismatched"); + } + } + + // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr, + // always "round up" like historic behavior. 
+ std::atomic* aggregate_rounding_balance_; + + // For reserving memory used in (new) Bloom and Ribbon Filter construction + std::shared_ptr cache_res_mgr_; + + // For managing cache charge for final filter in (new) Bloom and Ribbon + // Filter construction + std::deque> + final_filter_cache_res_handles_; + + bool detect_filter_construct_corruption_; + + struct HashEntriesInfo { + // A deque avoids unnecessary copying of already-saved values + // and has near-minimal peak memory use. + std::deque entries; + + // If cache_res_mgr_ != nullptr, + // it manages cache charge for buckets of hash entries in (new) Bloom + // or Ribbon Filter construction. + // Otherwise, it is empty. + std::deque> + cache_res_bucket_handles; + + // If detect_filter_construct_corruption_ == true, + // it records the xor checksum of hash entries. + // Otherwise, it is 0. + uint64_t xor_checksum = 0; + + void Swap(HashEntriesInfo* other) { + assert(other != nullptr); + std::swap(entries, other->entries); + std::swap(cache_res_bucket_handles, other->cache_res_bucket_handles); + std::swap(xor_checksum, other->xor_checksum); + } + + void Reset() { + entries.clear(); + cache_res_bucket_handles.clear(); + xor_checksum = 0; + } + }; + + HashEntriesInfo hash_entries_info_; +}; + +// #################### FastLocalBloom implementation ################## // +// ############## also known as format_version=5 Bloom filter ########## // + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { + public: + // Non-null aggregate_rounding_balance implies optimize_filters_for_memory + explicit FastLocalBloomBitsBuilder( + const int millibits_per_key, + std::atomic* aggregate_rounding_balance, + std::shared_ptr cache_res_mgr, + bool detect_filter_construct_corruption) + : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr, + detect_filter_construct_corruption), + millibits_per_key_(millibits_per_key) { + assert(millibits_per_key >= 1000); + } + + // No Copy allowed + FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete; + void operator=(const FastLocalBloomBitsBuilder&) = delete; + + ~FastLocalBloomBitsBuilder() override {} + + using FilterBitsBuilder::Finish; + + virtual Slice Finish(std::unique_ptr* buf) override { + return Finish(buf, nullptr); + } + + virtual Slice Finish(std::unique_ptr* buf, + Status* status) override { + size_t num_entries = hash_entries_info_.entries.size(); + size_t len_with_metadata = CalculateSpace(num_entries); + + std::unique_ptr mutable_buf; + std::unique_ptr + final_filter_cache_res_handle; + len_with_metadata = + AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + // Cache charging for mutable_buf + if (cache_res_mgr_) { + Status s = cache_res_mgr_->MakeCacheReservation( + len_with_metadata * sizeof(char), &final_filter_cache_res_handle); + s.PermitUncheckedError(); + } + + assert(mutable_buf); + assert(len_with_metadata >= kMetadataLen); + + // Max size supported by implementation + assert(len_with_metadata <= 0xffffffffU); + + // Compute num_probes after any rounding / adjustments + int num_probes = GetNumProbes(num_entries, len_with_metadata); + + uint32_t len = static_cast(len_with_metadata - kMetadataLen); + if (len > 0) { + TEST_SYNC_POINT_CALLBACK( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries", + &hash_entries_info_.entries); + AddAllEntries(mutable_buf.get(), len, num_probes); + Status verify_hash_entries_checksum_status = + MaybeVerifyHashEntriesChecksum(); + if 
(!verify_hash_entries_checksum_status.ok()) { + if (status) { + *status = verify_hash_entries_checksum_status; + } + return FinishAlwaysTrue(buf); + } + } + + bool keep_entries_for_postverify = detect_filter_construct_corruption_; + if (!keep_entries_for_postverify) { + ResetEntries(); + } + + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -1 = Marker for newer Bloom implementations + mutable_buf[len] = static_cast(-1); + // 0 = Marker for this sub-implementation + mutable_buf[len + 1] = static_cast(0); + // num_probes (and 0 in upper bits for 64-byte block size) + mutable_buf[len + 2] = static_cast(num_probes); + // rest of metadata stays zero + + auto TEST_arg_pair __attribute__((__unused__)) = + std::make_pair(&mutable_buf, len_with_metadata); + TEST_SYNC_POINT_CALLBACK("XXPH3FilterBitsBuilder::Finish::TamperFilter", + &TEST_arg_pair); + + Slice rv(mutable_buf.get(), len_with_metadata); + *buf = std::move(mutable_buf); + final_filter_cache_res_handles_.push_back( + std::move(final_filter_cache_res_handle)); + if (status) { + *status = Status::OK(); + } + return rv; + } + + size_t ApproximateNumEntries(size_t bytes) override { + size_t bytes_no_meta = + bytes >= kMetadataLen ? RoundDownUsableSpace(bytes) - kMetadataLen : 0; + return static_cast(uint64_t{8000} * bytes_no_meta / + millibits_per_key_); + } + + size_t CalculateSpace(size_t num_entries) override { + // If not for cache line blocks in the filter, what would the target + // length in bytes be? + size_t raw_target_len = static_cast( + (uint64_t{num_entries} * millibits_per_key_ + 7999) / 8000); + + if (raw_target_len >= size_t{0xffffffc0}) { + // Max supported for this data structure implementation + raw_target_len = size_t{0xffffffc0}; + } + + // Round up to nearest multiple of 64 (block size). This adjustment is + // used for target FP rate only so that we don't receive complaints about + // lower FP rate vs. historic Bloom filter behavior. + return ((raw_target_len + 63) & ~size_t{63}) + kMetadataLen; + } + + double EstimatedFpRate(size_t keys, size_t len_with_metadata) override { + int num_probes = GetNumProbes(keys, len_with_metadata); + return FastLocalBloomImpl::EstimatedFpRate( + keys, len_with_metadata - kMetadataLen, num_probes, /*hash bits*/ 64); + } + + protected: + size_t RoundDownUsableSpace(size_t available_size) override { + size_t rv = available_size - kMetadataLen; + + if (rv >= size_t{0xffffffc0}) { + // Max supported for this data structure implementation + rv = size_t{0xffffffc0}; + } + + // round down to multiple of 64 (block size) + rv &= ~size_t{63}; + + return rv + kMetadataLen; + } + + private: + // Compute num_probes after any rounding / adjustments + int GetNumProbes(size_t keys, size_t len_with_metadata) { + uint64_t millibits = uint64_t{len_with_metadata - kMetadataLen} * 8000; + int actual_millibits_per_key = + static_cast(millibits / std::max(keys, size_t{1})); + // BEGIN XXX/TODO(peterd): preserving old/default behavior for now to + // minimize unit test churn. Remove this some time. 
+ if (!aggregate_rounding_balance_) { + actual_millibits_per_key = millibits_per_key_; + } + // END XXX/TODO + return FastLocalBloomImpl::ChooseNumProbes(actual_millibits_per_key); + } + + void AddAllEntries(char* data, uint32_t len, int num_probes) { + // Simple version without prefetching: + // + // for (auto h : hash_entries_info_.entries) { + // FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len, + // num_probes, data); + // } + + const size_t num_entries = hash_entries_info_.entries.size(); + constexpr size_t kBufferMask = 7; + static_assert(((kBufferMask + 1) & kBufferMask) == 0, + "Must be power of 2 minus 1"); + + std::array hashes; + std::array byte_offsets; + + // Prime the buffer + size_t i = 0; + std::deque::iterator hash_entries_it = + hash_entries_info_.entries.begin(); + for (; i <= kBufferMask && i < num_entries; ++i) { + uint64_t h = *hash_entries_it; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, + /*out*/ &byte_offsets[i]); + hashes[i] = Upper32of64(h); + ++hash_entries_it; + } + + // Process and buffer + for (; i < num_entries; ++i) { + uint32_t& hash_ref = hashes[i & kBufferMask]; + uint32_t& byte_offset_ref = byte_offsets[i & kBufferMask]; + // Process (add) + FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes, + data + byte_offset_ref); + // And buffer + uint64_t h = *hash_entries_it; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, + /*out*/ &byte_offset_ref); + hash_ref = Upper32of64(h); + ++hash_entries_it; + } + + // Finish processing + for (i = 0; i <= kBufferMask && i < num_entries; ++i) { + FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes, + data + byte_offsets[i]); + } + } + + // Target allocation per added key, in thousandths of a bit. + int millibits_per_key_; +}; + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsReader : public BuiltinFilterBitsReader { + public: + FastLocalBloomBitsReader(const char* data, int num_probes, uint32_t len_bytes) + : data_(data), num_probes_(num_probes), len_bytes_(len_bytes) {} + + // No Copy allowed + FastLocalBloomBitsReader(const FastLocalBloomBitsReader&) = delete; + void operator=(const FastLocalBloomBitsReader&) = delete; + + ~FastLocalBloomBitsReader() override {} + + bool MayMatch(const Slice& key) override { + uint64_t h = GetSliceHash64(key); + uint32_t byte_offset; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, + /*out*/ &byte_offset); + return FastLocalBloomImpl::HashMayMatchPrepared(Upper32of64(h), num_probes_, + data_ + byte_offset); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + std::array hashes; + std::array byte_offsets; + for (int i = 0; i < num_keys; ++i) { + uint64_t h = GetSliceHash64(*keys[i]); + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, + /*out*/ &byte_offsets[i]); + hashes[i] = Upper32of64(h); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = FastLocalBloomImpl::HashMayMatchPrepared( + hashes[i], num_probes_, data_ + byte_offsets[i]); + } + } + + bool HashMayMatch(const uint64_t h) override { + return FastLocalBloomImpl::HashMayMatch(Lower32of64(h), Upper32of64(h), + len_bytes_, num_probes_, data_); + } + + private: + const char* data_; + const int num_probes_; + const uint32_t len_bytes_; +}; + +// ##################### Ribbon filter implementation ################### // + +// Implements concept RehasherTypesAndSettings in ribbon_impl.h +struct Standard128RibbonRehasherTypesAndSettings { + // These are schema-critical. 
Any change almost certainly changes + // underlying data. + static constexpr bool kIsFilter = true; + static constexpr bool kHomogeneous = false; + static constexpr bool kFirstCoeffAlwaysOne = true; + static constexpr bool kUseSmash = false; + using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128; + using Hash = uint64_t; + using Seed = uint32_t; + // Changing these doesn't necessarily change underlying data, + // but might affect supported scalability of those dimensions. + using Index = uint32_t; + using ResultRow = uint32_t; + // Save a conditional in Ribbon queries + static constexpr bool kAllowZeroStarts = false; +}; + +using Standard128RibbonTypesAndSettings = + ribbon::StandardRehasherAdapter; + +class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { + public: + explicit Standard128RibbonBitsBuilder( + double desired_one_in_fp_rate, int bloom_millibits_per_key, + std::atomic* aggregate_rounding_balance, + std::shared_ptr cache_res_mgr, + bool detect_filter_construct_corruption, Logger* info_log) + : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr, + detect_filter_construct_corruption), + desired_one_in_fp_rate_(desired_one_in_fp_rate), + info_log_(info_log), + bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance, + cache_res_mgr, detect_filter_construct_corruption) { + assert(desired_one_in_fp_rate >= 1.0); + } + + // No Copy allowed + Standard128RibbonBitsBuilder(const Standard128RibbonBitsBuilder&) = delete; + void operator=(const Standard128RibbonBitsBuilder&) = delete; + + ~Standard128RibbonBitsBuilder() override {} + + using FilterBitsBuilder::Finish; + + virtual Slice Finish(std::unique_ptr* buf) override { + return Finish(buf, nullptr); + } + + virtual Slice Finish(std::unique_ptr* buf, + Status* status) override { + if (hash_entries_info_.entries.size() > kMaxRibbonEntries) { + ROCKS_LOG_WARN( + info_log_, "Too many keys for Ribbon filter: %llu", + static_cast(hash_entries_info_.entries.size())); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_info_.entries.empty()); + return bloom_fallback_.Finish(buf, status); + } + if (hash_entries_info_.entries.size() == 0) { + // Save a conditional in Ribbon queries by using alternate reader + // for zero entries added. 
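+      // (FinishAlwaysFalse returns an empty Slice with no metadata, which
+      // the reader side treats as a filter that matches no keys.)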
+ if (status) { + *status = Status::OK(); + } + return FinishAlwaysFalse(buf); + } + uint32_t num_entries = + static_cast(hash_entries_info_.entries.size()); + uint32_t num_slots; + size_t len_with_metadata; + + CalculateSpaceAndSlots(num_entries, &len_with_metadata, &num_slots); + + // Bloom fall-back indicator + if (num_slots == 0) { + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_info_.entries.empty()); + return bloom_fallback_.Finish(buf, status); + } + + uint32_t entropy = 0; + if (!hash_entries_info_.entries.empty()) { + entropy = Lower32of64(hash_entries_info_.entries.front()); + } + + BandingType banding; + std::size_t bytes_banding = ribbon::StandardBanding< + Standard128RibbonTypesAndSettings>::EstimateMemoryUsage(num_slots); + Status status_banding_cache_res = Status::OK(); + + // Cache charging for banding + std::unique_ptr + banding_res_handle; + if (cache_res_mgr_) { + status_banding_cache_res = cache_res_mgr_->MakeCacheReservation( + bytes_banding, &banding_res_handle); + } + + if (status_banding_cache_res.IsMemoryLimit()) { + ROCKS_LOG_WARN(info_log_, + "Cache charging for Ribbon filter banding failed due " + "to cache full"); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_info_.entries.empty()); + // Release cache for banding since the banding won't be allocated + banding_res_handle.reset(); + return bloom_fallback_.Finish(buf, status); + } + + TEST_SYNC_POINT_CALLBACK( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries", + &hash_entries_info_.entries); + + bool success = banding.ResetAndFindSeedToSolve( + num_slots, hash_entries_info_.entries.begin(), + hash_entries_info_.entries.end(), + /*starting seed*/ entropy & 255, /*seed mask*/ 255); + if (!success) { + ROCKS_LOG_WARN( + info_log_, "Too many re-seeds (256) for Ribbon filter, %llu / %llu", + static_cast(hash_entries_info_.entries.size()), + static_cast(num_slots)); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_info_.entries.empty()); + return bloom_fallback_.Finish(buf, status); + } + + Status verify_hash_entries_checksum_status = + MaybeVerifyHashEntriesChecksum(); + if (!verify_hash_entries_checksum_status.ok()) { + ROCKS_LOG_WARN(info_log_, "Verify hash entries checksum error: %s", + verify_hash_entries_checksum_status.getState()); + if (status) { + *status = verify_hash_entries_checksum_status; + } + return FinishAlwaysTrue(buf); + } + + bool keep_entries_for_postverify = detect_filter_construct_corruption_; + if (!keep_entries_for_postverify) { + ResetEntries(); + } + + uint32_t seed = banding.GetOrdinalSeed(); + assert(seed < 256); + + std::unique_ptr mutable_buf; + std::unique_ptr + final_filter_cache_res_handle; + len_with_metadata = + AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + // Cache charging for mutable_buf + if (cache_res_mgr_) { + Status s = cache_res_mgr_->MakeCacheReservation( + len_with_metadata * sizeof(char), &final_filter_cache_res_handle); + s.PermitUncheckedError(); + } + + SolnType soln(mutable_buf.get(), len_with_metadata); + soln.BackSubstFrom(banding); + uint32_t num_blocks = soln.GetNumBlocks(); + // This should be guaranteed: + // num_entries < 2^30 + // => (overhead_factor < 2.0) + // num_entries * overhead_factor == num_slots < 2^31 + // => (num_blocks = num_slots / 128) + // num_blocks < 2^24 + assert(num_blocks < 0x1000000U); + + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -2 = Marker for Standard128 Ribbon + mutable_buf[len_with_metadata - 5] = static_cast(-2); + // Hash seed + 
mutable_buf[len_with_metadata - 4] = static_cast(seed); + // Number of blocks, in 24 bits + // (Along with bytes, we can derive other settings) + mutable_buf[len_with_metadata - 3] = static_cast(num_blocks & 255); + mutable_buf[len_with_metadata - 2] = + static_cast((num_blocks >> 8) & 255); + mutable_buf[len_with_metadata - 1] = + static_cast((num_blocks >> 16) & 255); + + auto TEST_arg_pair __attribute__((__unused__)) = + std::make_pair(&mutable_buf, len_with_metadata); + TEST_SYNC_POINT_CALLBACK("XXPH3FilterBitsBuilder::Finish::TamperFilter", + &TEST_arg_pair); + + Slice rv(mutable_buf.get(), len_with_metadata); + *buf = std::move(mutable_buf); + final_filter_cache_res_handles_.push_back( + std::move(final_filter_cache_res_handle)); + if (status) { + *status = Status::OK(); + } + return rv; + } + + // Setting num_slots to 0 means "fall back on Bloom filter." + // And note this implementation does not support num_entries or num_slots + // beyond uint32_t; see kMaxRibbonEntries. + void CalculateSpaceAndSlots(size_t num_entries, + size_t* target_len_with_metadata, + uint32_t* num_slots) { + if (num_entries > kMaxRibbonEntries) { + // More entries than supported by this Ribbon + *num_slots = 0; // use Bloom + *target_len_with_metadata = bloom_fallback_.CalculateSpace(num_entries); + return; + } + uint32_t entropy = 0; + if (!hash_entries_info_.entries.empty()) { + entropy = Upper32of64(hash_entries_info_.entries.front()); + } + + *num_slots = NumEntriesToNumSlots(static_cast(num_entries)); + *target_len_with_metadata = + SolnType::GetBytesForOneInFpRate(*num_slots, desired_one_in_fp_rate_, + /*rounding*/ entropy) + + kMetadataLen; + + // Consider possible Bloom fallback for small filters + if (*num_slots < 1024) { + size_t bloom = bloom_fallback_.CalculateSpace(num_entries); + if (bloom < *target_len_with_metadata) { + *num_slots = 0; // use Bloom + *target_len_with_metadata = bloom; + return; + } + } + } + + size_t CalculateSpace(size_t num_entries) override { + if (num_entries == 0) { + // See FinishAlwaysFalse + return 0; + } + size_t target_len_with_metadata; + uint32_t num_slots; + CalculateSpaceAndSlots(num_entries, &target_len_with_metadata, &num_slots); + (void)num_slots; + return target_len_with_metadata; + } + + // This is a somewhat ugly but reasonably fast and reasonably accurate + // reversal of CalculateSpace. + size_t ApproximateNumEntries(size_t bytes) override { + size_t len_no_metadata = + RoundDownUsableSpace(std::max(bytes, size_t{kMetadataLen})) - + kMetadataLen; + + if (!(desired_one_in_fp_rate_ > 1.0)) { + // Effectively asking for 100% FP rate, or NaN etc. + // Note that NaN is neither < 1.0 nor > 1.0 + return kMaxRibbonEntries; + } + + // Find a slight under-estimate for actual average bits per slot + double min_real_bits_per_slot; + if (desired_one_in_fp_rate_ >= 1.0 + std::numeric_limits::max()) { + // Max of 32 solution columns (result bits) + min_real_bits_per_slot = 32.0; + } else { + // Account for mix of b and b+1 solution columns being slightly + // suboptimal vs. ideal log2(1/fp_rate) bits. 
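+      // (Worked example, illustrative and not in the original source: for
+      // desired_one_in_fp_rate_ = 100, rounded = 100 and upper_bits_per_key
+      // = 1 + FloorLog2(100) = 7, so fp_rate_for_upper = 2^-7 = 1/128 and
+      // portion_lower = (1/100 - 1/128) / (1/128) = 0.28, giving
+      // min_real_bits_per_slot = 7 - 0.28 = 6.72.)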
+ uint32_t rounded = static_cast(desired_one_in_fp_rate_); + int upper_bits_per_key = 1 + FloorLog2(rounded); + double fp_rate_for_upper = std::pow(2.0, -upper_bits_per_key); + double portion_lower = + (1.0 / desired_one_in_fp_rate_ - fp_rate_for_upper) / + fp_rate_for_upper; + min_real_bits_per_slot = upper_bits_per_key - portion_lower; + assert(min_real_bits_per_slot > 0.0); + assert(min_real_bits_per_slot <= 32.0); + } + + // An overestimate, but this should only be O(1) slots away from truth. + double max_slots = len_no_metadata * 8.0 / min_real_bits_per_slot; + + // Let's not bother accounting for overflow to Bloom filter + // (Includes NaN case) + if (!(max_slots < ConfigHelper::GetNumSlots(kMaxRibbonEntries))) { + return kMaxRibbonEntries; + } + + // Set up for short iteration + uint32_t slots = static_cast(max_slots); + slots = SolnType::RoundUpNumSlots(slots); + + // Assert that we have a valid upper bound on slots + assert(SolnType::GetBytesForOneInFpRate( + SolnType::RoundUpNumSlots(slots + 1), desired_one_in_fp_rate_, + /*rounding*/ 0) > len_no_metadata); + + // Iterate up to a few times to rather precisely account for small effects + for (int i = 0; slots > 0; ++i) { + size_t reqd_bytes = + SolnType::GetBytesForOneInFpRate(slots, desired_one_in_fp_rate_, + /*rounding*/ 0); + if (reqd_bytes <= len_no_metadata) { + break; // done + } + if (i >= 2) { + // should have been enough iterations + assert(false); + break; + } + slots = SolnType::RoundDownNumSlots(slots - 1); + } + + uint32_t num_entries = ConfigHelper::GetNumToAdd(slots); + + // Consider possible Bloom fallback for small filters + if (slots < 1024) { + size_t bloom = bloom_fallback_.ApproximateNumEntries(bytes); + if (bloom > num_entries) { + return bloom; + } else { + return num_entries; + } + } else { + return std::min(num_entries, kMaxRibbonEntries); + } + } + + double EstimatedFpRate(size_t num_entries, + size_t len_with_metadata) override { + if (num_entries > kMaxRibbonEntries) { + // More entries than supported by this Ribbon + return bloom_fallback_.EstimatedFpRate(num_entries, len_with_metadata); + } + uint32_t num_slots = + NumEntriesToNumSlots(static_cast(num_entries)); + SolnType fake_soln(nullptr, len_with_metadata); + fake_soln.ConfigureForNumSlots(num_slots); + return fake_soln.ExpectedFpRate(); + } + + Status MaybePostVerify(const Slice& filter_content) override { + bool fall_back = (bloom_fallback_.EstimateEntriesAdded() > 0); + return fall_back ? bloom_fallback_.MaybePostVerify(filter_content) + : XXPH3FilterBitsBuilder::MaybePostVerify(filter_content); + } + + protected: + size_t RoundDownUsableSpace(size_t available_size) override { + size_t rv = available_size - kMetadataLen; + + // round down to multiple of 16 (segment size) + rv &= ~size_t{15}; + + return rv + kMetadataLen; + } + + private: + using TS = Standard128RibbonTypesAndSettings; + using SolnType = ribbon::SerializableInterleavedSolution; + using BandingType = ribbon::StandardBanding; + using ConfigHelper = ribbon::BandingConfigHelper1TS; + + static uint32_t NumEntriesToNumSlots(uint32_t num_entries) { + uint32_t num_slots1 = ConfigHelper::GetNumSlots(num_entries); + return SolnType::RoundUpNumSlots(num_slots1); + } + + // Approximate num_entries to ensure number of bytes fits in 32 bits, which + // is not an inherent limitation but does ensure somewhat graceful Bloom + // fallback for crazy high number of entries, since the Bloom implementation + // does not support number of bytes bigger than fits in 32 bits. 
This is
+  // within an order of magnitude of implementation limit on num_slots
+  // fitting in 32 bits, and even closer for num_blocks fitting in 24 bits
+  // (for filter metadata).
+  static constexpr uint32_t kMaxRibbonEntries = 950000000;  // ~ 1 billion
+
+  // A desired value for 1/fp_rate. For example, 100 -> 1% fp rate.
+  double desired_one_in_fp_rate_;
+
+  // For warnings, or can be nullptr
+  Logger* info_log_;
+
+  // For falling back on Bloom filter in some exceptional cases and
+  // very small filter cases
+  FastLocalBloomBitsBuilder bloom_fallback_;
+};
+
+// for the linker, at least with DEBUG_LEVEL=2
+constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries;
+
+class Standard128RibbonBitsReader : public BuiltinFilterBitsReader {
+ public:
+  Standard128RibbonBitsReader(const char* data, size_t len_bytes,
+                              uint32_t num_blocks, uint32_t seed)
+      : soln_(const_cast<char*>(data), len_bytes) {
+    soln_.ConfigureForNumBlocks(num_blocks);
+    hasher_.SetOrdinalSeed(seed);
+  }
+
+  // No Copy allowed
+  Standard128RibbonBitsReader(const Standard128RibbonBitsReader&) = delete;
+  void operator=(const Standard128RibbonBitsReader&) = delete;
+
+  ~Standard128RibbonBitsReader() override {}
+
+  bool MayMatch(const Slice& key) override {
+    uint64_t h = GetSliceHash64(key);
+    return soln_.FilterQuery(h, hasher_);
+  }
+
+  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
+    struct SavedData {
+      uint64_t seeded_hash;
+      uint32_t segment_num;
+      uint32_t num_columns;
+      uint32_t start_bits;
+    };
+    std::array<SavedData, MultiGetContext::MAX_BATCH_SIZE> saved;
+    for (int i = 0; i < num_keys; ++i) {
+      ribbon::InterleavedPrepareQuery(
+          GetSliceHash64(*keys[i]), hasher_, soln_, &saved[i].seeded_hash,
+          &saved[i].segment_num, &saved[i].num_columns, &saved[i].start_bits);
+    }
+    for (int i = 0; i < num_keys; ++i) {
+      may_match[i] = ribbon::InterleavedFilterQuery(
+          saved[i].seeded_hash, saved[i].segment_num, saved[i].num_columns,
+          saved[i].start_bits, hasher_, soln_);
+    }
+  }
+
+  bool HashMayMatch(const uint64_t h) override {
+    return soln_.FilterQuery(h, hasher_);
+  }
+
+ private:
+  using TS = Standard128RibbonTypesAndSettings;
+  ribbon::SerializableInterleavedSolution<TS> soln_;
+  ribbon::StandardHasher<TS> hasher_;
+};
+
+// ##################### Legacy Bloom implementation ################### //
+
+using LegacyBloomImpl = LegacyLocalityBloomImpl</*ExtraRotates*/ false>;
+
+class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder {
+ public:
+  explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log);
+
+  // No Copy allowed
+  LegacyBloomBitsBuilder(const LegacyBloomBitsBuilder&) = delete;
+  void operator=(const LegacyBloomBitsBuilder&) = delete;
+
+  ~LegacyBloomBitsBuilder() override;
+
+  void AddKey(const Slice& key) override;
+
+  virtual size_t EstimateEntriesAdded() override {
+    return hash_entries_.size();
+  }
+
+  using FilterBitsBuilder::Finish;
+
+  Slice Finish(std::unique_ptr<const char[]>* buf) override;
+
+  size_t CalculateSpace(size_t num_entries) override {
+    uint32_t dont_care1;
+    uint32_t dont_care2;
+    return CalculateSpace(num_entries, &dont_care1, &dont_care2);
+  }
+
+  double EstimatedFpRate(size_t keys, size_t bytes) override {
+    return LegacyBloomImpl::EstimatedFpRate(keys, bytes - kMetadataLen,
+                                            num_probes_);
+  }
+
+  size_t ApproximateNumEntries(size_t bytes) override;
+
+ private:
+  int bits_per_key_;
+  int num_probes_;
+  std::vector<uint32_t> hash_entries_;
+  Logger* info_log_;
+
+  // Get total bits, optimized for CPU cache line locality
+  uint32_t GetTotalBitsForLocality(uint32_t total_bits);
+
+  // Reserve space for new filter
+  char*
ReserveSpace(size_t num_entries, uint32_t* total_bits, + uint32_t* num_lines); + + // Implementation-specific variant of public CalculateSpace + uint32_t CalculateSpace(size_t num_entries, uint32_t* total_bits, + uint32_t* num_lines); + + // Assuming single threaded access to this function. + void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); +}; + +LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key, + Logger* info_log) + : bits_per_key_(bits_per_key), + num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)), + info_log_(info_log) { + assert(bits_per_key_); +} + +LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() {} + +void LegacyBloomBitsBuilder::AddKey(const Slice& key) { + uint32_t hash = BloomHash(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } +} + +Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr* buf) { + uint32_t total_bits, num_lines; + size_t num_entries = hash_entries_.size(); + char* data = + ReserveSpace(static_cast(num_entries), &total_bits, &num_lines); + assert(data); + + if (total_bits != 0 && num_lines != 0) { + for (auto h : hash_entries_) { + AddHash(h, data, num_lines, total_bits); + } + + // Check for excessive entries for 32-bit hash function + if (num_entries >= /* minimum of 3 million */ 3000000U) { + // More specifically, we can detect that the 32-bit hash function + // is causing significant increase in FP rate by comparing current + // estimated FP rate to what we would get with a normal number of + // keys at same memory ratio. + double est_fp_rate = LegacyBloomImpl::EstimatedFpRate( + num_entries, total_bits / 8, num_probes_); + double vs_fp_rate = LegacyBloomImpl::EstimatedFpRate( + 1U << 16, (1U << 16) * bits_per_key_ / 8, num_probes_); + + if (est_fp_rate >= 1.50 * vs_fp_rate) { + // For more details, see + // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter + ROCKS_LOG_WARN( + info_log_, + "Using legacy SST/BBT Bloom filter with excessive key count " + "(%.1fM @ %dbpk), causing estimated %.1fx higher filter FP rate. " + "Consider using new Bloom with format_version>=5, smaller SST " + "file size, or partitioned filters.", + num_entries / 1000000.0, bits_per_key_, est_fp_rate / vs_fp_rate); + } + } + } + // See BloomFilterPolicy::GetFilterBitsReader for metadata + data[total_bits / 8] = static_cast(num_probes_); + EncodeFixed32(data + total_bits / 8 + 1, static_cast(num_lines)); + + const char* const_data = data; + buf->reset(const_data); + hash_entries_.clear(); + + return Slice(data, total_bits / 8 + kMetadataLen); +} + +size_t LegacyBloomBitsBuilder::ApproximateNumEntries(size_t bytes) { + assert(bits_per_key_); + assert(bytes > 0); + + uint64_t total_bits_tmp = bytes * 8; + // total bits, including temporary computations, cannot exceed 2^32 + // for compatibility + total_bits_tmp = std::min(total_bits_tmp, uint64_t{0xffff0000}); + + uint32_t high = static_cast(total_bits_tmp) / + static_cast(bits_per_key_) + + 1; + uint32_t low = 1; + uint32_t n = high; + for (; n >= low; n--) { + if (CalculateSpace(n) <= bytes) { + break; + } + } + return n; +} + +uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_lines = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_lines an odd number to make sure more bits are involved + // when determining which block. 
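+  // (Worked example, illustrative and not in the original source: with
+  // 64-byte cache lines (512 bits), total_bits = 100000 gives
+  // (100000 + 511) / 512 = 196 lines; 196 is even, so it is bumped to 197
+  // and the function returns 197 * 512 = 100864 bits.)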
+  if (num_lines % 2 == 0) {
+    num_lines++;
+  }
+  return num_lines * (CACHE_LINE_SIZE * 8);
+}
+
+uint32_t LegacyBloomBitsBuilder::CalculateSpace(size_t num_entries,
+                                                uint32_t* total_bits,
+                                                uint32_t* num_lines) {
+  assert(bits_per_key_);
+  if (num_entries != 0) {
+    size_t total_bits_tmp = num_entries * bits_per_key_;
+    // total bits, including temporary computations, cannot exceed 2^32
+    // for compatibility
+    total_bits_tmp = std::min(total_bits_tmp, size_t{0xffff0000});
+
+    *total_bits =
+        GetTotalBitsForLocality(static_cast<uint32_t>(total_bits_tmp));
+    *num_lines = *total_bits / (CACHE_LINE_SIZE * 8);
+    assert(*total_bits > 0 && *total_bits % 8 == 0);
+  } else {
+    // filter is empty, just leave space for metadata
+    *total_bits = 0;
+    *num_lines = 0;
+  }
+
+  // Reserve space for Filter
+  uint32_t sz = *total_bits / 8;
+  sz += kMetadataLen;  // 4 bytes for num_lines, 1 byte for num_probes
+  return sz;
+}
+
+char* LegacyBloomBitsBuilder::ReserveSpace(size_t num_entries,
+                                           uint32_t* total_bits,
+                                           uint32_t* num_lines) {
+  uint32_t sz = CalculateSpace(num_entries, total_bits, num_lines);
+  char* data = new char[sz];
+  memset(data, 0, sz);
+  return data;
+}
+
+inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data,
+                                            uint32_t num_lines,
+                                            uint32_t total_bits) {
+#ifdef NDEBUG
+  static_cast<void>(total_bits);
+#endif
+  assert(num_lines > 0 && total_bits > 0);
+
+  LegacyBloomImpl::AddHash(h, num_lines, num_probes_, data,
+                           ConstexprFloorLog2(CACHE_LINE_SIZE));
+}
+
+class LegacyBloomBitsReader : public BuiltinFilterBitsReader {
+ public:
+  LegacyBloomBitsReader(const char* data, int num_probes, uint32_t num_lines,
+                        uint32_t log2_cache_line_size)
+      : data_(data),
+        num_probes_(num_probes),
+        num_lines_(num_lines),
+        log2_cache_line_size_(log2_cache_line_size) {}
+
+  // No Copy allowed
+  LegacyBloomBitsReader(const LegacyBloomBitsReader&) = delete;
+  void operator=(const LegacyBloomBitsReader&) = delete;
+
+  ~LegacyBloomBitsReader() override {}
+
+  // "contents" contains the data built by a preceding call to
+  // FilterBitsBuilder::Finish. MayMatch must return true if the key was
+  // passed to FilterBitsBuilder::AddKey. This method may return true or false
+  // if the key was not on the list, but it should aim to return false with a
+  // high probability.
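+  // (In other words, the standard Bloom filter contract: no false negatives
+  // for keys that were added, and false positives for other keys kept to a
+  // low probability rather than ruled out.)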
+  bool MayMatch(const Slice& key) override {
+    uint32_t hash = BloomHash(key);
+    uint32_t byte_offset;
+    LegacyBloomImpl::PrepareHashMayMatch(
+        hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_);
+    return LegacyBloomImpl::HashMayMatchPrepared(
+        hash, num_probes_, data_ + byte_offset, log2_cache_line_size_);
+  }
+
+  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
+    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
+    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
+    for (int i = 0; i < num_keys; ++i) {
+      hashes[i] = BloomHash(*keys[i]);
+      LegacyBloomImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_,
+                                           /*out*/ &byte_offsets[i],
+                                           log2_cache_line_size_);
+    }
+    for (int i = 0; i < num_keys; ++i) {
+      may_match[i] = LegacyBloomImpl::HashMayMatchPrepared(
+          hashes[i], num_probes_, data_ + byte_offsets[i],
+          log2_cache_line_size_);
+    }
+  }
+
+  bool HashMayMatch(const uint64_t /* h */) override { return false; }
+
+ private:
+  const char* data_;
+  const int num_probes_;
+  const uint32_t num_lines_;
+  const uint32_t log2_cache_line_size_;
+};
+
+class AlwaysTrueFilter : public BuiltinFilterBitsReader {
+ public:
+  bool MayMatch(const Slice&) override { return true; }
+  using FilterBitsReader::MayMatch;  // inherit overload
+  bool HashMayMatch(const uint64_t) override { return true; }
+  using BuiltinFilterBitsReader::HashMayMatch;  // inherit overload
+};
+
+class AlwaysFalseFilter : public BuiltinFilterBitsReader {
+ public:
+  bool MayMatch(const Slice&) override { return false; }
+  using FilterBitsReader::MayMatch;  // inherit overload
+  bool HashMayMatch(const uint64_t) override { return false; }
+  using BuiltinFilterBitsReader::HashMayMatch;  // inherit overload
+};
+
+Status XXPH3FilterBitsBuilder::MaybePostVerify(const Slice& filter_content) {
+  Status s = Status::OK();
+
+  if (!detect_filter_construct_corruption_) {
+    return s;
+  }
+
+  std::unique_ptr<BuiltinFilterBitsReader> bits_reader(
+      BuiltinFilterPolicy::GetBuiltinFilterBitsReader(filter_content));
+
+  for (uint64_t h : hash_entries_info_.entries) {
+    // The current approach will not detect corruption from XXPH3Filter to
+    // AlwaysTrueFilter, which can lead to performance cost later due to
+    // AlwaysTrueFilter not filtering anything. But this cost is acceptable
+    // given the extra implementation complexity to detect such a case.
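+    // (Illustrative scenario, not in the original source: if the metadata
+    // marker byte were corrupted from -1/-2 to 0, GetBuiltinFilterBitsReader
+    // would return an AlwaysTrueFilter, every HashMayMatch(h) would return
+    // true, and this loop could not observe the corruption.)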
+ bool may_match = bits_reader->HashMayMatch(h); + if (!may_match) { + s = Status::Corruption("Corrupted filter content"); + break; + } + } + + ResetEntries(); + return s; +} +} // namespace + +const char* BuiltinFilterPolicy::kClassName() { + return "rocksdb.internal.BuiltinFilter"; +} + +bool BuiltinFilterPolicy::IsInstanceOf(const std::string& name) const { + if (name == kClassName()) { + return true; + } else { + return FilterPolicy::IsInstanceOf(name); + } +} + +static const char* kBuiltinFilterMetadataName = "rocksdb.BuiltinBloomFilter"; + +const char* BuiltinFilterPolicy::kCompatibilityName() { + return kBuiltinFilterMetadataName; +} + +const char* BuiltinFilterPolicy::CompatibilityName() const { + return kBuiltinFilterMetadataName; +} + +BloomLikeFilterPolicy::BloomLikeFilterPolicy(double bits_per_key) + : warned_(false), aggregate_rounding_balance_(0) { + // Sanitize bits_per_key + if (bits_per_key < 0.5) { + // Round down to no filter + bits_per_key = 0; + } else if (bits_per_key < 1.0) { + // Minimum 1 bit per key (equiv) when creating filter + bits_per_key = 1.0; + } else if (!(bits_per_key < 100.0)) { // including NaN + bits_per_key = 100.0; + } + + // Includes a nudge toward rounding up, to ensure on all platforms + // that doubles specified with three decimal digits after the decimal + // point are interpreted accurately. + millibits_per_key_ = static_cast(bits_per_key * 1000.0 + 0.500001); + + // For now configure Ribbon filter to match Bloom FP rate and save + // memory. (Ribbon bits per key will be ~30% less than Bloom bits per key + // for same FP rate.) + desired_one_in_fp_rate_ = + 1.0 / BloomMath::CacheLocalFpRate( + bits_per_key, + FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_), + /*cache_line_bits*/ 512); + + // For better or worse, this is a rounding up of a nudged rounding up, + // e.g. 7.4999999999999 will round up to 8, but that provides more + // predictability against small arithmetic errors in floating point. 
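+  // (Worked example, illustrative and not in the original source:
+  // bits_per_key = 9.9 gives millibits_per_key_ =
+  // static_cast<int>(9900.500001) = 9900, and the line below yields
+  // whole_bits_per_key_ = (9900 + 500) / 1000 = 10.)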
+ whole_bits_per_key_ = (millibits_per_key_ + 500) / 1000; +} + +BloomLikeFilterPolicy::~BloomLikeFilterPolicy() {} +const char* BloomLikeFilterPolicy::kClassName() { + return "rocksdb.internal.BloomLikeFilter"; +} + +bool BloomLikeFilterPolicy::IsInstanceOf(const std::string& name) const { + if (name == kClassName()) { + return true; + } else { + return BuiltinFilterPolicy::IsInstanceOf(name); + } +} + +const char* ReadOnlyBuiltinFilterPolicy::kClassName() { + return kBuiltinFilterMetadataName; +} + +std::string BloomLikeFilterPolicy::GetId() const { + return Name() + GetBitsPerKeySuffix(); +} + +BloomFilterPolicy::BloomFilterPolicy(double bits_per_key) + : BloomLikeFilterPolicy(bits_per_key) {} + +FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + if (GetMillibitsPerKey() == 0) { + // "No filter" special case + return nullptr; + } else if (context.table_options.format_version < 5) { + return GetLegacyBloomBuilderWithContext(context); + } else { + return GetFastLocalBloomBuilderWithContext(context); + } +} + +const char* BloomFilterPolicy::kClassName() { return "bloomfilter"; } +const char* BloomFilterPolicy::kNickName() { return "rocksdb.BloomFilter"; } + +std::string BloomFilterPolicy::GetId() const { + // Including ":false" for better forward-compatibility with 6.29 and earlier + // which required a boolean `use_block_based_builder` parameter + return BloomLikeFilterPolicy::GetId() + ":false"; +} + +FilterBitsBuilder* BloomLikeFilterPolicy::GetFastLocalBloomBuilderWithContext( + const FilterBuildingContext& context) const { + bool offm = context.table_options.optimize_filters_for_memory; + const auto options_overrides_iter = + context.table_options.cache_usage_options.options_overrides.find( + CacheEntryRole::kFilterConstruction); + const auto filter_construction_charged = + options_overrides_iter != + context.table_options.cache_usage_options.options_overrides.end() + ? options_overrides_iter->second.charged + : context.table_options.cache_usage_options.options.charged; + + std::shared_ptr cache_res_mgr; + if (context.table_options.block_cache && + filter_construction_charged == + CacheEntryRoleOptions::Decision::kEnabled) { + cache_res_mgr = std::make_shared< + CacheReservationManagerImpl>( + context.table_options.block_cache); + } + return new FastLocalBloomBitsBuilder( + millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr, + cache_res_mgr, context.table_options.detect_filter_construct_corruption); +} + +FilterBitsBuilder* BloomLikeFilterPolicy::GetLegacyBloomBuilderWithContext( + const FilterBuildingContext& context) const { + if (whole_bits_per_key_ >= 14 && context.info_log && + !warned_.load(std::memory_order_relaxed)) { + warned_ = true; + const char* adjective; + if (whole_bits_per_key_ >= 20) { + adjective = "Dramatic"; + } else { + adjective = "Significant"; + } + // For more details, see + // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter + ROCKS_LOG_WARN(context.info_log, + "Using legacy Bloom filter with high (%d) bits/key. 
" + "%s filter space and/or accuracy improvement is available " + "with format_version>=5.", + whole_bits_per_key_, adjective); + } + return new LegacyBloomBitsBuilder(whole_bits_per_key_, context.info_log); +} + +FilterBitsBuilder* +BloomLikeFilterPolicy::GetStandard128RibbonBuilderWithContext( + const FilterBuildingContext& context) const { + // FIXME: code duplication with GetFastLocalBloomBuilderWithContext + bool offm = context.table_options.optimize_filters_for_memory; + const auto options_overrides_iter = + context.table_options.cache_usage_options.options_overrides.find( + CacheEntryRole::kFilterConstruction); + const auto filter_construction_charged = + options_overrides_iter != + context.table_options.cache_usage_options.options_overrides.end() + ? options_overrides_iter->second.charged + : context.table_options.cache_usage_options.options.charged; + + std::shared_ptr cache_res_mgr; + if (context.table_options.block_cache && + filter_construction_charged == + CacheEntryRoleOptions::Decision::kEnabled) { + cache_res_mgr = std::make_shared< + CacheReservationManagerImpl>( + context.table_options.block_cache); + } + return new Standard128RibbonBitsBuilder( + desired_one_in_fp_rate_, millibits_per_key_, + offm ? &aggregate_rounding_balance_ : nullptr, cache_res_mgr, + context.table_options.detect_filter_construct_corruption, + context.info_log); +} + +std::string BloomLikeFilterPolicy::GetBitsPerKeySuffix() const { + std::string rv = ":" + std::to_string(millibits_per_key_ / 1000); + int frac = millibits_per_key_ % 1000; + if (frac > 0) { + rv.push_back('.'); + rv.push_back(static_cast('0' + (frac / 100))); + frac %= 100; + if (frac > 0) { + rv.push_back(static_cast('0' + (frac / 10))); + frac %= 10; + if (frac > 0) { + rv.push_back(static_cast('0' + frac)); + } + } + } + return rv; +} + +FilterBitsBuilder* BuiltinFilterPolicy::GetBuilderFromContext( + const FilterBuildingContext& context) { + if (context.table_options.filter_policy) { + return context.table_options.filter_policy->GetBuilderWithContext(context); + } else { + return nullptr; + } +} + +// For testing only, but always constructable with internal names +namespace test { + +const char* LegacyBloomFilterPolicy::kClassName() { + return "rocksdb.internal.LegacyBloomFilter"; +} + +FilterBitsBuilder* LegacyBloomFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + if (GetMillibitsPerKey() == 0) { + // "No filter" special case + return nullptr; + } + return GetLegacyBloomBuilderWithContext(context); +} + +const char* FastLocalBloomFilterPolicy::kClassName() { + return "rocksdb.internal.FastLocalBloomFilter"; +} + +FilterBitsBuilder* FastLocalBloomFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + if (GetMillibitsPerKey() == 0) { + // "No filter" special case + return nullptr; + } + return GetFastLocalBloomBuilderWithContext(context); +} + +const char* Standard128RibbonFilterPolicy::kClassName() { + return "rocksdb.internal.Standard128RibbonFilter"; +} + +FilterBitsBuilder* Standard128RibbonFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + if (GetMillibitsPerKey() == 0) { + // "No filter" special case + return nullptr; + } + return GetStandard128RibbonBuilderWithContext(context); +} + +} // namespace test + +BuiltinFilterBitsReader* BuiltinFilterPolicy::GetBuiltinFilterBitsReader( + const Slice& contents) { + uint32_t len_with_meta = static_cast(contents.size()); + if (len_with_meta <= kMetadataLen) { + // filter is empty or 
broken. Treat like zero keys added. + return new AlwaysFalseFilter(); + } + + // Legacy Bloom filter data: + // 0 +-----------------------------------+ + // | Raw Bloom filter data | + // | ... | + // len +-----------------------------------+ + // | byte for num_probes or | + // | marker for new implementations | + // len+1 +-----------------------------------+ + // | four bytes for number of cache | + // | lines | + // len_with_meta +-----------------------------------+ + + int8_t raw_num_probes = + static_cast(contents.data()[len_with_meta - kMetadataLen]); + // NB: *num_probes > 30 and < 128 probably have not been used, because of + // BloomFilterPolicy::initialize, unless directly calling + // LegacyBloomBitsBuilder as an API, but we are leaving those cases in + // limbo with LegacyBloomBitsReader for now. + + if (raw_num_probes < 1) { + // Note: < 0 (or unsigned > 127) indicate special new implementations + // (or reserved for future use) + switch (raw_num_probes) { + case 0: + // Treat as zero probes (always FP) + return new AlwaysTrueFilter(); + case -1: + // Marker for newer Bloom implementations + return GetBloomBitsReader(contents); + case -2: + // Marker for Ribbon implementations + return GetRibbonBitsReader(contents); + default: + // Reserved (treat as zero probes, always FP, for now) + return new AlwaysTrueFilter(); + } + } + // else attempt decode for LegacyBloomBitsReader + + int num_probes = raw_num_probes; + assert(num_probes >= 1); + assert(num_probes <= 127); + + uint32_t len = len_with_meta - kMetadataLen; + assert(len > 0); + + uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4); + uint32_t log2_cache_line_size; + + if (num_lines * CACHE_LINE_SIZE == len) { + // Common case + log2_cache_line_size = ConstexprFloorLog2(CACHE_LINE_SIZE); + } else if (num_lines == 0 || len % num_lines != 0) { + // Invalid (no solution to num_lines * x == len) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } else { + // Determine the non-native cache line size (from another system) + log2_cache_line_size = 0; + while ((num_lines << log2_cache_line_size) < len) { + ++log2_cache_line_size; + } + if ((num_lines << log2_cache_line_size) != len) { + // Invalid (block size not a power of two) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } + } + // if not early return + return new LegacyBloomBitsReader(contents.data(), num_probes, num_lines, + log2_cache_line_size); +} + +// Read metadata to determine what kind of FilterBitsReader is needed +// and return a new one. +FilterBitsReader* BuiltinFilterPolicy::GetFilterBitsReader( + const Slice& contents) const { + return BuiltinFilterPolicy::GetBuiltinFilterBitsReader(contents); +} + +BuiltinFilterBitsReader* BuiltinFilterPolicy::GetRibbonBitsReader( + const Slice& contents) { + uint32_t len_with_meta = static_cast(contents.size()); + uint32_t len = len_with_meta - kMetadataLen; + + assert(len > 0); // precondition + + uint32_t seed = static_cast(contents.data()[len + 1]); + uint32_t num_blocks = static_cast(contents.data()[len + 2]); + num_blocks |= static_cast(contents.data()[len + 3]) << 8; + num_blocks |= static_cast(contents.data()[len + 4]) << 16; + if (num_blocks < 2) { + // Not supported + // num_blocks == 1 is not used because num_starts == 1 is problematic + // for the hashing scheme. num_blocks == 0 is unused because there's + // already a concise encoding of an "always false" filter. 
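+    // (Worked example of the decode above, illustrative and not in the
+    // original source: if the last five bytes of the filter are
+    // [-2, 37, 0x12, 0x34, 0x00], the marker is -2 for Ribbon, seed = 37,
+    // and num_blocks = 0x12 | (0x34 << 8) | (0x00 << 16) = 13330.)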
+ // Return something safe: + return new AlwaysTrueFilter(); + } + return new Standard128RibbonBitsReader(contents.data(), len, num_blocks, + seed); +} + +// For newer Bloom filter implementations +BuiltinFilterBitsReader* BuiltinFilterPolicy::GetBloomBitsReader( + const Slice& contents) { + uint32_t len_with_meta = static_cast(contents.size()); + uint32_t len = len_with_meta - kMetadataLen; + + assert(len > 0); // precondition + + // New Bloom filter data: + // 0 +-----------------------------------+ + // | Raw Bloom filter data | + // | ... | + // len +-----------------------------------+ + // | char{-1} byte -> new Bloom filter | + // len+1 +-----------------------------------+ + // | byte for subimplementation | + // | 0: FastLocalBloom | + // | other: reserved | + // len+2 +-----------------------------------+ + // | byte for block_and_probes | + // | 0 in top 3 bits -> 6 -> 64-byte | + // | reserved: | + // | 1 in top 3 bits -> 7 -> 128-byte| + // | 2 in top 3 bits -> 8 -> 256-byte| + // | ... | + // | num_probes in bottom 5 bits, | + // | except 0 and 31 reserved | + // len+3 +-----------------------------------+ + // | two bytes reserved | + // | possibly for hash seed | + // len_with_meta +-----------------------------------+ + + // Read more metadata (see above) + char sub_impl_val = contents.data()[len_with_meta - 4]; + char block_and_probes = contents.data()[len_with_meta - 3]; + int log2_block_bytes = ((block_and_probes >> 5) & 7) + 6; + + int num_probes = (block_and_probes & 31); + if (num_probes < 1 || num_probes > 30) { + // Reserved / future safe + return new AlwaysTrueFilter(); + } + + uint16_t rest = DecodeFixed16(contents.data() + len_with_meta - 2); + if (rest != 0) { + // Reserved, possibly for hash seed + // Future safe + return new AlwaysTrueFilter(); + } + + if (sub_impl_val == 0) { // FastLocalBloom + if (log2_block_bytes == 6) { // Only block size supported for now + return new FastLocalBloomBitsReader(contents.data(), num_probes, len); + } + } + // otherwise + // Reserved / future safe + return new AlwaysTrueFilter(); +} + +const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, + bool /*use_block_based_builder*/) { + // NOTE: use_block_based_builder now ignored so block-based filter is no + // longer accessible in public API. 
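+  // (Illustrative usage, not in the original source:
+  //   BlockBasedTableOptions table_options;
+  //   table_options.filter_policy.reset(NewBloomFilterPolicy(10.0));
+  //   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  // configures roughly 10 bits per key, on the order of a 1% false positive
+  // rate.)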
+ return new BloomFilterPolicy(bits_per_key); +} + +RibbonFilterPolicy::RibbonFilterPolicy(double bloom_equivalent_bits_per_key, + int bloom_before_level) + : BloomLikeFilterPolicy(bloom_equivalent_bits_per_key), + bloom_before_level_(bloom_before_level) {} + +FilterBitsBuilder* RibbonFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + if (GetMillibitsPerKey() == 0) { + // "No filter" special case + return nullptr; + } + // Treat unknown same as bottommost + int levelish = INT_MAX; + + switch (context.compaction_style) { + case kCompactionStyleLevel: + case kCompactionStyleUniversal: { + if (context.reason == TableFileCreationReason::kFlush) { + // Treat flush as level -1 + assert(context.level_at_creation == 0); + levelish = -1; + } else if (context.level_at_creation == -1) { + // Unknown level + assert(levelish == INT_MAX); + } else { + levelish = context.level_at_creation; + } + break; + } + case kCompactionStyleFIFO: + case kCompactionStyleNone: + // Treat as bottommost + assert(levelish == INT_MAX); + break; + } + if (levelish < bloom_before_level_) { + return GetFastLocalBloomBuilderWithContext(context); + } else { + return GetStandard128RibbonBuilderWithContext(context); + } +} + +const char* RibbonFilterPolicy::kClassName() { return "ribbonfilter"; } +const char* RibbonFilterPolicy::kNickName() { return "rocksdb.RibbonFilter"; } + +std::string RibbonFilterPolicy::GetId() const { + return BloomLikeFilterPolicy::GetId() + ":" + + std::to_string(bloom_before_level_); +} + +const FilterPolicy* NewRibbonFilterPolicy(double bloom_equivalent_bits_per_key, + int bloom_before_level) { + return new RibbonFilterPolicy(bloom_equivalent_bits_per_key, + bloom_before_level); +} + +FilterBuildingContext::FilterBuildingContext( + const BlockBasedTableOptions& _table_options) + : table_options(_table_options) {} + +FilterPolicy::~FilterPolicy() {} + +std::shared_ptr BloomLikeFilterPolicy::Create( + const std::string& name, double bits_per_key) { + if (name == test::LegacyBloomFilterPolicy::kClassName()) { + return std::make_shared(bits_per_key); + } else if (name == test::FastLocalBloomFilterPolicy::kClassName()) { + return std::make_shared(bits_per_key); + } else if (name == test::Standard128RibbonFilterPolicy::kClassName()) { + return std::make_shared(bits_per_key); + } else if (name == BloomFilterPolicy::kClassName()) { + // For testing + return std::make_shared(bits_per_key); + } else if (name == RibbonFilterPolicy::kClassName()) { + // For testing + return std::make_shared(bits_per_key, + /*bloom_before_level*/ 0); + } else { + return nullptr; + } +} + +namespace { +static ObjectLibrary::PatternEntry FilterPatternEntryWithBits( + const char* name) { + return ObjectLibrary::PatternEntry(name, false).AddNumber(":", false); +} + +template +T* NewBuiltinFilterPolicyWithBits(const std::string& uri) { + const std::vector vals = StringSplit(uri, ':'); + double bits_per_key = ParseDouble(vals[1]); + return new T(bits_per_key); +} +static int RegisterBuiltinFilterPolicies(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + ReadOnlyBuiltinFilterPolicy::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new ReadOnlyBuiltinFilterPolicy()); + return guard->get(); + }); + + library.AddFactory( + FilterPatternEntryWithBits(BloomFilterPolicy::kClassName()) + .AnotherName(BloomFilterPolicy::kNickName()), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) 
{ + guard->reset(NewBuiltinFilterPolicyWithBits(uri)); + return guard->get(); + }); + library.AddFactory( + FilterPatternEntryWithBits(BloomFilterPolicy::kClassName()) + .AnotherName(BloomFilterPolicy::kNickName()) + .AddSuffix(":false"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(NewBuiltinFilterPolicyWithBits(uri)); + return guard->get(); + }); + library.AddFactory( + FilterPatternEntryWithBits(BloomFilterPolicy::kClassName()) + .AnotherName(BloomFilterPolicy::kNickName()) + .AddSuffix(":true"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + const std::vector vals = StringSplit(uri, ':'); + double bits_per_key = ParseDouble(vals[1]); + // NOTE: This case previously configured the deprecated block-based + // filter, but old ways of configuring that now map to full filter. We + // defer to the corresponding API to ensure consistency in case that + // change is reverted. + guard->reset(NewBloomFilterPolicy(bits_per_key, true)); + return guard->get(); + }); + library.AddFactory( + FilterPatternEntryWithBits(RibbonFilterPolicy::kClassName()) + .AnotherName(RibbonFilterPolicy::kNickName()), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + const std::vector vals = StringSplit(uri, ':'); + double bits_per_key = ParseDouble(vals[1]); + guard->reset(NewRibbonFilterPolicy(bits_per_key)); + return guard->get(); + }); + library.AddFactory( + FilterPatternEntryWithBits(RibbonFilterPolicy::kClassName()) + .AnotherName(RibbonFilterPolicy::kNickName()) + .AddNumber(":", true), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + const std::vector vals = StringSplit(uri, ':'); + double bits_per_key = ParseDouble(vals[1]); + int bloom_before_level = ParseInt(vals[2]); + guard->reset(NewRibbonFilterPolicy(bits_per_key, bloom_before_level)); + return guard->get(); + }); + library.AddFactory( + FilterPatternEntryWithBits(test::LegacyBloomFilterPolicy::kClassName()), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset( + NewBuiltinFilterPolicyWithBits(uri)); + return guard->get(); + }); + library.AddFactory( + FilterPatternEntryWithBits( + test::FastLocalBloomFilterPolicy::kClassName()), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset( + NewBuiltinFilterPolicyWithBits( + uri)); + return guard->get(); + }); + library.AddFactory( + FilterPatternEntryWithBits( + test::Standard128RibbonFilterPolicy::kClassName()), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset( + NewBuiltinFilterPolicyWithBits( + uri)); + return guard->get(); + }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +} // namespace + +Status FilterPolicy::CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* policy) { + if (value == kNullptrString || value.empty()) { + policy->reset(); + return Status::OK(); + } else if (value == ReadOnlyBuiltinFilterPolicy::kClassName()) { + *policy = std::make_shared(); + return Status::OK(); + } + + std::string id; + std::unordered_map opt_map; + Status status = + Customizable::GetOptionsMap(options, policy->get(), value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (id.empty()) { // We have no Id but have options. 
Not good + return Status::NotSupported("Cannot reset object ", id); + } else { + static std::once_flag loaded; + std::call_once(loaded, [&]() { + RegisterBuiltinFilterPolicies(*(ObjectLibrary::Default().get()), ""); + }); + status = options.registry->NewSharedObject(id, policy); + } + if (options.ignore_unsupported_options && status.IsNotSupported()) { + return Status::OK(); + } else if (status.ok()) { + status = Customizable::ConfigureNewObject( + options, const_cast(policy->get()), opt_map); + } + return status; +} + +const std::vector& BloomLikeFilterPolicy::GetAllFixedImpls() { + STATIC_AVOID_DESTRUCTION(std::vector, impls){ + // Match filter_bench -impl=x ordering + test::LegacyBloomFilterPolicy::kClassName(), + test::FastLocalBloomFilterPolicy::kClassName(), + test::Standard128RibbonFilterPolicy::kClassName(), + }; + return impls; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_policy_internal.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_policy_internal.h new file mode 100644 index 0000000000..9bc3a24829 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/filter_policy_internal.h @@ -0,0 +1,340 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/filter_policy.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +// A class that takes a bunch of keys, then generates filter +class FilterBitsBuilder { + public: + virtual ~FilterBitsBuilder() {} + + // Add a key (or prefix) to the filter. Typically, a builder will keep + // a set of 64-bit key hashes and only build the filter in Finish + // when the final number of keys is known. Keys are added in sorted order + // and duplicated keys are possible, so typically, the builder will + // only add this key if its hash is different from the most recently + // added. + virtual void AddKey(const Slice& key) = 0; + + // Called by RocksDB before Finish to populate + // TableProperties::num_filter_entries, so should represent the + // number of unique keys (and/or prefixes) added, but does not have + // to be exact. `return 0;` may be used to conspicuously indicate "unknown". + virtual size_t EstimateEntriesAdded() = 0; + + // Generate the filter using the keys that are added + // The return value of this function would be the filter bits, + // The ownership of actual data is set to buf + virtual Slice Finish(std::unique_ptr* buf) = 0; + + // Similar to Finish(std::unique_ptr* buf), except that + // for a non-null status pointer argument, it will point to + // Status::Corruption() when there is any corruption during filter + // construction or Status::OK() otherwise. + // + // WARNING: do not use a filter resulted from a corrupted construction + // TODO: refactor this to have a better signature, consolidate + virtual Slice Finish(std::unique_ptr* buf, + Status* /* status */) { + return Finish(buf); + } + + // Verify the filter returned from calling FilterBitsBuilder::Finish. 
+  // The function returns Status::Corruption() if there is any corruption in
+  // the constructed filter or Status::OK() otherwise.
+  //
+  // Implementations should normally consult
+  // FilterBuildingContext::table_options.detect_filter_construct_corruption
+  // to determine whether to perform verification or to skip by returning
+  // Status::OK(). The decision is left to the FilterBitsBuilder so that
+  // verification prerequisites before PostVerify can be skipped when not
+  // configured.
+  //
+  // RocksDB internals will always call MaybePostVerify() on the filter after
+  // it is returned from calling FilterBitsBuilder::Finish,
+  // except when FilterBitsBuilder::Finish results in a corruption
+  // status, which indicates the filter is already in a corrupted state and
+  // there is no need to post-verify.
+  virtual Status MaybePostVerify(const Slice& /* filter_content */) {
+    return Status::OK();
+  }
+
+  // Approximate the number of keys that can be added and generate a filter
+  // <= the specified number of bytes. Callers (including RocksDB) should
+  // only use this result for optimizing performance and not as a guarantee.
+  virtual size_t ApproximateNumEntries(size_t bytes) = 0;
+};
+
+// A class that checks whether a key may be in the filter.
+// It should be initialized with a Slice generated by a FilterBitsBuilder.
+class FilterBitsReader {
+ public:
+  virtual ~FilterBitsReader() {}
+
+  // Check if the entry matches the bits in the filter
+  virtual bool MayMatch(const Slice& entry) = 0;
+
+  // Check if an array of entries matches the bits in the filter
+  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) {
+    for (int i = 0; i < num_keys; ++i) {
+      may_match[i] = MayMatch(*keys[i]);
+    }
+  }
+};
+
+// Exposes any extra information needed for testing built-in
+// FilterBitsBuilders
+class BuiltinFilterBitsBuilder : public FilterBitsBuilder {
+ public:
+  // Calculate number of bytes needed for a new filter, including
+  // metadata. Passing the result to ApproximateNumEntries should
+  // (ideally, usually) return >= the num_entry passed in.
+  // When optimize_filters_for_memory is enabled, this function
+  // is not authoritative but represents a target size that should
+  // be close to the average size.
+  virtual size_t CalculateSpace(size_t num_entries) = 0;
+
+  // Returns an estimate of the FP rate of the returned filter if
+  // `num_entries` keys are added and the filter returned by Finish
+  // is `bytes` bytes.
+  virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0;
+};
+
+// Base class for RocksDB built-in filter readers, with extra useful
+// functionality for internal use.
+class BuiltinFilterBitsReader : public FilterBitsReader {
+ public:
+  // Check if the hash of the entry matches the bits in the filter
+  virtual bool HashMayMatch(const uint64_t /* h */) { return true; }
+};
+
+// Base class for RocksDB built-in filter policies. This provides the
+// ability to read all kinds of built-in filters (so that old filters can
+// be used even when you change between built-in policies).
+class BuiltinFilterPolicy : public FilterPolicy {
+ public:  // overrides
+  // Read metadata to determine what kind of FilterBitsReader is needed
+  // and return a new one. This must successfully process any filter data
+  // generated by a built-in FilterBitsBuilder, regardless of the impl
+  // chosen for this BloomFilterPolicy.
+ FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override; + static const char* kClassName(); + bool IsInstanceOf(const std::string& id) const override; + // All variants of BuiltinFilterPolicy can read each others filters. + const char* CompatibilityName() const override; + static const char* kCompatibilityName(); + + public: // new + // An internal function for the implementation of + // BuiltinFilterBitsReader::GetFilterBitsReader without requiring an instance + // or working around potential virtual overrides. + static BuiltinFilterBitsReader* GetBuiltinFilterBitsReader( + const Slice& contents); + + // Returns a new FilterBitsBuilder from the filter_policy in + // table_options of a context, or nullptr if not applicable. + // (An internal convenience function to save boilerplate.) + static FilterBitsBuilder* GetBuilderFromContext(const FilterBuildingContext&); + + private: + // For Bloom filter implementation(s) + static BuiltinFilterBitsReader* GetBloomBitsReader(const Slice& contents); + + // For Ribbon filter implementation(s) + static BuiltinFilterBitsReader* GetRibbonBitsReader(const Slice& contents); +}; + +// A "read only" filter policy used for backward compatibility with old +// OPTIONS files, which did not specifying a Bloom configuration, just +// "rocksdb.BuiltinBloomFilter". Although this can read existing filters, +// this policy does not build new filters, so new SST files generated +// under the policy will get no filters (like nullptr FilterPolicy). +// This class is considered internal API and subject to change. +class ReadOnlyBuiltinFilterPolicy : public BuiltinFilterPolicy { + public: + const char* Name() const override { return kClassName(); } + static const char* kClassName(); + + // Does not write filters. + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const override { + return nullptr; + } +}; + +// RocksDB built-in filter policy for Bloom or Bloom-like filters including +// Ribbon filters. +// This class is considered internal API and subject to change. +// See NewBloomFilterPolicy and NewRibbonFilterPolicy. +class BloomLikeFilterPolicy : public BuiltinFilterPolicy { + public: + explicit BloomLikeFilterPolicy(double bits_per_key); + + ~BloomLikeFilterPolicy() override; + static const char* kClassName(); + bool IsInstanceOf(const std::string& id) const override; + + std::string GetId() const override; + + // Essentially for testing only: configured millibits/key + int GetMillibitsPerKey() const { return millibits_per_key_; } + // Essentially for testing only: legacy whole bits/key + int GetWholeBitsPerKey() const { return whole_bits_per_key_; } + + // All the different underlying implementations that a BloomLikeFilterPolicy + // might use, as a configuration string name for a testing mode for + // "always use this implementation." Only appropriate for unit tests. 
+ static const std::vector& GetAllFixedImpls(); + + // Convenience function for creating by name for fixed impls + static std::shared_ptr Create(const std::string& name, + double bits_per_key); + + protected: + // Some implementations used by aggregating policies + FilterBitsBuilder* GetLegacyBloomBuilderWithContext( + const FilterBuildingContext& context) const; + FilterBitsBuilder* GetFastLocalBloomBuilderWithContext( + const FilterBuildingContext& context) const; + FilterBitsBuilder* GetStandard128RibbonBuilderWithContext( + const FilterBuildingContext& context) const; + + std::string GetBitsPerKeySuffix() const; + + private: + // Bits per key settings are for configuring Bloom filters. + + // Newer filters support fractional bits per key. For predictable behavior + // of 0.001-precision values across floating point implementations, we + // round to thousandths of a bit (on average) per key. + int millibits_per_key_; + + // Older filters round to whole number bits per key. (There *should* be no + // compatibility issue with fractional bits per key, but preserving old + // behavior with format_version < 5 just in case.) + int whole_bits_per_key_; + + // For configuring Ribbon filter: a desired value for 1/fp_rate. For + // example, 100 -> 1% fp rate. + double desired_one_in_fp_rate_; + + // Whether relevant warnings have been logged already. (Remember so we + // only report once per BloomFilterPolicy instance, to keep the noise down.) + mutable std::atomic warned_; + + // State for implementing optimize_filters_for_memory. Essentially, this + // tracks a surplus or deficit in total FP rate of filters generated by + // builders under this policy vs. what would have been generated without + // optimize_filters_for_memory. + // + // To avoid floating point weirdness, the actual value is + // Sum over all generated filters f: + // (predicted_fp_rate(f) - predicted_fp_rate(f|o_f_f_m=false)) * 2^32 + mutable std::atomic aggregate_rounding_balance_; +}; + +// For NewBloomFilterPolicy +// +// This is a user-facing policy that automatically choose between +// LegacyBloom and FastLocalBloom based on context at build time, +// including compatibility with format_version. +class BloomFilterPolicy : public BloomLikeFilterPolicy { + public: + explicit BloomFilterPolicy(double bits_per_key); + + // To use this function, call BuiltinFilterPolicy::GetBuilderFromContext(). + // + // Neither the context nor any objects therein should be saved beyond + // the call to this function, unless it's shared_ptr. + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const override; + + static const char* kClassName(); + const char* Name() const override { return kClassName(); } + static const char* kNickName(); + const char* NickName() const override { return kNickName(); } + std::string GetId() const override; +}; + +// For NewRibbonFilterPolicy +// +// This is a user-facing policy that chooses between Standard128Ribbon +// and FastLocalBloom based on context at build time (LSM level and other +// factors in extreme cases). 
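+// (Illustrative usage, not in the original source:
+//   table_options.filter_policy.reset(NewRibbonFilterPolicy(
+//       /*bloom_equivalent_bits_per_key=*/10.0, /*bloom_before_level=*/1));
+// builds Bloom filters for flushes and level 0 and Ribbon filters from
+// level 1 down, targeting the Bloom-10 FP rate with roughly 30% less
+// memory, per the comments in filter_policy.cc above.)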
+class RibbonFilterPolicy : public BloomLikeFilterPolicy { + public: + explicit RibbonFilterPolicy(double bloom_equivalent_bits_per_key, + int bloom_before_level); + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const override; + + int GetBloomBeforeLevel() const { return bloom_before_level_; } + + static const char* kClassName(); + const char* Name() const override { return kClassName(); } + static const char* kNickName(); + const char* NickName() const override { return kNickName(); } + std::string GetId() const override; + + private: + const int bloom_before_level_; +}; + +// For testing only, but always constructable with internal names +namespace test { + +class LegacyBloomFilterPolicy : public BloomLikeFilterPolicy { + public: + explicit LegacyBloomFilterPolicy(double bits_per_key) + : BloomLikeFilterPolicy(bits_per_key) {} + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override; + + static const char* kClassName(); + const char* Name() const override { return kClassName(); } +}; + +class FastLocalBloomFilterPolicy : public BloomLikeFilterPolicy { + public: + explicit FastLocalBloomFilterPolicy(double bits_per_key) + : BloomLikeFilterPolicy(bits_per_key) {} + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override; + + static const char* kClassName(); + const char* Name() const override { return kClassName(); } +}; + +class Standard128RibbonFilterPolicy : public BloomLikeFilterPolicy { + public: + explicit Standard128RibbonFilterPolicy(double bloom_equiv_bits_per_key) + : BloomLikeFilterPolicy(bloom_equiv_bits_per_key) {} + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override; + + static const char* kClassName(); + const char* Name() const override { return kClassName(); } +}; + +} // namespace test + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/flush_block_policy.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/flush_block_policy.cc new file mode 100644 index 0000000000..d5cc310013 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/flush_block_policy.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/flush_block_policy.h" + +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/customizable_util.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/flush_block_policy_impl.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +// Flush block by size +class FlushBlockBySizePolicy : public FlushBlockPolicy { + public: + // @params block_size: Approximate size of user data packed per + // block. 
+  // @params block_size_deviation: This is used to close a block before it
+  //                               reaches the configured block_size.
+  FlushBlockBySizePolicy(const uint64_t block_size,
+                         const uint64_t block_size_deviation, const bool align,
+                         const BlockBuilder& data_block_builder)
+      : block_size_(block_size),
+        block_size_deviation_limit_(
+            ((block_size * (100 - block_size_deviation)) + 99) / 100),
+        align_(align),
+        data_block_builder_(data_block_builder) {}
+
+  bool Update(const Slice& key, const Slice& value) override {
+    // it makes no sense to flush when the data block is empty
+    if (data_block_builder_.empty()) {
+      return false;
+    }
+
+    auto curr_size = data_block_builder_.CurrentSizeEstimate();
+
+    // Do flush if one of the below two conditions is true:
+    // 1) if the current estimated size already exceeds the block size,
+    // 2) block_size_deviation is set and the estimated size after appending
+    //    the kv will exceed the block size and the current size is already
+    //    past the deviation limit.
+    return curr_size >= block_size_ || BlockAlmostFull(key, value);
+  }
+
+ private:
+  bool BlockAlmostFull(const Slice& key, const Slice& value) const {
+    if (block_size_deviation_limit_ == 0) {
+      return false;
+    }
+
+    const auto curr_size = data_block_builder_.CurrentSizeEstimate();
+    auto estimated_size_after =
+        data_block_builder_.EstimateSizeAfterKV(key, value);
+
+    if (align_) {
+      estimated_size_after += BlockBasedTable::kBlockTrailerSize;
+      return estimated_size_after > block_size_;
+    }
+
+    return estimated_size_after > block_size_ &&
+           curr_size > block_size_deviation_limit_;
+  }
+
+  const uint64_t block_size_;
+  const uint64_t block_size_deviation_limit_;
+  const bool align_;
+  const BlockBuilder& data_block_builder_;
+};
+
+FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+    const BlockBasedTableOptions& table_options,
+    const BlockBuilder& data_block_builder) const {
+  return new FlushBlockBySizePolicy(
+      table_options.block_size, table_options.block_size_deviation,
+      table_options.block_align, data_block_builder);
+}
+
+FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+    const uint64_t size, const int deviation,
+    const BlockBuilder& data_block_builder) {
+  return new FlushBlockBySizePolicy(size, deviation, false,
+                                    data_block_builder);
+}
+
+static int RegisterFlushBlockPolicyFactories(ObjectLibrary& library,
+                                             const std::string& /*arg*/) {
+  library.AddFactory<FlushBlockPolicyFactory>(
+      FlushBlockBySizePolicyFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<FlushBlockPolicyFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new FlushBlockBySizePolicyFactory());
+        return guard->get();
+      });
+  library.AddFactory<FlushBlockPolicyFactory>(
+      FlushBlockEveryKeyPolicyFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<FlushBlockPolicyFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new FlushBlockEveryKeyPolicyFactory());
+        return guard->get();
+      });
+  return 2;
+}
+
+FlushBlockBySizePolicyFactory::FlushBlockBySizePolicyFactory()
+    : FlushBlockPolicyFactory() {}
+
+Status FlushBlockPolicyFactory::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<FlushBlockPolicyFactory>* factory) {
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterFlushBlockPolicyFactories(*(ObjectLibrary::Default().get()), "");
+  });
+
+  if (value.empty()) {
+    factory->reset(new FlushBlockBySizePolicyFactory());
+    return Status::OK();
+  } else {
+    return LoadSharedObject<FlushBlockPolicyFactory>(config_options, value,
+                                                     factory);
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
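The rounded-up arithmetic in the deviation limit above is easy to sanity-check by hand. A standalone sketch (not from this patch), using example values of block_size = 4096 and block_size_deviation = 10 percent:

// Illustrative sketch (not part of this patch): the deviation limit
// computed by FlushBlockBySizePolicy's initializer, for sample inputs.
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t block_size = 4096;
  const uint64_t block_size_deviation = 10;  // percent
  // Rounded-up 90% of the block size: blocks are closed early only once
  // they are at least this full.
  const uint64_t limit =
      ((block_size * (100 - block_size_deviation)) + 99) / 100;
  std::cout << limit << "\n";  // prints 3687
}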
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/flush_block_policy_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/flush_block_policy_impl.h
new file mode 100644
index 0000000000..4f79682bc2
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/flush_block_policy_impl.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/flush_block_policy.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// FlushBlockEveryKeyPolicy currently used only in tests.
+
+class FlushBlockEveryKeyPolicy : public FlushBlockPolicy {
+ public:
+  bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+    if (!start_) {
+      start_ = true;
+      return false;
+    }
+    return true;
+  }
+
+ private:
+  bool start_ = false;
+};
+
+class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  explicit FlushBlockEveryKeyPolicyFactory() {}
+
+  static const char* kClassName() { return "FlushBlockEveryKeyPolicyFactory"; }
+  const char* Name() const override { return kClassName(); }
+
+  FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& /*table_options*/,
+      const BlockBuilder& /*data_block_builder*/) const override {
+    return new FlushBlockEveryKeyPolicy;
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/full_filter_block.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/full_filter_block.cc
new file mode 100644
index 0000000000..60ff7c44f3
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/full_filter_block.cc
@@ -0,0 +1,290 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/full_filter_block.h"
+
+#include <array>
+
+#include "block_type.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FullFilterBlockBuilder::FullFilterBlockBuilder(
+    const SliceTransform* _prefix_extractor, bool whole_key_filtering,
+    FilterBitsBuilder* filter_bits_builder)
+    : prefix_extractor_(_prefix_extractor),
+      whole_key_filtering_(whole_key_filtering),
+      last_whole_key_recorded_(false),
+      last_prefix_recorded_(false),
+      last_key_in_domain_(false),
+      any_added_(false) {
+  assert(filter_bits_builder != nullptr);
+  filter_bits_builder_.reset(filter_bits_builder);
+}
+
+size_t FullFilterBlockBuilder::EstimateEntriesAdded() {
+  return filter_bits_builder_->EstimateEntriesAdded();
+}
+
+void FullFilterBlockBuilder::Add(const Slice& key_without_ts) {
+  const bool add_prefix =
+      prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts);
+
+  if (!last_prefix_recorded_ && last_key_in_domain_) {
+    // We can reach here when a new filter partition starts in partitioned
+    // filter. The last prefix in the previous partition should be added if
+    // necessary regardless of key_without_ts, to support prefix SeekForPrev.
+    AddKey(last_prefix_str_);
+    last_prefix_recorded_ = true;
+  }
+
+  if (whole_key_filtering_) {
+    if (!add_prefix) {
+      AddKey(key_without_ts);
+    } else {
+      // if both whole_key and prefix are added to bloom then we will have
+      // whole key_without_ts and prefix addition being interleaved and thus
+      // cannot rely on the bits builder to properly detect the duplicates by
+      // comparing with the last item.
+      Slice last_whole_key = Slice(last_whole_key_str_);
+      if (!last_whole_key_recorded_ ||
+          last_whole_key.compare(key_without_ts) != 0) {
+        AddKey(key_without_ts);
+        last_whole_key_recorded_ = true;
+        last_whole_key_str_.assign(key_without_ts.data(),
+                                   key_without_ts.size());
+      }
+    }
+  }
+  if (add_prefix) {
+    last_key_in_domain_ = true;
+    AddPrefix(key_without_ts);
+  } else {
+    last_key_in_domain_ = false;
+  }
+}
+
+// Add key to filter if needed
+inline void FullFilterBlockBuilder::AddKey(const Slice& key) {
+  filter_bits_builder_->AddKey(key);
+  any_added_ = true;
+}
+
+// Add prefix to filter if needed
+void FullFilterBlockBuilder::AddPrefix(const Slice& key) {
+  assert(prefix_extractor_ && prefix_extractor_->InDomain(key));
+  Slice prefix = prefix_extractor_->Transform(key);
+  if (whole_key_filtering_) {
+    // if both whole_key and prefix are added to bloom then we will have
+    // whole key and prefix addition being interleaved and thus cannot rely
+    // on the bits builder to properly detect the duplicates by comparing
+    // with the last item.
+    Slice last_prefix = Slice(last_prefix_str_);
+    if (!last_prefix_recorded_ || last_prefix.compare(prefix) != 0) {
+      AddKey(prefix);
+      last_prefix_recorded_ = true;
+      last_prefix_str_.assign(prefix.data(), prefix.size());
+    }
+  } else {
+    AddKey(prefix);
+  }
+}
+
+void FullFilterBlockBuilder::Reset() {
+  last_whole_key_recorded_ = false;
+  last_prefix_recorded_ = false;
+}
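To see why the builder above tracks the last whole key and the last prefix separately, consider the interleaved add stream when both whole-key and prefix filtering are on. A toy sketch (not from this patch; the fixed 3-byte "extractor" is an assumption for illustration):

// Illustrative sketch (not part of this patch): keys sharing prefix "app"
// produce an interleaved stream (key, prefix, key, prefix, ...), so a
// "compare with the last added item" dedup inside the bits builder would
// never fire; the builder dedups per category instead.
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> keys = {"apple", "apply"};
  std::string last_prefix;
  std::vector<std::string> added;
  for (const auto& k : keys) {
    added.push_back(k);                   // whole key, always added
    std::string prefix = k.substr(0, 3);  // hypothetical 3-byte extractor
    if (prefix != last_prefix) {          // dedup against last *prefix* only
      added.push_back(prefix);
      last_prefix = prefix;
    }
  }
  for (const auto& e : added) std::cout << e << "\n";
  // apple, app, apply -- "app" is added once despite the interleaving
}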
+Slice FullFilterBlockBuilder::Finish(
+    const BlockHandle& /*tmp*/, Status* status,
+    std::unique_ptr<const char[]>* filter_data) {
+  Reset();
+  // In this impl we ignore BlockHandle
+  *status = Status::OK();
+  if (any_added_) {
+    any_added_ = false;
+    Slice filter_content = filter_bits_builder_->Finish(
+        filter_data ? filter_data : &filter_data_, status);
+    return filter_content;
+  }
+  return Slice();
+}
+
+FullFilterBlockReader::FullFilterBlockReader(
+    const BlockBasedTable* t,
+    CachableEntry<ParsedFullFilterBlock>&& filter_block)
+    : FilterBlockReaderCommon(t, std::move(filter_block)) {}
+
+bool FullFilterBlockReader::KeyMayMatch(const Slice& key, const bool no_io,
+                                        const Slice* const /*const_ikey_ptr*/,
+                                        GetContext* get_context,
+                                        BlockCacheLookupContext* lookup_context,
+                                        const ReadOptions& read_options) {
+  if (!whole_key_filtering()) {
+    return true;
+  }
+  return MayMatch(key, no_io, get_context, lookup_context, read_options);
+}
+
+std::unique_ptr<FilterBlockReader> FullFilterBlockReader::Create(
+    const BlockBasedTable* table, const ReadOptions& ro,
+    FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+    bool pin, BlockCacheLookupContext* lookup_context) {
+  assert(table);
+  assert(table->get_rep());
+  assert(!pin || prefetch);
+
+  CachableEntry<ParsedFullFilterBlock> filter_block;
+  if (prefetch || !use_cache) {
+    const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache,
+                                     nullptr /* get_context */, lookup_context,
+                                     &filter_block);
+    if (!s.ok()) {
+      IGNORE_STATUS_IF_ERROR(s);
+      return std::unique_ptr<FilterBlockReader>();
+    }
+
+    if (use_cache && !pin) {
+      filter_block.Reset();
+    }
+  }
+
+  return std::unique_ptr<FilterBlockReader>(
+      new FullFilterBlockReader(table, std::move(filter_block)));
+}
+
+bool FullFilterBlockReader::PrefixMayMatch(
+    const Slice& prefix, const bool no_io,
+    const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context, const ReadOptions& read_options) {
+  return MayMatch(prefix, no_io, get_context, lookup_context, read_options);
+}
+
+bool FullFilterBlockReader::MayMatch(const Slice& entry, bool no_io,
+                                     GetContext* get_context,
+                                     BlockCacheLookupContext* lookup_context,
+                                     const ReadOptions& read_options) const {
+  CachableEntry<ParsedFullFilterBlock> filter_block;
+
+  const Status s = GetOrReadFilterBlock(no_io, get_context, lookup_context,
+                                        &filter_block, read_options);
+  if (!s.ok()) {
+    IGNORE_STATUS_IF_ERROR(s);
+    return true;
+  }
+
+  assert(filter_block.GetValue());
+
+  FilterBitsReader* const filter_bits_reader =
+      filter_block.GetValue()->filter_bits_reader();
+
+  if (filter_bits_reader) {
+    if (filter_bits_reader->MayMatch(entry)) {
+      PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+      return true;
+    } else {
+      PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+      return false;
+    }
+  }
+  return true;
+}
+
+void FullFilterBlockReader::KeysMayMatch(
+    MultiGetRange* range, const bool no_io,
+    BlockCacheLookupContext* lookup_context, const ReadOptions& read_options) {
+  if (!whole_key_filtering()) {
+    // Simply return. Don't skip any key - consider all keys as likely to be
+    // present
+    return;
+  }
+  MayMatch(range, no_io, nullptr, lookup_context, read_options);
+}
+void FullFilterBlockReader::PrefixesMayMatch(
+    MultiGetRange* range, const SliceTransform* prefix_extractor,
+    const bool no_io, BlockCacheLookupContext* lookup_context,
+    const ReadOptions& read_options) {
+  MayMatch(range, no_io, prefix_extractor, lookup_context, read_options);
+}
+
+void FullFilterBlockReader::MayMatch(MultiGetRange* range, bool no_io,
+                                     const SliceTransform* prefix_extractor,
+                                     BlockCacheLookupContext* lookup_context,
+                                     const ReadOptions& read_options) const {
+  CachableEntry<ParsedFullFilterBlock> filter_block;
+
+  const Status s =
+      GetOrReadFilterBlock(no_io, range->begin()->get_context, lookup_context,
+                           &filter_block, read_options);
+  if (!s.ok()) {
+    IGNORE_STATUS_IF_ERROR(s);
+    return;
+  }
+
+  assert(filter_block.GetValue());
+
+  FilterBitsReader* const filter_bits_reader =
+      filter_block.GetValue()->filter_bits_reader();
+
+  if (!filter_bits_reader) {
+    return;
+  }
+
+  // We need to use an array instead of autovector for may_match since
+  // &may_match[0] doesn't work for autovector (compiler error). So
+  // declare both keys and may_match as arrays, which is also slightly less
+  // expensive compared to autovector
+  std::array<const Slice*, MultiGetContext::MAX_BATCH_SIZE> keys;
+  std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}};
+  autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes;
+  int num_keys = 0;
+  MultiGetRange filter_range(*range, range->begin(), range->end());
+  for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) {
+    if (!prefix_extractor) {
+      keys[num_keys++] = &iter->ukey_without_ts;
+    } else if (prefix_extractor->InDomain(iter->ukey_without_ts)) {
+      prefixes.emplace_back(prefix_extractor->Transform(iter->ukey_without_ts));
+      keys[num_keys++] = &prefixes.back();
+    } else {
+      filter_range.SkipKey(iter);
+    }
+  }
+
+  filter_bits_reader->MayMatch(num_keys, &keys[0], &may_match[0]);
+
+  int i = 0;
+  for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) {
+    if (!may_match[i]) {
+      // Update original MultiGet range to skip this key. The filter_range
+      // was temporarily used just to skip keys not in prefix_extractor domain
+      range->SkipKey(iter);
+      PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+    } else {
+      // PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+      PerfContext* perf_ctx = get_perf_context();
+      perf_ctx->bloom_sst_hit_count++;
+    }
+    ++i;
+  }
+}
+
+size_t FullFilterBlockReader::ApproximateMemoryUsage() const {
+  size_t usage = ApproximateFilterBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size(const_cast<FullFilterBlockReader*>(this));
+#else
+  usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+  return usage;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
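The batched MayMatch above probes the filter once for a whole MultiGet batch using parallel arrays. A self-contained sketch of that pattern (not from this patch; BatchMayMatch is an invented stand-in for FilterBitsReader::MayMatch):

// Illustrative sketch (not part of this patch): collect pointers to the
// keys that survive domain filtering, probe once for the whole batch, then
// consume the parallel result array.
#include <array>
#include <iostream>
#include <string>

constexpr int kMaxBatch = 4;

// Toy stand-in for FilterBitsReader::MayMatch(num_keys, keys, may_match).
void BatchMayMatch(int n, const std::string** keys, bool* out) {
  for (int i = 0; i < n; ++i) out[i] = (*keys[i])[0] != 'z';  // toy filter
}

int main() {
  std::array<const std::string*, kMaxBatch> keys;
  std::array<bool, kMaxBatch> may_match = {{true}};
  std::string a = "apple", b = "zebra";
  int n = 0;
  keys[n++] = &a;
  keys[n++] = &b;
  BatchMayMatch(n, keys.data(), may_match.data());
  for (int i = 0; i < n; ++i)
    std::cout << *keys[i] << " -> " << (may_match[i] ? "maybe" : "no") << "\n";
}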
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/full_filter_block.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/full_filter_block.h
new file mode 100644
index 0000000000..7b0890d10c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/full_filter_block.h
@@ -0,0 +1,147 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/block_based/filter_block_reader_common.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/parsed_full_filter_block.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FilterPolicy;
+class FilterBitsBuilder;
+class FilterBitsReader;
+
+// A FullFilterBlockBuilder is used to construct a full filter for a
+// particular Table. It generates a single string which is stored as
+// a special block in the Table.
+// The format of full filter block is:
+// +----------------------------------------------------------------+
+// |              full filter for all keys in sst file              |
+// +----------------------------------------------------------------+
+// The full filter can be very large. At the end of it, we put
+// num_probes: how many hash functions are used in bloom filter
+//
+class FullFilterBlockBuilder : public FilterBlockBuilder {
+ public:
+  explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor,
+                                  bool whole_key_filtering,
+                                  FilterBitsBuilder* filter_bits_builder);
+  // No copying allowed
+  FullFilterBlockBuilder(const FullFilterBlockBuilder&) = delete;
+  void operator=(const FullFilterBlockBuilder&) = delete;
+
+  // bits_builder is created in filter_policy; it is passed in here directly
+  // and deleted here.
+  ~FullFilterBlockBuilder() {}
+
+  virtual void Add(const Slice& key_without_ts) override;
+  virtual bool IsEmpty() const override { return !any_added_; }
+  virtual size_t EstimateEntriesAdded() override;
+  virtual Slice Finish(
+      const BlockHandle& tmp, Status* status,
+      std::unique_ptr<const char[]>* filter_data = nullptr) override;
+  using FilterBlockBuilder::Finish;
+
+  virtual void ResetFilterBitsBuilder() override {
+    filter_bits_builder_.reset();
+  }
+
+  virtual Status MaybePostVerifyFilter(const Slice& filter_content) override {
+    return filter_bits_builder_->MaybePostVerify(filter_content);
+  }
+
+ protected:
+  virtual void AddKey(const Slice& key);
+  std::unique_ptr<FilterBitsBuilder> filter_bits_builder_;
+  virtual void Reset();
+  void AddPrefix(const Slice& key);
+  const SliceTransform* prefix_extractor() { return prefix_extractor_; }
+  const std::string& last_prefix_str() const { return last_prefix_str_; }
+
+ private:
+  // important: all of these might point to invalid addresses
+  // at the time of destruction of this filter block. destructor
+  // should NOT dereference them.
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+  bool last_whole_key_recorded_;
+  std::string last_whole_key_str_;
+  bool last_prefix_recorded_;
+  std::string last_prefix_str_;
+  // Whether prefix_extractor_->InDomain(last_whole_key_) is true.
+  // Used in partitioned filters so that the last prefix from the previous
+  // filter partition will be added to the current partition if
+  // last_key_in_domain_ is true, regardless of the current key.
+  bool last_key_in_domain_;
+  bool any_added_;
+  std::unique_ptr<const char[]> filter_data_;
+};
+// A FilterBlockReader is used to parse filter from SST table.
+// KeyMayMatch and PrefixMayMatch would trigger filter checking
+class FullFilterBlockReader
+    : public FilterBlockReaderCommon<ParsedFullFilterBlock> {
+ public:
+  FullFilterBlockReader(const BlockBasedTable* t,
+                        CachableEntry<ParsedFullFilterBlock>&& filter_block);
+
+  static std::unique_ptr<FilterBlockReader> Create(
+      const BlockBasedTable* table, const ReadOptions& ro,
+      FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+      bool pin, BlockCacheLookupContext* lookup_context);
+
+  bool KeyMayMatch(const Slice& key, const bool no_io,
+                   const Slice* const const_ikey_ptr, GetContext* get_context,
+                   BlockCacheLookupContext* lookup_context,
+                   const ReadOptions& read_options) override;
+
+  bool PrefixMayMatch(const Slice& prefix, const bool no_io,
+                      const Slice* const const_ikey_ptr,
+                      GetContext* get_context,
+                      BlockCacheLookupContext* lookup_context,
+                      const ReadOptions& read_options) override;
+
+  void KeysMayMatch(MultiGetRange* range, const bool no_io,
+                    BlockCacheLookupContext* lookup_context,
+                    const ReadOptions& read_options) override;
+  // Used in partitioned filter code
+  void KeysMayMatch2(MultiGetRange* range,
+                     const SliceTransform* /*prefix_extractor*/,
+                     const bool no_io, BlockCacheLookupContext* lookup_context,
+                     const ReadOptions& read_options) {
+    KeysMayMatch(range, no_io, lookup_context, read_options);
+  }
+
+  void PrefixesMayMatch(MultiGetRange* range,
+                        const SliceTransform* prefix_extractor,
+                        const bool no_io,
+                        BlockCacheLookupContext* lookup_context,
+                        const ReadOptions& read_options) override;
+  size_t ApproximateMemoryUsage() const override;
+
+ private:
+  bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context,
+                BlockCacheLookupContext* lookup_context,
+                const ReadOptions& read_options) const;
+  void MayMatch(MultiGetRange* range, bool no_io,
+                const SliceTransform* prefix_extractor,
+                BlockCacheLookupContext* lookup_context,
+                const ReadOptions& read_options) const;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
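For orientation, a hedged usage sketch of the public API these internals serve; these are standard RocksDB calls rather than code from this patch, and the specific parameter values are illustrative only:

// Usage sketch (standard RocksDB API, not part of this patch): a full
// filter block is built when a filter policy is configured on a
// block-based table; whole_key_filtering and the prefix extractor control
// the whole-key and prefix adds seen in FullFilterBlockBuilder::Add.
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"

using namespace ROCKSDB_NAMESPACE;

Options MakeOptions() {
  BlockBasedTableOptions table_options;
  table_options.filter_policy.reset(NewBloomFilterPolicy(10.0));
  table_options.whole_key_filtering = true;  // the default
  Options options;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
  return options;
}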
+#include "table/block_based/hash_index_reader.h" + +#include "table/block_fetcher.h" +#include "table/meta_blocks.h" + +namespace ROCKSDB_NAMESPACE { +Status HashIndexReader::Create(const BlockBasedTable* table, + const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_index_iter, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(index_reader != nullptr); + assert(!pin || prefetch); + + const BlockBasedTable::Rep* rep = table->get_rep(); + assert(rep != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ro, use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + // Note, failure to create prefix hash index does not need to be a + // hard error. We can still fall back to the original binary search index. + // So, Create will succeed regardless, from this point on. + + index_reader->reset(new HashIndexReader(table, std::move(index_block))); + + // Get prefixes block + BlockHandle prefixes_handle; + Status s = + FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, &prefixes_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + // Get index metadata block + BlockHandle prefixes_meta_handle; + s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, + &prefixes_meta_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + RandomAccessFileReader* const file = rep->file.get(); + const Footer& footer = rep->footer; + const ImmutableOptions& ioptions = rep->ioptions; + const PersistentCacheOptions& cache_options = rep->persistent_cache_options; + MemoryAllocator* const memory_allocator = + GetMemoryAllocator(rep->table_options); + + // Read contents for the blocks + BlockContents prefixes_contents; + BlockFetcher prefixes_block_fetcher( + file, prefetch_buffer, footer, ro, prefixes_handle, &prefixes_contents, + ioptions, true /*decompress*/, true /*maybe_compressed*/, + BlockType::kHashIndexPrefixes, UncompressionDict::GetEmptyDict(), + cache_options, memory_allocator); + s = prefixes_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + return s; + } + BlockContents prefixes_meta_contents; + BlockFetcher prefixes_meta_block_fetcher( + file, prefetch_buffer, footer, ro, prefixes_meta_handle, + &prefixes_meta_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexMetadata, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_meta_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + BlockPrefixIndex* prefix_index = nullptr; + assert(rep->table_prefix_extractor); + s = BlockPrefixIndex::Create(rep->table_prefix_extractor.get(), + prefixes_contents.data, + prefixes_meta_contents.data, &prefix_index); + // TODO: log error + if (s.ok()) { + HashIndexReader* const hash_index_reader = + static_cast(index_reader->get()); + hash_index_reader->prefix_index_.reset(prefix_index); + } + + return Status::OK(); +} + +InternalIteratorBase* HashIndexReader::NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + const BlockBasedTable::Rep* rep = table()->get_rep(); + const bool no_io = 
+InternalIteratorBase<IndexValue>* HashIndexReader::NewIterator(
+    const ReadOptions& read_options, bool disable_prefix_seek,
+    IndexBlockIter* iter, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context) {
+  const BlockBasedTable::Rep* rep = table()->get_rep();
+  const bool no_io = (read_options.read_tier == kBlockCacheTier);
+  CachableEntry<Block> index_block;
+  const Status s = GetOrReadIndexBlock(no_io, get_context, lookup_context,
+                                       &index_block, read_options);
+  if (!s.ok()) {
+    if (iter != nullptr) {
+      iter->Invalidate(s);
+      return iter;
+    }
+
+    return NewErrorInternalIterator<IndexValue>(s);
+  }
+
+  Statistics* kNullStats = nullptr;
+  const bool total_order_seek =
+      read_options.total_order_seek || disable_prefix_seek;
+  // We don't return pinned data from index blocks, so no need
+  // to set `block_contents_pinned`.
+  auto it = index_block.GetValue()->NewIndexIterator(
+      internal_comparator()->user_comparator(),
+      rep->get_global_seqno(BlockType::kIndex), iter, kNullStats,
+      total_order_seek, index_has_first_key(), index_key_includes_seq(),
+      index_value_is_full(), false /* block_contents_pinned */,
+      prefix_index_.get());
+
+  assert(it != nullptr);
+  index_block.TransferTo(it);
+
+  return it;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/hash_index_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/hash_index_reader.h
new file mode 100644
index 0000000000..9037efc877
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/hash_index_reader.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "table/block_based/index_reader_common.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Index that leverages an internal hash table to quicken the lookup for a
+// given key.
+class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
+ public:
+  static Status Create(const BlockBasedTable* table, const ReadOptions& ro,
+                       FilePrefetchBuffer* prefetch_buffer,
+                       InternalIterator* meta_index_iter, bool use_cache,
+                       bool prefetch, bool pin,
+                       BlockCacheLookupContext* lookup_context,
+                       std::unique_ptr<IndexReader>* index_reader);
+
+  InternalIteratorBase<IndexValue>* NewIterator(
+      const ReadOptions& read_options, bool disable_prefix_seek,
+      IndexBlockIter* iter, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context) override;
+
+  size_t ApproximateMemoryUsage() const override {
+    size_t usage = ApproximateIndexBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    usage += malloc_usable_size(const_cast<HashIndexReader*>(this));
+#else
+    if (prefix_index_) {
+      usage += prefix_index_->ApproximateMemoryUsage();
+    }
+    usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    return usage;
+  }
+
+ private:
+  HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block)
+      : IndexReaderCommon(t, std::move(index_block)) {}
+
+  std::unique_ptr<BlockPrefixIndex> prefix_index_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_builder.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_builder.cc
new file mode 100644
index 0000000000..0247301782
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_builder.cc
@@ -0,0 +1,282 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based/index_builder.h"
+
+#include <assert.h>
+
+#include <cinttypes>
+#include <list>
+#include <string>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/flush_block_policy.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Create an index builder based on its type.
+IndexBuilder* IndexBuilder::CreateIndexBuilder(
+    BlockBasedTableOptions::IndexType index_type,
+    const InternalKeyComparator* comparator,
+    const InternalKeySliceTransform* int_key_slice_transform,
+    const bool use_value_delta_encoding,
+    const BlockBasedTableOptions& table_opt) {
+  IndexBuilder* result = nullptr;
+  switch (index_type) {
+    case BlockBasedTableOptions::kBinarySearch: {
+      result = new ShortenedIndexBuilder(
+          comparator, table_opt.index_block_restart_interval,
+          table_opt.format_version, use_value_delta_encoding,
+          table_opt.index_shortening, /* include_first_key */ false);
+      break;
+    }
+    case BlockBasedTableOptions::kHashSearch: {
+      // Currently kHashSearch is incompatible with
+      // index_block_restart_interval > 1
+      assert(table_opt.index_block_restart_interval == 1);
+      result = new HashIndexBuilder(
+          comparator, int_key_slice_transform,
+          table_opt.index_block_restart_interval, table_opt.format_version,
+          use_value_delta_encoding, table_opt.index_shortening);
+      break;
+    }
+    case BlockBasedTableOptions::kTwoLevelIndexSearch: {
+      result = PartitionedIndexBuilder::CreateIndexBuilder(
+          comparator, use_value_delta_encoding, table_opt);
+      break;
+    }
+    case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
+      result = new ShortenedIndexBuilder(
+          comparator, table_opt.index_block_restart_interval,
+          table_opt.format_version, use_value_delta_encoding,
+          table_opt.index_shortening, /* include_first_key */ true);
+      break;
+    }
+    default: {
+      assert(!"Do not recognize the index type ");
+      break;
+    }
+  }
+  return result;
+}
+void ShortenedIndexBuilder::FindShortestInternalKeySeparator(
+    const Comparator& comparator, std::string* start, const Slice& limit) {
+  // Attempt to shorten the user portion of the key
+  Slice user_start = ExtractUserKey(*start);
+  Slice user_limit = ExtractUserKey(limit);
+  std::string tmp(user_start.data(), user_start.size());
+  comparator.FindShortestSeparator(&tmp, user_limit);
+  if (tmp.size() <= user_start.size() &&
+      comparator.Compare(user_start, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp,
+               PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
+    assert(InternalKeyComparator(&comparator).Compare(*start, tmp) < 0);
+    assert(InternalKeyComparator(&comparator).Compare(tmp, limit) < 0);
+    start->swap(tmp);
+  }
+}
+
+void ShortenedIndexBuilder::FindShortInternalKeySuccessor(
+    const Comparator& comparator, std::string* key) {
+  Slice user_key = ExtractUserKey(*key);
+  std::string tmp(user_key.data(), user_key.size());
+  comparator.FindShortSuccessor(&tmp);
+  if (tmp.size() <= user_key.size() && comparator.Compare(user_key, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp,
+               PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
+    assert(InternalKeyComparator(&comparator).Compare(*key, tmp) < 0);
+    key->swap(tmp);
+  }
+}
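What the separator shortening above buys is easiest to see on user keys. A standalone sketch (not from this patch; the toy comparator is a simplified bytewise FindShortestSeparator, ignoring edge cases like 0xff bytes that the real one handles):

// Illustrative sketch (not part of this patch): a shortest separator need
// only order between the bounds, not reproduce either of them, so the
// index key can be much shorter than the last key of the data block.
#include <iostream>
#include <string>

// Toy stand-in for Comparator::FindShortestSeparator on bytewise keys.
void FindShortestSeparator(std::string* start, const std::string& limit) {
  size_t i = 0;
  while (i < start->size() && i < limit.size() && (*start)[i] == limit[i]) ++i;
  if (i < start->size() && i < limit.size() &&
      static_cast<unsigned char>((*start)[i]) + 1 <
          static_cast<unsigned char>(limit[i])) {
    (*start)[i]++;
    start->resize(i + 1);
  }
}

int main() {
  std::string start = "the quick brown fox";
  FindShortestSeparator(&start, "the who");
  std::cout << start << "\n";  // "the r" -- short, >= old start, < limit
}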
+PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder(
+    const InternalKeyComparator* comparator,
+    const bool use_value_delta_encoding,
+    const BlockBasedTableOptions& table_opt) {
+  return new PartitionedIndexBuilder(comparator, table_opt,
+                                     use_value_delta_encoding);
+}
+
+PartitionedIndexBuilder::PartitionedIndexBuilder(
+    const InternalKeyComparator* comparator,
+    const BlockBasedTableOptions& table_opt,
+    const bool use_value_delta_encoding)
+    : IndexBuilder(comparator),
+      index_block_builder_(table_opt.index_block_restart_interval,
+                           true /*use_delta_encoding*/,
+                           use_value_delta_encoding),
+      index_block_builder_without_seq_(table_opt.index_block_restart_interval,
+                                       true /*use_delta_encoding*/,
+                                       use_value_delta_encoding),
+      sub_index_builder_(nullptr),
+      table_opt_(table_opt),
+      // We start with false. After each partition we revise the value based
+      // on what the sub_index_builder has decided. If the feature is disabled
+      // entirely, this will be set to true after switching the first
+      // sub_index_builder. Otherwise, it could be set to true even if one of
+      // the sub_index_builders could not safely exclude seq from the keys;
+      // it will then be enforced on all sub_index_builders on ::Finish.
+      seperator_is_key_plus_seq_(false),
+      use_value_delta_encoding_(use_value_delta_encoding) {}
+
+PartitionedIndexBuilder::~PartitionedIndexBuilder() {
+  delete sub_index_builder_;
+}
+
+void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
+  assert(sub_index_builder_ == nullptr);
+  sub_index_builder_ = new ShortenedIndexBuilder(
+      comparator_, table_opt_.index_block_restart_interval,
+      table_opt_.format_version, use_value_delta_encoding_,
+      table_opt_.index_shortening, /* include_first_key */ false);
+
+  // Set sub_index_builder_->seperator_is_key_plus_seq_ to true if
+  // seperator_is_key_plus_seq_ is true (internal-key mode) (set to false by
+  // default on Creation) so that flush policy can point to
+  // sub_index_builder_->index_block_builder_
+  if (seperator_is_key_plus_seq_) {
+    sub_index_builder_->seperator_is_key_plus_seq_ = true;
+  }
+
+  flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+      table_opt_.metadata_block_size, table_opt_.block_size_deviation,
+      // Note: this is sub-optimal since sub_index_builder_ could later reset
+      // seperator_is_key_plus_seq_ but the probability of that is low.
+      sub_index_builder_->seperator_is_key_plus_seq_
+          ? sub_index_builder_->index_block_builder_
+          : sub_index_builder_->index_block_builder_without_seq_));
+  partition_cut_requested_ = false;
+}
+
+void PartitionedIndexBuilder::RequestPartitionCut() {
+  partition_cut_requested_ = true;
+}
+
+void PartitionedIndexBuilder::AddIndexEntry(
+    std::string* last_key_in_current_block,
+    const Slice* first_key_in_next_block, const BlockHandle& block_handle) {
+  // Note: to avoid two consecutive flushes in the same method call, we do not
+  // check flush policy when adding the last key
+  if (UNLIKELY(first_key_in_next_block == nullptr)) {  // no more keys
+    if (sub_index_builder_ == nullptr) {
+      MakeNewSubIndexBuilder();
+    }
+    sub_index_builder_->AddIndexEntry(last_key_in_current_block,
+                                      first_key_in_next_block, block_handle);
+    if (!seperator_is_key_plus_seq_ &&
+        sub_index_builder_->seperator_is_key_plus_seq_) {
+      // then we need to apply it to all sub-index builders and reset
+      // flush_policy to point to the Block Builder of sub_index_builder_ that
+      // stores internal keys.
+      seperator_is_key_plus_seq_ = true;
+      flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+          table_opt_.metadata_block_size, table_opt_.block_size_deviation,
+          sub_index_builder_->index_block_builder_));
+    }
+    sub_index_last_key_ = std::string(*last_key_in_current_block);
+    entries_.push_back(
+        {sub_index_last_key_,
+         std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
+    sub_index_builder_ = nullptr;
+    cut_filter_block = true;
+  } else {
+    // apply flush policy only to non-empty sub_index_builder_
+    if (sub_index_builder_ != nullptr) {
+      std::string handle_encoding;
+      block_handle.EncodeTo(&handle_encoding);
+      bool do_flush =
+          partition_cut_requested_ ||
+          flush_policy_->Update(*last_key_in_current_block, handle_encoding);
+      if (do_flush) {
+        entries_.push_back(
+            {sub_index_last_key_,
+             std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
+        cut_filter_block = true;
+        sub_index_builder_ = nullptr;
+      }
+    }
+    if (sub_index_builder_ == nullptr) {
+      MakeNewSubIndexBuilder();
+    }
+    sub_index_builder_->AddIndexEntry(last_key_in_current_block,
+                                      first_key_in_next_block, block_handle);
+    sub_index_last_key_ = std::string(*last_key_in_current_block);
+    if (!seperator_is_key_plus_seq_ &&
+        sub_index_builder_->seperator_is_key_plus_seq_) {
+      // then we need to apply it to all sub-index builders and reset
+      // flush_policy to point to the Block Builder of sub_index_builder_ that
+      // stores internal keys.
+      seperator_is_key_plus_seq_ = true;
+      flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+          table_opt_.metadata_block_size, table_opt_.block_size_deviation,
+          sub_index_builder_->index_block_builder_));
+    }
+  }
+}
+Status PartitionedIndexBuilder::Finish(
+    IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) {
+  if (partition_cnt_ == 0) {
+    partition_cnt_ = entries_.size();
+  }
+  // It must be set to null after last key is added
+  assert(sub_index_builder_ == nullptr);
+  if (finishing_indexes == true) {
+    Entry& last_entry = entries_.front();
+    std::string handle_encoding;
+    last_partition_block_handle.EncodeTo(&handle_encoding);
+    std::string handle_delta_encoding;
+    PutVarsignedint64(
+        &handle_delta_encoding,
+        last_partition_block_handle.size() - last_encoded_handle_.size());
+    last_encoded_handle_ = last_partition_block_handle;
+    const Slice handle_delta_encoding_slice(handle_delta_encoding);
+    index_block_builder_.Add(last_entry.key, handle_encoding,
+                             &handle_delta_encoding_slice);
+    if (!seperator_is_key_plus_seq_) {
+      index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key),
                                           handle_encoding,
+                                           &handle_delta_encoding_slice);
+    }
+    entries_.pop_front();
+  }
+  // If there is no sub_index left, then return the 2nd level index.
+  if (UNLIKELY(entries_.empty())) {
+    if (seperator_is_key_plus_seq_) {
+      index_blocks->index_block_contents = index_block_builder_.Finish();
+    } else {
+      index_blocks->index_block_contents =
+          index_block_builder_without_seq_.Finish();
+    }
+    top_level_index_size_ = index_blocks->index_block_contents.size();
+    index_size_ += top_level_index_size_;
+    return Status::OK();
+  } else {
+    // Finish the next partition index in line and Incomplete() to indicate we
+    // expect more calls to Finish
+    Entry& entry = entries_.front();
+    // Apply the policy to all sub-indexes
+    entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_;
+    auto s = entry.value->Finish(index_blocks);
+    index_size_ += index_blocks->index_block_contents.size();
+    finishing_indexes = true;
+    return s.ok() ? Status::Incomplete() : s;
+  }
+}
+
+size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; }
+}  // namespace ROCKSDB_NAMESPACE
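The repeated-Finish protocol above is easiest to grasp from the caller's side. A self-contained toy model (not from this patch; all names here are invented for the sketch): each call that reports "incomplete" hands back one partition block, and the final call hands back the top-level index.

// Illustrative sketch (not part of this patch): a toy model of the
// two-phase Finish protocol used by PartitionedIndexBuilder.
#include <iostream>
#include <queue>
#include <string>

struct ToyPartitionedBuilder {
  std::queue<std::string> partitions;
  // Returns true while incomplete; the last call emits the top-level index.
  bool Finish(std::string* out) {
    if (!partitions.empty()) {
      *out = partitions.front();
      partitions.pop();
      return true;
    }
    *out = "top-level-index";
    return false;
  }
};

int main() {
  ToyPartitionedBuilder b;
  b.partitions.push("partition-0");
  b.partitions.push("partition-1");
  std::string block;
  while (b.Finish(&block)) {
    std::cout << "write " << block << "\n";  // would also record its handle
  }
  std::cout << "write " << block << "\n";    // finally the 2nd-level index
}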
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_builder.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_builder.h
new file mode 100644
index 0000000000..dd3be03316
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_builder.h
@@ -0,0 +1,455 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <assert.h>
+
+#include <list>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/comparator.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The interface for building index.
+// Instruction for adding a new concrete IndexBuilder:
+//  1. Create a subclass of IndexBuilder.
+//  2. Add a new entry associated with that subclass in TableOptions::IndexType.
+//  3. Add a create function for the new subclass in CreateIndexBuilder.
+// Note: we could devise a more advanced design to simplify the process of
+// adding a new subclass, but that would increase code complexity and draw
+// unwanted attention from readers. Given that we won't add/change indexes
+// frequently, it makes sense to just embrace a more straightforward design
+// that just works.
+class IndexBuilder {
+ public:
+  static IndexBuilder* CreateIndexBuilder(
+      BlockBasedTableOptions::IndexType index_type,
+      const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator,
+      const InternalKeySliceTransform* int_key_slice_transform,
+      const bool use_value_delta_encoding,
+      const BlockBasedTableOptions& table_opt);
+
+  // Index builder will construct a set of blocks which contain:
+  //  1. One primary index block.
+  //  2. (Optional) a set of metablocks that contains the metadata of the
+  //     primary index.
+  struct IndexBlocks {
+    Slice index_block_contents;
+    std::unordered_map<std::string, Slice> meta_blocks;
+  };
+  explicit IndexBuilder(const InternalKeyComparator* comparator)
+      : comparator_(comparator) {}
+
+  virtual ~IndexBuilder() {}
+
+  // Add a new index entry to index block.
+  // To allow further optimization, we provide `last_key_in_current_block` and
+  // `first_key_in_next_block`, based on which the specific implementation can
+  // determine the best index key to be used for the index block.
+  // Called before the OnKeyAdded() call for first_key_in_next_block.
+  // @last_key_in_current_block: this parameter may be overridden with the
+  //                             value "substitute key".
+  // @first_key_in_next_block: it will be nullptr if the entry being added is
+  //                           the last one in the table
+  //
+  // REQUIRES: Finish() has not yet been called.
+  virtual void AddIndexEntry(std::string* last_key_in_current_block,
+                             const Slice* first_key_in_next_block,
+                             const BlockHandle& block_handle) = 0;
+
+  // This method will be called whenever a key is added. The subclasses may
+  // override OnKeyAdded() if they need to collect additional information.
+  virtual void OnKeyAdded(const Slice& /*key*/) {}
+
+  // Inform the index builder that all entries have been written. Block
+  // builder may therefore perform any operation required for block
+  // finalization.
+  //
+  // REQUIRES: Finish() has not yet been called.
+  inline Status Finish(IndexBlocks* index_blocks) {
+    // Throw away the changes to last_partition_block_handle. It has no effect
+    // on the first call to Finish anyway.
+    BlockHandle last_partition_block_handle;
+    return Finish(index_blocks, last_partition_block_handle);
+  }
+
+  // This override of Finish can be utilized to build the 2nd level index in
+  // PartitionIndexBuilder.
+  //
+  // index_blocks will be filled with the resulting index data. If the return
+  // value is Status::Incomplete() then it means that the index is partitioned
+  // and the caller should keep calling Finish until Status::OK() is returned.
+  // In that case, last_partition_block_handle is pointer to the block written
+  // with the result of the last call to Finish. This can be utilized to build
+  // the second level index pointing to each block of partitioned indexes. The
+  // last call to Finish() that returns Status::OK() populates index_blocks
+  // with the 2nd level index content.
+  virtual Status Finish(IndexBlocks* index_blocks,
+                        const BlockHandle& last_partition_block_handle) = 0;
+  // Get the size for index block. Must be called after ::Finish.
+  virtual size_t IndexSize() const = 0;
+
+  virtual bool seperator_is_key_plus_seq() { return true; }
+
+ protected:
+  const InternalKeyComparator* comparator_;
+  // Set after ::Finish is called
+  size_t index_size_ = 0;
+};
+
+// This index builder builds space-efficient index block.
+//
+// Optimizations:
+//  1. Set the block's `block_restart_interval` to 1, which avoids a linear
+//     search when doing index lookup (can be disabled by setting
+//     index_block_restart_interval).
+//  2. Shorten the key length for index block. Other than honestly using the
+//     last key in the data block as the index key, we instead find a shortest
+//     substitute key that serves the same function.
+class ShortenedIndexBuilder : public IndexBuilder {
+ public:
+  explicit ShortenedIndexBuilder(
+      const InternalKeyComparator* comparator,
+      const int index_block_restart_interval, const uint32_t format_version,
+      const bool use_value_delta_encoding,
+      BlockBasedTableOptions::IndexShorteningMode shortening_mode,
+      bool include_first_key)
+      : IndexBuilder(comparator),
+        index_block_builder_(index_block_restart_interval,
+                             true /*use_delta_encoding*/,
+                             use_value_delta_encoding),
+        index_block_builder_without_seq_(index_block_restart_interval,
+                                         true /*use_delta_encoding*/,
+                                         use_value_delta_encoding),
+        use_value_delta_encoding_(use_value_delta_encoding),
+        include_first_key_(include_first_key),
+        shortening_mode_(shortening_mode) {
+    // Making the default true will disable the feature for old versions
+    seperator_is_key_plus_seq_ = (format_version <= 2);
+  }
+
+  virtual void OnKeyAdded(const Slice& key) override {
+    if (include_first_key_ && current_block_first_internal_key_.empty()) {
+      current_block_first_internal_key_.assign(key.data(), key.size());
+    }
+  }
+
+  virtual void AddIndexEntry(std::string* last_key_in_current_block,
+                             const Slice* first_key_in_next_block,
+                             const BlockHandle& block_handle) override {
+    if (first_key_in_next_block != nullptr) {
+      if (shortening_mode_ !=
+          BlockBasedTableOptions::IndexShorteningMode::kNoShortening) {
+        FindShortestInternalKeySeparator(*comparator_->user_comparator(),
+                                         last_key_in_current_block,
+                                         *first_key_in_next_block);
+      }
+      if (!seperator_is_key_plus_seq_ &&
+          comparator_->user_comparator()->Compare(
+              ExtractUserKey(*last_key_in_current_block),
+              ExtractUserKey(*first_key_in_next_block)) == 0) {
+        seperator_is_key_plus_seq_ = true;
+      }
+    } else {
+      if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode::
+                                  kShortenSeparatorsAndSuccessor) {
+        FindShortInternalKeySuccessor(*comparator_->user_comparator(),
+                                      last_key_in_current_block);
+      }
+    }
+    auto sep = Slice(*last_key_in_current_block);
+
+    assert(!include_first_key_ || !current_block_first_internal_key_.empty());
+    IndexValue entry(block_handle, current_block_first_internal_key_);
+    std::string encoded_entry;
+    std::string delta_encoded_entry;
+    entry.EncodeTo(&encoded_entry, include_first_key_, nullptr);
+    if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) {
+      entry.EncodeTo(&delta_encoded_entry, include_first_key_,
+                     &last_encoded_handle_);
+    } else {
+      // If it's the first block, or delta encoding is disabled,
+      // BlockBuilder::Add() below won't use delta-encoded slice.
+    }
+    last_encoded_handle_ = block_handle;
+    const Slice delta_encoded_entry_slice(delta_encoded_entry);
+    index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice);
+    if (!seperator_is_key_plus_seq_) {
+      index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry,
+                                           &delta_encoded_entry_slice);
+    }
+
+    current_block_first_internal_key_.clear();
+  }
+
+  using IndexBuilder::Finish;
+  virtual Status Finish(
+      IndexBlocks* index_blocks,
+      const BlockHandle& /*last_partition_block_handle*/) override {
+    if (seperator_is_key_plus_seq_) {
+      index_blocks->index_block_contents = index_block_builder_.Finish();
+    } else {
+      index_blocks->index_block_contents =
+          index_block_builder_without_seq_.Finish();
+    }
+    index_size_ = index_blocks->index_block_contents.size();
+    return Status::OK();
+  }
+
+  virtual size_t IndexSize() const override { return index_size_; }
+
+  virtual bool seperator_is_key_plus_seq() override {
+    return seperator_is_key_plus_seq_;
+  }
+
+  // Changes *key to a short string >= *key.
+  //
+  static void FindShortestInternalKeySeparator(const Comparator& comparator,
+                                               std::string* start,
+                                               const Slice& limit);
+
+  static void FindShortInternalKeySuccessor(const Comparator& comparator,
+                                            std::string* key);
+
+  friend class PartitionedIndexBuilder;
+
+ private:
+  BlockBuilder index_block_builder_;
+  BlockBuilder index_block_builder_without_seq_;
+  const bool use_value_delta_encoding_;
+  bool seperator_is_key_plus_seq_;
+  const bool include_first_key_;
+  BlockBasedTableOptions::IndexShorteningMode shortening_mode_;
+  BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle();
+  std::string current_block_first_internal_key_;
+};
+// HashIndexBuilder contains a binary-searchable primary index and the
+// metadata for secondary hash index construction.
+// The metadata for hash index consists of two parts:
+// - a metablock that compactly contains a sequence of prefixes. All prefixes
+//   are stored consecutively without any metadata (like, prefix sizes) being
+//   stored, which is kept in the other metablock.
+// - a metablock that contains the metadata of the prefixes, including prefix
+//   size, restart index and number of blocks it spans. The format looks like:
+//
+// +-----------------+---------------------------+---------------------+
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | <= prefix 1
+// +-----------------+---------------------------+---------------------+
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | <= prefix 2
+// +-----------------+---------------------------+---------------------+
+// |                                                                   |
+// |                                ....                               |
+// |                                                                   |
+// +-----------------+---------------------------+---------------------+
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | <= prefix n
+// +-----------------+---------------------------+---------------------+
+//
+// The reason for separating these two metablocks is to enable efficient
+// reuse of the first metablock during hash index construction without
+// unnecessary data copy or small heap allocations for prefixes.
+class HashIndexBuilder : public IndexBuilder {
+ public:
+  explicit HashIndexBuilder(
+      const InternalKeyComparator* comparator,
+      const SliceTransform* hash_key_extractor,
+      int index_block_restart_interval, int format_version,
+      bool use_value_delta_encoding,
+      BlockBasedTableOptions::IndexShorteningMode shortening_mode)
+      : IndexBuilder(comparator),
+        primary_index_builder_(comparator, index_block_restart_interval,
+                               format_version, use_value_delta_encoding,
+                               shortening_mode, /* include_first_key */ false),
+        hash_key_extractor_(hash_key_extractor) {}
+
+  virtual void AddIndexEntry(std::string* last_key_in_current_block,
+                             const Slice* first_key_in_next_block,
+                             const BlockHandle& block_handle) override {
+    ++current_restart_index_;
+    primary_index_builder_.AddIndexEntry(last_key_in_current_block,
+                                         first_key_in_next_block, block_handle);
+  }
+
+  virtual void OnKeyAdded(const Slice& key) override {
+    auto key_prefix = hash_key_extractor_->Transform(key);
+    bool is_first_entry = pending_block_num_ == 0;
+
+    // Keys may share the prefix
+    if (is_first_entry || pending_entry_prefix_ != key_prefix) {
+      if (!is_first_entry) {
+        FlushPendingPrefix();
+      }
+
+      // need a hard copy otherwise the underlying data changes all the time.
+      // TODO(kailiu) std::to_string() is expensive. We may be able to speed
+      // this up by avoiding the data copy.
+      pending_entry_prefix_ = key_prefix.ToString();
+      pending_block_num_ = 1;
+      pending_entry_index_ = static_cast<uint32_t>(current_restart_index_);
+    } else {
+      // The entry count increments when keys sharing the prefix reside in
+      // different data blocks.
+      auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
+      assert(last_restart_index <= current_restart_index_);
+      if (last_restart_index != current_restart_index_) {
+        ++pending_block_num_;
+      }
+    }
+  }
+
+  virtual Status Finish(
+      IndexBlocks* index_blocks,
+      const BlockHandle& last_partition_block_handle) override {
+    if (pending_block_num_ != 0) {
+      FlushPendingPrefix();
+    }
+    Status s = primary_index_builder_.Finish(index_blocks,
+                                             last_partition_block_handle);
+    index_blocks->meta_blocks.insert(
+        {kHashIndexPrefixesBlock.c_str(), prefix_block_});
+    index_blocks->meta_blocks.insert(
+        {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
+    return s;
+  }
+
+  virtual size_t IndexSize() const override {
+    return primary_index_builder_.IndexSize() + prefix_block_.size() +
+           prefix_meta_block_.size();
+  }
+
+  virtual bool seperator_is_key_plus_seq() override {
+    return primary_index_builder_.seperator_is_key_plus_seq();
+  }
+
+ private:
+  void FlushPendingPrefix() {
+    prefix_block_.append(pending_entry_prefix_.data(),
+                         pending_entry_prefix_.size());
+    PutVarint32Varint32Varint32(
+        &prefix_meta_block_,
+        static_cast<uint32_t>(pending_entry_prefix_.size()),
+        pending_entry_index_, pending_block_num_);
+  }
+
+  ShortenedIndexBuilder primary_index_builder_;
+  const SliceTransform* hash_key_extractor_;
+
+  // stores a sequence of prefixes
+  std::string prefix_block_;
+  // stores the metadata of prefixes
+  std::string prefix_meta_block_;
+
+  // The following 3 variables keep the unflushed prefix and its metadata.
+  // The details of block_num and entry_index can be found in
+  // "block_hash_index.{h,cc}"
+  uint32_t pending_block_num_ = 0;
+  uint32_t pending_entry_index_ = 0;
+  std::string pending_entry_prefix_;
+
+  uint64_t current_restart_index_ = 0;
+};
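The two-metablock layout documented above is simple to model. A standalone sketch (not from this patch; plain integers stand in for the varint encoding used by PutVarint32Varint32Varint32, and the names are invented):

// Illustrative sketch (not part of this patch): prefixes are concatenated
// raw in one block and described by (length, restart-index, block-count)
// triples in the other, so the prefix block can be scanned with no copies.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct PrefixMeta {
  uint32_t length;
  uint32_t restart_index;
  uint32_t num_blocks;
};

int main() {
  std::string prefix_block;            // raw prefixes, back to back
  std::vector<PrefixMeta> meta_block;  // one triple per prefix
  auto flush = [&](const std::string& p, uint32_t restart, uint32_t blocks) {
    prefix_block.append(p);
    meta_block.push_back({static_cast<uint32_t>(p.size()), restart, blocks});
  };
  flush("app", 0, 2);  // keys with prefix "app" span index entries 0..1
  flush("bar", 2, 1);
  std::cout << prefix_block << "\n";  // "appbar" -- no separators needed
  std::cout << meta_block.size() << " prefix entries\n";
}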
Internally it creates a new index for + * each partition and Finish then in order when Finish is called on it + * continiously until Status::OK() is returned. + * + * The format on the disk would be I I I I I I IP where I is block containing a + * partition of indexes built using ShortenedIndexBuilder and IP is a block + * containing a secondary index on the partitions, built using + * ShortenedIndexBuilder. + */ +class PartitionedIndexBuilder : public IndexBuilder { + public: + static PartitionedIndexBuilder* CreateIndexBuilder( + const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator, + const bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt); + + explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding); + + virtual ~PartitionedIndexBuilder(); + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override; + + virtual Status Finish( + IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) override; + + virtual size_t IndexSize() const override { return index_size_; } + size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; } + size_t NumPartitions() const; + + inline bool ShouldCutFilterBlock() { + // Current policy is to align the partitions of index and filters + if (cut_filter_block) { + cut_filter_block = false; + return true; + } + return false; + } + + std::string& GetPartitionKey() { return sub_index_last_key_; } + + // Called when an external entity (such as filter partition builder) request + // cutting the next partition + void RequestPartitionCut(); + + virtual bool seperator_is_key_plus_seq() override { + return seperator_is_key_plus_seq_; + } + + bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } + + private: + // Set after ::Finish is called + size_t top_level_index_size_ = 0; + // Set after ::Finish is called + size_t partition_cnt_ = 0; + + void MakeNewSubIndexBuilder(); + + struct Entry { + std::string key; + std::unique_ptr value; + }; + std::list entries_; // list of partitioned indexes and their keys + BlockBuilder index_block_builder_; // top-level index builder + BlockBuilder index_block_builder_without_seq_; // same for user keys + // the active partition index builder + ShortenedIndexBuilder* sub_index_builder_; + // the last key in the active partition index builder + std::string sub_index_last_key_; + std::unique_ptr flush_policy_; + // true if Finish is called once but not complete yet. + bool finishing_indexes = false; + const BlockBasedTableOptions& table_opt_; + bool seperator_is_key_plus_seq_; + bool use_value_delta_encoding_; + // true if an external entity (such as filter partition builder) request + // cutting the next partition + bool partition_cut_requested_ = true; + // true if it should cut the next filter partition block + bool cut_filter_block = false; + BlockHandle last_encoded_handle_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_reader_common.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_reader_common.cc new file mode 100644 index 0000000000..828200299d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_reader_common.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/index_reader_common.h" + +#include "block_cache.h" + +namespace ROCKSDB_NAMESPACE { +Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block) { + PERF_TIMER_GUARD(read_index_block_nanos); + + assert(table != nullptr); + assert(index_block != nullptr); + assert(index_block->IsEmpty()); + + const Rep* const rep = table->get_rep(); + assert(rep != nullptr); + + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->footer.index_handle(), + UncompressionDict::GetEmptyDict(), &index_block->As(), + get_context, lookup_context, /* for_compaction */ false, use_cache, + /* async_read */ false); + + return s; +} + +Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* index_block, + const ReadOptions& ro) const { + assert(index_block != nullptr); + + if (!index_block_.IsEmpty()) { + index_block->SetUnownedValue(index_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options = ro; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, + cache_index_blocks(), get_context, lookup_context, + index_block); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_reader_common.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_reader_common.h new file mode 100644 index 0000000000..d3827fc6bc --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/index_reader_common.h @@ -0,0 +1,85 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Encapsulates common functionality for the various index reader +// implementations. Provides access to the index block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. 
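+//
+// Illustrative sketch (not part of the vendored source): a derived index
+// reader typically obtains the block like this, where `ro` and `ctx` are
+// hypothetical locals:
+//
+//   CachableEntry<Block> index_block;
+//   Status s = GetOrReadIndexBlock(/*no_io=*/false, /*get_context=*/nullptr,
+//                                  &ctx, &index_block, ro);
+//
+// When the reader holds a pinned copy, the returned entry is unowned and no
+// cache lookup or IO is performed.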
+class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { + public: + IndexReaderCommon(const BlockBasedTable* t, + CachableEntry&& index_block) + : table_(t), index_block_(std::move(index_block)) { + assert(table_ != nullptr); + } + + protected: + static Status ReadIndexBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block); + + const BlockBasedTable* table() const { return table_; } + + const InternalKeyComparator* internal_comparator() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + return &table_->get_rep()->internal_comparator; + } + + bool index_has_first_key() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_has_first_key; + } + + bool index_key_includes_seq() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_key_includes_seq; + } + + bool index_value_is_full() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_value_is_full; + } + + bool cache_index_blocks() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->table_options.cache_index_and_filter_blocks; + } + + Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block, + const ReadOptions& read_options) const; + + size_t ApproximateIndexBlockMemoryUsage() const { + assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr); + return index_block_.GetOwnValue() + ? index_block_.GetValue()->ApproximateMemoryUsage() + : 0; + } + + private: + const BlockBasedTable* table_; + CachableEntry index_block_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/mock_block_based_table.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/mock_block_based_table.h new file mode 100644 index 0000000000..13f3dfaee1 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/mock_block_based_table.h @@ -0,0 +1,62 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include + +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/filter_policy_internal.h" + +namespace ROCKSDB_NAMESPACE { +namespace mock { + +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} +}; + +class MockBlockBasedTableTester { + static constexpr int kMockLevel = 0; + + public: + Options options_; + ImmutableOptions ioptions_; + EnvOptions env_options_; + BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr table_; + + explicit MockBlockBasedTableTester(const FilterPolicy* filter_policy) + : MockBlockBasedTableTester( + std::shared_ptr(filter_policy)){}; + + explicit MockBlockBasedTableTester( + std::shared_ptr filter_policy) + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.filter_policy = std::move(filter_policy); + + constexpr bool skip_filters = false; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable(new BlockBasedTable::Rep( + ioptions_, env_options_, table_options_, icomp_, skip_filters, + 12345 /*file_size*/, kMockLevel, immortal_table))); + } + + FilterBitsBuilder* GetBuilder() const { + FilterBuildingContext context(table_options_); + context.column_family_name = "mock_cf"; + context.compaction_style = ioptions_.compaction_style; + context.level_at_creation = kMockLevel; + context.info_log = ioptions_.logger; + return BloomFilterPolicy::GetBuilderFromContext(context); + } +}; + +} // namespace mock +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/parsed_full_filter_block.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/parsed_full_filter_block.cc new file mode 100644 index 0000000000..9184a48d23 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/parsed_full_filter_block.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/parsed_full_filter_block.h" + +#include "table/block_based/filter_policy_internal.h" + +namespace ROCKSDB_NAMESPACE { + +ParsedFullFilterBlock::ParsedFullFilterBlock(const FilterPolicy* filter_policy, + BlockContents&& contents) + : block_contents_(std::move(contents)), + filter_bits_reader_( + !block_contents_.data.empty() + ? filter_policy->GetFilterBitsReader(block_contents_.data) + : nullptr) {} + +ParsedFullFilterBlock::~ParsedFullFilterBlock() = default; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/parsed_full_filter_block.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/parsed_full_filter_block.h new file mode 100644 index 0000000000..8d81868d1b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/parsed_full_filter_block.h @@ -0,0 +1,47 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include + +#include "table/block_based/block_type.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +class FilterBitsReader; +class FilterPolicy; + +// The sharable/cachable part of the full filter. +class ParsedFullFilterBlock { + public: + ParsedFullFilterBlock(const FilterPolicy* filter_policy, + BlockContents&& contents); + ~ParsedFullFilterBlock(); + + FilterBitsReader* filter_bits_reader() const { + return filter_bits_reader_.get(); + } + + // TODO: consider memory usage of the FilterBitsReader + size_t ApproximateMemoryUsage() const { + return block_contents_.ApproximateMemoryUsage(); + } + + bool own_bytes() const { return block_contents_.own_bytes(); } + + // For TypedCacheInterface + const Slice& ContentSlice() const { return block_contents_.data; } + static constexpr CacheEntryRole kCacheEntryRole = + CacheEntryRole::kFilterBlock; + static constexpr BlockType kBlockType = BlockType::kFilter; + + private: + BlockContents block_contents_; + std::unique_ptr filter_bits_reader_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_filter_block.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_filter_block.cc new file mode 100644 index 0000000000..daefa49f4e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_filter_block.cc @@ -0,0 +1,554 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/block_based/partitioned_filter_block.h" + +#include + +#include "block_cache.h" +#include "block_type.h" +#include "file/random_access_file_reader.h" +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( + const SliceTransform* _prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size) + : FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering, + filter_bits_builder), + index_on_filter_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_on_filter_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + p_index_builder_(p_index_builder), + keys_added_to_partition_(0), + total_added_in_built_(0) { + keys_per_partition_ = static_cast( + filter_bits_builder_->ApproximateNumEntries(partition_size)); + if (keys_per_partition_ < 1) { + // partition_size (minus buffer, ~10%) might be smaller than minimum + // filter size, sometimes based on cache line size. Try to find that + // minimum size without CalculateSpace (not necessarily available). 
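+    // The loop below is an exponential search for the smallest candidate
+    // size the builder accepts: it grows the candidate by 25% per iteration
+    // until ApproximateNumEntries reports at least one entry. The
+    // 100000-byte cap is a safety valve against a broken implementation;
+    // past it we fall back to assuming one key per byte.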
+    uint32_t larger = std::max(partition_size + 4, uint32_t{16});
+    for (;;) {
+      keys_per_partition_ = static_cast<uint32_t>(
+          filter_bits_builder_->ApproximateNumEntries(larger));
+      if (keys_per_partition_ >= 1) {
+        break;
+      }
+      larger += larger / 4;
+      if (larger > 100000) {
+        // Might be a broken implementation. Substitute something reasonable:
+        // 1 key / byte.
+        keys_per_partition_ = partition_size;
+        break;
+      }
+    }
+  }
+}
+
+PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {
+  partitioned_filters_construction_status_.PermitUncheckedError();
+}
+
+void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
+    const Slice* next_key) {
+  // Use == to send the request only once
+  if (keys_added_to_partition_ == keys_per_partition_) {
+    // Currently only the index builder is in charge of cutting a partition.
+    // We keep requesting until it is granted.
+    p_index_builder_->RequestPartitionCut();
+  }
+  if (!p_index_builder_->ShouldCutFilterBlock()) {
+    return;
+  }
+
+  // Add the prefix of the next key before finishing the partition without
+  // updating last_prefix_str_. This hack fixes a bug with format_version=3
+  // where seeking for the prefix would lead us to the previous partition.
+  const bool maybe_add_prefix =
+      next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key);
+  if (maybe_add_prefix) {
+    const Slice next_key_prefix = prefix_extractor()->Transform(*next_key);
+    if (next_key_prefix.compare(last_prefix_str()) != 0) {
+      AddKey(next_key_prefix);
+    }
+  }
+
+  total_added_in_built_ += filter_bits_builder_->EstimateEntriesAdded();
+  std::unique_ptr<const char[]> filter_data;
+  Status filter_construction_status = Status::OK();
+  Slice filter =
+      filter_bits_builder_->Finish(&filter_data, &filter_construction_status);
+  if (filter_construction_status.ok()) {
+    filter_construction_status = filter_bits_builder_->MaybePostVerify(filter);
+  }
+  std::string& index_key = p_index_builder_->GetPartitionKey();
+  filters.push_back({index_key, std::move(filter_data), filter});
+  if (!filter_construction_status.ok() &&
+      partitioned_filters_construction_status_.ok()) {
+    partitioned_filters_construction_status_ = filter_construction_status;
+  }
+  keys_added_to_partition_ = 0;
+  Reset();
+}
+
+void PartitionedFilterBlockBuilder::Add(const Slice& key) {
+  MaybeCutAFilterBlock(&key);
+  FullFilterBlockBuilder::Add(key);
+}
+
+void PartitionedFilterBlockBuilder::AddKey(const Slice& key) {
+  FullFilterBlockBuilder::AddKey(key);
+  keys_added_to_partition_++;
+}
+
+size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() {
+  return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded();
+}
+
+Slice PartitionedFilterBlockBuilder::Finish(
+    const BlockHandle& last_partition_block_handle, Status* status,
+    std::unique_ptr<const char[]>* filter_data) {
+  if (finishing_filters == true) {
+    // Record the handle of the last written filter block in the index
+    std::string handle_encoding;
+    last_partition_block_handle.EncodeTo(&handle_encoding);
+    std::string handle_delta_encoding;
+    PutVarsignedint64(
+        &handle_delta_encoding,
+        last_partition_block_handle.size() - last_encoded_handle_.size());
+    last_encoded_handle_ = last_partition_block_handle;
+    const Slice handle_delta_encoding_slice(handle_delta_encoding);
+    index_on_filter_block_builder_.Add(last_filter_entry_key, handle_encoding,
+                                       &handle_delta_encoding_slice);
+    if (!p_index_builder_->seperator_is_key_plus_seq()) {
+      index_on_filter_block_builder_without_seq_.Add(
+          ExtractUserKey(last_filter_entry_key), handle_encoding,
+
&handle_delta_encoding_slice); + } + } else { + MaybeCutAFilterBlock(nullptr); + } + + if (!partitioned_filters_construction_status_.ok()) { + *status = partitioned_filters_construction_status_; + return Slice(); + } + + // If there is no filter partition left, then return the index on filter + // partitions + if (UNLIKELY(filters.empty())) { + *status = Status::OK(); + last_filter_data.reset(); + if (finishing_filters) { + // Simplest to just add them all at the end + total_added_in_built_ = 0; + if (p_index_builder_->seperator_is_key_plus_seq()) { + return index_on_filter_block_builder_.Finish(); + } else { + return index_on_filter_block_builder_without_seq_.Finish(); + } + } else { + // This is the rare case where no key was added to the filter + return Slice(); + } + } else { + // Return the next filter partition in line and set Incomplete() status to + // indicate we expect more calls to Finish + *status = Status::Incomplete(); + finishing_filters = true; + + last_filter_entry_key = filters.front().key; + Slice filter = filters.front().filter; + last_filter_data = std::move(filters.front().filter_data); + if (filter_data != nullptr) { + *filter_data = std::move(last_filter_data); + } + filters.pop_front(); + return filter; + } +} + +PartitionedFilterBlockReader::PartitionedFilterBlockReader( + const BlockBasedTable* t, + CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) {} + +std::unique_ptr PartitionedFilterBlockReader::Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache, + nullptr /* get_context */, lookup_context, + &filter_block); + if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr( + new PartitionedFilterBlockReader(table, std::move(filter_block))); +} + +bool PartitionedFilterBlockReader::KeyMayMatch( + const Slice& key, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) { + assert(const_ikey_ptr != nullptr); + if (!whole_key_filtering()) { + return true; + } + + return MayMatch(key, no_io, const_ikey_ptr, get_context, lookup_context, + read_options, &FullFilterBlockReader::KeyMayMatch); +} + +void PartitionedFilterBlockReader::KeysMayMatch( + MultiGetRange* range, const bool no_io, + BlockCacheLookupContext* lookup_context, const ReadOptions& read_options) { + if (!whole_key_filtering()) { + return; // Any/all may match + } + + MayMatch(range, nullptr, no_io, lookup_context, read_options, + &FullFilterBlockReader::KeysMayMatch2); +} + +bool PartitionedFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) { + assert(const_ikey_ptr != nullptr); + return MayMatch(prefix, no_io, const_ikey_ptr, get_context, lookup_context, + read_options, &FullFilterBlockReader::PrefixMayMatch); +} + +void PartitionedFilterBlockReader::PrefixesMayMatch( + MultiGetRange* range, const SliceTransform* prefix_extractor, + const bool no_io, 
BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) { + assert(prefix_extractor); + MayMatch(range, prefix_extractor, no_io, lookup_context, read_options, + &FullFilterBlockReader::PrefixesMayMatch); +} + +BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( + const CachableEntry& filter_block, + const Slice& entry) const { + IndexBlockIter iter; + const InternalKeyComparator* const comparator = internal_comparator(); + Statistics* kNullStats = nullptr; + filter_block.GetValue()->NewIndexIterator( + comparator->user_comparator(), + table()->get_rep()->get_global_seqno(BlockType::kFilterPartitionIndex), + &iter, kNullStats, true /* total_order_seek */, + false /* have_first_key */, index_key_includes_seq(), + index_value_is_full()); + iter.Seek(entry); + if (UNLIKELY(!iter.Valid())) { + // entry is larger than all the keys. However its prefix might still be + // present in the last partition. If this is called by PrefixMayMatch this + // is necessary for correct behavior. Otherwise it is unnecessary but safe. + // Assuming this is an unlikely case for full key search, the performance + // overhead should be negligible. + iter.SeekToLast(); + } + assert(iter.Valid()); + BlockHandle fltr_blk_handle = iter.value().handle; + return fltr_blk_handle; +} + +Status PartitionedFilterBlockReader::GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, const ReadOptions& _read_options, + CachableEntry* filter_block) const { + assert(table()); + assert(filter_block); + assert(filter_block->IsEmpty()); + + if (!filter_map_.empty()) { + auto iter = filter_map_.find(fltr_blk_handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (iter != filter_map_.end()) { + filter_block->SetUnownedValue(iter->second.GetValue()); + return Status::OK(); + } + } + + ReadOptions read_options = _read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + const Status s = + table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, + UncompressionDict::GetEmptyDict(), filter_block, + get_context, lookup_context, + /* for_compaction */ false, /* use_cache */ true, + /* async_read */ false); + + return s; +} + +bool PartitionedFilterBlockReader::MayMatch( + const Slice& slice, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options, FilterFunction filter_function) const { + CachableEntry filter_block; + Status s = GetOrReadFilterBlock(no_io, get_context, lookup_context, + &filter_block, read_options); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return true; + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return true; + } + + auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr); + if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range + return false; + } + + CachableEntry filter_partition_block; + s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle, + no_io, get_context, lookup_context, read_options, + &filter_partition_block); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return true; + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + return (filter_partition.*filter_function)( + slice, no_io, const_ikey_ptr, get_context, lookup_context, 
read_options); +} + +void PartitionedFilterBlockReader::MayMatch( + MultiGetRange* range, const SliceTransform* prefix_extractor, bool no_io, + BlockCacheLookupContext* lookup_context, const ReadOptions& read_options, + FilterManyFunction filter_function) const { + CachableEntry filter_block; + Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, + lookup_context, &filter_block, read_options); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return; // Any/all may match + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return; // Any/all may match + } + + auto start_iter_same_handle = range->begin(); + BlockHandle prev_filter_handle = BlockHandle::NullBlockHandle(); + + // For all keys mapping to same partition (must be adjacent in sorted order) + // share block cache lookup and use full filter multiget on the partition + // filter. + for (auto iter = start_iter_same_handle; iter != range->end(); ++iter) { + // TODO: re-use one top-level index iterator + BlockHandle this_filter_handle = + GetFilterPartitionHandle(filter_block, iter->ikey); + if (!prev_filter_handle.IsNull() && + this_filter_handle != prev_filter_handle) { + MultiGetRange subrange(*range, start_iter_same_handle, iter); + MayMatchPartition(&subrange, prefix_extractor, prev_filter_handle, no_io, + lookup_context, read_options, filter_function); + range->AddSkipsFrom(subrange); + start_iter_same_handle = iter; + } + if (UNLIKELY(this_filter_handle.size() == 0)) { // key is out of range + // Not reachable with current behavior of GetFilterPartitionHandle + assert(false); + range->SkipKey(iter); + prev_filter_handle = BlockHandle::NullBlockHandle(); + } else { + prev_filter_handle = this_filter_handle; + } + } + if (!prev_filter_handle.IsNull()) { + MultiGetRange subrange(*range, start_iter_same_handle, range->end()); + MayMatchPartition(&subrange, prefix_extractor, prev_filter_handle, no_io, + lookup_context, read_options, filter_function); + range->AddSkipsFrom(subrange); + } +} + +void PartitionedFilterBlockReader::MayMatchPartition( + MultiGetRange* range, const SliceTransform* prefix_extractor, + BlockHandle filter_handle, bool no_io, + BlockCacheLookupContext* lookup_context, const ReadOptions& read_options, + FilterManyFunction filter_function) const { + CachableEntry filter_partition_block; + Status s = GetFilterPartitionBlock( + nullptr /* prefetch_buffer */, filter_handle, no_io, + range->begin()->get_context, lookup_context, read_options, + &filter_partition_block); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return; // Any/all may match + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + (filter_partition.*filter_function)(range, prefix_extractor, no_io, + lookup_context, read_options); +} + +size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { + size_t usage = ApproximateFilterBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + // TODO(myabandeh): better estimation for filter_map_ size +} + +// TODO(myabandeh): merge this with the same function in IndexReader +Status PartitionedFilterBlockReader::CacheDependencies( + const ReadOptions& ro, bool pin, FilePrefetchBuffer* tail_prefetch_buffer) { + assert(table()); + + const BlockBasedTable::Rep* const rep = table()->get_rep(); + assert(rep); + + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + + 
CachableEntry<Block_kFilterPartitionIndex> filter_block;
+
+  Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */,
+                                  &lookup_context, &filter_block, ro);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(rep->ioptions.logger,
+                    "Error retrieving top-level filter block while trying to "
+                    "cache filter partitions: %s",
+                    s.ToString().c_str());
+    return s;
+  }
+
+  // Before reading partitions, prefetch them to avoid lots of IOs
+  assert(filter_block.GetValue());
+
+  IndexBlockIter biter;
+  const InternalKeyComparator* const comparator = internal_comparator();
+  Statistics* kNullStats = nullptr;
+  filter_block.GetValue()->NewIndexIterator(
+      comparator->user_comparator(),
+      rep->get_global_seqno(BlockType::kFilterPartitionIndex), &biter,
+      kNullStats, true /* total_order_seek */, false /* have_first_key */,
+      index_key_includes_seq(), index_value_is_full());
+  // Index partitions are assumed to be consecutive. Prefetch them all.
+  // Read the first block offset
+  biter.SeekToFirst();
+  BlockHandle handle = biter.value().handle;
+  uint64_t prefetch_off = handle.offset();
+
+  // Read the last block's offset
+  biter.SeekToLast();
+  handle = biter.value().handle;
+  uint64_t last_off =
+      handle.offset() + handle.size() + BlockBasedTable::kBlockTrailerSize;
+  uint64_t prefetch_len = last_off - prefetch_off;
+  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
+  if (tail_prefetch_buffer == nullptr || !tail_prefetch_buffer->Enabled() ||
+      tail_prefetch_buffer->GetPrefetchOffset() > prefetch_off) {
+    rep->CreateFilePrefetchBuffer(
+        0, 0, &prefetch_buffer, false /* Implicit autoreadahead */,
+        0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/);
+
+    IOOptions opts;
+    s = rep->file->PrepareIOOptions(ro, opts);
+    if (s.ok()) {
+      s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
+                                    static_cast<size_t>(prefetch_len),
+                                    ro.rate_limiter_priority);
+    }
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  // After prefetch, read the partitions one by one
+  for (biter.SeekToFirst(); biter.Valid(); biter.Next()) {
+    handle = biter.value().handle;
+
+    CachableEntry<ParsedFullFilterBlock> block;
+    // TODO: Support counter batch update for partitioned index and
+    // filter blocks
+    s = table()->MaybeReadBlockAndLoadToCache(
+        prefetch_buffer ? prefetch_buffer.get() : tail_prefetch_buffer, ro,
+        handle, UncompressionDict::GetEmptyDict(),
+        /* for_compaction */ false, &block, nullptr /* get_context */,
+        &lookup_context, nullptr /* contents */, false);
+    if (!s.ok()) {
+      return s;
+    }
+    assert(s.ok() || block.GetValue() == nullptr);
+
+    if (block.GetValue() != nullptr) {
+      if (block.IsCached()) {
+        if (pin) {
+          filter_map_[handle.offset()] = std::move(block);
+        }
+      }
+    }
+  }
+  return biter.status();
+}
+
+const InternalKeyComparator*
+PartitionedFilterBlockReader::internal_comparator() const {
+  assert(table());
+  assert(table()->get_rep());
+
+  return &table()->get_rep()->internal_comparator;
+}
+
+bool PartitionedFilterBlockReader::index_key_includes_seq() const {
+  assert(table());
+  assert(table()->get_rep());
+
+  return table()->get_rep()->index_key_includes_seq;
+}
+
+bool PartitionedFilterBlockReader::index_value_is_full() const {
+  assert(table());
+  assert(table()->get_rep());
+
+  return table()->get_rep()->index_value_is_full;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_filter_block.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_filter_block.h
new file mode 100644
index 0000000000..0e3b4a5b34
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_filter_block.h
@@ -0,0 +1,182 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <deque>
+#include <list>
+#include <string>
+#include <unordered_map>
+
+#include "block_cache.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/block_based/block.h"
+#include "table/block_based/filter_block_reader_common.h"
+#include "table/block_based/full_filter_block.h"
+#include "table/block_based/index_builder.h"
+#include "util/autovector.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+class InternalKeyComparator;
+
+class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
+ public:
+  explicit PartitionedFilterBlockBuilder(
+      const SliceTransform* prefix_extractor, bool whole_key_filtering,
+      FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
+      const bool use_value_delta_encoding,
+      PartitionedIndexBuilder* const p_index_builder,
+      const uint32_t partition_size);
+
+  virtual ~PartitionedFilterBlockBuilder();
+
+  void AddKey(const Slice& key) override;
+  void Add(const Slice& key) override;
+  size_t EstimateEntriesAdded() override;
+
+  virtual Slice Finish(
+      const BlockHandle& last_partition_block_handle, Status* status,
+      std::unique_ptr<const char[]>* filter_data = nullptr) override;
+
+  virtual void ResetFilterBitsBuilder() override {
+    // Partitioned filters previously constructed by this to-be-reset
+    // FilterBitsBuilder can also be cleared
+    filters.clear();
+    FullFilterBlockBuilder::ResetFilterBitsBuilder();
+  }
+
+  // For PartitionFilter, optional post-verification of the filter is done
+  // as part of PartitionedFilterBlockBuilder::Finish
+  // to avoid the implementation complexity of doing it elsewhere.
+  // Therefore we are skipping it here.
+  virtual Status MaybePostVerifyFilter(
+      const Slice& /* filter_content */) override {
+    return Status::OK();
+  }
+
+ private:
+  // Filter data
+  BlockBuilder index_on_filter_block_builder_;  // top-level index builder
+  BlockBuilder
+      index_on_filter_block_builder_without_seq_;  // same for user keys
+  struct FilterEntry {
+    std::string key;
+    std::unique_ptr<const char[]> filter_data;
+    Slice filter;
+  };
+  std::deque<FilterEntry> filters;  // list of partitioned filters and keys
+                                    // used in building the index
+
+  // Set to the first non-okay status if any of the filter
+  // partitions experiences a construction error.
+  // If partitioned_filters_construction_status_ is non-okay,
+  // then the whole partitioned filter should not be used.
+  Status partitioned_filters_construction_status_;
+  std::string last_filter_entry_key;
+  std::unique_ptr<const char[]> last_filter_data;
+  std::unique_ptr value;
+  bool finishing_filters =
+      false;  // true if Finish is called once but not complete yet.
+  // The policy of when to cut a filter block and Finish it
+  void MaybeCutAFilterBlock(const Slice* next_key);
+  // Currently we keep the same number of partitions for filters and indexes.
+  // This allows for some potential optimizations in the future. If such
+  // optimizations do not materialize, we can use a different number of
+  // partitions and eliminate p_index_builder_
+  PartitionedIndexBuilder* const p_index_builder_;
+  // The desired number of keys per partition
+  uint32_t keys_per_partition_;
+  // The number of keys added to the last partition so far
+  uint32_t keys_added_to_partition_;
+  // According to the bits builders, how many keys/prefixes were added
+  // in all the filters we have fully built
+  uint64_t total_added_in_built_;
+  BlockHandle last_encoded_handle_;
+};
+
+class PartitionedFilterBlockReader
+    : public FilterBlockReaderCommon<Block_kFilterPartitionIndex> {
+ public:
+  PartitionedFilterBlockReader(
+      const BlockBasedTable* t,
+      CachableEntry<Block_kFilterPartitionIndex>&& filter_block);
+
+  static std::unique_ptr<FilterBlockReader> Create(
+      const BlockBasedTable* table, const ReadOptions& ro,
+      FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+      bool pin, BlockCacheLookupContext* lookup_context);
+
+  bool KeyMayMatch(const Slice& key, const bool no_io,
+                   const Slice* const const_ikey_ptr, GetContext* get_context,
+                   BlockCacheLookupContext* lookup_context,
+                   const ReadOptions& read_options) override;
+  void KeysMayMatch(MultiGetRange* range, const bool no_io,
+                    BlockCacheLookupContext* lookup_context,
+                    const ReadOptions& read_options) override;
+
+  bool PrefixMayMatch(const Slice& prefix, const bool no_io,
+                      const Slice* const const_ikey_ptr,
+                      GetContext* get_context,
+                      BlockCacheLookupContext* lookup_context,
+                      const ReadOptions& read_options) override;
+  void PrefixesMayMatch(MultiGetRange* range,
+                        const SliceTransform* prefix_extractor,
+                        const bool no_io,
+                        BlockCacheLookupContext* lookup_context,
+                        const ReadOptions& read_options) override;
+
+  size_t ApproximateMemoryUsage() const override;
+
+ private:
+  BlockHandle GetFilterPartitionHandle(
+      const CachableEntry<Block_kFilterPartitionIndex>& filter_block,
+      const Slice& entry) const;
+  Status GetFilterPartitionBlock(
+      FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle,
+      bool no_io, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context, const ReadOptions& read_options,
+      CachableEntry<ParsedFullFilterBlock>* filter_block) const;
+
+  using FilterFunction = bool (FullFilterBlockReader::*)(
+      const Slice& slice, const bool no_io, const Slice* const const_ikey_ptr,
+      GetContext* get_context, BlockCacheLookupContext* lookup_context,
+      const ReadOptions&
read_options); + bool MayMatch(const Slice& slice, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options, + FilterFunction filter_function) const; + using FilterManyFunction = void (FullFilterBlockReader::*)( + MultiGetRange* range, const SliceTransform* prefix_extractor, + const bool no_io, BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options); + void MayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, + bool no_io, BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options, + FilterManyFunction filter_function) const; + void MayMatchPartition(MultiGetRange* range, + const SliceTransform* prefix_extractor, + BlockHandle filter_handle, bool no_io, + BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options, + FilterManyFunction filter_function) const; + Status CacheDependencies(const ReadOptions& ro, bool pin, + FilePrefetchBuffer* tail_prefetch_buffer) override; + + const InternalKeyComparator* internal_comparator() const; + bool index_key_includes_seq() const; + bool index_value_is_full() const; + + protected: + // For partition blocks pinned in cache. Can be a subset of blocks + // in case some fail insertion on attempt to pin. + UnorderedMap> filter_map_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_iterator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_iterator.cc new file mode 100644 index 0000000000..b9bc2155a1 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_iterator.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/partitioned_index_iterator.h" + +namespace ROCKSDB_NAMESPACE { +void PartitionedIndexIterator::Seek(const Slice& target) { SeekImpl(&target); } + +void PartitionedIndexIterator::SeekToFirst() { SeekImpl(nullptr); } + +void PartitionedIndexIterator::SeekImpl(const Slice* target) { + SavePrevIndexValue(); + + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + + if (!index_iter_->Valid()) { + ResetPartitionedIndexIter(); + return; + } + + InitPartitionedIndexBlock(); + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + + // We could check upper bound here, but that would be too complicated + // and checking index upper bound is less useful than for data blocks. + + if (target) { + assert(!Valid() || (table_->get_rep()->index_key_includes_seq + ? 
(icomp_.Compare(*target, key()) <= 0)
+                       : (user_comparator_.Compare(ExtractUserKey(*target),
+                                                   key()) <= 0)));
+  }
+}
+
+void PartitionedIndexIterator::SeekToLast() {
+  SavePrevIndexValue();
+  index_iter_->SeekToLast();
+  if (!index_iter_->Valid()) {
+    ResetPartitionedIndexIter();
+    return;
+  }
+  InitPartitionedIndexBlock();
+  block_iter_.SeekToLast();
+  FindKeyBackward();
+}
+
+void PartitionedIndexIterator::Next() {
+  assert(block_iter_points_to_real_block_);
+  block_iter_.Next();
+  FindKeyForward();
+}
+
+void PartitionedIndexIterator::Prev() {
+  assert(block_iter_points_to_real_block_);
+  block_iter_.Prev();
+
+  FindKeyBackward();
+}
+
+void PartitionedIndexIterator::InitPartitionedIndexBlock() {
+  BlockHandle partitioned_index_handle = index_iter_->value().handle;
+  if (!block_iter_points_to_real_block_ ||
+      partitioned_index_handle.offset() != prev_block_offset_ ||
+      // if the previous attempt at reading the block missed the cache, retry
+      block_iter_.status().IsIncomplete()) {
+    if (block_iter_points_to_real_block_) {
+      ResetPartitionedIndexIter();
+    }
+    auto* rep = table_->get_rep();
+    bool is_for_compaction =
+        lookup_context_.caller == TableReaderCaller::kCompaction;
+    // Prefetch additional data for range scans (iterators).
+    // Implicit auto readahead:
+    //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
+    // Explicit user-requested readahead:
+    //   Enabled from the very first IO when ReadOptions.readahead_size is set.
+    block_prefetcher_.PrefetchIfNeeded(
+        rep, partitioned_index_handle, read_options_.readahead_size,
+        is_for_compaction, /*no_sequential_checking=*/false,
+        read_options_.rate_limiter_priority);
+    Status s;
+    table_->NewDataBlockIterator<IndexBlockIter>(
+        read_options_, partitioned_index_handle, &block_iter_,
+        BlockType::kIndex,
+        /*get_context=*/nullptr, &lookup_context_,
+        block_prefetcher_.prefetch_buffer(),
+        /*for_compaction=*/is_for_compaction, /*async_read=*/false, s);
+    block_iter_points_to_real_block_ = true;
+    // We could check the upper bound here, but it is complicated to reason
+    // about upper bounds in the index iterator. On the other hand, in large
+    // scans, index iterators are moved much less frequently than data blocks,
+    // so the upper bound check is skipped for simplicity.
+  }
+}
+
+void PartitionedIndexIterator::FindKeyForward() {
+  // This method's code is kept short to make it likely to be inlined.
+
+  assert(block_iter_points_to_real_block_);
+
+  if (!block_iter_.Valid()) {
+    // This is the only call site of FindBlockForward(), but it's extracted
+    // into a separate method to keep FindKeyForward() short and likely to be
+    // inlined. When transitioning to a different block, we call
+    // FindBlockForward(), which is much longer and is probably not inlined.
+    FindBlockForward();
+  } else {
+    // This is the fast path that avoids a function call.
+  }
+}
+
+void PartitionedIndexIterator::FindBlockForward() {
+  // TODO the while loop inherits from two-level-iterator. We don't know
+  // whether a block can be empty, so the loop cannot simply be replaced by
+  // an "if" yet.
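+  // Each iteration advances index_iter_ to the next partition and re-seeks
+  // block_iter_; the loop exits when a non-empty partition is found, an
+  // error is encountered, or the index is exhausted.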
+ do { + if (!block_iter_.status().ok()) { + return; + } + ResetPartitionedIndexIter(); + index_iter_->Next(); + + if (!index_iter_->Valid()) { + return; + } + + InitPartitionedIndexBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); +} + +void PartitionedIndexIterator::FindKeyBackward() { + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + + ResetPartitionedIndexIter(); + index_iter_->Prev(); + + if (index_iter_->Valid()) { + InitPartitionedIndexBlock(); + block_iter_.SeekToLast(); + } else { + return; + } + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_iterator.h new file mode 100644 index 0000000000..6412fe2399 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_iterator.h @@ -0,0 +1,160 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_based_table_reader_impl.h" +#include "table/block_based/block_prefetcher.h" +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Iterator that iterates over partitioned index. +// Some upper and lower bound tricks played in block based table iterators +// could be played here, but it's too complicated to reason about index +// keys with upper or lower bound, so we skip it for simplicity. +class PartitionedIndexIterator : public InternalIteratorBase { + // compaction_readahead_size: its value will only be used if for_compaction = + // true + public: + PartitionedIndexIterator( + const BlockBasedTable* table, const ReadOptions& read_options, + const InternalKeyComparator& icomp, + std::unique_ptr>&& index_iter, + TableReaderCaller caller, size_t compaction_readahead_size = 0) + : index_iter_(std::move(index_iter)), + table_(table), + read_options_(read_options), +#ifndef NDEBUG + icomp_(icomp), +#endif + user_comparator_(icomp.user_comparator()), + block_iter_points_to_real_block_(false), + lookup_context_(caller), + block_prefetcher_( + compaction_readahead_size, + table_->get_rep()->table_options.initial_auto_readahead_size) { + } + + ~PartitionedIndexIterator() override {} + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice&) override { + // Shouldn't be called. 
+ assert(false); + } + void SeekToFirst() override; + void SeekToLast() override; + void Next() final override; + bool NextAndGetResult(IterateResult*) override { + assert(false); + return false; + } + void Prev() override; + bool Valid() const override { + return block_iter_points_to_real_block_ && block_iter_.Valid(); + } + Slice key() const override { + assert(Valid()); + return block_iter_.key(); + } + Slice user_key() const override { + assert(Valid()); + return block_iter_.user_key(); + } + IndexValue value() const override { + assert(Valid()); + return block_iter_.value(); + } + Status status() const override { + // Prefix index set status to NotFound when the prefix does not exist + if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { + return index_iter_->status(); + } else if (block_iter_points_to_real_block_) { + return block_iter_.status(); + } else { + return Status::OK(); + } + } + inline IterBoundCheck UpperBoundCheckResult() override { + // Shouldn't be called. + assert(false); + return IterBoundCheck::kUnknown; + } + void SetPinnedItersMgr(PinnedIteratorsManager*) override { + // Shouldn't be called. + assert(false); + } + bool IsKeyPinned() const override { + // Shouldn't be called. + assert(false); + return false; + } + bool IsValuePinned() const override { + // Shouldn't be called. + assert(false); + return false; + } + + void ResetPartitionedIndexIter() { + if (block_iter_points_to_real_block_) { + block_iter_.Invalidate(Status::OK()); + block_iter_points_to_real_block_ = false; + } + } + + void SavePrevIndexValue() { + if (block_iter_points_to_real_block_) { + // Reseek. If they end up with the same data block, we shouldn't re-fetch + // the same data block. + prev_block_offset_ = index_iter_->value().handle.offset(); + } + } + + void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (block_prefetcher_.prefetch_buffer() != nullptr && + read_options_.adaptive_readahead) { + block_prefetcher_.prefetch_buffer()->GetReadaheadState( + &(readahead_file_info->index_block_readahead_info)); + } + } + + void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (read_options_.adaptive_readahead) { + block_prefetcher_.SetReadaheadState( + &(readahead_file_info->index_block_readahead_info)); + } + } + + std::unique_ptr> index_iter_; + + private: + friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test; + const BlockBasedTable* table_; + const ReadOptions read_options_; +#ifndef NDEBUG + const InternalKeyComparator& icomp_; +#endif + UserComparatorWrapper user_comparator_; + IndexBlockIter block_iter_; + + // True if block_iter_ is initialized and points to the same block + // as index iterator. + bool block_iter_points_to_real_block_; + uint64_t prev_block_offset_ = std::numeric_limits::max(); + BlockCacheLookupContext lookup_context_; + BlockPrefetcher block_prefetcher_; + + // If `target` is null, seek to first. 
+ void SeekImpl(const Slice* target); + + void InitPartitionedIndexBlock(); + void FindKeyForward(); + void FindBlockForward(); + void FindKeyBackward(); +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_reader.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_reader.cc new file mode 100644 index 0000000000..cad3a616e7 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_reader.cc @@ -0,0 +1,224 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/partitioned_index_reader.h" + +#include "block_cache.h" +#include "file/random_access_file_reader.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/partitioned_index_iterator.h" + +namespace ROCKSDB_NAMESPACE { +Status PartitionIndexReader::Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ro, use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset(new PartitionIndexReader(table, std::move(index_block))); + + return Status::OK(); +} + +InternalIteratorBase* PartitionIndexReader::NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = GetOrReadIndexBlock(no_io, get_context, lookup_context, + &index_block, read_options); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + const BlockBasedTable::Rep* rep = table()->rep_; + InternalIteratorBase* it = nullptr; + + Statistics* kNullStats = nullptr; + // Filters are already checked before seeking the index + if (!partition_map_.empty()) { + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
+    it = NewTwoLevelIterator(
+        new BlockBasedTable::PartitionedIndexIteratorState(table(),
+                                                           &partition_map_),
+        index_block.GetValue()->NewIndexIterator(
+            internal_comparator()->user_comparator(),
+            rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
+            index_has_first_key(), index_key_includes_seq(),
+            index_value_is_full()));
+  } else {
+    ReadOptions ro;
+    ro.fill_cache = read_options.fill_cache;
+    ro.deadline = read_options.deadline;
+    ro.io_timeout = read_options.io_timeout;
+    ro.adaptive_readahead = read_options.adaptive_readahead;
+    ro.async_io = read_options.async_io;
+    ro.rate_limiter_priority = read_options.rate_limiter_priority;
+    ro.verify_checksums = read_options.verify_checksums;
+    ro.io_activity = read_options.io_activity;
+
+    // We don't return pinned data from index blocks, so no need
+    // to set `block_contents_pinned`.
+    std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter(
+        index_block.GetValue()->NewIndexIterator(
+            internal_comparator()->user_comparator(),
+            rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
+            index_has_first_key(), index_key_includes_seq(),
+            index_value_is_full()));
+
+    it = new PartitionedIndexIterator(
+        table(), ro, *internal_comparator(), std::move(index_iter),
+        lookup_context ? lookup_context->caller
+                       : TableReaderCaller::kUncategorized);
+  }
+
+  assert(it != nullptr);
+  index_block.TransferTo(it);
+
+  return it;
+
+  // TODO(myabandeh): Update TwoLevelIterator to be able to make use of an
+  // on-stack BlockIter while the state is on the heap. Currently it assumes
+  // the first-level iter is always on the heap and will attempt to delete it
+  // in its destructor.
+}
+Status PartitionIndexReader::CacheDependencies(
+    const ReadOptions& ro, bool pin, FilePrefetchBuffer* tail_prefetch_buffer) {
+  if (!partition_map_.empty()) {
+    // The dependencies are already cached since `partition_map_` is filled in
+    // an all-or-nothing manner.
+    return Status::OK();
+  }
+  // Before reading partitions, prefetch them to avoid lots of IOs
+  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
+  const BlockBasedTable::Rep* rep = table()->rep_;
+  IndexBlockIter biter;
+  BlockHandle handle;
+  Statistics* kNullStats = nullptr;
+
+  CachableEntry<Block> index_block;
+  {
+    Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
+                                   &lookup_context, &index_block, ro);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // We don't return pinned data from index blocks, so no need
+  // to set `block_contents_pinned`.
+  index_block.GetValue()->NewIndexIterator(
+      internal_comparator()->user_comparator(),
+      rep->get_global_seqno(BlockType::kIndex), &biter, kNullStats, true,
+      index_has_first_key(), index_key_includes_seq(), index_value_is_full());
+  // Index partitions are assumed to be consecutive. Prefetch them all.
+  // Read the first block offset
+  biter.SeekToFirst();
+  if (!biter.Valid()) {
+    // Empty index.
+    return biter.status();
+  }
+  handle = biter.value().handle;
+  uint64_t prefetch_off = handle.offset();
+
+  // Read the last block's offset
+  biter.SeekToLast();
+  if (!biter.Valid()) {
+    // Empty index.
+ return biter.status(); + } + handle = biter.value().handle; + uint64_t last_off = + handle.offset() + BlockBasedTable::BlockSizeWithTrailer(handle); + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr prefetch_buffer; + if (tail_prefetch_buffer == nullptr || !tail_prefetch_buffer->Enabled() || + tail_prefetch_buffer->GetPrefetchOffset() > prefetch_off) { + rep->CreateFilePrefetchBuffer( + 0, 0, &prefetch_buffer, false /*Implicit auto readahead*/, + 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/); + IOOptions opts; + { + Status s = rep->file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, + static_cast(prefetch_len), + ro.rate_limiter_priority); + } + if (!s.ok()) { + return s; + } + } + } + // For saving "all or nothing" to partition_map_ + UnorderedMap> map_in_progress; + + // After prefetch, read the partitions one by one + biter.SeekToFirst(); + size_t partition_count = 0; + for (; biter.Valid(); biter.Next()) { + handle = biter.value().handle; + CachableEntry block; + ++partition_count; + // TODO: Support counter batch update for partitioned index and + // filter blocks + Status s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer ? prefetch_buffer.get() : tail_prefetch_buffer, ro, + handle, UncompressionDict::GetEmptyDict(), + /*for_compaction=*/false, &block.As(), + /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr, + /*async_read=*/false); + + if (!s.ok()) { + return s; + } + if (block.GetValue() != nullptr) { + // Might need to "pin" some mmap-read blocks (GetOwnValue) if some + // partitions are successfully compressed (cached) and some are not + // compressed (mmap eligible) + if (block.IsCached() || block.GetOwnValue()) { + if (pin) { + map_in_progress[handle.offset()] = std::move(block); + } + } + } + } + Status s = biter.status(); + // Save (pin) them only if everything checks out + if (map_in_progress.size() == partition_count && s.ok()) { + std::swap(partition_map_, map_in_progress); + } + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_reader.h new file mode 100644 index 0000000000..9482fd6b44 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/partitioned_index_reader.h @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/index_reader_common.h" +#include "util/hash_containers.h" + +namespace ROCKSDB_NAMESPACE { +// Index that allows binary search lookup in a two-level index structure. +class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read the partition index from the file and create an instance for + // `PartitionIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. 
+  static Status Create(const BlockBasedTable* table, const ReadOptions& ro,
+                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
+                       bool prefetch, bool pin,
+                       BlockCacheLookupContext* lookup_context,
+                       std::unique_ptr<IndexReader>* index_reader);
+
+  // return a two-level iterator: first level is on the partition index
+  InternalIteratorBase<IndexValue>* NewIterator(
+      const ReadOptions& read_options, bool /* disable_prefix_seek */,
+      IndexBlockIter* iter, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context) override;
+
+  Status CacheDependencies(const ReadOptions& ro, bool pin,
+                           FilePrefetchBuffer* tail_prefetch_buffer) override;
+  size_t ApproximateMemoryUsage() const override {
+    size_t usage = ApproximateIndexBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    usage += malloc_usable_size(const_cast<PartitionIndexReader*>(this));
+#else
+    usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
+    return usage;
+  }
+
+ private:
+  PartitionIndexReader(const BlockBasedTable* t,
+                       CachableEntry<Block>&& index_block)
+      : IndexReaderCommon(t, std::move(index_block)) {}
+
+  // For partition blocks pinned in cache. This is expected to be "all or
+  // none" so that !partition_map_.empty() can use an iterator expecting
+  // all partitions to be saved here.
+  UnorderedMap<uint64_t, CachableEntry<Block>> partition_map_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/reader_common.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/reader_common.cc
new file mode 100644
index 0000000000..0ff43e9b4e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/reader_common.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/reader_common.h"
+
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/table.h"
+#include "table/format.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+void ForceReleaseCachedEntry(void* arg, void* h) {
+  Cache* cache = reinterpret_cast<Cache*>(arg);
+  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
+  cache->Release(handle, true /* erase_if_last_ref */);
+}
+
+// WART: this is specific to block-based table
+Status VerifyBlockChecksum(ChecksumType type, const char* data,
+                           size_t block_size, const std::string& file_name,
+                           uint64_t offset) {
+  PERF_TIMER_GUARD(block_checksum_time);
+  // After block_size bytes is compression type (1 byte), which is part of
+  // the checksummed section.
+  size_t len = block_size + 1;
+  // And then the stored checksum value (4 bytes).
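+  //
+  // Layout sketch (restating the trailer format implied by the code around
+  // this comment; offsets are relative to `data`):
+  //
+  //   [0, block_size)               block payload
+  //   [block_size]                  compression type (1 byte, checksummed)
+  //   [block_size+1, block_size+5)  stored checksum (little-endian uint32)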
+ uint32_t stored = DecodeFixed32(data + len); + + uint32_t computed = ComputeBuiltinChecksum(type, data, len); + if (stored == computed) { + return Status::OK(); + } else { + // Unmask for people who might look for reference crc value + if (type == kCRC32c) { + stored = crc32c::Unmask(stored); + computed = crc32c::Unmask(computed); + } + return Status::Corruption( + "block checksum mismatch: stored = " + std::to_string(stored) + + ", computed = " + std::to_string(computed) + + ", type = " + std::to_string(type) + " in " + file_name + " offset " + + std::to_string(offset) + " size " + std::to_string(block_size)); + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/reader_common.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/reader_common.h new file mode 100644 index 0000000000..102802afec --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/reader_common.h @@ -0,0 +1,34 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "rocksdb/advanced_cache.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { +// Release the cached entry and decrement its ref count. +extern void ForceReleaseCachedEntry(void* arg, void* h); + +inline MemoryAllocator* GetMemoryAllocator( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache.get() + ? table_options.block_cache->memory_allocator() + : nullptr; +} + +// Assumes block has a trailer as in format.h. file_name and offset provided +// for generating a diagnostic message in returned status. +// +// Returns Status::OK() on checksum match, or Status::Corruption() on checksum +// mismatch. +extern Status VerifyBlockChecksum(ChecksumType type, const char* data, + size_t block_size, + const std::string& file_name, + uint64_t offset); +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/uncompression_dict_reader.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/uncompression_dict_reader.cc new file mode 100644 index 0000000000..4ac442b6ba --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/uncompression_dict_reader.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+//
+
+#include "table/block_based/uncompression_dict_reader.h"
+
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status UncompressionDictReader::Create(
+    const BlockBasedTable* table, const ReadOptions& ro,
+    FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+    bool pin, BlockCacheLookupContext* lookup_context,
+    std::unique_ptr<UncompressionDictReader>* uncompression_dict_reader) {
+  assert(table);
+  assert(table->get_rep());
+  assert(!pin || prefetch);
+  assert(uncompression_dict_reader);
+
+  CachableEntry<UncompressionDict> uncompression_dict;
+  if (prefetch || !use_cache) {
+    const Status s = ReadUncompressionDictionary(
+        table, prefetch_buffer, ro, use_cache, nullptr /* get_context */,
+        lookup_context, &uncompression_dict);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (use_cache && !pin) {
+      uncompression_dict.Reset();
+    }
+  }
+
+  uncompression_dict_reader->reset(
+      new UncompressionDictReader(table, std::move(uncompression_dict)));
+
+  return Status::OK();
+}
+
+Status UncompressionDictReader::ReadUncompressionDictionary(
+    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+    const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    CachableEntry<UncompressionDict>* uncompression_dict) {
+  // TODO: add perf counter for compression dictionary read time
+
+  assert(table);
+  assert(uncompression_dict);
+  assert(uncompression_dict->IsEmpty());
+
+  const BlockBasedTable::Rep* const rep = table->get_rep();
+  assert(rep);
+  assert(!rep->compression_dict_handle.IsNull());
+
+  const Status s = table->RetrieveBlock(
+      prefetch_buffer, read_options, rep->compression_dict_handle,
+      UncompressionDict::GetEmptyDict(), uncompression_dict, get_context,
+      lookup_context,
+      /* for_compaction */ false, use_cache,
+      /* async_read */ false);
+
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(
+        rep->ioptions.logger,
+        "Encountered error while reading data from compression dictionary "
+        "block %s",
+        s.ToString().c_str());
+  }
+
+  return s;
+}
+
+Status UncompressionDictReader::GetOrReadUncompressionDictionary(
+    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, bool no_io,
+    bool verify_checksums, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    CachableEntry<UncompressionDict>* uncompression_dict) const {
+  assert(uncompression_dict);
+
+  if (!uncompression_dict_.IsEmpty()) {
+    uncompression_dict->SetUnownedValue(uncompression_dict_.GetValue());
+    return Status::OK();
+  }
+
+  ReadOptions read_options;
+  if (no_io) {
+    read_options.read_tier = kBlockCacheTier;
+  }
+  read_options.verify_checksums = verify_checksums;
+  read_options.io_activity = ro.io_activity;
+
+  return ReadUncompressionDictionary(table_, prefetch_buffer, read_options,
+                                     cache_dictionary_blocks(), get_context,
+                                     lookup_context, uncompression_dict);
+}
+
+size_t UncompressionDictReader::ApproximateMemoryUsage() const {
+  assert(!uncompression_dict_.GetOwnValue() ||
+         uncompression_dict_.GetValue() != nullptr);
+  size_t usage = uncompression_dict_.GetOwnValue()
+                     ? uncompression_dict_.GetValue()->ApproximateMemoryUsage()
+                     : 0;
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size(const_cast<UncompressionDictReader*>(this));
+#else
+  usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+
+  return usage;
+}
+
+bool UncompressionDictReader::cache_dictionary_blocks() const {
+  assert(table_);
+  assert(table_->get_rep());
+
+  return table_->get_rep()->table_options.cache_index_and_filter_blocks;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/uncompression_dict_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/uncompression_dict_reader.h
new file mode 100644
index 0000000000..c78800d8ac
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_based/uncompression_dict_reader.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <cassert>
+
+#include "table/block_based/cachable_entry.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBasedTable;
+struct BlockCacheLookupContext;
+class FilePrefetchBuffer;
+class GetContext;
+struct ReadOptions;
+struct UncompressionDict;
+
+// Provides access to the uncompression dictionary regardless of whether
+// it is owned by the reader or stored in the cache, or whether it is pinned
+// in the cache or not.
+class UncompressionDictReader {
+ public:
+  static Status Create(
+      const BlockBasedTable* table, const ReadOptions& ro,
+      FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+      bool pin, BlockCacheLookupContext* lookup_context,
+      std::unique_ptr<UncompressionDictReader>* uncompression_dict_reader);
+
+  Status GetOrReadUncompressionDictionary(
+      FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, bool no_io,
+      bool verify_checksums, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context,
+      CachableEntry<UncompressionDict>* uncompression_dict) const;
+
+  size_t ApproximateMemoryUsage() const;
+
+ private:
+  UncompressionDictReader(
+      const BlockBasedTable* t,
+      CachableEntry<UncompressionDict>&& uncompression_dict)
+      : table_(t), uncompression_dict_(std::move(uncompression_dict)) {
+    assert(table_);
+  }
+
+  bool cache_dictionary_blocks() const;
+
+  static Status ReadUncompressionDictionary(
+      const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+      const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context,
+      CachableEntry<UncompressionDict>* uncompression_dict);
+
+  const BlockBasedTable* table_;
+  CachableEntry<UncompressionDict> uncompression_dict_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_fetcher.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_fetcher.cc
new file mode 100644
index 0000000000..b2fa6d4b52
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_fetcher.cc
@@ -0,0 +1,405 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_fetcher.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <string>
+
+#include "logging/logging.h"
+#include "memory/memory_allocator_impl.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/env.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/reader_common.h"
+#include "table/format.h"
+#include "table/persistent_cache_helper.h"
+#include "util/compression.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+inline void BlockFetcher::ProcessTrailerIfPresent() {
+  if (footer_.GetBlockTrailerSize() > 0) {
+    assert(footer_.GetBlockTrailerSize() ==
+           BlockBasedTable::kBlockTrailerSize);
+    if (read_options_.verify_checksums) {
+      io_status_ = status_to_io_status(VerifyBlockChecksum(
+          footer_.checksum_type(), slice_.data(), block_size_,
+          file_->file_name(), handle_.offset()));
+      RecordTick(ioptions_.stats, BLOCK_CHECKSUM_COMPUTE_COUNT);
+      if (!io_status_.ok()) {
+        assert(io_status_.IsCorruption());
+        RecordTick(ioptions_.stats, BLOCK_CHECKSUM_MISMATCH_COUNT);
+      }
+    }
+    compression_type_ =
+        BlockBasedTable::GetBlockCompressionType(slice_.data(), block_size_);
+  } else {
+    // E.g. plain table or cuckoo table
+    compression_type_ = kNoCompression;
+  }
+}
+
+inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() {
+  if (cache_options_.persistent_cache &&
+      !cache_options_.persistent_cache->IsCompressed()) {
+    Status status = PersistentCacheHelper::LookupUncompressed(
+        cache_options_, handle_, contents_);
+    if (status.ok()) {
+      // uncompressed page is found for the block handle
+      return true;
+    } else {
+      // uncompressed page is not found
+      if (ioptions_.logger && !status.IsNotFound()) {
+        assert(!status.ok());
+        ROCKS_LOG_INFO(ioptions_.logger,
+                       "Error reading from persistent cache. %s",
%s", + status.ToString().c_str()); + } + } + } + return false; +} + +inline bool BlockFetcher::TryGetFromPrefetchBuffer() { + if (prefetch_buffer_ != nullptr) { + IOOptions opts; + IOStatus io_s = file_->PrepareIOOptions(read_options_, opts); + if (io_s.ok()) { + bool read_from_prefetch_buffer = false; + if (read_options_.async_io && !for_compaction_) { + read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCacheAsync( + opts, file_, handle_.offset(), block_size_with_trailer_, &slice_, + &io_s, read_options_.rate_limiter_priority); + } else { + read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCache( + opts, file_, handle_.offset(), block_size_with_trailer_, &slice_, + &io_s, read_options_.rate_limiter_priority, for_compaction_); + } + if (read_from_prefetch_buffer) { + ProcessTrailerIfPresent(); + if (!io_status_.ok()) { + return true; + } + got_from_prefetch_buffer_ = true; + used_buf_ = const_cast(slice_.data()); + } + } + if (!io_s.ok()) { + io_status_ = io_s; + return true; + } + } + return got_from_prefetch_buffer_; +} + +inline bool BlockFetcher::TryGetSerializedBlockFromPersistentCache() { + if (cache_options_.persistent_cache && + cache_options_.persistent_cache->IsCompressed()) { + std::unique_ptr buf; + io_status_ = status_to_io_status(PersistentCacheHelper::LookupSerialized( + cache_options_, handle_, &buf, block_size_with_trailer_)); + if (io_status_.ok()) { + heap_buf_ = CacheAllocationPtr(buf.release()); + used_buf_ = heap_buf_.get(); + slice_ = Slice(heap_buf_.get(), block_size_); + ProcessTrailerIfPresent(); + return true; + } else if (!io_status_.IsNotFound() && ioptions_.logger) { + assert(!io_status_.ok()); + ROCKS_LOG_INFO(ioptions_.logger, + "Error reading from persistent cache. %s", + io_status_.ToString().c_str()); + } + } + return false; +} + +inline void BlockFetcher::PrepareBufferForBlockFromFile() { + // cache miss read from device + if ((do_uncompress_ || ioptions_.allow_mmap_reads) && + block_size_with_trailer_ < kDefaultStackBufferSize) { + // If we've got a small enough chunk of data, read it in to the + // trivially allocated stack buffer instead of needing a full malloc() + // + // `GetBlockContents()` cannot return this data as its lifetime is tied to + // this `BlockFetcher`'s lifetime. That is fine because this is only used + // in cases where we do not expect the `GetBlockContents()` result to be the + // same buffer we are assigning here. If we guess incorrectly, there will be + // a heap allocation and memcpy in `GetBlockContents()` to obtain the final + // result. Considering we are eliding a heap allocation here by using the + // stack buffer, the cost of guessing incorrectly here is one extra memcpy. + // + // When `do_uncompress_` is true, we expect the uncompression step will + // allocate heap memory for the final result. However this expectation will + // be wrong if the block turns out to already be uncompressed, which we + // won't know for sure until after reading it. + // + // When `ioptions_.allow_mmap_reads` is true, we do not expect the file + // reader to use the scratch buffer at all, but instead return a pointer + // into the mapped memory. This expectation will be wrong when using a + // file reader that does not implement mmap reads properly. 
+    used_buf_ = &stack_buf_[0];
+  } else if (maybe_compressed_ && !do_uncompress_) {
+    compressed_buf_ =
+        AllocateBlock(block_size_with_trailer_, memory_allocator_compressed_);
+    used_buf_ = compressed_buf_.get();
+  } else {
+    heap_buf_ = AllocateBlock(block_size_with_trailer_, memory_allocator_);
+    used_buf_ = heap_buf_.get();
+  }
+}
+
+inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() {
+  if (io_status_.ok() && read_options_.fill_cache &&
+      cache_options_.persistent_cache &&
+      cache_options_.persistent_cache->IsCompressed()) {
+    PersistentCacheHelper::InsertSerialized(cache_options_, handle_, used_buf_,
+                                            block_size_with_trailer_);
+  }
+}
+
+inline void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() {
+  if (io_status_.ok() && !got_from_prefetch_buffer_ &&
+      read_options_.fill_cache && cache_options_.persistent_cache &&
+      !cache_options_.persistent_cache->IsCompressed()) {
+    // insert to uncompressed cache
+    PersistentCacheHelper::InsertUncompressed(cache_options_, handle_,
+                                              *contents_);
+  }
+}
+
+inline void BlockFetcher::CopyBufferToHeapBuf() {
+  assert(used_buf_ != heap_buf_.get());
+  heap_buf_ = AllocateBlock(block_size_with_trailer_, memory_allocator_);
+  memcpy(heap_buf_.get(), used_buf_, block_size_with_trailer_);
+#ifndef NDEBUG
+  num_heap_buf_memcpy_++;
+#endif
+}
+
+inline void BlockFetcher::CopyBufferToCompressedBuf() {
+  assert(used_buf_ != compressed_buf_.get());
+  compressed_buf_ =
+      AllocateBlock(block_size_with_trailer_, memory_allocator_compressed_);
+  memcpy(compressed_buf_.get(), used_buf_, block_size_with_trailer_);
+#ifndef NDEBUG
+  num_compressed_buf_memcpy_++;
+#endif
+}
+
+// Entering this method means the block is not compressed or does not need to
+// be uncompressed. The block can be in one of the following buffers:
+// 1. prefetch buffer if prefetch is enabled and the block is prefetched before
+// 2. stack_buf_ if block size is smaller than the stack_buf_ size and block
+//    is not compressed
+// 3. heap_buf_ if the block is not compressed
+// 4. compressed_buf_ if the block is compressed
+// 5. direct_io_buf_ if direct IO is enabled
+// After this method, if the block is compressed, it should be in
+// compressed_buf_, otherwise it should be in heap_buf_.
+inline void BlockFetcher::GetBlockContents() {
+  if (slice_.data() != used_buf_) {
+    // the slice content is not the buffer provided
+    *contents_ = BlockContents(Slice(slice_.data(), block_size_));
+  } else {
+    // page can be either uncompressed or compressed, the buffer either stack
+    // or heap provided. Refer to https://github.com/facebook/rocksdb/pull/4096
+    if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) {
+      CopyBufferToHeapBuf();
+    } else if (used_buf_ == compressed_buf_.get()) {
+      if (compression_type_ == kNoCompression &&
+          memory_allocator_ != memory_allocator_compressed_) {
+        CopyBufferToHeapBuf();
+      } else {
+        heap_buf_ = std::move(compressed_buf_);
+      }
+    } else if (direct_io_buf_.get() != nullptr) {
+      if (compression_type_ == kNoCompression) {
+        CopyBufferToHeapBuf();
+      } else {
+        CopyBufferToCompressedBuf();
+        heap_buf_ = std::move(compressed_buf_);
+      }
+    }
+    *contents_ = BlockContents(std::move(heap_buf_), block_size_);
+  }
+#ifndef NDEBUG
+  contents_->has_trailer = footer_.GetBlockTrailerSize() > 0;
+#endif
+}
+
+IOStatus BlockFetcher::ReadBlockContents() {
+  if (TryGetUncompressBlockFromPersistentCache()) {
+    compression_type_ = kNoCompression;
+#ifndef NDEBUG
+    contents_->has_trailer = footer_.GetBlockTrailerSize() > 0;
+#endif  // NDEBUG
+    return IOStatus::OK();
+  }
+  if (TryGetFromPrefetchBuffer()) {
+    if (!io_status_.ok()) {
+      return io_status_;
+    }
+  } else if (!TryGetSerializedBlockFromPersistentCache()) {
+    IOOptions opts;
+    io_status_ = file_->PrepareIOOptions(read_options_, opts);
+    // Actual file read
+    if (io_status_.ok()) {
+      if (file_->use_direct_io()) {
+        PERF_TIMER_GUARD(block_read_time);
+        PERF_CPU_TIMER_GUARD(block_read_cpu_time, nullptr);
+        io_status_ = file_->Read(
+            opts, handle_.offset(), block_size_with_trailer_, &slice_, nullptr,
+            &direct_io_buf_, read_options_.rate_limiter_priority);
+        PERF_COUNTER_ADD(block_read_count, 1);
+        used_buf_ = const_cast<char*>(slice_.data());
+      } else {
+        PrepareBufferForBlockFromFile();
+        PERF_TIMER_GUARD(block_read_time);
+        PERF_CPU_TIMER_GUARD(block_read_cpu_time, nullptr);
+        io_status_ = file_->Read(opts, handle_.offset(),
+                                 block_size_with_trailer_, &slice_, used_buf_,
+                                 nullptr, read_options_.rate_limiter_priority);
+        PERF_COUNTER_ADD(block_read_count, 1);
+#ifndef NDEBUG
+        if (slice_.data() == &stack_buf_[0]) {
+          num_stack_buf_memcpy_++;
+        } else if (slice_.data() == heap_buf_.get()) {
+          num_heap_buf_memcpy_++;
+        } else if (slice_.data() == compressed_buf_.get()) {
+          num_compressed_buf_memcpy_++;
+        }
+#endif
+      }
+    }
+
+    // TODO: introduce dedicated perf counter for range tombstones
+    switch (block_type_) {
+      case BlockType::kFilter:
+      case BlockType::kFilterPartitionIndex:
+        PERF_COUNTER_ADD(filter_block_read_count, 1);
+        break;
+
+      case BlockType::kCompressionDictionary:
+        PERF_COUNTER_ADD(compression_dict_block_read_count, 1);
+        break;
+
+      case BlockType::kIndex:
+        PERF_COUNTER_ADD(index_block_read_count, 1);
+        break;
+
+      // Nothing to do here as we don't have counters for the other types.
+      default:
+        break;
+    }
+
+    PERF_COUNTER_ADD(block_read_byte, block_size_with_trailer_);
+    if (!io_status_.ok()) {
+      return io_status_;
+    }
+
+    if (slice_.size() != block_size_with_trailer_) {
+      return IOStatus::Corruption(
+          "truncated block read from " + file_->file_name() + " offset " +
+          std::to_string(handle_.offset()) + ", expected " +
+          std::to_string(block_size_with_trailer_) + " bytes, got " +
+          std::to_string(slice_.size()));
+    }
+
+    ProcessTrailerIfPresent();
+    if (io_status_.ok()) {
+      InsertCompressedBlockToPersistentCacheIfNeeded();
+    } else {
+      return io_status_;
+    }
+  }
+
+  if (do_uncompress_ && compression_type_ != kNoCompression) {
+    PERF_TIMER_GUARD(block_decompress_time);
+    // compressed page, uncompress, update cache
+    UncompressionContext context(compression_type_);
+    UncompressionInfo info(context, uncompression_dict_, compression_type_);
+    io_status_ = status_to_io_status(UncompressSerializedBlock(
+        info, slice_.data(), block_size_, contents_, footer_.format_version(),
+        ioptions_, memory_allocator_));
+#ifndef NDEBUG
+    num_heap_buf_memcpy_++;
+#endif
+    compression_type_ = kNoCompression;
+  } else {
+    GetBlockContents();
+  }
+
+  InsertUncompressedBlockToPersistentCacheIfNeeded();
+
+  return io_status_;
+}
+
+IOStatus BlockFetcher::ReadAsyncBlockContents() {
+  if (TryGetUncompressBlockFromPersistentCache()) {
+    compression_type_ = kNoCompression;
+#ifndef NDEBUG
+    contents_->has_trailer = footer_.GetBlockTrailerSize() > 0;
+#endif  // NDEBUG
+    return IOStatus::OK();
+  } else if (!TryGetSerializedBlockFromPersistentCache()) {
+    assert(prefetch_buffer_ != nullptr);
+    if (!for_compaction_) {
+      IOOptions opts;
+      IOStatus io_s = file_->PrepareIOOptions(read_options_, opts);
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      io_s = status_to_io_status(prefetch_buffer_->PrefetchAsync(
+          opts, file_, handle_.offset(), block_size_with_trailer_, &slice_));
+      if (io_s.IsTryAgain()) {
+        return io_s;
+      }
+      if (io_s.ok()) {
+        // Data Block is already in prefetch.
+        got_from_prefetch_buffer_ = true;
+        ProcessTrailerIfPresent();
+        if (!io_status_.ok()) {
+          return io_status_;
+        }
+        used_buf_ = const_cast<char*>(slice_.data());
+
+        if (do_uncompress_ && compression_type_ != kNoCompression) {
+          PERF_TIMER_GUARD(block_decompress_time);
+          // compressed page, uncompress, update cache
+          UncompressionContext context(compression_type_);
+          UncompressionInfo info(context, uncompression_dict_,
+                                 compression_type_);
+          io_status_ = status_to_io_status(UncompressSerializedBlock(
+              info, slice_.data(), block_size_, contents_,
+              footer_.format_version(), ioptions_, memory_allocator_));
+#ifndef NDEBUG
+          num_heap_buf_memcpy_++;
+#endif
+          compression_type_ = kNoCompression;
+        } else {
+          GetBlockContents();
+        }
+        InsertUncompressedBlockToPersistentCacheIfNeeded();
+        return io_status_;
+      }
+    }
+    // Fall back to sequential reading of data blocks if io_s returns an
+    // error or for_compaction_ is true.
+    return ReadBlockContents();
+  }
+  return io_status_;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_fetcher.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_fetcher.h
new file mode 100644
index 0000000000..da6c352d0a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/block_fetcher.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "memory/memory_allocator_impl.h" +#include "table/block_based/block.h" +#include "table/block_based/block_type.h" +#include "table/format.h" +#include "table/persistent_cache_options.h" + +namespace ROCKSDB_NAMESPACE { + +// Retrieves a single block of a given file. Utilizes the prefetch buffer and/or +// persistent cache provided (if any) to try to avoid reading from the file +// directly. Note that both the prefetch buffer and the persistent cache are +// optional; also, note that the persistent cache may be configured to store +// either compressed or uncompressed blocks. +// +// If the retrieved block is compressed and the do_uncompress flag is set, +// BlockFetcher uncompresses the block (using the uncompression dictionary, +// if provided, to prime the compression algorithm), and returns the resulting +// uncompressed block data. Otherwise, it returns the original block. +// +// Two read options affect the behavior of BlockFetcher: if verify_checksums is +// true, the checksum of the (original) block is checked; if fill_cache is true, +// the block is added to the persistent cache if needed. +// +// Memory for uncompressed and compressed blocks is allocated as needed +// using memory_allocator and memory_allocator_compressed, respectively +// (if provided; otherwise, the default allocator is used). + +class BlockFetcher { + public: + BlockFetcher(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, + const Footer& footer /* ref retained */, + const ReadOptions& read_options, + const BlockHandle& handle /* ref retained */, + BlockContents* contents, + const ImmutableOptions& ioptions /* ref retained */, + bool do_uncompress, bool maybe_compressed, BlockType block_type, + const UncompressionDict& uncompression_dict /* ref retained */, + const PersistentCacheOptions& cache_options /* ref retained */, + MemoryAllocator* memory_allocator = nullptr, + MemoryAllocator* memory_allocator_compressed = nullptr, + bool for_compaction = false) + : file_(file), + prefetch_buffer_(prefetch_buffer), + footer_(footer), + read_options_(read_options), + handle_(handle), + contents_(contents), + ioptions_(ioptions), + do_uncompress_(do_uncompress), + maybe_compressed_(maybe_compressed), + block_type_(block_type), + block_size_(static_cast(handle_.size())), + block_size_with_trailer_(block_size_ + footer.GetBlockTrailerSize()), + uncompression_dict_(uncompression_dict), + cache_options_(cache_options), + memory_allocator_(memory_allocator), + memory_allocator_compressed_(memory_allocator_compressed), + for_compaction_(for_compaction) { + io_status_.PermitUncheckedError(); // TODO(AR) can we improve on this? 
+ } + + IOStatus ReadBlockContents(); + IOStatus ReadAsyncBlockContents(); + + inline CompressionType get_compression_type() const { + return compression_type_; + } + inline size_t GetBlockSizeWithTrailer() const { + return block_size_with_trailer_; + } + +#ifndef NDEBUG + int TEST_GetNumStackBufMemcpy() const { return num_stack_buf_memcpy_; } + int TEST_GetNumHeapBufMemcpy() const { return num_heap_buf_memcpy_; } + int TEST_GetNumCompressedBufMemcpy() const { + return num_compressed_buf_memcpy_; + } + +#endif + private: +#ifndef NDEBUG + int num_stack_buf_memcpy_ = 0; + int num_heap_buf_memcpy_ = 0; + int num_compressed_buf_memcpy_ = 0; + +#endif + static const uint32_t kDefaultStackBufferSize = 5000; + + RandomAccessFileReader* file_; + FilePrefetchBuffer* prefetch_buffer_; + const Footer& footer_; + const ReadOptions read_options_; + const BlockHandle& handle_; + BlockContents* contents_; + const ImmutableOptions& ioptions_; + const bool do_uncompress_; + const bool maybe_compressed_; + const BlockType block_type_; + const size_t block_size_; + const size_t block_size_with_trailer_; + const UncompressionDict& uncompression_dict_; + const PersistentCacheOptions& cache_options_; + MemoryAllocator* memory_allocator_; + MemoryAllocator* memory_allocator_compressed_; + IOStatus io_status_; + Slice slice_; + char* used_buf_ = nullptr; + AlignedBuf direct_io_buf_; + CacheAllocationPtr heap_buf_; + CacheAllocationPtr compressed_buf_; + char stack_buf_[kDefaultStackBufferSize]; + bool got_from_prefetch_buffer_ = false; + CompressionType compression_type_; + bool for_compaction_ = false; + + // return true if found + bool TryGetUncompressBlockFromPersistentCache(); + // return true if found + bool TryGetFromPrefetchBuffer(); + bool TryGetSerializedBlockFromPersistentCache(); + void PrepareBufferForBlockFromFile(); + // Copy content from used_buf_ to new heap_buf_. + void CopyBufferToHeapBuf(); + // Copy content from used_buf_ to new compressed_buf_. + void CopyBufferToCompressedBuf(); + void GetBlockContents(); + void InsertCompressedBlockToPersistentCacheIfNeeded(); + void InsertUncompressedBlockToPersistentCacheIfNeeded(); + void ProcessTrailerIfPresent(); +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/compaction_merging_iterator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/compaction_merging_iterator.cc new file mode 100644 index 0000000000..8a5c452405 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/compaction_merging_iterator.cc @@ -0,0 +1,370 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#include "table/compaction_merging_iterator.h" + +namespace ROCKSDB_NAMESPACE { +class CompactionMergingIterator : public InternalIterator { + public: + CompactionMergingIterator( + const InternalKeyComparator* comparator, InternalIterator** children, + int n, bool is_arena_mode, + std::vector< + std::pair> + range_tombstones) + : is_arena_mode_(is_arena_mode), + comparator_(comparator), + current_(nullptr), + minHeap_(CompactionHeapItemComparator(comparator_)), + pinned_iters_mgr_(nullptr) { + children_.resize(n); + for (int i = 0; i < n; i++) { + children_[i].level = i; + children_[i].iter.Set(children[i]); + assert(children_[i].type == HeapItem::ITERATOR); + } + assert(range_tombstones.size() == static_cast(n)); + for (auto& p : range_tombstones) { + range_tombstone_iters_.push_back(p.first); + } + pinned_heap_item_.resize(n); + for (int i = 0; i < n; ++i) { + if (range_tombstones[i].second) { + // for LevelIterator + *range_tombstones[i].second = &range_tombstone_iters_[i]; + } + pinned_heap_item_[i].level = i; + pinned_heap_item_[i].type = HeapItem::DELETE_RANGE_START; + } + } + + void considerStatus(const Status& s) { + if (!s.ok() && status_.ok()) { + status_ = s; + } + } + + ~CompactionMergingIterator() override { + // TODO: use unique_ptr for range_tombstone_iters_ + for (auto child : range_tombstone_iters_) { + delete child; + } + + for (auto& child : children_) { + child.iter.DeleteIter(is_arena_mode_); + } + status_.PermitUncheckedError(); + } + + bool Valid() const override { return current_ != nullptr && status_.ok(); } + + Status status() const override { return status_; } + + void SeekToFirst() override; + + void Seek(const Slice& target) override; + + void Next() override; + + Slice key() const override { + assert(Valid()); + return current_->key(); + } + + Slice value() const override { + assert(Valid()); + if (LIKELY(current_->type == HeapItem::ITERATOR)) { + return current_->iter.value(); + } else { + return dummy_tombstone_val; + } + } + + // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result + // from current child iterator. Potentially as long as one of child iterator + // report out of bound is not possible, we know current key is within bound. + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return current_->type == HeapItem::DELETE_RANGE_START || + current_->iter.MayBeOutOfLowerBound(); + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(Valid()); + return current_->type == HeapItem::DELETE_RANGE_START + ? IterBoundCheck::kUnknown + : current_->iter.UpperBoundCheckResult(); + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + for (auto& child : children_) { + child.iter.SetPinnedItersMgr(pinned_iters_mgr); + } + } + + bool IsDeleteRangeSentinelKey() const override { + assert(Valid()); + return current_->type == HeapItem::DELETE_RANGE_START; + } + + // Compaction uses the above subset of InternalIterator interface. 
+  void SeekToLast() override { assert(false); }
+
+  void SeekForPrev(const Slice&) override { assert(false); }
+
+  void Prev() override { assert(false); }
+
+  bool NextAndGetResult(IterateResult*) override {
+    assert(false);
+    return false;
+  }
+
+  bool IsKeyPinned() const override {
+    assert(false);
+    return false;
+  }
+
+  bool IsValuePinned() const override {
+    assert(false);
+    return false;
+  }
+
+  bool PrepareValue() override {
+    assert(false);
+    return false;
+  }
+
+ private:
+  struct HeapItem {
+    HeapItem() = default;
+
+    IteratorWrapper iter;
+    size_t level = 0;
+    std::string tombstone_str;
+    enum Type { ITERATOR, DELETE_RANGE_START };
+    Type type = ITERATOR;
+
+    explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
+        : level(_level), type(Type::ITERATOR) {
+      iter.Set(_iter);
+    }
+
+    void SetTombstoneForCompaction(const ParsedInternalKey&& pik) {
+      tombstone_str.clear();
+      AppendInternalKey(&tombstone_str, pik);
+    }
+
+    [[nodiscard]] Slice key() const {
+      return type == ITERATOR ? iter.key() : tombstone_str;
+    }
+  };
+
+  class CompactionHeapItemComparator {
+   public:
+    explicit CompactionHeapItemComparator(
+        const InternalKeyComparator* comparator)
+        : comparator_(comparator) {}
+
+    bool operator()(HeapItem* a, HeapItem* b) const {
+      int r = comparator_->Compare(a->key(), b->key());
+      // For each file, we assume all range tombstone start keys come before
+      // its file boundary sentinel key (file's meta.largest key).
+      // In the case when meta.smallest = meta.largest and the range tombstone
+      // start key is truncated at meta.smallest, the start key will have
+      // op_type = kMaxValid to make it smaller (see TruncatedRangeDelIterator
+      // constructor). The following assertion validates this assumption.
+      assert(a->type == b->type || r != 0);
+      return r > 0;
+    }
+
+   private:
+    const InternalKeyComparator* comparator_;
+  };
+
+  using CompactionMinHeap =
+      BinaryHeap<HeapItem*, CompactionHeapItemComparator>;
+  bool is_arena_mode_;
+  const InternalKeyComparator* comparator_;
+  // HeapItem for all child point iterators.
+  std::vector<HeapItem> children_;
+  // HeapItem for range tombstones. pinned_heap_item_[i] corresponds to the
+  // current range tombstone from range_tombstone_iters_[i].
+  std::vector<HeapItem> pinned_heap_item_;
+  // range_tombstone_iters_[i] contains range tombstones in the sorted run
+  // that corresponds to children_[i]. range_tombstone_iters_[i] ==
+  // nullptr means the sorted run of children_[i] does not have range
+  // tombstones (or the current SSTable does not have range tombstones in the
+  // case of LevelIterator).
+  std::vector<TruncatedRangeDelIterator*> range_tombstone_iters_;
+  // Used as value for range tombstone keys
+  std::string dummy_tombstone_val{};
+
+  // Skip file boundary sentinel keys.
+  void FindNextVisibleKey();
+
+  // top of minHeap_
+  HeapItem* current_;
+  // If any of the children have non-ok status, this is one of them.
+  Status status_;
+  CompactionMinHeap minHeap_;
+  PinnedIteratorsManager* pinned_iters_mgr_;
+  // Process a child that is not in the min heap.
+  // If valid, add to the min heap. Otherwise, check status.
+  void AddToMinHeapOrCheckStatus(HeapItem*);
+
+  HeapItem* CurrentForward() const {
+    return !minHeap_.empty() ? minHeap_.top() : nullptr;
+  }
+
+  void InsertRangeTombstoneAtLevel(size_t level) {
+    if (range_tombstone_iters_[level]->Valid()) {
+      pinned_heap_item_[level].SetTombstoneForCompaction(
+          range_tombstone_iters_[level]->start_key());
+      minHeap_.push(&pinned_heap_item_[level]);
+    }
+  }
+};
+
+void CompactionMergingIterator::SeekToFirst() {
+  minHeap_.clear();
+  status_ = Status::OK();
+  for (auto& child : children_) {
+    child.iter.SeekToFirst();
+    AddToMinHeapOrCheckStatus(&child);
+  }
+
+  for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
+    if (range_tombstone_iters_[i]) {
+      range_tombstone_iters_[i]->SeekToFirst();
+      InsertRangeTombstoneAtLevel(i);
+    }
+  }
+
+  FindNextVisibleKey();
+  current_ = CurrentForward();
+}
+
+void CompactionMergingIterator::Seek(const Slice& target) {
+  minHeap_.clear();
+  status_ = Status::OK();
+  for (auto& child : children_) {
+    child.iter.Seek(target);
+    AddToMinHeapOrCheckStatus(&child);
+  }
+
+  ParsedInternalKey pik;
+  ParseInternalKey(target, &pik, false /* log_err_key */)
+      .PermitUncheckedError();
+  for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
+    if (range_tombstone_iters_[i]) {
+      range_tombstone_iters_[i]->Seek(pik.user_key);
+      // For compaction, output keys should all be after the seek target.
+      while (range_tombstone_iters_[i]->Valid() &&
+             comparator_->Compare(range_tombstone_iters_[i]->start_key(),
+                                  pik) < 0) {
+        range_tombstone_iters_[i]->Next();
+      }
+      InsertRangeTombstoneAtLevel(i);
+    }
+  }
+
+  FindNextVisibleKey();
+  current_ = CurrentForward();
+}
+
+void CompactionMergingIterator::Next() {
+  assert(Valid());
+  // For the heap modifications below to be correct, current_ must be the
+  // current top of the heap.
+  assert(current_ == CurrentForward());
+  // current_ points to the current record; move that child iterator forward.
+  if (current_->type == HeapItem::ITERATOR) {
+    current_->iter.Next();
+    if (current_->iter.Valid()) {
+      // current is still valid after the Next() call above. Call
+      // replace_top() to restore the heap property. When the same child
+      // iterator yields a sequence of keys, this is cheap.
+      assert(current_->iter.status().ok());
+      minHeap_.replace_top(current_);
+    } else {
+      // current stopped being valid, remove it from the heap.
+      considerStatus(current_->iter.status());
+      minHeap_.pop();
+    }
+  } else {
+    assert(current_->type == HeapItem::DELETE_RANGE_START);
+    size_t level = current_->level;
+    assert(range_tombstone_iters_[level]);
+    range_tombstone_iters_[level]->Next();
+    if (range_tombstone_iters_[level]->Valid()) {
+      pinned_heap_item_[level].SetTombstoneForCompaction(
+          range_tombstone_iters_[level]->start_key());
+      minHeap_.replace_top(&pinned_heap_item_[level]);
+    } else {
+      minHeap_.pop();
+    }
+  }
+  FindNextVisibleKey();
+  current_ = CurrentForward();
+}
+
+void CompactionMergingIterator::FindNextVisibleKey() {
+  while (!minHeap_.empty()) {
+    HeapItem* current = minHeap_.top();
+    // IsDeleteRangeSentinelKey() here means file boundary sentinel keys.
+    if (current->type != HeapItem::ITERATOR ||
+        !current->iter.IsDeleteRangeSentinelKey()) {
+      return;
+    }
+    // range tombstone start keys from the same SSTable should have been
+    // exhausted
+    assert(!range_tombstone_iters_[current->level] ||
+           !range_tombstone_iters_[current->level]->Valid());
+    // current->iter is a LevelIterator, and it enters a new SST file in the
+    // Next() call here.
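+    // (Illustration: when the sentinel for file F is popped, the Next()
+    // below positions the LevelIterator at the first key of the file after
+    // F, and InsertRangeTombstoneAtLevel() re-arms that new file's range
+    // tombstones.)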
+    current->iter.Next();
+    if (current->iter.Valid()) {
+      assert(current->iter.status().ok());
+      minHeap_.replace_top(current);
+    } else {
+      minHeap_.pop();
+    }
+    if (range_tombstone_iters_[current->level]) {
+      InsertRangeTombstoneAtLevel(current->level);
+    }
+  }
+}
+
+void CompactionMergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) {
+  if (child->iter.Valid()) {
+    assert(child->iter.status().ok());
+    minHeap_.push(child);
+  } else {
+    considerStatus(child->iter.status());
+  }
+}
+
+InternalIterator* NewCompactionMergingIterator(
+    const InternalKeyComparator* comparator, InternalIterator** children,
+    int n,
+    std::vector<std::pair<TruncatedRangeDelIterator*,
+                          TruncatedRangeDelIterator**>>& range_tombstone_iters,
+    Arena* arena) {
+  assert(n >= 0);
+  if (n == 0) {
+    return NewEmptyInternalIterator<Slice>(arena);
+  } else {
+    if (arena == nullptr) {
+      return new CompactionMergingIterator(comparator, children, n,
+                                           false /* is_arena_mode */,
+                                           range_tombstone_iters);
+    } else {
+      auto mem = arena->AllocateAligned(sizeof(CompactionMergingIterator));
+      return new (mem) CompactionMergingIterator(comparator, children, n,
+                                                 true /* is_arena_mode */,
+                                                 range_tombstone_iters);
+    }
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/compaction_merging_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/compaction_merging_iterator.h
new file mode 100644
index 0000000000..e3fd7797fd
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/compaction_merging_iterator.h
@@ -0,0 +1,44 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "db/range_del_aggregator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/types.h"
+#include "table/merging_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/*
+ * This is a simplified version of MergingIterator and is specifically used for
+ * compaction. It merges the input `children` iterators into a sorted stream of
+ * keys. Range tombstone start keys are also emitted to prevent oversize
+ * compactions. For example, consider an L1 file with content [a, b), y, z,
+ * where [a, b) is a range tombstone and y and z are point keys. This could
+ * cause an oversize compaction as it can overlap with a wide range of key
+ * space in L2.
+ *
+ * CompactionMergingIterator emits range tombstone start keys from each LSM
+ * level's range tombstone iterator, and for each range tombstone
+ * [start,end)@seqno, the key will be start@seqno with op_type
+ * kTypeRangeDeletion unless truncated at file boundary (see detail in
+ * TruncatedRangeDelIterator::start_key()).
+ *
+ * Caller should use CompactionMergingIterator::IsDeleteRangeSentinelKey() to
+ * check if the current key is a range tombstone key.
+ * TODO(cbi): IsDeleteRangeSentinelKey() is used for two kinds of keys at
+ * different layers: file boundary and range tombstone keys. Separate them
+ * into two APIs for clarity.
+ */
+class CompactionMergingIterator;
+
+InternalIterator* NewCompactionMergingIterator(
+    const InternalKeyComparator* comparator, InternalIterator** children,
+    int n,
+    std::vector<std::pair<TruncatedRangeDelIterator*,
+                          TruncatedRangeDelIterator**>>& range_tombstone_iters,
+    Arena* arena = nullptr);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_builder.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_builder.cc
new file mode 100644
index 0000000000..7ca72365fd
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_builder.cc
@@ -0,0 +1,551 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/cuckoo/cuckoo_table_builder.h"
+
+#include <assert.h>
+
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_builder.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "util/autovector.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+const std::string CuckooTablePropertyNames::kEmptyKey =
+    "rocksdb.cuckoo.bucket.empty.key";
+const std::string CuckooTablePropertyNames::kNumHashFunc =
+    "rocksdb.cuckoo.hash.num";
+const std::string CuckooTablePropertyNames::kHashTableSize =
+    "rocksdb.cuckoo.hash.size";
+const std::string CuckooTablePropertyNames::kValueLength =
+    "rocksdb.cuckoo.value.length";
+const std::string CuckooTablePropertyNames::kIsLastLevel =
+    "rocksdb.cuckoo.file.islastlevel";
+const std::string CuckooTablePropertyNames::kCuckooBlockSize =
+    "rocksdb.cuckoo.hash.cuckooblocksize";
+const std::string CuckooTablePropertyNames::kIdentityAsFirstHash =
+    "rocksdb.cuckoo.hash.identityfirst";
+const std::string CuckooTablePropertyNames::kUseModuleHash =
+    "rocksdb.cuckoo.hash.usemodule";
+const std::string CuckooTablePropertyNames::kUserKeyLength =
+    "rocksdb.cuckoo.hash.userkeylength";
+
+// Obtained by running echo rocksdb.table.cuckoo | sha1sum
+extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull;
+
+CuckooTableBuilder::CuckooTableBuilder(
+    WritableFileWriter* file, double max_hash_table_ratio,
+    uint32_t max_num_hash_table, uint32_t max_search_depth,
+    const Comparator* user_comparator, uint32_t cuckoo_block_size,
+    bool use_module_hash, bool identity_as_first_hash,
+    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t),
+    uint32_t column_family_id, const std::string& column_family_name,
+    const std::string& db_id, const std::string& db_session_id,
+    uint64_t file_number)
+    : num_hash_func_(2),
+      file_(file),
+      max_hash_table_ratio_(max_hash_table_ratio),
+      max_num_hash_func_(max_num_hash_table),
+      max_search_depth_(max_search_depth),
+      cuckoo_block_size_(std::max(1U, cuckoo_block_size)),
+      hash_table_size_(use_module_hash ? 0 : 2),
+      is_last_level_file_(false),
+      has_seen_first_key_(false),
+      has_seen_first_value_(false),
+      key_size_(0),
+      value_size_(0),
+      num_entries_(0),
+      num_values_(0),
+      ucomp_(user_comparator),
+      use_module_hash_(use_module_hash),
+      identity_as_first_hash_(identity_as_first_hash),
+      get_slice_hash_(get_slice_hash),
+      closed_(false) {
+  // Data is in a huge block.
+ properties_.num_data_blocks = 1; + properties_.index_size = 0; + properties_.filter_size = 0; + properties_.column_family_id = column_family_id; + properties_.column_family_name = column_family_name; + properties_.db_id = db_id; + properties_.db_session_id = db_session_id; + properties_.orig_file_number = file_number; + status_.PermitUncheckedError(); + io_status_.PermitUncheckedError(); +} + +void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { + if (num_entries_ >= kMaxVectorIdx - 1) { + status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1"); + return; + } + ParsedInternalKey ikey; + Status pik_status = + ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + status_ = Status::Corruption("Unable to parse key into internal key. ", + pik_status.getState()); + return; + } + if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) { + status_ = Status::NotSupported("Unsupported key type " + + std::to_string(ikey.type)); + return; + } + + // Determine if we can ignore the sequence number and value type from + // internal keys by looking at sequence number from first key. We assume + // that if first key has a zero sequence number, then all the remaining + // keys will have zero seq. no. + if (!has_seen_first_key_) { + is_last_level_file_ = ikey.sequence == 0; + has_seen_first_key_ = true; + smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size(); + } + if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) { + status_ = Status::NotSupported("all keys have to be the same size"); + return; + } + + if (ikey.type == kTypeValue) { + if (!has_seen_first_value_) { + has_seen_first_value_ = true; + value_size_ = value.size(); + } + if (value_size_ != value.size()) { + status_ = Status::NotSupported("all values have to be the same size"); + return; + } + + if (is_last_level_file_) { + kvs_.append(ikey.user_key.data(), ikey.user_key.size()); + } else { + kvs_.append(key.data(), key.size()); + } + kvs_.append(value.data(), value.size()); + ++num_values_; + } else { + if (is_last_level_file_) { + deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size()); + } else { + deleted_keys_.append(key.data(), key.size()); + } + } + ++num_entries_; + + // In order to fill the empty buckets in the hash table, we identify a + // key which is not used so far (unused_user_key). We determine this by + // maintaining smallest and largest keys inserted so far in bytewise order + // and use them to find a key outside this range in Finish() operation. + // Note that this strategy is independent of user comparator used here. 
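+  //
+  // Worked example of the Finish()-time search (hypothetical keys): if the
+  // smallest key seen is "abc", decrementing its last byte yields "abb",
+  // which is < "abc" in bytewise order and therefore unused; if every such
+  // decrement fails (e.g. the smallest key is all 0x00 bytes), Finish()
+  // falls back to incrementing bytes of the largest key instead.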
+  if (ikey.user_key.compare(smallest_user_key_) < 0) {
+    smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+  } else if (ikey.user_key.compare(largest_user_key_) > 0) {
+    largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+  }
+  if (!use_module_hash_) {
+    if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) {
+      hash_table_size_ *= 2;
+    }
+  }
+}
+
+bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const {
+  assert(closed_);
+  return idx >= num_values_;
+}
+
+Slice CuckooTableBuilder::GetKey(uint64_t idx) const {
+  assert(closed_);
+  if (IsDeletedKey(idx)) {
+    return Slice(
+        &deleted_keys_[static_cast<size_t>((idx - num_values_) * key_size_)],
+        static_cast<size_t>(key_size_));
+  }
+  return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_))],
+               static_cast<size_t>(key_size_));
+}
+
+Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const {
+  assert(closed_);
+  return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx));
+}
+
+Slice CuckooTableBuilder::GetValue(uint64_t idx) const {
+  assert(closed_);
+  if (IsDeletedKey(idx)) {
+    static std::string empty_value(static_cast<size_t>(value_size_), 'a');
+    return Slice(empty_value);
+  }
+  return Slice(
+      &kvs_[static_cast<size_t>(idx * (key_size_ + value_size_) + key_size_)],
+      static_cast<size_t>(value_size_));
+}
+
+Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
+  buckets->resize(
+      static_cast<size_t>(hash_table_size_ + cuckoo_block_size_ - 1));
+  uint32_t make_space_for_key_call_id = 0;
+  for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) {
+    uint64_t bucket_id = 0;
+    bool bucket_found = false;
+    autovector<uint64_t> hash_vals;
+    Slice user_key = GetUserKey(vector_idx);
+    for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
+         ++hash_cnt) {
+      uint64_t hash_val =
+          CuckooHash(user_key, hash_cnt, use_module_hash_, hash_table_size_,
+                     identity_as_first_hash_, get_slice_hash_);
+      // If there is a collision, check the next cuckoo_block_size_ locations
+      // for empty locations. While checking, if we reach the end of the hash
+      // table, stop searching and proceed to the next hash function.
+      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+           ++block_idx, ++hash_val) {
+        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
+            kMaxVectorIdx) {
+          bucket_id = hash_val;
+          bucket_found = true;
+          break;
+        } else {
+          if (ucomp_->Compare(
+                  user_key,
+                  GetUserKey((*buckets)[static_cast<size_t>(hash_val)]
+                                 .vector_idx)) == 0) {
+            return Status::NotSupported("Same key is being inserted again.");
+          }
+          hash_vals.push_back(hash_val);
+        }
+      }
+    }
+    while (!bucket_found &&
+           !MakeSpaceForKey(hash_vals, ++make_space_for_key_call_id, buckets,
+                            &bucket_id)) {
+      // Rehash by increasing the number of hash functions.
+      if (num_hash_func_ >= max_num_hash_func_) {
+        return Status::NotSupported("Too many collisions. Unable to hash.");
+      }
+      // We don't really need to rehash the entire table because old hashes
+      // are still valid and we only increased the number of hash functions.
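+      // (E.g., with hypothetical values: if num_hash_func_ was 2, the key
+      // now also gets candidate buckets from hash function 2; keys already
+      // placed via functions 0 and 1 stay where they are.)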
+      uint64_t hash_val = CuckooHash(user_key, num_hash_func_,
+                                     use_module_hash_, hash_table_size_,
+                                     identity_as_first_hash_, get_slice_hash_);
+      ++num_hash_func_;
+      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+           ++block_idx, ++hash_val) {
+        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
+            kMaxVectorIdx) {
+          bucket_found = true;
+          bucket_id = hash_val;
+          break;
+        } else {
+          hash_vals.push_back(hash_val);
+        }
+      }
+    }
+    (*buckets)[static_cast<size_t>(bucket_id)].vector_idx = vector_idx;
+  }
+  return Status::OK();
+}
+
+Status CuckooTableBuilder::Finish() {
+  assert(!closed_);
+  closed_ = true;
+  std::vector<CuckooBucket> buckets;
+  std::string unused_bucket;
+  if (num_entries_ > 0) {
+    // Calculate the real hash size if module hash is enabled.
+    if (use_module_hash_) {
+      hash_table_size_ =
+          static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_);
+    }
+    status_ = MakeHashTable(&buckets);
+    if (!status_.ok()) {
+      return status_;
+    }
+    // Determine unused_user_key to fill empty buckets.
+    std::string unused_user_key = smallest_user_key_;
+    int curr_pos = static_cast<int>(unused_user_key.size()) - 1;
+    while (curr_pos >= 0) {
+      --unused_user_key[curr_pos];
+      if (Slice(unused_user_key).compare(smallest_user_key_) < 0) {
+        break;
+      }
+      --curr_pos;
+    }
+    if (curr_pos < 0) {
+      // Try using the largest key to identify an unused key.
+      unused_user_key = largest_user_key_;
+      curr_pos = static_cast<int>(unused_user_key.size()) - 1;
+      while (curr_pos >= 0) {
+        ++unused_user_key[curr_pos];
+        if (Slice(unused_user_key).compare(largest_user_key_) > 0) {
+          break;
+        }
+        --curr_pos;
+      }
+    }
+    if (curr_pos < 0) {
+      return Status::Corruption("Unable to find unused key");
+    }
+    if (is_last_level_file_) {
+      unused_bucket = unused_user_key;
+    } else {
+      ParsedInternalKey ikey(unused_user_key, 0, kTypeValue);
+      AppendInternalKey(&unused_bucket, ikey);
+    }
+  }
+  properties_.num_entries = num_entries_;
+  properties_.num_deletions = num_entries_ - num_values_;
+  properties_.fixed_key_len = key_size_;
+  properties_.user_collected_properties[CuckooTablePropertyNames::kValueLength]
+      .assign(reinterpret_cast<const char*>(&value_size_),
+              sizeof(value_size_));
+
+  uint64_t bucket_size = key_size_ + value_size_;
+  unused_bucket.resize(static_cast<size_t>(bucket_size), 'a');
+  // Write the table.
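+  //
+  // Layout sketch of the file produced below (derived from the writes that
+  // follow):
+  //
+  //   bucket 0 .. bucket N-1   each bucket_size = key_size_ + value_size_
+  //   property block           table properties (incl. cuckoo properties)
+  //   meta-index block         points at the property block
+  //   footer                   magic number + meta-index block handle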
+  uint32_t num_added = 0;
+  for (auto& bucket : buckets) {
+    if (bucket.vector_idx == kMaxVectorIdx) {
+      io_status_ = file_->Append(Slice(unused_bucket));
+    } else {
+      ++num_added;
+      io_status_ = file_->Append(GetKey(bucket.vector_idx));
+      if (io_status_.ok()) {
+        if (value_size_ > 0) {
+          io_status_ = file_->Append(GetValue(bucket.vector_idx));
+        }
+      }
+    }
+    if (!io_status_.ok()) {
+      status_ = io_status_;
+      return status_;
+    }
+  }
+  assert(num_added == NumEntries());
+  properties_.raw_key_size = num_added * properties_.fixed_key_len;
+  properties_.raw_value_size = num_added * value_size_;
+
+  uint64_t offset = buckets.size() * bucket_size;
+  properties_.data_size = offset;
+  unused_bucket.resize(static_cast<size_t>(properties_.fixed_key_len));
+  properties_.user_collected_properties[CuckooTablePropertyNames::kEmptyKey] =
+      unused_bucket;
+  properties_.user_collected_properties[CuckooTablePropertyNames::kNumHashFunc]
+      .assign(reinterpret_cast<const char*>(&num_hash_func_),
+              sizeof(num_hash_func_));
+
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kHashTableSize]
+      .assign(reinterpret_cast<const char*>(&hash_table_size_),
+              sizeof(hash_table_size_));
+  properties_.user_collected_properties[CuckooTablePropertyNames::kIsLastLevel]
+      .assign(reinterpret_cast<const char*>(&is_last_level_file_),
+              sizeof(is_last_level_file_));
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kCuckooBlockSize]
+      .assign(reinterpret_cast<const char*>(&cuckoo_block_size_),
+              sizeof(cuckoo_block_size_));
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kIdentityAsFirstHash]
+      .assign(reinterpret_cast<const char*>(&identity_as_first_hash_),
+              sizeof(identity_as_first_hash_));
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kUseModuleHash]
+      .assign(reinterpret_cast<const char*>(&use_module_hash_),
+              sizeof(use_module_hash_));
+  uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size());
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kUserKeyLength]
+      .assign(reinterpret_cast<const char*>(&user_key_len),
+              sizeof(user_key_len));
+
+  // Write meta blocks.
+  MetaIndexBuilder meta_index_builder;
+  PropertyBlockBuilder property_block_builder;
+
+  property_block_builder.AddTableProperty(properties_);
+  property_block_builder.Add(properties_.user_collected_properties);
+  Slice property_block = property_block_builder.Finish();
+  BlockHandle property_block_handle;
+  property_block_handle.set_offset(offset);
+  property_block_handle.set_size(property_block.size());
+  io_status_ = file_->Append(property_block);
+  offset += property_block.size();
+  if (!io_status_.ok()) {
+    status_ = io_status_;
+    return status_;
+  }
+
+  meta_index_builder.Add(kPropertiesBlockName, property_block_handle);
+  Slice meta_index_block = meta_index_builder.Finish();
+
+  BlockHandle meta_index_block_handle;
+  meta_index_block_handle.set_offset(offset);
+  meta_index_block_handle.set_size(meta_index_block.size());
+  io_status_ = file_->Append(meta_index_block);
+  if (!io_status_.ok()) {
+    status_ = io_status_;
+    return status_;
+  }
+
+  FooterBuilder footer;
+  footer.Build(kCuckooTableMagicNumber, /* format_version */ 1, offset,
+               kNoChecksum, meta_index_block_handle);
+  io_status_ = file_->Append(footer.GetSlice());
+  status_ = io_status_;
+  return status_;
+}
+
+void CuckooTableBuilder::Abandon() {
+  assert(!closed_);
+  closed_ = true;
+}
+
+uint64_t CuckooTableBuilder::NumEntries() const { return num_entries_; }
+
+uint64_t CuckooTableBuilder::FileSize() const {
+  if (closed_) {
+    return file_->GetFileSize();
+  } else if (num_entries_ == 0) {
+    return 0;
+  }
+
+  if (use_module_hash_) {
+    return static_cast<uint64_t>((key_size_ + value_size_) * num_entries_ /
+                                 max_hash_table_ratio_);
+  } else {
+    // Account for buckets being a power of two.
+    // As elements are added, the file size remains constant for a while and
+    // then doubles. Since the compaction algorithm stops adding elements
+    // only after it exceeds the file limit, we account for the extra element
+    // being added here.
+    uint64_t expected_hash_table_size = hash_table_size_;
+    if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) {
+      expected_hash_table_size *= 2;
+    }
+    return (key_size_ + value_size_) * expected_hash_table_size - 1;
+  }
+}
+
+// This method is invoked when there is no place to insert the target key.
+// It searches for a set of elements that can be moved to accommodate target
+// key. The search is a BFS graph traversal with first level (hash_vals)
+// being all the buckets target key could go to.
+// Then, from each node (curr_node), we find all the buckets that curr_node
+// could go to. They form the children of curr_node in the tree.
+// We continue the traversal until we find an empty bucket, in which case, we
+// move all elements along the path from first level to this empty bucket, to
+// make space for target key which is inserted at first level (*bucket_id).
+// If tree depth exceeds max depth, we return false indicating failure.
+bool CuckooTableBuilder::MakeSpaceForKey(
+    const autovector<uint64_t>& hash_vals,
+    const uint32_t make_space_for_key_call_id,
+    std::vector<CuckooBucket>* buckets, uint64_t* bucket_id) {
+  struct CuckooNode {
+    uint64_t bucket_id;
+    uint32_t depth;
+    uint32_t parent_pos;
+    CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos)
+        : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {}
+  };
+  // This is a BFS search tree that is stored simply as a vector.
+  // Each node stores the index of its parent node in the vector.
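+  // For illustration, with num_hash_func_ = 2 the vector might evolve as
+  //
+  //   tree = [{b0, depth 0, parent 0}, {b1, depth 0, parent 0},
+  //           {b4, depth 1, parent 0}, {b9, depth 1, parent 1}, ...]
+  //
+  // where b4 and b9 are alternative buckets for the keys currently stored in
+  // b0 and b1. The bucket ids are made up for the example; only the parent
+  // links matter when the displacement path is replayed below.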
+  std::vector<CuckooNode> tree;
+  // We want to identify already visited buckets in the current method call so
+  // that we don't add the same buckets again for exploration in the tree.
+  // We do this by maintaining a count of the current method call in
+  // make_space_for_key_call_id, which acts as a unique id for this invocation
+  // of the method. We store this number into the nodes that we explore in the
+  // current method call.
+  // It is unlikely for the increment operation to overflow because the maximum
+  // no. of times this will be called is <= max_num_hash_func_ + num_entries_.
+  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
+    uint64_t bid = hash_vals[hash_cnt];
+    (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id =
+        make_space_for_key_call_id;
+    tree.push_back(CuckooNode(bid, 0, 0));
+  }
+  bool null_found = false;
+  uint32_t curr_pos = 0;
+  while (!null_found && curr_pos < tree.size()) {
+    CuckooNode& curr_node = tree[curr_pos];
+    uint32_t curr_depth = curr_node.depth;
+    if (curr_depth >= max_search_depth_) {
+      break;
+    }
+    CuckooBucket& curr_bucket =
+        (*buckets)[static_cast<size_t>(curr_node.bucket_id)];
+    for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !null_found;
+         ++hash_cnt) {
+      uint64_t child_bucket_id = CuckooHash(
+          GetUserKey(curr_bucket.vector_idx), hash_cnt, use_module_hash_,
+          hash_table_size_, identity_as_first_hash_, get_slice_hash_);
+      // Iterate inside Cuckoo Block.
+      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+           ++block_idx, ++child_bucket_id) {
+        if ((*buckets)[static_cast<size_t>(child_bucket_id)]
+                .make_space_for_key_call_id == make_space_for_key_call_id) {
+          continue;
+        }
+        (*buckets)[static_cast<size_t>(child_bucket_id)]
+            .make_space_for_key_call_id = make_space_for_key_call_id;
+        tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, curr_pos));
+        if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx ==
+            kMaxVectorIdx) {
+          null_found = true;
+          break;
+        }
+      }
+    }
+    ++curr_pos;
+  }
+
+  if (null_found) {
+    // There is an empty node in tree.back(). Now, traverse the path from this
+    // empty node to top of the tree and at every node in the path, replace
+    // child with the parent. Stop when first level is reached in the tree
+    // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return
+    // this location in first level for target key to be inserted.
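+    // Continuing the illustration above: if the empty bucket was discovered
+    // at tree position 3 whose parent is position 1, the loop below copies
+    // the occupant of tree[1]'s bucket into tree[3]'s bucket and then stops,
+    // because position 1 lies in the first level (< num_hash_func_). That
+    // leaves tree[1].bucket_id free for the target key.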
+    uint32_t bucket_to_replace_pos = static_cast<uint32_t>(tree.size()) - 1;
+    while (bucket_to_replace_pos >= num_hash_func_) {
+      CuckooNode& curr_node = tree[bucket_to_replace_pos];
+      (*buckets)[static_cast<size_t>(curr_node.bucket_id)] =
+          (*buckets)[static_cast<size_t>(tree[curr_node.parent_pos].bucket_id)];
+      bucket_to_replace_pos = curr_node.parent_pos;
+    }
+    *bucket_id = tree[bucket_to_replace_pos].bucket_id;
+  }
+  return null_found;
+}
+
+std::string CuckooTableBuilder::GetFileChecksum() const {
+  if (file_ != nullptr) {
+    return file_->GetFileChecksum();
+  } else {
+    return kUnknownFileChecksum;
+  }
+}
+
+const char* CuckooTableBuilder::GetFileChecksumFuncName() const {
+  if (file_ != nullptr) {
+    return file_->GetFileChecksumFuncName();
+  } else {
+    return kUnknownFileChecksumFuncName;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_builder.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_builder.h
new file mode 100644
index 0000000000..3a19dd6f99
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_builder.h
@@ -0,0 +1,136 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <stdint.h>
+
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/table_builder.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CuckooTableBuilder : public TableBuilder {
+ public:
+  CuckooTableBuilder(
+      WritableFileWriter* file, double max_hash_table_ratio,
+      uint32_t max_num_hash_func, uint32_t max_search_depth,
+      const Comparator* user_comparator, uint32_t cuckoo_block_size,
+      bool use_module_hash, bool identity_as_first_hash,
+      uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t),
+      uint32_t column_family_id, const std::string& column_family_name,
+      const std::string& db_id = "", const std::string& db_session_id = "",
+      uint64_t file_number = 0);
+  // No copying allowed
+  CuckooTableBuilder(const CuckooTableBuilder&) = delete;
+  void operator=(const CuckooTableBuilder&) = delete;
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~CuckooTableBuilder() {}
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override { return status_; }
+
+  // Return non-ok iff some error happens during IO.
+  IOStatus io_status() const override { return io_status_; }
+
+  // Finish building the table. Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned. Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
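+  // A typical lifecycle, sketched with error handling elided (`builder` and
+  // `sorted_input` are placeholders, not names from this file):
+  //
+  //   CuckooTableBuilder builder(/* ... */);
+  //   for (const auto& kv : sorted_input) builder.Add(kv.first, kv.second);
+  //   if (builder.status().ok()) {
+  //     Status s = builder.Finish();
+  //   } else {
+  //     builder.Abandon();
+  //   }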
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  // Size of the file generated so far. If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+  TableProperties GetTableProperties() const override { return properties_; }
+
+  // Get file checksum
+  std::string GetFileChecksum() const override;
+
+  // Get file checksum function name
+  const char* GetFileChecksumFuncName() const override;
+
+ private:
+  struct CuckooBucket {
+    CuckooBucket() : vector_idx(kMaxVectorIdx), make_space_for_key_call_id(0) {}
+    uint32_t vector_idx;
+    // This number will not exceed kvs_.size() + max_num_hash_func_.
+    // We assume number of items is <= 2^32.
+    uint32_t make_space_for_key_call_id;
+  };
+  static const uint32_t kMaxVectorIdx = std::numeric_limits<uint32_t>::max();
+
+  bool MakeSpaceForKey(const autovector<uint64_t>& hash_vals,
+                       const uint32_t call_id,
+                       std::vector<CuckooBucket>* buckets, uint64_t* bucket_id);
+  Status MakeHashTable(std::vector<CuckooBucket>* buckets);
+
+  inline bool IsDeletedKey(uint64_t idx) const;
+  inline Slice GetKey(uint64_t idx) const;
+  inline Slice GetUserKey(uint64_t idx) const;
+  inline Slice GetValue(uint64_t idx) const;
+
+  uint32_t num_hash_func_;
+  WritableFileWriter* file_;
+  const double max_hash_table_ratio_;
+  const uint32_t max_num_hash_func_;
+  const uint32_t max_search_depth_;
+  const uint32_t cuckoo_block_size_;
+  uint64_t hash_table_size_;
+  bool is_last_level_file_;
+  bool has_seen_first_key_;
+  bool has_seen_first_value_;
+  uint64_t key_size_;
+  uint64_t value_size_;
+  // A list of fixed-size key-value pairs concatenated into a string.
+  // Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific
+  // key / value given an index
+  std::string kvs_;
+  std::string deleted_keys_;
+  // Number of key-value pairs stored in kvs_ + number of deleted keys
+  uint64_t num_entries_;
+  // Number of keys that contain a value (non-deletion op)
+  uint64_t num_values_;
+  Status status_;
+  IOStatus io_status_;
+  TableProperties properties_;
+  const Comparator* ucomp_;
+  bool use_module_hash_;
+  bool identity_as_first_hash_;
+  uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index,
+                              uint64_t max_num_buckets);
+  std::string largest_user_key_ = "";
+  std::string smallest_user_key_ = "";
+
+  bool closed_;  // Either Finish() or Abandon() has been called.
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_factory.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_factory.cc
new file mode 100644
index 0000000000..774e00212d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_factory.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
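+
+// Wiring the factory into a database is a one-line change on the options
+// (an illustrative sketch; `options` is a placeholder Options instance):
+//
+//   CuckooTableOptions cuckoo_opts;
+//   cuckoo_opts.hash_table_ratio = 0.9;
+//   options.table_factory.reset(NewCuckooTableFactory(cuckoo_opts));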
+ +#include "table/cuckoo/cuckoo_table_factory.h" + +#include "db/dbformat.h" +#include "options/configurable_helper.h" +#include "rocksdb/utilities/options_type.h" +#include "table/cuckoo/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_reader.h" + +namespace ROCKSDB_NAMESPACE { + +Status CuckooTableFactory::NewTableReader( + const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table, + bool /*prefetch_index_and_filter_in_cache*/) const { + std::unique_ptr new_reader(new CuckooTableReader( + table_reader_options.ioptions, std::move(file), file_size, + table_reader_options.internal_comparator.user_comparator(), nullptr)); + Status s = new_reader->status(); + if (s.ok()) { + *table = std::move(new_reader); + } + return s; +} + +TableBuilder* CuckooTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const { + // TODO: change builder to take the option struct + return new CuckooTableBuilder( + file, table_options_.hash_table_ratio, 64, + table_options_.max_search_depth, + table_builder_options.internal_comparator.user_comparator(), + table_options_.cuckoo_block_size, table_options_.use_module_hash, + table_options_.identity_as_first_hash, nullptr /* get_slice_hash */, + table_builder_options.column_family_id, + table_builder_options.column_family_name, table_builder_options.db_id, + table_builder_options.db_session_id, table_builder_options.cur_file_num); +} + +std::string CuckooTableFactory::GetPrintableOptions() const { + std::string ret; + ret.reserve(2000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", + table_options_.hash_table_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " max_search_depth: %u\n", + table_options_.max_search_depth); + ret.append(buffer); + snprintf(buffer, kBufferSize, " cuckoo_block_size: %u\n", + table_options_.cuckoo_block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " identity_as_first_hash: %d\n", + table_options_.identity_as_first_hash); + ret.append(buffer); + return ret; +} + +static std::unordered_map cuckoo_table_type_info = + { + {"hash_table_ratio", + {offsetof(struct CuckooTableOptions, hash_table_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_search_depth", + {offsetof(struct CuckooTableOptions, max_search_depth), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cuckoo_block_size", + {offsetof(struct CuckooTableOptions, cuckoo_block_size), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"identity_as_first_hash", + {offsetof(struct CuckooTableOptions, identity_as_first_hash), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_module_hash", + {offsetof(struct CuckooTableOptions, use_module_hash), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +CuckooTableFactory::CuckooTableFactory(const CuckooTableOptions& table_options) + : table_options_(table_options) { + RegisterOptions(&table_options_, &cuckoo_table_type_info); +} + +TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) { + return new CuckooTableFactory(table_options); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_factory.h 
b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_factory.h new file mode 100644 index 0000000000..7132cec659 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_factory.h @@ -0,0 +1,80 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "util/murmurhash.h" + +namespace ROCKSDB_NAMESPACE { + +const uint32_t kCuckooMurmurSeedMultiplier = 816922183; +static inline uint64_t CuckooHash( + const Slice& user_key, uint32_t hash_cnt, bool use_module_hash, + uint64_t table_size_, bool identity_as_first_hash, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) { +#if !defined NDEBUG || defined OS_WIN + // This part is used only in unit tests but we have to keep it for Windows + // build as we run test in both debug and release modes under Windows. + if (get_slice_hash != nullptr) { + return get_slice_hash(user_key, hash_cnt, table_size_); + } +#else + (void)get_slice_hash; +#endif + + uint64_t value = 0; + if (hash_cnt == 0 && identity_as_first_hash) { + value = (*reinterpret_cast(user_key.data())); + } else { + value = MurmurHash(user_key.data(), static_cast(user_key.size()), + kCuckooMurmurSeedMultiplier * hash_cnt); + } + if (use_module_hash) { + return value % table_size_; + } else { + return value & (table_size_ - 1); + } +} + +// Cuckoo Table is designed for applications that require fast point lookups +// but not fast range scans. +// +// Some assumptions: +// - Key length and Value length are fixed. +// - Does not support Snapshot. +// - Does not support Merge operations. +// - Does not support prefix bloom filters. +class CuckooTableFactory : public TableFactory { + public: + explicit CuckooTableFactory( + const CuckooTableOptions& table_option = CuckooTableOptions()); + ~CuckooTableFactory() {} + + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kCuckooTableName(); } + const char* Name() const override { return kCuckooTableName(); } + + using TableFactory::NewTableReader; + Status NewTableReader( + const ReadOptions& ro, const TableReaderOptions& table_reader_options, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table, + bool prefetch_index_and_filter_in_cache = true) const override; + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const override; + + std::string GetPrintableOptions() const override; + + private: + CuckooTableOptions table_options_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_reader.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_reader.cc new file mode 100644 index 0000000000..d647619620 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_reader.cc @@ -0,0 +1,412 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/cuckoo/cuckoo_table_reader.h" + +#include +#include +#include +#include +#include + +#include "memory/arena.h" +#include "options/cf_options.h" +#include "rocksdb/iterator.h" +#include "rocksdb/table.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); +const uint32_t kInvalidIndex = std::numeric_limits::max(); +} // namespace + +extern const uint64_t kCuckooTableMagicNumber; + +CuckooTableReader::CuckooTableReader( + const ImmutableOptions& ioptions, + std::unique_ptr&& file, uint64_t file_size, + const Comparator* comparator, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) + : file_(std::move(file)), + is_last_level_(false), + identity_as_first_hash_(false), + use_module_hash_(false), + num_hash_func_(0), + unused_key_(""), + key_length_(0), + user_key_length_(0), + value_length_(0), + bucket_length_(0), + cuckoo_block_size_(0), + cuckoo_block_bytes_minus_one_(0), + table_size_(0), + ucomp_(comparator), + get_slice_hash_(get_slice_hash) { + if (!ioptions.allow_mmap_reads) { + status_ = Status::InvalidArgument("File is not mmaped"); + return; + } + { + std::unique_ptr props; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status_ = + ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, + ioptions, read_options, &props); + if (!status_.ok()) { + return; + } + table_props_ = std::move(props); + } + auto& user_props = table_props_->user_collected_properties; + auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc); + if (hash_funs == user_props.end()) { + status_ = Status::Corruption("Number of hash functions not found"); + return; + } + num_hash_func_ = *reinterpret_cast(hash_funs->second.data()); + auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey); + if (unused_key == user_props.end()) { + status_ = Status::Corruption("Empty bucket value not found"); + return; + } + unused_key_ = unused_key->second; + + key_length_ = static_cast(table_props_->fixed_key_len); + auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength); + if (user_key_len == user_props.end()) { + status_ = Status::Corruption("User key length not found"); + return; + } + user_key_length_ = + *reinterpret_cast(user_key_len->second.data()); + + auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength); + if (value_length == user_props.end()) { + status_ = Status::Corruption("Value length not found"); + return; + } + value_length_ = + *reinterpret_cast(value_length->second.data()); + bucket_length_ = key_length_ + value_length_; + + auto hash_table_size = + user_props.find(CuckooTablePropertyNames::kHashTableSize); + if (hash_table_size == user_props.end()) { + status_ = Status::Corruption("Hash table size not found"); + return; + } + table_size_ = + *reinterpret_cast(hash_table_size->second.data()); + + auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); + if (is_last_level == user_props.end()) { + status_ = Status::Corruption("Is last level not found"); + return; + } + is_last_level_ = *reinterpret_cast(is_last_level->second.data()); + + auto identity_as_first_hash = + 
user_props.find(CuckooTablePropertyNames::kIdentityAsFirstHash);
+  if (identity_as_first_hash == user_props.end()) {
+    status_ = Status::Corruption("identity as first hash not found");
+    return;
+  }
+  identity_as_first_hash_ =
+      *reinterpret_cast<const bool*>(identity_as_first_hash->second.data());
+
+  auto use_module_hash =
+      user_props.find(CuckooTablePropertyNames::kUseModuleHash);
+  if (use_module_hash == user_props.end()) {
+    status_ = Status::Corruption("hash type is not found");
+    return;
+  }
+  use_module_hash_ =
+      *reinterpret_cast<const bool*>(use_module_hash->second.data());
+  auto cuckoo_block_size =
+      user_props.find(CuckooTablePropertyNames::kCuckooBlockSize);
+  if (cuckoo_block_size == user_props.end()) {
+    status_ = Status::Corruption("Cuckoo block size not found");
+    return;
+  }
+  cuckoo_block_size_ =
+      *reinterpret_cast<const uint32_t*>(cuckoo_block_size->second.data());
+  cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
+  // TODO: rate limit reads of whole cuckoo tables.
+  status_ =
+      file_->Read(IOOptions(), 0, static_cast<size_t>(file_size), &file_data_,
+                  nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+}
+
+Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/,
+                              const Slice& key, GetContext* get_context,
+                              const SliceTransform* /* prefix_extractor */,
+                              bool /*skip_filters*/) {
+  assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
+  Slice user_key = ExtractUserKey(key);
+  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
+    uint64_t offset =
+        bucket_length_ * CuckooHash(user_key, hash_cnt, use_module_hash_,
+                                    table_size_, identity_as_first_hash_,
+                                    get_slice_hash_);
+    const char* bucket = &file_data_.data()[offset];
+    for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+         ++block_idx, bucket += bucket_length_) {
+      if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()),
+                        Slice(bucket, user_key.size()))) {
+        return Status::OK();
+      }
+      // Here, we compare only the user key part as we support only one entry
+      // per user key and we don't support snapshot.
+      if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) {
+        Slice value(bucket + key_length_, value_length_);
+        if (is_last_level_) {
+          // Sequence number is not stored at the last level, so we will use
+          // kMaxSequenceNumber since it is unknown. This could cause some
+          // transactions to fail to lock a key due to known sequence number.
+          // However, it is not expected for anyone to use a CuckooTable in a
+          // TransactionDB.
+          get_context->SaveValue(value, kMaxSequenceNumber);
+        } else {
+          Slice full_key(bucket, key_length_);
+          ParsedInternalKey found_ikey;
+          Status s = ParseInternalKey(full_key, &found_ikey,
+                                      false /* log_err_key */);  // TODO
+          if (!s.ok()) return s;
+          bool dont_care __attribute__((__unused__));
+          get_context->SaveValue(found_ikey, value, &dont_care);
+        }
+        // We don't support merge operations. So, we return here.
+        return Status::OK();
+      }
+    }
+  }
+  return Status::OK();
+}
+
+void CuckooTableReader::Prepare(const Slice& key) {
+  // Prefetch the first Cuckoo Block.
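+  // The loop below rounds the bucket's address down to a cache-line boundary
+  // and touches every line the cuckoo block can span. For example, with
+  // CACHE_LINE_SIZE = 64 the mask clears the low six bits, so a block
+  // covering bytes 100..200 issues prefetches for the lines starting at 64,
+  // 128, and 192.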
+  Slice user_key = ExtractUserKey(key);
+  uint64_t addr =
+      reinterpret_cast<uint64_t>(file_data_.data()) +
+      bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
+                                  identity_as_first_hash_, nullptr);
+  uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
+  for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
+    PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
+  }
+}
+
+class CuckooTableIterator : public InternalIterator {
+ public:
+  explicit CuckooTableIterator(CuckooTableReader* reader);
+  // No copying allowed
+  CuckooTableIterator(const CuckooTableIterator&) = delete;
+  void operator=(const CuckooTableIterator&) = delete;
+  ~CuckooTableIterator() override {}
+  bool Valid() const override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Seek(const Slice& target) override;
+  void SeekForPrev(const Slice& target) override;
+  void Next() override;
+  void Prev() override;
+  Slice key() const override;
+  Slice value() const override;
+  Status status() const override { return Status::OK(); }
+  void InitIfNeeded();
+
+ private:
+  struct BucketComparator {
+    BucketComparator(const Slice& file_data, const Comparator* ucomp,
+                     uint32_t bucket_len, uint32_t user_key_len,
+                     const Slice& target = Slice())
+        : file_data_(file_data),
+          ucomp_(ucomp),
+          bucket_len_(bucket_len),
+          user_key_len_(user_key_len),
+          target_(target) {}
+    bool operator()(const uint32_t first, const uint32_t second) const {
+      const char* first_bucket = (first == kInvalidIndex)
+                                     ? target_.data()
+                                     : &file_data_.data()[first * bucket_len_];
+      const char* second_bucket =
+          (second == kInvalidIndex) ? target_.data()
+                                    : &file_data_.data()[second * bucket_len_];
+      return ucomp_->Compare(Slice(first_bucket, user_key_len_),
+                             Slice(second_bucket, user_key_len_)) < 0;
+    }
+
+   private:
+    const Slice file_data_;
+    const Comparator* ucomp_;
+    const uint32_t bucket_len_;
+    const uint32_t user_key_len_;
+    const Slice target_;
+  };
+
+  const BucketComparator bucket_comparator_;
+  void PrepareKVAtCurrIdx();
+  CuckooTableReader* reader_;
+  bool initialized_;
+  // Contains a map of keys to bucket_id sorted in key order.
+  std::vector<uint32_t> sorted_bucket_ids_;
+  // We assume that the number of items can be stored in uint32 (4 Billion).
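+  // The sorted view is built lazily by InitIfNeeded() on the first seek:
+  // buckets are laid out in hash order on disk, so ordered iteration first
+  // requires a full scan of the table plus an O(n log n) sort of the
+  // occupied bucket ids.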
+  uint32_t curr_key_idx_;
+  Slice curr_value_;
+  IterKey curr_key_;
+};
+
+CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
+    : bucket_comparator_(reader->file_data_, reader->ucomp_,
+                         reader->bucket_length_, reader->user_key_length_),
+      reader_(reader),
+      initialized_(false),
+      curr_key_idx_(kInvalidIndex) {
+  sorted_bucket_ids_.clear();
+  curr_value_.clear();
+  curr_key_.Clear();
+}
+
+void CuckooTableIterator::InitIfNeeded() {
+  if (initialized_) {
+    return;
+  }
+  sorted_bucket_ids_.reserve(
+      static_cast<size_t>(reader_->GetTableProperties()->num_entries));
+  uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
+  assert(num_buckets < kInvalidIndex);
+  const char* bucket = reader_->file_data_.data();
+  for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
+    if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) {
+      sorted_bucket_ids_.push_back(bucket_id);
+    }
+    bucket += reader_->bucket_length_;
+  }
+  assert(sorted_bucket_ids_.size() ==
+         reader_->GetTableProperties()->num_entries);
+  std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
+            bucket_comparator_);
+  curr_key_idx_ = kInvalidIndex;
+  initialized_ = true;
+}
+
+void CuckooTableIterator::SeekToFirst() {
+  InitIfNeeded();
+  curr_key_idx_ = 0;
+  PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::SeekToLast() {
+  InitIfNeeded();
+  curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1;
+  PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::Seek(const Slice& target) {
+  InitIfNeeded();
+  const BucketComparator seek_comparator(
+      reader_->file_data_, reader_->ucomp_, reader_->bucket_length_,
+      reader_->user_key_length_, ExtractUserKey(target));
+  auto seek_it =
+      std::lower_bound(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
+                       kInvalidIndex, seek_comparator);
+  curr_key_idx_ =
+      static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
+  PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) {
+  // Not supported
+  assert(false);
+}
+
+bool CuckooTableIterator::Valid() const {
+  return curr_key_idx_ < sorted_bucket_ids_.size();
+}
+
+void CuckooTableIterator::PrepareKVAtCurrIdx() {
+  if (!Valid()) {
+    curr_value_.clear();
+    curr_key_.Clear();
+    return;
+  }
+  uint32_t id = sorted_bucket_ids_[curr_key_idx_];
+  const char* offset =
+      reader_->file_data_.data() + id * reader_->bucket_length_;
+  if (reader_->is_last_level_) {
+    // Always return internal key.
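+    // Last-level files store bare user keys without sequence/type bytes, so
+    // a synthetic internal key (sequence 0, kTypeValue) is materialized here
+    // to preserve the iterator contract of always yielding internal keys.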
+ curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), 0, + kTypeValue); + } else { + curr_key_.SetInternalKey(Slice(offset, reader_->key_length_)); + } + curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_); +} + +void CuckooTableIterator::Next() { + if (!Valid()) { + curr_value_.clear(); + curr_key_.Clear(); + return; + } + ++curr_key_idx_; + PrepareKVAtCurrIdx(); +} + +void CuckooTableIterator::Prev() { + if (curr_key_idx_ == 0) { + curr_key_idx_ = static_cast(sorted_bucket_ids_.size()); + } + if (!Valid()) { + curr_value_.clear(); + curr_key_.Clear(); + return; + } + --curr_key_idx_; + PrepareKVAtCurrIdx(); +} + +Slice CuckooTableIterator::key() const { + assert(Valid()); + return curr_key_.GetInternalKey(); +} + +Slice CuckooTableIterator::value() const { + assert(Valid()); + return curr_value_; +} + +InternalIterator* CuckooTableReader::NewIterator( + const ReadOptions& /*read_options*/, + const SliceTransform* /* prefix_extractor */, Arena* arena, + bool /*skip_filters*/, TableReaderCaller /*caller*/, + size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) { + if (!status().ok()) { + return NewErrorInternalIterator( + Status::Corruption("CuckooTableReader status is not okay."), arena); + } + CuckooTableIterator* iter; + if (arena == nullptr) { + iter = new CuckooTableIterator(this); + } else { + auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator)); + iter = new (iter_mem) CuckooTableIterator(this); + } + return iter; +} + +size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; } + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_reader.h new file mode 100644 index 0000000000..d17011ed83 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/cuckoo/cuckoo_table_reader.h @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
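+
+// Note that CuckooTableReader only works with mmap-based reads; construction
+// fails with InvalidArgument otherwise. The relevant knob (a sketch;
+// `options` is a placeholder Options instance):
+//
+//   options.allow_mmap_reads = true;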
+ +#pragma once +#include +#include +#include +#include + +#include "file/random_access_file_reader.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "table/table_reader.h" + +namespace ROCKSDB_NAMESPACE { + +class Arena; +class TableReader; +struct ImmutableOptions; + +class CuckooTableReader : public TableReader { + public: + CuckooTableReader(const ImmutableOptions& ioptions, + std::unique_ptr&& file, + uint64_t file_size, const Comparator* user_comparator, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, + uint64_t)); + ~CuckooTableReader() {} + + std::shared_ptr GetTableProperties() const override { + return table_props_; + } + + Status status() const { return status_; } + + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + // Returns a new iterator over table contents + // compaction_readahead_size: its value will only be used if for_compaction = + // true + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; + void Prepare(const Slice& target) override; + + // Report an approximation of how much memory has been used. + size_t ApproximateMemoryUsage() const override; + + // Following methods are not implemented for Cuckoo Table Reader + uint64_t ApproximateOffsetOf(const ReadOptions& /*read_options*/, + const Slice& /*key*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + uint64_t ApproximateSize(const ReadOptions& /* read_options */, + const Slice& /*start*/, const Slice& /*end*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + void SetupForCompaction() override {} + // End of methods not implemented. + + private: + friend class CuckooTableIterator; + void LoadAllKeys(std::vector>* key_to_bucket_id); + std::unique_ptr file_; + Slice file_data_; + bool is_last_level_; + bool identity_as_first_hash_; + bool use_module_hash_; + std::shared_ptr table_props_; + Status status_; + uint32_t num_hash_func_; + std::string unused_key_; + uint32_t key_length_; + uint32_t user_key_length_; + uint32_t value_length_; + uint32_t bucket_length_; + uint32_t cuckoo_block_size_; + uint32_t cuckoo_block_bytes_minus_one_; + uint64_t table_size_; + const Comparator* ucomp_; + uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, + uint64_t max_num_buckets); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/format.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/format.cc new file mode 100644 index 0000000000..dc077bd452 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/format.cc @@ -0,0 +1,573 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
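+
+// A BlockHandle serializes as two varint64s: offset, then size. As a worked
+// example, offset = 300 and size = 10 encode to three bytes in total,
+// 0xAC 0x02 for 300 followed by 0x0A for 10; the worst case is
+// 2 * 10 = 20 bytes, i.e. BlockHandle::kMaxEncodedLength.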
+ +#include "table/format.h" + +#include +#include + +#include "block_fetcher.h" +#include "file/random_access_file_reader.h" +#include "memory/memory_allocator_impl.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics_impl.h" +#include "options/options_helper.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/persistent_cache_helper.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/hash.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/xxhash.h" + +namespace ROCKSDB_NAMESPACE { + +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kBlockBasedTableMagicNumber; + +extern const uint64_t kLegacyPlainTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; +const char* kHostnameForDbHostId = "__hostname__"; + +bool ShouldReportDetailedTime(Env* env, Statistics* stats) { + return env != nullptr && stats != nullptr && + stats->get_stats_level() > kExceptDetailedTimers; +} + +void BlockHandle::EncodeTo(std::string* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~uint64_t{0}); + assert(size_ != ~uint64_t{0}); + PutVarint64Varint64(dst, offset_, size_); +} + +char* BlockHandle::EncodeTo(char* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~uint64_t{0}); + assert(size_ != ~uint64_t{0}); + char* cur = EncodeVarint64(dst, offset_); + cur = EncodeVarint64(cur, size_); + return cur; +} + +Status BlockHandle::DecodeFrom(Slice* input) { + if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) { + return Status::OK(); + } else { + // reset in case failure after partially decoding + offset_ = 0; + size_ = 0; + return Status::Corruption("bad block handle"); + } +} + +Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) { + if (GetVarint64(input, &size_)) { + offset_ = _offset; + return Status::OK(); + } else { + // reset in case failure after partially decoding + offset_ = 0; + size_ = 0; + return Status::Corruption("bad block handle"); + } +} + +// Return a string that contains the copy of handle. 
+std::string BlockHandle::ToString(bool hex) const { + std::string handle_str; + EncodeTo(&handle_str); + if (hex) { + return Slice(handle_str).ToString(true); + } else { + return handle_str; + } +} + +const BlockHandle BlockHandle::kNullBlockHandle(0, 0); + +void IndexValue::EncodeTo(std::string* dst, bool have_first_key, + const BlockHandle* previous_handle) const { + if (previous_handle) { + // WART: this is specific to Block-based table + assert(handle.offset() == previous_handle->offset() + + previous_handle->size() + + BlockBasedTable::kBlockTrailerSize); + PutVarsignedint64(dst, handle.size() - previous_handle->size()); + } else { + handle.EncodeTo(dst); + } + assert(dst->size() != 0); + + if (have_first_key) { + PutLengthPrefixedSlice(dst, first_internal_key); + } +} + +Status IndexValue::DecodeFrom(Slice* input, bool have_first_key, + const BlockHandle* previous_handle) { + if (previous_handle) { + int64_t delta; + if (!GetVarsignedint64(input, &delta)) { + return Status::Corruption("bad delta-encoded index value"); + } + // WART: this is specific to Block-based table + handle = BlockHandle(previous_handle->offset() + previous_handle->size() + + BlockBasedTable::kBlockTrailerSize, + previous_handle->size() + delta); + } else { + Status s = handle.DecodeFrom(input); + if (!s.ok()) { + return s; + } + } + + if (!have_first_key) { + first_internal_key = Slice(); + } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) { + return Status::Corruption("bad first key in block info"); + } + + return Status::OK(); +} + +std::string IndexValue::ToString(bool hex, bool have_first_key) const { + std::string s; + EncodeTo(&s, have_first_key, nullptr); + if (hex) { + return Slice(s).ToString(true); + } else { + return s; + } +} + +namespace { +inline bool IsLegacyFooterFormat(uint64_t magic_number) { + return magic_number == kLegacyBlockBasedTableMagicNumber || + magic_number == kLegacyPlainTableMagicNumber; +} +inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { + if (magic_number == kLegacyBlockBasedTableMagicNumber) { + return kBlockBasedTableMagicNumber; + } + if (magic_number == kLegacyPlainTableMagicNumber) { + return kPlainTableMagicNumber; + } + assert(false); + return magic_number; +} +inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) { + if (magic_number == kBlockBasedTableMagicNumber) { + return kLegacyBlockBasedTableMagicNumber; + } + if (magic_number == kPlainTableMagicNumber) { + return kLegacyPlainTableMagicNumber; + } + assert(false); + return magic_number; +} +inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) { + if (magic_number == kBlockBasedTableMagicNumber || + magic_number == kLegacyBlockBasedTableMagicNumber) { + return static_cast(BlockBasedTable::kBlockTrailerSize); + } else { + return 0; + } +} + +// Footer format, in three parts: +// * Part1 +// -> format_version == 0 (inferred from legacy magic number) +// (0 bytes) +// -> format_version >= 1 +// checksum type (char, 1 byte) +// * Part2 +// metaindex handle (varint64 offset, varint64 size) +// index handle (varint64 offset, varint64 size) +// for part2 size = 2 * BlockHandle::kMaxEncodedLength = 40 +// * Part3 +// -> format_version == 0 (inferred from legacy magic number) +// legacy magic number (8 bytes) +// -> format_version >= 1 (inferred from NOT legacy magic number) +// format_version (uint32LE, 4 bytes), also called "footer version" +// newer magic number (8 bytes) + +constexpr size_t kFooterPart2Size = 2 * BlockHandle::kMaxEncodedLength; +} // 
namespace + +void FooterBuilder::Build(uint64_t magic_number, uint32_t format_version, + uint64_t footer_offset, ChecksumType checksum_type, + const BlockHandle& metaindex_handle, + const BlockHandle& index_handle) { + (void)footer_offset; // Future use + + assert(magic_number != Footer::kNullTableMagicNumber); + assert(IsSupportedFormatVersion(format_version)); + + char* part2; + char* part3; + if (format_version > 0) { + slice_ = Slice(data_.data(), Footer::kNewVersionsEncodedLength); + // Generate parts 1 and 3 + char* cur = data_.data(); + // Part 1 + *(cur++) = checksum_type; + // Part 2 + part2 = cur; + // Skip over part 2 for now + cur += kFooterPart2Size; + // Part 3 + part3 = cur; + EncodeFixed32(cur, format_version); + cur += 4; + EncodeFixed64(cur, magic_number); + assert(cur + 8 == slice_.data() + slice_.size()); + } else { + slice_ = Slice(data_.data(), Footer::kVersion0EncodedLength); + // Legacy SST files use kCRC32c checksum but it's not stored in footer. + assert(checksum_type == kNoChecksum || checksum_type == kCRC32c); + // Generate part 3 (part 1 empty, skip part 2 for now) + part2 = data_.data(); + part3 = part2 + kFooterPart2Size; + char* cur = part3; + // Use legacy magic numbers to indicate format_version=0, for + // compatibility. No other cases should use format_version=0. + EncodeFixed64(cur, DownconvertToLegacyFooterFormat(magic_number)); + assert(cur + 8 == slice_.data() + slice_.size()); + } + + { + char* cur = part2; + cur = metaindex_handle.EncodeTo(cur); + cur = index_handle.EncodeTo(cur); + // Zero pad remainder + std::fill(cur, part3, char{0}); + } +} + +Status Footer::DecodeFrom(Slice input, uint64_t input_offset, + uint64_t enforce_table_magic_number) { + (void)input_offset; // Future use + + // Only decode to unused Footer + assert(table_magic_number_ == kNullTableMagicNumber); + assert(input != nullptr); + assert(input.size() >= kMinEncodedLength); + + const char* magic_ptr = input.data() + input.size() - kMagicNumberLengthByte; + uint64_t magic = DecodeFixed64(magic_ptr); + + // We check for legacy formats here and silently upconvert them + bool legacy = IsLegacyFooterFormat(magic); + if (legacy) { + magic = UpconvertLegacyFooterFormat(magic); + } + if (enforce_table_magic_number != 0 && enforce_table_magic_number != magic) { + return Status::Corruption("Bad table magic number: expected " + + std::to_string(enforce_table_magic_number) + + ", found " + std::to_string(magic)); + } + table_magic_number_ = magic; + block_trailer_size_ = BlockTrailerSizeForMagicNumber(magic); + + // Parse Part3 + if (legacy) { + // The size is already asserted to be at least kMinEncodedLength + // at the beginning of the function + input.remove_prefix(input.size() - kVersion0EncodedLength); + format_version_ = 0 /* legacy */; + checksum_type_ = kCRC32c; + } else { + const char* part3_ptr = magic_ptr - 4; + format_version_ = DecodeFixed32(part3_ptr); + if (!IsSupportedFormatVersion(format_version_)) { + return Status::Corruption("Corrupt or unsupported format_version: " + + std::to_string(format_version_)); + } + // All known format versions >= 1 occupy exactly this many bytes. 
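+    // Concretely: 1 byte of checksum type + 40 bytes for the two padded
+    // block handles + 4 bytes of format_version + 8 bytes of magic number
+    // = 53 bytes (kNewVersionsEncodedLength).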
+ if (input.size() < kNewVersionsEncodedLength) { + return Status::Corruption("Input is too short to be an SST file"); + } + uint64_t adjustment = input.size() - kNewVersionsEncodedLength; + input.remove_prefix(adjustment); + + // Parse Part1 + char chksum = input.data()[0]; + checksum_type_ = lossless_cast(chksum); + if (!IsSupportedChecksumType(checksum_type())) { + return Status::Corruption("Corrupt or unsupported checksum type: " + + std::to_string(lossless_cast(chksum))); + } + // Consume checksum type field + input.remove_prefix(1); + } + + // Parse Part2 + Status result = metaindex_handle_.DecodeFrom(&input); + if (result.ok()) { + result = index_handle_.DecodeFrom(&input); + } + return result; + // Padding in part2 is ignored +} + +std::string Footer::ToString() const { + std::string result; + result.reserve(1024); + + bool legacy = IsLegacyFooterFormat(table_magic_number_); + if (legacy) { + result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); + result.append("index handle: " + index_handle_.ToString() + "\n "); + result.append("table_magic_number: " + std::to_string(table_magic_number_) + + "\n "); + } else { + result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); + result.append("index handle: " + index_handle_.ToString() + "\n "); + result.append("table_magic_number: " + std::to_string(table_magic_number_) + + "\n "); + result.append("format version: " + std::to_string(format_version_) + + "\n "); + } + return result; +} + +Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, + FileSystem& fs, FilePrefetchBuffer* prefetch_buffer, + uint64_t file_size, Footer* footer, + uint64_t enforce_table_magic_number) { + if (file_size < Footer::kMinEncodedLength) { + return Status::Corruption("file is too short (" + + std::to_string(file_size) + + " bytes) to be an " + "sstable: " + + file->file_name()); + } + + std::string footer_buf; + AlignedBuf internal_buf; + Slice footer_input; + uint64_t read_offset = (file_size > Footer::kMaxEncodedLength) + ? file_size - Footer::kMaxEncodedLength + : 0; + Status s; + // TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now, + // there is no readahead for point lookups, so TryReadFromCache will fail if + // the required data is not in the prefetch buffer. Once deadline is enabled + // for iterator, TryReadFromCache might do a readahead. Revisit to see if we + // need to pass a timeout at that point + // TODO: rate limit footer reads. + if (prefetch_buffer == nullptr || + !prefetch_buffer->TryReadFromCache( + opts, file, read_offset, Footer::kMaxEncodedLength, &footer_input, + nullptr, opts.rate_limiter_priority)) { + if (file->use_direct_io()) { + s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, + &footer_input, nullptr, &internal_buf, + opts.rate_limiter_priority); + } else { + footer_buf.reserve(Footer::kMaxEncodedLength); + s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, + &footer_input, &footer_buf[0], nullptr, + opts.rate_limiter_priority); + } + if (!s.ok()) return s; + } + + // Check that we actually read the whole footer from the file. It may be + // that size isn't correct. + if (footer_input.size() < Footer::kMinEncodedLength) { + uint64_t size_on_disk = 0; + if (fs.GetFileSize(file->file_name(), IOOptions(), &size_on_disk, nullptr) + .ok()) { + // Similar to CheckConsistency message, but not completely sure the + // expected size always came from manifest. 
+ return Status::Corruption("Sst file size mismatch: " + file->file_name() + + ". Expected " + std::to_string(file_size) + + ", actual size " + + std::to_string(size_on_disk) + "\n"); + } else { + return Status::Corruption( + "Missing SST footer data in file " + file->file_name() + + " File too short? Expected size: " + std::to_string(file_size)); + } + } + + s = footer->DecodeFrom(footer_input, read_offset, enforce_table_magic_number); + if (!s.ok()) { + s = Status::CopyAppendMessage(s, " in ", file->file_name()); + return s; + } + return Status::OK(); +} + +namespace { +// Custom handling for the last byte of a block, to avoid invoking streaming +// API to get an effective block checksum. This function is its own inverse +// because it uses xor. +inline uint32_t ModifyChecksumForLastByte(uint32_t checksum, char last_byte) { + // This strategy bears some resemblance to extending a CRC checksum by one + // more byte, except we don't need to re-mix the input checksum as long as + // we do this step only once (per checksum). + const uint32_t kRandomPrime = 0x6b9083d9; + return checksum ^ lossless_cast(last_byte) * kRandomPrime; +} +} // namespace + +uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data, + size_t data_size) { + switch (type) { + case kCRC32c: + return crc32c::Mask(crc32c::Value(data, data_size)); + case kxxHash: + return XXH32(data, data_size, /*seed*/ 0); + case kxxHash64: + return Lower32of64(XXH64(data, data_size, /*seed*/ 0)); + case kXXH3: { + if (data_size == 0) { + // Special case because of special handling for last byte, not + // present in this case. Can be any value different from other + // small input size checksums. + return 0; + } else { + // See corresponding code in ComputeBuiltinChecksumWithLastByte + uint32_t v = Lower32of64(XXH3_64bits(data, data_size - 1)); + return ModifyChecksumForLastByte(v, data[data_size - 1]); + } + } + default: // including kNoChecksum + return 0; + } +} + +uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data, + size_t data_size, char last_byte) { + switch (type) { + case kCRC32c: { + uint32_t crc = crc32c::Value(data, data_size); + // Extend to cover last byte (compression type) + crc = crc32c::Extend(crc, &last_byte, 1); + return crc32c::Mask(crc); + } + case kxxHash: { + XXH32_state_t* const state = XXH32_createState(); + XXH32_reset(state, 0); + XXH32_update(state, data, data_size); + // Extend to cover last byte (compression type) + XXH32_update(state, &last_byte, 1); + uint32_t v = XXH32_digest(state); + XXH32_freeState(state); + return v; + } + case kxxHash64: { + XXH64_state_t* const state = XXH64_createState(); + XXH64_reset(state, 0); + XXH64_update(state, data, data_size); + // Extend to cover last byte (compression type) + XXH64_update(state, &last_byte, 1); + uint32_t v = Lower32of64(XXH64_digest(state)); + XXH64_freeState(state); + return v; + } + case kXXH3: { + // XXH3 is a complicated hash function that is extremely fast on + // contiguous input, but that makes its streaming support rather + // complex. It is worth custom handling of the last byte (`type`) + // in order to avoid allocating a large state object and bringing + // that code complexity into CPU working set. 
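+      // Because ModifyChecksumForLastByte only xors in a mix of the last
+      // byte, applying it twice cancels out, which is what preserves the
+      // documented identity for data_size >= 1:
+      //   ComputeBuiltinChecksum(t, data, n) ==
+      //   ComputeBuiltinChecksumWithLastByte(t, data, n - 1, data[n - 1])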
+ uint32_t v = Lower32of64(XXH3_64bits(data, data_size)); + return ModifyChecksumForLastByte(v, last_byte); + } + default: // including kNoChecksum + return 0; + } +} + +Status UncompressBlockData(const UncompressionInfo& uncompression_info, + const char* data, size_t size, + BlockContents* out_contents, uint32_t format_version, + const ImmutableOptions& ioptions, + MemoryAllocator* allocator) { + Status ret = Status::OK(); + + assert(uncompression_info.type() != kNoCompression && + "Invalid compression type"); + + StopWatchNano timer(ioptions.clock, + ShouldReportDetailedTime(ioptions.env, ioptions.stats)); + size_t uncompressed_size = 0; + CacheAllocationPtr ubuf = + UncompressData(uncompression_info, data, size, &uncompressed_size, + GetCompressFormatForVersion(format_version), allocator); + if (!ubuf) { + if (!CompressionTypeSupported(uncompression_info.type())) { + return Status::NotSupported( + "Unsupported compression method for this build", + CompressionTypeToString(uncompression_info.type())); + } else { + return Status::Corruption( + "Corrupted compressed block contents", + CompressionTypeToString(uncompression_info.type())); + } + } + + *out_contents = BlockContents(std::move(ubuf), uncompressed_size); + + if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) { + RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); + } + RecordTick(ioptions.stats, BYTES_DECOMPRESSED_FROM, size); + RecordTick(ioptions.stats, BYTES_DECOMPRESSED_TO, out_contents->data.size()); + RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED); + + TEST_SYNC_POINT_CALLBACK("UncompressBlockData:TamperWithReturnValue", + static_cast(&ret)); + TEST_SYNC_POINT_CALLBACK( + "UncompressBlockData:" + "TamperWithDecompressionOutput", + static_cast(out_contents)); + + return ret; +} + +Status UncompressSerializedBlock(const UncompressionInfo& uncompression_info, + const char* data, size_t size, + BlockContents* out_contents, + uint32_t format_version, + const ImmutableOptions& ioptions, + MemoryAllocator* allocator) { + assert(data[size] != kNoCompression); + assert(data[size] == static_cast(uncompression_info.type())); + return UncompressBlockData(uncompression_info, data, size, out_contents, + format_version, ioptions, allocator); +} + +// Replace the contents of db_host_id with the actual hostname, if db_host_id +// matches the keyword kHostnameForDbHostId +Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id) { + assert(db_host_id); + if (*db_host_id == kHostnameForDbHostId) { + Status s = env->GetHostNameString(db_host_id); + if (!s.ok()) { + db_host_id->clear(); + } + return s; + } + + return Status::OK(); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/format.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/format.h new file mode 100644 index 0000000000..c375165bf6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/format.h @@ -0,0 +1,378 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once + +#include +#include +#include + +#include "file/file_prefetch_buffer.h" +#include "file/random_access_file_reader.h" +#include "memory/memory_allocator_impl.h" +#include "options/cf_options.h" +#include "port/malloc.h" +#include "port/port.h" // noexcept +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +class RandomAccessFile; +struct ReadOptions; + +bool ShouldReportDetailedTime(Env* env, Statistics* stats); + +// the length of the magic number in bytes. +constexpr uint32_t kMagicNumberLengthByte = 8; + +// BlockHandle is a pointer to the extent of a file that stores a data +// block or a meta block. +class BlockHandle { + public: + // Creates a block handle with special values indicating "uninitialized," + // distinct from the "null" block handle. + BlockHandle(); + BlockHandle(uint64_t offset, uint64_t size); + + // The offset of the block in the file. + uint64_t offset() const { return offset_; } + void set_offset(uint64_t _offset) { offset_ = _offset; } + + // The size of the stored block + uint64_t size() const { return size_; } + void set_size(uint64_t _size) { size_ = _size; } + + void EncodeTo(std::string* dst) const; + char* EncodeTo(char* dst) const; + Status DecodeFrom(Slice* input); + Status DecodeSizeFrom(uint64_t offset, Slice* input); + + // Return a string that contains the copy of handle. + std::string ToString(bool hex = true) const; + + // if the block handle's offset and size are both "0", we will view it + // as a null block handle that points to no where. + bool IsNull() const { return offset_ == 0 && size_ == 0; } + + static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; } + + // Maximum encoding length of a BlockHandle + static constexpr uint32_t kMaxEncodedLength = 2 * kMaxVarint64Length; + + inline bool operator==(const BlockHandle& rhs) const { + return offset_ == rhs.offset_ && size_ == rhs.size_; + } + inline bool operator!=(const BlockHandle& rhs) const { + return !(*this == rhs); + } + + private: + uint64_t offset_; + uint64_t size_; + + static const BlockHandle kNullBlockHandle; +}; + +// Value in block-based table file index. +// +// The index entry for block n is: y -> h, [x], +// where: y is some key between the last key of block n (inclusive) and the +// first key of block n+1 (exclusive); h is BlockHandle pointing to block n; +// x, if present, is the first key of block n (unshortened). +// This struct represents the "h, [x]" part. +struct IndexValue { + BlockHandle handle; + // Empty means unknown. + Slice first_internal_key; + + IndexValue() = default; + IndexValue(BlockHandle _handle, Slice _first_internal_key) + : handle(_handle), first_internal_key(_first_internal_key) {} + + // have_first_key indicates whether the `first_internal_key` is used. + // If previous_handle is not null, delta encoding is used; + // in this case, the two handles must point to consecutive blocks: + // handle.offset() == + // previous_handle->offset() + previous_handle->size() + kBlockTrailerSize + void EncodeTo(std::string* dst, bool have_first_key, + const BlockHandle* previous_handle) const; + Status DecodeFrom(Slice* input, bool have_first_key, + const BlockHandle* previous_handle); + + std::string ToString(bool hex, bool have_first_key) const; +}; + +inline uint32_t GetCompressFormatForVersion(uint32_t format_version) { + // As of format_version 2, we encode compressed block with + // compress_format_version == 2. 
+
+constexpr uint32_t kLatestFormatVersion = 5;
+
+inline bool IsSupportedFormatVersion(uint32_t version) {
+  return version <= kLatestFormatVersion;
+}
+
+// Footer encapsulates the fixed information stored at the tail end of every
+// SST file. In general, it should only include things that cannot go
+// elsewhere under the metaindex block. For example, checksum_type is
+// required for verifying metaindex block checksum (when applicable), but
+// index block handle can easily go in metaindex block (possible future).
+// See also FooterBuilder below.
+class Footer {
+ public:
+  // Create empty. Populate using DecodeFrom.
+  Footer() {}
+
+  // Deserialize a footer (populate fields) from `input` and check for various
+  // corruptions. `input_offset` is the offset within the target file of
+  // `input` buffer (future use).
+  // If enforce_table_magic_number != 0, will return corruption if table magic
+  // number is not equal to enforce_table_magic_number.
+  Status DecodeFrom(Slice input, uint64_t input_offset,
+                    uint64_t enforce_table_magic_number = 0);
+
+  // Table magic number identifies the file as a RocksDB SST file and which
+  // kind of SST format is in use.
+  uint64_t table_magic_number() const { return table_magic_number_; }
+
+  // A version (footer and more) within a kind of SST. (It would add more
+  // unnecessary complexity to separate footer versions and
+  // BBTO::format_version.)
+  uint32_t format_version() const { return format_version_; }
+
+  // Block handle for metaindex block.
+  const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
+
+  // Block handle for (top-level) index block.
+  const BlockHandle& index_handle() const { return index_handle_; }
+
+  // Checksum type used in the file.
+  ChecksumType checksum_type() const {
+    return static_cast<ChecksumType>(checksum_type_);
+  }
+
+  // Block trailer size used by file with this footer (e.g. 5 for block-based
+  // table and 0 for plain table). This is inferred from magic number so
+  // not in the serialized form.
+  inline size_t GetBlockTrailerSize() const { return block_trailer_size_; }
+
+  // Convert this object to a human readable form
+  std::string ToString() const;
+
+  // Encoded lengths of Footers. Bytes for serialized Footer will always be
+  // >= kMinEncodedLength and <= kMaxEncodedLength.
+  //
+  // Footer version 0 (legacy) will always occupy exactly this many bytes.
+  // It consists of two block handles, padding, and a magic number.
+  static constexpr uint32_t kVersion0EncodedLength =
+      2 * BlockHandle::kMaxEncodedLength + kMagicNumberLengthByte;
+  static constexpr uint32_t kMinEncodedLength = kVersion0EncodedLength;
+
+  // Footer of versions 1 and higher will always occupy exactly this many
+  // bytes. It originally consisted of the checksum type, two block handles,
+  // padding (to maximum handle encoding size), a format version number, and a
+  // magic number.
+  static constexpr uint32_t kNewVersionsEncodedLength =
+      1 + 2 * BlockHandle::kMaxEncodedLength + 4 + kMagicNumberLengthByte;
+  static constexpr uint32_t kMaxEncodedLength = kNewVersionsEncodedLength;
+
+  static constexpr uint64_t kNullTableMagicNumber = 0;
+
+  static constexpr uint32_t kInvalidFormatVersion = 0xffffffffU;
+
+ private:
+  static constexpr int kInvalidChecksumType =
+      (1 << (sizeof(ChecksumType) * 8)) | kNoChecksum;
+
+  uint64_t table_magic_number_ = kNullTableMagicNumber;
+  uint32_t format_version_ = kInvalidFormatVersion;
+  BlockHandle metaindex_handle_;
+  BlockHandle index_handle_;
+  int checksum_type_ = kInvalidChecksumType;
+  uint8_t block_trailer_size_ = 0;
+};
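Editorial note: the encoded-length constants above can be checked by hand. Assuming RocksDB's kMaxVarint64Length of 10 (so a BlockHandle encodes to at most 20 bytes) and the 8-byte magic number defined in this header:

    #include <cstdint>

    // Worked arithmetic for the footer size constants (values assumed from
    // this header and util/coding.h; illustration only).
    constexpr uint32_t kMaxVarint64Length = 10;
    constexpr uint32_t kMagicNumberLengthByte = 8;
    constexpr uint32_t kHandleMaxEncodedLength = 2 * kMaxVarint64Length;  // 20
    // Version 0: two block handles + magic number.
    static_assert(2 * kHandleMaxEncodedLength + kMagicNumberLengthByte == 48,
                  "two handles plus magic number");
    // Versions 1+: checksum type byte + two handles + 4-byte version + magic.
    static_assert(
        1 + 2 * kHandleMaxEncodedLength + 4 + kMagicNumberLengthByte == 53,
        "checksum byte, handles, version, magic");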
+
+// Builder for Footer
+class FooterBuilder {
+ public:
+  // Run the builder on its inputs. This is a single step with lots of
+  // parameters for efficiency (based on perf testing).
+  // * table_magic_number identifies the file as a RocksDB SST file and which
+  //   kind of SST format is in use.
+  // * format_version is a version for the footer and can also apply to other
+  //   aspects of the SST file (see BlockBasedTableOptions::format_version).
+  //   NOTE: To save complexity in the caller, when format_version == 0 and
+  //   there is a corresponding legacy magic number to the one specified, the
+  //   legacy magic number will be written for forward compatibility.
+  // * footer_offset is the file offset where the footer will be written
+  //   (for future use).
+  // * checksum_type is for formats using block checksums.
+  // * index_handle is optional for some kinds of SST files.
+  void Build(uint64_t table_magic_number, uint32_t format_version,
+             uint64_t footer_offset, ChecksumType checksum_type,
+             const BlockHandle& metaindex_handle,
+             const BlockHandle& index_handle = BlockHandle::NullBlockHandle());
+
+  // After Build(), get a Slice for the serialized Footer, backed by this
+  // FooterBuilder.
+  const Slice& GetSlice() const {
+    assert(slice_.size());
+    return slice_;
+  }
+
+ private:
+  Slice slice_;
+  std::array<char, Footer::kMaxEncodedLength> data_;
+};
+
+// Read the footer from file
+// If enforce_table_magic_number != 0, ReadFooterFromFile() will return
+// corruption if table_magic_number is not equal to enforce_table_magic_number
+Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
+                          FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
+                          uint64_t file_size, Footer* footer,
+                          uint64_t enforce_table_magic_number = 0);
+
+// Computes a checksum using the given ChecksumType. Sometimes we need to
+// include one more input byte logically at the end but not part of the main
+// data buffer. If data_size >= 1, then
+//   ComputeBuiltinChecksum(type, data, size)
+//   ==
+//   ComputeBuiltinChecksumWithLastByte(type, data, size - 1, data[size - 1])
+uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data,
+                                size_t size);
+uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type,
+                                            const char* data, size_t size,
+                                            char last_byte);
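Editorial note: the identity documented above can be demonstrated with a toy checksum in place of the real builtin ones (ToyChecksum is a stand-in; only the contract is being illustrated):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <string>

    // Toy rolling checksum (stand-in for the builtin checksums).
    uint32_t ToyChecksum(const char* data, size_t n) {
      uint32_t x = 0;
      for (size_t i = 0; i < n; ++i) {
        x = x * 131 + static_cast<unsigned char>(data[i]);
      }
      return x;
    }

    // Checksums `n` bytes plus one logical trailing byte, as described above.
    uint32_t ToyChecksumWithLastByte(const char* data, size_t n, char last) {
      return ToyChecksum(data, n) * 131 + static_cast<unsigned char>(last);
    }

    int main() {
      std::string s = "block-bytes";
      assert(ToyChecksum(s.data(), s.size()) ==
             ToyChecksumWithLastByte(s.data(), s.size() - 1, s.back()));
    }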
+
+// Represents the contents of a block read from an SST file. Depending on how
+// it's created, it may or may not own the actual block bytes. As an example,
+// BlockContents objects representing data read from mmapped files only point
+// into the mmapped region. Depending on context, it might be a serialized
+// (potentially compressed) block, including a trailer beyond `size`, or an
+// uncompressed block.
+//
+// Please try to use this terminology when dealing with blocks:
+// * "Serialized block" - bytes that go into storage. For block-based table
+// (usually the case) this includes the block trailer. Here the `size` does
+// not include the trailer, but other places in code might include the trailer
+// in the size.
+// * "Maybe compressed block" - like a serialized block, but without the
+// trailer (or no promise of including a trailer). Must be accompanied by a
+// CompressionType in some other variable or field.
+// * "Uncompressed block" - "payload" bytes that are either stored with no
+// compression, used as input to a compression function, or the result of a
+// decompression function.
+// * "Parsed block" - an in-memory form of a block in block cache, as it is
+// used by the table reader. Different C++ types are used depending on the
+// block type (see block_cache.h). Only trivially parsable block types
+// use BlockContents as the parsed form.
+//
+struct BlockContents {
+  // Points to block payload (without trailer)
+  Slice data;
+  CacheAllocationPtr allocation;
+
+#ifndef NDEBUG
+  // Whether there is a known trailer after what is pointed to by `data`.
+  // See BlockBasedTable::GetCompressionType.
+  bool has_trailer = false;
+#endif  // NDEBUG
+
+  BlockContents() {}
+
+  // Does not take ownership of the underlying data bytes.
+  BlockContents(const Slice& _data) : data(_data) {}
+
+  // Takes ownership of the underlying data bytes.
+  BlockContents(CacheAllocationPtr&& _data, size_t _size)
+      : data(_data.get(), _size), allocation(std::move(_data)) {}
+
+  // Takes ownership of the underlying data bytes.
+  BlockContents(std::unique_ptr<char[]>&& _data, size_t _size)
+      : data(_data.get(), _size) {
+    allocation.reset(_data.release());
+  }
+
+  // Returns whether the object has ownership of the underlying data bytes.
+  bool own_bytes() const { return allocation.get() != nullptr; }
+
+  // The additional memory space taken by the block data.
+  size_t usable_size() const {
+    if (allocation.get() != nullptr) {
+      auto allocator = allocation.get_deleter().allocator;
+      if (allocator) {
+        return allocator->UsableSize(allocation.get(), data.size());
+      }
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+      return malloc_usable_size(allocation.get());
+#else
+      return data.size();
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    } else {
+      return 0;  // no extra memory is occupied by the data
+    }
+  }
+
+  size_t ApproximateMemoryUsage() const {
+    return usable_size() + sizeof(*this);
+  }
+
+  BlockContents(BlockContents&& other) noexcept { *this = std::move(other); }
+
+  BlockContents& operator=(BlockContents&& other) {
+    data = std::move(other.data);
+    allocation = std::move(other.allocation);
+#ifndef NDEBUG
+    has_trailer = other.has_trailer;
+#endif  // NDEBUG
+    return *this;
+  }
+};
+
+// The `data` points to serialized block contents read in from file, which
+// must be compressed and include a trailer beyond `size`. A new buffer is
+// allocated with the given allocator (or default) and the uncompressed
+// contents are returned in `out_contents`.
+// format_version is as defined in include/rocksdb/table.h, which is
+// used to determine compression format version.
+Status UncompressSerializedBlock(const UncompressionInfo& info,
+                                 const char* data, size_t size,
+                                 BlockContents* out_contents,
+                                 uint32_t format_version,
+                                 const ImmutableOptions& ioptions,
+                                 MemoryAllocator* allocator = nullptr);
+
+// This is a variant of UncompressSerializedBlock that does not expect a
+// block trailer beyond `size`. (CompressionType is taken from `info`.)
+Status UncompressBlockData(const UncompressionInfo& info, const char* data,
+                           size_t size, BlockContents* out_contents,
+                           uint32_t format_version,
+                           const ImmutableOptions& ioptions,
+                           MemoryAllocator* allocator = nullptr);
+
+// Replace db_host_id contents with the real hostname if necessary
+Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id);
+
+// Implementation details follow. Clients should ignore them.
+
+// TODO(andrewkr): we should prefer one way of representing a
+// null/uninitialized BlockHandle. Currently we use zeros for null and use
+// negation-of-zeros for uninitialized.
+inline BlockHandle::BlockHandle() : BlockHandle(~uint64_t{0}, ~uint64_t{0}) {}
+
+inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size)
+    : offset_(_offset), size_(_size) {}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/get_context.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/get_context.cc
new file mode 100644
index 0000000000..8f5cd75f15
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/get_context.cc
@@ -0,0 +1,616 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/get_context.h"
+
+#include "db/blob/blob_fetcher.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/read_callback.h"
+#include "db/wide/wide_column_serialization.h"
+#include "monitoring/file_read_sample.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics_impl.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) {
+  if (replay_log) {
+    if (replay_log->empty()) {
+      // Optimization: in the common case of only one operation in the
+      // log, we allocate the exact amount of space needed.
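+      // Each replay-log record is one ValueType byte followed by a
+      // length-prefixed slice (varint32 length, then the value bytes);
+      // replayGetContextLog() decodes records in this same order.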
+ replay_log->reserve(1 + VarintLength(value.size()) + value.size()); + } + replay_log->push_back(type); + PutLengthPrefixedSlice(replay_log, value); + } +} + +} // namespace + +GetContext::GetContext( + const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, const Slice& user_key, + PinnableSlice* pinnable_val, PinnableWideColumns* columns, + std::string* timestamp, bool* value_found, MergeContext* merge_context, + bool do_merge, SequenceNumber* _max_covering_tombstone_seq, + SystemClock* clock, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback, + bool* is_blob_index, uint64_t tracing_get_id, BlobFetcher* blob_fetcher) + : ucmp_(ucmp), + merge_operator_(merge_operator), + logger_(logger), + statistics_(statistics), + state_(init_state), + user_key_(user_key), + pinnable_val_(pinnable_val), + columns_(columns), + timestamp_(timestamp), + value_found_(value_found), + merge_context_(merge_context), + max_covering_tombstone_seq_(_max_covering_tombstone_seq), + clock_(clock), + seq_(seq), + replay_log_(nullptr), + pinned_iters_mgr_(_pinned_iters_mgr), + callback_(callback), + do_merge_(do_merge), + is_blob_index_(is_blob_index), + tracing_get_id_(tracing_get_id), + blob_fetcher_(blob_fetcher) { + if (seq_) { + *seq_ = kMaxSequenceNumber; + } + sample_ = should_sample_file_read(); +} + +GetContext::GetContext(const Comparator* ucmp, + const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, + const Slice& user_key, PinnableSlice* pinnable_val, + PinnableWideColumns* columns, bool* value_found, + MergeContext* merge_context, bool do_merge, + SequenceNumber* _max_covering_tombstone_seq, + SystemClock* clock, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, + ReadCallback* callback, bool* is_blob_index, + uint64_t tracing_get_id, BlobFetcher* blob_fetcher) + : GetContext(ucmp, merge_operator, logger, statistics, init_state, user_key, + pinnable_val, columns, /*timestamp=*/nullptr, value_found, + merge_context, do_merge, _max_covering_tombstone_seq, clock, + seq, _pinned_iters_mgr, callback, is_blob_index, + tracing_get_id, blob_fetcher) {} + +// Called from TableCache::Get and Table::Get when file/block in which +// key may exist are not there in TableCache/BlockCache respectively. 
In this
+// case we can't guarantee that key does not exist and are not permitted to do
+// IO to be certain. Set the status=kFound and value_found=false to let the
+// caller know that the key may exist but is not there in memory.
+void GetContext::MarkKeyMayExist() {
+  state_ = kFound;
+  if (value_found_ != nullptr) {
+    *value_found_ = false;
+  }
+}
+
+void GetContext::SaveValue(const Slice& value, SequenceNumber /*seq*/) {
+  assert(state_ == kNotFound);
+  appendToReplayLog(replay_log_, kTypeValue, value);
+
+  state_ = kFound;
+  if (LIKELY(pinnable_val_ != nullptr)) {
+    pinnable_val_->PinSelf(value);
+  }
+}
+
+void GetContext::ReportCounters() {
+  if (get_context_stats_.num_cache_hit > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_HIT, get_context_stats_.num_cache_hit);
+  }
+  if (get_context_stats_.num_cache_index_hit > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_INDEX_HIT,
+               get_context_stats_.num_cache_index_hit);
+  }
+  if (get_context_stats_.num_cache_data_hit > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_DATA_HIT,
+               get_context_stats_.num_cache_data_hit);
+  }
+  if (get_context_stats_.num_cache_filter_hit > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_FILTER_HIT,
+               get_context_stats_.num_cache_filter_hit);
+  }
+  if (get_context_stats_.num_cache_compression_dict_hit > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_HIT,
+               get_context_stats_.num_cache_compression_dict_hit);
+  }
+  if (get_context_stats_.num_cache_index_miss > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_INDEX_MISS,
+               get_context_stats_.num_cache_index_miss);
+  }
+  if (get_context_stats_.num_cache_filter_miss > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_FILTER_MISS,
+               get_context_stats_.num_cache_filter_miss);
+  }
+  if (get_context_stats_.num_cache_data_miss > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_DATA_MISS,
+               get_context_stats_.num_cache_data_miss);
+  }
+  if (get_context_stats_.num_cache_compression_dict_miss > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_MISS,
+               get_context_stats_.num_cache_compression_dict_miss);
+  }
+  if (get_context_stats_.num_cache_bytes_read > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_BYTES_READ,
+               get_context_stats_.num_cache_bytes_read);
+  }
+  if (get_context_stats_.num_cache_miss > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_MISS,
+               get_context_stats_.num_cache_miss);
+  }
+  if (get_context_stats_.num_cache_add > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add);
+  }
+  if (get_context_stats_.num_cache_add_redundant > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_ADD_REDUNDANT,
+               get_context_stats_.num_cache_add_redundant);
+  }
+  if (get_context_stats_.num_cache_bytes_write > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE,
+               get_context_stats_.num_cache_bytes_write);
+  }
+  if (get_context_stats_.num_cache_index_add > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD,
+               get_context_stats_.num_cache_index_add);
+  }
+  if (get_context_stats_.num_cache_index_add_redundant > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD_REDUNDANT,
+               get_context_stats_.num_cache_index_add_redundant);
+  }
+  if (get_context_stats_.num_cache_index_bytes_insert > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT,
+               get_context_stats_.num_cache_index_bytes_insert);
+  }
+  if (get_context_stats_.num_cache_data_add > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_DATA_ADD,
+               get_context_stats_.num_cache_data_add);
+  }
+  if (get_context_stats_.num_cache_data_add_redundant > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_DATA_ADD_REDUNDANT,
+               
get_context_stats_.num_cache_data_add_redundant); + } + if (get_context_stats_.num_cache_data_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT, + get_context_stats_.num_cache_data_bytes_insert); + } + if (get_context_stats_.num_cache_filter_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD, + get_context_stats_.num_cache_filter_add); + } + if (get_context_stats_.num_cache_filter_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD_REDUNDANT, + get_context_stats_.num_cache_filter_add_redundant); + } + if (get_context_stats_.num_cache_filter_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT, + get_context_stats_.num_cache_filter_bytes_insert); + } + if (get_context_stats_.num_cache_compression_dict_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD, + get_context_stats_.num_cache_compression_dict_add); + } + if (get_context_stats_.num_cache_compression_dict_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, + get_context_stats_.num_cache_compression_dict_add_redundant); + } + if (get_context_stats_.num_cache_compression_dict_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + get_context_stats_.num_cache_compression_dict_bytes_insert); + } +} + +bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, + const Slice& value, bool* matched, + Cleanable* value_pinner) { + assert(matched); + assert((state_ != kMerge && parsed_key.type != kTypeMerge) || + merge_context_ != nullptr); + if (ucmp_->EqualWithoutTimestamp(parsed_key.user_key, user_key_)) { + *matched = true; + // If the value is not in the snapshot, skip it + if (!CheckCallback(parsed_key.sequence)) { + return true; // to continue to the next seq + } + + appendToReplayLog(replay_log_, parsed_key.type, value); + + if (seq_ != nullptr) { + // Set the sequence number if it is uninitialized + if (*seq_ == kMaxSequenceNumber) { + *seq_ = parsed_key.sequence; + } + if (max_covering_tombstone_seq_) { + *seq_ = std::max(*seq_, *max_covering_tombstone_seq_); + } + } + + size_t ts_sz = ucmp_->timestamp_size(); + if (ts_sz > 0 && timestamp_ != nullptr) { + if (!timestamp_->empty()) { + assert(ts_sz == timestamp_->size()); + // `timestamp` can be set before `SaveValue` is ever called + // when max_covering_tombstone_seq_ was set. + // If this key has a higher sequence number than range tombstone, + // then timestamp should be updated. `ts_from_rangetombstone_` is + // set to false afterwards so that only the key with highest seqno + // updates the timestamp. + if (ts_from_rangetombstone_) { + assert(max_covering_tombstone_seq_); + if (parsed_key.sequence > *max_covering_tombstone_seq_) { + Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz); + timestamp_->assign(ts.data(), ts.size()); + ts_from_rangetombstone_ = false; + } + } + } + // TODO optimize for small size ts + const std::string kMaxTs(ts_sz, '\xff'); + if (timestamp_->empty() || + ucmp_->CompareTimestamp(*timestamp_, kMaxTs) == 0) { + Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz); + timestamp_->assign(ts.data(), ts.size()); + } + } + + auto type = parsed_key.type; + // Key matches. 
Process it + if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex || + type == kTypeWideColumnEntity || type == kTypeDeletion || + type == kTypeDeletionWithTimestamp || type == kTypeSingleDeletion) && + max_covering_tombstone_seq_ != nullptr && + *max_covering_tombstone_seq_ > parsed_key.sequence) { + // Note that deletion types are also considered, this is for the case + // when we need to return timestamp to user. If a range tombstone has a + // higher seqno than point tombstone, its timestamp should be returned. + type = kTypeRangeDeletion; + } + switch (type) { + case kTypeValue: + case kTypeBlobIndex: + case kTypeWideColumnEntity: + assert(state_ == kNotFound || state_ == kMerge); + if (type == kTypeBlobIndex) { + if (is_blob_index_ == nullptr) { + // Blob value not supported. Stop. + state_ = kUnexpectedBlobIndex; + return false; + } + } + + if (is_blob_index_ != nullptr) { + *is_blob_index_ = (type == kTypeBlobIndex); + } + + if (kNotFound == state_) { + state_ = kFound; + if (do_merge_) { + if (type == kTypeBlobIndex && ucmp_->timestamp_size() != 0) { + ukey_with_ts_found_.PinSelf(parsed_key.user_key); + } + if (LIKELY(pinnable_val_ != nullptr)) { + Slice value_to_use = value; + + if (type == kTypeWideColumnEntity) { + Slice value_copy = value; + + if (!WideColumnSerialization::GetValueOfDefaultColumn( + value_copy, value_to_use) + .ok()) { + state_ = kCorrupt; + return false; + } + } + + if (LIKELY(value_pinner != nullptr)) { + // If the backing resources for the value are provided, pin them + pinnable_val_->PinSlice(value_to_use, value_pinner); + } else { + TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", + this); + // Otherwise copy the value + pinnable_val_->PinSelf(value_to_use); + } + } else if (columns_ != nullptr) { + if (type == kTypeWideColumnEntity) { + if (!columns_->SetWideColumnValue(value, value_pinner).ok()) { + state_ = kCorrupt; + return false; + } + } else { + columns_->SetPlainValue(value, value_pinner); + } + } + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + if (type == kTypeBlobIndex) { + PinnableSlice pin_val; + if (GetBlobValue(parsed_key.user_key, value, &pin_val) == false) { + return false; + } + Slice blob_value(pin_val); + push_operand(blob_value, nullptr); + } else if (type == kTypeWideColumnEntity) { + Slice value_copy = value; + Slice value_of_default; + + if (!WideColumnSerialization::GetValueOfDefaultColumn( + value_copy, value_of_default) + .ok()) { + state_ = kCorrupt; + return false; + } + + push_operand(value_of_default, value_pinner); + } else { + assert(type == kTypeValue); + push_operand(value, value_pinner); + } + } + } else if (kMerge == state_) { + assert(merge_operator_ != nullptr); + if (type == kTypeBlobIndex) { + PinnableSlice pin_val; + if (GetBlobValue(parsed_key.user_key, value, &pin_val) == false) { + return false; + } + Slice blob_value(pin_val); + state_ = kFound; + if (do_merge_) { + Merge(&blob_value); + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(blob_value, nullptr); + } + } else if (type == kTypeWideColumnEntity) { + state_ = kFound; + + if (do_merge_) { + MergeWithEntity(value); + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + Slice 
value_copy = value; + Slice value_of_default; + + if (!WideColumnSerialization::GetValueOfDefaultColumn( + value_copy, value_of_default) + .ok()) { + state_ = kCorrupt; + return false; + } + + push_operand(value_of_default, value_pinner); + } + } else { + assert(type == kTypeValue); + + state_ = kFound; + if (do_merge_) { + Merge(&value); + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); + } + } + } + return false; + + case kTypeDeletion: + case kTypeDeletionWithTimestamp: + case kTypeSingleDeletion: + case kTypeRangeDeletion: + // TODO(noetzli): Verify correctness once merge of single-deletes + // is supported + assert(state_ == kNotFound || state_ == kMerge); + if (kNotFound == state_) { + state_ = kDeleted; + } else if (kMerge == state_) { + state_ = kFound; + if (do_merge_) { + Merge(nullptr); + } + // If do_merge_ = false then the current value shouldn't be part of + // merge_context_->operand_list + } + return false; + + case kTypeMerge: + assert(state_ == kNotFound || state_ == kMerge); + state_ = kMerge; + // value_pinner is not set from plain_table_reader.cc for example. + push_operand(value, value_pinner); + PERF_COUNTER_ADD(internal_merge_point_lookup_count, 1); + + if (do_merge_ && merge_operator_ != nullptr && + merge_operator_->ShouldMerge( + merge_context_->GetOperandsDirectionBackward())) { + state_ = kFound; + Merge(nullptr); + return false; + } + return true; + + default: + assert(false); + break; + } + } + + // state_ could be Corrupt, merge or notfound + return false; +} + +void GetContext::Merge(const Slice* value) { + assert(do_merge_); + assert(!pinnable_val_ || !columns_); + + std::string result; + // `op_failure_scope` (an output parameter) is not provided (set to nullptr) + // since a failure must be propagated regardless of its value. + const Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, value, merge_context_->GetOperands(), &result, + logger_, statistics_, clock_, /* result_operand */ nullptr, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + if (!s.ok()) { + if (s.subcode() == Status::SubCode::kMergeOperatorFailed) { + state_ = kMergeOperatorFailed; + } else { + state_ = kCorrupt; + } + return; + } + + if (LIKELY(pinnable_val_ != nullptr)) { + *(pinnable_val_->GetSelf()) = std::move(result); + pinnable_val_->PinSelf(); + return; + } + + assert(columns_); + columns_->SetPlainValue(std::move(result)); +} + +void GetContext::MergeWithEntity(Slice entity) { + assert(do_merge_); + assert(!pinnable_val_ || !columns_); + + if (LIKELY(pinnable_val_ != nullptr)) { + Slice value_of_default; + + { + const Status s = WideColumnSerialization::GetValueOfDefaultColumn( + entity, value_of_default); + if (!s.ok()) { + state_ = kCorrupt; + return; + } + } + + { + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its value. 
+      const Status s = MergeHelper::TimedFullMerge(
+          merge_operator_, user_key_, &value_of_default,
+          merge_context_->GetOperands(), pinnable_val_->GetSelf(), logger_,
+          statistics_, clock_, /* result_operand */ nullptr,
+          /* update_num_ops_stats */ true,
+          /* op_failure_scope */ nullptr);
+      if (!s.ok()) {
+        if (s.subcode() == Status::SubCode::kMergeOperatorFailed) {
+          state_ = kMergeOperatorFailed;
+        } else {
+          state_ = kCorrupt;
+        }
+        return;
+      }
+    }
+
+    pinnable_val_->PinSelf();
+    return;
+  }
+
+  std::string result;
+
+  {
+    // `op_failure_scope` (an output parameter) is not provided (set to nullptr)
+    // since a failure must be propagated regardless of its value.
+    const Status s = MergeHelper::TimedFullMergeWithEntity(
+        merge_operator_, user_key_, entity, merge_context_->GetOperands(),
+        &result, logger_, statistics_, clock_, /* update_num_ops_stats */ true,
+        /* op_failure_scope */ nullptr);
+    if (!s.ok()) {
+      if (s.subcode() == Status::SubCode::kMergeOperatorFailed) {
+        state_ = kMergeOperatorFailed;
+      } else {
+        state_ = kCorrupt;
+      }
+      return;
+    }
+  }
+
+  {
+    assert(columns_);
+    const Status s = columns_->SetWideColumnValue(std::move(result));
+    if (!s.ok()) {
+      state_ = kCorrupt;
+      return;
+    }
+  }
+}
+
+bool GetContext::GetBlobValue(const Slice& user_key, const Slice& blob_index,
+                              PinnableSlice* blob_value) {
+  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+  constexpr uint64_t* bytes_read = nullptr;
+
+  Status status = blob_fetcher_->FetchBlob(
+      user_key, blob_index, prefetch_buffer, blob_value, bytes_read);
+  if (!status.ok()) {
+    if (status.IsIncomplete()) {
+      // FIXME: this code is not covered by unit tests
+      MarkKeyMayExist();
+      return false;
+    }
+    state_ = kCorrupt;
+    return false;
+  }
+  *is_blob_index_ = false;
+  return true;
+}
+
+void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) {
+  // TODO(yanqin) preserve timestamps information in merge_context
+  if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() &&
+      value_pinner != nullptr) {
+    value_pinner->DelegateCleanupsTo(pinned_iters_mgr());
+    merge_context_->PushOperand(value, true /*value_pinned*/);
+  } else {
+    merge_context_->PushOperand(value, false);
+  }
+}
+
+void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
+                         GetContext* get_context, Cleanable* value_pinner) {
+  Slice s = replay_log;
+  while (s.size()) {
+    auto type = static_cast<ValueType>(*s.data());
+    s.remove_prefix(1);
+    Slice value;
+    bool ret = GetLengthPrefixedSlice(&s, &value);
+    assert(ret);
+    (void)ret;
+
+    bool dont_care __attribute__((__unused__));
+    // Since the SequenceNumber is not stored and unknown, we will use
+    // kMaxSequenceNumber.
+    get_context->SaveValue(
+        ParsedInternalKey(user_key, kMaxSequenceNumber, type), value,
+        &dont_care, value_pinner);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
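Editorial note: the replay-log record layout used by appendToReplayLog()/replayGetContextLog() round-trips as shown below; the varint codec here is a simplified stand-in for RocksDB's coding utilities, not the real API:

    #include <cassert>
    #include <cstdint>
    #include <string>

    void PutVarint32(std::string* dst, uint32_t v) {
      while (v >= 0x80) {
        dst->push_back(static_cast<char>(v | 0x80));
        v >>= 7;
      }
      dst->push_back(static_cast<char>(v));
    }

    bool GetVarint32(const char** p, const char* limit, uint32_t* v) {
      uint32_t result = 0;
      for (int shift = 0; shift <= 28 && *p < limit; shift += 7) {
        uint32_t byte = static_cast<unsigned char>(*(*p)++);
        result |= (byte & 0x7f) << shift;
        if ((byte & 0x80) == 0) {
          *v = result;
          return true;
        }
      }
      return false;
    }

    int main() {
      const char kToyTypeValue = 0x1;  // stand-in for ValueType
      std::string log;
      log.push_back(kToyTypeValue);    // 1. type byte
      PutVarint32(&log, 5);            // 2. varint32 length
      log.append("hello");            // 3. value bytes

      const char* p = log.data();
      const char* limit = p + log.size();
      char type = *p++;
      uint32_t len = 0;
      assert(GetVarint32(&p, limit, &len));
      assert(type == kToyTypeValue && std::string(p, len) == "hello");
    }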
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/get_context.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/get_context.h
new file mode 100644
index 0000000000..528cd14fd8
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/get_context.h
@@ -0,0 +1,245 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+
+#include "db/read_callback.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+class BlobFetcher;
+class Comparator;
+class Logger;
+class MergeContext;
+class MergeOperator;
+class PinnableWideColumns;
+class PinnedIteratorsManager;
+class Statistics;
+class SystemClock;
+struct ParsedInternalKey;
+
+// Data structure for accumulating statistics during a point lookup. At the
+// end of the point lookup, the corresponding ticker stats are updated. This
+// avoids the overhead of frequent ticker stats updates.
+struct GetContextStats {
+  uint64_t num_cache_hit = 0;
+  uint64_t num_cache_index_hit = 0;
+  uint64_t num_cache_data_hit = 0;
+  uint64_t num_cache_filter_hit = 0;
+  uint64_t num_cache_compression_dict_hit = 0;
+  uint64_t num_cache_index_miss = 0;
+  uint64_t num_cache_filter_miss = 0;
+  uint64_t num_cache_data_miss = 0;
+  uint64_t num_cache_compression_dict_miss = 0;
+  uint64_t num_cache_bytes_read = 0;
+  uint64_t num_cache_miss = 0;
+  uint64_t num_cache_add = 0;
+  uint64_t num_cache_add_redundant = 0;
+  uint64_t num_cache_bytes_write = 0;
+  uint64_t num_cache_index_add = 0;
+  uint64_t num_cache_index_add_redundant = 0;
+  uint64_t num_cache_index_bytes_insert = 0;
+  uint64_t num_cache_data_add = 0;
+  uint64_t num_cache_data_add_redundant = 0;
+  uint64_t num_cache_data_bytes_insert = 0;
+  uint64_t num_cache_filter_add = 0;
+  uint64_t num_cache_filter_add_redundant = 0;
+  uint64_t num_cache_filter_bytes_insert = 0;
+  uint64_t num_cache_compression_dict_add = 0;
+  uint64_t num_cache_compression_dict_add_redundant = 0;
+  uint64_t num_cache_compression_dict_bytes_insert = 0;
+  // MultiGet stats.
+  uint64_t num_filter_read = 0;
+  uint64_t num_index_read = 0;
+  uint64_t num_sst_read = 0;
+};
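Editorial note: a minimal sketch of the batching pattern this struct supports, using hypothetical stand-in names (LocalStats, RecordTickStub): counters are bumped cheaply during one lookup and flushed once at the end, which is what GetContext::ReportCounters() does against the real ticker stats.

    #include <cstdint>

    struct LocalStats {  // stand-in for GetContextStats
      uint64_t cache_hit = 0;
      uint64_t cache_miss = 0;
    };

    void RecordTickStub(uint64_t* global_counter, uint64_t delta) {
      *global_counter += delta;  // the real RecordTick updates Statistics
    }

    // Flush-once pattern: only non-zero counters touch the shared stats.
    void Publish(const LocalStats& s, uint64_t* hit, uint64_t* miss) {
      if (s.cache_hit > 0) RecordTickStub(hit, s.cache_hit);
      if (s.cache_miss > 0) RecordTickStub(miss, s.cache_miss);
    }

    int main() {
      uint64_t hit = 0, miss = 0;
      LocalStats s;
      s.cache_hit = 3;  // bumped locally during the lookup
      Publish(s, &hit, &miss);
      return (hit == 3 && miss == 0) ? 0 : 1;
    }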
+
+// A class to hold context about a point lookup, such as the pointer to the
+// value slice, the key, the merge context, and the current state of the
+// lookup. Any user using GetContext to track the lookup result must call
+// SaveValue() whenever the internal key is found. This can happen
+// repeatedly in case of merge operands. In case the key may exist with
+// high probability, but IO is required to confirm and the user doesn't allow
+// it, MarkKeyMayExist() must be called instead of SaveValue().
+class GetContext {
+ public:
+  // Current state of the point lookup. All except kNotFound and kMerge are
+  // terminal states.
+  enum GetState {
+    kNotFound,
+    kFound,
+    kDeleted,
+    kCorrupt,
+    kMerge,  // saver contains the current merge result (the operands)
+    kUnexpectedBlobIndex,
+    kMergeOperatorFailed,
+  };
+  GetContextStats get_context_stats_;
+
+  // Constructor
+  // @param value Holds the value corresponding to user_key. If it's nullptr
+  //              then return all merge operands corresponding to user_key
+  //              via merge_context
+  // @param value_found If non-nullptr, set to false if key may be present
+  //              but we can't be certain because we cannot do IO
+  // @param max_covering_tombstone_seq Pointer to the highest sequence number
+  //              of a range deletion covering the key. When an internal key
+  //              is found with a smaller sequence number, the lookup
+  //              terminates.
+  // @param seq If non-nullptr, the sequence number of the found key will be
+  //              saved here
+  // @param callback Pointer to ReadCallback to perform additional checks
+  //              for visibility of a key
+  // @param is_blob_index If non-nullptr, will be used to indicate if a found
+  //              key is of type blob index
+  // @param do_merge True if the value associated with user_key has to be
+  //              returned and false if all the merge operands associated with
+  //              user_key have to be returned. If do_merge=false then all the
+  //              merge operands are stored in merge_context and they are never
+  //              merged. The value pointer is untouched.
+  GetContext(const Comparator* ucmp, const MergeOperator* merge_operator,
+             Logger* logger, Statistics* statistics, GetState init_state,
+             const Slice& user_key, PinnableSlice* value,
+             PinnableWideColumns* columns, bool* value_found,
+             MergeContext* merge_context, bool do_merge,
+             SequenceNumber* max_covering_tombstone_seq, SystemClock* clock,
+             SequenceNumber* seq = nullptr,
+             PinnedIteratorsManager* _pinned_iters_mgr = nullptr,
+             ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+             uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr);
+  GetContext(const Comparator* ucmp, const MergeOperator* merge_operator,
+             Logger* logger, Statistics* statistics, GetState init_state,
+             const Slice& user_key, PinnableSlice* value,
+             PinnableWideColumns* columns, std::string* timestamp,
+             bool* value_found, MergeContext* merge_context, bool do_merge,
+             SequenceNumber* max_covering_tombstone_seq, SystemClock* clock,
+             SequenceNumber* seq = nullptr,
+             PinnedIteratorsManager* _pinned_iters_mgr = nullptr,
+             ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+             uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr);
+
+  GetContext() = delete;
+
+  // This can be called to indicate that a key may be present, but cannot be
+  // confirmed due to IO not being allowed.
+  void MarkKeyMayExist();
+
+  // Records this key, value, and any meta-data (such as sequence number and
+  // state) into this GetContext.
+  //
+  // If the parsed_key matches the user key that we are looking for, sets
+  // matched to true.
+  //
+  // Returns True if more keys need to be read (due to merges) or
+  // False if the complete value has been found.
+  bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value,
+                 bool* matched, Cleanable* value_pinner = nullptr);
+
+  // Simplified version of the previous function. Should only be used when we
+  // know that the operation is a Put.
+  void SaveValue(const Slice& value, SequenceNumber seq);
+
+  GetState State() const { return state_; }
+
+  SequenceNumber* max_covering_tombstone_seq() {
+    return max_covering_tombstone_seq_;
+  }
+
+  bool NeedTimestamp() { return timestamp_ != nullptr; }
+
+  void SetTimestampFromRangeTombstone(const Slice& timestamp) {
+    assert(timestamp_);
+    timestamp_->assign(timestamp.data(), timestamp.size());
+    ts_from_rangetombstone_ = true;
+  }
+
+  PinnedIteratorsManager* pinned_iters_mgr() { return pinned_iters_mgr_; }
+
+  // If a non-null string is passed, all the SaveValue calls will be
+  // logged into the string. The operations can then be replayed on
+  // another GetContext with replayGetContextLog.
+  void SetReplayLog(std::string* replay_log) { replay_log_ = replay_log; }
+
+  // Do we need to fetch the SequenceNumber for this key?
+ bool NeedToReadSequence() const { return (seq_ != nullptr); } + + bool sample() const { return sample_; } + + bool CheckCallback(SequenceNumber seq) { + if (callback_) { + return callback_->IsVisible(seq); + } + return true; + } + + void ReportCounters(); + + bool has_callback() const { return callback_ != nullptr; } + + const Slice& ukey_to_get_blob_value() const { + if (!ukey_with_ts_found_.empty()) { + return ukey_with_ts_found_; + } else { + return user_key_; + } + } + + uint64_t get_tracing_get_id() const { return tracing_get_id_; } + + void push_operand(const Slice& value, Cleanable* value_pinner); + + private: + void Merge(const Slice* value); + void MergeWithEntity(Slice entity); + bool GetBlobValue(const Slice& user_key, const Slice& blob_index, + PinnableSlice* blob_value); + + const Comparator* ucmp_; + const MergeOperator* merge_operator_; + // the merge operations encountered; + Logger* logger_; + Statistics* statistics_; + + GetState state_; + Slice user_key_; + // When a blob index is found with the user key containing timestamp, + // this copies the corresponding user key on record in the sst file + // and is later used for blob verification. + PinnableSlice ukey_with_ts_found_; + PinnableSlice* pinnable_val_; + PinnableWideColumns* columns_; + std::string* timestamp_; + bool ts_from_rangetombstone_{false}; + bool* value_found_; // Is value set correctly? Used by KeyMayExist + MergeContext* merge_context_; + SequenceNumber* max_covering_tombstone_seq_; + SystemClock* clock_; + // If a key is found, seq_ will be set to the SequenceNumber of most recent + // write to the key or kMaxSequenceNumber if unknown + SequenceNumber* seq_; + std::string* replay_log_; + // Used to temporarily pin blocks when state_ == GetContext::kMerge + PinnedIteratorsManager* pinned_iters_mgr_; + ReadCallback* callback_; + bool sample_; + // Value is true if it's called as part of DB Get API and false if it's + // called as part of DB GetMergeOperands API. When it's false merge operators + // are never merged. + bool do_merge_; + bool* is_blob_index_; + // Used for block cache tracing only. A tracing get id uniquely identifies a + // Get or a MultiGet. + const uint64_t tracing_get_id_; + BlobFetcher* blob_fetcher_; +}; + +// Call this to replay a log and bring the get_context up to date. The replay +// log must have been created by another GetContext object, whose replay log +// must have been set by calling GetContext::SetReplayLog(). +void replayGetContextLog(const Slice& replay_log, const Slice& user_key, + GetContext* get_context, + Cleanable* value_pinner = nullptr); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/internal_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/internal_iterator.h new file mode 100644 index 0000000000..8015ed6351 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/internal_iterator.h @@ -0,0 +1,224 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+//
+
+#pragma once
+
+#include <string>
+
+#include "db/dbformat.h"
+#include "file/readahead_file_info.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/status.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PinnedIteratorsManager;
+
+enum class IterBoundCheck : char {
+  kUnknown = 0,
+  kOutOfBound,
+  kInbound,
+};
+
+struct IterateResult {
+  Slice key;
+  IterBoundCheck bound_check_result = IterBoundCheck::kUnknown;
+  // If false, PrepareValue() needs to be called before value().
+  bool value_prepared = true;
+};
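Editorial note: one way a caller can use the IterBoundCheck tri-state above is to skip its own upper-bound comparison when the iterator has already classified the key; a self-contained sketch (the enum is restated locally, WithinUpperBound is hypothetical):

    #include <cassert>
    #include <string>

    enum class IterBoundCheck : char { kUnknown = 0, kOutOfBound, kInbound };

    bool WithinUpperBound(const std::string& key, const std::string& upper,
                          IterBoundCheck hint) {
      if (hint == IterBoundCheck::kInbound) return true;    // iterator checked
      if (hint == IterBoundCheck::kOutOfBound) return false;
      return key < upper;  // kUnknown: caller compares for itself
    }

    int main() {
      assert(WithinUpperBound("b", "c", IterBoundCheck::kUnknown));
      assert(!WithinUpperBound("d", "c", IterBoundCheck::kUnknown));
      assert(WithinUpperBound("zzz", "c", IterBoundCheck::kInbound));
    }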
+template <class TValue>
+class InternalIteratorBase : public Cleanable {
+ public:
+  InternalIteratorBase() {}
+
+  // No copying allowed
+  InternalIteratorBase(const InternalIteratorBase&) = delete;
+  InternalIteratorBase& operator=(const InternalIteratorBase&) = delete;
+
+  virtual ~InternalIteratorBase() {}
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid. This method returns true iff the iterator is valid.
+  // Always returns false if !status().ok().
+  virtual bool Valid() const = 0;
+
+  // Position at the first key in the source. The iterator is Valid()
+  // after this call iff the source is not empty.
+  virtual void SeekToFirst() = 0;
+
+  // Position at the last key in the source. The iterator is
+  // Valid() after this call iff the source is not empty.
+  virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that is at or past target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  // All Seek*() methods clear any error status() that the iterator had prior
+  // to the call; after the seek, status() indicates only the error (if any)
+  // that happened during the seek, not any past errors.
+  // 'target' contains user timestamp if timestamp is enabled.
+  virtual void Seek(const Slice& target) = 0;
+
+  // Position at the first key in the source that is at or before target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or before target.
+  virtual void SeekForPrev(const Slice& target) = 0;
+
+  // Moves to the next entry in the source. After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Moves to the next entry in the source and returns the result. Iterator
+  // implementations should override this method to help methods inline
+  // better, or when UpperBoundCheckResult() is non-trivial.
+  // REQUIRES: Valid()
+  virtual bool NextAndGetResult(IterateResult* result) {
+    Next();
+    bool is_valid = Valid();
+    if (is_valid) {
+      result->key = key();
+      // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual
+      // call. If an implementation has non-trivial UpperBoundCheckResult(),
+      // it should also override NextAndGetResult().
+      result->bound_check_result = IterBoundCheck::kUnknown;
+      result->value_prepared = false;
+      assert(UpperBoundCheckResult() != IterBoundCheck::kOutOfBound);
+    }
+    return is_valid;
+  }
+
+  // Moves to the previous entry in the source. After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry. The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return user key for the current entry.
+  // REQUIRES: Valid()
+  virtual Slice user_key() const { return ExtractUserKey(key()); }
+
+  // Return the value for the current entry. The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  // REQUIRES: PrepareValue() has been called if needed (see PrepareValue()).
+  virtual TValue value() const = 0;
+
+  // If an error has occurred, return it. Else return an ok status.
+  // If non-blocking IO is requested and this operation cannot be
+  // satisfied without doing some IO, then this returns Status::Incomplete().
+  virtual Status status() const = 0;
+
+  // For some types of iterators, sometimes Seek()/Next()/SeekForPrev()/etc may
+  // load the key but not the value (to avoid the IO cost of reading the value
+  // from disk if it won't be needed). This method loads the value in such
+  // situations.
+  //
+  // Needs to be called before value() at least once after each iterator
+  // movement (except if IterateResult::value_prepared = true), for iterators
+  // created with allow_unprepared_value = true.
+  //
+  // Returns false if an error occurred; in this case Valid() is also changed
+  // to false, and status() is changed to non-ok.
+  // REQUIRES: Valid()
+  virtual bool PrepareValue() { return true; }
+
+  // Keys returned from this iterator can be smaller than iterate_lower_bound.
+  virtual bool MayBeOutOfLowerBound() { return true; }
+
+  // If the iterator has checked the key against iterate_upper_bound, returns
+  // the result here. The function can be used by users of the iterator to
+  // skip their own checks. If Valid() = true, IterBoundCheck::kUnknown is
+  // always a valid value. If Valid() = false, IterBoundCheck::kOutOfBound
+  // indicates that the iterator is filtered out by upper bound checks.
+  virtual IterBoundCheck UpperBoundCheckResult() {
+    return IterBoundCheck::kUnknown;
+  }
+
+  // Pass the PinnedIteratorsManager to the Iterator. Most Iterators don't
+  // communicate with PinnedIteratorsManager, so the default implementation
+  // is a no-op; Iterators that do need to communicate with it implement this
+  // function and use the passed pointer to do so.
+  virtual void SetPinnedItersMgr(PinnedIteratorsManager* /*pinned_iters_mgr*/) {
+  }
+
+  // If true, this means that the Slice returned by key() is valid as long as
+  // PinnedIteratorsManager::ReleasePinnedData is not called and the
+  // Iterator is not deleted.
+  //
+  // IsKeyPinned() is guaranteed to always return true if
+  //  - Iterator is created with ReadOptions::pin_data = true
+  //  - DB tables were created with BlockBasedTableOptions::use_delta_encoding
+  //    set to false.
+  virtual bool IsKeyPinned() const { return false; }
+
+  // If true, this means that the Slice returned by value() is valid as long as
+  // PinnedIteratorsManager::ReleasePinnedData is not called and the
+  // Iterator is not deleted.
+  // REQUIRES: Same as for value().
+  virtual bool IsValuePinned() const { return false; }
+
+  virtual Status GetProperty(std::string /*prop_name*/, std::string* /*prop*/) {
+    return Status::NotSupported("");
+  }
+
+  // When the iterator moves from one file to another file at the same level,
+  // the new file's readahead state (details of the last block read) is updated
+  // with the previous file's readahead state. This way the internal
+  // readahead_size of the Prefetch Buffer doesn't start from scratch and can
+  // fall back to 8KB with no prefetch if reads are not sequential.
+  //
+  // The default implementation is a no-op; it's implemented by iterators
+  // that support it.
+  virtual void GetReadaheadState(ReadaheadFileInfo* /*readahead_file_info*/) {}
+
+  // The default implementation is a no-op; it's implemented by iterators
+  // that support it.
+  virtual void SetReadaheadState(ReadaheadFileInfo* /*readahead_file_info*/) {}
+
+  // When used under merging iterator, LevelIterator treats file boundaries
+  // as sentinel keys to prevent it from moving to the next SST file before
+  // range tombstones in the current SST file are no longer needed. This method
+  // makes it cheap to check if the current key is a sentinel key. This should
+  // only be used by MergingIterator and LevelIterator for now.
+  virtual bool IsDeleteRangeSentinelKey() const { return false; }
+
+ protected:
+  void SeekForPrevImpl(const Slice& target, const CompareInterface* cmp) {
+    Seek(target);
+    if (!Valid()) {
+      SeekToLast();
+    }
+    while (Valid() && cmp->Compare(target, key()) < 0) {
+      Prev();
+    }
+  }
+};
+
+using InternalIterator = InternalIteratorBase<Slice>;
+
+// Return an empty iterator (yields nothing).
+template <class TValue>
+extern InternalIteratorBase<TValue>* NewEmptyInternalIterator();
+
+// Return an empty iterator with the specified status.
+template <class TValue>
+extern InternalIteratorBase<TValue>* NewErrorInternalIterator(
+    const Status& status);
+
+// Return an empty iterator with the specified status, allocated from an arena.
+template <class TValue>
+extern InternalIteratorBase<TValue>* NewErrorInternalIterator(
+    const Status& status, Arena* arena);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/iter_heap.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/iter_heap.h
new file mode 100644
index 0000000000..6ad94be9b6
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/iter_heap.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include "db/dbformat.h"
+#include "table/iterator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// When used with std::priority_queue, this comparison functor puts the
+// iterator with the max/largest key on top.
+class MaxIteratorComparator {
+ public:
+  MaxIteratorComparator(const InternalKeyComparator* comparator)
+      : comparator_(comparator) {}
+
+  bool operator()(IteratorWrapper* a, IteratorWrapper* b) const {
+    return comparator_->Compare(a->key(), b->key()) < 0;
+  }
+
+ private:
+  const InternalKeyComparator* comparator_;
+};
+
+// When used with std::priority_queue, this comparison functor puts the
+// iterator with the min/smallest key on top.
+class MinIteratorComparator {
+ public:
+  MinIteratorComparator(const InternalKeyComparator* comparator)
+      : comparator_(comparator) {}
+
+  bool operator()(IteratorWrapper* a, IteratorWrapper* b) const {
+    return comparator_->Compare(a->key(), b->key()) > 0;
+  }
+
+ private:
+  const InternalKeyComparator* comparator_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
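Editorial note: the comparator trick above follows the usual std::priority_queue convention, where the queue is a max-heap, so a greater-than comparator surfaces the smallest element. A toy analogue with strings standing in for iterator positions:

    #include <cassert>
    #include <queue>
    #include <string>
    #include <vector>

    // Greater-than comparator: smallest key on top, as in MinIteratorComparator.
    struct MinOnTop {
      bool operator()(const std::string& a, const std::string& b) const {
        return a > b;
      }
    };

    int main() {
      std::priority_queue<std::string, std::vector<std::string>, MinOnTop> heap;
      for (const char* k : {"m", "a", "z"}) heap.push(k);
      assert(heap.top() == "a");  // smallest key surfaces first
    }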
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/iterator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/iterator.cc
new file mode 100644
index 0000000000..14e280a07b
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/iterator.cc
@@ -0,0 +1,130 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/iterator.h"
+
+#include "memory/arena.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status Iterator::GetProperty(std::string prop_name, std::string* prop) {
+  if (prop == nullptr) {
+    return Status::InvalidArgument("prop is nullptr");
+  }
+  if (prop_name == "rocksdb.iterator.is-key-pinned") {
+    *prop = "0";
+    return Status::OK();
+  }
+  return Status::InvalidArgument("Unidentified property.");
+}
+
+namespace {
+class EmptyIterator : public Iterator {
+ public:
+  explicit EmptyIterator(const Status& s) : status_(s) {}
+  bool Valid() const override { return false; }
+  void Seek(const Slice& /*target*/) override {}
+  void SeekForPrev(const Slice& /*target*/) override {}
+  void SeekToFirst() override {}
+  void SeekToLast() override {}
+  void Next() override { assert(false); }
+  void Prev() override { assert(false); }
+  Slice key() const override {
+    assert(false);
+    return Slice();
+  }
+  Slice value() const override {
+    assert(false);
+    return Slice();
+  }
+  Status status() const override { return status_; }
+
+ private:
+  Status status_;
+};
+
+template <class TValue>
+class EmptyInternalIterator : public InternalIteratorBase<TValue> {
+ public:
+  explicit EmptyInternalIterator(const Status& s) : status_(s) {}
+  bool Valid() const override { return false; }
+  void Seek(const Slice& /*target*/) override {}
+  void SeekForPrev(const Slice& /*target*/) override {}
+  void SeekToFirst() override {}
+  void SeekToLast() override {}
+  void Next() override { assert(false); }
+  void Prev() override { assert(false); }
+  Slice key() const override {
+    assert(false);
+    return Slice();
+  }
+  TValue value() const override {
+    assert(false);
+    return TValue();
+  }
+  Status status() const override { return status_; }
+
+ private:
+  Status status_;
+};
+}  // namespace
+
+Iterator* NewEmptyIterator() { return new EmptyIterator(Status::OK()); }
+
+Iterator* NewErrorIterator(const Status& status) {
+  return new EmptyIterator(status);
+}
+
+template <class TValue>
+InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status) {
+  return new EmptyInternalIterator<TValue>(status);
+}
+template InternalIteratorBase<IndexValue>* NewErrorInternalIterator(
+    const Status& status);
+template InternalIteratorBase<Slice>* NewErrorInternalIterator(
+    const Status& status);
+
+template <class TValue>
+InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status,
+                                                       Arena* arena) {
+  if (arena == nullptr) {
+    return NewErrorInternalIterator<TValue>(status);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(EmptyInternalIterator<TValue>));
+    return new (mem) EmptyInternalIterator<TValue>(status);
+  }
+}
+template InternalIteratorBase<IndexValue>* NewErrorInternalIterator(
+    const Status& status, Arena* arena);
+template InternalIteratorBase<Slice>* NewErrorInternalIterator(
+    const Status& status, Arena* arena);
+
+template <class TValue>
+InternalIteratorBase<TValue>* NewEmptyInternalIterator() {
+  return new EmptyInternalIterator<TValue>(Status::OK());
+}
+template InternalIteratorBase<IndexValue>* NewEmptyInternalIterator();
+template InternalIteratorBase<Slice>* NewEmptyInternalIterator();
+
+template <class TValue>
+InternalIteratorBase<TValue>* NewEmptyInternalIterator(Arena* arena) {
+  if (arena ==
nullptr) {
+    return NewEmptyInternalIterator<TValue>();
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(EmptyInternalIterator<TValue>));
+    return new (mem) EmptyInternalIterator<TValue>(Status::OK());
+  }
+}
+template InternalIteratorBase<IndexValue>* NewEmptyInternalIterator(
+    Arena* arena);
+template InternalIteratorBase<Slice>* NewEmptyInternalIterator(Arena* arena);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/iterator_wrapper.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/iterator_wrapper.h
new file mode 100644
index 0000000000..17abef4ac7
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/iterator_wrapper.h
@@ -0,0 +1,190 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <set>
+
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal wrapper class with an interface similar to Iterator that caches
+// the valid() and key() results for an underlying iterator.
+// This can help avoid virtual function calls and also gives better
+// cache locality.
+template <class TValue>
+class IteratorWrapperBase {
+ public:
+  IteratorWrapperBase() : iter_(nullptr), valid_(false) {}
+  explicit IteratorWrapperBase(InternalIteratorBase<TValue>* _iter)
+      : iter_(nullptr) {
+    Set(_iter);
+  }
+  ~IteratorWrapperBase() {}
+  InternalIteratorBase<TValue>* iter() const { return iter_; }
+
+  // Set the underlying Iterator to _iter and return
+  // previous underlying Iterator.
+  InternalIteratorBase<TValue>* Set(InternalIteratorBase<TValue>* _iter) {
+    InternalIteratorBase<TValue>* old_iter = iter_;
+
+    iter_ = _iter;
+    if (iter_ == nullptr) {
+      valid_ = false;
+    } else {
+      Update();
+    }
+    return old_iter;
+  }
+
+  void DeleteIter(bool is_arena_mode) {
+    if (iter_) {
+      if (!is_arena_mode) {
+        delete iter_;
+      } else {
+        iter_->~InternalIteratorBase();
+      }
+    }
+  }
+
+  // Iterator interface methods
+  bool Valid() const { return valid_; }
+  Slice key() const {
+    assert(Valid());
+    return result_.key;
+  }
+  TValue value() const {
+    assert(Valid());
+    return iter_->value();
+  }
+  // Methods below require iter() != nullptr
+  Status status() const {
+    assert(iter_);
+    return iter_->status();
+  }
+  bool PrepareValue() {
+    assert(Valid());
+    if (result_.value_prepared) {
+      return true;
+    }
+    if (iter_->PrepareValue()) {
+      result_.value_prepared = true;
+      return true;
+    }
+
+    assert(!iter_->Valid());
+    valid_ = false;
+    return false;
+  }
+  void Next() {
+    assert(iter_);
+    valid_ = iter_->NextAndGetResult(&result_);
+    assert(!valid_ || iter_->status().ok());
+  }
+  bool NextAndGetResult(IterateResult* result) {
+    assert(iter_);
+    valid_ = iter_->NextAndGetResult(&result_);
+    *result = result_;
+    assert(!valid_ || iter_->status().ok());
+    return valid_;
+  }
+  void Prev() {
+    assert(iter_);
+    iter_->Prev();
+    Update();
+  }
+  void Seek(const Slice& k) {
+    assert(iter_);
+    iter_->Seek(k);
+    Update();
+  }
+  void SeekForPrev(const Slice& k) {
+    assert(iter_);
+    iter_->SeekForPrev(k);
+    Update();
+  }
+  void SeekToFirst() {
+    assert(iter_);
+    iter_->SeekToFirst();
+    Update();
+  }
+  void SeekToLast() {
+    assert(iter_);
+    iter_->SeekToLast();
+    Update();
+  }
+
+  bool MayBeOutOfLowerBound() {
+    assert(Valid());
+    return iter_->MayBeOutOfLowerBound();
+  }
+
+  IterBoundCheck UpperBoundCheckResult() {
+    assert(Valid());
+    return result_.bound_check_result;
+  }
+
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) {
+    assert(iter_);
+    iter_->SetPinnedItersMgr(pinned_iters_mgr);
+  }
+  bool IsKeyPinned() const {
+    assert(Valid());
+    return iter_->IsKeyPinned();
+  }
+  bool IsValuePinned() const {
+    assert(Valid());
+    return iter_->IsValuePinned();
+  }
+
+  bool IsValuePrepared() const { return result_.value_prepared; }
+
+  Slice user_key() const {
+    assert(Valid());
+    return iter_->user_key();
+  }
+
+  void UpdateReadaheadState(InternalIteratorBase<TValue>* old_iter) {
+    if (old_iter && iter_) {
+      ReadaheadFileInfo readahead_file_info;
+      old_iter->GetReadaheadState(&readahead_file_info);
+      iter_->SetReadaheadState(&readahead_file_info);
+    }
+  }
+
+  bool IsDeleteRangeSentinelKey() const {
+    return iter_->IsDeleteRangeSentinelKey();
+  }
+
+ private:
+  void Update() {
+    valid_ = iter_->Valid();
+    if (valid_) {
+      assert(iter_->status().ok());
+      result_.key = iter_->key();
+      result_.bound_check_result = IterBoundCheck::kUnknown;
+      result_.value_prepared = false;
+    }
+  }
+
+  InternalIteratorBase<TValue>* iter_;
+  IterateResult result_;
+  bool valid_;
+};
+
+using IteratorWrapper = IteratorWrapperBase<Slice>;
+
+class Arena;
+// Return an empty iterator (yields nothing) allocated from arena.
+template <class TValue>
+extern InternalIteratorBase<TValue>* NewEmptyInternalIterator(Arena* arena);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/merging_iterator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/merging_iterator.cc
new file mode 100644
index 0000000000..0fa3fcd3eb
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/merging_iterator.cc
@@ -0,0 +1,1725 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/merging_iterator.h"
+
+#include "db/arena_wrapped_db_iter.h"
+
+namespace ROCKSDB_NAMESPACE {
+// MergingIterator uses a min/max heap to combine data from point iterators.
+// Range tombstones can be added and keys covered by range tombstones will be
+// skipped.
+//
+// The following are implementation details and can be ignored by the user.
+// For the merging iterator to process range tombstones, it treats the start and
+// end keys of a range tombstone as two keys and puts them into minHeap_ or
+// maxHeap_ together with regular point keys. Each range tombstone is active only
+// within its internal key range [start_key, end_key). An `active_` set is used
+// to track levels that have an active range tombstone. Take forward scanning
+// for example. Level j is in active_ if its current range tombstone has its
+// start_key popped from minHeap_ and its end_key in minHeap_. If the top of
+// minHeap_ is a point key from level L, we can determine if the point key is
+// covered by any range tombstone by checking if there is an l <= L in active_.
+// The case of l == L also involves checking range tombstone's sequence number.
+//
+// The following (non-exhaustive) list of invariants are maintained by
+// MergingIterator during forward scanning. After each InternalIterator API,
+// i.e., Seek*() and Next(), and FindNextVisibleKey(), if minHeap_ is not empty:
+// (1) minHeap_.top().type == ITERATOR
+// (2) minHeap_.top()->key() is not covered by any range tombstone.
+//
+// After each call to SeekImpl() in addition to the functions mentioned above:
+// (3) For all level i and j <= i, range_tombstone_iters_[j].prev.end_key() <
+// children_[i].iter.key(). That is, range_tombstone_iters_[j] is at or before
+// the first range tombstone from level j with end_key() >
+// children_[i].iter.key().
+// (4) For all level i and j <= i, if j in active_, then
+// range_tombstone_iters_[j]->start_key() < children_[i].iter.key().
+// - When range_tombstone_iters_[j] is !Valid(), we consider its `prev` to be
+// the last range tombstone from that range tombstone iterator.
+// - When referring to range tombstone start/end keys, assume it is the value of
+// HeapItem::tombstone_pik. This value has op_type = kMaxValid, which makes
+// range tombstone keys have distinct values from point keys.
+//
+// Applicable class variables have their own (forward scanning) invariants
+// listed in the comments above their definition.
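+//
+// A small worked example of the scheme above (illustrative only; keys,
+// levels, and sequence numbers are made up): suppose level 0 has a range
+// tombstone [a, c) whose sequence number covers everything below it, and
+// level 1 has point keys "b" and "d". SeekToFirst() pushes the tombstone's
+// start key "a" and the point key "b" into minHeap_. PopDeleteRangeStart()
+// pops "a", adds level 0 to active_, and pushes the end key "c". The top is
+// now "b" from level 1; since level 0 <= 1 is in active_, "b" is covered,
+// and a cascading seek to "c" repositions level 1 at "d". Once "c" is
+// popped, level 0 leaves active_ and "d" is the first visible key.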
+class MergingIterator : public InternalIterator { + public: + MergingIterator(const InternalKeyComparator* comparator, + InternalIterator** children, int n, bool is_arena_mode, + bool prefix_seek_mode, + const Slice* iterate_upper_bound = nullptr) + : is_arena_mode_(is_arena_mode), + prefix_seek_mode_(prefix_seek_mode), + direction_(kForward), + comparator_(comparator), + current_(nullptr), + minHeap_(MinHeapItemComparator(comparator_)), + pinned_iters_mgr_(nullptr), + iterate_upper_bound_(iterate_upper_bound) { + children_.resize(n); + for (int i = 0; i < n; i++) { + children_[i].level = i; + children_[i].iter.Set(children[i]); + } + } + + void considerStatus(Status s) { + if (!s.ok() && status_.ok()) { + status_ = s; + } + } + + virtual void AddIterator(InternalIterator* iter) { + children_.emplace_back(children_.size(), iter); + if (pinned_iters_mgr_) { + iter->SetPinnedItersMgr(pinned_iters_mgr_); + } + // Invalidate to ensure `Seek*()` is called to construct the heaps before + // use. + current_ = nullptr; + } + + // There must be either no range tombstone iterator or the same number of + // range tombstone iterators as point iterators after all iters are added. + // The i-th added range tombstone iterator and the i-th point iterator + // must point to the same LSM level. + // Merging iterator takes ownership of `iter` and is responsible for freeing + // it. One exception to this is when a LevelIterator moves to a different SST + // file or when Iterator::Refresh() is called, the range tombstone iterator + // could be updated. In that case, this merging iterator is only responsible + // for freeing the new range tombstone iterator that it has pointers to in + // range_tombstone_iters_. + void AddRangeTombstoneIterator(TruncatedRangeDelIterator* iter) { + range_tombstone_iters_.emplace_back(iter); + } + + // Called by MergingIteratorBuilder when all point iterators and range + // tombstone iterators are added. Initializes HeapItems for range tombstone + // iterators. + void Finish() { + if (!range_tombstone_iters_.empty()) { + assert(range_tombstone_iters_.size() == children_.size()); + pinned_heap_item_.resize(range_tombstone_iters_.size()); + for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { + pinned_heap_item_[i].level = i; + // Range tombstone end key is exclusive. If a point internal key has the + // same user key and sequence number as the start or end key of a range + // tombstone, the order will be start < end key < internal key with the + // following op_type change. This is helpful to ensure keys popped from + // heap are in expected order since range tombstone start/end keys will + // be distinct from point internal keys. Strictly speaking, this is only + // needed for tombstone end points that are truncated in + // TruncatedRangeDelIterator since untruncated tombstone end points + // always have kMaxSequenceNumber and kTypeRangeDeletion (see + // TruncatedRangeDelIterator::start_key()/end_key()). + pinned_heap_item_[i].tombstone_pik.type = kTypeMaxValid; + } + } + } + + ~MergingIterator() override { + for (auto child : range_tombstone_iters_) { + delete child; + } + + for (auto& child : children_) { + child.iter.DeleteIter(is_arena_mode_); + } + status_.PermitUncheckedError(); + } + + bool Valid() const override { return current_ != nullptr && status_.ok(); } + + Status status() const override { return status_; } + + // Add range_tombstone_iters_[level] into min heap. + // Updates active_ if the end key of a range tombstone is inserted. 
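+  // For example (illustrative): for a tombstone [a, c) at `level`, a call
+  // with start_key == true pushes a DELETE_RANGE_START item carrying "a",
+  // while a call with start_key == false pushes a DELETE_RANGE_END item
+  // carrying "c" and adds `level` to active_.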
+ // pinned_heap_items_[level].type is updated based on `start_key`. + // + // If range_tombstone_iters_[level] is after iterate_upper_bound_, + // it is removed from the heap. + // @param start_key specifies which end point of the range tombstone to add. + void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true, + bool replace_top = false) { + assert(!range_tombstone_iters_.empty() && + range_tombstone_iters_[level]->Valid()); + // Maintains Invariant(phi) + if (start_key) { + pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_START; + ParsedInternalKey pik = range_tombstone_iters_[level]->start_key(); + // iterate_upper_bound does not have timestamp + if (iterate_upper_bound_ && + comparator_->user_comparator()->CompareWithoutTimestamp( + pik.user_key, true /* a_has_ts */, *iterate_upper_bound_, + false /* b_has_ts */) >= 0) { + if (replace_top) { + // replace_top implies this range tombstone iterator is still in + // minHeap_ and at the top. + minHeap_.pop(); + } + return; + } + pinned_heap_item_[level].SetTombstoneKey(std::move(pik)); + // Checks Invariant(active_) + assert(active_.count(level) == 0); + } else { + // allow end key to go over upper bound (if present) since start key is + // before upper bound and the range tombstone could still cover a + // range before upper bound. + // Maintains Invariant(active_) + pinned_heap_item_[level].SetTombstoneKey( + range_tombstone_iters_[level]->end_key()); + pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_END; + active_.insert(level); + } + if (replace_top) { + minHeap_.replace_top(&pinned_heap_item_[level]); + } else { + minHeap_.push(&pinned_heap_item_[level]); + } + } + + // Add range_tombstone_iters_[level] into max heap. + // Updates active_ if the start key of a range tombstone is inserted. + // @param end_key specifies which end point of the range tombstone to add. + void InsertRangeTombstoneToMaxHeap(size_t level, bool end_key = true, + bool replace_top = false) { + assert(!range_tombstone_iters_.empty() && + range_tombstone_iters_[level]->Valid()); + if (end_key) { + pinned_heap_item_[level].SetTombstoneKey( + range_tombstone_iters_[level]->end_key()); + pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_END; + assert(active_.count(level) == 0); + } else { + pinned_heap_item_[level].SetTombstoneKey( + range_tombstone_iters_[level]->start_key()); + pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_START; + active_.insert(level); + } + if (replace_top) { + maxHeap_->replace_top(&pinned_heap_item_[level]); + } else { + maxHeap_->push(&pinned_heap_item_[level]); + } + } + + // Remove HeapItems from top of minHeap_ that are of type DELETE_RANGE_START + // until minHeap_ is empty or the top of the minHeap_ is not of type + // DELETE_RANGE_START. Each such item means a range tombstone becomes active, + // so `active_` is updated accordingly. + void PopDeleteRangeStart() { + while (!minHeap_.empty() && + minHeap_.top()->type == HeapItem::Type::DELETE_RANGE_START) { + TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr); + // Invariant(rti) holds since + // range_tombstone_iters_[minHeap_.top()->level] is still valid, and + // parameter `replace_top` is set to true here to ensure only one such + // HeapItem is in minHeap_. 
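+      // Concretely (illustrative): if the popped item is the start key of
+      // tombstone [a, c) from level j, the call below swaps it for an
+      // end-key item carrying "c" and inserts j into active_, so point keys
+      // in [a, c) from levels >= j can later be recognized as deleted.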
+ InsertRangeTombstoneToMinHeap( + minHeap_.top()->level, false /* start_key */, true /* replace_top */); + } + } + + // Remove HeapItems from top of maxHeap_ that are of type DELETE_RANGE_END + // until maxHeap_ is empty or the top of the maxHeap_ is not of type + // DELETE_RANGE_END. Each such item means a range tombstone becomes active, + // so `active_` is updated accordingly. + void PopDeleteRangeEnd() { + while (!maxHeap_->empty() && + maxHeap_->top()->type == HeapItem::Type::DELETE_RANGE_END) { + // insert start key of this range tombstone and updates active_ + InsertRangeTombstoneToMaxHeap(maxHeap_->top()->level, false /* end_key */, + true /* replace_top */); + } + } + + void SeekToFirst() override { + ClearHeaps(); + status_ = Status::OK(); + for (auto& child : children_) { + child.iter.SeekToFirst(); + AddToMinHeapOrCheckStatus(&child); + } + + for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { + if (range_tombstone_iters_[i]) { + range_tombstone_iters_[i]->SeekToFirst(); + if (range_tombstone_iters_[i]->Valid()) { + // It is possible to be invalid due to snapshots. + InsertRangeTombstoneToMinHeap(i); + } + } + } + FindNextVisibleKey(); + direction_ = kForward; + current_ = CurrentForward(); + } + + void SeekToLast() override { + ClearHeaps(); + InitMaxHeap(); + status_ = Status::OK(); + for (auto& child : children_) { + child.iter.SeekToLast(); + AddToMaxHeapOrCheckStatus(&child); + } + + for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { + if (range_tombstone_iters_[i]) { + range_tombstone_iters_[i]->SeekToLast(); + if (range_tombstone_iters_[i]->Valid()) { + // It is possible to be invalid due to snapshots. + InsertRangeTombstoneToMaxHeap(i); + } + } + } + FindPrevVisibleKey(); + direction_ = kReverse; + current_ = CurrentReverse(); + } + + // Position this merging iterator at the first key >= target (internal key). + // If range tombstones are present, keys covered by range tombstones are + // skipped, and this merging iter points to the first non-range-deleted key >= + // target after Seek(). If !Valid() and status().ok() then this iterator + // reaches the end. + // + // If range tombstones are present, cascading seeks may be called (an + // optimization adapted from Pebble https://github.com/cockroachdb/pebble). + // Roughly, if there is a range tombstone [start, end) that covers the + // target user key at level L, then this range tombstone must cover the range + // [target key, end) in all levels > L. So for all levels > L, we can pretend + // the target key is `end`. This optimization is applied at each level and + // hence the name "cascading seek". + void Seek(const Slice& target) override { + // Define LevelNextVisible(i, k) to be the first key >= k in level i that is + // not covered by any range tombstone. + // After SeekImpl(target, 0), invariants (3) and (4) hold. + // For all level i, target <= children_[i].iter.key() <= LevelNextVisible(i, + // target). By the contract of FindNextVisibleKey(), Invariants (1)-(4) + // holds after this call, and minHeap_.top().iter points to the + // first key >= target among children_ that is not covered by any range + // tombstone. 
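+    // Illustrative example of the cascading seek (made-up keys): if level 1
+    // has a tombstone [b, f) that covers the target "c", then no key in
+    // [c, f) from levels > 1 can be visible, so those levels are seeked
+    // directly to "f" instead of "c", skipping keys that would be discarded
+    // anyway.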
+ SeekImpl(target); + FindNextVisibleKey(); + + direction_ = kForward; + { + PERF_TIMER_GUARD(seek_min_heap_time); + current_ = CurrentForward(); + } + } + + void SeekForPrev(const Slice& target) override { + assert(range_tombstone_iters_.empty() || + range_tombstone_iters_.size() == children_.size()); + SeekForPrevImpl(target); + FindPrevVisibleKey(); + + direction_ = kReverse; + { + PERF_TIMER_GUARD(seek_max_heap_time); + current_ = CurrentReverse(); + } + } + + void Next() override { + assert(Valid()); + // Ensure that all children are positioned after key(). + // If we are moving in the forward direction, it is already + // true for all the non-current children since current_ is + // the smallest child and key() == current_->key(). + if (direction_ != kForward) { + // The loop advanced all non-current children to be > key() so current_ + // should still be strictly the smallest key. + SwitchToForward(); + } + + // For the heap modifications below to be correct, current_ must be the + // current top of the heap. + assert(current_ == CurrentForward()); + // as the current points to the current record. move the iterator forward. + current_->Next(); + if (current_->Valid()) { + // current is still valid after the Next() call above. Call + // replace_top() to restore the heap property. When the same child + // iterator yields a sequence of keys, this is cheap. + assert(current_->status().ok()); + minHeap_.replace_top(minHeap_.top()); + } else { + // current stopped being valid, remove it from the heap. + considerStatus(current_->status()); + minHeap_.pop(); + } + // Invariants (3) and (4) hold when after advancing current_. + // Let k be the smallest key among children_[i].iter.key(). + // k <= children_[i].iter.key() <= LevelNextVisible(i, k) holds for all + // level i. After FindNextVisible(), Invariants (1)-(4) hold and + // minHeap_.top()->key() is the first key >= k from any children_ that is + // not covered by any range tombstone. + FindNextVisibleKey(); + current_ = CurrentForward(); + } + + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->bound_check_result = UpperBoundCheckResult(); + result->value_prepared = current_->IsValuePrepared(); + } + return is_valid; + } + + void Prev() override { + assert(Valid()); + // Ensure that all children are positioned before key(). + // If we are moving in the reverse direction, it is already + // true for all the non-current children since current_ is + // the largest child and key() == current_->key(). + if (direction_ != kReverse) { + // Otherwise, retreat the non-current children. We retreat current_ + // just after the if-block. + SwitchToBackward(); + } + + // For the heap modifications below to be correct, current_ must be the + // current top of the heap. + assert(current_ == CurrentReverse()); + current_->Prev(); + if (current_->Valid()) { + // current is still valid after the Prev() call above. Call + // replace_top() to restore the heap property. When the same child + // iterator yields a sequence of keys, this is cheap. + assert(current_->status().ok()); + maxHeap_->replace_top(maxHeap_->top()); + } else { + // current stopped being valid, remove it from the heap. 
+      considerStatus(current_->status());
+      maxHeap_->pop();
+    }
+    FindPrevVisibleKey();
+    current_ = CurrentReverse();
+  }
+
+  Slice key() const override {
+    assert(Valid());
+    return current_->key();
+  }
+
+  Slice value() const override {
+    assert(Valid());
+    return current_->value();
+  }
+
+  bool PrepareValue() override {
+    assert(Valid());
+    if (current_->PrepareValue()) {
+      return true;
+    }
+
+    considerStatus(current_->status());
+    assert(!status_.ok());
+    return false;
+  }
+
+  // Here we simply relay the MayBeOutOfLowerBound/MayBeOutOfUpperBound result
+  // from the current child iterator: as long as no child iterator can be out
+  // of bound, we know the current key is within bounds.
+  bool MayBeOutOfLowerBound() override {
+    assert(Valid());
+    return current_->MayBeOutOfLowerBound();
+  }
+
+  IterBoundCheck UpperBoundCheckResult() override {
+    assert(Valid());
+    return current_->UpperBoundCheckResult();
+  }
+
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    pinned_iters_mgr_ = pinned_iters_mgr;
+    for (auto& child : children_) {
+      child.iter.SetPinnedItersMgr(pinned_iters_mgr);
+    }
+  }
+
+  bool IsKeyPinned() const override {
+    assert(Valid());
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           current_->IsKeyPinned();
+  }
+
+  bool IsValuePinned() const override {
+    assert(Valid());
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           current_->IsValuePinned();
+  }
+
+ private:
+  // Represents an element in the min/max heap. Each HeapItem corresponds to a
+  // point iterator or a range tombstone iterator, differentiated by
+  // HeapItem::type.
+  struct HeapItem {
+    HeapItem() = default;
+
+    // corresponding point iterator
+    IteratorWrapper iter;
+    size_t level = 0;
+    // corresponding range tombstone iterator's start or end key value
+    // depending on value of `type`.
+    ParsedInternalKey tombstone_pik;
+    // Will be overwritten before use, initialize here so compiler does not
+    // complain.
+    enum class Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
+    Type type = Type::ITERATOR;
+
+    explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
+        : level(_level), type(Type::ITERATOR) {
+      iter.Set(_iter);
+    }
+
+    void SetTombstoneKey(ParsedInternalKey&& pik) {
+      // op_type is already initialized in MergingIterator::Finish().
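+      // Only user_key and sequence are copied: `type` keeps the kTypeMaxValid
+      // value assigned in Finish(), so tombstone keys stay distinct from
+      // point keys in heap ordering.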
+      tombstone_pik.user_key = pik.user_key;
+      tombstone_pik.sequence = pik.sequence;
+    }
+  };
+
+  class MinHeapItemComparator {
+   public:
+    explicit MinHeapItemComparator(const InternalKeyComparator* comparator)
+        : comparator_(comparator) {}
+
+    bool operator()(HeapItem* a, HeapItem* b) const {
+      if (LIKELY(a->type == HeapItem::Type::ITERATOR)) {
+        if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
+          return comparator_->Compare(a->iter.key(), b->iter.key()) > 0;
+        } else {
+          return comparator_->Compare(a->iter.key(), b->tombstone_pik) > 0;
+        }
+      } else {
+        if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
+          return comparator_->Compare(a->tombstone_pik, b->iter.key()) > 0;
+        } else {
+          return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) > 0;
+        }
+      }
+    }
+
+   private:
+    const InternalKeyComparator* comparator_;
+  };
+
+  class MaxHeapItemComparator {
+   public:
+    explicit MaxHeapItemComparator(const InternalKeyComparator* comparator)
+        : comparator_(comparator) {}
+
+    bool operator()(HeapItem* a, HeapItem* b) const {
+      if (LIKELY(a->type == HeapItem::Type::ITERATOR)) {
+        if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
+          return comparator_->Compare(a->iter.key(), b->iter.key()) < 0;
+        } else {
+          return comparator_->Compare(a->iter.key(), b->tombstone_pik) < 0;
+        }
+      } else {
+        if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
+          return comparator_->Compare(a->tombstone_pik, b->iter.key()) < 0;
+        } else {
+          return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) < 0;
+        }
+      }
+    }
+
+   private:
+    const InternalKeyComparator* comparator_;
+  };
+
+  using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>;
+  using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>;
+
+  friend class MergeIteratorBuilder;
+  // Clears heaps for both directions, used when changing direction or seeking
+  void ClearHeaps(bool clear_active = true);
+  // Ensures that maxHeap_ is initialized when starting to go in the reverse
+  // direction
+  void InitMaxHeap();
+  // Advance this merging iterator until the current key (minHeap_.top()) is
+  // from a point iterator and is not covered by any range tombstone,
+  // or there are no more keys (heap is empty). SeekImpl() may be called
+  // to seek to the end of a range tombstone as an optimization.
+  void FindNextVisibleKey();
+  void FindPrevVisibleKey();
+
+  // Advance this merging iterator to the first key >= `target` for all
+  // components from levels >= starting_level. All iterators before
+  // starting_level are untouched.
+  //
+  // @param range_tombstone_reseek Whether target is some range tombstone
+  // end, i.e., whether this SeekImpl() call is a part of a "cascading seek".
+  // This is used only for recording relevant perf_context.
+  void SeekImpl(const Slice& target, size_t starting_level = 0,
+                bool range_tombstone_reseek = false);
+
+  // Seek to first key <= target key (internal key) for
+  // children_[starting_level:].
+  void SeekForPrevImpl(const Slice& target, size_t starting_level = 0,
+                       bool range_tombstone_reseek = false);
+
+  bool is_arena_mode_;
+  bool prefix_seek_mode_;
+  // Which direction is the iterator moving?
+  enum Direction : uint8_t { kForward, kReverse };
+  Direction direction_;
+  const InternalKeyComparator* comparator_;
+  // HeapItem for all child point iterators.
+  // Invariant(children_): children_[i] is in minHeap_ iff
+  // children_[i].iter.Valid(), and at most one children_[i] is in minHeap_.
+  // TODO: We could use an autovector with a larger reserved size.
+  std::vector<HeapItem> children_;
+  // HeapItem for range tombstone start and end keys.
+  // pinned_heap_item_[i] corresponds to range_tombstone_iters_[i].
+  // Invariant(phi): If range_tombstone_iters_[i]->Valid(),
+  // pinned_heap_item_[i].tombstone_pik is equal to
+  // range_tombstone_iters_[i]->start_key() when
+  // pinned_heap_item_[i].type is DELETE_RANGE_START and
+  // range_tombstone_iters_[i]->end_key() when
+  // pinned_heap_item_[i].type is DELETE_RANGE_END (ignoring op_type which is
+  // kMaxValid for all pinned_heap_item_.tombstone_pik).
+  // pinned_heap_item_[i].type is either DELETE_RANGE_START or DELETE_RANGE_END.
+  std::vector<HeapItem> pinned_heap_item_;
+  // range_tombstone_iters_[i] contains range tombstones in the sorted run that
+  // corresponds to children_[i]. range_tombstone_iters_.empty() means not
+  // handling range tombstones in merging iterator. range_tombstone_iters_[i] ==
+  // nullptr means the sorted run of children_[i] does not have range
+  // tombstones.
+  // Invariant(rti): pinned_heap_item_[i] is in minHeap_ iff
+  // range_tombstone_iters_[i]->Valid() and at most one pinned_heap_item_[i] is
+  // in minHeap_.
+  std::vector<TruncatedRangeDelIterator*> range_tombstone_iters_;
+
+  // Levels (indices into range_tombstone_iters_/children_ ) that currently have
+  // "active" range tombstones. See comments above MergingIterator for meaning
+  // of "active".
+  // Invariant(active_): i is in active_ iff range_tombstone_iters_[i]->Valid()
+  // and pinned_heap_item_[i].type == DELETE_RANGE_END.
+  std::set<size_t> active_;
+
+  bool SkipNextDeleted();
+
+  bool SkipPrevDeleted();
+
+  // Invariant: at the end of each InternalIterator API,
+  // current_ points to minHeap_.top().iter (maxHeap_ if backward scanning)
+  // or nullptr if no child iterator is valid.
+  // This follows from that current_ = CurrentForward()/CurrentReverse() is
+  // called at the end of each InternalIterator API.
+  IteratorWrapper* current_;
+  // If any of the children have non-ok status, this is one of them.
+  Status status_;
+  // Invariant: min heap property is maintained (parent is always <= child).
+  // This holds by using only BinaryHeap APIs to modify heap. One
+  // exception is to modify heap top item directly (by caller iter->Next()), and
+  // it should be followed by a call to replace_top() or pop().
+  MergerMinIterHeap minHeap_;
+
+  // Max heap is used for reverse iteration, which is way less common than
+  // forward. Lazily initialize it to save memory.
+  std::unique_ptr<MergerMaxIterHeap> maxHeap_;
+  PinnedIteratorsManager* pinned_iters_mgr_;
+
+  // Used to bound range tombstones. For point keys, DBIter and SSTable iterator
+  // take care of boundary checking.
+  const Slice* iterate_upper_bound_;
+
+  // In forward direction, process a child that is not in the min heap.
+  // If valid, add to the min heap. Otherwise, check status.
+  void AddToMinHeapOrCheckStatus(HeapItem*);
+
+  // In backward direction, process a child that is not in the max heap.
+  // If valid, add to the max heap. Otherwise, check status.
+  void AddToMaxHeapOrCheckStatus(HeapItem*);
+
+  void SwitchToForward();
+
+  // Switch the direction from forward to backward without changing the
+  // position. Iterator should still be valid.
+  void SwitchToBackward();
+
+  IteratorWrapper* CurrentForward() const {
+    assert(direction_ == kForward);
+    assert(minHeap_.empty() ||
+           minHeap_.top()->type == HeapItem::Type::ITERATOR);
+    return !minHeap_.empty() ?
&minHeap_.top()->iter : nullptr; + } + + IteratorWrapper* CurrentReverse() const { + assert(direction_ == kReverse); + assert(maxHeap_); + assert(maxHeap_->empty() || + maxHeap_->top()->type == HeapItem::Type::ITERATOR); + return !maxHeap_->empty() ? &maxHeap_->top()->iter : nullptr; + } +}; + +// Pre-condition: +// - Invariants (3) and (4) hold for i < starting_level +// - For i < starting_level, range_tombstone_iters_[i].prev.end_key() < +// `target`. +// - For i < starting_level, if i in active_, then +// range_tombstone_iters_[i]->start_key() < `target`. +// +// Post-condition: +// - Invariants (3) and (4) hold for all level i. +// - (*) target <= children_[i].iter.key() <= LevelNextVisible(i, target) +// for i >= starting_level +// - (**) target < pinned_heap_item_[i].tombstone_pik if +// range_tombstone_iters_[i].Valid() for i >= starting_level +// +// Proof sketch: +// Invariant (3) holds for all level i. +// For j <= i < starting_level, it follows from Pre-condition that (3) holds +// and that SeekImpl(-, starting_level) does not update children_[i] or +// range_tombstone_iters_[j]. +// For j < starting_level and i >= starting_level, it follows from +// - Pre-condition that range_tombstone_iters_[j].prev.end_key() < `target` +// - range_tombstone_iters_[j] is not updated in SeekImpl(), and +// - children_[i].iter.Seek(current_search_key) is called with +// current_search_key >= target (shown below). +// When current_search_key is updated, it is updated to some +// range_tombstone_iter->end_key() after +// range_tombstone_iter->SeekInternalKey(current_search_key) was called. So +// current_search_key increases if updated and >= target. +// For starting_level <= j <= i: +// children_[i].iter.Seek(k1) and range_tombstone_iters_[j]->SeekInternalKey(k2) +// are called in SeekImpl(). Seek(k1) positions children_[i] at the first key >= +// k1 from level i. SeekInternalKey(k2) positions range_tombstone_iters_[j] at +// the first range tombstone from level j with end_key() > k2. It suffices to +// show that k1 >= k2. Since k1 and k2 are values of current_search_key where +// k1 = k2 or k1 is value of a later current_search_key than k2, so k1 >= k2. +// +// Invariant (4) holds for all level >= 0. +// By Pre-condition Invariant (4) holds for i < starting_level. +// Since children_[i], range_tombstone_iters_[i] and contents of active_ for +// i < starting_level do not change (4) holds for j <= i < starting_level. +// By Pre-condition: for all j < starting_level, if j in active_, then +// range_tombstone_iters_[j]->start_key() < target. For i >= starting_level, +// children_[i].iter.Seek(k) is called for k >= target. So +// children_[i].iter.key() >= target > range_tombstone_iters_[j]->start_key() +// for j < starting_level and i >= starting_level. So invariant (4) holds for +// j < starting_level and i >= starting_level. +// For starting_level <= j <= i, j is added to active_ only if +// - range_tombstone_iters_[j]->SeekInternalKey(k1) was called +// - range_tombstone_iters_[j]->start_key() <= k1 +// Since children_[i].iter.Seek(k2) is called for some k2 >= k1 and for all +// starting_level <= j <= i, (4) also holds for all starting_level <= j <= i. +// +// Post-condition (*): target <= children_[i].iter.key() <= LevelNextVisible(i, +// target) for i >= starting_level. +// target <= children_[i].iter.key() follows from that Seek() is called on some +// current_search_key >= target for children_[i].iter. 
If current_search_key +// is updated from k1 to k2 when level = i, we show that the range [k1, k2) is +// not visible for children_[j] for any j > i. When current_search_key is +// updated from k1 to k2, +// - range_tombstone_iters_[i]->SeekInternalKey(k1) was called +// - range_tombstone_iters_[i]->Valid() +// - range_tombstone_iters_[i]->start_key().user_key <= k1.user_key +// - k2 = range_tombstone_iters_[i]->end_key() +// We assume that range_tombstone_iters_[i]->start_key() has a higher sequence +// number compared to any key from levels > i that has the same user key. So no +// point key from levels > i in range [k1, k2) is visible. So +// children_[i].iter.key() <= LevelNextVisible(i, target). +// +// Post-condition (**) target < pinned_heap_item_[i].tombstone_pik for i >= +// starting_level if range_tombstone_iters_[i].Valid(). This follows from that +// SeekInternalKey() being called for each range_tombstone_iters_ with some key +// >= `target` and that we pick start/end key that is > `target` to insert to +// minHeap_. +void MergingIterator::SeekImpl(const Slice& target, size_t starting_level, + bool range_tombstone_reseek) { + // active range tombstones before `starting_level` remain active + ClearHeaps(false /* clear_active */); + ParsedInternalKey pik; + if (!range_tombstone_iters_.empty()) { + // pik is only used in InsertRangeTombstoneToMinHeap(). + ParseInternalKey(target, &pik, false).PermitUncheckedError(); + } + + // TODO: perhaps we could save some upheap cost by add all child iters first + // and then do a single heapify. + // Invariant(children_) for level < starting_level + for (size_t level = 0; level < starting_level; ++level) { + PERF_TIMER_GUARD(seek_min_heap_time); + AddToMinHeapOrCheckStatus(&children_[level]); + } + if (!range_tombstone_iters_.empty()) { + // Add range tombstones from levels < starting_level. We can insert from + // pinned_heap_item_ for the following reasons: + // - pinned_heap_item_[level] is in minHeap_ iff + // range_tombstone_iters[level]->Valid(). + // - If `level` is in active_, then range_tombstone_iters_[level]->Valid() + // and pinned_heap_item_[level] is of type RANGE_DELETION_END. + for (size_t level = 0; level < starting_level; ++level) { + // Restores Invariants(rti), (phi) and (active_) for level < + // starting_level + if (range_tombstone_iters_[level] && + range_tombstone_iters_[level]->Valid()) { + // use an iterator on active_ if performance becomes an issue here + if (active_.count(level) > 0) { + assert(pinned_heap_item_[level].type == + HeapItem::Type::DELETE_RANGE_END); + // if it was active, then start key must be within upper_bound, + // so we can add to minHeap_ directly. + minHeap_.push(&pinned_heap_item_[level]); + } else { + assert(pinned_heap_item_[level].type == + HeapItem::Type::DELETE_RANGE_START); + // this takes care of checking iterate_upper_bound, but with an extra + // key comparison if range_tombstone_iters_[level] was already out of + // bound. Consider using a new HeapItem type or some flag to remember + // boundary checking result. + InsertRangeTombstoneToMinHeap(level); + } + } else { + assert(!active_.count(level)); + } + } + // levels >= starting_level will be reseeked below, so clearing their active + // state here. 
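+    // For example (illustrative): with starting_level == 2 and
+    // active_ == {0, 3, 5}, the erase below leaves active_ == {0}; levels 3
+    // and 5 can only re-enter active_ via the reseek that follows.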
+    active_.erase(active_.lower_bound(starting_level), active_.end());
+  }
+
+  status_ = Status::OK();
+  IterKey current_search_key;
+  current_search_key.SetInternalKey(target, false /* copy */);
+  // Seek target might change to some range tombstone end key, so
+  // we need to remember them for async requests.
+  // (level, target) pairs
+  autovector<std::pair<size_t, std::string>> prefetched_target;
+  for (auto level = starting_level; level < children_.size(); ++level) {
+    {
+      PERF_TIMER_GUARD(seek_child_seek_time);
+      children_[level].iter.Seek(current_search_key.GetInternalKey());
+    }
+
+    PERF_COUNTER_ADD(seek_child_seek_count, 1);
+
+    if (!range_tombstone_iters_.empty()) {
+      if (range_tombstone_reseek) {
+        // This seek is to some range tombstone end key.
+        // Should only happen when there are range tombstones.
+        PERF_COUNTER_ADD(internal_range_del_reseek_count, 1);
+      }
+      if (children_[level].iter.status().IsTryAgain()) {
+        prefetched_target.emplace_back(
+            level, current_search_key.GetInternalKey().ToString());
+      }
+      auto range_tombstone_iter = range_tombstone_iters_[level];
+      if (range_tombstone_iter) {
+        range_tombstone_iter->SeekInternalKey(
+            current_search_key.GetInternalKey());
+        // Invariants (rti) and (phi)
+        if (range_tombstone_iter->Valid()) {
+          // If range tombstone starts after `current_search_key`,
+          // we should insert start key to heap as the range tombstone is not
+          // active yet.
+          InsertRangeTombstoneToMinHeap(
+              level, comparator_->Compare(range_tombstone_iter->start_key(),
+                                          pik) > 0 /* start_key */);
+          // current_search_key < end_key guaranteed by the SeekInternalKey()
+          // and Valid() calls above. Here we only need to compare user_key
+          // since if target.user_key ==
+          // range_tombstone_iter->start_key().user_key and target <
+          // range_tombstone_iter->start_key(), no older level would have any
+          // key in range [target, range_tombstone_iter->start_key()], so no
+          // keys in range [target, range_tombstone_iter->end_key()) from older
+          // level would be visible. So it is safe to seek to
+          // range_tombstone_iter->end_key().
+          //
+          // TODO: range_tombstone_iter->Seek() finds the max covering
+          // sequence number, can make it cheaper by not looking for max.
+          if (comparator_->user_comparator()->Compare(
+                  range_tombstone_iter->start_key().user_key,
+                  current_search_key.GetUserKey()) <= 0) {
+            range_tombstone_reseek = true;
+            // Note that for prefix seek case, it is possible that the prefix
+            // is not the same as the original target, it should not affect
+            // correctness. Besides, in most cases, range tombstone start and
+            // end key should have the same prefix?
+            current_search_key.SetInternalKey(range_tombstone_iter->end_key());
+          }
+        }
+      }
+    }
+    // child.iter.status() is set to Status::TryAgain indicating asynchronous
+    // request for retrieval of data blocks has been submitted. So it should
+    // return at this point and Seek should be called again to retrieve the
+    // requested block and add the child to min heap.
+    if (children_[level].iter.status().IsTryAgain()) {
+      continue;
+    }
+    {
+      // Strictly, we timed slightly more than min heap operation,
+      // but these operations are very cheap.
+ PERF_TIMER_GUARD(seek_min_heap_time); + AddToMinHeapOrCheckStatus(&children_[level]); + } + } + + if (range_tombstone_iters_.empty()) { + for (auto& child : children_) { + if (child.iter.status().IsTryAgain()) { + child.iter.Seek(target); + { + PERF_TIMER_GUARD(seek_min_heap_time); + AddToMinHeapOrCheckStatus(&child); + } + PERF_COUNTER_ADD(number_async_seek, 1); + } + } + } else { + for (auto& prefetch : prefetched_target) { + // (level, target) pairs + children_[prefetch.first].iter.Seek(prefetch.second); + { + PERF_TIMER_GUARD(seek_min_heap_time); + AddToMinHeapOrCheckStatus(&children_[prefetch.first]); + } + PERF_COUNTER_ADD(number_async_seek, 1); + } + } +} + +// Returns true iff the current key (min heap top) should not be returned +// to user (of the merging iterator). This can be because the current key +// is deleted by some range tombstone, the current key is some fake file +// boundary sentinel key, or the current key is an end point of a range +// tombstone. Advance the iterator at heap top if needed. Heap order is restored +// and `active_` is updated accordingly. +// See FindNextVisibleKey() for more detail on internal implementation +// of advancing child iters. +// When false is returned, if minHeap is not empty, then minHeap_.top().type +// == ITERATOR +// +// REQUIRES: +// - min heap is currently not empty, and iter is in kForward direction. +// - minHeap_ top is not DELETE_RANGE_START (so that `active_` is current). +bool MergingIterator::SkipNextDeleted() { + // 3 types of keys: + // - point key + // - file boundary sentinel keys + // - range deletion end key + auto current = minHeap_.top(); + if (current->type == HeapItem::Type::DELETE_RANGE_END) { + // Invariant(active_): range_tombstone_iters_[current->level] is about to + // become !Valid() or that its start key is going to be added to minHeap_. + active_.erase(current->level); + assert(range_tombstone_iters_[current->level] && + range_tombstone_iters_[current->level]->Valid()); + range_tombstone_iters_[current->level]->Next(); + // Maintain Invariants (rti) and (phi) + if (range_tombstone_iters_[current->level]->Valid()) { + InsertRangeTombstoneToMinHeap(current->level, true /* start_key */, + true /* replace_top */); + } else { + minHeap_.pop(); + } + return true /* current key deleted */; + } + if (current->iter.IsDeleteRangeSentinelKey()) { + // If the file boundary is defined by a range deletion, the range + // tombstone's end key must come before this sentinel key (see op_type in + // SetTombstoneKey()). + assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion || + active_.count(current->level) == 0); + // When entering a new file, range tombstone iter from the old file is + // freed, but the last key from that range tombstone iter may still be in + // the heap. We need to ensure the data underlying its corresponding key + // Slice is still alive. We do so by popping the range tombstone key from + // heap before calling iter->Next(). Technically, this change is not needed: + // if there is a range tombstone end key that is after file boundary + // sentinel key in minHeap_, the range tombstone end key must have been + // truncated at file boundary. The underlying data of the range tombstone + // end key Slice is the SST file's largest internal key stored as file + // metadata in Version. However, since there are too many implicit + // assumptions made, it is safer to just ensure range tombstone iter is + // still alive. 
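+    // Illustrative straddling case: a tombstone [a, z) and point keys split
+    // across two SST files leave both the old file's boundary sentinel and
+    // its truncated end key of [a, z) in minHeap_; the end key must be
+    // dropped before the old file's iterator is released below.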
+ minHeap_.pop(); + // Remove last SST file's range tombstone end key if there is one. + // This means file boundary is before range tombstone end key, + // which could happen when a range tombstone and a user key + // straddle two SST files. Note that in TruncatedRangeDelIterator + // constructor, parsed_largest.sequence is decremented 1 in this case. + // Maintains Invariant(rti) that at most one + // pinned_heap_item_[current->level] is in minHeap_. + if (range_tombstone_iters_[current->level] && + range_tombstone_iters_[current->level]->Valid()) { + if (!minHeap_.empty() && minHeap_.top()->level == current->level) { + assert(minHeap_.top()->type == HeapItem::Type::DELETE_RANGE_END); + minHeap_.pop(); + // Invariant(active_): we are about to enter a new SST file with new + // range_tombstone_iters[current->level]. Either it is !Valid() or its + // start key is going to be added to minHeap_. + active_.erase(current->level); + } else { + // range tombstone is still valid, but it is not on heap. + // This should only happen if the range tombstone is over iterator + // upper bound. + assert(iterate_upper_bound_ && + comparator_->user_comparator()->CompareWithoutTimestamp( + range_tombstone_iters_[current->level]->start_key().user_key, + true /* a_has_ts */, *iterate_upper_bound_, + false /* b_has_ts */) >= 0); + } + } + // LevelIterator enters a new SST file + current->iter.Next(); + // Invariant(children_): current is popped from heap and added back only if + // it is valid + if (current->iter.Valid()) { + assert(current->iter.status().ok()); + minHeap_.push(current); + } + // Invariants (rti) and (phi) + if (range_tombstone_iters_[current->level] && + range_tombstone_iters_[current->level]->Valid()) { + InsertRangeTombstoneToMinHeap(current->level); + } + return true /* current key deleted */; + } + assert(current->type == HeapItem::Type::ITERATOR); + // Point key case: check active_ for range tombstone coverage. + ParsedInternalKey pik; + ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError(); + if (!active_.empty()) { + auto i = *active_.begin(); + if (i < current->level) { + // range tombstone is from a newer level, definitely covers + assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(), + pik) <= 0); + assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) < + 0); + std::string target; + AppendInternalKey(&target, range_tombstone_iters_[i]->end_key()); + SeekImpl(target, current->level, true); + return true /* current key deleted */; + } else if (i == current->level) { + // range tombstone is from the same level as current, check sequence + // number. By `active_` we know current key is between start key and end + // key. + assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(), + pik) <= 0); + assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) < + 0); + if (pik.sequence < range_tombstone_iters_[current->level]->seq()) { + // covered by range tombstone + current->iter.Next(); + // Invariant (children_) + if (current->iter.Valid()) { + minHeap_.replace_top(current); + } else { + minHeap_.pop(); + } + return true /* current key deleted */; + } else { + return false /* current key not deleted */; + } + } else { + return false /* current key not deleted */; + // range tombstone from an older sorted run with current key < end key. + // current key is not deleted and the older sorted run will have its range + // tombstone updated when the range tombstone's end key are popped from + // minHeap_. 
+    }
+  }
+  // we can reach here only if active_ is empty
+  assert(active_.empty());
+  assert(minHeap_.top()->type == HeapItem::Type::ITERATOR);
+  return false /* current key not deleted */;
+}
+
+void MergingIterator::SeekForPrevImpl(const Slice& target,
+                                      size_t starting_level,
+                                      bool range_tombstone_reseek) {
+  // active range tombstones before `starting_level` remain active
+  ClearHeaps(false /* clear_active */);
+  InitMaxHeap();
+  ParsedInternalKey pik;
+  if (!range_tombstone_iters_.empty()) {
+    ParseInternalKey(target, &pik, false).PermitUncheckedError();
+  }
+  for (size_t level = 0; level < starting_level; ++level) {
+    PERF_TIMER_GUARD(seek_max_heap_time);
+    AddToMaxHeapOrCheckStatus(&children_[level]);
+  }
+  if (!range_tombstone_iters_.empty()) {
+    // Add range tombstones before starting_level.
+    for (size_t level = 0; level < starting_level; ++level) {
+      if (range_tombstone_iters_[level] &&
+          range_tombstone_iters_[level]->Valid()) {
+        assert(static_cast<bool>(active_.count(level)) ==
+               (pinned_heap_item_[level].type ==
+                HeapItem::Type::DELETE_RANGE_START));
+        maxHeap_->push(&pinned_heap_item_[level]);
+      } else {
+        assert(!active_.count(level));
+      }
+    }
+    // levels >= starting_level will be reseeked below, so clear their active
+    // state here.
+    active_.erase(active_.lower_bound(starting_level), active_.end());
+  }
+
+  status_ = Status::OK();
+  IterKey current_search_key;
+  current_search_key.SetInternalKey(target, false /* copy */);
+  // Seek target might change to some range tombstone end key, so
+  // we need to remember them for async requests.
+  // (level, target) pairs
+  autovector<std::pair<size_t, std::string>> prefetched_target;
+  for (auto level = starting_level; level < children_.size(); ++level) {
+    {
+      PERF_TIMER_GUARD(seek_child_seek_time);
+      children_[level].iter.SeekForPrev(current_search_key.GetInternalKey());
+    }
+
+    PERF_COUNTER_ADD(seek_child_seek_count, 1);
+
+    if (!range_tombstone_iters_.empty()) {
+      if (range_tombstone_reseek) {
+        // This seek is to some range tombstone end key.
+        // Should only happen when there are range tombstones.
+        PERF_COUNTER_ADD(internal_range_del_reseek_count, 1);
+      }
+      if (children_[level].iter.status().IsTryAgain()) {
+        prefetched_target.emplace_back(
+            level, current_search_key.GetInternalKey().ToString());
+      }
+      auto range_tombstone_iter = range_tombstone_iters_[level];
+      if (range_tombstone_iter) {
+        range_tombstone_iter->SeekForPrev(current_search_key.GetUserKey());
+        if (range_tombstone_iter->Valid()) {
+          InsertRangeTombstoneToMaxHeap(
+              level, comparator_->Compare(range_tombstone_iter->end_key(),
+                                          pik) <= 0 /* end_key */);
+          // start key <= current_search_key guaranteed by the Seek() call above
+          // Only interested in user key coverage since older sorted runs must
+          // have smaller sequence numbers than this tombstone.
+          if (comparator_->user_comparator()->Compare(
+                  current_search_key.GetUserKey(),
+                  range_tombstone_iter->end_key().user_key) < 0) {
+            range_tombstone_reseek = true;
+            current_search_key.SetInternalKey(
+                range_tombstone_iter->start_key().user_key, kMaxSequenceNumber,
+                kValueTypeForSeekForPrev);
+          }
+        }
+      }
+    }
+    // child.iter.status() is set to Status::TryAgain indicating asynchronous
+    // request for retrieval of data blocks has been submitted. So it should
+    // return at this point and Seek should be called again to retrieve the
+    // requested block and add the child to min heap.
+    if (children_[level].iter.status().IsTryAgain()) {
+      continue;
+    }
+    {
+      // Strictly, we timed slightly more than min heap operation,
+      // but these operations are very cheap.
+      PERF_TIMER_GUARD(seek_max_heap_time);
+      AddToMaxHeapOrCheckStatus(&children_[level]);
+    }
+  }
+
+  if (range_tombstone_iters_.empty()) {
+    for (auto& child : children_) {
+      if (child.iter.status().IsTryAgain()) {
+        child.iter.SeekForPrev(target);
+        {
+          PERF_TIMER_GUARD(seek_min_heap_time);
+          AddToMaxHeapOrCheckStatus(&child);
+        }
+        PERF_COUNTER_ADD(number_async_seek, 1);
+      }
+    }
+  } else {
+    for (auto& prefetch : prefetched_target) {
+      // (level, target) pairs
+      children_[prefetch.first].iter.SeekForPrev(prefetch.second);
+      {
+        PERF_TIMER_GUARD(seek_max_heap_time);
+        AddToMaxHeapOrCheckStatus(&children_[prefetch.first]);
+      }
+      PERF_COUNTER_ADD(number_async_seek, 1);
+    }
+  }
+}
+
+// See more in comments above SkipNextDeleted().
+// REQUIRES:
+// - max heap is currently not empty, and iter is in kReverse direction.
+// - maxHeap_ top is not DELETE_RANGE_END (so that `active_` is current).
+bool MergingIterator::SkipPrevDeleted() {
+  // 3 types of keys:
+  // - point key
+  // - file boundary sentinel keys
+  // - range deletion start key
+  auto current = maxHeap_->top();
+  if (current->type == HeapItem::Type::DELETE_RANGE_START) {
+    active_.erase(current->level);
+    assert(range_tombstone_iters_[current->level] &&
+           range_tombstone_iters_[current->level]->Valid());
+    range_tombstone_iters_[current->level]->Prev();
+    if (range_tombstone_iters_[current->level]->Valid()) {
+      InsertRangeTombstoneToMaxHeap(current->level, true /* end_key */,
+                                    true /* replace_top */);
+    } else {
+      maxHeap_->pop();
+    }
+    return true /* current key deleted */;
+  }
+  if (current->iter.IsDeleteRangeSentinelKey()) {
+    // LevelIterator enters a new SST file
+    maxHeap_->pop();
+    // Remove last SST file's range tombstone key if there is one.
+    if (!maxHeap_->empty() && maxHeap_->top()->level == current->level &&
+        maxHeap_->top()->type == HeapItem::Type::DELETE_RANGE_START) {
+      maxHeap_->pop();
+      active_.erase(current->level);
+    }
+    current->iter.Prev();
+    if (current->iter.Valid()) {
+      assert(current->iter.status().ok());
+      maxHeap_->push(current);
+    }
+
+    if (range_tombstone_iters_[current->level] &&
+        range_tombstone_iters_[current->level]->Valid()) {
+      InsertRangeTombstoneToMaxHeap(current->level);
+    }
+    return true /* current key deleted */;
+  }
+  assert(current->type == HeapItem::Type::ITERATOR);
+  // Point key case: check active_ for range tombstone coverage.
+  ParsedInternalKey pik;
+  ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError();
+  if (!active_.empty()) {
+    auto i = *active_.begin();
+    if (i < current->level) {
+      // range tombstone is from a newer level, definitely covers
+      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
+                                  pik) <= 0);
+      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
+             0);
+      std::string target;
+      AppendInternalKey(&target, range_tombstone_iters_[i]->start_key());
+      // This is different from SkipNextDeleted() which does reseek at sorted
+      // runs >= level (instead of i+1 here). With min heap, if level L is at
+      // top of the heap, then levels < L all have internal keys > level L's
+      // current internal key; the same does not hold for the max heap, so
+      // all sorted runs below the tombstone's level are reseeked here.
+      SeekForPrevImpl(target, i + 1, true);
+      return true /* current key deleted */;
+    } else if (i == current->level) {
+      // By `active_` we know current key is between start key and end key.
+ assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(), + pik) <= 0); + assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) < + 0); + if (pik.sequence < range_tombstone_iters_[current->level]->seq()) { + current->iter.Prev(); + if (current->iter.Valid()) { + maxHeap_->replace_top(current); + } else { + maxHeap_->pop(); + } + return true /* current key deleted */; + } else { + return false /* current key not deleted */; + } + } else { + return false /* current key not deleted */; + } + } + + assert(active_.empty()); + assert(maxHeap_->top()->type == HeapItem::Type::ITERATOR); + return false /* current key not deleted */; +} + +void MergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) { + // Invariant(children_) + if (child->iter.Valid()) { + assert(child->iter.status().ok()); + minHeap_.push(child); + } else { + considerStatus(child->iter.status()); + } +} + +void MergingIterator::AddToMaxHeapOrCheckStatus(HeapItem* child) { + if (child->iter.Valid()) { + assert(child->iter.status().ok()); + maxHeap_->push(child); + } else { + considerStatus(child->iter.status()); + } +} + +// Advance all non current_ child to > current_.key(). +// We advance current_ after the this function call as it does not require +// Seek(). +// Advance all range tombstones iters, including the one corresponding to +// current_, to the first tombstone with end_key > current_.key(). +// TODO: potentially do cascading seek here too +// TODO: show that invariants hold +void MergingIterator::SwitchToForward() { + ClearHeaps(); + Slice target = key(); + for (auto& child : children_) { + if (&child.iter != current_) { + child.iter.Seek(target); + // child.iter.status() is set to Status::TryAgain indicating asynchronous + // request for retrieval of data blocks has been submitted. So it should + // return at this point and Seek should be called again to retrieve the + // requested block and add the child to min heap. + if (child.iter.status() == Status::TryAgain()) { + continue; + } + if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) { + assert(child.iter.status().ok()); + child.iter.Next(); + } + } + AddToMinHeapOrCheckStatus(&child); + } + + for (auto& child : children_) { + if (child.iter.status() == Status::TryAgain()) { + child.iter.Seek(target); + if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) { + assert(child.iter.status().ok()); + child.iter.Next(); + } + AddToMinHeapOrCheckStatus(&child); + } + } + + // Current range tombstone iter also needs to seek for the following case: + // Previous direction is backward, so range tombstone iter may point to a + // tombstone before current_. If there is no such tombstone, then the range + // tombstone iter is !Valid(). Need to reseek here to make it valid again. + if (!range_tombstone_iters_.empty()) { + ParsedInternalKey pik; + ParseInternalKey(target, &pik, false /* log_err_key */) + .PermitUncheckedError(); + for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { + auto iter = range_tombstone_iters_[i]; + if (iter) { + iter->Seek(pik.user_key); + // The while loop is needed as the Seek() call above is only for user + // key. We could have a range tombstone with end_key covering user_key, + // but still is smaller than target. This happens when the range + // tombstone is truncated at iter.largest_. 
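+        // e.g. (illustrative): a truncated end key with the same user key as
+        // `target` but a higher sequence number compares <= `target` as an
+        // internal key, so that tombstone already ends at or before target
+        // and the loop below steps past it.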
+        while (iter->Valid() &&
+               comparator_->Compare(iter->end_key(), pik) <= 0) {
+          iter->Next();
+        }
+        if (range_tombstone_iters_[i]->Valid()) {
+          InsertRangeTombstoneToMinHeap(
+              i, comparator_->Compare(range_tombstone_iters_[i]->start_key(),
+                                      pik) > 0 /* start_key */);
+        }
+      }
+    }
+  }
+
+  direction_ = kForward;
+  assert(current_ == CurrentForward());
+}
+
+// Advance all range tombstones iters, including the one corresponding to
+// current_, to the first tombstone with start_key <= current_.key().
+void MergingIterator::SwitchToBackward() {
+  ClearHeaps();
+  InitMaxHeap();
+  Slice target = key();
+  for (auto& child : children_) {
+    if (&child.iter != current_) {
+      child.iter.SeekForPrev(target);
+      TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child);
+      if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
+        assert(child.iter.status().ok());
+        child.iter.Prev();
+      }
+    }
+    AddToMaxHeapOrCheckStatus(&child);
+  }
+
+  ParsedInternalKey pik;
+  ParseInternalKey(target, &pik, false /* log_err_key */)
+      .PermitUncheckedError();
+  for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
+    auto iter = range_tombstone_iters_[i];
+    if (iter) {
+      iter->SeekForPrev(pik.user_key);
+      // Since the SeekForPrev() call above is only for user key,
+      // we may end up with some range tombstone with start key having the
+      // same user key at current_, but with a smaller sequence number. This
+      // makes current_ not at maxHeap_ top for the CurrentReverse() call
+      // below. If there is a range tombstone start key with the same user
+      // key and the same sequence number as current_.key(), it will be fine as
+      // in InsertRangeTombstoneToMaxHeap() we change op_type to be the smallest
+      // op_type.
+      while (iter->Valid() &&
+             comparator_->Compare(iter->start_key(), pik) > 0) {
+        iter->Prev();
+      }
+      if (iter->Valid()) {
+        InsertRangeTombstoneToMaxHeap(
+            i, comparator_->Compare(range_tombstone_iters_[i]->end_key(),
+                                    pik) <= 0 /* end_key */);
+      }
+    }
+  }
+
+  direction_ = kReverse;
+  if (!prefix_seek_mode_) {
+    // Note that we don't do assert(current_ == CurrentReverse()) here
+    // because it is possible to have some keys larger than the seek-key
+    // inserted between Seek() and SeekToLast(), which makes current_ not
+    // equal to CurrentReverse().
+    current_ = CurrentReverse();
+  }
+  assert(current_ == CurrentReverse());
+}
+
+void MergingIterator::ClearHeaps(bool clear_active) {
+  minHeap_.clear();
+  if (maxHeap_) {
+    maxHeap_->clear();
+  }
+  if (clear_active) {
+    active_.clear();
+  }
+}
+
+void MergingIterator::InitMaxHeap() {
+  if (!maxHeap_) {
+    maxHeap_ = std::make_unique<MergerMaxIterHeap>(
+        MaxHeapItemComparator(comparator_));
+  }
+}
+
+// Assume there is a next key that is not covered by range tombstone.
+// Pre-condition:
+// - Invariants (3) and (4)
+// - There is some k where k <= children_[i].iter.key() <= LevelNextVisible(i,
+// k) for all levels i (LevelNextVisible() defined in Seek()).
+//
+// Define NextVisible(k) to be the first key >= k from among children_ that
+// is not covered by any range tombstone.
+// Post-condition:
+// - Invariants (1)-(4) hold
+// - (*): minHeap_->top()->key() == NextVisible(k)
+//
+// Loop invariants:
+// - Invariants (3) and (4)
+// - (*): k <= children_[i].iter.key() <= LevelNextVisible(i, k)
+//
+// Progress: minHeap_.top()->key() is non-decreasing and strictly increases in
+// a finite number of iterations.
+// TODO: it is possible to call SeekImpl(k2) after SeekImpl(k1) with
+// k2 < k1 in the same FindNextVisibleKey().
+// For example, l1 has a range tombstone [2,3) and l2 has a range tombstone
+// [1, 4). Point key 1 from l5 triggers SeekImpl(4 /* target */, 5). Then
+// point key 2 from l3 triggers SeekImpl(3 /* target */, 3).
+// Ideally we should only move iterators forward in SeekImpl(), and the
+// progress condition could be made simpler: iterators only move forward.
+//
+// Proof sketch:
+// Post-condition:
+// Invariant (1) holds when this method returns:
+// Ignoring the empty minHeap_ case, there are two cases:
+// Case 1: active_ is empty and !minHeap_.top()->iter.IsDeleteRangeSentinelKey()
+// By invariants (rti) and (active_), active_ being empty means that if a
+// pinned_heap_item_[i] is in minHeap_, it has type DELETE_RANGE_START. Note
+// that PopDeleteRangeStart() was called right before the while loop condition,
+// so minHeap_.top() is not of type DELETE_RANGE_START. So minHeap_.top() must
+// be of type ITERATOR.
+// Case 2: SkipNextDeleted() returns false. The method returns false only when
+// minHeap_.top().type == ITERATOR.
+//
+// Invariant (2) holds when this method returns:
+// From Invariant (1), minHeap_.top().type == ITERATOR. Suppose it is
+// children_[i] for some i. Suppose that children_[i].iter.key() is covered by
+// some range tombstone. This means there is a j <= i and a range tombstone from
+// level j with start_key() < children_[i].iter.key() < end_key().
+// - If range_tombstone_iters_[j]->Valid(), by Invariants (rti) and (phi),
+// pinned_heap_item_[j] is in minHeap_, and pinned_heap_item_[j].tombstone_pik
+// is either the start or the end key of this range tombstone. If
+// pinned_heap_item_[j].tombstone_pik < children_[i].iter.key(), it would be at
+// the top of minHeap_, which would contradict Invariant (1). So
+// pinned_heap_item_[j].tombstone_pik > children_[i].iter.key().
+// By Invariant (3), range_tombstone_iters_[j].prev.end_key() <
+// children_[i].iter.key(). We assume that in each level, range tombstones
+// cover non-overlapping ranges. So range_tombstone_iters_[j] is at
+// the range tombstone with start_key() < children_[i].iter.key() < end_key()
+// and has its end_key() in minHeap_. By Invariants (phi) and (active_),
+// j is in active_. From the while loop condition, SkipNextDeleted() must have
+// returned false for this method to return.
+// - If j < i, then SeekImpl(range_tombstone_iters_[j']->end_key(), i)
+// was called for some j' < i and j' in active_. Note that since j' is in
+// active_, pinned_heap_item_[j'] is in minHeap_ and has tombstone_pik =
+// range_tombstone_iters_[j']->end_key(). So
+// range_tombstone_iters_[j']->end_key() must be larger than
+// children_[i].iter.key() to not be at the top of minHeap_. This means after
+// SeekImpl(), children_[i] would be at a key > children_[i].iter.key()
+// -- contradiction.
+// - If j == i, children_[i]->Next() would have been called and children_[i]
+// would be at a key > children_[i].iter.key() -- contradiction.
+// - If !range_tombstone_iters_[j]->Valid(), then range_tombstone_iters_[j]
+// points to an SST file with all range tombstones from that file exhausted.
+// The file must come before the file containing the first
+// range tombstone with start_key() < children_[i].iter.key() < end_key().
+// Assuming files from the same level have non-overlapping ranges, the current
+// file's meta.largest is less than children_[i].iter.key(). So the file
+// boundary key, which has value meta.largest, must have been popped from
+// minHeap_ before children_[i].iter.key().
+// So range_tombstone_iters_[j] would not point to this SST file
+// -- contradiction.
+// So it is impossible for children_[i].iter.key() to be covered by a range
+// tombstone.
+//
+// Post-condition (*) holds when the function returns:
+// From loop invariant (*) that k <= children_[i].iter.key() <=
+// LevelNextVisible(i, k) and Invariant (2) above, when the function returns,
+// minHeap_.top()->key() is the smallest LevelNextVisible(i, k) among all levels
+// i. This is equal to NextVisible(k).
+//
+// Invariant (3) holds after each iteration:
+// PopDeleteRangeStart() does not change range tombstone position.
+// In SkipNextDeleted():
+// - If DELETE_RANGE_END is popped from minHeap_, it means the range
+// tombstone's end key is < all other point keys, so it is safe to advance to
+// the next range tombstone.
+// - If a file boundary is popped (current->iter.IsDeleteRangeSentinelKey()),
+// we assume that file's last range tombstone's
+// end_key <= file boundary key < all other point keys. So it is safe to
+// move to the first range tombstone in the next SST file.
+// - If children_[i]->Next() is called, then it is fine as it is advancing a
+// point iterator.
+// - If SeekImpl(target, l) is called, then (3) follows from SeekImpl()'s
+// post-condition if its pre-condition holds. The first pre-condition follows
+// from the loop invariant where Invariant (3) holds for all levels i.
+// Now we show that the second pre-condition holds. Since Invariant (3) holds
+// for all i, we have for all j <= l, range_tombstone_iters_[j].prev.end_key()
+// < children_[l].iter.key(). `target` is the value of
+// range_tombstone_iters_[j'].end_key() for some j' < l and j' in active_.
+// By Invariants (active_) and (rti), pinned_heap_item_[j'] is in minHeap_ and
+// pinned_heap_item_[j'].tombstone_pik = range_tombstone_iters_[j'].end_key().
+// This end_key must be larger than children_[l].key() since it was not at the
+// top of minHeap_. So for all levels j <= l,
+// range_tombstone_iters_[j].prev.end_key() < children_[l].iter.key() < target.
+//
+// Invariant (4) holds after each iteration:
+// A level i is inserted into active_ during calls to PopDeleteRangeStart().
+// In that case, range_tombstone_iters_[i].start_key() < all point keys
+// by the heap property and the assumption that point keys and range tombstone
+// keys are distinct.
+// If SeekImpl(target, l) is called, then there is a range_tombstone_iters_[j]
+// where target = range_tombstone_iters_[j]->end_key() and children_[l]->key()
+// < target. By the loop invariants, (3) and (4) hold for all levels.
+// Since target > children_[l]->key(), it also holds that for j < l,
+// range_tombstone_iters_[j].prev.end_key() < target and that if j in active_,
+// range_tombstone_iters_[j]->start_key() < target. So all pre-conditions of
+// SeekImpl(target, l) hold, and (4) follows from its post-condition.
+// All other places in this function either advance point iterators
+// or remove some level from active_, so (4) still holds.
+//
+// Loop invariant (*): for all levels i, k <= children_[i] <= LevelNextVisible(i,
+// k).
+// k <= children_[i] follows from the loop `progress` condition.
+// Consider when children_[i] is changed for any i. It is through
+// children_[i].iter.Next() or SeekImpl() in SkipNextDeleted().
+// If children_[i].iter.Next() is called, there is a range tombstone from level
+// i where tombstone seqno > children_[i].iter.key()'s seqno and i in active_.
+// By Invariant (4), the tombstone's start_key < children_[i].iter.key().
+// By invariants (active_), (phi), and (rti), the tombstone's end_key is in
+// minHeap_ and children_[i].iter.key() < end_key. So children_[i].iter.key()
+// is not visible, and it is safe to call Next().
+// If SeekImpl(target, l) is called, by its contract, when SeekImpl() returns,
+// target <= children_[i]->key() <= LevelNextVisible(i, target) for i >= l,
+// and `target` is range_tombstone_iters_[j]->end_key() for some j < i with
+// j in active_.
+// By Invariant (4), range_tombstone_iters_[j]->start_key() <
+// children_[i].iter.key() for all i >= l. So for each level i >= l, the range
+// [children_[i].iter.key(), target) is not visible. So after SeekImpl(),
+// children_[i].iter.key() <= LevelNextVisible(i, target) <=
+// LevelNextVisible(i, k).
+//
+// `Progress` holds for each iteration:
+// Very sloppy intuition:
+// - in PopDeleteRangeStart(): the value of a pinned_heap_item_.tombstone_pik_
+// is updated from the start key to the end key of the same range tombstone.
+// We assume that start key <= end key for the same range tombstone.
+// - in SkipNextDeleted():
+//   - If the top of the heap is DELETE_RANGE_END, the range tombstone is
+//   advanced and the relevant pinned_heap_item_.tombstone_pik is increased or
+//   popped from minHeap_.
+//   - If the top of the heap is a file boundary key, then both the point iter
+//   and the range tombstone iter are advanced to the next file.
+//   - If the top of the heap is ITERATOR and current->iter.Next() is called,
+//   it moves to a larger point key.
+//   - If the top of the heap is ITERATOR and SeekImpl(k, l) is called, then
+//   all iterators from levels >= l are advanced to some key >= k by its
+//   contract. And the top of minHeap_ before SeekImpl(k, l) was less than k.
+// There are special cases where different heap items have the same key
+// (e.g. when two range tombstone end keys share the same value). In
+// these cases, iterators are being advanced, so the minimum key should
+// increase in a finite number of steps.
+inline void MergingIterator::FindNextVisibleKey() {
+  PopDeleteRangeStart();
+  // PopDeleteRangeStart() implies that the heap top is not DELETE_RANGE_START.
+  // active_ being empty implies no DELETE_RANGE_END in the heap.
+  // So minHeap_->top() must be of type ITERATOR.
+  while (
+      !minHeap_.empty() &&
+      (!active_.empty() || minHeap_.top()->iter.IsDeleteRangeSentinelKey()) &&
+      SkipNextDeleted()) {
+    PopDeleteRangeStart();
+  }
+  // Checks Invariant (1)
+  assert(minHeap_.empty() || minHeap_.top()->type == HeapItem::Type::ITERATOR);
+}
+
+inline void MergingIterator::FindPrevVisibleKey() {
+  PopDeleteRangeEnd();
+  // PopDeleteRangeEnd() implies that the heap top is not DELETE_RANGE_END.
+  // active_ being empty implies no DELETE_RANGE_START in the heap.
+  // So maxHeap_->top() must be of type ITERATOR.
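+  // (Symmetric to FindNextVisibleKey() above; the invariants and the proof
+  // sketch there apply with the key order reversed.)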
+ while ( + !maxHeap_->empty() && + (!active_.empty() || maxHeap_->top()->iter.IsDeleteRangeSentinelKey()) && + SkipPrevDeleted()) { + PopDeleteRangeEnd(); + } +} + +InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp, + InternalIterator** list, int n, + Arena* arena, bool prefix_seek_mode) { + assert(n >= 0); + if (n == 0) { + return NewEmptyInternalIterator(arena); + } else if (n == 1) { + return list[0]; + } else { + if (arena == nullptr) { + return new MergingIterator(cmp, list, n, false, prefix_seek_mode); + } else { + auto mem = arena->AllocateAligned(sizeof(MergingIterator)); + return new (mem) MergingIterator(cmp, list, n, true, prefix_seek_mode); + } + } +} + +MergeIteratorBuilder::MergeIteratorBuilder( + const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode, + const Slice* iterate_upper_bound) + : first_iter(nullptr), use_merging_iter(false), arena(a) { + auto mem = arena->AllocateAligned(sizeof(MergingIterator)); + merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true, + prefix_seek_mode, iterate_upper_bound); +} + +MergeIteratorBuilder::~MergeIteratorBuilder() { + if (first_iter != nullptr) { + first_iter->~InternalIterator(); + } + if (merge_iter != nullptr) { + merge_iter->~MergingIterator(); + } +} + +void MergeIteratorBuilder::AddIterator(InternalIterator* iter) { + if (!use_merging_iter && first_iter != nullptr) { + merge_iter->AddIterator(first_iter); + use_merging_iter = true; + first_iter = nullptr; + } + if (use_merging_iter) { + merge_iter->AddIterator(iter); + } else { + first_iter = iter; + } +} + +void MergeIteratorBuilder::AddPointAndTombstoneIterator( + InternalIterator* point_iter, TruncatedRangeDelIterator* tombstone_iter, + TruncatedRangeDelIterator*** tombstone_iter_ptr) { + // tombstone_iter_ptr != nullptr means point_iter is a LevelIterator. + bool add_range_tombstone = tombstone_iter || + !merge_iter->range_tombstone_iters_.empty() || + tombstone_iter_ptr; + if (!use_merging_iter && (add_range_tombstone || first_iter)) { + use_merging_iter = true; + if (first_iter) { + merge_iter->AddIterator(first_iter); + first_iter = nullptr; + } + } + if (use_merging_iter) { + merge_iter->AddIterator(point_iter); + if (add_range_tombstone) { + // If there was a gap, fill in nullptr as empty range tombstone iterators. + while (merge_iter->range_tombstone_iters_.size() < + merge_iter->children_.size() - 1) { + merge_iter->AddRangeTombstoneIterator(nullptr); + } + merge_iter->AddRangeTombstoneIterator(tombstone_iter); + } + + if (tombstone_iter_ptr) { + // This is needed instead of setting to &range_tombstone_iters_[i] + // directly here since the memory address of range_tombstone_iters_[i] + // might change during vector resizing. 
+      range_del_iter_ptrs_.emplace_back(
+          merge_iter->range_tombstone_iters_.size() - 1, tombstone_iter_ptr);
+    }
+  } else {
+    first_iter = point_iter;
+  }
+}
+
+InternalIterator* MergeIteratorBuilder::Finish(ArenaWrappedDBIter* db_iter) {
+  InternalIterator* ret = nullptr;
+  if (!use_merging_iter) {
+    ret = first_iter;
+    first_iter = nullptr;
+  } else {
+    for (auto& p : range_del_iter_ptrs_) {
+      *(p.second) = &(merge_iter->range_tombstone_iters_[p.first]);
+    }
+    if (db_iter && !merge_iter->range_tombstone_iters_.empty()) {
+      // memtable is always the first level
+      db_iter->SetMemtableRangetombstoneIter(
+          &merge_iter->range_tombstone_iters_.front());
+    }
+    merge_iter->Finish();
+    ret = merge_iter;
+    merge_iter = nullptr;
+  }
+  return ret;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/merging_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/merging_iterator.h
new file mode 100644
index 0000000000..562a4e57f5
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/merging_iterator.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/range_del_aggregator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/types.h"
+#include "table/iterator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ArenaWrappedDBIter;
+class InternalKeyComparator;
+
+template <class TValue>
+class InternalIteratorBase;
+using InternalIterator = InternalIteratorBase<Slice>;
+
+// Return an iterator that provides the union of the data in
+// children[0,n-1]. Takes ownership of the child iterators and
+// will delete them when the result iterator is deleted.
+//
+// The result does no duplicate suppression. I.e., if a particular
+// key is present in K child iterators, it will be yielded K times.
+//
+// REQUIRES: n >= 0
+extern InternalIterator* NewMergingIterator(
+    const InternalKeyComparator* comparator, InternalIterator** children, int n,
+    Arena* arena = nullptr, bool prefix_seek_mode = false);
+
+// The iterator returned by NewMergingIterator() and
+// MergeIteratorBuilder::Finish(). MergingIterator handles the merging of data
+// from different point and/or range tombstone iterators.
+class MergingIterator;
+
+// A builder class for an iterator that provides the union of the data
+// of input iterators. Two APIs are provided to add input iterators. Users
+// should call exactly one of them, depending on whether range tombstones
+// need to be processed.
+class MergeIteratorBuilder {
+ public:
+  // comparator: the comparator used in merging.
+  // arena: where the merging iterator needs to be allocated from.
+  explicit MergeIteratorBuilder(const InternalKeyComparator* comparator,
+                                Arena* arena, bool prefix_seek_mode = false,
+                                const Slice* iterate_upper_bound = nullptr);
+  ~MergeIteratorBuilder();
+
+  // Add point key iterator `iter` to the merging iterator.
+  void AddIterator(InternalIterator* iter);
+
+  // Add a point key iterator and a range tombstone iterator.
+  // `tombstone_iter_ptr` should be set by, and only by, LevelIterator.
+  // *tombstone_iter_ptr will be set to where the merging iterator stores
+  // `tombstone_iter` when MergeIteratorBuilder::Finish() is called. This is
+  // used by LevelIterator to update range tombstone iters when switching to a
+  // different SST file. If a single point iterator with a nullptr range
+  // tombstone iterator is provided, and the point iterator is not a level
+  // iterator, then this builder will return the point iterator directly,
+  // instead of creating a merging iterator on top of it. Internally, if no
+  // point iterator is a LevelIterator, then a range tombstone iterator is
+  // only added to the merging iter if there is a non-null `tombstone_iter`.
+  void AddPointAndTombstoneIterator(
+      InternalIterator* point_iter, TruncatedRangeDelIterator* tombstone_iter,
+      TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr);
+
+  // Get the arena used to build the merging iterator. It is called when a
+  // child iterator needs to be allocated.
+  Arena* GetArena() { return arena; }
+
+  // Return the resulting merging iterator.
+  // If db_iter is not nullptr, then db_iter->SetMemtableRangetombstoneIter()
+  // will be called with a pointer to where the merging iterator
+  // stores the memtable range tombstone iterator.
+  // This is used by the DB iterator to refresh memtable range tombstones.
+  InternalIterator* Finish(ArenaWrappedDBIter* db_iter = nullptr);
+
+ private:
+  MergingIterator* merge_iter;
+  InternalIterator* first_iter;
+  bool use_merging_iter;
+  Arena* arena;
+  // Used to set LevelIterator.range_tombstone_iter_.
+  // See AddRangeTombstoneIterator() implementation for more detail.
+  std::vector<std::pair<size_t, TruncatedRangeDelIterator***>>
+      range_del_iter_ptrs_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/meta_blocks.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/meta_blocks.cc
new file mode 100644
index 0000000000..6fea536d62
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/meta_blocks.cc
@@ -0,0 +1,564 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
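+// A minimal usage sketch (ours, not part of the upstream file): given a
+// RandomAccessFileReader `reader`, the file's size, and its table magic
+// number, the property block can be decoded with the helper declared in
+// meta_blocks.h:
+//
+//   std::unique_ptr<TableProperties> props;
+//   Status s = ReadTableProperties(reader, file_size, magic_number,
+//                                  ioptions, read_options, &props);
+//   if (s.ok()) {
+//     uint64_t n = props->num_entries;  // one of the decoded fields
+//   }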
+#include "table/meta_blocks.h" + +#include +#include + +#include "block_fetcher.h" +#include "db/table_properties_collector.h" +#include "file/random_access_file_reader.h" +#include "logging/logging.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/block_based/block.h" +#include "table/block_based/reader_common.h" +#include "table/format.h" +#include "table/internal_iterator.h" +#include "table/persistent_cache_helper.h" +#include "table/sst_file_writer_collectors.h" +#include "table/table_properties_internal.h" +#include "test_util/sync_point.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +const std::string kPropertiesBlockName = "rocksdb.properties"; +// Old property block name for backward compatibility +const std::string kPropertiesBlockOldName = "rocksdb.stats"; +const std::string kCompressionDictBlockName = "rocksdb.compression_dict"; +const std::string kRangeDelBlockName = "rocksdb.range_del"; + +MetaIndexBuilder::MetaIndexBuilder() + : meta_index_block_(new BlockBuilder(1 /* restart interval */)) {} + +void MetaIndexBuilder::Add(const std::string& key, const BlockHandle& handle) { + std::string handle_encoding; + handle.EncodeTo(&handle_encoding); + meta_block_handles_.insert({key, handle_encoding}); +} + +Slice MetaIndexBuilder::Finish() { + for (const auto& metablock : meta_block_handles_) { + meta_index_block_->Add(metablock.first, metablock.second); + } + return meta_index_block_->Finish(); +} + +// Property block will be read sequentially and cached in a heap located +// object, so there's no need for restart points. Thus we set the restart +// interval to infinity to save space. +PropertyBlockBuilder::PropertyBlockBuilder() + : properties_block_(new BlockBuilder( + std::numeric_limits::max() /* restart interval */)) {} + +void PropertyBlockBuilder::Add(const std::string& name, + const std::string& val) { + props_.insert({name, val}); +} + +void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) { + assert(props_.find(name) == props_.end()); + + std::string dst; + PutVarint64(&dst, val); + + Add(name, dst); +} + +void PropertyBlockBuilder::Add( + const UserCollectedProperties& user_collected_properties) { + for (const auto& prop : user_collected_properties) { + Add(prop.first, prop.second); + } +} + +void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { + TEST_SYNC_POINT_CALLBACK("PropertyBlockBuilder::AddTableProperty:Start", + const_cast(&props)); + + Add(TablePropertiesNames::kOriginalFileNumber, props.orig_file_number); + Add(TablePropertiesNames::kRawKeySize, props.raw_key_size); + Add(TablePropertiesNames::kRawValueSize, props.raw_value_size); + Add(TablePropertiesNames::kDataSize, props.data_size); + Add(TablePropertiesNames::kIndexSize, props.index_size); + if (props.index_partitions != 0) { + Add(TablePropertiesNames::kIndexPartitions, props.index_partitions); + Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size); + } + Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key); + Add(TablePropertiesNames::kIndexValueIsDeltaEncoded, + props.index_value_is_delta_encoded); + Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kNumFilterEntries, props.num_filter_entries); + Add(TablePropertiesNames::kDeletedKeys, props.num_deletions); + Add(TablePropertiesNames::kMergeOperands, props.num_merge_operands); + Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); + 
+  Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
+  Add(TablePropertiesNames::kFilterSize, props.filter_size);
+  Add(TablePropertiesNames::kFormatVersion, props.format_version);
+  Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len);
+  Add(TablePropertiesNames::kColumnFamilyId, props.column_family_id);
+  Add(TablePropertiesNames::kCreationTime, props.creation_time);
+  Add(TablePropertiesNames::kOldestKeyTime, props.oldest_key_time);
+  if (props.file_creation_time > 0) {
+    Add(TablePropertiesNames::kFileCreationTime, props.file_creation_time);
+  }
+  if (props.slow_compression_estimated_data_size > 0) {
+    Add(TablePropertiesNames::kSlowCompressionEstimatedDataSize,
+        props.slow_compression_estimated_data_size);
+  }
+  if (props.fast_compression_estimated_data_size > 0) {
+    Add(TablePropertiesNames::kFastCompressionEstimatedDataSize,
+        props.fast_compression_estimated_data_size);
+  }
+  Add(TablePropertiesNames::kTailStartOffset, props.tail_start_offset);
+  if (!props.db_id.empty()) {
+    Add(TablePropertiesNames::kDbId, props.db_id);
+  }
+  if (!props.db_session_id.empty()) {
+    Add(TablePropertiesNames::kDbSessionId, props.db_session_id);
+  }
+  if (!props.db_host_id.empty()) {
+    Add(TablePropertiesNames::kDbHostId, props.db_host_id);
+  }
+
+  if (!props.filter_policy_name.empty()) {
+    Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name);
+  }
+  if (!props.comparator_name.empty()) {
+    Add(TablePropertiesNames::kComparator, props.comparator_name);
+  }
+
+  if (!props.merge_operator_name.empty()) {
+    Add(TablePropertiesNames::kMergeOperator, props.merge_operator_name);
+  }
+  if (!props.prefix_extractor_name.empty()) {
+    Add(TablePropertiesNames::kPrefixExtractorName,
+        props.prefix_extractor_name);
+  }
+  if (!props.property_collectors_names.empty()) {
+    Add(TablePropertiesNames::kPropertyCollectors,
+        props.property_collectors_names);
+  }
+  if (!props.column_family_name.empty()) {
+    Add(TablePropertiesNames::kColumnFamilyName, props.column_family_name);
+  }
+
+  if (!props.compression_name.empty()) {
+    Add(TablePropertiesNames::kCompression, props.compression_name);
+  }
+  if (!props.compression_options.empty()) {
+    Add(TablePropertiesNames::kCompressionOptions, props.compression_options);
+  }
+  if (!props.seqno_to_time_mapping.empty()) {
+    Add(TablePropertiesNames::kSequenceNumberTimeMapping,
+        props.seqno_to_time_mapping);
+  }
+}
+
+Slice PropertyBlockBuilder::Finish() {
+  for (const auto& prop : props_) {
+    properties_block_->Add(prop.first, prop.second);
+  }
+
+  return properties_block_->Finish();
+}
+
+void LogPropertiesCollectionError(Logger* info_log, const std::string& method,
+                                  const std::string& name) {
+  assert(method == "Add" || method == "Finish");
+
+  std::string msg =
+      "Encountered error when calling TablePropertiesCollector::" + method +
+      "() with collector name: " + name;
+  ROCKS_LOG_ERROR(info_log, "%s", msg.c_str());
+}
+
+bool NotifyCollectTableCollectorsOnAdd(
+    const Slice& key, const Slice& value, uint64_t file_size,
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    Logger* info_log) {
+  bool all_succeeded = true;
+  for (auto& collector : collectors) {
+    Status s = collector->InternalAdd(key, value, file_size);
+    all_succeeded = all_succeeded && s.ok();
+    if (!s.ok()) {
+      LogPropertiesCollectionError(info_log, "Add" /* method */,
+                                   collector->Name());
+    }
+  }
+  return all_succeeded;
+}
+
+void NotifyCollectTableCollectorsOnBlockAdd(
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    const uint64_t block_uncomp_bytes,
+    const uint64_t block_compressed_bytes_fast,
+    const uint64_t block_compressed_bytes_slow) {
+  for (auto& collector : collectors) {
+    collector->BlockAdd(block_uncomp_bytes, block_compressed_bytes_fast,
+                        block_compressed_bytes_slow);
+  }
+}
+
+bool NotifyCollectTableCollectorsOnFinish(
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    Logger* info_log, PropertyBlockBuilder* builder) {
+  bool all_succeeded = true;
+  for (auto& collector : collectors) {
+    UserCollectedProperties user_collected_properties;
+    Status s = collector->Finish(&user_collected_properties);
+
+    all_succeeded = all_succeeded && s.ok();
+    if (!s.ok()) {
+      LogPropertiesCollectionError(info_log, "Finish" /* method */,
+                                   collector->Name());
+    } else {
+      builder->Add(user_collected_properties);
+    }
+  }
+
+  return all_succeeded;
+}
+
+// FIXME: should whether to use the persistent cache be a parameter for
+// reading table properties?
+Status ReadTablePropertiesHelper(
+    const ReadOptions& ro, const BlockHandle& handle,
+    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
+    const Footer& footer, const ImmutableOptions& ioptions,
+    std::unique_ptr<TableProperties>* table_properties,
+    MemoryAllocator* memory_allocator) {
+  assert(table_properties);
+
+  // If this is an external SST file ingested with write_global_seqno set to
+  // true, then we expect the checksum mismatch because the checksum was
+  // written by SstFileWriter, but its global seqno in the properties block
+  // may have been changed during ingestion. For this reason, we initially
+  // read and process without checksum verification, then later try checksum
+  // verification so that if it fails, we can copy to a temporary buffer with
+  // global seqno set to its original value, i.e. 0, and attempt checksum
+  // verification again.
+  ReadOptions modified_ro = ro;
+  modified_ro.verify_checksums = false;
+  BlockContents block_contents;
+  BlockFetcher block_fetcher(file, prefetch_buffer, footer, modified_ro, handle,
+                             &block_contents, ioptions, false /* decompress */,
+                             false /*maybe_compressed*/, BlockType::kProperties,
+                             UncompressionDict::GetEmptyDict(),
+                             PersistentCacheOptions::kEmpty, memory_allocator);
+  Status s = block_fetcher.ReadBlockContents();
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Unfortunately, Block::size() might not equal block_contents.data.size(),
+  // and Block hides block_contents
+  uint64_t block_size = block_contents.data.size();
+  Block properties_block(std::move(block_contents));
+  std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
+
+  std::unique_ptr<TableProperties> new_table_properties{new TableProperties};
+  // All pre-defined properties of type uint64_t
+  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
+      {TablePropertiesNames::kOriginalFileNumber,
+       &new_table_properties->orig_file_number},
+      {TablePropertiesNames::kDataSize, &new_table_properties->data_size},
+      {TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
+      {TablePropertiesNames::kIndexPartitions,
+       &new_table_properties->index_partitions},
+      {TablePropertiesNames::kTopLevelIndexSize,
+       &new_table_properties->top_level_index_size},
+      {TablePropertiesNames::kIndexKeyIsUserKey,
+       &new_table_properties->index_key_is_user_key},
+      {TablePropertiesNames::kIndexValueIsDeltaEncoded,
+       &new_table_properties->index_value_is_delta_encoded},
+      {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
+      {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size},
+      {TablePropertiesNames::kRawValueSize,
+       &new_table_properties->raw_value_size},
+      {TablePropertiesNames::kNumDataBlocks,
+       &new_table_properties->num_data_blocks},
+
{TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, + {TablePropertiesNames::kNumFilterEntries, + &new_table_properties->num_filter_entries}, + {TablePropertiesNames::kDeletedKeys, + &new_table_properties->num_deletions}, + {TablePropertiesNames::kMergeOperands, + &new_table_properties->num_merge_operands}, + {TablePropertiesNames::kNumRangeDeletions, + &new_table_properties->num_range_deletions}, + {TablePropertiesNames::kFormatVersion, + &new_table_properties->format_version}, + {TablePropertiesNames::kFixedKeyLen, + &new_table_properties->fixed_key_len}, + {TablePropertiesNames::kColumnFamilyId, + &new_table_properties->column_family_id}, + {TablePropertiesNames::kCreationTime, + &new_table_properties->creation_time}, + {TablePropertiesNames::kOldestKeyTime, + &new_table_properties->oldest_key_time}, + {TablePropertiesNames::kFileCreationTime, + &new_table_properties->file_creation_time}, + {TablePropertiesNames::kSlowCompressionEstimatedDataSize, + &new_table_properties->slow_compression_estimated_data_size}, + {TablePropertiesNames::kFastCompressionEstimatedDataSize, + &new_table_properties->fast_compression_estimated_data_size}, + {TablePropertiesNames::kTailStartOffset, + &new_table_properties->tail_start_offset}, + }; + + std::string last_key; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = iter->status(); + if (!s.ok()) { + break; + } + + auto key = iter->key().ToString(); + // properties block should be strictly sorted with no duplicate key. + if (!last_key.empty() && + BytewiseComparator()->Compare(key, last_key) <= 0) { + s = Status::Corruption("properties unsorted"); + break; + } + last_key = key; + + auto raw_val = iter->value(); + auto pos = predefined_uint64_properties.find(key); + + if (key == ExternalSstFilePropertyNames::kGlobalSeqno) { + new_table_properties->external_sst_file_global_seqno_offset = + handle.offset() + iter->ValueOffset(); + } + + if (pos != predefined_uint64_properties.end()) { + if (key == TablePropertiesNames::kDeletedKeys || + key == TablePropertiesNames::kMergeOperands) { + // Insert in user-collected properties for API backwards compatibility + new_table_properties->user_collected_properties.insert( + {key, raw_val.ToString()}); + } + // handle predefined rocksdb properties + uint64_t val; + if (!GetVarint64(&raw_val, &val)) { + // skip malformed value + auto error_msg = + "Detect malformed value in properties meta-block:" + "\tkey: " + + key + "\tval: " + raw_val.ToString(); + ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str()); + continue; + } + *(pos->second) = val; + } else if (key == TablePropertiesNames::kDbId) { + new_table_properties->db_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kDbSessionId) { + new_table_properties->db_session_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kDbHostId) { + new_table_properties->db_host_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kFilterPolicy) { + new_table_properties->filter_policy_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kColumnFamilyName) { + new_table_properties->column_family_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kComparator) { + new_table_properties->comparator_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kMergeOperator) { + new_table_properties->merge_operator_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kPrefixExtractorName) { + new_table_properties->prefix_extractor_name = 
raw_val.ToString();
+    } else if (key == TablePropertiesNames::kPropertyCollectors) {
+      new_table_properties->property_collectors_names = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kCompression) {
+      new_table_properties->compression_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kCompressionOptions) {
+      new_table_properties->compression_options = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) {
+      new_table_properties->seqno_to_time_mapping = raw_val.ToString();
+    } else {
+      // handle user-collected properties
+      new_table_properties->user_collected_properties.insert(
+          {key, raw_val.ToString()});
+    }
+  }
+
+  // Modified version of BlockFetcher checksum verification
+  // (See write_global_seqno comment above)
+  if (s.ok() && footer.GetBlockTrailerSize() > 0) {
+    s = VerifyBlockChecksum(footer.checksum_type(), properties_block.data(),
+                            block_size, file->file_name(), handle.offset());
+    if (s.IsCorruption()) {
+      if (new_table_properties->external_sst_file_global_seqno_offset != 0) {
+        std::string tmp_buf(properties_block.data(),
+                            block_fetcher.GetBlockSizeWithTrailer());
+        uint64_t global_seqno_offset =
+            new_table_properties->external_sst_file_global_seqno_offset -
+            handle.offset();
+        EncodeFixed64(&tmp_buf[static_cast<size_t>(global_seqno_offset)], 0);
+        s = VerifyBlockChecksum(footer.checksum_type(), tmp_buf.data(),
+                                block_size, file->file_name(), handle.offset());
+      }
+    }
+  }
+
+  if (s.ok()) {
+    *table_properties = std::move(new_table_properties);
+  }
+
+  return s;
+}
+
+Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size,
+                           uint64_t table_magic_number,
+                           const ImmutableOptions& ioptions,
+                           const ReadOptions& read_options,
+                           std::unique_ptr<TableProperties>* properties,
+                           MemoryAllocator* memory_allocator,
+                           FilePrefetchBuffer* prefetch_buffer) {
+  BlockHandle block_handle;
+  Footer footer;
+  Status s =
+      FindMetaBlockInFile(file, file_size, table_magic_number, ioptions,
+                          read_options, kPropertiesBlockName, &block_handle,
+                          memory_allocator, prefetch_buffer, &footer);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (!block_handle.IsNull()) {
+    s = ReadTablePropertiesHelper(read_options, block_handle, file,
+                                  prefetch_buffer, footer, ioptions, properties,
+                                  memory_allocator);
+  } else {
+    s = Status::NotFound();
+  }
+  return s;
+}
+
+Status FindOptionalMetaBlock(InternalIterator* meta_index_iter,
+                             const std::string& meta_block_name,
+                             BlockHandle* block_handle) {
+  assert(block_handle != nullptr);
+  meta_index_iter->Seek(meta_block_name);
+  if (meta_index_iter->status().ok()) {
+    if (meta_index_iter->Valid() && meta_index_iter->key() == meta_block_name) {
+      Slice v = meta_index_iter->value();
+      return block_handle->DecodeFrom(&v);
+    } else if (meta_block_name == kPropertiesBlockName) {
+      // Have to try the old name for compatibility
+      meta_index_iter->Seek(kPropertiesBlockOldName);
+      if (meta_index_iter->status().ok() && meta_index_iter->Valid() &&
+          meta_index_iter->key() == kPropertiesBlockOldName) {
+        Slice v = meta_index_iter->value();
+        return block_handle->DecodeFrom(&v);
+      }
+    }
+  }
+  // else
+  *block_handle = BlockHandle::NullBlockHandle();
+  return meta_index_iter->status();
+}
+
+Status FindMetaBlock(InternalIterator* meta_index_iter,
+                     const std::string& meta_block_name,
+                     BlockHandle* block_handle) {
+  Status s =
+      FindOptionalMetaBlock(meta_index_iter, meta_block_name, block_handle);
+  if (s.ok() && block_handle->IsNull()) {
+    return Status::Corruption("Cannot find the meta block", meta_block_name);
+  } else {
+    return s;
+  }
+}
+
+Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file,
+                                uint64_t file_size, uint64_t table_magic_number,
+                                const ImmutableOptions& ioptions,
+                                const ReadOptions& read_options,
+                                BlockContents* metaindex_contents,
+                                MemoryAllocator* memory_allocator,
+                                FilePrefetchBuffer* prefetch_buffer,
+                                Footer* footer_out) {
+  Footer footer;
+  IOOptions opts;
+  Status s;
+  s = file->PrepareIOOptions(read_options, opts);
+  if (!s.ok()) {
+    return s;
+  }
+  s = ReadFooterFromFile(opts, file, *ioptions.fs, prefetch_buffer, file_size,
+                         &footer, table_magic_number);
+  if (!s.ok()) {
+    return s;
+  }
+  if (footer_out) {
+    *footer_out = footer;
+  }
+
+  auto metaindex_handle = footer.metaindex_handle();
+  return BlockFetcher(file, prefetch_buffer, footer, read_options,
+                      metaindex_handle, metaindex_contents, ioptions,
+                      false /* do decompression */, false /*maybe_compressed*/,
+                      BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(),
+                      PersistentCacheOptions::kEmpty, memory_allocator)
+      .ReadBlockContents();
+}
+
+Status FindMetaBlockInFile(
+    RandomAccessFileReader* file, uint64_t file_size,
+    uint64_t table_magic_number, const ImmutableOptions& ioptions,
+    const ReadOptions& read_options, const std::string& meta_block_name,
+    BlockHandle* block_handle, MemoryAllocator* memory_allocator,
+    FilePrefetchBuffer* prefetch_buffer, Footer* footer_out) {
+  BlockContents metaindex_contents;
+  auto s = ReadMetaIndexBlockInFile(
+      file, file_size, table_magic_number, ioptions, read_options,
+      &metaindex_contents, memory_allocator, prefetch_buffer, footer_out);
+  if (!s.ok()) {
+    return s;
+  }
+  // Meta blocks are never compressed; decompression logic would be needed
+  // here if we were to compress them.
+  Block metaindex_block(std::move(metaindex_contents));
+
+  std::unique_ptr<InternalIterator> meta_iter;
+  meta_iter.reset(metaindex_block.NewMetaIterator());
+
+  return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
+}
+
+Status ReadMetaBlock(RandomAccessFileReader* file,
+                     FilePrefetchBuffer* prefetch_buffer, uint64_t file_size,
+                     uint64_t table_magic_number,
+                     const ImmutableOptions& ioptions,
+                     const ReadOptions& read_options,
+                     const std::string& meta_block_name, BlockType block_type,
+                     BlockContents* contents,
+                     MemoryAllocator* memory_allocator) {
+  // TableProperties requires special handling because of checksum issues.
+  // Call ReadTableProperties instead for that case.
+  assert(block_type != BlockType::kProperties);
+
+  BlockHandle block_handle;
+  Footer footer;
+  Status status =
+      FindMetaBlockInFile(file, file_size, table_magic_number, ioptions,
+                          read_options, meta_block_name, &block_handle,
+                          memory_allocator, prefetch_buffer, &footer);
+  if (!status.ok()) {
+    return status;
+  }
+
+  return BlockFetcher(file, prefetch_buffer, footer, read_options, block_handle,
+                      contents, ioptions, false /* decompress */,
+                      false /*maybe_compressed*/, block_type,
+                      UncompressionDict::GetEmptyDict(),
+                      PersistentCacheOptions::kEmpty, memory_allocator)
+      .ReadBlockContents();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/meta_blocks.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/meta_blocks.h
new file mode 100644
index 0000000000..962a316388
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/meta_blocks.h
@@ -0,0 +1,172 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/table_properties_collector.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/memory_allocator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/block_type.h"
+#include "table/format.h"
+#include "util/kv_map.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBuilder;
+class BlockHandle;
+class Env;
+class Footer;
+class Logger;
+class RandomAccessFile;
+struct TableProperties;
+
+// Meta block names for metaindex
+extern const std::string kPropertiesBlockName;
+extern const std::string kPropertiesBlockOldName;
+extern const std::string kCompressionDictBlockName;
+extern const std::string kRangeDelBlockName;
+
+class MetaIndexBuilder {
+ public:
+  MetaIndexBuilder(const MetaIndexBuilder&) = delete;
+  MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete;
+
+  MetaIndexBuilder();
+  void Add(const std::string& key, const BlockHandle& handle);
+
+  // Write all the added key/value pairs to the block and return the contents
+  // of the block.
+  Slice Finish();
+
+ private:
+  // store the sorted key/handle of the metablocks.
+  stl_wrappers::KVMap meta_block_handles_;
+  std::unique_ptr<BlockBuilder> meta_index_block_;
+};
+
+class PropertyBlockBuilder {
+ public:
+  PropertyBlockBuilder(const PropertyBlockBuilder&) = delete;
+  PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete;
+
+  PropertyBlockBuilder();
+
+  void AddTableProperty(const TableProperties& props);
+  void Add(const std::string& key, uint64_t value);
+  void Add(const std::string& key, const std::string& value);
+  void Add(const UserCollectedProperties& user_collected_properties);
+
+  // Write all the added entries to the block and return the block contents
+  Slice Finish();
+
+ private:
+  std::unique_ptr<BlockBuilder> properties_block_;
+  stl_wrappers::KVMap props_;
+};
+
+// If we encounter an error during user-defined statistics collection, we
+// write a warning message to the info log.
+void LogPropertiesCollectionError(Logger* info_log, const std::string& method,
+                                  const std::string& name);
+
+// Utility functions that help the table builder trigger batch events for
+// user-defined property collectors.
+// The return value indicates whether any error occurred; if so, a warning
+// message is logged.
+// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all
+// property collectors.
+bool NotifyCollectTableCollectorsOnAdd(
+    const Slice& key, const Slice& value, uint64_t file_size,
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    Logger* info_log);
+
+void NotifyCollectTableCollectorsOnBlockAdd(
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    uint64_t block_uncomp_bytes, uint64_t block_compressed_bytes_fast,
+    uint64_t block_compressed_bytes_slow);
+
+// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all
+// property collectors. The collected properties will be added to `builder`.
+bool NotifyCollectTableCollectorsOnFinish(
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    Logger* info_log, PropertyBlockBuilder* builder);
+
+// Read table properties from a file using a known BlockHandle.
+// @returns a status to indicate if the operation succeeded.
+// On success, *table_properties will point to a heap-allocated
+// TableProperties object; otherwise the value of `table_properties` is not
+// modified.
+Status ReadTablePropertiesHelper(
+    const ReadOptions& ro, const BlockHandle& handle,
+    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
+    const Footer& footer, const ImmutableOptions& ioptions,
+    std::unique_ptr<TableProperties>* table_properties,
+    MemoryAllocator* memory_allocator = nullptr);
+
+// Read table properties from the properties block of a plain table.
+// @returns a status to indicate if the operation succeeded. On success,
+// *table_properties will point to a heap-allocated TableProperties
+// object; otherwise the value of `table_properties` is not modified.
+Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size,
+                           uint64_t table_magic_number,
+                           const ImmutableOptions& ioptions,
+                           const ReadOptions& read_options,
+                           std::unique_ptr<TableProperties>* properties,
+                           MemoryAllocator* memory_allocator = nullptr,
+                           FilePrefetchBuffer* prefetch_buffer = nullptr);
+
+// Find the meta block from the meta index block. Returns OK and
+// block_handle->IsNull() if not found.
+Status FindOptionalMetaBlock(InternalIterator* meta_index_iter,
+                             const std::string& meta_block_name,
+                             BlockHandle* block_handle);
+
+// Find the meta block from the meta index block. Returns Corruption if not
+// found.
+Status FindMetaBlock(InternalIterator* meta_index_iter,
+                     const std::string& meta_block_name,
+                     BlockHandle* block_handle);
+
+// Find the meta block
+Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size,
+                           uint64_t table_magic_number,
+                           const ImmutableOptions& ioptions,
+                           const ReadOptions& read_options,
+                           const std::string& meta_block_name,
+                           BlockHandle* block_handle,
+                           MemoryAllocator* memory_allocator = nullptr,
+                           FilePrefetchBuffer* prefetch_buffer = nullptr,
+                           Footer* footer_out = nullptr);
+
+// Read meta block contents
+Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file,
+                                uint64_t file_size, uint64_t table_magic_number,
+                                const ImmutableOptions& ioptions,
+                                const ReadOptions& read_options,
+                                BlockContents* block_contents,
+                                MemoryAllocator* memory_allocator = nullptr,
+                                FilePrefetchBuffer* prefetch_buffer = nullptr,
+                                Footer* footer_out = nullptr);
+
+// Read the specified meta block with name meta_block_name
+// from `file` and initialize `contents` with the contents of this block.
+// Return Status::OK in case of success.
+Status ReadMetaBlock(RandomAccessFileReader* file,
+                     FilePrefetchBuffer* prefetch_buffer, uint64_t file_size,
+                     uint64_t table_magic_number,
+                     const ImmutableOptions& ioptions,
+                     const ReadOptions& read_options,
+                     const std::string& meta_block_name, BlockType block_type,
+                     BlockContents* contents,
+                     MemoryAllocator* memory_allocator = nullptr);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/mock_table.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/mock_table.h
new file mode 100644
index 0000000000..e4850d0606
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/mock_table.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
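+// A usage sketch (ours, not part of the upstream header; `env` and the path
+// are placeholders): tests typically build a key/value vector and register
+// it with the factory, e.g.
+//
+//   mock::KVVector file = mock::MakeMockFile({{"a", "v1"}, {"b", "v2"}});
+//   auto* factory = new mock::MockTableFactory();
+//   Status s = factory->CreateMockTable(env, "/tmp/mock_sst",
+//                                       std::move(file));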
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/table.h"
+#include "table/internal_iterator.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/kv_map.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace mock {
+using KVPair = std::pair<std::string, std::string>;
+using KVVector = std::vector<KVPair>;
+
+KVVector MakeMockFile(std::initializer_list<KVPair> l = {});
+void SortKVVector(KVVector* kv_vector,
+                  const Comparator* ucmp = BytewiseComparator());
+
+struct MockTableFileSystem {
+  port::Mutex mutex;
+  std::map<uint32_t, stl_wrappers::KVMap> files;
+};
+
+class MockTableFactory : public TableFactory {
+ public:
+  enum MockCorruptionMode {
+    kCorruptNone,
+    kCorruptKey,
+    kCorruptValue,
+    kCorruptReorderKey,
+  };
+
+  MockTableFactory();
+  static const char* kClassName() { return "MockTable"; }
+  const char* Name() const override { return kClassName(); }
+  using TableFactory::NewTableReader;
+  Status NewTableReader(
+      const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table_reader,
+      bool prefetch_index_and_filter_in_cache = true) const override;
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFileWriter* file) const override;
+
+  // This function will directly create a mock table instead of going through
+  // MockTableBuilder. file_contents has to have a format of
+  // <internal_key, value>. Those key-value pairs will then be inserted into
+  // the mock table.
+  Status CreateMockTable(Env* env, const std::string& fname,
+                         KVVector file_contents);
+
+  virtual std::string GetPrintableOptions() const override {
+    return std::string();
+  }
+
+  void SetCorruptionMode(MockCorruptionMode mode) { corrupt_mode_ = mode; }
+
+  void SetKeyValueSize(size_t size) { key_value_size_ = size; }
+  // This function will assert that only a single file exists and that the
+  // contents are equal to file_contents
+  void AssertSingleFile(const KVVector& file_contents);
+  void AssertLatestFiles(const std::vector<KVVector>& files_contents);
+
+ private:
+  Status GetAndWriteNextID(WritableFileWriter* file, uint32_t* id) const;
+  Status GetIDFromFile(RandomAccessFileReader* file, uint32_t* id) const;
+
+  mutable MockTableFileSystem file_system_;
+  mutable std::atomic<uint32_t> next_id_;
+  MockCorruptionMode corrupt_mode_;
+
+  size_t key_value_size_ = 1;
+};
+
+}  // namespace mock
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/multiget_context.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/multiget_context.h
new file mode 100644
index 0000000000..54af2262cb
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/multiget_context.h
@@ -0,0 +1,405 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
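+// A rough usage sketch (ours, not from the upstream header): the MultiGet
+// code paths build a context over a sorted autovector of KeyContext pointers
+// and then carve Ranges out of it, e.g.
+//
+//   MultiGetContext ctx(&sorted_keys, 0 /* begin */, num_keys, snapshot,
+//                       read_options, fs, stats);
+//   MultiGetContext::Range range = ctx.GetMultiGetRange();
+//   for (auto it = range.begin(); it != range.end(); ++it) { /* ... */ }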
+
+#pragma once
+#include <algorithm>
+#include <array>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/lookup_key.h"
+#include "db/merge_context.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/types.h"
+#include "util/async_file_reader.h"
+#include "util/autovector.h"
+#include "util/math.h"
+#include "util/single_thread_executor.h"
+
+namespace ROCKSDB_NAMESPACE {
+class GetContext;
+class PinnableWideColumns;
+
+struct KeyContext {
+  const Slice* key;
+  LookupKey* lkey;
+  Slice ukey_with_ts;
+  Slice ukey_without_ts;
+  Slice ikey;
+  ColumnFamilyHandle* column_family;
+  Status* s;
+  MergeContext merge_context;
+  SequenceNumber max_covering_tombstone_seq;
+  bool key_exists;
+  bool is_blob_index;
+  void* cb_arg;
+  PinnableSlice* value;
+  PinnableWideColumns* columns;
+  std::string* timestamp;
+  GetContext* get_context;
+
+  KeyContext(ColumnFamilyHandle* col_family, const Slice& user_key,
+             PinnableSlice* val, PinnableWideColumns* cols, std::string* ts,
+             Status* stat)
+      : key(&user_key),
+        lkey(nullptr),
+        column_family(col_family),
+        s(stat),
+        max_covering_tombstone_seq(0),
+        key_exists(false),
+        is_blob_index(false),
+        cb_arg(nullptr),
+        value(val),
+        columns(cols),
+        timestamp(ts),
+        get_context(nullptr) {}
+};
+
+// The MultiGetContext class is a container for the sorted list of keys that
+// we need to lookup in a batch. Its main purpose is to make batch execution
+// easier by allowing various stages of the MultiGet lookups to operate on
+// subsets of keys, potentially non-contiguous. In order to accomplish this,
+// it defines the following classes -
+//
+// MultiGetContext::Range
+// MultiGetContext::Range::Iterator
+// MultiGetContext::Range::IteratorWrapper
+//
+// Here is an example of how this can be used -
+//
+// {
+//    MultiGetContext ctx(...);
+//    MultiGetContext::Range range = ctx.GetMultiGetRange();
+//
+//    // Iterate to determine some subset of the keys
+//    MultiGetContext::Range::Iterator start = range.begin();
+//    MultiGetContext::Range::Iterator end = ...;
+//
+//    // Make a new range with a subset of keys
+//    MultiGetContext::Range subrange(range, start, end);
+//
+//    // Define an auxiliary vector, if needed, to hold additional data for
+//    // each key
+//    std::array<Foo, MAX_BATCH_SIZE> aux;
+//
+//    // Iterate over the subrange and the auxiliary vector simultaneously
+//    MultiGetContext::Range::Iterator iter = subrange.begin();
+//    for (; iter != subrange.end(); ++iter) {
+//      KeyContext& key = *iter;
+//      Foo& aux_key = aux[iter.index()];
+//      ...
+//    }
+// }
+class MultiGetContext {
+ public:
+  // Limit the number of keys in a batch to this number. Benchmarks show that
+  // there is negligible benefit for batches exceeding this.
+  // Keeping this < 32 simplifies iteration, as well as reduces the amount of
+  // stack allocations that need to be performed
+  static const int MAX_BATCH_SIZE = 32;
+
+  // A bitmask of at least MAX_BATCH_SIZE - 1 bits, so that
+  // Mask{1} << MAX_BATCH_SIZE is well defined
+  using Mask = uint64_t;
+  static_assert(MAX_BATCH_SIZE < sizeof(Mask) * 8);
+
+  MultiGetContext(autovector<KeyContext*, MAX_BATCH_SIZE>* sorted_keys,
+                  size_t begin, size_t num_keys, SequenceNumber snapshot,
+                  const ReadOptions& read_opts, FileSystem* fs,
+                  Statistics* stats)
+      : num_keys_(num_keys),
+        value_mask_(0),
+        value_size_(0),
+        lookup_key_ptr_(reinterpret_cast<LookupKey*>(lookup_key_stack_buf))
+#if USE_COROUTINES
+        ,
+        reader_(fs, stats),
+        executor_(reader_)
+#endif  // USE_COROUTINES
+  {
+    (void)fs;
+    (void)stats;
+    assert(num_keys <= MAX_BATCH_SIZE);
+    if (num_keys > MAX_LOOKUP_KEYS_ON_STACK) {
+      lookup_key_heap_buf.reset(new char[sizeof(LookupKey) * num_keys]);
+      lookup_key_ptr_ =
+          reinterpret_cast<LookupKey*>(lookup_key_heap_buf.get());
+    }
+
+    for (size_t iter = 0; iter != num_keys_; ++iter) {
+      // autovector may not be contiguous storage, so make a copy
+      sorted_keys_[iter] = (*sorted_keys)[begin + iter];
+      sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[iter])
+          LookupKey(*sorted_keys_[iter]->key, snapshot, read_opts.timestamp);
+      sorted_keys_[iter]->ukey_with_ts = sorted_keys_[iter]->lkey->user_key();
+      sorted_keys_[iter]->ukey_without_ts = StripTimestampFromUserKey(
+          sorted_keys_[iter]->lkey->user_key(),
+          read_opts.timestamp == nullptr ? 0 : read_opts.timestamp->size());
+      sorted_keys_[iter]->ikey = sorted_keys_[iter]->lkey->internal_key();
+      sorted_keys_[iter]->timestamp = (*sorted_keys)[begin + iter]->timestamp;
+      sorted_keys_[iter]->get_context =
+          (*sorted_keys)[begin + iter]->get_context;
+    }
+  }
+
+  ~MultiGetContext() {
+    for (size_t i = 0; i < num_keys_; ++i) {
+      lookup_key_ptr_[i].~LookupKey();
+    }
+  }
+
+#if USE_COROUTINES
+  SingleThreadExecutor& executor() { return executor_; }
+
+  AsyncFileReader& reader() { return reader_; }
+#endif  // USE_COROUTINES
+
+ private:
+  static const int MAX_LOOKUP_KEYS_ON_STACK = 16;
+  alignas(
+      alignof(LookupKey)) char lookup_key_stack_buf[sizeof(LookupKey) *
+                                                    MAX_LOOKUP_KEYS_ON_STACK];
+  std::array<KeyContext*, MAX_BATCH_SIZE> sorted_keys_;
+  size_t num_keys_;
+  Mask value_mask_;
+  uint64_t value_size_;
+  std::unique_ptr<char[]> lookup_key_heap_buf;
+  LookupKey* lookup_key_ptr_;
+#if USE_COROUTINES
+  AsyncFileReader reader_;
+  SingleThreadExecutor executor_;
+#endif  // USE_COROUTINES
+
+ public:
+  // MultiGetContext::Range - Specifies a range of keys, by start and end
+  // index, from the parent MultiGetContext. Each range contains a bit vector
+  // that indicates whether the corresponding keys need to be processed or
+  // skipped. A Range object can be copy constructed, and the new object
+  // inherits the original Range's bit vector. This is useful for progressively
+  // skipping keys as the lookup goes through various stages. For example, when
+  // looking up keys in the same SST file, a Range is created excluding keys
+  // not belonging to that file. A new Range is then copy constructed and
+  // individual keys are skipped based on bloom filter lookup.
+  class Range {
+   public:
+    // MultiGetContext::Range::Iterator - A forward iterator that iterates over
+    // non-skippable keys in a Range, as well as keys whose final value has
+    // been found.
The latter is tracked by MultiGetContext::value_mask_ + class Iterator { + public: + // -- iterator traits + using self_type = Iterator; + using value_type = KeyContext; + using reference = KeyContext&; + using pointer = KeyContext*; + using difference_type = int; + using iterator_category = std::forward_iterator_tag; + + Iterator(const Range* range, size_t idx) + : range_(range), ctx_(range->ctx_), index_(idx) { + while (index_ < range_->end_ && + (Mask{1} << index_) & + (range_->ctx_->value_mask_ | range_->skip_mask_ | + range_->invalid_mask_)) + index_++; + } + + Iterator(const Iterator&) = default; + + Iterator(const Iterator& other, const Range* range) + : range_(range), ctx_(other.ctx_), index_(other.index_) { + assert(range->ctx_ == other.ctx_); + } + Iterator& operator=(const Iterator&) = default; + + Iterator& operator++() { + while (++index_ < range_->end_ && + (Mask{1} << index_) & + (range_->ctx_->value_mask_ | range_->skip_mask_ | + range_->invalid_mask_)) + ; + return *this; + } + + bool operator==(Iterator other) const { + assert(range_->ctx_ == other.range_->ctx_); + return index_ == other.index_; + } + + bool operator!=(Iterator other) const { + assert(range_->ctx_ == other.range_->ctx_); + return index_ != other.index_; + } + + KeyContext& operator*() { + assert(index_ < range_->end_ && index_ >= range_->start_); + return *(ctx_->sorted_keys_[index_]); + } + + KeyContext* operator->() { + assert(index_ < range_->end_ && index_ >= range_->start_); + return ctx_->sorted_keys_[index_]; + } + + size_t index() { return index_; } + + private: + friend Range; + const Range* range_; + const MultiGetContext* ctx_; + size_t index_; + }; + + Range(const Range& mget_range, const Iterator& first, + const Iterator& last) { + ctx_ = mget_range.ctx_; + if (first == last) { + // This means create an empty range based on mget_range. 
So just
+      // set start_ and end_ to the same value
+      start_ = mget_range.start_;
+      end_ = start_;
+    } else {
+      start_ = first.index_;
+      end_ = last.index_;
+    }
+    skip_mask_ = mget_range.skip_mask_;
+    invalid_mask_ = mget_range.invalid_mask_;
+    assert(start_ < 64);
+    assert(end_ < 64);
+  }
+
+  Range() = default;
+
+  Iterator begin() const { return Iterator(this, start_); }
+
+  Iterator end() const { return Iterator(this, end_); }
+
+  bool empty() const { return RemainingMask() == 0; }
+
+  void SkipIndex(size_t index) { skip_mask_ |= Mask{1} << index; }
+
+  void SkipKey(const Iterator& iter) { SkipIndex(iter.index_); }
+
+  bool IsKeySkipped(const Iterator& iter) const {
+    return skip_mask_ & (Mask{1} << iter.index_);
+  }
+
+  // Update the value_mask_ in MultiGetContext so it is
+  // immediately reflected in all the Range Iterators
+  void MarkKeyDone(Iterator& iter) {
+    ctx_->value_mask_ |= (Mask{1} << iter.index_);
+  }
+
+  bool CheckKeyDone(Iterator& iter) const {
+    return ctx_->value_mask_ & (Mask{1} << iter.index_);
+  }
+
+  uint64_t KeysLeft() const { return BitsSetToOne(RemainingMask()); }
+
+  void AddSkipsFrom(const Range& other) {
+    assert(ctx_ == other.ctx_);
+    skip_mask_ |= other.skip_mask_;
+  }
+
+  uint64_t GetValueSize() { return ctx_->value_size_; }
+
+  void AddValueSize(uint64_t value_size) { ctx_->value_size_ += value_size; }
+
+  MultiGetContext* context() const { return ctx_; }
+
+  Range Suffix(const Range& other) const {
+    size_t other_last = other.FindLastRemaining();
+    size_t my_last = FindLastRemaining();
+
+    if (my_last > other_last) {
+      return Range(*this, Iterator(this, other_last),
+                   Iterator(this, my_last));
+    } else {
+      return Range(*this, begin(), begin());
+    }
+  }
+
+  // The += operator expands the number of keys in this range. The expansion
+  // is always to the right, i.e. start of the additional range >= end of
+  // current range. There should be no overlap. Any skipped keys in rhs are
+  // marked as invalid in the invalid_mask_.
+  Range& operator+=(const Range& rhs) {
+    assert(rhs.start_ >= end_);
+    // Check for non-overlapping ranges and adjust invalid_mask_ accordingly
+    if (end_ < rhs.start_) {
+      invalid_mask_ |= RangeMask(end_, rhs.start_);
+      skip_mask_ |= RangeMask(end_, rhs.start_);
+    }
+    start_ = std::min(start_, rhs.start_);
+    end_ = std::max(end_, rhs.end_);
+    skip_mask_ |= rhs.skip_mask_ & RangeMask(rhs.start_, rhs.end_);
+    invalid_mask_ |= (rhs.invalid_mask_ | rhs.skip_mask_) &
+                     RangeMask(rhs.start_, rhs.end_);
+    assert(start_ < 64);
+    assert(end_ < 64);
+    return *this;
+  }
+
+  // The -= operator removes keys from this range. The removed keys should
+  // come from a range completely overlapping the current range. The removed
+  // keys are marked invalid in the invalid_mask_.
+ Range& operator-=(const Range& rhs) { + assert(start_ <= rhs.start_ && end_ >= rhs.end_); + skip_mask_ |= (~rhs.skip_mask_ | rhs.invalid_mask_) & + RangeMask(rhs.start_, rhs.end_); + invalid_mask_ |= (~rhs.skip_mask_ | rhs.invalid_mask_) & + RangeMask(rhs.start_, rhs.end_); + return *this; + } + + // Return a complement of the current range + Range operator~() { + Range res = *this; + res.skip_mask_ = ~skip_mask_ & RangeMask(start_, end_); + return res; + } + + private: + friend MultiGetContext; + MultiGetContext* ctx_; + size_t start_; + size_t end_; + Mask skip_mask_; + Mask invalid_mask_; + + Range(MultiGetContext* ctx, size_t num_keys) + : ctx_(ctx), + start_(0), + end_(num_keys), + skip_mask_(0), + invalid_mask_(0) { + assert(num_keys < 64); + } + + static Mask RangeMask(size_t start, size_t end) { + return (((Mask{1} << (end - start)) - 1) << start); + } + + Mask RemainingMask() const { + return (((Mask{1} << end_) - 1) & ~((Mask{1} << start_) - 1) & + ~(ctx_->value_mask_ | skip_mask_)); + } + + size_t FindLastRemaining() const { + Mask mask = RemainingMask(); + size_t index = (mask >>= start_) ? start_ : 0; + while (mask >>= 1) { + index++; + } + return index; + } + }; + + // Return the initial range that encompasses all the keys in the batch + Range GetMultiGetRange() { return Range(this, num_keys_); } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/persistent_cache_helper.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/persistent_cache_helper.cc new file mode 100644 index 0000000000..eece8100e6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/persistent_cache_helper.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
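+
+// Illustrative usage sketch (editorial note, not part of the upstream
+// RocksDB source): a block read path would consult this helper roughly as
+// follows. `cache_options`, `handle`, and `ReadBlockFromFile` are assumed
+// names for the example; only the PersistentCacheHelper entry points below
+// are real.
+//
+//   std::unique_ptr<char[]> data;
+//   size_t expected =
+//       static_cast<size_t>(handle.size()) + BlockBasedTable::kBlockTrailerSize;
+//   Status s = PersistentCacheHelper::LookupSerialized(cache_options, handle,
+//                                                      &data, expected);
+//   if (!s.ok()) {
+//     // Cache miss: fall back to the file, then warm the cache so the
+//     // next lookup of the same block handle can hit.
+//     s = ReadBlockFromFile(handle, &data);
+//     if (s.ok()) {
+//       PersistentCacheHelper::InsertSerialized(cache_options, handle,
+//                                               data.get(), expected);
+//     }
+//   }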
+ +#include "table/persistent_cache_helper.h" + +#include "table/block_based/block_based_table_reader.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +const PersistentCacheOptions PersistentCacheOptions::kEmpty; + +void PersistentCacheHelper::InsertSerialized( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + const char* data, const size_t size) { + assert(cache_options.persistent_cache); + assert(cache_options.persistent_cache->IsCompressed()); + + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + + cache_options.persistent_cache->Insert(key.AsSlice(), data, size) + .PermitUncheckedError(); +} + +void PersistentCacheHelper::InsertUncompressed( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + const BlockContents& contents) { + assert(cache_options.persistent_cache); + assert(!cache_options.persistent_cache->IsCompressed()); + // Precondition: + // (1) content is cacheable + // (2) content is not compressed + + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + + cache_options.persistent_cache + ->Insert(key.AsSlice(), contents.data.data(), contents.data.size()) + .PermitUncheckedError(); + ; +} + +Status PersistentCacheHelper::LookupSerialized( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + std::unique_ptr* out_data, const size_t expected_data_size) { +#ifdef NDEBUG + (void)expected_data_size; +#endif + assert(cache_options.persistent_cache); + assert(cache_options.persistent_cache->IsCompressed()); + + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + + size_t size; + Status s = + cache_options.persistent_cache->Lookup(key.AsSlice(), out_data, &size); + if (!s.ok()) { + // cache miss + RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS); + return s; + } + + // cache hit + // Block-based table is assumed + assert(expected_data_size == + handle.size() + BlockBasedTable::kBlockTrailerSize); + assert(size == expected_data_size); + RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT); + return Status::OK(); +} + +Status PersistentCacheHelper::LookupUncompressed( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + BlockContents* contents) { + assert(cache_options.persistent_cache); + assert(!cache_options.persistent_cache->IsCompressed()); + if (!contents) { + // We shouldn't lookup in the cache. 
Either + // (1) Nowhere to store + return Status::NotFound(); + } + + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + + std::unique_ptr data; + size_t size; + Status s = + cache_options.persistent_cache->Lookup(key.AsSlice(), &data, &size); + if (!s.ok()) { + // cache miss + RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS); + return s; + } + + // please note we are potentially comparing compressed data size with + // uncompressed data size + assert(handle.size() <= size); + + // update stats + RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT); + // construct result and return + *contents = BlockContents(std::move(data), size); + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/persistent_cache_helper.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/persistent_cache_helper.h new file mode 100644 index 0000000000..cce4092cde --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/persistent_cache_helper.h @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include + +#include "monitoring/statistics_impl.h" +#include "table/format.h" +#include "table/persistent_cache_options.h" + +namespace ROCKSDB_NAMESPACE { + +struct BlockContents; + +// PersistentCacheHelper +// +// Encapsulates some of the helper logic for read and writing from the cache +class PersistentCacheHelper { + public: + // Insert block into cache of serialized blocks. Size includes block trailer + // (if applicable). + static void InsertSerialized(const PersistentCacheOptions& cache_options, + const BlockHandle& handle, const char* data, + const size_t size); + + // Insert block into cache of uncompressed blocks. No block trailer. + static void InsertUncompressed(const PersistentCacheOptions& cache_options, + const BlockHandle& handle, + const BlockContents& contents); + + // Lookup block from cache of serialized blocks. Size includes block trailer + // (if applicable). + static Status LookupSerialized(const PersistentCacheOptions& cache_options, + const BlockHandle& handle, + std::unique_ptr* out_data, + const size_t expected_data_size); + + // Lookup block from uncompressed cache. No block trailer. + static Status LookupUncompressed(const PersistentCacheOptions& cache_options, + const BlockHandle& handle, + BlockContents* contents); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/persistent_cache_options.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/persistent_cache_options.h new file mode 100644 index 0000000000..46f2687979 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/persistent_cache_options.h @@ -0,0 +1,34 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
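+
+// Illustrative sketch (editorial note, not part of the upstream RocksDB
+// source): constructing the options struct declared below. The `cache` and
+// `base_key` values are assumptions; in practice they are owned by the
+// table reader that derives per-file cache keys.
+//
+//   std::shared_ptr<PersistentCache> cache = ...;  // user-provided
+//   OffsetableCacheKey base_key = ...;             // derived per table file
+//   PersistentCacheOptions opts(cache, base_key, /*_statistics=*/nullptr);
+//   // Readers with no persistent cache configured can pass
+//   // PersistentCacheOptions::kEmpty instead.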
+#pragma once + +#include + +#include "cache/cache_key.h" +#include "monitoring/statistics_impl.h" +#include "rocksdb/persistent_cache.h" + +namespace ROCKSDB_NAMESPACE { + +// PersistentCacheOptions +// +// This describe the caching behavior for page cache +// This is used to pass the context for caching and the cache handle +struct PersistentCacheOptions { + PersistentCacheOptions() {} + explicit PersistentCacheOptions( + const std::shared_ptr& _persistent_cache, + const OffsetableCacheKey& _base_cache_key, Statistics* const _statistics) + : persistent_cache(_persistent_cache), + base_cache_key(_base_cache_key), + statistics(_statistics) {} + std::shared_ptr persistent_cache; + OffsetableCacheKey base_cache_key; + Statistics* statistics = nullptr; + + static const PersistentCacheOptions kEmpty; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_bloom.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_bloom.cc new file mode 100644 index 0000000000..21441f6161 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_bloom.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/plain/plain_table_bloom.h" + +#include +#include + +#include "memory/allocator.h" +#include "util/dynamic_bloom.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +uint32_t GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_blocks = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_blocks an odd number to make sure more bits are involved + // when determining which block. + if (num_blocks % 2 == 0) { + num_blocks++; + } + + return num_blocks * (CACHE_LINE_SIZE * 8); +} +} // namespace + +PlainTableBloomV1::PlainTableBloomV1(uint32_t num_probes) + : kTotalBits(0), kNumBlocks(0), kNumProbes(num_probes), data_(nullptr) {} + +void PlainTableBloomV1::SetRawData(char* raw_data, uint32_t total_bits, + uint32_t num_blocks) { + data_ = raw_data; + kTotalBits = total_bits; + kNumBlocks = num_blocks; +} + +void PlainTableBloomV1::SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, + size_t huge_page_tlb_size, + Logger* logger) { + kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits) + : (total_bits + 7) / 8 * 8; + kNumBlocks = (locality > 0) ? 
(kTotalBits / (CACHE_LINE_SIZE * 8)) : 0; + + assert(kNumBlocks > 0 || kTotalBits > 0); + assert(kNumProbes > 0); + + uint32_t sz = kTotalBits / 8; + if (kNumBlocks > 0) { + sz += CACHE_LINE_SIZE - 1; + } + assert(allocator); + + char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger); + memset(raw, 0, sz); + auto cache_line_offset = reinterpret_cast(raw) % CACHE_LINE_SIZE; + if (kNumBlocks > 0 && cache_line_offset > 0) { + raw += CACHE_LINE_SIZE - cache_line_offset; + } + data_ = raw; +} + +void BloomBlockBuilder::AddKeysHashes( + const std::vector& keys_hashes) { + for (auto hash : keys_hashes) { + bloom_.AddHash(hash); + } +} + +Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); } + +const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock"; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_bloom.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_bloom.h new file mode 100644 index 0000000000..460e7ec390 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_bloom.h @@ -0,0 +1,132 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include +#include + +#include "port/port.h" +#include "rocksdb/slice.h" +#include "util/bloom_impl.h" +#include "util/hash.h" +#include "util/math.h" + +namespace ROCKSDB_NAMESPACE { +class Slice; +class Allocator; +class Logger; + +// A legacy Bloom filter implementation used by Plain Table db format, for +// schema backward compatibility. Not for use in new filter applications. +class PlainTableBloomV1 { + public: + // allocator: pass allocator to bloom filter, hence trace the usage of memory + // total_bits: fixed total bits for the bloom + // num_probes: number of hash probes for a single key + // locality: If positive, optimize for cache line locality, 0 otherwise. + // hash_func: customized hash function + // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB + // within this page size. Need to reserve huge pages for + // it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + explicit PlainTableBloomV1(uint32_t num_probes = 6); + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger); + + ~PlainTableBloomV1() {} + + // Assuming single threaded access to this function. 
+ void AddHash(uint32_t hash); + + // Multithreaded access to this function is OK + bool MayContainHash(uint32_t hash) const; + + void Prefetch(uint32_t hash); + + uint32_t GetNumBlocks() const { return kNumBlocks; } + + Slice GetRawData() const { return Slice(data_, GetTotalBits() / 8); } + + void SetRawData(char* raw_data, uint32_t total_bits, uint32_t num_blocks = 0); + + uint32_t GetTotalBits() const { return kTotalBits; } + + bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; } + + private: + uint32_t kTotalBits; + uint32_t kNumBlocks; + const uint32_t kNumProbes; + + char* data_; + + static constexpr int LOG2_CACHE_LINE_SIZE = + ConstexprFloorLog2(CACHE_LINE_SIZE); +}; + +#if defined(_MSC_VER) +#pragma warning(push) +// local variable is initialized but not referenced +#pragma warning(disable : 4189) +#endif +inline void PlainTableBloomV1::Prefetch(uint32_t h) { + if (kNumBlocks != 0) { + uint32_t ignored; + LegacyLocalityBloomImpl::PrepareHashMayMatch( + h, kNumBlocks, data_, &ignored, LOG2_CACHE_LINE_SIZE); + } +} +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +inline bool PlainTableBloomV1::MayContainHash(uint32_t h) const { + assert(IsInitialized()); + if (kNumBlocks != 0) { + return LegacyLocalityBloomImpl::HashMayMatch( + h, kNumBlocks, kNumProbes, data_, LOG2_CACHE_LINE_SIZE); + } else { + return LegacyNoLocalityBloomImpl::HashMayMatch(h, kTotalBits, kNumProbes, + data_); + } +} + +inline void PlainTableBloomV1::AddHash(uint32_t h) { + assert(IsInitialized()); + if (kNumBlocks != 0) { + LegacyLocalityBloomImpl::AddHash(h, kNumBlocks, kNumProbes, data_, + LOG2_CACHE_LINE_SIZE); + } else { + LegacyNoLocalityBloomImpl::AddHash(h, kTotalBits, kNumProbes, data_); + } +} + +class BloomBlockBuilder { + public: + static const std::string kBloomBlock; + + explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {} + + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger) { + bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, + logger); + } + + uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); } + + void AddKeysHashes(const std::vector& keys_hashes); + + Slice Finish(); + + private: + PlainTableBloomV1 bloom_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_builder.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_builder.cc new file mode 100644 index 0000000000..126098a868 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_builder.cc @@ -0,0 +1,335 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
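+
+// Illustrative sketch (editorial note, not part of the upstream RocksDB
+// source): this builder drives PlainTableBloomV1 from plain_table_bloom.h
+// in roughly this order. `arena`, `logger`, `hashes`, `num_keys`, and
+// `bits_per_key` are assumed names for the example.
+//
+//   PlainTableBloomV1 bloom(/*num_probes=*/6);
+//   bloom.SetTotalBits(&arena, /*total_bits=*/num_keys * bits_per_key,
+//                      /*locality=*/1, /*huge_page_tlb_size=*/0, logger);
+//   for (uint32_t h : hashes) {
+//     bloom.AddHash(h);  // build phase; single-threaded by contract
+//   }
+//   // Query phase may be concurrent; false positives are possible, but a
+//   // hash that was added is never reported absent.
+//   bool may_match = bloom.MayContainHash(some_hash);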
+ +#include "table/plain/plain_table_builder.h" + +#include + +#include +#include +#include + +#include "db/dbformat.h" +#include "file/writable_file_writer.h" +#include "logging/logging.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block_based/block_builder.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_index.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// a utility that helps writing block content to the file +// @offset will advance if @block_contents was successfully written. +// @block_handle the block handle this particular block. +IOStatus WriteBlock(const Slice& block_contents, WritableFileWriter* file, + uint64_t* offset, BlockHandle* block_handle) { + block_handle->set_offset(*offset); + block_handle->set_size(block_contents.size()); + IOStatus io_s = file->Append(block_contents); + + if (io_s.ok()) { + *offset += block_contents.size(); + } + return io_s; +} + +} // namespace + +// kPlainTableMagicNumber was picked by running +// echo rocksdb.table.plain | sha1sum +// and taking the leading 64 bits. +extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; +extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; + +PlainTableBuilder::PlainTableBuilder( + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + uint32_t column_family_id, int level_at_creation, WritableFileWriter* file, + uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, + uint32_t bloom_bits_per_key, const std::string& column_family_name, + uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, + bool store_index_in_file, const std::string& db_id, + const std::string& db_session_id, uint64_t file_number) + : ioptions_(ioptions), + moptions_(moptions), + bloom_block_(num_probes), + file_(file), + bloom_bits_per_key_(bloom_bits_per_key), + huge_page_tlb_size_(huge_page_tlb_size), + encoder_(encoding_type, user_key_len, moptions.prefix_extractor.get(), + index_sparseness), + store_index_in_file_(store_index_in_file), + prefix_extractor_(moptions.prefix_extractor.get()) { + // Build index block and save it in the file if hash_table_ratio > 0 + if (store_index_in_file_) { + assert(hash_table_ratio > 0 || IsTotalOrderMode()); + index_builder_.reset(new PlainTableIndexBuilder( + &arena_, ioptions, moptions.prefix_extractor.get(), index_sparseness, + hash_table_ratio, huge_page_tlb_size_)); + properties_ + .user_collected_properties[PlainTablePropertyNames::kBloomVersion] = + "1"; // For future use + } + + properties_.fixed_key_len = user_key_len; + + // for plain table, we put all the data in a big chuck. + properties_.num_data_blocks = 1; + // Fill it later if store_index_in_file_ == true + properties_.index_size = 0; + properties_.filter_size = 0; + // To support roll-back to previous version, now still use version 0 for + // plain encoding. + properties_.format_version = (encoding_type == kPlain) ? 
0 : 1; + properties_.column_family_id = column_family_id; + properties_.column_family_name = column_family_name; + properties_.db_id = db_id; + properties_.db_session_id = db_session_id; + properties_.db_host_id = ioptions.db_host_id; + if (!ReifyDbHostIdProperty(ioptions_.env, &properties_.db_host_id).ok()) { + ROCKS_LOG_INFO(ioptions_.logger, "db_host_id property will not be set"); + } + properties_.orig_file_number = file_number; + properties_.prefix_extractor_name = + moptions_.prefix_extractor != nullptr + ? moptions_.prefix_extractor->AsString() + : "nullptr"; + + std::string val; + PutFixed32(&val, static_cast(encoder_.GetEncodingType())); + properties_ + .user_collected_properties[PlainTablePropertyNames::kEncodingType] = val; + + assert(int_tbl_prop_collector_factories); + for (auto& factory : *int_tbl_prop_collector_factories) { + assert(factory); + + table_properties_collectors_.emplace_back( + factory->CreateIntTblPropCollector(column_family_id, + level_at_creation)); + } +} + +PlainTableBuilder::~PlainTableBuilder() { + // They are supposed to have been passed to users through Finish() + // if the file succeeds. + status_.PermitUncheckedError(); + io_status_.PermitUncheckedError(); +} + +void PlainTableBuilder::Add(const Slice& key, const Slice& value) { + // temp buffer for metadata bytes between key and value. + char meta_bytes_buf[6]; + size_t meta_bytes_buf_size = 0; + + ParsedInternalKey internal_key; + if (!ParseInternalKey(key, &internal_key, false /* log_err_key */) + .ok()) { // TODO + assert(false); + return; + } + if (internal_key.type == kTypeRangeDeletion) { + status_ = Status::NotSupported("Range deletion unsupported"); + return; + } + + // Store key hash + if (store_index_in_file_) { + if (moptions_.prefix_extractor == nullptr) { + keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key)); + } else { + Slice prefix = + moptions_.prefix_extractor->Transform(internal_key.user_key); + keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix)); + } + } + + // Write value + assert(offset_ <= std::numeric_limits::max()); + auto prev_offset = static_cast(offset_); + // Write out the key + io_status_ = encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf, + &meta_bytes_buf_size); + if (SaveIndexInFile()) { + index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset); + } + + // Write value length + uint32_t value_size = static_cast(value.size()); + if (io_status_.ok()) { + char* end_ptr = + EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size); + assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf)); + meta_bytes_buf_size = end_ptr - meta_bytes_buf; + io_status_ = file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size)); + } + + // Write value + if (io_status_.ok()) { + io_status_ = file_->Append(value); + offset_ += value_size + meta_bytes_buf_size; + } + + if (io_status_.ok()) { + properties_.num_entries++; + properties_.raw_key_size += key.size(); + properties_.raw_value_size += value.size(); + if (internal_key.type == kTypeDeletion || + internal_key.type == kTypeSingleDeletion) { + properties_.num_deletions++; + } else if (internal_key.type == kTypeMerge) { + properties_.num_merge_operands++; + } + } + + // notify property collectors + NotifyCollectTableCollectorsOnAdd( + key, value, offset_, table_properties_collectors_, ioptions_.logger); + status_ = io_status_; +} + +Status PlainTableBuilder::Finish() { + assert(!closed_); + closed_ = true; + + properties_.data_size = offset_; + + // Write the following blocks + // 1. 
[meta block: bloom] - optional + // 2. [meta block: index] - optional + // 3. [meta block: properties] + // 4. [metaindex block] + // 5. [footer] + + MetaIndexBuilder meta_index_builer; + + if (store_index_in_file_ && (properties_.num_entries > 0)) { + assert(properties_.num_entries <= std::numeric_limits::max()); + BlockHandle bloom_block_handle; + if (bloom_bits_per_key_ > 0) { + bloom_block_.SetTotalBits( + &arena_, + static_cast(properties_.num_entries) * bloom_bits_per_key_, + ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.logger); + + PutVarint32(&properties_.user_collected_properties + [PlainTablePropertyNames::kNumBloomBlocks], + bloom_block_.GetNumBlocks()); + + bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_); + + Slice bloom_finish_result = bloom_block_.Finish(); + + properties_.filter_size = bloom_finish_result.size(); + io_status_ = + WriteBlock(bloom_finish_result, file_, &offset_, &bloom_block_handle); + + if (!io_status_.ok()) { + status_ = io_status_; + return status_; + } + meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle); + } + BlockHandle index_block_handle; + Slice index_finish_result = index_builder_->Finish(); + + properties_.index_size = index_finish_result.size(); + io_status_ = + WriteBlock(index_finish_result, file_, &offset_, &index_block_handle); + + if (!io_status_.ok()) { + status_ = io_status_; + return status_; + } + + meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock, + index_block_handle); + } + + // Calculate bloom block size and index block size + PropertyBlockBuilder property_block_builder; + // -- Add basic properties + property_block_builder.AddTableProperty(properties_); + + property_block_builder.Add(properties_.user_collected_properties); + + // -- Add user collected properties + NotifyCollectTableCollectorsOnFinish( + table_properties_collectors_, ioptions_.logger, &property_block_builder); + + // -- Write property block + BlockHandle property_block_handle; + IOStatus s = WriteBlock(property_block_builder.Finish(), file_, &offset_, + &property_block_handle); + if (!s.ok()) { + return static_cast(s); + } + meta_index_builer.Add(kPropertiesBlockName, property_block_handle); + + // -- write metaindex block + BlockHandle metaindex_block_handle; + io_status_ = WriteBlock(meta_index_builer.Finish(), file_, &offset_, + &metaindex_block_handle); + if (!io_status_.ok()) { + status_ = io_status_; + return status_; + } + + // Write Footer + // no need to write out new footer if we're using default checksum + FooterBuilder footer; + footer.Build(kPlainTableMagicNumber, /* format_version */ 0, offset_, + kNoChecksum, metaindex_block_handle); + io_status_ = file_->Append(footer.GetSlice()); + if (io_status_.ok()) { + offset_ += footer.GetSlice().size(); + } + status_ = io_status_; + return status_; +} + +void PlainTableBuilder::Abandon() { closed_ = true; } + +uint64_t PlainTableBuilder::NumEntries() const { + return properties_.num_entries; +} + +uint64_t PlainTableBuilder::FileSize() const { return offset_; } + +std::string PlainTableBuilder::GetFileChecksum() const { + if (file_ != nullptr) { + return file_->GetFileChecksum(); + } else { + return kUnknownFileChecksum; + } +} + +const char* PlainTableBuilder::GetFileChecksumFuncName() const { + if (file_ != nullptr) { + return file_->GetFileChecksumFuncName(); + } else { + return kUnknownFileChecksumFuncName; + } +} +void PlainTableBuilder::SetSeqnoTimeTableProperties(const std::string& string, + uint64_t uint_64) { + // TODO: storing seqno to time 
mapping is not yet supported for plain table.
+  TableBuilder::SetSeqnoTimeTableProperties(string, uint_64);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_builder.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_builder.h
new file mode 100644
index 0000000000..863882baf2
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_builder.h
@@ -0,0 +1,152 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_index.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/table_builder.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+class TableBuilder;
+
+// The builder class of PlainTable. For description of PlainTable format
+// See comments of class PlainTableFactory, where instances of
+// PlainTableReader are created.
+class PlainTableBuilder : public TableBuilder {
+ public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file. Does not close the file. It is up to the
+  // caller to close the file after calling Finish(). The output file
+  // will be part of level specified by 'level'. A value of -1 means
+  // that the caller does not know which level the output file will reside in.
+  PlainTableBuilder(
+      const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
+      const IntTblPropCollectorFactories* int_tbl_prop_collector_factories,
+      uint32_t column_family_id, int level_at_creation,
+      WritableFileWriter* file, uint32_t user_key_size,
+      EncodingType encoding_type, size_t index_sparseness,
+      uint32_t bloom_bits_per_key, const std::string& column_family_name,
+      uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
+      double hash_table_ratio = 0, bool store_index_in_file = false,
+      const std::string& db_id = "", const std::string& db_session_id = "",
+      uint64_t file_number = 0);
+  // No copying allowed
+  PlainTableBuilder(const PlainTableBuilder&) = delete;
+  void operator=(const PlainTableBuilder&) = delete;
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~PlainTableBuilder();
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override { return status_; }
+
+  // Return non-ok iff some error happens during IO.
+  IOStatus io_status() const override { return io_status_; }
+
+  // Finish building the table. Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned. Stops
+  // using the file passed to the constructor after this function returns.
+ // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + TableProperties GetTableProperties() const override { return properties_; } + + bool SaveIndexInFile() const { return store_index_in_file_; } + + // Get file checksum + std::string GetFileChecksum() const override; + + // Get file checksum function name + const char* GetFileChecksumFuncName() const override; + + void SetSeqnoTimeTableProperties(const std::string& string, + uint64_t uint_64) override; + + private: + Arena arena_; + const ImmutableOptions& ioptions_; + const MutableCFOptions& moptions_; + std::vector> + table_properties_collectors_; + + BloomBlockBuilder bloom_block_; + std::unique_ptr index_builder_; + + WritableFileWriter* file_; + uint64_t offset_ = 0; + uint32_t bloom_bits_per_key_; + size_t huge_page_tlb_size_; + Status status_; + IOStatus io_status_; + TableProperties properties_; + PlainTableKeyEncoder encoder_; + + bool store_index_in_file_; + + std::vector keys_or_prefixes_hashes_; + bool closed_ = false; // Either Finish() or Abandon() has been called. + + const SliceTransform* prefix_extractor_; + + Slice GetPrefix(const Slice& target) const { + assert(target.size() >= 8); // target is internal key + return GetPrefixFromUserKey(ExtractUserKey(target)); + } + + Slice GetPrefix(const ParsedInternalKey& target) const { + return GetPrefixFromUserKey(target.user_key); + } + + Slice GetPrefixFromUserKey(const Slice& user_key) const { + if (!IsTotalOrderMode()) { + return prefix_extractor_->Transform(user_key); + } else { + // Use empty slice as prefix if prefix_extractor is not set. + // In that case, + // it falls back to pure binary search and + // total iterator seek is supported. + return Slice(); + } + } + + bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_factory.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_factory.cc new file mode 100644 index 0000000000..80aa9cb8e8 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_factory.cc @@ -0,0 +1,296 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
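+
+// Illustrative sketch (editorial note, not part of the upstream RocksDB
+// source): the lifecycle contract of the PlainTableBuilder declared above,
+// as handed out by PlainTableFactory::NewTableBuilder() below. `factory`,
+// `opts`, `writer`, and `entries` are assumed names for the example.
+//
+//   std::unique_ptr<TableBuilder> builder(
+//       factory->NewTableBuilder(opts, writer));
+//   for (const auto& kv : entries) {  // must already be sorted by comparator
+//     builder->Add(kv.first, kv.second);
+//   }
+//   Status s = builder->status();
+//   if (s.ok()) {
+//     s = builder->Finish();  // writes bloom/index/properties/footer
+//   } else {
+//     builder->Abandon();     // required when Finish() will not be called
+//   }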
+ +#include "table/plain/plain_table_factory.h" + +#include + +#include + +#include "db/dbformat.h" +#include "port/port.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "table/plain/plain_table_builder.h" +#include "table/plain/plain_table_reader.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +static std::unordered_map plain_table_type_info = { + {"user_key_len", + {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"bloom_bits_per_key", + {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"hash_table_ratio", + {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"index_sparseness", + {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"huge_page_tlb_size", + {offsetof(struct PlainTableOptions, huge_page_tlb_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"encoding_type", + {offsetof(struct PlainTableOptions, encoding_type), + OptionType::kEncodingType, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"full_scan_mode", + {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"store_index_in_file", + {offsetof(struct PlainTableOptions, store_index_in_file), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +PlainTableFactory::PlainTableFactory(const PlainTableOptions& options) + : table_options_(options) { + RegisterOptions(&table_options_, &plain_table_type_info); +} + +Status PlainTableFactory::NewTableReader( + const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table, + bool /*prefetch_index_and_filter_in_cache*/) const { + return PlainTableReader::Open( + table_reader_options.ioptions, table_reader_options.env_options, + table_reader_options.internal_comparator, std::move(file), file_size, + table, table_options_.bloom_bits_per_key, table_options_.hash_table_ratio, + table_options_.index_sparseness, table_options_.huge_page_tlb_size, + table_options_.full_scan_mode, table_reader_options.immortal, + table_reader_options.prefix_extractor.get()); +} + +TableBuilder* PlainTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const { + // Ignore the skip_filters flag. PlainTable format is optimized for small + // in-memory dbs. 
The skip_filters optimization is not useful for plain + // tables + // + return new PlainTableBuilder( + table_builder_options.ioptions, table_builder_options.moptions, + table_builder_options.int_tbl_prop_collector_factories, + table_builder_options.column_family_id, + table_builder_options.level_at_creation, file, + table_options_.user_key_len, table_options_.encoding_type, + table_options_.index_sparseness, table_options_.bloom_bits_per_key, + table_builder_options.column_family_name, 6, + table_options_.huge_page_tlb_size, table_options_.hash_table_ratio, + table_options_.store_index_in_file, table_builder_options.db_id, + table_builder_options.db_session_id, table_builder_options.cur_file_num); +} + +std::string PlainTableFactory::GetPrintableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " user_key_len: %u\n", + table_options_.user_key_len); + ret.append(buffer); + snprintf(buffer, kBufferSize, " bloom_bits_per_key: %d\n", + table_options_.bloom_bits_per_key); + ret.append(buffer); + snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", + table_options_.hash_table_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_sparseness: %" ROCKSDB_PRIszt "\n", + table_options_.index_sparseness); + ret.append(buffer); + snprintf(buffer, kBufferSize, " huge_page_tlb_size: %" ROCKSDB_PRIszt "\n", + table_options_.huge_page_tlb_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " encoding_type: %d\n", + table_options_.encoding_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " full_scan_mode: %d\n", + table_options_.full_scan_mode); + ret.append(buffer); + snprintf(buffer, kBufferSize, " store_index_in_file: %d\n", + table_options_.store_index_in_file); + ret.append(buffer); + return ret; +} + +Status GetPlainTableOptionsFromString(const ConfigOptions& config_options, + const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options) { + std::unordered_map opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + + s = GetPlainTableOptionsFromMap(config_options, table_options, opts_map, + new_table_options); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); + } +} + +static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library, + const std::string& /*arg*/) { + // The MemTableRepFactory built-in classes will be either a class + // (VectorRepFactory) or a nickname (vector), followed optionally by ":#", + // where # is the "size" of the factory. 
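+  // For example (illustrative): "vector:1024" creates a VectorRepFactory
+  // reserving 1024 entries and "skip_list:16" a SkipListFactory with a
+  // lookahead of 16, while the bare names use each factory's defaults.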
+ auto AsPattern = [](const std::string& name, const std::string& alt) { + auto pattern = ObjectLibrary::PatternEntry(name, true); + pattern.AnotherName(alt); + pattern.AddNumber(":"); + return pattern; + }; + library.AddFactory( + AsPattern(VectorRepFactory::kClassName(), VectorRepFactory::kNickName()), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t count = ParseSizeT(uri.substr(colon + 1)); + guard->reset(new VectorRepFactory(count)); + } else { + guard->reset(new VectorRepFactory()); + } + return guard->get(); + }); + library.AddFactory( + AsPattern(SkipListFactory::kClassName(), SkipListFactory::kNickName()), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t lookahead = ParseSizeT(uri.substr(colon + 1)); + guard->reset(new SkipListFactory(lookahead)); + } else { + guard->reset(new SkipListFactory()); + } + return guard->get(); + }); + library.AddFactory( + AsPattern("HashLinkListRepFactory", "hash_linkedlist"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + // Expecting format: hash_linkedlist: + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1)); + guard->reset(NewHashLinkListRepFactory(hash_bucket_count)); + } else { + guard->reset(NewHashLinkListRepFactory()); + } + return guard->get(); + }); + library.AddFactory( + AsPattern("HashSkipListRepFactory", "prefix_hash"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + // Expecting format: prefix_hash: + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1)); + guard->reset(NewHashSkipListRepFactory(hash_bucket_count)); + } else { + guard->reset(NewHashSkipListRepFactory()); + } + return guard->get(); + }); + library.AddFactory( + "cuckoo", + [](const std::string& /*uri*/, + std::unique_ptr* /*guard*/, std::string* errmsg) { + *errmsg = "cuckoo hash memtable is not supported anymore."; + return nullptr; + }); + + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} + +Status GetMemTableRepFactoryFromString( + const std::string& opts_str, std::unique_ptr* result) { + ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + config_options.ignore_unknown_options = false; + return MemTableRepFactory::CreateFromString(config_options, opts_str, result); +} + +Status MemTableRepFactory::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::unique_ptr* result) { + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinMemTableRepFactory(*(ObjectLibrary::Default().get()), ""); + }); + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, result->get(), + value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (value.empty()) { + // No Id and no options. Clear the object + result->reset(); + return Status::OK(); + } else if (id.empty()) { // We have no Id but have options. 
Not good + return Status::NotSupported("Cannot reset object ", id); + } else { + status = NewUniqueObject(config_options, id, opt_map, + result); + } + return status; +} + +Status MemTableRepFactory::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* result) { + std::unique_ptr factory; + Status s = CreateFromString(config_options, value, &factory); + if (factory && s.ok()) { + result->reset(factory.release()); + } + return s; +} + +Status GetPlainTableOptionsFromMap( + const ConfigOptions& config_options, const PlainTableOptions& table_options, + const std::unordered_map& opts_map, + PlainTableOptions* new_table_options) { + assert(new_table_options); + PlainTableFactory ptf(table_options); + Status s = ptf.ConfigureFromMap(config_options, opts_map); + if (s.ok()) { + *new_table_options = *(ptf.GetOptions()); + } else { + // Restore "new_options" to the default "base_options". + *new_table_options = table_options; + } + return s; +} + +extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) { + return new PlainTableFactory(options); +} + +const std::string PlainTablePropertyNames::kEncodingType = + "rocksdb.plain.table.encoding.type"; + +const std::string PlainTablePropertyNames::kBloomVersion = + "rocksdb.plain.table.bloom.version"; + +const std::string PlainTablePropertyNames::kNumBloomBlocks = + "rocksdb.plain.table.bloom.numblocks"; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_factory.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_factory.h new file mode 100644 index 0000000000..a47418af69 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_factory.h @@ -0,0 +1,180 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +#include +#include + +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +struct EnvOptions; + +class Status; +class RandomAccessFile; +class WritableFile; +class Table; +class TableBuilder; + +// PlainTableFactory is the entrance function to the PlainTable format of +// SST files. It returns instances PlainTableBuilder as the builder +// class and PlainTableReader as the reader class, where the format is +// actually implemented. +// +// The PlainTable is designed for memory-mapped file systems, e.g. tmpfs. +// Data is not organized in blocks, which allows fast access. Because of +// following downsides +// 1. Data compression is not supported. +// 2. Data is not checksumed. +// it is not recommended to use this format on other type of file systems. +// +// PlainTable requires fixed length key, configured as a constructor +// parameter of the factory class. Output file format: +// +-------------+-----------------+ +// | version | user_key_length | +// +------------++------------+-----------------+ <= key1 offset +// | encoded key1 | value_size | | +// +------------+-------------+-------------+ | +// | value1 | +// | | +// +--------------------------+-------------+---+ <= key2 offset +// | encoded key2 | value_size | | +// +------------+-------------+-------------+ | +// | value2 | +// | | +// | ...... 
| +// +-----------------+--------------------------+ +// +// When the key encoding type is kPlain. Key part is encoded as: +// +------------+--------------------+ +// | [key_size] | internal key | +// +------------+--------------------+ +// for the case of user_key_len = kPlainTableVariableLength case, +// and simply: +// +----------------------+ +// | internal key | +// +----------------------+ +// for user_key_len != kPlainTableVariableLength case. +// +// If key encoding type is kPrefix. Keys are encoding in this format. +// There are three ways to encode a key: +// (1) Full Key +// +---------------+---------------+-------------------+ +// | Full Key Flag | Full Key Size | Full Internal Key | +// +---------------+---------------+-------------------+ +// which simply encodes a full key +// +// (2) A key shared the same prefix as the previous key, which is encoded as +// format of (1). +// +-------------+-------------+-------------+-------------+------------+ +// | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix | +// +-------------+-------------+-------------+-------------+------------+ +// where key is the suffix part of the key, including the internal bytes. +// the actual key will be constructed by concatenating prefix part of the +// previous key, with the suffix part of the key here, with sizes given here. +// +// (3) A key shared the same prefix as the previous key, which is encoded as +// the format of (2). +// +-----------------+-----------------+------------------------+ +// | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key | +// +-----------------+-----------------+------------------------+ +// The key will be constructed by concatenating previous key's prefix (which is +// also a prefix which the last key encoded in the format of (1)) and the +// key given here. +// +// For example, we for following keys (prefix and suffix are separated by +// spaces): +// 0000 0001 +// 0000 00021 +// 0000 0002 +// 00011 00 +// 0002 0001 +// Will be encoded like this: +// FK 8 00000001 +// PF 4 SF 5 00021 +// SF 4 0002 +// FK 7 0001100 +// FK 8 00020001 +// (where FK means full key flag, PF means prefix flag and SF means suffix flag) +// +// All those "key flag + key size" shown above are in this format: +// The 8 bits of the first byte: +// +----+----+----+----+----+----+----+----+ +// | Type | Size | +// +----+----+----+----+----+----+----+----+ +// Type indicates: full key, prefix, or suffix. +// The last 6 bits are for size. If the size bits are not all 1, it means the +// size of the key. Otherwise, varint32 is read after this byte. This varint +// value + 0x3F (the value of all 1) will be the key size. +// +// For example, full key with length 16 will be encoded as (binary): +// 00 010000 +// (00 means full key) +// and a prefix with 100 bytes will be encoded as: +// 01 111111 00100101 +// (63) (37) +// (01 means key suffix) +// +// All the internal keys above (including kPlain and kPrefix) are encoded in +// this format: +// There are two types: +// (1) normal internal key format +// +----------- ...... -------------+----+---+---+---+---+---+---+---+ +// | user key |type| sequence ID | +// +----------- ..... --------------+----+---+---+---+---+---+---+---+ +// (2) Special case for keys whose sequence ID is 0 and is value type +// +----------- ...... -------------+----+ +// | user key |0x80| +// +----------- ..... --------------+----+ +// To save 7 bytes for the special case where sequence ID = 0. 
+//
+//
+class PlainTableFactory : public TableFactory {
+ public:
+  ~PlainTableFactory() {}
+  // user_key_len is the length of the user key. If it is set to be
+  // kPlainTableVariableLength, then it means variable length. Otherwise, all
+  // the keys need to have the fixed length of this value. bloom_bits_per_key
+  // is the number of bits used for the bloom filter per key. hash_table_ratio
+  // is the desired utilization of the hash table used for prefix hashing.
+  // hash_table_ratio = number of prefixes / #buckets in the hash table
+  // hash_table_ratio = 0 means skip the hash table and rely only on binary
+  // search.
+  // index_sparseness determines the index interval for keys
+  // inside the same prefix. It will be the maximum number of linear search
+  // steps required after the hash and binary search.
+  // index_sparseness = 0 means index for every key.
+  // huge_page_tlb_size determines whether to allocate hash indexes from huge
+  // page TLB and the page size if allocating from there. See comments of
+  // Arena::AllocateAligned() for details.
+  explicit PlainTableFactory(
+      const PlainTableOptions& _table_options = PlainTableOptions());
+
+  // Method to allow CheckedCast to work for this class
+  static const char* kClassName() { return kPlainTableName(); }
+  const char* Name() const override { return kPlainTableName(); }
+  using TableFactory::NewTableReader;
+  Status NewTableReader(const ReadOptions& ro,
+                        const TableReaderOptions& table_reader_options,
+                        std::unique_ptr<RandomAccessFileReader>&& file,
+                        uint64_t file_size, std::unique_ptr<TableReader>* table,
+                        bool prefetch_index_and_filter_in_cache) const override;
+
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFileWriter* file) const override;
+
+  std::string GetPrintableOptions() const override;
+  static const char kValueTypeSeqId0 = char(~0);
+
+ private:
+  PlainTableOptions table_options_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_index.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_index.cc
new file mode 100644
index 0000000000..c85176d6ca
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_index.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
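+
+// Illustrative sketch (editorial note, not part of the upstream RocksDB
+// source): interpreting one 32-bit bucket word, matching GetOffset() below.
+// `bucket_value` is an assumed name for the example.
+//
+//   if ((bucket_value & PlainTableIndex::kSubIndexMask) ==
+//       PlainTableIndex::kSubIndexMask) {
+//     // Flag bit set: the remaining 31 bits locate this bucket's entry in
+//     // the binary-search sub-index rather than in the data file.
+//     uint32_t sub_index_offset =
+//         bucket_value ^ PlainTableIndex::kSubIndexMask;
+//   } else if (bucket_value >= PlainTableIndex::kMaxFileSize) {
+//     // No prefix hashes into this bucket.
+//   } else {
+//     // The value is the file offset of the bucket's first row.
+//   }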
+ +#include "table/plain/plain_table_index.h" + +#include + +#include "logging/logging.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { + assert(num_buckets > 0); + return hash % num_buckets; +} +} // namespace + +Status PlainTableIndex::InitFromRawData(Slice data) { + if (!GetVarint32(&data, &index_size_)) { + return Status::Corruption("Couldn't read the index size!"); + } + assert(index_size_ > 0); + if (!GetVarint32(&data, &num_prefixes_)) { + return Status::Corruption("Couldn't read the index size!"); + } + sub_index_size_ = + static_cast(data.size()) - index_size_ * kOffsetLen; + + char* index_data_begin = const_cast(data.data()); + index_ = reinterpret_cast(index_data_begin); + sub_index_ = reinterpret_cast(index_ + index_size_); + return Status::OK(); +} + +PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset( + uint32_t prefix_hash, uint32_t* bucket_value) const { + int bucket = GetBucketIdFromHash(prefix_hash, index_size_); + GetUnaligned(index_ + bucket, bucket_value); + if ((*bucket_value & kSubIndexMask) == kSubIndexMask) { + *bucket_value ^= kSubIndexMask; + return kSubindex; + } + if (*bucket_value >= kMaxFileSize) { + return kNoPrefixForBucket; + } else { + // point directly to the file + return kDirectToFile; + } +} + +void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash, + uint32_t offset) { + if (num_records_in_current_group_ == kNumRecordsPerGroup) { + current_group_ = AllocateNewGroup(); + num_records_in_current_group_ = 0; + } + auto& new_record = current_group_[num_records_in_current_group_++]; + new_record.hash = hash; + new_record.offset = offset; + new_record.next = nullptr; +} + +void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice, + uint32_t key_offset) { + if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) { + ++num_prefixes_; + if (!is_first_record_) { + keys_per_prefix_hist_.Add(num_keys_per_prefix_); + } + num_keys_per_prefix_ = 0; + prev_key_prefix_ = key_prefix_slice.ToString(); + prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice); + due_index_ = true; + } + + if (due_index_) { + // Add an index key for every kIndexIntervalForSamePrefixKeys keys + record_list_.AddRecord(prev_key_prefix_hash_, key_offset); + due_index_ = false; + } + + num_keys_per_prefix_++; + if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) { + due_index_ = true; + } + is_first_record_ = false; +} + +Slice PlainTableIndexBuilder::Finish() { + AllocateIndex(); + std::vector hash_to_offsets(index_size_, nullptr); + std::vector entries_per_bucket(index_size_, 0); + BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); + + keys_per_prefix_hist_.Add(num_keys_per_prefix_); + ROCKS_LOG_INFO(ioptions_.logger, "Number of Keys per prefix Histogram: %s", + keys_per_prefix_hist_.ToString().c_str()); + + // From the temp data structure, populate indexes. + return FillIndexes(hash_to_offsets, entries_per_bucket); +} + +void PlainTableIndexBuilder::AllocateIndex() { + if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) { + // Fall back to pure binary search if the user fails to specify a prefix + // extractor. 
+
+void PlainTableIndexBuilder::AllocateIndex() {
+  if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
+    // Fall back to pure binary search if the user fails to specify a prefix
+    // extractor.
+    index_size_ = 1;
+  } else {
+    double hash_table_size_multiplier = 1.0 / hash_table_ratio_;
+    index_size_ =
+        static_cast<uint32_t>(num_prefixes_ * hash_table_size_multiplier) + 1;
+    assert(index_size_ > 0);
+  }
+}
+
+void PlainTableIndexBuilder::BucketizeIndexes(
+    std::vector<IndexRecord*>* hash_to_offsets,
+    std::vector<uint32_t>* entries_per_bucket) {
+  bool first = true;
+  uint32_t prev_hash = 0;
+  size_t num_records = record_list_.GetNumRecords();
+  for (size_t i = 0; i < num_records; i++) {
+    IndexRecord* index_record = record_list_.At(i);
+    uint32_t cur_hash = index_record->hash;
+    if (first || prev_hash != cur_hash) {
+      prev_hash = cur_hash;
+      first = false;
+    }
+    uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
+    IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
+    index_record->next = prev_bucket_head;
+    (*hash_to_offsets)[bucket] = index_record;
+    (*entries_per_bucket)[bucket]++;
+  }
+
+  sub_index_size_ = 0;
+  for (auto entry_count : *entries_per_bucket) {
+    if (entry_count <= 1) {
+      continue;
+    }
+    // Only buckets with more than 1 entry will have a sub-index.
+    sub_index_size_ += VarintLength(entry_count);
+    // total bytes needed to store these entries' in-file offsets.
+    sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
+  }
+}
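+
+// Illustrative layout note (sizes are examples, not from this patch): the
+// block FillIndexes() emits below is laid out as
+//
+//   varint32  index_size_          e.g. 2 bytes for 1334 buckets
+//   varint32  num_prefixes_        e.g. 2 bytes for 1000 prefixes
+//   uint32_t  bucket[index_size_]  1334 * 4 = 5336 bytes
+//   char      sub_index_[sub_index_size_]
+//
+// which is exactly the sum computed by GetTotalSize():
+//   VarintLength(index_size_) + VarintLength(num_prefixes_) +
+//       PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_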
+
+Slice PlainTableIndexBuilder::FillIndexes(
+    const std::vector<IndexRecord*>& hash_to_offsets,
+    const std::vector<uint32_t>& entries_per_bucket) {
+  ROCKS_LOG_DEBUG(ioptions_.logger,
+                  "Reserving %" PRIu32 " bytes for plain table's sub_index",
+                  sub_index_size_);
+  auto total_allocate_size = GetTotalSize();
+  char* allocated = arena_->AllocateAligned(
+      total_allocate_size, huge_page_tlb_size_, ioptions_.logger);
+
+  auto temp_ptr = EncodeVarint32(allocated, index_size_);
+  uint32_t* index =
+      reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
+  char* sub_index = reinterpret_cast<char*>(index + index_size_);
+
+  uint32_t sub_index_offset = 0;
+  for (uint32_t i = 0; i < index_size_; i++) {
+    uint32_t num_keys_for_bucket = entries_per_bucket[i];
+    switch (num_keys_for_bucket) {
+      case 0:
+        // No key for bucket
+        PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize);
+        break;
+      case 1:
+        // point directly to the file offset
+        PutUnaligned(index + i, hash_to_offsets[i]->offset);
+        break;
+      default:
+        // point to second level indexes.
+        PutUnaligned(index + i,
+                     sub_index_offset | PlainTableIndex::kSubIndexMask);
+        char* prev_ptr = &sub_index[sub_index_offset];
+        char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
+        sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr);
+        char* sub_index_pos = &sub_index[sub_index_offset];
+        IndexRecord* record = hash_to_offsets[i];
+        int j;
+        for (j = num_keys_for_bucket - 1; j >= 0 && record;
+             j--, record = record->next) {
+          EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
+        }
+        assert(j == -1 && record == nullptr);
+        sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
+        assert(sub_index_offset <= sub_index_size_);
+        break;
+    }
+  }
+  assert(sub_index_offset == sub_index_size_);
+
+  ROCKS_LOG_DEBUG(ioptions_.logger,
+                  "hash table size: %" PRIu32 ", suffix_map length %" PRIu32,
+                  index_size_, sub_index_size_);
+  return Slice(allocated, GetTotalSize());
+}
+
+const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
+    "PlainTableIndexBlock";
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_index.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_index.h
new file mode 100644
index 0000000000..0adb6417d9
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_index.h
@@ -0,0 +1,246 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+
+#include <string>
+#include <vector>
+
+#include "memory/arena.h"
+#include "monitoring/histogram.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The file contains two classes, PlainTableIndex and PlainTableIndexBuilder,
+// which implement the index format of PlainTable.
+// For a description of the PlainTable format, see the comments of class
+// PlainTableFactory.
+//
+//
+// PlainTableIndex contains index_size_ buckets, each a 32-bit integer. The
+// lower 31 bits contain an offset value (explained below) and the top bit of
+// the integer indicates the type of the offset:
+//
+// +--------------+------------------------------------------------------+
+// | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     |
+// +--------------+------------------------------------------------------+
+//
+// Explanation for the "flag bit":
+//
+// 0 indicates that the bucket contains only one prefix (no conflict when
+//   hashing this prefix), whose first row starts from this offset of the
+//   file.
+// 1 indicates that the bucket contains more than one prefix, or there
+//   are too many rows for one prefix so we need a binary search for it. In
+//   this case, the offset indicates the offset of sub_index_ holding the
+//   binary search indexes of keys for those rows. Those binary search indexes
+//   are organized in this way:
+//
+// A varint32 first indicates how many entries (N) are stored after it. After
+// it, there are N 32-bit integers, each pointing to an offset in the file,
+// which is the start of a row. Those offsets are guaranteed to be in
+// ascending order, so the keys they point to are also in ascending order,
+// which lets us binary search over them. Below is a visual presentation of a
+// bucket:
+//
+//
+//   number_of_records:    varint32
+//   record 1 file offset: fixedint32
+//   record 2 file offset: fixedint32
+//   ....
+//   record N file offset: fixedint32
+//
+
+// The class loads the index block from a PlainTable SST file, and executes
+// the index lookup.
+// The class is used by the PlainTableReader class.
+class PlainTableIndex {
+ public:
+  enum IndexSearchResult {
+    kNoPrefixForBucket = 0,
+    kDirectToFile = 1,
+    kSubindex = 2
+  };
+
+  explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
+
+  PlainTableIndex()
+      : index_size_(0),
+        sub_index_size_(0),
+        num_prefixes_(0),
+        index_(nullptr),
+        sub_index_(nullptr) {}
+
+  // Executes the lookup in the hash table. The hash key is `prefix_hash`.
+  // The function fills the hash bucket content in `bucket_value`, which is
+  // up to the caller to interpret.
+  IndexSearchResult GetOffset(uint32_t prefix_hash,
+                              uint32_t* bucket_value) const;
+
+  // Initialize data from `index_data`, which points to raw data for
+  // index stored in the SST file.
+  Status InitFromRawData(Slice index_data);
+
+  // Decode the sub-index for a specific hash bucket.
+  // The `offset` is the value returned as `bucket_value` by GetOffset()
+  // and is only valid when the return value is `kSubindex`.
+  // The return value is the pointer to the starting address of the
+  // sub-index. `upper_bound` is filled with the value indicating how many
+  // entries the sub-index has.
+  const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
+                                              uint32_t* upper_bound) const {
+    const char* index_ptr = &sub_index_[offset];
+    return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
+  }
+
+  uint32_t GetIndexSize() const { return index_size_; }
+
+  uint32_t GetSubIndexSize() const { return sub_index_size_; }
+
+  uint32_t GetNumPrefixes() const { return num_prefixes_; }
+
+  static const uint64_t kMaxFileSize = (1u << 31) - 1;
+  static const uint32_t kSubIndexMask = 0x80000000;
+  static const size_t kOffsetLen = sizeof(uint32_t);
+
+ private:
+  uint32_t index_size_;
+  uint32_t sub_index_size_;
+  uint32_t num_prefixes_;
+
+  uint32_t* index_;
+  char* sub_index_;
+};
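+
+// Illustrative sketch (sample value invented): how one 32-bit bucket word is
+// interpreted under the flag-bit scheme above, mirroring GetOffset():
+//
+//   uint32_t bucket_value = 0x80000010;  // flag bit set, low bits = 0x10
+//   if (bucket_value & PlainTableIndex::kSubIndexMask) {
+//     // Flag 1: the low 31 bits index into sub_index_, where a varint32
+//     // entry count is followed by that many ascending fixed32 file offsets.
+//     uint32_t sub_index_offset =
+//         bucket_value ^ PlainTableIndex::kSubIndexMask;  // == 0x10
+//     // ... binary search the offsets at sub_index_[sub_index_offset]
+//   } else if (bucket_value < PlainTableIndex::kMaxFileSize) {
+//     // Flag 0: bucket_value is the file offset of the prefix's first row.
+//   } else {
+//     // kMaxFileSize sentinel: no prefix hashed to this bucket.
+//   }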
+
+// PlainTableIndexBuilder is used to create a plain table index.
+// After calling Finish(), it returns a Slice, which is usually
+// used either to initialize a PlainTableIndex or
+// to save the index to an SST file.
+// For more details about the index, please refer to:
+// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
+// #wiki-in-memory-index-format
+// The class is used by the PlainTableBuilder class.
+class PlainTableIndexBuilder {
+ public:
+  PlainTableIndexBuilder(Arena* arena, const ImmutableOptions& ioptions,
+                         const SliceTransform* prefix_extractor,
+                         size_t index_sparseness, double hash_table_ratio,
+                         size_t huge_page_tlb_size)
+      : arena_(arena),
+        ioptions_(ioptions),
+        record_list_(kRecordsPerGroup),
+        is_first_record_(true),
+        due_index_(false),
+        num_prefixes_(0),
+        num_keys_per_prefix_(0),
+        prev_key_prefix_hash_(0),
+        index_sparseness_(index_sparseness),
+        index_size_(0),
+        sub_index_size_(0),
+        prefix_extractor_(prefix_extractor),
+        hash_table_ratio_(hash_table_ratio),
+        huge_page_tlb_size_(huge_page_tlb_size) {}
+
+  void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);
+
+  Slice Finish();
+
+  uint32_t GetTotalSize() const {
+    return VarintLength(index_size_) + VarintLength(num_prefixes_) +
+           PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
+  }
+
+  static const std::string kPlainTableIndexBlock;
+
+ private:
+  struct IndexRecord {
+    uint32_t hash;    // hash of the prefix
+    uint32_t offset;  // offset of a row
+    IndexRecord* next;
+  };
+
+  // Helper class to track all the index records
+  class IndexRecordList {
+   public:
+    explicit IndexRecordList(size_t num_records_per_group)
+        : kNumRecordsPerGroup(num_records_per_group),
+          current_group_(nullptr),
+          num_records_in_current_group_(num_records_per_group) {}
+
+    ~IndexRecordList() {
+      for (size_t i = 0; i < groups_.size(); i++) {
+        delete[] groups_[i];
+      }
+    }
+
+    void AddRecord(uint32_t hash, uint32_t offset);
+
+    size_t GetNumRecords() const {
+      return (groups_.size() - 1) * kNumRecordsPerGroup +
+             num_records_in_current_group_;
+    }
+    IndexRecord* At(size_t index) {
+      return &(
+          groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
+    }
+
+   private:
+    IndexRecord* AllocateNewGroup() {
+      IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
+      groups_.push_back(result);
+      return result;
+    }
+
+    // Each group in `groups_` contains a fixed number of records (determined
+    // by kNumRecordsPerGroup), which helps minimize the cost when resizing
+    // occurs.
+    const size_t kNumRecordsPerGroup;
+    IndexRecord* current_group_;
+    // List of arrays allocated
+    std::vector<IndexRecord*> groups_;
+    size_t num_records_in_current_group_;
+  };
+
+  void AllocateIndex();
+
+  // Internal helper function to bucket the index record list into hash
+  // buckets.
+  void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
+                        std::vector<uint32_t>* entries_per_bucket);
+
+  // Internal helper function to fill the indexes and bloom filters into
+  // internal data structures.
+  Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
+                    const std::vector<uint32_t>& entries_per_bucket);
+
+  Arena* arena_;
+  const ImmutableOptions ioptions_;
+  HistogramImpl keys_per_prefix_hist_;
+  IndexRecordList record_list_;
+  bool is_first_record_;
+  bool due_index_;
+  uint32_t num_prefixes_;
+  uint32_t num_keys_per_prefix_;
+
+  uint32_t prev_key_prefix_hash_;
+  size_t index_sparseness_;
+  uint32_t index_size_;
+  uint32_t sub_index_size_;
+
+  const SliceTransform* prefix_extractor_;
+  double hash_table_ratio_;
+  size_t huge_page_tlb_size_;
+
+  std::string prev_key_prefix_;
+
+  static const size_t kRecordsPerGroup = 256;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_key_coding.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_key_coding.cc
new file mode 100644
index 0000000000..a40968a608
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_key_coding.cc
@@ -0,0 +1,507 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/plain/plain_table_key_coding.h"
+
+#include <algorithm>
+#include <string>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum PlainTableEntryType : unsigned char {
+  kFullKey = 0,
+  kPrefixFromPreviousKey = 1,
+  kKeySuffix = 2,
+};
+
+namespace {
+
+// Control byte:
+// The first two bits indicate the type of entry.
+// The other six bits are the inlined size. If all six bits are 1 (0x3F),
+// overflow bytes are used: key_size - 0x3F is encoded as a varint32 after
+// this byte.
+
+const unsigned char kSizeInlineLimit = 0x3F;
+
+// Return 0 for error
+size_t EncodeSize(PlainTableEntryType type, uint32_t key_size,
+                  char* out_buffer) {
+  out_buffer[0] = type << 6;
+
+  if (key_size < static_cast<uint32_t>(kSizeInlineLimit)) {
+    // size inlined
+    out_buffer[0] |= static_cast<char>(key_size);
+    return 1;
+  } else {
+    out_buffer[0] |= kSizeInlineLimit;
+    char* ptr = EncodeVarint32(out_buffer + 1, key_size - kSizeInlineLimit);
+    return ptr - out_buffer;
+  }
+}
+}  // namespace
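+
+// Worked example (values invented) of the control-byte scheme above:
+//
+//   EncodeSize(kFullKey, 17, buf)
+//     -> buf[0] = (0 << 6) | 17 = 0x11; 1 byte total (17 < 0x3F is inlined)
+//
+//   EncodeSize(kKeySuffix, 200, buf)
+//     -> buf[0] = (2 << 6) | 0x3F = 0xBF, followed by varint32(200 - 0x3F)
+//        = varint32(137) = 0x89 0x01; 3 bytes total
+//
+// DecodeSize() below reverses this: the top two bits give the entry type and
+// a low 6-bit field of 0x3F signals that a varint32 remainder follows.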
+
+// Fill bytes_read with the number of bytes read.
+inline Status PlainTableKeyDecoder::DecodeSize(uint32_t start_offset,
+                                               PlainTableEntryType* entry_type,
+                                               uint32_t* key_size,
+                                               uint32_t* bytes_read) {
+  Slice next_byte_slice;
+  bool success = file_reader_.Read(start_offset, 1, &next_byte_slice);
+  if (!success) {
+    return file_reader_.status();
+  }
+  *entry_type = static_cast<PlainTableEntryType>(
+      (static_cast<unsigned char>(next_byte_slice[0]) & ~kSizeInlineLimit) >>
+      6);
+  char inline_key_size = next_byte_slice[0] & kSizeInlineLimit;
+  if (inline_key_size < kSizeInlineLimit) {
+    *key_size = inline_key_size;
+    *bytes_read = 1;
+    return Status::OK();
+  } else {
+    uint32_t extra_size;
+    uint32_t tmp_bytes_read;
+    success = file_reader_.ReadVarint32(start_offset + 1, &extra_size,
+                                        &tmp_bytes_read);
+    if (!success) {
+      return file_reader_.status();
+    }
+    assert(tmp_bytes_read > 0);
+    *key_size = kSizeInlineLimit + extra_size;
+    *bytes_read = tmp_bytes_read + 1;
+    return Status::OK();
+  }
+}
+
+IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key,
+                                         WritableFileWriter* file,
+                                         uint64_t* offset, char* meta_bytes_buf,
+                                         size_t* meta_bytes_buf_size) {
+  ParsedInternalKey parsed_key;
+  Status pik_status =
+      ParseInternalKey(key, &parsed_key, false /* log_err_key */);  // TODO
+  if (!pik_status.ok()) {
+    return IOStatus::Corruption(pik_status.getState());
+  }
+
+  Slice key_to_write = key;  // Portion of internal key to write out.
+
+  uint32_t user_key_size = static_cast<uint32_t>(key.size() - 8);
+  if (encoding_type_ == kPlain) {
+    if (fixed_user_key_len_ == kPlainTableVariableLength) {
+      // Write key length
+      char key_size_buf[5];  // tmp buffer for key size as varint32
+      char* ptr = EncodeVarint32(key_size_buf, user_key_size);
+      assert(ptr <= key_size_buf + sizeof(key_size_buf));
+      auto len = ptr - key_size_buf;
+      IOStatus io_s = file->Append(Slice(key_size_buf, len));
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      *offset += len;
+    }
+  } else {
+    assert(encoding_type_ == kPrefix);
+    char size_bytes[12];
+    size_t size_bytes_pos = 0;
+
+    Slice prefix =
+        prefix_extractor_->Transform(Slice(key.data(), user_key_size));
+    if (key_count_for_prefix_ == 0 || prefix != pre_prefix_.GetUserKey() ||
+        key_count_for_prefix_ % index_sparseness_ == 0) {
+      key_count_for_prefix_ = 1;
+      pre_prefix_.SetUserKey(prefix);
+      size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes);
+      IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos));
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      *offset += size_bytes_pos;
+    } else {
+      key_count_for_prefix_++;
+      if (key_count_for_prefix_ == 2) {
+        // For the second key within a prefix, we need to encode the prefix
+        // length
+        size_bytes_pos +=
+            EncodeSize(kPrefixFromPreviousKey,
+                       static_cast<uint32_t>(pre_prefix_.GetUserKey().size()),
+                       size_bytes + size_bytes_pos);
+      }
+      uint32_t prefix_len =
+          static_cast<uint32_t>(pre_prefix_.GetUserKey().size());
+      size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len,
+                                   size_bytes + size_bytes_pos);
+      IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos));
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      *offset += size_bytes_pos;
+      key_to_write = Slice(key.data() + prefix_len, key.size() - prefix_len);
+    }
+  }
+
+  // Encode full key.
+  // For value size as varint32 (up to 5 bytes).
+  // If the row is of value type with seqId 0, flush the special flag together
+  // in this buffer to save one file append call, which takes 1 byte.
+  if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
+    IOStatus io_s =
+        file->Append(Slice(key_to_write.data(), key_to_write.size() - 8));
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    *offset += key_to_write.size() - 8;
+    meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0;
+    *meta_bytes_buf_size += 1;
+  } else {
+    IOStatus io_s = file->Append(key_to_write);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    *offset += key_to_write.size();
+  }
+
+  return IOStatus::OK();
+}
+
+Slice PlainTableFileReader::GetFromBuffer(Buffer* buffer, uint32_t file_offset,
+                                          uint32_t len) {
+  assert(file_offset + len <= file_info_->data_end_offset);
+  return Slice(buffer->buf.get() + (file_offset - buffer->buf_start_offset),
+               len);
+}
+
+bool PlainTableFileReader::ReadNonMmap(uint32_t file_offset, uint32_t len,
+                                       Slice* out) {
+  const uint32_t kPrefetchSize = 256u;
+
+  // Try to read from buffers.
+  for (uint32_t i = 0; i < num_buf_; i++) {
+    Buffer* buffer = buffers_[num_buf_ - 1 - i].get();
+    if (file_offset >= buffer->buf_start_offset &&
+        file_offset + len <= buffer->buf_start_offset + buffer->buf_len) {
+      *out = GetFromBuffer(buffer, file_offset, len);
+      return true;
+    }
+  }
+
+  Buffer* new_buffer;
+  // The data needed is not in any of the buffers. Allocate a new buffer.
+  if (num_buf_ < buffers_.size()) {
+    // Add a new buffer
+    new_buffer = new Buffer();
+    buffers_[num_buf_++].reset(new_buffer);
+  } else {
+    // Now simply replace the last buffer. Can improve the placement policy
+    // if needed.
+    new_buffer = buffers_[num_buf_ - 1].get();
+  }
+
+  assert(file_offset + len <= file_info_->data_end_offset);
+  uint32_t size_to_read = std::min(file_info_->data_end_offset - file_offset,
+                                   std::max(kPrefetchSize, len));
+  if (size_to_read > new_buffer->buf_capacity) {
+    new_buffer->buf.reset(new char[size_to_read]);
+    new_buffer->buf_capacity = size_to_read;
+    new_buffer->buf_len = 0;
+  }
+  Slice read_result;
+  // TODO: rate limit plain table reads.
+  Status s =
+      file_info_->file->Read(IOOptions(), file_offset, size_to_read,
+                             &read_result, new_buffer->buf.get(), nullptr,
+                             Env::IO_TOTAL /* rate_limiter_priority */);
+  if (!s.ok()) {
+    status_ = s;
+    return false;
+  }
+  new_buffer->buf_start_offset = file_offset;
+  new_buffer->buf_len = size_to_read;
+  *out = GetFromBuffer(new_buffer, file_offset, len);
+  return true;
+}
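+
+// Illustrative note (offsets invented): with kPrefetchSize = 256, a 10-byte
+// Read() at file offset 100 fills a buffer with
+// min(data_end_offset - 100, 256) bytes, so a following Read() of 8 bytes at
+// offset 110 is served from memory with no further file access. Up to two
+// such buffers are kept (see plain_table_key_coding.h) to cover the
+// hash-probe-then-verify and the binary-search access patterns.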
+
+inline bool PlainTableFileReader::ReadVarint32(uint32_t offset, uint32_t* out,
+                                               uint32_t* bytes_read) {
+  if (file_info_->is_mmap_mode) {
+    const char* start = file_info_->file_data.data() + offset;
+    const char* limit =
+        file_info_->file_data.data() + file_info_->data_end_offset;
+    const char* key_ptr = GetVarint32Ptr(start, limit, out);
+    assert(key_ptr != nullptr);
+    *bytes_read = static_cast<uint32_t>(key_ptr - start);
+    return true;
+  } else {
+    return ReadVarint32NonMmap(offset, out, bytes_read);
+  }
+}
+
+bool PlainTableFileReader::ReadVarint32NonMmap(uint32_t offset, uint32_t* out,
+                                               uint32_t* bytes_read) {
+  const char* start;
+  const char* limit;
+  const uint32_t kMaxVarInt32Size = 6u;
+  uint32_t bytes_to_read =
+      std::min(file_info_->data_end_offset - offset, kMaxVarInt32Size);
+  Slice bytes;
+  if (!Read(offset, bytes_to_read, &bytes)) {
+    return false;
+  }
+  start = bytes.data();
+  limit = bytes.data() + bytes.size();
+
+  const char* key_ptr = GetVarint32Ptr(start, limit, out);
+  *bytes_read =
+      (key_ptr != nullptr) ? static_cast<uint32_t>(key_ptr - start) : 0;
+  return true;
+}
+
+Status PlainTableKeyDecoder::ReadInternalKey(
+    uint32_t file_offset, uint32_t user_key_size, ParsedInternalKey* parsed_key,
+    uint32_t* bytes_read, bool* internal_key_valid, Slice* internal_key) {
+  Slice tmp_slice;
+  bool success = file_reader_.Read(file_offset, user_key_size + 1, &tmp_slice);
+  if (!success) {
+    return file_reader_.status();
+  }
+  if (tmp_slice[user_key_size] == PlainTableFactory::kValueTypeSeqId0) {
+    // Special encoding for the row with seqID=0
+    parsed_key->user_key = Slice(tmp_slice.data(), user_key_size);
+    parsed_key->sequence = 0;
+    parsed_key->type = kTypeValue;
+    *bytes_read += user_key_size + 1;
+    *internal_key_valid = false;
+  } else {
+    success = file_reader_.Read(file_offset, user_key_size + 8, internal_key);
+    if (!success) {
+      return file_reader_.status();
+    }
+    *internal_key_valid = true;
+    Status pik_status = ParseInternalKey(*internal_key, parsed_key,
+                                         false /* log_err_key */);  // TODO
+    if (!pik_status.ok()) {
+      return Status::Corruption(
+          Slice("Corrupted key found during next key read. "),
+          pik_status.getState());
+    }
+    *bytes_read += user_key_size + 8;
+  }
+  return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextPlainEncodingKey(uint32_t start_offset,
+                                                  ParsedInternalKey* parsed_key,
+                                                  Slice* internal_key,
+                                                  uint32_t* bytes_read,
+                                                  bool* /*seekable*/) {
+  uint32_t user_key_size = 0;
+  Status s;
+  if (fixed_user_key_len_ != kPlainTableVariableLength) {
+    user_key_size = fixed_user_key_len_;
+  } else {
+    uint32_t tmp_size = 0;
+    uint32_t tmp_read;
+    bool success =
+        file_reader_.ReadVarint32(start_offset, &tmp_size, &tmp_read);
+    if (!success) {
+      return file_reader_.status();
+    }
+    assert(tmp_read > 0);
+    user_key_size = tmp_size;
+    *bytes_read = tmp_read;
+  }
+  // dummy initial value to avoid a compiler complaint
+  bool decoded_internal_key_valid = true;
+  Slice decoded_internal_key;
+  s = ReadInternalKey(start_offset + *bytes_read, user_key_size, parsed_key,
+                      bytes_read, &decoded_internal_key_valid,
+                      &decoded_internal_key);
+  if (!s.ok()) {
+    return s;
+  }
+  if (!file_reader_.file_info()->is_mmap_mode) {
+    cur_key_.SetInternalKey(*parsed_key);
+    parsed_key->user_key =
+        Slice(cur_key_.GetInternalKey().data(), user_key_size);
+    if (internal_key != nullptr) {
+      *internal_key = cur_key_.GetInternalKey();
+    }
+  } else if (internal_key != nullptr) {
+    if (decoded_internal_key_valid) {
+      *internal_key = decoded_internal_key;
+    } else {
+      // Need to copy out the internal key
+      cur_key_.SetInternalKey(*parsed_key);
+      *internal_key = cur_key_.GetInternalKey();
+    }
+  }
+  return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextPrefixEncodingKey(
+    uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key,
+    uint32_t* bytes_read, bool* seekable) {
+  PlainTableEntryType entry_type;
+
+  bool expect_suffix = false;
+  Status s;
+  do {
+    uint32_t size = 0;
+    // dummy initial value to avoid a compiler complaint
+    bool decoded_internal_key_valid = true;
+    uint32_t my_bytes_read = 0;
+    s = DecodeSize(start_offset + *bytes_read, &entry_type, &size,
+                   &my_bytes_read);
+    if (!s.ok()) {
+      return s;
+    }
+    if (my_bytes_read == 0) {
+      return Status::Corruption("Unexpected EOF when reading size of the key");
+    }
+    *bytes_read += my_bytes_read;
+
+    switch (entry_type) {
+      case kFullKey: {
+        expect_suffix = false;
+        Slice decoded_internal_key;
+        s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key,
+                            bytes_read, &decoded_internal_key_valid,
+                            &decoded_internal_key);
+        if (!s.ok()) {
+          return s;
+        }
+        if (!file_reader_.file_info()->is_mmap_mode ||
+            (internal_key != nullptr && !decoded_internal_key_valid)) {
+          // In non-mmap mode, we always need to make a copy of keys returned
+          // to users, because after reading the value for the key, the key
+          // might be invalid.
+          cur_key_.SetInternalKey(*parsed_key);
+          saved_user_key_ = cur_key_.GetUserKey();
+          if (!file_reader_.file_info()->is_mmap_mode) {
+            parsed_key->user_key =
+                Slice(cur_key_.GetInternalKey().data(), size);
+          }
+          if (internal_key != nullptr) {
+            *internal_key = cur_key_.GetInternalKey();
+          }
+        } else {
+          if (internal_key != nullptr) {
+            *internal_key = decoded_internal_key;
+          }
+          saved_user_key_ = parsed_key->user_key;
+        }
+        break;
+      }
+      case kPrefixFromPreviousKey: {
+        if (seekable != nullptr) {
+          *seekable = false;
+        }
+        prefix_len_ = size;
+        assert(prefix_extractor_ == nullptr ||
+               prefix_extractor_->Transform(saved_user_key_).size() ==
+                   prefix_len_);
+        // Need to read another size flag for the suffix
+        expect_suffix = true;
+        break;
+      }
+      case kKeySuffix: {
+        expect_suffix = false;
+        if (seekable != nullptr) {
+          *seekable = false;
+        }
+
+        Slice tmp_slice;
+        s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key,
+                            bytes_read, &decoded_internal_key_valid,
+                            &tmp_slice);
+        if (!s.ok()) {
+          return s;
+        }
+        if (!file_reader_.file_info()->is_mmap_mode) {
+          // In non-mmap mode, we need to make a copy of keys returned to
+          // users, because after reading the value for the key, the key might
+          // be invalid.
+          // saved_user_key_ points to cur_key_. We are making a copy of
+          // the prefix part to another string, and constructing the current
+          // key from the prefix part and the suffix part back into cur_key_.
+          std::string tmp =
+              Slice(saved_user_key_.data(), prefix_len_).ToString();
+          cur_key_.Reserve(prefix_len_ + size);
+          cur_key_.SetInternalKey(tmp, *parsed_key);
+          parsed_key->user_key =
+              Slice(cur_key_.GetInternalKey().data(), prefix_len_ + size);
+          saved_user_key_ = cur_key_.GetUserKey();
+        } else {
+          cur_key_.Reserve(prefix_len_ + size);
+          cur_key_.SetInternalKey(Slice(saved_user_key_.data(), prefix_len_),
+                                  *parsed_key);
+        }
+        parsed_key->user_key = cur_key_.GetUserKey();
+        if (internal_key != nullptr) {
+          *internal_key = cur_key_.GetInternalKey();
+        }
+        break;
+      }
+      default:
+        return Status::Corruption("Unidentified size flag.");
+    }
+  } while (expect_suffix);  // Another round if a suffix is expected.
+  return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextKey(uint32_t start_offset,
+                                     ParsedInternalKey* parsed_key,
+                                     Slice* internal_key, Slice* value,
+                                     uint32_t* bytes_read, bool* seekable) {
+  assert(value != nullptr);
+  Status s = NextKeyNoValue(start_offset, parsed_key, internal_key, bytes_read,
+                            seekable);
+  if (s.ok()) {
+    assert(bytes_read != nullptr);
+    uint32_t value_size;
+    uint32_t value_size_bytes;
+    bool success = file_reader_.ReadVarint32(start_offset + *bytes_read,
+                                             &value_size, &value_size_bytes);
+    if (!success) {
+      return file_reader_.status();
+    }
+    if (value_size_bytes == 0) {
+      return Status::Corruption(
+          "Unexpected EOF when reading the next value's size.");
+    }
+    *bytes_read += value_size_bytes;
+    success = file_reader_.Read(start_offset + *bytes_read, value_size, value);
+    if (!success) {
+      return file_reader_.status();
+    }
+    *bytes_read += value_size;
+  }
+  return s;
+}
+
+Status PlainTableKeyDecoder::NextKeyNoValue(uint32_t start_offset,
+                                            ParsedInternalKey* parsed_key,
+                                            Slice* internal_key,
+                                            uint32_t* bytes_read,
+                                            bool* seekable) {
+  *bytes_read = 0;
+  if (seekable != nullptr) {
+    *seekable = true;
+  }
+  if (encoding_type_ == kPlain) {
+    return NextPlainEncodingKey(start_offset, parsed_key, internal_key,
+                                bytes_read, seekable);
+  } else {
+    assert(encoding_type_ == kPrefix);
+    return NextPrefixEncodingKey(start_offset, parsed_key, internal_key,
+                                 bytes_read, seekable);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_key_coding.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_key_coding.h
new file mode 100644
index 0000000000..fdef224825
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_key_coding.h
@@ -0,0 +1,199 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+
+#include <array>
+
+#include "rocksdb/slice.h"
+#include "table/plain/plain_table_reader.h"
+
+// This file contains three helper classes of the PlainTable format:
+// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader.
+// These classes issue the lowest level of operations of PlainTable.
+// The actual data format of the key is documented in comments of class
+// PlainTableFactory.
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFile;
+struct ParsedInternalKey;
+struct PlainTableReaderFileInfo;
+enum PlainTableEntryType : unsigned char;
+
+// Helper class for the PlainTable format to write out a key to an output file
+// The class is used in PlainTableBuilder.
+class PlainTableKeyEncoder {
+ public:
+  explicit PlainTableKeyEncoder(EncodingType encoding_type,
+                                uint32_t user_key_len,
+                                const SliceTransform* prefix_extractor,
+                                size_t index_sparseness)
+      : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain),
+        fixed_user_key_len_(user_key_len),
+        prefix_extractor_(prefix_extractor),
+        index_sparseness_((index_sparseness > 1) ? index_sparseness : 1),
+        key_count_for_prefix_(0) {}
+  // key: the key to write out, in the format of internal key.
+  // file: the output file to write out
+  // offset: offset in the file. Needs to be updated after appending bytes
+  //         for the key
+  // meta_bytes_buf: buffer for extra meta bytes
+  // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
+  //                      if meta_bytes_buf is updated.
+  IOStatus AppendKey(const Slice& key, WritableFileWriter* file,
+                     uint64_t* offset, char* meta_bytes_buf,
+                     size_t* meta_bytes_buf_size);
+
+  // Return the actual encoding type that was picked
+  EncodingType GetEncodingType() { return encoding_type_; }
+
+ private:
+  EncodingType encoding_type_;
+  uint32_t fixed_user_key_len_;
+  const SliceTransform* prefix_extractor_;
+  const size_t index_sparseness_;
+  size_t key_count_for_prefix_;
+  IterKey pre_prefix_;
+};
+
+// The class does raw file reads for PlainTableReader.
+// It hides whether it is an mmap read or a non-mmap read.
+// The class is implemented in a way that favors the performance of the mmap
+// case.
+// The class is used by PlainTableReader.
+class PlainTableFileReader {
+ public:
+  explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info)
+      : file_info_(_file_info), num_buf_(0) {}
+
+  ~PlainTableFileReader() {
+    // Should fix.
+    status_.PermitUncheckedError();
+  }
+
+  // In mmapped mode, the results point to the mmapped area of the file, which
+  // means they are always valid before closing the file.
+  // In non-mmap mode, the results point to an internal buffer. If the caller
+  // makes another read call, the results may not be valid. So callers should
+  // make a copy when needed.
+  // In order to save read calls to files, we keep two internal buffers:
+  // the first read and the most recent read. This is efficient because it
+  // covers these two common use cases:
+  // (1) the hash index only identifies one location; we read the key to
+  //     verify the location, and read the key and value if it is the right
+  //     location.
+  // (2) after hash index checking, we identify two locations (because of
+  //     hash bucket conflicts); we binary search the two locations to see
+  //     which one is what we need and start to read from that location.
+  // These two most common use cases will be covered by the two buffers
+  // so that we don't need to re-read the same location.
+  // Currently we keep a fixed size buffer. If a read doesn't exactly fit
+  // the buffer, we replace the second buffer with the location the user
+  // reads.
+  //
+  // If this returns false, the status code is stored in status_.
+  bool Read(uint32_t file_offset, uint32_t len, Slice* out) {
+    if (file_info_->is_mmap_mode) {
+      assert(file_offset + len <= file_info_->data_end_offset);
+      *out = Slice(file_info_->file_data.data() + file_offset, len);
+      return true;
+    } else {
+      return ReadNonMmap(file_offset, len, out);
+    }
+  }
+
+  // If this returns false, the status code is stored in status_.
+  bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
+
+  // *bytes_read = 0 means EOF. false means failure, and the status is saved
+  // in status_. Not directly returning Status, to save copying the status
+  // object and match the previous performance of mmap mode.
+  inline bool ReadVarint32(uint32_t offset, uint32_t* output,
+                           uint32_t* bytes_read);
+
+  bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
+                           uint32_t* bytes_read);
+
+  Status status() const { return status_; }
+
+  const PlainTableReaderFileInfo* file_info() { return file_info_; }
+
+ private:
+  const PlainTableReaderFileInfo* file_info_;
+
+  struct Buffer {
+    Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {}
+    std::unique_ptr<char[]> buf;
+    uint32_t buf_start_offset;
+    uint32_t buf_len;
+    uint32_t buf_capacity;
+  };
+
+  // Keep buffers for two recent reads.
+  std::array<std::unique_ptr<Buffer>, 2> buffers_;
+  uint32_t num_buf_;
+  Status status_;
+
+  Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len);
+};
+
+// A helper class to decode keys from an input buffer.
+// The class is used by PlainTableReader.
+class PlainTableKeyDecoder {
+ public:
+  explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info,
+                                EncodingType encoding_type,
+                                uint32_t user_key_len,
+                                const SliceTransform* prefix_extractor)
+      : file_reader_(file_info),
+        encoding_type_(encoding_type),
+        prefix_len_(0),
+        fixed_user_key_len_(user_key_len),
+        prefix_extractor_(prefix_extractor),
+        in_prefix_(false) {}
+
+  // Find the next key.
+  // start: char array where the key starts.
+  // limit: boundary of the char array
+  // parsed_key: the output of the result key
+  // internal_key: if not null, fill with the output of the result key in
+  //               un-parsed format
+  // bytes_read: how many bytes were read from start. Output.
+  // seekable: whether the key can be read from this place. Used when building
+  //           indexes. Output.
+  Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key,
+                 Slice* internal_key, Slice* value, uint32_t* bytes_read,
+                 bool* seekable = nullptr);
+
+  Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key,
+                        Slice* internal_key, uint32_t* bytes_read,
+                        bool* seekable = nullptr);
+
+  PlainTableFileReader file_reader_;
+  EncodingType encoding_type_;
+  uint32_t prefix_len_;
+  uint32_t fixed_user_key_len_;
+  Slice saved_user_key_;
+  IterKey cur_key_;
+  const SliceTransform* prefix_extractor_;
+  bool in_prefix_;
+
+ private:
+  Status NextPlainEncodingKey(uint32_t start_offset,
+                              ParsedInternalKey* parsed_key,
+                              Slice* internal_key, uint32_t* bytes_read,
+                              bool* seekable = nullptr);
+  Status NextPrefixEncodingKey(uint32_t start_offset,
+                               ParsedInternalKey* parsed_key,
+                               Slice* internal_key, uint32_t* bytes_read,
+                               bool* seekable = nullptr);
+  Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size,
+                         ParsedInternalKey* parsed_key, uint32_t* bytes_read,
+                         bool* internal_key_valid, Slice* internal_key);
+  inline Status DecodeSize(uint32_t start_offset,
+                           PlainTableEntryType* entry_type, uint32_t* key_size,
+                           uint32_t* bytes_read);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
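+
+// Illustrative sketch (not part of the vendored upstream sources): how the
+// decoder above drives a sequential scan of the data region, mirroring
+// PlainTableReader::Next(). The wrapper function and its parameters are
+// invented for illustration.
+//
+//   Status ScanAll(const PlainTableReaderFileInfo* file_info,
+//                  EncodingType encoding_type, uint32_t user_key_len,
+//                  const SliceTransform* prefix_extractor,
+//                  uint32_t data_start_offset, uint32_t data_end_offset) {
+//     PlainTableKeyDecoder decoder(file_info, encoding_type, user_key_len,
+//                                  prefix_extractor);
+//     uint32_t offset = data_start_offset;
+//     while (offset < data_end_offset) {
+//       ParsedInternalKey parsed_key;
+//       Slice value;
+//       uint32_t bytes_read = 0;
+//       Status s = decoder.NextKey(offset, &parsed_key,
+//                                  nullptr /* internal_key */, &value,
+//                                  &bytes_read);
+//       if (!s.ok()) {
+//         return s;
+//       }
+//       offset += bytes_read;  // NextKey consumed key, sizes, and value
+//       // ... use parsed_key / value here
+//     }
+//     return Status::OK();
+//   }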
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_reader.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_reader.cc
new file mode 100644
index 0000000000..2f0379f72a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_reader.cc
@@ -0,0 +1,771 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+
+#include "table/plain/plain_table_reader.h"
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "memory/arena.h"
+#include "monitoring/histogram.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "table/block_based/block.h"
+#include "table/block_based/filter_block.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Safely get a uint32_t element from a char array, where, starting from
+// `base`, every 4 bytes are considered as a fixed 32-bit integer.
+inline uint32_t GetFixed32Element(const char* base, size_t offset) {
+  return DecodeFixed32(base + offset * sizeof(uint32_t));
+}
+}  // namespace
+
+// Iterator to iterate IndexedTable
+class PlainTableIterator : public InternalIterator {
+ public:
+  explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
+  // No copying allowed
+  PlainTableIterator(const PlainTableIterator&) = delete;
+  void operator=(const Iterator&) = delete;
+
+  ~PlainTableIterator() override;
+
+  bool Valid() const override;
+
+  void SeekToFirst() override;
+
+  void SeekToLast() override;
+
+  void Seek(const Slice& target) override;
+
+  void SeekForPrev(const Slice& target) override;
+
+  void Next() override;
+
+  void Prev() override;
+
+  Slice key() const override;
+
+  Slice value() const override;
+
+  Status status() const override;
+
+ private:
+  PlainTableReader* table_;
+  PlainTableKeyDecoder decoder_;
+  bool use_prefix_seek_;
+  uint32_t offset_;
+  uint32_t next_offset_;
+  Slice key_;
+  Slice value_;
+  Status status_;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+PlainTableReader::PlainTableReader(
+    const ImmutableOptions& ioptions,
+    std::unique_ptr<RandomAccessFileReader>&& file,
+    const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
+    EncodingType encoding_type, uint64_t file_size,
+    const TableProperties* table_properties,
+    const SliceTransform* prefix_extractor)
+    : internal_comparator_(icomparator),
+      encoding_type_(encoding_type),
+      full_scan_mode_(false),
+      user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)),
+      prefix_extractor_(prefix_extractor),
+      enable_bloom_(false),
+      bloom_(6),
+      file_info_(std::move(file), storage_options,
+                 static_cast<uint32_t>(table_properties->data_size)),
+      ioptions_(ioptions),
+      file_size_(file_size),
+      table_properties_(nullptr) {}
+
+PlainTableReader::~PlainTableReader() {
+  // Should fix?
+  status_.PermitUncheckedError();
+}
+
+Status PlainTableReader::Open(
+    const ImmutableOptions& ioptions, const EnvOptions& env_options,
+    const InternalKeyComparator& internal_comparator,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
+    double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
+    bool full_scan_mode, const bool immortal_table,
+    const SliceTransform* prefix_extractor) {
+  if (file_size > PlainTableIndex::kMaxFileSize) {
+    return Status::NotSupported("File is too large for PlainTableReader!");
+  }
+
+  std::unique_ptr<TableProperties> props;
+  // TODO: plumb Env::IOActivity
+  const ReadOptions read_options;
+  auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+                               ioptions, read_options, &props);
+  if (!s.ok()) {
+    return s;
+  }
+
+  assert(hash_table_ratio >= 0.0);
+  auto& user_props = props->user_collected_properties;
+  auto prefix_extractor_in_file = props->prefix_extractor_name;
+
+  if (!full_scan_mode &&
+      !prefix_extractor_in_file.empty() /* old version sst file*/
+      && prefix_extractor_in_file != "nullptr") {
+    if (!prefix_extractor) {
+      return Status::InvalidArgument(
+          "Prefix extractor is missing when opening a PlainTable built "
+          "using a prefix extractor");
+    } else if (prefix_extractor_in_file != prefix_extractor->AsString()) {
+      return Status::InvalidArgument(
+          "Prefix extractor given doesn't match the one used to build "
+          "PlainTable");
+    }
+  }
+
+  EncodingType encoding_type = kPlain;
+  auto encoding_type_prop =
+      user_props.find(PlainTablePropertyNames::kEncodingType);
+  if (encoding_type_prop != user_props.end()) {
+    encoding_type = static_cast<EncodingType>(
+        DecodeFixed32(encoding_type_prop->second.c_str()));
+  }
+
+  std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
+      ioptions, std::move(file), env_options, internal_comparator,
+      encoding_type, file_size, props.get(), prefix_extractor));
+
+  s = new_reader->MmapDataIfNeeded();
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (!full_scan_mode) {
+    s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key,
+                                  hash_table_ratio, index_sparseness,
+                                  huge_page_tlb_size);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    // Flag to indicate it is a full scan mode so that none of the indexes
+    // can be used.
+    new_reader->full_scan_mode_ = true;
+  }
+  // PopulateIndex can add to the props, so don't store them until now
+  new_reader->table_properties_ = std::move(props);
+
+  if (immortal_table && new_reader->file_info_.is_mmap_mode) {
+    new_reader->dummy_cleanable_.reset(new Cleanable());
+  }
+
+  *table_reader = std::move(new_reader);
+  return s;
+}
+
+void PlainTableReader::SetupForCompaction() {}
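+
+// Illustrative note (numbers invented): sizing used by AllocateBloom()
+// further down. With bloom_bits_per_key = 10 and 1,000,000 keys (total order
+// mode) or 1,000,000 distinct prefixes (prefix mode),
+//   bloom_total_bits = num_keys * bloom_bits_per_key = 10,000,000 bits,
+// about 1.25 MB. With bloom_bits_per_key = 0 the filter stays disabled, and
+// every MatchBloom() probe then returns true, i.e. no filtering.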
+
+InternalIterator* PlainTableReader::NewIterator(
+    const ReadOptions& options, const SliceTransform* /* prefix_extractor */,
+    Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/,
+    size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) {
+  // Not necessarily used here, but make sure this has been initialized
+  assert(table_properties_);
+
+  // Auto prefix mode is not implemented in PlainTable.
+  bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
+                         !options.auto_prefix_mode;
+  if (arena == nullptr) {
+    return new PlainTableIterator(this, use_prefix_seek);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
+    return new (mem) PlainTableIterator(this, use_prefix_seek);
+  }
+}
+
+Status PlainTableReader::PopulateIndexRecordList(
+    PlainTableIndexBuilder* index_builder,
+    std::vector<uint32_t>* prefix_hashes) {
+  Slice prev_key_prefix_slice;
+  std::string prev_key_prefix_buf;
+  uint32_t pos = data_start_offset_;
+
+  bool is_first_record = true;
+  Slice key_prefix_slice;
+  PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
+                               prefix_extractor_);
+  while (pos < file_info_.data_end_offset) {
+    uint32_t key_offset = pos;
+    ParsedInternalKey key;
+    Slice value_slice;
+    bool seekable = false;
+    Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable);
+    if (!s.ok()) {
+      return s;
+    }
+
+    key_prefix_slice = GetPrefix(key);
+    if (enable_bloom_) {
+      bloom_.AddHash(GetSliceHash(key.user_key));
+    } else {
+      if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
+        if (!is_first_record) {
+          prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
+        }
+        if (file_info_.is_mmap_mode) {
+          prev_key_prefix_slice = key_prefix_slice;
+        } else {
+          prev_key_prefix_buf = key_prefix_slice.ToString();
+          prev_key_prefix_slice = prev_key_prefix_buf;
+        }
+      }
+    }
+
+    index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
+
+    if (!seekable && is_first_record) {
+      return Status::Corruption("Key for a prefix is not seekable");
+    }
+
+    is_first_record = false;
+  }
+
+  prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
+  auto s = index_.InitFromRawData(index_builder->Finish());
+  return s;
+}
+
+void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys,
+                                     size_t huge_page_tlb_size) {
+  uint32_t bloom_total_bits = num_keys * bloom_bits_per_key;
+  if (bloom_total_bits > 0) {
+    enable_bloom_ = true;
+    bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality,
+                        huge_page_tlb_size, ioptions_.logger);
+  }
+}
+
+void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) {
+  assert(bloom_.IsInitialized());
+  for (const auto prefix_hash : prefix_hashes) {
+    bloom_.AddHash(prefix_hash);
+  }
+}
+
+Status PlainTableReader::MmapDataIfNeeded() {
+  if (file_info_.is_mmap_mode) {
+    // Get mmapped memory.
+    return file_info_.file->Read(
+        IOOptions(), 0, static_cast<size_t>(file_size_), &file_info_.file_data,
+        nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+  }
+  return Status::OK();
+}
+
+Status PlainTableReader::PopulateIndex(TableProperties* props,
+                                       int bloom_bits_per_key,
+                                       double hash_table_ratio,
+                                       size_t index_sparseness,
+                                       size_t huge_page_tlb_size) {
+  assert(props != nullptr);
+
+  BlockContents index_block_contents;
+
+  // TODO: plumb Env::IOActivity
+  const ReadOptions read_options;
+  Status s =
+      ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
+                    file_size_, kPlainTableMagicNumber, ioptions_, read_options,
+                    PlainTableIndexBuilder::kPlainTableIndexBlock,
+                    BlockType::kIndex, &index_block_contents);
+
+  bool index_in_file = s.ok();
+
+  BlockContents bloom_block_contents;
+  bool bloom_in_file = false;
+  // We only need to read the bloom block if the index block is in the file.
+  if (index_in_file) {
+    s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
+                      file_size_, kPlainTableMagicNumber, ioptions_,
+                      read_options, BloomBlockBuilder::kBloomBlock,
+                      BlockType::kFilter, &bloom_block_contents);
+    bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0;
+  }
+
+  Slice* bloom_block;
+  if (bloom_in_file) {
+    // If bloom_block_contents.allocation is not empty (which will be the case
+    // for non-mmap mode), it holds the allocated memory for the bloom block.
+    // It needs to be kept alive to keep `bloom_block` valid.
+    bloom_block_alloc_ = std::move(bloom_block_contents.allocation);
+    bloom_block = &bloom_block_contents.data;
+  } else {
+    bloom_block = nullptr;
+  }
+
+  Slice* index_block;
+  if (index_in_file) {
+    // If index_block_contents.allocation is not empty (which will be the case
+    // for non-mmap mode), it holds the allocated memory for the index block.
+    // It needs to be kept alive to keep `index_block` valid.
+    index_block_alloc_ = std::move(index_block_contents.allocation);
+    index_block = &index_block_contents.data;
+  } else {
+    index_block = nullptr;
+  }
+
+  if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) {
+    // moptions.prefix_extractor is required for a hash-based look-up.
+    return Status::NotSupported(
+        "PlainTable requires a prefix extractor to enable prefix hash mode.");
+  }
+
+  // First, read the whole file: for every kIndexIntervalForSamePrefixKeys rows
+  // of a prefix (starting from the first one), generate a record of (hash,
+  // offset) and append it to IndexRecordList, which is a data structure
+  // created to store them.
+
+  if (!index_in_file) {
+    // Allocate the bloom filter here for total order mode.
+    if (IsTotalOrderMode()) {
+      AllocateBloom(bloom_bits_per_key,
+                    static_cast<int>(props->num_entries),
+                    huge_page_tlb_size);
+    }
+  } else if (bloom_in_file) {
+    enable_bloom_ = true;
+    auto num_blocks_property = props->user_collected_properties.find(
+        PlainTablePropertyNames::kNumBloomBlocks);
+
+    uint32_t num_blocks = 0;
+    if (num_blocks_property != props->user_collected_properties.end()) {
+      Slice temp_slice(num_blocks_property->second);
+      if (!GetVarint32(&temp_slice, &num_blocks)) {
+        num_blocks = 0;
+      }
+    }
+    // cast away the const qualifier, because bloom_ won't be changed
+    bloom_.SetRawData(const_cast<char*>(bloom_block->data()),
+                      static_cast<uint32_t>(bloom_block->size()) * 8,
+                      num_blocks);
+  } else {
+    // Index in file but no bloom in file. Disable the bloom filter in this
+    // case.
+    enable_bloom_ = false;
+    bloom_bits_per_key = 0;
+  }
+
+  PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_,
+                                       index_sparseness, hash_table_ratio,
+                                       huge_page_tlb_size);
+
+  std::vector<uint32_t> prefix_hashes;
+  if (!index_in_file) {
+    // Populates bloom_ if enabled (total order mode)
+    s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    s = index_.InitFromRawData(*index_block);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (!index_in_file) {
+    if (!IsTotalOrderMode()) {
+      // Calculate the bloom filter size, allocate memory for
+      // the bloom filter based on the number of prefixes, then fill it.
+      AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
+                    huge_page_tlb_size);
+      if (enable_bloom_) {
+        FillBloom(prefix_hashes);
+      }
+    }
+  }
+
+  // Fill two table properties.
+  if (!index_in_file) {
+    props->user_collected_properties["plain_table_hash_table_size"] =
+        std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
+    props->user_collected_properties["plain_table_sub_index_size"] =
+        std::to_string(index_.GetSubIndexSize());
+  } else {
+    props->user_collected_properties["plain_table_hash_table_size"] =
+        std::to_string(0);
+    props->user_collected_properties["plain_table_sub_index_size"] =
+        std::to_string(0);
+  }
+
+  return Status::OK();
+}
+
+Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder,
+                                   const Slice& target, const Slice& prefix,
+                                   uint32_t prefix_hash, bool& prefix_matched,
+                                   uint32_t* offset) const {
+  prefix_matched = false;
+  uint32_t prefix_index_offset;
+  auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
+  if (res == PlainTableIndex::kNoPrefixForBucket) {
+    *offset = file_info_.data_end_offset;
+    return Status::OK();
+  } else if (res == PlainTableIndex::kDirectToFile) {
+    *offset = prefix_index_offset;
+    return Status::OK();
+  }
+
+  // point to sub-index, need to do a binary search
+  uint32_t upper_bound = 0;
+  const char* base_ptr =
+      index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
+  uint32_t low = 0;
+  uint32_t high = upper_bound;
+  ParsedInternalKey mid_key;
+  ParsedInternalKey parsed_target;
+  Status s = ParseInternalKey(target, &parsed_target,
+                              false /* log_err_key */);  // TODO
+  if (!s.ok()) return s;
+
+  // The key is between [low, high). Do a binary search within it.
+  while (high - low > 1) {
+    uint32_t mid = (high + low) / 2;
+    uint32_t file_offset = GetFixed32Element(base_ptr, mid);
+    uint32_t tmp;
+    s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp);
+    if (!s.ok()) {
+      return s;
+    }
+    int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
+    if (cmp_result < 0) {
+      low = mid;
+    } else {
+      if (cmp_result == 0) {
+        // Happen to have found the exact key or target is smaller than the
+        // first key after base_offset.
+        prefix_matched = true;
+        *offset = file_offset;
+        return Status::OK();
+      } else {
+        high = mid;
+      }
+    }
+  }
+  // Both the key at position low and the key at low+1 could share the same
+  // prefix as the target. We need to rule out one of them to avoid going to
+  // the wrong prefix.
+  ParsedInternalKey low_key;
+  uint32_t tmp;
+  uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
+  s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (GetPrefix(low_key) == prefix) {
+    prefix_matched = true;
+    *offset = low_key_offset;
+  } else if (low + 1 < upper_bound) {
+    // There may be a next prefix; return it
+    prefix_matched = false;
+    *offset = GetFixed32Element(base_ptr, low + 1);
+  } else {
+    // The target is larger than the keys of the last prefix in this bucket
+    // but has a different prefix. The key does not exist.
+    *offset = file_info_.data_end_offset;
+  }
+  return Status::OK();
+}
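+
+// Illustrative note (layout per plain_table_index.h; arithmetic invented):
+// when GetOffset() above lands in a kSubindex bucket, the bytes at
+// prefix_index_offset form
+//
+//   varint32 upper_bound                  // number of entries N
+//   fixed32  offset[0] ... offset[N - 1]  // ascending file offsets
+//
+// and the binary search probes entry mid with
+//   uint32_t file_offset = GetFixed32Element(base_ptr, mid);
+// where base_ptr, returned by GetSubIndexBasePtrAndUpperBound(), already
+// points one varint32 past the count.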
+
+bool PlainTableReader::MatchBloom(uint32_t hash) const {
+  if (!enable_bloom_) {
+    return true;
+  }
+
+  if (bloom_.MayContainHash(hash)) {
+    PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+    return true;
+  } else {
+    PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+    return false;
+  }
+}
+
+Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
+                              ParsedInternalKey* parsed_key,
+                              Slice* internal_key, Slice* value,
+                              bool* seekable) const {
+  if (*offset == file_info_.data_end_offset) {
+    *offset = file_info_.data_end_offset;
+    return Status::OK();
+  }
+
+  if (*offset > file_info_.data_end_offset) {
+    return Status::Corruption("Offset is out of file size");
+  }
+
+  uint32_t bytes_read;
+  Status s = decoder->NextKey(*offset, parsed_key, internal_key, value,
+                              &bytes_read, seekable);
+  if (!s.ok()) {
+    return s;
+  }
+  *offset = *offset + bytes_read;
+  return Status::OK();
+}
+
+void PlainTableReader::Prepare(const Slice& target) {
+  if (enable_bloom_) {
+    uint32_t prefix_hash = GetSliceHash(GetPrefix(target));
+    bloom_.Prefetch(prefix_hash);
+  }
+}
+
+Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target,
+                             GetContext* get_context,
+                             const SliceTransform* /* prefix_extractor */,
+                             bool /*skip_filters*/) {
+  // Check the bloom filter first.
+  Slice prefix_slice;
+  uint32_t prefix_hash;
+  if (IsTotalOrderMode()) {
+    if (full_scan_mode_) {
+      status_ =
+          Status::InvalidArgument("Get() is not allowed in full scan mode.");
+    }
+    // Match the whole user key for the bloom filter check.
+    if (!MatchBloom(GetSliceHash(ExtractUserKey(target)))) {
+      return Status::OK();
+    }
+    // In total order mode there is only one bucket, bucket 0, and we always
+    // use the empty prefix.
+    prefix_slice = Slice();
+    prefix_hash = 0;
+  } else {
+    prefix_slice = GetPrefix(target);
+    prefix_hash = GetSliceHash(prefix_slice);
+    if (!MatchBloom(prefix_hash)) {
+      return Status::OK();
+    }
+  }
+  uint32_t offset;
+  bool prefix_match;
+  PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
+                               prefix_extractor_);
+  Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash,
+                       prefix_match, &offset);
+
+  if (!s.ok()) {
+    return s;
+  }
+  ParsedInternalKey found_key;
+  ParsedInternalKey parsed_target;
+  s = ParseInternalKey(target, &parsed_target,
+                       false /* log_err_key */);  // TODO
+  if (!s.ok()) return s;
+
+  Slice found_value;
+  while (offset < file_info_.data_end_offset) {
+    s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
+    if (!s.ok()) {
+      return s;
+    }
+    if (!prefix_match) {
+      // Need to verify the prefix for the first key found if it has not yet
+      // been checked.
+      if (GetPrefix(found_key) != prefix_slice) {
+        return Status::OK();
+      }
+      prefix_match = true;
+    }
+    // TODO(ljin): since we know the key comparison result here,
+    // can we enable the fast path?
+    if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
+      bool dont_care __attribute__((__unused__));
+      if (!get_context->SaveValue(found_key, found_value, &dont_care,
+                                  dummy_cleanable_.get())) {
+        break;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+uint64_t PlainTableReader::ApproximateOffsetOf(
+    const ReadOptions& /*read_options*/, const Slice& /*key*/,
+    TableReaderCaller /*caller*/) {
+  return 0;
+}
+
+uint64_t PlainTableReader::ApproximateSize(const ReadOptions& /*read_options*/,
+                                           const Slice& /*start*/,
+                                           const Slice& /*end*/,
+                                           TableReaderCaller /*caller*/) {
+  return 0;
+}
+
+PlainTableIterator::PlainTableIterator(PlainTableReader* table,
+                                       bool use_prefix_seek)
+    : table_(table),
+      decoder_(&table_->file_info_, table_->encoding_type_,
+               table_->user_key_len_, table_->prefix_extractor_),
+      use_prefix_seek_(use_prefix_seek) {
+  next_offset_ = offset_ = table_->file_info_.data_end_offset;
+}
+
+PlainTableIterator::~PlainTableIterator() {}
+
+bool PlainTableIterator::Valid() const {
+  return offset_ < table_->file_info_.data_end_offset &&
+         offset_ >= table_->data_start_offset_;
+}
+
+void PlainTableIterator::SeekToFirst() {
+  status_ = Status::OK();
+  next_offset_ = table_->data_start_offset_;
+  if (next_offset_ >= table_->file_info_.data_end_offset) {
+    next_offset_ = offset_ = table_->file_info_.data_end_offset;
+  } else {
+    Next();
+  }
+}
+
+void PlainTableIterator::SeekToLast() {
+  assert(false);
+  status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable");
+  next_offset_ = offset_ = table_->file_info_.data_end_offset;
+}
+
+void PlainTableIterator::Seek(const Slice& target) {
+  if (use_prefix_seek_ != !table_->IsTotalOrderMode()) {
+    // This check is done here instead of NewIterator() to permit creating an
+    // iterator with total_order_seek = true even if we won't be able to Seek()
+    // it. This is needed for compaction: it creates an iterator with
+    // total_order_seek = true but usually never does Seek() on it,
+    // only SeekToFirst().
+    status_ = Status::InvalidArgument(
+        "total_order_seek not implemented for PlainTable.");
+    offset_ = next_offset_ = table_->file_info_.data_end_offset;
+    return;
+  }
+
+  // If the user didn't set the prefix-seek option and we are not able to do a
+  // total-order Seek(), fail with an assertion.
+  if (table_->IsTotalOrderMode()) {
+    if (table_->full_scan_mode_) {
+      status_ =
+          Status::InvalidArgument("Seek() is not allowed in full scan mode.");
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
+      return;
+    } else if (table_->GetIndexSize() > 1) {
+      assert(false);
+      status_ = Status::NotSupported(
+          "PlainTable cannot issue non-prefix seek unless in total order "
+          "mode.");
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
+      return;
+    }
+  }
+
+  Slice prefix_slice = table_->GetPrefix(target);
+  uint32_t prefix_hash = 0;
+  // The bloom filter is ignored in total-order mode.
+  if (!table_->IsTotalOrderMode()) {
+    prefix_hash = GetSliceHash(prefix_slice);
+    if (!table_->MatchBloom(prefix_hash)) {
+      status_ = Status::OK();
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
+      return;
+    }
+  }
+  bool prefix_match;
+  status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash,
+                              prefix_match, &next_offset_);
+  if (!status_.ok()) {
+    offset_ = next_offset_ = table_->file_info_.data_end_offset;
+    return;
+  }
+
+  if (next_offset_ < table_->file_info_.data_end_offset) {
+    for (Next(); status_.ok() && Valid(); Next()) {
+      if (!prefix_match) {
+        // Need to verify the first key's prefix
+        if (table_->GetPrefix(key()) != prefix_slice) {
+          offset_ = next_offset_ = table_->file_info_.data_end_offset;
+          break;
+        }
+        prefix_match = true;
+      }
+      if (table_->internal_comparator_.Compare(key(), target) >= 0) {
+        break;
+      }
+    }
+  } else {
+    offset_ = table_->file_info_.data_end_offset;
+  }
+}
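+
+// Illustrative sketch (not part of the vendored upstream sources): the
+// client-side pattern the Seek() above serves, using the standard RocksDB
+// iterator API; the read option and key prefix are invented.
+//
+//   #include <memory>
+//   #include "rocksdb/db.h"
+//
+//   void ScanOnePrefix(rocksdb::DB* db) {
+//     rocksdb::ReadOptions read_options;
+//     read_options.prefix_same_as_start = true;  // stay within one prefix
+//     std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(read_options));
+//     for (it->Seek("user1234");
+//          it->Valid() && it->key().starts_with("user1234"); it->Next()) {
+//       // consume it->key() / it->value()
+//     }
+//     // Note: PlainTable supports neither Prev() nor SeekForPrev(); see the
+//     // stubs that follow.
+//   }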
+ if (!table_->IsTotalOrderMode()) { + prefix_hash = GetSliceHash(prefix_slice); + if (!table_->MatchBloom(prefix_hash)) { + status_ = Status::OK(); + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + } + bool prefix_match; + status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash, + prefix_match, &next_offset_); + if (!status_.ok()) { + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + + if (next_offset_ < table_->file_info_.data_end_offset) { + for (Next(); status_.ok() && Valid(); Next()) { + if (!prefix_match) { + // Need to verify the first key's prefix + if (table_->GetPrefix(key()) != prefix_slice) { + offset_ = next_offset_ = table_->file_info_.data_end_offset; + break; + } + prefix_match = true; + } + if (table_->internal_comparator_.Compare(key(), target) >= 0) { + break; + } + } + } else { + offset_ = table_->file_info_.data_end_offset; + } +} + +void PlainTableIterator::SeekForPrev(const Slice& /*target*/) { + assert(false); + status_ = + Status::NotSupported("SeekForPrev() is not supported in PlainTable"); + offset_ = next_offset_ = table_->file_info_.data_end_offset; +} + +void PlainTableIterator::Next() { + offset_ = next_offset_; + if (offset_ < table_->file_info_.data_end_offset) { + Slice tmp_slice; + ParsedInternalKey parsed_key; + status_ = + table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_); + if (!status_.ok()) { + offset_ = next_offset_ = table_->file_info_.data_end_offset; + } + } +} + +void PlainTableIterator::Prev() { assert(false); } + +Slice PlainTableIterator::key() const { + assert(Valid()); + return key_; +} + +Slice PlainTableIterator::value() const { + assert(Valid()); + return value_; +} + +Status PlainTableIterator::status() const { return status_; } + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_reader.h new file mode 100644 index 0000000000..0f5f7f3ce0 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/plain/plain_table_reader.h @@ -0,0 +1,243 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "file/random_access_file_reader.h"
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_index.h"
+#include "table/table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Block;
+struct BlockContents;
+class BlockHandle;
+class Footer;
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+class TableCache;
+class TableReader;
+class InternalKeyComparator;
+class PlainTableKeyDecoder;
+class GetContext;
+
+extern const uint32_t kPlainTableVariableLength;
+
+struct PlainTableReaderFileInfo {
+  bool is_mmap_mode;
+  Slice file_data;
+  uint32_t data_end_offset;
+  std::unique_ptr<RandomAccessFileReader> file;
+
+  PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file,
+                           const EnvOptions& storage_options,
+                           uint32_t _data_size_offset)
+      : is_mmap_mode(storage_options.use_mmap_reads),
+        data_end_offset(_data_size_offset),
+        file(std::move(_file)) {}
+};
+
+// The reader class of PlainTable. For a description of the PlainTable format,
+// see the comments of class PlainTableFactory, where instances of
+// PlainTableReader are created.
+class PlainTableReader : public TableReader {
+ public:
+  // Based on the output file format shown in plain_table_factory.h.
+  // When opening the output file, PlainTableReader creates a hash table
+  // from key prefixes to offsets into the output file. Each bucket either
+  // points directly to the data offset of the first key with that key prefix,
+  // or, if too many keys share the prefix, to a binary-searchable index from
+  // the key suffix to the offset on disk.
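+  //
+  // For orientation, a hedged sketch of how PlainTable (and hence this
+  // reader) is configured through the public API; the option values below are
+  // arbitrary examples, not recommendations:
+  //
+  //   Options options;
+  //   options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+  //   options.allow_mmap_reads = true;  // PlainTable relies on mmap reads
+  //   PlainTableOptions pt;
+  //   pt.user_key_len = kPlainTableVariableLength;
+  //   pt.bloom_bits_per_key = 10;
+  //   pt.hash_table_ratio = 0.75;
+  //   options.table_factory.reset(NewPlainTableFactory(pt));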
+  static Status Open(const ImmutableOptions& ioptions,
+                     const EnvOptions& env_options,
+                     const InternalKeyComparator& internal_comparator,
+                     std::unique_ptr<RandomAccessFileReader>&& file,
+                     uint64_t file_size, std::unique_ptr<TableReader>* table,
+                     const int bloom_bits_per_key, double hash_table_ratio,
+                     size_t index_sparseness, size_t huge_page_tlb_size,
+                     bool full_scan_mode, const bool immortal_table = false,
+                     const SliceTransform* prefix_extractor = nullptr);
+
+  // Returns new iterator over table contents
+  // compaction_readahead_size: its value will only be used if for_compaction =
+  // true
+  InternalIterator* NewIterator(const ReadOptions&,
+                                const SliceTransform* prefix_extractor,
+                                Arena* arena, bool skip_filters,
+                                TableReaderCaller caller,
+                                size_t compaction_readahead_size = 0,
+                                bool allow_unprepared_value = false) override;
+
+  void Prepare(const Slice& target) override;
+
+  Status Get(const ReadOptions& readOptions, const Slice& key,
+             GetContext* get_context, const SliceTransform* prefix_extractor,
+             bool skip_filters = false) override;
+
+  uint64_t ApproximateOffsetOf(const ReadOptions& read_options,
+                               const Slice& key,
+                               TableReaderCaller caller) override;
+
+  uint64_t ApproximateSize(const ReadOptions& read_options, const Slice& start,
+                           const Slice& end, TableReaderCaller caller) override;
+
+  uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
+  void SetupForCompaction() override;
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override {
+    return table_properties_;
+  }
+
+  virtual size_t ApproximateMemoryUsage() const override {
+    return arena_.MemoryAllocatedBytes();
+  }
+
+  PlainTableReader(const ImmutableOptions& ioptions,
+                   std::unique_ptr<RandomAccessFileReader>&& file,
+                   const EnvOptions& env_options,
+                   const InternalKeyComparator& internal_comparator,
+                   EncodingType encoding_type, uint64_t file_size,
+                   const TableProperties* table_properties,
+                   const SliceTransform* prefix_extractor);
+  virtual ~PlainTableReader();
+
+ protected:
+  // Check bloom filter to see whether it might contain this prefix.
+  // The hash of the prefix is given, since it can be reused for index lookup
+  // too.
+  virtual bool MatchBloom(uint32_t hash) const;
+
+  // PopulateIndex() builds an index of keys. It must be called before any
+  // query to the table.
+  //
+  // props: the table properties object that needs to be stored. Ownership of
+  // the object is passed.
+  Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
+                       double hash_table_ratio, size_t index_sparseness,
+                       size_t huge_page_tlb_size);
+
+  Status MmapDataIfNeeded();
+
+ private:
+  const InternalKeyComparator internal_comparator_;
+  EncodingType encoding_type_;
+  // Represents the plain table's current status.
+  Status status_;
+
+  PlainTableIndex index_;
+  bool full_scan_mode_;
+
+  // data_start_offset_ and data_end_offset_ define the range of the
+  // sst file that stores data.
+  const uint32_t data_start_offset_ = 0;
+  const uint32_t user_key_len_;
+  const SliceTransform* prefix_extractor_;
+
+  static const size_t kNumInternalBytes = 8;
+
+  // Bloom filter is used to rule out non-existent keys
+  bool enable_bloom_;
+  PlainTableBloomV1 bloom_;
+  PlainTableReaderFileInfo file_info_;
+  Arena arena_;
+  CacheAllocationPtr index_block_alloc_;
+  CacheAllocationPtr bloom_block_alloc_;
+
+  const ImmutableOptions& ioptions_;
+  std::unique_ptr<Cleanable> dummy_cleanable_;
+  uint64_t file_size_;
+
+ protected:  // for testing
+  std::shared_ptr<const TableProperties> table_properties_;
+
+ private:
+  bool IsFixedLength() const {
+    return user_key_len_ != kPlainTableVariableLength;
+  }
+
+  size_t GetFixedInternalKeyLength() const {
+    return user_key_len_ + kNumInternalBytes;
+  }
+
+  Slice GetPrefix(const Slice& target) const {
+    assert(target.size() >= 8);  // target is internal key
+    return GetPrefixFromUserKey(ExtractUserKey(target));
+  }
+
+  Slice GetPrefix(const ParsedInternalKey& target) const {
+    return GetPrefixFromUserKey(target.user_key);
+  }
+
+  Slice GetPrefixFromUserKey(const Slice& user_key) const {
+    if (!IsTotalOrderMode()) {
+      return prefix_extractor_->Transform(user_key);
+    } else {
+      // Use empty slice as prefix if prefix_extractor is not set.
+      // In that case, it falls back to pure binary search and
+      // total iterator seek is supported.
+      return Slice();
+    }
+  }
+
+  friend class TableCache;
+  friend class PlainTableIterator;
+
+  // Internal helper function to generate an IndexRecordList object from all
+  // the rows, which contains index records as a list.
+  // If bloom_ is not null, all the keys' full-key hashes will be added to the
+  // bloom filter.
+  Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
+                                 std::vector<uint32_t>* prefix_hashes);
+
+  // Internal helper function to allocate memory for the bloom filter
+  void AllocateBloom(int bloom_bits_per_key, int num_prefixes,
+                     size_t huge_page_tlb_size);
+
+  void FillBloom(const std::vector<uint32_t>& prefix_hashes);
+
+  // Read the key and value at `offset` into the output parameters.
+  // On success, `offset` will be updated to the offset of the next key.
+  // `parsed_key` will hold the key in parsed format.
+  // If `internal_key` is not null, it will be filled with the key in slice
+  // format.
+  // If `seekable` is not null, it will return whether we can directly read
+  // data using this offset.
+  Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
+              ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
+              bool* seekable = nullptr) const;
+  // Get the file offset for key target.
+  // The out-parameter prefix_matched is set to true if the offset is confirmed
+  // for a key with the same prefix as target.
+  Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target,
+                   const Slice& prefix, uint32_t prefix_hash,
+                   bool& prefix_matched, uint32_t* offset) const;
+
+  bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
+
+  // No copying allowed
+  explicit PlainTableReader(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/scoped_arena_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/scoped_arena_iterator.h
new file mode 100644
index 0000000000..2b8824d95e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/scoped_arena_iterator.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "port/port.h" +#include "table/internal_iterator.h" + +namespace ROCKSDB_NAMESPACE { +class ScopedArenaIterator { + void reset(InternalIterator* iter) noexcept { + if (iter_ != nullptr) { + iter_->~InternalIterator(); + } + iter_ = iter; + } + + public: + explicit ScopedArenaIterator(InternalIterator* iter = nullptr) + : iter_(iter) {} + + ScopedArenaIterator(const ScopedArenaIterator&) = delete; + ScopedArenaIterator& operator=(const ScopedArenaIterator&) = delete; + + ScopedArenaIterator(ScopedArenaIterator&& o) noexcept { + iter_ = o.iter_; + o.iter_ = nullptr; + } + + ScopedArenaIterator& operator=(ScopedArenaIterator&& o) noexcept { + reset(o.iter_); + o.iter_ = nullptr; + return *this; + } + + InternalIterator* operator->() { return iter_; } + InternalIterator* get() { return iter_; } + + void set(InternalIterator* iter) { reset(iter); } + + InternalIterator* release() { + assert(iter_ != nullptr); + auto* res = iter_; + iter_ = nullptr; + return res; + } + + ~ScopedArenaIterator() { reset(nullptr); } + + private: + InternalIterator* iter_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_dumper.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_dumper.cc new file mode 100644 index 0000000000..f6d6e195d6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_dumper.cc @@ -0,0 +1,520 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+//
+
+#include "table/sst_file_dumper.h"
+
+#include <chrono>
+#include <cinttypes>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <vector>
+
+#include "db/blob/blob_index.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "options/cf_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_reader.h"
+#include "util/compression.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+SstFileDumper::SstFileDumper(const Options& options,
+                             const std::string& file_path,
+                             Temperature file_temp, size_t readahead_size,
+                             bool verify_checksum, bool output_hex,
+                             bool decode_blob_index, const EnvOptions& soptions,
+                             bool silent)
+    : file_name_(file_path),
+      read_num_(0),
+      file_temp_(file_temp),
+      output_hex_(output_hex),
+      decode_blob_index_(decode_blob_index),
+      soptions_(soptions),
+      silent_(silent),
+      options_(options),
+      ioptions_(options_),
+      moptions_(ColumnFamilyOptions(options_)),
+      read_options_(verify_checksum, false),
+      internal_comparator_(BytewiseComparator()) {
+  read_options_.readahead_size = readahead_size;
+  if (!silent_) {
+    fprintf(stdout, "Process %s\n", file_path.c_str());
+  }
+  init_result_ = GetTableReader(file_name_);
+}
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+extern const uint64_t kLegacyPlainTableMagicNumber;
+
+const char* testFileName = "test_file_name";
+
+Status SstFileDumper::GetTableReader(const std::string& file_path) {
+  // Warning about 'magic_number' being uninitialized shows up only in UBsan
+  // builds. Though access is guarded by 's.ok()' checks, fix the issue to
+  // avoid any warnings.
+  uint64_t magic_number = Footer::kNullTableMagicNumber;
+
+  // read table magic number
+  Footer footer;
+
+  const auto& fs = options_.env->GetFileSystem();
+  std::unique_ptr<FSRandomAccessFile> file;
+  uint64_t file_size = 0;
+  FileOptions fopts = soptions_;
+  fopts.temperature = file_temp_;
+  Status s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
+  if (s.ok()) {
+    s = fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
+  }
+
+  // check empty file
+  // if true, skip further processing of this file
+  if (file_size == 0) {
+    return Status::Aborted(file_path, "Empty file");
+  }
+
+  file_.reset(new RandomAccessFileReader(std::move(file), file_path));
+
+  FilePrefetchBuffer prefetch_buffer(
+      0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */,
+      false /* track_min_offset */);
+  if (s.ok()) {
+    const uint64_t kSstDumpTailPrefetchSize = 512 * 1024;
+    uint64_t prefetch_size = (file_size > kSstDumpTailPrefetchSize)
+                                 ? kSstDumpTailPrefetchSize
+                                 : file_size;
+    uint64_t prefetch_off = file_size - prefetch_size;
+    IOOptions opts;
+    s = prefetch_buffer.Prefetch(opts, file_.get(), prefetch_off,
+                                 static_cast<size_t>(prefetch_size),
+                                 Env::IO_TOTAL /* rate_limiter_priority */);
+
+    s = ReadFooterFromFile(opts, file_.get(), *fs, &prefetch_buffer, file_size,
+                           &footer);
+  }
+  if (s.ok()) {
+    magic_number = footer.table_magic_number();
+  }
+
+  if (s.ok()) {
+    if (magic_number == kPlainTableMagicNumber ||
+        magic_number == kLegacyPlainTableMagicNumber) {
+      soptions_.use_mmap_reads = true;
+
+      fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
+      file_.reset(new RandomAccessFileReader(std::move(file), file_path));
+    }
+
+    // For old sst format, ReadTableProperties might fail but file can be read
+    if (ReadTableProperties(magic_number, file_.get(), file_size,
+                            (magic_number == kBlockBasedTableMagicNumber)
+                                ? &prefetch_buffer
+                                : nullptr)
+            .ok()) {
+      s = SetTableOptionsByMagicNumber(magic_number);
+      if (s.ok()) {
+        if (table_properties_ && !table_properties_->comparator_name.empty()) {
+          ConfigOptions config_options;
+          const Comparator* user_comparator = nullptr;
+          s = Comparator::CreateFromString(config_options,
+                                           table_properties_->comparator_name,
+                                           &user_comparator);
+          if (s.ok()) {
+            assert(user_comparator);
+            internal_comparator_ = InternalKeyComparator(user_comparator);
+          }
+        }
+      }
+    } else {
+      s = SetOldTableOptions();
+    }
+    options_.comparator = internal_comparator_.user_comparator();
+  }
+
+  if (s.ok()) {
+    s = NewTableReader(ioptions_, soptions_, internal_comparator_, file_size,
+                       &table_reader_);
+  }
+  return s;
+}
+
+Status SstFileDumper::NewTableReader(
+    const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/,
+    const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size,
+    std::unique_ptr<TableReader>* /*table_reader*/) {
+  auto t_opt = TableReaderOptions(
+      ioptions_, moptions_.prefix_extractor, soptions_, internal_comparator_,
+      0 /* block_protection_bytes_per_key */, false /* skip_filters */,
+      false /* immortal */, true /* force_direct_prefetch */);
+  // Allow open file with global sequence number for backward compatibility.
+  t_opt.largest_seqno = kMaxSequenceNumber;
+
+  // We need to turn off pre-fetching of index and filter nodes for
+  // BlockBasedTable
+  if (options_.table_factory->IsInstanceOf(
+          TableFactory::kBlockBasedTableName())) {
+    return options_.table_factory->NewTableReader(t_opt, std::move(file_),
+                                                  file_size, &table_reader_,
+                                                  /*enable_prefetch=*/false);
+  }
+
+  // For all other factory implementations
+  return options_.table_factory->NewTableReader(t_opt, std::move(file_),
+                                                file_size, &table_reader_);
+}
+
+Status SstFileDumper::VerifyChecksum() {
+  // We could pass a specific readahead setting into the read options if
+  // needed.
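+  // A hedged usage sketch for this entry point (the path and readahead value
+  // are examples only, not part of the vendored file):
+  //
+  //   SstFileDumper dumper(Options(), "/path/to/000123.sst",
+  //                        Temperature::kUnknown, 2 << 20 /* readahead */,
+  //                        true /* verify_checksum */, false /* output_hex */,
+  //                        false /* decode_blob_index */);
+  //   Status s = dumper.getStatus();  // result of opening/parsing the file
+  //   if (s.ok()) s = dumper.VerifyChecksum();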
+  return table_reader_->VerifyChecksum(read_options_,
+                                       TableReaderCaller::kSSTDumpTool);
+}
+
+Status SstFileDumper::DumpTable(const std::string& out_filename) {
+  std::unique_ptr<WritableFile> out_file;
+  Env* env = options_.env;
+  Status s = env->NewWritableFile(out_filename, &out_file, soptions_);
+  if (s.ok()) {
+    s = table_reader_->DumpTable(out_file.get());
+  }
+  if (!s.ok()) {
+    // close the file before returning the error; ignore any close error
+    out_file->Close().PermitUncheckedError();
+    return s;
+  }
+  return out_file->Close();
+}
+
+Status SstFileDumper::CalculateCompressedTableSize(
+    const TableBuilderOptions& tb_options, size_t block_size,
+    uint64_t* num_data_blocks, uint64_t* compressed_table_size) {
+  std::unique_ptr<Env> env(NewMemEnv(options_.env));
+  std::unique_ptr<WritableFileWriter> dest_writer;
+  Status s =
+      WritableFileWriter::Create(env->GetFileSystem(), testFileName,
+                                 FileOptions(soptions_), &dest_writer, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  BlockBasedTableOptions table_options;
+  table_options.block_size = block_size;
+  BlockBasedTableFactory block_based_tf(table_options);
+  std::unique_ptr<TableBuilder> table_builder;
+  table_builder.reset(
+      block_based_tf.NewTableBuilder(tb_options, dest_writer.get()));
+  std::unique_ptr<InternalIterator> iter(table_reader_->NewIterator(
+      read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    table_builder->Add(iter->key(), iter->value());
+  }
+  s = iter->status();
+  if (!s.ok()) {
+    return s;
+  }
+  s = table_builder->Finish();
+  if (!s.ok()) {
+    return s;
+  }
+  *compressed_table_size = table_builder->FileSize();
+  assert(num_data_blocks != nullptr);
+  *num_data_blocks = table_builder->GetTableProperties().num_data_blocks;
+  return env->DeleteFile(testFileName);
+}
+
+Status SstFileDumper::ShowAllCompressionSizes(
+    size_t block_size,
+    const std::vector<std::pair<CompressionType, const char*>>&
+        compression_types,
+    int32_t compress_level_from, int32_t compress_level_to,
+    uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes,
+    uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer) {
+  fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size);
+  for (auto& i : compression_types) {
+    if (CompressionTypeSupported(i.first)) {
+      fprintf(stdout, "Compression: %-24s\n", i.second);
+      CompressionOptions compress_opt;
+      compress_opt.max_dict_bytes = max_dict_bytes;
+      compress_opt.zstd_max_train_bytes = zstd_max_train_bytes;
+      compress_opt.max_dict_buffer_bytes = max_dict_buffer_bytes;
+      compress_opt.use_zstd_dict_trainer = use_zstd_dict_trainer;
+      for (int32_t j = compress_level_from; j <= compress_level_to; j++) {
+        fprintf(stdout, "Compression level: %d", j);
+        compress_opt.level = j;
+        Status s = ShowCompressionSize(block_size, i.first, compress_opt);
+        if (!s.ok()) {
+          return s;
+        }
+      }
+    } else {
+      fprintf(stdout, "Unsupported compression type: %s.\n", i.second);
+    }
+  }
+  return Status::OK();
+}
+
+Status SstFileDumper::ShowCompressionSize(
+    size_t block_size, CompressionType compress_type,
+    const CompressionOptions& compress_opt) {
+  Options opts;
+  opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  opts.statistics->set_stats_level(StatsLevel::kAll);
+  const ImmutableOptions imoptions(opts);
+  const ColumnFamilyOptions cfo(opts);
+  const MutableCFOptions moptions(cfo);
+  ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator);
+  IntTblPropCollectorFactories block_based_table_factories;
+
+  std::string column_family_name;
+  int unknown_level = -1;
+
+  TableBuilderOptions tb_opts(
+      imoptions, moptions, ikc, &block_based_table_factories, compress_type,
+      compress_opt,
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+      column_family_name, unknown_level);
+  uint64_t num_data_blocks = 0;
+  std::chrono::steady_clock::time_point start =
+      std::chrono::steady_clock::now();
+  uint64_t file_size;
+  Status s = CalculateCompressedTableSize(tb_opts, block_size,
+                                          &num_data_blocks, &file_size);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
+  fprintf(stdout, " Size: %10" PRIu64, file_size);
+  fprintf(stdout, " Blocks: %6" PRIu64, num_data_blocks);
+  fprintf(stdout, " Time Taken: %10s microsecs",
+          std::to_string(
+              std::chrono::duration_cast<std::chrono::microseconds>(end - start)
+                  .count())
+              .c_str());
+  const uint64_t compressed_blocks =
+      opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED);
+  const uint64_t not_compressed_blocks =
+      opts.statistics->getAndResetTickerCount(
+          NUMBER_BLOCK_COMPRESSION_REJECTED);
+  // When the option enable_index_compression is true,
+  // NUMBER_BLOCK_COMPRESSED is incremented for index block(s).
+  if ((compressed_blocks + not_compressed_blocks) > num_data_blocks) {
+    num_data_blocks = compressed_blocks + not_compressed_blocks;
+  }
+
+  const uint64_t ratio_not_compressed_blocks =
+      (num_data_blocks - compressed_blocks) - not_compressed_blocks;
+  const double compressed_pcnt =
+      (0 == num_data_blocks) ? 0.0
+                             : ((static_cast<double>(compressed_blocks) /
+                                 static_cast<double>(num_data_blocks)) *
+                                100.0);
+  const double ratio_not_compressed_pcnt =
+      (0 == num_data_blocks)
+          ? 0.0
+          : ((static_cast<double>(ratio_not_compressed_blocks) /
+              static_cast<double>(num_data_blocks)) *
+             100.0);
+  const double not_compressed_pcnt =
+      (0 == num_data_blocks) ? 0.0
+                             : ((static_cast<double>(not_compressed_blocks) /
+                                 static_cast<double>(num_data_blocks)) *
+                                100.0);
+  fprintf(stdout, " Compressed: %6" PRIu64 " (%5.1f%%)", compressed_blocks,
+          compressed_pcnt);
+  fprintf(stdout, " Not compressed (ratio): %6" PRIu64 " (%5.1f%%)",
+          ratio_not_compressed_blocks, ratio_not_compressed_pcnt);
+  fprintf(stdout, " Not compressed (abort): %6" PRIu64 " (%5.1f%%)\n",
+          not_compressed_blocks, not_compressed_pcnt);
+  return Status::OK();
+}
+
+// Reads TableProperties prior to opening table reader in order to set up
+// options.
+Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number,
+                                          RandomAccessFileReader* file,
+                                          uint64_t file_size,
+                                          FilePrefetchBuffer* prefetch_buffer) {
+  // TODO: plumb Env::IOActivity
+  const ReadOptions read_options;
+  Status s = ROCKSDB_NAMESPACE::ReadTableProperties(
+      file, file_size, table_magic_number, ioptions_, read_options,
+      &table_properties_,
+      /* memory_allocator= */ nullptr, prefetch_buffer);
+  if (!s.ok()) {
+    if (!silent_) {
+      fprintf(stdout, "Not able to read table properties\n");
+    }
+  }
+  return s;
+}
+
+Status SstFileDumper::SetTableOptionsByMagicNumber(
+    uint64_t table_magic_number) {
+  assert(table_properties_);
+  if (table_magic_number == kBlockBasedTableMagicNumber ||
+      table_magic_number == kLegacyBlockBasedTableMagicNumber) {
+    BlockBasedTableFactory* bbtf = new BlockBasedTableFactory();
+    // To force tail prefetching, we fake reporting two useful reads of 512KB
+    // from the tail.
+    // It needs at least two data points to warm up the stats.
+    bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
+    bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
+
+    options_.table_factory.reset(bbtf);
+    if (!silent_) {
+      fprintf(stdout, "Sst file format: block-based\n");
+    }
+
+    auto& props = table_properties_->user_collected_properties;
+    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
+    if (pos != props.end()) {
+      auto index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
+          DecodeFixed32(pos->second.c_str()));
+      if (index_type_on_file ==
+          BlockBasedTableOptions::IndexType::kHashSearch) {
+        options_.prefix_extractor.reset(NewNoopTransform());
+      }
+    }
+  } else if (table_magic_number == kPlainTableMagicNumber ||
+             table_magic_number == kLegacyPlainTableMagicNumber) {
+    options_.allow_mmap_reads = true;
+
+    PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = kPlainTableVariableLength;
+    plain_table_options.bloom_bits_per_key = 0;
+    plain_table_options.hash_table_ratio = 0;
+    plain_table_options.index_sparseness = 1;
+    plain_table_options.huge_page_tlb_size = 0;
+    plain_table_options.encoding_type = kPlain;
+    plain_table_options.full_scan_mode = true;
+
+    options_.table_factory.reset(NewPlainTableFactory(plain_table_options));
+    if (!silent_) {
+      fprintf(stdout, "Sst file format: plain table\n");
+    }
+  } else {
+    char error_msg_buffer[80];
+    snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1,
+             "Unsupported table magic number --- %lx",
+             (long)table_magic_number);
+    return Status::InvalidArgument(error_msg_buffer);
+  }
+
+  return Status::OK();
+}
+
+Status SstFileDumper::SetOldTableOptions() {
+  assert(table_properties_ == nullptr);
+  options_.table_factory = std::make_shared<BlockBasedTableFactory>();
+  if (!silent_) {
+    fprintf(stdout, "Sst file format: block-based(old version)\n");
+  }
+
+  return Status::OK();
+}
+
+Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num,
+                                     bool has_from, const std::string& from_key,
+                                     bool has_to, const std::string& to_key,
+                                     bool use_from_as_prefix) {
+  if (!table_reader_) {
+    return init_result_;
+  }
+
+  InternalIterator* iter = table_reader_->NewIterator(
+      read_options_, moptions_.prefix_extractor.get(),
+      /*arena=*/nullptr, /*skip_filters=*/false,
+      TableReaderCaller::kSSTDumpTool);
+  uint64_t i = 0;
+  if (has_from) {
+    InternalKey ikey;
+    ikey.SetMinPossibleForUserKey(from_key);
+    iter->Seek(ikey.Encode());
+  } else {
+    iter->SeekToFirst();
+  }
+  for (; iter->Valid(); iter->Next()) {
+    Slice key = iter->key();
+    Slice value = iter->value();
+    ++i;
+    if (read_num > 0 && i > read_num) break;
+
+    ParsedInternalKey ikey;
+    Status pik_status = ParseInternalKey(key, &ikey, true /* log_err_key */);
+    if (!pik_status.ok()) {
+      std::cerr << pik_status.getState() << "\n";
+      continue;
+    }
+
+    // stop if the key returned is no longer prefixed with our 'from' key
+    if (use_from_as_prefix && !ikey.user_key.starts_with(from_key)) {
+      break;
+    }
+
+    // If an end marker was specified, we stop before it
+    if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) {
+      break;
+    }
+
+    if (print_kv) {
+      if (!decode_blob_index_ || ikey.type != kTypeBlobIndex) {
+        fprintf(stdout, "%s => %s\n",
+                ikey.DebugString(true, output_hex_).c_str(),
+                value.ToString(output_hex_).c_str());
+      } else {
+        BlobIndex blob_index;
+
+        const Status s = blob_index.DecodeFrom(value);
+        if (!s.ok()) {
+          fprintf(stderr, "%s => error decoding blob index\n",
+                  ikey.DebugString(true, output_hex_).c_str());
+          continue;
+        }
+
+        fprintf(stdout, "%s => %s\n",
+                ikey.DebugString(true, output_hex_).c_str(),
+                blob_index.DebugString(output_hex_).c_str());
+      }
+    }
+  }
+
+  read_num_ += i;
+
+  Status ret = iter->status();
+  delete iter;
+  return ret;
+}
+
+// Provides TableProperties to API user
+Status SstFileDumper::ReadTableProperties(
+    std::shared_ptr<const TableProperties>* table_properties) {
+  if (!table_reader_) {
+    return init_result_;
+  }
+
+  *table_properties = table_reader_->GetTableProperties();
+  return init_result_;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_dumper.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_dumper.h
new file mode 100644
index 0000000000..1e78959d14
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_dumper.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/advanced_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SstFileDumper {
+ public:
+  explicit SstFileDumper(const Options& options, const std::string& file_name,
+                         Temperature file_temp, size_t readahead_size,
+                         bool verify_checksum, bool output_hex,
+                         bool decode_blob_index,
+                         const EnvOptions& soptions = EnvOptions(),
+                         bool silent = false);
+
+  Status ReadSequential(bool print_kv, uint64_t read_num, bool has_from,
+                        const std::string& from_key, bool has_to,
+                        const std::string& to_key,
+                        bool use_from_as_prefix = false);
+
+  Status ReadTableProperties(
+      std::shared_ptr<const TableProperties>* table_properties);
+  uint64_t GetReadNumber() { return read_num_; }
+  TableProperties* GetInitTableProperties() { return table_properties_.get(); }
+
+  Status VerifyChecksum();
+  Status DumpTable(const std::string& out_filename);
+  Status getStatus() { return init_result_; }
+
+  Status ShowAllCompressionSizes(
+      size_t block_size,
+      const std::vector<std::pair<CompressionType, const char*>>&
+          compression_types,
+      int32_t compress_level_from, int32_t compress_level_to,
+      uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes,
+      uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer);
+
+  Status ShowCompressionSize(size_t block_size, CompressionType compress_type,
+                             const CompressionOptions& compress_opt);
+
+ private:
+  // Get the TableReader implementation for the sst file
+  Status GetTableReader(const std::string& file_path);
+  Status ReadTableProperties(uint64_t table_magic_number,
+                             RandomAccessFileReader* file, uint64_t file_size,
+                             FilePrefetchBuffer* prefetch_buffer);
+
+  Status CalculateCompressedTableSize(const TableBuilderOptions& tb_options,
+                                      size_t block_size,
+                                      uint64_t* num_data_blocks,
+                                      uint64_t* compressed_table_size);
+
+  Status SetTableOptionsByMagicNumber(uint64_t table_magic_number);
+  Status SetOldTableOptions();
+
+  // Helper function to call the factory with settings specific to the
+  // factory implementation
+  Status NewTableReader(const ImmutableOptions& ioptions,
+                        const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_comparator,
+                        uint64_t file_size,
+                        std::unique_ptr<TableReader>* table_reader);
+
+  std::string file_name_;
+  uint64_t read_num_;
+  Temperature file_temp_;
+  bool output_hex_;
+  bool decode_blob_index_;
+  EnvOptions soptions_;
+  // less verbose in stdout/stderr
+  bool silent_;
+
+  // options_ and internal_comparator_ will also be used in ReadSequential
+  // internally (specifically, seek-related operations)
+  Options options_;
+
+  Status init_result_;
+  std::unique_ptr<TableReader> table_reader_;
+  std::unique_ptr<RandomAccessFileReader> file_;
+
+  const ImmutableOptions ioptions_;
+  const MutableCFOptions moptions_;
+  ReadOptions read_options_;
+  InternalKeyComparator internal_comparator_;
+  std::unique_ptr<TableProperties> table_properties_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_reader.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_reader.cc
new file mode 100644
index 0000000000..533b7cd6ac
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_reader.cc
@@ -0,0 +1,101 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include "rocksdb/sst_file_reader.h"
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "file/random_access_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "table/get_context.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct SstFileReader::Rep {
+  Options options;
+  EnvOptions soptions;
+  ImmutableOptions ioptions;
+  MutableCFOptions moptions;
+
+  std::unique_ptr<TableReader> table_reader;
+
+  Rep(const Options& opts)
+      : options(opts),
+        soptions(options),
+        ioptions(options),
+        moptions(ColumnFamilyOptions(options)) {}
+};
+
+SstFileReader::SstFileReader(const Options& options) : rep_(new Rep(options)) {}
+
+SstFileReader::~SstFileReader() {}
+
+Status SstFileReader::Open(const std::string& file_path) {
+  auto r = rep_.get();
+  Status s;
+  uint64_t file_size = 0;
+  std::unique_ptr<FSRandomAccessFile> file;
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  FileOptions fopts(r->soptions);
+  const auto& fs = r->options.env->GetFileSystem();
+
+  s = fs->GetFileSize(file_path, fopts.io_options, &file_size, nullptr);
+  if (s.ok()) {
+    s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
+  }
+  if (s.ok()) {
+    file_reader.reset(new RandomAccessFileReader(std::move(file), file_path));
+  }
+  if (s.ok()) {
+    TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor,
+                             r->soptions, r->ioptions.internal_comparator,
+                             r->moptions.block_protection_bytes_per_key);
+    // Allow open file with global sequence number for backward compatibility.
+    t_opt.largest_seqno = kMaxSequenceNumber;
+    s = r->options.table_factory->NewTableReader(t_opt, std::move(file_reader),
+                                                 file_size, &r->table_reader);
+  }
+  return s;
+}
+
+Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) {
+  assert(roptions.io_activity == Env::IOActivity::kUnknown);
+  auto r = rep_.get();
+  auto sequence = roptions.snapshot != nullptr
+                      ? roptions.snapshot->GetSequenceNumber()
+                      : kMaxSequenceNumber;
+  ArenaWrappedDBIter* res = new ArenaWrappedDBIter();
+  res->Init(r->options.env, roptions, r->ioptions, r->moptions,
+            nullptr /* version */, sequence,
+            r->moptions.max_sequential_skip_in_iterations,
+            0 /* version_number */, nullptr /* read_callback */,
+            nullptr /* db_impl */, nullptr /* cfd */,
+            true /* expose_blob_index */, false /* allow_refresh */);
+  auto internal_iter = r->table_reader->NewIterator(
+      res->GetReadOptions(), r->moptions.prefix_extractor.get(),
+      res->GetArena(), false /* skip_filters */,
+      TableReaderCaller::kSSTFileReader);
+  res->SetIterUnderDBIter(internal_iter);
+  return res;
+}
+
+std::shared_ptr<const TableProperties> SstFileReader::GetTableProperties()
+    const {
+  return rep_->table_reader->GetTableProperties();
+}
+
+Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) {
+  assert(read_options.io_activity == Env::IOActivity::kUnknown);
+  return rep_->table_reader->VerifyChecksum(read_options,
+                                            TableReaderCaller::kSSTFileReader);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_writer.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_writer.cc
new file mode 100644
index 0000000000..d187b741e3
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_writer.cc
@@ -0,0 +1,436 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/sst_file_writer.h"
+
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/sst_file_writer_collectors.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string ExternalSstFilePropertyNames::kVersion =
+    "rocksdb.external_sst_file.version";
+const std::string ExternalSstFilePropertyNames::kGlobalSeqno =
+    "rocksdb.external_sst_file.global_seqno";
+
+
+const size_t kFadviseTrigger = 1024 * 1024;  // 1MB
+
+struct SstFileWriter::Rep {
+  Rep(const EnvOptions& _env_options, const Options& options,
+      Env::IOPriority _io_priority, const Comparator* _user_comparator,
+      ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters,
+      std::string _db_session_id)
+      : env_options(_env_options),
+        ioptions(options),
+        mutable_cf_options(options),
+        io_priority(_io_priority),
+        internal_comparator(_user_comparator),
+        cfh(_cfh),
+        invalidate_page_cache(_invalidate_page_cache),
+        skip_filters(_skip_filters),
+        db_session_id(_db_session_id) {}
+
+  std::unique_ptr<WritableFileWriter> file_writer;
+  std::unique_ptr<TableBuilder> builder;
+  EnvOptions env_options;
+  ImmutableOptions ioptions;
+  MutableCFOptions mutable_cf_options;
+  Env::IOPriority io_priority;
+  InternalKeyComparator internal_comparator;
+  ExternalSstFileInfo file_info;
+  InternalKey ikey;
+  std::string column_family_name;
+  ColumnFamilyHandle* cfh;
+  // If true, we will give the OS a hint that this file's pages are not needed
+  // every time we write 1MB to the file.
+  bool invalidate_page_cache;
+  // The size of the file during the last time we called Fadvise to remove
+  // cached pages from the page cache.
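+  // Worked example (a sketch, not normative): with kFadviseTrigger = 1 MB,
+  // suppose last_fadvise_size is 0 and FileSize() has grown to 1.5 MB by the
+  // next InvalidatePageCache() call below. Then bytes_since_last_fadvise =
+  // 1.5 MB > 1 MB, so file_writer->InvalidateCache(0, 0) is issued for the
+  // whole file and last_fadvise_size becomes 1.5 MB.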
+ uint64_t last_fadvise_size = 0; + bool skip_filters; + std::string db_session_id; + uint64_t next_file_number = 1; + + Status AddImpl(const Slice& user_key, const Slice& value, + ValueType value_type) { + if (!builder) { + return Status::InvalidArgument("File is not opened"); + } + + if (file_info.num_entries == 0) { + file_info.smallest_key.assign(user_key.data(), user_key.size()); + } else { + if (internal_comparator.user_comparator()->Compare( + user_key, file_info.largest_key) <= 0) { + // Make sure that keys are added in order + return Status::InvalidArgument( + "Keys must be added in strict ascending order."); + } + } + + assert(value_type == kTypeValue || value_type == kTypeMerge || + value_type == kTypeDeletion || + value_type == kTypeDeletionWithTimestamp); + + constexpr SequenceNumber sequence_number = 0; + + ikey.Set(user_key, sequence_number, value_type); + + builder->Add(ikey.Encode(), value); + + // update file info + file_info.num_entries++; + file_info.largest_key.assign(user_key.data(), user_key.size()); + file_info.file_size = builder->FileSize(); + + InvalidatePageCache(false /* closing */).PermitUncheckedError(); + return Status::OK(); + } + + Status Add(const Slice& user_key, const Slice& value, ValueType value_type) { + if (internal_comparator.user_comparator()->timestamp_size() != 0) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + + return AddImpl(user_key, value, value_type); + } + + Status Add(const Slice& user_key, const Slice& timestamp, const Slice& value, + ValueType value_type) { + const size_t timestamp_size = timestamp.size(); + + if (internal_comparator.user_comparator()->timestamp_size() != + timestamp_size) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + + const size_t user_key_size = user_key.size(); + + if (user_key.data() + user_key_size == timestamp.data()) { + Slice user_key_with_ts(user_key.data(), user_key_size + timestamp_size); + return AddImpl(user_key_with_ts, value, value_type); + } + + std::string user_key_with_ts; + user_key_with_ts.reserve(user_key_size + timestamp_size); + user_key_with_ts.append(user_key.data(), user_key_size); + user_key_with_ts.append(timestamp.data(), timestamp_size); + + return AddImpl(user_key_with_ts, value, value_type); + } + + Status DeleteRangeImpl(const Slice& begin_key, const Slice& end_key) { + if (!builder) { + return Status::InvalidArgument("File is not opened"); + } + int cmp = internal_comparator.user_comparator()->CompareWithoutTimestamp( + begin_key, end_key); + if (cmp > 0) { + // It's an empty range where endpoints appear mistaken. Don't bother + // applying it to the DB, and return an error to the user. + return Status::InvalidArgument("end key comes before start key"); + } else if (cmp == 0) { + // It's an empty range. Don't bother applying it to the DB. 
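+      // (e.g. DeleteRange("k", "k") falls here: the half-open range [k, k)
+      // cannot cover any key, so nothing needs to be recorded.)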
+      return Status::OK();
+    }
+
+    RangeTombstone tombstone(begin_key, end_key, 0 /* Sequence Number */);
+    if (file_info.num_range_del_entries == 0) {
+      file_info.smallest_range_del_key.assign(tombstone.start_key_.data(),
+                                              tombstone.start_key_.size());
+      file_info.largest_range_del_key.assign(tombstone.end_key_.data(),
+                                             tombstone.end_key_.size());
+    } else {
+      if (internal_comparator.user_comparator()->Compare(
+              tombstone.start_key_, file_info.smallest_range_del_key) < 0) {
+        file_info.smallest_range_del_key.assign(tombstone.start_key_.data(),
+                                                tombstone.start_key_.size());
+      }
+      if (internal_comparator.user_comparator()->Compare(
+              tombstone.end_key_, file_info.largest_range_del_key) > 0) {
+        file_info.largest_range_del_key.assign(tombstone.end_key_.data(),
+                                               tombstone.end_key_.size());
+      }
+    }
+
+    auto ikey_and_end_key = tombstone.Serialize();
+    builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second);
+
+    // update file info
+    file_info.num_range_del_entries++;
+    file_info.file_size = builder->FileSize();
+
+    InvalidatePageCache(false /* closing */).PermitUncheckedError();
+    return Status::OK();
+  }
+
+  Status DeleteRange(const Slice& begin_key, const Slice& end_key) {
+    if (internal_comparator.user_comparator()->timestamp_size() != 0) {
+      return Status::InvalidArgument("Timestamp size mismatch");
+    }
+    return DeleteRangeImpl(begin_key, end_key);
+  }
+
+  // begin_key and end_key should be user keys without timestamp.
+  Status DeleteRange(const Slice& begin_key, const Slice& end_key,
+                     const Slice& timestamp) {
+    const size_t timestamp_size = timestamp.size();
+
+    if (internal_comparator.user_comparator()->timestamp_size() !=
+        timestamp_size) {
+      return Status::InvalidArgument("Timestamp size mismatch");
+    }
+
+    const size_t begin_key_size = begin_key.size();
+    const size_t end_key_size = end_key.size();
+    if (begin_key.data() + begin_key_size == timestamp.data() ||
+        end_key.data() + end_key_size == timestamp.data()) {
+      assert(memcmp(begin_key.data() + begin_key_size,
+                    end_key.data() + end_key_size, timestamp_size) == 0);
+      Slice begin_key_with_ts(begin_key.data(),
+                              begin_key_size + timestamp_size);
+      Slice end_key_with_ts(end_key.data(), end_key.size() + timestamp_size);
+      return DeleteRangeImpl(begin_key_with_ts, end_key_with_ts);
+    }
+    std::string begin_key_with_ts;
+    begin_key_with_ts.reserve(begin_key_size + timestamp_size);
+    begin_key_with_ts.append(begin_key.data(), begin_key_size);
+    begin_key_with_ts.append(timestamp.data(), timestamp_size);
+    std::string end_key_with_ts;
+    end_key_with_ts.reserve(end_key_size + timestamp_size);
+    end_key_with_ts.append(end_key.data(), end_key_size);
+    end_key_with_ts.append(timestamp.data(), timestamp_size);
+    return DeleteRangeImpl(begin_key_with_ts, end_key_with_ts);
+  }
+
+  Status InvalidatePageCache(bool closing) {
+    Status s = Status::OK();
+    if (invalidate_page_cache == false) {
+      // Fadvise disabled
+      return s;
+    }
+    uint64_t bytes_since_last_fadvise =
+        builder->FileSize() - last_fadvise_size;
+    if (bytes_since_last_fadvise > kFadviseTrigger || closing) {
+      TEST_SYNC_POINT_CALLBACK("SstFileWriter::Rep::InvalidatePageCache",
+                               &(bytes_since_last_fadvise));
+      // Tell the OS that we don't need this file in page cache
+      s = file_writer->InvalidateCache(0, 0);
+      if (s.IsNotSupported()) {
+        // NotSupported is fine as it could be a file type that doesn't use
+        // page cache.
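+        // (e.g. an in-memory or otherwise non-POSIX FileSystem may not
+        // implement fadvise-style hints; treating that as success is
+        // intentional.)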
+        s = Status::OK();
+      }
+      last_fadvise_size = builder->FileSize();
+    }
+    return s;
+  }
+};
+
+SstFileWriter::SstFileWriter(const EnvOptions& env_options,
+                             const Options& options,
+                             const Comparator* user_comparator,
+                             ColumnFamilyHandle* column_family,
+                             bool invalidate_page_cache,
+                             Env::IOPriority io_priority, bool skip_filters)
+    : rep_(new Rep(env_options, options, io_priority, user_comparator,
+                   column_family, invalidate_page_cache, skip_filters,
+                   DBImpl::GenerateDbSessionId(options.env))) {
+  // SstFileWriter is used to create sst files that can be added to the
+  // database later. Therefore, no real db_id and db_session_id are associated
+  // with it. Here we mimic the way db_session_id behaves by getting a
+  // db_session_id for each SstFileWriter, and (later below) assign unique
+  // file numbers in the table properties. The db_id is set to be "SST Writer"
+  // for clarity.
+
+  rep_->file_info.file_size = 0;
+}
+
+SstFileWriter::~SstFileWriter() {
+  if (rep_->builder) {
+    // User did not call Finish() or Finish() failed. We need to
+    // abandon the builder.
+    rep_->builder->Abandon();
+  }
+}
+
+Status SstFileWriter::Open(const std::string& file_path) {
+  Rep* r = rep_.get();
+  Status s;
+  std::unique_ptr<FSWritableFile> sst_file;
+  FileOptions cur_file_opts(r->env_options);
+  s = r->ioptions.env->GetFileSystem()->NewWritableFile(
+      file_path, cur_file_opts, &sst_file, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+
+  sst_file->SetIOPriority(r->io_priority);
+
+  CompressionType compression_type;
+  CompressionOptions compression_opts;
+  if (r->mutable_cf_options.bottommost_compression !=
+      kDisableCompressionOption) {
+    compression_type = r->mutable_cf_options.bottommost_compression;
+    if (r->mutable_cf_options.bottommost_compression_opts.enabled) {
+      compression_opts = r->mutable_cf_options.bottommost_compression_opts;
+    } else {
+      compression_opts = r->mutable_cf_options.compression_opts;
+    }
+  } else if (!r->mutable_cf_options.compression_per_level.empty()) {
+    // Use the compression of the last level if we have per-level compression
+    compression_type = *(r->mutable_cf_options.compression_per_level.rbegin());
+    compression_opts = r->mutable_cf_options.compression_opts;
+  } else {
+    compression_type = r->mutable_cf_options.compression;
+    compression_opts = r->mutable_cf_options.compression_opts;
+  }
+
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+  // SstFileWriter properties collector to add SstFileWriter version.
+  int_tbl_prop_collector_factories.emplace_back(
+      new SstFileWriterPropertiesCollectorFactory(2 /* version */,
+                                                  0 /* global_seqno*/));
+
+  // User collector factories
+  auto user_collector_factories =
+      r->ioptions.table_properties_collector_factories;
+  for (size_t i = 0; i < user_collector_factories.size(); i++) {
+    int_tbl_prop_collector_factories.emplace_back(
+        new UserKeyTablePropertiesCollectorFactory(
+            user_collector_factories[i]));
+  }
+  int unknown_level = -1;
+  uint32_t cf_id;
+
+  if (r->cfh != nullptr) {
+    // user explicitly specified that this file will be ingested into cfh,
+    // we can persist this information in the file.
+    cf_id = r->cfh->GetID();
+    r->column_family_name = r->cfh->GetName();
+  } else {
+    r->column_family_name = "";
+    cf_id = TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+  }
+
+  // TODO: it would be better to set oldest_key_time to be used for getting the
+  // approximate time of ingested keys.
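+  // To make the compression selection above concrete (a hedged example,
+  // values arbitrary): with bottommost_compression unset and
+  // compression_per_level = {kNoCompression, kSnappyCompression, kZSTD},
+  // this writer picks kZSTD, the last level's setting, on the assumption
+  // that externally built files are usually ingested into the bottom level.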
+ TableBuilderOptions table_builder_options( + r->ioptions, r->mutable_cf_options, r->internal_comparator, + &int_tbl_prop_collector_factories, compression_type, compression_opts, + cf_id, r->column_family_name, unknown_level, false /* is_bottommost */, + TableFileCreationReason::kMisc, 0 /* oldest_key_time */, + 0 /* file_creation_time */, "SST Writer" /* db_id */, r->db_session_id, + 0 /* target_file_size */, r->next_file_number); + // External SST files used to each get a unique session id. Now for + // slightly better uniqueness probability in constructing cache keys, we + // assign fake file numbers to each file (into table properties) and keep + // the same session id for the life of the SstFileWriter. + r->next_file_number++; + // XXX: when we can remove skip_filters from the SstFileWriter public API + // we can remove it from TableBuilderOptions. + table_builder_options.skip_filters = r->skip_filters; + FileTypeSet tmp_set = r->ioptions.checksum_handoff_file_types; + r->file_writer.reset(new WritableFileWriter( + std::move(sst_file), file_path, r->env_options, r->ioptions.clock, + nullptr /* io_tracer */, nullptr /* stats */, r->ioptions.listeners, + r->ioptions.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); + + // TODO(tec) : If table_factory is using compressed block cache, we will + // be adding the external sst file blocks into it, which is wasteful. + r->builder.reset(r->ioptions.table_factory->NewTableBuilder( + table_builder_options, r->file_writer.get())); + + r->file_info = ExternalSstFileInfo(); + r->file_info.file_path = file_path; + r->file_info.version = 2; + return s; +} + +Status SstFileWriter::Add(const Slice& user_key, const Slice& value) { + return rep_->Add(user_key, value, ValueType::kTypeValue); +} + +Status SstFileWriter::Put(const Slice& user_key, const Slice& value) { + return rep_->Add(user_key, value, ValueType::kTypeValue); +} + +Status SstFileWriter::Put(const Slice& user_key, const Slice& timestamp, + const Slice& value) { + return rep_->Add(user_key, timestamp, value, ValueType::kTypeValue); +} + +Status SstFileWriter::Merge(const Slice& user_key, const Slice& value) { + return rep_->Add(user_key, value, ValueType::kTypeMerge); +} + +Status SstFileWriter::Delete(const Slice& user_key) { + return rep_->Add(user_key, Slice(), ValueType::kTypeDeletion); +} + +Status SstFileWriter::Delete(const Slice& user_key, const Slice& timestamp) { + return rep_->Add(user_key, timestamp, Slice(), + ValueType::kTypeDeletionWithTimestamp); +} + +Status SstFileWriter::DeleteRange(const Slice& begin_key, + const Slice& end_key) { + return rep_->DeleteRange(begin_key, end_key); +} + +Status SstFileWriter::DeleteRange(const Slice& begin_key, const Slice& end_key, + const Slice& timestamp) { + return rep_->DeleteRange(begin_key, end_key, timestamp); +} + +Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { + Rep* r = rep_.get(); + if (!r->builder) { + return Status::InvalidArgument("File is not opened"); + } + if (r->file_info.num_entries == 0 && + r->file_info.num_range_del_entries == 0) { + return Status::InvalidArgument("Cannot create sst file with no entries"); + } + + Status s = r->builder->Finish(); + r->file_info.file_size = r->builder->FileSize(); + + if (s.ok()) { + s = r->file_writer->Sync(r->ioptions.use_fsync); + r->InvalidatePageCache(true /* closing */).PermitUncheckedError(); + if (s.ok()) { + s = r->file_writer->Close(); + } + } + if (s.ok()) { + r->file_info.file_checksum = r->file_writer->GetFileChecksum(); + 
r->file_info.file_checksum_func_name =
+        r->file_writer->GetFileChecksumFuncName();
+  }
+  if (!s.ok()) {
+    r->ioptions.env->DeleteFile(r->file_info.file_path);
+  }
+
+  if (file_info != nullptr) {
+    *file_info = r->file_info;
+  }
+
+  r->builder.reset();
+  return s;
+}
+
+uint64_t SstFileWriter::FileSize() { return rep_->file_info.file_size; }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_writer_collectors.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_writer_collectors.h
new file mode 100644
index 0000000000..486315fb5a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/sst_file_writer_collectors.h
@@ -0,0 +1,95 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+
+#include "db/table_properties_collector.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Table Properties that are specific to tables created by SstFileWriter.
+struct ExternalSstFilePropertyNames {
+  // value of this property is a fixed uint32 number.
+  static const std::string kVersion;
+  // value of this property is a fixed uint64 number.
+  static const std::string kGlobalSeqno;
+};
+
+// PropertiesCollector used to add properties specific to tables
+// generated by SstFileWriter
+class SstFileWriterPropertiesCollector : public IntTblPropCollector {
+ public:
+  explicit SstFileWriterPropertiesCollector(int32_t version,
+                                            SequenceNumber global_seqno)
+      : version_(version), global_seqno_(global_seqno) {}
+
+  virtual Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/,
+                             uint64_t /*file_size*/) override {
+    // Intentionally left blank. Have no interest in collecting stats for
+    // individual key/value pairs.
+    return Status::OK();
+  }
+
+  virtual void BlockAdd(uint64_t /* block_uncomp_bytes */,
+                        uint64_t /* block_compressed_bytes_fast */,
+                        uint64_t /* block_compressed_bytes_slow */) override {
+    // Intentionally left blank. No interest in collecting stats for
+    // blocks.
+    return;
+  }
+
+  virtual Status Finish(UserCollectedProperties* properties) override {
+    // File version
+    std::string version_val;
+    PutFixed32(&version_val, static_cast<uint32_t>(version_));
+    properties->insert({ExternalSstFilePropertyNames::kVersion, version_val});
+
+    // Global Sequence number
+    std::string seqno_val;
+    PutFixed64(&seqno_val, static_cast<uint64_t>(global_seqno_));
+    properties->insert({ExternalSstFilePropertyNames::kGlobalSeqno, seqno_val});
+
+    return Status::OK();
+  }
+
+  virtual const char* Name() const override {
+    return "SstFileWriterPropertiesCollector";
+  }
+
+  virtual UserCollectedProperties GetReadableProperties() const override {
+    return {{ExternalSstFilePropertyNames::kVersion, std::to_string(version_)}};
+  }
+
+ private:
+  int32_t version_;
+  SequenceNumber global_seqno_;
+};
+
+class SstFileWriterPropertiesCollectorFactory
+    : public IntTblPropCollectorFactory {
+ public:
+  explicit SstFileWriterPropertiesCollectorFactory(int32_t version,
+                                                   SequenceNumber global_seqno)
+      : version_(version), global_seqno_(global_seqno) {}
+
+  virtual IntTblPropCollector* CreateIntTblPropCollector(
+      uint32_t /*column_family_id*/, int /* level_at_creation */) override {
+    return new SstFileWriterPropertiesCollector(version_, global_seqno_);
+  }
+
+  virtual const char* Name() const override {
+    return "SstFileWriterPropertiesCollector";
+  }
+
+ private:
+  int32_t version_;
+  SequenceNumber global_seqno_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_builder.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_builder.h
new file mode 100644
index 0000000000..6d98bce707
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_builder.h
@@ -0,0 +1,228 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#pragma once + +#include + +#include +#include +#include + +#include "db/dbformat.h" +#include "db/seqno_to_time_mapping.h" +#include "db/table_properties_collector.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "rocksdb/options.h" +#include "rocksdb/table_properties.h" +#include "table/unique_id_impl.h" +#include "trace_replay/block_cache_tracer.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class Status; + +struct TableReaderOptions { + // @param skip_filters Disables loading/accessing the filter block + TableReaderOptions( + const ImmutableOptions& _ioptions, + const std::shared_ptr& _prefix_extractor, + const EnvOptions& _env_options, + const InternalKeyComparator& _internal_comparator, + uint8_t _block_protection_bytes_per_key, bool _skip_filters = false, + bool _immortal = false, bool _force_direct_prefetch = false, + int _level = -1, BlockCacheTracer* const _block_cache_tracer = nullptr, + size_t _max_file_size_for_l0_meta_pin = 0, + const std::string& _cur_db_session_id = "", uint64_t _cur_file_num = 0, + UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0, + uint64_t _tail_size = 0) + : ioptions(_ioptions), + prefix_extractor(_prefix_extractor), + env_options(_env_options), + internal_comparator(_internal_comparator), + skip_filters(_skip_filters), + immortal(_immortal), + force_direct_prefetch(_force_direct_prefetch), + level(_level), + largest_seqno(_largest_seqno), + block_cache_tracer(_block_cache_tracer), + max_file_size_for_l0_meta_pin(_max_file_size_for_l0_meta_pin), + cur_db_session_id(_cur_db_session_id), + cur_file_num(_cur_file_num), + unique_id(_unique_id), + block_protection_bytes_per_key(_block_protection_bytes_per_key), + tail_size(_tail_size) {} + + const ImmutableOptions& ioptions; + const std::shared_ptr& prefix_extractor; + const EnvOptions& env_options; + const InternalKeyComparator& internal_comparator; + // This is only used for BlockBasedTable (reader) + bool skip_filters; + // Whether the table will be valid as long as the DB is open + bool immortal; + // When data prefetching is needed, even if direct I/O is off, read data to + // fetch into RocksDB's buffer, rather than relying + // RandomAccessFile::Prefetch(). + bool force_direct_prefetch; + // What level this table/file is on, -1 for "not set, don't know." Used + // for level-specific statistics. + int level; + // largest seqno in the table (or 0 means unknown???) + SequenceNumber largest_seqno; + BlockCacheTracer* const block_cache_tracer; + // Largest L0 file size whose meta-blocks may be pinned (can be zero when + // unknown). 
+ const size_t max_file_size_for_l0_meta_pin; + + std::string cur_db_session_id; + + uint64_t cur_file_num; + + // Known unique_id or {}, kNullUniqueId64x2 means unknown + UniqueId64x2 unique_id; + + uint8_t block_protection_bytes_per_key; + + uint64_t tail_size; +}; + +struct TableBuilderOptions { + TableBuilderOptions( + const ImmutableOptions& _ioptions, const MutableCFOptions& _moptions, + const InternalKeyComparator& _internal_comparator, + const IntTblPropCollectorFactories* _int_tbl_prop_collector_factories, + CompressionType _compression_type, + const CompressionOptions& _compression_opts, uint32_t _column_family_id, + const std::string& _column_family_name, int _level, + bool _is_bottommost = false, + TableFileCreationReason _reason = TableFileCreationReason::kMisc, + const int64_t _oldest_key_time = 0, + const uint64_t _file_creation_time = 0, const std::string& _db_id = "", + const std::string& _db_session_id = "", + const uint64_t _target_file_size = 0, const uint64_t _cur_file_num = 0) + : ioptions(_ioptions), + moptions(_moptions), + internal_comparator(_internal_comparator), + int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories), + compression_type(_compression_type), + compression_opts(_compression_opts), + column_family_id(_column_family_id), + column_family_name(_column_family_name), + oldest_key_time(_oldest_key_time), + target_file_size(_target_file_size), + file_creation_time(_file_creation_time), + db_id(_db_id), + db_session_id(_db_session_id), + level_at_creation(_level), + is_bottommost(_is_bottommost), + reason(_reason), + cur_file_num(_cur_file_num) {} + + const ImmutableOptions& ioptions; + const MutableCFOptions& moptions; + const InternalKeyComparator& internal_comparator; + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories; + const CompressionType compression_type; + const CompressionOptions& compression_opts; + const uint32_t column_family_id; + const std::string& column_family_name; + const int64_t oldest_key_time; + const uint64_t target_file_size; + const uint64_t file_creation_time; + const std::string db_id; + const std::string db_session_id; + // BEGIN for FilterBuildingContext + const int level_at_creation; + const bool is_bottommost; + const TableFileCreationReason reason; + // END for FilterBuildingContext + + // XXX: only used by BlockBasedTableBuilder for SstFileWriter. If you + // want to skip filters, that should be (for example) null filter_policy + // in the table options of the ioptions.table_factory + bool skip_filters = false; + const uint64_t cur_file_num; +}; + +// TableBuilder provides the interface used to build a Table +// (an immutable and sorted map from keys to values). +// +// Multiple threads can invoke const methods on a TableBuilder without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same TableBuilder must use +// external synchronization. +class TableBuilder { + public: + // REQUIRES: Either Finish() or Abandon() has been called. + virtual ~TableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Add(const Slice& key, const Slice& value) = 0; + + // Return non-ok iff some error has been detected. + virtual Status status() const = 0; + + // Return non-ok iff some error happens during IO. + virtual IOStatus io_status() const = 0; + + // Finish building the table. 
+ // REQUIRES: Finish(), Abandon() have not been called + virtual Status Finish() = 0; + + // Indicate that the contents of this builder should be abandoned. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Abandon() = 0; + + // Number of calls to Add() so far. + virtual uint64_t NumEntries() const = 0; + + // Whether the output file is completely empty. It has neither entries + // or tombstones. + virtual bool IsEmpty() const { + return NumEntries() == 0 && GetTableProperties().num_range_deletions == 0; + } + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + virtual uint64_t FileSize() const = 0; + + // Estimated size of the file generated so far. This is used when + // FileSize() cannot estimate final SST size, e.g. parallel compression + // is enabled. + virtual uint64_t EstimatedFileSize() const { return FileSize(); } + + virtual uint64_t GetTailSize() const { return 0; } + + // If the user defined table properties collector suggest the file to + // be further compacted. + virtual bool NeedCompact() const { return false; } + + // Returns table properties + virtual TableProperties GetTableProperties() const = 0; + + // Return file checksum + virtual std::string GetFileChecksum() const = 0; + + // Return file checksum function name + virtual const char* GetFileChecksumFuncName() const = 0; + + // Set the sequence number to time mapping + virtual void SetSeqnoTimeTableProperties( + const std::string& /*encoded_seqno_to_time_mapping*/, + uint64_t /*oldest_ancestor_time*/){}; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_factory.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_factory.cc new file mode 100644 index 0000000000..29b6c89f81 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_factory.cc @@ -0,0 +1,52 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
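The TableBuilder interface above carries a strict lifecycle: keys must arrive in comparator order, and exactly one of Finish() or Abandon() must run before the builder is destroyed. A toy illustration of that contract, assuming plain lexicographic string keys (ToyBuilder is hypothetical, not a RocksDB type):

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Hypothetical in-memory builder obeying the TableBuilder lifecycle:
// sorted Add() calls, then exactly one of Finish() or Abandon().
class ToyBuilder {
 public:
  // Returns false instead of asserting when keys arrive out of order
  // or after the builder is done.
  bool Add(const std::string& key, const std::string& value) {
    if (done_) return false;
    if (!entries_.empty() && key <= entries_.back().first) return false;
    entries_.emplace_back(key, value);
    return true;
  }
  uint64_t NumEntries() const { return entries_.size(); }
  bool IsEmpty() const { return entries_.empty(); }
  void Finish() { done_ = true; }                     // keep the contents
  void Abandon() { done_ = true; entries_.clear(); }  // discard them
 private:
  std::vector<std::pair<std::string, std::string>> entries_;
  bool done_ = false;
};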
+
+#include <mutex>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/plain/plain_table_factory.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static void RegisterTableFactories(const std::string& /*arg*/) {
+  static std::once_flag loaded;
+  std::call_once(loaded, []() {
+    auto library = ObjectLibrary::Default();
+    library->AddFactory<TableFactory>(
+        TableFactory::kBlockBasedTableName(),
+        [](const std::string& /*uri*/, std::unique_ptr<TableFactory>* guard,
+           std::string* /* errmsg */) {
+          guard->reset(new BlockBasedTableFactory());
+          return guard->get();
+        });
+    library->AddFactory<TableFactory>(
+        TableFactory::kPlainTableName(),
+        [](const std::string& /*uri*/, std::unique_ptr<TableFactory>* guard,
+           std::string* /* errmsg */) {
+          guard->reset(new PlainTableFactory());
+          return guard->get();
+        });
+    library->AddFactory<TableFactory>(
+        TableFactory::kCuckooTableName(),
+        [](const std::string& /*uri*/, std::unique_ptr<TableFactory>* guard,
+           std::string* /* errmsg */) {
+          guard->reset(new CuckooTableFactory());
+          return guard->get();
+        });
+  });
+}
+
+Status TableFactory::CreateFromString(const ConfigOptions& config_options,
+                                      const std::string& value,
+                                      std::shared_ptr<TableFactory>* factory) {
+  RegisterTableFactories("");
+  return LoadSharedObject<TableFactory>(config_options, value, factory);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_properties.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_properties.cc
new file mode 100644
index 0000000000..819d4da2ad
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_properties.cc
@@ -0,0 +1,351 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
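A note on the registration scheme in table_factory.cc above: std::call_once guarantees the built-in factories are registered exactly once even when several threads race into CreateFromString on first use. The same pattern in miniature, with hypothetical Widget/Registry names standing in for the object-registry machinery:

#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <string>

struct Widget { virtual ~Widget() = default; };
struct SquareWidget : Widget {};

using WidgetFactory = std::function<std::unique_ptr<Widget>()>;
static std::map<std::string, WidgetFactory>& Registry() {
  static std::map<std::string, WidgetFactory> r;
  return r;
}

// Same shape as RegisterTableFactories: registration runs exactly once,
// no matter how many lookups race to call it first.
static void RegisterWidgets() {
  static std::once_flag loaded;
  std::call_once(loaded, [] {
    Registry()["square"] = [] { return std::make_unique<SquareWidget>(); };
  });
}

// CreateWidget("square") yields a SquareWidget; unknown names yield nullptr.
std::unique_ptr<Widget> CreateWidget(const std::string& name) {
  RegisterWidgets();
  auto it = Registry().find(name);
  return it == Registry().end() ? nullptr : it->second();
}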
+ +#include "rocksdb/table_properties.h" + +#include "db/seqno_to_time_mapping.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/unique_id.h" +#include "table/table_properties_internal.h" +#include "table/unique_id_impl.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +const uint32_t TablePropertiesCollectorFactory::Context::kUnknownColumnFamily = + std::numeric_limits::max(); + +namespace { +void AppendProperty(std::string& props, const std::string& key, + const std::string& value, const std::string& prop_delim, + const std::string& kv_delim) { + props.append(key); + props.append(kv_delim); + props.append(value); + props.append(prop_delim); +} + +template +void AppendProperty(std::string& props, const std::string& key, + const TValue& value, const std::string& prop_delim, + const std::string& kv_delim) { + AppendProperty(props, key, std::to_string(value), prop_delim, kv_delim); +} +} // namespace + +std::string TableProperties::ToString(const std::string& prop_delim, + const std::string& kv_delim) const { + std::string result; + result.reserve(1024); + + // Basic Info + AppendProperty(result, "# data blocks", num_data_blocks, prop_delim, + kv_delim); + AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); + AppendProperty(result, "# deletions", num_deletions, prop_delim, kv_delim); + AppendProperty(result, "# merge operands", num_merge_operands, prop_delim, + kv_delim); + AppendProperty(result, "# range deletions", num_range_deletions, prop_delim, + kv_delim); + + AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); + AppendProperty(result, "raw average key size", + num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, + prop_delim, kv_delim); + AppendProperty(result, "raw value size", raw_value_size, prop_delim, + kv_delim); + AppendProperty(result, "raw average value size", + num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0, + prop_delim, kv_delim); + + AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); + char index_block_size_str[80]; + snprintf(index_block_size_str, sizeof(index_block_size_str), + "index block size (user-key? %d, delta-value? %d)", + static_cast(index_key_is_user_key), + static_cast(index_value_is_delta_encoded)); + AppendProperty(result, index_block_size_str, index_size, prop_delim, + kv_delim); + if (index_partitions != 0) { + AppendProperty(result, "# index partitions", index_partitions, prop_delim, + kv_delim); + AppendProperty(result, "top-level index size", top_level_index_size, + prop_delim, kv_delim); + } + AppendProperty(result, "filter block size", filter_size, prop_delim, + kv_delim); + AppendProperty(result, "# entries for filter", num_filter_entries, prop_delim, + kv_delim); + AppendProperty(result, "(estimated) table size", + data_size + index_size + filter_size, prop_delim, kv_delim); + + AppendProperty( + result, "filter policy name", + filter_policy_name.empty() ? std::string("N/A") : filter_policy_name, + prop_delim, kv_delim); + + AppendProperty(result, "prefix extractor name", + prefix_extractor_name.empty() ? std::string("N/A") + : prefix_extractor_name, + prop_delim, kv_delim); + + AppendProperty(result, "column family ID", + column_family_id == + ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory:: + Context::kUnknownColumnFamily + ? 
std::string("N/A") + : std::to_string(column_family_id), + prop_delim, kv_delim); + AppendProperty( + result, "column family name", + column_family_name.empty() ? std::string("N/A") : column_family_name, + prop_delim, kv_delim); + + AppendProperty(result, "comparator name", + comparator_name.empty() ? std::string("N/A") : comparator_name, + prop_delim, kv_delim); + + AppendProperty( + result, "merge operator name", + merge_operator_name.empty() ? std::string("N/A") : merge_operator_name, + prop_delim, kv_delim); + + AppendProperty(result, "property collectors names", + property_collectors_names.empty() ? std::string("N/A") + : property_collectors_names, + prop_delim, kv_delim); + + AppendProperty( + result, "SST file compression algo", + compression_name.empty() ? std::string("N/A") : compression_name, + prop_delim, kv_delim); + + AppendProperty( + result, "SST file compression options", + compression_options.empty() ? std::string("N/A") : compression_options, + prop_delim, kv_delim); + + AppendProperty(result, "creation time", creation_time, prop_delim, kv_delim); + + AppendProperty(result, "time stamp of earliest key", oldest_key_time, + prop_delim, kv_delim); + + AppendProperty(result, "file creation time", file_creation_time, prop_delim, + kv_delim); + + AppendProperty(result, "slow compression estimated data size", + slow_compression_estimated_data_size, prop_delim, kv_delim); + AppendProperty(result, "fast compression estimated data size", + fast_compression_estimated_data_size, prop_delim, kv_delim); + + // DB identity and DB session ID + AppendProperty(result, "DB identity", db_id, prop_delim, kv_delim); + AppendProperty(result, "DB session identity", db_session_id, prop_delim, + kv_delim); + AppendProperty(result, "DB host id", db_host_id, prop_delim, kv_delim); + AppendProperty(result, "original file number", orig_file_number, prop_delim, + kv_delim); + + // Unique ID, when available + std::string id; + Status s = GetUniqueIdFromTableProperties(*this, &id); + AppendProperty(result, "unique ID", + s.ok() ? UniqueIdToHumanString(id) : "N/A", prop_delim, + kv_delim); + + SeqnoToTimeMapping seq_time_mapping; + s = seq_time_mapping.Add(seqno_to_time_mapping); + AppendProperty(result, "Sequence number to time mapping", + s.ok() ? 
seq_time_mapping.ToHumanString() : "N/A", prop_delim, + kv_delim); + + return result; +} + +void TableProperties::Add(const TableProperties& tp) { + data_size += tp.data_size; + index_size += tp.index_size; + index_partitions += tp.index_partitions; + top_level_index_size += tp.top_level_index_size; + index_key_is_user_key += tp.index_key_is_user_key; + index_value_is_delta_encoded += tp.index_value_is_delta_encoded; + filter_size += tp.filter_size; + raw_key_size += tp.raw_key_size; + raw_value_size += tp.raw_value_size; + num_data_blocks += tp.num_data_blocks; + num_entries += tp.num_entries; + num_filter_entries += tp.num_filter_entries; + num_deletions += tp.num_deletions; + num_merge_operands += tp.num_merge_operands; + num_range_deletions += tp.num_range_deletions; + slow_compression_estimated_data_size += + tp.slow_compression_estimated_data_size; + fast_compression_estimated_data_size += + tp.fast_compression_estimated_data_size; +} + +std::map +TableProperties::GetAggregatablePropertiesAsMap() const { + std::map rv; + rv["data_size"] = data_size; + rv["index_size"] = index_size; + rv["index_partitions"] = index_partitions; + rv["top_level_index_size"] = top_level_index_size; + rv["filter_size"] = filter_size; + rv["raw_key_size"] = raw_key_size; + rv["raw_value_size"] = raw_value_size; + rv["num_data_blocks"] = num_data_blocks; + rv["num_entries"] = num_entries; + rv["num_filter_entries"] = num_filter_entries; + rv["num_deletions"] = num_deletions; + rv["num_merge_operands"] = num_merge_operands; + rv["num_range_deletions"] = num_range_deletions; + rv["slow_compression_estimated_data_size"] = + slow_compression_estimated_data_size; + rv["fast_compression_estimated_data_size"] = + fast_compression_estimated_data_size; + return rv; +} + +// WARNING: manual update to this function is needed +// whenever a new string property is added to TableProperties +// to reduce approximation error. 
+// +// TODO: eliminate the need of manually updating this function +// for new string properties +std::size_t TableProperties::ApproximateMemoryUsage() const { + std::size_t usage = 0; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + + std::size_t string_props_mem_usage = + db_id.size() + db_session_id.size() + db_host_id.size() + + column_family_name.size() + filter_policy_name.size() + + comparator_name.size() + merge_operator_name.size() + + prefix_extractor_name.size() + property_collectors_names.size() + + compression_name.size() + compression_options.size(); + usage += string_props_mem_usage; + + for (auto iter = user_collected_properties.begin(); + iter != user_collected_properties.end(); ++iter) { + usage += (iter->first.size() + iter->second.size()); + } + + return usage; +} + +const std::string TablePropertiesNames::kDbId = "rocksdb.creating.db.identity"; +const std::string TablePropertiesNames::kDbSessionId = + "rocksdb.creating.session.identity"; +const std::string TablePropertiesNames::kDbHostId = + "rocksdb.creating.host.identity"; +const std::string TablePropertiesNames::kOriginalFileNumber = + "rocksdb.original.file.number"; +const std::string TablePropertiesNames::kDataSize = "rocksdb.data.size"; +const std::string TablePropertiesNames::kIndexSize = "rocksdb.index.size"; +const std::string TablePropertiesNames::kIndexPartitions = + "rocksdb.index.partitions"; +const std::string TablePropertiesNames::kTopLevelIndexSize = + "rocksdb.top-level.index.size"; +const std::string TablePropertiesNames::kIndexKeyIsUserKey = + "rocksdb.index.key.is.user.key"; +const std::string TablePropertiesNames::kIndexValueIsDeltaEncoded = + "rocksdb.index.value.is.delta.encoded"; +const std::string TablePropertiesNames::kFilterSize = "rocksdb.filter.size"; +const std::string TablePropertiesNames::kRawKeySize = "rocksdb.raw.key.size"; +const std::string TablePropertiesNames::kRawValueSize = + "rocksdb.raw.value.size"; +const std::string TablePropertiesNames::kNumDataBlocks = + "rocksdb.num.data.blocks"; +const std::string TablePropertiesNames::kNumEntries = "rocksdb.num.entries"; +const std::string TablePropertiesNames::kNumFilterEntries = + "rocksdb.num.filter_entries"; +const std::string TablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; +const std::string TablePropertiesNames::kMergeOperands = + "rocksdb.merge.operands"; +const std::string TablePropertiesNames::kNumRangeDeletions = + "rocksdb.num.range-deletions"; +const std::string TablePropertiesNames::kFilterPolicy = "rocksdb.filter.policy"; +const std::string TablePropertiesNames::kFormatVersion = + "rocksdb.format.version"; +const std::string TablePropertiesNames::kFixedKeyLen = + "rocksdb.fixed.key.length"; +const std::string TablePropertiesNames::kColumnFamilyId = + "rocksdb.column.family.id"; +const std::string TablePropertiesNames::kColumnFamilyName = + "rocksdb.column.family.name"; +const std::string TablePropertiesNames::kComparator = "rocksdb.comparator"; +const std::string TablePropertiesNames::kMergeOperator = + "rocksdb.merge.operator"; +const std::string TablePropertiesNames::kPrefixExtractorName = + "rocksdb.prefix.extractor.name"; +const std::string TablePropertiesNames::kPropertyCollectors = + "rocksdb.property.collectors"; +const std::string TablePropertiesNames::kCompression = "rocksdb.compression"; +const std::string TablePropertiesNames::kCompressionOptions = + "rocksdb.compression_options"; +const 
std::string TablePropertiesNames::kCreationTime = "rocksdb.creation.time"; +const std::string TablePropertiesNames::kOldestKeyTime = + "rocksdb.oldest.key.time"; +const std::string TablePropertiesNames::kFileCreationTime = + "rocksdb.file.creation.time"; +const std::string TablePropertiesNames::kSlowCompressionEstimatedDataSize = + "rocksdb.sample_for_compression.slow.data.size"; +const std::string TablePropertiesNames::kFastCompressionEstimatedDataSize = + "rocksdb.sample_for_compression.fast.data.size"; +const std::string TablePropertiesNames::kSequenceNumberTimeMapping = + "rocksdb.seqno.time.map"; +const std::string TablePropertiesNames::kTailStartOffset = + "rocksdb.tail.start.offset"; + +#ifndef NDEBUG +// WARNING: TEST_SetRandomTableProperties assumes the following layout of +// TableProperties +// +// struct TableProperties { +// int64_t orig_file_number = 0; +// ... +// ... int64_t properties only +// ... +// std::string db_id; +// ... +// ... std::string properties only +// ... +// std::string compression_options; +// UserCollectedProperties user_collected_properties; +// ... +// ... Other extra properties: non-int64_t/non-std::string properties only +// ... +// } +void TEST_SetRandomTableProperties(TableProperties* props) { + Random* r = Random::GetTLSInstance(); + uint64_t* pu = &props->orig_file_number; + assert(static_cast(pu) == static_cast(props)); + std::string* ps = &props->db_id; + const uint64_t* const pu_end = reinterpret_cast(ps); + // Use the last string property's address instead of + // the first extra property (e.g `user_collected_properties`)'s address + // in the for-loop to avoid advancing pointer to pointing to + // potential non-zero padding bytes between these two addresses due to + // user_collected_properties's alignment requirement + const std::string* const ps_end_inclusive = &props->compression_options; + + for (; pu < pu_end; ++pu) { + *pu = r->Next64(); + } + assert(static_cast(pu) == static_cast(ps)); + for (; ps <= ps_end_inclusive; ++ps) { + *ps = r->RandomBinaryString(13); + } +} +#endif + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_properties_internal.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_properties_internal.h new file mode 100644 index 0000000000..5c2a0cb9aa --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_properties_internal.h @@ -0,0 +1,14 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { +#ifndef NDEBUG +void TEST_SetRandomTableProperties(TableProperties* props); +#endif +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_reader.h new file mode 100644 index 0000000000..53c5220529 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/table_reader.h @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
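table_properties.cc above renders every property through AppendProperty as key, kv_delim, value, prop_delim, which is how ToString produces its flat report. A compact sketch of that rendering (RenderProps is illustrative, not the vendored helper):

#include <cstdint>
#include <map>
#include <string>

// Mirrors the AppendProperty loop: "key<kv_delim>value<prop_delim>" per entry,
// e.g. kv_delim = ": " and prop_delim = "; ".
std::string RenderProps(const std::map<std::string, uint64_t>& props,
                        const std::string& prop_delim,
                        const std::string& kv_delim) {
  std::string out;
  out.reserve(128);
  for (const auto& kv : props) {
    out += kv.first + kv_delim + std::to_string(kv.second) + prop_delim;
  }
  return out;
}
// RenderProps({{"# entries", 10}, {"data size", 4096}}, "; ", ": ")
//   -> "# entries: 10; data size: 4096; "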
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include "db/range_tombstone_fragmenter.h" +#if USE_COROUTINES +#include "folly/experimental/coro/Coroutine.h" +#include "folly/experimental/coro/Task.h" +#endif +#include "rocksdb/slice_transform.h" +#include "rocksdb/table_reader_caller.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/multiget_context.h" + +namespace ROCKSDB_NAMESPACE { + +class Iterator; +struct ParsedInternalKey; +class Slice; +class Arena; +struct ReadOptions; +struct TableProperties; +class GetContext; +class MultiGetContext; + +// A Table (also referred to as SST) is a sorted map from strings to strings. +// Tables are immutable and persistent. A Table may be safely accessed from +// multiple threads without external synchronization. Table readers are used +// for reading various types of table formats supported by rocksdb including +// BlockBasedTable, PlainTable and CuckooTable format. +class TableReader { + public: + virtual ~TableReader() {} + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // + // read_options: Must outlive the returned iterator. + // arena: If not null, the arena needs to be used to allocate the Iterator. + // When destroying the iterator, the caller will not call "delete" + // but Iterator::~Iterator() directly. The destructor needs to destroy + // all the states but those allocated in arena. + // skip_filters: disables checking the bloom filters even if they exist. This + // option is effective only for block-based table format. + // compaction_readahead_size: its value will only be used if caller = + // kCompaction + virtual InternalIterator* NewIterator( + const ReadOptions& read_options, const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, TableReaderCaller caller, + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) = 0; + + virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( + const ReadOptions& /*read_options*/) { + return nullptr; + } + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + // TODO(peterd): Since this function is only used for approximate size + // from beginning of file, reduce code duplication by removing this + // function and letting ApproximateSize take optional start and end, so + // that absolute start and end can be specified and optimized without + // key / index work. + virtual uint64_t ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, + TableReaderCaller caller) = 0; + + // Given start and end keys, return the approximate data size in the file + // between the keys. The returned value is in terms of file bytes, and so + // includes effects like compression of the underlying data and applicable + // portions of metadata including filters and indexes. Nullptr for start or + // end (or both) indicates absolute start or end of the table. 
+ virtual uint64_t ApproximateSize(const ReadOptions& read_options, + const Slice& start, const Slice& end, + TableReaderCaller caller) = 0; + + struct Anchor { + Anchor(const Slice& _user_key, size_t _range_size) + : user_key(_user_key.ToStringView()), range_size(_range_size) {} + std::string user_key; + size_t range_size; + }; + + // Now try to return approximately 128 anchor keys. + // The last one tends to be the largest key. + virtual Status ApproximateKeyAnchors(const ReadOptions& /*read_options*/, + std::vector& /*anchors*/) { + return Status::NotSupported("ApproximateKeyAnchors() not supported."); + } + + // Set up the table for Compaction. Might change some parameters with + // posix_fadvise + virtual void SetupForCompaction() = 0; + + virtual std::shared_ptr GetTableProperties() const = 0; + + // Prepare work that can be done before the real Get() + virtual void Prepare(const Slice& /*target*/) {} + + // Report an approximation of how much memory has been used. + virtual size_t ApproximateMemoryUsage() const = 0; + + // Calls get_context->SaveValue() repeatedly, starting with + // the entry found after a call to Seek(key), until it returns false. + // May not make such a call if filter policy says that key is not present. + // + // get_context->MarkKeyMayExist needs to be called when it is configured to be + // memory only and the key is not found in the block cache. + // + // readOptions is the options for the read + // key is the key to search for + // skip_filters: disables checking the bloom filters even if they exist. This + // option is effective only for block-based table format. + virtual Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, + const SliceTransform* prefix_extractor, + bool skip_filters = false) = 0; + + // Use bloom filters in the table file, if present, to filter out keys. The + // mget_range will be updated to skip keys that get a negative result from + // the filter lookup. + virtual Status MultiGetFilter(const ReadOptions& /*readOptions*/, + const SliceTransform* /*prefix_extractor*/, + MultiGetContext::Range* /*mget_range*/) { + return Status::NotSupported(); + } + + virtual void MultiGet(const ReadOptions& readOptions, + const MultiGetContext::Range* mget_range, + const SliceTransform* prefix_extractor, + bool skip_filters = false) { + for (auto iter = mget_range->begin(); iter != mget_range->end(); ++iter) { + *iter->s = Get(readOptions, iter->ikey, iter->get_context, + prefix_extractor, skip_filters); + } + } + +#if USE_COROUTINES + virtual folly::coro::Task MultiGetCoroutine( + const ReadOptions& readOptions, const MultiGetContext::Range* mget_range, + const SliceTransform* prefix_extractor, bool skip_filters = false) { + MultiGet(readOptions, mget_range, prefix_extractor, skip_filters); + co_return; + } +#endif // USE_COROUTINES + + // Prefetch data corresponding to a give range of keys + // Typically this functionality is required for table implementations that + // persists the data on a non volatile storage medium like disk/SSD + virtual Status Prefetch(const ReadOptions& /* read_options */, + const Slice* begin = nullptr, + const Slice* end = nullptr) { + (void)begin; + (void)end; + // Default implementation is NOOP. 
+ // The child class should implement functionality when applicable + return Status::OK(); + } + + // convert db file to a human readable form + virtual Status DumpTable(WritableFile* /*out_file*/) { + return Status::NotSupported("DumpTable() not supported"); + } + + // check whether there is corruption in this db file + virtual Status VerifyChecksum(const ReadOptions& /*read_options*/, + TableReaderCaller /*caller*/) { + return Status::NotSupported("VerifyChecksum() not supported"); + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/two_level_iterator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/two_level_iterator.cc new file mode 100644 index 0000000000..4b6634e5cf --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/two_level_iterator.cc @@ -0,0 +1,220 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/two_level_iterator.h" + +#include "db/pinned_iterators_manager.h" +#include "memory/arena.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block_based/block.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +class TwoLevelIndexIterator : public InternalIteratorBase { + public: + explicit TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter); + + ~TwoLevelIndexIterator() override { + first_level_iter_.DeleteIter(false /* is_arena_mode */); + second_level_iter_.DeleteIter(false /* is_arena_mode */); + delete state_; + } + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; + void Next() override; + void Prev() override; + + bool Valid() const override { return second_level_iter_.Valid(); } + Slice key() const override { + assert(Valid()); + return second_level_iter_.key(); + } + Slice user_key() const override { + assert(Valid()); + return second_level_iter_.user_key(); + } + IndexValue value() const override { + assert(Valid()); + return second_level_iter_.value(); + } + Status status() const override { + if (!first_level_iter_.status().ok()) { + assert(second_level_iter_.iter() == nullptr); + return first_level_iter_.status(); + } else if (second_level_iter_.iter() != nullptr && + !second_level_iter_.status().ok()) { + return second_level_iter_.status(); + } else { + return status_; + } + } + void SetPinnedItersMgr( + PinnedIteratorsManager* /*pinned_iters_mgr*/) override {} + bool IsKeyPinned() const override { return false; } + bool IsValuePinned() const override { return false; } + + private: + void SaveError(const Status& s) { + if (status_.ok() && !s.ok()) status_ = s; + } + void SkipEmptyDataBlocksForward(); + void SkipEmptyDataBlocksBackward(); + void SetSecondLevelIterator(InternalIteratorBase* iter); + void InitDataBlock(); + + TwoLevelIteratorState* state_; + IteratorWrapperBase first_level_iter_; + IteratorWrapperBase second_level_iter_; // May be nullptr + Status status_; + // If second_level_iter is non-nullptr, then "data_block_handle_" 
holds the + // "index_value" passed to block_function_ to create the second_level_iter. + BlockHandle data_block_handle_; +}; + +TwoLevelIndexIterator::TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter) + : state_(state), first_level_iter_(first_level_iter) {} + +void TwoLevelIndexIterator::Seek(const Slice& target) { + first_level_iter_.Seek(target); + + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.Seek(target); + } + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIndexIterator::SeekForPrev(const Slice& target) { + first_level_iter_.Seek(target); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekForPrev(target); + } + if (!Valid()) { + if (!first_level_iter_.Valid() && first_level_iter_.status().ok()) { + first_level_iter_.SeekToLast(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekForPrev(target); + } + } + SkipEmptyDataBlocksBackward(); + } +} + +void TwoLevelIndexIterator::SeekToFirst() { + first_level_iter_.SeekToFirst(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToFirst(); + } + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIndexIterator::SeekToLast() { + first_level_iter_.SeekToLast(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToLast(); + } + SkipEmptyDataBlocksBackward(); +} + +void TwoLevelIndexIterator::Next() { + assert(Valid()); + second_level_iter_.Next(); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIndexIterator::Prev() { + assert(Valid()); + second_level_iter_.Prev(); + SkipEmptyDataBlocksBackward(); +} + +void TwoLevelIndexIterator::SkipEmptyDataBlocksForward() { + while (second_level_iter_.iter() == nullptr || + (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { + // Move to next block + if (!first_level_iter_.Valid()) { + SetSecondLevelIterator(nullptr); + return; + } + first_level_iter_.Next(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToFirst(); + } + } +} + +void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() { + while (second_level_iter_.iter() == nullptr || + (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { + // Move to next block + if (!first_level_iter_.Valid()) { + SetSecondLevelIterator(nullptr); + return; + } + first_level_iter_.Prev(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToLast(); + } + } +} + +void TwoLevelIndexIterator::SetSecondLevelIterator( + InternalIteratorBase* iter) { + InternalIteratorBase* old_iter = second_level_iter_.Set(iter); + delete old_iter; +} + +void TwoLevelIndexIterator::InitDataBlock() { + if (!first_level_iter_.Valid()) { + SetSecondLevelIterator(nullptr); + } else { + BlockHandle handle = first_level_iter_.value().handle; + if (second_level_iter_.iter() != nullptr && + !second_level_iter_.status().IsIncomplete() && + handle.offset() == data_block_handle_.offset()) { + // second_level_iter is already constructed with this iterator, so + // no need to change anything + } else { + InternalIteratorBase* iter = + state_->NewSecondaryIterator(handle); + data_block_handle_ = handle; + SetSecondLevelIterator(iter); + if (iter == nullptr) { + status_ = Status::Corruption("Missing block for partition " + + handle.ToString()); + } + } + } +} + +} // namespace + +InternalIteratorBase* NewTwoLevelIterator( + TwoLevelIteratorState* 
state, + InternalIteratorBase* first_level_iter) { + return new TwoLevelIndexIterator(state, first_level_iter); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/two_level_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/two_level_iterator.h new file mode 100644 index 0000000000..1fed934175 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/two_level_iterator.h @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "table/iterator_wrapper.h" + +namespace ROCKSDB_NAMESPACE { + +struct ReadOptions; +class InternalKeyComparator; + +// TwoLevelIteratorState expects iterators are not created using the arena +struct TwoLevelIteratorState { + TwoLevelIteratorState() {} + + virtual ~TwoLevelIteratorState() {} + virtual InternalIteratorBase* NewSecondaryIterator( + const BlockHandle& handle) = 0; +}; + +// Return a new two level iterator. A two-level iterator contains an +// index iterator whose values point to a sequence of blocks where +// each block is itself a sequence of key,value pairs. The returned +// two-level iterator yields the concatenation of all key/value pairs +// in the sequence of blocks. Takes ownership of "index_iter" and +// will delete it when no longer needed. +// +// Uses a supplied function to convert an index_iter value into +// an iterator over the contents of the corresponding block. +// Note: this function expects first_level_iter was not created using the arena +extern InternalIteratorBase* NewTwoLevelIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/unique_id.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/unique_id.cc new file mode 100644 index 0000000000..fcdd756504 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/unique_id.cc @@ -0,0 +1,223 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "table/unique_id_impl.h" +#include "util/coding_lean.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +std::string EncodeSessionId(uint64_t upper, uint64_t lower) { + std::string db_session_id(20U, '\0'); + char *buf = &db_session_id[0]; + // Preserving `lower` is slightly tricky. 36^12 is slightly more than + // 62 bits, so we use 12 chars plus the bottom two bits of one more. + // (A tiny fraction of 20 digit strings go unused.) 
+ uint64_t a = (upper << 2) | (lower >> 62); + uint64_t b = lower & (UINT64_MAX >> 2); + PutBaseChars<36>(&buf, 8, a, /*uppercase*/ true); + PutBaseChars<36>(&buf, 12, b, /*uppercase*/ true); + assert(buf == &db_session_id.back() + 1); + return db_session_id; +} + +Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, + uint64_t *lower) { + const size_t len = db_session_id.size(); + if (len == 0) { + return Status::NotSupported("Missing db_session_id"); + } + // Anything from 13 to 24 chars is reasonable. We don't have to limit to + // exactly 20. + if (len < 13) { + return Status::NotSupported("Too short db_session_id"); + } + if (len > 24) { + return Status::NotSupported("Too long db_session_id"); + } + uint64_t a = 0, b = 0; + const char *buf = &db_session_id.front(); + bool success = ParseBaseChars<36>(&buf, len - 12U, &a); + if (!success) { + return Status::NotSupported("Bad digit in db_session_id"); + } + success = ParseBaseChars<36>(&buf, 12U, &b); + if (!success) { + return Status::NotSupported("Bad digit in db_session_id"); + } + assert(buf == &db_session_id.back() + 1); + *upper = a >> 2; + *lower = (b & (UINT64_MAX >> 2)) | (a << 62); + return Status::OK(); +} + +Status GetSstInternalUniqueId(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, UniqueIdPtr out, + bool force) { + if (!force) { + if (db_id.empty()) { + return Status::NotSupported("Missing db_id"); + } + if (file_number == 0) { + return Status::NotSupported("Missing or bad file number"); + } + if (db_session_id.empty()) { + return Status::NotSupported("Missing db_session_id"); + } + } + uint64_t session_upper = 0; // Assignment to appease clang-analyze + uint64_t session_lower = 0; // Assignment to appease clang-analyze + { + Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower); + if (!s.ok()) { + if (!force) { + return s; + } else { + // A reasonable fallback in case malformed + Hash2x64(db_session_id.data(), db_session_id.size(), &session_upper, + &session_lower); + if (session_lower == 0) { + session_lower = session_upper | 1; + } + } + } + } + + // Exactly preserve session lower to ensure that session ids generated + // during the same process lifetime are guaranteed unique. + // DBImpl also guarantees (in recent versions) that this is not zero, + // so that we can guarantee unique ID is never all zeros. (Can't assert + // that here because of testing and old versions.) + // We put this first in anticipation of matching a small-ish set of cache + // key prefixes to cover entries relevant to any DB. + out.ptr[0] = session_lower; + + // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy) + // for very high global uniqueness entropy. + // (It is possible that many DBs descended from one common DB id are copied + // around and proliferate, in which case session id is critical, but it is + // more common for different DBs to have different DB ids.) + uint64_t db_a, db_b; + Hash2x64(db_id.data(), db_id.size(), session_upper, &db_a, &db_b); + + // Xor in file number for guaranteed uniqueness by file number for a given + // session and DB id. (Xor slightly better than + here. 
See + // https://github.com/pdillinger/unique_id ) + out.ptr[1] = db_a ^ file_number; + + // Extra (optional) global uniqueness + if (out.extended) { + out.ptr[2] = db_b; + } + + return Status::OK(); +} + +namespace { +// For InternalUniqueIdToExternal / ExternalUniqueIdToInternal we want all +// zeros in first 128 bits to map to itself, so that excluding zero in +// internal IDs (session_lower != 0 above) does the same for external IDs. +// These values are meaningless except for making that work. +constexpr uint64_t kHiOffsetForZero = 17391078804906429400U; +constexpr uint64_t kLoOffsetForZero = 6417269962128484497U; +} // namespace + +void InternalUniqueIdToExternal(UniqueIdPtr in_out) { + uint64_t hi, lo; + BijectiveHash2x64(in_out.ptr[1] + kHiOffsetForZero, + in_out.ptr[0] + kLoOffsetForZero, &hi, &lo); + in_out.ptr[0] = lo; + in_out.ptr[1] = hi; + if (in_out.extended) { + in_out.ptr[2] += lo + hi; + } +} + +void ExternalUniqueIdToInternal(UniqueIdPtr in_out) { + uint64_t lo = in_out.ptr[0]; + uint64_t hi = in_out.ptr[1]; + if (in_out.extended) { + in_out.ptr[2] -= lo + hi; + } + BijectiveUnhash2x64(hi, lo, &hi, &lo); + in_out.ptr[0] = lo - kLoOffsetForZero; + in_out.ptr[1] = hi - kHiOffsetForZero; +} + +std::string EncodeUniqueIdBytes(UniqueIdPtr in) { + std::string ret(in.extended ? 24U : 16U, '\0'); + EncodeFixed64(&ret[0], in.ptr[0]); + EncodeFixed64(&ret[8], in.ptr[1]); + if (in.extended) { + EncodeFixed64(&ret[16], in.ptr[2]); + } + return ret; +} + +Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out) { + if (unique_id.size() != (out.extended ? 24 : 16)) { + return Status::NotSupported("Not a valid unique_id"); + } + const char *buf = &unique_id.front(); + out.ptr[0] = DecodeFixed64(&buf[0]); + out.ptr[1] = DecodeFixed64(&buf[8]); + if (out.extended) { + out.ptr[2] = DecodeFixed64(&buf[16]); + } + return Status::OK(); +} + +template +Status GetUniqueIdFromTablePropertiesHelper(const TableProperties &props, + std::string *out_id) { + ID tmp{}; + Status s = GetSstInternalUniqueId(props.db_id, props.db_session_id, + props.orig_file_number, &tmp); + if (s.ok()) { + InternalUniqueIdToExternal(&tmp); + *out_id = EncodeUniqueIdBytes(&tmp); + } else { + out_id->clear(); + } + return s; +} + +Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props, + std::string *out_id) { + return GetUniqueIdFromTablePropertiesHelper(props, out_id); +} + +Status GetUniqueIdFromTableProperties(const TableProperties &props, + std::string *out_id) { + return GetUniqueIdFromTablePropertiesHelper(props, out_id); +} + +std::string UniqueIdToHumanString(const std::string &id) { + // Not so efficient, but that's OK + std::string str = Slice(id).ToString(/*hex*/ true); + for (size_t i = 16; i < str.size(); i += 17) { + str.insert(i, "-"); + } + return str; +} + +std::string InternalUniqueIdToHumanString(UniqueIdPtr in) { + std::string str = "{"; + str += std::to_string(in.ptr[0]); + str += ","; + str += std::to_string(in.ptr[1]); + if (in.extended) { + str += ","; + str += std::to_string(in.ptr[2]); + } + str += "}"; + return str; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/unique_id_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/unique_id_impl.h new file mode 100644 index 0000000000..6e3dc62c79 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/table/unique_id_impl.h @@ -0,0 +1,93 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
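On the session-id encoding in unique_id.cc above: the 128-bit (upper, lower) pair is repacked so that a carries upper plus the top two bits of lower, and b carries the remaining 62 bits; a is then written as 8 base-36 characters (its higher digits, i.e. part of upper, are dropped) and b as 12. A standalone sketch of the fixed-width base-36 step under those assumptions (ToBase36 is a hypothetical stand-in for PutBaseChars<36>):

#include <cassert>
#include <cstdint>
#include <string>

// Fixed-width base-36 rendering, most significant digit first; digits of v
// beyond `width` are silently dropped, mirroring how EncodeSessionId loses
// data from `upper` while preserving `lower` exactly.
std::string ToBase36(uint64_t v, int width) {
  static const char kDigits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  std::string s(static_cast<size_t>(width), '0');
  for (int i = width - 1; i >= 0 && v != 0; --i) {
    s[static_cast<size_t>(i)] = kDigits[v % 36];
    v /= 36;
  }
  return s;
}

int main() {
  uint64_t upper = 0x0123456789ABCDEFULL;
  uint64_t lower = 0xFEDCBA9876543210ULL;
  uint64_t a = (upper << 2) | (lower >> 62);  // top 2 bits of lower join a
  uint64_t b = lower & (UINT64_MAX >> 2);     // low 62 bits of lower
  std::string id = ToBase36(a, 8) + ToBase36(b, 12);
  assert(id.size() == 20);  // the 20-char DB session id shape
  return 0;
}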
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/unique_id.h" + +namespace ROCKSDB_NAMESPACE { + +// Standard size unique ID, good enough for almost all practical purposes +using UniqueId64x2 = std::array; + +// Value never used as an actual unique ID so can be used for "null" +constexpr UniqueId64x2 kNullUniqueId64x2 = {}; + +// Extended size unique ID, for extra certainty of uniqueness among SST files +// spanning many hosts over a long time (rarely if ever needed) +using UniqueId64x3 = std::array; + +// Value never used as an actual unique ID so can be used for "null" +constexpr UniqueId64x3 kNullUniqueId64x3 = {}; + +// Dynamic pointer wrapper for one of the two above +struct UniqueIdPtr { + uint64_t *ptr = nullptr; + bool extended = false; + + /*implicit*/ UniqueIdPtr(UniqueId64x2 *id) { + ptr = (*id).data(); + extended = false; + } + /*implicit*/ UniqueIdPtr(UniqueId64x3 *id) { + ptr = (*id).data(); + extended = true; + } +}; + +// Helper for GetUniqueIdFromTableProperties. This function can also be used +// for temporary ids for files without sufficient information in table +// properties. The internal unique id is more structured than the public +// unique id, so can be manipulated in more ways but very carefully. +// These must be long term stable to ensure GetUniqueIdFromTableProperties +// is long term stable. +Status GetSstInternalUniqueId(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, UniqueIdPtr out, + bool force = false); + +// Helper for GetUniqueIdFromTableProperties. External unique ids go through +// this extra hashing layer so that prefixes of the unique id have predictable +// "full" entropy. This hashing layer is 1-to-1 on the first 128 bits and on +// the full 192 bits. +// This transformation must be long term stable to ensure +// GetUniqueIdFromTableProperties is long term stable. +void InternalUniqueIdToExternal(UniqueIdPtr in_out); + +// Reverse of InternalUniqueIdToExternal mostly for testing purposes +// (demonstrably 1-to-1 on the first 128 bits and on the full 192 bits). +void ExternalUniqueIdToInternal(UniqueIdPtr in_out); + +// Convert numerical format to byte format for public API +std::string EncodeUniqueIdBytes(UniqueIdPtr in); + +// Reverse of EncodeUniqueIdBytes. +Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out); + +// For presenting internal IDs for debugging purposes. Visually distinct from +// UniqueIdToHumanString for external IDs. +std::string InternalUniqueIdToHumanString(UniqueIdPtr in); + +// Reformat a random value down to our "DB session id" format, +// which is intended to be compact and friendly for use in file names. +// `lower` is fully preserved and data is lost from `upper`. +// +// Detail: Encoded into 20 chars in base-36 ([0-9A-Z]), which is ~103 bits of +// entropy, which is enough to expect no collisions across a billion servers +// each opening DBs a million times (~2^50). Benefits vs. RFC-4122 unique id: +// * Save ~ dozen bytes per SST file +// * Shorter shared backup file names (some platforms have low limits) +// * Visually distinct from DB id format (usually RFC-4122) +std::string EncodeSessionId(uint64_t upper, uint64_t lower); + +// Reverse of EncodeSessionId. 
Returns NotSupported on error rather than +// Corruption because non-standard session IDs should be allowed with degraded +// functionality. +Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, + uint64_t *lower); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/mock_time_env.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/mock_time_env.h new file mode 100644 index 0000000000..7834368e03 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/mock_time_env.h @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/system_clock.h" + +namespace ROCKSDB_NAMESPACE { + +// NOTE: SpecialEnv offers most of this functionality, along with hooks +// for safe DB behavior under a mock time environment, so should be used +// instead of MockSystemClock for DB tests. +class MockSystemClock : public SystemClockWrapper { + public: + explicit MockSystemClock(const std::shared_ptr& base) + : SystemClockWrapper(base) {} + + static const char* kClassName() { return "MockSystemClock"; } + const char* Name() const override { return kClassName(); } + virtual Status GetCurrentTime(int64_t* time_sec) override { + assert(time_sec != nullptr); + *time_sec = static_cast(current_time_us_ / kMicrosInSecond); + return Status::OK(); + } + + virtual uint64_t NowSeconds() { return current_time_us_ / kMicrosInSecond; } + + virtual uint64_t NowMicros() override { return current_time_us_; } + + virtual uint64_t NowNanos() override { + assert(current_time_us_ <= std::numeric_limits::max() / 1000); + return current_time_us_ * 1000; + } + + uint64_t RealNowMicros() { return target_->NowMicros(); } + + void SetCurrentTime(uint64_t time_sec) { + assert(time_sec < std::numeric_limits::max() / kMicrosInSecond); + assert(time_sec * kMicrosInSecond >= current_time_us_); + current_time_us_ = time_sec * kMicrosInSecond; + } + + // It's a fake sleep that just updates the Env current time, which is similar + // to `NoSleepEnv.SleepForMicroseconds()` and + // `SpecialEnv.MockSleepForMicroseconds()`. + // It's also similar to `set_current_time()`, which takes an absolute time in + // seconds, vs. this one takes the sleep in microseconds. + // Note: Not thread safe. + void SleepForMicroseconds(int micros) override { + assert(micros >= 0); + assert(current_time_us_ + static_cast(micros) >= + current_time_us_); + current_time_us_.fetch_add(micros); + } + + void MockSleepForSeconds(int seconds) { + assert(seconds >= 0); + uint64_t micros = static_cast(seconds) * kMicrosInSecond; + assert(current_time_us_ + micros >= current_time_us_); + current_time_us_.fetch_add(micros); + } + + // TODO: this is a workaround for the different behavior on different platform + // for timedwait timeout. Ideally timedwait API should be moved to env. + // details: PR #7101. 
+ void InstallTimedWaitFixCallback(); + + private: + std::atomic current_time_us_{0}; + static constexpr uint64_t kMicrosInSecond = 1000U * 1000U; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/secondary_cache_test_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/secondary_cache_test_util.h new file mode 100644 index 0000000000..1cfb454b5a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/secondary_cache_test_util.h @@ -0,0 +1,119 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include + +#include "rocksdb/advanced_cache.h" + +namespace ROCKSDB_NAMESPACE { +namespace secondary_cache_test_util { + +struct TestCreateContext : public Cache::CreateContext { + void SetFailCreate(bool fail) { fail_create_ = fail; } + + bool fail_create_ = false; +}; + +class WithCacheType : public TestCreateContext { + public: + WithCacheType() {} + virtual ~WithCacheType() {} + + class TestItem { + public: + TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) { + memcpy(buf_.get(), buf, size); + } + ~TestItem() = default; + + char* Buf() { return buf_.get(); } + [[nodiscard]] size_t Size() const { return size_; } + std::string ToString() { return std::string(Buf(), Size()); } + + private: + std::unique_ptr buf_; + size_t size_; + }; + + static constexpr auto kLRU = "lru"; + static constexpr auto kHyperClock = "hyper_clock"; + + // For options other than capacity + size_t estimated_value_size_ = 1; + + virtual const std::string& Type() = 0; + + std::shared_ptr NewCache( + size_t capacity, + std::function modify_opts_fn = {}) { + const auto& type = Type(); + if (type == kLRU) { + LRUCacheOptions lru_opts; + lru_opts.capacity = capacity; + lru_opts.hash_seed = 0; // deterministic tests + if (modify_opts_fn) { + modify_opts_fn(lru_opts); + } + return lru_opts.MakeSharedCache(); + } + if (type == kHyperClock) { + HyperClockCacheOptions hc_opts{capacity, estimated_value_size_}; + hc_opts.hash_seed = 0; // deterministic tests + if (modify_opts_fn) { + modify_opts_fn(hc_opts); + } + return hc_opts.MakeSharedCache(); + } + assert(false); + return nullptr; + } + + std::shared_ptr NewCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) { + return NewCache(capacity, [=](ShardedCacheOptions& opts) { + opts.num_shard_bits = num_shard_bits; + opts.strict_capacity_limit = strict_capacity_limit; + opts.metadata_charge_policy = charge_policy; + }); + } + + std::shared_ptr NewCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + std::shared_ptr secondary_cache) { + return NewCache(capacity, [=](ShardedCacheOptions& opts) { + opts.num_shard_bits = num_shard_bits; + opts.strict_capacity_limit = strict_capacity_limit; + opts.metadata_charge_policy = kDontChargeCacheMetadata; + opts.secondary_cache = secondary_cache; + }); + } + + static const Cache::CacheItemHelper* GetHelper( + CacheEntryRole r = CacheEntryRole::kDataBlock, + bool secondary_compatible = true, bool fail = false); + + static const Cache::CacheItemHelper* GetHelperFail( + CacheEntryRole r = CacheEntryRole::kDataBlock); +}; + +class WithCacheTypeParam : public WithCacheType, + public 
testing::WithParamInterface { + const std::string& Type() override { return GetParam(); } +}; + +constexpr auto kLRU = WithCacheType::kLRU; +constexpr auto kHyperClock = WithCacheType::kHyperClock; + +inline auto GetTestingCacheTypes() { + return testing::Values(std::string(kLRU), std::string(kHyperClock)); +} + +} // namespace secondary_cache_test_util +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point.cc new file mode 100644 index 0000000000..bec02d4f67 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "test_util/sync_point.h" + +#include + +#include "test_util/sync_point_impl.h" + +std::vector rocksdb_kill_exclude_prefixes; + +#ifndef NDEBUG +namespace ROCKSDB_NAMESPACE { + +SyncPoint* SyncPoint::GetInstance() { + static SyncPoint sync_point; + return &sync_point; +} + +SyncPoint::SyncPoint() : impl_(new Data) {} + +SyncPoint::~SyncPoint() { delete impl_; } + +void SyncPoint::LoadDependency(const std::vector& dependencies) { + impl_->LoadDependency(dependencies); +} + +void SyncPoint::LoadDependencyAndMarkers( + const std::vector& dependencies, + const std::vector& markers) { + impl_->LoadDependencyAndMarkers(dependencies, markers); +} + +void SyncPoint::SetCallBack(const std::string& point, + const std::function& callback) { + impl_->SetCallBack(point, callback); +} + +void SyncPoint::ClearCallBack(const std::string& point) { + impl_->ClearCallBack(point); +} + +void SyncPoint::ClearAllCallBacks() { impl_->ClearAllCallBacks(); } + +void SyncPoint::EnableProcessing() { impl_->EnableProcessing(); } + +void SyncPoint::DisableProcessing() { impl_->DisableProcessing(); } + +void SyncPoint::ClearTrace() { impl_->ClearTrace(); } + +void SyncPoint::Process(const Slice& point, void* cb_arg) { + impl_->Process(point, cb_arg); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // NDEBUG + +namespace ROCKSDB_NAMESPACE { +void SetupSyncPointsToMockDirectIO() { +#if !defined(NDEBUG) && !defined(OS_MACOSX) && !defined(OS_WIN) && \ + !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile:O_DIRECT", [&](void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewRandomAccessFile:O_DIRECT", [&](void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewSequentialFile:O_DIRECT", [&](void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); +#endif +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point.h new file mode 100644 index 0000000000..65f1239ec4 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point.h @@ -0,0 +1,180 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" + +#ifdef NDEBUG +// empty in release build +#define TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, rocksdb_kill_odds_weight) +#define TEST_KILL_RANDOM(kill_point) +#else + +namespace ROCKSDB_NAMESPACE { + +// To avoid crashing always at some frequently executed codepaths (during +// kill random test), use this factor to reduce odds +#define REDUCE_ODDS 2 +#define REDUCE_ODDS2 4 + +// A class used to pass when a kill point is reached. +struct KillPoint { + public: + // This is only set from db_stress.cc and for testing only. + // If non-zero, kill at various points in source code with probability 1/this + int rocksdb_kill_odds = 0; + // If kill point has a prefix on this list, will skip killing. + std::vector rocksdb_kill_exclude_prefixes; + // Kill the process with probability 1/odds for testing. + void TestKillRandom(std::string kill_point, int odds, + const std::string& srcfile, int srcline); + + static KillPoint* GetInstance(); +}; + +#define TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, rocksdb_kill_odds_weight) \ + { \ + KillPoint::GetInstance()->TestKillRandom( \ + kill_point, rocksdb_kill_odds_weight, __FILE__, __LINE__); \ + } +#define TEST_KILL_RANDOM(kill_point) TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, 1) +} // namespace ROCKSDB_NAMESPACE + +#endif + +#ifdef NDEBUG +#define TEST_SYNC_POINT(x) +#define TEST_IDX_SYNC_POINT(x, index) +#define TEST_SYNC_POINT_CALLBACK(x, y) +#define INIT_SYNC_POINT_SINGLETONS() +#else + +namespace ROCKSDB_NAMESPACE { + +// This class provides facility to reproduce race conditions deterministically +// in unit tests. +// Developer could specify sync points in the codebase via TEST_SYNC_POINT. +// Each sync point represents a position in the execution stream of a thread. +// In the unit test, 'Happens After' relationship among sync points could be +// setup via SyncPoint::LoadDependency, to reproduce a desired interleave of +// threads execution. +// Refer to (DBTest,TransactionLogIteratorRace), for an example use case. + +class SyncPoint { + public: + static SyncPoint* GetInstance(); + + SyncPoint(const SyncPoint&) = delete; + SyncPoint& operator=(const SyncPoint&) = delete; + ~SyncPoint(); + + struct SyncPointPair { + std::string predecessor; + std::string successor; + }; + + // call once at the beginning of a test to setup the dependency between + // sync points + void LoadDependency(const std::vector& dependencies); + + // call once at the beginning of a test to setup the dependency between + // sync points and setup markers indicating the successor is only enabled + // when it is processed on the same thread as the predecessor. + // When adding a marker, it implicitly adds a dependency for the marker pair. + void LoadDependencyAndMarkers(const std::vector& dependencies, + const std::vector& markers); + + // The argument to the callback is passed through from + // TEST_SYNC_POINT_CALLBACK(); nullptr if TEST_SYNC_POINT or + // TEST_IDX_SYNC_POINT was used. + void SetCallBack(const std::string& point, + const std::function& callback); + + // Clear callback function by point + void ClearCallBack(const std::string& point); + + // Clear all call back functions. 
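+ //
+ // Typical test wiring (a sketch; "Point:A" and "Point:B" are hypothetical
+ // sync point names that would be registered elsewhere via TEST_SYNC_POINT):
+ //
+ //   auto* sp = SyncPoint::GetInstance();
+ //   sp->LoadDependency({{"Point:A", "Point:B"}});  // B waits for A
+ //   sp->SetCallBack("Point:A", [](void* arg) { /* inspect arg */ });
+ //   sp->EnableProcessing();
+ //   ... run the threads under test ...
+ //   sp->DisableProcessing();
+ //   sp->ClearAllCallBacks();
+ //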
+ void ClearAllCallBacks(); + + // enable sync point processing (disabled on startup) + void EnableProcessing(); + + // disable sync point processing + void DisableProcessing(); + + // remove the execution trace of all sync points + void ClearTrace(); + + // triggered by TEST_SYNC_POINT, blocking execution until all predecessors + // are executed. + // And/or call registered callback function, with argument `cb_arg` + void Process(const Slice& point, void* cb_arg = nullptr); + + // template gets length of const string at compile time, + // avoiding strlen() at runtime + template + void Process(const char (&point)[kLen], void* cb_arg = nullptr) { + static_assert(kLen > 0, "Must not be empty"); + assert(point[kLen - 1] == '\0'); + Process(Slice(point, kLen - 1), cb_arg); + } + + // TODO: it might be useful to provide a function that blocks until all + // sync points are cleared. + + // We want this to be public so we can + // subclass the implementation + struct Data; + + private: + // Singleton + SyncPoint(); + Data* impl_; +}; + +// Sets up sync points to mock direct IO instead of actually issuing direct IO +// to the file system. +void SetupSyncPointsToMockDirectIO(); +} // namespace ROCKSDB_NAMESPACE + +// Use TEST_SYNC_POINT to specify sync points inside code base. +// Sync points can have happens-after dependency on other sync points, +// configured at runtime via SyncPoint::LoadDependency. This could be +// utilized to re-produce race conditions between threads. +// See TransactionLogIteratorRace in db_test.cc for an example use case. +// TEST_SYNC_POINT is no op in release build. +#define TEST_SYNC_POINT(x) \ + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->Process(x) +#define TEST_IDX_SYNC_POINT(x, index) \ + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->Process(x + \ + std::to_string(index)) +#define TEST_SYNC_POINT_CALLBACK(x, y) \ + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->Process(x, y) +#define INIT_SYNC_POINT_SINGLETONS() \ + (void)ROCKSDB_NAMESPACE::SyncPoint::GetInstance(); +#endif // NDEBUG + +// Callback sync point for any read IO errors that should be ignored by +// the fault injection framework +// Disable in release mode +#ifdef NDEBUG +#define IGNORE_STATUS_IF_ERROR(_status_) +#else +#define IGNORE_STATUS_IF_ERROR(_status_) \ + { \ + if (!_status_.ok()) { \ + TEST_SYNC_POINT("FaultInjectionIgnoreError"); \ + } \ + } +#endif // NDEBUG diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point_impl.cc new file mode 100644 index 0000000000..2a4bd3ccdf --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point_impl.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
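+//
+// Implementation of the KillPoint and SyncPoint::Data facilities declared
+// in test_util/sync_point.h and test_util/sync_point_impl.h.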
+ +#include "test_util/sync_point_impl.h" + +#ifndef NDEBUG +namespace ROCKSDB_NAMESPACE { +KillPoint* KillPoint::GetInstance() { + static KillPoint kp; + return &kp; +} + +void KillPoint::TestKillRandom(std::string kill_point, int odds_weight, + const std::string& srcfile, int srcline) { + if (rocksdb_kill_odds <= 0) { + return; + } + int odds = rocksdb_kill_odds * odds_weight; + for (auto& p : rocksdb_kill_exclude_prefixes) { + if (kill_point.substr(0, p.length()) == p) { + return; + } + } + + assert(odds > 0); + if (odds % 7 == 0) { + // class Random uses multiplier 16807, which is 7^5. If odds are + // multiplier of 7, there might be limited values generated. + odds++; + } + auto* r = Random::GetTLSInstance(); + bool crash = r->OneIn(odds); + if (crash) { + port::Crash(srcfile, srcline); + } +} + +void SyncPoint::Data::LoadDependency( + const std::vector& dependencies) { + std::lock_guard lock(mutex_); + successors_.clear(); + predecessors_.clear(); + cleared_points_.clear(); + for (const auto& dependency : dependencies) { + successors_[dependency.predecessor].push_back(dependency.successor); + predecessors_[dependency.successor].push_back(dependency.predecessor); + point_filter_.Add(dependency.successor); + point_filter_.Add(dependency.predecessor); + } + cv_.notify_all(); +} + +void SyncPoint::Data::LoadDependencyAndMarkers( + const std::vector& dependencies, + const std::vector& markers) { + std::lock_guard lock(mutex_); + successors_.clear(); + predecessors_.clear(); + cleared_points_.clear(); + markers_.clear(); + marked_thread_id_.clear(); + for (const auto& dependency : dependencies) { + successors_[dependency.predecessor].push_back(dependency.successor); + predecessors_[dependency.successor].push_back(dependency.predecessor); + point_filter_.Add(dependency.successor); + point_filter_.Add(dependency.predecessor); + } + for (const auto& marker : markers) { + successors_[marker.predecessor].push_back(marker.successor); + predecessors_[marker.successor].push_back(marker.predecessor); + markers_[marker.predecessor].push_back(marker.successor); + point_filter_.Add(marker.predecessor); + point_filter_.Add(marker.successor); + } + cv_.notify_all(); +} + +bool SyncPoint::Data::PredecessorsAllCleared(const std::string& point) { + for (const auto& pred : predecessors_[point]) { + if (cleared_points_.count(pred) == 0) { + return false; + } + } + return true; +} + +void SyncPoint::Data::ClearCallBack(const std::string& point) { + std::unique_lock lock(mutex_); + while (num_callbacks_running_ > 0) { + cv_.wait(lock); + } + callbacks_.erase(point); +} + +void SyncPoint::Data::ClearAllCallBacks() { + std::unique_lock lock(mutex_); + while (num_callbacks_running_ > 0) { + cv_.wait(lock); + } + callbacks_.clear(); +} + +void SyncPoint::Data::Process(const Slice& point, void* cb_arg) { + if (!enabled_) { + return; + } + + // Use a filter to prevent mutex lock if possible. + if (!point_filter_.MayContain(point)) { + return; + } + + // Must convert to std::string for remaining work. Take + // heap hit. 
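+ // From here the processing order is: (1) register this point's marker
+ // successors for the current thread, (2) return early if a marker pins
+ // this point to a different thread, (3) block until every predecessor
+ // has cleared (re-checking the marker after each wakeup), (4) run any
+ // registered callback with the mutex released, and (5) record this
+ // point as cleared and wake all waiters.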
+ std::string point_string(point.ToString()); + std::unique_lock lock(mutex_); + auto thread_id = std::this_thread::get_id(); + + auto marker_iter = markers_.find(point_string); + if (marker_iter != markers_.end()) { + for (auto& marked_point : marker_iter->second) { + marked_thread_id_.emplace(marked_point, thread_id); + point_filter_.Add(marked_point); + } + } + + if (DisabledByMarker(point_string, thread_id)) { + return; + } + + while (!PredecessorsAllCleared(point_string)) { + cv_.wait(lock); + if (DisabledByMarker(point_string, thread_id)) { + return; + } + } + + auto callback_pair = callbacks_.find(point_string); + if (callback_pair != callbacks_.end()) { + num_callbacks_running_++; + mutex_.unlock(); + callback_pair->second(cb_arg); + mutex_.lock(); + num_callbacks_running_--; + } + cleared_points_.insert(point_string); + cv_.notify_all(); +} +} // namespace ROCKSDB_NAMESPACE +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point_impl.h new file mode 100644 index 0000000000..64cc0445e0 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/sync_point_impl.h @@ -0,0 +1,96 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memory/concurrent_arena.h" +#include "port/port.h" +#include "test_util/sync_point.h" +#include "util/dynamic_bloom.h" +#include "util/random.h" + +#pragma once + +#ifndef NDEBUG +namespace ROCKSDB_NAMESPACE { +// A hacky allocator for single use. +// Arena depends on SyncPoint and create circular dependency. +class SingleAllocator : public Allocator { + public: + char* Allocate(size_t) override { + assert(false); + return nullptr; + } + char* AllocateAligned(size_t bytes, size_t, Logger*) override { + buf_.resize(bytes); + return const_cast(buf_.data()); + } + size_t BlockSize() const override { + assert(false); + return 0; + } + + private: + std::string buf_; +}; + +struct SyncPoint::Data { + Data() : point_filter_(&alloc_, /*total_bits=*/8192), enabled_(false) {} + // Enable proper deletion by subclasses + virtual ~Data() {} + // successor/predecessor map loaded from LoadDependency + std::unordered_map> successors_; + std::unordered_map> predecessors_; + std::unordered_map> callbacks_; + std::unordered_map> markers_; + std::unordered_map marked_thread_id_; + + std::mutex mutex_; + std::condition_variable cv_; + // sync points that have been passed through + std::unordered_set cleared_points_; + SingleAllocator alloc_; + // A filter before holding mutex to speed up process. 
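+ // A Bloom-filter miss is definitive (the point was never added), so
+ // Process() can return without locking; a hit may be a false positive
+ // and still requires the locked checks in Process().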
+ DynamicBloom point_filter_; + std::atomic enabled_; + int num_callbacks_running_ = 0; + + void LoadDependency(const std::vector& dependencies); + void LoadDependencyAndMarkers(const std::vector& dependencies, + const std::vector& markers); + bool PredecessorsAllCleared(const std::string& point); + void SetCallBack(const std::string& point, + const std::function& callback) { + std::lock_guard lock(mutex_); + callbacks_[point] = callback; + point_filter_.Add(point); + } + + void ClearCallBack(const std::string& point); + void ClearAllCallBacks(); + void EnableProcessing() { enabled_ = true; } + void DisableProcessing() { enabled_ = false; } + void ClearTrace() { + std::lock_guard lock(mutex_); + cleared_points_.clear(); + } + bool DisabledByMarker(const std::string& point, std::thread::id thread_id) { + auto marked_point_iter = marked_thread_id_.find(point); + return marked_point_iter != marked_thread_id_.end() && + thread_id != marked_point_iter->second; + } + void Process(const Slice& point, void* cb_arg); +}; +} // namespace ROCKSDB_NAMESPACE +#endif // NDEBUG diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/testharness.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/testharness.h new file mode 100644 index 0000000000..d8b6c9679c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/testharness.h @@ -0,0 +1,124 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#ifdef OS_AIX +#include "gtest/gtest.h" +#else +#include +#endif + +// A "skipped" test has a specific meaning in Facebook infrastructure: the +// test is in good shape and should be run, but something about the +// compilation or execution environment means the test cannot be run. +// Specifically, there is a hole in intended testing if any +// parameterization of a test (e.g. Foo/FooTest.Bar/42) is skipped for all +// tested build configurations/platforms/etc. +// +// If GTEST_SKIP is available, use it. Otherwise, define skip as success. +// +// The GTEST macros do not seem to print the message, even with -verbose, +// so these print to stderr. Note that these do not exit the test themselves; +// calling code should 'return' or similar from the test. +#ifdef GTEST_SKIP_ +#define ROCKSDB_GTEST_SKIP(m) \ + do { \ + fputs("SKIPPED: " m "\n", stderr); \ + GTEST_SKIP_(m); \ + } while (false) /* user ; */ +#else +#define ROCKSDB_GTEST_SKIP(m) \ + do { \ + fputs("SKIPPED: " m "\n", stderr); \ + GTEST_SUCCESS_("SKIPPED: " m); \ + } while (false) /* user ; */ +#endif + +// We add "bypass" as an alternative to ROCKSDB_GTEST_SKIP that is allowed to +// be a permanent condition, e.g. for intentionally omitting or disabling some +// parameterizations for some tests. (Use _DISABLED at the end of the test +// name to disable an entire test.) 
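+//
+// Illustrative use inside a test body (a sketch; the test and condition
+// names are hypothetical):
+//
+//   TEST_F(FooTest, Bar) {
+//     if (!ConfigSupportsFeature()) {
+//       ROCKSDB_GTEST_BYPASS("feature intentionally unsupported here");
+//       return;  // the macro does not exit the test by itself
+//     }
+//     ...
+//   }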
+#define ROCKSDB_GTEST_BYPASS(m) \ + do { \ + fputs("BYPASSED: " m "\n", stderr); \ + GTEST_SUCCESS_("BYPASSED: " m); \ + } while (false) /* user ; */ + +// Avoid "loss of precision" warnings when passing in 64-bit integers +#define EXPECT_NEAR2(val1, val2, abs_error) \ + EXPECT_NEAR(static_cast(val1), static_cast(val2), \ + static_cast(abs_error)) + +#include + +#include "port/stack_trace.h" +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { +namespace test { + +// Return the directory to use for temporary storage. +std::string TmpDir(Env* env = Env::Default()); + +// A path unique within the thread +std::string PerThreadDBPath(std::string name); +std::string PerThreadDBPath(Env* env, std::string name); +std::string PerThreadDBPath(std::string dir, std::string name); + +// Return a randomization seed for this run. Typically returns the +// same number on repeated invocations of this binary, but automated +// runs may be able to vary the seed. +int RandomSeed(); + +::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s); + +#define ASSERT_OK(s) \ + ASSERT_PRED_FORMAT1(ROCKSDB_NAMESPACE::test::AssertStatus, s) +#define ASSERT_NOK(s) ASSERT_FALSE((s).ok()) +#define EXPECT_OK(s) \ + EXPECT_PRED_FORMAT1(ROCKSDB_NAMESPACE::test::AssertStatus, s) +#define EXPECT_NOK(s) EXPECT_FALSE((s).ok()) + +// Useful for testing +// * No need to deal with Status like in Regex public API +// * No triggering lint reports on use of std::regex in tests +// * Available in LITE (unlike public API) +class TestRegex { + public: + // These throw on bad pattern + /*implicit*/ TestRegex(const std::string& pattern); + /*implicit*/ TestRegex(const char* pattern); + + // Checks that the whole of str is matched by this regex + bool Matches(const std::string& str) const; + + const std::string& GetPattern() const; + + private: + class Impl; + std::shared_ptr impl_; // shared_ptr for simple implementation + std::string pattern_; +}; + +::testing::AssertionResult AssertMatchesRegex(const char* str_expr, + const char* pattern_expr, + const std::string& str, + const TestRegex& pattern); + +#define ASSERT_MATCHES_REGEX(str, pattern) \ + ASSERT_PRED_FORMAT2(ROCKSDB_NAMESPACE::test::AssertMatchesRegex, str, pattern) +#define EXPECT_MATCHES_REGEX(str, pattern) \ + EXPECT_PRED_FORMAT2(ROCKSDB_NAMESPACE::test::AssertMatchesRegex, str, pattern) + +} // namespace test + +using test::TestRegex; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/testutil.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/testutil.h new file mode 100644 index 0000000000..686817c445 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/testutil.h @@ -0,0 +1,860 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include +#include +#include +#include + +#include "env/composite_env_wrapper.h" +#include "file/writable_file_writer.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/table.h" +#include "table/internal_iterator.h" +#include "util/mutexlock.h" + +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int argc, char** argv); +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +namespace ROCKSDB_NAMESPACE { +class FileSystem; +class MemTableRepFactory; +class ObjectLibrary; +class Random; +class SequentialFile; +class SequentialFileReader; + +namespace test { + +extern const uint32_t kDefaultFormatVersion; +extern const std::set kFooterFormatVersionsToTest; + +// Return a random key with the specified length that may contain interesting +// characters (e.g. \x00, \xff, etc.). +enum RandomKeyType : char { RANDOM, LARGEST, SMALLEST, MIDDLE }; +extern std::string RandomKey(Random* rnd, int len, + RandomKeyType type = RandomKeyType::RANDOM); + +// Store in *dst a string of length "len" that will compress to +// "N*compressed_fraction" bytes and return a Slice that references +// the generated data. +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst); + +#ifndef NDEBUG +// An internal comparator that just forward comparing results from the +// user comparator in it. Can be used to test entities that have no dependency +// on internal key structure but consumes InternalKeyComparator, like +// BlockBasedTable. +class PlainInternalKeyComparator : public InternalKeyComparator { + public: + explicit PlainInternalKeyComparator(const Comparator* c) + : InternalKeyComparator(c) {} + + virtual ~PlainInternalKeyComparator() {} + + virtual int Compare(const Slice& a, const Slice& b) const override { + return user_comparator()->Compare(a, b); + } +}; +#endif + +// A test comparator which compare two strings in this way: +// (1) first compare prefix of 8 bytes in alphabet order, +// (2) if two strings share the same prefix, sort the other part of the string +// in the reverse alphabet order. +// This helps simulate the case of compounded key of [entity][timestamp] and +// latest timestamp first. +class SimpleSuffixReverseComparator : public Comparator { + public: + SimpleSuffixReverseComparator() {} + static const char* kClassName() { return "SimpleSuffixReverseComparator"; } + virtual const char* Name() const override { return kClassName(); } + + virtual int Compare(const Slice& a, const Slice& b) const override { + Slice prefix_a = Slice(a.data(), 8); + Slice prefix_b = Slice(b.data(), 8); + int prefix_comp = prefix_a.compare(prefix_b); + if (prefix_comp != 0) { + return prefix_comp; + } else { + Slice suffix_a = Slice(a.data() + 8, a.size() - 8); + Slice suffix_b = Slice(b.data() + 8, b.size() - 8); + return -(suffix_a.compare(suffix_b)); + } + } + virtual void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + + virtual void FindShortSuccessor(std::string* /*key*/) const override {} +}; + +// Returns a user key comparator that can be used for comparing two uint64_t +// slices. Instead of comparing slices byte-wise, it compares all the 8 bytes +// at once. 
Assumes same endian-ness is used though the database's lifetime. +// Symantics of comparison would differ from Bytewise comparator in little +// endian machines. +extern const Comparator* Uint64Comparator(); + +// A wrapper api for getting the ComparatorWithU64Ts +extern const Comparator* BytewiseComparatorWithU64TsWrapper(); + +class StringSink : public FSWritableFile { + public: + std::string contents_; + + explicit StringSink(Slice* reader_contents = nullptr) + : FSWritableFile(), + contents_(""), + reader_contents_(reader_contents), + last_flush_(0) { + if (reader_contents_ != nullptr) { + *reader_contents_ = Slice(contents_.data(), 0); + } + } + + const std::string& contents() const { return contents_; } + + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_.resize(static_cast(size)); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + if (reader_contents_ != nullptr) { + assert(reader_contents_->size() <= last_flush_); + size_t offset = last_flush_ - reader_contents_->size(); + *reader_contents_ = + Slice(contents_.data() + offset, contents_.size() - offset); + last_flush_ = contents_.size(); + } + + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_.append(slice.data(), slice.size()); + return IOStatus::OK(); + } + void Drop(size_t bytes) { + if (reader_contents_ != nullptr) { + contents_.resize(contents_.size() - bytes); + *reader_contents_ = + Slice(reader_contents_->data(), reader_contents_->size() - bytes); + last_flush_ = contents_.size(); + } + } + + private: + Slice* reader_contents_; + size_t last_flush_; +}; + +// A wrapper around a StringSink to give it a RandomRWFile interface +class RandomRWStringSink : public FSRandomRWFile { + public: + explicit RandomRWStringSink(StringSink* ss) : ss_(ss) {} + + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + if (offset + data.size() > ss_->contents_.size()) { + ss_->contents_.resize(static_cast(offset) + data.size(), '\0'); + } + + char* pos = const_cast(ss_->contents_.data() + offset); + memcpy(pos, data.data(), data.size()); + return IOStatus::OK(); + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*opts*/, + Slice* result, char* /*scratch*/, + IODebugContext* /*dbg*/) const override { + *result = Slice(nullptr, 0); + if (offset < ss_->contents_.size()) { + size_t str_res_sz = + std::min(static_cast(ss_->contents_.size() - offset), n); + *result = Slice(ss_->contents_.data() + offset, str_res_sz); + } + return IOStatus::OK(); + } + + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + const std::string& contents() const { return ss_->contents(); } + + private: + StringSink* ss_; +}; + +// Like StringSink, this writes into a string. 
Unlink StringSink, it +// has some initial content and overwrites it, just like a recycled +// log file. +class OverwritingStringSink : public FSWritableFile { + public: + explicit OverwritingStringSink(Slice* reader_contents) + : FSWritableFile(), + contents_(""), + reader_contents_(reader_contents), + last_flush_(0) {} + + const std::string& contents() const { return contents_; } + + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_.resize(static_cast(size)); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + if (last_flush_ < contents_.size()) { + assert(reader_contents_->size() >= contents_.size()); + memcpy((char*)reader_contents_->data() + last_flush_, + contents_.data() + last_flush_, contents_.size() - last_flush_); + last_flush_ = contents_.size(); + } + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_.append(slice.data(), slice.size()); + return IOStatus::OK(); + } + void Drop(size_t bytes) { + contents_.resize(contents_.size() - bytes); + if (last_flush_ > contents_.size()) last_flush_ = contents_.size(); + } + + private: + std::string contents_; + Slice* reader_contents_; + size_t last_flush_; +}; + +class StringSource : public FSRandomAccessFile { + public: + explicit StringSource(const Slice& contents, uint64_t uniq_id = 0, + bool mmap = false) + : contents_(contents.data(), contents.size()), + uniq_id_(uniq_id), + mmap_(mmap), + total_reads_(0) {} + + virtual ~StringSource() {} + + uint64_t Size() const { return contents_.size(); } + + IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + // If we are using mmap_, it is equivalent to performing a prefetch + if (mmap_) { + return IOStatus::OK(); + } else { + return IOStatus::NotSupported("Prefetch not supported"); + } + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { + total_reads_++; + if (offset > contents_.size()) { + return IOStatus::InvalidArgument("invalid Read offset"); + } + if (offset + n > contents_.size()) { + n = contents_.size() - static_cast(offset); + } + if (!mmap_) { + memcpy(scratch, &contents_[static_cast(offset)], n); + *result = Slice(scratch, n); + } else { + *result = Slice(&contents_[static_cast(offset)], n); + } + return IOStatus::OK(); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + if (max_size < 20) { + return 0; + } + + char* rid = id; + rid = EncodeVarint64(rid, uniq_id_); + rid = EncodeVarint64(rid, 0); + return static_cast(rid - id); + } + + int total_reads() const { return total_reads_; } + + void set_total_reads(int tr) { total_reads_ = tr; } + + private: + std::string contents_; + uint64_t uniq_id_; + bool mmap_; + mutable int total_reads_; +}; + +class NullLogger : public Logger { + public: + using Logger::Logv; + virtual void Logv(const char* /*format*/, va_list /*ap*/) override {} + virtual size_t GetLogFileSize() const override { return 0; } +}; + +// Corrupts key by changing the type +extern void CorruptKeyType(InternalKey* ikey); + 
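+// Build a serialized internal key from a user key, sequence number, and
+// value type; when `corrupt` is set the type is mangled (via CorruptKeyType
+// above) so parsing-error paths can be exercised. The overload taking `ts`
+// additionally encodes a 64-bit timestamp into the user key.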
+extern std::string KeyStr(const std::string& user_key, + const SequenceNumber& seq, const ValueType& t, + bool corrupt = false); + +extern std::string KeyStr(uint64_t ts, const std::string& user_key, + const SequenceNumber& seq, const ValueType& t, + bool corrupt = false); + +class SleepingBackgroundTask { + public: + SleepingBackgroundTask() + : bg_cv_(&mutex_), + should_sleep_(true), + done_with_sleep_(false), + sleeping_(false) {} + + ~SleepingBackgroundTask() { + MutexLock l(&mutex_); + should_sleep_ = false; + while (sleeping_) { + assert(!should_sleep_); + bg_cv_.SignalAll(); + bg_cv_.Wait(); + } + } + + bool IsSleeping() { + MutexLock l(&mutex_); + return sleeping_; + } + void DoSleep() { + MutexLock l(&mutex_); + sleeping_ = true; + bg_cv_.SignalAll(); + while (should_sleep_) { + bg_cv_.Wait(); + } + sleeping_ = false; + done_with_sleep_ = true; + bg_cv_.SignalAll(); + } + void WaitUntilSleeping() { + MutexLock l(&mutex_); + while (!sleeping_ || !should_sleep_) { + bg_cv_.Wait(); + } + } + // Waits for the status to change to sleeping, + // otherwise times out. + // wait_time is in microseconds. + // Returns true when times out, false otherwise. + bool TimedWaitUntilSleeping(uint64_t wait_time); + + void WakeUp() { + MutexLock l(&mutex_); + should_sleep_ = false; + bg_cv_.SignalAll(); + } + void WaitUntilDone() { + MutexLock l(&mutex_); + while (!done_with_sleep_) { + bg_cv_.Wait(); + } + } + // Similar to TimedWaitUntilSleeping. + // Waits until the task is done. + bool TimedWaitUntilDone(uint64_t wait_time); + + bool WokenUp() { + MutexLock l(&mutex_); + return should_sleep_ == false; + } + + void Reset() { + MutexLock l(&mutex_); + should_sleep_ = true; + done_with_sleep_ = false; + } + + static void DoSleepTask(void* arg) { + reinterpret_cast(arg)->DoSleep(); + } + + private: + port::Mutex mutex_; + port::CondVar bg_cv_; // Signalled when background work finishes + bool should_sleep_; + bool done_with_sleep_; + bool sleeping_; +}; + +// Filters merge operands and values that are equal to `num`. 
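+// Returning true from Filter()/FilterMergeOperand() drops the entry, so a
+// value survives only when it is a fixed 8-byte encoding that differs from
+// `num`; values of any other size are dropped as well. The key of the last
+// merge operand seen is recorded for test assertions.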
+class FilterNumber : public CompactionFilter { + public: + explicit FilterNumber(uint64_t num) : num_(num) {} + + std::string last_merge_operand_key() { return last_merge_operand_key_; } + + bool Filter(int /*level*/, const ROCKSDB_NAMESPACE::Slice& /*key*/, + const ROCKSDB_NAMESPACE::Slice& value, std::string* /*new_value*/, + bool* /*value_changed*/) const override { + if (value.size() == sizeof(uint64_t)) { + return num_ == DecodeFixed64(value.data()); + } + return true; + } + + bool FilterMergeOperand( + int /*level*/, const ROCKSDB_NAMESPACE::Slice& key, + const ROCKSDB_NAMESPACE::Slice& value) const override { + last_merge_operand_key_ = key.ToString(); + if (value.size() == sizeof(uint64_t)) { + return num_ == DecodeFixed64(value.data()); + } + return true; + } + + const char* Name() const override { return "FilterBadMergeOperand"; } + + private: + mutable std::string last_merge_operand_key_; + uint64_t num_; +}; + +inline std::string EncodeInt(uint64_t x) { + std::string result; + PutFixed64(&result, x); + return result; +} + +class SeqStringSource : public FSSequentialFile { + public: + SeqStringSource(const std::string& data, std::atomic* read_count) + : data_(data), offset_(0), read_count_(read_count) {} + ~SeqStringSource() override {} + IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + std::string output; + if (offset_ < data_.size()) { + n = std::min(data_.size() - offset_, n); + memcpy(scratch, data_.data() + offset_, n); + offset_ += n; + *result = Slice(scratch, n); + } else { + return IOStatus::InvalidArgument( + "Attempt to read when it already reached eof."); + } + (*read_count_)++; + return IOStatus::OK(); + } + + IOStatus Skip(uint64_t n) override { + if (offset_ >= data_.size()) { + return IOStatus::InvalidArgument( + "Attempt to read when it already reached eof."); + } + // TODO(yhchiang): Currently doesn't handle the overflow case. 
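+ // (i.e. a very large `n` can wrap `offset_` around instead of clamping
+ // it to the end of `data_`)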
+ offset_ += static_cast(n); + return IOStatus::OK(); + } + + private: + std::string data_; + size_t offset_; + std::atomic* read_count_; +}; + +class StringFS : public FileSystemWrapper { + public: + class StringSink : public FSWritableFile { + public: + explicit StringSink(std::string* contents) + : FSWritableFile(), contents_(contents) {} + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_->resize(static_cast(size)); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_->append(slice.data(), slice.size()); + return IOStatus::OK(); + } + + private: + std::string* contents_; + }; + + explicit StringFS(const std::shared_ptr& t) + : FileSystemWrapper(t) {} + ~StringFS() override {} + + static const char* kClassName() { return "StringFS"; } + const char* Name() const override { return kClassName(); } + + const std::string& GetContent(const std::string& f) { return files_[f]; } + + const IOStatus WriteToNewFile(const std::string& file_name, + const std::string& content) { + std::unique_ptr r; + FileOptions file_opts; + IOOptions io_opts; + + auto s = NewWritableFile(file_name, file_opts, &r, nullptr); + if (s.ok()) { + s = r->Append(content, io_opts, nullptr); + } + if (s.ok()) { + s = r->Flush(io_opts, nullptr); + } + if (s.ok()) { + s = r->Close(io_opts, nullptr); + } + assert(!s.ok() || files_[file_name] == content); + return s; + } + + // The following text is boilerplate that forwards all methods to target() + IOStatus NewSequentialFile(const std::string& f, + const FileOptions& /*options*/, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return IOStatus::NotFound("The specified file does not exist", f); + } + r->reset(new SeqStringSource(iter->second, &num_seq_file_read_)); + return IOStatus::OK(); + } + + IOStatus NewRandomAccessFile(const std::string& /*f*/, + const FileOptions& /*options*/, + std::unique_ptr* /*r*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus NewWritableFile(const std::string& f, const FileOptions& /*options*/, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter != files_.end()) { + return IOStatus::IOError("The specified file already exists", f); + } + r->reset(new StringSink(&files_[f])); + return IOStatus::OK(); + } + IOStatus NewDirectory(const std::string& /*name*/, + const IOOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus FileExists(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + if (files_.find(f) == files_.end()) { + return IOStatus::NotFound(); + } + return IOStatus::OK(); + } + + IOStatus GetChildren(const std::string& /*dir*/, const IOOptions& /*options*/, + std::vector* /*r*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, + IODebugContext* 
/*dbg*/) override { + files_.erase(f); + return IOStatus::OK(); + } + + IOStatus CreateDir(const std::string& /*d*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus CreateDirIfMissing(const std::string& /*d*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus DeleteDir(const std::string& /*d*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, + uint64_t* s, IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return IOStatus::NotFound("The specified file does not exist:", f); + } + *s = iter->second.size(); + return IOStatus::OK(); + } + + IOStatus GetFileModificationTime(const std::string& /*fname*/, + const IOOptions& /*options*/, + uint64_t* /*file_mtime*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus RenameFile(const std::string& /*s*/, const std::string& /*t*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus LinkFile(const std::string& /*s*/, const std::string& /*t*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus LockFile(const std::string& /*f*/, const IOOptions& /*options*/, + FileLock** /*l*/, IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus UnlockFile(FileLock* /*l*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + std::atomic num_seq_file_read_; + + protected: + std::unordered_map files_; +}; + +// Randomly initialize the given DBOptions +void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); + +// Randomly initialize the given ColumnFamilyOptions +// Note that the caller is responsible for releasing non-null +// cf_opt->compaction_filter. +void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions&, Random* rnd); + +// A dummy merge operator which can change its name +class ChanglingMergeOperator : public MergeOperator { + public: + explicit ChanglingMergeOperator(const std::string& name) + : name_(name + "MergeOperator") {} + ~ChanglingMergeOperator() {} + + void SetName(const std::string& name) { name_ = name; } + + virtual bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* /*merge_out*/) const override { + return false; + } + virtual bool PartialMergeMulti(const Slice& /*key*/, + const std::deque& /*operand_list*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { + return false; + } + static const char* kClassName() { return "ChanglingMergeOperator"; } + const char* NickName() const override { return kNickName(); } + static const char* kNickName() { return "Changling"; } + bool IsInstanceOf(const std::string& id) const override { + if (id == kClassName()) { + return true; + } else { + return MergeOperator::IsInstanceOf(id); + } + } + + virtual const char* Name() const override { return name_.c_str(); } + + protected: + std::string name_; +}; + +// Returns a dummy merge operator with random name. 
+MergeOperator* RandomMergeOperator(Random* rnd); + +// A dummy compaction filter which can change its name +class ChanglingCompactionFilter : public CompactionFilter { + public: + explicit ChanglingCompactionFilter(const std::string& name) + : name_(name + "CompactionFilter") {} + ~ChanglingCompactionFilter() {} + + void SetName(const std::string& name) { name_ = name; } + + bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return false; + } + + static const char* kClassName() { return "ChanglingCompactionFilter"; } + const char* NickName() const override { return kNickName(); } + static const char* kNickName() { return "Changling"; } + + bool IsInstanceOf(const std::string& id) const override { + if (id == kClassName()) { + return true; + } else { + return CompactionFilter::IsInstanceOf(id); + } + } + + const char* Name() const override { return name_.c_str(); } + + private: + std::string name_; +}; + +// Returns a dummy compaction filter with a random name. +CompactionFilter* RandomCompactionFilter(Random* rnd); + +// A dummy compaction filter factory which can change its name +class ChanglingCompactionFilterFactory : public CompactionFilterFactory { + public: + explicit ChanglingCompactionFilterFactory(const std::string& name) + : name_(name + "CompactionFilterFactory") {} + ~ChanglingCompactionFilterFactory() {} + + void SetName(const std::string& name) { name_ = name; } + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr(); + } + + // Returns a name that identifies this compaction filter factory. + const char* Name() const override { return name_.c_str(); } + static const char* kClassName() { return "ChanglingCompactionFilterFactory"; } + const char* NickName() const override { return kNickName(); } + static const char* kNickName() { return "Changling"; } + + bool IsInstanceOf(const std::string& id) const override { + if (id == kClassName()) { + return true; + } else { + return CompactionFilterFactory::IsInstanceOf(id); + } + } + + protected: + std::string name_; +}; + +// The factory for the hacky skip list mem table that triggers flush after +// number of entries exceeds a threshold. +extern MemTableRepFactory* NewSpecialSkipListFactory(int num_entries_per_flush); + +CompressionType RandomCompressionType(Random* rnd); + +void RandomCompressionTypeVector(const size_t count, + std::vector* types, + Random* rnd); + +CompactionFilterFactory* RandomCompactionFilterFactory(Random* rnd); + +const SliceTransform* RandomSliceTransform(Random* rnd, int pre_defined = -1); + +TableFactory* RandomTableFactory(Random* rnd, int pre_defined = -1); + +std::string RandomName(Random* rnd, const size_t len); + +bool IsDirectIOSupported(Env* env, const std::string& dir); + +bool IsPrefetchSupported(const std::shared_ptr& fs, + const std::string& dir); + +// Return the number of lines where a given pattern was found in a file. 
+size_t GetLinesCount(const std::string& fname, const std::string& pattern); + +Status CorruptFile(Env* env, const std::string& fname, int offset, + int bytes_to_corrupt, bool verify_checksum = true); +Status TruncateFile(Env* env, const std::string& fname, uint64_t length); + +// Try and delete a directory if it exists +Status TryDeleteDir(Env* env, const std::string& dirname); + +// Delete a directory if it exists +void DeleteDir(Env* env, const std::string& dirname); + +// Creates an Env from the system environment by looking at the system +// environment variables. +Status CreateEnvFromSystem(const ConfigOptions& options, Env** result, + std::shared_ptr* guard); + +// Registers the testutil classes with the ObjectLibrary +int RegisterTestObjects(ObjectLibrary& library, const std::string& /*arg*/); + +// Register the testutil classes with the default ObjectRegistry/Library +void RegisterTestLibrary(const std::string& arg = ""); +} // namespace test +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/transaction_test_util.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/transaction_test_util.cc new file mode 100644 index 0000000000..11fca6d575 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/transaction_test_util.cc @@ -0,0 +1,400 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "test_util/transaction_test_util.h" + +#include +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/snapshot_impl.h" +#include "logging/logging.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +RandomTransactionInserter::RandomTransactionInserter( + Random64* rand, const WriteOptions& write_options, + const ReadOptions& read_options, uint64_t num_keys, uint16_t num_sets, + const uint64_t cmt_delay_ms, const uint64_t first_id) + : rand_(rand), + write_options_(write_options), + read_options_(read_options), + num_keys_(num_keys), + num_sets_(num_sets), + txn_id_(first_id), + cmt_delay_ms_(cmt_delay_ms) {} + +RandomTransactionInserter::~RandomTransactionInserter() { + if (txn_ != nullptr) { + delete txn_; + } + if (optimistic_txn_ != nullptr) { + delete optimistic_txn_; + } +} + +bool RandomTransactionInserter::TransactionDBInsert( + TransactionDB* db, const TransactionOptions& txn_options) { + txn_ = db->BeginTransaction(write_options_, txn_options, txn_); + + std::hash hasher; + char name[64]; + snprintf(name, 64, "txn%" ROCKSDB_PRIszt "-%" PRIu64, + hasher(std::this_thread::get_id()), txn_id_++); + assert(strlen(name) < 64 - 1); + assert(txn_->SetName(name).ok()); + + // Take a snapshot if set_snapshot was not set or with 50% change otherwise + bool take_snapshot = txn_->GetSnapshot() == nullptr || rand_->OneIn(2); + if (take_snapshot) { + txn_->SetSnapshot(); + read_options_.snapshot = txn_->GetSnapshot(); + } + auto res = DoInsert(db, txn_, false); + if (take_snapshot) { + read_options_.snapshot = nullptr; + } + return res; +} + +bool RandomTransactionInserter::OptimisticTransactionDBInsert( + OptimisticTransactionDB* db, + const 
OptimisticTransactionOptions& txn_options) { + optimistic_txn_ = + db->BeginTransaction(write_options_, txn_options, optimistic_txn_); + + return DoInsert(db, optimistic_txn_, true); +} + +bool RandomTransactionInserter::DBInsert(DB* db) { + return DoInsert(db, nullptr, false); +} + +Status RandomTransactionInserter::DBGet( + DB* db, Transaction* txn, ReadOptions& read_options, uint16_t set_i, + uint64_t ikey, bool get_for_update, uint64_t* int_value, + std::string* full_key, bool* unexpected_error) { + Status s; + // Five digits (since the largest uint16_t is 65535) plus the NUL + // end char. + char prefix_buf[6] = {0}; + // Pad prefix appropriately so we can iterate over each set + assert(set_i + 1 <= 9999); + snprintf(prefix_buf, sizeof(prefix_buf), "%.4u", set_i + 1); + // key format: [SET#][random#] + std::string skey = std::to_string(ikey); + Slice base_key(skey); + *full_key = std::string(prefix_buf) + base_key.ToString(); + Slice key(*full_key); + + std::string value; + if (txn != nullptr) { + if (get_for_update) { + s = txn->GetForUpdate(read_options, key, &value); + } else { + s = txn->Get(read_options, key, &value); + } + } else { + s = db->Get(read_options, key, &value); + } + + if (s.ok()) { + // Found key, parse its value + *int_value = std::stoull(value); + if (*int_value == 0 || *int_value == ULONG_MAX) { + *unexpected_error = true; + fprintf(stderr, "Get returned unexpected value: %s\n", value.c_str()); + s = Status::Corruption(); + } + } else if (s.IsNotFound()) { + // Have not yet written to this key, so assume its value is 0 + *int_value = 0; + s = Status::OK(); + } + return s; +} + +bool RandomTransactionInserter::DoInsert(DB* db, Transaction* txn, + bool is_optimistic) { + Status s; + WriteBatch batch; + + // pick a random number to use to increment a key in each set + uint64_t incr = (rand_->Next() % 100) + 1; + bool unexpected_error = false; + + std::vector set_vec(num_sets_); + std::iota(set_vec.begin(), set_vec.end(), static_cast(0)); + RandomShuffle(set_vec.begin(), set_vec.end()); + + // For each set, pick a key at random and increment it + for (uint16_t set_i : set_vec) { + uint64_t int_value = 0; + std::string full_key; + uint64_t rand_key = rand_->Next() % num_keys_; + const bool get_for_update = txn ? rand_->OneIn(2) : false; + s = DBGet(db, txn, read_options_, set_i, rand_key, get_for_update, + &int_value, &full_key, &unexpected_error); + Slice key(full_key); + if (!s.ok()) { + // Optimistic transactions should never return non-ok status here. + // Non-optimistic transactions may return write-coflict/timeout errors. + if (is_optimistic || !(s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) { + fprintf(stderr, "Get returned an unexpected error: %s\n", + s.ToString().c_str()); + unexpected_error = true; + } + break; + } + + if (s.ok()) { + // Increment key + std::string sum = std::to_string(int_value + incr); + if (txn != nullptr) { + if ((set_i % 4) != 0) { + s = txn->SingleDelete(key); + } else { + s = txn->Delete(key); + } + if (!get_for_update && (s.IsBusy() || s.IsTimedOut())) { + // If the initial get was not for update, then the key is not locked + // before put and put could fail due to concurrent writes. + break; + } else if (!s.ok()) { + // Since we did a GetForUpdate, SingleDelete should not fail. + fprintf(stderr, "SingleDelete returned an unexpected error: %s\n", + s.ToString().c_str()); + unexpected_error = true; + } + s = txn->Put(key, sum); + if (!s.ok()) { + // Since we did a GetForUpdate, Put should not fail. 
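+ // (by this point the key is locked, either by GetForUpdate or by the
+ // successful delete above, so Put cannot hit a write conflict)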
+ fprintf(stderr, "Put returned an unexpected error: %s\n", + s.ToString().c_str()); + unexpected_error = true; + } + } else { + batch.Put(key, sum); + } + bytes_inserted_ += key.size() + sum.size(); + } + if (txn != nullptr) { + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, + "Insert (%s) %s snap: %" PRIu64 " key:%s value: %" PRIu64 + "+%" PRIu64 "=%" PRIu64, + txn->GetName().c_str(), s.ToString().c_str(), + txn->GetSnapshot()->GetSequenceNumber(), full_key.c_str(), + int_value, incr, int_value + incr); + } + } + + if (s.ok()) { + if (txn != nullptr) { + bool with_prepare = !is_optimistic && !rand_->OneIn(10); + if (with_prepare) { + // Also try commit without prepare + s = txn->Prepare(); + if (!s.ok()) { + fprintf(stderr, "Prepare returned an unexpected error: %s\n", + s.ToString().c_str()); + } + assert(s.ok()); + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, + "Prepare of %" PRIu64 " %s (%s)", txn->GetId(), + s.ToString().c_str(), txn->GetName().c_str()); + if (rand_->OneIn(20)) { + // This currently only tests the mechanics of writing commit time + // write batch so the exact values would not matter. + s = txn_->GetCommitTimeWriteBatch()->Put("cat", "dog"); + assert(s.ok()); + } + db->GetDBOptions().env->SleepForMicroseconds( + static_cast(cmt_delay_ms_ * 1000)); + } + if (!rand_->OneIn(20)) { + s = txn->Commit(); + assert(!with_prepare || s.ok()); + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, + "Commit of %" PRIu64 " %s (%s)", txn->GetId(), + s.ToString().c_str(), txn->GetName().c_str()); + } else { + // Also try 5% rollback + s = txn->Rollback(); + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, + "Rollback %" PRIu64 " %s %s", txn->GetId(), + txn->GetName().c_str(), s.ToString().c_str()); + assert(s.ok()); + } + assert(is_optimistic || s.ok()); + + if (!s.ok()) { + if (is_optimistic) { + // Optimistic transactions can have write-conflict errors on commit. + // Any other error is unexpected. + if (!(s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) { + unexpected_error = true; + } + } else { + // Non-optimistic transactions should only fail due to expiration + // or write failures. For testing purproses, we do not expect any + // write failures. + if (!s.IsExpired()) { + unexpected_error = true; + } + } + + if (unexpected_error) { + fprintf(stderr, "Commit returned an unexpected error: %s\n", + s.ToString().c_str()); + } + } + } else { + s = db->Write(write_options_, &batch); + if (!s.ok()) { + unexpected_error = true; + fprintf(stderr, "Write returned an unexpected error: %s\n", + s.ToString().c_str()); + } + } + } else { + if (txn != nullptr) { + assert(txn->Rollback().ok()); + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, "Error %s for txn %s", + s.ToString().c_str(), txn->GetName().c_str()); + } + } + + if (s.ok()) { + success_count_++; + } else { + failure_count_++; + } + + last_status_ = s; + + // return success if we didn't get any unexpected errors + return !unexpected_error; +} + +// Verify that the sum of the keys in each set are equal +Status RandomTransactionInserter::Verify(DB* db, uint16_t num_sets, + uint64_t num_keys_per_set, + bool take_snapshot, Random64* rand, + uint64_t delay_ms) { + // delay_ms is the delay between taking a snapshot and doing the reads. It + // emulates reads from a long-running backup job. 
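+ // Every successful insert adds the same increment to one key in each of
+ // the `num_sets` key sets, so the per-set sums must all be equal at any
+ // consistent point in time; a mismatch below is reported as corruption.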
+
+// Verify that the sums of the keys in each set are equal
+Status RandomTransactionInserter::Verify(DB* db, uint16_t num_sets,
+                                         uint64_t num_keys_per_set,
+                                         bool take_snapshot, Random64* rand,
+                                         uint64_t delay_ms) {
+  // delay_ms is the delay between taking a snapshot and doing the reads. It
+  // emulates reads from a long-running backup job.
+  assert(delay_ms == 0 || take_snapshot);
+  uint64_t prev_total = 0;
+  uint32_t prev_i = 0;
+  bool prev_assigned = false;
+
+  ReadOptions roptions;
+  if (take_snapshot) {
+    roptions.snapshot = db->GetSnapshot();
+    db->GetDBOptions().env->SleepForMicroseconds(
+        static_cast<int>(delay_ms * 1000));
+  }
+
+  std::vector<uint16_t> set_vec(num_sets);
+  std::iota(set_vec.begin(), set_vec.end(), static_cast<uint16_t>(0));
+  RandomShuffle(set_vec.begin(), set_vec.end());
+
+  // For each set of keys with the same prefix, sum all the values
+  for (uint16_t set_i : set_vec) {
+    // Five digits (since the largest uint16_t is 65535) plus the NUL
+    // end char.
+    char prefix_buf[6];
+    assert(set_i + 1 <= 9999);
+    snprintf(prefix_buf, sizeof(prefix_buf), "%.4u", set_i + 1);
+    uint64_t total = 0;
+
+    // Use either point lookups or an iterator. Point lookups are slower, so
+    // we use them less often.
+    const bool use_point_lookup =
+        num_keys_per_set != 0 && rand && rand->OneIn(10);
+    if (use_point_lookup) {
+      ReadOptions read_options;
+      for (uint64_t k = 0; k < num_keys_per_set; k++) {
+        std::string dont_care;
+        uint64_t int_value = 0;
+        bool unexpected_error = false;
+        const bool FOR_UPDATE = false;
+        Status s = DBGet(db, nullptr, roptions, set_i, k, FOR_UPDATE,
+                         &int_value, &dont_care, &unexpected_error);
+        assert(s.ok());
+        assert(!unexpected_error);
+        total += int_value;
+      }
+    } else {  // use an iterator
+      Iterator* iter = db->NewIterator(roptions);
+      for (iter->Seek(Slice(prefix_buf, 4)); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        // stop when we reach a different prefix
+        if (key.ToString().compare(0, 4, prefix_buf) != 0) {
+          break;
+        }
+        Slice value = iter->value();
+        uint64_t int_value = std::stoull(value.ToString());
+        if (int_value == 0 || int_value == ULONG_MAX) {
+          fprintf(stderr, "Iter returned unexpected value: %s\n",
+                  value.ToString().c_str());
+          return Status::Corruption();
+        }
+        ROCKS_LOG_DEBUG(
+            db->GetDBOptions().info_log,
+            "VerifyRead at %" PRIu64 " (%" PRIu64 "): %.*s value: %" PRIu64,
+            roptions.snapshot ? roptions.snapshot->GetSequenceNumber() : 0ul,
+            roptions.snapshot
+                ? ((SnapshotImpl*)roptions.snapshot)->min_uncommitted_
+                : 0ul,
+            static_cast<int>(key.size()), key.data(), int_value);
+        total += int_value;
+      }
+      iter->status().PermitUncheckedError();
+      delete iter;
+    }
+
+    if (prev_assigned && total != prev_total) {
+      db->GetDBOptions().info_log->Flush();
+      fprintf(stdout,
+              "RandomTransactionVerify found inconsistent totals using "
+              "pointlookup? %d "
+              "Set[%" PRIu32 "]: %" PRIu64 ", Set[%" PRIu32 "]: %" PRIu64
+              " at snapshot %" PRIu64 "\n",
+              use_point_lookup, prev_i, prev_total, set_i, total,
+              roptions.snapshot ? roptions.snapshot->GetSequenceNumber() : 0ul);
+      fflush(stdout);
+      return Status::Corruption();
+    } else {
+      ROCKS_LOG_DEBUG(
+          db->GetDBOptions().info_log,
+          "RandomTransactionVerify pass pointlookup? %d total: %" PRIu64
+          " snap: %" PRIu64,
+          use_point_lookup, total,
+          roptions.snapshot ?
roptions.snapshot->GetSequenceNumber() : 0ul); + } + prev_total = total; + prev_i = set_i; + prev_assigned = true; + } + if (take_snapshot) { + db->ReleaseSnapshot(roptions.snapshot); + } + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/transaction_test_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/transaction_test_util.h new file mode 100644 index 0000000000..284b392503 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/test_util/transaction_test_util.h @@ -0,0 +1,147 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + + +#include "port/port.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction_db.h" + +namespace ROCKSDB_NAMESPACE { + +class DB; +class Random64; + +// Utility class for stress testing transactions. Can be used to write many +// transactions in parallel and then validate that the data written is logically +// consistent. This class assumes the input DB is initially empty. +// +// Each call to TransactionDBInsert()/OptimisticTransactionDBInsert() will +// increment the value of a key in #num_sets sets of keys. Regardless of +// whether the transaction succeeds, the total sum of values of keys in each +// set is an invariant that should remain equal. +// +// After calling TransactionDBInsert()/OptimisticTransactionDBInsert() many +// times, Verify() can be called to validate that the invariant holds. +// +// To test writing Transaction in parallel, multiple threads can create a +// RandomTransactionInserter with similar arguments using the same DB. +class RandomTransactionInserter { + public: + static bool RollbackDeletionTypeCallback(const Slice& key) { + // These are hard-coded atm. See how RandomTransactionInserter::DoInsert() + // determines whether to use SingleDelete or Delete for a key. + assert(key.size() >= 4); + const char* ptr = key.data(); + assert(ptr); + while (ptr && ptr < key.data() + 4 && *ptr == '0') { + ++ptr; + } + std::string prefix(ptr, 4 - (ptr - key.data())); + unsigned long set_i = std::stoul(prefix); + assert(set_i > 0); + assert(set_i <= 9999); + --set_i; + return ((set_i % 4) != 0); + } + + // num_keys is the number of keys in each set. + // num_sets is the number of sets of keys. + // cmt_delay_ms is the delay between prepare (if there is any) and commit + // first_id is the id of the first transaction + explicit RandomTransactionInserter( + Random64* rand, const WriteOptions& write_options = WriteOptions(), + const ReadOptions& read_options = ReadOptions(), uint64_t num_keys = 1000, + uint16_t num_sets = 3, const uint64_t cmt_delay_ms = 0, + const uint64_t first_id = 0); + + ~RandomTransactionInserter(); + + // Increment a key in each set using a Transaction on a TransactionDB. + // + // Returns true if the transaction succeeded OR if any error encountered was + // expected (eg a write-conflict). 
Error status may be obtained by calling + // GetLastStatus(); + bool TransactionDBInsert( + TransactionDB* db, + const TransactionOptions& txn_options = TransactionOptions()); + + // Increment a key in each set using a Transaction on an + // OptimisticTransactionDB + // + // Returns true if the transaction succeeded OR if any error encountered was + // expected (eg a write-conflict). Error status may be obtained by calling + // GetLastStatus(); + bool OptimisticTransactionDBInsert( + OptimisticTransactionDB* db, + const OptimisticTransactionOptions& txn_options = + OptimisticTransactionOptions()); + // Increment a key in each set without using a transaction. If this function + // is called in parallel, then Verify() may fail. + // + // Returns true if the write succeeds. + // Error status may be obtained by calling GetLastStatus(). + bool DBInsert(DB* db); + + // Get the ikey'th key from set set_i + static Status DBGet(DB* db, Transaction* txn, ReadOptions& read_options, + uint16_t set_i, uint64_t ikey, bool get_for_update, + uint64_t* int_value, std::string* full_key, + bool* unexpected_error); + + // Returns OK if Invariant is true. + static Status Verify(DB* db, uint16_t num_sets, uint64_t num_keys_per_set = 0, + bool take_snapshot = false, Random64* rand = nullptr, + uint64_t delay_ms = 0); + + // Returns the status of the previous Insert operation + Status GetLastStatus() { return last_status_; } + + // Returns the number of successfully written calls to + // TransactionDBInsert/OptimisticTransactionDBInsert/DBInsert + uint64_t GetSuccessCount() { return success_count_; } + + // Returns the number of calls to + // TransactionDBInsert/OptimisticTransactionDBInsert/DBInsert that did not + // write any data. + uint64_t GetFailureCount() { return failure_count_; } + + // Returns the sum of user keys/values Put() to the DB. + size_t GetBytesInserted() { return bytes_inserted_; } + + private: + // Input options + Random64* rand_; + const WriteOptions write_options_; + ReadOptions read_options_; + const uint64_t num_keys_; + const uint16_t num_sets_; + + // Number of successful insert batches performed + uint64_t success_count_ = 0; + + // Number of failed insert batches attempted + uint64_t failure_count_ = 0; + + size_t bytes_inserted_ = 0; + + // Status returned by most recent insert operation + Status last_status_; + + // optimization: re-use allocated transaction objects. + Transaction* txn_ = nullptr; + Transaction* optimistic_txn_ = nullptr; + + uint64_t txn_id_; + // The delay between ::Prepare and ::Commit + const uint64_t cmt_delay_ms_; + + bool DoInsert(DB* db, Transaction* txn, bool is_optimistic); +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/third-party/gcc/ppc-asm.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/third-party/gcc/ppc-asm.h new file mode 100644 index 0000000000..e0bce9c5ae --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/third-party/gcc/ppc-asm.h @@ -0,0 +1,390 @@ +/* PowerPC asm definitions for GNU C. + +Copyright (C) 2002-2020 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +/* Under winnt, 1) gas supports the following as names and 2) in particular + defining "toc" breaks the FUNC_START macro as ".toc" becomes ".2" */ + +#define r0 0 +#define sp 1 +#define toc 2 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + +#define cr0 0 +#define cr1 1 +#define cr2 2 +#define cr3 3 +#define cr4 4 +#define cr5 5 +#define cr6 6 +#define cr7 7 + +#define f0 0 +#define f1 1 +#define f2 2 +#define f3 3 +#define f4 4 +#define f5 5 +#define f6 6 +#define f7 7 +#define f8 8 +#define f9 9 +#define f10 10 +#define f11 11 +#define f12 12 +#define f13 13 +#define f14 14 +#define f15 15 +#define f16 16 +#define f17 17 +#define f18 18 +#define f19 19 +#define f20 20 +#define f21 21 +#define f22 22 +#define f23 23 +#define f24 24 +#define f25 25 +#define f26 26 +#define f27 27 +#define f28 28 +#define f29 29 +#define f30 30 +#define f31 31 + +#ifdef __VSX__ +#define f32 32 +#define f33 33 +#define f34 34 +#define f35 35 +#define f36 36 +#define f37 37 +#define f38 38 +#define f39 39 +#define f40 40 +#define f41 41 +#define f42 42 +#define f43 43 +#define f44 44 +#define f45 45 +#define f46 46 +#define f47 47 +#define f48 48 +#define f49 49 +#define f50 50 +#define f51 51 +#define f52 52 +#define f53 53 +#define f54 54 +#define f55 55 +#define f56 56 +#define f57 57 +#define f58 58 +#define f59 59 +#define f60 60 +#define f61 61 +#define f62 62 +#define f63 63 +#endif + +#ifdef __ALTIVEC__ +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 +#endif + +#ifdef __VSX__ +#define vs0 0 +#define vs1 1 +#define vs2 2 +#define vs3 3 +#define vs4 4 +#define vs5 5 +#define vs6 6 +#define vs7 7 +#define vs8 8 +#define vs9 9 +#define vs10 10 +#define vs11 11 +#define vs12 12 +#define vs13 13 +#define vs14 14 +#define vs15 15 +#define vs16 16 +#define vs17 17 +#define vs18 18 +#define vs19 19 +#define vs20 20 +#define vs21 21 +#define vs22 22 +#define vs23 23 +#define vs24 24 +#define vs25 25 +#define vs26 26 +#define vs27 27 +#define vs28 28 +#define vs29 29 +#define vs30 30 +#define vs31 31 +#define vs32 32 +#define vs33 33 +#define vs34 34 +#define vs35 35 +#define vs36 36 +#define vs37 37 +#define vs38 38 +#define vs39 39 +#define vs40 40 +#define vs41 41 +#define vs42 42 +#define vs43 43 +#define vs44 44 +#define 
vs45 45 +#define vs46 46 +#define vs47 47 +#define vs48 48 +#define vs49 49 +#define vs50 50 +#define vs51 51 +#define vs52 52 +#define vs53 53 +#define vs54 54 +#define vs55 55 +#define vs56 56 +#define vs57 57 +#define vs58 58 +#define vs59 59 +#define vs60 60 +#define vs61 61 +#define vs62 62 +#define vs63 63 +#endif + +/* + * Macros to glue together two tokens. + */ + +#ifdef __STDC__ +#define XGLUE(a,b) a##b +#else +#define XGLUE(a,b) a/**/b +#endif + +#define GLUE(a,b) XGLUE(a,b) + +/* + * Macros to begin and end a function written in assembler. If -mcall-aixdesc + * or -mcall-nt, create a function descriptor with the given name, and create + * the real function with one or two leading periods respectively. + */ + +#if defined(__powerpc64__) && _CALL_ELF == 2 + +/* Defining "toc" above breaks @toc in assembler code. */ +#undef toc + +#define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name) +#ifdef __PCREL__ +#define JUMP_TARGET(name) GLUE(FUNC_NAME(name),@notoc) +#define FUNC_START(name) \ + .type FUNC_NAME(name),@function; \ + .globl FUNC_NAME(name); \ +FUNC_NAME(name): \ + .localentry FUNC_NAME(name),1 +#else +#define JUMP_TARGET(name) FUNC_NAME(name) +#define FUNC_START(name) \ + .type FUNC_NAME(name),@function; \ + .globl FUNC_NAME(name); \ +FUNC_NAME(name): \ +0: addis 2,12,(.TOC.-0b)@ha; \ + addi 2,2,(.TOC.-0b)@l; \ + .localentry FUNC_NAME(name),.-FUNC_NAME(name) +#endif /* !__PCREL__ */ + +#define HIDDEN_FUNC(name) \ + FUNC_START(name) \ + .hidden FUNC_NAME(name); + +#define FUNC_END(name) \ + .size FUNC_NAME(name),.-FUNC_NAME(name) + +#elif defined (__powerpc64__) + +#define FUNC_NAME(name) GLUE(.,name) +#define JUMP_TARGET(name) FUNC_NAME(name) +#define FUNC_START(name) \ + .section ".opd","aw"; \ +name: \ + .quad GLUE(.,name); \ + .quad .TOC.@tocbase; \ + .quad 0; \ + .previous; \ + .type GLUE(.,name),@function; \ + .globl name; \ + .globl GLUE(.,name); \ +GLUE(.,name): + +#define HIDDEN_FUNC(name) \ + FUNC_START(name) \ + .hidden name; \ + .hidden GLUE(.,name); + +#define FUNC_END(name) \ +GLUE(.L,name): \ + .size GLUE(.,name),GLUE(.L,name)-GLUE(.,name) + +#elif defined(_CALL_AIXDESC) + +#ifdef _RELOCATABLE +#define DESC_SECTION ".got2" +#else +#define DESC_SECTION ".got1" +#endif + +#define FUNC_NAME(name) GLUE(.,name) +#define JUMP_TARGET(name) FUNC_NAME(name) +#define FUNC_START(name) \ + .section DESC_SECTION,"aw"; \ +name: \ + .long GLUE(.,name); \ + .long _GLOBAL_OFFSET_TABLE_; \ + .long 0; \ + .previous; \ + .type GLUE(.,name),@function; \ + .globl name; \ + .globl GLUE(.,name); \ +GLUE(.,name): + +#define HIDDEN_FUNC(name) \ + FUNC_START(name) \ + .hidden name; \ + .hidden GLUE(.,name); + +#define FUNC_END(name) \ +GLUE(.L,name): \ + .size GLUE(.,name),GLUE(.L,name)-GLUE(.,name) + +#else + +#define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name) +#if defined __PIC__ || defined __pic__ +#define JUMP_TARGET(name) FUNC_NAME(name@plt) +#else +#define JUMP_TARGET(name) FUNC_NAME(name) +#endif +#define FUNC_START(name) \ + .type FUNC_NAME(name),@function; \ + .globl FUNC_NAME(name); \ +FUNC_NAME(name): + +#define HIDDEN_FUNC(name) \ + FUNC_START(name) \ + .hidden FUNC_NAME(name); + +#define FUNC_END(name) \ +GLUE(.L,name): \ + .size FUNC_NAME(name),GLUE(.L,name)-FUNC_NAME(name) +#endif + +#ifdef IN_GCC +/* For HAVE_GAS_CFI_DIRECTIVE. 
*/ +#include "auto-host.h" + +#ifdef HAVE_GAS_CFI_DIRECTIVE +# define CFI_STARTPROC .cfi_startproc +# define CFI_ENDPROC .cfi_endproc +# define CFI_OFFSET(reg, off) .cfi_offset reg, off +# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg +# define CFI_RESTORE(reg) .cfi_restore reg +#else +# define CFI_STARTPROC +# define CFI_ENDPROC +# define CFI_OFFSET(reg, off) +# define CFI_DEF_CFA_REGISTER(reg) +# define CFI_RESTORE(reg) +#endif +#endif + +#if defined __linux__ && !defined __powerpc64__ + .section .note.GNU-stack + .previous +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/third-party/gtest-1.8.1/fused-src/gtest/gtest.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/third-party/gtest-1.8.1/fused-src/gtest/gtest.h new file mode 100644 index 0000000000..2d82d8e4d0 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/third-party/gtest-1.8.1/fused-src/gtest/gtest.h @@ -0,0 +1,22115 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the public API for Google Test. It should be +// included by any test program that uses Google Test. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. +// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! +// +// Acknowledgment: Google Test borrowed the idea of automatic test +// registration from Barthelemy Dagenais' (barthelemy@prologique.com) +// easyUnit framework. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_H_ + +#include +#include +#include +#include + +// Copyright 2005, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file declares functions and macros used internally by +// Google Test. They are subject to change without notice. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ + +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Low-level types and utilities for porting Google Test to various +// platforms. 
All macros ending with _ and symbols defined in an +// internal namespace are subject to change without notice. Code +// outside Google Test MUST NOT USE THEM DIRECTLY. Macros that don't +// end with _ are part of Google Test's public API and can be used by +// code outside Google Test. +// +// This file is fundamental to Google Test. All other Google Test source +// files are expected to #include this. Therefore, it cannot #include +// any other Google Test header. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ + +// Environment-describing macros +// ----------------------------- +// +// Google Test can be used in many different environments. Macros in +// this section tell Google Test what kind of environment it is being +// used in, such that Google Test can provide environment-specific +// features and implementations. +// +// Google Test tries to automatically detect the properties of its +// environment, so users usually don't need to worry about these +// macros. However, the automatic detection is not perfect. +// Sometimes it's necessary for a user to define some of the following +// macros in the build script to override Google Test's decisions. +// +// If the user doesn't define a macro in the list, Google Test will +// provide a default definition. After this header is #included, all +// macros in this list will be defined to either 1 or 0. +// +// Notes to maintainers: +// - Each macro here is a user-tweakable knob; do not grow the list +// lightly. +// - Use #if to key off these macros. Don't use #ifdef or "#if +// defined(...)", which will not work as these macros are ALWAYS +// defined. +// +// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2) +// is/isn't available. +// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions +// are enabled. +// GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string +// is/isn't available +// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::wstring +// is/isn't available +// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular +// expressions are/aren't available. +// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that +// is/isn't available. +// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't +// enabled. +// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that +// std::wstring does/doesn't work (Google Test can +// be used where std::wstring is unavailable). +// GTEST_HAS_TR1_TUPLE - Define it to 1/0 to indicate tr1::tuple +// is/isn't available. +// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the +// compiler supports Microsoft's "Structured +// Exception Handling". +// GTEST_HAS_STREAM_REDIRECTION +// - Define it to 1/0 to indicate whether the +// platform supports I/O stream redirection using +// dup() and dup2(). +// GTEST_USE_OWN_TR1_TUPLE - Define it to 1/0 to indicate whether Google +// Test's own tr1 tuple implementation should be +// used. Unused when the user sets +// GTEST_HAS_TR1_TUPLE to 0. +// GTEST_LANG_CXX11 - Define it to 1/0 to indicate that Google Test +// is building in C++11/C++98 mode. +// GTEST_LINKED_AS_SHARED_LIBRARY +// - Define to 1 when compiling tests that use +// Google Test as a shared library (known as +// DLL on Windows). +// GTEST_CREATE_SHARED_LIBRARY +// - Define to 1 when compiling Google Test itself +// as a shared library. 
+// GTEST_DEFAULT_DEATH_TEST_STYLE +// - The default value of --gtest_death_test_style. +// The legacy default has been "fast" in the open +// source version since 2008. The recommended value +// is "threadsafe", and can be set in +// custom/gtest-port.h. + +// Platform-indicating macros +// -------------------------- +// +// Macros indicating the platform on which Google Test is being used +// (a macro is defined to 1 if compiled on the given platform; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. +// +// GTEST_OS_AIX - IBM AIX +// GTEST_OS_CYGWIN - Cygwin +// GTEST_OS_FREEBSD - FreeBSD +// GTEST_OS_FUCHSIA - Fuchsia +// GTEST_OS_HPUX - HP-UX +// GTEST_OS_LINUX - Linux +// GTEST_OS_LINUX_ANDROID - Google Android +// GTEST_OS_MAC - Mac OS X +// GTEST_OS_IOS - iOS +// GTEST_OS_NACL - Google Native Client (NaCl) +// GTEST_OS_NETBSD - NetBSD +// GTEST_OS_OPENBSD - OpenBSD +// GTEST_OS_QNX - QNX +// GTEST_OS_SOLARIS - Sun Solaris +// GTEST_OS_SYMBIAN - Symbian +// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) +// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop +// GTEST_OS_WINDOWS_MINGW - MinGW +// GTEST_OS_WINDOWS_MOBILE - Windows Mobile +// GTEST_OS_WINDOWS_PHONE - Windows Phone +// GTEST_OS_WINDOWS_RT - Windows Store App/WinRT +// GTEST_OS_ZOS - z/OS +// +// Among the platforms, Cygwin, Linux, Max OS X, and Windows have the +// most stable support. Since core members of the Google Test project +// don't have access to other platforms, support for them may be less +// stable. If you notice any problems on your platform, please notify +// googletestframework@googlegroups.com (patches for fixing them are +// even more welcome!). +// +// It is possible that none of the GTEST_OS_* macros are defined. + +// Feature-indicating macros +// ------------------------- +// +// Macros indicating which Google Test features are available (a macro +// is defined to 1 if the corresponding feature is supported; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. +// +// These macros are public so that portable tests can be written. +// Such tests typically surround code using a feature with an #if +// which controls that code. For example: +// +// #if GTEST_HAS_DEATH_TEST +// EXPECT_DEATH(DoSomethingDeadly()); +// #endif +// +// GTEST_HAS_COMBINE - the Combine() function (for value-parameterized +// tests) +// GTEST_HAS_DEATH_TEST - death tests +// GTEST_HAS_TYPED_TEST - typed tests +// GTEST_HAS_TYPED_TEST_P - type-parameterized tests +// GTEST_IS_THREADSAFE - Google Test is thread-safe. +// GOOGLETEST_CM0007 DO NOT DELETE +// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with +// GTEST_HAS_POSIX_RE (see above) which users can +// define themselves. +// GTEST_USES_SIMPLE_RE - our own simple regex is used; +// the above RE\b(s) are mutually exclusive. +// GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ(). + +// Misc public macros +// ------------------ +// +// GTEST_FLAG(flag_name) - references the variable corresponding to +// the given Google Test flag. + +// Internal utilities +// ------------------ +// +// The following macros and utilities are for Google Test's INTERNAL +// use only. Code outside Google Test MUST NOT USE THEM DIRECTLY. +// +// Macros for basic C++ coding: +// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. 
+// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a +// variable don't have to be used. +// GTEST_DISALLOW_ASSIGN_ - disables operator=. +// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. +// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. +// GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is +// suppressed (constant conditional). +// GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127 +// is suppressed. +// +// C++11 feature wrappers: +// +// testing::internal::forward - portability wrapper for std::forward. +// testing::internal::move - portability wrapper for std::move. +// +// Synchronization: +// Mutex, MutexLock, ThreadLocal, GetThreadCount() +// - synchronization primitives. +// +// Template meta programming: +// is_pointer - as in TR1; needed on Symbian and IBM XL C/C++ only. +// IteratorTraits - partial implementation of std::iterator_traits, which +// is not available in libCstd when compiled with Sun C++. +// +// Smart pointers: +// scoped_ptr - as in TR2. +// +// Regular expressions: +// RE - a simple regular expression class using the POSIX +// Extended Regular Expression syntax on UNIX-like platforms +// GOOGLETEST_CM0008 DO NOT DELETE +// or a reduced regular exception syntax on other +// platforms, including Windows. +// Logging: +// GTEST_LOG_() - logs messages at the specified severity level. +// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. +// +// Stdout and stderr capturing: +// CaptureStdout() - starts capturing stdout. +// GetCapturedStdout() - stops capturing stdout and returns the captured +// string. +// CaptureStderr() - starts capturing stderr. +// GetCapturedStderr() - stops capturing stderr and returns the captured +// string. +// +// Integer types: +// TypeWithSize - maps an integer to a int type. +// Int32, UInt32, Int64, UInt64, TimeInMillis +// - integers of known sizes. +// BiggestInt - the biggest signed integer type. +// +// Command-line utilities: +// GTEST_DECLARE_*() - declares a flag. +// GTEST_DEFINE_*() - defines a flag. +// GetInjectableArgvs() - returns the command line as a vector of strings. +// +// Environment variable utilities: +// GetEnv() - gets the value of an environment variable. +// BoolFromGTestEnv() - parses a bool environment variable. +// Int32FromGTestEnv() - parses an Int32 environment variable. +// StringFromGTestEnv() - parses a string environment variable. + +#include // for isspace, etc +#include // for ptrdiff_t +#include +#include +#include +#ifndef _WIN32_WCE +# include +# include +#endif // !_WIN32_WCE + +#if defined __APPLE__ +# include +# include +#endif + +// Brings in the definition of HAS_GLOBAL_STRING. This must be done +// BEFORE we test HAS_GLOBAL_STRING. +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include +#include // NOLINT + +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. 
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the GTEST_OS_* macro. +// It is separate from gtest-port.h so that custom/gtest-port.h can include it. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ + +// Determines the platform on which Google Test is compiled. +#ifdef __CYGWIN__ +# define GTEST_OS_CYGWIN 1 +#elif defined __SYMBIAN32__ +# define GTEST_OS_SYMBIAN 1 +#elif defined _WIN32 +# define GTEST_OS_WINDOWS 1 +# ifdef _WIN32_WCE +# define GTEST_OS_WINDOWS_MOBILE 1 +# elif defined(__MINGW__) || defined(__MINGW32__) +# define GTEST_OS_WINDOWS_MINGW 1 +# elif defined(WINAPI_FAMILY) +# include +# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +# define GTEST_OS_WINDOWS_DESKTOP 1 +# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) +# define GTEST_OS_WINDOWS_PHONE 1 +# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) +# define GTEST_OS_WINDOWS_RT 1 +# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE) +# define GTEST_OS_WINDOWS_PHONE 1 +# define GTEST_OS_WINDOWS_TV_TITLE 1 +# else + // WINAPI_FAMILY defined but no known partition matched. + // Default to desktop. +# define GTEST_OS_WINDOWS_DESKTOP 1 +# endif +# else +# define GTEST_OS_WINDOWS_DESKTOP 1 +# endif // _WIN32_WCE +#elif defined __APPLE__ +# define GTEST_OS_MAC 1 +# if TARGET_OS_IPHONE +# define GTEST_OS_IOS 1 +# endif +#elif defined __FreeBSD__ +# define GTEST_OS_FREEBSD 1 +#elif defined __Fuchsia__ +# define GTEST_OS_FUCHSIA 1 +#elif defined __linux__ +# define GTEST_OS_LINUX 1 +# if defined __ANDROID__ +# define GTEST_OS_LINUX_ANDROID 1 +# endif +#elif defined __MVS__ +# define GTEST_OS_ZOS 1 +#elif defined(__sun) && defined(__SVR4) +# define GTEST_OS_SOLARIS 1 +#elif defined(_AIX) +# define GTEST_OS_AIX 1 +#elif defined(__hpux) +# define GTEST_OS_HPUX 1 +#elif defined __native_client__ +# define GTEST_OS_NACL 1 +#elif defined __NetBSD__ +# define GTEST_OS_NETBSD 1 +#elif defined __OpenBSD__ +# define GTEST_OS_OPENBSD 1 +#elif defined __QNX__ +# define GTEST_OS_QNX 1 +#endif // __CYGWIN__ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. See README for details +// +// ** Custom implementation starts here ** + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ + +#if !defined(GTEST_DEV_EMAIL_) +# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" +# define GTEST_FLAG_PREFIX_ "gtest_" +# define GTEST_FLAG_PREFIX_DASH_ "gtest-" +# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" +# define GTEST_NAME_ "Google Test" +# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/" +#endif // !defined(GTEST_DEV_EMAIL_) + +#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_) +# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest" +#endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_) + +// Determines the version of gcc that is used to compile this. +#ifdef __GNUC__ +// 40302 means version 4.3.2. +# define GTEST_GCC_VER_ \ + (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) +#endif // __GNUC__ + +// Macros for disabling Microsoft Visual C++ warnings. +// +// GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385) +// /* code that triggers warnings C4800 and C4385 */ +// GTEST_DISABLE_MSC_WARNINGS_POP_() +#if _MSC_VER >= 1400 +# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ + __pragma(warning(push)) \ + __pragma(warning(disable: warnings)) +# define GTEST_DISABLE_MSC_WARNINGS_POP_() \ + __pragma(warning(pop)) +#else +// Older versions of MSVC don't have __pragma. +# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) +# define GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + +// Clang on Windows does not understand MSVC's pragma warning. +// We need clang-specific way to disable function deprecation warning. 
+#ifdef __clang__ +# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"") +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() \ + _Pragma("clang diagnostic pop") +#else +# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) +# define GTEST_DISABLE_MSC_DEPRECATED_POP_() \ + GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + +#ifndef GTEST_LANG_CXX11 +// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when +// -std={c,gnu}++{0x,11} is passed. The C++11 standard specifies a +// value for __cplusplus, and recent versions of clang, gcc, and +// probably other compilers set that too in C++11 mode. +# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L || _MSC_VER >= 1900 +// Compiling in at least C++11 mode. +# define GTEST_LANG_CXX11 1 +# else +# define GTEST_LANG_CXX11 0 +# endif +#endif + +// Distinct from C++11 language support, some environments don't provide +// proper C++11 library support. Notably, it's possible to build in +// C++11 mode when targeting Mac OS X 10.6, which has an old libstdc++ +// with no C++11 support. +// +// libstdc++ has sufficient C++11 support as of GCC 4.6.0, __GLIBCXX__ +// 20110325, but maintenance releases in the 4.4 and 4.5 series followed +// this date, so check for those versions by their date stamps. +// https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html#abi.versioning +#if GTEST_LANG_CXX11 && \ + (!defined(__GLIBCXX__) || ( \ + __GLIBCXX__ >= 20110325ul && /* GCC >= 4.6.0 */ \ + /* Blacklist of patch releases of older branches: */ \ + __GLIBCXX__ != 20110416ul && /* GCC 4.4.6 */ \ + __GLIBCXX__ != 20120313ul && /* GCC 4.4.7 */ \ + __GLIBCXX__ != 20110428ul && /* GCC 4.5.3 */ \ + __GLIBCXX__ != 20120702ul)) /* GCC 4.5.4 */ +# define GTEST_STDLIB_CXX11 1 +#endif + +// Only use C++11 library features if the library provides them. +#if GTEST_STDLIB_CXX11 +# define GTEST_HAS_STD_BEGIN_AND_END_ 1 +# define GTEST_HAS_STD_FORWARD_LIST_ 1 +# if !defined(_MSC_VER) || (_MSC_FULL_VER >= 190023824) +// works only with VS2015U2 and better +# define GTEST_HAS_STD_FUNCTION_ 1 +# endif +# define GTEST_HAS_STD_INITIALIZER_LIST_ 1 +# define GTEST_HAS_STD_MOVE_ 1 +# define GTEST_HAS_STD_UNIQUE_PTR_ 1 +# define GTEST_HAS_STD_SHARED_PTR_ 1 +# define GTEST_HAS_UNORDERED_MAP_ 1 +# define GTEST_HAS_UNORDERED_SET_ 1 +#endif + +// C++11 specifies that provides std::tuple. +// Some platforms still might not have it, however. +#if GTEST_LANG_CXX11 +# define GTEST_HAS_STD_TUPLE_ 1 +# if defined(__clang__) +// Inspired by +// https://clang.llvm.org/docs/LanguageExtensions.html#include-file-checking-macros +# if defined(__has_include) && !__has_include() +# undef GTEST_HAS_STD_TUPLE_ +# endif +# elif defined(_MSC_VER) +// Inspired by boost/config/stdlib/dinkumware.hpp +# if defined(_CPPLIB_VER) && _CPPLIB_VER < 520 +# undef GTEST_HAS_STD_TUPLE_ +# endif +# elif defined(__GLIBCXX__) +// Inspired by boost/config/stdlib/libstdcpp3.hpp, +// http://gcc.gnu.org/gcc-4.2/changes.html and +// https://web.archive.org/web/20140227044429/gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01.html#manual.intro.status.standard.200x +# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2) +# undef GTEST_HAS_STD_TUPLE_ +# endif +# endif +#endif + +// Brings in definitions for functions used in the testing::internal::posix +// namespace (read, write, close, chdir, isatty, stat). 
We do not currently +// use them on Windows Mobile. +#if GTEST_OS_WINDOWS +# if !GTEST_OS_WINDOWS_MOBILE +# include +# include +# endif +// In order to avoid having to include , use forward declaration +#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR) +// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two +// separate (equivalent) structs, instead of using typedef +typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#else +// Assume CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION. +// This assumption is verified by +// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION. +typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#endif +#else +// This assumes that non-Windows OSes provide unistd.h. For OSes where this +// is not the case, we need to include headers that provide the functions +// mentioned above. +# include +# include +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_LINUX_ANDROID +// Used to define __ANDROID_API__ matching the target NDK API level. +# include // NOLINT +#endif + +// Defines this to true iff Google Test can use POSIX regular expressions. +#ifndef GTEST_HAS_POSIX_RE +# if GTEST_OS_LINUX_ANDROID +// On Android, is only available starting with Gingerbread. +# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) +# else +# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS) +# endif +#endif + +#if GTEST_USES_PCRE +// The appropriate headers have already been included. + +#elif GTEST_HAS_POSIX_RE + +// On some platforms, needs someone to define size_t, and +// won't compile otherwise. We can #include it here as we already +// included , which is guaranteed to define size_t through +// . +# include // NOLINT + +# define GTEST_USES_POSIX_RE 1 + +#elif GTEST_OS_WINDOWS + +// is not available on Windows. Use our own simple regex +// implementation instead. +# define GTEST_USES_SIMPLE_RE 1 + +#else + +// may not be available on this platform. Use our own +// simple regex implementation instead. +# define GTEST_USES_SIMPLE_RE 1 + +#endif // GTEST_USES_PCRE + +#ifndef GTEST_HAS_EXCEPTIONS +// The user didn't tell us whether exceptions are enabled, so we need +// to figure it out. +# if defined(_MSC_VER) && defined(_CPPUNWIND) +// MSVC defines _CPPUNWIND to 1 iff exceptions are enabled. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__BORLANDC__) +// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS +// macro to enable exceptions, so we'll do the same. +// Assumes that exceptions are enabled by default. +# ifndef _HAS_EXCEPTIONS +# define _HAS_EXCEPTIONS 1 +# endif // _HAS_EXCEPTIONS +# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS +# elif defined(__clang__) +// clang defines __EXCEPTIONS iff exceptions are enabled before clang 220714, +// but iff cleanups are enabled after that. In Obj-C++ files, there can be +// cleanups for ObjC exceptions which also need cleanups, even if C++ exceptions +// are disabled. clang has __has_feature(cxx_exceptions) which checks for C++ +// exceptions starting at clang r206352, but which checked for cleanups prior to +// that. To reliably check for C++ exception availability with clang, check for +// __EXCEPTIONS && __has_feature(cxx_exceptions). +# define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions)) +# elif defined(__GNUC__) && __EXCEPTIONS +// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__SUNPRO_CC) +// Sun Pro CC supports exceptions. 
However, there is no compile-time way of +// detecting whether they are enabled or not. Therefore, we assume that +// they are enabled unless the user tells us otherwise. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__IBMCPP__) && __EXCEPTIONS +// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__HP_aCC) +// Exception handling is in effect by default in HP aCC compiler. It has to +// be turned of by +noeh compiler option if desired. +# define GTEST_HAS_EXCEPTIONS 1 +# else +// For other compilers, we assume exceptions are disabled to be +// conservative. +# define GTEST_HAS_EXCEPTIONS 0 +# endif // defined(_MSC_VER) || defined(__BORLANDC__) +#endif // GTEST_HAS_EXCEPTIONS + +#if !defined(GTEST_HAS_STD_STRING) +// Even though we don't use this macro any longer, we keep it in case +// some clients still depend on it. +# define GTEST_HAS_STD_STRING 1 +#elif !GTEST_HAS_STD_STRING +// The user told us that ::std::string isn't available. +# error "::std::string isn't available." +#endif // !defined(GTEST_HAS_STD_STRING) + +#ifndef GTEST_HAS_GLOBAL_STRING +# define GTEST_HAS_GLOBAL_STRING 0 +#endif // GTEST_HAS_GLOBAL_STRING + +#ifndef GTEST_HAS_STD_WSTRING +// The user didn't tell us whether ::std::wstring is available, so we need +// to figure it out. +// FIXME: uses autoconf to detect whether ::std::wstring +// is available. + +// Cygwin 1.7 and below doesn't support ::std::wstring. +// Solaris' libc++ doesn't support it either. Android has +// no support for it at least as recent as Froyo (2.2). +# define GTEST_HAS_STD_WSTRING \ + (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS)) + +#endif // GTEST_HAS_STD_WSTRING + +#ifndef GTEST_HAS_GLOBAL_WSTRING +// The user didn't tell us whether ::wstring is available, so we need +// to figure it out. +# define GTEST_HAS_GLOBAL_WSTRING \ + (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING) +#endif // GTEST_HAS_GLOBAL_WSTRING + +// Determines whether RTTI is available. +#ifndef GTEST_HAS_RTTI +// The user didn't tell us whether RTTI is enabled, so we need to +// figure it out. + +# ifdef _MSC_VER + +# ifdef _CPPRTTI // MSVC defines this macro iff RTTI is enabled. +# define GTEST_HAS_RTTI 1 +# else +# define GTEST_HAS_RTTI 0 +# endif + +// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled. +# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302) + +# ifdef __GXX_RTTI +// When building against STLport with the Android NDK and with +// -frtti -fno-exceptions, the build fails at link time with undefined +// references to __cxa_bad_typeid. Note sure if STL or toolchain bug, +// so disable RTTI when detected. +# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \ + !defined(__EXCEPTIONS) +# define GTEST_HAS_RTTI 0 +# else +# define GTEST_HAS_RTTI 1 +# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS +# else +# define GTEST_HAS_RTTI 0 +# endif // __GXX_RTTI + +// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends +// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the +// first version with C++ support. +# elif defined(__clang__) + +# define GTEST_HAS_RTTI __has_feature(cxx_rtti) + +// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if +// both the typeid and dynamic_cast features are present. 
+# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) + +# ifdef __RTTI_ALL__ +# define GTEST_HAS_RTTI 1 +# else +# define GTEST_HAS_RTTI 0 +# endif + +# else + +// For all other compilers, we assume RTTI is enabled. +# define GTEST_HAS_RTTI 1 + +# endif // _MSC_VER + +#endif // GTEST_HAS_RTTI + +// It's this header's responsibility to #include when RTTI +// is enabled. +#if GTEST_HAS_RTTI +# include +#endif + +// Determines whether Google Test can use the pthreads library. +#ifndef GTEST_HAS_PTHREAD +// The user didn't tell us explicitly, so we make reasonable assumptions about +// which platforms have pthreads support. +// +// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 +// to your compiler flags. +#define GTEST_HAS_PTHREAD \ + (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \ + GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA) +#endif // GTEST_HAS_PTHREAD + +#if GTEST_HAS_PTHREAD +// gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is +// true. +# include // NOLINT + +// For timespec and nanosleep, used below. +# include // NOLINT +#endif + +// Determines if hash_map/hash_set are available. +// Only used for testing against those containers. +#if !defined(GTEST_HAS_HASH_MAP_) +# if defined(_MSC_VER) && (_MSC_VER < 1900) +# define GTEST_HAS_HASH_MAP_ 1 // Indicates that hash_map is available. +# define GTEST_HAS_HASH_SET_ 1 // Indicates that hash_set is available. +# endif // _MSC_VER +#endif // !defined(GTEST_HAS_HASH_MAP_) + +// Determines whether Google Test can use tr1/tuple. You can define +// this macro to 0 to prevent Google Test from using tuple (any +// feature depending on tuple with be disabled in this mode). +#ifndef GTEST_HAS_TR1_TUPLE +# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) +// STLport, provided with the Android NDK, has neither or . +# define GTEST_HAS_TR1_TUPLE 0 +# elif defined(_MSC_VER) && (_MSC_VER >= 1910) +// Prevent `warning C4996: 'std::tr1': warning STL4002: +// The non-Standard std::tr1 namespace and TR1-only machinery +// are deprecated and will be REMOVED.` +# define GTEST_HAS_TR1_TUPLE 0 +# elif GTEST_LANG_CXX11 && defined(_LIBCPP_VERSION) +// libc++ doesn't support TR1. +# define GTEST_HAS_TR1_TUPLE 0 +# else +// The user didn't tell us not to do it, so we assume it's OK. +# define GTEST_HAS_TR1_TUPLE 1 +# endif +#endif // GTEST_HAS_TR1_TUPLE + +// Determines whether Google Test's own tr1 tuple implementation +// should be used. +#ifndef GTEST_USE_OWN_TR1_TUPLE +// We use our own tuple implementation on Symbian. +# if GTEST_OS_SYMBIAN +# define GTEST_USE_OWN_TR1_TUPLE 1 +# else +// The user didn't tell us, so we need to figure it out. + +// We use our own TR1 tuple if we aren't sure the user has an +// implementation of it already. At this time, libstdc++ 4.0.0+ and +// MSVC 2010 are the only mainstream standard libraries that come +// with a TR1 tuple implementation. NVIDIA's CUDA NVCC compiler +// pretends to be GCC by defining __GNUC__ and friends, but cannot +// compile GCC's tuple implementation. MSVC 2008 (9.0) provides TR1 +// tuple in a 323 MB Feature Pack download, which we cannot assume the +// user has. QNX's QCC compiler is a modified GCC but it doesn't +// support TR1 tuple. libc++ only provides std::tuple, in C++11 mode, +// and it can be used with some compilers that define __GNUC__. 
+# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \ + && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) \ + || (_MSC_VER >= 1600 && _MSC_VER < 1900) +# define GTEST_ENV_HAS_TR1_TUPLE_ 1 +# endif + +// C++11 specifies that provides std::tuple. Use that if gtest is used +// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6 +// can build with clang but need to use gcc4.2's libstdc++). +# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325) +# define GTEST_ENV_HAS_STD_TUPLE_ 1 +# endif + +# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_ +# define GTEST_USE_OWN_TR1_TUPLE 0 +# else +# define GTEST_USE_OWN_TR1_TUPLE 1 +# endif +# endif // GTEST_OS_SYMBIAN +#endif // GTEST_USE_OWN_TR1_TUPLE + +// To avoid conditional compilation we make it gtest-port.h's responsibility +// to #include the header implementing tuple. +#if GTEST_HAS_STD_TUPLE_ +# include // IWYU pragma: export +# define GTEST_TUPLE_NAMESPACE_ ::std +#endif // GTEST_HAS_STD_TUPLE_ + +// We include tr1::tuple even if std::tuple is available to define printers for +// them. +#if GTEST_HAS_TR1_TUPLE +# ifndef GTEST_TUPLE_NAMESPACE_ +# define GTEST_TUPLE_NAMESPACE_ ::std::tr1 +# endif // GTEST_TUPLE_NAMESPACE_ + +# if GTEST_USE_OWN_TR1_TUPLE +// This file was GENERATED by command: +// pump.py gtest-tuple.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2009 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +// Implements a subset of TR1 tuple needed by Google Test and Google Mock. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ + +#include // For ::std::pair. + +// The compiler used in Symbian has a bug that prevents us from declaring the +// tuple template as a friend (it complains that tuple is redefined). This +// bypasses the bug by declaring the members that should otherwise be +// private as public. +// Sun Studio versions < 12 also have the above bug. 
+#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
+#else
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
+    template <GTEST_10_TYPENAMES_(U)> friend class tuple; \
+   private:
+#endif
+
+// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict
+// with our own definitions. Therefore using our own tuple does not work on
+// those compilers.
+#if defined(_MSC_VER) && _MSC_VER >= 1600  /* 1600 is Visual Studio 2010 */
+# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \
+GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers."
+#endif
+
+// GTEST_n_TUPLE_(T) is the type of an n-tuple.
+#define GTEST_0_TUPLE_(T) tuple<>
+#define GTEST_1_TUPLE_(T) tuple<T##0>
+#define GTEST_2_TUPLE_(T) tuple<T##0, T##1>
+#define GTEST_3_TUPLE_(T) tuple<T##0, T##1, T##2>
+#define GTEST_4_TUPLE_(T) tuple<T##0, T##1, T##2, T##3>
+#define GTEST_5_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4>
+#define GTEST_6_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5>
+#define GTEST_7_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6>
+#define GTEST_8_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, T##7>
+#define GTEST_9_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8>
+#define GTEST_10_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8, T##9>
+
+// GTEST_n_TYPENAMES_(T) declares a list of n typenames.
+#define GTEST_0_TYPENAMES_(T)
+#define GTEST_1_TYPENAMES_(T) typename T##0
+#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1
+#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2
+#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3
+#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4
+#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5
+#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6
+#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, typename T##7
+#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8
+#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8, typename T##9
+
+// In theory, defining stuff in the ::std namespace is undefined
+// behavior.  We can do this as we are playing the role of a standard
+// library vendor.
+namespace std {
+namespace tr1 {
+
+template <typename T0 = void, typename T1 = void, typename T2 = void,
+    typename T3 = void, typename T4 = void, typename T5 = void,
+    typename T6 = void, typename T7 = void, typename T8 = void,
+    typename T9 = void>
+class tuple;
+
+// Anything in namespace gtest_internal is Google Test's INTERNAL
+// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
+namespace gtest_internal {
+
+// ByRef<T>::type is T if T is a reference; otherwise it's const T&.
+template <typename T>
+struct ByRef { typedef const T& type; };  // NOLINT
+template <typename T>
+struct ByRef<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper for ByRef.
+#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type
+
+// AddRef<T>::type is T if T is a reference; otherwise it's T&.  This
+// is the same as tr1::add_reference<T>::type.
+template <typename T>
+struct AddRef { typedef T& type; };  // NOLINT
+template <typename T>
+struct AddRef<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper for AddRef.
+#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type
+
+// A helper for implementing get<k>().
+template <int k> class Get;
+
+// A helper for implementing tuple_element<k, T>.  kIndexValid is true
+// iff k < the number of fields in tuple type T.
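+// (Illustrative expansion of the macros defined above, not upstream text:
+// a template header such as
+//     template <GTEST_2_TYPENAMES_(T)> class GTEST_2_TUPLE_(T);
+// preprocesses to
+//     template <typename T0, typename T1> class tuple<T0, T1>;
+// which is the pattern used by every tuple specialization below.)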
+template +struct TupleElement; + +template +struct TupleElement { + typedef T0 type; +}; + +template +struct TupleElement { + typedef T1 type; +}; + +template +struct TupleElement { + typedef T2 type; +}; + +template +struct TupleElement { + typedef T3 type; +}; + +template +struct TupleElement { + typedef T4 type; +}; + +template +struct TupleElement { + typedef T5 type; +}; + +template +struct TupleElement { + typedef T6 type; +}; + +template +struct TupleElement { + typedef T7 type; +}; + +template +struct TupleElement { + typedef T8 type; +}; + +template +struct TupleElement { + typedef T9 type; +}; + +} // namespace gtest_internal + +template <> +class tuple<> { + public: + tuple() {} + tuple(const tuple& /* t */) {} + tuple& operator=(const tuple& /* t */) { return *this; } +}; + +template +class GTEST_1_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {} + + tuple(const tuple& t) : f0_(t.f0_) {} + + template + tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_1_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) { + f0_ = t.f0_; + return *this; + } + + T0 f0_; +}; + +template +class GTEST_2_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0), + f1_(f1) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {} + + template + tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {} + template + tuple(const ::std::pair& p) : f0_(p.first), f1_(p.second) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_2_TUPLE_(U)& t) { + return CopyFrom(t); + } + template + tuple& operator=(const ::std::pair& p) { + f0_ = p.first; + f1_ = p.second; + return *this; + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + return *this; + } + + T0 f0_; + T1 f1_; +}; + +template +class GTEST_3_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} + + template + tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_3_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; +}; + +template +class GTEST_4_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {} + + template + tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_4_TUPLE_(U)& 
t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; +}; + +template +class GTEST_5_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, + GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_) {} + + template + tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_5_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; +}; + +template +class GTEST_6_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_) {} + + template + tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_6_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; +}; + +template +class GTEST_7_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3), f4_(f4), f5_(f5), f6_(f6) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} + + template + tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_7_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; +}; + +template +class GTEST_8_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + 
GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, + GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5), f6_(f6), f7_(f7) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} + + template + tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_8_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; + T7 f7_; +}; + +template +class GTEST_9_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, + GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5), f6_(f6), f7_(f7), f8_(f8) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} + + template + tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_9_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + f8_ = t.f8_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; + T7 f7_; + T8 f8_; +}; + +template +class tuple { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(), + f9_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, + GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {} + + template + tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), + f9_(t.f9_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_10_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + f8_ = t.f8_; + f9_ = t.f9_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 
f4_; + T5 f5_; + T6 f6_; + T7 f7_; + T8 f8_; + T9 f9_; +}; + +// 6.1.3.2 Tuple creation functions. + +// Known limitations: we don't support passing an +// std::tr1::reference_wrapper to make_tuple(). And we don't +// implement tie(). + +inline tuple<> make_tuple() { return tuple<>(); } + +template +inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) { + return GTEST_1_TUPLE_(T)(f0); +} + +template +inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) { + return GTEST_2_TUPLE_(T)(f0, f1); +} + +template +inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) { + return GTEST_3_TUPLE_(T)(f0, f1, f2); +} + +template +inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3) { + return GTEST_4_TUPLE_(T)(f0, f1, f2, f3); +} + +template +inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4) { + return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4); +} + +template +inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5) { + return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5); +} + +template +inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6) { + return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6); +} + +template +inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) { + return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7); +} + +template +inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, + const T8& f8) { + return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8); +} + +template +inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, + const T8& f8, const T9& f9) { + return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9); +} + +// 6.1.3.3 Tuple helper classes. + +template struct tuple_size; + +template +struct tuple_size { + static const int value = 0; +}; + +template +struct tuple_size { + static const int value = 1; +}; + +template +struct tuple_size { + static const int value = 2; +}; + +template +struct tuple_size { + static const int value = 3; +}; + +template +struct tuple_size { + static const int value = 4; +}; + +template +struct tuple_size { + static const int value = 5; +}; + +template +struct tuple_size { + static const int value = 6; +}; + +template +struct tuple_size { + static const int value = 7; +}; + +template +struct tuple_size { + static const int value = 8; +}; + +template +struct tuple_size { + static const int value = 9; +}; + +template +struct tuple_size { + static const int value = 10; +}; + +template +struct tuple_element { + typedef typename gtest_internal::TupleElement< + k < (tuple_size::value), k, Tuple>::type type; +}; + +#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element::type + +// 6.1.3.4 Element access. 
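+// (Illustrative usage, not upstream text: with the declarations above,
+//     tuple<int, char> t = make_tuple(1, 'a');
+// gives get<0>(t) == 1 and get<1>(t) == 'a', matching the TR1 interface.)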
+ +namespace gtest_internal { + +template <> +class Get<0> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) + Field(Tuple& t) { return t.f0_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) + ConstField(const Tuple& t) { return t.f0_; } +}; + +template <> +class Get<1> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) + Field(Tuple& t) { return t.f1_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) + ConstField(const Tuple& t) { return t.f1_; } +}; + +template <> +class Get<2> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) + Field(Tuple& t) { return t.f2_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) + ConstField(const Tuple& t) { return t.f2_; } +}; + +template <> +class Get<3> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) + Field(Tuple& t) { return t.f3_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) + ConstField(const Tuple& t) { return t.f3_; } +}; + +template <> +class Get<4> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) + Field(Tuple& t) { return t.f4_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) + ConstField(const Tuple& t) { return t.f4_; } +}; + +template <> +class Get<5> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) + Field(Tuple& t) { return t.f5_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) + ConstField(const Tuple& t) { return t.f5_; } +}; + +template <> +class Get<6> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) + Field(Tuple& t) { return t.f6_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) + ConstField(const Tuple& t) { return t.f6_; } +}; + +template <> +class Get<7> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) + Field(Tuple& t) { return t.f7_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) + ConstField(const Tuple& t) { return t.f7_; } +}; + +template <> +class Get<8> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) + Field(Tuple& t) { return t.f8_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) + ConstField(const Tuple& t) { return t.f8_; } +}; + +template <> +class Get<9> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) + Field(Tuple& t) { return t.f9_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) + ConstField(const Tuple& t) { return t.f9_; } +}; + +} // namespace gtest_internal + +template +GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) +get(GTEST_10_TUPLE_(T)& t) { + return gtest_internal::Get::Field(t); +} + +template +GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) +get(const GTEST_10_TUPLE_(T)& t) { + return gtest_internal::Get::ConstField(t); +} + +// 6.1.3.5 Relational operators + +// We only implement == and !=, as we don't have a need for the rest yet. + +namespace gtest_internal { + +// SameSizeTuplePrefixComparator::Eq(t1, t2) returns true if the +// first k fields of t1 equals the first k fields of t2. +// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if +// k1 != k2. 
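+// (Illustrative expansion, not upstream text: Eq<2, 2>(t1, t2) evaluates
+//     Eq<1, 1>(t1, t2) && get<1>(t1) == get<1>(t2)
+// and the recursion terminates at the Eq<0, 0> specialization, which
+// simply returns true.)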
+template <int kSize1, int kSize2>
+struct SameSizeTuplePrefixComparator;
+
+template <>
+struct SameSizeTuplePrefixComparator<0, 0> {
+  template <class Tuple1, class Tuple2>
+  static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) {
+    return true;
+  }
+};
+
+template <int k>
+struct SameSizeTuplePrefixComparator<k, k> {
+  template <class Tuple1, class Tuple2>
+  static bool Eq(const Tuple1& t1, const Tuple2& t2) {
+    return SameSizeTuplePrefixComparator<k - 1, k - 1>::Eq(t1, t2) &&
+        ::std::tr1::get<k - 1>(t1) == ::std::tr1::get<k - 1>(t2);
+  }
+};
+
+}  // namespace gtest_internal
+
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator==(const GTEST_10_TUPLE_(T)& t,
+                       const GTEST_10_TUPLE_(U)& u) {
+  return gtest_internal::SameSizeTuplePrefixComparator<
+      tuple_size<GTEST_10_TUPLE_(T) >::value,
+      tuple_size<GTEST_10_TUPLE_(U) >::value>::Eq(t, u);
+}
+
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator!=(const GTEST_10_TUPLE_(T)& t,
+                       const GTEST_10_TUPLE_(U)& u) { return !(t == u); }
+
+// 6.1.4 Pairs.
+// Unimplemented.
+
+}  // namespace tr1
+}  // namespace std
+
+#undef GTEST_0_TUPLE_
+#undef GTEST_1_TUPLE_
+#undef GTEST_2_TUPLE_
+#undef GTEST_3_TUPLE_
+#undef GTEST_4_TUPLE_
+#undef GTEST_5_TUPLE_
+#undef GTEST_6_TUPLE_
+#undef GTEST_7_TUPLE_
+#undef GTEST_8_TUPLE_
+#undef GTEST_9_TUPLE_
+#undef GTEST_10_TUPLE_
+
+#undef GTEST_0_TYPENAMES_
+#undef GTEST_1_TYPENAMES_
+#undef GTEST_2_TYPENAMES_
+#undef GTEST_3_TYPENAMES_
+#undef GTEST_4_TYPENAMES_
+#undef GTEST_5_TYPENAMES_
+#undef GTEST_6_TYPENAMES_
+#undef GTEST_7_TYPENAMES_
+#undef GTEST_8_TYPENAMES_
+#undef GTEST_9_TYPENAMES_
+#undef GTEST_10_TYPENAMES_
+
+#undef GTEST_DECLARE_TUPLE_AS_FRIEND_
+#undef GTEST_BY_REF_
+#undef GTEST_ADD_REF_
+#undef GTEST_TUPLE_ELEMENT_
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+# elif GTEST_OS_SYMBIAN
+
+// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to
+// use STLport's tuple implementation, which unfortunately doesn't
+// work as the copy of STLport distributed with Symbian is incomplete.
+// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to
+// use its own tuple implementation.
+#  ifdef BOOST_HAS_TR1_TUPLE
+#   undef BOOST_HAS_TR1_TUPLE
+#  endif  // BOOST_HAS_TR1_TUPLE
+
+// This prevents <boost/tr1/detail/config.hpp>, which defines
+// BOOST_HAS_TR1_TUPLE, from being #included by Boost's <tuple>.
+#  define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED
+#  include <tuple>  // IWYU pragma: export  // NOLINT
+
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000)
+// GCC 4.0+ implements tr1/tuple in the <tr1/tuple> header.  This does
+// not conform to the TR1 spec, which requires the header to be <tuple>.
+
+#  if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+// Until version 4.3.2, gcc has a bug that causes <tr1/functional>,
+// which is #included by <tr1/tuple>, to not compile when RTTI is
+// disabled.  _TR1_FUNCTIONAL is the header guard for
+// <tr1/functional>.  Hence the following #define is used to prevent
+// <tr1/functional> from being included.
+#   define _TR1_FUNCTIONAL 1
+#   include <tr1/tuple>
+#   undef _TR1_FUNCTIONAL  // Allows the user to #include
+                           // <tr1/functional> if they choose to.
+#  else
+#   include <tr1/tuple>  // NOLINT
+#  endif  // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+
+// VS 2010 now has tr1 support.
+# elif _MSC_VER >= 1600
+#  include <tuple>  // IWYU pragma: export  // NOLINT
+
+# else  // GTEST_USE_OWN_TR1_TUPLE
+#  include <tuple>  // IWYU pragma: export  // NOLINT
+# endif  // GTEST_USE_OWN_TR1_TUPLE
+
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Determines whether clone(2) is supported.
+// Usually it will only be available on Linux, excluding
+// Linux on the Itanium architecture.
+// Also see http://linux.die.net/man/2/clone.
+#ifndef GTEST_HAS_CLONE
+// The user didn't tell us, so we need to figure it out.
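+// (Illustrative, not upstream text: like the other feature-test macros in
+// this header, the detection below only runs when the user has not already
+// supplied a value, so e.g. -DGTEST_HAS_CLONE=0 on the compiler command line
+// settles the question up front.)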
+
+# if GTEST_OS_LINUX && !defined(__ia64__)
+#  if GTEST_OS_LINUX_ANDROID
+// On Android, clone() became available at different API levels for each 32-bit
+// architecture.
+#    if defined(__LP64__) || \
+        (defined(__arm__) && __ANDROID_API__ >= 9) || \
+        (defined(__mips__) && __ANDROID_API__ >= 12) || \
+        (defined(__i386__) && __ANDROID_API__ >= 17)
+#     define GTEST_HAS_CLONE 1
+#    else
+#     define GTEST_HAS_CLONE 0
+#    endif
+#  else
+#   define GTEST_HAS_CLONE 1
+#  endif
+# else
+#  define GTEST_HAS_CLONE 0
+# endif  // GTEST_OS_LINUX && !defined(__ia64__)
+
+#endif  // GTEST_HAS_CLONE
+
+// Determines whether to support stream redirection. This is used to test
+// output correctness and to implement death tests.
+#ifndef GTEST_HAS_STREAM_REDIRECTION
+// By default, we assume that stream redirection is supported on all
+// platforms except known mobile ones.
+# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || \
+    GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+#  define GTEST_HAS_STREAM_REDIRECTION 0
+# else
+#  define GTEST_HAS_STREAM_REDIRECTION 1
+# endif  // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+// Determines whether to support death tests.
+// Google Test does not support death tests for VC 7.1 and earlier as
+// abort() in a VC 7.1 application compiled as GUI in debug config
+// pops up a dialog window that cannot be suppressed programmatically.
+#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
+     (GTEST_OS_MAC && !GTEST_OS_IOS) || \
+     (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \
+     GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \
+     GTEST_OS_OPENBSD || GTEST_OS_QNX || GTEST_OS_FREEBSD || \
+     GTEST_OS_NETBSD || GTEST_OS_FUCHSIA)
+# define GTEST_HAS_DEATH_TEST 1
+#endif
+
+// Determines whether to support type-driven tests.
+
+// Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0,
+// Sun Pro CC, IBM Visual Age, and HP aCC support.
+#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \
+    defined(__IBMCPP__) || defined(__HP_aCC)
+# define GTEST_HAS_TYPED_TEST 1
+# define GTEST_HAS_TYPED_TEST_P 1
+#endif
+
+// Determines whether to support Combine(). This only makes sense when
+// value-parameterized tests are enabled. The implementation doesn't
+// work on Sun Studio since it doesn't understand templated conversion
+// operators.
+#if (GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_) && !defined(__SUNPRO_CC)
+# define GTEST_HAS_COMBINE 1
+#endif
+
+// Determines whether the system compiler uses UTF-16 for encoding wide strings.
+#define GTEST_WIDE_STRING_USES_UTF16_ \
+    (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX)
+
+// Determines whether test results can be streamed to a socket.
+#if GTEST_OS_LINUX
+# define GTEST_CAN_STREAM_RESULTS_ 1
+#endif
+
+// Defines some utility macros.
+
+// The GNU compiler emits a warning if nested "if" statements are followed by
+// an "else" statement and braces are not used to explicitly disambiguate the
+// "else" binding.  This leads to problems with code like:
+//
+//   if (gate)
+//     ASSERT_*(condition) << "Some message";
+//
+// The "switch (0) case 0:" idiom is used to suppress this.
+#ifdef __INTEL_COMPILER
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#else
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default:  // NOLINT
+#endif
+
+// Use this annotation at the end of a struct/class definition to
+// prevent the compiler from optimizing away instances that are never
+// used.
This is useful when all interesting logic happens inside the +// c'tor and / or d'tor. Example: +// +// struct Foo { +// Foo() { ... } +// } GTEST_ATTRIBUTE_UNUSED_; +// +// Also use it after a variable or parameter declaration to tell the +// compiler the variable/parameter does not have to be used. +#if defined(__GNUC__) && !defined(COMPILER_ICC) +# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) +#elif defined(__clang__) +# if __has_attribute(unused) +# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) +# endif +#endif +#ifndef GTEST_ATTRIBUTE_UNUSED_ +# define GTEST_ATTRIBUTE_UNUSED_ +#endif + +#if GTEST_LANG_CXX11 +# define GTEST_CXX11_EQUALS_DELETE_ = delete +#else // GTEST_LANG_CXX11 +# define GTEST_CXX11_EQUALS_DELETE_ +#endif // GTEST_LANG_CXX11 + +// Use this annotation before a function that takes a printf format string. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC) +# if defined(__MINGW_PRINTF_FORMAT) +// MinGW has two different printf implementations. Ensure the format macro +// matches the selected implementation. See +// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/. +# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__((__format__(__MINGW_PRINTF_FORMAT, string_index, \ + first_to_check))) +# else +# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +# endif +#else +# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) +#endif + + +// A macro to disallow operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_ASSIGN_(type) \ + void operator=(type const &) GTEST_CXX11_EQUALS_DELETE_ + +// A macro to disallow copy constructor and operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \ + type(type const &) GTEST_CXX11_EQUALS_DELETE_; \ + GTEST_DISALLOW_ASSIGN_(type) + +// Tell the compiler to warn about unused return values for functions declared +// with this macro. The macro should be used on function declarations +// following the argument list: +// +// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; +#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC) +# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result)) +#else +# define GTEST_MUST_USE_RESULT_ +#endif // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC + +// MS C++ compiler emits warning when a conditional expression is compile time +// constant. In some contexts this warning is false positive and needs to be +// suppressed. Use the following two macros in such cases: +// +// GTEST_INTENTIONAL_CONST_COND_PUSH_() +// while (true) { +// GTEST_INTENTIONAL_CONST_COND_POP_() +// } +# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127) +# define GTEST_INTENTIONAL_CONST_COND_POP_() \ + GTEST_DISABLE_MSC_WARNINGS_POP_() + +// Determine whether the compiler supports Microsoft's Structured Exception +// Handling. This is supported by several Windows compilers but generally +// does not exist on any other system. +#ifndef GTEST_HAS_SEH +// The user didn't tell us, so we need to figure it out. + +# if defined(_MSC_VER) || defined(__BORLANDC__) +// These two compilers are known to support SEH. +# define GTEST_HAS_SEH 1 +# else +// Assume no SEH. 
+# define GTEST_HAS_SEH 0 +# endif + +#define GTEST_IS_THREADSAFE \ + (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ \ + || (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) \ + || GTEST_HAS_PTHREAD) + +#endif // GTEST_HAS_SEH + +// GTEST_API_ qualifies all symbols that must be exported. The definitions below +// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in +// gtest/internal/custom/gtest-port.h +#ifndef GTEST_API_ + +#ifdef _MSC_VER +# if GTEST_LINKED_AS_SHARED_LIBRARY +# define GTEST_API_ __declspec(dllimport) +# elif GTEST_CREATE_SHARED_LIBRARY +# define GTEST_API_ __declspec(dllexport) +# endif +#elif __GNUC__ >= 4 || defined(__clang__) +# define GTEST_API_ __attribute__((visibility ("default"))) +#endif // _MSC_VER + +#endif // GTEST_API_ + +#ifndef GTEST_API_ +# define GTEST_API_ +#endif // GTEST_API_ + +#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE +# define GTEST_DEFAULT_DEATH_TEST_STYLE "fast" +#endif // GTEST_DEFAULT_DEATH_TEST_STYLE + +#ifdef __GNUC__ +// Ask the compiler to never inline a given function. +# define GTEST_NO_INLINE_ __attribute__((noinline)) +#else +# define GTEST_NO_INLINE_ +#endif + +// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. +#if !defined(GTEST_HAS_CXXABI_H_) +# if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) +# define GTEST_HAS_CXXABI_H_ 1 +# else +# define GTEST_HAS_CXXABI_H_ 0 +# endif +#endif + +// A function level attribute to disable checking for use of uninitialized +// memory when built with MemorySanitizer. +#if defined(__clang__) +# if __has_feature(memory_sanitizer) +# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \ + __attribute__((no_sanitize_memory)) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +# endif // __has_feature(memory_sanitizer) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __clang__ + +// A function level attribute to disable AddressSanitizer instrumentation. +#if defined(__clang__) +# if __has_feature(address_sanitizer) +# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \ + __attribute__((no_sanitize_address)) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +# endif // __has_feature(address_sanitizer) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __clang__ + +// A function level attribute to disable ThreadSanitizer instrumentation. +#if defined(__clang__) +# if __has_feature(thread_sanitizer) +# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \ + __attribute__((no_sanitize_thread)) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +# endif // __has_feature(thread_sanitizer) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __clang__ + +namespace testing { + +class Message; + +#if defined(GTEST_TUPLE_NAMESPACE_) +// Import tuple and friends into the ::testing namespace. +// It is part of our interface, having them in ::testing allows us to change +// their types as needed. +using GTEST_TUPLE_NAMESPACE_::get; +using GTEST_TUPLE_NAMESPACE_::make_tuple; +using GTEST_TUPLE_NAMESPACE_::tuple; +using GTEST_TUPLE_NAMESPACE_::tuple_size; +using GTEST_TUPLE_NAMESPACE_::tuple_element; +#endif // defined(GTEST_TUPLE_NAMESPACE_) + +namespace internal { + +// A secret type that Google Test users don't know about. It has no +// definition on purpose. Therefore it's impossible to create a +// Secret object, which is what we want. +class Secret; + +// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time +// expression is true. 
For example, you could use it to verify the
+// size of a static array:
+//
+//   GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES,
+//                         names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+//   GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+
+#if GTEST_LANG_CXX11
+# define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg)
+#else  // !GTEST_LANG_CXX11
+template <bool>
+struct CompileAssert {
+};
+
+# define GTEST_COMPILE_ASSERT_(expr, msg) \
+  typedef ::testing::internal::CompileAssert<(static_cast<bool>(expr))> \
+      msg[static_cast<bool>(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_
+#endif  // !GTEST_LANG_CXX11
+
+// Implementation details of GTEST_COMPILE_ASSERT_:
+//
+// (In C++11, we simply use static_assert instead of the following)
+//
+// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1
+//   elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+//    #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+//   does not work, as gcc supports variable-length arrays whose sizes
+//   are determined at run-time (this is gcc's extension and not part
+//   of the C++ standard).  As a result, gcc fails to reject the
+//   following code with the simple definition:
+//
+//     int foo;
+//     GTEST_COMPILE_ASSERT_(foo, msg);  // not supposed to compile as foo is
+//                                       // not a compile-time constant.
+//
+// - By using the type CompileAssert<(bool(expr))>, we ensure that
+//   expr is a compile-time constant.  (Template arguments must be
+//   determined at compile-time.)
+//
+// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
+//   to work around a bug in gcc 3.4.4 and 4.0.1.  If we had written
+//
+//     CompileAssert<bool(expr)>
+//
+//   instead, these compilers will refuse to compile
+//
+//     GTEST_COMPILE_ASSERT_(5 > 0, some_message);
+//
+//   (They seem to think the ">" in "5 > 0" marks the end of the
+//   template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+//     ((expr) ? 1 : -1).
+//
+//   This is to avoid running into a bug in MS VC 7.1, which
+//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
+
+// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h.
+//
+// This template is declared, but intentionally undefined.
+template <typename T1, typename T2>
+struct StaticAssertTypeEqHelper;
+
+template <typename T>
+struct StaticAssertTypeEqHelper<T, T> {
+  enum { value = true };
+};
+
+// Same as std::is_same<>.
+template <typename T, typename U>
+struct IsSame {
+  enum { value = false };
+};
+template <typename T>
+struct IsSame<T, T> {
+  enum { value = true };
+};
+
+// Evaluates to the number of elements in 'array'.
+#define GTEST_ARRAY_SIZE_(array) (sizeof(array) / sizeof(array[0]))
+
+#if GTEST_HAS_GLOBAL_STRING
+typedef ::string string;
+#else
+typedef ::std::string string;
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+typedef ::wstring wstring;
+#elif GTEST_HAS_STD_WSTRING
+typedef ::std::wstring wstring;
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// A helper for suppressing warnings on constant condition.  It just
+// returns 'condition'.
+GTEST_API_ bool IsTrue(bool condition);
+
+// Defines scoped_ptr.
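+// (Illustrative usage of the alias defined just below, not upstream text:
+//     scoped_ptr<int> p(new int(42));
+//     *p = 7;  // behaves like std::unique_ptr<int>; deletes on scope exit
+// )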
+
+// RocksDB: use unique_ptr to work around some clang-analyze false reports
+template <typename T>
+using scoped_ptr = std::unique_ptr<T>;
+/*
+// This implementation of scoped_ptr is PARTIAL - it only contains
+// enough stuff to satisfy Google Test's need.
+template <typename T>
+class scoped_ptr {
+ public:
+  typedef T element_type;
+
+  explicit scoped_ptr(T* p = NULL) : ptr_(p) {}
+  ~scoped_ptr() { reset(); }
+
+  T& operator*() const { return *ptr_; }
+  T* operator->() const { return ptr_; }
+  T* get() const { return ptr_; }
+
+  T* release() {
+    T* const ptr = ptr_;
+    ptr_ = NULL;
+    return ptr;
+  }
+
+  void reset(T* p = NULL) {
+    if (p != ptr_) {
+      if (IsTrue(sizeof(T) > 0)) {  // Makes sure T is a complete type.
+        delete ptr_;
+      }
+      ptr_ = p;
+    }
+  }
+
+  friend void swap(scoped_ptr& a, scoped_ptr& b) {
+    using std::swap;
+    swap(a.ptr_, b.ptr_);
+  }
+
+ private:
+  T* ptr_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr);
+};
+*/
+
+// Defines RE.
+
+#if GTEST_USES_PCRE
+// if used, PCRE is injected by custom/gtest-port.h
+#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE
+
+// A simple C++ wrapper for <regex.h>.  It uses the POSIX Extended
+// Regular Expression syntax.
+class GTEST_API_ RE {
+ public:
+  // A copy constructor is required by the Standard to initialize object
+  // references from r-values.
+  RE(const RE& other) { Init(other.pattern()); }
+
+  // Constructs an RE from a string.
+  RE(const ::std::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+# if GTEST_HAS_GLOBAL_STRING
+
+  RE(const ::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+# endif  // GTEST_HAS_GLOBAL_STRING
+
+  RE(const char* regex) { Init(regex); }  // NOLINT
+  ~RE();
+
+  // Returns the string representation of the regex.
+  const char* pattern() const { return pattern_; }
+
+  // FullMatch(str, re) returns true iff regular expression re matches
+  // the entire str.
+  // PartialMatch(str, re) returns true iff regular expression re
+  // matches a substring of str (including str itself).
+  //
+  // FIXME: make FullMatch() and PartialMatch() work
+  // when str contains NUL characters.
+  static bool FullMatch(const ::std::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::std::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+# if GTEST_HAS_GLOBAL_STRING
+
+  static bool FullMatch(const ::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+# endif  // GTEST_HAS_GLOBAL_STRING
+
+  static bool FullMatch(const char* str, const RE& re);
+  static bool PartialMatch(const char* str, const RE& re);
+
+ private:
+  void Init(const char* regex);
+
+  // We use a const char* instead of an std::string, as Google Test used to be
+  // used where std::string is not available.  FIXME: change to
+  // std::string.
+  const char* pattern_;
+  bool is_valid_;
+
+# if GTEST_USES_POSIX_RE
+
+  regex_t full_regex_;     // For FullMatch().
+  regex_t partial_regex_;  // For PartialMatch().
+
+# else  // GTEST_USES_SIMPLE_RE
+
+  const char* full_pattern_;  // For FullMatch();
+
+# endif
+
+  GTEST_DISALLOW_ASSIGN_(RE);
+};
+
+#endif  // GTEST_USES_PCRE
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+                                                               int line);
+
+// Defines logging utilities:
+//   GTEST_LOG_(severity) - logs messages at the specified severity level. The
+//                          message itself is streamed into the macro.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+
+enum GTestLogSeverity {
+  GTEST_INFO,
+  GTEST_WARNING,
+  GTEST_ERROR,
+  GTEST_FATAL
+};
+
+// Formats log entry severity, provides a stream object for streaming the
+// log message, and terminates the message with a newline when going out of
+// scope.
+class GTEST_API_ GTestLog {
+ public:
+  GTestLog(GTestLogSeverity severity, const char* file, int line);
+
+  // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+  ~GTestLog();
+
+  ::std::ostream& GetStream() { return ::std::cerr; }
+
+ private:
+  const GTestLogSeverity severity_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+};
+
+#if !defined(GTEST_LOG_)
+
+# define GTEST_LOG_(severity) \
+    ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+                                  __FILE__, __LINE__).GetStream()
+
+inline void LogToStderr() {}
+inline void FlushInfoLog() { fflush(NULL); }
+
+#endif  // !defined(GTEST_LOG_)
+
+#if !defined(GTEST_CHECK_)
+// INTERNAL IMPLEMENTATION - DO NOT USE.
+//
+// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
+// is not satisfied.
+// Synopsis:
+//   GTEST_CHECK_(boolean_condition);
+//     or
+//   GTEST_CHECK_(boolean_condition) << "Additional message";
+//
+// This checks the condition and if the condition is not satisfied
+// it prints message about the condition violation, including the
+// condition itself, plus additional message streamed into it, if any,
+// and then it aborts the program.  It aborts the program irrespective of
+// whether it is built in the debug mode or not.
+# define GTEST_CHECK_(condition) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::IsTrue(condition)) \
+      ; \
+    else \
+      GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+#endif  // !defined(GTEST_CHECK_)
+
+// An all-mode assert to verify that the given POSIX-style function
+// call returns 0 (indicating success).  Known limitation: this
+// doesn't expand to a balanced 'if' statement, so enclose the macro
+// in {} if you need to use it as the only statement in an 'if'
+// branch.
+#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
+  if (const int gtest_error = (posix_call)) \
+    GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
+                      << gtest_error
+
+// Adds reference to a type if it is not a reference type,
+// otherwise leaves it unchanged.  This is the same as
+// tr1::add_reference, which is not widely available yet.
+template <typename T>
+struct AddReference { typedef T& type; };  // NOLINT
+template <typename T>
+struct AddReference<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper around AddReference that works when the argument T
+// depends on template parameters.
+#define GTEST_ADD_REFERENCE_(T) \
+    typename ::testing::internal::AddReference<T>::type
+
+// Transforms "T" into "const T&" according to standard reference collapsing
+// rules (this is only needed as a backport for C++98 compilers that do not
+// support reference collapsing).  Specifically, it transforms:
+//
+//   char         ==> const char&
+//   const char   ==> const char&
+//   char&        ==> char&
+//   const char&  ==> const char&
+//
+// Note that the non-const reference will not have "const" added. This is
+// standard, and necessary so that "T" can always bind to "const T&".
+template <typename T>
+struct ConstRef { typedef const T& type; };
+template <typename T>
+struct ConstRef<T&> { typedef T& type; };
+
+// The argument T must depend on some template parameters.
+#define GTEST_REFERENCE_TO_CONST_(T) \
+    typename ::testing::internal::ConstRef<T>::type
+
+#if GTEST_HAS_STD_MOVE_
+using std::forward;
+using std::move;
+
+template <typename T>
+struct RvalueRef {
+  typedef T&& type;
+};
+#else  // GTEST_HAS_STD_MOVE_
+template <typename T>
+const T& move(const T& t) {
+  return t;
+}
+template <typename T>
+GTEST_ADD_REFERENCE_(T) forward(GTEST_ADD_REFERENCE_(T) t) { return t; }
+
+template <typename T>
+struct RvalueRef {
+  typedef const T& type;
+};
+#endif  // GTEST_HAS_STD_MOVE_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Use ImplicitCast_ as a safe version of static_cast for upcasting in
+// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
+// const Foo*).  When you use ImplicitCast_, the compiler checks that
+// the cast is safe.  Such explicit ImplicitCast_s are necessary in
+// surprisingly many situations where C++ demands an exact type match
+// instead of an argument type convertible to a target type.
+//
+// The syntax for using ImplicitCast_ is the same as for static_cast:
+//
+//   ImplicitCast_<ToType>(expr)
+//
+// ImplicitCast_ would have been part of the C++ standard library,
+// but the proposal was submitted too late.  It will probably make
+// its way into the language in the future.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., implicit_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template <typename To>
+inline To ImplicitCast_(To x) { return x; }
+
+// When you upcast (that is, cast a pointer from type Foo to type
+// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
+// always succeed.  When you downcast (that is, cast a pointer from
+// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
+// how do you know the pointer is really of type SubclassOfFoo?  It
+// could be a bare Foo, or of type DifferentSubclassOfFoo.  Thus,
+// when you downcast, you should use this macro.  In debug mode, we
+// use dynamic_cast<> to double-check the downcast is legal (we die
+// if it's not).  In normal mode, we do the efficient static_cast<>
+// instead.  Thus, it's important to test in debug mode to make sure
+// the cast is legal!
+//    This is the only place in the code we should use dynamic_cast<>.
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to
+// do RTTI (eg code like this:
+//    if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
+//    if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
+// You should design the code some other way not to need this.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., down_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template <typename To, typename From>  // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) {  // so we only accept pointers
+  // Ensures that To is a sub-type of From *.  This test is here only
+  // for compile-time type checking, and has no overhead in an
+  // optimized build at run-time, as it will be optimized away
+  // completely.
+  GTEST_INTENTIONAL_CONST_COND_PUSH_()
+  if (false) {
+  GTEST_INTENTIONAL_CONST_COND_POP_()
+    const To to = NULL;
+    ::testing::internal::ImplicitCast_<From*>(to);
+  }
+
+#if GTEST_HAS_RTTI
+  // RTTI: debug mode only!
+  GTEST_CHECK_(f == NULL || dynamic_cast<To>(f) != NULL);
+#endif
+  return static_cast<To>(f);
+}
+
+// Downcasts the pointer of type Base to Derived.
+// Derived must be a subclass of Base.  The parameter MUST
+// point to a class of type Derived, not any subclass of it.
+// When RTTI is available, the function performs a runtime
+// check to enforce this.
+template <class Derived, class Base>
+Derived* CheckedDowncastToActualType(Base* base) {
+#if GTEST_HAS_RTTI
+  GTEST_CHECK_(typeid(*base) == typeid(Derived));
+#endif
+
+#if GTEST_HAS_DOWNCAST_
+  return ::down_cast<Derived*>(base);
+#elif GTEST_HAS_RTTI
+  return dynamic_cast<Derived*>(base);  // NOLINT
+#else
+  return static_cast<Derived*>(base);  // Poor man's downcast.
+#endif
+}
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Defines the stderr capturer:
+//   CaptureStdout     - starts capturing stdout.
+//   GetCapturedStdout - stops capturing stdout and returns the captured string.
+//   CaptureStderr     - starts capturing stderr.
+//   GetCapturedStderr - stops capturing stderr and returns the captured string.
+//
+GTEST_API_ void CaptureStdout();
+GTEST_API_ std::string GetCapturedStdout();
+GTEST_API_ void CaptureStderr();
+GTEST_API_ std::string GetCapturedStderr();
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+// Returns the size (in bytes) of a file.
+GTEST_API_ size_t GetFileSize(FILE* file);
+
+// Reads the entire content of a file as a string.
+GTEST_API_ std::string ReadEntireFile(FILE* file);
+
+// All command line arguments.
+GTEST_API_ std::vector<std::string> GetArgvs();
+
+#if GTEST_HAS_DEATH_TEST
+
+std::vector<std::string> GetInjectableArgvs();
+// Deprecated: pass the args vector by value instead.
+void SetInjectableArgvs(const std::vector<std::string>* new_argvs);
+void SetInjectableArgvs(const std::vector<std::string>& new_argvs);
+#if GTEST_HAS_GLOBAL_STRING
+void SetInjectableArgvs(const std::vector< ::string>& new_argvs);
+#endif  // GTEST_HAS_GLOBAL_STRING
+void ClearInjectableArgvs();
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Defines synchronization primitives.
+#if GTEST_IS_THREADSAFE
+# if GTEST_HAS_PTHREAD
+// Sleeps for (roughly) n milliseconds.  This function is only for testing
+// Google Test's own constructs.  Don't use it in user tests, either
+// directly or indirectly.
+inline void SleepMilliseconds(int n) {
+  const timespec time = {
+    0,                  // 0 seconds.
+    n * 1000L * 1000L,  // And n ms.
+  };
+  nanosleep(&time, NULL);
+}
+# endif  // GTEST_HAS_PTHREAD
+
+# if GTEST_HAS_NOTIFICATION_
+// Notification has already been imported into the namespace.
+// Nothing to do here.
+
+# elif GTEST_HAS_PTHREAD
+// Allows a controller thread to pause execution of newly created
+// threads until notified.  Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs.  Do not
+// use it in user tests, either directly or indirectly.
+class Notification {
+ public:
+  Notification() : notified_(false) {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+  }
+  ~Notification() {
+    pthread_mutex_destroy(&mutex_);
+  }
+
+  // Notifies all threads created with this notification to start. Must
+  // be called from the controller thread.
+  void Notify() {
+    pthread_mutex_lock(&mutex_);
+    notified_ = true;
+    pthread_mutex_unlock(&mutex_);
+  }
+
+  // Blocks until the controller thread notifies.  Must be called from a test
+  // thread.
+  void WaitForNotification() {
+    for (;;) {
+      pthread_mutex_lock(&mutex_);
+      const bool notified = notified_;
+      pthread_mutex_unlock(&mutex_);
+      if (notified)
+        break;
+      SleepMilliseconds(10);
+    }
+  }
+
+ private:
+  pthread_mutex_t mutex_;
+  bool notified_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+};
+
+# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+
+GTEST_API_ void SleepMilliseconds(int n);
+
+// Provides leak-safe Windows kernel handle ownership.
+// Used in death tests and in threading support.
+class GTEST_API_ AutoHandle {
+ public:
+  // Assume that Win32 HANDLE type is equivalent to void*. Doing so allows us
+  // to avoid including <windows.h> in this header file. Including <windows.h>
+  // is undesirable because it defines a lot of symbols and macros that tend to
+  // conflict with client code. This assumption is verified by
+  // WindowsTypesTest.HANDLEIsVoidStar.
+  typedef void* Handle;
+  AutoHandle();
+  explicit AutoHandle(Handle handle);
+
+  ~AutoHandle();
+
+  Handle Get() const;
+  void Reset();
+  void Reset(Handle handle);
+
+ private:
+  // Returns true iff the handle is a valid handle object that can be closed.
+  bool IsCloseable() const;
+
+  Handle handle_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+};
+
+// Allows a controller thread to pause execution of newly created
+// threads until notified.  Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs.  Do not
+// use it in user tests, either directly or indirectly.
+class GTEST_API_ Notification {
+ public:
+  Notification();
+  void Notify();
+  void WaitForNotification();
+
+ private:
+  AutoHandle event_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+};
+# endif  // GTEST_HAS_NOTIFICATION_
+
+// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD
+// defined, but we don't want to use MinGW's pthreads implementation, which
+// has conformance problems with some versions of the POSIX standard.
+# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+
+// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
+// Consequently, it cannot select a correct instantiation of ThreadWithParam
+// in order to call its Run(). Introducing ThreadWithParamBase as a
+// non-templated base class for ThreadWithParam allows us to bypass this
+// problem.
+class ThreadWithParamBase {
+ public:
+  virtual ~ThreadWithParamBase() {}
+  virtual void Run() = 0;
+};
+
+// pthread_create() accepts a pointer to a function type with the C linkage.
+// According to the Standard (7.5/1), function types with different linkages
+// are different even if they are otherwise identical.  Some compilers (for
+// example, SunStudio) treat them as different types.  Since class methods
+// cannot be defined with C-linkage we need to define a free C-function to
+// pass into pthread_create().
+extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
+  static_cast<ThreadWithParamBase*>(thread)->Run();
+  return NULL;
+}
+
+// Helper class for testing Google Test's multi-threading constructs.
+// To use it, write:
+//
+//   void ThreadFunc(int param) { /* Do things with param */ }
+//   Notification thread_can_start;
+//   ...
+//   // The thread_can_start parameter is optional; you can supply NULL.
+//   ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);
+//   thread_can_start.Notify();
+//
+// These classes are only for testing Google Test's own constructs.  Do
+// not use them in user tests, either directly or indirectly.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+  typedef void UserThreadFunc(T);
+
+  ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
+      : func_(func),
+        param_(param),
+        thread_can_start_(thread_can_start),
+        finished_(false) {
+    ThreadWithParamBase* const base = this;
+    // The thread can be created only after all fields except thread_
+    // have been initialized.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base));
+  }
+  ~ThreadWithParam() { Join(); }
+
+  void Join() {
+    if (!finished_) {
+      GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0));
+      finished_ = true;
+    }
+  }
+
+  virtual void Run() {
+    if (thread_can_start_ != NULL)
+      thread_can_start_->WaitForNotification();
+    func_(param_);
+  }
+
+ private:
+  UserThreadFunc* const func_;  // User-supplied thread function.
+  const T param_;  // User-supplied parameter to the thread function.
+  // When non-NULL, used to block execution until the controller thread
+  // notifies.
+  Notification* const thread_can_start_;
+  bool finished_;  // true iff we know that the thread function has finished.
+  pthread_t thread_;  // The native thread object.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+};
+# endif  // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
+         // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+
+# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+// Mutex and ThreadLocal have already been imported into the namespace.
+// Nothing to do here.
+
+# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+
+// Mutex implements mutex on Windows platforms.  It is used in conjunction
+// with class MutexLock:
+//
+//   Mutex mutex;
+//   ...
+//   MutexLock lock(&mutex);  // Acquires the mutex and releases it at the
+//                            // end of the current scope.
+//
+// A static Mutex *must* be defined or declared using one of the following
+// macros:
+//   GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
+//   GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex);
+//
+// (A non-static Mutex is defined/declared in the usual way).
+class GTEST_API_ Mutex {
+ public:
+  enum MutexType { kStatic = 0, kDynamic = 1 };
+  // We rely on kStaticMutex being 0 as it is to what the linker initializes
+  // type_ in static mutexes.  critical_section_ will be initialized lazily
+  // in ThreadSafeLazyInit().
+  enum StaticConstructorSelector { kStaticMutex = 0 };
+
+  // This constructor intentionally does nothing.  It relies on type_ being
+  // statically initialized to 0 (effectively setting it to kStatic) and on
+  // ThreadSafeLazyInit() to lazily initialize the rest of the members.
+  explicit Mutex(StaticConstructorSelector /*dummy*/) {}
+
+  Mutex();
+  ~Mutex();
+
+  void Lock();
+
+  void Unlock();
+
+  // Does nothing if the current thread holds the mutex. Otherwise, crashes
+  // with high probability.
+  void AssertHeld();
+
+ private:
+  // Initializes owner_thread_id_ and critical_section_ in static mutexes.
+  void ThreadSafeLazyInit();
+
+  // Per https://blogs.msdn.microsoft.com/oldnewthing/20040223-00/?p=40503,
+  // we assume that 0 is an invalid value for thread IDs.
+  unsigned int owner_thread_id_;
+
+  // For static mutexes, we rely on these members being initialized to zeros
+  // by the linker.
+ MutexType type_; + long critical_section_init_phase_; // NOLINT + GTEST_CRITICAL_SECTION* critical_section_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); +}; + +# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex) + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex* mutex) + : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + Mutex* const mutex_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); +}; + +typedef GTestMutexLock MutexLock; + +// Base class for ValueHolder. Allows a caller to hold and delete a value +// without knowing its type. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Provides a way for a thread to send notifications to a ThreadLocal +// regardless of its parameter type. +class ThreadLocalBase { + public: + // Creates a new ValueHolder object holding a default value passed to + // this ThreadLocal's constructor and returns it. It is the caller's + // responsibility not to call this when the ThreadLocal instance already + // has a value on the current thread. + virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const = 0; + + protected: + ThreadLocalBase() {} + virtual ~ThreadLocalBase() {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase); +}; + +// Maps a thread to a set of ThreadLocals that have values instantiated on that +// thread and notifies them when the thread exits. A ThreadLocal instance is +// expected to persist until all threads it has values on have terminated. +class GTEST_API_ ThreadLocalRegistry { + public: + // Registers thread_local_instance as having value on the current thread. + // Returns a value that can be used to identify the thread from other threads. + static ThreadLocalValueHolderBase* GetValueOnCurrentThread( + const ThreadLocalBase* thread_local_instance); + + // Invoked when a ThreadLocal instance is destroyed. + static void OnThreadLocalDestroyed( + const ThreadLocalBase* thread_local_instance); +}; + +class GTEST_API_ ThreadWithParamBase { + public: + void Join(); + + protected: + class Runnable { + public: + virtual ~Runnable() {} + virtual void Run() = 0; + }; + + ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start); + virtual ~ThreadWithParamBase(); + + private: + AutoHandle thread_; +}; + +// Helper class for testing Google Test's multi-threading constructs. 
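+// A usage sketch, mirroring the pthreads version above (ThreadFunc and the
+// argument values are illustrative, not part of this header):
+//
+//   void ThreadFunc(int param) { /* Do things with param */ }
+//   Notification thread_can_start;
+//   ...
+//   ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);
+//   thread_can_start.Notify();
+//   thread.Join();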
+template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void UserThreadFunc(T); + + ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start) + : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) { + } + virtual ~ThreadWithParam() {} + + private: + class RunnableImpl : public Runnable { + public: + RunnableImpl(UserThreadFunc* func, T param) + : func_(func), + param_(param) { + } + virtual ~RunnableImpl() {} + virtual void Run() { + func_(param_); + } + + private: + UserThreadFunc* const func_; + const T param_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl); + }; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); +}; + +// Implements thread-local storage on Windows systems. +// +// // Thread 1 +// ThreadLocal tl(100); // 100 is the default value for each thread. +// +// // Thread 2 +// tl.set(150); // Changes the value for thread 2 only. +// EXPECT_EQ(150, tl.get()); +// +// // Thread 1 +// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value. +// tl.set(200); +// EXPECT_EQ(200, tl.get()); +// +// The template type argument T must have a public copy constructor. +// In addition, the default ThreadLocal constructor requires T to have +// a public default constructor. +// +// The users of a TheadLocal instance have to make sure that all but one +// threads (including the main one) using that instance have exited before +// destroying it. Otherwise, the per-thread objects managed for them by the +// ThreadLocal instance are not guaranteed to be destroyed on all platforms. +// +// Google Test only uses global ThreadLocal objects. That means they +// will die after main() has returned. Therefore, no per-thread +// object managed by Google Test will be leaked as long as all threads +// using Google Test have exited when main() returns. +template +class ThreadLocal : public ThreadLocalBase { + public: + ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T& value) + : default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); } + + T* pointer() { return GetOrCreateValue(); } + const T* pointer() const { return GetOrCreateValue(); } + const T& get() const { return *pointer(); } + void set(const T& value) { *pointer() = value; } + + private: + // Holds a value of T. Can be deleted via its base class without the caller + // knowing the type of T. 
+ class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T& value) : value_(value) {} + + T* pointer() { return &value_; } + + private: + T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + }; + + + T* GetOrCreateValue() const { + return static_cast( + ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer(); + } + + virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const { + return default_factory_->MakeNewHolder(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T& value) : value_(value) {} + virtual ValueHolder* MakeNewHolder() const { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + }; + + scoped_ptr default_factory_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); +}; + +# elif GTEST_HAS_PTHREAD + +// MutexBase and Mutex implement mutex on pthreads-based platforms. +class MutexBase { + public: + // Acquires this mutex. + void Lock() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_)); + owner_ = pthread_self(); + has_owner_ = true; + } + + // Releases this mutex. + void Unlock() { + // Since the lock is being released the owner_ field should no longer be + // considered valid. We don't protect writing to has_owner_ here, as it's + // the caller's responsibility to ensure that the current thread holds the + // mutex when this is called. + has_owner_ = false; + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_)); + } + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. + void AssertHeld() const { + GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self())) + << "The current thread is not holding the mutex @" << this; + } + + // A static mutex may be used before main() is entered. It may even + // be used before the dynamic initialization stage. Therefore we + // must be able to initialize a static mutex object at link time. + // This means MutexBase has to be a POD and its member variables + // have to be public. + public: + pthread_mutex_t mutex_; // The underlying pthread mutex. + // has_owner_ indicates whether the owner_ field below contains a valid thread + // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All + // accesses to the owner_ field should be protected by a check of this field. + // An alternative might be to memset() owner_ to all zeros, but there's no + // guarantee that a zero'd pthread_t is necessarily invalid or even different + // from pthread_self(). + bool has_owner_; + pthread_t owner_; // The thread holding the mutex. +}; + +// Forward-declares a static mutex. +# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::MutexBase mutex + +// Defines and statically (i.e. at link time) initializes a static mutex. 
+// The initialization list here does not explicitly initialize each field, +// instead relying on default initialization for the unspecified fields. In +// particular, the owner_ field (a pthread_t) is not explicitly initialized. +// This allows initialization to work whether pthread_t is a scalar or struct. +// The flag -Wmissing-field-initializers must not be specified for this to work. +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::MutexBase mutex = {PTHREAD_MUTEX_INITIALIZER, false, 0} + +// The Mutex class can only be used for mutexes created at runtime. It +// shares its API with MutexBase otherwise. +class Mutex : public MutexBase { + public: + Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); + has_owner_ = false; + } + ~Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); + } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); +}; + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(MutexBase* mutex) + : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + MutexBase* const mutex_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); +}; + +typedef GTestMutexLock MutexLock; + +// Helpers for ThreadLocal. + +// pthread_key_create() requires DeleteThreadLocalValue() to have +// C-linkage. Therefore it cannot be templatized to access +// ThreadLocal. Hence the need for class +// ThreadLocalValueHolderBase. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Called by pthread to delete thread-local data stored by +// pthread_setspecific(). +extern "C" inline void DeleteThreadLocalValue(void* value_holder) { + delete static_cast(value_holder); +} + +// Implements thread-local storage on pthreads-based systems. +template +class GTEST_API_ ThreadLocal { + public: + ThreadLocal() + : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T& value) + : key_(CreateKey()), + default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() { + // Destroys the managed object for the current thread, if any. + DeleteThreadLocalValue(pthread_getspecific(key_)); + + // Releases resources associated with the key. This will *not* + // delete managed objects for other threads. + GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_)); + } + + T* pointer() { return GetOrCreateValue(); } + const T* pointer() const { return GetOrCreateValue(); } + const T& get() const { return *pointer(); } + void set(const T& value) { *pointer() = value; } + + private: + // Holds a value of type T. + class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T& value) : value_(value) {} + + T* pointer() { return &value_; } + + private: + T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + }; + + static pthread_key_t CreateKey() { + pthread_key_t key; + // When a thread exits, DeleteThreadLocalValue() will be called on + // the object managed for that thread. 
+ GTEST_CHECK_POSIX_SUCCESS_( + pthread_key_create(&key, &DeleteThreadLocalValue)); + return key; + } + + T* GetOrCreateValue() const { + ThreadLocalValueHolderBase* const holder = + static_cast(pthread_getspecific(key_)); + if (holder != NULL) { + return CheckedDowncastToActualType(holder)->pointer(); + } + + ValueHolder* const new_holder = default_factory_->MakeNewHolder(); + ThreadLocalValueHolderBase* const holder_base = new_holder; + GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base)); + return new_holder->pointer(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T& value) : value_(value) {} + virtual ValueHolder* MakeNewHolder() const { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + }; + + // A key pthreads uses for looking up per-thread values. + const pthread_key_t key_; + scoped_ptr default_factory_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); +}; + +# endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ + +#else // GTEST_IS_THREADSAFE + +// A dummy implementation of synchronization primitives (mutex, lock, +// and thread-local variable). Necessary for compiling Google Test where +// mutex is not supported - using Google Test in multiple threads is not +// supported on such platforms. + +class Mutex { + public: + Mutex() {} + void Lock() {} + void Unlock() {} + void AssertHeld() const {} +}; + +# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex*) {} // NOLINT +}; + +typedef GTestMutexLock MutexLock; + +template +class GTEST_API_ ThreadLocal { + public: + ThreadLocal() : value_() {} + explicit ThreadLocal(const T& value) : value_(value) {} + T* pointer() { return &value_; } + const T* pointer() const { return &value_; } + const T& get() const { return value_; } + void set(const T& value) { value_ = value; } + private: + T value_; +}; + +#endif // GTEST_IS_THREADSAFE + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +GTEST_API_ size_t GetThreadCount(); + +// Passing non-POD classes through ellipsis (...) crashes the ARM +// compiler and generates a warning in Sun Studio before 12u4. The Nokia Symbian +// and the IBM XL C/C++ compiler try to instantiate a copy constructor +// for objects passed through ellipsis (...), failing for uncopyable +// objects. 
We define this to ensure that only POD is passed through
+// ellipsis on these systems.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || \
+    (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x5130)
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_ELLIPSIS_NEEDS_POD_ 1
+#else
+# define GTEST_CAN_COMPARE_NULL 1
+#endif
+
+// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between
+// const T& and const T* in a function template. These compilers
+// _can_ decide between class template specializations for T and T*,
+// so a tr1::type_traits-like is_pointer works.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__)
+# define GTEST_NEEDS_IS_POINTER_ 1
+#endif
+
+template <bool bool_value>
+struct bool_constant {
+  typedef bool_constant<bool_value> type;
+  static const bool value = bool_value;
+};
+template <bool bool_value> const bool bool_constant<bool_value>::value;
+
+typedef bool_constant<false> false_type;
+typedef bool_constant<true> true_type;
+
+template <typename T, typename U>
+struct is_same : public false_type {};
+
+template <typename T>
+struct is_same<T, T> : public true_type {};
+
+
+template <typename T>
+struct is_pointer : public false_type {};
+
+template <typename T>
+struct is_pointer<T*> : public true_type {};
+
+template <typename Iterator>
+struct IteratorTraits {
+  typedef typename Iterator::value_type value_type;
+};
+
+
+template <typename T>
+struct IteratorTraits<T*> {
+  typedef T value_type;
+};
+
+template <typename T>
+struct IteratorTraits<const T*> {
+  typedef T value_type;
+};
+
+#if GTEST_OS_WINDOWS
+# define GTEST_PATH_SEP_ "\\"
+# define GTEST_HAS_ALT_PATH_SEP_ 1
+// The biggest signed integer type the compiler supports.
+typedef __int64 BiggestInt;
+#else
+# define GTEST_PATH_SEP_ "/"
+# define GTEST_HAS_ALT_PATH_SEP_ 0
+typedef long long BiggestInt;  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+// Utilities for char.
+
+// isspace(int ch) and friends accept an unsigned char or EOF. char
+// may be signed, depending on the compiler (or compiler flags).
+// Therefore we need to cast a char to unsigned char before calling
+// isspace(), etc.
+
+inline bool IsAlpha(char ch) {
+  return isalpha(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsAlNum(char ch) {
+  return isalnum(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsDigit(char ch) {
+  return isdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsLower(char ch) {
+  return islower(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsSpace(char ch) {
+  return isspace(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsUpper(char ch) {
+  return isupper(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(char ch) {
+  return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(wchar_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
+
+inline char ToLower(char ch) {
+  return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+}
+inline char ToUpper(char ch) {
+  return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
+}
+
+inline std::string StripTrailingSpaces(std::string str) {
+  std::string::iterator it = str.end();
+  while (it != str.begin() && IsSpace(*--it))
+    it = str.erase(it);
+  return str;
+}
+
+// The testing::internal::posix namespace holds wrappers for common
+// POSIX functions. These wrappers hide the differences between
+// Windows/MSVC and POSIX systems. Since some compilers define these
+// standard functions as macros, the wrapper cannot have the same name
+// as the wrapped function.
+
+namespace posix {
+
+// Functions with a different name on Windows.
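+//
+// For example (an illustrative sketch, not part of the original header),
+// portable test code can write
+//
+//   if (posix::StrCaseCmp(lhs, rhs) == 0) { ... }      // _stricmp/strcasecmp
+//   if (posix::IsATTY(posix::FileNo(stdout))) { ... }  // _isatty/isatty
+//
+// instead of selecting the MSVC or POSIX spelling by hand.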
+ +#if GTEST_OS_WINDOWS + +typedef struct _stat StatStruct; + +# ifdef __BORLANDC__ +inline int IsATTY(int fd) { return isatty(fd); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +# else // !__BORLANDC__ +# if GTEST_OS_WINDOWS_MOBILE +inline int IsATTY(int /* fd */) { return 0; } +# else +inline int IsATTY(int fd) { return _isatty(fd); } +# endif // GTEST_OS_WINDOWS_MOBILE +inline int StrCaseCmp(const char* s1, const char* s2) { + return _stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return _strdup(src); } +# endif // __BORLANDC__ + +# if GTEST_OS_WINDOWS_MOBILE +inline int FileNo(FILE* file) { return reinterpret_cast(_fileno(file)); } +// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this +// time and thus not defined there. +# else +inline int FileNo(FILE* file) { return _fileno(file); } +inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } +inline int RmDir(const char* dir) { return _rmdir(dir); } +inline bool IsDir(const StatStruct& st) { + return (_S_IFDIR & st.st_mode) != 0; +} +# endif // GTEST_OS_WINDOWS_MOBILE + +#else + +typedef struct stat StatStruct; + +inline int FileNo(FILE* file) { return fileno(file); } +inline int IsATTY(int fd) { return isatty(fd); } +inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return strcasecmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +inline int RmDir(const char* dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } + +#endif // GTEST_OS_WINDOWS + +// Functions deprecated by MSVC 8.0. + +GTEST_DISABLE_MSC_DEPRECATED_PUSH_() + +inline const char* StrNCpy(char* dest, const char* src, size_t n) { + return strncpy(dest, src, n); +} + +// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and +// StrError() aren't needed on Windows CE at this time and thus not +// defined there. + +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT +inline int ChDir(const char* dir) { return chdir(dir); } +#endif +inline FILE* FOpen(const char* path, const char* mode) { + return fopen(path, mode); +} +#if !GTEST_OS_WINDOWS_MOBILE +inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { + return freopen(path, mode, stream); +} +inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } +#endif +inline int FClose(FILE* fp) { return fclose(fp); } +#if !GTEST_OS_WINDOWS_MOBILE +inline int Read(int fd, void* buf, unsigned int count) { + return static_cast(read(fd, buf, count)); +} +inline int Write(int fd, const void* buf, unsigned int count) { + return static_cast(write(fd, buf, count)); +} +inline int Close(int fd) { return close(fd); } +inline const char* StrError(int errnum) { return strerror(errnum); } +#endif +inline const char* GetEnv(const char* name) { +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT + // We are on Windows CE, which has no environment variables. + static_cast(name); // To prevent 'unused argument' warning. + return NULL; +#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) + // Environment variables which we programmatically clear will be set to the + // empty string rather than unset (NULL). Handle that case. + const char* const env = getenv(name); + return (env != NULL && env[0] != '\0') ? 
env : NULL;
+#else
+  return getenv(name);
+#endif
+}
+
+GTEST_DISABLE_MSC_DEPRECATED_POP_()
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Windows CE has no C library. The abort() function is used in
+// several places in Google Test. This implementation provides a reasonable
+// imitation of standard behaviour.
+void Abort();
+#else
+inline void Abort() { abort(); }
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+}  // namespace posix
+
+// MSVC "deprecates" snprintf and issues warnings wherever it is used. In
+// order to avoid these warnings, we need to use _snprintf or _snprintf_s on
+// MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate
+// function in order to achieve that. We use macro definition here because
+// snprintf is a variadic function.
+#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+// MSVC 2005 and above support variadic macros.
+# define GTEST_SNPRINTF_(buffer, size, format, ...) \
+     _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#elif defined(_MSC_VER)
+// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't
+// complain about _snprintf.
+# define GTEST_SNPRINTF_ _snprintf
+#else
+# define GTEST_SNPRINTF_ snprintf
+#endif
+
+// The maximum number a BiggestInt can represent. This definition
+// works no matter BiggestInt is represented in one's complement or
+// two's complement.
+//
+// We cannot rely on numeric_limits in STL, as __int64 and long long
+// are not part of standard C++ and numeric_limits doesn't need to be
+// defined for them.
+const BiggestInt kMaxBiggestInt =
+    ~(static_cast<BiggestInt>(1) << (8*sizeof(BiggestInt) - 1));
+
+// This template class serves as a compile-time function from size to
+// type. It maps a size in bytes to a primitive type with that
+// size. e.g.
+//
+//   TypeWithSize<4>::UInt
+//
+// is typedef-ed to be unsigned int (unsigned integer made up of 4
+// bytes).
+//
+// Such functionality should belong to STL, but I cannot find it
+// there.
+//
+// Google Test uses this class in the implementation of floating-point
+// comparison.
+//
+// For now it only handles UInt (unsigned int) as that's all Google Test
+// needs. Other types can be easily added in the future if need
+// arises.
+template <size_t size>
+class TypeWithSize {
+ public:
+  // This prevents the user from using TypeWithSize<N> with incorrect
+  // values of N.
+  typedef void UInt;
+};
+
+// The specialization for size 4.
+template <>
+class TypeWithSize<4> {
+ public:
+  // unsigned int has size 4 in both gcc and MSVC.
+  //
+  // As base/basictypes.h doesn't compile on Windows, we cannot use
+  // uint32, uint64, and etc here.
+  typedef int Int;
+  typedef unsigned int UInt;
+};
+
+// The specialization for size 8.
+template <>
+class TypeWithSize<8> {
+ public:
+#if GTEST_OS_WINDOWS
+  typedef __int64 Int;
+  typedef unsigned __int64 UInt;
+#else
+  typedef long long Int;  // NOLINT
+  typedef unsigned long long UInt;  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+};
+
+// Integer types of known sizes.
+typedef TypeWithSize<4>::Int Int32;
+typedef TypeWithSize<4>::UInt UInt32;
+typedef TypeWithSize<8>::Int Int64;
+typedef TypeWithSize<8>::UInt UInt64;
+typedef TypeWithSize<8>::Int TimeInMillis;  // Represents time in milliseconds.
+
+// Utilities for command line flags and environment variables.
+
+// Macro for referencing flags.
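+//
+// For instance, with the default definition below, GTEST_FLAG(output) expands
+// to the variable name FLAGS_gtest_output (a sketch, assuming no custom
+// GTEST_FLAG macro has been injected by the embedder):
+//
+//   GTEST_DECLARE_string_(output);        // declares FLAGS_gtest_output
+//   std::string s = GTEST_FLAG(output);   // reads it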
+#if !defined(GTEST_FLAG)
+# define GTEST_FLAG(name) FLAGS_gtest_##name
+#endif  // !defined(GTEST_FLAG)
+
+#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
+# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
+#endif  // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
+
+#if !defined(GTEST_DECLARE_bool_)
+# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+
+// Macros for declaring flags.
+# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
+# define GTEST_DECLARE_int32_(name) \
+    GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name)
+# define GTEST_DECLARE_string_(name) \
+    GTEST_API_ extern ::std::string GTEST_FLAG(name)
+
+// Macros for defining flags.
+# define GTEST_DEFINE_bool_(name, default_val, doc) \
+    GTEST_API_ bool GTEST_FLAG(name) = (default_val)
+# define GTEST_DEFINE_int32_(name, default_val, doc) \
+    GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val)
+# define GTEST_DEFINE_string_(name, default_val, doc) \
+    GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
+
+#endif  // !defined(GTEST_DECLARE_bool_)
+
+// Thread annotations
+#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
+# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+# define GTEST_LOCK_EXCLUDED_(locks)
+#endif  // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
+
+// Parses 'str' for a 32-bit signed integer. If successful, writes the result
+// to *value and returns true; otherwise leaves *value unchanged and returns
+// false.
+// FIXME: Find a better way to refactor flag and environment parsing
+// out of both gtest-port.cc and gtest.cc to avoid exporting this utility
+// function.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value);
+
+// Parses a bool/Int32/string from the environment variable
+// corresponding to the given Google Test flag.
+bool BoolFromGTestEnv(const char* flag, bool default_val);
+GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val);
+std::string OutputFlagAlsoCheckEnvVar();
+const char* StringFromGTestEnv(const char* flag, const char* default_val);
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+#if GTEST_OS_LINUX
+# include <stdlib.h>
+# include <sys/types.h>
+# include <sys/wait.h>
+# include <unistd.h>
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#include <ctype.h>
+#include <float.h>
+#include <string.h>
+#include <iomanip>
+#include <limits>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the Message class. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. +// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ + +#include + + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// Ensures that there is at least one operator<< in the global namespace. +// See Message& operator<<(...) below for why. +void operator<<(const testing::internal::Secret&, int); + +namespace testing { + +// The Message class works like an ostream repeater. +// +// Typical usage: +// +// 1. You stream a bunch of values to a Message object. +// It will remember the text in a stringstream. +// 2. Then you stream the Message object to an ostream. +// This causes the text in the Message to be streamed +// to the ostream. +// +// For example; +// +// testing::Message foo; +// foo << 1 << " != " << 2; +// std::cout << foo; +// +// will print "1 != 2". +// +// Message is not intended to be inherited from. In particular, its +// destructor is not virtual. +// +// Note that stringstream behaves differently in gcc and in MSVC. You +// can stream a NULL char pointer to it in the former, but not in the +// latter (it causes an access violation if you do). The Message +// class hides this difference by treating a NULL char pointer as +// "(null)". +class GTEST_API_ Message { + private: + // The type of basic IO manipulators (endl, ends, and flush) for + // narrow streams. + typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&); + + public: + // Constructs an empty Message. + Message(); + + // Copy constructor. + Message(const Message& msg) : ss_(new ::std::stringstream) { // NOLINT + *ss_ << msg.GetString(); + } + + // Constructs a Message from a C-string. + explicit Message(const char* str) : ss_(new ::std::stringstream) { + *ss_ << str; + } + +#if GTEST_OS_SYMBIAN + // Streams a value (either a pointer or not) to this object. + template + inline Message& operator <<(const T& value) { + StreamHelper(typename internal::is_pointer::type(), value); + return *this; + } +#else + // Streams a non-pointer value to this object. + template + inline Message& operator <<(const T& val) { + // Some libraries overload << for STL containers. These + // overloads are defined in the global namespace instead of ::std. + // + // C++'s symbol lookup rule (i.e. 
Koenig lookup) says that these + // overloads are visible in either the std namespace or the global + // namespace, but not other namespaces, including the testing + // namespace which Google Test's Message class is in. + // + // To allow STL containers (and other types that has a << operator + // defined in the global namespace) to be used in Google Test + // assertions, testing::Message must access the custom << operator + // from the global namespace. With this using declaration, + // overloads of << defined in the global namespace and those + // visible via Koenig lookup are both exposed in this function. + using ::operator <<; + *ss_ << val; + return *this; + } + + // Streams a pointer value to this object. + // + // This function is an overload of the previous one. When you + // stream a pointer to a Message, this definition will be used as it + // is more specialized. (The C++ Standard, section + // [temp.func.order].) If you stream a non-pointer, then the + // previous definition will be used. + // + // The reason for this overload is that streaming a NULL pointer to + // ostream is undefined behavior. Depending on the compiler, you + // may get "0", "(nil)", "(null)", or an access violation. To + // ensure consistent result across compilers, we always treat NULL + // as "(null)". + template + inline Message& operator <<(T* const& pointer) { // NOLINT + if (pointer == NULL) { + *ss_ << "(null)"; + } else { + *ss_ << pointer; + } + return *this; + } +#endif // GTEST_OS_SYMBIAN + + // Since the basic IO manipulators are overloaded for both narrow + // and wide streams, we have to provide this specialized definition + // of operator <<, even though its body is the same as the + // templatized version above. Without this definition, streaming + // endl or other basic IO manipulators to Message will confuse the + // compiler. + Message& operator <<(BasicNarrowIoManip val) { + *ss_ << val; + return *this; + } + + // Instead of 1/0, we want to see true/false for bool values. + Message& operator <<(bool b) { + return *this << (b ? "true" : "false"); + } + + // These two overloads allow streaming a wide C string to a Message + // using the UTF-8 encoding. + Message& operator <<(const wchar_t* wide_c_str); + Message& operator <<(wchar_t* wide_c_str); + +#if GTEST_HAS_STD_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message& operator <<(const ::std::wstring& wstr); +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_GLOBAL_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message& operator <<(const ::wstring& wstr); +#endif // GTEST_HAS_GLOBAL_WSTRING + + // Gets the text streamed to this object so far as an std::string. + // Each '\0' character in the buffer is replaced with "\\0". + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + std::string GetString() const; + + private: +#if GTEST_OS_SYMBIAN + // These are needed as the Nokia Symbian Compiler cannot decide between + // const T& and const T* in a function template. The Nokia compiler _can_ + // decide between class template specializations for T and T*, so a + // tr1::type_traits-like is_pointer works, and we can overload on that. 
+ template + inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) { + if (pointer == NULL) { + *ss_ << "(null)"; + } else { + *ss_ << pointer; + } + } + template + inline void StreamHelper(internal::false_type /*is_pointer*/, + const T& value) { + // See the comments in Message& operator <<(const T&) above for why + // we need this using statement. + using ::operator <<; + *ss_ << value; + } +#endif // GTEST_OS_SYMBIAN + + // We'll hold the text streamed to this object here. + const internal::scoped_ptr< ::std::stringstream> ss_; + + // We declare (but don't implement) this to prevent the compiler + // from implementing the assignment operator. + void operator=(const Message&); +}; + +// Streams a Message to an ostream. +inline std::ostream& operator <<(std::ostream& os, const Message& sb) { + return os << sb.GetString(); +} + +namespace internal { + +// Converts a streamable value to an std::string. A NULL pointer is +// converted to "(null)". When the input value is a ::string, +// ::std::string, ::wstring, or ::std::wstring object, each NUL +// character in it is replaced with "\\0". +template +std::string StreamableToString(const T& streamable) { + return (Message() << streamable).GetString(); +} + +} // namespace internal +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Google Test filepath utilities +// +// This header file declares classes and functions used internally by +// Google Test. They are subject to change without notice. +// +// This file is #included in gtest/internal/gtest-internal.h. +// Do not include this header file separately! + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ + +// Copyright 2005, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file declares the String class and functions used internally by +// Google Test. They are subject to change without notice. They should not used +// by code external to Google Test. +// +// This header file is #included by gtest-internal.h. +// It should not be #included by other files. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ + +#ifdef __BORLANDC__ +// string.h is not guaranteed to provide strcpy on C++ Builder. +# include +#endif + +#include +#include + + +namespace testing { +namespace internal { + +// String - an abstract class holding static string utilities. +class GTEST_API_ String { + public: + // Static utility methods + + // Clones a 0-terminated C string, allocating memory using new. The + // caller is responsible for deleting the return value using + // delete[]. Returns the cloned string, or NULL if the input is + // NULL. + // + // This is different from strdup() in string.h, which allocates + // memory using malloc(). + static const char* CloneCString(const char* c_str); + +#if GTEST_OS_WINDOWS_MOBILE + // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be + // able to pass strings to Win32 APIs on CE we need to convert them + // to 'Unicode', UTF-16. + + // Creates a UTF-16 wide string from the given ANSI string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the wide string, or NULL if the + // input is NULL. + // + // The wide string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static LPCWSTR AnsiToUtf16(const char* c_str); + + // Creates an ANSI string from the given wide string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. 
Returns the ANSI string, or NULL if the + // input is NULL. + // + // The returned string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static const char* Utf16ToAnsi(LPCWSTR utf16_str); +#endif + + // Compares two C strings. Returns true iff they have the same content. + // + // Unlike strcmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CStringEquals(const char* lhs, const char* rhs); + + // Converts a wide C string to a String using the UTF-8 encoding. + // NULL will be converted to "(null)". If an error occurred during + // the conversion, "(failed to convert from wide string)" is + // returned. + static std::string ShowWideCString(const wchar_t* wide_c_str); + + // Compares two wide C strings. Returns true iff they have the same + // content. + // + // Unlike wcscmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs); + + // Compares two C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike strcasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CaseInsensitiveCStringEquals(const char* lhs, + const char* rhs); + + // Compares two wide C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike wcscasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL wide C string, + // including the empty string. + // NB: The implementations on different platforms slightly differ. + // On windows, this method uses _wcsicmp which compares according to LC_CTYPE + // environment variable. On GNU platform this method uses wcscasecmp + // which compares according to LC_CTYPE category of the current locale. + // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the + // current locale. + static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs, + const wchar_t* rhs); + + // Returns true iff the given string ends with the given suffix, ignoring + // case. Any string is considered to end with an empty suffix. + static bool EndsWithCaseInsensitive( + const std::string& str, const std::string& suffix); + + // Formats an int value as "%02d". + static std::string FormatIntWidth2(int value); // "%02d" for width == 2 + + // Formats an int value as "%X". + static std::string FormatHexInt(int value); + + // Formats a byte as "%02X". + static std::string FormatByte(unsigned char value); + + private: + String(); // Not meant to be instantiated. +}; // class String + +// Gets the content of the stringstream's buffer as an std::string. Each '\0' +// character in the buffer is replaced with "\\0". 
+GTEST_API_ std::string StringStreamToString(::std::stringstream* stream); + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { +namespace internal { + +// FilePath - a class for file and directory pathname manipulation which +// handles platform-specific conventions (like the pathname separator). +// Used for helper functions for naming files in a directory for xml output. +// Except for Set methods, all methods are const or static, which provides an +// "immutable value object" -- useful for peace of mind. +// A FilePath with a value ending in a path separator ("like/this/") represents +// a directory, otherwise it is assumed to represent a file. In either case, +// it may or may not represent an actual file or directory in the file system. +// Names are NOT checked for syntax correctness -- no checking for illegal +// characters, malformed paths, etc. + +class GTEST_API_ FilePath { + public: + FilePath() : pathname_("") { } + FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { } + + explicit FilePath(const std::string& pathname) : pathname_(pathname) { + Normalize(); + } + + FilePath& operator=(const FilePath& rhs) { + Set(rhs); + return *this; + } + + void Set(const FilePath& rhs) { + pathname_ = rhs.pathname_; + } + + const std::string& string() const { return pathname_; } + const char* c_str() const { return pathname_.c_str(); } + + // Returns the current working directory, or "" if unsuccessful. + static FilePath GetCurrentDir(); + + // Given directory = "dir", base_name = "test", number = 0, + // extension = "xml", returns "dir/test.xml". If number is greater + // than zero (e.g., 12), returns "dir/test_12.xml". + // On Windows platform, uses \ as the separator rather than /. + static FilePath MakeFileName(const FilePath& directory, + const FilePath& base_name, + int number, + const char* extension); + + // Given directory = "dir", relative_path = "test.xml", + // returns "dir/test.xml". + // On Windows, uses \ as the separator rather than /. + static FilePath ConcatPaths(const FilePath& directory, + const FilePath& relative_path); + + // Returns a pathname for a file that does not currently exist. The pathname + // will be directory/base_name.extension or + // directory/base_name_.extension if directory/base_name.extension + // already exists. The number will be incremented until a pathname is found + // that does not already exist. + // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. + // There could be a race condition if two or more processes are calling this + // function at the same time -- they could both pick the same filename. + static FilePath GenerateUniqueFileName(const FilePath& directory, + const FilePath& base_name, + const char* extension); + + // Returns true iff the path is "". + bool IsEmpty() const { return pathname_.empty(); } + + // If input name has a trailing separator character, removes it and returns + // the name, otherwise return the name string unmodified. + // On Windows platform, uses \ as the separator, other platforms use /. + FilePath RemoveTrailingPathSeparator() const; + + // Returns a copy of the FilePath with the directory part removed. + // Example: FilePath("path/to/file").RemoveDirectoryName() returns + // FilePath("file"). If there is no directory part ("just_a_file"), it returns + // the FilePath unmodified. 
If there is no file part ("just_a_dir/") it + // returns an empty FilePath (""). + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveDirectoryName() const; + + // RemoveFileName returns the directory path with the filename removed. + // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". + // If the FilePath is "a_file" or "/a_file", RemoveFileName returns + // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does + // not have a file, like "just/a/dir/", it returns the FilePath unmodified. + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveFileName() const; + + // Returns a copy of the FilePath with the case-insensitive extension removed. + // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns + // FilePath("dir/file"). If a case-insensitive extension is not + // found, returns a copy of the original FilePath. + FilePath RemoveExtension(const char* extension) const; + + // Creates directories so that path exists. Returns true if successful or if + // the directories already exist; returns false if unable to create + // directories for any reason. Will also return false if the FilePath does + // not represent a directory (that is, it doesn't end with a path separator). + bool CreateDirectoriesRecursively() const; + + // Create the directory so that path exists. Returns true if successful or + // if the directory already exists; returns false if unable to create the + // directory for any reason, including if the parent directory does not + // exist. Not named "CreateDirectory" because that's a macro on Windows. + bool CreateFolder() const; + + // Returns true if FilePath describes something in the file-system, + // either a file, directory, or whatever, and that something exists. + bool FileOrDirectoryExists() const; + + // Returns true if pathname describes a directory in the file-system + // that exists. + bool DirectoryExists() const; + + // Returns true if FilePath ends with a path separator, which indicates that + // it is intended to represent a directory. Returns false otherwise. + // This does NOT check that a directory (or file) actually exists. + bool IsDirectory() const; + + // Returns true if pathname describes a root directory. (Windows has one + // root directory per disk drive.) + bool IsRootDirectory() const; + + // Returns true if pathname describes an absolute path. + bool IsAbsolutePath() const; + + private: + // Replaces multiple consecutive separators with a single separator. + // For example, "bar///foo" becomes "bar/foo". Does not eliminate other + // redundancies that might be in a pathname involving "." or "..". + // + // A pathname with multiple consecutive separators may occur either through + // user error or as a result of some scripts or APIs that generate a pathname + // with a trailing separator. On other platforms the same API or script + // may NOT generate a pathname with a trailing "/". Then elsewhere that + // pathname may have another "/" and pathname components added to it, + // without checking for the separator already being there. + // The script language and operating system may allow paths like "foo//bar" + // but some of the functions in FilePath will not handle that correctly. In + // particular, RemoveTrailingPathSeparator() only removes one separator, and + // it is called in CreateDirectoriesRecursively() assuming that it will change + // a pathname from directory syntax (trailing separator) to filename syntax. 
+ // + // On Windows this method also replaces the alternate path separator '/' with + // the primary path separator '\\', so that for example "bar\\/\\foo" becomes + // "bar\\foo". + + void Normalize(); + + // Returns a pointer to the last occurence of a valid path separator in + // the FilePath. On Windows, for example, both '/' and '\' are valid path + // separators. Returns NULL if no path separator was found. + const char* FindLastPathSeparator() const; + + std::string pathname_; +}; // class FilePath + +} // namespace internal +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +// This file was GENERATED by command: +// pump.py gtest-type-util.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +// Type utilities needed for implementing typed and type-parameterized +// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently we support at most 50 types in a list, and at most 50 +// type-parameterized tests in one type-parameterized test case. +// Please contact googletestframework@googlegroups.com if you need +// more. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ + + +// #ifdef __GNUC__ is too general here. It is possible to use gcc without using +// libstdc++ (which is where cxxabi.h comes from). +# if GTEST_HAS_CXXABI_H_ +# include +# elif defined(__HP_aCC) +# include +# endif // GTEST_HASH_CXXABI_H_ + +namespace testing { +namespace internal { + +// Canonicalizes a given name with respect to the Standard C++ Library. +// This handles removing the inline namespace within `std` that is +// used by various standard libraries (e.g., `std::__1`). Names outside +// of namespace std are returned unmodified. 
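+// For example, a libc++-style name such as "std::__1::vector<int>" would be
+// canonicalized to "std::vector<int>" (an illustrative input; the exact
+// versioned namespace varies by standard library).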
+inline std::string CanonicalizeForStdLibVersioning(std::string s) {
+  static const char prefix[] = "std::__";
+  if (s.compare(0, strlen(prefix), prefix) == 0) {
+    std::string::size_type end = s.find("::", strlen(prefix));
+    if (end != s.npos) {
+      // Erase everything between the initial `std` and the second `::`.
+      s.erase(strlen("std"), end - strlen("std"));
+    }
+  }
+  return s;
+}
+
+// GetTypeName<T>() returns a human-readable name of type T.
+// NB: This function is also used in Google Mock, so don't move it inside of
+// the typed-test-only section below.
+template <typename T>
+std::string GetTypeName() {
+# if GTEST_HAS_RTTI
+
+  const char* const name = typeid(T).name();
+# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+  int status = 0;
+  // gcc's implementation of typeid(T).name() mangles the type name,
+  // so we have to demangle it.
+# if GTEST_HAS_CXXABI_H_
+  using abi::__cxa_demangle;
+# endif  // GTEST_HAS_CXXABI_H_
+  char* const readable_name = __cxa_demangle(name, 0, 0, &status);
+  const std::string name_str(status == 0 ? readable_name : name);
+  free(readable_name);
+  return CanonicalizeForStdLibVersioning(name_str);
+# else
+  return name;
+# endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+
+# else
+
+  return "<type>";
+
+# endif  // GTEST_HAS_RTTI
+}
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// AssertTypeEq<T1, T2>::type is defined iff T1 and T2 are the same
+// type. This can be used as a compile-time assertion to ensure that
+// two types are equal.
+
+template <typename T1, typename T2>
+struct AssertTypeEq;
+
+template <typename T>
+struct AssertTypeEq<T, T> {
+  typedef bool type;
+};
+
+// A unique type used as the default value for the arguments of class
+// template Types. This allows us to simulate variadic templates
+// (e.g. Types<int>, Types<int, double>, and etc), which C++ doesn't
+// support directly.
+struct None {};
+
+// The following family of struct and struct templates are used to
+// represent type lists. In particular, TypesN<T1, T2, ..., TN>
+// represents a type list with N types (T1, T2, ..., and TN) in it.
+// Except for Types0, every struct in the family has two member types:
+// Head for the first type in the list, and Tail for the rest of the
+// list.
+
+// The empty type list.
+struct Types0 {};
+
+// Type lists of length 1, 2, 3, and so on.
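+//
+// For example (an illustrative sketch), Types2<int, double> is a list whose
+// Head is int and whose Tail is Types1<double>; recursing on Head and Tail
+// visits each type in the list in turn:
+//
+//   typedef internal::Types2<int, double> MyList;
+//   // MyList::Head == int, MyList::Tail == Types1<double>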
+ +template +struct Types1 { + typedef T1 Head; + typedef Types0 Tail; +}; +template +struct Types2 { + typedef T1 Head; + typedef Types1 Tail; +}; + +template +struct Types3 { + typedef T1 Head; + typedef Types2 Tail; +}; + +template +struct Types4 { + typedef T1 Head; + typedef Types3 Tail; +}; + +template +struct Types5 { + typedef T1 Head; + typedef Types4 Tail; +}; + +template +struct Types6 { + typedef T1 Head; + typedef Types5 Tail; +}; + +template +struct Types7 { + typedef T1 Head; + typedef Types6 Tail; +}; + +template +struct Types8 { + typedef T1 Head; + typedef Types7 Tail; +}; + +template +struct Types9 { + typedef T1 Head; + typedef Types8 Tail; +}; + +template +struct Types10 { + typedef T1 Head; + typedef Types9 Tail; +}; + +template +struct Types11 { + typedef T1 Head; + typedef Types10 Tail; +}; + +template +struct Types12 { + typedef T1 Head; + typedef Types11 Tail; +}; + +template +struct Types13 { + typedef T1 Head; + typedef Types12 Tail; +}; + +template +struct Types14 { + typedef T1 Head; + typedef Types13 Tail; +}; + +template +struct Types15 { + typedef T1 Head; + typedef Types14 Tail; +}; + +template +struct Types16 { + typedef T1 Head; + typedef Types15 Tail; +}; + +template +struct Types17 { + typedef T1 Head; + typedef Types16 Tail; +}; + +template +struct Types18 { + typedef T1 Head; + typedef Types17 Tail; +}; + +template +struct Types19 { + typedef T1 Head; + typedef Types18 Tail; +}; + +template +struct Types20 { + typedef T1 Head; + typedef Types19 Tail; +}; + +template +struct Types21 { + typedef T1 Head; + typedef Types20 Tail; +}; + +template +struct Types22 { + typedef T1 Head; + typedef Types21 Tail; +}; + +template +struct Types23 { + typedef T1 Head; + typedef Types22 Tail; +}; + +template +struct Types24 { + typedef T1 Head; + typedef Types23 Tail; +}; + +template +struct Types25 { + typedef T1 Head; + typedef Types24 Tail; +}; + +template +struct Types26 { + typedef T1 Head; + typedef Types25 Tail; +}; + +template +struct Types27 { + typedef T1 Head; + typedef Types26 Tail; +}; + +template +struct Types28 { + typedef T1 Head; + typedef Types27 Tail; +}; + +template +struct Types29 { + typedef T1 Head; + typedef Types28 Tail; +}; + +template +struct Types30 { + typedef T1 Head; + typedef Types29 Tail; +}; + +template +struct Types31 { + typedef T1 Head; + typedef Types30 Tail; +}; + +template +struct Types32 { + typedef T1 Head; + typedef Types31 Tail; +}; + +template +struct Types33 { + typedef T1 Head; + typedef Types32 Tail; +}; + +template +struct Types34 { + typedef T1 Head; + typedef Types33 Tail; +}; + +template +struct Types35 { + typedef T1 Head; + typedef Types34 Tail; +}; + +template +struct Types36 { + typedef T1 Head; + typedef Types35 Tail; +}; + +template +struct Types37 { + typedef T1 Head; + typedef Types36 Tail; +}; + +template +struct Types38 { + typedef T1 Head; + typedef Types37 Tail; +}; + +template +struct Types39 { + typedef T1 Head; + typedef Types38 Tail; +}; + +template +struct Types40 { + typedef T1 Head; + typedef Types39 Tail; +}; + +template +struct Types41 { + typedef T1 Head; + typedef Types40 Tail; +}; + +template +struct Types42 { + typedef T1 Head; + typedef Types41 Tail; +}; + +template +struct Types43 { + typedef T1 Head; + typedef Types42 Tail; +}; + +template +struct Types44 { + typedef T1 Head; + typedef Types43 Tail; +}; + +template +struct Types45 { + typedef T1 Head; + typedef Types44 Tail; +}; + +template +struct Types46 { + typedef T1 Head; + typedef Types45 Tail; +}; + +template +struct Types47 { + 
typedef T1 Head; + typedef Types46 Tail; +}; + +template +struct Types48 { + typedef T1 Head; + typedef Types47 Tail; +}; + +template +struct Types49 { + typedef T1 Head; + typedef Types48 Tail; +}; + +template +struct Types50 { + typedef T1 Head; + typedef Types49 Tail; +}; + + +} // namespace internal + +// We don't want to require the users to write TypesN<...> directly, +// as that would require them to count the length. Types<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Types +// will appear as Types in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Types, and Google Test will translate +// that to TypesN internally to make error messages +// readable. The translation is done by the 'type' member of the +// Types template. +template +struct Types { + typedef internal::Types50 type; +}; + +template <> +struct Types { + typedef internal::Types0 type; +}; +template +struct Types { + typedef internal::Types1 type; +}; +template +struct Types { + typedef internal::Types2 type; +}; +template +struct Types { + typedef internal::Types3 type; +}; +template +struct Types { + typedef internal::Types4 type; +}; +template +struct Types { + typedef internal::Types5 type; +}; +template +struct Types { + typedef internal::Types6 type; +}; +template +struct Types { + typedef internal::Types7 type; +}; +template +struct Types { + typedef internal::Types8 type; +}; +template +struct Types { + typedef internal::Types9 type; +}; +template +struct Types { + typedef internal::Types10 type; +}; +template +struct Types { + typedef internal::Types11 type; +}; +template +struct Types { + typedef internal::Types12 type; +}; +template +struct Types { + typedef internal::Types13 type; +}; +template +struct Types { + typedef internal::Types14 type; +}; +template +struct Types { + typedef internal::Types15 type; +}; +template +struct Types { + typedef internal::Types16 type; +}; +template +struct Types { + typedef internal::Types17 type; +}; +template +struct Types { + typedef internal::Types18 type; +}; +template +struct Types { + typedef internal::Types19 type; +}; +template +struct Types { + typedef internal::Types20 type; +}; +template +struct Types { + typedef internal::Types21 type; +}; +template +struct Types { + typedef internal::Types22 type; +}; +template +struct Types { + typedef internal::Types23 type; +}; +template +struct Types { + typedef internal::Types24 type; +}; +template +struct Types { + typedef internal::Types25 type; +}; +template +struct Types { + typedef internal::Types26 type; +}; +template +struct Types { + typedef internal::Types27 type; +}; +template +struct Types { + typedef internal::Types28 type; +}; +template +struct Types { + typedef internal::Types29 type; +}; +template +struct Types { + typedef internal::Types30 type; +}; +template +struct Types { + typedef internal::Types31 type; +}; +template +struct Types { + typedef internal::Types32 type; +}; +template +struct Types { + typedef internal::Types33 type; +}; +template +struct Types { + typedef internal::Types34 type; +}; +template +struct Types { + typedef internal::Types35 type; +}; +template +struct Types { + typedef internal::Types36 type; +}; +template +struct Types { + typedef internal::Types37 type; +}; +template +struct Types { + typedef internal::Types38 type; +}; +template +struct Types { 
+ typedef internal::Types39 type; +}; +template +struct Types { + typedef internal::Types40 type; +}; +template +struct Types { + typedef internal::Types41 type; +}; +template +struct Types { + typedef internal::Types42 type; +}; +template +struct Types { + typedef internal::Types43 type; +}; +template +struct Types { + typedef internal::Types44 type; +}; +template +struct Types { + typedef internal::Types45 type; +}; +template +struct Types { + typedef internal::Types46 type; +}; +template +struct Types { + typedef internal::Types47 type; +}; +template +struct Types { + typedef internal::Types48 type; +}; +template +struct Types { + typedef internal::Types49 type; +}; + +namespace internal { + +# define GTEST_TEMPLATE_ template class + +// The template "selector" struct TemplateSel is used to +// represent Tmpl, which must be a class template with one type +// parameter, as a type. TemplateSel::Bind::type is defined +// as the type Tmpl. This allows us to actually instantiate the +// template "selected" by TemplateSel. +// +// This trick is necessary for simulating typedef for class templates, +// which C++ doesn't support directly. +template +struct TemplateSel { + template + struct Bind { + typedef Tmpl type; + }; +}; + +# define GTEST_BIND_(TmplSel, T) \ + TmplSel::template Bind::type + +// A unique struct template used as the default value for the +// arguments of class template Templates. This allows us to simulate +// variadic templates (e.g. Templates, Templates, +// and etc), which C++ doesn't support directly. +template +struct NoneT {}; + +// The following family of struct and struct templates are used to +// represent template lists. In particular, TemplatesN represents a list of N templates (T1, T2, ..., and TN). Except +// for Templates0, every struct in the family has two member types: +// Head for the selector of the first template in the list, and Tail +// for the rest of the list. + +// The empty template list. +struct Templates0 {}; + +// Template lists of length 1, 2, 3, and so on. 
+ +template +struct Templates1 { + typedef TemplateSel Head; + typedef Templates0 Tail; +}; +template +struct Templates2 { + typedef TemplateSel Head; + typedef Templates1 Tail; +}; + +template +struct Templates3 { + typedef TemplateSel Head; + typedef Templates2 Tail; +}; + +template +struct Templates4 { + typedef TemplateSel Head; + typedef Templates3 Tail; +}; + +template +struct Templates5 { + typedef TemplateSel Head; + typedef Templates4 Tail; +}; + +template +struct Templates6 { + typedef TemplateSel Head; + typedef Templates5 Tail; +}; + +template +struct Templates7 { + typedef TemplateSel Head; + typedef Templates6 Tail; +}; + +template +struct Templates8 { + typedef TemplateSel Head; + typedef Templates7 Tail; +}; + +template +struct Templates9 { + typedef TemplateSel Head; + typedef Templates8 Tail; +}; + +template +struct Templates10 { + typedef TemplateSel Head; + typedef Templates9 Tail; +}; + +template +struct Templates11 { + typedef TemplateSel Head; + typedef Templates10 Tail; +}; + +template +struct Templates12 { + typedef TemplateSel Head; + typedef Templates11 Tail; +}; + +template +struct Templates13 { + typedef TemplateSel Head; + typedef Templates12 Tail; +}; + +template +struct Templates14 { + typedef TemplateSel Head; + typedef Templates13 Tail; +}; + +template +struct Templates15 { + typedef TemplateSel Head; + typedef Templates14 Tail; +}; + +template +struct Templates16 { + typedef TemplateSel Head; + typedef Templates15 Tail; +}; + +template +struct Templates17 { + typedef TemplateSel Head; + typedef Templates16 Tail; +}; + +template +struct Templates18 { + typedef TemplateSel Head; + typedef Templates17 Tail; +}; + +template +struct Templates19 { + typedef TemplateSel Head; + typedef Templates18 Tail; +}; + +template +struct Templates20 { + typedef TemplateSel Head; + typedef Templates19 Tail; +}; + +template +struct Templates21 { + typedef TemplateSel Head; + typedef Templates20 Tail; +}; + +template +struct Templates22 { + typedef TemplateSel Head; + typedef Templates21 Tail; +}; + +template +struct Templates23 { + typedef TemplateSel Head; + typedef Templates22 Tail; +}; + +template +struct Templates24 { + typedef TemplateSel Head; + typedef Templates23 Tail; +}; + +template +struct Templates25 { + typedef TemplateSel Head; + typedef Templates24 Tail; +}; + +template +struct Templates26 { + typedef TemplateSel Head; + typedef Templates25 Tail; +}; + +template +struct Templates27 { + typedef TemplateSel Head; + typedef Templates26 Tail; +}; + +template +struct Templates28 { + typedef TemplateSel Head; + typedef Templates27 Tail; +}; + +template +struct Templates29 { + typedef TemplateSel Head; + typedef Templates28 Tail; +}; + +template +struct Templates30 { + typedef TemplateSel Head; + typedef Templates29 Tail; +}; + +template +struct Templates31 { + typedef TemplateSel Head; + typedef Templates30 Tail; +}; + +template +struct Templates32 { + typedef TemplateSel Head; + typedef Templates31 Tail; +}; + +template +struct Templates33 { + typedef TemplateSel Head; + typedef Templates32 Tail; +}; + +template +struct Templates34 { + typedef TemplateSel Head; + typedef Templates33 Tail; +}; + +template +struct Templates35 { + typedef TemplateSel Head; + typedef Templates34 Tail; +}; + +template +struct Templates36 { + typedef TemplateSel Head; + typedef Templates35 Tail; +}; + +template +struct Templates37 { + typedef TemplateSel Head; + typedef Templates36 Tail; +}; + +template +struct Templates38 { + typedef TemplateSel Head; + typedef Templates37 Tail; +}; + 
+template +struct Templates39 { + typedef TemplateSel Head; + typedef Templates38 Tail; +}; + +template +struct Templates40 { + typedef TemplateSel Head; + typedef Templates39 Tail; +}; + +template +struct Templates41 { + typedef TemplateSel Head; + typedef Templates40 Tail; +}; + +template +struct Templates42 { + typedef TemplateSel Head; + typedef Templates41 Tail; +}; + +template +struct Templates43 { + typedef TemplateSel Head; + typedef Templates42 Tail; +}; + +template +struct Templates44 { + typedef TemplateSel Head; + typedef Templates43 Tail; +}; + +template +struct Templates45 { + typedef TemplateSel Head; + typedef Templates44 Tail; +}; + +template +struct Templates46 { + typedef TemplateSel Head; + typedef Templates45 Tail; +}; + +template +struct Templates47 { + typedef TemplateSel Head; + typedef Templates46 Tail; +}; + +template +struct Templates48 { + typedef TemplateSel Head; + typedef Templates47 Tail; +}; + +template +struct Templates49 { + typedef TemplateSel Head; + typedef Templates48 Tail; +}; + +template +struct Templates50 { + typedef TemplateSel Head; + typedef Templates49 Tail; +}; + + +// We don't want to require the users to write TemplatesN<...> directly, +// as that would require them to count the length. Templates<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Templates +// will appear as Templates in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Templates, and Google Test will translate +// that to TemplatesN internally to make error messages +// readable. The translation is done by the 'type' member of the +// Templates template. 
+template +struct Templates { + typedef Templates50 type; +}; + +template <> +struct Templates { + typedef Templates0 type; +}; +template +struct Templates { + typedef Templates1 type; +}; +template +struct Templates { + typedef Templates2 type; +}; +template +struct Templates { + typedef Templates3 type; +}; +template +struct Templates { + typedef Templates4 type; +}; +template +struct Templates { + typedef Templates5 type; +}; +template +struct Templates { + typedef Templates6 type; +}; +template +struct Templates { + typedef Templates7 type; +}; +template +struct Templates { + typedef Templates8 type; +}; +template +struct Templates { + typedef Templates9 type; +}; +template +struct Templates { + typedef Templates10 type; +}; +template +struct Templates { + typedef Templates11 type; +}; +template +struct Templates { + typedef Templates12 type; +}; +template +struct Templates { + typedef Templates13 type; +}; +template +struct Templates { + typedef Templates14 type; +}; +template +struct Templates { + typedef Templates15 type; +}; +template +struct Templates { + typedef Templates16 type; +}; +template +struct Templates { + typedef Templates17 type; +}; +template +struct Templates { + typedef Templates18 type; +}; +template +struct Templates { + typedef Templates19 type; +}; +template +struct Templates { + typedef Templates20 type; +}; +template +struct Templates { + typedef Templates21 type; +}; +template +struct Templates { + typedef Templates22 type; +}; +template +struct Templates { + typedef Templates23 type; +}; +template +struct Templates { + typedef Templates24 type; +}; +template +struct Templates { + typedef Templates25 type; +}; +template +struct Templates { + typedef Templates26 type; +}; +template +struct Templates { + typedef Templates27 type; +}; +template +struct Templates { + typedef Templates28 type; +}; +template +struct Templates { + typedef Templates29 type; +}; +template +struct Templates { + typedef Templates30 type; +}; +template +struct Templates { + typedef Templates31 type; +}; +template +struct Templates { + typedef Templates32 type; +}; +template +struct Templates { + typedef Templates33 type; +}; +template +struct Templates { + typedef Templates34 type; +}; +template +struct Templates { + typedef Templates35 type; +}; +template +struct Templates { + typedef Templates36 type; +}; +template +struct Templates { + typedef Templates37 type; +}; +template +struct Templates { + typedef Templates38 type; +}; +template +struct Templates { + typedef Templates39 type; +}; +template +struct Templates { + typedef Templates40 type; +}; +template +struct Templates { + typedef Templates41 type; +}; +template +struct Templates { + typedef Templates42 type; +}; +template +struct Templates { + typedef Templates43 type; +}; +template +struct Templates { + typedef Templates44 type; +}; +template +struct Templates { + typedef Templates45 type; +}; +template +struct Templates { + typedef Templates46 type; +}; +template +struct Templates { + typedef Templates47 type; +}; +template +struct Templates { + typedef Templates48 type; +}; +template +struct Templates { + typedef Templates49 type; +}; + +// The TypeList template makes it possible to use either a single type +// or a Types<...> list in TYPED_TEST_CASE() and +// INSTANTIATE_TYPED_TEST_CASE_P(). 
+
+template <typename T>
+struct TypeList {
+  typedef Types1<T> type;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+struct TypeList<Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+    T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+    T42, T43, T44, T45, T46, T47, T48, T49, T50> > {
+  typedef typename Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>::type type;
+};
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+// Due to C++ preprocessor weirdness, we need double indirection to
+// concatenate two tokens when one of them is __LINE__. Writing
+//
+//   foo ## __LINE__
+//
+// will result in the token foo__LINE__, instead of foo followed by
+// the current line number. For more details, see
+// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
+#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
+
+// Stringifies its argument.
+#define GTEST_STRINGIFY_(name) #name
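+// Editor's illustration (not part of the original header): the double
+// indirection above lets __LINE__ expand before token pasting, e.g. on
+// line 42:
+//
+//   GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__)       // -> gtest_label_42
+//   GTEST_CONCAT_TOKEN_IMPL_(gtest_label_, __LINE__)  // -> gtest_label___LINE__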
+
+class ProtocolMessage;
+namespace proto2 { class Message; }
+
+namespace testing {
+
+// Forward declarations.
+
+class AssertionResult;                 // Result of an assertion.
+class Message;                         // Represents a failure message.
+class Test;                            // Represents a test.
+class TestInfo;                        // Information about a test.
+class TestPartResult;                  // Result of a test part.
+class UnitTest;                        // A collection of test cases.
+
+template <typename T>
+::std::string PrintToString(const T& value);
+
+namespace internal {
+
+struct TraceInfo;                      // Information about a trace point.
+class TestInfoImpl;                    // Opaque implementation of TestInfo
+class UnitTestImpl;                    // Opaque implementation of UnitTest
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+GTEST_API_ extern const char kStackTraceMarker[];
+
+// Two overloaded helpers for checking at compile time whether an
+// expression is a null pointer literal (i.e. NULL or any 0-valued
+// compile-time integral constant). Their return values have
+// different sizes, so we can use sizeof() to test which version is
+// picked by the compiler. These helpers have no implementations, as
+// we only need their signatures.
+//
+// Given IsNullLiteralHelper(x), the compiler will pick the first
+// version if x can be implicitly converted to Secret*, and pick the
+// second version otherwise. Since Secret is a secret and incomplete
+// type, the only expression a user can write that has type Secret* is
+// a null pointer literal. Therefore, we know that x is a null
+// pointer literal if and only if the first version is picked by the
+// compiler.
+char IsNullLiteralHelper(Secret* p);
+char (&IsNullLiteralHelper(...))[2];  // NOLINT
+
+// A compile-time bool constant that is true if and only if x is a
+// null pointer literal (i.e. NULL or any 0-valued compile-time
+// integral constant).
+#ifdef GTEST_ELLIPSIS_NEEDS_POD_
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_IS_NULL_LITERAL_(x) false
+#else
+# define GTEST_IS_NULL_LITERAL_(x) \
+    (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1)
+#endif  // GTEST_ELLIPSIS_NEEDS_POD_
+
+// Appends the user-supplied message to the Google-Test-generated message.
+GTEST_API_ std::string AppendUserMessage(
+    const std::string& gtest_msg, const Message& user_msg);
+
+#if GTEST_HAS_EXCEPTIONS
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4275 \
+/* an exported class was derived from a class that was not exported */)
+
+// This exception is thrown by (and only by) a failed Google Test
+// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
+// are enabled). We derive it from std::runtime_error, which is for
+// errors presumably detectable only at run time. Since
+// std::runtime_error inherits from std::exception, many testing
+// frameworks know how to extract and print the message inside it.
+class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
+ public:
+  explicit GoogleTestFailureException(const TestPartResult& failure);
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4275
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+namespace edit_distance {
+// Returns the optimal edits to go from 'left' to 'right'.
+// All edits cost the same, with replace having lower priority than
+// add/remove.
+// Simple implementation of the Wagner-Fischer algorithm.
+// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm
+enum EditType { kMatch, kAdd, kRemove, kReplace };
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<size_t>& left, const std::vector<size_t>& right);
+
+// Same as above, but the input is represented as strings.
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<std::string>& left,
+    const std::vector<std::string>& right);
+
+// Create a diff of the input strings in Unified diff format.
+GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left,
+                                         const std::vector<std::string>& right,
+                                         size_t context = 2);
+
+}  // namespace edit_distance
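+// Editor's illustration (not part of the original header), assuming the
+// edit_distance declarations above: comparing {"a","b","c"} with
+// {"a","x","c"} yields the edit sequence {kMatch, kReplace, kMatch}, and
+// CreateUnifiedDiff() renders the corresponding "-b / +x" hunk, e.g.
+//
+//   std::vector<std::string> left;   // "a", "b", "c"
+//   std::vector<std::string> right;  // "a", "x", "c"
+//   std::string diff = edit_distance::CreateUnifiedDiff(left, right);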
+
+// Calculate the diff between 'left' and 'right' and return it in unified diff
+// format.
+// If not null, stores in 'total_line_count' the total number of lines found
+// in left + right.
+GTEST_API_ std::string DiffStrings(const std::string& left,
+                                   const std::string& right,
+                                   size_t* total_line_count);
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings. For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   expected_expression: "foo"
+//   actual_expression:   "bar"
+//   expected_value:      "5"
+//   actual_value:        "6"
+//
+// The ignoring_case parameter is true iff the assertion is a
+// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will
+// be inserted into the message.
+GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
+                                     const char* actual_expression,
+                                     const std::string& expected_value,
+                                     const std::string& actual_value,
+                                     bool ignoring_case);
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+GTEST_API_ std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result,
+    const char* expression_text,
+    const char* actual_predicate_value,
+    const char* expected_predicate_value);
+
+// This template class represents an IEEE floating-point number
+// (either single-precision or double-precision, depending on the
+// template parameters).
+//
+// The purpose of this class is to do more sophisticated number
+// comparison. (Due to round-off error, etc, it's very unlikely that
+// two floating-points will be equal exactly. Hence a naive
+// comparison by the == operation often doesn't work.)
+//
+// Format of IEEE floating-point:
+//
+//   The most-significant bit being the leftmost, an IEEE
+//   floating-point looks like
+//
+//     sign_bit exponent_bits fraction_bits
+//
+//   Here, sign_bit is a single bit that designates the sign of the
+//   number.
+//
+//   For float, there are 8 exponent bits and 23 fraction bits.
+//
+//   For double, there are 11 exponent bits and 52 fraction bits.
+//
+// More details can be found at
+// http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+template <typename RawType>
+class FloatingPoint {
+ public:
+  // Defines the unsigned integer type that has the same size as the
+  // floating point number.
+  typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
+
+  // Constants.
+
+  // # of bits in a number.
+  static const size_t kBitCount = 8*sizeof(RawType);
+
+  // # of fraction bits in a number.
+  static const size_t kFractionBitCount =
+    std::numeric_limits<RawType>::digits - 1;
+
+  // # of exponent bits in a number.
+  static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
+
+  // The mask for the sign bit.
+  static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
+
+  // The mask for the fraction bits.
+  static const Bits kFractionBitMask =
+    ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
+
+  // The mask for the exponent bits.
+  static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
+
+  // How many ULP's (Units in the Last Place) we want to tolerate when
+  // comparing two numbers. The larger the value, the more error we
+  // allow. A 0 value means that two numbers must be exactly the same
+  // to be considered equal.
+  //
+  // The maximum error of a single floating-point operation is 0.5
+  // units in the last place. On Intel CPU's, all floating-point
+  // calculations are done with 80-bit precision, while double has 64
+  // bits. Therefore, 4 should be enough for ordinary use.
+  //
+  // See the following article for more details on ULP:
+  // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
+  static const size_t kMaxUlps = 4;
+
+  // Constructs a FloatingPoint from a raw floating-point number.
+  //
+  // On an Intel CPU, passing a non-normalized NAN (Not a Number)
+  // around may change its bits, although the new value is guaranteed
+  // to be also a NAN. Therefore, don't expect this constructor to
+  // preserve the bits in x when x is a NAN.
+  explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
+
+  // Static methods
+
+  // Reinterprets a bit pattern as a floating-point number.
+  //
+  // This function is needed to test the AlmostEquals() method.
+  static RawType ReinterpretBits(const Bits bits) {
+    FloatingPoint fp(0);
+    fp.u_.bits_ = bits;
+    return fp.u_.value_;
+  }
+
+  // Returns the floating-point number that represent positive infinity.
+  static RawType Infinity() {
+    return ReinterpretBits(kExponentBitMask);
+  }
+
+  // Returns the maximum representable finite floating-point number.
+  static RawType Max();
+
+  // Non-static methods
+
+  // Returns the bits that represents this number.
+  const Bits &bits() const { return u_.bits_; }
+
+  // Returns the exponent bits of this number.
+  Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
+
+  // Returns the fraction bits of this number.
+  Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
+
+  // Returns the sign bit of this number.
+  Bits sign_bit() const { return kSignBitMask & u_.bits_; }
+
+  // Returns true iff this is NAN (not a number).
+  bool is_nan() const {
+    // It's a NAN if the exponent bits are all ones and the fraction
+    // bits are not entirely zeros.
+    return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
+  }
+
+  // Returns true iff this number is at most kMaxUlps ULP's away from
+  // rhs. In particular, this function:
+  //
+  //   - returns false if either number is (or both are) NAN.
+  //   - treats really large numbers as almost equal to infinity.
+  //   - thinks +0.0 and -0.0 are 0 ULP's apart.
+  bool AlmostEquals(const FloatingPoint& rhs) const {
+    // The IEEE standard says that any comparison operation involving
+    // a NAN must return false.
+    if (is_nan() || rhs.is_nan()) return false;
+
+    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
+        <= kMaxUlps;
+  }
+
+ private:
+  // The data type used to store the actual floating-point number.
+  union FloatingPointUnion {
+    RawType value_;  // The raw floating-point number.
+    Bits bits_;      // The bits that represent the number.
+  };
+
+  // Converts an integer from the sign-and-magnitude representation to
+  // the biased representation. More precisely, let N be 2 to the
+  // power of (kBitCount - 1), an integer x is represented by the
+  // unsigned number x + N.
+  //
+  // For instance,
+  //
+  //   -N + 1 (the most negative number representable using
+  //          sign-and-magnitude) is represented by 1;
+  //   0      is represented by N; and
+  //   N - 1  (the biggest number representable using
+  //          sign-and-magnitude) is represented by 2N - 1.
+  //
+  // Read http://en.wikipedia.org/wiki/Signed_number_representations
+  // for more details on signed number representations.
+  static Bits SignAndMagnitudeToBiased(const Bits &sam) {
+    if (kSignBitMask & sam) {
+      // sam represents a negative number.
+      return ~sam + 1;
+    } else {
+      // sam represents a positive number.
+      return kSignBitMask | sam;
+    }
+  }
+
+  // Given two numbers in the sign-and-magnitude representation,
+  // returns the distance between them as an unsigned number.
+  static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
+                                                     const Bits &sam2) {
+    const Bits biased1 = SignAndMagnitudeToBiased(sam1);
+    const Bits biased2 = SignAndMagnitudeToBiased(sam2);
+    return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
+  }
+
+  FloatingPointUnion u_;
+};
+
+// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
+// macro defined by <windows.h>.
+template <>
+inline float FloatingPoint<float>::Max() { return FLT_MAX; }
+template <>
+inline double FloatingPoint<double>::Max() { return DBL_MAX; }
+
+// Typedefs the instances of the FloatingPoint template class that we
+// care to use.
+typedef FloatingPoint<float> Float;
+typedef FloatingPoint<double> Double;
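+// Editor's illustration (not part of the original header): with the
+// kMaxUlps = 4 tolerance above, values within a few representable steps
+// of each other compare equal while clearly distinct values do not, e.g.
+//
+//   Double lhs(1.0);
+//   Double rhs(1.0 + 1e-16);                    // rounds to 1.0 in double
+//   bool near = lhs.AlmostEquals(rhs);          // true (0 ULP's apart)
+//   bool far  = lhs.AlmostEquals(Double(1.1));  // false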
+
+// In order to catch the mistake of putting tests that use different
+// test fixture classes in the same test case, we need to assign
+// unique IDs to fixture classes and compare them. The TypeId type is
+// used to hold such IDs. The user should treat TypeId as an opaque
+// type: the only operation allowed on TypeId values is to compare
+// them for equality using the == operator.
+typedef const void* TypeId;
+
+template <typename T>
+class TypeIdHelper {
+ public:
+  // dummy_ must not have a const type. Otherwise an overly eager
+  // compiler (e.g. MSVC 7.1 & 8.0) may try to merge
+  // TypeIdHelper<T>::dummy_ for different Ts as an "optimization".
+  static bool dummy_;
+};
+
+template <typename T>
+bool TypeIdHelper<T>::dummy_ = false;
+
+// GetTypeId<T>() returns the ID of type T. Different values will be
+// returned for different types. Calling the function twice with the
+// same type argument is guaranteed to return the same ID.
+template <typename T>
+TypeId GetTypeId() {
+  // The compiler is required to allocate a different
+  // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
+  // the template. Therefore, the address of dummy_ is guaranteed to
+  // be unique.
+  return &(TypeIdHelper<T>::dummy_);
+}
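+// Editor's illustration (not part of the original header): each
+// instantiation of TypeIdHelper<T> owns a distinct dummy_ object, so
+//
+//   GetTypeId<int>() == GetTypeId<int>();     // same T, same ID
+//   GetTypeId<int>() != GetTypeId<double>();  // different T, different ID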
+
+// Returns the type ID of ::testing::Test. Always call this instead
+// of GetTypeId< ::testing::Test>() to get the type ID of
+// ::testing::Test, as the latter may give the wrong result due to a
+// suspected linker bug when compiling Google Test as a Mac OS X
+// framework.
+GTEST_API_ TypeId GetTestTypeId();
+
+// Defines the abstract factory interface that creates instances
+// of a Test object.
+class TestFactoryBase {
+ public:
+  virtual ~TestFactoryBase() {}
+
+  // Creates a test instance to run. The instance is both created and destroyed
+  // within TestInfoImpl::Run()
+  virtual Test* CreateTest() = 0;
+
+ protected:
+  TestFactoryBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
+};
+
+// This class provides implementation of TestFactoryBase interface.
+// It is used in TEST and TEST_F macros.
+template <class TestClass>
+class TestFactoryImpl : public TestFactoryBase {
+ public:
+  virtual Test* CreateTest() { return new TestClass; }
+};
+
+#if GTEST_OS_WINDOWS
+
+// Predicate-formatters for implementing the HRESULT checking macros
+// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
+// We pass a long instead of HRESULT to avoid causing an
+// include dependency for the HRESULT type.
+GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr,
+                                            long hr);  // NOLINT
+GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr,
+                                            long hr);  // NOLINT
+
+#endif  // GTEST_OS_WINDOWS
+
+// Types of SetUpTestCase() and TearDownTestCase() functions.
+typedef void (*SetUpTestCaseFunc)();
+typedef void (*TearDownTestCaseFunc)();
+
+struct CodeLocation {
+  CodeLocation(const std::string& a_file, int a_line)
+      : file(a_file), line(a_line) {}
+
+  std::string file;
+  int line;
+};
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_case_name:   name of the test case
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   code_location:    code location where the test is defined
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test case
+//   tear_down_tc:     pointer to the function that tears down the test case
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
+    const char* test_case_name,
+    const char* name,
+    const char* type_param,
+    const char* value_param,
+    CodeLocation code_location,
+    TypeId fixture_class_id,
+    SetUpTestCaseFunc set_up_tc,
+    TearDownTestCaseFunc tear_down_tc,
+    TestFactoryBase* factory);
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false. None of pstr, *pstr, and prefix can be NULL.
+GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// State of the definition of a type-parameterized test case.
+class GTEST_API_ TypedTestCasePState {
+ public:
+  TypedTestCasePState() : registered_(false) {}
+
+  // Adds the given test name to defined_test_names_ and return true
+  // if the test case hasn't been registered; otherwise aborts the
+  // program.
+  bool AddTestName(const char* file, int line, const char* case_name,
+                   const char* test_name) {
+    if (registered_) {
+      fprintf(stderr, "%s Test %s must be defined before "
+              "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n",
+              FormatFileLocation(file, line).c_str(), test_name, case_name);
+      fflush(stderr);
+      posix::Abort();
+    }
+    registered_tests_.insert(
+        ::std::make_pair(test_name, CodeLocation(file, line)));
+    return true;
+  }
+
+  bool TestExists(const std::string& test_name) const {
+    return registered_tests_.count(test_name) > 0;
+  }
+
+  const CodeLocation& GetCodeLocation(const std::string& test_name) const {
+    RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name);
+    GTEST_CHECK_(it != registered_tests_.end());
+    return it->second;
+  }
+
+  // Verifies that registered_tests match the test names in
+  // defined_test_names_; returns registered_tests if successful, or
+  // aborts the program otherwise.
+  const char* VerifyRegisteredTestNames(
+      const char* file, int line, const char* registered_tests);
+
+ private:
+  typedef ::std::map<std::string, CodeLocation> RegisteredTestsMap;
+
+  bool registered_;
+  RegisteredTestsMap registered_tests_;
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+// Skips to the first non-space char after the first comma in 'str';
+// returns NULL if no comma is found in 'str'.
+inline const char* SkipComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  if (comma == NULL) {
+    return NULL;
+  }
+  while (IsSpace(*(++comma))) {}
+  return comma;
+}
+
+// Returns the prefix of 'str' before the first comma in it; returns
+// the entire string if it contains no comma.
+inline std::string GetPrefixUntilComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  return comma == NULL ? str : std::string(str, comma);
+}
+
+// Splits a given string on a given delimiter, populating a given
+// vector with the fields.
+void SplitString(const ::std::string& str, char delimiter,
+                 ::std::vector< ::std::string>* dest);
+
+// The default argument to the template below for the case when the user does
+// not provide a name generator.
+struct DefaultNameGenerator {
+  template <typename T>
+  static std::string GetName(int i) {
+    return StreamableToString(i);
+  }
+};
+
+template <typename Provided = DefaultNameGenerator>
+struct NameGeneratorSelector {
+  typedef Provided type;
+};
+
+template <typename NameGenerator>
+void GenerateNamesRecursively(Types0, std::vector<std::string>*, int) {}
+
+template <typename NameGenerator, typename Types>
+void GenerateNamesRecursively(Types, std::vector<std::string>* result, int i) {
+  result->push_back(NameGenerator::template GetName<typename Types::Head>(i));
+  GenerateNamesRecursively<NameGenerator>(typename Types::Tail(), result,
+                                          i + 1);
+}
+
+template <typename NameGenerator, typename Types>
+std::vector<std::string> GenerateNames() {
+  std::vector<std::string> result;
+  GenerateNamesRecursively<NameGenerator>(Types(), &result, 0);
+  return result;
+}
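+// Editor's illustration (not part of the original header): for a type list
+// Types2<int, double>, GenerateNames<DefaultNameGenerator, ...>() walks the
+// Head/Tail chain and produces one StreamableToString(i) entry per type,
+// i.e. the default instantiation names "0" and "1".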
+
+// TypeParameterizedTest<Fixture, TestSel, Types>::Register()
+// registers a list of type-parameterized tests with Google Test. The
+// return value is insignificant - we just need to return something
+// such that we can call this function in a namespace scope.
+//
+// Implementation note: The GTEST_TEMPLATE_ macro declares a template
+// template parameter. It's defined in gtest-type-util.h.
+template <GTEST_TEMPLATE_ Fixture, class TestSel, typename Types>
+class TypeParameterizedTest {
+ public:
+  // 'index' is the index of the test in the type list 'Types'
+  // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase,
+  // Types). Valid values for 'index' are [0, N - 1] where N is the
+  // length of Types.
+  static bool Register(const char* prefix, const CodeLocation& code_location,
+                       const char* case_name, const char* test_names,
+                       int index,
+                       const std::vector<std::string>& type_names =
+                           GenerateNames<DefaultNameGenerator, Types>()) {
+    typedef typename Types::Head Type;
+    typedef Fixture<Type> FixtureClass;
+    typedef typename GTEST_BIND_(TestSel, Type) TestClass;
+
+    // First, registers the first type-parameterized test in the type
+    // list.
+    MakeAndRegisterTestInfo(
+        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name +
+         "/" + type_names[index])
+            .c_str(),
+        StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(),
+        GetTypeName<Type>().c_str(),
+        NULL,  // No value parameter.
+        code_location, GetTypeId<FixtureClass>(), TestClass::SetUpTestCase,
+        TestClass::TearDownTestCase, new TestFactoryImpl<TestClass>);
+
+    // Next, recurses (at compile time) with the tail of the type list.
+    return TypeParameterizedTest<Fixture, TestSel,
+                                 typename Types::Tail>::Register(prefix,
+                                                                 code_location,
+                                                                 case_name,
+                                                                 test_names,
+                                                                 index + 1,
+                                                                 type_names);
+  }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, class TestSel>
+class TypeParameterizedTest<Fixture, TestSel, Types0> {
+ public:
+  static bool Register(const char* /*prefix*/, const CodeLocation&,
+                       const char* /*case_name*/, const char* /*test_names*/,
+                       int /*index*/,
+                       const std::vector<std::string>& =
+                           std::vector<std::string>() /*type_names*/) {
+    return true;
+  }
+};
+
+// TypeParameterizedTestCase<Fixture, Tests, Types>::Register()
+// registers *all combinations* of 'Tests' and 'Types' with Google
+// Test. The return value is insignificant - we just need to return
+// something such that we can call this function in a namespace scope.
+template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
+class TypeParameterizedTestCase {
+ public:
+  static bool Register(const char* prefix, CodeLocation code_location,
+                       const TypedTestCasePState* state, const char* case_name,
+                       const char* test_names,
+                       const std::vector<std::string>& type_names =
+                           GenerateNames<DefaultNameGenerator, Types>()) {
+    std::string test_name = StripTrailingSpaces(
+        GetPrefixUntilComma(test_names));
+    if (!state->TestExists(test_name)) {
+      fprintf(stderr, "Failed to get code location for test %s.%s at %s.",
+              case_name, test_name.c_str(),
+              FormatFileLocation(code_location.file.c_str(),
+                                 code_location.line).c_str());
+      fflush(stderr);
+      posix::Abort();
+    }
+    const CodeLocation& test_location = state->GetCodeLocation(test_name);
+
+    typedef typename Tests::Head Head;
+
+    // First, register the first test in 'Test' for each type in 'Types'.
+    TypeParameterizedTest<Fixture, Head, Types>::Register(
+        prefix, test_location, case_name, test_names, 0, type_names);
+
+    // Next, recurses (at compile time) with the tail of the test list.
+    return TypeParameterizedTestCase<Fixture, typename Tests::Tail,
+                                     Types>::Register(prefix, code_location,
+                                                      state, case_name,
+                                                      SkipComma(test_names),
+                                                      type_names);
+  }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, typename Types>
+class TypeParameterizedTestCase<Fixture, Templates0, Types> {
+ public:
+  static bool Register(const char* /*prefix*/, const CodeLocation&,
+                       const TypedTestCasePState* /*state*/,
+                       const char* /*case_name*/, const char* /*test_names*/,
+                       const std::vector<std::string>& =
+                           std::vector<std::string>() /*type_names*/) {
+    return true;
+  }
+};
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
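+// Editor's illustration (not part of the original header): the Register()
+// recursion above peels one type per step, exactly like this sketch that
+// counts the length of a type list at compile time:
+//
+//   template <typename List>
+//   struct Length {
+//     enum { value = 1 + Length<typename List::Tail>::value };
+//   };
+//   template <>
+//   struct Length<Types0> { enum { value = 0 }; };
+//
+//   // Length<Types2<int, double> >::value == 2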
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag. The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
+    UnitTest* unit_test, int skip_count);
+
+// Helpers for suppressing warnings on unreachable code or constant
+// condition.
+
+// Always returns true.
+GTEST_API_ bool AlwaysTrue();
+
+// Always returns false.
+inline bool AlwaysFalse() { return !AlwaysTrue(); }
+
+// Helper for suppressing false warning from Clang on a const char*
+// variable declared in a conditional expression always being NULL in
+// the else branch.
+struct GTEST_API_ ConstCharPtr {
+  ConstCharPtr(const char* str) : value(str) {}
+  operator bool() const { return true; }
+  const char* value;
+};
+
+// A simple Linear Congruential Generator for generating random
+// numbers with a uniform distribution. Unlike rand() and srand(), it
+// doesn't use global state (and therefore can't interfere with user
+// code). Unlike rand_r(), it's portable. An LCG isn't very random,
+// but it's good enough for our purposes.
+class GTEST_API_ Random {
+ public:
+  static const UInt32 kMaxRange = 1u << 31;
+
+  explicit Random(UInt32 seed) : state_(seed) {}
+
+  void Reseed(UInt32 seed) { state_ = seed; }
+
+  // Generates a random number from [0, range). Crashes if 'range' is
+  // 0 or greater than kMaxRange.
+  UInt32 Generate(UInt32 range);
+
+ private:
+  UInt32 state_;
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
+};
+
+// Defining a variable of type CompileAssertTypesEqual will cause a
+// compiler error iff T1 and T2 are different types.
+template <typename T1, typename T2>
+struct CompileAssertTypesEqual;
+
+template <typename T>
+struct CompileAssertTypesEqual<T, T> {
+};
+
+// Removes the reference from a type if it is a reference type,
+// otherwise leaves it unchanged. This is the same as
+// tr1::remove_reference, which is not widely available yet.
+template <typename T>
+struct RemoveReference { typedef T type; };  // NOLINT
+template <typename T>
+struct RemoveReference<T&> { typedef T type; };  // NOLINT
+
+// A handy wrapper around RemoveReference that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_REFERENCE_(T) \
+    typename ::testing::internal::RemoveReference<T>::type
+
+// Removes const from a type if it is a const type, otherwise leaves
+// it unchanged. This is the same as tr1::remove_const, which is not
+// widely available yet.
+template <typename T>
+struct RemoveConst { typedef T type; };  // NOLINT
+template <typename T>
+struct RemoveConst<const T> { typedef T type; };  // NOLINT
+
+// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above
+// definition to fail to remove the const in 'const int[3]' and 'const
+// char[3][4]'. The following specialization works around the bug.
+template <typename T, size_t N>
+struct RemoveConst<const T[N]> {
+  typedef typename RemoveConst<T>::type type[N];
+};
+
+#if defined(_MSC_VER) && _MSC_VER < 1400
+// This is the only specialization that allows VC++ 7.1 to remove const in
+// 'const int[3] and 'const int[3][4]'. However, it causes trouble with GCC
+// and thus needs to be conditionally compiled.
+template <typename T, size_t N>
+struct RemoveConst<T[N]> {
+  typedef typename RemoveConst<T>::type type[N];
+};
+#endif
+
+// A handy wrapper around RemoveConst that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_CONST_(T) \
+    typename ::testing::internal::RemoveConst<T>::type
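+// Editor's illustration (not part of the original header): these traits
+// strip one layer of reference/const, which CompileAssertTypesEqual can
+// verify at compile time, e.g.
+//
+//   CompileAssertTypesEqual<RemoveReference<int&>::type, int> a;
+//   CompileAssertTypesEqual<RemoveConst<const int>::type, int> b;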
+
+// Turns const U&, U&, const U, and U all into U.
+#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
+    GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T))
+
+// ImplicitlyConvertible<From, To>::value is a compile-time bool
+// constant that's true iff type From can be implicitly converted to
+// type To.
+template <typename From, typename To>
+class ImplicitlyConvertible {
+ private:
+  // We need the following helper functions only for their types.
+  // They have no implementations.
+
+  // MakeFrom() is an expression whose type is From. We cannot simply
+  // use From(), as the type From may not have a public default
+  // constructor.
+  static typename AddReference<From>::type MakeFrom();
+
+  // These two functions are overloaded. Given an expression
+  // Helper(x), the compiler will pick the first version if x can be
+  // implicitly converted to type To; otherwise it will pick the
+  // second version.
+  //
+  // The first version returns a value of size 1, and the second
+  // version returns a value of size 2. Therefore, by checking the
+  // size of Helper(x), which can be done at compile time, we can tell
+  // which version of Helper() is used, and hence whether x can be
+  // implicitly converted to type To.
+  static char Helper(To);
+  static char (&Helper(...))[2];  // NOLINT
+
+  // We have to put the 'public' section after the 'private' section,
+  // or MSVC refuses to compile the code.
+ public:
+#if defined(__BORLANDC__)
+  // C++Builder cannot use member overload resolution during template
+  // instantiation. The simplest workaround is to use its C++0x type traits
+  // functions (C++Builder 2009 and above only).
+  static const bool value = __is_convertible(From, To);
+#else
+  // MSVC warns about implicitly converting from double to int for
+  // possible loss of data, so we need to temporarily disable the
+  // warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4244)
+  static const bool value =
+      sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1;
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif  // __BORLANDC__
+};
+template <typename From, typename To>
+const bool ImplicitlyConvertible<From, To>::value;
+
+// IsAProtocolMessage<T>::value is a compile-time bool constant that's
+// true iff T is type ProtocolMessage, proto2::Message, or a subclass
+// of those.
+template <typename T>
+struct IsAProtocolMessage
+    : public bool_constant<
+  ImplicitlyConvertible<const T*, const ::ProtocolMessage*>::value ||
+  ImplicitlyConvertible<const T*, const ::proto2::Message*>::value> {
+};
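+// Editor's illustration (not part of the original header): the sizeof
+// trick above evaluates entirely at compile time, e.g.
+//
+//   bool b1 = ImplicitlyConvertible<int, double>::value;    // true
+//   bool b2 = ImplicitlyConvertible<int*, double*>::value;  // false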
+
+// When the compiler sees expression IsContainerTest<C>(0), if C is an
+// STL-style container class, the first overload of IsContainerTest
+// will be viable (since both C::iterator* and C::const_iterator* are
+// valid types and NULL can be implicitly converted to them). It will
+// be picked over the second overload as 'int' is a perfect match for
+// the type of argument 0. If C::iterator or C::const_iterator is not
+// a valid type, the first overload is not viable, and the second
+// overload will be picked. Therefore, we can determine whether C is
+// a container class by checking the type of IsContainerTest<C>(0).
+// The value of the expression is insignificant.
+//
+// In C++11 mode we check the existence of a const_iterator and that an
+// iterator is properly implemented for the container.
+//
+// For pre-C++11 that we look for both C::iterator and C::const_iterator.
+// The reason is that C++ injects the name of a class as a member of the
+// class itself (e.g. you can refer to class iterator as either
+// 'iterator' or 'iterator::iterator'). If we look for C::iterator
+// only, for example, we would mistakenly think that a class named
+// iterator is an STL container.
+//
+// Also note that the simpler approach of overloading
+// IsContainerTest(typename C::const_iterator*) and
+// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
+typedef int IsContainer;
+#if GTEST_LANG_CXX11
+template <class C,
+          class Iterator = decltype(::std::declval<const C&>().begin()),
+          class = decltype(::std::declval<const C&>().end()),
+          class = decltype(++::std::declval<Iterator&>()),
+          class = decltype(*::std::declval<Iterator>()),
+          class = typename C::const_iterator>
+IsContainer IsContainerTest(int /* dummy */) {
+  return 0;
+}
+#else
+template <class C>
+IsContainer IsContainerTest(int /* dummy */,
+                            typename C::iterator* /* it */ = NULL,
+                            typename C::const_iterator* /* const_it */ = NULL) {
+  return 0;
+}
+#endif  // GTEST_LANG_CXX11
+
+typedef char IsNotContainer;
+template <class C>
+IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
+
+// Trait to detect whether a type T is a hash table.
+// The heuristic used is that the type contains an inner type `hasher` and does
+// not contain an inner type `reverse_iterator`.
+// If the container is iterable in reverse, then order might actually matter.
+template <typename T>
+struct IsHashTable {
+ private:
+  template <typename U>
+  static char test(typename U::hasher*, typename U::reverse_iterator*);
+  template <typename U>
+  static int test(typename U::hasher*, ...);
+  template <typename U>
+  static char test(...);
+
+ public:
+  static const bool value = sizeof(test<T>(0, 0)) == sizeof(int);
+};
+
+template <typename T>
+const bool IsHashTable<T>::value;
+
+template <typename T>
+struct VoidT {
+  typedef void value_type;
+};
+
+template <typename T, typename = void>
+struct HasValueType : false_type {};
+template <typename T>
+struct HasValueType<T, VoidT<typename T::value_type> > : true_type {
+};
+
+template <typename C,
+          bool = sizeof(IsContainerTest<C>(0)) == sizeof(IsContainer),
+          bool = HasValueType<C>::value>
+struct IsRecursiveContainerImpl;
+
+template <typename C, bool HV>
+struct IsRecursiveContainerImpl<C, false, HV> : public false_type {};
+
+// Since the IsRecursiveContainerImpl depends on the IsContainerTest we need to
+// obey the same inconsistencies as the IsContainerTest, namely check if
+// something is a container is relying on only const_iterator in C++11 and
+// is relying on both const_iterator and iterator otherwise
+template <typename C>
+struct IsRecursiveContainerImpl<C, true, false> : public false_type {};
+
+template <typename C>
+struct IsRecursiveContainerImpl<C, true, true> {
+  #if GTEST_LANG_CXX11
+  typedef typename IteratorTraits<typename C::const_iterator>::value_type
+      value_type;
+#else
+  typedef typename IteratorTraits<typename C::iterator>::value_type value_type;
+#endif
+  typedef is_same<value_type, C> type;
+};
+
+// IsRecursiveContainer<Type> is a unary compile-time predicate that
+// evaluates whether C is a recursive container type. A recursive container
+// type is a container type whose value_type is equal to the container type
+// itself. An example for a recursive container type is
+// boost::filesystem::path, whose iterator has a value_type that is equal to
+// boost::filesystem::path.
+template <typename C>
+struct IsRecursiveContainer : public IsRecursiveContainerImpl<C>::type {};
+
+// EnableIf<bool>::type is void when 'Cond' is true, and
+// undefined when 'Cond' is false. To use SFINAE to make a function
+// overload only apply when a particular expression is true, add
+// "typename EnableIf<expression>::type* = 0" as the last parameter.
+template<bool> struct EnableIf;
+template<> struct EnableIf<true> { typedef void type; };  // NOLINT
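+// Editor's illustration (not part of the original header): EnableIf can
+// SFINAE-out an overload, e.g. this function template only exists for
+// even N:
+//
+//   template <int N>
+//   typename EnableIf<(N % 2) == 0>::type OnlyForEvenN() {}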
+
+// Utilities for native arrays.
+
+// ArrayEq() compares two k-dimensional native arrays using the
+// elements' operator==, where k can be any integer >= 0. When k is
+// 0, ArrayEq() degenerates into comparing a single pair of values.
+
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
+  return internal::ArrayEq(lhs, N, rhs);
+}
+
+// This helper reduces code bloat. If we instead put its logic inside
+// the previous ArrayEq() function, arrays with different sizes would
+// lead to different copies of the template code.
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
+  for (size_t i = 0; i != size; i++) {
+    if (!internal::ArrayEq(lhs[i], rhs[i]))
+      return false;
+  }
+  return true;
+}
+
+// Finds the first element in the iterator range [begin, end) that
+// equals elem. Element may be a native array type itself.
+template <typename Iter, typename Element>
+Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
+  for (Iter it = begin; it != end; ++it) {
+    if (internal::ArrayEq(*it, elem))
+      return it;
+  }
+  return end;
+}
+
+// CopyArray() copies a k-dimensional native array using the elements'
+// operator=, where k can be any integer >= 0. When k is 0,
+// CopyArray() degenerates into copying a single value.
+
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline void CopyArray(const T& from, U* to) { *to = from; }
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline void CopyArray(const T(&from)[N], U(*to)[N]) {
+  internal::CopyArray(from, N, *to);
+}
+
+// This helper reduces code bloat. If we instead put its logic inside
+// the previous CopyArray() function, arrays with different sizes
+// would lead to different copies of the template code.
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to) {
+  for (size_t i = 0; i != size; i++) {
+    internal::CopyArray(from[i], to + i);
+  }
+}
+
+// The relation between an NativeArray object (see below) and the
+// native array it represents.
+// We use 2 different structs to allow non-copyable types to be used, as long
+// as RelationToSourceReference() is passed.
+struct RelationToSourceReference {};
+struct RelationToSourceCopy {};
+
+// Adapts a native array to a read-only STL-style container. Instead
+// of the complete STL container concept, this adaptor only implements
+// members useful for Google Mock's container matchers. New members
+// should be added as needed. To simplify the implementation, we only
+// support Element being a raw type (i.e. having no top-level const or
+// reference modifier). It's the client's responsibility to satisfy
+// this requirement. Element can be an array type itself (hence
+// multi-dimensional arrays are supported).
+template <typename Element>
+class NativeArray {
+ public:
+  // STL-style container typedefs.
+  typedef Element value_type;
+  typedef Element* iterator;
+  typedef const Element* const_iterator;
+
+  // Constructs from a native array. References the source.
+  NativeArray(const Element* array, size_t count, RelationToSourceReference) {
+    InitRef(array, count);
+  }
+
+  // Constructs from a native array. Copies the source.
+  NativeArray(const Element* array, size_t count, RelationToSourceCopy) {
+    InitCopy(array, count);
+  }
+
+  // Copy constructor.
+  NativeArray(const NativeArray& rhs) {
+    (this->*rhs.clone_)(rhs.array_, rhs.size_);
+  }
+
+  ~NativeArray() {
+    if (clone_ != &NativeArray::InitRef)
+      delete[] array_;
+  }
+
+  // STL-style container methods.
+  size_t size() const { return size_; }
+  const_iterator begin() const { return array_; }
+  const_iterator end() const { return array_ + size_; }
+  bool operator==(const NativeArray& rhs) const {
+    return size() == rhs.size() &&
+        ArrayEq(begin(), size(), rhs.begin());
+  }
+
+ private:
+  enum {
+    kCheckTypeIsNotConstOrAReference = StaticAssertTypeEqHelper<
+        Element, GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>::value
+  };
+
+  // Initializes this object with a copy of the input.
+  void InitCopy(const Element* array, size_t a_size) {
+    Element* const copy = new Element[a_size];
+    CopyArray(array, a_size, copy);
+    array_ = copy;
+    size_ = a_size;
+    clone_ = &NativeArray::InitCopy;
+  }
+
+  // Initializes this object with a reference of the input.
+  void InitRef(const Element* array, size_t a_size) {
+    array_ = array;
+    size_ = a_size;
+    clone_ = &NativeArray::InitRef;
+  }
+
+  const Element* array_;
+  size_t size_;
+  void (NativeArray::*clone_)(const Element*, size_t);
+
+  GTEST_DISALLOW_ASSIGN_(NativeArray);
+};
+
+}  // namespace internal
+}  // namespace testing
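+// Editor's illustration (not part of the original header): adapting a
+// native array without copying it, then iterating STL-style:
+//
+//   const int a[3] = {1, 2, 3};
+//   ::testing::internal::NativeArray<int> view(
+//       a, 3, ::testing::internal::RelationToSourceReference());
+//   // view.size() == 3, *view.begin() == 1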
{ \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \ + fail("Expected: " #statement " doesn't throw an exception.\n" \ + " Actual: it throws.") + +#define GTEST_TEST_ANY_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + bool gtest_caught_any = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + catch (...) { \ + gtest_caught_any = true; \ + } \ + if (!gtest_caught_any) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \ + fail("Expected: " #statement " throws an exception.\n" \ + " Actual: it doesn't.") + + +// Implements Boolean test assertions such as EXPECT_TRUE. expression can be +// either a boolean expression or an AssertionResult. text is a textual +// represenation of expression as it was passed into the EXPECT_TRUE. +#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar_ = \ + ::testing::AssertionResult(expression)) \ + ; \ + else \ + fail(::testing::internal::GetBoolAssertionFailureMessage(\ + gtest_ar_, text, #actual, #expected).c_str()) + +#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \ + fail("Expected: " #statement " doesn't generate new fatal " \ + "failures in the current thread.\n" \ + " Actual: it does.") + +// Expands to the name of the class that implements the given test. +#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ + test_case_name##_##test_name##_Test + +// Helper macro for defining tests. +#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\ +class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\ + public:\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ + private:\ + virtual void TestBody();\ + static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\ + GTEST_DISALLOW_COPY_AND_ASSIGN_(\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\ +};\ +\ +::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\ + ::test_info_ =\ + ::testing::internal::MakeAndRegisterTestInfo(\ + #test_case_name, #test_name, NULL, NULL, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + (parent_id), \ + parent_class::SetUpTestCase, \ + parent_class::TearDownTestCase, \ + new ::testing::internal::TestFactoryImpl<\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ +void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the public API for death tests. It is +// #included by gtest.h so a user doesn't need to include this +// directly. +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ + +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines internal utilities needed for implementing +// death tests. They are subject to change without notice. 
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+
+#include <stdio.h>
+
+namespace testing {
+namespace internal {
+
+GTEST_DECLARE_string_(internal_run_death_test);
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kDeathTestStyleFlag[] = "death_test_style";
+const char kDeathTestUseFork[] = "death_test_use_fork";
+const char kInternalRunDeathTestFlag[] = "internal_run_death_test";
+
+#if GTEST_HAS_DEATH_TEST
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// DeathTest is a class that hides much of the complexity of the
+// GTEST_DEATH_TEST_ macro. It is abstract; its static Create method
+// returns a concrete class that depends on the prevailing death test
+// style, as defined by the --gtest_death_test_style and/or
+// --gtest_internal_run_death_test flags.
+
+// In describing the results of death tests, these terms are used with
+// the corresponding definitions:
+//
+// exit status: The integer exit information in the format specified
+//              by wait(2)
+// exit code:   The integer code passed to exit(3), _exit(2), or
+//              returned from main()
+class GTEST_API_ DeathTest {
+ public:
+  // Create returns false if there was an error determining the
+  // appropriate action to take for the current death test; for example,
+  // if the gtest_death_test_style flag is set to an invalid value.
+  // The LastMessage method will return a more detailed message in that
+  // case. Otherwise, the DeathTest pointer pointed to by the "test"
+  // argument is set. If the death test should be skipped, the pointer
+  // is set to NULL; otherwise, it is set to the address of a new concrete
+  // DeathTest object that controls the execution of the current test.
+  static bool Create(const char* statement, const RE* regex,
+                     const char* file, int line, DeathTest** test);
+  DeathTest();
+  virtual ~DeathTest() { }
+
+  // A helper class that aborts a death test when it's deleted.
+  class ReturnSentinel {
+   public:
+    explicit ReturnSentinel(DeathTest* test) : test_(test) { }
+    ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+   private:
+    DeathTest* const test_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
+  } GTEST_ATTRIBUTE_UNUSED_;
+
+  // An enumeration of possible roles that may be taken when a death
+  // test is encountered. EXECUTE means that the death test logic should
+  // be executed immediately. OVERSEE means that the program should prepare
+  // the appropriate environment for a child process to execute the death
+  // test, then wait for it to complete.
+  enum TestRole { OVERSEE_TEST, EXECUTE_TEST };
+
+  // An enumeration of the three reasons that a test might be aborted.
+  enum AbortReason {
+    TEST_ENCOUNTERED_RETURN_STATEMENT,
+    TEST_THREW_EXCEPTION,
+    TEST_DID_NOT_DIE
+  };
+
+  // Assumes one of the above roles.
+  virtual TestRole AssumeRole() = 0;
+
+  // Waits for the death test to finish and returns its status.
+  virtual int Wait() = 0;
+
+  // Returns true if the death test passed; that is, the test process
+  // exited during the test, its exit status matches a user-supplied
+  // predicate, and its stderr output matches a user-supplied regular
+  // expression.
+  // The user-supplied predicate may be a macro expression rather
+  // than a function pointer or functor, or else Wait and Passed could
+  // be combined.
+ virtual bool Passed(bool exit_status_ok) = 0; + + // Signals that the death test did not die as expected. + virtual void Abort(AbortReason reason) = 0; + + // Returns a human-readable outcome message regarding the outcome of + // the last death test. + static const char* LastMessage(); + + static void set_last_death_test_message(const std::string& message); + + private: + // A string containing a description of the outcome of the last death test. + static std::string last_death_test_message_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); +}; + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +// Factory interface for death tests. May be mocked out for testing. +class DeathTestFactory { + public: + virtual ~DeathTestFactory() { } + virtual bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test) = 0; +}; + +// A concrete DeathTestFactory implementation for normal use. +class DefaultDeathTestFactory : public DeathTestFactory { + public: + virtual bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test); +}; + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +GTEST_API_ bool ExitedUnsuccessfully(int exit_status); + +// Traps C++ exceptions escaping statement and reports them as test +// failures. Note that trapping SEH exceptions is not implemented here. +# if GTEST_HAS_EXCEPTIONS +# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (const ::std::exception& gtest_exception) { \ + fprintf(\ + stderr, \ + "\n%s: Caught std::exception-derived exception escaping the " \ + "death test statement. Exception message: %s\n", \ + ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ + gtest_exception.what()); \ + fflush(stderr); \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } catch (...) { \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } + +# else +# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) + +# endif + +// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, +// ASSERT_EXIT*, and EXPECT_EXIT*. 
+# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    const ::testing::internal::RE& gtest_regex = (regex); \
+    ::testing::internal::DeathTest* gtest_dt; \
+    if (!::testing::internal::DeathTest::Create(#statement, &gtest_regex, \
+        __FILE__, __LINE__, &gtest_dt)) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+    } \
+    if (gtest_dt != NULL) { \
+      ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \
+          gtest_dt_ptr(gtest_dt); \
+      switch (gtest_dt->AssumeRole()) { \
+        case ::testing::internal::DeathTest::OVERSEE_TEST: \
+          if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \
+            goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+          } \
+          break; \
+        case ::testing::internal::DeathTest::EXECUTE_TEST: { \
+          ::testing::internal::DeathTest::ReturnSentinel \
+              gtest_sentinel(gtest_dt); \
+          GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \
+          gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \
+          break; \
+        } \
+        default: \
+          break; \
+      } \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \
+      fail(::testing::internal::DeathTest::LastMessage())
+// The symbol "fail" here expands to something into which a message
+// can be streamed.
+
+// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
+// NDEBUG mode. In this case we need the statements to be executed and the macro
+// must accept a streamed message even though the message is never printed.
+// The regex object is not evaluated, but it is used to prevent "unused"
+// warnings and to avoid an expression that doesn't compile in debug mode.
+#define GTEST_EXECUTE_STATEMENT_(statement, regex) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+  } else if (!::testing::internal::AlwaysTrue()) { \
+    const ::testing::internal::RE& gtest_regex = (regex); \
+    static_cast<void>(gtest_regex); \
+  } else \
+    ::testing::Message()
+
+// A class representing the parsed contents of the
+// --gtest_internal_run_death_test flag, as it existed when
+// RUN_ALL_TESTS was called.
+class InternalRunDeathTestFlag {
+ public:
+  InternalRunDeathTestFlag(const std::string& a_file,
+                           int a_line,
+                           int an_index,
+                           int a_write_fd)
+      : file_(a_file), line_(a_line), index_(an_index),
+        write_fd_(a_write_fd) {}
+
+  ~InternalRunDeathTestFlag() {
+    if (write_fd_ >= 0)
+      posix::Close(write_fd_);
+  }
+
+  const std::string& file() const { return file_; }
+  int line() const { return line_; }
+  int index() const { return index_; }
+  int write_fd() const { return write_fd_; }
+
+ private:
+  std::string file_;
+  int line_;
+  int index_;
+  int write_fd_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag);
+};
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+namespace testing {
+
+// This flag controls the style of death tests. Valid values are "threadsafe",
+// meaning that the death test child process will re-execute the test binary
+// from the start, running only a single death test, or "fast",
+// meaning that the child process will execute the test logic immediately
+// after forking.
+GTEST_DECLARE_string_(death_test_style);
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process. Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests. IMPORTANT: This is an internal utility. Using it may break the
+// implementation of death tests. User code MUST NOT use it.
+GTEST_API_ bool InDeathTestChild();
+
+}  // namespace internal
+
+// The following macros are useful for writing death tests.
+
+// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
+// executed:
+//
+// 1. It generates a warning if there is more than one active
+// thread. This is because it's safe to fork() or clone() only
+// when there is a single thread.
+//
+// 2. The parent process clone()s a sub-process and runs the death
+// test in it; the sub-process exits with code 0 at the end of the
+// death test, if it hasn't exited already.
+//
+// 3. The parent process waits for the sub-process to terminate.
+//
+// 4. The parent process checks the exit code and error message of
+// the sub-process.
+//
+// Examples:
+//
+//   ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
+//   for (int i = 0; i < 5; i++) {
+//     EXPECT_DEATH(server.ProcessRequest(i),
+//                  "Invalid request .* in ProcessRequest()")
+//         << "Failed to die on request " << i;
+//   }
+//
+//   ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
+//
+//   bool KilledBySIGHUP(int exit_code) {
+//     return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
+//   }
+//
+//   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
+//
+// On the regular expressions used in death tests:
+//
+// GOOGLETEST_CM0005 DO NOT DELETE
+// On POSIX-compliant systems (*nix), we use the <regex.h> library,
+// which uses the POSIX extended regex syntax.
+//
+// On other platforms (e.g. Windows or Mac), we only support a simple regex
+// syntax implemented as part of Google Test. This limited
+// implementation should be enough most of the time when writing
+// death tests; though it lacks many features you can find in PCRE
+// or POSIX extended regex syntax. For example, we don't support
+// union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
+// repetition count ("x{5,7}"), among others.
+//
+// Below is the syntax that we do support. We chose it to be a
+// subset of both PCRE and POSIX extended regex, so it's easy to
+// learn wherever you come from. In the following: 'A' denotes a
+// literal character, period (.), or a single \\ escape sequence;
+// 'x' and 'y' denote regular expressions; 'm' and 'n' are for
+// natural numbers.
+//
+//   c     matches any literal character c
+//   \\d   matches any decimal digit
+//   \\D   matches any character that's not a decimal digit
+//   \\f   matches \f
+//   \\n   matches \n
+//   \\r   matches \r
+//   \\s   matches any ASCII whitespace, including \n
+//   \\S   matches any character that's not a whitespace
+//   \\t   matches \t
+//   \\v   matches \v
+//   \\w   matches any letter, _, or decimal digit
+//   \\W   matches any character that \\w doesn't match
+//   \\c   matches any literal character c, which must be a punctuation
+//   .     matches any single character except \n
+//   A?
matches 0 or 1 occurrences of A +// A* matches 0 or many occurrences of A +// A+ matches 1 or many occurrences of A +// ^ matches the beginning of a string (not that of each line) +// $ matches the end of a string (not that of each line) +// xy matches x followed by y +// +// If you accidentally use PCRE or POSIX extended regex features +// not implemented by us, you will get a run-time failure. In that +// case, please try to rewrite your regular expression within the +// above syntax. +// +// This implementation is *not* meant to be as highly tuned or robust +// as a compiled regex library, but should perform well enough for a +// death test, which already incurs significant overhead by launching +// a child process. +// +// Known caveats: +// +// A "threadsafe" style death test obtains the path to the test +// program from argv[0] and re-executes it in the sub-process. For +// simplicity, the current implementation doesn't search the PATH +// when launching the sub-process. This means that the user must +// invoke the test program via a path that contains at least one +// path separator (e.g. path/to/foo_test and +// /absolute/path/to/bar_test are fine, but foo_test is not). This +// is rarely a problem as people usually don't put the test binary +// directory in PATH. +// +// FIXME: make thread-safe death tests search the PATH. + +// Asserts that a given statement causes the program to exit, with an +// integer exit status that satisfies predicate, and emitting error output +// that matches regex. +# define ASSERT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) + +// Like ASSERT_EXIT, but continues on to successive tests in the +// test case, if any: +# define EXPECT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) + +// Asserts that a given statement causes the program to exit, either by +// explicitly exiting with a nonzero exit code or being killed by a +// signal, and emitting error output that matches regex. +# define ASSERT_DEATH(statement, regex) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Like ASSERT_DEATH, but continues on to successive tests in the +// test case, if any: +# define EXPECT_DEATH(statement, regex) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: + +// Tests that an exit code describes a normal exit with a given exit code. +class GTEST_API_ ExitedWithCode { + public: + explicit ExitedWithCode(int exit_code); + bool operator()(int exit_status) const; + private: + // No implementation - assignment is unsupported. + void operator=(const ExitedWithCode& other); + + const int exit_code_; +}; + +# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +// Tests that an exit code describes an exit due to termination by a +// given signal. +// GOOGLETEST_CM0006 DO NOT DELETE +class GTEST_API_ KilledBySignal { + public: + explicit KilledBySignal(int signum); + bool operator()(int exit_status) const; + private: + const int signum_; +}; +# endif // !GTEST_OS_WINDOWS + +// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. +// The death testing framework causes this to have interesting semantics, +// since the sideeffects of the call are only visible in opt mode, and not +// in debug mode. 
+// +// In practice, this can be used to test functions that utilize the +// LOG(DFATAL) macro using the following style: +// +// int DieInDebugOr12(int* sideeffect) { +// if (sideeffect) { +// *sideeffect = 12; +// } +// LOG(DFATAL) << "death"; +// return 12; +// } +// +// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) { +// int sideeffect = 0; +// // Only asserts in dbg. +// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); +// +// #ifdef NDEBUG +// // opt-mode has sideeffect visible. +// EXPECT_EQ(12, sideeffect); +// #else +// // dbg-mode no visible sideeffect. +// EXPECT_EQ(0, sideeffect); +// #endif +// } +// +// This will assert that DieInDebugReturn12InOpt() crashes in debug +// mode, usually due to a DCHECK or LOG(DFATAL), but returns the +// appropriate fallback value (12 in this case) in opt mode. If you +// need to test that a function has appropriate side-effects in opt +// mode, include assertions against the side-effects. A general +// pattern for this is: +// +// EXPECT_DEBUG_DEATH({ +// // Side-effects here will have an effect after this statement in +// // opt mode, but none in debug mode. +// EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); +// }, "death"); +// +# ifdef NDEBUG + +# define EXPECT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +# define ASSERT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +# else + +# define EXPECT_DEBUG_DEATH(statement, regex) \ + EXPECT_DEATH(statement, regex) + +# define ASSERT_DEBUG_DEATH(statement, regex) \ + ASSERT_DEATH(statement, regex) + +# endif // NDEBUG for EXPECT_DEBUG_DEATH +#endif // GTEST_HAS_DEATH_TEST + +// This macro is used for implementing macros such as +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where +// death tests are not supported. Those macros must compile on such systems +// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on +// systems that support death tests. This allows one to write such a macro +// on a system that does not support death tests and be sure that it will +// compile on a death-test supporting system. It is exposed publicly so that +// systems that have death-tests with stricter requirements than +// GTEST_HAS_DEATH_TEST can write their own equivalent of +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED. +// +// Parameters: +// statement - A statement that a macro such as EXPECT_DEATH would test +// for program termination. This macro has to make sure this +// statement is compiled but not executed, to ensure that +// EXPECT_DEATH_IF_SUPPORTED compiles with a certain +// parameter iff EXPECT_DEATH compiles with it. +// regex - A regex that a macro such as EXPECT_DEATH would use to test +// the output of statement. This parameter has to be +// compiled but not evaluated by this macro, to ensure that +// this macro only accepts expressions that a macro such as +// EXPECT_DEATH would accept. +// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED +// and a return statement for ASSERT_DEATH_IF_SUPPORTED. +// This ensures that ASSERT_DEATH_IF_SUPPORTED will not +// compile inside functions where ASSERT_DEATH doesn't +// compile. +// +// The branch that has an always false condition is used to ensure that +// statement and regex are compiled (and thus syntactically correct) but +// never executed. The unreachable code macro protects the terminator +// statement from generating an 'unreachable code' warning in case +// statement unconditionally returns or throws. 
The Message constructor at +// the end allows the syntax of streaming additional messages into the +// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. +# define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) \ + << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() + +// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and +// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if +// death tests are supported; otherwise they just issue a warning. This is +// useful when you are combining death test assertions with normal test +// assertions in one test. +#if GTEST_HAS_DEATH_TEST +# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + EXPECT_DEATH(statement, regex) +# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + ASSERT_DEATH(statement, regex) +#else +# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, ) +# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return) +#endif + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +// This file was GENERATED by command: +// pump.py gtest-param-test.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Macros and functions for implementing parameterized tests +// in Google C++ Testing and Mocking Framework (Google Test) +// +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! 
+//
+// GOOGLETEST_CM0001 DO NOT DELETE
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+
+
+// Value-parameterized tests allow you to test your code with different
+// parameters without writing multiple copies of the same test.
+//
+// Here is how you use value-parameterized tests:
+
+#if 0
+
+// To write value-parameterized tests, first you should define a fixture
+// class. It is usually derived from testing::TestWithParam<T> (see below for
+// another inheritance scheme that's sometimes useful in more complicated
+// class hierarchies), where T is the type of your parameter values.
+// TestWithParam<T> is itself derived from testing::Test. T can be any
+// copyable type. If it's a raw pointer, you are responsible for managing the
+// lifespan of the pointed values.
+
+class FooTest : public ::testing::TestWithParam<const char*> {
+  // You can implement all the usual class fixture members here.
+};
+
+// Then, use the TEST_P macro to define as many parameterized tests
+// for this fixture as you want. The _P suffix is for "parameterized"
+// or "pattern", whichever you prefer to think.
+
+TEST_P(FooTest, DoesBlah) {
+  // Inside a test, access the test parameter with the GetParam() method
+  // of the TestWithParam<T> class:
+  EXPECT_TRUE(foo.Blah(GetParam()));
+  ...
+}
+
+TEST_P(FooTest, HasBlahBlah) {
+  ...
+}
+
+// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test
+// case with any set of parameters you want. Google Test defines a number
+// of functions for generating test parameters. They return what we call
+// (surprise!) parameter generators. Here is a summary of them, which
+// are all in the testing namespace:
+//
+//
+//  Range(begin, end [, step]) - Yields values {begin, begin+step,
+//                               begin+step+step, ...}. The values do not
+//                               include end. step defaults to 1.
+//  Values(v1, v2, ..., vN)    - Yields values {v1, v2, ..., vN}.
+//  ValuesIn(container)        - Yields values from a C-style array, an STL
+//  ValuesIn(begin,end)          container, or an iterator range [begin, end).
+//  Bool()                     - Yields sequence {false, true}.
+//  Combine(g1, g2, ..., gN)   - Yields all combinations (the Cartesian product
+//                               for the math savvy) of the values generated
+//                               by the N generators.
+//
+// For more details, see comments at the definitions of these functions below
+// in this file.
+//
+// The following statement will instantiate tests from the FooTest test case
+// each with parameter values "meeny", "miny", and "moe".
+
+INSTANTIATE_TEST_CASE_P(InstantiationName,
+                        FooTest,
+                        Values("meeny", "miny", "moe"));
+
+// To distinguish different instances of the pattern, (yes, you
+// can instantiate it more than once) the first argument to the
+// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the
+// actual test case name. Remember to pick unique prefixes for different
+// instantiations. The tests from the instantiation above will have
+// these names:
+//
+// * InstantiationName/FooTest.DoesBlah/0 for "meeny"
+// * InstantiationName/FooTest.DoesBlah/1 for "miny"
+// * InstantiationName/FooTest.DoesBlah/2 for "moe"
+// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny"
+// * InstantiationName/FooTest.HasBlahBlah/1 for "miny"
+// * InstantiationName/FooTest.HasBlahBlah/2 for "moe"
+//
+// You can use these names in --gtest_filter.
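+//
+// For example (an illustrative sketch, not part of this header; the test
+// binary name "foo_test" is hypothetical), the following invocation would
+// run only the first DoesBlah instance listed above:
+//
+//   ./foo_test --gtest_filter=InstantiationName/FooTest.DoesBlah/0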
+//
+// This statement will instantiate all tests from FooTest again, each
+// with parameter values "cat" and "dog":
+
+const char* pets[] = {"cat", "dog"};
+INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
+
+// The tests from the instantiation above will have these names:
+//
+// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat"
+// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog"
+// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
+// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
+//
+// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests
+// in the given test case, whether their definitions come before or
+// AFTER the INSTANTIATE_TEST_CASE_P statement.
+//
+// Please also note that generator expressions (including parameters to the
+// generators) are evaluated in InitGoogleTest(), after main() has started.
+// This allows the user, on one hand, to adjust generator parameters in order
+// to dynamically determine a set of tests to run and, on the other hand, to
+// inspect the generated tests with the Google Test reflection API before
+// RUN_ALL_TESTS() is executed.
+//
+// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc
+// for more examples.
+//
+// In the future, we plan to publish the API for defining new parameter
+// generators. But for now this interface remains part of the internal
+// implementation and is subject to change.
+//
+//
+// A parameterized test fixture must be derived from testing::Test and from
+// testing::WithParamInterface<T>, where T is the type of the parameter
+// values. Inheriting from TestWithParam<T> satisfies that requirement because
+// TestWithParam<T> inherits from both Test and WithParamInterface<T>. In more
+// complicated hierarchies, however, it is occasionally useful to inherit
+// separately from Test and WithParamInterface. For example:
+
+class BaseTest : public ::testing::Test {
+  // You can inherit all the usual members for a non-parameterized test
+  // fixture here.
+};
+
+class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> {
+  // The usual test fixture members go here too.
+};
+
+TEST_F(BaseTest, HasFoo) {
+  // This is an ordinary non-parameterized test.
+}
+
+TEST_P(DerivedTest, DoesBlah) {
+  // GetParam works just the same here as if you inherit from TestWithParam.
+  EXPECT_TRUE(foo.Blah(GetParam()));
+}
+
+#endif  // 0
+
+
+#if !GTEST_OS_SYMBIAN
+# include <utility>
+#endif
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Type and function utilities for implementing parameterized tests.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+
+#include <ctype.h>
+
+#include <iterator>
+#include <set>
+#include <utility>
+#include <vector>
+
+// Copyright 2003 Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// A "smart" pointer type with reference tracking. Every pointer to a
+// particular object is kept on a circular linked list. When the last pointer
+// to an object is destroyed or reassigned, the object is deleted.
+//
+// Used properly, this deletes the object when the last reference goes away.
+// There are several caveats:
+// - Like all reference counting schemes, cycles lead to leaks.
+// - Each smart pointer is actually two pointers (8 bytes instead of 4).
+// - Every time a pointer is assigned, the entire list of pointers to that
+//   object is traversed. This class is therefore NOT SUITABLE when there
+//   will often be more than two or three pointers to a particular object.
+// - References are only tracked as long as linked_ptr<> objects are copied.
+//   If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS
+//   will happen (double deletion).
+//
+// A good use of this class is storing object references in STL containers.
+// You can safely put linked_ptr<> in a vector<>.
+// Other uses may not be as good.
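+//
+// A minimal usage sketch (hypothetical user code, assuming a user-defined
+// type Foo; not part of this header). Copies are reference-tracked, so the
+// Foo object below is deleted exactly once, when the vector goes away:
+//
+//   std::vector<linked_ptr<Foo> > v;
+//   v.push_back(make_linked_ptr(new Foo));
+//   v.push_back(v.back());  // Both elements now share the same Foo.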
+//
+// Note: If you use an incomplete type with linked_ptr<>, the class
+// *containing* linked_ptr<> must have a constructor and destructor (even
+// if they do nothing!).
+//
+// Bill Gibbons suggested we use something like this.
+//
+// Thread Safety:
+// Unlike other linked_ptr implementations, in this implementation
+// a linked_ptr object is thread-safe in the sense that:
+// - it's safe to copy linked_ptr objects concurrently,
+// - it's safe to copy *from* a linked_ptr and read its underlying
+//   raw pointer (e.g. via get()) concurrently, and
+// - it's safe to write to two linked_ptrs that point to the same
+//   shared object concurrently.
+// FIXME: rename this to safe_linked_ptr to avoid
+// confusion with normal linked_ptr.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+
+#include <stdlib.h>
+#include <assert.h>
+
+
+namespace testing {
+namespace internal {
+
+// Protects copying of all linked_ptr objects.
+GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// This is used internally by all instances of linked_ptr<>. It needs to be
+// a non-template class because different types of linked_ptr<> can refer to
+// the same object (linked_ptr<Superclass>(obj) vs linked_ptr<Subclass>(obj)).
+// So, it needs to be possible for different types of linked_ptr to participate
+// in the same circular linked list, so we need a single class type here.
+//
+// DO NOT USE THIS CLASS DIRECTLY YOURSELF. Use linked_ptr<T>.
+class linked_ptr_internal {
+ public:
+  // Create a new circle that includes only this instance.
+  void join_new() {
+    next_ = this;
+  }
+
+  // Many linked_ptr operations may change p.link_ for some linked_ptr
+  // variable p in the same circle as this object. Therefore we need
+  // to prevent two such operations from occurring concurrently.
+  //
+  // Note that different types of linked_ptr objects can coexist in a
+  // circle (e.g. linked_ptr<Base>, linked_ptr<Derived1>, and
+  // linked_ptr<Derived2>). Therefore we must use a single mutex to
+  // protect all linked_ptr objects. This can create serious
+  // contention in production code, but is acceptable in a testing
+  // framework.
+
+  // Join an existing circle.
+  void join(linked_ptr_internal const* ptr)
+      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+    MutexLock lock(&g_linked_ptr_mutex);
+
+    linked_ptr_internal const* p = ptr;
+    while (p->next_ != ptr) {
+      assert(p->next_ != this &&
+             "Trying to join() a linked ring we are already in. "
+             "Is GMock thread safety enabled?");
+      p = p->next_;
+    }
+    p->next_ = this;
+    next_ = ptr;
+  }
+
+  // Leave whatever circle we're part of. Returns true if we were the
+  // last member of the circle. Once this is done, you can join() another.
+  bool depart()
+      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+    MutexLock lock(&g_linked_ptr_mutex);
+
+    if (next_ == this) return true;
+    linked_ptr_internal const* p = next_;
+    while (p->next_ != this) {
+      assert(p->next_ != next_ &&
+             "Trying to depart() a linked ring we are not in. "
+             "Is GMock thread safety enabled?");
+      p = p->next_;
+    }
+    p->next_ = next_;
+    return false;
+  }
+
+ private:
+  mutable linked_ptr_internal const* next_;
+};
+
+template <typename T>
+class linked_ptr {
+ public:
+  typedef T element_type;
+
+  // Take over ownership of a raw pointer. This should happen as soon as
+  // possible after the object is created.
+  explicit linked_ptr(T* ptr = NULL) { capture(ptr); }
+  ~linked_ptr() { depart(); }
+
+  // Copy an existing linked_ptr<>, adding ourselves to the list of references.
+  template <typename U> linked_ptr(linked_ptr<U> const& ptr) { copy(&ptr); }
+  linked_ptr(linked_ptr const& ptr) {  // NOLINT
+    assert(&ptr != this);
+    copy(&ptr);
+  }
+
+  // Assignment releases the old value and acquires the new.
+  template <typename U> linked_ptr& operator=(linked_ptr<U> const& ptr) {
+    depart();
+    copy(&ptr);
+    return *this;
+  }
+
+  linked_ptr& operator=(linked_ptr const& ptr) {
+    if (&ptr != this) {
+      depart();
+      copy(&ptr);
+    }
+    return *this;
+  }
+
+  // Smart pointer members.
+  void reset(T* ptr = NULL) {
+    depart();
+    capture(ptr);
+  }
+  T* get() const { return value_; }
+  T* operator->() const { return value_; }
+  T& operator*() const { return *value_; }
+
+  bool operator==(T* p) const { return value_ == p; }
+  bool operator!=(T* p) const { return value_ != p; }
+  template <typename U>
+  bool operator==(linked_ptr<U> const& ptr) const {
+    return value_ == ptr.get();
+  }
+  template <typename U>
+  bool operator!=(linked_ptr<U> const& ptr) const {
+    return value_ != ptr.get();
+  }
+
+ private:
+  template <typename U>
+  friend class linked_ptr;
+
+  T* value_;
+  linked_ptr_internal link_;
+
+  void depart() {
+    if (link_.depart()) delete value_;
+  }
+
+  void capture(T* ptr) {
+    value_ = ptr;
+    link_.join_new();
+  }
+
+  template <typename U> void copy(linked_ptr<U> const* ptr) {
+    value_ = ptr->get();
+    if (value_)
+      link_.join(&ptr->link_);
+    else
+      link_.join_new();
+  }
+};
+
+template <typename T> inline
+bool operator==(T* ptr, const linked_ptr<T>& x) {
+  return ptr == x.get();
+}
+
+template <typename T> inline
+bool operator!=(T* ptr, const linked_ptr<T>& x) {
+  return ptr != x.get();
+}
+
+// A function to convert T* into linked_ptr<T>
+// Doing e.g. make_linked_ptr(new FooBarBaz<type>(arg)) is a shorter notation
+// for linked_ptr<FooBarBaz<type> >(new FooBarBaz<type>(arg))
+template <typename T>
+linked_ptr<T> make_linked_ptr(T* ptr) {
+  return linked_ptr<T>(ptr);
+}
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Test - The Google C++ Testing and Mocking Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// A user can teach this function how to print a class type T by
+// defining either operator<<() or PrintTo() in the namespace that
+// defines T. More specifically, the FIRST defined function in the
+// following list will be used (assuming T is defined in namespace
+// foo):
+//
+// 1. foo::PrintTo(const T&, ostream*)
+// 2. operator<<(ostream&, const T&) defined in either foo or the
+//    global namespace.
+//
+// However if T is an STL-style container then it is printed element-wise
+// unless foo::PrintTo(const T&, ostream*) is defined. Note that
+// operator<<() is ignored for container types.
+//
+// If none of the above is defined, it will print the debug string of
+// the value if it is a protocol buffer, or print the raw bytes in the
+// value otherwise.
+//
+// To aid debugging: when T is a reference type, the address of the
+// value is also printed; when T is a (const) char pointer, both the
+// pointer value and the NUL-terminated string it points to are
+// printed.
+//
+// We also provide some convenient wrappers:
+//
+//   // Prints a value to a string. For a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   std::string ::testing::PrintToString(const T& value);
+//
+//   // Prints a value tersely: for a reference type, the referenced
+//   // value (but not the address) is printed; for a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   void ::testing::internal::UniversalTersePrint(const T& value, ostream*);
+//
+//   // Prints value using the type inferred by the compiler. The difference
+//   // from UniversalTersePrint() is that this function prints both the
+//   // pointer and the NUL-terminated string for a (const or not) char pointer.
+//   void ::testing::internal::UniversalPrint(const T& value, ostream*);
+//
+//   // Prints the fields of a tuple tersely to a string vector, one
+//   // element for each field. Tuple support must be enabled in
+//   // gtest-port.h.
+//   std::vector<string> UniversalTersePrintTupleFieldsToStrings(
+//       const Tuple& value);
+//
+// Known limitation:
+//
+// The print primitives print the elements of an STL-style container
+// using the compiler-inferred type of *iter where iter is a
+// const_iterator of the container. When const_iterator is an input
+// iterator but not a forward iterator, this inferred type may not
+// match value_type, and the print output may be incorrect. In
+// practice, this is rarely a problem as for most containers
+// const_iterator is a forward iterator. We'll fix this if there's an
+// actual need for it. Note that this fix cannot rely on value_type
+// being defined as many user-defined container types don't have
+// value_type.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#if GTEST_HAS_STD_TUPLE_
+# include <tuple>
+#endif
+
+#if GTEST_HAS_ABSL
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "absl/types/variant.h"
+#endif  // GTEST_HAS_ABSL
+
+namespace testing {
+
+// Definitions in the 'internal' and 'internal2' name spaces are
+// subject to change without notice. DO NOT USE THEM IN USER CODE!
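+//
+// A minimal sketch of the customization point documented above (hypothetical
+// user code, not part of this header): defining PrintTo() in the namespace
+// that defines the type makes failure messages print it readably.
+//
+//   namespace bar {
+//   struct Bar { int x; };
+//   void PrintTo(const Bar& b, ::std::ostream* os) {
+//     *os << "Bar(" << b.x << ")";
+//   }
+//   }  // namespace bar
+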
+namespace internal2 {
+
+// Prints the given number of bytes in the given object to the given
+// ostream.
+GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
+                                     size_t count,
+                                     ::std::ostream* os);
+
+// For selecting which printer to use when a given type has neither <<
+// nor PrintTo().
+enum TypeKind {
+  kProtobuf,              // a protobuf type
+  kConvertibleToInteger,  // a type implicitly convertible to BiggestInt
+                          // (e.g. a named or unnamed enum type)
+#if GTEST_HAS_ABSL
+  kConvertibleToStringView,  // a type implicitly convertible to
+                             // absl::string_view
+#endif
+  kOtherType  // anything else
+};
+
+// TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is called
+// by the universal printer to print a value of type T when neither
+// operator<< nor PrintTo() is defined for T, where kTypeKind is the
+// "kind" of T as defined by enum TypeKind.
+template <typename T, TypeKind kTypeKind>
+class TypeWithoutFormatter {
+ public:
+  // This default version is called when kTypeKind is kOtherType.
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    PrintBytesInObjectTo(static_cast<const unsigned char*>(
+                             reinterpret_cast<const void*>(&value)),
+                         sizeof(value), os);
+  }
+};
+
+// We print a protobuf using its ShortDebugString() when the string
+// doesn't exceed this many characters; otherwise we print it using
+// DebugString() for better readability.
+const size_t kProtobufOneLinerMaxLength = 50;
+
+template <typename T>
+class TypeWithoutFormatter<T, kProtobuf> {
+ public:
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    std::string pretty_str = value.ShortDebugString();
+    if (pretty_str.length() > kProtobufOneLinerMaxLength) {
+      pretty_str = "\n" + value.DebugString();
+    }
+    *os << ("<" + pretty_str + ">");
+  }
+};
+
+template <typename T>
+class TypeWithoutFormatter<T, kConvertibleToInteger> {
+ public:
+  // Since T has no << operator or PrintTo() but can be implicitly
+  // converted to BiggestInt, we print it as a BiggestInt.
+  //
+  // Most likely T is an enum type (either named or unnamed), in which
+  // case printing it as an integer is the desired behavior. In case
+  // T is not an enum, printing it as an integer is the best we can do
+  // given that it has no user-defined printer.
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    const internal::BiggestInt kBigInt = value;
+    *os << kBigInt;
+  }
+};
+
+#if GTEST_HAS_ABSL
+template <typename T>
+class TypeWithoutFormatter<T, kConvertibleToStringView> {
+ public:
+  // Since T has neither operator<< nor PrintTo() but can be implicitly
+  // converted to absl::string_view, we print it as an absl::string_view.
+  //
+  // Note: the implementation is further below, as it depends on
+  // internal::PrintTo symbol which is defined later in the file.
+  static void PrintValue(const T& value, ::std::ostream* os);
+};
+#endif
+
+// Prints the given value to the given ostream. If the value is a
+// protocol message, its debug string is printed; if it's an enum or
+// of a type implicitly convertible to BiggestInt, it's printed as an
+// integer; otherwise the bytes in the value are printed. This is
+// what UniversalPrinter<T>::Print() does when it knows nothing about
+// type T and T has neither << operator nor PrintTo().
+//
+// A user can override this behavior for a class type Foo by defining
+// a << operator in the namespace where Foo is defined.
+//
+// We put this operator in namespace 'internal2' instead of 'internal'
+// to simplify the implementation, as much code in 'internal' needs to
+// use << in STL, which would conflict with our own << were it defined
+// in 'internal'.
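+//
+// For instance (an illustrative sketch, not part of this header): a value
+// of an enum type that has no PrintTo() or operator<< of its own is
+// implicitly convertible to BiggestInt, so the kConvertibleToInteger
+// branch in the conditional below formats it as a plain integer:
+//
+//   enum Color { kRed = 1, kGreen = 2 };
+//   // PrintToString(kGreen) would yield "2".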
+//
+// Note that this operator<< takes a generic std::basic_ostream<Char,
+// CharTraits> type instead of the more restricted std::ostream. If
+// we define it to take an std::ostream instead, we'll get an
+// "ambiguous overloads" compiler error when trying to print a type
+// Foo that supports streaming to std::basic_ostream<Char, CharTraits>,
+// as the compiler cannot tell whether
+// operator<<(std::ostream&, const T&) or
+// operator<<(std::basic_stream<Char, CharTraits>, const Foo&) is more
+// specific.
+template <typename Char, typename CharTraits, typename T>
+::std::basic_ostream<Char, CharTraits>& operator<<(
+    ::std::basic_ostream<Char, CharTraits>& os, const T& x) {
+  TypeWithoutFormatter<T, (internal::IsAProtocolMessage<T>::value
+                               ? kProtobuf
+                               : internal::ImplicitlyConvertible<
+                                     const T&, internal::BiggestInt>::value
+                                     ? kConvertibleToInteger
+                                     :
+#if GTEST_HAS_ABSL
+                                     internal::ImplicitlyConvertible<
+                                         const T&, absl::string_view>::value
+                                         ? kConvertibleToStringView
+                                         :
+#endif
+                                     kOtherType)>::PrintValue(x, &os);
+  return os;
+}
+
+}  // namespace internal2
+}  // namespace testing
+
+// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up
+// magic needed for implementing UniversalPrinter won't work.
+namespace testing_internal {
+
+// Used to print a value that is not an STL-style container when the
+// user doesn't define PrintTo() for it.
+template <typename T>
+void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) {
+  // With the following statement, during unqualified name lookup,
+  // testing::internal2::operator<< appears as if it was declared in
+  // the nearest enclosing namespace that contains both
+  // ::testing_internal and ::testing::internal2, i.e. the global
+  // namespace. For more details, refer to the C++ Standard section
+  // 7.3.4-1 [namespace.udir]. This allows us to fall back onto
+  // testing::internal2::operator<< in case T doesn't come with a <<
+  // operator.
+  //
+  // We cannot write 'using ::testing::internal2::operator<<;', which
+  // gcc 3.3 fails to compile due to a compiler bug.
+  using namespace ::testing::internal2;  // NOLINT
+
+  // Assuming T is defined in namespace foo, in the next statement,
+  // the compiler will consider all of:
+  //
+  //   1. foo::operator<< (thanks to Koenig look-up),
+  //   2. ::operator<< (as the current namespace is enclosed in ::),
+  //   3. testing::internal2::operator<< (thanks to the using statement above).
+  //
+  // The operator<< whose type matches T best will be picked.
+  //
+  // We deliberately allow #2 to be a candidate, as sometimes it's
+  // impossible to define #1 (e.g. when foo is ::std, defining
+  // anything in it is undefined behavior unless you are a compiler
+  // vendor.).
+  *os << value;
+}
+
+}  // namespace testing_internal
+
+namespace testing {
+namespace internal {
+
+// FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
+// value of type ToPrint that is an operand of a comparison assertion
+// (e.g. ASSERT_EQ). OtherOperand is the type of the other operand in
+// the comparison, and is used to help determine the best way to
+// format the value. In particular, when the value is a C string
+// (char pointer) and the other operand is an STL string object, we
+// want to format the C string as a string, since we know it is
+// compared by value with the string object. If the value is a char
+// pointer but the other operand is not an STL string object, we don't
+// know whether the pointer is supposed to point to a NUL-terminated
+// string, and thus want to print it as a pointer to be safe.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// The default case.
+// The default case.
+template <typename ToPrint, typename OtherOperand>
+class FormatForComparison {
+ public:
+  static ::std::string Format(const ToPrint& value) {
+    return ::testing::PrintToString(value);
+  }
+};
+
+// Array.
+template <typename ToPrint, size_t N, typename OtherOperand>
+class FormatForComparison<ToPrint[N], OtherOperand> {
+ public:
+  static ::std::string Format(const ToPrint* value) {
+    return FormatForComparison<const ToPrint*, OtherOperand>::Format(value);
+  }
+};
+
+// By default, print C strings as pointers to be safe, as we don't know
+// whether they actually point to a NUL-terminated string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                 \
+  template <typename OtherOperand>                                       \
+  class FormatForComparison<CharType*, OtherOperand> {                   \
+   public:                                                               \
+    static ::std::string Format(CharType* value) {                       \
+      return ::testing::PrintToString(static_cast<const void*>(value));  \
+    }                                                                    \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
+
+// If a C string is compared with an STL string object, we know it's meant
+// to point to a NUL-terminated string, and thus can print it as a string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType)  \
+  template <>                                                             \
+  class FormatForComparison<CharType*, OtherStringType> {                 \
+   public:                                                                \
+    static ::std::string Format(CharType* value) {                        \
+      return ::testing::PrintToString(value);                             \
+    }                                                                     \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
+
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string);
+#endif
+
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring);
+#endif
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
+#endif
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_
+
+// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, etc.)
+// operand to be used in a failure message.  The type (but not value)
+// of the other operand may affect the format.  This allows us to
+// print a char* as a raw pointer when it is compared against another
+// char* or void*, and print it as a C string when it is compared
+// against an std::string object, for example.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename T1, typename T2>
+std::string FormatForComparisonFailureMessage(
+    const T1& value, const T2& /* other_operand */) {
+  return FormatForComparison<T1, T2>::Format(value);
+}
+
+// UniversalPrinter<T>::Print(value, ostream_ptr) prints the given
+// value to the given ostream.  The caller must ensure that
+// 'ostream_ptr' is not NULL, or the behavior is undefined.
+//
+// We define UniversalPrinter as a class template (as opposed to a
+// function template), as we need to partially specialize it for
+// reference types, which cannot be done with function templates.
+template <typename T>
+class UniversalPrinter;
+
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os);
+
+enum DefaultPrinterType {
+  kPrintContainer,
+  kPrintPointer,
+  kPrintFunctionPointer,
+  kPrintOther,
+};
+template <DefaultPrinterType type> struct WrapPrinterType {};
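+
+// Illustration (not part of the original header): WrapPrinterType turns a
+// DefaultPrinterType value computed at compile time into a distinct *type*,
+// so ordinary overload resolution can route to the right printer.  A minimal
+// sketch of the same tag-dispatch idea, with hypothetical names:
+//
+//   template <DefaultPrinterType type> struct Tag {};
+//   void Dispatch(Tag<kPrintContainer>) { /* container path */ }
+//   void Dispatch(Tag<kPrintOther>)     { /* fallback path  */ }
+//
+//   Dispatch(Tag<(true ? kPrintContainer : kPrintOther)>());  // container path
+//
+// The enum value must be a compile-time constant; the conditional inside the
+// template argument is evaluated by the compiler, not at run time.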
+// Used to print an STL-style container when the user doesn't define
+// a PrintTo() for it.
+template <typename C>
+void DefaultPrintTo(WrapPrinterType<kPrintContainer> /* dummy */,
+                    const C& container, ::std::ostream* os) {
+  const size_t kMaxCount = 32;  // The maximum number of elements to print.
+  *os << '{';
+  size_t count = 0;
+  for (typename C::const_iterator it = container.begin();
+       it != container.end(); ++it, ++count) {
+    if (count > 0) {
+      *os << ',';
+      if (count == kMaxCount) {  // Enough has been printed.
+        *os << " ...";
+        break;
+      }
+    }
+    *os << ' ';
+    // We cannot call PrintTo(*it, os) here as PrintTo() doesn't
+    // handle *it being a native array.
+    internal::UniversalPrint(*it, os);
+  }
+
+  if (count > 0) {
+    *os << ' ';
+  }
+  *os << '}';
+}
+
+// Used to print a pointer that is neither a char pointer nor a member
+// pointer, when the user doesn't define PrintTo() for it.  (A member
+// variable pointer or member function pointer doesn't really point to
+// a location in the address space.  Their representation is
+// implementation-defined.  Therefore they will be printed as raw
+// bytes.)
+template <typename T>
+void DefaultPrintTo(WrapPrinterType<kPrintPointer> /* dummy */,
+                    T* p, ::std::ostream* os) {
+  if (p == NULL) {
+    *os << "NULL";
+  } else {
+    // T is not a function type.  We just call << to print p,
+    // relying on ADL to pick up user-defined << for their pointer
+    // types, if any.
+    *os << p;
+  }
+}
+template <typename T>
+void DefaultPrintTo(WrapPrinterType<kPrintFunctionPointer> /* dummy */,
+                    T* p, ::std::ostream* os) {
+  if (p == NULL) {
+    *os << "NULL";
+  } else {
+    // T is a function type, so '*os << p' doesn't do what we want
+    // (it just prints p as bool).  We want to print p as a const
+    // void*.
+    *os << reinterpret_cast<const void*>(p);
+  }
+}
+
+// Used to print a non-container, non-pointer value when the user
+// doesn't define PrintTo() for it.
+template <typename T>
+void DefaultPrintTo(WrapPrinterType<kPrintOther> /* dummy */,
+                    const T& value, ::std::ostream* os) {
+  ::testing_internal::DefaultPrintNonContainerTo(value, os);
+}
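+
+// Illustration (not part of the original header): the container printer
+// above emits at most kMaxCount (32) elements.  With this hypothetical
+// vector:
+//
+//   std::vector<int> v;
+//   for (int i = 0; i < 40; i++) v.push_back(i);
+//
+// the printer emits elements 0 through 31 and then " ..." to mark the
+// truncation, so the output is "{ 0, 1, 2, ... , 31, ... }" with the
+// trailing ", ..." standing in for the eight elements that were dropped.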
+// Prints the given value using the << operator if it has one;
+// otherwise prints the bytes in it.  This is what
+// UniversalPrinter<T>::Print() does when PrintTo() is not specialized
+// or overloaded for type T.
+//
+// A user can override this behavior for a class type Foo by defining
+// an overload of PrintTo() in the namespace where Foo is defined.  We
+// give the user this option as sometimes defining a << operator for
+// Foo is not desirable (e.g. the coding style may prevent doing it,
+// or there is already a << operator but it doesn't do what the user
+// wants).
+template <typename T>
+void PrintTo(const T& value, ::std::ostream* os) {
+  // DefaultPrintTo() is overloaded.  The type of its first argument
+  // determines which version will be picked.
+  //
+  // Note that we check for container types here, before we check
+  // for protocol message types in our operator<<.  The rationale is:
+  //
+  // For protocol messages, we want to give people a chance to
+  // override Google Mock's format by defining a PrintTo() or
+  // operator<<.  For STL containers, other formats can be
+  // incompatible with Google Mock's format for the container
+  // elements; therefore we check for container types here to ensure
+  // that our format is used.
+  //
+  // Note that MSVC and clang-cl do allow an implicit conversion from
+  // pointer-to-function to pointer-to-object, but clang-cl warns on it.
+  // So don't use ImplicitlyConvertible if it can be helped since it will
+  // cause this warning, and use a separate overload of DefaultPrintTo for
+  // function pointers so that the `*os << p` in the object pointer overload
+  // doesn't cause that warning either.
+  DefaultPrintTo(
+      WrapPrinterType <
+          (sizeof(IsContainerTest<T>(0)) == sizeof(IsContainer)) &&
+                  !IsRecursiveContainer<T>::value
+              ? kPrintContainer
+              : !is_pointer<T>::value
+                    ? kPrintOther
+#if GTEST_LANG_CXX11
+                    : std::is_function<
+                          typename std::remove_pointer<T>::type>::value
+#else
+                    : !internal::ImplicitlyConvertible<T, const void*>::value
+#endif
+                          ? kPrintFunctionPointer
+                          : kPrintPointer > (),
+      value, os);
+}
+
+// The following list of PrintTo() overloads tells
+// UniversalPrinter<T>::Print() how to print standard types (built-in
+// types, strings, plain arrays, and pointers).
+
+// Overloads for various char types.
+GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os);
+GTEST_API_ void PrintTo(signed char c, ::std::ostream* os);
+inline void PrintTo(char c, ::std::ostream* os) {
+  // When printing a plain char, we always treat it as unsigned.  This
+  // way, the output won't be affected by whether the compiler thinks
+  // char is signed or not.
+  PrintTo(static_cast<unsigned char>(c), os);
+}
+
+// Overloads for other simple built-in types.
+inline void PrintTo(bool x, ::std::ostream* os) {
+  *os << (x ? "true" : "false");
+}
+
+// Overload for wchar_t type.
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its decimal code (except for L'\0').
+// The L'\0' char is printed as "L'\\0'".  The decimal code is printed
+// as signed integer when wchar_t is implemented by the compiler
+// as a signed type and is printed as an unsigned integer when wchar_t
+// is implemented as an unsigned type.
+GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
+
+// Overloads for C strings.
+GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
+inline void PrintTo(char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char*>(s), os);
+}
+
+// signed/unsigned char is often used for representing binary data, so
+// we print pointers to it as void* to be safe.
+inline void PrintTo(const signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+
+// MSVC can be configured to define wchar_t as a typedef of unsigned
+// short.  It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native
+// type.  When wchar_t is a typedef, defining an overload for const
+// wchar_t* would cause unsigned short* to be printed as a wide string,
+// possibly causing invalid memory accesses.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Overloads for wide C strings
+GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os);
+inline void PrintTo(wchar_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const wchar_t*>(s), os);
+}
+#endif
+
+// Overload for C arrays.  Multi-dimensional arrays are printed
+// properly.
+
+// Prints the given number of elements in an array, without printing
+// the curly braces.
+template <typename T>
+void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
+  UniversalPrint(a[0], os);
+  for (size_t i = 1; i != count; i++) {
+    *os << ", ";
+    UniversalPrint(a[i], os);
+  }
+}
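+
+// Illustration (not part of the original header): because the char overload
+// above forwards through unsigned char, a plain char prints the same way
+// regardless of the platform's char signedness, typically as the symbol plus
+// its numeric code:
+//
+//   char c = 'a';
+//   // PrintToString(c) is expected to yield something like "'a' (97, 0x61)"
+//
+// whereas a char* goes through the C-string overload and is printed as a
+// NUL-terminated string, not character by character.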
+// Overloads for ::string and ::std::string.
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_API_ void PrintStringTo(const ::string& s, ::std::ostream* os);
+inline void PrintTo(const ::string& s, ::std::ostream* os) {
+  PrintStringTo(s, os);
+}
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+GTEST_API_ void PrintStringTo(const ::std::string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
+  PrintStringTo(s, os);
+}
+
+// Overloads for ::wstring and ::std::wstring.
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::wstring& s, ::std::ostream* os);
+inline void PrintTo(const ::wstring& s, ::std::ostream* os) {
+  PrintWideStringTo(s, os);
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::std::wstring& s, ::std::ostream* os);
+inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
+  PrintWideStringTo(s, os);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_ABSL
+// Overload for absl::string_view.
+inline void PrintTo(absl::string_view sp, ::std::ostream* os) {
+  PrintTo(::std::string(sp), os);
+}
+#endif  // GTEST_HAS_ABSL
+
+#if GTEST_LANG_CXX11
+inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; }
+#endif  // GTEST_LANG_CXX11
+
+#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+// Helper function for printing a tuple.  T must be instantiated with
+// a tuple type.
+template <typename T>
+void PrintTupleTo(const T& t, ::std::ostream* os);
+#endif  // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+
+#if GTEST_HAS_TR1_TUPLE
+// Overload for ::std::tr1::tuple.  Needed for printing function arguments,
+// which are packed as tuples.
+
+// Overloaded PrintTo() for tuples of various arities.  We support
+// tuples of up to 10 fields.  The following implementation works
+// regardless of whether tr1::tuple is implemented using the
+// non-standard variadic template feature or not.
+
+inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1>
+void PrintTo(const ::std::tr1::tuple<T1>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2>
+void PrintTo(const ::std::tr1::tuple<T1, T2>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8, typename T9>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8, typename T9, typename T10>
+void PrintTo(
+    const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>& t,
+    ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+#endif  // GTEST_HAS_TR1_TUPLE
+
+#if GTEST_HAS_STD_TUPLE_
+template <typename... Types>
+void PrintTo(const ::std::tuple<Types...>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+#endif  // GTEST_HAS_STD_TUPLE_
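+
+// Illustration (not part of the original header): PrintTupleTo() (defined
+// further below) renders a tuple as a parenthesized, comma-separated list,
+// with each field printed by UniversalPrinter.  With hypothetical values:
+//
+//   std::tuple<int, bool> t(1, true);
+//   // PrintToString(t) is expected to yield "(1, true)"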
+// Overload for std::pair.
+template <typename T1, typename T2>
+void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) {
+  *os << '(';
+  // We cannot use UniversalPrint(value.first, os) here, as T1 may be
+  // a reference type.  The same for printing value.second.
+  UniversalPrinter<T1>::Print(value.first, os);
+  *os << ", ";
+  UniversalPrinter<T2>::Print(value.second, os);
+  *os << ')';
+}
+
+// Implements printing a non-reference type T by letting the compiler
+// pick the right overload of PrintTo() for T.
+template <typename T>
+class UniversalPrinter {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+  // Note: we deliberately don't call this PrintTo(), as that name
+  // conflicts with ::testing::internal::PrintTo in the body of the
+  // function.
+  static void Print(const T& value, ::std::ostream* os) {
+    // By default, ::testing::internal::PrintTo() is used for printing
+    // the value.
+    //
+    // Thanks to Koenig look-up, if T is a class and has its own
+    // PrintTo() function defined in its namespace, that function will
+    // be visible here.  Since it is more specific than the generic ones
+    // in ::testing::internal, it will be picked by the compiler in the
+    // following statement - exactly what we want.
+    PrintTo(value, os);
+  }
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+#if GTEST_HAS_ABSL
+
+// Printer for absl::optional
+
+template <typename T>
+class UniversalPrinter<::absl::optional<T>> {
+ public:
+  static void Print(const ::absl::optional<T>& value, ::std::ostream* os) {
+    *os << '(';
+    if (!value) {
+      *os << "nullopt";
+    } else {
+      UniversalPrint(*value, os);
+    }
+    *os << ')';
+  }
+};
+
+// Printer for absl::variant
+
+template <typename... T>
+class UniversalPrinter<::absl::variant<T...>> {
+ public:
+  static void Print(const ::absl::variant<T...>& value, ::std::ostream* os) {
+    *os << '(';
+    absl::visit(Visitor{os}, value);
+    *os << ')';
+  }
+
+ private:
+  struct Visitor {
+    template <typename U>
+    void operator()(const U& u) const {
+      *os << "'" << GetTypeName<U>() << "' with value ";
+      UniversalPrint(u, os);
+    }
+    ::std::ostream* os;
+  };
+};
+
+#endif  // GTEST_HAS_ABSL
+
+// UniversalPrintArray(begin, len, os) prints an array of 'len'
+// elements, starting at address 'begin'.
+template <typename T>
+void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
+  if (len == 0) {
+    *os << "{}";
+  } else {
+    *os << "{ ";
+    const size_t kThreshold = 18;
+    const size_t kChunkSize = 8;
+    // If the array has more than kThreshold elements, we'll have to
+    // omit some details by printing only the first and the last
+    // kChunkSize elements.
+    // FIXME: let the user control the threshold using a flag.
+    if (len <= kThreshold) {
+      PrintRawArrayTo(begin, len, os);
+    } else {
+      PrintRawArrayTo(begin, kChunkSize, os);
+      *os << ", ..., ";
+      PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os);
+    }
+    *os << " }";
+  }
+}
+// This overload prints a (const) char array compactly.
+GTEST_API_ void UniversalPrintArray(
+    const char* begin, size_t len, ::std::ostream* os);
+
+// This overload prints a (const) wchar_t array compactly.
+GTEST_API_ void UniversalPrintArray(
+    const wchar_t* begin, size_t len, ::std::ostream* os);
+
+// Implements printing an array type T[N].
+template <typename T, size_t N>
+class UniversalPrinter<T[N]> {
+ public:
+  // Prints the given array, omitting some elements when there are too
+  // many.
+  static void Print(const T (&a)[N], ::std::ostream* os) {
+    UniversalPrintArray(a, N, os);
+  }
+};
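+
+// Illustration (not part of the original header): arrays longer than
+// kThreshold (18) are elided down to the first and last kChunkSize (8)
+// elements.  With a hypothetical 20-element array:
+//
+//   int a[20];
+//   for (int i = 0; i < 20; i++) a[i] = i;
+//   // PrintToString(a) is expected to yield
+//   // "{ 0, 1, 2, 3, 4, 5, 6, 7, ..., 12, 13, 14, 15, 16, 17, 18, 19 }"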
+// Implements printing a reference type T&.
+template <typename T>
+class UniversalPrinter<T&> {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+  static void Print(const T& value, ::std::ostream* os) {
+    // Prints the address of the value.  We use reinterpret_cast here
+    // as static_cast doesn't compile when T is a function type.
+    *os << "@" << reinterpret_cast<const void*>(&value) << " ";
+
+    // Then prints the value itself.
+    UniversalPrint(value, os);
+  }
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+// Prints a value tersely: for a reference type, the referenced value
+// (but not the address) is printed; for a (const) char pointer, the
+// NUL-terminated string (but not the pointer) is printed.
+
+template <typename T>
+class UniversalTersePrinter {
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T>
+class UniversalTersePrinter<T&> {
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T, size_t N>
+class UniversalTersePrinter<T[N]> {
+ public:
+  static void Print(const T (&value)[N], ::std::ostream* os) {
+    UniversalPrinter<T[N]>::Print(value, os);
+  }
+};
+template <>
+class UniversalTersePrinter<const char*> {
+ public:
+  static void Print(const char* str, ::std::ostream* os) {
+    if (str == NULL) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(std::string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char*> {
+ public:
+  static void Print(char* str, ::std::ostream* os) {
+    UniversalTersePrinter<const char*>::Print(str, os);
+  }
+};
+
+#if GTEST_HAS_STD_WSTRING
+template <>
+class UniversalTersePrinter<const wchar_t*> {
+ public:
+  static void Print(const wchar_t* str, ::std::ostream* os) {
+    if (str == NULL) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::wstring(str), os);
+    }
+  }
+};
+#endif
+
+template <>
+class UniversalTersePrinter<wchar_t*> {
+ public:
+  static void Print(wchar_t* str, ::std::ostream* os) {
+    UniversalTersePrinter<const wchar_t*>::Print(str, os);
+  }
+};
+
+template <typename T>
+void UniversalTersePrint(const T& value, ::std::ostream* os) {
+  UniversalTersePrinter<T>::Print(value, os);
+}
+
+// Prints a value using the type inferred by the compiler.  The
+// difference between this and UniversalTersePrint() is that for a
+// (const) char pointer, this prints both the pointer and the
+// NUL-terminated string.
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os) {
+  // A workaround for the bug in VC++ 7.1 that prevents us from instantiating
+  // UniversalPrinter with T directly.
+  typedef T T1;
+  UniversalPrinter<T1>::Print(value, os);
+}
+
+typedef ::std::vector< ::std::string> Strings;
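+
+// Illustration (not part of the original header): terse vs. full printing of
+// a C string, with a hypothetical pointer value:
+//
+//   const char* s = "hi";
+//   // UniversalTersePrint(s, &os)  ->  "hi"          (just the quoted string)
+//   // UniversalPrint(s, &os)       ->  something like 0x7ffe... pointing to "hi"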
+// TuplePolicy<TupleT> must provide:
+// - tuple_size
+//     size of tuple TupleT.
+// - get<size_t I>(const TupleT& t)
+//     static function extracting element I of tuple TupleT.
+// - tuple_element<size_t I>::type
+//     type of element I of tuple TupleT.
+template <typename TupleT>
+struct TuplePolicy;
+
+#if GTEST_HAS_TR1_TUPLE
+template <typename TupleT>
+struct TuplePolicy {
+  typedef TupleT Tuple;
+  static const size_t tuple_size = ::std::tr1::tuple_size<Tuple>::value;
+
+  template <size_t I>
+  struct tuple_element
+      : ::std::tr1::tuple_element<static_cast<int>(I), Tuple> {
+  };
+
+  template <size_t I>
+  static typename AddReference<const typename ::std::tr1::tuple_element<
+      static_cast<int>(I), Tuple>::type>::type
+  get(const Tuple& tuple) {
+    return ::std::tr1::get<I>(tuple);
+  }
+};
+template <typename TupleT>
+const size_t TuplePolicy<TupleT>::tuple_size;
+#endif  // GTEST_HAS_TR1_TUPLE
+
+#if GTEST_HAS_STD_TUPLE_
+template <typename... Types>
+struct TuplePolicy< ::std::tuple<Types...> > {
+  typedef ::std::tuple<Types...> Tuple;
+  static const size_t tuple_size = ::std::tuple_size<Tuple>::value;
+
+  template <size_t I>
+  struct tuple_element : ::std::tuple_element<I, Tuple> {};
+
+  template <size_t I>
+  static const typename ::std::tuple_element<I, Tuple>::type& get(
+      const Tuple& tuple) {
+    return ::std::get<I>(tuple);
+  }
+};
+template <typename... Types>
+const size_t TuplePolicy< ::std::tuple<Types...> >::tuple_size;
+#endif  // GTEST_HAS_STD_TUPLE_
+
+#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+// This helper template allows PrintTo() for tuples and
+// UniversalTersePrintTupleFieldsToStrings() to be defined by
+// induction on the number of tuple fields.  The idea is that
+// TuplePrefixPrinter<N>::PrintPrefixTo(t, os) prints the first N
+// fields in tuple t, and can be defined in terms of
+// TuplePrefixPrinter<N - 1>.
+//
+// The inductive case.
+template <size_t N>
+struct TuplePrefixPrinter {
+  // Prints the first N fields of a tuple.
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) {
+    TuplePrefixPrinter<N - 1>::PrintPrefixTo(t, os);
+    GTEST_INTENTIONAL_CONST_COND_PUSH_()
+    if (N > 1) {
+      GTEST_INTENTIONAL_CONST_COND_POP_()
+      *os << ", ";
+    }
+    UniversalPrinter<
+        typename TuplePolicy<Tuple>::template tuple_element<N - 1>::type>
+        ::Print(TuplePolicy<Tuple>::template get<N - 1>(t), os);
+  }
+
+  // Tersely prints the first N fields of a tuple to a string vector,
+  // one element for each field.
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) {
+    TuplePrefixPrinter<N - 1>::TersePrintPrefixToStrings(t, strings);
+    ::std::stringstream ss;
+    UniversalTersePrint(TuplePolicy<Tuple>::template get<N - 1>(t), &ss);
+    strings->push_back(ss.str());
+  }
+};
+
+// Base case.
+template <>
+struct TuplePrefixPrinter<0> {
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple&, ::std::ostream*) {}
+
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple&, Strings*) {}
+};
+
+// Helper function for printing a tuple.
+// Tuple must be either std::tr1::tuple or std::tuple type.
+template <typename Tuple>
+void PrintTupleTo(const Tuple& t, ::std::ostream* os) {
+  *os << "(";
+  TuplePrefixPrinter<TuplePolicy<Tuple>::tuple_size>::PrintPrefixTo(t, os);
+  *os << ")";
+}
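+
+// Illustration (not part of the original header): for a 3-tuple the
+// induction unrolls as
+//
+//   TuplePrefixPrinter<3>::PrintPrefixTo(t, os)
+//     -> TuplePrefixPrinter<2>::PrintPrefixTo(t, os)
+//          -> TuplePrefixPrinter<1>::PrintPrefixTo(t, os)
+//               -> TuplePrefixPrinter<0>   // base case, prints nothing
+//
+// and each level prints ", " (except the first) followed by field N - 1 on
+// the way back up, so the fields come out in order, separated by ", ".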
+// Prints the fields of a tuple tersely to a string vector, one
+// element for each field.  See the comment before
+// UniversalTersePrint() for how we define "tersely".
+template <typename Tuple>
+Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
+  Strings result;
+  TuplePrefixPrinter<TuplePolicy<Tuple>::tuple_size>::
+      TersePrintPrefixToStrings(value, &result);
+  return result;
+}
+#endif  // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+
+}  // namespace internal
+
+#if GTEST_HAS_ABSL
+namespace internal2 {
+template <typename T>
+void TypeWithoutFormatter<T, kConvertibleToStringView>::PrintValue(
+    const T& value, ::std::ostream* os) {
+  internal::PrintTo(absl::string_view(value), os);
+}
+}  // namespace internal2
+#endif
+
+template <typename T>
+::std::string PrintToString(const T& value) {
+  ::std::stringstream ss;
+  internal::UniversalTersePrinter<T>::Print(value, &ss);
+  return ss.str();
+}
+
+}  // namespace testing
+
+// Include any custom printer added by the local installation.
+// We must include this header at the end to make sure it can use the
+// declarations from this file.
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// This file provides an injection point for custom printers in a local
+// installation of gTest.
+// It will be included from gtest-printers.h and the overrides in this file
+// will be visible to everyone.
+//
+// Injection point for custom user configurations. See README for details
+//
+// ** Custom implementation starts here **
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+namespace testing {
+
+// Input to a parameterized test name generator, describing a test parameter.
+// Consists of the parameter value and the integer parameter index.
+template <class ParamType>
+struct TestParamInfo {
+  TestParamInfo(const ParamType& a_param, size_t an_index) :
+    param(a_param),
+    index(an_index) {}
+  ParamType param;
+  size_t index;
+};
+
+// A builtin parameterized test name generator which returns the result of
+// testing::PrintToString.
+struct PrintToStringParamName { + template + std::string operator()(const TestParamInfo& info) const { + return PrintToString(info.param); + } +}; + +namespace internal { + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Outputs a message explaining invalid registration of different +// fixture class for the same test case. This may happen when +// TEST_P macro is used to define two tests with the same name +// but in different namespaces. +GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name, + CodeLocation code_location); + +template class ParamGeneratorInterface; +template class ParamGenerator; + +// Interface for iterating over elements provided by an implementation +// of ParamGeneratorInterface. +template +class ParamIteratorInterface { + public: + virtual ~ParamIteratorInterface() {} + // A pointer to the base generator instance. + // Used only for the purposes of iterator comparison + // to make sure that two iterators belong to the same generator. + virtual const ParamGeneratorInterface* BaseGenerator() const = 0; + // Advances iterator to point to the next element + // provided by the generator. The caller is responsible + // for not calling Advance() on an iterator equal to + // BaseGenerator()->End(). + virtual void Advance() = 0; + // Clones the iterator object. Used for implementing copy semantics + // of ParamIterator. + virtual ParamIteratorInterface* Clone() const = 0; + // Dereferences the current iterator and provides (read-only) access + // to the pointed value. It is the caller's responsibility not to call + // Current() on an iterator equal to BaseGenerator()->End(). + // Used for implementing ParamGenerator::operator*(). + virtual const T* Current() const = 0; + // Determines whether the given iterator and other point to the same + // element in the sequence generated by the generator. + // Used for implementing ParamGenerator::operator==(). + virtual bool Equals(const ParamIteratorInterface& other) const = 0; +}; + +// Class iterating over elements provided by an implementation of +// ParamGeneratorInterface. It wraps ParamIteratorInterface +// and implements the const forward iterator concept. +template +class ParamIterator { + public: + typedef T value_type; + typedef const T& reference; + typedef ptrdiff_t difference_type; + + // ParamIterator assumes ownership of the impl_ pointer. + ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} + ParamIterator& operator=(const ParamIterator& other) { + if (this != &other) + impl_.reset(other.impl_->Clone()); + return *this; + } + + const T& operator*() const { return *impl_->Current(); } + const T* operator->() const { return impl_->Current(); } + // Prefix version of operator++. + ParamIterator& operator++() { + impl_->Advance(); + return *this; + } + // Postfix version of operator++. + ParamIterator operator++(int /*unused*/) { + ParamIteratorInterface* clone = impl_->Clone(); + impl_->Advance(); + return ParamIterator(clone); + } + bool operator==(const ParamIterator& other) const { + return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_); + } + bool operator!=(const ParamIterator& other) const { + return !(*this == other); + } + + private: + friend class ParamGenerator; + explicit ParamIterator(ParamIteratorInterface* impl) : impl_(impl) {} + scoped_ptr > impl_; +}; + +// ParamGeneratorInterface is the binary interface to access generators +// defined in other translation units. 
+template +class ParamGeneratorInterface { + public: + typedef T ParamType; + + virtual ~ParamGeneratorInterface() {} + + // Generator interface definition + virtual ParamIteratorInterface* Begin() const = 0; + virtual ParamIteratorInterface* End() const = 0; +}; + +// Wraps ParamGeneratorInterface and provides general generator syntax +// compatible with the STL Container concept. +// This class implements copy initialization semantics and the contained +// ParamGeneratorInterface instance is shared among all copies +// of the original object. This is possible because that instance is immutable. +template +class ParamGenerator { + public: + typedef ParamIterator iterator; + + explicit ParamGenerator(ParamGeneratorInterface* impl) : impl_(impl) {} + ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {} + + ParamGenerator& operator=(const ParamGenerator& other) { + impl_ = other.impl_; + return *this; + } + + iterator begin() const { return iterator(impl_->Begin()); } + iterator end() const { return iterator(impl_->End()); } + + private: + linked_ptr > impl_; +}; + +// Generates values from a range of two comparable values. Can be used to +// generate sequences of user-defined types that implement operator+() and +// operator<(). +// This class is used in the Range() function. +template +class RangeGenerator : public ParamGeneratorInterface { + public: + RangeGenerator(T begin, T end, IncrementT step) + : begin_(begin), end_(end), + step_(step), end_index_(CalculateEndIndex(begin, end, step)) {} + virtual ~RangeGenerator() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, begin_, 0, step_); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, end_, end_index_, step_); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, T value, int index, + IncrementT step) + : base_(base), value_(value), index_(index), step_(step) {} + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + virtual void Advance() { + value_ = static_cast(value_ + step_); + index_++; + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const T* Current() const { return &value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const int other_index = + CheckedDowncastToActualType(&other)->index_; + return index_ == other_index; + } + + private: + Iterator(const Iterator& other) + : ParamIteratorInterface(), + base_(other.base_), value_(other.value_), index_(other.index_), + step_(other.step_) {} + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + T value_; + int index_; + const IncrementT step_; + }; // class RangeGenerator::Iterator + + static int CalculateEndIndex(const T& begin, + const T& end, + const IncrementT& step) { + int end_index = 0; + for (T i = begin; i < end; i = static_cast(i + step)) + end_index++; + return end_index; + } + + // No implementation - assignment is unsupported. 
+ void operator=(const RangeGenerator& other); + + const T begin_; + const T end_; + const IncrementT step_; + // The index for the end() iterator. All the elements in the generated + // sequence are indexed (0-based) to aid iterator comparison. + const int end_index_; +}; // class RangeGenerator + + +// Generates values from a pair of STL-style iterators. Used in the +// ValuesIn() function. The elements are copied from the source range +// since the source can be located on the stack, and the generator +// is likely to persist beyond that stack frame. +template +class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { + public: + template + ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end) + : container_(begin, end) {} + virtual ~ValuesInIteratorRangeGenerator() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, container_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, container_.end()); + } + + private: + typedef typename ::std::vector ContainerType; + + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + typename ContainerType::const_iterator iterator) + : base_(base), iterator_(iterator) {} + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + virtual void Advance() { + ++iterator_; + value_.reset(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + // We need to use cached value referenced by iterator_ because *iterator_ + // can return a temporary object (and of type other then T), so just + // having "return &*iterator_;" doesn't work. + // value_ is updated here and not in Advance() because Advance() + // can advance iterator_ beyond the end of the range, and we cannot + // detect that fact. The client code, on the other hand, is + // responsible for not calling Current() on an out-of-range iterator. + virtual const T* Current() const { + if (value_.get() == NULL) + value_.reset(new T(*iterator_)); + return value_.get(); + } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + return iterator_ == + CheckedDowncastToActualType(&other)->iterator_; + } + + private: + Iterator(const Iterator& other) + // The explicit constructor call suppresses a false warning + // emitted by gcc when supplied with the -Wextra option. + : ParamIteratorInterface(), + base_(other.base_), + iterator_(other.iterator_) {} + + const ParamGeneratorInterface* const base_; + typename ContainerType::const_iterator iterator_; + // A cached value of *iterator_. We keep it here to allow access by + // pointer in the wrapping iterator's operator->(). + // value_ needs to be mutable to be accessed in Current(). + // Use of scoped_ptr helps manage cached value's lifetime, + // which is bound by the lifespan of the iterator itself. + mutable scoped_ptr value_; + }; // class ValuesInIteratorRangeGenerator::Iterator + + // No implementation - assignment is unsupported. + void operator=(const ValuesInIteratorRangeGenerator& other); + + const ContainerType container_; +}; // class ValuesInIteratorRangeGenerator + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. 
+// +// Default parameterized test name generator, returns a string containing the +// integer test parameter index. +template +std::string DefaultParamName(const TestParamInfo& info) { + Message name_stream; + name_stream << info.index; + return name_stream.GetString(); +} + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Parameterized test name overload helpers, which help the +// INSTANTIATE_TEST_CASE_P macro choose between the default parameterized +// test name generator and user param name generator. +template +ParamNameGenFunctor GetParamNameGen(ParamNameGenFunctor func) { + return func; +} + +template +struct ParamNameGenFunc { + typedef std::string Type(const TestParamInfo&); +}; + +template +typename ParamNameGenFunc::Type *GetParamNameGen() { + return DefaultParamName; +} + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Stores a parameter value and later creates tests parameterized with that +// value. +template +class ParameterizedTestFactory : public TestFactoryBase { + public: + typedef typename TestClass::ParamType ParamType; + explicit ParameterizedTestFactory(ParamType parameter) : + parameter_(parameter) {} + virtual Test* CreateTest() { + TestClass::SetParam(¶meter_); + return new TestClass(); + } + + private: + const ParamType parameter_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactoryBase is a base class for meta-factories that create +// test factories for passing into MakeAndRegisterTestInfo function. +template +class TestMetaFactoryBase { + public: + virtual ~TestMetaFactoryBase() {} + + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0; +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactory creates test factories for passing into +// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives +// ownership of test factory pointer, same factory object cannot be passed +// into that method twice. But ParameterizedTestCaseInfo is going to call +// it for each Test/Parameter value combination. Thus it needs meta factory +// creator class. +template +class TestMetaFactory + : public TestMetaFactoryBase { + public: + typedef typename TestCase::ParamType ParamType; + + TestMetaFactory() {} + + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) { + return new ParameterizedTestFactory(parameter); + } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestCaseInfoBase is a generic interface +// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase +// accumulates test information provided by TEST_P macro invocations +// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations +// and uses that information to register all resulting test instances +// in RegisterTests method. The ParameterizeTestCaseRegistry class holds +// a collection of pointers to the ParameterizedTestCaseInfo objects +// and calls RegisterTests() on each of them when asked. +class ParameterizedTestCaseInfoBase { + public: + virtual ~ParameterizedTestCaseInfoBase() {} + + // Base part of test case name for display purposes. + virtual const std::string& GetTestCaseName() const = 0; + // Test case id to verify identity. 
+  virtual TypeId GetTestCaseTypeId() const = 0;
+  // UnitTest class invokes this method to register tests in this
+  // test case right before running them in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestCaseInfoBase derived class.
+  virtual void RegisterTests() = 0;
+
+ protected:
+  ParameterizedTestCaseInfoBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P
+// macro invocations for a particular test case and generators
+// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that
+// test case.  It registers tests with all values generated by all
+// generators when asked.
+template <class TestCase>
+class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase {
+ public:
+  // ParamType and GeneratorCreationFunc are private types but are required
+  // for declarations of public methods AddTestPattern() and
+  // AddTestCaseInstantiation().
+  typedef typename TestCase::ParamType ParamType;
+  // A function that returns an instance of appropriate generator type.
+  typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
+  typedef typename ParamNameGenFunc<ParamType>::Type ParamNameGeneratorFunc;
+
+  explicit ParameterizedTestCaseInfo(
+      const char* name, CodeLocation code_location)
+      : test_case_name_(name), code_location_(code_location) {}
+
+  // Test case base name for display purposes.
+  virtual const std::string& GetTestCaseName() const {
+    return test_case_name_;
+  }
+  // Test case id to verify identity.
+  virtual TypeId GetTestCaseTypeId() const { return GetTypeId<TestCase>(); }
+  // TEST_P macro uses AddTestPattern() to record information
+  // about a single test in a LocalTestInfo structure.
+  // test_case_name is the base name of the test case (without invocation
+  // prefix). test_base_name is the name of an individual test without
+  // parameter index. For the test SequenceA/FooTest.DoBar/1, FooTest is
+  // the test case base name and DoBar is the test base name.
+  void AddTestPattern(const char* test_case_name,
+                      const char* test_base_name,
+                      TestMetaFactoryBase<ParamType>* meta_factory) {
+    tests_.push_back(linked_ptr<TestInfo>(new TestInfo(test_case_name,
+                                                       test_base_name,
+                                                       meta_factory)));
+  }
+  // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information
+  // about a generator.
+  int AddTestCaseInstantiation(const std::string& instantiation_name,
+                               GeneratorCreationFunc* func,
+                               ParamNameGeneratorFunc* name_func,
+                               const char* file, int line) {
+    instantiations_.push_back(
+        InstantiationInfo(instantiation_name, func, name_func, file, line));
+    return 0;  // Return value used only to run this method in namespace scope.
+  }
+  // UnitTest class invokes this method to register tests in this test case
+  // right before running tests in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestCaseInfoBase derived class.
+  // UnitTest has a guard to prevent from calling this method more than once.
+ virtual void RegisterTests() { + for (typename TestInfoContainer::iterator test_it = tests_.begin(); + test_it != tests_.end(); ++test_it) { + linked_ptr test_info = *test_it; + for (typename InstantiationContainer::iterator gen_it = + instantiations_.begin(); gen_it != instantiations_.end(); + ++gen_it) { + const std::string& instantiation_name = gen_it->name; + ParamGenerator generator((*gen_it->generator)()); + ParamNameGeneratorFunc* name_func = gen_it->name_func; + const char* file = gen_it->file; + int line = gen_it->line; + + std::string test_case_name; + if ( !instantiation_name.empty() ) + test_case_name = instantiation_name + "/"; + test_case_name += test_info->test_case_base_name; + + size_t i = 0; + std::set test_param_names; + for (typename ParamGenerator::iterator param_it = + generator.begin(); + param_it != generator.end(); ++param_it, ++i) { + Message test_name_stream; + + std::string param_name = name_func( + TestParamInfo(*param_it, i)); + + GTEST_CHECK_(IsValidParamName(param_name)) + << "Parameterized test name '" << param_name + << "' is invalid, in " << file + << " line " << line << std::endl; + + GTEST_CHECK_(test_param_names.count(param_name) == 0) + << "Duplicate parameterized test name '" << param_name + << "', in " << file << " line " << line << std::endl; + + test_param_names.insert(param_name); + + test_name_stream << test_info->test_base_name << "/" << param_name; + MakeAndRegisterTestInfo( + test_case_name.c_str(), + test_name_stream.GetString().c_str(), + NULL, // No type parameter. + PrintToString(*param_it).c_str(), + code_location_, + GetTestCaseTypeId(), + TestCase::SetUpTestCase, + TestCase::TearDownTestCase, + test_info->test_meta_factory->CreateTestFactory(*param_it)); + } // for param_it + } // for gen_it + } // for test_it + } // RegisterTests + + private: + // LocalTestInfo structure keeps information about a single test registered + // with TEST_P macro. + struct TestInfo { + TestInfo(const char* a_test_case_base_name, + const char* a_test_base_name, + TestMetaFactoryBase* a_test_meta_factory) : + test_case_base_name(a_test_case_base_name), + test_base_name(a_test_base_name), + test_meta_factory(a_test_meta_factory) {} + + const std::string test_case_base_name; + const std::string test_base_name; + const scoped_ptr > test_meta_factory; + }; + typedef ::std::vector > TestInfoContainer; + // Records data received from INSTANTIATE_TEST_CASE_P macros: + // + struct InstantiationInfo { + InstantiationInfo(const std::string &name_in, + GeneratorCreationFunc* generator_in, + ParamNameGeneratorFunc* name_func_in, + const char* file_in, + int line_in) + : name(name_in), + generator(generator_in), + name_func(name_func_in), + file(file_in), + line(line_in) {} + + std::string name; + GeneratorCreationFunc* generator; + ParamNameGeneratorFunc* name_func; + const char* file; + int line; + }; + typedef ::std::vector InstantiationContainer; + + static bool IsValidParamName(const std::string& name) { + // Check for empty string + if (name.empty()) + return false; + + // Check for invalid characters + for (std::string::size_type index = 0; index < name.size(); ++index) { + if (!isalnum(name[index]) && name[index] != '_') + return false; + } + + return true; + } + + const std::string test_case_name_; + CodeLocation code_location_; + TestInfoContainer tests_; + InstantiationContainer instantiations_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo); +}; // class ParameterizedTestCaseInfo + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. 
+// +// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase +// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P +// macros use it to locate their corresponding ParameterizedTestCaseInfo +// descriptors. +class ParameterizedTestCaseRegistry { + public: + ParameterizedTestCaseRegistry() {} + ~ParameterizedTestCaseRegistry() { + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + delete *it; + } + } + + // Looks up or creates and returns a structure containing information about + // tests and instantiations of a particular test case. + template + ParameterizedTestCaseInfo* GetTestCasePatternHolder( + const char* test_case_name, + CodeLocation code_location) { + ParameterizedTestCaseInfo* typed_test_info = NULL; + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + if ((*it)->GetTestCaseName() == test_case_name) { + if ((*it)->GetTestCaseTypeId() != GetTypeId()) { + // Complain about incorrect usage of Google Test facilities + // and terminate the program since we cannot guaranty correct + // test case setup and tear-down in this case. + ReportInvalidTestCaseType(test_case_name, code_location); + posix::Abort(); + } else { + // At this point we are sure that the object we found is of the same + // type we are looking for, so we downcast it to that type + // without further checks. + typed_test_info = CheckedDowncastToActualType< + ParameterizedTestCaseInfo >(*it); + } + break; + } + } + if (typed_test_info == NULL) { + typed_test_info = new ParameterizedTestCaseInfo( + test_case_name, code_location); + test_case_infos_.push_back(typed_test_info); + } + return typed_test_info; + } + void RegisterTests() { + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + (*it)->RegisterTests(); + } + } + + private: + typedef ::std::vector TestCaseInfoContainer; + + TestCaseInfoContainer test_case_infos_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry); +}; + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +// This file was GENERATED by command: +// pump.py gtest-param-util-generated.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +// Type and function utilities for implementing parameterized tests. +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently Google Test supports at most 50 arguments in Values, +// and at most 10 arguments in Combine. Please contact +// googletestframework@googlegroups.com if you need more. +// Please note that the number of arguments to Combine is limited +// by the maximum arity of the implementation of tuple which is +// currently set at 10. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ + + +namespace testing { + +// Forward declarations of ValuesIn(), which is implemented in +// include/gtest/gtest-param-test.h. +template +internal::ParamGenerator< + typename ::testing::internal::IteratorTraits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end); + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]); + +template +internal::ParamGenerator ValuesIn( + const Container& container); + +namespace internal { + +// Used in the Values() function to provide polymorphic capabilities. +template +class ValueArray1 { + public: + explicit ValueArray1(T1 v1) : v1_(v1) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_)}; + return ValuesIn(array); + } + + ValueArray1(const ValueArray1& other) : v1_(other.v1_) {} + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray1& other); + + const T1 v1_; +}; + +template +class ValueArray2 { + public: + ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_)}; + return ValuesIn(array); + } + + ValueArray2(const ValueArray2& other) : v1_(other.v1_), v2_(other.v2_) {} + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray2& other); + + const T1 v1_; + const T2 v2_; +}; + +template +class ValueArray3 { + public: + ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_)}; + return ValuesIn(array); + } + + ValueArray3(const ValueArray3& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray3& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; +}; + +template +class ValueArray4 { + public: + ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_)}; + return ValuesIn(array); + } + + ValueArray4(const ValueArray4& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_) {} + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray4& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; +}; + +template +class ValueArray5 { + public: + ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_)}; + return ValuesIn(array); + } + + ValueArray5(const ValueArray5& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_) {} + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray5& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; +}; + +template +class ValueArray6 { + public: + ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_)}; + return ValuesIn(array); + } + + ValueArray6(const ValueArray6& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_) {} + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray6& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; +}; + +template +class ValueArray7 { + public: + ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_)}; + return ValuesIn(array); + } + + ValueArray7(const ValueArray7& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray7& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; +}; + +template +class ValueArray8 { + public: + ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_)}; + return ValuesIn(array); + } + + ValueArray8(const ValueArray8& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_) {} + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray8& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; +}; + +template +class ValueArray9 { + public: + ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_)}; + return ValuesIn(array); + } + + ValueArray9(const ValueArray9& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_) {} + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray9& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; +}; + +template +class ValueArray10 { + public: + ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_)}; + return ValuesIn(array); + } + + ValueArray10(const ValueArray10& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_) {} + + private: + // No implementation - assignment is unsupported. 
+  void operator=(const ValueArray10& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+class ValueArray11 {
+ public:
+  ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray11(const ValueArray11& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray11& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+class ValueArray12 {
+ public:
+  ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray12(const ValueArray12& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray12& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+class ValueArray13 {
+ public:
+  ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray13(const ValueArray13& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray13& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+class ValueArray14 {
+ public:
+  ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray14(const ValueArray14& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray14& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+class ValueArray15 {
+ public:
+  ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray15(const ValueArray15& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray15& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+class ValueArray16 {
+ public:
+  ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray16(const ValueArray16& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray16& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+class ValueArray17 {
+ public:
+  ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray17(const ValueArray17& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+ void operator=(const ValueArray17& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; +}; + +template +class ValueArray18 { + public: + ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_)}; + return ValuesIn(array); + } + + ValueArray18(const ValueArray18& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_) {} + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray18& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; +}; + +template +class ValueArray19 { + public: + ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_)}; + return ValuesIn(array); + } + + ValueArray19(const ValueArray19& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray19& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; +}; + +template +class ValueArray20 { + public: + ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_)}; + return ValuesIn(array); + } + + ValueArray20(const ValueArray20& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_) {} + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray20& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; +}; + +template +class ValueArray21 { + public: + ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_)}; + return ValuesIn(array); + } + + ValueArray21(const ValueArray21& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_) {} + + private: + // No 
implementation - assignment is unsupported. + void operator=(const ValueArray21& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; +}; + +template +class ValueArray22 { + public: + ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_)}; + return ValuesIn(array); + } + + ValueArray22(const ValueArray22& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray22& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; +}; + +template +class ValueArray23 { + public: + ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_)}; + return ValuesIn(array); + } + + ValueArray23(const ValueArray23& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray23& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; +}; + +template +class ValueArray24 { + public: + ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_)}; + return ValuesIn(array); + } + + ValueArray24(const ValueArray24& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray24& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; +}; + +template +class ValueArray25 { + public: + ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_)}; + return ValuesIn(array); + } + + ValueArray25(const ValueArray25& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray25& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; +}; + +template +class ValueArray26 { + public: + ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_)}; + return ValuesIn(array); + } + + ValueArray26(const ValueArray26& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray26& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; +}; + +template +class ValueArray27 { + public: + ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_)}; + return ValuesIn(array); + } + + ValueArray27(const ValueArray27& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray27& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; +}; + +template +class ValueArray28 { + public: + ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_)}; + return ValuesIn(array); + } + + ValueArray28(const ValueArray28& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray28& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; +}; + +template +class ValueArray29 { + public: + ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_)}; + return ValuesIn(array); + } + + ValueArray29(const ValueArray29& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray29& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; +}; + +template +class ValueArray30 { + public: + ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_)}; + return ValuesIn(array); + } + + ValueArray30(const ValueArray30& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray30& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; +}; + +template +class ValueArray31 { + public: + ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_)}; + return ValuesIn(array); + } + + ValueArray31(const ValueArray31& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray31& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; +}; + +template +class ValueArray32 { + public: + ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), + v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_)}; + return ValuesIn(array); + } + + ValueArray32(const ValueArray32& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray32& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; +}; + +template +class ValueArray33 { + public: + ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, + T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_)}; + return ValuesIn(array); + } + + ValueArray33(const ValueArray33& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray33& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; +}; + +template +class ValueArray34 { + public: + ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_)}; + return ValuesIn(array); + } + + ValueArray34(const ValueArray34& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray34& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; +}; + +template +class ValueArray35 { + public: + ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), + v32_(v32), v33_(v33), v34_(v34), v35_(v35) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_)}; + return ValuesIn(array); + } + + ValueArray35(const ValueArray35& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray35& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; +}; + +template +class ValueArray36 { + public: + ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), + v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_)}; + return ValuesIn(array); + } + + ValueArray36(const ValueArray36& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray36& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; +}; + +template +class ValueArray37 { + public: + ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), + v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), + v36_(v36), v37_(v37) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_)}; + return ValuesIn(array); + } + + ValueArray37(const ValueArray37& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_) {} + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray37& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; +}; + +template +class ValueArray38 { + public: + ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_)}; + return ValuesIn(array); + } + + ValueArray38(const ValueArray38& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_) {} + + private: + // No implementation - assignment is unsupported. 
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+class ValueArray39 {
+ public:
+  ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+        v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+        v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+        v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+        v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+        v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+        v37_(v37), v38_(v38), v39_(v39) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {
+        static_cast<T>(v1_), static_cast<T>(v2_), static_cast<T>(v3_),
+        static_cast<T>(v4_), static_cast<T>(v5_), static_cast<T>(v6_),
+        static_cast<T>(v7_), static_cast<T>(v8_), static_cast<T>(v9_),
+        static_cast<T>(v10_), static_cast<T>(v11_), static_cast<T>(v12_),
+        static_cast<T>(v13_), static_cast<T>(v14_), static_cast<T>(v15_),
+        static_cast<T>(v16_), static_cast<T>(v17_), static_cast<T>(v18_),
+        static_cast<T>(v19_), static_cast<T>(v20_), static_cast<T>(v21_),
+        static_cast<T>(v22_), static_cast<T>(v23_), static_cast<T>(v24_),
+        static_cast<T>(v25_), static_cast<T>(v26_), static_cast<T>(v27_),
+        static_cast<T>(v28_), static_cast<T>(v29_), static_cast<T>(v30_),
+        static_cast<T>(v31_), static_cast<T>(v32_), static_cast<T>(v33_),
+        static_cast<T>(v34_), static_cast<T>(v35_), static_cast<T>(v36_),
+        static_cast<T>(v37_), static_cast<T>(v38_), static_cast<T>(v39_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray39(const ValueArray39& other)
+      : v1_(other.v1_), v2_(other.v2_), v3_(other.v3_), v4_(other.v4_),
+        v5_(other.v5_), v6_(other.v6_), v7_(other.v7_), v8_(other.v8_),
+        v9_(other.v9_), v10_(other.v10_), v11_(other.v11_), v12_(other.v12_),
+        v13_(other.v13_), v14_(other.v14_), v15_(other.v15_), v16_(other.v16_),
+        v17_(other.v17_), v18_(other.v18_), v19_(other.v19_), v20_(other.v20_),
+        v21_(other.v21_), v22_(other.v22_), v23_(other.v23_), v24_(other.v24_),
+        v25_(other.v25_), v26_(other.v26_), v27_(other.v27_), v28_(other.v28_),
+        v29_(other.v29_), v30_(other.v30_), v31_(other.v31_), v32_(other.v32_),
+        v33_(other.v33_), v34_(other.v34_), v35_(other.v35_), v36_(other.v36_),
+        v37_(other.v37_), v38_(other.v38_), v39_(other.v39_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray39& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+class ValueArray40 {
+ public:
+  ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+        v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+        v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+        v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+        v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+        v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+        v37_(v37), v38_(v38), v39_(v39), v40_(v40) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {
+        static_cast<T>(v1_), static_cast<T>(v2_), static_cast<T>(v3_),
+        static_cast<T>(v4_), static_cast<T>(v5_), static_cast<T>(v6_),
+        static_cast<T>(v7_), static_cast<T>(v8_), static_cast<T>(v9_),
+        static_cast<T>(v10_), static_cast<T>(v11_), static_cast<T>(v12_),
+        static_cast<T>(v13_), static_cast<T>(v14_), static_cast<T>(v15_),
+        static_cast<T>(v16_), static_cast<T>(v17_), static_cast<T>(v18_),
+        static_cast<T>(v19_), static_cast<T>(v20_), static_cast<T>(v21_),
+        static_cast<T>(v22_), static_cast<T>(v23_), static_cast<T>(v24_),
+        static_cast<T>(v25_), static_cast<T>(v26_), static_cast<T>(v27_),
+        static_cast<T>(v28_), static_cast<T>(v29_), static_cast<T>(v30_),
+        static_cast<T>(v31_), static_cast<T>(v32_), static_cast<T>(v33_),
+        static_cast<T>(v34_), static_cast<T>(v35_), static_cast<T>(v36_),
+        static_cast<T>(v37_), static_cast<T>(v38_), static_cast<T>(v39_),
+        static_cast<T>(v40_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray40(const ValueArray40& other)
+      : v1_(other.v1_), v2_(other.v2_), v3_(other.v3_), v4_(other.v4_),
+        v5_(other.v5_), v6_(other.v6_), v7_(other.v7_), v8_(other.v8_),
+        v9_(other.v9_), v10_(other.v10_), v11_(other.v11_), v12_(other.v12_),
+        v13_(other.v13_), v14_(other.v14_), v15_(other.v15_), v16_(other.v16_),
+        v17_(other.v17_), v18_(other.v18_), v19_(other.v19_), v20_(other.v20_),
+        v21_(other.v21_), v22_(other.v22_), v23_(other.v23_), v24_(other.v24_),
+        v25_(other.v25_), v26_(other.v26_), v27_(other.v27_), v28_(other.v28_),
+        v29_(other.v29_), v30_(other.v30_), v31_(other.v31_), v32_(other.v32_),
+        v33_(other.v33_), v34_(other.v34_), v35_(other.v35_), v36_(other.v36_),
+        v37_(other.v37_), v38_(other.v38_), v39_(other.v39_),
+        v40_(other.v40_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray40& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+class ValueArray41 {
+ public:
+  ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+        v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+        v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+        v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+        v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+        v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+        v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {
+        static_cast<T>(v1_), static_cast<T>(v2_), static_cast<T>(v3_),
+        static_cast<T>(v4_), static_cast<T>(v5_), static_cast<T>(v6_),
+        static_cast<T>(v7_), static_cast<T>(v8_), static_cast<T>(v9_),
+        static_cast<T>(v10_), static_cast<T>(v11_), static_cast<T>(v12_),
+        static_cast<T>(v13_), static_cast<T>(v14_), static_cast<T>(v15_),
+        static_cast<T>(v16_), static_cast<T>(v17_), static_cast<T>(v18_),
+        static_cast<T>(v19_), static_cast<T>(v20_), static_cast<T>(v21_),
+        static_cast<T>(v22_), static_cast<T>(v23_), static_cast<T>(v24_),
+        static_cast<T>(v25_), static_cast<T>(v26_), static_cast<T>(v27_),
+        static_cast<T>(v28_), static_cast<T>(v29_), static_cast<T>(v30_),
+        static_cast<T>(v31_), static_cast<T>(v32_), static_cast<T>(v33_),
+        static_cast<T>(v34_), static_cast<T>(v35_), static_cast<T>(v36_),
+        static_cast<T>(v37_), static_cast<T>(v38_), static_cast<T>(v39_),
+        static_cast<T>(v40_), static_cast<T>(v41_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray41(const ValueArray41& other)
+      : v1_(other.v1_), v2_(other.v2_), v3_(other.v3_), v4_(other.v4_),
+        v5_(other.v5_), v6_(other.v6_), v7_(other.v7_), v8_(other.v8_),
+        v9_(other.v9_), v10_(other.v10_), v11_(other.v11_), v12_(other.v12_),
+        v13_(other.v13_), v14_(other.v14_), v15_(other.v15_), v16_(other.v16_),
+        v17_(other.v17_), v18_(other.v18_), v19_(other.v19_), v20_(other.v20_),
+        v21_(other.v21_), v22_(other.v22_), v23_(other.v23_), v24_(other.v24_),
+        v25_(other.v25_), v26_(other.v26_), v27_(other.v27_), v28_(other.v28_),
+        v29_(other.v29_), v30_(other.v30_), v31_(other.v31_), v32_(other.v32_),
+        v33_(other.v33_), v34_(other.v34_), v35_(other.v35_), v36_(other.v36_),
+        v37_(other.v37_), v38_(other.v38_), v39_(other.v39_), v40_(other.v40_),
+        v41_(other.v41_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray41& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+class ValueArray42 {
+ public:
+  ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41, T42 v42)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+        v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+        v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+        v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+        v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+        v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+        v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {
+        static_cast<T>(v1_), static_cast<T>(v2_), static_cast<T>(v3_),
+        static_cast<T>(v4_), static_cast<T>(v5_), static_cast<T>(v6_),
+        static_cast<T>(v7_), static_cast<T>(v8_), static_cast<T>(v9_),
+        static_cast<T>(v10_), static_cast<T>(v11_), static_cast<T>(v12_),
+        static_cast<T>(v13_), static_cast<T>(v14_), static_cast<T>(v15_),
+        static_cast<T>(v16_), static_cast<T>(v17_), static_cast<T>(v18_),
+        static_cast<T>(v19_), static_cast<T>(v20_), static_cast<T>(v21_),
+        static_cast<T>(v22_), static_cast<T>(v23_), static_cast<T>(v24_),
+        static_cast<T>(v25_), static_cast<T>(v26_), static_cast<T>(v27_),
+        static_cast<T>(v28_), static_cast<T>(v29_), static_cast<T>(v30_),
+        static_cast<T>(v31_), static_cast<T>(v32_), static_cast<T>(v33_),
+        static_cast<T>(v34_), static_cast<T>(v35_), static_cast<T>(v36_),
+        static_cast<T>(v37_), static_cast<T>(v38_), static_cast<T>(v39_),
+        static_cast<T>(v40_), static_cast<T>(v41_), static_cast<T>(v42_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray42(const ValueArray42& other)
+      : v1_(other.v1_), v2_(other.v2_), v3_(other.v3_), v4_(other.v4_),
+        v5_(other.v5_), v6_(other.v6_), v7_(other.v7_), v8_(other.v8_),
+        v9_(other.v9_), v10_(other.v10_), v11_(other.v11_), v12_(other.v12_),
+        v13_(other.v13_), v14_(other.v14_), v15_(other.v15_), v16_(other.v16_),
+        v17_(other.v17_), v18_(other.v18_), v19_(other.v19_), v20_(other.v20_),
+        v21_(other.v21_), v22_(other.v22_), v23_(other.v23_), v24_(other.v24_),
+        v25_(other.v25_), v26_(other.v26_), v27_(other.v27_), v28_(other.v28_),
+        v29_(other.v29_), v30_(other.v30_), v31_(other.v31_), v32_(other.v32_),
+        v33_(other.v33_), v34_(other.v34_), v35_(other.v35_), v36_(other.v36_),
+        v37_(other.v37_), v38_(other.v38_), v39_(other.v39_), v40_(other.v40_),
+        v41_(other.v41_), v42_(other.v42_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray42& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+class ValueArray43 {
+ public:
+  ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41, T42 v42, T43 v43)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+        v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+        v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+        v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+        v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+        v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+        v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+        v43_(v43) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {
+        static_cast<T>(v1_), static_cast<T>(v2_), static_cast<T>(v3_),
+        static_cast<T>(v4_), static_cast<T>(v5_), static_cast<T>(v6_),
+        static_cast<T>(v7_), static_cast<T>(v8_), static_cast<T>(v9_),
+        static_cast<T>(v10_), static_cast<T>(v11_), static_cast<T>(v12_),
+        static_cast<T>(v13_), static_cast<T>(v14_), static_cast<T>(v15_),
+        static_cast<T>(v16_), static_cast<T>(v17_), static_cast<T>(v18_),
+        static_cast<T>(v19_), static_cast<T>(v20_), static_cast<T>(v21_),
+        static_cast<T>(v22_), static_cast<T>(v23_), static_cast<T>(v24_),
+        static_cast<T>(v25_), static_cast<T>(v26_), static_cast<T>(v27_),
+        static_cast<T>(v28_), static_cast<T>(v29_), static_cast<T>(v30_),
+        static_cast<T>(v31_), static_cast<T>(v32_), static_cast<T>(v33_),
+        static_cast<T>(v34_), static_cast<T>(v35_), static_cast<T>(v36_),
+        static_cast<T>(v37_), static_cast<T>(v38_), static_cast<T>(v39_),
+        static_cast<T>(v40_), static_cast<T>(v41_), static_cast<T>(v42_),
+        static_cast<T>(v43_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray43(const ValueArray43& other)
+      : v1_(other.v1_), v2_(other.v2_), v3_(other.v3_), v4_(other.v4_),
+        v5_(other.v5_), v6_(other.v6_), v7_(other.v7_), v8_(other.v8_),
+        v9_(other.v9_), v10_(other.v10_), v11_(other.v11_), v12_(other.v12_),
+        v13_(other.v13_), v14_(other.v14_), v15_(other.v15_), v16_(other.v16_),
+        v17_(other.v17_), v18_(other.v18_), v19_(other.v19_), v20_(other.v20_),
+        v21_(other.v21_), v22_(other.v22_), v23_(other.v23_), v24_(other.v24_),
+        v25_(other.v25_), v26_(other.v26_), v27_(other.v27_), v28_(other.v28_),
+        v29_(other.v29_), v30_(other.v30_), v31_(other.v31_), v32_(other.v32_),
+        v33_(other.v33_), v34_(other.v34_), v35_(other.v35_), v36_(other.v36_),
+        v37_(other.v37_), v38_(other.v38_), v39_(other.v39_), v40_(other.v40_),
+        v41_(other.v41_), v42_(other.v42_), v43_(other.v43_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray43& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+class ValueArray44 {
+ public:
+  ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41, T42 v42, T43 v43, T44 v44)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+        v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+        v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+        v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+        v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+        v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+        v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+        v43_(v43), v44_(v44) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {
+        static_cast<T>(v1_), static_cast<T>(v2_), static_cast<T>(v3_),
+        static_cast<T>(v4_), static_cast<T>(v5_), static_cast<T>(v6_),
+        static_cast<T>(v7_), static_cast<T>(v8_), static_cast<T>(v9_),
+        static_cast<T>(v10_), static_cast<T>(v11_), static_cast<T>(v12_),
+        static_cast<T>(v13_), static_cast<T>(v14_), static_cast<T>(v15_),
+        static_cast<T>(v16_), static_cast<T>(v17_), static_cast<T>(v18_),
+        static_cast<T>(v19_), static_cast<T>(v20_), static_cast<T>(v21_),
+        static_cast<T>(v22_), static_cast<T>(v23_), static_cast<T>(v24_),
+        static_cast<T>(v25_), static_cast<T>(v26_), static_cast<T>(v27_),
+        static_cast<T>(v28_), static_cast<T>(v29_), static_cast<T>(v30_),
+        static_cast<T>(v31_), static_cast<T>(v32_), static_cast<T>(v33_),
+        static_cast<T>(v34_), static_cast<T>(v35_), static_cast<T>(v36_),
+        static_cast<T>(v37_), static_cast<T>(v38_), static_cast<T>(v39_),
+        static_cast<T>(v40_), static_cast<T>(v41_), static_cast<T>(v42_),
+        static_cast<T>(v43_), static_cast<T>(v44_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray44(const ValueArray44& other)
+      : v1_(other.v1_), v2_(other.v2_), v3_(other.v3_), v4_(other.v4_),
+        v5_(other.v5_), v6_(other.v6_), v7_(other.v7_), v8_(other.v8_),
+        v9_(other.v9_), v10_(other.v10_), v11_(other.v11_), v12_(other.v12_),
+        v13_(other.v13_), v14_(other.v14_), v15_(other.v15_), v16_(other.v16_),
+        v17_(other.v17_), v18_(other.v18_), v19_(other.v19_), v20_(other.v20_),
+        v21_(other.v21_), v22_(other.v22_), v23_(other.v23_), v24_(other.v24_),
+        v25_(other.v25_), v26_(other.v26_), v27_(other.v27_), v28_(other.v28_),
+        v29_(other.v29_), v30_(other.v30_), v31_(other.v31_), v32_(other.v32_),
+        v33_(other.v33_), v34_(other.v34_), v35_(other.v35_), v36_(other.v36_),
+        v37_(other.v37_), v38_(other.v38_), v39_(other.v39_), v40_(other.v40_),
+        v41_(other.v41_), v42_(other.v42_), v43_(other.v43_),
+        v44_(other.v44_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray44& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+class ValueArray45 {
+ public:
+  ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41, T42 v42, T43 v43, T44 v44, T45 v45)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+        v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+        v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+        v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+        v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+        v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+        v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+        v43_(v43), v44_(v44), v45_(v45) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {
+        static_cast<T>(v1_), static_cast<T>(v2_), static_cast<T>(v3_),
+        static_cast<T>(v4_), static_cast<T>(v5_), static_cast<T>(v6_),
+        static_cast<T>(v7_), static_cast<T>(v8_), static_cast<T>(v9_),
+        static_cast<T>(v10_), static_cast<T>(v11_), static_cast<T>(v12_),
+        static_cast<T>(v13_), static_cast<T>(v14_), static_cast<T>(v15_),
+        static_cast<T>(v16_), static_cast<T>(v17_), static_cast<T>(v18_),
+        static_cast<T>(v19_), static_cast<T>(v20_), static_cast<T>(v21_),
+        static_cast<T>(v22_), static_cast<T>(v23_), static_cast<T>(v24_),
+        static_cast<T>(v25_), static_cast<T>(v26_), static_cast<T>(v27_),
+        static_cast<T>(v28_), static_cast<T>(v29_), static_cast<T>(v30_),
+        static_cast<T>(v31_), static_cast<T>(v32_), static_cast<T>(v33_),
+        static_cast<T>(v34_), static_cast<T>(v35_), static_cast<T>(v36_),
+        static_cast<T>(v37_), static_cast<T>(v38_), static_cast<T>(v39_),
+        static_cast<T>(v40_), static_cast<T>(v41_), static_cast<T>(v42_),
+        static_cast<T>(v43_), static_cast<T>(v44_), static_cast<T>(v45_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray45(const ValueArray45& other)
+      : v1_(other.v1_), v2_(other.v2_), v3_(other.v3_), v4_(other.v4_),
+        v5_(other.v5_), v6_(other.v6_), v7_(other.v7_), v8_(other.v8_),
+        v9_(other.v9_), v10_(other.v10_), v11_(other.v11_), v12_(other.v12_),
+        v13_(other.v13_), v14_(other.v14_), v15_(other.v15_), v16_(other.v16_),
+        v17_(other.v17_), v18_(other.v18_), v19_(other.v19_), v20_(other.v20_),
+        v21_(other.v21_), v22_(other.v22_), v23_(other.v23_), v24_(other.v24_),
+        v25_(other.v25_), v26_(other.v26_), v27_(other.v27_), v28_(other.v28_),
+        v29_(other.v29_), v30_(other.v30_), v31_(other.v31_), v32_(other.v32_),
+        v33_(other.v33_), v34_(other.v34_), v35_(other.v35_), v36_(other.v36_),
+        v37_(other.v37_), v38_(other.v38_),
+        v39_(other.v39_), v40_(other.v40_), v41_(other.v41_),
+        v42_(other.v42_), v43_(other.v43_), v44_(other.v44_),
+        v45_(other.v45_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray45& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+class ValueArray46 {
+ public:
+  ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+        v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+        v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+        v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+        v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+        v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+        v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+        v43_(v43), v44_(v44), v45_(v45), v46_(v46) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {
+        static_cast<T>(v1_), static_cast<T>(v2_), static_cast<T>(v3_),
+        static_cast<T>(v4_), static_cast<T>(v5_), static_cast<T>(v6_),
+        static_cast<T>(v7_), static_cast<T>(v8_), static_cast<T>(v9_),
+        static_cast<T>(v10_), static_cast<T>(v11_), static_cast<T>(v12_),
+        static_cast<T>(v13_), static_cast<T>(v14_), static_cast<T>(v15_),
+        static_cast<T>(v16_), static_cast<T>(v17_), static_cast<T>(v18_),
+        static_cast<T>(v19_), static_cast<T>(v20_), static_cast<T>(v21_),
+        static_cast<T>(v22_), static_cast<T>(v23_), static_cast<T>(v24_),
+        static_cast<T>(v25_), static_cast<T>(v26_), static_cast<T>(v27_),
+        static_cast<T>(v28_), static_cast<T>(v29_), static_cast<T>(v30_),
+        static_cast<T>(v31_), static_cast<T>(v32_), static_cast<T>(v33_),
+        static_cast<T>(v34_), static_cast<T>(v35_), static_cast<T>(v36_),
+        static_cast<T>(v37_), static_cast<T>(v38_), static_cast<T>(v39_),
+        static_cast<T>(v40_), static_cast<T>(v41_), static_cast<T>(v42_),
+        static_cast<T>(v43_), static_cast<T>(v44_), static_cast<T>(v45_),
+        static_cast<T>(v46_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray46(const ValueArray46& other)
+      : v1_(other.v1_), v2_(other.v2_), v3_(other.v3_), v4_(other.v4_),
+        v5_(other.v5_), v6_(other.v6_), v7_(other.v7_), v8_(other.v8_),
+        v9_(other.v9_), v10_(other.v10_), v11_(other.v11_), v12_(other.v12_),
+        v13_(other.v13_), v14_(other.v14_), v15_(other.v15_), v16_(other.v16_),
+        v17_(other.v17_), v18_(other.v18_), v19_(other.v19_), v20_(other.v20_),
+        v21_(other.v21_), v22_(other.v22_), v23_(other.v23_), v24_(other.v24_),
+        v25_(other.v25_), v26_(other.v26_), v27_(other.v27_), v28_(other.v28_),
+        v29_(other.v29_), v30_(other.v30_), v31_(other.v31_),
+        v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+        v35_(other.v35_), v36_(other.v36_), v37_(other.v37_),
+        v38_(other.v38_), v39_(other.v39_), v40_(other.v40_),
+        v41_(other.v41_), v42_(other.v42_), v43_(other.v43_),
+        v44_(other.v44_), v45_(other.v45_), v46_(other.v46_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray46& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+class ValueArray47 {
+ public:
+  ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+        v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+        v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+        v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+        v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+        v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+        v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+        v43_(v43), v44_(v44), v45_(v45), v46_(v46), v47_(v47) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {
+        static_cast<T>(v1_), static_cast<T>(v2_), static_cast<T>(v3_),
+        static_cast<T>(v4_), static_cast<T>(v5_), static_cast<T>(v6_),
+        static_cast<T>(v7_), static_cast<T>(v8_), static_cast<T>(v9_),
+        static_cast<T>(v10_), static_cast<T>(v11_), static_cast<T>(v12_),
+        static_cast<T>(v13_), static_cast<T>(v14_), static_cast<T>(v15_),
+        static_cast<T>(v16_), static_cast<T>(v17_), static_cast<T>(v18_),
+        static_cast<T>(v19_), static_cast<T>(v20_), static_cast<T>(v21_),
+        static_cast<T>(v22_), static_cast<T>(v23_), static_cast<T>(v24_),
+        static_cast<T>(v25_), static_cast<T>(v26_), static_cast<T>(v27_),
+        static_cast<T>(v28_), static_cast<T>(v29_), static_cast<T>(v30_),
+        static_cast<T>(v31_), static_cast<T>(v32_), static_cast<T>(v33_),
+        static_cast<T>(v34_), static_cast<T>(v35_), static_cast<T>(v36_),
+        static_cast<T>(v37_), static_cast<T>(v38_), static_cast<T>(v39_),
+        static_cast<T>(v40_), static_cast<T>(v41_), static_cast<T>(v42_),
+        static_cast<T>(v43_), static_cast<T>(v44_), static_cast<T>(v45_),
+        static_cast<T>(v46_), static_cast<T>(v47_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray47(const ValueArray47& other)
+      : v1_(other.v1_), v2_(other.v2_), v3_(other.v3_), v4_(other.v4_),
+        v5_(other.v5_), v6_(other.v6_), v7_(other.v7_), v8_(other.v8_),
+        v9_(other.v9_), v10_(other.v10_), v11_(other.v11_), v12_(other.v12_),
+        v13_(other.v13_), v14_(other.v14_),
+        v15_(other.v15_), v16_(other.v16_), v17_(other.v17_),
+        v18_(other.v18_), v19_(other.v19_), v20_(other.v20_),
+        v21_(other.v21_), v22_(other.v22_), v23_(other.v23_),
+        v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+        v27_(other.v27_), v28_(other.v28_), v29_(other.v29_),
+        v30_(other.v30_), v31_(other.v31_), v32_(other.v32_),
+        v33_(other.v33_), v34_(other.v34_), v35_(other.v35_),
+        v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+        v39_(other.v39_), v40_(other.v40_), v41_(other.v41_),
+        v42_(other.v42_), v43_(other.v43_), v44_(other.v44_),
+        v45_(other.v45_), v46_(other.v46_), v47_(other.v47_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray47& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+class ValueArray48 {
+ public:
+  ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+        v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+        v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+        v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+        v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+        v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+        v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+        v43_(v43), v44_(v44), v45_(v45), v46_(v46), v47_(v47), v48_(v48) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {
+        static_cast<T>(v1_), static_cast<T>(v2_), static_cast<T>(v3_),
+        static_cast<T>(v4_), static_cast<T>(v5_), static_cast<T>(v6_),
+        static_cast<T>(v7_), static_cast<T>(v8_), static_cast<T>(v9_),
+        static_cast<T>(v10_), static_cast<T>(v11_), static_cast<T>(v12_),
+        static_cast<T>(v13_), static_cast<T>(v14_), static_cast<T>(v15_),
+        static_cast<T>(v16_), static_cast<T>(v17_), static_cast<T>(v18_),
+        static_cast<T>(v19_), static_cast<T>(v20_), static_cast<T>(v21_),
+        static_cast<T>(v22_), static_cast<T>(v23_), static_cast<T>(v24_),
+        static_cast<T>(v25_), static_cast<T>(v26_), static_cast<T>(v27_),
+        static_cast<T>(v28_), static_cast<T>(v29_), static_cast<T>(v30_),
+        static_cast<T>(v31_), static_cast<T>(v32_), static_cast<T>(v33_),
+        static_cast<T>(v34_), static_cast<T>(v35_), static_cast<T>(v36_),
+        static_cast<T>(v37_), static_cast<T>(v38_), static_cast<T>(v39_),
+        static_cast<T>(v40_), static_cast<T>(v41_), static_cast<T>(v42_),
+        static_cast<T>(v43_), static_cast<T>(v44_), static_cast<T>(v45_),
+        static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray48(const ValueArray48& other)
+      : v1_(other.v1_), v2_(other.v2_), v3_(other.v3_), v4_(other.v4_),
+        v5_(other.v5_), v6_(other.v6_), v7_(other.v7_), v8_(other.v8_),
+        v9_(other.v9_), v10_(other.v10_), v11_(other.v11_), v12_(other.v12_),
+        v13_(other.v13_), v14_(other.v14_), v15_(other.v15_), v16_(other.v16_),
+        v17_(other.v17_), v18_(other.v18_), v19_(other.v19_), v20_(other.v20_),
+        v21_(other.v21_), v22_(other.v22_), v23_(other.v23_), v24_(other.v24_),
+        v25_(other.v25_), v26_(other.v26_), v27_(other.v27_), v28_(other.v28_),
+        v29_(other.v29_), v30_(other.v30_), v31_(other.v31_), v32_(other.v32_),
+        v33_(other.v33_), v34_(other.v34_), v35_(other.v35_), v36_(other.v36_),
+        v37_(other.v37_), v38_(other.v38_), v39_(other.v39_), v40_(other.v40_),
+        v41_(other.v41_), v42_(other.v42_), v43_(other.v43_), v44_(other.v44_),
+        v45_(other.v45_), v46_(other.v46_), v47_(other.v47_),
+        v48_(other.v48_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray48& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+class ValueArray49 {
+ public:
+  ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48,
+      T49 v49)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+        v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+        v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+        v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+        v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+        v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+        v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+        v43_(v43), v44_(v44), v45_(v45), v46_(v46), v47_(v47), v48_(v48),
+        v49_(v49) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {
+        static_cast<T>(v1_), static_cast<T>(v2_), static_cast<T>(v3_),
+        static_cast<T>(v4_), static_cast<T>(v5_), static_cast<T>(v6_),
+        static_cast<T>(v7_), static_cast<T>(v8_), static_cast<T>(v9_),
+        static_cast<T>(v10_), static_cast<T>(v11_), static_cast<T>(v12_),
+        static_cast<T>(v13_), static_cast<T>(v14_), static_cast<T>(v15_),
+        static_cast<T>(v16_), static_cast<T>(v17_), static_cast<T>(v18_),
+        static_cast<T>(v19_), static_cast<T>(v20_), static_cast<T>(v21_),
+        static_cast<T>(v22_), static_cast<T>(v23_), static_cast<T>(v24_),
+        static_cast<T>(v25_), static_cast<T>(v26_), static_cast<T>(v27_),
+        static_cast<T>(v28_), static_cast<T>(v29_), static_cast<T>(v30_),
+        static_cast<T>(v31_), static_cast<T>(v32_), static_cast<T>(v33_),
+        static_cast<T>(v34_), static_cast<T>(v35_), static_cast<T>(v36_),
+        static_cast<T>(v37_), static_cast<T>(v38_), static_cast<T>(v39_),
+        static_cast<T>(v40_), static_cast<T>(v41_), static_cast<T>(v42_),
+        static_cast<T>(v43_), static_cast<T>(v44_), static_cast<T>(v45_),
+        static_cast<T>(v46_), static_cast<T>(v47_), static_cast<T>(v48_),
+        static_cast<T>(v49_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray49(const ValueArray49& other)
+      : v1_(other.v1_), v2_(other.v2_), v3_(other.v3_), v4_(other.v4_),
+        v5_(other.v5_), v6_(other.v6_), v7_(other.v7_), v8_(other.v8_),
+        v9_(other.v9_), v10_(other.v10_), v11_(other.v11_), v12_(other.v12_),
+        v13_(other.v13_), v14_(other.v14_), v15_(other.v15_), v16_(other.v16_),
+        v17_(other.v17_), v18_(other.v18_), v19_(other.v19_), v20_(other.v20_),
+        v21_(other.v21_), v22_(other.v22_), v23_(other.v23_), v24_(other.v24_),
+        v25_(other.v25_), v26_(other.v26_), v27_(other.v27_), v28_(other.v28_),
+        v29_(other.v29_), v30_(other.v30_), v31_(other.v31_), v32_(other.v32_),
+        v33_(other.v33_), v34_(other.v34_), v35_(other.v35_), v36_(other.v36_),
+        v37_(other.v37_), v38_(other.v38_), v39_(other.v39_), v40_(other.v40_),
+        v41_(other.v41_), v42_(other.v42_), v43_(other.v43_), v44_(other.v44_),
+        v45_(other.v45_), v46_(other.v46_), v47_(other.v47_), v48_(other.v48_),
+        v49_(other.v49_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray49& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+  const T49 v49_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+class ValueArray50 {
+ public:
+  ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48,
+      T49 v49, T50 v50)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+        v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+        v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+        v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+        v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+        v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+        v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+        v43_(v43), v44_(v44), v45_(v45), v46_(v46), v47_(v47), v48_(v48),
+        v49_(v49), v50_(v50) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {
+        static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_), static_cast<T>(v49_), static_cast<T>(v50_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray50(const ValueArray50& other)
+      : v1_(other.v1_), v2_(other.v2_), v3_(other.v3_), v4_(other.v4_),
+        v5_(other.v5_), v6_(other.v6_), v7_(other.v7_), v8_(other.v8_),
+        v9_(other.v9_), v10_(other.v10_), v11_(other.v11_), v12_(other.v12_),
+        v13_(other.v13_), v14_(other.v14_), v15_(other.v15_), v16_(other.v16_),
+        v17_(other.v17_), v18_(other.v18_), v19_(other.v19_), v20_(other.v20_),
+        v21_(other.v21_), v22_(other.v22_), v23_(other.v23_), v24_(other.v24_),
+        v25_(other.v25_), v26_(other.v26_), v27_(other.v27_), v28_(other.v28_),
+        v29_(other.v29_), v30_(other.v30_), v31_(other.v31_), v32_(other.v32_),
+        v33_(other.v33_), v34_(other.v34_), v35_(other.v35_), v36_(other.v36_),
+        v37_(other.v37_), v38_(other.v38_), v39_(other.v39_), v40_(other.v40_),
+        v41_(other.v41_), v42_(other.v42_), v43_(other.v43_), v44_(other.v44_),
+        v45_(other.v45_), v46_(other.v46_), v47_(other.v47_), v48_(other.v48_),
+        v49_(other.v49_), v50_(other.v50_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray50& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+  const T49 v49_;
+  const T50 v50_;
+};
+
+# if GTEST_HAS_COMBINE
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Generates values from the Cartesian product of values produced
+// by the argument generators.
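+// For example, Combine(Values(1, 2), Values("A", "B")) produces the four
+// tuples (1, "A"), (1, "B"), (2, "A"), (2, "B"), with the last generator
+// varying fastest.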
+//
+template <typename T1, typename T2>
+class CartesianProductGenerator2
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2> > {
+ public:
+  typedef ::testing::tuple<T1, T2> ParamType;
+
+  CartesianProductGenerator2(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2)
+      : g1_(g1), g2_(g2) {}
+  virtual ~CartesianProductGenerator2() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+        const ParamGenerator<T1>& g1,
+        const typename ParamGenerator<T1>::iterator& current1,
+        const ParamGenerator<T2>& g2,
+        const typename ParamGenerator<T2>::iterator& current2)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current2_;
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return current_value_.get(); }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+          (current1_ == typed_other->current1_ &&
+           current2_ == typed_other->current2_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+          begin1_(other.begin1_),
+          end1_(other.end1_),
+          current1_(other.current1_),
+          begin2_(other.begin2_),
+          end2_(other.end2_),
+          current2_(other.current2_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_.reset(new ParamType(*current1_, *current2_));
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    linked_ptr<ParamType> current_value_;
+  };  // class CartesianProductGenerator2::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator2& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+};  // class CartesianProductGenerator2
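+
+// Example use (illustrative): given a test fixture FooTest deriving from
+// ::testing::TestWithParam< ::testing::tuple<int, bool> >, one might write
+//
+//   INSTANTIATE_TEST_CASE_P(Ranges, FooTest,
+//       ::testing::Combine(::testing::Values(1, 2), ::testing::Bool()));
+//
+// which instantiates FooTest for the four (int, bool) combinations.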
+
+
+template <typename T1, typename T2, typename T3>
+class CartesianProductGenerator3
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3> ParamType;
+
+  CartesianProductGenerator3(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  virtual ~CartesianProductGenerator3() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+        const ParamGenerator<T1>& g1,
+        const typename ParamGenerator<T1>::iterator& current1,
+        const ParamGenerator<T2>& g2,
+        const typename ParamGenerator<T2>::iterator& current2,
+        const ParamGenerator<T3>& g3,
+        const typename ParamGenerator<T3>::iterator& current3)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current3_;
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return current_value_.get(); }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+          (current1_ == typed_other->current1_ &&
+           current2_ == typed_other->current2_ &&
+           current3_ == typed_other->current3_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+          begin1_(other.begin1_),
+          end1_(other.end1_),
+          current1_(other.current1_),
+          begin2_(other.begin2_),
+          end2_(other.end2_),
+          current2_(other.current2_),
+          begin3_(other.begin3_),
+          end3_(other.end3_),
+          current3_(other.current3_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_.reset(new ParamType(*current1_, *current2_,
+            *current3_));
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    linked_ptr<ParamType> current_value_;
+  };  // class CartesianProductGenerator3::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator3& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+};  // class CartesianProductGenerator3
+
+
+template <typename T1, typename T2, typename T3, typename T4>
+class CartesianProductGenerator4
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4> ParamType;
+
+  CartesianProductGenerator4(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
+  virtual ~CartesianProductGenerator4() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+        const ParamGenerator<T1>& g1,
+        const typename ParamGenerator<T1>::iterator& current1,
+        const ParamGenerator<T2>& g2,
+        const typename ParamGenerator<T2>::iterator& current2,
+        const ParamGenerator<T3>& g3,
+        const typename ParamGenerator<T3>::iterator& current3,
+        const ParamGenerator<T4>& g4,
+        const typename ParamGenerator<T4>::iterator& current4)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
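+// Note: a CartesianProductGeneratorN sequence visits every combination
+// exactly once, so its length is the product of the component generators'
+// lengths; Advance() increments the last component iterator fastest,
+// odometer-style.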
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current4_;
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return current_value_.get(); }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+          (current1_ == typed_other->current1_ &&
+           current2_ == typed_other->current2_ &&
+           current3_ == typed_other->current3_ &&
+           current4_ == typed_other->current4_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+          begin1_(other.begin1_),
+          end1_(other.end1_),
+          current1_(other.current1_),
+          begin2_(other.begin2_),
+          end2_(other.end2_),
+          current2_(other.current2_),
+          begin3_(other.begin3_),
+          end3_(other.end3_),
+          current3_(other.current3_),
+          begin4_(other.begin4_),
+          end4_(other.end4_),
+          current4_(other.current4_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_.reset(new ParamType(*current1_, *current2_, *current3_,
+            *current4_));
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    linked_ptr<ParamType> current_value_;
+  };  // class CartesianProductGenerator4::Iterator
+
+  // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductGenerator4& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; +}; // class CartesianProductGenerator4 + + +template +class CartesianProductGenerator5 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator5(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} + virtual ~CartesianProductGenerator5() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current5_; + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return current_value_.get(); } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_)); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + linked_ptr current_value_; + }; // class CartesianProductGenerator5::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator5& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; +}; // class CartesianProductGenerator5 + + +template +class CartesianProductGenerator6 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator6(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} + virtual ~CartesianProductGenerator6() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current6_; + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return current_value_.get(); } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_)); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + linked_ptr current_value_; + }; // class CartesianProductGenerator6::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator6& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; +}; // class CartesianProductGenerator6 + + +template +class CartesianProductGenerator7 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator7(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} + virtual ~CartesianProductGenerator7() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current7_; + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return current_value_.get(); } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. 
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_)); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + linked_ptr current_value_; + }; // class CartesianProductGenerator7::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator7& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; +}; // class CartesianProductGenerator7 + + +template +class CartesianProductGenerator8 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator8(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7, + const ParamGenerator& g8) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), + g8_(g8) {} + virtual ~CartesianProductGenerator8() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7, + const ParamGenerator& g8, + const typename ParamGenerator::iterator& current8) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current8_; + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return current_value_.get(); } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_)); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. 
+ const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + const typename ParamGenerator::iterator begin8_; + const typename ParamGenerator::iterator end8_; + typename ParamGenerator::iterator current8_; + linked_ptr current_value_; + }; // class CartesianProductGenerator8::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator8& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; + const ParamGenerator g8_; +}; // class CartesianProductGenerator8 + + +template +class CartesianProductGenerator9 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator9(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7, + const ParamGenerator& g8, const ParamGenerator& g9) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9) {} + virtual ~CartesianProductGenerator9() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end(), g9_, g9_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7, + const ParamGenerator& g8, + const typename ParamGenerator::iterator& current8, + const ParamGenerator& g9, + const typename ParamGenerator::iterator& current9) + : base_(base), + 
begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8), + begin9_(g9.begin()), end9_(g9.end()), current9_(current9) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current9_; + if (current9_ == end9_) { + current9_ = begin9_; + ++current8_; + } + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return current_value_.get(); } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_ && + current9_ == typed_other->current9_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_), + begin9_(other.begin9_), + end9_(other.end9_), + current9_(other.current9_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_, + *current9_)); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_ || + current9_ == end9_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + const typename ParamGenerator::iterator begin8_; + const typename ParamGenerator::iterator end8_; + typename ParamGenerator::iterator current8_; + const typename ParamGenerator::iterator begin9_; + const typename ParamGenerator::iterator end9_; + typename ParamGenerator::iterator current9_; + linked_ptr current_value_; + }; // class CartesianProductGenerator9::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator9& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; + const ParamGenerator g8_; + const ParamGenerator g9_; +}; // class CartesianProductGenerator9 + + +template +class CartesianProductGenerator10 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator10(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7, + const ParamGenerator& g8, const ParamGenerator& g9, + const ParamGenerator& g10) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9), g10_(g10) {} + virtual ~CartesianProductGenerator10() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end(), g9_, g9_.end(), g10_, g10_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7, + const ParamGenerator& g8, + const typename ParamGenerator::iterator& current8, + const ParamGenerator& g9, + const typename ParamGenerator::iterator& current9, + const ParamGenerator& g10, + const typename ParamGenerator::iterator& current10) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8), + begin9_(g9.begin()), end9_(g9.end()), current9_(current9), + begin10_(g10.begin()), end10_(g10.end()), current10_(current10) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current10_; + if (current10_ == end10_) { + current10_ = begin10_; + ++current9_; + } + if (current9_ == end9_) { + current9_ = begin9_; + ++current8_; + } + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return current_value_.get(); } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_ && + current9_ == typed_other->current9_ && + current10_ == typed_other->current10_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_), + begin9_(other.begin9_), + end9_(other.end9_), + current9_(other.current9_), + begin10_(other.begin10_), + end10_(other.end10_), + current10_(other.current10_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_, + *current9_, *current10_)); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_ || + current9_ == end9_ || + current10_ == end10_; + } + + // No implementation - assignment is unsupported. 
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    const typename ParamGenerator<T9>::iterator begin9_;
+    const typename ParamGenerator<T9>::iterator end9_;
+    typename ParamGenerator<T9>::iterator current9_;
+    const typename ParamGenerator<T10>::iterator begin10_;
+    const typename ParamGenerator<T10>::iterator end10_;
+    typename ParamGenerator<T10>::iterator current10_;
+    linked_ptr<ParamType> current_value_;
+  };  // class CartesianProductGenerator10::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator10& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+  const ParamGenerator<T9> g9_;
+  const ParamGenerator<T10> g10_;
+};  // class CartesianProductGenerator10
+
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Helper classes providing Combine() with polymorphic features. They allow
+// casting CartesianProductGeneratorN<T1, ..., TN> to ParamGenerator<U> if T is
+// convertible to U.
+//
+template <class Generator1, class Generator2>
+class CartesianProductHolder2 {
+ public:
+CartesianProductHolder2(const Generator1& g1, const Generator2& g2)
+      : g1_(g1), g2_(g2) {}
+  template <typename T1, typename T2>
+  operator ParamGenerator< ::testing::tuple<T1, T2> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2> >(
+        new CartesianProductGenerator2<T1, T2>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder2& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+};  // class CartesianProductHolder2
+
+template <class Generator1, class Generator2, class Generator3>
+class CartesianProductHolder3 {
+ public:
+CartesianProductHolder3(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  template <typename T1, typename T2, typename T3>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3> >(
+        new CartesianProductGenerator3<T1, T2, T3>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductHolder3& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; +}; // class CartesianProductHolder3 + +template +class CartesianProductHolder4 { + public: +CartesianProductHolder4(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator4( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder4& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; +}; // class CartesianProductHolder4 + +template +class CartesianProductHolder5 { + public: +CartesianProductHolder5(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator5( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder5& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; +}; // class CartesianProductHolder5 + +template +class CartesianProductHolder6 { + public: +CartesianProductHolder6(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator6( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder6& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; +}; // class CartesianProductHolder6 + +template +class CartesianProductHolder7 { + public: +CartesianProductHolder7(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator7( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder7& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; +}; // class CartesianProductHolder7 + +template +class CartesianProductHolder8 { + public: +CartesianProductHolder8(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), + g8_(g8) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator8( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_), + static_cast >(g8_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder8& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; +}; // class CartesianProductHolder8 + +template +class CartesianProductHolder9 { + public: +CartesianProductHolder9(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8, + const Generator9& g9) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator9( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_), + static_cast >(g8_), + static_cast >(g9_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder9& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; + const Generator9 g9_; +}; // class CartesianProductHolder9 + +template +class CartesianProductHolder10 { + public: +CartesianProductHolder10(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8, + const Generator9& g9, const Generator10& g10) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9), g10_(g10) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator10( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_), + static_cast >(g8_), + static_cast >(g9_), + static_cast >(g10_))); + } + + private: + // No implementation - assignment is unsupported. 
+  void operator=(const CartesianProductHolder10& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+  const Generator9 g9_;
+  const Generator10 g10_;
+};  // class CartesianProductHolder10
+
+# endif  // GTEST_HAS_COMBINE
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+
+namespace testing {
+
+// Functions producing parameter generators.
+//
+// Google Test uses these generators to produce parameters for value-
+// parameterized tests. When a parameterized test case is instantiated
+// with a particular generator, Google Test creates and runs tests
+// for each element in the sequence produced by the generator.
+//
+// In the following sample, tests from test case FooTest are instantiated
+// three times each, with parameter values 3, 5, and 8:
+//
+// class FooTest : public TestWithParam<int> { ... };
+//
+// TEST_P(FooTest, TestThis) {
+// }
+// TEST_P(FooTest, TestThat) {
+// }
+// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8));
+//
+
+// Range() returns generators providing sequences of values in a range.
+//
+// Synopsis:
+// Range(start, end)
+//   - returns a generator producing a sequence of values {start, start+1,
+//     start+2, ...} up to but not including end.
+// Range(start, end, step)
+//   - returns a generator producing a sequence of values {start, start+step,
+//     start+step+step, ...} up to but not including end.
+// Notes:
+//   * The generated sequences never include end. For example, Range(1, 5)
+//     returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2)
+//     returns a generator producing {1, 3, 5, 7}.
+//   * start and end must have the same type. That type may be any integral or
+//     floating-point type or a user-defined type satisfying these conditions:
+//     * It must be assignable (have operator=() defined).
+//     * It must have operator+() (operator+(int-compatible type) for
+//       the two-operand version).
+//     * It must have operator<() defined.
+//     Elements in the resulting sequences will also have that type.
+//   * The condition start < end must be satisfied in order for resulting
+//     sequences to contain any elements.
+//
+template <typename T, typename IncrementT>
+internal::ParamGenerator<T> Range(T start, T end, IncrementT step) {
+  return internal::ParamGenerator<T>(
+      new internal::RangeGenerator<T, IncrementT>(start, end, step));
+}
+
+template <typename T>
+internal::ParamGenerator<T> Range(T start, T end) {
+  return Range(start, end, 1);
+}
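+
+// For orientation, a minimal usage sketch of the Range() overloads above.
+// Hedged: the ScaleTest fixture and its assertions are illustrative
+// assumptions, not part of this header; only TestWithParam, TEST_P,
+// GetParam, INSTANTIATE_TEST_CASE_P, and Range itself come from Google Test.
+//
+// #include "gtest/gtest.h"
+//
+// // ScaleTest is a hypothetical fixture; its parameter type matches T.
+// class ScaleTest : public ::testing::TestWithParam<int> {};
+//
+// TEST_P(ScaleTest, ParamStaysBelowEnd) {
+//   // Range(0, 10, 2) yields 0, 2, 4, 6, 8; the end value is never produced.
+//   EXPECT_GE(GetParam(), 0);
+//   EXPECT_LT(GetParam(), 10);
+// }
+//
+// INSTANTIATE_TEST_CASE_P(EvenValues, ScaleTest, ::testing::Range(0, 10, 2));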
+
+// ValuesIn() allows generation of tests with parameters coming from
+// a container.
+//
+// Synopsis:
+// ValuesIn(const T (&array)[N])
+//   - returns a generator producing sequences with elements from
+//     a C-style array.
+// ValuesIn(const Container& container)
+//   - returns a generator producing sequences with elements from
+//     an STL-style container.
+// ValuesIn(Iterator begin, Iterator end)
+//   - returns a generator producing sequences with elements from
+//     a range [begin, end) defined by a pair of STL-style iterators. These
+//     iterators can also be plain C pointers.
+//
+// Please note that ValuesIn copies the values from the containers
+// passed in and keeps them to generate tests in RUN_ALL_TESTS().
+//
+// Examples:
+//
+// This instantiates tests from test case StringTest
+// each with C-string values of "foo", "bar", and "baz":
+//
+// const char* strings[] = {"foo", "bar", "baz"};
+// INSTANTIATE_TEST_CASE_P(StringSequence, StringTest, ValuesIn(strings));
+//
+// This instantiates tests from test case StlStringTest
+// each with the STL strings "a" and "b":
+//
+// ::std::vector< ::std::string> GetParameterStrings() {
+//   ::std::vector< ::std::string> v;
+//   v.push_back("a");
+//   v.push_back("b");
+//   return v;
+// }
+//
+// INSTANTIATE_TEST_CASE_P(CharSequence,
+//                         StlStringTest,
+//                         ValuesIn(GetParameterStrings()));
+//
+// This will also instantiate tests from CharTest
+// each with parameter values 'a' and 'b':
+//
+// ::std::list<char> GetParameterChars() {
+//   ::std::list<char> list;
+//   list.push_back('a');
+//   list.push_back('b');
+//   return list;
+// }
+// ::std::list<char> l = GetParameterChars();
+// INSTANTIATE_TEST_CASE_P(CharSequence2,
+//                         CharTest,
+//                         ValuesIn(l.begin(), l.end()));
+//
+template <typename ForwardIterator>
+internal::ParamGenerator<
+  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end) {
+  typedef typename ::testing::internal::IteratorTraits<ForwardIterator>
+      ::value_type ParamType;
+  return internal::ParamGenerator<ParamType>(
+      new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end));
+}
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
+  return ValuesIn(array, array + N);
+}
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container& container) {
+  return ValuesIn(container.begin(), container.end());
+}
+
+// Values() allows generating tests from an explicitly specified list of
+// parameters.
+//
+// Synopsis:
+// Values(T v1, T v2, ..., T vN)
+//   - returns a generator producing sequences with elements v1, v2, ..., vN.
+//
+// For example, this instantiates tests from test case BarTest each
+// with values "one", "two", and "three":
+//
+// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three"));
+//
+// This instantiates tests from test case BazTest each with values 1, 2, 3.5.
+// The exact type of values will depend on the type of parameter in BazTest.
+//
+// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
+//
+// Currently, Values() supports from 1 to 50 parameters.
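+
+// Tying together the ValuesIn() overloads and the Values() synopsis above,
+// a short hedged sketch; the ParserTest fixture and its inputs are invented
+// for illustration and are not part of this header.
+//
+// #include <string>
+// #include <vector>
+// #include "gtest/gtest.h"
+//
+// // Hypothetical fixture parameterized over std::string.
+// class ParserTest : public ::testing::TestWithParam< ::std::string> {};
+//
+// TEST_P(ParserTest, InputIsNonEmpty) {
+//   EXPECT_FALSE(GetParam().empty());
+// }
+//
+// // Container overload: ValuesIn copies the elements, so the temporary
+// // returned by ParserInputs() is safe to use here.
+// ::std::vector< ::std::string> ParserInputs() {
+//   ::std::vector< ::std::string> v;
+//   v.push_back("foo");
+//   v.push_back("bar");
+//   return v;
+// }
+// INSTANTIATE_TEST_CASE_P(FromContainer, ParserTest,
+//                         ::testing::ValuesIn(ParserInputs()));
+//
+// // Values(): each literal is converted to the fixture's parameter type.
+// INSTANTIATE_TEST_CASE_P(Explicit, ParserTest,
+//                         ::testing::Values("alpha", "beta"));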
+// +template +internal::ValueArray1 Values(T1 v1) { + return internal::ValueArray1(v1); +} + +template +internal::ValueArray2 Values(T1 v1, T2 v2) { + return internal::ValueArray2(v1, v2); +} + +template +internal::ValueArray3 Values(T1 v1, T2 v2, T3 v3) { + return internal::ValueArray3(v1, v2, v3); +} + +template +internal::ValueArray4 Values(T1 v1, T2 v2, T3 v3, T4 v4) { + return internal::ValueArray4(v1, v2, v3, v4); +} + +template +internal::ValueArray5 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5) { + return internal::ValueArray5(v1, v2, v3, v4, v5); +} + +template +internal::ValueArray6 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6) { + return internal::ValueArray6(v1, v2, v3, v4, v5, v6); +} + +template +internal::ValueArray7 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7) { + return internal::ValueArray7(v1, v2, v3, v4, v5, + v6, v7); +} + +template +internal::ValueArray8 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) { + return internal::ValueArray8(v1, v2, v3, v4, + v5, v6, v7, v8); +} + +template +internal::ValueArray9 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) { + return internal::ValueArray9(v1, v2, v3, + v4, v5, v6, v7, v8, v9); +} + +template +internal::ValueArray10 Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) { + return internal::ValueArray10(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10); +} + +template +internal::ValueArray11 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11) { + return internal::ValueArray11(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11); +} + +template +internal::ValueArray12 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12) { + return internal::ValueArray12(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12); +} + +template +internal::ValueArray13 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13) { + return internal::ValueArray13(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13); +} + +template +internal::ValueArray14 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) { + return internal::ValueArray14(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14); +} + +template +internal::ValueArray15 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) { + return internal::ValueArray15(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); +} + +template +internal::ValueArray16 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16) { + return internal::ValueArray16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16); +} + +template +internal::ValueArray17 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17) { + return internal::ValueArray17(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17); +} + +template +internal::ValueArray18 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18) { + return internal::ValueArray18(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18); +} + +template +internal::ValueArray19 
Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) { + return internal::ValueArray19(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19); +} + +template +internal::ValueArray20 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) { + return internal::ValueArray20(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20); +} + +template +internal::ValueArray21 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) { + return internal::ValueArray21(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21); +} + +template +internal::ValueArray22 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22) { + return internal::ValueArray22(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22); +} + +template +internal::ValueArray23 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23) { + return internal::ValueArray23(v1, v2, v3, + v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23); +} + +template +internal::ValueArray24 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24) { + return internal::ValueArray24(v1, v2, + v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, + v19, v20, v21, v22, v23, v24); +} + +template +internal::ValueArray25 Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, + T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, + T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) { + return internal::ValueArray25(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, + v18, v19, v20, v21, v22, v23, v24, v25); +} + +template +internal::ValueArray26 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26) { + return internal::ValueArray26(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26); +} + +template +internal::ValueArray27 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27) { + return internal::ValueArray27(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, + v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27); +} + +template +internal::ValueArray28 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 
v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28) { + return internal::ValueArray28(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, + v28); +} + +template +internal::ValueArray29 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29) { + return internal::ValueArray29(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, + v27, v28, v29); +} + +template +internal::ValueArray30 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) { + return internal::ValueArray30(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, + v26, v27, v28, v29, v30); +} + +template +internal::ValueArray31 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) { + return internal::ValueArray31(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, + v25, v26, v27, v28, v29, v30, v31); +} + +template +internal::ValueArray32 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32) { + return internal::ValueArray32(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32); +} + +template +internal::ValueArray33 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33) { + return internal::ValueArray33(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33); +} + +template +internal::ValueArray34 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, + T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, + T31 v31, T32 v32, T33 v33, T34 v34) { + return internal::ValueArray34(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, + v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34); +} + +template +internal::ValueArray35 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, 
T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) { + return internal::ValueArray35(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, + v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35); +} + +template +internal::ValueArray36 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) { + return internal::ValueArray36(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36); +} + +template +internal::ValueArray37 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37) { + return internal::ValueArray37(v1, v2, v3, + v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36, v37); +} + +template +internal::ValueArray38 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37, T38 v38) { + return internal::ValueArray38(v1, v2, + v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, + v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, + v33, v34, v35, v36, v37, v38); +} + +template +internal::ValueArray39 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37, T38 v38, T39 v39) { + return internal::ValueArray39(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, + v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39); +} + +template +internal::ValueArray40 Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, + T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, + T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, + T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, + T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) { + return internal::ValueArray40(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, + v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40); +} + +template +internal::ValueArray41 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + 
T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) { + return internal::ValueArray41(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, + v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, + v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41); +} + +template +internal::ValueArray42 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42) { + return internal::ValueArray42(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, + v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, + v42); +} + +template +internal::ValueArray43 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43) { + return internal::ValueArray43(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, + v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, + v41, v42, v43); +} + +template +internal::ValueArray44 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44) { + return internal::ValueArray44(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, + v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, + v40, v41, v42, v43, v44); +} + +template +internal::ValueArray45 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, + T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, + T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) { + return internal::ValueArray45(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, + v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, + v39, v40, v41, v42, v43, v44, v45); +} + +template +internal::ValueArray46 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 
v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) { + return internal::ValueArray46(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, + v38, v39, v40, v41, v42, v43, v44, v45, v46); +} + +template +internal::ValueArray47 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) { + return internal::ValueArray47(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, + v38, v39, v40, v41, v42, v43, v44, v45, v46, v47); +} + +template +internal::ValueArray48 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, + T48 v48) { + return internal::ValueArray48(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, + v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, + v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48); +} + +template +internal::ValueArray49 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, + T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, + T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, + T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, + T47 v47, T48 v48, T49 v49) { + return internal::ValueArray49(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, + v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, + v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49); +} + +template +internal::ValueArray50 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, + T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, + T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) { + return internal::ValueArray50(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50); +} + +// Bool() allows generating tests with parameters in a set of (false, true). +// +// Synopsis: +// Bool() +// - returns a generator producing sequences with elements {false, true}. 
+//
+// It is useful when testing code that depends on Boolean flags. Combinations
+// of multiple flags can be tested when several Bool()'s are combined using
+// the Combine() function.
+//
+// In the following example all tests in the test case FlagDependentTest
+// will be instantiated twice with parameters false and true.
+//
+// class FlagDependentTest : public testing::TestWithParam<bool> {
+//   virtual void SetUp() {
+//     external_flag = GetParam();
+//   }
+// }
+// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool());
+//
+inline internal::ParamGenerator<bool> Bool() {
+  return Values(false, true);
+}
+
+# if GTEST_HAS_COMBINE
+// Combine() allows the user to combine two or more sequences to produce
+// values of a Cartesian product of those sequences' elements.
+//
+// Synopsis:
+// Combine(gen1, gen2, ..., genN)
+//   - returns a generator producing sequences with elements coming from
+//     the Cartesian product of elements from the sequences generated by
+//     gen1, gen2, ..., genN. The sequence elements will have a type of
+//     tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
+//     of elements from sequences produced by gen1, gen2, ..., genN.
+//
+// Combine can have up to 10 arguments. This number is currently limited
+// by the maximum number of elements in the tuple implementation used by Google
+// Test.
+//
+// Example:
+//
+// This will instantiate tests in test case AnimalTest each one with
+// the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
+// tuple("dog", BLACK), and tuple("dog", WHITE):
+//
+// enum Color { BLACK, GRAY, WHITE };
+// class AnimalTest
+//     : public testing::TestWithParam<tuple<const char*, Color> > {...};
+//
+// TEST_P(AnimalTest, AnimalLooksNice) {...}
+//
+// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest,
+//                         Combine(Values("cat", "dog"),
+//                                 Values(BLACK, WHITE)));
+//
+// This will instantiate tests in FlagDependentTest with all variations of two
+// Boolean flags:
+//
+// class FlagDependentTest
+//     : public testing::TestWithParam<tuple<bool, bool> > {
+//   virtual void SetUp() {
+//     // Assigns external_flag_1 and external_flag_2 values from the tuple.
+//     tie(external_flag_1, external_flag_2) = GetParam();
+//   }
+// };
+//
+// TEST_P(FlagDependentTest, TestFeature1) {
+//   // Test your code using external_flag_1 and external_flag_2 here.
+// }
+// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest,
+//                         Combine(Bool(), Bool()));
+//
+template <typename Generator1, typename Generator2>
+internal::CartesianProductHolder2<Generator1, Generator2> Combine(
+    const Generator1& g1, const Generator2& g2) {
+  return internal::CartesianProductHolder2<Generator1, Generator2>(
+      g1, g2);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3>
+internal::CartesianProductHolder3<Generator1, Generator2, Generator3> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3) {
+  return internal::CartesianProductHolder3<Generator1, Generator2, Generator3>(
+      g1, g2, g3);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4>
+internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
+    Generator4> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+    const Generator4& g4) {
+  return internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
+      Generator4>(
+      g1, g2, g3, g4);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5>
+internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
+    Generator4, Generator5> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+    const Generator4& g4, const Generator5& g5) {
+  return internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
+      Generator4, Generator5>(
+      g1, g2, g3, g4, g5);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6>
+internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+    const Generator4& g4, const Generator5& g5, const Generator6& g6) {
+  return internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6>(
+      g1, g2, g3, g4, g5, g6);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7>
+internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+    const Generator4& g4, const Generator5& g5, const Generator6& g6,
+    const Generator7& g7) {
+  return internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7>(
+      g1, g2, g3, g4, g5, g6, g7);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8>
+internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+    const Generator4& g4, const Generator5& g5, const Generator6& g6,
+    const Generator7& g7, const Generator8& g8) {
+  return internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8>(
+      g1, g2, g3, g4, g5, g6, g7, g8);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8, typename Generator9>
+internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8, Generator9>
+Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+    const Generator4& g4, const Generator5& g5, const Generator6& g6,
+    const Generator7& g7, const Generator8& g8, const Generator9& g9) {
+  return internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9>(
+      g1, g2, g3, g4, g5, g6, g7, g8, g9);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8, typename Generator9,
+    typename Generator10>
+internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+    Generator10> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+    const Generator4& g4, const Generator5& g5, const Generator6& g6,
+    const Generator7& g7, const Generator8& g8, const Generator9& g9,
+    const Generator10& g10) {
+  return internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+      Generator10>(
+      g1, g2, g3, g4, g5, g6, g7, g8, g9, g10);
+}
+# endif // GTEST_HAS_COMBINE
+
+# define TEST_P(test_case_name, test_name) \
+  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+      : public test_case_name { \
+   public: \
+    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
+    virtual void TestBody(); \
+   private: \
+    static int AddToRegistry() { \
+      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+          GetTestCasePatternHolder<test_case_name>(\
+              #test_case_name, \
+              ::testing::internal::CodeLocation(\
+                  __FILE__, __LINE__))->AddTestPattern(\
+                      GTEST_STRINGIFY_(test_case_name), \
+                      GTEST_STRINGIFY_(test_name), \
+                      new ::testing::internal::TestMetaFactory< \
+                          GTEST_TEST_CLASS_NAME_(\
+                              test_case_name, test_name)>()); \
+      return 0; \
+    } \
+    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+        GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
+  }; \
+  int GTEST_TEST_CLASS_NAME_(test_case_name, \
+                             test_name)::gtest_registering_dummy_ = \
+      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
+  void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
+
+// The optional last argument to INSTANTIATE_TEST_CASE_P allows the user
+// to specify a function or functor that generates custom test name suffixes
+// based on the test parameters. The function should accept one argument of
+// type testing::TestParamInfo<class ParamType>, and return std::string.
+//
+// testing::PrintToStringParamName is a builtin test suffix generator that
+// returns the value of testing::PrintToString(GetParam()).
+//
+// Note: test names must be non-empty, unique, and may only contain ASCII
+// alphanumeric characters or underscore. Because PrintToString adds quotes
+// to std::string and C strings, it won't work for these types.
+
+# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \
+  static ::testing::internal::ParamGenerator<test_case_name::ParamType> \
+      gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
+  static ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \
+      const ::testing::TestParamInfo<test_case_name::ParamType>& info) { \
+    return ::testing::internal::GetParamNameGen<test_case_name::ParamType> \
+        (__VA_ARGS__)(info); \
+  } \
+  static int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \
+      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+          GetTestCasePatternHolder<test_case_name>(\
+              #test_case_name, \
+              ::testing::internal::CodeLocation(\
+                  __FILE__, __LINE__))->AddTestCaseInstantiation(\
+                      #prefix, \
+                      &gtest_##prefix##test_case_name##_EvalGenerator_, \
+                      &gtest_##prefix##test_case_name##_EvalGenerateName_, \
+                      __FILE__, __LINE__)
+
+} // namespace testing
+
+#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// Google C++ Testing and Mocking Framework definitions useful in production code. +// GOOGLETEST_CM0003 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_ + +// When you need to test the private or protected members of a class, +// use the FRIEND_TEST macro to declare your tests as friends of the +// class. For example: +// +// class MyClass { +// private: +// void PrivateMethod(); +// FRIEND_TEST(MyClassTest, PrivateMethodWorks); +// }; +// +// class MyClassTest : public testing::Test { +// // ... +// }; +// +// TEST_F(MyClassTest, PrivateMethodWorks) { +// // Can call MyClass::PrivateMethod() here. +// } +// +// Note: The test class must be in the same namespace as the class being tested. +// For example, putting MyClassTest in an anonymous namespace will not work. + +#define FRIEND_TEST(test_case_name, test_name)\ +friend class test_case_name##_##test_name##_Test + +#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ + +#include +#include + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// A copyable object representing the result of a test part (i.e. an +// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()). 
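+//
+// For example (a sketch; FailureLogger is a hypothetical name): a custom
+// listener derived from testing::EmptyTestEventListener receives one
+// TestPartResult per assertion through OnTestPartResult, and would be
+// registered via testing::UnitTest::GetInstance()->listeners().Append(...):
+//
+// class FailureLogger : public testing::EmptyTestEventListener {
+//   virtual void OnTestPartResult(const testing::TestPartResult& result) {
+//     if (result.failed()) {
+//       // file_name() may be NULL when the location is unknown.
+//       printf("%s:%d: %s\n",
+//              result.file_name() ? result.file_name() : "unknown",
+//              result.line_number(), result.summary());
+//     }
+//   }
+// };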
+// +// Don't inherit from TestPartResult as its destructor is not virtual. +class GTEST_API_ TestPartResult { + public: + // The possible outcomes of a test part (i.e. an assertion or an + // explicit SUCCEED(), FAIL(), or ADD_FAILURE()). + enum Type { + kSuccess, // Succeeded. + kNonFatalFailure, // Failed but the test can continue. + kFatalFailure // Failed and the test should be terminated. + }; + + // C'tor. TestPartResult does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestPartResult object. + TestPartResult(Type a_type, + const char* a_file_name, + int a_line_number, + const char* a_message) + : type_(a_type), + file_name_(a_file_name == NULL ? "" : a_file_name), + line_number_(a_line_number), + summary_(ExtractSummary(a_message)), + message_(a_message) { + } + + // Gets the outcome of the test part. + Type type() const { return type_; } + + // Gets the name of the source file where the test part took place, or + // NULL if it's unknown. + const char* file_name() const { + return file_name_.empty() ? NULL : file_name_.c_str(); + } + + // Gets the line in the source file where the test part took place, + // or -1 if it's unknown. + int line_number() const { return line_number_; } + + // Gets the summary of the failure message. + const char* summary() const { return summary_.c_str(); } + + // Gets the message associated with the test part. + const char* message() const { return message_.c_str(); } + + // Returns true iff the test part passed. + bool passed() const { return type_ == kSuccess; } + + // Returns true iff the test part failed. + bool failed() const { return type_ != kSuccess; } + + // Returns true iff the test part non-fatally failed. + bool nonfatally_failed() const { return type_ == kNonFatalFailure; } + + // Returns true iff the test part fatally failed. + bool fatally_failed() const { return type_ == kFatalFailure; } + + private: + Type type_; + + // Gets the summary of the failure message by omitting the stack + // trace in it. + static std::string ExtractSummary(const char* message); + + // The name of the source file where the test part took place, or + // "" if the source file is unknown. + std::string file_name_; + // The line in the source file where the test part took place, or -1 + // if the line number is unknown. + int line_number_; + std::string summary_; // The test failure summary. + std::string message_; // The test failure message. +}; + +// Prints a TestPartResult object. +std::ostream& operator<<(std::ostream& os, const TestPartResult& result); + +// An array of TestPartResult objects. +// +// Don't inherit from TestPartResultArray as its destructor is not +// virtual. +class GTEST_API_ TestPartResultArray { + public: + TestPartResultArray() {} + + // Appends the given TestPartResult to the array. + void Append(const TestPartResult& result); + + // Returns the TestPartResult at the given index (0-based). + const TestPartResult& GetTestPartResult(int index) const; + + // Returns the number of TestPartResult objects in the array. + int size() const; + + private: + std::vector array_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray); +}; + +// This interface knows how to report a test part result. 
+class GTEST_API_ TestPartResultReporterInterface { + public: + virtual ~TestPartResultReporterInterface() {} + + virtual void ReportTestPartResult(const TestPartResult& result) = 0; +}; + +namespace internal { + +// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a +// statement generates new fatal failures. To do so it registers itself as the +// current test part result reporter. Besides checking if fatal failures were +// reported, it only delegates the reporting to the former result reporter. +// The original result reporter is restored in the destructor. +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +class GTEST_API_ HasNewFatalFailureHelper + : public TestPartResultReporterInterface { + public: + HasNewFatalFailureHelper(); + virtual ~HasNewFatalFailureHelper(); + virtual void ReportTestPartResult(const TestPartResult& result); + bool has_new_fatal_failure() const { return has_new_fatal_failure_; } + private: + bool has_new_fatal_failure_; + TestPartResultReporterInterface* original_reporter_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); +}; + +} // namespace internal + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ + +// This header implements typed tests and type-parameterized tests. + +// Typed (aka type-driven) tests repeat the same test for types in a +// list. You must know which types you want to test with when writing +// typed tests. Here's how you do it: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + public: + ... 
+ typedef std::list List; + static T shared_; + T value_; +}; + +// Next, associate a list of types with the test case, which will be +// repeated for each type in the list. The typedef is necessary for +// the macro to parse correctly. +typedef testing::Types MyTypes; +TYPED_TEST_CASE(FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// TYPED_TEST_CASE(FooTest, int); + +// Then, use TYPED_TEST() instead of TEST_F() to define as many typed +// tests for this test case as you want. +TYPED_TEST(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + // Since we are inside a derived class template, C++ requires use to + // visit the members of FooTest via 'this'. + TypeParam n = this->value_; + + // To visit static members of the fixture, add the TestFixture:: + // prefix. + n += TestFixture::shared_; + + // To refer to typedefs in the fixture, add the "typename + // TestFixture::" prefix. + typename TestFixture::List values; + values.push_back(n); + ... +} + +TYPED_TEST(FooTest, HasPropertyA) { ... } + +// TYPED_TEST_CASE takes an optional third argument which allows to specify a +// class that generates custom test name suffixes based on the type. This should +// be a class which has a static template function GetName(int index) returning +// a string for each type. The provided integer index equals the index of the +// type in the provided type list. In many cases the index can be ignored. +// +// For example: +// class MyTypeNames { +// public: +// template +// static std::string GetName(int) { +// if (std::is_same()) return "char"; +// if (std::is_same()) return "int"; +// if (std::is_same()) return "unsignedInt"; +// } +// }; +// TYPED_TEST_CASE(FooTest, MyTypes, MyTypeNames); + +#endif // 0 + +// Type-parameterized tests are abstract test patterns parameterized +// by a type. Compared with typed tests, type-parameterized tests +// allow you to define the test pattern without knowing what the type +// parameters are. The defined pattern can be instantiated with +// different types any number of times, in any number of translation +// units. +// +// If you are designing an interface or concept, you can define a +// suite of type-parameterized tests to verify properties that any +// valid implementation of the interface/concept should have. Then, +// each implementation can easily instantiate the test suite to verify +// that it conforms to the requirements, without having to write +// similar tests repeatedly. Here's an example: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + ... +}; + +// Next, declare that you will define a type-parameterized test case +// (the _P suffix is for "parameterized" or "pattern", whichever you +// prefer): +TYPED_TEST_CASE_P(FooTest); + +// Then, use TYPED_TEST_P() to define as many type-parameterized tests +// for this type-parameterized test case as you want. +TYPED_TEST_P(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + TypeParam n = 0; + ... +} + +TYPED_TEST_P(FooTest, HasPropertyA) { ... } + +// Now the tricky part: you need to register all test patterns before +// you can instantiate them. The first argument of the macro is the +// test case name; the rest are the names of the tests in this test +// case. 
+REGISTER_TYPED_TEST_CASE_P(FooTest, + DoesBlah, HasPropertyA); + +// Finally, you are free to instantiate the pattern with the types you +// want. If you put the above code in a header file, you can #include +// it in multiple C++ source files and instantiate it multiple times. +// +// To distinguish different instances of the pattern, the first +// argument to the INSTANTIATE_* macro is a prefix that will be added +// to the actual test case name. Remember to pick unique prefixes for +// different instances. +typedef testing::Types MyTypes; +INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int); +// +// Similar to the optional argument of TYPED_TEST_CASE above, +// INSTANTIATE_TEST_CASE_P takes an optional fourth argument which allows to +// generate custom names. +// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes, MyTypeNames); + +#endif // 0 + + +// Implements typed tests. + +#if GTEST_HAS_TYPED_TEST + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the typedef for the type parameters of the +// given test case. +# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_ + +// Expands to the name of the typedef for the NameGenerator, responsible for +// creating the suffixes of the name. +#define GTEST_NAME_GENERATOR_(TestCaseName) \ + gtest_type_params_##TestCaseName##_NameGenerator + +// The 'Types' template argument below must have spaces around it +// since some compilers may choke on '>>' when passing a template +// instance (e.g. Types) +# define TYPED_TEST_CASE(CaseName, Types, ...) \ + typedef ::testing::internal::TypeList< Types >::type GTEST_TYPE_PARAMS_( \ + CaseName); \ + typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \ + GTEST_NAME_GENERATOR_(CaseName) + +# define TYPED_TEST(CaseName, TestName) \ + template \ + class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ + : public CaseName { \ + private: \ + typedef CaseName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + virtual void TestBody(); \ + }; \ + static bool gtest_##CaseName##_##TestName##_registered_ \ + GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTest< \ + CaseName, \ + ::testing::internal::TemplateSel, \ + GTEST_TYPE_PARAMS_( \ + CaseName)>::Register("", \ + ::testing::internal::CodeLocation( \ + __FILE__, __LINE__), \ + #CaseName, #TestName, 0, \ + ::testing::internal::GenerateNames< \ + GTEST_NAME_GENERATOR_(CaseName), \ + GTEST_TYPE_PARAMS_(CaseName)>()); \ + template \ + void GTEST_TEST_CLASS_NAME_(CaseName, \ + TestName)::TestBody() + +#endif // GTEST_HAS_TYPED_TEST + +// Implements type-parameterized tests. + +#if GTEST_HAS_TYPED_TEST_P + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the namespace name that the type-parameterized tests for +// the given type-parameterized test case are defined in. The exact +// name of the namespace is subject to change without notice. +# define GTEST_CASE_NAMESPACE_(TestCaseName) \ + gtest_case_##TestCaseName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the variable used to remember the names of +// the defined tests in the given test case. +# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \ + gtest_typed_test_case_p_state_##TestCaseName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY. 
+//
+// Expands to the name of the variable used to remember the names of
+// the registered tests in the given test case.
+# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \
+  gtest_registered_test_names_##TestCaseName##_
+
+// The variables defined in the type-parameterized test macros are
+// static as typically these macros are used in a .h file that can be
+// #included in multiple translation units linked together.
+# define TYPED_TEST_CASE_P(CaseName) \
+  static ::testing::internal::TypedTestCasePState \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName)
+
+# define TYPED_TEST_P(CaseName, TestName) \
+  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+  template <typename gtest_TypeParam_> \
+  class TestName : public CaseName<gtest_TypeParam_> { \
+   private: \
+    typedef CaseName<gtest_TypeParam_> TestFixture; \
+    typedef gtest_TypeParam_ TypeParam; \
+    virtual void TestBody(); \
+  }; \
+  static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\
+          __FILE__, __LINE__, #CaseName, #TestName); \
+  } \
+  template <typename gtest_TypeParam_> \
+  void GTEST_CASE_NAMESPACE_(CaseName)::TestName<gtest_TypeParam_>::TestBody()
+
+# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \
+  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+  typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \
+  } \
+  static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) \
+      GTEST_ATTRIBUTE_UNUSED_ = \
+          GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames( \
+              __FILE__, __LINE__, #__VA_ARGS__)
+
+// The 'Types' template argument below must have spaces around it
+// since some compilers may choke on '>>' when passing a template
+// instance (e.g. Types<int>)
+# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types, ...) \
+  static bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \
+      ::testing::internal::TypeParameterizedTestCase< \
+          CaseName, GTEST_CASE_NAMESPACE_(CaseName)::gtest_AllTests_, \
+          ::testing::internal::TypeList< Types >::type>:: \
+              Register(#Prefix, \
+                       ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+                       &GTEST_TYPED_TEST_CASE_P_STATE_(CaseName), #CaseName, \
+                       GTEST_REGISTERED_TEST_NAMES_(CaseName), \
+                       ::testing::internal::GenerateNames< \
+                           ::testing::internal::NameGeneratorSelector< \
+                               __VA_ARGS__>::type, \
+                           ::testing::internal::TypeList< Types >::type>())
+
+#endif // GTEST_HAS_TYPED_TEST_P
+
+#endif // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// Depending on the platform, different string classes are available.
+// On Linux, in addition to ::std::string, Google also makes use of
+// class ::string, which has the same interface as ::std::string, but
+// has a different implementation.
+//
+// You can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that
+// ::string is available AND is a distinct type from ::std::string, or
+// define it to 0 to indicate otherwise.
+//
+// If ::std::string and ::string are the same class on your platform
+// due to aliasing, you should define GTEST_HAS_GLOBAL_STRING to 0.
+//
+// If you do not define GTEST_HAS_GLOBAL_STRING, it is defined
+// heuristically.
+
+namespace testing {
+
+// Silence C4100 (unreferenced formal parameter) and C4805
+// (unsafe mix of type 'const int' and type 'const bool').
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable:4805)
+# pragma warning(disable:4100)
+#endif
+
+
+// Declares the flags.
+
+// This flag temporarily enables the disabled tests.
+GTEST_DECLARE_bool_(also_run_disabled_tests); + +// This flag brings the debugger on an assertion failure. +GTEST_DECLARE_bool_(break_on_failure); + +// This flag controls whether Google Test catches all test-thrown exceptions +// and logs them as failures. +GTEST_DECLARE_bool_(catch_exceptions); + +// This flag enables using colors in terminal output. Available values are +// "yes" to enable colors, "no" (disable colors), or "auto" (the default) +// to let Google Test decide. +GTEST_DECLARE_string_(color); + +// This flag sets up the filter to select by name using a glob pattern +// the tests to run. If the filter is not given all tests are executed. +GTEST_DECLARE_string_(filter); + +// This flag controls whether Google Test installs a signal handler that dumps +// debugging information when fatal signals are raised. +GTEST_DECLARE_bool_(install_failure_signal_handler); + +// This flag causes the Google Test to list tests. None of the tests listed +// are actually run if the flag is provided. +GTEST_DECLARE_bool_(list_tests); + +// This flag controls whether Google Test emits a detailed XML report to a file +// in addition to its normal textual output. +GTEST_DECLARE_string_(output); + +// This flags control whether Google Test prints the elapsed time for each +// test. +GTEST_DECLARE_bool_(print_time); + +// This flags control whether Google Test prints UTF8 characters as text. +GTEST_DECLARE_bool_(print_utf8); + +// This flag specifies the random number seed. +GTEST_DECLARE_int32_(random_seed); + +// This flag sets how many times the tests are repeated. The default value +// is 1. If the value is -1 the tests are repeating forever. +GTEST_DECLARE_int32_(repeat); + +// This flag controls whether Google Test includes Google Test internal +// stack frames in failure stack traces. +GTEST_DECLARE_bool_(show_internal_stack_frames); + +// When this flag is specified, tests' order is randomized on every iteration. +GTEST_DECLARE_bool_(shuffle); + +// This flag specifies the maximum number of stack frames to be +// printed in a failure message. +GTEST_DECLARE_int32_(stack_trace_depth); + +// When this flag is specified, a failed assertion will throw an +// exception if exceptions are enabled, or exit the program with a +// non-zero code otherwise. For use with an external test framework. +GTEST_DECLARE_bool_(throw_on_failure); + +// When this flag is set with a "host:port" string, on supported +// platforms test results are streamed to the specified port on +// the specified host machine. +GTEST_DECLARE_string_(stream_result_to); + +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +GTEST_DECLARE_string_(flagfile); +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + +// The upper limit for valid stack trace depths. +const int kMaxStackTraceDepth = 100; + +namespace internal { + +class AssertHelper; +class DefaultGlobalTestPartResultReporter; +class ExecDeathTest; +class NoExecDeathTest; +class FinalSuccessChecker; +class GTestFlagSaver; +class StreamingListenerTest; +class TestResultAccessor; +class TestEventListenersAccessor; +class TestEventRepeater; +class UnitTestRecordPropertyTestHelper; +class WindowsDeathTest; +class FuchsiaDeathTest; +class UnitTestImpl* GetUnitTestImpl(); +void ReportFailureInUnknownLocation(TestPartResult::Type result_type, + const std::string& message); + +} // namespace internal + +// The friend relationship of some of these classes is cyclic. +// If we don't forward declare them the compiler might confuse the classes +// in friendship clauses with same named classes on the scope. 
+class Test; +class TestCase; +class TestInfo; +class UnitTest; + +// A class for indicating whether an assertion was successful. When +// the assertion wasn't successful, the AssertionResult object +// remembers a non-empty message that describes how it failed. +// +// To create an instance of this class, use one of the factory functions +// (AssertionSuccess() and AssertionFailure()). +// +// This class is useful for two purposes: +// 1. Defining predicate functions to be used with Boolean test assertions +// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts +// 2. Defining predicate-format functions to be +// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). +// +// For example, if you define IsEven predicate: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) +// will print the message +// +// Value of: IsEven(Fib(5)) +// Actual: false (5 is odd) +// Expected: true +// +// instead of a more opaque +// +// Value of: IsEven(Fib(5)) +// Actual: false +// Expected: true +// +// in case IsEven is a simple Boolean predicate. +// +// If you expect your predicate to be reused and want to support informative +// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up +// about half as often as positive ones in our tests), supply messages for +// both success and failure cases: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess() << n << " is even"; +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print +// +// Value of: IsEven(Fib(6)) +// Actual: true (8 is even) +// Expected: false +// +// NB: Predicates that support negative Boolean assertions have reduced +// performance in positive ones so be careful not to use them in tests +// that have lots (tens of thousands) of positive Boolean assertions. +// +// To use this class with EXPECT_PRED_FORMAT assertions such as: +// +// // Verifies that Foo() returns an even number. +// EXPECT_PRED_FORMAT1(IsEven, Foo()); +// +// you need to define: +// +// testing::AssertionResult IsEven(const char* expr, int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() +// << "Expected: " << expr << " is even\n Actual: it's " << n; +// } +// +// If Foo() returns 5, you will see the following message: +// +// Expected: Foo() is even +// Actual: it's 5 +// +class GTEST_API_ AssertionResult { + public: + // Copy constructor. + // Used in EXPECT_TRUE/FALSE(assertion_result). + AssertionResult(const AssertionResult& other); + +#if defined(_MSC_VER) && _MSC_VER < 1910 + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) +#endif + + // Used in the EXPECT_TRUE/FALSE(bool_expression). + // + // T must be contextually convertible to bool. + // + // The second parameter prevents this overload from being considered if + // the argument is implicitly convertible to AssertionResult. In that case + // we want AssertionResult's copy constructor to be used. 
+ template + explicit AssertionResult( + const T& success, + typename internal::EnableIf< + !internal::ImplicitlyConvertible::value>::type* + /*enabler*/ = NULL) + : success_(success) {} + +#if defined(_MSC_VER) && _MSC_VER < 1910 + GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + + // Assignment operator. + AssertionResult& operator=(AssertionResult other) { + swap(other); + return *this; + } + + // Returns true iff the assertion succeeded. + operator bool() const { return success_; } // NOLINT + + // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. + AssertionResult operator!() const; + + // Returns the text streamed into this AssertionResult. Test assertions + // use it when they fail (i.e., the predicate's outcome doesn't match the + // assertion's expectation). When nothing has been streamed into the + // object, returns an empty string. + const char* message() const { + return message_.get() != NULL ? message_->c_str() : ""; + } + // FIXME: Remove this after making sure no clients use it. + // Deprecated; please use message() instead. + const char* failure_message() const { return message(); } + + // Streams a custom failure message into this object. + template AssertionResult& operator<<(const T& value) { + AppendMessage(Message() << value); + return *this; + } + + // Allows streaming basic output manipulators such as endl or flush into + // this object. + AssertionResult& operator<<( + ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) { + AppendMessage(Message() << basic_manipulator); + return *this; + } + + private: + // Appends the contents of message to message_. + void AppendMessage(const Message& a_message) { + if (message_.get() == NULL) + message_.reset(new ::std::string); + message_->append(a_message.GetString().c_str()); + } + + // Swap the contents of this AssertionResult with other. + void swap(AssertionResult& other); + + // Stores result of the assertion predicate. + bool success_; + // Stores the message describing the condition in case the expectation + // construct is not satisfied with the predicate's outcome. + // Referenced via a pointer to avoid taking too much stack frame space + // with test assertions. + internal::scoped_ptr< ::std::string> message_; +}; + +// Makes a successful assertion result. +GTEST_API_ AssertionResult AssertionSuccess(); + +// Makes a failed assertion result. +GTEST_API_ AssertionResult AssertionFailure(); + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << msg. +GTEST_API_ AssertionResult AssertionFailure(const Message& msg); + +} // namespace testing + +// Includes the auto-generated header that implements a family of generic +// predicate assertion macros. This include comes late because it relies on +// APIs declared above. +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
+
+// Includes the auto-generated header that implements a family of generic
+// predicate assertion macros. This include comes late because it relies on
+// APIs declared above.
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file is AUTOMATICALLY GENERATED on 01/02/2018 by command
+// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND!
+//
+// Implements a family of generic predicate assertion macros.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+
+namespace testing {
+
+// This header implements a family of generic predicate assertion
+// macros:
+//
+//   ASSERT_PRED_FORMAT1(pred_format, v1)
+//   ASSERT_PRED_FORMAT2(pred_format, v1, v2)
+//   ...
+//
+// where pred_format is a function or functor that takes n (in the
+// case of ASSERT_PRED_FORMATn) values and their source expression
+// text, and returns a testing::AssertionResult. See the definition
+// of ASSERT_EQ in gtest.h for an example.
+//
+// If you don't care about formatting, you can use the more
+// restrictive version:
+//
+//   ASSERT_PRED1(pred, v1)
+//   ASSERT_PRED2(pred, v1, v2)
+//   ...
+//
+// where pred is an n-ary function or functor that returns bool,
+// and the values v1, v2, ..., must support the << operator for
+// streaming to std::ostream.
+//
+// We also define the EXPECT_* variations.
+//
+// For now we only support predicates whose arity is at most 5.
+
+// GTEST_ASSERT_ is the basic statement to which all of the assertions
+// in this file reduce. Don't use this in your code.
+
+#define GTEST_ASSERT_(expression, on_failure) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (const ::testing::AssertionResult gtest_ar = (expression)) \
+    ; \
+  else \
+    on_failure(gtest_ar.failure_message())
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use
+// this in your code.
+template <typename Pred, typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text,
+                                  const char* e1,
+                                  Pred pred,
+                                  const T1& v1) {
+  if (pred(v1)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, v1), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use
+// this in your code.
+#define GTEST_PRED1_(pred, v1, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
+                                             #v1, \
+                                             pred, \
+                                             v1), on_failure)
+
+// Unary predicate assertion macros.
+#define EXPECT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
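+
+// Usage sketch (illustrative only; IsPositive and PredTest are hypothetical
+// names):
+//
+//   bool IsPositive(int n) { return n > 0; }
+//
+//   TEST(PredTest, UnaryPredicate) {
+//     EXPECT_PRED1(IsPositive, 5);
+//     // A failing EXPECT_PRED1(IsPositive, -2) would report:
+//     //   IsPositive(-2) evaluates to false, where
+//     //   -2 evaluates to -2
+//   }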
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2) {
+  if (pred(v1, v2)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use
+// this in your code.
+#define GTEST_PRED2_(pred, v1, v2, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             pred, \
+                                             v1, \
+                                             v2), on_failure)
+
+// Binary predicate assertion macros.
+#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2, typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3) {
+  if (pred(v1, v2, v3)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use
+// this in your code.
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3), on_failure)
+
+// Ternary predicate assertion macros.
+#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2, typename T3, typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4) {
+  if (pred(v1, v2, v3, v4)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ", "
+                            << e4 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3
+                            << "\n" << e4 << " evaluates to " << v4;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use
+// this in your code.
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4), on_failure)
+
+// 4-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2, typename T3, typename T4,
+          typename T5>
+AssertionResult AssertPred5Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  const char* e5,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4,
+                                  const T5& v5) {
+  if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ", "
+                            << e4 << ", "
+                            << e5 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3
+                            << "\n" << e4 << " evaluates to " << v4
+                            << "\n" << e5 << " evaluates to " << v5;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use
+// this in your code.
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             #v5, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4, \
+                                             v5), on_failure)
+
+// 5-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+
+
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
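+
+// Usage sketch of a predicate-formatter (illustrative only; AssertDivisible
+// and PredFormatTest are hypothetical names): the *_PRED_FORMAT* macros pass
+// both the argument expressions and their values, so the formatter can
+// produce a precise failure message.
+//
+//   testing::AssertionResult AssertDivisible(const char* m_expr,
+//                                            const char* n_expr,
+//                                            int m, int n) {
+//     if (n != 0 && m % n == 0) return testing::AssertionSuccess();
+//     return testing::AssertionFailure()
+//         << m_expr << " is not divisible by " << n_expr
+//         << " (" << m << " vs " << n << ")";
+//   }
+//
+//   TEST(PredFormatTest, DivisibilityHolds) {
+//     EXPECT_PRED_FORMAT2(AssertDivisible, 10, 5);
+//   }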
+
+namespace testing {
+
+// The abstract class that all tests inherit from.
+//
+// In Google Test, a unit test program contains one or many TestCases, and
+// each TestCase contains one or many Tests.
+//
+// When you define a test using the TEST macro, you don't need to
+// explicitly derive from Test - the TEST macro automatically does
+// this for you.
+//
+// The only time you derive from Test is when defining a test fixture
+// to be used in a TEST_F. For example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     void SetUp() override { ... }
+//     void TearDown() override { ... }
+//     ...
+//   };
+//
+//   TEST_F(FooTest, Bar) { ... }
+//   TEST_F(FooTest, Baz) { ... }
+//
+// Test is not copyable.
+class GTEST_API_ Test {
+ public:
+  friend class TestInfo;
+
+  // Defines types for pointers to functions that set up and tear down
+  // a test case.
+  typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc;
+  typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc;
+
+  // The d'tor is virtual as we intend to inherit from Test.
+  virtual ~Test();
+
+  // Sets up the stuff shared by all tests in this test case.
+  //
+  // Google Test will call Foo::SetUpTestCase() before running the first
+  // test in test case Foo. Hence a sub-class can define its own
+  // SetUpTestCase() method to shadow the one defined in the super
+  // class.
+  static void SetUpTestCase() {}
+
+  // Tears down the stuff shared by all tests in this test case.
+  //
+  // Google Test will call Foo::TearDownTestCase() after running the last
+  // test in test case Foo. Hence a sub-class can define its own
+  // TearDownTestCase() method to shadow the one defined in the super
+  // class.
+  static void TearDownTestCase() {}
+
+  // Returns true iff the current test has a fatal failure.
+  static bool HasFatalFailure();
+
+  // Returns true iff the current test has a non-fatal failure.
+  static bool HasNonfatalFailure();
+
+  // Returns true iff the current test has a (either fatal or
+  // non-fatal) failure.
+  static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); }
+
+  // Logs a property for the current test, test case, or for the entire
+  // invocation of the test program when used outside of the context of a
+  // test case. Only the last value for a given key is remembered. These
+  // are public static so they can be called from utility functions that are
+  // not members of the test fixture. Calls to RecordProperty made during
+  // lifespan of the test (from the moment its constructor starts to the
+  // moment its destructor finishes) will be output in XML as attributes of
+  // the <testcase> element. Properties recorded from fixture's
+  // SetUpTestCase or TearDownTestCase are logged as attributes of the
+  // corresponding <testsuite> element. Calls to RecordProperty made in the
+  // global context (before or after invocation of RUN_ALL_TESTS and from
+  // SetUp/TearDown method of Environment objects registered with Google
+  // Test) will be output as attributes of the <testsuites> element.
+  static void RecordProperty(const std::string& key, const std::string& value);
+  static void RecordProperty(const std::string& key, int value);
+
+ protected:
+  // Creates a Test object.
+  Test();
+
+  // Sets up the test fixture.
+  virtual void SetUp();
+
+  // Tears down the test fixture.
+  virtual void TearDown();
+
+ private:
+  // Returns true iff the current test has the same fixture class as
+  // the first test in the current test case.
+  static bool HasSameFixtureClass();
+
+  // Runs the test after the test fixture has been set up.
+  //
+  // A sub-class must implement this to define the test logic.
+  //
+  // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM.
+  // Instead, use the TEST or TEST_F macro.
+  virtual void TestBody() = 0;
+
+  // Sets up, executes, and tears down the test.
+  void Run();
+
+  // Deletes self. We deliberately pick an unusual name for this
+  // internal method to avoid clashing with names used in user TESTs.
+  void DeleteSelf_() { delete this; }
+
+  const internal::scoped_ptr< GTEST_FLAG_SAVER_ > gtest_flag_saver_;
+
+  // Often a user misspells SetUp() as Setup() and spends a long time
+  // wondering why it is never called by Google Test. The declaration of
+  // the following method is solely for catching such an error at
+  // compile time:
+  //
+  //   - The return type is deliberately chosen to be not void, so it
+  //   will be a conflict if void Setup() is declared in the user's
+  //   test fixture.
+  //
+  //   - This method is private, so it will be another compiler error
+  //   if the method is called from the user's test fixture.
+  //
+  // DO NOT OVERRIDE THIS FUNCTION.
+  //
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+
+  // We disallow copying Tests.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
+};
+
+typedef internal::TimeInMillis TimeInMillis;
+
+// A copyable object representing a user specified test property which can be
+// output as a key/value string pair.
+//
+// Don't inherit from TestProperty as its destructor is not virtual.
+class TestProperty {
+ public:
+  // C'tor. TestProperty does NOT have a default constructor.
+  // Always use this constructor (with parameters) to create a
+  // TestProperty object.
+  TestProperty(const std::string& a_key, const std::string& a_value) :
+    key_(a_key), value_(a_value) {
+  }
+
+  // Gets the user supplied key.
+  const char* key() const {
+    return key_.c_str();
+  }
+
+  // Gets the user supplied value.
+  const char* value() const {
+    return value_.c_str();
+  }
+
+  // Sets a new value, overriding the one supplied in the constructor.
+  void SetValue(const std::string& new_value) {
+    value_ = new_value;
+  }
+
+ private:
+  // The key supplied by the user.
+  std::string key_;
+  // The value supplied by the user.
+  std::string value_;
+};
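+
+// Usage sketch (illustrative only; ReportTest is a hypothetical name):
+// RecordProperty attaches key/value pairs to the XML report, and only the
+// last value recorded for a given key is kept.
+//
+//   TEST(ReportTest, EmitsProperties) {
+//     RecordProperty("build_id", 1234);
+//     RecordProperty("owner", "infra-team");
+//   }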
+
+// The result of a single Test. This includes a list of
+// TestPartResults, a list of TestProperties, a count of how many
+// death tests there are in the Test, and how much time it took to run
+// the Test.
+//
+// TestResult is not copyable.
+class GTEST_API_ TestResult {
+ public:
+  // Creates an empty TestResult.
+  TestResult();
+
+  // D'tor. Do not inherit from TestResult.
+  ~TestResult();
+
+  // Gets the number of all test parts. This is the sum of the number
+  // of successful test parts and the number of failed test parts.
+  int total_part_count() const;
+
+  // Returns the number of the test properties.
+  int test_property_count() const;
+
+  // Returns true iff the test passed (i.e. no test part failed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the test failed.
+  bool Failed() const;
+
+  // Returns true iff the test fatally failed.
+  bool HasFatalFailure() const;
+
+  // Returns true iff the test has a non-fatal failure.
+  bool HasNonfatalFailure() const;
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns the i-th test part result among all the results. i can range
+  // from 0 to total_part_count() - 1. If i is not in that range, aborts
+  // the program.
+  const TestPartResult& GetTestPartResult(int i) const;
+
+  // Returns the i-th test property. i can range from 0 to
+  // test_property_count() - 1. If i is not in that range, aborts the
+  // program.
+  const TestProperty& GetTestProperty(int i) const;
+
+ private:
+  friend class TestInfo;
+  friend class TestCase;
+  friend class UnitTest;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::ExecDeathTest;
+  friend class internal::TestResultAccessor;
+  friend class internal::UnitTestImpl;
+  friend class internal::WindowsDeathTest;
+  friend class internal::FuchsiaDeathTest;
+
+  // Gets the vector of TestPartResults.
+  const std::vector<TestPartResult>& test_part_results() const {
+    return test_part_results_;
+  }
+
+  // Gets the vector of TestProperties.
+  const std::vector<TestProperty>& test_properties() const {
+    return test_properties_;
+  }
+
+  // Sets the elapsed time.
+  void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; }
+
+  // Adds a test property to the list. The property is validated and may add
+  // a non-fatal failure if invalid (e.g., if it conflicts with reserved
+  // key names). If a property is already recorded for the same key, the
+  // value will be updated, rather than storing multiple values for the same
+  // key. xml_element specifies the element for which the property is being
+  // recorded and is used for validation.
+  void RecordProperty(const std::string& xml_element,
+                      const TestProperty& test_property);
+
+  // Adds a failure if the key is a reserved attribute of Google Test
+  // testcase tags. Returns true if the property is valid.
+  // FIXME: Validate attribute names are legal and human readable.
+  static bool ValidateTestProperty(const std::string& xml_element,
+                                   const TestProperty& test_property);
+
+  // Adds a test part result to the list.
+  void AddTestPartResult(const TestPartResult& test_part_result);
+
+  // Returns the death test count.
+  int death_test_count() const { return death_test_count_; }
+
+  // Increments the death test count, returning the new count.
+  int increment_death_test_count() { return ++death_test_count_; }
+
+  // Clears the test part results.
+  void ClearTestPartResults();
+
+  // Clears the object.
+  void Clear();
+
+  // Protects mutable state of the property vector and of owned
+  // properties, whose values may be updated.
+  internal::Mutex test_properites_mutex_;
+
+  // The vector of TestPartResults
+  std::vector<TestPartResult> test_part_results_;
+  // The vector of TestProperties
+  std::vector<TestProperty> test_properties_;
+  // Running count of death tests.
+  int death_test_count_;
+  // The elapsed time, in milliseconds.
+ TimeInMillis elapsed_time_; + + // We disallow copying TestResult. + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult); +}; // class TestResult + +// A TestInfo object stores the following information about a test: +// +// Test case name +// Test name +// Whether the test should be run +// A function pointer that creates the test object when invoked +// Test result +// +// The constructor of TestInfo registers itself with the UnitTest +// singleton such that the RUN_ALL_TESTS() macro knows which tests to +// run. +class GTEST_API_ TestInfo { + public: + // Destructs a TestInfo object. This function is not virtual, so + // don't inherit from TestInfo. + ~TestInfo(); + + // Returns the test case name. + const char* test_case_name() const { return test_case_name_.c_str(); } + + // Returns the test name. + const char* name() const { return name_.c_str(); } + + // Returns the name of the parameter type, or NULL if this is not a typed + // or a type-parameterized test. + const char* type_param() const { + if (type_param_.get() != NULL) + return type_param_->c_str(); + return NULL; + } + + // Returns the text representation of the value parameter, or NULL if this + // is not a value-parameterized test. + const char* value_param() const { + if (value_param_.get() != NULL) + return value_param_->c_str(); + return NULL; + } + + // Returns the file name where this test is defined. + const char* file() const { return location_.file.c_str(); } + + // Returns the line where this test is defined. + int line() const { return location_.line; } + + // Return true if this test should not be run because it's in another shard. + bool is_in_another_shard() const { return is_in_another_shard_; } + + // Returns true if this test should run, that is if the test is not + // disabled (or it is disabled but the also_run_disabled_tests flag has + // been specified) and its full name matches the user-specified filter. + // + // Google Test allows the user to filter the tests by their full names. + // The full name of a test Bar in test case Foo is defined as + // "Foo.Bar". Only the tests that match the filter will run. + // + // A filter is a colon-separated list of glob (not regex) patterns, + // optionally followed by a '-' and a colon-separated list of + // negative patterns (tests to exclude). A test is run if it + // matches one of the positive patterns and does not match any of + // the negative patterns. + // + // For example, *A*:Foo.* is a filter that matches any string that + // contains the character 'A' or starts with "Foo.". + bool should_run() const { return should_run_; } + + // Returns true iff this test will appear in the XML report. + bool is_reportable() const { + // The XML report includes tests matching the filter, excluding those + // run in other shards. + return matches_filter_ && !is_in_another_shard_; + } + + // Returns the result of the test. 
+  const TestResult* result() const { return &result_; }
+
+ private:
+#if GTEST_HAS_DEATH_TEST
+  friend class internal::DefaultDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+  friend class Test;
+  friend class TestCase;
+  friend class internal::UnitTestImpl;
+  friend class internal::StreamingListenerTest;
+  friend TestInfo* internal::MakeAndRegisterTestInfo(
+      const char* test_case_name,
+      const char* name,
+      const char* type_param,
+      const char* value_param,
+      internal::CodeLocation code_location,
+      internal::TypeId fixture_class_id,
+      Test::SetUpTestCaseFunc set_up_tc,
+      Test::TearDownTestCaseFunc tear_down_tc,
+      internal::TestFactoryBase* factory);
+
+  // Constructs a TestInfo object. The newly constructed instance assumes
+  // ownership of the factory object.
+  TestInfo(const std::string& test_case_name,
+           const std::string& name,
+           const char* a_type_param,   // NULL if not a type-parameterized test
+           const char* a_value_param,  // NULL if not a value-parameterized test
+           internal::CodeLocation a_code_location,
+           internal::TypeId fixture_class_id,
+           internal::TestFactoryBase* factory);
+
+  // Increments the number of death tests encountered in this test so
+  // far.
+  int increment_death_test_count() {
+    return result_.increment_death_test_count();
+  }
+
+  // Creates the test object, runs it, records its result, and then
+  // deletes it.
+  void Run();
+
+  static void ClearTestResult(TestInfo* test_info) {
+    test_info->result_.Clear();
+  }
+
+  // These fields are immutable properties of the test.
+  const std::string test_case_name_;    // Test case name
+  const std::string name_;              // Test name
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const internal::scoped_ptr<const ::std::string> type_param_;
+  // Text representation of the value parameter, or NULL if this is not a
+  // value-parameterized test.
+  const internal::scoped_ptr<const ::std::string> value_param_;
+  internal::CodeLocation location_;
+  const internal::TypeId fixture_class_id_;  // ID of the test fixture class
+  bool should_run_;           // True iff this test should run
+  bool is_disabled_;          // True iff this test is disabled
+  bool matches_filter_;       // True if this test matches the
+                              // user-specified filter.
+  bool is_in_another_shard_;  // Will be run in another shard.
+  internal::TestFactoryBase* const factory_;  // The factory that creates
+                                              // the test object
+
+  // This field is mutable and needs to be reset before running the
+  // test for the second time.
+  TestResult result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+};
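+
+// Illustrative filter examples (my_test is a placeholder binary name): the
+// filter format described for should_run() above is given on the command
+// line, e.g.
+//
+//   ./my_test --gtest_filter=FooTest.*             // all tests in FooTest
+//   ./my_test --gtest_filter=*Null*:*Constructor*  // either pattern matches
+//   ./my_test --gtest_filter=-*DeathTest.*         // everything but these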
+
+// A test case, which consists of a vector of TestInfos.
+//
+// TestCase is not copyable.
+class GTEST_API_ TestCase {
+ public:
+  // Creates a TestCase with the given name.
+  //
+  // TestCase does NOT have a default constructor. Always use this
+  // constructor to create a TestCase object.
+  //
+  // Arguments:
+  //
+  //   name:          name of the test case
+  //   a_type_param:  the name of the test's type parameter, or NULL if
+  //                  this is not a type-parameterized test.
+  //   set_up_tc:     pointer to the function that sets up the test case
+  //   tear_down_tc:  pointer to the function that tears down the test case
+  TestCase(const char* name, const char* a_type_param,
+           Test::SetUpTestCaseFunc set_up_tc,
+           Test::TearDownTestCaseFunc tear_down_tc);
+
+  // Destructor of TestCase.
+  virtual ~TestCase();
+
+  // Gets the name of the TestCase.
+  const char* name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a
+  // type-parameterized test case.
+  const char* type_param() const {
+    if (type_param_.get() != NULL)
+      return type_param_->c_str();
+    return NULL;
+  }
+
+  // Returns true if any test in this test case should run.
+  bool should_run() const { return should_run_; }
+
+  // Gets the number of successful tests in this test case.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests in this test case.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests in this test case.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Get the number of tests in this test case that should run.
+  int test_to_run_count() const;
+
+  // Gets the number of all tests in this test case.
+  int total_test_count() const;
+
+  // Returns true iff the test case passed.
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the test case failed.
+  bool Failed() const { return failed_test_count() > 0; }
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns the i-th test among all the tests. i can range from 0 to
+  // total_test_count() - 1. If i is not in that range, returns NULL.
+  const TestInfo* GetTestInfo(int i) const;
+
+  // Returns the TestResult that holds test properties recorded during
+  // execution of SetUpTestCase and TearDownTestCase.
+  const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; }
+
+ private:
+  friend class Test;
+  friend class internal::UnitTestImpl;
+
+  // Gets the (mutable) vector of TestInfos in this TestCase.
+  std::vector<TestInfo*>& test_info_list() { return test_info_list_; }
+
+  // Gets the (immutable) vector of TestInfos in this TestCase.
+  const std::vector<TestInfo*>& test_info_list() const {
+    return test_info_list_;
+  }
+
+  // Returns the i-th test among all the tests. i can range from 0 to
+  // total_test_count() - 1. If i is not in that range, returns NULL.
+  TestInfo* GetMutableTestInfo(int i);
+
+  // Sets the should_run member.
+  void set_should_run(bool should) { should_run_ = should; }
+
+  // Adds a TestInfo to this test case. Will delete the TestInfo upon
+  // destruction of the TestCase object.
+  void AddTestInfo(TestInfo * test_info);
+
+  // Clears the results of all tests in this test case.
+  void ClearResult();
+
+  // Clears the results of all tests in the given test case.
+  static void ClearTestCaseResult(TestCase* test_case) {
+    test_case->ClearResult();
+  }
+
+  // Runs every test in this TestCase.
+  void Run();
+
+  // Runs SetUpTestCase() for this TestCase. This wrapper is needed
+  // for catching exceptions thrown from SetUpTestCase().
+  void RunSetUpTestCase() { (*set_up_tc_)(); }
+
+  // Runs TearDownTestCase() for this TestCase. This wrapper is
+  // needed for catching exceptions thrown from TearDownTestCase().
+  void RunTearDownTestCase() { (*tear_down_tc_)(); }
+
+  // Returns true iff test passed.
+  static bool TestPassed(const TestInfo* test_info) {
+    return test_info->should_run() && test_info->result()->Passed();
+  }
+
+  // Returns true iff test failed.
+  static bool TestFailed(const TestInfo* test_info) {
+    return test_info->should_run() && test_info->result()->Failed();
+  }
+
+  // Returns true iff the test is disabled and will be reported in the XML
+  // report.
+  static bool TestReportableDisabled(const TestInfo* test_info) {
+    return test_info->is_reportable() && test_info->is_disabled_;
+  }
+
+  // Returns true iff test is disabled.
+  static bool TestDisabled(const TestInfo* test_info) {
+    return test_info->is_disabled_;
+  }
+
+  // Returns true iff this test will appear in the XML report.
+  static bool TestReportable(const TestInfo* test_info) {
+    return test_info->is_reportable();
+  }
+
+  // Returns true if the given test should run.
+  static bool ShouldRunTest(const TestInfo* test_info) {
+    return test_info->should_run();
+  }
+
+  // Shuffles the tests in this test case.
+  void ShuffleTests(internal::Random* random);
+
+  // Restores the test order to before the first shuffle.
+  void UnshuffleTests();
+
+  // Name of the test case.
+  std::string name_;
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const internal::scoped_ptr<const ::std::string> type_param_;
+  // The vector of TestInfos in their original order. It owns the
+  // elements in the vector.
+  std::vector<TestInfo*> test_info_list_;
+  // Provides a level of indirection for the test list to allow easy
+  // shuffling and restoring the test order. The i-th element in this
+  // vector is the index of the i-th test in the shuffled test list.
+  std::vector<int> test_indices_;
+  // Pointer to the function that sets up the test case.
+  Test::SetUpTestCaseFunc set_up_tc_;
+  // Pointer to the function that tears down the test case.
+  Test::TearDownTestCaseFunc tear_down_tc_;
+  // True iff any test in this test case should run.
+  bool should_run_;
+  // Elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+  // Holds test properties recorded during execution of SetUpTestCase and
+  // TearDownTestCase.
+  TestResult ad_hoc_test_result_;
+
+  // We disallow copying TestCases.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase);
+};
+
+// An Environment object is capable of setting up and tearing down an
+// environment. You should subclass this to define your own
+// environment(s).
+//
+// An Environment object does the set-up and tear-down in virtual
+// methods SetUp() and TearDown() instead of the constructor and the
+// destructor, as:
+//
+//   1. You cannot safely throw from a destructor. This is a problem
+//      as in some cases Google Test is used where exceptions are
+//      enabled, and we may want to implement ASSERT_* using exceptions
+//      where they are available.
+//   2. You cannot use ASSERT_* directly in a constructor or
+//      destructor.
+class Environment {
+ public:
+  // The d'tor is virtual as we need to subclass Environment.
+  virtual ~Environment() {}
+
+  // Override this to define how to set up the environment.
+  virtual void SetUp() {}
+
+  // Override this to define how to tear down the environment.
+  virtual void TearDown() {}
+ private:
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+};
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Exception which can be thrown from TestEventListener::OnTestPartResult.
+class GTEST_API_ AssertionException
+    : public internal::GoogleTestFailureException {
+ public:
+  explicit AssertionException(const TestPartResult& result)
+      : GoogleTestFailureException(result) {}
+};
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// The interface for tracing execution of tests. The methods are organized in
+// the order the corresponding events are fired.
+class TestEventListener { + public: + virtual ~TestEventListener() {} + + // Fired before any test activity starts. + virtual void OnTestProgramStart(const UnitTest& unit_test) = 0; + + // Fired before each iteration of tests starts. There may be more than + // one iteration if GTEST_FLAG(repeat) is set. iteration is the iteration + // index, starting from 0. + virtual void OnTestIterationStart(const UnitTest& unit_test, + int iteration) = 0; + + // Fired before environment set-up for each iteration of tests starts. + virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0; + + // Fired after environment set-up for each iteration of tests ends. + virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0; + + // Fired before the test case starts. + virtual void OnTestCaseStart(const TestCase& test_case) = 0; + + // Fired before the test starts. + virtual void OnTestStart(const TestInfo& test_info) = 0; + + // Fired after a failed assertion or a SUCCEED() invocation. + // If you want to throw an exception from this function to skip to the next + // TEST, it must be AssertionException defined above, or inherited from it. + virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0; + + // Fired after the test ends. + virtual void OnTestEnd(const TestInfo& test_info) = 0; + + // Fired after the test case ends. + virtual void OnTestCaseEnd(const TestCase& test_case) = 0; + + // Fired before environment tear-down for each iteration of tests starts. + virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0; + + // Fired after environment tear-down for each iteration of tests ends. + virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0; + + // Fired after each iteration of tests finishes. + virtual void OnTestIterationEnd(const UnitTest& unit_test, + int iteration) = 0; + + // Fired after all test activities have ended. + virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0; +}; + +// The convenience class for users who need to override just one or two +// methods and are not concerned that a possible change to a signature of +// the methods they override will not be caught during the build. For +// comments about each method please see the definition of TestEventListener +// above. +class EmptyTestEventListener : public TestEventListener { + public: + virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationStart(const UnitTest& /*unit_test*/, + int /*iteration*/) {} + virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {} + virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestCaseStart(const TestCase& /*test_case*/) {} + virtual void OnTestStart(const TestInfo& /*test_info*/) {} + virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {} + virtual void OnTestEnd(const TestInfo& /*test_info*/) {} + virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {} + virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {} + virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/, + int /*iteration*/) {} + virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} +}; + +// TestEventListeners lets users add listeners to track events in Google Test. +class GTEST_API_ TestEventListeners { + public: + TestEventListeners(); + ~TestEventListeners(); + + // Appends an event listener to the end of the list. 
Google Test assumes + // the ownership of the listener (i.e. it will delete the listener when + // the test program finishes). + void Append(TestEventListener* listener); + + // Removes the given event listener from the list and returns it. It then + // becomes the caller's responsibility to delete the listener. Returns + // NULL if the listener is not found in the list. + TestEventListener* Release(TestEventListener* listener); + + // Returns the standard listener responsible for the default console + // output. Can be removed from the listeners list to shut down default + // console output. Note that removing this object from the listener list + // with Release transfers its ownership to the caller and makes this + // function return NULL the next time. + TestEventListener* default_result_printer() const { + return default_result_printer_; + } + + // Returns the standard listener responsible for the default XML output + // controlled by the --gtest_output=xml flag. Can be removed from the + // listeners list by users who want to shut down the default XML output + // controlled by this flag and substitute it with custom one. Note that + // removing this object from the listener list with Release transfers its + // ownership to the caller and makes this function return NULL the next + // time. + TestEventListener* default_xml_generator() const { + return default_xml_generator_; + } + + private: + friend class TestCase; + friend class TestInfo; + friend class internal::DefaultGlobalTestPartResultReporter; + friend class internal::NoExecDeathTest; + friend class internal::TestEventListenersAccessor; + friend class internal::UnitTestImpl; + + // Returns repeater that broadcasts the TestEventListener events to all + // subscribers. + TestEventListener* repeater(); + + // Sets the default_result_printer attribute to the provided listener. + // The listener is also added to the listener list and previous + // default_result_printer is removed from it and deleted. The listener can + // also be NULL in which case it will not be added to the list. Does + // nothing if the previous and the current listener objects are the same. + void SetDefaultResultPrinter(TestEventListener* listener); + + // Sets the default_xml_generator attribute to the provided listener. The + // listener is also added to the listener list and previous + // default_xml_generator is removed from it and deleted. The listener can + // also be NULL in which case it will not be added to the list. Does + // nothing if the previous and the current listener objects are the same. + void SetDefaultXmlGenerator(TestEventListener* listener); + + // Controls whether events will be forwarded by the repeater to the + // listeners in the list. + bool EventForwardingEnabled() const; + void SuppressEventForwarding(); + + // The actual list of listeners. + internal::TestEventRepeater* repeater_; + // Listener responsible for the standard result output. + TestEventListener* default_result_printer_; + // Listener responsible for the creation of the XML output file. + TestEventListener* default_xml_generator_; + + // We disallow copying TestEventListeners. + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners); +}; + +// A UnitTest consists of a vector of TestCases. +// +// This is a singleton class. The only instance of UnitTest is +// created when UnitTest::GetInstance() is first called. This +// instance is never deleted. +// +// UnitTest is not copyable. 
+// +// This class is thread-safe as long as the methods are called +// according to their specification. +class GTEST_API_ UnitTest { + public: + // Gets the singleton UnitTest object. The first time this method + // is called, a UnitTest object is constructed and returned. + // Consecutive calls will return the same object. + static UnitTest* GetInstance(); + + // Runs all tests in this UnitTest object and prints the result. + // Returns 0 if successful, or 1 otherwise. + // + // This method can only be called from the main thread. + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + int Run() GTEST_MUST_USE_RESULT_; + + // Returns the working directory when the first TEST() or TEST_F() + // was executed. The UnitTest object owns the string. + const char* original_working_dir() const; + + // Returns the TestCase object for the test that's currently running, + // or NULL if no test is running. + const TestCase* current_test_case() const + GTEST_LOCK_EXCLUDED_(mutex_); + + // Returns the TestInfo object for the test that's currently running, + // or NULL if no test is running. + const TestInfo* current_test_info() const + GTEST_LOCK_EXCLUDED_(mutex_); + + // Returns the random seed used at the start of the current test run. + int random_seed() const; + + // Returns the ParameterizedTestCaseRegistry object used to keep track of + // value-parameterized tests and instantiate and register them. + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + internal::ParameterizedTestCaseRegistry& parameterized_test_registry() + GTEST_LOCK_EXCLUDED_(mutex_); + + // Gets the number of successful test cases. + int successful_test_case_count() const; + + // Gets the number of failed test cases. + int failed_test_case_count() const; + + // Gets the number of all test cases. + int total_test_case_count() const; + + // Gets the number of all test cases that contain at least one test + // that should run. + int test_case_to_run_count() const; + + // Gets the number of successful tests. + int successful_test_count() const; + + // Gets the number of failed tests. + int failed_test_count() const; + + // Gets the number of disabled tests that will be reported in the XML report. + int reportable_disabled_test_count() const; + + // Gets the number of disabled tests. + int disabled_test_count() const; + + // Gets the number of tests to be printed in the XML report. + int reportable_test_count() const; + + // Gets the number of all tests. + int total_test_count() const; + + // Gets the number of tests that should run. + int test_to_run_count() const; + + // Gets the time of the test program start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const; + + // Gets the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const; + + // Returns true iff the unit test passed (i.e. all test cases passed). + bool Passed() const; + + // Returns true iff the unit test failed (i.e. some test case failed + // or something outside of all tests failed). + bool Failed() const; + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + const TestCase* GetTestCase(int i) const; + + // Returns the TestResult containing information on test failures and + // properties logged outside of individual test cases. + const TestResult& ad_hoc_test_result() const; + + // Returns the list of event listeners that can be used to track events + // inside Google Test. 
+ TestEventListeners& listeners(); + + private: + // Registers and returns a global test environment. When a test + // program is run, all global test environments will be set-up in + // the order they were registered. After all tests in the program + // have finished, all global test environments will be torn-down in + // the *reverse* order they were registered. + // + // The UnitTest object takes ownership of the given environment. + // + // This method can only be called from the main thread. + Environment* AddEnvironment(Environment* env); + + // Adds a TestPartResult to the current TestResult object. All + // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) + // eventually call this to report their results. The user code + // should use the assertion macros instead of calling this directly. + void AddTestPartResult(TestPartResult::Type result_type, + const char* file_name, + int line_number, + const std::string& message, + const std::string& os_stack_trace) + GTEST_LOCK_EXCLUDED_(mutex_); + + // Adds a TestProperty to the current TestResult object when invoked from + // inside a test, to current TestCase's ad_hoc_test_result_ when invoked + // from SetUpTestCase or TearDownTestCase, or to the global property set + // when invoked elsewhere. If the result already contains a property with + // the same key, the value will be updated. + void RecordProperty(const std::string& key, const std::string& value); + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + TestCase* GetMutableTestCase(int i); + + // Accessors for the implementation object. + internal::UnitTestImpl* impl() { return impl_; } + const internal::UnitTestImpl* impl() const { return impl_; } + + // These classes and functions are friends as they need to access private + // members of UnitTest. + friend class ScopedTrace; + friend class Test; + friend class internal::AssertHelper; + friend class internal::StreamingListenerTest; + friend class internal::UnitTestRecordPropertyTestHelper; + friend Environment* AddGlobalTestEnvironment(Environment* env); + friend internal::UnitTestImpl* internal::GetUnitTestImpl(); + friend void internal::ReportFailureInUnknownLocation( + TestPartResult::Type result_type, + const std::string& message); + + // Creates an empty UnitTest. + UnitTest(); + + // D'tor + virtual ~UnitTest(); + + // Pushes a trace defined by SCOPED_TRACE() on to the per-thread + // Google Test trace stack. + void PushGTestTrace(const internal::TraceInfo& trace) + GTEST_LOCK_EXCLUDED_(mutex_); + + // Pops a trace from the per-thread Google Test trace stack. + void PopGTestTrace() + GTEST_LOCK_EXCLUDED_(mutex_); + + // Protects mutable state in *impl_. This is mutable as some const + // methods need to lock it too. + mutable internal::Mutex mutex_; + + // Opaque implementation object. This field is never changed once + // the object is constructed. We don't mark it as const here, as + // doing so will cause a warning in the constructor of UnitTest. + // Mutable state in *impl_ is protected by mutex_. + internal::UnitTestImpl* impl_; + + // We disallow copying UnitTest. + GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest); +}; + +// A convenient wrapper for adding an environment for the test +// program. +// +// You should call this before RUN_ALL_TESTS() is called, probably in +// main(). If you use gtest_main, you need to call this before main() +// starts for it to take effect. 
For example, you can define a global
+// variable like this:
+//
+//   testing::Environment* const foo_env =
+//       testing::AddGlobalTestEnvironment(new FooEnvironment);
+//
+// However, we strongly recommend you to write your own main() and
+// call AddGlobalTestEnvironment() there, as relying on initialization
+// of global variables makes the code harder to read and may cause
+// problems when you register multiple environments from different
+// translation units and the environments have dependencies among them
+// (remember that the compiler doesn't guarantee the order in which
+// global variables from different translation units are initialized).
+inline Environment* AddGlobalTestEnvironment(Environment* env) {
+  return UnitTest::GetInstance()->AddEnvironment(env);
+}
+
+// Initializes Google Test. This must be called before calling
+// RUN_ALL_TESTS(). In particular, it parses a command line for the
+// flags that Google Test recognizes. Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned. Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+GTEST_API_ void InitGoogleTest(int* argc, char** argv);
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv);
+
+namespace internal {
+
+// Separate the error generating code from the code path to reduce the stack
+// frame size of CmpHelperEQ. This helps reduce the overhead of some sanitizers
+// when calling EXPECT_* in a tight loop.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQFailure(const char* lhs_expression,
+                                   const char* rhs_expression,
+                                   const T1& lhs, const T2& rhs) {
+  return EqFailure(lhs_expression,
+                   rhs_expression,
+                   FormatForComparisonFailureMessage(lhs, rhs),
+                   FormatForComparisonFailureMessage(rhs, lhs),
+                   false);
+}
+
+// The helper function for {ASSERT|EXPECT}_EQ.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQ(const char* lhs_expression,
+                            const char* rhs_expression,
+                            const T1& lhs,
+                            const T2& rhs) {
+  if (lhs == rhs) {
+    return AssertionSuccess();
+  }
+
+  return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs);
+}
+
+// With this overloaded version, we allow anonymous enums to be used
+// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
+// can be implicitly cast to BiggestInt.
+GTEST_API_ AssertionResult CmpHelperEQ(const char* lhs_expression,
+                                       const char* rhs_expression,
+                                       BiggestInt lhs,
+                                       BiggestInt rhs);
+
+// The helper class for {ASSERT|EXPECT}_EQ. The template argument
+// lhs_is_null_literal is true iff the first argument to ASSERT_EQ()
+// is a null pointer literal. The following default implementation is
+// for lhs_is_null_literal being false.
+template <bool lhs_is_null_literal>
+class EqHelper {
+ public:
+  // This templatized version is for the general case.
+  template <typename T1, typename T2>
+  static AssertionResult Compare(const char* lhs_expression,
+                                 const char* rhs_expression,
+                                 const T1& lhs,
+                                 const T2& rhs) {
+    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+  }
+
+  // With this overloaded version, we allow anonymous enums to be used
+  // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous
+  // enums can be implicitly cast to BiggestInt.
+  //
+  // Even though its body looks the same as the above version, we
+  // cannot merge the two, as it will make anonymous enums unhappy.
+  static AssertionResult Compare(const char* lhs_expression,
+                                 const char* rhs_expression,
+                                 BiggestInt lhs,
+                                 BiggestInt rhs) {
+    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+  }
+};
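+
+// Sketch of how the two layers resolve (illustrative only; n is an int and
+// p is an int*): ASSERT_EQ/EXPECT_EQ instantiate EqHelper with a
+// compile-time flag saying whether the left-hand argument is a null
+// literal.
+//
+//   ASSERT_EQ(n, 0);     // EqHelper<false>, general template above
+//   ASSERT_EQ(0, n);     // EqHelper<true>, non-pointer overload below
+//   ASSERT_EQ(NULL, p);  // EqHelper<true>, Secret* overload below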
+
+// This specialization is used when the first argument to ASSERT_EQ()
+// is a null pointer literal, like NULL, false, or 0.
+template <>
+class EqHelper<true> {
+ public:
+  // We define two overloaded versions of Compare(). The first
+  // version will be picked when the second argument to ASSERT_EQ() is
+  // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or
+  // EXPECT_EQ(false, a_bool).
+  template <typename T1, typename T2>
+  static AssertionResult Compare(
+      const char* lhs_expression,
+      const char* rhs_expression,
+      const T1& lhs,
+      const T2& rhs,
+      // The following line prevents this overload from being considered if T2
+      // is not a pointer type. We need this because ASSERT_EQ(NULL, my_ptr)
+      // expands to Compare("", "", NULL, my_ptr), which requires a conversion
+      // to match the Secret* in the other overload, which would otherwise make
+      // this template match better.
+      typename EnableIf<!is_pointer<T2>::value>::type* = 0) {
+    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+  }
+
+  // This version will be picked when the second argument to ASSERT_EQ() is a
+  // pointer, e.g. ASSERT_EQ(NULL, a_pointer).
+  template <typename T>
+  static AssertionResult Compare(
+      const char* lhs_expression,
+      const char* rhs_expression,
+      // We used to have a second template parameter instead of Secret*. That
+      // template parameter would deduce to 'long', making this a better match
+      // than the first overload even without the first overload's EnableIf.
+      // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to
+      // non-pointer argument" (even a deduced integral argument), so the old
+      // implementation caused warnings in user code.
+      Secret* /* lhs (NULL) */,
+      T* rhs) {
+    // We already know that 'lhs' is a null pointer.
+    return CmpHelperEQ(lhs_expression, rhs_expression,
+                       static_cast<T*>(NULL), rhs);
+  }
+};
+
+// Separate the error generating code from the code path to reduce the stack
+// frame size of CmpHelperOP. This helps reduce the overhead of some sanitizers
+// when calling EXPECT_OP in a tight loop.
+template <typename T1, typename T2>
+AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2,
+                                   const T1& val1, const T2& val2,
+                                   const char* op) {
+  return AssertionFailure()
+         << "Expected: (" << expr1 << ") " << op << " (" << expr2
+         << "), actual: " << FormatForComparisonFailureMessage(val1, val2)
+         << " vs " << FormatForComparisonFailureMessage(val2, val1);
+}
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_??. It is here just to avoid copy-and-paste
+// of similar code.
+//
+// For each templatized helper function, we also define an overloaded
+// version for BiggestInt in order to reduce code bloat and allow
+// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
+// with gcc 4.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+template <typename T1, typename T2>\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                   const T1& val1, const T2& val2) {\
+  if (val1 op val2) {\
+    return AssertionSuccess();\
+  } else {\
+    return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\
+  }\
+}\
+GTEST_API_ AssertionResult CmpHelper##op_name(\
+    const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2)
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+ +// Implements the helper function for {ASSERT|EXPECT}_NE +GTEST_IMPL_CMP_HELPER_(NE, !=); +// Implements the helper function for {ASSERT|EXPECT}_LE +GTEST_IMPL_CMP_HELPER_(LE, <=); +// Implements the helper function for {ASSERT|EXPECT}_LT +GTEST_IMPL_CMP_HELPER_(LT, <); +// Implements the helper function for {ASSERT|EXPECT}_GE +GTEST_IMPL_CMP_HELPER_(GE, >=); +// Implements the helper function for {ASSERT|EXPECT}_GT +GTEST_IMPL_CMP_HELPER_(GT, >); + +#undef GTEST_IMPL_CMP_HELPER_ + +// The helper function for {ASSERT|EXPECT}_STREQ. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2); + +// The helper function for {ASSERT|EXPECT}_STRCASEEQ. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2); + +// The helper function for {ASSERT|EXPECT}_STRNE. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2); + +// The helper function for {ASSERT|EXPECT}_STRCASENE. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2); + + +// Helper function for *_STREQ on wide strings. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, + const char* s2_expression, + const wchar_t* s1, + const wchar_t* s2); + +// Helper function for *_STRNE on wide strings. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const wchar_t* s1, + const wchar_t* s2); + +} // namespace internal + +// IsSubstring() and IsNotSubstring() are intended to be used as the +// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by +// themselves. They check whether needle is a substring of haystack +// (NULL is considered a substring of itself only), and return an +// appropriate error message when they fail. +// +// The {needle,haystack}_expr arguments are the stringified +// expressions that generated the two real arguments. 
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+// Helper template function for comparing floating-points.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename RawType>
+AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
+                                         const char* rhs_expression,
+                                         RawType lhs_value,
+                                         RawType rhs_value) {
+  const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
+
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  ::std::stringstream lhs_ss;
+  lhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << lhs_value;
+
+  ::std::stringstream rhs_ss;
+  rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << rhs_value;
+
+  return EqFailure(lhs_expression,
+                   rhs_expression,
+                   StringStreamToString(&lhs_ss),
+                   StringStreamToString(&rhs_ss),
+                   false);
+}
+
+// Helper function for implementing ASSERT_NEAR.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
+                                                const char* expr2,
+                                                const char* abs_error_expr,
+                                                double val1,
+                                                double val2,
+                                                double abs_error);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+// A class that enables one to stream messages to assertion macros
+class GTEST_API_ AssertHelper {
+ public:
+  // Constructor.
+  AssertHelper(TestPartResult::Type type,
+               const char* file,
+               int line,
+               const char* message);
+  ~AssertHelper();
+
+  // Message assignment is a semantic trick to enable assertion
+  // streaming; see the GTEST_MESSAGE_ macro below.
+  void operator=(const Message& message) const;
+
+ private:
+  // We put our data in a struct so that the size of the AssertHelper class can
+  // be as small as possible.  This is important because gcc is incapable of
+  // re-using stack space even for temporary variables, so every EXPECT_EQ
+  // reserves stack space for another AssertHelper.
+  struct AssertHelperData {
+    AssertHelperData(TestPartResult::Type t,
+                     const char* srcfile,
+                     int line_num,
+                     const char* msg)
+        : type(t), file(srcfile), line(line_num), message(msg) { }
+
+    TestPartResult::Type const type;
+    const char* const file;
+    int const line;
+    std::string const message;
+
+   private:
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+  };
+
+  AssertHelperData* const data_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+};
+
+}  // namespace internal
+
+// The pure interface class that all value-parameterized tests inherit from.
+// A value-parameterized class must inherit from both ::testing::Test and
+// ::testing::WithParamInterface. In most cases that just means inheriting
+// from ::testing::TestWithParam<T>, but more complicated test hierarchies
+// may need to inherit from Test and WithParamInterface<T> at different levels.
+//
+// This interface has support for accessing the test parameter value via
+// the GetParam() method.
+//
+// Use it with one of the parameter generator defining functions, like Range(),
+// Values(), ValuesIn(), Bool(), and Combine().
+//
+// class FooTest : public ::testing::TestWithParam<int> {
+//  protected:
+//   FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   virtual ~FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   virtual void SetUp() {
+//     // Can use GetParam() here.
+//   }
+//   virtual void TearDown() {
+//     // Can use GetParam() here.
+//   }
+// };
+// TEST_P(FooTest, DoesBar) {
+//   // Can use GetParam() method here.
+//   Foo foo;
+//   ASSERT_TRUE(foo.DoesBar(GetParam()));
+// }
+// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
+
+template <typename T>
+class WithParamInterface {
+ public:
+  typedef T ParamType;
+  virtual ~WithParamInterface() {}
+
+  // The current parameter value. Is also available in the test fixture's
+  // constructor. This member function is non-static, even though it only
+  // references static data, to reduce the opportunity for incorrect uses
+  // like writing 'WithParamInterface<bool>::GetParam()' for a test that
+  // uses a fixture whose parameter type is int.
+  const ParamType& GetParam() const {
+    GTEST_CHECK_(parameter_ != NULL)
+        << "GetParam() can only be called inside a value-parameterized test "
+        << "-- did you intend to write TEST_P instead of TEST_F?";
+    return *parameter_;
+  }
+
+ private:
+  // Sets parameter value. The caller is responsible for making sure the value
+  // remains alive and unchanged throughout the current test.
+  static void SetParam(const ParamType* parameter) {
+    parameter_ = parameter;
+  }
+
+  // Static value used for accessing parameter during a test lifetime.
+  static const ParamType* parameter_;
+
+  // TestClass must be a subclass of WithParamInterface<T> and Test.
+  template <class TestClass> friend class internal::ParameterizedTestFactory;
+};
+
+template <typename T>
+const T* WithParamInterface<T>::parameter_ = NULL;
+
+// Most value-parameterized classes can ignore the existence of
+// WithParamInterface, and can just inherit from ::testing::TestWithParam.
+
+template <typename T>
+class TestWithParam : public Test, public WithParamInterface<T> {
+};
+
+// Macros for indicating success/failure in test code.
+
+// ADD_FAILURE unconditionally adds a failure to the current test.
+// SUCCEED generates a success - it doesn't automatically make the
+// current test successful, as a test is only successful when it has
+// no failure.
+//
+// EXPECT_* verifies that a certain condition is satisfied.  If not,
+// it behaves like ADD_FAILURE.  In particular:
+//
+//   EXPECT_TRUE  verifies that a Boolean condition is true.
+//   EXPECT_FALSE verifies that a Boolean condition is false.
+//
+// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except
+// that they will also abort the current function on failure.  People
+// usually want the fail-fast behavior of FAIL and ASSERT_*, but those
+// writing data-driven tests often find themselves using ADD_FAILURE
+// and EXPECT_* more.
+
+// Generates a nonfatal failure with a generic message.
+#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed")
+
+// Generates a nonfatal failure at the given source file location with
+// a generic message.
+#define ADD_FAILURE_AT(file, line) \
+  GTEST_MESSAGE_AT_(file, line, "Failed", \
+                    ::testing::TestPartResult::kNonFatalFailure)
+
+// Generates a fatal failure with a generic message.
+#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed")
+
+// Define this macro to 1 to omit the definition of FAIL(), which is a
+// generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_FAIL
+# define FAIL() GTEST_FAIL()
+#endif
+
+// Generates a success with a generic message.
+#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded")
+
+// Define this macro to 1 to omit the definition of SUCCEED(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_SUCCEED
+# define SUCCEED() GTEST_SUCCEED()
+#endif
+
+// Macros for testing exceptions.
+//
+//    * {ASSERT|EXPECT}_THROW(statement, expected_exception):
+//         Tests that the statement throws the expected exception.
+//    * {ASSERT|EXPECT}_NO_THROW(statement):
+//         Tests that the statement doesn't throw any exception.
+//    * {ASSERT|EXPECT}_ANY_THROW(statement):
+//         Tests that the statement throws an exception.
+
+#define EXPECT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_)
+#define ASSERT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_)
+#define ASSERT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_)
+
+// Boolean assertions. Condition can be either a Boolean expression or an
+// AssertionResult. For more information on how to use AssertionResult with
+// these macros see comments on that class.
+#define EXPECT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_NONFATAL_FAILURE_)
+#define EXPECT_FALSE(condition) \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_NONFATAL_FAILURE_)
+#define ASSERT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_FATAL_FAILURE_)
+#define ASSERT_FALSE(condition) \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_FATAL_FAILURE_)
+
+// Macros for testing equalities and inequalities.
+//
+//    * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2
+//    * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2
+//    * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2
+//    * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2
+//    * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2
+//    * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2
+//
+// When they are not, Google Test prints both the tested expressions and
+// their actual values.  The values must be compatible built-in types,
+// or you will get a compiler error.  By "compatible" we mean that the
+// values can be compared by the respective operator.
+//
+// Note:
+//
+//   1. It is possible to make a user-defined type work with
+//   {ASSERT|EXPECT}_??(), but that requires overloading the
+//   comparison operators and is thus discouraged by the Google C++
+//   Usage Guide.  Therefore, you are advised to use the
+//   {ASSERT|EXPECT}_TRUE() macro to assert that two objects are
+//   equal.
+//
+//   2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on
+//   pointers (in particular, C strings).  Therefore, if you use it
+//   with two C strings, you are testing how their locations in memory
+//   are related, not how their content is related.  To compare two C
+//   strings by content, use {ASSERT|EXPECT}_STR*().
+//
+//   3. {ASSERT|EXPECT}_EQ(v1, v2) is preferred to
+//   {ASSERT|EXPECT}_TRUE(v1 == v2), as the former tells you
+//   what the actual value is when it fails, and similarly for the
+//   other comparisons.
+//
+//   4. Do not depend on the order in which {ASSERT|EXPECT}_??()
+//   evaluate their arguments, which is undefined.
+//
+//   5. These macros evaluate their arguments exactly once.
+//
+// Examples:
+//
+//   EXPECT_NE(Foo(), 5);
+//   EXPECT_EQ(a_pointer, NULL);
+//   ASSERT_LT(i, array_size);
+//   ASSERT_GT(records.size(), 0) << "There is no record left.";
+
+#define EXPECT_EQ(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal:: \
+                      EqHelper<GTEST_IS_NULL_LITERAL_(val1)>::Compare, \
+                      val1, val2)
+#define EXPECT_NE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define EXPECT_LE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define EXPECT_LT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define EXPECT_GE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define EXPECT_GT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+#define GTEST_ASSERT_EQ(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal:: \
+                      EqHelper<GTEST_IS_NULL_LITERAL_(val1)>::Compare, \
+                      val1, val2)
+#define GTEST_ASSERT_NE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define GTEST_ASSERT_LE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define GTEST_ASSERT_LT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define GTEST_ASSERT_GE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define GTEST_ASSERT_GT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+// Define macro GTEST_DONT_DEFINE_ASSERT_XY to 1 to omit the definition of
+// ASSERT_XY(), which clashes with some users' own code.
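// Illustrative sketch of that opt-out: a project whose own headers already
// define ASSERT_EQ can suppress gtest's short alias and use the prefixed
// spelling instead. The build flag is the documented knob; the test body is
// hypothetical.
//
//   compile with: -DGTEST_DONT_DEFINE_ASSERT_EQ=1
//
//   TEST(AliasFreeSketch, UsesPrefixedMacro) {
//     GTEST_ASSERT_EQ(4, 2 + 2);  // ASSERT_EQ itself is left undefined
//   }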
+
+#if !GTEST_DONT_DEFINE_ASSERT_EQ
+# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_NE
+# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LE
+# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LT
+# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GE
+# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GT
+# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#endif
+
+// C-string Comparisons.  All tests treat NULL and any non-NULL string
+// as different.  Two NULLs are equal.
+//
+//    * {ASSERT|EXPECT}_STREQ(s1, s2):     Tests that s1 == s2
+//    * {ASSERT|EXPECT}_STRNE(s1, s2):     Tests that s1 != s2
+//    * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case
+//    * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case
+//
+// For wide or narrow string objects, you can use the
+// {ASSERT|EXPECT}_??() macros.
+//
+// Don't depend on the order in which the arguments are evaluated,
+// which is undefined.
+//
+// These macros evaluate their arguments exactly once.
+
+#define EXPECT_STREQ(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
+#define EXPECT_STRNE(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define EXPECT_STRCASEEQ(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define EXPECT_STRCASENE(s1, s2)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+#define ASSERT_STREQ(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
+#define ASSERT_STRNE(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define ASSERT_STRCASEEQ(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define ASSERT_STRCASENE(s1, s2)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+// Macros for comparing floating-point numbers.
+//
+//    * {ASSERT|EXPECT}_FLOAT_EQ(val1, val2):
+//         Tests that two float values are almost equal.
+//    * {ASSERT|EXPECT}_DOUBLE_EQ(val1, val2):
+//         Tests that two double values are almost equal.
+//    * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error):
+//         Tests that v1 and v2 are within the given distance to each other.
+//
+// Google Test uses ULP-based comparison to automatically pick a default
+// error bound that is appropriate for the operands.  See the
+// FloatingPoint template class in gtest-internal.h if you are
+// interested in the implementation details.
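// Usage sketch for the floating-point assertions defined just below. The
// accumulated float sum drifts from 1.0f by roughly one ULP, which the
// ULP-based EXPECT_FLOAT_EQ tolerates; EXPECT_NEAR takes an explicit
// absolute error bound instead. The test name is illustrative.
TEST(FloatSketch, AlmostEqual) {
  float sum = 0.0f;
  for (int i = 0; i < 10; ++i) sum += 0.1f;  // rounding error accumulates
  EXPECT_FLOAT_EQ(1.0f, sum);                // ULP-based comparison passes
  EXPECT_NEAR(1.0, static_cast<double>(sum), 1e-6);  // explicit bound
}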
+
+#define EXPECT_FLOAT_EQ(val1, val2)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      val1, val2)
+
+#define EXPECT_DOUBLE_EQ(val1, val2)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      val1, val2)
+
+#define ASSERT_FLOAT_EQ(val1, val2)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      val1, val2)
+
+#define ASSERT_DOUBLE_EQ(val1, val2)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      val1, val2)
+
+#define EXPECT_NEAR(val1, val2, abs_error)\
+  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
+
+#define ASSERT_NEAR(val1, val2, abs_error)\
+  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
+
+// These predicate format functions work on floating-point values, and
+// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
+//
+//   EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0);
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
+                                   float val1, float val2);
+GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
+                                    double val1, double val2);
+
+
+#if GTEST_OS_WINDOWS
+
+// Macros that test for HRESULT failure and success, these are only useful
+// on Windows, and rely on Windows SDK macros and APIs to compile.
+//
+//    * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr)
+//
+// When expr unexpectedly fails or succeeds, Google Test prints the
+// expected result and the actual result with both a human-readable
+// string representation of the error, if available, as well as the
+// hex result code.
+# define EXPECT_HRESULT_SUCCEEDED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define ASSERT_HRESULT_SUCCEEDED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define EXPECT_HRESULT_FAILED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+# define ASSERT_HRESULT_FAILED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+#endif  // GTEST_OS_WINDOWS
+
+// Macros that execute statement and check that it doesn't generate new fatal
+// failures in the current thread.
+//
+//   * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement);
+//
+// Examples:
+//
+//   EXPECT_NO_FATAL_FAILURE(Process());
+//   ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
+//
+#define ASSERT_NO_FATAL_FAILURE(statement) \
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+#define EXPECT_NO_FATAL_FAILURE(statement) \
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+
+// Causes a trace (including the given source file path and line number,
+// and the given message) to be included in every test failure message generated
+// by code in the scope of the lifetime of an instance of this class. The effect
+// is undone with the destruction of the instance.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// Example:
+//   testing::ScopedTrace trace("file.cc", 123, "message");
+//
+class GTEST_API_ ScopedTrace {
+ public:
+  // The c'tor pushes the given source file location and message onto
+  // a trace stack maintained by Google Test.
+
+  // Template version. Uses Message() to convert the values into strings.
+  // Slow, but flexible.
+  template <typename T>
+  ScopedTrace(const char* file, int line, const T& message) {
+    PushTrace(file, line, (Message() << message).GetString());
+  }
+
+  // Optimize for some known types.
+  ScopedTrace(const char* file, int line, const char* message) {
+    PushTrace(file, line, message ? message : "(null)");
+  }
+
+#if GTEST_HAS_GLOBAL_STRING
+  ScopedTrace(const char* file, int line, const ::string& message) {
+    PushTrace(file, line, message);
+  }
+#endif
+
+  ScopedTrace(const char* file, int line, const std::string& message) {
+    PushTrace(file, line, message);
+  }
+
+  // The d'tor pops the info pushed by the c'tor.
+  //
+  // Note that the d'tor is not virtual in order to be efficient.
+  // Don't inherit from ScopedTrace!
+  ~ScopedTrace();
+
+ private:
+  void PushTrace(const char* file, int line, std::string message);
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
+} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
+                            // c'tor and d'tor.  Therefore it doesn't
+                            // need to be used otherwise.
+
+// Causes a trace (including the source file path, the current line
+// number, and the given message) to be included in every test failure
+// message generated by code in the current scope.  The effect is
+// undone when the control leaves the current scope.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// In the implementation, we include the current line number as part
+// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s
+// to appear in the same block - as long as they are on different
+// lines.
+//
+// Assuming that each thread maintains its own stack of traces.
+// Therefore, a SCOPED_TRACE() would (correctly) only affect the
+// assertions in its own thread.
+#define SCOPED_TRACE(message) \
+  ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
+    __FILE__, __LINE__, (message))
+
+
+// Compile-time assertion for type equality.
+// StaticAssertTypeEq<type1, type2>() compiles iff type1 and type2 are
+// the same type.  The value it returns is not interesting.
+//
+// Instead of making StaticAssertTypeEq a class template, we make it a
+// function template that invokes a helper class template.  This
+// prevents a user from misusing StaticAssertTypeEq by
+// defining objects of that type.
+//
+// CAVEAT:
+//
+// When used inside a method of a class template,
+// StaticAssertTypeEq<T1, T2>() is effective ONLY IF the method is
+// instantiated.  For example, given:
+//
+//   template <typename T> class Foo {
+//    public:
+//     void Bar() { testing::StaticAssertTypeEq<int, T>(); }
+//   };
+//
+// the code:
+//
+//   void Test1() { Foo<bool> foo; }
+//
+// will NOT generate a compiler error, as Foo<bool>::Bar() is never
+// actually instantiated.  Instead, you need:
+//
+//   void Test2() { Foo<bool> foo; foo.Bar(); }
+//
+// to cause a compiler error.
+template <typename T1, typename T2>
+bool StaticAssertTypeEq() {
+  (void)internal::StaticAssertTypeEqHelper<T1, T2>();
+  return true;
+}
+
+// Defines a test.
+//
+// The first parameter is the name of the test case, and the second
+// parameter is the name of the test within the test case.
+//
+// The convention is to end the test case name with "Test".  For
+// example, a test case for the Foo class can be named FooTest.
+//
+// Test code should appear between braces after an invocation of
+// this macro.  Example:
+//
+//   TEST(FooTest, InitializesCorrectly) {
+//     Foo foo;
+//     EXPECT_TRUE(foo.StatusIsOK());
+//   }
+
+// Note that we call GetTestTypeId() instead of GetTypeId<
+// ::testing::Test>() here to get the type ID of testing::Test.  This
+// is to work around a suspected linker bug when using Google Test as
+// a framework on Mac OS X.  The bug causes GetTypeId<
+// ::testing::Test>() to return different values depending on whether
+// the call is from the Google Test framework itself or from user test
+// code.  GetTestTypeId() is guaranteed to always return the same
+// value, as it always calls GetTypeId<>() from the Google Test
+// framework.
+#define GTEST_TEST(test_case_name, test_name)\
+  GTEST_TEST_(test_case_name, test_name, \
+              ::testing::Test, ::testing::internal::GetTestTypeId())
+
+// Define this macro to 1 to omit the definition of TEST(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_TEST
+# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name)
+#endif
+
+// Defines a test that uses a test fixture.
+//
+// The first parameter is the name of the test fixture class, which
+// also doubles as the test case name.  The second parameter is the
+// name of the test within the test case.
+//
+// A test fixture class must be declared earlier.  The user should put
+// the test code between braces after using this macro.  Example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     virtual void SetUp() { b_.AddElement(3); }
+//
+//     Foo a_;
+//     Foo b_;
+//   };
+//
+//   TEST_F(FooTest, InitializesCorrectly) {
+//     EXPECT_TRUE(a_.StatusIsOK());
+//   }
+//
+//   TEST_F(FooTest, ReturnsElementCountCorrectly) {
+//     EXPECT_EQ(a_.size(), 0);
+//     EXPECT_EQ(b_.size(), 1);
+//   }
+
+#define TEST_F(test_fixture, test_name)\
+  GTEST_TEST_(test_fixture, test_name, test_fixture, \
+              ::testing::internal::GetTypeId<test_fixture>())
+
+// Returns a path to temporary directory.
+// Tries to determine an appropriate directory for the platform.
+GTEST_API_ std::string TempDir();
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+}  // namespace testing
+
+// Use this function in main() to run all tests.  It returns 0 if all
+// tests are successful, or 1 otherwise.
+//
+// RUN_ALL_TESTS() should be invoked after the command line has been
+// parsed by InitGoogleTest().
+//
+// This function was formerly a macro; thus, it is in the global
+// namespace and has an all-caps name.
+int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
+
+inline int RUN_ALL_TESTS() {
+  return ::testing::UnitTest::GetInstance()->Run();
+}
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/block_cache_analyzer/block_cache_trace_analyzer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/block_cache_analyzer/block_cache_trace_analyzer.h
new file mode 100644
index 0000000000..2f1ebd139b
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/block_cache_analyzer/block_cache_trace_analyzer.h
@@ -0,0 +1,397 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/trace_record.h"
+#include "rocksdb/utilities/sim_cache.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "utilities/simulator_cache/cache_simulator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Statistics of a key referenced by a Get.
+struct GetKeyInfo {
+  uint64_t key_id = 0;
+  std::vector<uint64_t> access_sequence_number_timeline;
+  std::vector<uint64_t> access_timeline;
+
+  void AddAccess(const BlockCacheTraceRecord& access,
+                 uint64_t access_sequnce_number) {
+    access_sequence_number_timeline.push_back(access_sequnce_number);
+    access_timeline.push_back(access.access_timestamp);
+  }
+};
+
+// Statistics of a block.
+struct BlockAccessInfo {
+  uint64_t block_id = 0;
+  uint64_t table_id = 0;
+  uint64_t block_offset = 0;
+  uint64_t num_accesses = 0;
+  uint64_t block_size = 0;
+  uint64_t first_access_time = 0;
+  uint64_t last_access_time = 0;
+  uint64_t num_keys = 0;
+  std::map<std::string, std::map<TableReaderCaller, uint64_t>>
+      key_num_access_map;  // for keys exist in this block.
+  std::map<std::string, std::map<TableReaderCaller, uint64_t>>
+      non_exist_key_num_access_map;  // for keys do not exist in this block.
+  uint64_t num_referenced_key_exist_in_block = 0;
+  uint64_t referenced_data_size = 0;
+  std::map<TableReaderCaller, uint64_t> caller_num_access_map;
+  // caller:timestamp:number_of_accesses. The granularity of the timestamp is
+  // seconds.
+  std::map<TableReaderCaller, std::map<uint64_t, uint64_t>>
+      caller_num_accesses_timeline;
+  // Unique blocks since the last access.
+  std::set<std::string> unique_blocks_since_last_access;
+  // Number of reuses grouped by reuse distance.
+  std::map<uint64_t, uint64_t> reuse_distance_count;
+
+  // The access sequence numbers of this block.
+  std::vector<uint64_t> access_sequence_number_timeline;
+  std::map<TableReaderCaller, std::vector<uint64_t>>
+      caller_access_sequence__number_timeline;
+  // The access timestamp in microseconds of this block.
+  std::vector<uint64_t> access_timeline;
+  std::map<TableReaderCaller, std::vector<uint64_t>> caller_access_timeline;
+
+  void AddAccess(const BlockCacheTraceRecord& access,
+                 uint64_t access_sequnce_number) {
+    if (block_size != 0 && access.block_size != 0) {
+      assert(block_size == access.block_size);
+    }
+    if (num_keys != 0 && access.num_keys_in_block != 0) {
+      assert(num_keys == access.num_keys_in_block);
+    }
+    if (first_access_time == 0) {
+      first_access_time = access.access_timestamp;
+    }
+    table_id = BlockCacheTraceHelper::GetTableId(access);
+    block_offset = BlockCacheTraceHelper::GetBlockOffsetInFile(access);
+    last_access_time = access.access_timestamp;
+    block_size = access.block_size;
+    caller_num_access_map[access.caller]++;
+    num_accesses++;
+    // access.access_timestamp is in microsecond.
+    const uint64_t timestamp_in_seconds =
+        access.access_timestamp / kMicrosInSecond;
+    caller_num_accesses_timeline[access.caller][timestamp_in_seconds] += 1;
+    // Populate the feature vectors.
+    access_sequence_number_timeline.push_back(access_sequnce_number);
+    caller_access_sequence__number_timeline[access.caller].push_back(
+        access_sequnce_number);
+    access_timeline.push_back(access.access_timestamp);
+    caller_access_timeline[access.caller].push_back(access.access_timestamp);
+    if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(access.block_type,
+                                                          access.caller)) {
+      num_keys = access.num_keys_in_block;
+      if (access.referenced_key_exist_in_block) {
+        if (key_num_access_map.find(access.referenced_key) ==
+            key_num_access_map.end()) {
+          referenced_data_size += access.referenced_data_size;
+        }
+        key_num_access_map[access.referenced_key][access.caller]++;
+        num_referenced_key_exist_in_block++;
+        if (referenced_data_size > block_size && block_size != 0) {
+          ParsedInternalKey internal_key;
+          Status s = ParseInternalKey(access.referenced_key, &internal_key,
+                                      false /* log_err_key */);  // TODO
+          assert(s.ok());  // TODO
+        }
+      } else {
+        non_exist_key_num_access_map[access.referenced_key][access.caller]++;
+      }
+    }
+  }
+};
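// A simplified standalone sketch (hypothetical, not part of this header) of
// the reuse-distance bookkeeping implied by unique_blocks_since_last_access
// and reuse_distance_count above. For brevity it counts the distinct blocks
// seen between two accesses to the same block; the analyzer accumulates
// their sizes instead.
#include <cstdint>
#include <map>
#include <set>
#include <string>

struct ReuseDistanceSketch {
  // Per-block set of distinct other blocks seen since its last access.
  std::map<std::string, std::set<std::string>> seen_since_last_access;
  std::map<uint64_t, uint64_t> reuse_distance_count;  // distance -> frequency

  void OnAccess(const std::string& block_key) {
    auto it = seen_since_last_access.find(block_key);
    if (it != seen_since_last_access.end()) {
      reuse_distance_count[it->second.size()]++;  // record this reuse
      it->second.clear();                         // restart the window
    } else {
      seen_since_last_access[block_key];          // first access: start tracking
    }
    for (auto& entry : seen_since_last_access) {
      if (entry.first != block_key) entry.second.insert(block_key);
    }
  }
};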
+
+// Aggregates stats of a block given a block type.
+struct BlockTypeAccessInfoAggregate {
+  std::map<std::string, BlockAccessInfo> block_access_info_map;
+};
+
+// Aggregates BlockTypeAggregate given a SST file.
+struct SSTFileAccessInfoAggregate {
+  uint32_t level;
+  std::map<TraceType, BlockTypeAccessInfoAggregate> block_type_aggregates_map;
+};
+
+// Aggregates SSTFileAggregate given a column family.
+struct ColumnFamilyAccessInfoAggregate {
+  std::map<uint64_t, SSTFileAccessInfoAggregate> fd_aggregates_map;
+};
+
+struct Features {
+  std::vector<uint64_t> elapsed_time_since_last_access;
+  std::vector<uint64_t> num_accesses_since_last_access;
+  std::vector<uint64_t> num_past_accesses;
+};
+
+struct Predictions {
+  std::vector<uint64_t> elapsed_time_till_next_access;
+  std::vector<uint64_t> num_accesses_till_next_access;
+};
+
+class BlockCacheTraceAnalyzer {
+ public:
+  BlockCacheTraceAnalyzer(
+      const std::string& trace_file_path, const std::string& output_dir,
+      const std::string& human_readable_trace_file_path,
+      bool compute_reuse_distance, bool mrc_only,
+      bool is_human_readable_trace_file,
+      std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator);
+  ~BlockCacheTraceAnalyzer() = default;
+  // No copy and move.
+  BlockCacheTraceAnalyzer(const BlockCacheTraceAnalyzer&) = delete;
+  BlockCacheTraceAnalyzer& operator=(const BlockCacheTraceAnalyzer&) = delete;
+  BlockCacheTraceAnalyzer(BlockCacheTraceAnalyzer&&) = delete;
+  BlockCacheTraceAnalyzer& operator=(BlockCacheTraceAnalyzer&&) = delete;
+
+  // Read all access records in the given trace_file, maintains the stats of
+  // a block, and aggregates the information by block type, sst file, and
+  // column family. Subsequently, the caller may call Print* functions to
+  // print statistics.
+  Status Analyze();
+
+  // Print a summary of statistics of the trace, e.g.,
+  // Number of files: 2 Number of blocks: 50 Number of accesses: 50
+  // Number of Index blocks: 10
+  // Number of Filter blocks: 10
+  // Number of Data blocks: 10
+  // Number of UncompressionDict blocks: 10
+  // Number of RangeDeletion blocks: 10
+  // ***************************************************************
+  // Caller Get: Number of accesses 10
+  // Caller Get: Number of accesses per level break down
+  //   Level 0: Number of accesses: 10
+  // Caller Get: Number of accesses per block type break down
+  //   Block Type Index: Number of accesses: 2
+  //   Block Type Filter: Number of accesses: 2
+  //   Block Type Data: Number of accesses: 2
+  //   Block Type UncompressionDict: Number of accesses: 2
+  //   Block Type RangeDeletion: Number of accesses: 2
+  void PrintStatsSummary() const;
+
+  // Print block size distribution and the distribution break down by block
+  // type and column family.
+  void PrintBlockSizeStats() const;
+
+  // Print access count distribution and the distribution break down by block
+  // type and column family.
+  void PrintAccessCountStats(bool user_access_only, uint32_t bottom_k,
+                             uint32_t top_k) const;
+
+  // Print data block accesses by user Get and Multi-Get.
+  // It prints out 1) A histogram on the percentage of keys accessed in a data
+  // block break down by if a referenced key exists in the data block and the
+  // histogram break down by column family. 2) A histogram on the percentage
+  // of accesses on keys exist in a data block and its break down by column
+  // family.
+  void PrintDataBlockAccessStats() const;
+
+  // Write the percentage of accesses break down by column family into a csv
+  // file saved in 'output_dir'.
+  //
+  // The file is named "percentage_of_accesses_summary". The file format is
+  // caller,cf_0,cf_1,...,cf_n where the cf_i is the column family name found
+  // in the trace.
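// Illustrative contents of that file, assuming two hypothetical column
// families "default" and "meta" (values are percentages of accesses):
//
//   caller,default,meta
//   Get,62.5,37.5
//   Iterator,80.0,20.0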
+  void WritePercentAccessSummaryStats() const;
+
+  // Write the percentage of accesses for the given caller break down by
+  // column family, level, and block type into a csv file saved in
+  // 'output_dir'.
+  //
+  // It generates two files: 1) caller_level_percentage_of_accesses_summary
+  // and 2) caller_bt_percentage_of_accesses_summary which break down by the
+  // level and block type, respectively. The file format is
+  // level/bt,cf_0,cf_1,...,cf_n where cf_i is the column family name found in
+  // the trace.
+  void WriteDetailedPercentAccessSummaryStats(TableReaderCaller caller) const;
+
+  // Write the access count summary into a csv file saved in 'output_dir'.
+  // It groups blocks by their access count.
+  //
+  // It generates two files: 1) cf_access_count_summary and 2)
+  // bt_access_count_summary which break down the access count by column
+  // family and block type, respectively. The file format is
+  // cf/bt,bucket_0,bucket_1,...,bucket_N.
+  void WriteAccessCountSummaryStats(
+      const std::vector<uint64_t>& access_count_buckets,
+      bool user_access_only) const;
+
+  // Write miss ratio curves of simulated cache configurations into a csv file
+  // named "mrc" saved in 'output_dir'.
+  //
+  // The file format is
+  // "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses".
+  void WriteMissRatioCurves() const;
+
+  // Write miss ratio timeline of simulated cache configurations into several
+  // csv files, one per cache capacity saved in 'output_dir'.
+  //
+  // The file format is
+  // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
+  // where N is the number of unique cache names
+  // (cache_name+num_shard_bits+ghost_capacity).
+  void WriteMissRatioTimeline(uint64_t time_unit) const;
+
+  // Write misses timeline of simulated cache configurations into several
+  // csv files, one per cache capacity saved in 'output_dir'.
+  //
+  // The file format is
+  // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
+  // where N is the number of unique cache names
+  // (cache_name+num_shard_bits+ghost_capacity).
+  void WriteMissTimeline(uint64_t time_unit) const;
+
+  // Write the access timeline into a csv file saved in 'output_dir'.
+  //
+  // The file is named "label_access_timeline". The file format is
+  // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
+  // where N is the number of unique labels found in the trace.
+  void WriteAccessTimeline(const std::string& label, uint64_t time_unit,
+                           bool user_access_only) const;
+
+  // Write the reuse distance into a csv file saved in 'output_dir'. Reuse
+  // distance is defined as the cumulated size of unique blocks read between
+  // two consecutive accesses on the same block.
+  //
+  // The file is named "label_reuse_distance". The file format is
+  // bucket,label_1,label_2,...,label_N.
+  void WriteReuseDistance(const std::string& label_str,
+                          const std::vector<uint64_t>& distance_buckets) const;
+
+  // Write the reuse interval into a csv file saved in 'output_dir'. Reuse
+  // interval is defined as the time between two consecutive accesses on the
+  // same block.
+  //
+  // The file is named "label_reuse_interval". The file format is
+  // bucket,label_1,label_2,...,label_N.
+  void WriteReuseInterval(const std::string& label_str,
+                          const std::vector<uint64_t>& time_buckets) const;
+
+  // Write the reuse lifetime into a csv file saved in 'output_dir'. Reuse
+  // lifetime is defined as the time interval between the first access of a
+  // block and its last access.
+ // + // The file is named "label_reuse_lifetime". The file format is + // bucket,label_1,label_2,...,label_N. + void WriteReuseLifetime(const std::string& label_str, + const std::vector& time_buckets) const; + + // Write the reuse timeline into a csv file saved in 'output_dir'. + // + // The file is named + // "block_type_user_access_only_reuse_window_reuse_timeline". The file format + // is start_time,0,1,...,N where N equals trace_duration / reuse_window. + void WriteBlockReuseTimeline(const uint64_t reuse_window, + bool user_access_only, + TraceType block_type) const; + + // Write the Get spatical locality into csv files saved in 'output_dir'. + // + // It generates three csv files. label_percent_ref_keys, + // label_percent_accesses_on_ref_keys, and + // label_percent_data_size_on_ref_keys. + void WriteGetSpatialLocality( + const std::string& label_str, + const std::vector& percent_buckets) const; + + void WriteCorrelationFeatures(const std::string& label_str, + uint32_t max_number_of_values) const; + + void WriteCorrelationFeaturesForGet(uint32_t max_number_of_values) const; + + void WriteSkewness(const std::string& label_str, + const std::vector& percent_buckets, + TraceType target_block_type) const; + + const std::map& + TEST_cf_aggregates_map() const { + return cf_aggregates_map_; + } + + private: + std::set ParseLabelStr(const std::string& label_str) const; + + std::string BuildLabel(const std::set& labels, + const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + TableReaderCaller caller, uint64_t block_key, + const BlockAccessInfo& block) const; + + void ComputeReuseDistance(BlockAccessInfo* info) const; + + Status RecordAccess(const BlockCacheTraceRecord& access); + + void UpdateReuseIntervalStats( + const std::string& label, const std::vector& time_buckets, + const std::map timeline, + std::map>* + label_time_num_reuses, + uint64_t* total_num_reuses) const; + + std::string OutputPercentAccessStats( + uint64_t total_accesses, + const std::map& cf_access_count) const; + + void WriteStatsToFile( + const std::string& label_str, const std::vector& time_buckets, + const std::string& filename_suffix, + const std::map>& label_data, + uint64_t ntotal) const; + + void TraverseBlocks( + std::function + block_callback, + std::set* labels = nullptr) const; + + void UpdateFeatureVectors( + const std::vector& access_sequence_number_timeline, + const std::vector& access_timeline, const std::string& label, + std::map* label_features, + std::map* label_predictions) const; + + void WriteCorrelationFeaturesToFile( + const std::string& label, + const std::map& label_features, + const std::map& label_predictions, + uint32_t max_number_of_values) const; + + ROCKSDB_NAMESPACE::Env* env_; + const std::string trace_file_path_; + const std::string output_dir_; + std::string human_readable_trace_file_path_; + const bool compute_reuse_distance_; + const bool mrc_only_; + const bool is_human_readable_trace_file_; + + BlockCacheTraceHeader header_; + std::unique_ptr cache_simulator_; + std::map cf_aggregates_map_; + std::map block_info_map_; + std::unordered_map get_key_info_map_; + uint64_t access_sequence_number_ = 0; + uint64_t trace_start_timestamp_in_seconds_ = 0; + uint64_t trace_end_timestamp_in_seconds_ = 0; + MissRatioStats miss_ratio_stats_; + uint64_t unique_block_id_ = 1; + uint64_t unique_get_key_id_ = 1; + BlockCacheHumanReadableTraceWriter human_readable_trace_writer_; +}; + +int block_cache_trace_analyzer_tool(int argc, char** argv); + +} // namespace ROCKSDB_NAMESPACE 
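// A minimal driver sketch for the analyzer declared above. The paths are
// hypothetical, and passing a null cache simulator (i.e. skipping cache
// simulation) is an assumption; block_cache_trace_analyzer_tool is the real
// command-line entry point.
#include <memory>

int AnalyzeTraceSketch() {
  ROCKSDB_NAMESPACE::BlockCacheTraceAnalyzer analyzer(
      "/tmp/block_cache_trace",      // trace_file_path (hypothetical)
      "/tmp/block_cache_analysis",   // output_dir (hypothetical)
      "",                            // no human-readable trace output
      /*compute_reuse_distance=*/true,
      /*mrc_only=*/false,
      /*is_human_readable_trace_file=*/false,
      /*cache_simulator=*/nullptr);  // assumed optional: no simulation
  ROCKSDB_NAMESPACE::Status s = analyzer.Analyze();
  if (!s.ok()) return 1;
  analyzer.PrintStatsSummary();
  return 0;
}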
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/dump/db_dump_tool.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/dump/db_dump_tool.cc new file mode 100644 index 0000000000..535e70c433 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/dump/db_dump_tool.cc @@ -0,0 +1,258 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#include "rocksdb/db_dump_tool.h" + +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +bool DbDumpTool::Run(const DumpOptions& dump_options, + ROCKSDB_NAMESPACE::Options options) { + ROCKSDB_NAMESPACE::DB* dbptr; + ROCKSDB_NAMESPACE::Status status; + std::unique_ptr dumpfile; + char hostname[1024]; + int64_t timesec = 0; + std::string abspath; + char json[4096]; + + static const char* magicstr = "ROCKDUMP"; + static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1}; + + ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default(); + + // Open the database + options.create_if_missing = false; + status = ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, dump_options.db_path, + &dbptr); + if (!status.ok()) { + std::cerr << "Unable to open database '" << dump_options.db_path + << "' for reading: " << status.ToString() << std::endl; + return false; + } + + const std::unique_ptr db(dbptr); + + status = env->NewWritableFile(dump_options.dump_location, &dumpfile, + ROCKSDB_NAMESPACE::EnvOptions()); + if (!status.ok()) { + std::cerr << "Unable to open dump file '" << dump_options.dump_location + << "' for writing: " << status.ToString() << std::endl; + return false; + } + + ROCKSDB_NAMESPACE::Slice magicslice(magicstr, 8); + status = dumpfile->Append(magicslice); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + + ROCKSDB_NAMESPACE::Slice versionslice(versionstr, 8); + status = dumpfile->Append(versionslice); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + + if (dump_options.anonymous) { + snprintf(json, sizeof(json), "{}"); + } else { + status = env->GetHostName(hostname, sizeof(hostname)); + status = env->GetCurrentTime(×ec); + status = env->GetAbsolutePath(dump_options.db_path, &abspath); + snprintf(json, sizeof(json), + "{ \"database-path\": \"%s\", \"hostname\": \"%s\", " + "\"creation-time\": %" PRIi64 " }", + abspath.c_str(), hostname, timesec); + } + + ROCKSDB_NAMESPACE::Slice infoslice(json, strlen(json)); + char infosize[4]; + ROCKSDB_NAMESPACE::EncodeFixed32(infosize, (uint32_t)infoslice.size()); + ROCKSDB_NAMESPACE::Slice infosizeslice(infosize, 4); + status = dumpfile->Append(infosizeslice); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + status = dumpfile->Append(infoslice); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + + const std::unique_ptr it( + db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions())); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + char keysize[4]; + ROCKSDB_NAMESPACE::EncodeFixed32(keysize, (uint32_t)it->key().size()); + ROCKSDB_NAMESPACE::Slice keysizeslice(keysize, 4); + status = dumpfile->Append(keysizeslice); + if (!status.ok()) { + std::cerr 
<< "Append failed: " << status.ToString() << std::endl; + return false; + } + status = dumpfile->Append(it->key()); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + + char valsize[4]; + ROCKSDB_NAMESPACE::EncodeFixed32(valsize, (uint32_t)it->value().size()); + ROCKSDB_NAMESPACE::Slice valsizeslice(valsize, 4); + status = dumpfile->Append(valsizeslice); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + status = dumpfile->Append(it->value()); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + } + if (!it->status().ok()) { + std::cerr << "Database iteration failed: " << status.ToString() + << std::endl; + return false; + } + return true; +} + +bool DbUndumpTool::Run(const UndumpOptions& undump_options, + ROCKSDB_NAMESPACE::Options options) { + ROCKSDB_NAMESPACE::DB* dbptr; + ROCKSDB_NAMESPACE::Status status; + ROCKSDB_NAMESPACE::Env* env; + std::unique_ptr dumpfile; + ROCKSDB_NAMESPACE::Slice slice; + char scratch8[8]; + + static const char* magicstr = "ROCKDUMP"; + static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1}; + + env = ROCKSDB_NAMESPACE::Env::Default(); + + status = env->NewSequentialFile(undump_options.dump_location, &dumpfile, + ROCKSDB_NAMESPACE::EnvOptions()); + if (!status.ok()) { + std::cerr << "Unable to open dump file '" << undump_options.dump_location + << "' for reading: " << status.ToString() << std::endl; + return false; + } + + status = dumpfile->Read(8, &slice, scratch8); + if (!status.ok() || slice.size() != 8 || + memcmp(slice.data(), magicstr, 8) != 0) { + std::cerr << "File '" << undump_options.dump_location + << "' is not a recognizable dump file." << std::endl; + return false; + } + + status = dumpfile->Read(8, &slice, scratch8); + if (!status.ok() || slice.size() != 8 || + memcmp(slice.data(), versionstr, 8) != 0) { + std::cerr << "File '" << undump_options.dump_location + << "' version not recognized." << std::endl; + return false; + } + + status = dumpfile->Read(4, &slice, scratch8); + if (!status.ok() || slice.size() != 4) { + std::cerr << "Unable to read info blob size." 
<< std::endl; + return false; + } + uint32_t infosize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data()); + status = dumpfile->Skip(infosize); + if (!status.ok()) { + std::cerr << "Unable to skip info blob: " << status.ToString() << std::endl; + return false; + } + + options.create_if_missing = true; + status = ROCKSDB_NAMESPACE::DB::Open(options, undump_options.db_path, &dbptr); + if (!status.ok()) { + std::cerr << "Unable to open database '" << undump_options.db_path + << "' for writing: " << status.ToString() << std::endl; + return false; + } + + const std::unique_ptr db(dbptr); + + uint32_t last_keysize = 64; + size_t last_valsize = 1 << 20; + std::unique_ptr keyscratch(new char[last_keysize]); + std::unique_ptr valscratch(new char[last_valsize]); + + while (1) { + uint32_t keysize, valsize; + ROCKSDB_NAMESPACE::Slice keyslice; + ROCKSDB_NAMESPACE::Slice valslice; + + status = dumpfile->Read(4, &slice, scratch8); + if (!status.ok() || slice.size() != 4) break; + keysize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data()); + if (keysize > last_keysize) { + while (keysize > last_keysize) last_keysize *= 2; + keyscratch = std::unique_ptr(new char[last_keysize]); + } + + status = dumpfile->Read(keysize, &keyslice, keyscratch.get()); + if (!status.ok() || keyslice.size() != keysize) { + std::cerr << "Key read failure: " + << (status.ok() ? "insufficient data" : status.ToString()) + << std::endl; + return false; + } + + status = dumpfile->Read(4, &slice, scratch8); + if (!status.ok() || slice.size() != 4) { + std::cerr << "Unable to read value size: " + << (status.ok() ? "insufficient data" : status.ToString()) + << std::endl; + return false; + } + valsize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data()); + if (valsize > last_valsize) { + while (valsize > last_valsize) last_valsize *= 2; + valscratch = std::unique_ptr(new char[last_valsize]); + } + + status = dumpfile->Read(valsize, &valslice, valscratch.get()); + if (!status.ok() || valslice.size() != valsize) { + std::cerr << "Unable to read value: " + << (status.ok() ? "insufficient data" : status.ToString()) + << std::endl; + return false; + } + + status = db->Put(ROCKSDB_NAMESPACE::WriteOptions(), keyslice, valslice); + if (!status.ok()) { + fprintf(stderr, "Unable to write database entry\n"); + return false; + } + } + + if (undump_options.compact_db) { + status = db->CompactRange(ROCKSDB_NAMESPACE::CompactRangeOptions(), nullptr, + nullptr); + if (!status.ok()) { + fprintf(stderr, + "Unable to compact the database after loading the dumped file\n"); + return false; + } + } + return true; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/io_tracer_parser_tool.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/io_tracer_parser_tool.h new file mode 100644 index 0000000000..c79d4c510c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/io_tracer_parser_tool.h @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/env.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +struct IOTraceHeader; +struct IOTraceRecord; + +// IOTraceRecordParser class reads the IO trace file (in binary format) and +// dumps the human readable records in output_file_. 
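// Minimal usage sketch for the parser declared just below. The trace path is
// hypothetical, and treating the returned int as a process exit status
// (0 on success) is an assumption based on the tool wrapper.
int ParseIOTraceSketch() {
  ROCKSDB_NAMESPACE::IOTraceRecordParser parser("/tmp/io_trace");
  return parser.ReadIOTraceRecords();
}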
+class IOTraceRecordParser { + public: + explicit IOTraceRecordParser(const std::string& input_file); + + // ReadIOTraceRecords reads the binary trace file records one by one and + // invoke PrintHumanReadableIOTraceRecord to dump the records in output_file_. + int ReadIOTraceRecords(); + + private: + void PrintHumanReadableHeader(const IOTraceHeader& header); + void PrintHumanReadableIOTraceRecord(const IOTraceRecord& record); + + // Binary file that contains IO trace records. + std::string input_file_; +}; + +int io_tracer_parser(int argc, char** argv); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/ldb_cmd_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/ldb_cmd_impl.h new file mode 100644 index 0000000000..97de981b1a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/ldb_cmd_impl.h @@ -0,0 +1,744 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/utilities/ldb_cmd.h" + +namespace ROCKSDB_NAMESPACE { + +class CompactorCommand : public LDBCommand { + public: + static std::string Name() { return "compact"; } + + CompactorCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + + void DoCommand() override; + + private: + bool null_from_; + std::string from_; + bool null_to_; + std::string to_; +}; + +class DBFileDumperCommand : public LDBCommand { + public: + static std::string Name() { return "dump_live_files"; } + + DBFileDumperCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + + void DoCommand() override; + + private: + bool decode_blob_index_; + bool dump_uncompressed_blobs_; +}; + +class DBLiveFilesMetadataDumperCommand : public LDBCommand { + public: + static std::string Name() { return "list_live_files_metadata"; } + + DBLiveFilesMetadataDumperCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + + void DoCommand() override; + + private: + bool sort_by_filename_; + + static const std::string ARG_SORT_BY_FILENAME; +}; + +class DBDumperCommand : public LDBCommand { + public: + static std::string Name() { return "dump"; } + + DBDumperCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + + void DoCommand() override; + + private: + /** + * Extract file name from the full path. We handle both the forward slash (/) + * and backslash (\) to make sure that different OS-s are supported. 
+ */ + static std::string GetFileNameFromPath(const std::string& s) { + std::size_t n = s.find_last_of("/\\"); + + if (std::string::npos == n) { + return s; + } else { + return s.substr(n + 1); + } + } + + void DoDumpCommand(); + + bool null_from_; + std::string from_; + bool null_to_; + std::string to_; + int max_keys_; + std::string delim_; + bool count_only_; + bool count_delim_; + bool print_stats_; + std::string path_; + bool decode_blob_index_; + bool dump_uncompressed_blobs_; + + static const std::string ARG_COUNT_ONLY; + static const std::string ARG_COUNT_DELIM; + static const std::string ARG_STATS; + static const std::string ARG_TTL_BUCKET; +}; + +class InternalDumpCommand : public LDBCommand { + public: + static std::string Name() { return "idump"; } + + InternalDumpCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + + void DoCommand() override; + + private: + bool has_from_; + std::string from_; + bool has_to_; + std::string to_; + int max_keys_; + std::string delim_; + bool count_only_; + bool count_delim_; + bool print_stats_; + bool is_input_key_hex_; + bool decode_blob_index_; + + static const std::string ARG_DELIM; + static const std::string ARG_COUNT_ONLY; + static const std::string ARG_COUNT_DELIM; + static const std::string ARG_STATS; + static const std::string ARG_INPUT_KEY_HEX; +}; + +class DBLoaderCommand : public LDBCommand { + public: + static std::string Name() { return "load"; } + + DBLoaderCommand(std::string& db_name, std::vector& args); + + DBLoaderCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + + void DoCommand() override; + + void OverrideBaseOptions() override; + + private: + bool disable_wal_; + bool bulk_load_; + bool compact_; + + static const std::string ARG_DISABLE_WAL; + static const std::string ARG_BULK_LOAD; + static const std::string ARG_COMPACT; +}; + +class ManifestDumpCommand : public LDBCommand { + public: + static std::string Name() { return "manifest_dump"; } + + ManifestDumpCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + + void DoCommand() override; + + bool NoDBOpen() override { return true; } + + private: + bool verbose_; + bool json_; + std::string path_; + + static const std::string ARG_VERBOSE; + static const std::string ARG_JSON; + static const std::string ARG_PATH; +}; + +class UpdateManifestCommand : public LDBCommand { + public: + static std::string Name() { return "update_manifest"; } + + UpdateManifestCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + virtual void DoCommand() override; + + virtual bool NoDBOpen() override { return true; } + + private: + bool verbose_; + bool update_temperatures_; + // TODO future: checksum_func for populating checksums + + static const std::string ARG_VERBOSE; + static const std::string ARG_UPDATE_TEMPERATURES; +}; + +class FileChecksumDumpCommand : public LDBCommand { + public: + static std::string Name() { return "file_checksum_dump"; } + + FileChecksumDumpCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + void DoCommand() override; + + bool NoDBOpen() override { return true; } + + private: + std::string path_; + bool is_checksum_hex_; + + static const std::string ARG_PATH; +}; + +class 
GetPropertyCommand : public LDBCommand { + public: + static std::string Name() { return "get_property"; } + + GetPropertyCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + void DoCommand() override; + + private: + std::string property_; +}; + +class ListColumnFamiliesCommand : public LDBCommand { + public: + static std::string Name() { return "list_column_families"; } + + ListColumnFamiliesCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + + void DoCommand() override; + + bool NoDBOpen() override { return true; } +}; + +class CreateColumnFamilyCommand : public LDBCommand { + public: + static std::string Name() { return "create_column_family"; } + + CreateColumnFamilyCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + + void DoCommand() override; + + bool NoDBOpen() override { return false; } + + private: + std::string new_cf_name_; +}; + +class DropColumnFamilyCommand : public LDBCommand { + public: + static std::string Name() { return "drop_column_family"; } + + DropColumnFamilyCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + + void DoCommand() override; + + bool NoDBOpen() override { return false; } + + private: + std::string cf_name_to_drop_; +}; + +class ReduceDBLevelsCommand : public LDBCommand { + public: + static std::string Name() { return "reduce_levels"; } + + ReduceDBLevelsCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts) override; + + void DoCommand() override; + + bool NoDBOpen() override { return true; } + + static void Help(std::string& msg); + + static std::vector PrepareArgs(const std::string& db_path, + int new_levels, + bool print_old_level = false); + + private: + int old_levels_; + int new_levels_; + bool print_old_levels_; + + static const std::string ARG_NEW_LEVELS; + static const std::string ARG_PRINT_OLD_LEVELS; + + Status GetOldNumOfLevels(Options& opt, int* levels); +}; + +class ChangeCompactionStyleCommand : public LDBCommand { + public: + static std::string Name() { return "change_compaction_style"; } + + ChangeCompactionStyleCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags); + + void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts) override; + + void DoCommand() override; + + static void Help(std::string& msg); + + private: + int old_compaction_style_; + int new_compaction_style_; + + static const std::string ARG_OLD_COMPACTION_STYLE; + static const std::string ARG_NEW_COMPACTION_STYLE; +}; + +class WALDumperCommand : public LDBCommand { + public: + static std::string Name() { return "dump_wal"; } + + WALDumperCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + bool NoDBOpen() override { return true; } + + static void Help(std::string& ret); + + void DoCommand() override; + + private: + bool print_header_; + std::string wal_file_; + bool print_values_; + bool is_write_committed_; // default will be set to true + + static const std::string ARG_WAL_FILE; + static const std::string ARG_WRITE_COMMITTED; + static const std::string ARG_PRINT_HEADER; + static const std::string ARG_PRINT_VALUE; +}; + +class GetCommand : public LDBCommand { + public: + static 
std::string Name() { return "get"; } + + GetCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + static void Help(std::string& ret); + + private: + std::string key_; +}; + +class ApproxSizeCommand : public LDBCommand { + public: + static std::string Name() { return "approxsize"; } + + ApproxSizeCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + static void Help(std::string& ret); + + private: + std::string start_key_; + std::string end_key_; +}; + +class BatchPutCommand : public LDBCommand { + public: + static std::string Name() { return "batchput"; } + + BatchPutCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + static void Help(std::string& ret); + + void OverrideBaseOptions() override; + + private: + /** + * The key-values to be inserted. + */ + std::vector> key_values_; +}; + +class ScanCommand : public LDBCommand { + public: + static std::string Name() { return "scan"; } + + ScanCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + static void Help(std::string& ret); + + private: + std::string start_key_; + std::string end_key_; + bool start_key_specified_; + bool end_key_specified_; + int max_keys_scanned_; + bool no_value_; +}; + +class DeleteCommand : public LDBCommand { + public: + static std::string Name() { return "delete"; } + + DeleteCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + static void Help(std::string& ret); + + private: + std::string key_; +}; + +class SingleDeleteCommand : public LDBCommand { + public: + static std::string Name() { return "singledelete"; } + + SingleDeleteCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + static void Help(std::string& ret); + + private: + std::string key_; +}; + +class DeleteRangeCommand : public LDBCommand { + public: + static std::string Name() { return "deleterange"; } + + DeleteRangeCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + static void Help(std::string& ret); + + private: + std::string begin_key_; + std::string end_key_; +}; + +class PutCommand : public LDBCommand { + public: + static std::string Name() { return "put"; } + + PutCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + static void Help(std::string& ret); + + void OverrideBaseOptions() override; + + private: + std::string key_; + std::string value_; +}; + +/** + * Command that starts up a REPL shell that allows + * get/put/delete. 
+ */ +class DBQuerierCommand : public LDBCommand { + public: + static std::string Name() { return "query"; } + + DBQuerierCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + + void DoCommand() override; + + private: + static const char* HELP_CMD; + static const char* GET_CMD; + static const char* PUT_CMD; + static const char* DELETE_CMD; +}; + +class CheckConsistencyCommand : public LDBCommand { + public: + static std::string Name() { return "checkconsistency"; } + + CheckConsistencyCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + bool NoDBOpen() override { return true; } + + static void Help(std::string& ret); +}; + +class CheckPointCommand : public LDBCommand { + public: + static std::string Name() { return "checkpoint"; } + + CheckPointCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + static void Help(std::string& ret); + + std::string checkpoint_dir_; + + private: + static const std::string ARG_CHECKPOINT_DIR; +}; + +class RepairCommand : public LDBCommand { + public: + static std::string Name() { return "repair"; } + + RepairCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + bool NoDBOpen() override { return true; } + + void OverrideBaseOptions() override; + + static void Help(std::string& ret); + + protected: + bool verbose_; + + private: + static const std::string ARG_VERBOSE; +}; + +class BackupEngineCommand : public LDBCommand { + public: + BackupEngineCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + protected: + static void Help(const std::string& name, std::string& ret); + std::string backup_env_uri_; + std::string backup_fs_uri_; + std::string backup_dir_; + int num_threads_; + std::unique_ptr logger_; + std::shared_ptr backup_env_guard_; + + private: + static const std::string ARG_BACKUP_DIR; + static const std::string ARG_BACKUP_ENV_URI; + static const std::string ARG_BACKUP_FS_URI; + static const std::string ARG_NUM_THREADS; + static const std::string ARG_STDERR_LOG_LEVEL; +}; + +class BackupCommand : public BackupEngineCommand { + public: + static std::string Name() { return "backup"; } + BackupCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + void DoCommand() override; + static void Help(std::string& ret); +}; + +class RestoreCommand : public BackupEngineCommand { + public: + static std::string Name() { return "restore"; } + RestoreCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + void DoCommand() override; + bool NoDBOpen() override { return true; } + static void Help(std::string& ret); +}; + +class WriteExternalSstFilesCommand : public LDBCommand { + public: + static std::string Name() { return "write_extern_sst"; } + WriteExternalSstFilesCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + bool NoDBOpen() override { return false; } + + void OverrideBaseOptions() override; + + static void Help(std::string& ret); + + private: + std::string output_sst_path_; +}; + +class IngestExternalSstFilesCommand : public LDBCommand { + public: + static std::string Name() { return "ingest_extern_sst"; } + IngestExternalSstFilesCommand( + const std::vector& params, + const 
std::map<std::string, std::string>& options,
+                                const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return false; }
+
+  void OverrideBaseOptions() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string input_sst_path_;
+  bool move_files_;
+  bool snapshot_consistency_;
+  bool allow_global_seqno_;
+  bool allow_blocking_flush_;
+  bool ingest_behind_;
+  bool write_global_seqno_;
+
+  static const std::string ARG_MOVE_FILES;
+  static const std::string ARG_SNAPSHOT_CONSISTENCY;
+  static const std::string ARG_ALLOW_GLOBAL_SEQNO;
+  static const std::string ARG_ALLOW_BLOCKING_FLUSH;
+  static const std::string ARG_INGEST_BEHIND;
+  static const std::string ARG_WRITE_GLOBAL_SEQNO;
+};
+
+// Command that prints out range delete tombstones in SST files.
+class ListFileRangeDeletesCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "list_file_range_deletes"; }
+
+  ListFileRangeDeletesCommand(const std::map<std::string, std::string>& options,
+                              const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  int max_keys_ = 1000;
+};
+
+// Command that removes the SST file forcibly from the manifest.
+class UnsafeRemoveSstFileCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "unsafe_remove_sst_file"; }
+
+  UnsafeRemoveSstFileCommand(const std::vector<std::string>& params,
+                             const std::map<std::string, std::string>& options,
+                             const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+ private:
+  uint64_t sst_file_number_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/simulated_hybrid_file_system.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/simulated_hybrid_file_system.h
new file mode 100644
index 0000000000..44b37eadeb
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/simulated_hybrid_file_system.h
@@ -0,0 +1,124 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+
+#include
+
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A FileSystem that simulates a hybrid file system by injecting latency and
+// limiting IOPS.
+// This class is only used for development purposes and should not be used
+// in production.
+// Right now we inject 15ms of latency and allow 100 requests per second when
+// the file is at the warm temperature.
+// When the object is destroyed, the list of warm files is written to a
+// file, which can be used to reopen a FileSystem and still recover the
+// list. This allows the information to be preserved across db_bench runs.
+class SimulatedHybridFileSystem : public FileSystemWrapper {
+ public:
+  // metadata_file_name stores metadata of the files, so that it can be
+  // loaded after the process restarts. If the file doesn't exist, create
+  // one. The file is written when the class is destroyed.
+  // throughput_multiplier: multiplier of throughput. For example, 1 simulates
+  // a single disk spindle, while 4 simulates 4 disk spindles.
+  // is_full_fs_warm: if true, all files are included in the slow I/O
+  // simulation.
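+  //
+  // An illustrative construction (editorial sketch; the values shown are
+  // hypothetical, only the signature below comes from this header):
+  //   auto fs = std::make_shared<SimulatedHybridFileSystem>(
+  //       FileSystem::Default(), "/tmp/hybrid_fs_metadata",
+  //       /*throughput_multiplier=*/1, /*is_full_fs_warm=*/false);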
+  SimulatedHybridFileSystem(const std::shared_ptr<FileSystem>& base,
+                            const std::string& metadata_file_name,
+                            int throughput_multiplier, bool is_full_fs_warm);
+
+  ~SimulatedHybridFileSystem() override;
+
+ public:
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override;
+  IOStatus NewWritableFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override;
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override;
+
+  const char* Name() const override { return name_.c_str(); }
+
+ private:
+  // Limit to 100 requests per second. The rate limiter operates on bytes, so
+  // we use it as if one request were a fixed number of bytes.
+  std::shared_ptr<RateLimiter> rate_limiter_;
+  std::mutex mutex_;
+  std::unordered_set<std::string> warm_file_set_;
+  std::string metadata_file_name_;
+  std::string name_;
+  bool is_full_fs_warm_;
+};
+
+// Simulated random access file that can control IOPS and latency to simulate
+// specific storage media.
+class SimulatedHybridRaf : public FSRandomAccessFileOwnerWrapper {
+ public:
+  SimulatedHybridRaf(std::unique_ptr<FSRandomAccessFile>&& t,
+                     std::shared_ptr<RateLimiter> rate_limiter,
+                     Temperature temperature)
+      : FSRandomAccessFileOwnerWrapper(std::move(t)),
+        rate_limiter_(rate_limiter),
+        temperature_(temperature) {}
+
+  ~SimulatedHybridRaf() override {}
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+ private:
+  std::shared_ptr<RateLimiter> rate_limiter_;
+  Temperature temperature_;
+
+  void SimulateIOWait(int64_t num_requests) const;
+};
+
+class SimulatedWritableFile : public FSWritableFileWrapper {
+ public:
+  SimulatedWritableFile(std::unique_ptr<FSWritableFile>&& t,
+                        std::shared_ptr<RateLimiter> rate_limiter)
+      : FSWritableFileWrapper(t.get()),
+        file_guard_(std::move(t)),
+        rate_limiter_(rate_limiter) {}
+  IOStatus Append(const Slice& data, const IOOptions&,
+                  IODebugContext*) override;
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  const DataVerificationInfo& verification_info,
+                  IODebugContext* dbg) override;
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override;
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            const DataVerificationInfo& verification_info,
+                            IODebugContext* dbg) override;
+
+ private:
+  std::unique_ptr<FSWritableFile> file_guard_;
+  std::shared_ptr<RateLimiter> rate_limiter_;
+  size_t unsynced_bytes = 0;
+
+  void SimulateIOWait(int64_t num_requests) const;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/trace_analyzer_tool.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/trace_analyzer_tool.h
new file mode 100644
index 0000000000..2c3042bdc5
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/tools/trace_analyzer_tool.h
@@ -0,0 +1,329 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/trace_reader_writer.h" +#include "rocksdb/trace_record.h" +#include "rocksdb/write_batch.h" +#include "trace_replay/trace_replay.h" + +namespace ROCKSDB_NAMESPACE { + +// Value sizes may be used as denominators. Replacing 0 value sizes with this +// positive integer avoids division error. +extern const size_t kShadowValueSize /* = 10*/; + +enum TraceOperationType : int { + kGet = 0, + kPut = 1, + kDelete = 2, + kSingleDelete = 3, + kRangeDelete = 4, + kMerge = 5, + kIteratorSeek = 6, + kIteratorSeekForPrev = 7, + kMultiGet = 8, + kPutEntity = 9, + kTaTypeNum = 10 +}; + +struct TraceUnit { + uint64_t ts; + uint32_t type; + uint32_t cf_id; + size_t value_size; + std::string key; +}; + +struct TypeCorrelation { + uint64_t count; + uint64_t total_ts; +}; + +struct StatsUnit { + uint64_t key_id; + uint64_t access_count; + uint64_t latest_ts; + uint64_t succ_count; // current only used to count Get if key found + uint32_t cf_id; + size_t value_size; + std::vector v_correlation; +}; + +class AnalyzerOptions { + public: + std::vector> correlation_map; + std::vector> correlation_list; + + AnalyzerOptions(); + + ~AnalyzerOptions(); + + void SparseCorrelationInput(const std::string& in_str); +}; + +// Note that, for the variable names in the trace_analyzer, +// Starting with 'a_' means the variable is used for 'accessed_keys'. +// Starting with 'w_' means it is used for 'the whole key space'. +// Ending with '_f' means a file write or reader pointer. +// For example, 'a_count' means 'accessed_keys_count', +// 'w_key_f' means 'whole_key_space_file'. + +struct TraceStats { + uint32_t cf_id; + std::string cf_name; + uint64_t a_count; + uint64_t a_succ_count; + uint64_t a_key_id; + uint64_t a_key_size_sqsum; + uint64_t a_key_size_sum; + uint64_t a_key_mid; + uint64_t a_value_size_sqsum; + uint64_t a_value_size_sum; + uint64_t a_value_mid; + uint32_t a_peak_qps; + double a_ave_qps; + std::map a_key_stats; + std::map a_count_stats; + std::map a_key_size_stats; + std::map a_value_size_stats; + std::map a_qps_stats; + std::map> a_qps_prefix_stats; + std::priority_queue, + std::vector>, + std::greater>> + top_k_queue; + std::priority_queue, + std::vector>, + std::greater>> + top_k_prefix_access; + std::priority_queue, + std::vector>, + std::greater>> + top_k_prefix_ave; + std::priority_queue, + std::vector>, + std::greater>> + top_k_qps_sec; + std::list time_series; + std::vector> correlation_output; + std::map uni_key_num; + + std::unique_ptr time_series_f; + std::unique_ptr a_key_f; + std::unique_ptr a_count_dist_f; + std::unique_ptr a_prefix_cut_f; + std::unique_ptr a_value_size_f; + std::unique_ptr a_key_size_f; + std::unique_ptr a_key_num_f; + std::unique_ptr a_qps_f; + std::unique_ptr a_top_qps_prefix_f; + std::unique_ptr w_key_f; + std::unique_ptr w_prefix_cut_f; + + TraceStats(); + ~TraceStats(); + TraceStats(const TraceStats&) = delete; + TraceStats& operator=(const TraceStats&) = delete; + TraceStats(TraceStats&&) = default; + TraceStats& operator=(TraceStats&&) = default; +}; + +struct TypeUnit { + std::string type_name; + bool enabled; + uint64_t total_keys; + uint64_t total_access; + uint64_t total_succ_access; + uint32_t sample_count; + std::map stats; + TypeUnit() = default; + ~TypeUnit() = default; + TypeUnit(const TypeUnit&) = delete; + TypeUnit& operator=(const TypeUnit&) = delete; + TypeUnit(TypeUnit&&) = default; + TypeUnit& operator=(TypeUnit&&) = default; +}; + 
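+// Illustrative helper (an editorial sketch, not part of the upstream file):
+// it shows how a per-second QPS histogram shaped like TraceStats::a_qps_stats
+// (assumed here to map second -> request count) can be folded into the peak
+// and average values that TraceStats keeps in a_peak_qps / a_ave_qps. The
+// name ReduceQpsSketch and its placement are hypothetical.
+inline void ReduceQpsSketch(const std::map<uint32_t, uint32_t>& qps_stats,
+                            uint32_t* peak_qps, double* ave_qps) {
+  uint64_t total = 0;
+  *peak_qps = 0;
+  for (const auto& second_and_count : qps_stats) {
+    if (second_and_count.second > *peak_qps) {
+      *peak_qps = second_and_count.second;  // busiest observed second
+    }
+    total += second_and_count.second;  // accumulate for the mean
+  }
+  *ave_qps = qps_stats.empty() ? 0.0
+                               : static_cast<double>(total) /
+                                     static_cast<double>(qps_stats.size());
+}
+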
+struct CfUnit {
+  uint32_t cf_id;
+  uint64_t w_count;  // total keys in this cf if we use the whole key space
+  uint64_t a_count;  // the total keys in this cf that are accessed
+  std::map w_key_size_stats;  // whole key space key size
+                              // statistics for this cf
+  std::map cf_qps;
+};
+
+class TraceAnalyzer : private TraceRecord::Handler,
+                      private WriteBatch::Handler {
+ public:
+  TraceAnalyzer(std::string& trace_path, std::string& output_path,
+                AnalyzerOptions _analyzer_opts);
+  ~TraceAnalyzer();
+
+  Status PrepareProcessing();
+
+  Status StartProcessing();
+
+  Status MakeStatistics();
+
+  Status ReProcessing();
+
+  Status EndProcessing();
+
+  Status WriteTraceUnit(TraceUnit& unit);
+
+  std::vector<TypeUnit>& GetTaVector() { return ta_; }
+
+ private:
+  using TraceRecord::Handler::Handle;
+  Status Handle(const WriteQueryTraceRecord& record,
+                std::unique_ptr<TraceRecordResult>* result) override;
+  Status Handle(const GetQueryTraceRecord& record,
+                std::unique_ptr<TraceRecordResult>* result) override;
+  Status Handle(const IteratorSeekQueryTraceRecord& record,
+                std::unique_ptr<TraceRecordResult>* result) override;
+  Status Handle(const MultiGetQueryTraceRecord& record,
+                std::unique_ptr<TraceRecordResult>* result) override;
+
+  using WriteBatch::Handler::PutCF;
+  Status PutCF(uint32_t column_family_id, const Slice& key,
+               const Slice& value) override;
+
+  using WriteBatch::Handler::PutEntityCF;
+  Status PutEntityCF(uint32_t column_family_id, const Slice& key,
+                     const Slice& value) override;
+
+  using WriteBatch::Handler::DeleteCF;
+  Status DeleteCF(uint32_t column_family_id, const Slice& key) override;
+
+  using WriteBatch::Handler::SingleDeleteCF;
+  Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override;
+
+  using WriteBatch::Handler::DeleteRangeCF;
+  Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+                       const Slice& end_key) override;
+
+  using WriteBatch::Handler::MergeCF;
+  Status MergeCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& value) override;
+
+  // The following handlers are not implemented; they return Status::OK() to
+  // avoid the runtime assertion and other irrelevant failures.
+  using WriteBatch::Handler::PutBlobIndexCF;
+  Status PutBlobIndexCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+                        const Slice& /*value*/) override {
+    return Status::OK();
+  }
+
+  // The default implementation of LogData does nothing.
+  using WriteBatch::Handler::LogData;
+  void LogData(const Slice& /*blob*/) override {}
+
+  using WriteBatch::Handler::MarkBeginPrepare;
+  Status MarkBeginPrepare(bool = false) override { return Status::OK(); }
+
+  using WriteBatch::Handler::MarkEndPrepare;
+  Status MarkEndPrepare(const Slice& /*xid*/) override { return Status::OK(); }
+
+  using WriteBatch::Handler::MarkNoop;
+  Status MarkNoop(bool /*empty_batch*/) override { return Status::OK(); }
+
+  using WriteBatch::Handler::MarkRollback;
+  Status MarkRollback(const Slice& /*xid*/) override { return Status::OK(); }
+
+  using WriteBatch::Handler::MarkCommit;
+  Status MarkCommit(const Slice& /*xid*/) override { return Status::OK(); }
+
+  using WriteBatch::Handler::MarkCommitWithTimestamp;
+  Status MarkCommitWithTimestamp(const Slice& /*xid*/,
+                                 const Slice& /*commit_ts*/) override {
+    return Status::OK();
+  }
+
+  // Process each trace operation and output the analysis result to
+  // stdout/files.
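+  // (The first overload below takes the batched form used for MultiGet-style
+  // accesses, with parallel vectors of cf_ids, keys, and value sizes; the
+  // second takes a single key access.)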
+ Status OutputAnalysisResult(TraceOperationType op_type, uint64_t timestamp, + std::vector cf_ids, + std::vector keys, + std::vector value_sizes); + + Status OutputAnalysisResult(TraceOperationType op_type, uint64_t timestamp, + uint32_t cf_id, const Slice& key, + size_t value_size); + + ROCKSDB_NAMESPACE::Env* env_; + EnvOptions env_options_; + std::unique_ptr trace_reader_; + size_t offset_; + char buffer_[1024]; + // Timestamp of a WriteBatch, used in its iteration. + uint64_t write_batch_ts_; + std::string trace_name_; + std::string output_path_; + AnalyzerOptions analyzer_opts_; + uint64_t total_requests_; + uint64_t total_access_keys_; + uint64_t total_gets_; + uint64_t total_writes_; + uint64_t total_seeks_; + uint64_t total_seek_prevs_; + uint64_t total_multigets_; + uint64_t trace_create_time_; + uint64_t begin_time_; + uint64_t end_time_; + uint64_t time_series_start_; + uint32_t sample_max_; + uint32_t cur_time_sec_; + std::unique_ptr + trace_sequence_f_; // readable trace + std::unique_ptr qps_f_; // overall qps + std::unique_ptr + cf_qps_f_; // The qps of each CF> + std::vector ta_; // The main statistic collecting data structure + std::map cfs_; // All the cf_id appears in this trace; + std::vector qps_peak_; + std::vector qps_ave_; + + Status ReadTraceHeader(Trace* header); + Status ReadTraceFooter(Trace* footer); + Status ReadTraceRecord(Trace* trace); + Status KeyStatsInsertion(const uint32_t& type, const uint32_t& cf_id, + const std::string& key, const size_t value_size, + const uint64_t ts); + Status StatsUnitCorrelationUpdate(StatsUnit& unit, const uint32_t& type, + const uint64_t& ts, const std::string& key); + Status OpenStatsOutputFiles(const std::string& type, TraceStats& new_stats); + Status CreateOutputFile( + const std::string& type, const std::string& cf_name, + const std::string& ending, + std::unique_ptr* f_ptr); + Status CloseOutputFiles(); + + void PrintStatistics(); + Status TraceUnitWriter( + std::unique_ptr& f_ptr, TraceUnit& unit); + Status WriteTraceSequence(const uint32_t& type, const uint32_t& cf_id, + const Slice& key, const size_t value_size, + const uint64_t ts); + Status MakeStatisticKeyStatsOrPrefix(TraceStats& stats); + Status MakeStatisticCorrelation(TraceStats& stats, StatsUnit& unit); + Status MakeStatisticQPS(); + int db_version_; +}; + +int trace_analyzer_tool(int argc, char** argv); + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/block_cache_tracer.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/block_cache_tracer.cc new file mode 100644 index 0000000000..6a3442c5b3 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/block_cache_tracer.cc @@ -0,0 +1,509 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "trace_replay/block_cache_tracer.h" + +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "rocksdb/slice.h" +#include "rocksdb/trace_record.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +bool ShouldTrace(const Slice& block_key, + const BlockCacheTraceOptions& trace_options) { + if (trace_options.sampling_frequency == 0 || + trace_options.sampling_frequency == 1) { + return true; + } + // We use spatial downsampling so that we have a complete access history for a + // block. + return 0 == GetSliceRangedNPHash(block_key, trace_options.sampling_frequency); +} +} // namespace + +const uint64_t kMicrosInSecond = 1000 * 1000; +const uint64_t kSecondInMinute = 60; +const uint64_t kSecondInHour = 3600; +const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName = + "UnknownColumnFamily"; +const uint64_t BlockCacheTraceRecord::kReservedGetId = 0; +const uint64_t BlockCacheTraceHelper::kReservedGetId = 0; + +bool BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock( + TraceType block_type, TableReaderCaller caller) { + return (block_type == TraceType::kBlockTraceDataBlock) && + IsGetOrMultiGet(caller); +} + +bool BlockCacheTraceHelper::IsGetOrMultiGet(TableReaderCaller caller) { + return caller == TableReaderCaller::kUserGet || + caller == TableReaderCaller::kUserMultiGet; +} + +bool BlockCacheTraceHelper::IsUserAccess(TableReaderCaller caller) { + return caller == TableReaderCaller::kUserGet || + caller == TableReaderCaller::kUserMultiGet || + caller == TableReaderCaller::kUserIterator || + caller == TableReaderCaller::kUserApproximateSize || + caller == TableReaderCaller::kUserVerifyChecksum; +} + +std::string BlockCacheTraceHelper::ComputeRowKey( + const BlockCacheTraceRecord& access) { + if (!IsGetOrMultiGet(access.caller)) { + return ""; + } + Slice key = ExtractUserKey(access.referenced_key); + return std::to_string(access.sst_fd_number) + "_" + key.ToString(); +} + +uint64_t BlockCacheTraceHelper::GetTableId( + const BlockCacheTraceRecord& access) { + if (!IsGetOrMultiGet(access.caller) || access.referenced_key.size() < 4) { + return 0; + } + return static_cast(DecodeFixed32(access.referenced_key.data())) + 1; +} + +uint64_t BlockCacheTraceHelper::GetSequenceNumber( + const BlockCacheTraceRecord& access) { + if (!IsGetOrMultiGet(access.caller)) { + return 0; + } + if (access.caller == TableReaderCaller::kUserMultiGet && + access.referenced_key.size() < 4) { + return 0; + } + return access.get_from_user_specified_snapshot + ? 
1 + GetInternalKeySeqno(access.referenced_key) + : 0; +} + +uint64_t BlockCacheTraceHelper::GetBlockOffsetInFile( + const BlockCacheTraceRecord& access) { + Slice input(access.block_key); + uint64_t offset = 0; + while (true) { + uint64_t tmp = 0; + if (GetVarint64(&input, &tmp)) { + offset = tmp; + } else { + break; + } + } + return offset; +} + +BlockCacheTraceWriterImpl::BlockCacheTraceWriterImpl( + SystemClock* clock, const BlockCacheTraceWriterOptions& trace_options, + std::unique_ptr&& trace_writer) + : clock_(clock), + trace_options_(trace_options), + trace_writer_(std::move(trace_writer)) {} + +Status BlockCacheTraceWriterImpl::WriteBlockAccess( + const BlockCacheTraceRecord& record, const Slice& block_key, + const Slice& cf_name, const Slice& referenced_key) { + uint64_t trace_file_size = trace_writer_->GetFileSize(); + if (trace_file_size > trace_options_.max_trace_file_size) { + return Status::OK(); + } + Trace trace; + trace.ts = record.access_timestamp; + trace.type = record.block_type; + PutLengthPrefixedSlice(&trace.payload, block_key); + PutFixed64(&trace.payload, record.block_size); + PutFixed64(&trace.payload, record.cf_id); + PutLengthPrefixedSlice(&trace.payload, cf_name); + PutFixed32(&trace.payload, record.level); + PutFixed64(&trace.payload, record.sst_fd_number); + trace.payload.push_back(record.caller); + trace.payload.push_back(record.is_cache_hit); + trace.payload.push_back(record.no_insert); + if (BlockCacheTraceHelper::IsGetOrMultiGet(record.caller)) { + PutFixed64(&trace.payload, record.get_id); + trace.payload.push_back(record.get_from_user_specified_snapshot); + PutLengthPrefixedSlice(&trace.payload, referenced_key); + } + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(record.block_type, + record.caller)) { + PutFixed64(&trace.payload, record.referenced_data_size); + PutFixed64(&trace.payload, record.num_keys_in_block); + trace.payload.push_back(record.referenced_key_exist_in_block); + } + std::string encoded_trace; + TracerHelper::EncodeTrace(trace, &encoded_trace); + return trace_writer_->Write(encoded_trace); +} + +Status BlockCacheTraceWriterImpl::WriteHeader() { + Trace trace; + trace.ts = clock_->NowMicros(); + trace.type = TraceType::kTraceBegin; + PutLengthPrefixedSlice(&trace.payload, kTraceMagic); + PutFixed32(&trace.payload, kMajorVersion); + PutFixed32(&trace.payload, kMinorVersion); + std::string encoded_trace; + TracerHelper::EncodeTrace(trace, &encoded_trace); + return trace_writer_->Write(encoded_trace); +} + +BlockCacheTraceReader::BlockCacheTraceReader( + std::unique_ptr&& reader) + : trace_reader_(std::move(reader)) {} + +Status BlockCacheTraceReader::ReadHeader(BlockCacheTraceHeader* header) { + assert(header != nullptr); + std::string encoded_trace; + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + Trace trace; + s = TracerHelper::DecodeTrace(encoded_trace, &trace); + if (!s.ok()) { + return s; + } + header->start_time = trace.ts; + Slice enc_slice = Slice(trace.payload); + Slice magnic_number; + if (!GetLengthPrefixedSlice(&enc_slice, &magnic_number)) { + return Status::Corruption( + "Corrupted header in the trace file: Failed to read the magic number."); + } + if (magnic_number.ToString() != kTraceMagic) { + return Status::Corruption( + "Corrupted header in the trace file: Magic number does not match."); + } + if (!GetFixed32(&enc_slice, &header->rocksdb_major_version)) { + return Status::Corruption( + "Corrupted header in the trace file: Failed to read rocksdb major " + "version number."); + } 
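+  // (Per WriteHeader above, the payload is a length-prefixed magic string
+  // followed by two fixed32 version fields; the minor version is read next.)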
+ if (!GetFixed32(&enc_slice, &header->rocksdb_minor_version)) { + return Status::Corruption( + "Corrupted header in the trace file: Failed to read rocksdb minor " + "version number."); + } + // We should have retrieved all information in the header. + if (!enc_slice.empty()) { + return Status::Corruption( + "Corrupted header in the trace file: The length of header is too " + "long."); + } + return Status::OK(); +} + +Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { + assert(record); + std::string encoded_trace; + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + Trace trace; + s = TracerHelper::DecodeTrace(encoded_trace, &trace); + if (!s.ok()) { + return s; + } + record->access_timestamp = trace.ts; + record->block_type = trace.type; + Slice enc_slice = Slice(trace.payload); + + const unsigned int kCharSize = 1; + + Slice block_key; + if (!GetLengthPrefixedSlice(&enc_slice, &block_key)) { + return Status::Incomplete( + "Incomplete access record: Failed to read block key."); + } + record->block_key = block_key.ToString(); + if (!GetFixed64(&enc_slice, &record->block_size)) { + return Status::Incomplete( + "Incomplete access record: Failed to read block size."); + } + if (!GetFixed64(&enc_slice, &record->cf_id)) { + return Status::Incomplete( + "Incomplete access record: Failed to read column family ID."); + } + Slice cf_name; + if (!GetLengthPrefixedSlice(&enc_slice, &cf_name)) { + return Status::Incomplete( + "Incomplete access record: Failed to read column family name."); + } + record->cf_name = cf_name.ToString(); + if (!GetFixed32(&enc_slice, &record->level)) { + return Status::Incomplete( + "Incomplete access record: Failed to read level."); + } + if (!GetFixed64(&enc_slice, &record->sst_fd_number)) { + return Status::Incomplete( + "Incomplete access record: Failed to read SST file number."); + } + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read caller."); + } + record->caller = static_cast(enc_slice[0]); + enc_slice.remove_prefix(kCharSize); + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read is_cache_hit."); + } + record->is_cache_hit = static_cast(enc_slice[0]); + enc_slice.remove_prefix(kCharSize); + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read no_insert."); + } + record->no_insert = static_cast(enc_slice[0]); + enc_slice.remove_prefix(kCharSize); + if (BlockCacheTraceHelper::IsGetOrMultiGet(record->caller)) { + if (!GetFixed64(&enc_slice, &record->get_id)) { + return Status::Incomplete( + "Incomplete access record: Failed to read the get id."); + } + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read " + "get_from_user_specified_snapshot."); + } + record->get_from_user_specified_snapshot = static_cast(enc_slice[0]); + enc_slice.remove_prefix(kCharSize); + Slice referenced_key; + if (!GetLengthPrefixedSlice(&enc_slice, &referenced_key)) { + return Status::Incomplete( + "Incomplete access record: Failed to read the referenced key."); + } + record->referenced_key = referenced_key.ToString(); + } + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(record->block_type, + record->caller)) { + if (!GetFixed64(&enc_slice, &record->referenced_data_size)) { + return Status::Incomplete( + "Incomplete access record: Failed to read the referenced data size."); + } + if (!GetFixed64(&enc_slice, &record->num_keys_in_block)) { + return 
Status::Incomplete( + "Incomplete access record: Failed to read the number of keys in the " + "block."); + } + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read " + "referenced_key_exist_in_block."); + } + record->referenced_key_exist_in_block = static_cast(enc_slice[0]); + } + return Status::OK(); +} + +BlockCacheHumanReadableTraceWriter::~BlockCacheHumanReadableTraceWriter() { + if (human_readable_trace_file_writer_) { + human_readable_trace_file_writer_->Flush().PermitUncheckedError(); + human_readable_trace_file_writer_->Close().PermitUncheckedError(); + } +} + +Status BlockCacheHumanReadableTraceWriter::NewWritableFile( + const std::string& human_readable_trace_file_path, + ROCKSDB_NAMESPACE::Env* env) { + if (human_readable_trace_file_path.empty()) { + return Status::InvalidArgument( + "The provided human_readable_trace_file_path is null."); + } + return env->NewWritableFile(human_readable_trace_file_path, + &human_readable_trace_file_writer_, EnvOptions()); +} + +Status BlockCacheHumanReadableTraceWriter::WriteHumanReadableTraceRecord( + const BlockCacheTraceRecord& access, uint64_t block_id, + uint64_t get_key_id) { + if (!human_readable_trace_file_writer_) { + return Status::OK(); + } + int ret = snprintf( + trace_record_buffer_, sizeof(trace_record_buffer_), + "%" PRIu64 ",%" PRIu64 ",%u,%" PRIu64 ",%" PRIu64 ",%s,%" PRIu32 + ",%" PRIu64 ",%u,%u,%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%u,%u,%" PRIu64 + ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n", + access.access_timestamp, block_id, access.block_type, access.block_size, + access.cf_id, access.cf_name.c_str(), access.level, access.sst_fd_number, + access.caller, access.no_insert, access.get_id, get_key_id, + access.referenced_data_size, access.is_cache_hit, + access.referenced_key_exist_in_block, access.num_keys_in_block, + BlockCacheTraceHelper::GetTableId(access), + BlockCacheTraceHelper::GetSequenceNumber(access), + static_cast(access.block_key.size()), + static_cast(access.referenced_key.size()), + BlockCacheTraceHelper::GetBlockOffsetInFile(access)); + if (ret < 0) { + return Status::IOError("failed to format the output"); + } + std::string printout(trace_record_buffer_); + return human_readable_trace_file_writer_->Append(printout); +} + +BlockCacheHumanReadableTraceReader::BlockCacheHumanReadableTraceReader( + const std::string& trace_file_path) + : BlockCacheTraceReader(/*trace_reader=*/nullptr) { + human_readable_trace_reader_.open(trace_file_path, std::ifstream::in); +} + +BlockCacheHumanReadableTraceReader::~BlockCacheHumanReadableTraceReader() { + human_readable_trace_reader_.close(); +} + +Status BlockCacheHumanReadableTraceReader::ReadHeader( + BlockCacheTraceHeader* /*header*/) { + return Status::OK(); +} + +Status BlockCacheHumanReadableTraceReader::ReadAccess( + BlockCacheTraceRecord* record) { + std::string line; + if (!std::getline(human_readable_trace_reader_, line)) { + return Status::Incomplete("No more records to read."); + } + std::stringstream ss(line); + std::vector record_strs; + while (ss.good()) { + std::string substr; + getline(ss, substr, ','); + record_strs.push_back(substr); + } + if (record_strs.size() != 21) { + return Status::Incomplete("Records format is wrong."); + } + + record->access_timestamp = ParseUint64(record_strs[0]); + uint64_t block_key = ParseUint64(record_strs[1]); + record->block_type = static_cast(ParseUint64(record_strs[2])); + record->block_size = ParseUint64(record_strs[3]); + record->cf_id = 
ParseUint64(record_strs[4]); + record->cf_name = record_strs[5]; + record->level = static_cast(ParseUint64(record_strs[6])); + record->sst_fd_number = ParseUint64(record_strs[7]); + record->caller = static_cast(ParseUint64(record_strs[8])); + record->no_insert = static_cast(ParseUint64(record_strs[9])); + record->get_id = ParseUint64(record_strs[10]); + uint64_t get_key_id = ParseUint64(record_strs[11]); + + record->referenced_data_size = ParseUint64(record_strs[12]); + record->is_cache_hit = static_cast(ParseUint64(record_strs[13])); + record->referenced_key_exist_in_block = + static_cast(ParseUint64(record_strs[14])); + record->num_keys_in_block = ParseUint64(record_strs[15]); + uint64_t table_id = ParseUint64(record_strs[16]); + if (table_id > 0) { + // Decrement since valid table id in the trace file equals traced table id + // + 1. + table_id -= 1; + } + uint64_t get_sequence_number = ParseUint64(record_strs[17]); + if (get_sequence_number > 0) { + record->get_from_user_specified_snapshot = true; + // Decrement since valid seq number in the trace file equals traced seq + // number + 1. + get_sequence_number -= 1; + } + uint64_t block_key_size = ParseUint64(record_strs[18]); + uint64_t get_key_size = ParseUint64(record_strs[19]); + uint64_t block_offset = ParseUint64(record_strs[20]); + + std::string tmp_block_key; + PutVarint64(&tmp_block_key, block_key); + PutVarint64(&tmp_block_key, block_offset); + // Append 1 until the size is the same as traced block key size. + while (record->block_key.size() < block_key_size - tmp_block_key.size()) { + record->block_key += "1"; + } + record->block_key += tmp_block_key; + + if (get_key_id != 0) { + std::string tmp_get_key; + PutFixed64(&tmp_get_key, get_key_id); + PutFixed64(&tmp_get_key, get_sequence_number << 8); + PutFixed32(&record->referenced_key, static_cast(table_id)); + // Append 1 until the size is the same as traced key size. + while (record->referenced_key.size() < get_key_size - tmp_get_key.size()) { + record->referenced_key += "1"; + } + record->referenced_key += tmp_get_key; + } + return Status::OK(); +} + +BlockCacheTracer::BlockCacheTracer() { writer_.store(nullptr); } + +BlockCacheTracer::~BlockCacheTracer() { EndTrace(); } + +Status BlockCacheTracer::StartTrace( + const BlockCacheTraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + InstrumentedMutexLock lock_guard(&trace_writer_mutex_); + if (writer_.load()) { + return Status::Busy(); + } + get_id_counter_.store(1); + trace_options_ = trace_options; + writer_.store(trace_writer.release()); + return writer_.load()->WriteHeader(); +} + +void BlockCacheTracer::EndTrace() { + InstrumentedMutexLock lock_guard(&trace_writer_mutex_); + if (!writer_.load()) { + return; + } + delete writer_.load(); + writer_.store(nullptr); +} + +Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record, + const Slice& block_key, + const Slice& cf_name, + const Slice& referenced_key) { + if (!writer_.load() || !ShouldTrace(block_key, trace_options_)) { + return Status::OK(); + } + InstrumentedMutexLock lock_guard(&trace_writer_mutex_); + if (!writer_.load()) { + return Status::OK(); + } + return writer_.load()->WriteBlockAccess(record, block_key, cf_name, + referenced_key); +} + +uint64_t BlockCacheTracer::NextGetId() { + if (!writer_.load(std::memory_order_relaxed)) { + return BlockCacheTraceHelper::kReservedGetId; + } + uint64_t prev_value = get_id_counter_.fetch_add(1); + if (prev_value == BlockCacheTraceHelper::kReservedGetId) { + // fetch and add again. 
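+    // (The counter has wrapped around to kReservedGetId (0), which callers
+    // interpret as "tracing disabled", so skip it by incrementing once more.)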
+ return get_id_counter_.fetch_add(1); + } + return prev_value; +} + +std::unique_ptr NewBlockCacheTraceWriter( + SystemClock* clock, const BlockCacheTraceWriterOptions& trace_options, + std::unique_ptr&& trace_writer) { + return std::unique_ptr(new BlockCacheTraceWriterImpl( + clock, trace_options, std::move(trace_writer))); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/block_cache_tracer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/block_cache_tracer.h new file mode 100644 index 0000000000..301c7d95ee --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/block_cache_tracer.h @@ -0,0 +1,239 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "monitoring/instrumented_mutex.h" +#include "rocksdb/block_cache_trace_writer.h" +#include "rocksdb/options.h" +#include "rocksdb/table_reader_caller.h" +#include "rocksdb/trace_reader_writer.h" +#include "trace_replay/trace_replay.h" + +namespace ROCKSDB_NAMESPACE { +class Env; +class SystemClock; + +extern const uint64_t kMicrosInSecond; +extern const uint64_t kSecondInMinute; +extern const uint64_t kSecondInHour; + +struct BlockCacheTraceRecord; + +class BlockCacheTraceHelper { + public: + static bool IsGetOrMultiGetOnDataBlock(TraceType block_type, + TableReaderCaller caller); + static bool IsGetOrMultiGet(TableReaderCaller caller); + static bool IsUserAccess(TableReaderCaller caller); + // Row key is a concatenation of the access's fd_number and the referenced + // user key. + static std::string ComputeRowKey(const BlockCacheTraceRecord& access); + // The first four bytes of the referenced key in a Get request is the table + // id. + static uint64_t GetTableId(const BlockCacheTraceRecord& access); + // The sequence number of a get request is the last part of the referenced + // key. + static uint64_t GetSequenceNumber(const BlockCacheTraceRecord& access); + // Block offset in a file is the last varint64 in the block key. + static uint64_t GetBlockOffsetInFile(const BlockCacheTraceRecord& access); + + static const std::string kUnknownColumnFamilyName; + static const uint64_t kReservedGetId; +}; + +// Lookup context for tracing block cache accesses. +// We trace block accesses at five places: +// 1. BlockBasedTable::GetFilter +// 2. BlockBasedTable::GetUncompressedDict. +// 3. BlockBasedTable::MaybeReadAndLoadToCache. (To trace access on data, index, +// and range deletion block.) +// 4. BlockBasedTable::Get. (To trace the referenced key and whether the +// referenced key exists in a fetched data block.) +// 5. BlockBasedTable::MultiGet. (To trace the referenced key and whether the +// referenced key exists in a fetched data block.) +// The context is created at: +// 1. BlockBasedTable::Get. (kUserGet) +// 2. BlockBasedTable::MultiGet. (kUserMGet) +// 3. BlockBasedTable::NewIterator. (either kUserIterator, kCompaction, or +// external SST ingestion calls this function.) +// 4. BlockBasedTable::Open. (kPrefetch) +// 5. Index/Filter::CacheDependencies. (kPrefetch) +// 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or +// kUserApproximateSize). 
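+//
+// A rough usage sketch (editorial illustration; the local names here are
+// hypothetical):
+//   BlockCacheLookupContext ctx(TableReaderCaller::kUserGet,
+//                               tracer->NextGetId(),
+//                               /*_get_from_user_specified_snapshot=*/false);
+//   ctx.FillLookupContext(/*_is_cache_hit=*/true, /*_no_insert=*/false,
+//                         TraceType::kBlockTraceDataBlock, block_size,
+//                         block_key, num_keys_in_block);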
+struct BlockCacheLookupContext {
+  BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {}
+  BlockCacheLookupContext(const TableReaderCaller& _caller, uint64_t _get_id,
+                          bool _get_from_user_specified_snapshot)
+      : caller(_caller),
+        get_id(_get_id),
+        get_from_user_specified_snapshot(_get_from_user_specified_snapshot) {}
+  const TableReaderCaller caller;
+  // These are populated when we perform a lookup/insert on the block cache.
+  // The block cache tracer uses this information when logging the block
+  // access at BlockBasedTable::Get and BlockBasedTable::MultiGet.
+  bool is_cache_hit = false;
+  bool no_insert = false;
+  TraceType block_type = TraceType::kTraceMax;
+  uint64_t block_size = 0;
+  std::string block_key;
+  uint64_t num_keys_in_block = 0;
+  // The unique id associated with Get and MultiGet. This enables us to track
+  // how many blocks a Get/MultiGet request accesses. We can also measure the
+  // impact of row cache vs block cache.
+  uint64_t get_id = 0;
+  std::string referenced_key;
+  bool get_from_user_specified_snapshot = false;
+
+  void FillLookupContext(bool _is_cache_hit, bool _no_insert,
+                         TraceType _block_type, uint64_t _block_size,
+                         const std::string& _block_key,
+                         uint64_t _num_keys_in_block) {
+    is_cache_hit = _is_cache_hit;
+    no_insert = _no_insert;
+    block_type = _block_type;
+    block_size = _block_size;
+    block_key = _block_key;
+    num_keys_in_block = _num_keys_in_block;
+  }
+};
+
+struct BlockCacheTraceHeader {
+  uint64_t start_time;
+  uint32_t rocksdb_major_version;
+  uint32_t rocksdb_minor_version;
+};
+
+// BlockCacheTraceWriter captures all RocksDB block cache accesses using a
+// user-provided TraceWriter. Every RocksDB operation is written as a single
+// trace. Each trace will have a timestamp and type, followed by the trace
+// payload.
+class BlockCacheTraceWriterImpl : public BlockCacheTraceWriter {
+ public:
+  BlockCacheTraceWriterImpl(SystemClock* clock,
+                            const BlockCacheTraceWriterOptions& trace_options,
+                            std::unique_ptr<TraceWriter>&& trace_writer);
+  ~BlockCacheTraceWriterImpl() = default;
+  // No copy and move.
+  BlockCacheTraceWriterImpl(const BlockCacheTraceWriterImpl&) = delete;
+  BlockCacheTraceWriterImpl& operator=(const BlockCacheTraceWriterImpl&) =
+      delete;
+  BlockCacheTraceWriterImpl(BlockCacheTraceWriterImpl&&) = delete;
+  BlockCacheTraceWriterImpl& operator=(BlockCacheTraceWriterImpl&&) = delete;
+
+  // Pass Slice references to avoid copy.
+  Status WriteBlockAccess(const BlockCacheTraceRecord& record,
+                          const Slice& block_key, const Slice& cf_name,
+                          const Slice& referenced_key) override;
+
+  // Write a trace header at the beginning, typically on initiating a trace,
+  // with some metadata like a magic number and RocksDB version.
+  Status WriteHeader() override;
+
+ private:
+  SystemClock* clock_;
+  BlockCacheTraceWriterOptions trace_options_;
+  std::unique_ptr<TraceWriter> trace_writer_;
+};
+
+// Write a trace record in human readable format, see
+// https://github.com/facebook/rocksdb/wiki/Block-cache-analysis-and-simulation-tools#trace-format
+// for details.
+class BlockCacheHumanReadableTraceWriter { + public: + ~BlockCacheHumanReadableTraceWriter(); + + Status NewWritableFile(const std::string& human_readable_trace_file_path, + ROCKSDB_NAMESPACE::Env* env); + + Status WriteHumanReadableTraceRecord(const BlockCacheTraceRecord& access, + uint64_t block_id, uint64_t get_key_id); + + private: + char trace_record_buffer_[1024 * 1024]; + std::unique_ptr + human_readable_trace_file_writer_; +}; + +// BlockCacheTraceReader helps read the trace file generated by +// BlockCacheTraceWriter using a user provided TraceReader. +class BlockCacheTraceReader { + public: + BlockCacheTraceReader(std::unique_ptr&& reader); + virtual ~BlockCacheTraceReader() = default; + // No copy and move. + BlockCacheTraceReader(const BlockCacheTraceReader&) = delete; + BlockCacheTraceReader& operator=(const BlockCacheTraceReader&) = delete; + BlockCacheTraceReader(BlockCacheTraceReader&&) = delete; + BlockCacheTraceReader& operator=(BlockCacheTraceReader&&) = delete; + + Status ReadHeader(BlockCacheTraceHeader* header); + + Status ReadAccess(BlockCacheTraceRecord* record); + + private: + std::unique_ptr trace_reader_; +}; + +// Read a trace record in human readable format, see +// https://github.com/facebook/rocksdb/wiki/Block-cache-analysis-and-simulation-tools#trace-format +// for detailed. +class BlockCacheHumanReadableTraceReader : public BlockCacheTraceReader { + public: + BlockCacheHumanReadableTraceReader(const std::string& trace_file_path); + + ~BlockCacheHumanReadableTraceReader(); + + Status ReadHeader(BlockCacheTraceHeader* header); + + Status ReadAccess(BlockCacheTraceRecord* record); + + private: + std::ifstream human_readable_trace_reader_; +}; + +// A block cache tracer. It downsamples the accesses according to +// trace_options and uses BlockCacheTraceWriter to write the access record to +// the trace file. +class BlockCacheTracer { + public: + BlockCacheTracer(); + ~BlockCacheTracer(); + // No copy and move. + BlockCacheTracer(const BlockCacheTracer&) = delete; + BlockCacheTracer& operator=(const BlockCacheTracer&) = delete; + BlockCacheTracer(BlockCacheTracer&&) = delete; + BlockCacheTracer& operator=(BlockCacheTracer&&) = delete; + + // Start writing block cache accesses to the trace_writer. + Status StartTrace(const BlockCacheTraceOptions& trace_options, + std::unique_ptr&& trace_writer); + + // Stop writing block cache accesses to the trace_writer. + void EndTrace(); + + bool is_tracing_enabled() const { + return writer_.load(std::memory_order_relaxed); + } + + Status WriteBlockAccess(const BlockCacheTraceRecord& record, + const Slice& block_key, const Slice& cf_name, + const Slice& referenced_key); + + // GetId cycles from 1 to std::numeric_limits::max(). + uint64_t NextGetId(); + + private: + BlockCacheTraceOptions trace_options_; + // A mutex protects the writer_. + InstrumentedMutex trace_writer_mutex_; + std::atomic writer_; + std::atomic get_id_counter_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/io_tracer.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/io_tracer.cc new file mode 100644 index 0000000000..a860130f85 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/io_tracer.cc @@ -0,0 +1,303 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "trace_replay/io_tracer.h"
+
+#include
+#include
+#include
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+IOTraceWriter::IOTraceWriter(SystemClock* clock,
+                             const TraceOptions& trace_options,
+                             std::unique_ptr<TraceWriter>&& trace_writer)
+    : clock_(clock),
+      trace_options_(trace_options),
+      trace_writer_(std::move(trace_writer)) {}
+
+Status IOTraceWriter::WriteIOOp(const IOTraceRecord& record,
+                                IODebugContext* dbg) {
+  uint64_t trace_file_size = trace_writer_->GetFileSize();
+  if (trace_file_size > trace_options_.max_trace_file_size) {
+    return Status::OK();
+  }
+  Trace trace;
+  trace.ts = record.access_timestamp;
+  trace.type = record.trace_type;
+  PutFixed64(&trace.payload, record.io_op_data);
+  Slice file_operation(record.file_operation);
+  PutLengthPrefixedSlice(&trace.payload, file_operation);
+  PutFixed64(&trace.payload, record.latency);
+  Slice io_status(record.io_status);
+  PutLengthPrefixedSlice(&trace.payload, io_status);
+  Slice file_name(record.file_name);
+  PutLengthPrefixedSlice(&trace.payload, file_name);
+
+  // Each bit in io_op_data stores which corresponding info from IOTraceOp will
+  // be added in the trace. For example, if the bit at position 1 is set then
+  // IOTraceOp::kIOLen (length) will be logged in the record (since
+  // IOTraceOp::kIOLen = 1 in the enum). So find all the set positions in
+  // io_op_data one by one, update the corresponding info in the trace record,
+  // and unset that bit to find other set bits, until io_op_data = 0.
+  /* Write remaining options based on io_op_data set by file operation */
+  int64_t io_op_data = static_cast<int64_t>(record.io_op_data);
+  while (io_op_data) {
+    // Find the rightmost set bit.
+    uint32_t set_pos = static_cast<uint32_t>(log2(io_op_data & -io_op_data));
+    switch (set_pos) {
+      case IOTraceOp::kIOFileSize:
+        PutFixed64(&trace.payload, record.file_size);
+        break;
+      case IOTraceOp::kIOLen:
+        PutFixed64(&trace.payload, record.len);
+        break;
+      case IOTraceOp::kIOOffset:
+        PutFixed64(&trace.payload, record.offset);
+        break;
+      default:
+        assert(false);
+    }
+    // unset the rightmost bit.
+    io_op_data &= (io_op_data - 1);
+  }
+
+  int64_t trace_data = 0;
+  if (dbg) {
+    trace_data = static_cast<int64_t>(dbg->trace_data);
+  }
+  PutFixed64(&trace.payload, trace_data);
+  while (trace_data) {
+    // Find the rightmost set bit.
+    uint32_t set_pos = static_cast<uint32_t>(log2(trace_data & -trace_data));
+    switch (set_pos) {
+      case IODebugContext::TraceData::kRequestID: {
+        Slice request_id(dbg->request_id);
+        PutLengthPrefixedSlice(&trace.payload, request_id);
+      } break;
+      default:
+        assert(false);
+    }
+    // unset the rightmost bit.
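+    // ((trace_data - 1) flips the lowest set bit and everything below it, so
+    // the AND clears exactly that bit; each flagged field is visited once.)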
+ trace_data &= (trace_data - 1); + } + + std::string encoded_trace; + TracerHelper::EncodeTrace(trace, &encoded_trace); + return trace_writer_->Write(encoded_trace); +} + +Status IOTraceWriter::WriteHeader() { + Trace trace; + trace.ts = clock_->NowMicros(); + trace.type = TraceType::kTraceBegin; + PutLengthPrefixedSlice(&trace.payload, kTraceMagic); + PutFixed32(&trace.payload, kMajorVersion); + PutFixed32(&trace.payload, kMinorVersion); + std::string encoded_trace; + TracerHelper::EncodeTrace(trace, &encoded_trace); + return trace_writer_->Write(encoded_trace); +} + +IOTraceReader::IOTraceReader(std::unique_ptr&& reader) + : trace_reader_(std::move(reader)) {} + +Status IOTraceReader::ReadHeader(IOTraceHeader* header) { + assert(header != nullptr); + std::string encoded_trace; + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + Trace trace; + s = TracerHelper::DecodeTrace(encoded_trace, &trace); + if (!s.ok()) { + return s; + } + header->start_time = trace.ts; + Slice enc_slice = Slice(trace.payload); + Slice magic_number; + if (!GetLengthPrefixedSlice(&enc_slice, &magic_number)) { + return Status::Corruption( + "Corrupted header in the trace file: Failed to read the magic number."); + } + if (magic_number.ToString() != kTraceMagic) { + return Status::Corruption( + "Corrupted header in the trace file: Magic number does not match."); + } + if (!GetFixed32(&enc_slice, &header->rocksdb_major_version)) { + return Status::Corruption( + "Corrupted header in the trace file: Failed to read rocksdb major " + "version number."); + } + if (!GetFixed32(&enc_slice, &header->rocksdb_minor_version)) { + return Status::Corruption( + "Corrupted header in the trace file: Failed to read rocksdb minor " + "version number."); + } + // We should have retrieved all information in the header. + if (!enc_slice.empty()) { + return Status::Corruption( + "Corrupted header in the trace file: The length of header is too " + "long."); + } + return Status::OK(); +} + +Status IOTraceReader::ReadIOOp(IOTraceRecord* record) { + assert(record); + std::string encoded_trace; + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + Trace trace; + s = TracerHelper::DecodeTrace(encoded_trace, &trace); + if (!s.ok()) { + return s; + } + record->access_timestamp = trace.ts; + record->trace_type = trace.type; + Slice enc_slice = Slice(trace.payload); + + if (!GetFixed64(&enc_slice, &record->io_op_data)) { + return Status::Incomplete( + "Incomplete access record: Failed to read trace data."); + } + Slice file_operation; + if (!GetLengthPrefixedSlice(&enc_slice, &file_operation)) { + return Status::Incomplete( + "Incomplete access record: Failed to read file operation."); + } + record->file_operation = file_operation.ToString(); + if (!GetFixed64(&enc_slice, &record->latency)) { + return Status::Incomplete( + "Incomplete access record: Failed to read latency."); + } + Slice io_status; + if (!GetLengthPrefixedSlice(&enc_slice, &io_status)) { + return Status::Incomplete( + "Incomplete access record: Failed to read IO status."); + } + record->io_status = io_status.ToString(); + Slice file_name; + if (!GetLengthPrefixedSlice(&enc_slice, &file_name)) { + return Status::Incomplete( + "Incomplete access record: Failed to read file name."); + } + record->file_name = file_name.ToString(); + + // Each bit in io_op_data stores which corresponding info from IOTraceOp will + // be added in the trace. 
For example, if the bit at position 1 is set then
+  // IOTraceOp::kIOLen (length) will be logged in the record (since
+  // IOTraceOp::kIOLen = 1 in the enum). So find all the set positions in
+  // io_op_data one by one, update the corresponding info in the trace record,
+  // and unset that bit to find other set bits, until io_op_data = 0.
+  /* Read remaining options based on io_op_data set by file operation */
+  // Assuming 63 bits will be used at max.
+  int64_t io_op_data = static_cast<int64_t>(record->io_op_data);
+  while (io_op_data) {
+    // Find the rightmost set bit.
+    uint32_t set_pos = static_cast<uint32_t>(log2(io_op_data & -io_op_data));
+    switch (set_pos) {
+      case IOTraceOp::kIOFileSize:
+        if (!GetFixed64(&enc_slice, &record->file_size)) {
+          return Status::Incomplete(
+              "Incomplete access record: Failed to read file size.");
+        }
+        break;
+      case IOTraceOp::kIOLen:
+        if (!GetFixed64(&enc_slice, &record->len)) {
+          return Status::Incomplete(
+              "Incomplete access record: Failed to read length.");
+        }
+        break;
+      case IOTraceOp::kIOOffset:
+        if (!GetFixed64(&enc_slice, &record->offset)) {
+          return Status::Incomplete(
+              "Incomplete access record: Failed to read offset.");
+        }
+        break;
+      default:
+        assert(false);
+    }
+    // unset the rightmost bit.
+    io_op_data &= (io_op_data - 1);
+  }
+
+  if (!GetFixed64(&enc_slice, &record->trace_data)) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read trace op.");
+  }
+  int64_t trace_data = static_cast<int64_t>(record->trace_data);
+  while (trace_data) {
+    // Find the rightmost set bit.
+    uint32_t set_pos = static_cast<uint32_t>(log2(trace_data & -trace_data));
+    switch (set_pos) {
+      case IODebugContext::TraceData::kRequestID: {
+        Slice request_id;
+        if (!GetLengthPrefixedSlice(&enc_slice, &request_id)) {
+          return Status::Incomplete(
+              "Incomplete access record: Failed to read request id.");
+        }
+        record->request_id = request_id.ToString();
+      } break;
+      default:
+        assert(false);
+    }
+    // unset the rightmost bit.
+    trace_data &= (trace_data - 1);
+  }
+
+  return Status::OK();
+}
+
+IOTracer::IOTracer() : tracing_enabled(false) { writer_.store(nullptr); }
+
+IOTracer::~IOTracer() { EndIOTrace(); }
+
+Status IOTracer::StartIOTrace(SystemClock* clock,
+                              const TraceOptions& trace_options,
+                              std::unique_ptr<TraceWriter>&& trace_writer) {
+  InstrumentedMutexLock lock_guard(&trace_writer_mutex_);
+  if (writer_.load()) {
+    return Status::Busy();
+  }
+  trace_options_ = trace_options;
+  writer_.store(
+      new IOTraceWriter(clock, trace_options, std::move(trace_writer)));
+  tracing_enabled = true;
+  return writer_.load()->WriteHeader();
+}
+
+void IOTracer::EndIOTrace() {
+  InstrumentedMutexLock lock_guard(&trace_writer_mutex_);
+  if (!writer_.load()) {
+    return;
+  }
+  delete writer_.load();
+  writer_.store(nullptr);
+  tracing_enabled = false;
+}
+
+void IOTracer::WriteIOOp(const IOTraceRecord& record, IODebugContext* dbg) {
+  if (!writer_.load()) {
+    return;
+  }
+  InstrumentedMutexLock lock_guard(&trace_writer_mutex_);
+  if (!writer_.load()) {
+    return;
+  }
+  writer_.load()->WriteIOOp(record, dbg).PermitUncheckedError();
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/io_tracer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/io_tracer.h
new file mode 100644
index 0000000000..3fc7cdba0a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/io_tracer.h
@@ -0,0 +1,185 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "monitoring/instrumented_mutex.h" +#include "port/lang.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" +#include "rocksdb/trace_record.h" +#include "trace_replay/trace_replay.h" + +namespace ROCKSDB_NAMESPACE { +class SystemClock; +class TraceReader; +class TraceWriter; + +/* In order to log new data in trace record for specified operations, do + following: + 1. Add new data in IOTraceOP (say kIONewData= 3) + 2. Log it in IOTraceWriter::WriteIOOp, and read that in + IOTraceReader::ReadIOOp and + IOTraceRecordParser::PrintHumanReadableIOTraceRecord in the switch case. + 3. In the FileSystemTracer APIs where this data will be logged with, update + io_op_data |= (1 << IOTraceOp::kIONewData). +*/ +enum IOTraceOp : char { + // The value of each enum represents the bitwise position for + // IOTraceRecord.io_op_data. + kIOFileSize = 0, + kIOLen = 1, + kIOOffset = 2, +}; + +struct IOTraceRecord { + // Required fields for all accesses. + uint64_t access_timestamp = 0; + TraceType trace_type = TraceType::kTraceMax; + // Each bit in io_op_data stores which corresponding info from IOTraceOp will + // be added in the trace. Foreg, if bit at position 1 is set then + // IOTraceOp::kIOLen (length) will be logged in the record. + uint64_t io_op_data = 0; + std::string file_operation; + uint64_t latency = 0; + std::string io_status; + // Stores file name instead of full path. + std::string file_name; + + // Fields added to record based on IO operation. + uint64_t len = 0; + uint64_t offset = 0; + uint64_t file_size = 0; + + // Additional information passed in IODebugContext. + uint64_t trace_data = 0; + std::string request_id; + + IOTraceRecord() {} + + IOTraceRecord(const uint64_t& _access_timestamp, const TraceType& _trace_type, + const uint64_t& _io_op_data, const std::string& _file_operation, + const uint64_t& _latency, const std::string& _io_status, + const std::string& _file_name, const uint64_t& _file_size = 0) + : access_timestamp(_access_timestamp), + trace_type(_trace_type), + io_op_data(_io_op_data), + file_operation(_file_operation), + latency(_latency), + io_status(_io_status), + file_name(_file_name), + file_size(_file_size) {} + + IOTraceRecord(const uint64_t& _access_timestamp, const TraceType& _trace_type, + const uint64_t& _io_op_data, const std::string& _file_operation, + const uint64_t& _latency, const std::string& _io_status, + const std::string& _file_name, const uint64_t& _len, + const uint64_t& _offset) + : access_timestamp(_access_timestamp), + trace_type(_trace_type), + io_op_data(_io_op_data), + file_operation(_file_operation), + latency(_latency), + io_status(_io_status), + file_name(_file_name), + len(_len), + offset(_offset) {} +}; + +struct IOTraceHeader { + uint64_t start_time; + uint32_t rocksdb_major_version; + uint32_t rocksdb_minor_version; +}; + +// IOTraceWriter writes IO operation as a single trace. Each trace will have a +// timestamp and type, followed by the trace payload. +class IOTraceWriter { + public: + IOTraceWriter(SystemClock* clock, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer); + ~IOTraceWriter() = default; + // No copy and move. 
+  IOTraceWriter(const IOTraceWriter&) = delete;
+  IOTraceWriter& operator=(const IOTraceWriter&) = delete;
+  IOTraceWriter(IOTraceWriter&&) = delete;
+  IOTraceWriter& operator=(IOTraceWriter&&) = delete;
+
+  Status WriteIOOp(const IOTraceRecord& record, IODebugContext* dbg);
+
+  // Write a trace header at the beginning, typically on initiating a trace,
+  // with some metadata like a magic number and RocksDB version.
+  Status WriteHeader();
+
+ private:
+  SystemClock* clock_;
+  TraceOptions trace_options_;
+  std::unique_ptr<TraceWriter> trace_writer_;
+};
+
+// IOTraceReader helps read the trace file generated by IOTraceWriter.
+class IOTraceReader {
+ public:
+  explicit IOTraceReader(std::unique_ptr<TraceReader>&& reader);
+  ~IOTraceReader() = default;
+  // No copy and move.
+  IOTraceReader(const IOTraceReader&) = delete;
+  IOTraceReader& operator=(const IOTraceReader&) = delete;
+  IOTraceReader(IOTraceReader&&) = delete;
+  IOTraceReader& operator=(IOTraceReader&&) = delete;
+
+  Status ReadHeader(IOTraceHeader* header);
+
+  Status ReadIOOp(IOTraceRecord* record);
+
+ private:
+  std::unique_ptr<TraceReader> trace_reader_;
+};
+
+// An IO tracer. It uses IOTraceWriter to write the access record to the
+// trace file.
+class IOTracer {
+ public:
+  IOTracer();
+  ~IOTracer();
+  // No copy and move.
+  IOTracer(const IOTracer&) = delete;
+  IOTracer& operator=(const IOTracer&) = delete;
+  IOTracer(IOTracer&&) = delete;
+  IOTracer& operator=(IOTracer&&) = delete;
+
+  // no_sanitize is added for tracing_enabled. writer_ is protected under the
+  // mutex, so even if a user calls Start/EndIOTrace while tracing_enabled has
+  // not yet been updated, WriteIOOp will still check writer_ under the mutex
+  // and ignore the operation if writer_ is null. So it is ok if
+  // tracing_enabled briefly shows a stale value.
+
+  // Start writing IO operations to the trace_writer.
+  TSAN_SUPPRESSION Status
+  StartIOTrace(SystemClock* clock, const TraceOptions& trace_options,
+               std::unique_ptr<TraceWriter>&& trace_writer);
+
+  // Stop writing IO operations to the trace_writer.
+  TSAN_SUPPRESSION void EndIOTrace();
+
+  TSAN_SUPPRESSION bool is_tracing_enabled() const { return tracing_enabled; }
+
+  void WriteIOOp(const IOTraceRecord& record, IODebugContext* dbg);
+
+ private:
+  TraceOptions trace_options_;
+  // A mutex protects the writer_.
+  InstrumentedMutex trace_writer_mutex_;
+  std::atomic<IOTraceWriter*> writer_;
+  // bool tracing_enabled is added to avoid the costly operation of checking
+  // whether the atomic variable 'writer_' is nullptr in is_tracing_enabled(),
+  // which is invoked many times by the FileSystem classes.
+  bool tracing_enabled;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record.cc
new file mode 100644
index 0000000000..21df0275dd
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record.cc
@@ -0,0 +1,206 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
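For reference, the bitmask convention used throughout the io_tracer sources above (WriteIOOp sets one bit per optional field; ReadIOOp walks the set bits from least significant to most significant, clearing each as it goes) boils down to the following standalone sketch. It is illustrative only, not part of the vendored sources; the enum and field names are simplified stand-ins.

    // Sketch of the io_op_data bit-walking scheme (assumed names, not vendored).
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    enum OpField : char { kFileSize = 0, kLen = 1, kOffset = 2 };

    int main() {
      uint64_t io_op_data = 0;
      io_op_data |= (uint64_t{1} << kLen);     // writer logged a length
      io_op_data |= (uint64_t{1} << kOffset);  // writer logged an offset

      int64_t bits = static_cast<int64_t>(io_op_data);
      while (bits) {
        // Isolate the rightmost set bit and recover its position.
        uint32_t set_pos = static_cast<uint32_t>(log2(bits & -bits));
        printf("field %u is present in the record\n", set_pos);
        bits &= (bits - 1);  // clear the rightmost set bit
      }
      return 0;
    }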
+ +#include "rocksdb/trace_record.h" + +#include + +#include "rocksdb/db.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "rocksdb/trace_record_result.h" +#include "trace_replay/trace_record_handler.h" + +namespace ROCKSDB_NAMESPACE { + +// TraceRecord +TraceRecord::TraceRecord(uint64_t timestamp) : timestamp_(timestamp) {} + +uint64_t TraceRecord::GetTimestamp() const { return timestamp_; } + +TraceRecord::Handler* TraceRecord::NewExecutionHandler( + DB* db, const std::vector& handles) { + return new TraceExecutionHandler(db, handles); +} + +// QueryTraceRecord +QueryTraceRecord::QueryTraceRecord(uint64_t timestamp) + : TraceRecord(timestamp) {} + +// WriteQueryTraceRecord +WriteQueryTraceRecord::WriteQueryTraceRecord(PinnableSlice&& write_batch_rep, + uint64_t timestamp) + : QueryTraceRecord(timestamp), rep_(std::move(write_batch_rep)) {} + +WriteQueryTraceRecord::WriteQueryTraceRecord(const std::string& write_batch_rep, + uint64_t timestamp) + : QueryTraceRecord(timestamp) { + rep_.PinSelf(write_batch_rep); +} + +WriteQueryTraceRecord::~WriteQueryTraceRecord() { rep_.clear(); } + +Slice WriteQueryTraceRecord::GetWriteBatchRep() const { return Slice(rep_); } + +Status WriteQueryTraceRecord::Accept( + Handler* handler, std::unique_ptr* result) { + assert(handler != nullptr); + return handler->Handle(*this, result); +} + +// GetQueryTraceRecord +GetQueryTraceRecord::GetQueryTraceRecord(uint32_t column_family_id, + PinnableSlice&& key, + uint64_t timestamp) + : QueryTraceRecord(timestamp), + cf_id_(column_family_id), + key_(std::move(key)) {} + +GetQueryTraceRecord::GetQueryTraceRecord(uint32_t column_family_id, + const std::string& key, + uint64_t timestamp) + : QueryTraceRecord(timestamp), cf_id_(column_family_id) { + key_.PinSelf(key); +} + +GetQueryTraceRecord::~GetQueryTraceRecord() { key_.clear(); } + +uint32_t GetQueryTraceRecord::GetColumnFamilyID() const { return cf_id_; } + +Slice GetQueryTraceRecord::GetKey() const { return Slice(key_); } + +Status GetQueryTraceRecord::Accept(Handler* handler, + std::unique_ptr* result) { + assert(handler != nullptr); + return handler->Handle(*this, result); +} + +// IteratorQueryTraceRecord +IteratorQueryTraceRecord::IteratorQueryTraceRecord(uint64_t timestamp) + : QueryTraceRecord(timestamp) {} + +IteratorQueryTraceRecord::IteratorQueryTraceRecord(PinnableSlice&& lower_bound, + PinnableSlice&& upper_bound, + uint64_t timestamp) + : QueryTraceRecord(timestamp), + lower_(std::move(lower_bound)), + upper_(std::move(upper_bound)) {} + +IteratorQueryTraceRecord::IteratorQueryTraceRecord( + const std::string& lower_bound, const std::string& upper_bound, + uint64_t timestamp) + : QueryTraceRecord(timestamp) { + lower_.PinSelf(lower_bound); + upper_.PinSelf(upper_bound); +} + +IteratorQueryTraceRecord::~IteratorQueryTraceRecord() {} + +Slice IteratorQueryTraceRecord::GetLowerBound() const { return Slice(lower_); } + +Slice IteratorQueryTraceRecord::GetUpperBound() const { return Slice(upper_); } + +// IteratorSeekQueryTraceRecord +IteratorSeekQueryTraceRecord::IteratorSeekQueryTraceRecord( + SeekType seek_type, uint32_t column_family_id, PinnableSlice&& key, + uint64_t timestamp) + : IteratorQueryTraceRecord(timestamp), + type_(seek_type), + cf_id_(column_family_id), + key_(std::move(key)) {} + +IteratorSeekQueryTraceRecord::IteratorSeekQueryTraceRecord( + SeekType seek_type, uint32_t column_family_id, const std::string& key, + uint64_t timestamp) + : IteratorQueryTraceRecord(timestamp), + 
type_(seek_type), + cf_id_(column_family_id) { + key_.PinSelf(key); +} + +IteratorSeekQueryTraceRecord::IteratorSeekQueryTraceRecord( + SeekType seek_type, uint32_t column_family_id, PinnableSlice&& key, + PinnableSlice&& lower_bound, PinnableSlice&& upper_bound, + uint64_t timestamp) + : IteratorQueryTraceRecord(std::move(lower_bound), std::move(upper_bound), + timestamp), + type_(seek_type), + cf_id_(column_family_id), + key_(std::move(key)) {} + +IteratorSeekQueryTraceRecord::IteratorSeekQueryTraceRecord( + SeekType seek_type, uint32_t column_family_id, const std::string& key, + const std::string& lower_bound, const std::string& upper_bound, + uint64_t timestamp) + : IteratorQueryTraceRecord(lower_bound, upper_bound, timestamp), + type_(seek_type), + cf_id_(column_family_id) { + key_.PinSelf(key); +} + +IteratorSeekQueryTraceRecord::~IteratorSeekQueryTraceRecord() { key_.clear(); } + +TraceType IteratorSeekQueryTraceRecord::GetTraceType() const { + return static_cast(type_); +} + +IteratorSeekQueryTraceRecord::SeekType +IteratorSeekQueryTraceRecord::GetSeekType() const { + return type_; +} + +uint32_t IteratorSeekQueryTraceRecord::GetColumnFamilyID() const { + return cf_id_; +} + +Slice IteratorSeekQueryTraceRecord::GetKey() const { return Slice(key_); } + +Status IteratorSeekQueryTraceRecord::Accept( + Handler* handler, std::unique_ptr* result) { + assert(handler != nullptr); + return handler->Handle(*this, result); +} + +// MultiGetQueryTraceRecord +MultiGetQueryTraceRecord::MultiGetQueryTraceRecord( + std::vector column_family_ids, std::vector&& keys, + uint64_t timestamp) + : QueryTraceRecord(timestamp), + cf_ids_(column_family_ids), + keys_(std::move(keys)) {} + +MultiGetQueryTraceRecord::MultiGetQueryTraceRecord( + std::vector column_family_ids, + const std::vector& keys, uint64_t timestamp) + : QueryTraceRecord(timestamp), cf_ids_(column_family_ids) { + keys_.reserve(keys.size()); + for (const std::string& key : keys) { + PinnableSlice ps; + ps.PinSelf(key); + keys_.push_back(std::move(ps)); + } +} + +MultiGetQueryTraceRecord::~MultiGetQueryTraceRecord() { + cf_ids_.clear(); + keys_.clear(); +} + +std::vector MultiGetQueryTraceRecord::GetColumnFamilyIDs() const { + return cf_ids_; +} + +std::vector MultiGetQueryTraceRecord::GetKeys() const { + return std::vector(keys_.begin(), keys_.end()); +} + +Status MultiGetQueryTraceRecord::Accept( + Handler* handler, std::unique_ptr* result) { + assert(handler != nullptr); + return handler->Handle(*this, result); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_handler.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_handler.cc new file mode 100644 index 0000000000..ca179e8700 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_handler.cc @@ -0,0 +1,190 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
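The record classes just added above dispatch through an Accept/Handle pair, i.e. the classic visitor double dispatch: a virtual Accept picks the dynamic record type, and overload resolution inside Accept picks the matching Handle. A minimal standalone illustration of the pattern follows (simplified stand-in names, not the vendored API):

    #include <cstdio>
    #include <memory>

    struct GetRecord;
    struct WriteRecord;

    // One Handle overload per concrete record type.
    struct Handler {
      virtual ~Handler() = default;
      virtual void Handle(const GetRecord&);
      virtual void Handle(const WriteRecord&);
    };

    struct Record {
      virtual ~Record() = default;
      virtual void Accept(Handler* h) = 0;
    };

    // Each subclass's Accept selects the matching Handle overload.
    struct GetRecord : Record {
      void Accept(Handler* h) override { h->Handle(*this); }
    };
    struct WriteRecord : Record {
      void Accept(Handler* h) override { h->Handle(*this); }
    };

    void Handler::Handle(const GetRecord&) { std::printf("get\n"); }
    void Handler::Handle(const WriteRecord&) { std::printf("write\n"); }

    int main() {
      Handler h;
      std::unique_ptr<Record> r(new GetRecord());
      r->Accept(&h);  // prints "get" without the caller naming the subtype
      return 0;
    }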
+ +#include "trace_replay/trace_record_handler.h" + +#include "rocksdb/iterator.h" +#include "rocksdb/trace_record_result.h" +#include "rocksdb/write_batch.h" + +namespace ROCKSDB_NAMESPACE { + +// TraceExecutionHandler +TraceExecutionHandler::TraceExecutionHandler( + DB* db, const std::vector& handles) + : TraceRecord::Handler(), + db_(db), + write_opts_(WriteOptions()), + read_opts_(ReadOptions()) { + assert(db != nullptr); + assert(!handles.empty()); + cf_map_.reserve(handles.size()); + for (ColumnFamilyHandle* handle : handles) { + assert(handle != nullptr); + cf_map_.insert({handle->GetID(), handle}); + } + clock_ = db_->GetEnv()->GetSystemClock().get(); +} + +TraceExecutionHandler::~TraceExecutionHandler() { cf_map_.clear(); } + +Status TraceExecutionHandler::Handle( + const WriteQueryTraceRecord& record, + std::unique_ptr* result) { + if (result != nullptr) { + result->reset(nullptr); + } + uint64_t start = clock_->NowMicros(); + + WriteBatch batch(record.GetWriteBatchRep().ToString()); + Status s = db_->Write(write_opts_, &batch); + + uint64_t end = clock_->NowMicros(); + + if (s.ok() && result != nullptr) { + result->reset(new StatusOnlyTraceExecutionResult(s, start, end, + record.GetTraceType())); + } + + return s; +} + +Status TraceExecutionHandler::Handle( + const GetQueryTraceRecord& record, + std::unique_ptr* result) { + if (result != nullptr) { + result->reset(nullptr); + } + auto it = cf_map_.find(record.GetColumnFamilyID()); + if (it == cf_map_.end()) { + return Status::Corruption("Invalid Column Family ID."); + } + + uint64_t start = clock_->NowMicros(); + + std::string value; + Status s = db_->Get(read_opts_, it->second, record.GetKey(), &value); + + uint64_t end = clock_->NowMicros(); + + // Treat not found as ok, return other errors. 
+ if (!s.ok() && !s.IsNotFound()) { + return s; + } + + if (result != nullptr) { + // Report the actual opetation status in TraceExecutionResult + result->reset(new SingleValueTraceExecutionResult( + std::move(s), std::move(value), start, end, record.GetTraceType())); + } + return Status::OK(); +} + +Status TraceExecutionHandler::Handle( + const IteratorSeekQueryTraceRecord& record, + std::unique_ptr* result) { + if (result != nullptr) { + result->reset(nullptr); + } + auto it = cf_map_.find(record.GetColumnFamilyID()); + if (it == cf_map_.end()) { + return Status::Corruption("Invalid Column Family ID."); + } + + ReadOptions r_opts = read_opts_; + Slice lower = record.GetLowerBound(); + if (!lower.empty()) { + r_opts.iterate_lower_bound = &lower; + } + Slice upper = record.GetUpperBound(); + if (!upper.empty()) { + r_opts.iterate_upper_bound = &upper; + } + Iterator* single_iter = db_->NewIterator(r_opts, it->second); + + uint64_t start = clock_->NowMicros(); + + switch (record.GetSeekType()) { + case IteratorSeekQueryTraceRecord::kSeekForPrev: { + single_iter->SeekForPrev(record.GetKey()); + break; + } + default: { + single_iter->Seek(record.GetKey()); + break; + } + } + + uint64_t end = clock_->NowMicros(); + + Status s = single_iter->status(); + if (s.ok() && result != nullptr) { + if (single_iter->Valid()) { + PinnableSlice ps_key; + ps_key.PinSelf(single_iter->key()); + PinnableSlice ps_value; + ps_value.PinSelf(single_iter->value()); + result->reset(new IteratorTraceExecutionResult( + true, s, std::move(ps_key), std::move(ps_value), start, end, + record.GetTraceType())); + } else { + result->reset(new IteratorTraceExecutionResult( + false, s, "", "", start, end, record.GetTraceType())); + } + } + delete single_iter; + + return s; +} + +Status TraceExecutionHandler::Handle( + const MultiGetQueryTraceRecord& record, + std::unique_ptr* result) { + if (result != nullptr) { + result->reset(nullptr); + } + std::vector handles; + handles.reserve(record.GetColumnFamilyIDs().size()); + for (uint32_t cf_id : record.GetColumnFamilyIDs()) { + auto it = cf_map_.find(cf_id); + if (it == cf_map_.end()) { + return Status::Corruption("Invalid Column Family ID."); + } + handles.push_back(it->second); + } + + std::vector keys = record.GetKeys(); + + if (handles.empty() || keys.empty()) { + return Status::InvalidArgument("Empty MultiGet cf_ids or keys."); + } + if (handles.size() != keys.size()) { + return Status::InvalidArgument("MultiGet cf_ids and keys size mismatch."); + } + + uint64_t start = clock_->NowMicros(); + + std::vector values; + std::vector ss = db_->MultiGet(read_opts_, handles, keys, &values); + + uint64_t end = clock_->NowMicros(); + + // Treat not found as ok, return other errors. + for (const Status& s : ss) { + if (!s.ok() && !s.IsNotFound()) { + return s; + } + } + + if (result != nullptr) { + // Report the actual opetation status in TraceExecutionResult + result->reset(new MultiValuesTraceExecutionResult( + std::move(ss), std::move(values), start, end, record.GetTraceType())); + } + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_handler.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_handler.h new file mode 100644 index 0000000000..88cf317dda --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_handler.h @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/trace_record.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Handler to execute TraceRecord.
+class TraceExecutionHandler : public TraceRecord::Handler {
+ public:
+  TraceExecutionHandler(DB* db,
+                        const std::vector<ColumnFamilyHandle*>& handles);
+  virtual ~TraceExecutionHandler() override;
+
+  virtual Status Handle(const WriteQueryTraceRecord& record,
+                        std::unique_ptr<TraceRecordResult>* result) override;
+  virtual Status Handle(const GetQueryTraceRecord& record,
+                        std::unique_ptr<TraceRecordResult>* result) override;
+  virtual Status Handle(const IteratorSeekQueryTraceRecord& record,
+                        std::unique_ptr<TraceRecordResult>* result) override;
+  virtual Status Handle(const MultiGetQueryTraceRecord& record,
+                        std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+  DB* db_;
+  std::unordered_map<uint32_t, ColumnFamilyHandle*> cf_map_;
+  WriteOptions write_opts_;
+  ReadOptions read_opts_;
+  SystemClock* clock_;
+};
+
+// TODO: Handler for trace_analyzer.
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_result.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_result.cc
new file mode 100644
index 0000000000..9c0cb43ad9
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_record_result.cc
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
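Taken together with the decoder in trace_replay.cc, the handler above is what turns a stored trace back into live DB calls. A hedged sketch (not vendored code) of how the pieces compose for a single record; it assumes an already-decoded Trace with format version >= 2, an open DB, and its column family handles:

    #include "rocksdb/db.h"
    #include "rocksdb/trace_record_result.h"
    #include "trace_replay/trace_record_handler.h"
    #include "trace_replay/trace_replay.h"

    using namespace ROCKSDB_NAMESPACE;

    Status ReplayOne(DB* db, const std::vector<ColumnFamilyHandle*>& handles,
                     Trace* trace) {
      std::unique_ptr<TraceRecord::Handler> handler(
          TraceRecord::NewExecutionHandler(db, handles));
      std::unique_ptr<TraceRecord> record;
      Status s = TracerHelper::DecodeTraceRecord(trace, /*trace_file_version=*/2,
                                                 &record);
      if (!s.ok() || record == nullptr) {
        return s;  // e.g. Status::NotSupported for unknown trace types
      }
      std::unique_ptr<TraceRecordResult> result;
      return record->Accept(handler.get(), &result);  // re-executes against db
    }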
+ +#include "rocksdb/trace_record_result.h" + +namespace ROCKSDB_NAMESPACE { + +// TraceRecordResult +TraceRecordResult::TraceRecordResult(TraceType trace_type) + : trace_type_(trace_type) {} + +TraceType TraceRecordResult::GetTraceType() const { return trace_type_; } + +// TraceExecutionResult +TraceExecutionResult::TraceExecutionResult(uint64_t start_timestamp, + uint64_t end_timestamp, + TraceType trace_type) + : TraceRecordResult(trace_type), + ts_start_(start_timestamp), + ts_end_(end_timestamp) { + assert(ts_start_ <= ts_end_); +} + +uint64_t TraceExecutionResult::GetStartTimestamp() const { return ts_start_; } + +uint64_t TraceExecutionResult::GetEndTimestamp() const { return ts_end_; } + +// StatusOnlyTraceExecutionResult +StatusOnlyTraceExecutionResult::StatusOnlyTraceExecutionResult( + Status status, uint64_t start_timestamp, uint64_t end_timestamp, + TraceType trace_type) + : TraceExecutionResult(start_timestamp, end_timestamp, trace_type), + status_(std::move(status)) {} + +const Status& StatusOnlyTraceExecutionResult::GetStatus() const { + return status_; +} + +Status StatusOnlyTraceExecutionResult::Accept(Handler* handler) { + assert(handler != nullptr); + return handler->Handle(*this); +} + +// SingleValueTraceExecutionResult +SingleValueTraceExecutionResult::SingleValueTraceExecutionResult( + Status status, const std::string& value, uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type) + : TraceExecutionResult(start_timestamp, end_timestamp, trace_type), + status_(std::move(status)), + value_(value) {} + +SingleValueTraceExecutionResult::SingleValueTraceExecutionResult( + Status status, std::string&& value, uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type) + : TraceExecutionResult(start_timestamp, end_timestamp, trace_type), + status_(std::move(status)), + value_(std::move(value)) {} + +SingleValueTraceExecutionResult::~SingleValueTraceExecutionResult() { + value_.clear(); +} + +const Status& SingleValueTraceExecutionResult::GetStatus() const { + return status_; +} + +const std::string& SingleValueTraceExecutionResult::GetValue() const { + return value_; +} + +Status SingleValueTraceExecutionResult::Accept(Handler* handler) { + assert(handler != nullptr); + return handler->Handle(*this); +} + +// MultiValuesTraceExecutionResult +MultiValuesTraceExecutionResult::MultiValuesTraceExecutionResult( + std::vector multi_status, std::vector values, + uint64_t start_timestamp, uint64_t end_timestamp, TraceType trace_type) + : TraceExecutionResult(start_timestamp, end_timestamp, trace_type), + multi_status_(std::move(multi_status)), + values_(std::move(values)) {} + +MultiValuesTraceExecutionResult::~MultiValuesTraceExecutionResult() { + multi_status_.clear(); + values_.clear(); +} + +const std::vector& MultiValuesTraceExecutionResult::GetMultiStatus() + const { + return multi_status_; +} + +const std::vector& MultiValuesTraceExecutionResult::GetValues() + const { + return values_; +} + +Status MultiValuesTraceExecutionResult::Accept(Handler* handler) { + assert(handler != nullptr); + return handler->Handle(*this); +} + +// IteratorTraceExecutionResult +IteratorTraceExecutionResult::IteratorTraceExecutionResult( + bool valid, Status status, PinnableSlice&& key, PinnableSlice&& value, + uint64_t start_timestamp, uint64_t end_timestamp, TraceType trace_type) + : TraceExecutionResult(start_timestamp, end_timestamp, trace_type), + valid_(valid), + status_(std::move(status)), + key_(std::move(key)), + value_(std::move(value)) {} + 
+IteratorTraceExecutionResult::IteratorTraceExecutionResult( + bool valid, Status status, const std::string& key, const std::string& value, + uint64_t start_timestamp, uint64_t end_timestamp, TraceType trace_type) + : TraceExecutionResult(start_timestamp, end_timestamp, trace_type), + valid_(valid), + status_(std::move(status)) { + key_.PinSelf(key); + value_.PinSelf(value); +} + +IteratorTraceExecutionResult::~IteratorTraceExecutionResult() { + key_.clear(); + value_.clear(); +} + +bool IteratorTraceExecutionResult::GetValid() const { return valid_; } + +const Status& IteratorTraceExecutionResult::GetStatus() const { + return status_; +} + +Slice IteratorTraceExecutionResult::GetKey() const { return Slice(key_); } + +Slice IteratorTraceExecutionResult::GetValue() const { return Slice(value_); } + +Status IteratorTraceExecutionResult::Accept(Handler* handler) { + assert(handler != nullptr); + return handler->Handle(*this); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_replay.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_replay.cc new file mode 100644 index 0000000000..c681e374c4 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_replay.cc @@ -0,0 +1,622 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "trace_replay/trace_replay.h" + +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/trace_reader_writer.h" +#include "rocksdb/write_batch.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +const std::string kTraceMagic = "feedcafedeadbeef"; + +namespace { +void DecodeCFAndKey(std::string& buffer, uint32_t* cf_id, Slice* key) { + Slice buf(buffer); + GetFixed32(&buf, cf_id); + GetLengthPrefixedSlice(&buf, key); +} +} // namespace + +Status TracerHelper::ParseVersionStr(std::string& v_string, int* v_num) { + if (v_string.find_first_of('.') == std::string::npos || + v_string.find_first_of('.') != v_string.find_last_of('.')) { + return Status::Corruption( + "Corrupted trace file. Incorrect version format."); + } + int tmp_num = 0; + for (int i = 0; i < static_cast(v_string.size()); i++) { + if (v_string[i] == '.') { + continue; + } else if (isdigit(v_string[i])) { + tmp_num = tmp_num * 10 + (v_string[i] - '0'); + } else { + return Status::Corruption( + "Corrupted trace file. 
Incorrect version format"); + } + } + *v_num = tmp_num; + return Status::OK(); +} + +Status TracerHelper::ParseTraceHeader(const Trace& header, int* trace_version, + int* db_version) { + std::vector s_vec; + int begin = 0, end; + for (int i = 0; i < 3; i++) { + assert(header.payload.find("\t", begin) != std::string::npos); + end = static_cast(header.payload.find("\t", begin)); + s_vec.push_back(header.payload.substr(begin, end - begin)); + begin = end + 1; + } + + std::string t_v_str, db_v_str; + assert(s_vec.size() == 3); + assert(s_vec[1].find("Trace Version: ") != std::string::npos); + t_v_str = s_vec[1].substr(15); + assert(s_vec[2].find("RocksDB Version: ") != std::string::npos); + db_v_str = s_vec[2].substr(17); + + Status s; + s = ParseVersionStr(t_v_str, trace_version); + if (s != Status::OK()) { + return s; + } + s = ParseVersionStr(db_v_str, db_version); + return s; +} + +void TracerHelper::EncodeTrace(const Trace& trace, std::string* encoded_trace) { + assert(encoded_trace); + PutFixed64(encoded_trace, trace.ts); + encoded_trace->push_back(trace.type); + PutFixed32(encoded_trace, static_cast(trace.payload.size())); + encoded_trace->append(trace.payload); +} + +Status TracerHelper::DecodeTrace(const std::string& encoded_trace, + Trace* trace) { + assert(trace != nullptr); + Slice enc_slice = Slice(encoded_trace); + if (!GetFixed64(&enc_slice, &trace->ts)) { + return Status::Incomplete("Decode trace string failed"); + } + if (enc_slice.size() < kTraceTypeSize + kTracePayloadLengthSize) { + return Status::Incomplete("Decode trace string failed"); + } + trace->type = static_cast(enc_slice[0]); + enc_slice.remove_prefix(kTraceTypeSize + kTracePayloadLengthSize); + trace->payload = enc_slice.ToString(); + return Status::OK(); +} + +Status TracerHelper::DecodeHeader(const std::string& encoded_trace, + Trace* header) { + Status s = TracerHelper::DecodeTrace(encoded_trace, header); + + if (header->type != kTraceBegin) { + return Status::Corruption("Corrupted trace file. Incorrect header."); + } + if (header->payload.substr(0, kTraceMagic.length()) != kTraceMagic) { + return Status::Corruption("Corrupted trace file. Incorrect magic."); + } + + return s; +} + +bool TracerHelper::SetPayloadMap(uint64_t& payload_map, + const TracePayloadType payload_type) { + uint64_t old_state = payload_map; + uint64_t tmp = 1; + payload_map |= (tmp << payload_type); + return old_state != payload_map; +} + +Status TracerHelper::DecodeTraceRecord(Trace* trace, int trace_file_version, + std::unique_ptr* record) { + assert(trace != nullptr); + + if (record != nullptr) { + record->reset(nullptr); + } + + switch (trace->type) { + // Write + case kTraceWrite: { + PinnableSlice rep; + if (trace_file_version < 2) { + rep.PinSelf(trace->payload); + } else { + Slice buf(trace->payload); + GetFixed64(&buf, &trace->payload_map); + int64_t payload_map = static_cast(trace->payload_map); + Slice write_batch_data; + while (payload_map) { + // Find the rightmost set bit. + uint32_t set_pos = + static_cast(log2(payload_map & -payload_map)); + switch (set_pos) { + case TracePayloadType::kWriteBatchData: { + GetLengthPrefixedSlice(&buf, &write_batch_data); + break; + } + default: { + assert(false); + } + } + // unset the rightmost bit. 
+ payload_map &= (payload_map - 1); + } + rep.PinSelf(write_batch_data); + } + + if (record != nullptr) { + record->reset(new WriteQueryTraceRecord(std::move(rep), trace->ts)); + } + + return Status::OK(); + } + // Get + case kTraceGet: { + uint32_t cf_id = 0; + Slice get_key; + + if (trace_file_version < 2) { + DecodeCFAndKey(trace->payload, &cf_id, &get_key); + } else { + Slice buf(trace->payload); + GetFixed64(&buf, &trace->payload_map); + int64_t payload_map = static_cast(trace->payload_map); + while (payload_map) { + // Find the rightmost set bit. + uint32_t set_pos = + static_cast(log2(payload_map & -payload_map)); + switch (set_pos) { + case TracePayloadType::kGetCFID: { + GetFixed32(&buf, &cf_id); + break; + } + case TracePayloadType::kGetKey: { + GetLengthPrefixedSlice(&buf, &get_key); + break; + } + default: { + assert(false); + } + } + // unset the rightmost bit. + payload_map &= (payload_map - 1); + } + } + + if (record != nullptr) { + PinnableSlice ps; + ps.PinSelf(get_key); + record->reset(new GetQueryTraceRecord(cf_id, std::move(ps), trace->ts)); + } + + return Status::OK(); + } + // Iterator Seek and SeekForPrev + case kTraceIteratorSeek: + case kTraceIteratorSeekForPrev: { + uint32_t cf_id = 0; + Slice iter_key; + Slice lower_bound; + Slice upper_bound; + + if (trace_file_version < 2) { + DecodeCFAndKey(trace->payload, &cf_id, &iter_key); + } else { + Slice buf(trace->payload); + GetFixed64(&buf, &trace->payload_map); + int64_t payload_map = static_cast(trace->payload_map); + while (payload_map) { + // Find the rightmost set bit. + uint32_t set_pos = + static_cast(log2(payload_map & -payload_map)); + switch (set_pos) { + case TracePayloadType::kIterCFID: { + GetFixed32(&buf, &cf_id); + break; + } + case TracePayloadType::kIterKey: { + GetLengthPrefixedSlice(&buf, &iter_key); + break; + } + case TracePayloadType::kIterLowerBound: { + GetLengthPrefixedSlice(&buf, &lower_bound); + break; + } + case TracePayloadType::kIterUpperBound: { + GetLengthPrefixedSlice(&buf, &upper_bound); + break; + } + default: { + assert(false); + } + } + // unset the rightmost bit. + payload_map &= (payload_map - 1); + } + } + + if (record != nullptr) { + PinnableSlice ps_key; + ps_key.PinSelf(iter_key); + PinnableSlice ps_lower; + ps_lower.PinSelf(lower_bound); + PinnableSlice ps_upper; + ps_upper.PinSelf(upper_bound); + record->reset(new IteratorSeekQueryTraceRecord( + static_cast(trace->type), + cf_id, std::move(ps_key), std::move(ps_lower), std::move(ps_upper), + trace->ts)); + } + + return Status::OK(); + } + // MultiGet + case kTraceMultiGet: { + if (trace_file_version < 2) { + return Status::Corruption("MultiGet is not supported."); + } + + uint32_t multiget_size = 0; + std::vector cf_ids; + std::vector multiget_keys; + + Slice cfids_payload; + Slice keys_payload; + Slice buf(trace->payload); + GetFixed64(&buf, &trace->payload_map); + int64_t payload_map = static_cast(trace->payload_map); + while (payload_map) { + // Find the rightmost set bit. + uint32_t set_pos = + static_cast(log2(payload_map & -payload_map)); + switch (set_pos) { + case TracePayloadType::kMultiGetSize: { + GetFixed32(&buf, &multiget_size); + break; + } + case TracePayloadType::kMultiGetCFIDs: { + GetLengthPrefixedSlice(&buf, &cfids_payload); + break; + } + case TracePayloadType::kMultiGetKeys: { + GetLengthPrefixedSlice(&buf, &keys_payload); + break; + } + default: { + assert(false); + } + } + // unset the rightmost bit. 
+ payload_map &= (payload_map - 1); + } + if (multiget_size == 0) { + return Status::InvalidArgument("Empty MultiGet cf_ids or keys."); + } + + // Decode the cfids_payload and keys_payload + cf_ids.reserve(multiget_size); + multiget_keys.reserve(multiget_size); + for (uint32_t i = 0; i < multiget_size; i++) { + uint32_t tmp_cfid = 0; + Slice tmp_key; + GetFixed32(&cfids_payload, &tmp_cfid); + GetLengthPrefixedSlice(&keys_payload, &tmp_key); + cf_ids.push_back(tmp_cfid); + Slice s(tmp_key); + PinnableSlice ps; + ps.PinSelf(s); + multiget_keys.push_back(std::move(ps)); + } + + if (record != nullptr) { + record->reset(new MultiGetQueryTraceRecord( + std::move(cf_ids), std::move(multiget_keys), trace->ts)); + } + + return Status::OK(); + } + default: + return Status::NotSupported("Unsupported trace type."); + } +} + +Tracer::Tracer(SystemClock* clock, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) + : clock_(clock), + trace_options_(trace_options), + trace_writer_(std::move(trace_writer)), + trace_request_count_(0) { + // TODO: What if this fails? + WriteHeader().PermitUncheckedError(); +} + +Tracer::~Tracer() { trace_writer_.reset(); } + +Status Tracer::Write(WriteBatch* write_batch) { + TraceType trace_type = kTraceWrite; + if (ShouldSkipTrace(trace_type)) { + return Status::OK(); + } + Trace trace; + trace.ts = clock_->NowMicros(); + trace.type = trace_type; + TracerHelper::SetPayloadMap(trace.payload_map, + TracePayloadType::kWriteBatchData); + PutFixed64(&trace.payload, trace.payload_map); + PutLengthPrefixedSlice(&trace.payload, Slice(write_batch->Data())); + return WriteTrace(trace); +} + +Status Tracer::Get(ColumnFamilyHandle* column_family, const Slice& key) { + TraceType trace_type = kTraceGet; + if (ShouldSkipTrace(trace_type)) { + return Status::OK(); + } + Trace trace; + trace.ts = clock_->NowMicros(); + trace.type = trace_type; + // Set the payloadmap of the struct member that will be encoded in the + // payload. + TracerHelper::SetPayloadMap(trace.payload_map, TracePayloadType::kGetCFID); + TracerHelper::SetPayloadMap(trace.payload_map, TracePayloadType::kGetKey); + // Encode the Get struct members into payload. Make sure add them in order. + PutFixed64(&trace.payload, trace.payload_map); + PutFixed32(&trace.payload, column_family->GetID()); + PutLengthPrefixedSlice(&trace.payload, key); + return WriteTrace(trace); +} + +Status Tracer::IteratorSeek(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, const Slice upper_bound) { + TraceType trace_type = kTraceIteratorSeek; + if (ShouldSkipTrace(trace_type)) { + return Status::OK(); + } + Trace trace; + trace.ts = clock_->NowMicros(); + trace.type = trace_type; + // Set the payloadmap of the struct member that will be encoded in the + // payload. + TracerHelper::SetPayloadMap(trace.payload_map, TracePayloadType::kIterCFID); + TracerHelper::SetPayloadMap(trace.payload_map, TracePayloadType::kIterKey); + if (lower_bound.size() > 0) { + TracerHelper::SetPayloadMap(trace.payload_map, + TracePayloadType::kIterLowerBound); + } + if (upper_bound.size() > 0) { + TracerHelper::SetPayloadMap(trace.payload_map, + TracePayloadType::kIterUpperBound); + } + // Encode the Iterator struct members into payload. Make sure add them in + // order. 
+ PutFixed64(&trace.payload, trace.payload_map); + PutFixed32(&trace.payload, cf_id); + PutLengthPrefixedSlice(&trace.payload, key); + if (lower_bound.size() > 0) { + PutLengthPrefixedSlice(&trace.payload, lower_bound); + } + if (upper_bound.size() > 0) { + PutLengthPrefixedSlice(&trace.payload, upper_bound); + } + return WriteTrace(trace); +} + +Status Tracer::IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound) { + TraceType trace_type = kTraceIteratorSeekForPrev; + if (ShouldSkipTrace(trace_type)) { + return Status::OK(); + } + Trace trace; + trace.ts = clock_->NowMicros(); + trace.type = trace_type; + // Set the payloadmap of the struct member that will be encoded in the + // payload. + TracerHelper::SetPayloadMap(trace.payload_map, TracePayloadType::kIterCFID); + TracerHelper::SetPayloadMap(trace.payload_map, TracePayloadType::kIterKey); + if (lower_bound.size() > 0) { + TracerHelper::SetPayloadMap(trace.payload_map, + TracePayloadType::kIterLowerBound); + } + if (upper_bound.size() > 0) { + TracerHelper::SetPayloadMap(trace.payload_map, + TracePayloadType::kIterUpperBound); + } + // Encode the Iterator struct members into payload. Make sure add them in + // order. + PutFixed64(&trace.payload, trace.payload_map); + PutFixed32(&trace.payload, cf_id); + PutLengthPrefixedSlice(&trace.payload, key); + if (lower_bound.size() > 0) { + PutLengthPrefixedSlice(&trace.payload, lower_bound); + } + if (upper_bound.size() > 0) { + PutLengthPrefixedSlice(&trace.payload, upper_bound); + } + return WriteTrace(trace); +} + +Status Tracer::MultiGet(const size_t num_keys, + ColumnFamilyHandle** column_families, + const Slice* keys) { + if (num_keys == 0) { + return Status::OK(); + } + std::vector v_column_families; + std::vector v_keys; + v_column_families.resize(num_keys); + v_keys.resize(num_keys); + for (size_t i = 0; i < num_keys; i++) { + v_column_families[i] = column_families[i]; + v_keys[i] = keys[i]; + } + return MultiGet(v_column_families, v_keys); +} + +Status Tracer::MultiGet(const size_t num_keys, + ColumnFamilyHandle* column_family, const Slice* keys) { + if (num_keys == 0) { + return Status::OK(); + } + std::vector column_families; + std::vector v_keys; + column_families.resize(num_keys); + v_keys.resize(num_keys); + for (size_t i = 0; i < num_keys; i++) { + column_families[i] = column_family; + v_keys[i] = keys[i]; + } + return MultiGet(column_families, v_keys); +} + +Status Tracer::MultiGet(const std::vector& column_families, + const std::vector& keys) { + if (column_families.size() != keys.size()) { + return Status::Corruption("the CFs size and keys size does not match!"); + } + TraceType trace_type = kTraceMultiGet; + if (ShouldSkipTrace(trace_type)) { + return Status::OK(); + } + uint32_t multiget_size = static_cast(keys.size()); + Trace trace; + trace.ts = clock_->NowMicros(); + trace.type = trace_type; + // Set the payloadmap of the struct member that will be encoded in the + // payload. 
+ TracerHelper::SetPayloadMap(trace.payload_map, + TracePayloadType::kMultiGetSize); + TracerHelper::SetPayloadMap(trace.payload_map, + TracePayloadType::kMultiGetCFIDs); + TracerHelper::SetPayloadMap(trace.payload_map, + TracePayloadType::kMultiGetKeys); + // Encode the CFIDs inorder + std::string cfids_payload; + std::string keys_payload; + for (uint32_t i = 0; i < multiget_size; i++) { + assert(i < column_families.size()); + assert(i < keys.size()); + PutFixed32(&cfids_payload, column_families[i]->GetID()); + PutLengthPrefixedSlice(&keys_payload, keys[i]); + } + // Encode the Get struct members into payload. Make sure add them in order. + PutFixed64(&trace.payload, trace.payload_map); + PutFixed32(&trace.payload, multiget_size); + PutLengthPrefixedSlice(&trace.payload, cfids_payload); + PutLengthPrefixedSlice(&trace.payload, keys_payload); + return WriteTrace(trace); +} + +bool Tracer::ShouldSkipTrace(const TraceType& trace_type) { + if (IsTraceFileOverMax()) { + return true; + } + + TraceFilterType filter_mask = kTraceFilterNone; + switch (trace_type) { + case kTraceNone: + case kTraceBegin: + case kTraceEnd: + filter_mask = kTraceFilterNone; + break; + case kTraceWrite: + filter_mask = kTraceFilterWrite; + break; + case kTraceGet: + filter_mask = kTraceFilterGet; + break; + case kTraceIteratorSeek: + filter_mask = kTraceFilterIteratorSeek; + break; + case kTraceIteratorSeekForPrev: + filter_mask = kTraceFilterIteratorSeekForPrev; + break; + case kBlockTraceIndexBlock: + case kBlockTraceFilterBlock: + case kBlockTraceDataBlock: + case kBlockTraceUncompressionDictBlock: + case kBlockTraceRangeDeletionBlock: + case kIOTracer: + filter_mask = kTraceFilterNone; + break; + case kTraceMultiGet: + filter_mask = kTraceFilterMultiGet; + break; + case kTraceMax: + assert(false); + filter_mask = kTraceFilterNone; + break; + } + if (filter_mask != kTraceFilterNone && trace_options_.filter & filter_mask) { + return true; + } + + ++trace_request_count_; + if (trace_request_count_ < trace_options_.sampling_frequency) { + return true; + } + trace_request_count_ = 0; + return false; +} + +bool Tracer::IsTraceFileOverMax() { + uint64_t trace_file_size = trace_writer_->GetFileSize(); + return (trace_file_size > trace_options_.max_trace_file_size); +} + +Status Tracer::WriteHeader() { + std::ostringstream s; + s << kTraceMagic << "\t" + << "Trace Version: " << kTraceFileMajorVersion << "." + << kTraceFileMinorVersion << "\t" + << "RocksDB Version: " << kMajorVersion << "." 
<< kMinorVersion << "\t"
+    << "Format: Timestamp OpType Payload\n";
+  std::string header(s.str());
+
+  Trace trace;
+  trace.ts = clock_->NowMicros();
+  trace.type = kTraceBegin;
+  trace.payload = header;
+  return WriteTrace(trace);
+}
+
+Status Tracer::WriteFooter() {
+  Trace trace;
+  trace.ts = clock_->NowMicros();
+  trace.type = kTraceEnd;
+  TracerHelper::SetPayloadMap(trace.payload_map,
+                              TracePayloadType::kEmptyPayload);
+  trace.payload = "";
+  return WriteTrace(trace);
+}
+
+Status Tracer::WriteTrace(const Trace& trace) {
+  std::string encoded_trace;
+  TracerHelper::EncodeTrace(trace, &encoded_trace);
+  return trace_writer_->Write(Slice(encoded_trace));
+}
+
+Status Tracer::Close() { return WriteFooter(); }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_replay.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_replay.h
new file mode 100644
index 0000000000..9aba5ceb72
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/trace_replay/trace_replay.h
@@ -0,0 +1,183 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/options.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_record.h"
+#include "rocksdb/utilities/replayer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains Tracer and Replayer classes that enable capturing and
+// replaying RocksDB traces.
+
+class ColumnFamilyHandle;
+class ColumnFamilyData;
+class DB;
+class DBImpl;
+class Env;
+class Slice;
+class SystemClock;
+class TraceReader;
+class TraceWriter;
+class WriteBatch;
+
+struct ReadOptions;
+struct TraceOptions;
+struct WriteOptions;
+
+extern const std::string kTraceMagic;
+const unsigned int kTraceTimestampSize = 8;
+const unsigned int kTraceTypeSize = 1;
+const unsigned int kTracePayloadLengthSize = 4;
+const unsigned int kTraceMetadataSize =
+    kTraceTimestampSize + kTraceTypeSize + kTracePayloadLengthSize;
+
+static const int kTraceFileMajorVersion = 0;
+static const int kTraceFileMinorVersion = 2;
+
+// The data structure that defines a single trace.
+struct Trace {
+  uint64_t ts;  // timestamp
+  TraceType type;
+  // Each bit in payload_map records whether the corresponding struct member
+  // is added in the payload. Each TraceType has its corresponding payload
+  // struct. For example, if the bit at position 0 is set in a write payload,
+  // then the write batch will be added.
+  uint64_t payload_map = 0;
+  // Each trace type has its own payload struct, which will be serialized in
+  // the payload.
+  std::string payload;
+
+  void reset() {
+    ts = 0;
+    type = kTraceMax;
+    payload_map = 0;
+    payload.clear();
+  }
+};
+
+enum TracePayloadType : char {
+  // Each member of all query payload structs should have a corresponding flag
+  // here. Make sure to add them sequentially, in the order they are added.
+ kEmptyPayload = 0, + kWriteBatchData = 1, + kGetCFID = 2, + kGetKey = 3, + kIterCFID = 4, + kIterKey = 5, + kIterLowerBound = 6, + kIterUpperBound = 7, + kMultiGetSize = 8, + kMultiGetCFIDs = 9, + kMultiGetKeys = 10, +}; + +class TracerHelper { + public: + // Parse the string with major and minor version only + static Status ParseVersionStr(std::string& v_string, int* v_num); + + // Parse the trace file version and db version in trace header + static Status ParseTraceHeader(const Trace& header, int* trace_version, + int* db_version); + + // Encode a version 0.1 trace object into the given string. + static void EncodeTrace(const Trace& trace, std::string* encoded_trace); + + // Decode a string into the given trace object. + static Status DecodeTrace(const std::string& encoded_trace, Trace* trace); + + // Decode a string into the given trace header. + static Status DecodeHeader(const std::string& encoded_trace, Trace* header); + + // Set the payload map based on the payload type + static bool SetPayloadMap(uint64_t& payload_map, + const TracePayloadType payload_type); + + // Decode a Trace object into the corresponding TraceRecord. + // Return Status::OK() if nothing is wrong, record will be set accordingly. + // Return Status::NotSupported() if the trace type is not support, or the + // corresponding error status, record will be set to nullptr. + static Status DecodeTraceRecord(Trace* trace, int trace_file_version, + std::unique_ptr* record); +}; + +// Tracer captures all RocksDB operations using a user-provided TraceWriter. +// Every RocksDB operation is written as a single trace. Each trace will have a +// timestamp and type, followed by the trace payload. +class Tracer { + public: + Tracer(SystemClock* clock, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer); + ~Tracer(); + + // Trace all write operations -- Put, Merge, Delete, SingleDelete, Write + Status Write(WriteBatch* write_batch); + + // Trace Get operations. + Status Get(ColumnFamilyHandle* cfname, const Slice& key); + + // Trace Iterators. + Status IteratorSeek(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, const Slice upper_bound); + Status IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, const Slice upper_bound); + + // Trace MultiGet + + Status MultiGet(const size_t num_keys, ColumnFamilyHandle** column_families, + const Slice* keys); + + Status MultiGet(const size_t num_keys, ColumnFamilyHandle* column_family, + const Slice* keys); + + Status MultiGet(const std::vector& column_family, + const std::vector& keys); + + // Returns true if the trace is over the configured max trace file limit. + // False otherwise. + bool IsTraceFileOverMax(); + + // Returns true if the order of write trace records must match the order of + // the corresponding records logged to WAL and applied to the DB. + bool IsWriteOrderPreserved() { return trace_options_.preserve_write_order; } + + // Writes a trace footer at the end of the tracing + Status Close(); + + private: + // Write a trace header at the beginning, typically on initiating a trace, + // with some metadata like a magic number, trace version, RocksDB version, and + // trace format. + Status WriteHeader(); + + // Write a trace footer, typically on ending a trace, with some metadata. + Status WriteFooter(); + + // Write a single trace using the provided TraceWriter to the underlying + // system, say, a filesystem or a streaming service. 
+  Status WriteTrace(const Trace& trace);
+
+  // Helps in filtering and sampling of traces.
+  // Returns true if a trace should be skipped, false otherwise.
+  bool ShouldSkipTrace(const TraceType& type);
+
+  SystemClock* clock_;
+  TraceOptions trace_options_;
+  std::unique_ptr<TraceWriter> trace_writer_;
+  uint64_t trace_request_count_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/aligned_buffer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/aligned_buffer.h
new file mode 100644
index 0000000000..acab56c215
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/aligned_buffer.h
@@ -0,0 +1,235 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains utilities to handle the alignment of pages and buffers.
+
+// Truncate to a multiple of page_size, which is also a page boundary. This
+// helps in figuring out the right alignment.
+// Example:
+//   TruncateToPageBoundary(4096, 5000)  => 4096
+//   TruncateToPageBoundary(4096, 10000) => 8192
+inline size_t TruncateToPageBoundary(size_t page_size, size_t s) {
+  s -= (s & (page_size - 1));
+  assert((s % page_size) == 0);
+  return s;
+}
+
+// Round up x to a multiple of y.
+// Example:
+//   Roundup(13, 5)   => 15
+//   Roundup(201, 16) => 208
+inline size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
+
+// Round down x to a multiple of y.
+// Example:
+//   Rounddown(13, 5)   => 10
+//   Rounddown(201, 16) => 192
+inline size_t Rounddown(size_t x, size_t y) { return (x / y) * y; }
+
+// AlignedBuffer manages a buffer by taking alignment into consideration, and
+// aligns the buffer start and end positions. It is mainly used for direct I/O,
+// though it can be used for other purposes as well.
+// It also supports expanding the managed buffer, and copying whole or part of
+// the data from the old buffer into the new expanded buffer. Such a copy
+// especially helps avoid an IO to re-fetch the data from disk.
+//
+// Example:
+//   AlignedBuffer buf;
+//   buf.Alignment(alignment);
+//   buf.AllocateNewBuffer(user_requested_buf_size);
+//   ...
+// buf.AllocateNewBuffer(2*user_requested_buf_size, /*copy_data*/ true, +// copy_offset, copy_len); +class AlignedBuffer { + size_t alignment_; + std::unique_ptr buf_; + size_t capacity_; + size_t cursize_; + char* bufstart_; + + public: + AlignedBuffer() + : alignment_(), capacity_(0), cursize_(0), bufstart_(nullptr) {} + + AlignedBuffer(AlignedBuffer&& o) noexcept { *this = std::move(o); } + + AlignedBuffer& operator=(AlignedBuffer&& o) noexcept { + alignment_ = std::move(o.alignment_); + buf_ = std::move(o.buf_); + capacity_ = std::move(o.capacity_); + cursize_ = std::move(o.cursize_); + bufstart_ = std::move(o.bufstart_); + return *this; + } + + AlignedBuffer(const AlignedBuffer&) = delete; + + AlignedBuffer& operator=(const AlignedBuffer&) = delete; + + static bool isAligned(const void* ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; + } + + static bool isAligned(size_t n, size_t alignment) { + return n % alignment == 0; + } + + size_t Alignment() const { return alignment_; } + + size_t Capacity() const { return capacity_; } + + size_t CurrentSize() const { return cursize_; } + + const char* BufferStart() const { return bufstart_; } + + char* BufferStart() { return bufstart_; } + + void Clear() { cursize_ = 0; } + + char* Release() { + cursize_ = 0; + capacity_ = 0; + bufstart_ = nullptr; + return buf_.release(); + } + + void Alignment(size_t alignment) { + assert(alignment > 0); + assert((alignment & (alignment - 1)) == 0); + alignment_ = alignment; + } + + // Allocates a new buffer and sets the start position to the first aligned + // byte. + // + // requested_capacity: requested new buffer capacity. This capacity will be + // rounded up based on alignment. + // copy_data: Copy data from old buffer to new buffer. If copy_offset and + // copy_len are not passed in and the new requested capacity is bigger + // than the existing buffer's capacity, the data in the exising buffer is + // fully copied over to the new buffer. + // copy_offset: Copy data from this offset in old buffer. + // copy_len: Number of bytes to copy. + // + // The function does nothing if the new requested_capacity is smaller than + // the current buffer capacity and copy_data is true i.e. the old buffer is + // retained as is. + void AllocateNewBuffer(size_t requested_capacity, bool copy_data = false, + uint64_t copy_offset = 0, size_t copy_len = 0) { + assert(alignment_ > 0); + assert((alignment_ & (alignment_ - 1)) == 0); + + copy_len = copy_len > 0 ? copy_len : cursize_; + if (copy_data && requested_capacity < copy_len) { + // If we are downsizing to a capacity that is smaller than the current + // data in the buffer -- Ignore the request. + return; + } + + size_t new_capacity = Roundup(requested_capacity, alignment_); + char* new_buf = new char[new_capacity + alignment_]; + char* new_bufstart = reinterpret_cast( + (reinterpret_cast(new_buf) + (alignment_ - 1)) & + ~static_cast(alignment_ - 1)); + + if (copy_data) { + assert(bufstart_ + copy_offset + copy_len <= bufstart_ + cursize_); + memcpy(new_bufstart, bufstart_ + copy_offset, copy_len); + cursize_ = copy_len; + } else { + cursize_ = 0; + } + + bufstart_ = new_bufstart; + capacity_ = new_capacity; + buf_.reset(new_buf); + } + + // Append to the buffer. + // + // src : source to copy the data from. + // append_size : number of bytes to copy from src. + // Returns the number of bytes appended. + // + // If append_size is more than the remaining buffer size only the + // remaining-size worth of bytes are copied. 
+ size_t Append(const char* src, size_t append_size) { + size_t buffer_remaining = capacity_ - cursize_; + size_t to_copy = std::min(append_size, buffer_remaining); + + if (to_copy > 0) { + memcpy(bufstart_ + cursize_, src, to_copy); + cursize_ += to_copy; + } + return to_copy; + } + + // Read from the buffer. + // + // dest : destination buffer to copy the data to. + // offset : the buffer offset to start reading from. + // read_size : the number of bytes to copy from the buffer to dest. + // Returns the number of bytes read/copied to dest. + size_t Read(char* dest, size_t offset, size_t read_size) const { + assert(offset < cursize_); + + size_t to_read = 0; + if (offset < cursize_) { + to_read = std::min(cursize_ - offset, read_size); + } + if (to_read > 0) { + memcpy(dest, bufstart_ + offset, to_read); + } + return to_read; + } + + // Pad to the end of alignment with "padding" + void PadToAlignmentWith(int padding) { + size_t total_size = Roundup(cursize_, alignment_); + size_t pad_size = total_size - cursize_; + + if (pad_size > 0) { + assert((pad_size + cursize_) <= capacity_); + memset(bufstart_ + cursize_, padding, pad_size); + cursize_ += pad_size; + } + } + + void PadWith(size_t pad_size, int padding) { + assert((pad_size + cursize_) <= capacity_); + memset(bufstart_ + cursize_, padding, pad_size); + cursize_ += pad_size; + } + + // After a partial flush move the tail to the beginning of the buffer. + void RefitTail(size_t tail_offset, size_t tail_size) { + if (tail_size > 0) { + memmove(bufstart_, bufstart_ + tail_offset, tail_size); + } + cursize_ = tail_size; + } + + // Returns a place to start appending. + // WARNING: Note that it is possible to write past the end of the buffer if + // the buffer is modified without using the write APIs or encapsulation + // offered by AlignedBuffer. It is up to the user to guard against such + // errors. + char* Destination() { return bufstart_ + cursize_; } + + void Size(size_t cursize) { cursize_ = cursize; } +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/async_file_reader.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/async_file_reader.cc new file mode 100644 index 0000000000..080c1ae966 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/async_file_reader.cc @@ -0,0 +1,81 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
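Returning to the AlignedBuffer added above: a short usage sketch (illustrative, not vendored; `src` is a caller-provided pointer with at least 5000 readable bytes) showing how 5000 bytes are staged for a direct-I/O style write with 4096-byte alignment. The sizes follow directly from the Roundup semantics in the header:

    #include <cassert>
    #include "util/aligned_buffer.h"  // the vendored header added above

    void StageForDirectIO(const char* src) {
      ROCKSDB_NAMESPACE::AlignedBuffer buf;
      buf.Alignment(4096);          // must be a power of two
      buf.AllocateNewBuffer(5000);  // capacity rounds up to 8192
      size_t copied = buf.Append(src, 5000);
      assert(copied == 5000);
      buf.PadToAlignmentWith(0);    // zero-fill up to the 8192 boundary
      // buf.BufferStart() is aligned and buf.CurrentSize() is now 8192.
    }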
+//
+#if USE_COROUTINES
+#include "util/async_file_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+bool AsyncFileReader::MultiReadAsyncImpl(ReadAwaiter* awaiter) {
+  if (tail_) {
+    tail_->next_ = awaiter;
+  }
+  tail_ = awaiter;
+  if (!head_) {
+    head_ = awaiter;
+  }
+  num_reqs_ += awaiter->num_reqs_;
+  awaiter->io_handle_.resize(awaiter->num_reqs_);
+  awaiter->del_fn_.resize(awaiter->num_reqs_);
+  for (size_t i = 0; i < awaiter->num_reqs_; ++i) {
+    IOStatus s = awaiter->file_->ReadAsync(
+        awaiter->read_reqs_[i], awaiter->opts_,
+        [](const FSReadRequest& req, void* cb_arg) {
+          FSReadRequest* read_req = static_cast<FSReadRequest*>(cb_arg);
+          read_req->status = req.status;
+          read_req->result = req.result;
+        },
+        &awaiter->read_reqs_[i], &awaiter->io_handle_[i], &awaiter->del_fn_[i],
+        /*aligned_buf=*/nullptr);
+    if (!s.ok()) {
+      // For any non-ok status, the FileSystem will not call the callback,
+      // so let's update the status ourselves.
+      awaiter->read_reqs_[i].status = s;
+    }
+  }
+  return true;
+}
+
+void AsyncFileReader::Wait() {
+  if (!head_) {
+    return;
+  }
+  ReadAwaiter* waiter;
+  std::vector<void*> io_handles;
+  IOStatus s;
+  io_handles.reserve(num_reqs_);
+  waiter = head_;
+  do {
+    for (size_t i = 0; i < waiter->num_reqs_; ++i) {
+      if (waiter->io_handle_[i]) {
+        io_handles.push_back(waiter->io_handle_[i]);
+      }
+    }
+  } while (waiter != tail_ && (waiter = waiter->next_));
+  if (io_handles.size() > 0) {
+    StopWatch sw(SystemClock::Default().get(), stats_, POLL_WAIT_MICROS);
+    s = fs_->Poll(io_handles, io_handles.size());
+  }
+  do {
+    waiter = head_;
+    head_ = waiter->next_;
+
+    for (size_t i = 0; i < waiter->num_reqs_; ++i) {
+      if (waiter->io_handle_[i] && waiter->del_fn_[i]) {
+        waiter->del_fn_[i](waiter->io_handle_[i]);
+      }
+      if (waiter->read_reqs_[i].status.ok() && !s.ok()) {
+        // Override the request status with the Poll error
+        waiter->read_reqs_[i].status = s;
+      }
+    }
+    waiter->awaiting_coro_.resume();
+  } while (waiter != tail_);
+  head_ = tail_ = nullptr;
+  RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_reqs_);
+  num_reqs_ = 0;
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // USE_COROUTINES
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/async_file_reader.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/async_file_reader.h
new file mode 100644
index 0000000000..df69a840eb
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/async_file_reader.h
@@ -0,0 +1,144 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#if USE_COROUTINES
+#include "file/random_access_file_reader.h"
+#include "folly/experimental/coro/ViaIfAsync.h"
+#include "port/port.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/statistics.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SingleThreadExecutor;
+
+// AsyncFileReader implements the Awaitable concept, which allows calling
+// coroutines to co_await it. When the AsyncFileReader Awaitable is
+// resumed, it initiates the file reads requested by the awaiting caller
+// by calling RandomAccessFileReader's ReadAsync. It then suspends the
+// awaiting coroutine. The suspended awaiter is later resumed by Wait().
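+// [Editorial aside, not upstream code: stripped of the folly plumbing, the
+//  Awaitable protocol the class below implements has this minimal C++20
+//  shape (illustrative sketch only):
+//
+//    struct MiniAwaiter {
+//      bool await_ready() noexcept { return false; }  // always suspend
+//      bool await_suspend(std::coroutine_handle<> h) noexcept {
+//        handle_ = h;  // remember the caller so it can be resumed later
+//        return true;  // true => remain suspended
+//      }
+//      void await_resume() noexcept {}
+//      std::coroutine_handle<> handle_;
+//    };
+//
+//  ReadAwaiter plays this role here, with Wait() performing the deferred
+//  resume via awaiting_coro_.resume().]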
+class AsyncFileReader { + class ReadAwaiter; + template + class ReadOperation; + + public: + AsyncFileReader(FileSystem* fs, Statistics* stats) : fs_(fs), stats_(stats) {} + + ~AsyncFileReader() {} + + ReadOperation MultiReadAsync(RandomAccessFileReader* file, + const IOOptions& opts, + FSReadRequest* read_reqs, + size_t num_reqs, + AlignedBuf* aligned_buf) noexcept { + return ReadOperation{*this, file, opts, + read_reqs, num_reqs, aligned_buf}; + } + + private: + friend SingleThreadExecutor; + + // Implementation of the Awaitable concept + class ReadAwaiter { + public: + explicit ReadAwaiter(AsyncFileReader& reader, RandomAccessFileReader* file, + const IOOptions& opts, FSReadRequest* read_reqs, + size_t num_reqs, AlignedBuf* /*aligned_buf*/) noexcept + : reader_(reader), + file_(file), + opts_(opts), + read_reqs_(read_reqs), + num_reqs_(num_reqs), + next_(nullptr) {} + + bool await_ready() noexcept { return false; } + + // A return value of true means suspend the awaiter (calling coroutine). The + // awaiting_coro parameter is the handle of the awaiter. The handle can be + // resumed later, so we cache it here. + bool await_suspend( + folly::coro::impl::coroutine_handle<> awaiting_coro) noexcept { + awaiting_coro_ = awaiting_coro; + // MultiReadAsyncImpl always returns true, so caller will be suspended + return reader_.MultiReadAsyncImpl(this); + } + + void await_resume() noexcept {} + + private: + friend AsyncFileReader; + + // The parameters passed to MultiReadAsync are cached here when the caller + // calls MultiReadAsync. Later, when the execution of this awaitable is + // started, these are used to do the actual IO + AsyncFileReader& reader_; + RandomAccessFileReader* file_; + const IOOptions& opts_; + FSReadRequest* read_reqs_; + size_t num_reqs_; + autovector io_handle_; + autovector del_fn_; + folly::coro::impl::coroutine_handle<> awaiting_coro_; + // Use this to link to the next ReadAwaiter in the suspended coroutine + // list. The head and tail of the list are tracked by AsyncFileReader. + // We use this approach rather than an STL container in order to avoid + // extra memory allocations. The coroutine call already allocates a + // ReadAwaiter object. + ReadAwaiter* next_; + }; + + // An instance of ReadOperation is returned to the caller of MultiGetAsync. + // This represents an awaitable that can be started later. + template + class ReadOperation { + public: + explicit ReadOperation(AsyncFileReader& reader, + RandomAccessFileReader* file, const IOOptions& opts, + FSReadRequest* read_reqs, size_t num_reqs, + AlignedBuf* aligned_buf) noexcept + : reader_(reader), + file_(file), + opts_(opts), + read_reqs_(read_reqs), + num_reqs_(num_reqs), + aligned_buf_(aligned_buf) {} + + auto viaIfAsync(folly::Executor::KeepAlive<> executor) const { + return folly::coro::co_viaIfAsync( + std::move(executor), + Awaiter{reader_, file_, opts_, read_reqs_, num_reqs_, aligned_buf_}); + } + + private: + AsyncFileReader& reader_; + RandomAccessFileReader* file_; + const IOOptions& opts_; + FSReadRequest* read_reqs_; + size_t num_reqs_; + AlignedBuf* aligned_buf_; + }; + + // This function does the actual work when this awaitable starts execution + bool MultiReadAsyncImpl(ReadAwaiter* awaiter); + + // Called by the SingleThreadExecutor to poll for async IO completion. + // This also resumes the awaiting coroutines. 
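+  // [Editorial aside, not upstream code: the head_/tail_/next_ fields below
+  //  form an intrusive FIFO; appending an awaiter, as MultiReadAsyncImpl
+  //  does, needs no allocation at all:
+  //
+  //    if (tail_) tail_->next_ = awaiter;
+  //    tail_ = awaiter;
+  //    if (!head_) head_ = awaiter;
+  //
+  //  The nodes are the ReadAwaiter objects themselves, which the coroutine
+  //  frames already own, hence the "no extra memory allocations" note
+  //  above. Wait(), declared next, walks this list and resumes each
+  //  suspended coroutine.]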
+ void Wait(); + + // Head of the queue of awaiters waiting for async IO completion + ReadAwaiter* head_ = nullptr; + // Tail of the awaiter queue + ReadAwaiter* tail_ = nullptr; + // Total number of pending async IOs + size_t num_reqs_ = 0; + FileSystem* fs_; + Statistics* stats_; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // USE_COROUTINES diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/autovector.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/autovector.h new file mode 100644 index 0000000000..79ee5de572 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/autovector.h @@ -0,0 +1,393 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "port/lang.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// A vector that leverages pre-allocated stack-based array to achieve better +// performance for array with small amount of items. +// +// The interface resembles that of vector, but with less features since we aim +// to solve the problem that we have in hand, rather than implementing a +// full-fledged generic container. +// +// Currently we don't support: +// * shrink_to_fit() +// If used correctly, in most cases, people should not touch the +// underlying vector at all. +// * random insert()/erase(), please only use push_back()/pop_back(). +// * No move/swap operations. Each autovector instance has a +// stack-allocated array and if we want support move/swap operations, we +// need to copy the arrays other than just swapping the pointers. In this +// case we'll just explicitly forbid these operations since they may +// lead users to make false assumption by thinking they are inexpensive +// operations. +// +// Naming style of public methods almost follows that of the STL's. +template +class autovector { + public: + // General STL-style container member types. 
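+  // [Editorial aside, not upstream code: the container's core indexing rule
+  //  is worth stating up front. Element i lives in the in-object array when
+  //  i < kSize and in the overflow std::vector otherwise:
+  //
+  //    reference operator[](size_type n) {
+  //      return n < kSize ? values_[n] : vect_[n - kSize];
+  //    }
+  //
+  //  e.g. with kSize == 8, index 10 resolves to vect_[2]. The real
+  //  operator[], defined further down, adds an assert(n < size()).]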
+ using value_type = T; + using difference_type = typename std::vector::difference_type; + using size_type = typename std::vector::size_type; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = value_type*; + using const_pointer = const value_type*; + + // This class is the base for regular/const iterator + template + class iterator_impl { + public: + // -- iterator traits + using self_type = iterator_impl; + using value_type = TValueType; + using reference = TValueType&; + using pointer = TValueType*; + using difference_type = typename TAutoVector::difference_type; + using iterator_category = std::random_access_iterator_tag; + + iterator_impl(TAutoVector* vect, size_t index) + : vect_(vect), index_(index){}; + iterator_impl(const iterator_impl&) = default; + ~iterator_impl() {} + iterator_impl& operator=(const iterator_impl&) = default; + + // -- Advancement + // ++iterator + self_type& operator++() { + ++index_; + return *this; + } + + // iterator++ + self_type operator++(int) { + auto old = *this; + ++index_; + return old; + } + + // --iterator + self_type& operator--() { + --index_; + return *this; + } + + // iterator-- + self_type operator--(int) { + auto old = *this; + --index_; + return old; + } + + self_type operator-(difference_type len) const { + return self_type(vect_, index_ - len); + } + + difference_type operator-(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ - other.index_; + } + + self_type operator+(difference_type len) const { + return self_type(vect_, index_ + len); + } + + self_type& operator+=(difference_type len) { + index_ += len; + return *this; + } + + self_type& operator-=(difference_type len) { + index_ -= len; + return *this; + } + + // -- Reference + reference operator*() const { + assert(vect_->size() >= index_); + return (*vect_)[index_]; + } + + pointer operator->() const { + assert(vect_->size() >= index_); + return &(*vect_)[index_]; + } + + reference operator[](difference_type len) const { return *(*this + len); } + + // -- Logical Operators + bool operator==(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ == other.index_; + } + + bool operator!=(const self_type& other) const { return !(*this == other); } + + bool operator>(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ > other.index_; + } + + bool operator<(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ < other.index_; + } + + bool operator>=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ >= other.index_; + } + + bool operator<=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ <= other.index_; + } + + private: + TAutoVector* vect_ = nullptr; + size_t index_ = 0; + }; + + using iterator = iterator_impl; + using const_iterator = iterator_impl; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + + autovector() : values_(reinterpret_cast(buf_)) {} + + autovector(std::initializer_list init_list) + : values_(reinterpret_cast(buf_)) { + for (const T& item : init_list) { + push_back(item); + } + } + + ~autovector() { clear(); } + + // -- Immutable operations + // Indicate if all data resides in in-stack data structure. + bool only_in_stack() const { + // If no element was inserted at all, the vector's capacity will be `0`. 
+ return vect_.capacity() == 0; + } + + size_type size() const { return num_stack_items_ + vect_.size(); } + + // resize does not guarantee anything about the contents of the newly + // available elements + void resize(size_type n) { + if (n > kSize) { + vect_.resize(n - kSize); + while (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_++])) value_type(); + } + num_stack_items_ = kSize; + } else { + vect_.clear(); + while (num_stack_items_ < n) { + new ((void*)(&values_[num_stack_items_++])) value_type(); + } + while (num_stack_items_ > n) { + values_[--num_stack_items_].~value_type(); + } + } + } + + bool empty() const { return size() == 0; } + + size_type capacity() const { return kSize + vect_.capacity(); } + + void reserve(size_t cap) { + if (cap > kSize) { + vect_.reserve(cap - kSize); + } + + assert(cap <= capacity()); + } + + const_reference operator[](size_type n) const { + assert(n < size()); + if (n < kSize) { + return values_[n]; + } + return vect_[n - kSize]; + } + + reference operator[](size_type n) { + assert(n < size()); + if (n < kSize) { + return values_[n]; + } + return vect_[n - kSize]; + } + + const_reference at(size_type n) const { + assert(n < size()); + return (*this)[n]; + } + + reference at(size_type n) { + assert(n < size()); + return (*this)[n]; + } + + reference front() { + assert(!empty()); + return *begin(); + } + + const_reference front() const { + assert(!empty()); + return *begin(); + } + + reference back() { + assert(!empty()); + return *(end() - 1); + } + + const_reference back() const { + assert(!empty()); + return *(end() - 1); + } + + // -- Mutable Operations + void push_back(T&& item) { + if (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_])) value_type(); + values_[num_stack_items_++] = std::move(item); + } else { + vect_.push_back(item); + } + } + + void push_back(const T& item) { + if (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_])) value_type(); + values_[num_stack_items_++] = item; + } else { + vect_.push_back(item); + } + } + + template +#if _LIBCPP_STD_VER > 14 + reference emplace_back(Args&&... args) { + if (num_stack_items_ < kSize) { + return *(new ((void*)(&values_[num_stack_items_++])) + value_type(std::forward(args)...)); + } else { + return vect_.emplace_back(std::forward(args)...); + } + } +#else + void emplace_back(Args&&... 
args) { + if (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_++])) + value_type(std::forward(args)...); + } else { + vect_.emplace_back(std::forward(args)...); + } + } +#endif + + void pop_back() { + assert(!empty()); + if (!vect_.empty()) { + vect_.pop_back(); + } else { + values_[--num_stack_items_].~value_type(); + } + } + + void clear() { + while (num_stack_items_ > 0) { + values_[--num_stack_items_].~value_type(); + } + vect_.clear(); + } + + // -- Copy and Assignment + autovector& assign(const autovector& other); + + autovector(const autovector& other) { assign(other); } + + autovector& operator=(const autovector& other) { return assign(other); } + + autovector(autovector&& other) noexcept { *this = std::move(other); } + autovector& operator=(autovector&& other); + + // -- Iterator Operations + iterator begin() { return iterator(this, 0); } + + const_iterator begin() const { return const_iterator(this, 0); } + + iterator end() { return iterator(this, this->size()); } + + const_iterator end() const { return const_iterator(this, this->size()); } + + reverse_iterator rbegin() { return reverse_iterator(end()); } + + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + + reverse_iterator rend() { return reverse_iterator(begin()); } + + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + private: + size_type num_stack_items_ = 0; // current number of items + alignas(alignof( + value_type)) char buf_[kSize * + sizeof(value_type)]; // the first `kSize` items + pointer values_; + // used only if there are more than `kSize` items. + std::vector vect_; +}; + +template +autovector& autovector::assign( + const autovector& other) { + values_ = reinterpret_cast(buf_); + // copy the internal vector + vect_.assign(other.vect_.begin(), other.vect_.end()); + + // copy array + num_stack_items_ = other.num_stack_items_; + std::copy(other.values_, other.values_ + num_stack_items_, values_); + + return *this; +} + +template +autovector& autovector::operator=( + autovector&& other) { + values_ = reinterpret_cast(buf_); + vect_ = std::move(other.vect_); + size_t n = other.num_stack_items_; + num_stack_items_ = n; + other.num_stack_items_ = 0; + for (size_t i = 0; i < n; ++i) { + values_[i] = std::move(other.values_[i]); + } + return *this; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/bloom_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/bloom_impl.h new file mode 100644 index 0000000000..53b70aa681 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/bloom_impl.h @@ -0,0 +1,489 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Implementation details of various Bloom filter implementations used in +// RocksDB. (DynamicBloom is in a separate file for now because it +// supports concurrent write.) + +#pragma once +#include +#include + +#include + +#include "port/port.h" // for PREFETCH +#include "rocksdb/slice.h" +#include "util/hash.h" + +#ifdef __AVX2__ +#include +#endif + +namespace ROCKSDB_NAMESPACE { + +class BloomMath { + public: + // False positive rate of a standard Bloom filter, for given ratio of + // filter memory bits to added keys, and number of probes per operation. 
+  // (The false positive rate is effectively independent of scale, assuming
+  // the implementation scales OK.)
+  static double StandardFpRate(double bits_per_key, int num_probes) {
+    // Standard very-good-estimate formula. See
+    // https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
+    return std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes);
+  }
+
+  // False positive rate of a "blocked"/"sharded"/"cache-local" Bloom filter,
+  // for given ratio of filter memory bits to added keys, number of probes per
+  // operation (all within the given block or cache line size), and block or
+  // cache line size.
+  static double CacheLocalFpRate(double bits_per_key, int num_probes,
+                                 int cache_line_bits) {
+    if (bits_per_key <= 0.0) {
+      // Fix a discontinuity
+      return 1.0;
+    }
+    double keys_per_cache_line = cache_line_bits / bits_per_key;
+    // A reasonable estimate is the average of the FP rates for one standard
+    // deviation above and below the mean bucket occupancy. See
+    // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#the-math
+    double keys_stddev = std::sqrt(keys_per_cache_line);
+    double crowded_fp = StandardFpRate(
+        cache_line_bits / (keys_per_cache_line + keys_stddev), num_probes);
+    double uncrowded_fp = StandardFpRate(
+        cache_line_bits / (keys_per_cache_line - keys_stddev), num_probes);
+    return (crowded_fp + uncrowded_fp) / 2;
+  }
+
+  // False positive rate of querying a new item against `num_keys` items, all
+  // hashed to `fingerprint_bits` bits. (This assumes the fingerprint hashes
+  // themselves are stored losslessly. See Section 4 of
+  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
+  static double FingerprintFpRate(size_t num_keys, int fingerprint_bits) {
+    double inv_fingerprint_space = std::pow(0.5, fingerprint_bits);
+    // Base estimate assumes each key maps to a unique fingerprint.
+    // Could be > 1 in extreme cases.
+    double base_estimate = num_keys * inv_fingerprint_space;
+    // To account for potential overlap, we choose between two formulas
+    if (base_estimate > 0.0001) {
+      // A very good formula assuming we don't construct a floating point
+      // number extremely close to 1. Always produces a probability < 1.
+      return 1.0 - std::exp(-base_estimate);
+    } else {
+      // A very good formula when base_estimate is far below 1. (Subtract
+      // away the integral-approximated sum that some key has same hash as
+      // one coming before it in a list.)
+      return base_estimate - (base_estimate * base_estimate * 0.5);
+    }
+  }
+
+  // Returns the probability of either of two independent(-ish) events
+  // happening, given their probabilities. (This is useful for combining
+  // results from StandardFpRate or CacheLocalFpRate with FingerprintFpRate
+  // for a hash-efficient Bloom filter's FP rate. See Section 4 of
+  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
+  static double IndependentProbabilitySum(double rate1, double rate2) {
+    // Use formula that avoids floating point extremely close to 1 if
+    // rates are extremely small.
+    return rate1 + rate2 - (rate1 * rate2);
+  }
+};
+
+// A fast, flexible, and accurate cache-local Bloom implementation with
+// SIMD-optimized query performance (currently using AVX2 on Intel). Write
+// performance and non-SIMD read are very good, benefiting from FastRange32
+// used in place of % and single-cycle multiplication on recent processors.
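+// [Editorial aside, not upstream code: plugging typical numbers into the
+//  BloomMath helpers above. At 10 bits/key with 6 probes,
+//
+//    StandardFpRate(10.0, 6) == pow(1 - exp(-0.6), 6) ~= 0.0084  (0.84%)
+//
+//  and the overall estimate for a cache-local filter combines the two
+//  error sources:
+//
+//    IndependentProbabilitySum(CacheLocalFpRate(10.0, 6, 512),
+//                              FingerprintFpRate(num_keys, 32));
+//
+//  which is exactly the shape of FastLocalBloomImpl::EstimatedFpRate
+//  defined below.]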
+// +// Most other SIMD Bloom implementations sacrifice flexibility and/or +// accuracy by requiring num_probes to be a power of two and restricting +// where each probe can occur in a cache line. This implementation sacrifices +// SIMD-optimization for add (might still be possible, especially with AVX512) +// in favor of allowing any num_probes, not crossing cache line boundary, +// and accuracy close to theoretical best accuracy for a cache-local Bloom. +// E.g. theoretical best for 10 bits/key, num_probes=6, and 512-bit bucket +// (Intel cache line size) is 0.9535% FP rate. This implementation yields +// about 0.957%. (Compare to LegacyLocalityBloomImpl at 1.138%, or +// about 0.951% for 1024-bit buckets, cache line size for some ARM CPUs.) +// +// This implementation can use a 32-bit hash (let h2 be h1 * 0x9e3779b9) or +// a 64-bit hash (split into two uint32s). With many millions of keys, the +// false positive rate associated with using a 32-bit hash can dominate the +// false positive rate of the underlying filter. At 10 bits/key setting, the +// inflection point is about 40 million keys, so 32-bit hash is a bad idea +// with 10s of millions of keys or more. +// +// Despite accepting a 64-bit hash, this implementation uses 32-bit fastrange +// to pick a cache line, which can be faster than 64-bit in some cases. +// This only hurts accuracy as you get into 10s of GB for a single filter, +// and accuracy abruptly breaks down at 256GB (2^32 cache lines). Switch to +// 64-bit fastrange if you need filters so big. ;) +// +// Using only a 32-bit input hash within each cache line has negligible +// impact for any reasonable cache line / bucket size, for arbitrary filter +// size, and potentially saves intermediate data size in some cases vs. +// tracking full 64 bits. (Even in an implementation using 64-bit arithmetic +// to generate indices, I might do the same, as a single multiplication +// suffices to generate a sufficiently mixed 64 bits from 32 bits.) +// +// This implementation is currently tied to Intel cache line size, 64 bytes == +// 512 bits. If there's sufficient demand for other cache line sizes, this is +// a pretty good implementation to extend, but slight performance enhancements +// are possible with an alternate implementation (probably not very compatible +// with SIMD): +// (1) Use rotation in addition to multiplication for remixing +// (like murmur hash). (Using multiplication alone *slightly* hurts accuracy +// because lower bits never depend on original upper bits.) +// (2) Extract more than one bit index from each re-mix. (Only if rotation +// or similar is part of remix, because otherwise you're making the +// multiplication-only problem worse.) +// (3) Re-mix full 64 bit hash, to get maximum number of bit indices per +// re-mix. +// +class FastLocalBloomImpl { + public: + // NOTE: this has only been validated to enough accuracy for producing + // reasonable warnings / user feedback, not for making functional decisions. + static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes, + int hash_bits) { + return BloomMath::IndependentProbabilitySum( + BloomMath::CacheLocalFpRate(8.0 * bytes / keys, num_probes, + /*cache line bits*/ 512), + BloomMath::FingerprintFpRate(keys, hash_bits)); + } + + static inline int ChooseNumProbes(int millibits_per_key) { + // Since this implementation can (with AVX2) make up to 8 probes + // for the same cost, we pick the most accurate num_probes, based + // on actual tests of the implementation. 
Note that for higher + // bits/key, the best choice for cache-local Bloom can be notably + // smaller than standard bloom, e.g. 9 instead of 11 @ 16 b/k. + if (millibits_per_key <= 2080) { + return 1; + } else if (millibits_per_key <= 3580) { + return 2; + } else if (millibits_per_key <= 5100) { + return 3; + } else if (millibits_per_key <= 6640) { + return 4; + } else if (millibits_per_key <= 8300) { + return 5; + } else if (millibits_per_key <= 10070) { + return 6; + } else if (millibits_per_key <= 11720) { + return 7; + } else if (millibits_per_key <= 14001) { + // Would be something like <= 13800 but sacrificing *slightly* for + // more settings using <= 8 probes. + return 8; + } else if (millibits_per_key <= 16050) { + return 9; + } else if (millibits_per_key <= 18300) { + return 10; + } else if (millibits_per_key <= 22001) { + return 11; + } else if (millibits_per_key <= 25501) { + return 12; + } else if (millibits_per_key > 50000) { + // Top out at 24 probes (three sets of 8) + return 24; + } else { + // Roughly optimal choices for remaining range + // e.g. + // 28000 -> 12, 28001 -> 13 + // 50000 -> 23, 50001 -> 24 + return (millibits_per_key - 1) / 2000 - 1; + } + } + + static inline void AddHash(uint32_t h1, uint32_t h2, uint32_t len_bytes, + int num_probes, char *data) { + uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6; + AddHashPrepared(h2, num_probes, data + bytes_to_cache_line); + } + + static inline void AddHashPrepared(uint32_t h2, int num_probes, + char *data_at_cache_line) { + uint32_t h = h2; + for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) { + // 9-bit address within 512 bit cache line + int bitpos = h >> (32 - 9); + data_at_cache_line[bitpos >> 3] |= (uint8_t{1} << (bitpos & 7)); + } + } + + static inline void PrepareHash(uint32_t h1, uint32_t len_bytes, + const char *data, + uint32_t /*out*/ *byte_offset) { + uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6; + PREFETCH(data + bytes_to_cache_line, 0 /* rw */, 1 /* locality */); + PREFETCH(data + bytes_to_cache_line + 63, 0 /* rw */, 1 /* locality */); + *byte_offset = bytes_to_cache_line; + } + + static inline bool HashMayMatch(uint32_t h1, uint32_t h2, uint32_t len_bytes, + int num_probes, const char *data) { + uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6; + return HashMayMatchPrepared(h2, num_probes, data + bytes_to_cache_line); + } + + static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes, + const char *data_at_cache_line) { + uint32_t h = h2; +#ifdef __AVX2__ + int rem_probes = num_probes; + + // NOTE: For better performance for num_probes in {1, 2, 9, 10, 17, 18, + // etc.} one can insert specialized code for rem_probes <= 2, bypassing + // the SIMD code in those cases. There is a detectable but minor overhead + // applied to other values of num_probes (when not statically determined), + // but smoother performance curve vs. num_probes. But for now, when + // in doubt, don't add unnecessary code. + + // Powers of 32-bit golden ratio, mod 2**32. + const __m256i multipliers = + _mm256_setr_epi32(0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9, + 0x35fbe861, 0xdeb7c719, 0x448b211, 0x3459b749); + + for (;;) { + // Eight copies of hash + __m256i hash_vector = _mm256_set1_epi32(h); + + // Same effect as repeated multiplication by 0x9e3779b9 thanks to + // associativity of multiplication. 
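+      // [Editorial aside, not upstream code: the scalar equivalent of the
+      //  vector multiply below is the remix loop used elsewhere in this
+      //  file (see AddHashPrepared above):
+      //
+      //    for (int i = 0; i < 8; ++i, h *= uint32_t{0x9e3779b9}) {
+      //      int bitpos = h >> (32 - 9);  // top 9 bits pick a bit of 512
+      //      // ... test or set that bit ...
+      //    }
+      //
+      //  Multiplying one hash by the precomputed powers 0x9e3779b9^0..^7
+      //  yields all eight probe hashes in a single vector instruction.]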
+ hash_vector = _mm256_mullo_epi32(hash_vector, multipliers); + + // Now the top 9 bits of each of the eight 32-bit values in + // hash_vector are bit addresses for probes within the cache line. + // While the platform-independent code uses byte addressing (6 bits + // to pick a byte + 3 bits to pick a bit within a byte), here we work + // with 32-bit words (4 bits to pick a word + 5 bits to pick a bit + // within a word) because that works well with AVX2 and is equivalent + // under little-endian. + + // Shift each right by 28 bits to get 4-bit word addresses. + const __m256i word_addresses = _mm256_srli_epi32(hash_vector, 28); + + // Gather 32-bit values spread over 512 bits by 4-bit address. In + // essence, we are dereferencing eight pointers within the cache + // line. + // + // Option 1: AVX2 gather (seems to be a little slow - understandable) + // const __m256i value_vector = + // _mm256_i32gather_epi32(static_cast(data_at_cache_line), + // word_addresses, + // /*bytes / i32*/ 4); + // END Option 1 + // Potentially unaligned as we're not *always* cache-aligned -> loadu + const __m256i *mm_data = + reinterpret_cast(data_at_cache_line); + __m256i lower = _mm256_loadu_si256(mm_data); + __m256i upper = _mm256_loadu_si256(mm_data + 1); + // Option 2: AVX512VL permute hack + // Only negligibly faster than Option 3, so not yet worth supporting + // const __m256i value_vector = + // _mm256_permutex2var_epi32(lower, word_addresses, upper); + // END Option 2 + // Option 3: AVX2 permute+blend hack + // Use lowest three bits to order probing values, as if all from same + // 256 bit piece. + lower = _mm256_permutevar8x32_epi32(lower, word_addresses); + upper = _mm256_permutevar8x32_epi32(upper, word_addresses); + // Just top 1 bit of address, to select between lower and upper. + const __m256i upper_lower_selector = _mm256_srai_epi32(hash_vector, 31); + // Finally: the next 8 probed 32-bit values, in probing sequence order. + const __m256i value_vector = + _mm256_blendv_epi8(lower, upper, upper_lower_selector); + // END Option 3 + + // We might not need to probe all 8, so build a mask for selecting only + // what we need. (The k_selector(s) could be pre-computed but that + // doesn't seem to make a noticeable performance difference.) + const __m256i zero_to_seven = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + // Subtract rem_probes from each of those constants + __m256i k_selector = + _mm256_sub_epi32(zero_to_seven, _mm256_set1_epi32(rem_probes)); + // Negative after subtract -> use/select + // Keep only high bit (logical shift right each by 31). + k_selector = _mm256_srli_epi32(k_selector, 31); + + // Strip off the 4 bit word address (shift left) + __m256i bit_addresses = _mm256_slli_epi32(hash_vector, 4); + // And keep only 5-bit (32 - 27) bit-within-32-bit-word addresses. + bit_addresses = _mm256_srli_epi32(bit_addresses, 27); + // Build a bit mask + const __m256i bit_mask = _mm256_sllv_epi32(k_selector, bit_addresses); + + // Like ((~value_vector) & bit_mask) == 0) + bool match = _mm256_testc_si256(value_vector, bit_mask) != 0; + + // This check first so that it's easy for branch predictor to optimize + // num_probes <= 8 case, making it free of unpredictable branches. + if (rem_probes <= 8) { + return match; + } else if (!match) { + return false; + } + // otherwise + // Need another iteration. 
0xab25f4c1 == golden ratio to the 8th power + h *= 0xab25f4c1; + rem_probes -= 8; + } +#else + for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) { + // 9-bit address within 512 bit cache line + int bitpos = h >> (32 - 9); + if ((data_at_cache_line[bitpos >> 3] & (char(1) << (bitpos & 7))) == 0) { + return false; + } + } + return true; +#endif + } +}; + +// A legacy Bloom filter implementation with no locality of probes (slow). +// It uses double hashing to generate a sequence of hash values. +// Asymptotic analysis is in [Kirsch,Mitzenmacher 2006], but known to have +// subtle accuracy flaws for practical sizes [Dillinger,Manolios 2004]. +// +// DO NOT REUSE +// +class LegacyNoLocalityBloomImpl { + public: + static inline int ChooseNumProbes(int bits_per_key) { + // We intentionally round down to reduce probing cost a little bit + int num_probes = static_cast(bits_per_key * 0.69); // 0.69 =~ ln(2) + if (num_probes < 1) num_probes = 1; + if (num_probes > 30) num_probes = 30; + return num_probes; + } + + static inline void AddHash(uint32_t h, uint32_t total_bits, int num_probes, + char *data) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (int i = 0; i < num_probes; i++) { + const uint32_t bitpos = h % total_bits; + data[bitpos / 8] |= (1 << (bitpos % 8)); + h += delta; + } + } + + static inline bool HashMayMatch(uint32_t h, uint32_t total_bits, + int num_probes, const char *data) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (int i = 0; i < num_probes; i++) { + const uint32_t bitpos = h % total_bits; + if ((data[bitpos / 8] & (1 << (bitpos % 8))) == 0) { + return false; + } + h += delta; + } + return true; + } +}; + +// A legacy Bloom filter implementation with probes local to a single +// cache line (fast). Because SST files might be transported between +// platforms, the cache line size is a parameter rather than hard coded. +// (But if specified as a constant parameter, an optimizing compiler +// should take advantage of that.) +// +// When ExtraRotates is false, this implementation is notably deficient in +// accuracy. Specifically, it uses double hashing with a 1/512 chance of the +// increment being zero (when cache line size is 512 bits). Thus, there's a +// 1/512 chance of probing only one index, which we'd expect to incur about +// a 1/2 * 1/512 or absolute 0.1% FP rate penalty. More detail at +// https://github.com/facebook/rocksdb/issues/4120 +// +// DO NOT REUSE +// +template +class LegacyLocalityBloomImpl { + private: + static inline uint32_t GetLine(uint32_t h, uint32_t num_lines) { + uint32_t offset_h = ExtraRotates ? (h >> 11) | (h << 21) : h; + return offset_h % num_lines; + } + + public: + // NOTE: this has only been validated to enough accuracy for producing + // reasonable warnings / user feedback, not for making functional decisions. + static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes) { + double bits_per_key = 8.0 * bytes / keys; + double filter_rate = BloomMath::CacheLocalFpRate(bits_per_key, num_probes, + /*cache line bits*/ 512); + if (!ExtraRotates) { + // Good estimate of impact of flaw in index computation. + // Adds roughly 0.002 around 50 bits/key and 0.001 around 100 bits/key. + // The + 22 shifts it nicely to fit for lower bits/key. 
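+      // [Editorial aside, not upstream code: checking the claim above
+      //  against the correction term on the next line:
+      //    bits_per_key = 50  -> 0.1 / (37.5 + 22) ~= 0.0017
+      //    bits_per_key = 100 -> 0.1 / (75.0 + 22) ~= 0.0010
+      //  which matches the quoted ~0.002 and ~0.001 penalties.]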
+ filter_rate += 0.1 / (bits_per_key * 0.75 + 22); + } else { + // Not yet validated + assert(false); + } + // Always uses 32-bit hash + double fingerprint_rate = BloomMath::FingerprintFpRate(keys, 32); + return BloomMath::IndependentProbabilitySum(filter_rate, fingerprint_rate); + } + + static inline void AddHash(uint32_t h, uint32_t num_lines, int num_probes, + char *data, int log2_cache_line_bytes) { + const int log2_cache_line_bits = log2_cache_line_bytes + 3; + + char *data_at_offset = + data + (GetLine(h, num_lines) << log2_cache_line_bytes); + const uint32_t delta = (h >> 17) | (h << 15); + for (int i = 0; i < num_probes; ++i) { + // Mask to bit-within-cache-line address + const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1); + data_at_offset[bitpos / 8] |= (1 << (bitpos % 8)); + if (ExtraRotates) { + h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits)); + } + h += delta; + } + } + + static inline void PrepareHashMayMatch(uint32_t h, uint32_t num_lines, + const char *data, + uint32_t /*out*/ *byte_offset, + int log2_cache_line_bytes) { + uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes; + PREFETCH(data + b, 0 /* rw */, 1 /* locality */); + PREFETCH(data + b + ((1 << log2_cache_line_bytes) - 1), 0 /* rw */, + 1 /* locality */); + *byte_offset = b; + } + + static inline bool HashMayMatch(uint32_t h, uint32_t num_lines, + int num_probes, const char *data, + int log2_cache_line_bytes) { + uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes; + return HashMayMatchPrepared(h, num_probes, data + b, log2_cache_line_bytes); + } + + static inline bool HashMayMatchPrepared(uint32_t h, int num_probes, + const char *data_at_offset, + int log2_cache_line_bytes) { + const int log2_cache_line_bits = log2_cache_line_bytes + 3; + + const uint32_t delta = (h >> 17) | (h << 15); + for (int i = 0; i < num_probes; ++i) { + // Mask to bit-within-cache-line address + const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1); + if (((data_at_offset[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + if (ExtraRotates) { + h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits)); + } + h += delta; + } + return true; + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/build_version.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/build_version.cc new file mode 100644 index 0000000000..67ef5d3495 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/build_version.cc @@ -0,0 +1,79 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +#include + +#include "rocksdb/version.h" +#include "rocksdb/utilities/object_registry.h" +#include "util/string_util.h" + +// The build script may replace these values with real values based +// on whether or not GIT is available and the platform settings +static const std::string rocksdb_build_git_sha = "rocksdb_build_git_sha:"; +static const std::string rocksdb_build_git_tag = "rocksdb_build_git_tag:"""; +#define HAS_GIT_CHANGES 0 +#if HAS_GIT_CHANGES == 0 +// If HAS_GIT_CHANGES is 0, the GIT date is used. +// Use the time the branch/tag was last modified +static const std::string rocksdb_build_date = "rocksdb_build_date:"""; +#else +// If HAS_GIT_CHANGES is > 0, the branch/tag has modifications. +// Use the time the build was created. 
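+// [Editorial aside, not upstream code: each of these strings is a
+//  "name:value" pair that AddProperty below splits at the first ':'.
+//  A successful build-time substitution, for example (date value is
+//  illustrative),
+//
+//    "rocksdb_build_date:2024-06-26"  ->  {"rocksdb_build_date",
+//                                          "2024-06-26"}
+//
+//  while a failed substitution such as "rocksdb_build_date:@BUILD_DATE@"
+//  still has '@' immediately after the colon and is skipped.]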
+static const std::string rocksdb_build_date = "rocksdb_build_date:"""; +#endif + +extern "C" { + +} // extern "C" + +std::unordered_map ROCKSDB_NAMESPACE::ObjectRegistry::builtins_ = { + +}; + +namespace ROCKSDB_NAMESPACE { +static void AddProperty(std::unordered_map *props, const std::string& name) { + size_t colon = name.find(":"); + if (colon != std::string::npos && colon > 0 && colon < name.length() - 1) { + // If we found a "@:", then this property was a build-time substitution that failed. Skip it + size_t at = name.find("@", colon); + if (at != colon + 1) { + // Everything before the colon is the name, after is the value + (*props)[name.substr(0, colon)] = name.substr(colon + 1); + } + } +} + +static std::unordered_map* LoadPropertiesSet() { + auto * properties = new std::unordered_map(); + AddProperty(properties, rocksdb_build_git_sha); + AddProperty(properties, rocksdb_build_git_tag); + AddProperty(properties, rocksdb_build_date); + return properties; +} + +const std::unordered_map& GetRocksBuildProperties() { + static std::unique_ptr> props(LoadPropertiesSet()); + return *props; +} + +std::string GetRocksVersionAsString(bool with_patch) { + std::string version = std::to_string(ROCKSDB_MAJOR) + "." + std::to_string(ROCKSDB_MINOR); + if (with_patch) { + return version + "." + std::to_string(ROCKSDB_PATCH); + } else { + return version; + } +} + +std::string GetRocksBuildInfoAsString(const std::string& program, bool verbose) { + std::string info = program + " (RocksDB) " + GetRocksVersionAsString(true); + if (verbose) { + for (const auto& it : GetRocksBuildProperties()) { + info.append("\n "); + info.append(it.first); + info.append(": "); + info.append(it.second); + } + } + return info; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/build_version.cc.in b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/build_version.cc.in new file mode 100644 index 0000000000..56bc878562 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/build_version.cc.in @@ -0,0 +1,79 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +#include + +#include "rocksdb/version.h" +#include "rocksdb/utilities/object_registry.h" +#include "util/string_util.h" + +// The build script may replace these values with real values based +// on whether or not GIT is available and the platform settings +static const std::string rocksdb_build_git_sha = "rocksdb_build_git_sha:@GIT_SHA@"; +static const std::string rocksdb_build_git_tag = "rocksdb_build_git_tag:@GIT_TAG@"; +#define HAS_GIT_CHANGES @GIT_MOD@ +#if HAS_GIT_CHANGES == 0 +// If HAS_GIT_CHANGES is 0, the GIT date is used. +// Use the time the branch/tag was last modified +static const std::string rocksdb_build_date = "rocksdb_build_date:@GIT_DATE@"; +#else +// If HAS_GIT_CHANGES is > 0, the branch/tag has modifications. +// Use the time the build was created. +static const std::string rocksdb_build_date = "rocksdb_build_date:@BUILD_DATE@"; +#endif + +extern "C" { +@ROCKSDB_PLUGIN_EXTERNS@ +} // extern "C" + +std::unordered_map ROCKSDB_NAMESPACE::ObjectRegistry::builtins_ = { + @ROCKSDB_PLUGIN_BUILTINS@ +}; + +namespace ROCKSDB_NAMESPACE { +static void AddProperty(std::unordered_map *props, const std::string& name) { + size_t colon = name.find(":"); + if (colon != std::string::npos && colon > 0 && colon < name.length() - 1) { + // If we found a "@:", then this property was a build-time substitution that failed. 
Skip it + size_t at = name.find("@", colon); + if (at != colon + 1) { + // Everything before the colon is the name, after is the value + (*props)[name.substr(0, colon)] = name.substr(colon + 1); + } + } +} + +static std::unordered_map* LoadPropertiesSet() { + auto * properties = new std::unordered_map(); + AddProperty(properties, rocksdb_build_git_sha); + AddProperty(properties, rocksdb_build_git_tag); + AddProperty(properties, rocksdb_build_date); + return properties; +} + +const std::unordered_map& GetRocksBuildProperties() { + static std::unique_ptr> props(LoadPropertiesSet()); + return *props; +} + +std::string GetRocksVersionAsString(bool with_patch) { + std::string version = std::to_string(ROCKSDB_MAJOR) + "." + std::to_string(ROCKSDB_MINOR); + if (with_patch) { + return version + "." + std::to_string(ROCKSDB_PATCH); + } else { + return version; + } +} + +std::string GetRocksBuildInfoAsString(const std::string& program, bool verbose) { + std::string info = program + " (RocksDB) " + GetRocksVersionAsString(true); + if (verbose) { + for (const auto& it : GetRocksBuildProperties()) { + info.append("\n "); + info.append(it.first); + info.append(": "); + info.append(it.second); + } + } + return info; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/cast_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/cast_util.h new file mode 100644 index 0000000000..c91b6ff1ee --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/cast_util.h @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +// The helper function to assert the move from dynamic_cast<> to +// static_cast<> is correct. This function is to deal with legacy code. +// It is not recommended to add new code to issue class casting. The preferred +// solution is to implement the functionality without a need of casting. +template +inline DestClass* static_cast_with_check(SrcClass* x) { + DestClass* ret = static_cast(x); +#ifdef ROCKSDB_USE_RTTI + assert(ret == dynamic_cast(x)); +#endif + return ret; +} + +// A wrapper around static_cast for lossless conversion between integral +// types, including enum types. For example, this can be used for converting +// between signed/unsigned or enum type and underlying type without fear of +// stripping away data, now or in the future. +template +inline To lossless_cast(From x) { + using FromValue = typename std::remove_reference::type; + static_assert( + std::is_integral::value || std::is_enum::value, + "Only works on integral types"); + static_assert(std::is_integral::value || std::is_enum::value, + "Only works on integral types"); + static_assert(sizeof(To) >= sizeof(FromValue), "Must be lossless"); + return static_cast(x); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/channel.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/channel.h new file mode 100644 index 0000000000..19b9562977 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/channel.h @@ -0,0 +1,69 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
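+// [Editorial aside, not upstream code: a quick note on lossless_cast from
+//  util/cast_util.h above. It is an ordinary static_cast guarded by
+//  compile-time width/type checks, so widening conversions compile and
+//  narrowing ones do not (assuming the destination type is the first
+//  template parameter):
+//
+//    uint64_t a = lossless_cast<uint64_t>(uint32_t{7});  // ok, widening
+//    // uint32_t b = lossless_cast<uint32_t>(uint64_t{7});  // rejected:
+//    //   static_assert(sizeof(To) >= sizeof(FromValue), "Must be lossless")
+//  ]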
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+#include <utility>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <class T>
+class channel {
+ public:
+  explicit channel() : eof_(false) {}
+
+  channel(const channel&) = delete;
+  void operator=(const channel&) = delete;
+
+  void sendEof() {
+    std::lock_guard<std::mutex> lk(lock_);
+    eof_ = true;
+    cv_.notify_all();
+  }
+
+  bool eof() {
+    std::lock_guard<std::mutex> lk(lock_);
+    return buffer_.empty() && eof_;
+  }
+
+  size_t size() const {
+    std::lock_guard<std::mutex> lk(lock_);
+    return buffer_.size();
+  }
+
+  // writes elem to the queue
+  void write(T&& elem) {
+    std::unique_lock<std::mutex> lk(lock_);
+    buffer_.emplace(std::forward<T>(elem));
+    cv_.notify_one();
+  }
+
+  /// Moves a dequeued element onto elem, blocking until an element
+  /// is available.
+  // returns false if EOF
+  bool read(T& elem) {
+    std::unique_lock<std::mutex> lk(lock_);
+    cv_.wait(lk, [&] { return eof_ || !buffer_.empty(); });
+    if (eof_ && buffer_.empty()) {
+      return false;
+    }
+    elem = std::move(buffer_.front());
+    buffer_.pop();
+    cv_.notify_one();
+    return true;
+  }
+
+ private:
+  std::condition_variable cv_;
+  mutable std::mutex lock_;
+  std::queue<T> buffer_;
+  bool eof_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/cleanable.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/cleanable.cc
new file mode 100644
index 0000000000..89a7ab9be3
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/cleanable.cc
@@ -0,0 +1,181 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/cleanable.h"
+
+#include <atomic>
+#include <cassert>
+#include <utility>
+
+namespace ROCKSDB_NAMESPACE {
+
+Cleanable::Cleanable() {
+  cleanup_.function = nullptr;
+  cleanup_.next = nullptr;
+}
+
+Cleanable::~Cleanable() { DoCleanup(); }
+
+Cleanable::Cleanable(Cleanable&& other) noexcept { *this = std::move(other); }
+
+Cleanable& Cleanable::operator=(Cleanable&& other) noexcept {
+  assert(this != &other);  // https://stackoverflow.com/a/9322542/454544
+  cleanup_ = other.cleanup_;
+  other.cleanup_.function = nullptr;
+  other.cleanup_.next = nullptr;
+  return *this;
+}
+
+// If the entire linked list were on the heap, we could simply attach one
+// linked list to another. However, the head is an embedded object, to avoid
+// the cost of creating objects for most of the use cases where the Cleanable
+// has only one Cleanup to do. We could put everything on the heap if
+// benchmarks show no negative impact on performance.
+// Also we need to iterate on the linked list since there is no pointer to the
+// tail. We could add a tail pointer, but maintaining it might negatively
+// impact performance for the common case of one cleanup, where the tail
+// pointer is not needed. Again, benchmarks could clarify that.
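+// [Editorial aside, not upstream code: concretely, the first registered
+//  cleanup fills the embedded head node and costs no allocation; only the
+//  second and later ones heap-allocate a Cleanup and link it after the
+//  head, as RegisterCleanup below shows (names are placeholders):
+//
+//    Cleanable c;
+//    c.RegisterCleanup(f1, a1, b1);  // stored in cleanup_ (embedded head)
+//    c.RegisterCleanup(f2, a2, b2);  // new Cleanup, linked after the head
+//
+//  Both callbacks run when c is destroyed.]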
+// Even without a tail pointer we could iterate on the list, find the tail, and +// have only that node updated without the need to insert the Cleanups one by +// one. This however would be redundant when the source Cleanable has one or a +// few Cleanups which is the case most of the time. +// TODO(myabandeh): if the list is too long we should maintain a tail pointer +// and have the entire list (minus the head that has to be inserted separately) +// merged with the target linked list at once. +void Cleanable::DelegateCleanupsTo(Cleanable* other) { + assert(other != nullptr); + if (cleanup_.function == nullptr) { + return; + } + Cleanup* c = &cleanup_; + other->RegisterCleanup(c->function, c->arg1, c->arg2); + c = c->next; + while (c != nullptr) { + Cleanup* next = c->next; + other->RegisterCleanup(c); + c = next; + } + cleanup_.function = nullptr; + cleanup_.next = nullptr; +} + +void Cleanable::RegisterCleanup(Cleanable::Cleanup* c) { + assert(c != nullptr); + if (cleanup_.function == nullptr) { + cleanup_.function = c->function; + cleanup_.arg1 = c->arg1; + cleanup_.arg2 = c->arg2; + delete c; + } else { + c->next = cleanup_.next; + cleanup_.next = c; + } +} + +void Cleanable::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { + assert(func != nullptr); + Cleanup* c; + if (cleanup_.function == nullptr) { + c = &cleanup_; + } else { + c = new Cleanup; + c->next = cleanup_.next; + cleanup_.next = c; + } + c->function = func; + c->arg1 = arg1; + c->arg2 = arg2; +} + +struct SharedCleanablePtr::Impl : public Cleanable { + std::atomic ref_count{1}; // Start with 1 ref + void Ref() { ref_count.fetch_add(1, std::memory_order_relaxed); } + void Unref() { + if (ref_count.fetch_sub(1, std::memory_order_relaxed) == 1) { + // Last ref + delete this; + } + } + static void UnrefWrapper(void* arg1, void* /*arg2*/) { + static_cast(arg1)->Unref(); + } +}; + +void SharedCleanablePtr::Reset() { + if (ptr_) { + ptr_->Unref(); + ptr_ = nullptr; + } +} + +void SharedCleanablePtr::Allocate() { + Reset(); + ptr_ = new Impl(); +} + +SharedCleanablePtr::SharedCleanablePtr(const SharedCleanablePtr& from) { + *this = from; +} + +SharedCleanablePtr::SharedCleanablePtr(SharedCleanablePtr&& from) noexcept { + *this = std::move(from); +} + +SharedCleanablePtr& SharedCleanablePtr::operator=( + const SharedCleanablePtr& from) { + if (this != &from) { + Reset(); + ptr_ = from.ptr_; + if (ptr_) { + ptr_->Ref(); + } + } + return *this; +} + +SharedCleanablePtr& SharedCleanablePtr::operator=( + SharedCleanablePtr&& from) noexcept { + assert(this != &from); // https://stackoverflow.com/a/9322542/454544 + Reset(); + ptr_ = from.ptr_; + from.ptr_ = nullptr; + return *this; +} + +SharedCleanablePtr::~SharedCleanablePtr() { Reset(); } + +Cleanable& SharedCleanablePtr::operator*() { + return *ptr_; // implicit upcast +} + +Cleanable* SharedCleanablePtr::operator->() { + return ptr_; // implicit upcast +} + +Cleanable* SharedCleanablePtr::get() { + return ptr_; // implicit upcast +} + +void SharedCleanablePtr::RegisterCopyWith(Cleanable* target) { + if (ptr_) { + // "Virtual" copy of the pointer + ptr_->Ref(); + target->RegisterCleanup(&Impl::UnrefWrapper, ptr_, nullptr); + } +} + +void SharedCleanablePtr::MoveAsCleanupTo(Cleanable* target) { + if (ptr_) { + // "Virtual" move of the pointer + target->RegisterCleanup(&Impl::UnrefWrapper, ptr_, nullptr); + ptr_ = nullptr; + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coding.cc 
b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coding.cc
new file mode 100644
index 0000000000..3da8afaa27
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coding.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+#include <algorithm>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// conversion' conversion from 'type1' to 'type2', possible loss of data
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4244)
+#endif
+char* EncodeVarint32(char* dst, uint32_t v) {
+  // Operate on characters as unsigneds
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  static const int B = 128;
+  if (v < (1 << 7)) {
+    *(ptr++) = v;
+  } else if (v < (1 << 14)) {
+    *(ptr++) = v | B;
+    *(ptr++) = v >> 7;
+  } else if (v < (1 << 21)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = v >> 14;
+  } else if (v < (1 << 28)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = (v >> 14) | B;
+    *(ptr++) = v >> 21;
+  } else {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = (v >> 14) | B;
+    *(ptr++) = (v >> 21) | B;
+    *(ptr++) = v >> 28;
+  }
+  return reinterpret_cast<char*>(ptr);
+}
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+const char* GetVarint32PtrFallback(const char* p, const char* limit,
+                                   uint32_t* value) {
+  uint32_t result = 0;
+  for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
+    uint32_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    p++;
+    if (byte & 128) {
+      // More bytes are present
+      result |= ((byte & 127) << shift);
+    } else {
+      result |= (byte << shift);
+      *value = result;
+      return reinterpret_cast<const char*>(p);
+    }
+  }
+  return nullptr;
+}
+
+const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
+  uint64_t result = 0;
+  for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
+    uint64_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    p++;
+    if (byte & 128) {
+      // More bytes are present
+      result |= ((byte & 127) << shift);
+    } else {
+      result |= (byte << shift);
+      *value = result;
+      return reinterpret_cast<const char*>(p);
+    }
+  }
+  return nullptr;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coding.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coding.h
new file mode 100644
index 0000000000..3168fd2fd1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coding.h
@@ -0,0 +1,389 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
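+// [Editorial aside, not upstream code: a worked varint32 example to go
+//  with EncodeVarint32 above. Values are emitted 7 bits at a time, low
+//  bits first, with the top bit of each byte flagging "more bytes follow":
+//
+//    v = 300 (binary 1_0010_1100)
+//    byte 0: (300 & 0x7f) | 0x80 == 0xAC  // low 7 bits, continuation set
+//    byte 1: 300 >> 7             == 0x02  // remaining bits, top bit clear
+//
+//  so 300 encodes as AC 02, and GetVarint32Ptr reassembles it by or-ing
+//  each 7-bit group back in at shifts 0, 7, 14, ...]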
+//
+// Encoding independent of machine byte order:
+// * Fixed-length numbers are encoded with least-significant byte first
+//   (little endian, native order on Intel and others)
+// * In addition we support variable length "varint" encoding
+// * Strings are encoded prefixed by their length in varint format
+//
+// Some related functions are provided in coding_lean.h
+
+#pragma once
+#include <algorithm>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "util/coding_lean.h"
+
+// Some processors do not allow unaligned access to memory
+#if defined(__sparc)
+#define PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// The maximum length of a varint in bytes for 64-bit.
+const uint32_t kMaxVarint64Length = 10;
+
+// Standard Put... routines append to a string
+extern void PutFixed16(std::string* dst, uint16_t value);
+extern void PutFixed32(std::string* dst, uint32_t value);
+extern void PutFixed64(std::string* dst, uint64_t value);
+extern void PutVarint32(std::string* dst, uint32_t value);
+extern void PutVarint32Varint32(std::string* dst, uint32_t value1,
+                                uint32_t value2);
+extern void PutVarint32Varint32Varint32(std::string* dst, uint32_t value1,
+                                        uint32_t value2, uint32_t value3);
+extern void PutVarint64(std::string* dst, uint64_t value);
+extern void PutVarint64Varint64(std::string* dst, uint64_t value1,
+                                uint64_t value2);
+extern void PutVarint32Varint64(std::string* dst, uint32_t value1,
+                                uint64_t value2);
+extern void PutVarint32Varint32Varint64(std::string* dst, uint32_t value1,
+                                        uint32_t value2, uint64_t value3);
+extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
+extern void PutLengthPrefixedSliceParts(std::string* dst,
+                                        const SliceParts& slice_parts);
+extern void PutLengthPrefixedSlicePartsWithPadding(
+    std::string* dst, const SliceParts& slice_parts, size_t pad_sz);
+
+// Standard Get... routines parse a value from the beginning of a Slice
+// and advance the slice past the parsed value.
+extern bool GetFixed64(Slice* input, uint64_t* value);
+extern bool GetFixed32(Slice* input, uint32_t* value);
+extern bool GetFixed16(Slice* input, uint16_t* value);
+extern bool GetVarint32(Slice* input, uint32_t* value);
+extern bool GetVarint64(Slice* input, uint64_t* value);
+extern bool GetVarsignedint64(Slice* input, int64_t* value);
+extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
+// This function assumes data is well-formed.
+extern Slice GetLengthPrefixedSlice(const char* data);
+
+extern Slice GetSliceUntil(Slice* slice, char delimiter);
+
+// Borrowed from
+// https://github.com/facebook/fbthrift/blob/449a5f77f9f9bae72c9eb5e78093247eef185c04/thrift/lib/cpp/util/VarintUtils-inl.h#L202-L208
+constexpr inline uint64_t i64ToZigzag(const int64_t l) {
+  return (static_cast<uint64_t>(l) << 1) ^ static_cast<uint64_t>(l >> 63);
+}
+inline int64_t zigzagToI64(uint64_t n) {
+  return (n >> 1) ^ -static_cast<int64_t>(n & 1);
+}
+
+// Pointer-based variants of GetVarint... These either store a value
+// in *v and return a pointer just past the parsed value, or return
+// nullptr on error. These routines only look at bytes in the range
+// [p..limit-1]
+extern const char* GetVarint32Ptr(const char* p, const char* limit,
+                                  uint32_t* v);
+extern const char* GetVarint64Ptr(const char* p, const char* limit,
+                                  uint64_t* v);
+inline const char* GetVarsignedint64Ptr(const char* p, const char* limit,
+                                        int64_t* value) {
+  uint64_t u = 0;
+  const char* ret = GetVarint64Ptr(p, limit, &u);
+  *value = zigzagToI64(u);
+  return ret;
+}
+
+// Returns the length of the varint32 or varint64 encoding of "v"
+extern int VarintLength(uint64_t v);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// and return a pointer just past the last byte written.
+// REQUIRES: dst has enough space for the value being written
+extern char* EncodeVarint32(char* dst, uint32_t value);
+extern char* EncodeVarint64(char* dst, uint64_t value);
+
+// Internal routine for use by fallback path of GetVarint32Ptr
+extern const char* GetVarint32PtrFallback(const char* p, const char* limit,
+                                          uint32_t* value);
+inline const char* GetVarint32Ptr(const char* p, const char* limit,
+                                  uint32_t* value) {
+  if (p < limit) {
+    uint32_t result = *(reinterpret_cast<const unsigned char*>(p));
+    if ((result & 128) == 0) {
+      *value = result;
+      return p + 1;
+    }
+  }
+  return GetVarint32PtrFallback(p, limit, value);
+}
+
+// Pull the last 8 bits and cast it to a character
+inline void PutFixed16(std::string* dst, uint16_t value) {
+  if (port::kLittleEndian) {
+    dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)),
+                sizeof(value));
+  } else {
+    char buf[sizeof(value)];
+    EncodeFixed16(buf, value);
+    dst->append(buf, sizeof(buf));
+  }
+}
+
+inline void PutFixed32(std::string* dst, uint32_t value) {
+  if (port::kLittleEndian) {
+    dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)),
+                sizeof(value));
+  } else {
+    char buf[sizeof(value)];
+    EncodeFixed32(buf, value);
+    dst->append(buf, sizeof(buf));
+  }
+}
+
+inline void PutFixed64(std::string* dst, uint64_t value) {
+  if (port::kLittleEndian) {
+    dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)),
+                sizeof(value));
+  } else {
+    char buf[sizeof(value)];
+    EncodeFixed64(buf, value);
+    dst->append(buf, sizeof(buf));
+  }
+}
+
+inline void PutVarint32(std::string* dst, uint32_t v) {
+  char buf[5];
+  char* ptr = EncodeVarint32(buf, v);
+  dst->append(buf, static_cast<size_t>(ptr - buf));
+}
+
+inline void PutVarint32Varint32(std::string* dst, uint32_t v1, uint32_t v2) {
+  char buf[10];
+  char* ptr = EncodeVarint32(buf, v1);
+  ptr = EncodeVarint32(ptr, v2);
+  dst->append(buf, static_cast<size_t>(ptr - buf));
+}
+
+inline void PutVarint32Varint32Varint32(std::string* dst, uint32_t v1,
+                                        uint32_t v2, uint32_t v3) {
+  char buf[15];
+  char* ptr = EncodeVarint32(buf, v1);
+  ptr = EncodeVarint32(ptr, v2);
+  ptr = EncodeVarint32(ptr, v3);
+  dst->append(buf, static_cast<size_t>(ptr - buf));
+}
+
+inline char* EncodeVarint64(char* dst, uint64_t v) {
+  static const unsigned int B = 128;
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  while (v >= B) {
+    *(ptr++) = (v & (B - 1)) | B;
+    v >>= 7;
+  }
+  *(ptr++) = static_cast<unsigned char>(v);
+  return reinterpret_cast<char*>(ptr);
+}
+
+inline void PutVarint64(std::string* dst, uint64_t v) {
+  char buf[kMaxVarint64Length];
+  char* ptr = EncodeVarint64(buf, v);
+  dst->append(buf, static_cast<size_t>(ptr - buf));
+}
+
+inline void PutVarsignedint64(std::string* dst, int64_t v) {
+  char buf[kMaxVarint64Length];
+  // Using Zigzag format to convert signed to unsigned
+  char* ptr = EncodeVarint64(buf, i64ToZigzag(v));
+  dst->append(buf, static_cast<size_t>(ptr - buf));
+}
+
+inline void PutVarint64Varint64(std::string* dst,
uint64_t v1, uint64_t v2) { + char buf[20]; + char* ptr = EncodeVarint64(buf, v1); + ptr = EncodeVarint64(ptr, v2); + dst->append(buf, static_cast(ptr - buf)); +} + +inline void PutVarint32Varint64(std::string* dst, uint32_t v1, uint64_t v2) { + char buf[15]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint64(ptr, v2); + dst->append(buf, static_cast(ptr - buf)); +} + +inline void PutVarint32Varint32Varint64(std::string* dst, uint32_t v1, + uint32_t v2, uint64_t v3) { + char buf[20]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint32(ptr, v2); + ptr = EncodeVarint64(ptr, v3); + dst->append(buf, static_cast(ptr - buf)); +} + +inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, static_cast(value.size())); + dst->append(value.data(), value.size()); +} + +inline void PutLengthPrefixedSliceParts(std::string* dst, size_t total_bytes, + const SliceParts& slice_parts) { + for (int i = 0; i < slice_parts.num_parts; ++i) { + total_bytes += slice_parts.parts[i].size(); + } + PutVarint32(dst, static_cast(total_bytes)); + for (int i = 0; i < slice_parts.num_parts; ++i) { + dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size()); + } +} + +inline void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts) { + PutLengthPrefixedSliceParts(dst, /*total_bytes=*/0, slice_parts); +} + +inline void PutLengthPrefixedSlicePartsWithPadding( + std::string* dst, const SliceParts& slice_parts, size_t pad_sz) { + PutLengthPrefixedSliceParts(dst, /*total_bytes=*/pad_sz, slice_parts); + dst->append(pad_sz, '\0'); +} + +inline int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +inline bool GetFixed64(Slice* input, uint64_t* value) { + if (input->size() < sizeof(uint64_t)) { + return false; + } + *value = DecodeFixed64(input->data()); + input->remove_prefix(sizeof(uint64_t)); + return true; +} + +inline bool GetFixed32(Slice* input, uint32_t* value) { + if (input->size() < sizeof(uint32_t)) { + return false; + } + *value = DecodeFixed32(input->data()); + input->remove_prefix(sizeof(uint32_t)); + return true; +} + +inline bool GetFixed16(Slice* input, uint16_t* value) { + if (input->size() < sizeof(uint16_t)) { + return false; + } + *value = DecodeFixed16(input->data()); + input->remove_prefix(sizeof(uint16_t)); + return true; +} + +inline bool GetVarint32(Slice* input, uint32_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint32Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, static_cast(limit - q)); + return true; + } +} + +inline bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, static_cast(limit - q)); + return true; + } +} + +inline bool GetVarsignedint64(Slice* input, int64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarsignedint64Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, static_cast(limit - q)); + return true; + } +} + +inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len = 0; + if (GetVarint32(input, &len) && input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + 
return false; + } +} + +inline Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len = 0; + // +5: we assume "data" is not corrupted + // unsigned char is 7 bits, uint32_t is 32 bits, need 5 unsigned char + auto p = GetVarint32Ptr(data, data + 5 /* limit */, &len); + return Slice(p, len); +} + +inline Slice GetSliceUntil(Slice* slice, char delimiter) { + uint32_t len = 0; + for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { + // nothing + } + + Slice ret(slice->data(), len); + slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0)); + return ret; +} + +template +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +inline void +PutUnaligned(T* memory, const T& value) { +#if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED) + char* nonAlignedMemory = reinterpret_cast(memory); + memcpy(nonAlignedMemory, reinterpret_cast(&value), sizeof(T)); +#else + *memory = value; +#endif +} + +template +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +inline void +GetUnaligned(const T* memory, T* value) { +#if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED) + char* nonAlignedMemory = reinterpret_cast(value); + memcpy(nonAlignedMemory, reinterpret_cast(memory), sizeof(T)); +#else + *value = *memory; +#endif +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coding_lean.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coding_lean.h new file mode 100644 index 0000000000..6966f7a668 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coding_lean.h @@ -0,0 +1,101 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// Encoding independent of machine byte order: +// * Fixed-length numbers are encoded with least-significant byte first +// (little endian, native order on Intel and others) +// +// More functions in coding.h + +#pragma once + +#include +#include + +#include "port/port.h" // for port::kLittleEndian + +namespace ROCKSDB_NAMESPACE { + +// Lower-level versions of Put... that write directly into a character buffer +// REQUIRES: dst has enough space for the value being written +// -- Implementation of the functions declared above +inline void EncodeFixed16(char* buf, uint16_t value) { + if (port::kLittleEndian) { + memcpy(buf, &value, sizeof(value)); + } else { + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + } +} + +inline void EncodeFixed32(char* buf, uint32_t value) { + if (port::kLittleEndian) { + memcpy(buf, &value, sizeof(value)); + } else { + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + } +} + +inline void EncodeFixed64(char* buf, uint64_t value) { + if (port::kLittleEndian) { + memcpy(buf, &value, sizeof(value)); + } else { + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; + } +} + +// Lower-level versions of Get... 
that read directly from a character buffer
+// without any bounds checking.
+
+inline uint16_t DecodeFixed16(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint16_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    return ((static_cast<uint16_t>(static_cast<unsigned char>(ptr[0]))) |
+            (static_cast<uint16_t>(static_cast<unsigned char>(ptr[1])) << 8));
+  }
+}
+
+inline uint32_t DecodeFixed32(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint32_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0]))) |
+            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8) |
+            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16) |
+            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
+  }
+}
+
+inline uint64_t DecodeFixed64(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint64_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    uint64_t lo = DecodeFixed32(ptr);
+    uint64_t hi = DecodeFixed32(ptr + 4);
+    return (hi << 32) | lo;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compaction_job_stats_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compaction_job_stats_impl.cc
new file mode 100644
index 0000000000..587a26f247
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compaction_job_stats_impl.cc
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
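+//
+// (Editor's aside on the coding_lean.h helpers above, illustrative only:
+// the encoded form is little-endian regardless of host byte order, so
+//
+//   char buf[4];
+//   EncodeFixed32(buf, 0x01020304u);  // buf[0] == 0x04, buf[3] == 0x01
+//   assert(DecodeFixed32(buf) == 0x01020304u);
+//
+// round-trips identically on little- and big-endian machines.)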
+ +#include "rocksdb/compaction_job_stats.h" + +namespace ROCKSDB_NAMESPACE { + + +void CompactionJobStats::Reset() { + elapsed_micros = 0; + cpu_micros = 0; + + num_input_records = 0; + num_blobs_read = 0; + num_input_files = 0; + num_input_files_at_output_level = 0; + + num_output_records = 0; + num_output_files = 0; + num_output_files_blob = 0; + + is_full_compaction = false; + is_manual_compaction = false; + + total_input_bytes = 0; + total_blob_bytes_read = 0; + total_output_bytes = 0; + total_output_bytes_blob = 0; + + num_records_replaced = 0; + + total_input_raw_key_bytes = 0; + total_input_raw_value_bytes = 0; + + num_input_deletion_records = 0; + num_expired_deletion_records = 0; + + num_corrupt_keys = 0; + + file_write_nanos = 0; + file_range_sync_nanos = 0; + file_fsync_nanos = 0; + file_prepare_write_nanos = 0; + + smallest_output_key_prefix.clear(); + largest_output_key_prefix.clear(); + + num_single_del_fallthru = 0; + num_single_del_mismatch = 0; +} + +void CompactionJobStats::Add(const CompactionJobStats& stats) { + elapsed_micros += stats.elapsed_micros; + cpu_micros += stats.cpu_micros; + + num_input_records += stats.num_input_records; + num_blobs_read += stats.num_blobs_read; + num_input_files += stats.num_input_files; + num_input_files_at_output_level += stats.num_input_files_at_output_level; + + num_output_records += stats.num_output_records; + num_output_files += stats.num_output_files; + num_output_files_blob += stats.num_output_files_blob; + + total_input_bytes += stats.total_input_bytes; + total_blob_bytes_read += stats.total_blob_bytes_read; + total_output_bytes += stats.total_output_bytes; + total_output_bytes_blob += stats.total_output_bytes_blob; + + num_records_replaced += stats.num_records_replaced; + + total_input_raw_key_bytes += stats.total_input_raw_key_bytes; + total_input_raw_value_bytes += stats.total_input_raw_value_bytes; + + num_input_deletion_records += stats.num_input_deletion_records; + num_expired_deletion_records += stats.num_expired_deletion_records; + + num_corrupt_keys += stats.num_corrupt_keys; + + file_write_nanos += stats.file_write_nanos; + file_range_sync_nanos += stats.file_range_sync_nanos; + file_fsync_nanos += stats.file_fsync_nanos; + file_prepare_write_nanos += stats.file_prepare_write_nanos; + + num_single_del_fallthru += stats.num_single_del_fallthru; + num_single_del_mismatch += stats.num_single_del_mismatch; +} + + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/comparator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/comparator.cc new file mode 100644 index 0000000000..19fd47387e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/comparator.cc @@ -0,0 +1,383 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include "rocksdb/comparator.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <memory>
+#include <mutex>
+#include <sstream>
+
+#include "db/dbformat.h"
+#include "port/lang.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+class BytewiseComparatorImpl : public Comparator {
+ public:
+  BytewiseComparatorImpl() {}
+  static const char* kClassName() { return "leveldb.BytewiseComparator"; }
+  const char* Name() const override { return kClassName(); }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    return a.compare(b);
+  }
+
+  bool Equal(const Slice& a, const Slice& b) const override { return a == b; }
+
+  void FindShortestSeparator(std::string* start,
+                             const Slice& limit) const override {
+    // Find length of common prefix
+    size_t min_length = std::min(start->size(), limit.size());
+    size_t diff_index = 0;
+    while ((diff_index < min_length) &&
+           ((*start)[diff_index] == limit[diff_index])) {
+      diff_index++;
+    }
+
+    if (diff_index >= min_length) {
+      // Do not shorten if one string is a prefix of the other
+    } else {
+      uint8_t start_byte = static_cast<uint8_t>((*start)[diff_index]);
+      uint8_t limit_byte = static_cast<uint8_t>(limit[diff_index]);
+      if (start_byte >= limit_byte) {
+        // Cannot shorten since limit is smaller than start or start is
+        // already the shortest possible.
+        return;
+      }
+      assert(start_byte < limit_byte);
+
+      if (diff_index < limit.size() - 1 || start_byte + 1 < limit_byte) {
+        (*start)[diff_index]++;
+        start->resize(diff_index + 1);
+      } else {
+        //     v
+        // A A 1 A A A
+        // A A 2
+        //
+        // Incrementing the current byte will make start bigger than limit, we
+        // will skip this byte, and find the first non 0xFF byte in start and
+        // increment it.
+        diff_index++;
+
+        while (diff_index < start->size()) {
+          // Keep moving until we find the first non 0xFF byte to
+          // increment it
+          if (static_cast<uint8_t>((*start)[diff_index]) <
+              static_cast<uint8_t>(0xff)) {
+            (*start)[diff_index]++;
+            start->resize(diff_index + 1);
+            break;
+          }
+          diff_index++;
+        }
+      }
+      assert(Compare(*start, limit) < 0);
+    }
+  }
+
+  void FindShortSuccessor(std::string* key) const override {
+    // Find first character that can be incremented
+    size_t n = key->size();
+    for (size_t i = 0; i < n; i++) {
+      const uint8_t byte = (*key)[i];
+      if (byte != static_cast<uint8_t>(0xff)) {
+        (*key)[i] = byte + 1;
+        key->resize(i + 1);
+        return;
+      }
+    }
+    // *key is a run of 0xffs. Leave it alone.
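+    // (Editor's illustration: "abc" -> "b", "a\xff\xff" -> "b", and a key
+    // consisting entirely of 0xff bytes is returned unchanged.)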
+ } + + bool IsSameLengthImmediateSuccessor(const Slice& s, + const Slice& t) const override { + if (s.size() != t.size() || s.size() == 0) { + return false; + } + size_t diff_ind = s.difference_offset(t); + // same slice + if (diff_ind >= s.size()) return false; + uint8_t byte_s = static_cast(s[diff_ind]); + uint8_t byte_t = static_cast(t[diff_ind]); + // first different byte must be consecutive, and remaining bytes must be + // 0xff for s and 0x00 for t + if (byte_s != uint8_t{0xff} && byte_s + 1 == byte_t) { + for (size_t i = diff_ind + 1; i < s.size(); ++i) { + byte_s = static_cast(s[i]); + byte_t = static_cast(t[i]); + if (byte_s != uint8_t{0xff} || byte_t != uint8_t{0x00}) { + return false; + } + } + return true; + } else { + return false; + } + } + + bool CanKeysWithDifferentByteContentsBeEqual() const override { + return false; + } + + using Comparator::CompareWithoutTimestamp; + int CompareWithoutTimestamp(const Slice& a, bool /*a_has_ts*/, const Slice& b, + bool /*b_has_ts*/) const override { + return a.compare(b); + } + + bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const override { + return a == b; + } +}; + +class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { + public: + ReverseBytewiseComparatorImpl() {} + + static const char* kClassName() { + return "rocksdb.ReverseBytewiseComparator"; + } + const char* Name() const override { return kClassName(); } + + int Compare(const Slice& a, const Slice& b) const override { + return -a.compare(b); + } + + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + // Find length of common prefix + size_t min_length = std::min(start->size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + ((*start)[diff_index] == limit[diff_index])) { + diff_index++; + } + + assert(diff_index <= min_length); + if (diff_index == min_length) { + // Do not shorten if one string is a prefix of the other + // + // We could handle cases like: + // V + // A A 2 X Y + // A A 2 + // in a similar way as BytewiseComparator::FindShortestSeparator(). + // We keep it simple by not implementing it. We can come back to it + // later when needed. + } else { + uint8_t start_byte = static_cast((*start)[diff_index]); + uint8_t limit_byte = static_cast(limit[diff_index]); + if (start_byte > limit_byte && diff_index < start->size() - 1) { + // Case like + // V + // A A 3 A A + // A A 1 B B + // + // or + // v + // A A 2 A A + // A A 1 B B + // In this case "AA2" will be good. +#ifndef NDEBUG + std::string old_start = *start; +#endif + start->resize(diff_index + 1); +#ifndef NDEBUG + assert(old_start >= *start); +#endif + assert(Slice(*start).compare(limit) > 0); + } + } + } + + void FindShortSuccessor(std::string* /*key*/) const override { + // Don't do anything for simplicity. + } + + bool IsSameLengthImmediateSuccessor(const Slice& s, + const Slice& t) const override { + // Always returning false to prevent surfacing design flaws in + // auto_prefix_mode + (void)s, (void)t; + return false; + // "Correct" implementation: + // return BytewiseComparatorImpl::IsSameLengthImmediateSuccessor(t, s); + } + + bool CanKeysWithDifferentByteContentsBeEqual() const override { + return false; + } + + using Comparator::CompareWithoutTimestamp; + int CompareWithoutTimestamp(const Slice& a, bool /*a_has_ts*/, const Slice& b, + bool /*b_has_ts*/) const override { + return -a.compare(b); + } +}; + +// EXPERIMENTAL +// Comparator with 64-bit integer timestamp. +// We did not performance test this yet. 
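+//
+// (Editor's illustration, not upstream code: with BytewiseComparatorImpl as
+// the base comparator, keys order by user key first and then by *descending*
+// timestamp, so ("foo", ts=7) sorts before ("foo", ts=3), while
+// ("bar", ts=1) still sorts before ("foo", ts=9).)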
+template <typename TComparator>
+class ComparatorWithU64TsImpl : public Comparator {
+  static_assert(std::is_base_of<Comparator, TComparator>::value,
+                "template type must be an inherited type of comparator");
+
+ public:
+  explicit ComparatorWithU64TsImpl() : Comparator(/*ts_sz=*/sizeof(uint64_t)) {
+    assert(cmp_without_ts_.timestamp_size() == 0);
+  }
+
+  static const char* kClassName() {
+    static std::string class_name = kClassNameInternal();
+    return class_name.c_str();
+  }
+
+  const char* Name() const override { return kClassName(); }
+
+  void FindShortSuccessor(std::string*) const override {}
+  void FindShortestSeparator(std::string*, const Slice&) const override {}
+  int Compare(const Slice& a, const Slice& b) const override {
+    int ret = CompareWithoutTimestamp(a, b);
+    size_t ts_sz = timestamp_size();
+    if (ret != 0) {
+      return ret;
+    }
+    // Compare timestamp.
+    // For the same user key with different timestamps, larger (newer)
+    // timestamp comes first.
+    return -CompareTimestamp(ExtractTimestampFromUserKey(a, ts_sz),
+                             ExtractTimestampFromUserKey(b, ts_sz));
+  }
+  using Comparator::CompareWithoutTimestamp;
+  int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
+                              bool b_has_ts) const override {
+    const size_t ts_sz = timestamp_size();
+    assert(!a_has_ts || a.size() >= ts_sz);
+    assert(!b_has_ts || b.size() >= ts_sz);
+    Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, ts_sz) : a;
+    Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, ts_sz) : b;
+    return cmp_without_ts_.Compare(lhs, rhs);
+  }
+  int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override {
+    assert(ts1.size() == sizeof(uint64_t));
+    assert(ts2.size() == sizeof(uint64_t));
+    uint64_t lhs = DecodeFixed64(ts1.data());
+    uint64_t rhs = DecodeFixed64(ts2.data());
+    if (lhs < rhs) {
+      return -1;
+    } else if (lhs > rhs) {
+      return 1;
+    } else {
+      return 0;
+    }
+  }
+
+ private:
+  static std::string kClassNameInternal() {
+    std::stringstream ss;
+    ss << TComparator::kClassName() << ".u64ts";
+    return ss.str();
+  }
+
+  TComparator cmp_without_ts_;
+};
+
+}  // namespace
+
+const Comparator* BytewiseComparator() {
+  STATIC_AVOID_DESTRUCTION(BytewiseComparatorImpl, bytewise);
+  return &bytewise;
+}
+
+const Comparator* ReverseBytewiseComparator() {
+  STATIC_AVOID_DESTRUCTION(ReverseBytewiseComparatorImpl, rbytewise);
+  return &rbytewise;
+}
+
+const Comparator* BytewiseComparatorWithU64Ts() {
+  STATIC_AVOID_DESTRUCTION(ComparatorWithU64TsImpl<BytewiseComparatorImpl>,
+                           comp_with_u64_ts);
+  return &comp_with_u64_ts;
+}
+
+static int RegisterBuiltinComparators(ObjectLibrary& library,
+                                      const std::string& /*arg*/) {
+  library.AddFactory<const Comparator>(
+      BytewiseComparatorImpl::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<const Comparator>* /*guard */,
+         std::string* /* errmsg */) { return BytewiseComparator(); });
+  library.AddFactory<const Comparator>(
+      ReverseBytewiseComparatorImpl::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<const Comparator>* /*guard */,
+         std::string* /* errmsg */) { return ReverseBytewiseComparator(); });
+  library.AddFactory<const Comparator>(
+      ComparatorWithU64TsImpl<BytewiseComparatorImpl>::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<const Comparator>* /*guard */,
+         std::string* /* errmsg */) { return BytewiseComparatorWithU64Ts(); });
+  return 3;
+}
+
+Status Comparator::CreateFromString(const ConfigOptions& config_options,
+                                    const std::string& value,
+                                    const Comparator** result) {
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterBuiltinComparators(*(ObjectLibrary::Default().get()), "");
+  });
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status =
Customizable::GetOptionsMap(config_options, *result, value, + &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } + if (id == BytewiseComparatorImpl::kClassName()) { + *result = BytewiseComparator(); + } else if (id == ReverseBytewiseComparatorImpl::kClassName()) { + *result = ReverseBytewiseComparator(); + } else if (id == + ComparatorWithU64TsImpl::kClassName()) { + *result = BytewiseComparatorWithU64Ts(); + } else if (value.empty()) { + // No Id and no options. Clear the object + *result = nullptr; + return Status::OK(); + } else if (id.empty()) { // We have no Id but have options. Not good + return Status::NotSupported("Cannot reset object ", id); + } else { + status = config_options.registry->NewStaticObject(id, result); + if (!status.ok()) { + if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + return Status::OK(); + } else { + return status; + } + } else { + Comparator* comparator = const_cast(*result); + status = + Customizable::ConfigureNewObject(config_options, comparator, opt_map); + } + } + return status; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression.cc new file mode 100644 index 0000000000..712d333ee6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2022-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +StreamingCompress* StreamingCompress::Create(CompressionType compression_type, + const CompressionOptions& opts, + uint32_t compress_format_version, + size_t max_output_len) { + switch (compression_type) { + case kZSTD: { + if (!ZSTD_Streaming_Supported()) { + return nullptr; + } + return new ZSTDStreamingCompress(opts, compress_format_version, + max_output_len); + } + default: + return nullptr; + } +} + +StreamingUncompress* StreamingUncompress::Create( + CompressionType compression_type, uint32_t compress_format_version, + size_t max_output_len) { + switch (compression_type) { + case kZSTD: { + if (!ZSTD_Streaming_Supported()) { + return nullptr; + } + return new ZSTDStreamingUncompress(compress_format_version, + max_output_len); + } + default: + return nullptr; + } +} + +int ZSTDStreamingCompress::Compress(const char* input, size_t input_size, + char* output, size_t* output_pos) { + assert(input != nullptr && output != nullptr && output_pos != nullptr); + *output_pos = 0; + // Don't need to compress an empty input + if (input_size == 0) { + return 0; + } +#ifndef ZSTD_STREAMING + (void)input; + (void)input_size; + (void)output; + return -1; +#else + if (input_buffer_.src == nullptr || input_buffer_.src != input) { + // New input + // Catch errors where the previous input was not fully decompressed. + assert(input_buffer_.pos == input_buffer_.size); + input_buffer_ = {input, input_size, /*pos=*/0}; + } else if (input_buffer_.src == input) { + // Same input, not fully compressed. 
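+    // (Editor's note: nothing to do here; the retained input_buffer_ lets
+    // ZSTD_compressStream2 below resume where the previous call stopped.)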
+ } + ZSTD_outBuffer output_buffer = {output, max_output_len_, /*pos=*/0}; + const size_t remaining = + ZSTD_compressStream2(cctx_, &output_buffer, &input_buffer_, ZSTD_e_end); + if (ZSTD_isError(remaining)) { + // Failure + Reset(); + return -1; + } + // Success + *output_pos = output_buffer.pos; + return (int)remaining; +#endif +} + +void ZSTDStreamingCompress::Reset() { +#ifdef ZSTD_STREAMING + ZSTD_CCtx_reset(cctx_, ZSTD_ResetDirective::ZSTD_reset_session_only); + input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0}; +#endif +} + +int ZSTDStreamingUncompress::Uncompress(const char* input, size_t input_size, + char* output, size_t* output_pos) { + assert(output != nullptr && output_pos != nullptr); + *output_pos = 0; + // Don't need to uncompress an empty input + if (input_size == 0) { + return 0; + } +#ifdef ZSTD_STREAMING + if (input) { + // New input + input_buffer_ = {input, input_size, /*pos=*/0}; + } + ZSTD_outBuffer output_buffer = {output, max_output_len_, /*pos=*/0}; + size_t ret = ZSTD_decompressStream(dctx_, &output_buffer, &input_buffer_); + if (ZSTD_isError(ret)) { + Reset(); + return -1; + } + *output_pos = output_buffer.pos; + return (int)(input_buffer_.size - input_buffer_.pos); +#else + (void)input; + (void)input_size; + (void)output; + return -1; +#endif +} + +void ZSTDStreamingUncompress::Reset() { +#ifdef ZSTD_STREAMING + ZSTD_DCtx_reset(dctx_, ZSTD_ResetDirective::ZSTD_reset_session_only); + input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0}; +#endif +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression.h new file mode 100644 index 0000000000..eb5d1c7bbc --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression.h @@ -0,0 +1,1795 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
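+//
+// (Editor's sketch, not upstream code: one way to drive the streaming API
+// implemented in compression.cc above; buffer sizes are illustrative.
+//
+//   CompressionOptions opts;
+//   std::unique_ptr<StreamingCompress> sc(StreamingCompress::Create(
+//       kZSTD, opts, /*compress_format_version=*/2, /*max_output_len=*/4096));
+//   if (sc != nullptr) {
+//     char out[4096];
+//     size_t out_pos = 0;
+//     int remaining = sc->Compress(input, input_size, out, &out_pos);
+//     // > 0: call again with the same input to drain; 0: done; < 0: error.
+//   }
+//
+// Create() returns nullptr when streaming ZSTD support is not compiled in.)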
+// +#pragma once + +#include +#include +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include +#else // OS_FREEBSD +#include +#endif // OS_FREEBSD +#endif // ROCKSDB_MALLOC_USABLE_SIZE +#include + +#include "memory/memory_allocator_impl.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block_based/block_type.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/compression_context_cache.h" +#include "util/string_util.h" + +#ifdef SNAPPY +#include +#endif + +#ifdef ZLIB +#include +#endif + +#ifdef BZIP2 +#include +#endif + +#if defined(LZ4) +#include +#include +#endif + +#if defined(ZSTD) +#include +// v1.1.3+ +#if ZSTD_VERSION_NUMBER >= 10103 +#include +#endif // ZSTD_VERSION_NUMBER >= 10103 +// v1.4.0+ +#if ZSTD_VERSION_NUMBER >= 10400 +#define ZSTD_STREAMING +#endif // ZSTD_VERSION_NUMBER >= 10400 +namespace ROCKSDB_NAMESPACE { +// Need this for the context allocation override +// On windows we need to do this explicitly +#if (ZSTD_VERSION_NUMBER >= 500) +#if defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) && \ + defined(ZSTD_STATIC_LINKING_ONLY) +#define ROCKSDB_ZSTD_CUSTOM_MEM +namespace port { +ZSTD_customMem GetJeZstdAllocationOverrides(); +} // namespace port +#endif // defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) && + // defined(ZSTD_STATIC_LINKING_ONLY) + +// We require `ZSTD_sizeof_DDict` and `ZSTD_createDDict_byReference` to use +// `ZSTD_DDict`. The former was introduced in v1.0.0 and the latter was +// introduced in v1.1.3. But an important bug fix for `ZSTD_sizeof_DDict` came +// in v1.1.4, so that is the version we require. As of today's latest version +// (v1.3.8), they are both still in the experimental API, which means they are +// only exported when the compiler flag `ZSTD_STATIC_LINKING_ONLY` is set. 
+#if defined(ZSTD_STATIC_LINKING_ONLY) && ZSTD_VERSION_NUMBER >= 10104 +#define ROCKSDB_ZSTD_DDICT +#endif // defined(ZSTD_STATIC_LINKING_ONLY) && ZSTD_VERSION_NUMBER >= 10104 + +// Cached data represents a portion that can be re-used +// If, in the future we have more than one native context to +// cache we can arrange this as a tuple +class ZSTDUncompressCachedData { + public: + using ZSTDNativeContext = ZSTD_DCtx*; + ZSTDUncompressCachedData() {} + // Init from cache + ZSTDUncompressCachedData(const ZSTDUncompressCachedData& o) = delete; + ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete; + ZSTDUncompressCachedData(ZSTDUncompressCachedData&& o) noexcept + : ZSTDUncompressCachedData() { + *this = std::move(o); + } + ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&& o) noexcept { + assert(zstd_ctx_ == nullptr); + std::swap(zstd_ctx_, o.zstd_ctx_); + std::swap(cache_idx_, o.cache_idx_); + return *this; + } + ZSTDNativeContext Get() const { return zstd_ctx_; } + int64_t GetCacheIndex() const { return cache_idx_; } + void CreateIfNeeded() { + if (zstd_ctx_ == nullptr) { +#ifdef ROCKSDB_ZSTD_CUSTOM_MEM + zstd_ctx_ = + ZSTD_createDCtx_advanced(port::GetJeZstdAllocationOverrides()); +#else // ROCKSDB_ZSTD_CUSTOM_MEM + zstd_ctx_ = ZSTD_createDCtx(); +#endif // ROCKSDB_ZSTD_CUSTOM_MEM + cache_idx_ = -1; + } + } + void InitFromCache(const ZSTDUncompressCachedData& o, int64_t idx) { + zstd_ctx_ = o.zstd_ctx_; + cache_idx_ = idx; + } + ~ZSTDUncompressCachedData() { + if (zstd_ctx_ != nullptr && cache_idx_ == -1) { + ZSTD_freeDCtx(zstd_ctx_); + } + } + + private: + ZSTDNativeContext zstd_ctx_ = nullptr; + int64_t cache_idx_ = -1; // -1 means this instance owns the context +}; +#endif // (ZSTD_VERSION_NUMBER >= 500) +} // namespace ROCKSDB_NAMESPACE +#endif // ZSTD + +#if !(defined ZSTD) || !(ZSTD_VERSION_NUMBER >= 500) +namespace ROCKSDB_NAMESPACE { +class ZSTDUncompressCachedData { + void* padding; // unused + public: + using ZSTDNativeContext = void*; + ZSTDUncompressCachedData() {} + ZSTDUncompressCachedData(const ZSTDUncompressCachedData&) {} + ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete; + ZSTDUncompressCachedData(ZSTDUncompressCachedData&&) noexcept = default; + ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&&) noexcept = + default; + ZSTDNativeContext Get() const { return nullptr; } + int64_t GetCacheIndex() const { return -1; } + void CreateIfNeeded() {} + void InitFromCache(const ZSTDUncompressCachedData&, int64_t) {} + + private: + void ignore_padding__() { padding = nullptr; } +}; +} // namespace ROCKSDB_NAMESPACE +#endif + +#if defined(XPRESS) +#include "port/xpress.h" +#endif + +namespace ROCKSDB_NAMESPACE { + +// Holds dictionary and related data, like ZSTD's digested compression +// dictionary. 
+struct CompressionDict { +#if ZSTD_VERSION_NUMBER >= 700 + ZSTD_CDict* zstd_cdict_ = nullptr; +#endif // ZSTD_VERSION_NUMBER >= 700 + std::string dict_; + + public: +#if ZSTD_VERSION_NUMBER >= 700 + CompressionDict(std::string dict, CompressionType type, int level) { +#else // ZSTD_VERSION_NUMBER >= 700 + CompressionDict(std::string dict, CompressionType /*type*/, int /*level*/) { +#endif // ZSTD_VERSION_NUMBER >= 700 + dict_ = std::move(dict); +#if ZSTD_VERSION_NUMBER >= 700 + zstd_cdict_ = nullptr; + if (!dict_.empty() && (type == kZSTD || type == kZSTDNotFinalCompression)) { + if (level == CompressionOptions::kDefaultCompressionLevel) { + // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see + // https://github.com/facebook/zstd/issues/1148 + level = 3; + } + // Should be safe (but slower) if below call fails as we'll use the + // raw dictionary to compress. + zstd_cdict_ = ZSTD_createCDict(dict_.data(), dict_.size(), level); + assert(zstd_cdict_ != nullptr); + } +#endif // ZSTD_VERSION_NUMBER >= 700 + } + + ~CompressionDict() { +#if ZSTD_VERSION_NUMBER >= 700 + size_t res = 0; + if (zstd_cdict_ != nullptr) { + res = ZSTD_freeCDict(zstd_cdict_); + } + assert(res == 0); // Last I checked they can't fail + (void)res; // prevent unused var warning +#endif // ZSTD_VERSION_NUMBER >= 700 + } + +#if ZSTD_VERSION_NUMBER >= 700 + const ZSTD_CDict* GetDigestedZstdCDict() const { return zstd_cdict_; } +#endif // ZSTD_VERSION_NUMBER >= 700 + + Slice GetRawDict() const { return dict_; } + + static const CompressionDict& GetEmptyDict() { + static CompressionDict empty_dict{}; + return empty_dict; + } + + CompressionDict() = default; + // Disable copy/move + CompressionDict(const CompressionDict&) = delete; + CompressionDict& operator=(const CompressionDict&) = delete; + CompressionDict(CompressionDict&&) = delete; + CompressionDict& operator=(CompressionDict&&) = delete; +}; + +// Holds dictionary and related data, like ZSTD's digested uncompression +// dictionary. +struct UncompressionDict { + // Block containing the data for the compression dictionary in case the + // constructor that takes a string parameter is used. + std::string dict_; + + // Block containing the data for the compression dictionary in case the + // constructor that takes a Slice parameter is used and the passed in + // CacheAllocationPtr is not nullptr. + CacheAllocationPtr allocation_; + + // Slice pointing to the compression dictionary data. Can point to + // dict_, allocation_, or some other memory location, depending on how + // the object was constructed. + Slice slice_; + +#ifdef ROCKSDB_ZSTD_DDICT + // Processed version of the contents of slice_ for ZSTD compression. 
+ ZSTD_DDict* zstd_ddict_ = nullptr; +#endif // ROCKSDB_ZSTD_DDICT + +#ifdef ROCKSDB_ZSTD_DDICT + UncompressionDict(std::string dict, bool using_zstd) +#else // ROCKSDB_ZSTD_DDICT + UncompressionDict(std::string dict, bool /* using_zstd */) +#endif // ROCKSDB_ZSTD_DDICT + : dict_(std::move(dict)), slice_(dict_) { +#ifdef ROCKSDB_ZSTD_DDICT + if (!slice_.empty() && using_zstd) { + zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size()); + assert(zstd_ddict_ != nullptr); + } +#endif // ROCKSDB_ZSTD_DDICT + } + +#ifdef ROCKSDB_ZSTD_DDICT + UncompressionDict(Slice slice, CacheAllocationPtr&& allocation, + bool using_zstd) +#else // ROCKSDB_ZSTD_DDICT + UncompressionDict(Slice slice, CacheAllocationPtr&& allocation, + bool /* using_zstd */) +#endif // ROCKSDB_ZSTD_DDICT + : allocation_(std::move(allocation)), slice_(std::move(slice)) { +#ifdef ROCKSDB_ZSTD_DDICT + if (!slice_.empty() && using_zstd) { + zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size()); + assert(zstd_ddict_ != nullptr); + } +#endif // ROCKSDB_ZSTD_DDICT + } + + UncompressionDict(UncompressionDict&& rhs) + : dict_(std::move(rhs.dict_)), + allocation_(std::move(rhs.allocation_)), + slice_(std::move(rhs.slice_)) +#ifdef ROCKSDB_ZSTD_DDICT + , + zstd_ddict_(rhs.zstd_ddict_) +#endif + { +#ifdef ROCKSDB_ZSTD_DDICT + rhs.zstd_ddict_ = nullptr; +#endif + } + + ~UncompressionDict() { +#ifdef ROCKSDB_ZSTD_DDICT + size_t res = 0; + if (zstd_ddict_ != nullptr) { + res = ZSTD_freeDDict(zstd_ddict_); + } + assert(res == 0); // Last I checked they can't fail + (void)res; // prevent unused var warning +#endif // ROCKSDB_ZSTD_DDICT + } + + UncompressionDict& operator=(UncompressionDict&& rhs) { + if (this == &rhs) { + return *this; + } + + dict_ = std::move(rhs.dict_); + allocation_ = std::move(rhs.allocation_); + slice_ = std::move(rhs.slice_); + +#ifdef ROCKSDB_ZSTD_DDICT + zstd_ddict_ = rhs.zstd_ddict_; + rhs.zstd_ddict_ = nullptr; +#endif + + return *this; + } + + // The object is self-contained if the string constructor is used, or the + // Slice constructor is invoked with a non-null allocation. Otherwise, it + // is the caller's responsibility to ensure that the underlying storage + // outlives this object. 
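+  // (Editor's note: e.g. the std::string constructor above always owns its
+  // bytes, while the Slice constructor with a null allocation only borrows
+  // the caller's buffer.)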
+ bool own_bytes() const { return !dict_.empty() || allocation_; } + + const Slice& GetRawDict() const { return slice_; } + + // For TypedCacheInterface + const Slice& ContentSlice() const { return slice_; } + static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kOtherBlock; + static constexpr BlockType kBlockType = BlockType::kCompressionDictionary; + +#ifdef ROCKSDB_ZSTD_DDICT + const ZSTD_DDict* GetDigestedZstdDDict() const { return zstd_ddict_; } +#endif // ROCKSDB_ZSTD_DDICT + + static const UncompressionDict& GetEmptyDict() { + static UncompressionDict empty_dict{}; + return empty_dict; + } + + size_t ApproximateMemoryUsage() const { + size_t usage = sizeof(struct UncompressionDict); + usage += dict_.size(); + if (allocation_) { + auto allocator = allocation_.get_deleter().allocator; + if (allocator) { + usage += allocator->UsableSize(allocation_.get(), slice_.size()); + } else { + usage += slice_.size(); + } + } +#ifdef ROCKSDB_ZSTD_DDICT + usage += ZSTD_sizeof_DDict(zstd_ddict_); +#endif // ROCKSDB_ZSTD_DDICT + return usage; + } + + UncompressionDict() = default; + // Disable copy + UncompressionDict(const CompressionDict&) = delete; + UncompressionDict& operator=(const CompressionDict&) = delete; +}; + +class CompressionContext { + private: +#if defined(ZSTD) && (ZSTD_VERSION_NUMBER >= 500) + ZSTD_CCtx* zstd_ctx_ = nullptr; + void CreateNativeContext(CompressionType type) { + if (type == kZSTD || type == kZSTDNotFinalCompression) { +#ifdef ROCKSDB_ZSTD_CUSTOM_MEM + zstd_ctx_ = + ZSTD_createCCtx_advanced(port::GetJeZstdAllocationOverrides()); +#else // ROCKSDB_ZSTD_CUSTOM_MEM + zstd_ctx_ = ZSTD_createCCtx(); +#endif // ROCKSDB_ZSTD_CUSTOM_MEM + } + } + void DestroyNativeContext() { + if (zstd_ctx_ != nullptr) { + ZSTD_freeCCtx(zstd_ctx_); + } + } + + public: + // callable inside ZSTD_Compress + ZSTD_CCtx* ZSTDPreallocCtx() const { + assert(zstd_ctx_ != nullptr); + return zstd_ctx_; + } + +#else // ZSTD && (ZSTD_VERSION_NUMBER >= 500) + private: + void CreateNativeContext(CompressionType /* type */) {} + void DestroyNativeContext() {} +#endif // ZSTD && (ZSTD_VERSION_NUMBER >= 500) + public: + explicit CompressionContext(CompressionType type) { + CreateNativeContext(type); + } + ~CompressionContext() { DestroyNativeContext(); } + CompressionContext(const CompressionContext&) = delete; + CompressionContext& operator=(const CompressionContext&) = delete; +}; + +class CompressionInfo { + const CompressionOptions& opts_; + const CompressionContext& context_; + const CompressionDict& dict_; + const CompressionType type_; + const uint64_t sample_for_compression_; + + public: + CompressionInfo(const CompressionOptions& _opts, + const CompressionContext& _context, + const CompressionDict& _dict, CompressionType _type, + uint64_t _sample_for_compression) + : opts_(_opts), + context_(_context), + dict_(_dict), + type_(_type), + sample_for_compression_(_sample_for_compression) {} + + const CompressionOptions& options() const { return opts_; } + const CompressionContext& context() const { return context_; } + const CompressionDict& dict() const { return dict_; } + CompressionType type() const { return type_; } + uint64_t SampleForCompression() const { return sample_for_compression_; } +}; + +class UncompressionContext { + private: + CompressionContextCache* ctx_cache_ = nullptr; + ZSTDUncompressCachedData uncomp_cached_data_; + + public: + explicit UncompressionContext(CompressionType type) { + if (type == kZSTD || type == kZSTDNotFinalCompression) { + ctx_cache_ = 
CompressionContextCache::Instance(); + uncomp_cached_data_ = ctx_cache_->GetCachedZSTDUncompressData(); + } + } + ~UncompressionContext() { + if (uncomp_cached_data_.GetCacheIndex() != -1) { + assert(ctx_cache_ != nullptr); + ctx_cache_->ReturnCachedZSTDUncompressData( + uncomp_cached_data_.GetCacheIndex()); + } + } + UncompressionContext(const UncompressionContext&) = delete; + UncompressionContext& operator=(const UncompressionContext&) = delete; + + ZSTDUncompressCachedData::ZSTDNativeContext GetZSTDContext() const { + return uncomp_cached_data_.Get(); + } +}; + +class UncompressionInfo { + const UncompressionContext& context_; + const UncompressionDict& dict_; + const CompressionType type_; + + public: + UncompressionInfo(const UncompressionContext& _context, + const UncompressionDict& _dict, CompressionType _type) + : context_(_context), dict_(_dict), type_(_type) {} + + const UncompressionContext& context() const { return context_; } + const UncompressionDict& dict() const { return dict_; } + CompressionType type() const { return type_; } +}; + +inline bool Snappy_Supported() { +#ifdef SNAPPY + return true; +#else + return false; +#endif +} + +inline bool Zlib_Supported() { +#ifdef ZLIB + return true; +#else + return false; +#endif +} + +inline bool BZip2_Supported() { +#ifdef BZIP2 + return true; +#else + return false; +#endif +} + +inline bool LZ4_Supported() { +#ifdef LZ4 + return true; +#else + return false; +#endif +} + +inline bool XPRESS_Supported() { +#ifdef XPRESS + return true; +#else + return false; +#endif +} + +inline bool ZSTD_Supported() { +#ifdef ZSTD + // ZSTD format is finalized since version 0.8.0. + return (ZSTD_versionNumber() >= 800); +#else + return false; +#endif +} + +inline bool ZSTDNotFinal_Supported() { +#ifdef ZSTD + return true; +#else + return false; +#endif +} + +inline bool ZSTD_Streaming_Supported() { +#if defined(ZSTD) && defined(ZSTD_STREAMING) + return true; +#else + return false; +#endif +} + +inline bool StreamingCompressionTypeSupported( + CompressionType compression_type) { + switch (compression_type) { + case kNoCompression: + return true; + case kZSTD: + return ZSTD_Streaming_Supported(); + default: + return false; + } +} + +inline bool CompressionTypeSupported(CompressionType compression_type) { + switch (compression_type) { + case kNoCompression: + return true; + case kSnappyCompression: + return Snappy_Supported(); + case kZlibCompression: + return Zlib_Supported(); + case kBZip2Compression: + return BZip2_Supported(); + case kLZ4Compression: + return LZ4_Supported(); + case kLZ4HCCompression: + return LZ4_Supported(); + case kXpressCompression: + return XPRESS_Supported(); + case kZSTDNotFinalCompression: + return ZSTDNotFinal_Supported(); + case kZSTD: + return ZSTD_Supported(); + default: + assert(false); + return false; + } +} + +inline bool DictCompressionTypeSupported(CompressionType compression_type) { + switch (compression_type) { + case kNoCompression: + return false; + case kSnappyCompression: + return false; + case kZlibCompression: + return Zlib_Supported(); + case kBZip2Compression: + return false; + case kLZ4Compression: + case kLZ4HCCompression: +#if LZ4_VERSION_NUMBER >= 10400 // r124+ + return LZ4_Supported(); +#else + return false; +#endif + case kXpressCompression: + return false; + case kZSTDNotFinalCompression: +#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ + return ZSTDNotFinal_Supported(); +#else + return false; +#endif + case kZSTD: +#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ + return ZSTD_Supported(); +#else + return 
false; +#endif + default: + assert(false); + return false; + } +} + +inline std::string CompressionTypeToString(CompressionType compression_type) { + switch (compression_type) { + case kNoCompression: + return "NoCompression"; + case kSnappyCompression: + return "Snappy"; + case kZlibCompression: + return "Zlib"; + case kBZip2Compression: + return "BZip2"; + case kLZ4Compression: + return "LZ4"; + case kLZ4HCCompression: + return "LZ4HC"; + case kXpressCompression: + return "Xpress"; + case kZSTD: + return "ZSTD"; + case kZSTDNotFinalCompression: + return "ZSTDNotFinal"; + case kDisableCompressionOption: + return "DisableOption"; + default: + assert(false); + return ""; + } +} + +inline std::string CompressionOptionsToString( + CompressionOptions& compression_options) { + std::string result; + result.reserve(512); + result.append("window_bits=") + .append(std::to_string(compression_options.window_bits)) + .append("; "); + result.append("level=") + .append(std::to_string(compression_options.level)) + .append("; "); + result.append("strategy=") + .append(std::to_string(compression_options.strategy)) + .append("; "); + result.append("max_dict_bytes=") + .append(std::to_string(compression_options.max_dict_bytes)) + .append("; "); + result.append("zstd_max_train_bytes=") + .append(std::to_string(compression_options.zstd_max_train_bytes)) + .append("; "); + result.append("enabled=") + .append(std::to_string(compression_options.enabled)) + .append("; "); + result.append("max_dict_buffer_bytes=") + .append(std::to_string(compression_options.max_dict_buffer_bytes)) + .append("; "); + result.append("use_zstd_dict_trainer=") + .append(std::to_string(compression_options.use_zstd_dict_trainer)) + .append("; "); + return result; +} + +// compress_format_version can have two values: +// 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed +// block. Also, decompressed sizes for LZ4 are encoded in platform-dependent +// way. +// 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the +// start of compressed block. Snappy format is the same as version 1. 
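+//
+// (Editor's illustration: under format version 2, a block whose decompressed
+// size is 300 bytes begins with the varint32 bytes 0xAC 0x02, immediately
+// followed by the compressed payload; under version 1, BZip2 and Zlib blocks
+// carry no size header at all.)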
+ +inline bool Snappy_Compress(const CompressionInfo& /*info*/, const char* input, + size_t length, ::std::string* output) { +#ifdef SNAPPY + output->resize(snappy::MaxCompressedLength(length)); + size_t outlen; + snappy::RawCompress(input, length, &(*output)[0], &outlen); + output->resize(outlen); + return true; +#else + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +inline CacheAllocationPtr Snappy_Uncompress( + const char* input, size_t length, size_t* uncompressed_size, + MemoryAllocator* allocator = nullptr) { +#ifdef SNAPPY + size_t uncompressed_length = 0; + if (!snappy::GetUncompressedLength(input, length, &uncompressed_length)) { + return nullptr; + } + + CacheAllocationPtr output = AllocateBlock(uncompressed_length, allocator); + + if (!snappy::RawUncompress(input, length, output.get())) { + return nullptr; + } + + *uncompressed_size = uncompressed_length; + + return output; +#else + (void)input; + (void)length; + (void)uncompressed_size; + (void)allocator; + return nullptr; +#endif +} + +namespace compression { +// returns size +inline size_t PutDecompressedSizeInfo(std::string* output, uint32_t length) { + PutVarint32(output, length); + return output->size(); +} + +inline bool GetDecompressedSizeInfo(const char** input_data, + size_t* input_length, + uint32_t* output_len) { + auto new_input_data = + GetVarint32Ptr(*input_data, *input_data + *input_length, output_len); + if (new_input_data == nullptr) { + return false; + } + *input_length -= (new_input_data - *input_data); + *input_data = new_input_data; + return true; +} +} // namespace compression + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +// @param compression_dict Data for presetting the compression library's +// dictionary. +inline bool Zlib_Compress(const CompressionInfo& info, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef ZLIB + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } + + // The memLevel parameter specifies how much memory should be allocated for + // the internal compression state. + // memLevel=1 uses minimum memory but is slow and reduces compression ratio. + // memLevel=9 uses maximum memory for optimal speed. + // The default value is 8. See zconf.h for more details. + static const int memLevel = 8; + int level; + if (info.options().level == CompressionOptions::kDefaultCompressionLevel) { + level = Z_DEFAULT_COMPRESSION; + } else { + level = info.options().level; + } + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + int st = deflateInit2(&_stream, level, Z_DEFLATED, info.options().window_bits, + memLevel, info.options().strategy); + if (st != Z_OK) { + return false; + } + + Slice compression_dict = info.dict().GetRawDict(); + if (compression_dict.size()) { + // Initialize the compression library's dictionary + st = deflateSetDictionary( + &_stream, reinterpret_cast(compression_dict.data()), + static_cast(compression_dict.size())); + if (st != Z_OK) { + deflateEnd(&_stream); + return false; + } + } + + // Get an upper bound on the compressed size. 
+ size_t upper_bound = + deflateBound(&_stream, static_cast(length)); + output->resize(output_header_len + upper_bound); + + // Compress the input, and put compressed data in output. + _stream.next_in = (Bytef*)input; + _stream.avail_in = static_cast(length); + + // Initialize the output size. + _stream.avail_out = static_cast(upper_bound); + _stream.next_out = reinterpret_cast(&(*output)[output_header_len]); + + bool compressed = false; + st = deflate(&_stream, Z_FINISH); + if (st == Z_STREAM_END) { + compressed = true; + output->resize(output->size() - _stream.avail_out); + } + // The only return value we really care about is Z_STREAM_END. + // Z_OK means insufficient output space. This means the compression is + // bigger than decompressed size. Just fail the compression in that case. + + deflateEnd(&_stream); + return compressed; +#else + (void)info; + (void)compress_format_version; + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +// @param compression_dict Data for presetting the compression library's +// dictionary. +inline CacheAllocationPtr Zlib_Uncompress( + const UncompressionInfo& info, const char* input_data, size_t input_length, + size_t* uncompressed_size, uint32_t compress_format_version, + MemoryAllocator* allocator = nullptr, int windowBits = -14) { +#ifdef ZLIB + uint32_t output_len = 0; + if (compress_format_version == 2) { + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + } else { + // Assume the decompressed data size will 5x of compressed size, but round + // to the page size + size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096; + output_len = static_cast( + std::min(proposed_output_len, + static_cast(std::numeric_limits::max()))); + } + + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + + // For raw inflate, the windowBits should be -8..-15. + // If windowBits is bigger than zero, it will use either zlib + // header or gzip header. Adding 32 to it will do automatic detection. + int st = + inflateInit2(&_stream, windowBits > 0 ? windowBits + 32 : windowBits); + if (st != Z_OK) { + return nullptr; + } + + const Slice& compression_dict = info.dict().GetRawDict(); + if (compression_dict.size()) { + // Initialize the compression library's dictionary + st = inflateSetDictionary( + &_stream, reinterpret_cast(compression_dict.data()), + static_cast(compression_dict.size())); + if (st != Z_OK) { + return nullptr; + } + } + + _stream.next_in = (Bytef*)input_data; + _stream.avail_in = static_cast(input_length); + + auto output = AllocateBlock(output_len, allocator); + + _stream.next_out = (Bytef*)output.get(); + _stream.avail_out = static_cast(output_len); + + bool done = false; + while (!done) { + st = inflate(&_stream, Z_SYNC_FLUSH); + switch (st) { + case Z_STREAM_END: + done = true; + break; + case Z_OK: { + // No output space. Increase the output space by 20%. + // We should never run out of output space if + // compress_format_version == 2 + assert(compress_format_version != 2); + size_t old_sz = output_len; + uint32_t output_len_delta = output_len / 5; + output_len += output_len_delta < 10 ? 
10 : output_len_delta; + auto tmp = AllocateBlock(output_len, allocator); + memcpy(tmp.get(), output.get(), old_sz); + output = std::move(tmp); + + // Set more output. + _stream.next_out = (Bytef*)(output.get() + old_sz); + _stream.avail_out = static_cast(output_len - old_sz); + break; + } + case Z_BUF_ERROR: + default: + inflateEnd(&_stream); + return nullptr; + } + } + + // If we encoded decompressed block size, we should have no bytes left + assert(compress_format_version != 2 || _stream.avail_out == 0); + assert(output_len >= _stream.avail_out); + *uncompressed_size = output_len - _stream.avail_out; + inflateEnd(&_stream); + return output; +#else + (void)info; + (void)input_data; + (void)input_length; + (void)uncompressed_size; + (void)compress_format_version; + (void)allocator; + (void)windowBits; + return nullptr; +#endif +} + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline bool BZip2_Compress(const CompressionInfo& /*info*/, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef BZIP2 + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + size_t output_header_len = 0; + if (compress_format_version == 2) { + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(output_header_len + length); + + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + // Block size 1 is 100K. + // 0 is for silent. + // 30 is the default workFactor + int st = BZ2_bzCompressInit(&_stream, 1, 0, 30); + if (st != BZ_OK) { + return false; + } + + // Compress the input, and put compressed data in output. + _stream.next_in = (char*)input; + _stream.avail_in = static_cast(length); + + // Initialize the output size. + _stream.avail_out = static_cast(length); + _stream.next_out = reinterpret_cast(&(*output)[output_header_len]); + + bool compressed = false; + st = BZ2_bzCompress(&_stream, BZ_FINISH); + if (st == BZ_STREAM_END) { + compressed = true; + output->resize(output->size() - _stream.avail_out); + } + // The only return value we really care about is BZ_STREAM_END. + // BZ_FINISH_OK means insufficient output space. This means the compression + // is bigger than decompressed size. Just fail the compression in that case. 
+
+  BZ2_bzCompressEnd(&_stream);
+  return compressed;
+#else
+  (void)compress_format_version;
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+inline CacheAllocationPtr BZip2_Uncompress(
+    const char* input_data, size_t input_length, size_t* uncompressed_size,
+    uint32_t compress_format_version, MemoryAllocator* allocator = nullptr) {
+#ifdef BZIP2
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // Assume the decompressed data size will be 5x the compressed size, but
+    // round up to the next page size
+    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
+    output_len = static_cast<uint32_t>(
+        std::min(proposed_output_len,
+                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
+  }
+
+  bz_stream _stream;
+  memset(&_stream, 0, sizeof(bz_stream));
+
+  int st = BZ2_bzDecompressInit(&_stream, 0, 0);
+  if (st != BZ_OK) {
+    return nullptr;
+  }
+
+  _stream.next_in = (char*)input_data;
+  _stream.avail_in = static_cast<unsigned int>(input_length);
+
+  auto output = AllocateBlock(output_len, allocator);
+
+  _stream.next_out = (char*)output.get();
+  _stream.avail_out = static_cast<unsigned int>(output_len);
+
+  bool done = false;
+  while (!done) {
+    st = BZ2_bzDecompress(&_stream);
+    switch (st) {
+      case BZ_STREAM_END:
+        done = true;
+        break;
+      case BZ_OK: {
+        // No output space. Increase the output space by 20%.
+        // We should never run out of output space if
+        // compress_format_version == 2
+        assert(compress_format_version != 2);
+        uint32_t old_sz = output_len;
+        output_len = output_len * 1.2;
+        auto tmp = AllocateBlock(output_len, allocator);
+        memcpy(tmp.get(), output.get(), old_sz);
+        output = std::move(tmp);
+
+        // Set more output.
+        _stream.next_out = (char*)(output.get() + old_sz);
+        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
+        break;
+      }
+      default:
+        BZ2_bzDecompressEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  // If we encoded decompressed block size, we should have no bytes left
+  assert(compress_format_version != 2 || _stream.avail_out == 0);
+  assert(output_len >= _stream.avail_out);
+  *uncompressed_size = output_len - _stream.avail_out;
+  BZ2_bzDecompressEnd(&_stream);
+  return output;
+#else
+  (void)input_data;
+  (void)input_length;
+  (void)uncompressed_size;
+  (void)compress_format_version;
+  (void)allocator;
+  return nullptr;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy, which makes the database non-portable
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+// dictionary.
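+// Illustrative layout note (an editorial sketch, not upstream documentation):
+// with compress_format_version == 1 the header is the raw bytes of `length`
+// (a size_t), so a 300-byte block on a little-endian 64-bit build is laid
+// out as
+//   [2c 01 00 00 00 00 00 00][lz4 payload ...]
+// while compress_format_version == 2 stores the same size as a varint32:
+//   [ac 02][lz4 payload ...]
+// which is both smaller and independent of host endianness.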
+inline bool LZ4_Compress(const CompressionInfo& info, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef LZ4 + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + // new encoding, using varint32 to store size information + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } else { + // legacy encoding, which is not really portable (depends on big/little + // endianness) + output_header_len = 8; + output->resize(output_header_len); + char* p = const_cast(output->c_str()); + memcpy(p, &length, sizeof(length)); + } + int compress_bound = LZ4_compressBound(static_cast(length)); + output->resize(static_cast(output_header_len + compress_bound)); + + int outlen; +#if LZ4_VERSION_NUMBER >= 10400 // r124+ + LZ4_stream_t* stream = LZ4_createStream(); + Slice compression_dict = info.dict().GetRawDict(); + if (compression_dict.size()) { + LZ4_loadDict(stream, compression_dict.data(), + static_cast(compression_dict.size())); + } +#if LZ4_VERSION_NUMBER >= 10700 // r129+ + outlen = + LZ4_compress_fast_continue(stream, input, &(*output)[output_header_len], + static_cast(length), compress_bound, 1); +#else // up to r128 + outlen = LZ4_compress_limitedOutput_continue( + stream, input, &(*output)[output_header_len], static_cast(length), + compress_bound); +#endif + LZ4_freeStream(stream); +#else // up to r123 + outlen = LZ4_compress_limitedOutput(input, &(*output)[output_header_len], + static_cast(length), compress_bound); +#endif // LZ4_VERSION_NUMBER >= 10400 + + if (outlen == 0) { + return false; + } + output->resize(static_cast(output_header_len + outlen)); + return true; +#else // LZ4 + (void)info; + (void)compress_format_version; + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +// compress_format_version == 1 -- decompressed size is included in the +// block header using memcpy, which makes database non-portable) +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +// @param compression_dict Data for presetting the compression library's +// dictionary. 
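+// A minimal round-trip sketch (illustrative only; construction of the
+// CompressionInfo/UncompressionInfo arguments `info` and `uncomp_info` is
+// elided, and `raw` is a hypothetical Slice):
+//   std::string compressed;
+//   if (LZ4_Compress(info, 2, raw.data(), raw.size(), &compressed)) {
+//     size_t out_size = 0;
+//     CacheAllocationPtr out = LZ4_Uncompress(
+//         uncomp_info, compressed.data(), compressed.size(), &out_size, 2);
+//     assert(out != nullptr && out_size == raw.size());
+//   }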
+inline CacheAllocationPtr LZ4_Uncompress(const UncompressionInfo& info, + const char* input_data, + size_t input_length, + size_t* uncompressed_size, + uint32_t compress_format_version, + MemoryAllocator* allocator = nullptr) { +#ifdef LZ4 + uint32_t output_len = 0; + if (compress_format_version == 2) { + // new encoding, using varint32 to store size information + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + } else { + // legacy encoding, which is not really portable (depends on big/little + // endianness) + if (input_length < 8) { + return nullptr; + } + if (port::kLittleEndian) { + memcpy(&output_len, input_data, sizeof(output_len)); + } else { + memcpy(&output_len, input_data + 4, sizeof(output_len)); + } + input_length -= 8; + input_data += 8; + } + + auto output = AllocateBlock(output_len, allocator); + + int decompress_bytes = 0; + +#if LZ4_VERSION_NUMBER >= 10400 // r124+ + LZ4_streamDecode_t* stream = LZ4_createStreamDecode(); + const Slice& compression_dict = info.dict().GetRawDict(); + if (compression_dict.size()) { + LZ4_setStreamDecode(stream, compression_dict.data(), + static_cast(compression_dict.size())); + } + decompress_bytes = LZ4_decompress_safe_continue( + stream, input_data, output.get(), static_cast(input_length), + static_cast(output_len)); + LZ4_freeStreamDecode(stream); +#else // up to r123 + decompress_bytes = LZ4_decompress_safe(input_data, output.get(), + static_cast(input_length), + static_cast(output_len)); +#endif // LZ4_VERSION_NUMBER >= 10400 + + if (decompress_bytes < 0) { + return nullptr; + } + assert(decompress_bytes == static_cast(output_len)); + *uncompressed_size = decompress_bytes; + return output; +#else // LZ4 + (void)info; + (void)input_data; + (void)input_length; + (void)uncompressed_size; + (void)compress_format_version; + (void)allocator; + return nullptr; +#endif +} + +// compress_format_version == 1 -- decompressed size is included in the +// block header using memcpy, which makes database non-portable) +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +// @param compression_dict Data for presetting the compression library's +// dictionary. 
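+// Dictionary note (illustrative): the compressor and the decompressor must
+// preset the same raw dictionary bytes -- LZ4_loadDict()/LZ4_loadDictHC() on
+// the compress side and LZ4_setStreamDecode() on the uncompress side, as in
+// LZ4_Uncompress() above. With mismatched dictionaries,
+// LZ4_decompress_safe_continue() fails (returns a negative value).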
+inline bool LZ4HC_Compress(const CompressionInfo& info, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef LZ4 + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + // new encoding, using varint32 to store size information + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } else { + // legacy encoding, which is not really portable (depends on big/little + // endianness) + output_header_len = 8; + output->resize(output_header_len); + char* p = const_cast(output->c_str()); + memcpy(p, &length, sizeof(length)); + } + int compress_bound = LZ4_compressBound(static_cast(length)); + output->resize(static_cast(output_header_len + compress_bound)); + + int outlen; + int level; + if (info.options().level == CompressionOptions::kDefaultCompressionLevel) { + level = 0; // lz4hc.h says any value < 1 will be sanitized to default + } else { + level = info.options().level; + } +#if LZ4_VERSION_NUMBER >= 10400 // r124+ + LZ4_streamHC_t* stream = LZ4_createStreamHC(); + LZ4_resetStreamHC(stream, level); + Slice compression_dict = info.dict().GetRawDict(); + const char* compression_dict_data = + compression_dict.size() > 0 ? compression_dict.data() : nullptr; + size_t compression_dict_size = compression_dict.size(); + if (compression_dict_data != nullptr) { + LZ4_loadDictHC(stream, compression_dict_data, + static_cast(compression_dict_size)); + } + +#if LZ4_VERSION_NUMBER >= 10700 // r129+ + outlen = + LZ4_compress_HC_continue(stream, input, &(*output)[output_header_len], + static_cast(length), compress_bound); +#else // r124-r128 + outlen = LZ4_compressHC_limitedOutput_continue( + stream, input, &(*output)[output_header_len], static_cast(length), + compress_bound); +#endif // LZ4_VERSION_NUMBER >= 10700 + LZ4_freeStreamHC(stream); + +#elif LZ4_VERSION_MAJOR // r113-r123 + outlen = LZ4_compressHC2_limitedOutput(input, &(*output)[output_header_len], + static_cast(length), + compress_bound, level); +#else // up to r112 + outlen = + LZ4_compressHC_limitedOutput(input, &(*output)[output_header_len], + static_cast(length), compress_bound); +#endif // LZ4_VERSION_NUMBER >= 10400 + + if (outlen == 0) { + return false; + } + output->resize(static_cast(output_header_len + outlen)); + return true; +#else // LZ4 + (void)info; + (void)compress_format_version; + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +#ifdef XPRESS +inline bool XPRESS_Compress(const char* input, size_t length, + std::string* output) { + return port::xpress::Compress(input, length, output); +} +#else +inline bool XPRESS_Compress(const char* /*input*/, size_t /*length*/, + std::string* /*output*/) { + return false; +} +#endif + +#ifdef XPRESS +inline char* XPRESS_Uncompress(const char* input_data, size_t input_length, + size_t* uncompressed_size) { + return port::xpress::Decompress(input_data, input_length, uncompressed_size); +} +#else +inline char* XPRESS_Uncompress(const char* /*input_data*/, + size_t /*input_length*/, + size_t* /*uncompressed_size*/) { + return nullptr; +} +#endif + +inline bool ZSTD_Compress(const CompressionInfo& info, const char* input, + size_t length, ::std::string* output) { +#ifdef ZSTD + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = compression::PutDecompressedSizeInfo( + output, 
static_cast(length)); + + size_t compressBound = ZSTD_compressBound(length); + output->resize(static_cast(output_header_len + compressBound)); + size_t outlen = 0; + int level; + if (info.options().level == CompressionOptions::kDefaultCompressionLevel) { + // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see + // https://github.com/facebook/zstd/issues/1148 + level = 3; + } else { + level = info.options().level; + } +#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ + ZSTD_CCtx* context = info.context().ZSTDPreallocCtx(); + assert(context != nullptr); +#if ZSTD_VERSION_NUMBER >= 700 // v0.7.0+ + if (info.dict().GetDigestedZstdCDict() != nullptr) { + outlen = ZSTD_compress_usingCDict(context, &(*output)[output_header_len], + compressBound, input, length, + info.dict().GetDigestedZstdCDict()); + } +#endif // ZSTD_VERSION_NUMBER >= 700 + if (outlen == 0) { + outlen = ZSTD_compress_usingDict(context, &(*output)[output_header_len], + compressBound, input, length, + info.dict().GetRawDict().data(), + info.dict().GetRawDict().size(), level); + } +#else // up to v0.4.x + outlen = ZSTD_compress(&(*output)[output_header_len], compressBound, input, + length, level); +#endif // ZSTD_VERSION_NUMBER >= 500 + if (outlen == 0) { + return false; + } + output->resize(output_header_len + outlen); + return true; +#else // ZSTD + (void)info; + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +// @param compression_dict Data for presetting the compression library's +// dictionary. +inline CacheAllocationPtr ZSTD_Uncompress( + const UncompressionInfo& info, const char* input_data, size_t input_length, + size_t* uncompressed_size, MemoryAllocator* allocator = nullptr) { +#ifdef ZSTD + uint32_t output_len = 0; + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + + auto output = AllocateBlock(output_len, allocator); + size_t actual_output_length = 0; +#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ + ZSTD_DCtx* context = info.context().GetZSTDContext(); + assert(context != nullptr); +#ifdef ROCKSDB_ZSTD_DDICT + if (info.dict().GetDigestedZstdDDict() != nullptr) { + actual_output_length = ZSTD_decompress_usingDDict( + context, output.get(), output_len, input_data, input_length, + info.dict().GetDigestedZstdDDict()); + } +#endif // ROCKSDB_ZSTD_DDICT + if (actual_output_length == 0) { + actual_output_length = ZSTD_decompress_usingDict( + context, output.get(), output_len, input_data, input_length, + info.dict().GetRawDict().data(), info.dict().GetRawDict().size()); + } +#else // up to v0.4.x + (void)info; + actual_output_length = + ZSTD_decompress(output.get(), output_len, input_data, input_length); +#endif // ZSTD_VERSION_NUMBER >= 500 + assert(actual_output_length == output_len); + *uncompressed_size = actual_output_length; + return output; +#else // ZSTD + (void)info; + (void)input_data; + (void)input_length; + (void)uncompressed_size; + (void)allocator; + return nullptr; +#endif +} + +inline bool ZSTD_TrainDictionarySupported() { +#ifdef ZSTD + // Dictionary trainer is available since v0.6.1 for static linking, but not + // available for dynamic linking until v1.1.3. For now we enable the feature + // in v1.1.3+ only. 
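+  // (Illustrative note: ZSTD_versionNumber() encodes the version as
+  // MAJOR*10000 + MINOR*100 + RELEASE, so 10103 below corresponds to v1.1.3.)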
+ return (ZSTD_versionNumber() >= 10103); +#else + return false; +#endif +} + +inline std::string ZSTD_TrainDictionary(const std::string& samples, + const std::vector& sample_lens, + size_t max_dict_bytes) { + // Dictionary trainer is available since v0.6.1 for static linking, but not + // available for dynamic linking until v1.1.3. For now we enable the feature + // in v1.1.3+ only. +#if ZSTD_VERSION_NUMBER >= 10103 // v1.1.3+ + assert(samples.empty() == sample_lens.empty()); + if (samples.empty()) { + return ""; + } + std::string dict_data(max_dict_bytes, '\0'); + size_t dict_len = ZDICT_trainFromBuffer( + &dict_data[0], max_dict_bytes, &samples[0], &sample_lens[0], + static_cast(sample_lens.size())); + if (ZDICT_isError(dict_len)) { + return ""; + } + assert(dict_len <= max_dict_bytes); + dict_data.resize(dict_len); + return dict_data; +#else // up to v1.1.2 + assert(false); + (void)samples; + (void)sample_lens; + (void)max_dict_bytes; + return ""; +#endif // ZSTD_VERSION_NUMBER >= 10103 +} + +inline std::string ZSTD_TrainDictionary(const std::string& samples, + size_t sample_len_shift, + size_t max_dict_bytes) { + // Dictionary trainer is available since v0.6.1, but ZSTD was marked stable + // only since v0.8.0. For now we enable the feature in stable versions only. +#if ZSTD_VERSION_NUMBER >= 10103 // v1.1.3+ + // skips potential partial sample at the end of "samples" + size_t num_samples = samples.size() >> sample_len_shift; + std::vector sample_lens(num_samples, size_t(1) << sample_len_shift); + return ZSTD_TrainDictionary(samples, sample_lens, max_dict_bytes); +#else // up to v1.1.2 + assert(false); + (void)samples; + (void)sample_len_shift; + (void)max_dict_bytes; + return ""; +#endif // ZSTD_VERSION_NUMBER >= 10103 +} + +inline bool ZSTD_FinalizeDictionarySupported() { +#ifdef ZSTD + // ZDICT_finalizeDictionary API is stable since v1.4.5 + return (ZSTD_versionNumber() >= 10405); +#else + return false; +#endif +} + +inline std::string ZSTD_FinalizeDictionary( + const std::string& samples, const std::vector& sample_lens, + size_t max_dict_bytes, int level) { + // ZDICT_finalizeDictionary is stable since version v1.4.5 +#if ZSTD_VERSION_NUMBER >= 10405 // v1.4.5+ + assert(samples.empty() == sample_lens.empty()); + if (samples.empty()) { + return ""; + } + if (level == CompressionOptions::kDefaultCompressionLevel) { + // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see + // https://github.com/facebook/zstd/issues/1148 + level = 3; + } + std::string dict_data(max_dict_bytes, '\0'); + size_t dict_len = ZDICT_finalizeDictionary( + dict_data.data(), max_dict_bytes, samples.data(), + std::min(static_cast(samples.size()), max_dict_bytes), + samples.data(), sample_lens.data(), + static_cast(sample_lens.size()), + {level, 0 /* notificationLevel */, 0 /* dictID */}); + if (ZDICT_isError(dict_len)) { + return ""; + } else { + assert(dict_len <= max_dict_bytes); + dict_data.resize(dict_len); + return dict_data; + } +#else // up to v1.4.4 + (void)samples; + (void)sample_lens; + (void)max_dict_bytes; + (void)level; + return ""; +#endif // ZSTD_VERSION_NUMBER >= 10405 +} + +inline bool CompressData(const Slice& raw, + const CompressionInfo& compression_info, + uint32_t compress_format_version, + std::string* compressed_output) { + bool ret = false; + + // Will return compressed block contents if (1) the compression method is + // supported in this platform and (2) the compression rate is "good enough". 
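+  // Illustrative caller-side sketch (not part of this function; names are
+  // hypothetical): callers implement the "good enough" test, roughly
+  //   std::string compressed;
+  //   if (CompressData(raw, info, /*compress_format_version=*/2,
+  //                    &compressed) &&
+  //       compressed.size() < raw.size() - (raw.size() / 8u)) {
+  //     // emit the compressed block
+  //   } else {
+  //     // emit the raw block
+  //   }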
+ switch (compression_info.type()) { + case kSnappyCompression: + ret = Snappy_Compress(compression_info, raw.data(), raw.size(), + compressed_output); + break; + case kZlibCompression: + ret = Zlib_Compress(compression_info, compress_format_version, raw.data(), + raw.size(), compressed_output); + break; + case kBZip2Compression: + ret = BZip2_Compress(compression_info, compress_format_version, + raw.data(), raw.size(), compressed_output); + break; + case kLZ4Compression: + ret = LZ4_Compress(compression_info, compress_format_version, raw.data(), + raw.size(), compressed_output); + break; + case kLZ4HCCompression: + ret = LZ4HC_Compress(compression_info, compress_format_version, + raw.data(), raw.size(), compressed_output); + break; + case kXpressCompression: + ret = XPRESS_Compress(raw.data(), raw.size(), compressed_output); + break; + case kZSTD: + case kZSTDNotFinalCompression: + ret = ZSTD_Compress(compression_info, raw.data(), raw.size(), + compressed_output); + break; + default: + // Do not recognize this compression type + break; + } + + TEST_SYNC_POINT_CALLBACK("CompressData:TamperWithReturnValue", + static_cast(&ret)); + + return ret; +} + +inline CacheAllocationPtr UncompressData( + const UncompressionInfo& uncompression_info, const char* data, size_t n, + size_t* uncompressed_size, uint32_t compress_format_version, + MemoryAllocator* allocator = nullptr) { + switch (uncompression_info.type()) { + case kSnappyCompression: + return Snappy_Uncompress(data, n, uncompressed_size, allocator); + case kZlibCompression: + return Zlib_Uncompress(uncompression_info, data, n, uncompressed_size, + compress_format_version, allocator); + case kBZip2Compression: + return BZip2_Uncompress(data, n, uncompressed_size, + compress_format_version, allocator); + case kLZ4Compression: + case kLZ4HCCompression: + return LZ4_Uncompress(uncompression_info, data, n, uncompressed_size, + compress_format_version, allocator); + case kXpressCompression: + // XPRESS allocates memory internally, thus no support for custom + // allocator. + return CacheAllocationPtr(XPRESS_Uncompress(data, n, uncompressed_size)); + case kZSTD: + case kZSTDNotFinalCompression: + return ZSTD_Uncompress(uncompression_info, data, n, uncompressed_size, + allocator); + default: + return CacheAllocationPtr(); + } +} + +// Records the compression type for subsequent WAL records. +class CompressionTypeRecord { + public: + explicit CompressionTypeRecord(CompressionType compression_type) + : compression_type_(compression_type) {} + + CompressionType GetCompressionType() const { return compression_type_; } + + inline void EncodeTo(std::string* dst) const { + assert(dst != nullptr); + PutFixed32(dst, compression_type_); + } + + inline Status DecodeFrom(Slice* src) { + constexpr char class_name[] = "CompressionTypeRecord"; + + uint32_t val; + if (!GetFixed32(src, &val)) { + return Status::Corruption(class_name, + "Error decoding WAL compression type"); + } + CompressionType compression_type = static_cast(val); + if (!StreamingCompressionTypeSupported(compression_type)) { + return Status::Corruption(class_name, + "WAL compression type not supported"); + } + compression_type_ = compression_type; + return Status::OK(); + } + + inline std::string DebugString() const { + return "compression_type: " + CompressionTypeToString(compression_type_); + } + + private: + CompressionType compression_type_; +}; + +// Base class to implement compression for a stream of buffers. 
+// Instantiate an implementation of the class using Create() with the
+// compression type and use Compress() repeatedly.
+// The output buffer needs to be at least max_output_len.
+// Call Reset() in between frame boundaries or in case of an error.
+// NOTE: This class is not thread safe.
+class StreamingCompress {
+ public:
+  StreamingCompress(CompressionType compression_type,
+                    const CompressionOptions& opts,
+                    uint32_t compress_format_version, size_t max_output_len)
+      : compression_type_(compression_type),
+        opts_(opts),
+        compress_format_version_(compress_format_version),
+        max_output_len_(max_output_len) {}
+  virtual ~StreamingCompress() = default;
+  // Compress should be called repeatedly with the same input until the method
+  // returns 0.
+  // Parameters:
+  // input - buffer to compress
+  // input_size - size of the input buffer
+  // output - compressed buffer allocated by the caller, should be at least
+  // max_output_len
+  // output_pos - number of bytes written to the output buffer
+  // Returns -1 on error; otherwise, the number of input bytes that still
+  // need to be compressed.
+  virtual int Compress(const char* input, size_t input_size, char* output,
+                       size_t* output_pos) = 0;
+  // Static factory method that creates an object of a class derived from
+  // StreamingCompress for the given compression type.
+  static StreamingCompress* Create(CompressionType compression_type,
+                                   const CompressionOptions& opts,
+                                   uint32_t compress_format_version,
+                                   size_t max_output_len);
+  virtual void Reset() = 0;
+
+ protected:
+  const CompressionType compression_type_;
+  const CompressionOptions opts_;
+  const uint32_t compress_format_version_;
+  const size_t max_output_len_;
+};
+
+// Base class to uncompress a stream of compressed buffers.
+// Instantiate an implementation of the class using Create() with the
+// compression type and use Uncompress() repeatedly.
+// The output buffer needs to be at least max_output_len.
+// Call Reset() in between frame boundaries or in case of an error.
+// NOTE: This class is not thread safe.
+class StreamingUncompress {
+ public:
+  StreamingUncompress(CompressionType compression_type,
+                      uint32_t compress_format_version, size_t max_output_len)
+      : compression_type_(compression_type),
+        compress_format_version_(compress_format_version),
+        max_output_len_(max_output_len) {}
+  virtual ~StreamingUncompress() = default;
+  // Uncompress can be called repeatedly to progressively process the same
+  // input buffer, or can be called with a new input buffer. When the input
+  // buffer is not fully consumed, the return value is > 0 or output_pos
+  // == max_output_len. When calling Uncompress to continue processing the
+  // same input buffer, the input argument should be nullptr.
+  // Parameters:
+  // input - buffer to uncompress
+  // input_size - size of the input buffer
+  // output - uncompressed buffer allocated by the caller, should be at least
+  // max_output_len
+  // output_pos - number of bytes written to the output buffer
+  // Returns -1 on error; otherwise, the number of input bytes still to be
+  // processed.
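+  // Illustrative driver loop for a concrete implementation (sketch only;
+  // `uncompress`, `in`, `in_size`, and `out` are hypothetical caller
+  // variables):
+  //   size_t out_pos = 0;
+  //   int remaining = uncompress->Uncompress(in, in_size, out, &out_pos);
+  //   while (remaining > 0 || out_pos == max_output_len) {
+  //     /* consume out[0..out_pos) */
+  //     out_pos = 0;
+  //     remaining = uncompress->Uncompress(nullptr, 0, out, &out_pos);
+  //   }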
+ virtual int Uncompress(const char* input, size_t input_size, char* output, + size_t* output_pos) = 0; + static StreamingUncompress* Create(CompressionType compression_type, + uint32_t compress_format_version, + size_t max_output_len); + virtual void Reset() = 0; + + protected: + CompressionType compression_type_; + uint32_t compress_format_version_; + size_t max_output_len_; +}; + +class ZSTDStreamingCompress final : public StreamingCompress { + public: + explicit ZSTDStreamingCompress(const CompressionOptions& opts, + uint32_t compress_format_version, + size_t max_output_len) + : StreamingCompress(kZSTD, opts, compress_format_version, + max_output_len) { +#ifdef ZSTD_STREAMING + cctx_ = ZSTD_createCCtx(); + // Each compressed frame will have a checksum + ZSTD_CCtx_setParameter(cctx_, ZSTD_c_checksumFlag, 1); + assert(cctx_ != nullptr); + input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0}; +#endif + } + ~ZSTDStreamingCompress() override { +#ifdef ZSTD_STREAMING + ZSTD_freeCCtx(cctx_); +#endif + } + int Compress(const char* input, size_t input_size, char* output, + size_t* output_pos) override; + void Reset() override; +#ifdef ZSTD_STREAMING + ZSTD_CCtx* cctx_; + ZSTD_inBuffer input_buffer_; +#endif +}; + +class ZSTDStreamingUncompress final : public StreamingUncompress { + public: + explicit ZSTDStreamingUncompress(uint32_t compress_format_version, + size_t max_output_len) + : StreamingUncompress(kZSTD, compress_format_version, max_output_len) { +#ifdef ZSTD_STREAMING + dctx_ = ZSTD_createDCtx(); + assert(dctx_ != nullptr); + input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0}; +#endif + } + ~ZSTDStreamingUncompress() override { +#ifdef ZSTD_STREAMING + ZSTD_freeDCtx(dctx_); +#endif + } + int Uncompress(const char* input, size_t input_size, char* output, + size_t* output_size) override; + void Reset() override; + + private: +#ifdef ZSTD_STREAMING + ZSTD_DCtx* dctx_; + ZSTD_inBuffer input_buffer_; +#endif +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression_context_cache.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression_context_cache.cc new file mode 100644 index 0000000000..52c3fac72a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression_context_cache.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// + +#include "util/compression_context_cache.h" + +#include + +#include "util/compression.h" +#include "util/core_local.h" + +namespace ROCKSDB_NAMESPACE { +namespace compression_cache { + +void* const SentinelValue = nullptr; +// Cache ZSTD uncompression contexts for reads +// if needed we can add ZSTD compression context caching +// which is currently is not done since BlockBasedTableBuilder +// simply creates one compression context per new SST file. 
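+//
+// Borrow/return protocol of the struct below (illustrative summary, not
+// upstream documentation): while the cached instance is free, the sentinel
+// atomic holds &uncomp_cached_data_; a reader borrows it by CAS-ing the
+// sentinel to nullptr (SentinelValue), and returns it by exchange()-ing the
+// pointer back. A thread that loses the race simply creates a one-time-use
+// context on the heap instead of blocking.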
+struct ZSTDCachedData {
+  // We choose to cache the structure below instead of a pointer because we
+  // want to a) avoid leaking native types and b) keep cache use transparent
+  // to the user
+  ZSTDUncompressCachedData uncomp_cached_data_;
+  std::atomic<void*> zstd_uncomp_sentinel_;
+
+  char
+      padding[(CACHE_LINE_SIZE -
+               (sizeof(ZSTDUncompressCachedData) +
+                sizeof(std::atomic<void*>)) %
+                   CACHE_LINE_SIZE)];  // unused padding field
+
+  ZSTDCachedData() : zstd_uncomp_sentinel_(&uncomp_cached_data_) {}
+  ZSTDCachedData(const ZSTDCachedData&) = delete;
+  ZSTDCachedData& operator=(const ZSTDCachedData&) = delete;
+
+  ZSTDUncompressCachedData GetUncompressData(int64_t idx) {
+    ZSTDUncompressCachedData result;
+    void* expected = &uncomp_cached_data_;
+    if (zstd_uncomp_sentinel_.compare_exchange_strong(expected,
+                                                      SentinelValue)) {
+      uncomp_cached_data_.CreateIfNeeded();
+      result.InitFromCache(uncomp_cached_data_, idx);
+    } else {
+      // Creates one-time-use data
+      result.CreateIfNeeded();
+    }
+    return result;
+  }
+  // Return the entry back into circulation.
+  // This is executed only when we successfully obtained the cached entry
+  // in the first place.
+  void ReturnUncompressData() {
+    if (zstd_uncomp_sentinel_.exchange(&uncomp_cached_data_) !=
+        SentinelValue) {
+      // Means we are returning while not having it acquired.
+      assert(false);
+    }
+  }
+};
+static_assert(sizeof(ZSTDCachedData) % CACHE_LINE_SIZE == 0,
+              "Expected CACHE_LINE_SIZE alignment");
+}  // namespace compression_cache
+
+class CompressionContextCache::Rep {
+ public:
+  Rep() {}
+  ZSTDUncompressCachedData GetZSTDUncompressData() {
+    auto p = per_core_uncompr_.AccessElementAndIndex();
+    int64_t idx = static_cast<int64_t>(p.second);
+    return p.first->GetUncompressData(idx);
+  }
+  void ReturnZSTDUncompressData(int64_t idx) {
+    assert(idx >= 0);
+    auto* cn = per_core_uncompr_.AccessAtCore(static_cast<size_t>(idx));
+    cn->ReturnUncompressData();
+  }
+
+ private:
+  CoreLocalArray<compression_cache::ZSTDCachedData> per_core_uncompr_;
+};
+
+CompressionContextCache::CompressionContextCache() : rep_(new Rep()) {}
+
+CompressionContextCache* CompressionContextCache::Instance() {
+  static CompressionContextCache instance;
+  return &instance;
+}
+
+void CompressionContextCache::InitSingleton() { Instance(); }
+
+ZSTDUncompressCachedData
+CompressionContextCache::GetCachedZSTDUncompressData() {
+  return rep_->GetZSTDUncompressData();
+}
+
+void CompressionContextCache::ReturnCachedZSTDUncompressData(int64_t idx) {
+  rep_->ReturnZSTDUncompressData(idx);
+}
+
+CompressionContextCache::~CompressionContextCache() { delete rep_; }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression_context_cache.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression_context_cache.h
new file mode 100644
index 0000000000..7b7b2d507a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/compression_context_cache.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+// The compression context cache allows caching compression/uncompression
+// contexts. This helps with random-read latencies and reduces CPU
+// utilization. Caching is implemented using the CoreLocal facility:
+// compression/uncompression instances are cached on a per-core basis using
+// CoreLocalArray. A borrowed instance is atomically replaced with a sentinel
+// value for the time it is in use. If it turns out that another thread is
+// already making use of the instance, we still create one on the heap, which
+// is destroyed later.
+
+#pragma once
+
+#include <stdint.h>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class ZSTDUncompressCachedData;
+
+class CompressionContextCache {
+ public:
+  // Singleton
+  static CompressionContextCache* Instance();
+  static void InitSingleton();
+  CompressionContextCache(const CompressionContextCache&) = delete;
+  CompressionContextCache& operator=(const CompressionContextCache&) = delete;
+
+  ZSTDUncompressCachedData GetCachedZSTDUncompressData();
+  void ReturnCachedZSTDUncompressData(int64_t idx);
+
+ private:
+  // Singleton
+  CompressionContextCache();
+  ~CompressionContextCache();
+
+  class Rep;
+  Rep* rep_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/concurrent_task_limiter_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/concurrent_task_limiter_impl.cc
new file mode 100644
index 0000000000..a0fc7331f5
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/concurrent_task_limiter_impl.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/concurrent_task_limiter_impl.h"
+
+#include "rocksdb/concurrent_task_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ConcurrentTaskLimiterImpl::ConcurrentTaskLimiterImpl(
+    const std::string& name, int32_t max_outstanding_task)
+    : name_(name),
+      max_outstanding_tasks_{max_outstanding_task},
+      outstanding_tasks_{0} {}
+
+ConcurrentTaskLimiterImpl::~ConcurrentTaskLimiterImpl() {
+  assert(outstanding_tasks_ == 0);
+}
+
+const std::string& ConcurrentTaskLimiterImpl::GetName() const { return name_; }
+
+void ConcurrentTaskLimiterImpl::SetMaxOutstandingTask(int32_t limit) {
+  max_outstanding_tasks_.store(limit, std::memory_order_relaxed);
+}
+
+void ConcurrentTaskLimiterImpl::ResetMaxOutstandingTask() {
+  max_outstanding_tasks_.store(-1, std::memory_order_relaxed);
+}
+
+int32_t ConcurrentTaskLimiterImpl::GetOutstandingTask() const {
+  return outstanding_tasks_.load(std::memory_order_relaxed);
+}
+
+std::unique_ptr<TaskLimiterToken> ConcurrentTaskLimiterImpl::GetToken(
+    bool force) {
+  int32_t limit = max_outstanding_tasks_.load(std::memory_order_relaxed);
+  int32_t tasks = outstanding_tasks_.load(std::memory_order_relaxed);
+  // force = true, bypass the throttle.
+  // limit < 0 means unlimited tasks.
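+  // (Illustrative note: on failure, compare_exchange_weak reloads `tasks`
+  // with the current value, so each retry below re-evaluates the limit
+  // check against fresh state before attempting the increment again.)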
+  while (force || limit < 0 || tasks < limit) {
+    if (outstanding_tasks_.compare_exchange_weak(tasks, tasks + 1)) {
+      return std::unique_ptr<TaskLimiterToken>(new TaskLimiterToken(this));
+    }
+  }
+  return nullptr;
+}
+
+ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name,
+                                                int32_t limit) {
+  return new ConcurrentTaskLimiterImpl(name, limit);
+}
+
+TaskLimiterToken::~TaskLimiterToken() {
+  --limiter_->outstanding_tasks_;
+  assert(limiter_->outstanding_tasks_ >= 0);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/concurrent_task_limiter_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/concurrent_task_limiter_impl.h
new file mode 100644
index 0000000000..4952ae23aa
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/concurrent_task_limiter_impl.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/concurrent_task_limiter.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TaskLimiterToken;
+
+class ConcurrentTaskLimiterImpl : public ConcurrentTaskLimiter {
+ public:
+  explicit ConcurrentTaskLimiterImpl(const std::string& name,
+                                     int32_t max_outstanding_task);
+  // No copying allowed
+  ConcurrentTaskLimiterImpl(const ConcurrentTaskLimiterImpl&) = delete;
+  ConcurrentTaskLimiterImpl& operator=(const ConcurrentTaskLimiterImpl&) =
+      delete;
+
+  virtual ~ConcurrentTaskLimiterImpl();
+
+  virtual const std::string& GetName() const override;
+
+  virtual void SetMaxOutstandingTask(int32_t limit) override;
+
+  virtual void ResetMaxOutstandingTask() override;
+
+  virtual int32_t GetOutstandingTask() const override;
+
+  // Request token for adding a new task.
+  // If force == true, it requests a token bypassing throttle.
+  // Returns nullptr if it got throttled.
+  virtual std::unique_ptr<TaskLimiterToken> GetToken(bool force);
+
+ private:
+  friend class TaskLimiterToken;
+
+  std::string name_;
+  std::atomic<int32_t> max_outstanding_tasks_;
+  std::atomic<int32_t> outstanding_tasks_;
+};
+
+class TaskLimiterToken {
+ public:
+  explicit TaskLimiterToken(ConcurrentTaskLimiterImpl* limiter)
+      : limiter_(limiter) {}
+  ~TaskLimiterToken();
+
+ private:
+  ConcurrentTaskLimiterImpl* limiter_;
+
+  // no copying allowed
+  TaskLimiterToken(const TaskLimiterToken&) = delete;
+  void operator=(const TaskLimiterToken&) = delete;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/core_local.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/core_local.h
new file mode 100644
index 0000000000..25174aef84
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/core_local.h
@@ -0,0 +1,84 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
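+//
+// Usage sketch for the class defined below (illustrative only, not part of
+// the original header; `counters` is a hypothetical variable):
+//   CoreLocalArray<std::atomic<uint64_t>> counters;
+//   counters.Access()->fetch_add(1, std::memory_order_relaxed);
+//   uint64_t total = 0;
+//   for (size_t i = 0; i < counters.Size(); ++i) {
+//     total += counters.AccessAtCore(i)->load(std::memory_order_relaxed);
+//   }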
+
+#pragma once
+
+#include <cstddef>
+#include <memory>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "port/likely.h"
+#include "port/port.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An array of core-local values. Ideally the value type, T, is cache aligned
+// to prevent false sharing.
+template <typename T>
+class CoreLocalArray {
+ public:
+  CoreLocalArray();
+
+  size_t Size() const;
+  // returns pointer to the element corresponding to the core that the thread
+  // currently runs on.
+  T* Access() const;
+  // same as above, but also returns the core index, which the client can cache
+  // to reduce how often core ID needs to be retrieved. Only do this if some
+  // inaccuracy is tolerable, as the thread may migrate to a different core.
+  std::pair<T*, size_t> AccessElementAndIndex() const;
+  // returns pointer to element for the specified core index. This can be used,
+  // e.g., for aggregation, or if the client caches core index.
+  T* AccessAtCore(size_t core_idx) const;
+
+ private:
+  std::unique_ptr<T[]> data_;
+  int size_shift_;
+};
+
+template <typename T>
+CoreLocalArray<T>::CoreLocalArray() {
+  int num_cpus = static_cast<int>(std::thread::hardware_concurrency());
+  // find a power of two >= num_cpus and >= 8
+  size_shift_ = 3;
+  while (1 << size_shift_ < num_cpus) {
+    ++size_shift_;
+  }
+  data_.reset(new T[static_cast<size_t>(1) << size_shift_]);
+}
+
+template <typename T>
+size_t CoreLocalArray<T>::Size() const {
+  return static_cast<size_t>(1) << size_shift_;
+}
+
+template <typename T>
+T* CoreLocalArray<T>::Access() const {
+  return AccessElementAndIndex().first;
+}
+
+template <typename T>
+std::pair<T*, size_t> CoreLocalArray<T>::AccessElementAndIndex() const {
+  int cpuid = port::PhysicalCoreID();
+  size_t core_idx;
+  if (UNLIKELY(cpuid < 0)) {
+    // cpu id unavailable, just pick randomly
+    core_idx = Random::GetTLSInstance()->Uniform(1 << size_shift_);
+  } else {
+    core_idx = static_cast<size_t>(cpuid & ((1 << size_shift_) - 1));
+  }
+  return {AccessAtCore(core_idx), core_idx};
+}
+
+template <typename T>
+T* CoreLocalArray<T>::AccessAtCore(size_t core_idx) const {
+  assert(core_idx < static_cast<size_t>(1) << size_shift_);
+  return &data_[core_idx];
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coro_utils.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coro_utils.h
new file mode 100644
index 0000000000..5b4211135e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/coro_utils.h
@@ -0,0 +1,112 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#if defined(USE_COROUTINES)
+#include "folly/experimental/coro/Coroutine.h"
+#include "folly/experimental/coro/Task.h"
+#endif
+#include "rocksdb/rocksdb_namespace.h"
+
+// This file has two sections. The first section applies to all instances of
+// header file inclusion and has an include guard. The second section is
+// meant for multiple inclusions in the same source file, and is idempotent.
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef UTIL_CORO_UTILS_H_
+#define UTIL_CORO_UTILS_H_
+
+#if defined(USE_COROUTINES)
+
+// The following macros expand to regular and coroutine function
+// declarations for a given function
+#define DECLARE_SYNC_AND_ASYNC(__ret_type__, __func_name__, ...) \
+  __ret_type__ __func_name__(__VA_ARGS__);                       \
+  folly::coro::Task<__ret_type__> __func_name__##Coroutine(__VA_ARGS__);
+
+#define DECLARE_SYNC_AND_ASYNC_OVERRIDE(__ret_type__, __func_name__, ...) 
\ + __ret_type__ __func_name__(__VA_ARGS__) override; \ + folly::coro::Task<__ret_type__> __func_name__##Coroutine(__VA_ARGS__) \ + override; + +#define DECLARE_SYNC_AND_ASYNC_CONST(__ret_type__, __func_name__, ...) \ + __ret_type__ __func_name__(__VA_ARGS__) const; \ + folly::coro::Task<__ret_type__> __func_name__##Coroutine(__VA_ARGS__) const; + +constexpr bool using_coroutines() { return true; } +#else // !USE_COROUTINES + +// The follwoing macros expand to a regular function declaration for a given +// function +#define DECLARE_SYNC_AND_ASYNC(__ret_type__, __func_name__, ...) \ + __ret_type__ __func_name__(__VA_ARGS__); + +#define DECLARE_SYNC_AND_ASYNC_OVERRIDE(__ret_type__, __func_name__, ...) \ + __ret_type__ __func_name__(__VA_ARGS__) override; + +#define DECLARE_SYNC_AND_ASYNC_CONST(__ret_type__, __func_name__, ...) \ + __ret_type__ __func_name__(__VA_ARGS__) const; + +constexpr bool using_coroutines() { return false; } +#endif // USE_COROUTINES +#endif // UTIL_CORO_UTILS_H_ + +// The following section of the file is meant to be included twice in a +// source file - once defining WITH_COROUTINES and once defining +// WITHOUT_COROUTINES +#undef DEFINE_SYNC_AND_ASYNC +#undef CO_AWAIT +#undef CO_RETURN + +#if defined(WITH_COROUTINES) && defined(USE_COROUTINES) + +// This macro should be used in the beginning of the function +// definition. The declaration should have been done using one of the +// DECLARE_SYNC_AND_ASYNC* macros. It expands to the return type and +// the function name with the Coroutine suffix. For example - +// DEFINE_SYNC_AND_ASYNC(int, foo)(bool bar) {} +// would expand to - +// folly::coro::Task fooCoroutine(bool bar) {} +#define DEFINE_SYNC_AND_ASYNC(__ret_type__, __func_name__) \ + folly::coro::Task<__ret_type__> __func_name__##Coroutine + +// This macro should be used to call a function that might be a +// coroutine. It expands to the correct function name and prefixes +// the co_await operator if necessary. For example - +// s = CO_AWAIT(foo)(true); +// if the code is compiled WITH_COROUTINES, would expand to +// s = co_await fooCoroutine(true); +// if compiled WITHOUT_COROUTINES, would expand to +// s = foo(true); +#define CO_AWAIT(__func_name__) co_await __func_name__##Coroutine + +#define CO_RETURN co_return + +#elif defined(WITHOUT_COROUTINES) + +// This macro should be used in the beginning of the function +// definition. The declaration should have been done using one of the +// DECLARE_SYNC_AND_ASYNC* macros. It expands to the return type and +// the function name without the Coroutine suffix. For example - +// DEFINE_SYNC_AND_ASYNC(int, foo)(bool bar) {} +// would expand to - +// int foo(bool bar) {} +#define DEFINE_SYNC_AND_ASYNC(__ret_type__, __func_name__) \ + __ret_type__ __func_name__ + +// This macro should be used to call a function that might be a +// coroutine. It expands to the correct function name and prefixes +// the co_await operator if necessary. 
For example - +// s = CO_AWAIT(foo)(true); +// if the code is compiled WITH_COROUTINES, would expand to +// s = co_await fooCoroutine(true); +// if compiled WITHOUT_COROUTINES, would expand to +// s = foo(true); +#define CO_AWAIT(__func_name__) __func_name__ + +#define CO_RETURN return + +#endif // DO_NOT_USE_COROUTINES +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c.cc new file mode 100644 index 0000000000..d4cd78b52e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c.cc @@ -0,0 +1,1292 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A portable implementation of crc32c, optimized to handle +// four bytes at a time. +#include "util/crc32c.h" + +#include + +#include +#include + +#include "port/lang.h" +#include "util/coding.h" +#include "util/crc32c_arm64.h" +#include "util/math.h" + +#ifdef __powerpc64__ +#include "util/crc32c_ppc.h" +#include "util/crc32c_ppc_constants.h" + +#if __linux__ +#ifdef ROCKSDB_AUXV_GETAUXVAL_PRESENT +#include +#endif + +#ifndef PPC_FEATURE2_VEC_CRYPTO +#define PPC_FEATURE2_VEC_CRYPTO 0x02000000 +#endif + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#elif __FreeBSD__ +#include +#include +#include +#endif /* __linux__ */ + +#endif + +ASSERT_FEATURE_COMPAT_HEADER(); + +#ifdef __SSE4_2__ +#include +#include +#endif + +#if defined(HAVE_ARM64_CRC) +bool pmull_runtime_flag = false; +#endif + +namespace ROCKSDB_NAMESPACE { +namespace crc32c { + +#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) +#ifdef __powerpc64__ +static int arch_ppc_crc32 = 0; +#endif /* __powerpc64__ */ +#endif + +static const uint32_t table0_[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, + 0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c, + 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, + 0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512, + 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, + 0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf, + 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, + 0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f, + 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, + 0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 
0x7198540d, 0x83f3d70e, + 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, + 0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4, + 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, + 0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5, + 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, + 0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905, + 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, + 0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8, + 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, + 0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6, + 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, + 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351}; +#ifndef __SSE4_2__ +static const uint32_t table1_[256] = { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, 0x4e8a61dc, 0x5d28f9ab, + 0x69cf5132, 0x7a6dc945, 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, 0x3fc5f181, 0x2c6769f6, + 0x1880c16f, 0x0b225918, 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, 0xec5b53e5, 0xfff9cb92, + 0xcb1e630b, 0xd8bcfb7c, 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, 0xe29f20ba, 0xf13db8cd, + 0xc5da1054, 0xd6788823, 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, 0x0ec4735f, 0x1d66eb28, + 0x298143b1, 0x3a23dbc6, 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, 0xff17c604, 0xecb55e73, + 0xd852f6ea, 0xcbf06e9d, 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, 0x2c896460, 0x3f2bfc17, + 0x0bcc548e, 0x186eccf9, 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, 0x5dc6f43d, 0x4e646c4a, + 0x7a83c4d3, 0x69215ca4, 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, 0xce1644da, 0xddb4dcad, + 0xe9537434, 0xfaf1ec43, 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, 0xbf59d487, 0xacfb4cf0, + 0x981ce469, 0x8bbe7c1e, 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, 0x6cc776e3, 0x7f65ee94, + 0x4b82460d, 0x5820de7a, 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, 0x66d73941, 0x7575a136, + 0x419209af, 
0x523091d8, 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, 0x8a8c6aa4, 0x992ef2d3, + 0xadc95a4a, 0xbe6bc23d, 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, 0x844819fb, 0x97ea818c, + 0xa30d2915, 0xb0afb162, 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, 0x57d6bb9f, 0x447423e8, + 0x70938b71, 0x63311306, 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, 0x26992bc2, 0x353bb3b5, + 0x01dc1b2c, 0x127e835b, 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, 0x4a5e5d21, 0x59fcc556, + 0x6d1b6dcf, 0x7eb9f5b8, 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, 0x3b11cd7c, 0x28b3550b, + 0x1c54fd92, 0x0ff665e5, 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, 0xe88f6f18, 0xfb2df76f, + 0xcfca5ff6, 0xdc68c781, 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, 0xe64b1c47, 0xf5e98430, + 0xc10e2ca9, 0xd2acb4de, 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, 0x0a104fa2, 0x19b2d7d5, + 0x2d557f4c, 0x3ef7e73b, 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, + 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483}; +static const uint32_t table2_[256] = { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, 0x9edea41a, 0x3b9f3664, + 0xd1b1f617, 0x74f06469, 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, 0x70a27d8a, 0xd5e3eff4, + 0x3fcd2f87, 0x9a8cbdf9, 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, 0xd62de755, 0x736c752b, + 0x9942b558, 0x3c032726, 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, 0xd915c5d1, 0x7c5457af, + 0x967a97dc, 0x333b05a2, 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, 0x0f382284, 0xaa79b0fa, + 0x40577089, 0xe516e2f7, 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, 0xc76580d9, 0x622412a7, + 0x880ad2d4, 0x2d4b40aa, 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, 0x61ea1a06, 0xc4ab8878, + 0x2e85480b, 0x8bc4da75, 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, 0x8f96c396, 0x2ad751e8, + 0xc0f9919b, 0x65b803e5, 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, 0xb8ffdfd7, 0x1dbe4da9, + 0xf7908dda, 0x52d11fa4, 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, 0x56830647, 0xf3c29439, + 0x19ec544a, 0xbcadc634, 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, 0xf00c9c98, 0x554d0ee6, + 0xbf63ce95, 0x1a225ceb, 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, 0xb3764986, 0x1637dbf8, + 0xfc191b8b, 0x595889f5, 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, 0x655baed3, 0xc01a3cad, + 0x2a34fcde, 0x8f756ea0, 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, 0x6a638c57, 0xcf221e29, + 0x250cde5a, 0x804d4c24, 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, 
+ 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, 0xccec1688, 0x69ad84f6, + 0x83834485, 0x26c2d6fb, 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, 0x2290cf18, 0x87d15d66, + 0x6dff9d15, 0xc8be0f6b, 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, 0xd29c5380, 0x77ddc1fe, + 0x9df3018d, 0x38b293f3, 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, 0x3ce08a10, 0x99a1186e, + 0x738fd81d, 0xd6ce4a63, 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, 0x9a6f10cf, 0x3f2e82b1, + 0xd50042c2, 0x7041d0bc, 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, 0x9557324b, 0x3016a035, + 0xda386046, 0x7f79f238, 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, 0x437ad51e, 0xe63b4760, + 0x0c158713, 0xa954156d, 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8}; +static const uint32_t table3_[256] = { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, 0x7b2231f3, 0xa6679b4b, + 0xc4451272, 0x1900b8ca, 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, 0xe964b13d, 0x34211b85, + 0x560392bc, 0x8b463804, 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, 0x6402e328, 0xb9474990, + 0xdb65c0a9, 0x06206a11, 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, 0x2161776d, 0xfc24ddd5, + 0x9e0654ec, 0x4343fe54, 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, 0x45639445, 0x98263efd, + 0xfa04b7c4, 0x27411d7c, 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, 0xaba65fe7, 0x76e3f55f, + 0x14c17c66, 0xc984d6de, 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, 0x26c00df2, 0xfb85a74a, + 0x99a72e73, 0x44e284cb, 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, 0xb4868d3c, 0x69c32784, + 0x0be1aebd, 0xd6a40405, 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, 0x07a17a9f, 0xdae4d027, + 0xb8c6591e, 0x6583f3a6, 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, 0x95e7fa51, 0x48a250e9, + 0x2a80d9d0, 0xf7c57368, 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, 0x1881a844, 0xc5c402fc, + 0xa7e68bc5, 0x7aa3217d, 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, 0xa4e4aad9, 0x79a10061, + 0x1b838958, 0xc6c623e0, 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, 0xc0e649f1, 0x1da3e349, + 0x7f816a70, 0xa2c4c0c8, 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, 0x8585ddb4, 0x58c0770c, + 0x3ae2fe35, 0xe7a7548d, 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, 0x08e38fa1, 0xd5a62519, + 0xb784ac20, 0x6ac10698, 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, 0x9aa50f6f, 0x47e0a5d7, + 0x25c22cee, 0xf8878656, 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, 
0x8224a72b, 0x5f610d93,
+    0x3d4384aa, 0xe0062e12, 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07,
+    0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, 0x106227e5, 0xcd278d5d,
+    0xaf050464, 0x7240aedc, 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f,
+    0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, 0x9d0475f0, 0x4041df48,
+    0x22635671, 0xff26fcc9, 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a,
+    0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, 0xd867e1b5, 0x05224b0d,
+    0x6700c234, 0xba45688c, 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f,
+    0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, 0xbc65029d, 0x6120a825,
+    0x0302211c, 0xde478ba4, 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1,
+    0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842};
+
+// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
+static inline uint32_t LE_LOAD32(const uint8_t* p) {
+  return DecodeFixed32(reinterpret_cast<const char*>(p));
+}
+#endif  // !__SSE4_2__
+
+static inline void DefaultCRC32(uint64_t* l, uint8_t const** p) {
+#ifndef __SSE4_2__
+  uint32_t c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p));
+  *p += 4;
+  *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^
+       table1_[(c >> 16) & 0xff] ^ table0_[c >> 24];
+  // Do it twice.
+  c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p));
+  *p += 4;
+  *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^
+       table1_[(c >> 16) & 0xff] ^ table0_[c >> 24];
+#elif defined(__LP64__) || defined(_WIN64)
+  *l = _mm_crc32_u64(*l, DecodeFixed64(reinterpret_cast<const char*>(*p)));
+  *p += 8;
+#else
+  *l = _mm_crc32_u32(static_cast<uint32_t>(*l), LE_LOAD32(*p));
+  *p += 4;
+  *l = _mm_crc32_u32(static_cast<uint32_t>(*l), LE_LOAD32(*p));
+  *p += 4;
+#endif
+}
+
+template <void (*CRC32)(uint64_t*, uint8_t const**)>
+uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
+  const uint8_t* p = reinterpret_cast<const uint8_t*>(buf);
+  const uint8_t* e = p + size;
+  uint64_t l = crc ^ 0xffffffffu;
+
+// Align n to (1 << m) byte boundary
+#define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1))
+
+#define STEP1                    \
+  do {                           \
+    int c = (l & 0xff) ^ *p++;   \
+    l = table0_[c] ^ (l >> 8);   \
+  } while (0)
+
+  // Point x at first 16-byte aligned byte in string. This might be
+  // just past the end of the string.
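+  // (For example, with m == 4 the ALIGN macro rounds up to the next 16-byte
+  // boundary: ALIGN(0x1003, 4) == (0x1003 + 15) & ~15 == 0x1010, while an
+  // already-aligned value such as 0x1010 is left unchanged.)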
+  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+  const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4));
+  if (x <= e) {
+    // Process bytes until finished or p is 16-byte aligned
+    while (p != x) {
+      STEP1;
+    }
+  }
+  // Process bytes 16 at a time
+  while ((e - p) >= 16) {
+    CRC32(&l, &p);
+    CRC32(&l, &p);
+  }
+  // Process bytes 8 at a time
+  while ((e - p) >= 8) {
+    CRC32(&l, &p);
+  }
+  // Process the last few bytes
+  while (p != e) {
+    STEP1;
+  }
+#undef STEP1
+#undef ALIGN
+  return static_cast<uint32_t>(l ^ 0xffffffffu);
+}
+
+using Function = uint32_t (*)(uint32_t, const char*, size_t);
+
+#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
+uint32_t ExtendPPCImpl(uint32_t crc, const char* buf, size_t size) {
+  return crc32c_ppc(crc, (const unsigned char*)buf, size);
+}
+
+#if __linux__
+static int arch_ppc_probe(void) {
+  arch_ppc_crc32 = 0;
+
+#if defined(__powerpc64__) && defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
+  if (getauxval(AT_HWCAP2) & PPC_FEATURE2_VEC_CRYPTO) arch_ppc_crc32 = 1;
+#endif /* __powerpc64__ */
+
+  return arch_ppc_crc32;
+}
+#elif __FreeBSD__
+static int arch_ppc_probe(void) {
+  unsigned long cpufeatures;
+  arch_ppc_crc32 = 0;
+
+#if defined(__powerpc64__)
+  elf_aux_info(AT_HWCAP2, &cpufeatures, sizeof(cpufeatures));
+  if (cpufeatures & PPC_FEATURE2_HAS_VEC_CRYPTO) arch_ppc_crc32 = 1;
+#endif /* __powerpc64__ */
+
+  return arch_ppc_crc32;
+}
+#endif  // __linux__
+
+static bool isAltiVec() {
+  if (arch_ppc_probe()) {
+    return true;
+  } else {
+    return false;
+  }
+}
+#endif
+
+#if defined(HAVE_ARM64_CRC)
+uint32_t ExtendARMImpl(uint32_t crc, const char* buf, size_t size) {
+  return crc32c_arm64(crc, (const unsigned char*)buf, size);
+}
+#endif
+
+std::string IsFastCrc32Supported() {
+  bool has_fast_crc = false;
+  std::string fast_zero_msg;
+  std::string arch;
+#ifdef HAVE_POWER8
+#ifdef HAS_ALTIVEC
+  if (arch_ppc_probe()) {
+    has_fast_crc = true;
+    arch = "PPC";
+  }
+#else
+  has_fast_crc = false;
+  arch = "PPC";
+#endif
+#elif defined(HAVE_ARM64_CRC)
+  if (crc32c_runtime_check()) {
+    has_fast_crc = true;
+    arch = "Arm64";
+    pmull_runtime_flag = crc32c_pmull_runtime_check();
+  } else {
+    has_fast_crc = false;
+    arch = "Arm64";
+  }
+#else
+#ifdef __SSE4_2__
+  has_fast_crc = true;
+#endif  // __SSE4_2__
+  arch = "x86";
+#endif
+  if (has_fast_crc) {
+    fast_zero_msg.append("Supported on " + arch);
+  } else {
+    fast_zero_msg.append("Not supported on " + arch);
+  }
+  return fast_zero_msg;
+}
+
+/*
+ * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the author be held liable for any damages
+ * arising from the use of this software.
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ * 1. The origin of this software must not be misrepresented; you must not
+ *   claim that you wrote the original software. If you use this software
+ *   in a product, an acknowledgment in the product documentation would be
+ *   appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *   misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ * Ferry Toth + * ftoth@exalondelft.nl + * + * https://github.com/htot/crc32c + * + * Modified by Facebook + * + * Original intel whitepaper: + * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction" + * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * + * This version is from the folly library, created by Dave Watson + * + * + */ +#if defined(__SSE4_2__) && defined(__PCLMUL__) + +#define CRCtriplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \ + crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset)); + +#define CRCduplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); + +#define CRCsinglet(crc, buf, offset) \ + crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset)); + +// Numbers taken directly from intel whitepaper. +// clang-format off +const uint64_t clmul_constants[] = { + 0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6, + 0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e, + 0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da, + 0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8, + 0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296, + 0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2, + 0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6, + 0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092, + 0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0, + 0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456, + 0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e, + 0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a, + 0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574, + 0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832, + 0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124, + 0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86, + 0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e, + 0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a, + 0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46, + 0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a, + 0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a, + 0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4, + 0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56, + 0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2, + 0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c, + 0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac, + 0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64, + 0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e, + 0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c, + 0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28, + 0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26, + 0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c, + 0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c, + 0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c, + 0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4, + 0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844, + 0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c, + 0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730, + 0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c, + 0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2, + 0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2, + 0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e, + 0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a, + 0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a, + 0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a, + 0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768, + 0x1ed5a407a, 0x0e78eb416, 
0x0d2c3ed1a, 0x13c2b89c4,
+    0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c,
+    0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba,
+    0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312,
+    0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544,
+    0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a,
+    0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e,
+    0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a,
+    0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c,
+    0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a,
+    0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6,
+    0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca,
+    0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888,
+    0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e,
+    0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528,
+    0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a,
+    0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e,
+    0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa,
+};
+
+// Compute the crc32c value for buffers smaller than 8 bytes
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+inline void align_to_8(
+    size_t len,
+    uint64_t& crc0,                // crc so far, updated on return
+    const unsigned char*& next) {  // next data pointer, updated on return
+  uint32_t crc32bit = static_cast<uint32_t>(crc0);
+  if (len & 0x04) {
+    crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next);
+    next += sizeof(uint32_t);
+  }
+  if (len & 0x02) {
+    crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*)next);
+    next += sizeof(uint16_t);
+  }
+  if (len & 0x01) {
+    crc32bit = _mm_crc32_u8(crc32bit, *(next));
+    next++;
+  }
+  crc0 = crc32bit;
+}
+
+//
+// CombineCRC performs pclmulqdq multiplication of 2 partial CRCs and a
+// well-chosen constant and xors these with the remaining CRC.
+//
+inline uint64_t CombineCRC(
+    size_t block_size,
+    uint64_t crc0,
+    uint64_t crc1,
+    uint64_t crc2,
+    const uint64_t* next2) {
+  const auto multiplier =
+      *(reinterpret_cast<const __m128i*>(clmul_constants) + block_size - 1);
+  const auto crc0_xmm = _mm_set_epi64x(0, crc0);
+  const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00);
+  const auto crc1_xmm = _mm_set_epi64x(0, crc1);
+  const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10);
+  const auto res = _mm_xor_si128(res0, res1);
+  crc0 = _mm_cvtsi128_si64(res);
+  crc0 = crc0 ^ *((uint64_t*)next2 - 1);
+  crc2 = _mm_crc32_u64(crc2, crc0);
+  return crc2;
+}
+
+// Compute CRC-32C using the Intel hardware instruction.
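+//
+// A summary of the structure below: the input is walked as three interleaved
+// streams so that three _mm_crc32_u64 operations can be in flight at once, a
+// Duff's-device switch consumes the remaining triplet count, and CombineCRC
+// folds the three partial CRCs back into a single value with carry-less
+// multiplication (_mm_clmulepi64_si128) against the precomputed
+// clmul_constants table.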
+#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) { + const unsigned char* next = (const unsigned char*)buf; + uint64_t count; + uint64_t crc0, crc1, crc2; + crc0 = crc ^ 0xffffffffu; + + + if (len >= 8) { + // if len > 216 then align and use triplets + if (len > 216) { + { + // Work on the bytes (< 8) before the first 8-byte alignment addr starts + uint64_t align_bytes = (8 - (uintptr_t)next) & 7; + len -= align_bytes; + align_to_8(align_bytes, crc0, next); + } + + // Now work on the remaining blocks + count = len / 24; // number of triplets + len %= 24; // bytes remaining + uint64_t n = count >> 7; // #blocks = first block + full blocks + uint64_t block_size = count & 127; + if (block_size == 0) { + block_size = 128; + } else { + n++; + } + // points to the first byte of the next block + const uint64_t* next0 = (uint64_t*)next + block_size; + const uint64_t* next1 = next0 + block_size; + const uint64_t* next2 = next1 + block_size; + + crc1 = crc2 = 0; + // Use Duff's device, a for() loop inside a switch() + // statement. This needs to execute at least once, round len + // down to nearest triplet multiple + switch (block_size) { + case 128: + do { + // jumps here for a full block of len 128 + CRCtriplet(crc, next, -128); + FALLTHROUGH_INTENDED; + case 127: + // jumps here or below for the first block smaller + CRCtriplet(crc, next, -127); + FALLTHROUGH_INTENDED; + case 126: + CRCtriplet(crc, next, -126); // than 128 + FALLTHROUGH_INTENDED; + case 125: + CRCtriplet(crc, next, -125); + FALLTHROUGH_INTENDED; + case 124: + CRCtriplet(crc, next, -124); + FALLTHROUGH_INTENDED; + case 123: + CRCtriplet(crc, next, -123); + FALLTHROUGH_INTENDED; + case 122: + CRCtriplet(crc, next, -122); + FALLTHROUGH_INTENDED; + case 121: + CRCtriplet(crc, next, -121); + FALLTHROUGH_INTENDED; + case 120: + CRCtriplet(crc, next, -120); + FALLTHROUGH_INTENDED; + case 119: + CRCtriplet(crc, next, -119); + FALLTHROUGH_INTENDED; + case 118: + CRCtriplet(crc, next, -118); + FALLTHROUGH_INTENDED; + case 117: + CRCtriplet(crc, next, -117); + FALLTHROUGH_INTENDED; + case 116: + CRCtriplet(crc, next, -116); + FALLTHROUGH_INTENDED; + case 115: + CRCtriplet(crc, next, -115); + FALLTHROUGH_INTENDED; + case 114: + CRCtriplet(crc, next, -114); + FALLTHROUGH_INTENDED; + case 113: + CRCtriplet(crc, next, -113); + FALLTHROUGH_INTENDED; + case 112: + CRCtriplet(crc, next, -112); + FALLTHROUGH_INTENDED; + case 111: + CRCtriplet(crc, next, -111); + FALLTHROUGH_INTENDED; + case 110: + CRCtriplet(crc, next, -110); + FALLTHROUGH_INTENDED; + case 109: + CRCtriplet(crc, next, -109); + FALLTHROUGH_INTENDED; + case 108: + CRCtriplet(crc, next, -108); + FALLTHROUGH_INTENDED; + case 107: + CRCtriplet(crc, next, -107); + FALLTHROUGH_INTENDED; + case 106: + CRCtriplet(crc, next, -106); + FALLTHROUGH_INTENDED; + case 105: + CRCtriplet(crc, next, -105); + FALLTHROUGH_INTENDED; + case 104: + CRCtriplet(crc, next, -104); + FALLTHROUGH_INTENDED; + case 103: + CRCtriplet(crc, next, -103); + FALLTHROUGH_INTENDED; + case 102: + CRCtriplet(crc, next, -102); + FALLTHROUGH_INTENDED; + case 101: + CRCtriplet(crc, next, -101); + FALLTHROUGH_INTENDED; + case 100: + CRCtriplet(crc, next, -100); + FALLTHROUGH_INTENDED; + case 99: + CRCtriplet(crc, next, -99); + FALLTHROUGH_INTENDED; + case 98: + CRCtriplet(crc, next, -98); + FALLTHROUGH_INTENDED; + case 97: + 
CRCtriplet(crc, next, -97); + FALLTHROUGH_INTENDED; + case 96: + CRCtriplet(crc, next, -96); + FALLTHROUGH_INTENDED; + case 95: + CRCtriplet(crc, next, -95); + FALLTHROUGH_INTENDED; + case 94: + CRCtriplet(crc, next, -94); + FALLTHROUGH_INTENDED; + case 93: + CRCtriplet(crc, next, -93); + FALLTHROUGH_INTENDED; + case 92: + CRCtriplet(crc, next, -92); + FALLTHROUGH_INTENDED; + case 91: + CRCtriplet(crc, next, -91); + FALLTHROUGH_INTENDED; + case 90: + CRCtriplet(crc, next, -90); + FALLTHROUGH_INTENDED; + case 89: + CRCtriplet(crc, next, -89); + FALLTHROUGH_INTENDED; + case 88: + CRCtriplet(crc, next, -88); + FALLTHROUGH_INTENDED; + case 87: + CRCtriplet(crc, next, -87); + FALLTHROUGH_INTENDED; + case 86: + CRCtriplet(crc, next, -86); + FALLTHROUGH_INTENDED; + case 85: + CRCtriplet(crc, next, -85); + FALLTHROUGH_INTENDED; + case 84: + CRCtriplet(crc, next, -84); + FALLTHROUGH_INTENDED; + case 83: + CRCtriplet(crc, next, -83); + FALLTHROUGH_INTENDED; + case 82: + CRCtriplet(crc, next, -82); + FALLTHROUGH_INTENDED; + case 81: + CRCtriplet(crc, next, -81); + FALLTHROUGH_INTENDED; + case 80: + CRCtriplet(crc, next, -80); + FALLTHROUGH_INTENDED; + case 79: + CRCtriplet(crc, next, -79); + FALLTHROUGH_INTENDED; + case 78: + CRCtriplet(crc, next, -78); + FALLTHROUGH_INTENDED; + case 77: + CRCtriplet(crc, next, -77); + FALLTHROUGH_INTENDED; + case 76: + CRCtriplet(crc, next, -76); + FALLTHROUGH_INTENDED; + case 75: + CRCtriplet(crc, next, -75); + FALLTHROUGH_INTENDED; + case 74: + CRCtriplet(crc, next, -74); + FALLTHROUGH_INTENDED; + case 73: + CRCtriplet(crc, next, -73); + FALLTHROUGH_INTENDED; + case 72: + CRCtriplet(crc, next, -72); + FALLTHROUGH_INTENDED; + case 71: + CRCtriplet(crc, next, -71); + FALLTHROUGH_INTENDED; + case 70: + CRCtriplet(crc, next, -70); + FALLTHROUGH_INTENDED; + case 69: + CRCtriplet(crc, next, -69); + FALLTHROUGH_INTENDED; + case 68: + CRCtriplet(crc, next, -68); + FALLTHROUGH_INTENDED; + case 67: + CRCtriplet(crc, next, -67); + FALLTHROUGH_INTENDED; + case 66: + CRCtriplet(crc, next, -66); + FALLTHROUGH_INTENDED; + case 65: + CRCtriplet(crc, next, -65); + FALLTHROUGH_INTENDED; + case 64: + CRCtriplet(crc, next, -64); + FALLTHROUGH_INTENDED; + case 63: + CRCtriplet(crc, next, -63); + FALLTHROUGH_INTENDED; + case 62: + CRCtriplet(crc, next, -62); + FALLTHROUGH_INTENDED; + case 61: + CRCtriplet(crc, next, -61); + FALLTHROUGH_INTENDED; + case 60: + CRCtriplet(crc, next, -60); + FALLTHROUGH_INTENDED; + case 59: + CRCtriplet(crc, next, -59); + FALLTHROUGH_INTENDED; + case 58: + CRCtriplet(crc, next, -58); + FALLTHROUGH_INTENDED; + case 57: + CRCtriplet(crc, next, -57); + FALLTHROUGH_INTENDED; + case 56: + CRCtriplet(crc, next, -56); + FALLTHROUGH_INTENDED; + case 55: + CRCtriplet(crc, next, -55); + FALLTHROUGH_INTENDED; + case 54: + CRCtriplet(crc, next, -54); + FALLTHROUGH_INTENDED; + case 53: + CRCtriplet(crc, next, -53); + FALLTHROUGH_INTENDED; + case 52: + CRCtriplet(crc, next, -52); + FALLTHROUGH_INTENDED; + case 51: + CRCtriplet(crc, next, -51); + FALLTHROUGH_INTENDED; + case 50: + CRCtriplet(crc, next, -50); + FALLTHROUGH_INTENDED; + case 49: + CRCtriplet(crc, next, -49); + FALLTHROUGH_INTENDED; + case 48: + CRCtriplet(crc, next, -48); + FALLTHROUGH_INTENDED; + case 47: + CRCtriplet(crc, next, -47); + FALLTHROUGH_INTENDED; + case 46: + CRCtriplet(crc, next, -46); + FALLTHROUGH_INTENDED; + case 45: + CRCtriplet(crc, next, -45); + FALLTHROUGH_INTENDED; + case 44: + CRCtriplet(crc, next, -44); + FALLTHROUGH_INTENDED; + case 43: + CRCtriplet(crc, next, -43); + 
            FALLTHROUGH_INTENDED;
+          case 42:
+            CRCtriplet(crc, next, -42);
+            FALLTHROUGH_INTENDED;
+          case 41:
+            CRCtriplet(crc, next, -41);
+            FALLTHROUGH_INTENDED;
+          case 40:
+            CRCtriplet(crc, next, -40);
+            FALLTHROUGH_INTENDED;
+          case 39:
+            CRCtriplet(crc, next, -39);
+            FALLTHROUGH_INTENDED;
+          case 38:
+            CRCtriplet(crc, next, -38);
+            FALLTHROUGH_INTENDED;
+          case 37:
+            CRCtriplet(crc, next, -37);
+            FALLTHROUGH_INTENDED;
+          case 36:
+            CRCtriplet(crc, next, -36);
+            FALLTHROUGH_INTENDED;
+          case 35:
+            CRCtriplet(crc, next, -35);
+            FALLTHROUGH_INTENDED;
+          case 34:
+            CRCtriplet(crc, next, -34);
+            FALLTHROUGH_INTENDED;
+          case 33:
+            CRCtriplet(crc, next, -33);
+            FALLTHROUGH_INTENDED;
+          case 32:
+            CRCtriplet(crc, next, -32);
+            FALLTHROUGH_INTENDED;
+          case 31:
+            CRCtriplet(crc, next, -31);
+            FALLTHROUGH_INTENDED;
+          case 30:
+            CRCtriplet(crc, next, -30);
+            FALLTHROUGH_INTENDED;
+          case 29:
+            CRCtriplet(crc, next, -29);
+            FALLTHROUGH_INTENDED;
+          case 28:
+            CRCtriplet(crc, next, -28);
+            FALLTHROUGH_INTENDED;
+          case 27:
+            CRCtriplet(crc, next, -27);
+            FALLTHROUGH_INTENDED;
+          case 26:
+            CRCtriplet(crc, next, -26);
+            FALLTHROUGH_INTENDED;
+          case 25:
+            CRCtriplet(crc, next, -25);
+            FALLTHROUGH_INTENDED;
+          case 24:
+            CRCtriplet(crc, next, -24);
+            FALLTHROUGH_INTENDED;
+          case 23:
+            CRCtriplet(crc, next, -23);
+            FALLTHROUGH_INTENDED;
+          case 22:
+            CRCtriplet(crc, next, -22);
+            FALLTHROUGH_INTENDED;
+          case 21:
+            CRCtriplet(crc, next, -21);
+            FALLTHROUGH_INTENDED;
+          case 20:
+            CRCtriplet(crc, next, -20);
+            FALLTHROUGH_INTENDED;
+          case 19:
+            CRCtriplet(crc, next, -19);
+            FALLTHROUGH_INTENDED;
+          case 18:
+            CRCtriplet(crc, next, -18);
+            FALLTHROUGH_INTENDED;
+          case 17:
+            CRCtriplet(crc, next, -17);
+            FALLTHROUGH_INTENDED;
+          case 16:
+            CRCtriplet(crc, next, -16);
+            FALLTHROUGH_INTENDED;
+          case 15:
+            CRCtriplet(crc, next, -15);
+            FALLTHROUGH_INTENDED;
+          case 14:
+            CRCtriplet(crc, next, -14);
+            FALLTHROUGH_INTENDED;
+          case 13:
+            CRCtriplet(crc, next, -13);
+            FALLTHROUGH_INTENDED;
+          case 12:
+            CRCtriplet(crc, next, -12);
+            FALLTHROUGH_INTENDED;
+          case 11:
+            CRCtriplet(crc, next, -11);
+            FALLTHROUGH_INTENDED;
+          case 10:
+            CRCtriplet(crc, next, -10);
+            FALLTHROUGH_INTENDED;
+          case 9:
+            CRCtriplet(crc, next, -9);
+            FALLTHROUGH_INTENDED;
+          case 8:
+            CRCtriplet(crc, next, -8);
+            FALLTHROUGH_INTENDED;
+          case 7:
+            CRCtriplet(crc, next, -7);
+            FALLTHROUGH_INTENDED;
+          case 6:
+            CRCtriplet(crc, next, -6);
+            FALLTHROUGH_INTENDED;
+          case 5:
+            CRCtriplet(crc, next, -5);
+            FALLTHROUGH_INTENDED;
+          case 4:
+            CRCtriplet(crc, next, -4);
+            FALLTHROUGH_INTENDED;
+          case 3:
+            CRCtriplet(crc, next, -3);
+            FALLTHROUGH_INTENDED;
+          case 2:
+            CRCtriplet(crc, next, -2);
+            FALLTHROUGH_INTENDED;
+          case 1:
+            CRCduplet(crc, next, -1);  // the final triplet is actually only 2
+            //{ CombineCRC(); }
+            crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2);
+            if (--n > 0) {
+              crc1 = crc2 = 0;
+              block_size = 128;
+              // points to the first byte of the next block
+              next0 = next2 + 128;
+              next1 = next0 + 128;  // from here on all blocks are 128 long
+              next2 = next1 + 128;
+            }
+            FALLTHROUGH_INTENDED;
+          case 0:;
+        } while (n > 0);
+      }
+      next = (const unsigned char*)next2;
+    }
+    uint64_t count2 = len >> 3;  // 216 or fewer bytes is 27 or fewer singlets
+    len = len & 7;
+    next += (count2 * 8);
+    switch (count2) {
+      case 27:
+        CRCsinglet(crc0, next, -27 * 8);
+        FALLTHROUGH_INTENDED;
+      case 26:
+        CRCsinglet(crc0, next, -26 * 8);
+        FALLTHROUGH_INTENDED;
+      case 25:
+        CRCsinglet(crc0, next, -25 * 8);
+        FALLTHROUGH_INTENDED;
+      case 24:
+        CRCsinglet(crc0, next, -24 * 8);
+        FALLTHROUGH_INTENDED;
+      case 23:
+        CRCsinglet(crc0, next, -23 * 8);
+        FALLTHROUGH_INTENDED;
+      case 22:
+        CRCsinglet(crc0, next, -22 * 8);
+        FALLTHROUGH_INTENDED;
+      case 21:
+        CRCsinglet(crc0, next, -21 * 8);
+        FALLTHROUGH_INTENDED;
+      case 20:
+        CRCsinglet(crc0, next, -20 * 8);
+        FALLTHROUGH_INTENDED;
+      case 19:
+        CRCsinglet(crc0, next, -19 * 8);
+        FALLTHROUGH_INTENDED;
+      case 18:
+        CRCsinglet(crc0, next, -18 * 8);
+        FALLTHROUGH_INTENDED;
+      case 17:
+        CRCsinglet(crc0, next, -17 * 8);
+        FALLTHROUGH_INTENDED;
+      case 16:
+        CRCsinglet(crc0, next, -16 * 8);
+        FALLTHROUGH_INTENDED;
+      case 15:
+        CRCsinglet(crc0, next, -15 * 8);
+        FALLTHROUGH_INTENDED;
+      case 14:
+        CRCsinglet(crc0, next, -14 * 8);
+        FALLTHROUGH_INTENDED;
+      case 13:
+        CRCsinglet(crc0, next, -13 * 8);
+        FALLTHROUGH_INTENDED;
+      case 12:
+        CRCsinglet(crc0, next, -12 * 8);
+        FALLTHROUGH_INTENDED;
+      case 11:
+        CRCsinglet(crc0, next, -11 * 8);
+        FALLTHROUGH_INTENDED;
+      case 10:
+        CRCsinglet(crc0, next, -10 * 8);
+        FALLTHROUGH_INTENDED;
+      case 9:
+        CRCsinglet(crc0, next, -9 * 8);
+        FALLTHROUGH_INTENDED;
+      case 8:
+        CRCsinglet(crc0, next, -8 * 8);
+        FALLTHROUGH_INTENDED;
+      case 7:
+        CRCsinglet(crc0, next, -7 * 8);
+        FALLTHROUGH_INTENDED;
+      case 6:
+        CRCsinglet(crc0, next, -6 * 8);
+        FALLTHROUGH_INTENDED;
+      case 5:
+        CRCsinglet(crc0, next, -5 * 8);
+        FALLTHROUGH_INTENDED;
+      case 4:
+        CRCsinglet(crc0, next, -4 * 8);
+        FALLTHROUGH_INTENDED;
+      case 3:
+        CRCsinglet(crc0, next, -3 * 8);
+        FALLTHROUGH_INTENDED;
+      case 2:
+        CRCsinglet(crc0, next, -2 * 8);
+        FALLTHROUGH_INTENDED;
+      case 1:
+        CRCsinglet(crc0, next, -1 * 8);
+        FALLTHROUGH_INTENDED;
+      case 0:;
+    }
+  }
+  {
+    align_to_8(len, crc0, next);
+    return (uint32_t)crc0 ^ 0xffffffffu;
+  }
+}
+
+#endif  //__SSE4_2__ && __PCLMUL__
+
+static inline Function Choose_Extend() {
+#ifdef HAVE_POWER8
+  return isAltiVec() ? ExtendPPCImpl : ExtendImpl<DefaultCRC32>;
+#elif defined(HAVE_ARM64_CRC)
+  if (crc32c_runtime_check()) {
+    pmull_runtime_flag = crc32c_pmull_runtime_check();
+    return ExtendARMImpl;
+  } else {
+    return ExtendImpl<DefaultCRC32>;
+  }
+#elif defined(__SSE4_2__) && defined(__PCLMUL__) && !defined NO_THREEWAY_CRC32C
+  // NOTE: runtime detection no longer supported on x86
+  (void)ExtendImpl<DefaultCRC32>;  // suppress unused warning
+  return crc32c_3way;
+#else
+  return ExtendImpl<DefaultCRC32>;
+#endif
+}
+
+static Function ChosenExtend = Choose_Extend();
+uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
+  return ChosenExtend(crc, buf, size);
+}
+
+// The code for crc32c combine, copied with permission from folly
+
+// Standard galois-field multiply. The only modification is that a,
+// b, m, and p are all bit-reflected.
+//
+// https://en.wikipedia.org/wiki/Finite_field_arithmetic
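+//
+// In this bit-reflected form the recursion below is plain shift-and-add over
+// GF(2): each of the 32 steps xors `a` into the product `p` when the current
+// top bit of `b` is set, then steps `a` once "times x" modulo the polynomial
+// `m` (a right shift with a conditional xor of `m`).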
+static constexpr uint32_t gf_multiply_sw_1(
+    size_t i, uint32_t p, uint32_t a, uint32_t b, uint32_t m) {
+  // clang-format off
+  return i == 32 ? p : gf_multiply_sw_1(
+      /* i = */ i + 1,
+      /* p = */ p ^ ((0u-((b >> 31) & 1)) & a),
+      /* a = */ (a >> 1) ^ ((0u-(a & 1)) & m),
+      /* b = */ b << 1,
+      /* m = */ m);
+  // clang-format on
+}
+static constexpr uint32_t gf_multiply_sw(uint32_t a, uint32_t b, uint32_t m) {
+  return gf_multiply_sw_1(/* i = */ 0, /* p = */ 0, a, b, m);
+}
+
+static constexpr uint32_t gf_square_sw(uint32_t a, uint32_t m) {
+  return gf_multiply_sw(a, a, m);
+}
+
+template <size_t i, uint32_t m>
+struct gf_powers_memo {
+  static constexpr uint32_t value =
+      gf_square_sw(gf_powers_memo<i - 1, m>::value, m);
+};
+template <uint32_t m>
+struct gf_powers_memo<0, m> {
+  static constexpr uint32_t value = m;
+};
+
+template <typename T, T... Ints>
+struct integer_sequence {
+  using value_type = T;
+  static constexpr size_t size() { return sizeof...(Ints); }
+};
+
+template <typename T, size_t N, T... Is>
+struct make_integer_sequence : make_integer_sequence<T, N - 1, N - 1, Is...> {};
+
+template <typename T, T... Is>
+struct make_integer_sequence<T, 0, Is...> : integer_sequence<T, Is...> {};
+
+template <size_t N>
+using make_index_sequence = make_integer_sequence<size_t, N>;
+
+template <uint32_t m>
+struct gf_powers_make {
+  template <size_t... i>
+  using index_sequence = integer_sequence<size_t, i...>;
+  template <size_t... i>
+  constexpr std::array<uint32_t, sizeof...(i)> operator()(
+      index_sequence<i...>) const {
+    return std::array<uint32_t, sizeof...(i)>{{gf_powers_memo<i, m>::value...}};
+  }
+};
+
+static constexpr uint32_t crc32c_m = 0x82f63b78;
+
+static constexpr std::array<uint32_t, 62> const crc32c_powers =
+    gf_powers_make<crc32c_m>{}(make_index_sequence<62>{});
+
+// Expects a "pure" crc (see Crc32cCombine)
+static uint32_t Crc32AppendZeroes(
+    uint32_t crc, size_t len_over_4, uint32_t polynomial,
+    std::array<uint32_t, 62> const& powers_array) {
+  auto powers = powers_array.data();
+  // Append by multiplying by consecutive powers of two of the zeroes
+  // array
+  size_t len_bits = len_over_4;
+
+  while (len_bits) {
+    // Advance directly to next bit set.
+    auto r = CountTrailingZeroBits(len_bits);
+    len_bits >>= r;
+    powers += r;
+
+    crc = gf_multiply_sw(crc, *powers, polynomial);
+
+    len_bits >>= 1;
+    powers++;
+  }
+
+  return crc;
+}
+
+static inline uint32_t InvertedToPure(uint32_t crc) { return ~crc; }
+
+static inline uint32_t PureToInverted(uint32_t crc) { return ~crc; }
+
+static inline uint32_t PureExtend(uint32_t crc, const char* buf, size_t size) {
+  return InvertedToPure(Extend(PureToInverted(crc), buf, size));
+}
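+
+// A quick sanity check of the "pure" property relied on below: a pure CRC is
+// linear, crc(a ^ b) == crc(a) ^ crc(b), and maps an all-zero string to zero,
+// which is exactly what lets Crc32cCombine extend crc1 over virtual zeros and
+// then xor in crc2.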
+// Background:
+// RocksDB uses two kinds of crc32c values: masked and unmasked. Neither is
+// a "pure" CRC because a pure CRC satisfies (^ for xor)
+//   crc(a ^ b) = crc(a) ^ crc(b)
+// The unmasked is closest, and this function takes unmasked crc32c values.
+// The unmasked values are impure in two ways:
+// * The initial setting at the start of CRC computation is all 1 bits
+//   (like -1) instead of zero.
+// * The result has all bits inverted.
+// Note that together, these result in the empty string having a crc32c of
+// zero. See
+// https://en.wikipedia.org/wiki/Computation_of_cyclic_redundancy_checks#CRC_variants
+//
+// Simplified version of strategy, using xor through pure CRCs (+ for concat):
+//
+// pure_crc(str1 + str2) = pure_crc(str1 + zeros(len(str2))) ^
+//                         pure_crc(zeros(len(str1)) + str2)
+//
+// because the xor of these two zero-padded strings is str1 + str2. For pure
+// CRC, leading zeros don't affect the result, so we only need
+//
+// pure_crc(str1 + str2) = pure_crc(str1 + zeros(len(str2))) ^
+//                         pure_crc(str2)
+//
+// Considering we aren't working with pure CRCs, what is actually in the input?
+//
+// crc1 = PureToInverted(PureExtendCrc32c(-1, zeros, crc1len) ^
+//                       PureCrc32c(str1, crc1len))
+// crc2 = PureToInverted(PureExtendCrc32c(-1, zeros, crc2len) ^
+//                       PureCrc32c(str2, crc2len))
+//
+// The result we want to compute is
+// combined = PureToInverted(PureExtendCrc32c(PureExtendCrc32c(-1, zeros,
+//                                                             crc1len) ^
+//                                            PureCrc32c(str1, crc1len),
+//                                            zeros, crc2len) ^
+//                           PureCrc32c(str2, crc2len))
+//
+// Thus, in addition to extending crc1 over the length of str2 in (virtual)
+// zeros, we need to cancel out the -1 initializer that was used in computing
+// crc2. To cancel it out, we also need to extend it over crc2len in zeros.
+// To simplify, since the end of str1 and that -1 initializer for crc2 are at
+// the same logical position, we can combine them before we extend over the
+// zeros.
+uint32_t Crc32cCombine(uint32_t crc1, uint32_t crc2, size_t crc2len) {
+  uint32_t pure_crc1_with_init = InvertedToPure(crc1);
+  uint32_t pure_crc2_with_init = InvertedToPure(crc2);
+  uint32_t pure_crc2_init = static_cast<uint32_t>(-1);
+
+  // Append up to 32 bits of zeroes in the normal way
+  char zeros[4] = {0, 0, 0, 0};
+  auto len = crc2len & 3;
+  uint32_t tmp = pure_crc1_with_init ^ pure_crc2_init;
+  if (len) {
+    tmp = PureExtend(tmp, zeros, len);
+  }
+  return PureToInverted(
+      Crc32AppendZeroes(tmp, crc2len / 4, crc32c_m, crc32c_powers) ^
+      pure_crc2_with_init);
+}
+
+}  // namespace crc32c
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c.h
new file mode 100644
index 0000000000..a08ad60af3
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace crc32c {
+
+extern std::string IsFastCrc32Supported();
+
+// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
+// crc32c of some string A. Extend() is often used to maintain the
+// crc32c of a stream of data.
+extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n);
+
+// Takes two unmasked crc32c values, and the length of the string from
+// which `crc2` was computed, and computes a crc32c value for the
+// concatenation of the original two input strings. Running time is
+// ~ log(crc2len).
+extern uint32_t Crc32cCombine(uint32_t crc1, uint32_t crc2, size_t crc2len);
+
+// Return the crc32c of data[0,n-1]
+inline uint32_t Value(const char* data, size_t n) { return Extend(0, data, n); }
+
+static const uint32_t kMaskDelta = 0xa282ead8ul;
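+
+// As a worked example of the masking below: Mask(0) == 0xa282ead8 (rotating
+// zero is still zero, so only kMaskDelta remains), and Unmask(Mask(c)) == c
+// for every c, since rotate-right-15 is undone by rotate-right-17 (their sum
+// is a full 32-bit rotation) and the kMaskDelta add/subtract cancel mod 2^32.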
+// Return a masked representation of crc.
+//
+// Motivation: it is problematic to compute the CRC of a string that
+// contains embedded CRCs. Therefore we recommend that CRCs stored
+// somewhere (e.g., in files) should be masked before being stored.
+inline uint32_t Mask(uint32_t crc) {
+  // Rotate right by 15 bits and add a constant.
+  return ((crc >> 15) | (crc << 17)) + kMaskDelta;
+}
+
+// Return the crc whose masked representation is masked_crc.
+inline uint32_t Unmask(uint32_t masked_crc) {
+  uint32_t rot = masked_crc - kMaskDelta;
+  return ((rot >> 17) | (rot << 15));
+}
+
+}  // namespace crc32c
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_arm64.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_arm64.cc
new file mode 100644
index 0000000000..98d1c307db
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_arm64.cc
@@ -0,0 +1,213 @@
+// Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "util/crc32c_arm64.h"
+
+#if defined(HAVE_ARM64_CRC)
+
+#if defined(__linux__)
+#include <asm/hwcap.h>
+#endif
+#ifdef ROCKSDB_AUXV_GETAUXVAL_PRESENT
+#include <sys/auxv.h>
+#endif
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif
+#ifndef HWCAP_PMULL
+#define HWCAP_PMULL (1 << 4)
+#endif
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+#if defined(__OpenBSD__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#include <machine/armreg.h>
+#endif
+
+#ifdef HAVE_ARM64_CRYPTO
+/* unfolding to compute 8 * 3 = 24 bytes in parallel */
+#define CRC32C24BYTES(ITR)                                    \
+  crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH + (ITR)));     \
+  crc2 = crc32c_u64(crc2, *(buf64 + BLK_LENGTH * 2 + (ITR))); \
+  crc0 = crc32c_u64(crc0, *(buf64 + (ITR)));
+
+/* unfolding to compute 24 * 7 = 168 bytes in parallel */
+#define CRC32C7X24BYTES(ITR)   \
+  do {                         \
+    CRC32C24BYTES((ITR)*7 + 0) \
+    CRC32C24BYTES((ITR)*7 + 1) \
+    CRC32C24BYTES((ITR)*7 + 2) \
+    CRC32C24BYTES((ITR)*7 + 3) \
+    CRC32C24BYTES((ITR)*7 + 4) \
+    CRC32C24BYTES((ITR)*7 + 5) \
+    CRC32C24BYTES((ITR)*7 + 6) \
+  } while (0)
+#endif
+
+extern bool pmull_runtime_flag;
+
+uint32_t crc32c_runtime_check(void) {
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__)
+  uint64_t auxv = 0;
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
+  auxv = getauxval(AT_HWCAP);
+#elif defined(__FreeBSD__)
+  elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv));
+#endif
+  return (auxv & HWCAP_CRC32) != 0;
+#elif defined(__APPLE__)
+  int r;
+  size_t l = sizeof(r);
+  if (sysctlbyname("hw.optional.armv8_crc32", &r, &l, NULL, 0) == -1) return 0;
+  return r == 1;
+#elif defined(__OpenBSD__)
+  int r = 0;
+  const int isar0_mib[] = {CTL_MACHDEP, CPU_ID_AA64ISAR0};
+  uint64_t isar0;
+  size_t len = sizeof(isar0);
+
+  if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+    if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE) r = 1;
+  }
+  return r;
+#else
+  return 0;
+#endif
+}
+
+bool crc32c_pmull_runtime_check(void) {
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__)
+  uint64_t auxv = 0;
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
+  auxv = getauxval(AT_HWCAP);
+#elif defined(__FreeBSD__)
+  elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv));
+#endif
+  return (auxv & HWCAP_PMULL) != 0;
+#elif defined(__APPLE__)
+  return true;
+#elif defined(__OpenBSD__)
+  bool r = false;
+  const int isar0_mib[] = {CTL_MACHDEP, CPU_ID_AA64ISAR0};
+  uint64_t isar0;
+  size_t len = sizeof(isar0);
+
+  if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+    if (ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_PMULL) r = true;
+  }
+  return r;
+#else
+  return false;
+#endif
+}
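+
+// The sanitizer attributes below turn off UBSan's alignment checking for
+// crc32c_arm64: the function intentionally reads the buffer through
+// (uint64_t*)/(uint32_t*)/(uint16_t*) casts of byte pointers, which AArch64
+// handles fine but UBSan would flag as misaligned loads.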
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+uint32_t
+crc32c_arm64(uint32_t crc, unsigned char const *data, size_t len) {
+  const uint8_t *buf8;
+  const uint64_t *buf64 = (uint64_t *)data;
+  int length = (int)len;
+  crc ^= 0xffffffff;
+
+  /*
+   * Pmull runtime check here.
+   * Raspberry Pi supports crc32 but doesn't support pmull.
+   * Skip Crc32c Parallel computation if no crypto extension available.
+   */
+  if (pmull_runtime_flag) {
+/* Macro (HAVE_ARM64_CRYPTO) is used for compiling check */
+#ifdef HAVE_ARM64_CRYPTO
+/* Crc32c Parallel computation
+ *   Algorithm comes from Intel whitepaper:
+ *   crc-iscsi-polynomial-crc32-instruction-paper
+ *
+ * Input data is divided into three equal-sized blocks
+ *   Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes
+ *   One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes
+ */
+#define BLK_LENGTH 42
+    while (length >= 1024) {
+      uint64_t t0, t1;
+      uint32_t crc0 = 0, crc1 = 0, crc2 = 0;
+
+      /* Parallel Param:
+       *   k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
+       *   k1 = CRC32(x ^ (42 * 8 * 8 - 1));
+       */
+      uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
+
+      /* Prefetch data for following block to avoid cache miss */
+      PREF1KL1((uint8_t *)buf64, 1024);
+
+      /* First 8 byte for better pipelining */
+      crc0 = crc32c_u64(crc, *buf64++);
+
+      /* 3 blocks crc32c parallel computation
+       * Macro unfolding to compute in parallel
+       *  168 * 6 = 1008 (bytes)
+       */
+      CRC32C7X24BYTES(0);
+      CRC32C7X24BYTES(1);
+      CRC32C7X24BYTES(2);
+      CRC32C7X24BYTES(3);
+      CRC32C7X24BYTES(4);
+      CRC32C7X24BYTES(5);
+      buf64 += (BLK_LENGTH * 3);
+
+      /* Last 8 bytes */
+      crc = crc32c_u64(crc2, *buf64++);
+
+      t0 = (uint64_t)vmull_p64(crc0, k0);
+      t1 = (uint64_t)vmull_p64(crc1, k1);
+
+      /* Merge (crc0, crc1, crc2) -> crc */
+      crc1 = crc32c_u64(0, t1);
+      crc ^= crc1;
+      crc0 = crc32c_u64(0, t0);
+      crc ^= crc0;
+
+      length -= 1024;
+    }
+
+    if (length == 0) return crc ^ (0xffffffffU);
+#endif
+  }  // if Pmull runtime check here
+
+  buf8 = (const uint8_t *)buf64;
+  while (length >= 8) {
+    crc = crc32c_u64(crc, *(const uint64_t *)buf8);
+    buf8 += 8;
+    length -= 8;
+  }
+
+  /* The following is more efficient than the straight loop */
+  if (length >= 4) {
+    crc = crc32c_u32(crc, *(const uint32_t *)buf8);
+    buf8 += 4;
+    length -= 4;
+  }
+
+  if (length >= 2) {
+    crc = crc32c_u16(crc, *(const uint16_t *)buf8);
+    buf8 += 2;
+    length -= 2;
+  }
+
+  if (length >= 1) crc = crc32c_u8(crc, *buf8);
+
+  crc ^= 0xffffffff;
+  return crc;
+}
+
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_arm64.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_arm64.h
new file mode 100644
index 0000000000..4b27fe8710
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_arm64.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef UTIL_CRC32C_ARM64_H
+#define UTIL_CRC32C_ARM64_H
+
+#include <cinttypes>
+#include <cstddef>
+
+#if defined(__aarch64__) || defined(__AARCH64__)
+
+#ifdef __ARM_FEATURE_CRC32
+#define HAVE_ARM64_CRC
+#include <arm_acle.h>
+#define crc32c_u8(crc, v) __crc32cb(crc, v)
+#define crc32c_u16(crc, v) __crc32ch(crc, v)
+#define crc32c_u32(crc, v) __crc32cw(crc, v)
+#define crc32c_u64(crc, v) __crc32cd(crc, v)
+// clang-format off
+#define PREF4X64L1(buffer, PREF_OFFSET, ITR)                 \
+  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer),  \
+          [c] "I"((PREF_OFFSET) + ((ITR) + 0) * 64));        \
+  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer),  \
+          [c] "I"((PREF_OFFSET) + ((ITR) + 1) * 64));        \
+  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer),  \
+          [c] "I"((PREF_OFFSET) + ((ITR) + 2) * 64));        \
+  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer),  \
+          [c] "I"((PREF_OFFSET) + ((ITR) + 3) * 64));
+// clang-format on
+
+#define PREF1KL1(buffer, PREF_OFFSET)  \
+  PREF4X64L1(buffer, (PREF_OFFSET), 0) \
+  PREF4X64L1(buffer, (PREF_OFFSET), 4) \
+  PREF4X64L1(buffer, (PREF_OFFSET), 8) \
+  PREF4X64L1(buffer, (PREF_OFFSET), 12)
+
+extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
+                             size_t len);
+extern uint32_t crc32c_runtime_check(void);
+extern bool crc32c_pmull_runtime_check(void);
+
+#ifdef __ARM_FEATURE_CRYPTO
+#define HAVE_ARM64_CRYPTO
+#include <arm_neon.h>
+#endif  // __ARM_FEATURE_CRYPTO
+#endif  // __ARM_FEATURE_CRC32
+
+#endif  // defined(__aarch64__) || defined(__AARCH64__)
+
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_ppc.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_ppc.h
new file mode 100644
index 0000000000..f0b0b66d5d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_ppc.h
@@ -0,0 +1,22 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2017 International Business Machines Corp.
+// All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer,
+                           size_t len);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_ppc_constants.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_ppc_constants.h
new file mode 100644
index 0000000000..f6494cd01c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/crc32c_ppc_constants.h
@@ -0,0 +1,900 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (C) 2015, 2017 International Business Machines Corp.
+// All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#pragma once + +#define CRC 0x1edc6f41 +#define REFLECT +#define CRC_XOR + +#ifndef __ASSEMBLY__ +#ifdef CRC_TABLE +static const unsigned int crc_table[] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, + 0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c, + 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, + 0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512, + 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, + 0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf, + 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, + 0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f, + 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, + 0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e, + 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, + 0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4, + 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, + 0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5, + 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, + 0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905, + 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, + 0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8, + 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, + 0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6, + 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, + 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351, +}; + +#endif + +#else +#define MAX_SIZE 32768 +.constants : + + /* Reduce 262144 kbits to 1024 bits */ + /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ + .octa 0x00000000b6ca9e20000000009c37c408 + + /* x^260096 mod p(x)` << 1, 
x^260160 mod p(x)` << 1 */ + .octa 0x00000000350249a800000001b51df26c + + /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ + .octa 0x00000001862dac54000000000724b9d0 + + /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ + .octa 0x00000001d87fb48c00000001c00532fe + + /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ + .octa 0x00000001f39b699e00000000f05a9362 + + /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ + .octa 0x0000000101da11b400000001e1007970 + + /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ + .octa 0x00000001cab571e000000000a57366ee + + /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ + .octa 0x00000000c7020cfe0000000192011284 + + /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ + .octa 0x00000000cdaed1ae0000000162716d9a + + /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ + .octa 0x00000001e804effc00000000cd97ecde + + /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ + .octa 0x0000000077c3ea3a0000000058812bc0 + + /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */ + .octa 0x0000000068df31b40000000088b8c12e + + /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ + .octa 0x00000000b059b6c200000001230b234c + + /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ + .octa 0x0000000145fb8ed800000001120b416e + + /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ + .octa 0x00000000cbc0916800000001974aecb0 + + /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ + .octa 0x000000005ceeedc2000000008ee3f226 + + /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ + .octa 0x0000000047d74e8600000001089aba9a + + /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ + .octa 0x00000001407e9e220000000065113872 + + /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ + .octa 0x00000001da967bda000000005c07ec10 + + /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ + .octa 0x000000006c8983680000000187590924 + + /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ + .octa 0x00000000f2d14c9800000000e35da7c6 + + /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ + .octa 0x00000001993c6ad4000000000415855a + + /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ + .octa 0x000000014683d1ac0000000073617758 + + /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ + .octa 0x00000001a7c93e6c0000000176021d28 + + /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ + .octa 0x000000010211e90a00000001c358fd0a + + /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ + .octa 0x000000001119403e00000001ff7a2c18 + + /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ + .octa 0x000000001c3261aa00000000f2d9f7e4 + + /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ + .octa 0x000000014e37a634000000016cf1f9c8 + + /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ + .octa 0x0000000073786c0c000000010af9279a + + /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ + .octa 0x000000011dc037f80000000004f101e8 + + /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ + .octa 0x0000000031433dfc0000000070bcf184 + + /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ + .octa 0x000000009cde8348000000000a8de642 + + /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ + .octa 0x0000000038d3c2a60000000062ea130c + + /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ + .octa 0x000000011b25f26000000001eb31cbb2 + + /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ + .octa 0x000000001629e6f00000000170783448 + + /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ + .octa 0x0000000160838b4c00000001a684b4c6 + + /* x^224256 
mod p(x)` << 1, x^224320 mod p(x)` << 1 */ + .octa 0x000000007a44011c00000000253ca5b4 + + /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ + .octa 0x00000000226f417a0000000057b4b1e2 + + /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ + .octa 0x0000000045eb2eb400000000b6bd084c + + /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ + .octa 0x000000014459d70c0000000123c2d592 + + /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */ + .octa 0x00000001d406ed8200000000159dafce + + /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ + .octa 0x0000000160c8e1a80000000127e1a64e + + /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ + .octa 0x0000000027ba80980000000056860754 + + /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ + .octa 0x000000006d92d01800000001e661aae8 + + /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ + .octa 0x000000012ed7e3f200000000f82c6166 + + /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ + .octa 0x000000002dc8778800000000c4f9c7ae + + /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ + .octa 0x0000000018240bb80000000074203d20 + + /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ + .octa 0x000000001ad381580000000198173052 + + /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ + .octa 0x00000001396b78f200000001ce8aba54 + + /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ + .octa 0x000000011a68133400000001850d5d94 + + /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */ + .octa 0x000000012104732e00000001d609239c + + /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ + .octa 0x00000000a140d90c000000001595f048 + + /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ + .octa 0x00000001b7215eda0000000042ccee08 + + /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ + .octa 0x00000001aaf1df3c000000010a389d74 + + /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ + .octa 0x0000000029d15b8a000000012a840da6 + + /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ + .octa 0x00000000f1a96922000000001d181c0c + + /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ + .octa 0x00000001ac80d03c0000000068b7d1f6 + + /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ + .octa 0x000000000f11d56a000000005b0f14fc + + /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ + .octa 0x00000001f1c022a20000000179e9e730 + + /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ + .octa 0x0000000173d00ae200000001ce1368d6 + + /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ + .octa 0x00000001d4ffe4ac0000000112c3a84c + + /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ + .octa 0x000000016edc5ae400000000de940fee + + /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ + .octa 0x00000001f1a0214000000000fe896b7e + + /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ + .octa 0x00000000ca0b28a000000001f797431c + + /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ + .octa 0x00000001928e30a20000000053e989ba + + /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ + .octa 0x0000000097b1b002000000003920cd16 + + /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */ + .octa 0x00000000b15bf90600000001e6f579b8 + + /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ + .octa 0x00000000411c5d52000000007493cb0a + + /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ + .octa 0x00000001c36f330000000001bdd376d8 + + /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ + .octa 0x00000001119227e0000000016badfee6 + + /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ + .octa 0x00000000114d47020000000071de5c58 
+ + /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ + .octa 0x00000000458b5b9800000000453f317c + + /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ + .octa 0x000000012e31fb8e0000000121675cce + + /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ + .octa 0x000000005cf619d800000001f409ee92 + + /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ + .octa 0x0000000063f4d8b200000000f36b9c88 + + /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ + .octa 0x000000004138dc8a0000000036b398f4 + + /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ + .octa 0x00000001d29ee8e000000001748f9adc + + /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ + .octa 0x000000006a08ace800000001be94ec00 + + /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ + .octa 0x0000000127d4201000000000b74370d6 + + /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */ + .octa 0x0000000019d76b6200000001174d0b98 + + /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ + .octa 0x00000001b1471f6e00000000befc06a4 + + /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ + .octa 0x00000001f64c19cc00000001ae125288 + + /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ + .octa 0x00000000003c0ea00000000095c19b34 + + /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ + .octa 0x000000014d73abf600000001a78496f2 + + /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ + .octa 0x00000001620eb84400000001ac5390a0 + + /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */ + .octa 0x0000000147655048000000002a80ed6e + + /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ + .octa 0x0000000067b5077e00000001fa9b0128 + + /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ + .octa 0x0000000010ffe20600000001ea94929e + + /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ + .octa 0x000000000fee8f1e0000000125f4305c + + /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ + .octa 0x00000001da26fbae00000001471e2002 + + /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ + .octa 0x00000001b3a8bd880000000132d2253a + + /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ + .octa 0x00000000e8f3898e00000000f26b3592 + + /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ + .octa 0x00000000b0d0d28c00000000bc8b67b0 + + /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ + .octa 0x0000000030f2a798000000013a826ef2 + + /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ + .octa 0x000000000fba10020000000081482c84 + + /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ + .octa 0x00000000bdb9bd7200000000e77307c2 + + /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ + .octa 0x0000000075d3bf5a00000000d4a07ec8 + + /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ + .octa 0x00000000ef1f98a00000000017102100 + + /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ + .octa 0x00000000689c760200000000db406486 + + /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ + .octa 0x000000016d5fa5fe0000000192db7f88 + + /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ + .octa 0x00000001d0d2b9ca000000018bf67b1e + + /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */ + .octa 0x0000000041e7b470000000007c09163e + + /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ + .octa 0x00000001cbb6495e000000000adac060 + + /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ + .octa 0x000000010052a0b000000000bd8316ae + + /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ + .octa 0x00000001d8effb5c000000019f09ab54 + + /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ + .octa 
0x00000001d969853c0000000125155542 + + /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ + .octa 0x00000000523ccce2000000018fdb5882 + + /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ + .octa 0x000000001e2436bc00000000e794b3f4 + + /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ + .octa 0x00000000ddd1c3a2000000016f9bb022 + + /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ + .octa 0x0000000019fcfe3800000000290c9978 + + /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ + .octa 0x00000001ce95db640000000083c0f350 + + /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ + .octa 0x00000000af5828060000000173ea6628 + + /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ + .octa 0x00000001006388f600000001c8b4e00a + + /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ + .octa 0x0000000179eca00a00000000de95d6aa + + /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ + .octa 0x0000000122410a6a000000010b7f7248 + + /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ + .octa 0x000000004288e87c00000001326e3a06 + + /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ + .octa 0x000000016c5490da00000000bb62c2e6 + + /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ + .octa 0x00000000d1c71f6e0000000156a4b2c2 + + /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */ + .octa 0x00000001b4ce08a6000000011dfe763a + + /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ + .octa 0x00000001466ba60c000000007bcca8e2 + + /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ + .octa 0x00000001f6c488a40000000186118faa + + /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ + .octa 0x000000013bfb06820000000111a65a88 + + /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ + .octa 0x00000000690e9e54000000003565e1c4 + + /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ + .octa 0x00000000281346b6000000012ed02a82 + + /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ + .octa 0x000000015646402400000000c486ecfc + + /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ + .octa 0x000000016063a8dc0000000001b951b2 + + /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ + .octa 0x0000000116a663620000000048143916 + + /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ + .octa 0x000000017e8aa4d200000001dc2ae124 + + /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ + .octa 0x00000001728eb10c00000001416c58d6 + + /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ + .octa 0x00000001b08fd7fa00000000a479744a + + /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ + .octa 0x00000001092a16e80000000096ca3a26 + + /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ + .octa 0x00000000a505637c00000000ff223d4e + + /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ + .octa 0x00000000d94869b2000000010e84da42 + + /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ + .octa 0x00000001c8b203ae00000001b61ba3d0 + + /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ + .octa 0x000000005704aea000000000680f2de8 + + /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ + .octa 0x000000012e295fa2000000008772a9a8 + + /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */ + .octa 0x000000011d0908bc0000000155f295bc + + /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ + .octa 0x0000000193ed97ea00000000595f9282 + + /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ + .octa 0x000000013a0f1c520000000164b1c25a + + /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ + .octa 0x000000010c2c40c000000000fbd67c50 + + /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` 
<< 1 */ + .octa 0x00000000ff6fac3e0000000096076268 + + /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ + .octa 0x000000017b3609c000000001d288e4cc + + /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ + .octa 0x0000000088c8c92200000001eaac1bdc + + /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ + .octa 0x00000001751baae600000001f1ea39e2 + + /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ + .octa 0x000000010795297200000001eb6506fc + + /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ + .octa 0x0000000162b00abe000000010f806ffe + + /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ + .octa 0x000000000d7b404c000000010408481e + + /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ + .octa 0x00000000763b13d40000000188260534 + + /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ + .octa 0x00000000f6dc22d80000000058fc73e0 + + /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ + .octa 0x000000007daae06000000000391c59b8 + + /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ + .octa 0x000000013359ab7c000000018b638400 + + /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ + .octa 0x000000008add438a000000011738f5c4 + + /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ + .octa 0x00000001edbefdea000000008cf7c6da + + /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ + .octa 0x000000004104e0f800000001ef97fb16 + + /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ + .octa 0x00000000b48a82220000000102130e20 + + /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ + .octa 0x00000001bcb4684400000000db968898 + + /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ + .octa 0x000000013293ce0a00000000b5047b5e + + /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */ + .octa 0x00000001710d0844000000010b90fdb2 + + /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ + .octa 0x0000000117907f6e000000004834a32e + + /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ + .octa 0x0000000087ddf93e0000000059c8f2b0 + + /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ + .octa 0x000000005970e9b00000000122cec508 + + /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ + .octa 0x0000000185b2b7d0000000000a330cda + + /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ + .octa 0x00000001dcee0efc000000014a47148c + + /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ + .octa 0x0000000030da27220000000042c61cb8 + + /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ + .octa 0x000000012f925a180000000012fe6960 + + /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ + .octa 0x00000000dd2e357c00000000dbda2c20 + + /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ + .octa 0x00000000071c80de000000011122410c + + /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ + .octa 0x000000011513140a00000000977b2070 + + /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ + .octa 0x00000001df876e8e000000014050438e + + /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ + .octa 0x000000015f81d6ce0000000147c840e8 + + /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ + .octa 0x000000019dd94dbe00000001cc7c88ce + + /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */ + .octa 0x00000001373d206e00000001476b35a4 + + /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ + .octa 0x00000000668ccade000000013d52d508 + + /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ + .octa 0x00000001b192d268000000008e4be32e + + /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ + .octa 0x00000000e30f3a7800000000024120fe + + /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ + .octa 
0x000000010ef1f7bc00000000ddecddb4 + + /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ + .octa 0x00000001f5ac738000000000d4d403bc + + /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */ + .octa 0x000000011822ea7000000001734b89aa + + /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ + .octa 0x00000000c3a33848000000010e7a58d6 + + /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ + .octa 0x00000001bd151c2400000001f9f04e9c + + /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ + .octa 0x0000000056002d7600000000b692225e + + /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ + .octa 0x000000014657c4f4000000019b8d3f3e + + /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ + .octa 0x0000000113742d7c00000001a874f11e + + /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ + .octa 0x000000019c5920ba000000010d5a4254 + + /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ + .octa 0x000000005216d2d600000000bbb2f5d6 + + /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ + .octa 0x0000000136f5ad8a0000000179cc0e36 + + /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ + .octa 0x000000018b07beb600000001dca1da4a + + /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ + .octa 0x00000000db1e93b000000000feb1a192 + + /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ + .octa 0x000000000b96fa3a00000000d1eeedd6 + + /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ + .octa 0x00000001d9968af0000000008fad9bb4 + + /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ + .octa 0x000000000e4a77a200000001884938e4 + + /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ + .octa 0x00000000508c2ac800000001bc2e9bc0 + + /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ + .octa 0x0000000021572a8000000001f9658a68 + + /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ + .octa 0x00000001b859daf2000000001b9224fc + + /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ + .octa 0x000000016f7884740000000055b2fb84 + + /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ + .octa 0x00000001b438810e000000018b090348 + + /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ + .octa 0x0000000095ddc6f2000000011ccbd5ea + + /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */ + .octa 0x00000001d977c20c0000000007ae47f8 + + /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ + .octa 0x00000000ebedb99a0000000172acbec0 + + /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ + .octa 0x00000001df9e9e9200000001c6e3ff20 + + /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ + .octa 0x00000001a4a3f95200000000e1b38744 + + /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ + .octa 0x00000000e2f5122000000000791585b2 + + /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ + .octa 0x000000004aa01f3e00000000ac53b894 + + /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ + .octa 0x00000000b3e90a5800000001ed5f2cf4 + + /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ + .octa 0x000000000c9ca2aa00000001df48b2e0 + + /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ + .octa 0x000000015168231600000000049c1c62 + + /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ + .octa 0x0000000036fce78c000000017c460c12 + + /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ + .octa 0x000000009037dc10000000015be4da7e + + /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ + .octa 0x00000000d3298582000000010f38f668 + + /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ + .octa 0x00000001b42e8ad60000000039f40a00 + + /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ + .octa 0x00000000142a983800000000bd4c10c4 + + /* x^45056 mod 
p(x)` << 1, x^45120 mod p(x)` << 1 */ + .octa 0x0000000109c7f1900000000042db1d98 + + /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ + .octa 0x0000000056ff931000000001c905bae6 + + /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ + .octa 0x00000001594513aa00000000069d40ea + + /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ + .octa 0x00000001e3b5b1e8000000008e4fbad0 + + /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ + .octa 0x000000011dd5fc080000000047bedd46 + + /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ + .octa 0x00000001675f0cc20000000026396bf8 + + /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */ + .octa 0x00000000d1c8dd4400000000379beb92 + + /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ + .octa 0x0000000115ebd3d8000000000abae54a + + /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ + .octa 0x00000001ecbd0dac0000000007e6a128 + + /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ + .octa 0x00000000cdf67af2000000000ade29d2 + + /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ + .octa 0x000000004c01ff4c00000000f974c45c + + /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ + .octa 0x00000000f2d8657e00000000e77ac60a + + /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ + .octa 0x000000006bae74c40000000145895816 + + /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ + .octa 0x0000000152af8aa00000000038e362be + + /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ + .octa 0x0000000004663802000000007f991a64 + + /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ + .octa 0x00000001ab2f5afc00000000fa366d3a + + /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ + .octa 0x0000000074a4ebd400000001a2bb34f0 + + /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ + .octa 0x00000001d7ab3a4c0000000028a9981e + + /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ + .octa 0x00000001a8da60c600000001dbc672be + + /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ + .octa 0x000000013cf6382000000000b04d77f6 + + /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ + .octa 0x00000000bec12e1e0000000124400d96 + + /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ + .octa 0x00000001c6368010000000014ca4b414 + + /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ + .octa 0x00000001e6e78758000000012fe2c938 + + /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ + .octa 0x000000008d7f2b3c00000001faed01e6 + + /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ + .octa 0x000000016b4a156e000000007e80ecfe + + /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ + .octa 0x00000001c63cfeb60000000098daee94 + + /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */ + .octa 0x000000015f902670000000010a04edea + + /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ + .octa 0x00000001cd5de11e00000001c00b4524 + + /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ + .octa 0x000000001acaec540000000170296550 + + /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ + .octa 0x000000002bd0ca780000000181afaa48 + + /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ + .octa 0x0000000032d63d5c0000000185a31ffa + + /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */ + .octa 0x000000001c6d4e4c000000002469f608 + + /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ + .octa 0x0000000106a60b92000000006980102a + + /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ + .octa 0x00000000d3855e120000000111ea9ca8 + + /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ + .octa 0x00000000e312563600000001bd1d29ce + + /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ + .octa 
0x000000009e8f7ea400000001b34b9580 + + /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ + .octa 0x00000001c82e562c000000003076054e + + /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ + .octa 0x00000000ca9f09ce000000012a608ea4 + + /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ + .octa 0x00000000c63764e600000000784d05fe + + /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ + .octa 0x0000000168d2e49e000000016ef0d82a + + /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ + .octa 0x00000000e986c1480000000075bda454 + + /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ + .octa 0x00000000cfb65894000000003dc0a1c4 + + /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ + .octa 0x0000000111cadee400000000e9a5d8be + + /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ + .octa 0x0000000171fb63ce00000001609bc4b4 + + .short_constants : + + /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include + the trailing 32 bits of zeros */ + /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod + p(x)` */ + .octa 0x7fec2963e5bf80485cf015c388e56f72 + + /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod + p(x)` */ + .octa 0x38e888d4844752a9963a18920246e2e6 + + /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod + p(x)` */ + .octa 0x42316c00730206ad419a441956993a31 + + /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod + p(x)` */ + .octa 0x543d5c543e65ddf9924752ba2b830011 + + /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod + p(x)` */ + .octa 0x78e87aaf56767c9255bd7f9518e4a304 + + /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod + p(x)` */ + .octa 0x8f68fcec1903da7f6d76739fe0553f1e + + /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod + p(x)` */ + .octa 0x3f4840246791d588c133722b1fe0b5c3 + + /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod + p(x)` */ + .octa 0x34c96751b04de25a64b67ee0e55ef1f3 + + /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` + */ + .octa 0x156c8e180b4a395b069db049b8fdb1e7 + + /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */ + .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e + + /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */ + .octa 0x041d37768cd75659817cdc5119b29a35 + + /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */ + .octa 0x3a0777818cfaa9651ce9d94b36c41f1c + + /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */ + .octa 0x0e148e8252377a554f256efcb82be955 + + /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */ + .octa 0x9c25531d19e65ddeec1631edb2dea967 + + /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */ + .octa 0x790606ff9957c0a65d27e147510ac59a + + /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */ + .octa 0x82f63b786ea2d55ca66805eb18b8ea18 + + .barrett_constants : + /* 33 bit reflected Barrett constant m - (4^32)/n */ + .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */ + /* 33 bit reflected Barrett constant n */ + .octa 0x00000000000000000000000105ec76f1 +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/data_structure.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/data_structure.cc new file mode 100644 index 0000000000..d647df5d5b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/data_structure.cc @@ -0,0 +1,18 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/data_structure.h" + +#include "util/math.h" + +namespace ROCKSDB_NAMESPACE { +namespace detail { + +int CountTrailingZeroBitsForSmallEnumSet(uint64_t v) { + return CountTrailingZeroBits(v); +} + +} // namespace detail +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/defer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/defer.h new file mode 100644 index 0000000000..f71e67ba9c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/defer.h @@ -0,0 +1,82 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Defers the execution of the provided function until the Defer +// object goes out of scope. +// +// Usage example: +// +// Status DeferTest() { +// Status s; +// Defer defer([&s]() { +// if (!s.ok()) { +// // do cleanups ... +// } +// }); +// // do something ... +// if (!s.ok()) return; +// // do some other things ... +// return s; +// } +// +// The above code ensures that cleanups will always happen on returning. +// +// Without the help of Defer, you can +// 1. every time when !s.ok(), do the cleanup; +// 2. instead of returning when !s.ok(), continue the work only when s.ok(), +// but sometimes, this might lead to nested blocks of "if (s.ok()) {...}". +// +// With the help of Defer, you can centralize the cleanup logic inside the +// lambda passed to Defer, and you can return immediately on failure when +// necessary. +class Defer final { + public: + explicit Defer(std::function&& fn) : fn_(std::move(fn)) {} + ~Defer() { fn_(); } + + // Disallow copy. + Defer(const Defer&) = delete; + Defer& operator=(const Defer&) = delete; + + private: + std::function fn_; +}; + +// An RAII utility object that saves the current value of an object so that +// it can be overwritten, and restores it to the saved value when the +// SaveAndRestore object goes out of scope. +template +class SaveAndRestore { + public: + // obj is non-null pointer to value to be saved and later restored. + explicit SaveAndRestore(T* obj) : obj_(obj), saved_(*obj) {} + // new_value is stored in *obj + SaveAndRestore(T* obj, const T& new_value) + : obj_(obj), saved_(std::move(*obj)) { + *obj = new_value; + } + SaveAndRestore(T* obj, T&& new_value) : obj_(obj), saved_(std::move(*obj)) { + *obj = std::move(new_value); + } + ~SaveAndRestore() { *obj_ = std::move(saved_); } + + // No copies + SaveAndRestore(const SaveAndRestore&) = delete; + SaveAndRestore& operator=(const SaveAndRestore&) = delete; + + private: + T* const obj_; + T saved_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/distributed_mutex.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/distributed_mutex.h new file mode 100644 index 0000000000..1734269cc9 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/distributed_mutex.h @@ -0,0 +1,50 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
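// ---------------------------------------------------------------------------
// Editorial aside, not part of the vendored sources: a minimal, runnable
// sketch of the SaveAndRestore guard defined in defer.h above. The class
// body mirrors the shape of the vendored one (one of its constructors);
// the name SaveAndRestoreSketch and main() are illustrative only.
#include <cassert>
#include <utility>

template <typename T>
class SaveAndRestoreSketch {
 public:
  // Save *obj and install new_value; the old value comes back at scope exit.
  SaveAndRestoreSketch(T* obj, const T& new_value)
      : obj_(obj), saved_(std::move(*obj)) {
    *obj = new_value;
  }
  ~SaveAndRestoreSketch() { *obj_ = std::move(saved_); }
  SaveAndRestoreSketch(const SaveAndRestoreSketch&) = delete;
  SaveAndRestoreSketch& operator=(const SaveAndRestoreSketch&) = delete;

 private:
  T* const obj_;
  T saved_;
};

int main() {
  int verbosity = 1;
  {
    SaveAndRestoreSketch<int> guard(&verbosity, 5);  // temporarily raise
    assert(verbosity == 5);
  }
  assert(verbosity == 1);  // restored when the guard left scope
  return 0;
}
// ---------------------------------------------------------------------------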
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +// This file declares a wrapper around the efficient folly DistributedMutex +// that falls back on a standard mutex when not available. See +// https://github.com/facebook/folly/blob/main/folly/synchronization/DistributedMutex.h +// for benefits and limitations. + +// At the moment, only scoped locking is supported using DMutexLock +// RAII wrapper, because lock/unlock APIs will vary. + +#ifdef USE_FOLLY + +#include + +namespace ROCKSDB_NAMESPACE { + +class DMutex : public folly::DistributedMutex { + public: + static const char* kName() { return "folly::DistributedMutex"; } + + explicit DMutex(bool IGNORED_adaptive = false) { (void)IGNORED_adaptive; } + + // currently no-op + void AssertHeld() {} +}; +using DMutexLock = std::lock_guard; + +} // namespace ROCKSDB_NAMESPACE + +#else + +#include + +#include "port/port.h" + +namespace ROCKSDB_NAMESPACE { + +using DMutex = port::Mutex; +using DMutexLock = std::lock_guard; + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/duplicate_detector.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/duplicate_detector.h new file mode 100644 index 0000000000..aa42e950e4 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/duplicate_detector.h @@ -0,0 +1,69 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "util/set_comparator.h" + +namespace ROCKSDB_NAMESPACE { +// During recovery if the memtable is flushed we cannot rely on its help on +// duplicate key detection and as key insert will not be attempted. This class +// will be used as a emulator of memtable to tell if insertion of a key/seq +// would have resulted in duplication. +class DuplicateDetector { + public: + explicit DuplicateDetector(DBImpl* db) : db_(db) {} + bool IsDuplicateKeySeq(uint32_t cf, const Slice& key, SequenceNumber seq) { + assert(seq >= batch_seq_); + if (batch_seq_ != seq) { // it is a new batch + keys_.clear(); + } + batch_seq_ = seq; + CFKeys& cf_keys = keys_[cf]; + if (cf_keys.size() == 0) { // just inserted + InitWithComp(cf); + } + auto it = cf_keys.insert(key); + if (it.second == false) { // second is false if a element already existed. + keys_.clear(); + InitWithComp(cf); + keys_[cf].insert(key); + return true; + } + return false; + } + + private: + SequenceNumber batch_seq_ = 0; + DBImpl* db_; + using CFKeys = std::set; + std::map keys_; + void InitWithComp(const uint32_t cf) { + auto h = db_->GetColumnFamilyHandle(cf); + if (!h) { + // TODO(myabandeh): This is not a concern in MyRocks as drop cf is not + // implemented yet. When it does, we should return proper error instead + // of throwing exception. + ROCKS_LOG_FATAL( + db_->immutable_db_options().info_log, + "Recovering an entry from the dropped column family %" PRIu32 + ". WAL must must have been emptied before dropping the column " + "family", + cf); + throw std::runtime_error( + "Recovering an entry from a dropped column family. 
" + "WAL must must have been flushed before dropping the column " + "family"); + return; + } + auto cmp = h->GetComparator(); + keys_[cf] = CFKeys(SetComparator(cmp)); + } +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/dynamic_bloom.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/dynamic_bloom.cc new file mode 100644 index 0000000000..0ff3b4a758 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/dynamic_bloom.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "dynamic_bloom.h" + +#include + +#include "memory/allocator.h" +#include "port/port.h" +#include "rocksdb/slice.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +uint32_t roundUpToPow2(uint32_t x) { + uint32_t rv = 1; + while (rv < x) { + rv <<= 1; + } + return rv; +} +} // namespace + +DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits, + uint32_t num_probes, size_t huge_page_tlb_size, + Logger* logger) + // Round down, except round up with 1 + : kNumDoubleProbes((num_probes + (num_probes == 1)) / 2) { + assert(num_probes % 2 == 0); // limitation of current implementation + assert(num_probes <= 10); // limitation of current implementation + assert(kNumDoubleProbes > 0); + + // Determine how much to round off + align by so that x ^ i (that's xor) is + // a valid u64 index if x is a valid u64 index and 0 <= i < kNumDoubleProbes. + uint32_t block_bytes = /*bytes/u64*/ 8 * + /*u64s*/ std::max(1U, roundUpToPow2(kNumDoubleProbes)); + uint32_t block_bits = block_bytes * 8; + uint32_t blocks = (total_bits + block_bits - 1) / block_bits; + uint32_t sz = blocks * block_bytes; + kLen = sz / /*bytes/u64*/ 8; + assert(kLen > 0); +#ifndef NDEBUG + for (uint32_t i = 0; i < kNumDoubleProbes; ++i) { + // Ensure probes starting at last word are in range + assert(((kLen - 1) ^ i) < kLen); + } +#endif + + // Padding to correct for allocation not originally aligned on block_bytes + // boundary + sz += block_bytes - 1; + assert(allocator); + + char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger); + memset(raw, 0, sz); + auto block_offset = reinterpret_cast(raw) % block_bytes; + if (block_offset > 0) { + // Align on block_bytes boundary + raw += block_bytes - block_offset; + } + static_assert(sizeof(std::atomic) == sizeof(uint64_t), + "Expecting zero-space-overhead atomic"); + data_ = reinterpret_cast*>(raw); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/dynamic_bloom.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/dynamic_bloom.h new file mode 100644 index 0000000000..40cd294044 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/dynamic_bloom.h @@ -0,0 +1,214 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include +#include +#include + +#include "port/port.h" +#include "rocksdb/slice.h" +#include "table/multiget_context.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class Allocator; +class Logger; + +// A Bloom filter intended only to be used in memory, never serialized in a way +// that could lead to schema incompatibility. Supports opt-in lock-free +// concurrent access. +// +// This implementation is also intended for applications generally preferring +// speed vs. maximum accuracy: roughly 0.9x BF op latency for 1.1x FP rate. +// For 1% FP rate, that means that the latency of a look-up triggered by an FP +// should be less than roughly 100x the cost of a Bloom filter op. +// +// For simplicity and performance, the current implementation requires +// num_probes to be a multiple of two and <= 10. +// +class DynamicBloom { + public: + // allocator: pass allocator to bloom filter, hence trace the usage of memory + // total_bits: fixed total bits for the bloom + // num_probes: number of hash probes for a single key + // hash_func: customized hash function + // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB + // within this page size. Need to reserve huge pages for + // it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + explicit DynamicBloom(Allocator* allocator, uint32_t total_bits, + uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, + Logger* logger = nullptr); + + ~DynamicBloom() {} + + // Assuming single threaded access to this function. + void Add(const Slice& key); + + // Like Add, but may be called concurrent with other functions. + void AddConcurrently(const Slice& key); + + // Assuming single threaded access to this function. + void AddHash(uint32_t hash); + + // Like AddHash, but may be called concurrent with other functions. + void AddHashConcurrently(uint32_t hash); + + // Multithreaded access to this function is OK + bool MayContain(const Slice& key) const; + + void MayContain(int num_keys, Slice* keys, bool* may_match) const; + + // Multithreaded access to this function is OK + bool MayContainHash(uint32_t hash) const; + + void Prefetch(uint32_t h); + + private: + // Length of the structure, in 64-bit words. For this structure, "word" + // will always refer to 64-bit words. + uint32_t kLen; + // We make the k probes in pairs, two for each 64-bit read/write. Thus, + // this stores k/2, the number of words to double-probe. + const uint32_t kNumDoubleProbes; + + std::atomic* data_; + + // or_func(ptr, mask) should effect *ptr |= mask with the appropriate + // concurrency safety, working with bytes. 
+ template + void AddHash(uint32_t hash, const OrFunc& or_func); + + bool DoubleProbe(uint32_t h32, size_t a) const; +}; + +inline void DynamicBloom::Add(const Slice& key) { AddHash(BloomHash(key)); } + +inline void DynamicBloom::AddConcurrently(const Slice& key) { + AddHashConcurrently(BloomHash(key)); +} + +inline void DynamicBloom::AddHash(uint32_t hash) { + AddHash(hash, [](std::atomic* ptr, uint64_t mask) { + ptr->store(ptr->load(std::memory_order_relaxed) | mask, + std::memory_order_relaxed); + }); +} + +inline void DynamicBloom::AddHashConcurrently(uint32_t hash) { + AddHash(hash, [](std::atomic* ptr, uint64_t mask) { + // Happens-before between AddHash and MaybeContains is handled by + // access to versions_->LastSequence(), so all we have to do here is + // avoid races (so we don't give the compiler a license to mess up + // our code) and not lose bits. std::memory_order_relaxed is enough + // for that. + if ((mask & ptr->load(std::memory_order_relaxed)) != mask) { + ptr->fetch_or(mask, std::memory_order_relaxed); + } + }); +} + +inline bool DynamicBloom::MayContain(const Slice& key) const { + return (MayContainHash(BloomHash(key))); +} + +inline void DynamicBloom::MayContain(int num_keys, Slice* keys, + bool* may_match) const { + std::array hashes; + std::array byte_offsets; + for (int i = 0; i < num_keys; ++i) { + hashes[i] = BloomHash(keys[i]); + size_t a = FastRange32(kLen, hashes[i]); + PREFETCH(data_ + a, 0, 3); + byte_offsets[i] = a; + } + + for (int i = 0; i < num_keys; i++) { + may_match[i] = DoubleProbe(hashes[i], byte_offsets[i]); + } +} + +#if defined(_MSC_VER) +#pragma warning(push) +// local variable is initialized but not referenced +#pragma warning(disable : 4189) +#endif +inline void DynamicBloom::Prefetch(uint32_t h32) { + size_t a = FastRange32(kLen, h32); + PREFETCH(data_ + a, 0, 3); +} +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +// Speed hacks in this implementation: +// * Uses fastrange instead of % +// * Minimum logic to determine first (and all) probed memory addresses. +// (Uses constant bit-xor offsets from the starting probe address.) +// * (Major) Two probes per 64-bit memory fetch/write. +// Code simplification / optimization: only allow even number of probes. +// * Very fast and effective (murmur-like) hash expansion/re-mixing. (At +// least on recent CPUs, integer multiplication is very cheap. Each 64-bit +// remix provides five pairs of bit addresses within a uint64_t.) +// Code simplification / optimization: only allow up to 10 probes, from a +// single 64-bit remix. +// +// The FP rate penalty for this implementation, vs. standard Bloom filter, is +// roughly 1.12x on top of the 1.15x penalty for a 512-bit cache-local Bloom. +// This implementation does not explicitly use the cache line size, but is +// effectively cache-local (up to 16 probes) because of the bit-xor offsetting. +// +// NB: could easily be upgraded to support a 64-bit hash and +// total_bits > 2^32 (512MB). (The latter is a bad idea without the former, +// because of false positives.) 
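// ---------------------------------------------------------------------------
// Editorial aside, not part of the vendored sources: the probe-address
// scheme described above, in isolation. One 64-bit golden-ratio multiply
// remixes the 32-bit hash, and each 12-bit rotation step exposes a fresh
// pair of bit positions within a 64-bit word -- five pairs, i.e. up to
// ten probes, exactly as the vendored AddHash/DoubleProbe loops consume.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t h32 = 0xdeadbeef;                 // arbitrary example hash
  uint64_t h = 0x9e3779b97f4a7c13ULL * h32;  // expand/remix, as in AddHash
  for (unsigned i = 0; i < 5; ++i) {         // 5 pairs = 10 probes max
    unsigned bit1 = h & 63;                  // first probe bit in the word
    unsigned bit2 = (h >> 6) & 63;           // second probe bit
    std::printf("pair %u: bits %u and %u\n", i, bit1, bit2);
    h = (h >> 12) | (h << 52);               // rotate to expose the next pair
  }
  return 0;
}
// ---------------------------------------------------------------------------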
+ +inline bool DynamicBloom::MayContainHash(uint32_t h32) const { + size_t a = FastRange32(kLen, h32); + PREFETCH(data_ + a, 0, 3); + return DoubleProbe(h32, a); +} + +inline bool DynamicBloom::DoubleProbe(uint32_t h32, size_t byte_offset) const { + // Expand/remix with 64-bit golden ratio + uint64_t h = 0x9e3779b97f4a7c13ULL * h32; + for (unsigned i = 0;; ++i) { + // Two bit probes per uint64_t probe + uint64_t mask = + ((uint64_t)1 << (h & 63)) | ((uint64_t)1 << ((h >> 6) & 63)); + uint64_t val = data_[byte_offset ^ i].load(std::memory_order_relaxed); + if (i + 1 >= kNumDoubleProbes) { + return (val & mask) == mask; + } else if ((val & mask) != mask) { + return false; + } + h = (h >> 12) | (h << 52); + } +} + +template +inline void DynamicBloom::AddHash(uint32_t h32, const OrFunc& or_func) { + size_t a = FastRange32(kLen, h32); + PREFETCH(data_ + a, 0, 3); + // Expand/remix with 64-bit golden ratio + uint64_t h = 0x9e3779b97f4a7c13ULL * h32; + for (unsigned i = 0;; ++i) { + // Two bit probes per uint64_t probe + uint64_t mask = + ((uint64_t)1 << (h & 63)) | ((uint64_t)1 << ((h >> 6) & 63)); + or_func(&data_[a ^ i], mask); + if (i + 1 >= kNumDoubleProbes) { + return; + } + h = (h >> 12) | (h << 52); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/fastrange.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/fastrange.h new file mode 100644 index 0000000000..a70a980f67 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/fastrange.h @@ -0,0 +1,114 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// fastrange/FastRange: A faster alternative to % for mapping a hash value +// to an arbitrary range. See https://github.com/lemire/fastrange +// +// Generally recommended are FastRange32 for mapping results of 32-bit +// hash functions and FastRange64 for mapping results of 64-bit hash +// functions. FastRange is less forgiving than % if the input hashes are +// not well distributed over the full range of the type (32 or 64 bits). +// +// Also included is a templated implementation FastRangeGeneric for use +// in generic algorithms, but not otherwise recommended because of +// potential ambiguity. Unlike with %, it is critical to use the right +// FastRange variant for the output size of your hash function. 
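// ---------------------------------------------------------------------------
// Editorial aside, not part of the vendored sources: the 32-bit fastrange
// mapping described above, shown in isolation. Widen to 64 bits, multiply,
// and keep the high 32 bits; the result is proportional to the hash and is
// always in [0, range). FastRange32Sketch is an illustrative name.
#include <cassert>
#include <cstdint>

static inline uint32_t FastRange32Sketch(uint32_t hash, uint32_t range) {
  return static_cast<uint32_t>((uint64_t{hash} * uint64_t{range}) >> 32);
}

int main() {
  // hash * range < 2^32 * range, so the high word is strictly below range.
  for (uint32_t h : {0u, 1u, 0x7fffffffu, 0xffffffffu}) {
    assert(FastRange32Sketch(h, 100) < 100);
  }
  return 0;
}
// ---------------------------------------------------------------------------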
+ +#pragma once + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +#ifdef TEST_UINT128_COMPAT +#undef HAVE_UINT128_EXTENSION +#endif + +namespace ROCKSDB_NAMESPACE { + +namespace detail { + +// Using a class template to support partial specialization +template +struct FastRangeGenericImpl { + // only reach this on no supported specialization +}; + +template +struct FastRangeGenericImpl { + static inline Range Fn(uint32_t hash, Range range) { + static_assert(std::is_unsigned::value, "must be unsigned"); + static_assert(sizeof(Range) <= sizeof(uint32_t), + "cannot be larger than hash (32 bits)"); + + uint64_t product = uint64_t{range} * hash; + return static_cast(product >> 32); + } +}; + +template +struct FastRangeGenericImpl { + static inline Range Fn(uint64_t hash, Range range) { + static_assert(std::is_unsigned::value, "must be unsigned"); + static_assert(sizeof(Range) <= sizeof(uint64_t), + "cannot be larger than hash (64 bits)"); + +#ifdef HAVE_UINT128_EXTENSION + // Can use compiler's 128-bit type. Trust it to do the right thing. + __uint128_t wide = __uint128_t{range} * hash; + return static_cast(wide >> 64); +#else + // Fall back: full decomposition. + // NOTE: GCC seems to fully understand this code as 64-bit x 64-bit + // -> 128-bit multiplication and optimize it appropriately + uint64_t range64 = range; // ok to shift by 32, even if Range is 32-bit + uint64_t tmp = uint64_t{range64 & 0xffffFFFF} * uint64_t{hash & 0xffffFFFF}; + tmp >>= 32; + tmp += uint64_t{range64 & 0xffffFFFF} * uint64_t{hash >> 32}; + // Avoid overflow: first add lower 32 of tmp2, and later upper 32 + uint64_t tmp2 = uint64_t{range64 >> 32} * uint64_t{hash & 0xffffFFFF}; + tmp += static_cast(tmp2); + tmp >>= 32; + tmp += (tmp2 >> 32); + tmp += uint64_t{range64 >> 32} * uint64_t{hash >> 32}; + return static_cast(tmp); +#endif + } +}; + +} // namespace detail + +// Now an omnibus templated function (yay parameter inference). +// +// NOTICE: +// This templated version is not recommended for typical use because +// of the potential to mix a 64-bit FastRange with a 32-bit bit hash, +// most likely because you put your 32-bit hash in an "unsigned long" +// which is 64 bits on some platforms. That doesn't really matter for +// an operation like %, but 64-bit FastRange gives extremely bad results, +// mostly zero, on 32-bit hash values. And because good hashing is not +// generally required for correctness, this kind of mistake could go +// unnoticed with just unit tests. Plus it could vary by platform. +template +inline Range FastRangeGeneric(Hash hash, Range range) { + return detail::FastRangeGenericImpl::Fn(hash, range); +} + +// The most popular / convenient / recommended variants: + +// Map a quality 64-bit hash value down to an arbitrary size_t range. +// (size_t is standard for mapping to things in memory.) +inline size_t FastRange64(uint64_t hash, size_t range) { + return FastRangeGeneric(hash, range); +} + +// Map a quality 32-bit hash value down to an arbitrary uint32_t range. +inline uint32_t FastRange32(uint32_t hash, uint32_t range) { + return FastRangeGeneric(hash, range); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/file_checksum_helper.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/file_checksum_helper.cc new file mode 100644 index 0000000000..b8c4099b80 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/file_checksum_helper.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/file_checksum_helper.h" + +#include + +#include "db/log_reader.h" +#include "db/version_edit.h" +#include "db/version_edit_handler.h" +#include "file/sequence_file_reader.h" +#include "rocksdb/utilities/customizable_util.h" + +namespace ROCKSDB_NAMESPACE { + +void FileChecksumListImpl::reset() { checksum_map_.clear(); } + +size_t FileChecksumListImpl::size() const { return checksum_map_.size(); } + +Status FileChecksumListImpl::GetAllFileChecksums( + std::vector* file_numbers, std::vector* checksums, + std::vector* checksum_func_names) { + if (file_numbers == nullptr || checksums == nullptr || + checksum_func_names == nullptr) { + return Status::InvalidArgument("Pointer has not been initiated"); + } + + for (auto i : checksum_map_) { + file_numbers->push_back(i.first); + checksums->push_back(i.second.first); + checksum_func_names->push_back(i.second.second); + } + return Status::OK(); +} + +Status FileChecksumListImpl::SearchOneFileChecksum( + uint64_t file_number, std::string* checksum, + std::string* checksum_func_name) { + if (checksum == nullptr || checksum_func_name == nullptr) { + return Status::InvalidArgument("Pointer has not been initiated"); + } + + auto it = checksum_map_.find(file_number); + if (it == checksum_map_.end()) { + return Status::NotFound(); + } else { + *checksum = it->second.first; + *checksum_func_name = it->second.second; + } + return Status::OK(); +} + +Status FileChecksumListImpl::InsertOneFileChecksum( + uint64_t file_number, const std::string& checksum, + const std::string& checksum_func_name) { + auto it = checksum_map_.find(file_number); + if (it == checksum_map_.end()) { + checksum_map_.insert(std::make_pair( + file_number, std::make_pair(checksum, checksum_func_name))); + } else { + it->second.first = checksum; + it->second.second = checksum_func_name; + } + return Status::OK(); +} + +Status FileChecksumListImpl::RemoveOneFileChecksum(uint64_t file_number) { + auto it = checksum_map_.find(file_number); + if (it == checksum_map_.end()) { + return Status::NotFound(); + } else { + checksum_map_.erase(it); + } + return Status::OK(); +} + +FileChecksumList* NewFileChecksumList() { + FileChecksumListImpl* checksum_list = new FileChecksumListImpl(); + return checksum_list; +} + +std::shared_ptr GetFileChecksumGenCrc32cFactory() { + static std::shared_ptr default_crc32c_gen_factory( + new FileChecksumGenCrc32cFactory()); + return default_crc32c_gen_factory; +} + +Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, + uint64_t manifest_file_size, + FileChecksumList* checksum_list) { + if (checksum_list == nullptr) { + return Status::InvalidArgument("checksum_list is nullptr"); + } + assert(checksum_list); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + checksum_list->reset(); + Status s; + + std::unique_ptr file_reader; + { + std::unique_ptr file; + const std::shared_ptr& fs = src_env->GetFileSystem(); + s = fs->NewSequentialFile(abs_path, + fs->OptimizeForManifestRead(FileOptions()), &file, + nullptr /* dbg */); + if (!s.ok()) { + return s; + } + file_reader.reset(new 
SequentialFileReader(std::move(file), abs_path)); + } + + struct LogReporter : public log::Reader::Reporter { + Status* status_ptr; + virtual void Corruption(size_t /*bytes*/, const Status& st) override { + if (status_ptr->ok()) { + *status_ptr = st; + } + } + } reporter; + reporter.status_ptr = &s; + log::Reader reader(nullptr, std::move(file_reader), &reporter, + true /* checksum */, 0 /* log_number */); + FileChecksumRetriever retriever(read_options, manifest_file_size, + *checksum_list); + retriever.Iterate(reader, &s); + assert(!retriever.status().ok() || + manifest_file_size == std::numeric_limits::max() || + reader.LastRecordEnd() == manifest_file_size); + + return retriever.status(); +} + +namespace { +static int RegisterFileChecksumGenFactories(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + FileChecksumGenCrc32cFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new FileChecksumGenCrc32cFactory()); + return guard->get(); + }); + return 1; +} +} // namespace + +Status FileChecksumGenFactory::CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result) { + static std::once_flag once; + std::call_once(once, [&]() { + RegisterFileChecksumGenFactories(*(ObjectLibrary::Default().get()), ""); + }); + if (value == FileChecksumGenCrc32cFactory::kClassName()) { + *result = GetFileChecksumGenCrc32cFactory(); + return Status::OK(); + } else { + Status s = LoadSharedObject(options, value, result); + return s; + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/file_checksum_helper.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/file_checksum_helper.h new file mode 100644 index 0000000000..52469cf9f9 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/file_checksum_helper.h @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include +#include + +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/file_checksum.h" +#include "rocksdb/status.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/math.h" + +namespace ROCKSDB_NAMESPACE { + +// This is the class to generate the file checksum based on Crc32. 
It +// will be used as the default checksum method for SST file checksum +class FileChecksumGenCrc32c : public FileChecksumGenerator { + public: + FileChecksumGenCrc32c(const FileChecksumGenContext& /*context*/) { + checksum_ = 0; + } + + void Update(const char* data, size_t n) override { + checksum_ = crc32c::Extend(checksum_, data, n); + } + + void Finalize() override { + assert(checksum_str_.empty()); + // Store as big endian raw bytes + PutFixed32(&checksum_str_, EndianSwapValue(checksum_)); + } + + std::string GetChecksum() const override { + assert(!checksum_str_.empty()); + return checksum_str_; + } + + const char* Name() const override { return "FileChecksumCrc32c"; } + + private: + uint32_t checksum_; + std::string checksum_str_; +}; + +class FileChecksumGenCrc32cFactory : public FileChecksumGenFactory { + public: + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& context) override { + if (context.requested_checksum_func_name.empty() || + context.requested_checksum_func_name == "FileChecksumCrc32c") { + return std::unique_ptr( + new FileChecksumGenCrc32c(context)); + } else { + return nullptr; + } + } + + static const char* kClassName() { return "FileChecksumGenCrc32cFactory"; } + const char* Name() const override { return kClassName(); } +}; + +// The default implementaion of FileChecksumList +class FileChecksumListImpl : public FileChecksumList { + public: + FileChecksumListImpl() {} + void reset() override; + + size_t size() const override; + + Status GetAllFileChecksums( + std::vector* file_numbers, std::vector* checksums, + std::vector* checksum_func_names) override; + + Status SearchOneFileChecksum(uint64_t file_number, std::string* checksum, + std::string* checksum_func_name) override; + + Status InsertOneFileChecksum(uint64_t file_number, + const std::string& checksum, + const std::string& checksum_func_name) override; + + Status RemoveOneFileChecksum(uint64_t file_number) override; + + private: + // Key is the file number, the first portion of the value is checksum, the + // second portion of the value is checksum function name. + std::unordered_map> + checksum_map_; +}; + +// If manifest_file_size < std::numeric_limits::max(), only use +// that length prefix of the manifest file. +Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, + uint64_t manifest_file_size, + FileChecksumList* checksum_list); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/gflags_compat.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/gflags_compat.h new file mode 100644 index 0000000000..8f4a30b0d6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/gflags_compat.h @@ -0,0 +1,29 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include + +#include + +#ifndef GFLAGS_NAMESPACE +// in case it's not defined in old versions, that's probably because it was +// still google by default. +#define GFLAGS_NAMESPACE google +#endif + +#ifndef DEFINE_uint32 +// DEFINE_uint32 / DECLARE_uint32 do not appear in older versions of gflags. +// These should be sane definitions for those versions. 
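// ---------------------------------------------------------------------------
// Editorial aside, not part of the vendored header: what the DEFINE_uint32
// shim defined just below does for gflags versions without native uint32
// support. A hypothetical invocation such as
//
//   DEFINE_uint32(max_jobs, 4, "maximum background jobs");
//
// defines an int32 flag inside namespace gflags_compat and then exposes it
// to callers under the expected name via a reinterpreted reference:
//
//   uint32_t &FLAGS_max_jobs =
//       *reinterpret_cast<uint32_t *>(&gflags_compat::FLAGS_max_jobs);
//
// so code reads and writes FLAGS_max_jobs as an unsigned value while the
// storage is the old library's int32 flag. (max_jobs is illustrative only.)
// ---------------------------------------------------------------------------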
+#include +#define DEFINE_uint32(name, val, txt) \ + namespace gflags_compat { \ + DEFINE_int32(name, val, txt); \ + } \ + uint32_t &FLAGS_##name = \ + *reinterpret_cast(&gflags_compat::FLAGS_##name); + +#define DECLARE_uint32(name) extern uint32_t &FLAGS_##name; +#endif // !DEFINE_uint32 diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash.cc new file mode 100644 index 0000000000..0f7f2edc13 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash.cc @@ -0,0 +1,201 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/hash.h" + +#include + +#include "port/lang.h" +#include "util/coding.h" +#include "util/hash128.h" +#include "util/math128.h" +#include "util/xxhash.h" +#include "util/xxph3.h" + +namespace ROCKSDB_NAMESPACE { + +uint64_t (*kGetSliceNPHash64UnseededFnPtr)(const Slice&) = &GetSliceHash64; + +uint32_t Hash(const char* data, size_t n, uint32_t seed) { + // MurmurHash1 - fast but mediocre quality + // https://github.com/aappleby/smhasher/wiki/MurmurHash1 + // + const uint32_t m = 0xc6a4a793; + const uint32_t r = 24; + const char* limit = data + n; + uint32_t h = static_cast(seed ^ (n * m)); + + // Pick up four bytes at a time + while (data + 4 <= limit) { + uint32_t w = DecodeFixed32(data); + data += 4; + h += w; + h *= m; + h ^= (h >> 16); + } + + // Pick up remaining bytes + switch (limit - data) { + // Note: The original hash implementation used data[i] << shift, which + // promotes the char to int and then performs the shift. If the char is + // negative, the shift is undefined behavior in C++. The hash algorithm is + // part of the format definition, so we cannot change it; to obtain the same + // behavior in a legal way we just cast to uint32_t, which will do + // sign-extension. To guarantee compatibility with architectures where chars + // are unsigned we first cast the char to int8_t. + case 3: + h += static_cast(static_cast(data[2])) << 16; + FALLTHROUGH_INTENDED; + case 2: + h += static_cast(static_cast(data[1])) << 8; + FALLTHROUGH_INTENDED; + case 1: + h += static_cast(static_cast(data[0])); + h *= m; + h ^= (h >> r); + break; + } + return h; +} + +// We are standardizing on a preview release of XXH3, because that's +// the best available at time of standardizing. +// +// In testing (mostly Intel Skylake), this hash function is much more +// thorough than Hash32 and is almost universally faster. Hash() only +// seems faster when passing runtime-sized keys of the same small size +// (less than about 24 bytes) thousands of times in a row; this seems +// to allow the branch predictor to work some magic. XXH3's speed is +// much less dependent on branch prediction. +// +// Hashing with a prefix extractor is potentially a common case of +// hashing objects of small, predictable size. We could consider +// bundling hash functions specialized for particular lengths with +// the prefix extractors. 
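// ---------------------------------------------------------------------------
// Editorial aside, not part of the vendored sources: the sign-extension
// cast chain from the tail of Hash() above, in isolation. Casting char ->
// int8_t -> uint32_t pins down the value the on-disk format requires, even
// on platforms where plain char is unsigned, without shifting a negative
// int (undefined behavior). 0xAB is an arbitrary example byte.
#include <cstdint>
#include <cstdio>

int main() {
  char c = static_cast<char>(0xAB);  // high bit set
  uint32_t h = static_cast<uint32_t>(static_cast<int8_t>(c));
  std::printf("0x%08x\n", h);  // prints 0xffffffab: sign-extended as required
  return 0;
}
// ---------------------------------------------------------------------------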
+uint64_t Hash64(const char* data, size_t n, uint64_t seed) { + return XXPH3_64bits_withSeed(data, n, seed); +} + +uint64_t Hash64(const char* data, size_t n) { + // Same as seed = 0 + return XXPH3_64bits(data, n); +} + +uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed) { + // TODO(ajkr): use XXH3 streaming APIs to avoid the copy/allocation. + size_t concat_len = 0; + for (int i = 0; i < data.num_parts; ++i) { + concat_len += data.parts[i].size(); + } + std::string concat_data; + concat_data.reserve(concat_len); + for (int i = 0; i < data.num_parts; ++i) { + concat_data.append(data.parts[i].data(), data.parts[i].size()); + } + assert(concat_data.size() == concat_len); + return NPHash64(concat_data.data(), concat_len, seed); +} + +Unsigned128 Hash128(const char* data, size_t n, uint64_t seed) { + auto h = XXH3_128bits_withSeed(data, n, seed); + return (Unsigned128{h.high64} << 64) | (h.low64); +} + +Unsigned128 Hash128(const char* data, size_t n) { + // Same as seed = 0 + auto h = XXH3_128bits(data, n); + return (Unsigned128{h.high64} << 64) | (h.low64); +} + +void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64) { + // Same as seed = 0 + auto h = XXH3_128bits(data, n); + *high64 = h.high64; + *low64 = h.low64; +} + +void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64, + uint64_t* low64) { + auto h = XXH3_128bits_withSeed(data, n, seed); + *high64 = h.high64; + *low64 = h.low64; +} + +namespace { + +inline uint64_t XXH3_avalanche(uint64_t h64) { + h64 ^= h64 >> 37; + h64 *= 0x165667919E3779F9U; + h64 ^= h64 >> 32; + return h64; +} + +inline uint64_t XXH3_unavalanche(uint64_t h64) { + h64 ^= h64 >> 32; + h64 *= 0x8da8ee41d6df849U; // inverse of 0x165667919E3779F9U + h64 ^= h64 >> 37; + return h64; +} + +} // namespace + +void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed, + uint64_t* out_high64, uint64_t* out_low64) { + // Adapted from XXH3_len_9to16_128b + const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed; + const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed; + Unsigned128 tmp128 = + Multiply64to128(in_low64 ^ in_high64 ^ bitflipl, 0x9E3779B185EBCA87U); + uint64_t lo = Lower64of128(tmp128); + uint64_t hi = Upper64of128(tmp128); + lo += 0x3c0000000000000U; // (len - 1) << 54 + in_high64 ^= bitfliph; + hi += in_high64 + (Lower32of64(in_high64) * uint64_t{0x85EBCA76}); + lo ^= EndianSwapValue(hi); + tmp128 = Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU); + lo = Lower64of128(tmp128); + hi = Upper64of128(tmp128) + (hi * 0xC2B2AE3D27D4EB4FU); + *out_low64 = XXH3_avalanche(lo); + *out_high64 = XXH3_avalanche(hi); +} + +void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed, + uint64_t* out_high64, uint64_t* out_low64) { + // Inverted above (also consulting XXH3_len_9to16_128b) + const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed; + const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed; + uint64_t lo = XXH3_unavalanche(in_low64); + uint64_t hi = XXH3_unavalanche(in_high64); + lo *= 0xba79078168d4baf; // inverse of 0xC2B2AE3D27D4EB4FU + hi -= Upper64of128(Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU)); + hi *= 0xba79078168d4baf; // inverse of 0xC2B2AE3D27D4EB4FU + lo ^= EndianSwapValue(hi); + lo -= 0x3c0000000000000U; + lo *= 0x887493432badb37U; // inverse of 0x9E3779B185EBCA87U + hi -= Upper64of128(Multiply64to128(lo, 0x9E3779B185EBCA87U)); + uint32_t tmp32 = Lower32of64(hi) * 0xb6c92f47; // inverse of 0x85EBCA77 + hi -= tmp32; 
+ hi = (hi & 0xFFFFFFFF00000000U) - + ((tmp32 * uint64_t{0x85EBCA76}) & 0xFFFFFFFF00000000U) + tmp32; + hi ^= bitfliph; + lo ^= hi ^ bitflipl; + *out_high64 = hi; + *out_low64 = lo; +} + +void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, + uint64_t* out_high64, uint64_t* out_low64) { + BijectiveHash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64); +} + +void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, + uint64_t* out_high64, uint64_t* out_low64) { + BijectiveUnhash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash.h new file mode 100644 index 0000000000..eafa47f346 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash.h @@ -0,0 +1,137 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Common hash functions with convenient interfaces. If hashing a +// statically-sized input in a performance-critical context, consider +// calling a specific hash implementation directly, such as +// XXH3_64bits from xxhash.h. +// +// Since this is a very common header, implementation details are kept +// out-of-line. Out-of-lining also aids in tracking the time spent in +// hashing functions. Inlining is of limited benefit for runtime-sized +// hash inputs. + +#pragma once + +#include +#include + +#include "rocksdb/slice.h" +#include "util/fastrange.h" + +namespace ROCKSDB_NAMESPACE { + +// Stable/persistent 64-bit hash. Higher quality and generally faster than +// Hash(), especially for inputs > 24 bytes. +// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent +// results from previous seed. Recommend incrementing by a large odd number. +extern uint64_t Hash64(const char* data, size_t n, uint64_t seed); + +// Specific optimization without seed (same as seed = 0) +extern uint64_t Hash64(const char* data, size_t n); + +// Non-persistent hash. Must only used for in-memory data structures. +// The hash results are thus subject to change between releases, +// architectures, build configuration, etc. (Thus, it rarely makes sense +// to specify a seed for this function, except for a "rolling" hash.) +// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent +// results from previous seed. Recommend incrementing by a large odd number. 
+inline uint64_t NPHash64(const char* data, size_t n, uint64_t seed) { +#ifdef ROCKSDB_MODIFY_NPHASH + // For testing "subject to change" + return Hash64(data, n, seed + 123456789); +#else + // Currently same as Hash64 + return Hash64(data, n, seed); +#endif +} + +// Specific optimization without seed (same as seed = 0) +inline uint64_t NPHash64(const char* data, size_t n) { +#ifdef ROCKSDB_MODIFY_NPHASH + // For testing "subject to change" + return Hash64(data, n, 123456789); +#else + // Currently same as Hash64 + return Hash64(data, n); +#endif +} + +// Convenient and equivalent version of Hash128 without depending on 128-bit +// scalars +void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64); +void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64, + uint64_t* low64); + +// Hash 128 bits to 128 bits, guaranteed not to lose data (equivalent to +// Hash2x64 on 16 bytes little endian) +void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, + uint64_t* out_high64, uint64_t* out_low64); +void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed, + uint64_t* out_high64, uint64_t* out_low64); + +// Inverse of above (mostly for testing) +void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, + uint64_t* out_high64, uint64_t* out_low64); +void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed, + uint64_t* out_high64, uint64_t* out_low64); + +// Stable/persistent 32-bit hash. Moderate quality and high speed on +// small inputs. +// TODO: consider rename to Hash32 +// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent +// results from previous seed. Recommend pseudorandom or hashed seeds. +extern uint32_t Hash(const char* data, size_t n, uint32_t seed); + +// TODO: consider rename to LegacyBloomHash32 +inline uint32_t BloomHash(const Slice& key) { + return Hash(key.data(), key.size(), 0xbc9f1d34); +} + +inline uint64_t GetSliceHash64(const Slice& key) { + return Hash64(key.data(), key.size()); +} +// Provided for convenience for use with template argument deduction, where a +// specific overload needs to be used. +extern uint64_t (*kGetSliceNPHash64UnseededFnPtr)(const Slice&); + +inline uint64_t GetSliceNPHash64(const Slice& s) { + return NPHash64(s.data(), s.size()); +} + +inline uint64_t GetSliceNPHash64(const Slice& s, uint64_t seed) { + return NPHash64(s.data(), s.size(), seed); +} + +// Similar to `GetSliceNPHash64()` with `seed`, but input comes from +// concatenation of `Slice`s in `data`. +extern uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed); + +inline size_t GetSliceRangedNPHash(const Slice& s, size_t range) { + return FastRange64(NPHash64(s.data(), s.size()), range); +} + +// TODO: consider rename to GetSliceHash32 +inline uint32_t GetSliceHash(const Slice& s) { + return Hash(s.data(), s.size(), 397); +} + +// Useful for splitting up a 64-bit hash +inline uint32_t Upper32of64(uint64_t v) { + return static_cast(v >> 32); +} +inline uint32_t Lower32of64(uint64_t v) { return static_cast(v); } + +// std::hash compatible interface. 
+// std::hash compatible interface.
+// TODO: consider rename to SliceHasher32
+struct SliceHasher {
+  uint32_t operator()(const Slice& s) const { return GetSliceHash(s); }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash128.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash128.h
new file mode 100644
index 0000000000..305caa14ae
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash128.h
@@ -0,0 +1,26 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+// 128-bit hash gets its own header so that the more popular hash.h doesn't
+// depend on math128.h
+
+#include "rocksdb/slice.h"
+#include "util/math128.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Stable/persistent 128-bit hash for non-cryptographic applications.
+Unsigned128 Hash128(const char* data, size_t n, uint64_t seed);
+
+// Specific optimization without seed (same as seed = 0)
+Unsigned128 Hash128(const char* data, size_t n);
+
+inline Unsigned128 GetSliceHash128(const Slice& key) {
+  return Hash128(key.data(), key.size());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash_containers.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash_containers.h
new file mode 100644
index 0000000000..52be3718c0
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash_containers.h
@@ -0,0 +1,51 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// This header establishes compile-time pluggable implementations of hashed
+// container structures, so that deployments have the option of minimal
+// dependencies with ok performance (e.g. std::unordered_map) or more
+// dependencies with optimized performance (e.g. folly::F14FastMap).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#ifdef USE_FOLLY
+
+#include <folly/container/F14Map.h>
+#include <folly/container/F14Set.h>
+
+namespace ROCKSDB_NAMESPACE {
+
+template <typename K, typename V>
+using UnorderedMap = folly::F14FastMap<K, V>;
+
+template <typename K, typename V, typename H>
+using UnorderedMapH = folly::F14FastMap<K, V, H>;
+
+template <typename K>
+using UnorderedSet = folly::F14FastSet<K>;
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#else
+
+#include <unordered_map>
+#include <unordered_set>
+
+namespace ROCKSDB_NAMESPACE {
+
+template <typename K, typename V>
+using UnorderedMap = std::unordered_map<K, V>;
+
+template <typename K, typename V, typename H>
+using UnorderedMapH = std::unordered_map<K, V, H>;
+
+template <typename K>
+using UnorderedSet = std::unordered_set<K>;
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
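+
+// Usage sketch (illustrative): client code names only the alias and gets
+// folly::F14FastMap or std::unordered_map depending on the USE_FOLLY build
+// flag, e.g.
+//
+//   ROCKSDB_NAMESPACE::UnorderedMap<std::string, int> counts;
+//   counts["key"] += 1;  // same code under either implementation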
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash_map.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash_map.h
new file mode 100644
index 0000000000..e3ad2584f7
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/hash_map.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <utility>
+
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This is similar to std::unordered_map, except that it tries to avoid
+// allocating or deallocating memory as much as possible. With
+// std::unordered_map, an allocation/deallocation is made for every insertion
+// or deletion because of the requirement that iterators remain valid even
+// with insertions or deletions. This means that the hash chains will be
+// implemented as linked lists.
+//
+// This implementation uses autovector as hash chains instead.
+//
+template <class K, class V, size_t size = 131>
+class HashMap {
+  std::array<autovector<std::pair<K, V>, 1>, size> table_;
+
+ public:
+  bool Contains(K key) {
+    auto& bucket = table_[key % size];
+    auto it = std::find_if(
+        bucket.begin(), bucket.end(),
+        [key](const std::pair<K, V>& p) { return p.first == key; });
+    return it != bucket.end();
+  }
+
+  void Insert(K key, const V& value) {
+    auto& bucket = table_[key % size];
+    bucket.push_back({key, value});
+  }
+
+  void Delete(K key) {
+    auto& bucket = table_[key % size];
+    auto it = std::find_if(
+        bucket.begin(), bucket.end(),
+        [key](const std::pair<K, V>& p) { return p.first == key; });
+    if (it != bucket.end()) {
+      auto last = bucket.end() - 1;
+      if (it != last) {
+        *it = *last;
+      }
+      bucket.pop_back();
+    }
+  }
+
+  V& Get(K key) {
+    auto& bucket = table_[key % size];
+    auto it = std::find_if(
+        bucket.begin(), bucket.end(),
+        [key](const std::pair<K, V>& p) { return p.first == key; });
+    return it->second;
+  }
+};
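+
+// Usage sketch (illustrative; assumes an integral key so that `key % size`
+// is well-formed, as the class itself requires). Get() on a missing key is
+// undefined, so check Contains() first:
+//
+//   HashMap<uint64_t, std::string> m;
+//   m.Insert(42, "hello");
+//   if (m.Contains(42)) {
+//     m.Get(42) += ", world";
+//   }
+//   m.Delete(42);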
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/heap.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/heap.h
new file mode 100644
index 0000000000..f221fc7327
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/heap.h
@@ -0,0 +1,174 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+
+#include "port/port.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Binary heap implementation optimized for use in multi-way merge sort.
+// Comparison to std::priority_queue:
+// - In libstdc++, std::priority_queue::pop() usually performs just over logN
+//   comparisons but never fewer.
+// - std::priority_queue does not have a replace-top operation, requiring a
+//   pop+push. If the replacement element is the new top, this requires
+//   around 2logN comparisons.
+// - This heap's pop() uses a "schoolbook" downheap which requires up to ~2logN
+//   comparisons.
+// - This heap provides a replace_top() operation which requires [1, 2logN]
+//   comparisons. When the replacement element is also the new top, this
+//   takes just 1 or 2 comparisons.
+//
+// The last property can yield an order-of-magnitude performance improvement
+// when merge-sorting real-world non-random data. If the merge operation is
+// likely to take chunks of elements from the same input stream, only 1
+// comparison per element is needed. In RocksDB-land, this happens when
+// compacting a database where keys are not randomly distributed across L0
+// files but nearby keys are likely to be in the same L0 file.
+//
+// The container uses the same counterintuitive ordering as
+// std::priority_queue: the comparison operator is expected to provide the
+// less-than relation, but top() will return the maximum.
+
+template <typename T, typename Compare = std::less<T>>
+class BinaryHeap {
+ public:
+  BinaryHeap() {}
+  explicit BinaryHeap(Compare cmp) : cmp_(std::move(cmp)) {}
+
+  void push(const T& value) {
+    data_.push_back(value);
+    upheap(data_.size() - 1);
+  }
+
+  void push(T&& value) {
+    data_.push_back(std::move(value));
+    upheap(data_.size() - 1);
+  }
+
+  const T& top() const {
+    assert(!empty());
+    return data_.front();
+  }
+
+  void replace_top(const T& value) {
+    assert(!empty());
+    data_.front() = value;
+    downheap(get_root());
+  }
+
+  void replace_top(T&& value) {
+    assert(!empty());
+    data_.front() = std::move(value);
+    downheap(get_root());
+  }
+
+  void pop() {
+    assert(!empty());
+    if (data_.size() > 1) {
+      // Avoid self-move-assign, because it could cause problems with
+      // classes which are not prepared for this and it trips up the
+      // STL debugger when activated.
+      data_.front() = std::move(data_.back());
+    }
+    data_.pop_back();
+    if (!empty()) {
+      downheap(get_root());
+    } else {
+      reset_root_cmp_cache();
+    }
+  }
+
+  void swap(BinaryHeap& other) {
+    std::swap(cmp_, other.cmp_);
+    data_.swap(other.data_);
+    std::swap(root_cmp_cache_, other.root_cmp_cache_);
+  }
+
+  void clear() {
+    data_.clear();
+    reset_root_cmp_cache();
+  }
+
+  bool empty() const { return data_.empty(); }
+
+  size_t size() const { return data_.size(); }
+
+  void reset_root_cmp_cache() {
+    root_cmp_cache_ = std::numeric_limits<size_t>::max();
+  }
+
+ private:
+  static inline size_t get_root() { return 0; }
+  static inline size_t get_parent(size_t index) { return (index - 1) / 2; }
+  static inline size_t get_left(size_t index) { return 2 * index + 1; }
+  static inline size_t get_right(size_t index) { return 2 * index + 2; }
+
+  void upheap(size_t index) {
+    T v = std::move(data_[index]);
+    while (index > get_root()) {
+      const size_t parent = get_parent(index);
+      if (!cmp_(data_[parent], v)) {
+        break;
+      }
+      data_[index] = std::move(data_[parent]);
+      index = parent;
+    }
+    data_[index] = std::move(v);
+    reset_root_cmp_cache();
+  }
+
+  void downheap(size_t index) {
+    T v = std::move(data_[index]);
+
+    size_t picked_child = std::numeric_limits<size_t>::max();
+    while (1) {
+      const size_t left_child = get_left(index);
+      if (get_left(index) >= data_.size()) {
+        break;
+      }
+      const size_t right_child = left_child + 1;
+      assert(right_child == get_right(index));
+      picked_child = left_child;
+      if (index == 0 && root_cmp_cache_ < data_.size()) {
+        picked_child = root_cmp_cache_;
+      } else if (right_child < data_.size() &&
+                 cmp_(data_[left_child], data_[right_child])) {
+        picked_child = right_child;
+      }
+      if (!cmp_(v, data_[picked_child])) {
+        break;
+      }
+      data_[index] = std::move(data_[picked_child]);
+      index = picked_child;
+    }
+
+    if (index == 0) {
+      // We did not change anything in the tree except for the value
+      // of the root node, left and right child did not change, we can
+      // cache that `picked_child` is the smallest child
+      // so next time we compare against it directly
+      root_cmp_cache_ = picked_child;
+    } else {
+      // the tree changed, reset cache
+      reset_root_cmp_cache();
+    }
+
+    data_[index] = std::move(v);
+  }
+
+  Compare cmp_;
+  autovector<T> data_;
+  // Used to reduce number of cmp_ calls in downheap()
+  size_t root_cmp_cache_ = std::numeric_limits<size_t>::max();
+};
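+
+// Usage sketch (illustrative): the replace_top() fast path in a k-way merge.
+// With Compare = std::greater<T> the heap becomes a min-heap, as in a merge
+// of sorted streams. Instead of pop() + push() when advancing the stream
+// that currently owns the top element, replace_top() often needs only 1-2
+// comparisons:
+//
+//   BinaryHeap<int, std::greater<int>> min_heap;
+//   min_heap.push(3);
+//   min_heap.push(1);
+//   min_heap.push(2);
+//   int smallest = min_heap.top();        // 1
+//   min_heap.replace_top(smallest + 10);  // advance the stream that won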
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/kv_map.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/kv_map.h
new file mode 100644
index 0000000000..62be6d18e3
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/kv_map.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace stl_wrappers {
+
+struct LessOfComparator {
+  explicit LessOfComparator(const Comparator* c = BytewiseComparator())
+      : cmp(c) {}
+
+  bool operator()(const std::string& a, const std::string& b) const {
+    return cmp->Compare(Slice(a), Slice(b)) < 0;
+  }
+  bool operator()(const Slice& a, const Slice& b) const {
+    return cmp->Compare(a, b) < 0;
+  }
+
+  const Comparator* cmp;
+};
+
+using KVMap = std::map<std::string, std::string, LessOfComparator>;
+}  // namespace stl_wrappers
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/math.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/math.h
new file mode 100644
index 0000000000..39f3083287
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/math.h
@@ -0,0 +1,299 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <assert.h>
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#include <cstdint>
+#include <type_traits>
+
+#include "port/lang.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+ASSERT_FEATURE_COMPAT_HEADER();
+
+namespace ROCKSDB_NAMESPACE {
+
+// Fast implementation of floor(log2(v)). Undefined for 0 or negative
+// numbers (in case of signed type).
+template <typename T>
+inline int FloorLog2(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+  assert(v > 0);
+#ifdef _MSC_VER
+  static_assert(sizeof(T) <= sizeof(uint64_t), "type too big");
+  unsigned long idx = 0;
+  if (sizeof(T) <= sizeof(uint32_t)) {
+    _BitScanReverse(&idx, static_cast<uint32_t>(v));
+  } else {
+#if defined(_M_X64) || defined(_M_ARM64)
+    _BitScanReverse64(&idx, static_cast<uint64_t>(v));
+#else
+    const auto vh = static_cast<uint32_t>(static_cast<uint64_t>(v) >> 32);
+    if (vh != 0) {
+      _BitScanReverse(&idx, static_cast<unsigned long>(vh));
+      idx += 32;
+    } else {
+      _BitScanReverse(&idx, static_cast<uint32_t>(v));
+    }
+#endif
+  }
+  return idx;
+#else
+  static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big");
+  if (sizeof(T) <= sizeof(unsigned int)) {
+    int lz = __builtin_clz(static_cast<unsigned int>(v));
+    return int{sizeof(unsigned int)} * 8 - 1 - lz;
+  } else if (sizeof(T) <= sizeof(unsigned long)) {
+    int lz = __builtin_clzl(static_cast<unsigned long>(v));
+    return int{sizeof(unsigned long)} * 8 - 1 - lz;
+  } else {
+    int lz = __builtin_clzll(static_cast<unsigned long long>(v));
+    return int{sizeof(unsigned long long)} * 8 - 1 - lz;
+  }
+#endif
+}
+
+// Constexpr version of FloorLog2
+template <typename T>
+constexpr int ConstexprFloorLog2(T v) {
+  int rv = 0;
+  while (v > T{1}) {
+    ++rv;
+    v >>= 1;
+  }
+  return rv;
+}
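+
+// Sketch of the intended behavior (illustrative): ConstexprFloorLog2 can be
+// checked at compile time, and FloorLog2 computes the same values at runtime.
+//
+//   static_assert(ConstexprFloorLog2(1u) == 0, "");
+//   static_assert(ConstexprFloorLog2(8u) == 3, "");
+//   static_assert(ConstexprFloorLog2(1000u) == 9, "");  // 512 <= 1000 < 1024
+//   // e.g. FloorLog2(uint64_t{1} << 40) == 40 at runtime.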
+
+// Number of low-order zero bits before the first 1 bit. Undefined for 0.
+template <typename T>
+inline int CountTrailingZeroBits(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+  assert(v != 0);
+#ifdef _MSC_VER
+  static_assert(sizeof(T) <= sizeof(uint64_t), "type too big");
+  unsigned long tz = 0;
+  if (sizeof(T) <= sizeof(uint32_t)) {
+    _BitScanForward(&tz, static_cast<uint32_t>(v));
+  } else {
+#if defined(_M_X64) || defined(_M_ARM64)
+    _BitScanForward64(&tz, static_cast<uint64_t>(v));
+#else
+    // Scan the low half if it has any set bit; otherwise scan the high half
+    // and offset the result.
+    if (static_cast<uint32_t>(v) != 0) {
+      _BitScanForward(&tz, static_cast<uint32_t>(v));
+    } else {
+      _BitScanForward(&tz,
+                      static_cast<uint32_t>(static_cast<uint64_t>(v) >> 32));
+      tz += 32;
+    }
+#endif
+  }
+  return static_cast<int>(tz);
+#else
+  static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big");
+  if (sizeof(T) <= sizeof(unsigned int)) {
+    return __builtin_ctz(static_cast<unsigned int>(v));
+  } else if (sizeof(T) <= sizeof(unsigned long)) {
+    return __builtin_ctzl(static_cast<unsigned long>(v));
+  } else {
+    return __builtin_ctzll(static_cast<unsigned long long>(v));
+  }
+#endif
+}
+
+// Not all MSVC compile settings will use `BitsSetToOneFallback()`. We include
+// the following code at coarse granularity for simpler macros. It's important
+// to exclude at least so our non-MSVC unit test coverage tool doesn't see it.
+#ifdef _MSC_VER
+
+namespace detail {
+
+template <typename T>
+int BitsSetToOneFallback(T v) {
+  const int kBits = static_cast<int>(sizeof(T)) * 8;
+  static_assert((kBits & (kBits - 1)) == 0, "must be power of two bits");
+  // we static_cast these bit patterns in order to truncate them to the correct
+  // size. Warning C4309 dislikes this technique, so disable it here.
+#pragma warning(disable : 4309)
+  v = static_cast<T>(v - ((v >> 1) & static_cast<T>(0x5555555555555555ull)));
+  v = static_cast<T>((v & static_cast<T>(0x3333333333333333ull)) +
+                     ((v >> 2) & static_cast<T>(0x3333333333333333ull)));
+  v = static_cast<T>((v + (v >> 4)) & static_cast<T>(0x0F0F0F0F0F0F0F0Full));
+#pragma warning(default : 4309)
+  for (int shift_bits = 8; shift_bits < kBits; shift_bits <<= 1) {
+    v += static_cast<T>(v >> shift_bits);
+  }
+  // we want the bottom "slot" that's big enough to represent a value up to
+  // (and including) kBits.
+  return static_cast<int>(v & static_cast<T>(kBits | (kBits - 1)));
+}
+
+}  // namespace detail
+
+#endif  // _MSC_VER
+
+// Number of bits set to 1. Also known as "population count".
+template <typename T>
+inline int BitsSetToOne(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+#ifdef _MSC_VER
+  static_assert(sizeof(T) <= sizeof(uint64_t), "type too big");
+  if (sizeof(T) < sizeof(uint32_t)) {
+    // This bit mask is to avoid a compiler warning on unused path
+    constexpr auto mm = 8 * sizeof(uint32_t) - 1;
+    // The bit mask is to neutralize sign extension on small signed types
+    constexpr uint32_t m = (uint32_t{1} << ((8 * sizeof(T)) & mm)) - 1;
+#if __POPCNT__
+    return static_cast<int>(__popcnt(static_cast<uint32_t>(v) & m));
+#else
+    return static_cast<int>(detail::BitsSetToOneFallback(v) & m);
+#endif  // __POPCNT__
+  } else if (sizeof(T) == sizeof(uint32_t)) {
+#if __POPCNT__
+    return static_cast<int>(__popcnt(static_cast<uint32_t>(v)));
+#else
+    return detail::BitsSetToOneFallback(static_cast<uint32_t>(v));
+#endif  // __POPCNT__
+  } else {
+#if __POPCNT__
+#ifdef _M_X64
+    return static_cast<int>(__popcnt64(static_cast<uint64_t>(v)));
+#else
+    // Sum the popcounts of the two 32-bit halves.
+    return static_cast<int>(
+        __popcnt(static_cast<uint32_t>(static_cast<uint64_t>(v) >> 32)) +
+        __popcnt(static_cast<uint32_t>(v)));
+#endif  // _M_X64
+#else
+    return detail::BitsSetToOneFallback(static_cast<uint64_t>(v));
+#endif  // __POPCNT__
+  }
+#else
+  static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big");
+  if (sizeof(T) < sizeof(unsigned int)) {
+    // This bit mask is to avoid a compiler warning on unused path
+    constexpr auto mm = 8 * sizeof(unsigned int) - 1;
+    // This bit mask is to neutralize sign extension on small signed types
+    constexpr unsigned int m = (1U << ((8 * sizeof(T)) & mm)) - 1;
+    return __builtin_popcount(static_cast<unsigned int>(v) & m);
+  } else if (sizeof(T) == sizeof(unsigned int)) {
+    return __builtin_popcount(static_cast<unsigned int>(v));
+  } else if (sizeof(T) <= sizeof(unsigned long)) {
+    return __builtin_popcountl(static_cast<unsigned long>(v));
+  } else {
+    return __builtin_popcountll(static_cast<unsigned long long>(v));
+  }
+#endif
+}
+
+template <typename T>
+inline int BitParity(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+#ifdef _MSC_VER
+  // bit parity == oddness of popcount
+  return BitsSetToOne(v) & 1;
+#else
+  static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big");
+  if (sizeof(T) <= sizeof(unsigned int)) {
+    // On any sane system, potential sign extension here won't change parity
+    return __builtin_parity(static_cast<unsigned int>(v));
+  } else if (sizeof(T) <= sizeof(unsigned long)) {
+    return __builtin_parityl(static_cast<unsigned long>(v));
+  } else {
+    return __builtin_parityll(static_cast<unsigned long long>(v));
+  }
+#endif
+}
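+
+// Sketch of the intended behavior (illustrative):
+//
+//   // 0b1011 has three 1-bits, so popcount is 3 and parity is odd (1).
+//   assert(BitsSetToOne(uint32_t{0b1011}) == 3);
+//   assert(BitParity(uint32_t{0b1011}) == 1);
+//   assert(BitParity(uint64_t{0b1001}) == 0);  // two 1-bits -> even parity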
+
+// Swaps between big and little endian. Can be used in combination with the
+// little-endian encoding/decoding functions in coding_lean.h and coding.h to
+// encode/decode big endian.
+template <typename T>
+inline T EndianSwapValue(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+
+#ifdef _MSC_VER
+  if (sizeof(T) == 2) {
+    return static_cast<T>(_byteswap_ushort(static_cast<uint16_t>(v)));
+  } else if (sizeof(T) == 4) {
+    return static_cast<T>(_byteswap_ulong(static_cast<uint32_t>(v)));
+  } else if (sizeof(T) == 8) {
+    return static_cast<T>(_byteswap_uint64(static_cast<uint64_t>(v)));
+  }
+#else
+  if (sizeof(T) == 2) {
+    return static_cast<T>(__builtin_bswap16(static_cast<uint16_t>(v)));
+  } else if (sizeof(T) == 4) {
+    return static_cast<T>(__builtin_bswap32(static_cast<uint32_t>(v)));
+  } else if (sizeof(T) == 8) {
+    return static_cast<T>(__builtin_bswap64(static_cast<uint64_t>(v)));
+  }
+#endif
+  // Recognized by clang as bswap, but not by gcc :(
+  T ret_val = 0;
+  for (std::size_t i = 0; i < sizeof(T); ++i) {
+    ret_val |= ((v >> (8 * i)) & 0xff) << (8 * (sizeof(T) - 1 - i));
+  }
+  return ret_val;
+}
+
+// Reverses the order of bits in an integral value
+template <typename T>
+inline T ReverseBits(T v) {
+  T r = EndianSwapValue(v);
+  const T kHighestByte = T{1} << ((sizeof(T) - 1) * 8);
+  const T kEveryByte = kHighestByte | (kHighestByte / 255);
+
+  r = ((r & (kEveryByte * 0x0f)) << 4) | ((r >> 4) & (kEveryByte * 0x0f));
+  r = ((r & (kEveryByte * 0x33)) << 2) | ((r >> 2) & (kEveryByte * 0x33));
+  r = ((r & (kEveryByte * 0x55)) << 1) | ((r >> 1) & (kEveryByte * 0x55));
+
+  return r;
+}
+
+// Every output bit depends on many input bits in the same and higher
+// positions, but not lower positions. Specifically, this function
+// * Output highest bit set to 1 is same as input (same FloorLog2, or
+//   equivalently, same number of leading zeros)
+// * Is its own inverse (an involution)
+// * Guarantees that b bottom bits of v and c bottom bits of
+//   DownwardInvolution(v) uniquely identify b + c bottom bits of v
+//   (which is all of v if v < 2**(b + c)).
+// ** A notable special case is that modifying c adjacent bits at
+//    some chosen position in the input is bijective with the bottom c
+//    output bits.
+// * Distributes over xor, as in DI(a ^ b) == DI(a) ^ DI(b)
+//
+// This transformation is equivalent to a matrix*vector multiplication in
+// GF(2) where the matrix is recursively defined by the pattern matrix
+// P = | 1 1 |
+//     | 0 1 |
+// and replacing 1's with P and 0's with 2x2 zero matrices to some depth,
+// e.g. depth of 6 for 64-bit T. An essential feature of this matrix
+// is that all square sub-matrices that include the top row are invertible.
template <typename T>
+inline T DownwardInvolution(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+  static_assert(sizeof(T) <= 8, "only supported up to 64 bits");
+
+  uint64_t r = static_cast<uint64_t>(v);
+  if constexpr (sizeof(T) > 4) {
+    r ^= r >> 32;
+  }
+  if constexpr (sizeof(T) > 2) {
+    r ^= (r & 0xffff0000ffff0000U) >> 16;
+  }
+  if constexpr (sizeof(T) > 1) {
+    r ^= (r & 0xff00ff00ff00ff00U) >> 8;
+  }
+  r ^= (r & 0xf0f0f0f0f0f0f0f0U) >> 4;
+  r ^= (r & 0xccccccccccccccccU) >> 2;
+  r ^= (r & 0xaaaaaaaaaaaaaaaaU) >> 1;
+  return static_cast<T>(r);
+}
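+
+// Properties sketch (illustrative), restating the two key guarantees from the
+// comment above: DownwardInvolution is self-inverse and distributes over xor.
+//
+//   uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL;
+//   assert(DownwardInvolution(DownwardInvolution(a)) == a);
+//   assert(DownwardInvolution(a ^ b) ==
+//          (DownwardInvolution(a) ^ DownwardInvolution(b)));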
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/math128.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/math128.h
new file mode 100644
index 0000000000..ae490051a7
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/math128.h
@@ -0,0 +1,316 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "util/coding_lean.h"
+#include "util/math.h"
+
+#ifdef TEST_UINT128_COMPAT
+#undef HAVE_UINT128_EXTENSION
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// Unsigned128 is a 128 bit value supporting (at least) bitwise operators,
+// shifts, and comparisons. __uint128_t is not always available.
+
+#ifdef HAVE_UINT128_EXTENSION
+using Unsigned128 = __uint128_t;
+#else
+struct Unsigned128 {
+  uint64_t lo;
+  uint64_t hi;
+
+  inline Unsigned128() {
+    static_assert(sizeof(Unsigned128) == 2 * sizeof(uint64_t),
+                  "unexpected overhead in representation");
+    lo = 0;
+    hi = 0;
+  }
+
+  inline Unsigned128(uint64_t lower) {
+    lo = lower;
+    hi = 0;
+  }
+
+  inline Unsigned128(uint64_t lower, uint64_t upper) {
+    lo = lower;
+    hi = upper;
+  }
+
+  explicit operator uint64_t() { return lo; }
+
+  explicit operator uint32_t() { return static_cast<uint32_t>(lo); }
+
+  explicit operator uint16_t() { return static_cast<uint16_t>(lo); }
+
+  explicit operator uint8_t() { return static_cast<uint8_t>(lo); }
+};
+
+inline Unsigned128 operator<<(const Unsigned128& lhs, unsigned shift) {
+  shift &= 127;
+  Unsigned128 rv;
+  if (shift >= 64) {
+    rv.lo = 0;
+    rv.hi = lhs.lo << (shift & 63);
+  } else {
+    uint64_t tmp = lhs.lo;
+    rv.lo = tmp << shift;
+    // Ensure shift==0 shifts away everything. (This avoids another
+    // conditional branch on shift == 0.)
+    tmp = tmp >> 1 >> (63 - shift);
+    rv.hi = tmp | (lhs.hi << shift);
+  }
+  return rv;
+}
+
+inline Unsigned128& operator<<=(Unsigned128& lhs, unsigned shift) {
+  lhs = lhs << shift;
+  return lhs;
+}
+
+inline Unsigned128 operator>>(const Unsigned128& lhs, unsigned shift) {
+  shift &= 127;
+  Unsigned128 rv;
+  if (shift >= 64) {
+    rv.hi = 0;
+    rv.lo = lhs.hi >> (shift & 63);
+  } else {
+    uint64_t tmp = lhs.hi;
+    rv.hi = tmp >> shift;
+    // Ensure shift==0 shifts away everything
+    tmp = tmp << 1 << (63 - shift);
+    rv.lo = tmp | (lhs.lo >> shift);
+  }
+  return rv;
+}
+
+inline Unsigned128& operator>>=(Unsigned128& lhs, unsigned shift) {
+  lhs = lhs >> shift;
+  return lhs;
+}
+
+inline Unsigned128 operator&(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return Unsigned128(lhs.lo & rhs.lo, lhs.hi & rhs.hi);
+}
+
+inline Unsigned128& operator&=(Unsigned128& lhs, const Unsigned128& rhs) {
+  lhs = lhs & rhs;
+  return lhs;
+}
+
+inline Unsigned128 operator|(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return Unsigned128(lhs.lo | rhs.lo, lhs.hi | rhs.hi);
+}
+
+inline Unsigned128& operator|=(Unsigned128& lhs, const Unsigned128& rhs) {
+  lhs = lhs | rhs;
+  return lhs;
+}
+
+inline Unsigned128 operator^(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return Unsigned128(lhs.lo ^ rhs.lo, lhs.hi ^ rhs.hi);
+}
+
+inline Unsigned128& operator^=(Unsigned128& lhs, const Unsigned128& rhs) {
+  lhs = lhs ^ rhs;
+  return lhs;
+}
+
+inline Unsigned128 operator~(const Unsigned128& v) {
+  return Unsigned128(~v.lo, ~v.hi);
+}
+
+inline bool operator==(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.lo == rhs.lo && lhs.hi == rhs.hi;
+}
+
+inline bool operator!=(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.lo != rhs.lo || lhs.hi != rhs.hi;
+}
+
+inline bool operator>(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.hi > rhs.hi || (lhs.hi == rhs.hi && lhs.lo > rhs.lo);
+}
+
+inline bool operator<(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.hi < rhs.hi || (lhs.hi == rhs.hi && lhs.lo < rhs.lo);
+}
+
+inline bool operator>=(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.hi > rhs.hi || (lhs.hi == rhs.hi && lhs.lo >= rhs.lo);
+}
+
+inline bool operator<=(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.hi < rhs.hi || (lhs.hi == rhs.hi && lhs.lo <= rhs.lo);
+}
+#endif
+
+inline uint64_t Lower64of128(Unsigned128 v) {
+#ifdef HAVE_UINT128_EXTENSION
+  return static_cast<uint64_t>(v);
+#else
+  return v.lo;
+#endif
+}
+
+inline uint64_t Upper64of128(Unsigned128 v) {
+#ifdef HAVE_UINT128_EXTENSION
+  return static_cast<uint64_t>(v >> 64);
+#else
+  return v.hi;
+#endif
+}
+
+// This generally compiles down to a single fast instruction on 64-bit.
+// This doesn't really make sense as operator* because it's not a
+// general 128x128 multiply and provides more output than 64x64 multiply.
+inline Unsigned128 Multiply64to128(uint64_t a, uint64_t b) {
+#ifdef HAVE_UINT128_EXTENSION
+  return Unsigned128{a} * Unsigned128{b};
+#else
+  // Full decomposition
+  // NOTE: GCC seems to fully understand this code as 64-bit x 64-bit
+  // -> 128-bit multiplication and optimize it appropriately.
+  uint64_t tmp = uint64_t{b & 0xffffFFFF} * uint64_t{a & 0xffffFFFF};
+  uint64_t lower = tmp & 0xffffFFFF;
+  tmp >>= 32;
+  tmp += uint64_t{b & 0xffffFFFF} * uint64_t{a >> 32};
+  // Avoid overflow: first add lower 32 of tmp2, and later upper 32
+  uint64_t tmp2 = uint64_t{b >> 32} * uint64_t{a & 0xffffFFFF};
+  tmp += tmp2 & 0xffffFFFF;
+  lower |= tmp << 32;
+  tmp >>= 32;
+  tmp += tmp2 >> 32;
+  tmp += uint64_t{b >> 32} * uint64_t{a >> 32};
+  return Unsigned128(lower, tmp);
+#endif
+}
+
+template <>
+inline int FloorLog2(Unsigned128 v) {
+  if (Upper64of128(v) == 0) {
+    return FloorLog2(Lower64of128(v));
+  } else {
+    return FloorLog2(Upper64of128(v)) + 64;
+  }
+}
+
+template <>
+inline int CountTrailingZeroBits(Unsigned128 v) {
+  if (Lower64of128(v) != 0) {
+    return CountTrailingZeroBits(Lower64of128(v));
+  } else {
+    return CountTrailingZeroBits(Upper64of128(v)) + 64;
+  }
+}
+
+template <>
+inline int BitsSetToOne(Unsigned128 v) {
+  return BitsSetToOne(Lower64of128(v)) + BitsSetToOne(Upper64of128(v));
+}
+
+template <>
+inline int BitParity(Unsigned128 v) {
+  return BitParity(Lower64of128(v) ^ Upper64of128(v));
+}
+
+template <>
+inline Unsigned128 EndianSwapValue(Unsigned128 v) {
+  return (Unsigned128{EndianSwapValue(Lower64of128(v))} << 64) |
+         EndianSwapValue(Upper64of128(v));
+}
+
+template <>
+inline Unsigned128 ReverseBits(Unsigned128 v) {
+  return (Unsigned128{ReverseBits(Lower64of128(v))} << 64) |
+         ReverseBits(Upper64of128(v));
+}
+
+template <>
+inline Unsigned128 DownwardInvolution(Unsigned128 v) {
+  return (Unsigned128{DownwardInvolution(Upper64of128(v))} << 64) |
+         DownwardInvolution(Upper64of128(v) ^ Lower64of128(v));
+}
+
+template <typename T>
+struct IsUnsignedUpTo128
+    : std::integral_constant<bool, std::is_integral<T>::value ||
+                                       std::is_same<T, Unsigned128>::value> {};
+
+inline void EncodeFixed128(char* dst, Unsigned128 value) {
+  EncodeFixed64(dst, Lower64of128(value));
+  EncodeFixed64(dst + 8, Upper64of128(value));
+}
+
+inline Unsigned128 DecodeFixed128(const char* ptr) {
+  Unsigned128 rv = DecodeFixed64(ptr + 8);
+  return (rv << 64) | DecodeFixed64(ptr);
+}
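+
+// Usage sketch (illustrative): a full 64x64 -> 128-bit multiply followed by
+// a fixed little-endian round-trip through a 16-byte buffer:
+//
+//   Unsigned128 p = Multiply64to128(0xdeadbeefULL, 0x12345678ULL);
+//   uint64_t hi = Upper64of128(p);
+//   uint64_t lo = Lower64of128(p);
+//   char buf[16];
+//   EncodeFixed128(buf, p);
+//   assert(DecodeFixed128(buf) == p);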
+
+// A version of EncodeFixed* for generic algorithms. Likely to be used
+// with Unsigned128, so lives here for now.
+template <typename T>
+inline void EncodeFixedGeneric(char* /*dst*/, T /*value*/) {
+  // Unfortunately, GCC does not appear to optimize this simple code down
+  // to a trivial load on Intel:
+  //
+  // T ret_val = 0;
+  // for (size_t i = 0; i < sizeof(T); ++i) {
+  //   ret_val |= (static_cast<T>(static_cast<unsigned char>(ptr[i]))
+  //               << (8 * i));
+  // }
+  // return ret_val;
+  //
+  // But does unroll the loop, and does optimize manually unrolled version
+  // for specific sizes down to a trivial load. I have no idea why it doesn't
+  // do both on this code.
+
+  // So instead, we rely on specializations
+  static_assert(sizeof(T) == 0, "No specialization provided for this type");
+}
+
+template <>
+inline void EncodeFixedGeneric(char* dst, uint16_t value) {
+  return EncodeFixed16(dst, value);
+}
+template <>
+inline void EncodeFixedGeneric(char* dst, uint32_t value) {
+  return EncodeFixed32(dst, value);
+}
+template <>
+inline void EncodeFixedGeneric(char* dst, uint64_t value) {
+  return EncodeFixed64(dst, value);
+}
+template <>
+inline void EncodeFixedGeneric(char* dst, Unsigned128 value) {
+  return EncodeFixed128(dst, value);
+}
+
+// A version of DecodeFixed* for generic algorithms.
+template <typename T>
+inline T DecodeFixedGeneric(const char* /*dst*/) {
+  static_assert(sizeof(T) == 0, "No specialization provided for this type");
+}
+
+template <>
+inline uint16_t DecodeFixedGeneric(const char* dst) {
+  return DecodeFixed16(dst);
+}
+template <>
+inline uint32_t DecodeFixedGeneric(const char* dst) {
+  return DecodeFixed32(dst);
+}
+template <>
+inline uint64_t DecodeFixedGeneric(const char* dst) {
+  return DecodeFixed64(dst);
+}
+template <>
+inline Unsigned128 DecodeFixedGeneric(const char* dst) {
+  return DecodeFixed128(dst);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/murmurhash.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/murmurhash.cc
new file mode 100644
index 0000000000..a69f3918ab
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/murmurhash.cc
@@ -0,0 +1,196 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+/*
+  Murmurhash from http://sites.google.com/site/murmurhash/
+
+  All code is released to the public domain. For business purposes, Murmurhash
+  is under the MIT license.
+*/
+#include "murmurhash.h"
+
+#include "port/lang.h"
+
+#if defined(__x86_64__)
+
+// -------------------------------------------------------------------
+//
+// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
+// and endian-ness issues if used across multiple platforms.
+// +// 64-bit hash for 64-bit platforms + +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +// clang-format off +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ) +{ + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= ((uint64_t)data2[6]) << 48; FALLTHROUGH_INTENDED; + case 6: h ^= ((uint64_t)data2[5]) << 40; FALLTHROUGH_INTENDED; + case 5: h ^= ((uint64_t)data2[4]) << 32; FALLTHROUGH_INTENDED; + case 4: h ^= ((uint64_t)data2[3]) << 24; FALLTHROUGH_INTENDED; + case 3: h ^= ((uint64_t)data2[2]) << 16; FALLTHROUGH_INTENDED; + case 2: h ^= ((uint64_t)data2[1]) << 8; FALLTHROUGH_INTENDED; + case 1: h ^= ((uint64_t)data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} +// clang-format on + +#elif defined(__i386__) + +// ------------------------------------------------------------------- +// +// Note - This code makes a few assumptions about how your machine behaves - +// +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +// +// And it has a few limitations - +// +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. +// clang-format off +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const unsigned int m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + unsigned int h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k = *(unsigned int *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; FALLTHROUGH_INTENDED; + case 2: h ^= data[1] << 8; FALLTHROUGH_INTENDED; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} +// clang-format on + +#else + +// ------------------------------------------------------------------- +// +// Same as MurmurHash2, but endian- and alignment-neutral. +// Half the speed though, alas. 
+// clang-format off
+unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed )
+{
+  const unsigned int m = 0x5bd1e995;
+  const int r = 24;
+
+  unsigned int h = seed ^ len;
+
+  const unsigned char * data = (const unsigned char *)key;
+
+  while(len >= 4)
+  {
+    unsigned int k;
+
+    k  = data[0];
+    k |= data[1] << 8;
+    k |= data[2] << 16;
+    k |= data[3] << 24;
+
+    k *= m;
+    k ^= k >> r;
+    k *= m;
+
+    h *= m;
+    h ^= k;
+
+    data += 4;
+    len -= 4;
+  }
+
+  switch(len)
+  {
+  case 3: h ^= data[2] << 16; FALLTHROUGH_INTENDED;
+  case 2: h ^= data[1] << 8; FALLTHROUGH_INTENDED;
+  case 1: h ^= data[0];
+          h *= m;
+  };
+
+  h ^= h >> 13;
+  h *= m;
+  h ^= h >> 15;
+
+  return h;
+}
+// clang-format on
+
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/murmurhash.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/murmurhash.h
new file mode 100644
index 0000000000..7ef4cbbec8
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/murmurhash.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+/*
+  Murmurhash from http://sites.google.com/site/murmurhash/
+
+  All code is released to the public domain. For business purposes, Murmurhash
+  is under the MIT license.
+*/
+#pragma once
+#include <stdint.h>
+
+#include "rocksdb/slice.h"
+
+#if defined(__x86_64__)
+#define MURMUR_HASH MurmurHash64A
+uint64_t MurmurHash64A(const void* key, int len, unsigned int seed);
+#define MurmurHash MurmurHash64A
+using murmur_t = uint64_t;
+
+#elif defined(__i386__)
+#define MURMUR_HASH MurmurHash2
+unsigned int MurmurHash2(const void* key, int len, unsigned int seed);
+#define MurmurHash MurmurHash2
+using murmur_t = unsigned int;
+
+#else
+#define MURMUR_HASH MurmurHashNeutral2
+unsigned int MurmurHashNeutral2(const void* key, int len, unsigned int seed);
+#define MurmurHash MurmurHashNeutral2
+using murmur_t = unsigned int;
+#endif
+
+// Allow slice to be hashable by murmur hash.
+namespace ROCKSDB_NAMESPACE {
+struct murmur_hash {
+  size_t operator()(const Slice& slice) const {
+    return MurmurHash(slice.data(), static_cast<int>(slice.size()), 0);
+  }
+};
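+
+// Usage sketch (illustrative): murmur_hash lets Slice be used as the key of
+// a standard hashed container (Slice provides operator==). Note Slice does
+// not own its bytes, so the keyed data must outlive the map:
+//
+//   #include <unordered_map>
+//   std::unordered_map<Slice, int, murmur_hash> m;
+//   m[Slice("key")] = 1;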
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/mutexlock.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/mutexlock.h
new file mode 100644
index 0000000000..a5b7581538
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/mutexlock.h
@@ -0,0 +1,181 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <assert.h>
+
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <thread>
+
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Helper class that locks a mutex on construction and unlocks the mutex when
+// the destructor of the MutexLock object is invoked.
+//
+// Typical usage:
+//
+//   void MyClass::MyMethod() {
+//     MutexLock l(&mu_);       // mu_ is an instance variable
+//     ... some complex code, possibly with multiple return paths ...
+//   }
+
+class MutexLock {
+ public:
+  explicit MutexLock(port::Mutex *mu) : mu_(mu) { this->mu_->Lock(); }
+  // No copying allowed
+  MutexLock(const MutexLock &) = delete;
+  void operator=(const MutexLock &) = delete;
+
+  ~MutexLock() { this->mu_->Unlock(); }
+
+ private:
+  port::Mutex *const mu_;
+};
+
+//
+// Acquire a ReadLock on the specified RWMutex.
+// The Lock will be automatically released when the
+// object goes out of scope.
+//
+class ReadLock {
+ public:
+  explicit ReadLock(port::RWMutex *mu) : mu_(mu) { this->mu_->ReadLock(); }
+  // No copying allowed
+  ReadLock(const ReadLock &) = delete;
+  void operator=(const ReadLock &) = delete;
+
+  ~ReadLock() { this->mu_->ReadUnlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+};
+
+//
+// Automatically unlock a locked mutex when the object is destroyed
+//
+class ReadUnlock {
+ public:
+  explicit ReadUnlock(port::RWMutex *mu) : mu_(mu) { mu->AssertHeld(); }
+  // No copying allowed
+  ReadUnlock(const ReadUnlock &) = delete;
+  ReadUnlock &operator=(const ReadUnlock &) = delete;
+
+  ~ReadUnlock() { mu_->ReadUnlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+};
+
+//
+// Acquire a WriteLock on the specified RWMutex.
+// The Lock will be automatically released when the
+// object goes out of scope.
+//
+class WriteLock {
+ public:
+  explicit WriteLock(port::RWMutex *mu) : mu_(mu) { this->mu_->WriteLock(); }
+  // No copying allowed
+  WriteLock(const WriteLock &) = delete;
+  void operator=(const WriteLock &) = delete;
+
+  ~WriteLock() { this->mu_->WriteUnlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+};
+
+//
+// SpinMutex has very low overhead for low-contention cases. Method names
+// are chosen so you can use std::unique_lock or std::lock_guard with it.
+//
+class SpinMutex {
+ public:
+  SpinMutex() : locked_(false) {}
+
+  bool try_lock() {
+    auto currently_locked = locked_.load(std::memory_order_relaxed);
+    return !currently_locked &&
+           locked_.compare_exchange_weak(currently_locked, true,
+                                         std::memory_order_acquire,
+                                         std::memory_order_relaxed);
+  }
+
+  void lock() {
+    for (size_t tries = 0;; ++tries) {
+      if (try_lock()) {
+        // success
+        break;
+      }
+      port::AsmVolatilePause();
+      if (tries > 100) {
+        std::this_thread::yield();
+      }
+    }
+  }
+
+  void unlock() { locked_.store(false, std::memory_order_release); }
+
+ private:
+  std::atomic<bool> locked_;
+};
+
+// We want to prevent false sharing
+template <class T>
+struct ALIGN_AS(CACHE_LINE_SIZE) LockData {
+  T lock_;
+};
+
+//
+// Inspired by Guava: https://github.com/google/guava/wiki/StripedExplained
+// A striped Lock. This offers the underlying lock striping similar
+// to that of ConcurrentHashMap in a reusable form, and extends it for
+// semaphores and read-write locks. Conceptually, lock striping is the
+// technique of dividing a lock into many stripes, increasing the granularity
+// of a single lock and allowing independent operations to lock different
+// stripes and proceed concurrently, instead of creating contention for a
+// single lock.
+//
+template <class T, class P>
+class Striped {
+ public:
+  Striped(size_t stripes, std::function<uint64_t(const P &)> hash)
+      : stripes_(stripes), hash_(hash) {
+    locks_ = reinterpret_cast<LockData<T> *>(
+        port::cacheline_aligned_alloc(sizeof(LockData<T>) * stripes));
+    for (size_t i = 0; i < stripes; i++) {
+      new (&locks_[i]) LockData<T>();
+    }
+  }
+
+  virtual ~Striped() {
+    if (locks_ != nullptr) {
+      assert(stripes_ > 0);
+      for (size_t i = 0; i < stripes_; i++) {
+        locks_[i].~LockData();
+      }
+      port::cacheline_aligned_free(locks_);
+    }
+  }
+
+  T *get(const P &key) {
+    uint64_t h = hash_(key);
+    size_t index = h % stripes_;
+    return &reinterpret_cast<LockData<T> *>(&locks_[index])->lock_;
+  }
+
+ private:
+  size_t stripes_;
+  LockData<T> *locks_;
+  std::function<uint64_t(const P &)> hash_;
+};
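+
+// Usage sketch (illustrative; the hash callback can be any function mapping
+// the key to uint64_t -- here NPHash64 from util/hash.h): one mutex per
+// stripe guards many keys with low contention.
+//
+//   Striped<port::Mutex, std::string> striped(
+//       16, [](const std::string &k) { return NPHash64(k.data(), k.size()); });
+//   port::Mutex *m = striped.get("some-key");
+//   m->Lock();
+//   // ... critical section for all keys hashing to this stripe ...
+//   m->Unlock();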
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ppc-opcode.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ppc-opcode.h
new file mode 100644
index 0000000000..5cc5af0e30
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ppc-opcode.h
@@ -0,0 +1,27 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2017 International Business Machines Corp.
+// All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#define __PPC_RA(a) (((a)&0x1f) << 16)
+#define __PPC_RB(b) (((b)&0x1f) << 11)
+#define __PPC_XA(a) ((((a)&0x1f) << 16) | (((a)&0x20) >> 3))
+#define __PPC_XB(b) ((((b)&0x1f) << 11) | (((b)&0x20) >> 4))
+#define __PPC_XS(s) ((((s)&0x1f) << 21) | (((s)&0x20) >> 5))
+#define __PPC_XT(s) __PPC_XS(s)
+#define VSX_XX3(t, a, b) (__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b))
+#define VSX_XX1(s, a, b) (__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b))
+
+#define PPC_INST_VPMSUMW 0x10000488
+#define PPC_INST_VPMSUMD 0x100004c8
+#define PPC_INST_MFVSRD 0x7c000066
+#define PPC_INST_MTVSRD 0x7c000166
+
+#define VPMSUMW(t, a, b) .long PPC_INST_VPMSUMW | VSX_XX3((t), a, b)
+#define VPMSUMD(t, a, b) .long PPC_INST_VPMSUMD | VSX_XX3((t), a, b)
+#define MFVRD(a, t) .long PPC_INST_MFVSRD | VSX_XX1((t) + 32, a, 0)
+#define MTVRD(t, a) .long PPC_INST_MTVSRD | VSX_XX1((t) + 32, a, 0)
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/random.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/random.cc
new file mode 100644
index 0000000000..7ac6ee19a1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/random.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "util/random.h"
+
+#include <limits.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <thread>
+#include <utility>
+
+#include "port/likely.h"
+#include "util/thread_local.h"
+
+#define STORAGE_DECL static thread_local
+
+namespace ROCKSDB_NAMESPACE {
+
+Random* Random::GetTLSInstance() {
+  STORAGE_DECL Random* tls_instance;
+  STORAGE_DECL std::aligned_storage<sizeof(Random)>::type tls_instance_bytes;
+
+  auto rv = tls_instance;
+  if (UNLIKELY(rv == nullptr)) {
+    size_t seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+    rv = new (&tls_instance_bytes) Random((uint32_t)seed);
+    tls_instance = rv;
+  }
+  return rv;
+}
+
+std::string Random::HumanReadableString(int len) {
+  std::string ret;
+  ret.resize(len);
+  for (int i = 0; i < len; ++i) {
+    ret[i] = static_cast<char>('a' + Uniform(26));
+  }
+  return ret;
+}
+
+std::string Random::RandomString(int len) {
+  std::string ret;
+  ret.resize(len);
+  for (int i = 0; i < len; i++) {
+    ret[i] = static_cast<char>(' ' + Uniform(95));  // ' ' .. '~'
+  }
+  return ret;
+}
+
+std::string Random::RandomBinaryString(int len) {
+  std::string ret;
+  ret.resize(len);
+  for (int i = 0; i < len; i++) {
+    ret[i] = static_cast<char>(Uniform(CHAR_MAX));
+  }
+  return ret;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/random.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/random.h
new file mode 100644
index 0000000000..8923bdc4f0
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/random.h
@@ -0,0 +1,190 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <algorithm>
+#include <random>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A very simple random number generator. Not especially good at
+// generating truly random bits, but good enough for our needs in this
+// package.
+class Random {
+ private:
+  enum : uint32_t {
+    M = 2147483647L  // 2^31-1
+  };
+  enum : uint64_t {
+    A = 16807  // bits 14, 8, 7, 5, 2, 1, 0
+  };
+
+  uint32_t seed_;
+
+  static uint32_t GoodSeed(uint32_t s) { return (s & M) != 0 ? (s & M) : 1; }
+
+ public:
+  // This is the largest value that can be returned from Next()
+  enum : uint32_t { kMaxNext = M };
+
+  explicit Random(uint32_t s) : seed_(GoodSeed(s)) {}
+
+  void Reset(uint32_t s) { seed_ = GoodSeed(s); }
+
+  uint32_t Next() {
+    // We are computing
+    //       seed_ = (seed_ * A) % M,    where M = 2^31-1
+    //
+    // seed_ must not be zero or M, or else all subsequent computed values
+    // will be zero or M respectively. For all other values, seed_ will end
+    // up cycling through every number in [1,M-1]
+    uint64_t product = seed_ * A;
+
+    // Compute (product % M) using the fact that ((x << 31) % M) == x.
+    seed_ = static_cast<uint32_t>((product >> 31) + (product & M));
+    // The first reduction may overflow by 1 bit, so we may need to
+    // repeat. mod == M is not possible; using > allows the faster
+    // sign-bit-based test.
+    if (seed_ > M) {
+      seed_ -= M;
+    }
+    return seed_;
+  }
+
+  uint64_t Next64() { return (uint64_t{Next()} << 32) | Next(); }
+
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint32_t Uniform(int n) { return Next() % n; }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(int n) { return Uniform(n) == 0; }
+
+  // "Optional" one-in-n, where 0 or negative always returns false
+  // (may or may not consume a random value)
+  bool OneInOpt(int n) { return n > 0 && OneIn(n); }
+
+  // Returns random bool that is true for the given percentage of
+  // calls on average. Zero or less is always false and 100 or more
+  // is always true (may or may not consume a random value)
+  bool PercentTrue(int percentage) {
+    return static_cast<int>(Uniform(100)) < percentage;
+  }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits. The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint32_t Skewed(int max_log) { return Uniform(1 << Uniform(max_log + 1)); }
+
+  // Returns a random string of length "len"
+  std::string RandomString(int len);
+
+  // Generates a random string of len bytes using human-readable characters
+  std::string HumanReadableString(int len);
+
+  // Generates random binary data
+  std::string RandomBinaryString(int len);
+
+  // Returns a Random instance for use by the current thread without
+  // additional locking
+  static Random* GetTLSInstance();
+};
+
+// A good 32-bit random number generator based on std::mt19937.
+// This exists in part to avoid compiler variance in warning about coercing
+// uint_fast32_t from mt19937 to uint32_t.
+class Random32 {
+ private:
+  std::mt19937 generator_;
+
+ public:
+  explicit Random32(uint32_t s) : generator_(s) {}
+
+  // Generates the next random number
+  uint32_t Next() { return static_cast<uint32_t>(generator_()); }
+
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint32_t Uniform(uint32_t n) {
+    return static_cast<uint32_t>(
+        std::uniform_int_distribution<std::mt19937::result_type>(
+            0, n - 1)(generator_));
+  }
+
+  // Returns an *almost* uniformly distributed value in the range [0..n-1].
+  // Much faster than Uniform().
+  // REQUIRES: n > 0
+  uint32_t Uniformish(uint32_t n) {
+    // fastrange (without the header)
+    return static_cast<uint32_t>((uint64_t(generator_()) * uint64_t(n)) >> 32);
+  }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(uint32_t n) { return Uniform(n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits. The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint32_t Skewed(int max_log) {
+    return Uniform(uint32_t{1} << Uniform(max_log + 1));
+  }
+
+  // Reset the seed of the generator to the given value
+  void Seed(uint32_t new_seed) { generator_.seed(new_seed); }
+};
+
+// A good 64-bit random number generator based on std::mt19937_64
+class Random64 {
+ private:
+  std::mt19937_64 generator_;
+
+ public:
+  explicit Random64(uint64_t s) : generator_(s) {}
+
+  // Generates the next random number
+  uint64_t Next() { return generator_(); }
+
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint64_t Uniform(uint64_t n) {
+    return std::uniform_int_distribution<uint64_t>(0, n - 1)(generator_);
+  }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(uint64_t n) { return Uniform(n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits. The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint64_t Skewed(int max_log) {
+    return Uniform(uint64_t(1) << Uniform(max_log + 1));
+  }
+};
+
+// A seeded replacement for removed std::random_shuffle
+template <class RandomIt>
+void RandomShuffle(RandomIt first, RandomIt last, uint32_t seed) {
+  std::mt19937 rng(seed);
+  std::shuffle(first, last, rng);
+}
+
+// A replacement for removed std::random_shuffle
+template <class RandomIt>
+void RandomShuffle(RandomIt first, RandomIt last) {
+  RandomShuffle(first, last, std::random_device{}());
+}
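+
+// Usage sketch (illustrative): the seeded overload gives a deterministic
+// order for reproducible tests; the unseeded one draws its seed from
+// std::random_device.
+//
+//   std::vector<int> v = {1, 2, 3, 4, 5};
+//   RandomShuffle(v.begin(), v.end(), /*seed=*/42);  // same order every run
+//   RandomShuffle(v.begin(), v.end());               // nondeterministic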
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/rate_limiter.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/rate_limiter.cc
new file mode 100644
index 0000000000..be54138d94
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/rate_limiter.cc
@@ -0,0 +1,376 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+
+#include "monitoring/statistics_impl.h"
+#include "port/port.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+#include "util/rate_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+size_t RateLimiter::RequestToken(size_t bytes, size_t alignment,
+                                 Env::IOPriority io_priority, Statistics* stats,
+                                 RateLimiter::OpType op_type) {
+  if (io_priority < Env::IO_TOTAL && IsRateLimited(op_type)) {
+    bytes = std::min(bytes, static_cast<size_t>(GetSingleBurstBytes()));
+
+    if (alignment > 0) {
+      // Here we may actually require more than burst and block
+      // as we can not write/read less than one page at a time on direct I/O
+      // thus we do not want to be strictly constrained by burst
+      bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes));
+    }
+    Request(bytes, io_priority, stats, op_type);
+  }
+  return bytes;
+}
+
+// Pending request
+struct GenericRateLimiter::Req {
+  explicit Req(int64_t _bytes, port::Mutex* _mu)
+      : request_bytes(_bytes), bytes(_bytes), cv(_mu) {}
+  int64_t request_bytes;
+  int64_t bytes;
+  port::CondVar cv;
+};
+
+GenericRateLimiter::GenericRateLimiter(
+    int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness,
+    RateLimiter::Mode mode, const std::shared_ptr<SystemClock>& clock,
+    bool auto_tuned)
+    : RateLimiter(mode),
+      refill_period_us_(refill_period_us),
+      rate_bytes_per_sec_(auto_tuned ? rate_bytes_per_sec / 2
+                                     : rate_bytes_per_sec),
+      refill_bytes_per_period_(
+          CalculateRefillBytesPerPeriodLocked(rate_bytes_per_sec_)),
+      clock_(clock),
+      stop_(false),
+      exit_cv_(&request_mutex_),
+      requests_to_wait_(0),
+      available_bytes_(0),
+      next_refill_us_(NowMicrosMonotonicLocked()),
+      fairness_(fairness > 100 ? 100 : fairness),
+      rnd_((uint32_t)time(nullptr)),
+      wait_until_refill_pending_(false),
+      auto_tuned_(auto_tuned),
+      num_drains_(0),
+      max_bytes_per_sec_(rate_bytes_per_sec),
+      tuned_time_(NowMicrosMonotonicLocked()) {
+  for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+    total_requests_[i] = 0;
+    total_bytes_through_[i] = 0;
+  }
+}
+
+GenericRateLimiter::~GenericRateLimiter() {
+  MutexLock g(&request_mutex_);
+  stop_ = true;
+  std::deque<Req*>::size_type queues_size_sum = 0;
+  for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+    queues_size_sum += queue_[i].size();
+  }
+  requests_to_wait_ = static_cast<int32_t>(queues_size_sum);
+
+  for (int i = Env::IO_TOTAL - 1; i >= Env::IO_LOW; --i) {
+    std::deque<Req*> queue = queue_[i];
+    for (auto& r : queue) {
+      r->cv.Signal();
+    }
+  }
+
+  while (requests_to_wait_ > 0) {
+    exit_cv_.Wait();
+  }
+}
+
+// This API allows user to dynamically change rate limiter's bytes per second.
+void GenericRateLimiter::SetBytesPerSecond(int64_t bytes_per_second) {
+  MutexLock g(&request_mutex_);
+  SetBytesPerSecondLocked(bytes_per_second);
+}
+
+void GenericRateLimiter::SetBytesPerSecondLocked(int64_t bytes_per_second) {
+  assert(bytes_per_second > 0);
+  rate_bytes_per_sec_.store(bytes_per_second, std::memory_order_relaxed);
+  refill_bytes_per_period_.store(
+      CalculateRefillBytesPerPeriodLocked(bytes_per_second),
+      std::memory_order_relaxed);
+}
+
+void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri,
+                                 Statistics* stats) {
+  assert(bytes <= refill_bytes_per_period_.load(std::memory_order_relaxed));
+  bytes = std::max(static_cast<int64_t>(0), bytes);
+  TEST_SYNC_POINT("GenericRateLimiter::Request");
+  TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:1",
+                           &rate_bytes_per_sec_);
+  MutexLock g(&request_mutex_);
+
+  if (auto_tuned_) {
+    static const int kRefillsPerTune = 100;
+    std::chrono::microseconds now(NowMicrosMonotonicLocked());
+    if (now - tuned_time_ >=
+        kRefillsPerTune * std::chrono::microseconds(refill_period_us_)) {
+      Status s = TuneLocked();
+      s.PermitUncheckedError();  //**TODO: What to do on error?
+    }
+  }
+
+  if (stop_) {
+    // It is now in the clean-up of ~GenericRateLimiter().
+    // Therefore any new incoming request will exit from here
+    // and not get satisfied.
+    return;
+  }
+
+  ++total_requests_[pri];
+
+  if (available_bytes_ > 0) {
+    int64_t bytes_through = std::min(available_bytes_, bytes);
+    total_bytes_through_[pri] += bytes_through;
+    available_bytes_ -= bytes_through;
+    bytes -= bytes_through;
+  }
+
+  if (bytes == 0) {
+    return;
+  }
+
+  // Request cannot be satisfied at this moment, enqueue
+  Req r(bytes, &request_mutex_);
+  queue_[pri].push_back(&r);
+  TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:PostEnqueueRequest",
+                           &request_mutex_);
+  // A thread representing a queued request coordinates with other such
+  // threads. There are two main duties.
+  //
+  // (1) Waiting for the next refill time.
+  // (2) Refilling the bytes and granting requests.
+  do {
+    int64_t time_until_refill_us = next_refill_us_ - NowMicrosMonotonicLocked();
+    if (time_until_refill_us > 0) {
+      if (wait_until_refill_pending_) {
+        // Somebody is performing (1). Trust we'll be woken up when our request
+        // is granted or we are needed for future duties.
+        r.cv.Wait();
+      } else {
+        // Whichever thread reaches here first performs duty (1) as described
+        // above.
+
+void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri,
+                                 Statistics* stats) {
+  assert(bytes <= refill_bytes_per_period_.load(std::memory_order_relaxed));
+  bytes = std::max(static_cast<int64_t>(0), bytes);
+  TEST_SYNC_POINT("GenericRateLimiter::Request");
+  TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:1",
+                           &rate_bytes_per_sec_);
+  MutexLock g(&request_mutex_);
+
+  if (auto_tuned_) {
+    static const int kRefillsPerTune = 100;
+    std::chrono::microseconds now(NowMicrosMonotonicLocked());
+    if (now - tuned_time_ >=
+        kRefillsPerTune * std::chrono::microseconds(refill_period_us_)) {
+      Status s = TuneLocked();
+      s.PermitUncheckedError();  //**TODO: What to do on error?
+    }
+  }
+
+  if (stop_) {
+    // It is now in the clean-up of ~GenericRateLimiter().
+    // Therefore any new incoming request will exit from here
+    // and not get satisfied.
+    return;
+  }
+
+  ++total_requests_[pri];
+
+  if (available_bytes_ > 0) {
+    int64_t bytes_through = std::min(available_bytes_, bytes);
+    total_bytes_through_[pri] += bytes_through;
+    available_bytes_ -= bytes_through;
+    bytes -= bytes_through;
+  }
+
+  if (bytes == 0) {
+    return;
+  }
+
+  // Request cannot be satisfied at this moment, enqueue
+  Req r(bytes, &request_mutex_);
+  queue_[pri].push_back(&r);
+  TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:PostEnqueueRequest",
+                           &request_mutex_);
+  // A thread representing a queued request coordinates with other such
+  // threads. There are two main duties.
+  //
+  // (1) Waiting for the next refill time.
+  // (2) Refilling the bytes and granting requests.
+  do {
+    int64_t time_until_refill_us = next_refill_us_ - NowMicrosMonotonicLocked();
+    if (time_until_refill_us > 0) {
+      if (wait_until_refill_pending_) {
+        // Somebody is performing (1). Trust we'll be woken up when our
+        // request is granted or we are needed for future duties.
+        r.cv.Wait();
+      } else {
+        // Whichever thread reaches here first performs duty (1) as described
+        // above.
+        int64_t wait_until = clock_->NowMicros() + time_until_refill_us;
+        RecordTick(stats, NUMBER_RATE_LIMITER_DRAINS);
+        ++num_drains_;
+        wait_until_refill_pending_ = true;
+        r.cv.TimedWait(wait_until);
+        TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:PostTimedWait",
+                                 &time_until_refill_us);
+        wait_until_refill_pending_ = false;
+      }
+    } else {
+      // Whichever thread reaches here first performs duty (2) as described
+      // above.
+      RefillBytesAndGrantRequestsLocked();
+      if (r.request_bytes == 0) {
+        // If there are any remaining requests, make sure at least one
+        // candidate is awake for future duties by signaling the front
+        // request of a queue.
+        for (int i = Env::IO_TOTAL - 1; i >= Env::IO_LOW; --i) {
+          std::deque<Req*> queue = queue_[i];
+          if (!queue.empty()) {
+            queue.front()->cv.Signal();
+            break;
+          }
+        }
+      }
+    }
+    // Invariant: a non-granted request is always in exactly one queue, and a
+    // granted request is in no queue.
+#ifndef NDEBUG
+    int num_found = 0;
+    for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+      if (std::find(queue_[i].begin(), queue_[i].end(), &r) !=
+          queue_[i].end()) {
+        ++num_found;
+      }
+    }
+    if (r.request_bytes == 0) {
+      assert(num_found == 0);
+    } else {
+      assert(num_found == 1);
+    }
+#endif  // NDEBUG
+  } while (!stop_ && r.request_bytes > 0);
+
+  if (stop_) {
+    // It is now in the clean-up of ~GenericRateLimiter().
+    // Therefore any woken-up request will have come out of the loop and then
+    // exit here. It might or might not have been satisfied.
+    --requests_to_wait_;
+    exit_cv_.Signal();
+  }
+}
+
+std::vector<Env::IOPriority>
+GenericRateLimiter::GeneratePriorityIterationOrderLocked() {
+  std::vector<Env::IOPriority> pri_iteration_order(Env::IO_TOTAL /* 4 */);
+  // We make Env::IO_USER a superior priority by always iterating its queue
+  // first
+  pri_iteration_order[0] = Env::IO_USER;
+
+  bool high_pri_iterated_after_mid_low_pri = rnd_.OneIn(fairness_);
+  TEST_SYNC_POINT_CALLBACK(
+      "GenericRateLimiter::GeneratePriorityIterationOrderLocked::"
+      "PostRandomOneInFairnessForHighPri",
+      &high_pri_iterated_after_mid_low_pri);
+  bool mid_pri_iterated_after_low_pri = rnd_.OneIn(fairness_);
+  TEST_SYNC_POINT_CALLBACK(
+      "GenericRateLimiter::GeneratePriorityIterationOrderLocked::"
+      "PostRandomOneInFairnessForMidPri",
+      &mid_pri_iterated_after_low_pri);
+
+  if (high_pri_iterated_after_mid_low_pri) {
+    pri_iteration_order[3] = Env::IO_HIGH;
+    pri_iteration_order[2] =
+        mid_pri_iterated_after_low_pri ? Env::IO_MID : Env::IO_LOW;
+    pri_iteration_order[1] =
+        (pri_iteration_order[2] == Env::IO_MID) ? Env::IO_LOW : Env::IO_MID;
+  } else {
+    pri_iteration_order[1] = Env::IO_HIGH;
+    pri_iteration_order[3] =
+        mid_pri_iterated_after_low_pri ? Env::IO_MID : Env::IO_LOW;
+    pri_iteration_order[2] =
+        (pri_iteration_order[3] == Env::IO_MID) ? Env::IO_LOW : Env::IO_MID;
+  }
+
+  TEST_SYNC_POINT_CALLBACK(
+      "GenericRateLimiter::GeneratePriorityIterationOrderLocked::"
+      "PreReturnPriIterationOrder",
+      &pri_iteration_order);
+  return pri_iteration_order;
+}
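+
+// Editor's illustration (not part of upstream): with the default
+// fairness = 10, rnd_.OneIn(10) is true with probability 1/10, so roughly
+// one refill in ten iterates the low/mid queues ahead of IO_HIGH,
+// preventing starvation while still favoring high-priority traffic.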
+
+void GenericRateLimiter::RefillBytesAndGrantRequestsLocked() {
+  TEST_SYNC_POINT_CALLBACK(
+      "GenericRateLimiter::RefillBytesAndGrantRequestsLocked", &request_mutex_);
+  next_refill_us_ = NowMicrosMonotonicLocked() + refill_period_us_;
+  // Carry over the left over quota from the last period
+  auto refill_bytes_per_period =
+      refill_bytes_per_period_.load(std::memory_order_relaxed);
+  assert(available_bytes_ == 0);
+  available_bytes_ = refill_bytes_per_period;
+
+  std::vector<Env::IOPriority> pri_iteration_order =
+      GeneratePriorityIterationOrderLocked();
+
+  for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+    assert(!pri_iteration_order.empty());
+    Env::IOPriority current_pri = pri_iteration_order[i];
+    auto* queue = &queue_[current_pri];
+    while (!queue->empty()) {
+      auto* next_req = queue->front();
+      if (available_bytes_ < next_req->request_bytes) {
+        // Grant partial request_bytes to avoid starving requests that end up
+        // asking for more bytes than available_bytes_, which can happen when
+        // a dynamically reduced bytes_per_second lowers
+        // refill_bytes_per_period and hence available_bytes_.
+        next_req->request_bytes -= available_bytes_;
+        available_bytes_ = 0;
+        break;
+      }
+      available_bytes_ -= next_req->request_bytes;
+      next_req->request_bytes = 0;
+      total_bytes_through_[current_pri] += next_req->bytes;
+      queue->pop_front();
+
+      // Quota granted, signal the thread to exit
+      next_req->cv.Signal();
+    }
+  }
+}
+
+int64_t GenericRateLimiter::CalculateRefillBytesPerPeriodLocked(
+    int64_t rate_bytes_per_sec) {
+  if (std::numeric_limits<int64_t>::max() / rate_bytes_per_sec <
+      refill_period_us_) {
+    // Avoid unexpected result in the overflow case. The result now is still
+    // inaccurate but is a number that is large enough.
+    return std::numeric_limits<int64_t>::max() / 1000000;
+  } else {
+    return rate_bytes_per_sec * refill_period_us_ / 1000000;
+  }
+}
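+
+// Editor's illustration (not part of upstream): with
+// rate_bytes_per_sec = 10 MiB/s (10485760) and refill_period_us = 100000,
+// the per-period budget is 10485760 * 100000 / 1000000 = 1048576 bytes
+// (1 MiB), which is also what GetSingleBurstBytes() reports.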
+
+Status GenericRateLimiter::TuneLocked() {
+  const int kLowWatermarkPct = 50;
+  const int kHighWatermarkPct = 90;
+  const int kAdjustFactorPct = 5;
+  // computed rate limit will be in
+  // `[max_bytes_per_sec_ / kAllowedRangeFactor, max_bytes_per_sec_]`.
+  const int kAllowedRangeFactor = 20;
+
+  std::chrono::microseconds prev_tuned_time = tuned_time_;
+  tuned_time_ = std::chrono::microseconds(NowMicrosMonotonicLocked());
+
+  int64_t elapsed_intervals = (tuned_time_ - prev_tuned_time +
+                               std::chrono::microseconds(refill_period_us_) -
+                               std::chrono::microseconds(1)) /
+                              std::chrono::microseconds(refill_period_us_);
+  // We tune every kRefillsPerTune intervals, so the overflow and division-by-
+  // zero conditions should never happen.
+  assert(num_drains_ <= std::numeric_limits<int64_t>::max() / 100);
+  assert(elapsed_intervals > 0);
+  int64_t drained_pct = num_drains_ * 100 / elapsed_intervals;
+
+  int64_t prev_bytes_per_sec = GetBytesPerSecond();
+  int64_t new_bytes_per_sec;
+  if (drained_pct == 0) {
+    new_bytes_per_sec = max_bytes_per_sec_ / kAllowedRangeFactor;
+  } else if (drained_pct < kLowWatermarkPct) {
+    // sanitize to prevent overflow
+    int64_t sanitized_prev_bytes_per_sec =
+        std::min(prev_bytes_per_sec, std::numeric_limits<int64_t>::max() / 100);
+    new_bytes_per_sec =
+        std::max(max_bytes_per_sec_ / kAllowedRangeFactor,
+                 sanitized_prev_bytes_per_sec * 100 / (100 + kAdjustFactorPct));
+  } else if (drained_pct > kHighWatermarkPct) {
+    // sanitize to prevent overflow
+    int64_t sanitized_prev_bytes_per_sec =
+        std::min(prev_bytes_per_sec, std::numeric_limits<int64_t>::max() /
+                                         (100 + kAdjustFactorPct));
+    new_bytes_per_sec =
+        std::min(max_bytes_per_sec_,
+                 sanitized_prev_bytes_per_sec * (100 + kAdjustFactorPct) / 100);
+  } else {
+    new_bytes_per_sec = prev_bytes_per_sec;
+  }
+  if (new_bytes_per_sec != prev_bytes_per_sec) {
+    SetBytesPerSecondLocked(new_bytes_per_sec);
+  }
+  num_drains_ = 0;
+  return Status::OK();
+}
+
+RateLimiter* NewGenericRateLimiter(
+    int64_t rate_bytes_per_sec, int64_t refill_period_us /* = 100 * 1000 */,
+    int32_t fairness /* = 10 */,
+    RateLimiter::Mode mode /* = RateLimiter::Mode::kWritesOnly */,
+    bool auto_tuned /* = false */) {
+  assert(rate_bytes_per_sec > 0);
+  assert(refill_period_us > 0);
+  assert(fairness > 0);
+  std::unique_ptr<RateLimiter> limiter(
+      new GenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness,
+                             mode, SystemClock::Default(), auto_tuned));
+  return limiter.release();
+}
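+
+// Editor's usage sketch (not part of upstream; the surrounding DB options
+// wiring is assumed): share one limiter across compactions and flushes:
+//   std::shared_ptr<RateLimiter> limiter(
+//       NewGenericRateLimiter(10 << 20));  // 10 MiB/s, default fairness
+//   db_options.rate_limiter = limiter;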
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/rate_limiter_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/rate_limiter_impl.h
new file mode 100644
index 0000000000..4c078f5a0e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/rate_limiter_impl.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <deque>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class GenericRateLimiter : public RateLimiter {
+ public:
+  GenericRateLimiter(int64_t refill_bytes, int64_t refill_period_us,
+                     int32_t fairness, RateLimiter::Mode mode,
+                     const std::shared_ptr<SystemClock>& clock,
+                     bool auto_tuned);
+
+  virtual ~GenericRateLimiter();
+
+  // This API allows the user to dynamically change the rate limiter's bytes
+  // per second.
+  virtual void SetBytesPerSecond(int64_t bytes_per_second) override;
+
+  // Request a token to write `bytes`. If this request can not be satisfied,
+  // the call is blocked. The caller is responsible for making sure
+  // bytes <= GetSingleBurstBytes(); negative bytes passed in will be
+  // rounded up to 0.
+  using RateLimiter::Request;
+  virtual void Request(const int64_t bytes, const Env::IOPriority pri,
+                       Statistics* stats) override;
+
+  virtual int64_t GetSingleBurstBytes() const override {
+    return refill_bytes_per_period_.load(std::memory_order_relaxed);
+  }
+
+  virtual int64_t GetTotalBytesThrough(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      int64_t total_bytes_through_sum = 0;
+      for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+        total_bytes_through_sum += total_bytes_through_[i];
+      }
+      return total_bytes_through_sum;
+    }
+    return total_bytes_through_[pri];
+  }
+
+  virtual int64_t GetTotalRequests(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      int64_t total_requests_sum = 0;
+      for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+        total_requests_sum += total_requests_[i];
+      }
+      return total_requests_sum;
+    }
+    return total_requests_[pri];
+  }
+
+  virtual Status GetTotalPendingRequests(
+      int64_t* total_pending_requests,
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    assert(total_pending_requests != nullptr);
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      int64_t total_pending_requests_sum = 0;
+      for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+        total_pending_requests_sum += static_cast<int64_t>(queue_[i].size());
+      }
+      *total_pending_requests = total_pending_requests_sum;
+    } else {
+      *total_pending_requests = static_cast<int64_t>(queue_[pri].size());
+    }
+    return Status::OK();
+  }
+
+  virtual int64_t GetBytesPerSecond() const override {
+    return rate_bytes_per_sec_.load(std::memory_order_relaxed);
+  }
+
+  virtual void TEST_SetClock(std::shared_ptr<SystemClock> clock) {
+    MutexLock g(&request_mutex_);
+    clock_ = std::move(clock);
+    next_refill_us_ = NowMicrosMonotonicLocked();
+  }
+
+ private:
+  void RefillBytesAndGrantRequestsLocked();
+  std::vector<Env::IOPriority> GeneratePriorityIterationOrderLocked();
+  int64_t CalculateRefillBytesPerPeriodLocked(int64_t rate_bytes_per_sec);
+  Status TuneLocked();
+  void SetBytesPerSecondLocked(int64_t bytes_per_second);
+
+  uint64_t NowMicrosMonotonicLocked() {
+    return clock_->NowNanos() / std::milli::den;
+  }
+
+  // This mutex guards all internal state
+  mutable port::Mutex request_mutex_;
+
+  const int64_t refill_period_us_;
+
+  std::atomic<int64_t> rate_bytes_per_sec_;
+  std::atomic<int64_t> refill_bytes_per_period_;
+  std::shared_ptr<SystemClock> clock_;
+
+  bool stop_;
+  port::CondVar exit_cv_;
+  int32_t requests_to_wait_;
+
+  int64_t total_requests_[Env::IO_TOTAL];
+  int64_t total_bytes_through_[Env::IO_TOTAL];
+  int64_t available_bytes_;
+  int64_t next_refill_us_;
+
+  int32_t fairness_;
+  Random rnd_;
+
+  struct Req;
+  std::deque<Req*> queue_[Env::IO_TOTAL];
+  bool wait_until_refill_pending_;
+
+  bool auto_tuned_;
+  int64_t num_drains_;
+  const int64_t max_bytes_per_sec_;
+  std::chrono::microseconds tuned_time_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/repeatable_thread.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/repeatable_thread.h
new file mode 100644
index 0000000000..c75ad7c49f
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/repeatable_thread.h
@@ -0,0 +1,149 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <functional>
+#include <string>
+
+#include "monitoring/instrumented_mutex.h"
+#include "port/port.h"
+#include "rocksdb/system_clock.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Simple wrapper around port::Thread that supports calling a callback every
+// X seconds. If you pass in 0, then it will call your callback repeatedly
+// without delay.
+class RepeatableThread {
+ public:
+  RepeatableThread(std::function<void()> function,
+                   const std::string& thread_name, SystemClock* clock,
+                   uint64_t delay_us, uint64_t initial_delay_us = 0)
+      : function_(function),
+        thread_name_("rocksdb:" + thread_name),
+        clock_(clock),
+        delay_us_(delay_us),
+        initial_delay_us_(initial_delay_us),
+        mutex_(clock),
+        cond_var_(&mutex_),
+        running_(true),
+#ifndef NDEBUG
+        waiting_(false),
+        run_count_(0),
+#endif
+        thread_([this] { thread(); }) {
+  }
+
+  void cancel() {
+    {
+      InstrumentedMutexLock l(&mutex_);
+      if (!running_) {
+        return;
+      }
+      running_ = false;
+      cond_var_.SignalAll();
+    }
+    thread_.join();
+  }
+
+  bool IsRunning() { return running_; }
+
+  ~RepeatableThread() { cancel(); }
+
+#ifndef NDEBUG
+  // Wait until RepeatableThread starts waiting, call the optional callback,
+  // then wait for one run of RepeatableThread. Tests can provide a custom
+  // clock object to mock time, and use the callback here to bump current
+  // time and trigger RepeatableThread. See repeatable_thread_test for an
+  // example.
+  //
+  // Note: only supports one caller of this method.
+  void TEST_WaitForRun(std::function<void()> callback = nullptr) {
+    InstrumentedMutexLock l(&mutex_);
+    while (!waiting_) {
+      cond_var_.Wait();
+    }
+    uint64_t prev_count = run_count_;
+    if (callback != nullptr) {
+      callback();
+    }
+    cond_var_.SignalAll();
+    while (!(run_count_ > prev_count)) {
+      cond_var_.Wait();
+    }
+  }
+#endif
+
+ private:
+  bool wait(uint64_t delay) {
+    InstrumentedMutexLock l(&mutex_);
+    if (running_ && delay > 0) {
+      uint64_t wait_until = clock_->NowMicros() + delay;
+#ifndef NDEBUG
+      waiting_ = true;
+      cond_var_.SignalAll();
+#endif
+      while (running_) {
+        cond_var_.TimedWait(wait_until);
+        if (clock_->NowMicros() >= wait_until) {
+          break;
+        }
+      }
+#ifndef NDEBUG
+      waiting_ = false;
+#endif
+    }
+    return running_;
+  }
+
+  void thread() {
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 12)
+    // Set thread name.
+    auto thread_handle = thread_.native_handle();
+    int ret __attribute__((__unused__)) =
+        pthread_setname_np(thread_handle, thread_name_.c_str());
+    assert(ret == 0);
+#endif
+#endif
+
+    assert(delay_us_ > 0);
+    if (!wait(initial_delay_us_)) {
+      return;
+    }
+    do {
+      function_();
+#ifndef NDEBUG
+      {
+        InstrumentedMutexLock l(&mutex_);
+        run_count_++;
+        cond_var_.SignalAll();
+      }
+#endif
+    } while (wait(delay_us_));
+  }
+
+  const std::function<void()> function_;
+  const std::string thread_name_;
+  SystemClock* clock_;
+  const uint64_t delay_us_;
+  const uint64_t initial_delay_us_;
+
+  // Mutex lock should be held when accessing running_, waiting_
+  // and run_count_.
+  InstrumentedMutex mutex_;
+  InstrumentedCondVar cond_var_;
+  bool running_;
+#ifndef NDEBUG
+  // RepeatableThread waiting for timeout.
+  bool waiting_;
+  // Number of times function_ has run.
+  uint64_t run_count_;
+#endif
+  port::Thread thread_;
+};
+
+// Editor's usage sketch (not part of upstream; PersistStats is a
+// hypothetical callback): run a task every 10 seconds until cancelled.
+//   RepeatableThread thread([] { PersistStats(); }, "pst_st",
+//                           SystemClock::Default().get(), 10 * 1000000);
+//   ...
+//   thread.cancel();  // also invoked by the destructor
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_alg.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_alg.h
new file mode 100644
index 0000000000..f9afefc237
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_alg.h
@@ -0,0 +1,1225 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <array>
+#include <memory>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "util/math128.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace ribbon {
+
+// RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly)
+//
+// ribbon_alg.h: generic versions of core algorithms.
+//
+// Ribbon is a Perfect Hash Static Function construction useful as a compact
+// static Bloom filter alternative. It combines (a) a boolean (GF(2)) linear
+// system construction that approximates a Band Matrix with hashing,
+// (b) an incremental, on-the-fly Gaussian Elimination algorithm that is
+// remarkably efficient and adaptable at constructing an upper-triangular
+// band matrix from a set of band-approximating inputs from (a), and
+// (c) a storage layout that is fast and adaptable as a filter.
+//
+// Footnotes: (a) "Efficient Gauss Elimination for Near-Quadratic Matrices
+// with One Short Random Block per Row, with Applications" by Stefan
+// Walzer and Martin Dietzfelbinger ("DW paper")
+// (b) developed by Peter C. Dillinger, though not the first on-the-fly
+// GE algorithm. See "On the fly Gaussian Elimination for LT codes" by
+// Bioglio, Grangetto, Gaeta, and Sereno.
+// (c) see "interleaved" solution storage below.
+//
+// See ribbon_impl.h for high-level behavioral summary. This file focuses
+// on the core design details.
+//
+// ######################################################################
+// ################# PHSF -> static filter reduction ####################
+//
+// A Perfect Hash Static Function is a data structure representing a
+// map from anything hashable (a "key") to values of some fixed size.
+// Crucially, it is allowed to return garbage values for anything not in
+// the original set of map keys, and it is a "static" structure: entries
+// cannot be added or deleted after construction. PHSFs representing n
+// mappings to b-bit values (assume uniformly distributed) require at least
+// n * b bits to represent, or at least b bits per entry. We typically
+// describe the compactness of a PHSF by typical bits per entry as some
+// function of b. For example, the MWHC construction (k=3 "peeling")
+// requires about 1.0222*b and a variant called Xor+ requires about
+// 1.08*b + 0.5 bits per entry.
+//
+// With more hashing, a PHSF can over-approximate a set as a Bloom filter
+// does, with no FN queries and predictable false positive (FP) query
+// rate. Instead of the user providing a value to map each input key to,
+// a hash function provides the value. Keys in the original set will
+// return a positive membership query because the underlying PHSF returns
+// the same value as hashing the key. When a key is not in the original set,
+// the PHSF returns a "garbage" value, which is only equal to the key's
+// hash with (false positive) probability 1 in 2^b.
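(The RepeatableThread usage sketch above, placed just after the class, is an editor's addition under the stated assumptions; it is not part of the vendored header's upstream text.)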
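+//
+// (Editor's illustration, not from upstream: for b = 7 result bits the FP
+// rate is 2^-7, i.e. about 0.78%; in code, double fp = std::ldexp(1.0, -b);
+// computes 2^-b.)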
+// +// For a matching false positive rate, standard Bloom filters require +// 1.44*b bits per entry. Cache-local Bloom filters (like bloom_impl.h) +// require a bit more, around 1.5*b bits per entry. Thus, a Bloom +// alternative could save up to or nearly 1/3rd of memory and storage +// that RocksDB uses for SST (static) Bloom filters. (Memtable Bloom filter +// is dynamic.) +// +// Recommended reading: +// "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters" +// by Graf and Lemire +// First three sections of "Fast Scalable Construction of (Minimal +// Perfect Hash) Functions" by Genuzio, Ottaviano, and Vigna +// +// ###################################################################### +// ################## PHSF vs. hash table vs. Bloom ##################### +// +// You can think of traditional hash tables and related filter variants +// such as Cuckoo filters as utilizing an "OR" construction: a hash +// function associates a key with some slots and the data is returned if +// the data is found in any one of those slots. The collision resolution +// is visible in the final data structure and requires extra information. +// For example, Cuckoo filter uses roughly 1.05b + 2 bits per entry, and +// Golomb-Rice code (aka "GCS") as little as b + 1.5. When the data +// structure associates each input key with data in one slot, the +// structure implicitly constructs a (near-)minimal (near-)perfect hash +// (MPH) of the keys, which requires at least 1.44 bits per key to +// represent. This is why approaches with visible collision resolution +// have a fixed + 1.5 or more in storage overhead per entry, often in +// addition to an overhead multiplier on b. +// +// By contrast Bloom filters utilize an "AND" construction: a query only +// returns true if all bit positions associated with a key are set to 1. +// There is no collision resolution, so Bloom filters do not suffer a +// fixed bits per entry overhead like the above structures. +// +// PHSFs typically use a bitwise XOR construction: the data you want is +// not in a single slot, but in a linear combination of several slots. +// For static data, this gives the best of "AND" and "OR" constructions: +// avoids the +1.44 or more fixed overhead by not approximating a MPH and +// can do much better than Bloom's 1.44 factor on b with collision +// resolution, which here is done ahead of time and invisible at query +// time. +// +// ###################################################################### +// ######################## PHSF construction ########################### +// +// For a typical PHSF, construction is solving a linear system of +// equations, typically in GF(2), which is to say that values are boolean +// and XOR serves both as addition and subtraction. We can use matrices to +// represent the problem: +// +// C * S = R +// (n x m) (m x b) (n x b) +// where C = coefficients, S = solution, R = results +// and solving for S given C and R. +// +// Note that C and R each have n rows, one for each input entry for the +// PHSF. A row in C is given by a hash function on the PHSF input key, +// and the corresponding row in R is the b-bit value to associate with +// that input key. (In a filter, rows of R are given by another hash +// function on the input key.) +// +// On solving, the matrix S (solution) is the final PHSF data, as it +// maps any row from the original C to its corresponding desired result +// in R. We just have to hash our query inputs and compute a linear +// combination of rows in S. 
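+//
+// (Editor's illustration, not from upstream: take n = 2, m = 3, b = 1 with
+// coefficient rows [1 1 0] -> 1 and [0 1 1] -> 0. Then S = [1 0 0]^T is a
+// solution, since 1 XOR 0 = 1 and 0 XOR 0 = 0; a query simply XORs the
+// rows of S selected by its key's coefficient row.)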
+// +// In theory, we could chose m = n and let a hash function associate +// each input key with random rows in C. A solution exists with high +// probability, and uses essentially minimum space, b bits per entry +// (because we set m = n) but this has terrible scaling, something +// like O(n^2) space and O(n^3) time during construction (Gaussian +// elimination) and O(n) query time. But computational efficiency is +// key, and the core of this is avoiding scanning all of S to answer +// each query. +// +// The traditional approach (MWHC, aka Xor filter) starts with setting +// only some small fixed number of columns (typically k=3) to 1 for each +// row of C, with remaining entries implicitly 0. This is implemented as +// three hash functions over [0,m), and S can be implemented as a vector +// of b-bit values. Now, a query only involves looking up k rows +// (values) in S and computing their bitwise XOR. Additionally, this +// construction can use a linear time algorithm called "peeling" for +// finding a solution in many cases of one existing, but peeling +// generally requires a larger space overhead factor in the solution +// (m/n) than is required with Gaussian elimination. +// +// Recommended reading: +// "Peeling Close to the Orientability Threshold - Spatial Coupling in +// Hashing-Based Data Structures" by Stefan Walzer +// +// ###################################################################### +// ##################### Ribbon PHSF construction ####################### +// +// Ribbon constructs coefficient rows essentially the same as in the +// Walzer/Dietzfelbinger paper cited above: for some chosen fixed width +// r (kCoeffBits in code), each key is hashed to a starting column in +// [0, m - r] (GetStart() in code) and an r-bit sequence of boolean +// coefficients (GetCoeffRow() in code). If you sort the rows by start, +// the C matrix would look something like this: +// +// [####00000000000000000000] +// [####00000000000000000000] +// [000####00000000000000000] +// [0000####0000000000000000] +// [0000000####0000000000000] +// [000000000####00000000000] +// [000000000####00000000000] +// [0000000000000####0000000] +// [0000000000000000####0000] +// [00000000000000000####000] +// [00000000000000000000####] +// +// where each # could be a 0 or 1, chosen uniformly by a hash function. +// (Except we typically set the start column value to 1.) This scheme +// uses hashing to approximate a band matrix, and it has a solution iff +// it reduces to an upper-triangular boolean r-band matrix, like this: +// +// [1###00000000000000000000] +// [01##00000000000000000000] +// [000000000000000000000000] +// [0001###00000000000000000] +// [000000000000000000000000] +// [000001##0000000000000000] +// [000000000000000000000000] +// [00000001###0000000000000] +// [000000001###000000000000] +// [0000000001##000000000000] +// ... +// [00000000000000000000001#] +// [000000000000000000000001] +// +// where we have expanded to an m x m matrix by filling with rows of +// all zeros as needed. As in Gaussian elimination, this form is ready for +// generating a solution through back-substitution. +// +// The awesome thing about the Ribbon construction (from the DW paper) is +// how row reductions keep each row representable as a start column and +// r coefficients, because row reductions are only needed when two rows +// have the same number of leading zero columns. 
Thus, the combination +// of those rows, the bitwise XOR of the r-bit coefficient rows, cancels +// out the leading 1s, so starts (at least) one column later and only +// needs (at most) r - 1 coefficients. +// +// ###################################################################### +// ###################### Ribbon PHSF scalability ####################### +// +// Although more practical detail is in ribbon_impl.h, it's worth +// understanding some of the overall benefits and limitations of the +// Ribbon PHSFs. +// +// High-end scalability is a primary issue for Ribbon PHSFs, because in +// a single Ribbon linear system with fixed r and fixed m/n ratio, the +// solution probability approaches zero as n approaches infinity. +// For a given n, solution probability improves with larger r and larger +// m/n. +// +// By contrast, peeling-based PHSFs have somewhat worse storage ratio +// or solution probability for small n (less than ~1000). This is +// especially true with spatial-coupling, where benefits are only +// notable for n on the order of 100k or 1m or more. +// +// To make best use of current hardware, r=128 seems to be closest to +// a "generally good" choice for Ribbon, at least in RocksDB where SST +// Bloom filters typically hold around 10-100k keys, and almost always +// less than 10m keys. r=128 ribbon has a high chance of encoding success +// (with first hash seed) when storage overhead is around 5% (m/n ~ 1.05) +// for roughly 10k - 10m keys in a single linear system. r=64 only scales +// up to about 10k keys with the same storage overhead. Construction and +// access times for r=128 are similar to r=64. r=128 tracks nearly +// twice as much data during construction, but in most cases we expect +// the scalability benefits of r=128 vs. r=64 to make it preferred. +// +// A natural approach to scaling Ribbon beyond ~10m keys is splitting +// (or "sharding") the inputs into multiple linear systems with their +// own hash seeds. This can also help to control peak memory consumption. +// TODO: much more to come +// +// ###################################################################### +// #################### Ribbon on-the-fly banding ####################### +// +// "Banding" is what we call the process of reducing the inputs to an +// upper-triangular r-band matrix ready for finishing a solution with +// back-substitution. Although the DW paper presents an algorithm for +// this ("SGauss"), the awesome properties of their construction enable +// an even simpler, faster, and more backtrackable algorithm. In simplest +// terms, the SGauss algorithm requires sorting the inputs by start +// columns, but it's possible to make Gaussian elimination resemble hash +// table insertion! +// +// The enhanced algorithm is based on these observations: +// - When processing a coefficient row with first 1 in column j, +// - If it's the first at column j to be processed, it can be part of +// the banding at row j. (And that decision never overwritten, with +// no loss of generality!) +// - Else, it can be combined with existing row j and re-processed, +// which will look for a later "empty" row or reach "no solution". +// +// We call our banding algorithm "incremental" and "on-the-fly" because +// (like hash table insertion) we are "finished" after each input +// processed, with respect to all inputs processed so far. Although the +// band matrix is an intermediate step to the solution structure, we have +// eliminated intermediate steps and unnecessary data tracking for +// banding. 
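+//
+// (Editor's illustration, not from upstream: with r = 4, suppose rows
+// A = 1011 and B = 1101 both have their first 1 in column j. A ^ B = 0110
+// starts at column j+1 with at most r-1 coefficients, so the combined row
+// is simply re-processed as if freshly inserted at j+1.)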
+// +// Building on "incremental" and "on-the-fly", the banding algorithm is +// easily backtrackable because no (non-empty) rows are overwritten in +// the banding. Thus, if we want to "try" adding an additional set of +// inputs to the banding, we only have to record which rows were written +// in order to efficiently backtrack to our state before considering +// the additional set. (TODO: how this can mitigate scalability and +// reach sub-1% overheads) +// +// Like in a linear-probed hash table, as the occupancy approaches and +// surpasses 90-95%, collision resolution dominates the construction +// time. (Ribbon doesn't usually pay at query time; see solution +// storage below.) This means that we can speed up construction time +// by using a higher m/n ratio, up to negative returns around 1.2. +// At m/n ~= 1.2, which still saves memory substantially vs. Bloom +// filter's 1.5, construction speed (including back-substitution) is not +// far from sorting speed, but still a few times slower than cache-local +// Bloom construction speed. +// +// Back-substitution from an upper-triangular boolean band matrix is +// especially fast and easy. All the memory accesses are sequential or at +// least local, no random. If the number of result bits (b) is a +// compile-time constant, the back-substitution state can even be tracked +// in CPU registers. Regardless of the solution representation, we prefer +// column-major representation for tracking back-substitution state, as +// r (the band width) will typically be much larger than b (result bits +// or columns), so better to handle r-bit values b times (per solution +// row) than b-bit values r times. +// +// ###################################################################### +// ##################### Ribbon solution storage ######################## +// +// Row-major layout is typical for boolean (bit) matrices, including for +// MWHC (Xor) filters where a query combines k b-bit values, and k is +// typically smaller than b. Even for k=4 and b=2, at least k=4 random +// look-ups are required regardless of layout. +// +// Ribbon PHSFs are quite different, however, because +// (a) all of the solution rows relevant to a query are within a single +// range of r rows, and +// (b) the number of solution rows involved (r/2 on average, or r if +// avoiding conditional accesses) is typically much greater than +// b, the number of solution columns. +// +// Row-major for Ribbon PHSFs therefore tends to incur undue CPU overhead +// by processing (up to) r entries of b bits each, where b is typically +// less than 10 for filter applications. +// +// Column-major layout has poor locality because of accessing up to b +// memory locations in different pages (and obviously cache lines). Note +// that negative filter queries do not typically need to access all +// solution columns, as they can return when a mismatch is found in any +// result/solution column. This optimization doesn't always pay off on +// recent hardware, where the penalty for unpredictable conditional +// branching can exceed the penalty for unnecessary work, but the +// optimization is essentially unavailable with row-major layout. +// +// The best compromise seems to be interleaving column-major on the small +// scale with row-major on the large scale. For example, let a solution +// "block" be r rows column-major encoded as b r-bit values in sequence. +// Each query accesses (up to) 2 adjacent blocks, which will typically +// span 1-3 cache lines in adjacent memory. 
We get very close to the same +// locality as row-major, but with much faster reconstruction of each +// result column, at least for filter applications where b is relatively +// small and negative queries can return early. +// +// ###################################################################### +// ###################### Fractional result bits ######################## +// +// Bloom filters have great flexibility that alternatives mostly do not +// have. One of those flexibilities is in utilizing any ratio of data +// structure bits per key. With a typical memory allocator like jemalloc, +// this flexibility can save roughly 10% of the filters' footprint in +// DRAM by rounding up and down filter sizes to minimize memory internal +// fragmentation (see optimize_filters_for_memory RocksDB option). +// +// At first glance, PHSFs only offer a whole number of bits per "slot" +// (m rather than number of keys n), but coefficient locality in the +// Ribbon construction makes fractional bits/key quite possible and +// attractive for filter applications. This works by a prefix of the +// structure using b-1 solution columns and the rest using b solution +// columns. See InterleavedSolutionStorage below for more detail. +// +// Because false positive rates are non-linear in bits/key, this approach +// is not quite optimal in terms of information theory. In common cases, +// we see additional space overhead up to about 1.5% vs. theoretical +// optimal to achieve the same FP rate. We consider this a quite acceptable +// overhead for very efficiently utilizing space that might otherwise be +// wasted. +// +// This property of Ribbon even makes it "elastic." A Ribbon filter and +// its small metadata for answering queries can be adapted into another +// Ribbon filter filling any smaller multiple of r bits (plus small +// metadata), with a correspondingly higher FP rate. None of the data +// thrown away during construction needs to be recalled for this reduction. +// Similarly a single Ribbon construction can be separated (by solution +// column) into two or more structures (or "layers" or "levels") with +// independent filtering ability (no FP correlation, just as solution or +// result columns in a single structure) despite being constructed as part +// of a single linear system. (TODO: implement) +// See also "ElasticBF: Fine-grained and Elastic Bloom Filter Towards +// Efficient Read for LSM-tree-based KV Stores." +// + +// ###################################################################### +// ################### CODE: Ribbon core algorithms ##################### +// ###################################################################### +// +// These algorithms are templatized for genericity but near-maximum +// performance in a given application. The template parameters +// adhere to informal class/struct type concepts outlined below. (This +// code is written for C++11 so does not use formal C++ concepts.) + +// Rough architecture for these algorithms: +// +// +-----------+ +---+ +-----------------+ +// | AddInputs | --> | H | --> | BandingStorage | +// +-----------+ | a | +-----------------+ +// | s | | +// | h | Back substitution +// | e | V +// +-----------+ | r | +-----------------+ +// | Query Key | --> | | >+< | SolutionStorage | +// +-----------+ +---+ | +-----------------+ +// V +// Query result + +// Common to other concepts +// concept RibbonTypes { +// // An unsigned integer type for an r-bit subsequence of coefficients. 
+// // r (or kCoeffBits) is taken to be sizeof(CoeffRow) * 8, as it would +// // generally only hurt scalability to leave bits of CoeffRow unused. +// typename CoeffRow; +// // An unsigned integer type big enough to hold a result row (b bits, +// // or number of solution/result columns). +// // In many applications, especially filters, the number of result +// // columns is decided at run time, so ResultRow simply needs to be +// // big enough for the largest number of columns allowed. +// typename ResultRow; +// // An unsigned integer type sufficient for representing the number of +// // rows in the solution structure, and at least the arithmetic +// // promotion size (usually 32 bits). uint32_t recommended because a +// // single Ribbon construction doesn't really scale to billions of +// // entries. +// typename Index; +// }; + +// ###################################################################### +// ######################## Hashers and Banding ######################### + +// Hasher concepts abstract out hashing details. + +// concept PhsfQueryHasher extends RibbonTypes { +// // Type for a lookup key, which is hashable. +// typename Key; +// +// // Type for hashed summary of a Key. uint64_t is recommended. +// typename Hash; +// +// // Compute a hash value summarizing a Key +// Hash GetHash(const Key &) const; +// +// // Given a hash value and a number of columns that can start an +// // r-sequence of coefficients (== m - r + 1), return the start +// // column to associate with that hash value. (Starts can be chosen +// // uniformly or "smash" extra entries into the beginning and end for +// // better utilization at those extremes of the structure. Details in +// // ribbon.impl.h) +// Index GetStart(Hash, Index num_starts) const; +// +// // Given a hash value, return the r-bit sequence of coefficients to +// // associate with it. It's generally OK if +// // sizeof(CoeffRow) > sizeof(Hash) +// // as long as the hash itself is not too prone to collisions for the +// // applications and the CoeffRow is generated uniformly from +// // available hash data, but relatively independent of the start. +// // +// // Must be non-zero, because that's required for a solution to exist +// // when mapping to non-zero result row. (Note: BandingAdd could be +// // modified to allow 0 coeff row if that only occurs with 0 result +// // row, which really only makes sense for filter implementation, +// // where both values are hash-derived. Or BandingAdd could reject 0 +// // coeff row, forcing next seed, but that has potential problems with +// // generality/scalability.) +// CoeffRow GetCoeffRow(Hash) const; +// }; + +// concept FilterQueryHasher extends PhsfQueryHasher { +// // For building or querying a filter, this returns the expected +// // result row associated with a hashed input. For general PHSF, +// // this must return 0. +// // +// // Although not strictly required, there's a slightly better chance of +// // solver success if result row is masked down here to only the bits +// // actually needed. +// ResultRow GetResultRowFromHash(Hash) const; +// } + +// concept BandingHasher extends FilterQueryHasher { +// // For a filter, this will generally be the same as Key. +// // For a general PHSF, it must either +// // (a) include a key and a result it maps to (e.g. in a std::pair), or +// // (b) GetResultRowFromInput looks up the result somewhere rather than +// // extracting it. 
+// typename AddInput; +// +// // Instead of requiring a way to extract a Key from an +// // AddInput, we require getting the hash of the Key part +// // of an AddInput, which is trivial if AddInput == Key. +// Hash GetHash(const AddInput &) const; +// +// // For building a non-filter PHSF, this extracts or looks up the result +// // row to associate with an input. For filter PHSF, this must return 0. +// ResultRow GetResultRowFromInput(const AddInput &) const; +// +// // Whether the solver can assume the lowest bit of GetCoeffRow is +// // always 1. When true, it should improve solver efficiency slightly. +// static bool kFirstCoeffAlwaysOne; +// } + +// Abstract storage for the the result of "banding" the inputs (Gaussian +// elimination to an upper-triangular boolean band matrix). Because the +// banding is an incremental / on-the-fly algorithm, this also represents +// all the intermediate state between input entries. +// +// concept BandingStorage extends RibbonTypes { +// // Tells the banding algorithm to prefetch memory associated with +// // the next input before processing the current input. Generally +// // recommended iff the BandingStorage doesn't easily fit in CPU +// // cache. +// bool UsePrefetch() const; +// +// // Prefetches (e.g. __builtin_prefetch) memory associated with a +// // slot index i. +// void Prefetch(Index i) const; +// +// // Load or store CoeffRow and ResultRow for slot index i. +// // (Gaussian row operations involve both sides of the equation.) +// // Bool `for_back_subst` indicates that customizing values for +// // unconstrained solution rows (cr == 0) is allowed. +// void LoadRow(Index i, CoeffRow *cr, ResultRow *rr, bool for_back_subst) +// const; +// void StoreRow(Index i, CoeffRow cr, ResultRow rr); +// +// // Returns the number of columns that can start an r-sequence of +// // coefficients, which is the number of slots minus r (kCoeffBits) +// // plus one. (m - r + 1) +// Index GetNumStarts() const; +// }; + +// Optional storage for backtracking data in banding a set of input +// entries. It exposes an array structure which will generally be +// used as a stack. It must be able to accommodate as many entries +// as are passed in as inputs to `BandingAddRange`. +// +// concept BacktrackStorage extends RibbonTypes { +// // If false, backtracking support will be disabled in the algorithm. +// // This should preferably be an inline compile-time constant function. +// bool UseBacktrack() const; +// +// // Records `to_save` as the `i`th backtrack entry +// void BacktrackPut(Index i, Index to_save); +// +// // Recalls the `i`th backtrack entry +// Index BacktrackGet(Index i) const; +// } + +// Adds a single entry to BandingStorage (and optionally, BacktrackStorage), +// returning true if successful or false if solution is impossible with +// current hasher (and presumably its seed) and number of "slots" (solution +// or banding rows). (A solution is impossible when there is a linear +// dependence among the inputs that doesn't "cancel out".) +// +// Pre- and post-condition: the BandingStorage represents a band matrix +// ready for back substitution (row echelon form except for zero rows), +// augmented with result values such that back substitution would give a +// solution satisfying all the cr@start -> rr entries added. 
+template <bool kFirstCoeffAlwaysOne, typename BandingStorage,
+          typename BacktrackStorage>
+bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
+                typename BandingStorage::ResultRow rr,
+                typename BandingStorage::CoeffRow cr, BacktrackStorage *bts,
+                typename BandingStorage::Index *backtrack_pos) {
+  using CoeffRow = typename BandingStorage::CoeffRow;
+  using ResultRow = typename BandingStorage::ResultRow;
+  using Index = typename BandingStorage::Index;
+
+  Index i = start;
+
+  if (!kFirstCoeffAlwaysOne) {
+    // Requires/asserts that cr != 0
+    int tz = CountTrailingZeroBits(cr);
+    i += static_cast<Index>(tz);
+    cr >>= tz;
+  }
+
+  for (;;) {
+    assert((cr & 1) == 1);
+    CoeffRow cr_at_i;
+    ResultRow rr_at_i;
+    bs->LoadRow(i, &cr_at_i, &rr_at_i, /* for_back_subst */ false);
+    if (cr_at_i == 0) {
+      bs->StoreRow(i, cr, rr);
+      bts->BacktrackPut(*backtrack_pos, i);
+      ++*backtrack_pos;
+      return true;
+    }
+    assert((cr_at_i & 1) == 1);
+    // Gaussian row reduction
+    cr ^= cr_at_i;
+    rr ^= rr_at_i;
+    if (cr == 0) {
+      // Inconsistency or (less likely) redundancy
+      break;
+    }
+    // Find relative offset of next non-zero coefficient.
+    int tz = CountTrailingZeroBits(cr);
+    i += static_cast<Index>(tz);
+    cr >>= tz;
+  }
+
+  // Failed, unless result row == 0 because e.g. a duplicate input or a
+  // stock hash collision, with same result row. (For filter, stock hash
+  // collision implies same result row.) Or we could have a full equation
+  // equal to sum of other equations, which is very possible with
+  // small range of values for result row.
+  return rr == 0;
+}
+
+// Adds a range of entries to BandingStorage returning true if successful
+// or false if solution is impossible with current hasher (and presumably
+// its seed) and number of "slots" (solution or banding rows). (A solution
+// is impossible when there is a linear dependence among the inputs that
+// doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
+//
+// If UseBacktrack in the BacktrackStorage, this function call rolls back
+// to prior state on failure. If !UseBacktrack, some subset of the entries
+// will have been added to the BandingStorage, so best considered to be in
+// an indeterminate state.
+//
+template <typename BandingStorage, typename BacktrackStorage,
+          typename BandingHasher, typename InputIterator>
+bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
+                     const BandingHasher &bh, InputIterator begin,
+                     InputIterator end) {
+  using CoeffRow = typename BandingStorage::CoeffRow;
+  using Index = typename BandingStorage::Index;
+  using ResultRow = typename BandingStorage::ResultRow;
+  using Hash = typename BandingHasher::Hash;
+
+  static_assert(IsUnsignedUpTo128<CoeffRow>::value, "must be unsigned");
+  static_assert(IsUnsignedUpTo128<ResultRow>::value, "must be unsigned");
+  static_assert(IsUnsignedUpTo128<Hash>::value, "must be unsigned");
+
+  constexpr bool kFCA1 = BandingHasher::kFirstCoeffAlwaysOne;
+
+  if (begin == end) {
+    // trivial
+    return true;
+  }
+
+  const Index num_starts = bs->GetNumStarts();
+
+  InputIterator cur = begin;
+  Index backtrack_pos = 0;
+  if (!bs->UsePrefetch()) {
+    // Simple version, no prefetch
+    for (;;) {
+      Hash h = bh.GetHash(*cur);
+      Index start = bh.GetStart(h, num_starts);
+      ResultRow rr =
+          bh.GetResultRowFromInput(*cur) | bh.GetResultRowFromHash(h);
+      CoeffRow cr = bh.GetCoeffRow(h);
+
+      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
+        break;
+      }
+      if ((++cur) == end) {
+        return true;
+      }
+    }
+  } else {
+    // Pipelined w/prefetch
+    // Prime the pipeline
+    Hash h = bh.GetHash(*cur);
+    Index start = bh.GetStart(h, num_starts);
+    ResultRow rr = bh.GetResultRowFromInput(*cur);
+    bs->Prefetch(start);
+
+    // Pipeline
+    for (;;) {
+      rr |= bh.GetResultRowFromHash(h);
+      CoeffRow cr = bh.GetCoeffRow(h);
+      if ((++cur) == end) {
+        if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
+          break;
+        }
+        return true;
+      }
+      Hash next_h = bh.GetHash(*cur);
+      Index next_start = bh.GetStart(next_h, num_starts);
+      ResultRow next_rr = bh.GetResultRowFromInput(*cur);
+      bs->Prefetch(next_start);
+      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
+        break;
+      }
+      h = next_h;
+      start = next_start;
+      rr = next_rr;
+    }
+  }
+  // failed; backtrack (if implemented)
+  if (bts->UseBacktrack()) {
+    while (backtrack_pos > 0) {
+      --backtrack_pos;
+      Index i = bts->BacktrackGet(backtrack_pos);
+      // Clearing the ResultRow is not strictly required, but is required
+      // for good FP rate on inputs that might have been backtracked out.
+      // (We don't want anything we've backtracked on to leak into final
+      // result, as that might not be "harmless".)
+      bs->StoreRow(i, 0, 0);
+    }
+  }
+  return false;
+}
+
+// Adds a range of entries to BandingStorage returning true if successful
+// or false if solution is impossible with current hasher (and presumably
+// its seed) and number of "slots" (solution or banding rows). (A solution
+// is impossible when there is a linear dependence among the inputs that
+// doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
+//
+// On failure, some subset of the entries will have been added to the
+// BandingStorage, so best considered to be in an indeterminate state.
+//
+template <typename BandingStorage, typename BandingHasher,
+          typename InputIterator>
+bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
+                     InputIterator begin, InputIterator end) {
+  using Index = typename BandingStorage::Index;
+  struct NoopBacktrackStorage {
+    bool UseBacktrack() { return false; }
+    void BacktrackPut(Index, Index) {}
+    Index BacktrackGet(Index) {
+      assert(false);
+      return 0;
+    }
+  } nbts;
+  return BandingAddRange(bs, &nbts, bh, begin, end);
+}
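+
+// Editor's usage sketch (not upstream API; `ResetAndReseed` is a
+// hypothetical helper): on failure, callers typically clear the banding
+// storage and retry the whole range with a different hash seed:
+//   while (!BandingAddRange(&bs, hasher, keys.begin(), keys.end())) {
+//     ResetAndReseed(&bs, &hasher);  // hypothetical
+//   }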
+
+// ######################################################################
+// ######################### Solution Storage ###########################
+
+// Back-substitution and query algorithms unfortunately depend on some
+// details of data layout in the final data structure ("solution"). Thus,
+// there is no common SolutionStorage covering all the reasonable
+// possibilities.
+
+// ###################### SimpleSolutionStorage #########################
+
+// SimpleSolutionStorage is for a row-major storage, typically with no
+// unused bits in each ResultRow. This is mostly for demonstration
+// purposes as the simplest solution storage scheme. It is relatively slow
+// for filter queries.
+
+// concept SimpleSolutionStorage extends RibbonTypes {
+//   // This is called at the beginning of back-substitution for the
+//   // solution storage to do any remaining configuration before data
+//   // is stored to it. If configuration is previously finalized, this
+//   // could be a simple assertion or even no-op. Ribbon algorithms
+//   // only call this from back-substitution, and only once per call,
+//   // before other functions here.
+//   void PrepareForNumStarts(Index num_starts) const;
+//   // Must return num_starts passed to PrepareForNumStarts, or the most
+//   // recent call to PrepareForNumStarts if this storage object can be
+//   // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
+//   // there must be a run of kCoeffBits slots starting from each start.
+//   Index GetNumStarts() const;
+//   // Load the solution row (type ResultRow) for a slot
+//   ResultRow Load(Index slot_num) const;
+//   // Store the solution row (type ResultRow) for a slot
+//   void Store(Index slot_num, ResultRow data);
+// };
+
+// Back-substitution for generating a solution from BandingStorage to
+// SimpleSolutionStorage.
+template <typename SimpleSolutionStorage, typename BandingStorage>
+void SimpleBackSubst(SimpleSolutionStorage *sss, const BandingStorage &bs) {
+  using CoeffRow = typename BandingStorage::CoeffRow;
+  using Index = typename BandingStorage::Index;
+  using ResultRow = typename BandingStorage::ResultRow;
+
+  static_assert(sizeof(Index) == sizeof(typename SimpleSolutionStorage::Index),
+                "must be same");
+  static_assert(
+      sizeof(CoeffRow) == sizeof(typename SimpleSolutionStorage::CoeffRow),
+      "must be same");
+  static_assert(
+      sizeof(ResultRow) == sizeof(typename SimpleSolutionStorage::ResultRow),
+      "must be same");
+
+  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
+  constexpr auto kResultBits = static_cast<Index>(sizeof(ResultRow) * 8U);
+
+  // A column-major buffer of the solution matrix, containing enough
+  // recently-computed solution data to compute the next solution row
+  // (based also on banding data).
+  std::array<CoeffRow, kResultBits> state;
+  state.fill(0);
+
+  const Index num_starts = bs.GetNumStarts();
+  sss->PrepareForNumStarts(num_starts);
+  const Index num_slots = num_starts + kCoeffBits - 1;
+
+  for (Index i = num_slots; i > 0;) {
+    --i;
+    CoeffRow cr;
+    ResultRow rr;
+    bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true);
+    // solution row
+    ResultRow sr = 0;
+    for (Index j = 0; j < kResultBits; ++j) {
+      // Compute next solution bit at row i, column j (see derivation below)
+      CoeffRow tmp = state[j] << 1;
+      bool bit = (BitParity(tmp & cr) ^ ((rr >> j) & 1)) != 0;
+      tmp |= bit ? CoeffRow{1} : CoeffRow{0};
+
+      // Now tmp is solution at column j from row i for next kCoeffBits
+      // more rows. Thus, for valid solution, the dot product of the
+      // solution column with the coefficient row has to equal the result
+      // at that column,
+      //   BitParity(tmp & cr) == ((rr >> j) & 1)
+
+      // Update state.
+      state[j] = tmp;
+      // add to solution row
+      sr |= (bit ? ResultRow{1} : ResultRow{0}) << j;
+    }
+    sss->Store(i, sr);
+  }
+}
+
+// Common functionality for querying a key (already hashed) in
+// SimpleSolutionStorage.
+template <typename SimpleSolutionStorage>
+typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
+    typename SimpleSolutionStorage::Index start_slot,
+    typename SimpleSolutionStorage::CoeffRow cr,
+    const SimpleSolutionStorage &sss) {
+  using CoeffRow = typename SimpleSolutionStorage::CoeffRow;
+  using ResultRow = typename SimpleSolutionStorage::ResultRow;
+
+  constexpr unsigned kCoeffBits = static_cast<unsigned>(sizeof(CoeffRow) * 8U);
+
+  ResultRow result = 0;
+  for (unsigned i = 0; i < kCoeffBits; ++i) {
+    // Bit masking whole value is generally faster here than 'if'
+    result ^= sss.Load(start_slot + i) &
+              (ResultRow{0} - (static_cast<ResultRow>(cr >> i) & ResultRow{1}));
+  }
+  return result;
+}
+
+// General PHSF query a key from SimpleSolutionStorage.
+template <typename SimpleSolutionStorage, typename PhsfQueryHasher>
+typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
+    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
+    const SimpleSolutionStorage &sss) {
+  const typename PhsfQueryHasher::Hash hash = hasher.GetHash(key);
+
+  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
+                    sizeof(typename PhsfQueryHasher::Index),
+                "must be same");
+  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
+                    sizeof(typename PhsfQueryHasher::CoeffRow),
+                "must be same");
+
+  return SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()),
+                           hasher.GetCoeffRow(hash), sss);
+}
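+
+// Editor's note (illustration, not upstream): a PHSF query returns garbage
+// for keys that were never added; the filter wrapper below turns that into
+// a membership test by comparing against GetResultRowFromHash, so a
+// never-added key passes only with probability about 2^-b.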
+
+// Filter query a key from SimpleSolutionStorage.
+template <typename SimpleSolutionStorage, typename FilterQueryHasher>
+bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
+                       const FilterQueryHasher &hasher,
+                       const SimpleSolutionStorage &sss) {
+  const typename FilterQueryHasher::Hash hash = hasher.GetHash(key);
+  const typename SimpleSolutionStorage::ResultRow expected =
+      hasher.GetResultRowFromHash(hash);
+
+  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
+                    sizeof(typename FilterQueryHasher::Index),
+                "must be same");
+  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
+                    sizeof(typename FilterQueryHasher::CoeffRow),
+                "must be same");
+  static_assert(sizeof(typename SimpleSolutionStorage::ResultRow) ==
+                    sizeof(typename FilterQueryHasher::ResultRow),
+                "must be same");
+
+  return expected ==
+         SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()),
+                           hasher.GetCoeffRow(hash), sss);
+}
+
+// #################### InterleavedSolutionStorage ######################
+
+// InterleavedSolutionStorage is row-major at a high level, for good
+// locality, and column-major at a low level, for CPU efficiency
+// especially in filter queries or relatively small number of result bits
+// (== solution columns). The storage is a sequence of "blocks" where a
+// block has one CoeffRow-sized segment for each solution column. Each
+// query spans at most two blocks; the starting solution row is typically
+// in the row-logical middle of a block and spans to the middle of the
+// next block. (See diagram below.)
+//
+// InterleavedSolutionStorage supports choosing b (number of result or
+// solution columns) at run time, and even supports mixing b and b-1 solution
+// columns in a single linear system solution, for filters that can
+// effectively utilize any size space (multiple of CoeffRow) for minimizing
+// FP rate for any number of added keys. To simplify query implementation
+// (with lower-index columns first), the b-bit portion comes after the b-1
+// portion of the structure.
+//
+// Diagram (=== marks logical block boundary; b=4; ### is data used by a
+// query crossing the b-1 to b boundary, each Segment has type CoeffRow):
+//  ...
+//  +======================+
+//  | S e g m e n t col=0  |
+//  +----------------------+
+//  | S e g m e n t col=1  |
+//  +----------------------+
+//  | S e g m e n t col=2  |
+//  +======================+
+//  | S e g m e n #########|
+//  +----------------------+
+//  | S e g m e n #########|
+//  +----------------------+
+//  | S e g m e n #########|
+//  +======================+ Result/solution columns: above = 3, below = 4
+//  |#############t col=0  |
+//  +----------------------+
+//  |#############t col=1  |
+//  +----------------------+
+//  |#############t col=2  |
+//  +----------------------+
+//  | S e g m e n t col=3  |
+//  +======================+
+//  | S e g m e n t col=0  |
+//  +----------------------+
+//  | S e g m e n t col=1  |
+//  +----------------------+
+//  | S e g m e n t col=2  |
+//  +----------------------+
+//  | S e g m e n t col=3  |
+//  +======================+
+//  ...
+//
+// InterleavedSolutionStorage will be adapted by the algorithms from
+// simple array-like segment storage. That array-like storage is templatized
+// in part so that an implementation may choose to handle byte ordering
+// at access time.
+//
+// concept InterleavedSolutionStorage extends RibbonTypes {
+//   // This is called at the beginning of back-substitution for the
+//   // solution storage to do any remaining configuration before data
+//   // is stored to it. If configuration is previously finalized, this
If configuration is previously finalized, this
+//   // could be a simple assertion or even no-op. Ribbon algorithms
+//   // only call this from back-substitution, and only once per call,
+//   // before other functions here.
+//   void PrepareForNumStarts(Index num_starts) const;
+//   // Must return num_starts passed to PrepareForNumStarts, or the most
+//   // recent call to PrepareForNumStarts if this storage object can be
+//   // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
+//   // there must be a run of kCoeffBits slots starting from each start.
+//   Index GetNumStarts() const;
+//   // The larger number of solution columns used (called "b" above).
+//   Index GetUpperNumColumns() const;
+//   // If returns > 0, then block numbers below that use
+//   // GetUpperNumColumns() - 1 columns per solution row, and the rest
+//   // use GetUpperNumColumns(). A block represents kCoeffBits "slots",
+//   // where all but the last kCoeffBits - 1 slots are also starts. And
+//   // a block contains a segment for each solution column.
+//   // An implementation may only support uniform columns per solution
+//   // row and return constant 0 here.
+//   Index GetUpperStartBlock() const;
+//
+//   // ### "Array of segments" portion of API ###
+//   // The number of values of type CoeffRow used in this solution
+//   // representation. (This value can be inferred from the previous
+//   // three functions, but is expected at least for sanity / assertion
+//   // checking.)
+//   Index GetNumSegments() const;
+//   // Load an entry from the logical array of segments
+//   CoeffRow LoadSegment(Index segment_num) const;
+//   // Store an entry to the logical array of segments
+//   void StoreSegment(Index segment_num, CoeffRow data);
+// };
+
+// A helper for InterleavedBackSubst.
+template <typename BandingStorage>
+inline void BackSubstBlock(typename BandingStorage::CoeffRow *state,
+                           typename BandingStorage::Index num_columns,
+                           const BandingStorage &bs,
+                           typename BandingStorage::Index start_slot) {
+  using CoeffRow = typename BandingStorage::CoeffRow;
+  using Index = typename BandingStorage::Index;
+  using ResultRow = typename BandingStorage::ResultRow;
+
+  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
+
+  for (Index i = start_slot + kCoeffBits; i > start_slot;) {
+    --i;
+    CoeffRow cr;
+    ResultRow rr;
+    bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true);
+    for (Index j = 0; j < num_columns; ++j) {
+      // Compute next solution bit at row i, column j (see derivation below)
+      CoeffRow tmp = state[j] << 1;
+      int bit = BitParity(tmp & cr) ^ ((rr >> j) & 1);
+      tmp |= static_cast<CoeffRow>(bit);
+
+      // Now tmp is solution at column j from row i for next kCoeffBits
+      // more rows. Thus, for valid solution, the dot product of the
+      // solution column with the coefficient row has to equal the result
+      // at that column,
+      //   BitParity(tmp & cr) == ((rr >> j) & 1)
+
+      // Update state.
+      state[j] = tmp;
+    }
+  }
+}
+
+// Back-substitution for generating a solution from BandingStorage to
+// InterleavedSolutionStorage.
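+// (Illustrative aside on BackSubstBlock above; not part of the upstream
+// source. In GF(2), the dot product of two bit-vectors packed into words is
+// BitParity(a & b), and back-substitution chooses each new solution bit so
+// that this dot product matches the stored result bit. A tiny
+// self-contained model of that invariant, assuming a BitParity equivalent
+// via the GCC/Clang builtin:
+//
+//   #include <cassert>
+//   #include <cstdint>
+//   int BitParity(uint64_t v) { return __builtin_parityll(v); }
+//   int main() {
+//     uint64_t cr = 0b1011;  // hypothetical banded coefficient row
+//     uint64_t soln = 0;     // solution bits for one column, newest in bit 0
+//     int rr_bit = 1;        // desired result bit for this row
+//     uint64_t tmp = soln << 1;
+//     int bit = BitParity(tmp & cr) ^ rr_bit;  // new bit fixes the parity
+//     tmp |= static_cast<uint64_t>(bit);
+//     assert(BitParity(tmp & cr) == rr_bit);   // dot product now matches
+//   }
+// )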
+template <typename InterleavedSolutionStorage, typename BandingStorage>
+void InterleavedBackSubst(InterleavedSolutionStorage *iss,
+                          const BandingStorage &bs) {
+  using CoeffRow = typename BandingStorage::CoeffRow;
+  using Index = typename BandingStorage::Index;
+
+  static_assert(
+      sizeof(Index) == sizeof(typename InterleavedSolutionStorage::Index),
+      "must be same");
+  static_assert(
+      sizeof(CoeffRow) == sizeof(typename InterleavedSolutionStorage::CoeffRow),
+      "must be same");
+
+  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
+
+  const Index num_starts = bs.GetNumStarts();
+  // Although it might be nice to have a filter that returns "always false"
+  // when no key is added, we aren't specifically supporting that here
+  // because it would require another condition branch in the query.
+  assert(num_starts > 0);
+  iss->PrepareForNumStarts(num_starts);
+
+  const Index num_slots = num_starts + kCoeffBits - 1;
+  assert(num_slots % kCoeffBits == 0);
+  const Index num_blocks = num_slots / kCoeffBits;
+  const Index num_segments = iss->GetNumSegments();
+
+  // For now upper, then lower
+  Index num_columns = iss->GetUpperNumColumns();
+  const Index upper_start_block = iss->GetUpperStartBlock();
+
+  if (num_columns == 0) {
+    // Nothing to do, presumably because there's not enough space for even
+    // a single segment.
+    assert(num_segments == 0);
+    // When num_columns == 0, a Ribbon filter query will always return true,
+    // or a PHSF query always 0.
+    return;
+  }
+
+  // We should be utilizing all available segments
+  assert(num_segments == (upper_start_block * (num_columns - 1)) +
+                             ((num_blocks - upper_start_block) * num_columns));
+
+  // TODO: consider fixed-column specializations with stack-allocated state
+
+  // A column-major buffer of the solution matrix, containing enough
+  // recently-computed solution data to compute the next solution row
+  // (based also on banding data).
+  std::unique_ptr<CoeffRow[]> state{new CoeffRow[num_columns]()};
+
+  Index block = num_blocks;
+  Index segment_num = num_segments;
+  while (block > upper_start_block) {
+    --block;
+    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
+    segment_num -= num_columns;
+    for (Index i = 0; i < num_columns; ++i) {
+      iss->StoreSegment(segment_num + i, state[i]);
+    }
+  }
+  // Now (if applicable), region using lower number of columns
+  // (This should be optimized away if GetUpperStartBlock() returns
+  // constant 0.)
+  --num_columns;
+  while (block > 0) {
+    --block;
+    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
+    segment_num -= num_columns;
+    for (Index i = 0; i < num_columns; ++i) {
+      iss->StoreSegment(segment_num + i, state[i]);
+    }
+  }
+  // Verify everything processed
+  assert(block == 0);
+  assert(segment_num == 0);
+}
+
+// Prefetch memory for a key in InterleavedSolutionStorage.
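+// (Illustrative aside on the segment accounting in InterleavedBackSubst
+// above; not part of the upstream source. The "utilizing all available
+// segments" assertion can be sanity-checked with small hypothetical
+// numbers: with num_blocks = 4, GetUpperNumColumns() b = 7, and
+// GetUpperStartBlock() = 1, the first block holds 6 segments and the
+// remaining three hold 7 each:
+//
+//   #include <cassert>
+//   #include <cstdint>
+//   int main() {
+//     uint32_t num_blocks = 4, b = 7, upper_start_block = 1;
+//     uint32_t num_segments =
+//         upper_start_block * (b - 1) + (num_blocks - upper_start_block) * b;
+//     assert(num_segments == 6 + 3 * 7);  // == 27
+//   }
+// )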
+template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
+inline void InterleavedPrepareQuery(
+    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
+    const InterleavedSolutionStorage &iss,
+    typename PhsfQueryHasher::Hash *saved_hash,
+    typename InterleavedSolutionStorage::Index *saved_segment_num,
+    typename InterleavedSolutionStorage::Index *saved_num_columns,
+    typename InterleavedSolutionStorage::Index *saved_start_bit) {
+  using Hash = typename PhsfQueryHasher::Hash;
+  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
+  using Index = typename InterleavedSolutionStorage::Index;
+
+  static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
+                "must be same");
+
+  const Hash hash = hasher.GetHash(key);
+  const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());
+
+  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
+
+  const Index upper_start_block = iss.GetUpperStartBlock();
+  Index num_columns = iss.GetUpperNumColumns();
+  Index start_block_num = start_slot / kCoeffBits;
+  Index segment_num = start_block_num * num_columns -
+                      std::min(start_block_num, upper_start_block);
+  // Change to lower num columns if applicable.
+  // (This should not compile to a conditional branch.)
+  num_columns -= (start_block_num < upper_start_block) ? 1 : 0;
+
+  Index start_bit = start_slot % kCoeffBits;
+
+  Index segment_count = num_columns + (start_bit == 0 ? 0 : num_columns);
+
+  iss.PrefetchSegmentRange(segment_num, segment_num + segment_count);
+
+  *saved_hash = hash;
+  *saved_segment_num = segment_num;
+  *saved_num_columns = num_columns;
+  *saved_start_bit = start_bit;
+}
+
+// General PHSF query from InterleavedSolutionStorage, using data for
+// the query key from InterleavedPrepareQuery
+template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
+inline typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
+    typename PhsfQueryHasher::Hash hash,
+    typename InterleavedSolutionStorage::Index segment_num,
+    typename InterleavedSolutionStorage::Index num_columns,
+    typename InterleavedSolutionStorage::Index start_bit,
+    const PhsfQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
+  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
+  using Index = typename InterleavedSolutionStorage::Index;
+  using ResultRow = typename InterleavedSolutionStorage::ResultRow;
+
+  static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
+                "must be same");
+  static_assert(sizeof(CoeffRow) == sizeof(typename PhsfQueryHasher::CoeffRow),
+                "must be same");
+
+  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
+
+  const CoeffRow cr = hasher.GetCoeffRow(hash);
+
+  ResultRow sr = 0;
+  const CoeffRow cr_left = cr << static_cast<unsigned>(start_bit);
+  for (Index i = 0; i < num_columns; ++i) {
+    sr ^= BitParity(iss.LoadSegment(segment_num + i) & cr_left) << i;
+  }
+
+  if (start_bit > 0) {
+    segment_num += num_columns;
+    const CoeffRow cr_right =
+        cr >> static_cast<unsigned>(kCoeffBits - start_bit);
+    for (Index i = 0; i < num_columns; ++i) {
+      sr ^= BitParity(iss.LoadSegment(segment_num + i) & cr_right) << i;
+    }
+  }
+
+  return sr;
+}
+
+// Filter query of a key from InterleavedSolutionStorage.
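+// (Illustrative aside on InterleavedPhsfQuery above; not part of the
+// upstream source. When start_bit > 0 the query's kCoeffBits coefficients
+// straddle two blocks, so the coefficient row is split into a left part
+// applied to the first block's segments and a right part applied to the
+// next block's. The two shifted parts partition the original bits exactly:
+//
+//   #include <cassert>
+//   #include <cstdint>
+//   int main() {
+//     const unsigned kCoeffBits = 64;
+//     uint64_t cr = 0x0123456789abcdefULL;  // hypothetical coefficient row
+//     for (unsigned start_bit = 1; start_bit < kCoeffBits; ++start_bit) {
+//       uint64_t cr_left = cr << start_bit;
+//       uint64_t cr_right = cr >> (kCoeffBits - start_bit);
+//       // No coefficient is lost or duplicated across the two halves.
+//       assert(((cr_left >> start_bit) |
+//               (cr_right << (kCoeffBits - start_bit))) == cr);
+//     }
+//   }
+// )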
+template <typename InterleavedSolutionStorage, typename FilterQueryHasher>
+inline bool InterleavedFilterQuery(
+    typename FilterQueryHasher::Hash hash,
+    typename InterleavedSolutionStorage::Index segment_num,
+    typename InterleavedSolutionStorage::Index num_columns,
+    typename InterleavedSolutionStorage::Index start_bit,
+    const FilterQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
+  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
+  using Index = typename InterleavedSolutionStorage::Index;
+  using ResultRow = typename InterleavedSolutionStorage::ResultRow;
+
+  static_assert(sizeof(Index) == sizeof(typename FilterQueryHasher::Index),
+                "must be same");
+  static_assert(
+      sizeof(CoeffRow) == sizeof(typename FilterQueryHasher::CoeffRow),
+      "must be same");
+  static_assert(
+      sizeof(ResultRow) == sizeof(typename FilterQueryHasher::ResultRow),
+      "must be same");
+
+  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
+
+  const CoeffRow cr = hasher.GetCoeffRow(hash);
+  const ResultRow expected = hasher.GetResultRowFromHash(hash);
+
+  // TODO: consider optimizations such as
+  // * get rid of start_bit == 0 condition with careful fetching & shifting
+  if (start_bit == 0) {
+    for (Index i = 0; i < num_columns; ++i) {
+      if (BitParity(iss.LoadSegment(segment_num + i) & cr) !=
+          (static_cast<int>(expected >> i) & 1)) {
+        return false;
+      }
+    }
+  } else {
+    const CoeffRow cr_left = cr << static_cast<unsigned>(start_bit);
+    const CoeffRow cr_right =
+        cr >> static_cast<unsigned>(kCoeffBits - start_bit);
+
+    for (Index i = 0; i < num_columns; ++i) {
+      CoeffRow soln_data =
+          (iss.LoadSegment(segment_num + i) & cr_left) ^
+          (iss.LoadSegment(segment_num + num_columns + i) & cr_right);
+      if (BitParity(soln_data) != (static_cast<int>(expected >> i) & 1)) {
+        return false;
+      }
+    }
+  }
+  // otherwise, all match
+  return true;
+}
+
+// TODO: refactor Interleaved*Query so that queries can be "prepared" by
+// prefetching memory, to hide memory latency for multiple queries in a
+// single thread.
+
+}  // namespace ribbon
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_config.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_config.cc
new file mode 100644
index 0000000000..c1046f4aaa
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_config.cc
@@ -0,0 +1,506 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "util/ribbon_config.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace ribbon {
+
+namespace detail {
+
+// Each instantiation of this struct is sufficiently unique for configuration
+// purposes, and is only instantiated for settings where we support the
+// configuration API. An application might only reference one instantiation,
+// meaning the rest could be pruned at link time.
+template <ConstructionFailureChance kCfc, uint64_t kCoeffBits, bool kUseSmash>
+struct BandingConfigHelperData {
+  static constexpr size_t kKnownSize = 18U;
+
+  // Because of complexity in the data, for smaller numbers of slots
+  // (powers of two up to 2^17), we record known numbers that can be added
+  // with kCfc chance of construction failure and settings in template
+  // parameters. Zero means "unsupported (too small) number of slots".
+  // (GetNumToAdd below will use interpolation for numbers of slots
+  // between powers of two; double rather than integer values here make
+  // that more accurate.)
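+  // (Illustrative aside, not from the upstream source: for a number of
+  // slots between two powers of two, GetNumToAdd interpolates linearly
+  // between the two nearest known values. For example, with hypothetical
+  // entries kKnownToAddByPow2[10] = 1000 and kKnownToAddByPow2[11] = 2020,
+  // a request for num_slots = 1536 sits half way between 2^10 and 2^11
+  // (ceil_portion = 1536.0 / 1024 - 1.0 = 0.5), giving
+  //   0.5 * 2020 + 0.5 * 1000 = 1510
+  // entries, which is why fractional (double) table entries interpolate
+  // more smoothly than integers would.)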
+ static const std::array kKnownToAddByPow2; + + // For sufficiently large number of slots, doubling the number of + // slots will increase the expected overhead (slots over number added) + // by approximately this constant. + // (This is roughly constant regardless of ConstructionFailureChance and + // smash setting.) + // (Would be a constant if we had partial template specialization for + // static const members.) + static inline double GetFactorPerPow2() { + if (kCoeffBits == 128U) { + return 0.0038; + } else { + assert(kCoeffBits == 64U); + return 0.0083; + } + } + + // Overhead factor for 2^(kKnownSize-1) slots + // (Would be a constant if we had partial template specialization for + // static const members.) + static inline double GetFinalKnownFactor() { + return 1.0 * (uint32_t{1} << (kKnownSize - 1)) / + kKnownToAddByPow2[kKnownSize - 1]; + } + + // GetFinalKnownFactor() - (kKnownSize-1) * GetFactorPerPow2() + // (Would be a constant if we had partial template specialization for + // static const members.) + static inline double GetBaseFactor() { + return GetFinalKnownFactor() - (kKnownSize - 1) * GetFactorPerPow2(); + } + + // Get overhead factor (slots over number to add) for sufficiently large + // number of slots (by log base 2) + static inline double GetFactorForLarge(double log2_num_slots) { + return GetBaseFactor() + log2_num_slots * GetFactorPerPow2(); + } + + // For a given power of two number of slots (specified by whole number + // log base 2), implements GetNumToAdd for such limited case, returning + // double for better interpolation in GetNumToAdd and GetNumSlots. + static inline double GetNumToAddForPow2(uint32_t log2_num_slots) { + assert(log2_num_slots <= 32); // help clang-analyze + if (log2_num_slots < kKnownSize) { + return kKnownToAddByPow2[log2_num_slots]; + } else { + return 1.0 * (uint64_t{1} << log2_num_slots) / + GetFactorForLarge(1.0 * log2_num_slots); + } + } +}; + +// Based on data from FindOccupancy in ribbon_test +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 252.984, + 506.109, + 1013.71, + 2029.47, + 4060.43, + 8115.63, + 16202.2, + 32305.1, + 64383.5, + 128274, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 126.274, + 254.279, + 510.27, + 1022.24, + 2046.02, + 4091.99, + 8154.98, + 16244.3, + 32349.7, + 64426.6, + 128307, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 124.94, + 249.968, + 501.234, + 1004.06, + 2006.15, + 3997.89, + 7946.99, + 15778.4, + 31306.9, + 62115.3, + 123284, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 62.2683, + 126.259, + 254.268, + 509.975, + 1019.98, + 2026.16, + 4019.75, + 7969.8, + 15798.2, + 31330.3, + 62134.2, + 123255, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 248.851, + 499.532, + 1001.26, + 2003.97, + 4005.59, + 8000.39, + 15966.6, + 31828.1, + 63447.3, + 126506, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 122.637, + 250.651, + 506.625, + 1018.54, + 2036.43, + 4041.6, + 8039.25, + 16005, + 31869.6, + 63492.8, + 126537, + }}; + +template <> +const std::array + 
BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 120.659, + 243.346, + 488.168, + 976.373, + 1948.86, + 3875.85, + 7704.97, + 15312.4, + 30395.1, + 60321.8, + 119813, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 58.6016, + 122.619, + 250.641, + 503.595, + 994.165, + 1967.36, + 3898.17, + 7727.21, + 15331.5, + 30405.8, + 60376.2, + 119836, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 242.61, + 491.887, + 983.603, + 1968.21, + 3926.98, + 7833.99, + 15629, + 31199.9, + 62307.8, + 123870, + }}; + +template <> +const std::array BandingConfigHelperData< + kOneIn1000, 128U, /*smash*/ true>::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 117.19, + 245.105, + 500.748, + 1010.67, + 1993.4, + 3950.01, + 7863.31, + 15652, + 31262.1, + 62462.8, + 124095, +}}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 114, + 234.8, + 471.498, + 940.165, + 1874, + 3721.5, + 7387.5, + 14592, + 29160, + 57745, + 115082, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{ + { + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 53.0434, + 117, + 245.312, + 483.571, + 950.251, + 1878, + 3736.34, + 7387.97, + 14618, + 29142.9, + 57838.8, + 114932, + }}; + +// We hide these implementation details from the .h file with explicit +// instantiations below these partial specializations. + +template +uint32_t BandingConfigHelper1MaybeSupported< + kCfc, kCoeffBits, kUseSmash, kHomogeneous, + true /* kIsSupported */>::GetNumToAdd(uint32_t num_slots) { + using Data = detail::BandingConfigHelperData; + if (num_slots == 0) { + return 0; + } + uint32_t num_to_add; + double log2_num_slots = std::log(num_slots) * 1.4426950409; + uint32_t floor_log2 = static_cast(log2_num_slots); + if (floor_log2 + 1 < Data::kKnownSize) { + double ceil_portion = 1.0 * num_slots / (uint32_t{1} << floor_log2) - 1.0; + // Must be a supported number of slots + assert(Data::kKnownToAddByPow2[floor_log2] > 0.0); + // Weighted average of two nearest known data points + num_to_add = static_cast( + ceil_portion * Data::kKnownToAddByPow2[floor_log2 + 1] + + (1.0 - ceil_portion) * Data::kKnownToAddByPow2[floor_log2]); + } else { + // Use formula for large values + double factor = Data::GetFactorForLarge(log2_num_slots); + assert(factor >= 1.0); + num_to_add = static_cast(num_slots / factor); + } + if (kHomogeneous) { + // Even when standard filter construction would succeed, we might + // have loaded things up too much for Homogeneous filter. (Complete + // explanation not known but observed empirically.) This seems to + // correct for that, mostly affecting small filter configurations. 
+ if (num_to_add >= 8) { + num_to_add -= 8; + } else { + assert(false); + } + } + return num_to_add; +} + +template +uint32_t BandingConfigHelper1MaybeSupported< + kCfc, kCoeffBits, kUseSmash, kHomogeneous, + true /* kIsSupported */>::GetNumSlots(uint32_t num_to_add) { + using Data = detail::BandingConfigHelperData; + + if (num_to_add == 0) { + return 0; + } + if (kHomogeneous) { + // Reverse of above in GetNumToAdd + num_to_add += 8; + } + double log2_num_to_add = std::log(num_to_add) * 1.4426950409; + uint32_t approx_log2_slots = static_cast(log2_num_to_add + 0.5); + assert(approx_log2_slots <= 32); // help clang-analyze + + double lower_num_to_add = Data::GetNumToAddForPow2(approx_log2_slots); + double upper_num_to_add; + if (approx_log2_slots == 0 || lower_num_to_add == /* unsupported */ 0) { + // Return minimum non-zero slots in standard implementation + return kUseSmash ? kCoeffBits : 2 * kCoeffBits; + } else if (num_to_add < lower_num_to_add) { + upper_num_to_add = lower_num_to_add; + --approx_log2_slots; + lower_num_to_add = Data::GetNumToAddForPow2(approx_log2_slots); + } else { + upper_num_to_add = Data::GetNumToAddForPow2(approx_log2_slots + 1); + } + + assert(num_to_add >= lower_num_to_add); + assert(num_to_add < upper_num_to_add); + + double upper_portion = + (num_to_add - lower_num_to_add) / (upper_num_to_add - lower_num_to_add); + + double lower_num_slots = 1.0 * (uint64_t{1} << approx_log2_slots); + + // Interpolation, round up + return static_cast(upper_portion * lower_num_slots + + lower_num_slots + 0.999999999); +} + +// These explicit instantiations enable us to hide most of the +// implementation details from the .h file. (The .h file currently +// needs to determine whether settings are "supported" or not.) + +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; + +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; + +template struct BandingConfigHelper1MaybeSupported< + kOneIn1000, 128U, /*sm*/ false, /*hm*/ false, /*sup*/ true>; +template struct BandingConfigHelper1MaybeSupported< + kOneIn1000, 128U, /*sm*/ true, /*hm*/ false, /*sup*/ true>; +template struct BandingConfigHelper1MaybeSupported< + kOneIn1000, 128U, /*sm*/ false, /*hm*/ true, /*sup*/ true>; +template struct BandingConfigHelper1MaybeSupported< + kOneIn1000, 128U, /*sm*/ true, /*hm*/ true, /*sup*/ true>; +template struct BandingConfigHelper1MaybeSupported< + kOneIn1000, 64U, /*sm*/ false, /*hm*/ false, /*sup*/ true>; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported< + kOneIn1000, 64U, /*sm*/ false, /*hm*/ true, /*sup*/ true>; +template struct BandingConfigHelper1MaybeSupported; + +} // namespace detail + +} // namespace ribbon + +} // namespace ROCKSDB_NAMESPACE diff --git 
a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_config.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_config.h new file mode 100644 index 0000000000..0e3edf0734 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_config.h @@ -0,0 +1,182 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "port/lang.h" // for FALLTHROUGH_INTENDED +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +namespace ribbon { + +// RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly) +// +// ribbon_config.h: APIs for relating numbers of slots with numbers of +// additions for tolerable construction failure probabilities. This is +// separate from ribbon_impl.h because it might not be needed for +// some applications. +// +// This API assumes uint32_t for number of slots, as a single Ribbon +// linear system should not normally overflow that without big penalties. +// +// Template parameter kCoeffBits uses uint64_t for convenience in case it +// comes from size_t. +// +// Most of the complexity here is trying to optimize speed and +// compiled code size, using templates to minimize table look-ups and +// the compiled size of all linked look-up tables. Look-up tables are +// required because we don't have good formulas, and the data comes +// from running FindOccupancy in ribbon_test. + +// Represents a chosen chance of successful Ribbon construction for a single +// seed. Allowing higher chance of failed construction can reduce space +// overhead but takes extra time in construction. +enum ConstructionFailureChance { + kOneIn2, + kOneIn20, + // When using kHomogeneous==true, construction failure chance should + // not generally exceed target FP rate, so it unlikely useful to + // allow a higher "failure" chance. In some cases, even more overhead + // is appropriate. (TODO) + kOneIn1000, +}; + +namespace detail { + +// It is useful to compile ribbon_test linking to BandingConfigHelper with +// settings for which we do not have configuration data, as long as we don't +// run the code. This template hack supports that. +template +struct BandingConfigHelper1MaybeSupported { + public: + static uint32_t GetNumToAdd(uint32_t num_slots) { + // Unsupported + assert(num_slots == 0); + (void)num_slots; + return 0; + } + + static uint32_t GetNumSlots(uint32_t num_to_add) { + // Unsupported + assert(num_to_add == 0); + (void)num_to_add; + return 0; + } +}; + +// Base class for BandingConfigHelper1 and helper for BandingConfigHelper +// with core implementations built on above data +template +struct BandingConfigHelper1MaybeSupported< + kCfc, kCoeffBits, kUseSmash, kHomogeneous, true /* kIsSupported */> { + public: + // See BandingConfigHelper1. Implementation in ribbon_config.cc + static uint32_t GetNumToAdd(uint32_t num_slots); + + // See BandingConfigHelper1. 
Implementation in ribbon_config.cc + static uint32_t GetNumSlots(uint32_t num_to_add); +}; + +} // namespace detail + +template +struct BandingConfigHelper1 + : public detail::BandingConfigHelper1MaybeSupported< + kCfc, kCoeffBits, kUseSmash, kHomogeneous, + /* kIsSupported */ kCoeffBits == 64 || kCoeffBits == 128> { + public: + // Returns a number of entries that can be added to a given number of + // slots, with roughly kCfc chance of construction failure per seed, + // or better. Does NOT do rounding for InterleavedSoln; call + // RoundUpNumSlots for that. + // + // inherited: + // static uint32_t GetNumToAdd(uint32_t num_slots); + + // Returns a number of slots for a given number of entries to add + // that should have roughly kCfc chance of construction failure per + // seed, or better. Does NOT do rounding for InterleavedSoln; call + // RoundUpNumSlots for that. + // + // num_to_add should not exceed roughly 2/3rds of the maximum value + // of the uint32_t type to avoid overflow. + // + // inherited: + // static uint32_t GetNumSlots(uint32_t num_to_add); +}; + +// Configured using TypesAndSettings as in ribbon_impl.h +template +struct BandingConfigHelper1TS + : public BandingConfigHelper1< + kCfc, + /* kCoeffBits */ sizeof(typename TypesAndSettings::CoeffRow) * 8U, + TypesAndSettings::kUseSmash, TypesAndSettings::kHomogeneous> {}; + +// Like BandingConfigHelper1TS except failure chance can be a runtime rather +// than compile time value. +template +struct BandingConfigHelper { + public: + static constexpr ConstructionFailureChance kDefaultFailureChance = + TypesAndSettings::kHomogeneous ? kOneIn1000 : kOneIn20; + + static uint32_t GetNumToAdd( + uint32_t num_slots, + ConstructionFailureChance max_failure = kDefaultFailureChance) { + switch (max_failure) { + default: + assert(false); + FALLTHROUGH_INTENDED; + case kOneIn20: { + using H1 = BandingConfigHelper1TS; + return H1::GetNumToAdd(num_slots); + } + case kOneIn2: { + using H1 = BandingConfigHelper1TS; + return H1::GetNumToAdd(num_slots); + } + case kOneIn1000: { + using H1 = BandingConfigHelper1TS; + return H1::GetNumToAdd(num_slots); + } + } + } + + static uint32_t GetNumSlots( + uint32_t num_to_add, + ConstructionFailureChance max_failure = kDefaultFailureChance) { + switch (max_failure) { + default: + assert(false); + FALLTHROUGH_INTENDED; + case kOneIn20: { + using H1 = BandingConfigHelper1TS; + return H1::GetNumSlots(num_to_add); + } + case kOneIn2: { + using H1 = BandingConfigHelper1TS; + return H1::GetNumSlots(num_to_add); + } + case kOneIn1000: { + using H1 = BandingConfigHelper1TS; + return H1::GetNumSlots(num_to_add); + } + } + } +}; + +} // namespace ribbon + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_impl.h new file mode 100644 index 0000000000..0afecc67dd --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/ribbon_impl.h @@ -0,0 +1,1137 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include + +#include "port/port.h" // for PREFETCH +#include "util/fastrange.h" +#include "util/ribbon_alg.h" + +namespace ROCKSDB_NAMESPACE { + +namespace ribbon { + +// RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly) +// +// ribbon_impl.h: templated (parameterized) standard implementations +// +// Ribbon is a Perfect Hash Static Function construction useful as a compact +// static Bloom filter alternative. See ribbon_alg.h for core algorithms +// and core design details. +// +// TODO: more details on trade-offs and practical issues. +// +// APIs for configuring Ribbon are in ribbon_config.h + +// Ribbon implementations in this file take these parameters, which must be +// provided in a class/struct type with members expressed in this concept: + +// concept TypesAndSettings { +// // See RibbonTypes and *Hasher in ribbon_alg.h, except here we have +// // the added constraint that Hash be equivalent to either uint32_t or +// // uint64_t. +// typename Hash; +// typename CoeffRow; +// typename ResultRow; +// typename Index; +// typename Key; +// static constexpr bool kFirstCoeffAlwaysOne; +// +// // An unsigned integer type for identifying a hash seed, typically +// // uint32_t or uint64_t. Importantly, this is the amount of data +// // stored in memory for identifying a raw seed. See StandardHasher. +// typename Seed; +// +// // When true, the PHSF implements a static filter, expecting just +// // keys as inputs for construction. When false, implements a general +// // PHSF and expects std::pair as inputs for +// // construction. +// static constexpr bool kIsFilter; +// +// // When true, enables a special "homogeneous" filter implementation that +// // is slightly faster to construct, and never fails to construct though +// // FP rate can quickly explode in cases where corresponding +// // non-homogeneous filter would fail (or nearly fail?) to construct. +// // For smaller filters, you can configure with ConstructionFailureChance +// // smaller than desired FP rate to largely counteract this effect. +// // TODO: configuring Homogeneous Ribbon for arbitrarily large filters +// // based on data from OptimizeHomogAtScale +// static constexpr bool kHomogeneous; +// +// // When true, adds a tiny bit more hashing logic on queries and +// // construction to improve utilization at the beginning and end of +// // the structure. Recommended when CoeffRow is only 64 bits (or +// // less), so typical num_starts < 10k. Although this is compatible +// // with kHomogeneous, the competing space vs. time priorities might +// // not be useful. +// static constexpr bool kUseSmash; +// +// // When true, allows number of "starts" to be zero, for best support +// // of the "no keys to add" case by always returning false for filter +// // queries. (This is distinct from the "keys added but no space for +// // any data" case, in which a filter always returns true.) The cost +// // supporting this is a conditional branch (probably predictable) in +// // queries. +// static constexpr bool kAllowZeroStarts; +// +// // A seedable stock hash function on Keys. All bits of Hash must +// // be reasonably high quality. XXH functions recommended, but +// // Murmur, City, Farm, etc. also work. +// static Hash HashFn(const Key &, Seed raw_seed); +// }; + +// A bit of a hack to automatically construct the type for +// AddInput based on a constexpr bool. 
+template +struct AddInputSelector { + // For general PHSF, not filter + using T = std::pair; +}; + +template +struct AddInputSelector { + // For Filter + using T = Key; +}; + +// To avoid writing 'typename' everywhere that we use types like 'Index' +#define IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings) \ + using CoeffRow = typename TypesAndSettings::CoeffRow; \ + using ResultRow = typename TypesAndSettings::ResultRow; \ + using Index = typename TypesAndSettings::Index; \ + using Hash = typename TypesAndSettings::Hash; \ + using Key = typename TypesAndSettings::Key; \ + using Seed = typename TypesAndSettings::Seed; \ + \ + /* Some more additions */ \ + using QueryInput = Key; \ + using AddInput = typename ROCKSDB_NAMESPACE::ribbon::AddInputSelector< \ + Key, ResultRow, TypesAndSettings::kIsFilter>::T; \ + static constexpr auto kCoeffBits = \ + static_cast(sizeof(CoeffRow) * 8U); \ + \ + /* Export to algorithm */ \ + static constexpr bool kFirstCoeffAlwaysOne = \ + TypesAndSettings::kFirstCoeffAlwaysOne; \ + \ + static_assert(sizeof(CoeffRow) + sizeof(ResultRow) + sizeof(Index) + \ + sizeof(Hash) + sizeof(Key) + sizeof(Seed) + \ + sizeof(QueryInput) + sizeof(AddInput) + kCoeffBits + \ + kFirstCoeffAlwaysOne > \ + 0, \ + "avoid unused warnings, semicolon expected after macro call") + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4309) // cast truncating constant +#pragma warning(disable : 4307) // arithmetic constant overflow +#endif + +// StandardHasher: A standard implementation of concepts RibbonTypes, +// PhsfQueryHasher, FilterQueryHasher, and BandingHasher from ribbon_alg.h. +// +// This implementation should be suitable for most all practical purposes +// as it "behaves" across a wide range of settings, with little room left +// for improvement. The key functionality in this hasher is generating +// CoeffRows, starts, and (for filters) ResultRows, which could be ~150 +// bits of data or more, from a modest hash of 64 or even just 32 bits, with +// enough uniformity and bitwise independence to be close to "the best you +// can do" with available hash information in terms of FP rate and +// compactness. (64 bits recommended and sufficient for PHSF practical +// purposes.) +// +// Another feature of this hasher is a minimal "premixing" of seeds before +// they are provided to TypesAndSettings::HashFn in case that function does +// not provide sufficiently independent hashes when iterating merely +// sequentially on seeds. (This for example works around a problem with the +// preview version 0.7.2 of XXH3 used in RocksDB, a.k.a. XXPH3 or Hash64, and +// MurmurHash1 used in RocksDB, a.k.a. Hash.) We say this pre-mixing step +// translates "ordinal seeds," which we iterate sequentially to find a +// solution, into "raw seeds," with many more bits changing for each +// iteration. The translation is an easily reversible lightweight mixing, +// not suitable for hashing on its own. An advantage of this approach is that +// StandardHasher can store just the raw seed (e.g. 64 bits) for fast query +// times, while from the application perspective, we can limit to a small +// number of ordinal keys (e.g. 64 in 6 bits) for saving in metadata. +// +// The default constructor initializes the seed to ordinal seed zero, which +// is equal to raw seed zero. 
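+// (Illustrative aside, not from the upstream source: the intended seed flow
+// is to iterate small ordinal seeds during construction, store only the raw
+// seed for fast queries, and recover the ordinal seed for compact metadata.
+// A minimal sketch, assuming a hypothetical TypesAndSettings instantiation
+// 'MyTS':
+//
+//   StandardHasher<MyTS> hasher;
+//   hasher.SetOrdinalSeed(5);             // e.g. 6-bit ordinal seed
+//   auto raw = hasher.GetRawSeed();       // many more bits change per step
+//   hasher.SetRawSeed(raw);               // fast path used at query time
+//   assert(hasher.GetOrdinalSeed() == 5); // reversible mixing round-trips
+// )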
+// +template +class StandardHasher { + public: + IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings); + + inline Hash GetHash(const Key& key) const { + return TypesAndSettings::HashFn(key, raw_seed_); + }; + // For when AddInput == pair (kIsFilter == false) + inline Hash GetHash(const std::pair& bi) const { + return GetHash(bi.first); + }; + inline Index GetStart(Hash h, Index num_starts) const { + // This is "critical path" code because it's required before memory + // lookup. + // + // FastRange gives us a fast and effective mapping from h to the + // appropriate range. This depends most, sometimes exclusively, on + // upper bits of h. + // + if (TypesAndSettings::kUseSmash) { + // Extra logic to "smash" entries at beginning and end, for + // better utilization. For example, without smash and with + // kFirstCoeffAlwaysOne, there's about a 30% chance that the + // first slot in the banding will be unused, and worse without + // kFirstCoeffAlwaysOne. The ending slots are even less utilized + // without smash. + // + // But since this only affects roughly kCoeffBits of the slots, + // it's usually small enough to be ignorable (less computation in + // this function) when number of slots is roughly 10k or larger. + // + // The best values for these smash weights might depend on how + // densely you're packing entries, and also kCoeffBits, but this + // seems to work well for roughly 95% success probability. + // + constexpr Index kFrontSmash = kCoeffBits / 4; + constexpr Index kBackSmash = kCoeffBits / 4; + Index start = FastRangeGeneric(h, num_starts + kFrontSmash + kBackSmash); + start = std::max(start, kFrontSmash); + start -= kFrontSmash; + start = std::min(start, num_starts - 1); + return start; + } else { + // For query speed, we allow small number of initial and final + // entries to be under-utilized. + // NOTE: This call statically enforces that Hash is equivalent to + // either uint32_t or uint64_t. + return FastRangeGeneric(h, num_starts); + } + } + inline CoeffRow GetCoeffRow(Hash h) const { + // This is not so much "critical path" code because it can be done in + // parallel (instruction level) with memory lookup. + // + // When we might have many entries squeezed into a single start, + // we need reasonably good remixing for CoeffRow. + if (TypesAndSettings::kUseSmash) { + // Reasonably good, reasonably fast, reasonably general. + // Probably not 1:1 but probably close enough. + Unsigned128 a = Multiply64to128(h, kAltCoeffFactor1); + Unsigned128 b = Multiply64to128(h, kAltCoeffFactor2); + auto cr = static_cast(b ^ (a << 64) ^ (a >> 64)); + + // Now ensure the value is non-zero + if (kFirstCoeffAlwaysOne) { + cr |= 1; + } else { + // Still have to ensure some bit is non-zero + cr |= (cr == 0) ? 1 : 0; + } + return cr; + } + // If not kUseSmash, we ensure we're not squeezing many entries into a + // single start, in part by ensuring num_starts > num_slots / 2. Thus, + // here we do not need good remixing for CoeffRow, but just enough that + // (a) every bit is reasonably independent from Start. + // (b) every Hash-length bit subsequence of the CoeffRow has full or + // nearly full entropy from h. + // (c) if nontrivial bit subsequences within are correlated, it needs to + // be more complicated than exact copy or bitwise not (at least without + // kFirstCoeffAlwaysOne), or else there seems to be a kind of + // correlated clustering effect. + // (d) the CoeffRow is not zero, so that no one input on its own can + // doom construction success. 
(Preferably a mix of 1's and 0's if + // satisfying above.) + + // First, establish sufficient bitwise independence from Start, with + // multiplication by a large random prime. + // Note that we cast to Hash because if we use product bits beyond + // original input size, that's going to correlate with Start (FastRange) + // even with a (likely) different multiplier here. + Hash a = h * kCoeffAndResultFactor; + + static_assert( + sizeof(Hash) == sizeof(uint64_t) || sizeof(Hash) == sizeof(uint32_t), + "Supported sizes"); + // If that's big enough, we're done. If not, we have to expand it, + // maybe up to 4x size. + uint64_t b; + if (sizeof(Hash) < sizeof(uint64_t)) { + // Almost-trivial hash expansion (OK - see above), favoring roughly + // equal number of 1's and 0's in result + b = (uint64_t{a} << 32) ^ (a ^ kCoeffXor32); + } else { + b = a; + } + static_assert(sizeof(CoeffRow) <= sizeof(Unsigned128), "Supported sizes"); + Unsigned128 c; + if (sizeof(uint64_t) < sizeof(CoeffRow)) { + // Almost-trivial hash expansion (OK - see above), favoring roughly + // equal number of 1's and 0's in result + c = (Unsigned128{b} << 64) ^ (b ^ kCoeffXor64); + } else { + c = b; + } + auto cr = static_cast(c); + + // Now ensure the value is non-zero + if (kFirstCoeffAlwaysOne) { + cr |= 1; + } else if (sizeof(CoeffRow) == sizeof(Hash)) { + // Still have to ensure some bit is non-zero + cr |= (cr == 0) ? 1 : 0; + } else { + // (We did trivial expansion with constant xor, which ensures some + // bits are non-zero.) + } + return cr; + } + inline ResultRow GetResultRowMask() const { + // TODO: will be used with InterleavedSolutionStorage? + // For now, all bits set (note: might be a small type so might need to + // narrow after promotion) + return static_cast(~ResultRow{0}); + } + inline ResultRow GetResultRowFromHash(Hash h) const { + if (TypesAndSettings::kIsFilter && !TypesAndSettings::kHomogeneous) { + // This is not so much "critical path" code because it can be done in + // parallel (instruction level) with memory lookup. + // + // ResultRow bits only needs to be independent from CoeffRow bits if + // many entries might have the same start location, where "many" is + // comparable to number of hash bits or kCoeffBits. If !kUseSmash + // and num_starts > kCoeffBits, it is safe and efficient to draw from + // the same bits computed for CoeffRow, which are reasonably + // independent from Start. (Inlining and common subexpression + // elimination with GetCoeffRow should make this + // a single shared multiplication in generated code when !kUseSmash.) + Hash a = h * kCoeffAndResultFactor; + + // The bits here that are *most* independent of Start are the highest + // order bits (as in Knuth multiplicative hash). To make those the + // most preferred for use in the result row, we do a bswap here. + auto rr = static_cast(EndianSwapValue(a)); + return rr & GetResultRowMask(); + } else { + // Must be zero + return 0; + } + } + // For when AddInput == Key (kIsFilter == true) + inline ResultRow GetResultRowFromInput(const Key&) const { + // Must be zero + return 0; + } + // For when AddInput == pair (kIsFilter == false) + inline ResultRow GetResultRowFromInput( + const std::pair& bi) const { + // Simple extraction + return bi.second; + } + + // Seed tracking APIs - see class comment + void SetRawSeed(Seed seed) { raw_seed_ = seed; } + Seed GetRawSeed() { return raw_seed_; } + void SetOrdinalSeed(Seed count) { + // A simple, reversible mixing of any size (whole bytes) up to 64 bits. 
+ // This allows casting the raw seed to any smaller size we use for + // ordinal seeds without risk of duplicate raw seeds for unique ordinal + // seeds. + + // Seed type might be smaller than numerical promotion size, but Hash + // should be at least that size, so we use Hash as intermediate type. + static_assert(sizeof(Seed) <= sizeof(Hash), + "Hash must be at least size of Seed"); + + // Multiply by a large random prime (one-to-one for any prefix of bits) + Hash tmp = count * kToRawSeedFactor; + // Within-byte one-to-one mixing + static_assert((kSeedMixMask & (kSeedMixMask >> kSeedMixShift)) == 0, + "Illegal mask+shift"); + tmp ^= (tmp & kSeedMixMask) >> kSeedMixShift; + raw_seed_ = static_cast(tmp); + // dynamic verification + assert(GetOrdinalSeed() == count); + } + Seed GetOrdinalSeed() { + Hash tmp = raw_seed_; + // Within-byte one-to-one mixing (its own inverse) + tmp ^= (tmp & kSeedMixMask) >> kSeedMixShift; + // Multiply by 64-bit multiplicative inverse + static_assert(kToRawSeedFactor * kFromRawSeedFactor == Hash{1}, + "Must be inverses"); + return static_cast(tmp * kFromRawSeedFactor); + } + + protected: + // For expanding hash: + // large random prime + static constexpr Hash kCoeffAndResultFactor = + static_cast(0xc28f82822b650bedULL); + static constexpr uint64_t kAltCoeffFactor1 = 0x876f170be4f1fcb9U; + static constexpr uint64_t kAltCoeffFactor2 = 0xf0433a4aecda4c5fU; + // random-ish data + static constexpr uint32_t kCoeffXor32 = 0xa6293635U; + static constexpr uint64_t kCoeffXor64 = 0xc367844a6e52731dU; + + // For pre-mixing seeds + static constexpr Hash kSeedMixMask = static_cast(0xf0f0f0f0f0f0f0f0ULL); + static constexpr unsigned kSeedMixShift = 4U; + static constexpr Hash kToRawSeedFactor = + static_cast(0xc78219a23eeadd03ULL); + static constexpr Hash kFromRawSeedFactor = + static_cast(0xfe1a137d14b475abULL); + + // See class description + Seed raw_seed_ = 0; +}; + +// StandardRehasher (and StandardRehasherAdapter): A variant of +// StandardHasher that uses the same type for keys as for hashes. +// This is primarily intended for building a Ribbon filter +// from existing hashes without going back to original inputs in +// order to apply a different seed. This hasher seeds a 1-to-1 mixing +// transformation to apply a seed to an existing hash. (Untested for +// hash-sized keys that are not already uniformly distributed.) This +// transformation builds on the seed pre-mixing done in StandardHasher. +// +// Testing suggests essentially no degradation of solution success rate +// vs. going back to original inputs when changing hash seeds. For example: +// Average re-seeds for solution with r=128, 1.02x overhead, and ~100k keys +// is about 1.10 for both StandardHasher and StandardRehasher. +// +// StandardRehasher is not really recommended for general PHSFs (not +// filters) because a collision in the original hash could prevent +// construction despite re-seeding the Rehasher. (Such collisions +// do not interfere with filter construction.) +// +// concept RehasherTypesAndSettings: like TypesAndSettings but +// does not require Key or HashFn. 
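+// (Illustrative aside on the seed pre-mixing in SetOrdinalSeed and
+// GetOrdinalSeed above; not part of the upstream source. The within-byte
+// mix, x ^= (x & kSeedMixMask) >> kSeedMixShift, is its own inverse because
+// the mask bits are disjoint from the bits it modifies (as the static_assert
+// above requires). A nibble-sized worked example with mask 0xf0, shift 4:
+//   x        = 0b1011'0110
+//   x & mask = 0b1011'0000, >> 4 = 0b0000'1011
+//   mixed    = 0b1011'1101
+// and re-applying the same step restores 0b1011'0110. Self-contained check:
+//
+//   #include <cassert>
+//   #include <cstdint>
+//   int main() {
+//     const uint64_t kMask = 0xf0f0f0f0f0f0f0f0ULL;
+//     const unsigned kShift = 4;
+//     uint64_t x = 0xdeadbeefcafef00dULL;  // arbitrary test value
+//     uint64_t mixed = x ^ ((x & kMask) >> kShift);
+//     uint64_t back = mixed ^ ((mixed & kMask) >> kShift);
+//     assert(back == x);                   // the mixing is an involution
+//   }
+// )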
+template <class RehasherTypesAndSettings>
+class StandardRehasherAdapter : public RehasherTypesAndSettings {
+ public:
+  using Hash = typename RehasherTypesAndSettings::Hash;
+  using Key = Hash;
+  using Seed = typename RehasherTypesAndSettings::Seed;
+
+  static Hash HashFn(const Hash& input, Seed raw_seed) {
+    // Note: raw_seed is already lightly pre-mixed, and this multiplication
+    // by a large prime is sufficient mixing (low-to-high bits) on top of
+    // that for good FastRange results, which depends primarily on highest
+    // bits. (The hashed CoeffRow and ResultRow are less sensitive to
+    // mixing than Start.)
+    // Also note: did consider adding ^ (input >> some) before the
+    // multiplication, but doesn't appear to be necessary.
+    return (input ^ raw_seed) * kRehashFactor;
+  }
+
+ private:
+  static constexpr Hash kRehashFactor =
+      static_cast<Hash>(0x6193d459236a3a0dULL);
+};
+
+// See comment on StandardRehasherAdapter
+template <class RehasherTypesAndSettings>
+using StandardRehasher =
+    StandardHasher<StandardRehasherAdapter<RehasherTypesAndSettings>>;
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// Especially with smaller hashes (e.g. 32 bit), there can be noticeable
+// false positives due to collisions in the Hash returned by GetHash.
+// This function returns the expected FP rate due to those collisions,
+// which can be added to the expected FP rate from the underlying data
+// structure. (Note: technically, a + b is only a good approximation of
+// 1-(1-a)(1-b) == a + b - a*b, if a and b are much closer to 0 than to 1.)
+// The number of entries added can be a double here in case it's an
+// average.
+template <typename Hasher, typename Numerical>
+double ExpectedCollisionFpRate(const Hasher& hasher, Numerical added) {
+  // Standardize on the 'double' specialization
+  return ExpectedCollisionFpRate(hasher, 1.0 * added);
+}
+template <typename Hasher>
+double ExpectedCollisionFpRate(const Hasher& /*hasher*/, double added) {
+  // Technically, there could be overlap among the added, but ignoring that
+  // is typically close enough.
+  return added / std::pow(256.0, sizeof(typename Hasher::Hash));
+}
+
+// StandardBanding: a canonical implementation of BandingStorage and
+// BacktrackStorage, with convenience API for banding (solving with
+// on-the-fly Gaussian elimination) with and without backtracking.
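+// (Illustrative aside on ExpectedCollisionFpRate above; not part of the
+// upstream source. For a 32-bit Hash and one million added keys, the extra
+// FP rate from raw hash collisions is 1e6 / 256^4 = 1e6 / 2^32, roughly
+// 0.023%, which is why 64-bit hashes are recommended when the target FP
+// rate is small:
+//
+//   #include <cassert>
+//   #include <cmath>
+//   int main() {
+//     double added = 1.0e6;
+//     double fp32 = added / std::pow(256.0, 4);  // sizeof(uint32_t) == 4
+//     double fp64 = added / std::pow(256.0, 8);  // sizeof(uint64_t) == 8
+//     assert(fp32 > 2.3e-4 && fp32 < 2.4e-4);
+//     assert(fp64 < 1e-12);                      // negligible
+//   }
+// )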
+template +class StandardBanding : public StandardHasher { + public: + IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings); + + StandardBanding(Index num_slots = 0, Index backtrack_size = 0) { + Reset(num_slots, backtrack_size); + } + + void Reset(Index num_slots, Index backtrack_size = 0) { + if (num_slots == 0) { + // Unusual (TypesAndSettings::kAllowZeroStarts) or "uninitialized" + num_starts_ = 0; + } else { + // Normal + assert(num_slots >= kCoeffBits); + if (num_slots > num_slots_allocated_) { + coeff_rows_.reset(new CoeffRow[num_slots]()); + if (!TypesAndSettings::kHomogeneous) { + // Note: don't strictly have to zero-init result_rows, + // except possible information leakage, etc ;) + result_rows_.reset(new ResultRow[num_slots]()); + } + num_slots_allocated_ = num_slots; + } else { + for (Index i = 0; i < num_slots; ++i) { + coeff_rows_[i] = 0; + if (!TypesAndSettings::kHomogeneous) { + // Note: don't strictly have to zero-init result_rows, + // except possible information leakage, etc ;) + result_rows_[i] = 0; + } + } + } + num_starts_ = num_slots - kCoeffBits + 1; + } + EnsureBacktrackSize(backtrack_size); + } + + void EnsureBacktrackSize(Index backtrack_size) { + if (backtrack_size > backtrack_size_) { + backtrack_.reset(new Index[backtrack_size]); + backtrack_size_ = backtrack_size; + } + } + + // ******************************************************************** + // From concept BandingStorage + + inline bool UsePrefetch() const { + // A rough guesstimate of when prefetching during construction pays off. + // TODO: verify/validate + return num_starts_ > 1500; + } + inline void Prefetch(Index i) const { + PREFETCH(&coeff_rows_[i], 1 /* rw */, 1 /* locality */); + if (!TypesAndSettings::kHomogeneous) { + PREFETCH(&result_rows_[i], 1 /* rw */, 1 /* locality */); + } + } + inline void LoadRow(Index i, CoeffRow* cr, ResultRow* rr, + bool for_back_subst) const { + *cr = coeff_rows_[i]; + if (TypesAndSettings::kHomogeneous) { + if (for_back_subst && *cr == 0) { + // Cheap pseudorandom data to fill unconstrained solution rows + *rr = static_cast(i * 0x9E3779B185EBCA87ULL); + } else { + *rr = 0; + } + } else { + *rr = result_rows_[i]; + } + } + inline void StoreRow(Index i, CoeffRow cr, ResultRow rr) { + coeff_rows_[i] = cr; + if (TypesAndSettings::kHomogeneous) { + assert(rr == 0); + } else { + result_rows_[i] = rr; + } + } + inline Index GetNumStarts() const { return num_starts_; } + + // from concept BacktrackStorage, for when backtracking is used + inline bool UseBacktrack() const { return true; } + inline void BacktrackPut(Index i, Index to_save) { backtrack_[i] = to_save; } + inline Index BacktrackGet(Index i) const { return backtrack_[i]; } + + // ******************************************************************** + // Some useful API, still somewhat low level. Here an input is + // a Key for filters, or std::pair for general PHSF. + + // Adds a range of inputs to the banding, returning true if successful. + // False means none or some may have been successfully added, so it's + // best to Reset this banding before any further use. + // + // Adding can fail even before all the "slots" are completely "full". + // + template + bool AddRange(InputIterator begin, InputIterator end) { + assert(num_starts_ > 0 || TypesAndSettings::kAllowZeroStarts); + if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) { + // Unusual. Can't add any in this case. 
+ return begin == end; + } + // Normal + return BandingAddRange(this, *this, begin, end); + } + + // Adds a range of inputs to the banding, returning true if successful, + // or if unsuccessful, rolls back to state before this call and returns + // false. Caller guarantees that the number of inputs in this batch + // does not exceed `backtrack_size` provided to Reset. + // + // Adding can fail even before all the "slots" are completely "full". + // + template + bool AddRangeOrRollBack(InputIterator begin, InputIterator end) { + assert(num_starts_ > 0 || TypesAndSettings::kAllowZeroStarts); + if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) { + // Unusual. Can't add any in this case. + return begin == end; + } + // else Normal + return BandingAddRange(this, this, *this, begin, end); + } + + // Adds a single input to the banding, returning true if successful. + // If unsuccessful, returns false and banding state is unchanged. + // + // Adding can fail even before all the "slots" are completely "full". + // + bool Add(const AddInput& input) { + // Pointer can act as iterator + return AddRange(&input, &input + 1); + } + + // Return the number of "occupied" rows (with non-zero coefficients stored). + Index GetOccupiedCount() const { + Index count = 0; + if (num_starts_ > 0) { + const Index num_slots = num_starts_ + kCoeffBits - 1; + for (Index i = 0; i < num_slots; ++i) { + if (coeff_rows_[i] != 0) { + ++count; + } + } + } + return count; + } + + // Returns whether a row is "occupied" in the banding (non-zero + // coefficients stored). (Only recommended for debug/test) + bool IsOccupied(Index i) { return coeff_rows_[i] != 0; } + + // ******************************************************************** + // High-level API + + // Iteratively (a) resets the structure for `num_slots`, (b) attempts + // to add the range of inputs, and (c) if unsuccessful, chooses next + // hash seed, until either successful or unsuccessful with all the + // allowed seeds. Returns true if successful. In that case, use + // GetOrdinalSeed() or GetRawSeed() to get the successful seed. + // + // The allowed sequence of hash seeds is determined by + // `starting_ordinal_seed,` the first ordinal seed to be attempted + // (see StandardHasher), and `ordinal_seed_mask,` a bit mask (power of + // two minus one) for the range of ordinal seeds to consider. The + // max number of seeds considered will be ordinal_seed_mask + 1. + // For filters we suggest `starting_ordinal_seed` be chosen randomly + // or round-robin, to minimize false positive correlations between keys. + // + // If unsuccessful, how best to continue is going to be application + // specific. It should be possible to choose parameters such that + // failure is extremely unlikely, using max_seed around 32 to 64. + // (TODO: APIs to help choose parameters) One option for fallback in + // constructing a filter is to construct a Bloom filter instead. + // Increasing num_slots is an option, but should not be used often + // unless construction maximum latency is a concern (rather than + // average running time of construction). Instead, choose parameters + // appropriately and trust that seeds are independent. (Also, + // increasing num_slots without changing hash seed would have a + // significant correlation in success, rather than independence.) 
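+  // (Illustrative aside, not from the upstream source: a typical
+  // construction loop for ResetAndFindSeedToSolve below, assuming a
+  // hypothetical filter-style TypesAndSettings 'MyTS' and a hypothetical
+  // std::vector 'keys' of inputs:
+  //
+  //   StandardBanding<MyTS> banding;
+  //   Index num_slots = 10240;  // e.g. from BandingConfigHelper, rounded up
+  //   if (banding.ResetAndFindSeedToSolve(num_slots, keys.begin(),
+  //                                       keys.end(),
+  //                                       /*starting_ordinal_seed=*/7,
+  //                                       /*ordinal_seed_mask=*/63)) {
+  //     Seed s = banding.GetOrdinalSeed();  // persist with the solution
+  //   } else {
+  //     // fall back, e.g. to a Bloom filter, as suggested above
+  //   }
+  // )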
+ template + bool ResetAndFindSeedToSolve(Index num_slots, InputIterator begin, + InputIterator end, + Seed starting_ordinal_seed = 0U, + Seed ordinal_seed_mask = 63U) { + // power of 2 minus 1 + assert((ordinal_seed_mask & (ordinal_seed_mask + 1)) == 0); + // starting seed is within mask + assert((starting_ordinal_seed & ordinal_seed_mask) == + starting_ordinal_seed); + starting_ordinal_seed &= ordinal_seed_mask; // if not debug + + Seed cur_ordinal_seed = starting_ordinal_seed; + do { + StandardHasher::SetOrdinalSeed(cur_ordinal_seed); + Reset(num_slots); + bool success = AddRange(begin, end); + if (success) { + return true; + } + cur_ordinal_seed = (cur_ordinal_seed + 1) & ordinal_seed_mask; + } while (cur_ordinal_seed != starting_ordinal_seed); + // Reached limit by circling around + return false; + } + + static std::size_t EstimateMemoryUsage(uint32_t num_slots) { + std::size_t bytes_coeff_rows = num_slots * sizeof(CoeffRow); + std::size_t bytes_result_rows = num_slots * sizeof(ResultRow); + std::size_t bytes_backtrack = 0; + std::size_t bytes_banding = + bytes_coeff_rows + bytes_result_rows + bytes_backtrack; + + return bytes_banding; + } + + protected: + // TODO: explore combining in a struct + std::unique_ptr coeff_rows_; + std::unique_ptr result_rows_; + // We generally store "starts" instead of slots for speed of GetStart(), + // as in StandardHasher. + Index num_starts_ = 0; + Index num_slots_allocated_ = 0; + std::unique_ptr backtrack_; + Index backtrack_size_ = 0; +}; + +// Implements concept SimpleSolutionStorage, mostly for demonstration +// purposes. This is "in memory" only because it does not handle byte +// ordering issues for serialization. +template +class InMemSimpleSolution { + public: + IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings); + + void PrepareForNumStarts(Index num_starts) { + if (TypesAndSettings::kAllowZeroStarts && num_starts == 0) { + // Unusual + num_starts_ = 0; + } else { + // Normal + const Index num_slots = num_starts + kCoeffBits - 1; + assert(num_slots >= kCoeffBits); + if (num_slots > num_slots_allocated_) { + // Do not need to init the memory + solution_rows_.reset(new ResultRow[num_slots]); + num_slots_allocated_ = num_slots; + } + num_starts_ = num_starts; + } + } + + Index GetNumStarts() const { return num_starts_; } + + ResultRow Load(Index slot_num) const { return solution_rows_[slot_num]; } + + void Store(Index slot_num, ResultRow solution_row) { + solution_rows_[slot_num] = solution_row; + } + + // ******************************************************************** + // High-level API + + template + void BackSubstFrom(const BandingStorage& bs) { + if (TypesAndSettings::kAllowZeroStarts && bs.GetNumStarts() == 0) { + // Unusual + PrepareForNumStarts(0); + } else { + // Normal + SimpleBackSubst(this, bs); + } + } + + template + ResultRow PhsfQuery(const Key& input, const PhsfQueryHasher& hasher) const { + // assert(!TypesAndSettings::kIsFilter); Can be useful in testing + if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) { + // Unusual + return 0; + } else { + // Normal + return SimplePhsfQuery(input, hasher, *this); + } + } + + template + bool FilterQuery(const Key& input, const FilterQueryHasher& hasher) const { + assert(TypesAndSettings::kIsFilter); + if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) { + // Unusual. Zero starts presumes no keys added -> always false + return false; + } else { + // Normal, or upper_num_columns_ == 0 means "no space for data" and + // thus will always return true. 
+      return SimpleFilterQuery(input, hasher, *this);
+    }
+  }
+
+  double ExpectedFpRate() const {
+    assert(TypesAndSettings::kIsFilter);
+    if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) {
+      // Unusual, but we don't have FPs if we always return false.
+      return 0.0;
+    }
+    // else Normal
+
+    // Each result (solution) bit (column) cuts FP rate in half
+    return std::pow(0.5, 8U * sizeof(ResultRow));
+  }
+
+  // ********************************************************************
+  // Static high-level API
+
+  // Round up to a number of slots supported by this structure. Note that
+  // this must be taken into account for the banding if this solution
+  // layout/storage is to be used.
+  static Index RoundUpNumSlots(Index num_slots) {
+    // Must be at least kCoeffBits for at least one start
+    // Or if not smash, even more because hashing not equipped
+    // for stacking up so many entries on a single start location
+    auto min_slots = kCoeffBits * (TypesAndSettings::kUseSmash ? 1 : 2);
+    return std::max(num_slots, static_cast<Index>(min_slots));
+  }
+
+ protected:
+  // We generally store "starts" instead of slots for speed of GetStart(),
+  // as in StandardHasher.
+  Index num_starts_ = 0;
+  Index num_slots_allocated_ = 0;
+  std::unique_ptr<ResultRow[]> solution_rows_;
+};
+
+// Implements concept InterleavedSolutionStorage always using little-endian
+// byte order, so easy for serialization/deserialization. This implementation
+// fully supports fractional bits per key, where any number of segments
+// (number of bytes multiple of sizeof(CoeffRow)) can be used with any number
+// of slots that is a multiple of kCoeffBits.
+//
+// The structure is passed an externally allocated/de-allocated byte buffer
+// that is optionally pre-populated (from storage) for answering queries,
+// or can be populated by BackSubstFrom.
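+//
+// For example (an illustrative sketch, not part of the upstream docs;
+// `MySettings` is a hypothetical TypesAndSettings, `banding` a solved
+// StandardBanding, and `hasher` a filter-query hasher using the same seed):
+//
+//   std::unique_ptr<char[]> buf(new char[bytes]());
+//   SerializableInterleavedSolution<MySettings> soln(buf.get(), bytes);
+//   soln.ConfigureForNumSlots(num_slots);  // when loading buf from storage
+//   soln.BackSubstFrom(banding);           // when populating from a banding
+//   bool maybe_present = soln.FilterQuery(some_key, hasher);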
+//
+template <class TypesAndSettings>
+class SerializableInterleavedSolution {
+ public:
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings);
+
+  // Does not take ownership of `data` but uses it (up to `data_len` bytes)
+  // throughout lifetime
+  SerializableInterleavedSolution(char* data, size_t data_len)
+      : data_(data), data_len_(data_len) {}
+
+  void PrepareForNumStarts(Index num_starts) {
+    assert(num_starts == 0 || (num_starts % kCoeffBits == 1));
+    num_starts_ = num_starts;
+
+    InternalConfigure();
+  }
+
+  Index GetNumStarts() const { return num_starts_; }
+
+  Index GetNumBlocks() const {
+    const Index num_slots = num_starts_ + kCoeffBits - 1;
+    return num_slots / kCoeffBits;
+  }
+
+  Index GetUpperNumColumns() const { return upper_num_columns_; }
+
+  Index GetUpperStartBlock() const { return upper_start_block_; }
+
+  Index GetNumSegments() const {
+    return static_cast<Index>(data_len_ / sizeof(CoeffRow));
+  }
+
+  CoeffRow LoadSegment(Index segment_num) const {
+    assert(data_ != nullptr);  // suppress clang analyzer report
+    return DecodeFixedGeneric<CoeffRow>(data_ + segment_num * sizeof(CoeffRow));
+  }
+  void StoreSegment(Index segment_num, CoeffRow val) {
+    assert(data_ != nullptr);  // suppress clang analyzer report
+    EncodeFixedGeneric(data_ + segment_num * sizeof(CoeffRow), val);
+  }
+  void PrefetchSegmentRange(Index begin_segment_num,
+                            Index end_segment_num) const {
+    if (end_segment_num == begin_segment_num) {
+      // Nothing to do
+      return;
+    }
+    char* cur = data_ + begin_segment_num * sizeof(CoeffRow);
+    char* last = data_ + (end_segment_num - 1) * sizeof(CoeffRow);
+    while (cur < last) {
+      PREFETCH(cur, 0 /* rw */, 1 /* locality */);
+      cur += CACHE_LINE_SIZE;
+    }
+    PREFETCH(last, 0 /* rw */, 1 /* locality */);
+  }
+
+  // ********************************************************************
+  // High-level API
+
+  void ConfigureForNumBlocks(Index num_blocks) {
+    if (num_blocks == 0) {
+      PrepareForNumStarts(0);
+    } else {
+      PrepareForNumStarts(num_blocks * kCoeffBits - kCoeffBits + 1);
+    }
+  }
+
+  void ConfigureForNumSlots(Index num_slots) {
+    assert(num_slots % kCoeffBits == 0);
+    ConfigureForNumBlocks(num_slots / kCoeffBits);
+  }
+
+  template <typename BandingStorage>
+  void BackSubstFrom(const BandingStorage& bs) {
+    if (TypesAndSettings::kAllowZeroStarts && bs.GetNumStarts() == 0) {
+      // Unusual
+      PrepareForNumStarts(0);
+    } else {
+      // Normal
+      InterleavedBackSubst(this, bs);
+    }
+  }
+
+  template <typename PhsfQueryHasher>
+  ResultRow PhsfQuery(const Key& input, const PhsfQueryHasher& hasher) const {
+    // assert(!TypesAndSettings::kIsFilter);  Can be useful in testing
+    if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) {
+      // Unusual
+      return 0;
+    } else {
+      // Normal
+      // NOTE: not using a struct to encourage compiler optimization
+      Hash hash;
+      Index segment_num;
+      Index num_columns;
+      Index start_bit;
+      InterleavedPrepareQuery(input, hasher, *this, &hash, &segment_num,
+                              &num_columns, &start_bit);
+      return InterleavedPhsfQuery(hash, segment_num, num_columns, start_bit,
+                                  hasher, *this);
+    }
+  }
+
+  template <typename FilterQueryHasher>
+  bool FilterQuery(const Key& input, const FilterQueryHasher& hasher) const {
+    assert(TypesAndSettings::kIsFilter);
+    if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) {
+      // Unusual. Zero starts presumes no keys added -> always false
+      return false;
+    } else {
+      // Normal, or upper_num_columns_ == 0 means "no space for data" and
+      // thus will always return true.
+      // NOTE: not using a struct to encourage compiler optimization
+      Hash hash;
+      Index segment_num;
+      Index num_columns;
+      Index start_bit;
+      InterleavedPrepareQuery(input, hasher, *this, &hash, &segment_num,
+                              &num_columns, &start_bit);
+      return InterleavedFilterQuery(hash, segment_num, num_columns, start_bit,
+                                    hasher, *this);
+    }
+  }
+
+  double ExpectedFpRate() const {
+    assert(TypesAndSettings::kIsFilter);
+    if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) {
+      // Unusual. Zero starts presumes no keys added -> always false
+      return 0.0;
+    }
+    // else Normal
+
+    // Note: Ignoring smash setting; still close enough in that case
+    double lower_portion =
+        (upper_start_block_ * 1.0 * kCoeffBits) / num_starts_;
+
+    // Each result (solution) bit (column) cuts FP rate in half. Weight that
+    // for upper and lower number of bits (columns).
+    return lower_portion * std::pow(0.5, upper_num_columns_ - 1) +
+           (1.0 - lower_portion) * std::pow(0.5, upper_num_columns_);
+  }
+
+  // ********************************************************************
+  // Static high-level API
+
+  // Round up to a number of slots supported by this structure. Note that
+  // this must be taken into account for the banding if this solution
+  // layout/storage is to be used.
+  static Index RoundUpNumSlots(Index num_slots) {
+    // Must be multiple of kCoeffBits
+    Index corrected = (num_slots + kCoeffBits - 1) / kCoeffBits * kCoeffBits;
+
+    // Do not use num_starts==1 unless kUseSmash, because the hashing
+    // might not be equipped for stacking up so many entries on a
+    // single start location.
+    if (!TypesAndSettings::kUseSmash && corrected == kCoeffBits) {
+      corrected += kCoeffBits;
+    }
+    return corrected;
+  }
+
+  // Round down to a number of slots supported by this structure. Note that
+  // this must be taken into account for the banding if this solution
+  // layout/storage is to be used.
+  static Index RoundDownNumSlots(Index num_slots) {
+    // Must be multiple of kCoeffBits
+    Index corrected = num_slots / kCoeffBits * kCoeffBits;
+
+    // Do not use num_starts==1 unless kUseSmash, because the hashing
+    // might not be equipped for stacking up so many entries on a
+    // single start location.
+    if (!TypesAndSettings::kUseSmash && corrected == kCoeffBits) {
+      corrected = 0;
+    }
+    return corrected;
+  }
+
+  // Compute the number of bytes for a given number of slots and desired
+  // FP rate. Since desired FP rate might not be exactly achievable,
+  // rounding_bias32==0 means to always round toward lower FP rate
+  // than desired (more bytes); rounding_bias32==max uint32_t means always
+  // round toward higher FP rate than desired (fewer bytes); other values
+  // act as a proportional threshold or bias between the two.
+  static size_t GetBytesForFpRate(Index num_slots, double desired_fp_rate,
+                                  uint32_t rounding_bias32) {
+    return InternalGetBytesForFpRate(num_slots, desired_fp_rate,
+                                     1.0 / desired_fp_rate, rounding_bias32);
+  }
+
+  // The same, but specifying desired accuracy as 1.0 / FP rate, or
+  // one_in_fp_rate. E.g. desired_one_in_fp_rate=100 means 1% FP rate.
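+  //
+  // For example (an illustrative worked case, not from the upstream docs;
+  // `InterleavedSoln` stands for this class instantiated with settings
+  // where kCoeffBits == 64): with num_slots == 1024 and
+  // desired_one_in_fp_rate == 100, the target falls between 6 columns
+  // (FP rate 1/64) and 7 columns (FP rate 1/128), so some blocks get 6
+  // columns and the rest 7, with rounding_bias32 steering the split:
+  //
+  //   size_t bytes = InterleavedSoln::GetBytesForOneInFpRate(
+  //       1024, 100.0, /*rounding_bias32=*/0);  // 0 -> round to more bytes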
+  static size_t GetBytesForOneInFpRate(Index num_slots,
+                                       double desired_one_in_fp_rate,
+                                       uint32_t rounding_bias32) {
+    return InternalGetBytesForFpRate(num_slots, 1.0 / desired_one_in_fp_rate,
+                                     desired_one_in_fp_rate, rounding_bias32);
+  }
+
+ protected:
+  static size_t InternalGetBytesForFpRate(Index num_slots,
+                                          double desired_fp_rate,
+                                          double desired_one_in_fp_rate,
+                                          uint32_t rounding_bias32) {
+    assert(TypesAndSettings::kIsFilter);
+    if (TypesAndSettings::kAllowZeroStarts) {
+      if (num_slots == 0) {
+        // Unusual. Zero starts presumes no keys added -> always false (no FPs)
+        return 0U;
+      }
+    } else {
+      assert(num_slots > 0);
+    }
+    // Must be rounded up already.
+    assert(RoundUpNumSlots(num_slots) == num_slots);
+
+    if (desired_one_in_fp_rate > 1.0 && desired_fp_rate < 1.0) {
+      // Typical: less than 100% FP rate
+      if (desired_one_in_fp_rate <= static_cast<ResultRow>(-1)) {
+        // Typical: Less than maximum result row entropy
+        ResultRow rounded = static_cast<ResultRow>(desired_one_in_fp_rate);
+        int lower_columns = FloorLog2(rounded);
+        double lower_columns_fp_rate = std::pow(2.0, -lower_columns);
+        double upper_columns_fp_rate = std::pow(2.0, -(lower_columns + 1));
+        // Floating point don't let me down!
+        assert(lower_columns_fp_rate >= desired_fp_rate);
+        assert(upper_columns_fp_rate <= desired_fp_rate);
+
+        double lower_portion = (desired_fp_rate - upper_columns_fp_rate) /
+                               (lower_columns_fp_rate - upper_columns_fp_rate);
+        // Floating point don't let me down!
+        assert(lower_portion >= 0.0);
+        assert(lower_portion <= 1.0);
+
+        double rounding_bias = (rounding_bias32 + 0.5) / double{0x100000000};
+        assert(rounding_bias > 0.0);
+        assert(rounding_bias < 1.0);
+
+        // Note: Ignoring smash setting; still close enough in that case
+        Index num_starts = num_slots - kCoeffBits + 1;
+        // Lower upper_start_block means lower FP rate (higher accuracy)
+        Index upper_start_block = static_cast<Index>(
+            (lower_portion * num_starts + rounding_bias) / kCoeffBits);
+        Index num_blocks = num_slots / kCoeffBits;
+        assert(upper_start_block < num_blocks);
+
+        // Start by assuming all blocks use lower number of columns
+        Index num_segments = num_blocks * static_cast<Index>(lower_columns);
+        // Correct by 1 each for blocks using upper number of columns
+        num_segments += (num_blocks - upper_start_block);
+        // Total bytes
+        return num_segments * sizeof(CoeffRow);
+      } else {
+        // one_in_fp_rate too big, thus requested FP rate is smaller than
+        // supported. Use max number of columns for minimum supported FP rate.
+        return num_slots * sizeof(ResultRow);
+      }
+    } else {
+      // Effectively asking for 100% FP rate, or NaN etc.
+      if (TypesAndSettings::kAllowZeroStarts) {
+        // Zero segments
+        return 0U;
+      } else {
+        // One segment (minimum size, maximizing FP rate)
+        return sizeof(CoeffRow);
+      }
+    }
+  }
+
+  void InternalConfigure() {
+    const Index num_blocks = GetNumBlocks();
+    Index num_segments = GetNumSegments();
+
+    if (num_blocks == 0) {
+      // Exceptional
+      upper_num_columns_ = 0;
+      upper_start_block_ = 0;
+    } else {
+      // Normal
+      upper_num_columns_ =
+          (num_segments + /*round up*/ num_blocks - 1) / num_blocks;
+      upper_start_block_ = upper_num_columns_ * num_blocks - num_segments;
+      // Unless that's more columns than supported by ResultRow data type
+      if (upper_num_columns_ > 8U * sizeof(ResultRow)) {
+        // Use maximum columns (there will be space unused)
+        upper_num_columns_ = static_cast<Index>(8U * sizeof(ResultRow));
+        upper_start_block_ = 0;
+        num_segments = num_blocks * upper_num_columns_;
+      }
+    }
+    // Update data_len_ for correct rounding and/or unused space
+    // NOTE: unused space stays gone if we PrepareForNumStarts again.
+    // We are prioritizing minimizing the number of fields over making
+    // the "unused space" feature work well.
+    data_len_ = num_segments * sizeof(CoeffRow);
+  }
+
+  char* const data_;
+  size_t data_len_;
+  Index num_starts_ = 0;
+  Index upper_num_columns_ = 0;
+  Index upper_start_block_ = 0;
+};
+
+}  // namespace ribbon
+
+}  // namespace ROCKSDB_NAMESPACE
+
+// For convenience working with templates
+#define IMPORT_RIBBON_IMPL_TYPES(TypesAndSettings)                            \
+  using Hasher = ROCKSDB_NAMESPACE::ribbon::StandardHasher<TypesAndSettings>; \
+  using Banding =                                                             \
+      ROCKSDB_NAMESPACE::ribbon::StandardBanding<TypesAndSettings>;           \
+  using SimpleSoln =                                                          \
+      ROCKSDB_NAMESPACE::ribbon::InMemSimpleSolution<TypesAndSettings>;       \
+  using InterleavedSoln =                                                     \
+      ROCKSDB_NAMESPACE::ribbon::SerializableInterleavedSolution<             \
+          TypesAndSettings>;                                                  \
+  static_assert(sizeof(Hasher) + sizeof(Banding) + sizeof(SimpleSoln) +       \
+                        sizeof(InterleavedSoln) >                             \
+                    0,                                                        \
+                "avoid unused warnings, semicolon expected after macro call")
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/set_comparator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/set_comparator.h
new file mode 100644
index 0000000000..e0e64436ad
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/set_comparator.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/comparator.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A comparator to be used in std::set
+struct SetComparator {
+  explicit SetComparator() : user_comparator_(BytewiseComparator()) {}
+  explicit SetComparator(const Comparator* user_comparator)
+      : user_comparator_(user_comparator ? user_comparator
+                                         : BytewiseComparator()) {}
+  bool operator()(const Slice& lhs, const Slice& rhs) const {
+    return user_comparator_->Compare(lhs, rhs) < 0;
+  }
+
+ private:
+  const Comparator* user_comparator_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/single_thread_executor.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/single_thread_executor.h
new file mode 100644
index 0000000000..c69f2a2921
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/single_thread_executor.h
@@ -0,0 +1,56 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#if USE_COROUTINES +#include + +#include "folly/CPortability.h" +#include "folly/CppAttributes.h" +#include "folly/Executor.h" +#include "util/async_file_reader.h" + +namespace ROCKSDB_NAMESPACE { +// Implements a simple executor that runs callback functions in the same +// thread, unlike CPUThreadExecutor which may schedule the callback on +// another thread. Runs in a tight loop calling the queued callbacks, +// and polls for async IO completions when idle. The completions will +// resume suspended coroutines and they get added to the queue, which +// will get picked up by this loop. +// Any possibility of deadlock is precluded because the file system +// guarantees that async IO completion callbacks will not be scheduled +// to run in this thread or this executor. +class SingleThreadExecutor : public folly::Executor { + public: + explicit SingleThreadExecutor(AsyncFileReader& reader) + : reader_(reader), busy_(false) {} + + void add(folly::Func callback) override { + auto& q = q_; + q.push(std::move(callback)); + if (q.size() == 1 && !busy_) { + while (!q.empty()) { + q.front()(); + q.pop(); + + if (q.empty()) { + // Prevent recursion, as the Wait may queue resumed coroutines + busy_ = true; + reader_.Wait(); + busy_ = false; + } + } + } + } + + private: + std::queue q_; + AsyncFileReader& reader_; + bool busy_; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // USE_COROUTINES diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/slice.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/slice.cc new file mode 100644 index 0000000000..22dd7ee6bb --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/slice.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/slice.h" + +#include + +#include + +#include "rocksdb/convenience.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +class FixedPrefixTransform : public SliceTransform { + private: + size_t prefix_len_; + std::string id_; + + public: + explicit FixedPrefixTransform(size_t prefix_len) : prefix_len_(prefix_len) { + id_ = std::string(kClassName()) + "." 
+ std::to_string(prefix_len_); + } + + static const char* kClassName() { return "rocksdb.FixedPrefix"; } + static const char* kNickName() { return "fixed"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + bool IsInstanceOf(const std::string& name) const override { + if (name == id_) { + return true; + } else if (StartsWith(name, kNickName())) { + std::string alt_id = + std::string(kNickName()) + ":" + std::to_string(prefix_len_); + if (name == alt_id) { + return true; + } + } + return SliceTransform::IsInstanceOf(name); + } + + std::string GetId() const override { return id_; } + + Slice Transform(const Slice& src) const override { + assert(InDomain(src)); + return Slice(src.data(), prefix_len_); + } + + bool InDomain(const Slice& src) const override { + return (src.size() >= prefix_len_); + } + + bool InRange(const Slice& dst) const override { + return (dst.size() == prefix_len_); + } + + bool FullLengthEnabled(size_t* len) const override { + *len = prefix_len_; + return true; + } + + bool SameResultWhenAppended(const Slice& prefix) const override { + return InDomain(prefix); + } +}; + +class CappedPrefixTransform : public SliceTransform { + private: + size_t cap_len_; + std::string id_; + + public: + explicit CappedPrefixTransform(size_t cap_len) : cap_len_(cap_len) { + id_ = std::string(kClassName()) + "." + std::to_string(cap_len_); + } + + static const char* kClassName() { return "rocksdb.CappedPrefix"; } + static const char* kNickName() { return "capped"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + std::string GetId() const override { return id_; } + + bool IsInstanceOf(const std::string& name) const override { + if (name == id_) { + return true; + } else if (StartsWith(name, kNickName())) { + std::string alt_id = + std::string(kNickName()) + ":" + std::to_string(cap_len_); + if (name == alt_id) { + return true; + } + } + return SliceTransform::IsInstanceOf(name); + } + + Slice Transform(const Slice& src) const override { + assert(InDomain(src)); + return Slice(src.data(), std::min(cap_len_, src.size())); + } + + bool InDomain(const Slice& /*src*/) const override { return true; } + + bool InRange(const Slice& dst) const override { + return (dst.size() <= cap_len_); + } + + bool FullLengthEnabled(size_t* len) const override { + *len = cap_len_; + return true; + } + + bool SameResultWhenAppended(const Slice& prefix) const override { + return prefix.size() >= cap_len_; + } +}; + +class NoopTransform : public SliceTransform { + public: + explicit NoopTransform() {} + + static const char* kClassName() { return "rocksdb.Noop"; } + const char* Name() const override { return kClassName(); } + + Slice Transform(const Slice& src) const override { return src; } + + bool InDomain(const Slice& /*src*/) const override { return true; } + + bool InRange(const Slice& /*dst*/) const override { return true; } + + bool SameResultWhenAppended(const Slice& /*prefix*/) const override { + return false; + } +}; + +} // end namespace + +const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) { + return new FixedPrefixTransform(prefix_len); +} + +const SliceTransform* NewCappedPrefixTransform(size_t cap_len) { + return new CappedPrefixTransform(cap_len); +} + +const SliceTransform* NewNoopTransform() { return new NoopTransform; } + +static int RegisterBuiltinSliceTransform(ObjectLibrary& library, + const std::string& /*arg*/) { + // For 
+  // the builtin transforms, the format is typically
+  // [Name].[0-9]+ or [NickName]:[0-9]+
+  library.AddFactory<const SliceTransform>(
+      NoopTransform::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<const SliceTransform>* guard,
+         std::string* /*errmsg*/) {
+        guard->reset(NewNoopTransform());
+        return guard->get();
+      });
+  library.AddFactory<const SliceTransform>(
+      ObjectLibrary::PatternEntry(FixedPrefixTransform::kNickName(), false)
+          .AddNumber(":"),
+      [](const std::string& uri, std::unique_ptr<const SliceTransform>* guard,
+         std::string* /*errmsg*/) {
+        auto colon = uri.find(":");
+        auto len = ParseSizeT(uri.substr(colon + 1));
+        guard->reset(NewFixedPrefixTransform(len));
+        return guard->get();
+      });
+  library.AddFactory<const SliceTransform>(
+      ObjectLibrary::PatternEntry(FixedPrefixTransform::kClassName(), false)
+          .AddNumber("."),
+      [](const std::string& uri, std::unique_ptr<const SliceTransform>* guard,
+         std::string* /*errmsg*/) {
+        auto len = ParseSizeT(
+            uri.substr(strlen(FixedPrefixTransform::kClassName()) + 1));
+        guard->reset(NewFixedPrefixTransform(len));
+        return guard->get();
+      });
+  library.AddFactory<const SliceTransform>(
+      ObjectLibrary::PatternEntry(CappedPrefixTransform::kNickName(), false)
+          .AddNumber(":"),
+      [](const std::string& uri, std::unique_ptr<const SliceTransform>* guard,
+         std::string* /*errmsg*/) {
+        auto colon = uri.find(":");
+        auto len = ParseSizeT(uri.substr(colon + 1));
+        guard->reset(NewCappedPrefixTransform(len));
+        return guard->get();
+      });
+  library.AddFactory<const SliceTransform>(
+      ObjectLibrary::PatternEntry(CappedPrefixTransform::kClassName(), false)
+          .AddNumber("."),
+      [](const std::string& uri, std::unique_ptr<const SliceTransform>* guard,
+         std::string* /*errmsg*/) {
+        auto len = ParseSizeT(
+            uri.substr(strlen(CappedPrefixTransform::kClassName()) + 1));
+        guard->reset(NewCappedPrefixTransform(len));
+        return guard->get();
+      });
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+
+Status SliceTransform::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<const SliceTransform>* result) {
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterBuiltinSliceTransform(*(ObjectLibrary::Default().get()), "");
+  });
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, result->get(),
+                                              value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (id.empty() && opt_map.empty()) {
+    result->reset();
+  } else {
+    status = config_options.registry->NewSharedObject(id, result);
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      return Status::OK();
+    } else if (status.ok()) {
+      SliceTransform* transform = const_cast<SliceTransform*>(result->get());
+      status =
+          Customizable::ConfigureNewObject(config_options, transform, opt_map);
+    }
+  }
+  return status;
+}
+
+std::string SliceTransform::AsString() const {
+  if (HasRegisteredOptions()) {
+    ConfigOptions opts;
+    opts.delimiter = ";";
+    return ToString(opts);
+  }
+  return GetId();
+}
+
+// 2 small internal utility functions, for efficient hex conversions
+// and no need for snprintf, toupper etc...
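+// For example (illustrative): toHex(0xB) == 'B'; fromHex('b') == 11;
+// fromHex('g') == -1 (rejected as an invalid hex digit).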
+// Originally from wdt/util/EncryptionUtils.cpp - for +// std::to_string(true)/DecodeHex: +char toHex(unsigned char v) { + if (v <= 9) { + return '0' + v; + } + return 'A' + v - 10; +} +// most of the code is for validation/error check +int fromHex(char c) { + // toupper: + if (c >= 'a' && c <= 'f') { + c -= ('a' - 'A'); // aka 0x20 + } + // validation + if (c < '0' || (c > '9' && (c < 'A' || c > 'F'))) { + return -1; // invalid not 0-9A-F hex char + } + if (c <= '9') { + return c - '0'; + } + return c - 'A' + 10; +} + +Slice::Slice(const SliceParts& parts, std::string* buf) { + size_t length = 0; + for (int i = 0; i < parts.num_parts; ++i) { + length += parts.parts[i].size(); + } + buf->reserve(length); + + for (int i = 0; i < parts.num_parts; ++i) { + buf->append(parts.parts[i].data(), parts.parts[i].size()); + } + data_ = buf->data(); + size_ = buf->size(); +} + +// Return a string that contains the copy of the referenced data. +std::string Slice::ToString(bool hex) const { + std::string result; // RVO/NRVO/move + if (hex) { + result.reserve(2 * size_); + for (size_t i = 0; i < size_; ++i) { + unsigned char c = data_[i]; + result.push_back(toHex(c >> 4)); + result.push_back(toHex(c & 0xf)); + } + return result; + } else { + result.assign(data_, size_); + return result; + } +} + +// Originally from rocksdb/utilities/ldb_cmd.h +bool Slice::DecodeHex(std::string* result) const { + std::string::size_type len = size_; + if (len % 2) { + // Hex string must be even number of hex digits to get complete bytes back + return false; + } + if (!result) { + return false; + } + result->clear(); + result->reserve(len / 2); + + for (size_t i = 0; i < len;) { + int h1 = fromHex(data_[i++]); + if (h1 < 0) { + return false; + } + int h2 = fromHex(data_[i++]); + if (h2 < 0) { + return false; + } + result->push_back(static_cast((h1 << 4) | h2)); + } + return true; +} + +PinnableSlice::PinnableSlice(PinnableSlice&& other) { + *this = std::move(other); +} + +PinnableSlice& PinnableSlice::operator=(PinnableSlice&& other) { + if (this != &other) { + Cleanable::Reset(); + Cleanable::operator=(std::move(other)); + size_ = other.size_; + pinned_ = other.pinned_; + if (pinned_) { + data_ = other.data_; + // When it's pinned, buf should no longer be of use. + } else { + if (other.buf_ == &other.self_space_) { + self_space_ = std::move(other.self_space_); + buf_ = &self_space_; + data_ = buf_->data(); + } else { + buf_ = other.buf_; + data_ = other.data_; + } + } + other.self_space_.clear(); + other.buf_ = &other.self_space_; + other.pinned_ = false; + other.PinSelf(); + } + return *this; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/status.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/status.cc new file mode 100644 index 0000000000..ead315848d --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/status.cc @@ -0,0 +1,162 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/status.h" + +#include +#ifdef OS_WIN +#include +#endif +#include + +#include "port/port.h" + +namespace ROCKSDB_NAMESPACE { + +std::unique_ptr Status::CopyState(const char* s) { + const size_t cch = std::strlen(s) + 1; // +1 for the null terminator + char* rv = new char[cch]; + std::strncpy(rv, s, cch); + return std::unique_ptr(rv); +} + +static const char* msgs[static_cast(Status::kMaxSubCode)] = { + "", // kNone + "Timeout Acquiring Mutex", // kMutexTimeout + "Timeout waiting to lock key", // kLockTimeout + "Failed to acquire lock due to max_num_locks limit", // kLockLimit + "No space left on device", // kNoSpace + "Deadlock", // kDeadlock + "Stale file handle", // kStaleFile + "Memory limit reached", // kMemoryLimit + "Space limit reached", // kSpaceLimit + "No such file or directory", // kPathNotFound + // KMergeOperandsInsufficientCapacity + "Insufficient capacity for merge operands", + // kManualCompactionPaused + "Manual compaction paused", + " (overwritten)", // kOverwritten, subcode of OK + "Txn not prepared", // kTxnNotPrepared + "IO fenced off", // kIOFenced + "Merge operator failed", // kMergeOperatorFailed +}; + +Status::Status(Code _code, SubCode _subcode, const Slice& msg, + const Slice& msg2, Severity sev) + : code_(_code), + subcode_(_subcode), + sev_(sev), + retryable_(false), + data_loss_(false), + scope_(0) { + assert(subcode_ != kMaxSubCode); + const size_t len1 = msg.size(); + const size_t len2 = msg2.size(); + const size_t size = len1 + (len2 ? (2 + len2) : 0); + char* const result = new char[size + 1]; // +1 for null terminator + memcpy(result, msg.data(), len1); + if (len2) { + result[len1] = ':'; + result[len1 + 1] = ' '; + memcpy(result + len1 + 2, msg2.data(), len2); + } + result[size] = '\0'; // null terminator for C style string + state_.reset(result); +} + +Status Status::CopyAppendMessage(const Status& s, const Slice& delim, + const Slice& msg) { + // (No attempt at efficiency) + return Status(s.code(), s.subcode(), s.severity(), + std::string(s.getState()) + delim.ToString() + msg.ToString()); +} + +std::string Status::ToString() const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + const char* type = nullptr; + switch (code_) { + case kOk: + return "OK"; + case kNotFound: + type = "NotFound: "; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + case kMergeInProgress: + type = "Merge in progress: "; + break; + case kIncomplete: + type = "Result incomplete: "; + break; + case kShutdownInProgress: + type = "Shutdown in progress: "; + break; + case kTimedOut: + type = "Operation timed out: "; + break; + case kAborted: + type = "Operation aborted: "; + break; + case kBusy: + type = "Resource busy: "; + break; + case kExpired: + type = "Operation expired: "; + break; + case kTryAgain: + type = "Operation failed. Try again.: "; + break; + case kCompactionTooLarge: + type = "Compaction too large: "; + break; + case kColumnFamilyDropped: + type = "Column family dropped: "; + break; + case kMaxCode: + assert(false); + break; + } + char tmp[30]; + if (type == nullptr) { + // This should not happen since `code_` should be a valid non-`kMaxCode` + // member of the `Code` enum. The above switch-statement should have had a + // case assigning `type` to a corresponding string. 
+ assert(false); + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", static_cast(code())); + type = tmp; + } + std::string result(type); + if (subcode_ != kNone) { + uint32_t index = static_cast(subcode_); + assert(sizeof(msgs) / sizeof(msgs[0]) > index); + result.append(msgs[index]); + } + + if (state_ != nullptr) { + if (subcode_ != kNone) { + result.append(": "); + } + result.append(state_.get()); + } + return result; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/stderr_logger.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/stderr_logger.cc new file mode 100644 index 0000000000..6044b8b936 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/stderr_logger.cc @@ -0,0 +1,30 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/stderr_logger.h" + +#include "port/sys_time.h" + +namespace ROCKSDB_NAMESPACE { +StderrLogger::~StderrLogger() {} + +void StderrLogger::Logv(const char* format, va_list ap) { + const uint64_t thread_id = Env::Default()->GetThreadID(); + + port::TimeVal now_tv; + port::GetTimeOfDay(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + port::LocalTimeR(&seconds, &t); + fprintf(stderr, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec, + static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + vfprintf(stderr, format, ap); + fprintf(stderr, "\n"); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/stderr_logger.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/stderr_logger.h new file mode 100644 index 0000000000..c3b01210ce --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/stderr_logger.h @@ -0,0 +1,31 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +// Prints logs to stderr for faster debugging +class StderrLogger : public Logger { + public: + explicit StderrLogger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) + : Logger(log_level) {} + + ~StderrLogger() override; + + // Brings overloaded Logv()s into scope so they're not hidden when we override + // a subset of them. + using Logger::Logv; + + virtual void Logv(const char* format, va_list ap) override; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/stop_watch.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/stop_watch.h new file mode 100644 index 0000000000..ece72007b8 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/stop_watch.h @@ -0,0 +1,136 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#include "monitoring/statistics_impl.h" +#include "rocksdb/system_clock.h" + +namespace ROCKSDB_NAMESPACE { +// Auto-scoped. 
+// When statistics is not nullptr, records the measured time into any enabled +// histograms supplied to the constructor. A histogram argument may be omitted +// by setting it to Histograms::HISTOGRAM_ENUM_MAX. It is also saved into +// *elapsed if the pointer is not nullptr and overwrite is true, it will be +// added to *elapsed if overwrite is false. +class StopWatch { + public: + StopWatch(SystemClock* clock, Statistics* statistics, + const uint32_t hist_type_1, + const uint32_t hist_type_2 = Histograms::HISTOGRAM_ENUM_MAX, + uint64_t* elapsed = nullptr, bool overwrite = true, + bool delay_enabled = false) + : clock_(clock), + statistics_(statistics), + hist_type_1_(statistics && statistics->HistEnabledForType(hist_type_1) + ? hist_type_1 + : Histograms::HISTOGRAM_ENUM_MAX), + hist_type_2_(statistics && statistics->HistEnabledForType(hist_type_2) + ? hist_type_2 + : Histograms::HISTOGRAM_ENUM_MAX), + elapsed_(elapsed), + overwrite_(overwrite), + stats_enabled_(statistics && + statistics->get_stats_level() >= + StatsLevel::kExceptTimers && + (hist_type_1_ != Histograms::HISTOGRAM_ENUM_MAX || + hist_type_2_ != Histograms::HISTOGRAM_ENUM_MAX)), + delay_enabled_(delay_enabled), + total_delay_(0), + delay_start_time_(0), + start_time_((stats_enabled_ || elapsed != nullptr) ? clock->NowMicros() + : 0) {} + + ~StopWatch() { + if (elapsed_) { + if (overwrite_) { + *elapsed_ = clock_->NowMicros() - start_time_; + } else { + *elapsed_ += clock_->NowMicros() - start_time_; + } + } + if (elapsed_ && delay_enabled_) { + *elapsed_ -= total_delay_; + } + if (stats_enabled_) { + const auto time = (elapsed_ != nullptr) + ? *elapsed_ + : (clock_->NowMicros() - start_time_); + if (hist_type_1_ != Histograms::HISTOGRAM_ENUM_MAX) { + statistics_->reportTimeToHistogram(hist_type_1_, time); + } + if (hist_type_2_ != Histograms::HISTOGRAM_ENUM_MAX) { + statistics_->reportTimeToHistogram(hist_type_2_, time); + } + } + } + + void DelayStart() { + // if delay_start_time_ is not 0, it means we are already tracking delay, + // so delay_start_time_ should not be overwritten + if (elapsed_ && delay_enabled_ && delay_start_time_ == 0) { + delay_start_time_ = clock_->NowMicros(); + } + } + + void DelayStop() { + if (elapsed_ && delay_enabled_ && delay_start_time_ != 0) { + total_delay_ += clock_->NowMicros() - delay_start_time_; + } + // reset to 0 means currently no delay is being tracked, so two consecutive + // calls to DelayStop will not increase total_delay_ + delay_start_time_ = 0; + } + + uint64_t GetDelay() const { return delay_enabled_ ? total_delay_ : 0; } + + uint64_t start_time() const { return start_time_; } + + private: + SystemClock* clock_; + Statistics* statistics_; + const uint32_t hist_type_1_; + const uint32_t hist_type_2_; + uint64_t* elapsed_; + bool overwrite_; + bool stats_enabled_; + bool delay_enabled_; + uint64_t total_delay_; + uint64_t delay_start_time_; + const uint64_t start_time_; +}; + +// a nano second precision stopwatch +class StopWatchNano { + public: + explicit StopWatchNano(SystemClock* clock, bool auto_start = false) + : clock_(clock), start_(0) { + if (auto_start) { + Start(); + } + } + + void Start() { start_ = clock_->NowNanos(); } + + uint64_t ElapsedNanos(bool reset = false) { + auto now = clock_->NowNanos(); + auto elapsed = now - start_; + if (reset) { + start_ = now; + } + return elapsed; + } + + uint64_t ElapsedNanosSafe(bool reset = false) { + return (clock_ != nullptr) ? 
ElapsedNanos(reset) : 0U; + } + + bool IsStarted() { return start_ != 0; } + + private: + SystemClock* clock_; + uint64_t start_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/string_util.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/string_util.cc new file mode 100644 index 0000000000..821ccba07f --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/string_util.cc @@ -0,0 +1,502 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "util/string_util.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "port/port.h" +#include "port/sys_time.h" +#include "rocksdb/slice.h" + +#ifndef __has_cpp_attribute +#define ROCKSDB_HAS_CPP_ATTRIBUTE(x) 0 +#else +#define ROCKSDB_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#endif + +#if ROCKSDB_HAS_CPP_ATTRIBUTE(maybe_unused) && __cplusplus >= 201703L +#define ROCKSDB_MAYBE_UNUSED [[maybe_unused]] +#elif ROCKSDB_HAS_CPP_ATTRIBUTE(gnu::unused) || __GNUC__ +#define ROCKSDB_MAYBE_UNUSED [[gnu::unused]] +#else +#define ROCKSDB_MAYBE_UNUSED +#endif + +namespace ROCKSDB_NAMESPACE { + +const std::string kNullptrString = "nullptr"; + +std::vector StringSplit(const std::string& arg, char delim) { + std::vector splits; + std::stringstream ss(arg); + std::string item; + while (std::getline(ss, item, delim)) { + splits.push_back(item); + } + return splits; +} + +// for micros < 10ms, print "XX us". +// for micros < 10sec, print "XX ms". +// for micros >= 10 sec, print "XX sec". +// for micros <= 1 hour, print Y:X M:S". +// for micros > 1 hour, print Z:Y:X H:M:S". +int AppendHumanMicros(uint64_t micros, char* output, int len, + bool fixed_format) { + if (micros < 10000 && !fixed_format) { + return snprintf(output, len, "%" PRIu64 " us", micros); + } else if (micros < 10000000 && !fixed_format) { + return snprintf(output, len, "%.3lf ms", + static_cast(micros) / 1000); + } else if (micros < 1000000l * 60 && !fixed_format) { + return snprintf(output, len, "%.3lf sec", + static_cast(micros) / 1000000); + } else if (micros < 1000000ll * 60 * 60 && !fixed_format) { + return snprintf(output, len, "%02" PRIu64 ":%05.3f M:S", + micros / 1000000 / 60, + static_cast(micros % 60000000) / 1000000); + } else { + return snprintf(output, len, "%02" PRIu64 ":%02" PRIu64 ":%05.3f H:M:S", + micros / 1000000 / 3600, (micros / 1000000 / 60) % 60, + static_cast(micros % 60000000) / 1000000); + } +} + +// for sizes >=10TB, print "XXTB" +// for sizes >=10GB, print "XXGB" +// etc. 
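+// e.g. (illustrative): AppendHumanBytes(1048576, buf, len) writes "1024KB",
+// not "1MB", because a unit is only used once a value reaches 10x that unit;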
+// append file size summary to output and return the len +int AppendHumanBytes(uint64_t bytes, char* output, int len) { + const uint64_t ull10 = 10; + if (bytes >= ull10 << 40) { + return snprintf(output, len, "%" PRIu64 "TB", bytes >> 40); + } else if (bytes >= ull10 << 30) { + return snprintf(output, len, "%" PRIu64 "GB", bytes >> 30); + } else if (bytes >= ull10 << 20) { + return snprintf(output, len, "%" PRIu64 "MB", bytes >> 20); + } else if (bytes >= ull10 << 10) { + return snprintf(output, len, "%" PRIu64 "KB", bytes >> 10); + } else { + return snprintf(output, len, "%" PRIu64 "B", bytes); + } +} + +void AppendNumberTo(std::string* str, uint64_t num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%" PRIu64, num); + str->append(buf); +} + +void AppendEscapedStringTo(std::string* str, const Slice& value) { + for (size_t i = 0; i < value.size(); i++) { + char c = value[i]; + if (c >= ' ' && c <= '~') { + str->push_back(c); + } else { + char buf[10]; + snprintf(buf, sizeof(buf), "\\x%02x", + static_cast(c) & 0xff); + str->append(buf); + } + } +} + +std::string NumberToHumanString(int64_t num) { + char buf[19]; + int64_t absnum = num < 0 ? -num : num; + if (absnum < 10000) { + snprintf(buf, sizeof(buf), "%" PRIi64, num); + } else if (absnum < 10000000) { + snprintf(buf, sizeof(buf), "%" PRIi64 "K", num / 1000); + } else if (absnum < 10000000000LL) { + snprintf(buf, sizeof(buf), "%" PRIi64 "M", num / 1000000); + } else { + snprintf(buf, sizeof(buf), "%" PRIi64 "G", num / 1000000000); + } + return std::string(buf); +} + +std::string BytesToHumanString(uint64_t bytes) { + const char* size_name[] = {"KB", "MB", "GB", "TB"}; + double final_size = static_cast(bytes); + size_t size_idx; + + // always start with KB + final_size /= 1024; + size_idx = 0; + + while (size_idx < 3 && final_size >= 1024) { + final_size /= 1024; + size_idx++; + } + + char buf[20]; + snprintf(buf, sizeof(buf), "%.2f %s", final_size, size_name[size_idx]); + return std::string(buf); +} + +std::string TimeToHumanString(int unixtime) { + char time_buffer[80]; + time_t rawtime = unixtime; + struct tm tInfo; + struct tm* timeinfo = port::LocalTimeR(&rawtime, &tInfo); + assert(timeinfo == &tInfo); + strftime(time_buffer, 80, "%c", timeinfo); + return std::string(time_buffer); +} + +std::string EscapeString(const Slice& value) { + std::string r; + AppendEscapedStringTo(&r, value); + return r; +} + +bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { + uint64_t v = 0; + int digits = 0; + while (!in->empty()) { + char c = (*in)[0]; + if (c >= '0' && c <= '9') { + ++digits; + const unsigned int delta = (c - '0'); + static const uint64_t kMaxUint64 = ~static_cast(0); + if (v > kMaxUint64 / 10 || + (v == kMaxUint64 / 10 && delta > kMaxUint64 % 10)) { + // Overflow + return false; + } + v = (v * 10) + delta; + in->remove_prefix(1); + } else { + break; + } + } + *val = v; + return (digits > 0); +} + +bool isSpecialChar(const char c) { + if (c == '\\' || c == '#' || c == ':' || c == '\r' || c == '\n') { + return true; + } + return false; +} + +namespace { +using CharMap = std::pair; +} + +char UnescapeChar(const char c) { + static const CharMap convert_map[] = {{'r', '\r'}, {'n', '\n'}}; + + auto iter = std::find_if(std::begin(convert_map), std::end(convert_map), + [c](const CharMap& p) { return p.first == c; }); + + if (iter == std::end(convert_map)) { + return c; + } + return iter->second; +} + +char EscapeChar(const char c) { + static const CharMap convert_map[] = {{'\n', 'n'}, {'\r', 'r'}}; + + auto iter = 
std::find_if(std::begin(convert_map), std::end(convert_map), + [c](const CharMap& p) { return p.first == c; }); + + if (iter == std::end(convert_map)) { + return c; + } + return iter->second; +} + +std::string EscapeOptionString(const std::string& raw_string) { + std::string output; + for (auto c : raw_string) { + if (isSpecialChar(c)) { + output += '\\'; + output += EscapeChar(c); + } else { + output += c; + } + } + + return output; +} + +std::string UnescapeOptionString(const std::string& escaped_string) { + bool escaped = false; + std::string output; + + for (auto c : escaped_string) { + if (escaped) { + output += UnescapeChar(c); + escaped = false; + } else { + if (c == '\\') { + escaped = true; + continue; + } + output += c; + } + } + return output; +} + +std::string trim(const std::string& str) { + if (str.empty()) return std::string(); + size_t start = 0; + size_t end = str.size() - 1; + while (isspace(str[start]) != 0 && start < end) { + ++start; + } + while (isspace(str[end]) != 0 && start < end) { + --end; + } + if (start <= end) { + return str.substr(start, end - start + 1); + } + return std::string(); +} + +bool EndsWith(const std::string& string, const std::string& pattern) { + size_t plen = pattern.size(); + size_t slen = string.size(); + if (plen <= slen) { + return string.compare(slen - plen, plen, pattern) == 0; + } else { + return false; + } +} + +bool StartsWith(const std::string& string, const std::string& pattern) { + return string.compare(0, pattern.size(), pattern) == 0; +} + + +bool ParseBoolean(const std::string& type, const std::string& value) { + if (value == "true" || value == "1") { + return true; + } else if (value == "false" || value == "0") { + return false; + } + throw std::invalid_argument(type); +} + +uint8_t ParseUint8(const std::string& value) { + uint64_t num = ParseUint64(value); + if ((num >> 8LL) == 0) { + return static_cast(num); + } else { + throw std::out_of_range(value); + } +} + +uint32_t ParseUint32(const std::string& value) { + uint64_t num = ParseUint64(value); + if ((num >> 32LL) == 0) { + return static_cast(num); + } else { + throw std::out_of_range(value); + } +} + +int32_t ParseInt32(const std::string& value) { + int64_t num = ParseInt64(value); + if (num <= std::numeric_limits::max() && + num >= std::numeric_limits::min()) { + return static_cast(num); + } else { + throw std::out_of_range(value); + } +} + + +uint64_t ParseUint64(const std::string& value) { + size_t endchar; +#ifndef CYGWIN + uint64_t num = std::stoull(value.c_str(), &endchar); +#else + char* endptr; + uint64_t num = std::strtoul(value.c_str(), &endptr, 0); + endchar = endptr - value.c_str(); +#endif + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10LL; + else if (c == 'm' || c == 'M') + num <<= 20LL; + else if (c == 'g' || c == 'G') + num <<= 30LL; + else if (c == 't' || c == 'T') + num <<= 40LL; + } + + return num; +} + +int64_t ParseInt64(const std::string& value) { + size_t endchar; +#ifndef CYGWIN + int64_t num = std::stoll(value.c_str(), &endchar); +#else + char* endptr; + int64_t num = std::strtoll(value.c_str(), &endptr, 0); + endchar = endptr - value.c_str(); +#endif + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10LL; + else if (c == 'm' || c == 'M') + num <<= 20LL; + else if (c == 'g' || c == 'G') + num <<= 30LL; + else if (c == 't' || c == 'T') + num <<= 40LL; + } + + return num; +} + +int ParseInt(const std::string& value) { + size_t endchar; +#ifndef 
CYGWIN + int num = std::stoi(value.c_str(), &endchar); +#else + char* endptr; + int num = std::strtoul(value.c_str(), &endptr, 0); + endchar = endptr - value.c_str(); +#endif + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10; + else if (c == 'm' || c == 'M') + num <<= 20; + else if (c == 'g' || c == 'G') + num <<= 30; + } + + return num; +} + +double ParseDouble(const std::string& value) { +#ifndef CYGWIN + return std::stod(value); +#else + return std::strtod(value.c_str(), 0); +#endif +} + +size_t ParseSizeT(const std::string& value) { + return static_cast(ParseUint64(value)); +} + +std::vector ParseVectorInt(const std::string& value) { + std::vector result; + size_t start = 0; + while (start < value.size()) { + size_t end = value.find(':', start); + if (end == std::string::npos) { + result.push_back(ParseInt(value.substr(start))); + break; + } else { + result.push_back(ParseInt(value.substr(start, end - start))); + start = end + 1; + } + } + return result; +} + +bool SerializeIntVector(const std::vector& vec, std::string* value) { + *value = ""; + for (size_t i = 0; i < vec.size(); ++i) { + if (i > 0) { + *value += ":"; + } + *value += std::to_string(vec[i]); + } + return true; +} + +// Copied from folly/string.cpp: +// https://github.com/facebook/folly/blob/0deef031cb8aab76dc7e736f8b7c22d701d5f36b/folly/String.cpp#L457 +// There are two variants of `strerror_r` function, one returns +// `int`, and another returns `char*`. Selecting proper version using +// preprocessor macros portably is extremely hard. +// +// For example, on Android function signature depends on `__USE_GNU` and +// `__ANDROID_API__` macros (https://git.io/fjBBE). +// +// So we are using C++ overloading trick: we pass a pointer of +// `strerror_r` to `invoke_strerror_r` function, and C++ compiler +// selects proper function. + +#if !(defined(_WIN32) && (defined(__MINGW32__) || defined(_MSC_VER))) +ROCKSDB_MAYBE_UNUSED +static std::string invoke_strerror_r(int (*strerror_r)(int, char*, size_t), + int err, char* buf, size_t buflen) { + // Using XSI-compatible strerror_r + int r = strerror_r(err, buf, buflen); + + // OSX/FreeBSD use EINVAL and Linux uses -1 so just check for non-zero + if (r != 0) { + snprintf(buf, buflen, "Unknown error %d (strerror_r failed with error %d)", + err, errno); + } + return buf; +} + +ROCKSDB_MAYBE_UNUSED +static std::string invoke_strerror_r(char* (*strerror_r)(int, char*, size_t), + int err, char* buf, size_t buflen) { + // Using GNU strerror_r + return strerror_r(err, buf, buflen); +} +#endif // !(defined(_WIN32) && (defined(__MINGW32__) || defined(_MSC_VER))) + +std::string errnoStr(int err) { + char buf[1024]; + buf[0] = '\0'; + + std::string result; + + // https://developer.apple.com/library/mac/documentation/Darwin/Reference/ManPages/man3/strerror_r.3.html + // http://www.kernel.org/doc/man-pages/online/pages/man3/strerror.3.html +#if defined(_WIN32) && (defined(__MINGW32__) || defined(_MSC_VER)) + // mingw64 has no strerror_r, but Windows has strerror_s, which C11 added + // as well. So maybe we should use this across all platforms (together + // with strerrorlen_s). Note strerror_r and _s have swapped args. 
+  int r = strerror_s(buf, sizeof(buf), err);
+  if (r != 0) {
+    snprintf(buf, sizeof(buf),
+             "Unknown error %d (strerror_r failed with error %d)", err, errno);
+  }
+  result.assign(buf);
+#else
+  // Using any strerror_r
+  result.assign(invoke_strerror_r(strerror_r, err, buf, sizeof(buf)));
+#endif
+
+  return result;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/string_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/string_util.h
new file mode 100644
index 0000000000..0b15181f5d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/string_util.h
@@ -0,0 +1,175 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <cstdint>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+extern std::vector<std::string> StringSplit(const std::string& arg,
+                                            char delim);
+
+// Append a human-readable printout of "num" to *str
+extern void AppendNumberTo(std::string* str, uint64_t num);
+
+// Append a human-readable printout of "value" to *str.
+// Escapes any non-printable characters found in "value".
+extern void AppendEscapedStringTo(std::string* str, const Slice& value);
+
+// Put n digits from v in base kBase to (*buf)[0] to (*buf)[n-1] and
+// advance *buf to the position after what was written.
+template <size_t kBase>
+inline void PutBaseChars(char** buf, size_t n, uint64_t v, bool uppercase) {
+  const char* digitChars = uppercase ? "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                                     : "0123456789abcdefghijklmnopqrstuvwxyz";
+  for (size_t i = n; i > 0; --i) {
+    (*buf)[i - 1] = digitChars[static_cast<size_t>(v % kBase)];
+    v /= kBase;
+  }
+  *buf += n;
+}
+
+// Parse n digits from *buf in base kBase to *v and advance *buf to the
+// position after what was read. On success, true is returned. On failure,
+// false is returned, *buf is placed at the first bad character, and *v
+// contains the partial parsed data. Overflow is not checked but the
+// result is accurate mod 2^64. Requires the starting value of *v to be
+// zero or previously accumulated parsed digits, i.e.
+//   ParseBaseChars(&b, n, &v);
+// is equivalent to n calls to
+//   ParseBaseChars(&b, 1, &v);
+template <int kBase>
+inline bool ParseBaseChars(const char** buf, size_t n, uint64_t* v) {
+  while (n) {
+    char c = **buf;
+    *v *= static_cast<uint64_t>(kBase);
+    if (c >= '0' && (kBase >= 10 ? c <= '9' : c < '0' + kBase)) {
+      *v += static_cast<uint64_t>(c - '0');
+    } else if (kBase > 10 && c >= 'A' && c < 'A' + kBase - 10) {
+      *v += static_cast<uint64_t>(c - 'A' + 10);
+    } else if (kBase > 10 && c >= 'a' && c < 'a' + kBase - 10) {
+      *v += static_cast<uint64_t>(c - 'a' + 10);
+    } else {
+      return false;
+    }
+    --n;
+    ++*buf;
+  }
+  return true;
+}
+
+// Return a human-readable version of num.
+// for num >= 10.000, prints "xxK"
+// for num >= 10.000.000, prints "xxM"
+// for num >= 10.000.000.000, prints "xxG"
+extern std::string NumberToHumanString(int64_t num);
+
+// Return a human-readable version of bytes
+// ex: 1048576 -> 1.00 MB
+extern std::string BytesToHumanString(uint64_t bytes);
+
+// Return a human-readable version of unix time
+// ex: 1562116015 -> "Tue Jul  2 18:06:55 2019"
+extern std::string TimeToHumanString(int unixtime);
+
+// Append a human-readable time in micros.
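+// e.g. (illustrative, with fixed_format == false): 1234 -> "1234 us";
+// 1234567 -> "1234.567 ms"; 12500000 -> "12.500 sec".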
+int AppendHumanMicros(uint64_t micros, char* output, int len, + bool fixed_format); + +// Append a human-readable size in bytes +int AppendHumanBytes(uint64_t bytes, char* output, int len); + +// Return a human-readable version of "value". +// Escapes any non-printable characters found in "value". +extern std::string EscapeString(const Slice& value); + +// Parse a human-readable number from "*in" into *value. On success, +// advances "*in" past the consumed number and sets "*val" to the +// numeric value. Otherwise, returns false and leaves *in in an +// unspecified state. +extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); + +// Returns true if the input char "c" is considered as a special character +// that will be escaped when EscapeOptionString() is called. +// +// @param c the input char +// @return true if the input char "c" is considered as a special character. +// @see EscapeOptionString +bool isSpecialChar(const char c); + +// If the input char is an escaped char, it will return the its +// associated raw-char. Otherwise, the function will simply return +// the original input char. +char UnescapeChar(const char c); + +// If the input char is a control char, it will return the its +// associated escaped char. Otherwise, the function will simply return +// the original input char. +char EscapeChar(const char c); + +// Converts a raw string to an escaped string. Escaped-characters are +// defined via the isSpecialChar() function. When a char in the input +// string "raw_string" is classified as a special characters, then it +// will be prefixed by '\' in the output. +// +// It's inverse function is UnescapeOptionString(). +// @param raw_string the input string +// @return the '\' escaped string of the input "raw_string" +// @see isSpecialChar, UnescapeOptionString +std::string EscapeOptionString(const std::string& raw_string); + +// The inverse function of EscapeOptionString. It converts +// an '\' escaped string back to a raw string. 
+// +// @param escaped_string the input '\' escaped string +// @return the raw string of the input "escaped_string" +std::string UnescapeOptionString(const std::string& escaped_string); + +std::string trim(const std::string& str); + +// Returns true if "string" ends with "pattern" +bool EndsWith(const std::string& string, const std::string& pattern); + +// Returns true if "string" starts with "pattern" +bool StartsWith(const std::string& string, const std::string& pattern); + +bool ParseBoolean(const std::string& type, const std::string& value); + +uint8_t ParseUint8(const std::string& value); + +uint32_t ParseUint32(const std::string& value); + +int32_t ParseInt32(const std::string& value); + +uint64_t ParseUint64(const std::string& value); + +int ParseInt(const std::string& value); + +int64_t ParseInt64(const std::string& value); + +double ParseDouble(const std::string& value); + +size_t ParseSizeT(const std::string& value); + +std::vector ParseVectorInt(const std::string& value); + +bool SerializeIntVector(const std::vector& vec, std::string* value); + +extern const std::string kNullptrString; + +// errnoStr() function returns a string that describes the error code passed in +// the argument err +extern std::string errnoStr(int err); + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_guard.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_guard.h new file mode 100644 index 0000000000..b2bb06a1b0 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_guard.h @@ -0,0 +1,41 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "port/port.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Resource management object for threads that joins the thread upon +// destruction. Has unique ownership of the thread object, so copying it is not +// allowed, while moving it transfers ownership. +class ThreadGuard { + public: + ThreadGuard() = default; + + explicit ThreadGuard(port::Thread&& thread) : thread_(std::move(thread)) {} + + ThreadGuard(const ThreadGuard&) = delete; + ThreadGuard& operator=(const ThreadGuard&) = delete; + + ThreadGuard(ThreadGuard&&) noexcept = default; + ThreadGuard& operator=(ThreadGuard&&) noexcept = default; + + ~ThreadGuard() { + if (thread_.joinable()) { + thread_.join(); + } + } + + const port::Thread& GetThread() const { return thread_; } + port::Thread& GetThread() { return thread_; } + + private: + port::Thread thread_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_local.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_local.cc new file mode 100644 index 0000000000..969639d9bc --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_local.cc @@ -0,0 +1,521 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+
+#include "util/thread_local.h"
+
+#include <stdlib.h>
+
+#include "port/likely.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct Entry {
+  Entry() : ptr(nullptr) {}
+  Entry(const Entry& e) : ptr(e.ptr.load(std::memory_order_relaxed)) {}
+  std::atomic<void*> ptr;
+};
+
+class StaticMeta;
+
+// This is the structure that is declared as "thread_local" storage.
+// The vector keeps a list of atomic pointers, one per ThreadLocalPtr
+// instance, for the "current" thread. The vector is indexed by an Id that is
+// unique in the process and associated with one ThreadLocalPtr instance. The
+// Id is assigned by a global StaticMeta singleton. So if we instantiate 3
+// ThreadLocalPtr instances, each thread will have a ThreadData with a vector
+// of size 3:
+//     ---------------------------------------------------
+//     |          | instance 1 | instance 2 | instance 3 |
+//     ---------------------------------------------------
+//     | thread 1 |    void*   |    void*   |    void*   | <- ThreadData
+//     ---------------------------------------------------
+//     | thread 2 |    void*   |    void*   |    void*   | <- ThreadData
+//     ---------------------------------------------------
+//     | thread 3 |    void*   |    void*   |    void*   | <- ThreadData
+//     ---------------------------------------------------
+struct ThreadData {
+  explicit ThreadData(ThreadLocalPtr::StaticMeta* _inst)
+      : entries(), next(nullptr), prev(nullptr), inst(_inst) {}
+  std::vector<Entry> entries;
+  ThreadData* next;
+  ThreadData* prev;
+  ThreadLocalPtr::StaticMeta* inst;
+};
+
+class ThreadLocalPtr::StaticMeta {
+ public:
+  StaticMeta();
+
+  // Return the next available Id
+  uint32_t GetId();
+  // Return the next available Id without claiming it
+  uint32_t PeekId() const;
+  // Return the given Id back to the free pool. This also triggers
+  // UnrefHandler for the associated pointer value (if not NULL) for all
+  // threads.
+  void ReclaimId(uint32_t id);
+
+  // Return the pointer value for the given id for the current thread.
+  void* Get(uint32_t id) const;
+  // Reset the pointer value for the given id for the current thread.
+  void Reset(uint32_t id, void* ptr);
+  // Atomically swap the supplied ptr and return the previous value
+  void* Swap(uint32_t id, void* ptr);
+  // Atomically compare and swap the provided value only if it equals
+  // the expected value.
+  bool CompareAndSwap(uint32_t id, void* ptr, void*& expected);
+  // Reset all thread local data to replacement, and return non-nullptr
+  // data for all existing threads
+  void Scrape(uint32_t id, autovector<void*>* ptrs, void* const replacement);
+  // Update res by applying func on each thread-local value. Holds a lock that
+  // prevents unref handler from running during this call, but clients must
+  // still provide external synchronization since the owning thread can
+  // access the values without internal locking, e.g., via Get() and Reset().
+  void Fold(uint32_t id, FoldFunc func, void* res);
+
+  // Register the UnrefHandler for id
+  void SetHandler(uint32_t id, UnrefHandler handler);
+
+  // protect inst, next_instance_id_, free_instance_ids_, head_,
+  // ThreadData.entries
+  //
+  // Note that here we prefer a function static variable instead of the usual
+  // global static variable. The reason is that the C++ destruction order of
+  // static variables is the reverse of their construction order. However,
+  // C++ does not guarantee any construction order when global static
+  // variables are defined in different files, while function static
+  // variables are initialized when their functions are first called.
+  // As a result, the construction order of the function static variables
+  // can be controlled by properly invoking their first function calls in
+  // the right order.
+  //
+  // For instance, the following function contains a function static
+  // variable. We place a dummy call of this function inside Env::Default()
+  // to ensure the correct construction order.
+  static port::Mutex* Mutex();
+
+  // Returns the member mutex of the current StaticMeta. In general,
+  // Mutex() should be used instead of this one. However, in cases where
+  // the static variable inside Instance() goes out of scope, MemberMutex()
+  // should be used. One example is the OnThreadExit() function.
+  port::Mutex* MemberMutex() { return &mutex_; }
+
+ private:
+  // Get the UnrefHandler for id while holding the mutex
+  // REQUIRES: mutex locked
+  UnrefHandler GetHandler(uint32_t id);
+
+  // Triggered before a thread terminates
+  static void OnThreadExit(void* ptr);
+
+  // Add current thread's ThreadData to the global chain
+  // REQUIRES: mutex locked
+  void AddThreadData(ThreadData* d);
+
+  // Remove current thread's ThreadData from the global chain
+  // REQUIRES: mutex locked
+  void RemoveThreadData(ThreadData* d);
+
+  static ThreadData* GetThreadLocal();
+
+  uint32_t next_instance_id_;
+  // Used to recycle Ids in case ThreadLocalPtr is instantiated and destroyed
+  // frequently. This also prevents it from blowing up the vector space.
+  autovector<uint32_t> free_instance_ids_;
+  // Chain all thread local structures together. This is necessary since
+  // when one ThreadLocalPtr gets destroyed, we need to loop over each
+  // thread's version of the pointer corresponding to that instance and
+  // call UnrefHandler for it.
+  ThreadData head_;
+
+  std::unordered_map<uint32_t, UnrefHandler> handler_map_;
+
+  // The private mutex. Developers should always use Mutex() instead of
+  // using this variable directly.
+  port::Mutex mutex_;
+  // Thread local storage
+  static thread_local ThreadData* tls_;
+
+  // Used to make thread exit trigger possible if !defined(OS_MACOSX).
+  // Otherwise, used to retrieve thread data.
+  pthread_key_t pthread_key_;
+};
+
+thread_local ThreadData* ThreadLocalPtr::StaticMeta::tls_ = nullptr;
+
+// Windows doesn't support a per-thread destructor with its
+// TLS primitives. So, we build it manually by inserting a
+// function to be called on each thread's exit.
+// See http://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way
+// and http://www.nynaeve.net/?p=183
+//
+// Really, we do this to have a clear conscience, since using TLS with
+// thread pools is iffy (although OK within a request). Otherwise, threads
+// have no identity in their modern use.
+
+// This runs on Windows only, called from the System Loader
+#ifdef OS_WIN
+
+// The Windows cleanup routine is invoked from the System Loader with a
+// different signature, so we cannot directly hook up the original
+// OnThreadExit, which is a private member. Instead, the StaticMeta class
+// shares the address of the function with us so we can invoke it.
+namespace wintlscleanup {
+
+// This is set to OnThreadExit in the StaticMeta singleton constructor
+UnrefHandler thread_local_inclass_routine = nullptr;
+pthread_key_t thread_local_key = pthread_key_t(-1);
+
+// Static callback function to call with each thread termination.
+void NTAPI WinOnThreadExit(PVOID module, DWORD reason, PVOID reserved) {
+  // We decided to punt on PROCESS_EXIT
+  if (DLL_THREAD_DETACH == reason) {
+    if (thread_local_key != pthread_key_t(-1) &&
+        thread_local_inclass_routine != nullptr) {
+      void* tls = TlsGetValue(thread_local_key);
+      if (tls != nullptr) {
+        thread_local_inclass_routine(tls);
+      }
+    }
+  }
+}
+
+}  // namespace wintlscleanup
+
+// extern "C" suppresses C++ name mangling so we know the symbol name for the
+// linker /INCLUDE:symbol pragma above.
+extern "C" {
+
+#ifdef _MSC_VER
+// The linker must not discard thread_callback_on_exit. (We force a reference
+// to this variable with a linker /include:symbol pragma to ensure that.) If
+// this variable is discarded, the OnThreadExit function will never be called.
+#ifndef _X86_
+
+// .CRT section is merged with .rdata on x64 so it must be constant data.
+#pragma const_seg(".CRT$XLB")
+// When defining a const variable, it must have external linkage to be sure
+// the linker doesn't discard it.
+extern const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit;
+const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit =
+    wintlscleanup::WinOnThreadExit;
+// Reset the default section.
+#pragma const_seg()
+
+#pragma comment(linker, "/include:_tls_used")
+#pragma comment(linker, "/include:p_thread_callback_on_exit")
+
+#else  // _X86_
+
+#pragma data_seg(".CRT$XLB")
+PIMAGE_TLS_CALLBACK p_thread_callback_on_exit = wintlscleanup::WinOnThreadExit;
+// Reset the default section.
+#pragma data_seg()
+
+#pragma comment(linker, "/INCLUDE:__tls_used")
+#pragma comment(linker, "/INCLUDE:_p_thread_callback_on_exit")
+
+#endif  // _X86_
+
+#else
+// https://github.com/couchbase/gperftools/blob/master/src/windows/port.cc
+BOOL WINAPI DllMain(HINSTANCE h, DWORD dwReason, PVOID pv) {
+  if (dwReason == DLL_THREAD_DETACH)
+    wintlscleanup::WinOnThreadExit(h, dwReason, pv);
+  return TRUE;
+}
+#endif
+}  // extern "C"
+
+#endif  // OS_WIN
+
+void ThreadLocalPtr::InitSingletons() { ThreadLocalPtr::Instance(); }
+
+ThreadLocalPtr::StaticMeta* ThreadLocalPtr::Instance() {
+  // Here we prefer a function static variable instead of a global
+  // static variable, as a function static variable is initialized
+  // when the function is first called. As a result, we can properly
+  // control the construction order by properly preparing the
+  // first function call.
+  //
+  // Note that here we decide to make "inst" a static pointer w/o deleting
+  // it at the end, instead of a static variable. This is to avoid the
+  // following destruction-order disaster that happens when a child thread
+  // using ThreadLocalPtr dies AFTER the main thread dies: When a child
+  // thread happens to use ThreadLocalPtr, it will try to delete its
+  // thread-local data in OnThreadExit when the child thread dies. However,
+  // OnThreadExit depends on the following variable. As a result, if the main
+  // thread dies before any child thread that uses ThreadLocalPtr dies, then
+  // the destruction of the following variable will go first, then
+  // OnThreadExit, therefore causing invalid access.
+  //
+  // The above problem can be solved by using thread_local to store tls_.
+  // thread_local supports dynamic construction and destruction of
+  // non-primitive typed variables. As a result, we can guarantee the
+  // destruction order even when the main thread dies before any child
+  // threads.
+  static ThreadLocalPtr::StaticMeta* inst = new ThreadLocalPtr::StaticMeta();
+  return inst;
+}
+
+port::Mutex* ThreadLocalPtr::StaticMeta::Mutex() { return &Instance()->mutex_; }
+
+void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) {
+  auto* tls = static_cast<ThreadData*>(ptr);
+  assert(tls != nullptr);
+
+  // Use the StaticMeta pointer cached in the ThreadData instead of calling
+  // StaticMeta::Instance() directly: the static variable inside
+  // StaticMeta::Instance() might already have gone out of scope here, in
+  // case this OnThreadExit is called after the main thread dies.
+  auto* inst = tls->inst;
+  pthread_setspecific(inst->pthread_key_, nullptr);
+
+  MutexLock l(inst->MemberMutex());
+  inst->RemoveThreadData(tls);
+  // Unref stored pointers of current thread from all instances
+  uint32_t id = 0;
+  for (auto& e : tls->entries) {
+    void* raw = e.ptr.load();
+    if (raw != nullptr) {
+      auto unref = inst->GetHandler(id);
+      if (unref != nullptr) {
+        unref(raw);
+      }
+    }
+    ++id;
+  }
+  // Delete the thread-local structure regardless of platform
+  delete tls;
+}
+
+ThreadLocalPtr::StaticMeta::StaticMeta()
+    : next_instance_id_(0), head_(this), pthread_key_(0) {
+  if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) {
+    abort();
+  }
+
+  // OnThreadExit is not getting called on the main thread.
+  // Call through the static destructor mechanism to avoid memory leak.
+  //
+  // Caveats: ~A() will be invoked _after_ ~StaticMeta for the global
+  // singleton (destructors are invoked in reverse order of constructor
+  // _completion_); the latter must not mutate internal members. This
+  // cleanup mechanism inherently relies on use-after-release of the
+  // StaticMeta, and is brittle with respect to compiler-specific handling
+  // of memory backing destructed statically-scoped objects. Perhaps
+  // registering with atexit(3) would be more robust.
+  //
+// This is not required on Windows.
+#if !defined(OS_WIN)
+  static struct A {
+    ~A() {
+      if (tls_) {
+        OnThreadExit(tls_);
+      }
+    }
+  } a;
+#endif  // !defined(OS_WIN)
+
+  head_.next = &head_;
+  head_.prev = &head_;
+
+#ifdef OS_WIN
+  // Share with Windows its cleanup routine and the key
+  wintlscleanup::thread_local_inclass_routine = OnThreadExit;
+  wintlscleanup::thread_local_key = pthread_key_;
+#endif
+}
+
+void ThreadLocalPtr::StaticMeta::AddThreadData(ThreadData* d) {
+  Mutex()->AssertHeld();
+  d->next = &head_;
+  d->prev = head_.prev;
+  head_.prev->next = d;
+  head_.prev = d;
+}
+
+void ThreadLocalPtr::StaticMeta::RemoveThreadData(ThreadData* d) {
+  Mutex()->AssertHeld();
+  d->next->prev = d->prev;
+  d->prev->next = d->next;
+  d->next = d->prev = d;
+}
+
+ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() {
+  if (UNLIKELY(tls_ == nullptr)) {
+    auto* inst = Instance();
+    tls_ = new ThreadData(inst);
+    {
+      // Register it in the global chain; needs to be done before thread exit
+      // handler registration
+      MutexLock l(Mutex());
+      inst->AddThreadData(tls_);
+    }
+    // Even when it is not OS_MACOSX, we need to register a value for
+    // pthread_key_ so that its exit handler will be triggered.
+    if (pthread_setspecific(inst->pthread_key_, tls_) != 0) {
+      {
+        MutexLock l(Mutex());
+        inst->RemoveThreadData(tls_);
+      }
+      delete tls_;
+      abort();
+    }
+  }
+  return tls_;
+}
+
+void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    return nullptr;
+  }
+  return tls->entries[id].ptr.load(std::memory_order_acquire);
+}
+
+void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(Mutex());
+    tls->entries.resize(id + 1);
+  }
+  tls->entries[id].ptr.store(ptr, std::memory_order_release);
+}
+
+void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(Mutex());
+    tls->entries.resize(id + 1);
+  }
+  return tls->entries[id].ptr.exchange(ptr, std::memory_order_acquire);
+}
+
+bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr,
+                                                void*& expected) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(Mutex());
+    tls->entries.resize(id + 1);
+  }
+  return tls->entries[id].ptr.compare_exchange_strong(
+      expected, ptr, std::memory_order_release, std::memory_order_relaxed);
+}
+
+void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector<void*>* ptrs,
+                                        void* const replacement) {
+  MutexLock l(Mutex());
+  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
+    if (id < t->entries.size()) {
+      void* ptr =
+          t->entries[id].ptr.exchange(replacement, std::memory_order_acquire);
+      if (ptr != nullptr) {
+        ptrs->push_back(ptr);
+      }
+    }
+  }
+}
+
+void ThreadLocalPtr::StaticMeta::Fold(uint32_t id, FoldFunc func, void* res) {
+  MutexLock l(Mutex());
+  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
+    if (id < t->entries.size()) {
+      void* ptr = t->entries[id].ptr.load();
+      if (ptr != nullptr) {
+        func(ptr, res);
+      }
+    }
+  }
+}
+
+uint32_t ThreadLocalPtr::TEST_PeekId() { return Instance()->PeekId(); }
+
+void ThreadLocalPtr::StaticMeta::SetHandler(uint32_t id, UnrefHandler handler) {
+  MutexLock l(Mutex());
+  handler_map_[id] = handler;
+}
+
+UnrefHandler ThreadLocalPtr::StaticMeta::GetHandler(uint32_t id) {
+  Mutex()->AssertHeld();
+  auto iter = handler_map_.find(id);
+  if (iter == handler_map_.end()) {
+    return nullptr;
+  }
+  return iter->second;
+}
+
+uint32_t ThreadLocalPtr::StaticMeta::GetId() {
+  MutexLock l(Mutex());
+  if (free_instance_ids_.empty()) {
+    return next_instance_id_++;
+  }
+
+  uint32_t id = free_instance_ids_.back();
+  free_instance_ids_.pop_back();
+  return id;
+}
+
+uint32_t ThreadLocalPtr::StaticMeta::PeekId() const {
+  MutexLock l(Mutex());
+  if (!free_instance_ids_.empty()) {
+    return free_instance_ids_.back();
+  }
+  return next_instance_id_;
+}
+
+void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) {
+  // This id is not used; go through all thread local data and release
+  // the corresponding value
+  MutexLock l(Mutex());
+  auto unref = GetHandler(id);
+  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
+    if (id < t->entries.size()) {
+      void* ptr = t->entries[id].ptr.exchange(nullptr);
+      if (ptr != nullptr && unref != nullptr) {
+        unref(ptr);
+      }
+    }
+  }
+  handler_map_[id] = nullptr;
+  free_instance_ids_.push_back(id);
+}
+
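+// Usage sketch (editor's addition, not part of the upstream source): a
+// ThreadLocalPtr paired with an UnrefHandler gives each thread its own
+// lazily created value, and the handler reclaims that value when the thread
+// exits or the ThreadLocalPtr is destroyed. The names below (DeleteCounter,
+// counter, BumpPerThreadCounter) are hypothetical:
+//
+//   static void DeleteCounter(void* ptr) { delete static_cast<int*>(ptr); }
+//
+//   static ThreadLocalPtr counter(&DeleteCounter);
+//   void BumpPerThreadCounter() {
+//     if (counter.Get() == nullptr) {
+//       counter.Reset(new int(0));  // first use on this thread
+//     }
+//     // No locking needed: the value is visible only to this thread.
+//     ++*static_cast<int*>(counter.Get());
+//   }
+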
+ThreadLocalPtr::ThreadLocalPtr(UnrefHandler handler)
+    : id_(Instance()->GetId()) {
+  if (handler != nullptr) {
+    Instance()->SetHandler(id_, handler);
+  }
+}
+
+ThreadLocalPtr::~ThreadLocalPtr() { Instance()->ReclaimId(id_); }
+
+void* ThreadLocalPtr::Get() const { return Instance()->Get(id_); }
+
+void ThreadLocalPtr::Reset(void* ptr) { Instance()->Reset(id_, ptr); }
+
+void* ThreadLocalPtr::Swap(void* ptr) { return Instance()->Swap(id_, ptr); }
+
+bool ThreadLocalPtr::CompareAndSwap(void* ptr, void*& expected) {
+  return Instance()->CompareAndSwap(id_, ptr, expected);
+}
+
+void ThreadLocalPtr::Scrape(autovector<void*>* ptrs, void* const replacement) {
+  Instance()->Scrape(id_, ptrs, replacement);
+}
+
+void ThreadLocalPtr::Fold(FoldFunc func, void* res) {
+  Instance()->Fold(id_, func, res);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_local.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_local.h
new file mode 100644
index 0000000000..fde68f86fb
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_local.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "port/port.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Cleanup function that will be called for a stored thread local
+// pointer (if not NULL) when one of the following happens:
+// (1) a thread terminates
+// (2) a ThreadLocalPtr is destroyed
+//
+// Warning: this function is called while holding a global mutex. The same
+// mutex is used (at least in some cases) by most methods of ThreadLocalPtr,
+// and it's shared across all instances of ThreadLocalPtr. Therefore extra
+// care is needed to avoid deadlocks. In particular, the handler shouldn't
+// lock any mutexes and shouldn't call any methods of any ThreadLocalPtr
+// instances, unless you know what you're doing.
+using UnrefHandler = void (*)(void* ptr);
+
+// ThreadLocalPtr stores only values of pointer type. Different from
+// the usual thread-local storage, ThreadLocalPtr has the ability to
+// distinguish data coming from different threads and different
+// ThreadLocalPtr instances. For example, if a regular thread_local
+// variable A is declared in DBImpl, two DBImpl objects would share
+// the same A. However, a ThreadLocalPtr that is defined under the
+// scope of DBImpl avoids such a conflict. As a result, its memory
+// usage is O(# of threads * # of ThreadLocalPtr instances).
+class ThreadLocalPtr {
+ public:
+  explicit ThreadLocalPtr(UnrefHandler handler = nullptr);
+
+  ThreadLocalPtr(const ThreadLocalPtr&) = delete;
+  ThreadLocalPtr& operator=(const ThreadLocalPtr&) = delete;
+
+  ~ThreadLocalPtr();
+
+  // Return the current pointer stored in thread local
+  void* Get() const;
+
+  // Set a new pointer value to the thread local storage.
+  void Reset(void* ptr);
+
+  // Atomically swap the supplied ptr and return the previous value
+  void* Swap(void* ptr);
+
+  // Atomically compare the stored value with expected. Set the new
+  // pointer value to thread local only if the comparison is true.
+  // Otherwise, expected returns the stored value.
+  // Return true on success, false on failure
+  bool CompareAndSwap(void* ptr, void*& expected);
+
+  // Reset all thread local data to replacement, and return non-nullptr
+  // data for all existing threads
+  void Scrape(autovector<void*>* ptrs, void* const replacement);
+
+  using FoldFunc = std::function<void(void*, void*)>;
+  // Update res by applying func on each thread-local value. Holds a lock that
+  // prevents unref handler from running during this call, but clients must
+  // still provide external synchronization since the owning thread can
+  // access the values without internal locking, e.g., via Get() and Reset().
+  void Fold(FoldFunc func, void* res);
+
+  // Added here for testing
+  // Return the next available Id without claiming it
+  static uint32_t TEST_PeekId();
+
+  // Initialize the static singletons of ThreadLocalPtr.
+  //
+  // If this function is not called, then the singletons will be
+  // automatically initialized when they are used.
+  //
+  // Calling this function twice, or after the singletons have been
+  // initialized, is a no-op.
+  static void InitSingletons();
+
+  class StaticMeta;
+
+ private:
+  static StaticMeta* Instance();
+
+  const uint32_t id_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_operation.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_operation.h
new file mode 100644
index 0000000000..b6c1062796
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/thread_operation.h
@@ -0,0 +1,113 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines the structures for thread operation and state.
+// Thread operations are used to describe high-level actions of a
+// thread, such as doing a compaction or flush, while thread states
+// are used to describe lower-level actions, such as reading or
+// writing a file or waiting for a mutex. Operations and states
+// are designed to be independent. Typically, a thread is involved
+// in one operation and one state at any specific point in time.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/thread_status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+// The structure that describes a major thread operation.
+struct OperationInfo {
+  const ThreadStatus::OperationType type;
+  const std::string name;
+};
+
+// The global operation table.
+//
+// When updating the status of a thread, the pointer to the OperationInfo
+// of the current ThreadStatusData will point to one of the
+// rows in this global table.
+//
+// Note that it's not designed to be constant, as in the future we
+// might consider adding a global count to the OperationInfo.
+static OperationInfo global_operation_table[] = {
+    {ThreadStatus::OP_UNKNOWN, ""},
+    {ThreadStatus::OP_COMPACTION, "Compaction"},
+    {ThreadStatus::OP_FLUSH, "Flush"},
+    {ThreadStatus::OP_DBOPEN, "DBOpen"}};
+
+struct OperationStageInfo {
+  const ThreadStatus::OperationStage stage;
+  const std::string name;
+};
+
+// A table that maintains the mapping from stage type to stage string.
+// Note that the string must be changed accordingly when the
+// associated function name changes.
+static OperationStageInfo global_op_stage_table[] = { + {ThreadStatus::STAGE_UNKNOWN, ""}, + {ThreadStatus::STAGE_FLUSH_RUN, "FlushJob::Run"}, + {ThreadStatus::STAGE_FLUSH_WRITE_L0, "FlushJob::WriteLevel0Table"}, + {ThreadStatus::STAGE_COMPACTION_PREPARE, "CompactionJob::Prepare"}, + {ThreadStatus::STAGE_COMPACTION_RUN, "CompactionJob::Run"}, + {ThreadStatus::STAGE_COMPACTION_PROCESS_KV, + "CompactionJob::ProcessKeyValueCompaction"}, + {ThreadStatus::STAGE_COMPACTION_INSTALL, "CompactionJob::Install"}, + {ThreadStatus::STAGE_COMPACTION_SYNC_FILE, + "CompactionJob::FinishCompactionOutputFile"}, + {ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH, + "MemTableList::PickMemtablesToFlush"}, + {ThreadStatus::STAGE_MEMTABLE_ROLLBACK, + "MemTableList::RollbackMemtableFlush"}, + {ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS, + "MemTableList::TryInstallMemtableFlushResults"}, +}; + +// The structure that describes a state. +struct StateInfo { + const ThreadStatus::StateType type; + const std::string name; +}; + +// The global state table. +// +// When updating a status of a thread, the pointer of the StateInfo +// of the current ThreadStatusData will be pointing to one of the +// rows in this global table. +static StateInfo global_state_table[] = { + {ThreadStatus::STATE_UNKNOWN, ""}, + {ThreadStatus::STATE_MUTEX_WAIT, "Mutex Wait"}, +}; + +struct OperationProperty { + int code; + std::string name; +}; + +static OperationProperty compaction_operation_properties[] = { + {ThreadStatus::COMPACTION_JOB_ID, "JobID"}, + {ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL, "InputOutputLevel"}, + {ThreadStatus::COMPACTION_PROP_FLAGS, "Manual/Deletion/Trivial"}, + {ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, "TotalInputBytes"}, + {ThreadStatus::COMPACTION_BYTES_READ, "BytesRead"}, + {ThreadStatus::COMPACTION_BYTES_WRITTEN, "BytesWritten"}, +}; + +static OperationProperty flush_operation_properties[] = { + {ThreadStatus::FLUSH_JOB_ID, "JobID"}, + {ThreadStatus::FLUSH_BYTES_MEMTABLES, "BytesMemtables"}, + {ThreadStatus::FLUSH_BYTES_WRITTEN, "BytesWritten"}}; + +#else + +struct OperationInfo {}; + +struct StateInfo {}; + +#endif // ROCKSDB_USING_THREAD_STATUS +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/threadpool_imp.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/threadpool_imp.cc new file mode 100644 index 0000000000..09706cac57 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/threadpool_imp.cc @@ -0,0 +1,551 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
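+
+// Usage sketch (editor's addition, not part of the upstream source): a pool
+// is usually obtained via NewThreadPool() and driven through the ThreadPool
+// interface implemented below; the tag-based Schedule()/UnSchedule() pair is
+// specific to ThreadPoolImpl. All names used here are defined in this file:
+//
+//   ThreadPool* pool = NewThreadPool(4);      // 4 background threads
+//   pool->SubmitJob([] { /* fire-and-forget work */ });
+//   pool->WaitForJobsAndJoinAllThreads();     // drain the queue, then join
+//   delete pool;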
+
+#include "util/threadpool_imp.h"
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+
+#ifdef OS_LINUX
+#include <sys/resource.h>
+#include <sys/syscall.h>
+#endif
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <sstream>
+#include <thread>
+#include <vector>
+
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void ThreadPoolImpl::PthreadCall(const char* label, int result) {
+  if (result != 0) {
+    fprintf(stderr, "pthread %s: %s\n", label, errnoStr(result).c_str());
+    abort();
+  }
+}
+
+struct ThreadPoolImpl::Impl {
+  Impl();
+  ~Impl();
+
+  void JoinThreads(bool wait_for_jobs_to_complete);
+
+  void SetBackgroundThreadsInternal(int num, bool allow_reduce);
+  int GetBackgroundThreads();
+
+  unsigned int GetQueueLen() const {
+    return queue_len_.load(std::memory_order_relaxed);
+  }
+
+  void LowerIOPriority();
+
+  void LowerCPUPriority(CpuPriority pri);
+
+  void WakeUpAllThreads() { bgsignal_.notify_all(); }
+
+  void BGThread(size_t thread_id);
+
+  void StartBGThreads();
+
+  void Submit(std::function<void()>&& schedule,
+              std::function<void()>&& unschedule, void* tag);
+
+  int UnSchedule(void* arg);
+
+  void SetHostEnv(Env* env) { env_ = env; }
+
+  Env* GetHostEnv() const { return env_; }
+
+  bool HasExcessiveThread() const {
+    return static_cast<int>(bgthreads_.size()) > total_threads_limit_;
+  }
+
+  // Return true iff the current thread is the excessive thread to terminate.
+  // Always terminate the running thread that was added last, even if there is
+  // more than one thread to terminate.
+  bool IsLastExcessiveThread(size_t thread_id) const {
+    return HasExcessiveThread() && thread_id == bgthreads_.size() - 1;
+  }
+
+  bool IsExcessiveThread(size_t thread_id) const {
+    return static_cast<int>(thread_id) >= total_threads_limit_;
+  }
+
+  // Return the thread priority.
+  // This would allow its member-thread to know its priority.
+  Env::Priority GetThreadPriority() const { return priority_; }
+
+  // Set the thread priority.
+  void SetThreadPriority(Env::Priority priority) { priority_ = priority; }
+
+  int ReserveThreads(int threads_to_be_reserved) {
+    std::unique_lock<std::mutex> lock(mu_);
+    // We can reserve at most num_waiting_threads_ in total, so the number of
+    // threads that can be reserved might be fewer than desired. In rare
+    // cases, num_waiting_threads_ could be less than reserved_threads_ due
+    // to SetBackgroundThreadsInternal or the last excessive threads. If that
+    // happens, we cannot reserve any other threads.
+    int reserved_threads_in_success =
+        std::min(std::max(num_waiting_threads_ - reserved_threads_, 0),
+                 threads_to_be_reserved);
+    reserved_threads_ += reserved_threads_in_success;
+    return reserved_threads_in_success;
+  }
+
+  int ReleaseThreads(int threads_to_be_released) {
+    std::unique_lock<std::mutex> lock(mu_);
+    // We cannot release more than reserved_threads_
+    int released_threads_in_success =
+        std::min(reserved_threads_, threads_to_be_released);
+    reserved_threads_ -= released_threads_in_success;
+    WakeUpAllThreads();
+    return released_threads_in_success;
+  }
+
+ private:
+  static void BGThreadWrapper(void* arg);
+
+  bool low_io_priority_;
+  CpuPriority cpu_priority_;
+  Env::Priority priority_;
+  Env* env_;
+
+  int total_threads_limit_;
+  std::atomic_uint queue_len_;  // Queue length. Used for stats reporting
+  // Number of reserved threads, managed by ReserveThreads(..) and
+  // ReleaseThreads(..). If num_waiting_threads_ is no larger than
+  // reserved_threads_, any additional thread will be blocked, to honor the
+  // reservation mechanism.
+  int reserved_threads_;
+  // Number of waiting threads (the maximum number of threads that can be
+  // reserved). In rare cases, num_waiting_threads_ could be less than
+  // reserved_threads_ due to SetBackgroundThreadsInternal or the last
+  // excessive threads.
+  int num_waiting_threads_;
+  bool exit_all_threads_;
+  bool wait_for_jobs_to_complete_;
+
+  // Entry per Schedule()/Submit() call
+  struct BGItem {
+    void* tag = nullptr;
+    std::function<void()> function;
+    std::function<void()> unschedFunction;
+  };
+
+  using BGQueue = std::deque<BGItem>;
+  BGQueue queue_;
+
+  std::mutex mu_;
+  std::condition_variable bgsignal_;
+  std::vector<port::Thread> bgthreads_;
+};
+
+inline ThreadPoolImpl::Impl::Impl()
+    : low_io_priority_(false),
+      cpu_priority_(CpuPriority::kNormal),
+      priority_(Env::LOW),
+      env_(nullptr),
+      total_threads_limit_(0),
+      queue_len_(),
+      reserved_threads_(0),
+      num_waiting_threads_(0),
+      exit_all_threads_(false),
+      wait_for_jobs_to_complete_(false),
+      queue_(),
+      mu_(),
+      bgsignal_(),
+      bgthreads_() {}
+
+inline ThreadPoolImpl::Impl::~Impl() { assert(bgthreads_.size() == 0U); }
+
+void ThreadPoolImpl::Impl::JoinThreads(bool wait_for_jobs_to_complete) {
+  std::unique_lock<std::mutex> lock(mu_);
+  assert(!exit_all_threads_);
+
+  wait_for_jobs_to_complete_ = wait_for_jobs_to_complete;
+  exit_all_threads_ = true;
+  // prevent threads from being recreated right after they're joined, in case
+  // the user is concurrently submitting jobs.
+  total_threads_limit_ = 0;
+  reserved_threads_ = 0;
+  num_waiting_threads_ = 0;
+
+  lock.unlock();
+
+  bgsignal_.notify_all();
+
+  for (auto& th : bgthreads_) {
+    th.join();
+  }
+
+  bgthreads_.clear();
+
+  exit_all_threads_ = false;
+  wait_for_jobs_to_complete_ = false;
+}
+
+inline void ThreadPoolImpl::Impl::LowerIOPriority() {
+  std::lock_guard<std::mutex> lock(mu_);
+  low_io_priority_ = true;
+}
+
+inline void ThreadPoolImpl::Impl::LowerCPUPriority(CpuPriority pri) {
+  std::lock_guard<std::mutex> lock(mu_);
+  cpu_priority_ = pri;
+}
+
+void ThreadPoolImpl::Impl::BGThread(size_t thread_id) {
+  bool low_io_priority = false;
+  CpuPriority current_cpu_priority = CpuPriority::kNormal;
+
+  while (true) {
+    // Wait until there is an item that is ready to run
+    std::unique_lock<std::mutex> lock(mu_);
+    // Stop waiting if the thread needs to do work or needs to terminate.
+    // Increase num_waiting_threads_ once this task has started waiting
+    num_waiting_threads_++;
+
+    TEST_SYNC_POINT("ThreadPoolImpl::BGThread::WaitingThreadsInc");
+    TEST_IDX_SYNC_POINT("ThreadPoolImpl::BGThread::Start:th", thread_id);
+    // When exit_all_threads_ is not set and the current thread is not the
+    // last excessive thread, it may be blocked for 3 reasons:
+    // 1) the queue is empty
+    // 2) it is an excessive thread (not the last one)
+    // 3) the number of waiting threads is not greater than reserved threads
+    //    (i.e., no available threads due to full reservation)
+    while (!exit_all_threads_ && !IsLastExcessiveThread(thread_id) &&
+           (queue_.empty() || IsExcessiveThread(thread_id) ||
+            num_waiting_threads_ <= reserved_threads_)) {
+      bgsignal_.wait(lock);
+    }
+    // Decrease num_waiting_threads_ once the thread is not waiting
+    num_waiting_threads_--;
+
+    if (exit_all_threads_) {  // mechanism to let BG threads exit safely
+
+      if (!wait_for_jobs_to_complete_ || queue_.empty()) {
+        break;
+      }
+    } else if (IsLastExcessiveThread(thread_id)) {
+      // Current thread is the last generated one and is excessive.
+      // We always terminate excessive threads in the reverse order of
+      // generation time. But not when `exit_all_threads_ == true`,
+      // otherwise `JoinThreads()` could try to `join()` a `detach()`ed
+      // thread.
+      auto& terminating_thread = bgthreads_.back();
+      terminating_thread.detach();
+      bgthreads_.pop_back();
+      if (HasExcessiveThread()) {
+        // There is still at least one more excessive thread to terminate.
+        WakeUpAllThreads();
+      }
+      TEST_IDX_SYNC_POINT("ThreadPoolImpl::BGThread::Termination:th",
+                          thread_id);
+      TEST_SYNC_POINT("ThreadPoolImpl::BGThread::Termination");
+      break;
+    }
+
+    auto func = std::move(queue_.front().function);
+    queue_.pop_front();
+
+    queue_len_.store(static_cast<unsigned int>(queue_.size()),
+                     std::memory_order_relaxed);
+
+    bool decrease_io_priority = (low_io_priority != low_io_priority_);
+    CpuPriority cpu_priority = cpu_priority_;
+    lock.unlock();
+
+    if (cpu_priority < current_cpu_priority) {
+      TEST_SYNC_POINT_CALLBACK("ThreadPoolImpl::BGThread::BeforeSetCpuPriority",
+                               &current_cpu_priority);
+      // 0 means current thread.
+      port::SetCpuPriority(0, cpu_priority);
+      current_cpu_priority = cpu_priority;
+      TEST_SYNC_POINT_CALLBACK("ThreadPoolImpl::BGThread::AfterSetCpuPriority",
+                               &current_cpu_priority);
+    }
+
+#ifdef OS_LINUX
+    if (decrease_io_priority) {
+#define IOPRIO_CLASS_SHIFT (13)
+#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data)
+      // Put schedule into IOPRIO_CLASS_IDLE class (lowest)
+      // These system calls only have an effect when used in conjunction
+      // with an I/O scheduler that supports I/O priorities. As at
+      // kernel 2.6.17 the only such scheduler is the Completely
+      // Fair Queuing (CFQ) I/O scheduler.
+      // To change scheduler:
+      //  echo cfq > /sys/block/<device_name>/queue/schedule
+      // Tunables to consider:
+      //  /sys/block/<device_name>/queue/slice_idle
+      //  /sys/block/<device_name>/queue/slice_sync
+      syscall(SYS_ioprio_set, 1,  // IOPRIO_WHO_PROCESS
+              0,                  // current thread
+              IOPRIO_PRIO_VALUE(3, 0));
+      low_io_priority = true;
+    }
+#else
+    (void)decrease_io_priority;  // avoid 'unused variable' error
+#endif
+
+    TEST_SYNC_POINT_CALLBACK("ThreadPoolImpl::Impl::BGThread:BeforeRun",
+                             &priority_);
+
+    func();
+  }
+}
+
+// Helper struct for passing arguments when creating threads.
+struct BGThreadMetadata {
+  ThreadPoolImpl::Impl* thread_pool_;
+  size_t thread_id_;  // Index of the thread in the pool.
+  BGThreadMetadata(ThreadPoolImpl::Impl* thread_pool, size_t thread_id)
+      : thread_pool_(thread_pool), thread_id_(thread_id) {}
+};
+
+void ThreadPoolImpl::Impl::BGThreadWrapper(void* arg) {
+  BGThreadMetadata* meta = reinterpret_cast<BGThreadMetadata*>(arg);
+  size_t thread_id = meta->thread_id_;
+  ThreadPoolImpl::Impl* tp = meta->thread_pool_;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  // initialize it because the compiler isn't good enough to see we don't use
+  // it uninitialized
+  ThreadStatus::ThreadType thread_type = ThreadStatus::NUM_THREAD_TYPES;
+  switch (tp->GetThreadPriority()) {
+    case Env::Priority::HIGH:
+      thread_type = ThreadStatus::HIGH_PRIORITY;
+      break;
+    case Env::Priority::LOW:
+      thread_type = ThreadStatus::LOW_PRIORITY;
+      break;
+    case Env::Priority::BOTTOM:
+      thread_type = ThreadStatus::BOTTOM_PRIORITY;
+      break;
+    case Env::Priority::USER:
+      thread_type = ThreadStatus::USER;
+      break;
+    case Env::Priority::TOTAL:
+      assert(false);
+      return;
+  }
+  assert(thread_type != ThreadStatus::NUM_THREAD_TYPES);
+  ThreadStatusUtil::RegisterThread(tp->GetHostEnv(), thread_type);
+#endif
+  delete meta;
+  tp->BGThread(thread_id);
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  ThreadStatusUtil::UnregisterThread();
+#endif
+  return;
+}
+
+void ThreadPoolImpl::Impl::SetBackgroundThreadsInternal(int num,
+                                                        bool allow_reduce) {
+  std::lock_guard<std::mutex> lock(mu_);
+  if (exit_all_threads_) {
+    return;
+  }
+  if (num > total_threads_limit_ ||
+      (num < total_threads_limit_ && allow_reduce)) {
+    total_threads_limit_ = std::max(0, num);
+    WakeUpAllThreads();
+    StartBGThreads();
+  }
+}
+
+int ThreadPoolImpl::Impl::GetBackgroundThreads() {
+  std::unique_lock<std::mutex> lock(mu_);
+  return total_threads_limit_;
+}
+
+void ThreadPoolImpl::Impl::StartBGThreads() {
+  // Start background threads if necessary
+  while ((int)bgthreads_.size() < total_threads_limit_) {
+    port::Thread p_t(&BGThreadWrapper,
+                     new BGThreadMetadata(this, bgthreads_.size()));
+
+// Set the thread name to aid debugging
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 12)
+    auto th_handle = p_t.native_handle();
+    std::string thread_priority = Env::PriorityToString(GetThreadPriority());
+    std::ostringstream thread_name_stream;
+    thread_name_stream << "rocksdb:";
+    for (char c : thread_priority) {
+      thread_name_stream << static_cast<char>(tolower(c));
+    }
+    pthread_setname_np(th_handle, thread_name_stream.str().c_str());
+#endif
+#endif
+    bgthreads_.push_back(std::move(p_t));
+  }
+}
+
+void ThreadPoolImpl::Impl::Submit(std::function<void()>&& schedule,
+                                  std::function<void()>&& unschedule,
+                                  void* tag) {
+  std::lock_guard<std::mutex> lock(mu_);
+
+  if (exit_all_threads_) {
+    return;
+  }
+
+  StartBGThreads();
+
+  // Add to priority queue
+  queue_.push_back(BGItem());
+  TEST_SYNC_POINT("ThreadPoolImpl::Submit::Enqueue");
+  auto& item = queue_.back();
+  item.tag = tag;
+  item.function = std::move(schedule);
+  item.unschedFunction = std::move(unschedule);
+
+  queue_len_.store(static_cast<unsigned int>(queue_.size()),
+                   std::memory_order_relaxed);
+
+  if (!HasExcessiveThread()) {
+    // Wake up at least one waiting thread.
+    bgsignal_.notify_one();
+  } else {
+    // Need to wake up all threads to make sure the one woken
+    // up is not the one to terminate.
+    WakeUpAllThreads();
+  }
+}
+
+int ThreadPoolImpl::Impl::UnSchedule(void* arg) {
+  int count = 0;
+
+  std::vector<std::function<void()>> candidates;
+  {
+    std::lock_guard<std::mutex> lock(mu_);
+
+    // Remove from priority queue
+    BGQueue::iterator it = queue_.begin();
+    while (it != queue_.end()) {
+      if (arg == (*it).tag) {
+        if (it->unschedFunction) {
+          candidates.push_back(std::move(it->unschedFunction));
+        }
+        it = queue_.erase(it);
+        count++;
+      } else {
+        ++it;
+      }
+    }
+    queue_len_.store(static_cast<unsigned int>(queue_.size()),
+                     std::memory_order_relaxed);
+  }
+
+  // Run unschedule functions outside the mutex
+  for (auto& f : candidates) {
+    f();
+  }
+
+  return count;
+}
+
+ThreadPoolImpl::ThreadPoolImpl() : impl_(new Impl()) {}
+
+ThreadPoolImpl::~ThreadPoolImpl() {}
+
+void ThreadPoolImpl::JoinAllThreads() { impl_->JoinThreads(false); }
+
+void ThreadPoolImpl::SetBackgroundThreads(int num) {
+  impl_->SetBackgroundThreadsInternal(num, true);
+}
+
+int ThreadPoolImpl::GetBackgroundThreads() {
+  return impl_->GetBackgroundThreads();
+}
+
+unsigned int ThreadPoolImpl::GetQueueLen() const {
+  return impl_->GetQueueLen();
+}
+
+void ThreadPoolImpl::WaitForJobsAndJoinAllThreads() {
+  impl_->JoinThreads(true);
+}
+
+void ThreadPoolImpl::LowerIOPriority() { impl_->LowerIOPriority(); }
+
+void ThreadPoolImpl::LowerCPUPriority(CpuPriority pri) {
+  impl_->LowerCPUPriority(pri);
+}
+
+void ThreadPoolImpl::IncBackgroundThreadsIfNeeded(int num) {
+  impl_->SetBackgroundThreadsInternal(num, false);
+}
+
+void ThreadPoolImpl::SubmitJob(const std::function<void()>& job) {
+  auto copy(job);
+  impl_->Submit(std::move(copy), std::function<void()>(), nullptr);
+}
+
+void ThreadPoolImpl::SubmitJob(std::function<void()>&& job) {
+  impl_->Submit(std::move(job), std::function<void()>(), nullptr);
+}
+
+void ThreadPoolImpl::Schedule(void (*function)(void* arg1), void* arg,
+                              void* tag, void (*unschedFunction)(void* arg)) {
+  if (unschedFunction == nullptr) {
+    impl_->Submit(std::bind(function, arg), std::function<void()>(), tag);
+  } else {
+    impl_->Submit(std::bind(function, arg), std::bind(unschedFunction, arg),
+                  tag);
+  }
+}
+
+int ThreadPoolImpl::UnSchedule(void* arg) { return impl_->UnSchedule(arg); }
+
+void ThreadPoolImpl::SetHostEnv(Env* env) { impl_->SetHostEnv(env); }
+
+Env* ThreadPoolImpl::GetHostEnv() const { return impl_->GetHostEnv(); }
+
+// Return the thread priority.
+// This would allow its member-thread to know its priority.
+Env::Priority ThreadPoolImpl::GetThreadPriority() const {
+  return impl_->GetThreadPriority();
+}
+
+// Set the thread priority.
+void ThreadPoolImpl::SetThreadPriority(Env::Priority priority) {
+  impl_->SetThreadPriority(priority);
+}
+
+// Reserve a specific number of threads, preventing them from running other
+// functions. The number of reserved threads could be fewer than requested.
+int ThreadPoolImpl::ReserveThreads(int threads_to_be_reserved) {
+  return impl_->ReserveThreads(threads_to_be_reserved);
+}
+
+// Release a specific number of threads
+int ThreadPoolImpl::ReleaseThreads(int threads_to_be_released) {
+  return impl_->ReleaseThreads(threads_to_be_released);
+}
+
+ThreadPool* NewThreadPool(int num_threads) {
+  ThreadPoolImpl* thread_pool = new ThreadPoolImpl();
+  thread_pool->SetBackgroundThreads(num_threads);
+  return thread_pool;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/threadpool_imp.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/threadpool_imp.h
new file mode 100644
index 0000000000..a5109e38f5
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/threadpool_imp.h
@@ -0,0 +1,120 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <functional>
+#include <memory>
+
+#include "rocksdb/env.h"
+#include "rocksdb/threadpool.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ThreadPoolImpl : public ThreadPool {
+ public:
+  ThreadPoolImpl();
+  ~ThreadPoolImpl();
+
+  ThreadPoolImpl(ThreadPoolImpl&&) = delete;
+  ThreadPoolImpl& operator=(ThreadPoolImpl&&) = delete;
+
+  // Implement ThreadPool interfaces
+
+  // Wait for all threads to finish.
+  // Discards all the jobs that did not
+  // start executing and waits for those running
+  // to complete
+  void JoinAllThreads() override;
+
+  // Set the number of background threads that will be executing the
+  // scheduled jobs.
+  void SetBackgroundThreads(int num) override;
+  int GetBackgroundThreads() override;
+
+  // Get the number of jobs scheduled in the ThreadPool queue.
+  unsigned int GetQueueLen() const override;
+
+  // Waits for all jobs to complete: those
+  // that already started running and those that did not
+  // start yet
+  void WaitForJobsAndJoinAllThreads() override;
+
+  // Make threads run at a lower kernel IO priority
+  // Currently only has effect on Linux
+  void LowerIOPriority();
+
+  // Make threads run at a lower kernel CPU priority
+  // Currently only has effect on Linux
+  void LowerCPUPriority(CpuPriority pri);
+
+  // Ensure there are at least num threads in the pool,
+  // but do not kill threads if there are more
+  void IncBackgroundThreadsIfNeeded(int num);
+
+  // Submit a fire-and-forget job.
+  // These jobs cannot be unscheduled.
+
+  // This allows submitting the same job multiple times
+  void SubmitJob(const std::function<void()>&) override;
+  // This moves the function in for efficiency
+  void SubmitJob(std::function<void()>&&) override;
+
+  // Schedule a job with an unschedule tag and unschedule function.
+  // Can be used to filter and unschedule jobs by a tag
+  // that are still in the queue and did not start running
+  void Schedule(void (*function)(void* arg1), void* arg, void* tag,
+                void (*unschedFunction)(void* arg));
+
+  // Filter jobs that are still in the queue and match
+  // the given tag. Remove them from the queue if any,
+  // and for each such job execute the unschedule function
+  // if one was given at scheduling time.
+  int UnSchedule(void* tag);
+
+  void SetHostEnv(Env* env);
+
+  Env* GetHostEnv() const;
+
+  // Return the thread priority.
+  // This would allow its member-thread to know its priority.
+  Env::Priority GetThreadPriority() const;
+
+  // Set the thread priority.
+  void SetThreadPriority(Env::Priority priority);
+
+  // Reserve a specific number of threads, preventing them from running other
+  // functions. The number of reserved threads could be fewer than the desired
+  // one
+  int ReserveThreads(int threads_to_be_reserved) override;
+
+  // Release a specific number of threads
+  int ReleaseThreads(int threads_to_be_released) override;
+
+  static void PthreadCall(const char* label, int result);
+
+  struct Impl;
+
+ private:
+  // The current public virtual interface does not provide usable
+  // functionality, and thus cannot be used internally to
+  // front different implementations.
+  //
+  // We use a pimpl idiom in order to easily replace the thread pool impl
+  // w/o touching the header file, by providing a different .cc, potentially
+  // driven by a CMake option.
+  //
+  // Another option is to introduce an Env::MakeThreadPool() virtual interface
+  // and override the environment. This would require refactoring ThreadPool
+  // usage.
+  //
+  // We can also combine these two approaches
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/timer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/timer.h
new file mode 100644
index 0000000000..db71cefaf8
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/timer.h
@@ -0,0 +1,340 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <queue>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A Timer class to handle repeated work.
+//
+// `Start()` and `Shutdown()` are currently not thread-safe. The client must
+// serialize calls to these two member functions.
+//
+// A single timer instance can handle multiple functions via a single thread.
+// It is better to leave long-running work to a dedicated thread pool.
+//
+// Timer can be started by calling `Start()`, and ended by calling
+// `Shutdown()`. Work (in terms of a `void function`) can be scheduled by
+// calling `Add` with a unique function name and de-scheduled by calling
+// `Cancel`. Many functions can be added.
+//
+// Impl Details:
+// A heap is used to keep track of when the next timer goes off.
+// A map from a function name to the function keeps track of all the functions.
+class Timer {
+ public:
+  explicit Timer(SystemClock* clock)
+      : clock_(clock),
+        mutex_(clock),
+        cond_var_(&mutex_),
+        running_(false),
+        executing_task_(false) {}
+
+  ~Timer() { Shutdown(); }
+
+  // Add a new function to run.
+  // fn_name has to be unique; otherwise the add fails and returns false.
+  // start_after_us is the initial delay.
+  // repeat_every_us is the interval between the ending time of the last call
+  // and the starting time of the next call. For example, repeat_every_us =
+  // 2000 and the function takes 1000us to run. If it starts at time [now]us,
+  // then it finishes at [now]+1000us, and the 2nd run's starting time will be
+  // at [now]+3000us. repeat_every_us == 0 means do not repeat.
+  bool Add(std::function<void()> fn, const std::string& fn_name,
+           uint64_t start_after_us, uint64_t repeat_every_us) {
+    auto fn_info = std::make_unique<FunctionInfo>(std::move(fn), fn_name, 0,
+                                                  repeat_every_us);
+    InstrumentedMutexLock l(&mutex_);
+    // Assign the time within the mutex to make sure the next_run_time is
+    // larger than the currently running one
+    fn_info->next_run_time_us = clock_->NowMicros() + start_after_us;
+    // the new task's start time should never be before the currently
+    // executing task's time, as the executing task can only be running if its
+    // next_run_time_us is due (<= clock_->NowMicros()).
+    if (executing_task_ &&
+        fn_info->next_run_time_us < heap_.top()->next_run_time_us) {
+      return false;
+    }
+    auto it = map_.find(fn_name);
+    if (it == map_.end()) {
+      heap_.push(fn_info.get());
+      map_.try_emplace(fn_name, std::move(fn_info));
+    } else {
+      // the timer doesn't support duplicated function names
+      return false;
+    }
+    cond_var_.SignalAll();
+    return true;
+  }
+
+  void Cancel(const std::string& fn_name) {
+    InstrumentedMutexLock l(&mutex_);
+
+    // Mark the function with fn_name as invalid so that it will not be
+    // requeued.
+    auto it = map_.find(fn_name);
+    if (it != map_.end() && it->second) {
+      it->second->Cancel();
+    }
+
+    // If the currently running function is fn_name, then we need to wait
+    // until it finishes before returning to the caller.
+    while (!heap_.empty() && executing_task_) {
+      FunctionInfo* func_info = heap_.top();
+      assert(func_info);
+      if (func_info->name == fn_name) {
+        WaitForTaskCompleteIfNecessary();
+      } else {
+        break;
+      }
+    }
+  }
+
+  void CancelAll() {
+    InstrumentedMutexLock l(&mutex_);
+    CancelAllWithLock();
+  }
+
+  // Start the Timer
+  bool Start() {
+    InstrumentedMutexLock l(&mutex_);
+    if (running_) {
+      return false;
+    }
+
+    running_ = true;
+    thread_ = std::make_unique<port::Thread>(&Timer::Run, this);
+    return true;
+  }
+
+  // Shutdown the Timer
+  bool Shutdown() {
+    {
+      InstrumentedMutexLock l(&mutex_);
+      if (!running_) {
+        return false;
+      }
+      running_ = false;
+      CancelAllWithLock();
+      cond_var_.SignalAll();
+    }
+
+    if (thread_) {
+      thread_->join();
+    }
+    return true;
+  }
+
+  bool HasPendingTask() const {
+    InstrumentedMutexLock l(&mutex_);
+    for (const auto& fn_info : map_) {
+      if (fn_info.second->IsValid()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+#ifndef NDEBUG
+  // Wait until the Timer starts waiting, call the optional callback, then
+  // wait for the Timer to start waiting again.
+  // Tests can provide a custom Clock object to mock time, and use the callback
+  // here to bump the current time and trigger the Timer. See timer_test for
+  // an example.
+  //
+  // Note: only one caller of this method is supported.
+  void TEST_WaitForRun(const std::function<void()>& callback = nullptr) {
+    InstrumentedMutexLock l(&mutex_);
+    // It acts as a spin lock
+    while (executing_task_ ||
+           (!heap_.empty() &&
+            heap_.top()->next_run_time_us <= clock_->NowMicros())) {
+      cond_var_.TimedWait(clock_->NowMicros() + 1000);
+    }
+    if (callback != nullptr) {
+      callback();
+    }
+    cond_var_.SignalAll();
+    do {
+      cond_var_.TimedWait(clock_->NowMicros() + 1000);
+    } while (executing_task_ ||
+             (!heap_.empty() &&
+              heap_.top()->next_run_time_us <= clock_->NowMicros()));
+  }
+
+  size_t TEST_GetPendingTaskNum() const {
+    InstrumentedMutexLock l(&mutex_);
+    size_t ret = 0;
+    for (const auto& fn_info : map_) {
+      if (fn_info.second->IsValid()) {
+        ret++;
+      }
+    }
+    return ret;
+  }
+
+  void TEST_OverrideTimer(SystemClock* clock) {
+    InstrumentedMutexLock l(&mutex_);
+    clock_ = clock;
+  }
+#endif  // NDEBUG
+
+ private:
+  void Run() {
+    InstrumentedMutexLock l(&mutex_);
+
+    while (running_) {
+      if (heap_.empty()) {
+        // wait
+        TEST_SYNC_POINT("Timer::Run::Waiting");
+        cond_var_.Wait();
+        continue;
+      }
+
+      FunctionInfo* current_fn = heap_.top();
+      assert(current_fn);
+
+      if (!current_fn->IsValid()) {
+        heap_.pop();
+        map_.erase(current_fn->name);
+        continue;
+      }
+
+      if (current_fn->next_run_time_us <= clock_->NowMicros()) {
+        // make a copy of the function so it won't be changed after
+        // mutex_.unlock.
+        std::function<void()> fn = current_fn->fn;
+        executing_task_ = true;
+        mutex_.Unlock();
+        // Execute the work
+        fn();
+        mutex_.Lock();
+        executing_task_ = false;
+        cond_var_.SignalAll();
+
+        // Remove the work from the heap once it is done executing; make sure
+        // it's the same function after executing the work while the mutex was
+        // released.
+        // Note that we are just removing the pointer from the heap. Its
+        // memory is still managed in the map (as it holds a unique ptr).
+        // So current_fn is still a valid ptr.
+        assert(heap_.top() == current_fn);
+        heap_.pop();
+
+        // current_fn may be cancelled already.
+        if (current_fn->IsValid() && current_fn->repeat_every_us > 0) {
+          assert(running_);
+          current_fn->next_run_time_us =
+              clock_->NowMicros() + current_fn->repeat_every_us;
+
+          // Schedule new work into the heap with the new time.
+          heap_.push(current_fn);
+        } else {
+          // if current_fn is cancelled or there is no need to repeat, remove
+          // it from the map to avoid a leak.
+          map_.erase(current_fn->name);
+        }
+      } else {
+        cond_var_.TimedWait(current_fn->next_run_time_us);
+      }
+    }
+  }
+
+  void CancelAllWithLock() {
+    mutex_.AssertHeld();
+    if (map_.empty() && heap_.empty()) {
+      return;
+    }
+
+    // With mutex_ held, set all tasks to invalid so that they will not be
+    // re-queued.
+    for (auto& elem : map_) {
+      auto& func_info = elem.second;
+      assert(func_info);
+      func_info->Cancel();
+    }
+
+    // WaitForTaskCompleteIfNecessary() may release mutex_
+    WaitForTaskCompleteIfNecessary();
+
+    while (!heap_.empty()) {
+      heap_.pop();
+    }
+    map_.clear();
+  }
+
+  // A wrapper around std::function to keep track of when it should run next
+  // and at what frequency.
+  struct FunctionInfo {
+    // the actual work
+    std::function<void()> fn;
+    // name of the function
+    std::string name;
+    // when the function should run next
+    uint64_t next_run_time_us;
+    // repeat interval
+    uint64_t repeat_every_us;
+    // controls whether this function is valid.
+    // A function is valid upon construction and until someone explicitly
+    // calls `Cancel()`.
+    bool valid;
+
+    FunctionInfo(std::function<void()>&& _fn, std::string _name,
+                 const uint64_t _next_run_time_us, uint64_t _repeat_every_us)
+        : fn(std::move(_fn)),
+          name(std::move(_name)),
+          next_run_time_us(_next_run_time_us),
+          repeat_every_us(_repeat_every_us),
+          valid(true) {}
+
+    void Cancel() { valid = false; }
+
+    bool IsValid() const { return valid; }
+  };
+
+  void WaitForTaskCompleteIfNecessary() {
+    mutex_.AssertHeld();
+    while (executing_task_) {
+      TEST_SYNC_POINT("Timer::WaitForTaskCompleteIfNecessary:TaskExecuting");
+      cond_var_.Wait();
+    }
+  }
+
+  struct RunTimeOrder {
+    bool operator()(const FunctionInfo* f1, const FunctionInfo* f2) {
+      return f1->next_run_time_us > f2->next_run_time_us;
+    }
+  };
+
+  SystemClock* clock_;
+  // This mutex controls both the heap_ and the map_. It needs to be held for
+  // making any changes in them.
+  mutable InstrumentedMutex mutex_;
+  InstrumentedCondVar cond_var_;
+  std::unique_ptr<port::Thread> thread_;
+  bool running_;
+  bool executing_task_;
+
+  std::priority_queue<FunctionInfo*, std::vector<FunctionInfo*>, RunTimeOrder>
+      heap_;
+
+  // In addition to providing a mapping from a function name to a function,
+  // it is also responsible for memory management.
+  std::unordered_map<std::string, std::unique_ptr<FunctionInfo>> map_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/timer_queue.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/timer_queue.h
new file mode 100644
index 0000000000..36a1744aca
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/timer_queue.h
@@ -0,0 +1,231 @@
+// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+// Timer Queue
+//
+// License
+//
+// The source code in this article is licensed under the CC0 license, so feel
+// free to copy, modify, share, do whatever you want with it.
+// No attribution is required, but I'll be happy if you do.
+// CC0 license
+
+// The person who associated a work with this deed has dedicated the work to
+// the public domain by waiving all of his or her rights to the work worldwide
+// under copyright law, including all related and neighboring rights, to the
+// extent allowed by law. You can copy, modify, distribute and perform the
+// work, even for commercial purposes, all without asking permission.
+
+#pragma once
+
+#include <assert.h>
+
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <utility>
+
+#include "port/port.h"
+#include "test_util/sync_point.h"
+
+// Allows execution of handlers at a specified time in the future
+// Guarantees:
+//  - All handlers are executed ONCE, even if cancelled (the aborted parameter
+//    will be set to true)
+//  - If TimerQueue is destroyed, it will cancel all handlers.
+//  - Handlers are ALWAYS executed in the Timer Queue worker thread.
+//  - Handlers' execution order is NOT guaranteed
+//
+////////////////////////////////////////////////////////////////////////////////
+// borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+class TimerQueue {
+ public:
+  TimerQueue() : m_th(&TimerQueue::run, this) {}
+
+  ~TimerQueue() { shutdown(); }
+
+  // This function is not thread-safe.
+  void shutdown() {
+    if (closed_) {
+      return;
+    }
+    cancelAll();
+    // Abusing the timer queue to trigger the shutdown.
+    add(0, [this](bool) {
+      m_finish = true;
+      return std::make_pair(false, 0);
+    });
+    m_th.join();
+    closed_ = true;
+  }
+
+  // Adds a new timer
+  // \return
+  //  Returns the ID of the new timer. You can use this ID to cancel the
+  //  timer
+  uint64_t add(int64_t milliseconds,
+               std::function<std::pair<bool, int64_t>(bool)> handler) {
+    WorkItem item;
+    Clock::time_point tp = Clock::now();
+    item.end = tp + std::chrono::milliseconds(milliseconds);
+    TEST_SYNC_POINT_CALLBACK("TimeQueue::Add:item.end", &item.end);
+    item.period = milliseconds;
+    item.handler = std::move(handler);
+
+    std::unique_lock<std::mutex> lk(m_mtx);
+    uint64_t id = ++m_idcounter;
+    item.id = id;
+    m_items.push(std::move(item));
+
+    // Something changed, so wake up the timer thread
+    m_checkWork.notify_one();
+    return id;
+  }
+
+  // Cancels the specified timer
+  // \return
+  //  1 if the timer was cancelled.
+  //  0 if you were too late to cancel (or the timer ID was never valid to
+  //  start with)
+  size_t cancel(uint64_t id) {
+    // Instead of removing the item from the container (thus breaking the
+    // heap integrity), we set the item as having no handler, and put
+    // that handler on a new item at the top for immediate execution.
+    // The timer thread will then ignore the original item, since it has no
+    // handler.
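+    //
+    // A hypothetical usage sketch of add()/cancel() (not part of the original
+    // source): the handler receives `aborted` and returns {reschedule, period}:
+    //
+    //   TimerQueue tq;
+    //   uint64_t id = tq.add(100, [](bool aborted) {
+    //     (void)aborted;  // true if the queue cancelled us
+    //     return std::make_pair(false, int64_t{0});  // run once, no repeat
+    //   });
+    //   tq.cancel(id);  // returns 1 if the handler had not yet run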
+    std::unique_lock<std::mutex> lk(m_mtx);
+    for (auto&& item : m_items.getContainer()) {
+      if (item.id == id && item.handler) {
+        WorkItem newItem;
+        // Zero time, so it stays at the top for immediate execution
+        newItem.end = Clock::time_point();
+        newItem.id = 0;  // Means it is a canceled item
+        // Move the handler from item to newItem (thus clearing item)
+        newItem.handler = std::move(item.handler);
+        m_items.push(std::move(newItem));
+
+        // Something changed, so wake up the timer thread
+        m_checkWork.notify_one();
+        return 1;
+      }
+    }
+    return 0;
+  }
+
+  // Cancels all timers
+  // \return
+  //  The number of timers cancelled
+  size_t cancelAll() {
+    // Setting all "end" to 0 (for immediate execution) is ok,
+    // since it maintains the heap integrity
+    std::unique_lock<std::mutex> lk(m_mtx);
+    m_cancel = true;
+    for (auto&& item : m_items.getContainer()) {
+      if (item.id && item.handler) {
+        item.end = Clock::time_point();
+        item.id = 0;
+      }
+    }
+    auto ret = m_items.size();
+
+    m_checkWork.notify_one();
+    return ret;
+  }
+
+ private:
+  using Clock = std::chrono::steady_clock;
+  TimerQueue(const TimerQueue&) = delete;
+  TimerQueue& operator=(const TimerQueue&) = delete;
+
+  void run() {
+    std::unique_lock<std::mutex> lk(m_mtx);
+    while (!m_finish) {
+      auto end = calcWaitTime_lock();
+      if (end.first) {
+        // Timers found, so wait until the earliest one expires (or something
+        // else changes)
+        m_checkWork.wait_until(lk, end.second);
+      } else {
+        // No timers exist, so wait forever until something changes
+        m_checkWork.wait(lk);
+      }
+
+      // Check and execute as much work as possible, such as all expired
+      // timers
+      checkWork(&lk);
+    }
+
+    // If we are shutting down, we should not have any items left,
+    // since the shutdown cancels all items
+    assert(m_items.size() == 0);
+  }
+
+  std::pair<bool, Clock::time_point> calcWaitTime_lock() {
+    while (m_items.size()) {
+      if (m_items.top().handler) {
+        // Item present, so return the new wait time
+        return std::make_pair(true, m_items.top().end);
+      } else {
+        // Discard empty handlers (they were cancelled)
+        m_items.pop();
+      }
+    }
+
+    // No items found, so return no wait time (causes the thread to wait
+    // indefinitely)
+    return std::make_pair(false, Clock::time_point());
+  }
+
+  void checkWork(std::unique_lock<std::mutex>* lk) {
+    while (m_items.size() && m_items.top().end <= Clock::now()) {
+      WorkItem item(m_items.top());
+      m_items.pop();
+
+      if (item.handler) {
+        (*lk).unlock();
+        auto reschedule_pair = item.handler(item.id == 0);
+        (*lk).lock();
+        if (!m_cancel && reschedule_pair.first) {
+          int64_t new_period = (reschedule_pair.second == -1)
+                                   ?
 item.period
+                                   : reschedule_pair.second;
+
+          item.period = new_period;
+          item.end = Clock::now() + std::chrono::milliseconds(new_period);
+          m_items.push(std::move(item));
+        }
+      }
+    }
+  }
+
+  bool m_finish = false;
+  bool m_cancel = false;
+  uint64_t m_idcounter = 0;
+  std::condition_variable m_checkWork;
+
+  struct WorkItem {
+    Clock::time_point end;
+    int64_t period;
+    uint64_t id;  // id == 0 means it was cancelled
+    std::function<std::pair<bool, int64_t>(bool)> handler;
+    bool operator>(const WorkItem& other) const { return end > other.end; }
+  };
+
+  std::mutex m_mtx;
+  // Inheriting from priority_queue, so we can access the internal container
+  class Queue : public std::priority_queue<WorkItem, std::vector<WorkItem>,
+                                           std::greater<WorkItem>> {
+   public:
+    std::vector<WorkItem>& getContainer() { return this->c; }
+  } m_items;
+  ROCKSDB_NAMESPACE::port::Thread m_th;
+  bool closed_ = false;
+};
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/udt_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/udt_util.h
new file mode 100644
index 0000000000..219a093d8d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/udt_util.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <sstream>
+#include <vector>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Dummy record in WAL logs signaling user-defined timestamp sizes for
+// subsequent records.
+class UserDefinedTimestampSizeRecord {
+ public:
+  UserDefinedTimestampSizeRecord() {}
+  explicit UserDefinedTimestampSizeRecord(
+      std::vector<std::pair<uint32_t, size_t>>&& cf_to_ts_sz)
+      : cf_to_ts_sz_(std::move(cf_to_ts_sz)) {}
+
+  const std::vector<std::pair<uint32_t, size_t>>& GetUserDefinedTimestampSize()
+      const {
+    return cf_to_ts_sz_;
+  }
+
+  inline void EncodeTo(std::string* dst) const {
+    assert(dst != nullptr);
+    for (const auto& [cf_id, ts_sz] : cf_to_ts_sz_) {
+      assert(ts_sz != 0);
+      PutFixed32(dst, cf_id);
+      PutFixed16(dst, static_cast<uint16_t>(ts_sz));
+    }
+  }
+
+  inline Status DecodeFrom(Slice* src) {
+    const size_t total_size = src->size();
+    if ((total_size % kSizePerColumnFamily) != 0) {
+      std::ostringstream oss;
+      oss << "User-defined timestamp size record length: " << total_size
+          << " is not a multiple of " << kSizePerColumnFamily << std::endl;
+      return Status::Corruption(oss.str());
+    }
+    int num_of_entries = static_cast<int>(total_size / kSizePerColumnFamily);
+    for (int i = 0; i < num_of_entries; i++) {
+      uint32_t cf_id = 0;
+      uint16_t ts_sz = 0;
+      if (!GetFixed32(src, &cf_id) || !GetFixed16(src, &ts_sz)) {
+        return Status::Corruption(
+            "Error decoding user-defined timestamp size record entry");
+      }
+      cf_to_ts_sz_.emplace_back(cf_id, static_cast<size_t>(ts_sz));
+    }
+    return Status::OK();
+  }
+
+  inline std::string DebugString() const {
+    std::ostringstream oss;
+
+    for (const auto& [cf_id, ts_sz] : cf_to_ts_sz_) {
+      oss << "Column family: " << cf_id
+          << ", user-defined timestamp size: " << ts_sz << std::endl;
+    }
+    return oss.str();
+  }
+
+ private:
+  // 4 bytes for column family id, 2 bytes for user-defined timestamp size.
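+  // For example (illustrative values, not from the original source): an entry
+  // {cf_id = 5, ts_sz = 8} encodes to the six bytes 05 00 00 00 08 00, since
+  // PutFixed32/PutFixed16 write little-endian.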
+  static constexpr size_t kSizePerColumnFamily = 4 + 2;
+
+  std::vector<std::pair<uint32_t, size_t>> cf_to_ts_sz_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/user_comparator_wrapper.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/user_comparator_wrapper.h
new file mode 100644
index 0000000000..59ebada12d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/user_comparator_wrapper.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/comparator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Wrapper of user comparator, with auto increment to
+// perf_context.user_key_comparison_count.
+class UserComparatorWrapper {
+ public:
+  // `UserComparatorWrapper`s constructed with the default constructor are not
+  // usable and will segfault on any attempt to use them for comparisons.
+  UserComparatorWrapper() : user_comparator_(nullptr) {}
+
+  explicit UserComparatorWrapper(const Comparator* const user_cmp)
+      : user_comparator_(user_cmp) {}
+
+  ~UserComparatorWrapper() = default;
+
+  const Comparator* user_comparator() const { return user_comparator_; }
+
+  int Compare(const Slice& a, const Slice& b) const {
+    PERF_COUNTER_ADD(user_key_comparison_count, 1);
+    return user_comparator_->Compare(a, b);
+  }
+
+  bool Equal(const Slice& a, const Slice& b) const {
+    PERF_COUNTER_ADD(user_key_comparison_count, 1);
+    return user_comparator_->Equal(a, b);
+  }
+
+  int CompareTimestamp(const Slice& ts1, const Slice& ts2) const {
+    return user_comparator_->CompareTimestamp(ts1, ts2);
+  }
+
+  int CompareWithoutTimestamp(const Slice& a, const Slice& b) const {
+    PERF_COUNTER_ADD(user_key_comparison_count, 1);
+    return user_comparator_->CompareWithoutTimestamp(a, b);
+  }
+
+  int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
+                              bool b_has_ts) const {
+    PERF_COUNTER_ADD(user_key_comparison_count, 1);
+    return user_comparator_->CompareWithoutTimestamp(a, a_has_ts, b, b_has_ts);
+  }
+
+  bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const {
+    return user_comparator_->EqualWithoutTimestamp(a, b);
+  }
+
+ private:
+  const Comparator* user_comparator_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/vector_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/vector_iterator.h
new file mode 100644
index 0000000000..c4cc01d561
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/vector_iterator.h
@@ -0,0 +1,118 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
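+//
+// VectorIterator (below) adapts two parallel key/value vectors into an
+// InternalIterator. A hypothetical usage sketch (not part of the original
+// source); without a comparator, entries are visited in insertion order:
+//
+//   VectorIterator it({"a", "b"}, {"1", "2"});
+//   for (it.SeekToFirst(); it.Valid(); it.Next()) {
+//     // it.key()/it.value() yield "a"/"1", then "b"/"2"
+//   }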
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Iterator over a vector of keys/values
+class VectorIterator : public InternalIterator {
+ public:
+  VectorIterator(std::vector<std::string> keys,
+                 std::vector<std::string> values,
+                 const CompareInterface* icmp = nullptr)
+      : keys_(std::move(keys)),
+        values_(std::move(values)),
+        current_(keys_.size()),
+        indexed_cmp_(icmp, &keys_) {
+    assert(keys_.size() == values_.size());
+
+    indices_.reserve(keys_.size());
+    for (size_t i = 0; i < keys_.size(); i++) {
+      indices_.push_back(i);
+    }
+    if (icmp != nullptr) {
+      std::sort(indices_.begin(), indices_.end(), indexed_cmp_);
+    }
+  }
+
+  virtual bool Valid() const override {
+    return !indices_.empty() && current_ < indices_.size();
+  }
+
+  virtual void SeekToFirst() override { current_ = 0; }
+  virtual void SeekToLast() override { current_ = indices_.size() - 1; }
+
+  virtual void Seek(const Slice& target) override {
+    if (indexed_cmp_.cmp != nullptr) {
+      current_ = std::lower_bound(indices_.begin(), indices_.end(), target,
+                                  indexed_cmp_) -
+                 indices_.begin();
+    } else {
+      current_ =
+          std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) -
+          keys_.begin();
+    }
+  }
+
+  virtual void SeekForPrev(const Slice& target) override {
+    if (indexed_cmp_.cmp != nullptr) {
+      current_ = std::upper_bound(indices_.begin(), indices_.end(), target,
+                                  indexed_cmp_) -
+                 indices_.begin();
+    } else {
+      current_ =
+          std::upper_bound(keys_.begin(), keys_.end(), target.ToString()) -
+          keys_.begin();
+    }
+    if (!Valid()) {
+      SeekToLast();
+    } else {
+      Prev();
+    }
+  }
+
+  virtual void Next() override { current_++; }
+  virtual void Prev() override { current_--; }
+
+  virtual Slice key() const override {
+    return Slice(keys_[indices_[current_]]);
+  }
+  virtual Slice value() const override {
+    return Slice(values_[indices_[current_]]);
+  }
+
+  virtual Status status() const override { return Status::OK(); }
+
+  virtual bool IsKeyPinned() const override { return true; }
+  virtual bool IsValuePinned() const override { return true; }
+
+ protected:
+  std::vector<std::string> keys_;
+  std::vector<std::string> values_;
+  size_t current_;
+
+ private:
+  struct IndexedKeyComparator {
+    IndexedKeyComparator(const CompareInterface* c,
+                         const std::vector<std::string>* ks)
+        : cmp(c), keys(ks) {}
+
+    bool operator()(size_t a, size_t b) const {
+      return cmp->Compare((*keys)[a], (*keys)[b]) < 0;
+    }
+
+    bool operator()(size_t a, const Slice& b) const {
+      return cmp->Compare((*keys)[a], b) < 0;
+    }
+
+    bool operator()(const Slice& a, size_t b) const {
+      return cmp->Compare(a, (*keys)[b]) < 0;
+    }
+
+    const CompareInterface* cmp;
+    const std::vector<std::string>* keys;
+  };
+
+  IndexedKeyComparator indexed_cmp_;
+  std::vector<size_t> indices_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/work_queue.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/work_queue.h
new file mode 100644
index 0000000000..94ece85d9d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/work_queue.h
@@ -0,0 +1,150 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+/*
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ */
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <condition_variable>
+#include <cstddef>
+#include <functional>
+#include <mutex>
+#include <queue>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/// Unbounded thread-safe work queue.
+//
+// This file is an excerpt from Facebook's zstd repo at
+// https://github.com/facebook/zstd/. The relevant file is
+// contrib/pzstd/utils/WorkQueue.h.
+
+template <typename T>
+class WorkQueue {
+  // Protects all member variable access
+  std::mutex mutex_;
+  std::condition_variable readerCv_;
+  std::condition_variable writerCv_;
+  std::condition_variable finishCv_;
+
+  std::queue<T> queue_;
+  bool done_;
+  std::size_t maxSize_;
+
+  // Must have lock to call this function
+  bool full() const {
+    if (maxSize_ == 0) {
+      return false;
+    }
+    return queue_.size() >= maxSize_;
+  }
+
+ public:
+  /**
+   * Constructs an empty work queue with an optional max size.
+   * If `maxSize == 0` the queue size is unbounded.
+   *
+   * @param maxSize The maximum allowed size of the work queue.
+   */
+  WorkQueue(std::size_t maxSize = 0) : done_(false), maxSize_(maxSize) {}
+
+  /**
+   * Push an item onto the work queue. Notify a single thread that work is
+   * available. If `finish()` has been called, do nothing and return false.
+   * If `push()` returns false, then `item` has not been copied from.
+   *
+   * @param item  Item to push onto the queue.
+   * @returns     True upon success, false if `finish()` has been called. An
+   *              item was pushed iff `push()` returns true.
+   */
+  template <typename U>
+  bool push(U&& item) {
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      while (full() && !done_) {
+        writerCv_.wait(lock);
+      }
+      if (done_) {
+        return false;
+      }
+      queue_.push(std::forward<U>(item));
+    }
+    readerCv_.notify_one();
+    return true;
+  }
+
+  /**
+   * Attempts to pop an item off the work queue. It will block until data is
+   * available or `finish()` has been called.
+   *
+   * @param[out] item  If `pop` returns `true`, it contains the popped item.
+   *                   If `pop` returns `false`, it is unmodified.
+   * @returns          True upon success. False if the queue is empty and
+   *                   `finish()` has been called.
+   */
+  bool pop(T& item) {
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      while (queue_.empty() && !done_) {
+        readerCv_.wait(lock);
+      }
+      if (queue_.empty()) {
+        assert(done_);
+        return false;
+      }
+      item = queue_.front();
+      queue_.pop();
+    }
+    writerCv_.notify_one();
+    return true;
+  }
+
+  /**
+   * Sets the maximum queue size. If `maxSize == 0` then it is unbounded.
+   *
+   * @param maxSize The new maximum queue size.
+   */
+  void setMaxSize(std::size_t maxSize) {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      maxSize_ = maxSize;
+    }
+    writerCv_.notify_all();
+  }
+
+  /**
+   * Promise that `push()` won't be called again, so once the queue is empty
+   * there will never be any more work.
+   */
+  void finish() {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      assert(!done_);
+      done_ = true;
+    }
+    readerCv_.notify_all();
+    writerCv_.notify_all();
+    finishCv_.notify_all();
+  }
+
+  /// Blocks until `finish()` has been called (but the queue may not be empty).
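+  ///
+  /// A hypothetical usage sketch (not part of the original source): a
+  /// producer calls finish() after its last push(), and a waiter blocks here:
+  ///
+  ///   WorkQueue<int> q;
+  ///   std::thread producer([&q] { q.push(1); q.finish(); });
+  ///   q.waitUntilFinished();  // returns once finish() has run
+  ///   producer.join();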
+  void waitUntilFinished() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (!done_) {
+      finishCv_.wait(lock);
+    }
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/xxhash.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/xxhash.cc
new file mode 100644
index 0000000000..88852c3308
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/xxhash.cc
@@ -0,0 +1,48 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Copyright (C) 2012-2020 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following disclaimer
+ *     in the documentation and/or other materials provided with the
+ *     distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/*
+ * xxhash.c instantiates functions defined in xxhash.h
+ */
+// clang-format off
+#ifndef XXH_STATIC_LINKING_ONLY
+#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
+#endif // !defined(XXH_STATIC_LINKING_ONLY)
+#define XXH_IMPLEMENTATION /* access definitions */
+
+#include "xxhash.h"
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/xxhash.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/xxhash.h
new file mode 100644
index 0000000000..ad49bab816
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/util/xxhash.h
@@ -0,0 +1,6360 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
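+//
+// Note on the build layout in this vendored tree: util/xxhash.cc compiles the
+// implementation once (it defines XXH_IMPLEMENTATION before including this
+// header), while other translation units include only the declarations, with
+// XXH_STATIC_LINKING_ONLY exposing the advanced API.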
+ +/* BEGIN RocksDB customizations */ +#ifndef XXH_STATIC_LINKING_ONLY +// Using compiled xxhash.cc +#define XXH_STATIC_LINKING_ONLY 1 +#endif // !defined(XXH_STATIC_LINKING_ONLY) +#ifndef XXH_NAMESPACE +#define XXH_NAMESPACE ROCKSDB_ +#endif // !defined(XXH_NAMESPACE) + +// for FALLTHROUGH_INTENDED, inserted as appropriate +#include "port/lang.h" +/* END RocksDB customizations */ + +// clang-format off +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/*! + * @mainpage xxHash + * + * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed + * limits. + * + * It is proposed in four flavors, in three families: + * 1. @ref XXH32_family + * - Classic 32-bit hash function. Simple, compact, and runs on almost all + * 32-bit and 64-bit systems. + * 2. @ref XXH64_family + * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most + * 64-bit systems (but _not_ 32-bit systems). + * 3. @ref XXH3_family + * - Modern 64-bit and 128-bit hash function family which features improved + * strength and performance across the board, especially on smaller data. + * It benefits greatly from SIMD and 64-bit without requiring it. + * + * Benchmarks + * --- + * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. + * The open source benchmark program is compiled with clang v10.0 using -O3 flag. 
+ *
+ * | Hash Name            | ISA ext | Width | Large Data Speed | Small Data Velocity |
+ * | -------------------- | ------- | ----: | ---------------: | ------------------: |
+ * | XXH3_64bits()        | @b AVX2 |    64 | 59.4 GB/s        | 133.1               |
+ * | MeowHash             | AES-NI  |   128 | 58.2 GB/s        | 52.5                |
+ * | XXH3_128bits()       | @b AVX2 |   128 | 57.9 GB/s        | 118.1               |
+ * | CLHash               | PCLMUL  |    64 | 37.1 GB/s        | 58.1                |
+ * | XXH3_64bits()        | @b SSE2 |    64 | 31.5 GB/s        | 133.1               |
+ * | XXH3_128bits()       | @b SSE2 |   128 | 29.6 GB/s        | 118.1               |
+ * | RAM sequential read  |         |   N/A | 28.0 GB/s        | N/A                 |
+ * | ahash                | AES-NI  |    64 | 22.5 GB/s        | 107.2               |
+ * | City64               |         |    64 | 22.0 GB/s        | 76.6                |
+ * | T1ha2                |         |    64 | 22.0 GB/s        | 99.0                |
+ * | City128              |         |   128 | 21.7 GB/s        | 57.7                |
+ * | FarmHash             | AES-NI  |    64 | 21.3 GB/s        | 71.9                |
+ * | XXH64()              |         |    64 | 19.4 GB/s        | 71.0                |
+ * | SpookyHash           |         |    64 | 19.3 GB/s        | 53.2                |
+ * | Mum                  |         |    64 | 18.0 GB/s        | 67.0                |
+ * | CRC32C               | SSE4.2  |    32 | 13.0 GB/s        | 57.9                |
+ * | XXH32()              |         |    32 |  9.7 GB/s        | 71.9                |
+ * | City32               |         |    32 |  9.1 GB/s        | 66.0                |
+ * | Blake3*              | @b AVX2 |   256 |  4.4 GB/s        | 8.1                 |
+ * | Murmur3              |         |    32 |  3.9 GB/s        | 56.1                |
+ * | SipHash*             |         |    64 |  3.0 GB/s        | 43.2                |
+ * | Blake3*              | @b SSE2 |   256 |  2.4 GB/s        | 8.1                 |
+ * | HighwayHash          |         |    64 |  1.4 GB/s        | 6.0                 |
+ * | FNV64                |         |    64 |  1.2 GB/s        | 62.7                |
+ * | Blake2*              |         |   256 |  1.1 GB/s        | 5.1                 |
+ * | SHA1*                |         |   160 |  0.8 GB/s        | 5.6                 |
+ * | MD5*                 |         |   128 |  0.6 GB/s        | 7.8                 |
+ * @note
+ *   - Hashes which require a specific ISA extension are noted. SSE2 is also noted,
+ *     even though it is mandatory on x64.
+ *   - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic
+ *     by modern standards.
+ *   - Small data velocity is a rough average of algorithm's efficiency for small
+ *     data. For more accurate information, see the wiki.
+ *   - More benchmarks and strength tests are found on the wiki:
+ *     https://github.com/Cyan4973/xxHash/wiki
+ *
+ * Usage
+ * ------
+ * All xxHash variants use a similar API. Changing the algorithm is a trivial
+ * substitution.
+ *
+ * @pre
+ *    For functions which take an input and length parameter, the following
+ *    requirements are assumed:
+ *    - The range from [`input`, `input + length`) is valid, readable memory.
+ *      - The only exception is if the `length` is `0`, `input` may be `NULL`.
+ *    - For C++, the objects must have the *TriviallyCopyable* property, as the
+ *      functions access bytes directly as if it was an array of `unsigned char`.
+ *
+ * @anchor single_shot_example
+ * **Single Shot**
+ *
+ * These functions are stateless functions which hash a contiguous block of memory,
+ * immediately returning the result. They are the easiest and usually the fastest
+ * option.
+ *
+ * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits()
+ *
+ * @code{.c}
+ *   #include <string.h>
+ *   #include "xxhash.h"
+ *
+ *   // Example for a function which hashes a null terminated string with XXH32().
+ *   XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed)
+ *   {
+ *       // NULL pointers are only valid if the length is zero
+ *       size_t length = (string == NULL) ? 0 : strlen(string);
+ *       return XXH32(string, length, seed);
+ *   }
+ * @endcode
+ *
+ * @anchor streaming_example
+ * **Streaming**
+ *
+ * These groups of functions allow incremental hashing of unknown size, even
+ * more than what would fit in a size_t.
+ *
+ * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset()
+ *
+ * @code{.c}
+ *   #include <stdio.h>
+ *   #include <assert.h>
+ *   #include "xxhash.h"
+ *   // Example for a function which hashes a FILE incrementally with XXH3_64bits().
+ * XXH64_hash_t hashFile(FILE* f) + * { + * // Allocate a state struct. Do not just use malloc() or new. + * XXH3_state_t* state = XXH3_createState(); + * assert(state != NULL && "Out of memory!"); + * // Reset the state to start a new hashing session. + * XXH3_64bits_reset(state); + * char buffer[4096]; + * size_t count; + * // Read the file in chunks + * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { + * // Run update() as many times as necessary to process the data + * XXH3_64bits_update(state, buffer, count); + * } + * // Retrieve the finalized hash. This will not change the state. + * XXH64_hash_t result = XXH3_64bits_digest(state); + * // Free the state. Do not use free(). + * XXH3_freeState(state); + * return result; + * } + * @endcode + * + * @file xxhash.h + * xxHash prototypes and implementation + */ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Exposes the implementation and marks all functions as `inline`. + * + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * @code{.c} + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * @endcode + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +# define XXH_INLINE_ALL +# undef XXH_INLINE_ALL +/*! + * @brief Exposes the implementation without marking functions as inline. + */ +# define XXH_PRIVATE_API +# undef XXH_PRIVATE_API +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. 
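+ *
+ * For example, with a hypothetical prefix:
+ * @code{.c}
+ * #define XXH_NAMESPACE MYLIB_
+ * #include "xxhash.h"
+ * // The linker-visible symbol is MYLIB_XXH64, but callers still write:
+ * XXH64_hash_t h = XXH64("abc", 3, 0);
+ * @endcode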
+ */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ + /* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. + * They will then be redefined for XXH_INLINE_ALL + */ +# undef XXH_versionNumber + /* XXH32 */ +# undef XXH32 +# undef XXH32_createState +# undef XXH32_freeState +# undef XXH32_reset +# undef XXH32_update +# undef XXH32_digest +# undef XXH32_copyState +# undef XXH32_canonicalFromHash +# undef XXH32_hashFromCanonical + /* XXH64 */ +# undef XXH64 +# undef XXH64_createState +# undef XXH64_freeState +# undef XXH64_reset +# undef XXH64_update +# undef XXH64_digest +# undef XXH64_copyState +# undef XXH64_canonicalFromHash +# undef XXH64_hashFromCanonical + /* XXH3_64bits */ +# undef XXH3_64bits +# undef XXH3_64bits_withSecret +# undef XXH3_64bits_withSeed +# undef XXH3_64bits_withSecretandSeed +# undef XXH3_createState +# undef XXH3_freeState +# undef XXH3_copyState +# undef XXH3_64bits_reset +# undef XXH3_64bits_reset_withSeed +# undef XXH3_64bits_reset_withSecret +# undef XXH3_64bits_update +# undef XXH3_64bits_digest +# undef XXH3_generateSecret + /* XXH3_128bits */ +# undef XXH128 +# undef XXH3_128bits +# undef XXH3_128bits_withSeed +# undef XXH3_128bits_withSecret +# undef XXH3_128bits_reset +# undef XXH3_128bits_reset_withSeed +# undef XXH3_128bits_reset_withSecret +# undef XXH3_128bits_reset_withSecretandSeed +# undef XXH3_128bits_update +# undef XXH3_128bits_digest +# undef XXH128_isEqual +# undef XXH128_cmp +# undef XXH128_canonicalFromHash +# undef XXH128_hashFromCanonical + /* Finally, free the namespace itself */ +# undef XXH_NAMESPACE + + /* employ the namespace for XXH_INLINE_ALL */ +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, + * but they must nonetheless be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and has a more dispersed impact. + * Meanwhile, renaming can be achieved in a single place. 
+ */
+# define XXH_IPREF(Id) XXH_NAMESPACE ## Id
+# define XXH_OK XXH_IPREF(XXH_OK)
+# define XXH_ERROR XXH_IPREF(XXH_ERROR)
+# define XXH_errorcode XXH_IPREF(XXH_errorcode)
+# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t)
+# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t)
+# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
+# define XXH32_state_s XXH_IPREF(XXH32_state_s)
+# define XXH32_state_t XXH_IPREF(XXH32_state_t)
+# define XXH64_state_s XXH_IPREF(XXH64_state_s)
+# define XXH64_state_t XXH_IPREF(XXH64_state_t)
+# define XXH3_state_s XXH_IPREF(XXH3_state_s)
+# define XXH3_state_t XXH_IPREF(XXH3_state_t)
+# define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
+  /* Ensure the header is parsed again, even if it was previously included */
+# undef XXHASH_H_5627135585666179
+# undef XXHASH_H_STATIC_13879238742
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+/* ****************************************************************
+ *  Stable API
+ *****************************************************************/
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+/*! @brief Marks a global symbol. */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#  ifdef XXH_EXPORT
+#   define XXH_PUBLIC_API __declspec(dllexport)
+#  elif XXH_IMPORT
+#   define XXH_PUBLIC_API __declspec(dllimport)
+#  endif
+# else
+#  define XXH_PUBLIC_API /* do nothing */
+# endif
+#endif
+
+#ifdef XXH_NAMESPACE
+# define XXH_CAT(A,B) A##B
+# define XXH_NAME2(A,B) XXH_CAT(A,B)
+# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+/* XXH32 */
+# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+/* XXH64 */
+# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+/* XXH3_64bits */
+# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
+# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
+# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
+# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
+/* XXH3_128bits */
+# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
+# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
+# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
+# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
+# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
+# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
+# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
+# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
+# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
+# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
+# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
+# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Compiler specifics
+***************************************/
+
+/* specific declaration modes for Windows */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#  ifdef XXH_EXPORT
+#   define XXH_PUBLIC_API __declspec(dllexport)
+#  elif XXH_IMPORT
+#   define XXH_PUBLIC_API __declspec(dllimport)
+#  endif
+# else
+#  define XXH_PUBLIC_API /* do nothing */
+# endif
+#endif
+
+#if defined (__GNUC__)
+# define XXH_CONSTF __attribute__((const))
+# define XXH_PUREF __attribute__((pure))
+# define XXH_MALLOCF __attribute__((malloc))
+#else
+# define XXH_CONSTF /* disable */
+# define XXH_PUREF
+# define XXH_MALLOCF
+#endif
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR 0
+#define XXH_VERSION_MINOR 8
+#define XXH_VERSION_RELEASE 1
+/*! @brief Version number, encoded as two digits each */
+#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+
+/*!
+ * @brief Obtains the xxHash version.
+ *
+ * This is mostly useful when xxHash is compiled as a shared library,
+ * since the returned value comes from the library, as opposed to the header
+ * file.
+ *
+ * @return @ref XXH_VERSION_NUMBER of the invoked library.
+ */
+XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Common basic types
+******************************/
+#include <stddef.h>   /* size_t */
+/*!
+ * @brief Exit code for the streaming API.
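+ *
+ * A typical check (sketch): treat anything other than XXH_OK as failure, e.g.
+ * @code{.c}
+ * if (XXH32_update(state, buffer, count) != XXH_OK) {
+ *     // handle failure
+ * }
+ * @endcode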
+ */
+typedef enum {
+    XXH_OK = 0, /*!< OK */
+    XXH_ERROR   /*!< Error */
+} XXH_errorcode;
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
+/*!
+ * @brief An unsigned 32-bit integer.
+ *
+ * Not necessarily defined to `uint32_t` but functionally equivalent.
+ */
+typedef uint32_t XXH32_hash_t;
+
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint32_t XXH32_hash_t;
+
+#else
+#   include <limits.h>
+#   if UINT_MAX == 0xFFFFFFFFUL
+      typedef unsigned int XXH32_hash_t;
+#   elif ULONG_MAX == 0xFFFFFFFFUL
+      typedef unsigned long XXH32_hash_t;
+#   else
+#     error "unsupported platform: need a 32-bit type"
+#   endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup XXH32_family XXH32 family
+ * @ingroup public
+ * Contains functions used in the classic 32-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
+ *   Note that the @ref XXH3_family provides competitive speed for both 32-bit
+ *   and 64-bit systems, and offers true 64/128 bit hash results.
+ *
+ * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families
+ * @see @ref XXH32_impl for implementation details
+ * @{
+ */
+
+/*!
+ * @brief Calculates the 32-bit hash of @p input using xxHash32.
+ *
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * See @ref single_shot_example "Single Shot Example" for an example.
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 32-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 32-bit hash value.
+ *
+ * @see
+ *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ *    Direct equivalents for the other variants of xxHash.
+ * @see
+ *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+#ifndef XXH_NO_STREAM
+/*!
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ *
+ * @see streaming_example at the top of @ref xxhash.h for an example.
+ */
+
+/*!
+ * @typedef struct XXH32_state_s XXH32_state_t
+ * @brief The opaque state struct for the XXH32 streaming API.
+ *
+ * @see XXH32_state_s for details.
+ */
+typedef struct XXH32_state_s XXH32_state_t;
+
+/*!
+ * @brief Allocates an @ref XXH32_state_t.
+ *
+ * Must be freed with XXH32_freeState().
+ * @return An allocated XXH32_state_t on success, `NULL` on failure.
+ */
+XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void);
+/*!
+ * @brief Frees an @ref XXH32_state_t.
+ *
+ * Must be allocated with XXH32_createState().
+ * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
+ * @return XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
+/*!
+ * @brief Copies one @ref XXH32_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH32_state_t to begin a new hash.
+ *
+ * This function resets and seeds a state. Call it before @ref XXH32_update().
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 32-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
+ *
+ * @note
+ *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated xxHash32 value from that state.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/*******   Canonical representation   *******/
+
+/*
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ * This is the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what is the order on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ */
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
+ */
+typedef struct {
+    unsigned char digest[4]; /*!< Hash bytes, big endian */
+} XXH32_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
+ *
+ * @param dst The @ref XXH32_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH32_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ *
+ * @param src The @ref XXH32_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+#ifdef __has_attribute
+# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+# define XXH_HAS_ATTRIBUTE(x) 0
+#endif
+
+/* C-language Attributes are added in C23. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
+# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+#else
+# define XXH_HAS_C_ATTRIBUTE(x) 0
+#endif
+
+#if defined(__cplusplus) && defined(__has_cpp_attribute)
+# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+# define XXH_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+
+/*
+ * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
+ * introduced in CPP17 and C23.
+ * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
+ * C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
+ */
+#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)
+# define XXH_FALLTHROUGH [[fallthrough]]
+#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))
+#else
+# define XXH_FALLTHROUGH /* fallthrough */
+#endif
+
+/*
+ * Define XXH_NOESCAPE for annotated pointers in public API.
+ * https://clang.llvm.org/docs/AttributeReference.html#noescape
+ * As of writing this, only supported by clang.
+ */
+#if XXH_HAS_ATTRIBUTE(noescape)
+# define XXH_NOESCAPE __attribute__((noescape))
+#else
+# define XXH_NOESCAPE
+#endif
+
+
+/*!
+ * @}
+ * @ingroup public
+ * @{
+ */
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
+/*!
+ * @brief An unsigned 64-bit integer.
+ *
+ * Not necessarily defined to `uint64_t` but functionally equivalent.
+ */
+typedef uint64_t XXH64_hash_t;
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+   typedef uint64_t XXH64_hash_t;
+#else
+#  include <limits.h>
+#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
+     /* LP64 ABI says uint64_t is unsigned long */
+     typedef unsigned long XXH64_hash_t;
+#  else
+     /* the following type must have a width of 64-bit */
+     typedef unsigned long long XXH64_hash_t;
+#  endif
+#endif
+
+/*!
+ * @} + * + * @defgroup XXH64_family XXH64 family + * @ingroup public + * @{ + * Contains functions used in the classic 64-bit xxHash algorithm. + * + * @note + * XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. + * It provides better speed for systems with vector processing capabilities. + */ + +/*! + * @brief Calculates the 64-bit hash of @p input using xxHash64. + * + * This function usually runs faster on 64-bit systems, but slower on 32-bit + * systems (see benchmark). + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit hash. + * + * @see + * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/*! + * @brief The opaque state struct for the XXH64 streaming API. + * + * @see XXH64_state_s for details. + */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ +/******* Canonical representation *******/ +typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); + +#ifndef XXH_NO_XXH3 + +/*! + * @} + * ************************************************************************ + * @defgroup XXH3_family XXH3 family + * @ingroup public + * @{ + * + * XXH3 is a more recent hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * Compared to XXH64, expect XXH3 to run approximately + * ~2x faster on large inputs and >3x faster on small ones, + * exact differences vary depending on platform. + * + * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, + * but does not require it. + * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 + * at competitive speeds, even without vector support. Further details are + * explained in the implementation. 
+ *
+ * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
+ * ZVector and scalar targets. This can be controlled via the @ref XXH_VECTOR
+ * macro. For the x86 family, an automatic dispatcher is included separately
+ * in @ref xxh_x86dispatch.c.
+ *
+ * XXH3 implementation is portable:
+ * it has a generic C90 formulation that can be compiled on any platform,
+ * all implementations generate exactly the same hash value on all platforms.
+ * Starting from v0.8.0, it's also labelled "stable", meaning that
+ * any future version will also generate the same hash value.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ *
+ * When only 64 bits are needed, prefer invoking the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ */
+/*-**********************************************************************
+* XXH3 64-bit variant
+************************************************************************/
+
+/*!
+ * @brief 64-bit unseeded variant of XXH3.
+ *
+ * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of 0, however
+ * it may have slightly better performance due to constant propagation of the
+ * defaults.
+ *
+ * @see
+ *    XXH32(), XXH64(), XXH3_128bits(): equivalent for the other xxHash algorithms
+ * @see
+ *    XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants
+ * @see
+ *    XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief 64-bit seeded variant of XXH3
+ *
+ * This variant generates a custom secret on the fly based on the default
+ * secret altered using the `seed` value.
+ *
+ * While this operation is decently fast, note that it's not completely free.
+ *
+ * @note
+ *    seed == 0 produces the same results as @ref XXH3_64bits().
+ *
+ * @param input The data to hash
+ * @param length The length
+ * @param seed The 64-bit seed to alter the state.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
+
+/*!
+ * The bare minimum size for a custom secret.
+ *
+ * @see
+ *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
+ *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
+ */
+#define XXH3_SECRET_SIZE_MIN 136
+
+/*!
+ * @brief 64-bit variant of XXH3 with a custom "secret".
+ *
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
+ * Therefore, the secret _must_ look like a bunch of random bytes.
+ * Avoid "trivial" or structured data such as repeated sequences or a text document.
+ * Whenever in doubt about the "randomness" of the blob of bytes,
+ * consider employing "XXH3_generateSecret()" instead (see below).
+ * It will generate a proper high entropy secret derived from the blob of bytes.
+ * Another advantage of using XXH3_generateSecret() is that
+ * it guarantees that all bits within the initial blob of bytes
+ * will impact every bit of the output.
+ * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); + + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + */ + +/*! + * @brief The state struct for the XXH3 streaming API. + * + * @see XXH3_state_s for details. + */ +typedef struct XXH3_state_s XXH3_state_t; +XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); +XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state); + +/* + * XXH3_64bits_reset(): + * Initialize with default parameters. + * digest will be equivalent to `XXH3_64bits()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); +/* + * XXH3_64bits_reset_withSeed(): + * Generate a custom secret from `seed`, and store it into `statePtr`. + * digest will be equivalent to `XXH3_64bits_withSeed()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); +/*! + * XXH3_64bits_reset_withSecret(): + * `secret` is referenced, it _must outlive_ the hash streaming session. + * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* note : canonical representation of XXH3 is the same as XXH64 + * since they both produce XXH64_hash_t values */ + + +/*-********************************************************************** +* XXH3 128-bit variant +************************************************************************/ + +/*! + * @brief The return value from 128-bit hashes. + * + * Stored in little endian order, although the fields themselves are in native + * endianness. + */ +typedef struct { + XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */ + XXH64_hash_t high64; /*!< `value >> 64` */ +} XXH128_hash_t; + +/*! + * @brief Unseeded 128-bit variant of XXH3 + * + * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead + * for shorter inputs. + * + * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of 0, however + * it may have slightly better performance due to constant propagation of the + * defaults. 
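+ *
+ * A one-shot usage sketch (`buf`/`bufSize` are placeholder names):
+ * @code{.c}
+ *     XXH128_hash_t h = XXH3_128bits(buf, bufSize);
+ *     // h.low64 and h.high64 hold the two 64-bit halves of the result
+ * @endcode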
+ *
+ * @see
+ *    XXH32(), XXH64(), XXH3_64bits(): equivalent for the other xxHash algorithms
+ * @see
+ *    XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
+ * @see
+ *    XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
+/*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ *
+ * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
+ * Use already declared XXH3_createState() and XXH3_freeState().
+ *
+ * All reset and streaming functions have the same meaning as their 64-bit counterparts.
+ */
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* Following helper functions make it possible to compare XXH128_hash_t values.
+ * Since XXH128_hash_t is a structure, this capability is not offered by the language.
+ * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
+
+/*!
+ * XXH128_isEqual():
+ * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+
+/*!
+ * @brief Compares two @ref XXH128_hash_t
+ * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
+ *
+ * @return: >0 if *h128_1 > *h128_2
+ *          =0 if *h128_1 == *h128_2
+ *          <0 if *h128_1 < *h128_2
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
+
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
+
+
+#endif  /* !XXH_NO_XXH3 */
+#endif  /* XXH_NO_LONG_LONG */
+
+/*!
+ * @}
+ */
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
+#define XXHASH_H_STATIC_13879238742
+/* ****************************************************************************
+ * This section contains declarations which are not guaranteed to remain stable.
+ * They may change in future versions, becoming incompatible with a different
+ * version of the library.
+ * These declarations should only be used with static linking.
+ * Never use them in association with dynamic linking!
+ ***************************************************************************** */
+
+/*
+ * These definitions are only present to allow static allocation
+ * of XXH states, on stack or in a struct, for example.
+ * Never **ever** access their members directly.
+ */
+
+/*!
+ * @internal
+ * @brief Structure for XXH32 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH32_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH64_state_s, XXH3_state_s
+ */
+struct XXH32_state_s {
+   XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
+   XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
+   XXH32_hash_t v[4];         /*!< Accumulator lanes */
+   XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem32 */
+   XXH32_hash_t reserved;     /*!< Reserved field. Do not read nor write to it. */
+};   /* typedef'd to XXH32_state_t */
+
+
+#ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */
+
+/*!
+ * @internal
+ * @brief Structure for XXH64 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH64_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH32_state_s, XXH3_state_s
+ */
+struct XXH64_state_s {
+   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
+   XXH64_hash_t v[4];         /*!< Accumulator lanes */
+   XXH64_hash_t mem64[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem64 */
+   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyway */
+   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it. */
+};   /* typedef'd to XXH64_state_t */
+
+#ifndef XXH_NO_XXH3
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
+/* In C++ alignas() is a keyword */
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
+    && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
+    && defined(__GNUC__)
+#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+/*!
+ * @brief The size of the internal XXH3 buffer.
+ *
+ * This is the optimal update size for incremental hashing.
+ *
+ * @see XXH3_64bits_update(), XXH3_128bits_update().
+ */
+#define XXH3_INTERNALBUFFER_SIZE 256
+
+/*!
+ * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
+ *
+ * This is the size used in @ref XXH3_kSecret and the seeded functions.
+ *
+ * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+ */
+#define XXH3_SECRET_DEFAULT_SIZE 192
+
+/*!
+ * @internal
+ * @brief Structure for XXH3 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
+ * Otherwise it is an opaque type.
+ * Never use this definition in combination with dynamic library.
+ * This allows fields to safely be changed in the future.
+ *
+ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
+ * Do not allocate this with `malloc()` or `new`,
+ * it will not be sufficiently aligned.
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Never access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
+struct XXH3_state_s {
+   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+       /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */
+   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+       /*!< Used to store a custom secret generated from a seed. */
+   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+       /*!< The internal buffer. @see XXH32_state_s::mem32 */
+   XXH32_hash_t bufferedSize;
+       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+   XXH32_hash_t useSeed;
+       /*!< Reserved field. Needed for padding on 64-bit. */
+   size_t nbStripesSoFar;
+       /*!< Number of stripes processed. */
+   XXH64_hash_t totalLen;
+       /*!< Total length hashed. 64-bit even on 32-bit targets. */
+   size_t nbStripesPerBlock;
+       /*!< Number of stripes per block. */
+   size_t secretLimit;
+       /*!< Size of @ref customSecret or @ref extSecret */
+   XXH64_hash_t seed;
+       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
+   XXH64_hash_t reserved64;
+       /*!< Reserved field. */
+   const unsigned char* extSecret;
+       /*!< Reference to an external secret for the _withSecret variants, NULL
+        *   for other variants. */
+   /* note: there may be some padding at the end due to alignment on 64 bytes */
+}; /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*!
+ * @brief Initializes a stack-allocated `XXH3_state_s`.
+ *
+ * When the @ref XXH3_state_t structure is merely emplaced on stack,
+ * it should be initialized with XXH3_INITSTATE() or a memset()
+ * in case its first reset uses XXH3_NNbits_reset_withSeed().
+ * This init can be omitted if the first reset uses default or _withSecret mode.
+ * This operation isn't necessary when the state is created with XXH3_createState().
+ * Note that this doesn't prepare the state for a streaming operation,
+ * it's still necessary to use XXH3_NNbits_reset*() afterwards.
+ */
+#define XXH3_INITSTATE(XXH3_state_ptr)   { (XXH3_state_ptr)->seed = 0; }
+
+
+/*!
+ * simple alias to pre-selected XXH3_128bits variant
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* ===   Experimental API   === */
+/* Symbols defined below must be considered tied to a specific library version. */
+
+/*!
+ * XXH3_generateSecret():
+ *
+ * Derive a high-entropy secret from any user-defined content, named customSeed.
+ * The generated secret can be used in combination with `*_withSecret()` functions.
+ * The `_withSecret()` variants are useful to provide a higher level of protection
+ * than 64-bit seed, as it becomes much more difficult for an external actor to
+ * guess how to impact the calculation logic.
+ *
+ * The function accepts as input a custom seed of any length and any content,
+ * and derives from it a high-entropy secret of length @p secretSize into an
+ * already allocated buffer @p secretBuffer.
+ *
+ * The generated secret can then be used with any `*_withSecret()` variant.
+ * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
+ * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
+ * are part of this list. They all accept a `secret` parameter
+ * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
+ * _and_ feature very high entropy (consist of random-looking bytes).
+ * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
+ * be employed to ensure proper quality.
+ *
+ * @p customSeed can be anything. It can have any size, even small ones,
+ * and its content can be anything, even "poor entropy" sources such as a bunch
+ * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
+ *
+ * @pre
+ *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
+ *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+ *
+ * Example code:
+ * @code{.c}
+ *    #include <stdio.h>
+ *    #include <stdlib.h>
+ *    #include <string.h>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Hashes argv[2] using the entropy from argv[1].
+ *    int main(int argc, char* argv[])
+ *    {
+ *        char secret[XXH3_SECRET_SIZE_MIN];
+ *        if (argc != 3) { return 1; }
+ *        XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
+ *        XXH64_hash_t h = XXH3_64bits_withSecret(
+ *             argv[2], strlen(argv[2]),
+ *             secret, sizeof(secret)
+ *        );
+ *        printf("%016llx\n", (unsigned long long) h);
+ *    }
+ * @endcode
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
+
+/*!
+ * @brief Generate the same secret as the _withSeed() variants.
+ *
+ * The generated secret can be used in combination with
+ * `*_withSecret()` and `_withSecretandSeed()` variants.
+ *
+ * Example C++ `std::string` hash class:
+ * @code{.cpp}
+ *    #include <string>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Slow, seeds each time
+ *    class HashSlow {
+ *        XXH64_hash_t seed;
+ *    public:
+ *        HashSlow(XXH64_hash_t s) : seed{s} {}
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
+ *        }
+ *    };
+ *    // Fast, caches the seeded secret for future uses.
+ *    class HashFast {
+ *        unsigned char secret[XXH3_SECRET_SIZE_MIN];
+ *    public:
+ *        HashFast(XXH64_hash_t s) {
+ *            XXH3_generateSecret_fromSeed(secret, s);
+ *        }
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{
+ *                XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
+ *            };
+ *        }
+ *    };
+ * @endcode
+ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes
+ * @param seed The seed to seed the state.
+ */
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
+
+/*!
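+ * @brief Hybrid variants of XXH3, combining a custom secret with a runtime seed.
+ *
+ * An illustrative call (a sketch; `buf`, `bufSize`, `secret` and `seed` are
+ * placeholder names, with `secret` at least XXH3_SECRET_SIZE_MIN bytes):
+ * @code{.c}
+ *     XXH64_hash_t h = XXH3_64bits_withSecretandSeed(buf, bufSize,
+ *                                                    secret, sizeof(secret), seed);
+ * @endcode
+ *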
+ * These variants generate hash values using either
+ * @p seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
+ * or @p secret for "large" keys (>= XXH3_MIDSIZE_MAX).
+ *
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
+ * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
+ * which requires more instructions than _withSeed() variants.
+ * Therefore, the _withSecretandSeed() variant combines the best of both worlds.
+ *
+ * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
+ * this variant produces *exactly* the same results as `_withSeed()` variant,
+ * hence offering only a pure speed benefit on "large" input,
+ * by skipping the need to regenerate the secret for every large input.
+ *
+ * Another usage scenario is to hash the secret to a 64-bit hash value,
+ * for example with XXH3_64bits(), which then becomes the seed,
+ * and then employ both the seed and the secret in _withSecretandSeed().
+ * On top of speed, an added benefit is that each bit in the secret
+ * has a 50% chance to swap each bit in the output, via its impact on the seed.
+ *
+ * This is not guaranteed when using the secret directly in "small data" scenarios,
+ * because only portions of the secret are employed for small data.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
+                              XXH_NOESCAPE const void* secret, size_t secretSize,
+                              XXH64_hash_t seed);
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
+                               XXH_NOESCAPE const void* secret, size_t secretSize,
+                               XXH64_hash_t seed64);
+#ifndef XXH_NO_STREAM
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                    XXH_NOESCAPE const void* secret, size_t secretSize,
+                                    XXH64_hash_t seed64);
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                     XXH_NOESCAPE const void* secret, size_t secretSize,
+                                     XXH64_hash_t seed64);
+#endif /* !XXH_NO_STREAM */
+
+#endif  /* !XXH_NO_XXH3 */
+#endif  /* XXH_NO_LONG_LONG */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  define XXH_IMPLEMENTATION
+#endif
+
+#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be hosted inside xxhash.c.
+ *
+ * However, inlining requires implementation to be visible to the compiler,
+ * hence be included alongside the header.
+ * Previously, implementation was hosted inside xxhash.c,
+ * which was then #included when inlining was activated.
+ * This construction created issues with a few build and install systems,
+ * as it required xxhash.c to be stored in the /include directory.
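+ *
+ * For reference, the inlined setup follows the pattern below (a sketch using
+ * the XXH_INLINE_ALL mechanism handled in this header):
+ * @code{.c}
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ * @endcode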
+ *
+ * xxHash implementation is now directly integrated within xxhash.h.
+ * As a consequence, xxhash.c is no longer needed in /include.
+ *
+ * xxhash.c is still available and is still useful.
+ * In a "normal" setup, when xxhash is not inlined,
+ * xxhash.h only exposes the prototypes and public symbols,
+ * while xxhash.c can be built into an object file xxhash.o
+ * which can then be linked into the final binary.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+#  define XXH_IMPLEM_13a8737387
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+
+/*!
+ * @defgroup tuning Tuning parameters
+ * @{
+ *
+ * Various macros to control xxHash's behavior.
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Define this to disable 64-bit code.
+ *
+ * Useful if only using the @ref XXH32_family and you have a strict C90 compiler.
+ */
+#  define XXH_NO_LONG_LONG
+#  undef XXH_NO_LONG_LONG /* don't actually */
+/*!
+ * @brief Controls how unaligned memory is accessed.
+ *
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The switch below allows selection of a different access method
+ * in the search for improved performance.
+ *
+ * @par Possible options:
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
+ *   @par
+ *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
+ *     eliminate the function call and treat it as an unaligned access.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
+ *   @par
+ *     Depends on compiler extensions and is therefore not portable.
+ *     This method is safe _if_ your compiler supports it,
+ *     and *generally* as fast or faster than `memcpy`.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
+ *   @par
+ *     Casts directly and dereferences. This method doesn't depend on the
+ *     compiler, but it violates the C standard as it directly dereferences an
+ *     unaligned pointer. It can generate buggy code on targets which do not
+ *     support unaligned memory accesses, but in some circumstances, it's the
+ *     only known way to get the most performance.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
+ *   @par
+ *     Also portable. This can generate the best code on old compilers which don't
+ *     inline small `memcpy()` calls, and it might also be faster on big-endian
+ *     systems which lack a native byteswap instruction. However, some compilers
+ *     will emit literal byteshifts even if the target supports unaligned access.
+ *  .
+ *
+ * @warning
+ *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
+ *   care, as what works on one compiler/platform/optimization level may cause
+ *   another to read garbage data or even crash.
+ *
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
+ *
+ * Prefer these methods in priority order (0 > 3 > 1 > 2)
+ */
+#  define XXH_FORCE_MEMORY_ACCESS 0
+
+/*!
+ * @def XXH_SIZE_OPT
+ * @brief Controls how much xxHash optimizes for size.
+ *
+ * xxHash, when compiled, tends to result in a rather large binary size. This
+ * is mostly due to heavy use of forced inlining and constant folding of the
+ * @ref XXH3_family to increase performance.
+ *
+ * However, some developers prefer size over speed. This option can
+ * significantly reduce the size of the generated code. When using the `-Os`
+ * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
+ * otherwise it is defined to 0.
+ *
+ * Most of these size optimizations can be controlled manually.
+ *
+ * This is a number from 0-2.
+ *  - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
+ *    comes first.
+ *  - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
+ *    conservative and disables hacks that increase code size. It implies the
+ *    options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
+ *    and @ref XXH3_NEON_LANES == 8 if they are not already defined.
+ *  - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
+ *    Performance may cry. For example, the single shot functions just use the
+ *    streaming API.
+ */
+#  define XXH_SIZE_OPT 0
+
+/*!
+ * @def XXH_FORCE_ALIGN_CHECK
+ * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
+ * and XXH64() only).
+ *
+ * This is an important performance trick for architectures without decent
+ * unaligned memory access performance.
+ *
+ * It checks for input alignment, and when conditions are met, uses a "fast
+ * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
+ * faster_ read speed.
+ *
+ * The check costs one initial branch per hash, which is generally negligible,
+ * but not zero.
+ *
+ * Moreover, it's not useful to generate an additional code path if memory
+ * access uses the same instruction for both aligned and unaligned
+ * addresses (e.g. x86 and aarch64).
+ *
+ * In these cases, the alignment check can be removed by setting this macro to 0.
+ * Then the code will always use unaligned memory access.
+ * The align check is automatically disabled on x86, x64, ARM64, and some ARM chips,
+ * which are platforms known to offer good unaligned memory access performance.
+ *
+ * It is also disabled by default when @ref XXH_SIZE_OPT >= 1.
+ *
+ * This option does not affect XXH3 (only XXH32 and XXH64).
+ */
+#  define XXH_FORCE_ALIGN_CHECK 0
+
+/*!
+ * @def XXH_NO_INLINE_HINTS
+ * @brief When non-zero, sets all functions to `static`.
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control on whether to inline or not.
+ *
+ * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if
+ * @ref XXH_SIZE_OPT >= 1, this will automatically be defined.
+ */
+#  define XXH_NO_INLINE_HINTS 0
+
+/*!
+ * @def XXH32_ENDJMP
+ * @brief Whether to use a jump for `XXH32_finalize`.
+ *
+ * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
+ * This is generally preferable for performance,
+ * but depending on the exact architecture, a jmp may be faster.
+ *
+ * This setting can only make a difference for very small inputs.
+ */
+#  define XXH32_ENDJMP 0
+
+/*!
+ * @internal
+ * @brief Redefines old internal names.
+ *
+ * For compatibility with code that uses xxHash's internals before the names
+ * were changed to improve namespacing. There is no other reason to use this.
+ */
+#  define XXH_OLD_NAMES
+#  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
+
+/*!
+ * @def XXH_NO_STREAM
+ * @brief Disables the streaming API.
+ *
+ * When xxHash is not inlined and the streaming functions are not used, disabling
+ * the streaming functions can improve code size significantly, especially with
+ * the @ref XXH3_family which tends to make constant folded copies of itself.
+ */
+#  define XXH_NO_STREAM
+#  undef XXH_NO_STREAM /* don't actually */
+#endif /* XXH_DOXYGEN */
+/*!
+ * @}
+ */
+
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+   /* prefer __packed__ structures (method 1) for GCC
+    * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy
+    * which for some reason does unaligned loads. */
+#  if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+#ifndef XXH_SIZE_OPT
+   /* default to 1 for -Os or -Oz */
+#  if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
+#    define XXH_SIZE_OPT 1
+#  else
+#    define XXH_SIZE_OPT 0
+#  endif
+#endif
+
+#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+   /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */
+#  if XXH_SIZE_OPT >= 1 || \
+      defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
+   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64)    || defined(_M_ARM) /* visual */ \
+   || defined(__loongarch64)
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+#ifndef XXH_NO_INLINE_HINTS
+#  if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__)  /* -O0, -fno-inline */
+#    define XXH_NO_INLINE_HINTS 1
+#  else
+#    define XXH_NO_INLINE_HINTS 0
+#  endif
+#endif
+
+#ifndef XXH32_ENDJMP
+/* generally preferable for performance */
+#  define XXH32_ENDJMP 0
+#endif
+
+/*!
+ * @defgroup impl Implementation
+ * @{
+ */
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+#if defined(XXH_NO_STREAM)
+/* nothing */
+#elif defined(XXH_NO_STDLIB)
+
+/* When requesting to disable any mention of stdlib,
+ * the library loses the ability to invoke malloc() / free().
+ * In practice, it means that functions like `XXH*_createState()`
+ * will always fail, and return NULL.
+ * This flag is useful in situations where
+ * xxhash.h is integrated into some kernel, embedded or limited environment
+ * without access to dynamic allocation.
+ */
+
+static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }
+static void XXH_free(void* p) { (void)p; }
+
+#else
+
+/*
+ * Modify the local functions below should you wish to use
+ * different memory routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than malloc().
+ */
+static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than free().
+ */
+static void XXH_free(void* p) { free(p); }
+
+#endif /* XXH_NO_STDLIB */
+
+#include <string.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than memcpy().
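+ *
+ * For instance, a freestanding build could route it through a custom routine
+ * (a sketch; `my_memcpy` is a hypothetical symbol, not part of this library):
+ * @code{.c}
+ *     static void* XXH_memcpy(void* dest, const void* src, size_t size)
+ *     {
+ *         return my_memcpy(dest, src, size); // hypothetical replacement routine
+ *     }
+ * @endcode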
+ */
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+    return memcpy(dest,src,size);
+}
+
+#include <limits.h>   /* ULLONG_MAX */
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS  /* disable inlining hints */
+#  if defined(__GNUC__) || defined(__clang__)
+#    define XXH_FORCE_INLINE static __attribute__((unused))
+#  else
+#    define XXH_FORCE_INLINE static
+#  endif
+#  define XXH_NO_INLINE static
+/* enable inlining hints */
+#elif defined(__GNUC__) || defined(__clang__)
+#  define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
+#  define XXH_NO_INLINE static __attribute__((noinline))
+#elif defined(_MSC_VER)  /* Visual Studio */
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
+#elif defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C99 */
+#  define XXH_FORCE_INLINE static inline
+#  define XXH_NO_INLINE static
+#else
+#  define XXH_FORCE_INLINE static
+#  define XXH_NO_INLINE static
+#endif
+
+
+
+/* *************************************
+*  Debug
+***************************************/
+/*!
+ * @ingroup tuning
+ * @def XXH_DEBUGLEVEL
+ * @brief Sets the debugging level.
+ *
+ * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
+ * compiler's command line options. The value must be a number.
+ */
+#ifndef XXH_DEBUGLEVEL
+#  ifdef DEBUGLEVEL /* backwards compat */
+#    define XXH_DEBUGLEVEL DEBUGLEVEL
+#  else
+#    define XXH_DEBUGLEVEL 0
+#  endif
+#endif
+
+#if (XXH_DEBUGLEVEL>=1)
+#  include <assert.h>   /* note: can still be disabled with NDEBUG */
+#  define XXH_ASSERT(c)   assert(c)
+#else
+#  define XXH_ASSERT(c)   XXH_ASSUME(c)
+#endif
+
+/* note: use after variable declarations */
+#ifndef XXH_STATIC_ASSERT
+#  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)    /* C11 */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
+#  elif defined(__cplusplus) && (__cplusplus >= 201103L)            /* C++11 */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
+#  else
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
+#  endif
+#  define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
+#endif
+
+/*!
+ * @internal
+ * @def XXH_COMPILER_GUARD(var)
+ * @brief Used to prevent unwanted optimizations for @p var.
+ *
+ * It uses an empty GCC inline assembly statement with a register constraint
+ * which forces @p var into a general purpose register (e.g. eax, ebx, ecx
+ * on x86) and marks it as modified.
+ *
+ * This is used in a few places to avoid unwanted autovectorization (e.g.
+ * XXH32_round()). All vectorization we want is explicit via intrinsics,
+ * and _usually_ isn't wanted elsewhere.
+ *
+ * We also use it to prevent unwanted constant folding for AArch64 in
+ * XXH3_initCustomSecret_scalar().
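+ *
+ * Conceptually (a sketch, with an illustrative variable name):
+ * @code{.c}
+ *     xxh_u32 acc = seed;
+ *     XXH_COMPILER_GUARD(acc); // compiler must now assume acc may have changed
+ * @endcode
+ * After the guard, the optimizer can no longer constant fold or
+ * autovectorize across this point.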
+ */
+#if defined(__GNUC__) || defined(__clang__)
+#  define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
+#else
+#  define XXH_COMPILER_GUARD(var) ((void)0)
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  define XXH_COMPILER_GUARD_W(var) __asm__ __volatile__("" : "+w" (var))
+#else
+#  define XXH_COMPILER_GUARD_W(var) ((void)0)
+#endif
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+  typedef uint8_t xxh_u8;
+#else
+  typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+#ifdef XXH_OLD_NAMES
+#  define BYTE xxh_u8
+#  define U8   xxh_u8
+#  define U32  xxh_u32
+#endif
+
+/* ***   Memory access   *** */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
+ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
+ * always @ref XXH_alignment::XXH_unaligned.
+ *
+ * @param ptr The pointer to read from.
+ * @param align Whether @p ptr is aligned.
+ * @pre
+ *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
+ *   aligned.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access. Only works on CPU which support unaligned memory
+ * access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+#endif
+static xxh_u32 XXH_read32(const void* ptr)
+{
+    typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32;
+    return *((const xxh_unalign32*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* ***   Endianness   *** */
+
+/*!
+ * @ingroup tuning
+ * @def XXH_CPU_LITTLE_ENDIAN
+ * @brief Whether the target is little endian.
+ *
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line.
+ *
+ * If it is not defined,
+ * a runtime check (which is usually constant folded) is used instead.
+ *
+ * @note
+ *   This is not necessarily defined to an integer constant.
+ *
+ * @see XXH_isLittleEndian() for the runtime check.
+ */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+/*
+ * Try to detect endianness automatically, to avoid the nonstandard behavior
+ * in `XXH_isLittleEndian()`
+ */
+#  if defined(_WIN32) /* Windows is always little endian */ \
+     || defined(__LITTLE_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 0
+#  else
+/*!
+ * @internal
+ * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
+ *
+ * Most compilers will constant fold this.
+ */
+static int XXH_isLittleEndian(void)
+{
+    /*
+     * Portable and well-defined behavior.
+     * Don't use static: it is detrimental to performance.
+     */
+    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
+    return one.c[0];
+}
+#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
+#  endif
+#endif
+
+
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#ifdef __has_builtin
+#  define XXH_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#  define XXH_HAS_BUILTIN(x) 0
+#endif
+
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L)
+/* C23 and future versions have standard "unreachable()" */
+#  include <stddef.h>
+#  define XXH_UNREACHABLE() unreachable()
+
+#elif defined(__cplusplus) && (__cplusplus > 202002L)
+/* C++23 and future versions have std::unreachable() */
+#  include <utility> /* std::unreachable() */
+#  define XXH_UNREACHABLE() std::unreachable()
+
+#elif XXH_HAS_BUILTIN(__builtin_unreachable)
+#  define XXH_UNREACHABLE() __builtin_unreachable()
+
+#elif defined(_MSC_VER)
+#  define XXH_UNREACHABLE() __assume(0)
+
+#else
+#  define XXH_UNREACHABLE()
+#endif
+
+#define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
+
+/*!
+ * @internal
+ * @def XXH_rotl32(x,r)
+ * @brief 32-bit rotate left.
+ *
+ * @param x The 32-bit integer to be rotated.
+ * @param r The number of bits to rotate.
+ * @pre
+ *   @p r > 0 && @p r < 32
+ * @note
+ *   @p x and @p r may be evaluated multiple times.
+ * @return The rotated result.
+ */
+#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
+                               && XXH_HAS_BUILTIN(__builtin_rotateleft64)
+#  define XXH_rotl32 __builtin_rotateleft32
+#  define XXH_rotl64 __builtin_rotateleft64
+/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
+#elif defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+#endif
+
+/*!
+ * @internal + * @fn xxh_u32 XXH_swap32(xxh_u32 x) + * @brief A 32-bit byteswap. + * + * @param x The 32-bit integer to byteswap. + * @return @p x, byteswapped. + */ +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ + +/*! + * @internal + * @brief Enum to indicate whether a pointer is aligned. + */ +typedef enum { + XXH_aligned, /*!< Aligned */ + XXH_unaligned /*!< Possibly unaligned */ +} XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +/*! @ingroup public */ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +/*! + * @} + * @defgroup XXH32_impl XXH32 implementation + * @ingroup impl + * + * Details on the XXH32 implementation. + * @{ + */ + /* #define instead of static const, to be used as initializers */ +#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ +#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ +#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ +#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ +#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ + +#ifdef XXH_OLD_NAMES +# define PRIME32_1 XXH_PRIME32_1 +# define PRIME32_2 XXH_PRIME32_2 +# define PRIME32_3 XXH_PRIME32_3 +# define PRIME32_4 XXH_PRIME32_4 +# define PRIME32_5 XXH_PRIME32_5 +#endif + +/*! + * @internal + * @brief Normal stripe processing routine. + * + * This shuffles the bits so that any bit from @p input impacts several bits in + * @p acc. + * + * @param acc The accumulator lane. + * @param input The stripe of input to mix. + * @return The mixed accumulator lane. 
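+ *
+ * In short, each lane update computes
+ * acc = XXH_rotl32(acc + input * XXH_PRIME32_2, 13) * XXH_PRIME32_1,
+ * matching the function body below.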
+ */
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * XXH_PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= XXH_PRIME32_1;
+#if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * A compiler fence is the only thing that prevents GCC and Clang from
+     * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
+     * reason) without globally disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even on Sandy/Ivy Bridge,
+     *   where pmulld was fastest, it is still not worth it to go into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movdqa tmp, v    // not required with VEX encoding
+     *      pslld  tmp, 13   // tmp <<= 13
+     *      psrld  v, 19     // v >>= 19
+     *      por    v, tmp    // v |= tmp
+     *   compared to one for scalar:
+     *      roll  v, 13      // reliably fast across the board
+     *      shldl v, v, 13   // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   the SIMD actually serializes this operation: While v1 is rotating, v2
+     *   can load data, while v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
+     * and it is pointless writing a NEON implementation that is basically the
+     * same speed as scalar for XXH32.
+     */
+    XXH_COMPILER_GUARD(acc);
+#endif
+    return acc;
+}
+
+/*!
+ * @internal
+ * @brief Mixes all bits to finalize the hash.
+ *
+ * The final mix ensures that all input bits have a chance to impact any bit in
+ * the output digest, resulting in an unbiased distribution.
+ *
+ * @param hash The hash to avalanche.
+ * @return The avalanched hash.
+ */
+static xxh_u32 XXH32_avalanche(xxh_u32 hash)
+{
+    hash ^= hash >> 15;
+    hash *= XXH_PRIME32_2;
+    hash ^= hash >> 13;
+    hash *= XXH_PRIME32_3;
+    hash ^= hash >> 16;
+    return hash;
+}
+
+#define XXH_get32bits(p) XXH_readLE32_align(p, align)
+
+/*!
+ * @internal
+ * @brief Processes the last 0-15 bytes of @p ptr.
+ *
+ * There may be up to 15 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 16.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash.
+ * @see XXH64_finalize().
+ */ +static XXH_PUREF xxh_u32 +XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define XXH_PROCESS1 do { \ + hash += (*ptr++) * XXH_PRIME32_5; \ + hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ +} while (0) + +#define XXH_PROCESS4 do { \ + hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ +} while (0) + + if (ptr==NULL) XXH_ASSERT(len == 0); + + /* Compact rerolled version; generally faster */ + if (!XXH32_ENDJMP) { + len &= 15; + while (len >= 4) { + XXH_PROCESS4; + len -= 4; + } + while (len > 0) { + XXH_PROCESS1; + --len; + } + return XXH32_avalanche(hash); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 8: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 4: XXH_PROCESS4; + return XXH32_avalanche(hash); + + case 13: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 9: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 5: XXH_PROCESS4; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 14: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 10: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 6: XXH_PROCESS4; + XXH_PROCESS1; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 15: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 11: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 7: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 3: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 2: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 1: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 0: return XXH32_avalanche(hash); + } + XXH_ASSERT(0); + return hash; /* reaching this point is deemed impossible */ + } +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1 XXH_PROCESS1 +# define PROCESS4 XXH_PROCESS4 +#else +# undef XXH_PROCESS1 +# undef XXH_PROCESS4 +#endif + +/*! + * @internal + * @brief The implementation for @ref XXH32(). + * + * @param input , len , seed Directly passed from @ref XXH32(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + xxh_u32 h32; + + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=16) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + xxh_u32 v2 = seed + XXH_PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - XXH_PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + XXH_PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + +/*! 
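+ * Minimal one-shot usage (a sketch; `buf`/`bufSize` are placeholder names):
+ * @code{.c}
+ *     XXH32_hash_t h = XXH32(buf, bufSize, 0); // seed of 0
+ * @endcode
+ *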
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + statePtr->v[1] = seed + XXH_PRIME32_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME32_1; + return XXH_OK; +} + + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + + do { + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! 
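+ * A typical streaming sequence, sketched (chunk names are placeholders):
+ * @code{.c}
+ *     XXH32_state_t* st = XXH32_createState();
+ *     XXH32_reset(st, 0);
+ *     XXH32_update(st, chunk1, size1);
+ *     XXH32_update(st, chunk2, size2);
+ *     XXH32_hash_t h = XXH32_digest(st);
+ *     XXH32_freeState(st);
+ * @endcode
+ *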
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v[0], 1) + + XXH_rotl32(state->v[1], 7) + + XXH_rotl32(state->v[2], 12) + + XXH_rotl32(state->v[3], 18); + } else { + h32 = state->v[2] /* == seed */ + XXH_PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! + * @ingroup XXH32_family + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ +/*! + * @} + * @ingroup impl + * @{ + */ +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + +#ifdef XXH_OLD_NAMES +# define U64 xxh_u64 +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + return *(const xxh_u64*) memPtr; +} + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +#endif +static xxh_u64 XXH_read64(const void* ptr) +{ + typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64; + return *((const xxh_unalign64*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    xxh_u64 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif /* XXH_FORCE_MEMORY_ACCESS */
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap64 __builtin_bswap64
+#else
+static xxh_u64 XXH_swap64(xxh_u64 x)
+{
+    return ((x << 56) & 0xff00000000000000ULL) |
+           ((x << 40) & 0x00ff000000000000ULL) |
+           ((x << 24) & 0x0000ff0000000000ULL) |
+           ((x << 8)  & 0x000000ff00000000ULL) |
+           ((x >> 8)  & 0x00000000ff000000ULL) |
+           ((x >> 24) & 0x0000000000ff0000ULL) |
+           ((x >> 40) & 0x000000000000ff00ULL) |
+           ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]
+         | ((xxh_u64)bytePtr[1] << 8)
+         | ((xxh_u64)bytePtr[2] << 16)
+         | ((xxh_u64)bytePtr[3] << 24)
+         | ((xxh_u64)bytePtr[4] << 32)
+         | ((xxh_u64)bytePtr[5] << 40)
+         | ((xxh_u64)bytePtr[6] << 48)
+         | ((xxh_u64)bytePtr[7] << 56);
+}
+
+XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[7]
+         | ((xxh_u64)bytePtr[6] << 8)
+         | ((xxh_u64)bytePtr[5] << 16)
+         | ((xxh_u64)bytePtr[4] << 24)
+         | ((xxh_u64)bytePtr[3] << 32)
+         | ((xxh_u64)bytePtr[2] << 40)
+         | ((xxh_u64)bytePtr[1] << 48)
+         | ((xxh_u64)bytePtr[0] << 56);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+}
+
+static xxh_u64 XXH_readBE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return XXH_readLE64(ptr);
+    else
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
+}
+
+
+/******* xxh64 *******/
+/*!
+ * @}
+ * @defgroup XXH64_impl XXH64 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH64 implementation.
+ * @{
+ */
+/* #define rather than static const, to be used as initializers */
+#define XXH_PRIME64_1  0x9E3779B185EBCA87ULL  /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
+#define XXH_PRIME64_2  0xC2B2AE3D27D4EB4FULL  /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
+#define XXH_PRIME64_3  0x165667B19E3779F9ULL  /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
+#define XXH_PRIME64_4  0x85EBCA77C2B2AE63ULL  /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
+#define XXH_PRIME64_5  0x27D4EB2F165667C5ULL  /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+#ifdef XXH_OLD_NAMES
+#  define PRIME64_1 XXH_PRIME64_1
+#  define PRIME64_2 XXH_PRIME64_2
+#  define PRIME64_3 XXH_PRIME64_3
+#  define PRIME64_4 XXH_PRIME64_4
+#  define PRIME64_5 XXH_PRIME64_5
+#endif
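+
+/*
+ * [Editor's aside, not part of upstream xxHash] Like XXH32 above, XXH64 keeps
+ * four independent 64-bit lanes, each consuming 8 bytes per round, and merges
+ * them at the end. A minimal sketch of the pipeline implemented below:
+ *
+ *     v[i] = XXH64_round(v[i], next 8 input bytes);   // per lane
+ *     h64  = rotl(v1,1) + rotl(v2,7) + rotl(v3,12) + rotl(v4,18);
+ *     h64  = XXH64_mergeRound(h64, v[i]);             // for each lane
+ *     h64  = XXH64_finalize(h64, tail, len, align);   // last 0-31 bytes
+ */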
+/*! @copydoc XXH32_round */
+static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
+{
+    acc += input * XXH_PRIME64_2;
+    acc  = XXH_rotl64(acc, 31);
+    acc *= XXH_PRIME64_1;
+    return acc;
+}
+
+static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
+{
+    val  = XXH64_round(0, val);
+    acc ^= val;
+    acc  = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
+    return acc;
+}
+
+/*! @copydoc XXH32_avalanche */
+static xxh_u64 XXH64_avalanche(xxh_u64 hash)
+{
+    hash ^= hash >> 33;
+    hash *= XXH_PRIME64_2;
+    hash ^= hash >> 29;
+    hash *= XXH_PRIME64_3;
+    hash ^= hash >> 32;
+    return hash;
+}
+
+
+#define XXH_get64bits(p) XXH_readLE64_align(p, align)
+
+/*!
+ * @internal
+ * @brief Processes the last 0-31 bytes of @p ptr.
+ *
+ * There may be up to 31 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 32.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash
+ * @see XXH32_finalize().
+ */
+static XXH_PUREF xxh_u64
+XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+    len &= 31;
+    while (len >= 8) {
+        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
+        ptr += 8;
+        hash ^= k1;
+        hash  = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+        len -= 8;
+    }
+    if (len >= 4) {
+        hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+        ptr += 4;
+        hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+        len -= 4;
+    }
+    while (len > 0) {
+        hash ^= (*ptr++) * XXH_PRIME64_5;
+        hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
+        --len;
+    }
+    return XXH64_avalanche(hash);
+}
+
+#ifdef XXH_OLD_NAMES
+#  define PROCESS1_64 XXH_PROCESS1_64
+#  define PROCESS4_64 XXH_PROCESS4_64
+#  define PROCESS8_64 XXH_PROCESS8_64
+#else
+#  undef XXH_PROCESS1_64
+#  undef XXH_PROCESS4_64
+#  undef XXH_PROCESS8_64
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH64().
+ *
+ * @param input , len , seed Directly passed from @ref XXH64().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u64
+XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
+{
+    xxh_u64 h64;
+    if (input==NULL) XXH_ASSERT(len == 0);
+
+    if (len>=32) {
+        const xxh_u8* const bEnd = input + len;
+        const xxh_u8* const limit = bEnd - 31;
+        xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+        xxh_u64 v2 = seed + XXH_PRIME64_2;
+        xxh_u64 v3 = seed + 0;
+        xxh_u64 v4 = seed - XXH_PRIME64_1;
+
+        do {
+            v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;
+            v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
+            v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
+            v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
+        } while (input<limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7)
+            + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+
+    } else {
+        h64  = seed + XXH_PRIME64_5;
+    }
+
+    h64 += (xxh_u64) len;
+
+    return XXH64_finalize(h64, input, len, align);
+}
+
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_state_t state;
+    XXH64_reset(&state, seed);
+    XXH64_update(&state, (const xxh_u8*)input, len);
+    return XXH64_digest(&state);
+#else
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
+            return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+
+#endif
+}
+
+/******* Hash Streaming *******/
+#ifndef XXH_NO_STREAM
+/*!
@ingroup XXH64_family*/ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + statePtr->v[1] = seed + XXH_PRIME64_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME64_1; + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0)); + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1)); + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2)); + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3)); + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + + do { + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8; + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8; + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8; + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18); + h64 = XXH64_mergeRound(h64, state->v[0]); + h64 = XXH64_mergeRound(h64, state->v[1]); + h64 = XXH64_mergeRound(h64, state->v[2]); + h64 = XXH64_mergeRound(h64, state->v[3]); + } else { + h64 = state->v[2] /*seed*/ + XXH_PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} + +/*! 
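+ * [Editor's aside, illustrative only] Canonical form is a portable,
+ * endian-stable byte serialization, so a round-trip is the identity:
+ *
+ *     XXH64_canonical_t c;
+ *     XXH64_canonicalFromHash(&c, h);      // big-endian bytes in c.digest
+ *     XXH_ASSERT(XXH64_hashFromCanonical(&c) == h);
+ *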
+ * @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
+{
+    return XXH_readBE64(src);
+}
+
+#ifndef XXH_NO_XXH3
+
+/* *********************************************************************
+*  XXH3
+*  New generation hash designed for speed on small keys and vectorization
+************************************************************************ */
+/*!
+ * @}
+ * @defgroup XXH3_impl XXH3 implementation
+ * @ingroup impl
+ * @{
+ */
+
+/* ===   Compiler specifics   === */
+
+#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
+#  define XXH_RESTRICT   /* disable */
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
+#  define XXH_RESTRICT   restrict
+#else
+/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
+#  define XXH_RESTRICT   /* disable */
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3)) \
+  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+  || defined(__clang__)
+#    define XXH_likely(x) __builtin_expect(x, 1)
+#    define XXH_unlikely(x) __builtin_expect(x, 0)
+#else
+#    define XXH_likely(x) (x)
+#    define XXH_unlikely(x) (x)
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  if defined(__ARM_FEATURE_SVE)
+#    include <arm_sve.h>
+#  elif defined(__ARM_NEON__) || defined(__ARM_NEON) \
+   || defined(__aarch64__)  || defined(_M_ARM) \
+   || defined(_M_ARM64)     || defined(_M_ARM64EC)
+#    define inline __inline__  /* circumvent a clang bug */
+#    include <arm_neon.h>
+#    undef inline
+#  elif defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  endif
+#endif
+
+#if defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+/*
+ * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+ * remaining a true 64-bit/128-bit hash function.
+ *
+ * This is done by prioritizing a subset of 64-bit operations that can be
+ * emulated without too many steps on the average 32-bit machine.
+ *
+ * For example, these two lines seem similar, and run equally fast on 64-bit:
+ *
+ *   xxh_u64 x;
+ *   x ^= (x >> 47); // good
+ *   x ^= (x >> 13); // bad
+ *
+ * However, to a 32-bit machine, there is a major difference.
+ *
+ * x ^= (x >> 47) looks like this:
+ *
+ *   x.lo ^= (x.hi >> (47 - 32));
+ *
+ * while x ^= (x >> 13) looks like this:
+ *
+ *   // note: funnel shifts are not usually cheap.
+ *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+ *   x.hi ^= (x.hi >> 13);
+ *
+ * The first one is significantly faster than the second, simply because the
+ * shift is larger than 32. This means:
+ *  - All the bits we need are in the upper 32 bits, so we can ignore the lower
+ *    32 bits in the shift.
+ *  - The shift result will always fit in the lower 32 bits, and therefore,
+ *    we can ignore the upper 32 bits in the xor.
+ *
+ * Thanks to this optimization, XXH3 only requires these features to be efficient:
+ *
+ *  - Usable unaligned access
+ *  - A 32-bit or 64-bit ALU
+ *      - If 32-bit, a decent ADC instruction
+ *  - A 32 or 64-bit multiply with a 64-bit result
+ *  - For the 128-bit variant, a decent byteswap helps short inputs.
+ *
+ * The first two are already required by XXH32, and almost all 32-bit and 64-bit
+ * platforms which can run XXH32 can run XXH3 efficiently.
+ *
+ * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
+ * notable exception.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + * + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ + +#ifdef XXH_DOXYGEN +/*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD or any of the values mentioned in + * @ref XXH_VECTOR_TYPE. + * + * If this is not defined, it uses predefined macros to determine the best + * implementation. + */ +# define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Note that these are actually implemented as macros. + * + * If this is not defined, it is detected automatically. + * @ref XXH_X86DISPATCH overrides this. + */ +enum XXH_VECTOR_TYPE /* fake enum */ { + XXH_SCALAR = 0, /*!< Portable scalar version */ + XXH_SSE2 = 1, /*!< + * SSE2 for Pentium 4, Opteron, all x86_64. + * + * @note SSE2 is also guaranteed on Windows 10, macOS, and + * Android x86. + */ + XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ + XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ + XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */ + XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ + XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ +}; +/*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment required for said vector + * type, so, for example, 32 for AVX2. + * + * Default: Auto detected. 
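+ *
+ * [Editor's aside, illustrative only] The auto-detected values below map one
+ * alignment per vector width: 8 (scalar), 16 (SSE2/NEON/VSX), 32 (AVX2),
+ * 64 (AVX512/SVE). For example:
+ *
+ *     cc -DXXH_VECTOR=2 ...   // force AVX2; implies XXH_ACC_ALIGN == 32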
+ */ +# define XXH_ACC_ALIGN 8 +#endif + +/* Actual definition */ +#ifndef XXH_DOXYGEN +# define XXH_SCALAR 0 +# define XXH_SSE2 1 +# define XXH_AVX2 2 +# define XXH_AVX512 3 +# define XXH_NEON 4 +# define XXH_VSX 5 +# define XXH_SVE 6 +#endif + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__ARM_FEATURE_SVE) +# define XXH_VECTOR XXH_SVE +# elif ( \ + defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ + || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ + ) && ( \ + defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + ) +# define XXH_VECTOR XXH_NEON +# elif defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */ +#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) +# ifdef _MSC_VER +# pragma warning(once : 4606) +# else +# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." +# endif +# undef XXH_VECTOR +# define XXH_VECTOR XXH_SCALAR +#endif + +/* + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. + */ +#ifndef XXH_ACC_ALIGN +# if defined(XXH_X86DISPATCH) +# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ +# elif XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_SVE /* sve */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ + || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#elif XXH_VECTOR == XXH_SVE +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#else +# define XXH_SEC_ALIGN 8 +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. 
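+ *
+ * [Editor's aside, illustrative only; the file name is a placeholder] In
+ * build-system terms, the recommendation above amounts to something like:
+ *
+ *     gcc -O2 -mavx2 -mno-avx256-split-unaligned-load -c xxhash.c
+ *
+ * or building this translation unit with Clang instead.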
+ */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. .--------' | + * | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. + * + * NEON was also affected by this. + * + * aarch64 cannot access the high bits of a Q-form register, and writes to a + * D-form register zero the high bits, similar to how writes to W-form scalar + * registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent + * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one + * operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be this + * mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } + * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } + * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single VZIP.32. + */ + +/*! 
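+ * [Editor's aside, illustrative only] Concretely, for one uint64x2_t input
+ * with lanes {0x0000000100000002, 0x0000000300000004}, the split yields
+ * outLo = {0x00000002, 0x00000004} and outHi = {0x00000001, 0x00000003}:
+ * the low and high 32-bit halves of each 64-bit lane.
+ *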
+ * Function-like macro: + * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) + * { + * outLo = (uint32x2_t)(in & 0xFFFFFFFF); + * outHi = (uint32x2_t)(in >> 32); + * in = UNDEFINED; + * } + */ +# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ + && (defined(__GNUC__) || defined(__clang__)) \ + && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM)) +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ + /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ + /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ + __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ + (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + } while (0) +# else +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + (outLo) = vmovn_u64 (in); \ + (outHi) = vshrn_n_u64 ((in), 32); \ + } while (0) +# endif + +/*! + * @internal + * @brief `vld1q_u64` but faster and alignment-safe. + * + * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only + * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). + * + * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it + * prohibits load-store optimizations. Therefore, a direct dereference is used. + * + * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe + * unaligned load. + */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ +{ + return *(uint64x2_t const*)ptr; +} +#else +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) +{ + return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); +} +#endif +/*! + * @ingroup tuning + * @brief Controls the NEON to scalar ratio for XXH3 + * + * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and + * 2 lanes on scalar by default (except on Apple platforms, as Apple CPUs benefit + * from only using NEON). + * + * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the + * emulated 64-bit arithmetic is too slow. + * + * Modern ARM CPUs are _very_ sensitive to how their pipelines are used. + * + * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't + * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions, + * you are only using 2/3 of the CPU bandwidth. + * + * This is even more noticeable on the more advanced cores like the A76 which + * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once. + * + * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the + * remaining lanes will use scalar instructions. This improves the bandwidth + * and also gives the integer pipelines something to do besides twiddling loop + * counters and pointers. + * + * This change benefits CPUs with large micro-op buffers without negatively affecting + * most other CPUs: + * + * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. 
|
+ * |:----------------------|:--------------------|----------:|-----------:|------:|
+ * | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
+ * | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
+ * | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
+ * | Apple M1              | 4 NEON/8 micro-ops  | 37.3 GB/s |  36.1 GB/s |  ~-3% |
+ *
+ * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
+ *
+ * @see XXH3_accumulate_512_neon()
+ */
+# ifndef XXH3_NEON_LANES
+#  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
+   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
+#   define XXH3_NEON_LANES 6
+#  else
+#   define XXH3_NEON_LANES XXH_ACC_NB
+#  endif
+# endif
+#endif  /* XXH_VECTOR == XXH_NEON */
+
+/*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+#if XXH_VECTOR == XXH_VSX
+/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
+ * and `pixel`. This is a problem for obvious reasons.
+ *
+ * These keywords are unnecessary; the spec literally says they are
+ * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
+ * after including the header.
+ *
+ * We use pragma push_macro/pop_macro to keep the namespace clean. */
+#  pragma push_macro("bool")
+#  pragma push_macro("vector")
+#  pragma push_macro("pixel")
+/* silence potential macro redefined warnings */
+#  undef bool
+#  undef vector
+#  undef pixel
+
+#  if defined(__s390x__)
+#    include <s390intrin.h>
+#  else
+#    include <altivec.h>
+#  endif
+
+/* Restore the original macro values, if applicable. */
+#  pragma pop_macro("pixel")
+#  pragma pop_macro("vector")
+#  pragma pop_macro("bool")
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char xxh_u8x16;
+typedef __vector unsigned xxh_u32x4;
+
+# ifndef XXH_VSX_BE
+#  if defined(__BIG_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_VSX_BE 1
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#    warning "-maltivec=be is not recommended. Please use native endianness."
+#    define XXH_VSX_BE 1
+#  else
+#    define XXH_VSX_BE 0
+#  endif
+# endif /* !defined(XXH_VSX_BE) */
+
+# if XXH_VSX_BE
+#  if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+#    define XXH_vec_revb vec_revb
+#  else
+/*!
+ * A polyfill for POWER9's vec_revb().
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+{
+    xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                                  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+#  endif
+# endif /* XXH_VSX_BE */
+
+/*!
+ * Performs an unaligned vector load and byte swaps it on big endian.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+{
+    xxh_u64x2 ret;
+    XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
+# if XXH_VSX_BE
+    ret = XXH_vec_revb(ret);
+# endif
+    return ret;
+}
+
+/*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a while,
+ * and they are endian dependent. Also, their meanings swap depending on the
+ * version.
+ * */
+# if defined(__s390x__)
+  /* s390x is always big endian, no issue on this platform */
+#  define XXH_vec_mulo vec_mulo
+#  define XXH_vec_mule vec_mule
+# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
+/* Clang has a better way to control this: we can just use the builtin, which doesn't swap. */
+ /* The IBM XL Compiler (which defines __clang__) only implements the vec_* operations */
+#  define XXH_vec_mulo __builtin_altivec_vmulouw
+#  define XXH_vec_mule __builtin_altivec_vmuleuw
+# else
+/* gcc needs inline assembly */
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+# endif /* XXH_vec_mulo, XXH_vec_mule */
+#endif /* XXH_VECTOR == XXH_VSX */
+
+#if XXH_VECTOR == XXH_SVE
+#define ACCRND(acc, offset) \
+do { \
+    svuint64_t input_vec = svld1_u64(mask, xinput + offset);         \
+    svuint64_t secret_vec = svld1_u64(mask, xsecret + offset);       \
+    svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec);     \
+    svuint64_t swapped = svtbl_u64(input_vec, kSwap);                \
+    svuint64_t mixed_lo = svextw_u64_x(mask, mixed);                 \
+    svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32);            \
+    svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
+    acc = svadd_u64_x(mask, acc, mul);                               \
+} while (0)
+#endif /* XXH_VECTOR == XXH_SVE */
+
+
+/* prefetch
+ * can be disabled by declaring the XXH_NO_PREFETCH build macro */
+#if defined(XXH_NO_PREFETCH)
+#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+#else
+#  if XXH_SIZE_OPT >= 1
+#    define XXH_PREFETCH(ptr) (void)(ptr)
+#  elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#  else
+#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
+#  endif
+#endif  /* XXH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+#  error "default keyset is not large enough"
+#endif
+
+/*! Pseudorandom secret taken directly from FARSH.
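+ *
+ * [Editor's aside] The default secret is 192 bytes (XXH_SECRET_DEFAULT_SIZE),
+ * and the #error above guards the invariant that any secret is at least
+ * XXH3_SECRET_SIZE_MIN bytes. The short-input variants below sample fixed
+ * offsets of this array (see the bitflip constants in XXH3_len_0to16_64b).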
*/ +XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + + +#ifdef XXH_OLD_NAMES +# define kSecret XXH3_kSecret +#endif + +#ifdef XXH_DOXYGEN +/*! + * @brief Calculates a 32-bit to 64-bit long multiply. + * + * Implemented as a macro. + * + * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * @param x, y Numbers to be multiplied + * @return 64-bit product of the low 32 bits of @p x and @p y. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64(xxh_u64 x, xxh_u64 y) +{ + return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); +} +#elif defined(_MSC_VER) && defined(_M_IX86) +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/*! + * @brief Calculates a 64->128-bit long multiply. + * + * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar + * version. + * + * @param lhs , rhs The 64-bit integers to be multiplied + * @return The 128-bit result represented in an @ref XXH128_hash_t. + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this type + * despite not having the arithmetic for it. This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. 
+ * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; + return r128; + + /* + * MSVC for ARM64's __umulh method. + * + * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. + */ +#elif defined(_M_ARM64) || defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(__umulh) +#endif + XXH128_hash_t r128; + r128.low64 = lhs * rhs; + r128.high64 = __umulh(lhs, rhs); + return r128; + +#else + /* + * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. + * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. 
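+ *
+ *    [Editor's aside] Why the adds cannot overflow: each 32x32 cross product
+ *    is at most (2^32-1)^2 = 2^64 - 2^33 + 1, so
+ *    cross <= (2^64 - 2^33 + 1) + (2^32 - 2) + (2^32 - 1) = 2^64 - 2,
+ *    which still fits in 64 bits; the same bound applies to 'upper'.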
*/ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/*! + * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + * + * @param lhs , rhs The 64-bit integers to multiply + * @return The low 64 bits of the product XOR'd by the high 64 bits. + * @see XXH_mult64to128() + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/*! Seems to produce slightly better code on GCC for some reason. */ +XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * This is a fast avalanche stage, + * suitable when input bits are already partially mixed + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= 0x165667919E3779F9ULL; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + +/* + * This is a stronger avalanche, + * inspired by Pelle Evensen's rrmxmx + * preferable when input has not been previously mixed + */ +static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) +{ + /* this mix is inspired by Pelle Evensen's rrmxmx */ + h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); + h64 *= 0x9FB21C651E98DF25ULL; + h64 ^= (h64 >> 35) + len ; + h64 *= 0x9FB21C651E98DF25ULL; + return XXH_xorshift64(h64, 28); +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. + * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. 
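+ *
+ * [Editor's aside, illustrative only] As a worked example of the packing in
+ * XXH3_len_1to3_64b below: for input {0xAB, 0xCD} (len = 2),
+ * c1 = input[0] = 0xAB, c2 = input[len>>1] = 0xCD, c3 = input[len-1] = 0xCD,
+ * so combined = (c1<<16) | (c2<<24) | c3 | (len<<8) = 0xCDAB02CD.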
+ */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + return XXH64_avalanche(keyed); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 const keyed = input64 ^ bitflip; + return XXH3_rrmxmx(keyed, len); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. 
+ * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. + */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is an interest. + */ + XXH_COMPILER_GUARD(seed64); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * XXH_PRIME64_1, acc_end; +#if XXH_SIZE_OPT >= 1 + /* Smaller and cleaner, but slightly slower. 
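+     * [Editor's aside, illustrative only] Either branch performs the same
+     * pairwise mixing, working inward from both ends of the input:
+     * 2 XXH3_mix16B calls for len 17-32, 4 for 33-64, 6 for 65-96, and
+     * 8 for 97-128, each pair consuming 32 fresh bytes of secret.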
+     */
+    unsigned int i = (unsigned int)(len - 1) / 32;
+    do {
+        acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
+        acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
+    } while (i-- != 0);
+    acc_end = 0;
+#else
+        acc += XXH3_mix16B(input+0, secret+0, seed);
+        acc_end = XXH3_mix16B(input+len-16, secret+16, seed);
+        if (len > 32) {
+            acc += XXH3_mix16B(input+16, secret+32, seed);
+            acc_end += XXH3_mix16B(input+len-32, secret+48, seed);
+            if (len > 64) {
+                acc += XXH3_mix16B(input+32, secret+64, seed);
+                acc_end += XXH3_mix16B(input+len-48, secret+80, seed);
+
+                if (len > 96) {
+                    acc += XXH3_mix16B(input+48, secret+96, seed);
+                    acc_end += XXH3_mix16B(input+len-64, secret+112, seed);
+                }
+            }
+        }
+#endif
+        return XXH3_avalanche(acc + acc_end);
+    }
+}
+
+#define XXH3_MIDSIZE_MAX 240
+
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    #define XXH3_MIDSIZE_STARTOFFSET 3
+    #define XXH3_MIDSIZE_LASTOFFSET  17
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+        xxh_u64 acc_end;
+        unsigned int const nbRounds = (unsigned int)len / 16;
+        unsigned int i;
+        XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+        for (i=0; i<8; i++) {
+            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+        }
+        /* last bytes */
+        acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+        XXH_ASSERT(nbRounds >= 8);
+        acc = XXH3_avalanche(acc);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+         * Everywhere else, it uses scalar code.
+         *
+         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+         * would still be slower than UMAAL (see XXH_mult64to128).
+         *
+         * Unfortunately, Clang doesn't handle the long multiplies properly and
+         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+         * scalarized into an ugly mess of VMOV.32 instructions.
+         *
+         * This mess is difficult to avoid without turning autovectorization
+         * off completely, but they are usually relatively minor and/or not
+         * worth it to fix.
+         *
+         * This loop is the easiest to fix, as unlike XXH32, this pragma
+         * _actually works_ because it is a loop vectorization instead of an
+         * SLP vectorization.
+         */
+        #pragma clang loop vectorize(disable)
+#endif
+        for (i=8 ; i < nbRounds; i++) {
+            /*
+             * Prevents clang from unrolling the acc loop and interleaving with this one.
+             */
+            XXH_COMPILER_GUARD(acc);
+            acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        return XXH3_avalanche(acc + acc_end);
+    }
+}
+
+
+/* =======     Long Keys     ======= */
+
+#define XXH_STRIPE_LEN 64
+#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
+
+#ifdef XXH_OLD_NAMES
+#  define STRIPE_LEN XXH_STRIPE_LEN
+#  define ACC_NB XXH_ACC_NB
+#endif
+
+#ifndef XXH_PREFETCH_DIST
+#  ifdef __clang__
+#    define XXH_PREFETCH_DIST 320
+#  else
+#    if (XXH_VECTOR == XXH_AVX512)
+#      define XXH_PREFETCH_DIST 512
+#    else
+#      define XXH_PREFETCH_DIST 384
+#    endif
+#  endif  /* __clang__ */
+#endif  /* XXH_PREFETCH_DIST */
+
+/*
+ * These macros are to generate an XXH3_accumulate() function.
+ * The two arguments select the name suffix and target attribute.
+ *
+ * The name of this symbol is XXH3_accumulate_<name>() and it calls
+ * XXH3_accumulate_512_<name>().
+ *
+ * It may be useful to hand implement this function if the compiler fails to
+ * optimize the inline function.
+ */
+#define XXH3_ACCUMULATE_TEMPLATE(name)                      \
+void                                                        \
+XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc,           \
+                       const xxh_u8* XXH_RESTRICT input,    \
+                       const xxh_u8* XXH_RESTRICT secret,   \
+                       size_t nbStripes)                    \
+{                                                           \
+    size_t n;                                               \
+    for (n = 0; n < nbStripes; n++ ) {                      \
+        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;  \
+        XXH_PREFETCH(in + XXH_PREFETCH_DIST);               \
+        XXH3_accumulate_512_##name(                         \
+                 acc,                                       \
+                 in,                                        \
+                 secret + n*XXH_SECRET_CONSUME_RATE);       \
+    }                                                       \
+}
+
+
+XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
+{
+    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
+    XXH_memcpy(dst, &v64, sizeof(v64));
+}
+
+/* Several intrinsic functions below are supposed to accept __int64 as argument,
+ * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+ * However, several environments do not define __int64 type,
+ * requiring a workaround.
+ */
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+    typedef int64_t xxh_i64;
+#else
+    /* the following type must have a width of 64-bit */
+    typedef long long xxh_i64;
+#endif
+
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based off of FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input to the accumulators as well as the product.
+ *
+ * This means that in the (relatively likely) case of a multiply by zero, the
+ * original input is preserved.
+ *
+ * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+ * cross-pollination, as otherwise the upper and lower halves would be
+ * essentially independent.
+ *
+ * This doesn't matter on 64-bit hashes since they all get merged together in
+ * the end, so we skip the extra step.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
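+ *
+ * [Editor's aside, illustrative only] Per 64-bit lane, every SIMD variant
+ * below computes the same scalar step; a reference sketch:
+ *
+ *     xxh_u64 data = XXH_readLE64(input  + 8*lane);
+ *     xxh_u64 key  = data ^ XXH_readLE64(secret + 8*lane);
+ *     acc[lane ^ 1] += data;                                  // swap(data)
+ *     acc[lane]     += XXH_mult32to64(key & 0xFFFFFFFF, key >> 32);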
+ */ + +#if (XXH_VECTOR == XXH_AVX512) \ + || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) + +#ifndef XXH_TARGET_AVX512 +# define XXH_TARGET_AVX512 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + __m512i* const xacc = (__m512i *) acc; + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + + { + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } +} +XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. 
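+ *
+ * [Editor's aside, illustrative only] The scalar step each scramble variant
+ * implements per 64-bit lane, as a reference:
+ *
+ *     xxh_u64 a = acc[lane];
+ *     a ^= a >> 47;                          // xorshift
+ *     a ^= XXH_readLE64(secret + 8*lane);    // mix in secret
+ *     acc[lane] = a * XXH_PRIME32_1;         // multiply by a 32-bit prime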
+ */ + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + { __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */); + + /* xacc[0] *= XXH_PRIME32_1; */ + __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); + XXH_ASSERT(((size_t)customSecret & 63) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); + __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); + __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos); + + const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret); + __m512i* const dest = ( __m512i*) customSecret; + int i; + XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 63) == 0); + for (i=0; i < nbRounds; ++i) { + dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_AVX2) \ + || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) + +#ifndef XXH_TARGET_AVX2 +# define XXH_TARGET_AVX2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); + (void)(&XXH_writeLE64); + XXH_PREFETCH(customSecret); + { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); + + const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); + __m256i* dest = ( __m256i*) customSecret; + +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified causes the compiler to: + * - not extract the secret from SSE registers in the internal loop + * - use fewer common registers, avoiding pushing these registers onto the stack + */ + XXH_COMPILER_GUARD(dest); +# endif + XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 31) == 0); + + /* GCC -O2 needs the loop unrolled manually */ + dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed); + dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed); + dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed); + dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed); + dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed); + dest[5] 
= _mm256_add_epi64(_mm256_load_si256(src+5), seed); + } +} + +#endif + +/* x86dispatch always generates SSE2 */ +#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_SSE2 +# define XXH_TARGET_SSE2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); + +# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ + XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; + __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); +# else + __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); +# endif + int i; + + const void* const src16 = XXH3_kSecret; + __m128i* dst16 = (__m128i*) customSecret; +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dst16' as modified causes the compiler to: + * - not extract the secret from SSE registers in the internal loop + * - use fewer common registers, avoiding pushing these registers onto the stack + */ + XXH_COMPILER_GUARD(dst16); +# endif + XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dst16 & 15) == 0); + + for (i=0; i < nbRounds; ++i) { + dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_NEON) + +/* forward declarations for the scalar routines */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, size_t lane); + +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, size_t lane); + +/*! + * @internal + * @brief The bulk processing loop for NEON. + * + * The NEON code path is actually partially scalar when running on AArch64. This + * is to optimize the pipelining and can have up to 15% speedup depending on the + * CPU, and it also mitigates some GCC codegen issues. + * + * @see XXH3_NEON_LANES for configuring this and details about this optimization. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); + { + uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
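+ * (Illustration: with XXH3_NEON_LANES == 6, lanes 0..5 take the NEON path + * in the main loop below, while lanes 6 and 7 are handled by the scalar + * loop over XXH3_scalarRound that runs first.)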
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } + i = 0; + for (; i+1 < XXH3_NEON_LANES / 2; i+=2) { + uint64x2_t acc_vec1 = xacc[i]; + /* data_vec = xinput[i]; */ + uint64x2_t data_vec1 = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec1 = XXH_vld1q_u64(xsecret + (i * 16)); + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t acc_vec_21 = vextq_u64(data_vec1, data_vec1, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key1 = veorq_u64(data_vec1, key_vec1); + + uint64x2_t acc_vec2 = xacc[i+1]; + /* data_vec = xinput[i]; */ + uint64x2_t data_vec2 = XXH_vld1q_u64(xinput + ((i+1) * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec2 = XXH_vld1q_u64(xsecret + ((i+1) * 16)); + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t acc_vec_22 = vextq_u64(data_vec2, data_vec2, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key2 = veorq_u64(data_vec2, key_vec2); + + /* data_key_lo = {(data_key1 & 0xFFFFFFFF), (data_key2 & 0xFFFFFFFF)}; + * data_key_hi = {(data_key1 >> 32), (data_key2 >> 32)}; + */ + uint32x4x2_t zipped = vuzpq_u32(vreinterpretq_u32_u64(data_key1), vreinterpretq_u32_u64(data_key2)); + uint32x4_t data_key_lo = zipped.val[0]; + uint32x4_t data_key_hi = zipped.val[1]; + + /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + acc_vec_21 = vmlal_u32 (acc_vec_21, vget_low_u32(data_key_lo), vget_low_u32(data_key_hi)); + XXH_COMPILER_GUARD_W(acc_vec_21); + /* xacc[i] += acc_vec_2; */ + acc_vec1 = vaddq_u64 (acc_vec1, acc_vec_21); + xacc[i] = acc_vec1; + /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + acc_vec_22 = vmlal_u32 (acc_vec_22, vget_high_u32(data_key_lo), vget_high_u32(data_key_hi)); + XXH_COMPILER_GUARD_W(acc_vec_22); + /* xacc[i] += acc_vec_2; */ + acc_vec2 = vaddq_u64 (acc_vec2, acc_vec_22); + xacc[i+1] = acc_vec2; + } + for (; i < XXH3_NEON_LANES / 2; i++) { + uint64x2_t acc_vec = xacc[i]; + /* data_vec = xinput[i]; */ + uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t acc_vec_2 = vextq_u64(data_vec, data_vec, 1); + /* data_key = data_vec ^ key_vec; */ + data_key = veorq_u64(data_vec, key_vec); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + acc_vec_2 = vmlal_u32 (acc_vec_2, data_key_lo, data_key_hi); + XXH_COMPILER_GUARD_W(acc_vec_2); + /* xacc[i] += acc_vec_2; */ + acc_vec = vaddq_u64 (acc_vec, acc_vec_2); + xacc[i] = acc_vec; + } + + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* xacc = (uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1); + + size_t i; + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + 
XXH3_scalarScrambleRound(acc, secret, i); + } + for (i=0; i < XXH3_NEON_LANES / 2; i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64 (xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1 */ + uint32x2_t data_key_lo, data_key_hi; + /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (xacc[i] >> 32); + * xacc[i] = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + { /* + * prod_hi = (data_key >> 32) * XXH_PRIME32_1; + * + * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will + * incorrectly "optimize" this: + * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); + * shifted = vshll_n_u32(tmp, 32); + * to this: + * tmp = "vmulq_u64"(a, b); // no such thing! + * shifted = vshlq_n_u64(tmp, 32); + * + * However, unlike SSE, Clang lacks a 64-bit multiply routine + * for NEON, and it scalarizes two 64-bit multiplies instead. + * + * vmull_u32 has the same timing as vmul_u32, and it avoids + * this bug completely. + * See https://bugs.llvm.org/show_bug.cgi?id=39967 + */ + uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + /* xacc[i] = prod_hi << 32; */ + prod_hi = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */ + xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime); + } + } + } +} + +#endif + +#if (XXH_VECTOR == XXH_VSX) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* presumed aligned */ + unsigned int* const xacc = (unsigned int*) acc; + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + /* acc_vec = xacc[i]; */ + xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i); + acc_vec += product; + + /* swap high and low halves */ +#ifdef __s390x__ + acc_vec += vec_permi(data_vec, data_vec, 2); +#else + acc_vec += vec_xxpermdi(data_vec, data_vec, 2); +#endif + /* xacc[i] = acc_vec; */ + vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i); + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_u64x2* const xacc = (xxh_u64x2*) acc; + const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 
const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= XXH_PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_SVE) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc); + ACCRND(vacc, 0); + svst1_u64(mask, xacc, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } +} + +XXH_FORCE_INLINE void +XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes) +{ + if (nbStripes != 0) { + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc + 0); + do { + /* svprfd(svbool_t, void *, enum svfprop); */ + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(vacc, 0); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + 
svuint64_t acc1 = svld1_u64(mask, xacc + 4); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } + } +} + +#endif + +/* scalar variants - universal */ + +/*! + * @internal + * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. + */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* xacc = (xxh_u64*) acc; + xxh_u8 const* xinput = (xxh_u8 const*) input; + xxh_u8 const* xsecret = (xxh_u8 const*) secret; + XXH_ASSERT(lane < XXH_ACC_NB); + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + { + xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); + xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ + xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +} + +/*! + * @internal + * @brief Processes a 64 byte block of data using the scalar path. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + size_t i; + /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__arm__) || defined(__thumb2__)) \ + && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ + && XXH_SIZE_OPT <= 0 +# pragma GCC unroll 8 +#endif + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) + +/*! + * @internal + * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. + */ +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + XXH_ASSERT(lane < XXH_ACC_NB); + { + xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8); + xxh_u64 acc64 = xacc[lane]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= XXH_PRIME32_1; + xacc[lane] = acc64; + } +} + +/*! + * @internal + * @brief Scrambles the accumulators after a large chunk has been read + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + size_t i; + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } +} + +XXH_FORCE_INLINE void +XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + /* + * We need a separate pointer for the hack below, + * which requires a non-const pointer. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8* kSecretPtr = XXH3_kSecret; + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__clang__) && defined(__aarch64__) + /* + * UGLY HACK: + * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. 
+ * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), it fights for bandwidth with + * the arithmetic instructions. + * + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes Clang to assume + * that kSecretPtr has been changed), the pipelines are used more + * efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * + * See XXH3_NEON_LANES for details on the pipeline. + * + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + XXH_COMPILER_GUARD(kSecretPtr); +#endif + /* + * Note: in debug mode, this overrides the asm optimization + * and Clang will emit MOVK chains again. + */ + XXH_ASSERT(kSecretPtr == XXH3_kSecret); + + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); + XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); + } } +} + + +typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t); +typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); +typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); + + +#if (XXH_VECTOR == XXH_AVX512) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_accumulate XXH3_accumulate_avx512 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + +#elif (XXH_VECTOR == XXH_AVX2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_accumulate XXH3_accumulate_avx2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + +#elif (XXH_VECTOR == XXH_SSE2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_accumulate XXH3_accumulate_sse2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + +#elif (XXH_VECTOR == XXH_NEON) + +#define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_accumulate XXH3_accumulate_neon +#define XXH3_scrambleAcc XXH3_scrambleAcc_neon +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_VSX) + +#define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_accumulate XXH3_accumulate_vsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_SVE) +#define XXH3_accumulate_512 XXH3_accumulate_512_sve +#define XXH3_accumulate XXH3_accumulate_sve +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#else /* scalar */ + +#define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_accumulate XXH3_accumulate_scalar +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#endif + +#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ +# undef XXH3_initCustomSecret +# define XXH3_initCustomSecret 
XXH3_initCustomSecret_scalar +#endif + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; + size_t const nb_blocks = (len - 1) / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); + f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > XXH_STRIPE_LEN); + { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); + + /* last stripe */ + { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; +#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ + XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. + * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + XXH_COMPILER_GUARD(result64); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ + XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, + const void* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); +} + +/* + * It's important for performance to transmit the secret's size (when it's static) + * so that the compiler can properly optimize the vectorized loop. + * This makes a big performance difference for "medium" keys (<1 KB) when using the AVX instruction set. 
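+ * For instance, XXH3_hashLong_64b_default below passes sizeof(XXH3_kSecret), + * a compile-time constant, whereas XXH3_hashLong_64b_withSecret forwards a + * runtime secretLen, so its loop cannot be specialized as aggressively.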
+ */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * It's preferable for performance that XXH3_hashLong is not inlined, + * as it results in a smaller function for small data, easier on the instruction cache. + * Note that inside this no_inline function, we do inline the internal loop, + * and provide a statically defined secret size to allow optimization of the vector loop. + */ +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of the default XXH3_kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, + XXH64_hash_t seed, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ +#if XXH_SIZE_OPT <= 0 + if (seed == 0) + return XXH3_hashLong_64b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); +#endif + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_64b_withSeed_internal(input, len, seed, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + + +typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong64_f f_hashLong) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if the `secretLen` condition is not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + * Also, note that the function signature doesn't offer room to return an error. 
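+ * + * For reference, a minimal one-shot use of the public wrappers defined + * below (buffer names are illustrative): + * XXH64_hash_t h0 = XXH3_64bits(buf, bufSize); + * XXH64_hash_t h1 = XXH3_64bits_withSeed(buf, bufSize, seed);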
+ */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); +} + + +/* === Public entry point === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length) +{ + return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed) +{ + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (length <= XXH3_MIDSIZE_MAX) + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize); +} + + +/* === XXH3 streaming === */ +#ifndef XXH_NO_STREAM +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. + * Credit to RedSpah for noticing this bug. + * + * The alignment is done manually: Functions like posix_memalign or _mm_malloc + * are avoided: To maintain portability, we would have to write a fallback + * like this anyways, and besides, testing for the existence of library + * functions without relying on external build tools is impossible. + * + * The method is simple: Overallocate, manually align, and store the offset + * to the original behind the returned pointer. + * + * Align must be a power of 2 and 8 <= align <= 128. + */ +static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align) +{ + XXH_ASSERT(align <= 128 && align >= 8); /* range check */ + XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ + XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ + { /* Overallocate to make room for manual realignment and an offset byte */ + xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); + if (base != NULL) { + /* + * Get the offset needed to align this pointer. + * + * Even if the returned pointer is aligned, there will always be + * at least one byte to store the offset to the original pointer. 
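+ * + * Worked example, assuming align == 64 and a base address whose low bits + * are 0x30: offset = 64 - 0x30 = 0x10, so ptr = base + 0x10 is 64-byte + * aligned and ptr[-1] stores 0x10; XXH_alignedFree() later recovers + * base = ptr - 0x10.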
+ */ + size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ + /* Add the offset for the now-aligned pointer */ + xxh_u8* ptr = base + offset; + + XXH_ASSERT((size_t)ptr % align == 0); + + /* Store the offset immediately before the returned pointer. */ + ptr[-1] = (xxh_u8)offset; + return ptr; + } + return NULL; + } +} +/* + * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass + * normal malloc'd pointers; XXH_alignedMalloc has a specific data layout. + */ +static void XXH_alignedFree(void* p) +{ + if (p != NULL) { + xxh_u8* ptr = (xxh_u8*)p; + /* Get the offset byte we added in XXH_alignedMalloc. */ + xxh_u8 offset = ptr[-1]; + /* Free the original malloc'd pointer */ + xxh_u8* base = ptr - offset; + XXH_free(base); + } +} +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) +{ + XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); + if (state==NULL) return NULL; + XXH3_INITSTATE(state); + return state; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) +{ + XXH_alignedFree(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API void +XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state) +{ + XXH_memcpy(dst_state, src_state, sizeof(*dst_state)); +} + +static void +XXH3_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const void* secret, size_t secretSize) +{ + size_t const initStart = offsetof(XXH3_state_t, bufferedSize); + size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; + XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); + XXH_ASSERT(statePtr != NULL); + /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ + memset((char*)statePtr + initStart, 0, initLength); + statePtr->acc[0] = XXH_PRIME32_3; + statePtr->acc[1] = XXH_PRIME64_1; + statePtr->acc[2] = XXH_PRIME64_2; + statePtr->acc[3] = XXH_PRIME64_3; + statePtr->acc[4] = XXH_PRIME64_4; + statePtr->acc[5] = XXH_PRIME32_2; + statePtr->acc[6] = XXH_PRIME64_5; + statePtr->acc[7] = XXH_PRIME32_1; + statePtr->seed = seed; + statePtr->useSeed = (seed != 0); + statePtr->extSecret = (const unsigned char*)secret; + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; + statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + if (seed==0) return XXH3_64bits_reset(statePtr); + if ((seed != statePtr->seed) || (statePtr->extSecret != NULL)) + XXH3_initCustomSecret(statePtr->customSecret, seed); + XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64) +{ + if (statePtr == NULL) return XXH_ERROR; + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + XXH3_reset_internal(statePtr, seed64, secret, secretSize); + statePtr->useSeed = 1; /* always, even if seed64==0 */ + return XXH_OK; +} + +/* Note : when XXH3_consumeStripes() is invoked, + * there must be a guarantee that at least one more byte will be consumed from input + * so that the function can blindly consume all stripes using the "normal" secret segment */ +XXH_FORCE_INLINE void +XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, + size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, + const xxh_u8* XXH_RESTRICT input, size_t nbStripes, + const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) { + /* need a scrambling operation */ + size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; + size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock; + f_acc(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock); + f_scramble(acc, secret + secretLimit); + f_acc(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock); + *nbStripesSoFarPtr = nbStripesAfterBlock; + } else { + f_acc(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes); + *nbStripesSoFarPtr += nbStripes; + } +} + +#ifndef XXH3_STREAM_USE_STACK +# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */ +# define XXH3_STREAM_USE_STACK 1 +# endif +#endif +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* XXH_RESTRICT const state, + const xxh_u8* XXH_RESTRICT input, size_t len, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + XXH_ASSERT(state != NULL); + { const xxh_u8* const bEnd = input + len; + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* For some reason, gcc and MSVC seem to suffer greatly + * when operating on accumulators directly in state. + * Operating on stack space seems to enable proper optimization. 
+ * clang, on the other hand, doesn't seem to need this trick */ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc)); +#else + xxh_u64* XXH_RESTRICT const acc = state->acc; +#endif + state->totalLen += len; + XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); + + /* small input : just fill in tmp buffer */ + if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + + /* total input is now > XXH3_INTERNALBUFFER_SIZE */ + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ + + /* + * Internal buffer is partially filled (always, except at beginning) + * Complete it, then consume it. + */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + secret, state->secretLimit, + f_acc, f_scramble); + state->bufferedSize = 0; + } + XXH_ASSERT(input < bEnd); + + /* large input to consume : ingest per full block */ + if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) { + size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; + XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar); + /* join to current block's end */ + { size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar; + XXH_ASSERT(nbStripesToEnd <= nbStripes); + f_acc(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd); + f_scramble(acc, secret + state->secretLimit); + state->nbStripesSoFar = 0; + input += nbStripesToEnd * XXH_STRIPE_LEN; + nbStripes -= nbStripesToEnd; + } + /* consume per entire blocks */ + while(nbStripes >= state->nbStripesPerBlock) { + f_acc(acc, input, secret, state->nbStripesPerBlock); + f_scramble(acc, secret + state->secretLimit); + input += state->nbStripesPerBlock * XXH_STRIPE_LEN; + nbStripes -= state->nbStripesPerBlock; + } + /* consume last partial block */ + f_acc(acc, input, secret, nbStripes); + input += nbStripes * XXH_STRIPE_LEN; + XXH_ASSERT(input < bEnd); /* at least some bytes left */ + state->nbStripesSoFar = nbStripes; + /* buffer predecessor of last partial stripe */ + XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); + XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN); + } else { + /* content to consume <= block size */ + /* Consume input by a multiple of internal buffer size */ + if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { + const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; + do { + XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, XXH3_INTERNALBUFFER_STRIPES, + secret, state->secretLimit, + f_acc, f_scramble); + input += XXH3_INTERNALBUFFER_SIZE; + } while (input < limit); + /* buffer predecessor of last partial stripe */ + XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); + } + } + + /* Some remaining input (always) : buffer it */ + XXH_ASSERT(input < bEnd); + XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); + XXH_ASSERT(state->bufferedSize == 0); + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); +#if 
defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* save stack accumulators into state */ + memcpy(state->acc, acc, sizeof(acc)); +#endif + } + + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate, XXH3_scrambleAcc); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, + const XXH3_state_t* state, + const unsigned char* secret) +{ + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + XXH_memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= XXH_STRIPE_LEN) { + size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; + size_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, nbStripes, + secret, state->secretLimit, + XXH3_accumulate, XXH3_scrambleAcc); + /* last stripe */ + XXH3_accumulate_512(acc, + state->buffer + state->bufferedSize - XXH_STRIPE_LEN, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); + } else { /* bufferedSize < XXH_STRIPE_LEN */ + xxh_u8 lastStripe[XXH_STRIPE_LEN]; + size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; + XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ + XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH3_accumulate_512(acc, + lastStripe, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); + } +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + return XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + } + /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ + if (state->useSeed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ + + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. + * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). 
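+ * + * For reference, a typical one-shot call of the public API defined further + * below (buffer names are illustrative): + * XXH128_hash_t h = XXH128(buf, bufSize, seed); + * with the two halves of the result then available as h.low64 and h.high64.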
+ */ + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. */ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + XXH128_hash_t h128; + h128.low64 = XXH64_avalanche(keyed_lo); + h128.high64 = XXH64_avalanche(keyed_hi); + return h128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. + */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. 
+ * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. + * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = XXH_PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); + h128.high64 += m128.high64 * XXH_PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH64_avalanche(seed ^ bitflipl); + h128.high64 = XXH64_avalanche( seed ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + +#if XXH_SIZE_OPT >= 1 + { + /* Smaller, but slightly slower. 
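+ * For example, with len == 100 this loop starts at i == 3 and issues the + * same four XXH128_mix32B calls, in the same order, as the unrolled + * branch in the #else path below.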
*/ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed); + } while (i-- != 0); + } +#else + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); +#endif + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + unsigned i; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + /* + * We set `i` to offset + 32. We do this so that unchanged + * `len` can be used as upper bound. This reaches a sweet spot + * where both x86 and aarch64 get simple agen and good codegen + * for the loop. + */ + for (i = 32; i < 160; i += 32) { + acc = XXH128_mix32B(acc, + input + i - 32, + input + i - 16, + secret + i - 32, + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + /* + * NB: `i <= len` will duplicate the last 32 bytes if + * len % 32 was zero. This is an unfortunate necessity to keep + * the hash result stable. + */ + for (i=160; i <= len; i += 32) { + acc = XXH128_mix32B(acc, + input + i - 32, + input + i - 16, + secret + XXH3_MIDSIZE_STARTOFFSET + i - 160, + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + (XXH64_hash_t)0 - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * XXH_PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong() is not inlined. 
+ */ +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * It's important for performance to pass @p secretLen (when it's static) + * to the compiler, so that it can properly optimize the vectorized loop. + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, + XXH3_accumulate, XXH3_scrambleAcc); +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ + if (seed64 == 0) + return XXH3_hashLong_128b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed64); + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + +typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const void* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_128bits_internal(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong128_f f_hl128) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hl128(input, len, seed64, secret, secretLen); +} + + +/* === Public XXH128 API === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_default); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_128bits_internal(input, len, 0, + (const xxh_u8*)secret, secretSize, + XXH3_hashLong_128b_withSecret); +} + +/*! 
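+ * One-shot usage sketch (illustrative; `buf` and `bufSize` are hypothetical
+ * caller-side names):
+ *
+ *     XXH128_hash_t h = XXH3_128bits_withSeed(buf, bufSize, 1234);
+ *     printf("%016llx%016llx\n", (unsigned long long)h.high64,
+ *                                (unsigned long long)h.low64);
+ *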
@ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_internal(input, len, seed,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_withSeed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* ===   XXH3 128-bit streaming   === */
+#ifndef XXH_NO_STREAM
+/*
+ * All initialization and update functions are identical to 64-bit streaming variant.
+ * The only difference is the finalization routine.
+ */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+{
+    return XXH3_64bits_reset(statePtr);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    return XXH3_64bits_reset_withSeed(statePtr, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len,
+                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+{
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * XXH_PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         secret + state->secretLimit + XXH_STRIPE_LEN
+                                                - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->seed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+#endif /* !XXH_NO_STREAM */
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 if equal, 0 if different */
+/*!
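+ * Streaming usage sketch for the functions above (illustrative; `fill()`,
+ * `chunk` and `n` are hypothetical caller-side names; createState/freeState
+ * are part of the XXH3 API declared earlier in this header):
+ *
+ *     XXH3_state_t* st = XXH3_createState();
+ *     XXH3_128bits_reset(st);
+ *     while (fill(chunk, &n))
+ *         XXH3_128bits_update(st, chunk, n);
+ *     XXH128_hash_t const h = XXH3_128bits_digest(st);
+ *     XXH3_freeState(st);
+ *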
@ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) +{ + /* note : XXH128_hash_t is compact, it has no padding byte */ + return !(memcmp(&h1, &h2, sizeof(h1))); +} + +/* This prototype is compatible with stdlib's qsort(). + * @return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + XXH_memcpy(dst, &hash.high64, sizeof(hash.high64)); + XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + + + +/* ========================================== + * Secret generators + * ========================================== + */ +#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) + +XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128) +{ + XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 ); + XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 ); +} + +/*! 
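+ * Round-trip sketch for the canonical helpers above (illustrative):
+ *
+ *     XXH128_canonical_t c;
+ *     XXH128_canonicalFromHash(&c, h);     // h serialized as big-endian bytes
+ *     XXH128_hash_t back = XXH128_hashFromCanonical(&c);
+ *     assert(XXH128_isEqual(h, back));     // the round-trip is lossless
+ *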
@ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
+{
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(secretBuffer != NULL);
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+#else
+    /* production mode, assert() are disabled */
+    if (secretBuffer == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+#endif
+
+    if (customSeedSize == 0) {
+        customSeed = XXH3_kSecret;
+        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
+    }
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(customSeed != NULL);
+#else
+    if (customSeed == NULL) return XXH_ERROR;
+#endif
+
+    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
+    {   size_t pos = 0;
+        while (pos < secretSize) {
+            size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
+            memcpy((char*)secretBuffer + pos, customSeed, toCopy);
+            pos += toCopy;
+    }   }
+
+    {   size_t const nbSeg16 = secretSize / 16;
+        size_t n;
+        XXH128_canonical_t scrambler;
+        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
+        for (n=0; n
+/* END RocksDB customizations */
+
+// clang-format off
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************
+*  Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXPH_OK=0, XXPH_ERROR } XXPH_errorcode;
+
+
+/* ****************************
+ *  API modifier
+ ******************************/
+/** XXPH_INLINE_ALL (and XXPH_PRIVATE_API)
+ *  This build macro includes xxhash functions in `static` mode
+ *  in order to inline them, and remove their symbol from the public list.
+ *  Inlining offers great performance improvement on small keys,
+ *  and dramatic ones when length is expressed as a compile-time constant.
+ *  See https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html .
+ *  Methodology :
+ *     #define XXPH_INLINE_ALL
+ *     #include "xxhash.h"
+ *  `xxhash.c` is automatically included.
+ *  It's not useful to compile and link it as a separate object.
+ */
+#if defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API)
+#  ifndef XXPH_STATIC_LINKING_ONLY
+#    define XXPH_STATIC_LINKING_ONLY
+#  endif
+#  if defined(__GNUC__)
+#    define XXPH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXPH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXPH_PUBLIC_API static __inline
+#  else
+     /* this version may generate warnings for unused static functions */
+#    define XXPH_PUBLIC_API static
+#  endif
+#else
+#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXPH_IMPORT) || defined(XXPH_EXPORT))
+#    ifdef XXPH_EXPORT
+#      define XXPH_PUBLIC_API __declspec(dllexport)
+#    elif XXPH_IMPORT
+#      define XXPH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXPH_PUBLIC_API   /* do nothing */
+#  endif
+#endif /* XXPH_INLINE_ALL || XXPH_PRIVATE_API */
+
+/*! XXPH_NAMESPACE, aka Namespace Emulation :
+ *
+ * If you want to include _and expose_ xxHash functions from within your own library,
+ * but also want to avoid symbol collisions with other libraries which may also include xxHash,
+ *
+ * you can use XXPH_NAMESPACE, to automatically prefix any public symbol from xxhash library
+ * with the value of XXPH_NAMESPACE (therefore, avoid NULL and numeric values).
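+ * For example, building with `-DXXPH_NAMESPACE=MYLIB_` (an illustrative
+ * prefix; any identifier-safe value works) makes the linker-visible name of
+ * XXPH3_64bits become MYLIB_XXPH3_64bits, via the XXPH_NAME2()/XXPH_CAT()
+ * token pasting defined just below.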
+ *
+ * Note that no change is required within the calling program as long as it includes `xxhash.h` :
+ * regular symbol name will be automatically translated by this header.
+ */
+#ifdef XXPH_NAMESPACE
+#  define XXPH_CAT(A,B) A##B
+#  define XXPH_NAME2(A,B) XXPH_CAT(A,B)
+#  define XXPH_versionNumber XXPH_NAME2(XXPH_NAMESPACE, XXPH_versionNumber)
+#endif
+
+
+/* *************************************
+*  Version
+***************************************/
+#define XXPH_VERSION_MAJOR    0
+#define XXPH_VERSION_MINOR    7
+#define XXPH_VERSION_RELEASE  2
+#define XXPH_VERSION_NUMBER  (XXPH_VERSION_MAJOR *100*100 + XXPH_VERSION_MINOR *100 + XXPH_VERSION_RELEASE)
+XXPH_PUBLIC_API unsigned XXPH_versionNumber (void);
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint32_t XXPH32_hash_t;
+#else
+#   include <limits.h>
+#   if UINT_MAX == 0xFFFFFFFFUL
+      typedef unsigned int XXPH32_hash_t;
+#   else
+#     if ULONG_MAX == 0xFFFFFFFFUL
+        typedef unsigned long XXPH32_hash_t;
+#     else
+#       error "unsupported platform : need a 32-bit type"
+#     endif
+#   endif
+#endif
+
+#ifndef XXPH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint64_t XXPH64_hash_t;
+#else
+    /* the following type must have a width of 64-bit */
+    typedef unsigned long long XXPH64_hash_t;
+#endif
+
+#endif  /* XXPH_NO_LONG_LONG */
+
+
+
+#ifdef XXPH_STATIC_LINKING_ONLY
+
+/* ================================================================================================
+   This section contains declarations which are not guaranteed to remain stable.
+   They may change in future versions, becoming incompatible with a different version of the library.
+   These declarations should only be used with static linking.
+   Never use them in association with dynamic linking !
+=================================================================================================== */
+
+
+/*-**********************************************************************
+* XXPH3
+* New experimental hash
+************************************************************************/
+#ifndef XXPH_NO_LONG_LONG
+
+
+/* ============================================
+ * XXPH3 is a new hash algorithm,
+ * featuring improved speed performance for both small and large inputs.
+ * See full speed analysis at : http://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ * In general, expect XXPH3 to run about ~2x faster on large inputs,
+ * and >3x faster on small ones, though exact differences depend on platform.
+ *
+ * The algorithm is portable, will generate the same hash on all platforms.
+ * It benefits greatly from vectorization units, but does not require it.
+ *
+ * XXPH3 offers 2 variants, _64bits and _128bits.
+ * When only 64 bits are needed, prefer calling the _64bits variant :
+ * it reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The XXPH3 algorithm is still considered experimental.
+ * Produced results can still change between versions.
+ * Results produced by v0.7.x are not comparable with results from v0.7.y.
+ * It's nonetheless possible to use XXPH3 for ephemeral data (local sessions),
+ * but avoid storing values in long-term storage for later reads.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ *
+ * There are still a number of open questions that the community can influence during the experimental period.
+ * I'm trying to list a few of them below, though don't consider this list as complete.
+ *
+ * - 128-bits output type : currently defined as a structure of two 64-bits fields.
+ *   That's because 128-bit values do not exist in C standard.
+ *   Note that it means that, at byte level, result is not identical depending on endianess.
+ *   However, at field level, they are identical on all platforms.
+ *   The canonical representation solves the issue of identical byte-level representation across platforms,
+ *   which is necessary for serialization.
+ *   Q1 : Would there be a better representation for a 128-bit hash result ?
+ *   Q2 : Are the names of the inner 64-bit fields important ? Should they be changed ?
+ *
+ * - Prototype XXPH128() : XXPH128() uses the same arguments as XXPH64(), for consistency.
+ *   It means it maps to XXPH3_128bits_withSeed().
+ *   This variant is slightly slower than XXPH3_128bits(),
+ *   because the seed is now part of the algorithm, and can't be simplified.
+ *   Is that a good idea ?
+ *
+ * - Seed type for XXPH128() : currently, it's a single 64-bit value, like the 64-bit variant.
+ *   It could be argued that it's more logical to offer a 128-bit seed input parameter for a 128-bit hash.
+ *   But a 128-bit seed is more difficult to use, since it requires passing a structure instead of a scalar value.
+ *   Such a variant could either replace the current one, or become an additional one.
+ *   Farmhash, for example, offers both variants (the 128-bits seed variant is called `doubleSeed`).
+ *   Follow-up question : if both 64-bit and 128-bit seeds are allowed, which variant should be called XXPH128 ?
+ *
+ * - Result for len==0 : Currently, the result of hashing a zero-length input is always `0`.
+ *   It seems okay as a return value when using "default" secret and seed.
+ *   But is it still fine to return `0` when secret or seed are non-default ?
+ *   Are there use cases which could depend on generating a different hash result for zero-length input when the secret is different ?
+ *
+ * - Consistency (1) : Streaming XXPH128 uses an XXPH3 state, which is the same state as XXPH3_64bits().
+ *   It means a 128bit streaming loop must invoke the following symbols :
+ *   XXPH3_createState(), XXPH3_128bits_reset(), XXPH3_128bits_update() (loop), XXPH3_128bits_digest(), XXPH3_freeState().
+ *   Is that consistent enough ?
+ *
+ * - Consistency (2) : The canonical representation of `XXPH3_64bits` is provided by existing functions
+ *   XXPH64_canonicalFromHash(), and reverse operation XXPH64_hashFromCanonical().
+ *   As a mirror, canonical functions for XXPH128_hash_t results generated by `XXPH3_128bits`
+ *   are XXPH128_canonicalFromHash() and XXPH128_hashFromCanonical().
+ *   Which means, `XXPH3` doesn't appear in the names, because canonical functions operate on a type,
+ *   independently of which algorithm was used to generate that type.
+ *   Is that consistent enough ?
+ */
+
+#ifdef XXPH_NAMESPACE
+#  define XXPH3_64bits XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits)
+#  define XXPH3_64bits_withSecret XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits_withSecret)
+#  define XXPH3_64bits_withSeed XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits_withSeed)
+#endif
+
+/* XXPH3_64bits() :
+ * default 64-bit variant, using default secret and default seed of 0.
+ * It's the fastest variant. */
+XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits(const void* data, size_t len);
+
+/* XXPH3_64bits_withSecret() :
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The secret *must* be large enough (>= XXPH3_SECRET_SIZE_MIN).
+ * It should consist of random bytes.
+ * Avoid repeating same character, or sequences of bytes,
+ * and especially avoid swathes of \0.
+ * Failure to respect these conditions will result in a poor quality hash.
+ */
+#define XXPH3_SECRET_SIZE_MIN 136
+XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+/* XXPH3_64bits_withSeed() :
+ * This variant generates on the fly a custom secret,
+ * based on the default secret, altered using the `seed` value.
+ * While this operation is decently fast, note that it's not completely free.
+ * note : seed==0 produces same results as XXPH3_64bits() */
+XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits_withSeed(const void* data, size_t len, XXPH64_hash_t seed);
+
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
+#  include <stdalign.h>
+#  define XXPH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXPH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXPH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXPH_ALIGN(n)   /* disabled */
+#endif
+
+#define XXPH3_SECRET_DEFAULT_SIZE 192   /* minimum XXPH3_SECRET_SIZE_MIN */
+
+#endif  /* XXPH_NO_LONG_LONG */
+
+
+/*-**********************************************************************
+*  XXPH_INLINE_ALL
+************************************************************************/
+#if defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API)
+
+/* === RocksDB modification: was #include here but permanently inlining === */
+
+typedef struct {
+    XXPH64_hash_t low64;
+    XXPH64_hash_t high64;
+} XXPH128_hash_t;
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+/*!XXPH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on compiler but violates the C standard.
+ *            It can generate buggy code on targets which do not support unaligned memory accesses.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://stackoverflow.com/a/32095106/646947 for details.
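+ * For example (illustrative), `cc -DXXPH_FORCE_MEMORY_ACCESS=1 ...` pins
+ * Method 1 regardless of the auto-detection below.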
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef XXPH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
+#    define XXPH_FORCE_MEMORY_ACCESS 2
+#  elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+  (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+#    define XXPH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+/*!XXPH_ACCEPT_NULL_INPUT_POINTER :
+ * If input pointer is NULL, xxHash default behavior is to dereference it, triggering a segfault.
+ * When this macro is enabled, xxHash actively checks input for null pointer.
+ * If it is, result for null input pointers is the same as a null-length input.
+ */
+#ifndef XXPH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
+#  define XXPH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+/*!XXPH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned,
+ * or when alignment doesn't matter for performance.
+ */
+#ifndef XXPH_FORCE_ALIGN_CHECK /* can be defined externally */
+#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#    define XXPH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXPH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+/*!XXPH_REROLL:
+ * Whether to reroll XXPH32_finalize, and XXPH64_finalize,
+ * instead of using an unrolled jump table/if statement loop.
+ *
+ * This is automatically defined on -Os/-Oz on GCC and Clang. */
+#ifndef XXPH_REROLL
+#  if defined(__OPTIMIZE_SIZE__)
+#    define XXPH_REROLL 1
+#  else
+#    define XXPH_REROLL 0
+#  endif
+#endif
+
+#include <limits.h>   /* ULLONG_MAX */
+
+#ifndef XXPH_STATIC_LINKING_ONLY
+#define XXPH_STATIC_LINKING_ONLY
+#endif
+
+/* BEGIN RocksDB customizations */
+#include "port/lang.h" /* for FALLTHROUGH_INTENDED, inserted as appropriate */
+/* END RocksDB customizations */
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
+#  define XXPH_FORCE_INLINE static __forceinline
+#  define XXPH_NO_INLINE static __declspec(noinline)
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define XXPH_FORCE_INLINE static inline __attribute__((always_inline))
+#      define XXPH_NO_INLINE static __attribute__((noinline))
+#    else
+#      define XXPH_FORCE_INLINE static inline
+#      define XXPH_NO_INLINE static
+#    endif
+#  else
+#    define XXPH_FORCE_INLINE static
+#    define XXPH_NO_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+
+/* *************************************
+*  Debug
+***************************************/
+/* DEBUGLEVEL is expected to be defined externally,
+ * typically through compiler command line.
+ * Value must be a number.
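+ * For example (illustrative), building with `-DDEBUGLEVEL=1` turns
+ * XXPH_ASSERT() below into a real assert(), while the default of 0 compiles
+ * the checks away entirely.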
 */
+#ifndef DEBUGLEVEL
+#  define DEBUGLEVEL 0
+#endif
+
+#if (DEBUGLEVEL>=1)
+#  include <assert.h>   /* note : can still be disabled with NDEBUG */
+#  define XXPH_ASSERT(c)   assert(c)
+#else
+#  define XXPH_ASSERT(c)   ((void)0)
+#endif
+
+/* note : use after variable declarations */
+#define XXPH_STATIC_ASSERT(c)  { enum { XXPH_sa = 1/(int)(!!(c)) }; }
+
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+   typedef uint8_t xxh_u8;
+#else
+   typedef unsigned char xxh_u8;
+#endif
+typedef XXPH32_hash_t xxh_u32;
+
+
+/* ===   Memory access   === */
+
+#if (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static xxh_u32 XXPH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+static xxh_u32 XXPH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+static xxh_u32 XXPH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXPH_FORCE_DIRECT_MEMORY_ACCESS */
+
+
+/* ===   Endianess   === */
+
+/* XXPH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXPH_CPU_LITTLE_ENDIAN
+#  if defined(_WIN32) /* Windows is always little endian */ \
+  || defined(__LITTLE_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#    define XXPH_CPU_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXPH_CPU_LITTLE_ENDIAN 0
+#  else
+static int XXPH_isLittleEndian(void)
+{
+    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };   /* don't use static : performance detrimental */
+    return one.c[0];
+}
+#   define XXPH_CPU_LITTLE_ENDIAN   XXPH_isLittleEndian()
+#  endif
+#endif
+
+
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define XXPH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#ifndef __has_builtin
+#  define __has_builtin(x) 0
+#endif
+
+#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) && __has_builtin(__builtin_rotateleft64)
+#  define XXPH_rotl32 __builtin_rotateleft32
+#  define XXPH_rotl64 __builtin_rotateleft64
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#elif defined(_MSC_VER)
+#  define XXPH_rotl32(x,r) _rotl(x,r)
+#  define XXPH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXPH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXPH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+#endif
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXPH_swap32 _byteswap_ulong
+#elif XXPH_GCC_VERSION >= 403
+#  define XXPH_swap32 __builtin_bswap32
+#else
+static xxh_u32 XXPH_swap32 (xxh_u32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+} +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXPH_aligned, XXPH_unaligned } XXPH_alignment; + +XXPH_FORCE_INLINE xxh_u32 XXPH_readLE32(const void* ptr) +{ + return XXPH_CPU_LITTLE_ENDIAN ? XXPH_read32(ptr) : XXPH_swap32(XXPH_read32(ptr)); +} + +XXPH_FORCE_INLINE xxh_u32 +XXPH_readLE32_align(const void* ptr, XXPH_alignment align) +{ + if (align==XXPH_unaligned) { + return XXPH_readLE32(ptr); + } else { + return XXPH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXPH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +XXPH_PUBLIC_API unsigned XXPH_versionNumber (void) { return XXPH_VERSION_NUMBER; } + + +static const xxh_u32 PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */ +static const xxh_u32 PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */ +static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */ +static const xxh_u32 PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */ +static const xxh_u32 PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */ + +#ifndef XXPH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/*====== Memory access ======*/ + +typedef XXPH64_hash_t xxh_u64; + +#if (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXPH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; } + +#elif (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==1)) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +static xxh_u64 XXPH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + +#else + +/* portable and safe solution. Generally efficient. + * see : http://stackoverflow.com/a/32095106/646947 + */ + +static xxh_u64 XXPH_read64(const void* memPtr) +{ + xxh_u64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXPH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXPH_swap64 _byteswap_uint64 +#elif XXPH_GCC_VERSION >= 403 +# define XXPH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXPH_swap64 (xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + +XXPH_FORCE_INLINE xxh_u64 XXPH_readLE64(const void* ptr) +{ + return XXPH_CPU_LITTLE_ENDIAN ? XXPH_read64(ptr) : XXPH_swap64(XXPH_read64(ptr)); +} + +XXPH_FORCE_INLINE xxh_u64 +XXPH_readLE64_align(const void* ptr, XXPH_alignment align) +{ + if (align==XXPH_unaligned) + return XXPH_readLE64(ptr); + else + return XXPH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u64*)ptr : XXPH_swap64(*(const xxh_u64*)ptr); +} + + +/*====== xxh64 ======*/ + +static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + + +/* ********************************************************************* +* XXPH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ + +/*======== Was #include "xxh3.h", now inlined below ==========*/ + +/* + xxHash - Extremely Fast Hash algorithm + Development source file for `xxh3` + Copyright (C) 2019-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + +/* RocksDB Note: This file contains a preview release (xxhash repository + version 0.7.2) of XXPH3 that is unlikely to be compatible with the final + version of XXPH3. We have therefore renamed this XXPH3 ("preview"), for + clarity so that we can continue to use this version even after + integrating a newer incompatible version. 
+*/
+
+/* ===   Dependencies   === */
+
+#undef XXPH_INLINE_ALL   /* in case it's already defined */
+#define XXPH_INLINE_ALL
+
+
+/* ===   Compiler specifics   === */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define XXPH_RESTRICT   restrict
+#else
+/* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */
+#  define XXPH_RESTRICT   /* disable */
+#endif
+
+#if defined(__GNUC__)
+#  if defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#    define inline __inline__  /* clang bug */
+#    include <arm_neon.h>
+#    undef inline
+#  endif
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+/*
+ * Sanity check.
+ *
+ * XXPH3 only requires these features to be efficient:
+ *
+ *  - Usable unaligned access
+ *  - A 32-bit or 64-bit ALU
+ *      - If 32-bit, a decent ADC instruction
+ *  - A 32 or 64-bit multiply with a 64-bit result
+ *
+ * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
+ * classic 16-bit only subset of ARM's instruction set.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ *
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+ * Lo registers, and this shuffling results in thousands more MOVs than A32.
+ *
+ * A32 and T32 don't have this limitation. They can access all 14 registers,
+ * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
+ *
+ * If compiling Thumb-1 for a target which supports ARM instructions, we
+ * will give a warning.
+ *
+ * Usually, if this happens, it is because of an accident and you probably
+ * need to specify -march, as you probably meant to compile for a newer
+ * architecture.
+ */
+#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
+#  warning "XXPH3 is highly inefficient without ARM or Thumb-2."
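+/* (Illustrative note: per the comment above, the usual fix is to compile for
+ *  a target with ARM or Thumb-2 support, e.g. -march=armv7-a. Separately, the
+ *  vectorization choice made below can be pinned on the command line with
+ *  e.g. -DXXPH_VECTOR=XXPH_SCALAR.) */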
+#endif
+
+/* ==========================================
+ * Vectorization detection
+ * ========================================== */
+#define XXPH_SCALAR 0
+#define XXPH_SSE2   1
+#define XXPH_AVX2   2
+#define XXPH_NEON   3
+#define XXPH_VSX    4
+
+#ifndef XXPH_VECTOR    /* can be defined on command line */
+#  if defined(__AVX2__)
+#    define XXPH_VECTOR XXPH_AVX2
+#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+#    define XXPH_VECTOR XXPH_SSE2
+#  elif defined(__GNUC__) /* msvc support maybe later */ \
+  && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
+  && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
+    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+#    define XXPH_VECTOR XXPH_NEON
+#  elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
+#    define XXPH_VECTOR XXPH_VSX
+#  else
+#    define XXPH_VECTOR XXPH_SCALAR
+#  endif
+#endif
+
+/* control alignment of accumulator,
+ * for compatibility with fast vector loads */
+#ifndef XXPH_ACC_ALIGN
+#  if XXPH_VECTOR == 0   /* scalar */
+#    define XXPH_ACC_ALIGN 8
+#  elif XXPH_VECTOR == 1  /* sse2 */
+#    define XXPH_ACC_ALIGN 16
+#  elif XXPH_VECTOR == 2  /* avx2 */
+#    define XXPH_ACC_ALIGN 32
+#  elif XXPH_VECTOR == 3  /* neon */
+#    define XXPH_ACC_ALIGN 16
+#  elif XXPH_VECTOR == 4  /* vsx */
+#    define XXPH_ACC_ALIGN 16
+#  endif
+#endif
+
+/* xxh_u64 XXPH_mult32to64(xxh_u32 a, xxh_u64 b) { return (xxh_u64)a * (xxh_u64)b; } */
+#if defined(_MSC_VER) && defined(_M_IX86)
+#    include <intrin.h>
+#    define XXPH_mult32to64(x, y) __emulu(x, y)
+#else
+#    define XXPH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF))
+#endif
+
+/* VSX stuff. It's a lot because VSX support is mediocre across compilers and
+ * there is a lot of mischief with endianness. */
+#if XXPH_VECTOR == XXPH_VSX
+#  include <altivec.h>
+#  undef vector
+typedef __vector unsigned long long U64x2;
+typedef __vector unsigned char U8x16;
+typedef __vector unsigned U32x4;
+
+#ifndef XXPH_VSX_BE
+#  if defined(__BIG_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXPH_VSX_BE 1
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#    warning "-maltivec=be is not recommended. Please use native endianness."
+#    define XXPH_VSX_BE 1
+#  else
+#    define XXPH_VSX_BE 0
+#  endif
+#endif
+
+/* We need some helpers for big endian mode. */
+#if XXPH_VSX_BE
+/* A wrapper for POWER9's vec_revb. */
+#  ifdef __POWER9_VECTOR__
+#    define XXPH_vec_revb vec_revb
+#  else
+XXPH_FORCE_INLINE U64x2 XXPH_vec_revb(U64x2 val)
+{
+    U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                              0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+#  endif
+
+/* Power8 Crypto gives us vpermxor which is very handy for
+ * PPC64EB.
+ *
+ * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
+ * {
+ *     U8x16 ret;
+ *     for (int i = 0; i < 16; i++) {
+ *         ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
+ *     }
+ *     return ret;
+ * }
+ *
+ * Because both of the main loops load the key, swap, and xor it with input,
+ * we can combine the key swap into this instruction.
+ */
+#  ifdef vec_permxor
+#    define XXPH_vec_permxor vec_permxor
+#  else
+#    define XXPH_vec_permxor __builtin_crypto_vpermxor
+#  endif
+#endif  /* XXPH_VSX_BE */
+/*
+ * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes
+ * vec_mule.
+ *
+ * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
+ * Clang has an easy way to control this, we can just use the builtin which doesn't swap.
+ * GCC needs inline assembly. */
+#if __has_builtin(__builtin_altivec_vmuleuw)
+#  define XXPH_vec_mulo __builtin_altivec_vmulouw
+#  define XXPH_vec_mule __builtin_altivec_vmuleuw
+#else
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXPH_FORCE_INLINE U64x2 XXPH_vec_mulo(U32x4 a, U32x4 b) {
+    U64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+XXPH_FORCE_INLINE U64x2 XXPH_vec_mule(U32x4 a, U32x4 b) {
+    U64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+#endif /* __has_builtin(__builtin_altivec_vmuleuw) */
+#endif /* XXPH_VECTOR == XXPH_VSX */
+
+/* prefetch
+ * can be disabled, by declaring XXPH_NO_PREFETCH build macro */
+#if defined(XXPH_NO_PREFETCH)
+#  define XXPH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+#else
+#if defined(_MSC_VER) && \
+    (defined(_M_X64) ||  \
+     defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define XXPH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define XXPH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#  else
+#    define XXPH_PREFETCH(ptr) (void)(ptr)  /* disabled */
+#  endif
+#endif  /* XXPH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXPH3 default settings
+ * ========================================== */
+
+#define XXPH_SECRET_DEFAULT_SIZE 192   /* minimum XXPH3_SECRET_SIZE_MIN */
+
+#if (XXPH_SECRET_DEFAULT_SIZE < XXPH3_SECRET_SIZE_MIN)
+#  error "default keyset is not large enough"
+#endif
+
+XXPH_ALIGN(64) static const xxh_u8 kSecret[XXPH_SECRET_DEFAULT_SIZE] = {
+    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+
+    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+/*
+ * GCC for x86 has a tendency to use SSE in this loop. While it
+ * successfully avoids swapping (as MUL overwrites EAX and EDX), it
+ * slows it down because instead of free register swap shifts, it
+ * must use pshufd and punpckl/hd.
+ *
+ * To prevent this, we use this attribute to shut off SSE.
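+ *
+ * As a quick numeric check of the 128-bit product below: since
+ * 0xFFFFFFFFFFFFFFFF * 2 == 0x1FFFFFFFFFFFFFFFE,
+ * XXPH_mult64to128(~0ULL, 2) must yield low64 == 0xFFFFFFFFFFFFFFFE and
+ * high64 == 1 on every code path (uint128, _umul128, and scalar).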
+ */ +#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__) +__attribute__((__target__("no-sse"))) +#endif +static XXPH128_hash_t +XXPH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this + * type despite not having the arithmetic for it. This results in a + * laggy compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if defined(__GNUC__) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs; + XXPH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) }; + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif defined(_M_X64) || defined(_M_IA64) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXPH128_hash_t const r128 = { product_low, product_high }; + return r128; + +#else + /* + * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown + * below with base 10 arithmetic instead of base 0x100000000. + * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 + * --------- + * 6 9 7 5 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for + * UINT64_MAX. This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARMv6+ A32/T32, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, + * and allows this to be calculated in only 4 instructions which + * is comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be + * a couple of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXPH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXPH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXPH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXPH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. 
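+     * (Why they cannot overflow: lo_lo >> 32 and hi_lo & 0xFFFFFFFF are each
+     * at most 0xFFFFFFFF, and lo_hi is at most 0xFFFFFFFF * 0xFFFFFFFF ==
+     * 0xFFFFFFFE00000001, so `cross` tops out at exactly 0xFFFFFFFFFFFFFFFF;
+     * the same bound holds for `upper`.)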
*/ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXPH128_hash_t r128 = { lower, upper }; + return r128; +#endif +} + +/* + * We want to keep the attribute here because a target switch + * disables inlining. + * + * Does a 64-bit to 128-bit multiply, then XOR folds it. + * The reason for the separate function is to prevent passing + * too many structs around by value. This will hopefully inline + * the multiply, but we don't force it. + */ +#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__) +__attribute__((__target__("no-sse"))) +#endif +static xxh_u64 +XXPH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXPH128_hash_t product = XXPH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + + +static XXPH64_hash_t XXPH3_avalanche(xxh_u64 h64) +{ + h64 ^= h64 >> 37; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +/* ========================================== + * Short keys + * ========================================== */ + +XXPH_FORCE_INLINE XXPH64_hash_t +XXPH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed) +{ + XXPH_ASSERT(input != NULL); + XXPH_ASSERT(1 <= len && len <= 3); + XXPH_ASSERT(secret != NULL); + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1) | (((xxh_u32)c2) << 8) | (((xxh_u32)c3) << 16) | (((xxh_u32)len) << 24); + xxh_u64 const keyed = (xxh_u64)combined ^ (XXPH_readLE32(secret) + seed); + xxh_u64 const mixed = keyed * PRIME64_1; + return XXPH3_avalanche(mixed); + } +} + +XXPH_FORCE_INLINE XXPH64_hash_t +XXPH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed) +{ + XXPH_ASSERT(input != NULL); + XXPH_ASSERT(secret != NULL); + XXPH_ASSERT(4 <= len && len <= 8); + { xxh_u32 const input_lo = XXPH_readLE32(input); + xxh_u32 const input_hi = XXPH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo | ((xxh_u64)input_hi << 32); + xxh_u64 const keyed = input_64 ^ (XXPH_readLE64(secret) + seed); + xxh_u64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1); + return XXPH3_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2); + } +} + +XXPH_FORCE_INLINE XXPH64_hash_t +XXPH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed) +{ + XXPH_ASSERT(input != NULL); + XXPH_ASSERT(secret != NULL); + XXPH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const input_lo = XXPH_readLE64(input) ^ (XXPH_readLE64(secret) + seed); + xxh_u64 const input_hi = XXPH_readLE64(input + len - 8) ^ (XXPH_readLE64(secret + 8) - seed); + xxh_u64 const acc = len + (input_lo + input_hi) + XXPH3_mul128_fold64(input_lo, input_hi); + return XXPH3_avalanche(acc); + } +} + +XXPH_FORCE_INLINE XXPH64_hash_t +XXPH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed) +{ + XXPH_ASSERT(len <= 16); + { if (len > 8) return XXPH3_len_9to16_64b(input, len, secret, seed); + if (len >= 4) return XXPH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXPH3_len_1to3_64b(input, len, secret, seed); + /* + * RocksDB modification from XXPH3 preview: zero result for empty + * string can be problematic for multiplication-based algorithms. + * Return a hash of the seed instead. 
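+     * (An illustrative rationale: a constant 0 here would be absorbed by any
+     * subsequent multiply -- 0 * PRIME64_2 == 0 -- so every empty input would
+     * collapse to the same value regardless of seed or secret.)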
+ */ + return XXPH3_mul128_fold64(seed + XXPH_readLE64(secret), PRIME64_2); + } +} + + +/* === Long Keys === */ + +#define STRIPE_LEN 64 +#define XXPH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64)) + +typedef enum { XXPH3_acc_64bits, XXPH3_acc_128bits } XXPH3_accWidth_e; + +XXPH_FORCE_INLINE void +XXPH3_accumulate_512( void* XXPH_RESTRICT acc, + const void* XXPH_RESTRICT input, + const void* XXPH_RESTRICT secret, + XXPH3_accWidth_e accWidth) +{ +#if (XXPH_VECTOR == XXPH_AVX2) + + XXPH_ASSERT((((size_t)acc) & 31) == 0); + { XXPH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; + const __m256i* const xinput = (const __m256i *) input; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */ + const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */ + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); /* uint32 dk[8] = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */ + __m256i const product = _mm256_mul_epu32 (data_key, _mm256_shuffle_epi32 (data_key, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */ + if (accWidth == XXPH3_acc_128bits) { + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + xacc[i] = _mm256_add_epi64(product, sum); + } else { /* XXPH3_acc_64bits */ + __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); + xacc[i] = _mm256_add_epi64(product, sum); + } + } } + +#elif (XXPH_VECTOR == XXPH_SSE2) + + XXPH_ASSERT((((size_t)acc) & 15) == 0); + { XXPH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; + const __m128i* const xinput = (const __m128i *) input; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */ + const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */ + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); /* uint32 dk[8] = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */ + __m128i const product = _mm_mul_epu32 (data_key, _mm_shuffle_epi32 (data_key, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */ + if (accWidth == XXPH3_acc_128bits) { + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + xacc[i] = _mm_add_epi64(product, sum); + } else { /* XXPH3_acc_64bits */ + __m128i const sum = _mm_add_epi64(xacc[i], data_vec); + xacc[i] = _mm_add_epi64(product, sum); + } + } } + +#elif (XXPH_VECTOR == XXPH_NEON) + + XXPH_ASSERT((((size_t)acc) & 15) == 0); + { + XXPH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { +#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */ + /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this. + * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang + * assumes I don't want to destroy it and tries to make a copy. This slows down the code + * a lot. + * aarch64 not only uses an entirely different syntax, but it requires three + * instructions... + * ext v1.16B, v0.16B, #8 // select high bits because aarch64 can't address them directly + * zip1 v3.2s, v0.2s, v1.2s // first zip + * zip2 v2.2s, v0.2s, v1.2s // second zip + * ...to do what ARM does in one: + * vzip.32 d0, d1 // Interleave high and low bits and overwrite. */ + + /* data_vec = xsecret[i]; */ + uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16)); + /* data_key = data_vec ^ key_vec; */ + uint32x4_t data_key; + + if (accWidth == XXPH3_acc_64bits) { + /* Add first to prevent register swaps */ + /* xacc[i] += data_vec; */ + xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); + } else { /* XXPH3_acc_128bits */ + /* xacc[i] += swap(data_vec); */ + /* can probably be optimized better */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped= vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + } + + data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec)); + + /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place. + * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */ + __asm__("vzip.32 %e0, %f0" : "+w" (data_key)); + /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */ + xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key)); + +#else + /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. 
*/ + + /* data_vec = xsecret[i]; */ + uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16)); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t const data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */ + uint32x2_t const data_key_lo = vmovn_u64 (data_key); + /* data_key_hi = (uint32x2_t) (data_key >> 32); */ + uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32); + if (accWidth == XXPH3_acc_64bits) { + /* xacc[i] += data_vec; */ + xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); + } else { /* XXPH3_acc_128bits */ + /* xacc[i] += swap(data_vec); */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped= vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + } + /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + +#endif + } + } + +#elif (XXPH_VECTOR == XXPH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5) + U64x2* const xacc = (U64x2*) acc; /* presumed aligned */ + U64x2 const* const xinput = (U64x2 const*) input; /* no alignment restriction */ + U64x2 const* const xsecret = (U64x2 const*) secret; /* no alignment restriction */ + U64x2 const v32 = { 32, 32 }; +#if XXPH_VSX_BE + U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70, + 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 }; +#endif + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) { + /* data_vec = xinput[i]; */ + /* key_vec = xsecret[i]; */ +#if XXPH_VSX_BE + /* byteswap */ + U64x2 const data_vec = XXPH_vec_revb(vec_vsx_ld(0, xinput + i)); + U64x2 const key_raw = vec_vsx_ld(0, xsecret + i); + /* See comment above. 
data_key = data_vec ^ swap(xsecret[i]); */ + U64x2 const data_key = (U64x2)XXPH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap); +#else + U64x2 const data_vec = vec_vsx_ld(0, xinput + i); + U64x2 const key_vec = vec_vsx_ld(0, xsecret + i); + U64x2 const data_key = data_vec ^ key_vec; +#endif + /* shuffled = (data_key << 32) | (data_key >> 32); */ + U32x4 const shuffled = (U32x4)vec_rl(data_key, v32); + /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */ + U64x2 const product = XXPH_vec_mulo((U32x4)data_key, shuffled); + xacc[i] += product; + + if (accWidth == XXPH3_acc_64bits) { + xacc[i] += data_vec; + } else { /* XXPH3_acc_128bits */ + /* swap high and low halves */ + U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); + xacc[i] += data_swapped; + } + } + +#else /* scalar variant of Accumulator - universal */ + + XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */ + const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXPH_ASSERT(((size_t)acc & (XXPH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const data_val = XXPH_readLE64(xinput + 8*i); + xxh_u64 const data_key = data_val ^ XXPH_readLE64(xsecret + i*8); + + if (accWidth == XXPH3_acc_64bits) { + xacc[i] += data_val; + } else { + xacc[i ^ 1] += data_val; /* swap adjacent lanes */ + } + xacc[i] += XXPH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +#endif +} + +XXPH_FORCE_INLINE void +XXPH3_scrambleAcc(void* XXPH_RESTRICT acc, const void* XXPH_RESTRICT secret) +{ +#if (XXPH_VECTOR == XXPH_AVX2) + + XXPH_ASSERT((((size_t)acc) & 31) == 0); + { XXPH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; + const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */ + const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, 0x31); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXPH_VECTOR == XXPH_SSE2) + + XXPH_ASSERT((((size_t)acc) & 15) == 0); + { XXPH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; + const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */ + const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = 
_mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXPH_VECTOR == XXPH_NEON) + + XXPH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* const xacc = (uint64x2_t*) acc; + uint8_t const* const xsecret = (uint8_t const*) secret; + uint32x2_t const prime = vdup_n_u32 (PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { + /* data_vec = xacc[i] ^ (xacc[i] >> 47); */ + uint64x2_t const acc_vec = xacc[i]; + uint64x2_t const shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t const data_vec = veorq_u64 (acc_vec, shifted); + + /* key_vec = xsecret[i]; */ + uint32x4_t const key_vec = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16))); + /* data_key = data_vec ^ key_vec; */ + uint32x4_t const data_key = veorq_u32 (vreinterpretq_u32_u64(data_vec), key_vec); + /* shuffled = { data_key[0, 2], data_key[1, 3] }; */ + uint32x2x2_t const shuffled = vzip_u32 (vget_low_u32(data_key), vget_high_u32(data_key)); + + /* data_key *= PRIME32_1 */ + + /* prod_hi = (data_key >> 32) * PRIME32_1; */ + uint64x2_t const prod_hi = vmull_u32 (shuffled.val[1], prime); + /* xacc[i] = prod_hi << 32; */ + xacc[i] = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */ + xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime); + } } + +#elif (XXPH_VECTOR == XXPH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5) + + U64x2* const xacc = (U64x2*) acc; + const U64x2* const xsecret = (const U64x2*) secret; + /* constants */ + U64x2 const v32 = { 32, 32 }; + U64x2 const v47 = { 47, 47 }; + U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 }; + size_t i; +#if XXPH_VSX_BE + /* endian swap */ + U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70, + 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 }; +#endif + for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) { + U64x2 const acc_vec = xacc[i]; + U64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + /* key_vec = xsecret[i]; */ +#if XXPH_VSX_BE + /* swap bytes words */ + U64x2 const key_raw = vec_vsx_ld(0, xsecret + i); + U64x2 const data_key = (U64x2)XXPH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap); +#else + U64x2 const key_vec = vec_vsx_ld(0, xsecret + i); + U64x2 const data_key = data_vec ^ key_vec; +#endif + + /* data_key *= PRIME32_1 */ + + /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF); */ + U64x2 const prod_even = XXPH_vec_mule((U32x4)data_key, prime); + /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32); */ + U64x2 const prod_odd = XXPH_vec_mulo((U32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } + +#else /* scalar variant of Scrambler - universal */ + + XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXPH_ASSERT((((size_t)acc) & (XXPH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const key64 = XXPH_readLE64(xsecret + 8*i); + xxh_u64 acc64 = xacc[i]; + acc64 ^= acc64 >> 47; + acc64 ^= key64; + acc64 *= PRIME32_1; + xacc[i] = acc64; + } + +#endif +} + +#define XXPH_PREFETCH_DIST 384 + +/* assumption : nbStripes will not overflow 
secret size */ +XXPH_FORCE_INLINE void +XXPH3_accumulate( xxh_u64* XXPH_RESTRICT acc, + const xxh_u8* XXPH_RESTRICT input, + const xxh_u8* XXPH_RESTRICT secret, + size_t nbStripes, + XXPH3_accWidth_e accWidth) +{ + size_t n; + for (n = 0; n < nbStripes; n++ ) { + const xxh_u8* const in = input + n*STRIPE_LEN; + XXPH_PREFETCH(in + XXPH_PREFETCH_DIST); + XXPH3_accumulate_512(acc, + in, + secret + n*XXPH_SECRET_CONSUME_RATE, + accWidth); + } +} + +/* note : clang auto-vectorizes well in SS2 mode _if_ this function is `static`, + * and doesn't auto-vectorize it at all if it is `FORCE_INLINE`. + * However, it auto-vectorizes better AVX2 if it is `FORCE_INLINE` + * Pretty much every other modes and compilers prefer `FORCE_INLINE`. + */ + +#if defined(__clang__) && (XXPH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__) +static void +#else +XXPH_FORCE_INLINE void +#endif +XXPH3_hashLong_internal_loop( xxh_u64* XXPH_RESTRICT acc, + const xxh_u8* XXPH_RESTRICT input, size_t len, + const xxh_u8* XXPH_RESTRICT secret, size_t secretSize, + XXPH3_accWidth_e accWidth) +{ + size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXPH_SECRET_CONSUME_RATE; + size_t const block_len = STRIPE_LEN * nb_rounds; + size_t const nb_blocks = len / block_len; + + size_t n; + + XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + XXPH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); + XXPH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN); + } + + /* last partial block */ + XXPH_ASSERT(len > STRIPE_LEN); + { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN; + XXPH_ASSERT(nbStripes <= (secretSize / XXPH_SECRET_CONSUME_RATE)); + XXPH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); + + /* last stripe */ + if (len & (STRIPE_LEN - 1)) { + const xxh_u8* const p = input + len - STRIPE_LEN; +#define XXPH_SECRET_LASTACC_START 7 /* do not align on 8, so that secret is different from scrambler */ + XXPH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXPH_SECRET_LASTACC_START, accWidth); + } } +} + +XXPH_FORCE_INLINE xxh_u64 +XXPH3_mix2Accs(const xxh_u64* XXPH_RESTRICT acc, const xxh_u8* XXPH_RESTRICT secret) +{ + return XXPH3_mul128_fold64( + acc[0] ^ XXPH_readLE64(secret), + acc[1] ^ XXPH_readLE64(secret+8) ); +} + +static XXPH64_hash_t +XXPH3_mergeAccs(const xxh_u64* XXPH_RESTRICT acc, const xxh_u8* XXPH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + + result64 += XXPH3_mix2Accs(acc+0, secret + 0); + result64 += XXPH3_mix2Accs(acc+2, secret + 16); + result64 += XXPH3_mix2Accs(acc+4, secret + 32); + result64 += XXPH3_mix2Accs(acc+6, secret + 48); + + return XXPH3_avalanche(result64); +} + +#define XXPH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \ + PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 }; + +XXPH_FORCE_INLINE XXPH64_hash_t +XXPH3_hashLong_internal(const xxh_u8* XXPH_RESTRICT input, size_t len, + const xxh_u8* XXPH_RESTRICT secret, size_t secretSize) +{ + XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXPH3_INIT_ACC; + + XXPH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXPH3_acc_64bits); + + /* converge into final hash */ + XXPH_STATIC_ASSERT(sizeof(acc) == 64); +#define XXPH_SECRET_MERGEACCS_START 11 /* do not align on 8, so that secret is different from accumulator */ + XXPH_ASSERT(secretSize >= sizeof(acc) + XXPH_SECRET_MERGEACCS_START); + return XXPH3_mergeAccs(acc, secret + XXPH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); 
+} + + +XXPH_NO_INLINE XXPH64_hash_t /* It's important for performance that XXPH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */ +XXPH3_hashLong_64b_defaultSecret(const xxh_u8* XXPH_RESTRICT input, size_t len) +{ + return XXPH3_hashLong_internal(input, len, kSecret, sizeof(kSecret)); +} + +XXPH_NO_INLINE XXPH64_hash_t /* It's important for performance that XXPH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */ +XXPH3_hashLong_64b_withSecret(const xxh_u8* XXPH_RESTRICT input, size_t len, + const xxh_u8* XXPH_RESTRICT secret, size_t secretSize) +{ + return XXPH3_hashLong_internal(input, len, secret, secretSize); +} + + +XXPH_FORCE_INLINE void XXPH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXPH_CPU_LITTLE_ENDIAN) v64 = XXPH_swap64(v64); + memcpy(dst, &v64, sizeof(v64)); +} + +/* XXPH3_initCustomSecret() : + * destination `customSecret` is presumed allocated and same size as `kSecret`. + */ +XXPH_FORCE_INLINE void XXPH3_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64) +{ + int const nbRounds = XXPH_SECRET_DEFAULT_SIZE / 16; + int i; + + XXPH_STATIC_ASSERT((XXPH_SECRET_DEFAULT_SIZE & 15) == 0); + + for (i=0; i < nbRounds; i++) { + XXPH_writeLE64(customSecret + 16*i, XXPH_readLE64(kSecret + 16*i) + seed64); + XXPH_writeLE64(customSecret + 16*i + 8, XXPH_readLE64(kSecret + 16*i + 8) - seed64); + } +} + + +/* XXPH3_hashLong_64b_withSeed() : + * Generate a custom key, + * based on alteration of default kSecret with the seed, + * and then use this key for long mode hashing. + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + */ +XXPH_NO_INLINE XXPH64_hash_t /* It's important for performance that XXPH3_hashLong is not inlined. 
Not sure why (uop cache maybe ?), but difference is large and easily measurable */ +XXPH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXPH64_hash_t seed) +{ + XXPH_ALIGN(8) xxh_u8 secret[XXPH_SECRET_DEFAULT_SIZE]; + if (seed==0) return XXPH3_hashLong_64b_defaultSecret(input, len); + XXPH3_initCustomSecret(secret, seed); + return XXPH3_hashLong_internal(input, len, secret, sizeof(secret)); +} + + +XXPH_FORCE_INLINE xxh_u64 XXPH3_mix16B(const xxh_u8* XXPH_RESTRICT input, + const xxh_u8* XXPH_RESTRICT secret, xxh_u64 seed64) +{ + xxh_u64 const input_lo = XXPH_readLE64(input); + xxh_u64 const input_hi = XXPH_readLE64(input+8); + return XXPH3_mul128_fold64( + input_lo ^ (XXPH_readLE64(secret) + seed64), + input_hi ^ (XXPH_readLE64(secret+8) - seed64) ); +} + + +XXPH_FORCE_INLINE XXPH64_hash_t +XXPH3_len_17to128_64b(const xxh_u8* XXPH_RESTRICT input, size_t len, + const xxh_u8* XXPH_RESTRICT secret, size_t secretSize, + XXPH64_hash_t seed) +{ + XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); (void)secretSize; + XXPH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * PRIME64_1; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc += XXPH3_mix16B(input+48, secret+96, seed); + acc += XXPH3_mix16B(input+len-64, secret+112, seed); + } + acc += XXPH3_mix16B(input+32, secret+64, seed); + acc += XXPH3_mix16B(input+len-48, secret+80, seed); + } + acc += XXPH3_mix16B(input+16, secret+32, seed); + acc += XXPH3_mix16B(input+len-32, secret+48, seed); + } + acc += XXPH3_mix16B(input+0, secret+0, seed); + acc += XXPH3_mix16B(input+len-16, secret+16, seed); + + return XXPH3_avalanche(acc); + } +} + +#define XXPH3_MIDSIZE_MAX 240 + +XXPH_NO_INLINE XXPH64_hash_t +XXPH3_len_129to240_64b(const xxh_u8* XXPH_RESTRICT input, size_t len, + const xxh_u8* XXPH_RESTRICT secret, size_t secretSize, + XXPH64_hash_t seed) +{ + XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); (void)secretSize; + XXPH_ASSERT(128 < len && len <= XXPH3_MIDSIZE_MAX); + + #define XXPH3_MIDSIZE_STARTOFFSET 3 + #define XXPH3_MIDSIZE_LASTOFFSET 17 + + { xxh_u64 acc = len * PRIME64_1; + int const nbRounds = (int)len / 16; + int i; + for (i=0; i<8; i++) { + acc += XXPH3_mix16B(input+(16*i), secret+(16*i), seed); + } + acc = XXPH3_avalanche(acc); + XXPH_ASSERT(nbRounds >= 8); + for (i=8 ; i < nbRounds; i++) { + acc += XXPH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXPH3_MIDSIZE_STARTOFFSET, seed); + } + /* last bytes */ + acc += XXPH3_mix16B(input + len - 16, secret + XXPH3_SECRET_SIZE_MIN - XXPH3_MIDSIZE_LASTOFFSET, seed); + return XXPH3_avalanche(acc); + } +} + +/* === Public entry point === */ + +XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits(const void* input, size_t len) +{ + if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXPH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); +} + +XXPH_PUBLIC_API XXPH64_hash_t +XXPH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); + /* if an action must be taken should `secret` conditions not be respected, + * it should be done here. + * For now, it's a contract pre-condition. 
+ * Adding a check and a branch here would cost performance at every hash */ + if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXPH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXPH_PUBLIC_API XXPH64_hash_t +XXPH3_64bits_withSeed(const void* input, size_t len, XXPH64_hash_t seed) +{ + if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXPH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); +} + +/* === XXPH3 streaming === */ + +/* RocksDB Note: unused & removed due to bug in preview version */ + +/*======== END #include "xxh3.h", now inlined above ==========*/ + +#endif /* XXPH_NO_LONG_LONG */ + +/* === END RocksDB modification of permanently inlining === */ + +#endif /* defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API) */ + +#endif /* XXPH_STATIC_LINKING_ONLY */ + +#if defined (__cplusplus) +} +#endif + +#endif /* XXPHASH_H_5627135585666179 */ diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/agg_merge/agg_merge.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/agg_merge/agg_merge.cc new file mode 100644 index 0000000000..8e5c536f55 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/agg_merge/agg_merge.cc @@ -0,0 +1,238 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
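The merge operands that agg_merge.cc manipulates below are framed as a length-prefixed function name followed by a raw payload. As a rough illustration of that framing, here is a minimal sketch; it is not RocksDB's actual varint-based PutLengthPrefixedSlice/GetLengthPrefixedSlice, and the fixed 4-byte little-endian length and helper names are assumptions:

#include <cstdint>
#include <cstring>
#include <optional>
#include <string>
#include <string_view>

// Hypothetical stand-in for PutLengthPrefixedSlice: append a 4-byte
// little-endian length, then the bytes themselves.
static void put_length_prefixed(std::string* dst, std::string_view s) {
  uint32_t n = static_cast<uint32_t>(s.size());
  dst->append(reinterpret_cast<const char*>(&n), sizeof n);  // LE host assumed
  dst->append(s.data(), s.size());
}

// Hypothetical stand-in for GetLengthPrefixedSlice: consume one
// length-prefixed field from the front of *in.
static std::optional<std::string_view> get_length_prefixed(
    std::string_view* in) {
  uint32_t n;
  if (in->size() < sizeof n) return std::nullopt;
  std::memcpy(&n, in->data(), sizeof n);
  in->remove_prefix(sizeof n);
  if (in->size() < n) return std::nullopt;
  std::string_view field = in->substr(0, n);
  in->remove_prefix(n);
  return field;  // an operand is then <len(func)><func><payload bytes>
}

EncodeAggFuncAndPayloadNoCheck below follows the same idea: the function name is length-prefixed and the payload is simply whatever bytes follow it, which is why ExtractAggFuncAndValue can recover both with a single GetLengthPrefixedSlice call.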
+ +#include "rocksdb/utilities/agg_merge.h" + +#include + +#include +#include +#include +#include +#include + +#include "port/lang.h" +#include "port/likely.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/options_type.h" +#include "util/coding.h" +#include "utilities/agg_merge/agg_merge_impl.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { +static std::unordered_map> func_map; +const std::string kUnnamedFuncName = ""; +const std::string kErrorFuncName = "kErrorFuncName"; + +Status AddAggregator(const std::string& function_name, + std::unique_ptr&& agg) { + if (function_name == kErrorFuncName) { + return Status::InvalidArgument( + "Cannot register function name kErrorFuncName"); + } + func_map.emplace(function_name, std::move(agg)); + return Status::OK(); +} + +AggMergeOperator::AggMergeOperator() {} + +std::string EncodeAggFuncAndPayloadNoCheck(const Slice& function_name, + const Slice& value) { + std::string result; + PutLengthPrefixedSlice(&result, function_name); + result += value.ToString(); + return result; +} + +Status EncodeAggFuncAndPayload(const Slice& function_name, const Slice& payload, + std::string& output) { + if (function_name == kErrorFuncName) { + return Status::InvalidArgument("Cannot use error function name"); + } + if (function_name != kUnnamedFuncName && + func_map.find(function_name.ToString()) == func_map.end()) { + return Status::InvalidArgument("Function name not registered"); + } + output = EncodeAggFuncAndPayloadNoCheck(function_name, payload); + return Status::OK(); +} + +bool ExtractAggFuncAndValue(const Slice& op, Slice& func, Slice& value) { + value = op; + return GetLengthPrefixedSlice(&value, &func); +} + +bool ExtractList(const Slice& encoded_list, std::vector& decoded_list) { + decoded_list.clear(); + Slice list_slice = encoded_list; + Slice item; + while (GetLengthPrefixedSlice(&list_slice, &item)) { + decoded_list.push_back(item); + } + return list_slice.empty(); +} + +class AggMergeOperator::Accumulator { + public: + bool Add(const Slice& op, bool is_partial_aggregation) { + if (ignore_operands_) { + return true; + } + Slice my_func; + Slice my_value; + bool ret = ExtractAggFuncAndValue(op, my_func, my_value); + if (!ret) { + ignore_operands_ = true; + return true; + } + + // Determine whether we need to do partial merge. + if (is_partial_aggregation && !my_func.empty()) { + auto f = func_map.find(my_func.ToString()); + if (f == func_map.end() || !f->second->DoPartialAggregate()) { + return false; + } + } + + if (!func_valid_) { + if (my_func != kUnnamedFuncName) { + func_ = my_func; + func_valid_ = true; + } + } else if (func_ != my_func) { + // User switched aggregation function. Need to aggregate the older + // one first. + + // Previous aggreagion can't be done in partial merge + if (is_partial_aggregation) { + func_valid_ = false; + ignore_operands_ = true; + return false; + } + + // We could consider stashing an iterator into the hash of aggregators + // to avoid repeated lookups when the aggregator doesn't change. + auto f = func_map.find(func_.ToString()); + if (f == func_map.end() || !f->second->Aggregate(values_, scratch_)) { + func_valid_ = false; + ignore_operands_ = true; + return true; + } + std::swap(scratch_, aggregated_); + values_.clear(); + values_.push_back(aggregated_); + func_ = my_func; + } + values_.push_back(my_value); + return true; + } + + // Return false if aggregation fails. 
+  // One possible reason is that the aggregation function was never
+  // registered with AddAggregator().
+  bool GetResult(std::string& result) {
+    if (!func_valid_) {
+      return false;
+    }
+    auto f = func_map.find(func_.ToString());
+    if (f == func_map.end()) {
+      return false;
+    }
+    if (!f->second->Aggregate(values_, scratch_)) {
+      return false;
+    }
+    result = EncodeAggFuncAndPayloadNoCheck(func_, scratch_);
+    return true;
+  }
+
+  void Clear() {
+    func_.clear();
+    values_.clear();
+    aggregated_.clear();
+    scratch_.clear();
+    ignore_operands_ = false;
+    func_valid_ = false;
+  }
+
+ private:
+  Slice func_;
+  std::vector<Slice> values_;
+  std::string aggregated_;
+  std::string scratch_;
+  bool ignore_operands_ = false;
+  bool func_valid_ = false;
+};
+
+// Creating and using a new Accumulator might invoke multiple mallocs, which
+// is expensive if it has to be done while processing each merge operation.
+// AggMergeOperator's merge operators can be invoked concurrently by multiple
+// threads, so we cannot simply create one Accumulator and reuse it.
+// We use thread-local instances instead.
+AggMergeOperator::Accumulator& AggMergeOperator::GetTLSAccumulator() {
+  static thread_local Accumulator tls_acc;
+  tls_acc.Clear();
+  return tls_acc;
+}
+
+void AggMergeOperator::PackAllMergeOperands(const MergeOperationInput& merge_in,
+                                            MergeOperationOutput& merge_out) {
+  merge_out.new_value = "";
+  PutLengthPrefixedSlice(&merge_out.new_value, kErrorFuncName);
+  if (merge_in.existing_value != nullptr) {
+    PutLengthPrefixedSlice(&merge_out.new_value, *merge_in.existing_value);
+  }
+  for (const Slice& op : merge_in.operand_list) {
+    PutLengthPrefixedSlice(&merge_out.new_value, op);
+  }
+}
+
+bool AggMergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+                                   MergeOperationOutput* merge_out) const {
+  Accumulator& agg = GetTLSAccumulator();
+  if (merge_in.existing_value != nullptr) {
+    agg.Add(*merge_in.existing_value, /*is_partial_aggregation=*/false);
+  }
+  for (const Slice& e : merge_in.operand_list) {
+    agg.Add(e, /*is_partial_aggregation=*/false);
+  }
+
+  bool succ = agg.GetResult(merge_out->new_value);
+  if (!succ) {
+    // If aggregation can't happen, pack all merge operands. In contrast to
+    // merge operator, we don't want to fail the DB. If users insert values in
+    // the wrong format or call an unregistered aggregation function, we still
+    // hope the DB can continue functioning with other keys.
+    PackAllMergeOperands(merge_in, *merge_out);
+  }
+  agg.Clear();
+  return true;
+}
+
+bool AggMergeOperator::PartialMergeMulti(const Slice& /*key*/,
+                                         const std::deque<Slice>& operand_list,
+                                         std::string* new_value,
+                                         Logger* /*logger*/) const {
+  Accumulator& agg = GetTLSAccumulator();
+  bool do_aggregation = true;
+  for (const Slice& item : operand_list) {
+    do_aggregation = agg.Add(item, /*is_partial_aggregation=*/true);
+    if (!do_aggregation) {
+      break;
+    }
+  }
+  if (do_aggregation) {
+    do_aggregation = agg.GetResult(*new_value);
+  }
+  agg.Clear();
+  return do_aggregation;
+}
+
+std::shared_ptr<MergeOperator> GetAggMergeOperator() {
+  STATIC_AVOID_DESTRUCTION(std::shared_ptr<MergeOperator>, instance)
+  (std::make_shared<AggMergeOperator>());
+  assert(instance);
+  return instance;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/agg_merge/agg_merge_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/agg_merge/agg_merge_impl.h
new file mode 100644
index 0000000000..00e58de088
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/agg_merge/agg_merge_impl.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/agg_merge.h"
+#include "utilities/cassandra/cassandra_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+class AggMergeOperator : public MergeOperator {
+ public:
+  explicit AggMergeOperator();
+
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override;
+
+  bool PartialMergeMulti(const Slice& key,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value, Logger* logger) const override;
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "AggMergeOperator.v1"; }
+
+  bool AllowSingleOperand() const override { return true; }
+
+  bool ShouldMerge(const std::vector<Slice>&) const override { return false; }
+
+ private:
+  class Accumulator;
+
+  // Pack all merge operands into one value. This is called when aggregation
+  // fails. The existing values are preserved and returned so that users can
+  // debug the problem.
+  static void PackAllMergeOperands(const MergeOperationInput& merge_in,
+                                   MergeOperationOutput& merge_out);
+  static Accumulator& GetTLSAccumulator();
+};
+
+extern std::string EncodeAggFuncAndPayloadNoCheck(const Slice& function_name,
+                                                  const Slice& value);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/agg_merge/test_agg_merge.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/agg_merge/test_agg_merge.h
new file mode 100644
index 0000000000..5bdf8b9cc2
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/agg_merge/test_agg_merge.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
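For contrast with the test aggregators declared just below, a hedged sketch of a user-defined aggregator against the vendored Aggregator interface. MaxAggregator, its fixed-width int64 payload, and the "max" registration name are illustrative assumptions; Aggregator, AddAggregator, and Slice come from the headers in this patch.

#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <vector>

#include "rocksdb/utilities/agg_merge.h"

namespace {
// Hypothetical aggregator: keeps the maximum of little-endian int64 payloads.
class MaxAggregator : public ROCKSDB_NAMESPACE::Aggregator {
 public:
  bool Aggregate(const std::vector<ROCKSDB_NAMESPACE::Slice>& items,
                 std::string& result) const override {
    int64_t best = INT64_MIN;
    for (const auto& item : items) {
      if (item.size() != sizeof(int64_t)) return false;  // malformed payload
      int64_t v;
      std::memcpy(&v, item.data(), sizeof v);
      if (v > best) best = v;
    }
    result.assign(reinterpret_cast<const char*>(&best), sizeof best);
    return true;
  }
  // max is associative, so letting compactions partially aggregate is safe.
  bool DoPartialAggregate() const override { return true; }
};
}  // namespace

// Registered once at startup, e.g.:
//   ROCKSDB_NAMESPACE::AddAggregator("max", std::make_unique<MaxAggregator>());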
+
+#pragma once
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/agg_merge.h"
+#include "utilities/cassandra/cassandra_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SumAggregator : public Aggregator {
+ public:
+  ~SumAggregator() override {}
+  bool Aggregate(const std::vector<Slice>&,
+                 std::string& result) const override;
+  bool DoPartialAggregate() const override { return true; }
+};
+
+class MultipleAggregator : public Aggregator {
+ public:
+  ~MultipleAggregator() override {}
+  bool Aggregate(const std::vector<Slice>&,
+                 std::string& result) const override;
+  bool DoPartialAggregate() const override { return true; }
+};
+
+class Last3Aggregator : public Aggregator {
+ public:
+  ~Last3Aggregator() override {}
+  bool Aggregate(const std::vector<Slice>&,
+                 std::string& result) const override;
+};
+
+class EncodeHelper {
+ public:
+  static std::string EncodeFuncAndInt(const Slice& function_name,
+                                      int64_t value);
+  static std::string EncodeInt(int64_t value);
+  static std::string EncodeList(const std::vector<Slice>& list);
+  static std::string EncodeFuncAndList(const Slice& function_name,
+                                       const std::vector<Slice>& list);
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/backup/backup_engine.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/backup/backup_engine.cc
new file mode 100644
index 0000000000..e74218d45c
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/backup/backup_engine.cc
@@ -0,0 +1,3355 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
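Before the implementation, a quick orientation on how this engine is typically driven. A hedged usage sketch, assuming the public entry points BackupEngine::Open, CreateNewBackup, PurgeOldBackups, and RestoreDBFromLatestBackup from rocksdb/utilities/backup_engine.h; the paths and the flush flag are illustrative.

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/utilities/backup_engine.h"

void backup_and_restore_sketch(ROCKSDB_NAMESPACE::DB* db) {
  using namespace ROCKSDB_NAMESPACE;
  BackupEngine* engine = nullptr;
  IOStatus s = BackupEngine::Open(
      BackupEngineOptions("/tmp/rocksdb_backups"),  // illustrative path
      Env::Default(), &engine);
  assert(s.ok());
  // Flushing first keeps the backup from depending on WAL files.
  s = engine->CreateNewBackup(db, /*flush_before_backup=*/true);
  assert(s.ok());
  // Keep only the two most recent backups; older ones are deleted.
  s = engine->PurgeOldBackups(2);
  assert(s.ok());
  // Restore the latest backup into a (closed) DB directory.
  s = engine->RestoreDBFromLatestBackup("/tmp/db_restored",
                                        "/tmp/db_restored");
  assert(s.ok());
  delete engine;
}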
+ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "env/composite_env_wrapper.h" +#include "env/fs_readonly.h" +#include "env/fs_remap.h" +#include "file/filename.h" +#include "file/line_file_reader.h" +#include "file/sequence_file_reader.h" +#include "file/writable_file_writer.h" +#include "logging/logging.h" +#include "monitoring/iostats_context_imp.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/advanced_options.h" +#include "rocksdb/env.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/statistics.h" +#include "rocksdb/transaction_log.h" +#include "table/sst_file_dumper.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/channel.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/math.h" +#include "util/rate_limiter_impl.h" +#include "util/string_util.h" +#include "utilities/backup/backup_engine_impl.h" +#include "utilities/checkpoint/checkpoint_impl.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +using ShareFilesNaming = BackupEngineOptions::ShareFilesNaming; + +constexpr BackupID kLatestBackupIDMarker = static_cast(-2); + +inline uint32_t ChecksumHexToInt32(const std::string& checksum_hex) { + std::string checksum_str; + Slice(checksum_hex).DecodeHex(&checksum_str); + return EndianSwapValue(DecodeFixed32(checksum_str.c_str())); +} +inline std::string ChecksumStrToHex(const std::string& checksum_str) { + return Slice(checksum_str).ToString(true); +} +inline std::string ChecksumInt32ToHex(const uint32_t& checksum_value) { + std::string checksum_str; + PutFixed32(&checksum_str, EndianSwapValue(checksum_value)); + return ChecksumStrToHex(checksum_str); +} + +const std::string kPrivateDirName = "private"; +const std::string kMetaDirName = "meta"; +const std::string kSharedDirName = "shared"; +const std::string kSharedChecksumDirName = "shared_checksum"; +const std::string kPrivateDirSlash = kPrivateDirName + "/"; +const std::string kMetaDirSlash = kMetaDirName + "/"; +const std::string kSharedDirSlash = kSharedDirName + "/"; +const std::string kSharedChecksumDirSlash = kSharedChecksumDirName + "/"; + +} // namespace + +void BackupStatistics::IncrementNumberSuccessBackup() { + number_success_backup++; +} +void BackupStatistics::IncrementNumberFailBackup() { number_fail_backup++; } + +uint32_t BackupStatistics::GetNumberSuccessBackup() const { + return number_success_backup; +} +uint32_t BackupStatistics::GetNumberFailBackup() const { + return number_fail_backup; +} + +std::string BackupStatistics::ToString() const { + char result[50]; + snprintf(result, sizeof(result), "# success backup: %u, # fail backup: %u", + GetNumberSuccessBackup(), GetNumberFailBackup()); + return result; +} + +void BackupEngineOptions::Dump(Logger* logger) const { + ROCKS_LOG_INFO(logger, " Options.backup_dir: %s", + backup_dir.c_str()); + ROCKS_LOG_INFO(logger, " Options.backup_env: %p", backup_env); + ROCKS_LOG_INFO(logger, " Options.share_table_files: %d", + static_cast(share_table_files)); + ROCKS_LOG_INFO(logger, " Options.info_log: %p", info_log); + ROCKS_LOG_INFO(logger, " Options.sync: %d", + static_cast(sync)); + ROCKS_LOG_INFO(logger, " Options.destroy_old_data: %d", + static_cast(destroy_old_data)); + ROCKS_LOG_INFO(logger, " Options.backup_log_files: %d", + static_cast(backup_log_files)); + ROCKS_LOG_INFO(logger, " Options.backup_rate_limit: %" PRIu64, + backup_rate_limit); + 
ROCKS_LOG_INFO(logger, " Options.restore_rate_limit: %" PRIu64, + restore_rate_limit); + ROCKS_LOG_INFO(logger, "Options.max_background_operations: %d", + max_background_operations); +} + +namespace { +// -------- BackupEngineImpl class --------- +class BackupEngineImpl { + public: + BackupEngineImpl(const BackupEngineOptions& options, Env* db_env, + bool read_only = false); + ~BackupEngineImpl(); + + IOStatus CreateNewBackupWithMetadata(const CreateBackupOptions& options, + DB* db, const std::string& app_metadata, + BackupID* new_backup_id_ptr); + + IOStatus PurgeOldBackups(uint32_t num_backups_to_keep); + + IOStatus DeleteBackup(BackupID backup_id); + + void StopBackup() { stop_backup_.store(true, std::memory_order_release); } + + IOStatus GarbageCollect(); + + // The returned BackupInfos are in chronological order, which means the + // latest backup comes last. + void GetBackupInfo(std::vector* backup_info, + bool include_file_details) const; + + Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info, + bool include_file_details = false) const; + + void GetCorruptedBackups(std::vector* corrupt_backup_ids) const; + + IOStatus RestoreDBFromBackup( + const RestoreOptions& options, BackupID backup_id, + const std::string& db_dir, const std::string& wal_dir, + const std::list& locked_restore_from_dirs) const; + + IOStatus VerifyBackup(BackupID backup_id, + bool verify_with_checksum = false) const; + + IOStatus Initialize(); + + ShareFilesNaming GetNamingNoFlags() const { + return options_.share_files_with_checksum_naming & + BackupEngineOptions::kMaskNoNamingFlags; + } + ShareFilesNaming GetNamingFlags() const { + return options_.share_files_with_checksum_naming & + BackupEngineOptions::kMaskNamingFlags; + } + + void TEST_SetDefaultRateLimitersClock( + const std::shared_ptr& backup_rate_limiter_clock, + const std::shared_ptr& restore_rate_limiter_clock) { + if (backup_rate_limiter_clock) { + static_cast(options_.backup_rate_limiter.get()) + ->TEST_SetClock(backup_rate_limiter_clock); + } + + if (restore_rate_limiter_clock) { + static_cast(options_.restore_rate_limiter.get()) + ->TEST_SetClock(restore_rate_limiter_clock); + } + } + + private: + void DeleteChildren(const std::string& dir, + uint32_t file_type_filter = 0) const; + IOStatus DeleteBackupNoGC(BackupID backup_id); + + // Extends the "result" map with pathname->size mappings for the contents of + // "dir" in "env". Pathnames are prefixed with "dir". + IOStatus ReadChildFileCurrentSizes( + const std::string& dir, const std::shared_ptr&, + std::unordered_map* result) const; + + struct FileInfo { + FileInfo(const std::string& fname, uint64_t sz, const std::string& checksum, + const std::string& id, const std::string& sid, Temperature _temp) + : refs(0), + filename(fname), + size(sz), + checksum_hex(checksum), + db_id(id), + db_session_id(sid), + temp(_temp) {} + + FileInfo(const FileInfo&) = delete; + FileInfo& operator=(const FileInfo&) = delete; + + int refs; + // Relative path from backup dir + const std::string filename; + const uint64_t size; + // crc32c checksum as hex. 
empty == unknown / unavailable + std::string checksum_hex; + // DB identities + // db_id is obtained for potential usage in the future but not used + // currently + const std::string db_id; + // db_session_id appears in the backup SST filename if the table naming + // option is kUseDbSessionId + const std::string db_session_id; + Temperature temp; + + std::string GetDbFileName() const { + std::string rv; + // extract the filename part + size_t slash = filename.find_last_of('/'); + // file will either be shared/, shared_checksum/, + // shared_checksum/, shared_checksum/, + // or private// + assert(slash != std::string::npos); + rv = filename.substr(slash + 1); + + // if the file was in shared_checksum, extract the real file name + // in this case the file is __., + // _., or __. + if (filename.substr(0, slash) == kSharedChecksumDirName) { + rv = GetFileFromChecksumFile(rv); + } + return rv; + } + }; + + // TODO: deprecate this function once we migrate all BackupEngine's rate + // limiting to lower-level ones (i.e, ones in file access wrapper level like + // `WritableFileWriter`) + static void LoopRateLimitRequestHelper(const size_t total_bytes_to_request, + RateLimiter* rate_limiter, + const Env::IOPriority pri, + Statistics* stats, + const RateLimiter::OpType op_type); + + static inline std::string WithoutTrailingSlash(const std::string& path) { + if (path.empty() || path.back() != '/') { + return path; + } else { + return path.substr(path.size() - 1); + } + } + + static inline std::string WithTrailingSlash(const std::string& path) { + if (path.empty() || path.back() != '/') { + return path + '/'; + } else { + return path; + } + } + + // A filesystem wrapper that makes shared backup files appear to be in the + // private backup directory (dst_dir), so that the private backup dir can + // be opened as a read-only DB. 
+ class RemapSharedFileSystem : public RemapFileSystem { + public: + RemapSharedFileSystem(const std::shared_ptr& base, + const std::string& dst_dir, + const std::string& src_base_dir, + const std::vector>& files) + : RemapFileSystem(base), + dst_dir_(WithoutTrailingSlash(dst_dir)), + dst_dir_slash_(WithTrailingSlash(dst_dir)), + src_base_dir_(WithTrailingSlash(src_base_dir)) { + for (auto& info : files) { + if (!StartsWith(info->filename, kPrivateDirSlash)) { + assert(StartsWith(info->filename, kSharedDirSlash) || + StartsWith(info->filename, kSharedChecksumDirSlash)); + remaps_[info->GetDbFileName()] = info; + } + } + } + + const char* Name() const override { + return "BackupEngineImpl::RemapSharedFileSystem"; + } + + // Sometimes a directory listing is required in opening a DB + IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override { + IOStatus s = RemapFileSystem::GetChildren(dir, options, result, dbg); + if (s.ok() && (dir == dst_dir_ || dir == dst_dir_slash_)) { + // Assume remapped files exist + for (auto& r : remaps_) { + result->push_back(r.first); + } + } + return s; + } + + // Sometimes a directory listing is required in opening a DB + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override { + IOStatus s = + RemapFileSystem::GetChildrenFileAttributes(dir, options, result, dbg); + if (s.ok() && (dir == dst_dir_ || dir == dst_dir_slash_)) { + // Assume remapped files exist with recorded size + for (auto& r : remaps_) { + result->emplace_back(); // clean up with C++20 + FileAttributes& attr = result->back(); + attr.name = r.first; + attr.size_bytes = r.second->size; + } + } + return s; + } + + protected: + // When a file in dst_dir is requested, see if we need to remap to shared + // file path. + std::pair EncodePath( + const std::string& path) override { + if (path.empty() || path[0] != '/') { + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; + } + std::pair rv{IOStatus(), path}; + if (StartsWith(path, dst_dir_slash_)) { + std::string relative = path.substr(dst_dir_slash_.size()); + auto it = remaps_.find(relative); + if (it != remaps_.end()) { + rv.second = src_base_dir_ + it->second->filename; + } + } + return rv; + } + + private: + // Absolute path to a directory that some extra files will be mapped into. + const std::string dst_dir_; + // Includes a trailing slash. + const std::string dst_dir_slash_; + // Absolute path to a directory containing some files to be mapped into + // dst_dir_. Includes a trailing slash. + const std::string src_base_dir_; + // If remaps_[x] exists, attempt to read dst_dir_ / x should instead read + // src_base_dir_ / remaps_[x]->filename. FileInfo is used to maximize + // sharing with other backup data in memory. 
+ std::unordered_map> remaps_; + }; + + class BackupMeta { + public: + BackupMeta( + const std::string& meta_filename, const std::string& meta_tmp_filename, + std::unordered_map>* file_infos, + Env* env, const std::shared_ptr& fs) + : timestamp_(0), + sequence_number_(0), + size_(0), + meta_filename_(meta_filename), + meta_tmp_filename_(meta_tmp_filename), + file_infos_(file_infos), + env_(env), + fs_(fs) {} + + BackupMeta(const BackupMeta&) = delete; + BackupMeta& operator=(const BackupMeta&) = delete; + + ~BackupMeta() {} + + void RecordTimestamp() { + // Best effort + Status s = env_->GetCurrentTime(×tamp_); + if (!s.ok()) { + timestamp_ = /* something clearly fabricated */ 1; + } + } + int64_t GetTimestamp() const { return timestamp_; } + uint64_t GetSize() const { return size_; } + uint32_t GetNumberFiles() const { + return static_cast(files_.size()); + } + void SetSequenceNumber(uint64_t sequence_number) { + sequence_number_ = sequence_number; + } + uint64_t GetSequenceNumber() const { return sequence_number_; } + + const std::string& GetAppMetadata() const { return app_metadata_; } + + void SetAppMetadata(const std::string& app_metadata) { + app_metadata_ = app_metadata; + } + + IOStatus AddFile(std::shared_ptr file_info); + + void AddExcludedFile(const std::string& relative_file) { + excluded_files_.emplace_back(relative_file); + } + + IOStatus Delete(bool delete_meta = true); + + bool Empty() const { return files_.empty(); } + + std::shared_ptr GetFile(const std::string& filename) const { + auto it = file_infos_->find(filename); + if (it == file_infos_->end()) { + return nullptr; + } + return it->second; + } + + const std::vector>& GetFiles() const { + return files_; + } + + const std::vector& GetExcludedFiles() const { + return excluded_files_; + } + + // @param abs_path_to_size Pre-fetched file sizes (bytes). 
+ IOStatus LoadFromFile( + const std::string& backup_dir, + const std::unordered_map& abs_path_to_size, + RateLimiter* rate_limiter, Logger* info_log, + std::unordered_set* reported_ignored_fields); + IOStatus StoreToFile( + bool sync, int schema_version, + const TEST_BackupMetaSchemaOptions* schema_test_options); + + std::string GetInfoString() { + std::ostringstream ss; + ss << "Timestamp: " << timestamp_ << std::endl; + char human_size[16]; + AppendHumanBytes(size_, human_size, sizeof(human_size)); + ss << "Size: " << human_size << std::endl; + ss << "Files:" << std::endl; + for (const auto& file : files_) { + AppendHumanBytes(file->size, human_size, sizeof(human_size)); + ss << file->filename << ", size " << human_size << ", refs " + << file->refs << std::endl; + } + return ss.str(); + } + + const std::shared_ptr& GetEnvForOpen() const { + if (!env_for_open_) { + // Lazy initialize + // Find directories + std::string dst_dir = meta_filename_; + auto i = dst_dir.rfind(kMetaDirSlash); + assert(i != std::string::npos); + std::string src_base_dir = dst_dir.substr(0, i); + dst_dir.replace(i, kMetaDirSlash.size(), kPrivateDirSlash); + // Make the RemapSharedFileSystem + std::shared_ptr remap_fs = + std::make_shared(fs_, dst_dir, src_base_dir, + files_); + // Make it read-only for safety + remap_fs = std::make_shared(remap_fs); + // Make an Env wrapper + env_for_open_ = std::make_shared(env_, remap_fs); + } + return env_for_open_; + } + + private: + int64_t timestamp_; + // sequence number is only approximate, should not be used + // by clients + uint64_t sequence_number_; + uint64_t size_; + std::string app_metadata_; + std::string const meta_filename_; + std::string const meta_tmp_filename_; + // files with relative paths (without "/" prefix!!) + std::vector> files_; + std::vector excluded_files_; + std::unordered_map>* file_infos_; + Env* env_; + mutable std::shared_ptr env_for_open_; + std::shared_ptr fs_; + IOOptions iooptions_ = IOOptions(); + }; // BackupMeta + + void SetBackupInfoFromBackupMeta(BackupID id, const BackupMeta& meta, + BackupInfo* backup_info, + bool include_file_details) const; + + inline std::string GetAbsolutePath( + const std::string& relative_path = "") const { + assert(relative_path.size() == 0 || relative_path[0] != '/'); + return options_.backup_dir + "/" + relative_path; + } + inline std::string GetPrivateFileRel(BackupID backup_id, bool tmp = false, + const std::string& file = "") const { + assert(file.size() == 0 || file[0] != '/'); + return kPrivateDirSlash + std::to_string(backup_id) + (tmp ? ".tmp" : "") + + "/" + file; + } + inline std::string GetSharedFileRel(const std::string& file = "", + bool tmp = false) const { + assert(file.size() == 0 || file[0] != '/'); + return kSharedDirSlash + std::string(tmp ? "." : "") + file + + (tmp ? ".tmp" : ""); + } + inline std::string GetSharedFileWithChecksumRel(const std::string& file = "", + bool tmp = false) const { + assert(file.size() == 0 || file[0] != '/'); + return kSharedChecksumDirSlash + std::string(tmp ? "." : "") + file + + (tmp ? 
".tmp" : ""); + } + inline bool UseLegacyNaming(const std::string& sid) const { + return GetNamingNoFlags() == + BackupEngineOptions::kLegacyCrc32cAndFileSize || + sid.empty(); + } + inline std::string GetSharedFileWithChecksum( + const std::string& file, const std::string& checksum_hex, + const uint64_t file_size, const std::string& db_session_id) const { + assert(file.size() == 0 || file[0] != '/'); + std::string file_copy = file; + if (UseLegacyNaming(db_session_id)) { + assert(!checksum_hex.empty()); + file_copy.insert(file_copy.find_last_of('.'), + "_" + std::to_string(ChecksumHexToInt32(checksum_hex)) + + "_" + std::to_string(file_size)); + } else { + file_copy.insert(file_copy.find_last_of('.'), "_s" + db_session_id); + if (GetNamingFlags() & BackupEngineOptions::kFlagIncludeFileSize) { + file_copy.insert(file_copy.find_last_of('.'), + "_" + std::to_string(file_size)); + } + } + return file_copy; + } + static inline std::string GetFileFromChecksumFile(const std::string& file) { + assert(file.size() == 0 || file[0] != '/'); + std::string file_copy = file; + size_t first_underscore = file_copy.find_first_of('_'); + return file_copy.erase(first_underscore, + file_copy.find_last_of('.') - first_underscore); + } + inline std::string GetBackupMetaFile(BackupID backup_id, bool tmp) const { + return GetAbsolutePath(kMetaDirName) + "/" + (tmp ? "." : "") + + std::to_string(backup_id) + (tmp ? ".tmp" : ""); + } + + // If size_limit == 0, there is no size limit, copy everything. + // + // Exactly one of src and contents must be non-empty. + // + // @param src If non-empty, the file is copied from this pathname. + // @param contents If non-empty, the file will be created with these contents. + // @param src_temperature Pass in expected temperature of src, return back + // temperature reported by FileSystem + IOStatus CopyOrCreateFile(const std::string& src, const std::string& dst, + const std::string& contents, uint64_t size_limit, + Env* src_env, Env* dst_env, + const EnvOptions& src_env_options, bool sync, + RateLimiter* rate_limiter, + std::function progress_callback, + Temperature* src_temperature, + Temperature dst_temperature, + uint64_t* bytes_toward_next_callback, + uint64_t* size, std::string* checksum_hex); + + IOStatus ReadFileAndComputeChecksum(const std::string& src, + const std::shared_ptr& src_fs, + const EnvOptions& src_env_options, + uint64_t size_limit, + std::string* checksum_hex, + const Temperature src_temperature) const; + + // Obtain db_id and db_session_id from the table properties of file_path + Status GetFileDbIdentities(Env* src_env, const EnvOptions& src_env_options, + const std::string& file_path, + Temperature file_temp, RateLimiter* rate_limiter, + std::string* db_id, std::string* db_session_id); + + struct CopyOrCreateResult { + ~CopyOrCreateResult() { + // The Status needs to be ignored here for two reasons. + // First, if the BackupEngineImpl shuts down with jobs outstanding, then + // it is possible that the Status in the future/promise is never read, + // resulting in an unchecked Status. Second, if there are items in the + // channel when the BackupEngineImpl is shutdown, these will also have + // Status that have not been checked. 
This + // TODO: Fix those issues so that the Status + io_status.PermitUncheckedError(); + } + uint64_t size; + std::string checksum_hex; + std::string db_id; + std::string db_session_id; + IOStatus io_status; + Temperature expected_src_temperature = Temperature::kUnknown; + Temperature current_src_temperature = Temperature::kUnknown; + }; + + // Exactly one of src_path and contents must be non-empty. If src_path is + // non-empty, the file is copied from this pathname. Otherwise, if contents is + // non-empty, the file will be created at dst_path with these contents. + struct CopyOrCreateWorkItem { + std::string src_path; + std::string dst_path; + Temperature src_temperature; + Temperature dst_temperature; + std::string contents; + Env* src_env; + Env* dst_env; + EnvOptions src_env_options; + bool sync; + RateLimiter* rate_limiter; + uint64_t size_limit; + Statistics* stats; + std::promise result; + std::function progress_callback; + std::string src_checksum_func_name; + std::string src_checksum_hex; + std::string db_id; + std::string db_session_id; + + CopyOrCreateWorkItem() + : src_path(""), + dst_path(""), + src_temperature(Temperature::kUnknown), + dst_temperature(Temperature::kUnknown), + contents(""), + src_env(nullptr), + dst_env(nullptr), + src_env_options(), + sync(false), + rate_limiter(nullptr), + size_limit(0), + stats(nullptr), + src_checksum_func_name(kUnknownFileChecksumFuncName), + src_checksum_hex(""), + db_id(""), + db_session_id("") {} + + CopyOrCreateWorkItem(const CopyOrCreateWorkItem&) = delete; + CopyOrCreateWorkItem& operator=(const CopyOrCreateWorkItem&) = delete; + + CopyOrCreateWorkItem(CopyOrCreateWorkItem&& o) noexcept { + *this = std::move(o); + } + + CopyOrCreateWorkItem& operator=(CopyOrCreateWorkItem&& o) noexcept { + src_path = std::move(o.src_path); + dst_path = std::move(o.dst_path); + src_temperature = std::move(o.src_temperature); + dst_temperature = std::move(o.dst_temperature); + contents = std::move(o.contents); + src_env = o.src_env; + dst_env = o.dst_env; + src_env_options = std::move(o.src_env_options); + sync = o.sync; + rate_limiter = o.rate_limiter; + size_limit = o.size_limit; + stats = o.stats; + result = std::move(o.result); + progress_callback = std::move(o.progress_callback); + src_checksum_func_name = std::move(o.src_checksum_func_name); + src_checksum_hex = std::move(o.src_checksum_hex); + db_id = std::move(o.db_id); + db_session_id = std::move(o.db_session_id); + src_temperature = o.src_temperature; + return *this; + } + + CopyOrCreateWorkItem(std::string _src_path, std::string _dst_path, + const Temperature _src_temperature, + const Temperature _dst_temperature, + std::string _contents, Env* _src_env, Env* _dst_env, + EnvOptions _src_env_options, bool _sync, + RateLimiter* _rate_limiter, uint64_t _size_limit, + Statistics* _stats, + std::function _progress_callback = {}, + const std::string& _src_checksum_func_name = + kUnknownFileChecksumFuncName, + const std::string& _src_checksum_hex = "", + const std::string& _db_id = "", + const std::string& _db_session_id = "") + : src_path(std::move(_src_path)), + dst_path(std::move(_dst_path)), + src_temperature(_src_temperature), + dst_temperature(_dst_temperature), + contents(std::move(_contents)), + src_env(_src_env), + dst_env(_dst_env), + src_env_options(std::move(_src_env_options)), + sync(_sync), + rate_limiter(_rate_limiter), + size_limit(_size_limit), + stats(_stats), + progress_callback(_progress_callback), + src_checksum_func_name(_src_checksum_func_name), + 
src_checksum_hex(_src_checksum_hex), + db_id(_db_id), + db_session_id(_db_session_id) {} + }; + + struct BackupAfterCopyOrCreateWorkItem { + std::future result; + bool shared; + bool needed_to_copy; + Env* backup_env; + std::string dst_path_tmp; + std::string dst_path; + std::string dst_relative; + BackupAfterCopyOrCreateWorkItem() + : shared(false), + needed_to_copy(false), + backup_env(nullptr), + dst_path_tmp(""), + dst_path(""), + dst_relative("") {} + + BackupAfterCopyOrCreateWorkItem( + BackupAfterCopyOrCreateWorkItem&& o) noexcept { + *this = std::move(o); + } + + BackupAfterCopyOrCreateWorkItem& operator=( + BackupAfterCopyOrCreateWorkItem&& o) noexcept { + result = std::move(o.result); + shared = o.shared; + needed_to_copy = o.needed_to_copy; + backup_env = o.backup_env; + dst_path_tmp = std::move(o.dst_path_tmp); + dst_path = std::move(o.dst_path); + dst_relative = std::move(o.dst_relative); + return *this; + } + + BackupAfterCopyOrCreateWorkItem(std::future&& _result, + bool _shared, bool _needed_to_copy, + Env* _backup_env, std::string _dst_path_tmp, + std::string _dst_path, + std::string _dst_relative) + : result(std::move(_result)), + shared(_shared), + needed_to_copy(_needed_to_copy), + backup_env(_backup_env), + dst_path_tmp(std::move(_dst_path_tmp)), + dst_path(std::move(_dst_path)), + dst_relative(std::move(_dst_relative)) {} + }; + + using BackupWorkItemPair = + std::pair; + + struct RestoreAfterCopyOrCreateWorkItem { + std::future result; + std::string from_file; + std::string to_file; + std::string checksum_hex; + RestoreAfterCopyOrCreateWorkItem() : checksum_hex("") {} + RestoreAfterCopyOrCreateWorkItem(std::future&& _result, + const std::string& _from_file, + const std::string& _to_file, + const std::string& _checksum_hex) + : result(std::move(_result)), + from_file(_from_file), + to_file(_to_file), + checksum_hex(_checksum_hex) {} + RestoreAfterCopyOrCreateWorkItem( + RestoreAfterCopyOrCreateWorkItem&& o) noexcept { + *this = std::move(o); + } + + RestoreAfterCopyOrCreateWorkItem& operator=( + RestoreAfterCopyOrCreateWorkItem&& o) noexcept { + result = std::move(o.result); + checksum_hex = std::move(o.checksum_hex); + return *this; + } + }; + + bool initialized_; + std::mutex byte_report_mutex_; + mutable channel files_to_copy_or_create_; + std::vector threads_; + std::atomic threads_cpu_priority_; + + // Certain operations like PurgeOldBackups and DeleteBackup will trigger + // automatic GarbageCollect (true) unless we've already done one in this + // session and have not failed to delete backup files since then (false). + bool might_need_garbage_collect_ = true; + + // Adds a file to the backup work queue to be copied or created if it doesn't + // already exist. + // + // Exactly one of src_dir and contents must be non-empty. + // + // @param src_dir If non-empty, the file in this directory named fname will be + // copied. + // @param fname Name of destination file and, in case of copy, source file. + // @param contents If non-empty, the file will be created with these contents. 
+  IOStatus AddBackupFileWorkItem(
+      std::unordered_set<std::string>& live_dst_paths,
+      std::deque<BackupAfterCopyOrCreateWorkItem>& backup_items_to_finish,
+      std::deque<BackupWorkItemPair>* excludable_items, BackupID backup_id,
+      bool shared, const std::string& src_dir,
+      const std::string& fname,  // starts with "/"
+      const EnvOptions& src_env_options, RateLimiter* rate_limiter,
+      FileType file_type, uint64_t size_bytes, Statistics* stats,
+      uint64_t size_limit = 0, bool shared_checksum = false,
+      std::function<void()> progress_callback = {},
+      const std::string& contents = std::string(),
+      const std::string& src_checksum_func_name = kUnknownFileChecksumFuncName,
+      const std::string& src_checksum_str = kUnknownFileChecksum,
+      const Temperature src_temperature = Temperature::kUnknown);
+
+  // backup state data
+  BackupID latest_backup_id_;
+  BackupID latest_valid_backup_id_;
+  std::map<BackupID, std::unique_ptr<BackupMeta>> backups_;
+  std::map<BackupID, std::pair<IOStatus, std::unique_ptr<BackupMeta>>>
+      corrupt_backups_;
+  std::unordered_map<std::string, std::shared_ptr<FileInfo>>
+      backuped_file_infos_;
+  std::atomic<bool> stop_backup_;
+
+  // options data
+  BackupEngineOptions options_;
+  Env* db_env_;
+  Env* backup_env_;
+
+  // directories
+  std::unique_ptr<FSDirectory> backup_directory_;
+  std::unique_ptr<FSDirectory> shared_directory_;
+  std::unique_ptr<FSDirectory> meta_directory_;
+  std::unique_ptr<FSDirectory> private_directory_;
+
+  static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL;  // 5MB
+  bool read_only_;
+  BackupStatistics backup_statistics_;
+  std::unordered_set<std::string> reported_ignored_fields_;
+  static const size_t kMaxAppMetaSize = 1024 * 1024;  // 1MB
+  std::shared_ptr<FileSystem> db_fs_;
+  std::shared_ptr<FileSystem> backup_fs_;
+  IOOptions io_options_ = IOOptions();
+
+ public:
+  std::unique_ptr<TEST_BackupMetaSchemaOptions> schema_test_options_;
+};
+
+// -------- BackupEngineImplThreadSafe class ---------
+// This locking layer for thread safety in the public API is layered on
+// top to prevent accidental recursive locking with RWMutex, which is UB.
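+// A minimal sketch of the guard pattern used below (editor's illustration,
+// not additional API): each mutating entry point takes the write lock
+// exactly once and forwards to impl_, which does no locking of its own:
+//
+//   IOStatus DeleteBackup(BackupID id) override {
+//     WriteLock lock(&mutex_);  // RAII; released on every return path
+//     return impl_.DeleteBackup(id);
+//   }
+//
+// so no code path can re-acquire the same RWMutex recursively.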
+// Note: BackupEngineReadOnlyBase inherited twice, but has no fields +class BackupEngineImplThreadSafe : public BackupEngine, + public BackupEngineReadOnly { + public: + BackupEngineImplThreadSafe(const BackupEngineOptions& options, Env* db_env, + bool read_only = false) + : impl_(options, db_env, read_only) {} + ~BackupEngineImplThreadSafe() override {} + + using BackupEngine::CreateNewBackupWithMetadata; + IOStatus CreateNewBackupWithMetadata(const CreateBackupOptions& options, + DB* db, const std::string& app_metadata, + BackupID* new_backup_id) override { + WriteLock lock(&mutex_); + return impl_.CreateNewBackupWithMetadata(options, db, app_metadata, + new_backup_id); + } + + IOStatus PurgeOldBackups(uint32_t num_backups_to_keep) override { + WriteLock lock(&mutex_); + return impl_.PurgeOldBackups(num_backups_to_keep); + } + + IOStatus DeleteBackup(BackupID backup_id) override { + WriteLock lock(&mutex_); + return impl_.DeleteBackup(backup_id); + } + + void StopBackup() override { + // No locking needed + impl_.StopBackup(); + } + + IOStatus GarbageCollect() override { + WriteLock lock(&mutex_); + return impl_.GarbageCollect(); + } + + Status GetLatestBackupInfo(BackupInfo* backup_info, + bool include_file_details = false) const override { + ReadLock lock(&mutex_); + return impl_.GetBackupInfo(kLatestBackupIDMarker, backup_info, + include_file_details); + } + + Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info, + bool include_file_details = false) const override { + ReadLock lock(&mutex_); + return impl_.GetBackupInfo(backup_id, backup_info, include_file_details); + } + + void GetBackupInfo(std::vector* backup_info, + bool include_file_details) const override { + ReadLock lock(&mutex_); + impl_.GetBackupInfo(backup_info, include_file_details); + } + + void GetCorruptedBackups( + std::vector* corrupt_backup_ids) const override { + ReadLock lock(&mutex_); + impl_.GetCorruptedBackups(corrupt_backup_ids); + } + + using BackupEngine::RestoreDBFromBackup; + IOStatus RestoreDBFromBackup(const RestoreOptions& options, + BackupID backup_id, const std::string& db_dir, + const std::string& wal_dir) const override { + // TSAN reports a lock inversion (potential deadlock) if we acquire read + // locks in different orders. Assuming the implementation of RWMutex + // allows simultaneous read locks, there should be no deadlock, because + // there is no write lock involved here. Nevertheless, to appease TSAN and + // in case of degraded RWMutex implementation, we lock the BackupEngines + // including this one and those in options.alternate_dirs in a consistent + // order. + // However, locked_restore_from_dirs is kept in "search" order. 
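+    // (Concrete case of the inversion TSAN would flag: thread 1 restores
+    // from engine A with B in options.alternate_dirs while thread 2 restores
+    // from B with A as the alternate; without the pointer-order sort below,
+    // the two threads would take the same two read locks in opposite
+    // orders.)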
+    std::list<const BackupEngineImpl*> locked_restore_from_dirs;
+    std::vector<port::RWMutex*> mutexes;
+
+    // Add `this`
+    locked_restore_from_dirs.emplace_back(&impl_);
+    mutexes.push_back(&mutex_);
+
+    // Add alternates
+    for (BackupEngineReadOnlyBase* be : options.alternate_dirs) {
+      BackupEngineImplThreadSafe* bets =
+          static_cast_with_check<BackupEngineImplThreadSafe>(
+              be->AsBackupEngine());
+      locked_restore_from_dirs.emplace_back(&bets->impl_);
+      mutexes.push_back(&bets->mutex_);
+    }
+
+    // Acquire read locks in pointer order
+    std::sort(mutexes.begin(), mutexes.end());
+    std::vector<ReadLock> locks(mutexes.begin(), mutexes.end());
+
+    // Impl
+    return impl_.RestoreDBFromBackup(options, backup_id, db_dir, wal_dir,
+                                     locked_restore_from_dirs);
+  }
+
+  using BackupEngine::RestoreDBFromLatestBackup;
+  IOStatus RestoreDBFromLatestBackup(
+      const RestoreOptions& options, const std::string& db_dir,
+      const std::string& wal_dir) const override {
+    // Defer to above function, which locks
+    return RestoreDBFromBackup(options, kLatestBackupIDMarker, db_dir,
+                               wal_dir);
+  }
+
+  IOStatus VerifyBackup(BackupID backup_id,
+                        bool verify_with_checksum = false) const override {
+    ReadLock lock(&mutex_);
+    return impl_.VerifyBackup(backup_id, verify_with_checksum);
+  }
+
+  BackupEngine* AsBackupEngine() override { return this; }
+
+  // Not public API but needed
+  IOStatus Initialize() {
+    // No locking needed
+    return impl_.Initialize();
+  }
+
+  // Not public API but used in testing
+  void TEST_SetBackupMetaSchemaOptions(
+      const TEST_BackupMetaSchemaOptions& options) {
+    impl_.schema_test_options_.reset(new TEST_BackupMetaSchemaOptions(options));
+  }
+
+  // Not public API but used in testing
+  void TEST_SetDefaultRateLimitersClock(
+      const std::shared_ptr<SystemClock>& backup_rate_limiter_clock = nullptr,
+      const std::shared_ptr<SystemClock>& restore_rate_limiter_clock =
+          nullptr) {
+    impl_.TEST_SetDefaultRateLimitersClock(backup_rate_limiter_clock,
+                                           restore_rate_limiter_clock);
+  }
+
+ private:
+  mutable port::RWMutex mutex_;
+  BackupEngineImpl impl_;
+};
+
+}  // namespace
+
+IOStatus BackupEngine::Open(const BackupEngineOptions& options, Env* env,
+                            BackupEngine** backup_engine_ptr) {
+  std::unique_ptr<BackupEngineImplThreadSafe> backup_engine(
+      new BackupEngineImplThreadSafe(options, env));
+  auto s = backup_engine->Initialize();
+  if (!s.ok()) {
+    *backup_engine_ptr = nullptr;
+    return s;
+  }
+  *backup_engine_ptr = backup_engine.release();
+  return IOStatus::OK();
+}
+
+namespace {
+BackupEngineImpl::BackupEngineImpl(const BackupEngineOptions& options,
+                                   Env* db_env, bool read_only)
+    : initialized_(false),
+      threads_cpu_priority_(),
+      latest_backup_id_(0),
+      latest_valid_backup_id_(0),
+      stop_backup_(false),
+      options_(options),
+      db_env_(db_env),
+      backup_env_(options.backup_env != nullptr ? options.backup_env
+                                                : db_env_),
+      read_only_(read_only) {
+  if (options_.backup_rate_limiter == nullptr &&
+      options_.backup_rate_limit > 0) {
+    options_.backup_rate_limiter.reset(
+        NewGenericRateLimiter(options_.backup_rate_limit));
+  }
+  if (options_.restore_rate_limiter == nullptr &&
+      options_.restore_rate_limit > 0) {
+    options_.restore_rate_limiter.reset(
+        NewGenericRateLimiter(options_.restore_rate_limit));
+  }
+  db_fs_ = db_env_->GetFileSystem();
+  backup_fs_ = backup_env_->GetFileSystem();
+}
+
+BackupEngineImpl::~BackupEngineImpl() {
+  files_to_copy_or_create_.sendEof();
+  for (auto& t : threads_) {
+    t.join();
+  }
+  LogFlush(options_.info_log);
+  for (const auto& it : corrupt_backups_) {
+    it.second.first.PermitUncheckedError();
+  }
+}
+
+IOStatus BackupEngineImpl::Initialize() {
+  assert(!initialized_);
+  initialized_ = true;
+  if (read_only_) {
+    ROCKS_LOG_INFO(options_.info_log, "Starting read_only backup engine");
+  }
+  options_.Dump(options_.info_log);
+
+  auto meta_path = GetAbsolutePath(kMetaDirName);
+
+  if (!read_only_) {
+    // we might need to clean up from previous crash or I/O errors
+    might_need_garbage_collect_ = true;
+
+    if (options_.max_valid_backups_to_open !=
+        std::numeric_limits<int32_t>::max()) {
+      options_.max_valid_backups_to_open = std::numeric_limits<int32_t>::max();
+      ROCKS_LOG_WARN(
+          options_.info_log,
+          "`max_valid_backups_to_open` is not set to the default value. "
+          "Ignoring its value since BackupEngine is not read-only.");
+    }
+
+    // gather the list of directories that we need to create
+    std::vector<std::pair<std::string, std::unique_ptr<FSDirectory>*>>
+        directories;
+    directories.emplace_back(GetAbsolutePath(), &backup_directory_);
+    if (options_.share_table_files) {
+      if (options_.share_files_with_checksum) {
+        directories.emplace_back(
+            GetAbsolutePath(GetSharedFileWithChecksumRel()),
+            &shared_directory_);
+      } else {
+        directories.emplace_back(GetAbsolutePath(GetSharedFileRel()),
+                                 &shared_directory_);
+      }
+    }
+    directories.emplace_back(GetAbsolutePath(kPrivateDirName),
+                             &private_directory_);
+    directories.emplace_back(meta_path, &meta_directory_);
+    // create all the dirs we need
+    for (const auto& d : directories) {
+      IOStatus io_s =
+          backup_fs_->CreateDirIfMissing(d.first, io_options_, nullptr);
+      if (io_s.ok()) {
+        io_s =
+            backup_fs_->NewDirectory(d.first, io_options_, d.second, nullptr);
+      }
+      if (!io_s.ok()) {
+        return io_s;
+      }
+    }
+  }
+
+  std::vector<std::string> backup_meta_files;
+  {
+    IOStatus io_s = backup_fs_->GetChildren(meta_path, io_options_,
+                                            &backup_meta_files, nullptr);
+    if (io_s.IsNotFound()) {
+      return IOStatus::NotFound(meta_path + " is missing");
+    } else if (!io_s.ok()) {
+      return io_s;
+    }
+  }
+  // create backups_ structure
+  for (auto& file : backup_meta_files) {
+    ROCKS_LOG_INFO(options_.info_log, "Detected backup %s", file.c_str());
+    BackupID backup_id = 0;
+    sscanf(file.c_str(), "%u", &backup_id);
+    if (backup_id == 0 || file != std::to_string(backup_id)) {
+      // Invalid file name, will be deleted with auto-GC when user
+      // initiates an append or write operation. (Behave as read-only until
+      // then.)
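+      // (For example, "3" parses to 3 and round-trips to "3", so it is
+      // accepted; "3.tmp", "003", and "latest" all fail the round-trip
+      // comparison above and are skipped.)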
+ ROCKS_LOG_INFO(options_.info_log, "Skipping unrecognized meta file %s", + file.c_str()); + continue; + } + assert(backups_.find(backup_id) == backups_.end()); + // Insert all the (backup_id, BackupMeta) that will be loaded later + // The loading performed later will check whether there are corrupt backups + // and move the corrupt backups to corrupt_backups_ + backups_.insert(std::make_pair( + backup_id, std::unique_ptr(new BackupMeta( + GetBackupMetaFile(backup_id, false /* tmp */), + GetBackupMetaFile(backup_id, true /* tmp */), + &backuped_file_infos_, backup_env_, backup_fs_)))); + } + + latest_backup_id_ = 0; + latest_valid_backup_id_ = 0; + if (options_.destroy_old_data) { // Destroy old data + assert(!read_only_); + ROCKS_LOG_INFO( + options_.info_log, + "Backup Engine started with destroy_old_data == true, deleting all " + "backups"); + IOStatus io_s = PurgeOldBackups(0); + if (io_s.ok()) { + io_s = GarbageCollect(); + } + if (!io_s.ok()) { + return io_s; + } + } else { // Load data from storage + // abs_path_to_size: maps absolute paths of files in backup directory to + // their corresponding sizes + std::unordered_map abs_path_to_size; + // Insert files and their sizes in backup sub-directories (shared and + // shared_checksum) to abs_path_to_size + for (const auto& rel_dir : + {GetSharedFileRel(), GetSharedFileWithChecksumRel()}) { + const auto abs_dir = GetAbsolutePath(rel_dir); + IOStatus io_s = + ReadChildFileCurrentSizes(abs_dir, backup_fs_, &abs_path_to_size); + if (!io_s.ok()) { + // I/O error likely impacting all backups + return io_s; + } + } + // load the backups if any, until valid_backups_to_open of the latest + // non-corrupted backups have been successfully opened. + int valid_backups_to_open = options_.max_valid_backups_to_open; + for (auto backup_iter = backups_.rbegin(); backup_iter != backups_.rend(); + ++backup_iter) { + assert(latest_backup_id_ == 0 || latest_backup_id_ > backup_iter->first); + if (latest_backup_id_ == 0) { + latest_backup_id_ = backup_iter->first; + } + if (valid_backups_to_open == 0) { + break; + } + + // Insert files and their sizes in backup sub-directories + // (private/backup_id) to abs_path_to_size + IOStatus io_s = ReadChildFileCurrentSizes( + GetAbsolutePath(GetPrivateFileRel(backup_iter->first)), backup_fs_, + &abs_path_to_size); + if (io_s.ok()) { + io_s = backup_iter->second->LoadFromFile( + options_.backup_dir, abs_path_to_size, + options_.backup_rate_limiter.get(), options_.info_log, + &reported_ignored_fields_); + } + if (io_s.IsCorruption() || io_s.IsNotSupported()) { + ROCKS_LOG_INFO(options_.info_log, "Backup %u corrupted -- %s", + backup_iter->first, io_s.ToString().c_str()); + corrupt_backups_.insert(std::make_pair( + backup_iter->first, + std::make_pair(io_s, std::move(backup_iter->second)))); + } else if (!io_s.ok()) { + // Distinguish corruption errors from errors in the backup Env. + // Errors in the backup Env (i.e., this code path) will cause Open() to + // fail, whereas corruption errors would not cause Open() failures. 
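+        // (For example, a transient I/O error from a network-backed backup
+        // FileSystem lands here and fails Open(), while a truncated or
+        // unparseable meta file was recorded in corrupt_backups_ above and
+        // leaves Open() successful.)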
+ return io_s; + } else { + ROCKS_LOG_INFO(options_.info_log, "Loading backup %" PRIu32 " OK:\n%s", + backup_iter->first, + backup_iter->second->GetInfoString().c_str()); + assert(latest_valid_backup_id_ == 0 || + latest_valid_backup_id_ > backup_iter->first); + if (latest_valid_backup_id_ == 0) { + latest_valid_backup_id_ = backup_iter->first; + } + --valid_backups_to_open; + } + } + + for (const auto& corrupt : corrupt_backups_) { + backups_.erase(backups_.find(corrupt.first)); + } + // erase the backups before max_valid_backups_to_open + int num_unopened_backups; + if (options_.max_valid_backups_to_open == 0) { + num_unopened_backups = 0; + } else { + num_unopened_backups = + std::max(0, static_cast(backups_.size()) - + options_.max_valid_backups_to_open); + } + for (int i = 0; i < num_unopened_backups; ++i) { + assert(backups_.begin()->second->Empty()); + backups_.erase(backups_.begin()); + } + } + + ROCKS_LOG_INFO(options_.info_log, "Latest backup is %u", latest_backup_id_); + ROCKS_LOG_INFO(options_.info_log, "Latest valid backup is %u", + latest_valid_backup_id_); + + // set up threads perform copies from files_to_copy_or_create_ in the + // background + threads_cpu_priority_ = CpuPriority::kNormal; + threads_.reserve(options_.max_background_operations); + for (int t = 0; t < options_.max_background_operations; t++) { + threads_.emplace_back([this]() { +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + pthread_setname_np(pthread_self(), "backup_engine"); +#endif +#endif + CpuPriority current_priority = CpuPriority::kNormal; + CopyOrCreateWorkItem work_item; + uint64_t bytes_toward_next_callback = 0; + while (files_to_copy_or_create_.read(work_item)) { + CpuPriority priority = threads_cpu_priority_; + if (current_priority != priority) { + TEST_SYNC_POINT_CALLBACK( + "BackupEngineImpl::Initialize:SetCpuPriority", &priority); + port::SetCpuPriority(0, priority); + current_priority = priority; + } + // `bytes_read` and `bytes_written` stats are enabled based on + // compile-time support and cannot be dynamically toggled. So we do not + // need to worry about `PerfLevel` here, unlike many other + // `IOStatsContext` / `PerfContext` stats. + uint64_t prev_bytes_read = IOSTATS(bytes_read); + uint64_t prev_bytes_written = IOSTATS(bytes_written); + + CopyOrCreateResult result; + Temperature temp = work_item.src_temperature; + result.io_status = CopyOrCreateFile( + work_item.src_path, work_item.dst_path, work_item.contents, + work_item.size_limit, work_item.src_env, work_item.dst_env, + work_item.src_env_options, work_item.sync, work_item.rate_limiter, + work_item.progress_callback, &temp, work_item.dst_temperature, + &bytes_toward_next_callback, &result.size, &result.checksum_hex); + + RecordTick(work_item.stats, BACKUP_READ_BYTES, + IOSTATS(bytes_read) - prev_bytes_read); + RecordTick(work_item.stats, BACKUP_WRITE_BYTES, + IOSTATS(bytes_written) - prev_bytes_written); + + result.db_id = work_item.db_id; + result.db_session_id = work_item.db_session_id; + result.expected_src_temperature = work_item.src_temperature; + result.current_src_temperature = temp; + if (result.io_status.ok() && !work_item.src_checksum_hex.empty()) { + // unknown checksum function name implies no db table file checksum in + // db manifest; work_item.src_checksum_hex not empty means + // backup engine has calculated its crc32c checksum for the table + // file; therefore, we are able to compare the checksums. 
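+          // (Editor's note, schematic: both sides hold the crc32c encoded
+          // as a hex string of its four bytes, so the equality test below
+          // is a plain string comparison.)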
+ if (work_item.src_checksum_func_name == + kUnknownFileChecksumFuncName || + work_item.src_checksum_func_name == kDbFileChecksumFuncName) { + if (work_item.src_checksum_hex != result.checksum_hex) { + std::string checksum_info( + "Expected checksum is " + work_item.src_checksum_hex + + " while computed checksum is " + result.checksum_hex); + result.io_status = IOStatus::Corruption( + "Checksum mismatch after copying to " + work_item.dst_path + + ": " + checksum_info); + } + } else { + // FIXME(peterd): dead code? + std::string checksum_function_info( + "Existing checksum function is " + + work_item.src_checksum_func_name + + " while provided checksum function is " + + kBackupFileChecksumFuncName); + ROCKS_LOG_INFO( + options_.info_log, + "Unable to verify checksum after copying to %s: %s\n", + work_item.dst_path.c_str(), checksum_function_info.c_str()); + } + } + work_item.result.set_value(std::move(result)); + } + }); + } + ROCKS_LOG_INFO(options_.info_log, "Initialized BackupEngine"); + return IOStatus::OK(); +} + +IOStatus BackupEngineImpl::CreateNewBackupWithMetadata( + const CreateBackupOptions& options, DB* db, const std::string& app_metadata, + BackupID* new_backup_id_ptr) { + assert(initialized_); + assert(!read_only_); + if (app_metadata.size() > kMaxAppMetaSize) { + return IOStatus::InvalidArgument("App metadata too large"); + } + + bool maybe_exclude_items = bool{options.exclude_files_callback}; + if (maybe_exclude_items && options_.schema_version < 2) { + return IOStatus::InvalidArgument( + "exclude_files_callback requires schema_version >= 2"); + } + + if (options.decrease_background_thread_cpu_priority) { + if (options.background_thread_cpu_priority < threads_cpu_priority_) { + threads_cpu_priority_.store(options.background_thread_cpu_priority); + } + } + + BackupID new_backup_id = latest_backup_id_ + 1; + + // `bytes_read` and `bytes_written` stats are enabled based on compile-time + // support and cannot be dynamically toggled. So we do not need to worry about + // `PerfLevel` here, unlike many other `IOStatsContext` / `PerfContext` stats. + uint64_t prev_bytes_read = IOSTATS(bytes_read); + uint64_t prev_bytes_written = IOSTATS(bytes_written); + + assert(backups_.find(new_backup_id) == backups_.end()); + + auto private_dir = GetAbsolutePath(GetPrivateFileRel(new_backup_id)); + IOStatus io_s = backup_fs_->FileExists(private_dir, io_options_, nullptr); + if (io_s.ok()) { + // maybe last backup failed and left partial state behind, clean it up. + // need to do this before updating backups_ such that a private dir + // named after new_backup_id will be cleaned up. + // (If an incomplete new backup is followed by an incomplete delete + // of the latest full backup, then there could be more than one next + // id with a private dir, the last thing to be deleted in delete + // backup, but all will be cleaned up with a GarbageCollect.) 
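+    // (For example: a crash during creation of backup 5 leaves private/5/
+    // behind; the next CreateNewBackupWithMetadata also computes
+    // new_backup_id == 5, finds the stale directory here, and garbage
+    // collects it before reusing the id.)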
+ io_s = GarbageCollect(); + } else if (io_s.IsNotFound()) { + // normal case, the new backup's private dir doesn't exist yet + io_s = IOStatus::OK(); + } + + auto ret = backups_.insert(std::make_pair( + new_backup_id, std::unique_ptr(new BackupMeta( + GetBackupMetaFile(new_backup_id, false /* tmp */), + GetBackupMetaFile(new_backup_id, true /* tmp */), + &backuped_file_infos_, backup_env_, backup_fs_)))); + assert(ret.second == true); + auto& new_backup = ret.first->second; + new_backup->RecordTimestamp(); + new_backup->SetAppMetadata(app_metadata); + + auto start_backup = backup_env_->NowMicros(); + + ROCKS_LOG_INFO(options_.info_log, + "Started the backup process -- creating backup %u", + new_backup_id); + + if (options_.share_table_files && !options_.share_files_with_checksum) { + ROCKS_LOG_WARN(options_.info_log, + "BackupEngineOptions::share_files_with_checksum=false is " + "DEPRECATED and could lead to data loss."); + } + + if (io_s.ok()) { + io_s = backup_fs_->CreateDir(private_dir, io_options_, nullptr); + } + + // A set into which we will insert the dst_paths that are calculated for live + // files and live WAL files. + // This is used to check whether a live files shares a dst_path with another + // live file. + std::unordered_set live_dst_paths; + + std::deque excludable_items; + std::deque backup_items_to_finish; + // Add a CopyOrCreateWorkItem to the channel for each live file + Status disabled = db->DisableFileDeletions(); + DBOptions db_options = db->GetDBOptions(); + Statistics* stats = db_options.statistics.get(); + if (io_s.ok()) { + CheckpointImpl checkpoint(db); + uint64_t sequence_number = 0; + FileChecksumGenFactory* db_checksum_factory = + db_options.file_checksum_gen_factory.get(); + const std::string kFileChecksumGenFactoryName = + "FileChecksumGenCrc32cFactory"; + bool compare_checksum = + db_checksum_factory != nullptr && + db_checksum_factory->Name() == kFileChecksumGenFactoryName + ? true + : false; + EnvOptions src_raw_env_options(db_options); + RateLimiter* rate_limiter = options_.backup_rate_limiter.get(); + io_s = status_to_io_status(checkpoint.CreateCustomCheckpoint( + [&](const std::string& /*src_dirname*/, const std::string& /*fname*/, + FileType) { + // custom checkpoint will switch to calling copy_file_cb after it sees + // NotSupported returned from link_file_cb. 
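+          // (Editor's note: BackupEngine always copies rather than
+          // hard-links, since the backup directory may live on a different
+          // Env/FileSystem than the DB, so this callback unconditionally
+          // reports NotSupported.)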
+ return IOStatus::NotSupported(); + } /* link_file_cb */, + [&](const std::string& src_dirname, const std::string& fname, + uint64_t size_limit_bytes, FileType type, + const std::string& checksum_func_name, + const std::string& checksum_val, + const Temperature src_temperature) { + if (type == kWalFile && !options_.backup_log_files) { + return IOStatus::OK(); + } + Log(options_.info_log, "add file for backup %s", fname.c_str()); + uint64_t size_bytes = 0; + IOStatus io_st; + if (type == kTableFile || type == kBlobFile) { + io_st = db_fs_->GetFileSize(src_dirname + "/" + fname, io_options_, + &size_bytes, nullptr); + if (!io_st.ok()) { + Log(options_.info_log, "GetFileSize is failed: %s", + io_st.ToString().c_str()); + return io_st; + } + } + EnvOptions src_env_options; + switch (type) { + case kWalFile: + src_env_options = + db_env_->OptimizeForLogRead(src_raw_env_options); + break; + case kTableFile: + src_env_options = db_env_->OptimizeForCompactionTableRead( + src_raw_env_options, ImmutableDBOptions(db_options)); + break; + case kDescriptorFile: + src_env_options = + db_env_->OptimizeForManifestRead(src_raw_env_options); + break; + case kBlobFile: + src_env_options = db_env_->OptimizeForBlobFileRead( + src_raw_env_options, ImmutableDBOptions(db_options)); + break; + default: + // Other backed up files (like options file) are not read by live + // DB, so don't need to worry about avoiding mixing buffered and + // direct I/O. Just use plain defaults. + src_env_options = src_raw_env_options; + break; + } + io_st = AddBackupFileWorkItem( + live_dst_paths, backup_items_to_finish, + maybe_exclude_items ? &excludable_items : nullptr, new_backup_id, + options_.share_table_files && + (type == kTableFile || type == kBlobFile), + src_dirname, fname, src_env_options, rate_limiter, type, + size_bytes, db_options.statistics.get(), size_limit_bytes, + options_.share_files_with_checksum && + (type == kTableFile || type == kBlobFile), + options.progress_callback, "" /* contents */, checksum_func_name, + checksum_val, src_temperature); + return io_st; + } /* copy_file_cb */, + [&](const std::string& fname, const std::string& contents, + FileType type) { + Log(options_.info_log, "add file for backup %s", fname.c_str()); + return AddBackupFileWorkItem( + live_dst_paths, backup_items_to_finish, + maybe_exclude_items ? &excludable_items : nullptr, new_backup_id, + false /* shared */, "" /* src_dir */, fname, + EnvOptions() /* src_env_options */, rate_limiter, type, + contents.size(), db_options.statistics.get(), 0 /* size_limit */, + false /* shared_checksum */, options.progress_callback, contents); + } /* create_file_cb */, + &sequence_number, + options.flush_before_backup ? 0 : std::numeric_limits::max(), + compare_checksum)); + if (io_s.ok()) { + new_backup->SetSequenceNumber(sequence_number); + } + } + ROCKS_LOG_INFO(options_.info_log, "add files for backup done."); + if (io_s.ok() && maybe_exclude_items) { + assert(options.exclude_files_callback); + size_t count = excludable_items.size(); + std::vector maybe_exclude_files; + maybe_exclude_files.reserve(count); + for (auto& e : excludable_items) { + maybe_exclude_files.emplace_back( + BackupExcludedFileInfo(e.second.dst_relative)); + } + if (count > 0) { + try { + options.exclude_files_callback( + &maybe_exclude_files.front(), + /*end pointer*/ &maybe_exclude_files.back() + 1); + } catch (const std::exception& exn) { + io_s = IOStatus::Aborted("Exception in exclude_files_callback: " + + std::string(exn.what())); + } catch (...) 
{ + io_s = IOStatus::Aborted("Unknown exception in exclude_files_callback"); + } + } + if (io_s.ok()) { + for (size_t i = 0; i < count; ++i) { + auto& e = excludable_items[i]; + if (maybe_exclude_files[i].exclude_decision) { + new_backup.get()->AddExcludedFile(e.second.dst_relative); + } else { + files_to_copy_or_create_.write(std::move(e.first)); + backup_items_to_finish.push_back(std::move(e.second)); + } + } + } + excludable_items.clear(); + } else { + assert(!options.exclude_files_callback); + assert(excludable_items.empty()); + } + ROCKS_LOG_INFO(options_.info_log, + "dispatch files for backup done, wait for finish."); + IOStatus item_io_status; + for (auto& item : backup_items_to_finish) { + item.result.wait(); + auto result = item.result.get(); + item_io_status = result.io_status; + Temperature temp = result.expected_src_temperature; + if (result.current_src_temperature != Temperature::kUnknown && + (temp == Temperature::kUnknown || + options_.current_temperatures_override_manifest)) { + temp = result.current_src_temperature; + } + if (item_io_status.ok() && item.shared && item.needed_to_copy) { + item_io_status = item.backup_env->GetFileSystem()->RenameFile( + item.dst_path_tmp, item.dst_path, io_options_, nullptr); + } + if (item_io_status.ok()) { + item_io_status = new_backup.get()->AddFile(std::make_shared( + item.dst_relative, result.size, result.checksum_hex, result.db_id, + result.db_session_id, temp)); + } + if (!item_io_status.ok()) { + io_s = item_io_status; + } + } + + // we copied all the files, enable file deletions + if (disabled.ok()) { // If we successfully disabled file deletions + db->EnableFileDeletions(false).PermitUncheckedError(); + } + auto backup_time = backup_env_->NowMicros() - start_backup; + + if (io_s.ok()) { + // persist the backup metadata on the disk + io_s = new_backup->StoreToFile(options_.sync, options_.schema_version, + schema_test_options_.get()); + } + if (io_s.ok() && options_.sync) { + std::unique_ptr backup_private_directory; + backup_fs_ + ->NewDirectory(GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)), + io_options_, &backup_private_directory, nullptr) + .PermitUncheckedError(); + if (backup_private_directory != nullptr) { + io_s = backup_private_directory->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + if (io_s.ok() && private_directory_ != nullptr) { + io_s = private_directory_->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + if (io_s.ok() && meta_directory_ != nullptr) { + io_s = meta_directory_->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + if (io_s.ok() && shared_directory_ != nullptr) { + io_s = shared_directory_->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + if (io_s.ok() && backup_directory_ != nullptr) { + io_s = backup_directory_->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + } + + if (io_s.ok()) { + backup_statistics_.IncrementNumberSuccessBackup(); + // here we know that we succeeded and installed the new backup + latest_backup_id_ = new_backup_id; + latest_valid_backup_id_ = new_backup_id; + if (new_backup_id_ptr) { + *new_backup_id_ptr = new_backup_id; + } + ROCKS_LOG_INFO(options_.info_log, "Backup DONE. 
All is good"); + + // backup_speed is in byte/second + double backup_speed = new_backup->GetSize() / (1.048576 * backup_time); + ROCKS_LOG_INFO(options_.info_log, "Backup number of files: %u", + new_backup->GetNumberFiles()); + char human_size[16]; + AppendHumanBytes(new_backup->GetSize(), human_size, sizeof(human_size)); + ROCKS_LOG_INFO(options_.info_log, "Backup size: %s", human_size); + ROCKS_LOG_INFO(options_.info_log, "Backup time: %" PRIu64 " microseconds", + backup_time); + ROCKS_LOG_INFO(options_.info_log, "Backup speed: %.3f MB/s", backup_speed); + ROCKS_LOG_INFO(options_.info_log, "Backup Statistics %s", + backup_statistics_.ToString().c_str()); + } else { + backup_statistics_.IncrementNumberFailBackup(); + // clean all the files we might have created + ROCKS_LOG_INFO(options_.info_log, "Backup failed -- %s", + io_s.ToString().c_str()); + ROCKS_LOG_INFO(options_.info_log, "Backup Statistics %s\n", + backup_statistics_.ToString().c_str()); + // delete files that we might have already written + might_need_garbage_collect_ = true; + DeleteBackup(new_backup_id).PermitUncheckedError(); + } + + RecordTick(stats, BACKUP_READ_BYTES, IOSTATS(bytes_read) - prev_bytes_read); + RecordTick(stats, BACKUP_WRITE_BYTES, + IOSTATS(bytes_written) - prev_bytes_written); + return io_s; +} + +IOStatus BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) { + assert(initialized_); + assert(!read_only_); + + // Best effort deletion even with errors + IOStatus overall_status = IOStatus::OK(); + + ROCKS_LOG_INFO(options_.info_log, "Purging old backups, keeping %u", + num_backups_to_keep); + std::vector to_delete; + auto itr = backups_.begin(); + while ((backups_.size() - to_delete.size()) > num_backups_to_keep) { + to_delete.push_back(itr->first); + itr++; + } + for (auto backup_id : to_delete) { + // Do not GC until end + IOStatus io_s = DeleteBackupNoGC(backup_id); + if (!io_s.ok()) { + overall_status = io_s; + } + } + // Clean up after any incomplete backup deletion, potentially from + // earlier session. + if (might_need_garbage_collect_) { + IOStatus io_s = GarbageCollect(); + if (!io_s.ok() && overall_status.ok()) { + overall_status = io_s; + } + } + return overall_status; +} + +IOStatus BackupEngineImpl::DeleteBackup(BackupID backup_id) { + IOStatus s1 = DeleteBackupNoGC(backup_id); + IOStatus s2 = IOStatus::OK(); + + // Clean up after any incomplete backup deletion, potentially from + // earlier session. + if (might_need_garbage_collect_) { + s2 = GarbageCollect(); + } + + if (!s1.ok()) { + // Any failure in the primary objective trumps any failure in the + // secondary objective. 
+ s2.PermitUncheckedError(); + return s1; + } else { + return s2; + } +} + +// Does not auto-GarbageCollect nor lock +IOStatus BackupEngineImpl::DeleteBackupNoGC(BackupID backup_id) { + assert(initialized_); + assert(!read_only_); + + ROCKS_LOG_INFO(options_.info_log, "Deleting backup %u", backup_id); + auto backup = backups_.find(backup_id); + if (backup != backups_.end()) { + IOStatus io_s = backup->second->Delete(); + if (!io_s.ok()) { + return io_s; + } + backups_.erase(backup); + if (backups_.empty()) { + latest_valid_backup_id_ = 0; + } else { + latest_valid_backup_id_ = backups_.rbegin()->first; + } + } else { + auto corrupt = corrupt_backups_.find(backup_id); + if (corrupt == corrupt_backups_.end()) { + return IOStatus::NotFound("Backup not found"); + } + IOStatus io_s = corrupt->second.second->Delete(); + if (!io_s.ok()) { + return io_s; + } + corrupt->second.first.PermitUncheckedError(); + corrupt_backups_.erase(corrupt); + } + + // After removing meta file, best effort deletion even with errors. + // (Don't delete other files if we can't delete the meta file right + // now.) + std::vector to_delete; + for (auto& itr : backuped_file_infos_) { + if (itr.second->refs == 0) { + IOStatus io_s = backup_fs_->DeleteFile(GetAbsolutePath(itr.first), + io_options_, nullptr); + ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s", itr.first.c_str(), + io_s.ToString().c_str()); + to_delete.push_back(itr.first); + if (!io_s.ok()) { + // Trying again later might work + might_need_garbage_collect_ = true; + } + } + } + for (auto& td : to_delete) { + backuped_file_infos_.erase(td); + } + + // take care of private dirs -- GarbageCollect() will take care of them + // if they are not empty + std::string private_dir = GetPrivateFileRel(backup_id); + IOStatus io_s = + backup_fs_->DeleteDir(GetAbsolutePath(private_dir), io_options_, nullptr); + ROCKS_LOG_INFO(options_.info_log, "Deleting private dir %s -- %s", + private_dir.c_str(), io_s.ToString().c_str()); + if (!io_s.ok()) { + // Full gc or trying again later might work + might_need_garbage_collect_ = true; + } + return IOStatus::OK(); +} + +void BackupEngineImpl::SetBackupInfoFromBackupMeta( + BackupID id, const BackupMeta& meta, BackupInfo* backup_info, + bool include_file_details) const { + *backup_info = BackupInfo(id, meta.GetTimestamp(), meta.GetSize(), + meta.GetNumberFiles(), meta.GetAppMetadata()); + std::string dir = + options_.backup_dir + "/" + kPrivateDirSlash + std::to_string(id); + if (include_file_details) { + auto& file_details = backup_info->file_details; + file_details.reserve(meta.GetFiles().size()); + for (auto& file_ptr : meta.GetFiles()) { + BackupFileInfo& finfo = file_details.emplace_back(); + finfo.relative_filename = file_ptr->filename; + finfo.size = file_ptr->size; + finfo.directory = dir; + uint64_t number; + FileType type; + bool ok = ParseFileName(file_ptr->filename, &number, &type); + if (ok) { + finfo.file_number = number; + finfo.file_type = type; + } + // TODO: temperature, file_checksum, file_checksum_func_name + // finfo.temperature = file_ptr->temp; + } + backup_info->excluded_files = meta.GetExcludedFiles(); + + backup_info->name_for_open = GetAbsolutePath(GetPrivateFileRel(id)); + backup_info->name_for_open.pop_back(); // remove trailing '/' + backup_info->env_for_open = meta.GetEnvForOpen(); + } +} + +Status BackupEngineImpl::GetBackupInfo(BackupID backup_id, + BackupInfo* backup_info, + bool include_file_details) const { + assert(initialized_); + if (backup_id == kLatestBackupIDMarker) { + // Note: Read 
latest_valid_backup_id_ inside of lock + backup_id = latest_valid_backup_id_; + } + auto corrupt_itr = corrupt_backups_.find(backup_id); + if (corrupt_itr != corrupt_backups_.end()) { + return Status::Corruption(corrupt_itr->second.first.ToString()); + } + auto backup_itr = backups_.find(backup_id); + if (backup_itr == backups_.end()) { + return Status::NotFound("Backup not found"); + } + auto& backup = backup_itr->second; + if (backup->Empty()) { + return Status::NotFound("Backup not found"); + } + + SetBackupInfoFromBackupMeta(backup_id, *backup, backup_info, + include_file_details); + return Status::OK(); +} + +void BackupEngineImpl::GetBackupInfo(std::vector* backup_info, + bool include_file_details) const { + assert(initialized_); + backup_info->resize(backups_.size()); + size_t i = 0; + for (auto& backup : backups_) { + const BackupMeta& meta = *backup.second; + if (!meta.Empty()) { + SetBackupInfoFromBackupMeta(backup.first, meta, &backup_info->at(i++), + include_file_details); + } + } +} + +void BackupEngineImpl::GetCorruptedBackups( + std::vector* corrupt_backup_ids) const { + assert(initialized_); + corrupt_backup_ids->reserve(corrupt_backups_.size()); + for (auto& backup : corrupt_backups_) { + corrupt_backup_ids->push_back(backup.first); + } +} + +IOStatus BackupEngineImpl::RestoreDBFromBackup( + const RestoreOptions& options, BackupID backup_id, + const std::string& db_dir, const std::string& wal_dir, + const std::list& locked_restore_from_dirs) const { + assert(initialized_); + if (backup_id == kLatestBackupIDMarker) { + // Note: Read latest_valid_backup_id_ inside of lock + backup_id = latest_valid_backup_id_; + } + auto corrupt_itr = corrupt_backups_.find(backup_id); + if (corrupt_itr != corrupt_backups_.end()) { + return corrupt_itr->second.first; + } + auto backup_itr = backups_.find(backup_id); + if (backup_itr == backups_.end()) { + return IOStatus::NotFound("Backup not found"); + } + auto& backup = backup_itr->second; + if (backup->Empty()) { + return IOStatus::NotFound("Backup not found"); + } + + ROCKS_LOG_INFO(options_.info_log, "Restoring backup id %u\n", backup_id); + ROCKS_LOG_INFO(options_.info_log, "keep_log_files: %d\n", + static_cast(options.keep_log_files)); + + // just in case. 
Ignore errors
+  db_fs_->CreateDirIfMissing(db_dir, io_options_, nullptr)
+      .PermitUncheckedError();
+  db_fs_->CreateDirIfMissing(wal_dir, io_options_, nullptr)
+      .PermitUncheckedError();
+
+  if (options.keep_log_files) {
+    // delete files in db_dir, but keep all the log files
+    DeleteChildren(db_dir, 1 << kWalFile);
+    // move all the files from archive dir to wal_dir
+    std::string archive_dir = ArchivalDirectory(wal_dir);
+    std::vector<std::string> archive_files;
+    db_fs_->GetChildren(archive_dir, io_options_, &archive_files, nullptr)
+        .PermitUncheckedError();  // ignore errors
+    for (const auto& f : archive_files) {
+      uint64_t number;
+      FileType type;
+      bool ok = ParseFileName(f, &number, &type);
+      if (ok && type == kWalFile) {
+        ROCKS_LOG_INFO(options_.info_log,
+                       "Moving log file from archive/ to wal_dir: %s",
+                       f.c_str());
+        IOStatus io_s = db_fs_->RenameFile(
+            archive_dir + "/" + f, wal_dir + "/" + f, io_options_, nullptr);
+        if (!io_s.ok()) {
+          // if we can't move log file from archive_dir to wal_dir,
+          // we should fail, since it might mean data loss
+          return io_s;
+        }
+      }
+    }
+  } else {
+    DeleteChildren(wal_dir);
+    DeleteChildren(ArchivalDirectory(wal_dir));
+    DeleteChildren(db_dir);
+  }
+
+  // Files to restore, and from where (taking into account excluded files)
+  std::vector<std::pair<const BackupEngineImpl*, const FileInfo*>>
+      restore_file_infos;
+  restore_file_infos.reserve(backup->GetFiles().size() +
+                             backup->GetExcludedFiles().size());
+
+  for (const auto& ef : backup->GetExcludedFiles()) {
+    const std::string& file = ef.relative_file;
+
+    bool found = false;
+    for (auto be : locked_restore_from_dirs) {
+      auto it = be->backuped_file_infos_.find(file);
+      // (compare against be's map, the one actually searched)
+      if (it != be->backuped_file_infos_.end()) {
+        restore_file_infos.emplace_back(be, &*it->second);
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      return IOStatus::InvalidArgument(
+          "Excluded file " + file + " not found in other backups nor in " +
+          std::to_string(locked_restore_from_dirs.size() - 1) +
+          " alternate backup directories");
+    }
+  }
+
+  // Non-excluded files
+  for (const auto& file_info_shared : backup->GetFiles()) {
+    restore_file_infos.emplace_back(this, &*file_info_shared);
+  }
+
+  IOStatus io_s;
+  std::vector<RestoreAfterCopyOrCreateWorkItem> restore_items_to_finish;
+  std::string temporary_current_file;
+  std::string final_current_file;
+  std::unique_ptr<FSDirectory> db_dir_for_fsync;
+  std::unique_ptr<FSDirectory> wal_dir_for_fsync;
+
+  for (const auto& engine_and_file_info : restore_file_infos) {
+    const FileInfo* file_info = engine_and_file_info.second;
+    const std::string& file = file_info->filename;
+    std::string absolute_file =
+        engine_and_file_info.first->GetAbsolutePath(file);
+    Env* src_env = engine_and_file_info.first->backup_env_;
+
+    // 1. get DB filename
+    std::string dst = file_info->GetDbFileName();
+
+    // 2. find the filetype
+    uint64_t number;
+    FileType type;
+    bool ok = ParseFileName(dst, &number, &type);
+    if (!ok) {
+      return IOStatus::Corruption("Backup corrupted: Fail to parse filename " +
+                                  dst);
+    }
+    // 3. Construct the final path
+    // kWalFile lives in wal_dir and all the rest live in db_dir
+    if (type == kWalFile) {
+      dst = wal_dir + "/" + dst;
+      if (options_.sync && !wal_dir_for_fsync) {
+        io_s = db_fs_->NewDirectory(wal_dir, io_options_, &wal_dir_for_fsync,
+                                    nullptr);
+        if (!io_s.ok()) {
+          return io_s;
+        }
+      }
+    } else {
+      dst = db_dir + "/" + dst;
+      if (options_.sync && !db_dir_for_fsync) {
+        io_s = db_fs_->NewDirectory(db_dir, io_options_, &db_dir_for_fsync,
+                                    nullptr);
+        if (!io_s.ok()) {
+          return io_s;
+        }
+      }
+    }
+    // For atomicity, initially restore CURRENT file to a temporary name.
+ // This is useful even without options_.sync e.g. in case the restore + // process is interrupted. + if (type == kCurrentFile) { + final_current_file = dst; + dst = temporary_current_file = dst + ".tmp"; + } + + ROCKS_LOG_INFO(options_.info_log, "Restoring %s to %s\n", file.c_str(), + dst.c_str()); + CopyOrCreateWorkItem copy_or_create_work_item( + absolute_file, dst, Temperature::kUnknown /* src_temp */, + file_info->temp, "" /* contents */, src_env, db_env_, + EnvOptions() /* src_env_options */, options_.sync, + options_.restore_rate_limiter.get(), file_info->size, + nullptr /* stats */); + RestoreAfterCopyOrCreateWorkItem after_copy_or_create_work_item( + copy_or_create_work_item.result.get_future(), file, dst, + file_info->checksum_hex); + files_to_copy_or_create_.write(std::move(copy_or_create_work_item)); + restore_items_to_finish.push_back( + std::move(after_copy_or_create_work_item)); + } + IOStatus item_io_status; + for (auto& item : restore_items_to_finish) { + item.result.wait(); + auto result = item.result.get(); + item_io_status = result.io_status; + // Note: It is possible that both of the following bad-status cases occur + // during copying. But, we only return one status. + if (!item_io_status.ok()) { + io_s = item_io_status; + break; + } else if (!item.checksum_hex.empty() && + item.checksum_hex != result.checksum_hex) { + io_s = IOStatus::Corruption( + "While restoring " + item.from_file + " -> " + item.to_file + + ": expected checksum is " + item.checksum_hex + + " while computed checksum is " + result.checksum_hex); + break; + } + } + + // When enabled, the first FsyncWithDirOptions is to ensure all files are + // fully persisted before renaming CURRENT.tmp + if (io_s.ok() && db_dir_for_fsync) { + ROCKS_LOG_INFO(options_.info_log, "Restore: fsync\n"); + io_s = db_dir_for_fsync->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + + if (io_s.ok() && wal_dir_for_fsync) { + io_s = wal_dir_for_fsync->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + + if (io_s.ok() && !temporary_current_file.empty()) { + ROCKS_LOG_INFO(options_.info_log, "Restore: atomic rename CURRENT.tmp\n"); + assert(!final_current_file.empty()); + io_s = db_fs_->RenameFile(temporary_current_file, final_current_file, + io_options_, nullptr); + } + + if (io_s.ok() && db_dir_for_fsync && !temporary_current_file.empty()) { + // Second FsyncWithDirOptions is to ensure the final atomic rename of DB + // restore is fully persisted even if power goes out right after restore + // operation returns success + assert(db_dir_for_fsync); + io_s = db_dir_for_fsync->FsyncWithDirOptions( + io_options_, nullptr, DirFsyncOptions(final_current_file)); + } + + ROCKS_LOG_INFO(options_.info_log, "Restoring done -- %s\n", + io_s.ToString().c_str()); + return io_s; +} + +IOStatus BackupEngineImpl::VerifyBackup(BackupID backup_id, + bool verify_with_checksum) const { + assert(initialized_); + // Check if backup_id is corrupted, or valid and registered + auto corrupt_itr = corrupt_backups_.find(backup_id); + if (corrupt_itr != corrupt_backups_.end()) { + return corrupt_itr->second.first; + } + + auto backup_itr = backups_.find(backup_id); + if (backup_itr == backups_.end()) { + return IOStatus::NotFound(); + } + + auto& backup = backup_itr->second; + if (backup->Empty()) { + return IOStatus::NotFound(); + } + + ROCKS_LOG_INFO(options_.info_log, "Verifying backup id %u\n", backup_id); + + // Find all existing backup files belong to backup_id + std::unordered_map curr_abs_path_to_size; + for 
(const auto& rel_dir : {GetPrivateFileRel(backup_id), GetSharedFileRel(), + GetSharedFileWithChecksumRel()}) { + const auto abs_dir = GetAbsolutePath(rel_dir); + // Shared directories allowed to be missing in some cases. Expected but + // missing files will be reported a few lines down. + ReadChildFileCurrentSizes(abs_dir, backup_fs_, &curr_abs_path_to_size) + .PermitUncheckedError(); + } + + // For all files registered in backup + for (const auto& file_info : backup->GetFiles()) { + const auto abs_path = GetAbsolutePath(file_info->filename); + // check existence of the file + if (curr_abs_path_to_size.find(abs_path) == curr_abs_path_to_size.end()) { + return IOStatus::NotFound("File missing: " + abs_path); + } + // verify file size + if (file_info->size != curr_abs_path_to_size[abs_path]) { + std::string size_info("Expected file size is " + + std::to_string(file_info->size) + + " while found file size is " + + std::to_string(curr_abs_path_to_size[abs_path])); + return IOStatus::Corruption("File corrupted: File size mismatch for " + + abs_path + ": " + size_info); + } + if (verify_with_checksum && !file_info->checksum_hex.empty()) { + // verify file checksum + std::string checksum_hex; + ROCKS_LOG_INFO(options_.info_log, "Verifying %s checksum...\n", + abs_path.c_str()); + IOStatus io_s = ReadFileAndComputeChecksum( + abs_path, backup_fs_, EnvOptions(), 0 /* size_limit */, &checksum_hex, + Temperature::kUnknown); + if (!io_s.ok()) { + return io_s; + } else if (file_info->checksum_hex != checksum_hex) { + std::string checksum_info( + "Expected checksum is " + file_info->checksum_hex + + " while computed checksum is " + checksum_hex); + return IOStatus::Corruption("File corrupted: Checksum mismatch for " + + abs_path + ": " + checksum_info); + } + } + } + return IOStatus::OK(); +} + +IOStatus BackupEngineImpl::CopyOrCreateFile( + const std::string& src, const std::string& dst, const std::string& contents, + uint64_t size_limit, Env* src_env, Env* dst_env, + const EnvOptions& src_env_options, bool sync, RateLimiter* rate_limiter, + std::function progress_callback, Temperature* src_temperature, + Temperature dst_temperature, uint64_t* bytes_toward_next_callback, + uint64_t* size, std::string* checksum_hex) { + assert(src.empty() != contents.empty()); + IOStatus io_s; + std::unique_ptr dst_file; + std::unique_ptr src_file; + FileOptions dst_file_options; + dst_file_options.use_mmap_writes = false; + dst_file_options.temperature = dst_temperature; + // TODO:(gzh) maybe use direct reads/writes here if possible + if (size != nullptr) { + *size = 0; + } + uint32_t checksum_value = 0; + + // Check if size limit is set. if not, set it to very big number + if (size_limit == 0) { + size_limit = std::numeric_limits::max(); + } + + io_s = dst_env->GetFileSystem()->NewWritableFile(dst, dst_file_options, + &dst_file, nullptr); + if (io_s.ok() && !src.empty()) { + auto src_file_options = FileOptions(src_env_options); + src_file_options.temperature = *src_temperature; + io_s = src_env->GetFileSystem()->NewSequentialFile(src, src_file_options, + &src_file, nullptr); + } + if (io_s.IsPathNotFound() && *src_temperature != Temperature::kUnknown) { + // Retry without temperature hint in case the FileSystem is strict with + // non-kUnknown temperature option + io_s = src_env->GetFileSystem()->NewSequentialFile( + src, FileOptions(src_env_options), &src_file, nullptr); + } + if (!io_s.ok()) { + return io_s; + } + + size_t buf_size = + rate_limiter ? 
static_cast(rate_limiter->GetSingleBurstBytes()) + : kDefaultCopyFileBufferSize; + + std::unique_ptr dest_writer( + new WritableFileWriter(std::move(dst_file), dst, dst_file_options)); + std::unique_ptr src_reader; + std::unique_ptr buf; + if (!src.empty()) { + // Return back current temperature in FileSystem + *src_temperature = src_file->GetTemperature(); + + src_reader.reset(new SequentialFileReader( + std::move(src_file), src, nullptr /* io_tracer */, {}, rate_limiter)); + buf.reset(new char[buf_size]); + } + + Slice data; + do { + if (stop_backup_.load(std::memory_order_acquire)) { + return status_to_io_status(Status::Incomplete("Backup stopped")); + } + if (!src.empty()) { + size_t buffer_to_read = + (buf_size < size_limit) ? buf_size : static_cast(size_limit); + io_s = src_reader->Read(buffer_to_read, &data, buf.get(), + Env::IO_LOW /* rate_limiter_priority */); + *bytes_toward_next_callback += data.size(); + } else { + data = contents; + } + size_limit -= data.size(); + TEST_SYNC_POINT_CALLBACK( + "BackupEngineImpl::CopyOrCreateFile:CorruptionDuringBackup", + (src.length() > 4 && src.rfind(".sst") == src.length() - 4) ? &data + : nullptr); + + if (!io_s.ok()) { + return io_s; + } + + if (size != nullptr) { + *size += data.size(); + } + if (checksum_hex != nullptr) { + checksum_value = crc32c::Extend(checksum_value, data.data(), data.size()); + } + io_s = dest_writer->Append(data); + + if (rate_limiter != nullptr) { + if (!src.empty()) { + rate_limiter->Request(data.size(), Env::IO_LOW, nullptr /* stats */, + RateLimiter::OpType::kWrite); + } else { + LoopRateLimitRequestHelper(data.size(), rate_limiter, Env::IO_LOW, + nullptr /* stats */, + RateLimiter::OpType::kWrite); + } + } + while (*bytes_toward_next_callback >= + options_.callback_trigger_interval_size) { + *bytes_toward_next_callback -= options_.callback_trigger_interval_size; + if (progress_callback) { + std::lock_guard lock(byte_report_mutex_); + try { + progress_callback(); + } catch (const std::exception& exn) { + io_s = IOStatus::Aborted("Exception in progress_callback: " + + std::string(exn.what())); + break; + } catch (...) { + io_s = IOStatus::Aborted("Unknown exception in progress_callback"); + break; + } + } + } + } while (io_s.ok() && contents.empty() && data.size() > 0 && size_limit > 0); + + // Convert uint32_t checksum to hex checksum + if (checksum_hex != nullptr) { + checksum_hex->assign(ChecksumInt32ToHex(checksum_value)); + } + + if (io_s.ok() && sync) { + io_s = dest_writer->Sync(false); + } + if (io_s.ok()) { + io_s = dest_writer->Close(); + } + return io_s; +} + +// fname will always start with "/" +IOStatus BackupEngineImpl::AddBackupFileWorkItem( + std::unordered_set& live_dst_paths, + std::deque& backup_items_to_finish, + std::deque* excludable_items, BackupID backup_id, + bool shared, const std::string& src_dir, const std::string& fname, + const EnvOptions& src_env_options, RateLimiter* rate_limiter, + FileType file_type, uint64_t size_bytes, Statistics* stats, + uint64_t size_limit, bool shared_checksum, + std::function progress_callback, const std::string& contents, + const std::string& src_checksum_func_name, + const std::string& src_checksum_str, const Temperature src_temperature) { + assert(contents.empty() != src_dir.empty()); + + std::string src_path = src_dir + "/" + fname; + std::string dst_relative; + std::string dst_relative_tmp; + std::string db_id; + std::string db_session_id; + // crc32c checksum in hex. 
+  std::string checksum_hex;
+
+  // Whenever a default checksum function name is passed in, we will compare
+  // the corresponding checksum values after copying. Note that only table
+  // and blob files may have a known checksum function name passed in.
+  //
+  // If no default checksum function name is passed in and db session id is
+  // not available, we will calculate the checksum *before* copying in two
+  // cases (we always calculate checksums when copying or creating for any
+  // file types):
+  // a) share_files_with_checksum is true and file type is table;
+  // b) share_table_files is true and the file exists already.
+  //
+  // Step 0: Check if default checksum function name is passed in
+  if (kDbFileChecksumFuncName == src_checksum_func_name) {
+    if (src_checksum_str == kUnknownFileChecksum) {
+      return status_to_io_status(
+          Status::Aborted("Unknown checksum value for " + fname));
+    }
+    checksum_hex = ChecksumStrToHex(src_checksum_str);
+  }
+
+  // Step 1: Prepare the relative path to destination
+  if (shared && shared_checksum) {
+    if (GetNamingNoFlags() != BackupEngineOptions::kLegacyCrc32cAndFileSize &&
+        file_type != kBlobFile) {
+      // Prepare db_session_id to add to the file name
+      // Ignore the returned status
+      // In the failed cases, db_id and db_session_id will be empty
+      GetFileDbIdentities(db_env_, src_env_options, src_path, src_temperature,
+                          rate_limiter, &db_id, &db_session_id)
+          .PermitUncheckedError();
+    }
+    // Calculate checksum if checksum and db session id are not available.
+    // If db session id is available, we will not calculate the checksum
+    // since the session id should suffice to avoid file name collision in
+    // the shared_checksum directory.
+    if (checksum_hex.empty() && db_session_id.empty()) {
+      IOStatus io_s = ReadFileAndComputeChecksum(
+          src_path, db_fs_, src_env_options, size_limit, &checksum_hex,
+          src_temperature);
+      if (!io_s.ok()) {
+        return io_s;
+      }
+    }
+    if (size_bytes == std::numeric_limits<uint64_t>::max()) {
+      return IOStatus::NotFound("File missing: " + src_path);
+    }
+    // dst_relative depends on the following conditions:
+    // 1) the naming scheme is kUseDbSessionId,
+    // 2) db_session_id is not empty,
+    // 3) checksum is available in the DB manifest.
+    // If 1,2,3) are satisfied, then dst_relative will be of the form:
+    // shared_checksum/<file_number>_<checksum>_<db_session_id>.sst
+    // If 1,2) are satisfied, then dst_relative will be of the form:
+    // shared_checksum/<file_number>_<db_session_id>.sst
+    // Otherwise, dst_relative is of the form
+    // shared_checksum/<file_number>_<checksum>_<file_size>.sst
+    //
+    // For blob files, db_session_id is not supported with the blob file
+    // format. It uses the original/legacy naming scheme.
+    // dst_relative will be of the form:
+    // shared_checksum/<file_number>_<checksum>_<file_size>.blob
+    dst_relative = GetSharedFileWithChecksum(fname, checksum_hex, size_bytes,
+                                             db_session_id);
+    dst_relative_tmp = GetSharedFileWithChecksumRel(dst_relative, true);
+    dst_relative = GetSharedFileWithChecksumRel(dst_relative, false);
+  } else if (shared) {
+    dst_relative_tmp = GetSharedFileRel(fname, true);
+    dst_relative = GetSharedFileRel(fname, false);
+  } else {
+    dst_relative = GetPrivateFileRel(backup_id, false, fname);
+  }
+
+  // We copy into `temp_dest_path` and, once finished, rename it to
+  // `final_dest_path`. This allows files to atomically appear at
+  // `final_dest_path`. We can copy directly to the final path when atomicity
+  // is unnecessary, like for files in private backup directories.
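+  // (Editor's sketch with hypothetical names: a shared-with-checksum table
+  // file is first written as shared_checksum/000010_1234abcd_56789.sst.tmp
+  // and renamed to shared_checksum/000010_1234abcd_56789.sst once the copy
+  // completes, so a concurrent reader of the backup directory never sees a
+  // partially written shared file.)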
+ const std::string* copy_dest_path; + std::string temp_dest_path; + std::string final_dest_path = GetAbsolutePath(dst_relative); + if (!dst_relative_tmp.empty()) { + temp_dest_path = GetAbsolutePath(dst_relative_tmp); + copy_dest_path = &temp_dest_path; + } else { + copy_dest_path = &final_dest_path; + } + + // Step 2: Determine whether to copy or not + // if it's shared, we also need to check if it exists -- if it does, no need + // to copy it again. + bool need_to_copy = true; + // true if final_dest_path is the same path as another live file + const bool same_path = + live_dst_paths.find(final_dest_path) != live_dst_paths.end(); + + bool file_exists = false; + if (shared && !same_path) { + // Should be in shared directory but not a live path, check existence in + // shared directory + IOStatus exist = + backup_fs_->FileExists(final_dest_path, io_options_, nullptr); + if (exist.ok()) { + file_exists = true; + } else if (exist.IsNotFound()) { + file_exists = false; + } else { + return exist; + } + } + + if (!contents.empty()) { + need_to_copy = false; + } else if (shared && (same_path || file_exists)) { + need_to_copy = false; + auto find_result = backuped_file_infos_.find(dst_relative); + if (find_result == backuped_file_infos_.end() && !same_path) { + // file exists but not referenced + ROCKS_LOG_INFO( + options_.info_log, + "%s already present, but not referenced by any backup. We will " + "overwrite the file.", + fname.c_str()); + need_to_copy = true; + // Defer any failure reporting to when we try to write the file + backup_fs_->DeleteFile(final_dest_path, io_options_, nullptr) + .PermitUncheckedError(); + } else { + // file exists and referenced + if (checksum_hex.empty()) { + // same_path should not happen for a standard DB, so OK to + // read file contents to check for checksum mismatch between + // two files from same DB getting same name. + // For compatibility with future meta file that might not have + // crc32c checksum available, consider it might be empty, but + // we don't currently generate meta file without crc32c checksum. + // Therefore we have to read & compute it if we don't have it. + if (!same_path && !find_result->second->checksum_hex.empty()) { + assert(find_result != backuped_file_infos_.end()); + // Note: to save I/O on incremental backups, we copy prior known + // checksum of the file instead of reading entire file contents + // to recompute it. + checksum_hex = find_result->second->checksum_hex; + // Regarding corruption detection, consider: + // (a) the DB file is corrupt (since previous backup) and the backup + // file is OK: we failed to detect, but the backup is safe. DB can + // be repaired/restored once its corruption is detected. + // (b) the backup file is corrupt (since previous backup) and the + // db file is OK: we failed to detect, but the backup is corrupt. + // CreateNewBackup should support fast incremental backups and + // there's no way to support that without reading all the files. + // We might add an option for extra checks on incremental backup, + // but until then, use VerifyBackups to check existing backup data. + // (c) file name collision with legitimately different content. + // This is almost inconceivable with a well-generated DB session + // ID, but even in that case, we double check the file sizes in + // BackupMeta::AddFile. 
+        } else {
+          IOStatus io_s = ReadFileAndComputeChecksum(
+              src_path, db_fs_, src_env_options, size_limit, &checksum_hex,
+              src_temperature);
+          if (!io_s.ok()) {
+            return io_s;
+          }
+        }
+      }
+      if (!db_session_id.empty()) {
+        ROCKS_LOG_INFO(options_.info_log,
+                       "%s already present, with checksum %s, size %" PRIu64
+                       " and DB session identity %s",
+                       fname.c_str(), checksum_hex.c_str(), size_bytes,
+                       db_session_id.c_str());
+      } else {
+        ROCKS_LOG_INFO(options_.info_log,
+                       "%s already present, with checksum %s and size %" PRIu64,
+                       fname.c_str(), checksum_hex.c_str(), size_bytes);
+      }
+    }
+  }
+  live_dst_paths.insert(final_dest_path);
+
+  // Step 3: Add work item
+  if (!contents.empty() || need_to_copy) {
+    CopyOrCreateWorkItem copy_or_create_work_item(
+        src_dir.empty() ? "" : src_path, *copy_dest_path, src_temperature,
+        Temperature::kUnknown /*dst_temp*/, contents, db_env_, backup_env_,
+        src_env_options, options_.sync, rate_limiter, size_limit, stats,
+        progress_callback, src_checksum_func_name, checksum_hex, db_id,
+        db_session_id);
+    BackupAfterCopyOrCreateWorkItem after_copy_or_create_work_item(
+        copy_or_create_work_item.result.get_future(), shared, need_to_copy,
+        backup_env_, temp_dest_path, final_dest_path, dst_relative);
+    if (excludable_items != nullptr && shared && shared_checksum &&
+        need_to_copy) {
+      ROCKS_LOG_INFO(options_.info_log, "Copying (if not excluded) %s to %s",
+                     fname.c_str(), copy_dest_path->c_str());
+      excludable_items->emplace_back(std::move(copy_or_create_work_item),
+                                     std::move(after_copy_or_create_work_item));
+    } else {
+      // For files known not excluded, can start copying even before finishing
+      // the checkpoint
+      ROCKS_LOG_INFO(options_.info_log, "Copying %s to %s", fname.c_str(),
+                     copy_dest_path->c_str());
+      files_to_copy_or_create_.write(std::move(copy_or_create_work_item));
+      backup_items_to_finish.push_back(
+          std::move(after_copy_or_create_work_item));
+    }
+  } else {
+    std::promise<CopyOrCreateResult> promise_result;
+    BackupAfterCopyOrCreateWorkItem after_copy_or_create_work_item(
+        promise_result.get_future(), shared, need_to_copy, backup_env_,
+        temp_dest_path, final_dest_path, dst_relative);
+    backup_items_to_finish.push_back(std::move(after_copy_or_create_work_item));
+    CopyOrCreateResult result;
+    result.io_status = IOStatus::OK();
+    result.size = size_bytes;
+    result.checksum_hex = std::move(checksum_hex);
+    result.db_id = std::move(db_id);
+    result.db_session_id = std::move(db_session_id);
+    promise_result.set_value(std::move(result));
+  }
+  return IOStatus::OK();
+}
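+
+// Note on the checksum loop in ReadFileAndComputeChecksum below (illustrative
+// sketch): crc32c::Extend incrementally folds each chunk into the running
+// value, so
+//   uint32_t c = 0;
+//   c = crc32c::Extend(c, part1, n1);
+//   c = crc32c::Extend(c, part2, n2);
+// yields the same value as a single crc32c pass over the concatenation of
+// part1 and part2. That is what lets the function stream the file through a
+// fixed-size buffer instead of reading it into memory at once.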
+
+IOStatus BackupEngineImpl::ReadFileAndComputeChecksum(
+    const std::string& src, const std::shared_ptr<FileSystem>& src_fs,
+    const EnvOptions& src_env_options, uint64_t size_limit,
+    std::string* checksum_hex, const Temperature src_temperature) const {
+  if (checksum_hex == nullptr) {
+    return status_to_io_status(Status::Aborted("Checksum pointer is null"));
+  }
+  uint32_t checksum_value = 0;
+  if (size_limit == 0) {
+    size_limit = std::numeric_limits<uint64_t>::max();
+  }
+
+  std::unique_ptr<SequentialFileReader> src_reader;
+  auto file_options = FileOptions(src_env_options);
+  file_options.temperature = src_temperature;
+  RateLimiter* rate_limiter = options_.backup_rate_limiter.get();
+  IOStatus io_s = SequentialFileReader::Create(
+      src_fs, src, file_options, &src_reader, nullptr /* dbg */, rate_limiter);
+  if (io_s.IsPathNotFound() && src_temperature != Temperature::kUnknown) {
+    // Retry without temperature hint in case the FileSystem is strict with
+    // non-kUnknown temperature option
+    file_options.temperature = Temperature::kUnknown;
+    io_s = SequentialFileReader::Create(src_fs, src, file_options, &src_reader,
+                                        nullptr /* dbg */, rate_limiter);
+  }
+  if (!io_s.ok()) {
+    return io_s;
+  }
+
+  size_t buf_size = kDefaultCopyFileBufferSize;
+  std::unique_ptr<char[]> buf(new char[buf_size]);
+  Slice data;
+
+  do {
+    if (stop_backup_.load(std::memory_order_acquire)) {
+      return status_to_io_status(Status::Incomplete("Backup stopped"));
+    }
+    size_t buffer_to_read =
+        (buf_size < size_limit) ? buf_size : static_cast<size_t>(size_limit);
+    io_s = src_reader->Read(buffer_to_read, &data, buf.get(),
+                            Env::IO_LOW /* rate_limiter_priority */);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+
+    size_limit -= data.size();
+    checksum_value = crc32c::Extend(checksum_value, data.data(), data.size());
+  } while (data.size() > 0 && size_limit > 0);
+
+  checksum_hex->assign(ChecksumInt32ToHex(checksum_value));
+
+  return io_s;
+}
+
+Status BackupEngineImpl::GetFileDbIdentities(
+    Env* src_env, const EnvOptions& src_env_options,
+    const std::string& file_path, Temperature file_temp,
+    RateLimiter* rate_limiter, std::string* db_id, std::string* db_session_id) {
+  assert(db_id != nullptr || db_session_id != nullptr);
+
+  Options options;
+  options.env = src_env;
+  SstFileDumper sst_reader(options, file_path, file_temp,
+                           2 * 1024 * 1024 /* readahead_size */,
+                           false /* verify_checksum */, false /* output_hex */,
+                           false /* decode_blob_index */, src_env_options,
+                           true /* silent */);
+
+  const TableProperties* table_properties = nullptr;
+  std::shared_ptr<const TableProperties> tp;
+  Status s = sst_reader.getStatus();
+
+  if (s.ok()) {
+    // Try to get table properties from the table reader of sst_reader
+    if (!sst_reader.ReadTableProperties(&tp).ok()) {
+      // Try to use table properties from the initialization of sst_reader
+      table_properties = sst_reader.GetInitTableProperties();
+    } else {
+      table_properties = tp.get();
+      if (table_properties != nullptr && rate_limiter != nullptr) {
+        // sizeof(*table_properties) is a sufficient but far-from-exact
+        // approximation of read bytes due to metaindex block, std::string
+        // properties and varint compression
+        LoopRateLimitRequestHelper(sizeof(*table_properties), rate_limiter,
+                                   Env::IO_LOW, nullptr /* stats */,
+                                   RateLimiter::OpType::kRead);
+      }
+    }
+  } else {
+    ROCKS_LOG_INFO(options_.info_log, "Failed to read %s: %s",
+                   file_path.c_str(), s.ToString().c_str());
+    return s;
+  }
+
+  if (table_properties != nullptr) {
+    if (db_id != nullptr) {
+      db_id->assign(table_properties->db_id);
+    }
+    if (db_session_id != nullptr) {
+      db_session_id->assign(table_properties->db_session_id);
+      if (db_session_id->empty()) {
+        s = Status::NotFound("DB session identity not found in " + file_path);
+        ROCKS_LOG_INFO(options_.info_log, "%s", s.ToString().c_str());
+        return s;
+      }
+    }
+    return Status::OK();
+  } else {
+    s = Status::Corruption("Table properties missing in " + file_path);
+    ROCKS_LOG_INFO(options_.info_log, "%s", s.ToString().c_str());
+    return s;
+  }
+}
+
+void BackupEngineImpl::LoopRateLimitRequestHelper(
+    const size_t total_bytes_to_request, RateLimiter* rate_limiter,
+    const Env::IOPriority pri, Statistics* stats,
+    const RateLimiter::OpType op_type) {
+  assert(rate_limiter != nullptr);
+  size_t remaining_bytes = total_bytes_to_request;
+  size_t request_bytes = 0;
+  while (remaining_bytes > 0) {
+    request_bytes =
+        std::min(static_cast<size_t>(rate_limiter->GetSingleBurstBytes()),
+                 remaining_bytes);
+    rate_limiter->Request(request_bytes, pri, stats, op_type);
+    remaining_bytes -= request_bytes;
+  }
+}
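+
+// For example (illustrative): if GetSingleBurstBytes() returns 1 MiB and a
+// caller asks the helper above for 2.5 MiB, it issues three Request() calls
+// of 1 MiB, 1 MiB, and 0.5 MiB, so no single request exceeds the limiter's
+// burst size.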
+
+void BackupEngineImpl::DeleteChildren(const std::string& dir,
+                                      uint32_t file_type_filter) const {
+  std::vector<std::string> children;
+  db_fs_->GetChildren(dir, io_options_, &children, nullptr)
+      .PermitUncheckedError();  // ignore errors
+
+  for (const auto& f : children) {
+    uint64_t number;
+    FileType type;
+    bool ok = ParseFileName(f, &number, &type);
+    if (ok && (file_type_filter & (1 << type))) {
+      // don't delete this file
+      continue;
+    }
+    db_fs_->DeleteFile(dir + "/" + f, io_options_, nullptr)
+        .PermitUncheckedError();  // ignore errors
+  }
+}
+
+IOStatus BackupEngineImpl::ReadChildFileCurrentSizes(
+    const std::string& dir, const std::shared_ptr<FileSystem>& fs,
+    std::unordered_map<std::string, uint64_t>* result) const {
+  assert(result != nullptr);
+  std::vector<FileAttributes> files_attrs;
+  IOStatus io_status = fs->FileExists(dir, io_options_, nullptr);
+  if (io_status.ok()) {
+    io_status =
+        fs->GetChildrenFileAttributes(dir, io_options_, &files_attrs, nullptr);
+  } else if (io_status.IsNotFound()) {
+    // Inserting no entries can be considered a success
+    io_status = IOStatus::OK();
+  }
+  const bool slash_needed = dir.empty() || dir.back() != '/';
+  for (const auto& file_attrs : files_attrs) {
+    result->emplace(dir + (slash_needed ? "/" : "") + file_attrs.name,
+                    file_attrs.size_bytes);
+  }
+  return io_status;
+}
+
+IOStatus BackupEngineImpl::GarbageCollect() {
+  assert(!read_only_);
+
+  // We will make a best effort to remove all garbage even in the presence
+  // of inconsistencies or I/O failures that inhibit finding garbage.
+  IOStatus overall_status = IOStatus::OK();
+  // If all goes well, we don't need another auto-GC this session
+  might_need_garbage_collect_ = false;
+
+  ROCKS_LOG_INFO(options_.info_log, "Starting garbage collection");
+
+  // delete obsolete shared files
+  for (bool with_checksum : {false, true}) {
+    std::vector<std::string> shared_children;
+    {
+      std::string shared_path;
+      if (with_checksum) {
+        shared_path = GetAbsolutePath(GetSharedFileWithChecksumRel());
+      } else {
+        shared_path = GetAbsolutePath(GetSharedFileRel());
+      }
+      IOStatus io_s = backup_fs_->FileExists(shared_path, io_options_, nullptr);
+      if (io_s.ok()) {
+        io_s = backup_fs_->GetChildren(shared_path, io_options_,
+                                       &shared_children, nullptr);
+      } else if (io_s.IsNotFound()) {
+        io_s = IOStatus::OK();
+      }
+      if (!io_s.ok()) {
+        overall_status = io_s;
+        // Trying again later might work
+        might_need_garbage_collect_ = true;
+      }
+    }
+    for (auto& child : shared_children) {
+      std::string rel_fname;
+      if (with_checksum) {
+        rel_fname = GetSharedFileWithChecksumRel(child);
+      } else {
+        rel_fname = GetSharedFileRel(child);
+      }
+      auto child_itr = backuped_file_infos_.find(rel_fname);
+      // if it's not refcounted, delete it
+      if (child_itr == backuped_file_infos_.end() ||
+          child_itr->second->refs == 0) {
+        // this might be a directory, but DeleteFile will just fail in that
+        // case, so we're good
+        IOStatus io_s = backup_fs_->DeleteFile(GetAbsolutePath(rel_fname),
+                                               io_options_, nullptr);
+        ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s",
+                       rel_fname.c_str(), io_s.ToString().c_str());
+        backuped_file_infos_.erase(rel_fname);
+        if (!io_s.ok()) {
+          // Trying again later might work
+          might_need_garbage_collect_ = true;
+        }
+      }
+    }
+  }
+
+  // delete obsolete private files
+  std::vector<std::string> private_children;
+  {
+    IOStatus io_s =
+        backup_fs_->GetChildren(GetAbsolutePath(kPrivateDirName), io_options_,
+                                &private_children, nullptr);
+    if (!io_s.ok()) {
+      overall_status = io_s;
+      // Trying again later might work
+      might_need_garbage_collect_ = true;
+    }
+  }
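+  // Illustrative walk-through of the loop below: a child named "7" is kept
+  // while backup 7 is still in backups_, and deleted once backup 7 is gone;
+  // a child named "7.tmp" (leftover from an interrupted backup) is always
+  // deleted; a child that does not parse as a number is left alone.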
+  for (auto& child : private_children) {
+    BackupID backup_id = 0;
+    bool tmp_dir = child.find(".tmp") != std::string::npos;
+    sscanf(child.c_str(), "%u", &backup_id);
+    if (!tmp_dir &&  // if it's tmp_dir, delete it
+        (backup_id == 0 || backups_.find(backup_id) != backups_.end())) {
+      // it's either not a number or it's still alive. continue
+      continue;
+    }
+    // here we have to delete the dir and all its children
+    std::string full_private_path =
+        GetAbsolutePath(GetPrivateFileRel(backup_id));
+    std::vector<std::string> subchildren;
+    if (backup_fs_
+            ->GetChildren(full_private_path, io_options_, &subchildren, nullptr)
+            .ok()) {
+      for (auto& subchild : subchildren) {
+        IOStatus io_s = backup_fs_->DeleteFile(full_private_path + subchild,
+                                               io_options_, nullptr);
+        ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s",
+                       (full_private_path + subchild).c_str(),
+                       io_s.ToString().c_str());
+        if (!io_s.ok()) {
+          // Trying again later might work
+          might_need_garbage_collect_ = true;
+        }
+      }
+    }
+    // finally delete the private dir
+    IOStatus io_s =
+        backup_fs_->DeleteDir(full_private_path, io_options_, nullptr);
+    ROCKS_LOG_INFO(options_.info_log, "Deleting dir %s -- %s",
+                   full_private_path.c_str(), io_s.ToString().c_str());
+    if (!io_s.ok()) {
+      // Trying again later might work
+      might_need_garbage_collect_ = true;
+    }
+  }
+
+  assert(overall_status.ok() || might_need_garbage_collect_);
+  return overall_status;
+}
+
+// ------- BackupMeta class --------
+
+IOStatus BackupEngineImpl::BackupMeta::AddFile(
+    std::shared_ptr<FileInfo> file_info) {
+  auto itr = file_infos_->find(file_info->filename);
+  if (itr == file_infos_->end()) {
+    auto ret = file_infos_->insert({file_info->filename, file_info});
+    if (ret.second) {
+      itr = ret.first;
+      itr->second->refs = 1;
+    } else {
+      // if this happens, something is seriously wrong
+      return IOStatus::Corruption("In memory metadata insertion error");
+    }
+  } else {
+    // Compare sizes, because we scanned that off the filesystem on both
+    // ends. This is like a check in VerifyBackup.
+    if (itr->second->size != file_info->size) {
+      std::string msg = "Size mismatch for existing backup file: ";
+      msg.append(file_info->filename);
+      msg.append(" Size in backup is " + std::to_string(itr->second->size) +
+                 " while size in DB is " + std::to_string(file_info->size));
+      msg.append(
+          " If this DB file checks as not corrupt, try deleting old"
+          " backups or backing up to a different backup directory.");
+      return IOStatus::Corruption(msg);
+    }
+    if (file_info->checksum_hex.empty()) {
+      // No checksum available to check
+    } else if (itr->second->checksum_hex.empty()) {
+      // Remember checksum if newly acquired
+      itr->second->checksum_hex = file_info->checksum_hex;
+    } else if (itr->second->checksum_hex != file_info->checksum_hex) {
+      // Note: to save I/O, these will be equal trivially on already backed
+      // up files that don't have the checksum in their name. And it should
+      // never fail for files that do have checksum in their name.
+
+      // Should never reach here, but produce an appropriate corruption
+      // message in case we do in a release build.
+      assert(false);
+      std::string msg = "Checksum mismatch for existing backup file: ";
+      msg.append(file_info->filename);
+      msg.append(" Expected checksum is " + itr->second->checksum_hex +
+                 " while computed checksum is " + file_info->checksum_hex);
+      msg.append(
+          " If this DB file checks as not corrupt, try deleting old"
+          " backups or backing up to a different backup directory.");
+      return IOStatus::Corruption(msg);
+    }
+    ++itr->second->refs;  // increase refcount if already present
+  }
+
+  size_ += file_info->size;
+  files_.push_back(itr->second);
+
+  return IOStatus::OK();
+}
+
+IOStatus BackupEngineImpl::BackupMeta::Delete(bool delete_meta) {
+  IOStatus io_s;
+  for (const auto& file : files_) {
+    --file->refs;  // decrease refcount
+  }
+  files_.clear();
+  // delete meta file
+  if (delete_meta) {
+    io_s = fs_->FileExists(meta_filename_, iooptions_, nullptr);
+    if (io_s.ok()) {
+      io_s = fs_->DeleteFile(meta_filename_, iooptions_, nullptr);
+    } else if (io_s.IsNotFound()) {
+      io_s = IOStatus::OK();  // nothing to delete
+    }
+  }
+  timestamp_ = 0;
+  return io_s;
+}
+
+// Constants for backup meta file schema (see LoadFromFile)
+const std::string kSchemaVersionPrefix{"schema_version "};
+const std::string kFooterMarker{"// FOOTER"};
+
+const std::string kAppMetaDataFieldName{"metadata"};
+
+// WART: The checksums are crc32c but named "crc32"
+const std::string kFileCrc32cFieldName{"crc32"};
+const std::string kFileSizeFieldName{"size"};
+const std::string kTemperatureFieldName{"temp"};
+const std::string kExcludedFieldName{"ni::excluded"};
+
+// Marks a (future) field that should cause failure if not recognized.
+// Other fields are assumed to be ignorable. For example, in the future
+// we might add
+// ni::file_name_escape uri_percent
+// to indicate all file names have had spaces and special characters
+// escaped using a URI percent encoding.
+const std::string kNonIgnorableFieldPrefix{"ni::"};
+
+// Each backup meta file is of the format (schema version 1):
+//----------------------------------------------------------
+// <timestamp>
+// <seq number>
+// metadata <hex encoded metadata> (optional)
+// <number of files>
+// <file name> crc32 <crc32c value>
+// <file name> crc32 <crc32c value>
+// ...
+//----------------------------------------------------------
+//
+// For schema version 2.x:
+//----------------------------------------------------------
+// schema_version <ver>
+// <timestamp>
+// <seq number>
+// [<field name> <field data>]
+// ...
+// <number of files>
+// <file name>( <field name> <field data no space>)*
+// <file name>( <field name> <field data no space>)*
+// ...
+// [// FOOTER]
+// [<field name> <field data>]
+// ...
+//----------------------------------------------------------
+// where
+// <ver> ::= [0-9]+([.][0-9]+)
+// <field name> ::= [A-Za-z_][A-Za-z_0-9.]+
+// <field data> is anything but newline
+// <field data no space> is anything but space and newline
+// Although "// FOOTER" wouldn't strictly be required as a delimiter
+// given the number of files is included, it is there for parsing
+// sanity in case of corruption. It is only required if followed
+// by footer fields, such as a checksum of the meta file (so far).
+// Unrecognized fields are ignored, to support schema evolution on
+// non-critical features with forward compatibility. Update schema
+// major version for breaking changes. Schema minor versions are indicated
+// only for diagnostic/debugging purposes.
+//
+// Fields in schema version 2.0:
+// * Top-level meta fields:
+//   * Only "metadata" as in schema version 1
+// * File meta fields:
+//   * "crc32" - a crc32c checksum as in schema version 1
+//   * "size" - the size of the file (new)
+// * Footer meta fields:
+//   * None yet (future use for meta file checksum anticipated)
+//
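+// Illustrative example (not taken from a real backup) of a schema version 2
+// meta file that the parser below would accept:
+//   schema_version 2.1
+//   1731000000
+//   738274
+//   metadata 6261636B7570
+//   2
+//   shared_checksum/000010_3735928559_4096.sst crc32 3735928559 size 4096
+//   private/5/MANIFEST-000011 crc32 12345 size 2048
+// (LoadFromFile additionally checks each listed file against the sizes
+// scanned from the backup directory.)
+//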
+IOStatus BackupEngineImpl::BackupMeta::LoadFromFile(
+    const std::string& backup_dir,
+    const std::unordered_map<std::string, uint64_t>& abs_path_to_size,
+    RateLimiter* rate_limiter, Logger* info_log,
+    std::unordered_set<std::string>* reported_ignored_fields) {
+  assert(reported_ignored_fields);
+  assert(Empty());
+
+  std::unique_ptr<LineFileReader> backup_meta_reader;
+  {
+    IOStatus io_s = LineFileReader::Create(fs_, meta_filename_, FileOptions(),
+                                           &backup_meta_reader,
+                                           nullptr /* dbg */, rate_limiter);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+  }
+
+  // If we don't read an explicit schema_version, that implies version 1,
+  // which is what we call the original backup meta schema.
+  int schema_major_version = 1;
+
+  // Failures handled at the end
+  std::string line;
+  if (backup_meta_reader->ReadLine(&line,
+                                   Env::IO_LOW /* rate_limiter_priority */)) {
+    if (StartsWith(line, kSchemaVersionPrefix)) {
+      std::string ver = line.substr(kSchemaVersionPrefix.size());
+      if (ver == "2" || StartsWith(ver, "2.")) {
+        schema_major_version = 2;
+      } else {
+        return IOStatus::NotSupported(
+            "Unsupported/unrecognized schema version: " + ver);
+      }
+      line.clear();
+    } else if (line.empty()) {
+      return IOStatus::Corruption("Unexpected empty line");
+    }
+  }
+  if (!line.empty()) {
+    timestamp_ = std::strtoull(line.c_str(), nullptr, /*base*/ 10);
+  } else if (backup_meta_reader->ReadLine(
+                 &line, Env::IO_LOW /* rate_limiter_priority */)) {
+    timestamp_ = std::strtoull(line.c_str(), nullptr, /*base*/ 10);
+  }
+  if (backup_meta_reader->ReadLine(&line,
+                                   Env::IO_LOW /* rate_limiter_priority */)) {
+    sequence_number_ = std::strtoull(line.c_str(), nullptr, /*base*/ 10);
+  }
+  uint32_t num_files = UINT32_MAX;
+  while (backup_meta_reader->ReadLine(
+      &line, Env::IO_LOW /* rate_limiter_priority */)) {
+    if (line.empty()) {
+      return IOStatus::Corruption("Unexpected empty line");
+    }
+    // Number -> number of files -> exit loop reading optional meta fields
+    if (line[0] >= '0' && line[0] <= '9') {
+      num_files = static_cast<uint32_t>(strtoul(line.c_str(), nullptr, 10));
+      break;
+    }
+    // else, must be a meta field assignment
+    auto space_pos = line.find_first_of(' ');
+    if (space_pos == std::string::npos) {
+      return IOStatus::Corruption("Expected number of files or meta field");
+    }
+    std::string field_name = line.substr(0, space_pos);
+    std::string field_data = line.substr(space_pos + 1);
+    if (field_name == kAppMetaDataFieldName) {
+      // app metadata present
+      bool decode_success = Slice(field_data).DecodeHex(&app_metadata_);
+      if (!decode_success) {
+        return IOStatus::Corruption(
+            "Failed to decode stored hex encoded app metadata");
+      }
+    } else if (schema_major_version < 2) {
+      return IOStatus::Corruption("Expected number of files or \"" +
+                                  kAppMetaDataFieldName + "\" field");
+    } else if (StartsWith(field_name, kNonIgnorableFieldPrefix)) {
+      return IOStatus::NotSupported("Unrecognized non-ignorable meta field " +
+                                    field_name + " (from future version?)");
+    } else {
+      // Warn the first time we see any particular unrecognized meta field
+      if (reported_ignored_fields->insert("meta:" + field_name).second) {
+        ROCKS_LOG_WARN(info_log, "Ignoring unrecognized backup meta field %s",
+                       field_name.c_str());
+      }
+    }
+  }
+  std::vector<std::shared_ptr<FileInfo>> files;
+  bool footer_present = false;
+  while (backup_meta_reader->ReadLine(
+      &line, Env::IO_LOW /* rate_limiter_priority */)) {
+    std::vector<std::string> components = StringSplit(line, ' ');
+
+    if (components.size() < 1) {
+      return IOStatus::Corruption("Empty line instead of file entry.");
+    }
+    if (schema_major_version >= 2 && components.size() == 2 &&
+        line == kFooterMarker) {
+      footer_present = true;
+      break;
+    }
+
+    const std::string& filename = components[0];
+
+    if (schema_major_version >= 2) {
+      if (components.size() % 2 != 1) {
+        return IOStatus::Corruption(
+            "Bad number of line components for file entry.");
+      }
+    } else {
+      // Check restricted original schema
+      if (components.size() < 3) {
+        return IOStatus::Corruption("File checksum is missing for " + filename +
+                                    " in " + meta_filename_);
+      }
+      if (components[1] != kFileCrc32cFieldName) {
+        return IOStatus::Corruption("Unknown checksum type for " + filename +
+                                    " in " + meta_filename_);
+      }
+      if (components.size() > 3) {
+        return IOStatus::Corruption("Extra data for entry " + filename +
+                                    " in " + meta_filename_);
+      }
+    }
+
+    std::optional<uint64_t> expected_size{};
+    std::string checksum_hex;
+    Temperature temp = Temperature::kUnknown;
+    bool excluded = false;
+    for (unsigned i = 1; i < components.size(); i += 2) {
+      const std::string& field_name = components[i];
+      const std::string& field_data = components[i + 1];
+
+      if (field_name == kFileCrc32cFieldName) {
+        uint32_t checksum_value =
+            static_cast<uint32_t>(strtoul(field_data.c_str(), nullptr, 10));
+        if (field_data != std::to_string(checksum_value)) {
+          return IOStatus::Corruption("Invalid checksum value for " + filename +
+                                      " in " + meta_filename_);
+        }
+        checksum_hex = ChecksumInt32ToHex(checksum_value);
+      } else if (field_name == kFileSizeFieldName) {
+        expected_size = std::strtoull(field_data.c_str(), nullptr, /*base*/ 10);
+      } else if (field_name == kTemperatureFieldName) {
+        auto iter = temperature_string_map.find(field_data);
+        if (iter != temperature_string_map.end()) {
+          temp = iter->second;
+        } else {
+          // Could report corruption, but in case of new temperatures added
+          // in future, letting those map to kUnknown which should generally
+          // be safe.
+          temp = Temperature::kUnknown;
+        }
+      } else if (field_name == kExcludedFieldName) {
+        if (field_data == "true") {
+          excluded = true;
+        } else if (field_data == "false") {
+          excluded = false;
+        } else {
+          return IOStatus::NotSupported("Unrecognized value \"" + field_data +
+                                        "\" for field " + field_name);
+        }
+      } else if (StartsWith(field_name, kNonIgnorableFieldPrefix)) {
+        return IOStatus::NotSupported("Unrecognized non-ignorable file field " +
+                                      field_name + " (from future version?)");
+      } else {
+        // Warn the first time we see any particular unrecognized file field
+        if (reported_ignored_fields->insert("file:" + field_name).second) {
+          ROCKS_LOG_WARN(info_log, "Ignoring unrecognized backup file field %s",
+                         field_name.c_str());
+        }
+      }
+    }
+
+    if (excluded) {
+      excluded_files_.emplace_back(filename);
+    } else {
+      // Verify file exists, with expected size
+      std::string abs_path = backup_dir + "/" + filename;
+      auto e = abs_path_to_size.find(abs_path);
+      if (e == abs_path_to_size.end()) {
+        return IOStatus::Corruption(
+            "Pathname in meta file not found on disk: " + abs_path);
+      }
+      uint64_t actual_size = e->second;
+      if (expected_size.has_value() && *expected_size != actual_size) {
+        return IOStatus::Corruption("For file " + filename + " expected size " +
+                                    std::to_string(*expected_size) +
+                                    " but found size " +
+                                    std::to_string(actual_size));
+      }
+
+      // NOTE: FileInfo will be coalesced for sharing later (AddFile below)
+      files.emplace_back(
+          std::make_shared<FileInfo>(filename, actual_size, checksum_hex,
+                                     /*id*/ "", /*sid*/ "", temp));
+    }
+  }
+
+  if (footer_present) {
+    assert(schema_major_version >= 2);
+    while (backup_meta_reader->ReadLine(
+        &line, Env::IO_LOW /* rate_limiter_priority */)) {
+      if (line.empty()) {
+        return IOStatus::Corruption("Unexpected empty line");
+      }
+      auto space_pos = line.find_first_of(' ');
+      if (space_pos == std::string::npos) {
+        return IOStatus::Corruption("Expected footer field");
+      }
+      std::string field_name = line.substr(0, space_pos);
+      std::string field_data = line.substr(space_pos + 1);
+      if (StartsWith(field_name, kNonIgnorableFieldPrefix)) {
+        return IOStatus::NotSupported("Unrecognized non-ignorable field " +
+                                      field_name + " (from future version?)");
+      } else if (reported_ignored_fields->insert("footer:" + field_name)
+                     .second) {
+        // Warn the first time we see any particular unrecognized footer field
+        ROCKS_LOG_WARN(info_log,
+                       "Ignoring unrecognized backup meta footer field %s",
+                       field_name.c_str());
+      }
+    }
+  }
+
+  {
+    IOStatus io_s = backup_meta_reader->GetStatus();
+    if (!io_s.ok()) {
+      return io_s;
+    }
+  }
+
+  if (num_files != files.size()) {
+    return IOStatus::Corruption(
+        "Inconsistent number of files or missing/incomplete header in " +
+        meta_filename_);
+  }
+
+  files_.reserve(files.size());
+  for (const auto& file_info : files) {
+    IOStatus io_s = AddFile(file_info);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+  }
+
+  return IOStatus::OK();
+}
+
+const std::vector<std::string> minor_version_strings{
+    "",  // invalid major version 0
+    "",  // implicit major version 1
+    "2.1",
+};
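+
+// So a caller passing schema_version 1 to StoreToFile below writes no
+// "schema_version" line at all (the implicit original schema), while
+// schema_version 2 is written out as "schema_version 2.1", the current minor
+// revision of the v2 format.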
supported"); + } + std::string ver = minor_version_strings[schema_version]; + + // Need schema_version >= 2 for TEST_BackupMetaSchemaOptions + assert(schema_version >= 2 || schema_test_options == nullptr); + + IOStatus io_s; + std::unique_ptr backup_meta_file; + FileOptions file_options; + file_options.use_mmap_writes = false; + file_options.use_direct_writes = false; + io_s = fs_->NewWritableFile(meta_tmp_filename_, file_options, + &backup_meta_file, nullptr); + if (!io_s.ok()) { + return io_s; + } + + std::ostringstream buf; + if (schema_test_options) { + // override for testing + ver = schema_test_options->version; + } + if (!ver.empty()) { + assert(schema_version >= 2); + buf << kSchemaVersionPrefix << ver << "\n"; + } + buf << static_cast(timestamp_) << "\n"; + buf << sequence_number_ << "\n"; + + if (!app_metadata_.empty()) { + std::string hex_encoded_metadata = + Slice(app_metadata_).ToString(/* hex */ true); + buf << kAppMetaDataFieldName << " " << hex_encoded_metadata << "\n"; + } + if (schema_test_options) { + for (auto& e : schema_test_options->meta_fields) { + buf << e.first << " " << e.second << "\n"; + } + } + buf << files_.size() << "\n"; + + for (const auto& file : files_) { + buf << file->filename; + if (schema_test_options == nullptr || + schema_test_options->crc32c_checksums) { + // use crc32c for now, switch to something else if needed + buf << " " << kFileCrc32cFieldName << " " + << ChecksumHexToInt32(file->checksum_hex); + } + if (schema_version >= 2 && file->temp != Temperature::kUnknown) { + buf << " " << kTemperatureFieldName << " " + << temperature_to_string[file->temp]; + } + if (schema_test_options && schema_test_options->file_sizes) { + buf << " " << kFileSizeFieldName << " " << std::to_string(file->size); + } + if (schema_test_options) { + for (auto& e : schema_test_options->file_fields) { + buf << " " << e.first << " " << e.second; + } + } + buf << "\n"; + } + + for (const auto& file : excluded_files_) { + assert(schema_version >= 2); + buf << file.relative_file << " " << kExcludedFieldName << " true\n"; + } + + if (schema_test_options && !schema_test_options->footer_fields.empty()) { + buf << kFooterMarker << "\n"; + for (auto& e : schema_test_options->footer_fields) { + buf << e.first << " " << e.second << "\n"; + } + } + + io_s = backup_meta_file->Append(Slice(buf.str()), iooptions_, nullptr); + IOSTATS_ADD(bytes_written, buf.str().size()); + if (io_s.ok() && sync) { + io_s = backup_meta_file->Sync(iooptions_, nullptr); + } + if (io_s.ok()) { + io_s = backup_meta_file->Close(iooptions_, nullptr); + } + if (io_s.ok()) { + io_s = fs_->RenameFile(meta_tmp_filename_, meta_filename_, iooptions_, + nullptr); + } + return io_s; +} +} // namespace + +IOStatus BackupEngineReadOnly::Open(const BackupEngineOptions& options, + Env* env, + BackupEngineReadOnly** backup_engine_ptr) { + if (options.destroy_old_data) { + return IOStatus::InvalidArgument( + "Can't destroy old data with ReadOnly BackupEngine"); + } + std::unique_ptr backup_engine( + new BackupEngineImplThreadSafe(options, env, true /*read_only*/)); + auto s = backup_engine->Initialize(); + if (!s.ok()) { + *backup_engine_ptr = nullptr; + return s; + } + *backup_engine_ptr = backup_engine.release(); + return IOStatus::OK(); +} + +void TEST_SetBackupMetaSchemaOptions( + BackupEngine* engine, const TEST_BackupMetaSchemaOptions& options) { + BackupEngineImplThreadSafe* impl = + static_cast_with_check(engine); + impl->TEST_SetBackupMetaSchemaOptions(options); +} + +void TEST_SetDefaultRateLimitersClock( + 
+
+void TEST_SetBackupMetaSchemaOptions(
+    BackupEngine* engine, const TEST_BackupMetaSchemaOptions& options) {
+  BackupEngineImplThreadSafe* impl =
+      static_cast_with_check<BackupEngineImplThreadSafe>(engine);
+  impl->TEST_SetBackupMetaSchemaOptions(options);
+}
+
+void TEST_SetDefaultRateLimitersClock(
+    BackupEngine* engine,
+    const std::shared_ptr<SystemClock>& backup_rate_limiter_clock,
+    const std::shared_ptr<SystemClock>& restore_rate_limiter_clock) {
+  BackupEngineImplThreadSafe* impl =
+      static_cast_with_check<BackupEngineImplThreadSafe>(engine);
+  impl->TEST_SetDefaultRateLimitersClock(backup_rate_limiter_clock,
+                                         restore_rate_limiter_clock);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/backup/backup_engine_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/backup/backup_engine_impl.h
new file mode 100644
index 0000000000..a764b34d9b
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/backup/backup_engine_impl.h
@@ -0,0 +1,34 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/utilities/backup_engine.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct TEST_BackupMetaSchemaOptions {
+  std::string version = "2";
+  bool crc32c_checksums = false;
+  bool file_sizes = true;
+  std::map<std::string, std::string> meta_fields;
+  std::map<std::string, std::string> file_fields;
+  std::map<std::string, std::string> footer_fields;
+};
+
+// Modifies the BackupEngine(Impl) to write backup meta files using the
+// unpublished schema version 2, for the life of this object (not backup_dir).
+// TEST_BackupMetaSchemaOptions offers some customization for testing.
+void TEST_SetBackupMetaSchemaOptions(
+    BackupEngine* engine, const TEST_BackupMetaSchemaOptions& options);
+
+// Modifies the BackupEngine(Impl) to use specified clocks for backup and
+// restore rate limiters created by default if not specified by users, for
+// test speedup.
+void TEST_SetDefaultRateLimitersClock(
+    BackupEngine* engine,
+    const std::shared_ptr<SystemClock>& backup_rate_limiter_clock = nullptr,
+    const std::shared_ptr<SystemClock>& restore_rate_limiter_clock = nullptr);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_compaction_filter.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_compaction_filter.cc
new file mode 100644
index 0000000000..ddaa98c7d3
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_compaction_filter.cc
@@ -0,0 +1,488 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include "utilities/blob_db/blob_compaction_filter.h"
+
+#include <cinttypes>
+
+#include "db/dbformat.h"
+#include "logging/logging.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+BlobIndexCompactionFilterBase::~BlobIndexCompactionFilterBase() {
+  if (blob_file_) {
+    CloseAndRegisterNewBlobFile();
+  }
+  RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, expired_count_);
+  RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, expired_size_);
+  RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_COUNT, evicted_count_);
+  RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_SIZE, evicted_size_);
+}
+
+CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
+    int level, const Slice& key, ValueType value_type, const Slice& value,
+    std::string* new_value, std::string* skip_until) const {
+  const CompactionFilter* ucf = user_comp_filter();
+  if (value_type != kBlobIndex) {
+    if (ucf == nullptr) {
+      return Decision::kKeep;
+    }
+    // Apply user compaction filter for inlined data.
+    CompactionFilter::Decision decision =
+        ucf->FilterV2(level, key, value_type, value, new_value, skip_until);
+    if (decision == Decision::kChangeValue) {
+      return HandleValueChange(key, new_value);
+    }
+    return decision;
+  }
+  BlobIndex blob_index;
+  Status s = blob_index.DecodeFrom(value);
+  if (!s.ok()) {
+    // Unable to decode blob index. Keeping the value.
+    return Decision::kKeep;
+  }
+  if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) {
+    // Expired
+    expired_count_++;
+    expired_size_ += key.size() + value.size();
+    return Decision::kRemove;
+  }
+  if (!blob_index.IsInlined() &&
+      blob_index.file_number() < context_.next_file_number &&
+      context_.current_blob_files.count(blob_index.file_number()) == 0) {
+    // Corresponding blob file gone (most likely, evicted by FIFO eviction).
+    evicted_count_++;
+    evicted_size_ += key.size() + value.size();
+    return Decision::kRemove;
+  }
+  if (context_.fifo_eviction_seq > 0 && blob_index.HasTTL() &&
+      blob_index.expiration() < context_.evict_expiration_up_to) {
+    // Hack: Internal key is passed to BlobIndexCompactionFilter for it to
+    // get sequence number.
+    ParsedInternalKey ikey;
+    if (!ParseInternalKey(
+             key, &ikey,
+             context_.blob_db_impl->db_options_.allow_data_in_errors)
+             .ok()) {
+      assert(false);
+      return Decision::kKeep;
+    }
+    // Remove keys that could have been removed by the last FIFO eviction.
+    // If we get an error while parsing the key, ignore it and continue.
+    if (ikey.sequence < context_.fifo_eviction_seq) {
+      evicted_count_++;
+      evicted_size_ += key.size() + value.size();
+      return Decision::kRemove;
+    }
+  }
+  // Apply user compaction filter for all non-TTL blob data.
+  if (ucf != nullptr && !blob_index.HasTTL()) {
+    // Hack: Internal key is passed to BlobIndexCompactionFilter for it to
+    // get sequence number.
+    ParsedInternalKey ikey;
+    if (!ParseInternalKey(
+             key, &ikey,
+             context_.blob_db_impl->db_options_.allow_data_in_errors)
+             .ok()) {
+      assert(false);
+      return Decision::kKeep;
+    }
+    // Read value from blob file.
+    PinnableSlice blob;
+    CompressionType compression_type = kNoCompression;
+    constexpr bool need_decompress = true;
+    if (!ReadBlobFromOldFile(ikey.user_key, blob_index, &blob, need_decompress,
+                             &compression_type)) {
+      return Decision::kIOError;
+    }
+    CompactionFilter::Decision decision = ucf->FilterV2(
+        level, ikey.user_key, kValue, blob, new_value, skip_until);
+    if (decision == Decision::kChangeValue) {
+      return HandleValueChange(ikey.user_key, new_value);
+    }
+    return decision;
+  }
+  return Decision::kKeep;
+}
+
+CompactionFilter::Decision BlobIndexCompactionFilterBase::HandleValueChange(
+    const Slice& key, std::string* new_value) const {
+  BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+  assert(blob_db_impl);
+
+  if (new_value->size() < blob_db_impl->bdb_options_.min_blob_size) {
+    // Keep new_value inlined.
+    return Decision::kChangeValue;
+  }
+  if (!OpenNewBlobFileIfNeeded()) {
+    return Decision::kIOError;
+  }
+  Slice new_blob_value(*new_value);
+  std::string compression_output;
+  if (blob_db_impl->bdb_options_.compression != kNoCompression) {
+    new_blob_value =
+        blob_db_impl->GetCompressedSlice(new_blob_value, &compression_output);
+  }
+  uint64_t new_blob_file_number = 0;
+  uint64_t new_blob_offset = 0;
+  if (!WriteBlobToNewFile(key, new_blob_value, &new_blob_file_number,
+                          &new_blob_offset)) {
+    return Decision::kIOError;
+  }
+  if (!CloseAndRegisterNewBlobFileIfNeeded()) {
+    return Decision::kIOError;
+  }
+  BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset,
+                        new_blob_value.size(),
+                        blob_db_impl->bdb_options_.compression);
+  return Decision::kChangeBlobIndex;
+}
+
"successfully" : "with failure", + gc_stats_.AllBlobs(), gc_stats_.AllBytes(), + gc_stats_.RelocatedBlobs(), gc_stats_.RelocatedBytes(), + gc_stats_.NewFiles()); + + RecordTick(statistics(), BLOB_DB_GC_NUM_KEYS_RELOCATED, + gc_stats_.RelocatedBlobs()); + RecordTick(statistics(), BLOB_DB_GC_BYTES_RELOCATED, + gc_stats_.RelocatedBytes()); + RecordTick(statistics(), BLOB_DB_GC_NUM_NEW_FILES, gc_stats_.NewFiles()); + RecordTick(statistics(), BLOB_DB_GC_FAILURES, gc_stats_.HasError()); +} + +bool BlobIndexCompactionFilterBase::IsBlobFileOpened() const { + if (blob_file_) { + assert(writer_); + return true; + } + return false; +} + +bool BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded() const { + if (IsBlobFileOpened()) { + return true; + } + + BlobDBImpl* const blob_db_impl = context_.blob_db_impl; + assert(blob_db_impl); + + const Status s = blob_db_impl->CreateBlobFileAndWriter( + /* has_ttl */ false, ExpirationRange(), "compaction/GC", &blob_file_, + &writer_); + if (!s.ok()) { + ROCKS_LOG_ERROR( + blob_db_impl->db_options_.info_log, + "Error opening new blob file during compaction/GC, status: %s", + s.ToString().c_str()); + blob_file_.reset(); + writer_.reset(); + return false; + } + + assert(blob_file_); + assert(writer_); + + return true; +} + +bool BlobIndexCompactionFilterBase::ReadBlobFromOldFile( + const Slice& key, const BlobIndex& blob_index, PinnableSlice* blob, + bool need_decompress, CompressionType* compression_type) const { + BlobDBImpl* const blob_db_impl = context_.blob_db_impl; + assert(blob_db_impl); + + Status s = blob_db_impl->GetRawBlobFromFile( + key, blob_index.file_number(), blob_index.offset(), blob_index.size(), + blob, compression_type); + + if (!s.ok()) { + ROCKS_LOG_ERROR( + blob_db_impl->db_options_.info_log, + "Error reading blob during compaction/GC, key: %s (%s), status: %s", + key.ToString(/* output_hex */ true).c_str(), + blob_index.DebugString(/* output_hex */ true).c_str(), + s.ToString().c_str()); + + return false; + } + + if (need_decompress && *compression_type != kNoCompression) { + s = blob_db_impl->DecompressSlice(*blob, *compression_type, blob); + if (!s.ok()) { + ROCKS_LOG_ERROR( + blob_db_impl->db_options_.info_log, + "Uncompression error during blob read from file: %" PRIu64 + " blob_offset: %" PRIu64 " blob_size: %" PRIu64 + " key: %s status: '%s'", + blob_index.file_number(), blob_index.offset(), blob_index.size(), + key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str()); + + return false; + } + } + + return true; +} + +bool BlobIndexCompactionFilterBase::WriteBlobToNewFile( + const Slice& key, const Slice& blob, uint64_t* new_blob_file_number, + uint64_t* new_blob_offset) const { + TEST_SYNC_POINT("BlobIndexCompactionFilterBase::WriteBlobToNewFile"); + assert(new_blob_file_number); + assert(new_blob_offset); + + assert(blob_file_); + *new_blob_file_number = blob_file_->BlobFileNumber(); + + assert(writer_); + uint64_t new_key_offset = 0; + const Status s = writer_->AddRecord(key, blob, kNoExpiration, &new_key_offset, + new_blob_offset); + + if (!s.ok()) { + const BlobDBImpl* const blob_db_impl = context_.blob_db_impl; + assert(blob_db_impl); + + ROCKS_LOG_ERROR(blob_db_impl->db_options_.info_log, + "Error writing blob to new file %s during compaction/GC, " + "key: %s, status: %s", + blob_file_->PathName().c_str(), + key.ToString(/* output_hex */ true).c_str(), + s.ToString().c_str()); + return false; + } + + const uint64_t new_size = + BlobLogRecord::kHeaderSize + key.size() + blob.size(); + 
+  blob_file_->BlobRecordAdded(new_size);
+
+  BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+  assert(blob_db_impl);
+
+  blob_db_impl->total_blob_size_ += new_size;
+
+  return true;
+}
+
+bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFileIfNeeded()
+    const {
+  const BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+  assert(blob_db_impl);
+
+  assert(blob_file_);
+  if (blob_file_->GetFileSize() < blob_db_impl->bdb_options_.blob_file_size) {
+    return true;
+  }
+
+  return CloseAndRegisterNewBlobFile();
+}
+
+bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFile() const {
+  BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+  assert(blob_db_impl);
+  assert(blob_file_);
+
+  Status s;
+
+  {
+    WriteLock wl(&blob_db_impl->mutex_);
+
+    s = blob_db_impl->CloseBlobFile(blob_file_);
+
+    // Note: we delay registering the new blob file until it's closed to
+    // prevent FIFO eviction from processing it during compaction/GC.
+    blob_db_impl->RegisterBlobFile(blob_file_);
+  }
+
+  assert(blob_file_->Immutable());
+
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(
+        blob_db_impl->db_options_.info_log,
+        "Error closing new blob file %s during compaction/GC, status: %s",
+        blob_file_->PathName().c_str(), s.ToString().c_str());
+  }
+
+  blob_file_.reset();
+  return s.ok();
+}
+
+CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput(
+    const Slice& key, const Slice& existing_value,
+    std::string* new_value) const {
+  assert(new_value);
+
+  const BlobDBImpl* const blob_db_impl = context().blob_db_impl;
+  (void)blob_db_impl;
+
+  assert(blob_db_impl);
+  assert(blob_db_impl->bdb_options_.enable_garbage_collection);
+
+  BlobIndex blob_index;
+  const Status s = blob_index.DecodeFrom(existing_value);
+  if (!s.ok()) {
+    gc_stats_.SetError();
+    return BlobDecision::kCorruption;
+  }
+
+  if (blob_index.IsInlined()) {
+    gc_stats_.AddBlob(blob_index.value().size());
+
+    return BlobDecision::kKeep;
+  }
+
+  gc_stats_.AddBlob(blob_index.size());
+
+  if (blob_index.HasTTL()) {
+    return BlobDecision::kKeep;
+  }
+
+  if (blob_index.file_number() >= context_gc_.cutoff_file_number) {
+    return BlobDecision::kKeep;
+  }
+
+  // Note: each compaction generates its own blob files, which, depending on
+  // the workload, might result in many small blob files. The total number of
+  // files is bounded though (determined by the number of compactions and the
+  // blob file size option).
+  if (!OpenNewBlobFileIfNeeded()) {
+    gc_stats_.SetError();
+    return BlobDecision::kIOError;
+  }
+
+  PinnableSlice blob;
+  CompressionType compression_type = kNoCompression;
+  std::string compression_output;
+  if (!ReadBlobFromOldFile(key, blob_index, &blob, false, &compression_type)) {
+    gc_stats_.SetError();
+    return BlobDecision::kIOError;
+  }
+
+  // If the compression_type is changed, re-compress it with the new
+  // compression type.
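+  // For instance (illustrative): a blob originally written while the DB used
+  // Snappy is decompressed and re-compressed here if
+  // bdb_options_.compression has since been changed to, say, LZ4, so
+  // relocated blobs always match the currently configured compression.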
+  if (compression_type != blob_db_impl->bdb_options_.compression) {
+    if (compression_type != kNoCompression) {
+      const Status status =
+          blob_db_impl->DecompressSlice(blob, compression_type, &blob);
+      if (!status.ok()) {
+        gc_stats_.SetError();
+        return BlobDecision::kCorruption;
+      }
+    }
+    if (blob_db_impl->bdb_options_.compression != kNoCompression) {
+      blob_db_impl->GetCompressedSlice(blob, &compression_output);
+      blob = PinnableSlice(&compression_output);
+      blob.PinSelf();
+    }
+    // Record the compression type actually applied so the new blob index
+    // encoded below is consistent with the file contents.
+    compression_type = blob_db_impl->bdb_options_.compression;
+  }
+
+  uint64_t new_blob_file_number = 0;
+  uint64_t new_blob_offset = 0;
+  if (!WriteBlobToNewFile(key, blob, &new_blob_file_number,
+                          &new_blob_offset)) {
+    gc_stats_.SetError();
+    return BlobDecision::kIOError;
+  }
+
+  if (!CloseAndRegisterNewBlobFileIfNeeded()) {
+    gc_stats_.SetError();
+    return BlobDecision::kIOError;
+  }
+
+  BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset,
+                        blob.size(), compression_type);
+
+  gc_stats_.AddRelocatedBlob(blob_index.size());
+
+  return BlobDecision::kChangeValue;
+}
+
+bool BlobIndexCompactionFilterGC::OpenNewBlobFileIfNeeded() const {
+  if (IsBlobFileOpened()) {
+    return true;
+  }
+  bool result = BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded();
+  if (result) {
+    gc_stats_.AddNewFile();
+  }
+  return result;
+}
+
+std::unique_ptr<CompactionFilter>
+BlobIndexCompactionFilterFactoryBase::CreateUserCompactionFilterFromFactory(
+    const CompactionFilter::Context& context) const {
+  std::unique_ptr<CompactionFilter> user_comp_filter_from_factory;
+  if (user_comp_filter_factory_) {
+    user_comp_filter_from_factory =
+        user_comp_filter_factory_->CreateCompactionFilter(context);
+  }
+  return user_comp_filter_from_factory;
+}
+
+std::unique_ptr<CompactionFilter>
+BlobIndexCompactionFilterFactory::CreateCompactionFilter(
+    const CompactionFilter::Context& _context) {
+  assert(clock());
+
+  int64_t current_time = 0;
+  Status s = clock()->GetCurrentTime(&current_time);
+  if (!s.ok()) {
+    return nullptr;
+  }
+  assert(current_time >= 0);
+
+  assert(blob_db_impl());
+
+  BlobCompactionContext context;
+  blob_db_impl()->GetCompactionContext(&context);
+
+  std::unique_ptr<CompactionFilter> user_comp_filter_from_factory =
+      CreateUserCompactionFilterFromFactory(_context);
+
+  return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilter(
+      std::move(context), user_comp_filter(),
+      std::move(user_comp_filter_from_factory), current_time, statistics()));
+}
+
+std::unique_ptr<CompactionFilter>
+BlobIndexCompactionFilterFactoryGC::CreateCompactionFilter(
+    const CompactionFilter::Context& _context) {
+  assert(clock());
+
+  int64_t current_time = 0;
+  Status s = clock()->GetCurrentTime(&current_time);
+  if (!s.ok()) {
+    return nullptr;
+  }
+  assert(current_time >= 0);
+
+  assert(blob_db_impl());
+
+  BlobCompactionContext context;
+  BlobCompactionContextGC context_gc;
+  blob_db_impl()->GetCompactionContext(&context, &context_gc);
+
+  std::unique_ptr<CompactionFilter> user_comp_filter_from_factory =
+      CreateUserCompactionFilterFromFactory(_context);
+
+  return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilterGC(
+      std::move(context), std::move(context_gc), user_comp_filter(),
+      std::move(user_comp_filter_from_factory), current_time, statistics()));
+}
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_compaction_filter.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_compaction_filter.h
new file mode 100644
index 0000000000..cb83d0d034
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_compaction_filter.h
@@ -0,0 +1,202 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <unordered_set>
+
+#include "db/blob/blob_index.h"
+#include "monitoring/statistics_impl.h"
+#include "rocksdb/compaction_filter.h"
+#include "utilities/blob_db/blob_db_gc_stats.h"
+#include "utilities/blob_db/blob_db_impl.h"
+#include "utilities/compaction_filters/layered_compaction_filter_base.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+namespace blob_db {
+
+struct BlobCompactionContext {
+  BlobDBImpl* blob_db_impl = nullptr;
+  uint64_t next_file_number = 0;
+  std::unordered_set<uint64_t> current_blob_files;
+  SequenceNumber fifo_eviction_seq = 0;
+  uint64_t evict_expiration_up_to = 0;
+};
+
+struct BlobCompactionContextGC {
+  uint64_t cutoff_file_number = 0;
+};
+
+// Compaction filter that deletes expired blob indexes from the base DB.
+// Comes in two varieties, one for the non-GC case and one for the GC case.
+class BlobIndexCompactionFilterBase : public LayeredCompactionFilterBase {
+ public:
+  BlobIndexCompactionFilterBase(
+      BlobCompactionContext&& _context,
+      const CompactionFilter* _user_comp_filter,
+      std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory,
+      uint64_t current_time, Statistics* stats)
+      : LayeredCompactionFilterBase(_user_comp_filter,
+                                    std::move(_user_comp_filter_from_factory)),
+        context_(std::move(_context)),
+        current_time_(current_time),
+        statistics_(stats) {}
+
+  ~BlobIndexCompactionFilterBase() override;
+
+  // Filter expired blob indexes regardless of snapshots.
+  bool IgnoreSnapshots() const override { return true; }
+
+  Decision FilterV2(int level, const Slice& key, ValueType value_type,
+                    const Slice& value, std::string* new_value,
+                    std::string* skip_until) const override;
+
+  bool IsStackedBlobDbInternalCompactionFilter() const override { return true; }
+
+ protected:
+  bool IsBlobFileOpened() const;
+  virtual bool OpenNewBlobFileIfNeeded() const;
+  bool ReadBlobFromOldFile(const Slice& key, const BlobIndex& blob_index,
+                           PinnableSlice* blob, bool need_decompress,
+                           CompressionType* compression_type) const;
+  bool WriteBlobToNewFile(const Slice& key, const Slice& blob,
+                          uint64_t* new_blob_file_number,
+                          uint64_t* new_blob_offset) const;
+  bool CloseAndRegisterNewBlobFileIfNeeded() const;
+  bool CloseAndRegisterNewBlobFile() const;
+
+  Statistics* statistics() const { return statistics_; }
+  const BlobCompactionContext& context() const { return context_; }
+
+ private:
+  Decision HandleValueChange(const Slice& key, std::string* new_value) const;
+
+ private:
+  BlobCompactionContext context_;
+  const uint64_t current_time_;
+  Statistics* statistics_;
+
+  mutable std::shared_ptr<BlobFile> blob_file_;
+  mutable std::shared_ptr<BlobLogWriter> writer_;
+
+  // It is safe to not use std::atomic since the compaction filter, created
+  // from a compaction filter factory, will not be called from multiple
+  // threads.
+  mutable uint64_t expired_count_ = 0;
+  mutable uint64_t expired_size_ = 0;
+  mutable uint64_t evicted_count_ = 0;
+  mutable uint64_t evicted_size_ = 0;
+};
+
+class BlobIndexCompactionFilter : public BlobIndexCompactionFilterBase {
+ public:
+  BlobIndexCompactionFilter(
+      BlobCompactionContext&& _context,
+      const CompactionFilter* _user_comp_filter,
+      std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory,
+      uint64_t current_time, Statistics* stats)
+      : BlobIndexCompactionFilterBase(std::move(_context), _user_comp_filter,
+                                      std::move(_user_comp_filter_from_factory),
+                                      current_time, stats) {}
+
+  const char* Name() const override { return "BlobIndexCompactionFilter"; }
+};
+
+class BlobIndexCompactionFilterGC : public BlobIndexCompactionFilterBase {
+ public:
+  BlobIndexCompactionFilterGC(
+      BlobCompactionContext&& _context, BlobCompactionContextGC&& context_gc,
+      const CompactionFilter* _user_comp_filter,
+      std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory,
+      uint64_t current_time, Statistics* stats)
+      : BlobIndexCompactionFilterBase(std::move(_context), _user_comp_filter,
+                                      std::move(_user_comp_filter_from_factory),
+                                      current_time, stats),
+        context_gc_(std::move(context_gc)) {}
+
+  ~BlobIndexCompactionFilterGC() override;
+
+  const char* Name() const override { return "BlobIndexCompactionFilterGC"; }
+
+  BlobDecision PrepareBlobOutput(const Slice& key, const Slice& existing_value,
+                                 std::string* new_value) const override;
+
+ private:
+  bool OpenNewBlobFileIfNeeded() const override;
+
+ private:
+  BlobCompactionContextGC context_gc_;
+  mutable BlobDBGarbageCollectionStats gc_stats_;
+};
+
+// Compaction filter factory; similarly to the filters above, it comes
+// in two flavors, one that creates filters that support GC, and one
+// that creates non-GC filters.
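+// (The GC flavor is expected to be the one BlobDBImpl installs when
+// bdb_options_.enable_garbage_collection is true, and the non-GC flavor
+// otherwise.)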
+class BlobIndexCompactionFilterFactoryBase : public CompactionFilterFactory {
+ public:
+  BlobIndexCompactionFilterFactoryBase(BlobDBImpl* _blob_db_impl,
+                                       SystemClock* _clock,
+                                       const ColumnFamilyOptions& _cf_options,
+                                       Statistics* _statistics)
+      : blob_db_impl_(_blob_db_impl),
+        clock_(_clock),
+        statistics_(_statistics),
+        user_comp_filter_(_cf_options.compaction_filter),
+        user_comp_filter_factory_(_cf_options.compaction_filter_factory) {}
+
+ protected:
+  std::unique_ptr<CompactionFilter> CreateUserCompactionFilterFromFactory(
+      const CompactionFilter::Context& context) const;
+
+  BlobDBImpl* blob_db_impl() const { return blob_db_impl_; }
+  SystemClock* clock() const { return clock_; }
+  Statistics* statistics() const { return statistics_; }
+  const CompactionFilter* user_comp_filter() const { return user_comp_filter_; }
+
+ private:
+  BlobDBImpl* blob_db_impl_;
+  SystemClock* clock_;
+  Statistics* statistics_;
+  const CompactionFilter* user_comp_filter_;
+  std::shared_ptr<CompactionFilterFactory> user_comp_filter_factory_;
+};
+
+class BlobIndexCompactionFilterFactory
+    : public BlobIndexCompactionFilterFactoryBase {
+ public:
+  BlobIndexCompactionFilterFactory(BlobDBImpl* _blob_db_impl,
+                                   SystemClock* _clock,
+                                   const ColumnFamilyOptions& _cf_options,
+                                   Statistics* _statistics)
+      : BlobIndexCompactionFilterFactoryBase(_blob_db_impl, _clock, _cf_options,
+                                             _statistics) {}
+
+  const char* Name() const override {
+    return "BlobIndexCompactionFilterFactory";
+  }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override;
+};
+
+class BlobIndexCompactionFilterFactoryGC
+    : public BlobIndexCompactionFilterFactoryBase {
+ public:
+  BlobIndexCompactionFilterFactoryGC(BlobDBImpl* _blob_db_impl,
+                                     SystemClock* _clock,
+                                     const ColumnFamilyOptions& _cf_options,
+                                     Statistics* _statistics)
+      : BlobIndexCompactionFilterFactoryBase(_blob_db_impl, _clock, _cf_options,
+                                             _statistics) {}
+
+  const char* Name() const override {
+    return "BlobIndexCompactionFilterFactoryGC";
+  }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override;
+};
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db.cc
new file mode 100644
index 0000000000..b6fe039036
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db.cc
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "utilities/blob_db/blob_db.h"
+
+#include <cinttypes>
+
+#include "logging/logging.h"
+#include "utilities/blob_db/blob_db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+Status BlobDB::Open(const Options& options, const BlobDBOptions& bdb_options,
+                    const std::string& dbname, BlobDB** blob_db) {
+  *blob_db = nullptr;
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = BlobDB::Open(db_options, bdb_options, dbname, column_families,
+                          &handles, blob_db);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // We can delete the handle since DBImpl is always holding a reference to
+    // the default column family
+    delete handles[0];
+  }
+  return s;
+}
+
+Status BlobDB::Open(const DBOptions& db_options,
+                    const BlobDBOptions& bdb_options, const std::string& dbname,
+                    const std::vector<ColumnFamilyDescriptor>& column_families,
+                    std::vector<ColumnFamilyHandle*>* handles,
+                    BlobDB** blob_db) {
+  assert(handles);
+
+  if (column_families.size() != 1 ||
+      column_families[0].name != kDefaultColumnFamilyName) {
+    return Status::NotSupported(
+        "Blob DB doesn't support non-default column family.");
+  }
+
+  BlobDBImpl* blob_db_impl = new BlobDBImpl(dbname, bdb_options, db_options,
+                                            column_families[0].options);
+  Status s = blob_db_impl->Open(handles);
+  if (s.ok()) {
+    *blob_db = static_cast<BlobDB*>(blob_db_impl);
+  } else {
+    if (!handles->empty()) {
+      for (ColumnFamilyHandle* cfh : *handles) {
+        blob_db_impl->DestroyColumnFamilyHandle(cfh);
+      }
+
+      handles->clear();
+    }
+
+    delete blob_db_impl;
+    *blob_db = nullptr;
+  }
+  return s;
+}
+
+BlobDB::BlobDB() : StackableDB(nullptr) {}
+
+void BlobDBOptions::Dump(Logger* log) const {
+  ROCKS_LOG_HEADER(
+      log, "                                  BlobDBOptions.blob_dir: %s",
+      blob_dir.c_str());
+  ROCKS_LOG_HEADER(
+      log, "                             BlobDBOptions.path_relative: %d",
+      path_relative);
+  ROCKS_LOG_HEADER(
+      log, "                                   BlobDBOptions.is_fifo: %d",
+      is_fifo);
+  ROCKS_LOG_HEADER(
+      log, "                               BlobDBOptions.max_db_size: %" PRIu64,
+      max_db_size);
+  ROCKS_LOG_HEADER(
+      log, "                            BlobDBOptions.ttl_range_secs: %" PRIu64,
+      ttl_range_secs);
+  ROCKS_LOG_HEADER(
+      log, "                             BlobDBOptions.min_blob_size: %" PRIu64,
+      min_blob_size);
+  ROCKS_LOG_HEADER(
+      log, "                            BlobDBOptions.bytes_per_sync: %" PRIu64,
+      bytes_per_sync);
+  ROCKS_LOG_HEADER(
+      log, "                            BlobDBOptions.blob_file_size: %" PRIu64,
+      blob_file_size);
+  ROCKS_LOG_HEADER(
+      log, "                               BlobDBOptions.compression: %d",
+      static_cast<int>(compression));
+  ROCKS_LOG_HEADER(
+      log, "                 BlobDBOptions.enable_garbage_collection: %d",
+      enable_garbage_collection);
+  ROCKS_LOG_HEADER(
+      log, "                 BlobDBOptions.garbage_collection_cutoff: %f",
+      garbage_collection_cutoff);
+  ROCKS_LOG_HEADER(
+      log, "                  BlobDBOptions.disable_background_tasks: %d",
+      disable_background_tasks);
+}
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db.h
new file mode 100644
index 0000000000..e2f0b7bdbd
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db.h
@@ -0,0 +1,264 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+
+#include <functional>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace blob_db {
+
+// A wrapped database which puts the values of KV pairs in a separate log
+// and stores the location within the log in the underlying DB.
+//
+// The factory needs to be moved to include/rocksdb/utilities to allow
+// users to use blob DB.
+
+constexpr uint64_t kNoExpiration = std::numeric_limits<uint64_t>::max();
+
+struct BlobDBOptions {
+  // Name of the directory under the base DB where blobs will be stored. Using
+  // a directory where the base DB stores its SST files is not supported.
+  // Default is "blob_dir".
+  std::string blob_dir = "blob_dir";
+
+  // Whether the blob_dir path is relative or absolute.
+  bool path_relative = true;
+
+  // When max_db_size is reached, evict blob files to free up space
+  // instead of returning a NoSpace error on write. Blob files will be
+  // evicted from oldest to newest, based on file creation time.
+  bool is_fifo = false;
+
+  // Maximum size of the database (including SST files and blob files).
+  //
+  // Default: 0 (no limits)
+  uint64_t max_db_size = 0;
+
+  // A new TTL bucket (blob file) is opened for each ttl_range_secs window.
+  // So if ttl_range_secs is 600 seconds (10 minutes), and the first bucket
+  // starts at 1471542000, then the blob buckets will be:
+  // first bucket:  1471542000 - 1471542600
+  // second bucket: 1471542600 - 1471543200
+  // and so on
+  uint64_t ttl_range_secs = 3600;
+
+  // The smallest value to store in the blob log. Values smaller than this
+  // threshold will be inlined in the base DB together with the key.
+  uint64_t min_blob_size = 0;
+
+  // Allows the OS to incrementally sync blob files to disk for every
+  // bytes_per_sync bytes written. Users shouldn't rely on it for a
+  // persistence guarantee.
+  uint64_t bytes_per_sync = 512 * 1024;
+
+  // The target size of each blob file. A file becomes immutable
+  // after it exceeds that size.
+  uint64_t blob_file_size = 256 * 1024 * 1024;
+
+  // What compression to use for blobs.
+  CompressionType compression = kNoCompression;
+
+  // If enabled, BlobDB cleans up stale blobs in non-TTL files during compaction
+  // by rewriting the remaining live blobs to new files.
+  bool enable_garbage_collection = false;
+
+  // The cutoff in terms of blob file age for garbage collection. Blobs in
+  // the oldest N non-TTL blob files will be rewritten when encountered during
+  // compaction, where N = garbage_collection_cutoff * number_of_non_TTL_files.
+  double garbage_collection_cutoff = 0.25;
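+
+  // Illustration (made-up numbers, not from this codebase): with 20 live
+  // immutable non-TTL blob files and the default cutoff of 0.25, blobs that
+  // reside in the oldest 20 * 0.25 = 5 files are relocated to new blob files
+  // as compaction encounters them.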
+
+  // Disable all background jobs. Used for tests only.
+  bool disable_background_tasks = false;
+
+  void Dump(Logger* log) const;
+};
+
+class BlobDB : public StackableDB {
+ public:
+  using ROCKSDB_NAMESPACE::StackableDB::Put;
+  virtual Status Put(const WriteOptions& options, const Slice& key,
+                     const Slice& value) override = 0;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) override {
+    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+      return Status::NotSupported(
+          "Blob DB doesn't support non-default column family.");
+    }
+    return Put(options, key, value);
+  }
+
+  using ROCKSDB_NAMESPACE::StackableDB::Delete;
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override {
+    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+      return Status::NotSupported(
+          "Blob DB doesn't support non-default column family.");
+    }
+    assert(db_ != nullptr);
+    return db_->Delete(options, column_family, key);
+  }
+
+  virtual Status PutWithTTL(const WriteOptions& options, const Slice& key,
+                            const Slice& value, uint64_t ttl) = 0;
+  virtual Status PutWithTTL(const WriteOptions& options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            const Slice& value, uint64_t ttl) {
+    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+      return Status::NotSupported(
+          "Blob DB doesn't support non-default column family.");
+    }
+    return PutWithTTL(options, key, value, ttl);
+  }
+
+  // Put with expiration. A key with an expiration time equal to
+  // std::numeric_limits<uint64_t>::max() (kNoExpiration) doesn't expire.
+  virtual Status PutUntil(const WriteOptions& options, const Slice& key,
+                          const Slice& value, uint64_t expiration) = 0;
+  virtual Status PutUntil(const WriteOptions& options,
+                          ColumnFamilyHandle* column_family, const Slice& key,
+                          const Slice& value, uint64_t expiration) {
+    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+      return Status::NotSupported(
+          "Blob DB doesn't support non-default column family.");
+    }
+    return PutUntil(options, key, value, expiration);
+  }
+
+  using ROCKSDB_NAMESPACE::StackableDB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override = 0;
+
+  // Get value and expiration.
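+  // Example (illustrative; key and TTL are made up): after
+  // PutWithTTL(WriteOptions(), "k", "v", /*ttl=*/600), a later Get can
+  // recover the absolute expiration:
+  //
+  //   PinnableSlice value;
+  //   uint64_t expiration = 0;
+  //   Status s = blob_db->Get(ReadOptions(), "k", &value, &expiration);
+  //   // For keys written without a TTL, expiration == kNoExpiration.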
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value, uint64_t* expiration) = 0;
+  virtual Status Get(const ReadOptions& options, const Slice& key,
+                     PinnableSlice* value, uint64_t* expiration) {
+    return Get(options, DefaultColumnFamily(), key, value, expiration);
+  }
+
+  using ROCKSDB_NAMESPACE::StackableDB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options, const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override = 0;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override {
+    for (auto column_family : column_families) {
+      if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+        return std::vector<Status>(
+            column_families.size(),
+            Status::NotSupported(
+                "Blob DB doesn't support non-default column family."));
+      }
+    }
+    return MultiGet(options, keys, values);
+  }
+  virtual void MultiGet(const ReadOptions& /*options*/,
+                        ColumnFamilyHandle* /*column_family*/,
+                        const size_t num_keys, const Slice* /*keys*/,
+                        PinnableSlice* /*values*/, Status* statuses,
+                        const bool /*sorted_input*/ = false) override {
+    for (size_t i = 0; i < num_keys; ++i) {
+      statuses[i] =
+          Status::NotSupported("Blob DB doesn't support batched MultiGet");
+    }
+  }
+
+  using ROCKSDB_NAMESPACE::StackableDB::SingleDelete;
+  virtual Status SingleDelete(const WriteOptions& /*wopts*/,
+                              ColumnFamilyHandle* /*column_family*/,
+                              const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported operation in blob db.");
+  }
+
+  using ROCKSDB_NAMESPACE::StackableDB::Merge;
+  virtual Status Merge(const WriteOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/,
+                       const Slice& /*key*/, const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported operation in blob db.");
+  }
+
+  virtual Status Write(const WriteOptions& opts,
+                       WriteBatch* updates) override = 0;
+
+  using ROCKSDB_NAMESPACE::StackableDB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& options) override = 0;
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) override {
+    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+      // Blob DB doesn't support non-default column family.
+      return nullptr;
+    }
+    return NewIterator(options);
+  }
+
+  Status CompactFiles(
+      const CompactionOptions& compact_options,
+      const std::vector<std::string>& input_file_names, const int output_level,
+      const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) override = 0;
+  Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names, const int output_level,
+      const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) override {
+    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+      return Status::NotSupported(
+          "Blob DB doesn't support non-default column family.");
+    }
+
+    return CompactFiles(compact_options, input_file_names, output_level,
+                        output_path_id, output_file_names,
+                        compaction_job_info);
+  }
+
+  using ROCKSDB_NAMESPACE::StackableDB::Close;
+  virtual Status Close() override = 0;
+
+  // Opening blob db.
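+  // A minimal usage sketch (illustrative only; the path and sizes are made
+  // up, and error handling is elided):
+  //
+  //   Options options;
+  //   options.create_if_missing = true;
+  //   BlobDBOptions bdb_options;
+  //   bdb_options.min_blob_size = 1024;  // inline values smaller than 1 KiB
+  //   BlobDB* blob_db = nullptr;
+  //   Status s = BlobDB::Open(options, bdb_options, "/tmp/blob_db_example",
+  //                           &blob_db);
+  //   if (s.ok()) {
+  //     s = blob_db->Put(WriteOptions(), "key", "value");
+  //     delete blob_db;  // the destructor closes the database
+  //   }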
+  static Status Open(const Options& options, const BlobDBOptions& bdb_options,
+                     const std::string& dbname, BlobDB** blob_db);
+
+  static Status Open(const DBOptions& db_options,
+                     const BlobDBOptions& bdb_options,
+                     const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     BlobDB** blob_db);
+
+  virtual BlobDBOptions GetBlobDBOptions() const = 0;
+
+  virtual Status SyncBlobFiles() = 0;
+
+  virtual ~BlobDB() {}
+
+ protected:
+  explicit BlobDB();
+};
+
+// Destroy the content of the database.
+Status DestroyBlobDB(const std::string& dbname, const Options& options,
+                     const BlobDBOptions& bdb_options);
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_gc_stats.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_gc_stats.h
new file mode 100644
index 0000000000..12c11af854
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_gc_stats.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace blob_db {
+
+/**
+ * Statistics related to a single garbage collection pass (i.e. a single
+ * (sub)compaction).
+ */
+class BlobDBGarbageCollectionStats {
+ public:
+  uint64_t AllBlobs() const { return all_blobs_; }
+  uint64_t AllBytes() const { return all_bytes_; }
+  uint64_t RelocatedBlobs() const { return relocated_blobs_; }
+  uint64_t RelocatedBytes() const { return relocated_bytes_; }
+  uint64_t NewFiles() const { return new_files_; }
+  bool HasError() const { return error_; }
+
+  void AddBlob(uint64_t size) {
+    ++all_blobs_;
+    all_bytes_ += size;
+  }
+
+  void AddRelocatedBlob(uint64_t size) {
+    ++relocated_blobs_;
+    relocated_bytes_ += size;
+  }
+
+  void AddNewFile() { ++new_files_; }
+
+  void SetError() { error_ = true; }
+
+ private:
+  uint64_t all_blobs_ = 0;
+  uint64_t all_bytes_ = 0;
+  uint64_t relocated_blobs_ = 0;
+  uint64_t relocated_bytes_ = 0;
+  uint64_t new_files_ = 0;
+  bool error_ = false;
+};
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl.cc
new file mode 100644
index 0000000000..7a4b603f22
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl.cc
@@ -0,0 +1,2185 @@
+
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/blob_db/blob_db_impl.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <iomanip>
+#include <memory>
+#include <sstream>
+
+#include "db/blob/blob_index.h"
+#include "db/db_impl/db_impl.h"
+#include "db/write_batch_internal.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/instrumented_mutex.h"
+#include "monitoring/statistics_impl.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/utilities/stackable_db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_builder.h"
+#include "table/meta_blocks.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/timer_queue.h"
+#include "utilities/blob_db/blob_compaction_filter.h"
+#include "utilities/blob_db/blob_db_iterator.h"
+#include "utilities/blob_db/blob_db_listener.h"
+
+namespace {
+int kBlockBasedTableVersionFormat = 2;
+}  // end namespace
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+bool BlobFileComparator::operator()(
+    const std::shared_ptr<BlobFile>& lhs,
+    const std::shared_ptr<BlobFile>& rhs) const {
+  return lhs->BlobFileNumber() > rhs->BlobFileNumber();
+}
+
+bool BlobFileComparatorTTL::operator()(
+    const std::shared_ptr<BlobFile>& lhs,
+    const std::shared_ptr<BlobFile>& rhs) const {
+  assert(lhs->HasTTL() && rhs->HasTTL());
+  if (lhs->expiration_range_.first < rhs->expiration_range_.first) {
+    return true;
+  }
+  if (lhs->expiration_range_.first > rhs->expiration_range_.first) {
+    return false;
+  }
+  return lhs->BlobFileNumber() < rhs->BlobFileNumber();
+}
+
+BlobDBImpl::BlobDBImpl(const std::string& dbname,
+                       const BlobDBOptions& blob_db_options,
+                       const DBOptions& db_options,
+                       const ColumnFamilyOptions& cf_options)
+    : BlobDB(),
+      dbname_(dbname),
+      db_impl_(nullptr),
+      env_(db_options.env),
+      bdb_options_(blob_db_options),
+      db_options_(db_options),
+      cf_options_(cf_options),
+      file_options_(db_options),
+      statistics_(db_options_.statistics.get()),
+      next_file_number_(1),
+      flush_sequence_(0),
+      closed_(true),
+      open_file_count_(0),
+      total_blob_size_(0),
+      live_sst_size_(0),
+      fifo_eviction_seq_(0),
+      evict_expiration_up_to_(0),
+      debug_level_(0) {
+  clock_ = env_->GetSystemClock().get();
+  blob_dir_ = (bdb_options_.path_relative)
+                  ? dbname + "/" + bdb_options_.blob_dir
+                  : bdb_options_.blob_dir;
+  file_options_.bytes_per_sync = blob_db_options.bytes_per_sync;
+}
+
+BlobDBImpl::~BlobDBImpl() {
+  tqueue_.shutdown();
+  // CancelAllBackgroundWork(db_, true);
+  Status s __attribute__((__unused__)) = Close();
+  assert(s.ok());
+}
+
+Status BlobDBImpl::Close() {
+  if (closed_) {
+    return Status::OK();
+  }
+  closed_ = true;
+
+  // Close the base DB before BlobDBImpl destructs, to stop the event listener
+  // and compaction filter calls.
+  Status s = db_->Close();
+  // Delete db_ regardless of whether Close() succeeded.
+  delete db_;
+  // Reset the pointers so that StackableDB doesn't delete them again.
+ db_ = nullptr; + db_impl_ = nullptr; + if (!s.ok()) { + return s; + } + + s = SyncBlobFiles(); + return s; +} + +BlobDBOptions BlobDBImpl::GetBlobDBOptions() const { return bdb_options_; } + +Status BlobDBImpl::Open(std::vector* handles) { + assert(handles != nullptr); + assert(db_ == nullptr); + + if (blob_dir_.empty()) { + return Status::NotSupported("No blob directory in options"); + } + + if (bdb_options_.garbage_collection_cutoff < 0.0 || + bdb_options_.garbage_collection_cutoff > 1.0) { + return Status::InvalidArgument( + "Garbage collection cutoff must be in the interval [0.0, 1.0]"); + } + + // Temporarily disable compactions in the base DB during open; save the user + // defined value beforehand so we can restore it once BlobDB is initialized. + // Note: this is only needed if garbage collection is enabled. + const bool disable_auto_compactions = cf_options_.disable_auto_compactions; + + if (bdb_options_.enable_garbage_collection) { + cf_options_.disable_auto_compactions = true; + } + + Status s; + + // Create info log. + if (db_options_.info_log == nullptr) { + s = CreateLoggerFromOptions(dbname_, db_options_, &db_options_.info_log); + if (!s.ok()) { + return s; + } + } + + ROCKS_LOG_INFO(db_options_.info_log, "Opening BlobDB..."); + + if ((cf_options_.compaction_filter != nullptr || + cf_options_.compaction_filter_factory != nullptr)) { + ROCKS_LOG_INFO(db_options_.info_log, + "BlobDB only support compaction filter on non-TTL values."); + } + + // Open blob directory. + s = env_->CreateDirIfMissing(blob_dir_); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to create blob_dir %s, status: %s", + blob_dir_.c_str(), s.ToString().c_str()); + } + s = env_->GetFileSystem()->NewDirectory(blob_dir_, IOOptions(), &dir_ent_, + nullptr); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to open blob_dir %s, status: %s", blob_dir_.c_str(), + s.ToString().c_str()); + return s; + } + + // Open blob files. + s = OpenAllBlobFiles(); + if (!s.ok()) { + return s; + } + + // Update options + if (bdb_options_.enable_garbage_collection) { + db_options_.listeners.push_back(std::make_shared(this)); + cf_options_.compaction_filter_factory = + std::make_shared( + this, clock_, cf_options_, statistics_); + } else { + db_options_.listeners.push_back(std::make_shared(this)); + cf_options_.compaction_filter_factory = + std::make_shared( + this, clock_, cf_options_, statistics_); + } + + // Reset user compaction filter after building into compaction factory. + cf_options_.compaction_filter = nullptr; + + // Open base db. + ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options_); + s = DB::Open(db_options_, dbname_, {cf_descriptor}, handles, &db_); + if (!s.ok()) { + return s; + } + db_impl_ = static_cast_with_check(db_->GetRootDB()); + + // Sanitize the blob_dir provided. Using a directory where the + // base DB stores its files for the default CF is not supported. 
+ const ColumnFamilyData* const cfd = + static_cast(DefaultColumnFamily())->cfd(); + assert(cfd); + + const ImmutableCFOptions* const ioptions = cfd->ioptions(); + assert(ioptions); + + assert(env_); + + for (const auto& cf_path : ioptions->cf_paths) { + bool blob_dir_same_as_cf_dir = false; + s = env_->AreFilesSame(blob_dir_, cf_path.path, &blob_dir_same_as_cf_dir); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Error while sanitizing blob_dir %s, status: %s", + blob_dir_.c_str(), s.ToString().c_str()); + return s; + } + + if (blob_dir_same_as_cf_dir) { + return Status::NotSupported( + "Using the base DB's storage directories for BlobDB files is not " + "supported."); + } + } + + // Initialize SST file <-> oldest blob file mapping if garbage collection + // is enabled. + if (bdb_options_.enable_garbage_collection) { + std::vector live_files; + db_->GetLiveFilesMetaData(&live_files); + + InitializeBlobFileToSstMapping(live_files); + + MarkUnreferencedBlobFilesObsoleteDuringOpen(); + + if (!disable_auto_compactions) { + s = db_->EnableAutoCompaction(*handles); + if (!s.ok()) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Failed to enable automatic compactions during open, status: %s", + s.ToString().c_str()); + return s; + } + } + } + + // Add trash files in blob dir to file delete scheduler. + SstFileManagerImpl* sfm = static_cast( + db_impl_->immutable_db_options().sst_file_manager.get()); + DeleteScheduler::CleanupDirectory(env_, sfm, blob_dir_); + + UpdateLiveSSTSize(); + + // Start background jobs. + if (!bdb_options_.disable_background_tasks) { + StartBackgroundTasks(); + } + + ROCKS_LOG_INFO(db_options_.info_log, "BlobDB pointer %p", this); + bdb_options_.Dump(db_options_.info_log.get()); + closed_ = false; + return s; +} + +void BlobDBImpl::StartBackgroundTasks() { + // store a call to a member function and object + tqueue_.add( + kReclaimOpenFilesPeriodMillisecs, + std::bind(&BlobDBImpl::ReclaimOpenFiles, this, std::placeholders::_1)); + tqueue_.add( + kDeleteObsoleteFilesPeriodMillisecs, + std::bind(&BlobDBImpl::DeleteObsoleteFiles, this, std::placeholders::_1)); + tqueue_.add(kSanityCheckPeriodMillisecs, + std::bind(&BlobDBImpl::SanityCheck, this, std::placeholders::_1)); + tqueue_.add( + kEvictExpiredFilesPeriodMillisecs, + std::bind(&BlobDBImpl::EvictExpiredFiles, this, std::placeholders::_1)); +} + +Status BlobDBImpl::GetAllBlobFiles(std::set* file_numbers) { + assert(file_numbers != nullptr); + std::vector all_files; + Status s = env_->GetChildren(blob_dir_, &all_files); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to get list of blob files, status: %s", + s.ToString().c_str()); + return s; + } + + for (const auto& file_name : all_files) { + uint64_t file_number; + FileType type; + bool success = ParseFileName(file_name, &file_number, &type); + if (success && type == kBlobFile) { + file_numbers->insert(file_number); + } else { + ROCKS_LOG_WARN(db_options_.info_log, + "Skipping file in blob directory: %s", file_name.c_str()); + } + } + + return s; +} + +Status BlobDBImpl::OpenAllBlobFiles() { + std::set file_numbers; + Status s = GetAllBlobFiles(&file_numbers); + if (!s.ok()) { + return s; + } + + if (!file_numbers.empty()) { + next_file_number_.store(*file_numbers.rbegin() + 1); + } + + std::ostringstream blob_file_oss; + std::ostringstream live_imm_oss; + std::ostringstream obsolete_file_oss; + + for (auto& file_number : file_numbers) { + std::shared_ptr blob_file = std::make_shared( + this, blob_dir_, file_number, 
db_options_.info_log.get()); + blob_file->MarkImmutable(/* sequence */ 0); + + // Read file header and footer + Status read_metadata_status = + blob_file->ReadMetadata(env_->GetFileSystem(), file_options_); + if (read_metadata_status.IsCorruption()) { + // Remove incomplete file. + if (!obsolete_files_.empty()) { + obsolete_file_oss << ", "; + } + obsolete_file_oss << file_number; + + ObsoleteBlobFile(blob_file, 0 /*obsolete_seq*/, false /*update_size*/); + continue; + } else if (!read_metadata_status.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Unable to read metadata of blob file %" PRIu64 + ", status: '%s'", + file_number, read_metadata_status.ToString().c_str()); + return read_metadata_status; + } + + total_blob_size_ += blob_file->GetFileSize(); + + if (!blob_files_.empty()) { + blob_file_oss << ", "; + } + blob_file_oss << file_number; + + blob_files_[file_number] = blob_file; + + if (!blob_file->HasTTL()) { + if (!live_imm_non_ttl_blob_files_.empty()) { + live_imm_oss << ", "; + } + live_imm_oss << file_number; + + live_imm_non_ttl_blob_files_[file_number] = blob_file; + } + } + + ROCKS_LOG_INFO(db_options_.info_log, + "Found %" ROCKSDB_PRIszt " blob files: %s", blob_files_.size(), + blob_file_oss.str().c_str()); + ROCKS_LOG_INFO( + db_options_.info_log, "Found %" ROCKSDB_PRIszt " non-TTL blob files: %s", + live_imm_non_ttl_blob_files_.size(), live_imm_oss.str().c_str()); + ROCKS_LOG_INFO(db_options_.info_log, + "Found %" ROCKSDB_PRIszt + " incomplete or corrupted blob files: %s", + obsolete_files_.size(), obsolete_file_oss.str().c_str()); + return s; +} + +template +void BlobDBImpl::LinkSstToBlobFileImpl(uint64_t sst_file_number, + uint64_t blob_file_number, + Linker linker) { + assert(bdb_options_.enable_garbage_collection); + assert(blob_file_number != kInvalidBlobFileNumber); + + auto it = blob_files_.find(blob_file_number); + if (it == blob_files_.end()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Blob file %" PRIu64 + " not found while trying to link " + "SST file %" PRIu64, + blob_file_number, sst_file_number); + return; + } + + BlobFile* const blob_file = it->second.get(); + assert(blob_file); + + linker(blob_file, sst_file_number); + + ROCKS_LOG_INFO(db_options_.info_log, + "Blob file %" PRIu64 " linked to SST file %" PRIu64, + blob_file_number, sst_file_number); +} + +void BlobDBImpl::LinkSstToBlobFile(uint64_t sst_file_number, + uint64_t blob_file_number) { + auto linker = [](BlobFile* blob_file, uint64_t sst_file) { + WriteLock file_lock(&blob_file->mutex_); + blob_file->LinkSstFile(sst_file); + }; + + LinkSstToBlobFileImpl(sst_file_number, blob_file_number, linker); +} + +void BlobDBImpl::LinkSstToBlobFileNoLock(uint64_t sst_file_number, + uint64_t blob_file_number) { + auto linker = [](BlobFile* blob_file, uint64_t sst_file) { + blob_file->LinkSstFile(sst_file); + }; + + LinkSstToBlobFileImpl(sst_file_number, blob_file_number, linker); +} + +void BlobDBImpl::UnlinkSstFromBlobFile(uint64_t sst_file_number, + uint64_t blob_file_number) { + assert(bdb_options_.enable_garbage_collection); + assert(blob_file_number != kInvalidBlobFileNumber); + + auto it = blob_files_.find(blob_file_number); + if (it == blob_files_.end()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Blob file %" PRIu64 + " not found while trying to unlink " + "SST file %" PRIu64, + blob_file_number, sst_file_number); + return; + } + + BlobFile* const blob_file = it->second.get(); + assert(blob_file); + + { + WriteLock file_lock(&blob_file->mutex_); + blob_file->UnlinkSstFile(sst_file_number); + } + + 
ROCKS_LOG_INFO(db_options_.info_log, + "Blob file %" PRIu64 " unlinked from SST file %" PRIu64, + blob_file_number, sst_file_number); +} + +void BlobDBImpl::InitializeBlobFileToSstMapping( + const std::vector& live_files) { + assert(bdb_options_.enable_garbage_collection); + + for (const auto& live_file : live_files) { + const uint64_t sst_file_number = live_file.file_number; + const uint64_t blob_file_number = live_file.oldest_blob_file_number; + + if (blob_file_number == kInvalidBlobFileNumber) { + continue; + } + + LinkSstToBlobFileNoLock(sst_file_number, blob_file_number); + } +} + +void BlobDBImpl::ProcessFlushJobInfo(const FlushJobInfo& info) { + assert(bdb_options_.enable_garbage_collection); + + WriteLock lock(&mutex_); + + if (info.oldest_blob_file_number != kInvalidBlobFileNumber) { + LinkSstToBlobFile(info.file_number, info.oldest_blob_file_number); + } + + assert(flush_sequence_ < info.largest_seqno); + flush_sequence_ = info.largest_seqno; + + MarkUnreferencedBlobFilesObsolete(); +} + +void BlobDBImpl::ProcessCompactionJobInfo(const CompactionJobInfo& info) { + assert(bdb_options_.enable_garbage_collection); + + if (!info.status.ok()) { + return; + } + + // Note: the same SST file may appear in both the input and the output + // file list in case of a trivial move. We walk through the two lists + // below in a fashion that's similar to merge sort to detect this. + + auto cmp = [](const CompactionFileInfo& lhs, const CompactionFileInfo& rhs) { + return lhs.file_number < rhs.file_number; + }; + + auto inputs = info.input_file_infos; + auto iit = inputs.begin(); + const auto iit_end = inputs.end(); + + std::sort(iit, iit_end, cmp); + + auto outputs = info.output_file_infos; + auto oit = outputs.begin(); + const auto oit_end = outputs.end(); + + std::sort(oit, oit_end, cmp); + + WriteLock lock(&mutex_); + + while (iit != iit_end && oit != oit_end) { + const auto& input = *iit; + const auto& output = *oit; + + if (input.file_number == output.file_number) { + ++iit; + ++oit; + } else if (input.file_number < output.file_number) { + if (input.oldest_blob_file_number != kInvalidBlobFileNumber) { + UnlinkSstFromBlobFile(input.file_number, input.oldest_blob_file_number); + } + + ++iit; + } else { + assert(output.file_number < input.file_number); + + if (output.oldest_blob_file_number != kInvalidBlobFileNumber) { + LinkSstToBlobFile(output.file_number, output.oldest_blob_file_number); + } + + ++oit; + } + } + + while (iit != iit_end) { + const auto& input = *iit; + + if (input.oldest_blob_file_number != kInvalidBlobFileNumber) { + UnlinkSstFromBlobFile(input.file_number, input.oldest_blob_file_number); + } + + ++iit; + } + + while (oit != oit_end) { + const auto& output = *oit; + + if (output.oldest_blob_file_number != kInvalidBlobFileNumber) { + LinkSstToBlobFile(output.file_number, output.oldest_blob_file_number); + } + + ++oit; + } + + MarkUnreferencedBlobFilesObsolete(); +} + +bool BlobDBImpl::MarkBlobFileObsoleteIfNeeded( + const std::shared_ptr& blob_file, SequenceNumber obsolete_seq) { + assert(blob_file); + assert(!blob_file->HasTTL()); + assert(blob_file->Immutable()); + assert(bdb_options_.enable_garbage_collection); + + // Note: FIFO eviction could have marked this file obsolete already. + if (blob_file->Obsolete()) { + return true; + } + + // We cannot mark this file (or any higher-numbered files for that matter) + // obsolete if it is referenced by any memtables or SSTs. We keep track of + // the SSTs explicitly. 
To account for memtables, we keep track of the highest + // sequence number received in flush notifications, and we do not mark the + // blob file obsolete if there are still unflushed memtables from before + // the time the blob file was closed. + if (blob_file->GetImmutableSequence() > flush_sequence_ || + !blob_file->GetLinkedSstFiles().empty()) { + return false; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "Blob file %" PRIu64 " is no longer needed, marking obsolete", + blob_file->BlobFileNumber()); + + ObsoleteBlobFile(blob_file, obsolete_seq, /* update_size */ true); + return true; +} + +template +void BlobDBImpl::MarkUnreferencedBlobFilesObsoleteImpl(Functor mark_if_needed) { + assert(bdb_options_.enable_garbage_collection); + + // Iterate through all live immutable non-TTL blob files, and mark them + // obsolete assuming no SST files or memtables rely on the blobs in them. + // Note: we need to stop as soon as we find a blob file that has any + // linked SSTs (or one potentially referenced by memtables). + + uint64_t obsoleted_files = 0; + + auto it = live_imm_non_ttl_blob_files_.begin(); + while (it != live_imm_non_ttl_blob_files_.end()) { + const auto& blob_file = it->second; + assert(blob_file); + assert(blob_file->BlobFileNumber() == it->first); + assert(!blob_file->HasTTL()); + assert(blob_file->Immutable()); + + // Small optimization: Obsolete() does an atomic read, so we can do + // this check without taking a lock on the blob file's mutex. + if (blob_file->Obsolete()) { + it = live_imm_non_ttl_blob_files_.erase(it); + continue; + } + + if (!mark_if_needed(blob_file)) { + break; + } + + it = live_imm_non_ttl_blob_files_.erase(it); + + ++obsoleted_files; + } + + if (obsoleted_files > 0) { + ROCKS_LOG_INFO(db_options_.info_log, + "%" PRIu64 " blob file(s) marked obsolete by GC", + obsoleted_files); + RecordTick(statistics_, BLOB_DB_GC_NUM_FILES, obsoleted_files); + } +} + +void BlobDBImpl::MarkUnreferencedBlobFilesObsolete() { + const SequenceNumber obsolete_seq = GetLatestSequenceNumber(); + + MarkUnreferencedBlobFilesObsoleteImpl( + [this, obsolete_seq](const std::shared_ptr& blob_file) { + WriteLock file_lock(&blob_file->mutex_); + return MarkBlobFileObsoleteIfNeeded(blob_file, obsolete_seq); + }); +} + +void BlobDBImpl::MarkUnreferencedBlobFilesObsoleteDuringOpen() { + MarkUnreferencedBlobFilesObsoleteImpl( + [this](const std::shared_ptr& blob_file) { + return MarkBlobFileObsoleteIfNeeded(blob_file, /* obsolete_seq */ 0); + }); +} + +void BlobDBImpl::CloseRandomAccessLocked( + const std::shared_ptr& bfile) { + bfile->CloseRandomAccessLocked(); + open_file_count_--; +} + +Status BlobDBImpl::GetBlobFileReader( + const std::shared_ptr& blob_file, + std::shared_ptr* reader) { + assert(reader != nullptr); + bool fresh_open = false; + Status s = blob_file->GetReader(env_, file_options_, reader, &fresh_open); + if (s.ok() && fresh_open) { + assert(*reader != nullptr); + open_file_count_++; + } + return s; +} + +std::shared_ptr BlobDBImpl::NewBlobFile( + bool has_ttl, const ExpirationRange& expiration_range, + const std::string& reason) { + assert(has_ttl == (expiration_range.first || expiration_range.second)); + + uint64_t file_num = next_file_number_++; + + const uint32_t column_family_id = + static_cast(DefaultColumnFamily())->GetID(); + auto blob_file = std::make_shared( + this, blob_dir_, file_num, db_options_.info_log.get(), column_family_id, + bdb_options_.compression, has_ttl, expiration_range); + + ROCKS_LOG_DEBUG(db_options_.info_log, "New blob file created: %s 
reason='%s'", + blob_file->PathName().c_str(), reason.c_str()); + LogFlush(db_options_.info_log); + + return blob_file; +} + +void BlobDBImpl::RegisterBlobFile(std::shared_ptr blob_file) { + const uint64_t blob_file_number = blob_file->BlobFileNumber(); + + auto it = blob_files_.lower_bound(blob_file_number); + assert(it == blob_files_.end() || it->first != blob_file_number); + + blob_files_.insert(it, + std::map>::value_type( + blob_file_number, std::move(blob_file))); +} + +Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr& bfile) { + std::string fpath(bfile->PathName()); + std::unique_ptr wfile; + const auto& fs = env_->GetFileSystem(); + + Status s = fs->ReopenWritableFile(fpath, file_options_, &wfile, nullptr); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to open blob file for write: %s status: '%s'" + " exists: '%s'", + fpath.c_str(), s.ToString().c_str(), + fs->FileExists(fpath, file_options_.io_options, nullptr) + .ToString() + .c_str()); + return s; + } + + std::unique_ptr fwriter; + fwriter.reset(new WritableFileWriter(std::move(wfile), fpath, file_options_)); + + uint64_t boffset = bfile->GetFileSize(); + if (debug_level_ >= 2 && boffset) { + ROCKS_LOG_DEBUG(db_options_.info_log, + "Open blob file: %s with offset: %" PRIu64, fpath.c_str(), + boffset); + } + + BlobLogWriter::ElemType et = BlobLogWriter::kEtNone; + if (bfile->file_size_ == BlobLogHeader::kSize) { + et = BlobLogWriter::kEtFileHdr; + } else if (bfile->file_size_ > BlobLogHeader::kSize) { + et = BlobLogWriter::kEtRecord; + } else if (bfile->file_size_) { + ROCKS_LOG_WARN(db_options_.info_log, + "Open blob file: %s with wrong size: %" PRIu64, + fpath.c_str(), boffset); + return Status::Corruption("Invalid blob file size"); + } + + constexpr bool do_flush = true; + + bfile->log_writer_ = std::make_shared( + std::move(fwriter), clock_, statistics_, bfile->file_number_, + db_options_.use_fsync, do_flush, boffset); + bfile->log_writer_->last_elem_type_ = et; + + return s; +} + +std::shared_ptr BlobDBImpl::FindBlobFileLocked( + uint64_t expiration) const { + if (open_ttl_files_.empty()) { + return nullptr; + } + + std::shared_ptr tmp = std::make_shared(); + tmp->SetHasTTL(true); + tmp->expiration_range_ = std::make_pair(expiration, 0); + tmp->file_number_ = std::numeric_limits::max(); + + auto citr = open_ttl_files_.equal_range(tmp); + if (citr.first == open_ttl_files_.end()) { + assert(citr.second == open_ttl_files_.end()); + + std::shared_ptr check = *(open_ttl_files_.rbegin()); + return (check->expiration_range_.second <= expiration) ? nullptr : check; + } + + if (citr.first != citr.second) { + return *(citr.first); + } + + auto finditr = citr.second; + if (finditr != open_ttl_files_.begin()) { + --finditr; + } + + bool b2 = (*finditr)->expiration_range_.second <= expiration; + bool b1 = (*finditr)->expiration_range_.first > expiration; + + return (b1 || b2) ? 
nullptr : (*finditr); +} + +Status BlobDBImpl::CheckOrCreateWriterLocked( + const std::shared_ptr& blob_file, + std::shared_ptr* writer) { + assert(writer != nullptr); + *writer = blob_file->GetWriter(); + if (*writer != nullptr) { + return Status::OK(); + } + Status s = CreateWriterLocked(blob_file); + if (s.ok()) { + *writer = blob_file->GetWriter(); + } + return s; +} + +Status BlobDBImpl::CreateBlobFileAndWriter( + bool has_ttl, const ExpirationRange& expiration_range, + const std::string& reason, std::shared_ptr* blob_file, + std::shared_ptr* writer) { + TEST_SYNC_POINT("BlobDBImpl::CreateBlobFileAndWriter"); + assert(has_ttl == (expiration_range.first || expiration_range.second)); + assert(blob_file); + assert(writer); + + *blob_file = NewBlobFile(has_ttl, expiration_range, reason); + assert(*blob_file); + + // file not visible, hence no lock + Status s = CheckOrCreateWriterLocked(*blob_file, writer); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to get writer for blob file: %s, error: %s", + (*blob_file)->PathName().c_str(), s.ToString().c_str()); + return s; + } + + assert(*writer); + + s = (*writer)->WriteHeader((*blob_file)->header_); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to write header to new blob file: %s" + " status: '%s'", + (*blob_file)->PathName().c_str(), s.ToString().c_str()); + return s; + } + + (*blob_file)->SetFileSize(BlobLogHeader::kSize); + total_blob_size_ += BlobLogHeader::kSize; + + return s; +} + +Status BlobDBImpl::SelectBlobFile(std::shared_ptr* blob_file) { + assert(blob_file); + + { + ReadLock rl(&mutex_); + + if (open_non_ttl_file_) { + assert(!open_non_ttl_file_->Immutable()); + *blob_file = open_non_ttl_file_; + return Status::OK(); + } + } + + // Check again + WriteLock wl(&mutex_); + + if (open_non_ttl_file_) { + assert(!open_non_ttl_file_->Immutable()); + *blob_file = open_non_ttl_file_; + return Status::OK(); + } + + std::shared_ptr writer; + const Status s = CreateBlobFileAndWriter( + /* has_ttl */ false, ExpirationRange(), + /* reason */ "SelectBlobFile", blob_file, &writer); + if (!s.ok()) { + return s; + } + + RegisterBlobFile(*blob_file); + open_non_ttl_file_ = *blob_file; + + return s; +} + +Status BlobDBImpl::SelectBlobFileTTL(uint64_t expiration, + std::shared_ptr* blob_file) { + assert(blob_file); + assert(expiration != kNoExpiration); + + { + ReadLock rl(&mutex_); + + *blob_file = FindBlobFileLocked(expiration); + if (*blob_file != nullptr) { + assert(!(*blob_file)->Immutable()); + return Status::OK(); + } + } + + // Check again + WriteLock wl(&mutex_); + + *blob_file = FindBlobFileLocked(expiration); + if (*blob_file != nullptr) { + assert(!(*blob_file)->Immutable()); + return Status::OK(); + } + + const uint64_t exp_low = + (expiration / bdb_options_.ttl_range_secs) * bdb_options_.ttl_range_secs; + const uint64_t exp_high = exp_low + bdb_options_.ttl_range_secs; + const ExpirationRange expiration_range(exp_low, exp_high); + + std::ostringstream oss; + oss << "SelectBlobFileTTL range: [" << exp_low << ',' << exp_high << ')'; + + std::shared_ptr writer; + const Status s = + CreateBlobFileAndWriter(/* has_ttl */ true, expiration_range, + /* reason */ oss.str(), blob_file, &writer); + if (!s.ok()) { + return s; + } + + RegisterBlobFile(*blob_file); + open_ttl_files_.insert(*blob_file); + + return s; +} + +class BlobDBImpl::BlobInserter : public WriteBatch::Handler { + private: + const WriteOptions& options_; + BlobDBImpl* blob_db_impl_; + uint32_t default_cf_id_; + WriteBatch batch_; + + 
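+  // Summary of the handlers below (added commentary): this WriteBatch::Handler
+  // rebuilds the user's batch for the base DB. PutCF is routed through
+  // PutBlobValue, which either inlines small values or appends large ones to
+  // a blob file and records a blob index entry instead; deletes and delete
+  // ranges pass through; single deletes and merges are rejected as
+  // unsupported.
+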
public: + BlobInserter(const WriteOptions& options, BlobDBImpl* blob_db_impl, + uint32_t default_cf_id) + : options_(options), + blob_db_impl_(blob_db_impl), + default_cf_id_(default_cf_id) {} + + WriteBatch* batch() { return &batch_; } + + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + if (column_family_id != default_cf_id_) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + Status s = blob_db_impl_->PutBlobValue(options_, key, value, kNoExpiration, + &batch_); + return s; + } + + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + if (column_family_id != default_cf_id_) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + Status s = WriteBatchInternal::Delete(&batch_, column_family_id, key); + return s; + } + + virtual Status DeleteRange(uint32_t column_family_id, const Slice& begin_key, + const Slice& end_key) { + if (column_family_id != default_cf_id_) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + Status s = WriteBatchInternal::DeleteRange(&batch_, column_family_id, + begin_key, end_key); + return s; + } + + Status SingleDeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in blob db."); + } + + Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/, + const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in blob db."); + } + + void LogData(const Slice& blob) override { batch_.PutLogData(blob); } +}; + +Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) { + StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_WRITE); + uint32_t default_cf_id = + static_cast_with_check(DefaultColumnFamily()) + ->GetID(); + Status s; + BlobInserter blob_inserter(options, this, default_cf_id); + { + // Release write_mutex_ before DB write to avoid race condition with + // flush begin listener, which also require write_mutex_ to sync + // blob files. + MutexLock l(&write_mutex_); + s = updates->Iterate(&blob_inserter); + } + if (!s.ok()) { + return s; + } + return db_->Write(options, blob_inserter.batch()); +} + +Status BlobDBImpl::Put(const WriteOptions& options, const Slice& key, + const Slice& value) { + return PutUntil(options, key, value, kNoExpiration); +} + +Status BlobDBImpl::PutWithTTL(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t ttl) { + uint64_t now = EpochNow(); + uint64_t expiration = kNoExpiration - now > ttl ? now + ttl : kNoExpiration; + return PutUntil(options, key, value, expiration); +} + +Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t expiration) { + StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_PUT); + Status s; + WriteBatch batch; + { + // Release write_mutex_ before DB write to avoid race condition with + // flush begin listener, which also require write_mutex_ to sync + // blob files. 
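+    // (Note: the lock scope below therefore covers only PutBlobValue's
+    // blob-file append and batch construction; db_->Write() runs after the
+    // lock has been released.)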
+ MutexLock l(&write_mutex_); + s = PutBlobValue(options, key, value, expiration, &batch); + } + if (s.ok()) { + s = db_->Write(options, &batch); + } + return s; +} + +Status BlobDBImpl::PutBlobValue(const WriteOptions& /*options*/, + const Slice& key, const Slice& value, + uint64_t expiration, WriteBatch* batch) { + write_mutex_.AssertHeld(); + Status s; + std::string index_entry; + uint32_t column_family_id = + static_cast_with_check(DefaultColumnFamily()) + ->GetID(); + if (value.size() < bdb_options_.min_blob_size) { + if (expiration == kNoExpiration) { + // Put as normal value + s = batch->Put(key, value); + RecordTick(statistics_, BLOB_DB_WRITE_INLINED); + } else { + // Inlined with TTL + BlobIndex::EncodeInlinedTTL(&index_entry, expiration, value); + s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key, + index_entry); + RecordTick(statistics_, BLOB_DB_WRITE_INLINED_TTL); + } + } else { + std::string compression_output; + Slice value_compressed = GetCompressedSlice(value, &compression_output); + + std::string headerbuf; + BlobLogWriter::ConstructBlobHeader(&headerbuf, key, value_compressed, + expiration); + + // Check DB size limit before selecting blob file to + // Since CheckSizeAndEvictBlobFiles() can close blob files, it needs to be + // done before calling SelectBlobFile(). + s = CheckSizeAndEvictBlobFiles(headerbuf.size() + key.size() + + value_compressed.size()); + if (!s.ok()) { + return s; + } + + std::shared_ptr blob_file; + if (expiration != kNoExpiration) { + s = SelectBlobFileTTL(expiration, &blob_file); + } else { + s = SelectBlobFile(&blob_file); + } + if (s.ok()) { + assert(blob_file != nullptr); + assert(blob_file->GetCompressionType() == bdb_options_.compression); + s = AppendBlob(blob_file, headerbuf, key, value_compressed, expiration, + &index_entry); + } + if (s.ok()) { + if (expiration != kNoExpiration) { + WriteLock file_lock(&blob_file->mutex_); + blob_file->ExtendExpirationRange(expiration); + } + s = CloseBlobFileIfNeeded(blob_file); + } + if (s.ok()) { + s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key, + index_entry); + } + if (s.ok()) { + if (expiration == kNoExpiration) { + RecordTick(statistics_, BLOB_DB_WRITE_BLOB); + } else { + RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL); + } + } else { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Failed to append blob to FILE: %s: KEY: %s VALSZ: %" ROCKSDB_PRIszt + " status: '%s' blob_file: '%s'", + blob_file->PathName().c_str(), key.ToString().c_str(), value.size(), + s.ToString().c_str(), blob_file->DumpState().c_str()); + } + } + + RecordTick(statistics_, BLOB_DB_NUM_KEYS_WRITTEN); + RecordTick(statistics_, BLOB_DB_BYTES_WRITTEN, key.size() + value.size()); + RecordInHistogram(statistics_, BLOB_DB_KEY_SIZE, key.size()); + RecordInHistogram(statistics_, BLOB_DB_VALUE_SIZE, value.size()); + + return s; +} + +Slice BlobDBImpl::GetCompressedSlice(const Slice& raw, + std::string* compression_output) const { + if (bdb_options_.compression == kNoCompression) { + return raw; + } + StopWatch compression_sw(clock_, statistics_, BLOB_DB_COMPRESSION_MICROS); + CompressionType type = bdb_options_.compression; + CompressionOptions opts; + CompressionContext context(type); + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), type, + 0 /* sample_for_compression */); + CompressBlock(raw, info, &type, kBlockBasedTableVersionFormat, false, + compression_output, nullptr, nullptr); + return *compression_output; +} + +Status BlobDBImpl::DecompressSlice(const Slice& compressed_value, 
+ CompressionType compression_type, + PinnableSlice* value_output) const { + assert(compression_type != kNoCompression); + + BlockContents contents; + auto cfh = static_cast(DefaultColumnFamily()); + + { + StopWatch decompression_sw(clock_, statistics_, + BLOB_DB_DECOMPRESSION_MICROS); + UncompressionContext context(compression_type); + UncompressionInfo info(context, UncompressionDict::GetEmptyDict(), + compression_type); + Status s = UncompressBlockData( + info, compressed_value.data(), compressed_value.size(), &contents, + kBlockBasedTableVersionFormat, *(cfh->cfd()->ioptions())); + if (!s.ok()) { + return Status::Corruption("Unable to decompress blob."); + } + } + + value_output->PinSelf(contents.data); + + return Status::OK(); +} + +Status BlobDBImpl::CompactFiles( + const CompactionOptions& compact_options, + const std::vector& input_file_names, const int output_level, + const int output_path_id, std::vector* const output_file_names, + CompactionJobInfo* compaction_job_info) { + // Note: we need CompactionJobInfo to be able to track updates to the + // blob file <-> SST mappings, so we provide one if the user hasn't, + // assuming that GC is enabled. + CompactionJobInfo info{}; + if (bdb_options_.enable_garbage_collection && !compaction_job_info) { + compaction_job_info = &info; + } + + const Status s = + db_->CompactFiles(compact_options, input_file_names, output_level, + output_path_id, output_file_names, compaction_job_info); + if (!s.ok()) { + return s; + } + + if (bdb_options_.enable_garbage_collection) { + assert(compaction_job_info); + ProcessCompactionJobInfo(*compaction_job_info); + } + + return s; +} + +void BlobDBImpl::GetCompactionContextCommon(BlobCompactionContext* context) { + assert(context); + + context->blob_db_impl = this; + context->next_file_number = next_file_number_.load(); + context->current_blob_files.clear(); + for (auto& p : blob_files_) { + context->current_blob_files.insert(p.first); + } + context->fifo_eviction_seq = fifo_eviction_seq_; + context->evict_expiration_up_to = evict_expiration_up_to_; +} + +void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context) { + assert(context); + + ReadLock l(&mutex_); + GetCompactionContextCommon(context); +} + +void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context, + BlobCompactionContextGC* context_gc) { + assert(context); + assert(context_gc); + + ReadLock l(&mutex_); + GetCompactionContextCommon(context); + + if (!live_imm_non_ttl_blob_files_.empty()) { + auto it = live_imm_non_ttl_blob_files_.begin(); + std::advance(it, bdb_options_.garbage_collection_cutoff * + live_imm_non_ttl_blob_files_.size()); + context_gc->cutoff_file_number = it != live_imm_non_ttl_blob_files_.end() + ? it->first + : std::numeric_limits::max(); + } +} + +void BlobDBImpl::UpdateLiveSSTSize() { + uint64_t live_sst_size = 0; + bool ok = GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size); + if (ok) { + live_sst_size_.store(live_sst_size); + ROCKS_LOG_INFO(db_options_.info_log, + "Updated total SST file size: %" PRIu64 " bytes.", + live_sst_size); + } else { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Failed to update total SST file size after flush or compaction."); + } + { + // Trigger FIFO eviction if needed. + MutexLock l(&write_mutex_); + Status s = CheckSizeAndEvictBlobFiles(0, true /*force*/); + if (s.IsNoSpace()) { + ROCKS_LOG_WARN(db_options_.info_log, + "DB grow out-of-space after SST size updated. 
Current live" + " SST size: %" PRIu64 + " , current blob files size: %" PRIu64 ".", + live_sst_size_.load(), total_blob_size_.load()); + } + } +} + +Status BlobDBImpl::CheckSizeAndEvictBlobFiles(uint64_t blob_size, + bool force_evict) { + write_mutex_.AssertHeld(); + + uint64_t live_sst_size = live_sst_size_.load(); + if (bdb_options_.max_db_size == 0 || + live_sst_size + total_blob_size_.load() + blob_size <= + bdb_options_.max_db_size) { + return Status::OK(); + } + + if (bdb_options_.is_fifo == false || + (!force_evict && live_sst_size + blob_size > bdb_options_.max_db_size)) { + // FIFO eviction is disabled, or no space to insert new blob even we evict + // all blob files. + return Status::NoSpace( + "Write failed, as writing it would exceed max_db_size limit."); + } + + std::vector> candidate_files; + CopyBlobFiles(&candidate_files); + std::sort(candidate_files.begin(), candidate_files.end(), + BlobFileComparator()); + fifo_eviction_seq_ = GetLatestSequenceNumber(); + + WriteLock l(&mutex_); + + while (!candidate_files.empty() && + live_sst_size + total_blob_size_.load() + blob_size > + bdb_options_.max_db_size) { + std::shared_ptr blob_file = candidate_files.back(); + candidate_files.pop_back(); + WriteLock file_lock(&blob_file->mutex_); + if (blob_file->Obsolete()) { + // File already obsoleted by someone else. + assert(blob_file->Immutable()); + continue; + } + // FIFO eviction can evict open blob files. + if (!blob_file->Immutable()) { + Status s = CloseBlobFile(blob_file); + if (!s.ok()) { + return s; + } + } + assert(blob_file->Immutable()); + auto expiration_range = blob_file->GetExpirationRange(); + ROCKS_LOG_INFO(db_options_.info_log, + "Evict oldest blob file since DB out of space. Current " + "live SST file size: %" PRIu64 ", total blob size: %" PRIu64 + ", max db size: %" PRIu64 ", evicted blob file #%" PRIu64 + ".", + live_sst_size, total_blob_size_.load(), + bdb_options_.max_db_size, blob_file->BlobFileNumber()); + ObsoleteBlobFile(blob_file, fifo_eviction_seq_, true /*update_size*/); + evict_expiration_up_to_ = expiration_range.first; + RecordTick(statistics_, BLOB_DB_FIFO_NUM_FILES_EVICTED); + RecordTick(statistics_, BLOB_DB_FIFO_NUM_KEYS_EVICTED, + blob_file->BlobCount()); + RecordTick(statistics_, BLOB_DB_FIFO_BYTES_EVICTED, + blob_file->GetFileSize()); + TEST_SYNC_POINT("BlobDBImpl::EvictOldestBlobFile:Evicted"); + } + if (live_sst_size + total_blob_size_.load() + blob_size > + bdb_options_.max_db_size) { + return Status::NoSpace( + "Write failed, as writing it would exceed max_db_size limit."); + } + return Status::OK(); +} + +Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, + const std::string& headerbuf, const Slice& key, + const Slice& value, uint64_t expiration, + std::string* index_entry) { + Status s; + uint64_t blob_offset = 0; + uint64_t key_offset = 0; + { + WriteLock lockbfile_w(&bfile->mutex_); + std::shared_ptr writer; + s = CheckOrCreateWriterLocked(bfile, &writer); + if (!s.ok()) { + return s; + } + + // write the blob to the blob log. 
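+    // As read back by GetRawBlobFromFile, the payload that follows the record
+    // header is laid out as [crc32c (4 bytes) | key | value], with the
+    // returned blob_offset pointing at the value bytes.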
+ s = writer->EmitPhysicalRecord(headerbuf, key, value, &key_offset, + &blob_offset); + } + + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Invalid status in AppendBlob: %s status: '%s'", + bfile->PathName().c_str(), s.ToString().c_str()); + return s; + } + + uint64_t size_put = headerbuf.size() + key.size() + value.size(); + bfile->BlobRecordAdded(size_put); + total_blob_size_ += size_put; + + if (expiration == kNoExpiration) { + BlobIndex::EncodeBlob(index_entry, bfile->BlobFileNumber(), blob_offset, + value.size(), bdb_options_.compression); + } else { + BlobIndex::EncodeBlobTTL(index_entry, expiration, bfile->BlobFileNumber(), + blob_offset, value.size(), + bdb_options_.compression); + } + + return s; +} + +std::vector BlobDBImpl::MultiGet(const ReadOptions& read_options, + const std::vector& keys, + std::vector* values) { + StopWatch multiget_sw(clock_, statistics_, BLOB_DB_MULTIGET_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_MULTIGET); + // Get a snapshot to avoid blob file get deleted between we + // fetch and index entry and reading from the file. + ReadOptions ro(read_options); + bool snapshot_created = SetSnapshotIfNeeded(&ro); + + std::vector statuses; + statuses.reserve(keys.size()); + values->clear(); + values->reserve(keys.size()); + PinnableSlice value; + for (size_t i = 0; i < keys.size(); i++) { + statuses.push_back(Get(ro, DefaultColumnFamily(), keys[i], &value)); + values->push_back(value.ToString()); + value.Reset(); + } + if (snapshot_created) { + db_->ReleaseSnapshot(ro.snapshot); + } + return statuses; +} + +bool BlobDBImpl::SetSnapshotIfNeeded(ReadOptions* read_options) { + assert(read_options != nullptr); + if (read_options->snapshot != nullptr) { + return false; + } + read_options->snapshot = db_->GetSnapshot(); + return true; +} + +Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry, + PinnableSlice* value, uint64_t* expiration) { + assert(value); + + BlobIndex blob_index; + Status s = blob_index.DecodeFrom(index_entry); + if (!s.ok()) { + return s; + } + + if (blob_index.HasTTL() && blob_index.expiration() <= EpochNow()) { + return Status::NotFound("Key expired"); + } + + if (expiration != nullptr) { + if (blob_index.HasTTL()) { + *expiration = blob_index.expiration(); + } else { + *expiration = kNoExpiration; + } + } + + if (blob_index.IsInlined()) { + // TODO(yiwu): If index_entry is a PinnableSlice, we can also pin the same + // memory buffer to avoid extra copy. 
+ value->PinSelf(blob_index.value()); + return Status::OK(); + } + + CompressionType compression_type = kNoCompression; + s = GetRawBlobFromFile(key, blob_index.file_number(), blob_index.offset(), + blob_index.size(), value, &compression_type); + if (!s.ok()) { + return s; + } + + if (compression_type != kNoCompression) { + s = DecompressSlice(*value, compression_type, value); + if (!s.ok()) { + if (debug_level_ >= 2) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Uncompression error during blob read from file: %" PRIu64 + " blob_offset: %" PRIu64 " blob_size: %" PRIu64 + " key: %s status: '%s'", + blob_index.file_number(), blob_index.offset(), blob_index.size(), + key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str()); + } + return s; + } + } + + return Status::OK(); +} + +Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number, + uint64_t offset, uint64_t size, + PinnableSlice* value, + CompressionType* compression_type) { + assert(value); + assert(compression_type); + assert(*compression_type == kNoCompression); + + if (!size) { + value->PinSelf(""); + return Status::OK(); + } + + // offset has to have certain min, as we will read CRC + // later from the Blob Header, which needs to be also a + // valid offset. + if (offset < + (BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key.size())) { + if (debug_level_ >= 2) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Invalid blob index file_number: %" PRIu64 + " blob_offset: %" PRIu64 " blob_size: %" PRIu64 + " key: %s", + file_number, offset, size, + key.ToString(/* output_hex */ true).c_str()); + } + + return Status::NotFound("Invalid blob offset"); + } + + std::shared_ptr blob_file; + + { + ReadLock rl(&mutex_); + auto it = blob_files_.find(file_number); + + // file was deleted + if (it == blob_files_.end()) { + return Status::NotFound("Blob Not Found as blob file missing"); + } + + blob_file = it->second; + } + + *compression_type = blob_file->GetCompressionType(); + + // takes locks when called + std::shared_ptr reader; + Status s = GetBlobFileReader(blob_file, &reader); + if (!s.ok()) { + return s; + } + + assert(offset >= key.size() + sizeof(uint32_t)); + const uint64_t record_offset = offset - key.size() - sizeof(uint32_t); + const uint64_t record_size = sizeof(uint32_t) + key.size() + size; + + // Allocate the buffer. This is safe in C++11 + std::string buf; + AlignedBuf aligned_buf; + + // A partial blob record contain checksum, key and value. + Slice blob_record; + + { + StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); + // TODO: rate limit old blob DB file reads. 
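+    // Direct-I/O readers fill the AlignedBuf declared above; buffered readers
+    // fill the std::string buffer `buf`. Either way, blob_record ends up
+    // pointing into whichever buffer received the data.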
+ if (reader->use_direct_io()) { + s = reader->Read(IOOptions(), record_offset, + static_cast(record_size), &blob_record, nullptr, + &aligned_buf, Env::IO_TOTAL /* rate_limiter_priority */); + } else { + buf.reserve(static_cast(record_size)); + s = reader->Read(IOOptions(), record_offset, + static_cast(record_size), &blob_record, &buf[0], + nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + } + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_record.size()); + } + + if (!s.ok()) { + ROCKS_LOG_DEBUG( + db_options_.info_log, + "Failed to read blob from blob file %" PRIu64 ", blob_offset: %" PRIu64 + ", blob_size: %" PRIu64 ", key_size: %" ROCKSDB_PRIszt ", status: '%s'", + file_number, offset, size, key.size(), s.ToString().c_str()); + return s; + } + + if (blob_record.size() != record_size) { + ROCKS_LOG_DEBUG( + db_options_.info_log, + "Failed to read blob from blob file %" PRIu64 ", blob_offset: %" PRIu64 + ", blob_size: %" PRIu64 ", key_size: %" ROCKSDB_PRIszt + ", read %" ROCKSDB_PRIszt " bytes, expected %" PRIu64 " bytes", + file_number, offset, size, key.size(), blob_record.size(), record_size); + + return Status::Corruption("Failed to retrieve blob from blob index."); + } + + Slice crc_slice(blob_record.data(), sizeof(uint32_t)); + Slice blob_value(blob_record.data() + sizeof(uint32_t) + key.size(), + static_cast(size)); + + uint32_t crc_exp = 0; + if (!GetFixed32(&crc_slice, &crc_exp)) { + ROCKS_LOG_DEBUG( + db_options_.info_log, + "Unable to decode CRC from blob file %" PRIu64 ", blob_offset: %" PRIu64 + ", blob_size: %" PRIu64 ", key size: %" ROCKSDB_PRIszt ", status: '%s'", + file_number, offset, size, key.size(), s.ToString().c_str()); + return Status::Corruption("Unable to decode checksum."); + } + + uint32_t crc = crc32c::Value(blob_record.data() + sizeof(uint32_t), + blob_record.size() - sizeof(uint32_t)); + crc = crc32c::Mask(crc); // Adjust for storage + if (crc != crc_exp) { + if (debug_level_ >= 2) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Blob crc mismatch file: %" PRIu64 " blob_offset: %" PRIu64 + " blob_size: %" PRIu64 " key: %s status: '%s'", + file_number, offset, size, + key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str()); + } + + return Status::Corruption("Corruption. 
+  }
+
+  value->PinSelf(blob_value);
+
+  return Status::OK();
+}
+
+Status BlobDBImpl::Get(const ReadOptions& read_options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       PinnableSlice* value) {
+  return Get(read_options, column_family, key, value,
+             static_cast<uint64_t*>(nullptr) /*expiration*/);
+}
+
+Status BlobDBImpl::Get(const ReadOptions& read_options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       PinnableSlice* value, uint64_t* expiration) {
+  StopWatch get_sw(clock_, statistics_, BLOB_DB_GET_MICROS);
+  RecordTick(statistics_, BLOB_DB_NUM_GET);
+  return GetImpl(read_options, column_family, key, value, expiration);
+}
+
+Status BlobDBImpl::GetImpl(const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           PinnableSlice* value, uint64_t* expiration) {
+  if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+    return Status::NotSupported(
+        "Blob DB doesn't support non-default column family.");
+  }
+  if (read_options.io_activity != Env::IOActivity::kUnknown) {
+    return Status::InvalidArgument(
+        "Cannot call Get with `ReadOptions::io_activity` != "
+        "`Env::IOActivity::kUnknown`");
+  }
+  // Take a snapshot so the blob file cannot be deleted between fetching the
+  // index entry and reading from the file.
+  // TODO(yiwu): For Get(), retrying if the file is not found would be a
+  // simpler strategy.
+  ReadOptions ro(read_options);
+  bool snapshot_created = SetSnapshotIfNeeded(&ro);
+
+  PinnableSlice index_entry;
+  Status s;
+  bool is_blob_index = false;
+  DBImpl::GetImplOptions get_impl_options;
+  get_impl_options.column_family = column_family;
+  get_impl_options.value = &index_entry;
+  get_impl_options.is_blob_index = &is_blob_index;
+  s = db_impl_->GetImpl(ro, key, get_impl_options);
+  if (expiration != nullptr) {
+    *expiration = kNoExpiration;
+  }
+  RecordTick(statistics_, BLOB_DB_NUM_KEYS_READ);
+  if (s.ok()) {
+    if (is_blob_index) {
+      s = GetBlobValue(key, index_entry, value, expiration);
+    } else {
+      // The index entry is the value itself in this case.
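+      // (No blob file read is needed: the value was stored inline in the
+      // base DB, e.g. because it was smaller than min_blob_size.)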
+      value->PinSelf(index_entry);
+    }
+    RecordTick(statistics_, BLOB_DB_BYTES_READ, value->size());
+  }
+  if (snapshot_created) {
+    db_->ReleaseSnapshot(ro.snapshot);
+  }
+  return s;
+}
+
+std::pair<bool, int64_t> BlobDBImpl::SanityCheck(bool aborted) {
+  if (aborted) {
+    return std::make_pair(false, -1);
+  }
+
+  ReadLock rl(&mutex_);
+
+  ROCKS_LOG_INFO(db_options_.info_log, "Starting Sanity Check");
+  ROCKS_LOG_INFO(db_options_.info_log, "Number of files %" ROCKSDB_PRIszt,
+                 blob_files_.size());
+  ROCKS_LOG_INFO(db_options_.info_log, "Number of open files %" ROCKSDB_PRIszt,
+                 open_ttl_files_.size());
+
+  for (const auto& blob_file : open_ttl_files_) {
+    (void)blob_file;
+    assert(!blob_file->Immutable());
+  }
+
+  for (const auto& pair : live_imm_non_ttl_blob_files_) {
+    const auto& blob_file = pair.second;
+    (void)blob_file;
+    assert(!blob_file->HasTTL());
+    assert(blob_file->Immutable());
+  }
+
+  uint64_t now = EpochNow();
+
+  for (auto blob_file_pair : blob_files_) {
+    auto blob_file = blob_file_pair.second;
+    std::ostringstream buf;
+
+    buf << "Blob file " << blob_file->BlobFileNumber() << ", size "
+        << blob_file->GetFileSize() << ", blob count " << blob_file->BlobCount()
+        << ", immutable " << blob_file->Immutable();
+
+    if (blob_file->HasTTL()) {
+      ExpirationRange expiration_range;
+      {
+        ReadLock file_lock(&blob_file->mutex_);
+        expiration_range = blob_file->GetExpirationRange();
+      }
+      buf << ", expiration range (" << expiration_range.first << ", "
+          << expiration_range.second << ")";
+
+      if (!blob_file->Obsolete()) {
+        buf << ", expire in " << (expiration_range.second - now) << " seconds";
+      }
+    }
+    if (blob_file->Obsolete()) {
+      buf << ", obsolete at " << blob_file->GetObsoleteSequence();
+    }
+    buf << ".";
+    ROCKS_LOG_INFO(db_options_.info_log, "%s", buf.str().c_str());
+  }
+
+  // reschedule
+  return std::make_pair(true, -1);
+}
+
+Status BlobDBImpl::CloseBlobFile(std::shared_ptr<BlobFile> bfile) {
+  TEST_SYNC_POINT("BlobDBImpl::CloseBlobFile");
+  assert(bfile);
+  assert(!bfile->Immutable());
+  assert(!bfile->Obsolete());
+
+  if (bfile->HasTTL() || bfile == open_non_ttl_file_) {
+    write_mutex_.AssertHeld();
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "Closing blob file %" PRIu64 ". Path: %s",
+                 bfile->BlobFileNumber(), bfile->PathName().c_str());
+
+  const SequenceNumber sequence = GetLatestSequenceNumber();
+
+  const Status s = bfile->WriteFooterAndCloseLocked(sequence);
+
+  if (s.ok()) {
+    total_blob_size_ += BlobLogFooter::kSize;
+  } else {
+    bfile->MarkImmutable(sequence);
+
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "Failed to close blob file %" PRIu64 " with error: %s",
+                    bfile->BlobFileNumber(), s.ToString().c_str());
+  }
+
+  if (bfile->HasTTL()) {
+    size_t erased __attribute__((__unused__));
+    erased = open_ttl_files_.erase(bfile);
+  } else {
+    if (bfile == open_non_ttl_file_) {
+      open_non_ttl_file_ = nullptr;
+    }
+
+    const uint64_t blob_file_number = bfile->BlobFileNumber();
+    auto it = live_imm_non_ttl_blob_files_.lower_bound(blob_file_number);
+    assert(it == live_imm_non_ttl_blob_files_.end() ||
+           it->first != blob_file_number);
+    live_imm_non_ttl_blob_files_.insert(
+        it, std::map<uint64_t, std::shared_ptr<BlobFile>>::value_type(
+                blob_file_number, bfile));
+  }
+
+  return s;
+}
+
+Status BlobDBImpl::CloseBlobFileIfNeeded(std::shared_ptr<BlobFile>& bfile) {
+  write_mutex_.AssertHeld();
+
+  // atomic read
+  if (bfile->GetFileSize() < bdb_options_.blob_file_size) {
+    return Status::OK();
+  }
+
+  WriteLock lock(&mutex_);
+  WriteLock file_lock(&bfile->mutex_);
+
+  assert(!bfile->Obsolete() || bfile->Immutable());
+  if (bfile->Immutable()) {
+    return Status::OK();
+  }
+
+  return CloseBlobFile(bfile);
+}
+
+void BlobDBImpl::ObsoleteBlobFile(std::shared_ptr<BlobFile> blob_file,
+                                  SequenceNumber obsolete_seq,
+                                  bool update_size) {
+  assert(blob_file->Immutable());
+  assert(!blob_file->Obsolete());
+
+  // Should hold write lock of mutex_ or during DB open.
+  blob_file->MarkObsolete(obsolete_seq);
+  obsolete_files_.push_back(blob_file);
+  assert(total_blob_size_.load() >= blob_file->GetFileSize());
+  if (update_size) {
+    total_blob_size_ -= blob_file->GetFileSize();
+  }
+}
+
+bool BlobDBImpl::VisibleToActiveSnapshot(
+    const std::shared_ptr<BlobFile>& bfile) {
+  assert(bfile->Obsolete());
+
+  // We check whether the oldest snapshot is no less than the last sequence
+  // number by the time the blob file became obsolete. If so, the blob file is
+  // not visible to any existing snapshot.
+  //
+  // If we kept track of the earliest sequence number of the keys in the blob
+  // file, we could instead check whether there is a snapshot that falls in the
+  // range [earliest_sequence, obsolete_sequence). But doing so would make the
+  // implementation more complicated.
+  SequenceNumber obsolete_sequence = bfile->GetObsoleteSequence();
+  SequenceNumber oldest_snapshot = kMaxSequenceNumber;
+  {
+    // Need to lock the DBImpl mutex before accessing the snapshot list.
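+    // (The oldest snapshot sequence read under this lock feeds the check
+    // below: e.g. a snapshot taken at sequence 90 can still see a file that
+    // became obsolete at sequence 100, since 90 < 100, so the file must be
+    // kept until no such snapshot remains.)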
+    InstrumentedMutexLock l(db_impl_->mutex());
+    auto& snapshots = db_impl_->snapshots();
+    if (!snapshots.empty()) {
+      oldest_snapshot = snapshots.oldest()->GetSequenceNumber();
+    }
+  }
+  bool visible = oldest_snapshot < obsolete_sequence;
+  if (visible) {
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "Obsolete blob file %" PRIu64 " (obsolete at %" PRIu64
+                   ") visible to oldest snapshot %" PRIu64 ".",
+                   bfile->BlobFileNumber(), obsolete_sequence, oldest_snapshot);
+  }
+  return visible;
+}
+
+std::pair<bool, int64_t> BlobDBImpl::EvictExpiredFiles(bool aborted) {
+  if (aborted) {
+    return std::make_pair(false, -1);
+  }
+
+  TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:0");
+  TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:1");
+
+  std::vector<std::shared_ptr<BlobFile>> process_files;
+  uint64_t now = EpochNow();
+  {
+    ReadLock rl(&mutex_);
+    for (auto p : blob_files_) {
+      auto& blob_file = p.second;
+      ReadLock file_lock(&blob_file->mutex_);
+      if (blob_file->HasTTL() && !blob_file->Obsolete() &&
+          blob_file->GetExpirationRange().second <= now) {
+        process_files.push_back(blob_file);
+      }
+    }
+  }
+
+  TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:2");
+  TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:3");
+  TEST_SYNC_POINT_CALLBACK("BlobDBImpl::EvictExpiredFiles:cb", nullptr);
+
+  SequenceNumber seq = GetLatestSequenceNumber();
+  {
+    MutexLock l(&write_mutex_);
+    WriteLock lock(&mutex_);
+    for (auto& blob_file : process_files) {
+      WriteLock file_lock(&blob_file->mutex_);
+
+      // Need to double check if the file is obsolete.
+      if (blob_file->Obsolete()) {
+        assert(blob_file->Immutable());
+        continue;
+      }
+
+      if (!blob_file->Immutable()) {
+        CloseBlobFile(blob_file);
+      }
+
+      assert(blob_file->Immutable());
+
+      ObsoleteBlobFile(blob_file, seq, true /*update_size*/);
+    }
+  }
+
+  return std::make_pair(true, -1);
+}
+
+Status BlobDBImpl::SyncBlobFiles() {
+  MutexLock l(&write_mutex_);
+
+  std::vector<std::shared_ptr<BlobFile>> process_files;
+  {
+    ReadLock rl(&mutex_);
+    for (auto fitr : open_ttl_files_) {
+      process_files.push_back(fitr);
+    }
+    if (open_non_ttl_file_ != nullptr) {
+      process_files.push_back(open_non_ttl_file_);
+    }
+  }
+
+  Status s;
+  for (auto& blob_file : process_files) {
+    s = blob_file->Fsync();
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(db_options_.info_log,
+                      "Failed to sync blob file %" PRIu64 ", status: %s",
+                      blob_file->BlobFileNumber(), s.ToString().c_str());
+      return s;
+    }
+  }
+
+  s = dir_ent_->FsyncWithDirOptions(IOOptions(), nullptr, DirFsyncOptions());
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "Failed to sync blob directory, status: %s",
+                    s.ToString().c_str());
+  }
+  return s;
+}
+
+std::pair<bool, int64_t> BlobDBImpl::ReclaimOpenFiles(bool aborted) {
+  if (aborted) return std::make_pair(false, -1);
+
+  if (open_file_count_.load() < kOpenFilesTrigger) {
+    return std::make_pair(true, -1);
+  }
+
+  // In the future, we should sort by last_access_
+  // instead of closing every file.
+  ReadLock rl(&mutex_);
+  for (auto const& ent : blob_files_) {
+    auto bfile = ent.second;
+    if (bfile->last_access_.load() == -1) continue;
+
+    WriteLock lockbfile_w(&bfile->mutex_);
+    CloseRandomAccessLocked(bfile);
+  }
+
+  return std::make_pair(true, -1);
+}
+
+std::pair<bool, int64_t> BlobDBImpl::DeleteObsoleteFiles(bool aborted) {
+  if (aborted) {
+    return std::make_pair(false, -1);
+  }
+
+  MutexLock delete_file_lock(&delete_file_mutex_);
+  if (disable_file_deletions_ > 0) {
+    return std::make_pair(true, -1);
+  }
+
+  std::list<std::shared_ptr<BlobFile>> tobsolete;
+  {
+    WriteLock wl(&mutex_);
+    if (obsolete_files_.empty()) {
+      return std::make_pair(true, -1);
+    }
+    tobsolete.swap(obsolete_files_);
+  }
+
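+  // For each file moved into tobsolete: skip it (and retry on a later pass)
+  // while it is still visible to an active snapshot; otherwise drop it from
+  // blob_files_ and delete it from disk.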
+  bool file_deleted = false;
+  for (auto iter = tobsolete.begin(); iter != tobsolete.end();) {
+    auto bfile = *iter;
+    {
+      ReadLock lockbfile_r(&bfile->mutex_);
+      if (VisibleToActiveSnapshot(bfile)) {
+        ROCKS_LOG_INFO(db_options_.info_log,
+                       "Could not delete file due to snapshot failure %s",
+                       bfile->PathName().c_str());
+        ++iter;
+        continue;
+      }
+    }
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "Will delete file due to snapshot success %s",
+                   bfile->PathName().c_str());
+
+    {
+      WriteLock wl(&mutex_);
+      blob_files_.erase(bfile->BlobFileNumber());
+    }
+
+    Status s = DeleteDBFile(&(db_impl_->immutable_db_options()),
+                            bfile->PathName(), blob_dir_, true,
+                            /*force_fg=*/false);
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(db_options_.info_log,
+                      "File failed to be deleted as obsolete %s",
+                      bfile->PathName().c_str());
+      ++iter;
+      continue;
+    }
+
+    file_deleted = true;
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "File deleted as obsolete from blob dir %s",
+                   bfile->PathName().c_str());
+
+    iter = tobsolete.erase(iter);
+  }
+
+  // Directory contents changed; fsync.
+  if (file_deleted) {
+    Status s = dir_ent_->FsyncWithDirOptions(
+        IOOptions(), nullptr,
+        DirFsyncOptions(DirFsyncOptions::FsyncReason::kFileDeleted));
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(db_options_.info_log, "Failed to sync dir %s: %s",
+                      blob_dir_.c_str(), s.ToString().c_str());
+    }
+  }
+
+  // Put the files back into the obsolete list if deletion failed for some
+  // reason.
+  if (!tobsolete.empty()) {
+    WriteLock wl(&mutex_);
+    for (auto bfile : tobsolete) {
+      blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile));
+      obsolete_files_.push_front(bfile);
+    }
+  }
+
+  return std::make_pair(!aborted, -1);
+}
+
+void BlobDBImpl::CopyBlobFiles(
+    std::vector<std::shared_ptr<BlobFile>>* bfiles_copy) {
+  ReadLock rl(&mutex_);
+  for (auto const& p : blob_files_) {
+    bfiles_copy->push_back(p.second);
+  }
+}
+
+Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) {
+  if (read_options.io_activity != Env::IOActivity::kUnknown) {
+    return NewErrorIterator(Status::InvalidArgument(
+        "Cannot call NewIterator with `ReadOptions::io_activity` != "
+        "`Env::IOActivity::kUnknown`"));
+  }
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+          ->cfd();
+  // Take a snapshot so blob files cannot be deleted between fetching an
+  // index entry and reading from the file.
+  ManagedSnapshot* own_snapshot = nullptr;
+  const Snapshot* snapshot = read_options.snapshot;
+  if (snapshot == nullptr) {
+    own_snapshot = new ManagedSnapshot(db_);
+    snapshot = own_snapshot->snapshot();
+  }
+  auto* iter = db_impl_->NewIteratorImpl(
+      read_options, cfd, snapshot->GetSequenceNumber(),
+      nullptr /*read_callback*/, true /*expose_blob_index*/);
+  return new BlobDBIterator(own_snapshot, iter, this, clock_, statistics_);
+}
+
+Status DestroyBlobDB(const std::string& dbname, const Options& options,
+                     const BlobDBOptions& bdb_options) {
+  const ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
+  Env* env = soptions.env;
+
+  Status status;
+  std::string blobdir;
+  blobdir = (bdb_options.path_relative) ? dbname + "/" + bdb_options.blob_dir
+                                        : bdb_options.blob_dir;
+
+  std::vector<std::string> filenames;
+  if (env->GetChildren(blobdir, &filenames).ok()) {
+    for (const auto& f : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(f, &number, &type) && type == kBlobFile) {
+        Status del = DeleteDBFile(&soptions, blobdir + "/" + f, blobdir, true,
+                                  /*force_fg=*/false);
+        if (status.ok() && !del.ok()) {
+          status = del;
+        }
+      }
+    }
+    // TODO: What to do if we cannot delete the directory?
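+    // The returned status is deliberately swallowed via
+    // PermitUncheckedError() below: the directory may legitimately be
+    // non-empty here if some blob file above could not be deleted.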
+ env->DeleteDir(blobdir).PermitUncheckedError(); + } + Status destroy = DestroyDB(dbname, options); + if (status.ok() && !destroy.ok()) { + status = destroy; + } + + return status; +} + +#ifndef NDEBUG +Status BlobDBImpl::TEST_GetBlobValue(const Slice& key, const Slice& index_entry, + PinnableSlice* value) { + return GetBlobValue(key, index_entry, value); +} + +void BlobDBImpl::TEST_AddDummyBlobFile(uint64_t blob_file_number, + SequenceNumber immutable_sequence) { + auto blob_file = std::make_shared(this, blob_dir_, blob_file_number, + db_options_.info_log.get()); + blob_file->MarkImmutable(immutable_sequence); + + blob_files_[blob_file_number] = blob_file; + live_imm_non_ttl_blob_files_[blob_file_number] = blob_file; +} + +std::vector> BlobDBImpl::TEST_GetBlobFiles() const { + ReadLock l(&mutex_); + std::vector> blob_files; + for (auto& p : blob_files_) { + blob_files.emplace_back(p.second); + } + return blob_files; +} + +std::vector> BlobDBImpl::TEST_GetLiveImmNonTTLFiles() + const { + ReadLock l(&mutex_); + std::vector> live_imm_non_ttl_files; + for (const auto& pair : live_imm_non_ttl_blob_files_) { + live_imm_non_ttl_files.emplace_back(pair.second); + } + return live_imm_non_ttl_files; +} + +std::vector> BlobDBImpl::TEST_GetObsoleteFiles() + const { + ReadLock l(&mutex_); + std::vector> obsolete_files; + for (auto& bfile : obsolete_files_) { + obsolete_files.emplace_back(bfile); + } + return obsolete_files; +} + +void BlobDBImpl::TEST_DeleteObsoleteFiles() { + DeleteObsoleteFiles(false /*abort*/); +} + +Status BlobDBImpl::TEST_CloseBlobFile(std::shared_ptr& bfile) { + MutexLock l(&write_mutex_); + WriteLock lock(&mutex_); + WriteLock file_lock(&bfile->mutex_); + + return CloseBlobFile(bfile); +} + +void BlobDBImpl::TEST_ObsoleteBlobFile(std::shared_ptr& blob_file, + SequenceNumber obsolete_seq, + bool update_size) { + return ObsoleteBlobFile(blob_file, obsolete_seq, update_size); +} + +void BlobDBImpl::TEST_EvictExpiredFiles() { + EvictExpiredFiles(false /*abort*/); +} + +uint64_t BlobDBImpl::TEST_live_sst_size() { return live_sst_size_.load(); } + +void BlobDBImpl::TEST_InitializeBlobFileToSstMapping( + const std::vector& live_files) { + InitializeBlobFileToSstMapping(live_files); +} + +void BlobDBImpl::TEST_ProcessFlushJobInfo(const FlushJobInfo& info) { + ProcessFlushJobInfo(info); +} + +void BlobDBImpl::TEST_ProcessCompactionJobInfo(const CompactionJobInfo& info) { + ProcessCompactionJobInfo(info); +} + +#endif // !NDEBUG + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl.h new file mode 100644 index 0000000000..de888c0683 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl.h @@ -0,0 +1,501 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "db/db_iter.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/wal_filter.h" +#include "util/mutexlock.h" +#include "util/timer_queue.h" +#include "utilities/blob_db/blob_db.h" +#include "utilities/blob_db/blob_file.h" + +namespace ROCKSDB_NAMESPACE { + +class DBImpl; +class ColumnFamilyHandle; +class ColumnFamilyData; +class SystemClock; + +struct FlushJobInfo; + +namespace blob_db { + +struct BlobCompactionContext; +struct BlobCompactionContextGC; +class BlobDBImpl; +class BlobFile; + +// Comparator to sort "TTL" aware Blob files based on the lower value of +// TTL range. +struct BlobFileComparatorTTL { + bool operator()(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) const; +}; + +struct BlobFileComparator { + bool operator()(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) const; +}; + +/** + * The implementation class for BlobDB. It manages the blob logs, which + * are sequentially written files. Blob logs can be of the TTL or non-TTL + * varieties; the former are cleaned up when they expire, while the latter + * are (optionally) garbage collected. + */ +class BlobDBImpl : public BlobDB { + friend class BlobFile; + friend class BlobDBIterator; + friend class BlobDBListener; + friend class BlobDBListenerGC; + friend class BlobIndexCompactionFilterBase; + friend class BlobIndexCompactionFilterGC; + + public: + // deletions check period + static constexpr uint32_t kDeleteCheckPeriodMillisecs = 2 * 1000; + + // sanity check task + static constexpr uint32_t kSanityCheckPeriodMillisecs = 20 * 60 * 1000; + + // how many random access open files can we tolerate + static constexpr uint32_t kOpenFilesTrigger = 100; + + // how often to schedule reclaim open files. + static constexpr uint32_t kReclaimOpenFilesPeriodMillisecs = 1 * 1000; + + // how often to schedule delete obs files periods + static constexpr uint32_t kDeleteObsoleteFilesPeriodMillisecs = 10 * 1000; + + // how often to schedule expired files eviction. 
+ static constexpr uint32_t kEvictExpiredFilesPeriodMillisecs = 10 * 1000; + + // when should oldest file be evicted: + // on reaching 90% of blob_dir_size + static constexpr double kEvictOldestFileAtSize = 0.9; + + using BlobDB::Put; + Status Put(const WriteOptions& options, const Slice& key, + const Slice& value) override; + + using BlobDB::Get; + Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) override; + + Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value, + uint64_t* expiration) override; + + using BlobDB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& read_options) override; + + using BlobDB::NewIterators; + virtual Status NewIterators( + const ReadOptions& /*read_options*/, + const std::vector& /*column_families*/, + std::vector* /*iterators*/) override { + return Status::NotSupported("Not implemented"); + } + + using BlobDB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& read_options, const std::vector& keys, + std::vector* values) override; + + using BlobDB::Write; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + + virtual Status Close() override; + + using BlobDB::PutWithTTL; + Status PutWithTTL(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t ttl) override; + + using BlobDB::PutUntil; + Status PutUntil(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t expiration) override; + + using BlobDB::CompactFiles; + Status CompactFiles( + const CompactionOptions& compact_options, + const std::vector& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) override; + + BlobDBOptions GetBlobDBOptions() const override; + + BlobDBImpl(const std::string& dbname, const BlobDBOptions& bdb_options, + const DBOptions& db_options, + const ColumnFamilyOptions& cf_options); + + virtual Status DisableFileDeletions() override; + + virtual Status EnableFileDeletions(bool force) override; + + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) override; + virtual void GetLiveFilesMetaData(std::vector*) override; + + ~BlobDBImpl(); + + Status Open(std::vector* handles); + + Status SyncBlobFiles() override; + + // Common part of the two GetCompactionContext methods below. 
+ // REQUIRES: read lock on mutex_ + void GetCompactionContextCommon(BlobCompactionContext* context); + + void GetCompactionContext(BlobCompactionContext* context); + void GetCompactionContext(BlobCompactionContext* context, + BlobCompactionContextGC* context_gc); + +#ifndef NDEBUG + Status TEST_GetBlobValue(const Slice& key, const Slice& index_entry, + PinnableSlice* value); + + void TEST_AddDummyBlobFile(uint64_t blob_file_number, + SequenceNumber immutable_sequence); + + std::vector> TEST_GetBlobFiles() const; + + std::vector> TEST_GetLiveImmNonTTLFiles() const; + + std::vector> TEST_GetObsoleteFiles() const; + + Status TEST_CloseBlobFile(std::shared_ptr& bfile); + + void TEST_ObsoleteBlobFile(std::shared_ptr& blob_file, + SequenceNumber obsolete_seq = 0, + bool update_size = true); + + void TEST_EvictExpiredFiles(); + + void TEST_DeleteObsoleteFiles(); + + uint64_t TEST_live_sst_size(); + + const std::string& TEST_blob_dir() const { return blob_dir_; } + + void TEST_InitializeBlobFileToSstMapping( + const std::vector& live_files); + + void TEST_ProcessFlushJobInfo(const FlushJobInfo& info); + + void TEST_ProcessCompactionJobInfo(const CompactionJobInfo& info); + +#endif // !NDEBUG + + private: + class BlobInserter; + + // Create a snapshot if there isn't one in read options. + // Return true if a snapshot is created. + bool SetSnapshotIfNeeded(ReadOptions* read_options); + + Status GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, uint64_t* expiration = nullptr); + + Status GetBlobValue(const Slice& key, const Slice& index_entry, + PinnableSlice* value, uint64_t* expiration = nullptr); + + Status GetRawBlobFromFile(const Slice& key, uint64_t file_number, + uint64_t offset, uint64_t size, + PinnableSlice* value, + CompressionType* compression_type); + + Slice GetCompressedSlice(const Slice& raw, + std::string* compression_output) const; + + Status DecompressSlice(const Slice& compressed_value, + CompressionType compression_type, + PinnableSlice* value_output) const; + + // Close a file by appending a footer, and removes file from open files list. + // REQUIRES: lock held on write_mutex_, write lock held on both the db mutex_ + // and the blob file's mutex_. If called on a blob file which is visible only + // to a single thread (like in the case of new files written during + // compaction/GC), the locks on write_mutex_ and the blob file's mutex_ can be + // avoided. + Status CloseBlobFile(std::shared_ptr bfile); + + // Close a file if its size exceeds blob_file_size + // REQUIRES: lock held on write_mutex_. + Status CloseBlobFileIfNeeded(std::shared_ptr& bfile); + + // Mark file as obsolete and move the file to obsolete file list. + // + // REQUIRED: hold write lock of mutex_ or during DB open. + void ObsoleteBlobFile(std::shared_ptr blob_file, + SequenceNumber obsolete_seq, bool update_size); + + Status PutBlobValue(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t expiration, + WriteBatch* batch); + + Status AppendBlob(const std::shared_ptr& bfile, + const std::string& headerbuf, const Slice& key, + const Slice& value, uint64_t expiration, + std::string* index_entry); + + // Create a new blob file and associated writer. + Status CreateBlobFileAndWriter(bool has_ttl, + const ExpirationRange& expiration_range, + const std::string& reason, + std::shared_ptr* blob_file, + std::shared_ptr* writer); + + // Get the open non-TTL blob log file, or create a new one if no such file + // exists. 
+  Status SelectBlobFile(std::shared_ptr<BlobFile>* blob_file);
+
+  // Get the open TTL blob log file for a certain expiration, or create a new
+  // one if no such file exists.
+  Status SelectBlobFileTTL(uint64_t expiration,
+                           std::shared_ptr<BlobFile>* blob_file);
+
+  std::shared_ptr<BlobFile> FindBlobFileLocked(uint64_t expiration) const;
+
+  // Periodic sanity check; runs a bunch of checks.
+  std::pair<bool, int64_t> SanityCheck(bool aborted);
+
+  // Delete files that have been marked obsolete (either because of TTL
+  // or GC). Check whether any snapshots exist which refer to the same.
+  std::pair<bool, int64_t> DeleteObsoleteFiles(bool aborted);
+
+  // Periodically check whether any open blob files' TTLs have expired;
+  // if expired, close the sequential writer and make the file immutable.
+  std::pair<bool, int64_t> EvictExpiredFiles(bool aborted);
+
+  // If the number of open files approaches the ulimit, this task closes
+  // random-access readers, which are otherwise kept around for efficiency.
+  std::pair<bool, int64_t> ReclaimOpenFiles(bool aborted);
+
+  std::pair<bool, int64_t> RemoveTimerQ(TimerQueue* tq, bool aborted);
+
+  // Adds the background tasks to the timer queue
+  void StartBackgroundTasks();
+
+  // add a new Blob File
+  std::shared_ptr<BlobFile> NewBlobFile(bool has_ttl,
+                                        const ExpirationRange& expiration_range,
+                                        const std::string& reason);
+
+  // Register a new blob file.
+  // REQUIRES: write lock on mutex_.
+  void RegisterBlobFile(std::shared_ptr<BlobFile> blob_file);
+
+  // collect all the blob log files from the blob directory
+  Status GetAllBlobFiles(std::set<uint64_t>* file_numbers);
+
+  // Open all blob files found in blob_dir.
+  Status OpenAllBlobFiles();
+
+  // Link an SST to a blob file. Comes in locking and non-locking varieties
+  // (the latter is used during Open).
+  template <typename Linker>
+  void LinkSstToBlobFileImpl(uint64_t sst_file_number,
+                             uint64_t blob_file_number, Linker linker);
+
+  void LinkSstToBlobFile(uint64_t sst_file_number, uint64_t blob_file_number);
+
+  void LinkSstToBlobFileNoLock(uint64_t sst_file_number,
+                               uint64_t blob_file_number);
+
+  // Unlink an SST from a blob file.
+  void UnlinkSstFromBlobFile(uint64_t sst_file_number,
+                             uint64_t blob_file_number);
+
+  // Initialize the mapping between blob files and SSTs during Open.
+  void InitializeBlobFileToSstMapping(
+      const std::vector<LiveFileMetaData*>& live_files);
+
+  // Update the mapping between blob files and SSTs after a flush and mark
+  // any unneeded blob files obsolete.
+  void ProcessFlushJobInfo(const FlushJobInfo& info);
+
+  // Update the mapping between blob files and SSTs after a compaction and
+  // mark any unneeded blob files obsolete.
+  void ProcessCompactionJobInfo(const CompactionJobInfo& info);
+
+  // Mark an immutable non-TTL blob file obsolete assuming it has no more SSTs
+  // linked to it, and all memtables from before the blob file became immutable
+  // have been flushed. Note: should only be called if the condition holds for
+  // all lower-numbered non-TTL blob files as well.
+  bool MarkBlobFileObsoleteIfNeeded(const std::shared_ptr<BlobFile>& blob_file,
+                                    SequenceNumber obsolete_seq);
+
+  // Mark all immutable non-TTL blob files that aren't needed by any SSTs as
+  // obsolete. Comes in two varieties; the version used during Open need not
+  // worry about locking or snapshots.
+  template <typename Functor>
+  void MarkUnreferencedBlobFilesObsoleteImpl(Functor mark_if_needed);
+
+  void MarkUnreferencedBlobFilesObsolete();
+  void MarkUnreferencedBlobFilesObsoleteDuringOpen();
+
+  void UpdateLiveSSTSize();
+
+  Status GetBlobFileReader(const std::shared_ptr<BlobFile>& blob_file,
+                           std::shared_ptr<RandomAccessFileReader>* reader);
+
+  // hold write mutex on file and call.
+  // Close the above random access reader.
+  void CloseRandomAccessLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  // hold write mutex on file and call
+  // creates a sequential (append) writer for this blobfile
+  Status CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  // Returns a BlobLogWriter object for the file. If a writer is not
+  // already present, creates one. Needs the write mutex to be held.
+  Status CheckOrCreateWriterLocked(const std::shared_ptr<BlobFile>& blob_file,
+                                   std::shared_ptr<BlobLogWriter>* writer);
+
+  // Checks that no snapshot is still referencing the blobs.
+  bool VisibleToActiveSnapshot(const std::shared_ptr<BlobFile>& file);
+  bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  void CopyBlobFiles(std::vector<std::shared_ptr<BlobFile>>* bfiles_copy);
+
+  uint64_t EpochNow() { return clock_->NowMicros() / 1000000; }
+
+  // Check if inserting a new blob will make the DB grow out of space.
+  // If is_fifo = true, FIFO eviction will be triggered to make room for the
+  // new blob. If force_evict = true, FIFO eviction will evict blob files
+  // even if eviction will not make enough room for the new blob.
+  Status CheckSizeAndEvictBlobFiles(uint64_t blob_size,
+                                    bool force_evict = false);
+
+  // name of the database directory
+  std::string dbname_;
+
+  // the base DB
+  DBImpl* db_impl_;
+  Env* env_;
+  SystemClock* clock_;
+  // the options that govern the behavior of Blob Storage
+  BlobDBOptions bdb_options_;
+  DBOptions db_options_;
+  ColumnFamilyOptions cf_options_;
+  FileOptions file_options_;
+
+  // Raw pointer of statistic. db_options_ has a std::shared_ptr to hold
+  // ownership.
+  Statistics* statistics_;
+
+  // by default this is "blob_dir" under dbname_
+  // but can be configured
+  std::string blob_dir_;
+
+  // pointer to directory
+  std::unique_ptr<FSDirectory> dir_ent_;
+
+  // Read-write mutex, which protects all the data structures.
+  // HEAVILY TRAFFICKED
+  mutable port::RWMutex mutex_;
+
+  // Writers have to hold write_mutex_ before writing.
+  mutable port::Mutex write_mutex_;
+
+  // counter for blob file number
+  std::atomic<uint64_t> next_file_number_;
+
+  // In-memory metadata of all the blob files.
+  std::map<uint64_t, std::shared_ptr<BlobFile>> blob_files_;
+
+  // All live immutable non-TTL blob files.
+  std::map<uint64_t, std::shared_ptr<BlobFile>> live_imm_non_ttl_blob_files_;
+
+  // The largest sequence number that has been flushed.
+  SequenceNumber flush_sequence_;
+
+  // opened non-TTL blob file.
+  std::shared_ptr<BlobFile> open_non_ttl_file_;
+
+  // all the blob files which are currently being appended to, based
+  // on the variety of incoming TTLs
+  std::set<std::shared_ptr<BlobFile>, BlobFileComparatorTTL> open_ttl_files_;
+
+  // Flag to check whether Close() has been called on this DB
+  bool closed_;
+
+  // timer based queue to execute tasks
+  TimerQueue tqueue_;
+
+  // number of files opened for random access/GET;
+  // the counter is used to monitor and close excess RA files.
+  std::atomic<int32_t> open_file_count_;
+
+  // Total size of all live blob files (i.e. excludes obsolete files).
+  std::atomic<uint64_t> total_blob_size_;
+
+  // total size of SST files.
+  std::atomic<uint64_t> live_sst_size_;
+
+  // Latest FIFO eviction timestamp
+  //
+  // REQUIRES: access with mutex_ lock held.
+  uint64_t fifo_eviction_seq_;
+
+  // The expiration up to which the latest FIFO eviction evicts.
+  //
+  // REQUIRES: access with mutex_ lock held.
+  uint64_t evict_expiration_up_to_;
+
+  std::list<std::shared_ptr<BlobFile>> obsolete_files_;
+
+  // DeleteObsoleteFiles, DisableFileDeletions and EnableFileDeletions block
+  // on the mutex to avoid contention.
+  //
+  // While DeleteObsoleteFiles holds both mutex_ and delete_file_mutex_, note
+  // the difference:
+  // mutex_ only needs to be held when accessing the
+  // data structures, while delete_file_mutex_ needs to be held for the whole
+  // duration of DeleteObsoleteFiles to avoid it being run simultaneously with
+  // DisableFileDeletions.
+  //
+  // If both mutex_ and delete_file_mutex_ need to be held, it is advised
+  // to acquire delete_file_mutex_ first to avoid deadlock.
+  mutable port::Mutex delete_file_mutex_;
+
+  // Each call of DisableFileDeletions will increase disable_file_deletions_
+  // by 1. EnableFileDeletions will either decrease the count by 1 or reset
+  // it to zero, depending on the force flag.
+  //
+  // REQUIRES: access with delete_file_mutex_ held.
+  int disable_file_deletions_ = 0;
+
+  uint32_t debug_level_;
+};
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl_filesnapshot.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl_filesnapshot.cc
new file mode 100644
index 0000000000..da2d02d07a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_impl_filesnapshot.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "utilities/blob_db/blob_db_impl.h"
+
+// BlobDBImpl methods to get snapshot of files, e.g. for replication.
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+Status BlobDBImpl::DisableFileDeletions() {
+  // Disable base DB file deletions.
+  Status s = db_impl_->DisableFileDeletions();
+  if (!s.ok()) {
+    return s;
+  }
+
+  int count = 0;
+  {
+    // Hold delete_file_mutex_ to make sure no DeleteObsoleteFiles job
+    // is running.
+    MutexLock l(&delete_file_mutex_);
+    count = ++disable_file_deletions_;
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "Disabled blob file deletions. count: %d", count);
+  return Status::OK();
+}
+
+Status BlobDBImpl::EnableFileDeletions(bool force) {
+  // Enable base DB file deletions.
+  Status s = db_impl_->EnableFileDeletions(force);
+  if (!s.ok()) {
+    return s;
+  }
+
+  int count = 0;
+  {
+    MutexLock l(&delete_file_mutex_);
+    if (force) {
+      disable_file_deletions_ = 0;
+    } else if (disable_file_deletions_ > 0) {
+      count = --disable_file_deletions_;
+    }
+    assert(count >= 0);
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log, "Enabled blob file deletions. count: %d",
+                 count);
+  // Consider triggering DeleteObsoleteFiles once right after deletions are
+  // re-enabled, if the DeleteObsoleteFiles re-run interval ever becomes
+  // configurable.
+  return Status::OK();
+}
+
+Status BlobDBImpl::GetLiveFiles(std::vector<std::string>& ret,
+                                uint64_t* manifest_file_size,
+                                bool flush_memtable) {
+  if (!bdb_options_.path_relative) {
+    return Status::NotSupported(
+        "Not able to get relative blob file path from absolute blob_dir.");
+  }
+  // Hold a lock in the beginning to avoid updates to base DB during the call
+  ReadLock rl(&mutex_);
+  Status s = db_->GetLiveFiles(ret, manifest_file_size, flush_memtable);
+  if (!s.ok()) {
+    return s;
+  }
+  ret.reserve(ret.size() + blob_files_.size());
+  for (auto bfile_pair : blob_files_) {
+    auto blob_file = bfile_pair.second;
+    // Path should be relative to db_name, but begin with a slash.
+ ret.emplace_back( + BlobFileName("", bdb_options_.blob_dir, blob_file->BlobFileNumber())); + } + return Status::OK(); +} + +void BlobDBImpl::GetLiveFilesMetaData(std::vector* metadata) { + // Path should be relative to db_name. + assert(bdb_options_.path_relative); + // Hold a lock in the beginning to avoid updates to base DB during the call + ReadLock rl(&mutex_); + db_->GetLiveFilesMetaData(metadata); + for (auto bfile_pair : blob_files_) { + auto blob_file = bfile_pair.second; + LiveFileMetaData filemetadata; + filemetadata.size = blob_file->GetFileSize(); + const uint64_t file_number = blob_file->BlobFileNumber(); + // Path should be relative to db_name, but begin with slash. + filemetadata.name = BlobFileName("", bdb_options_.blob_dir, file_number); + filemetadata.file_number = file_number; + if (blob_file->HasTTL()) { + filemetadata.oldest_ancester_time = blob_file->GetExpirationRange().first; + } + auto cfh = + static_cast_with_check(DefaultColumnFamily()); + filemetadata.column_family_name = cfh->GetName(); + metadata->emplace_back(filemetadata); + } +} + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_iterator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_iterator.h new file mode 100644 index 0000000000..4898ddfd76 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_iterator.h @@ -0,0 +1,148 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "db/arena_wrapped_db_iter.h" +#include "rocksdb/iterator.h" +#include "util/stop_watch.h" +#include "utilities/blob_db/blob_db_impl.h" + +namespace ROCKSDB_NAMESPACE { +class Statistics; +class SystemClock; + +namespace blob_db { + +using ROCKSDB_NAMESPACE::ManagedSnapshot; + +class BlobDBIterator : public Iterator { + public: + BlobDBIterator(ManagedSnapshot* snapshot, ArenaWrappedDBIter* iter, + BlobDBImpl* blob_db, SystemClock* clock, + Statistics* statistics) + : snapshot_(snapshot), + iter_(iter), + blob_db_(blob_db), + clock_(clock), + statistics_(statistics) {} + + virtual ~BlobDBIterator() = default; + + bool Valid() const override { + if (!iter_->Valid()) { + return false; + } + return status_.ok(); + } + + Status status() const override { + if (!iter_->status().ok()) { + return iter_->status(); + } + return status_; + } + + void SeekToFirst() override { + StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_SEEK); + iter_->SeekToFirst(); + while (UpdateBlobValue()) { + iter_->Next(); + } + } + + void SeekToLast() override { + StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_SEEK); + iter_->SeekToLast(); + while (UpdateBlobValue()) { + iter_->Prev(); + } + } + + void Seek(const Slice& target) override { + StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_SEEK); + iter_->Seek(target); + while (UpdateBlobValue()) { + iter_->Next(); + } + } + + void SeekForPrev(const Slice& target) override { + StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_SEEK); + iter_->SeekForPrev(target); + while (UpdateBlobValue()) { + iter_->Prev(); + } + } + + void 
Next() override { + assert(Valid()); + StopWatch next_sw(clock_, statistics_, BLOB_DB_NEXT_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_NEXT); + iter_->Next(); + while (UpdateBlobValue()) { + iter_->Next(); + } + } + + void Prev() override { + assert(Valid()); + StopWatch prev_sw(clock_, statistics_, BLOB_DB_PREV_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_PREV); + iter_->Prev(); + while (UpdateBlobValue()) { + iter_->Prev(); + } + } + + Slice key() const override { + assert(Valid()); + return iter_->key(); + } + + Slice value() const override { + assert(Valid()); + if (!iter_->IsBlob()) { + return iter_->value(); + } + return value_; + } + + // Iterator::Refresh() not supported. + + private: + // Return true if caller should continue to next value. + bool UpdateBlobValue() { + value_.Reset(); + status_ = Status::OK(); + if (iter_->Valid() && iter_->status().ok() && iter_->IsBlob()) { + Status s = blob_db_->GetBlobValue(iter_->key(), iter_->value(), &value_); + if (s.IsNotFound()) { + return true; + } else { + if (!s.ok()) { + status_ = s; + } + return false; + } + } else { + return false; + } + } + + std::unique_ptr snapshot_; + std::unique_ptr iter_; + BlobDBImpl* blob_db_; + SystemClock* clock_; + Statistics* statistics_; + Status status_; + PinnableSlice value_; +}; +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_listener.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_listener.h new file mode 100644 index 0000000000..16aed33404 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_db_listener.h @@ -0,0 +1,69 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + + +#include + +#include "rocksdb/listener.h" +#include "util/mutexlock.h" +#include "utilities/blob_db/blob_db_impl.h" + +namespace ROCKSDB_NAMESPACE { +namespace blob_db { + +class BlobDBListener : public EventListener { + public: + explicit BlobDBListener(BlobDBImpl* blob_db_impl) + : blob_db_impl_(blob_db_impl) {} + + void OnFlushBegin(DB* /*db*/, const FlushJobInfo& /*info*/) override { + assert(blob_db_impl_ != nullptr); + blob_db_impl_->SyncBlobFiles(); + } + + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& /*info*/) override { + assert(blob_db_impl_ != nullptr); + blob_db_impl_->UpdateLiveSSTSize(); + } + + void OnCompactionCompleted(DB* /*db*/, + const CompactionJobInfo& /*info*/) override { + assert(blob_db_impl_ != nullptr); + blob_db_impl_->UpdateLiveSSTSize(); + } + + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "BlobDBListener"; } + + protected: + BlobDBImpl* blob_db_impl_; +}; + +class BlobDBListenerGC : public BlobDBListener { + public: + explicit BlobDBListenerGC(BlobDBImpl* blob_db_impl) + : BlobDBListener(blob_db_impl) {} + + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "BlobDBListenerGC"; } + void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { + BlobDBListener::OnFlushCompleted(db, info); + + assert(blob_db_impl_); + blob_db_impl_->ProcessFlushJobInfo(info); + } + + void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override { + BlobDBListener::OnCompactionCompleted(db, info); + + assert(blob_db_impl_); + blob_db_impl_->ProcessCompactionJobInfo(info); + } +}; + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_dump_tool.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_dump_tool.h new file mode 100644 index 0000000000..12cd1bf422 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_dump_tool.h @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include +#include +#include + +#include "db/blob/blob_log_format.h" +#include "file/random_access_file_reader.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +namespace blob_db { + +class BlobDumpTool { + public: + enum class DisplayType { + kNone, + kRaw, + kHex, + kDetail, + }; + + BlobDumpTool(); + + Status Run(const std::string& filename, DisplayType show_key, + DisplayType show_blob, DisplayType show_uncompressed_blob, + bool show_summary); + + private: + std::unique_ptr reader_; + std::unique_ptr buffer_; + size_t buffer_size_; + + Status Read(uint64_t offset, size_t size, Slice* result); + Status DumpBlobLogHeader(uint64_t* offset, CompressionType* compression); + Status DumpBlobLogFooter(uint64_t file_size, uint64_t* footer_offset); + Status DumpRecord(DisplayType show_key, DisplayType show_blob, + DisplayType show_uncompressed_blob, bool show_summary, + CompressionType compression, uint64_t* offset, + uint64_t* total_records, uint64_t* total_key_size, + uint64_t* total_blob_size, + uint64_t* total_uncompressed_blob_size); + void DumpSlice(const Slice s, DisplayType type); + + template + std::string GetString(std::pair p); +}; + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_file.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_file.cc new file mode 100644 index 0000000000..cad89f2e49 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_file.cc @@ -0,0 +1,316 @@ + +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#include "utilities/blob_db/blob_file.h" + +#include + +#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "file/filename.h" +#include "file/readahead_raf.h" +#include "logging/logging.h" +#include "utilities/blob_db/blob_db_impl.h" + +namespace ROCKSDB_NAMESPACE { + +namespace blob_db { + +BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn, + Logger* info_log) + : parent_(p), path_to_dir_(bdir), file_number_(fn), info_log_(info_log) {} + +BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn, + Logger* info_log, uint32_t column_family_id, + CompressionType compression, bool has_ttl, + const ExpirationRange& expiration_range) + : parent_(p), + path_to_dir_(bdir), + file_number_(fn), + info_log_(info_log), + column_family_id_(column_family_id), + compression_(compression), + has_ttl_(has_ttl), + expiration_range_(expiration_range), + header_(column_family_id, compression, has_ttl, expiration_range), + header_valid_(true) {} + +BlobFile::~BlobFile() { + if (obsolete_) { + std::string pn(PathName()); + Status s = Env::Default()->DeleteFile(PathName()); + if (!s.ok()) { + // ROCKS_LOG_INFO(db_options_.info_log, + // "File could not be deleted %s", pn.c_str()); + } + } +} + +uint32_t BlobFile::GetColumnFamilyId() const { return column_family_id_; } + +std::string BlobFile::PathName() const { + return BlobFileName(path_to_dir_, file_number_); +} + +std::string BlobFile::DumpState() const { + char str[1000]; + snprintf( + str, sizeof(str), + "path: %s fn: %" PRIu64 " blob_count: %" PRIu64 " file_size: %" PRIu64 + " closed: %d obsolete: %d expiration_range: (%" PRIu64 ", %" PRIu64 + "), writer: %d reader: %d", + path_to_dir_.c_str(), file_number_, blob_count_.load(), file_size_.load(), + closed_.load(), obsolete_.load(), expiration_range_.first, + expiration_range_.second, (!!log_writer_), (!!ra_file_reader_)); + return str; +} + +void BlobFile::MarkObsolete(SequenceNumber sequence) { + assert(Immutable()); + obsolete_sequence_ = sequence; + obsolete_.store(true); +} + +Status BlobFile::WriteFooterAndCloseLocked(SequenceNumber sequence) { + BlobLogFooter footer; + footer.blob_count = blob_count_; + if (HasTTL()) { + footer.expiration_range = expiration_range_; + } + + // this will close the file and reset the Writable File Pointer. + Status s = log_writer_->AppendFooter(footer, /* checksum_method */ nullptr, + /* checksum_value */ nullptr); + if (s.ok()) { + closed_ = true; + immutable_sequence_ = sequence; + file_size_ += BlobLogFooter::kSize; + } + // delete the sequential writer + log_writer_.reset(); + return s; +} + +Status BlobFile::ReadFooter(BlobLogFooter* bf) { + if (file_size_ < (BlobLogHeader::kSize + BlobLogFooter::kSize)) { + return Status::IOError("File does not have footer", PathName()); + } + + uint64_t footer_offset = file_size_ - BlobLogFooter::kSize; + // assume that ra_file_reader_ is valid before we enter this + assert(ra_file_reader_); + + Slice result; + std::string buf; + AlignedBuf aligned_buf; + Status s; + // TODO: rate limit reading footers from blob files. 
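+  // Two read paths below: direct I/O requires the file system to fill an
+  // AlignedBuf, while buffered reads can use plain std::string scratch
+  // space. Either way, `result` ends up pointing at BlobLogFooter::kSize
+  // bytes starting at footer_offset.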
+  if (ra_file_reader_->use_direct_io()) {
+    s = ra_file_reader_->Read(IOOptions(), footer_offset, BlobLogFooter::kSize,
+                              &result, nullptr, &aligned_buf,
+                              Env::IO_TOTAL /* rate_limiter_priority */);
+  } else {
+    buf.reserve(BlobLogFooter::kSize + 10);
+    s = ra_file_reader_->Read(IOOptions(), footer_offset, BlobLogFooter::kSize,
+                              &result, &buf[0], nullptr,
+                              Env::IO_TOTAL /* rate_limiter_priority */);
+  }
+  if (!s.ok()) return s;
+  if (result.size() != BlobLogFooter::kSize) {
+    // should not happen
+    return Status::IOError("EOF reached before footer");
+  }
+
+  s = bf->DecodeFrom(result);
+  return s;
+}
+
+Status BlobFile::SetFromFooterLocked(const BlobLogFooter& footer) {
+  blob_count_ = footer.blob_count;
+  expiration_range_ = footer.expiration_range;
+  closed_ = true;
+  return Status::OK();
+}
+
+Status BlobFile::Fsync() {
+  Status s;
+  if (log_writer_.get()) {
+    s = log_writer_->Sync();
+  }
+  return s;
+}
+
+void BlobFile::CloseRandomAccessLocked() {
+  ra_file_reader_.reset();
+  last_access_ = -1;
+}
+
+Status BlobFile::GetReader(Env* env, const FileOptions& file_options,
+                           std::shared_ptr<RandomAccessFileReader>* reader,
+                           bool* fresh_open) {
+  assert(reader != nullptr);
+  assert(fresh_open != nullptr);
+  *fresh_open = false;
+  int64_t current_time = 0;
+  if (env->GetCurrentTime(&current_time).ok()) {
+    last_access_.store(current_time);
+  }
+  Status s;
+
+  {
+    ReadLock lockbfile_r(&mutex_);
+    if (ra_file_reader_) {
+      *reader = ra_file_reader_;
+      return s;
+    }
+  }
+
+  WriteLock lockbfile_w(&mutex_);
+  // Double check.
+  if (ra_file_reader_) {
+    *reader = ra_file_reader_;
+    return s;
+  }
+
+  std::unique_ptr<FSRandomAccessFile> rfile;
+  s = env->GetFileSystem()->NewRandomAccessFile(PathName(), file_options,
+                                                &rfile, nullptr);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(info_log_,
+                    "Failed to open blob file for random-read: %s status: '%s'"
+                    " exists: '%s'",
+                    PathName().c_str(), s.ToString().c_str(),
+                    env->FileExists(PathName()).ToString().c_str());
+    return s;
+  }
+
+  ra_file_reader_ =
+      std::make_shared<RandomAccessFileReader>(std::move(rfile), PathName());
+  *reader = ra_file_reader_;
+  *fresh_open = true;
+  return s;
+}
+
+Status BlobFile::ReadMetadata(const std::shared_ptr<FileSystem>& fs,
+                              const FileOptions& file_options) {
+  assert(Immutable());
+  // Get file size.
+  uint64_t file_size = 0;
+  Status s =
+      fs->GetFileSize(PathName(), file_options.io_options, &file_size, nullptr);
+  if (s.ok()) {
+    file_size_ = file_size;
+  } else {
+    ROCKS_LOG_ERROR(info_log_,
+                    "Failed to get size of blob file %" PRIu64 ", status: %s",
+                    file_number_, s.ToString().c_str());
+    return s;
+  }
+  if (file_size < BlobLogHeader::kSize) {
+    ROCKS_LOG_ERROR(info_log_,
+                    "Incomplete blob file %" PRIu64 ", size: %" PRIu64,
+                    file_number_, file_size);
+    return Status::Corruption("Incomplete blob file header.");
+  }
+
+  // Create file reader.
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  s = RandomAccessFileReader::Create(fs, PathName(), file_options, &file_reader,
+                                     nullptr);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(info_log_,
+                    "Failed to open blob file %" PRIu64 ", status: %s",
+                    file_number_, s.ToString().c_str());
+    return s;
+  }
+
+  // Read file header.
+  std::string header_buf;
+  AlignedBuf aligned_buf;
+  Slice header_slice;
+  // TODO: rate limit reading headers from blob files.
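+  // Layout assumed by the reads below (a sketch; see blob_log_format.h for
+  // the authoritative definition):
+  //
+  //   [BlobLogHeader][record]...[record][BlobLogFooter]
+  //   ^offset 0                         ^file_size_ - BlobLogFooter::kSize
+  //
+  // The footer is absent if the file was never closed cleanly, which is why
+  // a missing or undecodable footer is tolerated further down.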
+ if (file_reader->use_direct_io()) { + s = file_reader->Read(IOOptions(), 0, BlobLogHeader::kSize, &header_slice, + nullptr, &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); + } else { + header_buf.reserve(BlobLogHeader::kSize); + s = file_reader->Read(IOOptions(), 0, BlobLogHeader::kSize, &header_slice, + &header_buf[0], nullptr, + Env::IO_TOTAL /* rate_limiter_priority */); + } + if (!s.ok()) { + ROCKS_LOG_ERROR( + info_log_, "Failed to read header of blob file %" PRIu64 ", status: %s", + file_number_, s.ToString().c_str()); + return s; + } + BlobLogHeader header; + s = header.DecodeFrom(header_slice); + if (!s.ok()) { + ROCKS_LOG_ERROR(info_log_, + "Failed to decode header of blob file %" PRIu64 + ", status: %s", + file_number_, s.ToString().c_str()); + return s; + } + column_family_id_ = header.column_family_id; + compression_ = header.compression; + has_ttl_ = header.has_ttl; + if (has_ttl_) { + expiration_range_ = header.expiration_range; + } + header_valid_ = true; + + // Read file footer. + if (file_size_ < BlobLogHeader::kSize + BlobLogFooter::kSize) { + // OK not to have footer. + assert(!footer_valid_); + return Status::OK(); + } + std::string footer_buf; + Slice footer_slice; + // TODO: rate limit reading footers from blob files. + if (file_reader->use_direct_io()) { + s = file_reader->Read(IOOptions(), file_size - BlobLogFooter::kSize, + BlobLogFooter::kSize, &footer_slice, nullptr, + &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); + } else { + footer_buf.reserve(BlobLogFooter::kSize); + s = file_reader->Read(IOOptions(), file_size - BlobLogFooter::kSize, + BlobLogFooter::kSize, &footer_slice, &footer_buf[0], + nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + } + if (!s.ok()) { + ROCKS_LOG_ERROR( + info_log_, "Failed to read footer of blob file %" PRIu64 ", status: %s", + file_number_, s.ToString().c_str()); + return s; + } + BlobLogFooter footer; + s = footer.DecodeFrom(footer_slice); + if (!s.ok()) { + // OK not to have footer. + assert(!footer_valid_); + return Status::OK(); + } + blob_count_ = footer.blob_count; + if (has_ttl_) { + assert(header.expiration_range.first <= footer.expiration_range.first); + assert(header.expiration_range.second >= footer.expiration_range.second); + expiration_range_ = footer.expiration_range; + } + footer_valid_ = true; + return Status::OK(); +} + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_file.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_file.h new file mode 100644 index 0000000000..8651c6b672 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/blob_db/blob_file.h @@ -0,0 +1,244 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include +#include +#include +#include + +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "file/random_access_file_reader.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { +namespace blob_db { + +class BlobDBImpl; + +class BlobFile { + friend class BlobDBImpl; + friend struct BlobFileComparator; + friend struct BlobFileComparatorTTL; + friend class BlobIndexCompactionFilterBase; + friend class BlobIndexCompactionFilterGC; + + private: + // access to parent + const BlobDBImpl* parent_{nullptr}; + + // path to blob directory + std::string path_to_dir_; + + // the id of the file. + // the above 2 are created during file creation and never changed + // after that + uint64_t file_number_{0}; + + // The file numbers of the SST files whose oldest blob file reference + // points to this blob file. + std::unordered_set linked_sst_files_; + + // Info log. + Logger* info_log_{nullptr}; + + // Column family id. + uint32_t column_family_id_{std::numeric_limits::max()}; + + // Compression type of blobs in the file + CompressionType compression_{kNoCompression}; + + // If true, the keys in this file all has TTL. Otherwise all keys don't + // have TTL. + bool has_ttl_{false}; + + // TTL range of blobs in the file. + ExpirationRange expiration_range_; + + // number of blobs in the file + std::atomic blob_count_{0}; + + // size of the file + std::atomic file_size_{0}; + + BlobLogHeader header_; + + // closed_ = true implies the file is no more mutable + // no more blobs will be appended and the footer has been written out + std::atomic closed_{false}; + + // The latest sequence number when the file was closed/made immutable. + SequenceNumber immutable_sequence_{0}; + + // Whether the file was marked obsolete (due to either TTL or GC). + // obsolete_ still needs to do iterator/snapshot checks + std::atomic obsolete_{false}; + + // The last sequence number by the time the file marked as obsolete. + // Data in this file is visible to a snapshot taken before the sequence. + SequenceNumber obsolete_sequence_{0}; + + // Sequential/Append writer for blobs + std::shared_ptr log_writer_; + + // random access file reader for GET calls + std::shared_ptr ra_file_reader_; + + // This Read-Write mutex is per file specific and protects + // all the datastructures + mutable port::RWMutex mutex_; + + // time when the random access reader was last created. + std::atomic last_access_{-1}; + + bool header_valid_{false}; + + bool footer_valid_{false}; + + public: + BlobFile() = default; + + BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum, + Logger* info_log); + + BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum, + Logger* info_log, uint32_t column_family_id, + CompressionType compression, bool has_ttl, + const ExpirationRange& expiration_range); + + ~BlobFile(); + + uint32_t GetColumnFamilyId() const; + + // Returns log file's absolute pathname. + std::string PathName() const; + + // Primary identifier for blob file. + // once the file is created, this never changes + uint64_t BlobFileNumber() const { return file_number_; } + + // Get the set of SST files whose oldest blob file reference points to + // this file. + const std::unordered_set& GetLinkedSstFiles() const { + return linked_sst_files_; + } + + // Link an SST file whose oldest blob file reference points to this file. 
+ void LinkSstFile(uint64_t sst_file_number) { + assert(linked_sst_files_.find(sst_file_number) == linked_sst_files_.end()); + linked_sst_files_.insert(sst_file_number); + } + + // Unlink an SST file whose oldest blob file reference points to this file. + void UnlinkSstFile(uint64_t sst_file_number) { + auto it = linked_sst_files_.find(sst_file_number); + assert(it != linked_sst_files_.end()); + linked_sst_files_.erase(it); + } + + // the following functions are atomic, and don't need + // read lock + uint64_t BlobCount() const { + return blob_count_.load(std::memory_order_acquire); + } + + std::string DumpState() const; + + // if the file is not taking any more appends. + bool Immutable() const { return closed_.load(); } + + // Mark the file as immutable. + // REQUIRES: write lock held, or access from single thread (on DB open). + void MarkImmutable(SequenceNumber sequence) { + closed_ = true; + immutable_sequence_ = sequence; + } + + SequenceNumber GetImmutableSequence() const { + assert(Immutable()); + return immutable_sequence_; + } + + // Whether the file was marked obsolete (due to either TTL or GC). + bool Obsolete() const { + assert(Immutable() || !obsolete_.load()); + return obsolete_.load(); + } + + // Mark file as obsolete (due to either TTL or GC). The file is not visible to + // snapshots with sequence greater or equal to the given sequence. + void MarkObsolete(SequenceNumber sequence); + + SequenceNumber GetObsoleteSequence() const { + assert(Obsolete()); + return obsolete_sequence_; + } + + Status Fsync(); + + uint64_t GetFileSize() const { + return file_size_.load(std::memory_order_acquire); + } + + // All Get functions which are not atomic, will need ReadLock on the mutex + + const ExpirationRange& GetExpirationRange() const { + return expiration_range_; + } + + void ExtendExpirationRange(uint64_t expiration) { + expiration_range_.first = std::min(expiration_range_.first, expiration); + expiration_range_.second = std::max(expiration_range_.second, expiration); + } + + bool HasTTL() const { return has_ttl_; } + + void SetHasTTL(bool has_ttl) { has_ttl_ = has_ttl; } + + CompressionType GetCompressionType() const { return compression_; } + + std::shared_ptr GetWriter() const { return log_writer_; } + + // Read blob file header and footer. Return corruption if file header is + // malform or incomplete. If footer is malform or incomplete, set + // footer_valid_ to false and return Status::OK. 
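+ // Illustrative call sequence (hypothetical caller, shown for clarity):
+ //
+ //   std::shared_ptr<BlobFile> file = ...;
+ //   Status s = file->ReadMetadata(env->GetFileSystem(), FileOptions());
+ //   // s.ok() with footer_valid_ == false is expected for a file that is
+ //   // still open for appends.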
+ Status ReadMetadata(const std::shared_ptr& fs, + const FileOptions& file_options); + + Status GetReader(Env* env, const FileOptions& file_options, + std::shared_ptr* reader, + bool* fresh_open); + + private: + Status ReadFooter(BlobLogFooter* footer); + + Status WriteFooterAndCloseLocked(SequenceNumber sequence); + + void CloseRandomAccessLocked(); + + // this is used, when you are reading only the footer of a + // previously closed file + Status SetFromFooterLocked(const BlobLogFooter& footer); + + void set_expiration_range(const ExpirationRange& expiration_range) { + expiration_range_ = expiration_range; + } + + // The following functions are atomic, and don't need locks + void SetFileSize(uint64_t fs) { file_size_ = fs; } + + void SetBlobCount(uint64_t bc) { blob_count_ = bc; } + + void BlobRecordAdded(uint64_t record_size) { + ++blob_count_; + file_size_ += record_size; + } +}; +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load.cc new file mode 100644 index 0000000000..bbe02a9340 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load.cc @@ -0,0 +1,67 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#include "rocksdb/utilities/cache_dump_load.h" + +#include "file/writable_file_writer.h" +#include "port/lang.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "table/format.h" +#include "util/crc32c.h" +#include "utilities/cache_dump_load_impl.h" + +namespace ROCKSDB_NAMESPACE { + +IOStatus NewToFileCacheDumpWriter(const std::shared_ptr& fs, + const FileOptions& file_opts, + const std::string& file_name, + std::unique_ptr* writer) { + std::unique_ptr file_writer; + IOStatus io_s = WritableFileWriter::Create(fs, file_name, file_opts, + &file_writer, nullptr); + if (!io_s.ok()) { + return io_s; + } + writer->reset(new ToFileCacheDumpWriter(std::move(file_writer))); + return io_s; +} + +IOStatus NewFromFileCacheDumpReader(const std::shared_ptr& fs, + const FileOptions& file_opts, + const std::string& file_name, + std::unique_ptr* reader) { + std::unique_ptr file_reader; + IOStatus io_s = RandomAccessFileReader::Create(fs, file_name, file_opts, + &file_reader, nullptr); + if (!io_s.ok()) { + return io_s; + } + reader->reset(new FromFileCacheDumpReader(std::move(file_reader))); + return io_s; +} + +Status NewDefaultCacheDumper(const CacheDumpOptions& dump_options, + const std::shared_ptr& cache, + std::unique_ptr&& writer, + std::unique_ptr* cache_dumper) { + cache_dumper->reset( + new CacheDumperImpl(dump_options, cache, std::move(writer))); + return Status::OK(); +} + +Status NewDefaultCacheDumpedLoader( + const CacheDumpOptions& dump_options, + const BlockBasedTableOptions& toptions, + const std::shared_ptr& secondary_cache, + std::unique_ptr&& reader, + std::unique_ptr* cache_dump_loader) { + cache_dump_loader->reset(new CacheDumpedLoaderImpl( + dump_options, toptions, secondary_cache, std::move(reader))); + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load_impl.cc new 
file mode 100644
index 0000000000..52f2a4df7d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load_impl.cc
@@ -0,0 +1,369 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/cache_key.h"
+#include "table/block_based/block_based_table_reader.h"
+
+#include "cache/cache_entry_roles.h"
+#include "file/writable_file_writer.h"
+#include "port/lang.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "table/format.h"
+#include "util/crc32c.h"
+#include "utilities/cache_dump_load_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Set the dump filter with a list of DBs. Block cache may be shared by
+// multiple DBs and we may only want to dump out the blocks belonging to
+// certain DB(s). Therefore, a filter is needed to decide if the key of the
+// block satisfies the requirement.
+Status CacheDumperImpl::SetDumpFilter(std::vector<DB*> db_list) {
+  Status s = Status::OK();
+  for (size_t i = 0; i < db_list.size(); i++) {
+    assert(i < db_list.size());
+    TablePropertiesCollection ptc;
+    assert(db_list[i] != nullptr);
+    s = db_list[i]->GetPropertiesOfAllTables(&ptc);
+    if (!s.ok()) {
+      return s;
+    }
+    for (auto id = ptc.begin(); id != ptc.end(); id++) {
+      OffsetableCacheKey base;
+      // We only want to save cache entries that are portable to another
+      // DB::Open, so only save entries with stable keys.
+      bool is_stable;
+      BlockBasedTable::SetupBaseCacheKey(id->second.get(),
+                                         /*cur_db_session_id*/ "",
+                                         /*cur_file_num*/ 0, &base,
+                                         &is_stable);
+      if (is_stable) {
+        Slice prefix_slice = base.CommonPrefixSlice();
+        assert(prefix_slice.size() == OffsetableCacheKey::kCommonPrefixSize);
+        prefix_filter_.insert(prefix_slice.ToString());
+      }
+    }
+  }
+  return s;
+}
+
+// This is the main function to dump out the cache block entries to the writer.
+// The writer may create a file or write to other systems. Currently, we will
+// iterate the whole block cache, get the blocks, and write them to the writer
+IOStatus CacheDumperImpl::DumpCacheEntriesToWriter() {
+  // Prepare stage, check the parameters.
+  if (cache_ == nullptr) {
+    return IOStatus::InvalidArgument("Cache is null");
+  }
+  if (writer_ == nullptr) {
+    return IOStatus::InvalidArgument("CacheDumpWriter is null");
+  }
+  // Set the system clock
+  if (options_.clock == nullptr) {
+    return IOStatus::InvalidArgument("System clock is null");
+  }
+  clock_ = options_.clock;
+
+  // Set the sequence number
+  sequence_num_ = 0;
+
+  // Dump stage, first, we write the header
+  IOStatus io_s = WriteHeader();
+  if (!io_s.ok()) {
+    return io_s;
+  }
+
+  // Then, we iterate the block cache and dump out the blocks that are not
+  // filtered out.
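+  // (Cache::ApplyToAllEntries runs the callback synchronously over every
+  // resident entry; the callback itself skips entries that cannot be
+  // persisted and entries rejected by the prefix filter.)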
+  std::string buf;
+  cache_->ApplyToAllEntries(DumpOneBlockCallBack(buf), {});
+
+  // Finally, write the footer
+  io_s = WriteFooter();
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  io_s = writer_->Close();
+  return io_s;
+}
+
+// Check if we need to filter out the block based on its key
+bool CacheDumperImpl::ShouldFilterOut(const Slice& key) {
+  if (key.size() < OffsetableCacheKey::kCommonPrefixSize) {
+    return /*filter out*/ true;
+  }
+  Slice key_prefix(key.data(), OffsetableCacheKey::kCommonPrefixSize);
+  std::string prefix = key_prefix.ToString();
+  // Filter out if not found
+  return prefix_filter_.find(prefix) == prefix_filter_.end();
+}
+
+// This is the callback function which will be applied to
+// Cache::ApplyToAllEntries. In this callback function, we will get the block
+// type, decide if the block needs to be dumped based on the filter, and write
+// the block through the provided writer. `buf` is passed in for efficient
+// reuse.
+std::function<void(const Slice&, Cache::ObjectPtr, size_t,
+                   const Cache::CacheItemHelper*)>
+CacheDumperImpl::DumpOneBlockCallBack(std::string& buf) {
+  return [&](const Slice& key, Cache::ObjectPtr value, size_t /*charge*/,
+             const Cache::CacheItemHelper* helper) {
+    if (helper == nullptr || helper->size_cb == nullptr ||
+        helper->saveto_cb == nullptr) {
+      // Not compatible with dumping. Skip this entry.
+      return;
+    }
+
+    CacheEntryRole role = helper->role;
+    CacheDumpUnitType type = CacheDumpUnitType::kBlockTypeMax;
+
+    switch (role) {
+      case CacheEntryRole::kDataBlock:
+        type = CacheDumpUnitType::kData;
+        break;
+      case CacheEntryRole::kFilterBlock:
+        type = CacheDumpUnitType::kFilter;
+        break;
+      case CacheEntryRole::kFilterMetaBlock:
+        type = CacheDumpUnitType::kFilterMetaBlock;
+        break;
+      case CacheEntryRole::kIndexBlock:
+        type = CacheDumpUnitType::kIndex;
+        break;
+      default:
+        // Filter out other entries
+        // FIXME? Do we need the CacheDumpUnitTypes? UncompressionDict?
+        return;
+    }
+
+    // Based on the key prefix, check if the block should be filtered out.
+    if (ShouldFilterOut(key)) {
+      return;
+    }
+
+    assert(type != CacheDumpUnitType::kBlockTypeMax);
+
+    // Use cache item helper to get persistable data
+    // FIXME: reduce copying
+    size_t len = helper->size_cb(value);
+    buf.assign(len, '\0');
+    Status s = helper->saveto_cb(value, /*start*/ 0, len, buf.data());
+
+    if (s.ok()) {
+      // Write it out
+      WriteBlock(type, key, buf).PermitUncheckedError();
+    }
+  };
+}
+
+// Write the block to the writer. It takes the timestamp of the
+// block being copied from block cache, block type, key, block pointer,
+// block size and block checksum as the input. When writing the dumper raw
+// block, we first create the dump unit and encode it to a string. Then,
+// we calculate the checksum of the whole dump unit string and store it in
+// the dump unit metadata.
+// We write the metadata first, which is a fixed-size string, and then
+// append the dump unit string to the writer.
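+// As handed to the writer, each block therefore consists of two strings
+// (see DumpUnitMeta and CacheDumperHelper in cache_dump_load_impl.h):
+//
+//   metadata:  [sequence_num:4][dump_unit_checksum:4][dump_unit_size:8]
+//   dump unit: [timestamp:8][type:1][key (length-prefixed)]
+//              [value_len:4][value_checksum:4][value (length-prefixed)]
+//
+// (The file-based writer additionally prepends a fixed 4-byte size to each
+// string; see ToFileCacheDumpWriter.)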
+IOStatus CacheDumperImpl::WriteBlock(CacheDumpUnitType type, const Slice& key, + const Slice& value) { + uint64_t timestamp = clock_->NowMicros(); + uint32_t value_checksum = crc32c::Value(value.data(), value.size()); + + // First, serialize the block information in a string + DumpUnit dump_unit; + dump_unit.timestamp = timestamp; + dump_unit.key = key; + dump_unit.type = type; + dump_unit.value_len = value.size(); + dump_unit.value = const_cast(value.data()); + dump_unit.value_checksum = value_checksum; + std::string encoded_data; + CacheDumperHelper::EncodeDumpUnit(dump_unit, &encoded_data); + + // Second, create the metadata, which contains a sequence number, the dump + // unit string checksum and the string size. The sequence number monotonically + // increases from 0. + DumpUnitMeta unit_meta; + unit_meta.sequence_num = sequence_num_; + sequence_num_++; + unit_meta.dump_unit_checksum = + crc32c::Value(encoded_data.data(), encoded_data.size()); + unit_meta.dump_unit_size = encoded_data.size(); + std::string encoded_meta; + CacheDumperHelper::EncodeDumpUnitMeta(unit_meta, &encoded_meta); + + // We write the metadata first. + assert(writer_ != nullptr); + IOStatus io_s = writer_->WriteMetadata(encoded_meta); + if (!io_s.ok()) { + return io_s; + } + // followed by the dump unit. + return writer_->WritePacket(encoded_data); +} + +// Before we write any block, we write the header first to store the cache dump +// format version, rocksdb version, and brief intro. +IOStatus CacheDumperImpl::WriteHeader() { + std::string header_key = "header"; + std::ostringstream s; + s << kTraceMagic << "\t" + << "Cache dump format version: " << kCacheDumpMajorVersion << "." + << kCacheDumpMinorVersion << "\t" + << "RocksDB Version: " << kMajorVersion << "." << kMinorVersion << "\t" + << "Format: dump_unit_metadata , dump_unit cache_value\n"; + std::string header_value(s.str()); + CacheDumpUnitType type = CacheDumpUnitType::kHeader; + return WriteBlock(type, header_key, header_value); +} + +// Write the footer after all the blocks are stored to indicate the ending. +IOStatus CacheDumperImpl::WriteFooter() { + std::string footer_key = "footer"; + std::string footer_value("cache dump completed"); + CacheDumpUnitType type = CacheDumpUnitType::kFooter; + return WriteBlock(type, footer_key, footer_value); +} + +// This is the main function to restore the cache entries to secondary cache. +// First, we check if all the arguments are valid. Then, we read the block +// sequentially from the reader and insert them to the secondary cache. +IOStatus CacheDumpedLoaderImpl::RestoreCacheEntriesToSecondaryCache() { + // TODO: remove this line when options are used in the loader + (void)options_; + // Step 1: we check if all the arguments are valid + if (secondary_cache_ == nullptr) { + return IOStatus::InvalidArgument("Secondary Cache is null"); + } + if (reader_ == nullptr) { + return IOStatus::InvalidArgument("CacheDumpReader is null"); + } + + // Step 2: read the header + // TODO: we need to check the cache dump format version and RocksDB version + // after the header is read out. + IOStatus io_s; + DumpUnit dump_unit; + std::string data; + io_s = ReadHeader(&data, &dump_unit); + if (!io_s.ok()) { + return io_s; + } + + // Step 3: read out the rest of the blocks from the reader. The loop will stop + // either I/O status is not ok or we reach to the the end. 
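+ // (A kFooter unit is the clean end-of-dump marker; any other loop exit is
+ // surfaced as an I/O error to the caller.)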
+ while (io_s.ok()) { + dump_unit.reset(); + data.clear(); + // read the content and store in the dump_unit + io_s = ReadCacheBlock(&data, &dump_unit); + if (!io_s.ok()) { + break; + } + if (dump_unit.type == CacheDumpUnitType::kFooter) { + break; + } + // Create the uncompressed_block based on the information in the dump_unit + // (There is no block trailer here compatible with block-based SST file.) + Slice content = + Slice(static_cast(dump_unit.value), dump_unit.value_len); + Status s = secondary_cache_->InsertSaved(dump_unit.key, content); + if (!s.ok()) { + io_s = status_to_io_status(std::move(s)); + } + } + if (dump_unit.type == CacheDumpUnitType::kFooter) { + return IOStatus::OK(); + } else { + return io_s; + } +} + +// Read and copy the dump unit metadata to std::string data, decode and create +// the unit metadata based on the string +IOStatus CacheDumpedLoaderImpl::ReadDumpUnitMeta(std::string* data, + DumpUnitMeta* unit_meta) { + assert(reader_ != nullptr); + assert(data != nullptr); + assert(unit_meta != nullptr); + IOStatus io_s = reader_->ReadMetadata(data); + if (!io_s.ok()) { + return io_s; + } + return status_to_io_status( + CacheDumperHelper::DecodeDumpUnitMeta(*data, unit_meta)); +} + +// Read and copy the dump unit to std::string data, decode and create the unit +// based on the string +IOStatus CacheDumpedLoaderImpl::ReadDumpUnit(size_t len, std::string* data, + DumpUnit* unit) { + assert(reader_ != nullptr); + assert(data != nullptr); + assert(unit != nullptr); + IOStatus io_s = reader_->ReadPacket(data); + if (!io_s.ok()) { + return io_s; + } + if (data->size() != len) { + return IOStatus::Corruption( + "The data being read out does not match the size stored in metadata!"); + } + Slice block; + return status_to_io_status(CacheDumperHelper::DecodeDumpUnit(*data, unit)); +} + +// Read the header +IOStatus CacheDumpedLoaderImpl::ReadHeader(std::string* data, + DumpUnit* dump_unit) { + DumpUnitMeta header_meta; + header_meta.reset(); + std::string meta_string; + IOStatus io_s = ReadDumpUnitMeta(&meta_string, &header_meta); + if (!io_s.ok()) { + return io_s; + } + + io_s = ReadDumpUnit(header_meta.dump_unit_size, data, dump_unit); + if (!io_s.ok()) { + return io_s; + } + uint32_t unit_checksum = crc32c::Value(data->data(), data->size()); + if (unit_checksum != header_meta.dump_unit_checksum) { + return IOStatus::Corruption("Read header unit corrupted!"); + } + return io_s; +} + +// Read the blocks after header is read out +IOStatus CacheDumpedLoaderImpl::ReadCacheBlock(std::string* data, + DumpUnit* dump_unit) { + // According to the write process, we read the dump_unit_metadata first + DumpUnitMeta unit_meta; + unit_meta.reset(); + std::string unit_string; + IOStatus io_s = ReadDumpUnitMeta(&unit_string, &unit_meta); + if (!io_s.ok()) { + return io_s; + } + + // Based on the information in the dump_unit_metadata, we read the dump_unit + // and verify if its content is correct. + io_s = ReadDumpUnit(unit_meta.dump_unit_size, data, dump_unit); + if (!io_s.ok()) { + return io_s; + } + uint32_t unit_checksum = crc32c::Value(data->data(), data->size()); + if (unit_checksum != unit_meta.dump_unit_checksum) { + return IOStatus::Corruption( + "Checksum does not match! 
Read dumped unit corrupted!");
+  }
+  return io_s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load_impl.h
new file mode 100644
index 0000000000..9811ff2d91
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cache_dump_load_impl.h
@@ -0,0 +1,356 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <unordered_map>
+
+#include "file/random_access_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/utilities/cache_dump_load.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/parsed_full_filter_block.h"
+#include "table/block_based/reader_common.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The read buffer size for the default CacheDumpReader
+const unsigned int kDumpReaderBufferSize = 1024;  // 1KB
+static const unsigned int kSizePrefixLen = 4;
+
+enum CacheDumpUnitType : unsigned char {
+  kHeader = 1,
+  kFooter = 2,
+  kData = 3,
+  kFilter = 4,
+  kProperties = 5,
+  kCompressionDictionary = 6,
+  kRangeDeletion = 7,
+  kHashIndexPrefixes = 8,
+  kHashIndexMetadata = 9,
+  kMetaIndex = 10,
+  kIndex = 11,
+  kDeprecatedFilterBlock = 12,  // OBSOLETE / DEPRECATED
+  kFilterMetaBlock = 13,
+  kBlockTypeMax,
+};
+
+// The metadata of a dump unit. After it is serialized, its size is a fixed
+// 16 bytes.
+struct DumpUnitMeta {
+  // sequence number is a monotonically increasing number to indicate the order
+  // of the blocks being written. Header is 0.
+  uint32_t sequence_num;
+  // The Crc32c checksum of its dump unit.
+  uint32_t dump_unit_checksum;
+  // The dump unit size after the dump unit is serialized to a string.
+  uint64_t dump_unit_size;
+
+  void reset() {
+    sequence_num = 0;
+    dump_unit_checksum = 0;
+    dump_unit_size = 0;
+  }
+};
+
+// The data structure to hold a block and its information.
+struct DumpUnit {
+  // The timestamp when the block is identified, copied, and dumped from block
+  // cache
+  uint64_t timestamp;
+  // The type of the block
+  CacheDumpUnitType type;
+  // The key of this block when the block is referenced by this Cache
+  Slice key;
+  // The block size
+  size_t value_len;
+  // The Crc32c checksum of the block
+  uint32_t value_checksum;
+  // Pointer to the block. Note that, in the dump process, it points to a
+  // memory buffer copied from cache block. The buffer is freed when we process
+  // the next block. In the load process, we use an std::string to store the
+  // serialized dump_unit read from the reader. So it points to the memory
+  // address of the beginning of the block in this string.
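+  // In both cases the pointer is non-owning: it is only valid while the
+  // backing buffer (the cache copy, or the decode string) is alive.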
+ void* value; + + DumpUnit() { reset(); } + + void reset() { + timestamp = 0; + type = CacheDumpUnitType::kBlockTypeMax; + key.clear(); + value_len = 0; + value_checksum = 0; + value = nullptr; + } +}; + +// The default implementation of the Cache Dumper +class CacheDumperImpl : public CacheDumper { + public: + CacheDumperImpl(const CacheDumpOptions& dump_options, + const std::shared_ptr& cache, + std::unique_ptr&& writer) + : options_(dump_options), cache_(cache), writer_(std::move(writer)) {} + ~CacheDumperImpl() { writer_.reset(); } + Status SetDumpFilter(std::vector db_list) override; + IOStatus DumpCacheEntriesToWriter() override; + + private: + IOStatus WriteBlock(CacheDumpUnitType type, const Slice& key, + const Slice& value); + IOStatus WriteHeader(); + IOStatus WriteFooter(); + bool ShouldFilterOut(const Slice& key); + std::function + DumpOneBlockCallBack(std::string& buf); + + CacheDumpOptions options_; + std::shared_ptr cache_; + std::unique_ptr writer_; + SystemClock* clock_; + uint32_t sequence_num_; + // The cache key prefix filter. Currently, we use db_session_id as the prefix, + // so using std::set to store the prefixes as filter is enough. Further + // improvement can be applied like BloomFilter or others to speedup the + // filtering. + std::set prefix_filter_; +}; + +// The default implementation of CacheDumpedLoader +class CacheDumpedLoaderImpl : public CacheDumpedLoader { + public: + CacheDumpedLoaderImpl(const CacheDumpOptions& dump_options, + const BlockBasedTableOptions& /*toptions*/, + const std::shared_ptr& secondary_cache, + std::unique_ptr&& reader) + : options_(dump_options), + secondary_cache_(secondary_cache), + reader_(std::move(reader)) {} + ~CacheDumpedLoaderImpl() {} + IOStatus RestoreCacheEntriesToSecondaryCache() override; + + private: + IOStatus ReadDumpUnitMeta(std::string* data, DumpUnitMeta* unit_meta); + IOStatus ReadDumpUnit(size_t len, std::string* data, DumpUnit* unit); + IOStatus ReadHeader(std::string* data, DumpUnit* dump_unit); + IOStatus ReadCacheBlock(std::string* data, DumpUnit* dump_unit); + + CacheDumpOptions options_; + std::shared_ptr secondary_cache_; + std::unique_ptr reader_; +}; + +// The default implementation of CacheDumpWriter. We write the blocks to a file +// sequentially. +class ToFileCacheDumpWriter : public CacheDumpWriter { + public: + explicit ToFileCacheDumpWriter( + std::unique_ptr&& file_writer) + : file_writer_(std::move(file_writer)) {} + + ~ToFileCacheDumpWriter() { Close().PermitUncheckedError(); } + + // Write the serialized metadata to the file + virtual IOStatus WriteMetadata(const Slice& metadata) override { + assert(file_writer_ != nullptr); + std::string prefix; + PutFixed32(&prefix, static_cast(metadata.size())); + IOStatus io_s = file_writer_->Append(Slice(prefix)); + if (!io_s.ok()) { + return io_s; + } + io_s = file_writer_->Append(metadata); + return io_s; + } + + // Write the serialized data to the file + virtual IOStatus WritePacket(const Slice& data) override { + assert(file_writer_ != nullptr); + std::string prefix; + PutFixed32(&prefix, static_cast(data.size())); + IOStatus io_s = file_writer_->Append(Slice(prefix)); + if (!io_s.ok()) { + return io_s; + } + io_s = file_writer_->Append(data); + return io_s; + } + + // Reset the writer + virtual IOStatus Close() override { + file_writer_.reset(); + return IOStatus::OK(); + } + + private: + std::unique_ptr file_writer_; +}; + +// The default implementation of CacheDumpReader. It is implemented based on +// RandomAccessFileReader. 
Note that, we keep an internal variable to remember +// the current offset. +class FromFileCacheDumpReader : public CacheDumpReader { + public: + explicit FromFileCacheDumpReader( + std::unique_ptr&& reader) + : file_reader_(std::move(reader)), + offset_(0), + buffer_(new char[kDumpReaderBufferSize]) {} + + ~FromFileCacheDumpReader() { delete[] buffer_; } + + virtual IOStatus ReadMetadata(std::string* metadata) override { + uint32_t metadata_len = 0; + IOStatus io_s = ReadSizePrefix(&metadata_len); + if (!io_s.ok()) { + return io_s; + } + return Read(metadata_len, metadata); + } + + virtual IOStatus ReadPacket(std::string* data) override { + uint32_t data_len = 0; + IOStatus io_s = ReadSizePrefix(&data_len); + if (!io_s.ok()) { + return io_s; + } + return Read(data_len, data); + } + + private: + IOStatus ReadSizePrefix(uint32_t* len) { + std::string prefix; + IOStatus io_s = Read(kSizePrefixLen, &prefix); + if (!io_s.ok()) { + return io_s; + } + Slice encoded_slice(prefix); + if (!GetFixed32(&encoded_slice, len)) { + return IOStatus::Corruption("Decode size prefix string failed"); + } + return IOStatus::OK(); + } + + IOStatus Read(size_t len, std::string* data) { + assert(file_reader_ != nullptr); + IOStatus io_s; + + unsigned int bytes_to_read = static_cast(len); + unsigned int to_read = bytes_to_read > kDumpReaderBufferSize + ? kDumpReaderBufferSize + : bytes_to_read; + + while (to_read > 0) { + io_s = file_reader_->Read(IOOptions(), offset_, to_read, &result_, + buffer_, nullptr, + Env::IO_TOTAL /* rate_limiter_priority */); + if (!io_s.ok()) { + return io_s; + } + if (result_.size() < to_read) { + return IOStatus::Corruption("Corrupted cache dump file."); + } + data->append(result_.data(), result_.size()); + + offset_ += to_read; + bytes_to_read -= to_read; + to_read = bytes_to_read > kDumpReaderBufferSize ? kDumpReaderBufferSize + : bytes_to_read; + } + return io_s; + } + std::unique_ptr file_reader_; + Slice result_; + size_t offset_; + char* buffer_; +}; + +// The cache dump and load helper class +class CacheDumperHelper { + public: + // serialize the dump_unit_meta to a string, it is fixed 16 bytes size. + static void EncodeDumpUnitMeta(const DumpUnitMeta& meta, std::string* data) { + assert(data); + PutFixed32(data, static_cast(meta.sequence_num)); + PutFixed32(data, static_cast(meta.dump_unit_checksum)); + PutFixed64(data, meta.dump_unit_size); + } + + // Serialize the dump_unit to a string. 
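+  // The layout written here mirrors what DecodeDumpUnit below expects:
+  //   [timestamp:8][type:1][key (length-prefixed)]
+  //   [value_len:4][value_checksum:4][value (length-prefixed)]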
+ static void EncodeDumpUnit(const DumpUnit& dump_unit, std::string* data) { + assert(data); + PutFixed64(data, dump_unit.timestamp); + data->push_back(dump_unit.type); + PutLengthPrefixedSlice(data, dump_unit.key); + PutFixed32(data, static_cast(dump_unit.value_len)); + PutFixed32(data, dump_unit.value_checksum); + PutLengthPrefixedSlice(data, + Slice((char*)dump_unit.value, dump_unit.value_len)); + } + + // Deserialize the dump_unit_meta from a string + static Status DecodeDumpUnitMeta(const std::string& encoded_data, + DumpUnitMeta* unit_meta) { + assert(unit_meta != nullptr); + Slice encoded_slice = Slice(encoded_data); + if (!GetFixed32(&encoded_slice, &(unit_meta->sequence_num))) { + return Status::Incomplete("Decode dumped unit meta sequence_num failed"); + } + if (!GetFixed32(&encoded_slice, &(unit_meta->dump_unit_checksum))) { + return Status::Incomplete( + "Decode dumped unit meta dump_unit_checksum failed"); + } + if (!GetFixed64(&encoded_slice, &(unit_meta->dump_unit_size))) { + return Status::Incomplete( + "Decode dumped unit meta dump_unit_size failed"); + } + return Status::OK(); + } + + // Deserialize the dump_unit from a string. + static Status DecodeDumpUnit(const std::string& encoded_data, + DumpUnit* dump_unit) { + assert(dump_unit != nullptr); + Slice encoded_slice = Slice(encoded_data); + + // Decode timestamp + if (!GetFixed64(&encoded_slice, &dump_unit->timestamp)) { + return Status::Incomplete("Decode dumped unit string failed"); + } + // Decode the block type + dump_unit->type = static_cast(encoded_slice[0]); + encoded_slice.remove_prefix(1); + // Decode the key + if (!GetLengthPrefixedSlice(&encoded_slice, &(dump_unit->key))) { + return Status::Incomplete("Decode dumped unit string failed"); + } + // Decode the value size + uint32_t value_len; + if (!GetFixed32(&encoded_slice, &value_len)) { + return Status::Incomplete("Decode dumped unit string failed"); + } + dump_unit->value_len = static_cast(value_len); + // Decode the value checksum + if (!GetFixed32(&encoded_slice, &(dump_unit->value_checksum))) { + return Status::Incomplete("Decode dumped unit string failed"); + } + // Decode the block content and copy to the memory space whose pointer + // will be managed by the cache finally. + Slice block; + if (!GetLengthPrefixedSlice(&encoded_slice, &block)) { + return Status::Incomplete("Decode dumped unit string failed"); + } + dump_unit->value = (void*)block.data(); + assert(block.size() == dump_unit->value_len); + return Status::OK(); + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/cassandra_compaction_filter.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/cassandra_compaction_filter.cc new file mode 100644 index 0000000000..b7da2ba0cb --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/cassandra_compaction_filter.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
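+
+// Illustrative wiring of these classes into column family options (the local
+// variable names and the grace period value below are hypothetical):
+//
+//   ColumnFamilyOptions cf_opts;
+//   cf_opts.merge_operator =
+//       std::make_shared<cassandra::CassandraValueMergeOperator>(
+//           /*gc_grace_period_in_seconds=*/86400);
+//   cf_opts.compaction_filter_factory =
+//       std::make_shared<cassandra::CassandraCompactionFilterFactory>(
+//           /*purge_ttl_on_expiration=*/false,
+//           /*gc_grace_period_in_seconds=*/86400);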
+ +#include "utilities/cassandra/cassandra_compaction_filter.h" + +#include + +#include "rocksdb/slice.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "utilities/cassandra/format.h" +#include "utilities/cassandra/merge_operator.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { +static std::unordered_map + cassandra_filter_type_info = { + {"purge_ttl_on_expiration", + {offsetof(struct CassandraOptions, purge_ttl_on_expiration), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"gc_grace_period_in_seconds", + {offsetof(struct CassandraOptions, gc_grace_period_in_seconds), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +CassandraCompactionFilter::CassandraCompactionFilter( + bool purge_ttl_on_expiration, int32_t gc_grace_period_in_seconds) + : options_(gc_grace_period_in_seconds, 0, purge_ttl_on_expiration) { + RegisterOptions(&options_, &cassandra_filter_type_info); +} + +CompactionFilter::Decision CassandraCompactionFilter::FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const { + bool value_changed = false; + RowValue row_value = + RowValue::Deserialize(existing_value.data(), existing_value.size()); + RowValue compacted = + options_.purge_ttl_on_expiration + ? row_value.RemoveExpiredColumns(&value_changed) + : row_value.ConvertExpiredColumnsToTombstones(&value_changed); + + if (value_type == ValueType::kValue) { + compacted = compacted.RemoveTombstones(options_.gc_grace_period_in_seconds); + } + + if (compacted.Empty()) { + return Decision::kRemove; + } + + if (value_changed) { + compacted.Serialize(new_value); + return Decision::kChangeValue; + } + + return Decision::kKeep; +} + +CassandraCompactionFilterFactory::CassandraCompactionFilterFactory( + bool purge_ttl_on_expiration, int32_t gc_grace_period_in_seconds) + : options_(gc_grace_period_in_seconds, 0, purge_ttl_on_expiration) { + RegisterOptions(&options_, &cassandra_filter_type_info); +} + +std::unique_ptr +CassandraCompactionFilterFactory::CreateCompactionFilter( + const CompactionFilter::Context&) { + std::unique_ptr result(new CassandraCompactionFilter( + options_.purge_ttl_on_expiration, options_.gc_grace_period_in_seconds)); + return result; +} + +int RegisterCassandraObjects(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + CassandraValueMergeOperator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new CassandraValueMergeOperator(0)); + return guard->get(); + }); + library.AddFactory( + CassandraCompactionFilter::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* /*guard */, + std::string* /* errmsg */) { + return new CassandraCompactionFilter(false, 0); + }); + library.AddFactory( + CassandraCompactionFilterFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new CassandraCompactionFilterFactory(false, 0)); + return guard->get(); + }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/cassandra_compaction_filter.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/cassandra_compaction_filter.h new file 
mode 100644
index 0000000000..0325a4c395
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/cassandra_compaction_filter.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/slice.h"
+#include "utilities/cassandra/cassandra_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+/**
+ * Compaction filter for removing expired Cassandra data with TTL.
+ * If option `purge_ttl_on_expiration` is set to true, expired data
+ * will be directly purged. Otherwise expired data will be converted to
+ * tombstones first, then eventually removed after the gc grace period.
+ * `purge_ttl_on_expiration` should only be on when all the writes have
+ * the same TTL setting, otherwise it could bring old data back.
+ *
+ * The compaction filter is also in charge of removing tombstones that have
+ * been promoted to kValue type after a series of merges in compaction.
+ */
+class CassandraCompactionFilter : public CompactionFilter {
+ public:
+  explicit CassandraCompactionFilter(bool purge_ttl_on_expiration,
+                                     int32_t gc_grace_period_in_seconds);
+  static const char* kClassName() { return "CassandraCompactionFilter"; }
+  const char* Name() const override { return kClassName(); }
+
+  virtual Decision FilterV2(int level, const Slice& key, ValueType value_type,
+                            const Slice& existing_value, std::string* new_value,
+                            std::string* skip_until) const override;
+
+ private:
+  CassandraOptions options_;
+};
+
+class CassandraCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit CassandraCompactionFilterFactory(bool purge_ttl_on_expiration,
+                                            int32_t gc_grace_period_in_seconds);
+  ~CassandraCompactionFilterFactory() override {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override;
+  static const char* kClassName() { return "CassandraCompactionFilterFactory"; }
+  const char* Name() const override { return kClassName(); }
+
+ private:
+  CassandraOptions options_;
+};
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/cassandra_options.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/cassandra_options.h
new file mode 100644
index 0000000000..e0527aada6
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/cassandra_options.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class ObjectLibrary;
+namespace cassandra {
+struct CassandraOptions {
+  static const char* kName() { return "CassandraOptions"; }
+  CassandraOptions(int32_t _gc_grace_period_in_seconds, size_t _operands_limit,
+                   bool _purge_ttl_on_expiration = false)
+      : operands_limit(_operands_limit),
+        gc_grace_period_in_seconds(_gc_grace_period_in_seconds),
+        purge_ttl_on_expiration(_purge_ttl_on_expiration) {}
+  // Limit on the number of merge operands.
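+  // A value of 0 disables eager merging: CassandraValueMergeOperator::
+  // ShouldMerge() only fires when operands_limit > 0 and the number of
+  // accumulated operands has reached that limit.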
+ size_t operands_limit; + + // How long (in seconds) tombstoned data remains before it is purged + int32_t gc_grace_period_in_seconds; + + // If is set to true, expired data will be directly purged. + // Otherwise expired data will be converted tombstones first, + // then be eventually removed after gc grace period. This value should + // only true if all writes have same ttl setting, otherwise it could bring old + // data back. + bool purge_ttl_on_expiration; +}; +extern "C" { +int RegisterCassandraObjects(ObjectLibrary& library, const std::string& arg); +} // extern "C" +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/format.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/format.cc new file mode 100644 index 0000000000..cc1dd2f280 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/format.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "format.h" + +#include +#include +#include + +#include "utilities/cassandra/serialize.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { +namespace { +const int32_t kDefaultLocalDeletionTime = std::numeric_limits::max(); +const int64_t kDefaultMarkedForDeleteAt = std::numeric_limits::min(); +} // namespace + +ColumnBase::ColumnBase(int8_t mask, int8_t index) + : mask_(mask), index_(index) {} + +std::size_t ColumnBase::Size() const { return sizeof(mask_) + sizeof(index_); } + +int8_t ColumnBase::Mask() const { return mask_; } + +int8_t ColumnBase::Index() const { return index_; } + +void ColumnBase::Serialize(std::string* dest) const { + ROCKSDB_NAMESPACE::cassandra::Serialize(mask_, dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(index_, dest); +} + +std::shared_ptr ColumnBase::Deserialize(const char* src, + std::size_t offset) { + int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + if ((mask & ColumnTypeMask::DELETION_MASK) != 0) { + return Tombstone::Deserialize(src, offset); + } else if ((mask & ColumnTypeMask::EXPIRATION_MASK) != 0) { + return ExpiringColumn::Deserialize(src, offset); + } else { + return Column::Deserialize(src, offset); + } +} + +Column::Column(int8_t mask, int8_t index, int64_t timestamp, int32_t value_size, + const char* value) + : ColumnBase(mask, index), + timestamp_(timestamp), + value_size_(value_size), + value_(value) {} + +int64_t Column::Timestamp() const { return timestamp_; } + +std::size_t Column::Size() const { + return ColumnBase::Size() + sizeof(timestamp_) + sizeof(value_size_) + + value_size_; +} + +void Column::Serialize(std::string* dest) const { + ColumnBase::Serialize(dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(timestamp_, dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(value_size_, dest); + dest->append(value_, value_size_); +} + +std::shared_ptr Column::Deserialize(const char* src, + std::size_t offset) { + int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(mask); + int8_t index = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(index); + int64_t timestamp = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(timestamp); + int32_t value_size = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + 
offset += sizeof(value_size); + return std::make_shared(mask, index, timestamp, value_size, + src + offset); +} + +ExpiringColumn::ExpiringColumn(int8_t mask, int8_t index, int64_t timestamp, + int32_t value_size, const char* value, + int32_t ttl) + : Column(mask, index, timestamp, value_size, value), ttl_(ttl) {} + +std::size_t ExpiringColumn::Size() const { + return Column::Size() + sizeof(ttl_); +} + +void ExpiringColumn::Serialize(std::string* dest) const { + Column::Serialize(dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(ttl_, dest); +} + +std::chrono::time_point ExpiringColumn::TimePoint() + const { + return std::chrono::time_point( + std::chrono::microseconds(Timestamp())); +} + +std::chrono::seconds ExpiringColumn::Ttl() const { + return std::chrono::seconds(ttl_); +} + +bool ExpiringColumn::Expired() const { + return TimePoint() + Ttl() < std::chrono::system_clock::now(); +} + +std::shared_ptr ExpiringColumn::ToTombstone() const { + auto expired_at = (TimePoint() + Ttl()).time_since_epoch(); + int32_t local_deletion_time = static_cast( + std::chrono::duration_cast(expired_at).count()); + int64_t marked_for_delete_at = + std::chrono::duration_cast(expired_at).count(); + return std::make_shared( + static_cast(ColumnTypeMask::DELETION_MASK), Index(), + local_deletion_time, marked_for_delete_at); +} + +std::shared_ptr ExpiringColumn::Deserialize( + const char* src, std::size_t offset) { + int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(mask); + int8_t index = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(index); + int64_t timestamp = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(timestamp); + int32_t value_size = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(value_size); + const char* value = src + offset; + offset += value_size; + int32_t ttl = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + return std::make_shared(mask, index, timestamp, value_size, + value, ttl); +} + +Tombstone::Tombstone(int8_t mask, int8_t index, int32_t local_deletion_time, + int64_t marked_for_delete_at) + : ColumnBase(mask, index), + local_deletion_time_(local_deletion_time), + marked_for_delete_at_(marked_for_delete_at) {} + +int64_t Tombstone::Timestamp() const { return marked_for_delete_at_; } + +std::size_t Tombstone::Size() const { + return ColumnBase::Size() + sizeof(local_deletion_time_) + + sizeof(marked_for_delete_at_); +} + +void Tombstone::Serialize(std::string* dest) const { + ColumnBase::Serialize(dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(local_deletion_time_, dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(marked_for_delete_at_, dest); +} + +bool Tombstone::Collectable(int32_t gc_grace_period_in_seconds) const { + auto local_deleted_at = std::chrono::time_point( + std::chrono::seconds(local_deletion_time_)); + auto gc_grace_period = std::chrono::seconds(gc_grace_period_in_seconds); + return local_deleted_at + gc_grace_period < std::chrono::system_clock::now(); +} + +std::shared_ptr Tombstone::Deserialize(const char* src, + std::size_t offset) { + int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(mask); + int8_t index = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(index); + int32_t local_deletion_time = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(int32_t); + int64_t marked_for_delete_at = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); 
+ return std::make_shared(mask, index, local_deletion_time, + marked_for_delete_at); +} + +RowValue::RowValue(int32_t local_deletion_time, int64_t marked_for_delete_at) + : local_deletion_time_(local_deletion_time), + marked_for_delete_at_(marked_for_delete_at), + columns_(), + last_modified_time_(0) {} + +RowValue::RowValue(Columns columns, int64_t last_modified_time) + : local_deletion_time_(kDefaultLocalDeletionTime), + marked_for_delete_at_(kDefaultMarkedForDeleteAt), + columns_(std::move(columns)), + last_modified_time_(last_modified_time) {} + +std::size_t RowValue::Size() const { + std::size_t size = + sizeof(local_deletion_time_) + sizeof(marked_for_delete_at_); + for (const auto& column : columns_) { + size += column->Size(); + } + return size; +} + +int64_t RowValue::LastModifiedTime() const { + if (IsTombstone()) { + return marked_for_delete_at_; + } else { + return last_modified_time_; + } +} + +bool RowValue::IsTombstone() const { + return marked_for_delete_at_ > kDefaultMarkedForDeleteAt; +} + +void RowValue::Serialize(std::string* dest) const { + ROCKSDB_NAMESPACE::cassandra::Serialize(local_deletion_time_, dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(marked_for_delete_at_, dest); + for (const auto& column : columns_) { + column->Serialize(dest); + } +} + +RowValue RowValue::RemoveExpiredColumns(bool* changed) const { + *changed = false; + Columns new_columns; + for (auto& column : columns_) { + if (column->Mask() == ColumnTypeMask::EXPIRATION_MASK) { + std::shared_ptr expiring_column = + std::static_pointer_cast(column); + + if (expiring_column->Expired()) { + *changed = true; + continue; + } + } + + new_columns.push_back(column); + } + return RowValue(std::move(new_columns), last_modified_time_); +} + +RowValue RowValue::ConvertExpiredColumnsToTombstones(bool* changed) const { + *changed = false; + Columns new_columns; + for (auto& column : columns_) { + if (column->Mask() == ColumnTypeMask::EXPIRATION_MASK) { + std::shared_ptr expiring_column = + std::static_pointer_cast(column); + + if (expiring_column->Expired()) { + std::shared_ptr tombstone = expiring_column->ToTombstone(); + new_columns.push_back(tombstone); + *changed = true; + continue; + } + } + new_columns.push_back(column); + } + return RowValue(std::move(new_columns), last_modified_time_); +} + +RowValue RowValue::RemoveTombstones(int32_t gc_grace_period) const { + Columns new_columns; + for (auto& column : columns_) { + if (column->Mask() == ColumnTypeMask::DELETION_MASK) { + std::shared_ptr tombstone = + std::static_pointer_cast(column); + + if (tombstone->Collectable(gc_grace_period)) { + continue; + } + } + + new_columns.push_back(column); + } + return RowValue(std::move(new_columns), last_modified_time_); +} + +bool RowValue::Empty() const { return columns_.empty(); } + +RowValue RowValue::Deserialize(const char* src, std::size_t size) { + std::size_t offset = 0; + assert(size >= sizeof(local_deletion_time_) + sizeof(marked_for_delete_at_)); + int32_t local_deletion_time = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(int32_t); + int64_t marked_for_delete_at = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(int64_t); + if (offset == size) { + return RowValue(local_deletion_time, marked_for_delete_at); + } + + assert(local_deletion_time == kDefaultLocalDeletionTime); + assert(marked_for_delete_at == kDefaultMarkedForDeleteAt); + Columns columns; + int64_t last_modified_time = 0; + while (offset < size) { + auto c = ColumnBase::Deserialize(src, 
offset);
+    offset += c->Size();
+    assert(offset <= size);
+    last_modified_time = std::max(last_modified_time, c->Timestamp());
+    columns.push_back(std::move(c));
+  }
+
+  return RowValue(std::move(columns), last_modified_time);
+}
+
+// Merge multiple row values into one.
+// For each column in rows with the same index, we pick the one with the
+// latest timestamp. We also take the row tombstone into consideration, by
+// iterating the rows in reverse timestamp order, stopping once we hit the
+// first row tombstone.
+RowValue RowValue::Merge(std::vector<RowValue>&& values) {
+  assert(values.size() > 0);
+  if (values.size() == 1) {
+    return std::move(values[0]);
+  }
+
+  // Merge columns by their last modified time, and skip once we hit
+  // a row tombstone.
+  std::sort(values.begin(), values.end(),
+            [](const RowValue& r1, const RowValue& r2) {
+              return r1.LastModifiedTime() > r2.LastModifiedTime();
+            });
+
+  std::map<int8_t, std::shared_ptr<ColumnBase>> merged_columns;
+  int64_t tombstone_timestamp = 0;
+
+  for (auto& value : values) {
+    if (value.IsTombstone()) {
+      if (merged_columns.size() == 0) {
+        return std::move(value);
+      }
+      tombstone_timestamp = value.LastModifiedTime();
+      break;
+    }
+    for (auto& column : value.columns_) {
+      int8_t index = column->Index();
+      if (merged_columns.find(index) == merged_columns.end()) {
+        merged_columns[index] = column;
+      } else {
+        if (column->Timestamp() > merged_columns[index]->Timestamp()) {
+          merged_columns[index] = column;
+        }
+      }
+    }
+  }
+
+  int64_t last_modified_time = 0;
+  Columns columns;
+  for (auto& pair : merged_columns) {
+    // For some row, its last_modified_time > row tombstone_timestamp, but
+    // it might have columns whose timestamp is earlier than the tombstone,
+    // so we need to filter these out.
+    if (pair.second->Timestamp() <= tombstone_timestamp) {
+      continue;
+    }
+    last_modified_time = std::max(last_modified_time, pair.second->Timestamp());
+    columns.push_back(std::move(pair.second));
+  }
+  return RowValue(std::move(columns), last_modified_time);
+}
+
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/format.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/format.h
new file mode 100644
index 0000000000..1b27147351
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/format.h
@@ -0,0 +1,183 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+/**
+ * The encoding of a Cassandra Row Value.
+ *
+ * A Cassandra Row Value is either a row tombstone or contains multiple
+ * columns; it has the following fields:
+ *
+ * struct row_value {
+ *   int32_t local_deletion_time;  // Time in seconds when the row is deleted,
+ *                                 // only used for Cassandra tombstone gc.
+ *   int64_t marked_for_delete_at; // Milliseconds when this row was marked
+ *                                 // deleted.
+ *   struct column_base columns[]; // For a non tombstone row, all columns
+ *                                 // are stored here.
+ * }
+ *
+ * If local_deletion_time and marked_for_delete_at are set, then this is
+ * a tombstone, otherwise it contains multiple columns.
+ *
+ * There are three types of Columns: Normal Column, Expiring Column and Column
+ * Tombstone, which have the following fields:
+ *
+ * // Identify the type of the column.
+ * enum mask { + * DELETION_MASK = 0x01, + * EXPIRATION_MASK = 0x02, + * }; + * + * struct column { + * int8_t mask = 0; + * int8_t index; + * int64_t timestamp; + * int32_t value_length; + * char value[value_length]; + * } + * + * struct expiring_column { + * int8_t mask = mask.EXPIRATION_MASK; + * int8_t index; + * int64_t timestamp; + * int32_t value_length; + * char value[value_length]; + * int32_t ttl; + * } + * + * struct tombstone_column { + * int8_t mask = mask.DELETION_MASK; + * int8_t index; + * int32_t local_deletion_time; // Similar to row_value's field. + * int64_t marked_for_delete_at; + * } + */ + +#pragma once +#include +#include +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { + +// Identify the type of the column. +enum ColumnTypeMask { + DELETION_MASK = 0x01, + EXPIRATION_MASK = 0x02, +}; + +class ColumnBase { + public: + ColumnBase(int8_t mask, int8_t index); + virtual ~ColumnBase() = default; + + virtual int64_t Timestamp() const = 0; + virtual int8_t Mask() const; + virtual int8_t Index() const; + virtual std::size_t Size() const; + virtual void Serialize(std::string* dest) const; + static std::shared_ptr Deserialize(const char* src, + std::size_t offset); + + private: + int8_t mask_; + int8_t index_; +}; + +class Column : public ColumnBase { + public: + Column(int8_t mask, int8_t index, int64_t timestamp, int32_t value_size, + const char* value); + + virtual int64_t Timestamp() const override; + virtual std::size_t Size() const override; + virtual void Serialize(std::string* dest) const override; + static std::shared_ptr Deserialize(const char* src, + std::size_t offset); + + private: + int64_t timestamp_; + int32_t value_size_; + const char* value_; +}; + +class Tombstone : public ColumnBase { + public: + Tombstone(int8_t mask, int8_t index, int32_t local_deletion_time, + int64_t marked_for_delete_at); + + virtual int64_t Timestamp() const override; + virtual std::size_t Size() const override; + virtual void Serialize(std::string* dest) const override; + bool Collectable(int32_t gc_grace_period) const; + static std::shared_ptr Deserialize(const char* src, + std::size_t offset); + + private: + int32_t local_deletion_time_; + int64_t marked_for_delete_at_; +}; + +class ExpiringColumn : public Column { + public: + ExpiringColumn(int8_t mask, int8_t index, int64_t timestamp, + int32_t value_size, const char* value, int32_t ttl); + + virtual std::size_t Size() const override; + virtual void Serialize(std::string* dest) const override; + bool Expired() const; + std::shared_ptr ToTombstone() const; + + static std::shared_ptr Deserialize(const char* src, + std::size_t offset); + + private: + int32_t ttl_; + std::chrono::time_point TimePoint() const; + std::chrono::seconds Ttl() const; +}; + +using Columns = std::vector>; + +class RowValue { + public: + // Create a Row Tombstone. + RowValue(int32_t local_deletion_time, int64_t marked_for_delete_at); + // Create a Row containing columns. + RowValue(Columns columns, int64_t last_modified_time); + RowValue(const RowValue& /*that*/) = delete; + RowValue(RowValue&& /*that*/) noexcept = default; + RowValue& operator=(const RowValue& /*that*/) = delete; + RowValue& operator=(RowValue&& /*that*/) = default; + + std::size_t Size() const; + bool IsTombstone() const; + // For Tombstone this returns the marked_for_delete_at_, + // otherwise it returns the max timestamp of containing columns. 
+ int64_t LastModifiedTime() const; + void Serialize(std::string* dest) const; + RowValue RemoveExpiredColumns(bool* changed) const; + RowValue ConvertExpiredColumnsToTombstones(bool* changed) const; + RowValue RemoveTombstones(int32_t gc_grace_period) const; + bool Empty() const; + + static RowValue Deserialize(const char* src, std::size_t size); + // Merge multiple rows according to their timestamp. + static RowValue Merge(std::vector&& values); + + const Columns& get_columns() { return columns_; } + + private: + int32_t local_deletion_time_; + int64_t marked_for_delete_at_; + Columns columns_; + int64_t last_modified_time_; +}; + +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/merge_operator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/merge_operator.cc new file mode 100644 index 0000000000..366d8fa443 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/merge_operator.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "merge_operator.h" + +#include + +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/options_type.h" +#include "utilities/cassandra/format.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { +static std::unordered_map + merge_operator_options_info = { + {"gc_grace_period_in_seconds", + {offsetof(struct CassandraOptions, gc_grace_period_in_seconds), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"operands_limit", + {offsetof(struct CassandraOptions, operands_limit), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +}; + +CassandraValueMergeOperator::CassandraValueMergeOperator( + int32_t gc_grace_period_in_seconds, size_t operands_limit) + : options_(gc_grace_period_in_seconds, operands_limit) { + RegisterOptions(&options_, &merge_operator_options_info); +} + +// Implementation for the merge operation (merges two Cassandra values) +bool CassandraValueMergeOperator::FullMergeV2( + const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + // Clear the *new_value for writing. + merge_out->new_value.clear(); + std::vector row_values; + if (merge_in.existing_value) { + row_values.push_back(RowValue::Deserialize( + merge_in.existing_value->data(), merge_in.existing_value->size())); + } + + for (auto& operand : merge_in.operand_list) { + row_values.push_back(RowValue::Deserialize(operand.data(), operand.size())); + } + + RowValue merged = RowValue::Merge(std::move(row_values)); + merged = merged.RemoveTombstones(options_.gc_grace_period_in_seconds); + merge_out->new_value.reserve(merged.Size()); + merged.Serialize(&(merge_out->new_value)); + + return true; +} + +bool CassandraValueMergeOperator::PartialMergeMulti( + const Slice& /*key*/, const std::deque& operand_list, + std::string* new_value, Logger* /*logger*/) const { + // Clear the *new_value for writing. 
+  assert(new_value);
+  new_value->clear();
+
+  std::vector<RowValue> row_values;
+  for (auto& operand : operand_list) {
+    row_values.push_back(RowValue::Deserialize(operand.data(), operand.size()));
+  }
+  RowValue merged = RowValue::Merge(std::move(row_values));
+  new_value->reserve(merged.Size());
+  merged.Serialize(new_value);
+  return true;
+}
+
+}  // namespace cassandra
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/merge_operator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/merge_operator.h
new file mode 100644
index 0000000000..af8725db7d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/merge_operator.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "utilities/cassandra/cassandra_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+/**
+ * A MergeOperator for rocksdb that implements Cassandra row value merge.
+ */
+class CassandraValueMergeOperator : public MergeOperator {
+ public:
+  explicit CassandraValueMergeOperator(int32_t gc_grace_period_in_seconds,
+                                       size_t operands_limit = 0);
+
+  virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+                           MergeOperationOutput* merge_out) const override;
+
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value,
+                                 Logger* logger) const override;
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "CassandraValueMergeOperator"; }
+
+  virtual bool AllowSingleOperand() const override { return true; }
+
+  virtual bool ShouldMerge(const std::vector<Slice>& operands) const override {
+    return options_.operands_limit > 0 &&
+           operands.size() >= options_.operands_limit;
+  }
+
+ private:
+  CassandraOptions options_;
+};
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/serialize.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/serialize.h
new file mode 100644
index 0000000000..4bd552bfc1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/serialize.h
@@ -0,0 +1,81 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+/**
+ * Helper functions which serialize and deserialize integers
+ * into bytes in big endian.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+namespace {
+const int64_t kCharMask = 0xFFLL;
+const int32_t kBitsPerByte = 8;
+}  // namespace
+
+template <typename T>
+void Serialize(T val, std::string* dest);
+
+template <typename T>
+T Deserialize(const char* src, std::size_t offset = 0);
+
+// Specializations
+template <>
+inline void Serialize<int8_t>(int8_t t, std::string* dest) {
+  dest->append(1, static_cast<char>(t & kCharMask));
+}
+
+template <>
+inline void Serialize<int32_t>(int32_t t, std::string* dest) {
+  for (unsigned long i = 0; i < sizeof(int32_t); i++) {
+    dest->append(
+        1, static_cast<char>((t >> (sizeof(int32_t) - 1 - i) * kBitsPerByte) &
+                             kCharMask));
+  }
+}
+
+template <>
+inline void Serialize<int64_t>(int64_t t, std::string* dest) {
+  for (unsigned long i = 0; i < sizeof(int64_t); i++) {
+    dest->append(
+        1, static_cast<char>((t >> (sizeof(int64_t) - 1 - i) * kBitsPerByte) &
+                             kCharMask));
+  }
+}
+
+template <>
+inline int8_t Deserialize<int8_t>(const char* src, std::size_t offset) {
+  return static_cast<int8_t>(src[offset]);
+}
+
+template <>
+inline int32_t Deserialize<int32_t>(const char* src, std::size_t offset) {
+  int32_t result = 0;
+  for (unsigned long i = 0; i < sizeof(int32_t); i++) {
+    result |= static_cast<int32_t>(static_cast<uint8_t>(src[offset + i]))
+              << ((sizeof(int32_t) - 1 - i) * kBitsPerByte);
+  }
+  return result;
+}
+
+template <>
+inline int64_t Deserialize<int64_t>(const char* src, std::size_t offset) {
+  int64_t result = 0;
+  for (unsigned long i = 0; i < sizeof(int64_t); i++) {
+    result |= static_cast<int64_t>(static_cast<uint8_t>(src[offset + i]))
+              << ((sizeof(int64_t) - 1 - i) * kBitsPerByte);
+  }
+  return result;
+}
+
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/test_utils.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/test_utils.h
new file mode 100644
index 0000000000..be23f70760
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/cassandra/test_utils.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
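+
+// Declarations shared by the Cassandra merge-operator unit tests: factories
+// for test columns, rows, and tombstones, plus verification helpers.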
+
+#pragma once
+#include <memory>
+
+#include "test_util/testharness.h"
+#include "utilities/cassandra/format.h"
+#include "utilities/cassandra/serialize.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+extern const char kData[];
+extern const char kExpiringData[];
+extern const int32_t kTtl;
+extern const int8_t kColumn;
+extern const int8_t kTombstone;
+extern const int8_t kExpiringColumn;
+
+std::shared_ptr<ColumnBase> CreateTestColumn(int8_t mask, int8_t index,
+                                             int64_t timestamp);
+
+std::tuple<int8_t, int8_t, int64_t> CreateTestColumnSpec(int8_t mask,
+                                                         int8_t index,
+                                                         int64_t timestamp);
+
+RowValue CreateTestRowValue(
+    std::vector<std::tuple<int8_t, int8_t, int64_t>> column_specs);
+
+RowValue CreateRowTombstone(int64_t timestamp);
+
+void VerifyRowValueColumns(
+    const std::vector<std::shared_ptr<ColumnBase>>& columns,
+    std::size_t index_of_vector, int8_t expected_mask, int8_t expected_index,
+    int64_t expected_timestamp);
+
+int64_t ToMicroSeconds(int64_t seconds);
+int32_t ToSeconds(int64_t microseconds);
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/checkpoint/checkpoint_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/checkpoint/checkpoint_impl.cc
new file mode 100644
index 0000000000..4a0cc71596
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/checkpoint/checkpoint_impl.cc
@@ -0,0 +1,470 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+
+#include "utilities/checkpoint/checkpoint_impl.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+#include <unordered_set>
+#include <vector>
+
+#include "db/wal_manager.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/options.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/file_checksum_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status Checkpoint::Create(DB* db, Checkpoint** checkpoint_ptr) {
+  *checkpoint_ptr = new CheckpointImpl(db);
+  return Status::OK();
+}
+
+Status Checkpoint::CreateCheckpoint(const std::string& /*checkpoint_dir*/,
+                                    uint64_t /*log_size_for_flush*/,
+                                    uint64_t* /*sequence_number_ptr*/) {
+  return Status::NotSupported("");
+}
+
+void CheckpointImpl::CleanStagingDirectory(const std::string& full_private_path,
+                                           Logger* info_log) {
+  std::vector<std::string> subchildren;
+  Status s = db_->GetEnv()->FileExists(full_private_path);
+  if (s.IsNotFound()) {
+    return;
+  }
+  ROCKS_LOG_INFO(info_log, "File exists %s -- %s", full_private_path.c_str(),
+                 s.ToString().c_str());
+  s = db_->GetEnv()->GetChildren(full_private_path, &subchildren);
+  if (s.ok()) {
+    for (auto& subchild : subchildren) {
+      std::string subchild_path = full_private_path + "/" + subchild;
+      s = db_->GetEnv()->DeleteFile(subchild_path);
+      ROCKS_LOG_INFO(info_log, "Delete file %s -- %s", subchild_path.c_str(),
+                     s.ToString().c_str());
+    }
+  }
+  // finally delete the private dir
+  s = db_->GetEnv()->DeleteDir(full_private_path);
+  ROCKS_LOG_INFO(info_log, "Delete dir %s -- %s", full_private_path.c_str(),
+                 s.ToString().c_str());
+}
+
+Status Checkpoint::ExportColumnFamily(
+    ColumnFamilyHandle* /*handle*/, const std::string& /*export_dir*/,
+    ExportImportFilesMetaData** /*metadata*/) {
+  return Status::NotSupported("");
+}
+
+// Builds an openable snapshot of RocksDB
+Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir,
+                                        uint64_t log_size_for_flush,
+                                        uint64_t* sequence_number_ptr) {
+  DBOptions db_options = db_->GetDBOptions();
+
+  Status s = db_->GetEnv()->FileExists(checkpoint_dir);
+  if (s.ok()) {
+    return Status::InvalidArgument("Directory exists");
+  } else if (!s.IsNotFound()) {
+    assert(s.IsIOError());
+    return s;
+  }
+
+  ROCKS_LOG_INFO(
+      db_options.info_log,
+      "Started the snapshot process -- creating snapshot in directory %s",
+      checkpoint_dir.c_str());
+
+  size_t final_nonslash_idx = checkpoint_dir.find_last_not_of('/');
+  if (final_nonslash_idx == std::string::npos) {
+    // npos means it's only slashes or empty. Non-empty means it's the root
+    // directory, but it shouldn't be because we verified above the directory
+    // doesn't exist.
+    assert(checkpoint_dir.empty());
+    return Status::InvalidArgument("invalid checkpoint directory name");
+  }
+
+  std::string full_private_path =
+      checkpoint_dir.substr(0, final_nonslash_idx + 1) + ".tmp";
+  ROCKS_LOG_INFO(db_options.info_log,
+                 "Snapshot process -- using temporary directory %s",
+                 full_private_path.c_str());
+  CleanStagingDirectory(full_private_path, db_options.info_log.get());
+  // create snapshot directory
+  s = db_->GetEnv()->CreateDir(full_private_path);
+  uint64_t sequence_number = 0;
+  if (s.ok()) {
+    // disable file deletions
+    s = db_->DisableFileDeletions();
+    const bool disabled_file_deletions = s.ok();
+
+    if (s.ok() || s.IsNotSupported()) {
+      s = CreateCustomCheckpoint(
+          [&](const std::string& src_dirname, const std::string& fname,
+              FileType) {
+            ROCKS_LOG_INFO(db_options.info_log, "Hard Linking %s",
+                           fname.c_str());
+            return db_->GetFileSystem()->LinkFile(
+                src_dirname + "/" + fname, full_private_path + "/" + fname,
+                IOOptions(), nullptr);
+          } /* link_file_cb */,
+          [&](const std::string& src_dirname, const std::string& fname,
+              uint64_t size_limit_bytes, FileType,
+              const std::string& /* checksum_func_name */,
+              const std::string& /* checksum_val */,
+              const Temperature temperature) {
+            ROCKS_LOG_INFO(db_options.info_log, "Copying %s", fname.c_str());
+            return CopyFile(db_->GetFileSystem(), src_dirname + "/" + fname,
+                            full_private_path + "/" + fname, size_limit_bytes,
+                            db_options.use_fsync, nullptr, temperature);
+          } /* copy_file_cb */,
+          [&](const std::string& fname, const std::string& contents, FileType) {
+            ROCKS_LOG_INFO(db_options.info_log, "Creating %s", fname.c_str());
+            return CreateFile(db_->GetFileSystem(),
+                              full_private_path + "/" + fname, contents,
+                              db_options.use_fsync);
+          } /* create_file_cb */,
+          &sequence_number, log_size_for_flush);
+
+      // we copied all the files, enable file deletions
+      if (disabled_file_deletions) {
+        Status ss = db_->EnableFileDeletions(false);
+        assert(ss.ok());
+        ss.PermitUncheckedError();
+      }
+    }
+  }
+
+  if (s.ok()) {
+    // move tmp private backup to real snapshot directory
+    s = db_->GetEnv()->RenameFile(full_private_path, checkpoint_dir);
+  }
+  if (s.ok()) {
+    std::unique_ptr<FSDirectory> checkpoint_directory;
+    s = db_->GetFileSystem()->NewDirectory(checkpoint_dir, IOOptions(),
+                                           &checkpoint_directory, nullptr);
+    if (s.ok() && checkpoint_directory != nullptr) {
+      s = checkpoint_directory->FsyncWithDirOptions(
+          IOOptions(), nullptr,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kDirRenamed));
+    }
+  }
+
+  if (s.ok()) {
+    if (sequence_number_ptr != nullptr) {
+      *sequence_number_ptr = sequence_number;
+    }
+    // here we know that we succeeded and installed the new snapshot
+    ROCKS_LOG_INFO(db_options.info_log, "Snapshot DONE. All is good");
+    ROCKS_LOG_INFO(db_options.info_log, "Snapshot sequence number: %" PRIu64,
+                   sequence_number);
+  } else {
+    // clean all the files we might have created
+    ROCKS_LOG_INFO(db_options.info_log, "Snapshot failed -- %s",
+                   s.ToString().c_str());
+    CleanStagingDirectory(full_private_path, db_options.info_log.get());
+  }
+  return s;
+}
+
+Status CheckpointImpl::CreateCustomCheckpoint(
+    std::function<Status(const std::string& src_dirname,
+                         const std::string& src_fname, FileType type)>
+        link_file_cb,
+    std::function<
+        Status(const std::string& src_dirname, const std::string& src_fname,
+               uint64_t size_limit_bytes, FileType type,
+               const std::string& checksum_func_name,
+               const std::string& checksum_val, const Temperature temperature)>
+        copy_file_cb,
+    std::function<Status(const std::string& fname, const std::string& contents,
+                         FileType type)>
+        create_file_cb,
+    uint64_t* sequence_number, uint64_t log_size_for_flush,
+    bool get_live_table_checksum) {
+  *sequence_number = db_->GetLatestSequenceNumber();
+
+  LiveFilesStorageInfoOptions opts;
+  opts.include_checksum_info = get_live_table_checksum;
+  opts.wal_size_for_flush = log_size_for_flush;
+
+  std::vector<LiveFileStorageInfo> infos;
+  {
+    Status s = db_->GetLiveFilesStorageInfo(opts, &infos);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Verify that everything except WAL files are in same directory
+  // (db_paths / cf_paths not supported)
+  std::unordered_set<std::string> dirs;
+  for (auto& info : infos) {
+    if (info.file_type != kWalFile) {
+      dirs.insert(info.directory);
+    }
+  }
+  if (dirs.size() > 1) {
+    return Status::NotSupported(
+        "db_paths / cf_paths not supported for Checkpoint nor BackupEngine");
+  }
+
+  bool same_fs = true;
+
+  for (auto& info : infos) {
+    Status s;
+    if (!info.replacement_contents.empty()) {
+      // Currently should only be used for CURRENT file.
+      assert(info.file_type == kCurrentFile);
+
+      if (info.size != info.replacement_contents.size()) {
+        s = Status::Corruption("Inconsistent size metadata for " +
+                               info.relative_filename);
+      } else {
+        s = create_file_cb(info.relative_filename, info.replacement_contents,
+                           info.file_type);
+      }
+    } else {
+      if (same_fs && !info.trim_to_size) {
+        s = link_file_cb(info.directory, info.relative_filename,
+                         info.file_type);
+        if (s.IsNotSupported()) {
+          same_fs = false;
+          s = Status::OK();
+        }
+        s.MustCheck();
+      }
+      if (!same_fs || info.trim_to_size) {
+        assert(info.file_checksum_func_name.empty() ==
+               !opts.include_checksum_info);
+        // no assertion on file_checksum because empty is used for both "not
+        // set" and "unknown"
+        if (opts.include_checksum_info) {
+          s = copy_file_cb(info.directory, info.relative_filename, info.size,
+                           info.file_type, info.file_checksum_func_name,
+                           info.file_checksum, info.temperature);
+        } else {
+          s = copy_file_cb(info.directory, info.relative_filename, info.size,
+                           info.file_type, kUnknownFileChecksumFuncName,
+                           kUnknownFileChecksum, info.temperature);
+        }
+      }
+    }
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+// Exports all live SST files of a specified Column Family onto export_dir,
+// returning SST files information in metadata.
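+//
+// Usage sketch (the handle and directory names here are illustrative only):
+//
+//   Checkpoint* checkpoint = nullptr;
+//   Status s = Checkpoint::Create(db, &checkpoint);
+//   ExportImportFilesMetaData* metadata = nullptr;
+//   if (s.ok()) {
+//     s = checkpoint->ExportColumnFamily(cf_handle, "/path/to/export_dir",
+//                                        &metadata);
+//   }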
+Status CheckpointImpl::ExportColumnFamily(
+    ColumnFamilyHandle* handle, const std::string& export_dir,
+    ExportImportFilesMetaData** metadata) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handle);
+  const auto cf_name = cfh->GetName();
+  const auto db_options = db_->GetDBOptions();
+
+  assert(metadata != nullptr);
+  assert(*metadata == nullptr);
+  auto s = db_->GetEnv()->FileExists(export_dir);
+  if (s.ok()) {
+    return Status::InvalidArgument("Specified export_dir exists");
+  } else if (!s.IsNotFound()) {
+    assert(s.IsIOError());
+    return s;
+  }
+
+  const auto final_nonslash_idx = export_dir.find_last_not_of('/');
+  if (final_nonslash_idx == std::string::npos) {
+    return Status::InvalidArgument("Specified export_dir invalid");
+  }
+  ROCKS_LOG_INFO(db_options.info_log,
+                 "[%s] export column family onto export directory %s",
+                 cf_name.c_str(), export_dir.c_str());
+
+  // Create a temporary export directory.
+  const auto tmp_export_dir =
+      export_dir.substr(0, final_nonslash_idx + 1) + ".tmp";
+  s = db_->GetEnv()->CreateDir(tmp_export_dir);
+
+  if (s.ok()) {
+    s = db_->Flush(ROCKSDB_NAMESPACE::FlushOptions(), handle);
+  }
+
+  ColumnFamilyMetaData db_metadata;
+  if (s.ok()) {
+    // Export live sst files with file deletions disabled.
+    s = db_->DisableFileDeletions();
+    if (s.ok()) {
+      db_->GetColumnFamilyMetaData(handle, &db_metadata);
+
+      s = ExportFilesInMetaData(
+          db_options, db_metadata,
+          [&](const std::string& src_dirname, const std::string& fname) {
+            ROCKS_LOG_INFO(db_options.info_log, "[%s] HardLinking %s",
+                           cf_name.c_str(), fname.c_str());
+            return db_->GetEnv()->LinkFile(src_dirname + fname,
+                                           tmp_export_dir + fname);
+          } /*link_file_cb*/,
+          [&](const std::string& src_dirname, const std::string& fname) {
+            ROCKS_LOG_INFO(db_options.info_log, "[%s] Copying %s",
+                           cf_name.c_str(), fname.c_str());
+            return CopyFile(db_->GetFileSystem(), src_dirname + fname,
+                            tmp_export_dir + fname, 0, db_options.use_fsync,
+                            nullptr, Temperature::kUnknown);
+          } /*copy_file_cb*/);
+
+      const auto enable_status = db_->EnableFileDeletions(false /*force*/);
+      if (s.ok()) {
+        s = enable_status;
+      }
+    }
+  }
+
+  auto moved_to_user_specified_dir = false;
+  if (s.ok()) {
+    // Move temporary export directory to the actual export directory.
+    s = db_->GetEnv()->RenameFile(tmp_export_dir, export_dir);
+  }
+
+  if (s.ok()) {
+    // Fsync export directory.
+    moved_to_user_specified_dir = true;
+    std::unique_ptr<FSDirectory> dir_ptr;
+    s = db_->GetFileSystem()->NewDirectory(export_dir, IOOptions(), &dir_ptr,
+                                           nullptr);
+    if (s.ok()) {
+      assert(dir_ptr != nullptr);
+      s = dir_ptr->FsyncWithDirOptions(
+          IOOptions(), nullptr,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kDirRenamed));
+    }
+  }
+
+  if (s.ok()) {
+    // Export of files succeeded. Fill in the metadata information.
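+    // Ownership of result_metadata is transferred to the caller through
+    // *metadata.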
+    auto result_metadata = new ExportImportFilesMetaData();
+    result_metadata->db_comparator_name = handle->GetComparator()->Name();
+    for (const auto& level_metadata : db_metadata.levels) {
+      for (const auto& file_metadata : level_metadata.files) {
+        LiveFileMetaData live_file_metadata;
+        live_file_metadata.size = file_metadata.size;
+        live_file_metadata.name = file_metadata.name;
+        live_file_metadata.file_number = file_metadata.file_number;
+        live_file_metadata.db_path = export_dir;
+        live_file_metadata.smallest_seqno = file_metadata.smallest_seqno;
+        live_file_metadata.largest_seqno = file_metadata.largest_seqno;
+        live_file_metadata.smallestkey = file_metadata.smallestkey;
+        live_file_metadata.largestkey = file_metadata.largestkey;
+        live_file_metadata.oldest_blob_file_number =
+            file_metadata.oldest_blob_file_number;
+        live_file_metadata.epoch_number = file_metadata.epoch_number;
+        live_file_metadata.level = level_metadata.level;
+        live_file_metadata.smallest = file_metadata.smallest;
+        live_file_metadata.largest = file_metadata.largest;
+        result_metadata->files.push_back(live_file_metadata);
+      }
+      *metadata = result_metadata;
+    }
+    ROCKS_LOG_INFO(db_options.info_log, "[%s] Export succeeded.",
+                   cf_name.c_str());
+  } else {
+    // Failure: Clean up all the files/directories created.
+    ROCKS_LOG_INFO(db_options.info_log, "[%s] Export failed. %s",
+                   cf_name.c_str(), s.ToString().c_str());
+    std::vector<std::string> subchildren;
+    const auto cleanup_dir =
+        moved_to_user_specified_dir ? export_dir : tmp_export_dir;
+    db_->GetEnv()->GetChildren(cleanup_dir, &subchildren);
+    for (const auto& subchild : subchildren) {
+      const auto subchild_path = cleanup_dir + "/" + subchild;
+      const auto status = db_->GetEnv()->DeleteFile(subchild_path);
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup file %s: %s",
+                       subchild_path.c_str(), status.ToString().c_str());
+      }
+    }
+    const auto status = db_->GetEnv()->DeleteDir(cleanup_dir);
+    if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup dir %s: %s",
+                     cleanup_dir.c_str(), status.ToString().c_str());
+    }
+  }
+  return s;
+}
+
+Status CheckpointImpl::ExportFilesInMetaData(
+    const DBOptions& db_options, const ColumnFamilyMetaData& metadata,
+    std::function<Status(const std::string& src_dirname,
+                         const std::string& src_fname)>
+        link_file_cb,
+    std::function<Status(const std::string& src_dirname,
+                         const std::string& src_fname)>
+        copy_file_cb) {
+  Status s;
+  auto hardlink_file = true;
+
+  // Copy/hard link files in metadata.
+  size_t num_files = 0;
+  for (const auto& level_metadata : metadata.levels) {
+    for (const auto& file_metadata : level_metadata.files) {
+      uint64_t number;
+      FileType type;
+      const auto ok = ParseFileName(file_metadata.name, &number, &type);
+      if (!ok) {
+        s = Status::Corruption("Could not parse file name");
+        break;
+      }
+
+      // We should only get sst files here.
+      assert(type == kTableFile);
+      assert(file_metadata.size > 0 && file_metadata.name[0] == '/');
+      const auto src_fname = file_metadata.name;
+      ++num_files;
+
+      if (hardlink_file) {
+        s = link_file_cb(db_->GetName(), src_fname);
+        if (num_files == 1 && s.IsNotSupported()) {
+          // Fallback to copy if link failed due to cross-device directories.
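+          // Hard links cannot cross filesystems, so after the first failed
+          // link we switch to copying for all remaining files.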
+          hardlink_file = false;
+          s = Status::OK();
+        }
+      }
+      if (!hardlink_file) {
+        s = copy_file_cb(db_->GetName(), src_fname);
+      }
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+  ROCKS_LOG_INFO(db_options.info_log, "Number of table files %" ROCKSDB_PRIszt,
+                 num_files);
+
+  return s;
+}
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/checkpoint/checkpoint_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/checkpoint/checkpoint_impl.h
new file mode 100644
index 0000000000..3cb9a6477f
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/checkpoint/checkpoint_impl.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "file/filename.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/checkpoint.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CheckpointImpl : public Checkpoint {
+ public:
+  explicit CheckpointImpl(DB* db) : db_(db) {}
+
+  Status CreateCheckpoint(const std::string& checkpoint_dir,
+                          uint64_t log_size_for_flush,
+                          uint64_t* sequence_number_ptr) override;
+
+  Status ExportColumnFamily(ColumnFamilyHandle* handle,
+                            const std::string& export_dir,
+                            ExportImportFilesMetaData** metadata) override;
+
+  // Checkpoint logic can be customized by providing callbacks for link, copy,
+  // or create.
+  Status CreateCustomCheckpoint(
+      std::function<Status(const std::string& src_dirname,
+                           const std::string& src_fname, FileType type)>
+          link_file_cb,
+      std::function<Status(const std::string& src_dirname,
+                           const std::string& src_fname,
+                           uint64_t size_limit_bytes, FileType type,
+                           const std::string& checksum_func_name,
+                           const std::string& checksum_val,
+                           const Temperature temperature)>
+          copy_file_cb,
+      std::function<Status(const std::string& fname,
+                           const std::string& contents, FileType type)>
+          create_file_cb,
+      uint64_t* sequence_number, uint64_t log_size_for_flush,
+      bool get_live_table_checksum = false);
+
+ private:
+  void CleanStagingDirectory(const std::string& path, Logger* info_log);
+
+  // Export logic customization by providing callbacks for link or copy.
+  Status ExportFilesInMetaData(
+      const DBOptions& db_options, const ColumnFamilyMetaData& metadata,
+      std::function<Status(const std::string& src_dirname,
+                           const std::string& src_fname)>
+          link_file_cb,
+      std::function<Status(const std::string& src_dirname,
+                           const std::string& src_fname)>
+          copy_file_cb);
+
+ private:
+  DB* db_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters.cc
new file mode 100644
index 0000000000..999f723651
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters.cc
@@ -0,0 +1,52 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
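+
+// String-based construction of compaction filters and registration of the
+// builtin ones (currently only RemoveEmptyValueCompactionFilter).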
+
+#include <memory>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/options_type.h"
+#include "utilities/compaction_filters/layered_compaction_filter_base.h"
+#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"
+
+namespace ROCKSDB_NAMESPACE {
+static int RegisterBuiltinCompactionFilters(ObjectLibrary& library,
+                                            const std::string& /*arg*/) {
+  library.AddFactory<CompactionFilter>(
+      RemoveEmptyValueCompactionFilter::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<CompactionFilter>* /*guard*/,
+         std::string* /*errmsg*/) {
+        return new RemoveEmptyValueCompactionFilter();
+      });
+  return 1;
+}
+Status CompactionFilter::CreateFromString(const ConfigOptions& config_options,
+                                          const std::string& value,
+                                          const CompactionFilter** result) {
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterBuiltinCompactionFilters(*(ObjectLibrary::Default().get()), "");
+  });
+  CompactionFilter* filter = const_cast<CompactionFilter*>(*result);
+  Status status =
+      LoadStaticObject<CompactionFilter>(config_options, value, &filter);
+  if (status.ok()) {
+    *result = const_cast<const CompactionFilter*>(filter);
+  }
+  return status;
+}
+
+Status CompactionFilterFactory::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<CompactionFilterFactory>* result) {
+  // Currently there are no builtin CompactionFilterFactories.
+  // If any are introduced, they need to be registered here.
+  Status status =
+      LoadSharedObject<CompactionFilterFactory>(config_options, value, result);
+  return status;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters/layered_compaction_filter_base.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters/layered_compaction_filter_base.h
new file mode 100644
index 0000000000..803fa94ae3
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters/layered_compaction_filter_base.h
@@ -0,0 +1,41 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Abstract base class for building layered compaction filter on top of
+// user compaction filter.
+// See BlobIndexCompactionFilter or TtlCompactionFilter for a basic usage.
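+//
+// The wrapper keeps at most one of its two inputs: if no raw filter is
+// given, the filter obtained from the factory is used instead.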
+class LayeredCompactionFilterBase : public CompactionFilter {
+ public:
+  LayeredCompactionFilterBase(
+      const CompactionFilter* _user_comp_filter,
+      std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory)
+      : user_comp_filter_(_user_comp_filter),
+        user_comp_filter_from_factory_(
+            std::move(_user_comp_filter_from_factory)) {
+    if (!user_comp_filter_) {
+      user_comp_filter_ = user_comp_filter_from_factory_.get();
+    }
+  }
+
+  // Return a pointer to user compaction filter
+  const CompactionFilter* user_comp_filter() const { return user_comp_filter_; }
+
+  const Customizable* Inner() const override { return user_comp_filter_; }
+
+ protected:
+  const CompactionFilter* user_comp_filter_;
+
+ private:
+  std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc
new file mode 100644
index 0000000000..c61ce02204
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc
@@ -0,0 +1,24 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"
+
+#include <string>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool RemoveEmptyValueCompactionFilter::Filter(int /*level*/,
+                                              const Slice& /*key*/,
+                                              const Slice& existing_value,
+                                              std::string* /*new_value*/,
+                                              bool* /*value_changed*/) const {
+  // remove kv pairs that have empty values
+  return existing_value.empty();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h
new file mode 100644
index 0000000000..2a56bd6c2e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
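+
+// A compaction filter that drops key-value pairs whose value is empty; see
+// the matching .cc file for the Filter() implementation.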
+
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RemoveEmptyValueCompactionFilter : public CompactionFilter {
+ public:
+  static const char* kClassName() { return "RemoveEmptyValueCompactionFilter"; }
+
+  const char* Name() const override { return kClassName(); }
+
+  bool Filter(int level, const Slice& key, const Slice& existing_value,
+              std::string* new_value, bool* value_changed) const override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/convenience/info_log_finder.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/convenience/info_log_finder.cc
new file mode 100644
index 0000000000..fe62fd5616
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/convenience/info_log_finder.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "rocksdb/utilities/info_log_finder.h"
+
+#include "file/filename.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status GetInfoLogList(DB* db, std::vector<std::string>* info_log_list) {
+  if (!db) {
+    return Status::InvalidArgument("DB pointer is not valid");
+  }
+  std::string parent_path;
+  const Options& options = db->GetOptions();
+  return GetInfoLogFiles(options.env->GetFileSystem(), options.db_log_dir,
+                         db->GetName(), &parent_path, info_log_list);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/counted_fs.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/counted_fs.cc
new file mode 100644
index 0000000000..e43f3a1912
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/counted_fs.cc
@@ -0,0 +1,379 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
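+
+// Wrappers that count file-system operations (opens, closes, reads, writes,
+// flushes, and syncs) in a shared FileOpCounters instance; see counted_fs.h.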
+
+#include "utilities/counted_fs.h"
+
+#include <sstream>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class CountedSequentialFile : public FSSequentialFileOwnerWrapper {
+ private:
+  CountedFileSystem* fs_;
+
+ public:
+  CountedSequentialFile(std::unique_ptr<FSSequentialFile>&& f,
+                        CountedFileSystem* fs)
+      : FSSequentialFileOwnerWrapper(std::move(f)), fs_(fs) {}
+
+  ~CountedSequentialFile() override { fs_->counters()->closes++; }
+
+  IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                char* scratch, IODebugContext* dbg) override {
+    IOStatus rv = target()->Read(n, options, result, scratch, dbg);
+    fs_->counters()->reads.RecordOp(rv, result->size());
+    return rv;
+  }
+
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override {
+    IOStatus rv =
+        target()->PositionedRead(offset, n, options, result, scratch, dbg);
+    fs_->counters()->reads.RecordOp(rv, result->size());
+    return rv;
+  }
+};
+
+class CountedRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ private:
+  CountedFileSystem* fs_;
+
+ public:
+  CountedRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& f,
+                          CountedFileSystem* fs)
+      : FSRandomAccessFileOwnerWrapper(std::move(f)), fs_(fs) {}
+
+  ~CountedRandomAccessFile() override { fs_->counters()->closes++; }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    IOStatus rv = target()->Read(offset, n, options, result, scratch, dbg);
+    fs_->counters()->reads.RecordOp(rv, result->size());
+    return rv;
+  }
+
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->MultiRead(reqs, num_reqs, options, dbg);
+    for (size_t r = 0; r < num_reqs; r++) {
+      fs_->counters()->reads.RecordOp(reqs[r].status, reqs[r].result.size());
+    }
+    return rv;
+  }
+};
+
+class CountedWritableFile : public FSWritableFileOwnerWrapper {
+ private:
+  CountedFileSystem* fs_;
+
+ public:
+  CountedWritableFile(std::unique_ptr<FSWritableFile>&& f,
+                      CountedFileSystem* fs)
+      : FSWritableFileOwnerWrapper(std::move(f)), fs_(fs) {}
+
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override {
+    IOStatus rv = target()->Append(data, options, dbg);
+    fs_->counters()->writes.RecordOp(rv, data.size());
+    return rv;
+  }
+
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  const DataVerificationInfo& info,
+                  IODebugContext* dbg) override {
+    IOStatus rv = target()->Append(data, options, info, dbg);
+    fs_->counters()->writes.RecordOp(rv, data.size());
+    return rv;
+  }
+
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override {
+    IOStatus rv = target()->PositionedAppend(data, offset, options, dbg);
+    fs_->counters()->writes.RecordOp(rv, data.size());
+    return rv;
+  }
+
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            const DataVerificationInfo& info,
+                            IODebugContext* dbg) override {
+    IOStatus rv = target()->PositionedAppend(data, offset, options, info, dbg);
+    fs_->counters()->writes.RecordOp(rv, data.size());
+    return rv;
+  }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Close(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->closes++;
+    }
+    return rv;
+  }
+
+  IOStatus Flush(const IOOptions& options,
+                 IODebugContext* dbg) override {
+    IOStatus rv = target()->Flush(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->flushes++;
+    }
+    return rv;
+  }
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Sync(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->syncs++;
+    }
+    return rv;
+  }
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Fsync(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->fsyncs++;
+    }
+    return rv;
+  }
+
+  IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+                     IODebugContext* dbg) override {
+    IOStatus rv = target()->RangeSync(offset, nbytes, options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->syncs++;
+    }
+    return rv;
+  }
+};
+
+class CountedRandomRWFile : public FSRandomRWFileOwnerWrapper {
+ private:
+  mutable CountedFileSystem* fs_;
+
+ public:
+  CountedRandomRWFile(std::unique_ptr<FSRandomRWFile>&& f,
+                      CountedFileSystem* fs)
+      : FSRandomRWFileOwnerWrapper(std::move(f)), fs_(fs) {}
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+                 IODebugContext* dbg) override {
+    IOStatus rv = target()->Write(offset, data, options, dbg);
+    fs_->counters()->writes.RecordOp(rv, data.size());
+    return rv;
+  }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    IOStatus rv = target()->Read(offset, n, options, result, scratch, dbg);
+    fs_->counters()->reads.RecordOp(rv, result->size());
+    return rv;
+  }
+
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Flush(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->flushes++;
+    }
+    return rv;
+  }
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Sync(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->syncs++;
+    }
+    return rv;
+  }
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Fsync(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->fsyncs++;
+    }
+    return rv;
+  }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Close(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->closes++;
+    }
+    return rv;
+  }
+};
+
+class CountedDirectory : public FSDirectoryWrapper {
+ private:
+  mutable CountedFileSystem* fs_;
+  bool closed_ = false;
+
+ public:
+  CountedDirectory(std::unique_ptr<FSDirectory>&& f, CountedFileSystem* fs)
+      : FSDirectoryWrapper(std::move(f)), fs_(fs) {}
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = FSDirectoryWrapper::Fsync(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->dsyncs++;
+    }
+    return rv;
+  }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = FSDirectoryWrapper::Close(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->closes++;
+      fs_->counters()->dir_closes++;
+      closed_ = true;
+    }
+    return rv;
+  }
+
+  IOStatus FsyncWithDirOptions(const IOOptions& options, IODebugContext* dbg,
+                               const DirFsyncOptions& dir_options) override {
+    IOStatus rv =
+        FSDirectoryWrapper::FsyncWithDirOptions(options, dbg, dir_options);
+    if (rv.ok()) {
+      fs_->counters()->dsyncs++;
+    }
+    return rv;
+  }
+
+  ~CountedDirectory() {
+    if (!closed_) {
+      // TODO: fix DB+CF code to use explicit Close, not rely on destructor
+      fs_->counters()->closes++;
+      fs_->counters()->dir_closes++;
+    }
+  }
+};
+}  // anonymous namespace
+
+std::string
+FileOpCounters::PrintCounters() const {
+  std::stringstream ss;
+  ss << "Num files opened: " << opens.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num files deleted: " << deletes.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num files renamed: " << renames.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num Flush(): " << flushes.load(std::memory_order_relaxed) << std::endl;
+  ss << "Num Sync(): " << syncs.load(std::memory_order_relaxed) << std::endl;
+  ss << "Num Fsync(): " << fsyncs.load(std::memory_order_relaxed) << std::endl;
+  ss << "Num Dir Fsync(): " << dsyncs.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num Close(): " << closes.load(std::memory_order_relaxed) << std::endl;
+  ss << "Num Dir Open(): " << dir_opens.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num Dir Close(): " << dir_closes.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num Read(): " << reads.ops.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num Append(): " << writes.ops.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num bytes read: " << reads.bytes.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num bytes written: " << writes.bytes.load(std::memory_order_relaxed)
+     << std::endl;
+  return ss.str();
+}
+
+CountedFileSystem::CountedFileSystem(const std::shared_ptr<FileSystem>& base)
+    : FileSystemWrapper(base) {}
+
+IOStatus CountedFileSystem::NewSequentialFile(
+    const std::string& f, const FileOptions& options,
+    std::unique_ptr<FSSequentialFile>* r, IODebugContext* dbg) {
+  std::unique_ptr<FSSequentialFile> base;
+  IOStatus s = target()->NewSequentialFile(f, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    r->reset(new CountedSequentialFile(std::move(base), this));
+  }
+  return s;
+}
+
+IOStatus CountedFileSystem::NewRandomAccessFile(
+    const std::string& f, const FileOptions& options,
+    std::unique_ptr<FSRandomAccessFile>* r, IODebugContext* dbg) {
+  std::unique_ptr<FSRandomAccessFile> base;
+  IOStatus s = target()->NewRandomAccessFile(f, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    r->reset(new CountedRandomAccessFile(std::move(base), this));
+  }
+  return s;
+}
+
+IOStatus CountedFileSystem::NewWritableFile(const std::string& f,
+                                            const FileOptions& options,
+                                            std::unique_ptr<FSWritableFile>* r,
+                                            IODebugContext* dbg) {
+  std::unique_ptr<FSWritableFile> base;
+  IOStatus s = target()->NewWritableFile(f, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    r->reset(new CountedWritableFile(std::move(base), this));
+  }
+  return s;
+}
+
+IOStatus CountedFileSystem::ReopenWritableFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+  std::unique_ptr<FSWritableFile> base;
+  IOStatus s = target()->ReopenWritableFile(fname, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    result->reset(new CountedWritableFile(std::move(base), this));
+  }
+  return s;
+}
+
+IOStatus CountedFileSystem::ReuseWritableFile(
+    const std::string& fname, const std::string& old_fname,
+    const FileOptions& options, std::unique_ptr<FSWritableFile>* result,
+    IODebugContext* dbg) {
+  std::unique_ptr<FSWritableFile> base;
+  IOStatus s =
+      target()->ReuseWritableFile(fname, old_fname, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    result->reset(new CountedWritableFile(std::move(base), this));
+  }
+  return s;
+}
+
+IOStatus CountedFileSystem::NewRandomRWFile(
+    const std::string& name, const FileOptions& options,
+    std::unique_ptr<FSRandomRWFile>* result, IODebugContext* dbg) {
+  std::unique_ptr<FSRandomRWFile> base;
+  IOStatus s = target()->NewRandomRWFile(name, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    result->reset(new CountedRandomRWFile(std::move(base), this));
+  }
+  return s;
+}
+
+IOStatus CountedFileSystem::NewDirectory(const std::string& name,
+                                         const IOOptions& options,
+                                         std::unique_ptr<FSDirectory>* result,
+                                         IODebugContext* dbg) {
+  std::unique_ptr<FSDirectory> base;
+  IOStatus s = target()->NewDirectory(name, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    counters_.dir_opens++;
+    result->reset(new CountedDirectory(std::move(base), this));
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/counted_fs.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/counted_fs.h
new file mode 100644
index 0000000000..cb8a8968fb
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/counted_fs.h
@@ -0,0 +1,158 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <sstream>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+
+struct OpCounter {
+  std::atomic<int> ops;
+  std::atomic<uint64_t> bytes;
+
+  OpCounter() : ops(0), bytes(0) {}
+
+  void Reset() {
+    ops = 0;
+    bytes = 0;
+  }
+  void RecordOp(const IOStatus& io_s, size_t added_bytes) {
+    if (!io_s.IsNotSupported()) {
+      ops.fetch_add(1, std::memory_order_relaxed);
+    }
+    if (io_s.ok()) {
+      bytes.fetch_add(added_bytes, std::memory_order_relaxed);
+    }
+  }
+};
+
+struct FileOpCounters {
+  static const char* kName() { return "FileOpCounters"; }
+
+  std::atomic<int> opens;
+  std::atomic<int> closes;
+  std::atomic<int> deletes;
+  std::atomic<int> renames;
+  std::atomic<int> flushes;
+  std::atomic<int> syncs;
+  std::atomic<int> dsyncs;
+  std::atomic<int> fsyncs;
+  std::atomic<int> dir_opens;
+  std::atomic<int> dir_closes;
+  OpCounter reads;
+  OpCounter writes;
+
+  FileOpCounters()
+      : opens(0),
+        closes(0),
+        deletes(0),
+        renames(0),
+        flushes(0),
+        syncs(0),
+        dsyncs(0),
+        fsyncs(0),
+        dir_opens(0),
+        dir_closes(0) {}
+
+  void Reset() {
+    opens = 0;
+    closes = 0;
+    deletes = 0;
+    renames = 0;
+    flushes = 0;
+    syncs = 0;
+    dsyncs = 0;
+    fsyncs = 0;
+    dir_opens = 0;
+    dir_closes = 0;
+    reads.Reset();
+    writes.Reset();
+  }
+  std::string PrintCounters() const;
+};
+
+// A FileSystem class that counts operations (reads, writes, opens, closes, etc)
+class CountedFileSystem : public FileSystemWrapper {
+ public:
+ private:
+  FileOpCounters counters_;
+
+ public:
+  explicit CountedFileSystem(const std::shared_ptr<FileSystem>& base);
+  static const char* kClassName() { return "CountedFileSystem"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewSequentialFile(const std::string& f, const FileOptions& options,
+                             std::unique_ptr<FSSequentialFile>* r,
+                             IODebugContext* dbg) override;
+
+  IOStatus NewRandomAccessFile(const std::string& f,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* r,
+                               IODebugContext* dbg) override;
+
+  IOStatus NewWritableFile(const std::string& f, const FileOptions& options,
+                           std::unique_ptr<FSWritableFile>* r,
+                           IODebugContext* dbg) override;
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& options,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override;
+
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* dbg) override;
+  IOStatus
+  NewRandomRWFile(const std::string& name, const FileOptions& options,
+                  std::unique_ptr<FSRandomRWFile>* result,
+                  IODebugContext* dbg) override;
+
+  IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override;
+
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override {
+    IOStatus s = target()->DeleteFile(fname, options, dbg);
+    if (s.ok()) {
+      counters_.deletes++;
+    }
+    return s;
+  }
+
+  IOStatus RenameFile(const std::string& s, const std::string& t,
+                      const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus st = target()->RenameFile(s, t, options, dbg);
+    if (st.ok()) {
+      counters_.renames++;
+    }
+    return st;
+  }
+
+  const FileOpCounters* counters() const { return &counters_; }
+
+  FileOpCounters* counters() { return &counters_; }
+
+  const void* GetOptionsPtr(const std::string& name) const override {
+    if (name == FileOpCounters::kName()) {
+      return counters();
+    } else {
+      return FileSystemWrapper::GetOptionsPtr(name);
+    }
+  }
+
+  // Prints the counters to a string
+  std::string PrintCounters() const { return counters_.PrintCounters(); }
+  void ResetCounters() { counters_.Reset(); }
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/debug.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/debug.cc
new file mode 100644
index 0000000000..911bc510a6
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/debug.cc
@@ -0,0 +1,118 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
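+
+// Implementation of GetAllKeyVersions(), which walks a DB's internal
+// iterator and reports each internal key's sequence number and value type.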
+
+
+#include "rocksdb/utilities/debug.h"
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::unordered_map<std::string, ValueType> value_type_string_map = {
+    {"TypeDeletion", ValueType::kTypeDeletion},
+    {"TypeValue", ValueType::kTypeValue},
+    {"TypeMerge", ValueType::kTypeMerge},
+    {"TypeLogData", ValueType::kTypeLogData},
+    {"TypeColumnFamilyDeletion", ValueType::kTypeColumnFamilyDeletion},
+    {"TypeColumnFamilyValue", ValueType::kTypeColumnFamilyValue},
+    {"TypeColumnFamilyMerge", ValueType::kTypeColumnFamilyMerge},
+    {"TypeSingleDeletion", ValueType::kTypeSingleDeletion},
+    {"TypeColumnFamilySingleDeletion",
+     ValueType::kTypeColumnFamilySingleDeletion},
+    {"TypeBeginPrepareXID", ValueType::kTypeBeginPrepareXID},
+    {"TypeEndPrepareXID", ValueType::kTypeEndPrepareXID},
+    {"TypeCommitXID", ValueType::kTypeCommitXID},
+    {"TypeRollbackXID", ValueType::kTypeRollbackXID},
+    {"TypeNoop", ValueType::kTypeNoop},
+    {"TypeColumnFamilyRangeDeletion",
+     ValueType::kTypeColumnFamilyRangeDeletion},
+    {"TypeRangeDeletion", ValueType::kTypeRangeDeletion},
+    {"TypeColumnFamilyBlobIndex", ValueType::kTypeColumnFamilyBlobIndex},
+    {"TypeBlobIndex", ValueType::kTypeBlobIndex},
+    {"TypeBeginPersistedPrepareXID", ValueType::kTypeBeginPersistedPrepareXID},
+    {"TypeBeginUnprepareXID", ValueType::kTypeBeginUnprepareXID},
+    {"TypeDeletionWithTimestamp", ValueType::kTypeDeletionWithTimestamp},
+    {"TypeCommitXIDAndTimestamp", ValueType::kTypeCommitXIDAndTimestamp},
+    {"TypeWideColumnEntity", ValueType::kTypeWideColumnEntity},
+    {"TypeColumnFamilyWideColumnEntity",
+     ValueType::kTypeColumnFamilyWideColumnEntity}};
+
+std::string KeyVersion::GetTypeName() const {
+  std::string type_name;
+  if (SerializeEnum<ValueType>(value_type_string_map,
+                               static_cast<ValueType>(type), &type_name)) {
+    return type_name;
+  } else {
+    return "Invalid";
+  }
+}
+
+Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
+                         size_t max_num_ikeys,
+                         std::vector<KeyVersion>* key_versions) {
+  if (nullptr == db) {
+    return Status::InvalidArgument("db cannot be null.");
+  }
+  return GetAllKeyVersions(db, db->DefaultColumnFamily(), begin_key, end_key,
+                           max_num_ikeys, key_versions);
+}
+
+Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
+                         Slice end_key, size_t max_num_ikeys,
+                         std::vector<KeyVersion>* key_versions) {
+  if (nullptr == db) {
+    return Status::InvalidArgument("db cannot be null.");
+  }
+  if (nullptr == cfh) {
+    return Status::InvalidArgument("Column family handle cannot be null.");
+  }
+  if (nullptr == key_versions) {
+    return Status::InvalidArgument("key_versions cannot be null.");
+  }
+  key_versions->clear();
+
+  DBImpl* idb = static_cast<DBImpl*>(db->GetRootDB());
+  auto icmp = InternalKeyComparator(idb->GetOptions(cfh).comparator);
+  ReadOptions read_options;
+  Arena arena;
+  ScopedArenaIterator iter(
+      idb->NewInternalIterator(read_options, &arena, kMaxSequenceNumber, cfh));
+
+  if (!begin_key.empty()) {
+    InternalKey ikey;
+    ikey.SetMinPossibleForUserKey(begin_key);
+    iter->Seek(ikey.Encode());
+  } else {
+    iter->SeekToFirst();
+  }
+
+  size_t num_keys = 0;
+  for (; iter->Valid(); iter->Next()) {
+    ParsedInternalKey ikey;
+    Status pik_status =
+        ParseInternalKey(iter->key(), &ikey, true /* log_err_key */);  // TODO
+    if (!pik_status.ok()) {
+      return pik_status;
+    }
+
+    if (!end_key.empty() &&
+        icmp.user_comparator()->Compare(ikey.user_key, end_key) > 0) {
+      break;
+    }
+
+    key_versions->emplace_back(ikey.user_key.ToString() /* _user_key */,
+                               iter->value().ToString() /* _value */,
+                               ikey.sequence /* _sequence */,
+                               static_cast<int>(ikey.type) /* _type */);
+    if (++num_keys >= max_num_ikeys) {
+      break;
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_mirror.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_mirror.cc
new file mode 100644
index 0000000000..0802d7c708
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_mirror.cc
@@ -0,0 +1,273 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2015, Red Hat, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+
+#include "rocksdb/utilities/env_mirror.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An implementation of Env that mirrors all work over two backend
+// Env's. This is useful for debugging purposes.
+class SequentialFileMirror : public SequentialFile {
+ public:
+  std::unique_ptr<SequentialFile> a_, b_;
+  std::string fname;
+  explicit SequentialFileMirror(std::string f) : fname(f) {}
+
+  Status Read(size_t n, Slice* result, char* scratch) override {
+    Slice aslice;
+    Status as = a_->Read(n, &aslice, scratch);
+    if (as == Status::OK()) {
+      char* bscratch = new char[n];
+      Slice bslice;
+#ifndef NDEBUG
+      size_t off = 0;
+#endif
+      size_t left = aslice.size();
+      while (left) {
+        Status bs = b_->Read(left, &bslice, bscratch);
+#ifndef NDEBUG
+        assert(as == bs);
+        assert(memcmp(bscratch, scratch + off, bslice.size()) == 0);
+        off += bslice.size();
+#endif
+        left -= bslice.size();
+      }
+      delete[] bscratch;
+      *result = aslice;
+    } else {
+      Status bs = b_->Read(n, result, scratch);
+      assert(as == bs);
+    }
+    return as;
+  }
+
+  Status Skip(uint64_t n) override {
+    Status as = a_->Skip(n);
+    Status bs = b_->Skip(n);
+    assert(as == bs);
+    return as;
+  }
+  Status InvalidateCache(size_t offset, size_t length) override {
+    Status as = a_->InvalidateCache(offset, length);
+    Status bs = b_->InvalidateCache(offset, length);
+    assert(as == bs);
+    return as;
+  };
+};
+
+class RandomAccessFileMirror : public RandomAccessFile {
+ public:
+  std::unique_ptr<RandomAccessFile> a_, b_;
+  std::string fname;
+  explicit RandomAccessFileMirror(std::string f) : fname(f) {}
+
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* scratch) const override {
+    Status as = a_->Read(offset, n, result, scratch);
+    if (as == Status::OK()) {
+      char* bscratch = new char[n];
+      Slice bslice;
+      size_t off = 0;
+      size_t left = result->size();
+      while (left) {
+        Status bs = b_->Read(offset + off, left, &bslice, bscratch);
+        assert(as == bs);
+        assert(memcmp(bscratch, scratch + off, bslice.size()) == 0);
+        off += bslice.size();
+        left -= bslice.size();
+      }
+      delete[] bscratch;
+    } else {
+      Status bs = b_->Read(offset, n, result, scratch);
+      assert(as == bs);
+    }
+    return as;
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    // NOTE: not verified
+    return a_->GetUniqueId(id, max_size);
+  }
+};
+
+class WritableFileMirror : public WritableFile {
+ public:
+  std::unique_ptr<WritableFile> a_, b_;
+  std::string fname;
+  explicit WritableFileMirror(std::string f, const EnvOptions& options)
+      : WritableFile(options), fname(f) {}
+
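+  // Each mutating call below is issued to both underlying files and the two
+  // Status results are asserted to agree.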
+  Status Append(const Slice& data) override {
+    Status as = a_->Append(data);
+    Status bs = b_->Append(data);
+    assert(as == bs);
+    return as;
+  }
+  Status Append(const Slice& data,
+                const DataVerificationInfo& /* verification_info */) override {
+    return Append(data);
+  }
+  Status PositionedAppend(const Slice& data, uint64_t offset) override {
+    Status as = a_->PositionedAppend(data, offset);
+    Status bs = b_->PositionedAppend(data, offset);
+    assert(as == bs);
+    return as;
+  }
+  Status PositionedAppend(
+      const Slice& data, uint64_t offset,
+      const DataVerificationInfo& /* verification_info */) override {
+    return PositionedAppend(data, offset);
+  }
+  Status Truncate(uint64_t size) override {
+    Status as = a_->Truncate(size);
+    Status bs = b_->Truncate(size);
+    assert(as == bs);
+    return as;
+  }
+  Status Close() override {
+    Status as = a_->Close();
+    Status bs = b_->Close();
+    assert(as == bs);
+    return as;
+  }
+  Status Flush() override {
+    Status as = a_->Flush();
+    Status bs = b_->Flush();
+    assert(as == bs);
+    return as;
+  }
+  Status Sync() override {
+    Status as = a_->Sync();
+    Status bs = b_->Sync();
+    assert(as == bs);
+    return as;
+  }
+  Status Fsync() override {
+    Status as = a_->Fsync();
+    Status bs = b_->Fsync();
+    assert(as == bs);
+    return as;
+  }
+  bool IsSyncThreadSafe() const override {
+    bool as = a_->IsSyncThreadSafe();
+    assert(as == b_->IsSyncThreadSafe());
+    return as;
+  }
+  void SetIOPriority(Env::IOPriority pri) override {
+    a_->SetIOPriority(pri);
+    b_->SetIOPriority(pri);
+  }
+  Env::IOPriority GetIOPriority() override {
+    // NOTE: we don't verify this one
+    return a_->GetIOPriority();
+  }
+  uint64_t GetFileSize() override {
+    uint64_t as = a_->GetFileSize();
+    assert(as == b_->GetFileSize());
+    return as;
+  }
+  void GetPreallocationStatus(size_t* block_size,
+                              size_t* last_allocated_block) override {
+    // NOTE: we don't verify this one
+    return a_->GetPreallocationStatus(block_size, last_allocated_block);
+  }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    // NOTE: we don't verify this one
+    return a_->GetUniqueId(id, max_size);
+  }
+  Status InvalidateCache(size_t offset, size_t length) override {
+    Status as = a_->InvalidateCache(offset, length);
+    Status bs = b_->InvalidateCache(offset, length);
+    assert(as == bs);
+    return as;
+  }
+
+ protected:
+  Status Allocate(uint64_t offset, uint64_t length) override {
+    Status as = a_->Allocate(offset, length);
+    Status bs = b_->Allocate(offset, length);
+    assert(as == bs);
+    return as;
+  }
+  Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+    Status as = a_->RangeSync(offset, nbytes);
+    Status bs = b_->RangeSync(offset, nbytes);
+    assert(as == bs);
+    return as;
+  }
+};
+
+Status EnvMirror::NewSequentialFile(const std::string& f,
+                                    std::unique_ptr<SequentialFile>* r,
+                                    const EnvOptions& options) {
+  if (f.find("/proc/") == 0) {
+    return a_->NewSequentialFile(f, r, options);
+  }
+  SequentialFileMirror* mf = new SequentialFileMirror(f);
+  Status as = a_->NewSequentialFile(f, &mf->a_, options);
+  Status bs = b_->NewSequentialFile(f, &mf->b_, options);
+  assert(as == bs);
+  if (as.ok())
+    r->reset(mf);
+  else
+    delete mf;
+  return as;
+}
+
+Status EnvMirror::NewRandomAccessFile(const std::string& f,
+                                      std::unique_ptr<RandomAccessFile>* r,
+                                      const EnvOptions& options) {
+  if (f.find("/proc/") == 0) {
+    return a_->NewRandomAccessFile(f, r, options);
+  }
+  RandomAccessFileMirror* mf = new RandomAccessFileMirror(f);
+  Status as = a_->NewRandomAccessFile(f, &mf->a_, options);
+  Status bs = b_->NewRandomAccessFile(f, &mf->b_, options);
+ assert(as == bs); + if (as.ok()) + r->reset(mf); + else + delete mf; + return as; +} + +Status EnvMirror::NewWritableFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + if (f.find("/proc/") == 0) return a_->NewWritableFile(f, r, options); + WritableFileMirror* mf = new WritableFileMirror(f, options); + Status as = a_->NewWritableFile(f, &mf->a_, options); + Status bs = b_->NewWritableFile(f, &mf->b_, options); + assert(as == bs); + if (as.ok()) + r->reset(mf); + else + delete mf; + return as; +} + +Status EnvMirror::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* r, + const EnvOptions& options) { + if (fname.find("/proc/") == 0) + return a_->ReuseWritableFile(fname, old_fname, r, options); + WritableFileMirror* mf = new WritableFileMirror(fname, options); + Status as = a_->ReuseWritableFile(fname, old_fname, &mf->a_, options); + Status bs = b_->ReuseWritableFile(fname, old_fname, &mf->b_, options); + assert(as == bs); + if (as.ok()) + r->reset(mf); + else + delete mf; + return as; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_timed.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_timed.cc new file mode 100644 index 0000000000..01fdfccaf3 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_timed.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#include "utilities/env_timed.h" + +#include "env/composite_env_wrapper.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +TimedFileSystem::TimedFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} +IOStatus TimedFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + PERF_TIMER_GUARD(env_new_sequential_file_nanos); + return FileSystemWrapper::NewSequentialFile(fname, options, result, dbg); +} + +IOStatus TimedFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + PERF_TIMER_GUARD(env_new_random_access_file_nanos); + return FileSystemWrapper::NewRandomAccessFile(fname, options, result, dbg); +} + +IOStatus TimedFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + PERF_TIMER_GUARD(env_new_writable_file_nanos); + return FileSystemWrapper::NewWritableFile(fname, options, result, dbg); +} + +IOStatus TimedFileSystem::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr* result, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_reuse_writable_file_nanos); + return FileSystemWrapper::ReuseWritableFile(fname, old_fname, options, result, + dbg); +} + +IOStatus TimedFileSystem::NewRandomRWFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + PERF_TIMER_GUARD(env_new_random_rw_file_nanos); + return FileSystemWrapper::NewRandomRWFile(fname, options, result, dbg); +} + +IOStatus TimedFileSystem::NewDirectory(const std::string& name, + const 
IOOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_new_directory_nanos); + return FileSystemWrapper::NewDirectory(name, options, result, dbg); +} + +IOStatus TimedFileSystem::FileExists(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_file_exists_nanos); + return FileSystemWrapper::FileExists(fname, options, dbg); +} + +IOStatus TimedFileSystem::GetChildren(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_get_children_nanos); + return FileSystemWrapper::GetChildren(dir, options, result, dbg); +} + +IOStatus TimedFileSystem::GetChildrenFileAttributes( + const std::string& dir, const IOOptions& options, + std::vector* result, IODebugContext* dbg) { + PERF_TIMER_GUARD(env_get_children_file_attributes_nanos); + return FileSystemWrapper::GetChildrenFileAttributes(dir, options, result, + dbg); +} + +IOStatus TimedFileSystem::DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_delete_file_nanos); + return FileSystemWrapper::DeleteFile(fname, options, dbg); +} + +IOStatus TimedFileSystem::CreateDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_create_dir_nanos); + return FileSystemWrapper::CreateDir(dirname, options, dbg); +} + +IOStatus TimedFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_create_dir_if_missing_nanos); + return FileSystemWrapper::CreateDirIfMissing(dirname, options, dbg); +} + +IOStatus TimedFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_delete_dir_nanos); + return FileSystemWrapper::DeleteDir(dirname, options, dbg); +} + +IOStatus TimedFileSystem::GetFileSize(const std::string& fname, + const IOOptions& options, + uint64_t* file_size, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_get_file_size_nanos); + return FileSystemWrapper::GetFileSize(fname, options, file_size, dbg); +} + +IOStatus TimedFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_get_file_modification_time_nanos); + return FileSystemWrapper::GetFileModificationTime(fname, options, file_mtime, + dbg); +} + +IOStatus TimedFileSystem::RenameFile(const std::string& src, + const std::string& dst, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_rename_file_nanos); + return FileSystemWrapper::RenameFile(src, dst, options, dbg); +} + +IOStatus TimedFileSystem::LinkFile(const std::string& src, + const std::string& dst, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_link_file_nanos); + return FileSystemWrapper::LinkFile(src, dst, options, dbg); +} + +IOStatus TimedFileSystem::LockFile(const std::string& fname, + const IOOptions& options, FileLock** lock, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_lock_file_nanos); + return FileSystemWrapper::LockFile(fname, options, lock, dbg); +} + +IOStatus TimedFileSystem::UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_unlock_file_nanos); + return FileSystemWrapper::UnlockFile(lock, options, dbg); +} + +IOStatus TimedFileSystem::NewLogger(const std::string& fname, + const IOOptions& options, + std::shared_ptr* result, 
+ IODebugContext* dbg) { + PERF_TIMER_GUARD(env_new_logger_nanos); + return FileSystemWrapper::NewLogger(fname, options, result, dbg); +} + +std::shared_ptr NewTimedFileSystem( + const std::shared_ptr& base) { + return std::make_shared(base); +} + +// An environment that measures function call times for filesystem +// operations, reporting results to variables in PerfContext. +Env* NewTimedEnv(Env* base_env) { + std::shared_ptr timed_fs = + NewTimedFileSystem(base_env->GetFileSystem()); + return new CompositeEnvWrapper(base_env, timed_fs); +} + + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_timed.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_timed.h new file mode 100644 index 0000000000..6c6ac89fb4 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/env_timed.h @@ -0,0 +1,95 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once +#include "rocksdb/file_system.h" +namespace ROCKSDB_NAMESPACE { +class TimedFileSystem : public FileSystemWrapper { + public: + explicit TimedFileSystem(const std::shared_ptr& base); + + static const char* kClassName() { return "TimedFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewDirectory(const std::string& name, const IOOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus FileExists(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + + IOStatus RenameFile(const std::string& src, const 
std::string& dst,
+                      const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus LinkFile(const std::string& src, const std::string& dst,
+                    const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus LockFile(const std::string& fname, const IOOptions& options,
+                    FileLock** lock, IODebugContext* dbg) override;
+
+  IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
+                      IODebugContext* dbg) override;
+
+  IOStatus NewLogger(const std::string& fname, const IOOptions& options,
+                     std::shared_ptr<Logger>* result,
+                     IODebugContext* dbg) override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_env.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_env.cc
new file mode 100644
index 0000000000..b0495a8c18
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_env.cc
@@ -0,0 +1,555 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
+
+#include "utilities/fault_injection_env.h"
+
+#include <functional>
+#include <utility>
+
+#include "util/random.h"
+namespace ROCKSDB_NAMESPACE {
+
+// Assumes a file name, not a directory name like "/foo/bar/"
+std::string GetDirName(const std::string filename) {
+  size_t found = filename.find_last_of("/\\");
+  if (found == std::string::npos) {
+    return "";
+  } else {
+    return filename.substr(0, found);
+  }
+}
+
+// A basic file truncation function suitable for this test.
+Status Truncate(Env* env, const std::string& filename, uint64_t length) {
+  std::unique_ptr<SequentialFile> orig_file;
+  const EnvOptions options;
+  Status s = env->NewSequentialFile(filename, &orig_file, options);
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot open file %s for truncation: %s\n",
+            filename.c_str(), s.ToString().c_str());
+    return s;
+  }
+
+  std::unique_ptr<char[]> scratch(new char[length]);
+  ROCKSDB_NAMESPACE::Slice result;
+  s = orig_file->Read(length, &result, scratch.get());
+#ifdef OS_WIN
+  orig_file.reset();
+#endif
+  if (s.ok()) {
+    std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
+    std::unique_ptr<WritableFile> tmp_file;
+    s = env->NewWritableFile(tmp_name, &tmp_file, options);
+    if (s.ok()) {
+      s = tmp_file->Append(result);
+      if (s.ok()) {
+        s = env->RenameFile(tmp_name, filename);
+      } else {
+        fprintf(stderr, "Cannot append to temp file %s: %s\n",
+                tmp_name.c_str(), s.ToString().c_str());
+        env->DeleteFile(tmp_name);
+      }
+    }
+  }
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(),
+            s.ToString().c_str());
+  }
+
+  return s;
+}
+
+// Trim the trailing "/" at the end of `str`
+std::string TrimDirname(const std::string& str) {
+  size_t found = str.find_last_not_of("/");
+  if (found == std::string::npos) {
+    return str;
+  }
+  return str.substr(0, found + 1);
+}
+
+// Split a full path into its directory and file-name parts.
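+// For example, "/foo/bar/baz" yields {"/foo/bar", "baz"}. The input is
+// assumed to contain at least one path separator.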
+std::pair GetDirAndName(const std::string& name) { + std::string dirname = GetDirName(name); + std::string fname = name.substr(dirname.size() + 1); + return std::make_pair(dirname, fname); +} + +Status FileState::DropUnsyncedData(Env* env) const { + ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_; + return Truncate(env, filename_, sync_pos); +} + +Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const { + ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_; + assert(pos_ >= sync_pos); + int range = static_cast(pos_ - sync_pos); + uint64_t truncated_size = + static_cast(sync_pos) + rand->Uniform(range); + return Truncate(env, filename_, truncated_size); +} + +Status TestDirectory::Fsync() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + env_->SyncDir(dirname_); + return dir_->Fsync(); +} + +Status TestDirectory::Close() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return dir_->Close(); +} + +TestRandomAccessFile::TestRandomAccessFile( + std::unique_ptr&& target, FaultInjectionTestEnv* env) + : target_(std::move(target)), env_(env) { + assert(target_); + assert(env_); +} + +Status TestRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + assert(env_); + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + + assert(target_); + return target_->Read(offset, n, result, scratch); +} + +Status TestRandomAccessFile::Prefetch(uint64_t offset, size_t n) { + assert(env_); + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + + assert(target_); + return target_->Prefetch(offset, n); +} + +Status TestRandomAccessFile::MultiRead(ReadRequest* reqs, size_t num_reqs) { + assert(env_); + if (!env_->IsFilesystemActive()) { + const Status s = env_->GetError(); + + assert(reqs); + for (size_t i = 0; i < num_reqs; ++i) { + reqs[i].status = s; + } + + return s; + } + + assert(target_); + return target_->MultiRead(reqs, num_reqs); +} + +TestWritableFile::TestWritableFile(const std::string& fname, + std::unique_ptr&& f, + FaultInjectionTestEnv* env) + : state_(fname), + target_(std::move(f)), + writable_file_opened_(true), + env_(env) { + assert(target_ != nullptr); + state_.pos_ = 0; +} + +TestWritableFile::~TestWritableFile() { + if (writable_file_opened_) { + Close().PermitUncheckedError(); + } +} + +Status TestWritableFile::Append(const Slice& data) { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + Status s = target_->Append(data); + if (s.ok()) { + state_.pos_ += data.size(); + env_->WritableFileAppended(state_); + } + return s; +} + +Status TestWritableFile::Close() { + writable_file_opened_ = false; + Status s = target_->Close(); + if (s.ok()) { + env_->WritableFileClosed(state_); + } + return s; +} + +Status TestWritableFile::Flush() { + Status s = target_->Flush(); + if (s.ok() && env_->IsFilesystemActive()) { + state_.pos_at_last_flush_ = state_.pos_; + } + return s; +} + +Status TestWritableFile::Sync() { + if (!env_->IsFilesystemActive()) { + return Status::IOError("FaultInjectionTestEnv: not active"); + } + // No need to actual sync. 
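+  // The "sync" is purely logical: recording the current position below is
+  // what later shields this prefix of the file from DropUnsyncedData().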
+ state_.pos_at_last_sync_ = state_.pos_; + env_->WritableFileSynced(state_); + return Status::OK(); +} + +TestRandomRWFile::TestRandomRWFile(const std::string& /*fname*/, + std::unique_ptr&& f, + FaultInjectionTestEnv* env) + : target_(std::move(f)), file_opened_(true), env_(env) { + assert(target_ != nullptr); +} + +TestRandomRWFile::~TestRandomRWFile() { + if (file_opened_) { + Close().PermitUncheckedError(); + } +} + +Status TestRandomRWFile::Write(uint64_t offset, const Slice& data) { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Write(offset, data); +} + +Status TestRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Read(offset, n, result, scratch); +} + +Status TestRandomRWFile::Close() { + file_opened_ = false; + return target_->Close(); +} + +Status TestRandomRWFile::Flush() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Flush(); +} + +Status TestRandomRWFile::Sync() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Sync(); +} + +Status FaultInjectionTestEnv::NewDirectory(const std::string& name, + std::unique_ptr* result) { + std::unique_ptr r; + Status s = target()->NewDirectory(name, &r); + assert(s.ok()); + if (!s.ok()) { + return s; + } + result->reset(new TestDirectory(this, TrimDirname(name), r.release())); + return Status::OK(); +} + +Status FaultInjectionTestEnv::NewWritableFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + // Not allow overwriting files + Status s = target()->FileExists(fname); + if (s.ok()) { + return Status::Corruption("File already exists."); + } else if (!s.IsNotFound()) { + assert(s.IsIOError()); + return s; + } + s = target()->NewWritableFile(fname, result, soptions); + if (s.ok()) { + result->reset(new TestWritableFile(fname, std::move(*result), this)); + // WritableFileWriter* file is opened + // again then it will be truncated - so forget our saved state. + UntrackFile(fname); + MutexLock l(&mutex_); + open_managed_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } + return s; +} + +Status FaultInjectionTestEnv::ReopenWritableFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + + bool exists; + Status s, exists_s = target()->FileExists(fname); + if (exists_s.IsNotFound()) { + exists = false; + } else if (exists_s.ok()) { + exists = true; + } else { + s = exists_s; + exists = false; + } + + if (s.ok()) { + s = target()->ReopenWritableFile(fname, result, soptions); + } + + // Only track files we created. Files created outside of this + // `FaultInjectionTestEnv` are not eligible for tracking/data dropping + // (for example, they may contain data a previous db_stress run expects to + // be recovered). This could be extended to track/drop data appended once + // the file is under `FaultInjectionTestEnv`'s control. + if (s.ok()) { + bool should_track; + { + MutexLock l(&mutex_); + if (db_file_state_.find(fname) != db_file_state_.end()) { + // It was written by this `Env` earlier. + assert(exists); + should_track = true; + } else if (!exists) { + // It was created by this `Env` just now. 
+ should_track = true; + open_managed_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } else { + should_track = false; + } + } + if (should_track) { + result->reset(new TestWritableFile(fname, std::move(*result), this)); + } + } + return s; +} + +Status FaultInjectionTestEnv::NewRandomRWFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status s = target()->NewRandomRWFile(fname, result, soptions); + if (s.ok()) { + result->reset(new TestRandomRWFile(fname, std::move(*result), this)); + // WritableFileWriter* file is opened + // again then it will be truncated - so forget our saved state. + UntrackFile(fname); + MutexLock l(&mutex_); + open_managed_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } + return s; +} + +Status FaultInjectionTestEnv::NewRandomAccessFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + + assert(target()); + const Status s = target()->NewRandomAccessFile(fname, result, soptions); + if (!s.ok()) { + return s; + } + + assert(result); + result->reset(new TestRandomAccessFile(std::move(*result), this)); + + return Status::OK(); +} + +Status FaultInjectionTestEnv::DeleteFile(const std::string& f) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status s = EnvWrapper::DeleteFile(f); + if (s.ok()) { + UntrackFile(f); + } + return s; +} + +Status FaultInjectionTestEnv::RenameFile(const std::string& s, + const std::string& t) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status ret = EnvWrapper::RenameFile(s, t); + + if (ret.ok()) { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + db_file_state_.erase(s); + } + + auto sdn = GetDirAndName(s); + auto tdn = GetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist.insert(tdn.second); + } + } + + return ret; +} + +Status FaultInjectionTestEnv::LinkFile(const std::string& s, + const std::string& t) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status ret = EnvWrapper::LinkFile(s, t); + + if (ret.ok()) { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + } + + auto sdn = GetDirAndName(s); + auto tdn = GetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].find(sdn.second) != + dir_to_new_files_since_last_sync_[sdn.first].end()) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist.insert(tdn.second); + } + } + + return ret; +} + +void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) { + MutexLock l(&mutex_); + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { + db_file_state_[state.filename_] = state; + open_managed_files_.erase(state.filename_); + } +} + +void FaultInjectionTestEnv::WritableFileSynced(const FileState& state) { + MutexLock l(&mutex_); + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { + if 
(db_file_state_.find(state.filename_) == db_file_state_.end()) { + db_file_state_.insert(std::make_pair(state.filename_, state)); + } else { + db_file_state_[state.filename_] = state; + } + } +} + +void FaultInjectionTestEnv::WritableFileAppended(const FileState& state) { + MutexLock l(&mutex_); + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { + if (db_file_state_.find(state.filename_) == db_file_state_.end()) { + db_file_state_.insert(std::make_pair(state.filename_, state)); + } else { + db_file_state_[state.filename_] = state; + } + } +} + +// For every file that is not fully synced, make a call to `func` with +// FileState of the file as the parameter. +Status FaultInjectionTestEnv::DropFileData( + std::function func) { + Status s; + MutexLock l(&mutex_); + for (std::map::const_iterator it = + db_file_state_.begin(); + s.ok() && it != db_file_state_.end(); ++it) { + const FileState& state = it->second; + if (!state.IsFullySynced()) { + s = func(target(), state); + } + } + return s; +} + +Status FaultInjectionTestEnv::DropUnsyncedFileData() { + return DropFileData([&](Env* env, const FileState& state) { + return state.DropUnsyncedData(env); + }); +} + +Status FaultInjectionTestEnv::DropRandomUnsyncedFileData(Random* rnd) { + return DropFileData([&](Env* env, const FileState& state) { + return state.DropRandomUnsyncedData(env, rnd); + }); +} + +Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() { + // Because DeleteFile access this container make a copy to avoid deadlock + std::map> map_copy; + { + MutexLock l(&mutex_); + map_copy.insert(dir_to_new_files_since_last_sync_.begin(), + dir_to_new_files_since_last_sync_.end()); + } + + for (auto& pair : map_copy) { + for (std::string name : pair.second) { + Status s = DeleteFile(pair.first + "/" + name); + if (!s.ok()) { + return s; + } + } + } + return Status::OK(); +} +void FaultInjectionTestEnv::ResetState() { + MutexLock l(&mutex_); + db_file_state_.clear(); + dir_to_new_files_since_last_sync_.clear(); + SetFilesystemActiveNoLock(true); +} + +void FaultInjectionTestEnv::UntrackFile(const std::string& f) { + MutexLock l(&mutex_); + auto dir_and_name = GetDirAndName(f); + dir_to_new_files_since_last_sync_[dir_and_name.first].erase( + dir_and_name.second); + db_file_state_.erase(f); + open_managed_files_.erase(f); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_env.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_env.h new file mode 100644 index 0000000000..549bfe7168 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_env.h @@ -0,0 +1,258 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// This test uses a custom Env to keep track of the state of a filesystem as of +// the last "sync". It then checks for data loss errors by purposely dropping +// file data (or entire files) not protected by a "sync". 
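The Env-level fault injection declared below reduces to a small state machine per file: appends advance `pos_`, `Sync()` latches `pos_at_last_sync_`, and a simulated crash truncates the file back to that latch. A minimal, self-contained sketch of that model follows; `TrackedFile` and its members are illustrative stand-ins, not part of the vendored code:

```cpp
#include <cassert>
#include <string>

// Toy model of the per-file state machine used by the fault-injection Env:
// appends advance the write position, Sync() latches a high-water mark, and
// a simulated crash truncates everything past the latch.
struct TrackedFile {
  std::string data;
  size_t synced = 0;  // analogue of pos_at_last_sync_

  void Append(const std::string& d) { data += d; }
  void Sync() { synced = data.size(); }
  // Analogue of FileState::DropUnsyncedData(): keep only the synced prefix.
  void CrashAndRecover() { data.resize(synced); }
  bool IsFullySynced() const { return data.size() == synced; }
};

int main() {
  TrackedFile f;
  f.Append("durable|");
  f.Sync();
  f.Append("lost on crash");
  assert(!f.IsFullySynced());
  f.CrashAndRecover();
  assert(f.data == "durable|");
  return 0;
}
```

`FileState::IsFullySynced()` in the real header is the same comparison between the write position and the sync latch.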
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+
+#include "file/filename.h"
+#include "rocksdb/env.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Random;
+class TestWritableFile;
+class FaultInjectionTestEnv;
+
+struct FileState {
+  std::string filename_;
+  ssize_t pos_;
+  ssize_t pos_at_last_sync_;
+  ssize_t pos_at_last_flush_;
+
+  explicit FileState(const std::string& filename)
+      : filename_(filename),
+        pos_(-1),
+        pos_at_last_sync_(-1),
+        pos_at_last_flush_(-1) {}
+
+  FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
+
+  bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
+
+  Status DropUnsyncedData(Env* env) const;
+
+  Status DropRandomUnsyncedData(Env* env, Random* rand) const;
+};
+
+class TestRandomAccessFile : public RandomAccessFile {
+ public:
+  TestRandomAccessFile(std::unique_ptr<RandomAccessFile>&& target,
+                       FaultInjectionTestEnv* env);
+
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* scratch) const override;
+
+  Status Prefetch(uint64_t offset, size_t n) override;
+
+  Status MultiRead(ReadRequest* reqs, size_t num_reqs) override;
+
+ private:
+  std::unique_ptr<RandomAccessFile> target_;
+  FaultInjectionTestEnv* env_;
+};
+
+// A wrapper around WritableFile that tracks how much of the file has been
+// written to and sync'ed.
+class TestWritableFile : public WritableFile {
+ public:
+  explicit TestWritableFile(const std::string& fname,
+                            std::unique_ptr<WritableFile>&& f,
+                            FaultInjectionTestEnv* env);
+  virtual ~TestWritableFile();
+  virtual Status Append(const Slice& data) override;
+  virtual Status Append(
+      const Slice& data,
+      const DataVerificationInfo& /*verification_info*/) override {
+    return Append(data);
+  }
+  virtual Status Truncate(uint64_t size) override {
+    return target_->Truncate(size);
+  }
+  virtual Status Close() override;
+  virtual Status Flush() override;
+  virtual Status Sync() override;
+  virtual bool IsSyncThreadSafe() const override { return true; }
+  virtual Status PositionedAppend(const Slice& data, uint64_t offset) override {
+    return target_->PositionedAppend(data, offset);
+  }
+  virtual Status PositionedAppend(
+      const Slice& data, uint64_t offset,
+      const DataVerificationInfo& /*verification_info*/) override {
+    return PositionedAppend(data, offset);
+  }
+  virtual bool use_direct_io() const override {
+    return target_->use_direct_io();
+  }
+
+ private:
+  FileState state_;
+  std::unique_ptr<WritableFile> target_;
+  bool writable_file_opened_;
+  FaultInjectionTestEnv* env_;
+};
+
+// A wrapper around RandomRWFile that lets the fault-injection Env veto
+// reads and writes.
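+// It keeps no buffer of its own: operations pass straight through unless
+// the filesystem has been marked inactive.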
+class TestRandomRWFile : public RandomRWFile { + public: + explicit TestRandomRWFile(const std::string& fname, + std::unique_ptr&& f, + FaultInjectionTestEnv* env); + virtual ~TestRandomRWFile(); + Status Write(uint64_t offset, const Slice& data) override; + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + Status Close() override; + Status Flush() override; + Status Sync() override; + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + bool use_direct_io() const override { return target_->use_direct_io(); }; + + private: + std::unique_ptr target_; + bool file_opened_; + FaultInjectionTestEnv* env_; +}; + +class TestDirectory : public Directory { + public: + explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname, + Directory* dir) + : env_(env), dirname_(dirname), dir_(dir) {} + ~TestDirectory() {} + + virtual Status Fsync() override; + virtual Status Close() override; + + private: + FaultInjectionTestEnv* env_; + std::string dirname_; + std::unique_ptr dir_; +}; + +class FaultInjectionTestEnv : public EnvWrapper { + public: + explicit FaultInjectionTestEnv(Env* base) + : EnvWrapper(base), filesystem_active_(true) {} + virtual ~FaultInjectionTestEnv() { error_.PermitUncheckedError(); } + + static const char* kClassName() { return "FaultInjectionTestEnv"; } + const char* Name() const override { return kClassName(); } + + Status NewDirectory(const std::string& name, + std::unique_ptr* result) override; + + Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override; + + Status ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override; + + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override; + + Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override; + + virtual Status DeleteFile(const std::string& f) override; + + virtual Status RenameFile(const std::string& s, + const std::string& t) override; + + virtual Status LinkFile(const std::string& s, const std::string& t) override; + +// Undef to eliminate clash on Windows +#undef GetFreeSpace + virtual Status GetFreeSpace(const std::string& path, + uint64_t* disk_free) override { + if (!IsFilesystemActive() && + error_.subcode() == IOStatus::SubCode::kNoSpace) { + *disk_free = 0; + return Status::OK(); + } else { + return target()->GetFreeSpace(path, disk_free); + } + } + + void WritableFileClosed(const FileState& state); + + void WritableFileSynced(const FileState& state); + + void WritableFileAppended(const FileState& state); + + // For every file that is not fully synced, make a call to `func` with + // FileState of the file as the parameter. + Status DropFileData(std::function func); + + Status DropUnsyncedFileData(); + + Status DropRandomUnsyncedFileData(Random* rnd); + + Status DeleteFilesCreatedAfterLastDirSync(); + + void ResetState(); + + void UntrackFile(const std::string& f); + + void SyncDir(const std::string& dirname) { + MutexLock l(&mutex_); + dir_to_new_files_since_last_sync_.erase(dirname); + } + + // Setting the filesystem to inactive is the test equivalent to simulating a + // system reset. Setting to inactive will freeze our saved filesystem state so + // that it will stop being recorded. It can then be reset back to the state at + // the time of the reset. 
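+  // In other words, deactivation freezes the bookkeeping and makes every
+  // subsequent call fail with the configured error, so a test can observe
+  // exactly the on-disk state a real crash would have left behind.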
+  bool IsFilesystemActive() {
+    MutexLock l(&mutex_);
+    return filesystem_active_;
+  }
+  void SetFilesystemActiveNoLock(
+      bool active, Status error = Status::Corruption("Not active")) {
+    error.PermitUncheckedError();
+    filesystem_active_ = active;
+    if (!active) {
+      error_ = error;
+    }
+    error.PermitUncheckedError();
+  }
+  void SetFilesystemActive(bool active,
+                           Status error = Status::Corruption("Not active")) {
+    error.PermitUncheckedError();
+    MutexLock l(&mutex_);
+    SetFilesystemActiveNoLock(active, error);
+    error.PermitUncheckedError();
+  }
+  void AssertNoOpenFile() { assert(open_managed_files_.empty()); }
+  Status GetError() { return error_; }
+
+ private:
+  port::Mutex mutex_;
+  std::map<std::string, FileState> db_file_state_;
+  std::set<std::string> open_managed_files_;
+  std::unordered_map<std::string, std::set<std::string>>
+      dir_to_new_files_since_last_sync_;
+  bool filesystem_active_;  // Record flushes, syncs, writes
+  Status error_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_fs.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_fs.cc
new file mode 100644
index 0000000000..5261d79ea1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_fs.cc
@@ -0,0 +1,1071 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom FileSystem to keep track of the state of a file
+// system as of the last "Sync". The data being written is cached in a
+// "buffer"; only when "Sync" is called is the data made persistent. This can
+// simulate the loss of file data (or entire files) not protected by a "Sync".
+// For any of the FileSystem-related operations, a specific error can be
+// returned when the file system is not activated, by specifying the
+// "IOStatus Error".
+
+#include "utilities/fault_injection_fs.h"
+
+#include <algorithm>
+#include <functional>
+#include <utility>
+
+#include "env/composite_env_wrapper.h"
+#include "port/lang.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "util/xxhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kNewFileNoOverwrite = "";
+
+// Assumes a file name, not a directory name like "/foo/bar/"
+std::string TestFSGetDirName(const std::string filename) {
+  size_t found = filename.find_last_of("/\\");
+  if (found == std::string::npos) {
+    return "";
+  } else {
+    return filename.substr(0, found);
+  }
+}
+
+// Trim the trailing "/" at the end of `str`
+std::string TestFSTrimDirname(const std::string& str) {
+  size_t found = str.find_last_not_of("/");
+  if (found == std::string::npos) {
+    return str;
+  }
+  return str.substr(0, found + 1);
+}
+
+// Split a full path into its directory and file-name parts.
+std::pair<std::string, std::string> TestFSGetDirAndName(
+    const std::string& name) {
+  std::string dirname = TestFSGetDirName(name);
+  std::string fname = name.substr(dirname.size() + 1);
+  return std::make_pair(dirname, fname);
+}
+
+// Calculate the checksum of the data with the corresponding checksum type.
+// If the checksum type is not supported, no checksum is returned.
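+// Only kCRC32c and kxxHash are handled below; any other type leaves
+// *checksum empty.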
+void CalculateTypedChecksum(const ChecksumType& checksum_type, const char* data, + size_t size, std::string* checksum) { + if (checksum_type == ChecksumType::kCRC32c) { + uint32_t v_crc32c = crc32c::Extend(0, data, size); + PutFixed32(checksum, v_crc32c); + return; + } else if (checksum_type == ChecksumType::kxxHash) { + uint32_t v = XXH32(data, size, 0); + PutFixed32(checksum, v); + } + return; +} + +IOStatus FSFileState::DropUnsyncedData() { + buffer_.resize(0); + return IOStatus::OK(); +} + +IOStatus FSFileState::DropRandomUnsyncedData(Random* rand) { + int range = static_cast(buffer_.size()); + size_t truncated_size = static_cast(rand->Uniform(range)); + buffer_.resize(truncated_size); + return IOStatus::OK(); +} + +IOStatus TestFSDirectory::Fsync(const IOOptions& options, IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + { + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + fs_->SyncDir(dirname_); + IOStatus s = dir_->Fsync(options, dbg); + { + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + return s; +} + +IOStatus TestFSDirectory::Close(const IOOptions& options, IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + IOStatus s = dir_->Close(options, dbg); + return s; +} + +IOStatus TestFSDirectory::FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_fsync_options) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + { + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + fs_->SyncDir(dirname_); + IOStatus s = dir_->FsyncWithDirOptions(options, dbg, dir_fsync_options); + { + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + return s; +} + +TestFSWritableFile::TestFSWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr&& f, + FaultInjectionTestFS* fs) + : state_(fname), + file_opts_(file_opts), + target_(std::move(f)), + writable_file_opened_(true), + fs_(fs) { + assert(target_ != nullptr); + state_.pos_ = 0; +} + +TestFSWritableFile::~TestFSWritableFile() { + if (writable_file_opened_) { + Close(IOOptions(), nullptr).PermitUncheckedError(); + } +} + +IOStatus TestFSWritableFile::Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + if (target_->use_direct_io()) { + target_->Append(data, options, dbg).PermitUncheckedError(); + } else { + state_.buffer_.append(data.data(), data.size()); + state_.pos_ += data.size(); + fs_->WritableFileAppended(state_); + } + IOStatus io_s = fs_->InjectWriteError(state_.filename_); + return io_s; +} + +// By setting the IngestDataCorruptionBeforeWrite(), the data corruption is +// simulated. 
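+// The overload below re-computes the checksum of `data` with the configured
+// handoff type and compares it against verification_info.checksum; any
+// mismatch surfaces as Corruption before the bytes reach the target file.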
+IOStatus TestFSWritableFile::Append( + const Slice& data, const IOOptions& options, + const DataVerificationInfo& verification_info, IODebugContext* dbg) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + if (fs_->ShouldDataCorruptionBeforeWrite()) { + return IOStatus::Corruption("Data is corrupted!"); + } + + // Calculate the checksum + std::string checksum; + CalculateTypedChecksum(fs_->GetChecksumHandoffFuncType(), data.data(), + data.size(), &checksum); + if (fs_->GetChecksumHandoffFuncType() != ChecksumType::kNoChecksum && + checksum != verification_info.checksum.ToString()) { + std::string msg = "Data is corrupted! Origin data checksum: " + + verification_info.checksum.ToString() + + "current data checksum: " + checksum; + return IOStatus::Corruption(msg); + } + if (target_->use_direct_io()) { + target_->Append(data, options, dbg).PermitUncheckedError(); + } else { + state_.buffer_.append(data.data(), data.size()); + state_.pos_ += data.size(); + fs_->WritableFileAppended(state_); + } + IOStatus io_s = fs_->InjectWriteError(state_.filename_); + return io_s; +} + +IOStatus TestFSWritableFile::PositionedAppend( + const Slice& data, uint64_t offset, const IOOptions& options, + const DataVerificationInfo& verification_info, IODebugContext* dbg) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + if (fs_->ShouldDataCorruptionBeforeWrite()) { + return IOStatus::Corruption("Data is corrupted!"); + } + + // Calculate the checksum + std::string checksum; + CalculateTypedChecksum(fs_->GetChecksumHandoffFuncType(), data.data(), + data.size(), &checksum); + if (fs_->GetChecksumHandoffFuncType() != ChecksumType::kNoChecksum && + checksum != verification_info.checksum.ToString()) { + std::string msg = "Data is corrupted! Origin data checksum: " + + verification_info.checksum.ToString() + + "current data checksum: " + checksum; + return IOStatus::Corruption(msg); + } + target_->PositionedAppend(data, offset, options, dbg); + IOStatus io_s = fs_->InjectWriteError(state_.filename_); + return io_s; +} + +IOStatus TestFSWritableFile::Close(const IOOptions& options, + IODebugContext* dbg) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + { + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + writable_file_opened_ = false; + IOStatus io_s; + if (!target_->use_direct_io()) { + io_s = target_->Append(state_.buffer_, options, dbg); + } + if (io_s.ok()) { + state_.buffer_.resize(0); + // Ignore sync errors + target_->Sync(options, dbg).PermitUncheckedError(); + io_s = target_->Close(options, dbg); + } + if (io_s.ok()) { + fs_->WritableFileClosed(state_); + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + return io_s; +} + +IOStatus TestFSWritableFile::Flush(const IOOptions&, IODebugContext*) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + if (fs_->IsFilesystemActive()) { + state_.pos_at_last_flush_ = state_.pos_; + } + return IOStatus::OK(); +} + +IOStatus TestFSWritableFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + if (target_->use_direct_io()) { + // For Direct IO mode, we don't buffer anything in TestFSWritableFile. 
+ // So just return + return IOStatus::OK(); + } + IOStatus io_s = target_->Append(state_.buffer_, options, dbg); + state_.buffer_.resize(0); + // Ignore sync errors + target_->Sync(options, dbg).PermitUncheckedError(); + state_.pos_at_last_sync_ = state_.pos_; + fs_->WritableFileSynced(state_); + return io_s; +} + +IOStatus TestFSWritableFile::RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& options, + IODebugContext* dbg) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + // Assumes caller passes consecutive byte ranges. + uint64_t sync_limit = offset + nbytes; + uint64_t buf_begin = + state_.pos_at_last_sync_ < 0 ? 0 : state_.pos_at_last_sync_; + + IOStatus io_s; + if (sync_limit < buf_begin) { + return io_s; + } + uint64_t num_to_sync = std::min(static_cast(state_.buffer_.size()), + sync_limit - buf_begin); + Slice buf_to_sync(state_.buffer_.data(), num_to_sync); + io_s = target_->Append(buf_to_sync, options, dbg); + state_.buffer_ = state_.buffer_.substr(num_to_sync); + // Ignore sync errors + target_->RangeSync(offset, nbytes, options, dbg).PermitUncheckedError(); + state_.pos_at_last_sync_ = offset + num_to_sync; + fs_->WritableFileSynced(state_); + return io_s; +} + +TestFSRandomRWFile::TestFSRandomRWFile(const std::string& /*fname*/, + std::unique_ptr&& f, + FaultInjectionTestFS* fs) + : target_(std::move(f)), file_opened_(true), fs_(fs) { + assert(target_ != nullptr); +} + +TestFSRandomRWFile::~TestFSRandomRWFile() { + if (file_opened_) { + Close(IOOptions(), nullptr).PermitUncheckedError(); + } +} + +IOStatus TestFSRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + return target_->Write(offset, data, options, dbg); +} + +IOStatus TestFSRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) const { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + return target_->Read(offset, n, options, result, scratch, dbg); +} + +IOStatus TestFSRandomRWFile::Close(const IOOptions& options, + IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + file_opened_ = false; + return target_->Close(options, dbg); +} + +IOStatus TestFSRandomRWFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + return target_->Flush(options, dbg); +} + +IOStatus TestFSRandomRWFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + return target_->Sync(options, dbg); +} + +TestFSRandomAccessFile::TestFSRandomAccessFile( + const std::string& /*fname*/, std::unique_ptr&& f, + FaultInjectionTestFS* fs) + : target_(std::move(f)), fs_(fs) { + assert(target_ != nullptr); +} + +IOStatus TestFSRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& options, Slice* result, + char* scratch, + IODebugContext* dbg) const { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + IOStatus s = target_->Read(offset, n, options, result, scratch, dbg); + if (s.ok()) { + s = fs_->InjectThreadSpecificReadError( + FaultInjectionTestFS::ErrorOperation::kRead, result, use_direct_io(), + scratch, /*need_count_increase=*/true, /*fault_injected=*/nullptr); + } + if (s.ok() && fs_->ShouldInjectRandomReadError()) { + return IOStatus::IOError("Injected read error"); + } + 
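+  // Note the layering: injected failures are only consulted after the real
+  // read succeeds, so genuine I/O errors from the target file take precedence.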
return s; +} + +IOStatus TestFSRandomAccessFile::ReadAsync( + FSReadRequest& req, const IOOptions& opts, + std::function cb, void* cb_arg, + void** io_handle, IOHandleDeleter* del_fn, IODebugContext* /*dbg*/) { + IOStatus ret; + IOStatus s; + FSReadRequest res; + if (!fs_->IsFilesystemActive()) { + ret = fs_->GetError(); + } else { + ret = fs_->InjectThreadSpecificReadError( + FaultInjectionTestFS::ErrorOperation::kRead, &res.result, + use_direct_io(), req.scratch, /*need_count_increase=*/true, + /*fault_injected=*/nullptr); + } + if (ret.ok()) { + if (fs_->ShouldInjectRandomReadError()) { + ret = IOStatus::IOError("Injected read error"); + } else { + s = target_->ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, nullptr); + } + } + if (!ret.ok()) { + res.status = ret; + cb(res, cb_arg); + } + return s; +} + +IOStatus TestFSRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, + IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + IOStatus s = target_->MultiRead(reqs, num_reqs, options, dbg); + bool injected_error = false; + for (size_t i = 0; i < num_reqs; i++) { + if (!reqs[i].status.ok()) { + // Already seeing an error. + break; + } + bool this_injected_error; + reqs[i].status = fs_->InjectThreadSpecificReadError( + FaultInjectionTestFS::ErrorOperation::kMultiReadSingleReq, + &(reqs[i].result), use_direct_io(), reqs[i].scratch, + /*need_count_increase=*/true, + /*fault_injected=*/&this_injected_error); + injected_error |= this_injected_error; + } + if (s.ok()) { + s = fs_->InjectThreadSpecificReadError( + FaultInjectionTestFS::ErrorOperation::kMultiRead, nullptr, + use_direct_io(), nullptr, /*need_count_increase=*/!injected_error, + /*fault_injected=*/nullptr); + } + if (s.ok() && fs_->ShouldInjectRandomReadError()) { + return IOStatus::IOError("Injected read error"); + } + return s; +} + +size_t TestFSRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + if (fs_->ShouldFailGetUniqueId()) { + return 0; + } else { + return target_->GetUniqueId(id, max_size); + } +} +IOStatus TestFSSequentialFile::Read(size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + IOStatus s = target()->Read(n, options, result, scratch, dbg); + if (s.ok() && fs_->ShouldInjectRandomReadError()) { + return IOStatus::IOError("Injected seq read error"); + } + return s; +} + +IOStatus TestFSSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + IOStatus s = + target()->PositionedRead(offset, n, options, result, scratch, dbg); + if (s.ok() && fs_->ShouldInjectRandomReadError()) { + return IOStatus::IOError("Injected seq positioned read error"); + } + return s; +} + +IOStatus FaultInjectionTestFS::NewDirectory( + const std::string& name, const IOOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + std::unique_ptr r; + IOStatus io_s = target()->NewDirectory(name, options, &r, dbg); + if (!io_s.ok()) { + return io_s; + } + result->reset( + new TestFSDirectory(this, TestFSTrimDirname(name), r.release())); + return IOStatus::OK(); +} + +IOStatus FaultInjectionTestFS::NewWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + if (ShouldUseDiretWritable(fname)) { + return 
target()->NewWritableFile(fname, file_opts, result, dbg); + } + + IOStatus io_s = target()->NewWritableFile(fname, file_opts, result, dbg); + if (io_s.ok()) { + result->reset( + new TestFSWritableFile(fname, file_opts, std::move(*result), this)); + // WritableFileWriter* file is opened + // again then it will be truncated - so forget our saved state. + UntrackFile(fname); + { + MutexLock l(&mutex_); + open_managed_files_.insert(fname); + auto dir_and_name = TestFSGetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + // The new file could overwrite an old one. Here we simplify + // the implementation by assuming no file of this name after + // dropping unsynced files. + list[dir_and_name.second] = kNewFileNoOverwrite; + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + } + return io_s; +} + +IOStatus FaultInjectionTestFS::ReopenWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + if (ShouldUseDiretWritable(fname)) { + return target()->ReopenWritableFile(fname, file_opts, result, dbg); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + bool exists; + IOStatus io_s, + exists_s = target()->FileExists(fname, IOOptions(), nullptr /* dbg */); + if (exists_s.IsNotFound()) { + exists = false; + } else if (exists_s.ok()) { + exists = true; + } else { + io_s = exists_s; + exists = false; + } + + if (io_s.ok()) { + io_s = target()->ReopenWritableFile(fname, file_opts, result, dbg); + } + + // Only track files we created. Files created outside of this + // `FaultInjectionTestFS` are not eligible for tracking/data dropping + // (for example, they may contain data a previous db_stress run expects to + // be recovered). This could be extended to track/drop data appended once + // the file is under `FaultInjectionTestFS`'s control. + if (io_s.ok()) { + bool should_track; + { + MutexLock l(&mutex_); + if (db_file_state_.find(fname) != db_file_state_.end()) { + // It was written by this `FileSystem` earlier. + assert(exists); + should_track = true; + } else if (!exists) { + // It was created by this `FileSystem` just now. + should_track = true; + open_managed_files_.insert(fname); + auto dir_and_name = TestFSGetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list[dir_and_name.second] = kNewFileNoOverwrite; + } else { + should_track = false; + } + } + if (should_track) { + result->reset( + new TestFSWritableFile(fname, file_opts, std::move(*result), this)); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + } + return io_s; +} + +IOStatus FaultInjectionTestFS::NewRandomRWFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + if (ShouldUseDiretWritable(fname)) { + return target()->NewRandomRWFile(fname, file_opts, result, dbg); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + IOStatus io_s = target()->NewRandomRWFile(fname, file_opts, result, dbg); + if (io_s.ok()) { + result->reset(new TestFSRandomRWFile(fname, std::move(*result), this)); + // WritableFileWriter* file is opened + // again then it will be truncated - so forget our saved state. 
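+    // UntrackFile() drops the per-file sync state and the
+    // new-files-since-last-dir-sync entry before the name is re-registered
+    // just below.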
+ UntrackFile(fname); + { + MutexLock l(&mutex_); + open_managed_files_.insert(fname); + auto dir_and_name = TestFSGetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + // It could be overwriting an old file, but we simplify the + // implementation by ignoring it. + list[dir_and_name.second] = kNewFileNoOverwrite; + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + } + return io_s; +} + +IOStatus FaultInjectionTestFS::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + if (ShouldInjectRandomReadError()) { + return IOStatus::IOError("Injected error when open random access file"); + } + IOStatus io_s = InjectThreadSpecificReadError(ErrorOperation::kOpen, nullptr, + false, nullptr, + /*need_count_increase=*/true, + /*fault_injected=*/nullptr); + if (io_s.ok()) { + io_s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); + } + if (io_s.ok()) { + result->reset(new TestFSRandomAccessFile(fname, std::move(*result), this)); + } + return io_s; +} + +IOStatus FaultInjectionTestFS::NewSequentialFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + + if (ShouldInjectRandomReadError()) { + return IOStatus::IOError("Injected read error when creating seq file"); + } + IOStatus io_s = target()->NewSequentialFile(fname, file_opts, result, dbg); + if (io_s.ok()) { + result->reset(new TestFSSequentialFile(std::move(*result), this)); + } + return io_s; +} + +IOStatus FaultInjectionTestFS::DeleteFile(const std::string& f, + const IOOptions& options, + IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + IOStatus io_s = FileSystemWrapper::DeleteFile(f, options, dbg); + if (io_s.ok()) { + UntrackFile(f); + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + } + return io_s; +} + +IOStatus FaultInjectionTestFS::RenameFile(const std::string& s, + const std::string& t, + const IOOptions& options, + IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + // We preserve contents of overwritten files up to a size threshold. + // We could keep previous file in another name, but we need to worry about + // garbage collect the those files. We do it if it is needed later. + // We ignore I/O errors here for simplicity. 
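+  // Only overwritten files smaller than 1 KiB have their contents preserved
+  // (see the size check below); larger targets are treated as brand new.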
+ std::string previous_contents = kNewFileNoOverwrite; + if (target()->FileExists(t, IOOptions(), nullptr).ok()) { + uint64_t file_size; + if (target()->GetFileSize(t, IOOptions(), &file_size, nullptr).ok() && + file_size < 1024) { + ReadFileToString(target(), t, &previous_contents).PermitUncheckedError(); + } + } + IOStatus io_s = FileSystemWrapper::RenameFile(s, t, options, dbg); + + if (io_s.ok()) { + { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + db_file_state_.erase(s); + } + + auto sdn = TestFSGetDirAndName(s); + auto tdn = TestFSGetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist[tdn.second] = previous_contents; + } + } + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + return io_s; +} + +IOStatus FaultInjectionTestFS::LinkFile(const std::string& s, + const std::string& t, + const IOOptions& options, + IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + // Using the value in `dir_to_new_files_since_last_sync_` for the source file + // may be a more reasonable choice. + std::string previous_contents = kNewFileNoOverwrite; + + IOStatus io_s = FileSystemWrapper::LinkFile(s, t, options, dbg); + + if (io_s.ok()) { + { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + } + + auto sdn = TestFSGetDirAndName(s); + auto tdn = TestFSGetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].find(sdn.second) != + dir_to_new_files_since_last_sync_[sdn.first].end()) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist[tdn.second] = previous_contents; + } + } + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + return io_s; +} + +IOStatus FaultInjectionTestFS::Poll(std::vector& io_handles, + size_t min_completions) { + return target()->Poll(io_handles, min_completions); +} + +IOStatus FaultInjectionTestFS::AbortIO(std::vector& io_handles) { + return target()->AbortIO(io_handles); +} + +void FaultInjectionTestFS::WritableFileClosed(const FSFileState& state) { + MutexLock l(&mutex_); + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { + db_file_state_[state.filename_] = state; + open_managed_files_.erase(state.filename_); + } +} + +void FaultInjectionTestFS::WritableFileSynced(const FSFileState& state) { + MutexLock l(&mutex_); + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { + if (db_file_state_.find(state.filename_) == db_file_state_.end()) { + db_file_state_.insert(std::make_pair(state.filename_, state)); + } else { + db_file_state_[state.filename_] = state; + } + } +} + +void FaultInjectionTestFS::WritableFileAppended(const FSFileState& state) { + MutexLock l(&mutex_); + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { + if (db_file_state_.find(state.filename_) == db_file_state_.end()) { + db_file_state_.insert(std::make_pair(state.filename_, state)); + } else { + db_file_state_[state.filename_] = state; + } + } +} + +IOStatus FaultInjectionTestFS::DropUnsyncedFileData() { + IOStatus io_s; + MutexLock l(&mutex_); + for 
(std::map<std::string, FSFileState>::iterator it = db_file_state_.begin();
+       io_s.ok() && it != db_file_state_.end(); ++it) {
+    FSFileState& fs_state = it->second;
+    if (!fs_state.IsFullySynced()) {
+      io_s = fs_state.DropUnsyncedData();
+    }
+  }
+  return io_s;
+}
+
+IOStatus FaultInjectionTestFS::DropRandomUnsyncedFileData(Random* rnd) {
+  IOStatus io_s;
+  MutexLock l(&mutex_);
+  for (std::map<std::string, FSFileState>::iterator it =
+           db_file_state_.begin();
+       io_s.ok() && it != db_file_state_.end(); ++it) {
+    FSFileState& fs_state = it->second;
+    if (!fs_state.IsFullySynced()) {
+      io_s = fs_state.DropRandomUnsyncedData(rnd);
+    }
+  }
+  return io_s;
+}
+
+IOStatus FaultInjectionTestFS::DeleteFilesCreatedAfterLastDirSync(
+    const IOOptions& options, IODebugContext* dbg) {
+  // Because DeleteFile accesses this container, make a copy to avoid deadlock.
+  std::map<std::string, std::map<std::string, std::string>> map_copy;
+  {
+    MutexLock l(&mutex_);
+    map_copy.insert(dir_to_new_files_since_last_sync_.begin(),
+                    dir_to_new_files_since_last_sync_.end());
+  }
+
+  for (auto& pair : map_copy) {
+    for (auto& file_pair : pair.second) {
+      if (file_pair.second == kNewFileNoOverwrite) {
+        IOStatus io_s =
+            DeleteFile(pair.first + "/" + file_pair.first, options, dbg);
+        if (!io_s.ok()) {
+          return io_s;
+        }
+      } else {
+        IOStatus io_s =
+            WriteStringToFile(target(), file_pair.second,
+                              pair.first + "/" + file_pair.first, true);
+        if (!io_s.ok()) {
+          return io_s;
+        }
+      }
+    }
+  }
+  return IOStatus::OK();
+}
+
+void FaultInjectionTestFS::ResetState() {
+  MutexLock l(&mutex_);
+  db_file_state_.clear();
+  dir_to_new_files_since_last_sync_.clear();
+  SetFilesystemActiveNoLock(true);
+}
+
+void FaultInjectionTestFS::UntrackFile(const std::string& f) {
+  MutexLock l(&mutex_);
+  auto dir_and_name = TestFSGetDirAndName(f);
+  dir_to_new_files_since_last_sync_[dir_and_name.first].erase(
+      dir_and_name.second);
+  db_file_state_.erase(f);
+  open_managed_files_.erase(f);
+}
+
+IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError(
+    ErrorOperation op, Slice* result, bool direct_io, char* scratch,
+    bool need_count_increase, bool* fault_injected) {
+  bool dummy_bool;
+  bool& ret_fault_injected = fault_injected ? *fault_injected : dummy_bool;
+  ret_fault_injected = false;
+  ErrorContext* ctx =
+      static_cast<ErrorContext*>(thread_local_error_->Get());
+  if (ctx == nullptr || !ctx->enable_error_injection || !ctx->one_in) {
+    return IOStatus::OK();
+  }
+
+  if (ctx->rand.OneIn(ctx->one_in)) {
+    if (ctx->count == 0) {
+      ctx->message = "";
+    }
+    if (need_count_increase) {
+      ctx->count++;
+    }
+    if (ctx->callstack) {
+      free(ctx->callstack);
+    }
+    ctx->callstack = port::SaveStack(&ctx->frames);
+
+    if (op != ErrorOperation::kMultiReadSingleReq) {
+      // Likely a non-per-read status code for MultiRead.
+      ctx->message += "error; ";
+      ret_fault_injected = true;
+      return IOStatus::IOError();
+    } else if (Random::GetTLSInstance()->OneIn(8)) {
+      assert(result);
+      // For a small chance, keep the status OK but turn the result
+      // empty, which is supposed to be caught by a check.
+      *result = Slice();
+      ctx->message += "inject empty result; ";
+      ret_fault_injected = true;
+    } else if (!direct_io && Random::GetTLSInstance()->OneIn(7) &&
+               scratch != nullptr && result->data() == scratch) {
+      assert(result);
+      // With direct I/O, many extra bytes might be read, so corrupting
+      // one byte might not cause a checksum mismatch. Skip checksum
+      // corruption injection.
+      // We only corrupt data if the result is filled into `scratch`. In other
+      // cases, the data might not be modifiable (e.g., mmapped files) or
+      // modifying it might have unintended side effects.
+      // For a small chance, keep the status OK but corrupt the result
+      // in a way that checksum checking is supposed to fail.
+      // Corrupt the last byte, which is supposed to be a checksum byte.
+      // This works for CRC; not 100% sure for xxhash, and we will adjust
+      // if that is not the case.
+      const_cast<char*>(result->data())[result->size() - 1]++;
+      ctx->message += "corrupt last byte; ";
+      ret_fault_injected = true;
+    } else {
+      ctx->message += "error result multiget single; ";
+      ret_fault_injected = true;
+      return IOStatus::IOError();
+    }
+  }
+  return IOStatus::OK();
+}
+
+bool FaultInjectionTestFS::TryParseFileName(const std::string& file_name,
+                                            uint64_t* number, FileType* type) {
+  std::size_t found = file_name.find_last_of("/");
+  std::string file = file_name.substr(found);
+  return ParseFileName(file, number, type);
+}
+
+IOStatus FaultInjectionTestFS::InjectWriteError(const std::string& file_name) {
+  MutexLock l(&mutex_);
+  if (!enable_write_error_injection_ || !write_error_one_in_) {
+    return IOStatus::OK();
+  }
+  bool allowed_type = false;
+
+  if (inject_for_all_file_types_) {
+    allowed_type = true;
+  } else {
+    uint64_t number;
+    FileType cur_type = kTempFile;
+    if (TryParseFileName(file_name, &number, &cur_type)) {
+      for (const auto& type : write_error_allowed_types_) {
+        if (cur_type == type) {
+          allowed_type = true;
+        }
+      }
+    }
+  }
+
+  if (allowed_type) {
+    if (write_error_rand_.OneIn(write_error_one_in_)) {
+      return GetError();
+    }
+  }
+  return IOStatus::OK();
+}
+
+IOStatus FaultInjectionTestFS::InjectMetadataWriteError() {
+  {
+    MutexLock l(&mutex_);
+    if (!enable_metadata_write_error_injection_ ||
+        !metadata_write_error_one_in_ ||
+        !write_error_rand_.OneIn(metadata_write_error_one_in_)) {
+      return IOStatus::OK();
+    }
+  }
+  TEST_SYNC_POINT("FaultInjectionTestFS::InjectMetadataWriteError:Injected");
+  return IOStatus::IOError();
+}
+
+void FaultInjectionTestFS::PrintFaultBacktrace() {
+#if defined(OS_LINUX)
+  ErrorContext* ctx =
+      static_cast<ErrorContext*>(thread_local_error_->Get());
+  if (ctx == nullptr) {
+    return;
+  }
+  fprintf(stderr, "Injected error type = %d\n", ctx->type);
+  fprintf(stderr, "Message: %s\n", ctx->message.c_str());
+  port::PrintAndFreeStack(ctx->callstack, ctx->frames);
+  ctx->callstack = nullptr;
+#endif
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_fs.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_fs.h
new file mode 100644
index 0000000000..cab0051bd1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_fs.h
@@ -0,0 +1,593 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom FileSystem to keep track of the state of a file
+// system since the last "Sync". The data being written is cached in a
+// "buffer". Only when "Sync" is called does the data become persistent. It
+// can simulate the loss of file data (or of entire files) not protected by
+// a "Sync".
+// For any of the FileSystem-related operations, a specific error can be
+// returned when the file system is not activated, by specifying the
+// "IOStatus Error".
+
+#pragma once
+
+#include <algorithm>
+#include <map>
+#include <set>
+#include <string>
+
+#include "file/filename.h"
+#include "rocksdb/file_system.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TestFSWritableFile;
+class FaultInjectionTestFS;
+
+struct FSFileState {
+  std::string filename_;
+  ssize_t pos_;
+  ssize_t pos_at_last_sync_;
+  ssize_t pos_at_last_flush_;
+  std::string buffer_;
+
+  explicit FSFileState(const std::string& filename)
+      : filename_(filename),
+        pos_(-1),
+        pos_at_last_sync_(-1),
+        pos_at_last_flush_(-1) {}
+
+  FSFileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
+
+  bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
+
+  IOStatus DropUnsyncedData();
+
+  IOStatus DropRandomUnsyncedData(Random* rand);
+};
+
+// A wrapper around an FSWritableFile that records the file state as it is
+// written to or sync'ed.
+class TestFSWritableFile : public FSWritableFile {
+ public:
+  explicit TestFSWritableFile(const std::string& fname,
+                              const FileOptions& file_opts,
+                              std::unique_ptr<FSWritableFile>&& f,
+                              FaultInjectionTestFS* fs);
+  virtual ~TestFSWritableFile();
+  virtual IOStatus Append(const Slice& data, const IOOptions&,
+                          IODebugContext*) override;
+  virtual IOStatus Append(const Slice& data, const IOOptions& options,
+                          const DataVerificationInfo& verification_info,
+                          IODebugContext* dbg) override;
+  virtual IOStatus Truncate(uint64_t size, const IOOptions& options,
+                            IODebugContext* dbg) override {
+    return target_->Truncate(size, options, dbg);
+  }
+  virtual IOStatus Close(const IOOptions& options,
+                         IODebugContext* dbg) override;
+  virtual IOStatus Flush(const IOOptions&, IODebugContext*) override;
+  virtual IOStatus Sync(const IOOptions& options,
+                        IODebugContext* dbg) override;
+  virtual IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/,
+                             const IOOptions& options,
+                             IODebugContext* dbg) override;
+  virtual bool IsSyncThreadSafe() const override { return true; }
+  virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                                    const IOOptions& options,
+                                    IODebugContext* dbg) override {
+    return target_->PositionedAppend(data, offset, options, dbg);
+  }
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            const DataVerificationInfo& verification_info,
+                            IODebugContext* dbg) override;
+  virtual size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  virtual bool use_direct_io() const override {
+    return target_->use_direct_io();
+  }
+
+ private:
+  FSFileState state_;  // Needs protection by mutex_
+  FileOptions file_opts_;
+  std::unique_ptr<FSWritableFile> target_;
+  bool writable_file_opened_;
+  FaultInjectionTestFS* fs_;
+  port::Mutex mutex_;
+};
+
+// A wrapper around an FSRandomRWFile that records the file state as it is
+// written to or sync'ed.
+class TestFSRandomRWFile : public FSRandomRWFile { + public: + explicit TestFSRandomRWFile(const std::string& fname, + std::unique_ptr&& f, + FaultInjectionTestFS* fs); + virtual ~TestFSRandomRWFile(); + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + bool use_direct_io() const override { return target_->use_direct_io(); }; + + private: + std::unique_ptr target_; + bool file_opened_; + FaultInjectionTestFS* fs_; +}; + +class TestFSRandomAccessFile : public FSRandomAccessFile { + public: + explicit TestFSRandomAccessFile(const std::string& fname, + std::unique_ptr&& f, + FaultInjectionTestFS* fs); + ~TestFSRandomAccessFile() override {} + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, + std::function cb, + void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, + IODebugContext* dbg) override; + IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) override; + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetUniqueId(char* id, size_t max_size) const override; + + private: + std::unique_ptr target_; + FaultInjectionTestFS* fs_; +}; + +class TestFSSequentialFile : public FSSequentialFileOwnerWrapper { + public: + explicit TestFSSequentialFile(std::unique_ptr&& f, + FaultInjectionTestFS* fs) + : FSSequentialFileOwnerWrapper(std::move(f)), fs_(fs) {} + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; + + private: + FaultInjectionTestFS* fs_; +}; + +class TestFSDirectory : public FSDirectory { + public: + explicit TestFSDirectory(FaultInjectionTestFS* fs, std::string dirname, + FSDirectory* dir) + : fs_(fs), dirname_(dirname), dir_(dir) {} + ~TestFSDirectory() {} + + virtual IOStatus Fsync(const IOOptions& options, + IODebugContext* dbg) override; + + virtual IOStatus Close(const IOOptions& options, + IODebugContext* dbg) override; + + virtual IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_fsync_options) override; + + private: + FaultInjectionTestFS* fs_; + std::string dirname_; + std::unique_ptr dir_; +}; + +class FaultInjectionTestFS : public FileSystemWrapper { + public: + explicit FaultInjectionTestFS(const std::shared_ptr& base) + : FileSystemWrapper(base), + filesystem_active_(true), + filesystem_writable_(false), + thread_local_error_(new ThreadLocalPtr(DeleteThreadLocalErrorContext)), + enable_write_error_injection_(false), + enable_metadata_write_error_injection_(false), + write_error_rand_(0), + write_error_one_in_(0), + metadata_write_error_one_in_(0), 
+ read_error_one_in_(0), + ingest_data_corruption_before_write_(false), + fail_get_file_unique_id_(false) {} + virtual ~FaultInjectionTestFS() { error_.PermitUncheckedError(); } + + static const char* kClassName() { return "FaultInjectionTestFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewDirectory(const std::string& name, const IOOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + + virtual IOStatus DeleteFile(const std::string& f, const IOOptions& options, + IODebugContext* dbg) override; + + virtual IOStatus RenameFile(const std::string& s, const std::string& t, + const IOOptions& options, + IODebugContext* dbg) override; + + virtual IOStatus LinkFile(const std::string& src, const std::string& target, + const IOOptions& options, + IODebugContext* dbg) override; + +// Undef to eliminate clash on Windows +#undef GetFreeSpace + virtual IOStatus GetFreeSpace(const std::string& path, + const IOOptions& options, uint64_t* disk_free, + IODebugContext* dbg) override { + IOStatus io_s; + if (!IsFilesystemActive() && + error_.subcode() == IOStatus::SubCode::kNoSpace) { + *disk_free = 0; + } else { + io_s = target()->GetFreeSpace(path, options, disk_free, dbg); + } + return io_s; + } + + virtual IOStatus Poll(std::vector& io_handles, + size_t min_completions) override; + + virtual IOStatus AbortIO(std::vector& io_handles) override; + + void WritableFileClosed(const FSFileState& state); + + void WritableFileSynced(const FSFileState& state); + + void WritableFileAppended(const FSFileState& state); + + IOStatus DropUnsyncedFileData(); + + IOStatus DropRandomUnsyncedFileData(Random* rnd); + + IOStatus DeleteFilesCreatedAfterLastDirSync(const IOOptions& options, + IODebugContext* dbg); + + void ResetState(); + + void UntrackFile(const std::string& f); + + void SyncDir(const std::string& dirname) { + MutexLock l(&mutex_); + dir_to_new_files_since_last_sync_.erase(dirname); + } + + // Setting the filesystem to inactive is the test equivalent to simulating a + // system reset. Setting to inactive will freeze our saved filesystem state so + // that it will stop being recorded. It can then be reset back to the state at + // the time of the reset. + bool IsFilesystemActive() { + MutexLock l(&mutex_); + return filesystem_active_; + } + + // Setting filesystem_writable_ makes NewWritableFile. 
ReopenWritableFile, + // and NewRandomRWFile bypass FaultInjectionTestFS and go directly to the + // target FS + bool IsFilesystemDirectWritable() { + MutexLock l(&mutex_); + return filesystem_writable_; + } + bool ShouldUseDiretWritable(const std::string& file_name) { + MutexLock l(&mutex_); + if (filesystem_writable_) { + return true; + } + FileType file_type = kTempFile; + uint64_t file_number = 0; + if (!TryParseFileName(file_name, &file_number, &file_type)) { + return false; + } + return skip_direct_writable_types_.find(file_type) != + skip_direct_writable_types_.end(); + } + void SetFilesystemActiveNoLock( + bool active, IOStatus error = IOStatus::Corruption("Not active")) { + error.PermitUncheckedError(); + filesystem_active_ = active; + if (!active) { + error_ = error; + } + } + void SetFilesystemActive( + bool active, IOStatus error = IOStatus::Corruption("Not active")) { + MutexLock l(&mutex_); + error.PermitUncheckedError(); + SetFilesystemActiveNoLock(active, error); + } + void SetFilesystemDirectWritable(bool writable) { + MutexLock l(&mutex_); + filesystem_writable_ = writable; + } + void AssertNoOpenFile() { assert(open_managed_files_.empty()); } + + IOStatus GetError() { return error_; } + + void SetFileSystemIOError(IOStatus io_error) { + MutexLock l(&mutex_); + io_error.PermitUncheckedError(); + error_ = io_error; + } + + // To simulate the data corruption before data is written in FS + void IngestDataCorruptionBeforeWrite() { + MutexLock l(&mutex_); + ingest_data_corruption_before_write_ = true; + } + + void NoDataCorruptionBeforeWrite() { + MutexLock l(&mutex_); + ingest_data_corruption_before_write_ = false; + } + + bool ShouldDataCorruptionBeforeWrite() { + MutexLock l(&mutex_); + return ingest_data_corruption_before_write_; + } + + void SetChecksumHandoffFuncType(const ChecksumType& func_type) { + MutexLock l(&mutex_); + checksum_handoff_func_tpye_ = func_type; + } + + const ChecksumType& GetChecksumHandoffFuncType() { + MutexLock l(&mutex_); + return checksum_handoff_func_tpye_; + } + + void SetFailGetUniqueId(bool flag) { + MutexLock l(&mutex_); + fail_get_file_unique_id_ = flag; + } + + bool ShouldFailGetUniqueId() { + MutexLock l(&mutex_); + return fail_get_file_unique_id_; + } + + // Specify what the operation, so we can inject the right type of error + enum ErrorOperation : char { + kRead = 0, + kMultiReadSingleReq = 1, + kMultiRead = 2, + kOpen, + }; + + // Set thread-local parameters for error injection. The first argument, + // seed is the seed for the random number generator, and one_in determines + // the probability of injecting error (i.e an error is injected with + // 1/one_in probability) + void SetThreadLocalReadErrorContext(uint32_t seed, int one_in) { + struct ErrorContext* ctx = + static_cast(thread_local_error_->Get()); + if (ctx == nullptr) { + ctx = new ErrorContext(seed); + thread_local_error_->Reset(ctx); + } + ctx->one_in = one_in; + ctx->count = 0; + } + + static void DeleteThreadLocalErrorContext(void* p) { + ErrorContext* ctx = static_cast(p); + delete ctx; + } + + // This is to set the parameters for the write error injection. + // seed is the seed for the random number generator, and one_in determines + // the probability of injecting error (i.e an error is injected with + // 1/one_in probability). For write error, we can specify the error we + // want to inject. Types decides the file types we want to inject the + // error (e.g., Wal files, SST files), which is empty by default. 
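A rough usage sketch for the setter declared just below; the seed, probability, error text, and file types here are illustrative assumptions (FileType values come from file/filename.h), not part of this patch:

    fs->SetRandomWriteError(/*seed=*/301, /*one_in=*/1000,
                            IOStatus::IOError("injected write error"),
                            /*inject_for_all_file_types=*/false,
                            {FileType::kTableFile, FileType::kWalFile});
    fs->EnableWriteErrorInjection();
    // From here on, roughly 1 in 1000 writes to SST/WAL files returns the
    // injected error.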
+  void SetRandomWriteError(uint32_t seed, int one_in, IOStatus error,
+                           bool inject_for_all_file_types,
+                           const std::vector<FileType>& types) {
+    MutexLock l(&mutex_);
+    Random tmp_rand(seed);
+    error.PermitUncheckedError();
+    error_ = error;
+    write_error_rand_ = tmp_rand;
+    write_error_one_in_ = one_in;
+    inject_for_all_file_types_ = inject_for_all_file_types;
+    write_error_allowed_types_ = types;
+  }
+
+  void SetSkipDirectWritableTypes(const std::set<FileType>& types) {
+    MutexLock l(&mutex_);
+    skip_direct_writable_types_ = types;
+  }
+
+  void SetRandomMetadataWriteError(int one_in) {
+    MutexLock l(&mutex_);
+    metadata_write_error_one_in_ = one_in;
+  }
+  // If the value is not 0, it is enabled. Otherwise, it is disabled.
+  void SetRandomReadError(int one_in) { read_error_one_in_ = one_in; }
+
+  bool ShouldInjectRandomReadError() {
+    return read_error_one_in() &&
+           Random::GetTLSInstance()->OneIn(read_error_one_in());
+  }
+
+  // Inject a write error with randomized parameters and the predefined
+  // error type. Only the allowed file types will have the write error
+  // injected.
+  IOStatus InjectWriteError(const std::string& file_name);
+
+  // Inject errors into metadata operations.
+  IOStatus InjectMetadataWriteError();
+
+  // Inject an error. For a READ operation, a status of IOError(), a
+  // corruption of the contents of scratch, or a truncation of the slice
+  // are the possible error types, with equal probability. For OPEN,
+  // it is always an IOError.
+  // fault_injected returns whether a fault was injected. It is needed
+  // because some faults are injected even though the returned IOStatus
+  // is OK.
+  IOStatus InjectThreadSpecificReadError(ErrorOperation op, Slice* slice,
+                                         bool direct_io, char* scratch,
+                                         bool need_count_increase,
+                                         bool* fault_injected);
+
+  // Get the count of how many times an error was injected since the
+  // previous call.
+  int GetAndResetErrorCount() {
+    ErrorContext* ctx =
+        static_cast<ErrorContext*>(thread_local_error_->Get());
+    int count = 0;
+    if (ctx != nullptr) {
+      count = ctx->count;
+      ctx->count = 0;
+    }
+    return count;
+  }
+
+  void EnableErrorInjection() {
+    ErrorContext* ctx =
+        static_cast<ErrorContext*>(thread_local_error_->Get());
+    if (ctx) {
+      ctx->enable_error_injection = true;
+    }
+  }
+
+  void EnableWriteErrorInjection() {
+    MutexLock l(&mutex_);
+    enable_write_error_injection_ = true;
+  }
+  void EnableMetadataWriteErrorInjection() {
+    MutexLock l(&mutex_);
+    enable_metadata_write_error_injection_ = true;
+  }
+
+  void DisableWriteErrorInjection() {
+    MutexLock l(&mutex_);
+    enable_write_error_injection_ = false;
+  }
+
+  void DisableErrorInjection() {
+    ErrorContext* ctx =
+        static_cast<ErrorContext*>(thread_local_error_->Get());
+    if (ctx) {
+      ctx->enable_error_injection = false;
+    }
+  }
+
+  void DisableMetadataWriteErrorInjection() {
+    MutexLock l(&mutex_);
+    enable_metadata_write_error_injection_ = false;
+  }
+
+  int read_error_one_in() const { return read_error_one_in_.load(); }
+
+  int write_error_one_in() const { return write_error_one_in_; }
+
+  // We capture a backtrace every time a fault is injected, for debugging
+  // purposes. This call prints the backtrace to stderr and frees the
+  // saved callstack.
+  void PrintFaultBacktrace();
+
+ private:
+  port::Mutex mutex_;
+  std::map<std::string, FSFileState> db_file_state_;
+  std::set<std::string> open_managed_files_;
+  // directory -> (file name -> file contents to recover)
+  // When data is recovered from an unsynced parent directory, files whose
+  // recorded contents are empty are deleted; those with non-empty recorded
+  // contents are restored accordingly.
+  std::unordered_map<std::string, std::map<std::string, std::string>>
+      dir_to_new_files_since_last_sync_;
+  bool filesystem_active_;    // Record flushes, syncs, writes
+  bool filesystem_writable_;  // Bypass FaultInjectionTestFS and go directly
+                              // to underlying FS for writable files
+  IOStatus error_;
+
+  enum ErrorType : int {
+    kErrorTypeStatus = 0,
+    kErrorTypeCorruption,
+    kErrorTypeTruncated,
+    kErrorTypeMax
+  };
+
+  struct ErrorContext {
+    Random rand;
+    int one_in;
+    int count;
+    bool enable_error_injection;
+    void* callstack;
+    std::string message;
+    int frames;
+    ErrorType type;
+
+    explicit ErrorContext(uint32_t seed)
+        : rand(seed),
+          enable_error_injection(false),
+          callstack(nullptr),
+          frames(0) {}
+    ~ErrorContext() {
+      if (callstack) {
+        free(callstack);
+      }
+    }
+  };
+
+  std::unique_ptr<ThreadLocalPtr> thread_local_error_;
+  bool enable_write_error_injection_;
+  bool enable_metadata_write_error_injection_;
+  Random write_error_rand_;
+  int write_error_one_in_;
+  int metadata_write_error_one_in_;
+  std::atomic<int> read_error_one_in_;
+  bool inject_for_all_file_types_;
+  std::vector<FileType> write_error_allowed_types_;
+  // File types for which direct writable mode is skipped.
+  std::set<FileType> skip_direct_writable_types_;
+  bool ingest_data_corruption_before_write_;
+  ChecksumType checksum_handoff_func_tpye_;
+  bool fail_get_file_unique_id_;
+
+  // Extract the file number and type from a file name. Returns false if it
+  // fails to find them.
+  bool TryParseFileName(const std::string& file_name, uint64_t* number,
+                        FileType* type);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_secondary_cache.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_secondary_cache.cc
new file mode 100644
index 0000000000..d7a2a1bd75
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_secondary_cache.cc
@@ -0,0 +1,136 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// This class implements a custom SecondaryCache that randomly injects an
+// error status into Inserts/Lookups based on a specified probability.
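Before the implementation, a minimal construction sketch; the base cache handle, seed, and 1-in-10 probability are illustrative assumptions, not part of this patch:

    // Wrap a real secondary cache so that roughly 1 in 10 Inserts/Lookups
    // fails with an injected error status.
    std::shared_ptr<SecondaryCache> faulty =
        std::make_shared<FaultInjectionSecondaryCache>(
            real_secondary_cache, /*seed=*/42, /*prob=*/10);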
+ +#include "utilities/fault_injection_secondary_cache.h" + +namespace ROCKSDB_NAMESPACE { + +void FaultInjectionSecondaryCache::ResultHandle::UpdateHandleValue( + FaultInjectionSecondaryCache::ResultHandle* handle) { + ErrorContext* ctx = handle->cache_->GetErrorContext(); + if (!ctx->rand.OneIn(handle->cache_->prob_)) { + handle->value_ = handle->base_->Value(); + handle->size_ = handle->base_->Size(); + } + handle->base_.reset(); +} + +bool FaultInjectionSecondaryCache::ResultHandle::IsReady() { + bool ready = true; + if (base_) { + ready = base_->IsReady(); + if (ready) { + UpdateHandleValue(this); + } + } + return ready; +} + +void FaultInjectionSecondaryCache::ResultHandle::Wait() { + base_->Wait(); + UpdateHandleValue(this); +} + +Cache::ObjectPtr FaultInjectionSecondaryCache::ResultHandle::Value() { + return value_; +} + +size_t FaultInjectionSecondaryCache::ResultHandle::Size() { return size_; } + +void FaultInjectionSecondaryCache::ResultHandle::WaitAll( + FaultInjectionSecondaryCache* cache, + std::vector handles) { + std::vector base_handles; + for (SecondaryCacheResultHandle* hdl : handles) { + FaultInjectionSecondaryCache::ResultHandle* handle = + static_cast(hdl); + if (!handle->base_) { + continue; + } + base_handles.emplace_back(handle->base_.get()); + } + + cache->base_->WaitAll(base_handles); + for (SecondaryCacheResultHandle* hdl : handles) { + FaultInjectionSecondaryCache::ResultHandle* handle = + static_cast(hdl); + if (handle->base_) { + UpdateHandleValue(handle); + } + } +} + +FaultInjectionSecondaryCache::ErrorContext* +FaultInjectionSecondaryCache::GetErrorContext() { + ErrorContext* ctx = static_cast(thread_local_error_->Get()); + if (!ctx) { + ctx = new ErrorContext(seed_); + thread_local_error_->Reset(ctx); + } + + return ctx; +} + +Status FaultInjectionSecondaryCache::Insert( + const Slice& key, Cache::ObjectPtr value, + const Cache::CacheItemHelper* helper) { + ErrorContext* ctx = GetErrorContext(); + if (ctx->rand.OneIn(prob_)) { + return Status::IOError(); + } + + return base_->Insert(key, value, helper); +} + +std::unique_ptr +FaultInjectionSecondaryCache::Lookup(const Slice& key, + const Cache::CacheItemHelper* helper, + Cache::CreateContext* create_context, + bool wait, bool advise_erase, + bool& kept_in_sec_cache) { + ErrorContext* ctx = GetErrorContext(); + if (base_is_compressed_sec_cache_) { + if (ctx->rand.OneIn(prob_)) { + return nullptr; + } else { + return base_->Lookup(key, helper, create_context, wait, advise_erase, + kept_in_sec_cache); + } + } else { + std::unique_ptr hdl = base_->Lookup( + key, helper, create_context, wait, advise_erase, kept_in_sec_cache); + if (wait && ctx->rand.OneIn(prob_)) { + hdl.reset(); + } + return std::unique_ptr( + new FaultInjectionSecondaryCache::ResultHandle(this, std::move(hdl))); + } +} + +void FaultInjectionSecondaryCache::Erase(const Slice& key) { + base_->Erase(key); +} + +void FaultInjectionSecondaryCache::WaitAll( + std::vector handles) { + if (base_is_compressed_sec_cache_) { + ErrorContext* ctx = GetErrorContext(); + std::vector base_handles; + for (SecondaryCacheResultHandle* hdl : handles) { + if (ctx->rand.OneIn(prob_)) { + continue; + } + base_handles.push_back(hdl); + } + base_->WaitAll(base_handles); + } else { + FaultInjectionSecondaryCache::ResultHandle::WaitAll(this, handles); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_secondary_cache.h 
b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_secondary_cache.h new file mode 100644 index 0000000000..ed89f655aa --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/fault_injection_secondary_cache.h @@ -0,0 +1,109 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/secondary_cache.h" +#include "util/random.h" +#include "util/thread_local.h" + +namespace ROCKSDB_NAMESPACE { + +// This class implements a custom SecondaryCache that randomly injects an +// error status into Inserts/Lookups based on a specified probability. +// Its used by db_stress to verify correctness in the presence of +// secondary cache errors. +// +class FaultInjectionSecondaryCache : public SecondaryCache { + public: + explicit FaultInjectionSecondaryCache( + const std::shared_ptr& base, uint32_t seed, int prob) + : base_(base), + seed_(seed), + prob_(prob), + thread_local_error_(new ThreadLocalPtr(DeleteThreadLocalErrorContext)) { + if (std::strcmp(base_->Name(), "CompressedSecondaryCache") == 0) { + base_is_compressed_sec_cache_ = true; + } + } + + virtual ~FaultInjectionSecondaryCache() override {} + + const char* Name() const override { return "FaultInjectionSecondaryCache"; } + + Status Insert(const Slice& key, Cache::ObjectPtr value, + const Cache::CacheItemHelper* helper) override; + + std::unique_ptr Lookup( + const Slice& key, const Cache::CacheItemHelper* helper, + Cache::CreateContext* create_context, bool wait, bool advise_erase, + bool& kept_in_sec_cache) override; + + bool SupportForceErase() const override { return base_->SupportForceErase(); } + + void Erase(const Slice& key) override; + + void WaitAll(std::vector handles) override; + + Status SetCapacity(size_t capacity) override { + return base_->SetCapacity(capacity); + } + + Status GetCapacity(size_t& capacity) override { + return base_->GetCapacity(capacity); + } + + std::string GetPrintableOptions() const override { + return base_->GetPrintableOptions(); + } + + private: + class ResultHandle : public SecondaryCacheResultHandle { + public: + ResultHandle(FaultInjectionSecondaryCache* cache, + std::unique_ptr&& base) + : cache_(cache), base_(std::move(base)), value_(nullptr), size_(0) {} + + ~ResultHandle() override {} + + bool IsReady() override; + + void Wait() override; + + Cache::ObjectPtr Value() override; + + size_t Size() override; + + static void WaitAll(FaultInjectionSecondaryCache* cache, + std::vector handles); + + private: + static void UpdateHandleValue(ResultHandle* handle); + + FaultInjectionSecondaryCache* cache_; + std::unique_ptr base_; + Cache::ObjectPtr value_; + size_t size_; + }; + + static void DeleteThreadLocalErrorContext(void* p) { + ErrorContext* ctx = static_cast(p); + delete ctx; + } + + const std::shared_ptr base_; + uint32_t seed_; + int prob_; + bool base_is_compressed_sec_cache_{false}; + + struct ErrorContext { + Random rand; + + explicit ErrorContext(uint32_t seed) : rand(seed) {} + }; + std::unique_ptr thread_local_error_; + + ErrorContext* GetErrorContext(); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/leveldb_options/leveldb_options.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/leveldb_options/leveldb_options.cc new file mode 100644 index 
0000000000..f81e59b83a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/leveldb_options/leveldb_options.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/utilities/leveldb_options.h" + +#include "rocksdb/advanced_cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +LevelDBOptions::LevelDBOptions() + : comparator(BytewiseComparator()), + create_if_missing(false), + error_if_exists(false), + paranoid_checks(false), + env(Env::Default()), + info_log(nullptr), + write_buffer_size(4 << 20), + max_open_files(1000), + block_cache(nullptr), + block_size(4096), + block_restart_interval(16), + compression(kSnappyCompression), + filter_policy(nullptr) {} + +Options ConvertOptions(const LevelDBOptions& leveldb_options) { + Options options = Options(); + options.create_if_missing = leveldb_options.create_if_missing; + options.error_if_exists = leveldb_options.error_if_exists; + options.paranoid_checks = leveldb_options.paranoid_checks; + options.env = leveldb_options.env; + options.info_log.reset(leveldb_options.info_log); + options.write_buffer_size = leveldb_options.write_buffer_size; + options.max_open_files = leveldb_options.max_open_files; + options.compression = leveldb_options.compression; + + BlockBasedTableOptions table_options; + table_options.block_cache.reset(leveldb_options.block_cache); + table_options.block_size = leveldb_options.block_size; + table_options.block_restart_interval = leveldb_options.block_restart_interval; + table_options.filter_policy.reset(leveldb_options.filter_policy); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + return options; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/memory/memory_util.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/memory/memory_util.cc new file mode 100644 index 0000000000..62042192f9 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/memory/memory_util.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
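A short usage sketch for the helper implemented below; the db and block_cache handles are assumed to already exist in the caller:

    std::map<MemoryUtil::UsageType, uint64_t> usage;
    std::unordered_set<const Cache*> caches = {block_cache.get()};
    Status s =
        MemoryUtil::GetApproximateMemoryUsageByType({db}, caches, &usage);
    if (s.ok()) {
      // e.g. total memtable memory across the given DBs:
      uint64_t memtable_bytes = usage[MemoryUtil::kMemTableTotal];
    }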
+
+
+#include "rocksdb/utilities/memory_util.h"
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status MemoryUtil::GetApproximateMemoryUsageByType(
+    const std::vector<DB*>& dbs,
+    const std::unordered_set<const Cache*> cache_set,
+    std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type) {
+  usage_by_type->clear();
+
+  // MemTable
+  for (auto* db : dbs) {
+    uint64_t usage = 0;
+    if (db->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables,
+                                     &usage)) {
+      (*usage_by_type)[MemoryUtil::kMemTableTotal] += usage;
+    }
+    if (db->GetAggregatedIntProperty(DB::Properties::kCurSizeAllMemTables,
+                                     &usage)) {
+      (*usage_by_type)[MemoryUtil::kMemTableUnFlushed] += usage;
+    }
+  }
+
+  // Table Readers
+  for (auto* db : dbs) {
+    uint64_t usage = 0;
+    if (db->GetAggregatedIntProperty(DB::Properties::kEstimateTableReadersMem,
+                                     &usage)) {
+      (*usage_by_type)[MemoryUtil::kTableReadersTotal] += usage;
+    }
+  }
+
+  // Cache
+  for (const auto* cache : cache_set) {
+    if (cache != nullptr) {
+      (*usage_by_type)[MemoryUtil::kCacheTotal] += cache->GetUsage();
+    }
+  }
+
+  return Status::OK();
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/memory_allocators.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/memory_allocators.h
new file mode 100644
index 0000000000..bdc2e13a94
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/memory_allocators.h
@@ -0,0 +1,103 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+
+#include "rocksdb/memory_allocator.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A memory allocator using new/delete
+class DefaultMemoryAllocator : public MemoryAllocator {
+ public:
+  static const char* kClassName() { return "DefaultMemoryAllocator"; }
+  const char* Name() const override { return kClassName(); }
+  void* Allocate(size_t size) override {
+    return static_cast<void*>(new char[size]);
+  }
+
+  void Deallocate(void* p) override { delete[] static_cast<char*>(p); }
+};
+
+// Base class for a MemoryAllocator. This implementation does nothing
+// and implements the methods in failure mode (assert if the methods are
+// invoked). Implementations can extend this class and override these methods
+// when they are enabled via compiler switches (e.g., the
+// JeMallocMemoryAllocator can define these methods if ROCKSDB_JEMALLOC is
+// defined at compile time). If compiled in "disabled" mode, this class
+// provides default/failure implementations. If compiled in "enabled" mode,
+// the derived class needs to provide the appropriate "enabled" methods for
+// the "real" implementation. Failure of the "real" implementation to
+// override any of these methods will result in an assert failure.
+class BaseMemoryAllocator : public MemoryAllocator {
+ public:
+  void* Allocate(size_t /*size*/) override {
+    assert(false);
+    return nullptr;
+  }
+
+  void Deallocate(void* /*p*/) override { assert(false); }
+};
+
+// A wrapped MemoryAllocator that delegates the memory allocator functions to
+// the wrapped one.
+class MemoryAllocatorWrapper : public MemoryAllocator {
+ public:
+  // Initialize a MemoryAllocatorWrapper that delegates all calls to *t.
+  explicit MemoryAllocatorWrapper(const std::shared_ptr<MemoryAllocator>& t);
+  ~MemoryAllocatorWrapper() override {}
+
+  // Return the target to which to forward all calls.
+  MemoryAllocator* target() const { return target_.get(); }
+  // Allocate a block of at least size bytes. Has to be thread-safe.
+  void* Allocate(size_t size) override { return target_->Allocate(size); }
+
+  // Deallocate a previously allocated block. Has to be thread-safe.
+  void Deallocate(void* p) override { return target_->Deallocate(p); }
+
+  // Returns the memory size of the block allocated at p. The default
+  // implementation that just returns the original allocation_size is fine.
+  size_t UsableSize(void* p, size_t allocation_size) const override {
+    return target_->UsableSize(p, allocation_size);
+  }
+
+  const Customizable* Inner() const override { return target_.get(); }
+
+ protected:
+  std::shared_ptr<MemoryAllocator> target_;
+};
+
+// A memory allocator that counts the number of allocations and deallocations.
+// This class is useful if the number of memory allocations/deallocations is
+// important.
+class CountedMemoryAllocator : public MemoryAllocatorWrapper {
+ public:
+  CountedMemoryAllocator()
+      : MemoryAllocatorWrapper(std::make_shared<DefaultMemoryAllocator>()),
+        allocations_(0),
+        deallocations_(0) {}
+
+  explicit CountedMemoryAllocator(const std::shared_ptr<MemoryAllocator>& t)
+      : MemoryAllocatorWrapper(t), allocations_(0), deallocations_(0) {}
+  static const char* kClassName() { return "CountedMemoryAllocator"; }
+  const char* Name() const override { return kClassName(); }
+  std::string GetId() const override { return std::string(Name()); }
+  void* Allocate(size_t size) override {
+    allocations_++;
+    return MemoryAllocatorWrapper::Allocate(size);
+  }
+
+  void Deallocate(void* p) override {
+    deallocations_++;
+    MemoryAllocatorWrapper::Deallocate(p);
+  }
+  uint64_t GetNumAllocations() const { return allocations_; }
+  uint64_t GetNumDeallocations() const { return deallocations_; }
+
+ private:
+  std::atomic<uint64_t> allocations_;
+  std::atomic<uint64_t> deallocations_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators.cc
new file mode 100644
index 0000000000..020064d092
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators.cc
@@ -0,0 +1,115 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
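Given the registrations below, a merge operator can be constructed by name through the object registry. A minimal sketch (the "uint64add" nickname is assumed to match UInt64AddOperator::kNickName()):

    std::shared_ptr<MergeOperator> op;
    ConfigOptions config_options;
    Status s =
        MergeOperator::CreateFromString(config_options, "uint64add", &op);
    // On success, op points at a UInt64AddOperator instance.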
+ +#include "utilities/merge_operators.h" + +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "utilities/merge_operators/bytesxor.h" +#include "utilities/merge_operators/max_operator.h" +#include "utilities/merge_operators/put_operator.h" +#include "utilities/merge_operators/sortlist.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/merge_operators/string_append/stringappend2.h" +#include "utilities/merge_operators/uint64add.h" + +namespace ROCKSDB_NAMESPACE { +static int RegisterBuiltinMergeOperators(ObjectLibrary& library, + const std::string& /*arg*/) { + size_t num_types; + library.AddFactory( + ObjectLibrary::PatternEntry(StringAppendOperator::kClassName()) + .AnotherName(StringAppendOperator::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new StringAppendOperator(",")); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(StringAppendTESTOperator::kClassName()) + .AnotherName(StringAppendTESTOperator::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new StringAppendTESTOperator(",")); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(SortList::kClassName()) + .AnotherName(SortList::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new SortList()); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(BytesXOROperator::kClassName()) + .AnotherName(BytesXOROperator::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new BytesXOROperator()); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(UInt64AddOperator::kClassName()) + .AnotherName(UInt64AddOperator::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new UInt64AddOperator()); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(MaxOperator::kClassName()) + .AnotherName(MaxOperator::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new MaxOperator()); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(PutOperatorV2::kClassName()) + .AnotherName(PutOperatorV2::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new PutOperatorV2()); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(PutOperator::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new PutOperator()); + return guard->get(); + }); + + return static_cast(library.GetFactoryCount(&num_types)); +} + +Status MergeOperator::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinMergeOperators(*(ObjectLibrary::Default().get()), ""); + }); + return LoadSharedObject(config_options, value, result); +} + +std::shared_ptr MergeOperators::CreateFromStringId( + const std::string& id) { + std::shared_ptr result; + Status s = 
MergeOperator::CreateFromString(ConfigOptions(), id, &result); + if (s.ok()) { + return result; + } else { + // Empty or unknown, just return nullptr + return nullptr; + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators.h new file mode 100644 index 0000000000..9b90107e39 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators.h @@ -0,0 +1,36 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#include + +#include +#include + +#include "rocksdb/merge_operator.h" + +namespace ROCKSDB_NAMESPACE { + +class MergeOperators { + public: + static std::shared_ptr CreatePutOperator(); + static std::shared_ptr CreateDeprecatedPutOperator(); + static std::shared_ptr CreateUInt64AddOperator(); + static std::shared_ptr CreateStringAppendOperator(); + static std::shared_ptr CreateStringAppendOperator( + char delim_char); + static std::shared_ptr CreateStringAppendOperator( + const std::string& delim); + static std::shared_ptr CreateStringAppendTESTOperator(); + static std::shared_ptr CreateMaxOperator(); + static std::shared_ptr CreateBytesXOROperator(); + static std::shared_ptr CreateSortOperator(); + + // Will return a different merge operator depending on the string. + static std::shared_ptr CreateFromStringId( + const std::string& name); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/bytesxor.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/bytesxor.cc new file mode 100644 index 0000000000..fa09c18ea9 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/bytesxor.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
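The operator implemented below XORs operands bytewise up to the shorter length and carries the tail of the longer operand through unchanged; for example, merging the stored value "\x0F\xF0" with the operand "\xFF" yields "\xF0\xF0". A minimal wiring sketch (path and options are illustrative):

    Options options;
    options.create_if_missing = true;
    options.merge_operator = MergeOperators::CreateBytesXOROperator();
    DB* db = nullptr;
    Status s = DB::Open(options, "/tmp/xor_db", &db);
    // db->Merge(WriteOptions(), key, value) now XORs value into the stored
    // bytes for key.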
+ +#include "utilities/merge_operators/bytesxor.h" + +#include +#include + +namespace ROCKSDB_NAMESPACE { + +std::shared_ptr MergeOperators::CreateBytesXOROperator() { + return std::make_shared(); +} + +bool BytesXOROperator::Merge(const Slice& /*key*/, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* /*logger*/) const { + XOR(existing_value, value, new_value); + return true; +} + +void BytesXOROperator::XOR(const Slice* existing_value, const Slice& value, + std::string* new_value) const { + if (!existing_value) { + new_value->clear(); + new_value->assign(value.data(), value.size()); + return; + } + + size_t min_size = std::min(existing_value->size(), value.size()); + size_t max_size = std::max(existing_value->size(), value.size()); + + new_value->clear(); + new_value->reserve(max_size); + + const char* existing_value_data = existing_value->data(); + const char* value_data = value.data(); + + for (size_t i = 0; i < min_size; i++) { + new_value->push_back(existing_value_data[i] ^ value_data[i]); + } + + if (existing_value->size() == max_size) { + for (size_t i = min_size; i < max_size; i++) { + new_value->push_back(existing_value_data[i]); + } + } else { + assert(value.size() == max_size); + for (size_t i = min_size; i < max_size; i++) { + new_value->push_back(value_data[i]); + } + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/bytesxor.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/bytesxor.h new file mode 100644 index 0000000000..3c7baaccec --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/bytesxor.h @@ -0,0 +1,40 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "util/coding.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +// A 'model' merge operator that XORs two (same sized) array of bytes. +// Implemented as an AssociativeMergeOperator for simplicity and example. +class BytesXOROperator : public AssociativeMergeOperator { + public: + // XORs the two array of bytes one byte at a time and stores the result + // in new_value. len is the number of xored bytes, and the length of new_value + virtual bool Merge(const Slice& key, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* logger) const override; + + static const char* kClassName() { return "BytesXOR"; } + static const char* kNickName() { return "bytesxor"; } + + const char* NickName() const override { return kNickName(); } + const char* Name() const override { return kClassName(); } + + void XOR(const Slice* existing_value, const Slice& value, + std::string* new_value) const; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/max.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/max.cc new file mode 100644 index 0000000000..ff854cca34 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/max.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/max_operator.h" + +namespace ROCKSDB_NAMESPACE { + +bool MaxOperator::FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + Slice& max = merge_out->existing_operand; + if (merge_in.existing_value) { + max = + Slice(merge_in.existing_value->data(), merge_in.existing_value->size()); + } else if (max.data() == nullptr) { + max = Slice(); + } + + for (const auto& op : merge_in.operand_list) { + if (max.compare(op) < 0) { + max = op; + } + } + + return true; +} + +bool MaxOperator::PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* /*logger*/) const { + if (left_operand.compare(right_operand) >= 0) { + new_value->assign(left_operand.data(), left_operand.size()); + } else { + new_value->assign(right_operand.data(), right_operand.size()); + } + return true; +} + +bool MaxOperator::PartialMergeMulti(const Slice& /*key*/, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const { + Slice max; + for (const auto& operand : operand_list) { + if (max.compare(operand) < 0) { + max = operand; + } + } + + new_value->assign(max.data(), max.size()); + return true; +} + +std::shared_ptr MergeOperators::CreateMaxOperator() { + return std::make_shared(); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/max_operator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/max_operator.h new file mode 100644 index 0000000000..4c8e98db4b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/max_operator.h @@ -0,0 +1,35 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
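A behavioral sketch of the operator declared in this header (keys and values are illustrative):

    options.merge_operator = MergeOperators::CreateMaxOperator();
    db->Merge(WriteOptions(), "k", "apple");
    db->Merge(WriteOptions(), "k", "banana");
    // A subsequent Get("k") returns "banana": operands are compared
    // bytewise via Slice::compare and the largest one wins.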
+//
+// Merge operator that picks the maximum operand. Comparison is based on
+// Slice::compare.
+
+#pragma once
+
+#include "rocksdb/merge_operator.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+class Slice;
+
+class MaxOperator : public MergeOperator {
+ public:
+  static const char* kClassName() { return "MaxOperator"; }
+  static const char* kNickName() { return "max"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kNickName(); }
+
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override;
+  bool PartialMerge(const Slice& /*key*/, const Slice& left_operand,
+                    const Slice& right_operand, std::string* new_value,
+                    Logger* /*logger*/) const override;
+  bool PartialMergeMulti(const Slice& /*key*/,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value,
+                         Logger* /*logger*/) const override;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/put.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/put.cc
new file mode 100644
index 0000000000..79468c6b78
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/put.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/put_operator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A merge operator that mimics Put semantics
+// Since this merge-operator will not be used in production,
+// it is implemented as a non-associative merge operator to illustrate the
+// new interface and for testing purposes. (That is, we inherit from
+// the MergeOperator class rather than the AssociativeMergeOperator
+// which would be simpler in this case).
+//
+// From the client-perspective, semantics are the same.
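Reviewer's note, not part of the vendored sources: in practice the comment above means that after any number of Merge calls, a read returns only the newest operand. A minimal sketch, assuming a db opened with CreatePutOperator():

#include <string>

#include "rocksdb/db.h"

void PutSemanticsDemo(rocksdb::DB* db) {  // db assumed opened with CreatePutOperator()
  db->Merge(rocksdb::WriteOptions(), "cfg", "v1");
  db->Merge(rocksdb::WriteOptions(), "cfg", "v2");
  std::string v;
  db->Get(rocksdb::ReadOptions(), "cfg", &v);  // v == "v2": the last operand wins
}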
+bool PutOperator::FullMerge(const Slice& /*key*/, + const Slice* /*existing_value*/, + const std::deque& operand_sequence, + std::string* new_value, Logger* /*logger*/) const { + // Put basically only looks at the current/latest value + assert(!operand_sequence.empty()); + assert(new_value != nullptr); + new_value->assign(operand_sequence.back()); + return true; +} + +bool PutOperator::PartialMerge(const Slice& /*key*/, + const Slice& /*left_operand*/, + const Slice& right_operand, + std::string* new_value, + Logger* /*logger*/) const { + new_value->assign(right_operand.data(), right_operand.size()); + return true; +} + +bool PutOperator::PartialMergeMulti(const Slice& /*key*/, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const { + new_value->assign(operand_list.back().data(), operand_list.back().size()); + return true; +} + +bool PutOperatorV2::FullMerge( + const Slice& /*key*/, const Slice* /*existing_value*/, + const std::deque& /*operand_sequence*/, + std::string* /*new_value*/, Logger* /*logger*/) const { + assert(false); + return false; +} + +bool PutOperatorV2::FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + // Put basically only looks at the current/latest value + assert(!merge_in.operand_list.empty()); + merge_out->existing_operand = merge_in.operand_list.back(); + return true; +} + +std::shared_ptr MergeOperators::CreateDeprecatedPutOperator() { + return std::make_shared(); +} + +std::shared_ptr MergeOperators::CreatePutOperator() { + return std::make_shared(); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/put_operator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/put_operator.h new file mode 100644 index 0000000000..7529da78e3 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/put_operator.h @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// A merge operator that mimics Put semantics +// Since this merge-operator will not be used in production, +// it is implemented as a non-associative merge operator to illustrate the +// new interface and for testing purposes. (That is, we inherit from +// the MergeOperator class rather than the AssociativeMergeOperator +// which would be simpler in this case). +// +// From the client-perspective, semantics are the same. 
+ +#pragma once + +#include "rocksdb/merge_operator.h" + +namespace ROCKSDB_NAMESPACE { +class Logger; +class Slice; + +class PutOperator : public MergeOperator { + public: + static const char* kClassName() { return "PutOperator"; } + static const char* kNickName() { return "put_v1"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, + const std::deque& operand_sequence, + std::string* new_value, Logger* /*logger*/) const override; + bool PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const override; + using MergeOperator::PartialMergeMulti; + bool PartialMergeMulti(const Slice& /*key*/, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const override; +}; + +class PutOperatorV2 : public PutOperator { + public: + static const char* kNickName() { return "put"; } + const char* NickName() const override { return kNickName(); } + + bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, + const std::deque& /*operand_sequence*/, + std::string* /*new_value*/, Logger* /*logger*/) const override; + + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/sortlist.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/sortlist.cc new file mode 100644 index 0000000000..67bfc7e5ea --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/sortlist.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#include "utilities/merge_operators/sortlist.h" + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +bool SortList::FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + std::vector left; + for (Slice slice : merge_in.operand_list) { + std::vector right; + MakeVector(right, slice); + left = Merge(left, right); + } + for (int i = 0; i < static_cast(left.size()) - 1; i++) { + merge_out->new_value.append(std::to_string(left[i])).append(","); + } + merge_out->new_value.append(std::to_string(left.back())); + return true; +} + +bool SortList::PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const { + std::vector left; + std::vector right; + MakeVector(left, left_operand); + MakeVector(right, right_operand); + left = Merge(left, right); + for (int i = 0; i < static_cast(left.size()) - 1; i++) { + new_value->append(std::to_string(left[i])).append(","); + } + new_value->append(std::to_string(left.back())); + return true; +} + +bool SortList::PartialMergeMulti(const Slice& /*key*/, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const { + (void)operand_list; + (void)new_value; + return true; +} + +void SortList::MakeVector(std::vector& operand, Slice slice) const { + do { + const char* begin = slice.data_; + while (*slice.data_ != ',' && *slice.data_) slice.data_++; + operand.push_back(std::stoi(std::string(begin, slice.data_))); + } while (0 != *slice.data_++); +} + +std::vector SortList::Merge(std::vector& left, + std::vector& right) const { + // Fill the resultant vector with sorted results from both vectors + std::vector result; + unsigned left_it = 0, right_it = 0; + + while (left_it < left.size() && right_it < right.size()) { + // If the left value is smaller than the right it goes next + // into the resultant vector + if (left[left_it] < right[right_it]) { + result.push_back(left[left_it]); + left_it++; + } else { + result.push_back(right[right_it]); + right_it++; + } + } + + // Push the remaining data from both vectors onto the resultant + while (left_it < left.size()) { + result.push_back(left[left_it]); + left_it++; + } + + while (right_it < right.size()) { + result.push_back(right[right_it]); + right_it++; + } + + return result; +} + +std::shared_ptr MergeOperators::CreateSortOperator() { + return std::make_shared(); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/sortlist.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/sortlist.h new file mode 100644 index 0000000000..eaa4e76fba --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/sortlist.h @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// A MergeOperator for RocksDB that implements Merge Sort. +// It is built using the MergeOperator interface. The operator works by taking +// an input which contains one or more merge operands where each operand is a +// list of sorted ints and merges them to form a large sorted list. 
+#pragma once + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class SortList : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + + bool PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const override; + + bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const override; + + static const char* kClassName() { return "MergeSortOperator"; } + static const char* kNickName() { return "sortlist"; } + + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + void MakeVector(std::vector& operand, Slice slice) const; + + private: + std::vector Merge(std::vector& left, std::vector& right) const; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend.cc new file mode 100644 index 0000000000..720dc7200a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend.cc @@ -0,0 +1,76 @@ +/** + * A MergeOperator for rocksdb that implements string append. + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#include "stringappend.h" + +#include + +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/options_type.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +static std::unordered_map + stringappend_merge_type_info = { + {"delimiter", + {0, OptionType::kString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; +} // namespace +// Constructor: also specify the delimiter character. +StringAppendOperator::StringAppendOperator(char delim_char) + : delim_(1, delim_char) { + RegisterOptions("Delimiter", &delim_, &stringappend_merge_type_info); +} + +StringAppendOperator::StringAppendOperator(const std::string& delim) + : delim_(delim) { + RegisterOptions("Delimiter", &delim_, &stringappend_merge_type_info); +} + +// Implementation for the merge operation (concatenates two strings) +bool StringAppendOperator::Merge(const Slice& /*key*/, + const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* /*logger*/) const { + // Clear the *new_value for writing. + assert(new_value); + new_value->clear(); + + if (!existing_value) { + // No existing_value. Set *new_value = value + new_value->assign(value.data(), value.size()); + } else { + // Generic append (existing_value != null). + // Reserve *new_value to correct size, and apply concatenation. 
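+      // Illustrative (editor's note): with existing_value "abc", delim_ ","
+      // and value "def", the appends below produce "abc,def".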
+ new_value->reserve(existing_value->size() + delim_.size() + value.size()); + new_value->assign(existing_value->data(), existing_value->size()); + new_value->append(delim_); + new_value->append(value.data(), value.size()); + } + + return true; +} + +std::shared_ptr MergeOperators::CreateStringAppendOperator() { + return std::make_shared(','); +} + +std::shared_ptr MergeOperators::CreateStringAppendOperator( + char delim_char) { + return std::make_shared(delim_char); +} + +std::shared_ptr MergeOperators::CreateStringAppendOperator( + const std::string& delim) { + return std::make_shared(delim); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend.h new file mode 100644 index 0000000000..153532382c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend.h @@ -0,0 +1,32 @@ +/** + * A MergeOperator for rocksdb that implements string append. + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class StringAppendOperator : public AssociativeMergeOperator { + public: + // Constructor: specify delimiter + explicit StringAppendOperator(char delim_char); + explicit StringAppendOperator(const std::string& delim); + + virtual bool Merge(const Slice& key, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* logger) const override; + + static const char* kClassName() { return "StringAppendOperator"; } + static const char* kNickName() { return "stringappend"; } + virtual const char* Name() const override { return kClassName(); } + virtual const char* NickName() const override { return kNickName(); } + + private: + std::string delim_; // The delimiter is inserted between elements +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend2.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend2.cc new file mode 100644 index 0000000000..8b6fe9020a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend2.cc @@ -0,0 +1,130 @@ +/** + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#include "stringappend2.h" + +#include + +#include +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/options_type.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +static std::unordered_map + stringappend2_merge_type_info = { + {"delimiter", + {0, OptionType::kString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; +} // namespace + +// Constructor: also specify the delimiter character. 
+StringAppendTESTOperator::StringAppendTESTOperator(char delim_char) + : delim_(1, delim_char) { + RegisterOptions("Delimiter", &delim_, &stringappend2_merge_type_info); +} + +StringAppendTESTOperator::StringAppendTESTOperator(const std::string& delim) + : delim_(delim) { + RegisterOptions("Delimiter", &delim_, &stringappend2_merge_type_info); +} + +// Implementation for the merge operation (concatenates two strings) +bool StringAppendTESTOperator::FullMergeV2( + const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + // Clear the *new_value for writing. + merge_out->new_value.clear(); + + if (merge_in.existing_value == nullptr && merge_in.operand_list.size() == 1) { + // Only one operand + merge_out->existing_operand = merge_in.operand_list.back(); + return true; + } + + // Compute the space needed for the final result. + size_t numBytes = 0; + + for (auto it = merge_in.operand_list.begin(); + it != merge_in.operand_list.end(); ++it) { + numBytes += it->size() + delim_.size(); + } + + // Only print the delimiter after the first entry has been printed + bool printDelim = false; + + // Prepend the *existing_value if one exists. + if (merge_in.existing_value) { + merge_out->new_value.reserve(numBytes + merge_in.existing_value->size()); + merge_out->new_value.append(merge_in.existing_value->data(), + merge_in.existing_value->size()); + printDelim = true; + } else if (numBytes) { + // Without the existing (initial) value, the delimiter before the first of + // subsequent operands becomes redundant. + merge_out->new_value.reserve(numBytes - delim_.size()); + } + + // Concatenate the sequence of strings (and add a delimiter between each) + for (auto it = merge_in.operand_list.begin(); + it != merge_in.operand_list.end(); ++it) { + if (printDelim) { + merge_out->new_value.append(delim_); + } + merge_out->new_value.append(it->data(), it->size()); + printDelim = true; + } + + return true; +} + +bool StringAppendTESTOperator::PartialMergeMulti( + const Slice& /*key*/, const std::deque& /*operand_list*/, + std::string* /*new_value*/, Logger* /*logger*/) const { + return false; +} + +// A version of PartialMerge that actually performs "partial merging". +// Use this to simulate the exact behaviour of the StringAppendOperator. +bool StringAppendTESTOperator::_AssocPartialMergeMulti( + const Slice& /*key*/, const std::deque& operand_list, + std::string* new_value, Logger* /*logger*/) const { + // Clear the *new_value for writing + assert(new_value); + new_value->clear(); + assert(operand_list.size() >= 2); + + // Generic append + // Determine and reserve correct size for *new_value. 
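+  // Illustrative (editor's note): operands {"a", "b", "c"} with a one-byte
+  // delimiter need 3 + (3 - 1) * 1 = 5 bytes, producing "a,b,c".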
+ size_t size = 0; + for (const auto& operand : operand_list) { + size += operand.size(); + } + size += (operand_list.size() - 1) * delim_.length(); // Delimiters + new_value->reserve(size); + + // Apply concatenation + new_value->assign(operand_list.front().data(), operand_list.front().size()); + + for (std::deque::const_iterator it = operand_list.begin() + 1; + it != operand_list.end(); ++it) { + new_value->append(delim_); + new_value->append(it->data(), it->size()); + } + + return true; +} + +std::shared_ptr +MergeOperators::CreateStringAppendTESTOperator() { + return std::make_shared(','); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend2.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend2.h new file mode 100644 index 0000000000..75389e4ae8 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/string_append/stringappend2.h @@ -0,0 +1,52 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +/** + * A TEST MergeOperator for rocksdb that implements string append. + * It is built using the MergeOperator interface rather than the simpler + * AssociativeMergeOperator interface. This is useful for testing/benchmarking. + * While the two operators are semantically the same, all production code + * should use the StringAppendOperator defined in stringappend.{h,cc}. The + * operator defined in the present file is primarily for testing. + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once +#include +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class StringAppendTESTOperator : public MergeOperator { + public: + // Constructor with delimiter + explicit StringAppendTESTOperator(char delim_char); + explicit StringAppendTESTOperator(const std::string& delim); + + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + + virtual bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const override; + + static const char* kClassName() { return "StringAppendTESTOperator"; } + static const char* kNickName() { return "stringappendtest"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + private: + // A version of PartialMerge that actually performs "partial merging". + // Use this to simulate the exact behaviour of the StringAppendOperator. + bool _AssocPartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const; + + std::string delim_; // The delimiter is inserted between elements +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/uint64add.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/uint64add.cc new file mode 100644 index 0000000000..72957761b6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/uint64add.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
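Reviewer's note, not part of the vendored sources: the production and TEST string-append operators above implement the same client-visible behavior. A minimal sketch of that behavior, assuming a db opened with CreateStringAppendOperator() and its default ',' delimiter:

#include <string>

#include "rocksdb/db.h"

void AppendDemo(rocksdb::DB* db) {  // db assumed opened with CreateStringAppendOperator()
  db->Merge(rocksdb::WriteOptions(), "log", "first");
  db->Merge(rocksdb::WriteOptions(), "log", "second");
  std::string v;
  db->Get(rocksdb::ReadOptions(), "log", &v);  // v == "first,second"
}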
+ +#include "utilities/merge_operators/uint64add.h" + +#include + +#include "logging/logging.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "util/coding.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { // anonymous namespace + +bool UInt64AddOperator::Merge(const Slice& /*key*/, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* logger) const { + uint64_t orig_value = 0; + if (existing_value) { + orig_value = DecodeInteger(*existing_value, logger); + } + uint64_t operand = DecodeInteger(value, logger); + + assert(new_value); + new_value->clear(); + PutFixed64(new_value, orig_value + operand); + + return true; // Return true always since corruption will be treated as 0 +} + +uint64_t UInt64AddOperator::DecodeInteger(const Slice& value, + Logger* logger) const { + uint64_t result = 0; + + if (value.size() == sizeof(uint64_t)) { + result = DecodeFixed64(value.data()); + } else if (logger != nullptr) { + // If value is corrupted, treat it as 0 + ROCKS_LOG_ERROR(logger, + "uint64 value corruption, size: %" ROCKSDB_PRIszt + " > %" ROCKSDB_PRIszt, + value.size(), sizeof(uint64_t)); + } + + return result; +} + +std::shared_ptr MergeOperators::CreateUInt64AddOperator() { + return std::make_shared(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/uint64add.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/uint64add.h new file mode 100644 index 0000000000..7e232677a3 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/merge_operators/uint64add.h @@ -0,0 +1,35 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// A 'model' merge operator with uint64 addition semantics +// Implemented as an AssociativeMergeOperator for simplicity and example. + +#pragma once + +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { +class Logger; +class Slice; + +class UInt64AddOperator : public AssociativeMergeOperator { + public: + static const char* kClassName() { return "UInt64AddOperator"; } + static const char* kNickName() { return "uint64add"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + bool Merge(const Slice& /*key*/, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* logger) const override; + + private: + // Takes the string and decodes it into a uint64_t + // On error, prints a message and returns 0 + uint64_t DecodeInteger(const Slice& value, Logger* logger) const; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/object_registry.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/object_registry.cc new file mode 100644 index 0000000000..786f2ee2e4 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/object_registry.cc @@ -0,0 +1,381 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/utilities/object_registry.h" + +#include + +#include "logging/logging.h" +#include "port/lang.h" +#include "rocksdb/customizable.h" +#include "rocksdb/env.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +bool MatchesInteger(const std::string &target, size_t start, size_t pos) { + // If it is numeric, everything up to the match must be a number + int digits = 0; + if (target[start] == '-') { + start++; // Allow negative numbers + } + while (start < pos) { + if (!isdigit(target[start++])) { + return false; + } else { + digits++; + } + } + return (digits > 0); +} + +bool MatchesDecimal(const std::string &target, size_t start, size_t pos) { + int digits = 0; + if (target[start] == '-') { + start++; // Allow negative numbers + } + for (bool point = false; start < pos; start++) { + if (target[start] == '.') { + if (point) { + return false; + } else { + point = true; + } + } else if (!isdigit(target[start])) { + return false; + } else { + digits++; + } + } + return (digits > 0); +} +} // namespace + +size_t ObjectLibrary::PatternEntry::MatchSeparatorAt( + size_t start, Quantifier mode, const std::string &target, size_t tlen, + const std::string &separator) const { + size_t slen = separator.size(); + // See if there is enough space. If so, find the separator + if (tlen < start + slen) { + return std::string::npos; // not enough space left + } else if (mode == kMatchExact) { + // Exact mode means the next thing we are looking for is the separator + if (target.compare(start, slen, separator) != 0) { + return std::string::npos; + } else { + return start + slen; // Found the separator, return where we found it + } + } else { + auto pos = start + 1; + if (!separator.empty()) { + pos = target.find(separator, pos); + } + if (pos == std::string::npos) { + return pos; + } else if (mode == kMatchInteger) { + if (!MatchesInteger(target, start, pos)) { + return std::string::npos; + } + } else if (mode == kMatchDecimal) { + if (!MatchesDecimal(target, start, pos)) { + return std::string::npos; + } + } + return pos + slen; + } +} + +bool ObjectLibrary::PatternEntry::MatchesTarget(const std::string &name, + size_t nlen, + const std::string &target, + size_t tlen) const { + if (separators_.empty()) { + assert(optional_); // If there are no separators, it must be only a name + return nlen == tlen && name == target; + } else if (nlen == tlen) { // The lengths are the same + return optional_ && name == target; + } else if (tlen < nlen + slength_) { + // The target is not long enough + return false; + } else if (target.compare(0, nlen, name) != 0) { + return false; // Target does not start with name + } else { + // Loop through all of the separators one at a time matching them. + // Note that we first match the separator and then its quantifiers. + // Since we expect the separator first, we start with an exact match + // Subsequent matches will use the quantifier of the previous separator + size_t start = nlen; + auto mode = kMatchExact; + for (size_t idx = 0; idx < separators_.size(); ++idx) { + const auto &separator = separators_[idx]; + start = MatchSeparatorAt(start, mode, target, tlen, separator.first); + if (start == std::string::npos) { + return false; + } else { + mode = separator.second; + } + } + // We have matched all of the separators. 
Now check that what is left + // unmatched in the target is acceptable. + if (mode == kMatchExact) { + return (start == tlen); + } else if (start > tlen || (start == tlen && mode != kMatchZeroOrMore)) { + return false; + } else if (mode == kMatchInteger) { + return MatchesInteger(target, start, tlen); + } else if (mode == kMatchDecimal) { + return MatchesDecimal(target, start, tlen); + } + } + return true; +} + +bool ObjectLibrary::PatternEntry::Matches(const std::string &target) const { + auto tlen = target.size(); + if (MatchesTarget(name_, nlength_, target, tlen)) { + return true; + } else if (!names_.empty()) { + for (const auto &alt : names_) { + if (MatchesTarget(alt, alt.size(), target, tlen)) { + return true; + } + } + } + return false; +} + +size_t ObjectLibrary::GetFactoryCount(size_t *types) const { + std::unique_lock lock(mu_); + *types = factories_.size(); + size_t factories = 0; + for (const auto &e : factories_) { + factories += e.second.size(); + } + return factories; +} + +size_t ObjectLibrary::GetFactoryCount(const std::string &type) const { + std::unique_lock lock(mu_); + auto iter = factories_.find(type); + if (iter != factories_.end()) { + return iter->second.size(); + } else { + return 0; + } +} + +void ObjectLibrary::GetFactoryNames(const std::string &type, + std::vector *names) const { + assert(names); + std::unique_lock lock(mu_); + auto iter = factories_.find(type); + if (iter != factories_.end()) { + for (const auto &f : iter->second) { + names->push_back(f->Name()); + } + } +} + +void ObjectLibrary::GetFactoryTypes( + std::unordered_set *types) const { + assert(types); + std::unique_lock lock(mu_); + for (const auto &iter : factories_) { + types->insert(iter.first); + } +} + +void ObjectLibrary::Dump(Logger *logger) const { + std::unique_lock lock(mu_); + if (logger != nullptr && !factories_.empty()) { + ROCKS_LOG_HEADER(logger, " Registered Library: %s\n", id_.c_str()); + for (const auto &iter : factories_) { + ROCKS_LOG_HEADER(logger, " Registered factories for type[%s] ", + iter.first.c_str()); + bool printed_one = false; + for (const auto &e : iter.second) { + ROCKS_LOG_HEADER(logger, "%c %s", (printed_one) ? ',' : ':', e->Name()); + printed_one = true; + } + } + } +} + +// Returns the Default singleton instance of the ObjectLibrary +// This instance will contain most of the "standard" registered objects +std::shared_ptr &ObjectLibrary::Default() { + // Use avoid destruction here so the default ObjectLibrary will not be + // statically destroyed and long-lived. + STATIC_AVOID_DESTRUCTION(std::shared_ptr, instance) + (std::make_shared("default")); + return instance; +} + +ObjectRegistry::ObjectRegistry(const std::shared_ptr &library) { + libraries_.push_back(library); + for (const auto &b : builtins_) { + RegisterPlugin(b.first, b.second); + } +} + +std::shared_ptr ObjectRegistry::Default() { + // Use avoid destruction here so the default ObjectRegistry will not be + // statically destroyed and long-lived. 
+ STATIC_AVOID_DESTRUCTION(std::shared_ptr, instance) + (std::make_shared(ObjectLibrary::Default())); + return instance; +} + +std::shared_ptr ObjectRegistry::NewInstance() { + return std::make_shared(Default()); +} + +std::shared_ptr ObjectRegistry::NewInstance( + const std::shared_ptr &parent) { + return std::make_shared(parent); +} + +Status ObjectRegistry::SetManagedObject( + const std::string &type, const std::string &id, + const std::shared_ptr &object) { + std::string object_key = ToManagedObjectKey(type, id); + std::shared_ptr curr; + if (parent_ != nullptr) { + curr = parent_->GetManagedObject(type, id); + } + if (curr == nullptr) { + // We did not find the object in any parent. Update in the current + std::unique_lock lock(objects_mutex_); + auto iter = managed_objects_.find(object_key); + if (iter != managed_objects_.end()) { // The object exists + curr = iter->second.lock(); + if (curr != nullptr && curr != object) { + return Status::InvalidArgument("Object already exists: ", object_key); + } else { + iter->second = object; + } + } else { + // The object does not exist. Add it + managed_objects_[object_key] = object; + } + } else if (curr != object) { + return Status::InvalidArgument("Object already exists: ", object_key); + } + return Status::OK(); +} + +std::shared_ptr ObjectRegistry::GetManagedObject( + const std::string &type, const std::string &id) const { + { + std::unique_lock lock(objects_mutex_); + auto iter = managed_objects_.find(ToManagedObjectKey(type, id)); + if (iter != managed_objects_.end()) { + return iter->second.lock(); + } + } + if (parent_ != nullptr) { + return parent_->GetManagedObject(type, id); + } else { + return nullptr; + } +} + +Status ObjectRegistry::ListManagedObjects( + const std::string &type, const std::string &name, + std::vector> *results) const { + { + std::string key = ToManagedObjectKey(type, name); + std::unique_lock lock(objects_mutex_); + for (auto iter = managed_objects_.lower_bound(key); + iter != managed_objects_.end() && StartsWith(iter->first, key); + ++iter) { + auto shared = iter->second.lock(); + if (shared != nullptr) { + if (name.empty() || shared->IsInstanceOf(name)) { + results->emplace_back(shared); + } + } + } + } + if (parent_ != nullptr) { + return parent_->ListManagedObjects(type, name, results); + } else { + return Status::OK(); + } +} + +// Returns the number of registered types for this registry. +// If specified (not-null), types is updated to include the names of the +// registered types. 
+size_t ObjectRegistry::GetFactoryCount(const std::string &type) const { + size_t count = 0; + if (parent_ != nullptr) { + count = parent_->GetFactoryCount(type); + } + std::unique_lock lock(library_mutex_); + for (const auto &library : libraries_) { + count += library->GetFactoryCount(type); + } + return count; +} + +void ObjectRegistry::GetFactoryNames(const std::string &type, + std::vector *names) const { + assert(names); + names->clear(); + if (parent_ != nullptr) { + parent_->GetFactoryNames(type, names); + } + std::unique_lock lock(library_mutex_); + for (const auto &library : libraries_) { + library->GetFactoryNames(type, names); + } +} + +void ObjectRegistry::GetFactoryTypes( + std::unordered_set *types) const { + assert(types); + if (parent_ != nullptr) { + parent_->GetFactoryTypes(types); + } + std::unique_lock lock(library_mutex_); + for (const auto &library : libraries_) { + library->GetFactoryTypes(types); + } +} + +void ObjectRegistry::Dump(Logger *logger) const { + if (logger != nullptr) { + std::unique_lock lock(library_mutex_); + if (!plugins_.empty()) { + ROCKS_LOG_HEADER(logger, " Registered Plugins:"); + bool printed_one = false; + for (const auto &plugin : plugins_) { + ROCKS_LOG_HEADER(logger, "%s%s", (printed_one) ? ", " : " ", + plugin.c_str()); + printed_one = true; + } + ROCKS_LOG_HEADER(logger, "\n"); + } + for (auto iter = libraries_.crbegin(); iter != libraries_.crend(); ++iter) { + iter->get()->Dump(logger); + } + } + if (parent_ != nullptr) { + parent_->Dump(logger); + } +} + +int ObjectRegistry::RegisterPlugin(const std::string &name, + const RegistrarFunc &func) { + if (!name.empty() && func != nullptr) { + plugins_.push_back(name); + return AddLibrary(name)->Register(func, name); + } else { + return -1; + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/option_change_migration/option_change_migration.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/option_change_migration/option_change_migration.cc new file mode 100644 index 0000000000..ea3cf68571 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/option_change_migration/option_change_migration.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/utilities/option_change_migration.h" + +#include "rocksdb/db.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +// Return a version of Options `opts` that allow us to open/write into a DB +// without triggering an automatic compaction or stalling. This is guaranteed +// by disabling automatic compactions and using huge values for stalling +// triggers. +Options GetNoCompactionOptions(const Options& opts) { + Options ret_opts = opts; + ret_opts.disable_auto_compactions = true; + ret_opts.level0_slowdown_writes_trigger = 999999; + ret_opts.level0_stop_writes_trigger = 999999; + ret_opts.soft_pending_compaction_bytes_limit = 0; + ret_opts.hard_pending_compaction_bytes_limit = 0; + return ret_opts; +} + +Status OpenDb(const Options& options, const std::string& dbname, + std::unique_ptr* db) { + db->reset(); + DB* tmpdb; + Status s = DB::Open(options, dbname, &tmpdb); + if (s.ok()) { + db->reset(tmpdb); + } + return s; +} + +// l0_file_size specifies size of file on L0. 
Files will be range-partitioned
+// after a full compaction, so they are likely qualified to be put on L0. If
+// left as 0, the files are compacted into a single file and put on L0.
+// Otherwise, the files are compacted to approximately l0_file_size each.
+Status CompactToLevel(const Options& options, const std::string& dbname,
+                      int dest_level, uint64_t l0_file_size, bool need_reopen) {
+  std::unique_ptr<DB> db;
+  Options no_compact_opts = GetNoCompactionOptions(options);
+  if (dest_level == 0) {
+    if (l0_file_size == 0) {
+      // Single file.
+      l0_file_size = 999999999999999;
+    }
+    // L0 has strict sequence-ID requirements for its files, so it is safer
+    // to put only one compacted file there.
+    // This is only used for converting to universal compaction with
+    // only one level. In this case, compacting to one file is also
+    // optimal.
+    no_compact_opts.target_file_size_base = l0_file_size;
+    no_compact_opts.max_compaction_bytes = l0_file_size;
+  }
+  Status s = OpenDb(no_compact_opts, dbname, &db);
+  if (!s.ok()) {
+    return s;
+  }
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = dest_level;
+  if (dest_level == 0) {
+    // cannot use kForceOptimized because the compaction is expected to
+    // generate one output file
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  }
+  s = db->CompactRange(cro, nullptr, nullptr);
+
+  if (s.ok() && need_reopen) {
+    // Need to restart the DB to rewrite the manifest file.
+    // In order to open a DB with a specific num_levels, the manifest file must
+    // contain no record that mentions any level beyond num_levels. Issuing a
+    // full compaction will move all the data to a level not exceeding
+    // num_levels, but the manifest may still contain a previous record
+    // mentioning a higher level. Reopening the DB forces the manifest to be
+    // rewritten so that those records are cleared.
+    db.reset();
+    s = OpenDb(no_compact_opts, dbname, &db);
+  }
+  return s;
+}
+
+Status MigrateToUniversal(std::string dbname, const Options& old_opts,
+                          const Options& new_opts) {
+  if (old_opts.num_levels <= new_opts.num_levels ||
+      old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    return Status::OK();
+  } else {
+    bool need_compact = false;
+    {
+      std::unique_ptr<DB> db;
+      Options opts = GetNoCompactionOptions(old_opts);
+      Status s = OpenDb(opts, dbname, &db);
+      if (!s.ok()) {
+        return s;
+      }
+      ColumnFamilyMetaData metadata;
+      db->GetColumnFamilyMetaData(&metadata);
+      if (!metadata.levels.empty() &&
+          metadata.levels.back().level >= new_opts.num_levels) {
+        need_compact = true;
+      }
+    }
+    if (need_compact) {
+      return CompactToLevel(old_opts, dbname, new_opts.num_levels - 1,
+                            /*l0_file_size=*/0, true);
+    }
+    return Status::OK();
+  }
+}
+
+Status MigrateToLevelBase(std::string dbname, const Options& old_opts,
+                          const Options& new_opts) {
+  if (!new_opts.level_compaction_dynamic_level_bytes) {
+    if (old_opts.num_levels == 1) {
+      return Status::OK();
+    }
+    // Compact everything to level 1 to guarantee it can be safely opened.
+    Options opts = old_opts;
+    opts.target_file_size_base = new_opts.target_file_size_base;
+    // Although we can sometimes open the DB with the new options without
+    // error, we still want to compact the files to keep the LSM tree from
+    // getting stuck in a bad shape. For example, if the user changed the
+    // level size multiplier from 4 to 8, the same data will need fewer
+    // levels. Unless we issue a full compaction, the LSM tree may be stuck
+    // with more levels than needed and won't recover automatically.
+    return CompactToLevel(opts, dbname, 1, /*l0_file_size=*/0, true);
+  } else {
+    // Compact everything to the last level to guarantee it can be safely
+    // opened.
+    if (old_opts.num_levels == 1) {
+      return Status::OK();
+    } else if (new_opts.num_levels > old_opts.num_levels) {
+      // Dynamic level mode requires data to be put in the last level first.
+      return CompactToLevel(new_opts, dbname, new_opts.num_levels - 1,
+                            /*l0_file_size=*/0, false);
+    } else {
+      Options opts = old_opts;
+      opts.target_file_size_base = new_opts.target_file_size_base;
+      return CompactToLevel(opts, dbname, new_opts.num_levels - 1,
+                            /*l0_file_size=*/0, true);
+    }
+  }
+}
+} // namespace
+
+Status OptionChangeMigration(std::string dbname, const Options& old_opts,
+                             const Options& new_opts) {
+  if (old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    // An LSM generated by FIFO compaction can be opened by any compaction
+    // style.
+    return Status::OK();
+  } else if (new_opts.compaction_style ==
+             CompactionStyle::kCompactionStyleUniversal) {
+    return MigrateToUniversal(dbname, old_opts, new_opts);
+  } else if (new_opts.compaction_style ==
+             CompactionStyle::kCompactionStyleLevel) {
+    return MigrateToLevelBase(dbname, old_opts, new_opts);
+  } else if (new_opts.compaction_style ==
+             CompactionStyle::kCompactionStyleFIFO) {
+    uint64_t l0_file_size = 0;
+    if (new_opts.compaction_options_fifo.max_table_files_size > 0) {
+      // Create at least 8 files when max_table_files_size hits, so that the DB
+      // doesn't just disappear. This in fact violates the FIFO condition, but
+      // otherwise, the migrated DB is unlikely to be usable.
+      l0_file_size = new_opts.compaction_options_fifo.max_table_files_size / 8;
+    }
+    return CompactToLevel(old_opts, dbname, 0, l0_file_size, true);
+  } else {
+    return Status::NotSupported(
+        "Do not know how to migrate to this compaction style");
+  }
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/options/options_util.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/options/options_util.cc
new file mode 100644
index 0000000000..394bf431f4
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/options/options_util.cc
@@ -0,0 +1,117 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
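Reviewer's note, not part of the vendored sources: OptionChangeMigration above is the public entry point; a caller runs it on a closed DB before reopening it under the new options. A minimal sketch, with made-up option values:

#include <string>

#include "rocksdb/options.h"
#include "rocksdb/utilities/option_change_migration.h"

rocksdb::Status MigrateToUniversalStyle(const std::string& dbname,
                                        const rocksdb::Options& old_opts) {
  rocksdb::Options new_opts = old_opts;
  new_opts.compaction_style = rocksdb::kCompactionStyleUniversal;
  new_opts.num_levels = 1;  // made-up target shape
  // Rewrites the LSM (and the manifest, via reopen) so the DB can then be
  // opened cleanly with new_opts.
  return rocksdb::OptionChangeMigration(dbname, old_opts, new_opts);
}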
+ + +#include "rocksdb/utilities/options_util.h" + +#include "file/filename.h" +#include "options/options_parser.h" +#include "rocksdb/convenience.h" +#include "rocksdb/options.h" +#include "table/block_based/block_based_table_factory.h" + +namespace ROCKSDB_NAMESPACE { +Status LoadOptionsFromFile(const ConfigOptions& config_options, + const std::string& file_name, DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache) { + RocksDBOptionsParser parser; + const auto& fs = config_options.env->GetFileSystem(); + Status s = parser.Parse(config_options, file_name, fs.get()); + if (!s.ok()) { + return s; + } + *db_options = *parser.db_opt(); + const std::vector& cf_names = *parser.cf_names(); + const std::vector& cf_opts = *parser.cf_opts(); + cf_descs->clear(); + for (size_t i = 0; i < cf_opts.size(); ++i) { + cf_descs->push_back({cf_names[i], cf_opts[i]}); + if (cache != nullptr) { + TableFactory* tf = cf_opts[i].table_factory.get(); + if (tf != nullptr) { + auto* opts = tf->GetOptions(); + if (opts != nullptr) { + opts->block_cache = *cache; + } + } + } + } + return Status::OK(); +} + +Status GetLatestOptionsFileName(const std::string& dbpath, Env* env, + std::string* options_file_name) { + Status s; + std::string latest_file_name; + uint64_t latest_time_stamp = 0; + std::vector file_names; + s = env->GetChildren(dbpath, &file_names); + if (s.IsNotFound()) { + return Status::NotFound(Status::kPathNotFound, + "No options files found in the DB directory.", + dbpath); + } else if (!s.ok()) { + return s; + } + for (auto& file_name : file_names) { + uint64_t time_stamp; + FileType type; + if (ParseFileName(file_name, &time_stamp, &type) && type == kOptionsFile) { + if (time_stamp > latest_time_stamp) { + latest_time_stamp = time_stamp; + latest_file_name = file_name; + } + } + } + if (latest_file_name.size() == 0) { + return Status::NotFound(Status::kPathNotFound, + "No options files found in the DB directory.", + dbpath); + } + *options_file_name = latest_file_name; + return Status::OK(); +} + +Status LoadLatestOptions(const ConfigOptions& config_options, + const std::string& dbpath, DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache) { + std::string options_file_name; + Status s = + GetLatestOptionsFileName(dbpath, config_options.env, &options_file_name); + if (!s.ok()) { + return s; + } + return LoadOptionsFromFile(config_options, dbpath + "/" + options_file_name, + db_options, cf_descs, cache); +} + +Status CheckOptionsCompatibility( + const ConfigOptions& config_options, const std::string& dbpath, + const DBOptions& db_options, + const std::vector& cf_descs) { + std::string options_file_name; + Status s = + GetLatestOptionsFileName(dbpath, config_options.env, &options_file_name); + if (!s.ok()) { + return s; + } + + std::vector cf_names; + std::vector cf_opts; + for (const auto& cf_desc : cf_descs) { + cf_names.push_back(cf_desc.name); + cf_opts.push_back(cf_desc.options); + } + + const auto& fs = config_options.env->GetFileSystem(); + + return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( + config_options, db_options, cf_names, cf_opts, + dbpath + "/" + options_file_name, fs.get()); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier.cc new file mode 100644 index 0000000000..3118fc2df6 --- /dev/null +++ 
b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier.cc @@ -0,0 +1,420 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "utilities/persistent_cache/block_cache_tier.h" + +#include +#include + +#include "logging/logging.h" +#include "port/port.h" +#include "test_util/sync_point.h" +#include "util/stop_watch.h" +#include "utilities/persistent_cache/block_cache_tier_file.h" + +namespace ROCKSDB_NAMESPACE { + +// +// BlockCacheImpl +// +Status BlockCacheTier::Open() { + Status status; + + WriteLock _(&lock_); + + assert(!size_); + + // Check the validity of the options + status = opt_.ValidateSettings(); + assert(status.ok()); + if (!status.ok()) { + Error(opt_.log, "Invalid block cache options"); + return status; + } + + // Create base directory or cleanup existing directory + status = opt_.env->CreateDirIfMissing(opt_.path); + if (!status.ok()) { + Error(opt_.log, "Error creating directory %s. %s", opt_.path.c_str(), + status.ToString().c_str()); + return status; + } + + // Create base/ directory + status = opt_.env->CreateDir(GetCachePath()); + if (!status.ok()) { + // directory already exists, clean it up + status = CleanupCacheFolder(GetCachePath()); + assert(status.ok()); + if (!status.ok()) { + Error(opt_.log, "Error creating directory %s. %s", opt_.path.c_str(), + status.ToString().c_str()); + return status; + } + } + + // create a new file + assert(!cache_file_); + status = NewCacheFile(); + if (!status.ok()) { + Error(opt_.log, "Error creating new file %s. %s", opt_.path.c_str(), + status.ToString().c_str()); + return status; + } + + assert(cache_file_); + + if (opt_.pipeline_writes) { + assert(!insert_th_.joinable()); + insert_th_ = port::Thread(&BlockCacheTier::InsertMain, this); + } + + return Status::OK(); +} + +bool IsCacheFile(const std::string& file) { + // check if the file has .rc suffix + // Unfortunately regex support across compilers is not even, so we use simple + // string parsing + size_t pos = file.find("."); + if (pos == std::string::npos) { + return false; + } + + std::string suffix = file.substr(pos); + return suffix == ".rc"; +} + +Status BlockCacheTier::CleanupCacheFolder(const std::string& folder) { + std::vector files; + Status status = opt_.env->GetChildren(folder, &files); + if (!status.ok()) { + Error(opt_.log, "Error getting files for %s. %s", folder.c_str(), + status.ToString().c_str()); + return status; + } + + // cleanup files with the patter :digi:.rc + for (auto file : files) { + if (IsCacheFile(file)) { + // cache file + Info(opt_.log, "Removing file %s.", file.c_str()); + status = opt_.env->DeleteFile(folder + "/" + file); + if (!status.ok()) { + Error(opt_.log, "Error deleting file %s. 
%s", file.c_str(), + status.ToString().c_str()); + return status; + } + } else { + ROCKS_LOG_DEBUG(opt_.log, "Skipping file %s", file.c_str()); + } + } + return Status::OK(); +} + +Status BlockCacheTier::Close() { + // stop the insert thread + if (opt_.pipeline_writes && insert_th_.joinable()) { + InsertOp op(/*quit=*/true); + insert_ops_.Push(std::move(op)); + insert_th_.join(); + } + + // stop the writer before + writer_.Stop(); + + // clear all metadata + WriteLock _(&lock_); + metadata_.Clear(); + return Status::OK(); +} + +template +void Add(std::map* stats, const std::string& key, + const T& t) { + stats->insert({key, static_cast(t)}); +} + +PersistentCache::StatsType BlockCacheTier::Stats() { + std::map stats; + Add(&stats, "persistentcache.blockcachetier.bytes_piplined", + stats_.bytes_pipelined_.Average()); + Add(&stats, "persistentcache.blockcachetier.bytes_written", + stats_.bytes_written_.Average()); + Add(&stats, "persistentcache.blockcachetier.bytes_read", + stats_.bytes_read_.Average()); + Add(&stats, "persistentcache.blockcachetier.insert_dropped", + stats_.insert_dropped_); + Add(&stats, "persistentcache.blockcachetier.cache_hits", stats_.cache_hits_); + Add(&stats, "persistentcache.blockcachetier.cache_misses", + stats_.cache_misses_); + Add(&stats, "persistentcache.blockcachetier.cache_errors", + stats_.cache_errors_); + Add(&stats, "persistentcache.blockcachetier.cache_hits_pct", + stats_.CacheHitPct()); + Add(&stats, "persistentcache.blockcachetier.cache_misses_pct", + stats_.CacheMissPct()); + Add(&stats, "persistentcache.blockcachetier.read_hit_latency", + stats_.read_hit_latency_.Average()); + Add(&stats, "persistentcache.blockcachetier.read_miss_latency", + stats_.read_miss_latency_.Average()); + Add(&stats, "persistentcache.blockcachetier.write_latency", + stats_.write_latency_.Average()); + + auto out = PersistentCacheTier::Stats(); + out.push_back(stats); + return out; +} + +Status BlockCacheTier::Insert(const Slice& key, const char* data, + const size_t size) { + // update stats + stats_.bytes_pipelined_.Add(size); + + if (opt_.pipeline_writes) { + // off load the write to the write thread + insert_ops_.Push( + InsertOp(key.ToString(), std::move(std::string(data, size)))); + return Status::OK(); + } + + assert(!opt_.pipeline_writes); + return InsertImpl(key, Slice(data, size)); +} + +void BlockCacheTier::InsertMain() { + while (true) { + InsertOp op(insert_ops_.Pop()); + + if (op.signal_) { + // that is a secret signal to exit + break; + } + + size_t retry = 0; + Status s; + while ((s = InsertImpl(Slice(op.key_), Slice(op.data_))).IsTryAgain()) { + if (retry > kMaxRetry) { + break; + } + + // this can happen when the buffers are full, we wait till some buffers + // are free. Why don't we wait inside the code. 
This is because we want + // to support both pipelined and non-pipelined mode + buffer_allocator_.WaitUntilUsable(); + retry++; + } + + if (!s.ok()) { + stats_.insert_dropped_++; + } + } +} + +Status BlockCacheTier::InsertImpl(const Slice& key, const Slice& data) { + // pre-condition + assert(key.size()); + assert(data.size()); + assert(cache_file_); + + StopWatchNano timer(opt_.clock, /*auto_start=*/true); + + WriteLock _(&lock_); + + LBA lba; + if (metadata_.Lookup(key, &lba)) { + // the key already exists, this is duplicate insert + return Status::OK(); + } + + while (!cache_file_->Append(key, data, &lba)) { + if (!cache_file_->Eof()) { + ROCKS_LOG_DEBUG(opt_.log, "Error inserting to cache file %d", + cache_file_->cacheid()); + stats_.write_latency_.Add(timer.ElapsedNanos() / 1000); + return Status::TryAgain(); + } + + assert(cache_file_->Eof()); + Status status = NewCacheFile(); + if (!status.ok()) { + return status; + } + } + + // Insert into lookup index + BlockInfo* info = metadata_.Insert(key, lba); + assert(info); + if (!info) { + return Status::IOError("Unexpected error inserting to index"); + } + + // insert to cache file reverse mapping + cache_file_->Add(info); + + // update stats + stats_.bytes_written_.Add(data.size()); + stats_.write_latency_.Add(timer.ElapsedNanos() / 1000); + return Status::OK(); +} + +Status BlockCacheTier::Lookup(const Slice& key, std::unique_ptr* val, + size_t* size) { + StopWatchNano timer(opt_.clock, /*auto_start=*/true); + + LBA lba; + bool status; + status = metadata_.Lookup(key, &lba); + if (!status) { + stats_.cache_misses_++; + stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000); + return Status::NotFound("blockcache: key not found"); + } + + BlockCacheFile* const file = metadata_.Lookup(lba.cache_id_); + if (!file) { + // this can happen because the block index and cache file index are + // different, and the cache file might be removed between the two lookups + stats_.cache_misses_++; + stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000); + return Status::NotFound("blockcache: cache file not found"); + } + + assert(file->refs_); + + std::unique_ptr scratch(new char[lba.size_]); + Slice blk_key; + Slice blk_val; + + status = file->Read(lba, &blk_key, &blk_val, scratch.get()); + --file->refs_; + if (!status) { + stats_.cache_misses_++; + stats_.cache_errors_++; + stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000); + return Status::NotFound("blockcache: error reading data"); + } + + assert(blk_key == key); + + val->reset(new char[blk_val.size()]); + memcpy(val->get(), blk_val.data(), blk_val.size()); + *size = blk_val.size(); + + stats_.bytes_read_.Add(*size); + stats_.cache_hits_++; + stats_.read_hit_latency_.Add(timer.ElapsedNanos() / 1000); + + return Status::OK(); +} + +bool BlockCacheTier::Erase(const Slice& key) { + WriteLock _(&lock_); + BlockInfo* info = metadata_.Remove(key); + assert(info); + delete info; + return true; +} + +Status BlockCacheTier::NewCacheFile() { + lock_.AssertHeld(); + + TEST_SYNC_POINT_CALLBACK("BlockCacheTier::NewCacheFile:DeleteDir", + (void*)(GetCachePath().c_str())); + + std::unique_ptr f(new WriteableCacheFile( + opt_.env, &buffer_allocator_, &writer_, GetCachePath(), writer_cache_id_, + opt_.cache_file_size, opt_.log)); + + bool status = f->Create(opt_.enable_direct_writes, opt_.enable_direct_reads); + if (!status) { + return Status::IOError("Error creating file"); + } + + Info(opt_.log, "Created cache file %d", writer_cache_id_); + + writer_cache_id_++; + cache_file_ = f.release(); + + 
+  // insert to cache files tree
+  status = metadata_.Insert(cache_file_);
+  assert(status);
+  if (!status) {
+    Error(opt_.log, "Error inserting to metadata");
+    return Status::IOError("Error inserting to metadata");
+  }
+
+  return Status::OK();
+}
+
+bool BlockCacheTier::Reserve(const size_t size) {
+  WriteLock _(&lock_);
+  assert(size_ <= opt_.cache_size);
+
+  if (size + size_ <= opt_.cache_size) {
+    // there is enough space to write
+    size_ += size;
+    return true;
+  }
+
+  assert(size + size_ >= opt_.cache_size);
+  // there is not enough space to fit the requested data;
+  // we can clear some space by evicting cold data
+
+  const double retain_fac = (100 - kEvictPct) / static_cast<double>(100);
+  while (size + size_ > opt_.cache_size * retain_fac) {
+    std::unique_ptr<BlockCacheFile> f(metadata_.Evict());
+    if (!f) {
+      // nothing is evictable
+      return false;
+    }
+    assert(!f->refs_);
+    uint64_t file_size;
+    if (!f->Delete(&file_size).ok()) {
+      // unable to delete file
+      return false;
+    }
+
+    assert(file_size <= size_);
+    size_ -= file_size;
+  }
+
+  size_ += size;
+  assert(size_ <= opt_.cache_size * 0.9);
+  return true;
+}
+
+Status NewPersistentCache(Env* const env, const std::string& path,
+                          const uint64_t size,
+                          const std::shared_ptr<Logger>& log,
+                          const bool optimized_for_nvm,
+                          std::shared_ptr<PersistentCache>* cache) {
+  if (!cache) {
+    return Status::IOError("invalid argument cache");
+  }
+
+  auto opt = PersistentCacheConfig(env, path, size, log);
+  if (optimized_for_nvm) {
+    // the default settings are optimized for SSD; NVM devices are better
+    // accessed with 4K direct IO and written with parallelism
+    opt.enable_direct_writes = true;
+    opt.writer_qdepth = 4;
+    opt.writer_dispatch_size = 4 * 1024;
+  }
+
+  auto pcache = std::make_shared<BlockCacheTier>(opt);
+  Status s = pcache->Open();
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  *cache = pcache;
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier.h
new file mode 100644
index 0000000000..caabbef94e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier.h
@@ -0,0 +1,154 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif  // ! OS_WIN
+
+#include <atomic>
+#include <list>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <thread>
+
+#include "memory/arena.h"
+#include "memtable/skiplist.h"
+#include "monitoring/histogram.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/system_clock.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "utilities/persistent_cache/block_cache_tier_file.h"
+#include "utilities/persistent_cache/block_cache_tier_metadata.h"
+#include "utilities/persistent_cache/persistent_cache_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Block cache tier implementation
+//
+class BlockCacheTier : public PersistentCacheTier {
+ public:
+  explicit BlockCacheTier(const PersistentCacheConfig& opt)
+      : opt_(opt),
+        insert_ops_(static_cast<size_t>(opt_.max_write_pipeline_backlog_size)),
+        buffer_allocator_(opt.write_buffer_size, opt.write_buffer_count()),
+        writer_(this, opt_.writer_qdepth,
+                static_cast<size_t>(opt_.writer_dispatch_size)) {
+    Info(opt_.log, "Initializing allocator. size=%d B count=%" ROCKSDB_PRIszt,
+         opt_.write_buffer_size, opt_.write_buffer_count());
+  }
+
+  virtual ~BlockCacheTier() {
+    // Close is re-entrant so we can call close even if it is already closed
+    Close().PermitUncheckedError();
+    assert(!insert_th_.joinable());
+  }
+
+  Status Insert(const Slice& key, const char* data, const size_t size) override;
+  Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
+                size_t* size) override;
+  Status Open() override;
+  Status Close() override;
+  bool Erase(const Slice& key) override;
+  bool Reserve(const size_t size) override;
+
+  bool IsCompressed() override { return opt_.is_compressed; }
+
+  std::string GetPrintableOptions() const override { return opt_.ToString(); }
+
+  PersistentCache::StatsType Stats() override;
+
+  void TEST_Flush() override {
+    while (insert_ops_.Size()) {
+      /* sleep override */
+      SystemClock::Default()->SleepForMicroseconds(1000000);
+    }
+  }
+
+ private:
+  // Percentage of cache to be evicted when the cache is full
+  static const size_t kEvictPct = 10;
+  // Max attempts to insert key, value to cache in pipelined mode
+  static const size_t kMaxRetry = 3;
+
+  // Pipelined operation
+  struct InsertOp {
+    explicit InsertOp(const bool signal) : signal_(signal) {}
+    explicit InsertOp(std::string&& key, const std::string& data)
+        : key_(std::move(key)), data_(data) {}
+    ~InsertOp() {}
+
+    InsertOp() = delete;
+    InsertOp(InsertOp&& /*rhs*/) = default;
+    InsertOp& operator=(InsertOp&& rhs) = default;
+
+    // used for estimating size by bounded queue
+    size_t Size() { return data_.size() + key_.size(); }
+
+    std::string key_;
+    std::string data_;
+    bool signal_ = false;  // signal to request processing thread to exit
+  };
+
+  // entry point for insert thread
+  void InsertMain();
+  // insert implementation
+  Status InsertImpl(const Slice& key, const Slice& data);
+  // Create a new cache file
+  Status NewCacheFile();
+  // Get cache directory path
+  std::string GetCachePath() const { return opt_.path + "/cache"; }
+  // Cleanup folder
+  Status CleanupCacheFolder(const std::string& folder);
+
+  // Statistics
+  struct Statistics {
+    HistogramImpl bytes_pipelined_;
+    HistogramImpl bytes_written_;
+    HistogramImpl bytes_read_;
+    HistogramImpl read_hit_latency_;
+    HistogramImpl read_miss_latency_;
+    HistogramImpl write_latency_;
+    std::atomic<uint64_t> cache_hits_{0};
+    std::atomic<uint64_t> cache_misses_{0};
+    std::atomic<uint64_t> cache_errors_{0};
+    std::atomic<uint64_t> insert_dropped_{0};
+
+    double
CacheHitPct() const { + const auto lookups = cache_hits_ + cache_misses_; + return lookups ? 100 * cache_hits_ / static_cast(lookups) : 0.0; + } + + double CacheMissPct() const { + const auto lookups = cache_hits_ + cache_misses_; + return lookups ? 100 * cache_misses_ / static_cast(lookups) : 0.0; + } + }; + + port::RWMutex lock_; // Synchronization + const PersistentCacheConfig opt_; // BlockCache options + BoundedQueue insert_ops_; // Ops waiting for insert + ROCKSDB_NAMESPACE::port::Thread insert_th_; // Insert thread + uint32_t writer_cache_id_ = 0; // Current cache file identifier + WriteableCacheFile* cache_file_ = nullptr; // Current cache file reference + CacheWriteBufferAllocator buffer_allocator_; // Buffer provider + ThreadedWriter writer_; // Writer threads + BlockCacheTierMetadata metadata_; // Cache meta data manager + std::atomic size_{0}; // Size of the cache + Statistics stats_; // Statistics +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_file.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_file.cc new file mode 100644 index 0000000000..9a667c1fdc --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_file.cc @@ -0,0 +1,608 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "utilities/persistent_cache/block_cache_tier_file.h" + +#ifndef OS_WIN +#include +#endif +#include +#include +#include + +#include "env/composite_env_wrapper.h" +#include "logging/logging.h" +#include "port/port.h" +#include "rocksdb/system_clock.h" +#include "util/crc32c.h" + +namespace ROCKSDB_NAMESPACE { + +// +// File creation factories +// +Status NewWritableCacheFile(Env* const env, const std::string& filepath, + std::unique_ptr* file, + const bool use_direct_writes = false) { + EnvOptions opt; + opt.use_direct_writes = use_direct_writes; + Status s = env->NewWritableFile(filepath, file, opt); + return s; +} + +Status NewRandomAccessCacheFile(const std::shared_ptr& fs, + const std::string& filepath, + std::unique_ptr* file, + const bool use_direct_reads = true) { + assert(fs.get()); + + FileOptions opt; + opt.use_direct_reads = use_direct_reads; + return fs->NewRandomAccessFile(filepath, opt, file, nullptr); +} + +// +// BlockCacheFile +// +Status BlockCacheFile::Delete(uint64_t* size) { + assert(env_); + + Status status = env_->GetFileSize(Path(), size); + if (!status.ok()) { + return status; + } + return env_->DeleteFile(Path()); +} + +// +// CacheRecord +// +// Cache record represents the record on disk +// +// +--------+---------+----------+------------+---------------+-------------+ +// | magic | crc | key size | value size | key data | value data | +// +--------+---------+----------+------------+---------------+-------------+ +// <-- 4 --><-- 4 --><-- 4 --><-- 4 --><-- key size --><-- v-size --> +// +struct CacheRecordHeader { + CacheRecordHeader() : magic_(0), crc_(0), key_size_(0), val_size_(0) {} + CacheRecordHeader(const uint32_t magic, const uint32_t key_size, + const uint32_t val_size) + : magic_(magic), crc_(0), key_size_(key_size), val_size_(val_size) {} + + uint32_t magic_; + uint32_t crc_; + uint32_t key_size_; + uint32_t val_size_; +}; + +struct CacheRecord { + 
CacheRecord() {} + CacheRecord(const Slice& key, const Slice& val) + : hdr_(MAGIC, static_cast(key.size()), + static_cast(val.size())), + key_(key), + val_(val) { + hdr_.crc_ = ComputeCRC(); + } + + uint32_t ComputeCRC() const; + bool Serialize(std::vector* bufs, size_t* woff); + bool Deserialize(const Slice& buf); + + static uint32_t CalcSize(const Slice& key, const Slice& val) { + return static_cast(sizeof(CacheRecordHeader) + key.size() + + val.size()); + } + + static const uint32_t MAGIC = 0xfefa; + + bool Append(std::vector* bufs, size_t* woff, + const char* data, const size_t size); + + CacheRecordHeader hdr_; + Slice key_; + Slice val_; +}; + +static_assert(sizeof(CacheRecordHeader) == 16, "DataHeader is not aligned"); + +uint32_t CacheRecord::ComputeCRC() const { + uint32_t crc = 0; + CacheRecordHeader tmp = hdr_; + tmp.crc_ = 0; + crc = crc32c::Extend(crc, reinterpret_cast(&tmp), sizeof(tmp)); + crc = crc32c::Extend(crc, reinterpret_cast(key_.data()), + key_.size()); + crc = crc32c::Extend(crc, reinterpret_cast(val_.data()), + val_.size()); + return crc; +} + +bool CacheRecord::Serialize(std::vector* bufs, + size_t* woff) { + assert(bufs->size()); + return Append(bufs, woff, reinterpret_cast(&hdr_), + sizeof(hdr_)) && + Append(bufs, woff, reinterpret_cast(key_.data()), + key_.size()) && + Append(bufs, woff, reinterpret_cast(val_.data()), + val_.size()); +} + +bool CacheRecord::Append(std::vector* bufs, size_t* woff, + const char* data, const size_t data_size) { + assert(*woff < bufs->size()); + + const char* p = data; + size_t size = data_size; + + while (size && *woff < bufs->size()) { + CacheWriteBuffer* buf = (*bufs)[*woff]; + const size_t free = buf->Free(); + if (size <= free) { + buf->Append(p, size); + size = 0; + } else { + buf->Append(p, free); + p += free; + size -= free; + assert(!buf->Free()); + assert(buf->Used() == buf->Capacity()); + } + + if (!buf->Free()) { + *woff += 1; + } + } + + assert(!size); + + return !size; +} + +bool CacheRecord::Deserialize(const Slice& data) { + assert(data.size() >= sizeof(CacheRecordHeader)); + if (data.size() < sizeof(CacheRecordHeader)) { + return false; + } + + memcpy(&hdr_, data.data(), sizeof(hdr_)); + + assert(hdr_.key_size_ + hdr_.val_size_ + sizeof(hdr_) == data.size()); + if (hdr_.key_size_ + hdr_.val_size_ + sizeof(hdr_) != data.size()) { + return false; + } + + key_ = Slice(data.data_ + sizeof(hdr_), hdr_.key_size_); + val_ = Slice(key_.data_ + hdr_.key_size_, hdr_.val_size_); + + if (!(hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_)) { + fprintf(stderr, "** magic %d ** \n", hdr_.magic_); + fprintf(stderr, "** key_size %d ** \n", hdr_.key_size_); + fprintf(stderr, "** val_size %d ** \n", hdr_.val_size_); + fprintf(stderr, "** key %s ** \n", key_.ToString().c_str()); + fprintf(stderr, "** val %s ** \n", val_.ToString().c_str()); + for (size_t i = 0; i < hdr_.val_size_; ++i) { + fprintf(stderr, "%d.", (uint8_t)val_.data()[i]); + } + fprintf(stderr, "\n** cksum %d != %d **", hdr_.crc_, ComputeCRC()); + } + + assert(hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_); + return hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_; +} + +// +// RandomAccessFile +// + +bool RandomAccessCacheFile::Open(const bool enable_direct_reads) { + WriteLock _(&rwlock_); + return OpenImpl(enable_direct_reads); +} + +bool RandomAccessCacheFile::OpenImpl(const bool enable_direct_reads) { + rwlock_.AssertHeld(); + + ROCKS_LOG_DEBUG(log_, "Opening cache file %s", Path().c_str()); + assert(env_); + + std::unique_ptr file; + Status status = 
NewRandomAccessCacheFile(env_->GetFileSystem(), Path(), &file, + enable_direct_reads); + if (!status.ok()) { + Error(log_, "Error opening random access file %s. %s", Path().c_str(), + status.ToString().c_str()); + return false; + } + freader_.reset(new RandomAccessFileReader(std::move(file), Path(), + env_->GetSystemClock().get())); + + return true; +} + +bool RandomAccessCacheFile::Read(const LBA& lba, Slice* key, Slice* val, + char* scratch) { + ReadLock _(&rwlock_); + + assert(lba.cache_id_ == cache_id_); + + if (!freader_) { + return false; + } + + Slice result; + Status s = freader_->Read(IOOptions(), lba.off_, lba.size_, &result, scratch, + nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + Error(log_, "Error reading from file %s. %s", Path().c_str(), + s.ToString().c_str()); + return false; + } + + assert(result.data() == scratch); + + return ParseRec(lba, key, val, scratch); +} + +bool RandomAccessCacheFile::ParseRec(const LBA& lba, Slice* key, Slice* val, + char* scratch) { + Slice data(scratch, lba.size_); + + CacheRecord rec; + if (!rec.Deserialize(data)) { + assert(!"Error deserializing data"); + Error(log_, "Error de-serializing record from file %s off %d", + Path().c_str(), lba.off_); + return false; + } + + *key = Slice(rec.key_); + *val = Slice(rec.val_); + + return true; +} + +// +// WriteableCacheFile +// + +WriteableCacheFile::~WriteableCacheFile() { + WriteLock _(&rwlock_); + if (!eof_) { + // This file never flushed. We give priority to shutdown since this is a + // cache + // TODO(krad): Figure a way to flush the pending data + if (file_) { + assert(refs_ == 1); + --refs_; + } + } + assert(!refs_); + ClearBuffers(); +} + +bool WriteableCacheFile::Create(const bool /*enable_direct_writes*/, + const bool enable_direct_reads) { + WriteLock _(&rwlock_); + + enable_direct_reads_ = enable_direct_reads; + + ROCKS_LOG_DEBUG(log_, "Creating new cache %s (max size is %d B)", + Path().c_str(), max_size_); + + assert(env_); + + Status s = env_->FileExists(Path()); + if (s.ok()) { + ROCKS_LOG_WARN(log_, "File %s already exists. %s", Path().c_str(), + s.ToString().c_str()); + } + + s = NewWritableCacheFile(env_, Path(), &file_); + if (!s.ok()) { + ROCKS_LOG_WARN(log_, "Unable to create file %s. %s", Path().c_str(), + s.ToString().c_str()); + return false; + } + + assert(!refs_); + ++refs_; + + return true; +} + +bool WriteableCacheFile::Append(const Slice& key, const Slice& val, LBA* lba) { + WriteLock _(&rwlock_); + + if (eof_) { + // We can't append since the file is full + return false; + } + + // estimate the space required to store the (key, val) + uint32_t rec_size = CacheRecord::CalcSize(key, val); + + if (!ExpandBuffer(rec_size)) { + // unable to expand the buffer + ROCKS_LOG_DEBUG(log_, "Error expanding buffers. 
size=%d", rec_size); + return false; + } + + lba->cache_id_ = cache_id_; + lba->off_ = disk_woff_; + lba->size_ = rec_size; + + CacheRecord rec(key, val); + if (!rec.Serialize(&bufs_, &buf_woff_)) { + // unexpected error: unable to serialize the data + assert(!"Error serializing record"); + return false; + } + + disk_woff_ += rec_size; + eof_ = disk_woff_ >= max_size_; + + // dispatch buffer for flush + DispatchBuffer(); + + return true; +} + +bool WriteableCacheFile::ExpandBuffer(const size_t size) { + rwlock_.AssertHeld(); + assert(!eof_); + + // determine if there is enough space + size_t free = 0; // compute the free space left in buffer + for (size_t i = buf_woff_; i < bufs_.size(); ++i) { + free += bufs_[i]->Free(); + if (size <= free) { + // we have enough space in the buffer + return true; + } + } + + // expand the buffer until there is enough space to write `size` bytes + assert(free < size); + assert(alloc_); + + while (free < size) { + CacheWriteBuffer* const buf = alloc_->Allocate(); + if (!buf) { + ROCKS_LOG_DEBUG(log_, "Unable to allocate buffers"); + return false; + } + + size_ += static_cast(buf->Free()); + free += buf->Free(); + bufs_.push_back(buf); + } + + assert(free >= size); + return true; +} + +void WriteableCacheFile::DispatchBuffer() { + rwlock_.AssertHeld(); + + assert(bufs_.size()); + assert(buf_doff_ <= buf_woff_); + assert(buf_woff_ <= bufs_.size()); + + if (pending_ios_) { + return; + } + + if (!eof_ && buf_doff_ == buf_woff_) { + // dispatch buffer is pointing to write buffer and we haven't hit eof + return; + } + + assert(eof_ || buf_doff_ < buf_woff_); + assert(buf_doff_ < bufs_.size()); + assert(file_); + assert(alloc_); + + auto* buf = bufs_[buf_doff_]; + const uint64_t file_off = buf_doff_ * alloc_->BufferSize(); + + assert(!buf->Free() || + (eof_ && buf_doff_ == buf_woff_ && buf_woff_ < bufs_.size())); + // we have reached end of file, and there is space in the last buffer + // pad it with zero for direct IO + buf->FillTrailingZeros(); + + assert(buf->Used() % kFileAlignmentSize == 0); + + writer_->Write(file_.get(), buf, file_off, + std::bind(&WriteableCacheFile::BufferWriteDone, this)); + pending_ios_++; + buf_doff_++; +} + +void WriteableCacheFile::BufferWriteDone() { + WriteLock _(&rwlock_); + + assert(bufs_.size()); + + pending_ios_--; + + if (buf_doff_ < bufs_.size()) { + DispatchBuffer(); + } + + if (eof_ && buf_doff_ >= bufs_.size() && !pending_ios_) { + // end-of-file reached, move to read mode + CloseAndOpenForReading(); + } +} + +void WriteableCacheFile::CloseAndOpenForReading() { + // Our env abstraction do not allow reading from a file opened for appending + // We need close the file and re-open it for reading + Close(); + RandomAccessCacheFile::OpenImpl(enable_direct_reads_); +} + +bool WriteableCacheFile::ReadBuffer(const LBA& lba, Slice* key, Slice* block, + char* scratch) { + rwlock_.AssertHeld(); + + if (!ReadBuffer(lba, scratch)) { + Error(log_, "Error reading from buffer. cache=%d off=%d", cache_id_, + lba.off_); + return false; + } + + return ParseRec(lba, key, block, scratch); +} + +bool WriteableCacheFile::ReadBuffer(const LBA& lba, char* data) { + rwlock_.AssertHeld(); + + assert(lba.off_ < disk_woff_); + assert(alloc_); + + // we read from the buffers like reading from a flat file. 
The list of buffers + // are treated as contiguous stream of data + + char* tmp = data; + size_t pending_nbytes = lba.size_; + // start buffer + size_t start_idx = lba.off_ / alloc_->BufferSize(); + // offset into the start buffer + size_t start_off = lba.off_ % alloc_->BufferSize(); + + assert(start_idx <= buf_woff_); + + for (size_t i = start_idx; pending_nbytes && i < bufs_.size(); ++i) { + assert(i <= buf_woff_); + auto* buf = bufs_[i]; + assert(i == buf_woff_ || !buf->Free()); + // bytes to write to the buffer + size_t nbytes = pending_nbytes > (buf->Used() - start_off) + ? (buf->Used() - start_off) + : pending_nbytes; + memcpy(tmp, buf->Data() + start_off, nbytes); + + // left over to be written + pending_nbytes -= nbytes; + start_off = 0; + tmp += nbytes; + } + + assert(!pending_nbytes); + if (pending_nbytes) { + return false; + } + + assert(tmp == data + lba.size_); + return true; +} + +void WriteableCacheFile::Close() { + rwlock_.AssertHeld(); + + assert(size_ >= max_size_); + assert(disk_woff_ >= max_size_); + assert(buf_doff_ == bufs_.size()); + assert(bufs_.size() - buf_woff_ <= 1); + assert(!pending_ios_); + + Info(log_, "Closing file %s. size=%d written=%d", Path().c_str(), size_, + disk_woff_); + + ClearBuffers(); + file_.reset(); + + assert(refs_); + --refs_; +} + +void WriteableCacheFile::ClearBuffers() { + assert(alloc_); + + for (size_t i = 0; i < bufs_.size(); ++i) { + alloc_->Deallocate(bufs_[i]); + } + + bufs_.clear(); +} + +// +// ThreadedFileWriter implementation +// +ThreadedWriter::ThreadedWriter(PersistentCacheTier* const cache, + const size_t qdepth, const size_t io_size) + : Writer(cache), io_size_(io_size) { + for (size_t i = 0; i < qdepth; ++i) { + port::Thread th(&ThreadedWriter::ThreadMain, this); + threads_.push_back(std::move(th)); + } +} + +void ThreadedWriter::Stop() { + // notify all threads to exit + for (size_t i = 0; i < threads_.size(); ++i) { + q_.Push(IO(/*signal=*/true)); + } + + // wait for all threads to exit + for (auto& th : threads_) { + th.join(); + assert(!th.joinable()); + } + threads_.clear(); +} + +void ThreadedWriter::Write(WritableFile* const file, CacheWriteBuffer* buf, + const uint64_t file_off, + const std::function callback) { + q_.Push(IO(file, buf, file_off, callback)); +} + +void ThreadedWriter::ThreadMain() { + while (true) { + // Fetch the IO to process + IO io(q_.Pop()); + if (io.signal_) { + // that's secret signal to exit + break; + } + + // Reserve space for writing the buffer + while (!cache_->Reserve(io.buf_->Used())) { + // We can fail to reserve space if every file in the system + // is being currently accessed + /* sleep override */ + SystemClock::Default()->SleepForMicroseconds(1000000); + } + + DispatchIO(io); + + io.callback_(); + } +} + +void ThreadedWriter::DispatchIO(const IO& io) { + size_t written = 0; + while (written < io.buf_->Used()) { + Slice data(io.buf_->Data() + written, io_size_); + Status s = io.file_->Append(data); + assert(s.ok()); + if (!s.ok()) { + // That is definite IO error to device. There is not much we can + // do but ignore the failure. This can lead to corruption of data on + // disk, but the cache will skip while reading + fprintf(stderr, "Error writing data to file. 
%s\n", s.ToString().c_str()); + } + written += io_size_; + } +} + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_file.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_file.h new file mode 100644 index 0000000000..127f5a8676 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_file.h @@ -0,0 +1,291 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + + +#include +#include +#include +#include + +#include "file/random_access_file_reader.h" +#include "port/port.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "util/crc32c.h" +#include "util/mutexlock.h" +#include "utilities/persistent_cache/block_cache_tier_file_buffer.h" +#include "utilities/persistent_cache/lrulist.h" +#include "utilities/persistent_cache/persistent_cache_tier.h" +#include "utilities/persistent_cache/persistent_cache_util.h" + +// The io code path of persistent cache uses pipelined architecture +// +// client -> In Queue <-- BlockCacheTier --> Out Queue <-- Writer <--> Kernel +// +// This would enable the system to scale for GB/s of throughput which is +// expected with modern devies like NVM. +// +// The file level operations are encapsulated in the following abstractions +// +// BlockCacheFile +// ^ +// | +// | +// RandomAccessCacheFile (For reading) +// ^ +// | +// | +// WriteableCacheFile (For writing) +// +// Write IO code path : +// +namespace ROCKSDB_NAMESPACE { + +class WriteableCacheFile; +struct BlockInfo; + +// Represents a logical record on device +// +// (L)ogical (B)lock (Address = { cache-file-id, offset, size } +struct LogicalBlockAddress { + LogicalBlockAddress() {} + explicit LogicalBlockAddress(const uint32_t cache_id, const uint32_t off, + const uint16_t size) + : cache_id_(cache_id), off_(off), size_(size) {} + + uint32_t cache_id_ = 0; + uint32_t off_ = 0; + uint32_t size_ = 0; +}; + +using LBA = LogicalBlockAddress; + +// class Writer +// +// Writer is the abstraction used for writing data to file. The component can be +// multithreaded. 
It is the last step of write pipeline +class Writer { + public: + explicit Writer(PersistentCacheTier* const cache) : cache_(cache) {} + virtual ~Writer() {} + + // write buffer to file at the given offset + virtual void Write(WritableFile* const file, CacheWriteBuffer* buf, + const uint64_t file_off, + const std::function callback) = 0; + // stop the writer + virtual void Stop() = 0; + + PersistentCacheTier* const cache_; +}; + +// class BlockCacheFile +// +// Generic interface to support building file specialized for read/writing +class BlockCacheFile : public LRUElement { + public: + explicit BlockCacheFile(const uint32_t cache_id) + : LRUElement(), cache_id_(cache_id) {} + + explicit BlockCacheFile(Env* const env, const std::string& dir, + const uint32_t cache_id) + : LRUElement(), + env_(env), + dir_(dir), + cache_id_(cache_id) {} + + virtual ~BlockCacheFile() {} + + // append key/value to file and return LBA locator to user + virtual bool Append(const Slice& /*key*/, const Slice& /*val*/, + LBA* const /*lba*/) { + assert(!"not implemented"); + return false; + } + + // read from the record locator (LBA) and return key, value and status + virtual bool Read(const LBA& /*lba*/, Slice* /*key*/, Slice* /*block*/, + char* /*scratch*/) { + assert(!"not implemented"); + return false; + } + + // get file path + std::string Path() const { + return dir_ + "/" + std::to_string(cache_id_) + ".rc"; + } + // get cache ID + uint32_t cacheid() const { return cache_id_; } + // Add block information to file data + // Block information is the list of index reference for this file + virtual void Add(BlockInfo* binfo) { + WriteLock _(&rwlock_); + block_infos_.push_back(binfo); + } + // get block information + std::list& block_infos() { return block_infos_; } + // delete file and return the size of the file + virtual Status Delete(uint64_t* size); + + protected: + port::RWMutex rwlock_; // synchronization mutex + Env* const env_ = nullptr; // Env for OS + const std::string dir_; // Directory name + const uint32_t cache_id_; // Cache id for the file + std::list block_infos_; // List of index entries mapping to the + // file content +}; + +// class RandomAccessFile +// +// Thread safe implementation for reading random data from file +class RandomAccessCacheFile : public BlockCacheFile { + public: + explicit RandomAccessCacheFile(Env* const env, const std::string& dir, + const uint32_t cache_id, + const std::shared_ptr& log) + : BlockCacheFile(env, dir, cache_id), log_(log) {} + + virtual ~RandomAccessCacheFile() {} + + // open file for reading + bool Open(const bool enable_direct_reads); + // read data from the disk + bool Read(const LBA& lba, Slice* key, Slice* block, char* scratch) override; + + private: + std::unique_ptr freader_; + + protected: + bool OpenImpl(const bool enable_direct_reads); + bool ParseRec(const LBA& lba, Slice* key, Slice* val, char* scratch); + + std::shared_ptr log_; // log file +}; + +// class WriteableCacheFile +// +// All writes to the files are cached in buffers. The buffers are flushed to +// disk as they get filled up. 
When file size reaches a certain size, a new file +// will be created provided there is free space +class WriteableCacheFile : public RandomAccessCacheFile { + public: + explicit WriteableCacheFile(Env* const env, CacheWriteBufferAllocator* alloc, + Writer* writer, const std::string& dir, + const uint32_t cache_id, const uint32_t max_size, + const std::shared_ptr& log) + : RandomAccessCacheFile(env, dir, cache_id, log), + alloc_(alloc), + writer_(writer), + max_size_(max_size) {} + + virtual ~WriteableCacheFile(); + + // create file on disk + bool Create(const bool enable_direct_writes, const bool enable_direct_reads); + + // read data from logical file + bool Read(const LBA& lba, Slice* key, Slice* block, char* scratch) override { + ReadLock _(&rwlock_); + const bool closed = eof_ && bufs_.empty(); + if (closed) { + // the file is closed, read from disk + return RandomAccessCacheFile::Read(lba, key, block, scratch); + } + // file is still being written, read from buffers + return ReadBuffer(lba, key, block, scratch); + } + + // append data to end of file + bool Append(const Slice&, const Slice&, LBA* const) override; + // End-of-file + bool Eof() const { return eof_; } + + private: + friend class ThreadedWriter; + + static const size_t kFileAlignmentSize = 4 * 1024; // align file size + + bool ReadBuffer(const LBA& lba, Slice* key, Slice* block, char* scratch); + bool ReadBuffer(const LBA& lba, char* data); + bool ExpandBuffer(const size_t size); + void DispatchBuffer(); + void BufferWriteDone(); + void CloseAndOpenForReading(); + void ClearBuffers(); + void Close(); + + // File layout in memory + // + // +------+------+------+------+------+------+ + // | b0 | b1 | b2 | b3 | b4 | b5 | + // +------+------+------+------+------+------+ + // ^ ^ + // | | + // buf_doff_ buf_woff_ + // (next buffer to (next buffer to fill) + // flush to disk) + // + // The buffers are flushed to disk serially for a given file + + CacheWriteBufferAllocator* const alloc_ = nullptr; // Buffer provider + Writer* const writer_ = nullptr; // File writer thread + std::unique_ptr file_; // RocksDB Env file abstraction + std::vector bufs_; // Written buffers + uint32_t size_ = 0; // Size of the file + const uint32_t max_size_; // Max size of the file + bool eof_ = false; // End of file + uint32_t disk_woff_ = 0; // Offset to write on disk + size_t buf_woff_ = 0; // off into bufs_ to write + size_t buf_doff_ = 0; // off into bufs_ to dispatch + size_t pending_ios_ = 0; // Number of ios to disk in-progress + bool enable_direct_reads_ = false; // Should we enable direct reads + // when reading from disk +}; + +// +// Abstraction to do writing to device. It is part of pipelined architecture. 
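+//
+// A minimal usage sketch (illustrative only; `tier`, `file`, `buf` and the
+// callback are hypothetical stand-ins, not part of this patch):
+//
+//   ThreadedWriter writer(tier, /*qdepth=*/4, /*io_size=*/4 * 1024);
+//   writer.Write(file, buf, /*file_off=*/0,
+//                [] { /* buffer was flushed; recycle it */ });
+//   ...
+//   writer.Stop();  // drain the queue and join the worker threads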
+// +class ThreadedWriter : public Writer { + public: + // Representation of IO to device + struct IO { + explicit IO(const bool signal) : signal_(signal) {} + explicit IO(WritableFile* const file, CacheWriteBuffer* const buf, + const uint64_t file_off, const std::function callback) + : file_(file), buf_(buf), file_off_(file_off), callback_(callback) {} + + IO(const IO&) = default; + IO& operator=(const IO&) = default; + size_t Size() const { return sizeof(IO); } + + WritableFile* file_ = nullptr; // File to write to + CacheWriteBuffer* buf_ = nullptr; // buffer to write + uint64_t file_off_ = 0; // file offset + bool signal_ = false; // signal to exit thread loop + std::function callback_; // Callback on completion + }; + + explicit ThreadedWriter(PersistentCacheTier* const cache, const size_t qdepth, + const size_t io_size); + virtual ~ThreadedWriter() { assert(threads_.empty()); } + + void Stop() override; + void Write(WritableFile* const file, CacheWriteBuffer* buf, + const uint64_t file_off, + const std::function callback) override; + + private: + void ThreadMain(); + void DispatchIO(const IO& io); + + const size_t io_size_ = 0; + BoundedQueue q_; + std::vector threads_; +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_file_buffer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_file_buffer.h new file mode 100644 index 0000000000..d4f02455a3 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_file_buffer.h @@ -0,0 +1,127 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include +#include +#include + +#include "memory/arena.h" +#include "rocksdb/comparator.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +// +// CacheWriteBuffer +// +// Buffer abstraction that can be manipulated via append +// (not thread safe) +class CacheWriteBuffer { + public: + explicit CacheWriteBuffer(const size_t size) : size_(size), pos_(0) { + buf_.reset(new char[size_]); + assert(!pos_); + assert(size_); + } + + virtual ~CacheWriteBuffer() {} + + void Append(const char* buf, const size_t size) { + assert(pos_ + size <= size_); + memcpy(buf_.get() + pos_, buf, size); + pos_ += size; + assert(pos_ <= size_); + } + + void FillTrailingZeros() { + assert(pos_ <= size_); + memset(buf_.get() + pos_, '0', size_ - pos_); + pos_ = size_; + } + + void Reset() { pos_ = 0; } + size_t Free() const { return size_ - pos_; } + size_t Capacity() const { return size_; } + size_t Used() const { return pos_; } + char* Data() const { return buf_.get(); } + + private: + std::unique_ptr buf_; + const size_t size_; + size_t pos_; +}; + +// +// CacheWriteBufferAllocator +// +// Buffer pool abstraction(not thread safe) +// +class CacheWriteBufferAllocator { + public: + explicit CacheWriteBufferAllocator(const size_t buffer_size, + const size_t buffer_count) + : cond_empty_(&lock_), buffer_size_(buffer_size) { + MutexLock _(&lock_); + buffer_size_ = buffer_size; + for (uint32_t i = 0; i < buffer_count; i++) { + auto* buf = new CacheWriteBuffer(buffer_size_); + assert(buf); + if (buf) { + bufs_.push_back(buf); + cond_empty_.Signal(); + } + } + } + + virtual ~CacheWriteBufferAllocator() { + MutexLock _(&lock_); + assert(bufs_.size() * buffer_size_ == Capacity()); + for (auto* buf : bufs_) { + delete buf; + } + bufs_.clear(); + } + + CacheWriteBuffer* Allocate() { + MutexLock _(&lock_); + if (bufs_.empty()) { + return nullptr; + } + + assert(!bufs_.empty()); + CacheWriteBuffer* const buf = bufs_.front(); + bufs_.pop_front(); + return buf; + } + + void Deallocate(CacheWriteBuffer* const buf) { + assert(buf); + MutexLock _(&lock_); + buf->Reset(); + bufs_.push_back(buf); + cond_empty_.Signal(); + } + + void WaitUntilUsable() { + // We are asked to wait till we have buffers available + MutexLock _(&lock_); + while (bufs_.empty()) { + cond_empty_.Wait(); + } + } + + size_t Capacity() const { return bufs_.size() * buffer_size_; } + size_t Free() const { return bufs_.size() * buffer_size_; } + size_t BufferSize() const { return buffer_size_; } + + private: + port::Mutex lock_; // Sync lock + port::CondVar cond_empty_; // Condition var for empty buffers + size_t buffer_size_; // Size of each buffer + std::list bufs_; // Buffer stash +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_metadata.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_metadata.cc new file mode 100644 index 0000000000..182df6ca65 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_metadata.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "utilities/persistent_cache/block_cache_tier_metadata.h" + +#include + +namespace ROCKSDB_NAMESPACE { + +bool BlockCacheTierMetadata::Insert(BlockCacheFile* file) { + return cache_file_index_.Insert(file); +} + +BlockCacheFile* BlockCacheTierMetadata::Lookup(const uint32_t cache_id) { + BlockCacheFile* ret = nullptr; + BlockCacheFile lookup_key(cache_id); + bool ok = cache_file_index_.Find(&lookup_key, &ret); + if (ok) { + assert(ret->refs_); + return ret; + } + return nullptr; +} + +BlockCacheFile* BlockCacheTierMetadata::Evict() { + using std::placeholders::_1; + auto fn = std::bind(&BlockCacheTierMetadata::RemoveAllKeys, this, _1); + return cache_file_index_.Evict(fn); +} + +void BlockCacheTierMetadata::Clear() { + cache_file_index_.Clear([](BlockCacheFile* arg) { delete arg; }); + block_index_.Clear([](BlockInfo* arg) { delete arg; }); +} + +BlockInfo* BlockCacheTierMetadata::Insert(const Slice& key, const LBA& lba) { + std::unique_ptr binfo(new BlockInfo(key, lba)); + if (!block_index_.Insert(binfo.get())) { + return nullptr; + } + return binfo.release(); +} + +bool BlockCacheTierMetadata::Lookup(const Slice& key, LBA* lba) { + BlockInfo lookup_key(key); + BlockInfo* block; + port::RWMutex* rlock = nullptr; + if (!block_index_.Find(&lookup_key, &block, &rlock)) { + return false; + } + + ReadUnlock _(rlock); + assert(block->key_ == key.ToString()); + if (lba) { + *lba = block->lba_; + } + return true; +} + +BlockInfo* BlockCacheTierMetadata::Remove(const Slice& key) { + BlockInfo lookup_key(key); + BlockInfo* binfo = nullptr; + bool ok __attribute__((__unused__)); + ok = block_index_.Erase(&lookup_key, &binfo); + assert(ok); + return binfo; +} + +void BlockCacheTierMetadata::RemoveAllKeys(BlockCacheFile* f) { + for (BlockInfo* binfo : f->block_infos()) { + BlockInfo* tmp = nullptr; + bool status = block_index_.Erase(binfo, &tmp); + (void)status; + assert(status); + assert(tmp == binfo); + delete binfo; + } + f->block_infos().clear(); +} + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_metadata.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_metadata.h new file mode 100644 index 0000000000..15eb0be3d9 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/block_cache_tier_metadata.h @@ -0,0 +1,122 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + + +#include +#include +#include + +#include "rocksdb/slice.h" +#include "utilities/persistent_cache/block_cache_tier_file.h" +#include "utilities/persistent_cache/hash_table.h" +#include "utilities/persistent_cache/hash_table_evictable.h" +#include "utilities/persistent_cache/lrulist.h" + +namespace ROCKSDB_NAMESPACE { + +// +// Block Cache Tier Metadata +// +// The BlockCacheTierMetadata holds all the metadata associated with block +// cache. It +// fundamentally contains 2 indexes and an LRU. +// +// Block Cache Index +// +// This is a forward index that maps a given key to a LBA (Logical Block +// Address). LBA is a disk pointer that points to a record on the cache. +// +// LBA = { cache-id, offset, size } +// +// Cache File Index +// +// This is a forward index that maps a given cache-id to a cache file object. 
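+//
+// A sketch of the resulting two-step read path (names are illustrative; this
+// mirrors what BlockCacheTier::Lookup does):
+//
+//   LBA lba;
+//   if (metadata.Lookup(key, &lba)) {                      // key -> locator
+//     BlockCacheFile* f = metadata.Lookup(lba.cache_id_);  // locator -> file
+//     if (f) { f->Read(lba, &blk_key, &blk_val, scratch); }
+//   }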
+// Typically you would lookup using LBA and use the object to read or write +struct BlockInfo { + explicit BlockInfo(const Slice& key, const LBA& lba = LBA()) + : key_(key.ToString()), lba_(lba) {} + + std::string key_; + LBA lba_; +}; + +class BlockCacheTierMetadata { + public: + explicit BlockCacheTierMetadata(const uint32_t blocks_capacity = 1024 * 1024, + const uint32_t cachefile_capacity = 10 * 1024) + : cache_file_index_(cachefile_capacity), block_index_(blocks_capacity) {} + + virtual ~BlockCacheTierMetadata() {} + + // Insert a given cache file + bool Insert(BlockCacheFile* file); + + // Lookup cache file based on cache_id + BlockCacheFile* Lookup(const uint32_t cache_id); + + // Insert block information to block index + BlockInfo* Insert(const Slice& key, const LBA& lba); + // bool Insert(BlockInfo* binfo); + + // Lookup block information from block index + bool Lookup(const Slice& key, LBA* lba); + + // Remove a given from the block index + BlockInfo* Remove(const Slice& key); + + // Find and evict a cache file using LRU policy + BlockCacheFile* Evict(); + + // Clear the metadata contents + virtual void Clear(); + + protected: + // Remove all block information from a given file + virtual void RemoveAllKeys(BlockCacheFile* file); + + private: + // Cache file index definition + // + // cache-id => BlockCacheFile + struct BlockCacheFileHash { + uint64_t operator()(const BlockCacheFile* rec) { + return std::hash()(rec->cacheid()); + } + }; + + struct BlockCacheFileEqual { + uint64_t operator()(const BlockCacheFile* lhs, const BlockCacheFile* rhs) { + return lhs->cacheid() == rhs->cacheid(); + } + }; + + using CacheFileIndexType = + EvictableHashTable; + + // Block Lookup Index + // + // key => LBA + struct Hash { + size_t operator()(BlockInfo* node) const { + return std::hash()(node->key_); + } + }; + + struct Equal { + size_t operator()(BlockInfo* lhs, BlockInfo* rhs) const { + return lhs->key_ == rhs->key_; + } + }; + + using BlockIndexType = HashTable; + + CacheFileIndexType cache_file_index_; + BlockIndexType block_index_; +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/hash_table.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/hash_table.h new file mode 100644 index 0000000000..13b2ec7d66 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/hash_table.h @@ -0,0 +1,237 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + + +#include + +#include +#include + +#ifdef OS_LINUX +#include +#endif + +#include "rocksdb/env.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +// HashTable +// +// Traditional implementation of hash table with synchronization built on top +// don't perform very well in multi-core scenarios. This is an implementation +// designed for multi-core scenarios with high lock contention. +// +// |<-------- alpha ------------->| +// Buckets Collision list +// ---- +----+ +---+---+--- ...... ---+---+---+ +// / | |--->| | | | | | +// / +----+ +---+---+--- ...... ---+---+---+ +// / | | +// Locks/ +----+ +// +--+/ . . +// | | . . +// +--+ . . +// | | . . +// +--+ . . +// | | . . +// +--+ . . 
+//          \     +----+
+//           \    |    |
+//            \   +----+
+//             \  |    |
+//              \---- +----+
+//
+// The lock contention is spread over an array of locks. This helps improve
+// concurrent access. The spine is designed for a certain capacity and load
+// factor. When the capacity planning is done correctly we can expect
+// O(load_factor = 1) insert, access and remove time.
+//
+// Micro benchmark on debug build gives about .5 Million/sec rate of insert,
+// erase and lookup in parallel (total of about 1.5 Million ops/sec). If the
+// blocks were of 4K, the hash table can support a virtual throughput of
+// 6 GB/s.
+//
+// T      Object type (contains both key and value)
+// Hash   Function that returns a hash from type T
+// Equal  Returns if two objects are equal
+//        (We need explicit equal for pointer type)
+//
+template <class T, class Hash, class Equal>
+class HashTable {
+ public:
+  explicit HashTable(const size_t capacity = 1024 * 1024,
+                     const float load_factor = 2.0,
+                     const uint32_t nlocks = 256)
+      : nbuckets_(
+            static_cast<uint32_t>(load_factor ? capacity / load_factor : 0)),
+        nlocks_(nlocks) {
+    // pre-conditions
+    assert(capacity);
+    assert(load_factor);
+    assert(nbuckets_);
+    assert(nlocks_);
+
+    buckets_.reset(new Bucket[nbuckets_]);
+#ifdef OS_LINUX
+    mlock(buckets_.get(), nbuckets_ * sizeof(Bucket));
+#endif
+
+    // initialize locks
+    locks_.reset(new port::RWMutex[nlocks_]);
+#ifdef OS_LINUX
+    mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex));
+#endif
+
+    // post-conditions
+    assert(buckets_);
+    assert(locks_);
+  }
+
+  virtual ~HashTable() { AssertEmptyBuckets(); }
+
+  //
+  // Insert given record to hash table
+  //
+  bool Insert(const T& t) {
+    const uint64_t h = Hash()(t);
+    const uint32_t bucket_idx = h % nbuckets_;
+    const uint32_t lock_idx = bucket_idx % nlocks_;
+
+    WriteLock _(&locks_[lock_idx]);
+    auto& bucket = buckets_[bucket_idx];
+    return Insert(&bucket, t);
+  }
+
+  // Lookup hash table
+  //
+  // Please note that the read lock is handed back to the caller still held.
+  // This is because the caller owns the data, and should hold the read lock
+  // for as long as they operate on the data.
+  bool Find(const T& t, T* ret, port::RWMutex** ret_lock) {
+    const uint64_t h = Hash()(t);
+    const uint32_t bucket_idx = h % nbuckets_;
+    const uint32_t lock_idx = bucket_idx % nlocks_;
+
+    port::RWMutex& lock = locks_[lock_idx];
+    lock.ReadLock();
+
+    auto& bucket = buckets_[bucket_idx];
+    if (Find(&bucket, t, ret)) {
+      *ret_lock = &lock;
+      return true;
+    }
+
+    lock.ReadUnlock();
+    return false;
+  }
+
+  //
+  // Erase a given key from the hash table
+  //
+  bool Erase(const T& t, T* ret) {
+    const uint64_t h = Hash()(t);
+    const uint32_t bucket_idx = h % nbuckets_;
+    const uint32_t lock_idx = bucket_idx % nlocks_;
+
+    WriteLock _(&locks_[lock_idx]);
+
+    auto& bucket = buckets_[bucket_idx];
+    return Erase(&bucket, t, ret);
+  }
+
+  // Fetch the mutex associated with a key
+  // This call is used to hold the lock for a given data for an extended
+  // period of time.
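+  //
+  // A usage sketch for the Find/unlock contract (illustrative only; `table`
+  // and `probe` are hypothetical):
+  //
+  //   T found;
+  //   port::RWMutex* rlock = nullptr;
+  //   if (table.Find(probe, &found, &rlock)) {
+  //     // ... read through `found` while the read lock is held ...
+  //     rlock->ReadUnlock();  // the caller must release the lock
+  //   }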
+ port::RWMutex* GetMutex(const T& t) { + const uint64_t h = Hash()(t); + const uint32_t bucket_idx = h % nbuckets_; + const uint32_t lock_idx = bucket_idx % nlocks_; + + return &locks_[lock_idx]; + } + + void Clear(void (*fn)(T)) { + for (uint32_t i = 0; i < nbuckets_; ++i) { + const uint32_t lock_idx = i % nlocks_; + WriteLock _(&locks_[lock_idx]); + for (auto& t : buckets_[i].list_) { + (*fn)(t); + } + buckets_[i].list_.clear(); + } + } + + protected: + // Models bucket of keys that hash to the same bucket number + struct Bucket { + std::list list_; + }; + + // Substitute for std::find with custom comparator operator + typename std::list::iterator Find(std::list* list, const T& t) { + for (auto it = list->begin(); it != list->end(); ++it) { + if (Equal()(*it, t)) { + return it; + } + } + return list->end(); + } + + bool Insert(Bucket* bucket, const T& t) { + // Check if the key already exists + auto it = Find(&bucket->list_, t); + if (it != bucket->list_.end()) { + return false; + } + + // insert to bucket + bucket->list_.push_back(t); + return true; + } + + bool Find(Bucket* bucket, const T& t, T* ret) { + auto it = Find(&bucket->list_, t); + if (it != bucket->list_.end()) { + if (ret) { + *ret = *it; + } + return true; + } + return false; + } + + bool Erase(Bucket* bucket, const T& t, T* ret) { + auto it = Find(&bucket->list_, t); + if (it != bucket->list_.end()) { + if (ret) { + *ret = *it; + } + + bucket->list_.erase(it); + return true; + } + return false; + } + + // assert that all buckets are empty + void AssertEmptyBuckets() { +#ifndef NDEBUG + for (size_t i = 0; i < nbuckets_; ++i) { + WriteLock _(&locks_[i % nlocks_]); + assert(buckets_[i].list_.empty()); + } +#endif + } + + const uint32_t nbuckets_; // No. of buckets in the spine + std::unique_ptr buckets_; // Spine of the hash buckets + const uint32_t nlocks_; // No. of locks + std::unique_ptr locks_; // Granular locks +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/hash_table_evictable.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/hash_table_evictable.h new file mode 100644 index 0000000000..daf4bec1ae --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/hash_table_evictable.h @@ -0,0 +1,166 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + + +#include + +#include "util/random.h" +#include "utilities/persistent_cache/hash_table.h" +#include "utilities/persistent_cache/lrulist.h" + +namespace ROCKSDB_NAMESPACE { + +// Evictable Hash Table +// +// Hash table index where least accessed (or one of the least accessed) elements +// can be evicted. 
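+//
+// Eviction sketch (names are illustrative): Evict() unlinks a victim from one
+// of the per-lock LRU lists, erases it from its bucket, and passes it to an
+// optional cleanup callback before returning it; the caller owns the returned
+// element (BlockCacheTier::Reserve, for instance, deletes it via a unique_ptr):
+//
+//   T* victim = table.Evict([](T* t) { /* detach t's resources */ });
+//   delete victim;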
+// +// Please note EvictableHashTable can only be created for pointer type objects +template +class EvictableHashTable : private HashTable { + public: + using hash_table = HashTable; + + explicit EvictableHashTable(const size_t capacity = 1024 * 1024, + const float load_factor = 2.0, + const uint32_t nlocks = 256) + : HashTable(capacity, load_factor, nlocks), + lru_lists_(new LRUList[hash_table::nlocks_]) { + assert(lru_lists_); + } + + virtual ~EvictableHashTable() { AssertEmptyLRU(); } + + // + // Insert given record to hash table (and LRU list) + // + bool Insert(T* t) { + const uint64_t h = Hash()(t); + typename hash_table::Bucket& bucket = GetBucket(h); + LRUListType& lru = GetLRUList(h); + port::RWMutex& lock = GetMutex(h); + + WriteLock _(&lock); + if (hash_table::Insert(&bucket, t)) { + lru.Push(t); + return true; + } + return false; + } + + // + // Lookup hash table + // + // Please note that read lock should be held by the caller. This is because + // the caller owns the data, and should hold the read lock as long as he + // operates on the data. + bool Find(T* t, T** ret) { + const uint64_t h = Hash()(t); + typename hash_table::Bucket& bucket = GetBucket(h); + LRUListType& lru = GetLRUList(h); + port::RWMutex& lock = GetMutex(h); + + ReadLock _(&lock); + if (hash_table::Find(&bucket, t, ret)) { + ++(*ret)->refs_; + lru.Touch(*ret); + return true; + } + return false; + } + + // + // Evict one of the least recently used object + // + T* Evict(const std::function& fn = nullptr) { + uint32_t random = Random::GetTLSInstance()->Next(); + const size_t start_idx = random % hash_table::nlocks_; + T* t = nullptr; + + // iterate from start_idx .. 0 .. start_idx + for (size_t i = 0; !t && i < hash_table::nlocks_; ++i) { + const size_t idx = (start_idx + i) % hash_table::nlocks_; + + WriteLock _(&hash_table::locks_[idx]); + LRUListType& lru = lru_lists_[idx]; + if (!lru.IsEmpty() && (t = lru.Pop()) != nullptr) { + assert(!t->refs_); + // We got an item to evict, erase from the bucket + const uint64_t h = Hash()(t); + typename hash_table::Bucket& bucket = GetBucket(h); + T* tmp = nullptr; + bool status = hash_table::Erase(&bucket, t, &tmp); + assert(t == tmp); + (void)status; + assert(status); + if (fn) { + fn(t); + } + break; + } + assert(!t); + } + return t; + } + + void Clear(void (*fn)(T*)) { + for (uint32_t i = 0; i < hash_table::nbuckets_; ++i) { + const uint32_t lock_idx = i % hash_table::nlocks_; + WriteLock _(&hash_table::locks_[lock_idx]); + auto& lru_list = lru_lists_[lock_idx]; + auto& bucket = hash_table::buckets_[i]; + for (auto* t : bucket.list_) { + lru_list.Unlink(t); + (*fn)(t); + } + bucket.list_.clear(); + } + // make sure that all LRU lists are emptied + AssertEmptyLRU(); + } + + void AssertEmptyLRU() { +#ifndef NDEBUG + for (uint32_t i = 0; i < hash_table::nlocks_; ++i) { + WriteLock _(&hash_table::locks_[i]); + auto& lru_list = lru_lists_[i]; + assert(lru_list.IsEmpty()); + } +#endif + } + + // + // Fetch the mutex associated with a key + // This call is used to hold the lock for a given data for extended period of + // time. 
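+  //
+  // Note on pinning (illustrative sketch; `table` and `elem` are
+  // hypothetical): Find() bumps refs_ on the element it returns and touches
+  // the LRU, so a found element cannot be evicted until the caller unpins it:
+  //
+  //   T* found = nullptr;
+  //   if (table.Find(elem, &found)) {
+  //     // ... use `found` ...
+  //     --found->refs_;  // unpin; BlockCacheTier::Lookup does the same
+  //   }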
+ port::RWMutex* GetMutex(T* t) { return hash_table::GetMutex(t); } + + private: + using LRUListType = LRUList; + + typename hash_table::Bucket& GetBucket(const uint64_t h) { + const uint32_t bucket_idx = h % hash_table::nbuckets_; + return hash_table::buckets_[bucket_idx]; + } + + LRUListType& GetLRUList(const uint64_t h) { + const uint32_t bucket_idx = h % hash_table::nbuckets_; + const uint32_t lock_idx = bucket_idx % hash_table::nlocks_; + return lru_lists_[lock_idx]; + } + + port::RWMutex& GetMutex(const uint64_t h) { + const uint32_t bucket_idx = h % hash_table::nbuckets_; + const uint32_t lock_idx = bucket_idx % hash_table::nlocks_; + return hash_table::locks_[lock_idx]; + } + + std::unique_ptr lru_lists_; +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/lrulist.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/lrulist.h new file mode 100644 index 0000000000..9a896ed599 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/lrulist.h @@ -0,0 +1,172 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + + +#include + +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +// LRU element definition +// +// Any object that needs to be part of the LRU algorithm should extend this +// class +template +struct LRUElement { + explicit LRUElement() : next_(nullptr), prev_(nullptr), refs_(0) {} + + virtual ~LRUElement() { assert(!refs_); } + + T* next_; + T* prev_; + std::atomic refs_; +}; + +// LRU implementation +// +// In place LRU implementation. There is no copy or allocation involved when +// inserting or removing an element. 
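+// (each element carries its own prev/next links).
+//
+// Intrusive-use sketch (illustrative): a type opts in by deriving from
+// LRUElement<T>, the way BlockCacheFile derives from
+// LRUElement<BlockCacheFile>:
+//
+//   struct Node : LRUElement<Node> { /* payload */ };
+//   LRUList<Node> lru;
+//   Node n;
+//   lru.Push(&n);              // link at the cold end; no allocation
+//   lru.Touch(&n);             // move to the hot end
+//   Node* victim = lru.Pop();  // unlink the coldest unreferenced element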
This makes the data structure slim +template +class LRUList { + public: + virtual ~LRUList() { + MutexLock _(&lock_); + assert(!head_); + assert(!tail_); + } + + // Push element into the LRU at the cold end + inline void Push(T* const t) { + assert(t); + assert(!t->next_); + assert(!t->prev_); + + MutexLock _(&lock_); + + assert((!head_ && !tail_) || (head_ && tail_)); + assert(!head_ || !head_->prev_); + assert(!tail_ || !tail_->next_); + + t->next_ = head_; + if (head_) { + head_->prev_ = t; + } + + head_ = t; + if (!tail_) { + tail_ = t; + } + } + + // Unlink the element from the LRU + inline void Unlink(T* const t) { + MutexLock _(&lock_); + UnlinkImpl(t); + } + + // Evict an element from the LRU + inline T* Pop() { + MutexLock _(&lock_); + + assert(tail_ && head_); + assert(!tail_->next_); + assert(!head_->prev_); + + T* t = head_; + while (t && t->refs_) { + t = t->next_; + } + + if (!t) { + // nothing can be evicted + return nullptr; + } + + assert(!t->refs_); + + // unlike the element + UnlinkImpl(t); + return t; + } + + // Move the element from the front of the list to the back of the list + inline void Touch(T* const t) { + MutexLock _(&lock_); + UnlinkImpl(t); + PushBackImpl(t); + } + + // Check if the LRU is empty + inline bool IsEmpty() const { + MutexLock _(&lock_); + return !head_ && !tail_; + } + + private: + // Unlink an element from the LRU + void UnlinkImpl(T* const t) { + assert(t); + + lock_.AssertHeld(); + + assert(head_ && tail_); + assert(t->prev_ || head_ == t); + assert(t->next_ || tail_ == t); + + if (t->prev_) { + t->prev_->next_ = t->next_; + } + if (t->next_) { + t->next_->prev_ = t->prev_; + } + + if (tail_ == t) { + tail_ = tail_->prev_; + } + if (head_ == t) { + head_ = head_->next_; + } + + t->next_ = t->prev_ = nullptr; + } + + // Insert an element at the hot end + inline void PushBack(T* const t) { + MutexLock _(&lock_); + PushBackImpl(t); + } + + inline void PushBackImpl(T* const t) { + assert(t); + assert(!t->next_); + assert(!t->prev_); + + lock_.AssertHeld(); + + assert((!head_ && !tail_) || (head_ && tail_)); + assert(!head_ || !head_->prev_); + assert(!tail_ || !tail_->next_); + + t->prev_ = tail_; + if (tail_) { + tail_->next_ = t; + } + + tail_ = t; + if (!head_) { + head_ = tail_; + } + } + + mutable port::Mutex lock_; // synchronization primitive + T* head_ = nullptr; // front (cold) + T* tail_ = nullptr; // back (hot) +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_test.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_test.h new file mode 100644 index 0000000000..aab61bb773 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_test.h @@ -0,0 +1,284 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once + + +#include +#include +#include +#include +#include +#include +#include + +#include "db/db_test_util.h" +#include "memory/arena.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "table/block_based/block_builder.h" +#include "test_util/testharness.h" +#include "util/random.h" +#include "utilities/persistent_cache/volatile_tier_impl.h" + +namespace ROCKSDB_NAMESPACE { + +// +// Unit tests for testing PersistentCacheTier +// +class PersistentCacheTierTest : public testing::Test { + public: + PersistentCacheTierTest(); + virtual ~PersistentCacheTierTest() { + if (cache_) { + Status s = cache_->Close(); + assert(s.ok()); + } + } + + protected: + // Flush cache + void Flush() { + if (cache_) { + cache_->TEST_Flush(); + } + } + + // create threaded workload + template + std::list SpawnThreads(const size_t n, const T& fn) { + std::list threads; + for (size_t i = 0; i < n; i++) { + port::Thread th(fn); + threads.push_back(std::move(th)); + } + return threads; + } + + // Wait for threads to join + void Join(std::list&& threads) { + for (auto& th : threads) { + th.join(); + } + threads.clear(); + } + + // Run insert workload in threads + void Insert(const size_t nthreads, const size_t max_keys) { + key_ = 0; + max_keys_ = max_keys; + // spawn threads + auto fn = std::bind(&PersistentCacheTierTest::InsertImpl, this); + auto threads = SpawnThreads(nthreads, fn); + // join with threads + Join(std::move(threads)); + // Flush cache + Flush(); + } + + // Run verification on the cache + void Verify(const size_t nthreads = 1, const bool eviction_enabled = false) { + stats_verify_hits_ = 0; + stats_verify_missed_ = 0; + key_ = 0; + // spawn threads + auto fn = + std::bind(&PersistentCacheTierTest::VerifyImpl, this, eviction_enabled); + auto threads = SpawnThreads(nthreads, fn); + // join with threads + Join(std::move(threads)); + } + + // pad 0 to numbers + std::string PaddedNumber(const size_t data, const size_t pad_size) { + assert(pad_size); + char* ret = new char[pad_size]; + int pos = static_cast(pad_size) - 1; + size_t count = 0; + size_t t = data; + // copy numbers + while (t) { + count++; + ret[pos--] = '0' + t % 10; + t = t / 10; + } + // copy 0s + while (pos >= 0) { + ret[pos--] = '0'; + } + // post condition + assert(count <= pad_size); + assert(pos == -1); + std::string result(ret, pad_size); + delete[] ret; + return result; + } + + // Insert workload implementation + void InsertImpl() { + const std::string prefix = "key_prefix_"; + + while (true) { + size_t i = key_++; + if (i >= max_keys_) { + break; + } + + char data[4 * 1024]; + memset(data, '0' + (i % 10), sizeof(data)); + auto k = prefix + PaddedNumber(i, /*count=*/8); + Slice key(k); + while (true) { + Status status = cache_->Insert(key, data, sizeof(data)); + if (status.ok()) { + break; + } + ASSERT_TRUE(status.IsTryAgain()); + Env::Default()->SleepForMicroseconds(1 * 1000 * 1000); + } + } + } + + // Verification implementation + void VerifyImpl(const bool eviction_enabled = false) { + const std::string prefix = "key_prefix_"; + while (true) { + size_t i = key_++; + if (i >= max_keys_) { + break; + } + + char edata[4 * 1024]; + memset(edata, '0' + (i % 10), sizeof(edata)); + auto k = prefix + PaddedNumber(i, /*count=*/8); + Slice key(k); + std::unique_ptr block; + size_t block_size; + + if (eviction_enabled) { + if (!cache_->Lookup(key, &block, &block_size).ok()) { + // assume that the key is evicted + stats_verify_missed_++; + continue; + } + } + + ASSERT_OK(cache_->Lookup(key, &block, &block_size)); + 
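+      // The block returned by Lookup must match the 4 KiB pattern that
+      // InsertImpl wrote for this key, both in size and in content.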
ASSERT_EQ(block_size, sizeof(edata)); + ASSERT_EQ(memcmp(edata, block.get(), sizeof(edata)), 0); + stats_verify_hits_++; + } + } + + // template for insert test + void RunInsertTest(const size_t nthreads, const size_t max_keys) { + Insert(nthreads, max_keys); + Verify(nthreads); + ASSERT_EQ(stats_verify_hits_, max_keys); + ASSERT_EQ(stats_verify_missed_, 0); + + ASSERT_OK(cache_->Close()); + cache_.reset(); + } + + // template for negative insert test + void RunNegativeInsertTest(const size_t nthreads, const size_t max_keys) { + Insert(nthreads, max_keys); + Verify(nthreads, /*eviction_enabled=*/true); + ASSERT_LT(stats_verify_hits_, max_keys); + ASSERT_GT(stats_verify_missed_, 0); + + ASSERT_OK(cache_->Close()); + cache_.reset(); + } + + // template for insert with eviction test + void RunInsertTestWithEviction(const size_t nthreads, const size_t max_keys) { + Insert(nthreads, max_keys); + Verify(nthreads, /*eviction_enabled=*/true); + ASSERT_EQ(stats_verify_hits_ + stats_verify_missed_, max_keys); + ASSERT_GT(stats_verify_hits_, 0); + ASSERT_GT(stats_verify_missed_, 0); + + ASSERT_OK(cache_->Close()); + cache_.reset(); + } + + const std::string path_; + std::shared_ptr log_; + std::shared_ptr cache_; + std::atomic key_{0}; + size_t max_keys_ = 0; + std::atomic stats_verify_hits_{0}; + std::atomic stats_verify_missed_{0}; +}; + +// +// RocksDB tests +// +class PersistentCacheDBTest : public DBTestBase { + public: + PersistentCacheDBTest(); + + static uint64_t TestGetTickerCount(const Options& options, + Tickers ticker_type) { + return static_cast( + options.statistics->getTickerCount(ticker_type)); + } + + // insert data to table + void Insert(const Options& options, + const BlockBasedTableOptions& /*table_options*/, + const int num_iter, std::vector* values) { + CreateAndReopenWithCF({"pikachu"}, options); + // default column family doesn't have block cache + Options no_block_cache_opts; + no_block_cache_opts.statistics = options.statistics; + no_block_cache_opts = CurrentOptions(no_block_cache_opts); + BlockBasedTableOptions table_options_no_bc; + table_options_no_bc.no_block_cache = true; + no_block_cache_opts.table_factory.reset( + NewBlockBasedTableFactory(table_options_no_bc)); + ReopenWithColumnFamilies( + {"default", "pikachu"}, + std::vector({no_block_cache_opts, options})); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + std::string str; + for (int i = 0; i < num_iter; i++) { + if (i % 4 == 0) { // high compression ratio + str = rnd.RandomString(1000); + } + values->push_back(str); + ASSERT_OK(Put(1, Key(i), (*values)[i])); + } + + // flush all data from memtable so that reads are from block cache + ASSERT_OK(Flush(1)); + } + + // verify data + void Verify(const int num_iter, const std::vector& values) { + for (int j = 0; j < 2; ++j) { + for (int i = 0; i < num_iter; i++) { + ASSERT_EQ(Get(1, Key(i)), values[i]); + } + } + } + + // test template + void RunTest(const std::function(bool)>& + new_pcache, + const size_t max_keys, const size_t max_usecase); +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_tier.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_tier.cc new file mode 100644 index 0000000000..773aafbf26 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_tier.cc @@ -0,0 +1,165 @@ +// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "utilities/persistent_cache/persistent_cache_tier.h" + +#include +#include +#include + +namespace ROCKSDB_NAMESPACE { + +std::string PersistentCacheConfig::ToString() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " path: %s\n", path.c_str()); + ret.append(buffer); + snprintf(buffer, kBufferSize, " enable_direct_reads: %d\n", + enable_direct_reads); + ret.append(buffer); + snprintf(buffer, kBufferSize, " enable_direct_writes: %d\n", + enable_direct_writes); + ret.append(buffer); + snprintf(buffer, kBufferSize, " cache_size: %" PRIu64 "\n", cache_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " cache_file_size: %" PRIu32 "\n", + cache_file_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " writer_qdepth: %" PRIu32 "\n", + writer_qdepth); + ret.append(buffer); + snprintf(buffer, kBufferSize, " pipeline_writes: %d\n", pipeline_writes); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " max_write_pipeline_backlog_size: %" PRIu64 "\n", + max_write_pipeline_backlog_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " write_buffer_size: %" PRIu32 "\n", + write_buffer_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " writer_dispatch_size: %" PRIu64 "\n", + writer_dispatch_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " is_compressed: %d\n", is_compressed); + ret.append(buffer); + + return ret; +} + +// +// PersistentCacheTier implementation +// +Status PersistentCacheTier::Open() { + if (next_tier_) { + return next_tier_->Open(); + } + return Status::OK(); +} + +Status PersistentCacheTier::Close() { + if (next_tier_) { + return next_tier_->Close(); + } + return Status::OK(); +} + +bool PersistentCacheTier::Reserve(const size_t /*size*/) { + // default implementation is a pass through + return true; +} + +bool PersistentCacheTier::Erase(const Slice& /*key*/) { + // default implementation is a pass through since not all cache tiers might + // support erase + return true; +} + +std::string PersistentCacheTier::PrintStats() { + std::ostringstream os; + for (auto tier_stats : Stats()) { + os << "---- next tier -----" << std::endl; + for (auto stat : tier_stats) { + os << stat.first << ": " << stat.second << std::endl; + } + } + return os.str(); +} + +PersistentCache::StatsType PersistentCacheTier::Stats() { + if (next_tier_) { + return next_tier_->Stats(); + } + return PersistentCache::StatsType{}; +} + +uint64_t PersistentCacheTier::NewId() { + return last_id_.fetch_add(1, std::memory_order_relaxed); +} + +// +// PersistentTieredCache implementation +// +PersistentTieredCache::~PersistentTieredCache() { assert(tiers_.empty()); } + +Status PersistentTieredCache::Open() { + assert(!tiers_.empty()); + return tiers_.front()->Open(); +} + +Status PersistentTieredCache::Close() { + assert(!tiers_.empty()); + Status status = tiers_.front()->Close(); + if (status.ok()) { + tiers_.clear(); + } + return status; +} + +bool PersistentTieredCache::Erase(const Slice& key) { + assert(!tiers_.empty()); + return tiers_.front()->Erase(key); +} + +PersistentCache::StatsType PersistentTieredCache::Stats() { + assert(!tiers_.empty()); + return tiers_.front()->Stats(); +} + +std::string PersistentTieredCache::PrintStats() { + 
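+  // Stats()/PrintStats() recurse through next_tier_, so delegating to the
+  // front tier below reports every tier in the stack.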
+  assert(!tiers_.empty());
+  return tiers_.front()->PrintStats();
+}
+
+Status PersistentTieredCache::Insert(const Slice& page_key, const char* data,
+                                     const size_t size) {
+  assert(!tiers_.empty());
+  return tiers_.front()->Insert(page_key, data, size);
+}
+
+Status PersistentTieredCache::Lookup(const Slice& page_key,
+                                     std::unique_ptr<char[]>* data,
+                                     size_t* size) {
+  assert(!tiers_.empty());
+  return tiers_.front()->Lookup(page_key, data, size);
+}
+
+void PersistentTieredCache::AddTier(const Tier& tier) {
+  if (!tiers_.empty()) {
+    tiers_.back()->set_next_tier(tier);
+  }
+  tiers_.push_back(tier);
+}
+
+bool PersistentTieredCache::IsCompressed() {
+  assert(tiers_.size());
+  return tiers_.front()->IsCompressed();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_tier.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_tier.h
new file mode 100644
index 0000000000..44d2fbba31
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_tier.h
@@ -0,0 +1,340 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <limits>
+#include <list>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "monitoring/histogram.h"
+#include "rocksdb/env.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+
+// Persistent Cache
+//
+// Persistent cache is a tiered key-value cache that can use a persistent
+// medium. It is a generic design and can leverage any storage medium --
+// disk/SSD/NVM/RAM. The code has been kept generic but significant
+// benchmark/design/development time has been spent to make sure the cache
+// performs appropriately for the respective storage medium.
+// The file defines
+// PersistentCacheTier   : Implementation that handles an individual cache tier
+// PersistentTieredCache : Implementation that handles all tiers as a logical
+//                         unit
+//
+// PersistentTieredCache architecture:
+// +--------------------------+ PersistentCacheTier that handles multiple tiers
+// | +----------------+       |
+// | |     RAM        |       | PersistentCacheTier that handles RAM
+// | +----------------+       | (VolatileCacheImpl)
+// |        | next            |
+// |        v                 |
+// | +----------------+       |
+// | |     NVM        |       | PersistentCacheTier implementation that
+// | +----------------+       | handles NVM (BlockCacheImpl)
+// |        | next            |
+// |        V                 |
+// | +----------------+       |
+// | |    LE-SSD      |       | PersistentCacheTier implementation that
+// | +----------------+       | handles LE-SSD (BlockCacheImpl)
+// |        |                 |
+// |        V                 |
+// |       null               |
+// +--------------------------+
+//              |
+//              V
+//             null
+namespace ROCKSDB_NAMESPACE {
+
+// Persistent Cache Config
+//
+// This struct captures all the options that are used to configure the
+// persistent cache. Some of the terms used in naming the options are
+//
+// dispatch size :
+// This is the size in which IO is dispatched to the device
+//
+// write buffer size :
+// This is the size of an individual write buffer. Write buffers are
+// grouped to form a buffered file.
+//
+// cache size :
+// This is the logical maximum for the cache size
+//
+// qdepth :
+// This is the max number of IOs that can be issued to the device in parallel
+//
+// pipelining :
+// The writer code path follows a pipelined architecture, which means the
+// operations are handed off from one stage to another
+//
+// pipelining backlog size :
+// With the pipelined architecture, there can always be a backlog of ops in
+// the pipeline queues. This is the maximum backlog size, after which ops are
+// dropped from the queue
+struct PersistentCacheConfig {
+  explicit PersistentCacheConfig(
+      Env* const _env, const std::string& _path, const uint64_t _cache_size,
+      const std::shared_ptr<Logger>& _log,
+      const uint32_t _write_buffer_size = 1 * 1024 * 1024 /*1MB*/) {
+    env = _env;
+    clock = (env != nullptr) ? env->GetSystemClock().get()
+                             : SystemClock::Default().get();
+    path = _path;
+    log = _log;
+    cache_size = _cache_size;
+    writer_dispatch_size = write_buffer_size = _write_buffer_size;
+  }
+
+  //
+  // Validate the settings. Our intention is to catch erroneous settings ahead
+  // of time instead of violating invariants or causing deadlocks.
+  //
+  Status ValidateSettings() const {
+    // (1) check pre-conditions for variables
+    if (!env || path.empty()) {
+      return Status::InvalidArgument("empty or null args");
+    }
+
+    // (2) assert size related invariants
+    // - cache size cannot be less than cache file size
+    // - individual write buffer size cannot be greater than cache file size
+    // - total write buffer size cannot be less than 2X cache file size
+    if (cache_size < cache_file_size || write_buffer_size >= cache_file_size ||
+        write_buffer_size * write_buffer_count() < 2 * cache_file_size) {
+      return Status::InvalidArgument("invalid cache size");
+    }
+
+    // (3) check writer settings
+    // - Queue depth cannot be 0
+    // - writer_dispatch_size cannot be greater than write_buffer_size
+    // - dispatch size and buffer size need to be aligned
+    if (!writer_qdepth || writer_dispatch_size > write_buffer_size ||
+        write_buffer_size % writer_dispatch_size) {
+      return Status::InvalidArgument("invalid writer settings");
+    }
+
+    return Status::OK();
+  }
+
+  //
+  // Env abstraction to use for system level operations
+  //
+  Env* env;
+  SystemClock* clock;
+  //
+  // Path for the block cache where blocks are persisted
+  //
+  std::string path;
+
+  //
+  // Log handle for logging messages
+  //
+  std::shared_ptr<Logger> log;
+
+  //
+  // Enable direct IO for reading
+  //
+  bool enable_direct_reads = true;
+
+  //
+  // Enable direct IO for writing
+  //
+  bool enable_direct_writes = false;
+
+  //
+  // Logical cache size
+  //
+  uint64_t cache_size = std::numeric_limits<uint64_t>::max();
+
+  // cache-file-size
+  //
+  // The cache consists of multiple small files. This parameter defines the
+  // size of an individual cache file
+  //
+  // default: 100MB
+  uint32_t cache_file_size = 100ULL * 1024 * 1024;
+
+  // writer-qdepth
+  //
+  // The writers can issue IO to the devices in parallel. This parameter
+  // controls the max number of IOs that can be issued in parallel to the
+  // block device
+  //
+  // default: 1
+  uint32_t writer_qdepth = 1;
+
+  // pipeline-writes
+  //
+  // Writes optionally follow a pipelined architecture. This helps
+  // avoid regression in the eviction code path of the primary tier. This
+  // parameter defines if pipelining is enabled or disabled
+  //
+  // default: true
+  bool pipeline_writes = true;
+
+  // max-write-pipeline-backlog-size
+  //
+  // Max pipeline buffer size. This is the maximum backlog we can accumulate
+  // while waiting for writes. After the limit, new ops will be dropped.
+  //
+  // Default: 1GiB
+  uint64_t max_write_pipeline_backlog_size = 1ULL * 1024 * 1024 * 1024;
+
+  // write-buffer-size
+  //
+  // This is the size in which buffer slabs are allocated.
+  //
+  // Default: 1M
+  uint32_t write_buffer_size = 1ULL * 1024 * 1024;
+
+  // write-buffer-count
+  //
+  // This is the total number of buffer slabs. This is calculated as a factor
+  // of file size in order to avoid deadlock.
+  size_t write_buffer_count() const {
+    assert(write_buffer_size);
+    return static_cast<size_t>((writer_qdepth + 1.2) * cache_file_size /
+                               write_buffer_size);
+  }
+
+  // writer-dispatch-size
+  //
+  // The writer thread will dispatch the IO at the specified IO size
+  //
+  // default: 1M
+  uint64_t writer_dispatch_size = 1ULL * 1024 * 1024;
+
+  // is_compressed
+  //
+  // This option determines if the cache will run in compressed mode or
+  // uncompressed mode
+  bool is_compressed = true;
+
+  PersistentCacheConfig MakePersistentCacheConfig(
+      const std::string& path, const uint64_t size,
+      const std::shared_ptr<Logger>& log);
+
+  std::string ToString() const;
+};
+
+// Persistent Cache Tier
+//
+// This is a logical abstraction that defines a tier of the persistent cache.
+// Tiers can be stacked over one another. PersistentCache provides the basic
+// definition for accessing/storing in the cache. PersistentCacheTier extends
+// the interface to enable management and stacking of tiers.
+class PersistentCacheTier : public PersistentCache {
+ public:
+  using Tier = std::shared_ptr<PersistentCacheTier>;
+
+  virtual ~PersistentCacheTier() {}
+
+  // Open the persistent cache tier
+  virtual Status Open();
+
+  // Close the persistent cache tier
+  virtual Status Close();
+
+  // Reserve space up to 'size' bytes
+  virtual bool Reserve(const size_t size);
+
+  // Erase a key from the cache
+  virtual bool Erase(const Slice& key);
+
+  // Print stats to string recursively
+  virtual std::string PrintStats();
+
+  virtual PersistentCache::StatsType Stats() override;
+
+  // Insert to page cache
+  virtual Status Insert(const Slice& page_key, const char* data,
+                        const size_t size) override = 0;
+
+  // Lookup page cache by page identifier
+  virtual Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+                        size_t* size) override = 0;
+
+  // Does it store compressed data?
+  virtual bool IsCompressed() override = 0;
+
+  virtual std::string GetPrintableOptions() const override = 0;
+
+  virtual uint64_t NewId() override;
+
+  // Return a reference to next tier
+  virtual Tier& next_tier() { return next_tier_; }
+
+  // Set the value for next tier
+  virtual void set_next_tier(const Tier& tier) {
+    assert(!next_tier_);
+    next_tier_ = tier;
+  }
+
+  virtual void TEST_Flush() {
+    if (next_tier_) {
+      next_tier_->TEST_Flush();
+    }
+  }
+
+ private:
+  Tier next_tier_;  // next tier
+  std::atomic<uint64_t> last_id_{1};
+};
+
+// PersistentTieredCache
+//
+// Abstraction that helps you construct a tier of persistent caches as a
+// unified cache. The tiers of the cache act as a single tier for management
+// ease and support the PersistentCache methods for accessing data.
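+//
+// NOTE (editor's sketch): a minimal, hypothetical stacking of tiers; only
+// VolatileCacheTier exists in this vendored tree, the SSD tier is assumed:
+//
+//   auto cache = std::make_shared<PersistentTieredCache>();
+//   cache->AddTier(std::make_shared<VolatileCacheTier>());  // RAM tier
+//   cache->AddTier(my_ssd_tier);   // hypothetical PersistentCacheTier
+//   Status s = cache->Open();      // opens the chain front-to-back
+//   // Inserts and lookups hit the front tier; evictions spill downward.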
+class PersistentTieredCache : public PersistentCacheTier {
+ public:
+  virtual ~PersistentTieredCache();
+
+  Status Open() override;
+  Status Close() override;
+  bool Erase(const Slice& key) override;
+  std::string PrintStats() override;
+  PersistentCache::StatsType Stats() override;
+  Status Insert(const Slice& page_key, const char* data,
+                const size_t size) override;
+  Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+                size_t* size) override;
+  bool IsCompressed() override;
+
+  std::string GetPrintableOptions() const override {
+    return "PersistentTieredCache";
+  }
+
+  void AddTier(const Tier& tier);
+
+  Tier& next_tier() override {
+    // the chain continues from the last tier in the list
+    auto it = tiers_.end();
+    --it;
+    return (*it)->next_tier();
+  }
+
+  void set_next_tier(const Tier& tier) override {
+    // append to the last tier in the list
+    auto it = tiers_.end();
+    --it;
+    (*it)->set_next_tier(tier);
+  }
+
+  void TEST_Flush() override {
+    assert(!tiers_.empty());
+    tiers_.front()->TEST_Flush();
+    PersistentCacheTier::TEST_Flush();
+  }
+
+ protected:
+  std::list<Tier> tiers_;  // list of tiers top-down
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_util.h
new file mode 100644
index 0000000000..2a769652db
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/persistent_cache_util.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <limits>
+#include <list>
+
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Simple synchronized queue implementation with the option of
+// bounding the queue
+//
+// On overflow, the elements will be discarded
+//
+template <class T>
+class BoundedQueue {
+ public:
+  explicit BoundedQueue(
+      const size_t max_size = std::numeric_limits<size_t>::max())
+      : cond_empty_(&lock_), max_size_(max_size) {}
+
+  virtual ~BoundedQueue() {}
+
+  void Push(T&& t) {
+    MutexLock _(&lock_);
+    if (max_size_ != std::numeric_limits<size_t>::max() &&
+        size_ + t.Size() >= max_size_) {
+      // overflow
+      return;
+    }
+
+    size_ += t.Size();
+    q_.push_back(std::move(t));
+    cond_empty_.SignalAll();
+  }
+
+  T Pop() {
+    MutexLock _(&lock_);
+    while (q_.empty()) {
+      cond_empty_.Wait();
+    }
+
+    T t = std::move(q_.front());
+    size_ -= t.Size();
+    q_.pop_front();
+    return t;
+  }
+
+  size_t Size() const {
+    MutexLock _(&lock_);
+    return size_;
+  }
+
+ private:
+  mutable port::Mutex lock_;
+  port::CondVar cond_empty_;
+  std::list<T> q_;
+  size_t size_ = 0;
+  const size_t max_size_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/volatile_tier_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/volatile_tier_impl.cc
new file mode 100644
index 0000000000..6264f5ef3a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/volatile_tier_impl.cc
@@ -0,0 +1,138 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// + +#include "utilities/persistent_cache/volatile_tier_impl.h" + +#include + +namespace ROCKSDB_NAMESPACE { + +void VolatileCacheTier::DeleteCacheData(VolatileCacheTier::CacheData* data) { + assert(data); + delete data; +} + +VolatileCacheTier::~VolatileCacheTier() { index_.Clear(&DeleteCacheData); } + +PersistentCache::StatsType VolatileCacheTier::Stats() { + std::map stat; + stat.insert({"persistent_cache.volatile_cache.hits", + static_cast(stats_.cache_hits_)}); + stat.insert({"persistent_cache.volatile_cache.misses", + static_cast(stats_.cache_misses_)}); + stat.insert({"persistent_cache.volatile_cache.inserts", + static_cast(stats_.cache_inserts_)}); + stat.insert({"persistent_cache.volatile_cache.evicts", + static_cast(stats_.cache_evicts_)}); + stat.insert({"persistent_cache.volatile_cache.hit_pct", + static_cast(stats_.CacheHitPct())}); + stat.insert({"persistent_cache.volatile_cache.miss_pct", + static_cast(stats_.CacheMissPct())}); + + auto out = PersistentCacheTier::Stats(); + out.push_back(stat); + return out; +} + +Status VolatileCacheTier::Insert(const Slice& page_key, const char* data, + const size_t size) { + // precondition + assert(data); + assert(size); + + // increment the size + size_ += size; + + // check if we have overshot the limit, if so evict some space + while (size_ > max_size_) { + if (!Evict()) { + // unable to evict data, we give up so we don't spike read + // latency + assert(size_ >= size); + size_ -= size; + return Status::TryAgain("Unable to evict any data"); + } + } + + assert(size_ >= size); + + // insert order: LRU, followed by index + std::string key(page_key.data(), page_key.size()); + std::string value(data, size); + std::unique_ptr cache_data( + new CacheData(std::move(key), std::move(value))); + bool ok = index_.Insert(cache_data.get()); + if (!ok) { + // decrement the size that we incremented ahead of time + assert(size_ >= size); + size_ -= size; + // failed to insert to cache, block already in cache + return Status::TryAgain("key already exists in volatile cache"); + } + + cache_data.release(); + stats_.cache_inserts_++; + return Status::OK(); +} + +Status VolatileCacheTier::Lookup(const Slice& page_key, + std::unique_ptr* result, + size_t* size) { + CacheData key(std::move(page_key.ToString())); + CacheData* kv; + bool ok = index_.Find(&key, &kv); + if (ok) { + // set return data + result->reset(new char[kv->value.size()]); + memcpy(result->get(), kv->value.c_str(), kv->value.size()); + *size = kv->value.size(); + // drop the reference on cache data + kv->refs_--; + // update stats + stats_.cache_hits_++; + return Status::OK(); + } + + stats_.cache_misses_++; + + if (next_tier()) { + return next_tier()->Lookup(page_key, result, size); + } + + return Status::NotFound("key not found in volatile cache"); +} + +bool VolatileCacheTier::Erase(const Slice& /*key*/) { + assert(!"not supported"); + return true; +} + +bool VolatileCacheTier::Evict() { + CacheData* edata = index_.Evict(); + if (!edata) { + // not able to evict any object + return false; + } + + stats_.cache_evicts_++; + + // push the evicted object to the next level + if (next_tier()) { + // TODO: Should the insert error be ignored? 
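+  // Demote the victim to the next (colder) tier so a later miss here can
+  // still be served further down the stack; failure to demote only loses
+  // cache residency, never correctness, hence the unchecked status.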
+ Status s = next_tier()->Insert(Slice(edata->key), edata->value.c_str(), + edata->value.size()); + s.PermitUncheckedError(); + } + + // adjust size and destroy data + size_ -= edata->value.size(); + delete edata; + + return true; +} + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/volatile_tier_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/volatile_tier_impl.h new file mode 100644 index 0000000000..f5d3064438 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/persistent_cache/volatile_tier_impl.h @@ -0,0 +1,139 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + + +#include +#include +#include +#include +#include + +#include "rocksdb/cache.h" +#include "utilities/persistent_cache/hash_table.h" +#include "utilities/persistent_cache/hash_table_evictable.h" +#include "utilities/persistent_cache/persistent_cache_tier.h" + +// VolatileCacheTier +// +// This file provides persistent cache tier implementation for caching +// key/values in RAM. +// +// key/values +// | +// V +// +-------------------+ +// | VolatileCacheTier | Store in an evictable hash table +// +-------------------+ +// | +// V +// on eviction +// pushed to next tier +// +// The implementation is designed to be concurrent. The evictable hash table +// implementation is not concurrent at this point though. +// +// The eviction algorithm is LRU +namespace ROCKSDB_NAMESPACE { + +class VolatileCacheTier : public PersistentCacheTier { + public: + explicit VolatileCacheTier( + const bool is_compressed = true, + const size_t max_size = std::numeric_limits::max()) + : is_compressed_(is_compressed), max_size_(max_size) {} + + virtual ~VolatileCacheTier(); + + // insert to cache + Status Insert(const Slice& page_key, const char* data, + const size_t size) override; + // lookup key in cache + Status Lookup(const Slice& page_key, std::unique_ptr* data, + size_t* size) override; + + // is compressed cache ? 
+ bool IsCompressed() override { return is_compressed_; } + + // erase key from cache + bool Erase(const Slice& key) override; + + std::string GetPrintableOptions() const override { + return "VolatileCacheTier"; + } + + // Expose stats as map + PersistentCache::StatsType Stats() override; + + private: + // + // Cache data abstraction + // + struct CacheData : LRUElement { + explicit CacheData(CacheData&& rhs) noexcept + : key(std::move(rhs.key)), value(std::move(rhs.value)) {} + + explicit CacheData(const std::string& _key, const std::string& _value = "") + : key(_key), value(_value) {} + + virtual ~CacheData() {} + + const std::string key; + const std::string value; + }; + + static void DeleteCacheData(CacheData* data); + + // + // Index and LRU definition + // + struct CacheDataHash { + uint64_t operator()(const CacheData* obj) const { + assert(obj); + return std::hash()(obj->key); + } + }; + + struct CacheDataEqual { + bool operator()(const CacheData* lhs, const CacheData* rhs) const { + assert(lhs); + assert(rhs); + return lhs->key == rhs->key; + } + }; + + struct Statistics { + std::atomic cache_misses_{0}; + std::atomic cache_hits_{0}; + std::atomic cache_inserts_{0}; + std::atomic cache_evicts_{0}; + + double CacheHitPct() const { + auto lookups = cache_hits_ + cache_misses_; + return lookups ? 100 * cache_hits_ / static_cast(lookups) : 0.0; + } + + double CacheMissPct() const { + auto lookups = cache_hits_ + cache_misses_; + return lookups ? 100 * cache_misses_ / static_cast(lookups) : 0.0; + } + }; + + using IndexType = + EvictableHashTable; + + // Evict LRU tail + bool Evict(); + + const bool is_compressed_ = true; // does it store compressed data + IndexType index_; // in-memory cache + std::atomic max_size_{0}; // Maximum size of the cache + std::atomic size_{0}; // Size of the cache + Statistics stats_; +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/cache_simulator.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/cache_simulator.cc new file mode 100644 index 0000000000..edb75d545c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/cache_simulator.cc @@ -0,0 +1,287 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "utilities/simulator_cache/cache_simulator.h" + +#include + +#include "db/dbformat.h" +#include "rocksdb/trace_record.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +const std::string kGhostCachePrefix = "ghost_"; +} // namespace + +GhostCache::GhostCache(std::shared_ptr sim_cache) + : sim_cache_(sim_cache) {} + +bool GhostCache::Admit(const Slice& lookup_key) { + auto handle = sim_cache_->Lookup(lookup_key); + if (handle != nullptr) { + sim_cache_->Release(handle); + return true; + } + // TODO: Should we check for errors here? 
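+  // A miss reaches here: remember the key (charged by its size) so that a
+  // second access within the ghost LRU's capacity is admitted for real.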
+ auto s = sim_cache_->Insert(lookup_key, /*obj=*/nullptr, + &kNoopCacheItemHelper, lookup_key.size()); + s.PermitUncheckedError(); + return false; +} + +CacheSimulator::CacheSimulator(std::unique_ptr&& ghost_cache, + std::shared_ptr sim_cache) + : ghost_cache_(std::move(ghost_cache)), sim_cache_(sim_cache) {} + +void CacheSimulator::Access(const BlockCacheTraceRecord& access) { + bool admit = true; + const bool is_user_access = + BlockCacheTraceHelper::IsUserAccess(access.caller); + bool is_cache_miss = true; + if (ghost_cache_ && !access.no_insert) { + admit = ghost_cache_->Admit(access.block_key); + } + auto handle = sim_cache_->Lookup(access.block_key); + if (handle != nullptr) { + sim_cache_->Release(handle); + is_cache_miss = false; + } else { + if (!access.no_insert && admit && access.block_size > 0) { + // Ignore errors on insert + auto s = sim_cache_->Insert(access.block_key, /*obj=*/nullptr, + &kNoopCacheItemHelper, access.block_size); + s.PermitUncheckedError(); + } + } + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, is_user_access, + is_cache_miss); +} + +void MissRatioStats::UpdateMetrics(uint64_t timestamp_in_ms, + bool is_user_access, bool is_cache_miss) { + uint64_t timestamp_in_seconds = timestamp_in_ms / kMicrosInSecond; + num_accesses_timeline_[timestamp_in_seconds] += 1; + num_accesses_ += 1; + if (num_misses_timeline_.find(timestamp_in_seconds) == + num_misses_timeline_.end()) { + num_misses_timeline_[timestamp_in_seconds] = 0; + } + if (is_cache_miss) { + num_misses_ += 1; + num_misses_timeline_[timestamp_in_seconds] += 1; + } + if (is_user_access) { + user_accesses_ += 1; + if (is_cache_miss) { + user_misses_ += 1; + } + } +} + +Cache::Priority PrioritizedCacheSimulator::ComputeBlockPriority( + const BlockCacheTraceRecord& access) const { + if (access.block_type == TraceType::kBlockTraceFilterBlock || + access.block_type == TraceType::kBlockTraceIndexBlock || + access.block_type == TraceType::kBlockTraceUncompressionDictBlock) { + return Cache::Priority::HIGH; + } + return Cache::Priority::LOW; +} + +void PrioritizedCacheSimulator::AccessKVPair( + const Slice& key, uint64_t value_size, Cache::Priority priority, + const BlockCacheTraceRecord& access, bool no_insert, bool is_user_access, + bool* is_cache_miss, bool* admitted, bool update_metrics) { + assert(is_cache_miss); + assert(admitted); + *is_cache_miss = true; + *admitted = true; + if (ghost_cache_ && !no_insert) { + *admitted = ghost_cache_->Admit(key); + } + auto handle = sim_cache_->Lookup(key); + if (handle != nullptr) { + sim_cache_->Release(handle); + *is_cache_miss = false; + } else if (!no_insert && *admitted && value_size > 0) { + // TODO: Should we check for an error here? + auto s = sim_cache_->Insert(key, /*obj=*/nullptr, &kNoopCacheItemHelper, + value_size, + /*handle=*/nullptr, priority); + s.PermitUncheckedError(); + } + if (update_metrics) { + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, is_user_access, + *is_cache_miss); + } +} + +void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { + bool is_cache_miss = true; + bool admitted = true; + AccessKVPair(access.block_key, access.block_size, + ComputeBlockPriority(access), access, access.no_insert, + BlockCacheTraceHelper::IsUserAccess(access.caller), + &is_cache_miss, &admitted, /*update_metrics=*/true); +} + +void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { + // TODO (haoyu): We only support Get for now. 
We need to extend the tracing + // for MultiGet, i.e., non-data block accesses must log all keys in a + // MultiGet. + bool is_cache_miss = true; + bool admitted = false; + if (access.caller == TableReaderCaller::kUserGet && + access.get_id != BlockCacheTraceHelper::kReservedGetId) { + // This is a Get request. + const std::string& row_key = BlockCacheTraceHelper::ComputeRowKey(access); + GetRequestStatus& status = getid_status_map_[access.get_id]; + if (status.is_complete) { + // This Get request completes. + // Skip future accesses to its index/filter/data + // blocks. These block lookups are unnecessary if we observe a hit for the + // referenced key-value pair already. Thus, we treat these lookups as + // hits. This is also to ensure the total number of accesses are the same + // when comparing to other policies. + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, + /*is_user_access=*/true, + /*is_cache_miss=*/false); + return; + } + if (status.row_key_status.find(row_key) == status.row_key_status.end()) { + // This is the first time that this key is accessed. Look up the key-value + // pair first. Do not update the miss/accesses metrics here since it will + // be updated later. + AccessKVPair(row_key, access.referenced_data_size, Cache::Priority::HIGH, + access, + /*no_insert=*/false, + /*is_user_access=*/true, &is_cache_miss, &admitted, + /*update_metrics=*/false); + InsertResult result = InsertResult::NO_INSERT; + if (admitted && access.referenced_data_size > 0) { + result = InsertResult::INSERTED; + } else if (admitted) { + result = InsertResult::ADMITTED; + } + status.row_key_status[row_key] = result; + } + if (!is_cache_miss) { + // A cache hit. + status.is_complete = true; + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, + /*is_user_access=*/true, + /*is_cache_miss=*/false); + return; + } + // The row key-value pair observes a cache miss. We need to access its + // index/filter/data blocks. + InsertResult inserted = status.row_key_status[row_key]; + AccessKVPair( + access.block_key, access.block_size, ComputeBlockPriority(access), + access, + /*no_insert=*/!insert_blocks_upon_row_kvpair_miss_ || access.no_insert, + /*is_user_access=*/true, &is_cache_miss, &admitted, + /*update_metrics=*/true); + if (access.referenced_data_size > 0 && inserted == InsertResult::ADMITTED) { + // TODO: Should we check for an error here? + auto s = + sim_cache_->Insert(row_key, /*obj=*/nullptr, &kNoopCacheItemHelper, + access.referenced_data_size, + /*handle=*/nullptr, Cache::Priority::HIGH); + s.PermitUncheckedError(); + status.row_key_status[row_key] = InsertResult::INSERTED; + } + return; + } + AccessKVPair(access.block_key, access.block_size, + ComputeBlockPriority(access), access, access.no_insert, + BlockCacheTraceHelper::IsUserAccess(access.caller), + &is_cache_miss, &admitted, /*update_metrics=*/true); +} + +BlockCacheTraceSimulator::BlockCacheTraceSimulator( + uint64_t warmup_seconds, uint32_t downsample_ratio, + const std::vector& cache_configurations) + : warmup_seconds_(warmup_seconds), + downsample_ratio_(downsample_ratio), + cache_configurations_(cache_configurations) {} + +Status BlockCacheTraceSimulator::InitializeCaches() { + for (auto const& config : cache_configurations_) { + for (auto cache_capacity : config.cache_capacities) { + // Scale down the cache capacity since the trace contains accesses on + // 1/'downsample_ratio' blocks. 
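+      // e.g. a 1 GiB configured capacity with downsample_ratio == 32 is
+      // simulated as a 32 MiB cache, preserving the cache-to-workload ratio
+      // of the sampled trace.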
+ uint64_t simulate_cache_capacity = cache_capacity / downsample_ratio_; + std::shared_ptr sim_cache; + std::unique_ptr ghost_cache; + std::string cache_name = config.cache_name; + if (cache_name.find(kGhostCachePrefix) != std::string::npos) { + ghost_cache.reset(new GhostCache( + NewLRUCache(config.ghost_cache_capacity, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + cache_name = cache_name.substr(kGhostCachePrefix.size()); + } + if (cache_name == "lru") { + sim_cache = std::make_shared( + std::move(ghost_cache), + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0)); + } else if (cache_name == "lru_priority") { + sim_cache = std::make_shared( + std::move(ghost_cache), + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0.5)); + } else if (cache_name == "lru_hybrid") { + sim_cache = std::make_shared( + std::move(ghost_cache), + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0.5), + /*insert_blocks_upon_row_kvpair_miss=*/true); + } else if (cache_name == "lru_hybrid_no_insert_on_row_miss") { + sim_cache = std::make_shared( + std::move(ghost_cache), + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0.5), + /*insert_blocks_upon_row_kvpair_miss=*/false); + } else { + // Not supported. + return Status::InvalidArgument("Unknown cache name " + + config.cache_name); + } + sim_caches_[config].push_back(sim_cache); + } + } + return Status::OK(); +} + +void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { + if (trace_start_time_ == 0) { + trace_start_time_ = access.access_timestamp; + } + // access.access_timestamp is in microseconds. + if (!warmup_complete_ && + trace_start_time_ + warmup_seconds_ * kMicrosInSecond <= + access.access_timestamp) { + for (auto& config_caches : sim_caches_) { + for (auto& sim_cache : config_caches.second) { + sim_cache->reset_counter(); + } + } + warmup_complete_ = true; + } + for (auto& config_caches : sim_caches_) { + for (auto& sim_cache : config_caches.second) { + sim_cache->Access(access); + } + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/cache_simulator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/cache_simulator.h new file mode 100644 index 0000000000..6d49790131 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/cache_simulator.h @@ -0,0 +1,231 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "cache/lru_cache.h" +#include "trace_replay/block_cache_tracer.h" + +namespace ROCKSDB_NAMESPACE { + +// A cache configuration provided by user. +struct CacheConfiguration { + std::string cache_name; // LRU. + uint32_t num_shard_bits; + uint64_t ghost_cache_capacity; // ghost cache capacity in bytes. + std::vector + cache_capacities; // simulate cache capacities in bytes. 
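+  // Note that equality and ordering deliberately ignore cache_capacities:
+  // configurations differing only in simulated capacities share one map key
+  // (and one vector of simulators) in BlockCacheTraceSimulator::sim_caches_.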
+ + bool operator==(const CacheConfiguration& o) const { + return cache_name == o.cache_name && num_shard_bits == o.num_shard_bits && + ghost_cache_capacity == o.ghost_cache_capacity; + } + bool operator<(const CacheConfiguration& o) const { + return cache_name < o.cache_name || + (cache_name == o.cache_name && num_shard_bits < o.num_shard_bits) || + (cache_name == o.cache_name && num_shard_bits == o.num_shard_bits && + ghost_cache_capacity < o.ghost_cache_capacity); + } +}; + +class MissRatioStats { + public: + void reset_counter() { + num_misses_ = 0; + num_accesses_ = 0; + user_accesses_ = 0; + user_misses_ = 0; + } + double miss_ratio() const { + if (num_accesses_ == 0) { + return -1; + } + return static_cast(num_misses_ * 100.0 / num_accesses_); + } + uint64_t total_accesses() const { return num_accesses_; } + uint64_t total_misses() const { return num_misses_; } + + const std::map& num_accesses_timeline() const { + return num_accesses_timeline_; + } + + const std::map& num_misses_timeline() const { + return num_misses_timeline_; + } + + double user_miss_ratio() const { + if (user_accesses_ == 0) { + return -1; + } + return static_cast(user_misses_ * 100.0 / user_accesses_); + } + uint64_t user_accesses() const { return user_accesses_; } + uint64_t user_misses() const { return user_misses_; } + + void UpdateMetrics(uint64_t timestamp_in_ms, bool is_user_access, + bool is_cache_miss); + + private: + uint64_t num_accesses_ = 0; + uint64_t num_misses_ = 0; + uint64_t user_accesses_ = 0; + uint64_t user_misses_ = 0; + + std::map num_accesses_timeline_; + std::map num_misses_timeline_; +}; + +// A ghost cache admits an entry on its second access. +class GhostCache { + public: + explicit GhostCache(std::shared_ptr sim_cache); + ~GhostCache() = default; + // No copy and move. + GhostCache(const GhostCache&) = delete; + GhostCache& operator=(const GhostCache&) = delete; + GhostCache(GhostCache&&) = delete; + GhostCache& operator=(GhostCache&&) = delete; + + // Returns true if the lookup_key is in the ghost cache. + // Returns false otherwise. + bool Admit(const Slice& lookup_key); + + private: + std::shared_ptr sim_cache_; +}; + +// A cache simulator that runs against a block cache trace. +class CacheSimulator { + public: + CacheSimulator(std::unique_ptr&& ghost_cache, + std::shared_ptr sim_cache); + virtual ~CacheSimulator() = default; + // No copy and move. + CacheSimulator(const CacheSimulator&) = delete; + CacheSimulator& operator=(const CacheSimulator&) = delete; + CacheSimulator(CacheSimulator&&) = delete; + CacheSimulator& operator=(CacheSimulator&&) = delete; + + virtual void Access(const BlockCacheTraceRecord& access); + + void reset_counter() { miss_ratio_stats_.reset_counter(); } + + const MissRatioStats& miss_ratio_stats() const { return miss_ratio_stats_; } + + protected: + MissRatioStats miss_ratio_stats_; + std::unique_ptr ghost_cache_; + std::shared_ptr sim_cache_; +}; + +// A prioritized cache simulator that runs against a block cache trace. +// It inserts missing index/filter/uncompression-dictionary blocks with high +// priority in the cache. +class PrioritizedCacheSimulator : public CacheSimulator { + public: + PrioritizedCacheSimulator(std::unique_ptr&& ghost_cache, + std::shared_ptr sim_cache) + : CacheSimulator(std::move(ghost_cache), sim_cache) {} + void Access(const BlockCacheTraceRecord& access) override; + + protected: + // Access the key-value pair and returns true upon a cache miss. 
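+  // (More precisely: the outcome is reported through *is_cache_miss and
+  // *admitted; the function itself returns void.)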
+ void AccessKVPair(const Slice& key, uint64_t value_size, + Cache::Priority priority, + const BlockCacheTraceRecord& access, bool no_insert, + bool is_user_access, bool* is_cache_miss, bool* admitted, + bool update_metrics); + + Cache::Priority ComputeBlockPriority( + const BlockCacheTraceRecord& access) const; +}; + +// A hybrid row and block cache simulator. It looks up/inserts key-value pairs +// referenced by Get/MultiGet requests, and not their accessed index/filter/data +// blocks. +// +// Upon a Get/MultiGet request, it looks up the referenced key first. +// If it observes a cache hit, future block accesses on this key-value pair is +// skipped since the request is served already. Otherwise, it continues to look +// up/insert its index/filter/data blocks. It also inserts the referenced +// key-value pair in the cache for future lookups. +class HybridRowBlockCacheSimulator : public PrioritizedCacheSimulator { + public: + HybridRowBlockCacheSimulator(std::unique_ptr&& ghost_cache, + std::shared_ptr sim_cache, + bool insert_blocks_upon_row_kvpair_miss) + : PrioritizedCacheSimulator(std::move(ghost_cache), sim_cache), + insert_blocks_upon_row_kvpair_miss_( + insert_blocks_upon_row_kvpair_miss) {} + void Access(const BlockCacheTraceRecord& access) override; + + private: + enum InsertResult : char { + INSERTED, + ADMITTED, + NO_INSERT, + }; + + // We set is_complete to true when the referenced row-key of a get request + // hits the cache. If is_complete is true, we treat future accesses of this + // get request as hits. + // + // For each row key, it stores an enum. It is INSERTED when the + // kv-pair has been inserted into the cache, ADMITTED if it should be inserted + // but haven't been, NO_INSERT if it should not be inserted. + // + // A kv-pair is in ADMITTED state when we encounter this kv-pair but do not + // know its size. This may happen if the first access on the referenced key is + // an index/filter block. + struct GetRequestStatus { + bool is_complete = false; + std::map row_key_status; + }; + + // A map stores get_id to a map of row keys. + std::map getid_status_map_; + bool insert_blocks_upon_row_kvpair_miss_; +}; + +// A block cache simulator that reports miss ratio curves given a set of cache +// configurations. +class BlockCacheTraceSimulator { + public: + // warmup_seconds: The number of seconds to warmup simulated caches. The + // hit/miss counters are reset after the warmup completes. + BlockCacheTraceSimulator( + uint64_t warmup_seconds, uint32_t downsample_ratio, + const std::vector& cache_configurations); + ~BlockCacheTraceSimulator() = default; + // No copy and move. 
+ BlockCacheTraceSimulator(const BlockCacheTraceSimulator&) = delete; + BlockCacheTraceSimulator& operator=(const BlockCacheTraceSimulator&) = delete; + BlockCacheTraceSimulator(BlockCacheTraceSimulator&&) = delete; + BlockCacheTraceSimulator& operator=(BlockCacheTraceSimulator&&) = delete; + + Status InitializeCaches(); + + void Access(const BlockCacheTraceRecord& access); + + const std::map>>& + sim_caches() const { + return sim_caches_; + } + + private: + const uint64_t warmup_seconds_; + const uint32_t downsample_ratio_; + const std::vector cache_configurations_; + + bool warmup_complete_ = false; + std::map>> + sim_caches_; + uint64_t trace_start_time_ = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/sim_cache.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/sim_cache.cc new file mode 100644 index 0000000000..d58c3b34f1 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/simulator_cache/sim_cache.cc @@ -0,0 +1,372 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/utilities/sim_cache.h" + +#include +#include + +#include "file/writable_file_writer.h" +#include "monitoring/statistics_impl.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +class CacheActivityLogger { + public: + CacheActivityLogger() + : activity_logging_enabled_(false), max_logging_size_(0) {} + + ~CacheActivityLogger() { + MutexLock l(&mutex_); + + StopLoggingInternal(); + bg_status_.PermitUncheckedError(); + } + + Status StartLogging(const std::string& activity_log_file, Env* env, + uint64_t max_logging_size = 0) { + assert(activity_log_file != ""); + assert(env != nullptr); + + Status status; + FileOptions file_opts; + + MutexLock l(&mutex_); + + // Stop existing logging if any + StopLoggingInternal(); + + // Open log file + status = WritableFileWriter::Create(env->GetFileSystem(), activity_log_file, + file_opts, &file_writer_, nullptr); + if (!status.ok()) { + return status; + } + + max_logging_size_ = max_logging_size; + activity_logging_enabled_.store(true); + + return status; + } + + void StopLogging() { + MutexLock l(&mutex_); + + StopLoggingInternal(); + } + + void ReportLookup(const Slice& key) { + if (activity_logging_enabled_.load() == false) { + return; + } + + std::ostringstream oss; + // line format: "LOOKUP - " + oss << "LOOKUP - " << key.ToString(true) << std::endl; + + MutexLock l(&mutex_); + Status s = file_writer_->Append(oss.str()); + if (!s.ok() && bg_status_.ok()) { + bg_status_ = s; + } + if (MaxLoggingSizeReached() || !bg_status_.ok()) { + // Stop logging if we have reached the max file size or + // encountered an error + StopLoggingInternal(); + } + } + + void ReportAdd(const Slice& key, size_t size) { + if (activity_logging_enabled_.load() == false) { + return; + } + + std::ostringstream oss; + // line format: "ADD - - " + oss << "ADD - " << key.ToString(true) << " - " << size << std::endl; + MutexLock l(&mutex_); + Status s = file_writer_->Append(oss.str()); + if (!s.ok() && bg_status_.ok()) { + bg_status_ = s; + } + + if (MaxLoggingSizeReached() || !bg_status_.ok()) { + // Stop logging if we have reached the max file 
size or + // encountered an error + StopLoggingInternal(); + } + } + + Status& bg_status() { + MutexLock l(&mutex_); + return bg_status_; + } + + private: + bool MaxLoggingSizeReached() { + mutex_.AssertHeld(); + + return (max_logging_size_ > 0 && + file_writer_->GetFileSize() >= max_logging_size_); + } + + void StopLoggingInternal() { + mutex_.AssertHeld(); + + if (!activity_logging_enabled_) { + return; + } + + activity_logging_enabled_.store(false); + Status s = file_writer_->Close(); + if (!s.ok() && bg_status_.ok()) { + bg_status_ = s; + } + } + + // Mutex to sync writes to file_writer, and all following + // class data members + port::Mutex mutex_; + // Indicates if logging is currently enabled + // atomic to allow reads without mutex + std::atomic activity_logging_enabled_; + // When reached, we will stop logging and close the file + // Value of 0 means unlimited + uint64_t max_logging_size_; + std::unique_ptr file_writer_; + Status bg_status_; +}; + +// SimCacheImpl definition +class SimCacheImpl : public SimCache { + public: + // capacity for real cache (ShardedLRUCache) + // test_capacity for key only cache + SimCacheImpl(std::shared_ptr sim_cache, std::shared_ptr cache) + : SimCache(cache), + key_only_cache_(sim_cache), + miss_times_(0), + hit_times_(0), + stats_(nullptr) {} + + ~SimCacheImpl() override {} + + const char* Name() const override { return "SimCache"; } + + void SetCapacity(size_t capacity) override { target_->SetCapacity(capacity); } + + void SetStrictCapacityLimit(bool strict_capacity_limit) override { + target_->SetStrictCapacityLimit(strict_capacity_limit); + } + + Status Insert(const Slice& key, Cache::ObjectPtr value, + const CacheItemHelper* helper, size_t charge, Handle** handle, + Priority priority) override { + // The handle and value passed in are for real cache, so we pass nullptr + // to key_only_cache_ for both instead. Also, the deleter function pointer + // will be called by user to perform some external operation which should + // be applied only once. Thus key_only_cache accepts an empty function. + // *Lambda function without capture can be assgined to a function pointer + Handle* h = key_only_cache_->Lookup(key); + if (h == nullptr) { + // TODO: Check for error here? 
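+      // The key-only cache stores no value: just the key with the real
+      // charge, so its usage tracks what a cache of sim capacity would hold.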
+ auto s = key_only_cache_->Insert(key, nullptr, &kNoopCacheItemHelper, + charge, nullptr, priority); + s.PermitUncheckedError(); + } else { + key_only_cache_->Release(h); + } + + cache_activity_logger_.ReportAdd(key, charge); + if (!target_) { + return Status::OK(); + } + return target_->Insert(key, value, helper, charge, handle, priority); + } + + Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + CreateContext* create_context, + Priority priority = Priority::LOW, + Statistics* stats = nullptr) override { + HandleLookup(key, stats); + if (!target_) { + return nullptr; + } + return target_->Lookup(key, helper, create_context, priority, stats); + } + + void StartAsyncLookup(AsyncLookupHandle& async_handle) override { + HandleLookup(async_handle.key, async_handle.stats); + if (target_) { + target_->StartAsyncLookup(async_handle); + } + } + + bool Ref(Handle* handle) override { return target_->Ref(handle); } + + using Cache::Release; + bool Release(Handle* handle, bool erase_if_last_ref = false) override { + return target_->Release(handle, erase_if_last_ref); + } + + void Erase(const Slice& key) override { + target_->Erase(key); + key_only_cache_->Erase(key); + } + + Cache::ObjectPtr Value(Handle* handle) override { + return target_->Value(handle); + } + + uint64_t NewId() override { return target_->NewId(); } + + size_t GetCapacity() const override { return target_->GetCapacity(); } + + bool HasStrictCapacityLimit() const override { + return target_->HasStrictCapacityLimit(); + } + + size_t GetUsage() const override { return target_->GetUsage(); } + + size_t GetUsage(Handle* handle) const override { + return target_->GetUsage(handle); + } + + size_t GetCharge(Handle* handle) const override { + return target_->GetCharge(handle); + } + + const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override { + return target_->GetCacheItemHelper(handle); + } + + size_t GetPinnedUsage() const override { return target_->GetPinnedUsage(); } + + void DisownData() override { + target_->DisownData(); + key_only_cache_->DisownData(); + } + + void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) override { + target_->ApplyToAllEntries(callback, opts); + } + + void EraseUnRefEntries() override { + target_->EraseUnRefEntries(); + key_only_cache_->EraseUnRefEntries(); + } + + size_t GetSimCapacity() const override { + return key_only_cache_->GetCapacity(); + } + size_t GetSimUsage() const override { return key_only_cache_->GetUsage(); } + void SetSimCapacity(size_t capacity) override { + key_only_cache_->SetCapacity(capacity); + } + + uint64_t get_miss_counter() const override { + return miss_times_.load(std::memory_order_relaxed); + } + + uint64_t get_hit_counter() const override { + return hit_times_.load(std::memory_order_relaxed); + } + + void reset_counter() override { + miss_times_.store(0, std::memory_order_relaxed); + hit_times_.store(0, std::memory_order_relaxed); + SetTickerCount(stats_, SIM_BLOCK_CACHE_HIT, 0); + SetTickerCount(stats_, SIM_BLOCK_CACHE_MISS, 0); + } + + std::string ToString() const override { + std::ostringstream oss; + oss << "SimCache MISSes: " << get_miss_counter() << std::endl; + oss << "SimCache HITs: " << get_hit_counter() << std::endl; + auto lookups = get_miss_counter() + get_hit_counter(); + oss << "SimCache HITRATE: " << std::fixed << std::setprecision(2) + << (lookups == 0 ? 
0 : get_hit_counter() * 100.0f / lookups)
+        << std::endl;
+    return oss.str();
+  }
+
+  std::string GetPrintableOptions() const override {
+    std::ostringstream oss;
+    oss << "    cache_options:" << std::endl;
+    oss << target_->GetPrintableOptions();
+    oss << "    sim_cache_options:" << std::endl;
+    oss << key_only_cache_->GetPrintableOptions();
+    return oss.str();
+  }
+
+  Status StartActivityLogging(const std::string& activity_log_file, Env* env,
+                              uint64_t max_logging_size = 0) override {
+    return cache_activity_logger_.StartLogging(activity_log_file, env,
+                                               max_logging_size);
+  }
+
+  void StopActivityLogging() override { cache_activity_logger_.StopLogging(); }
+
+  Status GetActivityLoggingStatus() override {
+    return cache_activity_logger_.bg_status();
+  }
+
+ private:
+  std::shared_ptr<Cache> key_only_cache_;
+  std::atomic<uint64_t> miss_times_;
+  std::atomic<uint64_t> hit_times_;
+  Statistics* stats_;
+  CacheActivityLogger cache_activity_logger_;
+
+  void inc_miss_counter() {
+    miss_times_.fetch_add(1, std::memory_order_relaxed);
+  }
+  void inc_hit_counter() { hit_times_.fetch_add(1, std::memory_order_relaxed); }
+
+  void HandleLookup(const Slice& key, Statistics* stats) {
+    Handle* h = key_only_cache_->Lookup(key);
+    if (h != nullptr) {
+      key_only_cache_->Release(h);
+      inc_hit_counter();
+      RecordTick(stats, SIM_BLOCK_CACHE_HIT);
+    } else {
+      inc_miss_counter();
+      RecordTick(stats, SIM_BLOCK_CACHE_MISS);
+    }
+    cache_activity_logger_.ReportLookup(key);
+  }
+};
+
+}  // end anonymous namespace
+
+// For instrumentation purpose, use NewSimCache instead
+std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> cache,
+                                      size_t sim_capacity, int num_shard_bits) {
+  LRUCacheOptions co;
+  co.capacity = sim_capacity;
+  co.num_shard_bits = num_shard_bits;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  return NewSimCache(NewLRUCache(co), cache, num_shard_bits);
+}
+
+std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> sim_cache,
+                                      std::shared_ptr<Cache> cache,
+                                      int num_shard_bits) {
+  if (num_shard_bits >= 20) {
+    return nullptr;  // the cache cannot be sharded into too many fine pieces
+  }
+  return std::make_shared<SimCacheImpl>(sim_cache, cache);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/table_properties_collectors/compact_on_deletion_collector.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/table_properties_collectors/compact_on_deletion_collector.cc
new file mode 100644
index 0000000000..a3b5189da0
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/table_properties_collectors/compact_on_deletion_collector.cc
@@ -0,0 +1,221 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
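The sim_cache.cc unit just closed above exposes its functionality through the two NewSimCache() overloads. As a minimal sketch of how an embedder typically wires it up; the capacities, variable names, and surrounding setup are illustrative assumptions, not part of this patch:

    #include <memory>
    #include "rocksdb/cache.h"
    #include "rocksdb/table.h"
    #include "rocksdb/utilities/sim_cache.h"

    // Back a 4 GiB real block cache with a SimCache that simulates 64 GiB,
    // to estimate how the hit rate would change with a bigger cache.
    std::shared_ptr<rocksdb::Cache> real_cache =
        rocksdb::NewLRUCache(4ULL << 30 /* capacity */);
    std::shared_ptr<rocksdb::SimCache> sim_cache = rocksdb::NewSimCache(
        real_cache, 64ULL << 30 /* sim_capacity */, -1 /* num_shard_bits */);

    rocksdb::BlockBasedTableOptions table_options;
    table_options.block_cache = sim_cache;  // install as the block cache

    // ... run the workload, then read the simulated counters:
    uint64_t sim_hits = sim_cache->get_hit_counter();
    uint64_t sim_misses = sim_cache->get_miss_counter();

Because only the key-only shadow cache is sized to the simulated capacity, the hit and miss counters approximate what a larger cache with the same sharding would do; ToString() above formats the same numbers as a hit-rate report.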
+ +#include "utilities/table_properties_collectors/compact_on_deletion_collector.h" + +#include +#include + +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "rocksdb/utilities/table_properties_collectors.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +CompactOnDeletionCollector::CompactOnDeletionCollector( + size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio) + : bucket_size_((sliding_window_size + kNumBuckets - 1) / kNumBuckets), + current_bucket_(0), + num_keys_in_current_bucket_(0), + num_deletions_in_observation_window_(0), + deletion_trigger_(deletion_trigger), + deletion_ratio_(deletion_ratio), + deletion_ratio_enabled_(deletion_ratio > 0 && deletion_ratio <= 1), + need_compaction_(false), + finished_(false) { + memset(num_deletions_in_buckets_, 0, sizeof(size_t) * kNumBuckets); +} + +// AddUserKey() will be called when a new key/value pair is inserted into the +// table. +// @params key the user key that is inserted into the table. +// @params value the value that is inserted into the table. +// @params file_size file size up to now +Status CompactOnDeletionCollector::AddUserKey(const Slice& /*key*/, + const Slice& /*value*/, + EntryType type, + SequenceNumber /*seq*/, + uint64_t /*file_size*/) { + assert(!finished_); + if (!bucket_size_ && !deletion_ratio_enabled_) { + // This collector is effectively disabled + return Status::OK(); + } + + if (need_compaction_) { + // If the output file already needs to be compacted, skip the check. + return Status::OK(); + } + + if (deletion_ratio_enabled_) { + total_entries_++; + if (type == kEntryDelete) { + deletion_entries_++; + } + } + + if (bucket_size_) { + if (num_keys_in_current_bucket_ == bucket_size_) { + // When the current bucket is full, advance the cursor of the + // ring buffer to the next bucket. + current_bucket_ = (current_bucket_ + 1) % kNumBuckets; + + // Update the current count of observed deletion keys by excluding + // the number of deletion keys in the oldest bucket in the + // observation window. 
+ assert(num_deletions_in_observation_window_ >= + num_deletions_in_buckets_[current_bucket_]); + num_deletions_in_observation_window_ -= + num_deletions_in_buckets_[current_bucket_]; + num_deletions_in_buckets_[current_bucket_] = 0; + num_keys_in_current_bucket_ = 0; + } + + num_keys_in_current_bucket_++; + if (type == kEntryDelete) { + num_deletions_in_observation_window_++; + num_deletions_in_buckets_[current_bucket_]++; + if (num_deletions_in_observation_window_ >= deletion_trigger_) { + need_compaction_ = true; + } + } + } + + return Status::OK(); +} + +Status CompactOnDeletionCollector::Finish( + UserCollectedProperties* /*properties*/) { + if (!need_compaction_ && deletion_ratio_enabled_ && total_entries_ > 0) { + double ratio = static_cast(deletion_entries_) / total_entries_; + need_compaction_ = ratio >= deletion_ratio_; + } + finished_ = true; + return Status::OK(); +} +static std::unordered_map + on_deletion_collector_type_info = { + {"window_size", + {0, OptionType::kUnknown, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable, + [](const ConfigOptions&, const std::string&, const std::string& value, + void* addr) { + auto* factory = + static_cast(addr); + factory->SetWindowSize(ParseSizeT(value)); + return Status::OK(); + }, + [](const ConfigOptions&, const std::string&, const void* addr, + std::string* value) { + const auto* factory = + static_cast(addr); + *value = std::to_string(factory->GetWindowSize()); + return Status::OK(); + }, + nullptr}}, + {"deletion_trigger", + {0, OptionType::kUnknown, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable, + [](const ConfigOptions&, const std::string&, const std::string& value, + void* addr) { + auto* factory = + static_cast(addr); + factory->SetDeletionTrigger(ParseSizeT(value)); + return Status::OK(); + }, + [](const ConfigOptions&, const std::string&, const void* addr, + std::string* value) { + const auto* factory = + static_cast(addr); + *value = std::to_string(factory->GetDeletionTrigger()); + return Status::OK(); + }, + nullptr}}, + {"deletion_ratio", + {0, OptionType::kUnknown, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable, + [](const ConfigOptions&, const std::string&, const std::string& value, + void* addr) { + auto* factory = + static_cast(addr); + factory->SetDeletionRatio(ParseDouble(value)); + return Status::OK(); + }, + [](const ConfigOptions&, const std::string&, const void* addr, + std::string* value) { + const auto* factory = + static_cast(addr); + *value = std::to_string(factory->GetDeletionRatio()); + return Status::OK(); + }, + nullptr}}, + +}; + +CompactOnDeletionCollectorFactory::CompactOnDeletionCollectorFactory( + size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio) + : sliding_window_size_(sliding_window_size), + deletion_trigger_(deletion_trigger), + deletion_ratio_(deletion_ratio) { + RegisterOptions("", this, &on_deletion_collector_type_info); +} + +TablePropertiesCollector* +CompactOnDeletionCollectorFactory::CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) { + return new CompactOnDeletionCollector(sliding_window_size_.load(), + deletion_trigger_.load(), + deletion_ratio_.load()); +} + +std::string CompactOnDeletionCollectorFactory::ToString() const { + std::ostringstream cfg; + cfg << Name() << " (Sliding window size = " << sliding_window_size_.load() + << " Deletion trigger = " << deletion_trigger_.load() + << " 
Deletion ratio = " << deletion_ratio_.load() << ')';
+  return cfg.str();
+}
+
+std::shared_ptr<CompactOnDeletionCollectorFactory>
+NewCompactOnDeletionCollectorFactory(size_t sliding_window_size,
+                                     size_t deletion_trigger,
+                                     double deletion_ratio) {
+  return std::shared_ptr<CompactOnDeletionCollectorFactory>(
+      new CompactOnDeletionCollectorFactory(sliding_window_size,
+                                            deletion_trigger, deletion_ratio));
+}
+namespace {
+static int RegisterTablePropertiesCollectorFactories(
+    ObjectLibrary& library, const std::string& /*arg*/) {
+  library.AddFactory<TablePropertiesCollectorFactory>(
+      CompactOnDeletionCollectorFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<TablePropertiesCollectorFactory>* guard,
+         std::string* /* errmsg */) {
+        // By default, create a CompactOnDeletionCollectorFactory that is
+        // disabled. Users will need to provide configuration parameters or
+        // call the corresponding Setter to enable the factory.
+        guard->reset(new CompactOnDeletionCollectorFactory(0, 0, 0));
+        return guard->get();
+      });
+  return 1;
+}
+}  // namespace
+
+Status TablePropertiesCollectorFactory::CreateFromString(
+    const ConfigOptions& options, const std::string& value,
+    std::shared_ptr<TablePropertiesCollectorFactory>* result) {
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterTablePropertiesCollectorFactories(*(ObjectLibrary::Default().get()),
+                                              "");
+  });
+  return LoadSharedObject<TablePropertiesCollectorFactory>(options, value,
+                                                           result);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/table_properties_collectors/compact_on_deletion_collector.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/table_properties_collectors/compact_on_deletion_collector.h
new file mode 100644
index 0000000000..c267463a02
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/table_properties_collectors/compact_on_deletion_collector.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/utilities/table_properties_collectors.h"
+namespace ROCKSDB_NAMESPACE {
+
+class CompactOnDeletionCollector : public TablePropertiesCollector {
+ public:
+  CompactOnDeletionCollector(size_t sliding_window_size,
+                             size_t deletion_trigger, double deletion_ratio);
+
+  // AddUserKey() will be called when a new key/value pair is inserted into the
+  // table.
+  // @params key    the user key that is inserted into the table.
+  // @params value  the value that is inserted into the table.
+  // @params file_size  file size up to now
+  virtual Status AddUserKey(const Slice& key, const Slice& value,
+                            EntryType type, SequenceNumber seq,
+                            uint64_t file_size) override;
+
+  // Finish() will be called when a table has already been built and is ready
+  // for writing the properties block.
+  // @params properties  User will add their collected statistics to
+  // `properties`.
+  virtual Status Finish(UserCollectedProperties* /*properties*/) override;
+
+  // Return the human-readable properties, where the key is property name and
+  // the value is the human-readable form of value.
+  virtual UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties();
+  }
+
+  // The name of the properties collector can be used for debugging purpose.
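Tying the pieces of this collector together: the sliding window is split into kNumBuckets = 128 ring-buffer buckets of ceil(window / 128) keys each, so a window of 128,000 keys yields 1,000-key buckets, and the deletion trigger is evaluated against deletions observed across the live window. A hedged sketch of enabling the collector through the public factory; the numeric thresholds here are illustrative:

    #include "rocksdb/options.h"
    #include "rocksdb/utilities/table_properties_collectors.h"

    rocksdb::Options options;
    // Flag an SST file for compaction once any sliding window of ~128000
    // consecutive keys contains >= 25600 deletions, or once >= 10% of all
    // entries in the file are deletions (the Finish() fallback above).
    options.table_properties_collector_factories.emplace_back(
        rocksdb::NewCompactOnDeletionCollectorFactory(
            128000 /* sliding_window_size */, 25600 /* deletion_trigger */,
            0.1 /* deletion_ratio */));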
+ virtual const char* Name() const override { + return "CompactOnDeletionCollector"; + } + + // EXPERIMENTAL Return whether the output file should be further compacted + virtual bool NeedCompact() const override { return need_compaction_; } + + static const int kNumBuckets = 128; + + private: + void Reset(); + + // A ring buffer that used to count the number of deletion entries for every + // "bucket_size_" keys. + size_t num_deletions_in_buckets_[kNumBuckets]; + // the number of keys in a bucket + size_t bucket_size_; + + size_t current_bucket_; + size_t num_keys_in_current_bucket_; + size_t num_deletions_in_observation_window_; + size_t deletion_trigger_; + const double deletion_ratio_; + const bool deletion_ratio_enabled_; + size_t total_entries_ = 0; + size_t deletion_entries_ = 0; + // true if the current SST file needs to be compacted. + bool need_compaction_; + bool finished_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/file_trace_reader_writer.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/file_trace_reader_writer.cc new file mode 100644 index 0000000000..5886d3539f --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/file_trace_reader_writer.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "utilities/trace/file_trace_reader_writer.h" + +#include "env/composite_env_wrapper.h" +#include "file/random_access_file_reader.h" +#include "file/writable_file_writer.h" +#include "trace_replay/trace_replay.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +const unsigned int FileTraceReader::kBufferSize = 1024; // 1KB + +FileTraceReader::FileTraceReader( + std::unique_ptr&& reader) + : file_reader_(std::move(reader)), + offset_(0), + buffer_(new char[kBufferSize]) {} + +FileTraceReader::~FileTraceReader() { + Close().PermitUncheckedError(); + delete[] buffer_; +} + +Status FileTraceReader::Close() { + file_reader_.reset(); + return Status::OK(); +} + +Status FileTraceReader::Reset() { + if (file_reader_ == nullptr) { + return Status::IOError("TraceReader is closed."); + } + offset_ = 0; + return Status::OK(); +} + +Status FileTraceReader::Read(std::string* data) { + assert(file_reader_ != nullptr); + Status s = file_reader_->Read(IOOptions(), offset_, kTraceMetadataSize, + &result_, buffer_, nullptr, + Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + return s; + } + if (result_.size() == 0) { + // No more data to read + // Todo: Come up with a better way to indicate end of data. May be this + // could be avoided once footer is introduced. + return Status::Incomplete(); + } + if (result_.size() < kTraceMetadataSize) { + return Status::Corruption("Corrupted trace file."); + } + *data = result_.ToString(); + offset_ += kTraceMetadataSize; + + uint32_t payload_len = + DecodeFixed32(&buffer_[kTraceTimestampSize + kTraceTypeSize]); + + // Read Payload + unsigned int bytes_to_read = payload_len; + unsigned int to_read = + bytes_to_read > kBufferSize ? 
kBufferSize : bytes_to_read; + while (to_read > 0) { + s = file_reader_->Read(IOOptions(), offset_, to_read, &result_, buffer_, + nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + return s; + } + if (result_.size() < to_read) { + return Status::Corruption("Corrupted trace file."); + } + data->append(result_.data(), result_.size()); + + offset_ += to_read; + bytes_to_read -= to_read; + to_read = bytes_to_read > kBufferSize ? kBufferSize : bytes_to_read; + } + + return s; +} + +FileTraceWriter::FileTraceWriter( + std::unique_ptr&& file_writer) + : file_writer_(std::move(file_writer)) {} + +FileTraceWriter::~FileTraceWriter() { Close().PermitUncheckedError(); } + +Status FileTraceWriter::Close() { + file_writer_.reset(); + return Status::OK(); +} + +Status FileTraceWriter::Write(const Slice& data) { + return file_writer_->Append(data); +} + +uint64_t FileTraceWriter::GetFileSize() { return file_writer_->GetFileSize(); } + +Status NewFileTraceReader(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr* trace_reader) { + std::unique_ptr file_reader; + Status s = RandomAccessFileReader::Create( + env->GetFileSystem(), trace_filename, FileOptions(env_options), + &file_reader, nullptr); + if (!s.ok()) { + return s; + } + trace_reader->reset(new FileTraceReader(std::move(file_reader))); + return s; +} + +Status NewFileTraceWriter(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr* trace_writer) { + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create(env->GetFileSystem(), trace_filename, + FileOptions(env_options), &file_writer, + nullptr); + if (!s.ok()) { + return s; + } + trace_writer->reset(new FileTraceWriter(std::move(file_writer))); + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/file_trace_reader_writer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/file_trace_reader_writer.h new file mode 100644 index 0000000000..65d4831083 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/file_trace_reader_writer.h @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/trace_reader_writer.h" + +namespace ROCKSDB_NAMESPACE { + +class RandomAccessFileReader; +class WritableFileWriter; + +// FileTraceReader allows reading RocksDB traces from a file. +class FileTraceReader : public TraceReader { + public: + explicit FileTraceReader(std::unique_ptr&& reader); + ~FileTraceReader(); + + virtual Status Read(std::string* data) override; + virtual Status Close() override; + virtual Status Reset() override; + + private: + std::unique_ptr file_reader_; + Slice result_; + size_t offset_; + char* const buffer_; + + static const unsigned int kBufferSize; +}; + +// FileTraceWriter allows writing RocksDB traces to a file. 
+class FileTraceWriter : public TraceWriter { + public: + explicit FileTraceWriter(std::unique_ptr&& file_writer); + ~FileTraceWriter(); + + virtual Status Write(const Slice& data) override; + virtual Status Close() override; + virtual uint64_t GetFileSize() override; + + private: + std::unique_ptr file_writer_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/replayer_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/replayer_impl.cc new file mode 100644 index 0000000000..32a5ad7f05 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/replayer_impl.cc @@ -0,0 +1,314 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#include "utilities/trace/replayer_impl.h" + +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/system_clock.h" +#include "util/threadpool_imp.h" + +namespace ROCKSDB_NAMESPACE { + +ReplayerImpl::ReplayerImpl(DB* db, + const std::vector& handles, + std::unique_ptr&& reader) + : Replayer(), + trace_reader_(std::move(reader)), + prepared_(false), + trace_end_(false), + header_ts_(0), + exec_handler_(TraceRecord::NewExecutionHandler(db, handles)), + env_(db->GetEnv()), + trace_file_version_(-1) {} + +ReplayerImpl::~ReplayerImpl() { + exec_handler_.reset(); + trace_reader_.reset(); +} + +Status ReplayerImpl::Prepare() { + Trace header; + int db_version; + Status s = ReadHeader(&header); + if (!s.ok()) { + return s; + } + s = TracerHelper::ParseTraceHeader(header, &trace_file_version_, &db_version); + if (!s.ok()) { + return s; + } + header_ts_ = header.ts; + prepared_ = true; + trace_end_ = false; + return Status::OK(); +} + +Status ReplayerImpl::Next(std::unique_ptr* record) { + if (!prepared_) { + return Status::Incomplete("Not prepared!"); + } + if (trace_end_) { + return Status::Incomplete("Trace end."); + } + + Trace trace; + Status s = ReadTrace(&trace); // ReadTrace is atomic + // Reached the trace end. + if (s.ok() && trace.type == kTraceEnd) { + trace_end_ = true; + return Status::Incomplete("Trace end."); + } + if (!s.ok() || record == nullptr) { + return s; + } + + return TracerHelper::DecodeTraceRecord(&trace, trace_file_version_, record); +} + +Status ReplayerImpl::Execute(const std::unique_ptr& record, + std::unique_ptr* result) { + return record->Accept(exec_handler_.get(), result); +} + +Status ReplayerImpl::Replay( + const ReplayOptions& options, + const std::function&&)>& + result_callback) { + if (options.fast_forward <= 0.0) { + return Status::InvalidArgument("Wrong fast forward speed!"); + } + + if (!prepared_) { + return Status::Incomplete("Not prepared!"); + } + if (trace_end_) { + return Status::Incomplete("Trace end."); + } + + Status s = Status::OK(); + + if (options.num_threads <= 1) { + // num_threads == 0 or num_threads == 1 uses single thread. + std::chrono::system_clock::time_point replay_epoch = + std::chrono::system_clock::now(); + + while (s.ok()) { + Trace trace; + s = ReadTrace(&trace); + // If already at trace end, ReadTrace should return Status::Incomplete(). + if (!s.ok()) { + break; + } + + // No need to sleep before breaking the loop if at the trace end. 
+      if (trace.type == kTraceEnd) {
+        trace_end_ = true;
+        s = Status::Incomplete("Trace end.");
+        break;
+      }
+
+      // In single-threaded replay, decode first then sleep.
+      std::unique_ptr<TraceRecord> record;
+      s = TracerHelper::DecodeTraceRecord(&trace, trace_file_version_, &record);
+      if (!s.ok() && !s.IsNotSupported()) {
+        break;
+      }
+
+      std::chrono::system_clock::time_point sleep_to =
+          replay_epoch +
+          std::chrono::microseconds(static_cast<uint64_t>(std::llround(
+              1.0 * (trace.ts - header_ts_) / options.fast_forward)));
+      if (sleep_to > std::chrono::system_clock::now()) {
+        std::this_thread::sleep_until(sleep_to);
+      }
+
+      // Skip unsupported traces, stop for other errors.
+      if (s.IsNotSupported()) {
+        if (result_callback != nullptr) {
+          result_callback(s, nullptr);
+        }
+        s = Status::OK();
+        continue;
+      }
+
+      if (result_callback == nullptr) {
+        s = Execute(record, nullptr);
+      } else {
+        std::unique_ptr<TraceRecordResult> res;
+        s = Execute(record, &res);
+        result_callback(s, std::move(res));
+      }
+    }
+  } else {
+    // Multi-threaded replay.
+    ThreadPoolImpl thread_pool;
+    thread_pool.SetHostEnv(env_);
+    thread_pool.SetBackgroundThreads(static_cast<int>(options.num_threads));
+
+    std::mutex mtx;
+    // Background decoding and execution status.
+    Status bg_s = Status::OK();
+    uint64_t last_err_ts = static_cast<uint64_t>(-1);
+    // Callback function used in background work to update bg_s for the
+    // earliest TraceRecord which has an execution error. This is different
+    // from the timestamp of the first execution error (either start or end
+    // timestamp).
+    //
+    // Suppose TraceRecord R1, R2, with timestamps T1 < T2. Their execution
+    // timestamps are T1_start, T1_end, T2_start, T2_end.
+    // Single-thread: there must be T1_start < T1_end < T2_start < T2_end.
+    // Multi-thread: T1_start < T2_start may not be enforced; their order is
+    // totally unknown.
+    // In order to report the same `first` error in both single-thread and
+    // multi-thread replay, we can only rely on the TraceRecords' timestamps,
+    // rather than their execution timestamps. In single-thread replay, the
+    // first error is also the last error, while in multi-thread replay the
+    // first error may be neither the first nor the last error in execution
+    // order.
+    auto error_cb = [&mtx, &bg_s, &last_err_ts](Status err, uint64_t err_ts) {
+      std::lock_guard<std::mutex> gd(mtx);
+      // Only record the first error.
+      if (!err.ok() && !err.IsNotSupported() && err_ts < last_err_ts) {
+        bg_s = err;
+        last_err_ts = err_ts;
+      }
+    };
+
+    std::chrono::system_clock::time_point replay_epoch =
+        std::chrono::system_clock::now();
+
+    while (bg_s.ok() && s.ok()) {
+      Trace trace;
+      s = ReadTrace(&trace);
+      // If already at trace end, ReadTrace should return Status::Incomplete().
+      if (!s.ok()) {
+        break;
+      }
+
+      TraceType trace_type = trace.type;
+
+      // No need to sleep before breaking the loop if at the trace end.
+      if (trace_type == kTraceEnd) {
+        trace_end_ = true;
+        s = Status::Incomplete("Trace end.");
+        break;
+      }
+
+      // In multi-threaded replay, sleep first then start decoding and
+      // execution in a thread.
+ std::chrono::system_clock::time_point sleep_to = + replay_epoch + + std::chrono::microseconds(static_cast(std::llround( + 1.0 * (trace.ts - header_ts_) / options.fast_forward))); + if (sleep_to > std::chrono::system_clock::now()) { + std::this_thread::sleep_until(sleep_to); + } + + if (trace_type == kTraceWrite || trace_type == kTraceGet || + trace_type == kTraceIteratorSeek || + trace_type == kTraceIteratorSeekForPrev || + trace_type == kTraceMultiGet) { + std::unique_ptr ra(new ReplayerWorkerArg); + ra->trace_entry = std::move(trace); + ra->handler = exec_handler_.get(); + ra->trace_file_version = trace_file_version_; + ra->error_cb = error_cb; + ra->result_cb = result_callback; + thread_pool.Schedule(&ReplayerImpl::BackgroundWork, ra.release(), + nullptr, nullptr); + } else { + // Skip unsupported traces. + if (result_callback != nullptr) { + result_callback(Status::NotSupported("Unsupported trace type."), + nullptr); + } + } + } + + thread_pool.WaitForJobsAndJoinAllThreads(); + if (!bg_s.ok()) { + s = bg_s; + } + } + + if (s.IsIncomplete()) { + // Reaching eof returns Incomplete status at the moment. + // Could happen when killing a process without calling EndTrace() API. + // TODO: Add better error handling. + trace_end_ = true; + return Status::OK(); + } + return s; +} + +uint64_t ReplayerImpl::GetHeaderTimestamp() const { return header_ts_; } + +Status ReplayerImpl::ReadHeader(Trace* header) { + assert(header != nullptr); + Status s = trace_reader_->Reset(); + if (!s.ok()) { + return s; + } + std::string encoded_trace; + // Read the trace head + s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + + return TracerHelper::DecodeHeader(encoded_trace, header); +} + +Status ReplayerImpl::ReadTrace(Trace* trace) { + assert(trace != nullptr); + std::string encoded_trace; + // We don't know if TraceReader is implemented thread-safe, so we protect the + // reading trace part with a mutex. The decoding part does not need to be + // protected since it's local. + { + std::lock_guard guard(mutex_); + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + } + return TracerHelper::DecodeTrace(encoded_trace, trace); +} + +void ReplayerImpl::BackgroundWork(void* arg) { + std::unique_ptr ra( + reinterpret_cast(arg)); + assert(ra != nullptr); + + std::unique_ptr record; + Status s = TracerHelper::DecodeTraceRecord(&(ra->trace_entry), + ra->trace_file_version, &record); + if (!s.ok()) { + // Stop the replay + if (ra->error_cb != nullptr) { + ra->error_cb(s, ra->trace_entry.ts); + } + // Report the result + if (ra->result_cb != nullptr) { + ra->result_cb(s, nullptr); + } + return; + } + + if (ra->result_cb == nullptr) { + s = record->Accept(ra->handler, nullptr); + } else { + std::unique_ptr res; + s = record->Accept(ra->handler, &res); + ra->result_cb(s, std::move(res)); + } + record.reset(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/replayer_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/replayer_impl.h new file mode 100644 index 0000000000..f484ab237f --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/trace/replayer_impl.h @@ -0,0 +1,84 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
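The replayer_impl.cc unit above is reached through DB::NewDefaultReplayer(), and the FileTraceReader/FileTraceWriter pair from the previous file feeds it. A rough end-to-end sketch, assuming an open DB* db and an Env* env; the trace path and the unchecked statuses are illustrative shortcuts, not part of this patch:

    #include "rocksdb/db.h"
    #include "rocksdb/trace_reader_writer.h"
    #include "rocksdb/utilities/replayer.h"

    // Record a trace of the live workload.
    std::unique_ptr<rocksdb::TraceWriter> trace_writer;
    rocksdb::Status s = rocksdb::NewFileTraceWriter(
        env, rocksdb::EnvOptions(), "/tmp/query_trace", &trace_writer);
    s = db->StartTrace(rocksdb::TraceOptions(), std::move(trace_writer));
    // ... run the workload ...
    s = db->EndTrace();

    // Replay it later at 2x speed on a single thread.
    std::unique_ptr<rocksdb::TraceReader> trace_reader;
    s = rocksdb::NewFileTraceReader(env, rocksdb::EnvOptions(),
                                    "/tmp/query_trace", &trace_reader);
    std::unique_ptr<rocksdb::Replayer> replayer;
    s = db->NewDefaultReplayer({db->DefaultColumnFamily()},
                               std::move(trace_reader), &replayer);
    s = replayer->Prepare();
    rocksdb::ReplayOptions replay_opts;
    replay_opts.num_threads = 1;     // single-threaded branch of Replay()
    replay_opts.fast_forward = 2.0;  // halve the inter-record sleeps
    s = replayer->Replay(replay_opts, nullptr /* result_callback */);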
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "rocksdb/trace_reader_writer.h" +#include "rocksdb/trace_record.h" +#include "rocksdb/trace_record_result.h" +#include "rocksdb/utilities/replayer.h" +#include "trace_replay/trace_replay.h" + +namespace ROCKSDB_NAMESPACE { + +class ReplayerImpl : public Replayer { + public: + ReplayerImpl(DB* db, const std::vector& handles, + std::unique_ptr&& reader); + ~ReplayerImpl() override; + + using Replayer::Prepare; + Status Prepare() override; + + using Replayer::Next; + Status Next(std::unique_ptr* record) override; + + using Replayer::Execute; + Status Execute(const std::unique_ptr& record, + std::unique_ptr* result) override; + + using Replayer::Replay; + Status Replay( + const ReplayOptions& options, + const std::function&&)>& + result_callback) override; + + using Replayer::GetHeaderTimestamp; + uint64_t GetHeaderTimestamp() const override; + + private: + Status ReadHeader(Trace* header); + Status ReadTrace(Trace* trace); + + // Generic function to execute a Trace in a thread pool. + static void BackgroundWork(void* arg); + + std::unique_ptr trace_reader_; + std::mutex mutex_; + std::atomic prepared_; + std::atomic trace_end_; + uint64_t header_ts_; + std::unique_ptr exec_handler_; + Env* env_; + // When reading the trace header, the trace file version can be parsed. + // Replayer will use different decode method to get the trace content based + // on different trace file version. + int trace_file_version_; +}; + +// Arguments passed to BackgroundWork() for replaying in a thread pool. +struct ReplayerWorkerArg { + Trace trace_entry; + int trace_file_version; + // Handler to execute TraceRecord. + TraceRecord::Handler* handler; + // Callback function to report the error status and the timestamp of the + // TraceRecord (not the start/end timestamp of executing the TraceRecord). + std::function error_cb; + // Callback function to report the trace execution status and operation + // execution status/result(s). + std::function&&)> result_cb; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/lock_manager.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/lock_manager.cc new file mode 100644 index 0000000000..f0bef953b6 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/lock_manager.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
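The lock_manager.cc unit that follows defines NewLockManager(), which returns the default point lock manager unless TransactionDBOptions::lock_mgr_handle installs a custom one (for example a range lock manager). A sketch of the options that feed it from the embedder's side; the values and the /tmp path are illustrative, only the field names come from the options struct:

    #include "rocksdb/utilities/transaction_db.h"

    rocksdb::TransactionDBOptions txn_db_options;
    txn_db_options.num_stripes = 16;         // lock stripes per column family
    txn_db_options.max_num_locks = 1 << 20;  // lock budget; -1 = unlimited
    txn_db_options.max_num_deadlocks = 5;    // deadlock info buffer size
    // Leaving txn_db_options.lock_mgr_handle unset selects PointLockManager.

    rocksdb::Options options;
    options.create_if_missing = true;
    rocksdb::TransactionDB* txn_db = nullptr;
    rocksdb::Status s = rocksdb::TransactionDB::Open(
        options, txn_db_options, "/tmp/txn_db_example", &txn_db);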
+ + +#include "utilities/transactions/lock/lock_manager.h" + +#include "utilities/transactions/lock/point/point_lock_manager.h" + +namespace ROCKSDB_NAMESPACE { + +std::shared_ptr NewLockManager(PessimisticTransactionDB* db, + const TransactionDBOptions& opt) { + assert(db); + if (opt.lock_mgr_handle) { + // A custom lock manager was provided in options + auto mgr = opt.lock_mgr_handle->getLockManager(); + return std::shared_ptr(opt.lock_mgr_handle, mgr); + } else { + // Use a point lock manager by default + return std::shared_ptr(new PointLockManager(db, opt)); + } +} + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/lock_manager.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/lock_manager.h new file mode 100644 index 0000000000..d90ed9b007 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/lock_manager.h @@ -0,0 +1,80 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "utilities/transactions/lock/lock_tracker.h" +#include "utilities/transactions/pessimistic_transaction.h" + +namespace ROCKSDB_NAMESPACE { + +class PessimisticTransactionDB; + +class LockManager { + public: + virtual ~LockManager() {} + + // Whether supports locking a specific key. + virtual bool IsPointLockSupported() const = 0; + + // Whether supports locking a range of keys. + virtual bool IsRangeLockSupported() const = 0; + + // Locks acquired through this LockManager should be tracked by + // the LockTrackers created through the returned factory. + virtual const LockTrackerFactory& GetLockTrackerFactory() const = 0; + + // Enable locking for the specified column family. + // Caller should guarantee that this column family is not already enabled. + virtual void AddColumnFamily(const ColumnFamilyHandle* cf) = 0; + + // Disable locking for the specified column family. + // Caller should guarantee that this column family is no longer used. + virtual void RemoveColumnFamily(const ColumnFamilyHandle* cf) = 0; + + // Attempt to lock a key or a key range. If OK status is returned, the caller + // is responsible for calling UnLock() on this key. + virtual Status TryLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, + const std::string& key, Env* env, bool exclusive) = 0; + // The range [start, end] are inclusive at both sides. + virtual Status TryLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, const Endpoint& start, + const Endpoint& end, Env* env, bool exclusive) = 0; + + // Unlock a key or a range locked by TryLock(). txn must be the same + // Transaction that locked this key. 
+ virtual void UnLock(PessimisticTransaction* txn, const LockTracker& tracker, + Env* env) = 0; + virtual void UnLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, const std::string& key, + Env* env) = 0; + virtual void UnLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, const Endpoint& start, + const Endpoint& end, Env* env) = 0; + + using PointLockStatus = std::unordered_multimap; + virtual PointLockStatus GetPointLockStatus() = 0; + + using RangeLockStatus = + std::unordered_multimap; + virtual RangeLockStatus GetRangeLockStatus() = 0; + + virtual std::vector GetDeadlockInfoBuffer() = 0; + + virtual void Resize(uint32_t new_size) = 0; +}; + +// LockManager should always be constructed through this factory method, +// instead of constructing through concrete implementations' constructor. +// Caller owns the returned pointer. +std::shared_ptr NewLockManager(PessimisticTransactionDB* db, + const TransactionDBOptions& opt); + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/lock_tracker.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/lock_tracker.h new file mode 100644 index 0000000000..fe09ab22c5 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/lock_tracker.h @@ -0,0 +1,207 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction_db.h" + +namespace ROCKSDB_NAMESPACE { + +// Request for locking a single key. +struct PointLockRequest { + // The id of the key's column family. + ColumnFamilyId column_family_id = 0; + // The key to lock. + std::string key; + // The sequence number from which there is no concurrent update to key. + SequenceNumber seq = 0; + // Whether the lock is acquired only for read. + bool read_only = false; + // Whether the lock is in exclusive mode. + bool exclusive = true; +}; + +// Request for locking a range of keys. +struct RangeLockRequest { + // The id of the key's column family. + ColumnFamilyId column_family_id; + + // The range to be locked + Endpoint start_endp; + Endpoint end_endp; +}; + +struct PointLockStatus { + // Whether the key is locked. + bool locked = false; + // Whether the key is locked in exclusive mode. + bool exclusive = true; + // The sequence number in the tracked PointLockRequest. + SequenceNumber seq = 0; +}; + +// Return status when calling LockTracker::Untrack. +enum class UntrackStatus { + // The lock is not tracked at all, so no lock to untrack. + NOT_TRACKED, + // The lock is untracked but not removed from the tracker. + UNTRACKED, + // The lock is removed from the tracker. + REMOVED, +}; + +// Tracks the lock requests. +// In PessimisticTransaction, it tracks the locks acquired through LockMgr; +// In OptimisticTransaction, since there is no LockMgr, it tracks the lock +// intention. Not thread-safe. +class LockTracker { + public: + virtual ~LockTracker() {} + + // Whether supports locking a specific key. + virtual bool IsPointLockSupported() const = 0; + + // Whether supports locking a range of keys. 
+  virtual bool IsRangeLockSupported() const = 0;
+
+  // Tracks the acquirement of a lock on key.
+  //
+  // If this method is not supported, leave it as a no-op.
+  virtual void Track(const PointLockRequest& /*lock_request*/) = 0;
+
+  // Untracks the lock on a key.
+  // seq and exclusive in lock_request are not used.
+  //
+  // If this method is not supported, leave it as a no-op and
+  // return NOT_TRACKED.
+  virtual UntrackStatus Untrack(const PointLockRequest& /*lock_request*/) = 0;
+
+  // Counterpart of Track(const PointLockRequest&) for RangeLockRequest.
+  virtual void Track(const RangeLockRequest& /*lock_request*/) = 0;
+
+  // Counterpart of Untrack(const PointLockRequest&) for RangeLockRequest.
+  virtual UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) = 0;
+
+  // Merges lock requests tracked in the specified tracker into the current
+  // tracker.
+  //
+  // E.g. for point lock, if a key in tracker is not yet tracked,
+  // track this new key; otherwise, merge the tracked information of the key
+  // such as lock's exclusiveness, read/write statistics.
+  //
+  // If this method is not supported, leave it as a no-op.
+  //
+  // REQUIRED: the specified tracker must be of the same concrete class type as
+  // the current tracker.
+  virtual void Merge(const LockTracker& /*tracker*/) = 0;
+
+  // This is a reverse operation of Merge.
+  //
+  // E.g. for point lock, if a key exists in both the current and the
+  // specified tracker, then subtract the information (such as read/write
+  // statistics) of the key in the specified tracker from the current tracker.
+  //
+  // If this method is not supported, leave it as a no-op.
+  //
+  // REQUIRED:
+  // The specified tracker must be of the same concrete class type as
+  // the current tracker.
+  // The tracked locks in the specified tracker must be a subset of those
+  // tracked by the current tracker.
+  virtual void Subtract(const LockTracker& /*tracker*/) = 0;
+
+  // Clears all tracked locks.
+  virtual void Clear() = 0;
+
+  // Gets the new locks (excluding the locks that have been tracked before the
+  // save point) tracked since the specified save point; the result is stored
+  // in an internally constructed LockTracker and returned.
+  //
+  // save_point_tracker is the tracker used by a SavePoint to track locks
+  // tracked after creating the SavePoint.
+  //
+  // The implementation should document whether point lock, or range lock, or
+  // both are considered in this method.
+  // If this method is not supported, returns nullptr.
+  //
+  // REQUIRED:
+  // The save_point_tracker must be of the same concrete class type as the
+  // current tracker.
+  // The tracked locks in the specified tracker must be a subset of those
+  // tracked by the current tracker.
+  virtual LockTracker* GetTrackedLocksSinceSavePoint(
+      const LockTracker& /*save_point_tracker*/) const = 0;
+
+  // Gets lock-related information of the key.
+  //
+  // If point lock is not supported, always returns LockStatus with
+  // locked=false.
+  virtual PointLockStatus GetPointLockStatus(
+      ColumnFamilyId /*column_family_id*/,
+      const std::string& /*key*/) const = 0;
+
+  // Gets the number of tracked point locks.
+  //
+  // If point lock is not supported, always returns 0.
+  virtual uint64_t GetNumPointLocks() const = 0;
+
+  class ColumnFamilyIterator {
+   public:
+    virtual ~ColumnFamilyIterator() {}
+
+    // Whether there are remaining column families.
+    virtual bool HasNext() const = 0;
+
+    // Gets the next column family id.
+ // + // If HasNext is false, calling this method has undefined behavior. + virtual ColumnFamilyId Next() = 0; + }; + + // Gets an iterator for column families. + // + // Returned iterator must not be nullptr. + // If there is no column family to iterate, + // returns an empty non-null iterator. + // Caller owns the returned pointer. + virtual ColumnFamilyIterator* GetColumnFamilyIterator() const = 0; + + class KeyIterator { + public: + virtual ~KeyIterator() {} + + // Whether there are remaining keys. + virtual bool HasNext() const = 0; + + // Gets the next key. + // + // If HasNext is false, calling this method has undefined behavior. + virtual const std::string& Next() = 0; + }; + + // Gets an iterator for keys with tracked point locks in the column family. + // + // The column family must exist. + // Returned iterator must not be nullptr. + // Caller owns the returned pointer. + virtual KeyIterator* GetKeyIterator( + ColumnFamilyId /*column_family_id*/) const = 0; +}; + +// LockTracker should always be constructed through this factory. +// Each LockManager owns a LockTrackerFactory. +class LockTrackerFactory { + public: + // Caller owns the returned pointer. + virtual LockTracker* Create() const = 0; + virtual ~LockTrackerFactory() {} +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_manager.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_manager.cc new file mode 100644 index 0000000000..b73b3fe761 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_manager.cc @@ -0,0 +1,719 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
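lock_tracker.h above is an internal interface, but the iterator protocol it defines is easiest to see in use. A sketch of walking every tracked point lock, given some valid LockTracker* tracker (a hypothetical variable, not part of this patch); PointLockManager::UnLock() in the next file uses exactly this pattern to bucket keys by stripe:

    // Ownership of the returned iterators is the caller's, hence unique_ptr.
    std::unique_ptr<rocksdb::LockTracker::ColumnFamilyIterator> cf_it(
        tracker->GetColumnFamilyIterator());
    while (cf_it->HasNext()) {
      rocksdb::ColumnFamilyId cf = cf_it->Next();
      std::unique_ptr<rocksdb::LockTracker::KeyIterator> key_it(
          tracker->GetKeyIterator(cf));
      while (key_it->HasNext()) {
        const std::string& key = key_it->Next();
        rocksdb::PointLockStatus st = tracker->GetPointLockStatus(cf, key);
        // st.locked / st.exclusive / st.seq describe the tracked lock.
      }
    }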
+
+
+#include "utilities/transactions/lock/point/point_lock_manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <mutex>
+
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/transaction_db_mutex.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/hash.h"
+#include "util/thread_local.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct LockInfo {
+  bool exclusive;
+  autovector<TransactionID> txn_ids;
+
+  // Transaction locks are not valid after this time in us
+  uint64_t expiration_time;
+
+  LockInfo(TransactionID id, uint64_t time, bool ex)
+      : exclusive(ex), expiration_time(time) {
+    txn_ids.push_back(id);
+  }
+  LockInfo(const LockInfo& lock_info)
+      : exclusive(lock_info.exclusive),
+        txn_ids(lock_info.txn_ids),
+        expiration_time(lock_info.expiration_time) {}
+  void operator=(const LockInfo& lock_info) {
+    exclusive = lock_info.exclusive;
+    txn_ids = lock_info.txn_ids;
+    expiration_time = lock_info.expiration_time;
+  }
+  DECLARE_DEFAULT_MOVES(LockInfo);
+};
+
+struct LockMapStripe {
+  explicit LockMapStripe(std::shared_ptr<TransactionDBMutexFactory> factory) {
+    stripe_mutex = factory->AllocateMutex();
+    stripe_cv = factory->AllocateCondVar();
+    assert(stripe_mutex);
+    assert(stripe_cv);
+  }
+
+  // Mutex must be held before modifying keys map
+  std::shared_ptr<TransactionDBMutex> stripe_mutex;
+
+  // Condition Variable per stripe for waiting on a lock
+  std::shared_ptr<TransactionDBCondVar> stripe_cv;
+
+  // Locked keys mapped to the info about the transactions that locked them.
+  // TODO(agiardullo): Explore performance of other data structures.
+  UnorderedMap<std::string, LockInfo> keys;
+};
+
+// Map of #num_stripes LockMapStripes
+struct LockMap {
+  explicit LockMap(size_t num_stripes,
+                   std::shared_ptr<TransactionDBMutexFactory> factory)
+      : num_stripes_(num_stripes) {
+    lock_map_stripes_.reserve(num_stripes);
+    for (size_t i = 0; i < num_stripes; i++) {
+      LockMapStripe* stripe = new LockMapStripe(factory);
+      lock_map_stripes_.push_back(stripe);
+    }
+  }
+
+  ~LockMap() {
+    for (auto stripe : lock_map_stripes_) {
+      delete stripe;
+    }
+  }
+
+  // Number of separate LockMapStripes to create, each with their own Mutex
+  const size_t num_stripes_;
+
+  // Count of keys that are currently locked in this column family.
+  // (Only maintained if PointLockManager::max_num_locks_ is positive.)
+  std::atomic<int64_t> lock_cnt{0};
+
+  std::vector<LockMapStripe*> lock_map_stripes_;
+
+  size_t GetStripe(const std::string& key) const;
+};
+
+namespace {
+void UnrefLockMapsCache(void* ptr) {
+  // Called when a thread exits or a ThreadLocalPtr gets destroyed.
+  auto lock_maps_cache =
+      static_cast<UnorderedMap<uint32_t, std::shared_ptr<LockMap>>*>(ptr);
+  delete lock_maps_cache;
+}
+}  // anonymous namespace
+
+PointLockManager::PointLockManager(PessimisticTransactionDB* txn_db,
+                                   const TransactionDBOptions& opt)
+    : txn_db_impl_(txn_db),
+      default_num_stripes_(opt.num_stripes),
+      max_num_locks_(opt.max_num_locks),
+      lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)),
+      dlock_buffer_(opt.max_num_deadlocks),
+      mutex_factory_(opt.custom_mutex_factory
+                         ?
opt.custom_mutex_factory + : std::make_shared()) {} + +size_t LockMap::GetStripe(const std::string& key) const { + assert(num_stripes_ > 0); + return FastRange64(GetSliceNPHash64(key), num_stripes_); +} + +void PointLockManager::AddColumnFamily(const ColumnFamilyHandle* cf) { + InstrumentedMutexLock l(&lock_map_mutex_); + + if (lock_maps_.find(cf->GetID()) == lock_maps_.end()) { + lock_maps_.emplace(cf->GetID(), std::make_shared( + default_num_stripes_, mutex_factory_)); + } else { + // column_family already exists in lock map + assert(false); + } +} + +void PointLockManager::RemoveColumnFamily(const ColumnFamilyHandle* cf) { + // Remove lock_map for this column family. Since the lock map is stored + // as a shared ptr, concurrent transactions can still keep using it + // until they release their references to it. + { + InstrumentedMutexLock l(&lock_map_mutex_); + + auto lock_maps_iter = lock_maps_.find(cf->GetID()); + if (lock_maps_iter == lock_maps_.end()) { + return; + } + + lock_maps_.erase(lock_maps_iter); + } // lock_map_mutex_ + + // Clear all thread-local caches + autovector local_caches; + lock_maps_cache_->Scrape(&local_caches, nullptr); + for (auto cache : local_caches) { + delete static_cast(cache); + } +} + +// Look up the LockMap std::shared_ptr for a given column_family_id. +// Note: The LockMap is only valid as long as the caller is still holding on +// to the returned std::shared_ptr. +std::shared_ptr PointLockManager::GetLockMap( + ColumnFamilyId column_family_id) { + // First check thread-local cache + if (lock_maps_cache_->Get() == nullptr) { + lock_maps_cache_->Reset(new LockMaps()); + } + + auto lock_maps_cache = static_cast(lock_maps_cache_->Get()); + + auto lock_map_iter = lock_maps_cache->find(column_family_id); + if (lock_map_iter != lock_maps_cache->end()) { + // Found lock map for this column family. + return lock_map_iter->second; + } + + // Not found in local cache, grab mutex and check shared LockMaps + InstrumentedMutexLock l(&lock_map_mutex_); + + lock_map_iter = lock_maps_.find(column_family_id); + if (lock_map_iter == lock_maps_.end()) { + return std::shared_ptr(nullptr); + } else { + // Found lock map. Store in thread-local cache and return. + std::shared_ptr& lock_map = lock_map_iter->second; + lock_maps_cache->insert({column_family_id, lock_map}); + + return lock_map; + } +} + +// Returns true if this lock has expired and can be acquired by another +// transaction. +// If false, sets *expire_time to the expiration time of the lock according +// to Env->GetMicros() or 0 if no expiration. 
+bool PointLockManager::IsLockExpired(TransactionID txn_id, + const LockInfo& lock_info, Env* env, + uint64_t* expire_time) { + if (lock_info.expiration_time == 0) { + *expire_time = 0; + return false; + } + + auto now = env->NowMicros(); + bool expired = lock_info.expiration_time <= now; + if (!expired) { + // return how many microseconds until lock will be expired + *expire_time = lock_info.expiration_time; + } else { + for (auto id : lock_info.txn_ids) { + if (txn_id == id) { + continue; + } + + bool success = txn_db_impl_->TryStealingExpiredTransactionLocks(id); + if (!success) { + expired = false; + *expire_time = 0; + break; + } + } + } + + return expired; +} + +Status PointLockManager::TryLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, + const std::string& key, Env* env, + bool exclusive) { + // Lookup lock map for this column family id + std::shared_ptr lock_map_ptr = GetLockMap(column_family_id); + LockMap* lock_map = lock_map_ptr.get(); + if (lock_map == nullptr) { + char msg[255]; + snprintf(msg, sizeof(msg), "Column family id not found: %" PRIu32, + column_family_id); + + return Status::InvalidArgument(msg); + } + + // Need to lock the mutex for the stripe that this key hashes to + size_t stripe_num = lock_map->GetStripe(key); + assert(lock_map->lock_map_stripes_.size() > stripe_num); + LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num); + + LockInfo lock_info(txn->GetID(), txn->GetExpirationTime(), exclusive); + int64_t timeout = txn->GetLockTimeout(); + + return AcquireWithTimeout(txn, lock_map, stripe, column_family_id, key, env, + timeout, lock_info); +} + +// Helper function for TryLock(). +Status PointLockManager::AcquireWithTimeout( + PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe, + ColumnFamilyId column_family_id, const std::string& key, Env* env, + int64_t timeout, const LockInfo& lock_info) { + Status result; + uint64_t end_time = 0; + + if (timeout > 0) { + uint64_t start_time = env->NowMicros(); + end_time = start_time + timeout; + } + + if (timeout < 0) { + // If timeout is negative, we wait indefinitely to acquire the lock + result = stripe->stripe_mutex->Lock(); + } else { + result = stripe->stripe_mutex->TryLockFor(timeout); + } + + if (!result.ok()) { + // failed to acquire mutex + return result; + } + + // Acquire lock if we are able to + uint64_t expire_time_hint = 0; + autovector wait_ids; + result = AcquireLocked(lock_map, stripe, key, env, lock_info, + &expire_time_hint, &wait_ids); + + if (!result.ok() && timeout != 0) { + PERF_TIMER_GUARD(key_lock_wait_time); + PERF_COUNTER_ADD(key_lock_wait_count, 1); + // If we weren't able to acquire the lock, we will keep retrying as long + // as the timeout allows. + bool timed_out = false; + do { + // Decide how long to wait + int64_t cv_end_time = -1; + if (expire_time_hint > 0 && end_time > 0) { + cv_end_time = std::min(expire_time_hint, end_time); + } else if (expire_time_hint > 0) { + cv_end_time = expire_time_hint; + } else if (end_time > 0) { + cv_end_time = end_time; + } + + assert(result.IsBusy() || wait_ids.size() != 0); + + // We are dependent on a transaction to finish, so perform deadlock + // detection. 
+ if (wait_ids.size() != 0) { + if (txn->IsDeadlockDetect()) { + if (IncrementWaiters(txn, wait_ids, key, column_family_id, + lock_info.exclusive, env)) { + result = Status::Busy(Status::SubCode::kDeadlock); + stripe->stripe_mutex->UnLock(); + return result; + } + } + txn->SetWaitingTxn(wait_ids, column_family_id, &key); + } + + TEST_SYNC_POINT("PointLockManager::AcquireWithTimeout:WaitingTxn"); + if (cv_end_time < 0) { + // Wait indefinitely + result = stripe->stripe_cv->Wait(stripe->stripe_mutex); + } else { + uint64_t now = env->NowMicros(); + if (static_cast(cv_end_time) > now) { + result = stripe->stripe_cv->WaitFor(stripe->stripe_mutex, + cv_end_time - now); + } + } + + if (wait_ids.size() != 0) { + txn->ClearWaitingTxn(); + if (txn->IsDeadlockDetect()) { + DecrementWaiters(txn, wait_ids); + } + } + + if (result.IsTimedOut()) { + timed_out = true; + // Even though we timed out, we will still make one more attempt to + // acquire lock below (it is possible the lock expired and we + // were never signaled). + } + + if (result.ok() || result.IsTimedOut()) { + result = AcquireLocked(lock_map, stripe, key, env, lock_info, + &expire_time_hint, &wait_ids); + } + } while (!result.ok() && !timed_out); + } + + stripe->stripe_mutex->UnLock(); + + return result; +} + +void PointLockManager::DecrementWaiters( + const PessimisticTransaction* txn, + const autovector& wait_ids) { + std::lock_guard lock(wait_txn_map_mutex_); + DecrementWaitersImpl(txn, wait_ids); +} + +void PointLockManager::DecrementWaitersImpl( + const PessimisticTransaction* txn, + const autovector& wait_ids) { + auto id = txn->GetID(); + assert(wait_txn_map_.Contains(id)); + wait_txn_map_.Delete(id); + + for (auto wait_id : wait_ids) { + rev_wait_txn_map_.Get(wait_id)--; + if (rev_wait_txn_map_.Get(wait_id) == 0) { + rev_wait_txn_map_.Delete(wait_id); + } + } +} + +bool PointLockManager::IncrementWaiters( + const PessimisticTransaction* txn, + const autovector& wait_ids, const std::string& key, + const uint32_t& cf_id, const bool& exclusive, Env* const env) { + auto id = txn->GetID(); + std::vector queue_parents( + static_cast(txn->GetDeadlockDetectDepth())); + std::vector queue_values( + static_cast(txn->GetDeadlockDetectDepth())); + std::lock_guard lock(wait_txn_map_mutex_); + assert(!wait_txn_map_.Contains(id)); + + wait_txn_map_.Insert(id, {wait_ids, cf_id, exclusive, key}); + + for (auto wait_id : wait_ids) { + if (rev_wait_txn_map_.Contains(wait_id)) { + rev_wait_txn_map_.Get(wait_id)++; + } else { + rev_wait_txn_map_.Insert(wait_id, 1); + } + } + + // No deadlock if nobody is waiting on self. + if (!rev_wait_txn_map_.Contains(id)) { + return false; + } + + const auto* next_ids = &wait_ids; + int parent = -1; + int64_t deadlock_time = 0; + for (int tail = 0, head = 0; head < txn->GetDeadlockDetectDepth(); head++) { + int i = 0; + if (next_ids) { + for (; i < static_cast(next_ids->size()) && + tail + i < txn->GetDeadlockDetectDepth(); + i++) { + queue_values[tail + i] = (*next_ids)[i]; + queue_parents[tail + i] = parent; + } + tail += i; + } + + // No more items in the list, meaning no deadlock. 
+ if (tail == head) { + return false; + } + + auto next = queue_values[head]; + if (next == id) { + std::vector path; + while (head != -1) { + assert(wait_txn_map_.Contains(queue_values[head])); + + auto extracted_info = wait_txn_map_.Get(queue_values[head]); + path.push_back({queue_values[head], extracted_info.m_cf_id, + extracted_info.m_exclusive, + extracted_info.m_waiting_key}); + head = queue_parents[head]; + } + if (!env->GetCurrentTime(&deadlock_time).ok()) { + /* + TODO(AR) this preserves the current behaviour whilst checking the + status of env->GetCurrentTime to ensure that ASSERT_STATUS_CHECKED + passes. Should we instead raise an error if !ok() ? + */ + deadlock_time = 0; + } + std::reverse(path.begin(), path.end()); + dlock_buffer_.AddNewPath(DeadlockPath(path, deadlock_time)); + deadlock_time = 0; + DecrementWaitersImpl(txn, wait_ids); + return true; + } else if (!wait_txn_map_.Contains(next)) { + next_ids = nullptr; + continue; + } else { + parent = head; + next_ids = &(wait_txn_map_.Get(next).m_neighbors); + } + } + + // Wait cycle too big, just assume deadlock. + if (!env->GetCurrentTime(&deadlock_time).ok()) { + /* + TODO(AR) this preserves the current behaviour whilst checking the status + of env->GetCurrentTime to ensure that ASSERT_STATUS_CHECKED passes. + Should we instead raise an error if !ok() ? + */ + deadlock_time = 0; + } + dlock_buffer_.AddNewPath(DeadlockPath(deadlock_time, true)); + DecrementWaitersImpl(txn, wait_ids); + return true; +} + +// Try to lock this key after we have acquired the mutex. +// Sets *expire_time to the expiration time in microseconds +// or 0 if no expiration. +// REQUIRED: Stripe mutex must be held. +Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe, + const std::string& key, Env* env, + const LockInfo& txn_lock_info, + uint64_t* expire_time, + autovector* txn_ids) { + assert(txn_lock_info.txn_ids.size() == 1); + + Status result; + // Check if this key is already locked + auto stripe_iter = stripe->keys.find(key); + if (stripe_iter != stripe->keys.end()) { + // Lock already held + LockInfo& lock_info = stripe_iter->second; + assert(lock_info.txn_ids.size() == 1 || !lock_info.exclusive); + + if (lock_info.exclusive || txn_lock_info.exclusive) { + if (lock_info.txn_ids.size() == 1 && + lock_info.txn_ids[0] == txn_lock_info.txn_ids[0]) { + // The list contains one txn and we're it, so just take it. + lock_info.exclusive = txn_lock_info.exclusive; + lock_info.expiration_time = txn_lock_info.expiration_time; + } else { + // Check if it's expired. Skips over txn_lock_info.txn_ids[0] in case + // it's there for a shared lock with multiple holders which was not + // caught in the first case. + if (IsLockExpired(txn_lock_info.txn_ids[0], lock_info, env, + expire_time)) { + // lock is expired, can steal it + lock_info.txn_ids = txn_lock_info.txn_ids; + lock_info.exclusive = txn_lock_info.exclusive; + lock_info.expiration_time = txn_lock_info.expiration_time; + // lock_cnt does not change + } else { + result = Status::TimedOut(Status::SubCode::kLockTimeout); + *txn_ids = lock_info.txn_ids; + } + } + } else { + // We are requesting shared access to a shared lock, so just grant it. + lock_info.txn_ids.push_back(txn_lock_info.txn_ids[0]); + // Using std::max means that expiration time never goes down even when + // a transaction is removed from the list. The correct solution would be + // to track expiry for every transaction, but this would also work for + // now. 
+// Try to lock this key after we have acquired the mutex.
+// Sets *expire_time to the expiration time in microseconds
+// or 0 if no expiration.
+// REQUIRED: Stripe mutex must be held.
+Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
+                                       const std::string& key, Env* env,
+                                       const LockInfo& txn_lock_info,
+                                       uint64_t* expire_time,
+                                       autovector<TransactionID>* txn_ids) {
+  assert(txn_lock_info.txn_ids.size() == 1);
+
+  Status result;
+  // Check if this key is already locked
+  auto stripe_iter = stripe->keys.find(key);
+  if (stripe_iter != stripe->keys.end()) {
+    // Lock already held
+    LockInfo& lock_info = stripe_iter->second;
+    assert(lock_info.txn_ids.size() == 1 || !lock_info.exclusive);
+
+    if (lock_info.exclusive || txn_lock_info.exclusive) {
+      if (lock_info.txn_ids.size() == 1 &&
+          lock_info.txn_ids[0] == txn_lock_info.txn_ids[0]) {
+        // The list contains one txn and we're it, so just take it.
+        lock_info.exclusive = txn_lock_info.exclusive;
+        lock_info.expiration_time = txn_lock_info.expiration_time;
+      } else {
+        // Check if it's expired. Skips over txn_lock_info.txn_ids[0] in case
+        // it's there for a shared lock with multiple holders which was not
+        // caught in the first case.
+        if (IsLockExpired(txn_lock_info.txn_ids[0], lock_info, env,
+                          expire_time)) {
+          // lock is expired, can steal it
+          lock_info.txn_ids = txn_lock_info.txn_ids;
+          lock_info.exclusive = txn_lock_info.exclusive;
+          lock_info.expiration_time = txn_lock_info.expiration_time;
+          // lock_cnt does not change
+        } else {
+          result = Status::TimedOut(Status::SubCode::kLockTimeout);
+          *txn_ids = lock_info.txn_ids;
+        }
+      }
+    } else {
+      // We are requesting shared access to a shared lock, so just grant it.
+      lock_info.txn_ids.push_back(txn_lock_info.txn_ids[0]);
+      // Using std::max means that expiration time never goes down even when
+      // a transaction is removed from the list. The correct solution would be
+      // to track expiry for every transaction, but this would also work for
+      // now.
+      lock_info.expiration_time =
+          std::max(lock_info.expiration_time, txn_lock_info.expiration_time);
+    }
+  } else {  // Lock not held.
+    // Check lock limit
+    if (max_num_locks_ > 0 &&
+        lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) {
+      result = Status::Busy(Status::SubCode::kLockLimit);
+    } else {
+      // acquire lock
+      stripe->keys.emplace(key, txn_lock_info);
+
+      // Maintain lock count if there is a limit on the number of locks
+      if (max_num_locks_) {
+        lock_map->lock_cnt++;
+      }
+    }
+  }
+
+  return result;
+}
+
+void PointLockManager::UnLockKey(PessimisticTransaction* txn,
+                                 const std::string& key, LockMapStripe* stripe,
+                                 LockMap* lock_map, Env* env) {
+#ifdef NDEBUG
+  (void)env;
+#endif
+  TransactionID txn_id = txn->GetID();
+
+  auto stripe_iter = stripe->keys.find(key);
+  if (stripe_iter != stripe->keys.end()) {
+    auto& txns = stripe_iter->second.txn_ids;
+    auto txn_it = std::find(txns.begin(), txns.end(), txn_id);
+    // Found the key we locked. unlock it.
+    if (txn_it != txns.end()) {
+      if (txns.size() == 1) {
+        stripe->keys.erase(stripe_iter);
+      } else {
+        auto last_it = txns.end() - 1;
+        if (txn_it != last_it) {
+          *txn_it = *last_it;
+        }
+        txns.pop_back();
+      }
+
+      if (max_num_locks_ > 0) {
+        // Maintain lock count if there is a limit on the number of locks.
+        assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0);
+        lock_map->lock_cnt--;
+      }
+    }
+  } else {
+    // This key is either not locked or locked by someone else. This should
+    // only happen if the unlocking transaction has expired.
+    assert(txn->GetExpirationTime() > 0 &&
+           txn->GetExpirationTime() < env->NowMicros());
+  }
+}
+
+void PointLockManager::UnLock(PessimisticTransaction* txn,
+                              ColumnFamilyId column_family_id,
+                              const std::string& key, Env* env) {
+  std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+  LockMap* lock_map = lock_map_ptr.get();
+  if (lock_map == nullptr) {
+    // Column Family must have been dropped.
+    return;
+  }
+
+  // Lock the mutex for the stripe that this key hashes to
+  size_t stripe_num = lock_map->GetStripe(key);
+  assert(lock_map->lock_map_stripes_.size() > stripe_num);
+  LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+  stripe->stripe_mutex->Lock().PermitUncheckedError();
+  UnLockKey(txn, key, stripe, lock_map, env);
+  stripe->stripe_mutex->UnLock();
+
+  // Signal waiting threads to retry locking
+  stripe->stripe_cv->NotifyAll();
+}
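
One detail worth spelling out from the unlock path above: releasing a lock never hands it to a specific waiter. The releaser drops the entry under the stripe mutex and then wakes every thread parked on the stripe's condition variable, and each woken thread simply re-runs AcquireLocked. A generic sketch of that shape with standard primitives (the real code goes through RocksDB's pluggable TransactionDBMutex/TransactionDBCondVar instead):

    #include <condition_variable>
    #include <mutex>

    std::mutex stripe_mutex;
    std::condition_variable stripe_cv;

    bool TryAcquireLocked() { return true; }  // stub standing in for AcquireLocked()

    void AcquireBlocking() {
      std::unique_lock<std::mutex> lk(stripe_mutex);
      while (!TryAcquireLocked()) {
        stripe_cv.wait(lk);  // woken by notify_all in Release()
      }
    }

    void Release() {
      { std::lock_guard<std::mutex> lk(stripe_mutex); /* erase lock entry */ }
      stripe_cv.notify_all();  // all waiters retry; no direct hand-off
    }
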
+void PointLockManager::UnLock(PessimisticTransaction* txn,
+                              const LockTracker& tracker, Env* env) {
+  std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+      tracker.GetColumnFamilyIterator());
+  assert(cf_it != nullptr);
+  while (cf_it->HasNext()) {
+    ColumnFamilyId cf = cf_it->Next();
+    std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(cf);
+    LockMap* lock_map = lock_map_ptr.get();
+    if (!lock_map) {
+      // Column Family must have been dropped.
+      return;
+    }
+
+    // Bucket keys by lock_map_ stripe
+    UnorderedMap<size_t, std::vector<const std::string*>> keys_by_stripe(
+        lock_map->num_stripes_);
+    std::unique_ptr<LockTracker::KeyIterator> key_it(
+        tracker.GetKeyIterator(cf));
+    assert(key_it != nullptr);
+    while (key_it->HasNext()) {
+      const std::string& key = key_it->Next();
+      size_t stripe_num = lock_map->GetStripe(key);
+      keys_by_stripe[stripe_num].push_back(&key);
+    }
+
+    // For each stripe, grab the stripe mutex and unlock all keys in this
+    // stripe
+    for (auto& stripe_iter : keys_by_stripe) {
+      size_t stripe_num = stripe_iter.first;
+      auto& stripe_keys = stripe_iter.second;
+
+      assert(lock_map->lock_map_stripes_.size() > stripe_num);
+      LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+      stripe->stripe_mutex->Lock().PermitUncheckedError();
+
+      for (const std::string* key : stripe_keys) {
+        UnLockKey(txn, *key, stripe, lock_map, env);
+      }
+
+      stripe->stripe_mutex->UnLock();
+
+      // Signal waiting threads to retry locking
+      stripe->stripe_cv->NotifyAll();
+    }
+  }
+}
+
+PointLockManager::PointLockStatus PointLockManager::GetPointLockStatus() {
+  PointLockStatus data;
+  // Lock order here is important. The correct order is lock_map_mutex_, then
+  // for every column family ID in ascending order lock every stripe in
+  // ascending order.
+  InstrumentedMutexLock l(&lock_map_mutex_);
+
+  std::vector<uint32_t> cf_ids;
+  for (const auto& map : lock_maps_) {
+    cf_ids.push_back(map.first);
+  }
+  std::sort(cf_ids.begin(), cf_ids.end());
+
+  for (auto i : cf_ids) {
+    const auto& stripes = lock_maps_[i]->lock_map_stripes_;
+    // Iterate and lock all stripes in ascending order.
+    for (const auto& j : stripes) {
+      j->stripe_mutex->Lock().PermitUncheckedError();
+      for (const auto& it : j->keys) {
+        struct KeyLockInfo info;
+        info.exclusive = it.second.exclusive;
+        info.key = it.first;
+        for (const auto& id : it.second.txn_ids) {
+          info.ids.push_back(id);
+        }
+        data.insert({i, info});
+      }
+    }
+  }
+
+  // Unlock everything. Unlocking order is not important.
+  for (auto i : cf_ids) {
+    const auto& stripes = lock_maps_[i]->lock_map_stripes_;
+    for (const auto& j : stripes) {
+      j->stripe_mutex->UnLock();
+    }
+  }
+
+  return data;
+}
+
+std::vector<DeadlockPath> PointLockManager::GetDeadlockInfoBuffer() {
+  return dlock_buffer_.PrepareBuffer();
+}
+
+void PointLockManager::Resize(uint32_t target_size) {
+  dlock_buffer_.Resize(target_size);
+}
+
+PointLockManager::RangeLockStatus PointLockManager::GetRangeLockStatus() {
+  return {};
+}
+
+Status PointLockManager::TryLock(PessimisticTransaction* /* txn */,
+                                 ColumnFamilyId /* cf_id */,
+                                 const Endpoint& /* start */,
+                                 const Endpoint& /* end */, Env* /* env */,
+                                 bool /* exclusive */) {
+  return Status::NotSupported(
+      "PointLockManager does not support range locking");
+}
+
+void PointLockManager::UnLock(PessimisticTransaction* /* txn */,
+                              ColumnFamilyId /* cf_id */,
+                              const Endpoint& /* start */,
+                              const Endpoint& /* end */, Env* /* env */) {
+  // no-op
+}
+
+}  // namespace ROCKSDB_NAMESPACE
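
That closes out the point lock manager implementation. One structural note before the header: all per-key state lives in a stripe chosen by hashing the key, so unrelated keys rarely contend on the same stripe mutex or condition variable. A standalone illustration of the striping idea (std::hash is a stand-in here; LockMap::GetStripe uses RocksDB's own hash function):

    #include <cstddef>
    #include <functional>
    #include <string>

    size_t GetStripeFor(const std::string& key, size_t num_stripes) {
      // Any stable hash works for bucketing; only the modulus matters.
      return std::hash<std::string>{}(key) % num_stripes;
    }

    // GetStripeFor("k1", 16) and GetStripeFor("k2", 16) usually differ, so
    // unlocking "k1" does not wake threads queued on "k2"'s stripe.
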
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_manager.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_manager.h
new file mode 100644
index 0000000000..99183ca1cd
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_manager.h
@@ -0,0 +1,222 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <mutex>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/utilities/transaction.h"
+#include "util/autovector.h"
+#include "util/hash_containers.h"
+#include "util/hash_map.h"
+#include "util/thread_local.h"
+#include "utilities/transactions/lock/lock_manager.h"
+#include "utilities/transactions/lock/point/point_lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyHandle;
+struct LockInfo;
+struct LockMap;
+struct LockMapStripe;
+
+template <class Path>
+class DeadlockInfoBufferTempl {
+ private:
+  std::vector<Path> paths_buffer_;
+  uint32_t buffer_idx_;
+  std::mutex paths_buffer_mutex_;
+
+  std::vector<Path> Normalize() {
+    auto working = paths_buffer_;
+
+    if (working.empty()) {
+      return working;
+    }
+
+    // Next write occurs at a nonexistent path's slot
+    if (paths_buffer_[buffer_idx_].empty()) {
+      working.resize(buffer_idx_);
+    } else {
+      std::rotate(working.begin(), working.begin() + buffer_idx_,
+                  working.end());
+    }
+
+    return working;
+  }
+
+ public:
+  explicit DeadlockInfoBufferTempl(uint32_t n_latest_dlocks)
+      : paths_buffer_(n_latest_dlocks), buffer_idx_(0) {}
+
+  void AddNewPath(Path path) {
+    std::lock_guard<std::mutex> lock(paths_buffer_mutex_);
+
+    if (paths_buffer_.empty()) {
+      return;
+    }
+
+    paths_buffer_[buffer_idx_] = std::move(path);
+    buffer_idx_ = (buffer_idx_ + 1) % paths_buffer_.size();
+  }
+
+  void Resize(uint32_t target_size) {
+    std::lock_guard<std::mutex> lock(paths_buffer_mutex_);
+
+    paths_buffer_ = Normalize();
+
+    // Drop the deadlocks that will no longer be needed after the normalize
+    if (target_size < paths_buffer_.size()) {
+      paths_buffer_.erase(
+          paths_buffer_.begin(),
+          paths_buffer_.begin() + (paths_buffer_.size() - target_size));
+      buffer_idx_ = 0;
+    }
+    // Resize the buffer to the target size and restore the buffer's idx
+    else {
+      auto prev_size = paths_buffer_.size();
+      paths_buffer_.resize(target_size);
+      buffer_idx_ = (uint32_t)prev_size;
+    }
+  }
+
+  std::vector<Path> PrepareBuffer() {
+    std::lock_guard<std::mutex> lock(paths_buffer_mutex_);
+
+    // Reversing the normalized vector returns the latest deadlocks first
+    auto working = Normalize();
+    std::reverse(working.begin(), working.end());
+
+    return working;
+  }
+};
+
+using DeadlockInfoBuffer = DeadlockInfoBufferTempl<DeadlockPath>;
+
+struct TrackedTrxInfo {
+  autovector<TransactionID> m_neighbors;
+  uint32_t m_cf_id;
+  bool m_exclusive;
+  std::string m_waiting_key;
+};
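
To make the ring-buffer semantics above concrete: with a capacity of two, recording three deadlock paths overwrites the oldest one, and PrepareBuffer hands back the survivors newest-first. A toy standalone analogue, with plain ints standing in for DeadlockPath and 0 playing the "empty path" role:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      std::vector<int> buf(2, 0);  // capacity 2, all slots "empty"
      uint32_t idx = 0;
      for (int path : {1, 2, 3}) {  // AddNewPath x3: 3 overwrites 1
        buf[idx] = path;
        idx = (idx + 1) % buf.size();
      }
      // Normalize: the slot at idx is full, so rotate oldest-first...
      std::rotate(buf.begin(), buf.begin() + idx, buf.end());
      // ...then PrepareBuffer reverses to get newest-first.
      std::reverse(buf.begin(), buf.end());
      for (int p : buf) std::cout << p << ' ';  // prints: 3 2
      return 0;
    }
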
+class PointLockManager : public LockManager {
+ public:
+  PointLockManager(PessimisticTransactionDB* db,
+                   const TransactionDBOptions& opt);
+  // No copying allowed
+  PointLockManager(const PointLockManager&) = delete;
+  PointLockManager& operator=(const PointLockManager&) = delete;
+
+  ~PointLockManager() override {}
+
+  bool IsPointLockSupported() const override { return true; }
+
+  bool IsRangeLockSupported() const override { return false; }
+
+  const LockTrackerFactory& GetLockTrackerFactory() const override {
+    return PointLockTrackerFactory::Get();
+  }
+
+  // Creates a new LockMap for this column family. Caller should guarantee
+  // that this column family does not already exist.
+  void AddColumnFamily(const ColumnFamilyHandle* cf) override;
+  // Deletes the LockMap for this column family. Caller should guarantee that
+  // this column family is no longer in use.
+  void RemoveColumnFamily(const ColumnFamilyHandle* cf) override;
+
+  Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+                 const std::string& key, Env* env, bool exclusive) override;
+  Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+                 const Endpoint& start, const Endpoint& end, Env* env,
+                 bool exclusive) override;
+
+  void UnLock(PessimisticTransaction* txn, const LockTracker& tracker,
+              Env* env) override;
+  void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+              const std::string& key, Env* env) override;
+  void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+              const Endpoint& start, const Endpoint& end, Env* env) override;
+
+  PointLockStatus GetPointLockStatus() override;
+
+  RangeLockStatus GetRangeLockStatus() override;
+
+  std::vector<DeadlockPath> GetDeadlockInfoBuffer() override;
+
+  void Resize(uint32_t new_size) override;
+
+ private:
+  PessimisticTransactionDB* txn_db_impl_;
+
+  // Default number of lock map stripes per column family
+  const size_t default_num_stripes_;
+
+  // Limit on number of keys locked per column family
+  const int64_t max_num_locks_;
+
+  // The following lock order must be satisfied in order to avoid deadlocking
+  // ourselves.
+  //   - lock_map_mutex_
+  //   - stripe mutexes in ascending cf id, ascending stripe order
+  //   - wait_txn_map_mutex_
+  //
+  // Must be held when accessing/modifying lock_maps_.
+  InstrumentedMutex lock_map_mutex_;
+
+  // Map of ColumnFamilyId to locked key info
+  using LockMaps = UnorderedMap<uint32_t, std::shared_ptr<LockMap>>;
+  LockMaps lock_maps_;
+
+  // Thread-local cache of entries in lock_maps_. This is an optimization
+  // to avoid acquiring a mutex in order to look up a LockMap
+  std::unique_ptr<ThreadLocalPtr> lock_maps_cache_;
+
+  // Must be held when modifying wait_txn_map_ and rev_wait_txn_map_.
+  std::mutex wait_txn_map_mutex_;
+
+  // Maps from waitee -> number of waiters.
+  HashMap<TransactionID, int> rev_wait_txn_map_;
+  // Maps from waiter -> waitee.
+  HashMap<TransactionID, TrackedTrxInfo> wait_txn_map_;
+  DeadlockInfoBuffer dlock_buffer_;
+
+  // Used to allocate mutexes/condvars to use when locking keys
+  std::shared_ptr<TransactionDBMutexFactory> mutex_factory_;
+
+  bool IsLockExpired(TransactionID txn_id, const LockInfo& lock_info, Env* env,
+                     uint64_t* wait_time);
+
+  std::shared_ptr<LockMap> GetLockMap(uint32_t column_family_id);
+
+  Status AcquireWithTimeout(PessimisticTransaction* txn, LockMap* lock_map,
+                            LockMapStripe* stripe, uint32_t column_family_id,
+                            const std::string& key, Env* env, int64_t timeout,
+                            const LockInfo& lock_info);
+
+  Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
+                       const std::string& key, Env* env,
+                       const LockInfo& lock_info, uint64_t* wait_time,
+                       autovector<TransactionID>* txn_ids);
+
+  void UnLockKey(PessimisticTransaction* txn, const std::string& key,
+                 LockMapStripe* stripe, LockMap* lock_map, Env* env);
+
+  bool IncrementWaiters(const PessimisticTransaction* txn,
+                        const autovector<TransactionID>& wait_ids,
+                        const std::string& key, const uint32_t& cf_id,
+                        const bool& exclusive, Env* const env);
+  void DecrementWaiters(const PessimisticTransaction* txn,
+                        const autovector<TransactionID>& wait_ids);
+  void DecrementWaitersImpl(const PessimisticTransaction* txn,
+                            const autovector<TransactionID>& wait_ids);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
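
The lock-order comment in this header is the load-bearing piece: as long as every code path climbs lock_map_mutex_, then stripe mutexes, then wait_txn_map_mutex_ in that one direction, the manager cannot deadlock against itself. The usual two-mutex illustration of why a single global order suffices:

    #include <mutex>

    std::mutex level1;  // plays lock_map_mutex_
    std::mutex level2;  // plays a stripe mutex

    void CorrectOrder() {
      std::lock_guard<std::mutex> a(level1);  // always level 1 first
      std::lock_guard<std::mutex> b(level2);  // then level 2
    }

    // If any thread took level2 before level1, two threads could each hold
    // one mutex and wait on the other; pinning one global order, as the
    // header's comment does, rules that interleaving out.
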
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_manager_test.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_manager_test.h
new file mode 100644
index 0000000000..ca9f46bf9d
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_manager_test.h
@@ -0,0 +1,324 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "file/file_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "utilities/transactions/lock/point/point_lock_manager.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MockColumnFamilyHandle : public ColumnFamilyHandle {
+ public:
+  explicit MockColumnFamilyHandle(ColumnFamilyId cf_id) : cf_id_(cf_id) {}
+
+  ~MockColumnFamilyHandle() override {}
+
+  const std::string& GetName() const override { return name_; }
+
+  ColumnFamilyId GetID() const override { return cf_id_; }
+
+  Status GetDescriptor(ColumnFamilyDescriptor*) override {
+    return Status::OK();
+  }
+
+  const Comparator* GetComparator() const override {
+    return BytewiseComparator();
+  }
+
+ private:
+  ColumnFamilyId cf_id_;
+  std::string name_ = "MockCF";
+};
+
+class PointLockManagerTest : public testing::Test {
+ public:
+  void SetUp() override {
+    env_ = Env::Default();
+    db_dir_ = test::PerThreadDBPath("point_lock_manager_test");
+    ASSERT_OK(env_->CreateDir(db_dir_));
+
+    Options opt;
+    opt.create_if_missing = true;
+    TransactionDBOptions txn_opt;
+    txn_opt.transaction_lock_timeout = 0;
+
+    ASSERT_OK(TransactionDB::Open(opt, txn_opt, db_dir_, &db_));
+
+    // CAUTION: This test creates a separate lock manager object (right, NOT
+    // the one that the TransactionDB is using!), and runs tests on it.
+    locker_.reset(new PointLockManager(
+        static_cast<PessimisticTransactionDB*>(db_), txn_opt));
+
+    wait_sync_point_name_ = "PointLockManager::AcquireWithTimeout:WaitingTxn";
+  }
+
+  void TearDown() override {
+    delete db_;
+    EXPECT_OK(DestroyDir(env_, db_dir_));
+  }
+
+  PessimisticTransaction* NewTxn(
+      TransactionOptions txn_opt = TransactionOptions()) {
+    Transaction* txn = db_->BeginTransaction(WriteOptions(), txn_opt);
+    return reinterpret_cast<PessimisticTransaction*>(txn);
+  }
+
+ protected:
+  Env* env_;
+  std::shared_ptr<LockManager> locker_;
+  const char* wait_sync_point_name_;
+  friend void PointLockManagerTestExternalSetup(PointLockManagerTest*);
+
+ private:
+  std::string db_dir_;
+  TransactionDB* db_;
+};
+
+using init_func_t = void (*)(PointLockManagerTest*);
+
+class AnyLockManagerTest : public PointLockManagerTest,
+                           public testing::WithParamInterface<init_func_t> {
+ public:
+  void SetUp() override {
+    // If a custom setup function was provided, use it. Otherwise, use what we
+    // have inherited.
+    auto init_func = GetParam();
+    if (init_func)
+      (*init_func)(this);
+    else
+      PointLockManagerTest::SetUp();
+  }
+};
+
+TEST_P(AnyLockManagerTest, ReentrantExclusiveLock) {
+  // Tests that a txn can acquire exclusive lock on the same key repeatedly.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, ReentrantSharedLock) {
+  // Tests that a txn can acquire shared lock on the same key repeatedly.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockUpgrade) {
+  // Tests that a txn can upgrade from a shared lock to an exclusive lock.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockDowngrade) {
+  // Tests that a txn can acquire a shared lock after acquiring an exclusive
+  // lock on the same key.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+  delete txn;
+}
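
AnyLockManagerTest is parameterized by an init_func_t precisely so that other lock managers can swap in their own fixture setup and reuse these tests; the instantiations themselves live in the .cc files that include this header. A hedged sketch of what such an instantiation looks like, where a nullptr parameter falls through to the inherited PointLockManagerTest::SetUp (hypothetical here; older gtest spells the macro INSTANTIATE_TEST_CASE_P):

    // Hypothetical; actual instantiations live in the including .cc files.
    INSTANTIATE_TEST_SUITE_P(PointLockManager, AnyLockManagerTest,
                             ::testing::Values(nullptr));
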
+TEST_P(AnyLockManagerTest, LockConflict) {
+  // Tests that lock conflicts lead to lock timeout.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn1 = NewTxn();
+  auto txn2 = NewTxn();
+
+  {
+    // exclusive-exclusive conflict.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+    auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  {
+    // exclusive-shared conflict.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true));
+    auto s = locker_->TryLock(txn2, 1, "k2", env_, false);
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  {
+    // shared-exclusive conflict.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, false));
+    auto s = locker_->TryLock(txn2, 1, "k2", env_, true);
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k1", env_);
+  locker_->UnLock(txn1, 1, "k2", env_);
+
+  delete txn1;
+  delete txn2;
+}
+
+port::Thread BlockUntilWaitingTxn(const char* sync_point_name,
+                                  std::function<void()> f) {
+  std::atomic<bool> reached(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      sync_point_name, [&](void* /*arg*/) { reached.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread t(f);
+
+  while (!reached.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  return t;
+}
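
BlockUntilWaitingTxn above leans on a RocksDB sync point: the callback fires inside the background thread just before it blocks on the lock, so spinning on the atomic flag guarantees the thread has really reached the waiting state before the test proceeds. The flag-plus-spin skeleton in isolation:

    #include <atomic>
    #include <chrono>
    #include <thread>

    int main() {
      std::atomic<bool> reached(false);
      std::thread t([&] {
        reached.store(true);  // in the tests, a sync point callback sets this
        // ... then the thread blocks waiting for a lock ...
      });
      while (!reached.load()) {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
      }
      // The worker has provably entered its critical region by now.
      t.join();
      return 0;
    }
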
+TEST_P(AnyLockManagerTest, SharedLocks) {
+  // Tests that shared locks can be concurrently held by multiple transactions.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn1 = NewTxn();
+  auto txn2 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k", env_);
+  locker_->UnLock(txn2, 1, "k", env_);
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(AnyLockManagerTest, Deadlock) {
+  // Tests that deadlock can be detected.
+  // Deadlock scenario:
+  // txn1 exclusively locks k1, and wants to lock k2;
+  // txn2 exclusively locks k2, and wants to lock k1.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = 1000000;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true));
+
+  // txn1 tries to lock k2, will block forever.
+  port::Thread t = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+    // block because txn2 is holding a lock on k2.
+    locker_->TryLock(txn1, 1, "k2", env_, true);
+  });
+
+  auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+  ASSERT_TRUE(s.IsBusy());
+  ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock);
+
+  std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer();
+  ASSERT_EQ(deadlock_paths.size(), 1u);
+  ASSERT_FALSE(deadlock_paths[0].limit_exceeded);
+
+  std::vector<DeadlockInfo> deadlocks = deadlock_paths[0].path;
+  ASSERT_EQ(deadlocks.size(), 2u);
+
+  ASSERT_EQ(deadlocks[0].m_txn_id, txn1->GetID());
+  ASSERT_EQ(deadlocks[0].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[0].m_exclusive);
+  ASSERT_EQ(deadlocks[0].m_waiting_key, "k2");
+
+  ASSERT_EQ(deadlocks[1].m_txn_id, txn2->GetID());
+  ASSERT_EQ(deadlocks[1].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[1].m_exclusive);
+  ASSERT_EQ(deadlocks[1].m_waiting_key, "k1");
+
+  locker_->UnLock(txn2, 1, "k2", env_);
+  t.join();
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k1", env_);
+  locker_->UnLock(txn1, 1, "k2", env_);
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(AnyLockManagerTest, GetWaitingTxns_MultipleTxns) {
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+
+  auto txn1 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
+
+  auto txn2 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
+
+  auto txn3 = NewTxn();
+  txn3->SetLockTimeout(10000);
+  port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k", env_, true));
+    locker_->UnLock(txn3, 1, "k", env_);
+  });
+
+  // Ok, now txn3 is waiting for lock on "k", which is owned by two
+  // transactions. Check that GetWaitingTxns reports this correctly
+  uint32_t wait_cf_id;
+  std::string wait_key;
+  auto waiters = txn3->GetWaitingTxns(&wait_cf_id, &wait_key);
+
+  ASSERT_EQ(wait_cf_id, 1u);
+  ASSERT_EQ(wait_key, "k");
+  ASSERT_EQ(waiters.size(), 2);
+  bool waits_correct =
+      (waiters[0] == txn1->GetID() && waiters[1] == txn2->GetID()) ||
+      (waiters[1] == txn1->GetID() && waiters[0] == txn2->GetID());
+  ASSERT_EQ(waits_correct, true);
+
+  // Release locks so txn3 can proceed with execution
+  locker_->UnLock(txn1, 1, "k", env_);
+  locker_->UnLock(txn2, 1, "k", env_);
+
+  // Wait until txn3 finishes
+  t1.join();
+
+  delete txn1;
+  delete txn2;
+  delete txn3;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_tracker.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_tracker.cc
new file mode 100644
index 0000000000..ce86c64868
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_tracker.cc
@@ -0,0 +1,255 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include "utilities/transactions/lock/point/point_lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class TrackedKeysColumnFamilyIterator
+    : public LockTracker::ColumnFamilyIterator {
+ public:
+  explicit TrackedKeysColumnFamilyIterator(const TrackedKeys& keys)
+      : tracked_keys_(keys), it_(keys.begin()) {}
+
+  bool HasNext() const override { return it_ != tracked_keys_.end(); }
+
+  ColumnFamilyId Next() override { return (it_++)->first; }
+
+ private:
+  const TrackedKeys& tracked_keys_;
+  TrackedKeys::const_iterator it_;
+};
+
+class TrackedKeysIterator : public LockTracker::KeyIterator {
+ public:
+  TrackedKeysIterator(const TrackedKeys& keys, ColumnFamilyId id)
+      : key_infos_(keys.at(id)), it_(key_infos_.begin()) {}
+
+  bool HasNext() const override { return it_ != key_infos_.end(); }
+
+  const std::string& Next() override { return (it_++)->first; }
+
+ private:
+  const TrackedKeyInfos& key_infos_;
+  TrackedKeyInfos::const_iterator it_;
+};
+
+}  // namespace
+
+void PointLockTracker::Track(const PointLockRequest& r) {
+  auto& keys = tracked_keys_[r.column_family_id];
+  auto result = keys.try_emplace(r.key, r.seq);
+  auto it = result.first;
+  if (!result.second && r.seq < it->second.seq) {
+    // Now tracking this key with an earlier sequence number
+    it->second.seq = r.seq;
+  }
+  // else we do not update the seq. The smaller the tracked seq, the stronger
+  // the guarantee, since it implies that from that seq onward there has not
+  // been a concurrent update to the key. So we only update the seq when doing
+  // so strengthens the guarantee, i.e., when the new seq is smaller than the
+  // existing tracked seq.
+
+  if (r.read_only) {
+    it->second.num_reads++;
+  } else {
+    it->second.num_writes++;
+  }
+
+  it->second.exclusive = it->second.exclusive || r.exclusive;
+}
+
+UntrackStatus PointLockTracker::Untrack(const PointLockRequest& r) {
+  auto cf_keys = tracked_keys_.find(r.column_family_id);
+  if (cf_keys == tracked_keys_.end()) {
+    return UntrackStatus::NOT_TRACKED;
+  }
+
+  auto& keys = cf_keys->second;
+  auto it = keys.find(r.key);
+  if (it == keys.end()) {
+    return UntrackStatus::NOT_TRACKED;
+  }
+
+  bool untracked = false;
+  auto& info = it->second;
+  if (r.read_only) {
+    if (info.num_reads > 0) {
+      info.num_reads--;
+      untracked = true;
+    }
+  } else {
+    if (info.num_writes > 0) {
+      info.num_writes--;
+      untracked = true;
+    }
+  }
+
+  bool removed = false;
+  if (info.num_reads == 0 && info.num_writes == 0) {
+    keys.erase(it);
+    if (keys.empty()) {
+      tracked_keys_.erase(cf_keys);
+    }
+    removed = true;
+  }
+
+  if (removed) {
+    return UntrackStatus::REMOVED;
+  }
+  if (untracked) {
+    return UntrackStatus::UNTRACKED;
+  }
+  return UntrackStatus::NOT_TRACKED;
+}
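
The counters above behave like per-key reference counts split by access type: Track bumps num_reads or num_writes, Untrack decrements the matching one, and the key leaves the map only when both reach zero. The same logic in miniature, with hypothetical names and a plain map:

    #include <cstdint>
    #include <string>
    #include <unordered_map>

    struct Counts { uint32_t reads = 0, writes = 0; };
    std::unordered_map<std::string, Counts> tracked;

    void Track(const std::string& k, bool read_only) {
      auto& c = tracked[k];
      read_only ? ++c.reads : ++c.writes;
    }

    void Untrack(const std::string& k, bool read_only) {
      auto it = tracked.find(k);
      if (it == tracked.end()) return;  // was never tracked
      auto& c = it->second;
      if (read_only ? c.reads > 0 : c.writes > 0) {
        read_only ? --c.reads : --c.writes;
      }
      if (c.reads == 0 && c.writes == 0) tracked.erase(it);  // fully released
    }
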
+void PointLockTracker::Merge(const LockTracker& tracker) {
+  const PointLockTracker& t = static_cast<const PointLockTracker&>(tracker);
+  for (const auto& cf_keys : t.tracked_keys_) {
+    ColumnFamilyId cf = cf_keys.first;
+    const auto& keys = cf_keys.second;
+
+    auto current_cf_keys = tracked_keys_.find(cf);
+    if (current_cf_keys == tracked_keys_.end()) {
+      tracked_keys_.emplace(cf_keys);
+    } else {
+      auto& current_keys = current_cf_keys->second;
+      for (const auto& key_info : keys) {
+        const std::string& key = key_info.first;
+        const TrackedKeyInfo& info = key_info.second;
+        // If key was not previously tracked, just copy the whole struct over.
+        // Otherwise, some merging needs to occur.
+        auto current_info = current_keys.find(key);
+        if (current_info == current_keys.end()) {
+          current_keys.emplace(key_info);
+        } else {
+          current_info->second.Merge(info);
+        }
+      }
+    }
+  }
+}
+
+void PointLockTracker::Subtract(const LockTracker& tracker) {
+  const PointLockTracker& t = static_cast<const PointLockTracker&>(tracker);
+  for (const auto& cf_keys : t.tracked_keys_) {
+    ColumnFamilyId cf = cf_keys.first;
+    const auto& keys = cf_keys.second;
+
+    auto& current_keys = tracked_keys_.at(cf);
+    for (const auto& key_info : keys) {
+      const std::string& key = key_info.first;
+      const TrackedKeyInfo& info = key_info.second;
+      uint32_t num_reads = info.num_reads;
+      uint32_t num_writes = info.num_writes;
+
+      auto current_key_info = current_keys.find(key);
+      assert(current_key_info != current_keys.end());
+
+      // Decrement the total reads/writes of this key by the number of
+      // reads/writes done since the last SavePoint.
+      if (num_reads > 0) {
+        assert(current_key_info->second.num_reads >= num_reads);
+        current_key_info->second.num_reads -= num_reads;
+      }
+      if (num_writes > 0) {
+        assert(current_key_info->second.num_writes >= num_writes);
+        current_key_info->second.num_writes -= num_writes;
+      }
+      if (current_key_info->second.num_reads == 0 &&
+          current_key_info->second.num_writes == 0) {
+        current_keys.erase(current_key_info);
+      }
+    }
+  }
+}
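
Subtract relies on the asserts above: the argument tracker must be a subset of this one, because it was recorded between a savepoint and now. Per key the arithmetic is a plain checked subtraction; for instance, three tracked writes minus two savepoint writes leaves one, and the key stays in the map until reads and writes both hit zero:

    #include <cassert>

    int main() {
      unsigned total_writes = 3;  // tracked over the whole transaction
      unsigned delta_writes = 2;  // tracked since the last savepoint
      assert(total_writes >= delta_writes);  // the subset invariant
      total_writes -= delta_writes;          // one tracked write remains
      return total_writes == 1 ? 0 : 1;
    }
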
+LockTracker* PointLockTracker::GetTrackedLocksSinceSavePoint(
+    const LockTracker& save_point_tracker) const {
+  // Examine the number of reads/writes performed on all keys written
+  // since the last SavePoint and compare to the total number of reads/writes
+  // for each key.
+  LockTracker* t = new PointLockTracker();
+  const PointLockTracker& save_point_t =
+      static_cast<const PointLockTracker&>(save_point_tracker);
+  for (const auto& cf_keys : save_point_t.tracked_keys_) {
+    ColumnFamilyId cf = cf_keys.first;
+    const auto& keys = cf_keys.second;
+
+    auto& current_keys = tracked_keys_.at(cf);
+    for (const auto& key_info : keys) {
+      const std::string& key = key_info.first;
+      const TrackedKeyInfo& info = key_info.second;
+      uint32_t num_reads = info.num_reads;
+      uint32_t num_writes = info.num_writes;
+
+      auto current_key_info = current_keys.find(key);
+      assert(current_key_info != current_keys.end());
+      assert(current_key_info->second.num_reads >= num_reads);
+      assert(current_key_info->second.num_writes >= num_writes);
+
+      if (current_key_info->second.num_reads == num_reads &&
+          current_key_info->second.num_writes == num_writes) {
+        // All the reads/writes to this key were done in the last savepoint.
+        PointLockRequest r;
+        r.column_family_id = cf;
+        r.key = key;
+        r.seq = info.seq;
+        r.read_only = (num_writes == 0);
+        r.exclusive = info.exclusive;
+        t->Track(r);
+      }
+    }
+  }
+  return t;
+}
+
+PointLockStatus PointLockTracker::GetPointLockStatus(
+    ColumnFamilyId column_family_id, const std::string& key) const {
+  assert(IsPointLockSupported());
+  PointLockStatus status;
+  auto it = tracked_keys_.find(column_family_id);
+  if (it == tracked_keys_.end()) {
+    return status;
+  }
+
+  const auto& keys = it->second;
+  auto key_it = keys.find(key);
+  if (key_it == keys.end()) {
+    return status;
+  }
+
+  const TrackedKeyInfo& key_info = key_it->second;
+  status.locked = true;
+  status.exclusive = key_info.exclusive;
+  status.seq = key_info.seq;
+  return status;
+}
+
+uint64_t PointLockTracker::GetNumPointLocks() const {
+  uint64_t num_keys = 0;
+  for (const auto& cf_keys : tracked_keys_) {
+    num_keys += cf_keys.second.size();
+  }
+  return num_keys;
+}
+
+LockTracker::ColumnFamilyIterator* PointLockTracker::GetColumnFamilyIterator()
+    const {
+  return new TrackedKeysColumnFamilyIterator(tracked_keys_);
+}
+
+LockTracker::KeyIterator* PointLockTracker::GetKeyIterator(
+    ColumnFamilyId column_family_id) const {
+  assert(tracked_keys_.find(column_family_id) != tracked_keys_.end());
+  return new TrackedKeysIterator(tracked_keys_, column_family_id);
+}
+
+void PointLockTracker::Clear() { tracked_keys_.clear(); }
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_tracker.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_tracker.h
new file mode 100644
index 0000000000..57e1b8437a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/point/point_lock_tracker.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "utilities/transactions/lock/lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct TrackedKeyInfo {
+  // Earliest sequence number that is relevant to this transaction for this key
+  SequenceNumber seq;
+
+  uint32_t num_writes;
+  uint32_t num_reads;
+
+  bool exclusive;
+
+  explicit TrackedKeyInfo(SequenceNumber seq_no)
+      : seq(seq_no), num_writes(0), num_reads(0), exclusive(false) {}
+
+  void Merge(const TrackedKeyInfo& info) {
+    assert(seq <= info.seq);
+    num_reads += info.num_reads;
+    num_writes += info.num_writes;
+    exclusive = exclusive || info.exclusive;
+  }
+};
+
+using TrackedKeyInfos = std::unordered_map<std::string, TrackedKeyInfo>;
+
+using TrackedKeys = std::unordered_map<ColumnFamilyId, TrackedKeyInfos>;
+
+// Tracks point locks on single keys.
+class PointLockTracker : public LockTracker { + public: + PointLockTracker() = default; + + PointLockTracker(const PointLockTracker&) = delete; + PointLockTracker& operator=(const PointLockTracker&) = delete; + + bool IsPointLockSupported() const override { return true; } + + bool IsRangeLockSupported() const override { return false; } + + void Track(const PointLockRequest& lock_request) override; + + UntrackStatus Untrack(const PointLockRequest& lock_request) override; + + void Track(const RangeLockRequest& /*lock_request*/) override {} + + UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) override { + return UntrackStatus::NOT_TRACKED; + } + + void Merge(const LockTracker& tracker) override; + + void Subtract(const LockTracker& tracker) override; + + void Clear() override; + + virtual LockTracker* GetTrackedLocksSinceSavePoint( + const LockTracker& save_point_tracker) const override; + + PointLockStatus GetPointLockStatus(ColumnFamilyId column_family_id, + const std::string& key) const override; + + uint64_t GetNumPointLocks() const override; + + ColumnFamilyIterator* GetColumnFamilyIterator() const override; + + KeyIterator* GetKeyIterator(ColumnFamilyId column_family_id) const override; + + private: + TrackedKeys tracked_keys_; +}; + +class PointLockTrackerFactory : public LockTrackerFactory { + public: + static const PointLockTrackerFactory& Get() { + static const PointLockTrackerFactory instance; + return instance; + } + + LockTracker* Create() const override { return new PointLockTracker(); } + + private: + PointLockTrackerFactory() {} +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_lock_manager.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_lock_manager.h new file mode 100644 index 0000000000..22d653ff67 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_lock_manager.h @@ -0,0 +1,34 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+//
+// Generic definitions for a Range-based Lock Manager
+//
+#pragma once
+
+#include "utilities/transactions/lock/lock_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/*
+  A base class for all Range-based lock managers
+
+  See also class RangeLockManagerHandle in
+  include/rocksdb/utilities/transaction_db.h
+*/
+class RangeLockManagerBase : public LockManager {
+ public:
+  // Getting a point lock is reduced to getting a range lock on a
+  // single-point range
+  using LockManager::TryLock;
+  Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+                 const std::string& key, Env* env, bool exclusive) override {
+    Endpoint endp(key.data(), key.size(), false);
+    return TryLock(txn, column_family_id, endp, endp, env, exclusive);
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
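
The override above is the whole trick of the base class: a point lock on key k is re-expressed as a range lock on the degenerate interval [k, k]. Sketched with a simplified stand-in for rocksdb::Endpoint (illustration only; the real type carries a Slice and an infinite-suffix flag):

    #include <string>

    struct Endpoint {  // simplified stand-in, not the real rocksdb::Endpoint
      std::string key;
      bool inf_suffix = false;
    };

    int main() {
      std::string k = "k1";
      Endpoint start{k}, end{k};  // both endpoints identical: a point range
      return start.key == end.key ? 0 : 1;
    }
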
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/db.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/db.h
new file mode 100644
index 0000000000..5aa826c8e0
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/db.h
@@ -0,0 +1,76 @@
+#ifndef _DB_H
+#define _DB_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+typedef struct __toku_dbt DBT;
+
+// port: this is currently not used
+struct simple_dbt {
+  uint32_t len;
+  void *data;
+};
+
+// engine status info
+// engine status is passed to handlerton as an array of
+// TOKU_ENGINE_STATUS_ROW_S[]
+typedef enum {
+  STATUS_FS_STATE = 0,  // interpret as file system state (redzone) enum
+  STATUS_UINT64,        // interpret as uint64_t
+  STATUS_CHARSTR,       // interpret as char *
+  STATUS_UNIXTIME,      // interpret as time_t
+  STATUS_TOKUTIME,      // interpret as tokutime_t
+  STATUS_PARCOUNT,      // interpret as PARTITIONED_COUNTER
+  STATUS_DOUBLE         // interpret as double
+} toku_engine_status_display_type;
+
+typedef enum {
+  TOKU_ENGINE_STATUS = (1ULL << 0),  // Include when asking for engine status
+  TOKU_GLOBAL_STATUS =
+      (1ULL << 1),  // Include when asking for information_schema.global_status
+} toku_engine_status_include_type;
+
+typedef struct __toku_engine_status_row {
+  const char *keyname;  // info schema key, should not change across revisions
+                        // without good reason
+  const char
+      *columnname;  // column for mysql, e.g. information_schema.global_status.
+                    // TOKUDB_ will automatically be prefixed.
+  const char *legend;  // the text that will appear at user interface
+  toku_engine_status_display_type type;  // how to interpret the value
+  toku_engine_status_include_type
+      include;  // which kinds of callers should get to read this row?
+  union {
+    double dnum;
+    uint64_t num;
+    const char *str;
+    char datebuf[26];
+    struct partitioned_counter *parcount;
+  } value;
+} * TOKU_ENGINE_STATUS_ROW, TOKU_ENGINE_STATUS_ROW_S;
+
+#define DB_BUFFER_SMALL -30999
+#define DB_LOCK_DEADLOCK -30995
+#define DB_LOCK_NOTGRANTED -30994
+#define DB_NOTFOUND -30989
+#define DB_KEYEXIST -30996
+#define DB_DBT_MALLOC 8
+#define DB_DBT_REALLOC 64
+#define DB_DBT_USERMEM 256
+
+/* PerconaFT specific error codes */
+#define TOKUDB_OUT_OF_LOCKS -100000
+
+typedef void (*lock_wait_callback)(void *arg, uint64_t requesting_txnid,
+                                   uint64_t blocking_txnid);
+
+struct __toku_dbt {
+  void *data;
+  size_t size;
+  size_t ulen;
+  // One of DB_DBT_XXX flags
+  uint32_t flags;
+};
+
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h
new file mode 100644
index 0000000000..718efc6231
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h
@@ -0,0 +1,138 @@
+/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+ +#pragma once + +#include + +#include "../db.h" +#include "../portability/memory.h" +#include "../util/dbt.h" + +typedef int (*ft_compare_func)(void *arg, const DBT *a, const DBT *b); + +int toku_keycompare(const void *key1, size_t key1len, const void *key2, + size_t key2len); + +int toku_builtin_compare_fun(const DBT *, const DBT *) + __attribute__((__visibility__("default"))); + +namespace toku { + +// a comparator object encapsulates the data necessary for +// comparing two keys in a fractal tree. it further understands +// that points may be positive or negative infinity. + +class comparator { + void init(ft_compare_func cmp, void *cmp_arg, uint8_t memcmp_magic) { + _cmp = cmp; + _cmp_arg = cmp_arg; + _memcmp_magic = memcmp_magic; + } + + public: + // This magic value is reserved to mean that the magic has not been set. + static const uint8_t MEMCMP_MAGIC_NONE = 0; + + void create(ft_compare_func cmp, void *cmp_arg, + uint8_t memcmp_magic = MEMCMP_MAGIC_NONE) { + init(cmp, cmp_arg, memcmp_magic); + } + + // inherit the attributes of another comparator, but keep our own + // copy of fake_db that is owned separately from the one given. + void inherit(const comparator &cmp) { + invariant_notnull(cmp._cmp); + init(cmp._cmp, cmp._cmp_arg, cmp._memcmp_magic); + } + + // like inherit, but doesn't require that the this comparator + // was already created + void create_from(const comparator &cmp) { inherit(cmp); } + + void destroy() {} + + ft_compare_func get_compare_func() const { return _cmp; } + + uint8_t get_memcmp_magic() const { return _memcmp_magic; } + + bool valid() const { return _cmp != nullptr; } + + inline bool dbt_has_memcmp_magic(const DBT *dbt) const { + return *reinterpret_cast(dbt->data) == _memcmp_magic; + } + + int operator()(const DBT *a, const DBT *b) const { + if (__builtin_expect(toku_dbt_is_infinite(a) || toku_dbt_is_infinite(b), + 0)) { + return toku_dbt_infinite_compare(a, b); + } else if (_memcmp_magic != MEMCMP_MAGIC_NONE + // If `a' has the memcmp magic.. + && dbt_has_memcmp_magic(a) + // ..then we expect `b' to also have the memcmp magic + && __builtin_expect(dbt_has_memcmp_magic(b), 1)) { + assert(0); // psergey: this branch should not be taken. + return toku_builtin_compare_fun(a, b); + } else { + // yikes, const sadness here + return _cmp(_cmp_arg, a, b); + } + } + + private: + ft_compare_func _cmp; + void *_cmp_arg; + + uint8_t _memcmp_magic; +}; + +} /* namespace toku */ diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h new file mode 100644 index 0000000000..1b45111721 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h @@ -0,0 +1,102 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "../db.h" +#include "../portability/toku_race_tools.h" +#include "../util/status.h" + +// +// Lock Tree Manager statistics +// +class LTM_STATUS_S { + public: + enum { + LTM_SIZE_CURRENT = 0, + LTM_SIZE_LIMIT, + LTM_ESCALATION_COUNT, + LTM_ESCALATION_TIME, + LTM_ESCALATION_LATEST_RESULT, + LTM_NUM_LOCKTREES, + LTM_LOCK_REQUESTS_PENDING, + LTM_STO_NUM_ELIGIBLE, + LTM_STO_END_EARLY_COUNT, + LTM_STO_END_EARLY_TIME, + LTM_WAIT_COUNT, + LTM_WAIT_TIME, + LTM_LONG_WAIT_COUNT, + LTM_LONG_WAIT_TIME, + LTM_TIMEOUT_COUNT, + LTM_WAIT_ESCALATION_COUNT, + LTM_WAIT_ESCALATION_TIME, + LTM_LONG_WAIT_ESCALATION_COUNT, + LTM_LONG_WAIT_ESCALATION_TIME, + LTM_STATUS_NUM_ROWS // must be last + }; + + void init(void); + void destroy(void); + + TOKU_ENGINE_STATUS_ROW_S status[LTM_STATUS_NUM_ROWS]; + + private: + bool m_initialized = false; +}; +typedef LTM_STATUS_S* LTM_STATUS; +extern LTM_STATUS_S ltm_status; + +#define LTM_STATUS_VAL(x) ltm_status.status[LTM_STATUS_S::x].value.num + +void toku_status_init(void); // just call ltm_status.init(); +void toku_status_destroy(void); // just call ltm_status.destroy(); diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h new file mode 100644 index 0000000000..e1bfb86c50 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h @@ -0,0 +1,174 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "../ft/comparator.h" +#include "keyrange.h" +#include "treenode.h" + +namespace toku { + +// A concurrent_tree stores non-overlapping ranges. +// Access to disjoint parts of the tree usually occurs concurrently. + +class concurrent_tree { + public: + // A locked_keyrange gives you exclusive access to read and write + // operations that occur on any keys in that range. You only have + // the right to operate on keys in that range or keys that were read + // from the keyrange using iterate() + // + // Access model: + // - user prepares a locked keyrange. all threads serialize behind prepare(). + // - user breaks the serialzation point by acquiring a range, or releasing. + // - one thread operates on a certain locked_keyrange object at a time. + // - when the thread is finished, it releases + + class locked_keyrange { + public: + // effect: prepare to acquire a locked keyrange over the given + // concurrent_tree, preventing other threads from preparing + // until this thread either does acquire() or release(). + // note: operations performed on a prepared keyrange are equivalent + // to ones performed on an acquired keyrange over -inf, +inf. + // rationale: this provides the user with a serialization point for + // descending + // or modifying the the tree. it also proives a convenient way of + // doing serializable operations on the tree. + // There are two valid sequences of calls: + // - prepare, acquire, [operations], release + // - prepare, [operations],release + void prepare(concurrent_tree *tree); + + // requires: the locked keyrange was prepare()'d + // effect: acquire a locked keyrange over the given concurrent_tree. 
+    // the locked keyrange represents the range of keys overlapped
+    // by the given range
+    void acquire(const keyrange &range);
+
+    // effect: releases a locked keyrange and the mutex it holds
+    void release(void);
+
+    // effect: iterate over each range this locked_keyrange represents,
+    // calling function->fn() on each node's keyrange and txnid
+    // until there are no more or the function returns false
+    template <typename F>
+    void iterate(F *function) const {
+      // if the subtree is non-empty, traverse it by calling the given
+      // function on each range, txnid pair found that overlaps.
+      if (!m_subtree->is_empty()) {
+        m_subtree->traverse_overlaps(m_range, function);
+      }
+    }
+
+    // Adds another owner to the lock on the specified keyrange.
+    // requires: the keyrange contains one treenode whose bounds are
+    // exactly equal to the specified range (no sub/supersets)
+    bool add_shared_owner(const keyrange &range, TXNID new_owner);
+
+    // inserts the given range into the tree, with an associated txnid.
+    // requires: range does not overlap with anything in this locked_keyrange
+    // rationale: caller is responsible for only inserting unique ranges
+    void insert(const keyrange &range, TXNID txnid, bool is_shared);
+
+    // effect: removes the given range from the tree.
+    //         - txnid=TXNID_ANY means remove the range no matter what its
+    //           owners are
+    //         - Other value means remove the specified txnid from
+    //           ownership (if the range has other owners, it will remain
+    //           in the tree)
+    // requires: range exists exactly in this locked_keyrange
+    // rationale: caller is responsible for only removing existing ranges
+    void remove(const keyrange &range, TXNID txnid);
+
+    // effect: removes all of the keys represented by this locked keyrange
+    // rationale: we'd like a fast way to empty out a tree
+    void remove_all(void);
+
+   private:
+    // the concurrent tree this locked keyrange is for
+    concurrent_tree *m_tree;
+
+    // the range of keys this locked keyrange represents
+    keyrange m_range;
+
+    // the subtree under which all overlapping ranges exist
+    treenode *m_subtree;
+
+    friend class concurrent_tree_unit_test;
+  };
+
+  // effect: initialize the tree to an empty state
+  void create(const comparator *cmp);
+
+  // effect: destroy the tree.
+  // requires: tree is empty
+  void destroy(void);
+
+  // returns: true iff the tree is empty
+  bool is_empty(void);
+
+  // returns: the memory overhead of a single insertion into the tree
+  static uint64_t get_insertion_memory_overhead(void);
+
+ private:
+  // the root needs to always exist so there's a lock to grab
+  // even if the tree is empty. that's why we store a treenode
+  // here and not a pointer to one.
+  treenode m_root;
+
+  friend class concurrent_tree_unit_test;
+};
+
+} /* namespace toku */
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h
new file mode 100644
index 0000000000..f9aeea0c4e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h
@@ -0,0 +1,141 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+ + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "../ft/comparator.h" + +namespace toku { + +// A keyrange has a left and right key as endpoints. +// +// When a keyrange is created it owns no memory, but when it copies +// or extends another keyrange, it copies memory as necessary. This +// means it is cheap in the common case. + +class keyrange { + public: + // effect: constructor that borrows left and right key pointers. + // no memory is allocated or copied. + void create(const DBT *left_key, const DBT *right_key); + + // effect: constructor that allocates and copies another keyrange's points. + void create_copy(const keyrange &range); + + // effect: destroys the keyrange, freeing any allocated memory + void destroy(void); + + // effect: extends the keyrange by choosing the leftmost and rightmost + // endpoints from this range and the given range. + // replaced keys in this range are freed, new keys are copied. + void extend(const comparator &cmp, const keyrange &range); + + // returns: the amount of memory this keyrange takes. does not account + // for point optimizations or malloc overhead. 
+  uint64_t get_memory_size(void) const;
+
+  // returns: pointer to the left key of this range
+  const DBT *get_left_key(void) const;
+
+  // returns: pointer to the right key of this range
+  const DBT *get_right_key(void) const;
+
+  // two ranges are either equal, lt, gt, or overlapping
+  enum comparison { EQUALS, LESS_THAN, GREATER_THAN, OVERLAPS };
+
+  // effect: compares this range to the given range
+  // returns: LESS_THAN if given range is strictly to the left
+  //          GREATER_THAN if given range is strictly to the right
+  //          EQUALS if given range has the same left and right endpoints
+  //          OVERLAPS if at least one of the given range's endpoints falls
+  //          between this range's endpoints
+  comparison compare(const comparator &cmp, const keyrange &range) const;
+
+  // returns: true if the range and the given range are equal or overlapping
+  bool overlaps(const comparator &cmp, const keyrange &range) const;
+
+  // returns: a keyrange representing -inf, +inf
+  static keyrange get_infinite_range(void);
+
+ private:
+  // some keys should be copied, some keys should not be.
+  //
+  // to support both, we use two DBTs for copies and two pointers
+  // for temporaries. the access rule is:
+  // - if a pointer is non-null, then it represents the key.
+  // - otherwise the pointer is null, and the key is in the copy.
+  DBT m_left_key_copy;
+  DBT m_right_key_copy;
+  const DBT *m_left_key;
+  const DBT *m_right_key;
+
+  // if this range is a point range, then m_left_key == m_right_key
+  // and the actual data is stored exactly once in m_left_key_copy.
+  bool m_point_range;
+
+  // effect: initializes a keyrange to be empty
+  void init_empty(void);
+
+  // effect: copies the given key once into the left key copy
+  //         and sets the right key copy to share the left.
+  // rationale: optimization for point ranges to only do one malloc
+  void set_both_keys(const DBT *key);
+
+  // effect: destroys the current left key. sets and copies the new one.
+  void replace_left_key(const DBT *key);
+
+  // effect: destroys the current right key. sets and copies the new one.
+  void replace_right_key(const DBT *key);
+};
+
+} /* namespace toku */
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h
new file mode 100644
index 0000000000..d30e1e2cad
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h
@@ -0,0 +1,255 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+  PerconaFT is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2,
+  as published by the Free Software Foundation.
+
+  PerconaFT is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+  PerconaFT is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Affero General Public License, version 3,
+  as published by the Free Software Foundation.
+
+  PerconaFT is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU Affero General Public License for more details.
+
+  You should have received a copy of the GNU Affero General Public License
+  along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../db.h"
+#include "../ft/comparator.h"
+#include "../portability/toku_pthread.h"
+#include "locktree.h"
+#include "txnid_set.h"
+#include "wfg.h"
+
+namespace toku {
+
+// Information about a lock wait
+struct lock_wait_info {
+  locktree *ltree;  // the tree where wait happens
+  TXNID waiter;     // the waiting transaction
+  void *m_extra;    // lock_request's m_extra
+
+  // The transactions that are waited for.
+  std::vector<TXNID> waitees;
+};
+
+typedef std::vector<lock_wait_info> lock_wait_infos;
+
+// A lock request contains the db, the key range, the lock type, and
+// the transaction id that describes a potential row range lock.
+//
+// the typical use case is:
+// - initialize a lock request
+// - start to try to acquire the lock
+// - do something else
+// - wait for the lock request to be resolved on a timed condition
+// - destroy the lock request
+// a lock request is resolved when its state is no longer pending: it
+// has become granted, timed out, or deadlocked. when resolved, the
+// state of the lock request is changed and any waiting threads are awakened.
+
+class lock_request {
+ public:
+  enum type { UNKNOWN, READ, WRITE };
+
+  // effect: Initializes a lock request.
+  void create(toku_external_mutex_factory_t mutex_factory);
+
+  // effect: Destroys a lock request.
+  void destroy(void);
+
+  // effect: Resets the lock request parameters, allowing it to be reused.
+  // requires: Lock request was already created at some point
+  void set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key,
+           type lock_type, bool big_txn, void *extra = nullptr);
+
+  // effect: Tries to acquire a lock described by this lock request.
+  // returns: The return code of locktree::acquire_[write,read]_lock()
+  //          or DB_LOCK_DEADLOCK if this request would end up deadlocked.
+  int start(void);
+
+  // effect: Sleeps until either the request is granted or the wait time
+  //         expires.
+  // returns: The return code of locktree::acquire_[write,read]_lock()
+  //          or simply DB_LOCK_NOTGRANTED if the wait time expired.
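+  //
+  // Editor's note: a minimal, hypothetical sketch of the "typical use case"
+  // from the class comment above. The caller-side names (mutex_factory, lt,
+  // my_txnid, left, right) are assumptions; only the member functions and
+  // return codes come from this header.
+  //
+  //   toku::lock_request req;
+  //   req.create(mutex_factory);
+  //   req.set(lt, my_txnid, &left, &right,
+  //           toku::lock_request::type::WRITE, false /* big_txn */);
+  //   int r = req.start();
+  //   if (r == DB_LOCK_NOTGRANTED)
+  //     r = req.wait(1000 /* wait_time_ms */);  // granted, or timed out
+  //   req.destroy();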
+ int wait(uint64_t wait_time_ms); + int wait(uint64_t wait_time_ms, uint64_t killed_time_ms, + int (*killed_callback)(void), + void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr, + void *callback_arg = nullptr); + + // return: left end-point of the lock range + const DBT *get_left_key(void) const; + + // return: right end-point of the lock range + const DBT *get_right_key(void) const; + + // return: the txnid waiting for a lock + TXNID get_txnid(void) const; + + // return: when this lock request started, as milliseconds from epoch + uint64_t get_start_time(void) const; + + // return: which txnid is blocking this request (there may be more, though) + TXNID get_conflicting_txnid(void) const; + + // effect: Retries all of the lock requests for the given locktree. + // Any lock requests successfully restarted is completed and woken + // up. + // The rest remain pending. + static void retry_all_lock_requests( + locktree *lt, + void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr, + void *callback_arg = nullptr, + void (*after_retry_test_callback)(void) = nullptr); + static void retry_all_lock_requests_info( + lt_lock_request_info *info, + void (*lock_wait_callback)(void *, lock_wait_infos *), + void *callback_arg); + + void set_start_test_callback(void (*f)(void)); + void set_start_before_pending_test_callback(void (*f)(void)); + void set_retry_test_callback(void (*f)(void)); + + void *get_extra(void) const; + + void kill_waiter(void); + static void kill_waiter(locktree *lt, void *extra); + + private: + enum state { + UNINITIALIZED, + INITIALIZED, + PENDING, + COMPLETE, + DESTROYED, + }; + + // The keys for a lock request are stored "unowned" in m_left_key + // and m_right_key. When the request is about to go to sleep, it + // copies these keys and stores them in m_left_key_copy etc and + // sets the temporary pointers to null. + TXNID m_txnid; + TXNID m_conflicting_txnid; + uint64_t m_start_time; + const DBT *m_left_key; + const DBT *m_right_key; + DBT m_left_key_copy; + DBT m_right_key_copy; + + // The lock request type and associated locktree + type m_type; + locktree *m_lt; + + // If the lock request is in the completed state, then its + // final return value is stored in m_complete_r + int m_complete_r; + state m_state; + + toku_external_cond_t m_wait_cond; + + bool m_big_txn; + + // the lock request info state stored in the + // locktree that this lock request is for. + struct lt_lock_request_info *m_info; + + void *m_extra; + + // effect: tries again to acquire the lock described by this lock request + // returns: 0 if retrying the request succeeded and is now complete + int retry(lock_wait_infos *collector); + + void complete(int complete_r); + + // effect: Finds another lock request by txnid. + // requires: The lock request info mutex is held + lock_request *find_lock_request(const TXNID &txnid); + + // effect: Insert this lock request into the locktree's set. + // requires: the locktree's mutex is held + void insert_into_lock_requests(void); + + // effect: Removes this lock request from the locktree's set. + // requires: The lock request info mutex is held + void remove_from_lock_requests(void); + + // effect: Asks this request's locktree which txnids are preventing + // us from getting the lock described by this request. 
+ // returns: conflicts is populated with the txnid's that this request + // is blocked on + void get_conflicts(txnid_set *conflicts); + + // effect: Builds a wait-for-graph for this lock request and the given + // conflict set + void build_wait_graph(wfg *wait_graph, const txnid_set &conflicts); + + // returns: True if this lock request is in deadlock with the given conflicts + // set + bool deadlock_exists(const txnid_set &conflicts); + + void copy_keys(void); + + static int find_by_txnid(lock_request *const &request, const TXNID &txnid); + + // Report list of conflicts to lock wait callback. + static void report_waits(lock_wait_infos *wait_conflicts, + void (*lock_wait_callback)(void *, + lock_wait_infos *), + void *callback_arg); + void add_conflicts_to_waits(txnid_set *conflicts, + lock_wait_infos *wait_conflicts); + + void (*m_start_test_callback)(void); + void (*m_start_before_pending_test_callback)(void); + void (*m_retry_test_callback)(void); + + public: + std::function m_deadlock_cb; + + friend class lock_request_unit_test; +}; +// PORT: lock_request is not a POD anymore due to use of toku_external_cond_t +// This is ok as the PODness is not really required: lock_request objects are +// not moved in memory or anything. +// ENSURE_POD(lock_request); + +} /* namespace toku */ diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h new file mode 100644 index 0000000000..f0f4b042d3 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h @@ -0,0 +1,580 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <atomic>
+
+#include "../db.h"
+#include "../ft/comparator.h"
+#include "../portability/toku_external_pthread.h"
+#include "../portability/toku_pthread.h"
+#include "../portability/toku_time.h"
+// PORT #include  // just for DICTIONARY_ID..
+// PORT: ft-status for LTM_STATUS:
+#include "../ft/ft-status.h"
+
+struct DICTIONARY_ID {
+  uint64_t dictid;
+};
+
+#include "../util/omt.h"
+#include "range_buffer.h"
+#include "txnid_set.h"
+#include "wfg.h"
+
+namespace toku {
+
+class locktree;
+class locktree_manager;
+class lock_request;
+class concurrent_tree;
+
+typedef int (*lt_create_cb)(locktree *lt, void *extra);
+typedef void (*lt_destroy_cb)(locktree *lt);
+typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt,
+                               const range_buffer &buffer, void *extra);
+
+typedef bool (*lt_escalation_barrier_check_func)(const DBT *a, const DBT *b,
+                                                 void *extra);
+
+struct lt_counters {
+  uint64_t wait_count, wait_time;
+  uint64_t long_wait_count, long_wait_time;
+  uint64_t timeout_count;
+
+  void add(const lt_counters &rhs) {
+    wait_count += rhs.wait_count;
+    wait_time += rhs.wait_time;
+    long_wait_count += rhs.long_wait_count;
+    long_wait_time += rhs.long_wait_time;
+    timeout_count += rhs.timeout_count;
+  }
+};
+
+// Lock request state for some locktree
+struct lt_lock_request_info {
+  omt<lock_request *> pending_lock_requests;
+  std::atomic_bool pending_is_empty;
+  toku_external_mutex_t mutex;
+  bool should_retry_lock_requests;
+  lt_counters counters;
+  std::atomic_ullong retry_want;
+  unsigned long long retry_done;
+  toku_mutex_t retry_mutex;
+  toku_cond_t retry_cv;
+  bool running_retry;
+
+  void init(toku_external_mutex_factory_t mutex_factory);
+  void destroy(void);
+};
+
+// The locktree manager manages a set of locktrees, one for each open
+// dictionary. Locktrees are retrieved from the manager. When they are
+// no longer needed, they are released by the user.
+class locktree_manager {
+ public:
+  // param: create_cb, called just after a locktree is first created.
+  //        destroy_cb, called just before a locktree is destroyed.
+  //        escalate_cb, called after a locktree is escalated (with extra
+  //        param)
+  void create(lt_create_cb create_cb, lt_destroy_cb destroy_cb,
+              lt_escalate_cb escalate_cb, void *extra,
+              toku_external_mutex_factory_t mutex_factory_arg);
+
+  void destroy(void);
+
+  size_t get_max_lock_memory(void);
+
+  int set_max_lock_memory(size_t max_lock_memory);
+
+  // effect: Get a locktree from the manager. If a locktree exists with the
+  //         given dict_id, it is referenced and then returned. If one did
+  //         not exist, it is created. It will use the comparator for
+  //         comparing keys. The on_create callback (passed to
+  //         locktree_manager::create()) will be called with the given extra
+  //         parameter.
+  locktree *get_lt(DICTIONARY_ID dict_id, const comparator &cmp,
+                   void *on_create_extra);
+
+  void reference_lt(locktree *lt);
+
+  // effect: Releases one reference on a locktree. If the reference count
+  //         transitions to zero, the on_destroy callback is called before
+  //         it gets destroyed.
+  void release_lt(locktree *lt);
+
+  void get_status(LTM_STATUS status);
+
+  // effect: calls the iterate function on each pending lock request
+  // note: holds the manager's mutex
+  typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id,
+                                               TXNID txnid, const DBT *left_key,
+                                               const DBT *right_key,
+                                               TXNID blocking_txnid,
+                                               uint64_t start_time,
+                                               void *extra);
+  int iterate_pending_lock_requests(lock_request_iterate_callback cb,
+                                    void *extra);
+
+  // effect: Determines if too many locks or too much memory is being used.
+  //         Runs escalation on the manager if so.
+  // param: big_txn, if the current transaction is 'big' (has spilled rollback
+  //        logs)
+  // returns: 0 if there are enough resources to create a new lock, or
+  //          TOKUDB_OUT_OF_LOCKS if there are not enough resources and lock
+  //          escalation failed to free up enough resources for a new lock.
+  int check_current_lock_constraints(bool big_txn);
+
+  bool over_big_threshold(void);
+
+  void note_mem_used(uint64_t mem_used);
+
+  void note_mem_released(uint64_t mem_freed);
+
+  bool out_of_locks(void) const;
+
+  // Escalate all locktrees
+  void escalate_all_locktrees(void);
+
+  // Escalate a set of locktrees
+  void escalate_locktrees(locktree **locktrees, int num_locktrees);
+
+  // effect: calls the private function run_escalation(), only ok to
+  //         do for tests.
+  // rationale: to get better stress test coverage, we want a way to
+  //            deterministically trigger lock escalation.
+  void run_escalation_for_test(void);
+  void run_escalation(void);
+
+  // Add time t to the escalator's wait time statistics
+  void add_escalator_wait_time(uint64_t t);
+
+  void kill_waiter(void *extra);
+
+ private:
+  static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024;
+
+  // tracks the current number of locks and lock memory
+  uint64_t m_max_lock_memory;
+  uint64_t m_current_lock_memory;
+
+  struct lt_counters m_lt_counters;
+
+  // the create and destroy callbacks for the locktrees
+  lt_create_cb m_lt_create_callback;
+  lt_destroy_cb m_lt_destroy_callback;
+  lt_escalate_cb m_lt_escalate_callback;
+  void *m_lt_escalate_callback_extra;
+
+  omt<locktree *> m_locktree_map;
+
+  toku_external_mutex_factory_t mutex_factory;
+
+  // the manager's mutex protects the locktree map
+  toku_mutex_t m_mutex;
+
+  void mutex_lock(void);
+
+  void mutex_unlock(void);
+
+  // Manage the set of open locktrees
+  locktree *locktree_map_find(const DICTIONARY_ID &dict_id);
+  void locktree_map_put(locktree *lt);
+  void locktree_map_remove(locktree *lt);
+
+  static int find_by_dict_id(locktree *const &lt, const DICTIONARY_ID &dict_id);
+
+  void escalator_init(void);
+  void escalator_destroy(void);
+
+  // statistics about lock escalation.
+  toku_mutex_t m_escalation_mutex;
+  uint64_t m_escalation_count;
+  tokutime_t m_escalation_time;
+  uint64_t m_escalation_latest_result;
+  uint64_t m_wait_escalation_count;
+  uint64_t m_wait_escalation_time;
+  uint64_t m_long_wait_escalation_count;
+  uint64_t m_long_wait_escalation_time;
+
+  // the escalator coordinates escalation on a set of locktrees for a bunch of
+  // threads
+  class locktree_escalator {
+   public:
+    void create(void);
+    void destroy(void);
+    void run(locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra),
+             void *extra);
+
+   private:
+    toku_mutex_t m_escalator_mutex;
+    toku_cond_t m_escalator_done;
+    bool m_escalator_running;
+  };
+
+  locktree_escalator m_escalator;
+
+  friend class manager_unit_test;
+};
+
+// A locktree represents the set of row locks owned by all transactions
+// over an open dictionary. Read and write ranges are represented as
+// a left and right key which are compared with the given comparator
+//
+// Locktrees are not created and destroyed by the user. Instead, they are
+// referenced and released using the locktree manager.
+//
+// A sample workflow looks like this:
+// - Create a manager.
+// - Get a locktree by dictionary id from the manager.
+// - Perform read/write lock acquisition on the locktree, add references to
+//   the locktree using the manager, release locks, release references, etc.
+// - ...
+// - Release the final reference to the locktree. It will be destroyed.
+// - Destroy the manager.
+class locktree {
+ public:
+  // effect: Creates a locktree
+  void create(locktree_manager *mgr, DICTIONARY_ID dict_id,
+              const comparator &cmp,
+              toku_external_mutex_factory_t mutex_factory);
+
+  void destroy(void);
+
+  // For thread-safe, external reference counting
+  void add_reference(void);
+
+  // requires: the reference count is > 0
+  // returns: the reference count, after decrementing it by one
+  uint32_t release_reference(void);
+
+  // returns: the current reference count
+  uint32_t get_reference_count(void);
+
+  // effect: Attempts to grant a read lock for the range of keys between
+  //         [left_key, right_key].
+  // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and
+  //          populate the given conflicts set with the txnids that hold
+  //          conflicting locks in the range. If the locktree cannot create
+  //          more locks, return TOKUDB_OUT_OF_LOCKS.
+  // note: Read locks cannot be shared between txnids, as one would expect.
+  //       This is for simplicity since read locks are rare in MySQL.
+  int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
+                        txnid_set *conflicts, bool big_txn);
+
+  // effect: Attempts to grant a write lock for the range of keys between
+  //         [left_key, right_key].
+  // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and
+  //          populate the given conflicts set with the txnids that hold
+  //          conflicting locks in the range. If the locktree cannot create
+  //          more locks, return TOKUDB_OUT_OF_LOCKS.
+  int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
+                         txnid_set *conflicts, bool big_txn);
+
+  // effect: populate the conflicts set with the txnids that would prevent
+  //         the given txnid from getting a lock on [left_key, right_key]
+  void get_conflicts(bool is_write_request, TXNID txnid, const DBT *left_key,
+                     const DBT *right_key, txnid_set *conflicts);
+
+  // effect: Release all of the lock ranges represented by the range buffer for
+  //         a txnid.
+ void release_locks(TXNID txnid, const range_buffer *ranges, + bool all_trx_locks_hint = false); + + // effect: Runs escalation on this locktree + void escalate(lt_escalate_cb after_escalate_callback, void *extra); + + // returns: The userdata associated with this locktree, or null if it has not + // been set. + void *get_userdata(void) const; + + void set_userdata(void *userdata); + + locktree_manager *get_manager(void) const; + + void set_comparator(const comparator &cmp); + + // Set the user-provided Lock Escalation Barrier check function and its + // argument + // + // Lock Escalation Barrier limits the scope of Lock Escalation. + // For two keys A and B (such that A < B), + // escalation_barrier_check_func(A, B)==true means that there's a lock + // escalation barrier between A and B, and lock escalation is not allowed to + // bridge the gap between A and B. + // + // This method sets the user-provided barrier check function and its + // parameter. + void set_escalation_barrier_func(lt_escalation_barrier_check_func func, + void *extra); + + int compare(const locktree *lt) const; + + DICTIONARY_ID get_dict_id() const; + + // Private info struct for storing pending lock request state. + // Only to be used by lock requests. We store it here as + // something less opaque than usual to strike a tradeoff between + // abstraction and code complexity. It is still fairly abstract + // since the lock_request object is opaque + struct lt_lock_request_info *get_lock_request_info(void); + + typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right, + TXNID txnid, bool is_shared, + TxnidVector *owners); + void dump_locks(void *cdata, dump_callback cb); + + private: + locktree_manager *m_mgr; + DICTIONARY_ID m_dict_id; + uint32_t m_reference_count; + + // Since the memory referenced by this comparator is not owned by the + // locktree, the user must guarantee it will outlive the locktree. + // + // The ydb API accomplishes this by opening an ft_handle in the on_create + // callback, which will keep the underlying FT (and its descriptor) in memory + // for as long as the handle is open. The ft_handle is stored opaquely in the + // userdata pointer below. see locktree_manager::get_lt w/ on_create_extra + comparator m_cmp; + + lt_escalation_barrier_check_func m_escalation_barrier; + void *m_escalation_barrier_arg; + + concurrent_tree *m_rangetree; + + void *m_userdata; + struct lt_lock_request_info m_lock_request_info; + + // psergey-todo: + // Each transaction also keeps a list of ranges it has locked. + // So, when a transaction is running in STO mode, two identical + // lists are kept: the STO lock list and transaction's owned locks + // list. Why can't we do with just one list? + + // The following fields and members prefixed with "sto_" are for + // the single txnid optimization, intended to speed up the case + // when only one transaction is using the locktree. If we know + // the locktree has only one transaction, then acquiring locks + // takes O(1) work and releasing all locks takes O(1) work. + // + // How do we know that the locktree only has a single txnid? + // What do we do if it does? + // + // When a txn with txnid T requests a lock: + // - If the tree is empty, the optimization is possible. Set the single + // txnid to T, and insert the lock range into the buffer. + // - If the tree is not empty, check if the single txnid is T. If so, + // append the lock range to the buffer. 
Otherwise, migrate all of
+  //     the locks in the buffer into the rangetree on behalf of txnid T,
+  //     and invalidate the single txnid.
+  //
+  // When a txn with txnid T releases its locks:
+  // - If the single txnid is valid, it must be for T. Destroy the buffer.
+  // - If it's not valid, release locks the normal way in the rangetree.
+  //
+  // To carry out the optimization we need to record a single txnid
+  // and a range buffer for each locktree, each protected by the root
+  // lock of the locktree's rangetree. The root lock for a rangetree
+  // is grabbed by preparing a locked keyrange on the rangetree.
+  TXNID m_sto_txnid;
+  range_buffer m_sto_buffer;
+
+  // The single txnid optimization speeds up the case when only one
+  // transaction is using the locktree. But it has the potential to
+  // hurt the case when more than one txnid exists.
+  //
+  // There are two things we need to do to make the optimization only
+  // optimize the case we care about, and not hurt the general case.
+  //
+  // Bound the worst-case latency for lock migration when the
+  // optimization stops working:
+  // - Idea: Stop the optimization and migrate immediately if we notice
+  //   the single txnid has taken many locks in the range buffer.
+  // - Implementation: Enforce a max size on the single txnid range buffer.
+  // - Analysis: Choosing the perfect max value, M, is difficult to do
+  //   without some feedback from the field. Intuition tells us that M should
+  //   not be so small that the optimization is worthless, and it should not
+  //   be so big that it's unreasonable to have to wait behind a thread doing
+  //   the work of converting M buffer locks into rangetree locks.
+  //
+  // Prevent concurrent-transaction workloads from trying the optimization
+  // in vain:
+  // - Idea: Don't even bother trying the optimization if we think the
+  //   system is in a concurrent-transaction state.
+  // - Implementation: Do something even simpler than detecting whether the
+  //   system is in a concurrent-transaction state. Just keep a "score" value
+  //   and some threshold. If at any time the locktree is eligible for the
+  //   optimization, only do it if the score is at this threshold. When you
+  //   actually do the optimization but someone has to migrate locks in the
+  //   buffer (expensive), then reset the score back to zero. Each time a txn
+  //   releases locks, the score is incremented by 1.
+  // - Analysis: If you let the threshold be "C", then at most 1 / C txns will
+  //   do the optimization in a concurrent-transaction system. Similarly, it
+  //   takes at most C txns to start using the single txnid optimization,
+  //   which is good when the system transitions from multithreaded to
+  //   single threaded.
+  //
+  // STO_BUFFER_MAX_SIZE:
+  //
+  // We choose the max value to be 1 million since most transactions are
+  // smaller than 1 million and we can create a rangetree of 1 million
+  // elements in less than a second. So we can be pretty confident that this
+  // threshold enables the optimization almost always, and prevents super
+  // pathological latency issues for the first lock taken by a second thread.
+  //
+  // STO_SCORE_THRESHOLD:
+  //
+  // A simple first guess at a good value for the score threshold is 100.
+  // By our analysis, we'd end up doing the optimization in vain for
+  // around 1% of all transactions, which seems reasonable. Further,
+  // if the system goes single threaded, it ought to be pretty quick
+  // for 100 transactions to go by, so we won't have to wait long before
+  // we start doing the single txnid optimization again.
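+  //
+  // Editor's note: a small worked sketch of the score mechanism described
+  // above. The shape of the code is illustrative only; just m_sto_score and
+  // STO_SCORE_THRESHOLD come from this header:
+  //
+  //   // a failed STO attempt (buffer locks had to be migrated) resets it:
+  //   m_sto_score = 0;
+  //   // every transaction that releases its locks bumps it:
+  //   m_sto_score++;
+  //   // the optimization is only attempted once the score has recovered:
+  //   if (m_sto_score >= STO_SCORE_THRESHOLD) { /* try the STO path */ }
+  //
+  // With C = STO_SCORE_THRESHOLD = 100, at most 1 in 100 transactions in a
+  // steadily concurrent workload pays the migration cost in vain, while a
+  // workload that goes single-threaded re-enables the optimization after
+  // roughly 100 lock releases.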
+  static const int STO_BUFFER_MAX_SIZE = 50 * 1024;
+  static const int STO_SCORE_THRESHOLD = 100;
+  int m_sto_score;
+
+  // statistics about time spent ending the STO early
+  uint64_t m_sto_end_early_count;
+  tokutime_t m_sto_end_early_time;
+
+  // effect: begins the single txnid optimization, setting m_sto_txnid
+  //         to the given txnid.
+  // requires: m_sto_txnid is invalid
+  void sto_begin(TXNID txnid);
+
+  // effect: append a range to the sto buffer
+  // requires: m_sto_txnid is valid
+  void sto_append(const DBT *left_key, const DBT *right_key,
+                  bool is_write_request);
+
+  // effect: ends the single txnid optimization, releasing any memory
+  //         stored in the sto buffer, notifying the tracker, and
+  //         invalidating m_sto_txnid.
+  // requires: m_sto_txnid is valid
+  void sto_end(void);
+
+  // params: prepared_lkr is a void * to a prepared locked keyrange. see below.
+  // effect: ends the single txnid optimization early, migrating buffer locks
+  //         into the rangetree, calling sto_end(), and then setting the
+  //         sto_score back to zero.
+  // requires: m_sto_txnid is valid
+  void sto_end_early(void *prepared_lkr);
+  void sto_end_early_no_accounting(void *prepared_lkr);
+
+  // params: prepared_lkr is a void * to a prepared locked keyrange. we can't
+  //         use the real type because the compiler won't allow us to forward
+  //         declare concurrent_tree::locked_keyrange without including
+  //         concurrent_tree.h, which we cannot do here because it is a
+  //         template implementation.
+  // requires: the prepared locked keyrange is for the locktree's rangetree
+  // requires: m_sto_txnid is valid
+  // effect: migrates each lock in the single txnid buffer into the locktree's
+  //         rangetree, notifying the memory tracker as necessary.
+  void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr);
+
+  // effect: If m_sto_txnid is valid, then release the txnid's locks
+  //         by ending the optimization.
+  // requires: If m_sto_txnid is valid, it is equal to the given txnid
+  // returns: True if locks were released for this txnid
+  bool sto_try_release(TXNID txnid);
+
+  // params: prepared_lkr is a void * to a prepared locked keyrange. see above.
+  // requires: the prepared locked keyrange is for the locktree's rangetree
+  // effect: If m_sto_txnid is valid and equal to the given txnid, then
+  //         append a range onto the buffer. Otherwise, if m_sto_txnid is
+  //         valid but not equal to this txnid, then migrate the buffer's
+  //         locks into the rangetree and end the optimization, setting the
+  //         score back to zero.
+  // returns: true if the lock was acquired for this txnid
+  bool sto_try_acquire(void *prepared_lkr, TXNID txnid, const DBT *left_key,
+                       const DBT *right_key, bool is_write_request);
+
+  // Effect:
+  //   Provides a hook for a helgrind suppression.
+  // Returns:
+  //   true if m_sto_txnid is not TXNID_NONE
+  bool sto_txnid_is_valid_unsafe(void) const;
+
+  // Effect:
+  //   Provides a hook for a helgrind suppression.
+ // Returns: + // m_sto_score + int sto_get_score_unsafe(void) const; + + void remove_overlapping_locks_for_txnid(TXNID txnid, const DBT *left_key, + const DBT *right_key); + + int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid, + const DBT *left_key, const DBT *right_key, + bool is_write_request, txnid_set *conflicts); + + int acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts); + + int try_acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts, + bool big_txn); + + friend class locktree_unit_test; + friend class manager_unit_test; + friend class lock_request_unit_test; + + // engine status reaches into the locktree to read some stats + friend void locktree_manager::get_status(LTM_STATUS status); +}; + +} /* namespace toku */ diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h new file mode 100644 index 0000000000..76e28d7477 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h @@ -0,0 +1,178 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
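+
+// ---------------------------------------------------------------------------
+// Editor's note: a hedged sketch of driving the locktree API declared in
+// locktree.h above. Every caller-side name here (mgr, dict_id, my_cmp,
+// txnid, left, right, ranges) is a hypothetical assumption; only the member
+// functions and return codes come from that header.
+//
+//   toku::locktree *lt = mgr.get_lt(dict_id, my_cmp, nullptr);
+//   toku::txnid_set conflicts;
+//   conflicts.create();
+//   int r = lt->acquire_write_lock(txnid, &left, &right, &conflicts,
+//                                  false /* big_txn */);
+//   // r == DB_LOCK_NOTGRANTED: `conflicts` now holds the blocking txnids.
+//   // ... later, release everything recorded in a range_buffer `ranges`:
+//   lt->release_locks(txnid, &ranges);
+//   conflicts.destroy();
+//   mgr.release_lt(lt);
+// ---------------------------------------------------------------------------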
+
+#pragma once
+
+#include <stdint.h>
+#include <string.h>
+
+#include "../util/dbt.h"
+#include "../util/memarena.h"
+
+namespace toku {
+
+// a key range buffer represents a set of key ranges that can
+// be stored, iterated over, and then destroyed all at once.
+class range_buffer {
+ private:
+  // the key range buffer is a bunch of records in a row.
+  // each record has the following header, followed by the
+  // left key and right key data payload, if applicable.
+  // we limit keys to be 2^16, since we store lengths as 2 bytes.
+  static const size_t MAX_KEY_SIZE = 1 << 16;
+
+  struct record_header {
+    bool left_neg_inf;
+    bool left_pos_inf;
+    bool right_pos_inf;
+    bool right_neg_inf;
+    uint16_t left_key_size;
+    uint16_t right_key_size;
+    bool is_exclusive_lock;
+
+    bool left_is_infinite(void) const;
+
+    bool right_is_infinite(void) const;
+
+    void init(const DBT *left_key, const DBT *right_key, bool is_exclusive);
+  };
+  // PORT static_assert(sizeof(record_header) == 8, "record header format is
+  // off");
+
+ public:
+  // the iterator abstracts reading over a buffer of variable length
+  // records one by one until there are no more left.
+  class iterator {
+   public:
+    iterator();
+    iterator(const range_buffer *buffer);
+
+    // a record represents the user-view of a serialized key range.
+    // it handles positive and negative infinity and the optimized
+    // point range case, where left and right points share memory.
+    class record {
+     public:
+      // get a read-only pointer to the left key of this record's range
+      const DBT *get_left_key(void) const;
+
+      // get a read-only pointer to the right key of this record's range
+      const DBT *get_right_key(void) const;
+
+      // how big is this record? this tells us where the next record is
+      size_t size(void) const;
+
+      bool get_exclusive_flag() const { return _header.is_exclusive_lock; }
+
+      // populate a record header and point our DBTs'
+      // buffers into ours if they are not infinite.
+      void deserialize(const char *buf);
+
+     private:
+      record_header _header;
+      DBT _left_key;
+      DBT _right_key;
+    };
+
+    // populate the given record object with the current record.
+    // the memory referred to by the record is valid for only
+    // as long as the record exists.
+    bool current(record *rec);
+
+    // move the iterator to the next record in the buffer
+    void next(void);
+
+   private:
+    void reset_current_chunk();
+
+    // the key range buffer we are iterating over, the current
+    // offset in that buffer, and the size of the current record.
+    memarena::chunk_iterator _ma_chunk_iterator;
+    const void *_current_chunk_base;
+    size_t _current_chunk_offset;
+    size_t _current_chunk_max;
+    size_t _current_rec_size;
+  };
+
+  // allocate buffer space lazily instead of on creation. this way,
+  // no malloc/free is done if the transaction ends up taking no locks.
+  void create(void);
+
+  // append a left/right key range to the buffer.
+  // if the keys are equal, then only one copy is stored.
+  void append(const DBT *left_key, const DBT *right_key,
+              bool is_write_request = false);
+
+  // is this range buffer empty?
+  bool is_empty(void) const;
+
+  // how much memory is being used by this range buffer?
+  uint64_t total_memory_size(void) const;
+
+  // how many ranges are stored in this range buffer?
+  int get_num_ranges(void) const;
+
+  void destroy(void);
+
+ private:
+  memarena _arena;
+  int _num_ranges;
+
+  void append_range(const DBT *left_key, const DBT *right_key,
+                    bool is_write_request);
+
+  // append a point to the buffer. this is the space/time saving
+  // optimization for key ranges where left == right.
+  void append_point(const DBT *key, bool is_write_request);
+};
+
+} /* namespace toku */
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h
new file mode 100644
index 0000000000..ec25a8c583
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h
@@ -0,0 +1,302 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+  PerconaFT is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2,
+  as published by the Free Software Foundation.
+
+  PerconaFT is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+  PerconaFT is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Affero General Public License, version 3,
+  as published by the Free Software Foundation.
+
+  PerconaFT is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU Affero General Public License for more details.
+
+  You should have received a copy of the GNU Affero General Public License
+  along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <string.h>
+
+#include "../ft/comparator.h"
+#include "../portability/memory.h"
+#include "../portability/toku_pthread.h"
+// PORT: we need LTM_STATUS
+#include "../ft/ft-status.h"
+#include "../portability/txn_subst.h"
+#include "keyrange.h"
+
+namespace toku {
+
+// a node in a tree with its own mutex
+// - range is the "key" of this node
+// - txnid is the single txnid associated with this node
+// - left and right children may be null
+//
+// to build a tree on top of this abstraction, the user:
+// - provides memory for a root node, initializes it via create_root()
+// - performs tree operations on the root node. memory management
+//   below the root node is handled by the abstraction, not the user.
+// this pattern:
+// - guarantees a root node always exists.
+// - does not allow for rebalances on the root node
+
+class treenode {
+ public:
+  // every treenode function has some common requirements:
+  // - node is locked and children are never locked
+  // - node may be unlocked if no other thread has visibility
+
+  // effect: create the root node
+  void create_root(const comparator *cmp);
+
+  // effect: destroys the root node
+  void destroy_root(void);
+
+  // effect: sets the txnid and copies the given range for this node
+  void set_range_and_txnid(const keyrange &range, TXNID txnid, bool is_shared);
+
+  // returns: true iff this node is marked as empty
+  bool is_empty(void);
+
+  // returns: true if this is the root node, denoted by a null parent
+  bool is_root(void);
+
+  // returns: true if the given range overlaps with this node's range
+  bool range_overlaps(const keyrange &range);
+
+  // effect: locks the node
+  void mutex_lock(void);
+
+  // effect: unlocks the node
+  void mutex_unlock(void);
+
+  // return: node whose child overlaps, or a child that is empty
+  //         and would contain range if it existed
+  // given: if cmp_hint is non-null, then it is a precomputed
+  //        comparison of this node's range to the given range.
+  treenode *find_node_with_overlapping_child(
+      const keyrange &range, const keyrange::comparison *cmp_hint);
+
+  // effect: performs an in-order traversal of the ranges that overlap the
+  //         given range, calling function->fn() on each node that does
+  // requires: function signature is: bool fn(const keyrange &range,
+  //           TXNID txnid)
+  // requires: fn returns true to keep iterating, false to stop iterating
+  // requires: fn does not attempt to use any ranges read out by value
+  //           after removing a node with an overlapping range from the tree.
+  template <class F>
+  void traverse_overlaps(const keyrange &range, F *function) {
+    keyrange::comparison c = range.compare(*m_cmp, m_range);
+    if (c == keyrange::comparison::EQUALS) {
+      // Doesn't matter if fn wants to keep going, there
+      // is nothing left, so return.
+      function->fn(m_range, m_txnid, m_is_shared, m_owners);
+      return;
+    }
+
+    treenode *left = m_left_child.get_locked();
+    if (left) {
+      if (c != keyrange::comparison::GREATER_THAN) {
+        // Target range is less than this node, or it overlaps this
+        // node. There may be something on the left.
+        left->traverse_overlaps(range, function);
+      }
+      left->mutex_unlock();
+    }
+
+    if (c == keyrange::comparison::OVERLAPS) {
+      bool keep_going = function->fn(m_range, m_txnid, m_is_shared, m_owners);
+      if (!keep_going) {
+        return;
+      }
+    }
+
+    treenode *right = m_right_child.get_locked();
+    if (right) {
+      if (c != keyrange::comparison::LESS_THAN) {
+        // Target range is greater than this node, or it overlaps this
+        // node. There may be something on the right.
+        right->traverse_overlaps(range, function);
+      }
+      right->mutex_unlock();
+    }
+  }
+
+  // effect: inserts the given range and txnid into a subtree, recursively
+  // requires: range does not overlap with any node below the subtree
+  bool insert(const keyrange &range, TXNID txnid, bool is_shared);
+
+  // effect: removes the given range from the subtree
+  // requires: range exists in the subtree
+  // returns: the root of the resulting subtree
+  treenode *remove(const keyrange &range, TXNID txnid);
+
+  // effect: removes this node and all of its children, recursively
+  // requires: every node at and below this node is unlocked
+  void recursive_remove(void);
+
+ private:
+  // the child_ptr is a light abstraction for the locking of
+  // a child and the maintenance of its depth estimate.
+ + struct child_ptr { + // set the child pointer + void set(treenode *node); + + // get and lock this child if it exists + treenode *get_locked(void); + + treenode *ptr; + uint32_t depth_est; + }; + + // the balance factor at which a node is considered imbalanced + static const int32_t IMBALANCE_THRESHOLD = 2; + + // node-level mutex + toku_mutex_t m_mutex; + + // the range and txnid for this node. the range contains a copy + // of the keys originally inserted into the tree. nodes may + // swap ranges. but at the end of the day, when a node is + // destroyed, it frees the memory associated with whatever range + // it has at the time of destruction. + keyrange m_range; + + void remove_shared_owner(TXNID txnid); + + bool has_multiple_owners() { return (m_txnid == TXNID_SHARED); } + + private: + // Owner transaction id. + // A value of TXNID_SHARED means this node has multiple owners + TXNID m_txnid; + + // If true, this lock is a non-exclusive lock, and it can have either + // one or several owners. + bool m_is_shared; + + // List of the owners, or nullptr if there's just one owner. + TxnidVector *m_owners; + + // two child pointers + child_ptr m_left_child; + child_ptr m_right_child; + + // comparator for ranges + // psergey-todo: Is there any sense to store the comparator in each tree + // node? + const comparator *m_cmp; + + // marked for the root node. the root node is never free()'d + // when removed, but instead marked as empty. + bool m_is_root; + + // marked for an empty node. only valid for the root. + bool m_is_empty; + + // effect: initializes an empty node with the given comparator + void init(const comparator *cmp); + + // requires: this is a shared node (m_is_shared==true) + // effect: another transaction is added as an owner. + // returns: true <=> added another owner + // false <=> this transaction is already an owner + bool add_shared_owner(TXNID txnid); + + // requires: *parent is initialized to something meaningful. + // requires: subtree is non-empty + // returns: the leftmost child of the given subtree + // returns: a pointer to the parent of said child in *parent, only + // if this function recurred, otherwise it is untouched. + treenode *find_leftmost_child(treenode **parent); + + // requires: *parent is initialized to something meaningful. + // requires: subtree is non-empty + // returns: the rightmost child of the given subtree + // returns: a pointer to the parent of said child in *parent, only + // if this function recurred, otherwise it is untouched. 
+ treenode *find_rightmost_child(treenode **parent); + + // effect: remove the root of this subtree, destroying the old root + // returns: the new root of the subtree + treenode *remove_root_of_subtree(void); + + // requires: subtree is non-empty, direction is not 0 + // returns: the child of the subtree at either the left or rightmost extreme + treenode *find_child_at_extreme(int direction, treenode **parent); + + // effect: retrieves and possibly rebalances the left child + // returns: a locked left child, if it exists + treenode *lock_and_rebalance_left(void); + + // effect: retrieves and possibly rebalances the right child + // returns: a locked right child, if it exists + treenode *lock_and_rebalance_right(void); + + // returns: the estimated depth of this subtree + uint32_t get_depth_estimate(void) const; + + // returns: true iff left subtree depth is sufficiently less than the right + bool left_imbalanced(int threshold) const; + + // returns: true iff right subtree depth is sufficiently greater than the left + bool right_imbalanced(int threshold) const; + + // effect: performs an O(1) rebalance, which will "heal" an imbalance by at + // most 1. effect: if the new root is not this node, then this node is + // unlocked. returns: locked node representing the new root of the rebalanced + // subtree + treenode *maybe_rebalance(void); + + // returns: allocated treenode populated with a copy of the range and txnid + static treenode *alloc(const comparator *cmp, const keyrange &range, + TXNID txnid, bool is_shared); + + // requires: node is a locked root node, or an unlocked non-root node + static void free(treenode *node); + + // effect: swaps the range/txnid pairs for node1 and node2. + static void swap_in_place(treenode *node1, treenode *node2); + + friend class concurrent_tree_unit_test; +}; + +} /* namespace toku */ diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h new file mode 100644 index 0000000000..d79c24fb0c --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h @@ -0,0 +1,92 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+  GNU Affero General Public License for more details.
+
+  You should have received a copy of the GNU Affero General Public License
+  along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../portability/txn_subst.h"
+#include "../util/omt.h"
+
+namespace toku {
+
+class txnid_set {
+ public:
+  // effect: Creates an empty set. Does not malloc space for
+  //         any entries yet. That is done lazily on add().
+  void create(void);
+
+  // effect: Destroy the set's internals.
+  void destroy(void);
+
+  // returns: True if the given txnid is a member of the set.
+  bool contains(TXNID id) const;
+
+  // effect: Adds a given txnid to the set if it did not exist
+  void add(TXNID txnid);
+
+  // effect: Deletes a txnid from the set if it exists.
+  void remove(TXNID txnid);
+
+  // returns: Size of the set
+  uint32_t size(void) const;
+
+  // returns: The "i'th" id in the set, as if it were sorted.
+  TXNID get(uint32_t i) const;
+
+ private:
+  toku::omt<TXNID> m_txnids;
+
+  friend class txnid_set_unit_test;
+};
+ENSURE_POD(txnid_set);
+
+} /* namespace toku */
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h
new file mode 100644
index 0000000000..8042021707
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h
@@ -0,0 +1,124 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+  PerconaFT is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2,
+  as published by the Free Software Foundation.
+
+  PerconaFT is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+  PerconaFT is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Affero General Public License, version 3,
+  as published by the Free Software Foundation.
+
+  PerconaFT is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU Affero General Public License for more details.
+
+  You should have received a copy of the GNU Affero General Public License
+  along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <functional>
+
+#include "../util/omt.h"
+#include "txnid_set.h"
+
+namespace toku {
+
+// A wfg is a 'wait-for' graph. A directed edge represents one
+// txn waiting for another to finish before it can acquire a lock.
+
+class wfg {
+ public:
+  // Create a lock request graph
+  void create(void);
+
+  // Destroy the internals of the lock request graph
+  void destroy(void);
+
+  // Add an edge (a_id, b_id) to the graph
+  void add_edge(TXNID a_txnid, TXNID b_txnid);
+
+  // Return true if a node with the given transaction id exists in the graph.
+  // Return false otherwise.
+  bool node_exists(TXNID txnid);
+
+  // Return true if there exists a cycle from a given transaction id in the
+  // graph. Return false otherwise.
+  bool cycle_exists_from_txnid(TXNID txnid,
+                               std::function<void(TXNID)> reporter);
+
+  // Apply a given function f to all of the nodes in the graph. The apply
+  // function returns when the function f is called for all of the nodes in
+  // the graph, or the function f returns non-zero.
+  void apply_nodes(int (*fn)(TXNID txnid, void *extra), void *extra);
+
+  // Apply a given function f to all of the edges whose origin is a given
+  // node id. The apply function returns when the function f is called for
+  // all edges in the graph rooted at node id, or the function f returns
+  // non-zero.
+  void apply_edges(TXNID txnid,
+                   int (*fn)(TXNID txnid, TXNID edge_txnid, void *extra),
+                   void *extra);
+
+ private:
+  struct node {
+    // txnid for this node and the associated set of edges
+    TXNID txnid;
+    txnid_set edges;
+    bool visited;
+
+    static node *alloc(TXNID txnid);
+
+    static void free(node *n);
+  };
+  ENSURE_POD(node);
+
+  toku::omt<node *> m_nodes;
+
+  node *find_node(TXNID txnid);
+
+  node *find_create_node(TXNID txnid);
+
+  bool cycle_exists_from_node(node *target, node *head,
+                              std::function<void(TXNID)> reporter);
+
+  static int find_by_txnid(node *const &node_a, const TXNID &txnid_b);
+};
+ENSURE_POD(wfg);
+
+} /* namespace toku */
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/memory.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/memory.h
new file mode 100644
index 0000000000..0a621f8e01
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/memory.h
@@ -0,0 +1,215 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <stdlib.h>
+
+#include "toku_portability.h"
+
+/* Percona memory allocation functions and macros.
+ * These are functions for malloc and free */
+
+int toku_memory_startup(void) __attribute__((constructor));
+void toku_memory_shutdown(void) __attribute__((destructor));
+
+/* Generally: errno is set to 0 or a value to indicate problems. */
+
+// Everything should call toku_malloc() instead of malloc(), and toku_calloc()
+// instead of calloc(). That way the tests can, e.g., replace the malloc
+// function using toku_set_func_malloc().
+void *toku_calloc(size_t nmemb, size_t size)
+    __attribute__((__visibility__("default")));
+void *toku_xcalloc(size_t nmemb, size_t size)
+    __attribute__((__visibility__("default")));
+void *toku_malloc(size_t size) __attribute__((__visibility__("default")));
+void *toku_malloc_aligned(size_t alignment, size_t size)
+    __attribute__((__visibility__("default")));
+
+// xmalloc aborts instead of returning NULL if we run out of memory
+void *toku_xmalloc(size_t size) __attribute__((__visibility__("default")));
+void *toku_xrealloc(void *, size_t size)
+    __attribute__((__visibility__("default")));
+void *toku_xmalloc_aligned(size_t alignment, size_t size)
+    __attribute__((__visibility__("default")));
+// Effect: Perform a os_malloc_aligned(size) with the additional property that
+// the returned pointer is a multiple of ALIGNMENT.
+// Fail with a resource_assert if the allocation fails (don't return an error
+// code). If the alloc_aligned function has been set then call it instead.
+// Requires: alignment is a power of two.
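A small sketch of the two failure models declared above (hypothetical caller; assumes the vendored header is on the include path): `toku_malloc()` reports failure by returning NULL, while `toku_xmalloc()` never returns NULL and aborts on exhaustion.

```cpp
#include <cstdio>
#include "memory.h"  // assumed include path

void alloc_demo(size_t n) {
  char *maybe = static_cast<char *>(toku_malloc(n));
  if (maybe == nullptr) {
    std::perror("toku_malloc");  // caller is responsible for handling NULL
    return;
  }
  toku_free(maybe);

  char *always = static_cast<char *>(toku_xmalloc(n));  // aborts on failure
  toku_free(always);
}
```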
+
+void toku_free(void *) __attribute__((__visibility__("default")));
+
+size_t toku_malloc_usable_size(void *p)
+    __attribute__((__visibility__("default")));
+
+/* MALLOC is a macro that helps avoid a common error:
+ * Suppose I write
+ *    struct foo *x = malloc(sizeof(struct foo));
+ * That works fine.  But if I change it to this, I've probably made a mistake:
+ *    struct foo *x = malloc(sizeof(struct bar));
+ * It can get worse, since one might have something like
+ *    struct foo *x = malloc(sizeof(struct foo *))
+ * which looks reasonable, but it allocates enough to hold a pointer instead of
+ * the amount needed for the struct. So instead, write struct foo *MALLOC(x);
+ * and you cannot go wrong.
+ */
+#define MALLOC(v) CAST_FROM_VOIDP(v, toku_malloc(sizeof(*v)))
+/* MALLOC_N is like calloc (except no zeroing of data): It makes an array. Write
+ *   int *MALLOC_N(5, x);
+ * to make an array of 5 integers.
+ */
+#define MALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_malloc((n) * sizeof(*v)))
+#define MALLOC_N_ALIGNED(align, n, v) \
+  CAST_FROM_VOIDP(v, toku_malloc_aligned((align), (n) * sizeof(*v)))
+
+// CALLOC_N is like calloc with auto-figuring out size of members
+#define CALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_calloc((n), sizeof(*v)))
+
+#define CALLOC(v) CALLOC_N(1, v)
+
+// XMALLOC macros are like MALLOC except they abort if the operation fails
+#define XMALLOC(v) CAST_FROM_VOIDP(v, toku_xmalloc(sizeof(*v)))
+#define XMALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xmalloc((n) * sizeof(*v)))
+#define XCALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xcalloc((n), (sizeof(*v))))
+#define XCALLOC(v) XCALLOC_N(1, v)
+#define XREALLOC(v, s) CAST_FROM_VOIDP(v, toku_xrealloc(v, s))
+#define XREALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xrealloc(v, (n) * sizeof(*v)))
+
+#define XMALLOC_N_ALIGNED(align, n, v) \
+  CAST_FROM_VOIDP(v, toku_xmalloc_aligned((align), (n) * sizeof(*v)))
+
+#define XMEMDUP(dst, src) CAST_FROM_VOIDP(dst, toku_xmemdup(src, sizeof(*src)))
+#define XMEMDUP_N(dst, src, len) CAST_FROM_VOIDP(dst, toku_xmemdup(src, len))
+
+// ZERO_ARRAY writes zeroes to a stack-allocated array
+#define ZERO_ARRAY(o)          \
+  do {                         \
+    memset((o), 0, sizeof(o)); \
+  } while (0)
+// ZERO_STRUCT writes zeroes to a stack-allocated struct
+#define ZERO_STRUCT(o)          \
+  do {                          \
+    memset(&(o), 0, sizeof(o)); \
+  } while (0)
+
+/* Copy memory.  Analogous to strdup() */
+void *toku_memdup(const void *v, size_t len);
+/* Toku-version of strdup.  Use this so that it calls toku_malloc() */
+char *toku_strdup(const char *s) __attribute__((__visibility__("default")));
+/* Toku-version of strndup.  Use this so that it calls toku_malloc() */
+char *toku_strndup(const char *s, size_t n)
+    __attribute__((__visibility__("default")));
+/* Copy memory.  Analogous to strdup()  Crashes instead of returning NULL */
+void *toku_xmemdup(const void *v, size_t len)
+    __attribute__((__visibility__("default")));
+/* Toku-version of strdup.  Use this so that it calls toku_xmalloc()  Crashes
+ * instead of returning NULL */
+char *toku_xstrdup(const char *s) __attribute__((__visibility__("default")));
+
+void toku_malloc_cleanup(
+    void); /* Before exiting, call this function to free up any internal data
+              structures from toku_malloc.  Otherwise valgrind will complain of
+              memory leaks. */
+
+/* Check to see if everything malloc'd was freed.  Might be a no-op depending on
+ * how memory.c is configured. */
+void toku_memory_check_all_free(void);
+/* Check to see if memory is "sane".  Might be a no-op.  Probably better to
+ * simply use valgrind. */
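A sketch of the `sizeof(*v)` idiom the macros above implement (hypothetical caller): the destination pointer itself supplies the element size, so the size expression can never name the wrong struct.

```cpp
#include "memory.h"  // assumed include path

struct row { int key; int value; };

void macro_demo() {
  row *one;
  MALLOC(one);            // expands to toku_malloc(sizeof(*one))
  row *table;
  MALLOC_N(16, table);    // array of 16 rows, uninitialized
  row *zeroed;
  CALLOC_N(16, zeroed);   // array of 16 rows, zero-filled
  XREALLOC_N(32, table);  // grow to 32 rows, aborting on failure
  toku_free(one);
  toku_free(table);
  toku_free(zeroed);
}
```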
+void toku_do_memory_check(void);
+
+typedef void *(*malloc_fun_t)(size_t);
+typedef void (*free_fun_t)(void *);
+typedef void *(*realloc_fun_t)(void *, size_t);
+typedef void *(*malloc_aligned_fun_t)(size_t /*alignment*/, size_t /*size*/);
+typedef void *(*realloc_aligned_fun_t)(size_t /*alignment*/, void * /*pointer*/,
+                                       size_t /*size*/);
+
+void toku_set_func_malloc(malloc_fun_t f);
+void toku_set_func_xmalloc_only(malloc_fun_t f);
+void toku_set_func_malloc_only(malloc_fun_t f);
+void toku_set_func_realloc(realloc_fun_t f);
+void toku_set_func_xrealloc_only(realloc_fun_t f);
+void toku_set_func_realloc_only(realloc_fun_t f);
+void toku_set_func_free(free_fun_t f);
+
+typedef struct memory_status {
+  uint64_t malloc_count;   // number of malloc operations
+  uint64_t free_count;     // number of free operations
+  uint64_t realloc_count;  // number of realloc operations
+  uint64_t malloc_fail;    // number of malloc operations that failed
+  uint64_t realloc_fail;   // number of realloc operations that failed
+  uint64_t requested;      // number of bytes requested
+  uint64_t used;   // number of bytes used (requested + overhead), obtained from
+                   // malloc_usable_size()
+  uint64_t freed;  // number of bytes freed
+  uint64_t max_requested_size;  // largest attempted allocation size
+  uint64_t last_failed_size;    // size of the last failed allocation attempt
+  volatile uint64_t
+      max_in_use;  // maximum memory footprint (used - freed), approximate (not
+                   // worth threadsafety overhead for exact)
+  const char *mallocator_version;
+  uint64_t mmap_threshold;
+} LOCAL_MEMORY_STATUS_S, *LOCAL_MEMORY_STATUS;
+
+void toku_memory_get_status(LOCAL_MEMORY_STATUS s);
+
+// Effect: Like toku_memory_footprint, except instead of passing p,
+// we pass toku_malloc_usable_size(p).
+size_t toku_memory_footprint_given_usable_size(size_t touched, size_t usable);
+
+// Effect: Return an estimate of how much space an object is using, possibly by
+// using toku_malloc_usable_size(p).
+// If p is NULL then returns 0.
+size_t toku_memory_footprint(void *p, size_t touched);
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h
new file mode 100644
index 0000000000..af47800fb7
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h
@@ -0,0 +1,39 @@
+//
+// A replacement for toku_assert.h
+//
+#pragma once
+
+#include <assert.h>
+#include <errno.h>
+
+#ifdef NDEBUG
+
+#define assert_zero(a) ((void)(a))
+#define invariant(a) ((void)(a))
+#define invariant_notnull(a) ((void)(a))
+#define invariant_zero(a) ((void)(a))
+
+#else
+
+#define assert_zero(a) assert((a) == 0)
+#define invariant(a) assert(a)
+#define invariant_notnull(a) assert(a)
+#define invariant_zero(a) assert_zero(a)
+
+#endif
+
+#define lazy_assert_zero(a) assert_zero(a)
+
+#define paranoid_invariant_zero(a) assert_zero(a)
+#define paranoid_invariant_notnull(a) assert(a)
+#define paranoid_invariant(a) assert(a)
+
+#define ENSURE_POD(type)                                                    \
+  static_assert(                                                            \
+      std::is_standard_layout<type>::value && std::is_trivial<type>::value, \
+      #type " isn't POD")
+
+inline int get_error_errno(void) {
+  invariant(errno);
+  return errno;
+}
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h
new file mode 100644
index 0000000000..aaa2298faf
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h
@@ -0,0 +1,130 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
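A sketch of how the assertion substitutes above are meant to be used (hypothetical struct and function names): `ENSURE_POD` is a compile-time guard, and the `invariant*` macros compile to no-ops under `NDEBUG`, so their arguments must be side-effect free.

```cpp
#include <cstdint>
#include "toku_assert_subst.h"  // assumed include path

// The lock tree stores such structs in raw buffers, so they must stay
// trivially copyable; this fails to compile if a vtable or ctor sneaks in.
struct lock_range_demo {
  uint64_t left;
  uint64_t right;
};
ENSURE_POD(lock_range_demo);

int checked_status_demo(int status) {
  // Compiles away under NDEBUG: never put side effects in the argument.
  invariant_zero(status);
  return status;
}
```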
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// PORT2: #include
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "toku_assert_subst.h"
+
+__attribute__((const, always_inline)) static inline intptr_t which_cache_line(
+    intptr_t addr) {
+  static const size_t assumed_cache_line_size = 64;
+  return addr / assumed_cache_line_size;
+}
+template <typename T>
+__attribute__((const, always_inline)) static inline bool crosses_boundary(
+    T *addr, size_t width) {
+  const intptr_t int_addr = reinterpret_cast<intptr_t>(addr);
+  const intptr_t last_byte = int_addr + width - 1;
+  return which_cache_line(int_addr) != which_cache_line(last_byte);
+}
+
+template <typename T, typename U>
+__attribute__((always_inline)) static inline T toku_sync_fetch_and_add(T *addr,
+                                                                        U diff) {
+  paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+  return __sync_fetch_and_add(addr, diff);
+}
+template <typename T, typename U>
+__attribute__((always_inline)) static inline T toku_sync_add_and_fetch(T *addr,
+                                                                        U diff) {
+  paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+  return __sync_add_and_fetch(addr, diff);
+}
+template <typename T, typename U>
+__attribute__((always_inline)) static inline T toku_sync_fetch_and_sub(T *addr,
+                                                                        U diff) {
+  paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+  return __sync_fetch_and_sub(addr, diff);
+}
+template <typename T, typename U>
+__attribute__((always_inline)) static inline T toku_sync_sub_and_fetch(T *addr,
+                                                                        U diff) {
+  paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+  return __sync_sub_and_fetch(addr, diff);
+}
+template <typename T, typename U, typename V>
+__attribute__((always_inline)) static inline T toku_sync_val_compare_and_swap(
+    T *addr, U oldval, V newval) {
+  paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+  return __sync_val_compare_and_swap(addr, oldval, newval);
+}
+template <typename T, typename U, typename V>
+__attribute__((always_inline)) static inline bool
+toku_sync_bool_compare_and_swap(T *addr, U oldval, V newval) {
+  paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+  return __sync_bool_compare_and_swap(addr, oldval, newval);
+}
+
+// in case you include this but not toku_portability.h
+#pragma GCC poison __sync_fetch_and_add
+#pragma GCC poison __sync_fetch_and_sub
+#pragma GCC poison __sync_fetch_and_or
+#pragma GCC poison __sync_fetch_and_and
+#pragma GCC poison __sync_fetch_and_xor
+#pragma GCC poison __sync_fetch_and_nand
+#pragma GCC poison __sync_add_and_fetch
+#pragma GCC poison __sync_sub_and_fetch
+#pragma GCC poison __sync_or_and_fetch
+#pragma GCC poison __sync_and_and_fetch
+#pragma GCC poison __sync_xor_and_fetch
+#pragma GCC poison __sync_nand_and_fetch
+#pragma GCC poison __sync_bool_compare_and_swap
+#pragma GCC poison __sync_val_compare_and_swap
+#pragma GCC poison __sync_synchronize
+#pragma GCC poison __sync_lock_test_and_set
+#pragma GCC poison __sync_release
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h
new file mode 100644
index 0000000000..eb8291c1d2
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h
@@ -0,0 +1,83 @@
+/*
+  A wrapper around ROCKSDB_NAMESPACE::TransactionDBMutexFactory-provided
+  condition and mutex that provides toku_pthread_*-like interface. The functions
+  are named
+
+   toku_external_{mutex|cond}_XXX
+
+  Lock Tree uses this mutex and condition for interruptible (long) lock waits.
+
+  (It also still uses toku_pthread_XXX calls for mutexes/conditions for
+  shorter waits on internal objects)
+*/
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/transaction_db_mutex.h"
+#include "toku_portability.h"
+
+using ROCKSDB_NAMESPACE::TransactionDBCondVar;
+using ROCKSDB_NAMESPACE::TransactionDBMutex;
+
+typedef std::shared_ptr<ROCKSDB_NAMESPACE::TransactionDBMutexFactory>
+    toku_external_mutex_factory_t;
+
+typedef std::shared_ptr<TransactionDBMutex> toku_external_mutex_t;
+typedef std::shared_ptr<TransactionDBCondVar> toku_external_cond_t;
+
+static inline void toku_external_cond_init(
+    toku_external_mutex_factory_t mutex_factory, toku_external_cond_t *cond) {
+  *cond = mutex_factory->AllocateCondVar();
+}
+
+inline void toku_external_cond_destroy(toku_external_cond_t *cond) {
+  cond->reset();  // this will destroy the managed object
+}
+
+inline void toku_external_cond_signal(toku_external_cond_t *cond) {
+  (*cond)->Notify();
+}
+
+inline void toku_external_cond_broadcast(toku_external_cond_t *cond) {
+  (*cond)->NotifyAll();
+}
+
+inline int toku_external_cond_timedwait(toku_external_cond_t *cond,
+                                        toku_external_mutex_t *mutex,
+                                        int64_t timeout_microsec) {
+  auto res = (*cond)->WaitFor(*mutex, timeout_microsec);
+  if (res.ok())
+    return 0;
+  else
+    return ETIMEDOUT;
+}
+
+inline void toku_external_mutex_init(toku_external_mutex_factory_t factory,
+                                     toku_external_mutex_t *mutex) {
+  // Use placement new: the memory has been allocated but constructor wasn't
+  // called
+  new (mutex) toku_external_mutex_t;
+  *mutex = factory->AllocateMutex();
+}
+
+inline void toku_external_mutex_lock(toku_external_mutex_t *mutex) {
+  (*mutex)->Lock();
+}
+
+inline int toku_external_mutex_trylock(toku_external_mutex_t *mutex) {
+  (*mutex)->Lock();
+  return 0;
+}
+
+inline void toku_external_mutex_unlock(toku_external_mutex_t *mutex) {
+  (*mutex)->UnLock();
+}
+
+inline void toku_external_mutex_destroy(toku_external_mutex_t *mutex) {
+  mutex->reset();  // this will destroy the managed object
+}
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h
new file mode 100644
index 0000000000..c967e71771
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h
@@ -0,0 +1,286 @@
+/*======
+This file is part of PerconaFT.
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
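A sketch of a bounded wait on this wrapper (hypothetical caller), mirroring how the lock tree waits for a contended lock. The factory would typically come from the transaction DB configuration (e.g. `TransactionDBOptions::custom_mutex_factory`; that origin is an assumption here, not shown in this header):

```cpp
#include "toku_external_pthread.h"  // assumed include path

int bounded_wait_demo(toku_external_mutex_factory_t factory) {
  toku_external_mutex_t mtx;
  toku_external_cond_t cv;
  toku_external_mutex_init(factory, &mtx);
  toku_external_cond_init(factory, &cv);

  toku_external_mutex_lock(&mtx);
  // Wait up to 100ms; returns 0 if signalled, ETIMEDOUT otherwise.
  int r = toku_external_cond_timedwait(&cv, &mtx, 100 * 1000);
  toku_external_mutex_unlock(&mtx);

  toku_external_cond_destroy(&cv);
  toku_external_mutex_destroy(&mtx);
  return r;
}
```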
+ + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#pragma once + +#include // FILE + +// Performance instrumentation object identifier type +typedef unsigned int pfs_key_t; + +enum class toku_instr_object_type { mutex, rwlock, cond, thread, file }; + +struct PSI_file; + +struct TOKU_FILE { + /** The real file. */ + FILE *file; + struct PSI_file *key; + TOKU_FILE() : file(nullptr), key(nullptr) {} +}; + +struct PSI_mutex; +struct PSI_cond; +struct PSI_rwlock; + +struct toku_mutex_t; +struct toku_cond_t; +struct toku_pthread_rwlock_t; + +class toku_instr_key; + +class toku_instr_probe_empty { + public: + explicit toku_instr_probe_empty(UU(const toku_instr_key &key)) {} + + void start_with_source_location(UU(const char *src_file), UU(int src_line)) {} + + void stop() {} +}; + +#define TOKU_PROBE_START(p) p->start_with_source_location(__FILE__, __LINE__) +#define TOKU_PROBE_STOP(p) p->stop + +extern toku_instr_key toku_uninstrumented; + +#ifndef MYSQL_TOKUDB_ENGINE + +#include + +class toku_instr_key { + public: + toku_instr_key(UU(toku_instr_object_type type), UU(const char *group), + UU(const char *name)) {} + + explicit toku_instr_key(UU(pfs_key_t key_id)) {} + // No-instrumentation constructor: + toku_instr_key() {} + ~toku_instr_key() {} +}; + +typedef toku_instr_probe_empty toku_instr_probe; + +enum class toku_instr_file_op { + file_stream_open, + file_create, + file_open, + file_delete, + file_rename, + file_read, + file_write, + file_sync, + file_stream_close, + file_close, + file_stat +}; + +struct PSI_file {}; +struct PSI_mutex {}; + +struct toku_io_instrumentation {}; + +inline int toku_pthread_create(UU(const toku_instr_key &key), pthread_t *thread, + const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) { + return pthread_create(thread, attr, start_routine, arg); +} + +inline void toku_instr_register_current_thread() {} + +inline void toku_instr_delete_current_thread() {} + +// Instrument file creation, opening, closing, and renaming +inline void toku_instr_file_open_begin(UU(toku_io_instrumentation &io_instr), + UU(const toku_instr_key &key), + UU(toku_instr_file_op op), + UU(const char *name), + UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_file_stream_open_end( + UU(toku_io_instrumentation 
&io_instr), UU(TOKU_FILE &file)) {} + +inline void toku_instr_file_open_end(UU(toku_io_instrumentation &io_instr), + UU(int fd)) {} + +inline void toku_instr_file_name_close_begin( + UU(toku_io_instrumentation &io_instr), UU(const toku_instr_key &key), + UU(toku_instr_file_op op), UU(const char *name), UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_file_stream_close_begin( + UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op), + UU(TOKU_FILE &file), UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_file_fd_close_begin( + UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op), + UU(int fd), UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_file_close_end(UU(toku_io_instrumentation &io_instr), + UU(int result)) {} + +inline void toku_instr_file_io_begin(UU(toku_io_instrumentation &io_instr), + UU(toku_instr_file_op op), UU(int fd), + UU(unsigned int count), + UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_file_name_io_begin( + UU(toku_io_instrumentation &io_instr), UU(const toku_instr_key &key), + UU(toku_instr_file_op op), UU(const char *name), UU(unsigned int count), + UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_file_stream_io_begin( + UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op), + UU(TOKU_FILE &file), UU(unsigned int count), UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_file_io_end(UU(toku_io_instrumentation &io_instr), + UU(unsigned int count)) {} + +struct toku_mutex_t; + +struct toku_mutex_instrumentation {}; + +inline PSI_mutex *toku_instr_mutex_init(UU(const toku_instr_key &key), + UU(toku_mutex_t &mutex)) { + return nullptr; +} + +inline void toku_instr_mutex_destroy(UU(PSI_mutex *&mutex_instr)) {} + +inline void toku_instr_mutex_lock_start( + UU(toku_mutex_instrumentation &mutex_instr), UU(toku_mutex_t &mutex), + UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_mutex_trylock_start( + UU(toku_mutex_instrumentation &mutex_instr), UU(toku_mutex_t &mutex), + UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_mutex_lock_end( + UU(toku_mutex_instrumentation &mutex_instr), + UU(int pthread_mutex_lock_result)) {} + +inline void toku_instr_mutex_unlock(UU(PSI_mutex *mutex_instr)) {} + +struct toku_cond_instrumentation {}; + +enum class toku_instr_cond_op { + cond_wait, + cond_timedwait, +}; + +inline PSI_cond *toku_instr_cond_init(UU(const toku_instr_key &key), + UU(toku_cond_t &cond)) { + return nullptr; +} + +inline void toku_instr_cond_destroy(UU(PSI_cond *&cond_instr)) {} + +inline void toku_instr_cond_wait_start( + UU(toku_cond_instrumentation &cond_instr), UU(toku_instr_cond_op op), + UU(toku_cond_t &cond), UU(toku_mutex_t &mutex), UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_cond_wait_end(UU(toku_cond_instrumentation &cond_instr), + UU(int pthread_cond_wait_result)) {} + +inline void toku_instr_cond_signal(UU(toku_cond_t &cond)) {} + +inline void toku_instr_cond_broadcast(UU(toku_cond_t &cond)) {} + +#if 0 +// rw locks are not used +// rwlock instrumentation +struct toku_rwlock_instrumentation {}; + +inline PSI_rwlock *toku_instr_rwlock_init(UU(const toku_instr_key &key), + UU(toku_pthread_rwlock_t &rwlock)) { + return nullptr; +} + +inline void toku_instr_rwlock_destroy(UU(PSI_rwlock *&rwlock_instr)) {} + +inline void toku_instr_rwlock_rdlock_wait_start( + UU(toku_rwlock_instrumentation &rwlock_instr), + 
UU(toku_pthread_rwlock_t &rwlock), + UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_rwlock_wrlock_wait_start( + UU(toku_rwlock_instrumentation &rwlock_instr), + UU(toku_pthread_rwlock_t &rwlock), + UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_rwlock_rdlock_wait_end( + UU(toku_rwlock_instrumentation &rwlock_instr), + UU(int pthread_rwlock_wait_result)) {} + +inline void toku_instr_rwlock_wrlock_wait_end( + UU(toku_rwlock_instrumentation &rwlock_instr), + UU(int pthread_rwlock_wait_result)) {} + +inline void toku_instr_rwlock_unlock(UU(toku_pthread_rwlock_t &rwlock)) {} +#endif + +#else // MYSQL_TOKUDB_ENGINE +// There can be not only mysql but also mongodb or any other PFS stuff +#include +#endif // MYSQL_TOKUDB_ENGINE + +// Mutexes +extern toku_instr_key manager_escalation_mutex_key; +extern toku_instr_key manager_escalator_mutex_key; +extern toku_instr_key manager_mutex_key; +extern toku_instr_key treenode_mutex_key; +extern toku_instr_key locktree_request_info_mutex_key; +extern toku_instr_key locktree_request_info_retry_mutex_key; + +// condition vars +extern toku_instr_key lock_request_m_wait_cond_key; +extern toku_instr_key locktree_request_info_retry_cv_key; +extern toku_instr_key manager_m_escalator_done_key; // unused diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h new file mode 100644 index 0000000000..9a95b38bd5 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h @@ -0,0 +1,87 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
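Outside of a MySQL/PFS build, every `toku_instr_*` call above compiles to a no-op, so instrumented thread creation degrades to plain pthreads. A sketch under that assumption (the worker function is hypothetical):

```cpp
#include <pthread.h>
#include "toku_instrumentation.h"  // assumed include path

static void *worker(void *arg) {
  toku_instr_register_current_thread();  // no-op in this build
  // ... do work ...
  toku_instr_delete_current_thread();    // no-op in this build
  return arg;
}

int spawn_demo() {
  pthread_t tid;
  // toku_uninstrumented is the "don't profile this" key.
  int r = toku_pthread_create(toku_uninstrumented, &tid, nullptr, worker,
                              nullptr);
  if (r == 0) pthread_join(tid, nullptr);
  return r;
}
```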
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#if defined(__clang__) +#define constexpr_static_assert(a, b) +#else +#define constexpr_static_assert(a, b) static_assert(a, b) +#endif + +// include here, before they get deprecated +#include +#include +#include +#include +#include +#include +#include + +#include "toku_atomic.h" + +#if defined(__cplusplus) +#include +#endif + +#if defined(__cplusplus) +// decltype() here gives a reference-to-pointer instead of just a pointer, +// just use __typeof__ +#define CAST_FROM_VOIDP(name, value) name = static_cast<__typeof__(name)>(value) +#else +#define CAST_FROM_VOIDP(name, value) name = cast_to_typeof(name)(value) +#endif + +#define UU(x) x __attribute__((__unused__)) + +#include "toku_instrumentation.h" diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h new file mode 100644 index 0000000000..571b950e11 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h @@ -0,0 +1,520 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
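A sketch of the two helpers defined in the portability header above (hypothetical callback): `CAST_FROM_VOIDP` recovers a typed pointer from a `void *` using the destination's own type, and `UU()` silences unused-parameter warnings on stub arguments.

```cpp
#include <cstdint>
#include "toku_portability.h"  // assumed include path

struct counter { uint64_t hits; };

// A typical iterator callback: context arrives as void* and is recovered
// with CAST_FROM_VOIDP; the unused item parameter is marked UU().
static int count_cb(UU(uint64_t item), void *extra) {
  counter *c;
  CAST_FROM_VOIDP(c, extra);  // c = static_cast<__typeof__(c)>(extra)
  c->hits++;
  return 0;  // zero means "keep iterating" in the apply_* conventions above
}
```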
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include +#include +#include + +#include "toku_portability.h" +// PORT2: #include "toku_assert.h" + +// TODO: some things moved toku_instrumentation.h, not necessarily the best +// place +typedef pthread_attr_t toku_pthread_attr_t; +typedef pthread_t toku_pthread_t; +typedef pthread_mutex_t toku_pthread_mutex_t; +typedef pthread_condattr_t toku_pthread_condattr_t; +typedef pthread_cond_t toku_pthread_cond_t; +typedef pthread_rwlockattr_t toku_pthread_rwlockattr_t; +typedef pthread_key_t toku_pthread_key_t; +typedef struct timespec toku_timespec_t; + +// TODO: break this include loop +#include +typedef pthread_mutexattr_t toku_pthread_mutexattr_t; + +struct toku_mutex_t { + pthread_mutex_t pmutex; + struct PSI_mutex *psi_mutex; /* The performance schema instrumentation hook */ +#if defined(TOKU_PTHREAD_DEBUG) + pthread_t owner; // = pthread_self(); // for debugging + bool locked; + bool valid; + pfs_key_t instr_key_id; +#endif // defined(TOKU_PTHREAD_DEBUG) +}; + +struct toku_cond_t { + pthread_cond_t pcond; + struct PSI_cond *psi_cond; +#if defined(TOKU_PTHREAD_DEBUG) + pfs_key_t instr_key_id; +#endif // defined(TOKU_PTHREAD_DEBUG) +}; + +#if defined(TOKU_PTHREAD_DEBUG) +#define TOKU_COND_INITIALIZER \ + { .pcond = PTHREAD_COND_INITIALIZER, .psi_cond = nullptr, .instr_key_id = 0 } +#else +#define TOKU_COND_INITIALIZER \ + { .pcond = PTHREAD_COND_INITIALIZER, .psi_cond = nullptr } +#endif // defined(TOKU_PTHREAD_DEBUG) + +struct toku_pthread_rwlock_t { + pthread_rwlock_t rwlock; + struct PSI_rwlock *psi_rwlock; +#if defined(TOKU_PTHREAD_DEBUG) + pfs_key_t instr_key_id; +#endif // defined(TOKU_PTHREAD_DEBUG) +}; + +typedef struct toku_mutex_aligned { + toku_mutex_t aligned_mutex __attribute__((__aligned__(64))); +} toku_mutex_aligned_t; + +// Initializing with {} will fill in a struct with all zeros. 
+// But you may also need a pragma to suppress the warnings, as follows
+//
+// #pragma GCC diagnostic push
+// #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+// toku_mutex_t foo = ZERO_MUTEX_INITIALIZER;
+// #pragma GCC diagnostic pop
+//
+// In general it will be a lot of busy work to make this codebase compile
+// cleanly with -Wmissing-field-initializers
+
+#define ZERO_MUTEX_INITIALIZER \
+  {}
+
+#if defined(TOKU_PTHREAD_DEBUG)
+#define TOKU_MUTEX_INITIALIZER                                             \
+  {                                                                        \
+    .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr, .owner = 0, \
+    .locked = false, .valid = true, .instr_key_id = 0                      \
+  }
+#else
+#define TOKU_MUTEX_INITIALIZER \
+  { .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr }
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+
+// Darwin doesn't provide adaptive mutexes
+#if defined(__APPLE__)
+#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_DEFAULT
+#if defined(TOKU_PTHREAD_DEBUG)
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER                                    \
+  {                                                                        \
+    .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr, .owner = 0, \
+    .locked = false, .valid = true, .instr_key_id = 0                      \
+  }
+#else
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \
+  { .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr }
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+#else  // __FreeBSD__, __linux__, at least
+#if defined(__GLIBC__)
+#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_ADAPTIVE_NP
+#else
+// not all libc (e.g. musl) implement NP (Non-POSIX) attributes
+#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_DEFAULT
+#endif
+#if defined(TOKU_PTHREAD_DEBUG)
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER                                    \
+  {                                                                        \
+    .pmutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, .psi_mutex = nullptr, \
+    .owner = 0, .locked = false, .valid = true, .instr_key_id = 0          \
+  }
+#else
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \
+  { .pmutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, .psi_mutex = nullptr }
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+#endif  // defined(__APPLE__)
+
+// Different OSes implement mutexes as different amounts of nested structs.
+// C++ will fill out all missing values with zeroes if you provide at least one
+// zero, but it needs the right amount of nesting.
+#if defined(__FreeBSD__)
+#define ZERO_COND_INITIALIZER \
+  { 0 }
+#elif defined(__APPLE__)
+#define ZERO_COND_INITIALIZER \
+  {                           \
+    { 0 }                     \
+  }
+#else  // __linux__, at least
+#define ZERO_COND_INITIALIZER \
+  {}
+#endif
+
+static inline void toku_mutexattr_init(toku_pthread_mutexattr_t *attr) {
+  int r = pthread_mutexattr_init(attr);
+  assert_zero(r);
+}
+
+static inline void toku_mutexattr_settype(toku_pthread_mutexattr_t *attr,
+                                          int type) {
+  int r = pthread_mutexattr_settype(attr, type);
+  assert_zero(r);
+}
+
+static inline void toku_mutexattr_destroy(toku_pthread_mutexattr_t *attr) {
+  int r = pthread_mutexattr_destroy(attr);
+  assert_zero(r);
+}
+
+#if defined(TOKU_PTHREAD_DEBUG)
+static inline void toku_mutex_assert_locked(const toku_mutex_t *mutex) {
+  invariant(mutex->locked);
+  invariant(mutex->owner == pthread_self());
+}
+#else
+static inline void toku_mutex_assert_locked(const toku_mutex_t *mutex
+                                            __attribute__((unused))) {}
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+
+// asserting that a mutex is unlocked only makes sense
+// if the calling thread can guarantee that no other threads
+// are trying to lock this mutex at the time of the assertion
+//
+// a good example of this is a tree with mutexes on each node.
+// when a node is locked the caller knows that no other threads
+// can be trying to lock its children's mutexes.  the children
+// are in one of two fixed states: locked or unlocked.
+#if defined(TOKU_PTHREAD_DEBUG)
+static inline void toku_mutex_assert_unlocked(toku_mutex_t *mutex) {
+  invariant(mutex->owner == 0);
+  invariant(!mutex->locked);
+}
+#else
+static inline void toku_mutex_assert_unlocked(toku_mutex_t *mutex
+                                              __attribute__((unused))) {}
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+
+#define toku_mutex_lock(M) \
+  toku_mutex_lock_with_source_location(M, __FILE__, __LINE__)
+
+static inline void toku_cond_init(toku_cond_t *cond,
+                                  const toku_pthread_condattr_t *attr) {
+  int r = pthread_cond_init(&cond->pcond, attr);
+  assert_zero(r);
+}
+
+#define toku_mutex_trylock(M) \
+  toku_mutex_trylock_with_source_location(M, __FILE__, __LINE__)
+
+inline void toku_mutex_unlock(toku_mutex_t *mutex) {
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(mutex->owner == pthread_self());
+  invariant(mutex->valid);
+  invariant(mutex->locked);
+  mutex->locked = false;
+  mutex->owner = 0;
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+  toku_instr_mutex_unlock(mutex->psi_mutex);
+  int r = pthread_mutex_unlock(&mutex->pmutex);
+  assert_zero(r);
+}
+
+inline void toku_mutex_lock_with_source_location(toku_mutex_t *mutex,
+                                                 const char *src_file,
+                                                 int src_line) {
+  toku_mutex_instrumentation mutex_instr;
+  toku_instr_mutex_lock_start(mutex_instr, *mutex, src_file, src_line);
+
+  const int r = pthread_mutex_lock(&mutex->pmutex);
+  toku_instr_mutex_lock_end(mutex_instr, r);
+
+  assert_zero(r);
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(mutex->valid);
+  invariant(!mutex->locked);
+  invariant(mutex->owner == 0);
+  mutex->locked = true;
+  mutex->owner = pthread_self();
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+}
+
+inline int toku_mutex_trylock_with_source_location(toku_mutex_t *mutex,
+                                                   const char *src_file,
+                                                   int src_line) {
+  toku_mutex_instrumentation mutex_instr;
+  toku_instr_mutex_trylock_start(mutex_instr, *mutex, src_file, src_line);
+
+  // try semantics: r is nonzero (EBUSY) when the mutex is already held
+  const int r = pthread_mutex_trylock(&mutex->pmutex);
+  toku_instr_mutex_lock_end(mutex_instr, r);
+
+#if defined(TOKU_PTHREAD_DEBUG)
+  if (r == 0) {
+    invariant(mutex->valid);
+    invariant(!mutex->locked);
+    invariant(mutex->owner == 0);
+    mutex->locked = true;
+    mutex->owner = pthread_self();
+  }
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+  return r;
+}
+
+#define toku_cond_wait(C, M) \
+  toku_cond_wait_with_source_location(C, M, __FILE__, __LINE__)
+
+#define toku_cond_timedwait(C, M, W) \
+  toku_cond_timedwait_with_source_location(C, M, W, __FILE__, __LINE__)
+
+inline void toku_cond_init(const toku_instr_key &key, toku_cond_t *cond,
+                           const pthread_condattr_t *attr) {
+  toku_instr_cond_init(key, *cond);
+  int r = pthread_cond_init(&cond->pcond, attr);
+  assert_zero(r);
+}
+
+inline void toku_cond_destroy(toku_cond_t *cond) {
+  toku_instr_cond_destroy(cond->psi_cond);
+  int r = pthread_cond_destroy(&cond->pcond);
+  assert_zero(r);
+}
+
+inline void toku_cond_wait_with_source_location(toku_cond_t *cond,
+                                                toku_mutex_t *mutex,
+                                                const char *src_file,
+                                                int src_line) {
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(mutex->locked);
+  mutex->locked = false;
+  mutex->owner = 0;
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+
+  /* Instrumentation start */
+  toku_cond_instrumentation cond_instr;
+  toku_instr_cond_wait_start(cond_instr, toku_instr_cond_op::cond_wait, *cond,
+                             *mutex, src_file, src_line);
+
+  /* Instrumented code */
+  const int r = pthread_cond_wait(&cond->pcond, &mutex->pmutex);
+
+  /* Instrumentation end */
+  toku_instr_cond_wait_end(cond_instr, r);
+
+  assert_zero(r);
+#if
defined(TOKU_PTHREAD_DEBUG) + invariant(!mutex->locked); + mutex->locked = true; + mutex->owner = pthread_self(); +#endif // defined(TOKU_PTHREAD_DEBUG) +} + +inline int toku_cond_timedwait_with_source_location(toku_cond_t *cond, + toku_mutex_t *mutex, + toku_timespec_t *wakeup_at, + const char *src_file, + int src_line) { +#if defined(TOKU_PTHREAD_DEBUG) + invariant(mutex->locked); + mutex->locked = false; + mutex->owner = 0; +#endif // defined(TOKU_PTHREAD_DEBUG) + + /* Instrumentation start */ + toku_cond_instrumentation cond_instr; + toku_instr_cond_wait_start(cond_instr, toku_instr_cond_op::cond_timedwait, + *cond, *mutex, src_file, src_line); + + /* Instrumented code */ + const int r = pthread_cond_timedwait(&cond->pcond, &mutex->pmutex, wakeup_at); + + /* Instrumentation end */ + toku_instr_cond_wait_end(cond_instr, r); + +#if defined(TOKU_PTHREAD_DEBUG) + invariant(!mutex->locked); + mutex->locked = true; + mutex->owner = pthread_self(); +#endif // defined(TOKU_PTHREAD_DEBUG) + return r; +} + +inline void toku_cond_signal(toku_cond_t *cond) { + toku_instr_cond_signal(*cond); + const int r = pthread_cond_signal(&cond->pcond); + assert_zero(r); +} + +inline void toku_cond_broadcast(toku_cond_t *cond) { + toku_instr_cond_broadcast(*cond); + const int r = pthread_cond_broadcast(&cond->pcond); + assert_zero(r); +} + +inline void toku_mutex_init(const toku_instr_key &key, toku_mutex_t *mutex, + const toku_pthread_mutexattr_t *attr) { +#if defined(TOKU_PTHREAD_DEBUG) + mutex->valid = true; +#endif // defined(TOKU_PTHREAD_DEBUG) + toku_instr_mutex_init(key, *mutex); + const int r = pthread_mutex_init(&mutex->pmutex, attr); + assert_zero(r); +#if defined(TOKU_PTHREAD_DEBUG) + mutex->locked = false; + invariant(mutex->valid); + mutex->valid = true; + mutex->owner = 0; +#endif // defined(TOKU_PTHREAD_DEBUG) +} + +inline void toku_mutex_destroy(toku_mutex_t *mutex) { +#if defined(TOKU_PTHREAD_DEBUG) + invariant(mutex->valid); + mutex->valid = false; + invariant(!mutex->locked); +#endif // defined(TOKU_PTHREAD_DEBUG) + toku_instr_mutex_destroy(mutex->psi_mutex); + int r = pthread_mutex_destroy(&mutex->pmutex); + assert_zero(r); +} + +#define toku_pthread_rwlock_rdlock(RW) \ + toku_pthread_rwlock_rdlock_with_source_location(RW, __FILE__, __LINE__) + +#define toku_pthread_rwlock_wrlock(RW) \ + toku_pthread_rwlock_wrlock_with_source_location(RW, __FILE__, __LINE__) + +#if 0 +inline void toku_pthread_rwlock_init( + const toku_instr_key &key, + toku_pthread_rwlock_t *__restrict rwlock, + const toku_pthread_rwlockattr_t *__restrict attr) { + toku_instr_rwlock_init(key, *rwlock); + int r = pthread_rwlock_init(&rwlock->rwlock, attr); + assert_zero(r); +} + +inline void toku_pthread_rwlock_destroy(toku_pthread_rwlock_t *rwlock) { + toku_instr_rwlock_destroy(rwlock->psi_rwlock); + int r = pthread_rwlock_destroy(&rwlock->rwlock); + assert_zero(r); +} + +inline void toku_pthread_rwlock_rdlock_with_source_location( + toku_pthread_rwlock_t *rwlock, + const char *src_file, + uint src_line) { + + /* Instrumentation start */ + toku_rwlock_instrumentation rwlock_instr; + toku_instr_rwlock_rdlock_wait_start( + rwlock_instr, *rwlock, src_file, src_line); + /* Instrumented code */ + const int r = pthread_rwlock_rdlock(&rwlock->rwlock); + + /* Instrumentation end */ + toku_instr_rwlock_rdlock_wait_end(rwlock_instr, r); + + assert_zero(r); +} + +inline void toku_pthread_rwlock_wrlock_with_source_location( + toku_pthread_rwlock_t *rwlock, + const char *src_file, + uint src_line) { + + /* Instrumentation start */ + 
toku_rwlock_instrumentation rwlock_instr; + toku_instr_rwlock_wrlock_wait_start( + rwlock_instr, *rwlock, src_file, src_line); + /* Instrumented code */ + const int r = pthread_rwlock_wrlock(&rwlock->rwlock); + + /* Instrumentation end */ + toku_instr_rwlock_wrlock_wait_end(rwlock_instr, r); + + assert_zero(r); +} + +inline void toku_pthread_rwlock_rdunlock(toku_pthread_rwlock_t *rwlock) { + toku_instr_rwlock_unlock(*rwlock); + const int r = pthread_rwlock_unlock(&rwlock->rwlock); + assert_zero(r); +} + +inline void toku_pthread_rwlock_wrunlock(toku_pthread_rwlock_t *rwlock) { + toku_instr_rwlock_unlock(*rwlock); + const int r = pthread_rwlock_unlock(&rwlock->rwlock); + assert_zero(r); +} +#endif + +static inline int toku_pthread_join(toku_pthread_t thread, void **value_ptr) { + return pthread_join(thread, value_ptr); +} + +static inline int toku_pthread_detach(toku_pthread_t thread) { + return pthread_detach(thread); +} + +static inline int toku_pthread_key_create(toku_pthread_key_t *key, + void (*destroyf)(void *)) { + return pthread_key_create(key, destroyf); +} + +static inline int toku_pthread_key_delete(toku_pthread_key_t key) { + return pthread_key_delete(key); +} + +static inline void *toku_pthread_getspecific(toku_pthread_key_t key) { + return pthread_getspecific(key); +} + +static inline int toku_pthread_setspecific(toku_pthread_key_t key, void *data) { + return pthread_setspecific(key, data); +} + +int toku_pthread_yield(void) __attribute__((__visibility__("default"))); + +static inline toku_pthread_t toku_pthread_self(void) { return pthread_self(); } + +static inline void *toku_pthread_done(void *exit_value) { + toku_instr_delete_current_thread(); + pthread_exit(exit_value); + return nullptr; // Avoid compiler warning +} diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h new file mode 100644 index 0000000000..3cb5b57902 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h @@ -0,0 +1,179 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
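A sketch of the canonical wait loop with the wrappers above (hypothetical struct and keys). Source location is captured automatically by the `toku_mutex_lock`/`toku_cond_wait` macros, and every pthread return code is checked internally via `assert_zero()`:

```cpp
#include "toku_pthread.h"  // assumed include path

struct waiter {
  toku_mutex_t lock;
  toku_cond_t ready;
  bool signalled;
};

void waiter_init(const toku_instr_key &key_m, const toku_instr_key &key_c,
                 waiter *w) {
  toku_mutex_init(key_m, &w->lock, nullptr);
  toku_cond_init(key_c, &w->ready, nullptr);
  w->signalled = false;
}

void waiter_block(waiter *w) {
  toku_mutex_lock(&w->lock);        // expands with __FILE__/__LINE__
  while (!w->signalled)             // guard against spurious wakeups
    toku_cond_wait(&w->ready, &w->lock);
  toku_mutex_unlock(&w->lock);
}
```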
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// PORT2: #include
+
+#ifdef HAVE_valgrind
+#undef USE_VALGRIND
+#define USE_VALGRIND 1
+#endif
+
+#if defined(__linux__) && USE_VALGRIND
+
+#include <valgrind/drd.h>
+#include <valgrind/helgrind.h>
+
+#define TOKU_ANNOTATE_NEW_MEMORY(p, size) ANNOTATE_NEW_MEMORY(p, size)
+#define TOKU_VALGRIND_HG_ENABLE_CHECKING(p, size) \
+  VALGRIND_HG_ENABLE_CHECKING(p, size)
+#define TOKU_VALGRIND_HG_DISABLE_CHECKING(p, size) \
+  VALGRIND_HG_DISABLE_CHECKING(p, size)
+#define TOKU_DRD_IGNORE_VAR(v) DRD_IGNORE_VAR(v)
+#define TOKU_DRD_STOP_IGNORING_VAR(v) DRD_STOP_IGNORING_VAR(v)
+#define TOKU_ANNOTATE_IGNORE_READS_BEGIN() ANNOTATE_IGNORE_READS_BEGIN()
+#define TOKU_ANNOTATE_IGNORE_READS_END() ANNOTATE_IGNORE_READS_END()
+#define TOKU_ANNOTATE_IGNORE_WRITES_BEGIN() ANNOTATE_IGNORE_WRITES_BEGIN()
+#define TOKU_ANNOTATE_IGNORE_WRITES_END() ANNOTATE_IGNORE_WRITES_END()
+
+/*
+ * How to make helgrind happy about tree rotations and new mutex orderings:
+ *
+ * // Tell helgrind that we unlocked it so that the next call doesn't get a
+ * "destroyed a locked mutex" error.
+ * // Tell helgrind that we destroyed the mutex.
+ * VALGRIND_HG_MUTEX_UNLOCK_PRE(&locka);
+ * VALGRIND_HG_MUTEX_DESTROY_PRE(&locka);
+ *
+ * // And recreate it. It would be better to simply be able to say that the
+ * order on these two can now be reversed, because this code forgets all the
+ * ordering information for this mutex.
+ * // Then tell helgrind that we have locked it again.
+ * VALGRIND_HG_MUTEX_INIT_POST(&locka, 0);
+ * VALGRIND_HG_MUTEX_LOCK_POST(&locka);
+ *
+ * When the ordering of two locks changes, we don't need to tell Helgrind
+ * about both locks. Just one is good enough.
+ */
+
+#define TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(mutex) \
+  VALGRIND_HG_MUTEX_UNLOCK_PRE(mutex);                 \
+  VALGRIND_HG_MUTEX_DESTROY_PRE(mutex);                \
+  VALGRIND_HG_MUTEX_INIT_POST(mutex, 0);               \
+  VALGRIND_HG_MUTEX_LOCK_POST(mutex);
+
+#else  // !defined(__linux__) || !USE_VALGRIND
+
+#define NVALGRIND 1
+#define TOKU_ANNOTATE_NEW_MEMORY(p, size) ((void)0)
+#define TOKU_VALGRIND_HG_ENABLE_CHECKING(p, size) ((void)0)
+#define TOKU_VALGRIND_HG_DISABLE_CHECKING(p, size) ((void)0)
+#define TOKU_DRD_IGNORE_VAR(v)
+#define TOKU_DRD_STOP_IGNORING_VAR(v)
+#define TOKU_ANNOTATE_IGNORE_READS_BEGIN() ((void)0)
+#define TOKU_ANNOTATE_IGNORE_READS_END() ((void)0)
+#define TOKU_ANNOTATE_IGNORE_WRITES_BEGIN() ((void)0)
+#define TOKU_ANNOTATE_IGNORE_WRITES_END() ((void)0)
+#define TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(mutex)
+#undef RUNNING_ON_VALGRIND
+#define RUNNING_ON_VALGRIND (0U)
+#endif
+
+// Valgrind 3.10.1 (and previous versions).
+// Problems with VALGRIND_HG_DISABLE_CHECKING and VALGRIND_HG_ENABLE_CHECKING.
+
+// Helgrind's implementation of disable and enable checking causes false races
+// to be reported. In addition, the race report does not include ANY
+// information about the code that uses the helgrind disable and enable
+// functions. Therefore, it is very difficult to figure out the cause of the
+// race. DRD does implement the disable and enable functions.
+
+// Problems with ANNOTATE_IGNORE_READS.
+// Helgrind does not implement ignore reads.
+// Annotate ignore reads is the way to inform DRD to ignore racy reads.
+
+// FT code uses unsafe reads in several places. These unsafe reads have been
+// noted as valid since they use the toku_unsafe_fetch function. Unfortunately,
+// this causes helgrind to report erroneous data races which makes use of
+// helgrind problematic.
+
+// Unsafely fetch and return a `T' from src, telling drd to ignore
+// racey access to src for the next sizeof(*src) bytes
+template <typename T>
+T toku_unsafe_fetch(T *src) {
+  if (0)
+    TOKU_VALGRIND_HG_DISABLE_CHECKING(src,
+                                      sizeof *src);  // disabled, see comment
+  TOKU_ANNOTATE_IGNORE_READS_BEGIN();
+  T r = *src;
+  TOKU_ANNOTATE_IGNORE_READS_END();
+  if (0)
+    TOKU_VALGRIND_HG_ENABLE_CHECKING(src,
+                                     sizeof *src);  // disabled, see comment
+  return r;
+}
+
+template <typename T>
+T toku_unsafe_fetch(T &src) {
+  return toku_unsafe_fetch(&src);
+}
+
+// Unsafely set a `T' value into *dest from src, telling drd to ignore
+// racey access to dest for the next sizeof(*dest) bytes
+template <typename T>
+void toku_unsafe_set(T *dest, const T src) {
+  if (0)
+    TOKU_VALGRIND_HG_DISABLE_CHECKING(dest,
+                                      sizeof *dest);  // disabled, see comment
+  TOKU_ANNOTATE_IGNORE_WRITES_BEGIN();
+  *dest = src;
+  TOKU_ANNOTATE_IGNORE_WRITES_END();
+  if (0)
+    TOKU_VALGRIND_HG_ENABLE_CHECKING(dest,
+                                     sizeof *dest);  // disabled, see comment
+}
+
+template <typename T>
+void toku_unsafe_set(T &dest, const T src) {
+  toku_unsafe_set(&dest, src);
+}
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h
new file mode 100644
index 0000000000..9b83c53511
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h
@@ -0,0 +1,197 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
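A sketch of the intended use of `toku_unsafe_fetch`/`toku_unsafe_set` (hypothetical counter name): a statistic that tolerates racy reads is sampled without locking, and the annotations keep DRD from flagging the intentional race.

```cpp
#include <cstdint>
#include "toku_race_tools.h"  // assumed include path

static uint64_t g_lock_waits;  // hypothetical counter, updated elsewhere

uint64_t sample_lock_waits() {
  return toku_unsafe_fetch(&g_lock_waits);  // racy read, by design
}

void reset_lock_waits() {
  toku_unsafe_set(&g_lock_waits, uint64_t{0});  // racy write, by design
}
```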
+    See the GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// PORT2: #include "toku_config.h"
+
+#include <time.h>
+#include <sys/time.h>
+#include <stdint.h>
+#if defined(__powerpc__)
+#include <sys/platform/ppc.h>
+#endif
+
+#if 0
+static inline float toku_tdiff (struct timeval *a, struct timeval *b) {
+    return (float)((a->tv_sec - b->tv_sec) + 1e-6 * (a->tv_usec - b->tv_usec));
+}
+// PORT2: temporary:
+#define HAVE_CLOCK_REALTIME
+#if !defined(HAVE_CLOCK_REALTIME)
+// OS X does not have clock_gettime, we fake clockid_t for the interface, and we'll implement it with clock_get_time.
+typedef int clockid_t;
+// just something bogus, it doesn't matter, we just want to make sure we're
+// only supporting this mode because we're not sure we can support other modes
+// without a real clock_gettime()
+#define CLOCK_REALTIME 0x01867234
+#endif
+int toku_clock_gettime(clockid_t clk_id, struct timespec *ts) __attribute__((__visibility__("default")));
+#endif
+
+// *************** Performance timers ************************
+// What do you really want from a performance timer:
+//  (1) Can determine actual time of day from the performance time.
+//  (2) Time goes forward, never backward.
+//  (3) Same time on different processors (or even different machines).
+//  (4) Time goes forward at a constant rate (doesn't get faster and slower)
+//  (5) Portable.
+//  (6) Getting the time is cheap.
+// Unfortunately it seems tough to get Properties 1-5. So we go for Property 6,
+// but we abstract it. We offer a type tokutime_t which can hold the time. This
+// type can be subtracted to get a time difference. We can get the present time
+// cheaply. We can convert this type to seconds (but that can be expensive). The
+// implementation is to use RDTSC (hence we lose property 3: not portable).
+// Recent machines have constant_tsc in which case we get property (4).
+// Recent OSs on recent machines (that have RDTSCP) fix the per-processor clock
+// skew, so we get property (3). We get property 2 with RDTSC (as long as
+// there's not any skew). We don't even try to get property 1, since we don't
+// need it. The decision here is that these times are really accurate only on
+// modern machines with modern OSs.
+typedef uint64_t tokutime_t;  // Time type used by tokutek timers.
+
+#if 0
+// The value of tokutime_t is not specified here.
+// It might be microseconds since 1/1/1970 (if gettimeofday() is
+// used), or clock cycles since boot (if rdtsc is used).  Or something
+// else.
+// Two tokutime_t values can be subtracted to get a time difference.
+// Use tokutime_to_seconds to convert that difference to seconds.
+// We want get_tokutime() to be fast, but don't care so much about
+// tokutime_to_seconds();
+//
+// For accurate time calculations do the subtraction in the right order:
+//   Right: tokutime_to_seconds(t1-t2);
+//   Wrong: tokutime_to_seconds(t1)-tokutime_to_seconds(t2);
+// Doing it the wrong way is likely to result in loss of precision.
+// A double can hold numbers up to about 53 bits. RDTSC uses about 33 bits
+// every second, so that leaves 2^20 seconds from booting (about 2 weeks)
+// before the RDTSC value cannot be represented accurately as a double.
+//
+double tokutime_to_seconds(tokutime_t)
+    __attribute__((__visibility__("default")));  // Convert tokutime to seconds.
+
+#endif
+
+// Get the value of tokutime for right now. We want this to be fast, so we
+// expose the implementation as RDTSC.
+static inline tokutime_t toku_time_now(void) {
+#if defined(__x86_64__) || defined(__i386__)
+  uint32_t lo, hi;
+  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
+  return (uint64_t)hi << 32 | lo;
+#elif defined(__aarch64__)
+  uint64_t result;
+  __asm __volatile__("mrs %[rt], cntvct_el0" : [rt] "=r"(result));
+  return result;
+#elif defined(__powerpc__)
+  return __ppc_get_timebase();
+#elif defined(__s390x__)
+  uint64_t result;
+  asm volatile("stckf %0" : "=Q"(result) : : "cc");
+  return result;
+#elif defined(__riscv) && __riscv_xlen == 32
+  uint32_t cycles_lo, cycles_hi0, cycles_hi1;
+  // Implemented in assembly because Clang insisted on branching.
+  asm volatile(
+      "rdcycleh %0\n"
+      "rdcycle %1\n"
+      "rdcycleh %2\n"
+      "sub %0, %0, %2\n"
+      "seqz %0, %0\n"
+      "sub %0, zero, %0\n"
+      "and %1, %1, %0\n"
+      : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
+  return (static_cast<uint64_t>(cycles_hi1) << 32) | cycles_lo;
+#elif defined(__riscv) && __riscv_xlen == 64
+  uint64_t cycles;
+  asm volatile("rdcycle %0" : "=r"(cycles));
+  return cycles;
+#elif defined(__loongarch64)
+  unsigned long result;
+  asm volatile("rdtime.d\t%0,$r0" : "=r"(result));
+  return result;
+#else
+#error No timer implementation for this platform
+#endif
+}
+
+static inline uint64_t toku_current_time_microsec(void) {
+  struct timeval t;
+  gettimeofday(&t, NULL);
+  return t.tv_sec * (1UL * 1000 * 1000) + t.tv_usec;
+}
+
+#if 0
+// sleep microseconds
+static inline void toku_sleep_microsec(uint64_t ms) {
+  struct timeval t;
+
+  t.tv_sec = ms / 1000000;
+  t.tv_usec = ms % 1000000;
+
+  select(0, NULL, NULL, NULL, &t);
+}
+#endif
+
+/*
+  PORT: Usage of this file:
+
+  uint64_t toku_current_time_microsec()  // uses gettimeofday
+    is used to track how much time various operations took (for example, lock
+    escalation). (TODO: it is not clear why these operations are tracked with
+    microsecond precision while others use nanoseconds)
+
+  tokutime_t toku_time_now()  // uses rdtsc
+    seems to be used for a very similar purpose. This has greater precision.
+
+  RocksDB environment provides Env::Default()->NowMicros() and NowNanos() which
+  should be adequate substitutes.
+*/
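+
+// Example (added for exposition; not part of the original sources): timing a
+// code section with the RDTSC-based timer. `work()' is a hypothetical
+// workload; the result is in raw timer ticks, not seconds.
+#if 0
+static inline uint64_t time_work_ticks(void) {
+  tokutime_t t0 = toku_time_now();
+  work();  // hypothetical workload
+  tokutime_t t1 = toku_time_now();
+  return t1 - t0;  // subtract first; convert the difference if seconds needed
+}
+#endif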
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h
new file mode 100644
index 0000000000..803914862f
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h
@@ -0,0 +1,27 @@
+//
+// A substitute for ft/txn/txn.h
+//
+#pragma once
+
+#include <set>
+
+#include "../util/omt.h"
+
+typedef uint64_t TXNID;
+#define TXNID_NONE ((TXNID)0)
+
+// A set of transactions
+// (TODO: consider using class toku::txnid_set. The reason for using an STL
+// container was that its API is easier)
+class TxnidVector : public std::set<TXNID> {
+ public:
+  bool contains(TXNID txnid) { return find(txnid) != end(); }
+};
+
+// A value for lock structures with the meaning "the lock is owned by multiple
+// transactions (and one has to check the TxnidVector to get their ids)"
+#define TXNID_SHARED (TXNID(-1))
+
+// Auxiliary value meaning "any transaction id will do". No real transaction
+// may have this as its id.
+#define TXNID_ANY (TXNID(-2))
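+
+// Example (added for exposition; not part of the original sources): how the
+// sentinel TXNIDs and TxnidVector fit together in a lock structure. The
+// function and parameter names are hypothetical.
+#if 0
+static bool txn_owns_lock(TXNID owner, TxnidVector *shared_owners, TXNID txn) {
+  if (owner == TXNID_SHARED)  // multiple owners: consult the vector
+    return shared_owners != nullptr && shared_owners->contains(txn);
+  return owner == txn;  // single owner: compare ids directly
+}
+#endif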
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/dbt.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/dbt.h
new file mode 100644
index 0000000000..d86c440f86
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/dbt.h
@@ -0,0 +1,98 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../db.h"
+
+// TODO: John
+// Document this API a little better so that DBT
+// memory management can be more widely understood.
+
+DBT *toku_init_dbt(DBT *);
+
+// returns: an initialized but empty dbt (for which toku_dbt_is_empty() is
+// true)
+DBT toku_empty_dbt(void);
+
+DBT *toku_init_dbt_flags(DBT *, uint32_t flags);
+
+void toku_destroy_dbt(DBT *);
+
+DBT *toku_fill_dbt(DBT *dbt, const void *k, size_t len);
+
+DBT *toku_memdup_dbt(DBT *dbt, const void *k, size_t len);
+
+DBT *toku_copyref_dbt(DBT *dst, const DBT src);
+
+DBT *toku_clone_dbt(DBT *dst, const DBT &src);
+
+void toku_sdbt_cleanup(struct simple_dbt *sdbt);
+
+// returns: special DBT pointer representing positive infinity
+const DBT *toku_dbt_positive_infinity(void);
+
+// returns: special DBT pointer representing negative infinity
+const DBT *toku_dbt_negative_infinity(void);
+
+// returns: true if the given dbt is either positive or negative infinity
+bool toku_dbt_is_infinite(const DBT *dbt);
+
+// returns: true if the given dbt has no data (ie: dbt->data == nullptr)
+bool toku_dbt_is_empty(const DBT *dbt);
+
+// effect: compares two potentially infinity-valued dbts
+// requires: at least one is infinite (assert otherwise)
+int toku_dbt_infinite_compare(const DBT *a, const DBT *b);
+
+// returns: true if the given dbts have the same data pointer and size
+bool toku_dbt_equals(const DBT *a, const DBT *b);
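+
+// Example (added for exposition; not part of the original sources): the two
+// common ways to wrap a key into a DBT. toku_fill_dbt only references the
+// caller's buffer, while toku_memdup_dbt copies it, so the copy presumably
+// has to be released with toku_destroy_dbt. Names below are hypothetical.
+#if 0
+static void dbt_usage_sketch(const char *key, size_t keylen) {
+  DBT ref;  // borrows `key'; nothing to free
+  toku_fill_dbt(&ref, key, keylen);
+
+  DBT owned;  // owns a private copy of `key'
+  toku_memdup_dbt(&owned, key, keylen);
+  toku_destroy_dbt(&owned);
+}
+#endif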
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h
new file mode 100644
index 0000000000..158750fdba
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h
@@ -0,0 +1,144 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <memory.h>
+
+//******************************************************************************
+//
+// Overview: A growable array is a little bit like std::vector except that
+// it doesn't have constructors (hence can be used in static constructs, since
+// the google style guide says no constructors), and it's a little simpler.
+// Operations:
+//   init and deinit (we don't have constructors and destructors).
+//   fetch_unchecked to get values out.
+//   store_unchecked to put values in.
+//   push to add an element at the end.
+//   get_size to find out the size.
+//   memory_size to find out how much memory the data structure is using.
+//
+//******************************************************************************
+
+namespace toku {
+
+template <typename T>
+class GrowableArray {
+ public:
+  void init(void)
+  // Effect: Initialize the array to contain no elements.
+  {
+    m_array = NULL;
+    m_size = 0;
+    m_size_limit = 0;
+  }
+
+  void deinit(void)
+  // Effect: Deinitialize the array (freeing any memory it uses, for example).
+  {
+    toku_free(m_array);
+    m_array = NULL;
+    m_size = 0;
+    m_size_limit = 0;
+  }
+
+  T fetch_unchecked(size_t i) const
+  // Effect: Fetch the ith element. If i is out of range, the system asserts.
+  {
+    return m_array[i];
+  }
+
+  void store_unchecked(size_t i, T v)
+  // Effect: Store v in the ith element. If i is out of range, the system
+  // asserts.
+  {
+    paranoid_invariant(i < m_size);
+    m_array[i] = v;
+  }
+
+  void push(T v)
+  // Effect: Add v to the end of the array (increasing the size). The
+  // amortized cost of this operation is constant. Implementation hint:
+  // Double the size of the array when it gets too big so that the amortized
+  // cost stays constant.
+  {
+    if (m_size >= m_size_limit) {
+      if (m_array == NULL) {
+        m_size_limit = 1;
+      } else {
+        m_size_limit *= 2;
+      }
+      XREALLOC_N(m_size_limit, m_array);
+    }
+    m_array[m_size++] = v;
+  }
+
+  size_t get_size(void) const
+  // Effect: Return the number of elements in the array.
+  {
+    return m_size;
+  }
+
+  size_t memory_size(void) const
+  // Effect: Return the size (in bytes) that the array occupies in memory.
+  // This is really only an estimate.
+  {
+    return sizeof(*this) + sizeof(T) * m_size_limit;
+  }
+
+ private:
+  T *m_array;
+  size_t m_size;
+  size_t m_size_limit;  // How much space is allocated in array.
+};
+
+}  // namespace toku
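+
+// Example (added for exposition; not part of the original sources): the
+// intended init/push/deinit life cycle. Unlike std::vector there is no
+// constructor or destructor, so init() and deinit() are called explicitly.
+#if 0
+static void growable_array_sketch(void) {
+  toku::GrowableArray<int> a;
+  a.init();  // required before first use
+  for (int i = 0; i < 10; i++) {
+    a.push(i);  // amortized O(1): capacity doubles as needed
+  }
+  int third = a.fetch_unchecked(3);  // no bounds check; caller stays in range
+  (void)third;
+  a.deinit();  // frees the backing storage
+}
+#endif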
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/memarena.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/memarena.h
new file mode 100644
index 0000000000..ddcc1144f2
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/memarena.h
@@ -0,0 +1,141 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <stddef.h>
+
+/*
+ * A memarena is used to efficiently store a collection of objects that never
+ * move. The pattern is to allocate more and more stuff and free all of the
+ * items at once. The underlying memory will store 1 or more objects per
+ * chunk. Each chunk is contiguously laid out in memory but chunks are not
+ * necessarily contiguous with each other.
+ */
+class memarena {
+ public:
+  memarena()
+      : _current_chunk(arena_chunk()),
+        _other_chunks(nullptr),
+        _n_other_chunks(0),
+        _size_of_other_chunks(0),
+        _footprint_of_other_chunks(0) {}
+
+  // Effect: Create a memarena with the specified initial size
+  void create(size_t initial_size);
+
+  void destroy(void);
+
+  // Effect: Allocate some memory. The returned value remains valid until the
+  // memarena is cleared or closed.
+  // In case of ENOMEM, aborts.
+  void *malloc_from_arena(size_t size);
+
+  // Effect: Move all the memory from this memarena into DEST.
+  // When SOURCE is closed the memory won't be freed.
+  // When DEST is closed, the memory will be freed, unless DEST moves
+  // its memory to another memarena...
+  void move_memory(memarena *dest);
+
+  // Effect: Calculate the amount of memory used by a memory arena.
+  size_t total_memory_size(void) const;
+
+  // Effect: Calculate the used space of the memory arena (ie: excludes unused
+  // space)
+  size_t total_size_in_use(void) const;
+
+  // Effect: Calculate the amount of memory used, according to
+  // toku_memory_footprint(),
+  // which is a more expensive but more accurate count of memory used.
+  size_t total_footprint(void) const;
+
+  // iterator over the underlying chunks that store objects in the memarena.
+  // a chunk is represented by a pointer to const memory and a usable byte
+  // count.
+  class chunk_iterator {
+   public:
+    chunk_iterator(const memarena *ma) : _ma(ma), _chunk_idx(-1) {}
+
+    // returns: base pointer to the current chunk
+    //          *used set to the number of usable bytes
+    //          if more() is false, returns nullptr and *used = 0
+    const void *current(size_t *used) const;
+
+    // requires: more() is true
+    void next();
+
+    bool more() const;
+
+   private:
+    // -1 represents the 'initial' chunk in a memarena, ie: ma->_current_chunk
+    // >= 0 represents the i'th chunk in the ma->_other_chunks array
+    const memarena *_ma;
+    int _chunk_idx;
+  };
+
+ private:
+  struct arena_chunk {
+    arena_chunk() : buf(nullptr), used(0), size(0) {}
+    char *buf;
+    size_t used;
+    size_t size;
+  };
+
+  struct arena_chunk _current_chunk;
+  struct arena_chunk *_other_chunks;
+  int _n_other_chunks;
+  size_t _size_of_other_chunks;  // the buf_size of all the other chunks.
+  size_t _footprint_of_other_chunks;  // the footprint of all the other chunks.
+
+  friend class memarena_unit_test;
+};
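+
+// Example (added for exposition; not part of the original sources): the arena
+// allocation pattern this class supports. Objects never move once returned,
+// and there is no per-object free; destroy() releases everything at once.
+#if 0
+static void memarena_sketch(void) {
+  memarena ma;
+  ma.create(4096);  // initial chunk size in bytes
+  char *a = static_cast<char *>(ma.malloc_from_arena(100));
+  char *b = static_cast<char *>(ma.malloc_from_arena(200));
+  (void)a;
+  (void)b;
+  ma.destroy();  // frees every chunk, and thus both allocations
+}
+#endif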
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/omt.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/omt.h
new file mode 100644
index 0000000000..f208002d32
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/omt.h
@@ -0,0 +1,794 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <stdint.h>
+#include <string.h>
+
+#include "../portability/toku_portability.h"
+#include "../portability/toku_race_tools.h"
+#include "growable_array.h"
+
+namespace toku {
+
+/**
+ * Order Maintenance Tree (OMT)
+ *
+ * Maintains a collection of totally ordered values, where each value has an
+ * integer weight. The OMT is a mutable datatype.
+ *
+ * The Abstraction:
+ *
+ * An OMT is a vector of values, $V$, where $|V|$ is the length of the vector.
+ * The vector is numbered from $0$ to $|V|-1$.
+ * Each value has a weight. The weight of the $i$th element is denoted
+ * $w(V_i)$.
+ *
+ * We can create a new OMT, which is the empty vector.
+ *
+ * We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where
+ *  $|V'|=1+|V|$ and
+ *
+ *   V'_j = V_j       if $j<i$
+ *   V'_i = x
+ *   V'_j = V_{j-1}   if $j>i$
+ *
+ * We can specify $i$ using a kind of function instead of as an integer.
+ * Let $b$ be a function mapping from values to nonzero integers, such that
+ * the signum of $b$ is monotonically increasing.
+ * We can specify $i$ as the minimum integer such that $b(V_i)>0$.
+ *
+ * We look up a value using its index, or using a Heaviside function.
+ * For lookups, we allow $b$ to be zero for some values, and again the signum
+ * of $b$ must be monotonically increasing. When looking up values, we can
+ * look up
+ *  $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$. (With a
+ *   special return code if no such value exists.) (Rationale: Ordinarily we
+ *   want $i$ to be unique. But for various reasons we want to allow multiple
+ *   zeros, and we want the smallest $i$ in that case.)
+ *  $V_i$ where $i$ is the minimum integer such that $b(V_i)>0$. (Or an
+ *   indication that no such value exists.)
+ *  $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$. (Or an
+ *   indication that no such value exists.)
+ *
+ * When looking up a value using a Heaviside function, we get the value and its
+ * index.
+ *
+ * We can also split an OMT into two OMTs, splitting the weight of the values
+ * evenly. Find a value $j$ such that the values to the left of $j$ have about
+ * the same total weight as the values to the right of $j$. The resulting two
+ * OMTs contain the values to the left of $j$ and the values to the right of
+ * $j$ respectively. All of the values from the original OMT go into one of
+ * the new OMTs. If the weights of the values don't split exactly evenly, then
+ * the implementation has the freedom to choose whether the new left OMT or
+ * the new right OMT is larger.
+ *
+ * Performance:
+ *  Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$
+ *  calls to the Heaviside function. The memory required is O(|V|).
+ *
+ * Usage:
+ *  The omt is templated by two parameters:
+ *   - omtdata_t is what will be stored within the omt. These could be
+ *     pointers or real data types (ints, structs).
+ *   - omtdataout_t is what will be returned by find and related functions. By
+ *     default, it is the same as omtdata_t, but you can set it to
+ *     (omtdata_t *). To create an omt which will store "TXNID"s, for example,
+ *     it is a good idea to typedef the template:
+ *       typedef omt<TXNID> txnid_omt_t;
+ *     If you are storing structs, you may want to be able to get a pointer to
+ *     the data actually stored in the omt (see find_zero). To do this, use
+ *     the second template parameter:
+ *       typedef omt<struct foo, struct foo *> foo_omt_t;
+ */
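+
+// Example (added for exposition; not part of the original sources): a sketch
+// of the basic OMT life cycle described above. The heaviside function and the
+// element type are hypothetical; see insert()/find_zero() below for the
+// exact contracts.
+#if 0
+static int example_heaviside(const uint64_t &stored, const uint64_t &key) {
+  // signum must be monotonically increasing over the stored order
+  return (stored < key) ? -1 : (stored > key) ? +1 : 0;
+}
+
+static void omt_sketch(void) {
+  omt<uint64_t> o;
+  o.create();
+  o.insert<uint64_t, example_heaviside>(42, 42, nullptr);  // keyed insert
+  uint64_t v;
+  uint32_t idx;
+  if (o.find_zero<uint64_t, example_heaviside>(42, &v, &idx) == 0) {
+    // found: v == 42, idx is its position
+  }
+  o.destroy();
+}
+#endif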
+
+namespace omt_internal {
+
+template <bool subtree_supports_marks>
+class subtree_templated {
+ private:
+  uint32_t m_index;
+
+ public:
+  static const uint32_t NODE_NULL = UINT32_MAX;
+  inline void set_to_null(void) { m_index = NODE_NULL; }
+
+  inline bool is_null(void) const { return NODE_NULL == this->get_index(); }
+
+  inline uint32_t get_index(void) const { return m_index; }
+
+  inline void set_index(uint32_t index) {
+    paranoid_invariant(index != NODE_NULL);
+    m_index = index;
+  }
+} __attribute__((__packed__, aligned(4)));
+
+template <>
+class subtree_templated<true> {
+ private:
+  uint32_t m_bitfield;
+  static const uint32_t MASK_INDEX = ~(((uint32_t)1) << 31);
+  static const uint32_t MASK_BIT = ((uint32_t)1) << 31;
+
+  inline void set_index_internal(uint32_t new_index) {
+    m_bitfield = (m_bitfield & MASK_BIT) | new_index;
+  }
+
+ public:
+  static const uint32_t NODE_NULL = INT32_MAX;
+  inline void set_to_null(void) { this->set_index_internal(NODE_NULL); }
+
+  inline bool is_null(void) const { return NODE_NULL == this->get_index(); }
+
+  inline uint32_t get_index(void) const {
+    TOKU_DRD_IGNORE_VAR(m_bitfield);
+    const uint32_t bits = m_bitfield;
+    TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
+    return bits & MASK_INDEX;
+  }
+
+  inline void set_index(uint32_t index) {
+    paranoid_invariant(index < NODE_NULL);
+    this->set_index_internal(index);
+  }
+
+  inline bool get_bit(void) const {
+    TOKU_DRD_IGNORE_VAR(m_bitfield);
+    const uint32_t bits = m_bitfield;
+    TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
+    return (bits & MASK_BIT) != 0;
+  }
+
+  inline void enable_bit(void) {
+    // These bits may be set by a thread with a write lock on some
+    // leaf, and the index can be read by another thread with a (read
+    // or write) lock on another leaf. Also, the has_marks_below
+    // bit can be set by two threads simultaneously. Neither of these
+    // are real races, so if we are using DRD we should tell it to
+    // ignore these bits just while we set this bit. If there were a
+    // race in setting the index, that would be a real race.
+    TOKU_DRD_IGNORE_VAR(m_bitfield);
+    m_bitfield |= MASK_BIT;
+    TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
+  }
+
+  inline void disable_bit(void) { m_bitfield &= MASK_INDEX; }
+} __attribute__((__packed__));
+
+template <typename omtdata_t, bool subtree_supports_marks>
+class omt_node_templated {
+ public:
+  omtdata_t value;
+  uint32_t weight;
+  subtree_templated<subtree_supports_marks> left;
+  subtree_templated<subtree_supports_marks> right;
+
+  // this needs to be in both implementations because we don't have
+  // a "static if" the caller can use
+  inline void clear_stolen_bits(void) {}
+};  // note: originally this class had __attribute__((__packed__, aligned(4)))
+
+template <typename omtdata_t>
+class omt_node_templated<omtdata_t, true> {
+ public:
+  omtdata_t value;
+  uint32_t weight;
+  subtree_templated<true> left;
+  subtree_templated<true> right;
+  inline bool get_marked(void) const { return left.get_bit(); }
+  inline void set_marked_bit(void) { return left.enable_bit(); }
+  inline void unset_marked_bit(void) { return left.disable_bit(); }
+
+  inline bool get_marks_below(void) const { return right.get_bit(); }
+  inline void set_marks_below_bit(void) {
+    // This function can be called by multiple threads.
+    // Checking first reduces cache invalidation.
+    if (!this->get_marks_below()) {
+      right.enable_bit();
+    }
+  }
+  inline void unset_marks_below_bit(void) { right.disable_bit(); }
+
+  inline void clear_stolen_bits(void) {
+    this->unset_marked_bit();
+    this->unset_marks_below_bit();
+  }
+};  // note: originally this class had __attribute__((__packed__, aligned(4)))
+
+}  // namespace omt_internal
+
+template <typename omtdata_t, typename omtdataout_t = omtdata_t,
+          bool supports_marks = false>
+class omt {
+ public:
+  /**
+   * Effect: Create an empty OMT.
+   * Performance: constant time.
+   */
+  void create(void);
+
+  /**
+   * Effect: Create an empty OMT with no internal allocated space.
+   * Performance: constant time.
+   * Rationale: In some cases we need a valid omt but don't want to malloc.
+   */
+  void create_no_array(void);
+
+  /**
+   * Effect: Create a OMT containing values. The number of values is in
+   * numvalues.
+   * Requires: this has not been created yet
+   * Requires: values != NULL
+   * Requires: values is sorted
+   * Performance: time=O(numvalues)
+   * Rationale: Normally to insert N values takes O(N lg N) amortized time.
+   * If the N values are known in advance, are sorted, and the structure is
+   * empty, we can batch insert them much faster.
+   */
+  __attribute__((nonnull)) void create_from_sorted_array(
+      const omtdata_t *const values, const uint32_t numvalues);
+
+  /**
+   * Effect: Create an OMT containing values. The number of values is in
+   * numvalues. On success the OMT takes ownership of *values array, and sets
+   * values=NULL.
+   * Requires: this has not been created yet
+   * Requires: values != NULL
+   * Requires: *values is sorted
+   * Requires: *values was allocated with toku_malloc
+   * Requires: Capacity of the *values array is <= new_capacity
+   * Requires: On success, *values may not be accessed again by the caller.
+   * Performance: time=O(1)
+   * Rationale: create_from_sorted_array takes O(numvalues) time.
+   * By taking ownership of the array, we save a malloc and
+   * memcpy, and possibly a free (if the caller is done with the array).
+   */
+  void create_steal_sorted_array(omtdata_t **const values,
+                                 const uint32_t numvalues,
+                                 const uint32_t new_capacity);
+
+  /**
+   * Effect: Create a new OMT, storing it in *newomt.
+   * The values to the right of index (starting at index) are moved to
+   * *newomt.
+   * Requires: newomt != NULL
+   * Returns
+   *    0 success,
+   *    EINVAL if index > toku_omt_size(omt)
+   * On nonzero return, omt and *newomt are unmodified.
+ * Performance: time=O(n) + * Rationale: We don't need a split-evenly operation. We need to split items + * so that their total sizes are even, and other similar splitting criteria. + * It's easy to split evenly by calling size(), and dividing by two. + */ + __attribute__((nonnull)) int split_at(omt *const newomt, const uint32_t idx); + + /** + * Effect: Appends leftomt and rightomt to produce a new omt. + * Creates this as the new omt. + * leftomt and rightomt are destroyed. + * Performance: time=O(n) is acceptable, but one can imagine implementations + * that are O(\log n) worst-case. + */ + __attribute__((nonnull)) void merge(omt *const leftomt, omt *const rightomt); + + /** + * Effect: Creates a copy of an omt. + * Creates this as the clone. + * Each element is copied directly. If they are pointers, the underlying + * data is not duplicated. Performance: O(n) or the running time of + * fill_array_with_subtree_values() + */ + void clone(const omt &src); + + /** + * Effect: Set the tree to be empty. + * Note: Will not reallocate or resize any memory. + * Performance: time=O(1) + */ + void clear(void); + + /** + * Effect: Destroy an OMT, freeing all its memory. + * If the values being stored are pointers, their underlying data is not + * freed. See free_items() Those values may be freed before or after calling + * toku_omt_destroy. Rationale: Returns no values since free() cannot fail. + * Rationale: Does not free the underlying pointers to reduce complexity. + * Performance: time=O(1) + */ + void destroy(void); + + /** + * Effect: return |this|. + * Performance: time=O(1) + */ + uint32_t size(void) const; + + /** + * Effect: Insert value into the OMT. + * If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST. + * Otherwise, let i be the minimum value such that $h(V_i, v)>0$. + * If no such i exists, then let i be |V| + * Then this has the same effect as + * insert_at(tree, value, i); + * If idx!=NULL then i is stored in *idx + * Requires: The signum of h must be monotonically increasing. + * Returns: + * 0 success + * DB_KEYEXIST the key is present (h was equal to zero for some value) + * On nonzero return, omt is unchanged. + * Performance: time=O(\log N) amortized. + * Rationale: Some future implementation may be O(\log N) worst-case time, but + * O(\log N) amortized is good enough for now. + */ + template + int insert(const omtdata_t &value, const omtcmp_t &v, uint32_t *const idx); + + /** + * Effect: Increases indexes of all items at slot >= idx by 1. + * Insert value into the position at idx. + * Returns: + * 0 success + * EINVAL if idx > this->size() + * On error, omt is unchanged. + * Performance: time=O(\log N) amortized time. + * Rationale: Some future implementation may be O(\log N) worst-case time, but + * O(\log N) amortized is good enough for now. + */ + int insert_at(const omtdata_t &value, const uint32_t idx); + + /** + * Effect: Replaces the item at idx with value. + * Returns: + * 0 success + * EINVAL if idx>=this->size() + * On error, omt is unchanged. + * Performance: time=O(\log N) + * Rationale: The FT needs to be able to replace a value with another copy of + * the same value (allocated in a different location) + * + */ + int set_at(const omtdata_t &value, const uint32_t idx); + + /** + * Effect: Delete the item in slot idx. + * Decreases indexes of all items at slot > idx by 1. + * Returns + * 0 success + * EINVAL if idx>=this->size() + * On error, omt is unchanged. 
+ * Rationale: To delete an item, first find its index using find or find_zero, + * then delete it. Performance: time=O(\log N) amortized. + */ + int delete_at(const uint32_t idx); + + /** + * Effect: Iterate over the values of the omt, from left to right, calling f + * on each value. The first argument passed to f is a ref-to-const of the + * value stored in the omt. The second argument passed to f is the index of + * the value. The third argument passed to f is iterate_extra. The indices run + * from 0 (inclusive) to this->size() (exclusive). Requires: f != NULL + * Returns: + * If f ever returns nonzero, then the iteration stops, and the value + * returned by f is returned by iterate. If f always returns zero, then + * iterate returns 0. Requires: Don't modify the omt while running. (E.g., f + * may not insert or delete values from the omt.) Performance: time=O(i+\log + * N) where i is the number of times f is called, and N is the number of + * elements in the omt. Rationale: Although the functional iterator requires + * defining another function (as opposed to C++ style iterator), it is much + * easier to read. Rationale: We may at some point use functors, but for now + * this is a smaller change from the old OMT. + */ + template + int iterate(iterate_extra_t *const iterate_extra) const; + + /** + * Effect: Iterate over the values of the omt, from left to right, calling f + * on each value. The first argument passed to f is a ref-to-const of the + * value stored in the omt. The second argument passed to f is the index of + * the value. The third argument passed to f is iterate_extra. The indices run + * from 0 (inclusive) to this->size() (exclusive). We will iterate only over + * [left,right) + * + * Requires: left <= right + * Requires: f != NULL + * Returns: + * EINVAL if right > this->size() + * If f ever returns nonzero, then the iteration stops, and the value + * returned by f is returned by iterate_on_range. If f always returns zero, + * then iterate_on_range returns 0. Requires: Don't modify the omt while + * running. (E.g., f may not insert or delete values from the omt.) + * Performance: time=O(i+\log N) where i is the number of times f is called, + * and N is the number of elements in the omt. Rational: Although the + * functional iterator requires defining another function (as opposed to C++ + * style iterator), it is much easier to read. + */ + template + int iterate_on_range(const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) const; + + /** + * Effect: Iterate over the values of the omt, and mark the nodes that are + * visited. Other than the marks, this behaves the same as iterate_on_range. + * Requires: supports_marks == true + * Performance: time=O(i+\log N) where i is the number of times f is called, + * and N is the number of elements in the omt. Notes: This function MAY be + * called concurrently by multiple threads, but not concurrently with any + * other non-const function. + */ + template + int iterate_and_mark_range(const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra); + + /** + * Effect: Iterate over the values of the omt, from left to right, calling f + * on each value whose node has been marked. Other than the marks, this + * behaves the same as iterate. Requires: supports_marks == true Performance: + * time=O(i+\log N) where i is the number of times f is called, and N is the + * number of elements in the omt. 
+ */ + template + int iterate_over_marked(iterate_extra_t *const iterate_extra) const; + + /** + * Effect: Delete all elements from the omt, whose nodes have been marked. + * Requires: supports_marks == true + * Performance: time=O(N + i\log N) where i is the number of marked elements, + * {c,sh}ould be faster + */ + void delete_all_marked(void); + + /** + * Effect: Verify that the internal state of the marks in the tree are + * self-consistent. Crashes the system if the marks are in a bad state. + * Requires: supports_marks == true + * Performance: time=O(N) + * Notes: + * Even though this is a const function, it requires exclusive access. + * Rationale: + * The current implementation of the marks relies on a sort of + * "cache" bit representing the state of bits below it in the tree. + * This allows glass-box testing that these bits are correct. + */ + void verify_marks_consistent(void) const; + + /** + * Effect: None + * Returns whether there are any marks in the tree. + */ + bool has_marks(void) const; + + /** + * Effect: Iterate over the values of the omt, from left to right, calling f + * on each value. The first argument passed to f is a pointer to the value + * stored in the omt. The second argument passed to f is the index of the + * value. The third argument passed to f is iterate_extra. The indices run + * from 0 (inclusive) to this->size() (exclusive). Requires: same as for + * iterate() Returns: same as for iterate() Performance: same as for iterate() + * Rationale: In general, most iterators should use iterate() since they + * should not modify the data stored in the omt. This function is for + * iterators which need to modify values (for example, free_items). Rationale: + * We assume if you are transforming the data in place, you want to do it to + * everything at once, so there is not yet an iterate_on_range_ptr (but there + * could be). + */ + template + void iterate_ptr(iterate_extra_t *const iterate_extra); + + /** + * Effect: Set *value=V_idx + * Returns + * 0 success + * EINVAL if index>=toku_omt_size(omt) + * On nonzero return, *value is unchanged + * Performance: time=O(\log N) + */ + int fetch(const uint32_t idx, omtdataout_t *const value) const; + + /** + * Effect: Find the smallest i such that h(V_i, extra)>=0 + * If there is such an i and h(V_i,extra)==0 then set *idxp=i, set *value = + * V_i, and return 0. If there is such an i and h(V_i,extra)>0 then set + * *idxp=i and return DB_NOTFOUND. If there is no such i then set + * *idx=this->size() and return DB_NOTFOUND. Note: value is of type + * omtdataout_t, which may be of type (omtdata_t) or (omtdata_t *) but is + * fixed by the instantiation. If it is the value type, then the value is + * copied out (even if the value type is a pointer to something else) If it is + * the pointer type, then *value is set to a pointer to the data within the + * omt. This is determined by the type of the omt as initially declared. If + * the omt is declared as omt, then foo_t's will be stored and foo_t's + * will be returned by find and related functions. If the omt is declared as + * omt, then foo_t's will be stored, and pointers to the + * stored items will be returned by find and related functions. Rationale: + * Structs too small for malloc should be stored directly in the omt. + * These structs may need to be edited as they exist inside the omt, so we + * need a way to get a pointer within the omt. Using separate functions for + * returning pointers and values increases code duplication and reduces + * type-checking. 
That also reduces the ability of the creator of a data + * structure to give advice to its future users. Slight overloading in this + * case seemed to provide a better API and better type checking. + */ + template + int find_zero(const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const; + + /** + * Effect: + * If direction >0 then find the smallest i such that h(V_i,extra)>0. + * If direction <0 then find the largest i such that h(V_i,extra)<0. + * (Direction may not be equal to zero.) + * If value!=NULL then store V_i in *value + * If idxp!=NULL then store i in *idxp. + * Requires: The signum of h is monotically increasing. + * Returns + * 0 success + * DB_NOTFOUND no such value is found. + * On nonzero return, *value and *idxp are unchanged + * Performance: time=O(\log N) + * Rationale: + * Here's how to use the find function to find various things + * Cases for find: + * find first value: ( h(v)=+1, direction=+1 ) + * find last value ( h(v)=-1, direction=-1 ) + * find first X ( h(v)=(v< x) ? -1 : 1 direction=+1 ) + * find last X ( h(v)=(v<=x) ? -1 : 1 direction=-1 ) + * find X or successor to X ( same as find first X. ) + * + * Rationale: To help understand heaviside functions and behavor of find: + * There are 7 kinds of heaviside functions. + * The signus of the h must be monotonically increasing. + * Given a function of the following form, A is the element + * returned for direction>0, B is the element returned + * for direction<0, C is the element returned for + * direction==0 (see find_zero) (with a return of 0), and D is the element + * returned for direction==0 (see find_zero) with a return of DB_NOTFOUND. + * If any of A, B, or C are not found, then asking for the + * associated direction will return DB_NOTFOUND. + * See find_zero for more information. + * + * Let the following represent the signus of the heaviside function. + * + * -...- + * A + * D + * + * +...+ + * B + * D + * + * 0...0 + * C + * + * -...-0...0 + * AC + * + * 0...0+...+ + * C B + * + * -...-+...+ + * AB + * D + * + * -...-0...0+...+ + * AC B + */ + template + int find(const omtcmp_t &extra, int direction, omtdataout_t *const value, + uint32_t *const idxp) const; + + /** + * Effect: Return the size (in bytes) of the omt, as it resides in main + * memory. If the data stored are pointers, don't include the size of what + * they all point to. 
+ */ + size_t memory_size(void); + + private: + typedef uint32_t node_idx; + typedef omt_internal::subtree_templated subtree; + typedef omt_internal::omt_node_templated omt_node; + ENSURE_POD(subtree); + + struct omt_array { + uint32_t start_idx; + uint32_t num_values; + omtdata_t *values; + }; + + struct omt_tree { + subtree root; + uint32_t free_idx; + omt_node *nodes; + }; + + bool is_array; + uint32_t capacity; + union { + struct omt_array a; + struct omt_tree t; + } d; + + __attribute__((nonnull)) void unmark(const subtree &subtree, + const uint32_t index, + GrowableArray *const indexes); + + void create_internal_no_array(const uint32_t new_capacity); + + void create_internal(const uint32_t new_capacity); + + uint32_t nweight(const subtree &subtree) const; + + node_idx node_malloc(void); + + void node_free(const node_idx idx); + + void maybe_resize_array(const uint32_t n); + + __attribute__((nonnull)) void fill_array_with_subtree_values( + omtdata_t *const array, const subtree &subtree) const; + + void convert_to_array(void); + + __attribute__((nonnull)) void rebuild_from_sorted_array( + subtree *const subtree, const omtdata_t *const values, + const uint32_t numvalues); + + void convert_to_tree(void); + + void maybe_resize_or_convert(const uint32_t n); + + bool will_need_rebalance(const subtree &subtree, const int leftmod, + const int rightmod) const; + + __attribute__((nonnull)) void insert_internal( + subtree *const subtreep, const omtdata_t &value, const uint32_t idx, + subtree **const rebalance_subtree); + + void set_at_internal_array(const omtdata_t &value, const uint32_t idx); + + void set_at_internal(const subtree &subtree, const omtdata_t &value, + const uint32_t idx); + + void delete_internal(subtree *const subtreep, const uint32_t idx, + omt_node *const copyn, + subtree **const rebalance_subtree); + + template + int iterate_internal_array(const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) const; + + template + void iterate_ptr_internal(const uint32_t left, const uint32_t right, + const subtree &subtree, const uint32_t idx, + iterate_extra_t *const iterate_extra); + + template + void iterate_ptr_internal_array(const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra); + + template + int iterate_internal(const uint32_t left, const uint32_t right, + const subtree &subtree, const uint32_t idx, + iterate_extra_t *const iterate_extra) const; + + template + int iterate_and_mark_range_internal(const uint32_t left, const uint32_t right, + const subtree &subtree, + const uint32_t idx, + iterate_extra_t *const iterate_extra); + + template + int iterate_over_marked_internal(const subtree &subtree, const uint32_t idx, + iterate_extra_t *const iterate_extra) const; + + uint32_t verify_marks_consistent_internal(const subtree &subtree, + const bool allow_marks) const; + + void fetch_internal_array(const uint32_t i, omtdataout_t *const value) const; + + void fetch_internal(const subtree &subtree, const uint32_t i, + omtdataout_t *const value) const; + + __attribute__((nonnull)) void fill_array_with_subtree_idxs( + node_idx *const array, const subtree &subtree) const; + + __attribute__((nonnull)) void rebuild_subtree_from_idxs( + subtree *const subtree, const node_idx *const idxs, + const uint32_t numvalues); + + __attribute__((nonnull)) void rebalance(subtree *const subtree); + + __attribute__((nonnull)) static void copyout(omtdata_t *const out, + const omt_node *const n); + + __attribute__((nonnull)) static void copyout(omtdata_t 
**const out, + omt_node *const n); + + __attribute__((nonnull)) static void copyout( + omtdata_t *const out, const omtdata_t *const stored_value_ptr); + + __attribute__((nonnull)) static void copyout( + omtdata_t **const out, omtdata_t *const stored_value_ptr); + + template + int find_internal_zero_array(const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const; + + template + int find_internal_zero(const subtree &subtree, const omtcmp_t &extra, + omtdataout_t *const value, uint32_t *const idxp) const; + + template + int find_internal_plus_array(const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const; + + template + int find_internal_plus(const subtree &subtree, const omtcmp_t &extra, + omtdataout_t *const value, uint32_t *const idxp) const; + + template + int find_internal_minus_array(const omtcmp_t &extra, + omtdataout_t *const value, + uint32_t *const idxp) const; + + template + int find_internal_minus(const subtree &subtree, const omtcmp_t &extra, + omtdataout_t *const value, + uint32_t *const idxp) const; +}; + +} // namespace toku + +// include the implementation here +#include "omt_impl.h" diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h new file mode 100644 index 0000000000..e779867165 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h @@ -0,0 +1,1295 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include + +#include "../db.h" +#include "../portability/memory.h" + +namespace toku { + +template +void omt::create(void) { + this->create_internal(2); + if (supports_marks) { + this->convert_to_tree(); + } +} + +template +void omt::create_no_array(void) { + if (!supports_marks) { + this->create_internal_no_array(0); + } else { + this->is_array = false; + this->capacity = 0; + this->d.t.nodes = nullptr; + this->d.t.root.set_to_null(); + this->d.t.free_idx = 0; + } +} + +template +void omt::create_from_sorted_array( + const omtdata_t *const values, const uint32_t numvalues) { + this->create_internal(numvalues); + memcpy(this->d.a.values, values, numvalues * (sizeof values[0])); + this->d.a.num_values = numvalues; + if (supports_marks) { + this->convert_to_tree(); + } +} + +template +void omt::create_steal_sorted_array( + omtdata_t **const values, const uint32_t numvalues, + const uint32_t new_capacity) { + paranoid_invariant_notnull(values); + this->create_internal_no_array(new_capacity); + this->d.a.num_values = numvalues; + this->d.a.values = *values; + *values = nullptr; + if (supports_marks) { + this->convert_to_tree(); + } +} + +template +int omt::split_at(omt *const newomt, + const uint32_t idx) { + barf_if_marked(*this); + paranoid_invariant_notnull(newomt); + if (idx > this->size()) { + return EINVAL; + } + this->convert_to_array(); + const uint32_t newsize = this->size() - idx; + newomt->create_from_sorted_array(&this->d.a.values[this->d.a.start_idx + idx], + newsize); + this->d.a.num_values = idx; + this->maybe_resize_array(idx); + if (supports_marks) { + this->convert_to_tree(); + } + return 0; +} + +template +void omt::merge(omt *const leftomt, + omt *const rightomt) { + barf_if_marked(*this); + paranoid_invariant_notnull(leftomt); + paranoid_invariant_notnull(rightomt); + const uint32_t leftsize = leftomt->size(); + const uint32_t rightsize = rightomt->size(); + const uint32_t newsize = leftsize + rightsize; + + if (leftomt->is_array) { + if (leftomt->capacity - + (leftomt->d.a.start_idx + leftomt->d.a.num_values) >= + rightsize) { + this->create_steal_sorted_array( + &leftomt->d.a.values, leftomt->d.a.num_values, leftomt->capacity); + this->d.a.start_idx = leftomt->d.a.start_idx; + } else { + this->create_internal(newsize); + memcpy(&this->d.a.values[0], &leftomt->d.a.values[leftomt->d.a.start_idx], + leftomt->d.a.num_values * (sizeof this->d.a.values[0])); + } + } else { + this->create_internal(newsize); + leftomt->fill_array_with_subtree_values(&this->d.a.values[0], + leftomt->d.t.root); + } + leftomt->destroy(); + this->d.a.num_values = leftsize; + + if (rightomt->is_array) { + memcpy(&this->d.a.values[this->d.a.start_idx + this->d.a.num_values], + &rightomt->d.a.values[rightomt->d.a.start_idx], + rightomt->d.a.num_values * (sizeof this->d.a.values[0])); + } else { + rightomt->fill_array_with_subtree_values( + &this->d.a.values[this->d.a.start_idx + this->d.a.num_values], + rightomt->d.t.root); + } + rightomt->destroy(); + this->d.a.num_values += rightsize; + paranoid_invariant(this->size() == newsize); + if (supports_marks) { + this->convert_to_tree(); + } +} + +template +void omt::clone(const omt &src) { + barf_if_marked(*this); + this->create_internal(src.size()); + if (src.is_array) { + memcpy(&this->d.a.values[0], &src.d.a.values[src.d.a.start_idx], + 
src.d.a.num_values * (sizeof this->d.a.values[0])); + } else { + src.fill_array_with_subtree_values(&this->d.a.values[0], src.d.t.root); + } + this->d.a.num_values = src.size(); + if (supports_marks) { + this->convert_to_tree(); + } +} + +template +void omt::clear(void) { + if (this->is_array) { + this->d.a.start_idx = 0; + this->d.a.num_values = 0; + } else { + this->d.t.root.set_to_null(); + this->d.t.free_idx = 0; + } +} + +template +void omt::destroy(void) { + this->clear(); + this->capacity = 0; + if (this->is_array) { + if (this->d.a.values != nullptr) { + toku_free(this->d.a.values); + } + this->d.a.values = nullptr; + } else { + if (this->d.t.nodes != nullptr) { + toku_free(this->d.t.nodes); + } + this->d.t.nodes = nullptr; + } +} + +template +uint32_t omt::size(void) const { + if (this->is_array) { + return this->d.a.num_values; + } else { + return this->nweight(this->d.t.root); + } +} + +template +template +int omt::insert(const omtdata_t &value, + const omtcmp_t &v, + uint32_t *const idx) { + int r; + uint32_t insert_idx; + + r = this->find_zero(v, nullptr, &insert_idx); + if (r == 0) { + if (idx) *idx = insert_idx; + return DB_KEYEXIST; + } + if (r != DB_NOTFOUND) return r; + + if ((r = this->insert_at(value, insert_idx))) return r; + if (idx) *idx = insert_idx; + + return 0; +} + +// The following 3 functions implement a static if for us. +template +static void barf_if_marked(const omt &UU(omt)) { +} + +template +static void barf_if_marked(const omt &omt) { + invariant(!omt.has_marks()); +} + +template +bool omt::has_marks(void) const { + static_assert(supports_marks, "Does not support marks"); + if (this->d.t.root.is_null()) { + return false; + } + const omt_node &node = this->d.t.nodes[this->d.t.root.get_index()]; + return node.get_marks_below() || node.get_marked(); +} + +template +int omt::insert_at( + const omtdata_t &value, const uint32_t idx) { + barf_if_marked(*this); + if (idx > this->size()) { + return EINVAL; + } + + this->maybe_resize_or_convert(this->size() + 1); + if (this->is_array && idx != this->d.a.num_values && + (idx != 0 || this->d.a.start_idx == 0)) { + this->convert_to_tree(); + } + if (this->is_array) { + if (idx == this->d.a.num_values) { + this->d.a.values[this->d.a.start_idx + this->d.a.num_values] = value; + } else { + this->d.a.values[--this->d.a.start_idx] = value; + } + this->d.a.num_values++; + } else { + subtree *rebalance_subtree = nullptr; + this->insert_internal(&this->d.t.root, value, idx, &rebalance_subtree); + if (rebalance_subtree != nullptr) { + this->rebalance(rebalance_subtree); + } + } + return 0; +} + +template +int omt::set_at(const omtdata_t &value, + const uint32_t idx) { + barf_if_marked(*this); + if (idx >= this->size()) { + return EINVAL; + } + + if (this->is_array) { + this->set_at_internal_array(value, idx); + } else { + this->set_at_internal(this->d.t.root, value, idx); + } + return 0; +} + +template +int omt::delete_at( + const uint32_t idx) { + barf_if_marked(*this); + if (idx >= this->size()) { + return EINVAL; + } + + this->maybe_resize_or_convert(this->size() - 1); + if (this->is_array && idx != 0 && idx != this->d.a.num_values - 1) { + this->convert_to_tree(); + } + if (this->is_array) { + // Testing for 0 does not rule out it being the last entry. 
+ // Test explicitly for num_values-1 + if (idx != this->d.a.num_values - 1) { + this->d.a.start_idx++; + } + this->d.a.num_values--; + } else { + subtree *rebalance_subtree = nullptr; + this->delete_internal(&this->d.t.root, idx, nullptr, &rebalance_subtree); + if (rebalance_subtree != nullptr) { + this->rebalance(rebalance_subtree); + } + } + return 0; +} + +template +template +int omt::iterate( + iterate_extra_t *const iterate_extra) const { + return this->iterate_on_range(0, this->size(), + iterate_extra); +} + +template +template +int omt::iterate_on_range( + const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) const { + if (right > this->size()) { + return EINVAL; + } + if (left == right) { + return 0; + } + if (this->is_array) { + return this->iterate_internal_array(left, right, + iterate_extra); + } + return this->iterate_internal(left, right, this->d.t.root, + 0, iterate_extra); +} + +template +template +int omt::iterate_and_mark_range( + const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) { + static_assert(supports_marks, "does not support marks"); + if (right > this->size()) { + return EINVAL; + } + if (left == right) { + return 0; + } + paranoid_invariant(!this->is_array); + return this->iterate_and_mark_range_internal( + left, right, this->d.t.root, 0, iterate_extra); +} + +// TODO: We can optimize this if we steal 3 bits. 1 bit: this node is +// marked. 1 bit: left subtree has marks. 1 bit: right subtree has marks. +template +template +int omt::iterate_over_marked( + iterate_extra_t *const iterate_extra) const { + static_assert(supports_marks, "does not support marks"); + paranoid_invariant(!this->is_array); + return this->iterate_over_marked_internal( + this->d.t.root, 0, iterate_extra); +} + +template +void omt::unmark( + const subtree &st, const uint32_t index, + GrowableArray *const indexes) { + if (st.is_null()) { + return; + } + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t index_root = index + this->nweight(n.left); + + const bool below = n.get_marks_below(); + if (below) { + this->unmark(n.left, index, indexes); + } + if (n.get_marked()) { + indexes->push(index_root); + } + n.clear_stolen_bits(); + if (below) { + this->unmark(n.right, index_root + 1, indexes); + } +} + +template +void omt::delete_all_marked(void) { + static_assert(supports_marks, "does not support marks"); + if (!this->has_marks()) { + return; + } + paranoid_invariant(!this->is_array); + GrowableArray marked_indexes; + marked_indexes.init(); + + // Remove all marks. + // We need to delete all the stolen bits before calling delete_at to + // prevent barfing. + this->unmark(this->d.t.root, 0, &marked_indexes); + + for (uint32_t i = 0; i < marked_indexes.get_size(); i++) { + // Delete from left to right, shift by number already deleted. + // Alternative is delete from right to left. 
+ int r = this->delete_at(marked_indexes.fetch_unchecked(i) - i); + lazy_assert_zero(r); + } + marked_indexes.deinit(); + barf_if_marked(*this); +} + +template +uint32_t +omt::verify_marks_consistent_internal( + const subtree &st, const bool UU(allow_marks)) const { + if (st.is_null()) { + return 0; + } + const omt_node &node = this->d.t.nodes[st.get_index()]; + uint32_t num_marks = + verify_marks_consistent_internal(node.left, node.get_marks_below()); + num_marks += + verify_marks_consistent_internal(node.right, node.get_marks_below()); + if (node.get_marks_below()) { + paranoid_invariant(allow_marks); + paranoid_invariant(num_marks > 0); + } else { + // redundant with invariant below, but nice to have explicitly + paranoid_invariant(num_marks == 0); + } + if (node.get_marked()) { + paranoid_invariant(allow_marks); + ++num_marks; + } + return num_marks; +} + +template +void omt::verify_marks_consistent( + void) const { + static_assert(supports_marks, "does not support marks"); + paranoid_invariant(!this->is_array); + this->verify_marks_consistent_internal(this->d.t.root, true); +} + +template +template +void omt::iterate_ptr( + iterate_extra_t *const iterate_extra) { + if (this->is_array) { + this->iterate_ptr_internal_array(0, this->size(), + iterate_extra); + } else { + this->iterate_ptr_internal( + 0, this->size(), this->d.t.root, 0, iterate_extra); + } +} + +template +int omt::fetch( + const uint32_t idx, omtdataout_t *const value) const { + if (idx >= this->size()) { + return EINVAL; + } + if (this->is_array) { + this->fetch_internal_array(idx, value); + } else { + this->fetch_internal(this->d.t.root, idx, value); + } + return 0; +} + +template +template +int omt::find_zero( + const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + uint32_t tmp_index; + uint32_t *const child_idxp = (idxp != nullptr) ? idxp : &tmp_index; + int r; + if (this->is_array) { + r = this->find_internal_zero_array(extra, value, child_idxp); + } else { + r = this->find_internal_zero(this->d.t.root, extra, value, + child_idxp); + } + return r; +} + +template +template +int omt::find( + const omtcmp_t &extra, int direction, omtdataout_t *const value, + uint32_t *const idxp) const { + uint32_t tmp_index; + uint32_t *const child_idxp = (idxp != nullptr) ? 
idxp : &tmp_index; + paranoid_invariant(direction != 0); + if (direction < 0) { + if (this->is_array) { + return this->find_internal_minus_array(extra, value, + child_idxp); + } else { + return this->find_internal_minus(this->d.t.root, extra, + value, child_idxp); + } + } else { + if (this->is_array) { + return this->find_internal_plus_array(extra, value, + child_idxp); + } else { + return this->find_internal_plus(this->d.t.root, extra, value, + child_idxp); + } + } +} + +template +size_t omt::memory_size(void) { + if (this->is_array) { + return (sizeof *this) + this->capacity * (sizeof this->d.a.values[0]); + } + return (sizeof *this) + this->capacity * (sizeof this->d.t.nodes[0]); +} + +template +void omt::create_internal_no_array( + const uint32_t new_capacity) { + this->is_array = true; + this->d.a.start_idx = 0; + this->d.a.num_values = 0; + this->d.a.values = nullptr; + this->capacity = new_capacity; +} + +template +void omt::create_internal( + const uint32_t new_capacity) { + this->create_internal_no_array(new_capacity); + XMALLOC_N(this->capacity, this->d.a.values); +} + +template +uint32_t omt::nweight( + const subtree &st) const { + if (st.is_null()) { + return 0; + } else { + return this->d.t.nodes[st.get_index()].weight; + } +} + +template +typename omt::node_idx +omt::node_malloc(void) { + paranoid_invariant(this->d.t.free_idx < this->capacity); + omt_node &n = this->d.t.nodes[this->d.t.free_idx]; + n.clear_stolen_bits(); + return this->d.t.free_idx++; +} + +template +void omt::node_free( + const node_idx UU(idx)) { + paranoid_invariant(idx < this->capacity); +} + +template +void omt::maybe_resize_array( + const uint32_t n) { + const uint32_t new_size = n <= 2 ? 4 : 2 * n; + const uint32_t room = this->capacity - this->d.a.start_idx; + + if (room < n || this->capacity / 2 >= new_size) { + omtdata_t *XMALLOC_N(new_size, tmp_values); + if (this->d.a.num_values) { + memcpy(tmp_values, &this->d.a.values[this->d.a.start_idx], + this->d.a.num_values * (sizeof tmp_values[0])); + } + this->d.a.start_idx = 0; + this->capacity = new_size; + toku_free(this->d.a.values); + this->d.a.values = tmp_values; + } +} + +template +void omt::fill_array_with_subtree_values(omtdata_t *const array, + const subtree &st) + const { + if (st.is_null()) return; + const omt_node &tree = this->d.t.nodes[st.get_index()]; + this->fill_array_with_subtree_values(&array[0], tree.left); + array[this->nweight(tree.left)] = tree.value; + this->fill_array_with_subtree_values(&array[this->nweight(tree.left) + 1], + tree.right); +} + +template +void omt::convert_to_array(void) { + if (!this->is_array) { + const uint32_t num_values = this->size(); + uint32_t new_size = 2 * num_values; + new_size = new_size < 4 ? 
4 : new_size; + + omtdata_t *XMALLOC_N(new_size, tmp_values); + this->fill_array_with_subtree_values(tmp_values, this->d.t.root); + toku_free(this->d.t.nodes); + this->is_array = true; + this->capacity = new_size; + this->d.a.num_values = num_values; + this->d.a.values = tmp_values; + this->d.a.start_idx = 0; + } +} + +template +void omt::rebuild_from_sorted_array( + subtree *const st, const omtdata_t *const values, + const uint32_t numvalues) { + if (numvalues == 0) { + st->set_to_null(); + } else { + const uint32_t halfway = numvalues / 2; + const node_idx newidx = this->node_malloc(); + omt_node *const newnode = &this->d.t.nodes[newidx]; + newnode->weight = numvalues; + newnode->value = values[halfway]; + st->set_index(newidx); + // update everything before the recursive calls so the second call + // can be a tail call. + this->rebuild_from_sorted_array(&newnode->left, &values[0], halfway); + this->rebuild_from_sorted_array(&newnode->right, &values[halfway + 1], + numvalues - (halfway + 1)); + } +} + +template +void omt::convert_to_tree(void) { + if (this->is_array) { + const uint32_t num_nodes = this->size(); + uint32_t new_size = num_nodes * 2; + new_size = new_size < 4 ? 4 : new_size; + + omt_node *XMALLOC_N(new_size, new_nodes); + omtdata_t *const values = this->d.a.values; + omtdata_t *const tmp_values = &values[this->d.a.start_idx]; + this->is_array = false; + this->d.t.nodes = new_nodes; + this->capacity = new_size; + this->d.t.free_idx = 0; + this->d.t.root.set_to_null(); + this->rebuild_from_sorted_array(&this->d.t.root, tmp_values, num_nodes); + toku_free(values); + } +} + +template +void omt::maybe_resize_or_convert( + const uint32_t n) { + if (this->is_array) { + this->maybe_resize_array(n); + } else { + const uint32_t new_size = n <= 2 ? 4 : 2 * n; + const uint32_t num_nodes = this->nweight(this->d.t.root); + if ((this->capacity / 2 >= new_size) || + (this->d.t.free_idx >= this->capacity && num_nodes < n) || + (this->capacity < n)) { + this->convert_to_array(); + // if we had a free list, the "supports_marks" version could + // just resize, as it is now, we have to convert to and back + // from an array. + if (supports_marks) { + this->convert_to_tree(); + } + } + } +} + +template +bool omt::will_need_rebalance( + const subtree &st, const int leftmod, const int rightmod) const { + if (st.is_null()) { + return false; + } + const omt_node &n = this->d.t.nodes[st.get_index()]; + // one of the 1's is for the root. 
+ // the other is to take ceil(n/2) + const uint32_t weight_left = this->nweight(n.left) + leftmod; + const uint32_t weight_right = this->nweight(n.right) + rightmod; + return ((1 + weight_left < (1 + 1 + weight_right) / 2) || + (1 + weight_right < (1 + 1 + weight_left) / 2)); +} + +template +void omt::insert_internal( + subtree *const subtreep, const omtdata_t &value, const uint32_t idx, + subtree **const rebalance_subtree) { + if (subtreep->is_null()) { + paranoid_invariant_zero(idx); + const node_idx newidx = this->node_malloc(); + omt_node *const newnode = &this->d.t.nodes[newidx]; + newnode->weight = 1; + newnode->left.set_to_null(); + newnode->right.set_to_null(); + newnode->value = value; + subtreep->set_index(newidx); + } else { + omt_node &n = this->d.t.nodes[subtreep->get_index()]; + n.weight++; + if (idx <= this->nweight(n.left)) { + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, 1, 0)) { + *rebalance_subtree = subtreep; + } + this->insert_internal(&n.left, value, idx, rebalance_subtree); + } else { + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, 0, 1)) { + *rebalance_subtree = subtreep; + } + const uint32_t sub_index = idx - this->nweight(n.left) - 1; + this->insert_internal(&n.right, value, sub_index, rebalance_subtree); + } + } +} + +template +void omt::set_at_internal_array( + const omtdata_t &value, const uint32_t idx) { + this->d.a.values[this->d.a.start_idx + idx] = value; +} + +template +void omt::set_at_internal( + const subtree &st, const omtdata_t &value, const uint32_t idx) { + paranoid_invariant(!st.is_null()); + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t leftweight = this->nweight(n.left); + if (idx < leftweight) { + this->set_at_internal(n.left, value, idx); + } else if (idx == leftweight) { + n.value = value; + } else { + this->set_at_internal(n.right, value, idx - leftweight - 1); + } +} + +template +void omt::delete_internal( + subtree *const subtreep, const uint32_t idx, omt_node *const copyn, + subtree **const rebalance_subtree) { + paranoid_invariant_notnull(subtreep); + paranoid_invariant_notnull(rebalance_subtree); + paranoid_invariant(!subtreep->is_null()); + omt_node &n = this->d.t.nodes[subtreep->get_index()]; + const uint32_t leftweight = this->nweight(n.left); + if (idx < leftweight) { + n.weight--; + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, -1, 0)) { + *rebalance_subtree = subtreep; + } + this->delete_internal(&n.left, idx, copyn, rebalance_subtree); + } else if (idx == leftweight) { + if (n.left.is_null()) { + const uint32_t oldidx = subtreep->get_index(); + *subtreep = n.right; + if (copyn != nullptr) { + copyn->value = n.value; + } + this->node_free(oldidx); + } else if (n.right.is_null()) { + const uint32_t oldidx = subtreep->get_index(); + *subtreep = n.left; + if (copyn != nullptr) { + copyn->value = n.value; + } + this->node_free(oldidx); + } else { + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, 0, -1)) { + *rebalance_subtree = subtreep; + } + // don't need to copy up value, it's only used by this + // next call, and when that gets to the bottom there + // won't be any more recursion + n.weight--; + this->delete_internal(&n.right, 0, &n, rebalance_subtree); + } + } else { + n.weight--; + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, 0, -1)) { + *rebalance_subtree = subtreep; + } + this->delete_internal(&n.right, idx - leftweight - 1, copyn, + rebalance_subtree); + } +} + 
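+// Editor's sketch (hypothetical helper, kept under #if 0 so it is not part
+// of the vendored build): a standalone rendering of the weight-balance
+// predicate used by omt::will_need_rebalance() above.  A subtree needs
+// rebalancing when one side, after applying the pending +/-1 modification,
+// holds less than roughly half the weight of the other; the extra 1s account
+// for the root node and make the halving round up.
+#if 0
+#include <stdint.h>
+
+static bool sketch_will_need_rebalance(uint32_t weight_left, int leftmod,
+                                       uint32_t weight_right, int rightmod) {
+    const uint32_t wl = weight_left + leftmod;
+    const uint32_t wr = weight_right + rightmod;
+    // E.g. wl == 1, wr == 6: 1 + 1 < (1 + 1 + 6) / 2 == 4, so rebalance.
+    return (1 + wl < (1 + 1 + wr) / 2) ||
+           (1 + wr < (1 + 1 + wl) / 2);
+}
+#endif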
+template +template +int omt::iterate_internal_array( + const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) const { + int r; + for (uint32_t i = left; i < right; ++i) { + r = f(this->d.a.values[this->d.a.start_idx + i], i, iterate_extra); + if (r != 0) { + return r; + } + } + return 0; +} + +template +template +void omt::iterate_ptr_internal( + const uint32_t left, const uint32_t right, const subtree &st, + const uint32_t idx, iterate_extra_t *const iterate_extra) { + if (!st.is_null()) { + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t idx_root = idx + this->nweight(n.left); + if (left < idx_root) { + this->iterate_ptr_internal(left, right, n.left, idx, + iterate_extra); + } + if (left <= idx_root && idx_root < right) { + int r = f(&n.value, idx_root, iterate_extra); + lazy_assert_zero(r); + } + if (idx_root + 1 < right) { + this->iterate_ptr_internal( + left, right, n.right, idx_root + 1, iterate_extra); + } + } +} + +template +template +void omt::iterate_ptr_internal_array( + const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) { + for (uint32_t i = left; i < right; ++i) { + int r = f(&this->d.a.values[this->d.a.start_idx + i], i, iterate_extra); + lazy_assert_zero(r); + } +} + +template +template +int omt::iterate_internal( + const uint32_t left, const uint32_t right, const subtree &st, + const uint32_t idx, iterate_extra_t *const iterate_extra) const { + if (st.is_null()) { + return 0; + } + int r; + const omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t idx_root = idx + this->nweight(n.left); + if (left < idx_root) { + r = this->iterate_internal(left, right, n.left, idx, + iterate_extra); + if (r != 0) { + return r; + } + } + if (left <= idx_root && idx_root < right) { + r = f(n.value, idx_root, iterate_extra); + if (r != 0) { + return r; + } + } + if (idx_root + 1 < right) { + return this->iterate_internal( + left, right, n.right, idx_root + 1, iterate_extra); + } + return 0; +} + +template +template +int omt:: + iterate_and_mark_range_internal(const uint32_t left, const uint32_t right, + const subtree &st, const uint32_t idx, + iterate_extra_t *const iterate_extra) { + paranoid_invariant(!st.is_null()); + int r; + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t idx_root = idx + this->nweight(n.left); + if (left < idx_root && !n.left.is_null()) { + n.set_marks_below_bit(); + r = this->iterate_and_mark_range_internal( + left, right, n.left, idx, iterate_extra); + if (r != 0) { + return r; + } + } + if (left <= idx_root && idx_root < right) { + n.set_marked_bit(); + r = f(n.value, idx_root, iterate_extra); + if (r != 0) { + return r; + } + } + if (idx_root + 1 < right && !n.right.is_null()) { + n.set_marks_below_bit(); + return this->iterate_and_mark_range_internal( + left, right, n.right, idx_root + 1, iterate_extra); + } + return 0; +} + +template +template +int omt::iterate_over_marked_internal( + const subtree &st, const uint32_t idx, + iterate_extra_t *const iterate_extra) const { + if (st.is_null()) { + return 0; + } + int r; + const omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t idx_root = idx + this->nweight(n.left); + if (n.get_marks_below()) { + r = this->iterate_over_marked_internal(n.left, idx, + iterate_extra); + if (r != 0) { + return r; + } + } + if (n.get_marked()) { + r = f(n.value, idx_root, iterate_extra); + if (r != 0) { + return r; + } + } + if (n.get_marks_below()) { + return this->iterate_over_marked_internal( + n.right, idx_root + 1, 
iterate_extra); + } + return 0; +} + +template +void omt::fetch_internal_array( + const uint32_t i, omtdataout_t *const value) const { + if (value != nullptr) { + copyout(value, &this->d.a.values[this->d.a.start_idx + i]); + } +} + +template +void omt::fetch_internal( + const subtree &st, const uint32_t i, omtdataout_t *const value) const { + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t leftweight = this->nweight(n.left); + if (i < leftweight) { + this->fetch_internal(n.left, i, value); + } else if (i == leftweight) { + if (value != nullptr) { + copyout(value, &n); + } + } else { + this->fetch_internal(n.right, i - leftweight - 1, value); + } +} + +template +void omt::fill_array_with_subtree_idxs( + node_idx *const array, const subtree &st) const { + if (!st.is_null()) { + const omt_node &tree = this->d.t.nodes[st.get_index()]; + this->fill_array_with_subtree_idxs(&array[0], tree.left); + array[this->nweight(tree.left)] = st.get_index(); + this->fill_array_with_subtree_idxs(&array[this->nweight(tree.left) + 1], + tree.right); + } +} + +template +void omt::rebuild_subtree_from_idxs( + subtree *const st, const node_idx *const idxs, const uint32_t numvalues) { + if (numvalues == 0) { + st->set_to_null(); + } else { + uint32_t halfway = numvalues / 2; + st->set_index(idxs[halfway]); + // node_idx newidx = idxs[halfway]; + omt_node &newnode = this->d.t.nodes[st->get_index()]; + newnode.weight = numvalues; + // value is already in there. + this->rebuild_subtree_from_idxs(&newnode.left, &idxs[0], halfway); + this->rebuild_subtree_from_idxs(&newnode.right, &idxs[halfway + 1], + numvalues - (halfway + 1)); + // n_idx = newidx; + } +} + +template +void omt::rebalance( + subtree *const st) { + node_idx idx = st->get_index(); + if (idx == this->d.t.root.get_index()) { + // Try to convert to an array. + // If this fails, (malloc) nothing will have changed. + // In the failure case we continue on to the standard rebalance + // algorithm. + this->convert_to_array(); + if (supports_marks) { + this->convert_to_tree(); + } + } else { + const omt_node &n = this->d.t.nodes[idx]; + node_idx *tmp_array; + size_t mem_needed = n.weight * (sizeof tmp_array[0]); + size_t mem_free = + (this->capacity - this->d.t.free_idx) * (sizeof this->d.t.nodes[0]); + bool malloced; + if (mem_needed <= mem_free) { + // There is sufficient free space at the end of the nodes array + // to hold enough node indexes to rebalance. 
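+            // Editor's note (illustrative, not from the original authors):
+            // borrowing the tail of the nodes array as scratch space is safe
+            // because entries at or beyond free_idx have never been handed
+            // out by node_malloc(), and rebuild_subtree_from_idxs() only
+            // rewrites nodes whose indexes are below free_idx, so the
+            // scratch region is never clobbered while still in use.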
+ malloced = false; + tmp_array = + reinterpret_cast(&this->d.t.nodes[this->d.t.free_idx]); + } else { + malloced = true; + XMALLOC_N(n.weight, tmp_array); + } + this->fill_array_with_subtree_idxs(tmp_array, *st); + this->rebuild_subtree_from_idxs(st, tmp_array, n.weight); + if (malloced) toku_free(tmp_array); + } +} + +template +void omt::copyout( + omtdata_t *const out, const omt_node *const n) { + *out = n->value; +} + +template +void omt::copyout( + omtdata_t **const out, omt_node *const n) { + *out = &n->value; +} + +template +void omt::copyout( + omtdata_t *const out, const omtdata_t *const stored_value_ptr) { + *out = *stored_value_ptr; +} + +template +void omt::copyout( + omtdata_t **const out, omtdata_t *const stored_value_ptr) { + *out = stored_value_ptr; +} + +template +template +int omt::find_internal_zero_array( + const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + uint32_t min = this->d.a.start_idx; + uint32_t limit = this->d.a.start_idx + this->d.a.num_values; + uint32_t best_pos = subtree::NODE_NULL; + uint32_t best_zero = subtree::NODE_NULL; + + while (min != limit) { + uint32_t mid = (min + limit) / 2; + int hv = h(this->d.a.values[mid], extra); + if (hv < 0) { + min = mid + 1; + } else if (hv > 0) { + best_pos = mid; + limit = mid; + } else { + best_zero = mid; + limit = mid; + } + } + if (best_zero != subtree::NODE_NULL) { + // Found a zero + if (value != nullptr) { + copyout(value, &this->d.a.values[best_zero]); + } + *idxp = best_zero - this->d.a.start_idx; + return 0; + } + if (best_pos != subtree::NODE_NULL) + *idxp = best_pos - this->d.a.start_idx; + else + *idxp = this->d.a.num_values; + return DB_NOTFOUND; +} + +template +template +int omt::find_internal_zero( + const subtree &st, const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + if (st.is_null()) { + *idxp = 0; + return DB_NOTFOUND; + } + omt_node &n = this->d.t.nodes[st.get_index()]; + int hv = h(n.value, extra); + if (hv < 0) { + int r = this->find_internal_zero(n.right, extra, value, idxp); + *idxp += this->nweight(n.left) + 1; + return r; + } else if (hv > 0) { + return this->find_internal_zero(n.left, extra, value, idxp); + } else { + int r = this->find_internal_zero(n.left, extra, value, idxp); + if (r == DB_NOTFOUND) { + *idxp = this->nweight(n.left); + if (value != nullptr) { + copyout(value, &n); + } + r = 0; + } + return r; + } +} + +template +template +int omt::find_internal_plus_array( + const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + uint32_t min = this->d.a.start_idx; + uint32_t limit = this->d.a.start_idx + this->d.a.num_values; + uint32_t best = subtree::NODE_NULL; + + while (min != limit) { + const uint32_t mid = (min + limit) / 2; + const int hv = h(this->d.a.values[mid], extra); + if (hv > 0) { + best = mid; + limit = mid; + } else { + min = mid + 1; + } + } + if (best == subtree::NODE_NULL) { + return DB_NOTFOUND; + } + if (value != nullptr) { + copyout(value, &this->d.a.values[best]); + } + *idxp = best - this->d.a.start_idx; + return 0; +} + +template +template +int omt::find_internal_plus( + const subtree &st, const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + if (st.is_null()) { + return DB_NOTFOUND; + } + omt_node *const n = &this->d.t.nodes[st.get_index()]; + int hv = h(n->value, extra); + int r; + if (hv > 0) { 
+ r = this->find_internal_plus(n->left, extra, value, idxp); + if (r == DB_NOTFOUND) { + *idxp = this->nweight(n->left); + if (value != nullptr) { + copyout(value, n); + } + r = 0; + } + } else { + r = this->find_internal_plus(n->right, extra, value, idxp); + if (r == 0) { + *idxp += this->nweight(n->left) + 1; + } + } + return r; +} + +template +template +int omt::find_internal_minus_array( + const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + uint32_t min = this->d.a.start_idx; + uint32_t limit = this->d.a.start_idx + this->d.a.num_values; + uint32_t best = subtree::NODE_NULL; + + while (min != limit) { + const uint32_t mid = (min + limit) / 2; + const int hv = h(this->d.a.values[mid], extra); + if (hv < 0) { + best = mid; + min = mid + 1; + } else { + limit = mid; + } + } + if (best == subtree::NODE_NULL) { + return DB_NOTFOUND; + } + if (value != nullptr) { + copyout(value, &this->d.a.values[best]); + } + *idxp = best - this->d.a.start_idx; + return 0; +} + +template +template +int omt::find_internal_minus( + const subtree &st, const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + if (st.is_null()) { + return DB_NOTFOUND; + } + omt_node *const n = &this->d.t.nodes[st.get_index()]; + int hv = h(n->value, extra); + if (hv < 0) { + int r = + this->find_internal_minus(n->right, extra, value, idxp); + if (r == 0) { + *idxp += this->nweight(n->left) + 1; + } else if (r == DB_NOTFOUND) { + *idxp = this->nweight(n->left); + if (value != nullptr) { + copyout(value, n); + } + r = 0; + } + return r; + } else { + return this->find_internal_minus(n->left, extra, value, idxp); + } +} +} // namespace toku diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h new file mode 100644 index 0000000000..f20eeedf2f --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h @@ -0,0 +1,165 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// Overview: A partitioned_counter provides a counter that can be incremented
+// and the running sum can be read at any time.
+// We assume that increments are frequent, whereas reading is infrequent.
+// Implementation hint: Use thread-local storage so each thread increments its
+// own data.  The increment does not require a lock or atomic operation.
+// Reading the data can be performed by iterating over the thread-local
+// versions, summing them up.  The data structure also includes a sum for all
+// the threads that have died.  Use a pthread_key to create the thread-local
+// versions.  When a thread finishes, the system calls the pthread_key
+// destructor, which can add that thread's copy into the sum_of_dead counter.
+// Rationale: For statistics such as are found in engine status, we need a
+// counter that requires no cache misses to increment.  We've seen significant
+// performance speedups by removing certain counters.  Rather than removing
+// those statistics, we would like to just make the counter fast.  We generally
+// increment the counters frequently, and want to fetch the values
+// infrequently.  The counters are monotonic.  The counters can be split into
+// many counters, which can be summed up at the end.  We don't care if we get
+// slightly out-of-date counter sums when we read the counter.  We don't care
+// if there is a race on reading a counter variable and incrementing.
+// See tests/test_partitioned_counter.c for some performance measurements.
+// Operations:
+//   create_partitioned_counter     Create a counter initialized to zero.
+//   destroy_partitioned_counter    Destroy it.
+//   increment_partitioned_counter  Increment it.  This is the frequent
+//                                  operation.
+//   read_partitioned_counter       Get the current value.  This is
+//                                  infrequent.
+// See partitioned_counter.cc for the abstraction function and representation
+// invariant.
+//
+// The google style guide says to avoid using constructors, and it appears that
+// constructors may have broken all the tests, because they called
+// pthread_key_create before the key was actually created.  So the google style
+// guide may have some wisdom there...
+//
+// This version does not use constructors, essentially reverting to the google
+// C++ style guide.
+//
+
+// The old C interface.  This required a bunch of explicit
+// __attribute__((__destructor__)) functions to remember to destroy counters
+// at the end.
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct partitioned_counter *PARTITIONED_COUNTER;
+PARTITIONED_COUNTER create_partitioned_counter(void);
+// Effect: Create a counter, initialized to zero.
+
+void destroy_partitioned_counter(PARTITIONED_COUNTER);
+// Effect: Destroy the counter.  No operations on that counter are permitted
+// after this.
+
+void increment_partitioned_counter(PARTITIONED_COUNTER, uint64_t amount);
+// Effect: Increment the counter by amount.
+// Requires: No overflows.  This is a 64-bit unsigned counter.
+
+uint64_t read_partitioned_counter(PARTITIONED_COUNTER)
+    __attribute__((__visibility__("default")));
+// Effect: Return the current value of the counter.
+
+void partitioned_counters_init(void);
+// Effect: Initialize any partitioned counters data structures that must be set
+// up before any partitioned counters run.
+
+void partitioned_counters_destroy(void);
+// Effect: Destroy any partitioned counters data structures.
+
+#if defined(__cplusplus)
+};
+#endif
+
+#if 0
+#include <pthread.h>
+
+#include "fttypes.h"
+
+// Used inside the PARTITIONED_COUNTER.
+struct linked_list_head {
+    struct linked_list_element *first;
+};
+
+
+class PARTITIONED_COUNTER {
+public:
+    PARTITIONED_COUNTER(void);
+    // Effect: Construct a counter, initialized to zero.
+
+    ~PARTITIONED_COUNTER(void);
+    // Effect: Destruct the counter.
+
+    void increment(uint64_t amount);
+    // Effect: Increment the counter by amount.  This is a 64-bit unsigned
+    // counter, and if you overflow it, you will get overflowed results
+    // (that is, mod 2^64).
+    // Requires: Don't use this from a static constructor or destructor.
+
+    uint64_t read(void);
+    // Effect: Read the sum.
+    // Requires: Don't use this from a static constructor or destructor.
+
+private:
+    uint64_t _sum_of_dead;  // The sum of all thread-local counts from
+                            // threads that have terminated.
+    pthread_key_t _key;     // The pthread_key which gives us the hook to
+                            // construct and destruct thread-local storage.
+    struct linked_list_head _ll_counter_head;  // A linked list of all the
+                                               // thread-local information
+                                               // for this counter.
+
+    // This function is used to destroy the thread-local part of the state
+    // when a thread terminates.  But it's not the destructor for the local
+    // part of the counter, it's a destructor on a "dummy" key just so that
+    // we get a notification when a thread ends.
+    friend void destroy_thread_local_part_of_partitioned_counters(void *);
+};
+#endif
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/status.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/status.h
new file mode 100644
index 0000000000..3fd0095d0f
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/lib/util/status.h
@@ -0,0 +1,76 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "partitioned_counter.h"
+// PORT2: #include
+
+#define TOKUFT_STATUS_INIT(array, k, c, t, l, inc)                         \
+    do {                                                                   \
+        array.status[k].keyname = #k;                                      \
+        array.status[k].columnname = #c;                                   \
+        array.status[k].type = t;                                          \
+        array.status[k].legend = l;                                        \
+        constexpr_static_assert(                                           \
+            strcmp(#c, "NULL") && strcmp(#c, "0"),                         \
+            "Use nullptr for no column name instead of NULL, 0, etc...");  \
+        constexpr_static_assert(                                           \
+            (inc) == TOKU_ENGINE_STATUS || strcmp(#c, "nullptr"),          \
+            "Missing column name.");                                       \
+        array.status[k].include =                                          \
+            static_cast<toku_engine_status_include_type>(inc);             \
+        if (t == STATUS_PARCOUNT) {                                        \
+            array.status[k].value.parcount = create_partitioned_counter(); \
+        }                                                                  \
+    } while (0)
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h
new file mode 100644
index 0000000000..2652add159
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h
@@ -0,0 +1,135 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#pragma once +#ifndef OS_WIN + +// For DeadlockInfoBuffer: +#include "util/thread_local.h" +#include "utilities/transactions/lock/point/point_lock_manager.h" +#include "utilities/transactions/lock/range/range_lock_manager.h" + +// Lock Tree library: +#include "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h" +#include "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h" +#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h" + +namespace ROCKSDB_NAMESPACE { + +typedef DeadlockInfoBufferTempl RangeDeadlockInfoBuffer; + +// A Range Lock Manager that uses PerconaFT's locktree library +class RangeTreeLockManager : public RangeLockManagerBase, + public RangeLockManagerHandle { + public: + LockManager* getLockManager() override { return this; } + + void AddColumnFamily(const ColumnFamilyHandle* cfh) override; + void RemoveColumnFamily(const ColumnFamilyHandle* cfh) override; + + void Resize(uint32_t) override; + std::vector GetDeadlockInfoBuffer() override; + + std::vector GetRangeDeadlockInfoBuffer() override; + void SetRangeDeadlockInfoBufferSize(uint32_t target_size) override; + + // Get a lock on a range + // @note only exclusive locks are currently supported (requesting a + // non-exclusive lock will get an exclusive one) + using LockManager::TryLock; + Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const Endpoint& start_endp, const Endpoint& end_endp, Env* env, + bool exclusive) override; + + void UnLock(PessimisticTransaction* txn, const LockTracker& tracker, + Env* env) override; + void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const std::string& key, Env* env) override; + void UnLock(PessimisticTransaction*, ColumnFamilyId, const Endpoint&, + const Endpoint&, Env*) override { + // TODO: range unlock does nothing... + } + + explicit RangeTreeLockManager( + std::shared_ptr mutex_factory); + + ~RangeTreeLockManager() override; + + int SetMaxLockMemory(size_t max_lock_memory) override { + return ltm_.set_max_lock_memory(max_lock_memory); + } + + size_t GetMaxLockMemory() override { return ltm_.get_max_lock_memory(); } + + Counters GetStatus() override; + + bool IsPointLockSupported() const override { + // One could have acquired a point lock (it is reduced to range lock) + return true; + } + + PointLockStatus GetPointLockStatus() override; + + // This is from LockManager + LockManager::RangeLockStatus GetRangeLockStatus() override; + + // This has the same meaning as GetRangeLockStatus but is from + // RangeLockManagerHandle + RangeLockManagerHandle::RangeLockStatus GetRangeLockStatusData() override { + return GetRangeLockStatus(); + } + + bool IsRangeLockSupported() const override { return true; } + + const LockTrackerFactory& GetLockTrackerFactory() const override { + return RangeTreeLockTrackerFactory::Get(); + } + + // Get the locktree which stores locks for the Column Family with given cf_id + std::shared_ptr GetLockTreeForCF(ColumnFamilyId cf_id); + + void SetEscalationBarrierFunc(EscalationBarrierFunc func) override { + barrier_func_ = func; + } + + private: + toku::locktree_manager ltm_; + + EscalationBarrierFunc barrier_func_ = + [](const Endpoint&, const Endpoint&) -> bool { return false; }; + + std::shared_ptr mutex_factory_; + + // Map from cf_id to locktree*. Can only be accessed while holding the + // ltree_map_mutex_. 
+  // Must use a custom deleter that calls ltm_.release_lt
+  using LockTreeMap =
+      std::unordered_map<ColumnFamilyId, std::shared_ptr<toku::locktree>>;
+  LockTreeMap ltree_map_;
+
+  InstrumentedMutex ltree_map_mutex_;
+
+  // Per-thread cache of ltree_map_.
+  // (uses the same approach as TransactionLockMgr::lock_maps_cache_)
+  std::unique_ptr<ThreadLocalPtr> ltree_lookup_cache_;
+
+  RangeDeadlockInfoBuffer dlock_buffer_;
+
+  std::shared_ptr<toku::locktree> MakeLockTreePtr(toku::locktree* lt);
+  static int CompareDbtEndpoints(void* arg, const DBT* a_key, const DBT* b_key);
+
+  // Callbacks
+  static int on_create(toku::locktree*, void*);
+  static void on_destroy(toku::locktree*) {}
+  static void on_escalate(TXNID txnid, const toku::locktree* lt,
+                          const toku::range_buffer& buffer, void* extra);
+
+  static bool OnEscalationBarrierCheck(const DBT* a, const DBT* b, void* extra);
+};
+
+void serialize_endpoint(const Endpoint& endp, std::string* buf);
+void wait_callback_for_locktree(void* cdata, toku::lock_wait_infos* infos);
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // OS_WIN
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h
new file mode 100644
index 0000000000..4ef48d2527
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "util/mutexlock.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+
+// Range Locking:
+#include "lib/locktree/lock_request.h"
+#include "lib/locktree/locktree.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeTreeLockManager;
+
+// Storage for locks that are currently held by a transaction.
+//
+// Locks are kept in toku::range_buffer because toku::locktree::release_locks()
+// accepts that as an argument.
+//
+// Note: the list of locks may differ slightly from the contents of the lock
+// tree, due to concurrency between lock acquisition, lock release, and lock
+// escalation. See MDEV-18227 and RangeTreeLockManager::UnLock for details.
+// This property is currently harmless.
+//
+// Append() and ReleaseLocks() are not thread-safe, as they are expected to be
+// called only by the owner transaction. ReplaceLocks() is safe to call from
+// other threads.
+class RangeLockList {
+ public:
+  ~RangeLockList() { Clear(); }
+
+  RangeLockList() : releasing_locks_(false) {}
+
+  void Append(ColumnFamilyId cf_id, const DBT* left_key, const DBT* right_key);
+  void ReleaseLocks(RangeTreeLockManager* mgr, PessimisticTransaction* txn,
+                    bool all_trx_locks);
+  void ReplaceLocks(const toku::locktree* lt, const toku::range_buffer& buffer);
+
+ private:
+  void Clear() {
+    for (auto it : buffers_) {
+      it.second->destroy();
+    }
+    buffers_.clear();
+  }
+
+  std::unordered_map<ColumnFamilyId, std::shared_ptr<toku::range_buffer>>
+      buffers_;
+  port::Mutex mutex_;
+  std::atomic<bool> releasing_locks_;
+};
+
+// A LockTracker-based object that is used together with RangeTreeLockManager.
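+// Editor's note (illustrative, not from the original authors): the tracker's
+// only real state is range_list_, the RangeLockList defined above; most of
+// the LockTracker interface below is deliberately stubbed out because the
+// authoritative lock bookkeeping lives in the locktree itself.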
+class RangeTreeLockTracker : public LockTracker { + public: + RangeTreeLockTracker() : range_list_(nullptr) {} + + RangeTreeLockTracker(const RangeTreeLockTracker&) = delete; + RangeTreeLockTracker& operator=(const RangeTreeLockTracker&) = delete; + + void Track(const PointLockRequest&) override; + void Track(const RangeLockRequest&) override; + + bool IsPointLockSupported() const override { + // This indicates that we don't implement GetPointLockStatus() + return false; + } + bool IsRangeLockSupported() const override { return true; } + + // a Not-supported dummy implementation. + UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) override { + return UntrackStatus::NOT_TRACKED; + } + + UntrackStatus Untrack(const PointLockRequest& /*lock_request*/) override { + return UntrackStatus::NOT_TRACKED; + } + + // "If this method is not supported, leave it as a no-op." + void Merge(const LockTracker&) override {} + + // "If this method is not supported, leave it as a no-op." + void Subtract(const LockTracker&) override {} + + void Clear() override; + + // "If this method is not supported, returns nullptr." + virtual LockTracker* GetTrackedLocksSinceSavePoint( + const LockTracker&) const override { + return nullptr; + } + + PointLockStatus GetPointLockStatus(ColumnFamilyId column_family_id, + const std::string& key) const override; + + // The return value is only used for tests + uint64_t GetNumPointLocks() const override { return 0; } + + ColumnFamilyIterator* GetColumnFamilyIterator() const override { + return nullptr; + } + + KeyIterator* GetKeyIterator( + ColumnFamilyId /*column_family_id*/) const override { + return nullptr; + } + + void ReleaseLocks(RangeTreeLockManager* mgr, PessimisticTransaction* txn, + bool all_trx_locks) { + if (range_list_) range_list_->ReleaseLocks(mgr, txn, all_trx_locks); + } + + void ReplaceLocks(const toku::locktree* lt, + const toku::range_buffer& buffer) { + // range_list_ cannot be NULL here + range_list_->ReplaceLocks(lt, buffer); + } + + private: + RangeLockList* getOrCreateList(); + std::unique_ptr range_list_; +}; + +class RangeTreeLockTrackerFactory : public LockTrackerFactory { + public: + static const RangeTreeLockTrackerFactory& Get() { + static const RangeTreeLockTrackerFactory instance; + return instance; + } + + LockTracker* Create() const override { return new RangeTreeLockTracker(); } + + private: + RangeTreeLockTrackerFactory() {} +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction.cc new file mode 100644 index 0000000000..af629b528a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ + +#include "utilities/transactions/optimistic_transaction.h" + +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "util/cast_util.h" +#include "util/string_util.h" +#include "utilities/transactions/lock/point/point_lock_tracker.h" +#include "utilities/transactions/optimistic_transaction.h" +#include "utilities/transactions/optimistic_transaction_db_impl.h" +#include "utilities/transactions/transaction_util.h" + +namespace ROCKSDB_NAMESPACE { + +struct WriteOptions; + +OptimisticTransaction::OptimisticTransaction( + OptimisticTransactionDB* txn_db, const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options) + : TransactionBaseImpl(txn_db->GetBaseDB(), write_options, + PointLockTrackerFactory::Get()), + txn_db_(txn_db) { + Initialize(txn_options); +} + +void OptimisticTransaction::Initialize( + const OptimisticTransactionOptions& txn_options) { + if (txn_options.set_snapshot) { + SetSnapshot(); + } +} + +void OptimisticTransaction::Reinitialize( + OptimisticTransactionDB* txn_db, const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options) { + TransactionBaseImpl::Reinitialize(txn_db->GetBaseDB(), write_options); + Initialize(txn_options); +} + +OptimisticTransaction::~OptimisticTransaction() {} + +void OptimisticTransaction::Clear() { TransactionBaseImpl::Clear(); } + +Status OptimisticTransaction::Prepare() { + return Status::InvalidArgument( + "Two phase commit not supported for optimistic transactions."); +} + +Status OptimisticTransaction::Commit() { + auto txn_db_impl = static_cast_with_check(txn_db_); + assert(txn_db_impl); + switch (txn_db_impl->GetValidatePolicy()) { + case OccValidationPolicy::kValidateParallel: + return CommitWithParallelValidate(); + case OccValidationPolicy::kValidateSerial: + return CommitWithSerialValidate(); + default: + assert(0); + } + // unreachable, just void compiler complain + return Status::OK(); +} + +Status OptimisticTransaction::CommitWithSerialValidate() { + // Set up callback which will call CheckTransactionForConflicts() to + // check whether this transaction is safe to be committed. + OptimisticTransactionCallback callback(this); + + DBImpl* db_impl = static_cast_with_check(db_->GetRootDB()); + + Status s = db_impl->WriteWithCallback( + write_options_, GetWriteBatch()->GetWriteBatch(), &callback); + + if (s.ok()) { + Clear(); + } + + return s; +} + +Status OptimisticTransaction::CommitWithParallelValidate() { + auto txn_db_impl = static_cast_with_check(txn_db_); + assert(txn_db_impl); + DBImpl* db_impl = static_cast_with_check(db_->GetRootDB()); + assert(db_impl); + const size_t space = txn_db_impl->GetLockBucketsSize(); + std::set lk_idxes; + std::vector> lks; + std::unique_ptr cf_it( + tracked_locks_->GetColumnFamilyIterator()); + assert(cf_it != nullptr); + while (cf_it->HasNext()) { + ColumnFamilyId cf = cf_it->Next(); + std::unique_ptr key_it( + tracked_locks_->GetKeyIterator(cf)); + assert(key_it != nullptr); + while (key_it->HasNext()) { + const std::string& key = key_it->Next(); + lk_idxes.insert(FastRange64(GetSliceNPHash64(key), space)); + } + } + // NOTE: in a single txn, all bucket-locks are taken in ascending order. + // In this way, txns from different threads all obey this rule so that + // deadlock can be avoided. 
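+  // Editor's note (illustrative, not from the original authors): lk_idxes
+  // is a std::set, so the loop below visits bucket indexes in ascending
+  // order.  Because every committing transaction acquires its bucket
+  // mutexes in this same global order, no two transactions can each hold a
+  // bucket the other is waiting for, which is what rules out deadlock.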
+ for (auto v : lk_idxes) { + lks.emplace_back(txn_db_impl->LockBucket(v)); + } + + Status s = TransactionUtil::CheckKeysForConflicts(db_impl, *tracked_locks_, + true /* cache_only */); + if (!s.ok()) { + return s; + } + + s = db_impl->Write(write_options_, GetWriteBatch()->GetWriteBatch()); + if (s.ok()) { + Clear(); + } + + return s; +} + +Status OptimisticTransaction::Rollback() { + Clear(); + return Status::OK(); +} + +// Record this key so that we can check it for conflicts at commit time. +// +// 'exclusive' is unused for OptimisticTransaction. +Status OptimisticTransaction::TryLock(ColumnFamilyHandle* column_family, + const Slice& key, bool read_only, + bool exclusive, const bool do_validate, + const bool assume_tracked) { + assert(!assume_tracked); // not supported + (void)assume_tracked; + if (!do_validate) { + return Status::OK(); + } + uint32_t cfh_id = GetColumnFamilyID(column_family); + + SetSnapshotIfNeeded(); + + SequenceNumber seq; + if (snapshot_) { + seq = snapshot_->GetSequenceNumber(); + } else { + seq = db_->GetLatestSequenceNumber(); + } + + std::string key_str = key.ToString(); + + TrackKey(cfh_id, key_str, seq, read_only, exclusive); + + // Always return OK. Confilct checking will happen at commit time. + return Status::OK(); +} + +// Returns OK if it is safe to commit this transaction. Returns Status::Busy +// if there are read or write conflicts that would prevent us from committing OR +// if we can not determine whether there would be any such conflicts. +// +// Should only be called on writer thread in order to avoid any race conditions +// in detecting write conflicts. +Status OptimisticTransaction::CheckTransactionForConflicts(DB* db) { + auto db_impl = static_cast_with_check(db); + + // Since we are on the write thread and do not want to block other writers, + // we will do a cache-only conflict check. This can result in TryAgain + // getting returned if there is not sufficient memtable history to check + // for conflicts. + return TransactionUtil::CheckKeysForConflicts(db_impl, *tracked_locks_, + true /* cache_only */); +} + +Status OptimisticTransaction::SetName(const TransactionName& /* unused */) { + return Status::InvalidArgument("Optimistic transactions cannot be named."); +} + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction.h new file mode 100644 index 0000000000..a03fa16970 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction.h @@ -0,0 +1,99 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + + +#include +#include +#include +#include + +#include "db/write_callback.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "utilities/transactions/transaction_base.h" +#include "utilities/transactions/transaction_util.h" + +namespace ROCKSDB_NAMESPACE { + +class OptimisticTransaction : public TransactionBaseImpl { + public: + OptimisticTransaction(OptimisticTransactionDB* db, + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options); + // No copying allowed + OptimisticTransaction(const OptimisticTransaction&) = delete; + void operator=(const OptimisticTransaction&) = delete; + + virtual ~OptimisticTransaction(); + + void Reinitialize(OptimisticTransactionDB* txn_db, + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options); + + Status Prepare() override; + + Status Commit() override; + + Status Rollback() override; + + Status SetName(const TransactionName& name) override; + + protected: + Status TryLock(ColumnFamilyHandle* column_family, const Slice& key, + bool read_only, bool exclusive, const bool do_validate = true, + const bool assume_tracked = false) override; + + private: + ROCKSDB_FIELD_UNUSED OptimisticTransactionDB* const txn_db_; + + friend class OptimisticTransactionCallback; + + void Initialize(const OptimisticTransactionOptions& txn_options); + + // Returns OK if it is safe to commit this transaction. Returns Status::Busy + // if there are read or write conflicts that would prevent us from committing + // OR if we can not determine whether there would be any such conflicts. + // + // Should only be called on writer thread. + Status CheckTransactionForConflicts(DB* db); + + void Clear() override; + + void UnlockGetForUpdate(ColumnFamilyHandle* /* unused */, + const Slice& /* unused */) override { + // Nothing to unlock. + } + + Status CommitWithSerialValidate(); + + Status CommitWithParallelValidate(); +}; + +// Used at commit time to trigger transaction validation +class OptimisticTransactionCallback : public WriteCallback { + public: + explicit OptimisticTransactionCallback(OptimisticTransaction* txn) + : txn_(txn) {} + + Status Callback(DB* db) override { + return txn_->CheckTransactionForConflicts(db); + } + + bool AllowWriteBatching() override { return false; } + + private: + OptimisticTransaction* txn_; +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction_db_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction_db_impl.cc new file mode 100644 index 0000000000..3a81615335 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction_db_impl.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ + +#include "utilities/transactions/optimistic_transaction_db_impl.h" + +#include +#include + +#include "db/db_impl/db_impl.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "utilities/transactions/optimistic_transaction.h" + +namespace ROCKSDB_NAMESPACE { + +Transaction* OptimisticTransactionDBImpl::BeginTransaction( + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options, Transaction* old_txn) { + if (old_txn != nullptr) { + ReinitializeTransaction(old_txn, write_options, txn_options); + return old_txn; + } else { + return new OptimisticTransaction(this, write_options, txn_options); + } +} + +std::unique_lock OptimisticTransactionDBImpl::LockBucket( + size_t idx) { + assert(idx < bucketed_locks_.size()); + return std::unique_lock(*bucketed_locks_[idx]); +} + +Status OptimisticTransactionDB::Open(const Options& options, + const std::string& dbname, + OptimisticTransactionDB** dbptr) { + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + Status s = Open(db_options, dbname, column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + // i can delete the handle since DBImpl is always holding a reference to + // default column family + delete handles[0]; + } + + return s; +} + +Status OptimisticTransactionDB::Open( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + OptimisticTransactionDB** dbptr) { + return OptimisticTransactionDB::Open(db_options, + OptimisticTransactionDBOptions(), dbname, + column_families, handles, dbptr); +} + +Status OptimisticTransactionDB::Open( + const DBOptions& db_options, + const OptimisticTransactionDBOptions& occ_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + OptimisticTransactionDB** dbptr) { + Status s; + DB* db; + + std::vector column_families_copy = column_families; + + // Enable MemTable History if not already enabled + for (auto& column_family : column_families_copy) { + ColumnFamilyOptions* options = &column_family.options; + + if (options->max_write_buffer_size_to_maintain == 0 && + options->max_write_buffer_number_to_maintain == 0) { + // Setting to -1 will set the History size to + // max_write_buffer_number * write_buffer_size. 
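+      // Editor's note (illustrative, not from the original authors): the
+      // retained memtable history is what the commit-time conflict check
+      // (TransactionUtil::CheckKeysForConflicts with cache_only == true)
+      // reads; with no history at all, commits could fail with TryAgain
+      // even when no real conflict exists.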
+ options->max_write_buffer_size_to_maintain = -1; + } + } + + s = DB::Open(db_options, dbname, column_families_copy, handles, &db); + + if (s.ok()) { + *dbptr = new OptimisticTransactionDBImpl(db, occ_options); + } + + return s; +} + +void OptimisticTransactionDBImpl::ReinitializeTransaction( + Transaction* txn, const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options) { + assert(dynamic_cast(txn) != nullptr); + auto txn_impl = reinterpret_cast(txn); + + txn_impl->Reinitialize(this, write_options, txn_options); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction_db_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction_db_impl.h new file mode 100644 index 0000000000..fef145356e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/optimistic_transaction_db_impl.h @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" + +namespace ROCKSDB_NAMESPACE { + +class OptimisticTransactionDBImpl : public OptimisticTransactionDB { + public: + explicit OptimisticTransactionDBImpl( + DB* db, const OptimisticTransactionDBOptions& occ_options, + bool take_ownership = true) + : OptimisticTransactionDB(db), + db_owner_(take_ownership), + validate_policy_(occ_options.validate_policy) { + if (validate_policy_ == OccValidationPolicy::kValidateParallel) { + uint32_t bucket_size = std::max(16u, occ_options.occ_lock_buckets); + bucketed_locks_.reserve(bucket_size); + for (size_t i = 0; i < bucket_size; ++i) { + bucketed_locks_.emplace_back( + std::unique_ptr(new std::mutex)); + } + } + } + + ~OptimisticTransactionDBImpl() { + // Prevent this stackable from destroying + // base db + if (!db_owner_) { + db_ = nullptr; + } + } + + Transaction* BeginTransaction(const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options, + Transaction* old_txn) override; + + // Transactional `DeleteRange()` is not yet supported. + using StackableDB::DeleteRange; + virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, + const Slice&, const Slice&) override { + return Status::NotSupported(); + } + + // Range deletions also must not be snuck into `WriteBatch`es as they are + // incompatible with `OptimisticTransactionDB`. + virtual Status Write(const WriteOptions& write_opts, + WriteBatch* batch) override { + if (batch->HasDeleteRange()) { + return Status::NotSupported(); + } + return OptimisticTransactionDB::Write(write_opts, batch); + } + + size_t GetLockBucketsSize() const { return bucketed_locks_.size(); } + + OccValidationPolicy GetValidatePolicy() const { return validate_policy_; } + + std::unique_lock LockBucket(size_t idx); + + private: + // NOTE: used in validation phase. Each key is hashed into some + // bucket. We then take the lock in the hash value order to avoid deadlock. 
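+  // Editor's note (illustrative, not from the original authors): in
+  // CommitWithParallelValidate() each tracked key is mapped to a bucket via
+  // FastRange64(GetSliceNPHash64(key), bucketed_locks_.size()).  Colliding
+  // keys simply serialize on the same mutex, so the bucket count (at least
+  // 16) trades contention against memory and never affects correctness.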
+  std::vector<std::unique_ptr<std::mutex>> bucketed_locks_;
+
+  bool db_owner_;
+
+  const OccValidationPolicy validate_policy_;
+
+  void ReinitializeTransaction(Transaction* txn,
+                               const WriteOptions& write_options,
+                               const OptimisticTransactionOptions& txn_options =
+                                   OptimisticTransactionOptions());
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction.cc
new file mode 100644
index 0000000000..1771497a69
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction.cc
@@ -0,0 +1,1177 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include "utilities/transactions/pessimistic_transaction.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_util.h"
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct WriteOptions;
+
+std::atomic<TransactionID> PessimisticTransaction::txn_id_counter_(1);
+
+TransactionID PessimisticTransaction::GenTxnID() {
+  return txn_id_counter_.fetch_add(1);
+}
+
+PessimisticTransaction::PessimisticTransaction(
+    TransactionDB* txn_db, const WriteOptions& write_options,
+    const TransactionOptions& txn_options, const bool init)
+    : TransactionBaseImpl(
+          txn_db->GetRootDB(), write_options,
+          static_cast_with_check<PessimisticTransactionDB>(txn_db)
+              ->GetLockTrackerFactory()),
+      txn_db_impl_(nullptr),
+      expiration_time_(0),
+      txn_id_(0),
+      waiting_cf_id_(0),
+      waiting_key_(nullptr),
+      lock_timeout_(0),
+      deadlock_detect_(false),
+      deadlock_detect_depth_(0),
+      skip_concurrency_control_(false) {
+  txn_db_impl_ = static_cast_with_check<PessimisticTransactionDB>(txn_db);
+  db_impl_ = static_cast_with_check<DBImpl>(db_);
+  if (init) {
+    Initialize(txn_options);
+  }
+}
+
+void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) {
+  // Range lock manager uses address of transaction object as TXNID
+  const TransactionDBOptions& db_options = txn_db_impl_->GetTxnDBOptions();
+  if (db_options.lock_mgr_handle &&
+      db_options.lock_mgr_handle->getLockManager()->IsRangeLockSupported()) {
+    txn_id_ = reinterpret_cast<TransactionID>(this);
+  } else {
+    txn_id_ = GenTxnID();
+  }
+
+  txn_state_ = STARTED;
+
+  deadlock_detect_ = txn_options.deadlock_detect;
+  deadlock_detect_depth_ = txn_options.deadlock_detect_depth;
+  write_batch_.SetMaxBytes(txn_options.max_write_batch_size);
+  skip_concurrency_control_ = txn_options.skip_concurrency_control;
+
+  lock_timeout_ = txn_options.lock_timeout * 1000;
+  if (lock_timeout_ < 0) {
+    // Lock timeout not set, use default
+    lock_timeout_ =
+        txn_db_impl_->GetTxnDBOptions().transaction_lock_timeout * 1000;
+  }
+
+  if (txn_options.expiration >= 0) {
+    expiration_time_ = start_time_ + txn_options.expiration * 1000;
+  } else {
+    expiration_time_ = 0;
+  }
+
+  if
(txn_options.set_snapshot) { + SetSnapshot(); + } + + if (expiration_time_ > 0) { + txn_db_impl_->InsertExpirableTransaction(txn_id_, this); + } + use_only_the_last_commit_time_batch_for_recovery_ = + txn_options.use_only_the_last_commit_time_batch_for_recovery; + skip_prepare_ = txn_options.skip_prepare; + + read_timestamp_ = kMaxTxnTimestamp; + commit_timestamp_ = kMaxTxnTimestamp; +} + +PessimisticTransaction::~PessimisticTransaction() { + txn_db_impl_->UnLock(this, *tracked_locks_); + if (expiration_time_ > 0) { + txn_db_impl_->RemoveExpirableTransaction(txn_id_); + } + if (!name_.empty() && txn_state_ != COMMITTED) { + txn_db_impl_->UnregisterTransaction(this); + } +} + +void PessimisticTransaction::Clear() { + txn_db_impl_->UnLock(this, *tracked_locks_); + TransactionBaseImpl::Clear(); +} + +void PessimisticTransaction::Reinitialize( + TransactionDB* txn_db, const WriteOptions& write_options, + const TransactionOptions& txn_options) { + if (!name_.empty() && txn_state_ != COMMITTED) { + txn_db_impl_->UnregisterTransaction(this); + } + TransactionBaseImpl::Reinitialize(txn_db->GetRootDB(), write_options); + Initialize(txn_options); +} + +bool PessimisticTransaction::IsExpired() const { + if (expiration_time_ > 0) { + if (dbimpl_->GetSystemClock()->NowMicros() >= expiration_time_) { + // Transaction is expired. + return true; + } + } + + return false; +} + +WriteCommittedTxn::WriteCommittedTxn(TransactionDB* txn_db, + const WriteOptions& write_options, + const TransactionOptions& txn_options) + : PessimisticTransaction(txn_db, write_options, txn_options) {} + +Status WriteCommittedTxn::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value, + bool exclusive, const bool do_validate) { + return GetForUpdateImpl(read_options, column_family, key, value, exclusive, + do_validate); +} + +Status WriteCommittedTxn::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, + PinnableSlice* pinnable_val, + bool exclusive, const bool do_validate) { + return GetForUpdateImpl(read_options, column_family, key, pinnable_val, + exclusive, do_validate); +} + +template +inline Status WriteCommittedTxn::GetForUpdateImpl( + const ReadOptions& read_options, ColumnFamilyHandle* column_family, + const Slice& key, TValue* value, bool exclusive, const bool do_validate) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + column_family = + column_family ? 
column_family : db_impl_->DefaultColumnFamily(); + assert(column_family); + if (!read_options.timestamp) { + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (0 == ts_sz) { + return TransactionBaseImpl::GetForUpdate(read_options, column_family, key, + value, exclusive, do_validate); + } + } else { + Status s = db_impl_->FailIfTsMismatchCf( + column_family, *(read_options.timestamp), /*ts_for_read=*/true); + if (!s.ok()) { + return s; + } + } + + if (!do_validate) { + return Status::InvalidArgument( + "If do_validate is false then GetForUpdate with read_timestamp is not " + "defined."); + } else if (kMaxTxnTimestamp == read_timestamp_) { + return Status::InvalidArgument("read_timestamp must be set for validation"); + } + + if (!read_options.timestamp) { + ReadOptions read_opts_copy = read_options; + char ts_buf[sizeof(kMaxTxnTimestamp)]; + EncodeFixed64(ts_buf, read_timestamp_); + Slice ts(ts_buf, sizeof(ts_buf)); + read_opts_copy.timestamp = &ts; + return TransactionBaseImpl::GetForUpdate(read_opts_copy, column_family, key, + value, exclusive, do_validate); + } + assert(read_options.timestamp); + const char* const ts_buf = read_options.timestamp->data(); + assert(read_options.timestamp->size() == sizeof(kMaxTxnTimestamp)); + TxnTimestamp ts = DecodeFixed64(ts_buf); + if (ts != read_timestamp_) { + return Status::InvalidArgument("Must read from the same read_timestamp"); + } + return TransactionBaseImpl::GetForUpdate(read_options, column_family, key, + value, exclusive, do_validate); +} + +Status WriteCommittedTxn::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, &value, this]() { + Status s = + GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + ++num_puts_; + } + return s; + }); +} + +Status WriteCommittedTxn::Put(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, &value, this]() { + Status s = + GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + ++num_puts_; + } + return s; + }); +} + +Status WriteCommittedTxn::PutUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + return Operate( + column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, &value, this]() { + Status s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + ++num_puts_; + } + return s; + }); +} + +Status WriteCommittedTxn::PutUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) { + return Operate( + column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, &value, this]() { + Status s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + ++num_puts_; + } + return s; + }); +} + +Status WriteCommittedTxn::Delete(ColumnFamilyHandle* column_family, + const Slice& key, const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, this]() { + Status s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; 
+ }); +} + +Status WriteCommittedTxn::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, this]() { + Status s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) { + return Operate(column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, this]() { + Status s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) { + return Operate(column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, this]() { + Status s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, this]() { + Status s = + GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, this]() { + Status s = + GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::SingleDeleteUntracked( + ColumnFamilyHandle* column_family, const Slice& key) { + return Operate(column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, this]() { + Status s = + GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::Merge(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, &value, this]() { + Status s = + GetBatchForWrite()->Merge(column_family, key, value); + if (s.ok()) { + ++num_merges_; + } + return s; + }); +} + +template +Status WriteCommittedTxn::Operate(ColumnFamilyHandle* column_family, + const TKey& key, const bool do_validate, + const bool assume_tracked, + TOperation&& operation) { + Status s; + if constexpr (std::is_same_v) { + s = TryLock(column_family, key, /*read_only=*/false, /*exclusive=*/true, + do_validate, assume_tracked); + } else if constexpr (std::is_same_v) { + std::string key_buf; + Slice contiguous_key(key, &key_buf); + s = TryLock(column_family, contiguous_key, /*read_only=*/false, + /*exclusive=*/true, do_validate, assume_tracked); + } + if (!s.ok()) { + return s; + } + column_family = + column_family ? 
          column_family : db_impl_->DefaultColumnFamily();
+  assert(column_family);
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+  if (ts_sz > 0) {
+    assert(ts_sz == sizeof(TxnTimestamp));
+    if (!IndexingEnabled()) {
+      cfs_with_ts_tracked_when_indexing_disabled_.insert(
+          column_family->GetID());
+    }
+  }
+  return operation();
+}
+
+Status WriteCommittedTxn::SetReadTimestampForValidation(TxnTimestamp ts) {
+  if (read_timestamp_ < kMaxTxnTimestamp && ts < read_timestamp_) {
+    return Status::InvalidArgument(
+        "Cannot decrease read timestamp for validation");
+  }
+  read_timestamp_ = ts;
+  return Status::OK();
+}
+
+Status WriteCommittedTxn::SetCommitTimestamp(TxnTimestamp ts) {
+  if (read_timestamp_ < kMaxTxnTimestamp && ts <= read_timestamp_) {
+    return Status::InvalidArgument(
+        "Cannot commit at timestamp smaller than or equal to read timestamp");
+  }
+  commit_timestamp_ = ts;
+  return Status::OK();
+}
+
+Status PessimisticTransaction::CommitBatch(WriteBatch* batch) {
+  if (batch && WriteBatchInternal::HasKeyWithTimestamp(*batch)) {
+    // CommitBatch() needs to lock the keys in the batch.
+    // However, the application also needs to specify the timestamp for the
+    // keys in batch before calling this API.
+    // This means timestamp order may violate the order of locking, thus
+    // violate the sequence number order for the same user key.
+    // Therefore, we disallow this operation for now.
+    return Status::NotSupported(
+        "Batch to commit includes timestamp assigned before locking");
+  }
+
+  std::unique_ptr<LockTracker> keys_to_unlock(lock_tracker_factory_.Create());
+  Status s = LockBatch(batch, keys_to_unlock.get());
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  bool can_commit = false;
+
+  if (IsExpired()) {
+    s = Status::Expired();
+  } else if (expiration_time_ > 0) {
+    TransactionState expected = STARTED;
+    can_commit = std::atomic_compare_exchange_strong(&txn_state_, &expected,
+                                                     AWAITING_COMMIT);
+  } else if (txn_state_ == STARTED) {
+    // lock stealing is not a concern
+    can_commit = true;
+  }
+
+  if (can_commit) {
+    txn_state_.store(AWAITING_COMMIT);
+    s = CommitBatchInternal(batch);
+    if (s.ok()) {
+      txn_state_.store(COMMITTED);
+    }
+  } else if (txn_state_ == LOCKS_STOLEN) {
+    s = Status::Expired();
+  } else {
+    s = Status::InvalidArgument("Transaction is not in state for commit.");
+  }
+
+  txn_db_impl_->UnLock(this, *keys_to_unlock);
+
+  return s;
+}
+
+Status PessimisticTransaction::Prepare() {
+  if (name_.empty()) {
+    return Status::InvalidArgument(
+        "Cannot prepare a transaction that has not been named.");
+  }
+
+  if (IsExpired()) {
+    return Status::Expired();
+  }
+
+  Status s;
+  bool can_prepare = false;
+
+  if (expiration_time_ > 0) {
+    // must concern ourselves with expiration and/or lock stealing
+    // need to compare/exchange because locks could be stolen out from under us
+    TransactionState expected = STARTED;
+    can_prepare = std::atomic_compare_exchange_strong(&txn_state_, &expected,
+                                                      AWAITING_PREPARE);
+  } else if (txn_state_ == STARTED) {
+    // expiration and lock stealing is not possible
+    txn_state_.store(AWAITING_PREPARE);
+    can_prepare = true;
+  }
+
+  if (can_prepare) {
+    // transaction can't expire after preparation
+    expiration_time_ = 0;
+    assert(log_number_ == 0 ||
+           txn_db_impl_->GetTxnDBOptions().write_policy == WRITE_UNPREPARED);
+
+    s = PrepareInternal();
+    if (s.ok()) {
+      txn_state_.store(PREPARED);
+    }
+  } else if (txn_state_ == LOCKS_STOLEN) {
+    s = Status::Expired();
+  } else if (txn_state_ == PREPARED) {
+    s = Status::InvalidArgument("Transaction has already been prepared.");
+  } else if (txn_state_ == COMMITTED) {
+    s = Status::InvalidArgument("Transaction has already been committed.");
+  } else if (txn_state_ == ROLLEDBACK) {
+    s = Status::InvalidArgument("Transaction has already been rolledback.");
+  } else {
+    s = Status::InvalidArgument("Transaction is not in state for commit.");
+  }
+
+  return s;
+}
+
+Status WriteCommittedTxn::PrepareInternal() {
+  WriteOptions write_options = write_options_;
+  write_options.disableWAL = false;
+  auto s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(),
+                                              name_);
+  assert(s.ok());
+  class MarkLogCallback : public PreReleaseCallback {
+   public:
+    MarkLogCallback(DBImpl* db, bool two_write_queues)
+        : db_(db), two_write_queues_(two_write_queues) {
+      (void)two_write_queues_;  // to silence unused private field warning
+    }
+    virtual Status Callback(SequenceNumber, bool is_mem_disabled,
+                            uint64_t log_number, size_t /*index*/,
+                            size_t /*total*/) override {
+#ifdef NDEBUG
+      (void)is_mem_disabled;
+#endif
+      assert(log_number != 0);
+      assert(!two_write_queues_ || is_mem_disabled);  // implies the 2nd queue
+      db_->logs_with_prep_tracker()->MarkLogAsContainingPrepSection(log_number);
+      return Status::OK();
+    }
+
+   private:
+    DBImpl* db_;
+    bool two_write_queues_;
+  } mark_log_callback(db_impl_,
+                      db_impl_->immutable_db_options().two_write_queues);
+
+  WriteCallback* const kNoWriteCallback = nullptr;
+  const uint64_t kRefNoLog = 0;
+  const bool kDisableMemtable = true;
+  SequenceNumber* const KIgnoreSeqUsed = nullptr;
+  const size_t kNoBatchCount = 0;
+  s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
+                          kNoWriteCallback, &log_number_, kRefNoLog,
+                          kDisableMemtable, KIgnoreSeqUsed, kNoBatchCount,
+                          &mark_log_callback);
+  return s;
+}
+
+Status PessimisticTransaction::Commit() {
+  bool commit_without_prepare = false;
+  bool commit_prepared = false;
+
+  if (IsExpired()) {
+    return Status::Expired();
+  }
+
+  if (expiration_time_ > 0) {
+    // We must atomically compare and exchange the state here because at
+    // this point in the transaction it is possible for another thread
+    // to change our state out from under us in the event that we expire and
+    // have our locks stolen. In this case the only valid state is STARTED
+    // because a state of PREPARED would have a cleared expiration_time_.
+ TransactionState expected = STARTED; + commit_without_prepare = std::atomic_compare_exchange_strong( + &txn_state_, &expected, AWAITING_COMMIT); + TEST_SYNC_POINT("TransactionTest::ExpirableTransactionDataRace:1"); + } else if (txn_state_ == PREPARED) { + // expiration and lock stealing is not a concern + commit_prepared = true; + } else if (txn_state_ == STARTED) { + // expiration and lock stealing is not a concern + if (skip_prepare_) { + commit_without_prepare = true; + } else { + return Status::TxnNotPrepared(); + } + } + + Status s; + if (commit_without_prepare) { + assert(!commit_prepared); + if (WriteBatchInternal::Count(GetCommitTimeWriteBatch()) > 0) { + s = Status::InvalidArgument( + "Commit-time batch contains values that will not be committed."); + } else { + txn_state_.store(AWAITING_COMMIT); + if (log_number_ > 0) { + dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed( + log_number_); + } + s = CommitWithoutPrepareInternal(); + if (!name_.empty()) { + txn_db_impl_->UnregisterTransaction(this); + } + Clear(); + if (s.ok()) { + txn_state_.store(COMMITTED); + } + } + } else if (commit_prepared) { + txn_state_.store(AWAITING_COMMIT); + + s = CommitInternal(); + + if (!s.ok()) { + ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log, + "Commit write failed"); + return s; + } + + // FindObsoleteFiles must now look to the memtables + // to determine what prep logs must be kept around, + // not the prep section heap. + assert(log_number_ > 0); + dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed( + log_number_); + txn_db_impl_->UnregisterTransaction(this); + + Clear(); + txn_state_.store(COMMITTED); + } else if (txn_state_ == LOCKS_STOLEN) { + s = Status::Expired(); + } else if (txn_state_ == COMMITTED) { + s = Status::InvalidArgument("Transaction has already been committed."); + } else if (txn_state_ == ROLLEDBACK) { + s = Status::InvalidArgument("Transaction has already been rolledback."); + } else { + s = Status::InvalidArgument("Transaction is not in state for commit."); + } + + return s; +} + +Status WriteCommittedTxn::CommitWithoutPrepareInternal() { + WriteBatchWithIndex* wbwi = GetWriteBatch(); + assert(wbwi); + WriteBatch* wb = wbwi->GetWriteBatch(); + assert(wb); + + const bool needs_ts = WriteBatchInternal::HasKeyWithTimestamp(*wb); + if (needs_ts && commit_timestamp_ == kMaxTxnTimestamp) { + return Status::InvalidArgument("Must assign a commit timestamp"); + } + + if (needs_ts) { + assert(commit_timestamp_ != kMaxTxnTimestamp); + char commit_ts_buf[sizeof(kMaxTxnTimestamp)]; + EncodeFixed64(commit_ts_buf, commit_timestamp_); + Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf)); + + Status s = + wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t { + auto cf_iter = cfs_with_ts_tracked_when_indexing_disabled_.find(cf); + if (cf_iter != cfs_with_ts_tracked_when_indexing_disabled_.end()) { + return sizeof(kMaxTxnTimestamp); + } + const Comparator* ucmp = + WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf); + return ucmp ? 
 ucmp->timestamp_size()
+                    : std::numeric_limits<size_t>::max();
+        });
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  uint64_t seq_used = kMaxSequenceNumber;
+  SnapshotCreationCallback snapshot_creation_cb(db_impl_, commit_timestamp_,
+                                                snapshot_notifier_, snapshot_);
+  PostMemTableCallback* post_mem_cb = nullptr;
+  if (snapshot_needed_) {
+    if (commit_timestamp_ == kMaxTxnTimestamp) {
+      return Status::InvalidArgument("Must set transaction commit timestamp");
+    } else {
+      post_mem_cb = &snapshot_creation_cb;
+    }
+  }
+  auto s = db_impl_->WriteImpl(write_options_, wb,
+                               /*callback*/ nullptr, /*log_used*/ nullptr,
+                               /*log_ref*/ 0, /*disable_memtable*/ false,
+                               &seq_used, /*batch_cnt=*/0,
+                               /*pre_release_callback=*/nullptr, post_mem_cb);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  if (s.ok()) {
+    SetId(seq_used);
+  }
+  return s;
+}
+
+Status WriteCommittedTxn::CommitBatchInternal(WriteBatch* batch, size_t) {
+  uint64_t seq_used = kMaxSequenceNumber;
+  auto s = db_impl_->WriteImpl(write_options_, batch, /*callback*/ nullptr,
+                               /*log_used*/ nullptr, /*log_ref*/ 0,
+                               /*disable_memtable*/ false, &seq_used);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  if (s.ok()) {
+    SetId(seq_used);
+  }
+  return s;
+}
+
+Status WriteCommittedTxn::CommitInternal() {
+  WriteBatchWithIndex* wbwi = GetWriteBatch();
+  assert(wbwi);
+  WriteBatch* wb = wbwi->GetWriteBatch();
+  assert(wb);
+
+  const bool needs_ts = WriteBatchInternal::HasKeyWithTimestamp(*wb);
+  if (needs_ts && commit_timestamp_ == kMaxTxnTimestamp) {
+    return Status::InvalidArgument("Must assign a commit timestamp");
+  }
+  // We take the commit-time batch and append the Commit marker.
+  // The Memtable will ignore the Commit marker in non-recovery mode
+  WriteBatch* working_batch = GetCommitTimeWriteBatch();
+
+  Status s;
+  if (!needs_ts) {
+    s = WriteBatchInternal::MarkCommit(working_batch, name_);
+  } else {
+    assert(commit_timestamp_ != kMaxTxnTimestamp);
+    char commit_ts_buf[sizeof(kMaxTxnTimestamp)];
+    EncodeFixed64(commit_ts_buf, commit_timestamp_);
+    Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf));
+    s = WriteBatchInternal::MarkCommitWithTimestamp(working_batch, name_,
+                                                    commit_ts);
+    if (s.ok()) {
+      s = wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t {
+        if (cfs_with_ts_tracked_when_indexing_disabled_.find(cf) !=
+            cfs_with_ts_tracked_when_indexing_disabled_.end()) {
+          return sizeof(kMaxTxnTimestamp);
+        }
+        const Comparator* ucmp =
+            WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf);
+        return ucmp ? ucmp->timestamp_size()
+                    : std::numeric_limits<size_t>::max();
+      });
+    }
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  // any operations appended to this working_batch will be ignored from WAL
+  working_batch->MarkWalTerminationPoint();
+
+  // insert prepared batch into Memtable only skipping WAL.
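+  // After the Append() below, the batch handed to WriteImpl() is laid out
+  // roughly as (sketch):
+  //
+  //   [commit-time writes][Commit marker] | WAL termination point |
+  //   [prepared writes from wb]
+  //
+  // so the WAL only records the prefix before the termination point.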
+ // Memtable will ignore BeginPrepare/EndPrepare markers + // in non recovery mode and simply insert the values + s = WriteBatchInternal::Append(working_batch, wb); + assert(s.ok()); + + uint64_t seq_used = kMaxSequenceNumber; + SnapshotCreationCallback snapshot_creation_cb(db_impl_, commit_timestamp_, + snapshot_notifier_, snapshot_); + PostMemTableCallback* post_mem_cb = nullptr; + if (snapshot_needed_) { + if (commit_timestamp_ == kMaxTxnTimestamp) { + s = Status::InvalidArgument("Must set transaction commit timestamp"); + return s; + } else { + post_mem_cb = &snapshot_creation_cb; + } + } + s = db_impl_->WriteImpl(write_options_, working_batch, /*callback*/ nullptr, + /*log_used*/ nullptr, /*log_ref*/ log_number_, + /*disable_memtable*/ false, &seq_used, + /*batch_cnt=*/0, /*pre_release_callback=*/nullptr, + post_mem_cb); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + if (s.ok()) { + SetId(seq_used); + } + return s; +} + +Status PessimisticTransaction::Rollback() { + Status s; + if (txn_state_ == PREPARED) { + txn_state_.store(AWAITING_ROLLBACK); + + s = RollbackInternal(); + + if (s.ok()) { + // we do not need to keep our prepared section around + assert(log_number_ > 0); + dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed( + log_number_); + Clear(); + txn_state_.store(ROLLEDBACK); + } + } else if (txn_state_ == STARTED) { + if (log_number_ > 0) { + assert(txn_db_impl_->GetTxnDBOptions().write_policy == WRITE_UNPREPARED); + assert(GetId() > 0); + s = RollbackInternal(); + + if (s.ok()) { + dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed( + log_number_); + } + } + // prepare couldn't have taken place + Clear(); + } else if (txn_state_ == COMMITTED) { + s = Status::InvalidArgument("This transaction has already been committed."); + } else { + s = Status::InvalidArgument( + "Two phase transaction is not in state for rollback."); + } + + return s; +} + +Status WriteCommittedTxn::RollbackInternal() { + WriteBatch rollback_marker; + auto s = WriteBatchInternal::MarkRollback(&rollback_marker, name_); + assert(s.ok()); + s = db_impl_->WriteImpl(write_options_, &rollback_marker); + return s; +} + +Status PessimisticTransaction::RollbackToSavePoint() { + if (txn_state_ != STARTED) { + return Status::InvalidArgument("Transaction is beyond state for rollback."); + } + + if (save_points_ != nullptr && !save_points_->empty()) { + // Unlock any keys locked since last transaction + auto& save_point_tracker = *save_points_->top().new_locks_; + std::unique_ptr t( + tracked_locks_->GetTrackedLocksSinceSavePoint(save_point_tracker)); + if (t) { + txn_db_impl_->UnLock(this, *t); + } + } + + return TransactionBaseImpl::RollbackToSavePoint(); +} + +// Lock all keys in this batch. +// On success, caller should unlock keys_to_unlock +Status PessimisticTransaction::LockBatch(WriteBatch* batch, + LockTracker* keys_to_unlock) { + if (!batch) { + return Status::InvalidArgument("batch is nullptr"); + } + + class Handler : public WriteBatch::Handler { + public: + // Sorted map of column_family_id to sorted set of keys. + // Since LockBatch() always locks keys in sorted order, it cannot deadlock + // with itself. We're not using a comparator here since it doesn't matter + // what the sorting is as long as it's consistent. 
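+    // For example, two concurrent CommitBatch() calls touching keys
+    // {"b", "a"} and {"a", "c"} both acquire locks in the order "a" then
+    // "b"/"c", so neither can hold "b" while waiting on "a" (illustrative
+    // scenario, not code from this file).
+    //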
+    std::map<uint32_t, std::set<std::string>> keys_;
+
+    Handler() {}
+
+    void RecordKey(uint32_t column_family_id, const Slice& key) {
+      std::string key_str = key.ToString();
+
+      auto& cfh_keys = keys_[column_family_id];
+      auto iter = cfh_keys.find(key_str);
+      if (iter == cfh_keys.end()) {
+        // key not yet seen, store it.
+        cfh_keys.insert({std::move(key_str)});
+      }
+    }
+
+    Status PutCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& /* unused */) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+    Status MergeCF(uint32_t column_family_id, const Slice& key,
+                   const Slice& /* unused */) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+    Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+  };
+
+  // Iterating on this handler will add all keys in this batch into keys
+  Handler handler;
+  Status s = batch->Iterate(&handler);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Attempt to lock all keys
+  for (const auto& cf_iter : handler.keys_) {
+    uint32_t cfh_id = cf_iter.first;
+    auto& cfh_keys = cf_iter.second;
+
+    for (const auto& key_iter : cfh_keys) {
+      const std::string& key = key_iter;
+
+      s = txn_db_impl_->TryLock(this, cfh_id, key, true /* exclusive */);
+      if (!s.ok()) {
+        break;
+      }
+      PointLockRequest r;
+      r.column_family_id = cfh_id;
+      r.key = key;
+      r.seq = kMaxSequenceNumber;
+      r.read_only = false;
+      r.exclusive = true;
+      keys_to_unlock->Track(r);
+    }
+
+    if (!s.ok()) {
+      break;
+    }
+  }
+
+  if (!s.ok()) {
+    txn_db_impl_->UnLock(this, *keys_to_unlock);
+  }
+
+  return s;
+}
+
+// Attempt to lock this key.
+// Returns OK if the key has been successfully locked. Non-OK otherwise.
+// If check_snapshot is true and this transaction has a snapshot set,
+// this key will only be locked if there have been no writes to this key since
+// the snapshot time.
+Status PessimisticTransaction::TryLock(ColumnFamilyHandle* column_family,
+                                       const Slice& key, bool read_only,
+                                       bool exclusive, const bool do_validate,
+                                       const bool assume_tracked) {
+  assert(!assume_tracked || !do_validate);
+  Status s;
+  if (UNLIKELY(skip_concurrency_control_)) {
+    return s;
+  }
+  uint32_t cfh_id = GetColumnFamilyID(column_family);
+  std::string key_str = key.ToString();
+
+  PointLockStatus status;
+  bool lock_upgrade;
+  bool previously_locked;
+  if (tracked_locks_->IsPointLockSupported()) {
+    status = tracked_locks_->GetPointLockStatus(cfh_id, key_str);
+    previously_locked = status.locked;
+    lock_upgrade = previously_locked && exclusive && !status.exclusive;
+  } else {
+    // If the record is tracked, we can assume it was locked, too.
+    previously_locked = assume_tracked;
+    status.locked = false;
+    lock_upgrade = false;
+  }
+
+  // Lock this key if this transaction hasn't already locked it or we require
+  // an upgrade.
+  if (!previously_locked || lock_upgrade) {
+    s = txn_db_impl_->TryLock(this, cfh_id, key_str, exclusive);
+  }
+
+  const ColumnFamilyHandle* const cfh =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+  assert(cfh);
+  const Comparator* const ucmp = cfh->GetComparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+
+  SetSnapshotIfNeeded();
+
+  // Even though we do not care about doing conflict checking for this write,
+  // we still need to take a lock to make sure we do not cause a conflict with
+  // some other write. However, we do not need to check if there have been
+  // any writes since this transaction's snapshot.
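+  // For instance, a PutUntracked() (do_validate=false) on a key still takes
+  // the point lock so a concurrent transaction cannot write the same key,
+  // but it skips the ValidateSnapshot() call performed in the else-branch
+  // below (illustrative summary of the two paths).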
+  // TODO(agiardullo): could optimize by supporting shared txn locks in the
+  // future.
+  SequenceNumber tracked_at_seq =
+      status.locked ? status.seq : kMaxSequenceNumber;
+  if (!do_validate || (snapshot_ == nullptr &&
+                       (0 == ts_sz || kMaxTxnTimestamp == read_timestamp_))) {
+    if (assume_tracked && !previously_locked &&
+        tracked_locks_->IsPointLockSupported()) {
+      s = Status::InvalidArgument(
+          "assume_tracked is set but it is not tracked yet");
+    }
+    // Need to remember the earliest sequence number that we know that this
+    // key has not been modified after. This is useful if this same
+    // transaction later tries to lock this key again.
+    if (tracked_at_seq == kMaxSequenceNumber) {
+      // Since we haven't checked a snapshot, we only know this key has not
+      // been modified since after we locked it.
+      // Note: when last_seq_same_as_publish_seq_==false this is less than the
+      // latest allocated seq but it is ok since i) this is just a heuristic
+      // used only as a hint to avoid an actual check for conflicts, and
+      // ii) this would cause a false positive only if the snapshot is taken
+      // right after the lock, which would be an unusual sequence.
+      tracked_at_seq = db_->GetLatestSequenceNumber();
+    }
+  } else if (s.ok()) {
+    // If a snapshot is set, we need to make sure the key hasn't been modified
+    // since the snapshot. This must be done after we locked the key.
+    // If we have already validated an earlier snapshot, it must have been
+    // reflected in tracked_at_seq and ValidateSnapshot will return OK.
+    s = ValidateSnapshot(column_family, key, &tracked_at_seq);
+
+    if (!s.ok()) {
+      // Failed to validate key
+      // Unlock key we just locked
+      if (lock_upgrade) {
+        s = txn_db_impl_->TryLock(this, cfh_id, key_str, false /* exclusive */);
+        assert(s.ok());
+      } else if (!previously_locked) {
+        txn_db_impl_->UnLock(this, cfh_id, key.ToString());
+      }
+    }
+  }
+
+  if (s.ok()) {
+    // We must track all the locked keys so that we can unlock them later. If
+    // the key is already locked, this func will update some stats on the
+    // tracked key. It could also update the tracked_at_seq if it is lower
+    // than the existing tracked key seq. These stats are necessary for
+    // RollbackToSavePoint to determine whether a key can be safely removed
+    // from tracked_keys_. Removal can only be done if a key was only locked
+    // during the current savepoint.
+    //
+    // Recall that if assume_tracked is true, we assume that TrackKey has been
+    // called previously since the last savepoint, with the same exclusive
+    // setting, and at a lower sequence number, so skipping here should be
+    // safe.
+    if (!assume_tracked) {
+      TrackKey(cfh_id, key_str, tracked_at_seq, read_only, exclusive);
+    } else {
+#ifndef NDEBUG
+      if (tracked_locks_->IsPointLockSupported()) {
+        PointLockStatus lock_status =
+            tracked_locks_->GetPointLockStatus(cfh_id, key_str);
+        assert(lock_status.locked);
+        assert(lock_status.seq <= tracked_at_seq);
+        assert(lock_status.exclusive == exclusive);
+      }
+#endif
+    }
+  }
+
+  return s;
+}
+
+Status PessimisticTransaction::GetRangeLock(ColumnFamilyHandle* column_family,
+                                            const Endpoint& start_endp,
+                                            const Endpoint& end_endp) {
+  ColumnFamilyHandle* cfh =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+  uint32_t cfh_id = GetColumnFamilyID(cfh);
+
+  Status s = txn_db_impl_->TryRangeLock(this, cfh_id, start_endp, end_endp);
+
+  if (s.ok()) {
+    RangeLockRequest req{cfh_id, start_endp, end_endp};
+    tracked_locks_->Track(req);
+  }
+  return s;
+}
+
+// Return OK() if this key has not been modified more recently than the
+// transaction snapshot_.
+// tracked_at_seq is the global seq at which we either locked the key or
+// already have done ValidateSnapshot.
+Status PessimisticTransaction::ValidateSnapshot(
+    ColumnFamilyHandle* column_family, const Slice& key,
+    SequenceNumber* tracked_at_seq) {
+  assert(snapshot_ || read_timestamp_ < kMaxTxnTimestamp);
+
+  SequenceNumber snap_seq = 0;
+  if (snapshot_) {
+    snap_seq = snapshot_->GetSequenceNumber();
+    if (*tracked_at_seq <= snap_seq) {
+      // If the key has been previously validated (or locked) at a sequence
+      // number earlier than the current snapshot's sequence number, we
+      // already know it has not been modified after snap_seq either.
+      return Status::OK();
+    }
+  } else {
+    snap_seq = db_impl_->GetLatestSequenceNumber();
+  }
+
+  // Otherwise we have either
+  // 1: tracked_at_seq == kMaxSequenceNumber, i.e., first time tracking the key
+  // 2: snap_seq < tracked_at_seq: the last time we locked the key was via
+  // do_validate=false, which means we had skipped ValidateSnapshot. In both
+  // cases we should do ValidateSnapshot now.
+
+  *tracked_at_seq = snap_seq;
+
+  ColumnFamilyHandle* cfh =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+
+  assert(cfh);
+  const Comparator* const ucmp = cfh->GetComparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+  std::string ts_buf;
+  if (ts_sz > 0 && read_timestamp_ < kMaxTxnTimestamp) {
+    assert(ts_sz == sizeof(read_timestamp_));
+    PutFixed64(&ts_buf, read_timestamp_);
+  }
+
+  return TransactionUtil::CheckKeyForConflicts(
+      db_impl_, cfh, key.ToString(), snap_seq, ts_sz == 0 ?
 nullptr : &ts_buf,
+      false /* cache_only */);
+}
+
+bool PessimisticTransaction::TryStealingLocks() {
+  assert(IsExpired());
+  TransactionState expected = STARTED;
+  return std::atomic_compare_exchange_strong(&txn_state_, &expected,
+                                             LOCKS_STOLEN);
+}
+
+void PessimisticTransaction::UnlockGetForUpdate(
+    ColumnFamilyHandle* column_family, const Slice& key) {
+  txn_db_impl_->UnLock(this, GetColumnFamilyID(column_family), key.ToString());
+}
+
+Status PessimisticTransaction::SetName(const TransactionName& name) {
+  Status s;
+  if (txn_state_ == STARTED) {
+    if (name_.length()) {
+      s = Status::InvalidArgument("Transaction has already been named.");
+    } else if (txn_db_impl_->GetTransactionByName(name) != nullptr) {
+      s = Status::InvalidArgument("Transaction name must be unique.");
+    } else if (name.length() < 1 || name.length() > 512) {
+      s = Status::InvalidArgument(
+          "Transaction name length must be between 1 and 512 chars.");
+    } else {
+      name_ = name;
+      txn_db_impl_->RegisterTransaction(this);
+    }
+  } else {
+    s = Status::InvalidArgument("Transaction is beyond state for naming.");
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction.h
new file mode 100644
index 0000000000..ffff50974a
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction.h
@@ -0,0 +1,311 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+
+#include <algorithm>
+#include <atomic>
+#include <mutex>
+#include <set>
+#include <stack>
+#include <string>
+#include <unordered_set>
+
+#include "db/write_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/autovector.h"
+#include "utilities/transactions/transaction_base.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PessimisticTransactionDB;
+
+// A transaction under pessimistic concurrency control. This class implements
+// the locking API and interfaces with the lock manager as well as the
+// pessimistic transactional db.
+class PessimisticTransaction : public TransactionBaseImpl {
+ public:
+  PessimisticTransaction(TransactionDB* db, const WriteOptions& write_options,
+                         const TransactionOptions& txn_options,
+                         const bool init = true);
+  // No copying allowed
+  PessimisticTransaction(const PessimisticTransaction&) = delete;
+  void operator=(const PessimisticTransaction&) = delete;
+
+  ~PessimisticTransaction() override;
+
+  void Reinitialize(TransactionDB* txn_db, const WriteOptions& write_options,
+                    const TransactionOptions& txn_options);
+
+  Status Prepare() override;
+
+  Status Commit() override;
+
+  // It is basically Commit without going through the Prepare phase. The write
+  // batch is also provided directly, instead of expecting the txn to
+  // gradually accumulate the transaction's writes in an internal write batch.
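+  // Illustrative use (sketch; `txn` is a PessimisticTransaction with no
+  // pending writes of its own):
+  //
+  //   WriteBatch batch;
+  //   batch.Put("key", "value");
+  //   Status s = txn->CommitBatch(&batch);  // locks keys, writes, unlocks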
+  Status CommitBatch(WriteBatch* batch);
+
+  Status Rollback() override;
+
+  Status RollbackToSavePoint() override;
+
+  Status SetName(const TransactionName& name) override;
+
+  // Generate a new unique transaction identifier
+  static TransactionID GenTxnID();
+
+  TransactionID GetID() const override { return txn_id_; }
+
+  std::vector<TransactionID> GetWaitingTxns(uint32_t* column_family_id,
+                                            std::string* key) const override {
+    std::lock_guard<std::mutex> lock(wait_mutex_);
+    std::vector<TransactionID> ids(waiting_txn_ids_.size());
+    if (key) *key = waiting_key_ ? *waiting_key_ : "";
+    if (column_family_id) *column_family_id = waiting_cf_id_;
+    std::copy(waiting_txn_ids_.begin(), waiting_txn_ids_.end(), ids.begin());
+    return ids;
+  }
+
+  void SetWaitingTxn(autovector<TransactionID> ids, uint32_t column_family_id,
+                     const std::string* key) {
+    std::lock_guard<std::mutex> lock(wait_mutex_);
+    waiting_txn_ids_ = ids;
+    waiting_cf_id_ = column_family_id;
+    waiting_key_ = key;
+  }
+
+  void ClearWaitingTxn() {
+    std::lock_guard<std::mutex> lock(wait_mutex_);
+    waiting_txn_ids_.clear();
+    waiting_cf_id_ = 0;
+    waiting_key_ = nullptr;
+  }
+
+  // Returns the time (in microseconds according to Env->GetMicros())
+  // at which this transaction will expire. Returns 0 if this transaction does
+  // not expire.
+  uint64_t GetExpirationTime() const { return expiration_time_; }
+
+  // Returns true if this transaction has an expiration_time and has expired.
+  bool IsExpired() const;
+
+  // Returns the number of microseconds a transaction can wait on acquiring a
+  // lock or -1 if there is no timeout.
+  int64_t GetLockTimeout() const { return lock_timeout_; }
+  void SetLockTimeout(int64_t timeout) override {
+    lock_timeout_ = timeout * 1000;
+  }
+
+  // Returns true if locks were stolen successfully, false otherwise.
+  bool TryStealingLocks();
+
+  bool IsDeadlockDetect() const override { return deadlock_detect_; }
+
+  int64_t GetDeadlockDetectDepth() const { return deadlock_detect_depth_; }
+
+  virtual Status GetRangeLock(ColumnFamilyHandle* column_family,
+                              const Endpoint& start_key,
+                              const Endpoint& end_key) override;
+
+ protected:
+  // Refer to
+  // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery
+  bool use_only_the_last_commit_time_batch_for_recovery_ = false;
+  // Refer to
+  // TransactionOptions::skip_prepare
+  bool skip_prepare_ = false;
+
+  virtual Status PrepareInternal() = 0;
+
+  virtual Status CommitWithoutPrepareInternal() = 0;
+
+  // batch_cnt, if non-zero, is the number of sub-batches. A sub-batch is a
+  // batch with no duplicate keys. If zero, then the number of sub-batches is
+  // unknown.
+  virtual Status CommitBatchInternal(WriteBatch* batch,
+                                     size_t batch_cnt = 0) = 0;
+
+  virtual Status CommitInternal() = 0;
+
+  virtual Status RollbackInternal() = 0;
+
+  virtual void Initialize(const TransactionOptions& txn_options);
+
+  Status LockBatch(WriteBatch* batch, LockTracker* keys_to_unlock);
+
+  Status TryLock(ColumnFamilyHandle* column_family, const Slice& key,
+                 bool read_only, bool exclusive, const bool do_validate = true,
+                 const bool assume_tracked = false) override;
+
+  void Clear() override;
+
+  PessimisticTransactionDB* txn_db_impl_;
+  DBImpl* db_impl_;
+
+  // If non-zero, this transaction should not be committed after this time (in
+  // microseconds according to Env->NowMicros())
+  uint64_t expiration_time_;
+
+  // Timestamp used by the transaction to perform all GetForUpdate.
+  // Use this timestamp for conflict checking.
+  // read_timestamp_ == kMaxTxnTimestamp means this transaction has not
+  // performed any GetForUpdate.
It is possible that the transaction has + // performed blind writes or Get, though. + TxnTimestamp read_timestamp_{kMaxTxnTimestamp}; + TxnTimestamp commit_timestamp_{kMaxTxnTimestamp}; + + private: + friend class TransactionTest_ValidateSnapshotTest_Test; + // Used to create unique ids for transactions. + static std::atomic txn_id_counter_; + + // Unique ID for this transaction + TransactionID txn_id_; + + // IDs for the transactions that are blocking the current transaction. + // + // empty if current transaction is not waiting. + autovector waiting_txn_ids_; + + // The following two represents the (cf, key) that a transaction is waiting + // on. + // + // If waiting_key_ is not null, then the pointer should always point to + // a valid string object. The reason is that it is only non-null when the + // transaction is blocked in the PointLockManager::AcquireWithTimeout + // function. At that point, the key string object is one of the function + // parameters. + uint32_t waiting_cf_id_; + const std::string* waiting_key_; + + // Mutex protecting waiting_txn_ids_, waiting_cf_id_ and waiting_key_. + mutable std::mutex wait_mutex_; + + // Timeout in microseconds when locking a key or -1 if there is no timeout. + int64_t lock_timeout_; + + // Whether to perform deadlock detection or not. + bool deadlock_detect_; + + // Whether to perform deadlock detection or not. + int64_t deadlock_detect_depth_; + + // Refer to TransactionOptions::skip_concurrency_control + bool skip_concurrency_control_; + + virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq); + + void UnlockGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) override; +}; + +class WriteCommittedTxn : public PessimisticTransaction { + public: + WriteCommittedTxn(TransactionDB* db, const WriteOptions& write_options, + const TransactionOptions& txn_options); + // No copying allowed + WriteCommittedTxn(const WriteCommittedTxn&) = delete; + void operator=(const WriteCommittedTxn&) = delete; + + ~WriteCommittedTxn() override {} + + using TransactionBaseImpl::GetForUpdate; + Status GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool exclusive, + const bool do_validate) override; + Status GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val, bool exclusive, + const bool do_validate) override; + + using TransactionBaseImpl::Put; + // `key` does NOT include timestamp even when it's enabled. + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) override; + Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::PutUntracked; + Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + + using TransactionBaseImpl::Delete; + // `key` does NOT include timestamp even when it's enabled. 
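+  // E.g., with a timestamp-aware column family (sketch; `cfh` and the
+  // timestamp value are illustrative):
+  //
+  //   txn->Delete(cfh, "k");          // "k" carries no timestamp here
+  //   txn->SetCommitTimestamp(100);   // timestamp is applied at commit
+  //   txn->Commit();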
+ Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + Status Delete(ColumnFamilyHandle* column_family, const SliceParts& key, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::DeleteUntracked; + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + + using TransactionBaseImpl::SingleDelete; + // `key` does NOT include timestamp even when it's enabled. + Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + Status SingleDelete(ColumnFamilyHandle* column_family, const SliceParts& key, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::SingleDeleteUntracked; + Status SingleDeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + + using TransactionBaseImpl::Merge; + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) override; + + Status SetReadTimestampForValidation(TxnTimestamp ts) override; + Status SetCommitTimestamp(TxnTimestamp ts) override; + TxnTimestamp GetCommitTimestamp() const override { return commit_timestamp_; } + + private: + template + Status GetForUpdateImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + TValue* value, bool exclusive, + const bool do_validate); + + template + Status Operate(ColumnFamilyHandle* column_family, const TKey& key, + const bool do_validate, const bool assume_tracked, + TOperation&& operation); + + Status PrepareInternal() override; + + Status CommitWithoutPrepareInternal() override; + + Status CommitBatchInternal(WriteBatch* batch, size_t batch_cnt) override; + + Status CommitInternal() override; + + Status RollbackInternal() override; + + // Column families that enable timestamps and whose data are written when + // indexing_enabled_ is false. If a key is written when indexing_enabled_ is + // true, then the corresponding column family is not added to cfs_with_ts + // even if it enables timestamp. + std::unordered_set cfs_with_ts_tracked_when_indexing_disabled_; +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction_db.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction_db.cc new file mode 100644 index 0000000000..8009bef197 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction_db.cc @@ -0,0 +1,780 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ + +#include "utilities/transactions/pessimistic_transaction_db.h" + +#include +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/mutexlock.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/transaction_db_mutex_impl.h" +#include "utilities/transactions/write_prepared_txn_db.h" +#include "utilities/transactions/write_unprepared_txn_db.h" + +namespace ROCKSDB_NAMESPACE { + +PessimisticTransactionDB::PessimisticTransactionDB( + DB* db, const TransactionDBOptions& txn_db_options) + : TransactionDB(db), + db_impl_(static_cast_with_check(db)), + txn_db_options_(txn_db_options), + lock_manager_(NewLockManager(this, txn_db_options)) { + assert(db_impl_ != nullptr); + info_log_ = db_impl_->GetDBOptions().info_log; +} + +// Support initiliazing PessimisticTransactionDB from a stackable db +// +// PessimisticTransactionDB +// ^ ^ +// | | +// | + +// | StackableDB +// | ^ +// | | +// + + +// DBImpl +// ^ +// |(inherit) +// + +// DB +// +PessimisticTransactionDB::PessimisticTransactionDB( + StackableDB* db, const TransactionDBOptions& txn_db_options) + : TransactionDB(db), + db_impl_(static_cast_with_check(db->GetRootDB())), + txn_db_options_(txn_db_options), + lock_manager_(NewLockManager(this, txn_db_options)) { + assert(db_impl_ != nullptr); +} + +PessimisticTransactionDB::~PessimisticTransactionDB() { + while (!transactions_.empty()) { + delete transactions_.begin()->second; + // TODO(myabandeh): this seems to be an unsafe approach as it is not quite + // clear whether delete would also remove the entry from transactions_. + } +} + +Status PessimisticTransactionDB::VerifyCFOptions( + const ColumnFamilyOptions& cf_options) { + const Comparator* const ucmp = cf_options.comparator; + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (0 == ts_sz) { + return Status::OK(); + } + if (ts_sz != sizeof(TxnTimestamp)) { + std::ostringstream oss; + oss << "Timestamp of transaction must have " << sizeof(TxnTimestamp) + << " bytes. CF comparator " << std::string(ucmp->Name()) + << " timestamp size is " << ts_sz << " bytes"; + return Status::InvalidArgument(oss.str()); + } + if (txn_db_options_.write_policy != WRITE_COMMITTED) { + return Status::NotSupported("Only WriteCommittedTxn supports timestamp"); + } + return Status::OK(); +} + +Status PessimisticTransactionDB::Initialize( + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles) { + for (auto cf_ptr : handles) { + AddColumnFamily(cf_ptr); + } + // Verify cf options + for (auto handle : handles) { + ColumnFamilyDescriptor cfd; + Status s = handle->GetDescriptor(&cfd); + if (!s.ok()) { + return s; + } + s = VerifyCFOptions(cfd.options); + if (!s.ok()) { + return s; + } + } + + // Re-enable compaction for the column families that initially had + // compaction enabled. 
+  std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
+  compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
+  for (auto index : compaction_enabled_cf_indices) {
+    compaction_enabled_cf_handles.push_back(handles[index]);
+  }
+
+  Status s = EnableAutoCompaction(compaction_enabled_cf_handles);
+
+  // create 'real' transactions from recovered shell transactions
+  auto dbimpl = static_cast_with_check<DBImpl>(GetRootDB());
+  assert(dbimpl != nullptr);
+  auto rtrxs = dbimpl->recovered_transactions();
+
+  for (auto it = rtrxs.begin(); it != rtrxs.end(); ++it) {
+    auto recovered_trx = it->second;
+    assert(recovered_trx);
+    assert(recovered_trx->batches_.size() == 1);
+    const auto& seq = recovered_trx->batches_.begin()->first;
+    const auto& batch_info = recovered_trx->batches_.begin()->second;
+    assert(batch_info.log_number_);
+    assert(recovered_trx->name_.length());
+
+    WriteOptions w_options;
+    w_options.sync = true;
+    TransactionOptions t_options;
+    // This helps avoid deadlocks for keys that, although they exist in the
+    // WAL, did not go through concurrency control. This includes the merge
+    // that MyRocks uses for auto-inc columns. It is safe to do so, since (i)
+    // if there is a conflict between the keys of two transactions that must
+    // be avoided, it is already avoided by the application, MyRocks, before
+    // the restart, and (ii) the application, MyRocks, guarantees to
+    // rollback/commit the recovered transactions before new transactions
+    // start.
+    t_options.skip_concurrency_control = true;
+
+    Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr);
+    assert(real_trx);
+    real_trx->SetLogNumber(batch_info.log_number_);
+    assert(seq != kMaxSequenceNumber);
+    if (GetTxnDBOptions().write_policy != WRITE_COMMITTED) {
+      real_trx->SetId(seq);
+    }
+
+    s = real_trx->SetName(recovered_trx->name_);
+    if (!s.ok()) {
+      break;
+    }
+
+    s = real_trx->RebuildFromWriteBatch(batch_info.batch_);
+    // WriteCommitted sets this to 0 to disable this check, which is specific
+    // to WritePrepared txns
+    assert(batch_info.batch_cnt_ == 0 ||
+           real_trx->GetWriteBatch()->SubBatchCnt() == batch_info.batch_cnt_);
+    real_trx->SetState(Transaction::PREPARED);
+    if (!s.ok()) {
+      break;
+    }
+  }
+  if (s.ok()) {
+    dbimpl->DeleteAllRecoveredTransactions();
+  }
+  return s;
+}
+
+Transaction* WriteCommittedTxnDB::BeginTransaction(
+    const WriteOptions& write_options, const TransactionOptions& txn_options,
+    Transaction* old_txn) {
+  if (old_txn != nullptr) {
+    ReinitializeTransaction(old_txn, write_options, txn_options);
+    return old_txn;
+  } else {
+    return new WriteCommittedTxn(this, write_options, txn_options);
+  }
+}
+
+TransactionDBOptions PessimisticTransactionDB::ValidateTxnDBOptions(
+    const TransactionDBOptions& txn_db_options) {
+  TransactionDBOptions validated = txn_db_options;
+
+  if (txn_db_options.num_stripes == 0) {
+    validated.num_stripes = 1;
+  }
+
+  return validated;
+}
+
+Status TransactionDB::Open(const Options& options,
+                           const TransactionDBOptions& txn_db_options,
+                           const std::string& dbname, TransactionDB** dbptr) {
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = TransactionDB::Open(db_options, txn_db_options, dbname,
+                                 column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // We can delete the handle since DBImpl is always holding a reference to
+    // the default column family.
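+    // Callers that open their own column families follow the same pattern,
+    // e.g. (sketch; `db_opts`, `txn_opts`, and `cf_descs` are hypothetical):
+    //
+    //   std::vector<ColumnFamilyHandle*> handles;
+    //   TransactionDB::Open(db_opts, txn_opts, dbname, cf_descs, &handles,
+    //                       &txn_db);
+    //   delete handles[0];  // default CF handle; DBImpl keeps its own ref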
delete handles[0]; + } + + return s; +} + +Status TransactionDB::Open( + const DBOptions& db_options, const TransactionDBOptions& txn_db_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, TransactionDB** dbptr) { + Status s; + DB* db = nullptr; + if (txn_db_options.write_policy == WRITE_COMMITTED && + db_options.unordered_write) { + return Status::NotSupported( + "WRITE_COMMITTED is incompatible with unordered_writes"); + } + if (txn_db_options.write_policy == WRITE_UNPREPARED && + db_options.unordered_write) { + // TODO(lth): support it + return Status::NotSupported( + "WRITE_UNPREPARED is currently incompatible with unordered_writes"); + } + if (txn_db_options.write_policy == WRITE_PREPARED && + db_options.unordered_write && !db_options.two_write_queues) { + return Status::NotSupported( + "WRITE_PREPARED is incompatible with unordered_writes if " + "two_write_queues is not enabled."); + } + + std::vector column_families_copy = column_families; + std::vector compaction_enabled_cf_indices; + DBOptions db_options_2pc = db_options; + PrepareWrap(&db_options_2pc, &column_families_copy, + &compaction_enabled_cf_indices); + const bool use_seq_per_batch = + txn_db_options.write_policy == WRITE_PREPARED || + txn_db_options.write_policy == WRITE_UNPREPARED; + const bool use_batch_per_txn = + txn_db_options.write_policy == WRITE_COMMITTED || + txn_db_options.write_policy == WRITE_PREPARED; + s = DBImpl::Open(db_options_2pc, dbname, column_families_copy, handles, &db, + use_seq_per_batch, use_batch_per_txn); + if (s.ok()) { + ROCKS_LOG_WARN(db->GetDBOptions().info_log, + "Transaction write_policy is %" PRId32, + static_cast(txn_db_options.write_policy)); + // if WrapDB return non-ok, db will be deleted in WrapDB() via + // ~StackableDB(). + s = WrapDB(db, txn_db_options, compaction_enabled_cf_indices, *handles, + dbptr); + } + return s; +} + +void TransactionDB::PrepareWrap( + DBOptions* db_options, std::vector* column_families, + std::vector* compaction_enabled_cf_indices) { + compaction_enabled_cf_indices->clear(); + + // Enable MemTable History if not already enabled + for (size_t i = 0; i < column_families->size(); i++) { + ColumnFamilyOptions* cf_options = &(*column_families)[i].options; + + if (cf_options->max_write_buffer_size_to_maintain == 0 && + cf_options->max_write_buffer_number_to_maintain == 0) { + // Setting to -1 will set the History size to + // max_write_buffer_number * write_buffer_size. + cf_options->max_write_buffer_size_to_maintain = -1; + } + if (!cf_options->disable_auto_compactions) { + // Disable compactions momentarily to prevent race with DB::Open + cf_options->disable_auto_compactions = true; + compaction_enabled_cf_indices->push_back(i); + } + } + db_options->allow_2pc = true; +} + +namespace { +template +Status WrapAnotherDBInternal( + DBType* db, const TransactionDBOptions& txn_db_options, + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles, TransactionDB** dbptr) { + assert(db != nullptr); + assert(dbptr != nullptr); + *dbptr = nullptr; + std::unique_ptr txn_db; + // txn_db owns object pointed to by the raw db pointer. 
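+  // Ownership flow, roughly:
+  //
+  //   db (raw) --> txn_db (unique_ptr, via StackableDB) --> released to
+  //   *dbptr on success; on failure ~StackableDB() deletes db.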
+  switch (txn_db_options.write_policy) {
+    case WRITE_UNPREPARED:
+      txn_db.reset(new WriteUnpreparedTxnDB(
+          db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)));
+      break;
+    case WRITE_PREPARED:
+      txn_db.reset(new WritePreparedTxnDB(
+          db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)));
+      break;
+    case WRITE_COMMITTED:
+    default:
+      txn_db.reset(new WriteCommittedTxnDB(
+          db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)));
+  }
+  txn_db->UpdateCFComparatorMap(handles);
+  Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles);
+  // In case of a failure at this point, db is deleted via the txn_db
+  // destructor and set to nullptr.
+  if (s.ok()) {
+    *dbptr = txn_db.release();
+  } else {
+    for (auto* h : handles) {
+      delete h;
+    }
+    // txn_db still owns db, and ~StackableDB() will be called when txn_db goes
+    // out of scope, deleting the input db pointer.
+    ROCKS_LOG_FATAL(db->GetDBOptions().info_log,
+                    "Failed to initialize txn_db: %s", s.ToString().c_str());
+  }
+  return s;
+}
+}  // namespace
+
+Status TransactionDB::WrapDB(
+    // make sure this db is already opened with memtable history enabled,
+    // auto compaction disabled and 2 phase commit enabled
+    DB* db, const TransactionDBOptions& txn_db_options,
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr) {
+  return WrapAnotherDBInternal(db, txn_db_options,
+                               compaction_enabled_cf_indices, handles, dbptr);
+}
+
+Status TransactionDB::WrapStackableDB(
+    // make sure this stackable_db is already opened with memtable history
+    // enabled, auto compaction disabled and 2 phase commit enabled
+    StackableDB* db, const TransactionDBOptions& txn_db_options,
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr) {
+  return WrapAnotherDBInternal(db, txn_db_options,
+                               compaction_enabled_cf_indices, handles, dbptr);
+}
+
+// Let LockManager know that this column family exists so it can
+// allocate a LockMap for it.
+void PessimisticTransactionDB::AddColumnFamily( + const ColumnFamilyHandle* handle) { + lock_manager_->AddColumnFamily(handle); +} + +Status PessimisticTransactionDB::CreateColumnFamily( + const ColumnFamilyOptions& options, const std::string& column_family_name, + ColumnFamilyHandle** handle) { + InstrumentedMutexLock l(&column_family_mutex_); + Status s = VerifyCFOptions(options); + if (!s.ok()) { + return s; + } + + s = db_->CreateColumnFamily(options, column_family_name, handle); + if (s.ok()) { + lock_manager_->AddColumnFamily(*handle); + UpdateCFComparatorMap(*handle); + } + + return s; +} + +Status PessimisticTransactionDB::CreateColumnFamilies( + const ColumnFamilyOptions& options, + const std::vector& column_family_names, + std::vector* handles) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = VerifyCFOptions(options); + if (!s.ok()) { + return s; + } + + s = db_->CreateColumnFamilies(options, column_family_names, handles); + if (s.ok()) { + for (auto* handle : *handles) { + lock_manager_->AddColumnFamily(handle); + UpdateCFComparatorMap(handle); + } + } + + return s; +} + +Status PessimisticTransactionDB::CreateColumnFamilies( + const std::vector& column_families, + std::vector* handles) { + InstrumentedMutexLock l(&column_family_mutex_); + + for (auto& cf_desc : column_families) { + Status s = VerifyCFOptions(cf_desc.options); + if (!s.ok()) { + return s; + } + } + + Status s = db_->CreateColumnFamilies(column_families, handles); + if (s.ok()) { + for (auto* handle : *handles) { + lock_manager_->AddColumnFamily(handle); + UpdateCFComparatorMap(handle); + } + } + + return s; +} + +// Let LockManager know that it can deallocate the LockMap for this +// column family. +Status PessimisticTransactionDB::DropColumnFamily( + ColumnFamilyHandle* column_family) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = db_->DropColumnFamily(column_family); + if (s.ok()) { + lock_manager_->RemoveColumnFamily(column_family); + } + + return s; +} + +Status PessimisticTransactionDB::DropColumnFamilies( + const std::vector& column_families) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = db_->DropColumnFamilies(column_families); + if (s.ok()) { + for (auto* handle : column_families) { + lock_manager_->RemoveColumnFamily(handle); + } + } + + return s; +} + +Status PessimisticTransactionDB::TryLock(PessimisticTransaction* txn, + uint32_t cfh_id, + const std::string& key, + bool exclusive) { + return lock_manager_->TryLock(txn, cfh_id, key, GetEnv(), exclusive); +} + +Status PessimisticTransactionDB::TryRangeLock(PessimisticTransaction* txn, + uint32_t cfh_id, + const Endpoint& start_endp, + const Endpoint& end_endp) { + return lock_manager_->TryLock(txn, cfh_id, start_endp, end_endp, GetEnv(), + /*exclusive=*/true); +} + +void PessimisticTransactionDB::UnLock(PessimisticTransaction* txn, + const LockTracker& keys) { + lock_manager_->UnLock(txn, keys, GetEnv()); +} + +void PessimisticTransactionDB::UnLock(PessimisticTransaction* txn, + uint32_t cfh_id, const std::string& key) { + lock_manager_->UnLock(txn, cfh_id, key, GetEnv()); +} + +// Used when wrapping DB write operations in a transaction +Transaction* PessimisticTransactionDB::BeginInternalTransaction( + const WriteOptions& options) { + TransactionOptions txn_options; + Transaction* txn = BeginTransaction(options, txn_options, nullptr); + + // Use default timeout for non-transactional writes + txn->SetLockTimeout(txn_db_options_.default_lock_timeout); + return txn; +} + +// All user Put, Merge, 
Delete, and Write requests must be intercepted to make +// sure that they lock all keys that they are writing to avoid causing conflicts +// with any concurrent transactions. The easiest way to do this is to wrap all +// write operations in a transaction. +// +// Put(), Merge(), and Delete() only lock a single key per call. Write() will +// sort its keys before locking them. This guarantees that TransactionDB write +// methods cannot deadlock with each other (but still could deadlock with a +// Transaction). +Status PessimisticTransactionDB::Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + Status s = FailIfCfEnablesTs(this, column_family); + if (!s.ok()) { + return s; + } + + Transaction* txn = BeginInternalTransaction(options); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do PutUntracked(). + s = txn->PutUntracked(column_family, key, val); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) { + Status s = FailIfCfEnablesTs(this, column_family); + if (!s.ok()) { + return s; + } + + Transaction* txn = BeginInternalTransaction(wopts); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // DeleteUntracked(). + s = txn->DeleteUntracked(column_family, key); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) { + Status s = FailIfCfEnablesTs(this, column_family); + if (!s.ok()) { + return s; + } + + Transaction* txn = BeginInternalTransaction(wopts); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // SingleDeleteUntracked(). + s = txn->SingleDeleteUntracked(column_family, key); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + Status s = FailIfCfEnablesTs(this, column_family); + if (!s.ok()) { + return s; + } + + Transaction* txn = BeginInternalTransaction(options); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // MergeUntracked(). 
+  s = txn->MergeUntracked(column_family, key, value);
+
+  if (s.ok()) {
+    s = txn->Commit();
+  }
+
+  delete txn;
+
+  return s;
+}
+
+Status PessimisticTransactionDB::Write(const WriteOptions& opts,
+                                       WriteBatch* updates) {
+  return WriteWithConcurrencyControl(opts, updates);
+}
+
+Status WriteCommittedTxnDB::Write(const WriteOptions& opts,
+                                  WriteBatch* updates) {
+  Status s = FailIfBatchHasTs(updates);
+  if (!s.ok()) {
+    return s;
+  }
+  if (txn_db_options_.skip_concurrency_control) {
+    return db_impl_->Write(opts, updates);
+  } else {
+    return WriteWithConcurrencyControl(opts, updates);
+  }
+}
+
+Status WriteCommittedTxnDB::Write(
+    const WriteOptions& opts,
+    const TransactionDBWriteOptimizations& optimizations,
+    WriteBatch* updates) {
+  Status s = FailIfBatchHasTs(updates);
+  if (!s.ok()) {
+    return s;
+  }
+  if (optimizations.skip_concurrency_control) {
+    return db_impl_->Write(opts, updates);
+  } else {
+    return WriteWithConcurrencyControl(opts, updates);
+  }
+}
+
+void PessimisticTransactionDB::InsertExpirableTransaction(
+    TransactionID tx_id, PessimisticTransaction* tx) {
+  assert(tx->GetExpirationTime() > 0);
+  std::lock_guard<std::mutex> lock(map_mutex_);
+  expirable_transactions_map_.insert({tx_id, tx});
+}
+
+void PessimisticTransactionDB::RemoveExpirableTransaction(
+    TransactionID tx_id) {
+  std::lock_guard<std::mutex> lock(map_mutex_);
+  expirable_transactions_map_.erase(tx_id);
+}
+
+bool PessimisticTransactionDB::TryStealingExpiredTransactionLocks(
+    TransactionID tx_id) {
+  std::lock_guard<std::mutex> lock(map_mutex_);
+
+  auto tx_it = expirable_transactions_map_.find(tx_id);
+  if (tx_it == expirable_transactions_map_.end()) {
+    return true;
+  }
+  PessimisticTransaction& tx = *(tx_it->second);
+  return tx.TryStealingLocks();
+}
+
+void PessimisticTransactionDB::ReinitializeTransaction(
+    Transaction* txn, const WriteOptions& write_options,
+    const TransactionOptions& txn_options) {
+  auto txn_impl = static_cast_with_check<PessimisticTransaction>(txn);
+
+  txn_impl->Reinitialize(this, write_options, txn_options);
+}
+
+Transaction* PessimisticTransactionDB::GetTransactionByName(
+    const TransactionName& name) {
+  std::lock_guard<std::mutex> lock(name_map_mutex_);
+  auto it = transactions_.find(name);
+  if (it == transactions_.end()) {
+    return nullptr;
+  } else {
+    return it->second;
+  }
+}
+
+void PessimisticTransactionDB::GetAllPreparedTransactions(
+    std::vector<Transaction*>* transv) {
+  assert(transv);
+  transv->clear();
+  std::lock_guard<std::mutex> lock(name_map_mutex_);
+  for (auto it = transactions_.begin(); it != transactions_.end(); ++it) {
+    if (it->second->GetState() == Transaction::PREPARED) {
+      transv->push_back(it->second);
+    }
+  }
+}
+
+LockManager::PointLockStatus PessimisticTransactionDB::GetLockStatusData() {
+  return lock_manager_->GetPointLockStatus();
+}
+
+std::vector<DeadlockPath> PessimisticTransactionDB::GetDeadlockInfoBuffer() {
+  return lock_manager_->GetDeadlockInfoBuffer();
+}
+
+void PessimisticTransactionDB::SetDeadlockInfoBufferSize(
+    uint32_t target_size) {
+  lock_manager_->Resize(target_size);
+}
+
+void PessimisticTransactionDB::RegisterTransaction(Transaction* txn) {
+  assert(txn);
+  assert(txn->GetName().length() > 0);
+  assert(GetTransactionByName(txn->GetName()) == nullptr);
+  assert(txn->GetState() == Transaction::STARTED);
+  std::lock_guard<std::mutex> lock(name_map_mutex_);
+  transactions_[txn->GetName()] = txn;
+}
+
+void PessimisticTransactionDB::UnregisterTransaction(Transaction* txn) {
+  assert(txn);
+  std::lock_guard<std::mutex> lock(name_map_mutex_);
+  auto it = transactions_.find(txn->GetName());
+  assert(it != transactions_.end());
+  transactions_.erase(it);
+}
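+
+// An illustrative sketch (editorial addition, not upstream RocksDB code) of
+// the caller-facing behavior implemented above: Put()/Delete()/Merge() on a
+// TransactionDB are transparently wrapped in an internal transaction, and
+// Write() locks the batch's keys unless the caller explicitly opts out of
+// concurrency control. The path "/tmp/txn_db_example" is a placeholder.
+#if 0
+#include "rocksdb/utilities/transaction_db.h"
+
+ROCKSDB_NAMESPACE::Status NonTransactionalWrites() {
+  using namespace ROCKSDB_NAMESPACE;
+  Options options;
+  options.create_if_missing = true;
+  TransactionDB* db = nullptr;
+  Status s = TransactionDB::Open(options, TransactionDBOptions(),
+                                 "/tmp/txn_db_example", &db);
+  if (!s.ok()) return s;
+
+  // Locks "key" via an internal transaction, commits, and releases the lock.
+  s = db->Put(WriteOptions(), "key", "value");
+
+  // Batch writes sort their keys before locking, so concurrent Write() calls
+  // cannot deadlock with each other. skip_concurrency_control bypasses the
+  // locking entirely when the caller guarantees no conflicting transactions.
+  WriteBatch batch;
+  batch.Put("a", "1");
+  batch.Delete("b");
+  TransactionDBWriteOptimizations optimizations;
+  optimizations.skip_concurrency_control = true;
+  if (s.ok()) s = db->Write(WriteOptions(), optimizations, &batch);
+
+  delete db;
+  return s;
+}
+#endif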
+
+std::pair<Status, std::shared_ptr<const Snapshot>>
+PessimisticTransactionDB::CreateTimestampedSnapshot(TxnTimestamp ts) {
+  if (kMaxTxnTimestamp == ts) {
+    return std::make_pair(Status::InvalidArgument("invalid ts"), nullptr);
+  }
+  assert(db_impl_);
+  return db_impl_->CreateTimestampedSnapshot(kMaxSequenceNumber, ts);
+}
+
+std::shared_ptr<const Snapshot>
+PessimisticTransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const {
+  assert(db_impl_);
+  return db_impl_->GetTimestampedSnapshot(ts);
+}
+
+void PessimisticTransactionDB::ReleaseTimestampedSnapshotsOlderThan(
+    TxnTimestamp ts) {
+  assert(db_impl_);
+  db_impl_->ReleaseTimestampedSnapshotsOlderThan(ts);
+}
+
+Status PessimisticTransactionDB::GetTimestampedSnapshots(
+    TxnTimestamp ts_lb, TxnTimestamp ts_ub,
+    std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots)
+    const {
+  assert(db_impl_);
+  return db_impl_->GetTimestampedSnapshots(ts_lb, ts_ub,
+                                           timestamped_snapshots);
+}
+
+Status SnapshotCreationCallback::operator()(SequenceNumber seq,
+                                            bool disable_memtable) {
+  assert(db_impl_);
+  assert(commit_ts_ != kMaxTxnTimestamp);
+
+  const bool two_write_queues =
+      db_impl_->immutable_db_options().two_write_queues;
+  assert(!two_write_queues || !disable_memtable);
+#ifdef NDEBUG
+  (void)two_write_queues;
+  (void)disable_memtable;
+#endif
+
+  const bool seq_per_batch = db_impl_->seq_per_batch();
+  if (!seq_per_batch) {
+    assert(db_impl_->GetLastPublishedSequence() <= seq);
+  } else {
+    assert(db_impl_->GetLastPublishedSequence() < seq);
+  }
+
+  // Create a snapshot which can also be used for write conflict checking.
+  auto ret = db_impl_->CreateTimestampedSnapshot(seq, commit_ts_);
+  snapshot_creation_status_ = ret.first;
+  snapshot_ = ret.second;
+  if (snapshot_creation_status_.ok()) {
+    assert(snapshot_);
+  } else {
+    assert(!snapshot_);
+  }
+  if (snapshot_ && snapshot_notifier_) {
+    snapshot_notifier_->SnapshotCreated(snapshot_.get());
+  }
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction_db.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction_db.h
new file mode 100644
index 0000000000..b662048bd4
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/pessimistic_transaction_db.h
@@ -0,0 +1,316 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
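+
+// Illustrative usage sketch (editorial addition, not upstream code) for the
+// timestamped-snapshot API implemented at the end of
+// pessimistic_transaction_db.cc above; `txn_db` is an assumed open
+// TransactionDB*:
+//
+//   // Take a snapshot tagged with timestamp 100.
+//   std::pair<Status, std::shared_ptr<const Snapshot>> res =
+//       txn_db->CreateTimestampedSnapshot(/*ts=*/100);
+//   if (res.first.ok()) {
+//     // Look the snapshot up again by timestamp and read from it.
+//     std::shared_ptr<const Snapshot> snap =
+//         txn_db->GetTimestampedSnapshot(/*ts=*/100);
+//     ReadOptions ro;
+//     ro.snapshot = snap.get();
+//     std::string value;
+//     txn_db->Get(ro, "key", &value).PermitUncheckedError();
+//     // Garbage-collect snapshots tagged with timestamps below 100.
+//     txn_db->ReleaseTimestampedSnapshotsOlderThan(/*ts=*/100);
+//   }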
+
+#pragma once
+
+#include <mutex>
+#include <queue>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/db_iter.h"
+#include "db/read_callback.h"
+#include "db/snapshot_checker.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+#include "utilities/transactions/lock/lock_manager.h"
+#include "utilities/transactions/lock/range/range_lock_manager.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/write_prepared_txn.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PessimisticTransactionDB : public TransactionDB {
+ public:
+  explicit PessimisticTransactionDB(DB* db,
+                                    const TransactionDBOptions& txn_db_options);
+
+  explicit PessimisticTransactionDB(StackableDB* db,
+                                    const TransactionDBOptions& txn_db_options);
+
+  virtual ~PessimisticTransactionDB();
+
+  virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); }
+
+  virtual Status Initialize(
+      const std::vector<size_t>& compaction_enabled_cf_indices,
+      const std::vector<ColumnFamilyHandle*>& handles);
+
+  Transaction* BeginTransaction(const WriteOptions& write_options,
+                                const TransactionOptions& txn_options,
+                                Transaction* old_txn) override = 0;
+
+  using StackableDB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& val) override;
+
+  using StackableDB::Delete;
+  virtual Status Delete(const WriteOptions& wopts,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override;
+
+  using StackableDB::SingleDelete;
+  virtual Status SingleDelete(const WriteOptions& wopts,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key) override;
+
+  using StackableDB::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) override;
+
+  using TransactionDB::Write;
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+  inline Status WriteWithConcurrencyControl(const WriteOptions& opts,
+                                            WriteBatch* updates) {
+    Status s;
+    if (opts.protection_bytes_per_key > 0) {
+      s = WriteBatchInternal::UpdateProtectionInfo(
+          updates, opts.protection_bytes_per_key);
+    }
+    if (s.ok()) {
+      // Need to lock all keys in this batch to prevent write conflicts with
+      // concurrent transactions.
+      Transaction* txn = BeginInternalTransaction(opts);
+      txn->DisableIndexing();
+
+      auto txn_impl = static_cast_with_check<PessimisticTransaction>(txn);
+
+      // Since commitBatch sorts the keys before locking, concurrent Write()
+      // operations will not cause a deadlock.
+      // In order to avoid a deadlock with a concurrent Transaction,
+      // Transactions should use a lock timeout.
+      s = txn_impl->CommitBatch(updates);
+
+      delete txn;
+    }
+
+    return s;
+  }
+
+  using StackableDB::CreateColumnFamily;
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle) override;
+
+  Status CreateColumnFamilies(
+      const ColumnFamilyOptions& options,
+      const std::vector<std::string>& column_family_names,
+      std::vector<ColumnFamilyHandle*>* handles) override;
+
+  Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles) override;
+
+  using StackableDB::DropColumnFamily;
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
+
+  Status DropColumnFamilies(
+      const std::vector<ColumnFamilyHandle*>& column_families) override;
+
+  Status TryLock(PessimisticTransaction* txn, uint32_t cfh_id,
+                 const std::string& key, bool exclusive);
+  Status TryRangeLock(PessimisticTransaction* txn, uint32_t cfh_id,
+                      const Endpoint& start_endp, const Endpoint& end_endp);
+
+  void UnLock(PessimisticTransaction* txn, const LockTracker& keys);
+  void UnLock(PessimisticTransaction* txn, uint32_t cfh_id,
+              const std::string& key);
+
+  void AddColumnFamily(const ColumnFamilyHandle* handle);
+
+  static TransactionDBOptions ValidateTxnDBOptions(
+      const TransactionDBOptions& txn_db_options);
+
+  const TransactionDBOptions& GetTxnDBOptions() const {
+    return txn_db_options_;
+  }
+
+  void InsertExpirableTransaction(TransactionID tx_id,
+                                  PessimisticTransaction* tx);
+  void RemoveExpirableTransaction(TransactionID tx_id);
+
+  // If the transaction is no longer available, locks can be stolen. If the
+  // transaction is available, try stealing locks directly from the
+  // transaction. It is the caller's responsibility to ensure that the
+  // referred transaction is expirable (GetExpirationTime() > 0) and that it
+  // is expired.
+  bool TryStealingExpiredTransactionLocks(TransactionID tx_id);
+
+  Transaction* GetTransactionByName(const TransactionName& name) override;
+
+  void RegisterTransaction(Transaction* txn);
+  void UnregisterTransaction(Transaction* txn);
+
+  // Not thread-safe. The current use case is during recovery (single thread).
+  void GetAllPreparedTransactions(std::vector<Transaction*>* trans) override;
+
+  LockManager::PointLockStatus GetLockStatusData() override;
+
+  std::vector<DeadlockPath> GetDeadlockInfoBuffer() override;
+  void SetDeadlockInfoBufferSize(uint32_t target_size) override;
+
+  // The default implementation does nothing. The actual implementation is
+  // moved to the child classes that actually need this information. This is
+  // due to an odd performance drop we observed when we added a std::atomic
+  // member to the base class, even when the subclasses do not read it in the
+  // fast path.
+  virtual void UpdateCFComparatorMap(const std::vector<ColumnFamilyHandle*>&) {
+  }
+  virtual void UpdateCFComparatorMap(ColumnFamilyHandle*) {}
+
+  // Use the returned factory to create LockTrackers in transactions.
+ const LockTrackerFactory& GetLockTrackerFactory() const { + return lock_manager_->GetLockTrackerFactory(); + } + + std::pair> CreateTimestampedSnapshot( + TxnTimestamp ts) override; + + std::shared_ptr GetTimestampedSnapshot( + TxnTimestamp ts) const override; + + void ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts) override; + + Status GetTimestampedSnapshots(TxnTimestamp ts_lb, TxnTimestamp ts_ub, + std::vector>& + timestamped_snapshots) const override; + + protected: + DBImpl* db_impl_; + std::shared_ptr info_log_; + const TransactionDBOptions txn_db_options_; + + static Status FailIfBatchHasTs(const WriteBatch* wb); + + static Status FailIfCfEnablesTs(const DB* db, + const ColumnFamilyHandle* column_family); + + void ReinitializeTransaction( + Transaction* txn, const WriteOptions& write_options, + const TransactionOptions& txn_options = TransactionOptions()); + + virtual Status VerifyCFOptions(const ColumnFamilyOptions& cf_options); + + private: + friend class WritePreparedTxnDB; + friend class WritePreparedTxnDBMock; + friend class WriteUnpreparedTxn; + friend class TransactionTest_DoubleCrashInRecovery_Test; + friend class TransactionTest_DoubleEmptyWrite_Test; + friend class TransactionTest_DuplicateKeys_Test; + friend class TransactionTest_PersistentTwoPhaseTransactionTest_Test; + friend class TransactionTest_TwoPhaseDoubleRecoveryTest_Test; + friend class TransactionTest_TwoPhaseOutOfOrderDelete_Test; + friend class TransactionStressTest_TwoPhaseLongPrepareTest_Test; + friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; + friend class WriteUnpreparedTransactionTest_MarkLogWithPrepSection_Test; + + Transaction* BeginInternalTransaction(const WriteOptions& options); + + std::shared_ptr lock_manager_; + + // Must be held when adding/dropping column families. + InstrumentedMutex column_family_mutex_; + + // Used to ensure that no locks are stolen from an expirable transaction + // that has started a commit. Only transactions with an expiration time + // should be in this map. + std::mutex map_mutex_; + std::unordered_map + expirable_transactions_map_; + + // map from name to two phase transaction instance + std::mutex name_map_mutex_; + std::unordered_map transactions_; + + // Signal that we are testing a crash scenario. Some asserts could be relaxed + // in such cases. + virtual void TEST_Crash() {} +}; + +// A PessimisticTransactionDB that writes the data to the DB after the commit. +// In this way the DB only contains the committed data. 
+class WriteCommittedTxnDB : public PessimisticTransactionDB { + public: + explicit WriteCommittedTxnDB(DB* db, + const TransactionDBOptions& txn_db_options) + : PessimisticTransactionDB(db, txn_db_options) {} + + explicit WriteCommittedTxnDB(StackableDB* db, + const TransactionDBOptions& txn_db_options) + : PessimisticTransactionDB(db, txn_db_options) {} + + virtual ~WriteCommittedTxnDB() {} + + Transaction* BeginTransaction(const WriteOptions& write_options, + const TransactionOptions& txn_options, + Transaction* old_txn) override; + + // Optimized version of ::Write that makes use of skip_concurrency_control + // hint + using TransactionDB::Write; + virtual Status Write(const WriteOptions& opts, + const TransactionDBWriteOptimizations& optimizations, + WriteBatch* updates) override; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; +}; + +inline Status PessimisticTransactionDB::FailIfBatchHasTs( + const WriteBatch* batch) { + if (batch != nullptr && WriteBatchInternal::HasKeyWithTimestamp(*batch)) { + return Status::NotSupported( + "Writes with timestamp must go through transaction API instead of " + "TransactionDB."); + } + return Status::OK(); +} + +inline Status PessimisticTransactionDB::FailIfCfEnablesTs( + const DB* db, const ColumnFamilyHandle* column_family) { + assert(db); + column_family = column_family ? column_family : db->DefaultColumnFamily(); + assert(column_family); + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0) { + return Status::NotSupported( + "Write operation with user timestamp must go through the transaction " + "API instead of TransactionDB."); + } + return Status::OK(); +} + +class SnapshotCreationCallback : public PostMemTableCallback { + public: + explicit SnapshotCreationCallback( + DBImpl* dbi, TxnTimestamp commit_ts, + const std::shared_ptr& notifier, + std::shared_ptr& snapshot) + : db_impl_(dbi), + commit_ts_(commit_ts), + snapshot_notifier_(notifier), + snapshot_(snapshot) { + assert(db_impl_); + } + + ~SnapshotCreationCallback() override { + snapshot_creation_status_.PermitUncheckedError(); + } + + Status operator()(SequenceNumber seq, bool disable_memtable) override; + + private: + DBImpl* const db_impl_; + const TxnTimestamp commit_ts_; + std::shared_ptr snapshot_notifier_; + std::shared_ptr& snapshot_; + + Status snapshot_creation_status_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/snapshot_checker.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/snapshot_checker.cc new file mode 100644 index 0000000000..da363a12fb --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/snapshot_checker.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
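+
+// Illustrative sketch (editorial addition, not upstream code): how callers of
+// the SnapshotChecker implemented in this file typically interpret the three
+// possible results of CheckInSnapshot().
+#if 0
+void InterpretResult(ROCKSDB_NAMESPACE::SnapshotChecker* checker,
+                     ROCKSDB_NAMESPACE::SequenceNumber seq,
+                     ROCKSDB_NAMESPACE::SequenceNumber snapshot_seq) {
+  using ROCKSDB_NAMESPACE::SnapshotCheckerResult;
+  switch (checker->CheckInSnapshot(seq, snapshot_seq)) {
+    case SnapshotCheckerResult::kInSnapshot:
+      // The write at `seq` is visible to the snapshot.
+      break;
+    case SnapshotCheckerResult::kNotInSnapshot:
+      // The write is not committed with respect to the snapshot.
+      break;
+    case SnapshotCheckerResult::kSnapshotReleased:
+      // The snapshot is gone; visibility can no longer be decided against it.
+      break;
+  }
+}
+#endif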
+ +#include "db/snapshot_checker.h" + + +#include "port/lang.h" +#include "utilities/transactions/write_prepared_txn_db.h" + +namespace ROCKSDB_NAMESPACE { + + +WritePreparedSnapshotChecker::WritePreparedSnapshotChecker( + WritePreparedTxnDB* txn_db) + : txn_db_(txn_db){}; + +SnapshotCheckerResult WritePreparedSnapshotChecker::CheckInSnapshot( + SequenceNumber sequence, SequenceNumber snapshot_sequence) const { + bool snapshot_released = false; + // TODO(myabandeh): set min_uncommitted + bool in_snapshot = txn_db_->IsInSnapshot( + sequence, snapshot_sequence, kMinUnCommittedSeq, &snapshot_released); + if (snapshot_released) { + return SnapshotCheckerResult::kSnapshotReleased; + } + return in_snapshot ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; +} + + +DisableGCSnapshotChecker* DisableGCSnapshotChecker::Instance() { + STATIC_AVOID_DESTRUCTION(DisableGCSnapshotChecker, instance); + return &instance; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_base.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_base.cc new file mode 100644 index 0000000000..5963f7429f --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_base.cc @@ -0,0 +1,757 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#include "utilities/transactions/transaction_base.h" + +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "util/cast_util.h" +#include "util/string_util.h" +#include "utilities/transactions/lock/lock_tracker.h" + +namespace ROCKSDB_NAMESPACE { + +Status Transaction::CommitAndTryCreateSnapshot( + std::shared_ptr notifier, TxnTimestamp ts, + std::shared_ptr* snapshot) { + if (snapshot) { + snapshot->reset(); + } + TxnTimestamp commit_ts = GetCommitTimestamp(); + if (commit_ts == kMaxTxnTimestamp) { + if (ts == kMaxTxnTimestamp) { + return Status::InvalidArgument("Commit timestamp unset"); + } else { + const Status s = SetCommitTimestamp(ts); + if (!s.ok()) { + return s; + } + } + } else if (ts != kMaxTxnTimestamp) { + if (ts != commit_ts) { + // For now we treat this as error. + return Status::InvalidArgument("Different commit ts specified"); + } + } + SetSnapshotOnNextOperation(notifier); + Status s = Commit(); + if (!s.ok()) { + return s; + } + assert(s.ok()); + // If we reach here, we must return ok status for this function. 
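+  // Commit() has already succeeded at this point, so reporting a failure from
+  // here on would wrongly suggest to the caller that the commit itself
+  // failed. Any problem obtaining the snapshot below is therefore surfaced
+  // only through the optional out-parameter, never the return status.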
+ std::shared_ptr new_snapshot = GetTimestampedSnapshot(); + + if (snapshot) { + *snapshot = new_snapshot; + } + return Status::OK(); +} + +TransactionBaseImpl::TransactionBaseImpl( + DB* db, const WriteOptions& write_options, + const LockTrackerFactory& lock_tracker_factory) + : db_(db), + dbimpl_(static_cast_with_check(db)), + write_options_(write_options), + cmp_(GetColumnFamilyUserComparator(db->DefaultColumnFamily())), + lock_tracker_factory_(lock_tracker_factory), + start_time_(dbimpl_->GetSystemClock()->NowMicros()), + write_batch_(cmp_, 0, true, 0, write_options.protection_bytes_per_key), + tracked_locks_(lock_tracker_factory_.Create()), + commit_time_batch_(0 /* reserved_bytes */, 0 /* max_bytes */, + write_options.protection_bytes_per_key, + 0 /* default_cf_ts_sz */), + indexing_enabled_(true) { + assert(dynamic_cast(db_) != nullptr); + log_number_ = 0; + if (dbimpl_->allow_2pc()) { + InitWriteBatch(); + } +} + +TransactionBaseImpl::~TransactionBaseImpl() { + // Release snapshot if snapshot is set + SetSnapshotInternal(nullptr); +} + +void TransactionBaseImpl::Clear() { + save_points_.reset(nullptr); + write_batch_.Clear(); + commit_time_batch_.Clear(); + tracked_locks_->Clear(); + num_puts_ = 0; + num_deletes_ = 0; + num_merges_ = 0; + + if (dbimpl_->allow_2pc()) { + InitWriteBatch(); + } +} + +void TransactionBaseImpl::Reinitialize(DB* db, + const WriteOptions& write_options) { + Clear(); + ClearSnapshot(); + id_ = 0; + db_ = db; + name_.clear(); + log_number_ = 0; + write_options_ = write_options; + start_time_ = dbimpl_->GetSystemClock()->NowMicros(); + indexing_enabled_ = true; + cmp_ = GetColumnFamilyUserComparator(db_->DefaultColumnFamily()); + WriteBatchInternal::UpdateProtectionInfo( + write_batch_.GetWriteBatch(), write_options_.protection_bytes_per_key) + .PermitUncheckedError(); + WriteBatchInternal::UpdateProtectionInfo( + &commit_time_batch_, write_options_.protection_bytes_per_key) + .PermitUncheckedError(); +} + +void TransactionBaseImpl::SetSnapshot() { + const Snapshot* snapshot = dbimpl_->GetSnapshotForWriteConflictBoundary(); + SetSnapshotInternal(snapshot); +} + +void TransactionBaseImpl::SetSnapshotInternal(const Snapshot* snapshot) { + // Set a custom deleter for the snapshot_ SharedPtr as the snapshot needs to + // be released, not deleted when it is no longer referenced. 
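+  // In isolation the idiom is (editorial sketch): construct the shared_ptr
+  // with a deleter that releases rather than deletes, e.g.
+  //
+  //   std::shared_ptr<const Snapshot> sp(
+  //       snap, [db](const Snapshot* s) { db->ReleaseSnapshot(s); });
+  //
+  // The std::bind expression below is the equivalent formulation: it forwards
+  // the snapshot pointer as the first argument of ReleaseSnapshot and binds
+  // this and db_ as the remaining state.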
+ snapshot_.reset(snapshot, std::bind(&TransactionBaseImpl::ReleaseSnapshot, + this, std::placeholders::_1, db_)); + snapshot_needed_ = false; + snapshot_notifier_ = nullptr; +} + +void TransactionBaseImpl::SetSnapshotOnNextOperation( + std::shared_ptr notifier) { + snapshot_needed_ = true; + snapshot_notifier_ = notifier; +} + +void TransactionBaseImpl::SetSnapshotIfNeeded() { + if (snapshot_needed_) { + std::shared_ptr notifier = snapshot_notifier_; + SetSnapshot(); + if (notifier != nullptr) { + notifier->SnapshotCreated(GetSnapshot()); + } + } +} + +Status TransactionBaseImpl::TryLock(ColumnFamilyHandle* column_family, + const SliceParts& key, bool read_only, + bool exclusive, const bool do_validate, + const bool assume_tracked) { + size_t key_size = 0; + for (int i = 0; i < key.num_parts; ++i) { + key_size += key.parts[i].size(); + } + + std::string str; + str.reserve(key_size); + + for (int i = 0; i < key.num_parts; ++i) { + str.append(key.parts[i].data(), key.parts[i].size()); + } + + return TryLock(column_family, str, read_only, exclusive, do_validate, + assume_tracked); +} + +void TransactionBaseImpl::SetSavePoint() { + if (save_points_ == nullptr) { + save_points_.reset( + new std::stack>()); + } + save_points_->emplace(snapshot_, snapshot_needed_, snapshot_notifier_, + num_puts_, num_deletes_, num_merges_, + lock_tracker_factory_); + write_batch_.SetSavePoint(); +} + +Status TransactionBaseImpl::RollbackToSavePoint() { + if (save_points_ != nullptr && save_points_->size() > 0) { + // Restore saved SavePoint + TransactionBaseImpl::SavePoint& save_point = save_points_->top(); + snapshot_ = save_point.snapshot_; + snapshot_needed_ = save_point.snapshot_needed_; + snapshot_notifier_ = save_point.snapshot_notifier_; + num_puts_ = save_point.num_puts_; + num_deletes_ = save_point.num_deletes_; + num_merges_ = save_point.num_merges_; + + // Rollback batch + Status s = write_batch_.RollbackToSavePoint(); + assert(s.ok()); + + // Rollback any keys that were tracked since the last savepoint + tracked_locks_->Subtract(*save_point.new_locks_); + + save_points_->pop(); + + return s; + } else { + assert(write_batch_.RollbackToSavePoint().IsNotFound()); + return Status::NotFound(); + } +} + +Status TransactionBaseImpl::PopSavePoint() { + if (save_points_ == nullptr || save_points_->empty()) { + // No SavePoint yet. + assert(write_batch_.PopSavePoint().IsNotFound()); + return Status::NotFound(); + } + + assert(!save_points_->empty()); + // If there is another savepoint A below the current savepoint B, then A needs + // to inherit tracked_keys in B so that if we rollback to savepoint A, we + // remember to unlock keys in B. If there is no other savepoint below, then we + // can safely discard savepoint info. 
+ if (save_points_->size() == 1) { + save_points_->pop(); + } else { + TransactionBaseImpl::SavePoint top(lock_tracker_factory_); + std::swap(top, save_points_->top()); + save_points_->pop(); + + save_points_->top().new_locks_->Merge(*top.new_locks_); + } + + return write_batch_.PopSavePoint(); +} + +Status TransactionBaseImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = Get(read_options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; +} + +Status TransactionBaseImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val) { + return write_batch_.GetFromBatchAndDB(db_, read_options, column_family, key, + pinnable_val); +} + +Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value, + bool exclusive, + const bool do_validate) { + if (!do_validate && read_options.snapshot != nullptr) { + return Status::InvalidArgument( + "If do_validate is false then GetForUpdate with snapshot is not " + "defined."); + } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + Status s = + TryLock(column_family, key, true /* read_only */, exclusive, do_validate); + + if (s.ok() && value != nullptr) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + s = Get(read_options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + } + return s; +} + +Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, + PinnableSlice* pinnable_val, + bool exclusive, + const bool do_validate) { + if (!do_validate && read_options.snapshot != nullptr) { + return Status::InvalidArgument( + "If do_validate is false then GetForUpdate with snapshot is not " + "defined."); + } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + Status s = + TryLock(column_family, key, true /* read_only */, exclusive, do_validate); + + if (s.ok() && pinnable_val != nullptr) { + s = Get(read_options, column_family, key, pinnable_val); + } + return s; +} + +std::vector TransactionBaseImpl::MultiGet( + const ReadOptions& read_options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + size_t num_keys = keys.size(); + if (read_options.io_activity != Env::IOActivity::kUnknown) { + Status s = Status::InvalidArgument( + "Cannot call MultiGet with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + return std::vector(num_keys, s); + } + + values->resize(num_keys); + + std::vector stat_list(num_keys); + for 
(size_t i = 0; i < num_keys; ++i) { + stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]); + } + + return stat_list; +} + +void TransactionBaseImpl::MultiGet(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input) { + assert(read_options.io_activity == Env::IOActivity::kUnknown); + write_batch_.MultiGetFromBatchAndDB(db_, read_options, column_family, + num_keys, keys, values, statuses, + sorted_input); +} + +std::vector TransactionBaseImpl::MultiGetForUpdate( + const ReadOptions& read_options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + // Regardless of whether the MultiGet succeeded, track these keys. + size_t num_keys = keys.size(); + if (read_options.io_activity != Env::IOActivity::kUnknown) { + Status s = Status::InvalidArgument( + "Cannot call MultiGetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + return std::vector(num_keys, s); + } + values->resize(num_keys); + + // Lock all keys + for (size_t i = 0; i < num_keys; ++i) { + Status s = TryLock(column_family[i], keys[i], true /* read_only */, + true /* exclusive */); + if (!s.ok()) { + // Fail entire multiget if we cannot lock all keys + return std::vector(num_keys, s); + } + } + + // TODO(agiardullo): optimize multiget? + std::vector stat_list(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]); + } + + return stat_list; +} + +Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options) { + Iterator* db_iter = db_->NewIterator(read_options); + assert(db_iter); + + return write_batch_.NewIteratorWithBase(db_->DefaultColumnFamily(), db_iter, + &read_options); +} + +Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + Iterator* db_iter = db_->NewIterator(read_options, column_family); + assert(db_iter); + + return write_batch_.NewIteratorWithBase(column_family, db_iter, + &read_options); +} + +Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + num_puts_++; + } + } + + return s; +} + +Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + num_puts_++; + } + } + + return s; +} + +Status TransactionBaseImpl::Merge(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Merge(column_family, key, value); + if (s.ok()) { + num_merges_++; + } + } + + return s; +} + +Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family, 
+ const Slice& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + num_puts_++; + } + } + + return s; +} + +Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + num_puts_++; + } + } + + return s; +} + +Status TransactionBaseImpl::MergeUntracked(ColumnFamilyHandle* column_family, + const Slice& key, + const Slice& value) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Merge(column_family, key, value); + if (s.ok()) { + num_merges_++; + } + } + + return s; +} + +Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::SingleDeleteUntracked( + ColumnFamilyHandle* column_family, const 
Slice& key) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +void TransactionBaseImpl::PutLogData(const Slice& blob) { + auto s = write_batch_.PutLogData(blob); + (void)s; + assert(s.ok()); +} + +WriteBatchWithIndex* TransactionBaseImpl::GetWriteBatch() { + return &write_batch_; +} + +uint64_t TransactionBaseImpl::GetElapsedTime() const { + return (dbimpl_->GetSystemClock()->NowMicros() - start_time_) / 1000; +} + +uint64_t TransactionBaseImpl::GetNumPuts() const { return num_puts_; } + +uint64_t TransactionBaseImpl::GetNumDeletes() const { return num_deletes_; } + +uint64_t TransactionBaseImpl::GetNumMerges() const { return num_merges_; } + +uint64_t TransactionBaseImpl::GetNumKeys() const { + return tracked_locks_->GetNumPointLocks(); +} + +void TransactionBaseImpl::TrackKey(uint32_t cfh_id, const std::string& key, + SequenceNumber seq, bool read_only, + bool exclusive) { + PointLockRequest r; + r.column_family_id = cfh_id; + r.key = key; + r.seq = seq; + r.read_only = read_only; + r.exclusive = exclusive; + + // Update map of all tracked keys for this transaction + tracked_locks_->Track(r); + + if (save_points_ != nullptr && !save_points_->empty()) { + // Update map of tracked keys in this SavePoint + save_points_->top().new_locks_->Track(r); + } +} + +// Gets the write batch that should be used for Put/Merge/Deletes. +// +// Returns either a WriteBatch or WriteBatchWithIndex depending on whether +// DisableIndexing() has been called. +WriteBatchBase* TransactionBaseImpl::GetBatchForWrite() { + if (indexing_enabled_) { + // Use WriteBatchWithIndex + return &write_batch_; + } else { + // Don't use WriteBatchWithIndex. Return base WriteBatch. + return write_batch_.GetWriteBatch(); + } +} + +void TransactionBaseImpl::ReleaseSnapshot(const Snapshot* snapshot, DB* db) { + if (snapshot != nullptr) { + ROCKS_LOG_DETAILS(dbimpl_->immutable_db_options().info_log, + "ReleaseSnapshot %" PRIu64 " Set", + snapshot->GetSequenceNumber()); + db->ReleaseSnapshot(snapshot); + } +} + +void TransactionBaseImpl::UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) { + PointLockRequest r; + r.column_family_id = GetColumnFamilyID(column_family); + r.key = key.ToString(); + r.read_only = true; + + bool can_untrack = false; + if (save_points_ != nullptr && !save_points_->empty()) { + // If there is no GetForUpdate of the key in this save point, + // then cannot untrack from the global lock tracker. + UntrackStatus s = save_points_->top().new_locks_->Untrack(r); + can_untrack = (s != UntrackStatus::NOT_TRACKED); + } else { + // No save point, so can untrack from the global lock tracker. + can_untrack = true; + } + + if (can_untrack) { + // If erased from the global tracker, then can unlock the key. 
+    UntrackStatus s = tracked_locks_->Untrack(r);
+    bool can_unlock = (s == UntrackStatus::REMOVED);
+    if (can_unlock) {
+      UnlockGetForUpdate(column_family, key);
+    }
+  }
+}
+
+Status TransactionBaseImpl::RebuildFromWriteBatch(WriteBatch* src_batch) {
+  struct IndexedWriteBatchBuilder : public WriteBatch::Handler {
+    Transaction* txn_;
+    DBImpl* db_;
+    IndexedWriteBatchBuilder(Transaction* txn, DBImpl* db)
+        : txn_(txn), db_(db) {
+      assert(dynamic_cast<TransactionBaseImpl*>(txn_) != nullptr);
+    }
+
+    Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override {
+      return txn_->Put(db_->GetColumnFamilyHandle(cf), key, val);
+    }
+
+    Status DeleteCF(uint32_t cf, const Slice& key) override {
+      return txn_->Delete(db_->GetColumnFamilyHandle(cf), key);
+    }
+
+    Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+      return txn_->SingleDelete(db_->GetColumnFamilyHandle(cf), key);
+    }
+
+    Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override {
+      return txn_->Merge(db_->GetColumnFamilyHandle(cf), key, val);
+    }
+
+    // This is used for reconstructing prepared transactions upon
+    // recovery. There should not be any meta markers in the batches
+    // we are processing.
+    Status MarkBeginPrepare(bool) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkEndPrepare(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkCommit(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkRollback(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+  };
+
+  IndexedWriteBatchBuilder copycat(this, dbimpl_);
+  return src_batch->Iterate(&copycat);
+}
+
+WriteBatch* TransactionBaseImpl::GetCommitTimeWriteBatch() {
+  return &commit_time_batch_;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_base.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_base.h
new file mode 100644
index 0000000000..bde09b6991
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_base.h
@@ -0,0 +1,382 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stack>
+#include <string>
+#include <vector>
+
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/autovector.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TransactionBaseImpl : public Transaction {
+ public:
+  TransactionBaseImpl(DB* db, const WriteOptions& write_options,
+                      const LockTrackerFactory& lock_tracker_factory);
+
+  ~TransactionBaseImpl() override;
+
+  // Remove pending operations queued in this transaction.
+  virtual void Clear();
+
+  void Reinitialize(DB* db, const WriteOptions& write_options);
+
+  // Called before executing Put, Merge, Delete, and GetForUpdate.
If TryLock + // returns non-OK, the Put/Merge/Delete/GetForUpdate will be failed. + // do_validate will be false if called from PutUntracked, DeleteUntracked, + // MergeUntracked, or GetForUpdate(do_validate=false) + virtual Status TryLock(ColumnFamilyHandle* column_family, const Slice& key, + bool read_only, bool exclusive, + const bool do_validate = true, + const bool assume_tracked = false) = 0; + + void SetSavePoint() override; + + Status RollbackToSavePoint() override; + + Status PopSavePoint() override; + + using Transaction::Get; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, std::string* value) override; + + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) override; + + Status Get(const ReadOptions& options, const Slice& key, + std::string* value) override { + return Get(options, db_->DefaultColumnFamily(), key, value); + } + + using Transaction::GetForUpdate; + Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool exclusive, + const bool do_validate) override; + + Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val, bool exclusive, + const bool do_validate) override; + + Status GetForUpdate(const ReadOptions& options, const Slice& key, + std::string* value, bool exclusive, + const bool do_validate) override { + return GetForUpdate(options, db_->DefaultColumnFamily(), key, value, + exclusive, do_validate); + } + + using Transaction::MultiGet; + std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override; + + std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) override { + return MultiGet(options, + std::vector( + keys.size(), db_->DefaultColumnFamily()), + keys, values); + } + + void MultiGet(const ReadOptions& options, ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, PinnableSlice* values, + Status* statuses, const bool sorted_input = false) override; + + using Transaction::MultiGetForUpdate; + std::vector MultiGetForUpdate( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override; + + std::vector MultiGetForUpdate( + const ReadOptions& options, const std::vector& keys, + std::vector* values) override { + return MultiGetForUpdate(options, + std::vector( + keys.size(), db_->DefaultColumnFamily()), + keys, values); + } + + Iterator* GetIterator(const ReadOptions& read_options) override; + Iterator* GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) override; + + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) override; + Status Put(const Slice& key, const Slice& value) override { + return Put(nullptr, key, value); + } + + Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value, + const bool assume_tracked = false) override; + Status Put(const SliceParts& key, const SliceParts& value) override { + return Put(nullptr, key, value); + } + + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) override; + Status Merge(const Slice& key, const Slice& value) override { + return 
Merge(nullptr, key, value); + } + + Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + Status Delete(const Slice& key) override { return Delete(nullptr, key); } + Status Delete(ColumnFamilyHandle* column_family, const SliceParts& key, + const bool assume_tracked = false) override; + Status Delete(const SliceParts& key) override { return Delete(nullptr, key); } + + Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + Status SingleDelete(const Slice& key) override { + return SingleDelete(nullptr, key); + } + Status SingleDelete(ColumnFamilyHandle* column_family, const SliceParts& key, + const bool assume_tracked = false) override; + Status SingleDelete(const SliceParts& key) override { + return SingleDelete(nullptr, key); + } + + Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status PutUntracked(const Slice& key, const Slice& value) override { + return PutUntracked(nullptr, key, value); + } + + Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status PutUntracked(const SliceParts& key, const SliceParts& value) override { + return PutUntracked(nullptr, key, value); + } + + Status MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status MergeUntracked(const Slice& key, const Slice& value) override { + return MergeUntracked(nullptr, key, value); + } + + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status DeleteUntracked(const Slice& key) override { + return DeleteUntracked(nullptr, key); + } + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status DeleteUntracked(const SliceParts& key) override { + return DeleteUntracked(nullptr, key); + } + + Status SingleDeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status SingleDeleteUntracked(const Slice& key) override { + return SingleDeleteUntracked(nullptr, key); + } + + void PutLogData(const Slice& blob) override; + + WriteBatchWithIndex* GetWriteBatch() override; + + virtual void SetLockTimeout(int64_t /*timeout*/) override { /* Do nothing */ + } + + const Snapshot* GetSnapshot() const override { + // will return nullptr when there is no snapshot + return snapshot_.get(); + } + + std::shared_ptr GetTimestampedSnapshot() const override { + return snapshot_; + } + + virtual void SetSnapshot() override; + void SetSnapshotOnNextOperation( + std::shared_ptr notifier = nullptr) override; + + void ClearSnapshot() override { + snapshot_.reset(); + snapshot_needed_ = false; + snapshot_notifier_ = nullptr; + } + + void DisableIndexing() override { indexing_enabled_ = false; } + + void EnableIndexing() override { indexing_enabled_ = true; } + + bool IndexingEnabled() const { return indexing_enabled_; } + + uint64_t GetElapsedTime() const override; + + uint64_t GetNumPuts() const override; + + uint64_t GetNumDeletes() const override; + + uint64_t GetNumMerges() const override; + + uint64_t GetNumKeys() const override; + + void UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) override; + void UndoGetForUpdate(const Slice& key) override { + return UndoGetForUpdate(nullptr, key); + }; + + WriteOptions* GetWriteOptions() override { return &write_options_; } + + void SetWriteOptions(const WriteOptions& 
write_options) override { + write_options_ = write_options; + } + + // Used for memory management for snapshot_ + void ReleaseSnapshot(const Snapshot* snapshot, DB* db); + + // Iterates over the given batch and makes the appropriate inserts. + // Used for rebuilding prepared transactions after recovery. + virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override; + + WriteBatch* GetCommitTimeWriteBatch() override; + + LockTracker& GetTrackedLocks() { return *tracked_locks_; } + + protected: + // Add a key to the list of tracked keys. + // + // seqno is the earliest seqno this key was involved with this transaction. + // readonly should be set to true if no data was written for this key + void TrackKey(uint32_t cfh_id, const std::string& key, SequenceNumber seqno, + bool readonly, bool exclusive); + + // Called when UndoGetForUpdate determines that this key can be unlocked. + virtual void UnlockGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + // Sets a snapshot if SetSnapshotOnNextOperation() has been called. + void SetSnapshotIfNeeded(); + + // Initialize write_batch_ for 2PC by inserting Noop. + inline void InitWriteBatch(bool clear = false) { + if (clear) { + write_batch_.Clear(); + } + assert(write_batch_.GetDataSize() == WriteBatchInternal::kHeader); + auto s = WriteBatchInternal::InsertNoop(write_batch_.GetWriteBatch()); + assert(s.ok()); + } + + WriteBatchBase* GetBatchForWrite(); + + DB* db_; + DBImpl* dbimpl_; + + WriteOptions write_options_; + + const Comparator* cmp_; + + const LockTrackerFactory& lock_tracker_factory_; + + // Stores the time the txn was constructed, in microseconds. + uint64_t start_time_; + + // Stores the current snapshot that was set by SetSnapshot or null if + // no snapshot is currently set. + std::shared_ptr<const Snapshot> snapshot_; + + // Count of various operations pending in this transaction + uint64_t num_puts_ = 0; + uint64_t num_deletes_ = 0; + uint64_t num_merges_ = 0; + + struct SavePoint { + std::shared_ptr<const Snapshot> snapshot_; + bool snapshot_needed_ = false; + std::shared_ptr<TransactionNotifier> snapshot_notifier_; + uint64_t num_puts_ = 0; + uint64_t num_deletes_ = 0; + uint64_t num_merges_ = 0; + + // Record all locks tracked since the last savepoint + std::shared_ptr<LockTracker> new_locks_; + + SavePoint(std::shared_ptr<const Snapshot> snapshot, bool snapshot_needed, + std::shared_ptr<TransactionNotifier> snapshot_notifier, + uint64_t num_puts, uint64_t num_deletes, uint64_t num_merges, + const LockTrackerFactory& lock_tracker_factory) + : snapshot_(snapshot), + snapshot_needed_(snapshot_needed), + snapshot_notifier_(snapshot_notifier), + num_puts_(num_puts), + num_deletes_(num_deletes), + num_merges_(num_merges), + new_locks_(lock_tracker_factory.Create()) {} + + explicit SavePoint(const LockTrackerFactory& lock_tracker_factory) + : new_locks_(lock_tracker_factory.Create()) {} + }; + + // Records writes pending in this transaction + WriteBatchWithIndex write_batch_; + + // For Pessimistic Transactions this is the set of acquired locks. + // Optimistic Transactions will keep note of the requested locks (not + // actually locked), and defer conflict checking to commit time based on + // the tracked lock requests. + std::unique_ptr<LockTracker> tracked_locks_; + + // Stack of the Snapshot saved at each save point. Saved snapshots may be + // nullptr if there was no snapshot at the time SetSavePoint() was called. + std::unique_ptr<std::stack<TransactionBaseImpl::SavePoint, autovector<TransactionBaseImpl::SavePoint>>> + save_points_; + + private: + friend class WriteCommittedTxn; + friend class WritePreparedTxn; + + // Extra data to be persisted with the commit.
Note this is only used when + // prepare phase is not skipped. + WriteBatch commit_time_batch_; + + // If true, future Put/Merge/Deletes will be indexed in the + // WriteBatchWithIndex. + // If false, future Put/Merge/Deletes will be inserted directly into the + // underlying WriteBatch and not indexed in the WriteBatchWithIndex. + bool indexing_enabled_; + + // SetSnapshotOnNextOperation() has been called and the snapshot has not yet + // been reset. + bool snapshot_needed_ = false; + + // SetSnapshotOnNextOperation() has been called and the caller would like + // a notification through the TransactionNotifier interface + std::shared_ptr snapshot_notifier_ = nullptr; + + Status TryLock(ColumnFamilyHandle* column_family, const SliceParts& key, + bool read_only, bool exclusive, const bool do_validate = true, + const bool assume_tracked = false); + + void SetSnapshotInternal(const Snapshot* snapshot); +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_db_mutex_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_db_mutex_impl.cc new file mode 100644 index 0000000000..52c299b376 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_db_mutex_impl.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#include "utilities/transactions/transaction_db_mutex_impl.h" + +#include +#include +#include +#include + +#include "rocksdb/utilities/transaction_db_mutex.h" + +namespace ROCKSDB_NAMESPACE { + +class TransactionDBMutexImpl : public TransactionDBMutex { + public: + TransactionDBMutexImpl() {} + ~TransactionDBMutexImpl() override {} + + Status Lock() override; + + Status TryLockFor(int64_t timeout_time) override; + + void UnLock() override { mutex_.unlock(); } + + friend class TransactionDBCondVarImpl; + + private: + std::mutex mutex_; +}; + +class TransactionDBCondVarImpl : public TransactionDBCondVar { + public: + TransactionDBCondVarImpl() {} + ~TransactionDBCondVarImpl() override {} + + Status Wait(std::shared_ptr mutex) override; + + Status WaitFor(std::shared_ptr mutex, + int64_t timeout_time) override; + + void Notify() override { cv_.notify_one(); } + + void NotifyAll() override { cv_.notify_all(); } + + private: + std::condition_variable cv_; +}; + +std::shared_ptr +TransactionDBMutexFactoryImpl::AllocateMutex() { + return std::shared_ptr(new TransactionDBMutexImpl()); +} + +std::shared_ptr +TransactionDBMutexFactoryImpl::AllocateCondVar() { + return std::shared_ptr(new TransactionDBCondVarImpl()); +} + +Status TransactionDBMutexImpl::Lock() { + mutex_.lock(); + return Status::OK(); +} + +Status TransactionDBMutexImpl::TryLockFor(int64_t timeout_time) { + bool locked = true; + + if (timeout_time == 0) { + locked = mutex_.try_lock(); + } else { + // Previously, this code used a std::timed_mutex. However, this was changed + // due to known bugs in gcc versions < 4.9. + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54562 + // + // Since this mutex isn't held for long and only a single mutex is ever + // held at a time, it is reasonable to ignore the lock timeout_time here + // and only check it when waiting on the condition_variable. 
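+ //
+ // Illustrative sketch (editorial, not part of the vendored sources): on a
+ // modern toolchain the timed path could instead use std::timed_mutex, which
+ // is exactly the approach the comment above says was avoided:
+ //
+ //   std::timed_mutex timed_mu;
+ //   bool locked_in_time =
+ //       timed_mu.try_lock_for(std::chrono::microseconds(timeout_time));
+ //
+ // The vendored implementation instead falls through to the plain blocking
+ // lock below and enforces timeouts only in TransactionDBCondVarImpl::WaitFor.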
+ mutex_.lock(); + } + + if (!locked) { + // timeout acquiring mutex + return Status::TimedOut(Status::SubCode::kMutexTimeout); + } + + return Status::OK(); +} + +Status TransactionDBCondVarImpl::Wait( + std::shared_ptr mutex) { + auto mutex_impl = reinterpret_cast(mutex.get()); + + std::unique_lock lock(mutex_impl->mutex_, std::adopt_lock); + cv_.wait(lock); + + // Make sure unique_lock doesn't unlock mutex when it destructs + lock.release(); + + return Status::OK(); +} + +Status TransactionDBCondVarImpl::WaitFor( + std::shared_ptr mutex, int64_t timeout_time) { + Status s; + + auto mutex_impl = reinterpret_cast(mutex.get()); + std::unique_lock lock(mutex_impl->mutex_, std::adopt_lock); + + if (timeout_time < 0) { + // If timeout is negative, do not use a timeout + cv_.wait(lock); + } else { + auto duration = std::chrono::microseconds(timeout_time); + auto cv_status = cv_.wait_for(lock, duration); + + // Check if the wait stopped due to timing out. + if (cv_status == std::cv_status::timeout) { + s = Status::TimedOut(Status::SubCode::kMutexTimeout); + } + } + + // Make sure unique_lock doesn't unlock mutex when it destructs + lock.release(); + + // CV was signaled, or we spuriously woke up (but didn't time out) + return s; +} + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_db_mutex_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_db_mutex_impl.h new file mode 100644 index 0000000000..f509f4017a --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_db_mutex_impl.h @@ -0,0 +1,24 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/utilities/transaction_db_mutex.h" + +namespace ROCKSDB_NAMESPACE { + +class TransactionDBMutex; +class TransactionDBCondVar; + +// Default implementation of TransactionDBMutexFactory. May be overridden +// by TransactionDBOptions.custom_mutex_factory. +class TransactionDBMutexFactoryImpl : public TransactionDBMutexFactory { + public: + std::shared_ptr AllocateMutex() override; + std::shared_ptr AllocateCondVar() override; +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_test.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_test.h new file mode 100644 index 0000000000..0b86453a40 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_test.h @@ -0,0 +1,578 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
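+ // Illustrative sketch (editorial, not part of the vendored sources): the
+ // factory defined in transaction_db_mutex_impl.{h,cc} above is the default,
+ // but a caller can also install it (or a custom implementation) explicitly
+ // through the TransactionDBOptions::custom_mutex_factory field:
+ //
+ //   ROCKSDB_NAMESPACE::TransactionDBOptions txn_db_options;
+ //   txn_db_options.custom_mutex_factory =
+ //       std::make_shared<ROCKSDB_NAMESPACE::TransactionDBMutexFactoryImpl>();
+ //   // TransactionDB::Open(...) then allocates every lock-manager mutex and
+ //   // condition variable through this factory.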
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "table/mock_table.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" +#include "util/random.h" +#include "util/string_util.h" +#include "utilities/fault_injection_env.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/write_unprepared_txn_db.h" + +namespace ROCKSDB_NAMESPACE { + +// Return true if the ith bit is set in combination represented by comb +bool IsInCombination(size_t i, size_t comb) { return comb & (size_t(1) << i); } + +enum WriteOrdering : bool { kOrderedWrite, kUnorderedWrite }; + +class TransactionTestBase : public ::testing::Test { + public: + TransactionDB* db; + SpecialEnv special_env; + FaultInjectionTestEnv* env; + std::string dbname; + Options options; + + TransactionDBOptions txn_db_options; + bool use_stackable_db_; + + TransactionTestBase(bool use_stackable_db, bool two_write_queue, + TxnDBWritePolicy write_policy, + WriteOrdering write_ordering) + : db(nullptr), + special_env(Env::Default()), + env(nullptr), + use_stackable_db_(use_stackable_db) { + options.create_if_missing = true; + options.max_write_buffer_number = 2; + options.write_buffer_size = 4 * 1024; + options.unordered_write = write_ordering == kUnorderedWrite; + options.level0_file_num_compaction_trigger = 2; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + special_env.skip_fsync_ = true; + env = new FaultInjectionTestEnv(&special_env); + options.env = env; + options.two_write_queues = two_write_queue; + dbname = test::PerThreadDBPath("transaction_testdb"); + + EXPECT_OK(DestroyDB(dbname, options)); + txn_db_options.transaction_lock_timeout = 0; + txn_db_options.default_lock_timeout = 0; + txn_db_options.write_policy = write_policy; + txn_db_options.rollback_merge_operands = true; + // This will stress write unprepared, by forcing write batch flush on every + // write. + txn_db_options.default_write_batch_flush_threshold = 1; + // Write unprepared requires all transactions to be named. This setting + // autogenerates the name so that existing tests can pass. + txn_db_options.autogenerate_name = true; + Status s; + if (use_stackable_db == false) { + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + } else { + s = OpenWithStackableDB(); + } + EXPECT_OK(s); + } + + ~TransactionTestBase() { + delete db; + db = nullptr; + // This is to skip the assert statement in FaultInjectionTestEnv. There + // seems to be a bug in btrfs that makes readdir return recently + // unlinked files. By using the default fs we simply ignore errors resulting + // from attempting to delete such files in DestroyDB.
+ if (getenv("KEEP_DB") == nullptr) { + options.env = Env::Default(); + EXPECT_OK(DestroyDB(dbname, options)); + } else { + fprintf(stdout, "db is still in %s\n", dbname.c_str()); + } + delete env; + } + + Status ReOpenNoDelete() { + delete db; + db = nullptr; + env->AssertNoOpenFile(); + env->DropUnsyncedFileData(); + env->ResetState(); + Status s; + if (use_stackable_db_ == false) { + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + } else { + s = OpenWithStackableDB(); + } + assert(!s.ok() || db != nullptr); + return s; + } + + Status ReOpenNoDelete(std::vector& cfs, + std::vector* handles) { + for (auto h : *handles) { + delete h; + } + handles->clear(); + delete db; + db = nullptr; + env->AssertNoOpenFile(); + env->DropUnsyncedFileData(); + env->ResetState(); + Status s; + if (use_stackable_db_ == false) { + s = TransactionDB::Open(options, txn_db_options, dbname, cfs, handles, + &db); + } else { + s = OpenWithStackableDB(cfs, handles); + } + assert(!s.ok() || db != nullptr); + return s; + } + + Status ReOpen() { + delete db; + db = nullptr; + DestroyDB(dbname, options); + Status s; + if (use_stackable_db_ == false) { + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + } else { + s = OpenWithStackableDB(); + } + assert(db != nullptr); + return s; + } + + Status OpenWithStackableDB(std::vector& cfs, + std::vector* handles) { + std::vector compaction_enabled_cf_indices; + TransactionDB::PrepareWrap(&options, &cfs, &compaction_enabled_cf_indices); + DB* root_db = nullptr; + Options options_copy(options); + const bool use_seq_per_batch = + txn_db_options.write_policy == WRITE_PREPARED || + txn_db_options.write_policy == WRITE_UNPREPARED; + const bool use_batch_per_txn = + txn_db_options.write_policy == WRITE_COMMITTED || + txn_db_options.write_policy == WRITE_PREPARED; + Status s = DBImpl::Open(options_copy, dbname, cfs, handles, &root_db, + use_seq_per_batch, use_batch_per_txn); + auto stackable_db = std::make_unique(root_db); + if (s.ok()) { + assert(root_db != nullptr); + // If WrapStackableDB() returns non-ok, then stackable_db is already + // deleted within WrapStackableDB(). 
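+ // Illustrative sketch (editorial, not part of the vendored sources): the
+ // release() below exists because WrapStackableDB() takes ownership and, on
+ // failure, deletes the wrapper itself, so the unique_ptr must let go first:
+ //
+ //   std::unique_ptr<StackableDB> wrapper(new StackableDB(root_db));
+ //   StackableDB* raw = wrapper.release();  // callee-owned from here on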
+ s = TransactionDB::WrapStackableDB(stackable_db.release(), txn_db_options, + compaction_enabled_cf_indices, + *handles, &db); + } + return s; + } + + Status OpenWithStackableDB() { + std::vector compaction_enabled_cf_indices; + std::vector column_families{ColumnFamilyDescriptor( + kDefaultColumnFamilyName, ColumnFamilyOptions(options))}; + + TransactionDB::PrepareWrap(&options, &column_families, + &compaction_enabled_cf_indices); + std::vector handles; + DB* root_db = nullptr; + Options options_copy(options); + const bool use_seq_per_batch = + txn_db_options.write_policy == WRITE_PREPARED || + txn_db_options.write_policy == WRITE_UNPREPARED; + const bool use_batch_per_txn = + txn_db_options.write_policy == WRITE_COMMITTED || + txn_db_options.write_policy == WRITE_PREPARED; + Status s = DBImpl::Open(options_copy, dbname, column_families, &handles, + &root_db, use_seq_per_batch, use_batch_per_txn); + if (!s.ok()) { + delete root_db; + return s; + } + StackableDB* stackable_db = new StackableDB(root_db); + assert(root_db != nullptr); + assert(handles.size() == 1); + s = TransactionDB::WrapStackableDB(stackable_db, txn_db_options, + compaction_enabled_cf_indices, handles, + &db); + delete handles[0]; + if (!s.ok()) { + delete stackable_db; + } + return s; + } + + std::atomic linked = {0}; + std::atomic exp_seq = {0}; + std::atomic commit_writes = {0}; + std::atomic expected_commits = {0}; + // Without Prepare, the commit does not write to WAL + std::atomic with_empty_commits = {0}; + void TestTxn0(size_t index) { + // Test DB's internal txn. It involves no prepare phase nor a commit marker. + auto s = db->Put(WriteOptions(), "key" + std::to_string(index), "value"); + ASSERT_OK(s); + if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) { + // Consume one seq per key + exp_seq++; + } else { + // Consume one seq per batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for commit + exp_seq++; + } + } + with_empty_commits++; + } + + void TestTxn1(size_t index) { + // Testing directly writing a write batch. Functionality-wise it is + // equivalent to commit without prepare. + WriteBatch wb; + auto istr = std::to_string(index); + ASSERT_OK(wb.Put("k1" + istr, "v1")); + ASSERT_OK(wb.Put("k2" + istr, "v2")); + ASSERT_OK(wb.Put("k3" + istr, "v3")); + auto s = db->Write(WriteOptions(), &wb); + if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) { + // Consume one seq per key + exp_seq += 3; + } else { + // Consume one seq per batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for commit + exp_seq++; + } + } + ASSERT_OK(s); + with_empty_commits++; + } + + void TestTxn2(size_t index) { + // Commit without prepare. It should write to DB without a commit marker. 
+ Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + auto istr = std::to_string(index); + ASSERT_OK(txn->SetName("xid" + istr)); + ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar"))); + ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2"))); + ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3"))); + ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4"))); + ASSERT_OK(txn->Commit()); + if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) { + // Consume one seq per key + exp_seq += 4; + } else if (txn_db_options.write_policy == + TxnDBWritePolicy::WRITE_PREPARED) { + // Consume one seq per batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for commit + exp_seq++; + } + } else { + // Flushed after each key, consume one seq per flushed batch + exp_seq += 4; + // WriteUnprepared implements CommitWithoutPrepareInternal by simply + // calling Prepare then Commit. Consume one seq for the prepare. + exp_seq++; + } + delete txn; + with_empty_commits++; + } + + void TestTxn3(size_t index) { + // A full 2pc txn that also involves a commit marker. + Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + auto istr = std::to_string(index); + ASSERT_OK(txn->SetName("xid" + istr)); + ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar"))); + ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2"))); + ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3"))); + ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4"))); + ASSERT_OK(txn->Put(Slice("foo5" + istr), Slice("bar5"))); + expected_commits++; + ASSERT_OK(txn->Prepare()); + commit_writes++; + ASSERT_OK(txn->Commit()); + if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) { + // Consume one seq per key + exp_seq += 5; + } else if (txn_db_options.write_policy == + TxnDBWritePolicy::WRITE_PREPARED) { + // Consume one seq per batch + exp_seq++; + // Consume one seq per commit marker + exp_seq++; + } else { + // Flushed after each key, consume one seq per flushed batch + exp_seq += 5; + // Consume one seq per commit marker + exp_seq++; + } + delete txn; + } + + void TestTxn4(size_t index) { + // A full 2pc txn that also involves a commit marker. 
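+ //
+ // Illustrative sketch (editorial, not part of the vendored sources): the
+ // minimal 2PC call sequence that TestTxn3 exercises, assuming an open
+ // TransactionDB* txn_db:
+ //
+ //   Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+ //   ASSERT_OK(txn->SetName("example-xid"));  // 2PC requires a named txn
+ //   ASSERT_OK(txn->Put("key", "value"));
+ //   ASSERT_OK(txn->Prepare());               // writes the prepare marker
+ //   ASSERT_OK(txn->Commit());                // writes the commit marker
+ //   delete txn;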
+ Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + auto istr = std::to_string(index); + ASSERT_OK(txn->SetName("xid" + istr)); + ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar"))); + ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2"))); + ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3"))); + ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4"))); + ASSERT_OK(txn->Put(Slice("foo5" + istr), Slice("bar5"))); + expected_commits++; + ASSERT_OK(txn->Prepare()); + commit_writes++; + ASSERT_OK(txn->Rollback()); + if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) { + // No seq is consumed for deleting the txn buffer + exp_seq += 0; + } else if (txn_db_options.write_policy == + TxnDBWritePolicy::WRITE_PREPARED) { + // Consume one seq per batch + exp_seq++; + // Consume one seq per rollback batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for rollback commit + exp_seq++; + } + } else { + // Flushed after each key, consume one seq per flushed batch + exp_seq += 5; + // Consume one seq per rollback batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for rollback commit + exp_seq++; + } + } + delete txn; + } + + // Test that we can change write policy after a clean shutdown (which would + // empty the WAL) + void CrossCompatibilityTest(TxnDBWritePolicy from_policy, + TxnDBWritePolicy to_policy, bool empty_wal) { + TransactionOptions txn_options; + ReadOptions read_options; + WriteOptions write_options; + uint32_t index = 0; + Random rnd(1103); + options.write_buffer_size = 1024; // To create more sst files + std::unordered_map committed_kvs; + Transaction* txn; + + txn_db_options.write_policy = from_policy; + if (txn_db_options.write_policy == WRITE_COMMITTED) { + options.unordered_write = false; + } + ASSERT_OK(ReOpen()); + + for (int i = 0; i < 1024; i++) { + auto istr = std::to_string(index); + auto k = Slice("foo-" + istr).ToString(); + auto v = Slice("bar-" + istr).ToString(); + // For test the duplicate keys + auto v2 = Slice("bar2-" + istr).ToString(); + auto type = rnd.Uniform(4); + switch (type) { + case 0: + committed_kvs[k] = v; + ASSERT_OK(db->Put(write_options, k, v)); + committed_kvs[k] = v2; + ASSERT_OK(db->Put(write_options, k, v2)); + break; + case 1: { + WriteBatch wb; + committed_kvs[k] = v; + ASSERT_OK(wb.Put(k, v)); + committed_kvs[k] = v2; + ASSERT_OK(wb.Put(k, v2)); + ASSERT_OK(db->Write(write_options, &wb)); + + } break; + case 2: + case 3: + txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->SetName("xid" + istr)); + committed_kvs[k] = v; + ASSERT_OK(txn->Put(k, v)); + committed_kvs[k] = v2; + ASSERT_OK(txn->Put(k, v2)); + + if (type == 3) { + ASSERT_OK(txn->Prepare()); + } + ASSERT_OK(txn->Commit()); + delete txn; + break; + default: + FAIL(); + } + + index++; + } // for i + + txn_db_options.write_policy = to_policy; + if (txn_db_options.write_policy == WRITE_COMMITTED) { + options.unordered_write = false; + } + auto db_impl = static_cast_with_check(db->GetRootDB()); + // Before upgrade/downgrade the WAL must be emptied + if (empty_wal) { + ASSERT_OK(db_impl->TEST_FlushMemTable()); + } else { + ASSERT_OK(db_impl->FlushWAL(true)); + } + auto s = ReOpenNoDelete(); + if (empty_wal) { + ASSERT_OK(s); + } else { + // Test that we can detect the WAL that is produced by an incompatible + // WritePolicy and fail fast before mis-interpreting the WAL. 
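+ //
+ // Illustrative sketch (editorial, not part of the vendored sources): the
+ // migration rule verified here is that the WAL must be emptied, e.g. by
+ // flushing all memtables, before reopening with a different TxnDBWritePolicy:
+ //
+ //   ASSERT_OK(db->Flush(FlushOptions()));  // retires the old-format WAL
+ //   // ...after which reopening under the new policy succeeds.
+ //
+ // Without that flush, the reopen must fail fast with Status::NotSupported: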
+ ASSERT_TRUE(s.IsNotSupported()); + return; + } + db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + // Check that WAL is empty + VectorLogPtr log_files; + ASSERT_OK(db_impl->GetSortedWalFiles(log_files)); + ASSERT_EQ(0, log_files.size()); + + for (auto& kv : committed_kvs) { + std::string value; + s = db->Get(read_options, kv.first, &value); + if (s.IsNotFound()) { + printf("key = %s\n", kv.first.c_str()); + } + ASSERT_OK(s); + if (kv.second != value) { + printf("key = %s\n", kv.first.c_str()); + } + ASSERT_EQ(kv.second, value); + } + } +}; + +class TransactionTest + : public TransactionTestBase, + virtual public ::testing::WithParamInterface< + std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> { + public: + TransactionTest() + : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())){}; +}; + +class TransactionStressTest : public TransactionTest {}; + +class MySQLStyleTransactionTest + : public TransactionTestBase, + virtual public ::testing::WithParamInterface< + std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering, bool>> { + public: + MySQLStyleTransactionTest() + : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())), + with_slow_threads_(std::get<4>(GetParam())) { + if (with_slow_threads_ && + (txn_db_options.write_policy == WRITE_PREPARED || + txn_db_options.write_policy == WRITE_UNPREPARED)) { + // The corner case with slow threads involves the caches filling + // over which would not happen even with artificial delays. To help + // such cases show up we lower the size of the cache-related data + // structures. + txn_db_options.wp_snapshot_cache_bits = 1; + txn_db_options.wp_commit_cache_bits = 10; + options.write_buffer_size = 1024; + EXPECT_OK(ReOpen()); + } + }; + + protected: + // Also emulate slow threads by adding artificial delays + const bool with_slow_threads_; +}; + +class WriteCommittedTxnWithTsTest + : public TransactionTestBase, + public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> { + public: + WriteCommittedTxnWithTsTest() + : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), + WRITE_COMMITTED, kOrderedWrite) {} + ~WriteCommittedTxnWithTsTest() override { + for (auto* h : handles_) { + delete h; + } + } + + Status GetFromDb(ReadOptions read_opts, ColumnFamilyHandle* column_family, + const Slice& key, TxnTimestamp ts, std::string* value) { + std::string ts_buf; + PutFixed64(&ts_buf, ts); + Slice ts_slc = ts_buf; + read_opts.timestamp = &ts_slc; + assert(db); + return db->Get(read_opts, column_family, key, value); + } + + Transaction* NewTxn(WriteOptions write_opts, TransactionOptions txn_opts) { + assert(db); + auto* txn = db->BeginTransaction(write_opts, txn_opts); + assert(txn); + const bool enable_indexing = std::get<2>(GetParam()); + if (enable_indexing) { + txn->EnableIndexing(); + } else { + txn->DisableIndexing(); + } + return txn; + } + + protected: + std::vector<ColumnFamilyHandle*> handles_{}; +}; + +class TimestampedSnapshotWithTsSanityCheck + : public TransactionTestBase, + public ::testing::WithParamInterface< + std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> { + public: + explicit TimestampedSnapshotWithTsSanityCheck() + : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())) {} + ~TimestampedSnapshotWithTsSanityCheck() override { + for (auto* h : handles_) { + delete h; + } + } + + protected: + std::vector<ColumnFamilyHandle*> handles_{}; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_util.cc
b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_util.cc new file mode 100644 index 0000000000..d5a68807e7 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_util.cc @@ -0,0 +1,204 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#include "utilities/transactions/transaction_util.h" + +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/cast_util.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +Status TransactionUtil::CheckKeyForConflicts( + DBImpl* db_impl, ColumnFamilyHandle* column_family, const std::string& key, + SequenceNumber snap_seq, const std::string* const read_ts, bool cache_only, + ReadCallback* snap_checker, SequenceNumber min_uncommitted) { + Status result; + + auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family); + auto cfd = cfh->cfd(); + SuperVersion* sv = db_impl->GetAndRefSuperVersion(cfd); + + if (sv == nullptr) { + result = Status::InvalidArgument("Could not access column family " + + cfh->GetName()); + } + + if (result.ok()) { + SequenceNumber earliest_seq = + db_impl->GetEarliestMemTableSequenceNumber(sv, true); + + result = CheckKey(db_impl, sv, earliest_seq, snap_seq, key, read_ts, + cache_only, snap_checker, min_uncommitted); + + db_impl->ReturnAndCleanupSuperVersion(cfd, sv); + } + + return result; +} + +Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv, + SequenceNumber earliest_seq, + SequenceNumber snap_seq, + const std::string& key, + const std::string* const read_ts, + bool cache_only, ReadCallback* snap_checker, + SequenceNumber min_uncommitted) { + // When `min_uncommitted` is provided, keys are not always committed + // in sequence number order, and `snap_checker` is used to check whether + // a specific sequence number in the database is visible to the transaction. + // So `snap_checker` must be provided. + assert(min_uncommitted == kMaxSequenceNumber || snap_checker != nullptr); + + Status result; + bool need_to_read_sst = false; + + // Since it would be too slow to check the SST files, we will only use + // the memtables to check whether there have been any recent writes + // to this key after it was accessed in this transaction. But if the + // Memtables do not contain a long enough history, we must fail the + // transaction. + if (earliest_seq == kMaxSequenceNumber) { + // The age of this memtable is unknown. Cannot rely on it to check + // for recent writes. This error shouldn't happen often in practice as + // the Memtable should have a valid earliest sequence number except in some + // corner cases (such as error cases during recovery). + need_to_read_sst = true; + + if (cache_only) { + result = Status::TryAgain( + "Transaction could not check for conflicts as the MemTable does not " + "contain a long enough history to check write at SequenceNumber: ", + std::to_string(snap_seq)); + } + } else if (snap_seq < earliest_seq || min_uncommitted <= earliest_seq) { + // Use <= for min_uncommitted since earliest_seq is actually the largest seq + // before this memtable was created + need_to_read_sst = true; + + if (cache_only) { + // The age of this memtable is too new to use to check for recent + // writes.
+ char msg[300]; + snprintf(msg, sizeof(msg), + "Transaction could not check for conflicts for operation at " + "SequenceNumber %" PRIu64 + " as the MemTable only contains changes newer than " + "SequenceNumber %" PRIu64 + ". Increasing the value of the " + "max_write_buffer_size_to_maintain option could reduce the " + "frequency " + "of this error.", + snap_seq, earliest_seq); + result = Status::TryAgain(msg); + } + } + + if (result.ok()) { + SequenceNumber seq = kMaxSequenceNumber; + std::string timestamp; + bool found_record_for_key = false; + + // When min_uncommitted == kMaxSequenceNumber, writes are committed in + // sequence number order, so only keys larger than `snap_seq` can cause + // conflict. + // When min_uncommitted != kMaxSequenceNumber, keys lower than + // min_uncommitted will not trigger conflicts, while keys larger than + // min_uncommitted might create conflicts, so we need to read them out + // from the DB, and call snap_checker to determine. So only + // keys lower than min_uncommitted can be skipped. + SequenceNumber lower_bound_seq = + (min_uncommitted == kMaxSequenceNumber) ? snap_seq : min_uncommitted; + Status s = db_impl->GetLatestSequenceForKey( + sv, key, !need_to_read_sst, lower_bound_seq, &seq, + !read_ts ? nullptr : &timestamp, &found_record_for_key, + /*is_blob_index=*/nullptr); + + if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { + result = s; + } else if (found_record_for_key) { + bool write_conflict = snap_checker == nullptr + ? snap_seq < seq + : !snap_checker->IsVisible(seq); + // Perform conflict checking based on timestamp if applicable. + if (!write_conflict && read_ts != nullptr) { + ColumnFamilyData* cfd = sv->cfd; + assert(cfd); + const Comparator* const ucmp = cfd->user_comparator(); + assert(ucmp); + assert(read_ts->size() == ucmp->timestamp_size()); + assert(read_ts->size() == timestamp.size()); + // Write conflict if *ts < timestamp. + write_conflict = ucmp->CompareTimestamp(*read_ts, timestamp) < 0; + } + if (write_conflict) { + result = Status::Busy(); + } + } + } + + return result; +} + +Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl, + const LockTracker& tracker, + bool cache_only) { + Status result; + + std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it( + tracker.GetColumnFamilyIterator()); + assert(cf_it != nullptr); + while (cf_it->HasNext()) { + ColumnFamilyId cf = cf_it->Next(); + + SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf); + if (sv == nullptr) { + result = Status::InvalidArgument("Could not access column family " + + std::to_string(cf)); + break; + } + + SequenceNumber earliest_seq = + db_impl->GetEarliestMemTableSequenceNumber(sv, true); + + // For each of the keys in this transaction, check to see if someone has + // written to this key since the start of the transaction. + std::unique_ptr<LockTracker::KeyIterator> key_it( + tracker.GetKeyIterator(cf)); + assert(key_it != nullptr); + while (key_it->HasNext()) { + const std::string& key = key_it->Next(); + PointLockStatus status = tracker.GetPointLockStatus(cf, key); + const SequenceNumber key_seq = status.seq; + + // TODO: support timestamp-based conflict checking. + // CheckKeysForConflicts() is currently used only by optimistic + // transactions.
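+ //
+ // Illustrative sketch (editorial, not part of the vendored sources): the
+ // visibility rule CheckKey applies, restated over sequence numbers:
+ //
+ //   if (seq < min_uncommitted) no conflict;        // known committed & old
+ //   else if (seq > snap_seq)   conflict;           // newer than snapshot
+ //   else                       conflict iff !snap_checker->IsVisible(seq);
+ //
+ // The optimistic path below passes read_ts == nullptr and relies purely on
+ // this sequence-number check: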
+ result = CheckKey(db_impl, sv, earliest_seq, key_seq, key, + /*read_ts=*/nullptr, cache_only); + if (!result.ok()) { + break; + } + } + + db_impl->ReturnAndCleanupSuperVersion(cf, sv); + + if (!result.ok()) { + break; + } + } + + return result; +} + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_util.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_util.h new file mode 100644 index 0000000000..725e1c9279 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/transaction_util.h @@ -0,0 +1,83 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + + +#include +#include + +#include "db/dbformat.h" +#include "db/read_callback.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "utilities/transactions/lock/lock_tracker.h" + +namespace ROCKSDB_NAMESPACE { + +class DBImpl; +struct SuperVersion; +class WriteBatchWithIndex; + +class TransactionUtil { + public: + // Verifies there have been no commits to this key in the db since this + // sequence number. If user-defined timestamp is enabled, then also check + // no commits to this key in the db since the given ts. + // + // If cache_only is true, then this function will not attempt to read any + // SST files. This will make it more likely this function will + // return an error if it is unable to determine if there are any conflicts. + // + // See comment of CheckKey() for explanation of `snap_seq`, `ts`, + // `snap_checker` and `min_uncommitted`. + // + // Returns OK on success, BUSY if there is a conflicting write, or other error + // status for any unexpected errors. + static Status CheckKeyForConflicts( + DBImpl* db_impl, ColumnFamilyHandle* column_family, + const std::string& key, SequenceNumber snap_seq, + const std::string* const ts, bool cache_only, + ReadCallback* snap_checker = nullptr, + SequenceNumber min_uncommitted = kMaxSequenceNumber); + + // For each key,SequenceNumber pair tracked by the LockTracker, this function + // will verify there have been no writes to the key in the db since that + // sequence number. + // + // Returns OK on success, BUSY if there is a conflicting write, or other error + // status for any unexpected errors. + // + // REQUIRED: + // This function should only be called on the write thread or if the + // mutex is held. + // tracker must support point lock. + static Status CheckKeysForConflicts(DBImpl* db_impl, + const LockTracker& tracker, + bool cache_only); + + private: + // If `snap_checker` == nullptr, writes are always commited in sequence number + // order. All sequence number <= `snap_seq` will not conflict with any + // write, and all keys > `snap_seq` of `key` will trigger conflict. + // If `snap_checker` != nullptr, writes may not commit in sequence number + // order. In this case `min_uncommitted` is a lower bound. + // seq < `min_uncommitted`: no conflict + // seq > `snap_seq`: applicable to conflict + // `min_uncommitted` <= seq <= `snap_seq`: call `snap_checker` to determine. + // + // If user-defined timestamp is enabled, a write conflict is detected if an + // operation for `key` with timestamp greater than `ts` exists. 
+ static Status CheckKey(DBImpl* db_impl, SuperVersion* sv, + SequenceNumber earliest_seq, SequenceNumber snap_seq, + const std::string& key, const std::string* const ts, + bool cache_only, ReadCallback* snap_checker = nullptr, + SequenceNumber min_uncommitted = kMaxSequenceNumber); +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn.cc new file mode 100644 index 0000000000..c27a679e4b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn.cc @@ -0,0 +1,515 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#include "utilities/transactions/write_prepared_txn.h" + +#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/transaction_db.h" +#include "util/cast_util.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/write_prepared_txn_db.h" + +namespace ROCKSDB_NAMESPACE { + +struct WriteOptions; + +WritePreparedTxn::WritePreparedTxn(WritePreparedTxnDB* txn_db, + const WriteOptions& write_options, + const TransactionOptions& txn_options) + : PessimisticTransaction(txn_db, write_options, txn_options, false), + wpt_db_(txn_db) { + // Call Initialize outside PessimisticTransaction constructor otherwise it + // would skip overridden functions in WritePreparedTxn since they are not + // defined yet in the constructor of PessimisticTransaction + Initialize(txn_options); +} + +void WritePreparedTxn::Initialize(const TransactionOptions& txn_options) { + PessimisticTransaction::Initialize(txn_options); + prepare_batch_cnt_ = 0; +} + +void WritePreparedTxn::MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input) { + assert(options.io_activity == Env::IOActivity::kUnknown); + SequenceNumber min_uncommitted, snap_seq; + const SnapshotBackup backed_by_snapshot = + wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted, + backed_by_snapshot); + write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys, + keys, values, statuses, sorted_input, + &callback); + if (UNLIKELY(!callback.valid() || + !wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); + for (size_t i = 0; i < num_keys; i++) { + statuses[i] = Status::TryAgain(); + } + } +} + +Status WritePreparedTxn::Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val) { + if (options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + SequenceNumber min_uncommitted, snap_seq; + const SnapshotBackup backed_by_snapshot = + wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted, + 
backed_by_snapshot); + Status res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key, + pinnable_val, &callback); + const bool callback_valid = + callback.valid(); // NOTE: validity of callback must always be checked + // before it is destructed + if (res.ok()) { + if (!LIKELY(callback_valid && + wpt_db_->ValidateSnapshot(callback.max_visible_seq(), + backed_by_snapshot))) { + wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); + res = Status::TryAgain(); + } + } + + return res; +} + +Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options) { + // Make sure to get iterator from WritePrepareTxnDB, not the root db. + Iterator* db_iter = wpt_db_->NewIterator(options); + assert(db_iter); + + return write_batch_.NewIteratorWithBase(db_iter); +} + +Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) { + // Make sure to get iterator from WritePrepareTxnDB, not the root db. + Iterator* db_iter = wpt_db_->NewIterator(options, column_family); + assert(db_iter); + + return write_batch_.NewIteratorWithBase(column_family, db_iter); +} + +Status WritePreparedTxn::PrepareInternal() { + WriteOptions write_options = write_options_; + write_options.disableWAL = false; + const bool WRITE_AFTER_COMMIT = true; + const bool kFirstPrepareBatch = true; + auto s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(), + name_, !WRITE_AFTER_COMMIT); + assert(s.ok()); + // For each duplicate key we account for a new sub-batch + prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt(); + // Having AddPrepared in the PreReleaseCallback allows in-order addition of + // prepared entries to PreparedHeap and hence enables an optimization. Refer + // to SmallestUnCommittedSeq for more details. + AddPreparedCallback add_prepared_callback( + wpt_db_, db_impl_, prepare_batch_cnt_, + db_impl_->immutable_db_options().two_write_queues, kFirstPrepareBatch); + const bool DISABLE_MEMTABLE = true; + uint64_t seq_used = kMaxSequenceNumber; + s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(), + /*callback*/ nullptr, &log_number_, /*log ref*/ 0, + !DISABLE_MEMTABLE, &seq_used, prepare_batch_cnt_, + &add_prepared_callback); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + auto prepare_seq = seq_used; + SetId(prepare_seq); + return s; +} + +Status WritePreparedTxn::CommitWithoutPrepareInternal() { + // For each duplicate key we account for a new sub-batch + const size_t batch_cnt = GetWriteBatch()->SubBatchCnt(); + return CommitBatchInternal(GetWriteBatch()->GetWriteBatch(), batch_cnt); +} + +Status WritePreparedTxn::CommitBatchInternal(WriteBatch* batch, + size_t batch_cnt) { + return wpt_db_->WriteInternal(write_options_, batch, batch_cnt, this); +} + +Status WritePreparedTxn::CommitInternal() { + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "CommitInternal prepare_seq: %" PRIu64, GetID()); + // We take the commit-time batch and append the Commit marker. + // The Memtable will ignore the Commit marker in non-recovery mode + WriteBatch* working_batch = GetCommitTimeWriteBatch(); + const bool empty = working_batch->Count() == 0; + auto s = WriteBatchInternal::MarkCommit(working_batch, name_); + assert(s.ok()); + + const bool for_recovery = use_only_the_last_commit_time_batch_for_recovery_; + if (!empty) { + // When not writing to memtable, we can still cache the latest write batch. 
+ // The cached batch will be written to memtable in WriteRecoverableState + // during FlushMemTable + if (for_recovery) { + WriteBatchInternal::SetAsLatestPersistentState(working_batch); + } else { + return Status::InvalidArgument( + "Commit-time-batch can only be used if " + "use_only_the_last_commit_time_batch_for_recovery is true"); + } + } + + auto prepare_seq = GetId(); + const bool includes_data = !empty && !for_recovery; + assert(prepare_batch_cnt_); + size_t commit_batch_cnt = 0; + if (UNLIKELY(includes_data)) { + ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log, + "Duplicate key overhead"); + SubBatchCounter counter(*wpt_db_->GetCFComparatorMap()); + s = working_batch->Iterate(&counter); + assert(s.ok()); + commit_batch_cnt = counter.BatchCount(); + } + const bool disable_memtable = !includes_data; + const bool do_one_write = + !db_impl_->immutable_db_options().two_write_queues || disable_memtable; + WritePreparedCommitEntryPreReleaseCallback update_commit_map( + wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, commit_batch_cnt); + // This is to call AddPrepared on CommitTimeWriteBatch + const bool kFirstPrepareBatch = true; + AddPreparedCallback add_prepared_callback( + wpt_db_, db_impl_, commit_batch_cnt, + db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch); + PreReleaseCallback* pre_release_callback; + if (do_one_write) { + pre_release_callback = &update_commit_map; + } else { + pre_release_callback = &add_prepared_callback; + } + uint64_t seq_used = kMaxSequenceNumber; + // Since the prepared batch is directly written to memtable, there is already + // a connection between the memtable and its WAL, so there is no need to + // redundantly reference the log that contains the prepared data. + const uint64_t zero_log_number = 0ull; + size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1; + // If `two_write_queues && includes_data`, then `do_one_write` is false. The + // following `WriteImpl` will insert the data of the commit-time-batch into + // the database before updating the commit cache. Therefore, the data of the + // commit-time-batch is considered uncommitted. Furthermore, since data of + // the commit-time-batch are not locked, it is possible for two uncommitted + // versions of the same key to co-exist for a (short) period of time until + // the commit cache is updated by the second write. If the two uncommitted + // keys are compacted to the bottommost level in the meantime, it is possible + // that compaction iterator will zero out the sequence numbers of both, thus + // violating the invariant that an SST does not have two identical internal + // keys. To prevent this situation, we should allow the usage of + // commit-time-batch only if the user sets + // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery to + // true. See the comments about GetCommitTimeWriteBatch() in + // include/rocksdb/utilities/transaction.h. + s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr, + zero_log_number, disable_memtable, &seq_used, + batch_cnt, pre_release_callback); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + const SequenceNumber commit_batch_seq = seq_used; + if (LIKELY(do_one_write || !s.ok())) { + if (UNLIKELY(!db_impl_->immutable_db_options().two_write_queues && + s.ok())) { + // Note: RemovePrepared should be called after WriteImpl that published + // the seq. Otherwise SmallestUnCommittedSeq optimization breaks.
+ wpt_db_->RemovePrepared(prepare_seq, prepare_batch_cnt_); + } // else RemovePrepared is called from within PreReleaseCallback + if (UNLIKELY(!do_one_write)) { + assert(!s.ok()); + // Cleanup the prepared entry we added with add_prepared_callback + wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); + } + return s; + } // else do the 2nd write to publish seq + // Note: the 2nd write comes with a performance penalty. So if we have too + // many commits accompanied by CommitTimeWriteBatch and yet we cannot + // enable use_only_the_last_commit_time_batch_for_recovery_ optimization, + // two_write_queues should be disabled to avoid many additional writes here. + const size_t kZeroData = 0; + // Update commit map only from the 2nd queue + WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_aux_batch( + wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, kZeroData, + commit_batch_seq, commit_batch_cnt); + WriteBatch empty_batch; + s = empty_batch.PutLogData(Slice()); + assert(s.ok()); + // In the absence of Prepare markers, use Noop as a batch separator + s = WriteBatchInternal::InsertNoop(&empty_batch); + assert(s.ok()); + const bool DISABLE_MEMTABLE = true; + const size_t ONE_BATCH = 1; + const uint64_t NO_REF_LOG = 0; + s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr, + NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + &update_commit_map_with_aux_batch); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + return s; +} + +Status WritePreparedTxn::RollbackInternal() { + ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log, + "RollbackInternal prepare_seq: %" PRIu64, GetId()); + + assert(db_impl_); + assert(wpt_db_); + + WriteBatch rollback_batch(0 /* reserved_bytes */, 0 /* max_bytes */, + write_options_.protection_bytes_per_key, + 0 /* default_cf_ts_sz */); + assert(GetId() != kMaxSequenceNumber); + assert(GetId() > 0); + auto cf_map_shared_ptr = wpt_db_->GetCFHandleMap(); + auto cf_comp_map_shared_ptr = wpt_db_->GetCFComparatorMap(); + auto read_at_seq = kMaxSequenceNumber; + ReadOptions roptions; + // to prevent the callback's seq from being overridden inside DBImpl::Get + roptions.snapshot = wpt_db_->GetMaxSnapshot(); + struct RollbackWriteBatchBuilder : public WriteBatch::Handler { + DBImpl* const db_; + WritePreparedTxnDB* const wpt_db_; + WritePreparedTxnReadCallback callback_; + WriteBatch* rollback_batch_; + std::map<uint32_t, const Comparator*>& comparators_; + std::map<uint32_t, ColumnFamilyHandle*>& handles_; + using CFKeys = std::set<Slice, SetComparator>; + std::map<uint32_t, CFKeys> keys_; + bool rollback_merge_operands_; + ReadOptions roptions_; + + RollbackWriteBatchBuilder( + DBImpl* db, WritePreparedTxnDB* wpt_db, SequenceNumber snap_seq, + WriteBatch* dst_batch, + std::map<uint32_t, const Comparator*>& comparators, + std::map<uint32_t, ColumnFamilyHandle*>& handles, + bool rollback_merge_operands, const ReadOptions& _roptions) + : db_(db), + wpt_db_(wpt_db), + callback_(wpt_db, snap_seq), // disable min_uncommitted optimization + rollback_batch_(dst_batch), + comparators_(comparators), + handles_(handles), + rollback_merge_operands_(rollback_merge_operands), + roptions_(_roptions) {} + + Status Rollback(uint32_t cf, const Slice& key) { + Status s; + CFKeys& cf_keys = keys_[cf]; + if (cf_keys.size() == 0) { // just inserted + auto cmp = comparators_[cf]; + keys_[cf] = CFKeys(SetComparator(cmp)); + } + auto it = cf_keys.insert(key); + // `second` is false if an element already existed.
+ if (it.second == false) { + return s; + } + + PinnableSlice pinnable_val; + bool not_used; + auto cf_handle = handles_[cf]; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cf_handle; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = ¬_used; + get_impl_options.callback = &callback_; + s = db_->GetImpl(roptions_, key, get_impl_options); + assert(s.ok() || s.IsNotFound()); + if (s.ok()) { + s = rollback_batch_->Put(cf_handle, key, pinnable_val); + assert(s.ok()); + } else if (s.IsNotFound()) { + // There has been no readable value before txn. By adding a delete we + // make sure that there will be none afterwards either. + if (wpt_db_->ShouldRollbackWithSingleDelete(cf_handle, key)) { + s = rollback_batch_->SingleDelete(cf_handle, key); + } else { + s = rollback_batch_->Delete(cf_handle, key); + } + assert(s.ok()); + } else { + // Unexpected status. Return it to the user. + } + return s; + } + + Status PutCF(uint32_t cf, const Slice& key, const Slice& /*val*/) override { + return Rollback(cf, key); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return Rollback(cf, key); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return Rollback(cf, key); + } + + Status MergeCF(uint32_t cf, const Slice& key, + const Slice& /*val*/) override { + if (rollback_merge_operands_) { + return Rollback(cf, key); + } else { + return Status::OK(); + } + } + + Status MarkNoop(bool) override { return Status::OK(); } + Status MarkBeginPrepare(bool) override { return Status::OK(); } + Status MarkEndPrepare(const Slice&) override { return Status::OK(); } + Status MarkCommit(const Slice&) override { return Status::OK(); } + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + + protected: + Handler::OptionState WriteAfterCommit() const override { + return Handler::OptionState::kDisabled; + } + } rollback_handler(db_impl_, wpt_db_, read_at_seq, &rollback_batch, + *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(), + wpt_db_->txn_db_options_.rollback_merge_operands, + roptions); + auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler); + if (!s.ok()) { + return s; + } + // The Rollback marker will be used as a batch separator + s = WriteBatchInternal::MarkRollback(&rollback_batch, name_); + assert(s.ok()); + bool do_one_write = !db_impl_->immutable_db_options().two_write_queues; + const bool DISABLE_MEMTABLE = true; + const uint64_t NO_REF_LOG = 0; + uint64_t seq_used = kMaxSequenceNumber; + const size_t ONE_BATCH = 1; + const bool kFirstPrepareBatch = true; + // We commit the rolled back prepared batches. Although this is + // counter-intuitive, i) it is safe to do so, since the prepared batches are + // already canceled out by the rollback batch, ii) adding the commit entry to + // CommitCache will allow us to benefit from the existing mechanism in + // CommitCache that keeps an entry evicted due to max advance and yet overlaps + // with a live snapshot around so that the live snapshot properly skips the + // entry even if its prepare seq is lower than max_evicted_seq_. 
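+ //
+ // Illustrative sketch (editorial, not part of the vendored sources): the
+ // resulting write sequence for a rolled-back prepared transaction under
+ // two_write_queues:
+ //
+ //   [prepare batch @ prep_seq]  ->  [rollback batch @ rollback_seq]  ->
+ //   [empty publish batch]       ->  commit-cache entries added for both
+ //                                   prep_seq and rollback_seq
+ //
+ // i.e. the rollback itself is "committed", cancelling the prepared data: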
+ AddPreparedCallback add_prepared_callback( + wpt_db_, db_impl_, ONE_BATCH, + db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch); + WritePreparedCommitEntryPreReleaseCallback update_commit_map( + wpt_db_, db_impl_, GetId(), prepare_batch_cnt_, ONE_BATCH); + PreReleaseCallback* pre_release_callback; + if (do_one_write) { + pre_release_callback = &update_commit_map; + } else { + pre_release_callback = &add_prepared_callback; + } + // Note: the rollback batch does not need AddPrepared since it is written to + // DB in one shot. min_uncommitted still works since it requires capturing + // data that is written to DB but not yet committed, while + // the rollback batch commits with PreReleaseCallback. + s = db_impl_->WriteImpl(write_options_, &rollback_batch, nullptr, nullptr, + NO_REF_LOG, !DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + pre_release_callback); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + if (!s.ok()) { + return s; + } + if (do_one_write) { + assert(!db_impl_->immutable_db_options().two_write_queues); + wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_); + return s; + } // else do the 2nd write for commit + uint64_t rollback_seq = seq_used; + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "RollbackInternal 2nd write rollback_seq: %" PRIu64, + rollback_seq); + // Commit the batch by writing an empty batch to the queue that will release + // the commit sequence number to readers. + WritePreparedRollbackPreReleaseCallback update_commit_map_with_prepare( + wpt_db_, db_impl_, GetId(), rollback_seq, prepare_batch_cnt_); + WriteBatch empty_batch; + s = empty_batch.PutLogData(Slice()); + assert(s.ok()); + // In the absence of Prepare markers, use Noop as a batch separator + s = WriteBatchInternal::InsertNoop(&empty_batch); + assert(s.ok()); + s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr, + NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + &update_commit_map_with_prepare); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "RollbackInternal (status=%s) commit: %" PRIu64, + s.ToString().c_str(), GetId()); + // TODO(lth): For WriteUnPrepared that rollback is called frequently, + // RemovePrepared could be moved to the callback to reduce lock contention. + if (s.ok()) { + wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_); + } + // Note: RemovePrepared for prepared batch is called from within + // PreReleaseCallback + wpt_db_->RemovePrepared(rollback_seq, ONE_BATCH); + + return s; +} + +Status WritePreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq) { + assert(snapshot_); + + SequenceNumber min_uncommitted = + static_cast_with_check<const SnapshotImpl>(snapshot_.get()) + ->min_uncommitted_; + SequenceNumber snap_seq = snapshot_->GetSequenceNumber(); + // tracked_at_seq is either max or the last snapshot with which this key was + // tracked so there is no need to apply the IsInSnapshot to this comparison + // here as tracked_at_seq is not a prepare seq. + if (*tracked_at_seq <= snap_seq) { + // If the key has been previously validated at a sequence number earlier + // than the current snapshot's sequence number, we already know it has not + // been modified. + return Status::OK(); + } + + *tracked_at_seq = snap_seq; + + ColumnFamilyHandle* cfh = + column_family ?
column_family : db_impl_->DefaultColumnFamily(); + + WritePreparedTxnReadCallback snap_checker(wpt_db_, snap_seq, min_uncommitted, + kBackedByDBSnapshot); + // TODO(yanqin): support user-defined timestamp + return TransactionUtil::CheckKeyForConflicts( + db_impl_, cfh, key.ToString(), snap_seq, /*ts=*/nullptr, + false /* cache_only */, &snap_checker, min_uncommitted); +} + +void WritePreparedTxn::SetSnapshot() { + const bool kForWWConflictCheck = true; + SnapshotImpl* snapshot = wpt_db_->GetSnapshotInternal(kForWWConflictCheck); + SetSnapshotInternal(snapshot); +} + +Status WritePreparedTxn::RebuildFromWriteBatch(WriteBatch* src_batch) { + auto ret = PessimisticTransaction::RebuildFromWriteBatch(src_batch); + prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt(); + return ret; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn.h new file mode 100644 index 0000000000..f621e37aba --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn.h @@ -0,0 +1,117 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + + +#include +#include +#include +#include +#include +#include +#include + +#include "db/write_callback.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/autovector.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/transaction_base.h" +#include "utilities/transactions/transaction_util.h" + +namespace ROCKSDB_NAMESPACE { + +class WritePreparedTxnDB; + +// This impl could write to DB also uncommitted data and then later tell apart +// committed data from uncommitted data. Uncommitted data could be after the +// Prepare phase in 2PC (WritePreparedTxn) or before that +// (WriteUnpreparedTxnImpl). +class WritePreparedTxn : public PessimisticTransaction { + public: + WritePreparedTxn(WritePreparedTxnDB* db, const WriteOptions& write_options, + const TransactionOptions& txn_options); + // No copying allowed + WritePreparedTxn(const WritePreparedTxn&) = delete; + void operator=(const WritePreparedTxn&) = delete; + + virtual ~WritePreparedTxn() {} + + // To make WAL commit markers visible, the snapshot will be based on the last + // seq in the WAL that is also published, LastPublishedSequence, as opposed to + // the last seq in the memtable. + using Transaction::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + + using Transaction::MultiGet; + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override; + + // Note: The behavior is undefined in presence of interleaved writes to the + // same transaction. 
+ // To make WAL commit markers visible, the snapshot will be + // based on the last seq in the WAL that is also published, + // LastPublishedSequence, as opposed to the last seq in the memtable. + using Transaction::GetIterator; + virtual Iterator* GetIterator(const ReadOptions& options) override; + virtual Iterator* GetIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override; + + virtual void SetSnapshot() override; + + protected: + void Initialize(const TransactionOptions& txn_options) override; + // Override the protected SetId to make it visible to the friend class + // WritePreparedTxnDB + inline void SetId(uint64_t id) override { Transaction::SetId(id); } + + private: + friend class WritePreparedTransactionTest_BasicRecoveryTest_Test; + friend class WritePreparedTxnDB; + friend class WriteUnpreparedTxnDB; + friend class WriteUnpreparedTxn; + + Status PrepareInternal() override; + + Status CommitWithoutPrepareInternal() override; + + Status CommitBatchInternal(WriteBatch* batch, size_t batch_cnt) override; + + // Since the data is already written to memtables at the Prepare phase, the + // commit entails writing only a commit marker in the WAL. The sequence number + // of the commit marker is then the commit timestamp of the transaction. To + // make WAL commit markers visible, the snapshot will be based on the last seq + // in the WAL that is also published, LastPublishedSequence, as opposed to the + // last seq in the memtable. + Status CommitInternal() override; + + Status RollbackInternal() override; + + virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq) override; + + virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override; + + WritePreparedTxnDB* wpt_db_; + // Number of sub-batches in prepare + size_t prepare_batch_cnt_ = 0; +}; + +} // namespace ROCKSDB_NAMESPACE + diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn_db.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn_db.cc new file mode 100644 index 0000000000..6118c3549e --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn_db.cc @@ -0,0 +1,1038 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#include "utilities/transactions/write_prepared_txn_db.h" + +#include +#include +#include +#include +#include + +#include "db/arena_wrapped_db_iter.h" +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/mutexlock.h" +#include "util/string_util.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/transaction_db_mutex_impl.h" + +// This function is for testing only. If it returns true, then all entries in +// the commit cache will be evicted. Unit and/or stress tests (db_stress) +// can implement this function and customize how frequently commit cache +// eviction occurs. +// TODO: remove this function once we can configure commit cache to be very +// small so that eviction occurs very frequently. 
This requires the commit
+// cache entry to be able to encode prepare and commit sequence numbers so
+// that the commit sequence number does not have to be within a certain range
+// of the prepare sequence number.
+extern "C" bool rocksdb_write_prepared_TEST_ShouldClearCommitCache(void)
+    __attribute__((__weak__));
+
+namespace ROCKSDB_NAMESPACE {
+
+Status WritePreparedTxnDB::Initialize(
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles) {
+  auto dbimpl = static_cast_with_check<DBImpl>(GetRootDB());
+  assert(dbimpl != nullptr);
+  auto rtxns = dbimpl->recovered_transactions();
+  std::map<SequenceNumber, size_t> ordered_seq_cnt;
+  for (auto rtxn : rtxns) {
+    // There should be only one batch for WritePrepared policy.
+    assert(rtxn.second->batches_.size() == 1);
+    const auto& seq = rtxn.second->batches_.begin()->first;
+    const auto& batch_info = rtxn.second->batches_.begin()->second;
+    auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
+    ordered_seq_cnt[seq] = cnt;
+  }
+  // AddPrepared must be called in order
+  for (auto seq_cnt : ordered_seq_cnt) {
+    auto seq = seq_cnt.first;
+    auto cnt = seq_cnt.second;
+    for (size_t i = 0; i < cnt; i++) {
+      AddPrepared(seq + i);
+    }
+  }
+  SequenceNumber prev_max = max_evicted_seq_;
+  SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber();
+  AdvanceMaxEvictedSeq(prev_max, last_seq);
+  // Create a gap between max and the next snapshot. This simplifies the logic
+  // in IsInSnapshot by not having to consider the special case of max ==
+  // snapshot after recovery. This is tested in IsInSnapshotEmptyMapTest.
+  if (last_seq) {
+    db_impl_->versions_->SetLastAllocatedSequence(last_seq + 1);
+    db_impl_->versions_->SetLastSequence(last_seq + 1);
+    db_impl_->versions_->SetLastPublishedSequence(last_seq + 1);
+  }
+
+  db_impl_->SetSnapshotChecker(new WritePreparedSnapshotChecker(this));
+  // A callback to commit a single sub-batch
+  class CommitSubBatchPreReleaseCallback : public PreReleaseCallback {
+   public:
+    explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db)
+        : db_(db) {}
+    Status Callback(SequenceNumber commit_seq,
+                    bool is_mem_disabled __attribute__((__unused__)), uint64_t,
+                    size_t /*index*/, size_t /*total*/) override {
+      assert(!is_mem_disabled);
+      db_->AddCommitted(commit_seq, commit_seq);
+      return Status::OK();
+    }
+
+   private:
+    WritePreparedTxnDB* db_;
+  };
+  db_impl_->SetRecoverableStatePreReleaseCallback(
+      new CommitSubBatchPreReleaseCallback(this));
+
+  auto s = PessimisticTransactionDB::Initialize(compaction_enabled_cf_indices,
+                                                handles);
+  return s;
+}
+
+Status WritePreparedTxnDB::VerifyCFOptions(
+    const ColumnFamilyOptions& cf_options) {
+  Status s = PessimisticTransactionDB::VerifyCFOptions(cf_options);
+  if (!s.ok()) {
+    return s;
+  }
+  if (!cf_options.memtable_factory->CanHandleDuplicatedKey()) {
+    return Status::InvalidArgument(
+        "memtable_factory->CanHandleDuplicatedKey() cannot be false with "
+        "WritePrepared transactions");
+  }
+  return Status::OK();
+}
+
+Transaction* WritePreparedTxnDB::BeginTransaction(
+    const WriteOptions& write_options, const TransactionOptions& txn_options,
+    Transaction* old_txn) {
+  if (old_txn != nullptr) {
+    ReinitializeTransaction(old_txn, write_options, txn_options);
+    return old_txn;
+  } else {
+    return new WritePreparedTxn(this, write_options, txn_options);
+  }
+}
+
+Status WritePreparedTxnDB::Write(const WriteOptions& opts,
+                                 WriteBatch* updates) {
+  if (txn_db_options_.skip_concurrency_control) {
+    // Skip locking the rows
+    const size_t UNKNOWN_BATCH_CNT = 0;
+    WritePreparedTxn* NO_TXN = nullptr;
+    return WriteInternal(opts, updates, UNKNOWN_BATCH_CNT, NO_TXN);
+  } else {
+    return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates);
+  }
+}
+
+Status WritePreparedTxnDB::Write(
+    const WriteOptions& opts,
+    const TransactionDBWriteOptimizations& optimizations, WriteBatch* updates) {
+  if (optimizations.skip_concurrency_control) {
+    // Skip locking the rows
+    const size_t UNKNOWN_BATCH_CNT = 0;
+    const size_t ONE_BATCH_CNT = 1;
+    const size_t batch_cnt = optimizations.skip_duplicate_key_check
+                                 ? ONE_BATCH_CNT
+                                 : UNKNOWN_BATCH_CNT;
+    WritePreparedTxn* NO_TXN = nullptr;
+    return WriteInternal(opts, updates, batch_cnt, NO_TXN);
+  } else {
+    // TODO(myabandeh): Make use of skip_duplicate_key_check hint
+    // Fall back to unoptimized version
+    return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates);
+  }
+}
+
+Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig,
+                                         WriteBatch* batch, size_t batch_cnt,
+                                         WritePreparedTxn* txn) {
+  ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+                    "CommitBatchInternal");
+  if (batch->Count() == 0) {
+    // Otherwise our 1-seq-per-batch logic will break since no seq is
+    // increased for this batch.
+    return Status::OK();
+  }
+
+  if (write_options_orig.protection_bytes_per_key > 0) {
+    auto s = WriteBatchInternal::UpdateProtectionInfo(
+        batch, write_options_orig.protection_bytes_per_key);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (batch_cnt == 0) {  // not provided, then compute it
+    // TODO(myabandeh): add an option to allow users to skip this cost
+    SubBatchCounter counter(*GetCFComparatorMap());
+    auto s = batch->Iterate(&counter);
+    if (!s.ok()) {
+      return s;
+    }
+    batch_cnt = counter.BatchCount();
+    WPRecordTick(TXN_DUPLICATE_KEY_OVERHEAD);
+    ROCKS_LOG_DETAILS(info_log_, "Duplicate key overhead: %" PRIu64 " batches",
+                      static_cast<uint64_t>(batch_cnt));
+  }
+  assert(batch_cnt);
+
+  bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
+  WriteOptions write_options(write_options_orig);
+  // In the absence of Prepare markers, use Noop as a batch separator
+  auto s = WriteBatchInternal::InsertNoop(batch);
+  assert(s.ok());
+  const bool DISABLE_MEMTABLE = true;
+  const uint64_t no_log_ref = 0;
+  uint64_t seq_used = kMaxSequenceNumber;
+  const size_t ZERO_PREPARES = 0;
+  const bool kSeperatePrepareCommitBatches = true;
+  // Since this is not 2pc, there is no need for AddPrepared, but having it in
+  // the PreReleaseCallback enables an optimization. Refer to
+  // SmallestUnCommittedSeq for more details.
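+  // Editor's sketch of the two-write-queues flow implemented below; the
+  // sequence numbers are illustrative only:
+  //
+  //   1st write: the batch goes to the WAL and memtable and is assigned,
+  //              say, prepare_seq = 100; AddPreparedCallback registers it
+  //              as in-flight.
+  //   2nd write: an empty batch (WAL and memtable disabled) is assigned
+  //              seq 101; its PreReleaseCallback publishes (100 -> 101) in
+  //              the commit cache, making the data visible to readers.
+  //
+  // With a single write queue (do_one_write), both steps collapse into one
+  // WriteImpl call and update_commit_map commits the batch directly.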
+  AddPreparedCallback add_prepared_callback(
+      this, db_impl_, batch_cnt,
+      db_impl_->immutable_db_options().two_write_queues,
+      !kSeperatePrepareCommitBatches);
+  WritePreparedCommitEntryPreReleaseCallback update_commit_map(
+      this, db_impl_, kMaxSequenceNumber, ZERO_PREPARES, batch_cnt);
+  PreReleaseCallback* pre_release_callback;
+  if (do_one_write) {
+    pre_release_callback = &update_commit_map;
+  } else {
+    pre_release_callback = &add_prepared_callback;
+  }
+  s = db_impl_->WriteImpl(write_options, batch, nullptr, nullptr, no_log_ref,
+                          !DISABLE_MEMTABLE, &seq_used, batch_cnt,
+                          pre_release_callback);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  uint64_t prepare_seq = seq_used;
+  if (txn != nullptr) {
+    txn->SetId(prepare_seq);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+  if (do_one_write) {
+    return s;
+  }  // else do the 2nd write for commit
+  ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+                    "CommitBatchInternal 2nd write prepare_seq: %" PRIu64,
+                    prepare_seq);
+  // Commit the batch by writing an empty batch to the 2nd queue that will
+  // release the commit sequence number to readers.
+  const size_t ZERO_COMMITS = 0;
+  WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_prepare(
+      this, db_impl_, prepare_seq, batch_cnt, ZERO_COMMITS);
+  WriteBatch empty_batch;
+  write_options.disableWAL = true;
+  write_options.sync = false;
+  const size_t ONE_BATCH = 1;  // Just to inc the seq
+  s = db_impl_->WriteImpl(write_options, &empty_batch, nullptr, nullptr,
+                          no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+                          &update_commit_map_with_prepare);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  // Note: RemovePrepared is called from within PreReleaseCallback
+  return s;
+}
+
+Status WritePreparedTxnDB::Get(const ReadOptions& options,
+                               ColumnFamilyHandle* column_family,
+                               const Slice& key, PinnableSlice* value) {
+  if (options.io_activity != Env::IOActivity::kUnknown) {
+    return Status::InvalidArgument(
+        "Cannot call Get with `ReadOptions::io_activity` != "
+        "`Env::IOActivity::kUnknown`");
+  }
+  SequenceNumber min_uncommitted, snap_seq;
+  const SnapshotBackup backed_by_snapshot =
+      AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+  WritePreparedTxnReadCallback callback(this, snap_seq, min_uncommitted,
+                                        backed_by_snapshot);
+  bool* dont_care = nullptr;
+  DBImpl::GetImplOptions get_impl_options;
+  get_impl_options.column_family = column_family;
+  get_impl_options.value = value;
+  get_impl_options.value_found = dont_care;
+  get_impl_options.callback = &callback;
+  auto res = db_impl_->GetImpl(options, key, get_impl_options);
+  if (LIKELY(callback.valid() && ValidateSnapshot(callback.max_visible_seq(),
+                                                  backed_by_snapshot))) {
+    return res;
+  } else {
+    res.PermitUncheckedError();
+    WPRecordTick(TXN_GET_TRY_AGAIN);
+    return Status::TryAgain();
+  }
+}
+
+void WritePreparedTxnDB::UpdateCFComparatorMap(
+    const std::vector<ColumnFamilyHandle*>& handles) {
+  auto cf_map = new std::map<uint32_t, const Comparator*>();
+  auto handle_map = new std::map<uint32_t, ColumnFamilyHandle*>();
+  for (auto h : handles) {
+    auto id = h->GetID();
+    const Comparator* comparator = h->GetComparator();
+    (*cf_map)[id] = comparator;
+    if (id != 0) {
+      (*handle_map)[id] = h;
+    } else {
+      // The pointer to the default cf handle in the handles will be deleted.
+      // Use the pointer maintained by the db instead.
+      (*handle_map)[id] = DefaultColumnFamily();
+    }
+  }
+  cf_map_.reset(cf_map);
+  handle_map_.reset(handle_map);
+}
+
+void WritePreparedTxnDB::UpdateCFComparatorMap(ColumnFamilyHandle* h) {
+  auto old_cf_map_ptr = cf_map_.get();
+  assert(old_cf_map_ptr);
+  auto cf_map = new std::map<uint32_t, const Comparator*>(*old_cf_map_ptr);
+  auto old_handle_map_ptr = handle_map_.get();
+  assert(old_handle_map_ptr);
+  auto handle_map =
+      new std::map<uint32_t, ColumnFamilyHandle*>(*old_handle_map_ptr);
+  auto id = h->GetID();
+  const Comparator* comparator = h->GetComparator();
+  (*cf_map)[id] = comparator;
+  (*handle_map)[id] = h;
+  cf_map_.reset(cf_map);
+  handle_map_.reset(handle_map);
+}
+
+std::vector<Status> WritePreparedTxnDB::MultiGet(
+    const ReadOptions& options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  assert(values);
+  size_t num_keys = keys.size();
+  values->resize(num_keys);
+
+  std::vector<Status> stat_list(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    stat_list[i] = this->Get(options, column_family[i], keys[i], &(*values)[i]);
+  }
+  return stat_list;
+}
+
+// Struct to hold ownership of snapshot and read callback for iterator cleanup.
+struct WritePreparedTxnDB::IteratorState {
+  IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence,
+                std::shared_ptr<ManagedSnapshot> s,
+                SequenceNumber min_uncommitted)
+      : callback(txn_db, sequence, min_uncommitted, kBackedByDBSnapshot),
+        snapshot(s) {}
+
+  WritePreparedTxnReadCallback callback;
+  std::shared_ptr<ManagedSnapshot> snapshot;
+};
+
+namespace {
+static void CleanupWritePreparedTxnDBIterator(void* arg1, void* /*arg2*/) {
+  delete reinterpret_cast<WritePreparedTxnDB::IteratorState*>(arg1);
+}
+}  // anonymous namespace
+
+Iterator* WritePreparedTxnDB::NewIterator(const ReadOptions& options,
+                                          ColumnFamilyHandle* column_family) {
+  if (options.io_activity != Env::IOActivity::kUnknown) {
+    return NewErrorIterator(Status::InvalidArgument(
+        "Cannot call NewIterator with `ReadOptions::io_activity` != "
+        "`Env::IOActivity::kUnknown`"));
+  }
+  constexpr bool expose_blob_index = false;
+  constexpr bool allow_refresh = false;
+  std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr;
+  SequenceNumber snapshot_seq = kMaxSequenceNumber;
+  SequenceNumber min_uncommitted = 0;
+  if (options.snapshot != nullptr) {
+    snapshot_seq = options.snapshot->GetSequenceNumber();
+    min_uncommitted =
+        static_cast_with_check<const SnapshotImpl>(options.snapshot)
+            ->min_uncommitted_;
+  } else {
+    auto* snapshot = GetSnapshot();
+    // We take a snapshot to make sure that the related data in the commit map
+    // are not deleted.
+    snapshot_seq = snapshot->GetSequenceNumber();
+    min_uncommitted =
+        static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+    own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot);
+  }
+  assert(snapshot_seq != kMaxSequenceNumber);
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  auto* state =
+      new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted);
+  auto* db_iter =
+      db_impl_->NewIteratorImpl(options, cfd, snapshot_seq, &state->callback,
+                                expose_blob_index, allow_refresh);
+  db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr);
+  return db_iter;
+}
+
+Status WritePreparedTxnDB::NewIterators(
+    const ReadOptions& options,
+    const std::vector<ColumnFamilyHandle*>& column_families,
+    std::vector<Iterator*>* iterators) {
+  constexpr bool expose_blob_index = false;
+  constexpr bool allow_refresh = false;
+  std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr;
+  SequenceNumber snapshot_seq = kMaxSequenceNumber;
+  SequenceNumber min_uncommitted = 0;
+  if (options.snapshot != nullptr) {
+    snapshot_seq = options.snapshot->GetSequenceNumber();
+    min_uncommitted =
+        static_cast_with_check<const SnapshotImpl>(options.snapshot)
+            ->min_uncommitted_;
+  } else {
+    auto* snapshot = GetSnapshot();
+    // We take a snapshot to make sure that the related data in the commit map
+    // are not deleted.
+    snapshot_seq = snapshot->GetSequenceNumber();
+    own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot);
+    min_uncommitted =
+        static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+  }
+  iterators->clear();
+  iterators->reserve(column_families.size());
+  for (auto* column_family : column_families) {
+    auto* cfd =
+        static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+    auto* state =
+        new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted);
+    auto* db_iter =
+        db_impl_->NewIteratorImpl(options, cfd, snapshot_seq, &state->callback,
+                                  expose_blob_index, allow_refresh);
+    db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr);
+    iterators->push_back(db_iter);
+  }
+  return Status::OK();
+}
+
+void WritePreparedTxnDB::Init(const TransactionDBOptions& txn_db_opts) {
+  // Advance max_evicted_seq_ no more than 100 times before the cache wraps
+  // around.
+  INC_STEP_FOR_MAX_EVICTED =
+      std::max(COMMIT_CACHE_SIZE / 100, static_cast<size_t>(1));
+  snapshot_cache_ = std::unique_ptr<std::atomic<SequenceNumber>[]>(
+      new std::atomic<SequenceNumber>[SNAPSHOT_CACHE_SIZE] {});
+  commit_cache_ = std::unique_ptr<std::atomic<CommitEntry64b>[]>(
+      new std::atomic<CommitEntry64b>[COMMIT_CACHE_SIZE] {});
+  dummy_max_snapshot_.number_ = kMaxSequenceNumber;
+  rollback_deletion_type_callback_ =
+      txn_db_opts.rollback_deletion_type_callback;
+}
+
+void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max,
+                                                 bool locked) {
+  // When max_evicted_seq_ advances, move older entries from prepared_txns_
+  // to delayed_prepared_. This guarantees that if a seq is lower than max,
+  // then it is not in prepared_txns_, saving an expensive, synchronized
+  // lookup from a shared set. delayed_prepared_ is expected to be empty in
+  // normal cases.
+  ROCKS_LOG_DETAILS(
+      info_log_,
+      "CheckPreparedAgainstMax prepared_txns_.empty() %d top: %" PRIu64,
+      prepared_txns_.empty(),
+      prepared_txns_.empty() ? 0 : prepared_txns_.top());
+  const SequenceNumber prepared_top = prepared_txns_.top();
+  const bool empty = prepared_top == kMaxSequenceNumber;
+  // Preliminary check to avoid the synchronization cost
+  if (!empty && prepared_top <= new_max) {
+    if (locked) {
+      // Needed to avoid double locking in pop().
+      prepared_txns_.push_pop_mutex()->Unlock();
+    }
+    WriteLock wl(&prepared_mutex_);
+    // Need to fetch fresh values of ::top after the mutex is acquired
+    while (!prepared_txns_.empty() && prepared_txns_.top() <= new_max) {
+      auto to_be_popped = prepared_txns_.top();
+      delayed_prepared_.insert(to_be_popped);
+      ROCKS_LOG_WARN(info_log_,
+                     "prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64
+                     " new_max=%" PRIu64 ")",
+                     static_cast<uint64_t>(delayed_prepared_.size()),
+                     to_be_popped, new_max);
+      delayed_prepared_empty_.store(false, std::memory_order_release);
+      // Update prepared_txns_ after updating delayed_prepared_empty_ otherwise
+      // there will be a point in time that the entry is neither in
+      // prepared_txns_ nor in delayed_prepared_, which will not be checked if
+      // delayed_prepared_empty_ is false.
+      prepared_txns_.pop();
+    }
+    if (locked) {
+      prepared_txns_.push_pop_mutex()->Lock();
+    }
+  }
+}
+
+void WritePreparedTxnDB::AddPrepared(uint64_t seq, bool locked) {
+  ROCKS_LOG_DETAILS(info_log_, "Txn %" PRIu64 " Preparing with max %" PRIu64,
+                    seq, max_evicted_seq_.load());
+  TEST_SYNC_POINT("AddPrepared::begin:pause");
+  TEST_SYNC_POINT("AddPrepared::begin:resume");
+  if (!locked) {
+    prepared_txns_.push_pop_mutex()->Lock();
+  }
+  prepared_txns_.push_pop_mutex()->AssertHeld();
+  prepared_txns_.push(seq);
+  auto new_max = future_max_evicted_seq_.load();
+  if (UNLIKELY(seq <= new_max)) {
+    // This should not happen in the normal case
+    ROCKS_LOG_ERROR(
+        info_log_,
+        "Added prepare_seq is not larger than max_evicted_seq_: %" PRIu64
+        " <= %" PRIu64,
+        seq, new_max);
+    CheckPreparedAgainstMax(new_max, true /*locked*/);
+  }
+  if (!locked) {
+    prepared_txns_.push_pop_mutex()->Unlock();
+  }
+  TEST_SYNC_POINT("AddPrepared::end");
+}
+
+void WritePreparedTxnDB::AddCommitted(uint64_t prepare_seq, uint64_t commit_seq,
+                                      uint8_t loop_cnt) {
+  ROCKS_LOG_DETAILS(info_log_, "Txn %" PRIu64 " Committing with %" PRIu64,
+                    prepare_seq, commit_seq);
+  TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:start");
+  TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:start:pause");
+  auto indexed_seq = prepare_seq % COMMIT_CACHE_SIZE;
+  CommitEntry64b evicted_64b;
+  CommitEntry evicted;
+  bool to_be_evicted = GetCommitEntry(indexed_seq, &evicted_64b, &evicted);
+  if (LIKELY(to_be_evicted)) {
+    assert(evicted.prep_seq != prepare_seq);
+    auto prev_max = max_evicted_seq_.load(std::memory_order_acquire);
+    ROCKS_LOG_DETAILS(info_log_,
+                      "Evicting %" PRIu64 ",%" PRIu64 " with max %" PRIu64,
+                      evicted.prep_seq, evicted.commit_seq, prev_max);
+    if (prev_max < evicted.commit_seq) {
+      auto last = db_impl_->GetLastPublishedSequence();  // could be 0
+      SequenceNumber max_evicted_seq;
+      if (LIKELY(evicted.commit_seq < last)) {
+        assert(last > 0);
+        // Inc max in larger steps to avoid frequent updates
+        max_evicted_seq =
+            std::min(evicted.commit_seq + INC_STEP_FOR_MAX_EVICTED, last - 1);
+      } else {
+        // legit when a commit entry in a write batch overwrites the previous
+        // one
+        max_evicted_seq = evicted.commit_seq;
+      }
+#ifdef OS_LINUX
+      if (rocksdb_write_prepared_TEST_ShouldClearCommitCache &&
+          rocksdb_write_prepared_TEST_ShouldClearCommitCache()) {
+        max_evicted_seq = last;
+      }
+#endif  // OS_LINUX
+      ROCKS_LOG_DETAILS(info_log_,
+                        "%lu Evicting %" PRIu64 ",%" PRIu64 " with max %" PRIu64
+                        " => %lu",
+                        prepare_seq, evicted.prep_seq, evicted.commit_seq,
+                        prev_max, max_evicted_seq);
+      AdvanceMaxEvictedSeq(prev_max, max_evicted_seq);
+    }
+    if (UNLIKELY(!delayed_prepared_empty_.load(std::memory_order_acquire))) {
+      WriteLock wl(&prepared_mutex_);
+      auto dp_iter = delayed_prepared_.find(evicted.prep_seq);
+      if (dp_iter != delayed_prepared_.end()) {
+        // This is a rare case in which a txn is committed but prepared_txns_
+        // is not cleaned up yet. Refer to the delayed_prepared_commits_
+        // definition for why it should be kept updated.
+        delayed_prepared_commits_[evicted.prep_seq] = evicted.commit_seq;
+        ROCKS_LOG_DEBUG(info_log_,
+                        "delayed_prepared_commits_[%" PRIu64 "]=%" PRIu64,
+                        evicted.prep_seq, evicted.commit_seq);
+      }
+    }
+    // After each eviction from commit cache, check if the commit entry should
+    // be kept around because it overlaps with a live snapshot.
+    CheckAgainstSnapshots(evicted);
+  }
+  bool succ =
+      ExchangeCommitEntry(indexed_seq, evicted_64b, {prepare_seq, commit_seq});
+  if (UNLIKELY(!succ)) {
+    ROCKS_LOG_ERROR(info_log_,
+                    "ExchangeCommitEntry failed on [%" PRIu64 "] %" PRIu64
+                    ",%" PRIu64 " retrying...",
+                    indexed_seq, prepare_seq, commit_seq);
+    // A very rare event, in which the commit entry is updated by another
+    // commit before we update it. Here we apply a very simple solution of
+    // retrying.
+    if (loop_cnt > 100) {
+      throw std::runtime_error("Infinite loop in AddCommitted!");
+    }
+    AddCommitted(prepare_seq, commit_seq, ++loop_cnt);
+    return;
+  }
+  TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:end");
+  TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:end:pause");
+}
+
+void WritePreparedTxnDB::RemovePrepared(const uint64_t prepare_seq,
+                                        const size_t batch_cnt) {
+  TEST_SYNC_POINT_CALLBACK(
+      "RemovePrepared:Start",
+      const_cast<void*>(reinterpret_cast<const void*>(&prepare_seq)));
+  TEST_SYNC_POINT("WritePreparedTxnDB::RemovePrepared:pause");
+  TEST_SYNC_POINT("WritePreparedTxnDB::RemovePrepared:resume");
+  ROCKS_LOG_DETAILS(info_log_,
+                    "RemovePrepared %" PRIu64 " cnt: %" ROCKSDB_PRIszt,
+                    prepare_seq, batch_cnt);
+  WriteLock wl(&prepared_mutex_);
+  for (size_t i = 0; i < batch_cnt; i++) {
+    prepared_txns_.erase(prepare_seq + i);
+    bool was_empty = delayed_prepared_.empty();
+    if (!was_empty) {
+      delayed_prepared_.erase(prepare_seq + i);
+      auto it = delayed_prepared_commits_.find(prepare_seq + i);
+      if (it != delayed_prepared_commits_.end()) {
+        ROCKS_LOG_DETAILS(info_log_, "delayed_prepared_commits_.erase %" PRIu64,
+                          prepare_seq + i);
+        delayed_prepared_commits_.erase(it);
+      }
+      bool is_empty = delayed_prepared_.empty();
+      if (was_empty != is_empty) {
+        delayed_prepared_empty_.store(is_empty, std::memory_order_release);
+      }
+    }
+  }
+}
+
+bool WritePreparedTxnDB::GetCommitEntry(const uint64_t indexed_seq,
+                                        CommitEntry64b* entry_64b,
+                                        CommitEntry* entry) const {
+  *entry_64b = commit_cache_[static_cast<size_t>(indexed_seq)].load(
+      std::memory_order_acquire);
+  bool valid = entry_64b->Parse(indexed_seq, entry, FORMAT);
+  return valid;
+}
+
+bool WritePreparedTxnDB::AddCommitEntry(const uint64_t indexed_seq,
+                                        const CommitEntry& new_entry,
+                                        CommitEntry* evicted_entry) {
+  CommitEntry64b new_entry_64b(new_entry, FORMAT);
+  CommitEntry64b evicted_entry_64b =
+      commit_cache_[static_cast<size_t>(indexed_seq)].exchange(
+          new_entry_64b, std::memory_order_acq_rel);
+  bool valid = evicted_entry_64b.Parse(indexed_seq, evicted_entry, FORMAT);
+  return valid;
+}
+
+bool WritePreparedTxnDB::ExchangeCommitEntry(const uint64_t indexed_seq,
+                                             CommitEntry64b& expected_entry_64b,
+                                             const CommitEntry& new_entry) {
+  auto& atomic_entry = commit_cache_[static_cast<size_t>(indexed_seq)];
+  CommitEntry64b new_entry_64b(new_entry, FORMAT);
+  bool succ = atomic_entry.compare_exchange_strong(
+      expected_entry_64b, new_entry_64b, std::memory_order_acq_rel,
+      std::memory_order_acquire);
+  return succ;
+}
+
+void WritePreparedTxnDB::AdvanceMaxEvictedSeq(const SequenceNumber& prev_max,
+                                              const SequenceNumber& new_max) {
+  ROCKS_LOG_DETAILS(info_log_,
+                    "AdvanceMaxEvictedSeq overhead %" PRIu64 " => %" PRIu64,
+                    prev_max, new_max);
+  // Declare the intention before getting snapshot from the DB. This helps a
+  // concurrent GetSnapshot to wait to catch up with future_max_evicted_seq_ if
+  // it has not already. Otherwise the new snapshot is when we ask the DB for
+  // snapshots smaller than future max.
+  auto updated_future_max = prev_max;
+  while (updated_future_max < new_max &&
+         !future_max_evicted_seq_.compare_exchange_weak(
+             updated_future_max, new_max, std::memory_order_acq_rel,
+             std::memory_order_relaxed)) {
+  };
+
+  CheckPreparedAgainstMax(new_max, false /*locked*/);
+
+  // With each change to max_evicted_seq_ fetch the live snapshots behind it.
+  // We use max as the version of the snapshot list to identify how fresh the
+  // list is. This works because the snapshots are between 0 and max, so the
+  // larger the max, the more complete the list is.
+  SequenceNumber new_snapshots_version = new_max;
+  std::vector<SequenceNumber> snapshots;
+  bool update_snapshots = false;
+  if (new_snapshots_version > snapshots_version_) {
+    // This is to avoid updating snapshots_ if it was already updated with a
+    // more recent version by a concurrent thread
+    update_snapshots = true;
+    // We only care about snapshots lower than max
+    snapshots = GetSnapshotListFromDB(new_max);
+  }
+  if (update_snapshots) {
+    UpdateSnapshots(snapshots, new_snapshots_version);
+    if (!snapshots.empty()) {
+      WriteLock wl(&old_commit_map_mutex_);
+      for (auto snap : snapshots) {
+        // This allows IsInSnapshot to tell apart the reads from invalid
+        // snapshots from the reads from committed values in valid snapshots.
+        old_commit_map_[snap];
+      }
+      old_commit_map_empty_.store(false, std::memory_order_release);
+    }
+  }
+  auto updated_prev_max = prev_max;
+  TEST_SYNC_POINT("AdvanceMaxEvictedSeq::update_max:pause");
+  TEST_SYNC_POINT("AdvanceMaxEvictedSeq::update_max:resume");
+  while (updated_prev_max < new_max &&
+         !max_evicted_seq_.compare_exchange_weak(updated_prev_max, new_max,
+                                                 std::memory_order_acq_rel,
+                                                 std::memory_order_relaxed)) {
+  };
+}
+
+const Snapshot* WritePreparedTxnDB::GetSnapshot() {
+  const bool kForWWConflictCheck = true;
+  return GetSnapshotInternal(!kForWWConflictCheck);
+}
+
+SnapshotImpl* WritePreparedTxnDB::GetSnapshotInternal(
+    bool for_ww_conflict_check) {
+  // Note: for this optimization setting the last sequence number and obtaining
+  // the smallest uncommitted seq should be done atomically. However to avoid
+  // the mutex overhead, we call SmallestUnCommittedSeq BEFORE taking the
+  // snapshot. Since we always update the list of unprepared seqs (via
+  // AddPrepared) AFTER the last sequence is updated, this guarantees that the
+  // smallest uncommitted seq that we pair with the snapshot is smaller than or
+  // equal to the value that would be obtained otherwise atomically. That is ok
+  // since this optimization works as long as min_uncommitted is less than or
+  // equal to the smallest uncommitted seq when the snapshot was taken.
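+  // Editor's note, an illustrative interleaving with hypothetical seqs:
+  //   t1: SmallestUnCommittedSeq() returns 50
+  //   t2: another txn prepares at seq 60 (list updated after seq allocation)
+  //   t3: GetSnapshotImpl() returns a snapshot at seq 60
+  // Pairing min_uncommitted = 50 with snapshot 60 is merely conservative:
+  // every seq in [50, 60] is still re-checked via IsInSnapshot, so no
+  // uncommitted write can be mistaken for a committed one; the only cost is
+  // a few extra lookups.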
+  auto min_uncommitted = WritePreparedTxnDB::SmallestUnCommittedSeq();
+  SnapshotImpl* snap_impl = db_impl_->GetSnapshotImpl(for_ww_conflict_check);
+  TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:first");
+  assert(snap_impl);
+  SequenceNumber snap_seq = snap_impl->GetSequenceNumber();
+  // Note: Check against future_max_evicted_seq_ (in contrast with
+  // max_evicted_seq_) in case there is a concurrent AdvanceMaxEvictedSeq.
+  if (UNLIKELY(snap_seq != 0 && snap_seq <= future_max_evicted_seq_)) {
+    // There is a very rare case in which a commit entry evicts another commit
+    // entry that is not published yet, thus advancing max evicted seq beyond
+    // the last published seq. This case is not likely in real-world setups so
+    // we handle it with a few retries.
+    size_t retry = 0;
+    SequenceNumber max;
+    while ((max = future_max_evicted_seq_.load()) != 0 &&
+           snap_impl->GetSequenceNumber() <= max && retry < 100) {
+      ROCKS_LOG_WARN(info_log_,
+                     "GetSnapshot snap: %" PRIu64 " max: %" PRIu64
+                     " retry %" ROCKSDB_PRIszt,
+                     snap_impl->GetSequenceNumber(), max, retry);
+      ReleaseSnapshot(snap_impl);
+      // Wait for last visible seq to catch up with max, and also go beyond it
+      // by one.
+      AdvanceSeqByOne();
+      snap_impl = db_impl_->GetSnapshotImpl(for_ww_conflict_check);
+      assert(snap_impl);
+      retry++;
+    }
+    assert(snap_impl->GetSequenceNumber() > max);
+    if (snap_impl->GetSequenceNumber() <= max) {
+      throw std::runtime_error(
+          "Snapshot seq " + std::to_string(snap_impl->GetSequenceNumber()) +
+          " after " + std::to_string(retry) +
+          " retries is still less than future_max_evicted_seq_ " +
+          std::to_string(max));
+    }
+  }
+  EnhanceSnapshot(snap_impl, min_uncommitted);
+  ROCKS_LOG_DETAILS(
+      db_impl_->immutable_db_options().info_log,
+      "GetSnapshot %" PRIu64 " ww:%" PRIi32 " min_uncommitted: %" PRIu64,
+      snap_impl->GetSequenceNumber(), for_ww_conflict_check, min_uncommitted);
+  TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:end");
+  return snap_impl;
+}
+
+void WritePreparedTxnDB::AdvanceSeqByOne() {
+  // Inserting an empty value will i) let the max evicted entry be published,
+  // i.e., max == last_published, and ii) increase the last published to be
+  // one beyond max, i.e., max < last_published.
+  WriteOptions woptions;
+  TransactionOptions txn_options;
+  Transaction* txn0 = BeginTransaction(woptions, txn_options, nullptr);
+  std::hash<std::thread::id> hasher;
+  char name[64];
+  snprintf(name, 64, "txn%" ROCKSDB_PRIszt,
+           hasher(std::this_thread::get_id()));
+  assert(strlen(name) < 64 - 1);
+  Status s = txn0->SetName(name);
+  assert(s.ok());
+  if (s.ok()) {
+    // Without prepare it would simply skip the commit
+    s = txn0->Prepare();
+  }
+  assert(s.ok());
+  if (s.ok()) {
+    s = txn0->Commit();
+  }
+  assert(s.ok());
+  delete txn0;
+}
+
+const std::vector<SequenceNumber> WritePreparedTxnDB::GetSnapshotListFromDB(
+    SequenceNumber max) {
+  ROCKS_LOG_DETAILS(info_log_, "GetSnapshotListFromDB with max %" PRIu64, max);
+  InstrumentedMutexLock dblock(db_impl_->mutex());
+  db_impl_->mutex()->AssertHeld();
+  return db_impl_->snapshots().GetAll(nullptr, max);
+}
+
+void WritePreparedTxnDB::ReleaseSnapshotInternal(
+    const SequenceNumber snap_seq) {
+  // TODO(myabandeh): relaxed should be enough since the synchronization is
+  // already done by snapshots_mutex_ under which this function is called.
+  if (snap_seq <= max_evicted_seq_.load(std::memory_order_acquire)) {
+    // Then this is a rare case in which a transaction did not finish before
+    // max advances. It is expected for a few read-only backup snapshots. For
+    // such
+    // snapshots we might have kept around a couple of entries in the
+    // old_commit_map_. Check and do garbage collection if that is the case.
+    bool need_gc = false;
+    {
+      WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+      ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64,
+                     snap_seq);
+      ReadLock rl(&old_commit_map_mutex_);
+      auto prep_set_entry = old_commit_map_.find(snap_seq);
+      need_gc = prep_set_entry != old_commit_map_.end();
+    }
+    if (need_gc) {
+      WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+      ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64,
+                     snap_seq);
+      WriteLock wl(&old_commit_map_mutex_);
+      old_commit_map_.erase(snap_seq);
+      old_commit_map_empty_.store(old_commit_map_.empty(),
+                                  std::memory_order_release);
+    }
+  }
+}
+
+void WritePreparedTxnDB::CleanupReleasedSnapshots(
+    const std::vector<SequenceNumber>& new_snapshots,
+    const std::vector<SequenceNumber>& old_snapshots) {
+  auto newi = new_snapshots.begin();
+  auto oldi = old_snapshots.begin();
+  for (; newi != new_snapshots.end() && oldi != old_snapshots.end();) {
+    assert(*newi >= *oldi);  // cannot have new snapshots with lower seq
+    if (*newi == *oldi) {    // still not released
+      auto value = *newi;
+      while (newi != new_snapshots.end() && *newi == value) {
+        newi++;
+      }
+      while (oldi != old_snapshots.end() && *oldi == value) {
+        oldi++;
+      }
+    } else {
+      assert(*newi > *oldi);  // *oldi is released
+      ReleaseSnapshotInternal(*oldi);
+      oldi++;
+    }
+  }
+  // Everything remaining in old_snapshots is released and must be cleaned up
+  for (; oldi != old_snapshots.end(); oldi++) {
+    ReleaseSnapshotInternal(*oldi);
+  }
+}
+
+void WritePreparedTxnDB::UpdateSnapshots(
+    const std::vector<SequenceNumber>& snapshots,
+    const SequenceNumber& version) {
+  ROCKS_LOG_DETAILS(info_log_, "UpdateSnapshots with version %" PRIu64,
+                    version);
+  TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:start");
+  TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:start");
+#ifndef NDEBUG
+  size_t sync_i = 0;
+#endif
+  ROCKS_LOG_DETAILS(info_log_, "snapshots_mutex_ overhead");
+  WriteLock wl(&snapshots_mutex_);
+  snapshots_version_ = version;
+  // We update the list concurrently with the readers.
+  // Both the new and the old lists are sorted and the new list is a subset of
+  // the previous list plus some new items. Thus if a snapshot repeats in both
+  // the new and the old lists, it will appear at the same or an earlier
+  // position in the new list. So if we simply insert the new snapshots in
+  // order, an overwritten item that is still valid in the new list is either
+  // written to the same slot in the array or to an earlier slot before its
+  // old copy gets overwritten by another item. This guarantees that a reader
+  // that reads the list bottom-up will eventually see a snapshot that repeats
+  // in the update, either before it gets overwritten by the writer or
+  // afterwards.
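+  // Editor's example: cache = [10, 20, 30] and the new list is [20, 30, 40].
+  // The writer stores 20 into slot 0, then 30 into slot 1, then 40 into
+  // slot 2. The surviving snapshot 20 reaches its new slot 0 before its old
+  // copy in slot 1 is overwritten, so a reader that scans from the last slot
+  // down to slot 0 (as CheckAgainstSnapshots does) finds 20 in one of the
+  // two slots, whichever side of the concurrent writer it is on.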
+  size_t i = 0;
+  auto it = snapshots.begin();
+  for (; it != snapshots.end() && i < SNAPSHOT_CACHE_SIZE; ++it, ++i) {
+    snapshot_cache_[i].store(*it, std::memory_order_release);
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:", ++sync_i);
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:", sync_i);
+  }
+#ifndef NDEBUG
+  // Release the remaining sync points since they are useless given that the
+  // reader would also use a lock to access snapshots
+  for (++sync_i; sync_i <= 10; ++sync_i) {
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:", sync_i);
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:", sync_i);
+  }
+#endif
+  snapshots_.clear();
+  for (; it != snapshots.end(); ++it) {
+    // Insert them into a vector that is less efficient to access
+    // concurrently
+    snapshots_.push_back(*it);
+  }
+  // Update the size at the end. Otherwise a parallel reader might read
+  // items that are not set yet.
+  snapshots_total_.store(snapshots.size(), std::memory_order_release);
+
+  // Note: this must be done after the snapshots data structures are updated
+  // with the new list of snapshots.
+  CleanupReleasedSnapshots(snapshots, snapshots_all_);
+  snapshots_all_ = snapshots;
+
+  TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:end");
+  TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:end");
+}
+
+void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) {
+  TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:start");
+  TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:start");
+#ifndef NDEBUG
+  size_t sync_i = 0;
+#endif
+  // First check the snapshot cache that is efficient for concurrent access
+  auto cnt = snapshots_total_.load(std::memory_order_acquire);
+  // The list might get updated concurrently as we are reading from it. The
+  // reader should be able to read all the snapshots that are still valid
+  // after the update. Since the surviving snapshots are written to their new
+  // slots before their old copies get overwritten, a reader that reads
+  // bottom-up will eventually see them.
+  const bool next_is_larger = true;
+  // We will set this to true if the border line snapshot suggests it.
+  bool search_larger_list = false;
+  size_t ip1 = std::min(cnt, SNAPSHOT_CACHE_SIZE);
+  for (; 0 < ip1; ip1--) {
+    SequenceNumber snapshot_seq =
+        snapshot_cache_[ip1 - 1].load(std::memory_order_acquire);
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:",
+                        ++sync_i);
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i);
+    if (ip1 == SNAPSHOT_CACHE_SIZE) {  // border line snapshot
+      // snapshot_seq < commit_seq => larger_snapshot_seq <= commit_seq
+      // then later also continue the search to larger snapshots
+      search_larger_list = snapshot_seq < evicted.commit_seq;
+    }
+    if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq,
+                                 snapshot_seq, !next_is_larger)) {
+      break;
+    }
+  }
+#ifndef NDEBUG
+  // Release the remaining sync points before acquiring the lock
+  for (++sync_i; sync_i <= 10; ++sync_i) {
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:", sync_i);
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i);
+  }
+#endif
+  TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:end");
+  TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:end");
+  if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && search_larger_list)) {
+    // Then access the less efficient list of snapshots_
+    WPRecordTick(TXN_SNAPSHOT_MUTEX_OVERHEAD);
+    ROCKS_LOG_WARN(info_log_,
+                   "snapshots_mutex_ overhead for <%" PRIu64 ",%" PRIu64
+                   "> with %" ROCKSDB_PRIszt " snapshots",
+                   evicted.prep_seq, evicted.commit_seq, cnt);
+    ReadLock rl(&snapshots_mutex_);
+    // Items could have moved from the snapshots_ to snapshot_cache_ before
+    // acquiring the lock. To make sure that we do not miss a valid snapshot,
+    // read snapshot_cache_ again while holding the lock.
+    for (size_t i = 0; i < SNAPSHOT_CACHE_SIZE; i++) {
+      SequenceNumber snapshot_seq =
+          snapshot_cache_[i].load(std::memory_order_acquire);
+      if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq,
+                                   snapshot_seq, next_is_larger)) {
+        break;
+      }
+    }
+    for (auto snapshot_seq_2 : snapshots_) {
+      if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq,
+                                   snapshot_seq_2, next_is_larger)) {
+        break;
+      }
+    }
+  }
+}
+
+bool WritePreparedTxnDB::MaybeUpdateOldCommitMap(
+    const uint64_t& prep_seq, const uint64_t& commit_seq,
+    const uint64_t& snapshot_seq, const bool next_is_larger = true) {
+  // If we do not store an entry in old_commit_map_ we assume it is committed
+  // in all snapshots. If commit_seq <= snapshot_seq, it is considered already
+  // in the snapshot so we need not keep the entry around for this snapshot.
+  if (commit_seq <= snapshot_seq) {
+    // continue the search if the next snapshot could be smaller than
+    // commit_seq
+    return !next_is_larger;
+  }
+  // then snapshot_seq < commit_seq
+  if (prep_seq <= snapshot_seq) {  // overlapping range
+    WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+    ROCKS_LOG_WARN(info_log_,
+                   "old_commit_map_mutex_ overhead for %" PRIu64
+                   " commit entry: <%" PRIu64 ",%" PRIu64 ">",
+                   snapshot_seq, prep_seq, commit_seq);
+    WriteLock wl(&old_commit_map_mutex_);
+    old_commit_map_empty_.store(false, std::memory_order_release);
+    auto& vec = old_commit_map_[snapshot_seq];
+    vec.insert(std::upper_bound(vec.begin(), vec.end(), prep_seq), prep_seq);
+    // We need to store it once for each overlapping snapshot. Return true to
+    // continue the search if there are more overlapping snapshots.
+    return true;
+  }
+  // continue the search if the next snapshot could be larger than prep_seq
+  return next_is_larger;
+}
+
+WritePreparedTxnDB::~WritePreparedTxnDB() {
+  // At this point there could be running compaction/flush holding a
+  // SnapshotChecker, which holds a pointer back to WritePreparedTxnDB.
+  // Make sure those jobs finished before destructing WritePreparedTxnDB.
+  if (!db_impl_->shutting_down_) {
+    db_impl_->CancelAllBackgroundWork(true /*wait*/);
+  }
+}
+
+void SubBatchCounter::InitWithComp(const uint32_t cf) {
+  auto cmp = comparators_[cf];
+  keys_[cf] = CFKeys(SetComparator(cmp));
+}
+
+void SubBatchCounter::AddKey(const uint32_t cf, const Slice& key) {
+  CFKeys& cf_keys = keys_[cf];
+  if (cf_keys.size() == 0) {  // just inserted
+    InitWithComp(cf);
+  }
+  auto it = cf_keys.insert(key);
+  if (it.second == false) {  // second is false if an element already existed.
+    batches_++;
+    keys_.clear();
+    InitWithComp(cf);
+    keys_[cf].insert(key);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn_db.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn_db.h
new file mode 100644
index 0000000000..f5b6411605
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_prepared_txn_db.h
@@ -0,0 +1,1123 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "db/db_iter.h"
+#include "db/pre_release_callback.h"
+#include "db/read_callback.h"
+#include "db/snapshot_checker.h"
+#include "logging/logging.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+#include "util/set_comparator.h"
+#include "util/string_util.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/write_prepared_txn.h"
+
+namespace ROCKSDB_NAMESPACE {
+enum SnapshotBackup : bool { kUnbackedByDBSnapshot, kBackedByDBSnapshot };
+
+// A PessimisticTransactionDB that writes data to the DB after the prepare
+// phase of 2PC. In this way some data in the DB might not be committed. The
+// DB provides mechanisms to tell such data apart from committed data.
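+//
+// Usage sketch (editor's illustration via the public TransactionDB API;
+// error handling elided, path hypothetical):
+//
+//   Options options;
+//   options.create_if_missing = true;
+//   TransactionDBOptions txn_db_options;
+//   txn_db_options.write_policy = TxnDBWritePolicy::WRITE_PREPARED;
+//   TransactionDB* db = nullptr;
+//   Status s = TransactionDB::Open(options, txn_db_options, "/tmp/db", &db);
+//   Transaction* txn = db->BeginTransaction(WriteOptions());
+//   s = txn->SetName("txn1");  // required before Prepare()
+//   s = txn->Put("key", "value");
+//   s = txn->Prepare();        // data reaches the DB here, still uncommitted
+//   s = txn->Commit();         // writes only a commit marker to the WAL
+//   delete txn;
+//   delete db;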
+class WritePreparedTxnDB : public PessimisticTransactionDB {
+ public:
+  explicit WritePreparedTxnDB(DB* db,
+                              const TransactionDBOptions& txn_db_options)
+      : PessimisticTransactionDB(db, txn_db_options),
+        SNAPSHOT_CACHE_BITS(txn_db_options.wp_snapshot_cache_bits),
+        SNAPSHOT_CACHE_SIZE(static_cast<size_t>(1ull << SNAPSHOT_CACHE_BITS)),
+        COMMIT_CACHE_BITS(txn_db_options.wp_commit_cache_bits),
+        COMMIT_CACHE_SIZE(static_cast<size_t>(1ull << COMMIT_CACHE_BITS)),
+        FORMAT(COMMIT_CACHE_BITS) {
+    Init(txn_db_options);
+  }
+
+  explicit WritePreparedTxnDB(StackableDB* db,
+                              const TransactionDBOptions& txn_db_options)
+      : PessimisticTransactionDB(db, txn_db_options),
+        SNAPSHOT_CACHE_BITS(txn_db_options.wp_snapshot_cache_bits),
+        SNAPSHOT_CACHE_SIZE(static_cast<size_t>(1ull << SNAPSHOT_CACHE_BITS)),
+        COMMIT_CACHE_BITS(txn_db_options.wp_commit_cache_bits),
+        COMMIT_CACHE_SIZE(static_cast<size_t>(1ull << COMMIT_CACHE_BITS)),
+        FORMAT(COMMIT_CACHE_BITS) {
+    Init(txn_db_options);
+  }
+
+  virtual ~WritePreparedTxnDB();
+
+  virtual Status Initialize(
+      const std::vector<size_t>& compaction_enabled_cf_indices,
+      const std::vector<ColumnFamilyHandle*>& handles) override;
+
+  Transaction* BeginTransaction(const WriteOptions& write_options,
+                                const TransactionOptions& txn_options,
+                                Transaction* old_txn) override;
+
+  using TransactionDB::Write;
+  Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+  // Optimized version of ::Write that receives more optimization requests,
+  // such as skip_concurrency_control.
+  using PessimisticTransactionDB::Write;
+  Status Write(const WriteOptions& opts, const TransactionDBWriteOptimizations&,
+               WriteBatch* updates) override;
+
+  // Write the batch to the underlying DB and mark it as committed. Can be
+  // used both directly from the TxnDB or through a transaction.
+  Status WriteInternal(const WriteOptions& write_options, WriteBatch* batch,
+                       size_t batch_cnt, WritePreparedTxn* txn);
+
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) override;
+
+  using DB::NewIterators;
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) override;
+
+  // Check whether the transaction that wrote the value with sequence number
+  // seq is visible to the snapshot with sequence number snapshot_seq.
+  // Returns true if commit_seq <= snapshot_seq.
+  // If the snapshot_seq is already released and snapshot_seq <= max, sets
+  // *snap_released to true and returns true as well.
+  inline bool IsInSnapshot(uint64_t prep_seq, uint64_t snapshot_seq,
+                           uint64_t min_uncommitted = kMinUnCommittedSeq,
+                           bool* snap_released = nullptr) const {
+    ROCKS_LOG_DETAILS(info_log_,
+                      "IsInSnapshot %" PRIu64 " in %" PRIu64
+                      " min_uncommitted %" PRIu64,
+                      prep_seq, snapshot_seq, min_uncommitted);
+    assert(min_uncommitted >= kMinUnCommittedSeq);
+    // The caller is responsible for initializing snap_released.
+    assert(snap_released == nullptr || *snap_released == false);
+    // Here we try to infer the return value without looking into the prepare
+    // list. This helps avoid synchronization over a shared map.
+    // TODO(myabandeh): optimize this.
+    // This sequence of checks must be correct but not necessarily efficient.
+    if (prep_seq == 0) {
+      // Compaction will output keys to bottom-level with sequence number 0 if
+      // it is visible to the earliest snapshot.
+      ROCKS_LOG_DETAILS(
+          info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+          prep_seq, snapshot_seq, 1);
+      return true;
+    }
+    if (snapshot_seq < prep_seq) {
+      // snapshot_seq < prep_seq <= commit_seq => snapshot_seq < commit_seq
+      ROCKS_LOG_DETAILS(
+          info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+          prep_seq, snapshot_seq, 0);
+      return false;
+    }
+    if (prep_seq < min_uncommitted) {
+      ROCKS_LOG_DETAILS(info_log_,
+                        "IsInSnapshot %" PRIu64 " in %" PRIu64
+                        " returns %" PRId32
+                        " because of min_uncommitted %" PRIu64,
+                        prep_seq, snapshot_seq, 1, min_uncommitted);
+      return true;
+    }
+    // Commit of a delayed prepared txn has two non-atomic steps: adding it to
+    // the commit cache, and removing it from delayed_prepared_. Our reads of
+    // these two are also non-atomic, so by looking into the commit cache
+    // first we might find the prep_seq neither in the commit cache nor in
+    // delayed_prepared_. To fix that, i) we check whether there was any
+    // delayed prepared BEFORE looking into the commit cache, and ii) if there
+    // was, we complete the search steps as: commit cache, delayed prepared,
+    // commit cache again. In this way, if the first query to the commit cache
+    // missed the commit, the 2nd one will catch it.
+    bool was_empty;
+    SequenceNumber max_evicted_seq_lb, max_evicted_seq_ub;
+    CommitEntry64b dont_care;
+    auto indexed_seq = prep_seq % COMMIT_CACHE_SIZE;
+    size_t repeats = 0;
+    do {
+      repeats++;
+      assert(repeats < 100);
+      if (UNLIKELY(repeats >= 100)) {
+        throw std::runtime_error(
+            "The read was interrupted 100 times by updates to "
+            "max_evicted_seq_. This is unexpected in all setups");
+      }
+      max_evicted_seq_lb = max_evicted_seq_.load(std::memory_order_acquire);
+      TEST_SYNC_POINT(
+          "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:pause");
+      TEST_SYNC_POINT(
+          "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:resume");
+      was_empty = delayed_prepared_empty_.load(std::memory_order_acquire);
+      TEST_SYNC_POINT(
+          "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:pause");
+      TEST_SYNC_POINT(
+          "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:resume");
+      CommitEntry cached;
+      bool exist = GetCommitEntry(indexed_seq, &dont_care, &cached);
+      TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:pause");
+      TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:resume");
+      if (exist && prep_seq == cached.prep_seq) {
+        // It is committed and also not evicted from commit cache
+        ROCKS_LOG_DETAILS(
+            info_log_,
+            "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+            prep_seq, snapshot_seq, cached.commit_seq <= snapshot_seq);
+        return cached.commit_seq <= snapshot_seq;
+      }
+      // else it could be committed but not inserted into the map, which could
+      // happen after recovery, or it could be committed and evicted by another
+      // commit, or never committed.
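+      // Illustrative interleaving (editor's sketch, hypothetical seqs) of the
+      // race that the delayed_prepared_ double-check below closes. Committer
+      // of a delayed prepared txn with prep_seq = 10, commit_seq = 15:
+      //
+      //   committer: adds (10 -> 15) to the commit cache ........ (a)
+      //   committer: erases 10 from delayed_prepared_ ........... (b)
+      //   reader:    first commit-cache query misses (ran before (a))
+      //   reader:    delayed_prepared_ lookup misses (ran after (b))
+      //   reader:    second commit-cache query sees (10 -> 15)
+      //
+      // Without the second commit-cache lookup the reader would wrongly
+      // conclude that txn 10 was never committed.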
+ + // At this point we don't know if it was committed or it is still prepared + max_evicted_seq_ub = max_evicted_seq_.load(std::memory_order_acquire); + if (UNLIKELY(max_evicted_seq_lb != max_evicted_seq_ub)) { + continue; + } + // Note: max_evicted_seq_ when we did GetCommitEntry <= max_evicted_seq_ub + if (max_evicted_seq_ub < prep_seq) { + // Not evicted from cache and also not present, so must be still + // prepared + ROCKS_LOG_DETAILS(info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 + " returns %" PRId32, + prep_seq, snapshot_seq, 0); + return false; + } + TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:pause"); + TEST_SYNC_POINT( + "WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:resume"); + if (!was_empty) { + // We should not normally reach here + WPRecordTick(TXN_PREPARE_MUTEX_OVERHEAD); + ReadLock rl(&prepared_mutex_); + ROCKS_LOG_WARN( + info_log_, "prepared_mutex_ overhead %" PRIu64 " for %" PRIu64, + static_cast(delayed_prepared_.size()), prep_seq); + if (delayed_prepared_.find(prep_seq) != delayed_prepared_.end()) { + // This is the order: 1) delayed_prepared_commits_ update, 2) publish + // 3) delayed_prepared_ clean up. So check if it is the case of a late + // clenaup. + auto it = delayed_prepared_commits_.find(prep_seq); + if (it == delayed_prepared_commits_.end()) { + // Then it is not committed yet + ROCKS_LOG_DETAILS(info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 + " returns %" PRId32, + prep_seq, snapshot_seq, 0); + return false; + } else { + ROCKS_LOG_DETAILS(info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 + " commit: %" PRIu64 " returns %" PRId32, + prep_seq, snapshot_seq, it->second, + snapshot_seq <= it->second); + return it->second <= snapshot_seq; + } + } else { + // 2nd query to commit cache. Refer to was_empty comment above. + exist = GetCommitEntry(indexed_seq, &dont_care, &cached); + if (exist && prep_seq == cached.prep_seq) { + ROCKS_LOG_DETAILS( + info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32, + prep_seq, snapshot_seq, cached.commit_seq <= snapshot_seq); + return cached.commit_seq <= snapshot_seq; + } + max_evicted_seq_ub = max_evicted_seq_.load(std::memory_order_acquire); + } + } + } while (UNLIKELY(max_evicted_seq_lb != max_evicted_seq_ub)); + // When advancing max_evicted_seq_, we move older entires from prepared to + // delayed_prepared_. Also we move evicted entries from commit cache to + // old_commit_map_ if it overlaps with any snapshot. Since prep_seq <= + // max_evicted_seq_, we have three cases: i) in delayed_prepared_, ii) in + // old_commit_map_, iii) committed with no conflict with any snapshot. Case + // (i) delayed_prepared_ is checked above + if (max_evicted_seq_ub < snapshot_seq) { // then (ii) cannot be the case + // only (iii) is the case: committed + // commit_seq <= max_evicted_seq_ < snapshot_seq => commit_seq < + // snapshot_seq + ROCKS_LOG_DETAILS( + info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32, + prep_seq, snapshot_seq, 1); + return true; + } + // else (ii) might be the case: check the commit data saved for this + // snapshot. If there was no overlapping commit entry, then it is committed + // with a commit_seq lower than any live snapshot, including snapshot_seq. + if (old_commit_map_empty_.load(std::memory_order_acquire)) { + ROCKS_LOG_DETAILS(info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 + " returns %" PRId32 " released=1", + prep_seq, snapshot_seq, 0); + assert(snap_released); + // This snapshot is not valid anymore. 
+    // committed before or after the snapshot. Return true but also set
+    // snap_released to true.
+    *snap_released = true;
+    return true;
+  }
+  {
+    // We should not normally reach here unless snapshot_seq is old. This is a
+    // rare case and it is ok to pay the cost of a mutex ReadLock for such
+    // old, reading transactions.
+    WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+    ReadLock rl(&old_commit_map_mutex_);
+    auto prep_set_entry = old_commit_map_.find(snapshot_seq);
+    bool found = prep_set_entry != old_commit_map_.end();
+    if (found) {
+      auto& vec = prep_set_entry->second;
+      found = std::binary_search(vec.begin(), vec.end(), prep_seq);
+    } else {
+      // coming from compaction
+      ROCKS_LOG_DETAILS(info_log_,
+                        "IsInSnapshot %" PRIu64 " in %" PRIu64
+                        " returns %" PRId32 " released=1",
+                        prep_seq, snapshot_seq, 0);
+      // This snapshot is not valid anymore. We cannot tell if prep_seq is
+      // committed before or after the snapshot. Return true but also set
+      // snap_released to true.
+      assert(snap_released);
+      *snap_released = true;
+      return true;
+    }
+
+    if (!found) {
+      ROCKS_LOG_DETAILS(info_log_,
+                        "IsInSnapshot %" PRIu64 " in %" PRIu64
+                        " returns %" PRId32,
+                        prep_seq, snapshot_seq, 1);
+      return true;
+    }
+  }
+  // (ii) is the case: it is committed, but after snapshot_seq
+  ROCKS_LOG_DETAILS(
+      info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+      prep_seq, snapshot_seq, 0);
+  return false;
+  }
+
+  // Add the transaction with prepare sequence seq to the prepared list.
+  // Note: must be called serially with increasing seq on each call.
+  // locked is true if prepared_mutex_ is already locked.
+  void AddPrepared(uint64_t seq, bool locked = false);
+  // Check if any of the prepared txns are less than the new max_evicted_seq_.
+  // Must be called with prepared_mutex_ write locked.
+  void CheckPreparedAgainstMax(SequenceNumber new_max, bool locked);
+  // Remove the transaction with prepare sequence seq from the prepared list
+  void RemovePrepared(const uint64_t seq, const size_t batch_cnt = 1);
+  // Add the transaction with prepare sequence prepare_seq and commit sequence
+  // commit_seq to the commit map. loop_cnt is to detect infinite loops.
+  // Note: must be called serially.
+  void AddCommitted(uint64_t prepare_seq, uint64_t commit_seq,
+                    uint8_t loop_cnt = 0);
+
+  struct CommitEntry {
+    uint64_t prep_seq;
+    uint64_t commit_seq;
+    CommitEntry() : prep_seq(0), commit_seq(0) {}
+    CommitEntry(uint64_t ps, uint64_t cs) : prep_seq(ps), commit_seq(cs) {}
+    bool operator==(const CommitEntry& rhs) const {
+      return prep_seq == rhs.prep_seq && commit_seq == rhs.commit_seq;
+    }
+  };
+
+  struct CommitEntry64bFormat {
+    explicit CommitEntry64bFormat(size_t index_bits)
+        : INDEX_BITS(index_bits),
+          PREP_BITS(static_cast<size_t>(64 - PAD_BITS - INDEX_BITS)),
+          COMMIT_BITS(static_cast<size_t>(64 - PREP_BITS)),
+          COMMIT_FILTER(static_cast<uint64_t>((1ull << COMMIT_BITS) - 1)),
+          DELTA_UPPERBOUND(static_cast<uint64_t>((1ull << COMMIT_BITS))) {}
+    // Number of higher bits of a sequence number that are not used. They are
+    // used to encode the value type, ...
+    const size_t PAD_BITS = static_cast<size_t>(8);
+    // Number of lower bits from the prepare seq that can be skipped, as they
+    // are implied by the index of the entry in the array
+    const size_t INDEX_BITS;
+    // Number of bits we use to encode the prepare seq
+    const size_t PREP_BITS;
+    // Number of bits we use to encode the commit seq.
+    const size_t COMMIT_BITS;
+    // Filter to encode/decode the commit seq
+    const uint64_t COMMIT_FILTER;
+    // The value of commit_seq - prepare_seq + 1 must be less than this bound
+    const uint64_t DELTA_UPPERBOUND;
+  };
+
+  // Prepare Seq (64 bits) = PAD .. PAD PREP PREP .. PREP INDEX INDEX .. INDEX
+  // Delta Seq   (64 bits) = 0 0 .. 0 0 0 0 0 0 0 0 DELTA DELTA .. DELTA DELTA
+  // Encoded Value         = PREP PREP .. PREP PREP DELTA DELTA .. DELTA DELTA
+  // PAD: the first bits of a seq that are reserved for tagging and hence
+  // ignored
+  // PREP/INDEX: the used bits in a prepare seq number
+  // INDEX: the bits that do not have to be encoded (will be provided
+  // externally by the entry's index in the commit cache)
+  // DELTA: commit seq - prep seq + 1; the number of DELTA bits should be
+  // equal to the number of INDEX bits + PAD bits
+  struct CommitEntry64b {
+    constexpr CommitEntry64b() noexcept : rep_(0) {}
+
+    CommitEntry64b(const CommitEntry& entry, const CommitEntry64bFormat& format)
+        : CommitEntry64b(entry.prep_seq, entry.commit_seq, format) {}
+
+    CommitEntry64b(const uint64_t ps, const uint64_t cs,
+                   const CommitEntry64bFormat& format) {
+      assert(ps < static_cast<uint64_t>(
+                      (1ull << (format.PREP_BITS + format.INDEX_BITS))));
+      assert(ps <= cs);
+      uint64_t delta = cs - ps + 1;  // make initialized delta always >= 1
+      // zero is reserved for uninitialized entries
+      assert(0 < delta);
+      assert(delta < format.DELTA_UPPERBOUND);
+      if (delta >= format.DELTA_UPPERBOUND) {
+        throw std::runtime_error(
+            "commit_seq >> prepare_seq. The allowed distance is " +
+            std::to_string(format.DELTA_UPPERBOUND) + " commit_seq is " +
+            std::to_string(cs) + " prepare_seq is " + std::to_string(ps));
+      }
+      rep_ = (ps << format.PAD_BITS) & ~format.COMMIT_FILTER;
+      rep_ = rep_ | delta;
+    }
+
+    // Return false if the entry is empty
+    bool Parse(const uint64_t indexed_seq, CommitEntry* entry,
+               const CommitEntry64bFormat& format) {
+      uint64_t delta = rep_ & format.COMMIT_FILTER;
+      // zero is reserved for uninitialized entries
+      assert(delta < static_cast<uint64_t>((1ull << format.COMMIT_BITS)));
+      if (delta == 0) {
+        return false;  // an initialized entry would have a non-zero delta
+      }
+
+      assert(indexed_seq < static_cast<uint64_t>((1ull << format.INDEX_BITS)));
+      uint64_t prep_up = rep_ & ~format.COMMIT_FILTER;
+      prep_up >>= format.PAD_BITS;
+      const uint64_t& prep_low = indexed_seq;
+      entry->prep_seq = prep_up | prep_low;
+
+      entry->commit_seq = entry->prep_seq + delta - 1;
+      return true;
+    }
+
+   private:
+    uint64_t rep_;
+  };
+
+  // Struct to hold ownership of snapshot and read callback for cleanup.
+  struct IteratorState;
+
+  std::shared_ptr<std::map<uint32_t, const Comparator*>> GetCFComparatorMap() {
+    return cf_map_;
+  }
+  std::shared_ptr<std::map<uint32_t, ColumnFamilyHandle*>> GetCFHandleMap() {
+    return handle_map_;
+  }
+  void UpdateCFComparatorMap(
+      const std::vector<ColumnFamilyHandle*>& handles) override;
+  void UpdateCFComparatorMap(ColumnFamilyHandle* handle) override;
+
+  virtual const Snapshot* GetSnapshot() override;
+  SnapshotImpl* GetSnapshotInternal(bool for_ww_conflict_check);
+
+ protected:
+  virtual Status VerifyCFOptions(
+      const ColumnFamilyOptions& cf_options) override;
+  // Assign the min and max sequence numbers for reading from the db. A seq >
+  // max is not valid, a seq < min is valid, and a min <= seq < max requires
+  // further checking. Normally max is defined by the snapshot and min is by
+  // the minimum uncommitted seq.
+  inline SnapshotBackup AssignMinMaxSeqs(const Snapshot* snapshot,
+                                         SequenceNumber* min,
+                                         SequenceNumber* max);
+  // Validate whether a snapshot sequence number is still valid based on the
+  // latest db status.
+  // backed_by_snapshot specifies whether the number is backed by an actual
+  // snapshot object. order specifies the memory order with which we load the
+  // atomic variables: relaxed is enough for the default since we care about
+  // the last value seen by the same thread.
+  inline bool ValidateSnapshot(
+      const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot,
+      std::memory_order order = std::memory_order_relaxed);
+  // Get a dummy snapshot that refers to kMaxSequenceNumber
+  Snapshot* GetMaxSnapshot() { return &dummy_max_snapshot_; }
+
+  bool ShouldRollbackWithSingleDelete(ColumnFamilyHandle* column_family,
+                                      const Slice& key) {
+    return rollback_deletion_type_callback_
+               ? rollback_deletion_type_callback_(this, column_family, key)
+               : false;
+  }
+
+  std::function<bool(TransactionDB*, ColumnFamilyHandle*, const Slice&)>
+      rollback_deletion_type_callback_;
+
+ private:
+  friend class AddPreparedCallback;
+  friend class PreparedHeap_BasicsTest_Test;
+  friend class PreparedHeap_Concurrent_Test;
+  friend class PreparedHeap_EmptyAtTheEnd_Test;
+  friend class SnapshotConcurrentAccessTest_SnapshotConcurrentAccess_Test;
+  friend class WritePreparedCommitEntryPreReleaseCallback;
+  friend class WritePreparedTransactionTestBase;
+  friend class WritePreparedTxn;
+  friend class WritePreparedTxnDBMock;
+  friend class WritePreparedTransactionTest_AddPreparedBeforeMax_Test;
+  friend class WritePreparedTransactionTest_AdvanceMaxEvictedSeqBasic_Test;
+  friend class
+      WritePreparedTransactionTest_AdvanceMaxEvictedSeqWithDuplicates_Test;
+  friend class WritePreparedTransactionTest_AdvanceSeqByOne_Test;
+  friend class WritePreparedTransactionTest_BasicRecovery_Test;
+  friend class WritePreparedTransactionTest_CheckAgainstSnapshots_Test;
+  friend class WritePreparedTransactionTest_CleanupSnapshotEqualToMax_Test;
+  friend class WritePreparedTransactionTest_ConflictDetectionAfterRecovery_Test;
+  friend class WritePreparedTransactionTest_CommitMap_Test;
+  friend class WritePreparedTransactionTest_DoubleSnapshot_Test;
+  friend class WritePreparedTransactionTest_IsInSnapshotEmptyMap_Test;
+  friend class WritePreparedTransactionTest_IsInSnapshotReleased_Test;
+  friend class WritePreparedTransactionTest_IsInSnapshot_Test;
+  friend class WritePreparedTransactionTest_NewSnapshotLargerThanMax_Test;
+  friend class WritePreparedTransactionTest_MaxCatchupWithNewSnapshot_Test;
+  friend class WritePreparedTransactionTest_MaxCatchupWithUnbackedSnapshot_Test;
+  friend class
+      WritePreparedTransactionTest_NonAtomicCommitOfDelayedPrepared_Test;
+  friend class
+      WritePreparedTransactionTest_NonAtomicUpdateOfDelayedPrepared_Test;
+  friend class WritePreparedTransactionTest_NonAtomicUpdateOfMaxEvictedSeq_Test;
+  friend class WritePreparedTransactionTest_OldCommitMapGC_Test;
+  friend class WritePreparedTransactionTest_Rollback_Test;
+  friend class WritePreparedTransactionTest_SmallestUnCommittedSeq_Test;
+  friend class WriteUnpreparedTxn;
+  friend class WriteUnpreparedTxnDB;
+  friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+  friend class MultiOpsTxnsStressTest;
+
+  void Init(const TransactionDBOptions& txn_db_opts);
+
+  void WPRecordTick(uint32_t ticker_type) const {
+    RecordTick(db_impl_->immutable_db_options_.statistics.get(), ticker_type);
+  }
+
+  // A heap with amortized O(1) complexity for erase. It uses one extra heap
+  // to keep track of erased entries that are not yet on top of the main heap.
+  class PreparedHeap {
+    // The mutex is required for push and pop from PreparedHeap. ::erase will
+    // use external synchronization via prepared_mutex_.
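+    //
+    // Added sketch of the erase trick (illustrative, not upstream text):
+    // with heap_ = {3, 5, 9} in ascending order, erase(5) only records 5 in
+    // erased_heap_ since 5 is not on top. A later pop() removes 3, sees
+    // heap_.front() == 5 == erased_heap_.top(), and discards both, so top()
+    // then reports 9 without ever scanning the middle of the heap.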
+    port::Mutex push_pop_mutex_;
+    std::deque<uint64_t> heap_;
+    std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
+        erased_heap_;
+    std::atomic<uint64_t> heap_top_ = {kMaxSequenceNumber};
+    // True when testing crash recovery
+    bool TEST_CRASH_ = false;
+    friend class WritePreparedTxnDB;
+
+   public:
+    ~PreparedHeap() {
+      if (!TEST_CRASH_) {
+        assert(heap_.empty());
+        assert(erased_heap_.empty());
+      }
+    }
+    port::Mutex* push_pop_mutex() { return &push_pop_mutex_; }
+
+    inline bool empty() { return top() == kMaxSequenceNumber; }
+    // Returns kMaxSequenceNumber if empty() and the smallest otherwise.
+    inline uint64_t top() { return heap_top_.load(std::memory_order_acquire); }
+    inline void push(uint64_t v) {
+      push_pop_mutex_.AssertHeld();
+      if (heap_.empty()) {
+        heap_top_.store(v, std::memory_order_release);
+      } else {
+        assert(heap_top_.load() < v);
+      }
+      heap_.push_back(v);
+    }
+    void pop(bool locked = false) {
+      if (!locked) {
+        push_pop_mutex()->Lock();
+      }
+      push_pop_mutex_.AssertHeld();
+      heap_.pop_front();
+      while (!heap_.empty() && !erased_heap_.empty() &&
+             // heap_.top() > erased_heap_.top() could happen if we have erased
+             // a non-existent entry. Ideally the user should not do that but we
+             // should be resilient against it.
+             heap_.front() >= erased_heap_.top()) {
+        if (heap_.front() == erased_heap_.top()) {
+          heap_.pop_front();
+        }
+        uint64_t erased __attribute__((__unused__));
+        erased = erased_heap_.top();
+        erased_heap_.pop();
+        // No duplicate prepare sequence numbers
+        assert(erased_heap_.empty() || erased_heap_.top() != erased);
+      }
+      while (heap_.empty() && !erased_heap_.empty()) {
+        erased_heap_.pop();
+      }
+      heap_top_.store(!heap_.empty() ? heap_.front() : kMaxSequenceNumber,
+                      std::memory_order_release);
+      if (!locked) {
+        push_pop_mutex()->Unlock();
+      }
+    }
+    // Concurrent calls need external synchronization. It is safe to call it
+    // concurrently with push and pop though.
+    void erase(uint64_t seq) {
+      if (!empty()) {
+        auto top_seq = top();
+        if (seq < top_seq) {
+          // Already popped, ignore it.
+        } else if (top_seq == seq) {
+          pop();
+#ifndef NDEBUG
+          MutexLock ml(push_pop_mutex());
+          assert(heap_.empty() || heap_.front() != seq);
+#endif
+        } else {  // top() > seq
+          // Down the heap, remember to pop it later
+          erased_heap_.push(seq);
+        }
+      }
+    }
+  };
+
+  void TEST_Crash() override { prepared_txns_.TEST_CRASH_ = true; }
+
+  // Get the commit entry with index indexed_seq from the commit table. It
+  // returns true if such an entry exists.
+  bool GetCommitEntry(const uint64_t indexed_seq, CommitEntry64b* entry_64b,
+                      CommitEntry* entry) const;
+
+  // Rewrite the entry with the index indexed_seq in the commit table with the
+  // commit entry <prepare_seq, commit_seq>. If the rewrite results in an
+  // eviction, sets the evicted_entry and returns true.
+  bool AddCommitEntry(const uint64_t indexed_seq, const CommitEntry& new_entry,
+                      CommitEntry* evicted_entry);
+
+  // Rewrite the entry with the index indexed_seq in the commit table with the
+  // commit entry new_entry only if the existing entry matches the
+  // expected_entry. Returns false otherwise.
+  bool ExchangeCommitEntry(const uint64_t indexed_seq,
+                           CommitEntry64b& expected_entry,
+                           const CommitEntry& new_entry);
+
+  // Increase max_evicted_seq_ from the previous value prev_max to the new
+  // value. This also involves taking care of prepared txns that are not
+  // committed before new_max, as well as updating the list of live snapshots
+  // at the time of updating the max. Thread-safety: this function can be
+  // called concurrently.
+  // Concurrent invocations of this function are equivalent to a serial
+  // invocation in which the last invocation is the one with the largest
+  // new_max value.
+  void AdvanceMaxEvictedSeq(const SequenceNumber& prev_max,
+                            const SequenceNumber& new_max);
+
+  inline SequenceNumber SmallestUnCommittedSeq() {
+    // Note: We have two lists to look into, but for performance reasons they
+    // are not read atomically. Since CheckPreparedAgainstMax copies the entry
+    // to delayed_prepared_ before removing it from prepared_txns_, to ensure
+    // that no prepared entry is missed, we look into them in opposite
+    // order: first read prepared_txns_ and then delayed_prepared_.
+
+    // This must be called before calling ::top. This is because the
+    // concurrent thread would call ::RemovePrepared before updating
+    // GetLatestSequenceNumber(). Reading them in opposite order here
+    // guarantees that the ::top we read would be lower than the ::top if we
+    // had otherwise updated/read them atomically.
+    auto next_prepare = db_impl_->GetLatestSequenceNumber() + 1;
+    auto min_prepare = prepared_txns_.top();
+    // Since we always update the prepare_heap from the main write queue via
+    // PreReleaseCallback, prepared_txns_.top() indicates the smallest
+    // prepared data in 2pc transactions. For non-2pc transactions that are
+    // written in two steps, we also update prepared_txns_ at the first step
+    // (via the same mechanism) so that their uncommitted data is reflected in
+    // SmallestUnCommittedSeq.
+    if (!delayed_prepared_empty_.load()) {
+      ReadLock rl(&prepared_mutex_);
+      if (!delayed_prepared_.empty()) {
+        return *delayed_prepared_.begin();
+      }
+    }
+    bool empty = min_prepare == kMaxSequenceNumber;
+    if (empty) {
+      // Since GetLatestSequenceNumber is updated
+      // after prepared_txns_ are, the value of GetLatestSequenceNumber would
+      // reflect any uncommitted data that is not added to prepared_txns_ yet.
+      // Otherwise, if there is no concurrent txn, this value simply reflects
+      // the latest value in the memtable.
+      return next_prepare;
+    } else {
+      return std::min(min_prepare, next_prepare);
+    }
+  }
+
+  // Enhance the snapshot object by recording in it the smallest uncommitted
+  // seq
+  inline void EnhanceSnapshot(SnapshotImpl* snapshot,
+                              SequenceNumber min_uncommitted) {
+    assert(snapshot);
+    assert(min_uncommitted <= snapshot->number_ + 1);
+    snapshot->min_uncommitted_ = min_uncommitted;
+  }
+
+  virtual const std::vector<SequenceNumber> GetSnapshotListFromDB(
+      SequenceNumber max);
+
+  // Will be called by the public ReleaseSnapshot method. Does the maintenance
+  // internal to WritePreparedTxnDB
+  void ReleaseSnapshotInternal(const SequenceNumber snap_seq);
+
+  // Update the list of snapshots corresponding to the soon-to-be-updated
+  // max_evicted_seq_. Thread-safety: this function can be called concurrently.
+  // Concurrent invocations are equivalent to a serial invocation in which the
+  // last invocation is the one with the largest version value.
+  void UpdateSnapshots(const std::vector<SequenceNumber>& snapshots,
+                       const SequenceNumber& version);
+  // Check the new list of snapshots against the old one to see if any of
+  // the snapshots are released, and do the cleanup for the released snapshot.
+  void CleanupReleasedSnapshots(
+      const std::vector<SequenceNumber>& new_snapshots,
+      const std::vector<SequenceNumber>& old_snapshots);
+
+  // Check an evicted entry against live snapshots to see if it should be kept
+  // around or whether it can be safely discarded (and hence assumed committed
+  // for all snapshots).
+  // Thread-safety: this function can be called concurrently. If it is called
+  // concurrently with multiple UpdateSnapshots, the result is the same as
+  // checking the intersection of the snapshot list before updates with the
+  // snapshot list of all the concurrent updates.
+  void CheckAgainstSnapshots(const CommitEntry& evicted);
+
+  // Add a new entry to old_commit_map_ if prep_seq <= snapshot_seq <
+  // commit_seq. Return false if checking the next snapshot(s) is not needed.
+  // This is the case if none of the next snapshots could satisfy the
+  // condition. next_is_larger: the next snapshot will be a larger value
+  bool MaybeUpdateOldCommitMap(const uint64_t& prep_seq,
+                               const uint64_t& commit_seq,
+                               const uint64_t& snapshot_seq,
+                               const bool next_is_larger);
+
+  // A trick to increase the last visible sequence number by one and also wait
+  // for the in-flight commits to be visible.
+  void AdvanceSeqByOne();
+
+  // The list of live snapshots at the last time that max_evicted_seq_
+  // advanced. The list is stored in two data structures: in snapshot_cache_,
+  // which is efficient for concurrent reads, and in snapshots_ if the data
+  // does not fit into snapshot_cache_. The total number of snapshots in the
+  // two lists:
+  std::atomic<size_t> snapshots_total_ = {};
+  // The list is sorted in ascending order. Thread-safety for writes is
+  // provided with snapshots_mutex_, and concurrent reads are safe due to
+  // std::atomic for each entry. On the x86_64 architecture such reads are
+  // compiled to simple read instructions.
+  const size_t SNAPSHOT_CACHE_BITS;
+  const size_t SNAPSHOT_CACHE_SIZE;
+  std::unique_ptr<std::atomic<SequenceNumber>[]> snapshot_cache_;
+  // 2nd list for storing snapshots. The list is sorted in ascending order.
+  // Thread-safety is provided with snapshots_mutex_.
+  std::vector<SequenceNumber> snapshots_;
+  // The list of all snapshots: snapshots_ + snapshot_cache_. This list,
+  // although redundant, simplifies the CleanupOldSnapshots implementation.
+  // Thread-safety is provided with snapshots_mutex_.
+  std::vector<SequenceNumber> snapshots_all_;
+  // The version of the latest list of snapshots. This can be used to avoid
+  // rewriting a list that is concurrently updated with a more recent version.
+  SequenceNumber snapshots_version_ = 0;
+
+  // A heap of prepared transactions. Thread-safety is provided with
+  // prepared_mutex_.
+  PreparedHeap prepared_txns_;
+  const size_t COMMIT_CACHE_BITS;
+  const size_t COMMIT_CACHE_SIZE;
+  const CommitEntry64bFormat FORMAT;
+  // commit_cache_ must be initialized to zero to tell apart an empty index
+  // from a filled one. Thread-safety is provided with commit_cache_mutex_.
+  std::unique_ptr<std::atomic<CommitEntry64b>[]> commit_cache_;
+  // The largest evicted *commit* sequence number from the commit_cache_. If a
+  // seq is smaller than max_evicted_seq_, it might or might not be present in
+  // commit_cache_. So commit_cache_ must be checked first before consulting
+  // max_evicted_seq_.
+  std::atomic<SequenceNumber> max_evicted_seq_ = {};
+  // Order: 1) update future_max_evicted_seq_ = new_max, 2)
+  // GetSnapshotListFromDB(new_max), 3) max_evicted_seq_ = new_max. Since
+  // GetSnapshotInternal guarantees that the snapshot seq is larger than
+  // future_max_evicted_seq_, this guarantees that any snapshot that is not
+  // larger than max has already been looked at via a
+  // GetSnapshotListFromDB(new_max).
+  std::atomic<SequenceNumber> future_max_evicted_seq_ = {};
+  // Advance max_evicted_seq_ by this value each time it needs an update. The
+  // larger the value, the less frequent the advances would be.
+  // We do not want it to be too large either, as it would cause stalls by
+  // doing too much maintenance work under the lock.
+  size_t INC_STEP_FOR_MAX_EVICTED = 1;
+  // A map from old snapshots (expected to be used by a few read-only txns) to
+  // the prepared sequence numbers of the entries evicted from commit_cache_
+  // that overlap with such a snapshot. These are the prepared sequence
+  // numbers that the snapshot, to which they are mapped, cannot assume to be
+  // committed just because they are no longer in the commit_cache_. The
+  // vector must be sorted after each update.
+  // Thread-safety is provided with old_commit_map_mutex_.
+  std::map<SequenceNumber, std::vector<SequenceNumber>> old_commit_map_;
+  // A set of long-running prepared transactions that are not finished by the
+  // time max_evicted_seq_ advances their sequence number. This is expected to
+  // be empty normally. Thread-safety is provided with prepared_mutex_.
+  std::set<uint64_t> delayed_prepared_;
+  // Commit of a delayed prepared: 1) update the commit cache, 2) update
+  // delayed_prepared_commits_, 3) publish the seq, 4) clean up
+  // delayed_prepared_. delayed_prepared_commits_ will help us tell apart the
+  // unprepared txns from the ones that are committed but not cleaned up yet.
+  std::unordered_map<SequenceNumber, SequenceNumber> delayed_prepared_commits_;
+  // Updated when delayed_prepared_.empty() changes. Expected to be true
+  // normally.
+  std::atomic<bool> delayed_prepared_empty_ = {true};
+  // Updated when old_commit_map_.empty() changes. Expected to be true
+  // normally.
+  std::atomic<bool> old_commit_map_empty_ = {true};
+  mutable port::RWMutex prepared_mutex_;
+  mutable port::RWMutex old_commit_map_mutex_;
+  mutable port::RWMutex commit_cache_mutex_;
+  mutable port::RWMutex snapshots_mutex_;
+  // A cache of the cf comparators
+  // Thread-safety: since it is const, it is safe to read it concurrently
+  std::shared_ptr<std::map<uint32_t, const Comparator*>> cf_map_;
+  // A cache of the cf handles
+  // Thread-safety: since the handle is a read-only object and the map is
+  // const, it is safe to read it concurrently
+  std::shared_ptr<std::map<uint32_t, ColumnFamilyHandle*>> handle_map_;
+  // A dummy snapshot object that refers to kMaxSequenceNumber
+  SnapshotImpl dummy_max_snapshot_;
+};
+
+class WritePreparedTxnReadCallback : public ReadCallback {
+ public:
+  WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot)
+      : ReadCallback(snapshot),
+        db_(db),
+        backed_by_snapshot_(kBackedByDBSnapshot) {}
+  WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot,
+                               SequenceNumber min_uncommitted,
+                               SnapshotBackup backed_by_snapshot)
+      : ReadCallback(snapshot, min_uncommitted),
+        db_(db),
+        backed_by_snapshot_(backed_by_snapshot) {
+    (void)backed_by_snapshot_;  // to silence unused private field warning
+  }
+
+  virtual ~WritePreparedTxnReadCallback() {
+    // If it is not backed by a snapshot, the caller must check validity
+    assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot);
+  }
+
+  // Will be called to see if the seq number is visible; if not, it moves on
+  // to the next seq number.
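+  //
+  // Added summary (restating WritePreparedTxnDB::IsInSnapshot, no new
+  // logic): seq < min_uncommitted_ is always visible, seq > max_visible_seq_
+  // never is, and anything in between is resolved via the commit cache,
+  // delayed_prepared_, and old_commit_map_.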
+ inline virtual bool IsVisibleFullCheck(SequenceNumber seq) override { + auto snapshot = max_visible_seq_; + bool snap_released = false; + auto ret = + db_->IsInSnapshot(seq, snapshot, min_uncommitted_, &snap_released); + assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot); + snap_released_ |= snap_released; + return ret; + } + + inline bool valid() { + valid_checked_ = true; + return snap_released_ == false; + } + + // TODO(myabandeh): override Refresh when Iterator::Refresh is supported + private: + WritePreparedTxnDB* db_; + // Whether max_visible_seq_ is backed by a snapshot + const SnapshotBackup backed_by_snapshot_; + bool snap_released_ = false; + // Safety check to ensure that the caller has checked invalid statuses + bool valid_checked_ = false; +}; + +class AddPreparedCallback : public PreReleaseCallback { + public: + AddPreparedCallback(WritePreparedTxnDB* db, DBImpl* db_impl, + size_t sub_batch_cnt, bool two_write_queues, + bool first_prepare_batch) + : db_(db), + db_impl_(db_impl), + sub_batch_cnt_(sub_batch_cnt), + two_write_queues_(two_write_queues), + first_prepare_batch_(first_prepare_batch) { + (void)two_write_queues_; // to silence unused private field warning + } + virtual Status Callback(SequenceNumber prepare_seq, + bool is_mem_disabled __attribute__((__unused__)), + uint64_t log_number, size_t index, + size_t total) override { + assert(index < total); + // To reduce the cost of lock acquisition competing with the concurrent + // prepare requests, lock on the first callback and unlock on the last. + const bool do_lock = !two_write_queues_ || index == 0; + const bool do_unlock = !two_write_queues_ || index + 1 == total; + // Always Prepare from the main queue + assert(!two_write_queues_ || !is_mem_disabled); // implies the 1st queue + TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:pause"); + TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:resume"); + if (do_lock) { + db_->prepared_txns_.push_pop_mutex()->Lock(); + } + const bool kLocked = true; + for (size_t i = 0; i < sub_batch_cnt_; i++) { + db_->AddPrepared(prepare_seq + i, kLocked); + } + if (do_unlock) { + db_->prepared_txns_.push_pop_mutex()->Unlock(); + } + TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::end"); + if (first_prepare_batch_) { + assert(log_number != 0); + db_impl_->logs_with_prep_tracker()->MarkLogAsContainingPrepSection( + log_number); + } + return Status::OK(); + } + + private: + WritePreparedTxnDB* db_; + DBImpl* db_impl_; + size_t sub_batch_cnt_; + bool two_write_queues_; + // It is 2PC and this is the first prepare batch. Always the case in 2PC + // unless it is WriteUnPrepared. + bool first_prepare_batch_; +}; + +class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { + public: + // includes_data indicates that the commit also writes non-empty + // CommitTimeWriteBatch to memtable, which needs to be committed separately. 
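+  //
+  // Added walk-through with made-up numbers (illustrative, not upstream
+  // text): a prepare batch at prep_seq = 10 with prep_batch_cnt = 2 whose
+  // commit marker lands at commit_seq = 17 yields AddCommitted(10, 17) and
+  // AddCommitted(11, 17), so both sub-batches become visible exactly to
+  // snapshots taken at seq >= 17.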
+  WritePreparedCommitEntryPreReleaseCallback(
+      WritePreparedTxnDB* db, DBImpl* db_impl, SequenceNumber prep_seq,
+      size_t prep_batch_cnt, size_t data_batch_cnt = 0,
+      SequenceNumber aux_seq = kMaxSequenceNumber, size_t aux_batch_cnt = 0)
+      : db_(db),
+        db_impl_(db_impl),
+        prep_seq_(prep_seq),
+        prep_batch_cnt_(prep_batch_cnt),
+        data_batch_cnt_(data_batch_cnt),
+        includes_data_(data_batch_cnt_ > 0),
+        aux_seq_(aux_seq),
+        aux_batch_cnt_(aux_batch_cnt),
+        includes_aux_batch_(aux_batch_cnt > 0) {
+    assert((prep_batch_cnt_ > 0) != (prep_seq == kMaxSequenceNumber));  // xor
+    assert(prep_batch_cnt_ > 0 || data_batch_cnt_ > 0);
+    assert((aux_batch_cnt_ > 0) != (aux_seq == kMaxSequenceNumber));  // xor
+  }
+
+  virtual Status Callback(SequenceNumber commit_seq,
+                          bool is_mem_disabled __attribute__((__unused__)),
+                          uint64_t, size_t /*index*/,
+                          size_t /*total*/) override {
+    // Always commit from the 2nd queue
+    assert(!db_impl_->immutable_db_options().two_write_queues ||
+           is_mem_disabled);
+    assert(includes_data_ || prep_seq_ != kMaxSequenceNumber);
+    // The data batch is what accompanies the commit marker and affects the
+    // last seq in the commit batch.
+    const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1)
+                                         ? commit_seq
+                                         : commit_seq + data_batch_cnt_ - 1;
+    if (prep_seq_ != kMaxSequenceNumber) {
+      for (size_t i = 0; i < prep_batch_cnt_; i++) {
+        db_->AddCommitted(prep_seq_ + i, last_commit_seq);
+      }
+    }  // else there was no prepare phase
+    if (includes_aux_batch_) {
+      for (size_t i = 0; i < aux_batch_cnt_; i++) {
+        db_->AddCommitted(aux_seq_ + i, last_commit_seq);
+      }
+    }
+    if (includes_data_) {
+      assert(data_batch_cnt_);
+      // Commit the data that accompanies the commit request
+      for (size_t i = 0; i < data_batch_cnt_; i++) {
+        // For the commit seq of each batch use the commit seq of the last
+        // batch. This makes debugging easier by having all the batches carry
+        // the same sequence number.
+        db_->AddCommitted(commit_seq + i, last_commit_seq);
+      }
+    }
+    if (db_impl_->immutable_db_options().two_write_queues) {
+      assert(is_mem_disabled);  // implies the 2nd queue
+      // Publish the sequence number. We can do that here assuming the
+      // callback is invoked only from one write queue, which would guarantee
+      // that the published sequence numbers will be in order, i.e., once a
+      // seq is published, all the seqs prior to it are also publishable.
+      db_impl_->SetLastPublishedSequence(last_commit_seq);
+      // Note RemovePrepared should be called after publishing the seq.
+      // Otherwise the SmallestUnCommittedSeq optimization breaks.
+      if (prep_seq_ != kMaxSequenceNumber) {
+        db_->RemovePrepared(prep_seq_, prep_batch_cnt_);
+      }  // else there was no prepare phase
+      if (includes_aux_batch_) {
+        db_->RemovePrepared(aux_seq_, aux_batch_cnt_);
+      }
+    }
+    // else the SequenceNumber that is updated as part of the write already
+    // does the publishing
+    return Status::OK();
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  DBImpl* db_impl_;
+  // kMaxSequenceNumber if there was no prepare phase
+  SequenceNumber prep_seq_;
+  size_t prep_batch_cnt_;
+  size_t data_batch_cnt_;
+  // Data here is the batch that is written with the commit marker, either
+  // because it is a commit without prepare or the commit has a
+  // CommitTimeWriteBatch.
+  bool includes_data_;
+  // The auxiliary batch (if there is any) is a batch that is written before,
+  // but gets the same commit seq as the prepare batch or data batch.
+  // This is used in two write queues, where the CommitTimeWriteBatch becomes
+  // the aux batch and we do a separate write to actually commit everything.
+  SequenceNumber aux_seq_;
+  size_t aux_batch_cnt_;
+  bool includes_aux_batch_;
+};
+
+// For two_write_queues, commits both the aborted batch and the cleanup batch
+// and then publishes the seq
+class WritePreparedRollbackPreReleaseCallback : public PreReleaseCallback {
+ public:
+  WritePreparedRollbackPreReleaseCallback(WritePreparedTxnDB* db,
+                                          DBImpl* db_impl,
+                                          SequenceNumber prep_seq,
+                                          SequenceNumber rollback_seq,
+                                          size_t prep_batch_cnt)
+      : db_(db),
+        db_impl_(db_impl),
+        prep_seq_(prep_seq),
+        rollback_seq_(rollback_seq),
+        prep_batch_cnt_(prep_batch_cnt) {
+    assert(prep_seq != kMaxSequenceNumber);
+    assert(rollback_seq != kMaxSequenceNumber);
+    assert(prep_batch_cnt_ > 0);
+  }
+
+  Status Callback(SequenceNumber commit_seq, bool is_mem_disabled, uint64_t,
+                  size_t /*index*/, size_t /*total*/) override {
+    // Always commit from the 2nd queue
+    assert(is_mem_disabled);  // implies the 2nd queue
+    assert(db_impl_->immutable_db_options().two_write_queues);
+#ifdef NDEBUG
+    (void)is_mem_disabled;
+#endif
+    const uint64_t last_commit_seq = commit_seq;
+    db_->AddCommitted(rollback_seq_, last_commit_seq);
+    for (size_t i = 0; i < prep_batch_cnt_; i++) {
+      db_->AddCommitted(prep_seq_ + i, last_commit_seq);
+    }
+    db_impl_->SetLastPublishedSequence(last_commit_seq);
+    return Status::OK();
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  DBImpl* db_impl_;
+  SequenceNumber prep_seq_;
+  SequenceNumber rollback_seq_;
+  size_t prep_batch_cnt_;
+};
+
+// Count the number of sub-batches inside a batch. A sub-batch does not have
+// duplicate keys.
+struct SubBatchCounter : public WriteBatch::Handler {
+  explicit SubBatchCounter(std::map<uint32_t, const Comparator*>& comparators)
+      : comparators_(comparators), batches_(1) {}
+  std::map<uint32_t, const Comparator*>& comparators_;
+  using CFKeys = std::set<Slice, SetComparator>;
+  std::map<uint32_t, CFKeys> keys_;
+  size_t batches_;
+  size_t BatchCount() { return batches_; }
+  void AddKey(const uint32_t cf, const Slice& key);
+  void InitWithComp(const uint32_t cf);
+  Status MarkNoop(bool) override { return Status::OK(); }
+  Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+  Status MarkCommit(const Slice&) override { return Status::OK(); }
+  Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+    AddKey(cf, key);
+    return Status::OK();
+  }
+  Status DeleteCF(uint32_t cf, const Slice& key) override {
+    AddKey(cf, key);
+    return Status::OK();
+  }
+  Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+    AddKey(cf, key);
+    return Status::OK();
+  }
+  Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+    AddKey(cf, key);
+    return Status::OK();
+  }
+  Status MarkBeginPrepare(bool) override { return Status::OK(); }
+  Status MarkRollback(const Slice&) override { return Status::OK(); }
+  Handler::OptionState WriteAfterCommit() const override {
+    return Handler::OptionState::kDisabled;
+  }
+};
+
+SnapshotBackup WritePreparedTxnDB::AssignMinMaxSeqs(const Snapshot* snapshot,
+                                                    SequenceNumber* min,
+                                                    SequenceNumber* max) {
+  if (snapshot != nullptr) {
+    *min =
+        static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+    *max = static_cast_with_check<const SnapshotImpl>(snapshot)->number_;
+    // A duplicate of the check in EnhanceSnapshot().
+    assert(*min <= *max + 1);
+    return kBackedByDBSnapshot;
+  } else {
+    *min = SmallestUnCommittedSeq();
+    *max = 0;  // to be assigned later after sv is referenced.
+ return kUnbackedByDBSnapshot; + } +} + +bool WritePreparedTxnDB::ValidateSnapshot( + const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot, + std::memory_order order) { + if (backed_by_snapshot == kBackedByDBSnapshot) { + return true; + } else { + SequenceNumber max = max_evicted_seq_.load(order); + // Validate that max has not advanced the snapshot seq that is not backed + // by a real snapshot. This is a very rare case that should not happen in + // real workloads. + if (UNLIKELY(snap_seq <= max && snap_seq != 0)) { + return false; + } + } + return true; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn.cc new file mode 100644 index 0000000000..845b117cfd --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn.cc @@ -0,0 +1,1056 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#include "utilities/transactions/write_unprepared_txn.h" + +#include "db/db_impl/db_impl.h" +#include "util/cast_util.h" +#include "utilities/transactions/write_unprepared_txn_db.h" +#include "utilities/write_batch_with_index/write_batch_with_index_internal.h" + +namespace ROCKSDB_NAMESPACE { + +bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) { + // Since unprep_seqs maps prep_seq => prepare_batch_cnt, to check if seq is + // in unprep_seqs, we have to check if seq is equal to prep_seq or any of + // the prepare_batch_cnt seq nums after it. + // + // TODO(lth): Can be optimized with std::lower_bound if unprep_seqs is + // large. + for (const auto& it : unprep_seqs_) { + if (it.first <= seq && seq < it.first + it.second) { + return true; + } + } + + bool snap_released = false; + auto ret = + db_->IsInSnapshot(seq, wup_snapshot_, min_uncommitted_, &snap_released); + assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot); + snap_released_ |= snap_released; + return ret; +} + +WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db, + const WriteOptions& write_options, + const TransactionOptions& txn_options) + : WritePreparedTxn(txn_db, write_options, txn_options), + wupt_db_(txn_db), + last_log_number_(0), + recovered_txn_(false), + largest_validated_seq_(0) { + if (txn_options.write_batch_flush_threshold < 0) { + write_batch_flush_threshold_ = + txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold; + } else { + write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold; + } +} + +WriteUnpreparedTxn::~WriteUnpreparedTxn() { + if (!unprep_seqs_.empty()) { + assert(log_number_ > 0); + assert(GetId() > 0); + assert(!name_.empty()); + + // We should rollback regardless of GetState, but some unit tests that + // test crash recovery run the destructor assuming that rollback does not + // happen, so that rollback during recovery can be exercised. 
+    if (GetState() == STARTED || GetState() == LOCKS_STOLEN) {
+      auto s = RollbackInternal();
+      assert(s.ok());
+      if (!s.ok()) {
+        ROCKS_LOG_FATAL(
+            wupt_db_->info_log_,
+            "Rollback of WriteUnprepared transaction failed in destructor: %s",
+            s.ToString().c_str());
+      }
+      dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+          log_number_);
+    }
+  }
+
+  // Clear the tracked locks so that ~PessimisticTransaction does not
+  // try to unlock keys for recovered transactions.
+  if (recovered_txn_) {
+    tracked_locks_->Clear();
+  }
+}
+
+void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) {
+  PessimisticTransaction::Initialize(txn_options);
+  if (txn_options.write_batch_flush_threshold < 0) {
+    write_batch_flush_threshold_ =
+        txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold;
+  } else {
+    write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold;
+  }
+
+  unprep_seqs_.clear();
+  flushed_save_points_.reset(nullptr);
+  unflushed_save_points_.reset(nullptr);
+  recovered_txn_ = false;
+  largest_validated_seq_ = 0;
+  assert(active_iterators_.empty());
+  active_iterators_.clear();
+  untracked_keys_.clear();
+}
+
+Status WriteUnpreparedTxn::HandleWrite(std::function<Status()> do_write) {
+  Status s;
+  if (active_iterators_.empty()) {
+    s = MaybeFlushWriteBatchToDB();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  s = do_write();
+  if (s.ok()) {
+    if (snapshot_) {
+      largest_validated_seq_ =
+          std::max(largest_validated_seq_, snapshot_->GetSequenceNumber());
+    } else {
+      // TODO(lth): We should use the same number as tracked_at_seq in TryLock,
+      // because what is actually being tracked is the sequence number at
+      // which this key was locked.
+      largest_validated_seq_ = db_impl_->GetLastPublishedSequence();
+    }
+  }
+  return s;
+}
+
+Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family,
+                               const Slice& key, const Slice& value,
+                               const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Put(column_family, key, value, assume_tracked);
+  });
+}
+
+Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family,
+                               const SliceParts& key, const SliceParts& value,
+                               const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Put(column_family, key, value, assume_tracked);
+  });
+}
+
+Status WriteUnpreparedTxn::Merge(ColumnFamilyHandle* column_family,
+                                 const Slice& key, const Slice& value,
+                                 const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Merge(column_family, key, value,
+                                      assume_tracked);
+  });
+}
+
+Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* column_family,
+                                  const Slice& key, const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+  });
+}
+
+Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* column_family,
+                                  const SliceParts& key,
+                                  const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+  });
+}
+
+Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family,
+                                        const Slice& key,
+                                        const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::SingleDelete(column_family, key,
+                                             assume_tracked);
+  });
+}
+
+Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family,
+                                        const SliceParts& key,
+                                        const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::SingleDelete(column_family, key,
+                                             assume_tracked);
+  });
+}
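+
+// Added usage sketch (illustrative only; the names below are hypothetical
+// and TransactionDB setup is elided): every write funnels through
+// HandleWrite, so a WriteUnprepared transaction may spill its batch to the
+// DB mid-transaction once it exceeds write_batch_flush_threshold:
+//
+//   TransactionOptions topts;
+//   topts.write_batch_flush_threshold = 1024;  // bytes
+//   Transaction* txn = txn_db->BeginTransaction(WriteOptions(), topts);
+//   txn->SetName("xid1");
+//   for (int i = 0; i < 100000; i++) {
+//     // may trigger an unprepared flush once the batch exceeds 1024 bytes
+//     txn->Put(std::to_string(i), "value");
+//   }
+//   txn->Commit();  // commits all previously flushed unprepared batches too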
+
+// WriteUnpreparedTxn::RebuildFromWriteBatch is only called on recovery. For
+// WriteUnprepared, the write batches have already been written into the
+// database during WAL replay, so all we have to do is re-track the keys so
+// that rollbacks are possible.
+//
+// Calling TryLock instead of TrackKey is also possible, but as an
+// optimization, recovered transactions do not hold locks on their keys. This
+// follows the implementation in PessimisticTransactionDB::Initialize where we
+// set skip_concurrency_control to true.
+Status WriteUnpreparedTxn::RebuildFromWriteBatch(WriteBatch* wb) {
+  struct TrackKeyHandler : public WriteBatch::Handler {
+    WriteUnpreparedTxn* txn_;
+    bool rollback_merge_operands_;
+
+    TrackKeyHandler(WriteUnpreparedTxn* txn, bool rollback_merge_operands)
+        : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {}
+
+    Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+      txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber,
+                     false /* read_only */, true /* exclusive */);
+      return Status::OK();
+    }
+
+    Status DeleteCF(uint32_t cf, const Slice& key) override {
+      txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber,
+                     false /* read_only */, true /* exclusive */);
+      return Status::OK();
+    }
+
+    Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+      txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber,
+                     false /* read_only */, true /* exclusive */);
+      return Status::OK();
+    }
+
+    Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+      if (rollback_merge_operands_) {
+        txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber,
+                       false /* read_only */, true /* exclusive */);
+      }
+      return Status::OK();
+    }
+
+    // Recovered batches do not contain 2PC markers.
+    Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }
+
+    Status MarkEndPrepare(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkNoop(bool) override { return Status::InvalidArgument(); }
+
+    Status MarkCommit(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkRollback(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+  };
+
+  TrackKeyHandler handler(this,
+                          wupt_db_->txn_db_options_.rollback_merge_operands);
+  return wb->Iterate(&handler);
+}
+
+Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() {
+  const bool kPrepared = true;
+  Status s;
+  if (write_batch_flush_threshold_ > 0 &&
+      write_batch_.GetWriteBatch()->Count() > 0 &&
+      write_batch_.GetDataSize() >
+          static_cast<size_t>(write_batch_flush_threshold_)) {
+    assert(GetState() != PREPARED);
+    s = FlushWriteBatchToDB(!kPrepared);
+  }
+  return s;
+}
+
+Status WriteUnpreparedTxn::FlushWriteBatchToDB(bool prepared) {
+  // If the current write batch contains savepoints, then some special
+  // handling is required so that RollbackToSavepoint can work.
+  //
+  // RollbackToSavepoint is not supported after Prepare() is called, so only
+  // do this for unprepared batches.
+  if (!prepared && unflushed_save_points_ != nullptr &&
+      !unflushed_save_points_->empty()) {
+    return FlushWriteBatchWithSavePointToDB();
+  }
+
+  return FlushWriteBatchToDBInternal(prepared);
+}
+
+Status WriteUnpreparedTxn::FlushWriteBatchToDBInternal(bool prepared) {
+  if (name_.empty()) {
+    assert(!prepared);
+#ifndef NDEBUG
+    static std::atomic_ullong autogen_id{0};
+    // To avoid changing all tests to call SetName, just autogenerate one.
+    if (wupt_db_->txn_db_options_.autogenerate_name) {
+      auto s = SetName(std::string("autoxid") +
+                       std::to_string(autogen_id.fetch_add(1)));
+      assert(s.ok());
+    } else
+#endif
+    {
+      return Status::InvalidArgument("Cannot write to DB without SetName.");
+    }
+  }
+
+  struct UntrackedKeyHandler : public WriteBatch::Handler {
+    WriteUnpreparedTxn* txn_;
+    bool rollback_merge_operands_;
+
+    UntrackedKeyHandler(WriteUnpreparedTxn* txn, bool rollback_merge_operands)
+        : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {}
+
+    Status AddUntrackedKey(uint32_t cf, const Slice& key) {
+      auto str = key.ToString();
+      PointLockStatus lock_status =
+          txn_->tracked_locks_->GetPointLockStatus(cf, str);
+      if (!lock_status.locked) {
+        txn_->untracked_keys_[cf].push_back(str);
+      }
+      return Status::OK();
+    }
+
+    Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+      return AddUntrackedKey(cf, key);
+    }
+
+    Status DeleteCF(uint32_t cf, const Slice& key) override {
+      return AddUntrackedKey(cf, key);
+    }
+
+    Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+      return AddUntrackedKey(cf, key);
+    }
+
+    Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+      if (rollback_merge_operands_) {
+        return AddUntrackedKey(cf, key);
+      }
+      return Status::OK();
+    }
+
+    // The only expected 2PC marker is the initial Noop marker.
+    Status MarkNoop(bool empty_batch) override {
+      return empty_batch ? Status::OK() : Status::InvalidArgument();
+    }
+
+    Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }
+
+    Status MarkEndPrepare(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkCommit(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkRollback(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+  };
+
+  UntrackedKeyHandler handler(
+      this, wupt_db_->txn_db_options_.rollback_merge_operands);
+  auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&handler);
+  assert(s.ok());
+
+  // TODO(lth): Reduce duplicate code with WritePrepared prepare logic.
+  WriteOptions write_options = write_options_;
+  write_options.disableWAL = false;
+  const bool WRITE_AFTER_COMMIT = true;
+  const bool first_prepare_batch = log_number_ == 0;
+  // MarkEndPrepare will change the Noop marker to the appropriate marker.
+  s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(),
+                                         name_, !WRITE_AFTER_COMMIT, !prepared);
+  assert(s.ok());
+  // For each duplicate key we account for a new sub-batch
+  prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt();
+  // AddPrepared is better called in the pre-release callback; otherwise there
+  // is a non-zero chance of max advancing past prepare_seq, with readers
+  // assuming the data is committed.
+  // Also, having it in the PreReleaseCallback allows in-order addition of
+  // prepared entries to PreparedHeap and hence enables an optimization. Refer
+  // to SmallestUnCommittedSeq for more details.
+  AddPreparedCallback add_prepared_callback(
+      wpt_db_, db_impl_, prepare_batch_cnt_,
+      db_impl_->immutable_db_options().two_write_queues, first_prepare_batch);
+  const bool DISABLE_MEMTABLE = true;
+  uint64_t seq_used = kMaxSequenceNumber;
+  // log_number_ should refer to the oldest log containing uncommitted data
+  // from the current transaction. This means that if log_number_ is set,
+  // WriteImpl should not overwrite that value, so set log_used to nullptr if
+  // log_number_ is already set.
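+  //
+  // Illustrative trace of the bookkeeping below (added sketch with made-up
+  // numbers): two mid-transaction flushes of 3 and 2 sub-batches landing at
+  // seqs 40 and 70 leave unprep_seqs_ = {40: 3, 70: 2}; the commit callback
+  // later marks seqs 40-42 and 70-71 as committed and removes them from the
+  // prepared heap.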
+  s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
+                          /*callback*/ nullptr, &last_log_number_,
+                          /*log ref*/ 0, !DISABLE_MEMTABLE, &seq_used,
+                          prepare_batch_cnt_, &add_prepared_callback);
+  if (log_number_ == 0) {
+    log_number_ = last_log_number_;
+  }
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  auto prepare_seq = seq_used;
+
+  // Only call SetId if it hasn't been set yet.
+  if (GetId() == 0) {
+    SetId(prepare_seq);
+  }
+  // unprep_seqs_ will also contain prepared seqnos since they are treated in
+  // the same way in the prepare/commit callbacks. See the comment on the
+  // definition of unprep_seqs_.
+  unprep_seqs_[prepare_seq] = prepare_batch_cnt_;
+
+  // Reset transaction state.
+  if (!prepared) {
+    prepare_batch_cnt_ = 0;
+    const bool kClear = true;
+    TransactionBaseImpl::InitWriteBatch(kClear);
+  }
+
+  return s;
+}
+
+Status WriteUnpreparedTxn::FlushWriteBatchWithSavePointToDB() {
+  assert(unflushed_save_points_ != nullptr &&
+         unflushed_save_points_->size() > 0);
+  assert(save_points_ != nullptr && save_points_->size() > 0);
+  assert(save_points_->size() >= unflushed_save_points_->size());
+
+  // Handler class for creating an unprepared batch from a savepoint.
+  struct SavePointBatchHandler : public WriteBatch::Handler {
+    WriteBatchWithIndex* wb_;
+    const std::map<uint32_t, ColumnFamilyHandle*>& handles_;
+
+    SavePointBatchHandler(
+        WriteBatchWithIndex* wb,
+        const std::map<uint32_t, ColumnFamilyHandle*>& handles)
+        : wb_(wb), handles_(handles) {}
+
+    Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override {
+      return wb_->Put(handles_.at(cf), key, value);
+    }
+
+    Status DeleteCF(uint32_t cf, const Slice& key) override {
+      return wb_->Delete(handles_.at(cf), key);
+    }
+
+    Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+      return wb_->SingleDelete(handles_.at(cf), key);
+    }
+
+    Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override {
+      return wb_->Merge(handles_.at(cf), key, value);
+    }
+
+    // The only expected 2PC marker is the initial Noop marker.
+    Status MarkNoop(bool empty_batch) override {
+      return empty_batch ? Status::OK() : Status::InvalidArgument();
+    }
+
+    Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }
+
+    Status MarkEndPrepare(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkCommit(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkRollback(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+  };
+
+  // The comparator of the default cf is passed in, similar to the
+  // initialization of TransactionBaseImpl::write_batch_. This comparator is
+  // only used if the write batch encounters an invalid cf id, and falls back
+  // to this comparator.
+  WriteBatchWithIndex wb(wpt_db_->DefaultColumnFamily()->GetComparator(), 0,
+                         true, 0, write_options_.protection_bytes_per_key);
+  // Swap with write_batch_ so that wb contains the complete write batch. The
+  // actual write batch that will be flushed to DB will be built in
+  // write_batch_, and will be read by FlushWriteBatchToDBInternal.
+  std::swap(wb, write_batch_);
+  TransactionBaseImpl::InitWriteBatch();
+
+  size_t prev_boundary = WriteBatchInternal::kHeader;
+  const bool kPrepared = true;
+  for (size_t i = 0; i < unflushed_save_points_->size() + 1; i++) {
+    bool trailing_batch = i == unflushed_save_points_->size();
+    SavePointBatchHandler sp_handler(&write_batch_,
+                                     *wupt_db_->GetCFHandleMap().get());
+    size_t curr_boundary = trailing_batch
+                               ? wb.GetWriteBatch()->GetDataSize()
+                               : (*unflushed_save_points_)[i];
+
+    // Construct the partial write batch up to the savepoint.
+    //
+    // Theoretically, a memcpy between the write batches should be sufficient,
+    // since the rewriting into the batch should produce the exact same byte
+    // representation. Rebuilding the WriteBatchWithIndex index is still
+    // necessary though, and that would imply doing two passes over the batch.
+    Status s = WriteBatchInternal::Iterate(wb.GetWriteBatch(), &sp_handler,
+                                           prev_boundary, curr_boundary);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (write_batch_.GetWriteBatch()->Count() > 0) {
+      // Flush the write batch.
+      s = FlushWriteBatchToDBInternal(!kPrepared);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    if (!trailing_batch) {
+      if (flushed_save_points_ == nullptr) {
+        flushed_save_points_.reset(
+            new autovector<WriteUnpreparedTxn::SavePoint>());
+      }
+      flushed_save_points_->emplace_back(
+          unprep_seqs_, new ManagedSnapshot(db_impl_, wupt_db_->GetSnapshot()));
+    }
+
+    prev_boundary = curr_boundary;
+    const bool kClear = true;
+    TransactionBaseImpl::InitWriteBatch(kClear);
+  }
+
+  unflushed_save_points_->clear();
+  return Status::OK();
+}
+
+Status WriteUnpreparedTxn::PrepareInternal() {
+  const bool kPrepared = true;
+  return FlushWriteBatchToDB(kPrepared);
+}
+
+Status WriteUnpreparedTxn::CommitWithoutPrepareInternal() {
+  if (unprep_seqs_.empty()) {
+    assert(log_number_ == 0);
+    assert(GetId() == 0);
+    return WritePreparedTxn::CommitWithoutPrepareInternal();
+  }
+
+  // TODO(lth): We should optimize commit without prepare to not perform
+  // a prepare under the hood.
+  auto s = PrepareInternal();
+  if (!s.ok()) {
+    return s;
+  }
+  return CommitInternal();
+}
+
+Status WriteUnpreparedTxn::CommitInternal() {
+  // TODO(lth): Reduce duplicate code with WritePrepared commit logic.
+
+  // We take the commit-time batch and append the Commit marker. The Memtable
+  // will ignore the Commit marker in non-recovery mode
+  WriteBatch* working_batch = GetCommitTimeWriteBatch();
+  const bool empty = working_batch->Count() == 0;
+  auto s = WriteBatchInternal::MarkCommit(working_batch, name_);
+  assert(s.ok());
+
+  const bool for_recovery = use_only_the_last_commit_time_batch_for_recovery_;
+  if (!empty) {
+    // When not writing to memtable, we can still cache the latest write batch.
+    // The cached batch will be written to memtable in WriteRecoverableState
+    // during FlushMemTable
+    if (for_recovery) {
+      WriteBatchInternal::SetAsLatestPersistentState(working_batch);
+    } else {
+      return Status::InvalidArgument(
+          "Commit-time-batch can only be used if "
+          "use_only_the_last_commit_time_batch_for_recovery is true");
+    }
+  }
+
+  const bool includes_data = !empty && !for_recovery;
+  size_t commit_batch_cnt = 0;
+  if (UNLIKELY(includes_data)) {
+    ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+                   "Duplicate key overhead");
+    SubBatchCounter counter(*wpt_db_->GetCFComparatorMap());
+    s = working_batch->Iterate(&counter);
+    assert(s.ok());
+    commit_batch_cnt = counter.BatchCount();
+  }
+  const bool disable_memtable = !includes_data;
+  const bool do_one_write =
+      !db_impl_->immutable_db_options().two_write_queues || disable_memtable;
+
+  WriteUnpreparedCommitEntryPreReleaseCallback update_commit_map(
+      wpt_db_, db_impl_, unprep_seqs_, commit_batch_cnt);
+  const bool kFirstPrepareBatch = true;
+  AddPreparedCallback add_prepared_callback(
+      wpt_db_, db_impl_, commit_batch_cnt,
+      db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch);
+  PreReleaseCallback* pre_release_callback;
+  if (do_one_write) {
+    pre_release_callback = &update_commit_map;
+  } else {
+    pre_release_callback = &add_prepared_callback;
+  }
+  uint64_t seq_used = kMaxSequenceNumber;
+  // Since the prepared batch is directly written to memtable, there is
+  // already a connection between the memtable and its WAL, so there is no
+  // need to redundantly reference the log that contains the prepared data.
+  const uint64_t zero_log_number = 0ull;
+  size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1;
+  s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr,
+                          zero_log_number, disable_memtable, &seq_used,
+                          batch_cnt, pre_release_callback);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  const SequenceNumber commit_batch_seq = seq_used;
+  if (LIKELY(do_one_write || !s.ok())) {
+    if (LIKELY(s.ok())) {
+      // Note RemovePrepared should be called after the WriteImpl that
+      // published the seq. Otherwise the SmallestUnCommittedSeq optimization
+      // breaks.
+      for (const auto& seq : unprep_seqs_) {
+        wpt_db_->RemovePrepared(seq.first, seq.second);
+      }
+    }
+    if (UNLIKELY(!do_one_write)) {
+      wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt);
+    }
+    unprep_seqs_.clear();
+    flushed_save_points_.reset(nullptr);
+    unflushed_save_points_.reset(nullptr);
+    return s;
+  }  // else do the 2nd write to publish seq
+
+  // Populate unprep_seqs_ with commit_batch_seq, since we treat data in the
+  // commit write batch as just another "unprepared" batch. This will also
+  // update the unprep_seqs_ in the update_commit_map callback.
+  unprep_seqs_[commit_batch_seq] = commit_batch_cnt;
+  WriteUnpreparedCommitEntryPreReleaseCallback
+      update_commit_map_with_commit_batch(wpt_db_, db_impl_, unprep_seqs_, 0);
+
+  // Note: the 2nd write comes with a performance penalty. So if we have too
+  // many commits accompanied by a CommitTimeWriteBatch and yet cannot enable
+  // the use_only_the_last_commit_time_batch_for_recovery_ optimization,
+  // two_write_queues should be disabled to avoid many additional writes here.
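+  //
+  // Illustrative timeline for the two-queue path (added sketch with made-up
+  // seqs, not upstream text): the 1st write above lands the commit batch at,
+  // say, seq 20 and runs AddPrepared(20) via add_prepared_callback without
+  // publishing. The 2nd write below appends an empty marker batch on the 2nd
+  // queue, whose callback calls AddCommitted for every entry in unprep_seqs_
+  // (now including 20) and publishes the seq; RemovePrepared then drops the
+  // entries from the prepared heap.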
+
+  // Update the commit map only from the 2nd queue
+  WriteBatch empty_batch;
+  s = empty_batch.PutLogData(Slice());
+  assert(s.ok());
+  // In the absence of Prepare markers, use Noop as a batch separator
+  s = WriteBatchInternal::InsertNoop(&empty_batch);
+  assert(s.ok());
+  const bool DISABLE_MEMTABLE = true;
+  const size_t ONE_BATCH = 1;
+  const uint64_t NO_REF_LOG = 0;
+  s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
+                          NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+                          &update_commit_map_with_commit_batch);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  // Note RemovePrepared should be called after the WriteImpl that published
+  // the seq. Otherwise the SmallestUnCommittedSeq optimization breaks.
+  for (const auto& seq : unprep_seqs_) {
+    wpt_db_->RemovePrepared(seq.first, seq.second);
+  }
+  unprep_seqs_.clear();
+  flushed_save_points_.reset(nullptr);
+  unflushed_save_points_.reset(nullptr);
+  return s;
+}
+
+Status WriteUnpreparedTxn::WriteRollbackKeys(
+    const LockTracker& lock_tracker, WriteBatchWithIndex* rollback_batch,
+    ReadCallback* callback, const ReadOptions& roptions) {
+  // This assertion can be removed when range lock is supported.
+  assert(lock_tracker.IsPointLockSupported());
+  const auto& cf_map = *wupt_db_->GetCFHandleMap();
+  auto WriteRollbackKey = [&](const std::string& key, uint32_t cfid) {
+    const auto& cf_handle = cf_map.at(cfid);
+    PinnableSlice pinnable_val;
+    bool not_used;
+    DBImpl::GetImplOptions get_impl_options;
+    get_impl_options.column_family = cf_handle;
+    get_impl_options.value = &pinnable_val;
+    get_impl_options.value_found = &not_used;
+    get_impl_options.callback = callback;
+    auto s = db_impl_->GetImpl(roptions, key, get_impl_options);
+
+    if (s.ok()) {
+      s = rollback_batch->Put(cf_handle, key, pinnable_val);
+      assert(s.ok());
+    } else if (s.IsNotFound()) {
+      if (wupt_db_->ShouldRollbackWithSingleDelete(cf_handle, key)) {
+        s = rollback_batch->SingleDelete(cf_handle, key);
+      } else {
+        s = rollback_batch->Delete(cf_handle, key);
+      }
+      assert(s.ok());
+    } else {
+      return s;
+    }
+
+    return Status::OK();
+  };
+
+  std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+      lock_tracker.GetColumnFamilyIterator());
+  assert(cf_it != nullptr);
+  while (cf_it->HasNext()) {
+    ColumnFamilyId cf = cf_it->Next();
+    std::unique_ptr<LockTracker::KeyIterator> key_it(
+        lock_tracker.GetKeyIterator(cf));
+    assert(key_it != nullptr);
+    while (key_it->HasNext()) {
+      const std::string& key = key_it->Next();
+      auto s = WriteRollbackKey(key, cf);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  for (const auto& cfkey : untracked_keys_) {
+    const auto cfid = cfkey.first;
+    const auto& keys = cfkey.second;
+    for (const auto& key : keys) {
+      auto s = WriteRollbackKey(key, cfid);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status WriteUnpreparedTxn::RollbackInternal() {
+  // TODO(lth): Reduce duplicate code with WritePrepared rollback logic.
+  WriteBatchWithIndex rollback_batch(
+      wpt_db_->DefaultColumnFamily()->GetComparator(), 0, true, 0,
+      write_options_.protection_bytes_per_key);
+  assert(GetId() != kMaxSequenceNumber);
+  assert(GetId() > 0);
+  Status s;
+  auto read_at_seq = kMaxSequenceNumber;
+  ReadOptions roptions;
+  // to prevent the callback's seq from being overridden inside DBImpl::Get
+  roptions.snapshot = wpt_db_->GetMaxSnapshot();
+  // Note that we do not use WriteUnpreparedTxnReadCallback because we do not
+  // need to read our own writes when reading prior versions of the key for
+  // rollback.
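+  //
+  // Added clarification (not upstream text): with read_at_seq ==
+  // kMaxSequenceNumber every *committed* version is visible to the callback,
+  // while this transaction's own unprepared batches stay invisible because
+  // they are still uncommitted, so the reads performed by WriteRollbackKeys
+  // below recover the pre-transaction values for the rollback batch.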
+Status WriteUnpreparedTxn::RollbackInternal() {
+  // TODO(lth): Reduce duplicate code with WritePrepared rollback logic.
+  WriteBatchWithIndex rollback_batch(
+      wpt_db_->DefaultColumnFamily()->GetComparator(), 0, true, 0,
+      write_options_.protection_bytes_per_key);
+  assert(GetId() != kMaxSequenceNumber);
+  assert(GetId() > 0);
+  Status s;
+  auto read_at_seq = kMaxSequenceNumber;
+  ReadOptions roptions;
+  // to prevent the callback's seq from being overridden inside DBImpl::Get
+  roptions.snapshot = wpt_db_->GetMaxSnapshot();
+  // Note that we do not use WriteUnpreparedTxnReadCallback because we do not
+  // need to read our own writes when reading prior versions of the key for
+  // rollback.
+  WritePreparedTxnReadCallback callback(wpt_db_, read_at_seq);
+  // TODO(lth): We write the rollback batch all in a single batch here, but
+  // this should be subdivided into multiple batches as well. In phase 2, when
+  // key sets are read from WAL, this will happen naturally.
+  s = WriteRollbackKeys(*tracked_locks_, &rollback_batch, &callback, roptions);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // The Rollback marker will be used as a batch separator
+  s = WriteBatchInternal::MarkRollback(rollback_batch.GetWriteBatch(), name_);
+  assert(s.ok());
+  bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
+  const bool DISABLE_MEMTABLE = true;
+  const uint64_t NO_REF_LOG = 0;
+  uint64_t seq_used = kMaxSequenceNumber;
+  // Rollback batch may contain duplicate keys, because tracked_keys_ is not
+  // comparator aware.
+  auto rollback_batch_cnt = rollback_batch.SubBatchCnt();
+  // We commit the rolled back prepared batches. Although this is
+  // counter-intuitive, i) it is safe to do so, since the prepared batches are
+  // already canceled out by the rollback batch, ii) adding the commit entry
+  // to CommitCache will allow us to benefit from the existing mechanism in
+  // CommitCache that keeps an entry that was evicted due to max advance, yet
+  // overlaps with a live snapshot, around so that the live snapshot properly
+  // skips the entry even if its prepare seq is lower than max_evicted_seq_.
+  //
+  // TODO(lth): RollbackInternal is conceptually very similar to
+  // CommitInternal, with the rollback batch simply taking on the role of
+  // CommitTimeWriteBatch. We should be able to merge the two code paths.
+  WriteUnpreparedCommitEntryPreReleaseCallback update_commit_map(
+      wpt_db_, db_impl_, unprep_seqs_, rollback_batch_cnt);
+  // Note: the rollback batch does not need AddPrepared since it is written to
+  // DB in one shot. min_uncommitted still works since it requires capturing
+  // data that is written to DB but not yet committed, while the rollback
+  // batch commits with PreReleaseCallback.
+  s = db_impl_->WriteImpl(write_options_, rollback_batch.GetWriteBatch(),
+                          nullptr, nullptr, NO_REF_LOG, !DISABLE_MEMTABLE,
+                          &seq_used, rollback_batch_cnt,
+                          do_one_write ? &update_commit_map : nullptr);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  if (!s.ok()) {
+    return s;
+  }
+  if (do_one_write) {
+    for (const auto& seq : unprep_seqs_) {
+      wpt_db_->RemovePrepared(seq.first, seq.second);
+    }
+    unprep_seqs_.clear();
+    flushed_save_points_.reset(nullptr);
+    unflushed_save_points_.reset(nullptr);
+    return s;
+  }  // else do the 2nd write for commit
+
+  uint64_t& prepare_seq = seq_used;
+  // Populate unprep_seqs_ with rollback_batch_cnt, since we treat data in the
+  // rollback write batch as just another "unprepared" batch. This will also
+  // update the unprep_seqs_ in the update_commit_map callback.
+  unprep_seqs_[prepare_seq] = rollback_batch_cnt;
+  WriteUnpreparedCommitEntryPreReleaseCallback
+      update_commit_map_with_rollback_batch(wpt_db_, db_impl_, unprep_seqs_, 0);
+
+  ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+                    "RollbackInternal 2nd write prepare_seq: %" PRIu64,
+                    prepare_seq);
+  WriteBatch empty_batch;
+  const size_t ONE_BATCH = 1;
+  s = empty_batch.PutLogData(Slice());
+  assert(s.ok());
+  // In the absence of Prepare markers, use Noop as a batch separator
+  s = WriteBatchInternal::InsertNoop(&empty_batch);
+  assert(s.ok());
+  s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
+                          NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+                          &update_commit_map_with_rollback_batch);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  // Mark the txn as rolled back
+  if (s.ok()) {
+    for (const auto& seq : unprep_seqs_) {
+      wpt_db_->RemovePrepared(seq.first, seq.second);
+    }
+  }
+
+  unprep_seqs_.clear();
+  flushed_save_points_.reset(nullptr);
+  unflushed_save_points_.reset(nullptr);
+  return s;
+}
+
+void WriteUnpreparedTxn::Clear() {
+  if (!recovered_txn_) {
+    txn_db_impl_->UnLock(this, *tracked_locks_);
+  }
+  unprep_seqs_.clear();
+  flushed_save_points_.reset(nullptr);
+  unflushed_save_points_.reset(nullptr);
+  recovered_txn_ = false;
+  largest_validated_seq_ = 0;
+  for (auto& it : active_iterators_) {
+    auto bdit = static_cast<BaseDeltaIterator*>(it);
+    bdit->Invalidate(Status::InvalidArgument(
+        "Cannot use iterator after transaction has finished"));
+  }
+  active_iterators_.clear();
+  untracked_keys_.clear();
+  TransactionBaseImpl::Clear();
+}
+
+void WriteUnpreparedTxn::SetSavePoint() {
+  assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) +
+             (flushed_save_points_ ? flushed_save_points_->size() : 0) ==
+         (save_points_ ? save_points_->size() : 0));
+  PessimisticTransaction::SetSavePoint();
+  if (unflushed_save_points_ == nullptr) {
+    unflushed_save_points_.reset(new autovector<size_t>());
+  }
+  unflushed_save_points_->push_back(write_batch_.GetDataSize());
+}
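
[Editorial note] The savepoint machinery above is driven through the standard Transaction API. A hedged usage sketch with placeholder keys; it assumes a transaction obtained from a TransactionDB as in the earlier sketch.

```cpp
// Sketch: rolling back to a savepoint undoes writes made after it, whether
// they still sit in the in-memory batch (unflushed_save_points_) or have
// already been flushed to the DB (flushed_save_points_).
#include <cassert>
#include "rocksdb/utilities/transaction.h"

void SavepointDemo(rocksdb::Transaction* txn) {
  assert(txn->Put("a", "1").ok());
  txn->SetSavePoint();                       // records batch size / snapshot
  assert(txn->Put("b", "2").ok());
  assert(txn->RollbackToSavePoint().ok());   // "b" is undone, "a" survives
  assert(txn->Commit().ok());
}
```
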
+Status WriteUnpreparedTxn::RollbackToSavePoint() {
+  assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) +
+             (flushed_save_points_ ? flushed_save_points_->size() : 0) ==
+         (save_points_ ? save_points_->size() : 0));
+  if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) {
+    Status s = PessimisticTransaction::RollbackToSavePoint();
+    assert(!s.IsNotFound());
+    unflushed_save_points_->pop_back();
+    return s;
+  }
+
+  if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) {
+    return RollbackToSavePointInternal();
+  }
+
+  return Status::NotFound();
+}
+
+Status WriteUnpreparedTxn::RollbackToSavePointInternal() {
+  Status s;
+
+  const bool kClear = true;
+  TransactionBaseImpl::InitWriteBatch(kClear);
+
+  assert(flushed_save_points_->size() > 0);
+  WriteUnpreparedTxn::SavePoint& top = flushed_save_points_->back();
+
+  assert(save_points_ != nullptr && save_points_->size() > 0);
+  const LockTracker& tracked_keys = *save_points_->top().new_locks_;
+
+  ReadOptions roptions;
+  roptions.snapshot = top.snapshot_->snapshot();
+  SequenceNumber min_uncommitted =
+      static_cast_with_check<const SnapshotImpl>(roptions.snapshot)
+          ->min_uncommitted_;
+  SequenceNumber snap_seq = roptions.snapshot->GetSequenceNumber();
+  WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted,
+                                          top.unprep_seqs_,
+                                          kBackedByDBSnapshot);
+  s = WriteRollbackKeys(tracked_keys, &write_batch_, &callback, roptions);
+  if (!s.ok()) {
+    return s;
+  }
+
+  const bool kPrepared = true;
+  s = FlushWriteBatchToDBInternal(!kPrepared);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // PessimisticTransaction::RollbackToSavePoint will also call
+  // RollbackToSavepoint on write_batch_. However, write_batch_ is empty and
+  // has no savepoints because this savepoint has already been flushed. Work
+  // around this by setting a fake savepoint.
+  write_batch_.SetSavePoint();
+  s = PessimisticTransaction::RollbackToSavePoint();
+  assert(s.ok());
+  if (!s.ok()) {
+    return s;
+  }
+
+  flushed_save_points_->pop_back();
+  return s;
+}
+
+Status WriteUnpreparedTxn::PopSavePoint() {
+  assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) +
+             (flushed_save_points_ ? flushed_save_points_->size() : 0) ==
+         (save_points_ ? save_points_->size() : 0));
+  if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) {
+    Status s = PessimisticTransaction::PopSavePoint();
+    assert(!s.IsNotFound());
+    unflushed_save_points_->pop_back();
+    return s;
+  }
+
+  if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) {
+    // PessimisticTransaction::PopSavePoint will also call PopSavePoint on
+    // write_batch_. However, write_batch_ is empty and has no savepoints
+    // because this savepoint has already been flushed. Work around this by
+    // setting a fake savepoint.
+    write_batch_.SetSavePoint();
+    Status s = PessimisticTransaction::PopSavePoint();
+    assert(!s.IsNotFound());
+    flushed_save_points_->pop_back();
+    return s;
+  }
+
+  return Status::NotFound();
+}
+
+void WriteUnpreparedTxn::MultiGet(const ReadOptions& options,
+                                  ColumnFamilyHandle* column_family,
+                                  const size_t num_keys, const Slice* keys,
+                                  PinnableSlice* values, Status* statuses,
+                                  const bool sorted_input) {
+  assert(options.io_activity == Env::IOActivity::kUnknown);
+  SequenceNumber min_uncommitted, snap_seq;
+  const SnapshotBackup backed_by_snapshot =
+      wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+  WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted,
+                                          unprep_seqs_, backed_by_snapshot);
+  write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys,
+                                      keys, values, statuses, sorted_input,
+                                      &callback);
+  if (UNLIKELY(!callback.valid() ||
+               !wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) {
+    wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+    for (size_t i = 0; i < num_keys; i++) {
+      statuses[i] = Status::TryAgain();
+    }
+  }
+}
+
+Status WriteUnpreparedTxn::Get(const ReadOptions& options,
+                               ColumnFamilyHandle* column_family,
+                               const Slice& key, PinnableSlice* value) {
+  if (options.io_activity != Env::IOActivity::kUnknown) {
+    return Status::InvalidArgument(
+        "Cannot call Get with `ReadOptions::io_activity` != "
+        "`Env::IOActivity::kUnknown`");
+  }
+  SequenceNumber min_uncommitted, snap_seq;
+  const SnapshotBackup backed_by_snapshot =
+      wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+  WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted,
+                                          unprep_seqs_, backed_by_snapshot);
+  auto res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key,
+                                            value, &callback);
+  if (LIKELY(callback.valid() &&
+             wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) {
+    return res;
+  } else {
+    res.PermitUncheckedError();
+    wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+    return Status::TryAgain();
+  }
+}
+
+namespace {
+static void CleanupWriteUnpreparedWBWIIterator(void* arg1, void* arg2) {
+  auto txn = reinterpret_cast<WriteUnpreparedTxn*>(arg1);
+  auto iter = reinterpret_cast<Iterator*>(arg2);
+  txn->RemoveActiveIterator(iter);
+}
+}  // anonymous namespace
+
+Iterator* WriteUnpreparedTxn::GetIterator(const ReadOptions& options) {
+  return GetIterator(options, wupt_db_->DefaultColumnFamily());
+}
+
+Iterator* WriteUnpreparedTxn::GetIterator(const ReadOptions& options,
+                                          ColumnFamilyHandle* column_family) {
+  // Make sure to get the iterator from WriteUnpreparedTxnDB, not the root db.
+  Iterator* db_iter = wupt_db_->NewIterator(options, column_family, this);
+  assert(db_iter);
+
+  auto iter = write_batch_.NewIteratorWithBase(column_family, db_iter);
+  active_iterators_.push_back(iter);
+  iter->RegisterCleanup(CleanupWriteUnpreparedWBWIIterator, this, iter);
+  return iter;
+}
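
[Editorial note] These overrides are what give a live transaction read-your-own-writes semantics even after parts of its batch were flushed. A hedged sketch of the client-visible behavior, assuming the key was not previously committed; placeholder key names.

```cpp
// Sketch: reads inside the transaction observe its own uncommitted writes,
// while readers of the base DB do not see them until commit.
#include <cassert>
#include <string>
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"

void ReadYourOwnWrites(rocksdb::TransactionDB* db, rocksdb::Transaction* txn) {
  std::string value;
  assert(txn->Put("k", "v").ok());
  // Goes through WriteUnpreparedTxn::Get and its read callback, so the
  // uncommitted "v" is visible to this transaction...
  assert(txn->Get(rocksdb::ReadOptions(), "k", &value).ok() && value == "v");
  // ...but not to an independent reader of the DB (assuming "k" was absent).
  assert(db->Get(rocksdb::ReadOptions(), "k", &value).IsNotFound());
}
```
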
+Status WriteUnpreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family,
+                                            const Slice& key,
+                                            SequenceNumber* tracked_at_seq) {
+  // TODO(lth): Reduce duplicate code with WritePrepared ValidateSnapshot
+  // logic.
+  assert(snapshot_);
+
+  SequenceNumber min_uncommitted =
+      static_cast_with_check<const SnapshotImpl>(snapshot_.get())
+          ->min_uncommitted_;
+  SequenceNumber snap_seq = snapshot_->GetSequenceNumber();
+  // tracked_at_seq is either max or the last snapshot with which this key
+  // was tracked, so there is no need to apply IsInSnapshot to this comparison
+  // here, as tracked_at_seq is not a prepare seq.
+  if (*tracked_at_seq <= snap_seq) {
+    // If the key has been previously validated at a sequence number earlier
+    // than the current snapshot's sequence number, we already know it has not
+    // been modified.
+    return Status::OK();
+  }
+
+  *tracked_at_seq = snap_seq;
+
+  ColumnFamilyHandle* cfh =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+
+  WriteUnpreparedTxnReadCallback snap_checker(
+      wupt_db_, snap_seq, min_uncommitted, unprep_seqs_, kBackedByDBSnapshot);
+  // TODO(yanqin): Support user-defined timestamp.
+  return TransactionUtil::CheckKeyForConflicts(
+      db_impl_, cfh, key.ToString(), snap_seq, /*ts=*/nullptr,
+      false /* cache_only */, &snap_checker, min_uncommitted);
+}
+
+const std::map<SequenceNumber, size_t>&
+WriteUnpreparedTxn::GetUnpreparedSequenceNumbers() {
+  return unprep_seqs_;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn.h
new file mode 100644
index 0000000000..63c65f00ae
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn.h
@@ -0,0 +1,339 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <set>
+
+#include "utilities/transactions/write_prepared_txn.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteUnpreparedTxnDB;
+class WriteUnpreparedTxn;
+
+// WriteUnprepared transactions need to be able to read their own uncommitted
+// writes, and supporting this requires some careful consideration. Because
+// writes in the current transaction may be flushed to DB already, we cannot
+// rely on the contents of WriteBatchWithIndex to determine whether a key
+// should be visible or not, so we have to remember to check the DB for any
+// uncommitted keys that should be visible to us. First, we will need to
+// change the seek-to-snapshot logic, to seek to
+// max_visible_seq = max(snap_seq, max_unprep_seq). Any key greater than
+// max_visible_seq should not be visible, because it cannot be unprepared by
+// the current transaction and it is not in its snapshot.
+//
+// When we seek to max_visible_seq, one of these cases will happen:
+// 1. We hit an unprepared key from the current transaction.
+// 2. We hit an unprepared key from another transaction.
+// 3. We hit a committed key with snap_seq < seq < max_unprep_seq.
+// 4. We hit a committed key with seq <= snap_seq.
+//
+// IsVisibleFullCheck handles all cases correctly.
+//
+// Other notes:
+// Note that max_visible_seq is only calculated once at iterator construction
+// time, meaning if the same transaction is adding more unprep seqs through
+// writes during iteration, these newer writes may not be visible. This is not
+// a problem for MySQL though, because it avoids modifying the index as it is
+// scanning through it, to avoid the Halloween Problem. Instead, it scans the
+// index once up front, and modifies based on a temporary copy.
+//
+// In DBIter, there is a "reseek" optimization if the iterator skips over too
+// many keys. However, this assumes that the reseek seeks exactly to the
+// required key. In write unprepared, even after seeking directly to
+// max_visible_seq, some iteration may be required before hitting a visible
+// key, and special precautions must be taken to avoid performing another
+// reseek, leading to an infinite loop.
+//
+class WriteUnpreparedTxnReadCallback : public ReadCallback {
+ public:
+  WriteUnpreparedTxnReadCallback(
+      WritePreparedTxnDB* db, SequenceNumber snapshot,
+      SequenceNumber min_uncommitted,
+      const std::map<SequenceNumber, size_t>& unprep_seqs,
+      SnapshotBackup backed_by_snapshot)
+      // Pass our last uncommitted seq as the snapshot to the parent class to
+      // ensure that the parent will not prematurely filter out own writes. We
+      // will do the exact comparison against snapshots in the
+      // IsVisibleFullCheck override.
+      : ReadCallback(CalcMaxVisibleSeq(unprep_seqs, snapshot), min_uncommitted),
+        db_(db),
+        unprep_seqs_(unprep_seqs),
+        wup_snapshot_(snapshot),
+        backed_by_snapshot_(backed_by_snapshot) {
+    (void)backed_by_snapshot_;  // to silence unused private field warning
+  }
+
+  virtual ~WriteUnpreparedTxnReadCallback() {
+    // If it is not backed by snapshot, the caller must check validity
+    assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot);
+  }
+
+  virtual bool IsVisibleFullCheck(SequenceNumber seq) override;
+
+  inline bool valid() {
+    valid_checked_ = true;
+    return snap_released_ == false;
+  }
+
+  void Refresh(SequenceNumber seq) override {
+    max_visible_seq_ = std::max(max_visible_seq_, seq);
+    wup_snapshot_ = seq;
+  }
+
+  static SequenceNumber CalcMaxVisibleSeq(
+      const std::map<SequenceNumber, size_t>& unprep_seqs,
+      SequenceNumber snapshot_seq) {
+    SequenceNumber max_unprepared = 0;
+    if (unprep_seqs.size()) {
+      max_unprepared =
+          unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1;
+    }
+    return std::max(max_unprepared, snapshot_seq);
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  const std::map<SequenceNumber, size_t>& unprep_seqs_;
+  SequenceNumber wup_snapshot_;
+  // Whether max_visible_seq_ is backed by a snapshot
+  const SnapshotBackup backed_by_snapshot_;
+  bool snap_released_ = false;
+  // Safety check to ensure that the caller has checked invalid statuses
+  bool valid_checked_ = false;
+};
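
[Editorial note] A small worked example of the visibility bound computed by CalcMaxVisibleSeq, with illustrative numbers; this is not vendored code.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>

int main() {
  // unprep_seq => batch count, as in unprep_seqs_.
  std::map<uint64_t, size_t> unprep_seqs{{10, 3}, {20, 2}};
  uint64_t snapshot_seq = 15;
  // Largest sequence number covered by any unprepared batch: 20 + 2 - 1.
  uint64_t max_unprepared =
      unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1;
  assert(max_unprepared == 21);
  // The iterator seeks at max(snapshot, max unprepared) = 21; anything above
  // can be neither ours nor in our snapshot, and IsVisibleFullCheck
  // disambiguates cases 1-4 listed in the class comment.
  assert(std::max(max_unprepared, snapshot_seq) == 21);
}
```
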
+
+class WriteUnpreparedTxn : public WritePreparedTxn {
+ public:
+  WriteUnpreparedTxn(WriteUnpreparedTxnDB* db,
+                     const WriteOptions& write_options,
+                     const TransactionOptions& txn_options);
+
+  virtual ~WriteUnpreparedTxn();
+
+  using TransactionBaseImpl::Put;
+  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value,
+                     const bool assume_tracked = false) override;
+  virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+                     const SliceParts& value,
+                     const bool assume_tracked = false) override;
+
+  using TransactionBaseImpl::Merge;
+  virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value,
+                       const bool assume_tracked = false) override;
+
+  using TransactionBaseImpl::Delete;
+  virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+                        const bool assume_tracked = false) override;
+  virtual Status Delete(ColumnFamilyHandle* column_family,
+                        const SliceParts& key,
+                        const bool assume_tracked = false) override;
+
+  using TransactionBaseImpl::SingleDelete;
+  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+                              const Slice& key,
+                              const bool assume_tracked = false) override;
+  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+                              const SliceParts& key,
+                              const bool assume_tracked = false) override;
+
+  // In WriteUnprepared, untracked writes will break snapshot validation
+  // logic. Snapshot validation will only check the largest sequence number of
+  // a key to see if it was committed or not. However, an untracked unprepared
+  // write will hide smaller committed sequence numbers.
+  //
+  // TODO(lth): Investigate whether it is worth having snapshot validation
+  // validate all values larger than snap_seq. Otherwise, we should return
+  // Status::NotSupported for untracked writes.
+
+  virtual Status RebuildFromWriteBatch(WriteBatch*) override;
+
+  virtual uint64_t GetLastLogNumber() const override {
+    return last_log_number_;
+  }
+
+  void RemoveActiveIterator(Iterator* iter) {
+    active_iterators_.erase(
+        std::remove(active_iterators_.begin(), active_iterators_.end(), iter),
+        active_iterators_.end());
+  }
+
+ protected:
+  void Initialize(const TransactionOptions& txn_options) override;
+
+  Status PrepareInternal() override;
+
+  Status CommitWithoutPrepareInternal() override;
+  Status CommitInternal() override;
+
+  Status RollbackInternal() override;
+
+  void Clear() override;
+
+  void SetSavePoint() override;
+  Status RollbackToSavePoint() override;
+  Status PopSavePoint() override;
+
+  // Get and GetIterator need to be overridden so that a ReadCallback to
+  // handle read-your-own-write is used.
+  using Transaction::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+
+  using Transaction::MultiGet;
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool sorted_input = false) override;
+
+  using Transaction::GetIterator;
+  virtual Iterator* GetIterator(const ReadOptions& options) override;
+  virtual Iterator* GetIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) override;
+
+  virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family,
+                                  const Slice& key,
+                                  SequenceNumber* tracked_at_seq) override;
+
+ private:
+  friend class WriteUnpreparedTransactionTest_ReadYourOwnWrite_Test;
+  friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+  friend class WriteUnpreparedTransactionTest_UnpreparedBatch_Test;
+  friend class WriteUnpreparedTxnDB;
+
+  const std::map<SequenceNumber, size_t>& GetUnpreparedSequenceNumbers();
+  Status WriteRollbackKeys(const LockTracker& tracked_keys,
+                           WriteBatchWithIndex* rollback_batch,
+                           ReadCallback* callback, const ReadOptions& roptions);
+
+  Status MaybeFlushWriteBatchToDB();
+  Status FlushWriteBatchToDB(bool prepared);
+  Status FlushWriteBatchToDBInternal(bool prepared);
+  Status FlushWriteBatchWithSavePointToDB();
+  Status RollbackToSavePointInternal();
+  Status HandleWrite(std::function<Status()> do_write);
+
+  // For write unprepared, we check on every write batch append to see if
+  // write_batch_flush_threshold_ has been exceeded, and then call
+  // FlushWriteBatchToDB if so. This logic is encapsulated in
+  // MaybeFlushWriteBatchToDB.
+  int64_t write_batch_flush_threshold_;
+  WriteUnpreparedTxnDB* wupt_db_;
+
+  // Ordered list of unprep_seq sequence numbers that we have already written
+  // to DB.
+  //
+  // This maps unprep_seq => prepare_batch_cnt for each unprepared batch
+  // written by this transaction.
+  //
+  // Note that this contains both prepared and unprepared batches, since they
+  // are treated similarly in prepare heap/commit map, so it simplifies the
+  // commit callbacks.
+  std::map<SequenceNumber, size_t> unprep_seqs_;
+
+  uint64_t last_log_number_;
+
+  // Recovered transactions have tracked_keys_ populated, but are not actually
+  // locked for efficiency reasons. For recovered transactions, skip unlocking
+  // keys when the transaction ends.
+  bool recovered_txn_;
+
+  // Track the largest sequence number at which we performed snapshot
+  // validation. If snapshot validation was skipped because no snapshot was
+  // set, then this is set to GetLastPublishedSequence. This value is useful
+  // because it means that for keys that have unprepared seqnos, we can
+  // guarantee that no committed keys by other transactions can exist between
+  // largest_validated_seq_ and max_unprep_seq. See
+  // WriteUnpreparedTxnDB::NewIterator for an explanation of why this is
+  // necessary for iterator Prev().
+  //
+  // Currently this value only increases during the lifetime of a transaction,
+  // but in some cases, we should be able to restore the previously largest
+  // value when calling RollbackToSavepoint.
+  SequenceNumber largest_validated_seq_;
+
+  struct SavePoint {
+    // Record of unprep_seqs_ at this savepoint. The set of unprep_seq is
+    // used during RollbackToSavepoint to determine visibility when restoring
+    // old values.
+    //
+    // TODO(lth): Since all unprep_seqs_ sets further down the stack must be
+    // subsets, this can potentially be deduplicated by just storing set
+    // difference. Investigate if this is worth it.
+    std::map<SequenceNumber, size_t> unprep_seqs_;
+
+    // This snapshot will be used to read keys at this savepoint if we call
+    // RollbackToSavePoint.
+    std::unique_ptr<ManagedSnapshot> snapshot_;
+
+    SavePoint(const std::map<SequenceNumber, size_t>& seqs,
+              ManagedSnapshot* snapshot)
+        : unprep_seqs_(seqs), snapshot_(snapshot){};
+  };
+
+  // We have 3 data structures holding savepoint information:
+  // 1. TransactionBaseImpl::save_points_
+  // 2. WriteUnpreparedTxn::flushed_save_points_
+  // 3. WriteUnpreparedTxn::unflushed_save_points_
+  //
+  // TransactionBaseImpl::save_points_ holds information about all write
+  // batches, including the current in-memory write_batch_, or unprepared
+  // batches that have been written out. Its responsibility is just to track
+  // which keys have been modified in every savepoint.
+  //
+  // WriteUnpreparedTxn::flushed_save_points_ holds information about
+  // savepoints set on unprepared batches that have already been flushed. It
+  // holds the snapshot and unprep_seqs at that savepoint, so that the
+  // rollback process can determine which keys were visible at that point in
+  // time.
+  //
+  // WriteUnpreparedTxn::unflushed_save_points_ holds information about
+  // savepoints on the current in-memory write_batch_. It simply records the
+  // size of the write batch at every savepoint.
+  //
+  // TODO(lth): Remove the redundancy between save_point_boundaries_ and
+  // write_batch_.save_points_.
+  //
+  // Based on this information, here are some invariants:
+  // size(unflushed_save_points_) = size(write_batch_.save_points_)
+  // size(flushed_save_points_) + size(unflushed_save_points_)
+  //   = size(save_points_)
+  //
+  std::unique_ptr<autovector<WriteUnpreparedTxn::SavePoint>>
+      flushed_save_points_;
+  std::unique_ptr<autovector<size_t>> unflushed_save_points_;
+
+  // It is currently unsafe to flush a write batch if there are active
+  // iterators created from this transaction. This is because we use
+  // WriteBatchWithIndex to do merging reads from the DB and the write batch.
+  // If we flush the write batch, it is possible that the delta iterator on
+  // the iterator will point to invalid memory.
+  std::vector<Iterator*> active_iterators_;
+
+  // Untracked keys that we have to rollback.
+  //
+  // TODO(lth): Currently we do not record untracked keys per-savepoint. This
+  // means that when rolling back to savepoints, we have to check all keys in
+  // the current transaction for rollback. Note that this is merely
+  // inefficient, not incorrect, because we take a snapshot at every
+  // savepoint, and we will use that snapshot to construct the rollback batch.
+  // The rollback batch will then contain a reissue of the same marker.
+  //
+  // A more optimal solution would be to only check keys changed since the
+  // last savepoint. Also, it may make sense to merge this into tracked_keys_
+  // and differentiate between tracked but not locked keys to avoid having two
+  // very similar data structures.
+  using KeySet = std::unordered_map<uint32_t, std::vector<std::string>>;
+  KeySet untracked_keys_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn_db.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn_db.cc
new file mode 100644
index 0000000000..fd0ba0aedc
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn_db.cc
@@ -0,0 +1,476 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+#include "db/arena_wrapped_db_iter.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Instead of reconstructing a Transaction object, and calling rollback on it,
+// we can be more efficient with RollbackRecoveredTransaction by skipping
+// unnecessary steps (eg. updating CommitMap, reconstructing keyset)
+Status WriteUnpreparedTxnDB::RollbackRecoveredTransaction(
+    const DBImpl::RecoveredTransaction* rtxn) {
+  // TODO(lth): Reduce duplicate code with WritePrepared rollback logic.
+  assert(rtxn->unprepared_);
+  auto cf_map_shared_ptr = WritePreparedTxnDB::GetCFHandleMap();
+  auto cf_comp_map_shared_ptr = WritePreparedTxnDB::GetCFComparatorMap();
+  // In theory we could write with disableWAL = true during recovery, and
+  // assume that if we crash again during recovery, we can just replay from
+  // the very beginning. Unfortunately, the XIDs from the application may not
+  // necessarily be unique across restarts, potentially leading to situations
+  // like this:
+  //
+  // BEGIN_PREPARE(unprepared) Put(a) END_PREPARE(xid = 1)
+  // -- crash and recover with Put(a) rolled back as it was not prepared
+  // BEGIN_PREPARE(prepared) Put(b) END_PREPARE(xid = 1)
+  // COMMIT(xid = 1)
+  // -- crash and recover with both a, b
+  //
+  // We could just write the rollback marker, but then we would have to extend
+  // MemTableInserter during recovery to actually do writes into the DB
+  // instead of just dropping the in-memory write batch.
+  //
+  WriteOptions w_options;
+
+  class InvalidSnapshotReadCallback : public ReadCallback {
+   public:
+    InvalidSnapshotReadCallback(SequenceNumber snapshot)
+        : ReadCallback(snapshot) {}
+
+    inline bool IsVisibleFullCheck(SequenceNumber) override {
+      // The seq provided as the snapshot is the seq right before we locked
+      // and wrote to it, so whatever is there, it is committed.
+      return true;
+    }
+
+    // Ignore the refresh request since we are confident that our snapshot seq
+    // is not going to be affected by concurrent compactions (not enabled yet.)
+    void Refresh(SequenceNumber) override {}
+  };
+
+  // Iterate starting with largest sequence number.
+  for (auto it = rtxn->batches_.rbegin(); it != rtxn->batches_.rend(); ++it) {
+    auto last_visible_txn = it->first - 1;
+    const auto& batch = it->second.batch_;
+    WriteBatch rollback_batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                              w_options.protection_bytes_per_key,
+                              0 /* default_cf_ts_sz */);
+
+    struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
+      DBImpl* db_;
+      ReadOptions roptions;
+      InvalidSnapshotReadCallback callback;
+      WriteBatch* rollback_batch_;
+      std::map<uint32_t, const Comparator*>& comparators_;
+      std::map<uint32_t, ColumnFamilyHandle*>& handles_;
+      using CFKeys = std::set<Slice, SetComparator>;
+      std::map<uint32_t, CFKeys> keys_;
+      bool rollback_merge_operands_;
+      RollbackWriteBatchBuilder(
+          DBImpl* db, SequenceNumber snap_seq, WriteBatch* dst_batch,
+          std::map<uint32_t, const Comparator*>& comparators,
+          std::map<uint32_t, ColumnFamilyHandle*>& handles,
+          bool rollback_merge_operands)
+          : db_(db),
+            callback(snap_seq),
+            // disable min_uncommitted optimization
+            rollback_batch_(dst_batch),
+            comparators_(comparators),
+            handles_(handles),
+            rollback_merge_operands_(rollback_merge_operands) {}
+
+      Status Rollback(uint32_t cf, const Slice& key) {
+        Status s;
+        CFKeys& cf_keys = keys_[cf];
+        if (cf_keys.size() == 0) {  // just inserted
+          auto cmp = comparators_[cf];
+          keys_[cf] = CFKeys(SetComparator(cmp));
+        }
+        auto res = cf_keys.insert(key);
+        if (res.second == false) {  // second is false if an element already
+                                    // existed.
+          return s;
+        }
+
+        PinnableSlice pinnable_val;
+        bool not_used;
+        auto cf_handle = handles_[cf];
+        DBImpl::GetImplOptions get_impl_options;
+        get_impl_options.column_family = cf_handle;
+        get_impl_options.value = &pinnable_val;
+        get_impl_options.value_found = &not_used;
+        get_impl_options.callback = &callback;
+        s = db_->GetImpl(roptions, key, get_impl_options);
+        assert(s.ok() || s.IsNotFound());
+        if (s.ok()) {
+          s = rollback_batch_->Put(cf_handle, key, pinnable_val);
+          assert(s.ok());
+        } else if (s.IsNotFound()) {
+          // There has been no readable value before txn. By adding a delete
+          // we make sure that there will be none afterwards either.
+          s = rollback_batch_->Delete(cf_handle, key);
+          assert(s.ok());
+        } else {
+          // Unexpected status. Return it to the user.
+        }
+        return s;
+      }
+
+      Status PutCF(uint32_t cf, const Slice& key,
+                   const Slice& /*val*/) override {
+        return Rollback(cf, key);
+      }
+
+      Status DeleteCF(uint32_t cf, const Slice& key) override {
+        return Rollback(cf, key);
+      }
+
+      Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+        return Rollback(cf, key);
+      }
+
+      Status MergeCF(uint32_t cf, const Slice& key,
+                     const Slice& /*val*/) override {
+        if (rollback_merge_operands_) {
+          return Rollback(cf, key);
+        } else {
+          return Status::OK();
+        }
+      }
+
+      // Recovered batches do not contain 2PC markers.
+      Status MarkNoop(bool) override { return Status::InvalidArgument(); }
+      Status MarkBeginPrepare(bool) override {
+        return Status::InvalidArgument();
+      }
+      Status MarkEndPrepare(const Slice&) override {
+        return Status::InvalidArgument();
+      }
+      Status MarkCommit(const Slice&) override {
+        return Status::InvalidArgument();
+      }
+      Status MarkRollback(const Slice&) override {
+        return Status::InvalidArgument();
+      }
+    } rollback_handler(db_impl_, last_visible_txn, &rollback_batch,
+                       *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(),
+                       txn_db_options_.rollback_merge_operands);
+
+    auto s = batch->Iterate(&rollback_handler);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // The Rollback marker will be used as a batch separator
+    s = WriteBatchInternal::MarkRollback(&rollback_batch, rtxn->name_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    const uint64_t kNoLogRef = 0;
+    const bool kDisableMemtable = true;
+    const size_t kOneBatch = 1;
+    uint64_t seq_used = kMaxSequenceNumber;
+    s = db_impl_->WriteImpl(w_options, &rollback_batch, nullptr, nullptr,
+                            kNoLogRef, !kDisableMemtable, &seq_used, kOneBatch);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // If two_write_queues, we must manually release the sequence number to
+    // readers.
+    if (db_impl_->immutable_db_options().two_write_queues) {
+      db_impl_->SetLastPublishedSequence(seq_used);
+    }
+  }
+
+  return Status::OK();
+}
+
+Status WriteUnpreparedTxnDB::Initialize(
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles) {
+  // TODO(lth): Reduce code duplication in this function.
+  auto dbimpl = static_cast_with_check<DBImpl>(GetRootDB());
+  assert(dbimpl != nullptr);
+
+  db_impl_->SetSnapshotChecker(new WritePreparedSnapshotChecker(this));
+  // A callback to commit a single sub-batch
+  class CommitSubBatchPreReleaseCallback : public PreReleaseCallback {
+   public:
+    explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db)
+        : db_(db) {}
+    Status Callback(SequenceNumber commit_seq,
+                    bool is_mem_disabled __attribute__((__unused__)), uint64_t,
+                    size_t /*index*/, size_t /*total*/) override {
+      assert(!is_mem_disabled);
+      db_->AddCommitted(commit_seq, commit_seq);
+      return Status::OK();
+    }
+
+   private:
+    WritePreparedTxnDB* db_;
+  };
+  db_impl_->SetRecoverableStatePreReleaseCallback(
+      new CommitSubBatchPreReleaseCallback(this));
+
+  // PessimisticTransactionDB::Initialize
+  for (auto cf_ptr : handles) {
+    AddColumnFamily(cf_ptr);
+  }
+  // Verify cf options
+  for (auto handle : handles) {
+    ColumnFamilyDescriptor cfd;
+    Status s = handle->GetDescriptor(&cfd);
+    if (!s.ok()) {
+      return s;
+    }
+    s = VerifyCFOptions(cfd.options);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Re-enable compaction for the column families that initially had
+  // compaction enabled.
+  std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
+  compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
+  for (auto index : compaction_enabled_cf_indices) {
+    compaction_enabled_cf_handles.push_back(handles[index]);
+  }
+
+  // create 'real' transactions from recovered shell transactions
+  auto rtxns = dbimpl->recovered_transactions();
+  std::map<SequenceNumber, size_t> ordered_seq_cnt;
+  for (auto rtxn : rtxns) {
+    auto recovered_trx = rtxn.second;
+    assert(recovered_trx);
+    assert(recovered_trx->batches_.size() >= 1);
+    assert(recovered_trx->name_.length());
+
+    // We can only rollback transactions after AdvanceMaxEvictedSeq is called,
+    // but AddPrepared must occur before AdvanceMaxEvictedSeq, which is why
+    // two iterations are required.
+    if (recovered_trx->unprepared_) {
+      continue;
+    }
+
+    WriteOptions w_options;
+    w_options.sync = true;
+    TransactionOptions t_options;
+
+    auto first_log_number =
+        recovered_trx->batches_.begin()->second.log_number_;
+    auto first_seq = recovered_trx->batches_.begin()->first;
+    auto last_prepare_batch_cnt =
+        recovered_trx->batches_.begin()->second.batch_cnt_;
+
+    Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr);
+    assert(real_trx);
+    auto wupt = static_cast_with_check<WriteUnpreparedTxn>(real_trx);
+    wupt->recovered_txn_ = true;
+
+    real_trx->SetLogNumber(first_log_number);
+    real_trx->SetId(first_seq);
+    Status s = real_trx->SetName(recovered_trx->name_);
+    if (!s.ok()) {
+      return s;
+    }
+    wupt->prepare_batch_cnt_ = last_prepare_batch_cnt;
+
+    for (auto batch : recovered_trx->batches_) {
+      const auto& seq = batch.first;
+      const auto& batch_info = batch.second;
+      auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
+      assert(batch_info.log_number_);
+
+      ordered_seq_cnt[seq] = cnt;
+      assert(wupt->unprep_seqs_.count(seq) == 0);
+      wupt->unprep_seqs_[seq] = cnt;
+
+      s = wupt->RebuildFromWriteBatch(batch_info.batch_);
+      assert(s.ok());
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    const bool kClear = true;
+    wupt->InitWriteBatch(kClear);
+
+    real_trx->SetState(Transaction::PREPARED);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  // AddPrepared must be called in order
+  for (auto seq_cnt : ordered_seq_cnt) {
+    auto seq = seq_cnt.first;
+    auto cnt = seq_cnt.second;
+    for (size_t i = 0; i < cnt; i++) {
+      AddPrepared(seq + i);
+    }
+  }
+
+  SequenceNumber prev_max = max_evicted_seq_;
+  SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber();
+  AdvanceMaxEvictedSeq(prev_max, last_seq);
+  // Create a gap between max and the next snapshot. This simplifies the
+  // logic in IsInSnapshot by not having to consider the special case of
+  // max == snapshot after recovery. This is tested in
+  // IsInSnapshotEmptyMapTest.
+  if (last_seq) {
+    db_impl_->versions_->SetLastAllocatedSequence(last_seq + 1);
+    db_impl_->versions_->SetLastSequence(last_seq + 1);
+    db_impl_->versions_->SetLastPublishedSequence(last_seq + 1);
+  }
+
+  Status s;
+  // Rollback unprepared transactions.
+  for (auto rtxn : rtxns) {
+    auto recovered_trx = rtxn.second;
+    if (recovered_trx->unprepared_) {
+      s = RollbackRecoveredTransaction(recovered_trx);
+      if (!s.ok()) {
+        return s;
+      }
+      continue;
+    }
+  }
+
+  if (s.ok()) {
+    dbimpl->DeleteAllRecoveredTransactions();
+
+    // Compaction should start only after max_evicted_seq_ is set AND
+    // recovered transactions are either added to PrepareHeap or rolled back.
+    s = EnableAutoCompaction(compaction_enabled_cf_handles);
+  }
+
+  return s;
+}
+
+Transaction* WriteUnpreparedTxnDB::BeginTransaction(
+    const WriteOptions& write_options, const TransactionOptions& txn_options,
+    Transaction* old_txn) {
+  if (old_txn != nullptr) {
+    ReinitializeTransaction(old_txn, write_options, txn_options);
+    return old_txn;
+  } else {
+    return new WriteUnpreparedTxn(this, write_options, txn_options);
+  }
+}
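
[Editorial note] BeginTransaction either recycles a caller-supplied transaction object or allocates a fresh WriteUnpreparedTxn. A hedged sketch of the reuse pattern from the public API; keys are placeholders.

```cpp
// Sketch: passing the old handle back into BeginTransaction reinitializes it
// in place, avoiding an allocation per transaction.
#include <cassert>
#include <string>
#include "rocksdb/utilities/transaction_db.h"

void ReuseTransactions(rocksdb::TransactionDB* db) {
  rocksdb::Transaction* txn = nullptr;
  for (int i = 0; i < 3; i++) {
    txn = db->BeginTransaction(rocksdb::WriteOptions(),
                               rocksdb::TransactionOptions(), txn);
    assert(txn->Put("k" + std::to_string(i), "v").ok());
    assert(txn->Commit().ok());
  }
  delete txn;
}
```
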
+
+// Struct to hold ownership of snapshot and read callback for iterator
+// cleanup.
+struct WriteUnpreparedTxnDB::IteratorState {
+  IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence,
+                std::shared_ptr<ManagedSnapshot> s,
+                SequenceNumber min_uncommitted, WriteUnpreparedTxn* txn)
+      : callback(txn_db, sequence, min_uncommitted, txn->unprep_seqs_,
+                 kBackedByDBSnapshot),
+        snapshot(s) {}
+  SequenceNumber MaxVisibleSeq() { return callback.max_visible_seq(); }
+
+  WriteUnpreparedTxnReadCallback callback;
+  std::shared_ptr<ManagedSnapshot> snapshot;
+};
+
+namespace {
+static void CleanupWriteUnpreparedTxnDBIterator(void* arg1, void* /*arg2*/) {
+  delete reinterpret_cast<WriteUnpreparedTxnDB::IteratorState*>(arg1);
+}
+}  // anonymous namespace
+
+Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options,
+                                            ColumnFamilyHandle* column_family,
+                                            WriteUnpreparedTxn* txn) {
+  if (options.io_activity != Env::IOActivity::kUnknown) {
+    return NewErrorIterator(Status::InvalidArgument(
+        "Cannot call NewIterator with `ReadOptions::io_activity` != "
+        "`Env::IOActivity::kUnknown`"));
+  }
+  // TODO(lth): Refactor so that this logic is shared with WritePrepared.
+  constexpr bool expose_blob_index = false;
+  constexpr bool allow_refresh = false;
+  std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr;
+  SequenceNumber snapshot_seq = kMaxSequenceNumber;
+  SequenceNumber min_uncommitted = 0;
+
+  // Currently, the Prev() iterator logic does not work well without snapshot
+  // validation. The logic simply iterates through values of a key in
+  // ascending seqno order, stopping at the first non-visible value and
+  // returning the last visible value.
+  //
+  // For example, if the snapshot sequence is 3, and we have the following
+  // keys:
+  // foo: v1 1
+  // foo: v2 2
+  // foo: v3 3
+  // foo: v4 4
+  // foo: v5 5
+  //
+  // Then 1, 2, 3 will be visible, but 4 will be non-visible, so we return v3,
+  // which is the last visible value.
+  //
+  // For unprepared transactions, if we have snap_seq = 3, but the current
+  // transaction has unprep_seq 5, then returning the first non-visible value
+  // would be incorrect, as we should return v5, and not v3. The problem is
+  // that there are committed values at snapshot_seq < commit_seq < unprep_seq.
+  //
+  // Snapshot validation can prevent this problem by ensuring that no
+  // committed values exist at snapshot_seq < commit_seq, and thus any value
+  // with a sequence number greater than snapshot_seq must be an unprepared
+  // value. For example, if the transaction had a snapshot at 3, then
+  // snapshot validation would be performed during the Put(v5) call. It would
+  // find v4, and the Put would fail with a snapshot validation failure.
+  //
+  // TODO(lth): Improve Prev() logic to continue iterating until
+  // max_visible_seq, and then return the last visible value, so that this
+  // restriction can be lifted.
+  const Snapshot* snapshot = nullptr;
+  if (options.snapshot == nullptr) {
+    snapshot = GetSnapshot();
+    own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot);
+  } else {
+    snapshot = options.snapshot;
+  }
+
+  snapshot_seq = snapshot->GetSequenceNumber();
+  assert(snapshot_seq != kMaxSequenceNumber);
+  // Iteration is safe as long as largest_validated_seq <= snapshot_seq. We
+  // are guaranteed that for keys that were modified by this transaction (and
+  // thus might have unprepared values), no committed values exist at
+  // largest_validated_seq < commit_seq (or the contrapositive: any committed
+  // value must exist at commit_seq <= largest_validated_seq). This implies
+  // that commit_seq <= largest_validated_seq <= snapshot_seq or commit_seq <=
+  // snapshot_seq. As explained above, the problem with Prev() only happens
+  // when snapshot_seq < commit_seq.
+  //
+  // For keys that were not modified by this transaction,
+  // largest_validated_seq_ is meaningless, and Prev() should just work with
+  // the existing visibility logic.
+  if (txn->largest_validated_seq_ > snapshot->GetSequenceNumber() &&
+      !txn->unprep_seqs_.empty()) {
+    ROCKS_LOG_ERROR(info_log_,
+                    "WriteUnprepared iterator creation failed since the "
+                    "transaction has performed unvalidated writes");
+    return nullptr;
+  }
+  min_uncommitted =
+      static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  auto* state =
+      new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted, txn);
+  auto* db_iter = db_impl_->NewIteratorImpl(
+      options, cfd, state->MaxVisibleSeq(), &state->callback,
+      expose_blob_index, allow_refresh);
+  db_iter->RegisterCleanup(CleanupWriteUnpreparedTxnDBIterator, state, nullptr);
+  return db_iter;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn_db.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn_db.h
new file mode 100644
index 0000000000..a7b10f1533
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/transactions/write_unprepared_txn_db.h
@@ -0,0 +1,106 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "utilities/transactions/write_prepared_txn_db.h"
+#include "utilities/transactions/write_unprepared_txn.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteUnpreparedTxn;
+
+class WriteUnpreparedTxnDB : public WritePreparedTxnDB {
+ public:
+  using WritePreparedTxnDB::WritePreparedTxnDB;
+
+  Status Initialize(const std::vector<size_t>& compaction_enabled_cf_indices,
+                    const std::vector<ColumnFamilyHandle*>& handles) override;
+
+  Transaction* BeginTransaction(const WriteOptions& write_options,
+                                const TransactionOptions& txn_options,
+                                Transaction* old_txn) override;
+
+  // Struct to hold ownership of snapshot and read callback for cleanup.
+  struct IteratorState;
+
+  using WritePreparedTxnDB::NewIterator;
+  Iterator* NewIterator(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        WriteUnpreparedTxn* txn);
+
+ private:
+  Status RollbackRecoveredTransaction(
+      const DBImpl::RecoveredTransaction* rtxn);
+};
+
+class WriteUnpreparedCommitEntryPreReleaseCallback : public PreReleaseCallback {
+  // TODO(lth): Reduce code duplication with
+  // WritePreparedCommitEntryPreReleaseCallback
+ public:
+  // includes_data indicates that the commit also writes a non-empty
+  // CommitTimeWriteBatch to memtable, which needs to be committed separately.
+  WriteUnpreparedCommitEntryPreReleaseCallback(
+      WritePreparedTxnDB* db, DBImpl* db_impl,
+      const std::map<SequenceNumber, size_t>& unprep_seqs,
+      size_t data_batch_cnt = 0, bool publish_seq = true)
+      : db_(db),
+        db_impl_(db_impl),
+        unprep_seqs_(unprep_seqs),
+        data_batch_cnt_(data_batch_cnt),
+        includes_data_(data_batch_cnt_ > 0),
+        publish_seq_(publish_seq) {
+    assert(unprep_seqs.size() > 0);
+  }
+
+  virtual Status Callback(SequenceNumber commit_seq,
+                          bool is_mem_disabled __attribute__((__unused__)),
+                          uint64_t, size_t /*index*/,
+                          size_t /*total*/) override {
+    const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1)
+                                         ? commit_seq
+                                         : commit_seq + data_batch_cnt_ - 1;
+    // Recall that unprep_seqs maps (un)prepared_seq => prepare_batch_cnt.
+    for (const auto& s : unprep_seqs_) {
+      for (size_t i = 0; i < s.second; i++) {
+        db_->AddCommitted(s.first + i, last_commit_seq);
+      }
+    }
+
+    if (includes_data_) {
+      assert(data_batch_cnt_);
+      // Commit the data that is accompanied with the commit request
+      for (size_t i = 0; i < data_batch_cnt_; i++) {
+        // For the commit seq of each batch use the commit seq of the last
+        // batch. This makes debugging easier by having all the batches carry
+        // the same sequence number.
+        db_->AddCommitted(commit_seq + i, last_commit_seq);
+      }
+    }
+    if (db_impl_->immutable_db_options().two_write_queues && publish_seq_) {
+      assert(is_mem_disabled);  // implies the 2nd queue
+      // Publish the sequence number. We can do that here assuming the
+      // callback is invoked only from one write queue, which would guarantee
+      // that the published sequence numbers are in order, i.e., once a seq is
+      // published all the seqs prior to that are also publishable.
+      db_impl_->SetLastPublishedSequence(last_commit_seq);
+    }
+    // otherwise the sequence number that is updated as part of the write
+    // already does the publishing
+    return Status::OK();
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  DBImpl* db_impl_;
+  const std::map<SequenceNumber, size_t>& unprep_seqs_;
+  size_t data_batch_cnt_;
+  // Either because it is a commit without prepare or it has a
+  // CommitTimeWriteBatch
+  bool includes_data_;
+  // Whether the callback should also publish the commit seq number
+  bool publish_seq_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
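
[Editorial note] A small worked example of the fan-out performed by Callback above, with illustrative numbers; this mirrors the AddCommitted loop but is not vendored code.

```cpp
// Suppose a transaction flushed two unprepared batches and the commit marker
// write got sequence 30 with no commit-time data (data_batch_cnt_ == 0, so
// last_commit_seq == 30). Every sub-batch sequence number then maps to the
// same commit sequence in the commit cache.
#include <cstdio>
#include <map>

int main() {
  std::map<unsigned long long, size_t> unprep_seqs{{10, 2}, {20, 1}};
  unsigned long long last_commit_seq = 30;
  for (const auto& s : unprep_seqs) {
    for (size_t i = 0; i < s.second; i++) {
      // Mirrors db_->AddCommitted(s.first + i, last_commit_seq):
      std::printf("AddCommitted(%llu, %llu)\n", s.first + i, last_commit_seq);
    }
  }
  // Prints: (10,30), (11,30), (20,30)
}
```
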
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/ttl/db_ttl_impl.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/ttl/db_ttl_impl.cc
new file mode 100644
index 0000000000..2b261ec6f2
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/ttl/db_ttl_impl.cc
@@ -0,0 +1,617 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "utilities/ttl/db_ttl_impl.h"
+
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+static std::unordered_map<std::string, OptionTypeInfo>
+    ttl_merge_op_type_info = {
+        {"user_operator",
+         OptionTypeInfo::AsCustomSharedPtr<MergeOperator>(
+             0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}};
+
+TtlMergeOperator::TtlMergeOperator(
+    const std::shared_ptr<MergeOperator>& merge_op, SystemClock* clock)
+    : user_merge_op_(merge_op), clock_(clock) {
+  RegisterOptions("TtlMergeOptions", &user_merge_op_, &ttl_merge_op_type_info);
+}
+
+bool TtlMergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+                                   MergeOperationOutput* merge_out) const {
+  const uint32_t ts_len = DBWithTTLImpl::kTSLength;
+  if (merge_in.existing_value && merge_in.existing_value->size() < ts_len) {
+    ROCKS_LOG_ERROR(merge_in.logger,
+                    "Error: Could not remove timestamp from existing value.");
+    return false;
+  }
+
+  // Extract the time-stamp from each operand to be passed to user_merge_op_
+  std::vector<Slice> operands_without_ts;
+  for (const auto& operand : merge_in.operand_list) {
+    if (operand.size() < ts_len) {
+      ROCKS_LOG_ERROR(merge_in.logger,
+                      "Error: Could not remove timestamp from operand value.");
+      return false;
+    }
+    operands_without_ts.push_back(operand);
+    operands_without_ts.back().remove_suffix(ts_len);
+  }
+
+  // Apply the user merge operator (store result in *new_value)
+  bool good = true;
+  MergeOperationOutput user_merge_out(merge_out->new_value,
+                                      merge_out->existing_operand);
+  if (merge_in.existing_value) {
+    Slice existing_value_without_ts(merge_in.existing_value->data(),
+                                    merge_in.existing_value->size() - ts_len);
+    good = user_merge_op_->FullMergeV2(
+        MergeOperationInput(merge_in.key, &existing_value_without_ts,
+                            operands_without_ts, merge_in.logger),
+        &user_merge_out);
+  } else {
+    good = user_merge_op_->FullMergeV2(
+        MergeOperationInput(merge_in.key, nullptr, operands_without_ts,
+                            merge_in.logger),
+        &user_merge_out);
+  }
+  merge_out->op_failure_scope = user_merge_out.op_failure_scope;
+
+  // Return false if the user merge operator returned false
+  if (!good) {
+    return false;
+  }
+
+  if (merge_out->existing_operand.data()) {
+    merge_out->new_value.assign(merge_out->existing_operand.data(),
+                                merge_out->existing_operand.size());
+    merge_out->existing_operand = Slice(nullptr, 0);
+  }
+
+  // Augment the *new_value with the ttl time-stamp
+  int64_t curtime;
+  if (!clock_->GetCurrentTime(&curtime).ok()) {
+    ROCKS_LOG_ERROR(
+        merge_in.logger,
+        "Error: Could not get current time to be attached internally "
+        "to the new value.");
+    return false;
+  } else {
+    char ts_string[ts_len];
+    EncodeFixed32(ts_string, (int32_t)curtime);
+    merge_out->new_value.append(ts_string, ts_len);
+    return true;
+  }
+}
+
+bool TtlMergeOperator::PartialMergeMulti(const Slice& key,
+                                         const std::deque<Slice>& operand_list,
+                                         std::string* new_value,
+                                         Logger* logger) const {
+  const uint32_t ts_len = DBWithTTLImpl::kTSLength;
+  std::deque<Slice> operands_without_ts;
+
+  for (const auto& operand : operand_list) {
+    if (operand.size() < ts_len) {
+      ROCKS_LOG_ERROR(logger, "Error: Could not remove timestamp from value.");
+      return false;
+    }
+
+    operands_without_ts.push_back(
+        Slice(operand.data(), operand.size() - ts_len));
+  }
+
+  // Apply the user partial-merge operator (store result in *new_value)
+  assert(new_value);
+  if (!user_merge_op_->PartialMergeMulti(key, operands_without_ts, new_value,
+                                         logger)) {
+    return false;
+  }
+
+  // Augment the *new_value with the ttl time-stamp
+  int64_t curtime;
+  if (!clock_->GetCurrentTime(&curtime).ok()) {
+    ROCKS_LOG_ERROR(
+        logger,
+        "Error: Could not get current time to be attached internally "
+        "to the new value.");
+    return false;
+  } else {
+    char ts_string[ts_len];
+    EncodeFixed32(ts_string, (int32_t)curtime);
+    new_value->append(ts_string, ts_len);
+    return true;
+  }
+}
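
[Editorial note] The TTL layer's on-disk convention, visible in both merge paths above, is simply `value || 4-byte unix timestamp`. A self-contained sketch of the encode/decode arithmetic and the staleness test; it uses plain `memcpy` packing, which matches RocksDB's EncodeFixed32 on little-endian builds (an assumption), and hypothetical helper names.

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

constexpr size_t kTsLen = sizeof(int32_t);  // mirrors DBWithTTLImpl::kTSLength

std::string AppendTs(const std::string& value, int32_t now) {
  char ts[kTsLen];
  std::memcpy(ts, &now, kTsLen);  // fixed32, little-endian assumed
  return value + std::string(ts, kTsLen);
}

int32_t ExtractTs(const std::string& value_with_ts) {
  int32_t ts;
  std::memcpy(&ts, value_with_ts.data() + value_with_ts.size() - kTsLen,
              kTsLen);
  return ts;
}

// Same test as DBWithTTLImpl::IsStale: written-at + ttl < now.
bool IsStale(const std::string& value_with_ts, int32_t ttl, int64_t now) {
  if (ttl <= 0) return false;  // non-positive TTL means "never expires"
  return static_cast<int64_t>(ExtractTs(value_with_ts)) + ttl < now;
}

int main() {
  std::string v = AppendTs("payload", /*now=*/1000);
  assert(v.size() == 7 + kTsLen && ExtractTs(v) == 1000);
  assert(IsStale(v, 60, /*now=*/2000));   // written at 1000, expired at 1060
  assert(!IsStale(v, 60, /*now=*/1030));  // still fresh
}
```
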
+
+Status TtlMergeOperator::PrepareOptions(const ConfigOptions& config_options) {
+  if (clock_ == nullptr) {
+    clock_ = config_options.env->GetSystemClock().get();
+  }
+  return MergeOperator::PrepareOptions(config_options);
+}
+
+Status TtlMergeOperator::ValidateOptions(
+    const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+  if (user_merge_op_ == nullptr) {
+    return Status::InvalidArgument(
+        "UserMergeOperator required by TtlMergeOperator");
+  } else if (clock_ == nullptr) {
+    return Status::InvalidArgument("SystemClock required by TtlMergeOperator");
+  } else {
+    return MergeOperator::ValidateOptions(db_opts, cf_opts);
+  }
+}
+
+void DBWithTTLImpl::SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options,
+                                    SystemClock* clock) {
+  if (options->compaction_filter) {
+    options->compaction_filter =
+        new TtlCompactionFilter(ttl, clock, options->compaction_filter);
+  } else {
+    options->compaction_filter_factory =
+        std::shared_ptr<CompactionFilterFactory>(
+            new TtlCompactionFilterFactory(
+                ttl, clock, options->compaction_filter_factory));
+  }
+
+  if (options->merge_operator) {
+    options->merge_operator.reset(
+        new TtlMergeOperator(options->merge_operator, clock));
+  }
+}
+
+static std::unordered_map<std::string, OptionTypeInfo> ttl_type_info = {
+    {"ttl", {0, OptionType::kInt32T}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> ttl_cff_type_info = {
+    {"user_filter_factory",
+     OptionTypeInfo::AsCustomSharedPtr<CompactionFilterFactory>(
+         0, OptionVerificationType::kByNameAllowFromNull,
+         OptionTypeFlags::kNone)}};
+static std::unordered_map<std::string, OptionTypeInfo> user_cf_type_info = {
+    {"user_filter",
+     OptionTypeInfo::AsCustomRawPtr<const CompactionFilter>(
+         0, OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}};
+
+TtlCompactionFilter::TtlCompactionFilter(
+    int32_t ttl, SystemClock* clock, const CompactionFilter* _user_comp_filter,
+    std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory)
+    : LayeredCompactionFilterBase(_user_comp_filter,
+                                  std::move(_user_comp_filter_from_factory)),
+      ttl_(ttl),
+      clock_(clock) {
+  RegisterOptions("TTL", &ttl_, &ttl_type_info);
+  RegisterOptions("UserFilter", &user_comp_filter_, &user_cf_type_info);
+}
+
+bool TtlCompactionFilter::Filter(int level, const Slice& key,
+                                 const Slice& old_val, std::string* new_val,
+                                 bool* value_changed) const {
+  if (DBWithTTLImpl::IsStale(old_val, ttl_, clock_)) {
+    return true;
+  }
+  if (user_comp_filter() == nullptr) {
+    return false;
+  }
+  assert(old_val.size() >= DBWithTTLImpl::kTSLength);
+  Slice old_val_without_ts(old_val.data(),
+                           old_val.size() - DBWithTTLImpl::kTSLength);
+  if (user_comp_filter()->Filter(level, key, old_val_without_ts, new_val,
+                                 value_changed)) {
+    return true;
+  }
+  if (*value_changed) {
+    new_val->append(old_val.data() + old_val.size() - DBWithTTLImpl::kTSLength,
+                    DBWithTTLImpl::kTSLength);
+  }
+  return false;
+}
+
+Status TtlCompactionFilter::PrepareOptions(
+    const ConfigOptions& config_options) {
+  if (clock_ == nullptr) {
+    clock_ = config_options.env->GetSystemClock().get();
+  }
+  return LayeredCompactionFilterBase::PrepareOptions(config_options);
+}
+
+Status TtlCompactionFilter::ValidateOptions(
+    const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+  if (clock_ == nullptr) {
+    return Status::InvalidArgument(
+        "SystemClock required by TtlCompactionFilter");
+  } else {
+    return LayeredCompactionFilterBase::ValidateOptions(db_opts, cf_opts);
+  }
+}
+
+TtlCompactionFilterFactory::TtlCompactionFilterFactory(
+    int32_t ttl, SystemClock* clock,
+    std::shared_ptr<CompactionFilterFactory> comp_filter_factory)
+    : ttl_(ttl),
+      clock_(clock),
+      user_comp_filter_factory_(comp_filter_factory) {
+  RegisterOptions("UserOptions", &user_comp_filter_factory_,
+                  &ttl_cff_type_info);
+  RegisterOptions("TTL", &ttl_, &ttl_type_info);
+}
+
+std::unique_ptr<CompactionFilter>
+TtlCompactionFilterFactory::CreateCompactionFilter(
+    const CompactionFilter::Context& context) {
+  std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory =
+      nullptr;
+  if (user_comp_filter_factory_) {
+    user_comp_filter_from_factory =
+        user_comp_filter_factory_->CreateCompactionFilter(context);
+  }
+
+  return std::unique_ptr<CompactionFilter>(new TtlCompactionFilter(
+      ttl_, clock_, nullptr, std::move(user_comp_filter_from_factory)));
+}
+
+Status TtlCompactionFilterFactory::PrepareOptions(
+    const ConfigOptions& config_options) {
+  if (clock_ == nullptr) {
+    clock_ = config_options.env->GetSystemClock().get();
+  }
+  return CompactionFilterFactory::PrepareOptions(config_options);
+}
+
+Status TtlCompactionFilterFactory::ValidateOptions(
+    const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+  if (clock_ == nullptr) {
+    return Status::InvalidArgument(
+        "SystemClock required by TtlCompactionFilterFactory");
+  } else {
+    return CompactionFilterFactory::ValidateOptions(db_opts, cf_opts);
+  }
+}
+
+int RegisterTtlObjects(ObjectLibrary& library, const std::string& /*arg*/) {
+  library.AddFactory<MergeOperator>(
+      TtlMergeOperator::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new TtlMergeOperator(nullptr, nullptr));
+        return guard->get();
+      });
+  library.AddFactory<CompactionFilterFactory>(
+      TtlCompactionFilterFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<CompactionFilterFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new TtlCompactionFilterFactory(0, nullptr, nullptr));
+        return guard->get();
+      });
+  library.AddFactory<const CompactionFilter>(
+      TtlCompactionFilter::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<const CompactionFilter>* /*guard*/,
+         std::string* /* errmsg */) {
+        return new TtlCompactionFilter(0, nullptr, nullptr);
+      });
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+
+// Open the db inside DBWithTTLImpl because options needs pointer to its ttl
+DBWithTTLImpl::DBWithTTLImpl(DB* db) : DBWithTTL(db), closed_(false) {}
+
+DBWithTTLImpl::~DBWithTTLImpl() {
+  if (!closed_) {
+    Close().PermitUncheckedError();
+  }
+}
+
+Status DBWithTTLImpl::Close() {
+  Status ret = Status::OK();
+  if (!closed_) {
+    Options default_options = GetOptions();
+    // Need to stop background compaction before getting rid of the filter
+    CancelAllBackgroundWork(db_, /* wait = */ true);
+    ret = db_->Close();
+    delete default_options.compaction_filter;
+    closed_ = true;
+  }
+  return ret;
+}
+
+void DBWithTTLImpl::RegisterTtlClasses() {
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    ObjectRegistry::Default()->AddLibrary("TTL", RegisterTtlObjects, "");
+  });
+}
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = DBWithTTL::Open(db_options, dbname, column_families, &handles,
+                             dbptr, {ttl}, read_only);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // We can delete the handle since DBImpl is always holding a reference to
+    // the default column family.
+    delete handles[0];
+  }
+  return s;
+}
+
+Status DBWithTTL::Open(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DBWithTTL** dbptr,
+    const std::vector<int32_t>& ttls, bool read_only) {
+  DBWithTTLImpl::RegisterTtlClasses();
+  if (ttls.size() != column_families.size()) {
+    return Status::InvalidArgument(
+        "ttls size has to be the same as number of column families");
+  }
+
+  SystemClock* clock = (db_options.env == nullptr)
+                           ? SystemClock::Default().get()
+                           : db_options.env->GetSystemClock().get();
+
+  std::vector<ColumnFamilyDescriptor> column_families_sanitized =
+      column_families;
+  for (size_t i = 0; i < column_families_sanitized.size(); ++i) {
+    DBWithTTLImpl::SanitizeOptions(
+        ttls[i], &column_families_sanitized[i].options, clock);
+  }
+  DB* db;
+
+  Status st;
+  if (read_only) {
+    st = DB::OpenForReadOnly(db_options, dbname, column_families_sanitized,
+                             handles, &db);
+  } else {
+    st = DB::Open(db_options, dbname, column_families_sanitized, handles, &db);
+  }
+  if (st.ok()) {
+    *dbptr = new DBWithTTLImpl(db);
+  } else {
+    *dbptr = nullptr;
+  }
+  return st;
+}
+
+Status DBWithTTLImpl::CreateColumnFamilyWithTtl(
+    const ColumnFamilyOptions& options, const std::string& column_family_name,
+    ColumnFamilyHandle** handle, int ttl) {
+  RegisterTtlClasses();
+  ColumnFamilyOptions sanitized_options = options;
+  DBWithTTLImpl::SanitizeOptions(ttl, &sanitized_options,
+                                 GetEnv()->GetSystemClock().get());
+
+  return DBWithTTL::CreateColumnFamily(sanitized_options, column_family_name,
+                                       handle);
+}
+
+Status DBWithTTLImpl::CreateColumnFamily(const ColumnFamilyOptions& options,
+                                         const std::string& column_family_name,
+                                         ColumnFamilyHandle** handle) {
+  return CreateColumnFamilyWithTtl(options, column_family_name, handle, 0);
+}
+
+// Appends the current timestamp to the string.
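+// Value layout note (annotation, not upstream text): what gets stored is
+//   [ user value bytes | 4-byte little-endian unix time from EncodeFixed32 ]
+// so every stored value is kTSLength bytes longer than what the user wrote.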
+// Returns a non-OK status if the current time could not be obtained; on
+// success the timestamp is appended and OK is returned.
+Status DBWithTTLImpl::AppendTS(const Slice& val, std::string* val_with_ts,
+                               SystemClock* clock) {
+  val_with_ts->reserve(kTSLength + val.size());
+  char ts_string[kTSLength];
+  int64_t curtime;
+  Status st = clock->GetCurrentTime(&curtime);
+  if (!st.ok()) {
+    return st;
+  }
+  EncodeFixed32(ts_string, (int32_t)curtime);
+  val_with_ts->append(val.data(), val.size());
+  val_with_ts->append(ts_string, kTSLength);
+  return st;
+}
+
+// Returns Corruption if the length of the string is less than the timestamp,
+// or if the timestamp refers to a time before the ttl-feature release time
+Status DBWithTTLImpl::SanityCheckTimestamp(const Slice& str) {
+  if (str.size() < kTSLength) {
+    return Status::Corruption("Error: value's length less than timestamp's\n");
+  }
+  // Checks that TS is not less than kMinTimestamp
+  // Guards against corruption & normal database opened incorrectly in ttl mode
+  int32_t timestamp_value = DecodeFixed32(str.data() + str.size() - kTSLength);
+  if (timestamp_value < kMinTimestamp) {
+    return Status::Corruption("Error: Timestamp < ttl feature release time!\n");
+  }
+  return Status::OK();
+}
+
+// Checks whether the string is stale according to the TTL provided
+bool DBWithTTLImpl::IsStale(const Slice& value, int32_t ttl,
+                            SystemClock* clock) {
+  if (ttl <= 0) {  // Data is fresh if TTL is non-positive
+    return false;
+  }
+  int64_t curtime;
+  if (!clock->GetCurrentTime(&curtime).ok()) {
+    return false;  // Treat the data as fresh if current time is unavailable
+  }
+  /* int32_t may overflow when computing timestamp_value + ttl,
+   * for example ttl = 86400 * 365 * 15,
+   * so convert timestamp_value to int64_t
+   */
+  int64_t timestamp_value =
+      DecodeFixed32(value.data() + value.size() - kTSLength);
+  return (timestamp_value + ttl) < curtime;
+}
+
+// Strips the TS from the end of the slice
+Status DBWithTTLImpl::StripTS(PinnableSlice* pinnable_val) {
+  if (pinnable_val->size() < kTSLength) {
+    return Status::Corruption("Bad timestamp in key-value");
+  }
+  // Erasing characters which hold the TS
+  pinnable_val->remove_suffix(kTSLength);
+  return Status::OK();
+}
+
+// Strips the TS from the end of the string
+Status DBWithTTLImpl::StripTS(std::string* str) {
+  if (str->length() < kTSLength) {
+    return Status::Corruption("Bad timestamp in key-value");
+  }
+  // Erasing characters which hold the TS
+  str->erase(str->length() - kTSLength, kTSLength);
+  return Status::OK();
+}
+
+Status DBWithTTLImpl::Put(const WriteOptions& options,
+                          ColumnFamilyHandle* column_family, const Slice& key,
+                          const Slice& val) {
+  WriteBatch batch;
+  Status st = batch.Put(column_family, key, val);
+  if (st.ok()) {
+    st = Write(options, &batch);
+  }
+  return st;
+}
+
+Status DBWithTTLImpl::Get(const ReadOptions& options,
+                          ColumnFamilyHandle* column_family, const Slice& key,
+                          PinnableSlice* value) {
+  Status st = db_->Get(options, column_family, key, value);
+  if (!st.ok()) {
+    return st;
+  }
+  st = SanityCheckTimestamp(*value);
+  if (!st.ok()) {
+    return st;
+  }
+  return StripTS(value);
+}
+
+std::vector<Status> DBWithTTLImpl::MultiGet(
+    const ReadOptions& options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  auto statuses = db_->MultiGet(options, column_family, keys, values);
+  for (size_t i = 0; i < keys.size(); ++i) {
+    if (!statuses[i].ok()) {
+      continue;
+    }
+    statuses[i] = SanityCheckTimestamp((*values)[i]);
+    if (!statuses[i].ok()) {
+      continue;
+    }
+    statuses[i] = StripTS(&(*values)[i]);
+  }
+  return statuses;
+}
+
+bool DBWithTTLImpl::KeyMayExist(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family,
+                                const Slice& key, std::string* value,
+                                bool* value_found) {
+  bool ret = db_->KeyMayExist(options, column_family, key, value, value_found);
+  if (ret && value != nullptr && value_found != nullptr && *value_found) {
+    if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) {
+      return false;
+    }
+  }
+  return ret;
+}
+
+Status DBWithTTLImpl::Merge(const WriteOptions& options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            const Slice& value) {
+  WriteBatch batch;
+  Status st = batch.Merge(column_family, key, value);
+  if (st.ok()) {
+    st = Write(options, &batch);
+  }
+  return st;
+}
+
+Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
+  class Handler : public WriteBatch::Handler {
+   public:
+    explicit Handler(SystemClock* clock) : clock_(clock) {}
+    WriteBatch updates_ttl;
+    Status PutCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& value) override {
+      std::string value_with_ts;
+      Status st = AppendTS(value, &value_with_ts, clock_);
+      if (!st.ok()) {
+        return st;
+      }
+      return WriteBatchInternal::Put(&updates_ttl, column_family_id, key,
+                                     value_with_ts);
+    }
+    Status MergeCF(uint32_t column_family_id, const Slice& key,
+                   const Slice& value) override {
+      std::string value_with_ts;
+      Status st = AppendTS(value, &value_with_ts, clock_);
+      if (!st.ok()) {
+        return st;
+      }
+      return WriteBatchInternal::Merge(&updates_ttl, column_family_id, key,
+                                       value_with_ts);
+    }
+    Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+      return WriteBatchInternal::Delete(&updates_ttl, column_family_id, key);
+    }
+    Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+                         const Slice& end_key) override {
+      return WriteBatchInternal::DeleteRange(&updates_ttl, column_family_id,
+                                             begin_key, end_key);
+    }
+    void LogData(const Slice& blob) override { updates_ttl.PutLogData(blob); }
+
+   private:
+    SystemClock* clock_;
+  };
+  Handler handler(GetEnv()->GetSystemClock().get());
+  Status st = updates->Iterate(&handler);
+  if (!st.ok()) {
+    return st;
+  } else {
+    return db_->Write(opts, &(handler.updates_ttl));
+  }
+}
+
+Iterator* DBWithTTLImpl::NewIterator(const ReadOptions& opts,
+                                     ColumnFamilyHandle* column_family) {
+  if (opts.io_activity != Env::IOActivity::kUnknown) {
+    return NewErrorIterator(Status::InvalidArgument(
+        "Cannot call NewIterator with `ReadOptions::io_activity` != "
+        "`Env::IOActivity::kUnknown`"));
+  }
+  return new TtlIterator(db_->NewIterator(opts, column_family));
+}
+
+void DBWithTTLImpl::SetTtl(ColumnFamilyHandle* h, int32_t ttl) {
+  std::shared_ptr<TtlCompactionFilterFactory> filter;
+  Options opts;
+  opts = GetOptions(h);
+  filter = std::static_pointer_cast<TtlCompactionFilterFactory>(
+      opts.compaction_filter_factory);
+  if (!filter) return;
+  filter->SetTtl(ttl);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/ttl/db_ttl_impl.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/ttl/db_ttl_impl.h
new file mode 100644
index 0000000000..6ac662467f
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/ttl/db_ttl_impl.h
@@ -0,0 +1,243 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
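+//
+// Overview (vendored annotation, not upstream text): DBWithTTLImpl is a
+// StackableDB wrapper that appends a 4-byte write-time to each value on
+// Put/Merge/Write, strips and checks it on reads, and relies on
+// TtlCompactionFilter to drop entries older than the configured TTL during
+// compaction.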
+
+#pragma once
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "utilities/compaction_filters/layered_compaction_filter_base.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef GetCurrentTime
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+struct ConfigOptions;
+class ObjectLibrary;
+class ObjectRegistry;
+class DBWithTTLImpl : public DBWithTTL {
+ public:
+  static void SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options,
+                              SystemClock* clock);
+
+  static void RegisterTtlClasses();
+  explicit DBWithTTLImpl(DB* db);
+
+  virtual ~DBWithTTLImpl();
+
+  virtual Status Close() override;
+
+  Status CreateColumnFamilyWithTtl(const ColumnFamilyOptions& options,
+                                   const std::string& column_family_name,
+                                   ColumnFamilyHandle** handle,
+                                   int ttl) override;
+
+  Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                            const std::string& column_family_name,
+                            ColumnFamilyHandle** handle) override;
+
+  using StackableDB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& val) override;
+
+  using StackableDB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+
+  using StackableDB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  using StackableDB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) override;
+
+  using StackableDB::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) override;
+
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+  using StackableDB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& opts,
+                                ColumnFamilyHandle* column_family) override;
+
+  virtual DB* GetBaseDB() override { return db_; }
+
+  static bool IsStale(const Slice& value, int32_t ttl, SystemClock* clock);
+
+  static Status AppendTS(const Slice& val, std::string* val_with_ts,
+                         SystemClock* clock);
+
+  static Status SanityCheckTimestamp(const Slice& str);
+
+  static Status StripTS(std::string* str);
+
+  static Status StripTS(PinnableSlice* str);
+
+  static const uint32_t kTSLength = sizeof(int32_t);  // size of timestamp
+
+  static const int32_t kMinTimestamp = 1368146402;  // 05/09/2013:5:40PM GMT-8
+
+  static const int32_t kMaxTimestamp = 2147483647;  // 01/18/2038:7:14PM GMT-8
+
+  void SetTtl(int32_t ttl) override { SetTtl(DefaultColumnFamily(), ttl); }
+
+  void SetTtl(ColumnFamilyHandle* h, int32_t ttl) override;
+
+ private:
+  // remember whether the Close completes or not
+  bool closed_;
+};
+
+class TtlIterator : public Iterator {
+ public:
+  explicit TtlIterator(Iterator* iter) : iter_(iter) { assert(iter_); }
+
+  ~TtlIterator() { delete iter_; }
+
+  bool Valid() const override { return iter_->Valid(); }
+
+  void SeekToFirst() override { iter_->SeekToFirst(); }
+
+  void SeekToLast() override { iter_->SeekToLast(); }
+
+  void Seek(const Slice& target) override { iter_->Seek(target); }
+
+  void SeekForPrev(const Slice& target) override { iter_->SeekForPrev(target); }
+
+  void Next() override { iter_->Next(); }
+
+  void Prev() override { iter_->Prev(); }
+
+  Slice key() const override { return iter_->key(); }
+
+  int32_t ttl_timestamp() const {
+    return DecodeFixed32(iter_->value().data() + iter_->value().size() -
+                         DBWithTTLImpl::kTSLength);
+  }
+
+  Slice value() const override {
+    // TODO: handle timestamp corruption like in general iterator semantics
+    assert(DBWithTTLImpl::SanityCheckTimestamp(iter_->value()).ok());
+    Slice trimmed_value = iter_->value();
+    trimmed_value.size_ -= DBWithTTLImpl::kTSLength;
+    return trimmed_value;
+  }
+
+  Status status() const override { return iter_->status(); }
+
+ private:
+  Iterator* iter_;
+};
+
+class TtlCompactionFilter : public LayeredCompactionFilterBase {
+ public:
+  TtlCompactionFilter(int32_t ttl, SystemClock* clock,
+                      const CompactionFilter* _user_comp_filter,
+                      std::unique_ptr<const CompactionFilter>
+                          _user_comp_filter_from_factory = nullptr);
+
+  virtual bool Filter(int level, const Slice& key, const Slice& old_val,
+                      std::string* new_val, bool* value_changed) const override;
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "TtlCompactionFilter"; }
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == "Delete By TTL") {
+      return true;
+    } else {
+      return LayeredCompactionFilterBase::IsInstanceOf(name);
+    }
+  }
+
+  Status PrepareOptions(const ConfigOptions& config_options) override;
+  Status ValidateOptions(const DBOptions& db_opts,
+                         const ColumnFamilyOptions& cf_opts) const override;
+
+ private:
+  int32_t ttl_;
+  SystemClock* clock_;
+};
+
+class TtlCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  TtlCompactionFilterFactory(
+      int32_t ttl, SystemClock* clock,
+      std::shared_ptr<CompactionFilterFactory> comp_filter_factory);
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override;
+  void SetTtl(int32_t ttl) { ttl_ = ttl; }
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "TtlCompactionFilterFactory"; }
+  Status PrepareOptions(const ConfigOptions& config_options) override;
+  Status ValidateOptions(const DBOptions& db_opts,
+                         const ColumnFamilyOptions& cf_opts) const override;
+  const Customizable* Inner() const override {
+    return user_comp_filter_factory_.get();
+  }
+
+ private:
+  int32_t ttl_;
+  SystemClock* clock_;
+  std::shared_ptr<CompactionFilterFactory> user_comp_filter_factory_;
+};
+
+class TtlMergeOperator : public MergeOperator {
+ public:
+  explicit TtlMergeOperator(const std::shared_ptr<MergeOperator>& merge_op,
+                            SystemClock* clock);
+
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override;
+
+  bool PartialMergeMulti(const Slice& key,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value, Logger* logger) const override;
+
+  static const char* kClassName() { return "TtlMergeOperator"; }
+
+  const char* Name() const override { return kClassName(); }
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == "Merge By TTL") {
+      return true;
+    } else {
+      return MergeOperator::IsInstanceOf(name);
+    }
+  }
+
+  Status PrepareOptions(const ConfigOptions& config_options) override;
+  Status ValidateOptions(const DBOptions& db_opts,
+                         const ColumnFamilyOptions& cf_opts) const override;
+  const Customizable* Inner() const override { return user_merge_op_.get(); }
+
+ private:
+  std::shared_ptr<MergeOperator> user_merge_op_;
+  SystemClock* clock_;
+};
+extern "C" {
+int RegisterTtlObjects(ObjectLibrary& library, const std::string& /*arg*/);
+}  // extern "C"
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/wal_filter.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/wal_filter.cc
new file mode 100644
index 0000000000..9fa36bf272
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/wal_filter.cc
@@ -0,0 +1,22 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/wal_filter.h"
+
+#include <memory>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/customizable_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status WalFilter::CreateFromString(const ConfigOptions& config_options,
+                                   const std::string& value,
+                                   WalFilter** filter) {
+  Status s = LoadStaticObject<WalFilter>(config_options, value, filter);
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index.cc
new file mode 100644
index 0000000000..f20b7e5e36
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index.cc
@@ -0,0 +1,694 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include "rocksdb/utilities/write_batch_with_index.h"
+
+#include <memory>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "memory/arena.h"
+#include "memtable/skiplist.h"
+#include "options/db_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct WriteBatchWithIndex::Rep {
+  explicit Rep(const Comparator* index_comparator, size_t reserved_bytes = 0,
+               size_t max_bytes = 0, bool _overwrite_key = false,
+               size_t protection_bytes_per_key = 0)
+      : write_batch(reserved_bytes, max_bytes, protection_bytes_per_key,
+                    index_comparator ? index_comparator->timestamp_size() : 0),
+        comparator(index_comparator, &write_batch),
+        skip_list(comparator, &arena),
+        overwrite_key(_overwrite_key),
+        last_entry_offset(0),
+        last_sub_batch_offset(0),
+        sub_batch_cnt(1) {}
+  ReadableWriteBatch write_batch;
+  WriteBatchEntryComparator comparator;
+  Arena arena;
+  WriteBatchEntrySkipList skip_list;
+  bool overwrite_key;
+  size_t last_entry_offset;
+  // The starting offset of the last sub-batch. A sub-batch starts right before
+  // inserting a key that is a duplicate of a key in the last sub-batch. Zero,
+  // the default, means that no duplicate key is detected so far.
+  size_t last_sub_batch_offset;
+  // Total number of sub-batches in the write batch. Default is 1.
+  size_t sub_batch_cnt;
+
+  // Remember current offset of internal write batch, which is used as
+  // the starting offset of the next record.
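+  // Typical call pattern (illustrative annotation, mirroring Put()/Delete()
+  // below):
+  //   SetLastEntryOffset();                   // snapshot current batch size
+  //   write_batch.Put(cf, key, value);        // append the record
+  //   AddOrUpdateIndex(cf, key, kPutRecord);  // index it at that offset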
+ void SetLastEntryOffset() { last_entry_offset = write_batch.GetDataSize(); } + + // In overwrite mode, find the existing entry for the same key and update it + // to point to the current entry. + // Return true if the key is found and updated. + bool UpdateExistingEntry(ColumnFamilyHandle* column_family, const Slice& key, + WriteType type); + bool UpdateExistingEntryWithCfId(uint32_t column_family_id, const Slice& key, + WriteType type); + + // Add the recent entry to the update. + // In overwrite mode, if key already exists in the index, update it. + void AddOrUpdateIndex(ColumnFamilyHandle* column_family, const Slice& key, + WriteType type); + void AddOrUpdateIndex(const Slice& key, WriteType type); + + // Allocate an index entry pointing to the last entry in the write batch and + // put it to skip list. + void AddNewEntry(uint32_t column_family_id); + + // Clear all updates buffered in this batch. + void Clear(); + void ClearIndex(); + + // Rebuild index by reading all records from the batch. + // Returns non-ok status on corruption. + Status ReBuildIndex(); +}; + +bool WriteBatchWithIndex::Rep::UpdateExistingEntry( + ColumnFamilyHandle* column_family, const Slice& key, WriteType type) { + uint32_t cf_id = GetColumnFamilyID(column_family); + return UpdateExistingEntryWithCfId(cf_id, key, type); +} + +bool WriteBatchWithIndex::Rep::UpdateExistingEntryWithCfId( + uint32_t column_family_id, const Slice& key, WriteType type) { + if (!overwrite_key) { + return false; + } + + WBWIIteratorImpl iter(column_family_id, &skip_list, &write_batch, + &comparator); + iter.Seek(key); + if (!iter.Valid()) { + return false; + } else if (!iter.MatchesKey(column_family_id, key)) { + return false; + } else { + // Move to the end of this key (NextKey-Prev) + iter.NextKey(); // Move to the next key + if (iter.Valid()) { + iter.Prev(); // Move back one entry + } else { + iter.SeekToLast(); + } + } + WriteBatchIndexEntry* non_const_entry = + const_cast(iter.GetRawEntry()); + if (LIKELY(last_sub_batch_offset <= non_const_entry->offset)) { + last_sub_batch_offset = last_entry_offset; + sub_batch_cnt++; + } + if (type == kMergeRecord) { + return false; + } else { + non_const_entry->offset = last_entry_offset; + return true; + } +} + +void WriteBatchWithIndex::Rep::AddOrUpdateIndex( + ColumnFamilyHandle* column_family, const Slice& key, WriteType type) { + if (!UpdateExistingEntry(column_family, key, type)) { + uint32_t cf_id = GetColumnFamilyID(column_family); + const auto* cf_cmp = GetColumnFamilyUserComparator(column_family); + if (cf_cmp != nullptr) { + comparator.SetComparatorForCF(cf_id, cf_cmp); + } + AddNewEntry(cf_id); + } +} + +void WriteBatchWithIndex::Rep::AddOrUpdateIndex(const Slice& key, + WriteType type) { + if (!UpdateExistingEntryWithCfId(0, key, type)) { + AddNewEntry(0); + } +} + +void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) { + const std::string& wb_data = write_batch.Data(); + Slice entry_ptr = Slice(wb_data.data() + last_entry_offset, + wb_data.size() - last_entry_offset); + // Extract key + Slice key; + bool success = + ReadKeyFromWriteBatchEntry(&entry_ptr, &key, column_family_id != 0); +#ifdef NDEBUG + (void)success; +#endif + assert(success); + + const Comparator* const ucmp = comparator.GetComparator(column_family_id); + size_t ts_sz = ucmp ? 
ucmp->timestamp_size() : 0; + + if (ts_sz > 0) { + key.remove_suffix(ts_sz); + } + + auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry)); + auto* index_entry = + new (mem) WriteBatchIndexEntry(last_entry_offset, column_family_id, + key.data() - wb_data.data(), key.size()); + skip_list.Insert(index_entry); +} + +void WriteBatchWithIndex::Rep::Clear() { + write_batch.Clear(); + ClearIndex(); +} + +void WriteBatchWithIndex::Rep::ClearIndex() { + skip_list.~WriteBatchEntrySkipList(); + arena.~Arena(); + new (&arena) Arena(); + new (&skip_list) WriteBatchEntrySkipList(comparator, &arena); + last_entry_offset = 0; + last_sub_batch_offset = 0; + sub_batch_cnt = 1; +} + +Status WriteBatchWithIndex::Rep::ReBuildIndex() { + Status s; + + ClearIndex(); + + if (write_batch.Count() == 0) { + // Nothing to re-index + return s; + } + + size_t offset = WriteBatchInternal::GetFirstOffset(&write_batch); + + Slice input(write_batch.Data()); + input.remove_prefix(offset); + + // Loop through all entries in Rep and add each one to the index + uint32_t found = 0; + while (s.ok() && !input.empty()) { + Slice key, value, blob, xid; + uint32_t column_family_id = 0; // default + char tag = 0; + + // set offset of current entry for call to AddNewEntry() + last_entry_offset = input.data() - write_batch.Data().data(); + + s = ReadRecordFromWriteBatch(&input, &tag, &column_family_id, &key, &value, + &blob, &xid); + if (!s.ok()) { + break; + } + + switch (tag) { + case kTypeColumnFamilyValue: + case kTypeValue: + found++; + if (!UpdateExistingEntryWithCfId(column_family_id, key, kPutRecord)) { + AddNewEntry(column_family_id); + } + break; + case kTypeColumnFamilyDeletion: + case kTypeDeletion: + found++; + if (!UpdateExistingEntryWithCfId(column_family_id, key, + kDeleteRecord)) { + AddNewEntry(column_family_id); + } + break; + case kTypeColumnFamilySingleDeletion: + case kTypeSingleDeletion: + found++; + if (!UpdateExistingEntryWithCfId(column_family_id, key, + kSingleDeleteRecord)) { + AddNewEntry(column_family_id); + } + break; + case kTypeColumnFamilyMerge: + case kTypeMerge: + found++; + if (!UpdateExistingEntryWithCfId(column_family_id, key, kMergeRecord)) { + AddNewEntry(column_family_id); + } + break; + case kTypeLogData: + case kTypeBeginPrepareXID: + case kTypeBeginPersistedPrepareXID: + case kTypeBeginUnprepareXID: + case kTypeEndPrepareXID: + case kTypeCommitXID: + case kTypeCommitXIDAndTimestamp: + case kTypeRollbackXID: + case kTypeNoop: + break; + default: + return Status::Corruption( + "unknown WriteBatch tag in ReBuildIndex", + std::to_string(static_cast(tag))); + } + } + + if (s.ok() && found != write_batch.Count()) { + s = Status::Corruption("WriteBatch has wrong count"); + } + + return s; +} + +WriteBatchWithIndex::WriteBatchWithIndex( + const Comparator* default_index_comparator, size_t reserved_bytes, + bool overwrite_key, size_t max_bytes, size_t protection_bytes_per_key) + : rep(new Rep(default_index_comparator, reserved_bytes, max_bytes, + overwrite_key, protection_bytes_per_key)) {} + +WriteBatchWithIndex::~WriteBatchWithIndex() {} + +WriteBatchWithIndex::WriteBatchWithIndex(WriteBatchWithIndex&&) = default; + +WriteBatchWithIndex& WriteBatchWithIndex::operator=(WriteBatchWithIndex&&) = + default; + +WriteBatch* WriteBatchWithIndex::GetWriteBatch() { return &rep->write_batch; } + +size_t WriteBatchWithIndex::SubBatchCnt() { return rep->sub_batch_cnt; } + +WBWIIterator* WriteBatchWithIndex::NewIterator() { + return new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch, + 
&(rep->comparator)); +} + +WBWIIterator* WriteBatchWithIndex::NewIterator( + ColumnFamilyHandle* column_family) { + return new WBWIIteratorImpl(GetColumnFamilyID(column_family), + &(rep->skip_list), &rep->write_batch, + &(rep->comparator)); +} + +Iterator* WriteBatchWithIndex::NewIteratorWithBase( + ColumnFamilyHandle* column_family, Iterator* base_iterator, + const ReadOptions* read_options) { + auto wbwiii = + new WBWIIteratorImpl(GetColumnFamilyID(column_family), &(rep->skip_list), + &rep->write_batch, &rep->comparator); + return new BaseDeltaIterator(column_family, base_iterator, wbwiii, + GetColumnFamilyUserComparator(column_family), + read_options); +} + +Iterator* WriteBatchWithIndex::NewIteratorWithBase(Iterator* base_iterator) { + // default column family's comparator + auto wbwiii = new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch, + &rep->comparator); + return new BaseDeltaIterator(nullptr, base_iterator, wbwiii, + rep->comparator.default_comparator()); +} + +Status WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.Put(column_family, key, value); + if (s.ok()) { + rep->AddOrUpdateIndex(column_family, key, kPutRecord); + } + return s; +} + +Status WriteBatchWithIndex::Put(const Slice& key, const Slice& value) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.Put(key, value); + if (s.ok()) { + rep->AddOrUpdateIndex(key, kPutRecord); + } + return s; +} + +Status WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family, + const Slice& /*key*/, const Slice& /*ts*/, + const Slice& /*value*/) { + if (!column_family) { + return Status::InvalidArgument("column family handle cannot be nullptr"); + } + // TODO: support WBWI::Put() with timestamp. + return Status::NotSupported(); +} + +Status WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family, + const Slice& key) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.Delete(column_family, key); + if (s.ok()) { + rep->AddOrUpdateIndex(column_family, key, kDeleteRecord); + } + return s; +} + +Status WriteBatchWithIndex::Delete(const Slice& key) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.Delete(key); + if (s.ok()) { + rep->AddOrUpdateIndex(key, kDeleteRecord); + } + return s; +} + +Status WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family, + const Slice& /*key*/, const Slice& /*ts*/) { + if (!column_family) { + return Status::InvalidArgument("column family handle cannot be nullptr"); + } + // TODO: support WBWI::Delete() with timestamp. + return Status::NotSupported(); +} + +Status WriteBatchWithIndex::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.SingleDelete(column_family, key); + if (s.ok()) { + rep->AddOrUpdateIndex(column_family, key, kSingleDeleteRecord); + } + return s; +} + +Status WriteBatchWithIndex::SingleDelete(const Slice& key) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.SingleDelete(key); + if (s.ok()) { + rep->AddOrUpdateIndex(key, kSingleDeleteRecord); + } + return s; +} + +Status WriteBatchWithIndex::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& /*key*/, + const Slice& /*ts*/) { + if (!column_family) { + return Status::InvalidArgument("column family handle cannot be nullptr"); + } + // TODO: support WBWI::SingleDelete() with timestamp. 
+ return Status::NotSupported(); +} + +Status WriteBatchWithIndex::Merge(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.Merge(column_family, key, value); + if (s.ok()) { + rep->AddOrUpdateIndex(column_family, key, kMergeRecord); + } + return s; +} + +Status WriteBatchWithIndex::Merge(const Slice& key, const Slice& value) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.Merge(key, value); + if (s.ok()) { + rep->AddOrUpdateIndex(key, kMergeRecord); + } + return s; +} + +Status WriteBatchWithIndex::PutLogData(const Slice& blob) { + return rep->write_batch.PutLogData(blob); +} + +void WriteBatchWithIndex::Clear() { rep->Clear(); } + +Status WriteBatchWithIndex::GetFromBatch(ColumnFamilyHandle* column_family, + const DBOptions& options, + const Slice& key, std::string* value) { + Status s; + WriteBatchWithIndexInternal wbwii(&options, column_family); + auto result = wbwii.GetFromBatch(this, key, value, &s); + + switch (result) { + case WBWIIteratorImpl::kFound: + case WBWIIteratorImpl::kError: + // use returned status + break; + case WBWIIteratorImpl::kDeleted: + case WBWIIteratorImpl::kNotFound: + s = Status::NotFound(); + break; + case WBWIIteratorImpl::kMergeInProgress: + s = Status::MergeInProgress(); + break; + default: + assert(false); + } + + return s; +} + +Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, + const ReadOptions& read_options, + const Slice& key, + std::string* value) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = GetFromBatchAndDB(db, read_options, db->DefaultColumnFamily(), key, + &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; +} + +Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, + const ReadOptions& read_options, + const Slice& key, + PinnableSlice* pinnable_val) { + return GetFromBatchAndDB(db, read_options, db->DefaultColumnFamily(), key, + pinnable_val); +} + +Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, + const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, + std::string* value) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = + GetFromBatchAndDB(db, read_options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; +} + +Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, + const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, + PinnableSlice* pinnable_val) { + return GetFromBatchAndDB(db, read_options, column_family, key, pinnable_val, + nullptr); +} + +Status WriteBatchWithIndex::GetFromBatchAndDB( + DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val, ReadCallback* callback) { + const Comparator* const ucmp = rep->comparator.GetComparator(column_family); + size_t ts_sz = ucmp ? 
ucmp->timestamp_size() : 0;
+  if (ts_sz > 0 && !read_options.timestamp) {
+    return Status::InvalidArgument("Must specify timestamp");
+  }
+
+  Status s;
+  WriteBatchWithIndexInternal wbwii(db, column_family);
+
+  // Since the lifetime of the WriteBatch is the same as that of the transaction
+  // we cannot pin it as otherwise the returned value will not be available
+  // after the transaction finishes.
+  std::string& batch_value = *pinnable_val->GetSelf();
+  auto result = wbwii.GetFromBatch(this, key, &batch_value, &s);
+
+  if (result == WBWIIteratorImpl::kFound) {
+    pinnable_val->PinSelf();
+    return s;
+  } else if (!s.ok() || result == WBWIIteratorImpl::kError) {
+    return s;
+  } else if (result == WBWIIteratorImpl::kDeleted) {
+    return Status::NotFound();
+  }
+  assert(result == WBWIIteratorImpl::kMergeInProgress ||
+         result == WBWIIteratorImpl::kNotFound);
+
+  // Did not find key in batch OR could not resolve Merges. Try DB.
+  if (!callback) {
+    s = db->Get(read_options, column_family, key, pinnable_val);
+  } else {
+    DBImpl::GetImplOptions get_impl_options;
+    get_impl_options.column_family = column_family;
+    get_impl_options.value = pinnable_val;
+    get_impl_options.callback = callback;
+    s = static_cast_with_check<DBImpl>(db->GetRootDB())
+            ->GetImpl(read_options, key, get_impl_options);
+  }
+
+  if (s.ok() || s.IsNotFound()) {  // DB Get Succeeded
+    if (result == WBWIIteratorImpl::kMergeInProgress) {
+      // Merge result from DB with merges in Batch
+      std::string merge_result;
+      if (s.ok()) {
+        s = wbwii.MergeKey(key, pinnable_val, &merge_result);
+      } else {  // Key not present in db (s.IsNotFound())
+        s = wbwii.MergeKey(key, nullptr, &merge_result);
+      }
+      if (s.ok()) {
+        pinnable_val->Reset();
+        *pinnable_val->GetSelf() = std::move(merge_result);
+        pinnable_val->PinSelf();
+      }
+    }
+  }
+
+  return s;
+}
+
+void WriteBatchWithIndex::MultiGetFromBatchAndDB(
+    DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+    const size_t num_keys, const Slice* keys, PinnableSlice* values,
+    Status* statuses, bool sorted_input) {
+  MultiGetFromBatchAndDB(db, read_options, column_family, num_keys, keys,
+                         values, statuses, sorted_input, nullptr);
+}
+
+void WriteBatchWithIndex::MultiGetFromBatchAndDB(
+    DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+    const size_t num_keys, const Slice* keys, PinnableSlice* values,
+    Status* statuses, bool sorted_input, ReadCallback* callback) {
+  const Comparator* const ucmp = rep->comparator.GetComparator(column_family);
+  size_t ts_sz = ucmp ? ucmp->timestamp_size() : 0;
+  if (ts_sz > 0 && !read_options.timestamp) {
+    for (size_t i = 0; i < num_keys; ++i) {
+      statuses[i] = Status::InvalidArgument("Must specify timestamp");
+    }
+    return;
+  }
+
+  WriteBatchWithIndexInternal wbwii(db, column_family);
+
+  autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+  autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+  // To hold merges from the write batch
+  autovector<std::pair<WBWIIteratorImpl::Result, MergeContext>,
+             MultiGetContext::MAX_BATCH_SIZE>
+      merges;
+  // Since the lifetime of the WriteBatch is the same as that of the transaction
+  // we cannot pin it as otherwise the returned value will not be available
+  // after the transaction finishes.
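+  // Flow sketch (annotation, not upstream text): the loop below resolves each
+  // key against the batch index alone; keys that are absent, or whose latest
+  // update is an unresolved Merge, are queued in key_context for one batched
+  // DB lookup, after which the pending merge operands are folded in.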
+  for (size_t i = 0; i < num_keys; ++i) {
+    MergeContext merge_context;
+    std::string batch_value;
+    Status* s = &statuses[i];
+    PinnableSlice* pinnable_val = &values[i];
+    pinnable_val->Reset();
+    auto result =
+        wbwii.GetFromBatch(this, keys[i], &merge_context, &batch_value, s);
+
+    if (result == WBWIIteratorImpl::kFound) {
+      *pinnable_val->GetSelf() = std::move(batch_value);
+      pinnable_val->PinSelf();
+      continue;
+    }
+    if (result == WBWIIteratorImpl::kDeleted) {
+      *s = Status::NotFound();
+      continue;
+    }
+    if (result == WBWIIteratorImpl::kError) {
+      continue;
+    }
+    assert(result == WBWIIteratorImpl::kMergeInProgress ||
+           result == WBWIIteratorImpl::kNotFound);
+    key_context.emplace_back(column_family, keys[i], &values[i],
+                             /* columns */ nullptr, /* timestamp */ nullptr,
+                             &statuses[i]);
+    merges.emplace_back(result, std::move(merge_context));
+  }
+
+  for (KeyContext& key : key_context) {
+    sorted_keys.emplace_back(&key);
+  }
+
+  // Did not find key in batch OR could not resolve Merges. Try DB.
+  static_cast_with_check<DBImpl>(db->GetRootDB())
+      ->PrepareMultiGetKeys(key_context.size(), sorted_input, &sorted_keys);
+  static_cast_with_check<DBImpl>(db->GetRootDB())
+      ->MultiGetWithCallback(read_options, column_family, callback,
+                             &sorted_keys);
+
+  for (auto iter = key_context.begin(); iter != key_context.end(); ++iter) {
+    KeyContext& key = *iter;
+    if (key.s->ok() || key.s->IsNotFound()) {  // DB Get Succeeded
+      size_t index = iter - key_context.begin();
+      std::pair<WBWIIteratorImpl::Result, MergeContext>& merge_result =
+          merges[index];
+      if (merge_result.first == WBWIIteratorImpl::kMergeInProgress) {
+        std::string merged_value;
+        // Merge result from DB with merges in Batch
+        if (key.s->ok()) {
+          *key.s = wbwii.MergeKey(*key.key, iter->value, merge_result.second,
+                                  &merged_value);
+        } else {  // Key not present in db (s.IsNotFound())
+          *key.s = wbwii.MergeKey(*key.key, nullptr, merge_result.second,
+                                  &merged_value);
+        }
+        if (key.s->ok()) {
+          key.value->Reset();
+          *key.value->GetSelf() = std::move(merged_value);
+          key.value->PinSelf();
+        }
+      }
+    }
+  }
+}
+
+void WriteBatchWithIndex::SetSavePoint() { rep->write_batch.SetSavePoint(); }
+
+Status WriteBatchWithIndex::RollbackToSavePoint() {
+  Status s = rep->write_batch.RollbackToSavePoint();
+
+  if (s.ok()) {
+    rep->sub_batch_cnt = 1;
+    rep->last_sub_batch_offset = 0;
+    s = rep->ReBuildIndex();
+  }
+
+  return s;
+}
+
+Status WriteBatchWithIndex::PopSavePoint() {
+  return rep->write_batch.PopSavePoint();
+}
+
+void WriteBatchWithIndex::SetMaxBytes(size_t max_bytes) {
+  rep->write_batch.SetMaxBytes(max_bytes);
+}
+
+size_t WriteBatchWithIndex::GetDataSize() const {
+  return rep->write_batch.GetDataSize();
+}
+
+const Comparator* WriteBatchWithIndexInternal::GetUserComparator(
+    const WriteBatchWithIndex& wbwi, uint32_t cf_id) {
+  const WriteBatchEntryComparator& ucmps = wbwi.rep->comparator;
+  return ucmps.GetComparator(cf_id);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index_internal.cc b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index_internal.cc
new file mode 100644
index 0000000000..ee4754f8d1
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index_internal.cc
@@ -0,0 +1,742 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#include "utilities/write_batch_with_index/write_batch_with_index_internal.h" + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +BaseDeltaIterator::BaseDeltaIterator(ColumnFamilyHandle* column_family, + Iterator* base_iterator, + WBWIIteratorImpl* delta_iterator, + const Comparator* comparator, + const ReadOptions* read_options) + : forward_(true), + current_at_base_(true), + equal_keys_(false), + status_(Status::OK()), + base_iterator_(base_iterator), + delta_iterator_(delta_iterator), + comparator_(comparator), + iterate_upper_bound_(read_options ? read_options->iterate_upper_bound + : nullptr) { + assert(comparator_); + wbwii_.reset(new WriteBatchWithIndexInternal(column_family)); +} + +bool BaseDeltaIterator::Valid() const { + return status_.ok() ? (current_at_base_ ? BaseValid() : DeltaValid()) : false; +} + +void BaseDeltaIterator::SeekToFirst() { + forward_ = true; + base_iterator_->SeekToFirst(); + delta_iterator_->SeekToFirst(); + UpdateCurrent(); +} + +void BaseDeltaIterator::SeekToLast() { + forward_ = false; + base_iterator_->SeekToLast(); + delta_iterator_->SeekToLast(); + UpdateCurrent(); +} + +void BaseDeltaIterator::Seek(const Slice& k) { + forward_ = true; + base_iterator_->Seek(k); + delta_iterator_->Seek(k); + UpdateCurrent(); +} + +void BaseDeltaIterator::SeekForPrev(const Slice& k) { + forward_ = false; + base_iterator_->SeekForPrev(k); + delta_iterator_->SeekForPrev(k); + UpdateCurrent(); +} + +void BaseDeltaIterator::Next() { + if (!Valid()) { + status_ = Status::NotSupported("Next() on invalid iterator"); + return; + } + + if (!forward_) { + // Need to change direction + // if our direction was backward and we're not equal, we have two states: + // * both iterators are valid: we're already in a good state (current + // shows to smaller) + // * only one iterator is valid: we need to advance that iterator + forward_ = true; + equal_keys_ = false; + if (!BaseValid()) { + assert(DeltaValid()); + base_iterator_->SeekToFirst(); + } else if (!DeltaValid()) { + delta_iterator_->SeekToFirst(); + } else if (current_at_base_) { + // Change delta from larger than base to smaller + AdvanceDelta(); + } else { + // Change base from larger than delta to smaller + AdvanceBase(); + } + if (DeltaValid() && BaseValid()) { + if (0 == comparator_->CompareWithoutTimestamp( + delta_iterator_->Entry().key, /*a_has_ts=*/false, + base_iterator_->key(), /*b_has_ts=*/false)) { + equal_keys_ = true; + } + } + } + Advance(); +} + +void BaseDeltaIterator::Prev() { + if (!Valid()) { + status_ = Status::NotSupported("Prev() on invalid iterator"); + return; + } + + if (forward_) { + // Need to change direction + // if our direction was backward and we're not equal, we have two states: + // * both iterators are valid: we're already in a good state (current + // shows to smaller) + // * only one iterator is valid: we need to advance that iterator + forward_ = false; + equal_keys_ = false; + if (!BaseValid()) { + assert(DeltaValid()); + base_iterator_->SeekToLast(); + } else if (!DeltaValid()) { + 
delta_iterator_->SeekToLast();
+    } else if (current_at_base_) {
+      // Change delta from less advanced than base to more advanced
+      AdvanceDelta();
+    } else {
+      // Change base from less advanced than delta to more advanced
+      AdvanceBase();
+    }
+    if (DeltaValid() && BaseValid()) {
+      if (0 == comparator_->CompareWithoutTimestamp(
+                   delta_iterator_->Entry().key, /*a_has_ts=*/false,
+                   base_iterator_->key(), /*b_has_ts=*/false)) {
+        equal_keys_ = true;
+      }
+    }
+  }
+
+  Advance();
+}
+
+Slice BaseDeltaIterator::key() const {
+  return current_at_base_ ? base_iterator_->key()
+                          : delta_iterator_->Entry().key;
+}
+
+Slice BaseDeltaIterator::value() const {
+  if (current_at_base_) {
+    return base_iterator_->value();
+  } else {
+    WriteEntry delta_entry = delta_iterator_->Entry();
+    if (wbwii_->GetNumOperands() == 0) {
+      return delta_entry.value;
+    } else if (delta_entry.type == kDeleteRecord ||
+               delta_entry.type == kSingleDeleteRecord) {
+      status_ =
+          wbwii_->MergeKey(delta_entry.key, nullptr, merge_result_.GetSelf());
+    } else if (delta_entry.type == kPutRecord) {
+      status_ = wbwii_->MergeKey(delta_entry.key, &delta_entry.value,
+                                 merge_result_.GetSelf());
+    } else if (delta_entry.type == kMergeRecord) {
+      if (equal_keys_) {
+        Slice base_value = base_iterator_->value();
+        status_ = wbwii_->MergeKey(delta_entry.key, &base_value,
+                                   merge_result_.GetSelf());
+      } else {
+        status_ =
+            wbwii_->MergeKey(delta_entry.key, nullptr, merge_result_.GetSelf());
+      }
+    }
+    merge_result_.PinSelf();
+    return merge_result_;
+  }
+}
+
+Status BaseDeltaIterator::status() const {
+  if (!status_.ok()) {
+    return status_;
+  }
+  if (!base_iterator_->status().ok()) {
+    return base_iterator_->status();
+  }
+  return delta_iterator_->status();
+}
+
+void BaseDeltaIterator::Invalidate(Status s) { status_ = s; }
+
+void BaseDeltaIterator::AssertInvariants() {
+#ifndef NDEBUG
+  bool not_ok = false;
+  if (!base_iterator_->status().ok()) {
+    assert(!base_iterator_->Valid());
+    not_ok = true;
+  }
+  if (!delta_iterator_->status().ok()) {
+    assert(!delta_iterator_->Valid());
+    not_ok = true;
+  }
+  if (not_ok) {
+    assert(!Valid());
+    assert(!status().ok());
+    return;
+  }
+
+  if (!Valid()) {
+    return;
+  }
+  if (!BaseValid()) {
+    assert(!current_at_base_ && delta_iterator_->Valid());
+    return;
+  }
+  if (!DeltaValid()) {
+    assert(current_at_base_ && base_iterator_->Valid());
+    return;
+  }
+  // we don't support those yet
+  assert(delta_iterator_->Entry().type != kMergeRecord &&
+         delta_iterator_->Entry().type != kLogDataRecord);
+  int compare = comparator_->CompareWithoutTimestamp(
+      delta_iterator_->Entry().key, /*a_has_ts=*/false, base_iterator_->key(),
+      /*b_has_ts=*/false);
+  if (forward_) {
+    // current_at_base -> compare < 0
+    assert(!current_at_base_ || compare < 0);
+    // !current_at_base -> compare >= 0
+    assert(current_at_base_ || compare >= 0);
+  } else {
+    // current_at_base -> compare > 0
+    assert(!current_at_base_ || compare > 0);
+    // !current_at_base -> compare <= 0
+    assert(current_at_base_ || compare <= 0);
+  }
+  // equal_keys_ <=> compare == 0
+  assert((equal_keys_ || compare != 0) && (!equal_keys_ || compare == 0));
+#endif
+}
+
+void BaseDeltaIterator::Advance() {
+  if (equal_keys_) {
+    assert(BaseValid() && DeltaValid());
+    AdvanceBase();
+    AdvanceDelta();
+  } else {
+    if (current_at_base_) {
+      assert(BaseValid());
+      AdvanceBase();
+    } else {
+      assert(DeltaValid());
+      AdvanceDelta();
+    }
+  }
+  UpdateCurrent();
+}
+
+void BaseDeltaIterator::AdvanceDelta() {
+  if (forward_) {
+    delta_iterator_->NextKey();
+  } else {
delta_iterator_->PrevKey(); + } +} +void BaseDeltaIterator::AdvanceBase() { + if (forward_) { + base_iterator_->Next(); + } else { + base_iterator_->Prev(); + } +} + +bool BaseDeltaIterator::BaseValid() const { return base_iterator_->Valid(); } +bool BaseDeltaIterator::DeltaValid() const { return delta_iterator_->Valid(); } +void BaseDeltaIterator::UpdateCurrent() { +// Suppress false positive clang analyzer warnings. +#ifndef __clang_analyzer__ + status_ = Status::OK(); + while (true) { + auto delta_result = WBWIIteratorImpl::kNotFound; + WriteEntry delta_entry; + if (DeltaValid()) { + assert(delta_iterator_->status().ok()); + delta_result = + delta_iterator_->FindLatestUpdate(wbwii_->GetMergeContext()); + delta_entry = delta_iterator_->Entry(); + } else if (!delta_iterator_->status().ok()) { + // Expose the error status and stop. + current_at_base_ = false; + return; + } + equal_keys_ = false; + if (!BaseValid()) { + if (!base_iterator_->status().ok()) { + // Expose the error status and stop. + current_at_base_ = true; + return; + } + + // Base has finished. + if (!DeltaValid()) { + // Finished + return; + } + if (iterate_upper_bound_) { + if (comparator_->CompareWithoutTimestamp( + delta_entry.key, /*a_has_ts=*/false, *iterate_upper_bound_, + /*b_has_ts=*/false) >= 0) { + // out of upper bound -> finished. + return; + } + } + if (delta_result == WBWIIteratorImpl::kDeleted && + wbwii_->GetNumOperands() == 0) { + AdvanceDelta(); + } else { + current_at_base_ = false; + return; + } + } else if (!DeltaValid()) { + // Delta has finished. + current_at_base_ = true; + return; + } else { + int compare = + (forward_ ? 1 : -1) * comparator_->CompareWithoutTimestamp( + delta_entry.key, /*a_has_ts=*/false, + base_iterator_->key(), /*b_has_ts=*/false); + if (compare <= 0) { // delta bigger or equal + if (compare == 0) { + equal_keys_ = true; + } + if (delta_result != WBWIIteratorImpl::kDeleted || + wbwii_->GetNumOperands() > 0) { + current_at_base_ = false; + return; + } + // Delta is less advanced and is delete. + AdvanceDelta(); + if (equal_keys_) { + AdvanceBase(); + } + } else { + current_at_base_ = true; + return; + } + } + } + + AssertInvariants(); +#endif // __clang_analyzer__ +} + +void WBWIIteratorImpl::AdvanceKey(bool forward) { + if (Valid()) { + Slice key = Entry().key; + do { + if (forward) { + Next(); + } else { + Prev(); + } + } while (MatchesKey(column_family_id_, key)); + } +} + +void WBWIIteratorImpl::NextKey() { AdvanceKey(true); } + +void WBWIIteratorImpl::PrevKey() { + AdvanceKey(false); // Move to the tail of the previous key + if (Valid()) { + AdvanceKey(false); // Move back another key. 
Now we are at the start of + // the previous key + if (Valid()) { // Still a valid + Next(); // Move forward one onto this key + } else { + SeekToFirst(); // Not valid, move to the start + } + } +} + +WBWIIteratorImpl::Result WBWIIteratorImpl::FindLatestUpdate( + MergeContext* merge_context) { + if (Valid()) { + Slice key = Entry().key; + return FindLatestUpdate(key, merge_context); + } else { + merge_context->Clear(); // Clear any entries in the MergeContext + return WBWIIteratorImpl::kNotFound; + } +} + +WBWIIteratorImpl::Result WBWIIteratorImpl::FindLatestUpdate( + const Slice& key, MergeContext* merge_context) { + Result result = WBWIIteratorImpl::kNotFound; + merge_context->Clear(); // Clear any entries in the MergeContext + // TODO(agiardullo): consider adding support for reverse iteration + if (!Valid()) { + return result; + } else if (comparator_->CompareKey(column_family_id_, Entry().key, key) != + 0) { + return result; + } else { + // We want to iterate in the reverse order that the writes were added to the + // batch. Since we don't have a reverse iterator, we must seek past the + // end. We do this by seeking to the next key, and then back one step + NextKey(); + if (Valid()) { + Prev(); + } else { + SeekToLast(); + } + + // We are at the end of the iterator for this key. Search backwards for the + // last Put or Delete, accumulating merges along the way. + while (Valid()) { + const WriteEntry entry = Entry(); + if (comparator_->CompareKey(column_family_id_, entry.key, key) != 0) { + break; // Unexpected error or we've reached a different next key + } + + switch (entry.type) { + case kPutRecord: + return WBWIIteratorImpl::kFound; + case kDeleteRecord: + return WBWIIteratorImpl::kDeleted; + case kSingleDeleteRecord: + return WBWIIteratorImpl::kDeleted; + case kMergeRecord: + result = WBWIIteratorImpl::kMergeInProgress; + merge_context->PushOperand(entry.value); + break; + case kLogDataRecord: + break; // ignore + case kXIDRecord: + break; // ignore + default: + return WBWIIteratorImpl::kError; + } // end switch statement + Prev(); + } // End while Valid() + // At this point, we have been through the whole list and found no Puts or + // Deletes. The iterator points to the previous key. Move the iterator back + // onto this one. + if (Valid()) { + Next(); + } else { + SeekToFirst(); + } + } + return result; +} + +Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, + WriteType* type, Slice* Key, + Slice* value, Slice* blob, + Slice* xid) const { + if (type == nullptr || Key == nullptr || value == nullptr || + blob == nullptr || xid == nullptr) { + return Status::InvalidArgument("Output parameters cannot be null"); + } + + if (data_offset == GetDataSize()) { + // reached end of batch. 
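+    // (This NotFound is the expected end-of-batch signal for callers that
+    // scan by offset, not a corruption report.)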
+    return Status::NotFound();
+  }
+
+  if (data_offset > GetDataSize()) {
+    return Status::InvalidArgument("data offset exceed write batch size");
+  }
+  Slice input = Slice(rep_.data() + data_offset, rep_.size() - data_offset);
+  char tag;
+  uint32_t column_family;
+  Status s = ReadRecordFromWriteBatch(&input, &tag, &column_family, Key, value,
+                                      blob, xid);
+  if (!s.ok()) {
+    return s;
+  }
+
+  switch (tag) {
+    case kTypeColumnFamilyValue:
+    case kTypeValue:
+      *type = kPutRecord;
+      break;
+    case kTypeColumnFamilyDeletion:
+    case kTypeDeletion:
+      *type = kDeleteRecord;
+      break;
+    case kTypeColumnFamilySingleDeletion:
+    case kTypeSingleDeletion:
+      *type = kSingleDeleteRecord;
+      break;
+    case kTypeColumnFamilyRangeDeletion:
+    case kTypeRangeDeletion:
+      *type = kDeleteRangeRecord;
+      break;
+    case kTypeColumnFamilyMerge:
+    case kTypeMerge:
+      *type = kMergeRecord;
+      break;
+    case kTypeLogData:
+      *type = kLogDataRecord;
+      break;
+    case kTypeNoop:
+    case kTypeBeginPrepareXID:
+    case kTypeBeginPersistedPrepareXID:
+    case kTypeBeginUnprepareXID:
+    case kTypeEndPrepareXID:
+    case kTypeCommitXID:
+    case kTypeRollbackXID:
+      *type = kXIDRecord;
+      break;
+    default:
+      return Status::Corruption("unknown WriteBatch tag ",
+                                std::to_string(static_cast<unsigned int>(tag)));
+  }
+  return Status::OK();
+}
+
+// If both `entry1` and `entry2` point to real entries in the write batch, we
+// compare them as follows:
+// 1. First compare the column family; the entry with the larger CF is larger.
+// 2. Within the same CF, decode each entry to find its key; the entry with the
+//    larger key is larger.
+// 3. If two entries have the same CF and key, the one with the larger offset
+//    is larger.
+// Sometimes either `entry1` or `entry2` is a dummy entry, which is actually
+// a search key. In this case, in step 2, we don't go ahead and decode the
+// entry but use the value in WriteBatchIndexEntry::search_key.
+// One special case is WriteBatchIndexEntry::key_size being kFlagMinInCf,
+// which indicates that we are going to seek to the first entry of the column
+// family. Once we see this, this entry will be smaller than all the real
+// entries of the column family.
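+// Worked example of the ordering (annotation, not upstream text), assuming
+// the default bytewise key comparator:
+//   (cf=1, key="a", offset=10) < (cf=1, key="a", offset=30)  -- by rule 3
+//   (cf=1, key="a", offset=30) < (cf=1, key="b", offset=20)  -- by rule 2
+//   (cf=1, key="b", offset=20) < (cf=2, key="a", offset=5)   -- by rule 1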
+int WriteBatchEntryComparator::operator()( + const WriteBatchIndexEntry* entry1, + const WriteBatchIndexEntry* entry2) const { + if (entry1->column_family > entry2->column_family) { + return 1; + } else if (entry1->column_family < entry2->column_family) { + return -1; + } + + // Deal with special case of seeking to the beginning of a column family + if (entry1->is_min_in_cf()) { + return -1; + } else if (entry2->is_min_in_cf()) { + return 1; + } + + Slice key1, key2; + if (entry1->search_key == nullptr) { + key1 = Slice(write_batch_->Data().data() + entry1->key_offset, + entry1->key_size); + } else { + key1 = *(entry1->search_key); + } + if (entry2->search_key == nullptr) { + key2 = Slice(write_batch_->Data().data() + entry2->key_offset, + entry2->key_size); + } else { + key2 = *(entry2->search_key); + } + + int cmp = CompareKey(entry1->column_family, key1, key2); + if (cmp != 0) { + return cmp; + } else if (entry1->offset > entry2->offset) { + return 1; + } else if (entry1->offset < entry2->offset) { + return -1; + } + return 0; +} + +int WriteBatchEntryComparator::CompareKey(uint32_t column_family, + const Slice& key1, + const Slice& key2) const { + if (column_family < cf_comparators_.size() && + cf_comparators_[column_family] != nullptr) { + return cf_comparators_[column_family]->CompareWithoutTimestamp( + key1, /*a_has_ts=*/false, key2, /*b_has_ts=*/false); + } else { + return default_comparator_->CompareWithoutTimestamp( + key1, /*a_has_ts=*/false, key2, /*b_has_ts=*/false); + } +} + +const Comparator* WriteBatchEntryComparator::GetComparator( + const ColumnFamilyHandle* column_family) const { + return column_family ? column_family->GetComparator() : default_comparator_; +} + +const Comparator* WriteBatchEntryComparator::GetComparator( + uint32_t column_family) const { + if (column_family < cf_comparators_.size() && + cf_comparators_[column_family]) { + return cf_comparators_[column_family]; + } + return default_comparator_; +} + +WriteEntry WBWIIteratorImpl::Entry() const { + WriteEntry ret; + Slice blob, xid; + const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key(); + // this is guaranteed with Valid() + assert(iter_entry != nullptr && + iter_entry->column_family == column_family_id_); + auto s = write_batch_->GetEntryFromDataOffset( + iter_entry->offset, &ret.type, &ret.key, &ret.value, &blob, &xid); + assert(s.ok()); + assert(ret.type == kPutRecord || ret.type == kDeleteRecord || + ret.type == kSingleDeleteRecord || ret.type == kDeleteRangeRecord || + ret.type == kMergeRecord); + // Make sure entry.key does not include user-defined timestamp. 
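+  // (The raw batch record keeps the key together with its timestamp suffix;
+  // callers of Entry() expect the bare user key, so it is stripped below.)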
+  const Comparator* const ucmp = comparator_->GetComparator(column_family_id_);
+  size_t ts_sz = ucmp->timestamp_size();
+  if (ts_sz > 0) {
+    ret.key = StripTimestampFromUserKey(ret.key, ts_sz);
+  }
+  return ret;
+}
+
+bool WBWIIteratorImpl::MatchesKey(uint32_t cf_id, const Slice& key) {
+  if (Valid()) {
+    return comparator_->CompareKey(cf_id, key, Entry().key) == 0;
+  } else {
+    return false;
+  }
+}
+
+WriteBatchWithIndexInternal::WriteBatchWithIndexInternal(
+    ColumnFamilyHandle* column_family)
+    : db_(nullptr), db_options_(nullptr), column_family_(column_family) {}
+
+WriteBatchWithIndexInternal::WriteBatchWithIndexInternal(
+    DB* db, ColumnFamilyHandle* column_family)
+    : db_(db), db_options_(nullptr), column_family_(column_family) {
+  if (db_ != nullptr && column_family_ == nullptr) {
+    column_family_ = db_->DefaultColumnFamily();
+  }
+}
+
+WriteBatchWithIndexInternal::WriteBatchWithIndexInternal(
+    const DBOptions* db_options, ColumnFamilyHandle* column_family)
+    : db_(nullptr), db_options_(db_options), column_family_(column_family) {}
+
+Status WriteBatchWithIndexInternal::MergeKey(const Slice& key,
+                                             const Slice* value,
+                                             const MergeContext& context,
+                                             std::string* result) const {
+  if (column_family_ != nullptr) {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family_);
+    const auto merge_operator = cfh->cfd()->ioptions()->merge_operator.get();
+    if (merge_operator == nullptr) {
+      return Status::InvalidArgument(
+          "Merge_operator must be set for column_family");
+    } else if (db_ != nullptr) {
+      const ImmutableDBOptions& immutable_db_options =
+          static_cast_with_check<DBImpl>(db_->GetRootDB())
+              ->immutable_db_options();
+      Statistics* statistics = immutable_db_options.statistics.get();
+      Logger* logger = immutable_db_options.info_log.get();
+      SystemClock* clock = immutable_db_options.clock;
+      // `op_failure_scope` (an output parameter) is not provided (set to
+      // nullptr) since a failure must be propagated regardless of its value.
+      return MergeHelper::TimedFullMerge(
+          merge_operator, key, value, context.GetOperands(), result, logger,
+          statistics, clock, /* result_operand */ nullptr,
+          /* update_num_ops_stats */ false,
+          /* op_failure_scope */ nullptr);
+    } else if (db_options_ != nullptr) {
+      Statistics* statistics = db_options_->statistics.get();
+      Env* env = db_options_->env;
+      Logger* logger = db_options_->info_log.get();
+      SystemClock* clock = env->GetSystemClock().get();
+      // `op_failure_scope` (an output parameter) is not provided (set to
+      // nullptr) since a failure must be propagated regardless of its value.
+      return MergeHelper::TimedFullMerge(
+          merge_operator, key, value, context.GetOperands(), result, logger,
+          statistics, clock, /* result_operand */ nullptr,
+          /* update_num_ops_stats */ false,
+          /* op_failure_scope */ nullptr);
+    } else {
+      const auto cf_opts = cfh->cfd()->ioptions();
+      // `op_failure_scope` (an output parameter) is not provided (set to
+      // nullptr) since a failure must be propagated regardless of its value.
+      return MergeHelper::TimedFullMerge(
+          merge_operator, key, value, context.GetOperands(), result,
+          cf_opts->logger, cf_opts->stats, cf_opts->clock,
+          /* result_operand */ nullptr, /* update_num_ops_stats */ false,
+          /* op_failure_scope */ nullptr);
+    }
+  } else {
+    return Status::InvalidArgument("Must provide a column_family");
+  }
+}
+
+WBWIIteratorImpl::Result WriteBatchWithIndexInternal::GetFromBatch(
+    WriteBatchWithIndex* batch, const Slice& key, MergeContext* context,
+    std::string* value, Status* s) {
+  *s = Status::OK();
+
+  std::unique_ptr<WBWIIteratorImpl> iter(
+      static_cast_with_check<WBWIIteratorImpl>(
+          batch->NewIterator(column_family_)));
+
+  // Search the iterator for this key, and updates/merges to it.
+  iter->Seek(key);
+  auto result = iter->FindLatestUpdate(key, context);
+  if (result == WBWIIteratorImpl::kError) {
+    (*s) = Status::Corruption("Unexpected entry in WriteBatchWithIndex:",
+                              std::to_string(iter->Entry().type));
+    return result;
+  } else if (result == WBWIIteratorImpl::kNotFound) {
+    return result;
+  } else if (result == WBWIIteratorImpl::Result::kFound) {  // PUT
+    Slice entry_value = iter->Entry().value;
+    if (context->GetNumOperands() > 0) {
+      *s = MergeKey(key, &entry_value, *context, value);
+      if (!s->ok()) {
+        result = WBWIIteratorImpl::Result::kError;
+      }
+    } else {
+      value->assign(entry_value.data(), entry_value.size());
+    }
+  } else if (result == WBWIIteratorImpl::kDeleted) {
+    if (context->GetNumOperands() > 0) {
+      *s = MergeKey(key, nullptr, *context, value);
+      if (s->ok()) {
+        result = WBWIIteratorImpl::Result::kFound;
+      } else {
+        result = WBWIIteratorImpl::Result::kError;
+      }
+    }
+  }
+  return result;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
diff --git a/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index_internal.h b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index_internal.h
new file mode 100644
index 0000000000..031d72889e
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/utilities/write_batch_with_index/write_batch_with_index_internal.h
@@ -0,0 +1,342 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "db/merge_context.h"
+#include "memtable/skiplist.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergeContext;
+class WBWIIteratorImpl;
+class WriteBatchWithIndexInternal;
+struct Options;
+
+// when direction == forward
+// * current_at_base_ <=> base_iterator > delta_iterator
+// when direction == backwards
+// * current_at_base_ <=> base_iterator < delta_iterator
+// always:
+// * equal_keys_ <=> base_iterator == delta_iterator
+class BaseDeltaIterator : public Iterator {
+ public:
+  BaseDeltaIterator(ColumnFamilyHandle* column_family, Iterator* base_iterator,
+                    WBWIIteratorImpl* delta_iterator,
+                    const Comparator* comparator,
+                    const ReadOptions* read_options = nullptr);
+
+  ~BaseDeltaIterator() override {}
+
+  bool Valid() const override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Seek(const Slice& k) override;
+  void SeekForPrev(const Slice& k) override;
+  void Next() override;
+  void Prev() override;
+  Slice key() const override;
+  Slice value() const override;
+  Status status() const override;
+  void Invalidate(Status s);
+
+ private:
+  void AssertInvariants();
+  void Advance();
+  void AdvanceDelta();
+  void AdvanceBase();
+  bool BaseValid() const;
+  bool DeltaValid() const;
+  void UpdateCurrent();
+
+  std::unique_ptr<WriteBatchWithIndexInternal> wbwii_;
+  bool forward_;
+  bool current_at_base_;
+  bool equal_keys_;
+  mutable Status status_;
+  std::unique_ptr<Iterator> base_iterator_;
+  std::unique_ptr<WBWIIteratorImpl> delta_iterator_;
+  const Comparator* comparator_;  // not owned
+  const Slice* iterate_upper_bound_;
+  mutable PinnableSlice merge_result_;
+};
+
+// Key used by skip list, as the binary searchable index of
+// WriteBatchWithIndex.
+struct WriteBatchIndexEntry {
+  WriteBatchIndexEntry(size_t o, uint32_t c, size_t ko, size_t ksz)
+      : offset(o),
+        column_family(c),
+        key_offset(ko),
+        key_size(ksz),
+        search_key(nullptr) {}
+  // Create a dummy entry as the search key. This index entry won't be backed
+  // by an entry from the write batch, but by a pointer to the search key.
+  // Alternatively, a special flag in offset can indicate that we seek to the
+  // first entry of the column family.
+  // @_search_key: the search key
+  // @_column_family: column family
+  // @is_forward_direction: true for Seek(). False for SeekForPrev()
+  // @is_seek_to_first: true if we seek to the beginning of the column family;
+  //                    _search_key should be null in this case.
+  WriteBatchIndexEntry(const Slice* _search_key, uint32_t _column_family,
+                       bool is_forward_direction, bool is_seek_to_first)
+      // For SeekForPrev(), we need to make the dummy entry larger than any
+      // entry that has the same search key. Otherwise, we'll miss those
+      // entries.
+      : offset(is_forward_direction ? 0 : std::numeric_limits<size_t>::max()),
+        column_family(_column_family),
+        key_offset(0),
+        key_size(is_seek_to_first ? kFlagMinInCf : 0),
+        search_key(_search_key) {
+    assert(_search_key != nullptr || is_seek_to_first);
+  }
+
+  // If this flag appears in the key_size, it indicates a
+  // key that is smaller than any other entry for the same column family.
+  static const size_t kFlagMinInCf = std::numeric_limits<size_t>::max();
+
+  bool is_min_in_cf() const {
+    assert(key_size != kFlagMinInCf ||
+           (key_offset == 0 && search_key == nullptr));
+    return key_size == kFlagMinInCf;
+  }
+
+  // Offset of an entry in the write batch's string buffer. If this is a dummy
+  // lookup key (in which case search_key != nullptr), offset is set to either
+  // 0 or max, only for comparison purposes: when entries have the same key,
+  // the entry with the larger offset is larger, so offset = 0 makes a seek
+  // key smaller than or equal to all the entries with that key, and Seek()
+  // will find all the entries of the same key. Similarly, offset = MAX makes
+  // the entry just larger than all entries with the search key, so
+  // SeekForPrev() will see all the entries with the same key.
+  size_t offset;
+  uint32_t column_family;  // column family of the entry.
+  size_t key_offset;       // offset of the key in write batch's string buffer.
+  size_t key_size;         // size of the key. kFlagMinInCf indicates
+                           // that this is a dummy look up entry for
+                           // SeekToFirst() to the beginning of the column
+                           // family. We use the flag here to save a boolean
+                           // in the struct.
+
+  const Slice* search_key;  // if not null, instead of reading keys from
+                            // the write batch, use it to compare. This is
+                            // used for lookup keys.
+};
+
+class ReadableWriteBatch : public WriteBatch {
+ public:
+  explicit ReadableWriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0,
+                              size_t protection_bytes_per_key = 0,
+                              size_t default_cf_ts_sz = 0)
+      : WriteBatch(reserved_bytes, max_bytes, protection_bytes_per_key,
+                   default_cf_ts_sz) {}
+  // Retrieve some information from a write entry in the write batch, given
+  // the start offset of the write entry.
+  Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key,
+                                Slice* value, Slice* blob, Slice* xid) const;
+};
+
+class WriteBatchEntryComparator {
+ public:
+  WriteBatchEntryComparator(const Comparator* _default_comparator,
+                            const ReadableWriteBatch* write_batch)
+      : default_comparator_(_default_comparator), write_batch_(write_batch) {}
+  // Compare a and b. Return a negative value if a is less than b, 0 if they
+  // are equal, and a positive value if a is greater than b.
+  int operator()(const WriteBatchIndexEntry* entry1,
+                 const WriteBatchIndexEntry* entry2) const;
+
+  int CompareKey(uint32_t column_family, const Slice& key1,
+                 const Slice& key2) const;
+
+  void SetComparatorForCF(uint32_t column_family_id,
+                          const Comparator* comparator) {
+    if (column_family_id >= cf_comparators_.size()) {
+      cf_comparators_.resize(column_family_id + 1, nullptr);
+    }
+    cf_comparators_[column_family_id] = comparator;
+  }
+
+  const Comparator* default_comparator() { return default_comparator_; }
+
+  const Comparator* GetComparator(
+      const ColumnFamilyHandle* column_family) const;
+
+  const Comparator* GetComparator(uint32_t column_family) const;
+
+ private:
+  const Comparator* const default_comparator_;
+  std::vector<const Comparator*> cf_comparators_;
+  const ReadableWriteBatch* const write_batch_;
+};
+
+using WriteBatchEntrySkipList =
+    SkipList<WriteBatchIndexEntry*, const WriteBatchEntryComparator&>;
+
+class WBWIIteratorImpl : public WBWIIterator {
+ public:
+  enum Result : uint8_t {
+    kFound,
+    kDeleted,
+    kNotFound,
+    kMergeInProgress,
+    kError
+  };
+  WBWIIteratorImpl(uint32_t column_family_id,
+                   WriteBatchEntrySkipList* skip_list,
+                   const ReadableWriteBatch* write_batch,
+                   WriteBatchEntryComparator* comparator)
+      : column_family_id_(column_family_id),
+        skip_list_iter_(skip_list),
+        write_batch_(write_batch),
+        comparator_(comparator) {}
+
+  ~WBWIIteratorImpl() override {}
+
+  bool Valid() const override {
+    if (!skip_list_iter_.Valid()) {
+      return false;
+    }
+    const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key();
+    return (iter_entry != nullptr &&
+            iter_entry->column_family == column_family_id_);
+  }
+
+  void SeekToFirst() override {
+    WriteBatchIndexEntry search_entry(
+        nullptr /* search_key */, column_family_id_,
+        true /* is_forward_direction */, true /* is_seek_to_first */);
+    skip_list_iter_.Seek(&search_entry);
+  }
+
+  void SeekToLast() override {
+    WriteBatchIndexEntry search_entry(
+        nullptr /* search_key */, column_family_id_ + 1,
+        true /* is_forward_direction */, true /* is_seek_to_first */);
+    skip_list_iter_.Seek(&search_entry);
+    if (!skip_list_iter_.Valid()) {
+      skip_list_iter_.SeekToLast();
+    } else {
+      skip_list_iter_.Prev();
+    }
+  }
+
+  void Seek(const Slice& key) override {
+    WriteBatchIndexEntry search_entry(&key, column_family_id_,
+                                      true /* is_forward_direction */,
+                                      false /* is_seek_to_first */);
+    skip_list_iter_.Seek(&search_entry);
+  }
+
+  void SeekForPrev(const Slice& key) override {
+    WriteBatchIndexEntry search_entry(&key, column_family_id_,
+                                      false /* is_forward_direction */,
+                                      false /* is_seek_to_first */);
+    skip_list_iter_.SeekForPrev(&search_entry);
+  }
+
+  void Next() override { skip_list_iter_.Next(); }
+
+  void Prev() override { skip_list_iter_.Prev(); }
+
+  WriteEntry Entry() const override;
+
+  Status status() const override {
+    // This is an in-memory data structure, so the only way status can be
+    // non-ok is through memory corruption.
+    return Status::OK();
+  }
+
+  const WriteBatchIndexEntry* GetRawEntry() const {
+    return skip_list_iter_.key();
+  }
+
+  bool MatchesKey(uint32_t cf_id, const Slice& key);
+
+  // Moves the iterator to the first entry of the previous key.
+  void PrevKey();
+  // Moves the iterator to the first entry of the next key.
+ void NextKey(); + + // Moves the iterator to the Update (Put or Delete) for the current key + // If there are no Put/Delete, the Iterator will point to the first entry for + // this key + // @return kFound if a Put was found for the key + // @return kDeleted if a delete was found for the key + // @return kMergeInProgress if only merges were fouund for the key + // @return kError if an unsupported operation was found for the key + // @return kNotFound if no operations were found for this key + // + Result FindLatestUpdate(const Slice& key, MergeContext* merge_context); + Result FindLatestUpdate(MergeContext* merge_context); + + protected: + void AdvanceKey(bool forward); + + private: + uint32_t column_family_id_; + WriteBatchEntrySkipList::Iterator skip_list_iter_; + const ReadableWriteBatch* write_batch_; + WriteBatchEntryComparator* comparator_; +}; + +class WriteBatchWithIndexInternal { + public: + static const Comparator* GetUserComparator(const WriteBatchWithIndex& wbwi, + uint32_t cf_id); + + // For GetFromBatchAndDB or similar + explicit WriteBatchWithIndexInternal(DB* db, + ColumnFamilyHandle* column_family); + // For GetFromBatchAndDB or similar + explicit WriteBatchWithIndexInternal(ColumnFamilyHandle* column_family); + // For GetFromBatch or similar + explicit WriteBatchWithIndexInternal(const DBOptions* db_options, + ColumnFamilyHandle* column_family); + + // If batch contains a value for key, store it in *value and return kFound. + // If batch contains a deletion for key, return Deleted. + // If batch contains Merge operations as the most recent entry for a key, + // and the merge process does not stop (not reaching a value or delete), + // prepend the current merge operands to *operands, + // and return kMergeInProgress + // If batch does not contain this key, return kNotFound + // Else, return kError on error with error Status stored in *s. + WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch, + const Slice& key, std::string* value, + Status* s) { + return GetFromBatch(batch, key, &merge_context_, value, s); + } + WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch, + const Slice& key, + MergeContext* merge_context, + std::string* value, Status* s); + Status MergeKey(const Slice& key, const Slice* value, + std::string* result) const { + return MergeKey(key, value, merge_context_, result); + } + Status MergeKey(const Slice& key, const Slice* value, + const MergeContext& context, std::string* result) const; + size_t GetNumOperands() const { return merge_context_.GetNumOperands(); } + MergeContext* GetMergeContext() { return &merge_context_; } + Slice GetOperand(int index) const { return merge_context_.GetOperand(index); } + + private: + DB* db_; + const DBOptions* db_options_; + ColumnFamilyHandle* column_family_; + MergeContext merge_context_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/vendored/rocksdb-haskell-kadena/src/Database/RocksDB.hs b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB.hs new file mode 100644 index 0000000000..730a122d65 --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB.hs @@ -0,0 +1,17 @@ +-- | +-- Module : Database.RocksDB +-- Copyright : (c) 2012-2013 The leveldb-haskell Authors +-- (c) 2014 The rocksdb-haskell Authors +-- License : BSD3 +-- Maintainer : mail@agrafix.net +-- Stability : experimental +-- Portability : non-portable +-- +-- RocksDB Haskell binding. +-- +-- The API closely follows the C-API of RocksDB. 
+-- For more information, see: + +module Database.RocksDB (module Base) where + +import Database.RocksDB.Base as Base diff --git a/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Base.hs b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Base.hs new file mode 100644 index 0000000000..4aff66a7cf --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Base.hs @@ -0,0 +1,336 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE TupleSections #-} + +-- | +-- Module : Database.RocksDB.Base +-- Copyright : (c) 2012-2013 The leveldb-haskell Authors +-- (c) 2014 The rocksdb-haskell Authors +-- License : BSD3 +-- Maintainer : mail@agrafix.net +-- Stability : experimental +-- Portability : non-portable +-- +-- RocksDB Haskell binding. +-- +-- The API closely follows the C-API of RocksDB. +-- For more information, see: + +module Database.RocksDB.Base + ( -- * Exported Types + DB + , BatchOp (..) + , Comparator (..) + , Compression (..) + , Options (..) + , Snapshot + , WriteBatch + , WriteOptions (..) + , Range + + -- * Defaults + , defaultOptions + , defaultWriteOptions + + -- * Basic Database Manipulations + , open + , openReadOnly + -- , openBracket + , close + , put + , putBinaryVal + , putBinary + , delete + , write + , get + , getBinary + , getBinaryVal + , withSnapshot + -- , withSnapshotBracket + , createSnapshot + , releaseSnapshot + + -- * Administrative Functions + , Property (..), getProperty + , destroy + , repair + , approximateSize + + -- * Utility functions to help perform mass writes + , binaryToBS + , bsToBinary + + -- * Iteration + , module Database.RocksDB.Iterator + ) where + +import Control.Exception (bracket, bracketOnError) +import Control.Monad (liftM, when) + +import Control.Monad.IO.Class (MonadIO (liftIO)) +import Data.Binary (Binary) +import qualified Data.Binary as Binary +import Data.ByteString (ByteString) +import Data.ByteString.Internal (ByteString (..)) +import qualified Data.ByteString.Lazy as BSL +import Foreign +import Foreign.C.String (CString, withCString) +import System.Directory (createDirectoryIfMissing) + +import Database.RocksDB.C +import Database.RocksDB.Internal +import Database.RocksDB.Iterator +import Database.RocksDB.ReadOptions +import Database.RocksDB.Types + +import qualified Data.ByteString as BS +import qualified Data.ByteString.Unsafe as BU + +import qualified GHC.Foreign as GHC +import qualified GHC.IO.Encoding as GHC + +-- -- | Open a database +-- -- +-- -- The returned handle will automatically be released when the enclosing +-- -- 'runResourceT' terminates. +-- openBracket :: MonadResource m => FilePath -> Options -> m (ReleaseKey, DB) +-- openBracket path opts = allocate (open path opts) close +-- {-# INLINE openBracket #-} + +-- -- | Run an action with a snapshot of the database. +-- -- +-- -- The snapshot will be released when the action terminates or throws an +-- -- exception. Note that this function is provided for convenience and does not +-- -- prevent the 'Snapshot' handle to escape. It will, however, be invalid after +-- -- this function returns and should not be used anymore. +-- withSnapshotBracket :: MonadResource m => DB -> (Snapshot -> m a) -> m a +-- withSnapshotBracket db f = do +-- (rk, snap) <- createSnapshotBracket db +-- res <- f snap +-- release rk +-- return res + +-- -- | Create a snapshot of the database. +-- -- +-- -- The returned 'Snapshot' will be released automatically when the enclosing +-- -- 'runResourceT' terminates. 
It is recommended to use 'createSnapshot'' instead
-- -- and release the resource manually as soon as possible.
-- -- Can be released early.
-- createSnapshotBracket :: MonadResource m => DB -> m (ReleaseKey, Snapshot)
-- createSnapshotBracket db = allocate (createSnapshot db) (releaseSnapshot db)
+
+openWith :: MonadIO m => (OptionsPtr -> CString -> ErrPtr -> IO RocksDBPtr) -> FilePath -> Options -> m DB
+openWith opener path opts = liftIO $ bracketOnError initialize finalize mkDB
+  where
+# ifdef mingw32_HOST_OS
+    initialize =
+        (, ()) <$> mkOpts opts
+    finalize (opts', ()) =
+        freeOpts opts'
+# else
+    initialize = do
+        opts' <- mkOpts opts
+        -- With LC_ALL=C, two things happen:
+        --   * rocksdb can't open a database with unicode in the path;
+        --   * rocksdb can't create a folder properly.
+        -- So, we create the folder ourselves, and for that we need to set
+        -- the encoding we're going to use. On Linux it's almost always
+        -- UTF-8.
+        oldenc <- GHC.getFileSystemEncoding
+        when (createIfMissing opts) $
+            GHC.setFileSystemEncoding GHC.utf8
+        pure (opts', oldenc)
+    finalize (opts', oldenc) = do
+        freeOpts opts'
+        GHC.setFileSystemEncoding oldenc
+# endif
+    mkDB (Options' opts_ptr _ _, _) = do
+        when (createIfMissing opts) $
+            createDirectoryIfMissing True path
+        withFilePath path $ \path_ptr ->
+            liftM DB
+            $ throwIfErr "open"
+            $ opener opts_ptr path_ptr
+
+-- | Open a database.
+--
+-- The returned handle should be released with 'close'.
+open :: MonadIO m => FilePath -> Options -> m DB
+open = openWith c_rocksdb_open
+
+-- | Open a database in read-only mode. Any changes made to the database
+-- after it is opened read-only will not be visible until it is re-opened.
+--
+-- The returned handle should be released with 'close'.
+openReadOnly :: MonadIO m => FilePath -> Options -> m DB
+openReadOnly = openWith (\o p -> c_rocksdb_open_for_read_only o p 0)
+
+-- | Close a database.
+--
+-- The handle will be invalid after calling this action and should no
+-- longer be used.
+close :: MonadIO m => DB -> m ()
+close (DB db_ptr) = liftIO $
+    c_rocksdb_close db_ptr
+
+-- | Run an action with a 'Snapshot' of the database.
+withSnapshot :: MonadIO m => DB -> (Snapshot -> IO a) -> m a
+withSnapshot db act = liftIO $
+    bracket (createSnapshot db) (releaseSnapshot db) act
+
+-- | Create a snapshot of the database.
+--
+-- The returned 'Snapshot' should be released with 'releaseSnapshot'.
+createSnapshot :: MonadIO m => DB -> m Snapshot
+createSnapshot (DB db_ptr) = liftIO $
+    Snapshot <$> c_rocksdb_create_snapshot db_ptr
+
+-- | Release a snapshot.
+--
+-- The handle will be invalid after calling this action and should no
+-- longer be used.
+releaseSnapshot :: MonadIO m => DB -> Snapshot -> m ()
+releaseSnapshot (DB db_ptr) (Snapshot snap) = liftIO $
+    c_rocksdb_release_snapshot db_ptr snap
+
+-- | Get a DB property.
+getProperty :: MonadIO m => DB -> Property -> m (Maybe ByteString)
+getProperty (DB db_ptr) p = liftIO $
+    withCString (prop p) $ \prop_ptr -> do
+        val_ptr <- c_rocksdb_property_value db_ptr prop_ptr
+        if val_ptr == nullPtr
+            then return Nothing
+            else do res <- Just <$> BS.packCString val_ptr
+                    freeCString val_ptr
+                    return res
+  where
+    prop (NumFilesAtLevel i) = "rocksdb.num-files-at-level" ++ show i
+    prop Stats               = "rocksdb.stats"
+    prop SSTables            = "rocksdb.sstables"
+
+-- | Destroy the given RocksDB database.
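+--
+-- A hedged usage sketch (the path is illustrative; no handle may be open
+-- on the database while it is destroyed):
+--
+-- > destroy "/tmp/mydb" defaultOptions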
+destroy :: MonadIO m => FilePath -> Options -> m () +destroy path opts = liftIO $ bracket (mkOpts opts) freeOpts destroy' + where + destroy' (Options' opts_ptr _ _) = + withFilePath path $ \path_ptr -> + throwIfErr "destroy" $ c_rocksdb_destroy_db opts_ptr path_ptr + +-- | Repair the given RocksDB database. +repair :: MonadIO m => FilePath -> Options -> m () +repair path opts = liftIO $ bracket (mkOpts opts) freeOpts repair' + where + repair' (Options' opts_ptr _ _) = + withFilePath path $ \path_ptr -> + throwIfErr "repair" $ c_rocksdb_repair_db opts_ptr path_ptr + +-- TODO: support [Range], like C API does +type Range = (ByteString, ByteString) + +-- | Inspect the approximate sizes of the different levels. +approximateSize :: MonadIO m => DB -> Range -> m Int64 +approximateSize (DB db_ptr) (from, to) = liftIO $ + BU.unsafeUseAsCStringLen from $ \(from_ptr, flen) -> + BU.unsafeUseAsCStringLen to $ \(to_ptr, tlen) -> + withArray [from_ptr] $ \from_ptrs -> + withArray [intToCSize flen] $ \flen_ptrs -> + withArray [to_ptr] $ \to_ptrs -> + withArray [intToCSize tlen] $ \tlen_ptrs -> + allocaArray 1 $ \size_ptrs -> do + c_rocksdb_approximate_sizes db_ptr 1 + from_ptrs flen_ptrs + to_ptrs tlen_ptrs + size_ptrs + liftM head $ peekArray 1 size_ptrs >>= mapM toInt64 + + where + toInt64 = return . fromIntegral + +putBinaryVal :: (MonadIO m, Binary v) => DB -> WriteOptions -> ByteString -> v -> m () +putBinaryVal db wopts key val = put db wopts key (binaryToBS val) + +putBinary :: (MonadIO m, Binary k, Binary v) => DB -> WriteOptions -> k -> v -> m () +putBinary db wopts key val = put db wopts (binaryToBS key) (binaryToBS val) + +-- | Write a key/value pair. +put :: MonadIO m => DB -> WriteOptions -> ByteString -> ByteString -> m () +put (DB db_ptr) opts key value = liftIO $ withCWriteOpts opts $ \opts_ptr -> + BU.unsafeUseAsCStringLen key $ \(key_ptr, klen) -> + BU.unsafeUseAsCStringLen value $ \(val_ptr, vlen) -> + throwIfErr "put" + $ c_rocksdb_put db_ptr opts_ptr + key_ptr (intToCSize klen) + val_ptr (intToCSize vlen) + +getBinaryVal :: (Binary v, MonadIO m) => DB -> ReadOptions -> ByteString -> m (Maybe v) +getBinaryVal db ropts key = fmap bsToBinary <$> get db ropts key + +getBinary :: (MonadIO m, Binary k, Binary v) => DB -> ReadOptions -> k -> m (Maybe v) +getBinary db ropts key = fmap bsToBinary <$> get db ropts (binaryToBS key) + +-- | Read a value by key. +get :: MonadIO m => DB -> ReadOptions -> ByteString -> m (Maybe ByteString) +get (DB db_ptr) opts key = liftIO $ withReadOptions opts $ \opts_ptr -> + BU.unsafeUseAsCStringLen key $ \(key_ptr, klen) -> + alloca $ \vlen_ptr -> do + val_ptr <- throwIfErr "get" $ + c_rocksdb_get db_ptr opts_ptr key_ptr (intToCSize klen) vlen_ptr + vlen <- peek vlen_ptr + if val_ptr == nullPtr + then return Nothing + else do + res' <- Just <$> BS.packCStringLen (val_ptr, cSizeToInt vlen) + freeCString val_ptr + return res' + +-- | Delete a key/value pair. +delete :: MonadIO m => DB -> WriteOptions -> ByteString -> m () +delete (DB db_ptr) opts key = liftIO $ withCWriteOpts opts $ \opts_ptr -> + BU.unsafeUseAsCStringLen key $ \(key_ptr, klen) -> + throwIfErr "delete" + $ c_rocksdb_delete db_ptr opts_ptr key_ptr (intToCSize klen) + +-- | Perform a batch mutation. 
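+--
+-- A hedged sketch of a minimal batch (keys and values are illustrative,
+-- written here with Data.ByteString.Char8.pack):
+--
+-- > let k = Data.ByteString.Char8.pack
+-- > write db defaultWriteOptions [ Put (k "k1") (k "v1"), Del (k "k2") ]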
+write :: MonadIO m => DB -> WriteOptions -> WriteBatch -> m () +write (DB db_ptr) opts batch = liftIO $ withCWriteOpts opts $ \opts_ptr -> + bracket c_rocksdb_writebatch_create c_rocksdb_writebatch_destroy $ \batch_ptr -> do + + mapM_ (batchAdd batch_ptr) batch + + throwIfErr "write" $ c_rocksdb_write db_ptr opts_ptr batch_ptr + + -- ensure @ByteString@s (and respective shared @CStringLen@s) aren't GC'ed + -- until here + mapM_ (liftIO . touch) batch + + where + batchAdd batch_ptr (Put key val) = + BU.unsafeUseAsCStringLen key $ \(key_ptr, klen) -> + BU.unsafeUseAsCStringLen val $ \(val_ptr, vlen) -> + c_rocksdb_writebatch_put batch_ptr + key_ptr (intToCSize klen) + val_ptr (intToCSize vlen) + + batchAdd batch_ptr (Del key) = + BU.unsafeUseAsCStringLen key $ \(key_ptr, klen) -> + c_rocksdb_writebatch_delete batch_ptr key_ptr (intToCSize klen) + + touch (Put (PS p _ _) (PS p' _ _)) = do + touchForeignPtr p + touchForeignPtr p' + + touch (Del (PS p _ _)) = touchForeignPtr p + +binaryToBS :: Binary v => v -> ByteString +binaryToBS x = BSL.toStrict (Binary.encode x) + +bsToBinary :: Binary v => ByteString -> v +bsToBinary x = Binary.decode (BSL.fromStrict x) + +-- | Marshal a 'FilePath' (Haskell string) into a `NUL` terminated C string using +-- temporary storage. +-- On Linux, UTF-8 is almost always the encoding used. +-- When on Windows, UTF-8 can also be used, although the default for those devices is +-- UTF-16. For a more detailed explanation, please refer to +-- https://msdn.microsoft.com/en-us/library/windows/desktop/dd374081(v=vs.85).aspx. +withFilePath :: FilePath -> (CString -> IO a) -> IO a +withFilePath = GHC.withCString GHC.utf8 diff --git a/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/C.hsc b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/C.hsc new file mode 100644 index 0000000000..82090193fa --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/C.hsc @@ -0,0 +1,415 @@ +{-# LANGUAGE CPP, ForeignFunctionInterface, EmptyDataDecls #-} +-- | +-- Module : Database.RocksDB.C +-- Copyright : (c) 2012-2013 The rocksdb-haskell Authors +-- License : BSD3 +-- Maintainer : kim.altintop@gmail.com +-- Stability : experimental +-- Portability : non-portable +-- + +module Database.RocksDB.C where + +import Foreign +import Foreign.C.Types +import Foreign.C.String + +#ifdef mingw32_HOST_OS +#include +#else +#include +#endif + +data RocksDB +data LCache +data LComparator +data LIterator +data LLogger +data LOptions +data LReadOptions +data LSnapshot +data LWriteBatch +data LWriteOptions +data PrefixExtractor +data Checkpoint + +type RocksDBPtr = Ptr RocksDB +type CachePtr = Ptr LCache +type ComparatorPtr = Ptr LComparator +type IteratorPtr = Ptr LIterator +type LoggerPtr = Ptr LLogger +type OptionsPtr = Ptr LOptions +type ReadOptionsPtr = Ptr LReadOptions +type SnapshotPtr = Ptr LSnapshot +type WriteBatchPtr = Ptr LWriteBatch +type WriteOptionsPtr = Ptr LWriteOptions + +type DBName = CString +type ErrPtr = Ptr CString +type Key = CString +type Val = CString + +newtype CompressionOpt = CompressionOpt { compressionOpt :: CInt } + deriving (Eq, Show) +#{enum CompressionOpt, CompressionOpt + , noCompression = 0 + , snappyCompression = 1 + , zlibCompression = 2 + , bz2Compression = 3 + , lz4Compression = 4 + , lz4hcCompression = 5 + } + + +foreign import ccall safe "rocksdb\\c.h rocksdb_open" + c_rocksdb_open :: OptionsPtr -> DBName -> ErrPtr -> IO RocksDBPtr + +foreign import ccall safe "rocksdb\\c.h rocksdb_open_for_read_only" + c_rocksdb_open_for_read_only :: 
OptionsPtr -> DBName -> CChar -> ErrPtr -> IO RocksDBPtr + +foreign import ccall safe "rocksdb\\c.h rocksdb_close" + c_rocksdb_close :: RocksDBPtr -> IO () + + +foreign import ccall safe "rocksdb\\c.h rocksdb_put" + c_rocksdb_put :: RocksDBPtr + -> WriteOptionsPtr + -> Key -> CSize + -> Val -> CSize + -> ErrPtr + -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_delete" + c_rocksdb_delete :: RocksDBPtr + -> WriteOptionsPtr + -> Key -> CSize + -> ErrPtr + -> IO () + + +foreign import ccall unsafe "cpp\\chainweb-rocksdb.h rocksdb_delete_range" + rocksdb_delete_range + :: RocksDBPtr + -> WriteOptionsPtr + -> CString {- min key -} + -> CSize {- min key length -} + -> CString {- max key length -} + -> CSize {- max key length -} + -> Ptr CString {- output: errptr -} + -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_write" + c_rocksdb_write :: RocksDBPtr + -> WriteOptionsPtr + -> WriteBatchPtr + -> ErrPtr + -> IO () + +-- | Returns NULL if not found. A malloc()ed array otherwise. Stores the length +-- of the array in *vallen. +foreign import ccall safe "rocksdb\\c.h rocksdb_get" + c_rocksdb_get :: RocksDBPtr + -> ReadOptionsPtr + -> Key -> CSize + -> Ptr CSize -- ^ value length + -> ErrPtr + -> IO CString + +-- // if values_list[i] == NULL and errs[i] == NULL, +-- // then we got status.IsNotFound(), which we will not return. +-- // all errors except status status.ok() and status.IsNotFound() are returned. +-- // +-- // errs, values_list and values_list_sizes must be num_keys in length, +-- // allocated by the caller. +-- // errs is a list of strings as opposed to the conventional one error, +-- // where errs[i] is the status for retrieval of keys_list[i]. +-- // each non-NULL errs entry is a malloc()ed, null terminated string. +-- // each non-NULL values_list entry is a malloc()ed array, with +-- // the length for each stored in values_list_sizes[i]. +-- extern ROCKSDB_LIBRARY_API void rocksdb_multi_get( +-- rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys, +-- const char* const* keys_list, const size_t* keys_list_sizes, +-- char** values_list, size_t* values_list_sizes, char** errs); +-- +foreign import ccall unsafe "rocksdb\\c.h rocksdb_multi_get" + rocksdb_multi_get + :: RocksDBPtr + -> ReadOptionsPtr + -> CSize + -- ^ num_key + -> Ptr (Ptr CChar) + -- ^ keys_list + -> Ptr CSize + -- ^ keys_list_sizes + -> Ptr (Ptr CChar) + -- ^ values_list + -> Ptr CSize + -- ^ values_list_sizes + -> Ptr CString + -- ^ errs + -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_create_snapshot" + c_rocksdb_create_snapshot :: RocksDBPtr -> IO SnapshotPtr + +foreign import ccall safe "rocksdb\\c.h rocksdb_release_snapshot" + c_rocksdb_release_snapshot :: RocksDBPtr -> SnapshotPtr -> IO () + +-- | Returns NULL if property name is unknown. Else returns a pointer to a +-- malloc()-ed null-terminated value. +foreign import ccall safe "rocksdb\\c.h rocksdb_property_value" + c_rocksdb_property_value :: RocksDBPtr -> CString -> IO CString + +foreign import ccall safe "rocksdb\\c.h rocksdb_approximate_sizes" + c_rocksdb_approximate_sizes :: RocksDBPtr + -> CInt -- ^ num ranges + -> Ptr CString -> Ptr CSize -- ^ range start keys (array) + -> Ptr CString -> Ptr CSize -- ^ range limit keys (array) + -> Ptr Word64 -- ^ array of approx. 
sizes of ranges + -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_destroy_db" + c_rocksdb_destroy_db :: OptionsPtr -> DBName -> ErrPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_repair_db" + c_rocksdb_repair_db :: OptionsPtr -> DBName -> ErrPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_compact_range" + rocksdb_compact_range + :: RocksDBPtr + -> CString {- min key -} + -> CSize {- min key length -} + -> CString {- max key -} + -> CSize {- max key length -} + -> IO () + +-- +-- Iterator +-- + +foreign import ccall safe "rocksdb\\c.h rocksdb_create_iterator" + c_rocksdb_create_iterator :: RocksDBPtr -> ReadOptionsPtr -> IO IteratorPtr + +foreign import ccall safe "rocksdb\\c.h rocksdb_iter_destroy" + c_rocksdb_iter_destroy :: IteratorPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_iter_valid" + c_rocksdb_iter_valid :: IteratorPtr -> IO CUChar + +foreign import ccall safe "rocksdb\\c.h rocksdb_iter_seek_to_first" + c_rocksdb_iter_seek_to_first :: IteratorPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_iter_seek_to_last" + c_rocksdb_iter_seek_to_last :: IteratorPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_iter_seek" + c_rocksdb_iter_seek :: IteratorPtr -> Key -> CSize -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_iter_next" + c_rocksdb_iter_next :: IteratorPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_iter_prev" + c_rocksdb_iter_prev :: IteratorPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_iter_key" + c_rocksdb_iter_key :: IteratorPtr -> Ptr CSize -> IO Key + +foreign import ccall safe "rocksdb\\c.h rocksdb_iter_value" + c_rocksdb_iter_value :: IteratorPtr -> Ptr CSize -> IO Val + +foreign import ccall safe "rocksdb\\c.h rocksdb_iter_get_error" + c_rocksdb_iter_get_error :: IteratorPtr -> ErrPtr -> IO () + + +-- +-- Write batch +-- + +foreign import ccall safe "rocksdb\\c.h rocksdb_writebatch_create" + c_rocksdb_writebatch_create :: IO WriteBatchPtr + +foreign import ccall safe "rocksdb\\c.h rocksdb_writebatch_destroy" + c_rocksdb_writebatch_destroy :: WriteBatchPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_writebatch_clear" + c_rocksdb_writebatch_clear :: WriteBatchPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_writebatch_put" + c_rocksdb_writebatch_put :: WriteBatchPtr + -> Key -> CSize + -> Val -> CSize + -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_writebatch_delete" + c_rocksdb_writebatch_delete :: WriteBatchPtr -> Key -> CSize -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_writebatch_iterate" + c_rocksdb_writebatch_iterate :: WriteBatchPtr + -> Ptr () -- ^ state + -> FunPtr (Ptr () -> Key -> CSize -> Val -> CSize) -- ^ put + -> FunPtr (Ptr () -> Key -> CSize) -- ^ delete + -> IO () + + +-- +-- Options +-- + +foreign import ccall safe "rocksdb\\c.h rocksdb_options_create" + c_rocksdb_options_create :: IO OptionsPtr + +foreign import ccall safe "rocksdb\\c.h rocksdb_options_destroy" + c_rocksdb_options_destroy :: OptionsPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_options_set_comparator" + c_rocksdb_options_set_comparator :: OptionsPtr -> ComparatorPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_options_set_create_if_missing" + c_rocksdb_options_set_create_if_missing :: OptionsPtr -> CUChar -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_options_set_error_if_exists" + c_rocksdb_options_set_error_if_exists :: OptionsPtr -> CUChar -> IO () 
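+
+-- A hedged sketch of driving these raw setters directly; 'mkOpts' in
+-- Database.RocksDB.Internal performs the same sequence for the high-level
+-- 'Options' record. The flag values (1 = on, 0 = off) are illustrative:
+--
+-- > opts_ptr <- c_rocksdb_options_create
+-- > c_rocksdb_options_set_create_if_missing opts_ptr 1
+-- > c_rocksdb_options_set_error_if_exists opts_ptr 0
+-- > c_rocksdb_options_destroy opts_ptr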
+ +foreign import ccall safe "rocksdb\\c.h rocksdb_options_set_paranoid_checks" + c_rocksdb_options_set_paranoid_checks :: OptionsPtr -> CUChar -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_options_set_info_log" + c_rocksdb_options_set_info_log :: OptionsPtr -> LoggerPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_options_set_write_buffer_size" + c_rocksdb_options_set_write_buffer_size :: OptionsPtr -> CSize -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_options_set_max_open_files" + c_rocksdb_options_set_max_open_files :: OptionsPtr -> CInt -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_options_set_compression" + c_rocksdb_options_set_compression :: OptionsPtr -> CompressionOpt -> IO () + +foreign import ccall unsafe "rocksdb\\c.h rocksdb_options_set_prefix_extractor" + rocksdb_options_set_prefix_extractor :: OptionsPtr -> Ptr PrefixExtractor -> IO () + +foreign import ccall unsafe "cpp\\chainweb-rocksdb.h rocksdb_options_table_prefix_extractor" + rocksdb_options_table_prefix_extractor :: Ptr PrefixExtractor +-- +-- Comparator +-- + +type StatePtr = Ptr () +type Destructor = StatePtr -> () +type CompareFun = StatePtr -> CString -> CSize -> CString -> CSize -> IO CInt +type NameFun = StatePtr -> CString + +-- | Make a FunPtr to a user-defined comparator function +foreign import ccall "wrapper" mkCmp :: CompareFun -> IO (FunPtr CompareFun) + +-- | Make a destructor FunPtr +foreign import ccall "wrapper" mkDest :: Destructor -> IO (FunPtr Destructor) + +-- | Make a name FunPtr +foreign import ccall "wrapper" mkName :: NameFun -> IO (FunPtr NameFun) + +foreign import ccall safe "rocksdb\\c.h rocksdb_comparator_create" + c_rocksdb_comparator_create :: StatePtr + -> FunPtr Destructor + -> FunPtr CompareFun + -> FunPtr NameFun + -> IO ComparatorPtr + +foreign import ccall safe "rocksdb\\c.h rocksdb_comparator_destroy" + c_rocksdb_comparator_destroy :: ComparatorPtr -> IO () + + +-- +-- Filter Policy +-- + +type CreateFilterFun = StatePtr + -> Ptr CString -- ^ key array + -> Ptr CSize -- ^ key length array + -> CInt -- ^ num keys + -> Ptr CSize -- ^ filter length + -> IO CString -- ^ the filter +type KeyMayMatchFun = StatePtr + -> CString -- ^ key + -> CSize -- ^ key length + -> CString -- ^ filter + -> CSize -- ^ filter length + -> IO CUChar -- ^ whether key is in filter + +-- | Make a FunPtr to a user-defined create_filter function +foreign import ccall "wrapper" mkCF :: CreateFilterFun -> IO (FunPtr CreateFilterFun) + +-- | Make a FunPtr to a user-defined key_may_match function +foreign import ccall "wrapper" mkKMM :: KeyMayMatchFun -> IO (FunPtr KeyMayMatchFun) + +-- +-- Read options +-- + +foreign import ccall safe "rocksdb\\c.h rocksdb_readoptions_create" + c_rocksdb_readoptions_create :: IO ReadOptionsPtr + +foreign import ccall safe "rocksdb\\c.h rocksdb_readoptions_destroy" + c_rocksdb_readoptions_destroy :: ReadOptionsPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_readoptions_set_verify_checksums" + c_rocksdb_readoptions_set_verify_checksums :: ReadOptionsPtr -> CUChar -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_readoptions_set_fill_cache" + c_rocksdb_readoptions_set_fill_cache :: ReadOptionsPtr -> CUChar -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_readoptions_set_snapshot" + c_rocksdb_readoptions_set_snapshot :: ReadOptionsPtr -> SnapshotPtr -> IO () + +foreign import ccall unsafe "rocksdb\\c.h rocksdb_readoptions_set_iterate_upper_bound" + 
rocksdb_readoptions_set_iterate_upper_bound :: ReadOptionsPtr -> CString -> CSize -> IO () + +foreign import ccall unsafe "rocksdb\\c.h rocksdb_readoptions_set_iterate_lower_bound" + rocksdb_readoptions_set_iterate_lower_bound :: ReadOptionsPtr -> CString -> CSize -> IO () + +foreign import ccall unsafe "cpp\\chainweb-rocksdb.h rocksdb_readoptions_set_auto_prefix_mode" + rocksdb_readoptions_set_auto_prefix_mode :: ReadOptionsPtr -> CBool -> IO () + +-- +-- Write options +-- + +foreign import ccall safe "rocksdb\\c.h rocksdb_writeoptions_create" + c_rocksdb_writeoptions_create :: IO WriteOptionsPtr + +foreign import ccall safe "rocksdb\\c.h rocksdb_writeoptions_destroy" + c_rocksdb_writeoptions_destroy :: WriteOptionsPtr -> IO () + +foreign import ccall safe "rocksdb\\c.h rocksdb_writeoptions_set_sync" + c_rocksdb_writeoptions_set_sync :: WriteOptionsPtr -> CUChar -> IO () + + +-- +-- Cache +-- + +foreign import ccall safe "rocksdb\\c.h rocksdb_cache_create_lru" + c_rocksdb_cache_create_lru :: CSize -> IO CachePtr + +foreign import ccall safe "rocksdb\\c.h rocksdb_cache_destroy" + c_rocksdb_cache_destroy :: CachePtr -> IO () + +---------------------------------------------------------------------------- +-- Free +---------------------------------------------------------------------------- + +foreign import ccall safe "rocksdb\\c.h rocksdb_free" + c_rocksdb_free :: CString -> IO () + +---------------------------------------------------------------------------- +-- Checkpoints +---------------------------------------------------------------------------- + +foreign import ccall unsafe "rocksdb\\c.h rocksdb_checkpoint_object_create" + rocksdb_checkpoint_object_create :: RocksDBPtr -> Ptr CString -> IO (Ptr Checkpoint) + +foreign import ccall unsafe "rocksdb\\c.h rocksdb_checkpoint_create" + rocksdb_checkpoint_create :: Ptr Checkpoint -> CString -> CULong -> Ptr CString -> IO () + +foreign import ccall unsafe "rocksdb\\c.h rocksdb_checkpoint_object_destroy" + rocksdb_checkpoint_object_destroy :: Ptr Checkpoint -> IO () diff --git a/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Internal.hs b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Internal.hs new file mode 100644 index 0000000000..4cbc118b4b --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Internal.hs @@ -0,0 +1,217 @@ +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE OverloadedStrings #-} + +-- | +-- Module : Database.RocksDB.Internal +-- Copyright : (c) 2012-2013 The leveldb-haskell Authors +-- (c) 2014 The rocksdb-haskell Authors +-- License : BSD3 +-- Maintainer : mail@agrafix.net +-- Stability : experimental +-- Portability : non-portable +-- + +module Database.RocksDB.Internal + ( -- * Types + DB (..) + , Comparator' + , Options' (..) 
+ + -- * "Smart" constructors and deconstructors + , freeCReadOpts + , freeComparator + , freeOpts + , freeCString + , mkComparator + , mkCompareFun + , mkCreateFilterFun + , mkKeyMayMatchFun + , mkOpts + + -- * combinators + , withCWriteOpts + + -- * Utilities + , throwIfErr + , cSizeToInt + , intToCSize + , intToCInt + , cIntToInt + , boolToNum + ) +where + +import Control.Exception +import Control.Monad (when) +import Data.ByteString (ByteString) +import Foreign +import Foreign.C.String (CString, peekCString, withCString) +import Foreign.C.Types (CInt, CSize) + +import Database.RocksDB.C +import Database.RocksDB.Types + +import qualified Data.ByteString as BS + + +-- | Database handle +newtype DB = DB RocksDBPtr + deriving Eq + +-- | Internal representation of a 'Comparator' +data Comparator' = Comparator' (FunPtr CompareFun) + (FunPtr Destructor) + (FunPtr NameFun) + ComparatorPtr + +-- | Internal representation of the 'Options' +data Options' = Options' + { _optsPtr :: !OptionsPtr + , _cachePtr :: !(Maybe CachePtr) + , _comp :: !(Maybe Comparator') + } + +mkOpts :: Options -> IO Options' +mkOpts Options{..} = do + opts_ptr <- c_rocksdb_options_create + + c_rocksdb_options_set_compression opts_ptr + $ ccompression compression + c_rocksdb_options_set_create_if_missing opts_ptr + $ boolToNum createIfMissing + c_rocksdb_options_set_error_if_exists opts_ptr + $ boolToNum errorIfExists + c_rocksdb_options_set_max_open_files opts_ptr + $ intToCInt maxOpenFiles + c_rocksdb_options_set_paranoid_checks opts_ptr + $ boolToNum paranoidChecks + c_rocksdb_options_set_write_buffer_size opts_ptr + $ intToCSize writeBufferSize + + cmp <- maybeSetCmp opts_ptr comparator + + return (Options' opts_ptr Nothing cmp) + + where + ccompression NoCompression = + noCompression + ccompression SnappyCompression = + snappyCompression + ccompression ZlibCompression = + zlibCompression + + maybeSetCmp :: OptionsPtr -> Maybe Comparator -> IO (Maybe Comparator') + maybeSetCmp opts_ptr (Just mcmp) = Just <$> setcmp opts_ptr mcmp + maybeSetCmp _ Nothing = return Nothing + + setcmp :: OptionsPtr -> Comparator -> IO Comparator' + setcmp opts_ptr (Comparator cmp) = do + cmp'@(Comparator' _ _ _ cmp_ptr) <- mkComparator "user-defined" cmp + c_rocksdb_options_set_comparator opts_ptr cmp_ptr + return cmp' + +freeOpts :: Options' -> IO () +freeOpts (Options' opts_ptr mcache_ptr mcmp_ptr) = + c_rocksdb_options_destroy opts_ptr `finally` + maybe (return ()) c_rocksdb_cache_destroy mcache_ptr `finally` + maybe (return ()) freeComparator mcmp_ptr + +withCWriteOpts :: WriteOptions -> (WriteOptionsPtr -> IO a) -> IO a +withCWriteOpts WriteOptions{..} = bracket mkCWriteOpts freeCWriteOpts + where + mkCWriteOpts = do + opts_ptr <- c_rocksdb_writeoptions_create + onException + (c_rocksdb_writeoptions_set_sync opts_ptr $ boolToNum sync) + (c_rocksdb_writeoptions_destroy opts_ptr) + return opts_ptr + + freeCWriteOpts = c_rocksdb_writeoptions_destroy + +mkCompareFun :: (ByteString -> ByteString -> Ordering) -> CompareFun +mkCompareFun cmp = cmp' + where + cmp' _ a alen b blen = do + a' <- BS.packCStringLen (a, fromInteger . toInteger $ alen) + b' <- BS.packCStringLen (b, fromInteger . toInteger $ blen) + return $ case cmp a' b' of + EQ -> 0 + GT -> 1 + LT -> -1 + +mkComparator :: String -> (ByteString -> ByteString -> Ordering) -> IO Comparator' +mkComparator name f = + withCString name $ \cs -> do + ccmpfun <- mkCmp . 
mkCompareFun $ f + cdest <- mkDest $ const () + cname <- mkName $ const cs + ccmp <- c_rocksdb_comparator_create nullPtr cdest ccmpfun cname + return $ Comparator' ccmpfun cdest cname ccmp + + +freeComparator :: Comparator' -> IO () +freeComparator (Comparator' ccmpfun cdest cname ccmp) = do + c_rocksdb_comparator_destroy ccmp + freeHaskellFunPtr ccmpfun + freeHaskellFunPtr cdest + freeHaskellFunPtr cname + +mkCreateFilterFun :: ([ByteString] -> ByteString) -> CreateFilterFun +mkCreateFilterFun f = f' + where + f' _ ks ks_lens n_ks flen = do + let n_ks' = fromInteger . toInteger $ n_ks + ks' <- peekArray n_ks' ks + ks_lens' <- peekArray n_ks' ks_lens + keys <- mapM bstr (zip ks' ks_lens') + let res = f keys + poke flen (fromIntegral . BS.length $ res) + BS.useAsCString res $ \cstr -> return cstr + + bstr (x,len) = BS.packCStringLen (x, fromInteger . toInteger $ len) + +mkKeyMayMatchFun :: (ByteString -> ByteString -> Bool) -> KeyMayMatchFun +mkKeyMayMatchFun g = g' + where + g' _ k klen f flen = do + k' <- BS.packCStringLen (k, fromInteger . toInteger $ klen) + f' <- BS.packCStringLen (f, fromInteger . toInteger $ flen) + return . boolToNum $ g k' f' + + +freeCReadOpts :: ReadOptionsPtr -> IO () +freeCReadOpts = c_rocksdb_readoptions_destroy + +freeCString :: CString -> IO () +freeCString = c_rocksdb_free + +throwIfErr :: String -> (ErrPtr -> IO a) -> IO a +throwIfErr s f = alloca $ \err_ptr -> do + poke err_ptr nullPtr + res <- f err_ptr + erra <- peek err_ptr + when (erra /= nullPtr) $ do + err <- peekCString erra + throwIO $ userError $ s ++ ": " ++ err + return res + +cSizeToInt :: CSize -> Int +cSizeToInt = fromIntegral +{-# INLINE cSizeToInt #-} + +intToCSize :: Int -> CSize +intToCSize = fromIntegral +{-# INLINE intToCSize #-} + +intToCInt :: Int -> CInt +intToCInt = fromIntegral +{-# INLINE intToCInt #-} + +cIntToInt :: CInt -> Int +cIntToInt = fromIntegral +{-# INLINE cIntToInt #-} + +boolToNum :: Num b => Bool -> b +boolToNum True = fromIntegral (1 :: Int) +boolToNum False = fromIntegral (0 :: Int) +{-# INLINE boolToNum #-} diff --git a/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Iterator.hs b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Iterator.hs new file mode 100644 index 0000000000..e5f5de024f --- /dev/null +++ b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Iterator.hs @@ -0,0 +1,198 @@ +-- | This code is mostly from serokell/rocksdb-haskell's Iterator module. +-- The main difference is the way that options are handled. The original code +-- didn't let me alter any options that didn't have bindings. It also had the +-- ability to leak the underlying `ReadOptions` pointer. 
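+--
+-- A hedged usage sketch (assumes @db@ was opened with
+-- 'Database.RocksDB.Base.open' and 'withCReadOpts' is imported from
+-- Database.RocksDB.ReadOptions):
+--
+-- > withCReadOpts $ \opts_ptr ->
+-- >     withIter db opts_ptr $ \it -> do
+-- >         iterFirst it
+-- >         iterItems it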
+{-# language ScopedTypeVariables #-} +{-# language RankNTypes #-} + +module Database.RocksDB.Iterator + ( Iterator + , createIter + , withIter + , iterValid + , iterSeek + , iterFirst + , iterLast + , iterNext + , iterPrev + , iterKey + , iterValue + , iterEntry + , iterGetError + , mapIter + , iterItems + , iterKeys + , iterValues + ) +where + +import Control.Exception (bracket) +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (liftIO)) +import Data.ByteString (ByteString) +import Data.Maybe (catMaybes) +import Foreign +import Foreign.C.Error (throwErrnoIfNull) +import Foreign.C.String +import Foreign.C.Types + +import Database.RocksDB.C +import Database.RocksDB.Internal + +import qualified Data.ByteString as BS +import qualified Data.ByteString.Char8 as BC +import qualified Data.ByteString.Unsafe as BU + +newtype Iterator = Iterator IteratorPtr + +createIter :: DB -> ReadOptionsPtr -> IO Iterator +createIter (DB db_ptr) opts_ptr = fmap Iterator $ + throwErrnoIfNull "create_iterator" $ c_rocksdb_create_iterator db_ptr opts_ptr + +-- | Run an action with an 'IteratorPtr' +withIter :: DB -> ReadOptionsPtr -> (Iterator -> IO a) -> IO a +withIter db opts_ptr = + bracket (createIter db opts_ptr) (\(Iterator p) -> c_rocksdb_iter_destroy p) + +-- | An iterator is either positioned at a key/value pair, or not valid. This +-- function returns /true/ iff the iterator is valid. +iterValid :: Iterator -> IO Bool +iterValid (Iterator iter_ptr) = liftIO $ do + x <- c_rocksdb_iter_valid iter_ptr + return (x /= 0) + +-- | Position at the first key in the source that is at or past target. The +-- iterator is /valid/ after this call iff the source contains an entry that +-- comes at or past target. +iterSeek :: Iterator -> ByteString -> IO () +iterSeek (Iterator iter_ptr) key = liftIO $ + BU.unsafeUseAsCStringLen key $ \(key_ptr, klen) -> + c_rocksdb_iter_seek iter_ptr key_ptr (intToCSize klen) + +-- | Position at the first key in the source. The iterator is /valid/ after this +-- call iff the source is not empty. +iterFirst :: Iterator -> IO () +iterFirst (Iterator iter_ptr) = liftIO $ c_rocksdb_iter_seek_to_first iter_ptr + +-- | Position at the last key in the source. The iterator is /valid/ after this +-- call iff the source is not empty. +iterLast :: Iterator -> IO () +iterLast (Iterator iter_ptr) = liftIO $ c_rocksdb_iter_seek_to_last iter_ptr + +-- | Moves to the next entry in the source. After this call, 'iterValid' is +-- /true/ iff the iterator was not positioned at the last entry in the source. +-- +-- If the iterator is not valid, this function does nothing. Note that this is a +-- shortcoming of the C API: an 'iterPrev' might still be possible, but we can't +-- determine if we're at the last or first entry. +iterNext :: Iterator -> IO () +iterNext (Iterator iter_ptr) = liftIO $ do + valid <- c_rocksdb_iter_valid iter_ptr + when (valid /= 0) $ c_rocksdb_iter_next iter_ptr + +-- | Moves to the previous entry in the source. After this call, 'iterValid' is +-- /true/ iff the iterator was not positioned at the first entry in the source. +-- +-- If the iterator is not valid, this function does nothing. Note that this is a +-- shortcoming of the C API: an 'iterNext' might still be possible, but we can't +-- determine if we're at the last or first entry. 
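+--
+-- A small hedged sketch (@it@ obtained via 'withIter', source non-empty):
+--
+-- > iterLast it
+-- > iterPrev it
+-- > mkey <- iterKey it -- the second-to-last key, if there is one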
+iterPrev :: Iterator -> IO () +iterPrev (Iterator iter_ptr) = liftIO $ do + valid <- c_rocksdb_iter_valid iter_ptr + when (valid /= 0) $ c_rocksdb_iter_prev iter_ptr + +-- | Return the key for the current entry if the iterator is currently +-- positioned at an entry, ie. 'iterValid'. +iterKey :: Iterator -> IO (Maybe ByteString) +iterKey (Iterator iter_ptr) = liftIO $ + iterString iter_ptr c_rocksdb_iter_key + +-- | Return the value for the current entry if the iterator is currently +-- positioned at an entry, ie. 'iterValid'. +iterValue :: Iterator -> IO (Maybe ByteString) +iterValue (Iterator iter_ptr) = liftIO $ + iterString iter_ptr c_rocksdb_iter_value + +-- | Return the current entry as a pair, if the iterator is currently positioned +-- at an entry, ie. 'iterValid'. +iterEntry :: Iterator -> IO (Maybe (ByteString, ByteString)) +iterEntry iter = liftIO $ do + mkey <- iterKey iter + mval <- iterValue iter + return $ (,) <$> mkey <*> mval + +-- | Check for errors +-- +-- Note that this captures somewhat severe errors such as a corrupted database. +iterGetError :: Iterator -> IO (Maybe ByteString) +iterGetError (Iterator iter_ptr) = liftIO $ + alloca $ \err_ptr -> do + poke err_ptr nullPtr + c_rocksdb_iter_get_error iter_ptr err_ptr + erra <- peek err_ptr + if erra == nullPtr + then return Nothing + else do + err <- peekCString erra + return . Just . BC.pack $ err + +-- | Map a function over an iterator, advancing the iterator forward and +-- returning the value. The iterator should be put in the right position prior +-- to calling the function. +-- +-- Note that this function accumulates the result strictly, ie. it reads all +-- values into memory until the iterator is exhausted. This is most likely not +-- what you want for large ranges. You may consider using conduits instead, for +-- an example see: +mapIter :: (Iterator -> IO a) -> Iterator -> IO [a] +mapIter f iter@(Iterator iter_ptr) = go [] + where + go acc = do + valid <- liftIO $ c_rocksdb_iter_valid iter_ptr + if valid == 0 + then return acc + else do + val <- f iter + () <- liftIO $ c_rocksdb_iter_next iter_ptr + go (val : acc) + +-- | Return a list of key and value tuples from an iterator. The iterator +-- should be put in the right position prior to calling this with the iterator. +-- +-- See strictness remarks on 'mapIter'. +iterItems :: Iterator -> IO [(ByteString, ByteString)] +iterItems iter = catMaybes <$> mapIter iterEntry iter + +-- | Return a list of key from an iterator. The iterator should be put +-- in the right position prior to calling this with the iterator. +-- +-- See strictness remarks on 'mapIter' +iterKeys :: Iterator -> IO [ByteString] +iterKeys iter = catMaybes <$> mapIter iterKey iter + +-- | Return a list of values from an iterator. The iterator should be put +-- in the right position prior to calling this with the iterator. 
+--
+-- See strictness remarks on 'mapIter'.
+iterValues :: Iterator -> IO [ByteString]
+iterValues iter = catMaybes <$> mapIter iterValue iter
+
+--
+-- Internal
+--
+
+iterString :: IteratorPtr
+           -> (IteratorPtr -> Ptr CSize -> IO CString)
+           -> IO (Maybe ByteString)
+iterString iter_ptr f = do
+    valid <- c_rocksdb_iter_valid iter_ptr
+    if valid == 0
+        then return Nothing
+        else
+            alloca $ \len_ptr -> do
+                ptr <- f iter_ptr len_ptr
+                if ptr == nullPtr
+                    then return Nothing
+                    else do
+                        len <- peek len_ptr
+                        Just <$> BS.packCStringLen (ptr, cSizeToInt len)
diff --git a/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/ReadOptions.hs b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/ReadOptions.hs
new file mode 100644
index 0000000000..f1aa12da1f
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/ReadOptions.hs
@@ -0,0 +1,93 @@
+{-# language RankNTypes #-}
+
+module Database.RocksDB.ReadOptions
+    ( ReadOptions(..)
+    , withReadOptions
+    , withCReadOpts
+    , setVerifyChecksums
+    , setFillCache
+    , setUseSnapshot
+    , setUpperBound
+    , setLowerBound
+    , setAutoPrefixMode
+    )
+    where
+
+import Control.Exception
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Unsafe as BU
+import Foreign.Ptr
+
+import Database.RocksDB.C
+import Database.RocksDB.Types
+
+newtype ReadOptions = ReadOptions
+    { runReadOptions :: forall r. ReadOptionsPtr -> IO r -> IO r
+    }
+instance Monoid ReadOptions where
+    mempty = ReadOptions (\_ k -> k)
+instance Semigroup ReadOptions where
+    ReadOptions r <> ReadOptions r' =
+        ReadOptions $ \ptr k -> r ptr (r' ptr k)
+
+withReadOptions :: ReadOptions -> (ReadOptionsPtr -> IO a) -> IO a
+withReadOptions opts k = withCReadOpts $ \opts_ptr -> do
+    runReadOptions opts opts_ptr (k opts_ptr)
+
+withCReadOpts :: (ReadOptionsPtr -> IO a) -> IO a
+withCReadOpts = bracket c_rocksdb_readoptions_create c_rocksdb_readoptions_destroy
+
+-- | If true, all data read from underlying storage will be verified
+-- against corresponding checksums.
+--
+-- Default: True
setVerifyChecksums :: Bool -> ReadOptions
+setVerifyChecksums b = ReadOptions $ \opts_ptr k -> do
+    c_rocksdb_readoptions_set_verify_checksums opts_ptr (boolToNum b)
+    k
+
+-- | Should the data read for this iteration be cached in memory? Callers
+-- may wish to set this field to false for bulk scans.
+--
+-- Default: True
+setFillCache :: Bool -> ReadOptions
+setFillCache b = ReadOptions $ \opts_ptr k -> do
+    c_rocksdb_readoptions_set_fill_cache opts_ptr (boolToNum b)
+    k
+
+-- | If 'Just', read as of the supplied snapshot (which must belong to the
+-- DB that is being read and which must not have been released). If
+-- 'Nothing', use an implicit snapshot of the state at the beginning of
+-- this read operation.
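+--
+-- A hedged sketch ('createSnapshot', 'releaseSnapshot' and 'get' come from
+-- Database.RocksDB.Base; @someKey@ is illustrative):
+--
+-- > snap <- createSnapshot db
+-- > v <- get db (setUseSnapshot (Just snap)) someKey
+-- > releaseSnapshot db snap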
+--
+-- Default: Nothing
+setUseSnapshot :: Maybe Snapshot -> ReadOptions
+setUseSnapshot snapshot = ReadOptions $ \opts_ptr k -> do
+    c_rocksdb_readoptions_set_snapshot opts_ptr snap_ptr
+    k
+  where
+    snap_ptr = case snapshot of
+        Just (Snapshot p) -> p
+        Nothing -> nullPtr
+
+setUpperBound :: ByteString -> ReadOptions
+setUpperBound upper = ReadOptions $ \opts_ptr k ->
+    BU.unsafeUseAsCStringLen upper $ \(upperPtr, upperLen) -> do
+        rocksdb_readoptions_set_iterate_upper_bound opts_ptr upperPtr (fromIntegral upperLen)
+        k
+
+setLowerBound :: ByteString -> ReadOptions
+setLowerBound lower = ReadOptions $ \opts_ptr k ->
+    BU.unsafeUseAsCStringLen lower $ \(lowerPtr, lowerLen) -> do
+        rocksdb_readoptions_set_iterate_lower_bound opts_ptr lowerPtr (fromIntegral lowerLen)
+        k
+
+setAutoPrefixMode :: Bool -> ReadOptions
+setAutoPrefixMode m = ReadOptions $ \opts_ptr k -> do
+    rocksdb_readoptions_set_auto_prefix_mode opts_ptr (boolToNum m)
+    k
+
+boolToNum :: Num b => Bool -> b
+boolToNum True = fromIntegral (1 :: Int)
+boolToNum False = fromIntegral (0 :: Int)
+{-# INLINE boolToNum #-}
diff --git a/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Types.hs b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Types.hs
new file mode 100644
index 0000000000..b76ed60467
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/src/Database/RocksDB/Types.hs
@@ -0,0 +1,147 @@
+-- |
+-- Module      : Database.RocksDB.Types
+-- Copyright   : (c) 2012-2013 The leveldb-haskell Authors
+--               (c) 2014 The rocksdb-haskell Authors
+-- License     : BSD3
+-- Maintainer  : mail@agrafix.net
+-- Stability   : experimental
+-- Portability : non-portable
+--
+
+module Database.RocksDB.Types
+  ( BatchOp (..)
+  , Comparator (..)
+  , Compression (..)
+  , Options (..)
+  , Property (..)
+  , Snapshot (..)
+  , WriteBatch
+  , WriteOptions (..)
+
+  , defaultOptions
+  , defaultWriteOptions
+  )
+where
+
+import Data.ByteString (ByteString)
+import Data.Default
+import Foreign
+
+import Database.RocksDB.C
+
+-- | Snapshot handle
+newtype Snapshot = Snapshot SnapshotPtr deriving (Eq)
+
+-- | Compression setting
+data Compression
+    = NoCompression
+    | SnappyCompression
+    | ZlibCompression
+    deriving (Eq, Show)
+
+-- | User-defined comparator
+newtype Comparator = Comparator (ByteString -> ByteString -> Ordering)
+
+-- | Options when opening a database
+data Options = Options
+    { comparator :: !(Maybe Comparator)
+      -- ^ Comparator used to define the order of keys in the table.
+      --
+      -- If 'Nothing', the default comparator is used, which uses lexicographic
+      -- bytes-wise ordering.
+      --
+      -- NOTE: the client must ensure that the comparator supplied here has the
+      -- same name and orders keys /exactly/ the same as the comparator provided
+      -- to previous open calls on the same DB.
+      --
+      -- Default: Nothing
+    , compression :: !Compression
+      -- ^ Compress blocks using the specified compression algorithm.
+      --
+      -- This parameter can be changed dynamically.
+      --
+      -- Default: 'SnappyCompression'
+    , createIfMissing :: !Bool
+      -- ^ If true, the database will be created if it is missing.
+      --
+      -- Default: False
+    , errorIfExists :: !Bool
+      -- ^ If true, an error is raised if the database already exists.
+      --
+      -- Default: False
+    , maxOpenFiles :: !Int
+      -- ^ Number of open files that can be used by the DB.
+      --
+      -- You may need to increase this if your database has a large working set
+      -- (budget one open file per 2MB of working set).
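+      -- For example, a working set of about 2 GB corresponds to a budget of
+      -- roughly 1000 open files, which matches the default below.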
+      --
+      -- Default: 1000
+    , paranoidChecks :: !Bool
+      -- ^ If true, the implementation will do aggressive checking of the data
+      -- it is processing and will stop early if it detects any errors.
+      --
+      -- This may have unforeseen ramifications: for example, a corruption of
+      -- one DB entry may cause a large number of entries to become unreadable
+      -- or for the entire DB to become unopenable.
+      --
+      -- Default: False
+    , writeBufferSize :: !Int
+      -- ^ Amount of data to build up in memory (backed by an unsorted log on
+      -- disk) before converting to a sorted on-disk file.
+      --
+      -- Larger values increase performance, especially during bulk loads. Up
+      -- to two write buffers may be held in memory at the same time, so you
+      -- may wish to adjust this parameter to control memory usage. Also, a
+      -- larger write buffer will result in a longer recovery time the next
+      -- time the database is opened.
+      --
+      -- Default: 4MB
+    }
+
+defaultOptions :: Options
+defaultOptions = Options
+    { comparator = Nothing
+    , compression = SnappyCompression
+    , createIfMissing = False
+    , errorIfExists = False
+    , maxOpenFiles = 1000
+    , paranoidChecks = False
+    , writeBufferSize = 4 `shift` 20
+    }
+
+instance Default Options where
+    def = defaultOptions
+
+-- | Options for write operations
+data WriteOptions = WriteOptions
+    { sync :: !Bool
+      -- ^ If true, the write will be flushed from the operating system buffer
+      -- cache (by calling WritableFile::Sync()) before the write is considered
+      -- complete. If this flag is true, writes will be slower.
+      --
+      -- If this flag is false, and the machine crashes, some recent writes may
+      -- be lost. Note that if it is just the process that crashes (i.e., the
+      -- machine does not reboot), no writes will be lost even if sync==false.
+      --
+      -- In other words, a DB write with sync==false has similar crash semantics
+      -- to the "write()" system call. A DB write with sync==true has similar
+      -- crash semantics to a "write()" system call followed by "fsync()".
+      --
+      -- Default: False
+    } deriving (Eq, Show)
+
+defaultWriteOptions :: WriteOptions
+defaultWriteOptions = WriteOptions { sync = False }
+
+instance Default WriteOptions where
+    def = defaultWriteOptions
+
+type WriteBatch = [BatchOp]
+
+-- | Batch operation
+data BatchOp = Put ByteString ByteString | Del ByteString
+    deriving (Eq, Show)
+
+-- | Properties exposed by RocksDB
+data Property = NumFilesAtLevel Int | Stats | SSTables
+    deriving (Eq, Show)
diff --git a/vendored/rocksdb-haskell-kadena/unpack-rocksdb.sh b/vendored/rocksdb-haskell-kadena/unpack-rocksdb.sh
new file mode 100755
index 0000000000..75f6c09fd3
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/unpack-rocksdb.sh
@@ -0,0 +1,354 @@
+#!/usr/bin/env bash
+
+set -e
+
+ROCKSDB_VERSION=8.3.2
+
+ROCKSDB_ARCHIVE=v${ROCKSDB_VERSION}.tar.gz
+ROCKSDB_BASE=rocksdb-${ROCKSDB_VERSION}
+
+if [[ ! -f ${ROCKSDB_ARCHIVE} ]] ; then
+    wget https://github.com/facebook/rocksdb/archive/refs/tags/v${ROCKSDB_VERSION}.tar.gz
+fi
+
+# This list is based on LIB_OBJECTS from the Makefile. It must be regenerated
+# whenever the rocksdb version changes.
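+#
+# A sketch of one way to dump that variable from the rocksdb Makefile (assumes
+# GNU make; verify against the Makefile of the target rocksdb release):
+#
+#   make -C "${ROCKSDB_BASE}" --eval='print-lib-objects: ; @echo $(LIB_OBJECTS)' print-lib-objects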
+# +tar -xzf "$ROCKSDB_ARCHIVE" \ + ${ROCKSDB_BASE}/include \ + ${ROCKSDB_BASE}/cache/cache.cc \ + ${ROCKSDB_BASE}/cache/cache_entry_roles.cc \ + ${ROCKSDB_BASE}/cache/cache_key.cc \ + ${ROCKSDB_BASE}/cache/cache_helpers.cc \ + ${ROCKSDB_BASE}/cache/cache_reservation_manager.cc \ + ${ROCKSDB_BASE}/cache/charged_cache.cc \ + ${ROCKSDB_BASE}/cache/clock_cache.cc \ + ${ROCKSDB_BASE}/cache/lru_cache.cc \ + ${ROCKSDB_BASE}/cache/compressed_secondary_cache.cc \ + ${ROCKSDB_BASE}/cache/secondary_cache.cc \ + ${ROCKSDB_BASE}/cache/secondary_cache_adapter.cc \ + ${ROCKSDB_BASE}/cache/sharded_cache.cc \ + ${ROCKSDB_BASE}/db/arena_wrapped_db_iter.cc \ + ${ROCKSDB_BASE}/db/blob/blob_contents.cc \ + ${ROCKSDB_BASE}/db/blob/blob_fetcher.cc \ + ${ROCKSDB_BASE}/db/blob/blob_file_addition.cc \ + ${ROCKSDB_BASE}/db/blob/blob_file_builder.cc \ + ${ROCKSDB_BASE}/db/blob/blob_file_cache.cc \ + ${ROCKSDB_BASE}/db/blob/blob_file_garbage.cc \ + ${ROCKSDB_BASE}/db/blob/blob_file_meta.cc \ + ${ROCKSDB_BASE}/db/blob/blob_file_reader.cc \ + ${ROCKSDB_BASE}/db/blob/blob_garbage_meter.cc \ + ${ROCKSDB_BASE}/db/blob/blob_log_format.cc \ + ${ROCKSDB_BASE}/db/blob/blob_log_sequential_reader.cc \ + ${ROCKSDB_BASE}/db/blob/blob_log_writer.cc \ + ${ROCKSDB_BASE}/db/blob/blob_source.cc \ + ${ROCKSDB_BASE}/db/blob/prefetch_buffer_collection.cc \ + ${ROCKSDB_BASE}/db/builder.cc \ + ${ROCKSDB_BASE}/db/c.cc \ + ${ROCKSDB_BASE}/db/column_family.cc \ + ${ROCKSDB_BASE}/db/compaction/compaction.cc \ + ${ROCKSDB_BASE}/db/compaction/compaction_iterator.cc \ + ${ROCKSDB_BASE}/db/compaction/compaction_job.cc \ + ${ROCKSDB_BASE}/db/compaction/compaction_picker.cc \ + ${ROCKSDB_BASE}/db/compaction/compaction_picker_fifo.cc \ + ${ROCKSDB_BASE}/db/compaction/compaction_picker_level.cc \ + ${ROCKSDB_BASE}/db/compaction/compaction_picker_universal.cc \ + ${ROCKSDB_BASE}/db/compaction/compaction_service_job.cc \ + ${ROCKSDB_BASE}/db/compaction/compaction_state.cc \ + ${ROCKSDB_BASE}/db/compaction/compaction_outputs.cc \ + ${ROCKSDB_BASE}/db/compaction/sst_partitioner.cc \ + ${ROCKSDB_BASE}/db/compaction/subcompaction_state.cc \ + ${ROCKSDB_BASE}/db/convenience.cc \ + ${ROCKSDB_BASE}/db/db_filesnapshot.cc \ + ${ROCKSDB_BASE}/db/db_impl/compacted_db_impl.cc \ + ${ROCKSDB_BASE}/db/db_impl/db_impl.cc \ + ${ROCKSDB_BASE}/db/db_impl/db_impl_compaction_flush.cc \ + ${ROCKSDB_BASE}/db/db_impl/db_impl_debug.cc \ + ${ROCKSDB_BASE}/db/db_impl/db_impl_experimental.cc \ + ${ROCKSDB_BASE}/db/db_impl/db_impl_files.cc \ + ${ROCKSDB_BASE}/db/db_impl/db_impl_open.cc \ + ${ROCKSDB_BASE}/db/db_impl/db_impl_readonly.cc \ + ${ROCKSDB_BASE}/db/db_impl/db_impl_secondary.cc \ + ${ROCKSDB_BASE}/db/db_impl/db_impl_write.cc \ + ${ROCKSDB_BASE}/db/db_info_dumper.cc \ + ${ROCKSDB_BASE}/db/db_iter.cc \ + ${ROCKSDB_BASE}/db/dbformat.cc \ + ${ROCKSDB_BASE}/db/error_handler.cc \ + ${ROCKSDB_BASE}/db/event_helpers.cc \ + ${ROCKSDB_BASE}/db/experimental.cc \ + ${ROCKSDB_BASE}/db/external_sst_file_ingestion_job.cc \ + ${ROCKSDB_BASE}/db/file_indexer.cc \ + ${ROCKSDB_BASE}/db/flush_job.cc \ + ${ROCKSDB_BASE}/db/flush_scheduler.cc \ + ${ROCKSDB_BASE}/db/forward_iterator.cc \ + ${ROCKSDB_BASE}/db/import_column_family_job.cc \ + ${ROCKSDB_BASE}/db/internal_stats.cc \ + ${ROCKSDB_BASE}/db/logs_with_prep_tracker.cc \ + ${ROCKSDB_BASE}/db/log_reader.cc \ + ${ROCKSDB_BASE}/db/log_writer.cc \ + ${ROCKSDB_BASE}/db/malloc_stats.cc \ + ${ROCKSDB_BASE}/db/memtable.cc \ + ${ROCKSDB_BASE}/db/memtable_list.cc \ + ${ROCKSDB_BASE}/db/merge_helper.cc \ + ${ROCKSDB_BASE}/db/merge_operator.cc \ + 
${ROCKSDB_BASE}/db/output_validator.cc \ + ${ROCKSDB_BASE}/db/periodic_task_scheduler.cc \ + ${ROCKSDB_BASE}/db/range_del_aggregator.cc \ + ${ROCKSDB_BASE}/db/range_tombstone_fragmenter.cc \ + ${ROCKSDB_BASE}/db/repair.cc \ + ${ROCKSDB_BASE}/db/seqno_to_time_mapping.cc \ + ${ROCKSDB_BASE}/db/snapshot_impl.cc \ + ${ROCKSDB_BASE}/db/table_cache.cc \ + ${ROCKSDB_BASE}/db/table_properties_collector.cc \ + ${ROCKSDB_BASE}/db/transaction_log_impl.cc \ + ${ROCKSDB_BASE}/db/trim_history_scheduler.cc \ + ${ROCKSDB_BASE}/db/version_builder.cc \ + ${ROCKSDB_BASE}/db/version_edit.cc \ + ${ROCKSDB_BASE}/db/version_edit_handler.cc \ + ${ROCKSDB_BASE}/db/version_set.cc \ + ${ROCKSDB_BASE}/db/wal_edit.cc \ + ${ROCKSDB_BASE}/db/wal_manager.cc \ + ${ROCKSDB_BASE}/db/wide/wide_column_serialization.cc \ + ${ROCKSDB_BASE}/db/wide/wide_columns.cc \ + ${ROCKSDB_BASE}/db/write_batch.cc \ + ${ROCKSDB_BASE}/db/write_batch_base.cc \ + ${ROCKSDB_BASE}/db/write_controller.cc \ + ${ROCKSDB_BASE}/db/write_stall_stats.cc \ + ${ROCKSDB_BASE}/db/write_thread.cc \ + ${ROCKSDB_BASE}/env/composite_env.cc \ + ${ROCKSDB_BASE}/env/env.cc \ + ${ROCKSDB_BASE}/env/env_chroot.cc \ + ${ROCKSDB_BASE}/env/env_encryption.cc \ + ${ROCKSDB_BASE}/env/env_posix.cc \ + ${ROCKSDB_BASE}/env/file_system.cc \ + ${ROCKSDB_BASE}/env/fs_posix.cc \ + ${ROCKSDB_BASE}/env/fs_remap.cc \ + ${ROCKSDB_BASE}/env/file_system_tracer.cc \ + ${ROCKSDB_BASE}/env/io_posix.cc \ + ${ROCKSDB_BASE}/env/mock_env.cc \ + ${ROCKSDB_BASE}/env/unique_id_gen.cc \ + ${ROCKSDB_BASE}/file/delete_scheduler.cc \ + ${ROCKSDB_BASE}/file/file_prefetch_buffer.cc \ + ${ROCKSDB_BASE}/file/file_util.cc \ + ${ROCKSDB_BASE}/file/filename.cc \ + ${ROCKSDB_BASE}/file/line_file_reader.cc \ + ${ROCKSDB_BASE}/file/random_access_file_reader.cc \ + ${ROCKSDB_BASE}/file/read_write_util.cc \ + ${ROCKSDB_BASE}/file/readahead_raf.cc \ + ${ROCKSDB_BASE}/file/sequence_file_reader.cc \ + ${ROCKSDB_BASE}/file/sst_file_manager_impl.cc \ + ${ROCKSDB_BASE}/file/writable_file_writer.cc \ + ${ROCKSDB_BASE}/logging/auto_roll_logger.cc \ + ${ROCKSDB_BASE}/logging/event_logger.cc \ + ${ROCKSDB_BASE}/logging/log_buffer.cc \ + ${ROCKSDB_BASE}/memory/arena.cc \ + ${ROCKSDB_BASE}/memory/concurrent_arena.cc \ + ${ROCKSDB_BASE}/memory/jemalloc_nodump_allocator.cc \ + ${ROCKSDB_BASE}/memory/memkind_kmem_allocator.cc \ + ${ROCKSDB_BASE}/memory/memory_allocator.cc \ + ${ROCKSDB_BASE}/memtable/alloc_tracker.cc \ + ${ROCKSDB_BASE}/memtable/hash_linklist_rep.cc \ + ${ROCKSDB_BASE}/memtable/hash_skiplist_rep.cc \ + ${ROCKSDB_BASE}/memtable/skiplistrep.cc \ + ${ROCKSDB_BASE}/memtable/vectorrep.cc \ + ${ROCKSDB_BASE}/memtable/write_buffer_manager.cc \ + ${ROCKSDB_BASE}/monitoring/histogram.cc \ + ${ROCKSDB_BASE}/monitoring/histogram_windowing.cc \ + ${ROCKSDB_BASE}/monitoring/in_memory_stats_history.cc \ + ${ROCKSDB_BASE}/monitoring/instrumented_mutex.cc \ + ${ROCKSDB_BASE}/monitoring/iostats_context.cc \ + ${ROCKSDB_BASE}/monitoring/perf_context.cc \ + ${ROCKSDB_BASE}/monitoring/perf_level.cc \ + ${ROCKSDB_BASE}/monitoring/persistent_stats_history.cc \ + ${ROCKSDB_BASE}/monitoring/statistics.cc \ + ${ROCKSDB_BASE}/monitoring/thread_status_impl.cc \ + ${ROCKSDB_BASE}/monitoring/thread_status_updater.cc \ + ${ROCKSDB_BASE}/monitoring/thread_status_updater_debug.cc \ + ${ROCKSDB_BASE}/monitoring/thread_status_util.cc \ + ${ROCKSDB_BASE}/monitoring/thread_status_util_debug.cc \ + ${ROCKSDB_BASE}/options/cf_options.cc \ + ${ROCKSDB_BASE}/options/configurable.cc \ + ${ROCKSDB_BASE}/options/customizable.cc \ + 
${ROCKSDB_BASE}/options/db_options.cc \ + ${ROCKSDB_BASE}/options/options.cc \ + ${ROCKSDB_BASE}/options/options_helper.cc \ + ${ROCKSDB_BASE}/options/options_parser.cc \ + ${ROCKSDB_BASE}/port/mmap.cc \ + ${ROCKSDB_BASE}/port/port_posix.cc \ + ${ROCKSDB_BASE}/port/win/env_default.cc \ + ${ROCKSDB_BASE}/port/win/env_win.cc \ + ${ROCKSDB_BASE}/port/win/io_win.cc \ + ${ROCKSDB_BASE}/port/win/port_win.cc \ + ${ROCKSDB_BASE}/port/win/win_logger.cc \ + ${ROCKSDB_BASE}/port/win/win_thread.cc \ + ${ROCKSDB_BASE}/port/stack_trace.cc \ + ${ROCKSDB_BASE}/table/adaptive/adaptive_table_factory.cc \ + ${ROCKSDB_BASE}/table/block_based/binary_search_index_reader.cc \ + ${ROCKSDB_BASE}/table/block_based/block.cc \ + ${ROCKSDB_BASE}/table/block_based/block_based_table_builder.cc \ + ${ROCKSDB_BASE}/table/block_based/block_based_table_factory.cc \ + ${ROCKSDB_BASE}/table/block_based/block_based_table_iterator.cc \ + ${ROCKSDB_BASE}/table/block_based/block_based_table_reader.cc \ + ${ROCKSDB_BASE}/table/block_based/block_builder.cc \ + ${ROCKSDB_BASE}/table/block_based/block_cache.cc \ + ${ROCKSDB_BASE}/table/block_based/block_prefetcher.cc \ + ${ROCKSDB_BASE}/table/block_based/block_prefix_index.cc \ + ${ROCKSDB_BASE}/table/block_based/data_block_hash_index.cc \ + ${ROCKSDB_BASE}/table/block_based/data_block_footer.cc \ + ${ROCKSDB_BASE}/table/block_based/filter_block_reader_common.cc \ + ${ROCKSDB_BASE}/table/block_based/filter_policy.cc \ + ${ROCKSDB_BASE}/table/block_based/flush_block_policy.cc \ + ${ROCKSDB_BASE}/table/block_based/full_filter_block.cc \ + ${ROCKSDB_BASE}/table/block_based/hash_index_reader.cc \ + ${ROCKSDB_BASE}/table/block_based/index_builder.cc \ + ${ROCKSDB_BASE}/table/block_based/index_reader_common.cc \ + ${ROCKSDB_BASE}/table/block_based/parsed_full_filter_block.cc \ + ${ROCKSDB_BASE}/table/block_based/partitioned_filter_block.cc \ + ${ROCKSDB_BASE}/table/block_based/partitioned_index_iterator.cc \ + ${ROCKSDB_BASE}/table/block_based/partitioned_index_reader.cc \ + ${ROCKSDB_BASE}/table/block_based/reader_common.cc \ + ${ROCKSDB_BASE}/table/block_based/uncompression_dict_reader.cc \ + ${ROCKSDB_BASE}/table/block_fetcher.cc \ + ${ROCKSDB_BASE}/table/cuckoo/cuckoo_table_builder.cc \ + ${ROCKSDB_BASE}/table/cuckoo/cuckoo_table_factory.cc \ + ${ROCKSDB_BASE}/table/cuckoo/cuckoo_table_reader.cc \ + ${ROCKSDB_BASE}/table/format.cc \ + ${ROCKSDB_BASE}/table/get_context.cc \ + ${ROCKSDB_BASE}/table/iterator.cc \ + ${ROCKSDB_BASE}/table/merging_iterator.cc \ + ${ROCKSDB_BASE}/table/compaction_merging_iterator.cc \ + ${ROCKSDB_BASE}/table/meta_blocks.cc \ + ${ROCKSDB_BASE}/table/persistent_cache_helper.cc \ + ${ROCKSDB_BASE}/table/plain/plain_table_bloom.cc \ + ${ROCKSDB_BASE}/table/plain/plain_table_builder.cc \ + ${ROCKSDB_BASE}/table/plain/plain_table_factory.cc \ + ${ROCKSDB_BASE}/table/plain/plain_table_index.cc \ + ${ROCKSDB_BASE}/table/plain/plain_table_key_coding.cc \ + ${ROCKSDB_BASE}/table/plain/plain_table_reader.cc \ + ${ROCKSDB_BASE}/table/sst_file_dumper.cc \ + ${ROCKSDB_BASE}/table/sst_file_reader.cc \ + ${ROCKSDB_BASE}/table/sst_file_writer.cc \ + ${ROCKSDB_BASE}/table/table_factory.cc \ + ${ROCKSDB_BASE}/table/table_properties.cc \ + ${ROCKSDB_BASE}/table/two_level_iterator.cc \ + ${ROCKSDB_BASE}/table/unique_id.cc \ + ${ROCKSDB_BASE}/test_util/sync_point.cc \ + ${ROCKSDB_BASE}/test_util/sync_point_impl.cc \ + ${ROCKSDB_BASE}/test_util/transaction_test_util.cc \ + ${ROCKSDB_BASE}/tools/dump/db_dump_tool.cc \ + ${ROCKSDB_BASE}/trace_replay/trace_record_handler.cc \ + 
${ROCKSDB_BASE}/trace_replay/trace_record_result.cc \ + ${ROCKSDB_BASE}/trace_replay/trace_record.cc \ + ${ROCKSDB_BASE}/trace_replay/trace_replay.cc \ + ${ROCKSDB_BASE}/trace_replay/block_cache_tracer.cc \ + ${ROCKSDB_BASE}/trace_replay/io_tracer.cc \ + ${ROCKSDB_BASE}/util/async_file_reader.cc \ + ${ROCKSDB_BASE}/util/build_version.cc.in \ + ${ROCKSDB_BASE}/util/cleanable.cc \ + ${ROCKSDB_BASE}/util/coding.cc \ + ${ROCKSDB_BASE}/util/compaction_job_stats_impl.cc \ + ${ROCKSDB_BASE}/util/comparator.cc \ + ${ROCKSDB_BASE}/util/compression.cc \ + ${ROCKSDB_BASE}/util/compression_context_cache.cc \ + ${ROCKSDB_BASE}/util/concurrent_task_limiter_impl.cc \ + ${ROCKSDB_BASE}/util/crc32c.cc \ + ${ROCKSDB_BASE}/util/crc32c_arm64.cc \ + ${ROCKSDB_BASE}/util/data_structure.cc \ + ${ROCKSDB_BASE}/util/dynamic_bloom.cc \ + ${ROCKSDB_BASE}/util/hash.cc \ + ${ROCKSDB_BASE}/util/murmurhash.cc \ + ${ROCKSDB_BASE}/util/random.cc \ + ${ROCKSDB_BASE}/util/rate_limiter.cc \ + ${ROCKSDB_BASE}/util/ribbon_config.cc \ + ${ROCKSDB_BASE}/util/slice.cc \ + ${ROCKSDB_BASE}/util/file_checksum_helper.cc \ + ${ROCKSDB_BASE}/util/status.cc \ + ${ROCKSDB_BASE}/util/stderr_logger.cc \ + ${ROCKSDB_BASE}/util/string_util.cc \ + ${ROCKSDB_BASE}/util/thread_local.cc \ + ${ROCKSDB_BASE}/util/threadpool_imp.cc \ + ${ROCKSDB_BASE}/util/xxhash.cc \ + ${ROCKSDB_BASE}/utilities/agg_merge/agg_merge.cc \ + ${ROCKSDB_BASE}/utilities/backup/backup_engine.cc \ + ${ROCKSDB_BASE}/utilities/blob_db/blob_compaction_filter.cc \ + ${ROCKSDB_BASE}/utilities/blob_db/blob_db.cc \ + ${ROCKSDB_BASE}/utilities/blob_db/blob_db_impl.cc \ + ${ROCKSDB_BASE}/utilities/blob_db/blob_db_impl_filesnapshot.cc \ + ${ROCKSDB_BASE}/utilities/blob_db/blob_file.cc \ + ${ROCKSDB_BASE}/utilities/cache_dump_load.cc \ + ${ROCKSDB_BASE}/utilities/cache_dump_load_impl.cc \ + ${ROCKSDB_BASE}/utilities/cassandra/cassandra_compaction_filter.cc \ + ${ROCKSDB_BASE}/utilities/cassandra/format.cc \ + ${ROCKSDB_BASE}/utilities/cassandra/merge_operator.cc \ + ${ROCKSDB_BASE}/utilities/checkpoint/checkpoint_impl.cc \ + ${ROCKSDB_BASE}/utilities/compaction_filters.cc \ + ${ROCKSDB_BASE}/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc \ + ${ROCKSDB_BASE}/utilities/convenience/info_log_finder.cc \ + ${ROCKSDB_BASE}/utilities/counted_fs.cc \ + ${ROCKSDB_BASE}/utilities/debug.cc \ + ${ROCKSDB_BASE}/utilities/env_mirror.cc \ + ${ROCKSDB_BASE}/utilities/env_timed.cc \ + ${ROCKSDB_BASE}/utilities/fault_injection_env.cc \ + ${ROCKSDB_BASE}/utilities/fault_injection_fs.cc \ + ${ROCKSDB_BASE}/utilities/fault_injection_secondary_cache.cc \ + ${ROCKSDB_BASE}/utilities/leveldb_options/leveldb_options.cc \ + ${ROCKSDB_BASE}/utilities/memory/memory_util.cc \ + ${ROCKSDB_BASE}/utilities/merge_operators.cc \ + ${ROCKSDB_BASE}/utilities/merge_operators/max.cc \ + ${ROCKSDB_BASE}/utilities/merge_operators/put.cc \ + ${ROCKSDB_BASE}/utilities/merge_operators/sortlist.cc \ + ${ROCKSDB_BASE}/utilities/merge_operators/string_append/stringappend.cc \ + ${ROCKSDB_BASE}/utilities/merge_operators/string_append/stringappend2.cc \ + ${ROCKSDB_BASE}/utilities/merge_operators/uint64add.cc \ + ${ROCKSDB_BASE}/utilities/merge_operators/bytesxor.cc \ + ${ROCKSDB_BASE}/utilities/object_registry.cc \ + ${ROCKSDB_BASE}/utilities/option_change_migration/option_change_migration.cc \ + ${ROCKSDB_BASE}/utilities/options/options_util.cc \ + ${ROCKSDB_BASE}/utilities/persistent_cache/block_cache_tier.cc \ + ${ROCKSDB_BASE}/utilities/persistent_cache/block_cache_tier_file.cc \ + 
${ROCKSDB_BASE}/utilities/persistent_cache/block_cache_tier_metadata.cc \ + ${ROCKSDB_BASE}/utilities/persistent_cache/persistent_cache_tier.cc \ + ${ROCKSDB_BASE}/utilities/persistent_cache/volatile_tier_impl.cc \ + ${ROCKSDB_BASE}/utilities/simulator_cache/cache_simulator.cc \ + ${ROCKSDB_BASE}/utilities/simulator_cache/sim_cache.cc \ + ${ROCKSDB_BASE}/utilities/table_properties_collectors/compact_on_deletion_collector.cc \ + ${ROCKSDB_BASE}/utilities/trace/file_trace_reader_writer.cc \ + ${ROCKSDB_BASE}/utilities/trace/replayer_impl.cc \ + ${ROCKSDB_BASE}/utilities/transactions/lock/lock_manager.cc \ + ${ROCKSDB_BASE}/utilities/transactions/lock/point/point_lock_tracker.cc \ + ${ROCKSDB_BASE}/utilities/transactions/lock/point/point_lock_manager.cc \ + ${ROCKSDB_BASE}/utilities/transactions/optimistic_transaction.cc \ + ${ROCKSDB_BASE}/utilities/transactions/optimistic_transaction_db_impl.cc \ + ${ROCKSDB_BASE}/utilities/transactions/pessimistic_transaction.cc \ + ${ROCKSDB_BASE}/utilities/transactions/pessimistic_transaction_db.cc \ + ${ROCKSDB_BASE}/utilities/transactions/snapshot_checker.cc \ + ${ROCKSDB_BASE}/utilities/transactions/transaction_base.cc \ + ${ROCKSDB_BASE}/utilities/transactions/transaction_db_mutex_impl.cc \ + ${ROCKSDB_BASE}/utilities/transactions/transaction_util.cc \ + ${ROCKSDB_BASE}/utilities/transactions/write_prepared_txn.cc \ + ${ROCKSDB_BASE}/utilities/transactions/write_prepared_txn_db.cc \ + ${ROCKSDB_BASE}/utilities/transactions/write_unprepared_txn.cc \ + ${ROCKSDB_BASE}/utilities/transactions/write_unprepared_txn_db.cc \ + ${ROCKSDB_BASE}/utilities/ttl/db_ttl_impl.cc \ + ${ROCKSDB_BASE}/utilities/wal_filter.cc \ + ${ROCKSDB_BASE}/utilities/write_batch_with_index/write_batch_with_index.cc \ + ${ROCKSDB_BASE}/utilities/write_batch_with_index/write_batch_with_index_internal.cc + +# It's tedious to extract only relevant headers. Instead we just take them all, even +# if it's more data. 
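+# (If archive size ever became a concern, the wildcard below could be narrowed,
+# e.g. to "${ROCKSDB_BASE}/include/*.h" plus the headers next to the extracted
+# sources, at the cost of maintaining yet another list.)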
+#
+tar --wildcards -xzf "$ROCKSDB_ARCHIVE" '*.h'
+
+# We build from a particular released version, so no git revision information
+# is available.
+sed \
+    -e 's/@GIT_SHA@//' \
+    -e 's/@GIT_TAG@/""/' \
+    -e 's/@GIT_MOD@/0/' \
+    -e 's/@BUILD_DATE@/""/' \
+    -e 's/@GIT_DATE@/""/' \
+    -e 's/@ROCKSDB_PLUGIN_BUILTINS@//' \
+    -e 's/@ROCKSDB_PLUGIN_EXTERNS@//' \
+    "${ROCKSDB_BASE}/util/build_version.cc.in" > "${ROCKSDB_BASE}/util/build_version.cc"
+
+

From 9844b1e891478dd21932c6fbaba52d79cb4492ee Mon Sep 17 00:00:00 2001
From: chessai
Date: Wed, 26 Jun 2024 20:24:20 -0500
Subject: [PATCH 3/4] add rocksdb test suite

Change-Id: I9b5ae5d568cdff4605a68ac9c912fd312e2290b0
---
 chainweb.cabal                               | 19 ++++++++++
 vendored/rocksdb-haskell-kadena/test/Main.hs | 37 ++++++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 vendored/rocksdb-haskell-kadena/test/Main.hs

diff --git a/chainweb.cabal b/chainweb.cabal
index 7913456e3c..557d2bdfdf 100644
--- a/chainweb.cabal
+++ b/chainweb.cabal
@@ -567,6 +567,25 @@ library rocksdb-haskell-kadena
     vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_logger.cc
     vendored/rocksdb-haskell-kadena/rocksdb-8.3.2/port/win/win_thread.cc

+test-suite rocksdb-haskell-kadena-tests
+  import: warning-flags, debugging-flags
+  hs-source-dirs: vendored/rocksdb-haskell-kadena/test/
+  default-language: Haskell2010
+  main-is: Main.hs
+  type: exitcode-stdio-1.0
+  build-depends:
+    , base >= 4.10 && < 5
+    , chainweb:rocksdb-haskell-kadena
+    , hspec >= 1.8
+    , process >= 1.1.0.2
+    , bytestring >= 0.10.4.0
+    , data-default
+    , resourcet
+    , transformers
+    , temporary
+    , hspec-expectations
+    , QuickCheck
+
 library chainweb-storage
   import: warning-flags, debugging-flags
   hs-source-dirs: vendored/chainweb-storage/src
diff --git a/vendored/rocksdb-haskell-kadena/test/Main.hs b/vendored/rocksdb-haskell-kadena/test/Main.hs
new file mode 100644
index 0000000000..858a9597f8
--- /dev/null
+++ b/vendored/rocksdb-haskell-kadena/test/Main.hs
@@ -0,0 +1,37 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module Main where
+
+import Control.Monad.IO.Class (MonadIO (liftIO))
+import Control.Monad.Trans.Resource (MonadResource, runResourceT)
+import Data.Default (def)
+import System.IO.Temp (withSystemTempDirectory)
+import Database.RocksDB (Compression (..), Options (..), DB, defaultOptions, open, get, put)
+import Test.Hspec (describe, hspec, it, shouldReturn)
+import Test.QuickCheck (Arbitrary (..), UnicodeString (..), generate)
+
+initializeDB :: MonadResource m => FilePath -> m DB
+initializeDB path =
+    open
+        path
+        defaultOptions {createIfMissing = True, compression = NoCompression}
+
+main :: IO ()
+main = hspec $ do
+    describe "Basic DB Functionality" $ do
+        it "should put items into the database and retrieve them" $ do
+            runResourceT $ withSystemTempDirectory "rocksdb" $ \path -> do
+                db <- initializeDB path
+                put db def "zzz" "zzz"
+                -- hspec expectations run in IO, so lift them into ResourceT
+                liftIO $ get db mempty "zzz"
+                    `shouldReturn` (Just "zzz")
+
+        it "should put items into a database whose filepath has unicode characters and\
+           \ retrieve them" $ do
+            runResourceT $ withSystemTempDirectory "rocksdb" $ \path -> do
+                unicode <- getUnicodeString <$> liftIO (generate arbitrary)
+                db <- initializeDB $ path ++ unicode
+                put db def "zzz" "zzz"
+                liftIO $ get db mempty "zzz"
+                    `shouldReturn` (Just "zzz")

From 0d99abdc4cfe375d3eb1a86907b3ce3e6b60b32d Mon Sep 17 00:00:00 2001
From: chessai
Date: Wed, 26 Jun 2024 20:55:20 -0500
Subject: [PATCH 4/4] add chainweb-storage test suite

Change-Id: Iae52a8990380c1d077629b91aaed2f8ded327761
---
 chainweb.cabal                         |  19 ++
 vendored/chainweb-storage/test/Main.hs | 309 +++++++++++++++++++++++++
 2 files changed, 328 insertions(+)
 create mode 100644 vendored/chainweb-storage/test/Main.hs

diff --git a/chainweb.cabal b/chainweb.cabal
index 557d2bdfdf..ff89b2fa12 100644
--- a/chainweb.cabal
+++ b/chainweb.cabal
@@ -619,6 +619,25 @@ library chainweb-storage
     , unordered-containers >=0.2
     , vector >=0.12

+test-suite chainweb-storage-tests
+  import: warning-flags, debugging-flags
+  hs-source-dirs: vendored/chainweb-storage/test
+  default-language: Haskell2010
+  main-is: Main.hs
+  type: exitcode-stdio-1.0
+  ghc-options:
+    -threaded
+    -with-rtsopts=-N
+  build-depends:
+    , chainweb:chainweb-storage
+    , async >= 2.2
+    , base >=4.10 && < 5
+    , bytestring >= 0.10
+    , exceptions >= 0.10
+    , lens >= 4.16
+    , nothunks >= 0.1.0.0
+    , vector >= 0.12
+
 library
   import: warning-flags, debugging-flags
   default-language: Haskell2010
diff --git a/vendored/chainweb-storage/test/Main.hs b/vendored/chainweb-storage/test/Main.hs
new file mode 100644
index 0000000000..07f3d84de8
--- /dev/null
+++ b/vendored/chainweb-storage/test/Main.hs
@@ -0,0 +1,309 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE ImportQualifiedPost #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE TypeApplications #-}
+{-# LANGUAGE TypeFamilies #-}
+
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+
+-- |
+-- Module: Main
+-- Copyright: Copyright © 2019 Kadena LLC.
+-- License: MIT
+-- Maintainer: Lars Kuhtz
+-- Stability: experimental
+--
+-- Tests for "Chainweb.Storage.Table.RocksDB"
+module Main
+    ( main,
+    )
+where
+
+import Chainweb.Storage.Table (IsCasValue(..), Iterator(..), IterableTable(..), Entry(..), ReadableTable(..), casInsert, casInsertBatch, casDelete, casDeleteBatch, tableInsert, tableInsertBatch, tableDelete, tableLookupBatch)
+import Chainweb.Storage.Table.RocksDB (RocksDb, RocksDbTable, RocksDbUpdate(..), Codec(..), updateBatch, newTable, tableMinKey, tableMaxKey, tableMinValue, tableMaxValue, withTempRocksDb)
+import Control.Concurrent.Async (mapConcurrently_)
+import Control.Exception (Exception)
+import Control.Lens (_1, _2, folded, firstOf, lastOf)
+import Control.Monad (unless)
+import Control.Monad.Catch (throwM)
+import Data.ByteString.Char8 qualified as B8
+import Data.Foldable (forM_, traverse_)
+import Data.List qualified as List
+import GHC.Stack (HasCallStack, callStack, prettyCallStack)
+import NoThunks.Class (NoThunks, unsafeNoThunks)
+import Text.Read (readEither)
+
+-- -------------------------------------------------------------------------- --
+-- Utils
+
+data RocksDbTableTestException
+    = CodecException !String
+    | RocksDbTableTestFailure !String
+
+instance Show RocksDbTableTestException where
+    show (CodecException str) =
+        unlines
+            [ "Codec exception"
+            , str
+            ]
+    show (RocksDbTableTestFailure str) =
+        unlines
+            [ "RocksDb table test failure"
+            , str
+            ]
+
+instance Exception RocksDbTableTestException
+
+assertIO :: HasCallStack => Eq a => Show a => IO a -> a -> IO ()
+assertIO f r =
+    f >>= \a ->
+        unless (a == r) $
+            throwM $
+                RocksDbTableTestFailure $ unlines
+                    [ "test failed:"
+                    , unwords ["expected:", show r]
+                    , unwords ["actual:", show a]
+                    , prettyCallStack callStack
+                    ]
+
+assertNoThunks :: (HasCallStack, NoThunks a) => a -> IO ()
+assertNoThunks a = case unsafeNoThunks $!
a of + Nothing -> return () + Just e -> + throwM $ + RocksDbTableTestFailure $ unlines + [ "test failed:" + , unwords ["unexpected thunk:", show e] + , prettyCallStack callStack + ] +{-# noinline assertNoThunks #-} + +-- -------------------------------------------------------------------------- -- +-- Test Table + +intCodec :: Codec Int +intCodec = Codec + (B8.pack . show) + (either (throwM . CodecException) return . readEither @Int . B8.unpack) + +intTable :: RocksDb -> B8.ByteString -> RocksDbTable Int Int +intTable db tableName = newTable db intCodec intCodec [tableName] + +-- -------------------------------------------------------------------------- -- +-- Tests + +assertEmptyTable :: HasCallStack => RocksDbTable Int Int -> IO () +assertEmptyTable t = do + assertNoThunks t + assertIO (tableLookup t 1) Nothing + assertEntries t [] + +assertEntries :: HasCallStack => RocksDbTable Int Int -> [(Int, Int)] -> IO () +assertEntries t l_ = do + assertNoThunks t + forM_ l $ \(k, v) -> assertIO (tableLookup t k) (Just v) + assertIO (tableLookupBatch t ks) (Just <$> vs) + + assertIO (tableMinKey t) (firstOf (folded . _1) l) + assertIO (tableMinValue t) (firstOf (folded . _2) l) + + assertIO (tableMaxKey t) (lastOf (folded . _1) l) + assertIO (tableMaxValue t) (lastOf (folded . _2) l) + + -- check forward iteration and first and last + withTableIterator t $ \i -> do + assertIO (iterFirst i >> iterKey i) (firstOf (folded . _1) l) + assertIO (iterLast i >> iterKey i) (lastOf (folded . _1) l) + iterFirst i + assertIO (iterValid i) (not $ null l) + forM_ l $ \(k, v) -> do + assertIO (iterEntry i) (Just (Entry k v)) + iterNext i + assertIO (iterValid i) False + + -- check backward iteration + withTableIterator t $ \i -> do + iterLast i + assertIO (iterValid i) (not $ null l) + forM_ (reverse l) $ \(k, v) -> do + assertIO (iterEntry i) (Just (Entry k v)) + iterPrev i + assertIO (iterValid i) False + where + l = List.sort l_ + (ks, vs) = unzip l + +tableTests :: HasCallStack => RocksDb -> B8.ByteString -> IO () +tableTests db tableName = do + assertNoThunks t + assertEmptyTable t + + tableInsert t 1 8 + assertEntries t [(1, 8)] + + tableInsert t 2 9 + assertEntries t [(1, 8), (2, 9)] + + tableDelete t 1 + assertEntries t [(2, 9)] + + tableInsert t 2 8 + assertEntries t [(2, 8)] + + tableDelete t 2 + assertEmptyTable t + where + !t = intTable db tableName + +tableBatchTests :: HasCallStack => RocksDb -> B8.ByteString -> IO () +tableBatchTests db tableName = do + assertNoThunks t + assertEmptyTable t + + updateBatch [] + assertEmptyTable t + + updateBatch [RocksDbInsert t 1 8] + assertEntries t [(1, 8)] + + updateBatch [RocksDbInsert t 2 9] + assertEntries t [(1, 8), (2, 9)] + + updateBatch [RocksDbDelete t 2] + assertEntries t [(1, 8)] + + updateBatch [RocksDbInsert t 2 9, RocksDbDelete t 2] + assertEntries t [(1, 8)] + + updateBatch [RocksDbInsert t 2 9, RocksDbDelete t 2, RocksDbInsert t 2 9] + assertEntries t [(1, 8), (2, 9)] + + updateBatch [RocksDbInsert t 1 8, RocksDbDelete t 1] + assertEntries t [(2, 9)] + + updateBatch [RocksDbInsert t 1 7, RocksDbInsert t 1 8, RocksDbInsert t 1 8] + assertEntries t [(1, 8), (2, 9)] + + updateBatch [RocksDbDelete t 1, RocksDbInsert t 3 7] + assertEntries t [(2, 9), (3, 7)] + + updateBatch [RocksDbInsert t 4 6, RocksDbInsert t 5 5] + assertEntries t [(2, 9), (3, 7), (4, 6), (5, 5)] + + updateBatch [RocksDbDelete t 2, RocksDbDelete t 3, RocksDbDelete t 4, RocksDbDelete t 5] + assertEmptyTable t + where + t = intTable db tableName + +-- Orphan instance +-- +instance 
IsCasValue Int where + type CasKeyType Int = Int + casKey = (+ 10) + +casBatchTests :: HasCallStack => RocksDb -> B8.ByteString -> IO () +casBatchTests db tableName = do + assertEmptyTable t + + tableInsertBatch t mempty + assertEmptyTable t + + casInsertBatch t [1] + assertCasEntries t [1] + + casInsertBatch t [2] + assertCasEntries t [1, 2] + + casDeleteBatch t [2] + assertCasEntries t [1] + + casInsertBatch t [1] + assertCasEntries t [1] + + casInsertBatch t [2, 2, 2] + assertCasEntries t [1, 2] + + casInsertBatch t [1, 2, 3, 4] + assertCasEntries t [1, 2, 3, 4] + + casDeleteBatch t [5] + assertCasEntries t [1, 2, 3, 4] + + casDeleteBatch t [1, 3, 1] + assertCasEntries t [2, 4] + + casDeleteBatch t [] + assertCasEntries t [2, 4] + + casDeleteBatch t [2, 4] + assertEmptyTable t + where + t = intTable db tableName + +casTests :: HasCallStack => RocksDb -> B8.ByteString -> IO () +casTests db tableName = do + assertEmptyTable t + assertIO (tableMember t 1) False + assertIO (tableLookup t 1) Nothing + + casInsertBatch t mempty + assertEmptyTable t + + casInsert t 1 + assertCasEntries t [1] + assertIO (tableMember t (casKey @Int 1)) True + assertIO (tableLookup t (casKey @Int 1)) (Just 1) + + casInsert t 2 + assertCasEntries t [1, 2] + assertIO (tableMember t (casKey @Int 1)) True + assertIO (tableMember t (casKey @Int 2)) True + assertIO (tableLookup t (casKey @Int 1)) (Just 1) + assertIO (tableLookup t (casKey @Int 2)) (Just 2) + assertIO (tableLookupBatch t [casKey @Int 1, casKey @Int 2]) [Just 1, Just 2] + + casDelete t 2 + assertCasEntries t [1] + assertIO (tableMember t (casKey @Int 1)) True + assertIO (tableMember t (casKey @Int 2)) False + assertIO (tableLookup t (casKey @Int 1)) (Just 1) + assertIO (tableLookup t (casKey @Int 2)) Nothing + assertIO (tableLookupBatch t [casKey @Int 1, casKey @Int 2]) [Just 1, Nothing] + + casInsert t 1 + assertCasEntries t [1] + + traverse_ @[] (casInsert t) [2, 2, 2] + assertCasEntries t [1, 2] + + traverse_ @[] (casInsert t) [1, 2, 3, 4] + assertCasEntries t [1, 2, 3, 4] + + casDelete t (casKey @Int 5) + assertCasEntries t [1, 2, 3, 4] + + traverse_ @[] (casDelete t) [1, 3, 1] + assertCasEntries t [2, 4] + + traverse_ @[] (casDelete t) [2, 4] + assertEmptyTable t + where + t = intTable db tableName + +assertCasEntries :: HasCallStack => RocksDbTable Int Int -> [Int] -> IO () +assertCasEntries t l = do + assertNoThunks t + assertEntries t [(casKey v, v) | v <- l] + +-- -------------------------------------------------------------------------- -- +-- Main + +main :: IO () +main = withTempRocksDb "testDb" $ \db -> do + tableTests db "testTable0" + mapConcurrently_ + (\i -> tableTests db $ "testTable" <> B8.pack (show i)) + ([0 .. 100] :: [Int]) + tableBatchTests db "testTable" + casTests db "testTable" + casBatchTests db "testTable"
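
With both stanzas in place, the new suites can be run individually through
cabal-install (assuming a standard cabal setup for this repository):

    cabal test rocksdb-haskell-kadena-tests
    cabal test chainweb-storage-tests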